From 6ff77505a7b48473bcd221a54a4a45c545c407b6 Mon Sep 17 00:00:00 2001
From: Sam Weaver <sweaver@relaypro.com>
Date: Wed, 2 Jul 2025 18:23:18 -0400
Subject: [PATCH] Update `Statistex.variance` to use Welford's online algorithm

- Also exposes `m2` statistic so this algorithm can be used from user code
- Added test for the above
- Update dialyzer specs to better tolerate `:ignored` option when calling certain statistic functions
---
 .gitignore              |   1 +
 README.md               |   5 ++
 lib/statistex.ex        | 111 ++++++++++++++++++++++++++++++++--------
 test/statistex_test.exs |  28 ++++++++++
 4 files changed, 125 insertions(+), 20 deletions(-)

diff --git a/.gitignore b/.gitignore
index 33b2fa2..d037918 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ elixir_build_*
 doc
 docs
 /test/tmp
+.DS_Store
 
 # Don't feel like tracking that gives me what I want any more :)
 .tool-versions
diff --git a/README.md b/README.md
index 28ba526..0b05a3a 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,7 @@ iex> Statistex.statistics(samples)
   standard_deviation: 10.47189577445799,
   standard_deviation_ratio: 1.46316833512058,
   total: 71.57,
+  m2: 986.9454099999999,
   variance: 109.6606011111111
 }
 # or just calculate the value you need
@@ -127,3 +128,7 @@ A couple of (hopefully) helpful points:
 * `mix test` to run tests
 * `mix dialyzer` to run dialyzer for type checking, might take a while on the first invocation (try building plts first with `mix dialyzer --plt`)
 * `mix credo` to find code style problems
+* To generate code coverage information:
+    * `mix test --cover --export-coverage default`
+    * `mix test.coverage`
+    * Open HTML files in the `cover/` directory
diff --git a/lib/statistex.ex b/lib/statistex.ex
index 8b67d9c..381492c 100644
--- a/lib/statistex.ex
+++ b/lib/statistex.ex
@@ -20,6 +20,7 @@ defmodule Statistex do
   defstruct [
     :total,
     :average,
+    :m2,
     :variance,
     :standard_deviation,
     :standard_deviation_ratio,
@@ -43,6 +44,7 @@ defmodule Statistex do
   @type t :: %__MODULE__{
           total: number,
           average: float,
+          m2: float,
           variance: float,
           standard_deviation: float,
           standard_deviation_ratio: float,
@@ -119,6 +121,7 @@ defmodule Statistex do
       %Statistex{
         total: 4450,
         average: 445.0,
+        m2: 552250.0,
         variance: 61_361.11111111111,
         standard_deviation: 247.71175004652304,
         standard_deviation_ratio: 0.5566556180820742,
@@ -139,9 +142,10 @@ defmodule Statistex do
       %Statistex{
         total: 3450,
         average: 492.85714285714283,
-        variance: 2857.142857142857,
-        standard_deviation: 53.452248382484875,
-        standard_deviation_ratio: 0.1084538372977954,
+        m2: 17142.857142857145,
+        variance: 2857.1428571428573,
+        standard_deviation: 53.45224838248488,
+        standard_deviation_ratio: 0.10845383729779542,
         median: 500.0,
         percentiles: %{25 => 450.0, 50 => 500.0, 75 => 500.0},
         frequency_distribution: %{450 => 3, 500 => 3, 600 => 1},
@@ -195,7 +199,8 @@ defmodule Statistex do
     maximum = List.last(sorted_samples)
 
     average = average(sorted_samples, total: total, sample_size: sample_size)
-    variance = variance(sorted_samples, average: average, sample_size: sample_size)
+    m2 = m2(sorted_samples)
+    variance = variance(sorted_samples, sample_size: sample_size, m2: m2)
 
     frequency_distribution = frequency_distribution(sorted_samples)
 
@@ -209,6 +214,7 @@ defmodule Statistex do
     %__MODULE__{
       total: total,
       average: average,
+      m2: m2,
       variance: variance,
       standard_deviation: standard_deviation,
       standard_deviation_ratio: standard_deviation_ratio,
@@ -298,7 +304,7 @@ defmodule Statistex do
       iex> Statistex.average([])
       ** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number.
   """
-  @spec average(samples, keyword) :: float
+  @spec average(samples | :ignored, keyword) :: float
   def average(samples, options \\ [])
   def average([], _), do: raise(ArgumentError, @empty_list_error_message)
 
@@ -309,6 +315,77 @@ defmodule Statistex do
     total / sample_size
   end
 
+  @doc """
+  Calculate the running sum of squared differences from the current mean.
+
+  This value is only used when trying to calculate the variance in a single pass, using Welford's online algorithm.
+
+  `Argumenterror` is raised if the given list is empty.
+
+  ## Options
+
+  If are performing single-pass variance, you can calculate a new M2 for a single data point by providing your single data point, along with the previous `:sample_size`, `:m2`, and either the `:average` or `:total`. See `StatistexTest` for an example of how this can be done.
+
+  If calculating M2 over your entire dataset, do supply any options (do not use `:total` or `:average` that were previously calculated) or your result will be wrong.
+
+  ## Examples
+
+      iex> Statistex.m2([10])
+      0.0
+
+      iex> Statistex.m2([10, 20])
+      50.0
+
+      iex> Statistex.m2([10, 20, 30])
+      200.0
+
+      iex> Statistex.m2(30, sample_size: 2, m2: 50.0, average: 15.0)
+      200.0
+
+      iex> Statistex.m2([])
+      ** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number.
+  """
+  @spec m2(samples | sample, keyword) :: float
+  def m2(samples, options \\ [])
+  def m2([], _), do: raise(ArgumentError, @empty_list_error_message)
+
+  def m2(samples, options) when is_list(samples) do
+    count = Keyword.get(options, :sample_size, 0)
+    m2 = Keyword.get(options, :m2, 0.0)
+    total = Keyword.get(options, :total, 0.0)
+
+    mean =
+      case {count, total} do
+        {0, 0.0} ->
+          0
+
+        {0, 0} ->
+          0
+
+        _ ->
+          Keyword.get_lazy(options, :average, fn ->
+            average(:ignored, sample_size: count, total: total)
+          end)
+      end
+
+    do_m2(samples, count, mean, m2)
+  end
+
+  def m2(sample, options) do
+    m2([sample], options)
+  end
+
+  defp do_m2([], _, _, m2), do: m2
+
+  defp do_m2([sample | rest], count, mean, m2) do
+    count = count + 1
+    delta = sample - mean
+    mean = mean + delta / count
+    delta2 = sample - mean
+    m2 = m2 + delta * delta2
+    do_m2(rest, count, mean, m2)
+  end
+
   @doc """
   Calculate the variance.
 
@@ -317,7 +394,7 @@ defmodule Statistex do
   `Argumenterror` is raised if the given list is empty.
 
   ## Options
-  If already calculated, the `:average` and `:sample_size` options can be provided to avoid recalulating those values.
+  If already calculated, the `:sample_size` and `:m2` options can be provided to avoid recalulating those values. Should you provide both the provided samples are wholly ignored.
 
   ## Examples
 
@@ -336,28 +413,22 @@ defmodule Statistex do
       iex> Statistex.variance([])
       ** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number.
   """
-  @spec variance(samples, keyword) :: float
+  @spec variance(samples | :ignored, keyword) :: float
   def variance(samples, options \\ [])
   def variance([], _), do: raise(ArgumentError, @empty_list_error_message)
 
   def variance(samples, options) do
     sample_size = Keyword.get_lazy(options, :sample_size, fn -> sample_size(samples) end)
 
-    average =
-      Keyword.get_lazy(options, :average, fn -> average(samples, sample_size: sample_size) end)
+    m2 = Keyword.get_lazy(options, :m2, fn -> m2(samples) end)
 
-    do_variance(samples, average, sample_size)
+    do_variance(sample_size, m2)
   end
 
-  defp do_variance(_samples, _average, 1), do: 0.0
-
-  defp do_variance(samples, average, sample_size) do
-    total_variance =
-      Enum.reduce(samples, 0, fn sample, total ->
-        total + :math.pow(sample - average, 2)
-      end)
+  defp do_variance(1, _m2), do: 0.0
 
-    total_variance / (sample_size - 1)
+  defp do_variance(sample_size, m2) do
+    m2 / (sample_size - 1)
   end
 
   @doc """
@@ -387,7 +458,7 @@ defmodule Statistex do
       iex> Statistex.standard_deviation([])
       ** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number.
   """
-  @spec standard_deviation(samples, keyword) :: float
+  @spec standard_deviation(samples | :ignored, keyword) :: float
   def standard_deviation(samples, options \\ [])
   def standard_deviation([], _), do: raise(ArgumentError, @empty_list_error_message)
 
@@ -425,7 +496,7 @@ defmodule Statistex do
         iex> Statistex.standard_deviation_ratio([])
         ** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number.
   """
-  @spec standard_deviation_ratio(samples, keyword) :: float
+  @spec standard_deviation_ratio(samples | :ignored, keyword) :: float
   def standard_deviation_ratio(samples, options \\ [])
   def standard_deviation_ratio([], _), do: raise(ArgumentError, @empty_list_error_message)
 
diff --git a/test/statistex_test.exs b/test/statistex_test.exs
index 5292270..f756546 100644
--- a/test/statistex_test.exs
+++ b/test/statistex_test.exs
@@ -35,6 +35,7 @@ defmodule Statistex.StatistexTest do
     test "all 0 values do what you think they would" do
       assert Statistex.statistics([0, 0, 0, 0]) == %Statistex{
                average: 0.0,
+               m2: 0.0,
                variance: 0.0,
                standard_deviation: 0.0,
                standard_deviation_ratio: 0.0,
@@ -57,6 +58,7 @@ defmodule Statistex.StatistexTest do
                %Statistex{
                  total: 4500,
                  average: 500.0,
+                 m2: 320_000.0,
                  variance: 40_000.0,
                  standard_deviation: 200.0,
                  standard_deviation_ratio: 0.4,
@@ -78,6 +80,7 @@ defmodule Statistex.StatistexTest do
                %Statistex{
                  total: 4450,
                  average: 445.0,
+                 m2: 552_250.0,
                  variance: 61_361.11111111111,
                  standard_deviation: 247.71175004652304,
                  standard_deviation_ratio: 0.5566556180820742,
@@ -170,6 +173,31 @@ defmodule Statistex.StatistexTest do
     end
   end
 
+  describe ".m2/2" do
+    test "ensure manual on-line variance calculation matches normal API" do
+      samples = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+      {sample_size, total, m2} =
+        Enum.reduce(samples, {0, 0, 0.0}, fn sample, {count, total, m2} ->
+          m2 = Statistex.m2(sample, sample_size: count, m2: m2, total: total)
+          count = count + 1
+          total = total + sample
+          {count, total, m2}
+        end)
+
+      assert sample_size == Statistex.sample_size(samples)
+      assert total == Statistex.total(samples)
+      assert m2 == Statistex.m2(samples)
+
+      variance = Statistex.variance(:ignored, sample_size: sample_size, m2: m2)
+
+      assert variance == Statistex.variance(samples)
+
+      assert Statistex.standard_deviation(samples) ==
+               Statistex.standard_deviation(:ignored, variance: variance)
+    end
+  end
+
   describe "property testing as we might get loads of data" do
     property "doesn't blow up no matter what kind of nonempty list of floats it's given" do
       check all(samples <- list_of(float(), min_length: 1)) do