Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ elixir_build_*
doc
docs
/test/tmp
.DS_Store

# Don't feel like tracking that gives me what I want any more :)
.tool-versions
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ iex> Statistex.statistics(samples)
standard_deviation: 10.47189577445799,
standard_deviation_ratio: 1.46316833512058,
total: 71.57,
m2: 986.9454099999999,
variance: 109.6606011111111
}
# or just calculate the value you need
Expand Down Expand Up @@ -127,3 +128,7 @@ A couple of (hopefully) helpful points:
* `mix test` to run tests
* `mix dialyzer` to run dialyzer for type checking, might take a while on the first invocation (try building plts first with `mix dialyzer --plt`)
* `mix credo` to find code style problems
* To generate code coverage information:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't accurate for this project as we use excoveralls - although, to be honest, I haven't used the old school `--cover` in a long while. So it's `mix coveralls.html` :)

* `mix test --cover --export-coverage default`
* `mix test.coverage`
* Open HTML files in the `cover/` directory
111 changes: 91 additions & 20 deletions lib/statistex.ex
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ defmodule Statistex do
defstruct [
:total,
:average,
:m2,
:variance,
:standard_deviation,
:standard_deviation_ratio,
Expand All @@ -43,6 +44,7 @@ defmodule Statistex do
@type t :: %__MODULE__{
total: number,
average: float,
m2: float,
variance: float,
standard_deviation: float,
standard_deviation_ratio: float,
Expand Down Expand Up @@ -119,6 +121,7 @@ defmodule Statistex do
%Statistex{
total: 4450,
average: 445.0,
m2: 552250.0,
variance: 61_361.11111111111,
standard_deviation: 247.71175004652304,
standard_deviation_ratio: 0.5566556180820742,
Expand All @@ -139,9 +142,10 @@ defmodule Statistex do
%Statistex{
total: 3450,
average: 492.85714285714283,
variance: 2857.142857142857,
standard_deviation: 53.452248382484875,
standard_deviation_ratio: 0.1084538372977954,
m2: 17142.857142857145,
variance: 2857.1428571428573,
standard_deviation: 53.45224838248488,
standard_deviation_ratio: 0.10845383729779542,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the comment, but the differences are so far back I don't think they matter/a normal test with "to a precision of 5 digits" wouldn't even have caught it.

median: 500.0,
percentiles: %{25 => 450.0, 50 => 500.0, 75 => 500.0},
frequency_distribution: %{450 => 3, 500 => 3, 600 => 1},
Expand Down Expand Up @@ -195,7 +199,8 @@ defmodule Statistex do
maximum = List.last(sorted_samples)

average = average(sorted_samples, total: total, sample_size: sample_size)
variance = variance(sorted_samples, average: average, sample_size: sample_size)
m2 = m2(sorted_samples)
variance = variance(sorted_samples, sample_size: sample_size, m2: m2)

frequency_distribution = frequency_distribution(sorted_samples)

Expand All @@ -209,6 +214,7 @@ defmodule Statistex do
%__MODULE__{
total: total,
average: average,
m2: m2,
variance: variance,
standard_deviation: standard_deviation,
standard_deviation_ratio: standard_deviation_ratio,
Expand Down Expand Up @@ -298,7 +304,7 @@ defmodule Statistex do
iex> Statistex.average([])
** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number.
"""
@spec average(samples, keyword) :: float
@spec average(samples | :ignored, keyword) :: float
def average(samples, options \\ [])
def average([], _), do: raise(ArgumentError, @empty_list_error_message)

Expand All @@ -309,6 +315,77 @@ defmodule Statistex do
total / sample_size
end

@doc """
Calculate the running sum of squared differences from the current mean.

This value is only used when trying to calculate the variance in a single pass, using Welford's online algorithm.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A link similar to the one in your PR may be great here!


`ArgumentError` is raised if the given list is empty.

## Options

If you are performing single-pass variance, you can calculate a new M2 for a single data point by providing your single data point, along with the previous `:sample_size`, `:m2`, and either the `:average` or `:total`. See `StatistexTest` for an example of how this can be done.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think the tests ship with the package or the docs, so I don't think they should be referenced from the docs :)


If calculating M2 over your entire dataset, do not supply any options (do not use a previously calculated `:total` or `:average`) or your result will be wrong.

## Examples

iex> Statistex.m2([10])
0.0

iex> Statistex.m2([10, 20])
50.0

iex> Statistex.m2([10, 20, 30])
200.0

iex> Statistex.m2(30, sample_size: 2, m2: 50.0, average: 15.0)
200.0

iex> Statistex.m2([])
** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number.
"""
@spec m2(samples | sample, keyword) :: float
def m2(samples, options \\ [])
def m2([], _), do: raise(ArgumentError, @empty_list_error_message)

def m2(samples, options) when is_list(samples) do
count = Keyword.get(options, :sample_size, 0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know it's nitpicky/focussed on wording but as the rest of the code calls this sample_size I think it'd be worth to keep with that wording :) (or since we keep updating it, maybe something like current_sample_size?)

m2 = Keyword.get(options, :m2, 0.0)
total = Keyword.get(options, :total, 0.0)

mean =
case {count, total} do
{0, 0.0} ->
0

{0, 0} ->
0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think an `if` would be clearer here, as you can check `== 0` and so we don't need two separate cases for `0.0` and `0`.


_ ->
Keyword.get_lazy(options, :average, fn ->
average(:ignored, sample_size: count, total: total)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think here I'd just extract a (private) calc_average function passing sample_size and total in as the args - it'll be extremely tiny but yeah - avoids the :ignored bit 👀

end)
end

do_m2(samples, count, mean, m2)
end

# Single-sample clause: wraps the lone sample in a list and defers to the list clause.
def m2(sample, options), do: m2([sample], options)

# Base case: every sample has been consumed, so the accumulated m2 is the result.
defp do_m2([], _, _, m2), do: m2

defp do_m2([sample | rest], count, mean, m2) do
count = count + 1
delta = sample - mean
mean = mean + delta / count
delta2 = sample - mean
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a better name for delta2? I haven't read the algorithm description yet but would hope there is a better name to use 😁

m2 = m2 + delta * delta2
do_m2(rest, count, mean, m2)
end

@doc """
Calculate the variance.

Expand All @@ -317,7 +394,7 @@ defmodule Statistex do
`ArgumentError` is raised if the given list is empty.

## Options
If already calculated, the `:average` and `:sample_size` options can be provided to avoid recalculating those values.
If already calculated, the `:sample_size` and `:m2` options can be provided to avoid recalculating those values. Should you provide both, the provided samples are wholly ignored.

## Examples

Expand All @@ -336,28 +413,22 @@ defmodule Statistex do
iex> Statistex.variance([])
** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number.
"""
@spec variance(samples, keyword) :: float
@spec variance(samples | :ignored, keyword) :: float
def variance(samples, options \\ [])
def variance([], _), do: raise(ArgumentError, @empty_list_error_message)

def variance(samples, options) do
sample_size = Keyword.get_lazy(options, :sample_size, fn -> sample_size(samples) end)

average =
Keyword.get_lazy(options, :average, fn -> average(samples, sample_size: sample_size) end)
m2 = Keyword.get_lazy(options, :m2, fn -> m2(samples) end)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not allowing to provide :average any more would be a breaking change which we should avoid (as I wouldn't want to bump a major for this). I suppose that means we'd need to keep the "old" offline way to calculate variance around as well.


do_variance(samples, average, sample_size)
do_variance(sample_size, m2)
end

defp do_variance(_samples, _average, 1), do: 0.0

defp do_variance(samples, average, sample_size) do
total_variance =
Enum.reduce(samples, 0, fn sample, total ->
total + :math.pow(sample - average, 2)
end)
defp do_variance(1, _m2), do: 0.0

total_variance / (sample_size - 1)
defp do_variance(sample_size, m2) do
m2 / (sample_size - 1)
end

@doc """
Expand Down Expand Up @@ -387,7 +458,7 @@ defmodule Statistex do
iex> Statistex.standard_deviation([])
** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number.
"""
@spec standard_deviation(samples, keyword) :: float
@spec standard_deviation(samples | :ignored, keyword) :: float
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, this confused me a bit :) I can now see why you did it, but I don't think it should be part of the type specs. Me passing in :ignored in some of the doc tests was just my cheeky way of showing that if you provide all the other data needed then truly the samples aren't even touched and hence don't matter. There is no special meaning to :ignored - you could pass in a pid or whatever. I don't think people should do that though.

I think if we wanted to highlight/encourage doing that more, we should supply a function/interface that just doesn't take any samples and either works on direct arguments or the keyword list but raises if any of them aren't there :)

def standard_deviation(samples, options \\ [])
def standard_deviation([], _), do: raise(ArgumentError, @empty_list_error_message)

Expand Down Expand Up @@ -425,7 +496,7 @@ defmodule Statistex do
iex> Statistex.standard_deviation_ratio([])
** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number.
"""
@spec standard_deviation_ratio(samples, keyword) :: float
@spec standard_deviation_ratio(samples | :ignored, keyword) :: float
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(same :ignored comment as above)

def standard_deviation_ratio(samples, options \\ [])
def standard_deviation_ratio([], _), do: raise(ArgumentError, @empty_list_error_message)

Expand Down
28 changes: 28 additions & 0 deletions test/statistex_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ defmodule Statistex.StatistexTest do
test "all 0 values do what you think they would" do
assert Statistex.statistics([0, 0, 0, 0]) == %Statistex{
average: 0.0,
m2: 0.0,
variance: 0.0,
standard_deviation: 0.0,
standard_deviation_ratio: 0.0,
Expand All @@ -57,6 +58,7 @@ defmodule Statistex.StatistexTest do
%Statistex{
total: 4500,
average: 500.0,
m2: 320_000.0,
variance: 40_000.0,
standard_deviation: 200.0,
standard_deviation_ratio: 0.4,
Expand All @@ -78,6 +80,7 @@ defmodule Statistex.StatistexTest do
%Statistex{
total: 4450,
average: 445.0,
m2: 552_250.0,
variance: 61_361.11111111111,
standard_deviation: 247.71175004652304,
standard_deviation_ratio: 0.5566556180820742,
Expand Down Expand Up @@ -170,6 +173,31 @@ defmodule Statistex.StatistexTest do
end
end

describe ".m2/2" do
  test "ensure manual on-line variance calculation matches normal API" do
    samples = [1, 2, 3, 4, 5, 6, 7, 8, 9]

    # Feed the samples to m2/2 one at a time, carrying the running count,
    # total and M2 forward from one step to the next.
    step = fn sample, {seen, running_total, running_m2} ->
      next_m2 = Statistex.m2(sample, sample_size: seen, m2: running_m2, total: running_total)
      {seen + 1, running_total + sample, next_m2}
    end

    {sample_size, total, m2} = Enum.reduce(samples, {0, 0, 0.0}, step)

    # The incrementally built values must equal those computed over the whole list.
    assert sample_size == Statistex.sample_size(samples)
    assert total == Statistex.total(samples)
    assert m2 == Statistex.m2(samples)

    # :sample_size and :m2 are both supplied here, so a placeholder is passed as samples.
    variance = Statistex.variance(:ignored, sample_size: sample_size, m2: m2)
    assert variance == Statistex.variance(samples)

    expected_deviation = Statistex.standard_deviation(samples)
    assert expected_deviation == Statistex.standard_deviation(:ignored, variance: variance)
  end
end

describe "property testing as we might get loads of data" do
property "doesn't blow up no matter what kind of nonempty list of floats it's given" do
check all(samples <- list_of(float(), min_length: 1)) do
Expand Down