diff --git a/src/HypothesisTests.jl b/src/HypothesisTests.jl index 663300c6..8eeee14a 100644 --- a/src/HypothesisTests.jl +++ b/src/HypothesisTests.jl @@ -37,7 +37,7 @@ check_same_length(x::Vector, y::Vector) = if length(x) != length(y) end # Basic function for finding a p-value given a distribution and tail -pvalue(dist::ContinuousUnivariateDistribution, x::Number; tail=:both) = +pvalue(dist::ContinuousUnivariateDistribution, x::Number; tail=:both) = if tail == :both min(2 * min(cdf(dist, x), ccdf(dist, x)), 1.0) elseif tail == :left @@ -48,7 +48,7 @@ pvalue(dist::ContinuousUnivariateDistribution, x::Number; tail=:both) = throw(ArgumentError("tail=$(tail) is invalid")) end -pvalue(dist::DiscreteUnivariateDistribution, x::Number; tail=:both) = +pvalue(dist::DiscreteUnivariateDistribution, x::Number; tail=:both) = if tail == :both min(2 * min(ccdf(dist, x-1), cdf(dist, x)), 1.0) elseif tail == :left @@ -138,4 +138,5 @@ include("t.jl") include("wilcoxon.jl") include("power_divergence.jl") include("anderson_darling.jl") +include("spearman.jl") end diff --git a/src/spearman.jl b/src/spearman.jl new file mode 100644 index 00000000..07a2153e --- /dev/null +++ b/src/spearman.jl @@ -0,0 +1,166 @@ +# spearman.jl +# Spearman's rank correlation test in Julia +# +# Copyright (C) 2016 Diego Javier Zea +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +export CorrelationTest, SpearmanCorrelationTest + +abstract CorrelationTest <: HypothesisTest + +"Sum squared difference of ranks (midranks for ties)" +spearman_S(xrank, yrank) = sumabs2(xrank .- yrank) + +immutable SpearmanCorrelationTest <: CorrelationTest + # Tied ranking for x and y + xrank::Vector{Float64} + yrank::Vector{Float64} + # Adjustment for ties + xtiesadj::Float64 + ytiesadj::Float64 + # Sum squared difference of ranks + S::Float64 + # Number of points + n::Int + # Spearman's ρ + ρ::Float64 + + function SpearmanCorrelationTest(x, y) + n = length(x) + @assert n==length(y) "Variables x and y must have the same length" + xrank, xtiesadj = HypothesisTests.tiedrank_adj(x) + yrank, ytiesadj = HypothesisTests.tiedrank_adj(y) + S = spearman_S(xrank, yrank) + ρ = corspearman(x, y) + new(xrank, yrank, xtiesadj, ytiesadj, S, n, ρ) + end + +end + +testname(::SpearmanCorrelationTest) = "Spearman's rank correlation test" + +# parameter of interest: name, value under h0, point estimate +population_param_of_interest(x::SpearmanCorrelationTest) = ("Spearman's ρ", 0.0, x.ρ) + +function show_params(io::IO, x::SpearmanCorrelationTest, ident) + println(io, ident, "Number of points: ", x.n) + println(io, ident, "Spearman's ρ: ", x.ρ) + println(io, ident, "S (Sum squared difference of ranks): ", x.S) + println(io, ident, "adjustment for ties in x: ", x.xtiesadj) + println(io, ident, "adjustment for ties in y: ", x.ytiesadj) +end + +function P_from_null_S_values(S_null, x::SpearmanCorrelationTest, tail::Symbol) + S_null_mean = mean(S_null) + # S is approximately normally distributed + # S and ρ are inversely proportional + S_null[:] = S_null .- S_null_mean # center + S_centered = x.S - S_null_mean # center + if tail == :both + modS = abs(S_centered) + mean(S_null .<= -modS) + mean(S_null .>= modS) + elseif tail == :right + mean(S_null .<= S_centered) + elseif tail == :left + mean(S_null .>= S_centered) + else + throw(ArgumentError("tail=$(tail) is invalid")) + end +end + +function spearman_P_exact(x::SpearmanCorrelationTest, tail::Symbol) + S_null = Float64[ spearman_S(perm, x.yrank) for perm in permutations(x.xrank) ] + P_from_null_S_values(S_null, x, tail) +end + +function spearman_P_sampling(x::SpearmanCorrelationTest, tail::Symbol) + # 360000 samples gives an se(P) < 0.0005 for P < 0.1 + X = copy(x.xrank) + S_null = Float64[ spearman_S(shuffle!(X), x.yrank) for sample in 1:360000 ] + P_from_null_S_values(S_null, x, tail) +end + +# Use estimated mean and std for the S null distribution as in: +# +# Press WH, Teukolsky SA, Vetterling WT, Flannery BP. +# Numerical recipes in C. +# Cambridge: Cambridge university press; 1996. +function spearman_P_estimated(x::SpearmanCorrelationTest, tail::Symbol) + N = float(x.n) + a = (N^3 - N) + # Numerical Recipes (14.6.6) + S_null_mean = (a/6.) - (x.xtiesadj/12.) - (x.ytiesadj/12.) + # Numerical Recipes (14.6.7) + S_null_std = sqrt( ((N - 1.)/36.) * (1. - (x.xtiesadj/a) ) * (1. - (x.ytiesadj/a) ) ) * N * (N + 1.) + zscore = (x.S - S_null_mean)/S_null_std + # S is approximately normally distributed + # S and ρ are inversely proportional + if tail == :both + cdf(Normal(), -abs(zscore)) + ccdf(Normal(), abs(zscore)) + elseif tail == :right + cdf(Normal(), zscore) + elseif tail == :left + ccdf(Normal(), zscore) + else + throw(ArgumentError("tail=$(tail) is invalid")) + end +end + +# Using T test for n > 10 as in: +# +# McDonald JH. +# Handbook of biological statistics. +# Baltimore, MD: Sparky House Publishing; 2009 Aug. +function spearman_P_ttest(x::SpearmanCorrelationTest, tail::Symbol) + ρ2 = x.ρ^2 + df = x.n-2 + t = sqrt((df*ρ2)/(1-ρ2)) + if tail == :both + cdf(TDist(df), -t) + ccdf(Normal(), t) + elseif tail == :right + x.ρ > 0 ? ccdf(TDist(df), t) : cdf(TDist(df), t) + elseif tail == :left + x.ρ > 0 ? cdf(TDist(df), t) : ccdf(TDist(df), t) + else + throw(ArgumentError("tail=$(tail) is invalid")) + end +end + +function pvalue(x::SpearmanCorrelationTest; tail::Symbol=:both, method::Symbol=:estimated) + if method ∉ Set(Symbol[:sampling, :exact, :estimated, :ttest]) + throw(ArgumentError("method=$(method) is invalid")) + end + if x.n <= 10 + # Exact P value using permutations + return(spearman_P_exact(x, tail)) + else + if method == :sampling + return(spearman_P_sampling(x, tail)) + elseif method == :exact + return(spearman_P_exact(x, tail)) + elseif method == :estimated + return(spearman_P_estimated(x, tail)) + elseif method == :ttest + return(spearman_P_ttest(x, tail)) + end + end +end + diff --git a/test/runtests.jl b/test/runtests.jl index 240638c8..e0f05726 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -9,4 +9,5 @@ include("t.jl") include("wilcoxon.jl") include("power_divergence.jl") include("anderson_darling.jl") +include("spearman.jl") include("common.jl") diff --git a/test/spearman.jl b/test/spearman.jl new file mode 100644 index 00000000..cc4aacfb --- /dev/null +++ b/test/spearman.jl @@ -0,0 +1,89 @@ +using HypothesisTests, Base.Test + +# Test Exact P value: n <= 10 +let x = [44.4, 45.9, 41.9, 53.3, 44.7, 44.1], + y = [2.6, 3.1, 2.5, 5.0, 3.6, 4.0] + + corr = HypothesisTests.SpearmanCorrelationTest(x, y) + + # R values + @test_approx_eq corr.ρ 0.6 + @test_approx_eq_eps HypothesisTests.pvalue(corr) 0.2417 0.0001 + @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:right) 0.1208 0.0001 + @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:left) 0.9125 0.0001 + + # Test throws + @test_throws AssertionError HypothesisTests.SpearmanCorrelationTest(x,y[1:5]) + + @test_throws ArgumentError HypothesisTests.pvalue(corr, tail=:greater, method=:exact) + @test_throws ArgumentError HypothesisTests.pvalue(corr, tail=:greater, method=:sampling) + @test_throws ArgumentError HypothesisTests.pvalue(corr, tail=:greater, method=:exact) + @test_throws ArgumentError HypothesisTests.pvalue(corr, tail=:greater, method=:ttest) + + @test_throws ArgumentError HypothesisTests.pvalue(corr, tail=:right, method=:R) +end + +show(IOBuffer(), + HypothesisTests.SpearmanCorrelationTest([44.4, 45.9, 41.9, 53.3, 44., 44.1], + [2.6, 3.1, 2.5, 5.0, 3.6, 4.0]) + ) + +let x = collect(1:11), + y = [6, 5, 4, 3, 2, 1, 7, 11, 10, 9, 8] + # https://stat.ethz.ch/pipermail/r-devel/2009-February/052112.html + # The correct P value is 0.03044548, R 3.2.5 gives 0.03036 + + srand(12345) # Seed for method=:sampling + + corr = HypothesisTests.SpearmanCorrelationTest(x, y) + + @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:right, method=:exact) 0.03044548 1e-8 + @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:right, method=:sampling) 0.030 1e-3 + @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:right, method=:estimated) 0.030 1e-3 + @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:right, method=:ttest) 0.03 1e-2 + + corr = HypothesisTests.SpearmanCorrelationTest(x, -y) + + @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:left, method=:exact) 0.03044548 1e-8 + @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:left, method=:sampling) 0.030 1e-3 + @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:left, method=:estimated) 0.030 1e-3 + @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:left, method=:ttest) 0.03 1e-2 +end + +let x = collect(1:10), + y = [5, 4, 3, 2, 1, 6, 10, 9, 8, 7] + + # R's pspearman: 0.05443067 is the exact value + corr = HypothesisTests.SpearmanCorrelationTest(x, y) + @test_approx_eq_eps HypothesisTests.pvalue(corr) 0.05443067 1e-8 +end + +# Using (N-1)N²(N+1)² overflows with N = 10153 and sqrt((N-1)N²(N+1)²) throws an error +# pvalue avoids the Int overflow using float(N) and sqrt(N-1)N(N+1) since N > 0 +srand(12345) # Seed for rand +let x = rand(10153) + + corr = SpearmanCorrelationTest(x, x) + + @test_approx_eq corr.ρ 1.0 + @test_approx_eq pvalue(corr) 0.0 + +end + +# Test S value with ties + +function rho_with_ties(S, N, tx, ty) # S == D + # Equation (14.6.5) from Numerical Recipes for rho with ties + a=(N^3)-N + (1-((6/a)*(S+(tx/12)+(ty/12)))) / (sqrt(1-(tx/a))*sqrt(1-(ty/a))) +end + +function diff_rho(x, y) + corr = SpearmanCorrelationTest(x, y) + corr.ρ - rho_with_ties(corr.S, corr.n, corr.xtiesadj, corr.ytiesadj) +end + +srand(12345) # Seed for rand +for i in 20:100 + @test_approx_eq_eps diff_rho(rand(1:10, i), rand(1:10, i)) 0.0 1e-10 +end