add docs & add examples

szcf-weiya · Oct 17, 2024 · b7103e6 · b7103e6
1 parent 62f42f0
commit b7103e6
Show file tree

Hide file tree

Showing 12 changed files with 228 additions and 58 deletions.
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -2,4 +2,3 @@
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
-SplitClusterTest = "3179d415-6c90-44ae-a551-1cda3561f97d"
diff --git a/docs/make.jl b/docs/make.jl
@@ -5,7 +5,7 @@ using SplitClusterTest
 using Literate
 indir = joinpath(@__DIR__, "..", "examples")
 outdir = joinpath(@__DIR__, "src", "examples")
-files = ["demo-mirror.jl"]
+files = ["two-gaussians.jl", "two-poissons.jl", "cont-poissons.jl"]
 for file in files
     Literate.markdown(joinpath(indir, file), outdir; credit = false)
 end
@@ -14,7 +14,9 @@ makedocs(sitename="SplitClusterTest.jl",
         pages = [
             "Home" => "index.md",
             "Examples" => [
-                "Mirror Statistics" => "examples/demo-mirror.md"
+                "Two Gaussians" => "examples/two-gaussians.md",
+                "Two Poissons" => "examples/two-poissons.md",
+                "Continuous Poissons (Linear Pseduotime)" => "examples/cont-poissons.md"
             ],
             "API" => "api.md"
         ]        

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -1,5 +1,11 @@
 # SplitClusterTest.jl Documentation
 
-> Wang, L., Lin, Y., & Zhao, H. (2024). False Discovery Rate Control via Data Splitting for Testing-after-Clustering (arXiv:2410.06451). arXiv. https://doi.org/10.48550/arXiv.2410.06451
+> Wang, L., Lin, Y., & Zhao, H. (2024). False Discovery Rate Control via Data Splitting for Testing-after-Clustering (arXiv:2410.06451). arXiv. <https://doi.org/10.48550/arXiv.2410.06451>
 >
 
+
+Testing for differences in features between clusters in various applications often leads to inflated false positives when practitioners use the same dataset to identify clusters and then test features, an issue commonly known as “double dipping”. 
+
+To address this challenge, inspired by data-splitting strategies for controlling the false discovery rate (FDR) in regressions ([Dai et al., 2023](https://www.tandfonline.com/doi/abs/10.1080/01621459.2022.2060113)), we present a novel method that applies data-splitting to control FDR while maintaining high power in unsupervised clustering. 
+
+We first divide the dataset into two halves, then apply the conventional testing-after-clustering procedure to each half separately and combine the resulting test statistics to form a new statistic for each feature. The new statistic can help control the FDR due to its property of having a sampling distribution that is symmetric around zero for any null feature.
diff --git a/examples/cont-poissons.jl b/examples/cont-poissons.jl
@@ -0,0 +1,22 @@
+# This section demonstrates the data splitting procedure for selecting relevant features when there exists latent linear pseduotime under the Poisson setting. 
+
+using SplitClusterTest
+using Plots
+
+
+x, cl = gen_data_pois(1000, 2000, 0.5, prop_imp=0.1, type = "continuous")
+
+# Plot the first two PCs of X, and color each point by the pseduotime variable `cl`
+pc1, pc2 = first_two_PCs(x)
+scatter(pc1, pc2, marker_z = cl, label = "")
+
+# Adopt the data splitting procedure to select the relevant features.
+ms = ds(x, ret_ms = true, type = "continuous");
+τ = calc_τ(ms)
+
+# the mirror statistics of relevant features tend to be larger and away from null features, 
+# where the null features still exhibit a symmetric distribution about zero. 
+# Then we can properly take the cutoff to control the FDR, as shown by the red vertical line.
+histogram(ms, label = "")
+Plots.vline!([τ], label = "", lw = 3)
+
diff --git a/examples/demo-mirror.jl b/examples/demo-mirror.jl
diff --git a/examples/two-gaussians.jl b/examples/two-gaussians.jl
@@ -0,0 +1,40 @@
+# This section demonstrates the data splitting procedure for selecting relevant features when there exists (or no) cluster structure under the Gaussian setting. 
+
+using SplitClusterTest
+using Plots
+
+
+# ## Without cluster structure 
+x, cl = gen_data_normal(1000, 2000, 0.0, prop_imp=0.1);
+
+# Plot the first two PCs of X
+pc1, pc2 = first_two_PCs(x)
+scatter(pc1[cl .== 1], pc2[cl .== 1])
+scatter!(pc1[cl .== 2], pc2[cl .== 2])
+
+# perform the data splitting procedure for selecting relevant features
+ms = ds(x, ret_ms = true);
+
+# the mirror statistics are symmetric about zero since all features are null features. 
+histogram(ms, label = "")
+
+
+# ## With cluster structure
+
+x, cl = gen_data_normal(1000, 2000, 0.5, prop_imp=0.1);
+
+# Plot the first two PCs of X
+pc1, pc2 = first_two_PCs(x)
+scatter(pc1[cl .== 1], pc2[cl .== 1])
+scatter!(pc1[cl .== 2], pc2[cl .== 2])
+
+
+# the mirror statistics of relevant features tend to be larger and away from null features, 
+# where the null features still exhibit a symmetric distribution about zero. 
+ms = ds(x, ret_ms = true);
+
+# Then we can properly take the cutoff to control the FDR, as shown by the red vertical line.
+τ = calc_τ(ms)
+histogram(ms, label = "")
+Plots.vline!([τ], label = "", lw = 3)
+
diff --git a/examples/two-poissons.jl b/examples/two-poissons.jl
@@ -0,0 +1,23 @@
+# This section demonstrates the data splitting procedure for selecting relevant features when there exists cluster structure under the Poisson setting. 
+
+using SplitClusterTest
+using Plots
+
+
+x, cl = gen_data_pois(1000, 2000, 0.5, prop_imp=0.1, type = "discrete")
+
+# Plot the first two PCs of X
+pc1, pc2 = first_two_PCs(x)
+scatter(pc1[cl .== 0], pc2[cl .== 0])
+scatter!(pc1[cl .== 1], pc2[cl .== 1])
+
+# Adopt the data splitting procedure to select the relevant features.
+ms = ds(x, ret_ms = true);
+τ = calc_τ(ms)
+
+# the mirror statistics of relevant features tend to be larger and away from null features, 
+# where the null features still exhibit a symmetric distribution about zero. 
+# Then we can properly take the cutoff to control the FDR, as shown by the red vertical line.
+histogram(ms, label = "")
+Plots.vline!([τ], label = "", lw = 3)
+
diff --git a/src/SplitClusterTest.jl b/src/SplitClusterTest.jl
@@ -9,6 +9,7 @@ export gen_data_normal,
        ds,
        mds,
        calc_τ,
-       calc_acc
+       calc_acc,
+       first_two_PCs
 
 end # module SplitClusterTest
diff --git a/src/data.jl b/src/data.jl
@@ -37,7 +37,22 @@ function fixcorr_s1(ρ::Real, p::Int, p1::Int; corr01 = 1)
     return Σ
 end
 
-function gen_data_normal(n::Int, p::Int, δ = 1.0; prop_imp = 0.1, 
+"""
+    gen_data_normal(n::Int, p::Int, δ::Float64; prop_imp = 0.1, corr_structure = "ind", ρ = 0.9, sigma = 0, ...)
+
+Generate `n` samples with `p` features from two Gaussian distributions. 
+
+- `prop_imp`: the proportion of relevant features
+- `corr_structure`: the correlation structure, possible choices:
+    - `ind`: independent
+    - `ar1`: AR(1) structure
+    - `fixcorr`: fixed correlation
+    - `fixcorr_s1_ind`: fixed correlation among relevant features, and no correlation between the null features and relevant features
+    - `fixcorr_s1`: fixed correlation among relevant features, and maximum correlation between the null features and relevant features such that the correlation matrix is positive definite.
+- `ρ`: the correlation coefficient
+- `sigma`: the noise level on the signal strength
+"""
+function gen_data_normal(n::Int, p::Int, δ::Float64; prop_imp = 0.1, 
                                 corr_structure = "ind", ρ = 0.9,
                                 prop_cl1 = 0.5,
                                 order = 1,
@@ -77,6 +92,15 @@ function gen_data_normal(n::Int, p::Int, δ = 1.0; prop_imp = 0.1,
     return Matrix(x), cl
 end
 
+"""
+    gen_data_pois(Λ::AbstractMatrix; ρ = 0.5, block_size = 10)
+
+Generate Poisson samples of the same size `Λ`, which is the mean values of each element. 
+The features can be correlated via the Gaussian Copula, whose correlation matrix is block-diagonal AR(1) structure.
+
+- `ρ`: the correlation coefficient
+- `block_size`: the size of each block in the correlation matrix
+"""
 function gen_data_pois(Λ::AbstractMatrix; ρ = 0.5, block_size = 10)
     n, p = size(Λ)
     Σ = ar1(ρ, block_size)
@@ -95,7 +119,18 @@ function gen_data_pois(Λ::AbstractMatrix; ρ = 0.5, block_size = 10)
     return X
 end
 
-function gen_data_pois(; n = 1000, p = 2000, prop_imp = 0.1, ρ = 0.5, δ = 1.0, block_size = 10, type = "discrete", sigma = 0)
+"""
+    gen_data_pois(n::Int, p::Int, δ::Float64; prop_imp = 0.1, ρ = 0.5, block_size = 10, type = "discrete", sigma = 0)
+
+Generate `n` samples with `p` features from Poisson distributions
+
+- `type`: if `discrete`, then generate from two Poisson distributions; otherwise, each sample has a different mean vector, which forms the linear pseduotime data.
+- `prop_imp`: the proportion of relevant features
+- `sigma`: the noise level on the signal strength
+- `ρ`: the correlation coefficient. If non-zero, take the Gaussian Copula with AR(1) correlation structure with `ρ`
+- `block_size`: the block size when construct the correlation matrix, since the Copula of high dimension is computational expensive.
+"""
+function gen_data_pois(n::Int, p::Int, δ::Float64; prop_imp = 0.1, ρ = 0.5, block_size = 10, type = "discrete", sigma = 0)
     if type == "discrete"
         L = sample(0:1, n)
     else
@@ -108,6 +143,6 @@ function gen_data_pois(; n = 1000, p = 2000, prop_imp = 0.1, ρ = 0.5, δ = 1.0,
     logΛ = LFT .+ log(3) + randn(n, p) * sigma  # intercept
     Λ = exp.(logΛ)
     X = gen_data_pois(Λ, ρ = ρ, block_size = block_size)
-    return X
+    return X, L
 end
 
diff --git a/src/ds.jl b/src/ds.jl
@@ -4,6 +4,11 @@ using RCall
 using LinearAlgebra
 using Distributions
 
+"""
+    calc_τ(ms::AbstractVector, q::Float64 = 0.05, offset::Int = 1)
+
+Calculate the cutoff of the mirror statistics `ms` given the nominal FDR level `q`. It is recommended to take `offset = 1` in the numerator, as discussed in the knockoff paper.
+"""
 function calc_τ(ms::AbstractVector, q::Float64 = 0.05, offset::Int = 1)
     ts = sort(abs.(ms))
     ret = [maximum(ms)]
@@ -52,6 +57,10 @@ function cluster_diff(x::AbstractMatrix, cl::AbstractVector{Int64}; method = "si
     return ds
 end
 
+function pval_glm(x, t)
+   return rcopy(R"summary(glm($x ~ $t, family = 'poisson'))$coefficients[2, 4]")
+end
+
 function calc_sign_pvals(X::AbstractMatrix, t::AbstractVector)
     idx = t .!= -1
     if sum(idx) == 0
@@ -64,7 +73,7 @@ function calc_sign_pvals(X::AbstractMatrix, t::AbstractVector)
     pvals = zeros(p)
     for i = 1:p
         mean_diff = mean(X[idx, i][ii]) - mean(X[idx, i][.!ii])
-        pval = try rcopy(R"pval_glm"(X[idx, i], t[idx]))
+        pval = try pval_glm(X[idx, i], t[idx])
         catch e
             @warn e
             1
@@ -77,20 +86,38 @@ function calc_sign_pvals(X::AbstractMatrix, t::AbstractVector)
 end
 
 cl_kmeans(x::AbstractMatrix; kw...) = kmeans(x', 2; kw...).assignments
+cl_rkmeans(x::AbstractMatrix; kw...) = rcopy(R"kmeans($x, 2)$cluster")
+
 
 function ti_pca(xx::AbstractMatrix)
-    xx = log.(x .+ 1)
+    xx = log.(xx .+ 1)
     t = svd(xx .- mean(xx, dims = 1)).U[:, 1]
     return t
 end
 
+"""
+    ds(::AbstractMatrix; ...)
+
+Select with the nominal FDR level `q` via a single data splitting on data matrix `x`.
+
+- `q`: nominal FDR level
+- `signal_measure`: the signal measurement
+- `ret_ms`: if `true`, then return the mirror statistics; otherwise, return the selection set
+- `type`: if `discrete`, perform the testing after clustering into two groups; 
+        otherwise, perform the testing along pseduotime after estimating the pseduotime
+- `cl_method`: the function for clustering into two groups (only used if `type == discrete`)
+- `ti_method`: the function for estimating the pseduotime (only used if `type != discrete`)
+- `oracle_label`: if provided (it is `nothing` by default), the accuracy of the clustering will be calculated. 
+- `kmeans_whiten`: whether to perform whitening
+- `Σ`: used for kmeans whitening
+"""
 function ds(x::AbstractMatrix; q = 0.05, signal_measure = "tstat", 
-                plt = true, ret_ms = false, 
+                ret_ms = false, 
                 type = "discrete", 
-                cl_method::Function = cl_kmeans, 
+                cl_method::Function = cl_rkmeans, 
                 ti_method::Function = ti_pca, 
                 oracle_label = nothing,
-                kmeans_whiten = false, Σ = nothing, λ = 1e-6)
+                kmeans_whiten = false, Σ = nothing)
     n = size(x, 1)
     n2 = round(Int, n/2)
     idx1 = sample(1:n, n2, replace = false)
@@ -100,7 +127,7 @@ function ds(x::AbstractMatrix; q = 0.05, signal_measure = "tstat",
     if kmeans_whiten
         p = size(x, 2)
         if isnothing(Σ)
-            Σ = est_Σ(x) + λ * 1.0I
+            Σ = est_Σ(x) + 1e-6 * 1.0I
             #Σ = cov(x) + λ * 1.0I
         end
         ev = eigen(Σ)
@@ -137,19 +164,7 @@ function ds(x::AbstractMatrix; q = 0.05, signal_measure = "tstat",
     end
     τ = calc_τ(ms, q)
     m_sel = findall(ms .> τ)
-    if plt
-        @info "selected features: " m_sel'
-        @info "mirror stat: " ms[m_sel]'
-        @info "mirror stat of first 20: " ms[1:20]'
-        @info "d1 of first 20: " d1[1:20]'
-        @info "d2 of first 20: " d2[1:20]'
-        @info "selected features (including negative): " findall(abs.(ms) .> τ)'
-        fig = histogram(ms, label = "")
-        Plots.vline!(fig, [τ], label = "", lw = 3)
-        return fig
-    else
-        return m_sel
-    end
+    return m_sel
 end
 
 
@@ -226,16 +241,20 @@ function sel_inc_rate(inc_rate::AbstractArray{T}; q::Float64 = 0.05, tol = 1e-10
     end
 end
 
-function mds(x::AbstractMatrix; M = 10, q = 0.05, signal_measure = "tstat", plt = false, 
+"""
+    mds(x::AbstractMatrix; M = 10, ...)
+
+Select with the nominal FDR level `q` via `M` times data splitting on data matrix `x`. All paramaters except `M` are passed to `ds`.
+"""
+function mds(x::AbstractMatrix; M = 10, q = 0.05, signal_measure = "tstat", 
                 type = "discrete", 
-                qm = 0.05, # dummy to be consistent with mds5
-                cl_method = cl_kmeans, 
+                cl_method = cl_rkmeans, 
                 ti_method = ti_pca,
                 kmeans_whiten = false, kw...)
     if M == 1
-        return ds(x; ret_ms = false, plt = false, q = q, signal_measure = signal_measure, type = type, ti_method = ti_method, cl_method = cl_method, kmeans_whiten = kmeans_whiten, kw...)
+        return ds(x; ret_ms = false, q = q, signal_measure = signal_measure, type = type, ti_method = ti_method, cl_method = cl_method, kmeans_whiten = kmeans_whiten, kw...)
     end
-    mss = [ds(x; ret_ms = true, plt = false, q = q, signal_measure = signal_measure, type = type, ti_method = ti_method, cl_method = cl_method, kmeans_whiten = kmeans_whiten, kw...) for _ in 1:M]
+    mss = [ds(x; ret_ms = true, q = q, signal_measure = signal_measure, type = type, ti_method = ti_method, cl_method = cl_method, kmeans_whiten = kmeans_whiten, kw...) for _ in 1:M]
     τs = [calc_τ(_ms, q) for _ms in mss]
     inc_rate = calc_inc_rate(mss, τs, sum_in_denom = true)
     ret = sel_inc_rate(inc_rate)

diff --git a/src/utils.jl b/src/utils.jl
@@ -1,3 +1,8 @@
+"""
+    calc_acc(pred::AbstractVector, truth::AbstractVector)
+
+Calculate the accuracy (FDR, F1 score, Precision, Power) for the predicted selection `pred` given the `truth`.
+"""
 function calc_acc(pred::AbstractVector{T}, truth::AbstractVector{T}) where T <: Real
     if length(truth) == 0 # NULL
         if length(pred) == 0
@@ -23,6 +28,17 @@ function calc_acc(pred::AbstractVector{T}, truth::AbstractVector{T}) where T <:
     return fdr, f1, precision, recall
 end
 
+"""
+    first_two_PCs(x::AbstractMatrix)
+
+Calculate the first two principal components of data matrix `x`.
+"""
+function first_two_PCs(x::AbstractMatrix)
+    xc = x .- mean(x, dims = 1)
+    U, S, V = svd(xc)
+    return U[:, 1] * S[1], U[:, 2] * S[2]
+end
+
 function est_Σ(x::AbstractMatrix)
     n, p = size(x)
     cl = rcopy(R"kmeans($x, 2, nstart = 10)$cluster") # the accuracy is low