Merge pull request #184 from gwaygenomics/feature-select-docs

Add explicit documentation for variance_threshold freq_cut and unique_cut
cytomining · Feb 2, 2022 · 1279cd2 · 1279cd2
2 parents 4fee32d + 0841908
commit 1279cd2
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 12 deletions.
diff --git a/pycytominer/feature_select.py b/pycytominer/feature_select.py
@@ -67,9 +67,14 @@ def feature_select(
     corr_method : str, default "pearson"
         Correlation type to compute. Allowed methods are "spearman", "kendall" and "pearson".
     freq_cut : float, default 0.05
-        Ratio (2nd most common feature val / most common).
+        Ratio (2nd most common feature val / most common). Must range between 0 and 1.
+        Remove features with a ratio lower than freq_cut. A low freq_cut will remove
+        features that have a large difference between the most common and second most
+        common feature values. (e.g. this will remove a feature: [1, 1, 1, 1, 0.01, 0.01, ...])
     unique_cut: float, default 0.01
-        Ratio (num unique features / num samples).
+        Ratio (num unique features / num samples). Must range between 0 and 1.
+        Remove features with a ratio less than unique_cut. A low unique_cut will remove
+        features that have very few different measurements compared to the number of samples.
     compression_options : str or dict, optional
         Contains compression options as input to
         pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.

diff --git a/pycytominer/operations/variance_threshold.py b/pycytominer/operations/variance_threshold.py
@@ -11,8 +11,7 @@
 def variance_threshold(
     population_df, features="infer", samples="all", freq_cut=0.05, unique_cut=0.01
 ):
-    """
-    Exclude features that have low variance (low information content)
+    """Exclude features that have low variance (low information content)
 
     Parameters
     ----------
@@ -25,9 +24,14 @@ def variance_threshold(
     samples : list or str, default "all"
         List of samples to perform operation on. If "all", use all samples to calculate.
     freq_cut : float, default 0.05
-        Ratio (2nd most common feature val / most common).
+        Ratio (2nd most common feature val / most common). Must range between 0 and 1.
+        Remove features with a ratio lower than freq_cut. A low freq_cut will remove
+        features that have a large difference between the most common and second most
+        common feature values. (e.g. this will remove a feature: [1, 1, 1, 1, 0.01, 0.01, ...])
     unique_cut: float, default 0.01
-        Ratio (num unique features / num samples).
+        Ratio (num unique features / num samples). Must range between 0 and 1.
+        Remove features with a ratio less than unique_cut. A low unique_cut will remove
+        features that have very few different measurements compared to the number of samples.
 
     Returns
     -------
@@ -48,7 +52,7 @@ def variance_threshold(
 
     population_df = population_df.loc[:, features]
 
-    # Test if excluded for low frequency
+    # Exclude features with extreme (defined by freq_cut ratio) common values
     excluded_features_freq = population_df.apply(
         lambda x: calculate_frequency(x, freq_cut), axis="rows"
     )
@@ -57,7 +61,7 @@ def variance_threshold(
         excluded_features_freq.isna()
     ].index.tolist()
 
-    # Test if excluded for uniqueness
+    # Exclude features with too many (defined by unique_ratio) values in common
     n = population_df.shape[0]
     num_unique_features = population_df.nunique()
 
@@ -70,16 +74,18 @@ def variance_threshold(
 
 
 def calculate_frequency(feature_column, freq_cut):
-    """
-    Calculate frequency of second most common to most common feature.
+    """Calculate frequency ratio of second most common to most common feature value.
     Used in pandas.apply()
 
     Parameters
     ----------
     feature_column : pandas.core.series.series
         Pandas series of the specific feature in the population_df
-    freq_cut : float
-        Ratio (2nd most common feature val / most common).
+    freq_cut : float
+        Ratio (2nd most common feature val / most common). Must range between 0 and 1.
+        Remove features with a ratio lower than freq_cut. A low freq_cut will remove
+        features that have a large difference between the most common and second most
+        common feature values. (e.g. this will remove a feature: [1, 1, 1, 1, 0.01, 0.01, ...])
 
     Returns
     -------