From 8799cea377dffd59bcb138727f54e7053887b27f Mon Sep 17 00:00:00 2001 From: Gregory Way Date: Wed, 2 Feb 2022 15:40:18 -0700 Subject: [PATCH 1/3] more explicit documentation for variance_threshold operation --- pycytominer/feature_select.py | 9 +++++++-- pycytominer/operations/variance_threshold.py | 20 ++++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/pycytominer/feature_select.py b/pycytominer/feature_select.py index 518acf99..6050c658 100644 --- a/pycytominer/feature_select.py +++ b/pycytominer/feature_select.py @@ -67,9 +67,14 @@ def feature_select( corr_method : str, default "pearson" Correlation type to compute. Allowed methods are "spearman", "kendall" and "pearson". freq_cut : float, default 0.05 - Ratio (2nd most common feature val / most common). + Ratio (2nd most common feature val / most common). Must range between 0 and 1. + Remove features lower than freq_cut. A low freq_cut will remove features + that have large difference between the most common feature and second most + common feature. (e.g. this will remove a feature: [1, 1, 1, 1, 0.01, 0.01, ...]) unique_cut: float, default 0.01 - Ratio (num unique features / num samples). + Ratio (num unique features / num samples). Must range between 0 and 1. + Remove features less than unique cut. A low unique_cut will remove features + that have very few different measurements compared to the number of samples. compression_options : str or dict, optional Contains compression options as input to pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2. diff --git a/pycytominer/operations/variance_threshold.py b/pycytominer/operations/variance_threshold.py index 4044f3e4..cfb7a23c 100644 --- a/pycytominer/operations/variance_threshold.py +++ b/pycytominer/operations/variance_threshold.py @@ -25,9 +25,14 @@ def variance_threshold( samples : list or str, default "all" List of samples to perform operation on. If "all", use all samples to calculate. 
freq_cut : float, default 0.05 - Ratio (2nd most common feature val / most common). + Ratio (2nd most common feature val / most common). Must range between 0 and 1. + Remove features lower than freq_cut. A low freq_cut will remove features + that have large difference between the most common feature and second most + common feature. (e.g. this will remove a feature: [1, 1, 1, 1, 0.01, 0.01, ...]) unique_cut: float, default 0.01 - Ratio (num unique features / num samples). + Ratio (num unique features / num samples). Must range between 0 and 1. + Remove features less than unique cut. A low unique_cut will remove features + that have very few different measurements compared to the number of samples. Returns ------- @@ -48,7 +53,7 @@ def variance_threshold( population_df = population_df.loc[:, features] - # Test if excluded for low frequency + # Exclude features with extreme (defined by freq_cut ratio) common values excluded_features_freq = population_df.apply( lambda x: calculate_frequency(x, freq_cut), axis="rows" ) @@ -57,7 +62,7 @@ def variance_threshold( excluded_features_freq.isna() ].index.tolist() - # Test if excluded for uniqueness + # Exclude features with too many (defined by unique_ratio) values in common n = population_df.shape[0] num_unique_features = population_df.nunique() @@ -78,8 +83,11 @@ def calculate_frequency(feature_column, freq_cut): ---------- feature_column : pandas.core.series.series Pandas series of the specific feature in the population_df freq_cut : float - Ratio (2nd most common feature val / most common). + Ratio (2nd most common feature val / most common). Must range between 0 and 1. + Remove features lower than freq_cut. A low freq_cut will remove features + that have large difference between the most common feature and second most + common feature. (e.g.
this will remove a feature: [1, 1, 1, 1, 0.01, 0.01, ...]) Returns ------- From b9ee5c91df281dc90a081520cedee40bdeb3e53a Mon Sep 17 00:00:00 2001 From: Gregory Way Date: Wed, 2 Feb 2022 15:52:05 -0700 Subject: [PATCH 2/3] docs on same docstring line --- pycytominer/operations/variance_threshold.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pycytominer/operations/variance_threshold.py b/pycytominer/operations/variance_threshold.py index cfb7a23c..401a22e5 100644 --- a/pycytominer/operations/variance_threshold.py +++ b/pycytominer/operations/variance_threshold.py @@ -11,8 +11,7 @@ def variance_threshold( population_df, features="infer", samples="all", freq_cut=0.05, unique_cut=0.01 ): - """ - Exclude features that have low variance (low information content) + """Exclude features that have low variance (low information content) Parameters ---------- @@ -75,8 +74,7 @@ def variance_threshold( def calculate_frequency(feature_column, freq_cut): - """ - Calculate frequency of second most common to most common feature. + """Calculate frequency of second most common to most common feature. Used in pandas.apply() Parameters From 0841908aab3ad9385f9826727e8482fbbbbe0b24 Mon Sep 17 00:00:00 2001 From: Gregory Way Date: Wed, 2 Feb 2022 15:54:39 -0700 Subject: [PATCH 3/3] Update pycytominer/operations/variance_threshold.py Co-authored-by: Niranj Chandrasekaran --- pycytominer/operations/variance_threshold.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pycytominer/operations/variance_threshold.py b/pycytominer/operations/variance_threshold.py index 401a22e5..6ba82d2b 100644 --- a/pycytominer/operations/variance_threshold.py +++ b/pycytominer/operations/variance_threshold.py @@ -26,8 +26,8 @@ def variance_threshold( freq_cut : float, default 0.05 Ratio (2nd most common feature val / most common). Must range between 0 and 1. Remove features lower than freq_cut. 
A low freq_cut will remove features - that have large difference between the most common feature and second most - common feature. (e.g. this will remove a feature: [1, 1, 1, 1, 0.01, 0.01, ...]) + that have a large difference between the most common feature value and second most + common feature value. (e.g. this will remove a feature: [1, 1, 1, 1, 0.01, 0.01, ...]) unique_cut: float, default 0.01 Ratio (num unique features / num samples). Must range between 0 and 1. Remove features less than unique cut. A low unique_cut will remove features