diff --git a/pycytominer/feature_select.py b/pycytominer/feature_select.py index 518acf99..6050c658 100644 --- a/pycytominer/feature_select.py +++ b/pycytominer/feature_select.py @@ -67,9 +67,14 @@ def feature_select( corr_method : str, default "pearson" Correlation type to compute. Allowed methods are "spearman", "kendall" and "pearson". freq_cut : float, default 0.05 - Ratio (2nd most common feature val / most common). + Ratio (2nd most common feature val / most common). Must range between 0 and 1. + Remove features with a ratio lower than freq_cut. A low freq_cut will remove features + that have a large difference between the most common feature value and second most + common feature value. (e.g. this will remove a feature: [1, 1, 1, 1, 0.01, 0.01, ...]) unique_cut: float, default 0.01 - Ratio (num unique features / num samples). + Ratio (num unique features / num samples). Must range between 0 and 1. + Remove features with a ratio less than unique_cut. A low unique_cut will remove features + that have very few different measurements compared to the number of samples. compression_options : str or dict, optional Contains compression options as input to pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2. diff --git a/pycytominer/operations/variance_threshold.py b/pycytominer/operations/variance_threshold.py index 4044f3e4..6ba82d2b 100644 --- a/pycytominer/operations/variance_threshold.py +++ b/pycytominer/operations/variance_threshold.py @@ -11,8 +11,7 @@ def variance_threshold( population_df, features="infer", samples="all", freq_cut=0.05, unique_cut=0.01 ): - """ - Exclude features that have low variance (low information content) + """Exclude features that have low variance (low information content) Parameters ---------- @@ -25,9 +24,14 @@ def variance_threshold( samples : list or str, default "all" List of samples to perform operation on. If "all", use all samples to calculate. freq_cut : float, default 0.05 - Ratio (2nd most common feature val / most common). 
+ Ratio (2nd most common feature val / most common). Must range between 0 and 1. + Remove features with a ratio lower than freq_cut. A low freq_cut will remove features + that have a large difference between the most common feature value and second most + common feature value. (e.g. this will remove a feature: [1, 1, 1, 1, 0.01, 0.01, ...]) unique_cut: float, default 0.01 - Ratio (num unique features / num samples). + Ratio (num unique features / num samples). Must range between 0 and 1. + Remove features with a ratio less than unique_cut. A low unique_cut will remove features + that have very few different measurements compared to the number of samples. Returns ------- @@ -48,7 +52,7 @@ def variance_threshold( population_df = population_df.loc[:, features] - # Test if excluded for low frequency + # Exclude features with extreme (defined by freq_cut ratio) common values excluded_features_freq = population_df.apply( lambda x: calculate_frequency(x, freq_cut), axis="rows" ) @@ -57,7 +61,7 @@ def variance_threshold( excluded_features_freq.isna() ].index.tolist() - # Test if excluded for uniqueness + # Exclude features with too many (defined by unique_ratio) values in common n = population_df.shape[0] num_unique_features = population_df.nunique() @@ -70,16 +74,18 @@ def variance_threshold( def calculate_frequency(feature_column, freq_cut): - """ - Calculate frequency of second most common to most common feature. + """Calculate frequency of second most common to most common feature. Used in pandas.apply() Parameters ---------- feature_column : pandas.core.series.series Pandas series of the specific feature in the population_df - freq_cut : float - Ratio (2nd most common feature val / most common). + freq_cut : float + Ratio (2nd most common feature val / most common). Must range between 0 and 1. + Remove features with a ratio lower than freq_cut. A low freq_cut will remove features + that have a large difference between the most common feature value and second most + common feature value. (e.g. 
this will remove a feature: [1, 1, 1, 1, 0.01, 0.01, ...]) Returns -------