Merge pull request #40 from abstractqqq/debug_chi2
Debug chi2
abstractqqq authored Dec 26, 2023
2 parents 6cd6200 + f86c739 commit f8c466b
Showing 8 changed files with 481 additions and 104 deletions.
370 changes: 314 additions & 56 deletions examples/basics.ipynb

Large diffs are not rendered by default.

51 changes: 49 additions & 2 deletions python/polars_ds/str2.py
@@ -1,5 +1,5 @@
import polars as pl
from typing import Union, Optional
from typing import Union, Optional, Literal
from polars.utils.udfs import _get_shared_lib_location
from .type_alias import AhoCorasickMatchKind
import warnings
@@ -492,7 +492,7 @@ def jw(
self, other: Union[str, pl.Expr], weight: float = 0.1, parallel: bool = False
) -> pl.Expr:
"""
Computes the Jaro-Winker similarity between this and the other str.
Computes the Jaro-Winkler similarity between this and the other str.
Jaro-Winkler distance = 1 - Jaro-Winkler sim.
Parameters
@@ -550,6 +550,53 @@ def hamming(
is_elementwise=True,
)

def similar_to_vocab(
self,
vocab: list[str],
threshold: float,
metric: Literal["leven", "dleven", "jw", "osa"] = "leven",
strategy: Literal["avg", "all", "any"] = "avg",
) -> pl.Expr:
"""
Compare each word in the vocab with each word in self. Filters self to the words
that are similar to the words in the vocab.
Parameters
----------
vocab
Any iterable collection of strings
threshold
An entry is considered similar to the words in the vocabulary if the similarity
is at or above (>=) the threshold
metric
Which similarity metric to use. One of `leven`, `dleven`, `jw`, `osa`
strategy
If `avg`, returns true if the average similarity across the vocab is above the threshold.
If `all`, returns true only if the similarity to every word in the vocab is above
the threshold.
If `any`, returns true if the similarity to any word in the vocab is above
the threshold.
"""
if metric == "leven":
sims = [self.levenshtein(w, return_sim=True) for w in vocab]
elif metric == "dleven":
sims = [self.d_levenshtein(w, return_sim=True) for w in vocab]
elif metric == "osa":
sims = [self.osa(w, return_sim=True) for w in vocab]
elif sims == "jw":
sims = [self.jw(w, return_sim=True) for w in vocab]
else:
raise ValueError(f"Unknown metric for find_similar: {metric}")

if strategy == "all":
return pl.all_horizontal(s >= threshold for s in sims)
elif strategy == "any":
return pl.any_horizontal(s >= threshold for s in sims)
elif strategy == "avg":
return (pl.sum_horizontal(sims) / len(vocab)) >= threshold
else:
raise ValueError(f"Unknown strategy for find_similar: {strategy}")

def tokenize(self, pattern: str = r"(?u)\b\w\w+\b", stem: bool = False) -> pl.Expr:
"""
Tokenize the string according to the pattern. This will only extract the words
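For orientation, here is a minimal sketch of how the new `similar_to_vocab` expression can be used, mirroring the cell added to tests/test.ipynb below (the column name and vocabulary here are illustrative):

```python
import polars as pl
import polars_ds  # importing registers the .str2 namespace on string expressions

df = pl.DataFrame({"word": ["apple", "banana", "asasasas"]})

# Keep only the rows whose "word" is similar to at least one vocab entry.
out = df.filter(
    pl.col("word").str2.similar_to_vocab(
        vocab=["asasasa", "sasaaasss"],
        threshold=0.5,   # similar when similarity >= 0.5
        metric="leven",  # Levenshtein similarity; also: dleven, osa, jw
        strategy="any",  # any / all / avg over the vocab
    )
)
print(out)
```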
13 changes: 8 additions & 5 deletions src/num_ext/entrophies.rs
@@ -6,7 +6,6 @@ use pyo3_polars::derive::polars_expr;
// https://en.wikipedia.org/wiki/Sample_entropy
// https://en.wikipedia.org/wiki/Approximate_entropy


#[polars_expr(output_type=Float64)]
fn pl_approximate_entropy(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
// inputs[0] is radius, the rest are the shifted columns
@@ -33,16 +32,20 @@ fn pl_approximate_entropy(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
let data_1_view = data.slice(s![..n1, ..dim.abs_diff(1)]);
let tree = build_standard_kdtree(dim.abs_diff(1), leaf_size, &data_1_view)?;
let nb_in_radius = query_nb_cnt(&tree, data_1_view, &super::l_inf_dist, r, parallel);
let phi_m: f64 = nb_in_radius.into_no_null_iter()
.fold(0_f64, |acc, x| acc + (x as f64 / n1 as f64).ln()) / n1 as f64;
let phi_m: f64 = nb_in_radius
.into_no_null_iter()
.fold(0_f64, |acc, x| acc + (x as f64 / n1 as f64).ln())
/ n1 as f64;

// Step 3, 4, 5 for m + 1 in wiki
let n2 = n1.abs_diff(1);
let data_2_view = data.slice(s![..n2, ..]);
let tree = build_standard_kdtree(dim, leaf_size, &data_2_view)?;
let nb_in_radius = query_nb_cnt(&tree, data_2_view, &super::l_inf_dist, r, parallel);
let phi_m1: f64 = nb_in_radius.into_no_null_iter()
.fold(0_f64, |acc, x| acc + (x as f64 / n2 as f64).ln()) / n2 as f64;
let phi_m1: f64 = nb_in_radius
.into_no_null_iter()
.fold(0_f64, |acc, x| acc + (x as f64 / n2 as f64).ln())
/ n2 as f64;

// Output
Ok(Series::from_vec("", vec![(phi_m1 - phi_m).abs()]))
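For reference, the quantity these reformatted folds compute is the approximate-entropy estimator from the Wikipedia page linked above. In the code's terms, with n1 rows in the dimension-m embedding, n2 = n1 - 1 rows in the dimension-(m+1) embedding, and C_i the neighbor count within radius r returned by `query_nb_cnt` under the L-infinity metric:

```latex
\phi_m = \frac{1}{n_1}\sum_{i=1}^{n_1}\ln\frac{C_i^{m}(r)}{n_1},\qquad
\phi_{m+1} = \frac{1}{n_2}\sum_{i=1}^{n_2}\ln\frac{C_i^{m+1}(r)}{n_2},\qquad
\mathrm{ApEn} = \left|\phi_{m+1} - \phi_m\right|
```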
12 changes: 7 additions & 5 deletions src/num_ext/knn.rs
@@ -80,12 +80,13 @@ fn pl_knn_ptwise(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
.into_par_iter()
.map(|p| {
let s = p.to_slice().unwrap(); // C order makes sure rows are contiguous
// tree.nearest(s, k+1, &dist_func)
if let Ok(v) = tree.nearest(s, k+1, &dist_func) {
// tree.nearest(s, k+1, &dist_func)
if let Ok(v) = tree.nearest(s, k + 1, &dist_func) {
// By construction, this unwrap is safe.
// k + 1 because we include the point itself, and ask for k more neighbors.
Some(
v.into_iter().map(|(_, i)| id.get(*i).unwrap())
v.into_iter()
.map(|(_, i)| id.get(*i).unwrap())
.collect_vec(),
)
} else {
@@ -103,9 +104,10 @@ fn pl_knn_ptwise(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
} else {
for p in data.rows() {
let s = p.to_slice().unwrap(); // C order makes sure rows are contiguous
if let Ok(v) = tree.nearest(s, k+1, &dist_func) {
if let Ok(v) = tree.nearest(s, k + 1, &dist_func) {
// By construction, this unwrap is safe
let w: Vec<u64> = v.into_iter()
let w: Vec<u64> = v
.into_iter()
.map(|(_, i)| id.get(*i).unwrap())
.collect_vec();
builder.append_slice(w.as_slice());
6 changes: 3 additions & 3 deletions src/num_ext/tp_fp.rs
@@ -76,9 +76,9 @@ fn pl_combo_b(inputs: &[Series]) -> PolarsResult<Series> {
let threshold = threshold.get(0).unwrap();

if (actual.len() != predicted.len())
| actual.is_empty()
| predicted.is_empty()
| ((actual.null_count() + predicted.null_count()) > 0)
|| actual.is_empty()
|| predicted.is_empty()
|| ((actual.null_count() + predicted.null_count()) > 0)
{
return Err(PolarsError::ComputeError(
"Binary Metrics Combo: Input columns must be the same length, non-empty, and shouldn't contain nulls."
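The operator change above is semantic, not just stylistic: on `bool` values in Rust, `|` is the non-short-circuiting OR, so every emptiness and null-count check was evaluated unconditionally; `||` short-circuits, skipping the remaining checks as soon as one condition is already true.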
24 changes: 15 additions & 9 deletions src/stats_ext/chi2.rs
@@ -1,12 +1,12 @@
use super::{simple_stats_output, StatsResult};
use super::simple_stats_output;
use crate::stats::gamma;
use polars::prelude::*;
use pyo3_polars::derive::polars_expr;

#[polars_expr(output_type_func=simple_stats_output)]
fn pl_chi2(inputs: &[Series]) -> PolarsResult<Series> {
let s1_name = inputs[0].name();
let s2_name = inputs[1].name();
let s1_name = "s1";
let s2_name = "s2";

let u1 = inputs[0].unique()?;
let u1_len = u1.len();
@@ -18,7 +18,9 @@ fn pl_chi2(inputs: &[Series]) -> PolarsResult<Series> {
let cross = df1.cross_join(df2);

// Create a "fake" contigency table
let df3 = DataFrame::from_iter(inputs[0..2].to_vec())
let s1 = inputs[0].clone();
let s2 = inputs[1].clone();
let df3 = df!(s1_name => s1, s2_name => s2)?
.lazy()
.group_by([col(s1_name), col(s2_name)])
.agg([count().alias("ob")]);
@@ -50,12 +52,16 @@ fn pl_chi2(inputs: &[Series]) -> PolarsResult<Series> {
// Get the statistic
let out = final_df.drop_in_place("output")?;
let stats = out.f64()?;
let stats = stats.get(0).unwrap();
let stats = stats.get(0).unwrap_or(f64::NAN);
// Compute p value. It is a special case of Gamma distribution
let dof = u1_len.abs_diff(1) * u2_len.abs_diff(1);
let (shape, rate) = (dof as f64 / 2., 0.5);
let p = gamma::sf(stats, shape, rate).map_err(|e| PolarsError::ComputeError(e.into()));
let p = p?;
let p = if stats.is_nan() {
f64::NAN
} else {
let dof = u1_len.abs_diff(1) * u2_len.abs_diff(1);
let (shape, rate) = (dof as f64 / 2., 0.5);
let p = gamma::sf(stats, shape, rate).map_err(|e| PolarsError::ComputeError(e.into()));
p?
};
// Get output
let s = Series::from_vec("statistic", vec![stats]);
let p = Series::from_vec("pvalue", vec![p]);
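The p-value logic relies on the chi-square distribution with k degrees of freedom being a special case of the Gamma distribution, which is why the code passes `(shape, rate) = (dof / 2, 0.5)` to `gamma::sf` (the survival function). With u1, u2 the unique values of the two input columns:

```latex
k = (\lvert u_1\rvert - 1)(\lvert u_2\rvert - 1),\qquad
\chi^2_k = \mathrm{Gamma}\!\left(\mathrm{shape}=\tfrac{k}{2},\ \mathrm{rate}=\tfrac{1}{2}\right),\qquad
p = \Pr\!\left(X \ge \chi^2_{\mathrm{obs}}\right) = 1 - F_{k/2,\,1/2}\!\left(\chi^2_{\mathrm{obs}}\right)
```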
44 changes: 29 additions & 15 deletions src/stats_ext/fstats.rs
@@ -16,15 +16,27 @@ fn ftest(x: f64, f1: f64, f2: f64) -> Result<StatsResult, String> {
}

/// An internal helper function to compute f statistic for F test, with the option to compute
/// the p value too. It shouldn't be used outside.
/// When return_p is false, returns a Vec
/// the p value too. It shouldn't be used outside. The API is bad for outsiders to use.
/// When return_p is false, returns a Vec with f stats.
/// When return_p is true, returns a Vec that has x_0, .., x_{n-1} = f_0, .., f_{n-1}
/// where n = inputs.len() - 1 = number of features
/// x_0, .., x_{n-1} are the same as before, but
/// x_n, .., x_{2n - 2} = p_0, .., p_{n-1}, are the p values.
/// And additionally x_n, .., x_{2n - 2} = p_0, .., p_{n-1}, are the p values.
fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult<Vec<f64>> {
let target = inputs[0].name();
let df = DataFrame::new(inputs.to_vec())?.lazy();
let target = "target";
let v = inputs
.into_iter()
.enumerate()
.map(|(i, s)| {
if i == 0 {
s.clone().with_name(target)
} else {
s.clone().with_name(i.to_string().as_str())
}
})
.collect_vec();
let n_cols = v.len();

let df = DataFrame::new(v)?.lazy();
// inputs[0] is the group
// all the rest should be numerical
let mut step_one: Vec<Expr> = Vec::with_capacity(inputs.len() * 2 - 1);
@@ -38,11 +50,12 @@ fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult<Vec<f64>> {
.cast(DataType::UInt32),
);

for s in &inputs[1..] {
let name = s.name();
let n_sum = format!("{}_sum", name);
for i in 1..n_cols {
let name = i.to_string();
let name = name.as_str();
let n_sum = format!("{}_sum", i);
let n_sum = n_sum.as_str();
let n_var = format!("{}_var", name);
let n_var = format!("{}_var", i);
let n_var = n_var.as_str();
step_one.push(col(name).sum().alias(n_sum));
step_one.push(col(name).var(0).alias(n_var));
@@ -56,7 +69,7 @@ fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult<Vec<f64>> {
}

let mut reference = df
.group_by([target])
.group_by([col(target)])
.agg(step_one)
.select(step_two)
.collect()?;
@@ -65,18 +78,19 @@ fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult<Vec<f64>> {
let n_classes = reference.drop_in_place("n_classes")?;
let n_samples = n_samples.u32()?;
let n_classes = n_classes.u32()?;
let n_samples = n_samples.get(0).unwrap();
let n_samples = n_samples.get(0).unwrap_or(0);
let n_classes = n_classes.get(0).unwrap_or(0);

if n_classes <= 1 {
if n_classes <= 1 || n_samples <= 1 {
return Err(PolarsError::ComputeError(
"Number of classes is either 1 or 0, which is invalid.".into(),
"Number of classes, or number of samples is either 1 or 0, which is invalid.".into(),
));
}

let df_btw_class = n_classes.abs_diff(1) as f64;
let df_in_class = n_samples.abs_diff(n_classes) as f64;

// fstats is 2D
let fstats = reference.to_ndarray::<Float64Type>(IndexOrder::default())?;
let scale = df_in_class / df_btw_class;

@@ -130,7 +144,7 @@ fn pl_f_stats(inputs: &[Series]) -> PolarsResult<Series> {
/// and inputs[1] as the column to run F-test. There should be only two columns.
#[polars_expr(output_type_func=simple_stats_output)]
fn pl_f_test(inputs: &[Series]) -> PolarsResult<Series> {
// Since inputs only has 1 feature, this has to be a size 2 vec.
// The variable res has 2 values, the test statistic and p value.
let res = _f_stats(&inputs[..2], true)?;
let s = Series::from_vec("statistic", vec![res[0]]);
let p = Series::from_vec("pvalue", vec![res[1]]);
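For context, the statistic `_f_stats` assembles per feature is the textbook one-way ANOVA F statistic; the code's `df_btw_class = K - 1` and `df_in_class = N - K` are its two degrees of freedom (K classes, N samples), and `scale` applies their ratio to the sums and variances aggregated in `step_one`/`step_two`:

```latex
F = \frac{\text{between-class variability}}{\text{within-class variability}}
  = \frac{\sum_{k=1}^{K} n_k\,(\bar{x}_k - \bar{x})^2 \,/\, (K - 1)}
         {\sum_{k=1}^{K}\sum_{i \in k}(x_{ik} - \bar{x}_k)^2 \,/\, (N - K)}
```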
65 changes: 56 additions & 9 deletions tests/test.ipynb
@@ -9,7 +9,39 @@
"source": [
"import polars as pl\n",
"import numpy as np\n",
"import polars_ds as pld"
"# import polars_ds as pld"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a3dbd5e",
"metadata": {},
"outputs": [],
"source": [
"import polars_ds as pld\n",
"df = pl.DataFrame({\n",
" \"word\":[\"apple\", \"banana\", \"pineapple\", \"asasasas\", \"sasasass\"],\n",
" \"other_data\": [1,2,3,4,5]\n",
"})\n",
"gibberish = [\"asasasa\", \"sasaaasss\", \"asdasadadfa\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73ca40b3",
"metadata": {},
"outputs": [],
"source": [
"df.filter(\n",
" pl.col(\"word\").str2.similar_to_vocab(\n",
" vocab = gibberish,\n",
" threshold = 0.5,\n",
" metric = \"leven\", # Levenshtein similarity. Other options: dleven, osa, jw\n",
" strategy = \"any\" # True if the word is similar to any word in vocab. Other options: \"all\", \"avg\"\n",
" )\n",
")"
]
},
{
@@ -59,17 +91,32 @@
"metadata": {},
"outputs": [],
"source": [
"size = 1000\n",
"size = 5000\n",
"df = pl.DataFrame({\n",
" \"id\": range(size),\n",
" \"val1\": np.random.random(size=size),\n",
" \"val2\": np.random.random(size=size),\n",
" \"val3\": np.random.random(size=size),\n",
" \"val4\": np.random.random(size=size),\n",
" \"market_id\": range(size),\n",
" \"group1\": np.random.random(size=size),\n",
" \"group2\": np.random.random(size=size),\n",
" \"category_1\": np.random.randint(low=0, high=5, size=size),\n",
" \"category_2\":np.random.randint(low=0, high=10, size=size)\n",
"}).with_columns(\n",
" pl.col(\"id\").mod(5)\n",
" pl.col(\"market_id\").mod(3)\n",
")\n",
"df.head(10)"
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1d84105b",
"metadata": {},
"outputs": [],
"source": [
"# In segment T-test, chi2 test, F test made easy!\n",
"df.group_by(\"market_id\").agg(\n",
" pl.col(\"group1\").stats.ttest_ind(pl.col(\"group2\"), equal_var = True).alias(\"t-test\"),\n",
" pl.col(\"category_1\").stats.chi2(pl.col(\"category_2\")).alias(\"chi2-test\"),\n",
" pl.col(\"category_1\").stats.f_test(pl.col(\"group1\")).alias(\"f-test\")\n",
")"
]
},
