better examples

abstractqqq · Dec 26, 2023 · f86c739 · f86c739
1 parent 24fd91b
commit f86c739
Show file tree

Hide file tree

Showing 7 changed files with 296 additions and 162 deletions.
diff --git a/examples/basics.ipynb b/examples/basics.ipynb
diff --git a/python/polars_ds/str2.py b/python/polars_ds/str2.py
@@ -1,5 +1,5 @@
 import polars as pl
-from typing import Union, Optional
+from typing import Union, Optional, Literal
 from polars.utils.udfs import _get_shared_lib_location
 from .type_alias import AhoCorasickMatchKind
 import warnings
@@ -492,7 +492,7 @@ def jw(
         self, other: Union[str, pl.Expr], weight: float = 0.1, parallel: bool = False
     ) -> pl.Expr:
         """
-        Computes the Jaro-Winker similarity between this and the other str.
+        Computes the Jaro-Winkler similarity between this and the other str.
         Jaro-Winkler distance = 1 - Jaro-Winkler sim.
 
         Parameters
@@ -550,6 +550,53 @@ def hamming(
             is_elementwise=True,
         )
 
+    def similar_to_vocab(
+        self,
+        vocab: list[str],
+        threshold: float,
+        metric: Literal["leven", "dleven", "jw", "osa"] = "leven",
+        strategy: Literal["avg", "all", "any"] = "avg",
+    ) -> pl.Expr:
+        """
+        Compare each word in the vocab with the each word in self. Filters self to the words
+        that are most similar to the words in the vocab.
+
+        Parameters
+        ----------
+        vocab
+            Any iterable collection of strings
+        threshold
+            A entry is considered similar to the words in the vocabulary if the similarity
+            is above (>=) the threshold
+        metric
+            Which similarity metric to use. One of `leven`, `dleven`, `jw`, `osa`
+        strategy
+            If `avg`, then will return true if the average similarity is above the threshold.
+            If `all`, then will return true if the similarity to all words in the vocab is above
+            the threshold.
+            If `any`, then will return true if the similarity to any words in the vocab is above
+            the threshold.
+        """
+        if metric == "leven":
+            sims = [self.levenshtein(w, return_sim=True) for w in vocab]
+        elif metric == "dleven":
+            sims = [self.d_levenshtein(w, return_sim=True) for w in vocab]
+        elif metric == "osa":
+            sims = [self.osa(w, return_sim=True) for w in vocab]
+        elif sims == "jw":
+            sims = [self.jw(w, return_sim=True) for w in vocab]
+        else:
+            raise ValueError(f"Unknown metric for find_similar: {metric}")
+
+        if strategy == "all":
+            return pl.all_horizontal(s >= threshold for s in sims)
+        elif strategy == "any":
+            return pl.any_horizontal(s >= threshold for s in sims)
+        elif strategy == "avg":
+            return (pl.sum_horizontal(sims) / len(vocab)) >= threshold
+        else:
+            raise ValueError(f"Unknown strategy for find_similar: {strategy}")
+
     def tokenize(self, pattern: str = r"(?u)\b\w\w+\b", stem: bool = False) -> pl.Expr:
         """
         Tokenize the string according to the pattern. This will only extract the words

diff --git a/src/num_ext/entrophies.rs b/src/num_ext/entrophies.rs
@@ -6,7 +6,6 @@ use pyo3_polars::derive::polars_expr;
 // https://en.wikipedia.org/wiki/Sample_entropy
 // https://en.wikipedia.org/wiki/Approximate_entropy
 
-
 #[polars_expr(output_type=Float64)]
 fn pl_approximate_entropy(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
     // inputs[0] is radius, the rest are the shifted columns
@@ -33,16 +32,20 @@ fn pl_approximate_entropy(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResu
     let data_1_view = data.slice(s![..n1, ..dim.abs_diff(1)]);
     let tree = build_standard_kdtree(dim.abs_diff(1), leaf_size, &data_1_view)?;
     let nb_in_radius = query_nb_cnt(&tree, data_1_view, &super::l_inf_dist, r, parallel);
-    let phi_m: f64 = nb_in_radius.into_no_null_iter()
-        .fold(0_f64, |acc, x| acc + (x as f64 / n1 as f64).ln()) / n1 as f64;
+    let phi_m: f64 = nb_in_radius
+        .into_no_null_iter()
+        .fold(0_f64, |acc, x| acc + (x as f64 / n1 as f64).ln())
+        / n1 as f64;
 
     // Step 3, 4, 5 for m + 1 in wiki
     let n2 = n1.abs_diff(1);
     let data_2_view = data.slice(s![..n2, ..]);
     let tree = build_standard_kdtree(dim, leaf_size, &data_2_view)?;
     let nb_in_radius = query_nb_cnt(&tree, data_2_view, &super::l_inf_dist, r, parallel);
-    let phi_m1: f64 = nb_in_radius.into_no_null_iter()
-        .fold(0_f64, |acc, x| acc + (x as f64 / n2 as f64).ln()) / n2 as f64;
+    let phi_m1: f64 = nb_in_radius
+        .into_no_null_iter()
+        .fold(0_f64, |acc, x| acc + (x as f64 / n2 as f64).ln())
+        / n2 as f64;
 
     // Output
     Ok(Series::from_vec("", vec![(phi_m1 - phi_m).abs()]))

diff --git a/src/num_ext/knn.rs b/src/num_ext/knn.rs
@@ -80,12 +80,13 @@ fn pl_knn_ptwise(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series
             .into_par_iter()
             .map(|p| {
                 let s = p.to_slice().unwrap(); // C order makes sure rows are contiguous
-                // tree.nearest(s, k+1, &dist_func)
-                if let Ok(v) = tree.nearest(s, k+1, &dist_func) {
+                                               // tree.nearest(s, k+1, &dist_func)
+                if let Ok(v) = tree.nearest(s, k + 1, &dist_func) {
                     // By construction, this unwrap is safe.
                     // k + 1 because we include the point itself, and ask for k more neighbors.
                     Some(
-                        v.into_iter().map(|(_, i)| id.get(*i).unwrap())
+                        v.into_iter()
+                            .map(|(_, i)| id.get(*i).unwrap())
                             .collect_vec(),
                     )
                 } else {
@@ -103,9 +104,10 @@ fn pl_knn_ptwise(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series
     } else {
         for p in data.rows() {
             let s = p.to_slice().unwrap(); // C order makes sure rows are contiguous
-            if let Ok(v) = tree.nearest(s, k+1, &dist_func) {
+            if let Ok(v) = tree.nearest(s, k + 1, &dist_func) {
                 // By construction, this unwrap is safe
-                let w: Vec<u64> = v.into_iter()
+                let w: Vec<u64> = v
+                    .into_iter()
                     .map(|(_, i)| id.get(*i).unwrap())
                     .collect_vec();
                 builder.append_slice(w.as_slice());

diff --git a/src/stats_ext/chi2.rs b/src/stats_ext/chi2.rs
@@ -5,7 +5,6 @@ use pyo3_polars::derive::polars_expr;
 
 #[polars_expr(output_type_func=simple_stats_output)]
 fn pl_chi2(inputs: &[Series]) -> PolarsResult<Series> {
-
     let s1_name = "s1";
     let s2_name = "s2";
 

diff --git a/src/stats_ext/fstats.rs b/src/stats_ext/fstats.rs
@@ -22,15 +22,18 @@ fn ftest(x: f64, f1: f64, f2: f64) -> Result<StatsResult, String> {
 /// where n = inputs.len() - 1 = number of features
 /// And additionally x_n, .., x_{2n - 2} = p_0, .., p_{n-1}, are the p values.
 fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult<Vec<f64>> {
-
     let target = "target";
-    let v = inputs.into_iter().enumerate().map(|(i , s)| {
-        if i == 0 {
-            s.clone().with_name(target)
-        } else {
-            s.clone().with_name(i.to_string().as_str())
-        }
-    }).collect_vec();
+    let v = inputs
+        .into_iter()
+        .enumerate()
+        .map(|(i, s)| {
+            if i == 0 {
+                s.clone().with_name(target)
+            } else {
+                s.clone().with_name(i.to_string().as_str())
+            }
+        })
+        .collect_vec();
     let n_cols = v.len();
 
     let df = DataFrame::new(v)?.lazy();

diff --git a/tests/test.ipynb b/tests/test.ipynb
@@ -2,14 +2,46 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "529f4422-5c3a-4bd6-abe0-a15edfc62abb",
    "metadata": {},
    "outputs": [],
    "source": [
     "import polars as pl\n",
     "import numpy as np\n",
-    "import polars_ds as pld"
+    "# import polars_ds as pld"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9a3dbd5e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import polars_ds as pld\n",
+    "df = pl.DataFrame({\n",
+    "    \"word\":[\"apple\", \"banana\", \"pineapple\", \"asasasas\", \"sasasass\"],\n",
+    "    \"other_data\": [1,2,3,4,5]\n",
+    "})\n",
+    "gibberish = [\"asasasa\", \"sasaaasss\", \"asdasadadfa\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "73ca40b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.filter(\n",
+    "    pl.col(\"word\").str2.similar_to_vocab(\n",
+    "        vocab = gibberish,\n",
+    "        threshold = 0.5,\n",
+    "        metric = \"leven\", # Levenshtein similarity. Other options: dleven, osa, jw\n",
+    "        strategy = \"any\" # True if the word is similar to any word in vocab. Other options: \"all\", \"avg\"\n",
+    "    )\n",
+    ")"
    ]
   },
   {
@@ -54,42 +86,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "0fbc1c14",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div><style>\n",
-       ".dataframe > thead > tr,\n",
-       ".dataframe > tbody > tr {\n",
-       "  text-align: right;\n",
-       "  white-space: pre-wrap;\n",
-       "}\n",
-       "</style>\n",
-       "<small>shape: (5, 5)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>market_id</th><th>group1</th><th>group2</th><th>category_1</th><th>category_2</th></tr><tr><td>i64</td><td>f64</td><td>f64</td><td>i64</td><td>i64</td></tr></thead><tbody><tr><td>0</td><td>0.863211</td><td>0.05146</td><td>2</td><td>0</td></tr><tr><td>1</td><td>0.375187</td><td>0.841213</td><td>0</td><td>4</td></tr><tr><td>2</td><td>0.731748</td><td>0.775747</td><td>2</td><td>2</td></tr><tr><td>0</td><td>0.366678</td><td>0.236327</td><td>1</td><td>3</td></tr><tr><td>1</td><td>0.521338</td><td>0.728827</td><td>3</td><td>6</td></tr></tbody></table></div>"
-      ],
-      "text/plain": [
-       "shape: (5, 5)\n",
-       "┌───────────┬──────────┬──────────┬────────────┬────────────┐\n",
-       "│ market_id ┆ group1   ┆ group2   ┆ category_1 ┆ category_2 │\n",
-       "│ ---       ┆ ---      ┆ ---      ┆ ---        ┆ ---        │\n",
-       "│ i64       ┆ f64      ┆ f64      ┆ i64        ┆ i64        │\n",
-       "╞═══════════╪══════════╪══════════╪════════════╪════════════╡\n",
-       "│ 0         ┆ 0.863211 ┆ 0.05146  ┆ 2          ┆ 0          │\n",
-       "│ 1         ┆ 0.375187 ┆ 0.841213 ┆ 0          ┆ 4          │\n",
-       "│ 2         ┆ 0.731748 ┆ 0.775747 ┆ 2          ┆ 2          │\n",
-       "│ 0         ┆ 0.366678 ┆ 0.236327 ┆ 1          ┆ 3          │\n",
-       "│ 1         ┆ 0.521338 ┆ 0.728827 ┆ 3          ┆ 6          │\n",
-       "└───────────┴──────────┴──────────┴────────────┴────────────┘"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "size = 5000\n",
     "df = pl.DataFrame({\n",
@@ -106,40 +106,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "1d84105b",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div><style>\n",
-       ".dataframe > thead > tr,\n",
-       ".dataframe > tbody > tr {\n",
-       "  text-align: right;\n",
-       "  white-space: pre-wrap;\n",
-       "}\n",
-       "</style>\n",
-       "<small>shape: (3, 4)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>market_id</th><th>t-test</th><th>chi2-test</th><th>f-test</th></tr><tr><td>i64</td><td>struct[2]</td><td>struct[2]</td><td>struct[2]</td></tr></thead><tbody><tr><td>0</td><td>{-0.678227,0.497675}</td><td>{27.55702,0.842609}</td><td>{3.304733,0.010454}</td></tr><tr><td>1</td><td>{1.049668,0.293947}</td><td>{28.920644,0.792821}</td><td>{1.081389,0.364112}</td></tr><tr><td>2</td><td>{0.246265,0.805493}</td><td>{27.843576,0.832729}</td><td>{0.851298,0.492663}</td></tr></tbody></table></div>"
-      ],
-      "text/plain": [
-       "shape: (3, 4)\n",
-       "┌───────────┬──────────────────────┬──────────────────────┬─────────────────────┐\n",
-       "│ market_id ┆ t-test               ┆ chi2-test            ┆ f-test              │\n",
-       "│ ---       ┆ ---                  ┆ ---                  ┆ ---                 │\n",
-       "│ i64       ┆ struct[2]            ┆ struct[2]            ┆ struct[2]           │\n",
-       "╞═══════════╪══════════════════════╪══════════════════════╪═════════════════════╡\n",
-       "│ 0         ┆ {-0.678227,0.497675} ┆ {27.55702,0.842609}  ┆ {3.304733,0.010454} │\n",
-       "│ 1         ┆ {1.049668,0.293947}  ┆ {28.920644,0.792821} ┆ {1.081389,0.364112} │\n",
-       "│ 2         ┆ {0.246265,0.805493}  ┆ {27.843576,0.832729} ┆ {0.851298,0.492663} │\n",
-       "└───────────┴──────────────────────┴──────────────────────┴─────────────────────┘"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# In segment T-test, chi2 test, F test made easy!\n",
     "df.group_by(\"market_id\").agg(\n",