Skip to content

Commit

Permalink
better examples
Browse files Browse the repository at this point in the history
  • Loading branch information
abstractqqq committed Dec 26, 2023
1 parent 24fd91b commit f86c739
Show file tree
Hide file tree
Showing 7 changed files with 296 additions and 162 deletions.
256 changes: 183 additions & 73 deletions examples/basics.ipynb

Large diffs are not rendered by default.

51 changes: 49 additions & 2 deletions python/polars_ds/str2.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import polars as pl
from typing import Union, Optional
from typing import Union, Optional, Literal
from polars.utils.udfs import _get_shared_lib_location
from .type_alias import AhoCorasickMatchKind
import warnings
Expand Down Expand Up @@ -492,7 +492,7 @@ def jw(
self, other: Union[str, pl.Expr], weight: float = 0.1, parallel: bool = False
) -> pl.Expr:
"""
Computes the Jaro-Winker similarity between this and the other str.
Computes the Jaro-Winkler similarity between this and the other str.
Jaro-Winkler distance = 1 - Jaro-Winkler sim.
Parameters
Expand Down Expand Up @@ -550,6 +550,53 @@ def hamming(
is_elementwise=True,
)

def similar_to_vocab(
    self,
    vocab: list[str],
    threshold: float,
    metric: Literal["leven", "dleven", "jw", "osa"] = "leven",
    strategy: Literal["avg", "all", "any"] = "avg",
) -> pl.Expr:
    """
    Compares each string in self with every word in the vocab and returns a boolean
    expression that is true for the strings considered similar to the vocab. Use it
    inside a filter to keep only those strings.

    Parameters
    ----------
    vocab
        Any iterable collection of strings. Must contain at least one word.
    threshold
        An entry is considered similar to a word in the vocabulary if the similarity
        is greater than or equal to (>=) the threshold
    metric
        Which similarity metric to use. One of `leven`, `dleven`, `jw`, `osa`
    strategy
        If `avg`, then will return true if the average similarity over all words in the
        vocab is >= the threshold.
        If `all`, then will return true if the similarity to every word in the vocab is
        >= the threshold.
        If `any`, then will return true if the similarity to at least one word in the
        vocab is >= the threshold.

    Raises
    ------
    ValueError
        If vocab is empty, or if metric or strategy is not one of the supported values.
    """
    # Materialize once so that any iterable (not only a list) can be measured and reused.
    words = list(vocab)
    if len(words) == 0:
        # Without this guard the `avg` strategy would divide by zero.
        raise ValueError("`vocab` must contain at least one word.")

    if metric == "leven":
        sims = [self.levenshtein(w, return_sim=True) for w in words]
    elif metric == "dleven":
        sims = [self.d_levenshtein(w, return_sim=True) for w in words]
    elif metric == "osa":
        sims = [self.osa(w, return_sim=True) for w in words]
    elif metric == "jw":
        # `jw` already returns a similarity and has no `return_sim` parameter.
        sims = [self.jw(w) for w in words]
    else:
        raise ValueError(f"Unknown metric for similar_to_vocab: {metric}")

    if strategy == "all":
        return pl.all_horizontal([s >= threshold for s in sims])
    elif strategy == "any":
        return pl.any_horizontal([s >= threshold for s in sims])
    elif strategy == "avg":
        return (pl.sum_horizontal(sims) / len(words)) >= threshold
    else:
        raise ValueError(f"Unknown strategy for similar_to_vocab: {strategy}")

def tokenize(self, pattern: str = r"(?u)\b\w\w+\b", stem: bool = False) -> pl.Expr:
"""
Tokenize the string according to the pattern. This will only extract the words
Expand Down
13 changes: 8 additions & 5 deletions src/num_ext/entrophies.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ use pyo3_polars::derive::polars_expr;
// https://en.wikipedia.org/wiki/Sample_entropy
// https://en.wikipedia.org/wiki/Approximate_entropy


#[polars_expr(output_type=Float64)]
fn pl_approximate_entropy(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
// inputs[0] is radius, the rest are the shifted columns
Expand All @@ -33,16 +32,20 @@ fn pl_approximate_entropy(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResu
let data_1_view = data.slice(s![..n1, ..dim.abs_diff(1)]);
let tree = build_standard_kdtree(dim.abs_diff(1), leaf_size, &data_1_view)?;
let nb_in_radius = query_nb_cnt(&tree, data_1_view, &super::l_inf_dist, r, parallel);
let phi_m: f64 = nb_in_radius.into_no_null_iter()
.fold(0_f64, |acc, x| acc + (x as f64 / n1 as f64).ln()) / n1 as f64;
let phi_m: f64 = nb_in_radius
.into_no_null_iter()
.fold(0_f64, |acc, x| acc + (x as f64 / n1 as f64).ln())
/ n1 as f64;

// Step 3, 4, 5 for m + 1 in wiki
let n2 = n1.abs_diff(1);
let data_2_view = data.slice(s![..n2, ..]);
let tree = build_standard_kdtree(dim, leaf_size, &data_2_view)?;
let nb_in_radius = query_nb_cnt(&tree, data_2_view, &super::l_inf_dist, r, parallel);
let phi_m1: f64 = nb_in_radius.into_no_null_iter()
.fold(0_f64, |acc, x| acc + (x as f64 / n2 as f64).ln()) / n2 as f64;
let phi_m1: f64 = nb_in_radius
.into_no_null_iter()
.fold(0_f64, |acc, x| acc + (x as f64 / n2 as f64).ln())
/ n2 as f64;

// Output
Ok(Series::from_vec("", vec![(phi_m1 - phi_m).abs()]))
Expand Down
12 changes: 7 additions & 5 deletions src/num_ext/knn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,13 @@ fn pl_knn_ptwise(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series
.into_par_iter()
.map(|p| {
let s = p.to_slice().unwrap(); // C order makes sure rows are contiguous
// tree.nearest(s, k+1, &dist_func)
if let Ok(v) = tree.nearest(s, k+1, &dist_func) {
// tree.nearest(s, k+1, &dist_func)
if let Ok(v) = tree.nearest(s, k + 1, &dist_func) {
// By construction, this unwrap is safe.
// k+ 1 because we include the point itself, and ask for k more neighbors.
Some(
v.into_iter().map(|(_, i)| id.get(*i).unwrap())
v.into_iter()
.map(|(_, i)| id.get(*i).unwrap())
.collect_vec(),
)
} else {
Expand All @@ -103,9 +104,10 @@ fn pl_knn_ptwise(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series
} else {
for p in data.rows() {
let s = p.to_slice().unwrap(); // C order makes sure rows are contiguous
if let Ok(v) = tree.nearest(s, k+1, &dist_func) {
if let Ok(v) = tree.nearest(s, k + 1, &dist_func) {
// By construction, this unwrap is safe
let w: Vec<u64> = v.into_iter()
let w: Vec<u64> = v
.into_iter()
.map(|(_, i)| id.get(*i).unwrap())
.collect_vec();
builder.append_slice(w.as_slice());
Expand Down
1 change: 0 additions & 1 deletion src/stats_ext/chi2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ use pyo3_polars::derive::polars_expr;

#[polars_expr(output_type_func=simple_stats_output)]
fn pl_chi2(inputs: &[Series]) -> PolarsResult<Series> {

let s1_name = "s1";
let s2_name = "s2";

Expand Down
19 changes: 11 additions & 8 deletions src/stats_ext/fstats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,18 @@ fn ftest(x: f64, f1: f64, f2: f64) -> Result<StatsResult, String> {
/// where n = inputs.len() - 1 = number of features
/// And additionally x_n, .., x_{2n - 2} = p_0, .., p_{n-1}, are the p values.
fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult<Vec<f64>> {

let target = "target";
let v = inputs.into_iter().enumerate().map(|(i , s)| {
if i == 0 {
s.clone().with_name(target)
} else {
s.clone().with_name(i.to_string().as_str())
}
}).collect_vec();
let v = inputs
.into_iter()
.enumerate()
.map(|(i, s)| {
if i == 0 {
s.clone().with_name(target)
} else {
s.clone().with_name(i.to_string().as_str())
}
})
.collect_vec();
let n_cols = v.len();

let df = DataFrame::new(v)?.lazy();
Expand Down
106 changes: 38 additions & 68 deletions tests/test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,46 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "529f4422-5c3a-4bd6-abe0-a15edfc62abb",
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"import numpy as np\n",
"import polars_ds as pld"
"# import polars_ds as pld"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a3dbd5e",
"metadata": {},
"outputs": [],
"source": [
"import polars_ds as pld\n",
"df = pl.DataFrame({\n",
" \"word\":[\"apple\", \"banana\", \"pineapple\", \"asasasas\", \"sasasass\"],\n",
" \"other_data\": [1,2,3,4,5]\n",
"})\n",
"gibberish = [\"asasasa\", \"sasaaasss\", \"asdasadadfa\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73ca40b3",
"metadata": {},
"outputs": [],
"source": [
"df.filter(\n",
" pl.col(\"word\").str2.similar_to_vocab(\n",
" vocab = gibberish,\n",
" threshold = 0.5,\n",
" metric = \"leven\", # Levenshtein similarity. Other options: dleven, osa, jw\n",
" strategy = \"any\" # True if the word is similar to any word in vocab. Other options: \"all\", \"avg\"\n",
" )\n",
")"
]
},
{
Expand Down Expand Up @@ -54,42 +86,10 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "0fbc1c14",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (5, 5)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>market_id</th><th>group1</th><th>group2</th><th>category_1</th><th>category_2</th></tr><tr><td>i64</td><td>f64</td><td>f64</td><td>i64</td><td>i64</td></tr></thead><tbody><tr><td>0</td><td>0.863211</td><td>0.05146</td><td>2</td><td>0</td></tr><tr><td>1</td><td>0.375187</td><td>0.841213</td><td>0</td><td>4</td></tr><tr><td>2</td><td>0.731748</td><td>0.775747</td><td>2</td><td>2</td></tr><tr><td>0</td><td>0.366678</td><td>0.236327</td><td>1</td><td>3</td></tr><tr><td>1</td><td>0.521338</td><td>0.728827</td><td>3</td><td>6</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (5, 5)\n",
"┌───────────┬──────────┬──────────┬────────────┬────────────┐\n",
"│ market_id ┆ group1 ┆ group2 ┆ category_1 ┆ category_2 │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ i64 ┆ f64 ┆ f64 ┆ i64 ┆ i64 │\n",
"╞═══════════╪══════════╪══════════╪════════════╪════════════╡\n",
"│ 0 ┆ 0.863211 ┆ 0.05146 ┆ 2 ┆ 0 │\n",
"│ 1 ┆ 0.375187 ┆ 0.841213 ┆ 0 ┆ 4 │\n",
"│ 2 ┆ 0.731748 ┆ 0.775747 ┆ 2 ┆ 2 │\n",
"│ 0 ┆ 0.366678 ┆ 0.236327 ┆ 1 ┆ 3 │\n",
"│ 1 ┆ 0.521338 ┆ 0.728827 ┆ 3 ┆ 6 │\n",
"└───────────┴──────────┴──────────┴────────────┴────────────┘"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"size = 5000\n",
"df = pl.DataFrame({\n",
Expand All @@ -106,40 +106,10 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "1d84105b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (3, 4)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>market_id</th><th>t-test</th><th>chi2-test</th><th>f-test</th></tr><tr><td>i64</td><td>struct[2]</td><td>struct[2]</td><td>struct[2]</td></tr></thead><tbody><tr><td>0</td><td>{-0.678227,0.497675}</td><td>{27.55702,0.842609}</td><td>{3.304733,0.010454}</td></tr><tr><td>1</td><td>{1.049668,0.293947}</td><td>{28.920644,0.792821}</td><td>{1.081389,0.364112}</td></tr><tr><td>2</td><td>{0.246265,0.805493}</td><td>{27.843576,0.832729}</td><td>{0.851298,0.492663}</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (3, 4)\n",
"┌───────────┬──────────────────────┬──────────────────────┬─────────────────────┐\n",
"│ market_id ┆ t-test ┆ chi2-test ┆ f-test │\n",
"│ --- ┆ --- ┆ --- ┆ --- │\n",
"│ i64 ┆ struct[2] ┆ struct[2] ┆ struct[2] │\n",
"╞═══════════╪══════════════════════╪══════════════════════╪═════════════════════╡\n",
"│ 0 ┆ {-0.678227,0.497675} ┆ {27.55702,0.842609} ┆ {3.304733,0.010454} │\n",
"│ 1 ┆ {1.049668,0.293947} ┆ {28.920644,0.792821} ┆ {1.081389,0.364112} │\n",
"│ 2 ┆ {0.246265,0.805493} ┆ {27.843576,0.832729} ┆ {0.851298,0.492663} │\n",
"└───────────┴──────────────────────┴──────────────────────┴─────────────────────┘"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# In segment T-test, chi2 test, F test made easy!\n",
"df.group_by(\"market_id\").agg(\n",
Expand Down

0 comments on commit f86c739

Please sign in to comment.