Merge pull request #40 from abstractqqq/debug_chi2
Debug chi2
abstractqqq authored Dec 26, 2023
2 parents 6cd6200 + f86c739 commit f8c466b
Showing 8 changed files with 481 additions and 104 deletions.
370 changes: 314 additions & 56 deletions examples/basics.ipynb

Large diffs are not rendered by default.

51 changes: 49 additions & 2 deletions python/polars_ds/str2.py
@@ -1,5 +1,5 @@
import polars as pl
from typing import Union, Optional
from typing import Union, Optional, Literal
from polars.utils.udfs import _get_shared_lib_location
from .type_alias import AhoCorasickMatchKind
import warnings
@@ -492,7 +492,7 @@ def jw(
self, other: Union[str, pl.Expr], weight: float = 0.1, parallel: bool = False
) -> pl.Expr:
"""
Computes the Jaro-Winker similarity between this and the other str.
Computes the Jaro-Winkler similarity between this and the other str.
Jaro-Winkler distance = 1 - Jaro-Winkler sim.
Parameters
@@ -550,6 +550,53 @@ def hamming(
is_elementwise=True,
)

def similar_to_vocab(
self,
vocab: list[str],
threshold: float,
metric: Literal["leven", "dleven", "jw", "osa"] = "leven",
strategy: Literal["avg", "all", "any"] = "avg",
) -> pl.Expr:
"""
Compare each word in the vocab with each word in self. Filters self to the words
that are similar to the words in the vocab.
Parameters
----------
vocab
Any iterable collection of strings
threshold
An entry is considered similar to the words in the vocabulary if the similarity
is at or above (>=) the threshold
metric
Which similarity metric to use. One of `leven`, `dleven`, `jw`, `osa`
strategy
If `avg`, returns true if the average similarity across the vocab is above the threshold.
If `all`, returns true only if the similarity to every word in the vocab is above
the threshold.
If `any`, returns true if the similarity to any word in the vocab is above
the threshold.
"""
if metric == "leven":
sims = [self.levenshtein(w, return_sim=True) for w in vocab]
elif metric == "dleven":
sims = [self.d_levenshtein(w, return_sim=True) for w in vocab]
elif metric == "osa":
sims = [self.osa(w, return_sim=True) for w in vocab]
elif sims == "jw":
sims = [self.jw(w, return_sim=True) for w in vocab]
else:
raise ValueError(f"Unknown metric for find_similar: {metric}")

if strategy == "all":
return pl.all_horizontal(s >= threshold for s in sims)
elif strategy == "any":
return pl.any_horizontal(s >= threshold for s in sims)
elif strategy == "avg":
return (pl.sum_horizontal(sims) / len(vocab)) >= threshold
else:
raise ValueError(f"Unknown strategy for find_similar: {strategy}")

def tokenize(self, pattern: str = r"(?u)\b\w\w+\b", stem: bool = False) -> pl.Expr:
"""
Tokenize the string according to the pattern. This will only extract the words
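For orientation, here is a minimal sketch of how the new `similar_to_vocab` expression can be used, mirroring the cell added to tests/test.ipynb below (the column name and vocabulary here are illustrative):

```python
import polars as pl
import polars_ds  # importing registers the .str2 namespace on string expressions

df = pl.DataFrame({"word": ["apple", "banana", "asasasas"]})

# Keep only the rows whose "word" is similar to at least one vocab entry.
out = df.filter(
    pl.col("word").str2.similar_to_vocab(
        vocab=["asasasa", "sasaaasss"],
        threshold=0.5,   # similar when similarity >= 0.5
        metric="leven",  # Levenshtein similarity; also: dleven, osa, jw
        strategy="any",  # any / all / avg over the vocab
    )
)
print(out)
```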
13 changes: 8 additions & 5 deletions src/num_ext/entrophies.rs
@@ -6,7 +6,6 @@ use pyo3_polars::derive::polars_expr;
// https://en.wikipedia.org/wiki/Sample_entropy
// https://en.wikipedia.org/wiki/Approximate_entropy


#[polars_expr(output_type=Float64)]
fn pl_approximate_entropy(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
// inputs[0] is radius, the rest are the shifted columns
@@ -33,16 +32,20 @@ fn pl_approximate_entropy(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
let data_1_view = data.slice(s![..n1, ..dim.abs_diff(1)]);
let tree = build_standard_kdtree(dim.abs_diff(1), leaf_size, &data_1_view)?;
let nb_in_radius = query_nb_cnt(&tree, data_1_view, &super::l_inf_dist, r, parallel);
let phi_m: f64 = nb_in_radius.into_no_null_iter()
.fold(0_f64, |acc, x| acc + (x as f64 / n1 as f64).ln()) / n1 as f64;
let phi_m: f64 = nb_in_radius
.into_no_null_iter()
.fold(0_f64, |acc, x| acc + (x as f64 / n1 as f64).ln())
/ n1 as f64;

// Step 3, 4, 5 for m + 1 in wiki
let n2 = n1.abs_diff(1);
let data_2_view = data.slice(s![..n2, ..]);
let tree = build_standard_kdtree(dim, leaf_size, &data_2_view)?;
let nb_in_radius = query_nb_cnt(&tree, data_2_view, &super::l_inf_dist, r, parallel);
let phi_m1: f64 = nb_in_radius.into_no_null_iter()
.fold(0_f64, |acc, x| acc + (x as f64 / n2 as f64).ln()) / n2 as f64;
let phi_m1: f64 = nb_in_radius
.into_no_null_iter()
.fold(0_f64, |acc, x| acc + (x as f64 / n2 as f64).ln())
/ n2 as f64;

// Output
Ok(Series::from_vec("", vec![(phi_m1 - phi_m).abs()]))
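For reference, the quantity these reformatted folds compute is the approximate-entropy estimator from the Wikipedia page linked above. In the code's terms, with n1 rows in the dimension-m embedding, n2 = n1 - 1 rows in the dimension-(m+1) embedding, and C_i the neighbor count within radius r returned by `query_nb_cnt` under the L-infinity metric:

```latex
\phi_m = \frac{1}{n_1}\sum_{i=1}^{n_1}\ln\frac{C_i^{m}(r)}{n_1},\qquad
\phi_{m+1} = \frac{1}{n_2}\sum_{i=1}^{n_2}\ln\frac{C_i^{m+1}(r)}{n_2},\qquad
\mathrm{ApEn} = \left|\phi_{m+1} - \phi_m\right|
```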
12 changes: 7 additions & 5 deletions src/num_ext/knn.rs
@@ -80,12 +80,13 @@ fn pl_knn_ptwise(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
.into_par_iter()
.map(|p| {
let s = p.to_slice().unwrap(); // C order makes sure rows are contiguous
// tree.nearest(s, k+1, &dist_func)
if let Ok(v) = tree.nearest(s, k+1, &dist_func) {
// tree.nearest(s, k+1, &dist_func)
if let Ok(v) = tree.nearest(s, k + 1, &dist_func) {
// By construction, this unwrap is safe.
// k + 1 because we include the point itself, and ask for k more neighbors.
Some(
v.into_iter().map(|(_, i)| id.get(*i).unwrap())
v.into_iter()
.map(|(_, i)| id.get(*i).unwrap())
.collect_vec(),
)
} else {
@@ -103,9 +104,10 @@ fn pl_knn_ptwise(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult<Series> {
} else {
for p in data.rows() {
let s = p.to_slice().unwrap(); // C order makes sure rows are contiguous
if let Ok(v) = tree.nearest(s, k+1, &dist_func) {
if let Ok(v) = tree.nearest(s, k + 1, &dist_func) {
// By construction, this unwrap is safe
let w: Vec<u64> = v.into_iter()
let w: Vec<u64> = v
.into_iter()
.map(|(_, i)| id.get(*i).unwrap())
.collect_vec();
builder.append_slice(w.as_slice());
6 changes: 3 additions & 3 deletions src/num_ext/tp_fp.rs
@@ -76,9 +76,9 @@ fn pl_combo_b(inputs: &[Series]) -> PolarsResult<Series> {
let threshold = threshold.get(0).unwrap();

if (actual.len() != predicted.len())
| actual.is_empty()
| predicted.is_empty()
| ((actual.null_count() + predicted.null_count()) > 0)
|| actual.is_empty()
|| predicted.is_empty()
|| ((actual.null_count() + predicted.null_count()) > 0)
{
return Err(PolarsError::ComputeError(
"Binary Metrics Combo: Input columns must be the same length, non-empty, and shouldn't contain nulls."
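The operator change above is semantic, not just stylistic: on `bool` values in Rust, `|` is the non-short-circuiting OR, so every emptiness and null-count check was evaluated unconditionally; `||` short-circuits, skipping the remaining checks as soon as one condition is already true.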
24 changes: 15 additions & 9 deletions src/stats_ext/chi2.rs
@@ -1,12 +1,12 @@
use super::{simple_stats_output, StatsResult};
use super::simple_stats_output;
use crate::stats::gamma;
use polars::prelude::*;
use pyo3_polars::derive::polars_expr;

#[polars_expr(output_type_func=simple_stats_output)]
fn pl_chi2(inputs: &[Series]) -> PolarsResult<Series> {
let s1_name = inputs[0].name();
let s2_name = inputs[1].name();
let s1_name = "s1";
let s2_name = "s2";

let u1 = inputs[0].unique()?;
let u1_len = u1.len();
@@ -18,7 +18,9 @@ fn pl_chi2(inputs: &[Series]) -> PolarsResult<Series> {
let cross = df1.cross_join(df2);

// Create a "fake" contigency table
let df3 = DataFrame::from_iter(inputs[0..2].to_vec())
let s1 = inputs[0].clone();
let s2 = inputs[1].clone();
let df3 = df!(s1_name => s1, s2_name => s2)?
.lazy()
.group_by([col(s1_name), col(s2_name)])
.agg([count().alias("ob")]);
@@ -50,12 +52,16 @@ fn pl_chi2(inputs: &[Series]) -> PolarsResult<Series> {
// Get the statistic
let out = final_df.drop_in_place("output")?;
let stats = out.f64()?;
let stats = stats.get(0).unwrap();
let stats = stats.get(0).unwrap_or(f64::NAN);
// Compute p value. It is a special case of Gamma distribution
let dof = u1_len.abs_diff(1) * u2_len.abs_diff(1);
let (shape, rate) = (dof as f64 / 2., 0.5);
let p = gamma::sf(stats, shape, rate).map_err(|e| PolarsError::ComputeError(e.into()));
let p = p?;
let p = if stats.is_nan() {
f64::NAN
} else {
let dof = u1_len.abs_diff(1) * u2_len.abs_diff(1);
let (shape, rate) = (dof as f64 / 2., 0.5);
let p = gamma::sf(stats, shape, rate).map_err(|e| PolarsError::ComputeError(e.into()));
p?
};
// Get output
let s = Series::from_vec("statistic", vec![stats]);
let p = Series::from_vec("pvalue", vec![p]);
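The p-value logic relies on the chi-square distribution with k degrees of freedom being a special case of the Gamma distribution, which is why the code passes `(shape, rate) = (dof / 2, 0.5)` to `gamma::sf` (the survival function). With u1, u2 the unique values of the two input columns:

```latex
k = (\lvert u_1\rvert - 1)(\lvert u_2\rvert - 1),\qquad
\chi^2_k = \mathrm{Gamma}\!\left(\mathrm{shape}=\tfrac{k}{2},\ \mathrm{rate}=\tfrac{1}{2}\right),\qquad
p = \Pr\!\left(X \ge \chi^2_{\mathrm{obs}}\right) = 1 - F_{k/2,\,1/2}\!\left(\chi^2_{\mathrm{obs}}\right)
```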
44 changes: 29 additions & 15 deletions src/stats_ext/fstats.rs
@@ -16,15 +16,27 @@ fn ftest(x: f64, f1: f64, f2: f64) -> Result<StatsResult, String> {
}

/// An internal helper function to compute f statistic for F test, with the option to compute
/// the p value too. It shouldn't be used outside.
/// When return_p is false, returns a Vec
/// the p value too. It shouldn't be used outside. The API is bad for outsiders to use.
/// When return_p is false, returns a Vec with f stats.
/// When return_p is true, returns a Vec that has x_0, .., x_{n-1} = f_0, .., f_{n-1}
/// where n = inputs.len() - 1 = number of features
/// x_0, .., x_{n-1} are the same as before, but
/// x_n, .., x_{2n - 2} = p_0, .., p_{n-1}, are the p values.
/// And additionally x_n, .., x_{2n - 2} = p_0, .., p_{n-1}, are the p values.
fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult<Vec<f64>> {
let target = inputs[0].name();
let df = DataFrame::new(inputs.to_vec())?.lazy();
let target = "target";
let v = inputs
.into_iter()
.enumerate()
.map(|(i, s)| {
if i == 0 {
s.clone().with_name(target)
} else {
s.clone().with_name(i.to_string().as_str())
}
})
.collect_vec();
let n_cols = v.len();

let df = DataFrame::new(v)?.lazy();
// inputs[0] is the group
// all the rest should be numerical
let mut step_one: Vec<Expr> = Vec::with_capacity(inputs.len() * 2 - 1);
@@ -38,11 +50,12 @@ fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult<Vec<f64>> {
.cast(DataType::UInt32),
);

for s in &inputs[1..] {
let name = s.name();
let n_sum = format!("{}_sum", name);
for i in 1..n_cols {
let name = i.to_string();
let name = name.as_str();
let n_sum = format!("{}_sum", i);
let n_sum = n_sum.as_str();
let n_var = format!("{}_var", name);
let n_var = format!("{}_var", i);
let n_var = n_var.as_str();
step_one.push(col(name).sum().alias(n_sum));
step_one.push(col(name).var(0).alias(n_var));
@@ -56,7 +69,7 @@ fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult<Vec<f64>> {
}

let mut reference = df
.group_by([target])
.group_by([col(target)])
.agg(step_one)
.select(step_two)
.collect()?;
@@ -65,18 +78,19 @@ fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult<Vec<f64>> {
let n_classes = reference.drop_in_place("n_classes")?;
let n_samples = n_samples.u32()?;
let n_classes = n_classes.u32()?;
let n_samples = n_samples.get(0).unwrap();
let n_samples = n_samples.get(0).unwrap_or(0);
let n_classes = n_classes.get(0).unwrap_or(0);

if n_classes <= 1 {
if n_classes <= 1 || n_samples <= 1 {
return Err(PolarsError::ComputeError(
"Number of classes is either 1 or 0, which is invalid.".into(),
"Number of classes, or number of samples is either 1 or 0, which is invalid.".into(),
));
}

let df_btw_class = n_classes.abs_diff(1) as f64;
let df_in_class = n_samples.abs_diff(n_classes) as f64;

// fstats is 2D
let fstats = reference.to_ndarray::<Float64Type>(IndexOrder::default())?;
let scale = df_in_class / df_btw_class;

@@ -130,7 +144,7 @@ fn pl_f_stats(inputs: &[Series]) -> PolarsResult<Series> {
/// and inputs[1] as the column to run F-test. There should be only two columns.
#[polars_expr(output_type_func=simple_stats_output)]
fn pl_f_test(inputs: &[Series]) -> PolarsResult<Series> {
// Since inputs only has 1 feature, this has to be a size 2 vec.
// The variable res has 2 values, the test statistic and p value.
let res = _f_stats(&inputs[..2], true)?;
let s = Series::from_vec("statistic", vec![res[0]]);
let p = Series::from_vec("pvalue", vec![res[1]]);
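For context, the statistic `_f_stats` assembles per feature is the textbook one-way ANOVA F statistic; the code's `df_btw_class = K - 1` and `df_in_class = N - K` are its two degrees of freedom (K classes, N samples), and `scale` applies their ratio to the sums and variances aggregated in `step_one`/`step_two`:

```latex
F = \frac{\text{between-class variability}}{\text{within-class variability}}
  = \frac{\sum_{k=1}^{K} n_k\,(\bar{x}_k - \bar{x})^2 \,/\, (K - 1)}
         {\sum_{k=1}^{K}\sum_{i \in k}(x_{ik} - \bar{x}_k)^2 \,/\, (N - K)}
```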
65 changes: 56 additions & 9 deletions tests/test.ipynb
@@ -9,7 +9,39 @@
"source": [
"import polars as pl\n",
"import numpy as np\n",
"import polars_ds as pld"
"# import polars_ds as pld"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a3dbd5e",
"metadata": {},
"outputs": [],
"source": [
"import polars_ds as pld\n",
"df = pl.DataFrame({\n",
" \"word\":[\"apple\", \"banana\", \"pineapple\", \"asasasas\", \"sasasass\"],\n",
" \"other_data\": [1,2,3,4,5]\n",
"})\n",
"gibberish = [\"asasasa\", \"sasaaasss\", \"asdasadadfa\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73ca40b3",
"metadata": {},
"outputs": [],
"source": [
"df.filter(\n",
" pl.col(\"word\").str2.similar_to_vocab(\n",
" vocab = gibberish,\n",
" threshold = 0.5,\n",
" metric = \"leven\", # Levenshtein similarity. Other options: dleven, osa, jw\n",
" strategy = \"any\" # True if the word is similar to any word in vocab. Other options: \"all\", \"avg\"\n",
" )\n",
")"
]
},
{
@@ -59,17 +91,32 @@
"metadata": {},
"outputs": [],
"source": [
"size = 1000\n",
"size = 5000\n",
"df = pl.DataFrame({\n",
" \"id\": range(size),\n",
" \"val1\": np.random.random(size=size),\n",
" \"val2\": np.random.random(size=size),\n",
" \"val3\": np.random.random(size=size),\n",
" \"val4\": np.random.random(size=size),\n",
" \"market_id\": range(size),\n",
" \"group1\": np.random.random(size=size),\n",
" \"group2\": np.random.random(size=size),\n",
" \"category_1\": np.random.randint(low=0, high=5, size=size),\n",
" \"category_2\":np.random.randint(low=0, high=10, size=size)\n",
"}).with_columns(\n",
" pl.col(\"id\").mod(5)\n",
" pl.col(\"market_id\").mod(3)\n",
")\n",
"df.head(10)"
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1d84105b",
"metadata": {},
"outputs": [],
"source": [
"# In segment T-test, chi2 test, F test made easy!\n",
"df.group_by(\"market_id\").agg(\n",
" pl.col(\"group1\").stats.ttest_ind(pl.col(\"group2\"), equal_var = True).alias(\"t-test\"),\n",
" pl.col(\"category_1\").stats.chi2(pl.col(\"category_2\")).alias(\"chi2-test\"),\n",
" pl.col(\"category_1\").stats.f_test(pl.col(\"group1\")).alias(\"f-test\")\n",
")"
]
},
