abstractqqq · abstractqqq · Dec 30, 2023 · Dec 29, 2023
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -14,7 +14,7 @@ pyo3 = {version = "*", features = ["extension-module"]}
 pyo3-polars = {version = "*", features = ["derive"]}
 polars = {version = "0.35.4", features = ["performant", "lazy", "dtype-array", "ndarray", "log", "nightly"]}
 num = "0.4.1"
-faer = {version = "0.15", features = ["ndarray", "nightly"]}
+faer = {version = "0.16", features = ["ndarray", "nightly"]}
 serde = {version = "*", features=["derive"]}
 ndarray = {version="0.15.6", features=["rayon"]} # see if we can get rid of this
 hashbrown = {version = "0.14.2", features=["nightly"]}

diff --git a/python/polars_ds/metrics.py b/python/polars_ds/metrics.py
@@ -23,6 +23,23 @@ class MetricExt:
     def __init__(self, expr: pl.Expr):
         self._expr: pl.Expr = expr
 
+    def max_error(self, pred: pl.Expr) -> pl.Expr:
+        """
+        Computes the max absolute error between actual and pred.
+        """
+        x = self._expr - pred
+        return pl.max_horizontal(x.max(), -x.min())
+
+    def mean_gamma_deviance(self, pred: pl.Expr) -> pl.Expr:
+        """
+        Computes the mean gamma deviance, 2 * mean(log(pred / actual) + actual / pred - 1).
+
+        The result is only meaningful when actual and pred are both strictly positive: a
+        negative actual / pred ratio makes the log term NaN.
+        """
+        x = self._expr / pred
+        return 2.0 * (x - x.log() - 1).mean()
+
     def hubor_loss(self, pred: pl.Expr, delta: float) -> pl.Expr:
         """
         Computes huber loss between this and the other expression. This assumes

diff --git a/src/lib.rs b/src/lib.rs
@@ -1,12 +1,8 @@
-mod num_ext;
+mod num;
+mod stats_utils;
 mod stats;
-mod stats_ext;
-mod str_ext;
+mod str2;
 mod utils;
-use polars::{
-    error::{PolarsError, PolarsResult},
-    series::Series,
-};
 use pyo3::{pymodule, types::PyModule, PyResult, Python};
 
 #[cfg(target_os = "linux")]
@@ -16,18 +12,10 @@ use jemallocator::Jemalloc;
 #[cfg(target_os = "linux")]
 static ALLOC: Jemalloc = Jemalloc;
 
-// #[inline]
-// pub fn no_null_in_inputs(inputs: &[Series], err_msg: String) -> PolarsResult<()> {
-//     for s in inputs {
-//         if s.null_count() > 0 {
-//             return Err(PolarsError::ComputeError(err_msg.into()));
-//         }
-//     }
-//     Ok(())
-// }
 
 #[pymodule]
 #[pyo3(name = "_polars_ds")]
 fn _polars_ds(_py: Python<'_>, _m: &PyModule) -> PyResult<()> {
+
     Ok(())
 }
diff --git a/src/num_ext/complex.rs → src/num/complex.rs b/src/num_ext/complex.rs → src/num/complex.rs
diff --git a/src/num_ext/cond_entropy.rs → src/num/cond_entropy.rs b/src/num_ext/cond_entropy.rs → src/num/cond_entropy.rs
diff --git a/src/num_ext/entrophies.rs → src/num/entrophies.rs b/src/num_ext/entrophies.rs → src/num/entrophies.rs
@@ -1,4 +1,4 @@
-use crate::num_ext::knn::{build_standard_kdtree, query_nb_cnt, KdtreeKwargs};
+use crate::num::knn::{build_standard_kdtree, query_nb_cnt, KdtreeKwargs};
 use ndarray::s;
 use polars::prelude::*;
 use pyo3_polars::derive::polars_expr;

diff --git a/src/num_ext/fft.rs → src/num/fft.rs b/src/num_ext/fft.rs → src/num/fft.rs
diff --git a/src/num_ext/gcd_lcm.rs → src/num/gcd_lcm.rs b/src/num_ext/gcd_lcm.rs → src/num/gcd_lcm.rs
diff --git a/src/num_ext/haversine.rs → src/num/haversine.rs b/src/num_ext/haversine.rs → src/num/haversine.rs
diff --git a/src/num_ext/jaccard.rs → src/num/jaccard.rs b/src/num_ext/jaccard.rs → src/num/jaccard.rs
diff --git a/src/num_ext/knn.rs → src/num/knn.rs b/src/num_ext/knn.rs → src/num/knn.rs
diff --git a/src/num_ext/lempel_ziv.rs → src/num/lempel_ziv.rs b/src/num_ext/lempel_ziv.rs → src/num/lempel_ziv.rs
diff --git a/src/num_ext/mod.rs → src/num/mod.rs b/src/num_ext/mod.rs → src/num/mod.rs
diff --git a/src/num_ext/ols.rs → src/num/ols.rs b/src/num_ext/ols.rs → src/num/ols.rs
diff --git a/src/num_ext/powi.rs → src/num/powi.rs b/src/num_ext/powi.rs → src/num/powi.rs
diff --git a/src/num_ext/tp_fp.rs → src/num/tp_fp.rs b/src/num_ext/tp_fp.rs → src/num/tp_fp.rs
diff --git a/src/num_ext/trapz.rs → src/num/trapz.rs b/src/num_ext/trapz.rs → src/num/trapz.rs
diff --git a/src/stats_ext/chi2.rs → src/stats/chi2.rs b/src/stats_ext/chi2.rs → src/stats/chi2.rs
@@ -1,5 +1,5 @@
 use super::simple_stats_output;
-use crate::stats::gamma;
+use crate::stats_utils::gamma;
 use polars::prelude::*;
 use pyo3_polars::derive::polars_expr;
 

diff --git a/src/stats_ext/fstats.rs → src/stats/fstats.rs b/src/stats_ext/fstats.rs → src/stats/fstats.rs
@@ -1,6 +1,6 @@
 /// Multiple F-statistics at once and F test
 use super::{list_float_output, simple_stats_output, StatsResult};
-use crate::stats::beta::fisher_snedecor_sf;
+use crate::stats_utils::beta::fisher_snedecor_sf;
 use itertools::Itertools;
 use polars::prelude::*;
 use pyo3_polars::derive::polars_expr;

diff --git a/src/stats_ext/ks.rs → src/stats/ks.rs b/src/stats_ext/ks.rs → src/stats/ks.rs
@@ -1,5 +1,5 @@
 /// KS statistics.
-use crate::stats_ext::StatsResult;
+use crate::stats::StatsResult;
 use crate::utils::binary_search_right;
 use itertools::Itertools;
 use polars::prelude::*;

diff --git a/src/stats/mod.rs b/src/stats/mod.rs
@@ -1,18 +1,64 @@
-/// This submodule is mostly taken from the project statrs. See credit section in README.md
-/// The reason I do not want to add it as a dependency is that it has a nalgebra dependency for
-/// multi-variate distributions, which is something that I think will not be needed in this
-/// package. Another reason is that if I want to do linear algebra, I would use Faer since Faer
-/// performs better and nalgebra is too much of a dependency for this package right now.
-pub mod beta;
-pub mod gamma;
-pub mod normal;
-
-pub const PREC_ACC: f64 = 0.0000000000000011102230246251565;
-pub const LN_PI: f64 = 1.1447298858494001741434273513530587116472948129153;
-//pub const LN_SQRT_2PI: f64 = 0.91893853320467274178032973640561763986139747363778;
-pub const LN_2_SQRT_E_OVER_PI: f64 = 0.6207822376352452223455184457816472122518527279025978;
-
-#[inline]
-pub fn is_zero(x: f64) -> bool {
-    x.abs() < PREC_ACC
+mod chi2;
+mod fstats;
+mod ks;
+mod normal_test;
+mod sample;
+mod t_test;
+
+use polars::prelude::*;
+
+pub fn list_float_output(_: &[Field]) -> PolarsResult<Field> {
+    Ok(Field::new(
+        "list_float",
+        DataType::List(Box::new(DataType::Float64)),
+    ))
+}
+
+pub fn simple_stats_output(_: &[Field]) -> PolarsResult<Field> {
+    let s = Field::new("statistic", DataType::Float64);
+    let p = Field::new("pvalue", DataType::Float64);
+    let v: Vec<Field> = vec![s, p];
+    Ok(Field::new("", DataType::Struct(v)))
+}
+
+struct StatsResult {
+    pub statistic: f64,
+    pub p: Option<f64>,
+}
+
+impl StatsResult {
+    pub fn new(s: f64, p: f64) -> StatsResult {
+        StatsResult {
+            statistic: s,
+            p: Some(p),
+        }
+    }
+
+    pub fn from_stats(s: f64) -> StatsResult {
+        StatsResult {
+            statistic: s,
+            p: None,
+        }
+    }
+
+    pub fn unwrap_p_or(&self, default: f64) -> f64 {
+        self.p.unwrap_or(default)
+    }
+}
+
+pub enum Alternative {
+    TwoSided,
+    Less,
+    Greater,
+}
+
+impl From<&str> for Alternative {
+    fn from(s: &str) -> Alternative {
+        match s.to_lowercase().as_str() {
+            "two-sided" | "two" => Alternative::TwoSided,
+            "less" => Alternative::Less,
+            "greater" => Alternative::Greater,
+            _ => Alternative::TwoSided,
+        }
+    }
 }
diff --git a/src/stats_ext/normal_test.rs → src/stats/normal_test.rs b/src/stats_ext/normal_test.rs → src/stats/normal_test.rs
@@ -10,7 +10,7 @@
 ///
 /// I chose this over the Shapiro Francia test because the distribution is unknown and would require Monte Carlo
 use super::{simple_stats_output, StatsResult};
-use crate::stats::{gamma, is_zero};
+use crate::stats_utils::{gamma, is_zero};
 use polars::prelude::*;
 use pyo3_polars::derive::polars_expr;
 

diff --git a/src/stats_ext/sample.rs → src/stats/sample.rs b/src/stats_ext/sample.rs → src/stats/sample.rs
diff --git a/src/stats_ext/t_test.rs → src/stats/t_test.rs b/src/stats_ext/t_test.rs → src/stats/t_test.rs
@@ -1,6 +1,6 @@
 /// Student's t test and Welch's t test.
 use super::{simple_stats_output, Alternative, StatsResult};
-use crate::stats::{beta, is_zero};
+use crate::{stats_utils::{beta, is_zero}, stats};
 use polars::prelude::*;
 use pyo3_polars::derive::polars_expr;
 
@@ -110,7 +110,7 @@ fn pl_ttest_2samp(inputs: &[Series]) -> PolarsResult<Series> {
 
     let alt = inputs[5].utf8()?;
     let alt = alt.get(0).unwrap();
-    let alt = super::Alternative::from(alt);
+    let alt = stats::Alternative::from(alt);
 
     let valid = mean1.is_finite() && mean2.is_finite() && var1.is_finite() && var2.is_finite();
     if !valid {
@@ -147,7 +147,7 @@ fn pl_welch_t(inputs: &[Series]) -> PolarsResult<Series> {
 
     let alt = inputs[6].utf8()?;
     let alt = alt.get(0).unwrap();
-    let alt = super::Alternative::from(alt);
+    let alt = stats::Alternative::from(alt);
 
     // No need to check for validity because input is sanitized.
 
@@ -175,7 +175,7 @@ fn pl_ttest_1samp(inputs: &[Series]) -> PolarsResult<Series> {
 
     let alt = inputs[4].utf8()?;
     let alt = alt.get(0).unwrap();
-    let alt = super::Alternative::from(alt);
+    let alt = stats::Alternative::from(alt);
 
     // No need to check for validity because input is sanitized.
 

diff --git a/src/stats_ext/mod.rs b/src/stats_ext/mod.rs
diff --git a/src/stats/beta.rs → src/stats_utils/beta.rs b/src/stats/beta.rs → src/stats_utils/beta.rs
diff --git a/src/stats/gamma.rs → src/stats_utils/gamma.rs b/src/stats/gamma.rs → src/stats_utils/gamma.rs
diff --git a/src/stats_utils/mod.rs b/src/stats_utils/mod.rs
@@ -0,0 +1,18 @@
+//! This submodule is mostly taken from the statrs project. See the credit section in README.md.
+//! statrs is not added as a dependency because it pulls in nalgebra for multivariate
+//! distributions, which I think will not be needed in this package. Also, if I want to do
+//! linear algebra, I would use Faer, since Faer performs better and nalgebra is too much of
+//! a dependency for this package right now.
+pub mod beta;
+pub mod gamma;
+pub mod normal;
+
+pub const PREC_ACC: f64 = 0.0000000000000011102230246251565;
+pub const LN_PI: f64 = 1.1447298858494001741434273513530587116472948129153;
+//pub const LN_SQRT_2PI: f64 = 0.91893853320467274178032973640561763986139747363778;
+pub const LN_2_SQRT_E_OVER_PI: f64 = 0.6207822376352452223455184457816472122518527279025978;
+
+#[inline]
+pub fn is_zero(x: f64) -> bool {
+    x.abs() < PREC_ACC
+}
diff --git a/src/stats/normal.rs → src/stats_utils/normal.rs b/src/stats/normal.rs → src/stats_utils/normal.rs
diff --git a/src/str_ext/aho_corasick.rs → src/str2/aho_corasick.rs b/src/str_ext/aho_corasick.rs → src/str2/aho_corasick.rs
diff --git a/src/str_ext/consts.rs → src/str2/consts.rs b/src/str_ext/consts.rs → src/str2/consts.rs
diff --git a/src/str_ext/fuzz.rs → src/str2/fuzz.rs b/src/str_ext/fuzz.rs → src/str2/fuzz.rs
diff --git a/src/str_ext/hamming.rs → src/str2/hamming.rs b/src/str_ext/hamming.rs → src/str2/hamming.rs
diff --git a/src/str_ext/inflections.rs → src/str2/inflections.rs b/src/str_ext/inflections.rs → src/str2/inflections.rs
diff --git a/src/str_ext/is_stopword.rs → src/str2/is_stopword.rs b/src/str_ext/is_stopword.rs → src/str2/is_stopword.rs
diff --git a/src/str_ext/jaro.rs → src/str2/jaro.rs b/src/str_ext/jaro.rs → src/str2/jaro.rs
diff --git a/src/str_ext/levenshtein.rs → src/str2/levenshtein.rs b/src/str_ext/levenshtein.rs → src/str2/levenshtein.rs
diff --git a/src/str_ext/mod.rs → src/str2/mod.rs b/src/str_ext/mod.rs → src/str2/mod.rs
diff --git a/src/str_ext/osa.rs → src/str2/osa.rs b/src/str_ext/osa.rs → src/str2/osa.rs
diff --git a/src/str_ext/overlap.rs → src/str2/overlap.rs b/src/str_ext/overlap.rs → src/str2/overlap.rs
diff --git a/...xt/snowball/algorithms/english_stemmer.rs → ...r2/snowball/algorithms/english_stemmer.rs b/...xt/snowball/algorithms/english_stemmer.rs → ...r2/snowball/algorithms/english_stemmer.rs
@@ -5,8 +5,8 @@
 #![allow(unused_mut)]
 #![allow(unused_parens)]
 #![allow(unused_variables)]
-use crate::str_ext::snowball::Among;
-use crate::str_ext::snowball::SnowballEnv;
+use crate::str2::snowball::Among;
+use crate::str2::snowball::SnowballEnv;
 
 static A_0: &'static [Among<Context>; 3] = &[
     Among("arsen", -1, -1, None),

diff --git a/src/str_ext/snowball/algorithms/mod.rs → src/str2/snowball/algorithms/mod.rs b/src/str_ext/snowball/algorithms/mod.rs → src/str2/snowball/algorithms/mod.rs
diff --git a/src/str_ext/snowball/among.rs → src/str2/snowball/among.rs b/src/str_ext/snowball/among.rs → src/str2/snowball/among.rs
@@ -1,4 +1,4 @@
-use crate::str_ext::snowball::SnowballEnv;
+use crate::str2::snowball::SnowballEnv;
 
 pub struct Among<T: 'static>(
     pub &'static str,

diff --git a/src/str_ext/snowball/mod.rs → src/str2/snowball/mod.rs b/src/str_ext/snowball/mod.rs → src/str2/snowball/mod.rs
@@ -21,5 +21,5 @@ mod among;
 mod snowball_env;
 
 // TODO: why do we need this `crate::`?
-pub use crate::str_ext::snowball::among::Among;
-pub use crate::str_ext::snowball::snowball_env::SnowballEnv;
+pub use crate::str2::snowball::among::Among;
+pub use crate::str2::snowball::snowball_env::SnowballEnv;
diff --git a/src/str_ext/snowball/snowball_env.rs → src/str2/snowball/snowball_env.rs b/src/str_ext/snowball/snowball_env.rs → src/str2/snowball/snowball_env.rs
@@ -1,4 +1,4 @@
-use crate::str_ext::snowball::Among;
+use crate::str2::snowball::Among;
 use std::borrow::Cow;
 
 #[derive(Debug, Clone)]

diff --git a/src/str_ext/snowball_stem.rs → src/str2/snowball_stem.rs b/src/str_ext/snowball_stem.rs → src/str2/snowball_stem.rs
diff --git a/src/str_ext/sorensen_dice.rs → src/str2/sorensen_dice.rs b/src/str_ext/sorensen_dice.rs → src/str2/sorensen_dice.rs
diff --git a/src/str_ext/str_jaccard.rs → src/str2/str_jaccard.rs b/src/str_ext/str_jaccard.rs → src/str2/str_jaccard.rs