From 24fd91bad01a13724b355bcb8f70a19487d1c6b3 Mon Sep 17 00:00:00 2001 From: abstractqqq Date: Mon, 25 Dec 2023 22:52:47 -0500 Subject: [PATCH 1/2] fixed bugs for some stats methods when used in group by --- examples/basics.ipynb | 236 ++++++++++++++++++++++++++++++++-------- src/num_ext/tp_fp.rs | 6 +- src/stats_ext/chi2.rs | 25 +++-- src/stats_ext/fstats.rs | 41 ++++--- tests/test.ipynb | 99 +++++++++++++++-- 5 files changed, 325 insertions(+), 82 deletions(-) diff --git a/examples/basics.ipynb b/examples/basics.ipynb index ecc6b709..ac67cf5f 100644 --- a/examples/basics.ipynb +++ b/examples/basics.ipynb @@ -48,7 +48,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 10)
fdummyabx1x2yactualpredicteddummy_groups
f64strf64f64i64i64i64i32f64str
0.0"a"0.9118770.0684890100000-10000000.619805"a"
0.841471"a"0.9686020.6384921100001-9999910.379361"a"
0.909297"a"0.9290540.9238512100002-9999810.871658"a"
0.14112"a"0.6687240.6312933100003-9999710.305585"a"
-0.756802"a"0.1956270.5785414100004-9999610.145813"a"
" + "shape: (5, 10)
fdummyabx1x2yactualpredicteddummy_groups
f64strf64f64i64i64i64i32f64str
0.0"a"0.7966130.631240100000-10000000.538541"a"
0.841471"a"0.9189790.1753941100001-9999900.80966"a"
0.909297"a"0.011640.7542532100002-9999800.671246"a"
0.14112"a"0.640590.9252873100003-9999710.755093"a"
-0.756802"a"0.1631150.538414100004-9999610.946195"a"
" ], "text/plain": [ "shape: (5, 10)\n", @@ -57,11 +57,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ str ┆ f64 ┆ f64 ┆ ┆ i64 ┆ i32 ┆ f64 ┆ str │\n", "╞═══════════╪═══════╪══════════╪══════════╪═══╪═════════╪════════╪═══════════╪══════════════╡\n", - "│ 0.0 ┆ a ┆ 0.911877 ┆ 0.068489 ┆ … ┆ -100000 ┆ 0 ┆ 0.619805 ┆ a │\n", - "│ 0.841471 ┆ a ┆ 0.968602 ┆ 0.638492 ┆ … ┆ -99999 ┆ 1 ┆ 0.379361 ┆ a │\n", - "│ 0.909297 ┆ a ┆ 0.929054 ┆ 0.923851 ┆ … ┆ -99998 ┆ 1 ┆ 0.871658 ┆ a │\n", - "│ 0.14112 ┆ a ┆ 0.668724 ┆ 0.631293 ┆ … ┆ -99997 ┆ 1 ┆ 0.305585 ┆ a │\n", - "│ -0.756802 ┆ a ┆ 0.195627 ┆ 0.578541 ┆ … ┆ -99996 ┆ 1 ┆ 0.145813 ┆ a │\n", + "│ 0.0 ┆ a ┆ 0.796613 ┆ 0.63124 ┆ … ┆ -100000 ┆ 0 ┆ 0.538541 ┆ a │\n", + "│ 0.841471 ┆ a ┆ 0.918979 ┆ 0.175394 ┆ … ┆ -99999 ┆ 0 ┆ 0.80966 ┆ a │\n", + "│ 0.909297 ┆ a ┆ 0.01164 ┆ 0.754253 ┆ … ┆ -99998 ┆ 0 ┆ 0.671246 ┆ a │\n", + "│ 0.14112 ┆ a ┆ 0.64059 ┆ 0.925287 ┆ … ┆ -99997 ┆ 1 ┆ 0.755093 ┆ a │\n", + "│ -0.756802 ┆ a ┆ 0.163115 ┆ 0.53841 ┆ … ┆ -99996 ┆ 1 ┆ 0.946195 ┆ a │\n", "└───────────┴───────┴──────────┴──────────┴───┴─────────┴────────┴───────────┴──────────────┘" ] }, @@ -395,7 +395,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 8)
dummy_groupsl2log lossprecisionrecallfaverage_precisionroc_auc
strf64f64f64f64f64f64f64
"b"0.3342981.0048930.50120.4995220.250180.4993730.498183
"a"0.3340391.0019150.4974270.499960.2493450.4963620.497893
" + "shape: (2, 8)
dummy_groupsl2log lossprecisionrecallfaverage_precisionroc_auc
strf64f64f64f64f64f64f64
"b"0.3359641.0061180.4928470.4950030.2469620.4934210.494359
"a"0.3341211.0016080.4993410.4982650.2494010.4993730.497763
" ], "text/plain": [ "shape: (2, 8)\n", @@ -405,8 +405,8 @@ "│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ --- ┆ f64 │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ f64 ┆ │\n", "╞══════════════╪══════════╪══════════╪═══════════╪══════════╪══════════╪════════════════╪══════════╡\n", - "│ b ┆ 0.334298 ┆ 1.004893 ┆ 0.5012 ┆ 0.499522 ┆ 0.25018 ┆ 0.499373 ┆ 0.498183 │\n", - "│ a ┆ 0.334039 ┆ 1.001915 ┆ 0.497427 ┆ 0.49996 ┆ 0.249345 ┆ 0.496362 ┆ 0.497893 │\n", + "│ b ┆ 0.335964 ┆ 1.006118 ┆ 0.492847 ┆ 0.495003 ┆ 0.246962 ┆ 0.493421 ┆ 0.494359 │\n", + "│ a ┆ 0.334121 ┆ 1.001608 ┆ 0.499341 ┆ 0.498265 ┆ 0.249401 ┆ 0.499373 ┆ 0.497763 │\n", "└──────────────┴──────────┴──────────┴───────────┴──────────┴──────────┴────────────────┴──────────┘" ] }, @@ -494,7 +494,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
sen
str
"hello"
"church"
"going"
"world"
"to"
" + "shape: (5, 1)
sen
str
"church"
"to"
"going"
"world"
"hello"
" ], "text/plain": [ "shape: (5, 1)\n", @@ -503,11 +503,11 @@ "│ --- │\n", "│ str │\n", "╞════════╡\n", - "│ hello │\n", "│ church │\n", + "│ to │\n", "│ going │\n", "│ world │\n", - "│ to │\n", + "│ hello │\n", "└────────┘" ] }, @@ -539,7 +539,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (4, 1)
sen
str
"church"
"go"
"world"
"hello"
" + "shape: (4, 1)
sen
str
"world"
"go"
"hello"
"church"
" ], "text/plain": [ "shape: (4, 1)\n", @@ -548,10 +548,10 @@ "│ --- │\n", "│ str │\n", "╞════════╡\n", - "│ church │\n", - "│ go │\n", "│ world │\n", + "│ go │\n", "│ hello │\n", + "│ church │\n", "└────────┘" ] }, @@ -763,7 +763,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
a
f64
-1.380039
-0.200584
-0.100981
-0.413554
0.014094
" + "shape: (5, 1)
a
f64
0.097629
-0.030844
1.386883
0.591371
1.092199
" ], "text/plain": [ "shape: (5, 1)\n", @@ -772,11 +772,11 @@ "│ --- │\n", "│ f64 │\n", "╞═══════════╡\n", - "│ -1.380039 │\n", - "│ -0.200584 │\n", - "│ -0.100981 │\n", - "│ -0.413554 │\n", - "│ 0.014094 │\n", + "│ 0.097629 │\n", + "│ -0.030844 │\n", + "│ 1.386883 │\n", + "│ 0.591371 │\n", + "│ 1.092199 │\n", "└───────────┘" ] }, @@ -809,7 +809,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1_000, 2)
arandom
f64f64
-1.3800390.846072
-0.2005840.552485
-0.1009810.706321
-0.413554-0.205514
0.0140941.00097
-0.731941-0.246044
0.4287560.721886
-0.7790350.292412
-1.544782.116286
0.0167040.137341
0.774037-0.477099
0.5508741.481822
1.57445-1.786365
-0.219772-0.649539
1.0507121.545373
0.4451070.696054
0.452407-0.555928
-0.9657613.530685
-0.7389671.579904
1.365518-2.268439
0.746657-0.462054
-0.08-0.443372
nullnull
nullnull
" + "shape: (1_000, 2)
arandom
f64f64
0.097629-0.085826
-0.0308442.563895
1.3868831.743471
0.5913712.199912
1.0921991.56007
-0.968060.728469
-0.2506911.077966
-0.805002-0.077792
-1.323398-2.068835
0.1861440.286291
-0.972090.340019
0.6652461.544772
0.1651991.771167
-0.890093-0.064297
0.201987-0.07098
-0.2145610.65106
0.3578540.7858
-0.421794-1.057364
-0.6215581.082361
1.1732951.176845
0.2641440.945366
-1.151482-0.582049
nullnull
nullnull
" ], "text/plain": [ "shape: (1_000, 2)\n", @@ -818,13 +818,13 @@ "│ --- ┆ --- │\n", "│ f64 ┆ f64 │\n", "╞═══════════╪═══════════╡\n", - "│ -1.380039 ┆ 0.846072 │\n", - "│ -0.200584 ┆ 0.552485 │\n", - "│ -0.100981 ┆ 0.706321 │\n", - "│ -0.413554 ┆ -0.205514 │\n", + "│ 0.097629 ┆ -0.085826 │\n", + "│ -0.030844 ┆ 2.563895 │\n", + "│ 1.386883 ┆ 1.743471 │\n", + "│ 0.591371 ┆ 2.199912 │\n", "│ … ┆ … │\n", - "│ 0.746657 ┆ -0.462054 │\n", - "│ -0.08 ┆ -0.443372 │\n", + "│ 0.264144 ┆ 0.945366 │\n", + "│ -1.151482 ┆ -0.582049 │\n", "│ null ┆ null │\n", "│ null ┆ null │\n", "└───────────┴───────────┘" @@ -857,7 +857,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1_000, 2)
arandom_str
f64str
-1.380039"0C"
-0.200584"JEJ"
-0.100981"ATRz"
-0.413554"1"
0.014094"of"
-0.731941"toPG"
0.428756"yQS"
-0.779035"3UZN"
-1.54478"ycz"
0.016704"Jv"
0.774037"JPJ"
0.550874"ukse"
1.57445"3m8"
-0.219772"X"
1.050712"a6TG"
0.445107"22"
0.452407"KY"
-0.965761"H"
-0.738967"M"
1.365518"C"
0.746657"dHn"
-0.08"n"
nullnull
nullnull
" + "shape: (1_000, 2)
arandom_str
f64str
0.097629"Te"
-0.030844"SWs"
1.386883"s1yx"
0.591371"Lt"
1.092199"14"
-0.96806"lP"
-0.250691"q"
-0.805002"z"
-1.323398"K8D"
0.186144"f"
-0.97209"1Uyx"
0.665246"9Ttl"
0.165199"z"
-0.890093"s4"
0.201987"z"
-0.214561"H"
0.357854"ih"
-0.421794"LuLD"
-0.621558"M"
1.173295"v8O"
0.264144"ajS0"
-1.151482"GM4"
nullnull
nullnull
" ], "text/plain": [ "shape: (1_000, 2)\n", @@ -866,13 +866,13 @@ "│ --- ┆ --- │\n", "│ f64 ┆ str │\n", "╞═══════════╪════════════╡\n", - "│ -1.380039 ┆ 0C │\n", - "│ -0.200584 ┆ JEJ │\n", - "│ -0.100981 ┆ ATRz │\n", - "│ -0.413554 ┆ 1 │\n", + "│ 0.097629 ┆ Te │\n", + "│ -0.030844 ┆ SWs │\n", + "│ 1.386883 ┆ s1yx │\n", + "│ 0.591371 ┆ Lt │\n", "│ … ┆ … │\n", - "│ 0.746657 ┆ dHn │\n", - "│ -0.08 ┆ n │\n", + "│ 0.264144 ┆ ajS0 │\n", + "│ -1.151482 ┆ GM4 │\n", "│ null ┆ null │\n", "│ null ┆ null │\n", "└───────────┴────────────┘" @@ -905,7 +905,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1_000, 2)
arandom_str
f64str
-1.380039"87vkz"
-0.200584"FhImg"
-0.100981"zAaq2"
-0.413554"wdiyR"
0.014094"xXfxl"
-0.731941"9nvFc"
0.428756"foIrA"
-0.779035"9l3cn"
-1.54478"wFCEv"
0.016704"SrEgl"
0.774037"jZwKd"
0.550874"SN8XG"
1.57445"mQpW7"
-0.219772"uH0BC"
1.050712"oy9Dy"
0.445107"ADwjJ"
0.452407"Ify2a"
-0.965761"ywaCT"
-0.738967"e3Qos"
1.365518"mkjTa"
0.746657"hZFLE"
-0.08"hrGxB"
nullnull
nullnull
" + "shape: (1_000, 2)
arandom_str
f64str
0.097629"jg7x5"
-0.030844"VsuAu"
1.386883"hT5ub"
0.591371"6AqFN"
1.092199"1hsY7"
-0.96806"qNpiS"
-0.250691"3vaf7"
-0.805002"B0849"
-1.323398"I4gZ6"
0.186144"HPn26"
-0.97209"PIFmq"
0.665246"2c3ir"
0.165199"HmW60"
-0.890093"PuRla"
0.201987"N3sXB"
-0.214561"kS2Ve"
0.357854"bCxwy"
-0.421794"Ao0Ff"
-0.621558"syYKi"
1.173295"dODkd"
0.264144"22mYg"
-1.151482"Q5Q4Y"
nullnull
nullnull
" ], "text/plain": [ "shape: (1_000, 2)\n", @@ -914,13 +914,13 @@ "│ --- ┆ --- │\n", "│ f64 ┆ str │\n", "╞═══════════╪════════════╡\n", - "│ -1.380039 ┆ 87vkz │\n", - "│ -0.200584 ┆ FhImg │\n", - "│ -0.100981 ┆ zAaq2 │\n", - "│ -0.413554 ┆ wdiyR │\n", + "│ 0.097629 ┆ jg7x5 │\n", + "│ -0.030844 ┆ VsuAu │\n", + "│ 1.386883 ┆ hT5ub │\n", + "│ 0.591371 ┆ 6AqFN │\n", "│ … ┆ … │\n", - "│ 0.746657 ┆ hZFLE │\n", - "│ -0.08 ┆ hrGxB │\n", + "│ 0.264144 ┆ 22mYg │\n", + "│ -1.151482 ┆ Q5Q4Y │\n", "│ null ┆ null │\n", "│ null ┆ null │\n", "└───────────┴────────────┘" @@ -953,7 +953,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 4)
t-tests: statisticst-tests: pvaluenormality_test: statisticsnormality_test: pvalue
f64f64f64f64
0.5803480.5617650.8754480.645504
" + "shape: (1, 4)
t-tests: statisticst-tests: pvaluenormality_test: statisticsnormality_test: pvalue
f64f64f64f64
0.2564410.7976460.6505030.722346
" ], "text/plain": [ "shape: (1, 4)\n", @@ -962,7 +962,7 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════╪═════════════════╪════════════════════════════╪════════════════════════╡\n", - "│ 0.580348 ┆ 0.561765 ┆ 0.875448 ┆ 0.645504 │\n", + "│ 0.256441 ┆ 0.797646 ┆ 0.650503 ┆ 0.722346 │\n", "└─────────────────────┴─────────────────┴────────────────────────────┴────────────────────────┘" ] }, @@ -989,6 +989,154 @@ " , pl.col(\"normality_test\").struct.field(\"pvalue\").alias(\"normality_test: pvalue\")\n", ")" ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "b46a72a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 5)
market_idgroup1group2category_1category_2
i64f64f64i64i64
00.2911770.36219227
10.3882050.09945116
20.4201540.05846922
00.9764380.12167512
10.5594360.34034503
" + ], + "text/plain": [ + "shape: (5, 5)\n", + "┌───────────┬──────────┬──────────┬────────────┬────────────┐\n", + "│ market_id ┆ group1 ┆ group2 ┆ category_1 ┆ category_2 │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ f64 ┆ f64 ┆ i64 ┆ i64 │\n", + "╞═══════════╪══════════╪══════════╪════════════╪════════════╡\n", + "│ 0 ┆ 0.291177 ┆ 0.362192 ┆ 2 ┆ 7 │\n", + "│ 1 ┆ 0.388205 ┆ 0.099451 ┆ 1 ┆ 6 │\n", + "│ 2 ┆ 0.420154 ┆ 0.058469 ┆ 2 ┆ 2 │\n", + "│ 0 ┆ 0.976438 ┆ 0.121675 ┆ 1 ┆ 2 │\n", + "│ 1 ┆ 0.559436 ┆ 0.340345 ┆ 0 ┆ 3 │\n", + "└───────────┴──────────┴──────────┴────────────┴────────────┘" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "size = 5000\n", + "df = pl.DataFrame({\n", + " \"market_id\": range(size),\n", + " \"group1\": np.random.random(size=size),\n", + " \"group2\": np.random.random(size=size),\n", + " \"category_1\": np.random.randint(low=0, high=5, size=size),\n", + " \"category_2\":np.random.randint(low=0, high=10, size=size)\n", + "}).with_columns(\n", + " pl.col(\"market_id\").mod(3)\n", + ")\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "adc4f66f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (1, 3)
t-testchi2-testf-test
struct[2]struct[2]struct[2]
{-0.568826,0.569487}{20.227158,0.984214}{1.657997,0.156847}
" + ], + "text/plain": [ + "shape: (1, 3)\n", + "┌──────────────────────┬──────────────────────┬─────────────────────┐\n", + "│ t-test ┆ chi2-test ┆ f-test │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ struct[2] ┆ struct[2] ┆ struct[2] │\n", + "╞══════════════════════╪══════════════════════╪═════════════════════╡\n", + "│ {-0.568826,0.569487} ┆ {20.227158,0.984214} ┆ {1.657997,0.156847} │\n", + "└──────────────────────┴──────────────────────┴─────────────────────┘" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# In dataframe statistical tests!\n", + "df.select(\n", + " pl.col(\"group1\").stats.ttest_ind(pl.col(\"group2\"), equal_var = True).alias(\"t-test\"),\n", + " pl.col(\"category_1\").stats.chi2(pl.col(\"category_2\")).alias(\"chi2-test\"),\n", + " pl.col(\"category_1\").stats.f_test(pl.col(\"group1\")).alias(\"f-test\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "65dbb6bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 4)
market_idt-testchi2-testf-test
i64struct[2]struct[2]struct[2]
0{-0.886846,0.375226}{36.390403,0.450482}{-0.503377,-0.733276}
1{1.072105,0.283751}{23.387811,0.948028}{-2.182564,-0.068697}
2{-1.154529,0.248366}{33.069657,0.608716}{-0.961816,-0.427371}
" + ], + "text/plain": [ + "shape: (3, 4)\n", + "┌───────────┬──────────────────────┬──────────────────────┬───────────────────────┐\n", + "│ market_id ┆ t-test ┆ chi2-test ┆ f-test │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ struct[2] ┆ struct[2] ┆ struct[2] │\n", + "╞═══════════╪══════════════════════╪══════════════════════╪═══════════════════════╡\n", + "│ 0 ┆ {-0.886846,0.375226} ┆ {36.390403,0.450482} ┆ {-0.503377,-0.733276} │\n", + "│ 1 ┆ {1.072105,0.283751} ┆ {23.387811,0.948028} ┆ {-2.182564,-0.068697} │\n", + "│ 2 ┆ {-1.154529,0.248366} ┆ {33.069657,0.608716} ┆ {-0.961816,-0.427371} │\n", + "└───────────┴──────────────────────┴──────────────────────┴───────────────────────┘" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Can also be done in group by context\n", + "df.group_by(\"market_id\").agg(\n", + " pl.col(\"group1\").stats.ttest_ind(pl.col(\"group2\"), equal_var = False).alias(\"t-test\"),\n", + " pl.col(\"category_1\").stats.chi2(pl.col(\"category_2\")).alias(\"chi2-test\"),-\n", + " pl.col(\"category_1\").stats.f_test(pl.col(\"group1\")).alias(\"f-test\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c47ed0d", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/src/num_ext/tp_fp.rs b/src/num_ext/tp_fp.rs index fa4fd015..1b850d6f 100644 --- a/src/num_ext/tp_fp.rs +++ b/src/num_ext/tp_fp.rs @@ -76,9 +76,9 @@ fn pl_combo_b(inputs: &[Series]) -> PolarsResult { let threshold = threshold.get(0).unwrap(); if (actual.len() != predicted.len()) - | actual.is_empty() - | predicted.is_empty() - | ((actual.null_count() + predicted.null_count()) > 0) + || actual.is_empty() + || predicted.is_empty() + || ((actual.null_count() + predicted.null_count()) > 0) { return Err(PolarsError::ComputeError( "Binary Metrics Combo: Input columns must be the same length, non-empty, and shouldn't contain nulls." diff --git a/src/stats_ext/chi2.rs b/src/stats_ext/chi2.rs index 2ee9c6c2..408d8da1 100644 --- a/src/stats_ext/chi2.rs +++ b/src/stats_ext/chi2.rs @@ -1,12 +1,13 @@ -use super::{simple_stats_output, StatsResult}; +use super::simple_stats_output; use crate::stats::gamma; use polars::prelude::*; use pyo3_polars::derive::polars_expr; #[polars_expr(output_type_func=simple_stats_output)] fn pl_chi2(inputs: &[Series]) -> PolarsResult { - let s1_name = inputs[0].name(); - let s2_name = inputs[1].name(); + + let s1_name = "s1"; + let s2_name = "s2"; let u1 = inputs[0].unique()?; let u1_len = u1.len(); @@ -18,7 +19,9 @@ fn pl_chi2(inputs: &[Series]) -> PolarsResult { let cross = df1.cross_join(df2); // Create a "fake" contigency table - let df3 = DataFrame::from_iter(inputs[0..2].to_vec()) + let s1 = inputs[0].clone(); + let s2 = inputs[1].clone(); + let df3 = df!(s1_name => s1, s2_name => s2)? .lazy() .group_by([col(s1_name), col(s2_name)]) .agg([count().alias("ob")]); @@ -50,12 +53,16 @@ fn pl_chi2(inputs: &[Series]) -> PolarsResult { // Get the statistic let out = final_df.drop_in_place("output")?; let stats = out.f64()?; - let stats = stats.get(0).unwrap(); + let stats = stats.get(0).unwrap_or(f64::NAN); // Compute p value. It is a special case of Gamma distribution - let dof = u1_len.abs_diff(1) * u2_len.abs_diff(1); - let (shape, rate) = (dof as f64 / 2., 0.5); - let p = gamma::sf(stats, shape, rate).map_err(|e| PolarsError::ComputeError(e.into())); - let p = p?; + let p = if stats.is_nan() { + f64::NAN + } else { + let dof = u1_len.abs_diff(1) * u2_len.abs_diff(1); + let (shape, rate) = (dof as f64 / 2., 0.5); + let p = gamma::sf(stats, shape, rate).map_err(|e| PolarsError::ComputeError(e.into())); + p? + }; // Get output let s = Series::from_vec("statistic", vec![stats]); let p = Series::from_vec("pvalue", vec![p]); diff --git a/src/stats_ext/fstats.rs b/src/stats_ext/fstats.rs index a6dec3b0..6376b679 100644 --- a/src/stats_ext/fstats.rs +++ b/src/stats_ext/fstats.rs @@ -16,15 +16,24 @@ fn ftest(x: f64, f1: f64, f2: f64) -> Result { } /// An internal helper function to compute f statistic for F test, with the option to comput -/// the p value too. It shouldn't be used outside. -/// When return_p is false, returns a Vec +/// the p value too. It shouldn't be used outside. The API is bad for outsiders to use. +/// When return_p is false, returns a Vec with f stats. /// When return_p is true, returns a Vec that has x_0, .., x_{n-1} = f_0, .., f_{n-1} /// where n = inputs.len() - 1 = number of features -/// x_0, .., x_{n-1} are the same as before, but -/// x_n, .., x_{2n - 2} = p_0, .., p_{n-1}, are the p values. +/// And additionally x_n, .., x_{2n - 2} = p_0, .., p_{n-1}, are the p values. fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult> { - let target = inputs[0].name(); - let df = DataFrame::new(inputs.to_vec())?.lazy(); + + let target = "target"; + let v = inputs.into_iter().enumerate().map(|(i , s)| { + if i == 0 { + s.clone().with_name(target) + } else { + s.clone().with_name(i.to_string().as_str()) + } + }).collect_vec(); + let n_cols = v.len(); + + let df = DataFrame::new(v)?.lazy(); // inputs[0] is the group // all the rest should numerical let mut step_one: Vec = Vec::with_capacity(inputs.len() * 2 - 1); @@ -38,11 +47,12 @@ fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult> { .cast(DataType::UInt32), ); - for s in &inputs[1..] { - let name = s.name(); - let n_sum = format!("{}_sum", name); + for i in 1..n_cols { + let name = i.to_string(); + let name = name.as_str(); + let n_sum = format!("{}_sum", i); let n_sum = n_sum.as_str(); - let n_var = format!("{}_var", name); + let n_var = format!("{}_var", i); let n_var = n_var.as_str(); step_one.push(col(name).sum().alias(n_sum)); step_one.push(col(name).var(0).alias(n_var)); @@ -56,7 +66,7 @@ fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult> { } let mut reference = df - .group_by([target]) + .group_by([col(target)]) .agg(step_one) .select(step_two) .collect()?; @@ -65,18 +75,19 @@ fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult> { let n_classes = reference.drop_in_place("n_classes")?; let n_samples = n_samples.u32()?; let n_classes = n_classes.u32()?; - let n_samples = n_samples.get(0).unwrap(); + let n_samples = n_samples.get(0).unwrap_or(0); let n_classes = n_classes.get(0).unwrap_or(0); - if n_classes <= 1 { + if n_classes <= 1 || n_samples <= 1 { return Err(PolarsError::ComputeError( - "Number of classes is either 1 or 0, which is invalid.".into(), + "Number of classes, or number of samples is either 1 or 0, which is invalid.".into(), )); } let df_btw_class = n_classes.abs_diff(1) as f64; let df_in_class = n_samples.abs_diff(n_classes) as f64; + // fstats is 2D let fstats = reference.to_ndarray::(IndexOrder::default())?; let scale = df_in_class / df_btw_class; @@ -130,7 +141,7 @@ fn pl_f_stats(inputs: &[Series]) -> PolarsResult { /// and inputs[1] as the column to run F-test. There should be only two columns. #[polars_expr(output_type_func=simple_stats_output)] fn pl_f_test(inputs: &[Series]) -> PolarsResult { - // Since inputs only has 1 feature, this has to be a size 2 vec. + // The variable res has 2 values, the test statistic and p value. let res = _f_stats(&inputs[..2], true)?; let s = Series::from_vec("statistic", vec![res[0]]); let p = Series::from_vec("pvalue", vec![res[1]]); diff --git a/tests/test.ipynb b/tests/test.ipynb index e1c92e12..a29d31e6 100644 --- a/tests/test.ipynb +++ b/tests/test.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "529f4422-5c3a-4bd6-abe0-a15edfc62abb", "metadata": {}, "outputs": [], @@ -54,22 +54,99 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "0fbc1c14", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 5)
market_idgroup1group2category_1category_2
i64f64f64i64i64
00.8632110.0514620
10.3751870.84121304
20.7317480.77574722
00.3666780.23632713
10.5213380.72882736
" + ], + "text/plain": [ + "shape: (5, 5)\n", + "┌───────────┬──────────┬──────────┬────────────┬────────────┐\n", + "│ market_id ┆ group1 ┆ group2 ┆ category_1 ┆ category_2 │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ f64 ┆ f64 ┆ i64 ┆ i64 │\n", + "╞═══════════╪══════════╪══════════╪════════════╪════════════╡\n", + "│ 0 ┆ 0.863211 ┆ 0.05146 ┆ 2 ┆ 0 │\n", + "│ 1 ┆ 0.375187 ┆ 0.841213 ┆ 0 ┆ 4 │\n", + "│ 2 ┆ 0.731748 ┆ 0.775747 ┆ 2 ┆ 2 │\n", + "│ 0 ┆ 0.366678 ┆ 0.236327 ┆ 1 ┆ 3 │\n", + "│ 1 ┆ 0.521338 ┆ 0.728827 ┆ 3 ┆ 6 │\n", + "└───────────┴──────────┴──────────┴────────────┴────────────┘" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "size = 1000\n", + "size = 5000\n", "df = pl.DataFrame({\n", - " \"id\": range(size),\n", - " \"val1\": np.random.random(size=size),\n", - " \"val2\": np.random.random(size=size),\n", - " \"val3\": np.random.random(size=size),\n", - " \"val4\": np.random.random(size=size),\n", + " \"market_id\": range(size),\n", + " \"group1\": np.random.random(size=size),\n", + " \"group2\": np.random.random(size=size),\n", + " \"category_1\": np.random.randint(low=0, high=5, size=size),\n", + " \"category_2\":np.random.randint(low=0, high=10, size=size)\n", "}).with_columns(\n", - " pl.col(\"id\").mod(5)\n", + " pl.col(\"market_id\").mod(3)\n", ")\n", - "df.head(10)" + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1d84105b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 4)
market_idt-testchi2-testf-test
i64struct[2]struct[2]struct[2]
0{-0.678227,0.497675}{27.55702,0.842609}{3.304733,0.010454}
1{1.049668,0.293947}{28.920644,0.792821}{1.081389,0.364112}
2{0.246265,0.805493}{27.843576,0.832729}{0.851298,0.492663}
" + ], + "text/plain": [ + "shape: (3, 4)\n", + "┌───────────┬──────────────────────┬──────────────────────┬─────────────────────┐\n", + "│ market_id ┆ t-test ┆ chi2-test ┆ f-test │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ struct[2] ┆ struct[2] ┆ struct[2] │\n", + "╞═══════════╪══════════════════════╪══════════════════════╪═════════════════════╡\n", + "│ 0 ┆ {-0.678227,0.497675} ┆ {27.55702,0.842609} ┆ {3.304733,0.010454} │\n", + "│ 1 ┆ {1.049668,0.293947} ┆ {28.920644,0.792821} ┆ {1.081389,0.364112} │\n", + "│ 2 ┆ {0.246265,0.805493} ┆ {27.843576,0.832729} ┆ {0.851298,0.492663} │\n", + "└───────────┴──────────────────────┴──────────────────────┴─────────────────────┘" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# In segment T-test, chi2 test, F test made easy!\n", + "df.group_by(\"market_id\").agg(\n", + " pl.col(\"group1\").stats.ttest_ind(pl.col(\"group2\"), equal_var = True).alias(\"t-test\"),\n", + " pl.col(\"category_1\").stats.chi2(pl.col(\"category_2\")).alias(\"chi2-test\"),\n", + " pl.col(\"category_1\").stats.f_test(pl.col(\"group1\")).alias(\"f-test\")\n", + ")" ] }, { From f86c739c06671d4463380944957ba1b9e1598fa7 Mon Sep 17 00:00:00 2001 From: abstractqqq Date: Tue, 26 Dec 2023 12:03:52 -0500 Subject: [PATCH 2/2] better examples --- examples/basics.ipynb | 256 +++++++++++++++++++++++++++----------- python/polars_ds/str2.py | 51 +++++++- src/num_ext/entrophies.rs | 13 +- src/num_ext/knn.rs | 12 +- src/stats_ext/chi2.rs | 1 - src/stats_ext/fstats.rs | 19 +-- tests/test.ipynb | 106 ++++++---------- 7 files changed, 296 insertions(+), 162 deletions(-) diff --git a/examples/basics.ipynb b/examples/basics.ipynb index ac67cf5f..8eaac2a5 100644 --- a/examples/basics.ipynb +++ b/examples/basics.ipynb @@ -48,7 +48,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 10)
fdummyabx1x2yactualpredicteddummy_groups
f64strf64f64i64i64i64i32f64str
0.0"a"0.7966130.631240100000-10000000.538541"a"
0.841471"a"0.9189790.1753941100001-9999900.80966"a"
0.909297"a"0.011640.7542532100002-9999800.671246"a"
0.14112"a"0.640590.9252873100003-9999710.755093"a"
-0.756802"a"0.1631150.538414100004-9999610.946195"a"
" + "shape: (5, 10)
fdummyabx1x2yactualpredicteddummy_groups
f64strf64f64i64i64i64i32f64str
0.0"a"0.3098840.931330100000-10000010.604835"a"
0.841471"a"0.7787560.0758851100001-9999910.543645"a"
0.909297"a"0.4500770.9895712100002-9999810.10059"a"
0.14112"a"0.6655220.2324983100003-9999710.592351"a"
-0.756802"a"0.2412330.1450974100004-9999600.475785"a"
" ], "text/plain": [ "shape: (5, 10)\n", @@ -57,11 +57,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ str ┆ f64 ┆ f64 ┆ ┆ i64 ┆ i32 ┆ f64 ┆ str │\n", "╞═══════════╪═══════╪══════════╪══════════╪═══╪═════════╪════════╪═══════════╪══════════════╡\n", - "│ 0.0 ┆ a ┆ 0.796613 ┆ 0.63124 ┆ … ┆ -100000 ┆ 0 ┆ 0.538541 ┆ a │\n", - "│ 0.841471 ┆ a ┆ 0.918979 ┆ 0.175394 ┆ … ┆ -99999 ┆ 0 ┆ 0.80966 ┆ a │\n", - "│ 0.909297 ┆ a ┆ 0.01164 ┆ 0.754253 ┆ … ┆ -99998 ┆ 0 ┆ 0.671246 ┆ a │\n", - "│ 0.14112 ┆ a ┆ 0.64059 ┆ 0.925287 ┆ … ┆ -99997 ┆ 1 ┆ 0.755093 ┆ a │\n", - "│ -0.756802 ┆ a ┆ 0.163115 ┆ 0.53841 ┆ … ┆ -99996 ┆ 1 ┆ 0.946195 ┆ a │\n", + "│ 0.0 ┆ a ┆ 0.309884 ┆ 0.93133 ┆ … ┆ -100000 ┆ 1 ┆ 0.604835 ┆ a │\n", + "│ 0.841471 ┆ a ┆ 0.778756 ┆ 0.075885 ┆ … ┆ -99999 ┆ 1 ┆ 0.543645 ┆ a │\n", + "│ 0.909297 ┆ a ┆ 0.450077 ┆ 0.989571 ┆ … ┆ -99998 ┆ 1 ┆ 0.10059 ┆ a │\n", + "│ 0.14112 ┆ a ┆ 0.665522 ┆ 0.232498 ┆ … ┆ -99997 ┆ 1 ┆ 0.592351 ┆ a │\n", + "│ -0.756802 ┆ a ┆ 0.241233 ┆ 0.145097 ┆ … ┆ -99996 ┆ 0 ┆ 0.475785 ┆ a │\n", "└───────────┴───────┴──────────┴──────────┴───┴─────────┴────────┴───────────┴──────────────┘" ] }, @@ -313,7 +313,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 2)
dummylist_float
strlist[f64]
"b"[2.0, -1.0]
"a"[2.0, -1.0]
" + "shape: (2, 2)
dummylist_float
strlist[f64]
"a"[2.0, -1.0]
"b"[2.0, -1.0]
" ], "text/plain": [ "shape: (2, 2)\n", @@ -322,8 +322,8 @@ "│ --- ┆ --- │\n", "│ str ┆ list[f64] │\n", "╞═══════╪═════════════╡\n", - "│ b ┆ [2.0, -1.0] │\n", "│ a ┆ [2.0, -1.0] │\n", + "│ b ┆ [2.0, -1.0] │\n", "└───────┴─────────────┘" ] }, @@ -395,7 +395,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 8)
dummy_groupsl2log lossprecisionrecallfaverage_precisionroc_auc
strf64f64f64f64f64f64f64
"b"0.3359641.0061180.4928470.4950030.2469620.4934210.494359
"a"0.3341211.0016080.4993410.4982650.2494010.4993730.497763
" + "shape: (2, 8)
dummy_groupsl2log lossprecisionrecallfaverage_precisionroc_auc
strf64f64f64f64f64f64f64
"b"0.3340181.0028930.4975120.4978720.2488460.4983450.499504
"a"0.3323190.9979410.5025090.50020.2506760.4994930.501757
" ], "text/plain": [ "shape: (2, 8)\n", @@ -405,8 +405,8 @@ "│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ --- ┆ f64 │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ f64 ┆ │\n", "╞══════════════╪══════════╪══════════╪═══════════╪══════════╪══════════╪════════════════╪══════════╡\n", - "│ b ┆ 0.335964 ┆ 1.006118 ┆ 0.492847 ┆ 0.495003 ┆ 0.246962 ┆ 0.493421 ┆ 0.494359 │\n", - "│ a ┆ 0.334121 ┆ 1.001608 ┆ 0.499341 ┆ 0.498265 ┆ 0.249401 ┆ 0.499373 ┆ 0.497763 │\n", + "│ b ┆ 0.334018 ┆ 1.002893 ┆ 0.497512 ┆ 0.497872 ┆ 0.248846 ┆ 0.498345 ┆ 0.499504 │\n", + "│ a ┆ 0.332319 ┆ 0.997941 ┆ 0.502509 ┆ 0.5002 ┆ 0.250676 ┆ 0.499493 ┆ 0.501757 │\n", "└──────────────┴──────────┴──────────┴───────────┴──────────┴──────────┴────────────────┴──────────┘" ] }, @@ -494,7 +494,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
sen
str
"church"
"to"
"going"
"world"
"hello"
" + "shape: (5, 1)
sen
str
"hello"
"going"
"church"
"to"
"world"
" ], "text/plain": [ "shape: (5, 1)\n", @@ -503,11 +503,11 @@ "│ --- │\n", "│ str │\n", "╞════════╡\n", + "│ hello │\n", + "│ going │\n", "│ church │\n", "│ to │\n", - "│ going │\n", "│ world │\n", - "│ hello │\n", "└────────┘" ] }, @@ -539,7 +539,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (4, 1)
sen
str
"world"
"go"
"hello"
"church"
" + "shape: (4, 1)
sen
str
"go"
"hello"
"church"
"world"
" ], "text/plain": [ "shape: (4, 1)\n", @@ -548,10 +548,10 @@ "│ --- │\n", "│ str │\n", "╞════════╡\n", - "│ world │\n", "│ go │\n", "│ hello │\n", "│ church │\n", + "│ world │\n", "└────────┘" ] }, @@ -741,6 +741,116 @@ ").head()" ] }, + { + "cell_type": "code", + "execution_count": 19, + "id": "dc9477c1", + "metadata": {}, + "outputs": [], + "source": [ + "df = pl.DataFrame({\n", + " \"word\":[\"apple\", \"banana\", \"pineapple\", \"asasasas\", \"sasasass\"],\n", + " \"other_data\": [1,2,3,4,5]\n", + "})\n", + "gibberish = [\"asasasa\", \"sasaaasss\", \"asdasadadfa\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c50591e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (2, 2)
wordother_data
stri64
"asasasas"4
"sasasass"5
" + ], + "text/plain": [ + "shape: (2, 2)\n", + "┌──────────┬────────────┐\n", + "│ word ┆ other_data │\n", + "│ --- ┆ --- │\n", + "│ str ┆ i64 │\n", + "╞══════════╪════════════╡\n", + "│ asasasas ┆ 4 │\n", + "│ sasasass ┆ 5 │\n", + "└──────────┴────────────┘" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.filter(\n", + " pl.col(\"word\").str2.similar_to_vocab(\n", + " vocab = gibberish,\n", + " threshold = 0.5,\n", + " metric = \"leven\", # Levenshtein similarity. Other options: dleven, osa, jw\n", + " strategy = \"any\" # True if the word is similar to any word in vocab. Other options: \"all\", \"avg\"\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "7ece3794", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 6)
asasasasasaaasssasdasadadfaLCS based Fuzz match - applesOptimal String Alignment - applesJaro-Winkler - apples
f64f64f64f64f64f64
0.1428570.1111110.0909090.8333330.8333330.966667
0.4285710.3333330.2727270.1666670.00.444444
0.1111110.1111110.0909090.5555560.4444440.5
0.8750.6666670.5454550.250.250.527778
0.750.7777780.4545450.250.250.527778
" + ], + "text/plain": [ + "shape: (5, 6)\n", + "┌──────────┬───────────┬─────────────┬────────────────┬───────────────────────────┬────────────────┐\n", + "│ asasasa ┆ sasaaasss ┆ asdasadadfa ┆ LCS based Fuzz ┆ Optimal String Alignment ┆ Jaro-Winkler - │\n", + "│ --- ┆ --- ┆ --- ┆ match - apples ┆ - apple… ┆ apples │\n", + "│ f64 ┆ f64 ┆ f64 ┆ --- ┆ --- ┆ --- │\n", + "│ ┆ ┆ ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════════╪═══════════╪═════════════╪════════════════╪═══════════════════════════╪════════════════╡\n", + "│ 0.142857 ┆ 0.111111 ┆ 0.090909 ┆ 0.833333 ┆ 0.833333 ┆ 0.966667 │\n", + "│ 0.428571 ┆ 0.333333 ┆ 0.272727 ┆ 0.166667 ┆ 0.0 ┆ 0.444444 │\n", + "│ 0.111111 ┆ 0.111111 ┆ 0.090909 ┆ 0.555556 ┆ 0.444444 ┆ 0.5 │\n", + "│ 0.875 ┆ 0.666667 ┆ 0.545455 ┆ 0.25 ┆ 0.25 ┆ 0.527778 │\n", + "│ 0.75 ┆ 0.777778 ┆ 0.454545 ┆ 0.25 ┆ 0.25 ┆ 0.527778 │\n", + "└──────────┴───────────┴─────────────┴────────────────┴───────────────────────────┴────────────────┘" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.select(\n", + " pl.col(\"word\").str2.levenshtein(\"asasasa\", return_sim=True).alias(\"asasasa\"),\n", + " pl.col(\"word\").str2.levenshtein(\"sasaaasss\", return_sim=True).alias(\"sasaaasss\"),\n", + " pl.col(\"word\").str2.levenshtein(\"asdasadadfa\", return_sim=True).alias(\"asdasadadfa\"),\n", + " pl.col(\"word\").str2.fuzz(\"apples\").alias(\"LCS based Fuzz match - apples\"),\n", + " pl.col(\"word\").str2.osa(\"apples\", return_sim = True).alias(\"Optimal String Alignment - apples\"),\n", + " pl.col(\"word\").str2.jw(\"apples\").alias(\"Jaro-Winkler - apples\"),\n", + ")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -750,7 +860,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -763,7 +873,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
a
f64
0.097629
-0.030844
1.386883
0.591371
1.092199
" + "shape: (5, 1)
a
f64
0.124488
-0.261337
-0.234807
0.4425
0.062444
" ], "text/plain": [ "shape: (5, 1)\n", @@ -772,15 +882,15 @@ "│ --- │\n", "│ f64 │\n", "╞═══════════╡\n", - "│ 0.097629 │\n", - "│ -0.030844 │\n", - "│ 1.386883 │\n", - "│ 0.591371 │\n", - "│ 1.092199 │\n", + "│ 0.124488 │\n", + "│ -0.261337 │\n", + "│ -0.234807 │\n", + "│ 0.4425 │\n", + "│ 0.062444 │\n", "└───────────┘" ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -796,7 +906,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -809,7 +919,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1_000, 2)
arandom
f64f64
0.097629-0.085826
-0.0308442.563895
1.3868831.743471
0.5913712.199912
1.0921991.56007
-0.968060.728469
-0.2506911.077966
-0.805002-0.077792
-1.323398-2.068835
0.1861440.286291
-0.972090.340019
0.6652461.544772
0.1651991.771167
-0.890093-0.064297
0.201987-0.07098
-0.2145610.65106
0.3578540.7858
-0.421794-1.057364
-0.6215581.082361
1.1732951.176845
0.2641440.945366
-1.151482-0.582049
nullnull
nullnull
" + "shape: (1_000, 2)
arandom
f64f64
0.1244880.929309
-0.2613370.519294
-0.234807-0.169881
0.44251.300929
0.0624440.624859
-0.235291.705604
1.647374-0.577216
0.2762171.28727
-0.5021980.850403
1.5930941.750045
0.5159681.285998
0.0867461.720438
1.229287-2.82693
0.5941181.261264
-0.8813150.574474
-0.5388641.798715
0.9015241.230406
-0.9820392.267089
-1.3058750.719571
0.0702060.75865
0.0517791.629686
0.1311760.888318
nullnull
nullnull
" ], "text/plain": [ "shape: (1_000, 2)\n", @@ -818,19 +928,19 @@ "│ --- ┆ --- │\n", "│ f64 ┆ f64 │\n", "╞═══════════╪═══════════╡\n", - "│ 0.097629 ┆ -0.085826 │\n", - "│ -0.030844 ┆ 2.563895 │\n", - "│ 1.386883 ┆ 1.743471 │\n", - "│ 0.591371 ┆ 2.199912 │\n", + "│ 0.124488 ┆ 0.929309 │\n", + "│ -0.261337 ┆ 0.519294 │\n", + "│ -0.234807 ┆ -0.169881 │\n", + "│ 0.4425 ┆ 1.300929 │\n", "│ … ┆ … │\n", - "│ 0.264144 ┆ 0.945366 │\n", - "│ -1.151482 ┆ -0.582049 │\n", + "│ 0.051779 ┆ 1.629686 │\n", + "│ 0.131176 ┆ 0.888318 │\n", "│ null ┆ null │\n", "│ null ┆ null │\n", "└───────────┴───────────┘" ] }, - "execution_count": 20, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -844,7 +954,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -857,7 +967,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1_000, 2)
arandom_str
f64str
0.097629"Te"
-0.030844"SWs"
1.386883"s1yx"
0.591371"Lt"
1.092199"14"
-0.96806"lP"
-0.250691"q"
-0.805002"z"
-1.323398"K8D"
0.186144"f"
-0.97209"1Uyx"
0.665246"9Ttl"
0.165199"z"
-0.890093"s4"
0.201987"z"
-0.214561"H"
0.357854"ih"
-0.421794"LuLD"
-0.621558"M"
1.173295"v8O"
0.264144"ajS0"
-1.151482"GM4"
nullnull
nullnull
" + "shape: (1_000, 2)
arandom_str
f64str
0.124488"II"
-0.261337"T"
-0.234807"V"
0.4425"SW8"
0.062444"9To"
-0.23529"m3G4"
1.647374"2"
0.276217"VtU"
-0.502198"1"
1.593094"tR9"
0.515968"1k"
0.086746"Wx9"
1.229287"Bxw"
0.594118"33X"
-0.881315"sFoZ"
-0.538864"ozj"
0.901524"9kS"
-0.982039"W"
-1.305875"O8k7"
0.070206"4z"
0.051779"j4"
0.131176"vM"
nullnull
nullnull
" ], "text/plain": [ "shape: (1_000, 2)\n", @@ -866,19 +976,19 @@ "│ --- ┆ --- │\n", "│ f64 ┆ str │\n", "╞═══════════╪════════════╡\n", - "│ 0.097629 ┆ Te │\n", - "│ -0.030844 ┆ SWs │\n", - "│ 1.386883 ┆ s1yx │\n", - "│ 0.591371 ┆ Lt │\n", + "│ 0.124488 ┆ II │\n", + "│ -0.261337 ┆ T │\n", + "│ -0.234807 ┆ V │\n", + "│ 0.4425 ┆ SW8 │\n", "│ … ┆ … │\n", - "│ 0.264144 ┆ ajS0 │\n", - "│ -1.151482 ┆ GM4 │\n", + "│ 0.051779 ┆ j4 │\n", + "│ 0.131176 ┆ vM │\n", "│ null ┆ null │\n", "│ null ┆ null │\n", "└───────────┴────────────┘" ] }, - "execution_count": 21, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -892,7 +1002,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -905,7 +1015,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1_000, 2)
arandom_str
f64str
0.097629"jg7x5"
-0.030844"VsuAu"
1.386883"hT5ub"
0.591371"6AqFN"
1.092199"1hsY7"
-0.96806"qNpiS"
-0.250691"3vaf7"
-0.805002"B0849"
-1.323398"I4gZ6"
0.186144"HPn26"
-0.97209"PIFmq"
0.665246"2c3ir"
0.165199"HmW60"
-0.890093"PuRla"
0.201987"N3sXB"
-0.214561"kS2Ve"
0.357854"bCxwy"
-0.421794"Ao0Ff"
-0.621558"syYKi"
1.173295"dODkd"
0.264144"22mYg"
-1.151482"Q5Q4Y"
nullnull
nullnull
" + "shape: (1_000, 2)
arandom_str
f64str
0.124488"kyw4H"
-0.261337"jgyXc"
-0.234807"YtmDD"
0.4425"zQepW"
0.062444"T9Ejo"
-0.23529"Wpzkx"
1.647374"4np3i"
0.276217"rKeq6"
-0.502198"d1OeJ"
1.593094"kJSV2"
0.515968"Hu3mh"
0.086746"tHBlu"
1.229287"6Dt9t"
0.594118"rwIi9"
-0.881315"h42h0"
-0.538864"ASLeE"
0.901524"OQPdK"
-0.982039"w0tGD"
-1.305875"S1SBL"
0.070206"1dP8f"
0.051779"wxkhJ"
0.131176"vMO8h"
nullnull
nullnull
" ], "text/plain": [ "shape: (1_000, 2)\n", @@ -914,19 +1024,19 @@ "│ --- ┆ --- │\n", "│ f64 ┆ str │\n", "╞═══════════╪════════════╡\n", - "│ 0.097629 ┆ jg7x5 │\n", - "│ -0.030844 ┆ VsuAu │\n", - "│ 1.386883 ┆ hT5ub │\n", - "│ 0.591371 ┆ 6AqFN │\n", + "│ 0.124488 ┆ kyw4H │\n", + "│ -0.261337 ┆ jgyXc │\n", + "│ -0.234807 ┆ YtmDD │\n", + "│ 0.4425 ┆ zQepW │\n", "│ … ┆ … │\n", - "│ 0.264144 ┆ 22mYg │\n", - "│ -1.151482 ┆ Q5Q4Y │\n", + "│ 0.051779 ┆ wxkhJ │\n", + "│ 0.131176 ┆ vMO8h │\n", "│ null ┆ null │\n", "│ null ┆ null │\n", "└───────────┴────────────┘" ] }, - "execution_count": 22, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -940,7 +1050,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -953,7 +1063,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 4)
t-tests: statisticst-tests: pvaluenormality_test: statisticsnormality_test: pvalue
f64f64f64f64
0.2564410.7976460.6505030.722346
" + "shape: (1, 4)
t-tests: statisticst-tests: pvaluenormality_test: statisticsnormality_test: pvalue
f64f64f64f64
-0.8123040.4167480.2250330.893582
" ], "text/plain": [ "shape: (1, 4)\n", @@ -962,11 +1072,11 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════╪═════════════════╪════════════════════════════╪════════════════════════╡\n", - "│ 0.256441 ┆ 0.797646 ┆ 0.650503 ┆ 0.722346 │\n", + "│ -0.812304 ┆ 0.416748 ┆ 0.225033 ┆ 0.893582 │\n", "└─────────────────────┴─────────────────┴────────────────────────────┴────────────────────────┘" ] }, - "execution_count": 23, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -992,7 +1102,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 27, "id": "b46a72a5", "metadata": {}, "outputs": [ @@ -1006,7 +1116,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 5)
market_idgroup1group2category_1category_2
i64f64f64i64i64
00.2911770.36219227
10.3882050.09945116
20.4201540.05846922
00.9764380.12167512
10.5594360.34034503
" + "shape: (5, 5)
market_idgroup1group2category_1category_2
i64f64f64i64i64
00.667480.9155149
10.6586680.83132634
20.604690.48656532
00.6903750.91817716
10.5423070.31053947
" ], "text/plain": [ "shape: (5, 5)\n", @@ -1015,15 +1125,15 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ f64 ┆ f64 ┆ i64 ┆ i64 │\n", "╞═══════════╪══════════╪══════════╪════════════╪════════════╡\n", - "│ 0 ┆ 0.291177 ┆ 0.362192 ┆ 2 ┆ 7 │\n", - "│ 1 ┆ 0.388205 ┆ 0.099451 ┆ 1 ┆ 6 │\n", - "│ 2 ┆ 0.420154 ┆ 0.058469 ┆ 2 ┆ 2 │\n", - "│ 0 ┆ 0.976438 ┆ 0.121675 ┆ 1 ┆ 2 │\n", - "│ 1 ┆ 0.559436 ┆ 0.340345 ┆ 0 ┆ 3 │\n", + "│ 0 ┆ 0.66748 ┆ 0.91551 ┆ 4 ┆ 9 │\n", + "│ 1 ┆ 0.658668 ┆ 0.831326 ┆ 3 ┆ 4 │\n", + "│ 2 ┆ 0.60469 ┆ 0.486565 ┆ 3 ┆ 2 │\n", + "│ 0 ┆ 0.690375 ┆ 0.918177 ┆ 1 ┆ 6 │\n", + "│ 1 ┆ 0.542307 ┆ 0.310539 ┆ 4 ┆ 7 │\n", "└───────────┴──────────┴──────────┴────────────┴────────────┘" ] }, - "execution_count": 24, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1044,7 +1154,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "id": "adc4f66f", "metadata": {}, "outputs": [ @@ -1058,7 +1168,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 3)
t-testchi2-testf-test
struct[2]struct[2]struct[2]
{-0.568826,0.569487}{20.227158,0.984214}{1.657997,0.156847}
" + "shape: (1, 3)
t-testchi2-testf-test
struct[2]struct[2]struct[2]
{-0.275945,0.782596}{30.999896,0.705189}{1.753513,0.135333}
" ], "text/plain": [ "shape: (1, 3)\n", @@ -1067,11 +1177,11 @@ "│ --- ┆ --- ┆ --- │\n", "│ struct[2] ┆ struct[2] ┆ struct[2] │\n", "╞══════════════════════╪══════════════════════╪═════════════════════╡\n", - "│ {-0.568826,0.569487} ┆ {20.227158,0.984214} ┆ {1.657997,0.156847} │\n", + "│ {-0.275945,0.782596} ┆ {30.999896,0.705189} ┆ {1.753513,0.135333} │\n", "└──────────────────────┴──────────────────────┴─────────────────────┘" ] }, - "execution_count": 25, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1087,7 +1197,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 29, "id": "65dbb6bd", "metadata": {}, "outputs": [ @@ -1101,7 +1211,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 4)
market_idt-testchi2-testf-test
i64struct[2]struct[2]struct[2]
0{-0.886846,0.375226}{36.390403,0.450482}{-0.503377,-0.733276}
1{1.072105,0.283751}{23.387811,0.948028}{-2.182564,-0.068697}
2{-1.154529,0.248366}{33.069657,0.608716}{-0.961816,-0.427371}
" + "shape: (3, 4)
market_idt-testchi2-testf-test
i64struct[2]struct[2]struct[2]
0{-0.48335,0.628879}{44.733163,0.150736}{-0.643994,-0.631164}
1{-0.298857,0.765068}{32.508767,0.635416}{-0.490534,-0.742719}
2{0.310745,0.756014}{27.211027,0.854105}{-1.340865,-0.252492}
" ], "text/plain": [ "shape: (3, 4)\n", @@ -1110,13 +1220,13 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ struct[2] ┆ struct[2] ┆ struct[2] │\n", "╞═══════════╪══════════════════════╪══════════════════════╪═══════════════════════╡\n", - "│ 0 ┆ {-0.886846,0.375226} ┆ {36.390403,0.450482} ┆ {-0.503377,-0.733276} │\n", - "│ 1 ┆ {1.072105,0.283751} ┆ {23.387811,0.948028} ┆ {-2.182564,-0.068697} │\n", - "│ 2 ┆ {-1.154529,0.248366} ┆ {33.069657,0.608716} ┆ {-0.961816,-0.427371} │\n", + "│ 0 ┆ {-0.48335,0.628879} ┆ {44.733163,0.150736} ┆ {-0.643994,-0.631164} │\n", + "│ 1 ┆ {-0.298857,0.765068} ┆ {32.508767,0.635416} ┆ {-0.490534,-0.742719} │\n", + "│ 2 ┆ {0.310745,0.756014} ┆ {27.211027,0.854105} ┆ {-1.340865,-0.252492} │\n", "└───────────┴──────────────────────┴──────────────────────┴───────────────────────┘" ] }, - "execution_count": 27, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } diff --git a/python/polars_ds/str2.py b/python/polars_ds/str2.py index 9f8ac1a6..1bdf909b 100644 --- a/python/polars_ds/str2.py +++ b/python/polars_ds/str2.py @@ -1,5 +1,5 @@ import polars as pl -from typing import Union, Optional +from typing import Union, Optional, Literal from polars.utils.udfs import _get_shared_lib_location from .type_alias import AhoCorasickMatchKind import warnings @@ -492,7 +492,7 @@ def jw( self, other: Union[str, pl.Expr], weight: float = 0.1, parallel: bool = False ) -> pl.Expr: """ - Computes the Jaro-Winker similarity between this and the other str. + Computes the Jaro-Winkler similarity between this and the other str. Jaro-Winkler distance = 1 - Jaro-Winkler sim. Parameters @@ -550,6 +550,53 @@ def hamming( is_elementwise=True, ) + def similar_to_vocab( + self, + vocab: list[str], + threshold: float, + metric: Literal["leven", "dleven", "jw", "osa"] = "leven", + strategy: Literal["avg", "all", "any"] = "avg", + ) -> pl.Expr: + """ + Compare each word in the vocab with the each word in self. Filters self to the words + that are most similar to the words in the vocab. + + Parameters + ---------- + vocab + Any iterable collection of strings + threshold + A entry is considered similar to the words in the vocabulary if the similarity + is above (>=) the threshold + metric + Which similarity metric to use. One of `leven`, `dleven`, `jw`, `osa` + strategy + If `avg`, then will return true if the average similarity is above the threshold. + If `all`, then will return true if the similarity to all words in the vocab is above + the threshold. + If `any`, then will return true if the similarity to any words in the vocab is above + the threshold. + """ + if metric == "leven": + sims = [self.levenshtein(w, return_sim=True) for w in vocab] + elif metric == "dleven": + sims = [self.d_levenshtein(w, return_sim=True) for w in vocab] + elif metric == "osa": + sims = [self.osa(w, return_sim=True) for w in vocab] + elif sims == "jw": + sims = [self.jw(w, return_sim=True) for w in vocab] + else: + raise ValueError(f"Unknown metric for find_similar: {metric}") + + if strategy == "all": + return pl.all_horizontal(s >= threshold for s in sims) + elif strategy == "any": + return pl.any_horizontal(s >= threshold for s in sims) + elif strategy == "avg": + return (pl.sum_horizontal(sims) / len(vocab)) >= threshold + else: + raise ValueError(f"Unknown strategy for find_similar: {strategy}") + def tokenize(self, pattern: str = r"(?u)\b\w\w+\b", stem: bool = False) -> pl.Expr: """ Tokenize the string according to the pattern. This will only extract the words diff --git a/src/num_ext/entrophies.rs b/src/num_ext/entrophies.rs index a03ebc48..9fb9795c 100644 --- a/src/num_ext/entrophies.rs +++ b/src/num_ext/entrophies.rs @@ -6,7 +6,6 @@ use pyo3_polars::derive::polars_expr; // https://en.wikipedia.org/wiki/Sample_entropy // https://en.wikipedia.org/wiki/Approximate_entropy - #[polars_expr(output_type=Float64)] fn pl_approximate_entropy(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult { // inputs[0] is radius, the rest are the shifted columns @@ -33,16 +32,20 @@ fn pl_approximate_entropy(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResu let data_1_view = data.slice(s![..n1, ..dim.abs_diff(1)]); let tree = build_standard_kdtree(dim.abs_diff(1), leaf_size, &data_1_view)?; let nb_in_radius = query_nb_cnt(&tree, data_1_view, &super::l_inf_dist, r, parallel); - let phi_m: f64 = nb_in_radius.into_no_null_iter() - .fold(0_f64, |acc, x| acc + (x as f64 / n1 as f64).ln()) / n1 as f64; + let phi_m: f64 = nb_in_radius + .into_no_null_iter() + .fold(0_f64, |acc, x| acc + (x as f64 / n1 as f64).ln()) + / n1 as f64; // Step 3, 4, 5 for m + 1 in wiki let n2 = n1.abs_diff(1); let data_2_view = data.slice(s![..n2, ..]); let tree = build_standard_kdtree(dim, leaf_size, &data_2_view)?; let nb_in_radius = query_nb_cnt(&tree, data_2_view, &super::l_inf_dist, r, parallel); - let phi_m1: f64 = nb_in_radius.into_no_null_iter() - .fold(0_f64, |acc, x| acc + (x as f64 / n2 as f64).ln()) / n2 as f64; + let phi_m1: f64 = nb_in_radius + .into_no_null_iter() + .fold(0_f64, |acc, x| acc + (x as f64 / n2 as f64).ln()) + / n2 as f64; // Output Ok(Series::from_vec("", vec![(phi_m1 - phi_m).abs()])) diff --git a/src/num_ext/knn.rs b/src/num_ext/knn.rs index 380d5d96..58701bb0 100644 --- a/src/num_ext/knn.rs +++ b/src/num_ext/knn.rs @@ -80,12 +80,13 @@ fn pl_knn_ptwise(inputs: &[Series], kwargs: KdtreeKwargs) -> PolarsResult PolarsResult = v.into_iter() + let w: Vec = v + .into_iter() .map(|(_, i)| id.get(*i).unwrap()) .collect_vec(); builder.append_slice(w.as_slice()); diff --git a/src/stats_ext/chi2.rs b/src/stats_ext/chi2.rs index 408d8da1..a6df113d 100644 --- a/src/stats_ext/chi2.rs +++ b/src/stats_ext/chi2.rs @@ -5,7 +5,6 @@ use pyo3_polars::derive::polars_expr; #[polars_expr(output_type_func=simple_stats_output)] fn pl_chi2(inputs: &[Series]) -> PolarsResult { - let s1_name = "s1"; let s2_name = "s2"; diff --git a/src/stats_ext/fstats.rs b/src/stats_ext/fstats.rs index 6376b679..24eafee0 100644 --- a/src/stats_ext/fstats.rs +++ b/src/stats_ext/fstats.rs @@ -22,15 +22,18 @@ fn ftest(x: f64, f1: f64, f2: f64) -> Result { /// where n = inputs.len() - 1 = number of features /// And additionally x_n, .., x_{2n - 2} = p_0, .., p_{n-1}, are the p values. fn _f_stats(inputs: &[Series], return_p: bool) -> PolarsResult> { - let target = "target"; - let v = inputs.into_iter().enumerate().map(|(i , s)| { - if i == 0 { - s.clone().with_name(target) - } else { - s.clone().with_name(i.to_string().as_str()) - } - }).collect_vec(); + let v = inputs + .into_iter() + .enumerate() + .map(|(i, s)| { + if i == 0 { + s.clone().with_name(target) + } else { + s.clone().with_name(i.to_string().as_str()) + } + }) + .collect_vec(); let n_cols = v.len(); let df = DataFrame::new(v)?.lazy(); diff --git a/tests/test.ipynb b/tests/test.ipynb index a29d31e6..941af85c 100644 --- a/tests/test.ipynb +++ b/tests/test.ipynb @@ -2,14 +2,46 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "529f4422-5c3a-4bd6-abe0-a15edfc62abb", "metadata": {}, "outputs": [], "source": [ "import polars as pl\n", "import numpy as np\n", - "import polars_ds as pld" + "# import polars_ds as pld" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a3dbd5e", + "metadata": {}, + "outputs": [], + "source": [ + "import polars_ds as pld\n", + "df = pl.DataFrame({\n", + " \"word\":[\"apple\", \"banana\", \"pineapple\", \"asasasas\", \"sasasass\"],\n", + " \"other_data\": [1,2,3,4,5]\n", + "})\n", + "gibberish = [\"asasasa\", \"sasaaasss\", \"asdasadadfa\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73ca40b3", + "metadata": {}, + "outputs": [], + "source": [ + "df.filter(\n", + " pl.col(\"word\").str2.similar_to_vocab(\n", + " vocab = gibberish,\n", + " threshold = 0.5,\n", + " metric = \"leven\", # Levenshtein similarity. Other options: dleven, osa, jw\n", + " strategy = \"any\" # True if the word is similar to any word in vocab. Other options: \"all\", \"avg\"\n", + " )\n", + ")" ] }, { @@ -54,42 +86,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "0fbc1c14", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (5, 5)
market_idgroup1group2category_1category_2
i64f64f64i64i64
00.8632110.0514620
10.3751870.84121304
20.7317480.77574722
00.3666780.23632713
10.5213380.72882736
" - ], - "text/plain": [ - "shape: (5, 5)\n", - "┌───────────┬──────────┬──────────┬────────────┬────────────┐\n", - "│ market_id ┆ group1 ┆ group2 ┆ category_1 ┆ category_2 │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ i64 ┆ f64 ┆ f64 ┆ i64 ┆ i64 │\n", - "╞═══════════╪══════════╪══════════╪════════════╪════════════╡\n", - "│ 0 ┆ 0.863211 ┆ 0.05146 ┆ 2 ┆ 0 │\n", - "│ 1 ┆ 0.375187 ┆ 0.841213 ┆ 0 ┆ 4 │\n", - "│ 2 ┆ 0.731748 ┆ 0.775747 ┆ 2 ┆ 2 │\n", - "│ 0 ┆ 0.366678 ┆ 0.236327 ┆ 1 ┆ 3 │\n", - "│ 1 ┆ 0.521338 ┆ 0.728827 ┆ 3 ┆ 6 │\n", - "└───────────┴──────────┴──────────┴────────────┴────────────┘" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "size = 5000\n", "df = pl.DataFrame({\n", @@ -106,40 +106,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "1d84105b", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (3, 4)
market_idt-testchi2-testf-test
i64struct[2]struct[2]struct[2]
0{-0.678227,0.497675}{27.55702,0.842609}{3.304733,0.010454}
1{1.049668,0.293947}{28.920644,0.792821}{1.081389,0.364112}
2{0.246265,0.805493}{27.843576,0.832729}{0.851298,0.492663}
" - ], - "text/plain": [ - "shape: (3, 4)\n", - "┌───────────┬──────────────────────┬──────────────────────┬─────────────────────┐\n", - "│ market_id ┆ t-test ┆ chi2-test ┆ f-test │\n", - "│ --- ┆ --- ┆ --- ┆ --- │\n", - "│ i64 ┆ struct[2] ┆ struct[2] ┆ struct[2] │\n", - "╞═══════════╪══════════════════════╪══════════════════════╪═════════════════════╡\n", - "│ 0 ┆ {-0.678227,0.497675} ┆ {27.55702,0.842609} ┆ {3.304733,0.010454} │\n", - "│ 1 ┆ {1.049668,0.293947} ┆ {28.920644,0.792821} ┆ {1.081389,0.364112} │\n", - "│ 2 ┆ {0.246265,0.805493} ┆ {27.843576,0.832729} ┆ {0.851298,0.492663} │\n", - "└───────────┴──────────────────────┴──────────────────────┴─────────────────────┘" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# In segment T-test, chi2 test, F test made easy!\n", "df.group_by(\"market_id\").agg(\n",