Merge pull request #116 from holukas/more-stats

More stats
holukas · May 23, 2024 · ceebdb4 · ceebdb4
2 parents 7878a8b + eea4e42
commit ceebdb4
Show file tree

Hide file tree

Showing 10 changed files with 1,204 additions and 120 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,33 @@
 
 ![DIIVE](images/logo_diive1_256px.png)
 
+## v0.76.2 | 23 May 2024
+
+### Additions
+
+- Added function to calculate absolute double differences of a time series, which is the sum of absolute differences
+  between a data record and its preceding and next record. Used in class `zScoreIncrements` for finding (isolated)
+  outliers that are distant from neighboring records. (`diive.core.dfun.stats.double_diff_absolute`)
+- Added small function to calculate z-score stats of a time series (`diive.core.dfun.stats.sstats_zscore`)
+- Added small function to calculate stats for absolute double differences of a time
+  series (`diive.core.dfun.stats.sstats_doublediff_abs`)
+
+### Changes
+
+- Changed the algorithm for outlier detection when using `zScoreIncrements`. Data points are now flagged as outliers if
+  the z-scores of three absolute differences (previous record, next record and the sum of both) all exceed a specified
+  threshold.  (`diive.pkgs.outlierdetection.incremental.zScoreIncrements`)
+
+### Notebooks
+
+- Added new notebook for outlier detection using
+  class `LocalOutlierFactorAllData` (`notebooks/OutlierDetection/LocalOutlierFactorAllData.ipynb`)
+
+### Tests
+
+- Added new test case
+  for `LocalOutlierFactorAllData` (`tests.test_outlierdetection.TestOutlierDetection.test_lof_alldata`)
+
 ## v0.76.1 | 17 May 2024
 
 ### Additions

diff --git a/diive/core/dfun/stats.py b/diive/core/dfun/stats.py
@@ -1,7 +1,9 @@
 # import diive.pkgs.dfun
 # from stats.boxes import insert_statsboxes_txt
-from pandas import Series, DataFrame
 import pandas as pd
+from pandas import Series, DataFrame
+
+from diive.core.funcs.funcs import zscore
 
 
 def q75(x):
@@ -84,14 +86,14 @@ def sstats(s: Series) -> DataFrame:
     df.loc['MISSING', col] = series_numvals_missing(s)
     df.loc['MISSING_PERC', col] = series_perc_missing(s)
     df.loc['MEAN', col] = s.mean()
+    df.loc['MEDIAN', col] = s.quantile(q=0.50)
     df.loc['SD', col] = s.std()
     df.loc['VAR', col] = s.var()
     df.loc['SD/MEAN'] = series_sd_over_mean(s)
     # df.loc['MAD', col] = s.mad()  # deprecated in pandas
     # df.loc['CUMSUM_MIN', col] = s.cummin().iloc[-1]
     # df.loc['CUMSUM_MAX', col] = s.cummax().iloc[-1]
     df.loc['SUM', col] = s.sum()
-    df.loc['MEDIAN', col] = s.quantile(q=0.50)
     df.loc['MIN', col] = s.min()
     df.loc['MAX', col] = s.max()
     df.loc['P01', col] = s.quantile(q=0.01)
@@ -103,11 +105,40 @@ def sstats(s: Series) -> DataFrame:
     return df
 
 
+def sstats_doublediff_abs(s: Series) -> DataFrame:
+    """Return the `sstats` table computed on the absolute double differences of *s*."""
+    doublediff_abs, diff_to_prev_abs, diff_to_next_abs = double_diff_absolute(s=s)  # only the summed series is used below
+    df = sstats(s=doublediff_abs)
+    return df
+
+
+def sstats_zscore(s: Series) -> DataFrame:
+    """Return the `sstats` table computed on the z-scores of *s*."""
+    z = zscore(series=s)  # helper imported from diive.core.funcs.funcs
+    df = sstats(s=z)
+    return df
+
+
+def double_diff_absolute(s: Series) -> tuple[Series, Series, Series]:
+    """Return the sum of the absolute differences to the preceding and next
+    value, followed by the two absolute differences it was summed from."""
+    shifted_prev = s.shift(1)  # value(t-1) aligned to t
+    diff_to_prev = s - shifted_prev
+    diff_to_prev_abs = diff_to_prev.abs()
+    shifted_next = s.shift(-1)  # value(t+1) aligned to t
+    diff_to_next = s - shifted_next
+    diff_to_next_abs = diff_to_next.abs()
+    doublediff_abs = diff_to_prev_abs + diff_to_next_abs  # NOTE(review): presumably missing at the first/last record, which lack a neighbour - confirm
+    # NOTE(review): stale experiment, 'dd_abs' is not defined in this function: dd_abs = dd_abs ** 2
+    doublediff_abs.name = 'DOUBLE_DIFF_ABS'
+    return doublediff_abs, diff_to_prev_abs, diff_to_next_abs
+
+
 def example():
-    from diive.configs.exampledata import load_exampledata_pickle
-    df = load_exampledata_pickle()
-    series = df['NEE_CUT_REF_orig'].copy()
-    stats = sstats(series)
+    from diive.configs.exampledata import load_exampledata_parquet  # local import, only needed when the example runs
+    df = load_exampledata_parquet()
+    series = df['NEE_CUT_REF_f'].copy()
+    stats = sstats_doublediff_abs(series)  # stats of the absolute double differences, not of the raw series
     print(stats)
 
 

diff --git a/diive/core/utils/prints.py b/diive/core/utils/prints.py
@@ -61,10 +61,11 @@ def __init__(self, id: str, spacing: bool = True):
 
     def section(self):
         if self.spacing:
-            print("")
-            print("")
+            pass  # NOTE(review): with both prints commented out, self.spacing no longer changes what section() outputs
+            # print("")
+            # print("")
         # self.str(txt=f"{'=' * 40}")
-        self.str(txt=f"{self.id}")
+        self.str(txt=f"running {self.id} ...")
         # self.str(txt=f"{'=' * 40}")
 
     def str(self, txt: str):

diff --git a/diive/pkgs/outlierdetection/incremental.py b/diive/pkgs/outlierdetection/incremental.py
@@ -8,6 +8,7 @@
 """
 from pandas import Series, DatetimeIndex
 
+from diive.core.dfun.stats import double_diff_absolute
 from diive.core.base.flagbase import FlagBase
 from diive.core.utils.prints import ConsoleOutputDecorator
 from diive.pkgs.outlierdetection.zscore import zScore
@@ -25,17 +26,32 @@ def __init__(self,
                  verbose: bool = False):
         """Identify outliers based on the z-score of record increments.
 
+        First, several absolute increments are calculated for each data record at time t:
+            (1) increment1(t) = absolute( value(t) - value(t-1) )
+            (2) increment2(t) = absolute( value(t) - value(t+1) )
+            (3) increment1+2(t) = increment1(t) + increment2(t)
+
+        Second, z-scores are calculated for each of these increments:
+            (4) z-scores of increment1(t)
+            (5) z-scores of increment2(t)
+            (6) z-scores of increment1+2(t)
+
+        Third, all data records where z-score > *thres_zscore* are flagged:
+            (7) z-scores of increment1(t) > *thres_zscore* --> flag=2
+            (8) z-scores of increment2(t) > *thres_zscore* --> flag=2
+            (9) z-scores of increment1+2(t) > *thres_zscore* --> flag=2
+
+        Fourth, all data records where all three increments were flagged are flagged as outlier.
+            The sum of three flags in (7), (8) and (9) = 2 + 2 + 2 = 6 = outlier.
+
+        Only data records where all three flags were raised are flagged as outlier.
+
         Args:
             series: Time series in which outliers are identified.
             idstr: Identifier, added as suffix to output variable names.
             thres_zscore: Threshold for z-score, scores above this value will
                 be flagged as outlier. NOTE that in this case the z-scores are
-                calculated from the increments between data records in *series*,
-                whereby the increment at a point in time t is calculated as:
-                (1) increment1(t) = absolute( value(t) - value(t-1) )
-                (2) increment2(t) = absolute( value(t) - value(t+1) )
-                (3) increment(t) = increment1(t) + increment2(t)
-                (4) z-scores are calculated from increment(t)
+                calculated from the increments between data records in *series*.
             showplot: Show plot with results from the outlier detection.
             verbose: Print more text output.
 
@@ -66,53 +82,64 @@ def _flagtests(self, iteration) -> tuple[DatetimeIndex, DatetimeIndex, int]:
         """Perform tests required for this flag"""
 
         s = self.filteredseries.copy()
-        shifted_prev = s.shift(1)
-        increment_to_prev = s - shifted_prev
-        shifted_next = s.shift(-1)
-        increment_to_next = s - shifted_next
 
-        increment = increment_to_prev.abs() + increment_to_next.abs()
+        doublediff_abs, diff_to_prev_abs, diff_to_next_abs = double_diff_absolute(s=s)
+
+        # Run z-score test for all three diff series
+        flag_collect = Series(index=doublediff_abs.index, data=doublediff_abs)
+        for diff_ix, diff in enumerate([doublediff_abs, diff_to_prev_abs, diff_to_next_abs]):
+            flagtest_zscore = zScore(series=diff, thres_zscore=self.thres_zscore,
+                                     plottitle=f"z-score of {self.series.name} increments",
+                                     showplot=False, verbose=False)
+            flagtest_zscore.calc(repeat=False)
+            flag_zscore = flagtest_zscore.get_flag()
+            if diff_ix == 0:
+                flag_collect = flag_zscore.copy()
+            else:
+                flag_collect = flag_collect.add(flag_zscore)
 
-        increment.name = 'INCREMENT'
 
+        # import matplotlib.pyplot as plt
+        # flag_collect.plot()
+        # plt.show()
+
+        # increment.name = 'INCREMENT'
         # Run z-score test on increments and get resulting flag
-        flagtest_zscore = zScore(series=increment, thres_zscore=self.thres_zscore,
-                                 plottitle=f"z-score of {self.series.name} increments",
-                                 showplot=False, verbose=False)
-        flagtest_zscore.calc(repeat=False)
-        flag_zscore = flagtest_zscore.get_flag()
+        # flagtest_zscore = zScore(series=increment, thres_zscore=self.thres_zscore,
+        #                          plottitle=f"z-score of {self.series.name} increments",
+        #                          showplot=False, verbose=False)
+        # flagtest_zscore.calc(repeat=False)
+        # flag_zscore = flagtest_zscore.get_flag()
 
         # import pandas as pd
         # import matplotlib.pyplot as plt
         # df = pd.DataFrame(
         #     {
         #         'series': s,
-        #         # 'shifted_prev': shifted_prev,
-        #         'increment_to_prev': increment_to_prev,
-        #         # 'shifted_next': shifted_next,
-        #         'increment_to_next': increment_to_next,
-        #         'increment': increment,
-        #         'flag_zscore': flag_zscore,
+        #         'doublediff_abs': doublediff_abs,
+        #         'flag_zscore': flag_collect,
         #     }
         # )
         # df.plot(subplots=True)
         # plt.show()
 
-        ok = flag_zscore == 0
+        ok = flag_collect < 6
         ok = ok[ok].index
-        rejected = flag_zscore == 2
+        rejected = flag_collect == 6  # z-score flags for all three diffs are 2 and 3*2=6
         rejected = rejected[rejected].index
         n_outliers = len(rejected)
 
         if self.verbose:
             print(
-                f"ITERATION#{iteration}: Total found {increment.name} outliers: {n_outliers} values (daytime+nighttime)")
+                f"ITERATION#{iteration}: Total found outliers: {n_outliers} values (daytime+nighttime)")
 
         return ok, rejected, n_outliers
 
 
 def example():
     import importlib.metadata
+    import pandas as pd
+    import matplotlib.pyplot as plt
     import diive.configs.exampledata as ed
     from diive.pkgs.createvar.noise import add_impulse_noise
     from diive.core.plotting.timeseries import TimeSeries
@@ -127,19 +154,33 @@ def example():
     s_noise = add_impulse_noise(series=s,
                                 factor_low=-10,
                                 factor_high=3,
-                                contamination=0.04)  # Add impulse noise (spikes)
+                                contamination=0.04,
+                                seed=42)  # Add impulse noise (spikes)
     s_noise.name = f"{s.name}+noise"
     TimeSeries(s_noise).plot()
 
     zsi = zScoreIncrements(
         series=s_noise,
-        thres_zscore=4.5,
+        thres_zscore=5.5,
         showplot=True,
-        verbose=False
-    )
+        verbose=False)
 
     zsi.calc(repeat=True)
 
+    flag = zsi.get_flag()
+
+    frame = {'s': s, 's_noise': s_noise, 'flag': flag}
+    checkdf = pd.DataFrame.from_dict(frame)
+    good_data = checkdf.loc[checkdf['flag'] == 0]['s_noise']
+    rejected_data = checkdf.loc[checkdf['flag'] == 2]['s_noise']
+
+    fig, ax = plt.subplots()
+    ax.plot(good_data, color="#42A5F5", label="not an outlier", lw=0, ms=5, marker="o")
+    ax.plot(rejected_data, color="red", label="outlier", lw=0, ms=7, marker="X")
+    plt.title("Result")
+    plt.legend()
+    plt.show()
+
 
 if __name__ == '__main__':
     example()
diff --git a/diive/pkgs/outlierdetection/lof.py b/diive/pkgs/outlierdetection/lof.py
@@ -409,3 +409,36 @@ def _plot(self, df: DataFrame):
         title = f"Outlier detection - local outlier factor"
         fig.suptitle(title, fontsize=theme.FIGHEADER_FONTSIZE)
         fig.show()
+
+def example():
+    import importlib.metadata
+    import diive.configs.exampledata as ed
+    from diive.pkgs.createvar.noise import add_impulse_noise
+    from diive.core.plotting.timeseries import TimeSeries
+    import warnings
+    warnings.filterwarnings('ignore')  # ignore all warnings from here on
+    version_diive = importlib.metadata.version("diive")
+    print(f"diive version: v{version_diive}")
+    df = ed.load_exampledata_parquet()
+    s = df['Tair_f'].copy()
+    s = s.loc[s.index.year == 2018].copy()  # keep records from 2018 ...
+    s = s.loc[s.index.month == 7].copy()  # ... and of those only July
+    s_noise = add_impulse_noise(series=s,
+                                factor_low=-10,
+                                factor_high=3,
+                                contamination=0.04)  # Add impulse noise (spikes); NOTE(review): no seed is passed, so spikes may differ between runs - confirm
+    s_noise.name = f"{s.name}+noise"
+    TimeSeries(s_noise).plot()  # plot the noisy input series
+
+    lofa = LocalOutlierFactorAllData(
+        series=s_noise,
+        n_neighbors=200,
+        contamination='auto',
+        showplot=True
+    )
+
+    lofa.calc(repeat=True)
+
+
+if __name__ == '__main__':
+    example()
diff --git a/notebooks/OVERVIEW.ipynb b/notebooks/OVERVIEW.ipynb
@@ -26,7 +26,7 @@
    "metadata": {},
    "source": [
     "---\n",
-    "**Last updated:**: 17 May 2024  \n",
+    "**Last updated:**: 20 May 2024  \n",
     "**Author**: Lukas Hörtnagl (holukas@ethz.ch)  \n",
     "Overview of example notebooks for the time series processing library `diive`."
    ]
@@ -256,6 +256,7 @@
     "- [Absolute limits, separately for daytime and nighttime](OutlierDetection/AbsoluteLimitsDaytimeNighttime.ipynb)\n",
     "- [Absolute limits](OutlierDetection/AbsoluteLimits.ipynb)\n",
     "- <b>NEW!</b> [Incremental z-score: Identify outliers based on the z-score of double increments](OutlierDetection/zScoreIncremental.ipynb)\n",
+    "- <b>NEW!</b> [Local outlier factor across all data](OutlierDetection/LocalOutlierFactorAllData.ipynb)\n",
     "- <b>NEW!</b> [Local standard deviation](OutlierDetection/LocalSD.ipynb)"
    ]
   },