Skip to content

Commit

Permalink
Merge pull request #116 from holukas/more-stats
Browse files Browse the repository at this point in the history
More stats
  • Loading branch information
holukas authored May 23, 2024
2 parents 7878a8b + eea4e42 commit ceebdb4
Show file tree
Hide file tree
Showing 10 changed files with 1,204 additions and 120 deletions.
27 changes: 27 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,33 @@

![DIIVE](images/logo_diive1_256px.png)

## v0.76.2 | 23 May 2024

### Additions

- Added function to calculate absolute double differences of a time series, which is the sum of absolute differences
between a data record and its preceding and next record. Used in class `zScoreIncrements` for finding (isolated)
outliers that are distant from neighboring records. (`diive.core.dfun.stats.double_diff_absolute`)
- Added small function to calculate z-score stats of a time series (`diive.core.dfun.stats.sstats_zscore`)
- Added small function to calculate stats for absolute double differences of a time
series (`diive.core.dfun.stats.sstats_doublediff_abs`)

### Changes

- Changed the algorithm for outlier detection when using `zScoreIncrements`. Data points are now flagged as outliers if
the z-scores of three absolute differences (previous record, next record and the sum of both) all exceed a specified
threshold. (`diive.pkgs.outlierdetection.incremental.zScoreIncrements`)

### Notebooks

- Added new notebook for outlier detection using
class `LocalOutlierFactorAllData` (`notebooks/OutlierDetection/LocalOutlierFactorAllData.ipynb`)

### Tests

- Added new test case
for `LocalOutlierFactorAllData` (`tests.test_outlierdetection.TestOutlierDetection.test_lof_alldata`)

## v0.76.1 | 17 May 2024

### Additions
Expand Down
43 changes: 37 additions & 6 deletions diive/core/dfun/stats.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# import diive.pkgs.dfun
# from stats.boxes import insert_statsboxes_txt
from pandas import Series, DataFrame
import pandas as pd
from pandas import Series, DataFrame

from diive.core.funcs.funcs import zscore


def q75(x):
Expand Down Expand Up @@ -84,14 +86,14 @@ def sstats(s: Series) -> DataFrame:
df.loc['MISSING', col] = series_numvals_missing(s)
df.loc['MISSING_PERC', col] = series_perc_missing(s)
df.loc['MEAN', col] = s.mean()
df.loc['MEDIAN', col] = s.quantile(q=0.50)
df.loc['SD', col] = s.std()
df.loc['VAR', col] = s.var()
df.loc['SD/MEAN'] = series_sd_over_mean(s)
# df.loc['MAD', col] = s.mad() # deprecated in pandas
# df.loc['CUMSUM_MIN', col] = s.cummin().iloc[-1]
# df.loc['CUMSUM_MAX', col] = s.cummax().iloc[-1]
df.loc['SUM', col] = s.sum()
df.loc['MEDIAN', col] = s.quantile(q=0.50)
df.loc['MIN', col] = s.min()
df.loc['MAX', col] = s.max()
df.loc['P01', col] = s.quantile(q=0.01)
Expand All @@ -103,11 +105,40 @@ def sstats(s: Series) -> DataFrame:
return df


def sstats_doublediff_abs(s: Series) -> DataFrame:
    """Return summary statistics for the absolute double difference of *s*.

    The absolute double difference series is taken from
    `double_diff_absolute`; the two one-sided absolute differences that
    function also returns are not needed here and are discarded.
    """
    abs_double_diff, _, _ = double_diff_absolute(s=s)
    return sstats(s=abs_double_diff)


def sstats_zscore(s: Series) -> DataFrame:
    """Return summary statistics for the z-scores of *s*."""
    return sstats(s=zscore(series=s))


def double_diff_absolute(s: Series) -> tuple[Series, Series, Series]:
    """Calculate the sum of the absolute differences to both neighbors.

    For every record, the absolute difference to the preceding record and
    the absolute difference to the next record are computed and added up.

    Args:
        s: Series for which the differences are calculated.

    Returns:
        Tuple of three series in this order:
        the sum of both absolute differences (named 'DOUBLE_DIFF_ABS'),
        the absolute difference to the preceding record and
        the absolute difference to the next record.
        The first record has no preceding value and the last record has
        no next value, so the shifted series are missing there and the
        sum is missing at both ends.
    """
    # One-sided absolute differences, each against the series shifted by one record
    abs_to_prev = (s - s.shift(1)).abs()
    abs_to_next = (s - s.shift(-1)).abs()

    abs_both = (abs_to_prev + abs_to_next).rename('DOUBLE_DIFF_ABS')
    return abs_both, abs_to_prev, abs_to_next


def example():
from diive.configs.exampledata import load_exampledata_pickle
df = load_exampledata_pickle()
series = df['NEE_CUT_REF_orig'].copy()
stats = sstats(series)
from diive.configs.exampledata import load_exampledata_parquet
df = load_exampledata_parquet()
series = df['NEE_CUT_REF_f'].copy()
stats = sstats_doublediff_abs(series)
print(stats)


Expand Down
7 changes: 4 additions & 3 deletions diive/core/utils/prints.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,11 @@ def __init__(self, id: str, spacing: bool = True):

def section(self):
if self.spacing:
print("")
print("")
pass
# print("")
# print("")
# self.str(txt=f"{'=' * 40}")
self.str(txt=f"{self.id}")
self.str(txt=f"running {self.id} ...")
# self.str(txt=f"{'=' * 40}")

def str(self, txt: str):
Expand Down
101 changes: 71 additions & 30 deletions diive/pkgs/outlierdetection/incremental.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"""
from pandas import Series, DatetimeIndex

from diive.core.dfun.stats import double_diff_absolute
from diive.core.base.flagbase import FlagBase
from diive.core.utils.prints import ConsoleOutputDecorator
from diive.pkgs.outlierdetection.zscore import zScore
Expand All @@ -25,17 +26,32 @@ def __init__(self,
verbose: bool = False):
"""Identify outliers based on the z-score of record increments.
First, several absolute increments are calculated for each data record at time t:
(1) increment1(t) = absolute( value(t) - value(t-1) )
(2) increment2(t) = absolute( value(t) - value(t+1) )
(3) increment1+2(t) = increment1(t) + increment2(t)
Second, z-scores are calculated for each of these increments:
(4) z-scores of increment1(t)
(5) z-scores of increment2(t)
(6) z-scores of increment1+2(t)
Third, all data records where z-score > *thres_zscore* are flagged:
(7) z-scores of increment1(t) > *thres_zscore* --> flag=2
(8) z-scores of increment2(t) > *thres_zscore* --> flag=2
(9) z-scores of increment1+2(t) > *thres_zscore* --> flag=2
Fourth, all data records where all three increments were flagged are flagged as outlier.
The sum of three flags in (7), (8) and (9) = 2 + 2 + 2 = 6 = outlier.
Only data records where all three flags were raised are flagged as outlier.
Args:
series: Time series in which outliers are identified.
idstr: Identifier, added as suffix to output variable names.
thres_zscore: Threshold for z-score, scores above this value will
be flagged as outlier. NOTE that in this case the z-scores are
calculated from the increments between data records in *series*,
whereby the increment at a point in time t is calculated as:
(1) increment1(t) = absolute( value(t) - value(t-1) )
(2) increment2(t) = absolute( value(t) - value(t+1) )
(3) increment(t) = increment1(t) + increment2(t)
(4) z-scores are calculated from increment(t)
calculated from the increments between data records in *series*.
showplot: Show plot with results from the outlier detection.
verbose: Print more text output.
Expand Down Expand Up @@ -66,53 +82,64 @@ def _flagtests(self, iteration) -> tuple[DatetimeIndex, DatetimeIndex, int]:
"""Perform tests required for this flag"""

s = self.filteredseries.copy()
shifted_prev = s.shift(1)
increment_to_prev = s - shifted_prev
shifted_next = s.shift(-1)
increment_to_next = s - shifted_next

increment = increment_to_prev.abs() + increment_to_next.abs()
doublediff_abs, diff_to_prev_abs, diff_to_next_abs = double_diff_absolute(s=s)

# Run z-score test for all three diff series
flag_collect = Series(index=doublediff_abs.index, data=doublediff_abs)
for diff_ix, diff in enumerate([doublediff_abs, diff_to_prev_abs, diff_to_next_abs]):
flagtest_zscore = zScore(series=diff, thres_zscore=self.thres_zscore,
plottitle=f"z-score of {self.series.name} increments",
showplot=False, verbose=False)
flagtest_zscore.calc(repeat=False)
flag_zscore = flagtest_zscore.get_flag()
if diff_ix == 0:
flag_collect = flag_zscore.copy()
else:
flag_collect = flag_collect.add(flag_zscore)

increment.name = 'INCREMENT'

# import matplotlib.pyplot as plt
# flag_collect.plot()
# plt.show()

# increment.name = 'INCREMENT'
# Run z-score test on increments and get resulting flag
flagtest_zscore = zScore(series=increment, thres_zscore=self.thres_zscore,
plottitle=f"z-score of {self.series.name} increments",
showplot=False, verbose=False)
flagtest_zscore.calc(repeat=False)
flag_zscore = flagtest_zscore.get_flag()
# flagtest_zscore = zScore(series=increment, thres_zscore=self.thres_zscore,
# plottitle=f"z-score of {self.series.name} increments",
# showplot=False, verbose=False)
# flagtest_zscore.calc(repeat=False)
# flag_zscore = flagtest_zscore.get_flag()

# import pandas as pd
# import matplotlib.pyplot as plt
# df = pd.DataFrame(
# {
# 'series': s,
# # 'shifted_prev': shifted_prev,
# 'increment_to_prev': increment_to_prev,
# # 'shifted_next': shifted_next,
# 'increment_to_next': increment_to_next,
# 'increment': increment,
# 'flag_zscore': flag_zscore,
# 'doublediff_abs': doublediff_abs,
# 'flag_zscore': flag_collect,
# }
# )
# df.plot(subplots=True)
# plt.show()

ok = flag_zscore == 0
ok = flag_collect < 6
ok = ok[ok].index
rejected = flag_zscore == 2
rejected = flag_collect == 6 # z-score flags for all three diffs are 2 and 3*2=6
rejected = rejected[rejected].index
n_outliers = len(rejected)

if self.verbose:
print(
f"ITERATION#{iteration}: Total found {increment.name} outliers: {n_outliers} values (daytime+nighttime)")
f"ITERATION#{iteration}: Total found outliers: {n_outliers} values (daytime+nighttime)")

return ok, rejected, n_outliers


def example():
import importlib.metadata
import pandas as pd
import matplotlib.pyplot as plt
import diive.configs.exampledata as ed
from diive.pkgs.createvar.noise import add_impulse_noise
from diive.core.plotting.timeseries import TimeSeries
Expand All @@ -127,19 +154,33 @@ def example():
s_noise = add_impulse_noise(series=s,
factor_low=-10,
factor_high=3,
contamination=0.04) # Add impulse noise (spikes)
contamination=0.04,
seed=42) # Add impulse noise (spikes)
s_noise.name = f"{s.name}+noise"
TimeSeries(s_noise).plot()

zsi = zScoreIncrements(
series=s_noise,
thres_zscore=4.5,
thres_zscore=5.5,
showplot=True,
verbose=False
)
verbose=False)

zsi.calc(repeat=True)

flag = zsi.get_flag()

frame = {'s': s, 's_noise': s_noise, 'flag': flag}
checkdf = pd.DataFrame.from_dict(frame)
good_data = checkdf.loc[checkdf['flag'] == 0]['s_noise']
rejected_data = checkdf.loc[checkdf['flag'] == 2]['s_noise']

fig, ax = plt.subplots()
ax.plot(good_data, color="#42A5F5", label="not an outlier", lw=0, ms=5, marker="o")
ax.plot(rejected_data, color="red", label="outlier", lw=0, ms=7, marker="X")
plt.title("Result")
plt.legend()
plt.show()


if __name__ == '__main__':
example()
33 changes: 33 additions & 0 deletions diive/pkgs/outlierdetection/lof.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,3 +409,36 @@ def _plot(self, df: DataFrame):
title = f"Outlier detection - local outlier factor"
fig.suptitle(title, fontsize=theme.FIGHEADER_FONTSIZE)
fig.show()

def example():
    """Demonstrate `LocalOutlierFactorAllData` on a noisy air temperature series.

    Loads the example dataset, keeps `Tair_f` for July 2018, adds impulse
    noise (spikes) to it, plots the noisy series and then runs the outlier
    detection with plotting enabled.
    """
    import importlib.metadata
    import warnings

    import diive.configs.exampledata as ed
    from diive.core.plotting.timeseries import TimeSeries
    from diive.pkgs.createvar.noise import add_impulse_noise

    warnings.filterwarnings('ignore')
    version_diive = importlib.metadata.version("diive")
    print(f"diive version: v{version_diive}")

    # Restrict the example data to a single month of air temperature
    tair = ed.load_exampledata_parquet()['Tair_f'].copy()
    in_july_2018 = (tair.index.year == 2018) & (tair.index.month == 7)
    tair = tair.loc[in_july_2018].copy()

    # Add impulse noise (spikes)
    tair_noisy = add_impulse_noise(series=tair,
                                   factor_low=-10,
                                   factor_high=3,
                                   contamination=0.04)
    tair_noisy.name = f"{tair.name}+noise"
    TimeSeries(tair_noisy).plot()

    detector = LocalOutlierFactorAllData(
        series=tair_noisy,
        n_neighbors=200,
        contamination='auto',
        showplot=True
    )
    detector.calc(repeat=True)


if __name__ == '__main__':
example()
3 changes: 2 additions & 1 deletion notebooks/OVERVIEW.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"metadata": {},
"source": [
"---\n",
"**Last updated:**: 17 May 2024 \n",
"**Last updated:**: 20 May 2024 \n",
"**Author**: Lukas Hörtnagl (holukas@ethz.ch) \n",
"Overview of example notebooks for the time series processing library `diive`."
]
Expand Down Expand Up @@ -256,6 +256,7 @@
"- [Absolute limits, separately for daytime and nighttime](OutlierDetection/AbsoluteLimitsDaytimeNighttime.ipynb)\n",
"- [Absolute limits](OutlierDetection/AbsoluteLimits.ipynb)\n",
"- <b>NEW!</b> [Incremental z-score: Identify outliers based on the z-score of double increments](OutlierDetection/zScoreIncremental.ipynb)\n",
"- <b>NEW!</b> [Local outlier factor across all data](OutlierDetection/LocalOutlierFactorAllData.ipynb)\n",
"- <b>NEW!</b> [Local standard deviation](OutlierDetection/LocalSD.ipynb)"
]
},
Expand Down
Loading

0 comments on commit ceebdb4

Please sign in to comment.