Updated XGBoost, some fixes
holukas committed Apr 26, 2024
1 parent 333a694 commit 0fec103
Showing 13 changed files with 5,658 additions and 2,574 deletions.
31 changes: 28 additions & 3 deletions CHANGELOG.md
@@ -2,7 +2,27 @@

![DIIVE](images/logo_diive1_256px.png)

## v0.75.0 | XX XXX 2024
## v0.75.0 | 26 Apr 2024

### XGBoost gap-filling

[XGBoost](https://xgboost.readthedocs.io/en/stable/index.html) can now be used to fill gaps in time series data.
In `diive`, `XGBoost` is implemented in the class `XGBoostTS`, which adds options for easily including, e.g.,
lagged variants of feature variables, timestamp info (DOY, month, ...) and a continuous record number as features.
It also allows direct feature reduction by including a purely random feature (consisting of completely random
numbers) and calculating the permutation importance. All features whose permutation importance is lower than that
of the random feature can then be removed from the list of features before building the final model.
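The random-feature reduction idea can be sketched in a few lines of plain NumPy. This is an illustration only, not the `diive` implementation: the feature names, the toy data and the least-squares stand-in model are all hypothetical, and `XGBoostTS` uses the actual gradient-boosted model instead.

```python
import numpy as np

rng = np.random.default_rng(42)
n = 500

# Two informative features plus a noisy target (stand-ins for e.g. TA and VPD)
X = rng.normal(size=(n, 2))
y = 3.0 * X[:, 0] + 0.5 * X[:, 1] + rng.normal(scale=0.1, size=n)

# Append a purely random feature as the importance baseline
X = np.column_stack([X, rng.normal(size=n)])
names = ["TA", "VPD", "RANDOM"]

# Simple least-squares fit as a stand-in for the gradient-boosted model
coef, *_ = np.linalg.lstsq(X, y, rcond=None)

def mse(Xm):
    return float(np.mean((Xm @ coef - y) ** 2))

base = mse(X)
importance = {}
for j, name in enumerate(names):
    scores = []
    for _ in range(10):  # analogous to perm_n_repeats
        Xp = X.copy()
        Xp[:, j] = rng.permutation(Xp[:, j])
        scores.append(mse(Xp) - base)  # error increase when feature j is shuffled
    importance[name] = float(np.mean(scores))

# Keep only features whose permutation importance beats the random baseline
kept = [name for name in names[:-1] if importance[name] > importance["RANDOM"]]
print(kept)  # ['TA', 'VPD']
```

Shuffling an informative feature degrades the model error noticeably, while shuffling the already-random feature barely changes it, which is what makes it a usable cutoff.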

`XGBoostTS` and `RandomForestTS` both use the same base class `MlRegressorGapFillingBase`. This base class will also
facilitate the implementation of other gap-filling algorithms in the future.

Another fun (for me) addition is the new class `TimeSince`. It calculates the time since the last occurrence of
specific conditions. One example where this class can be useful is the calculation of 'time since last precipitation',
expressed as the number of records, which can help identify dry conditions. More examples: 'time since freezing
conditions', based on air temperature; 'time since management', based on management info, e.g. fertilization events.
Please see the notebook for some illustrative examples.
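The record-counting idea behind `TimeSince` can be sketched in pure Python. The function and data below are hypothetical illustrations, not the `diive` API:

```python
def time_since(values, condition):
    """Number of records since the condition last held; None before the first occurrence."""
    out, count = [], None
    for v in values:
        if condition(v):
            count = 0  # condition holds: reset the counter
        elif count is not None:
            count += 1  # condition held earlier: keep counting records
        out.append(count)
    return out

# 'Time since last precipitation', expressed as number of records
precip = [0.0, 2.5, 0.0, 0.0, 0.0, 1.2, 0.0]
print(time_since(precip, lambda v: v > 0))  # [None, 0, 1, 2, 3, 0, 1]
```

With a known record interval (e.g. 30 min), the record count converts directly to elapsed time.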

**Please note that `diive` is still under development and bugs can be expected.**

### New features

@@ -18,13 +38,18 @@
methods. At the moment used by `RandomForestTS` and `XGBoostTS`. (`diive.core.ml.common.MlRegressorGapFillingBase`)
- Added option to change line color directly in `TimeSeries` plots (`diive.core.plotting.timeseries.TimeSeries.plot`)

### Notebooks

- Added new notebook for gap-filling using `XGBoostTS` with minimal settings (`notebooks/GapFilling/XGBoostGapFillingMinimal.ipynb`)
- Added new notebook for gap-filling using `XGBoostTS` with more extensive settings (`notebooks/GapFilling/XGBoostGapFillingExtensive.ipynb`)
- Added new notebook for creating `TimeSince` variables (`notebooks/CalculateVariable/TimeSince.ipynb`)

### Tests

- Added test case for XGBoost gap-filling (`tests.test_gapfilling.TestGapFilling.test_gapfilling_xgboost`)
- Updated test case for random forest gap-filling (`tests.test_gapfilling.TestGapFilling.test_gapfilling_randomforest`)
- Harmonized the test case for XGBoostTS with the test case for RandomForestTS
- Added test case for quick random forest gap-filling (`tests.test_gapfilling.TestGapFilling.test_random_forest_quickfill`)
- Added test case for `TimeSince` variable creation (`tests.test_createvar.TestCreateVar.test_timesince`)

## v0.74.1 | 23 Apr 2024

2 changes: 1 addition & 1 deletion README.md
@@ -98,7 +98,7 @@ Format data to specific formats

Fill gaps in time series with various methods

- XGBoostTS ([notebook example](https://github.com/holukas/diive/blob/main/notebooks/GapFilling/XGBoostGapFilling.ipynb))
- XGBoostTS ([notebook example (minimal)](https://github.com/holukas/diive/blob/main/notebooks/GapFilling/XGBoostGapFillingMinimal.ipynb), [notebook example (more extensive)](https://github.com/holukas/diive/blob/main/notebooks/GapFilling/XGBoostGapFillingExtensive.ipynb))
- RandomForestTS ([notebook example](https://github.com/holukas/diive/blob/main/notebooks/GapFilling/RandomForestGapFilling.ipynb))
- Linear interpolation ([notebook example](https://github.com/holukas/diive/blob/main/notebooks/GapFilling/LinearInterpolation.ipynb))
- Quick random forest gap-filling ([notebook example](https://github.com/holukas/diive/blob/main/notebooks/GapFilling/QuickRandomForestGapFilling.ipynb))
26 changes: 21 additions & 5 deletions diive/core/ml/common.py
@@ -31,6 +31,7 @@ def __init__(
perm_n_repeats: int = 10,
test_size: float = 0.25,
features_lag: list = None,
features_lag_exclude_cols: list = None,
include_timestamp_as_features: bool = False,
add_continuous_record_number: bool = False,
sanitize_timestamp: bool = False,
@@ -66,6 +67,10 @@ def __init__(
TA+1 = [ 6, 7, 8, NaN] --> each TA record is paired with the next record TA+1
TA+2 = [ 7, 8, NaN, NaN]
features_lag_exclude_cols:
List of predictors for which no lagged variants are added.
Example: with ['A', 'B'] no lagged variants for variables 'A' and 'B' are added.
include_timestamp_as_features:
Include timestamp info as integer data: year, season, month, week, doy, hour
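The lagged-variant layout shown in the `features_lag` docstring (TA+1, TA+2) can be sketched with a plain pandas `shift`; this is an illustration of the pairing, not the `fr.lagged_variants` implementation:

```python
import pandas as pd

# TA = [5, 6, 7, 8] as in the docstring example
df = pd.DataFrame({"TA": [5, 6, 7, 8]})
for step in (1, 2):
    # shift(-step) pairs each record with the record `step` rows ahead
    df[f"TA+{step}"] = df["TA"].shift(-step)

print(df["TA+1"].tolist())  # [6.0, 7.0, 8.0, nan]
```

Note that `shift` upcasts the integer column to float so the trailing gaps can be stored as NaN.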
@@ -89,13 +94,21 @@
self.model_df = input_df.copy()
self.target_col = target_col
self.kwargs = kwargs
self.perm_n_repeats = perm_n_repeats
self.perm_n_repeats = perm_n_repeats if perm_n_repeats > 0 else 1
self.test_size = test_size
self.features_lag = features_lag
self.features_lag_exclude_cols = features_lag_exclude_cols
self.verbose = verbose

if self.regressor == RandomForestRegressor:
self.gfsuffix = '_gfRF'
elif self.regressor == XGBRegressor:
self.gfsuffix = '_gfXG'
else:
self.gfsuffix = '_gf'

if self.features_lag and (len(self.model_df.columns) > 1):
self.model_df = self._lag_features()
self.model_df = self._lag_features(features_lag_exclude_cols=features_lag_exclude_cols)

if include_timestamp_as_features:
self.model_df = include_timestamp_as_cols(df=self.model_df, txt="")
@@ -603,12 +616,15 @@ def _add_random_variable(self, df: DataFrame) -> tuple[DataFrame, str]:
# df[random_col] = np.random.rand(df.shape[0], 1)
return df, random_col

def _lag_features(self):
def _lag_features(self, features_lag_exclude_cols):
"""Add lagged variants of variables as new features"""
exclude_cols = [self.target_col]
if features_lag_exclude_cols:
exclude_cols += features_lag_exclude_cols
return fr.lagged_variants(df=self.model_df,
stepsize=1,
lag=self.features_lag,
exclude_cols=[self.target_col])
exclude_cols=exclude_cols)

def _check_n_cols(self):
"""Check number of columns"""
@@ -781,7 +797,7 @@ def _define_cols(self):
self.pred_fullmodel_col = ".PREDICTIONS_FULLMODEL"
self.pred_fallback_col = ".PREDICTIONS_FALLBACK"
self.pred_gaps_col = ".GAP_PREDICTIONS"
self.target_gapfilled_col = f"{self.target_col}_gfRF"
self.target_gapfilled_col = f"{self.target_col}{self.gfsuffix}"
self.target_gapfilled_flag_col = f"FLAG_{self.target_gapfilled_col}_ISFILLED" # "[0=measured]"
self.target_gapfilled_cumu_col = ".GAPFILLED_CUMULATIVE"

3 changes: 3 additions & 0 deletions diive/core/plotting/timeseries.py
@@ -135,6 +135,9 @@ def plot(self, color: str = None):
zorder=99, label=label)
self._apply_format()

if self.showplot:
self.fig.show()

def _apply_format(self):
"""Format matplotlib plot"""

12 changes: 6 additions & 6 deletions diive/pkgs/createvar/timesince.py
@@ -69,16 +69,16 @@ def _setup(self) -> pd.DataFrame:
df[self.series.name] = self.series.copy()

# Upper limit
if self.upper_lim:
df[self.upper_lim_col] = self.upper_lim
else:
if self.upper_lim is None:
df[self.upper_lim_col] = self.series.max()
else:
df[self.upper_lim_col] = self.upper_lim

# Lower limit
if self.lower_lim:
df[self.lower_lim_col] = self.lower_lim
else:
if self.lower_lim is None:
df[self.lower_lim_col] = self.series.min()
else:
df[self.lower_lim_col] = self.lower_lim

df[self.flag_col] = pd.NA
return df
60 changes: 48 additions & 12 deletions diive/pkgs/gapfilling/xgboost_ts.py
@@ -27,9 +27,10 @@

class XGBoostTS(MlRegressorGapFillingBase):

def __init__(self, input_df: DataFrame, target_col: str or tuple, verbose: int = 0, perm_n_repeats: int = 10,
test_size: float = 0.25, features_lag: list = None, include_timestamp_as_features: bool = False,
add_continuous_record_number: bool = False, sanitize_timestamp: bool = False, **kwargs):
def __init__(self, input_df: DataFrame, target_col: str or tuple, verbose: int = 0, perm_n_repeats: int = 3,
test_size: float = 0.25, features_lag: list = None, features_lag_exclude_cols: list = None,
include_timestamp_as_features: bool = False, add_continuous_record_number: bool = False,
sanitize_timestamp: bool = False, **kwargs):
"""
Gap-fill timeseries with predictions from an XGBoost model
@@ -42,6 +43,7 @@ def __init__(self, input_df: DataFrame, target_col: str or tuple, verbose: int =
perm_n_repeats:
Number of repeats for calculating permutation feature importance.
Must be greater than 0.
test_size:
Proportion of the dataset to include in the test split,
@@ -60,6 +62,10 @@ def __init__(self, input_df: DataFrame, target_col: str or tuple, verbose: int =
TA+1 = [ 6, 7, 8, NaN] --> each TA record is paired with the next record TA+1
TA+2 = [ 7, 8, NaN, NaN]
features_lag_exclude_cols:
List of predictors for which no lagged variants are added.
Example: with ['A', 'B'] no lagged variants for variables 'A' and 'B' are added.
include_timestamp_as_features:
Include timestamp info as integer data: year, season, month, week, doy, hour
@@ -90,6 +96,7 @@ def __init__(self, input_df: DataFrame, target_col: str or tuple, verbose: int =
perm_n_repeats=perm_n_repeats,
test_size=test_size,
features_lag=features_lag,
features_lag_exclude_cols=features_lag_exclude_cols,
include_timestamp_as_features=include_timestamp_as_features,
add_continuous_record_number=add_continuous_record_number,
sanitize_timestamp=sanitize_timestamp,
@@ -101,7 +108,7 @@ def example_xgbts():
# Setup, user settings
# TARGET_COL = 'LE_orig'
TARGET_COL = 'NEE_CUT_REF_orig'
subsetcols = [TARGET_COL, 'Tair_f', 'VPD_f', 'Rg_f']
subsetcols = [TARGET_COL, 'Tair_f', 'VPD_f', 'Rg_f', 'SWC_FF0_0.15_1', 'PPFD']

# Example data
from diive.configs.exampledata import load_exampledata_parquet
@@ -113,16 +120,43 @@ def example_xgbts():
# df = df[remove].copy()

# Subset
keep = df_orig.index.year >= 2000
# keep = df_orig.index.year >= 2021
# df = df_orig[keep].copy()
df = df_orig.copy()
df = df_orig[keep].copy()
df_orig = df_orig[keep].copy()
# df = df_orig.copy()

# Subset with target and features
# Only High-quality (QCF=0) measured NEE used for model training in this example
lowquality = df["QCF_NEE"] > 0
df.loc[lowquality, TARGET_COL] = np.nan
df = df[subsetcols].copy()

# Calculate additional features
from diive.pkgs.createvar.timesince import TimeSince
ts = TimeSince(df['Tair_f'], upper_lim=0, include_lim=True)
ts.calc()
ts_series = ts.get_timesince()
# xxx = ts.get_full_results()
df['TA>0'] = ts_series

ts = TimeSince(df['Tair_f'], lower_lim=20, include_lim=True)
ts.calc()
ts_series = ts.get_timesince()
# xxx = ts.get_full_results()
df['TA>20'] = ts_series

from diive.pkgs.createvar.daynightflag import DaytimeNighttimeFlag
dnf = DaytimeNighttimeFlag(
timestamp_index=df.index,
nighttime_threshold=50,
lat=46.815333,
lon=9.855972,
utc_offset=1)
results = dnf.get_results()
df['DAYTIME'] = results['DAYTIME'].copy()
df['NIGHTTIME'] = results['NIGHTTIME'].copy()

# from diive.core.plotting.timeseries import TimeSeries # For simple (interactive) time series plotting
# TimeSeries(series=df[TARGET_COL]).plot()

@@ -137,13 +171,15 @@ def example_xgbts():
verbose=1,
# features_lag=None,
features_lag=[-1, -1],
features_lag_exclude_cols=['Rg_f', 'TA>0', 'TA>20', 'DAYTIME', 'NIGHTTIME'],
# include_timestamp_as_features=False,
include_timestamp_as_features=True,
# add_continuous_record_number=False,
add_continuous_record_number=True,
sanitize_timestamp=True,
perm_n_repeats=9,
n_estimators=99,
n_estimators=199,
# n_estimators=99,
random_state=42,
# booster='gbtree', # gbtree (default), gblinear, dart
# device='cpu',
@@ -161,7 +197,7 @@ def example_xgbts():
# colsample_bynode=1,
# reg_lambda=1,
# reg_alpha=0,
tree_method='hist', # auto, hist, approx, exact
tree_method='auto', # auto, hist, approx, exact
# scale_pos_weight=1,
# grow_policy='depthwise', # depthwise, lossguide
# max_leaves=0,
@@ -203,15 +239,15 @@ def example_xgbts():

# mds = df_orig['NEE_CUT_REF_f'].copy()
# mds = mds[mds.index.year >= 2016]
import matplotlib.pyplot as plt
from diive.core.plotting.timeseries import TimeSeries
# # rfts.gapfilling_df_['.PREDICTIONS_FALLBACK'].cumsum().plot()
# # rfts.gapfilling_df_['.PREDICTIONS_FULLMODEL'].cumsum().plot()
# # rfts.gapfilling_df_['.PREDICTIONS'].cumsum().plot()
xgbts.get_gapfilled_target().cumsum().plot()
df_orig['NEE_CUT_REF_f'].cumsum().plot()
TimeSeries(series=xgbts.get_gapfilled_target().cumsum()).plot()
TimeSeries(df_orig['NEE_CUT_REF_f'].cumsum()).plot()
# mds.cumsum().plot()
# plt.legend()
plt.show()
# plt.show()

# from diive.core.plotting.timeseries import TimeSeries # For simple (interactive) time series plotting
# TimeSeries(series=df[TARGET_COL]).plot()
