Merge pull request #218 from holukas/indev

Indev
holukas · Sep 22, 2024 · d6f9a3d · d6f9a3d
2 parents e6fc944 + 06bef6b
commit d6f9a3d
Show file tree

Hide file tree

Showing 41 changed files with 8,681 additions and 3,842 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,43 @@
 
 ![DIIVE](images/logo_diive1_256px.png)
 
+## v0.82.1 | XX Sep 2024
+
+## Notebooks
+
+- Added notebook showing an example for `LongTermGapFillingRandomForestTS` (
+  `notebooks/GapFilling/LongTermRandomForestGapFilling.ipynb`)
+- Added notebook example for `MeasurementOffset` (`notebooks/Corrections/MeasurementOffset.ipynb`)
+
+## Tests
+
+- Added unittest for `LongTermGapFillingRandomForestTS` (
+  `tests.test_gapfilling.TestGapFilling.test_gapfilling_longterm_randomforest`)
+- Added unittest for `WindDirOffset` (`tests.test_corrections.TestCorrections.test_winddiroffset`)
+- Added unittest for `DaytimeNighttimeFlag` (`tests.test_createvar.TestCreateVar.test_daytime_nighttime_flag`)
+- Added unittest for `calc_vpd_from_ta_rh` (`tests.test_createvar.TestCreateVar.test_calc_vpd`)
+- Added unittest for `percentiles101` (`tests.test_analyses.TestAnalyses.test_percentiles`)
+- Added unittest for `GapFinder` (`tests.test_analyses.TestAnalyses.test_gapfinder`)
+- Added unittest for `SortingBinsMethod` (`tests.test_analyses.TestAnalyses.test_sorting_bins_method`)
+- Added unittest for `daily_correlation` (`tests.test_analyses.TestAnalyses.test_daily_correlation`)
+- Added unittest for `QuantileXYAggZ` (`tests.test_analyses.TestCreateVar.test_quantilexyaggz`)
+- 49/49 unittests ran successfully
+
+## Bugfixes
+
+- Fixed bug that caused results from long-term gap-filling to be inconsistent *despite* using a fixed random state.
+  Cause: when reducing features across years, removing duplicate features from the list of found features
+  produced a list whose element order changed from run to run. This in turn produced slightly different
+  gap-filling results each time the long-term gap-filling was executed. The issue was observed with Python
+  version `3.9.19`.
+    - Simplified example, where `input_list` is a list that contains some duplicate elements:
+    - Running `output_list = list(set(input_list))` generates an `output_list` whose elements can be in a different
+      order each run, because sets are unordered. The elements were otherwise the same, only their order changed.
+    - To keep the order of elements consistent, it was necessary to call `output_list.sort()`.
+    - (`diive.pkgs.gapfilling.longterm.LongTermGapFillingBase.reduce_features_across_years`)
+- Corrected wind direction could previously be 360°; a value of exactly 360° is now returned as 0° (
+  `diive.pkgs.corrections.winddiroffset.WindDirOffset._correct_degrees`)
+
 ## v0.82.0 | 19 Sep 2024
 
 ## Long-term gap-filling

diff --git a/README.md b/README.md
@@ -35,6 +35,7 @@ Recent releases: [Releases](https://github.com/holukas/diive/releases)
 
 ### Corrections
 
+- **Offset correction for measurement**: correct a measurement by its offset relative to a replicate measurement ([notebook example](https://github.com/holukas/diive/blob/main/notebooks/Corrections/MeasurementOffset.ipynb))
 - **Offset correction radiation**: correct nighttime offset of radiation data and set nighttime to zero
 - **Offset correction relative humidity**: correct RH values > 100%
 - **Offset correction wind direction**: correct wind directions by offset, calculated based on reference time period ([notebook example](https://github.com/holukas/diive/blob/main/notebooks/Corrections/WindDirectionOffset.ipynb))
@@ -101,6 +102,7 @@ _Fill gaps in time series with various methods._
 
 - **XGBoostTS** ([notebook example (minimal)](https://github.com/holukas/diive/blob/main/notebooks/GapFilling/XGBoostGapFillingMinimal.ipynb), [notebook example (more extensive)](https://github.com/holukas/diive/blob/main/notebooks/GapFilling/XGBoostGapFillingExtensive.ipynb))
 - **RandomForestTS** ([notebook example](https://github.com/holukas/diive/blob/main/notebooks/GapFilling/RandomForestGapFilling.ipynb))
+- **Long-term gap-filling using RandomForestTS** ([notebook example](https://github.com/holukas/diive/blob/main/notebooks/GapFilling/LongTermRandomForestGapFilling.ipynb))
 - **Linear interpolation** ([notebook example](https://github.com/holukas/diive/blob/main/notebooks/GapFilling/LinearInterpolation.ipynb))
 - **Quick random forest gap-filling** ([notebook example](https://github.com/holukas/diive/blob/main/notebooks/GapFilling/QuickRandomForestGapFilling.ipynb))
 

diff --git a/diive/core/ml/common.py b/diive/core/ml/common.py
@@ -1,6 +1,7 @@
 """
 kudos: https://datascience.stackexchange.com/questions/15135/train-test-validation-set-splitting-in-sklearn
 """
+
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
@@ -24,22 +25,19 @@
 
 class MlRegressorGapFillingBase:
 
-    def __init__(
-            self,
-            regressor,
-            input_df: DataFrame,
-            target_col: str or tuple,
-            verbose: int = 0,
-            perm_n_repeats: int = 10,
-            test_size: float = 0.20,
-            features_lag: list[int, int] = None,
-            features_lag_exclude_cols: list = None,
-            include_timestamp_as_features: bool = False,
-            add_continuous_record_number: bool = False,
-            sanitize_timestamp: bool = False,
-            random_state: int = None,
-            **kwargs
-    ):
+    def __init__(self,
+                 regressor,
+                 input_df: DataFrame,
+                 target_col: str or tuple,
+                 verbose: int = 0,
+                 features_lag: list = None,
+                 features_lag_exclude_cols: list = None,
+                 include_timestamp_as_features: bool = False,
+                 add_continuous_record_number: bool = False,
+                 sanitize_timestamp: bool = False,
+                 perm_n_repeats: int = 10,
+                 test_size: float = 0.25,
+                 **kwargs):
         """
         Gap-fill timeseries with predictions from random forest model
 
@@ -104,14 +102,9 @@ def __init__(
         self.include_timestamp_as_features = include_timestamp_as_features
         self.add_continuous_record_number = add_continuous_record_number
         self.sanitize_timestamp = sanitize_timestamp
-        self.random_state = random_state
         self.kwargs = kwargs
 
-        # Update model kwargs with random state
-        if self.random_state:
-            self.kwargs['random_state'] = self.random_state
-        else:
-            self.kwargs['random_state'] = None
+        self._random_state = self.kwargs['random_state'] if 'random_state' in self.kwargs else None
 
         if self.regressor == RandomForestRegressor:
             self.gfsuffix = '_gfRF'
@@ -138,11 +131,13 @@ def __init__(
                 print(f"    --> {nc} ({fstats[nc]['count'].astype(int)} values)")
             print(f"This means that not all target values can be predicted based on the full model.")
 
-        # Create training (80%) and testing dataset (20%)
+        # Create training (75%) and testing dataset (25%)
         # Sort index to keep original order
         _temp_df = self.model_df.copy().dropna()
+
         self.train_df, self.test_df = train_test_split(_temp_df, test_size=self.test_size,
-                                                       random_state=self.random_state, shuffle=True)
+                                                       random_state=self._random_state, shuffle=True)
+
         self.train_df = self.train_df.sort_index()
         self.test_df = self.test_df.sort_index()
 
@@ -161,7 +156,7 @@ def __init__(
         self._scores = dict()
         self._scores_traintest = dict()
         self._accepted_features = []
-        self._rejected_features = "None."
+        self._rejected_features = []
 
         self.n_timeseriessplits = None
 
@@ -309,7 +304,8 @@ def trainmodel(self,
         idtxt = f"TRAIN & TEST "
 
         # Set training and testing data
-        y_train = np.array(self.train_df[self.target_col])
+        train_df = self.train_df.copy()
+        y_train = np.array(train_df[self.target_col])
         X_train = np.array(self.train_df.drop(self.target_col, axis=1))
         X_test = np.array(self.test_df.drop(self.target_col, axis=1))
         y_test = np.array(self.test_df[self.target_col])
@@ -346,7 +342,8 @@ def trainmodel(self,
             plot_observed_predicted(predictions=pred_y_test,
                                     targets=y_test,
                                     scores=self.scores_traintest_,
-                                    infotxt=f"{idtxt} trained on training set, tested on test set")
+                                    infotxt=f"{idtxt} trained on training set, tested on test set",
+                                    random_state=self._random_state)
 
             print(f">>> Plotting residuals and prediction error ...")
             plot_prediction_residuals_error_regr(
@@ -462,6 +459,8 @@ def reduce_features(self, factor: float = 1):
         # Instantiate model with params
         model = self.regressor(**self.kwargs)
 
+        model.get_params()
+
         # Fit model to training data
         model = self._fitmodel(model=model, X_train=X, y_train=y, X_test=X, y_test=y)
 
@@ -746,10 +745,11 @@ def _permutation_importance(self, model, X, y, X_names) -> DataFrame:
         """Calculate permutation importance"""
 
         # https://scikit-learn.org/stable/modules/permutation_importance.html#permutation-feature-importance
+
         fi = permutation_importance(estimator=model,
                                     X=X, y=y,
                                     n_repeats=self.perm_n_repeats,
-                                    random_state=self.random_state,
+                                    random_state=self._random_state,
                                     scoring='r2',
                                     n_jobs=-1)
 
@@ -765,7 +765,7 @@ def _permutation_importance(self, model, X, y, X_names) -> DataFrame:
     def _add_random_variable(self, df: DataFrame) -> tuple[DataFrame, str]:
         # Add random variable as benchmark for relevant feature importances
         random_col = '.RANDOM'  # Random variable as benchmark for relevant importances
-        df[random_col] = np.random.RandomState(self.kwargs['random_state']).randn(df.shape[0], 1)
+        df[random_col] = np.random.RandomState(self._random_state).randn(df.shape[0], 1)
         # df[random_col] = np.random.rand(df.shape[0], 1)
         return df, random_col
 
@@ -827,7 +827,8 @@ def _fillgaps_fullmodel(self, showplot_scores, showplot_importance):
             plot_observed_predicted(predictions=pred_y,
                                     targets=y,
                                     scores=self.scores_,
-                                    infotxt=f"trained on training set, tested on FULL set")
+                                    infotxt=f"trained on training set, tested on FULL set",
+                                    random_state=self._random_state)
 
             # print(f">>> Plotting residuals and prediction error based on all data ...")
             # plot_prediction_residuals_error_regr(

diff --git a/diive/pkgs/corrections/offsetcorrection.py b/diive/pkgs/corrections/offsetcorrection.py
@@ -2,7 +2,7 @@
 import numpy as np
 import pandas as pd
 from pandas import Series, DataFrame
-
+import decimal
 import diive.core.dfun.frames as frames
 from diive.core.plotting.plotfuncs import quickplot
 from diive.core.utils.prints import ConsoleOutputDecorator
@@ -50,6 +50,9 @@ def __init__(self,
         self.offset_end = offset_end
         self.offset_stepsize = offset_stepsize
 
+        d = decimal.Decimal(str(offset_stepsize))
+        self.n_digits_after_comma = abs(d.as_tuple().exponent)
+
         # Wind directions shifted by offset that yielded maximum absolute
         # correlation with reference
         self.measurement_shifted = self.measurement.copy()
@@ -60,8 +63,6 @@ def __init__(self,
         # Correct wind directions
         self.replicate_corrected = self._correct_measurement()
 
-        # self.showplots()
-
     def get_corrected_measurement(self):
         return self.replicate_corrected
 
@@ -90,16 +91,17 @@ def _calc_shift_correlations(self):
             abs_diff = float(abs_diff)
             offsets_df.loc[len(offsets_df)] = [offset, abs_diff]
 
-            print(f"#{counter}  {offset} {abs_diff}")
-            fig = plt.figure()
-            r.plot(x_compat=True, label="true measurement")
-            m_shifted.plot(x_compat=True, label=f"corrected by offset")
-            plt.title(f"OFFSET: {offset}  /  SUM_ABS_DIFF: {abs_diff}")
-            plt.legend(loc='upper right')
-            path = rf"C:\Users\nopan\Desktop\temp\{counter}.png"
-            fig.tight_layout()
-            # fig.show()
-            fig.savefig(path)
+            print(f"#{counter}   trying with offset: {offset:.{self.n_digits_after_comma}f}   "
+                  f"found absolute difference: {abs_diff}")
+            # fig = plt.figure()
+            # r.plot(x_compat=True, label="true measurement")
+            # m_shifted.plot(x_compat=True, label=f"corrected by offset")
+            # plt.title(f"OFFSET: {offset}  /  SUM_ABS_DIFF: {abs_diff}")
+            # plt.legend(loc='upper right')
+            # path = rf"C:\Users\nopan\Desktop\temp\{counter}.png"
+            # fig.tight_layout()
+            # # fig.show()
+            # fig.savefig(path)
 
         offsets_df = offsets_df.sort_values(by='ABS_DIFF', ascending=True)
         min_ix = offsets_df['ABS_DIFF'] == offsets_df['ABS_DIFF'].min()

diff --git a/diive/pkgs/corrections/winddiroffset.py b/diive/pkgs/corrections/winddiroffset.py
@@ -126,7 +126,7 @@ def showplots(self):
 
     def _correct_degrees(self, s: Series):
         """Correct degree values that go below zero or above 360"""
-        _locs_above360 = s > 360
+        _locs_above360 = s >= 360
         s[_locs_above360] -= 360
         _locs_belowzero = s < 0
         s[_locs_belowzero] += 360

diff --git a/diive/pkgs/fluxprocessingchain/fluxprocessingchain.py b/diive/pkgs/fluxprocessingchain/fluxprocessingchain.py
@@ -164,10 +164,9 @@ def showplot_gapfilled_cumulative(self, gain: float = 1, units: str = "", per_ye
         else:
             df = gfvars[gapfilled_vars].copy()
             df = df.multiply(gain)
-            series_units = r'($\mathrm{gC\ m^{-2}}$)'
             Cumulative(
                 df=df,
-                units=series_units,
+                units=units,
                 start_year=start_year,
                 end_year=end_year).plot()