Merge pull request #233 from holukas/indev
Indev
holukas authored Oct 23, 2024
2 parents d6e0481 + 273eecb commit 3b3a48e
Showing 19 changed files with 8,504 additions and 72 deletions.
34 changes: 28 additions & 6 deletions CHANGELOG.md
@@ -2,6 +2,29 @@

![DIIVE](images/logo_diive1_256px.png)

## v0.83.1 | 23 Oct 2024

### Changes

- When inferring the frequency from the time deltas between records, the inferred frequency is now accepted if the
most frequent timedelta was found for more than 50% of records, down from the previous 90% threshold
(`diive.core.times.times.timestamp_infer_freq_from_timedelta`)
- Storage terms are now gap-filled using a rolling median in an expanding time window
(`FluxStorageCorrectionSinglePointEddyPro._gapfill_storage_term`)

### Notebooks

- Added notebook example for using the flux processing chain for CH4 flux from a subcanopy eddy covariance station (
`notebooks/Workbench/CH-DAS_2023_FluxProcessingChain/FluxProcessingChain_NEE_CH-DAS_2023.ipynb`)

### Bugfixes

- Fixed info in the storage term correction report to account for cases where more storage terms than flux records
are available (`FluxStorageCorrectionSinglePointEddyPro.report`)

### Tests

- 50/50 unittests ran successfully

## v0.83.0 | 4 Oct 2024

## MDS gap-filling
@@ -33,12 +56,11 @@ gap-fill meteorological data) will follow.
### Changes

- **Storage correction**: By default, values missing in the storage term are now filled with a rolling mean in an
expanding time window. Testing showed that the (single point) storage term is missing for between 2-3% of the data,
which I think is reason enough to make filling these gaps the default option. Previously, it was optional to fill the
gaps using random forest; however, results were not great since only the timestamp info was used as model features.
Plots generated during Level-3.1 were also updated, now better showing the storage terms (gap-filled and
non-gap-filled) and the flag indicating filled values
(`diive.pkgs.fluxprocessingchain.level31_storagecorrection.FluxStorageCorrectionSinglePointEddyPro`)
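
A minimal usage sketch, adapted from the `example()` function in `level31_storagecorrection.py`
(see the diff of that file below); the example-data loader ships with diive:

```python
# Usage sketch adapted from example() in level31_storagecorrection.py.
from diive.configs.exampledata import load_exampledata_EDDYPRO_FLUXNET_CSV_30MIN
from diive.pkgs.fluxprocessingchain.level31_storagecorrection import (
    FluxStorageCorrectionSinglePointEddyPro,
)

df, _ = load_exampledata_EDDYPRO_FLUXNET_CSV_30MIN()  # EddyPro FLUXNET output, 30-min data
s = FluxStorageCorrectionSinglePointEddyPro(df=df, fluxcol='FC')  # CO2 flux column
s.storage_correction()  # storage-term gaps are filled by default
s.report()  # reports original vs. gap-filled storage terms
df = s.addresults()  # merge corrected flux and flags back into the data
```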

### Notebooks
11 changes: 9 additions & 2 deletions diive/core/times/times.py
@@ -823,13 +823,20 @@ def timestamp_infer_freq_from_timedelta(timestamp_ix: pd.DatetimeIndex) -> tuple
most_frequent_delta] # Number of occurrences for most frequent delta
most_frequent_delta_perc = most_frequent_delta_counts / n_rows # Fraction
# Check whether the most frequent delta appears in >50% of all data rows
if most_frequent_delta_perc > 0.90:
if most_frequent_delta_perc > 0.50:
inferred_freq = to_offset(most_frequent_delta)
inferred_freq = inferred_freq.freqstr
# inferred_freq = timedelta_to_string(most_frequent_delta)
freqinfo = '>90% occurrence'
freqinfo = f'{most_frequent_delta_perc * 100:.0f}% occurrence'
# most_frequent_delta = pd.to_timedelta(most_frequent_delta)
return inferred_freq, freqinfo
# if most_frequent_delta_perc > 0.90:
# inferred_freq = to_offset(most_frequent_delta)
# inferred_freq = inferred_freq.freqstr
# # inferred_freq = timedelta_to_string(most_frequent_delta)
# freqinfo = '>90% occurrence'
# # most_frequent_delta = pd.to_timedelta(most_frequent_delta)
# return inferred_freq, freqinfo
else:
freqinfo = '-failed-'
return inferred_freq, freqinfo
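
A self-contained sketch of the new acceptance rule (the helper name and toy index below are
illustrative, not part of diive):

```python
# Sketch of the >50% acceptance rule in timestamp_infer_freq_from_timedelta();
# the function name here is hypothetical.
import pandas as pd
from pandas.tseries.frequencies import to_offset

def infer_freq_sketch(timestamp_ix: pd.DatetimeIndex) -> tuple:
    deltas = pd.Series(timestamp_ix).diff().dropna()  # timedeltas between records
    counts = deltas.value_counts()  # sorted by number of occurrences, descending
    most_frequent_delta = counts.index[0]
    most_frequent_delta_perc = counts.iloc[0] / len(timestamp_ix)
    if most_frequent_delta_perc > 0.50:  # accept if >50% of records share this delta
        freqinfo = f'{most_frequent_delta_perc * 100:.0f}% occurrence'
        return to_offset(most_frequent_delta).freqstr, freqinfo
    return None, '-failed-'

ix = pd.date_range('2024-01-01', periods=10, freq='30min')
print(infer_freq_sketch(ix))  # ('30min', '90% occurrence'); older pandas may print '30T'
```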
3 changes: 2 additions & 1 deletion diive/pkgs/flux/uncertainty.py
@@ -438,7 +438,8 @@ def example():

# Restrict data for testing
from diive.core.dfun.frames import df_between_two_dates
data_df = df_between_two_dates(df=data_df, start_date='2022-06-01', end_date='2022-12-03').copy()
data_df = df_between_two_dates(df=data_df, start_date='2022-06-01', end_date='2022-07-01').copy()
# data_df = df_between_two_dates(df=data_df, start_date='2022-06-01', end_date='2022-12-03').copy()

# Subset
subset = data_df[['NEE_CUT_REF_orig', 'NEE_CUT_REF_f', 'NEE_CUT_16_f', 'NEE_CUT_84_f',
27 changes: 17 additions & 10 deletions diive/pkgs/fluxprocessingchain/fluxprocessingchain.py
@@ -925,14 +925,21 @@ def example():
# Source data
from pathlib import Path
from diive.core.io.files import load_parquet
SOURCEDIR = r"F:\Sync\luhk_work\20 - CODING\29 - WORKBENCH\dataset_cha_fp2024_2005-2023\40_FLUXES_L1_IRGA+QCL+LGR_mergeData"
FILENAME = r"41.1_CH-CHA_IRGA_LGR+QCL_Level-1_eddypro_fluxnet_2005-2023_meteo7.parquet"
FILEPATH = Path(SOURCEDIR) / FILENAME
maindf = load_parquet(filepath=FILEPATH)
locs = (maindf.index.year >= 2019) & (maindf.index.year <= 2023)
# SOURCEDIR = r"F:\Sync\luhk_work\20 - CODING\29 - WORKBENCH\dataset_cha_fp2024_2005-2023\50_FluxProcessingChain"
# FILENAME = r"51.1_FluxProcessingChain_after-L3.3_NEE.parquet"
# FILEPATH = Path(SOURCEDIR) / FILENAME
# maindf = load_parquet(filepath=FILEPATH)
SOURCEDIRS = [r"F:\Sync\luhk_work\20 - CODING\29 - WORKBENCH\dataset_das_2023"]
ep = LoadEddyProOutputFiles(sourcedir=SOURCEDIRS, filetype='EDDYPRO-FLUXNET-CSV-30MIN')
ep.searchfiles()
ep.loadfiles()
maindf = ep.maindf
metadata = ep.metadata

# locs = (maindf.index.year >= 2019) & (maindf.index.year <= 2023)
# locs = (maindf.index.year >= 2021) & (maindf.index.year <= 2023)
maindf = maindf.loc[locs, :].copy()
metadata = None
# maindf = maindf.loc[locs, :].copy()
# metadata = None
# print(maindf)

# import matplotlib.pyplot as plt
@@ -956,8 +963,9 @@ def example():
# plt.show()

# Flux processing chain settings
FLUXVAR = "LE"
# FLUXVAR = "H"
FLUXVAR = "FC"
# FLUXVAR = "FC"
SITE_LAT = 47.210227
SITE_LON = 8.410645
UTC_OFFSET = 1
@@ -1037,9 +1045,8 @@ def example():
fpc.level2_quality_flag_expansion(**LEVEL2_SETTINGS)
fpc.finalize_level2()
# fpc.level2_qcf.showplot_qcf_heatmaps()
fpc.level2_qcf.report_qcf_evolution()
# fpc.level2_qcf.report_qcf_evolution()
# fpc.level2_qcf.analyze_highest_quality_flux()

# fpc.level2_qcf.report_qcf_flags()
# fpc.level2.results
# fpc.fpc_df
58 changes: 34 additions & 24 deletions diive/pkgs/fluxprocessingchain/level31_storagecorrection.py
@@ -57,28 +57,39 @@ def report(self):
print(f"The original, non-gapfilled storage term was available for "
f"{n_storageterm} records ({self.strgcol}).")

n_missing = n_flux - n_storageterm
print(f"This means that the storage term {self.strgcol} is missing for "
f"{n_missing} measured flux ({self.fluxcol}) records.")
print(f"Without gap-filling the storage term {self.strgcol}, "
f"{n_missing} measured flux records ({self.fluxcol}) are lost.")
# Generate temporary subset where all flux values are available and check for missing storage
_subset = pd.concat([self.results[self.fluxcol], self.results[self.strgcol]], axis=1)
_subset = _subset.dropna(subset=[self.fluxcol])
n_orig_missing_strg = _subset[self.strgcol].isnull().sum()
print(f"The original storage term {self.strgcol} was missing for {n_orig_missing_strg} "
f"flux records.")
print(f"Without gap-filling the storage term ({self.strgcol}), "
f"{n_orig_missing_strg} measured flux records ({self.fluxcol}) are lost.")

if (n_storageterm > n_flux) & (n_orig_missing_strg > 0):
print(f"NOTE: There were more values available for storage term {self.strgcol} "
f"than for flux {self.fluxcol}.\n"
f"However, for {n_orig_missing_strg} flux records "
f"no concurrent storage terms were available.")

if self.gapfilled_strgcol:
print(f"\nFor this run, gap-filling of {self.strgcol} was * SELECTED *.")

locs_fluxmissing = self.results[self.fluxcol].isnull()
fluxavailable = self.results[~locs_fluxmissing].copy()
locs_isfilled = fluxavailable[self.flag_isgapfilled] == 1
locs_isorig = fluxavailable[self.flag_isgapfilled] == 0
n_isfilled = len(fluxavailable[locs_isfilled]) # Filled storage terms for available fluxes
n_isorig = len(fluxavailable[locs_isorig]) # Original (non-gap-filled) storage terms for available fluxes
print(f"After gap-filling the storage term, it was available for an additional "
f"{n_isfilled} records ({self.gapfilled_strgcol}).")

perc1 = (n_storageterm / n_flux) * 100
perc2 = (n_missing / n_flux) * 100
perc1 = (n_isorig / n_flux) * 100
perc2 = (n_orig_missing_strg / n_flux) * 100
n_flux_corrected = self.results[self.flux_corrected_col].dropna().count()
print(f"\nIn the storage-corrected flux {self.flux_corrected_col} with {n_flux_corrected} records, "
f"\n - {perc1:.1f}% ({n_storageterm} records) of used storage terms come from originally calculated data ({self.strgcol})"
f"\n - {perc2:.1f}% ({n_missing} records) of used storage terms come from gap-filled data ({self.gapfilled_strgcol})")
f"\n - {perc2:.1f}% ({n_orig_missing_strg} records) of used storage terms come from gap-filled data ({self.gapfilled_strgcol})")

filledstats = sstats(fluxavailable[locs_isfilled][self.gapfilled_strgcol])
print(f"\nStats for gap-filled storage terms:"
@@ -99,7 +110,7 @@ def _gapfill_storage_term(self) -> DataFrame:
"""

# New columns
self.gapfilled_strgcol = f"{self.strgcol}_gfRMEAN{self.idstr}"
self.gapfilled_strgcol = f"{self.strgcol}_gfRMED{self.idstr}"
self.flag_isgapfilled = f"FLAG_{self.gapfilled_strgcol}_ISFILLED"

# Generate temporary subset where all flux values are available and check for missing storage
@@ -114,13 +125,13 @@
window_size = 0
while n_still_missing_strg > 0:
window_size = 3 if window_size == 0 else window_size + 2
rmean = self.results[self.strgcol].rolling(window=window_size, center=True, min_periods=3).mean()
rmedian = self.results[self.strgcol].rolling(window=window_size, center=True, min_periods=3).median()
locs = gapfilled_df[self.gapfilled_strgcol].isnull()
gapfilled_df.loc[locs, self.gapfilled_strgcol] = rmean
gapfilled_df.loc[locs, self.gapfilled_strgcol] = rmedian
gapfilled_df.loc[locs, self.flag_isgapfilled] = 1
n_still_missing_strg = gapfilled_df[self.gapfilled_strgcol].isnull().sum()
print(f"Gap-filling storage-term {self.strgcol} "
f"with rolling mean (window size = {window_size} records) ...")
f"with rolling median (window size = {window_size} records) ...")
print(f"Missing values for storage term {self.gapfilled_strgcol}: {n_still_missing_strg}")

gapfilled_df = gapfilled_df[[self.gapfilled_strgcol, self.flag_isgapfilled]].copy()
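
For clarity, a self-contained sketch of the expanding-window logic above (function name and
example series are illustrative only):

```python
# Sketch of the gap-filling loop: fill gaps with a centered rolling median,
# growing the window by 2 records per pass until no gaps remain.
# Assumes the series has enough valid values for the window to reach every gap.
import numpy as np
import pandas as pd

def gapfill_expanding_rolling_median(series: pd.Series) -> pd.Series:
    filled = series.copy()
    window_size = 0
    while filled.isnull().sum() > 0:
        window_size = 3 if window_size == 0 else window_size + 2
        rmedian = series.rolling(window=window_size, center=True, min_periods=3).median()
        locs = filled.isnull()
        filled.loc[locs] = rmedian[locs]  # fill gaps found in this pass
    return filled

s = pd.Series([1.0, np.nan, 2.0, 3.0, np.nan, 4.0, 5.0, 6.0])
print(gapfill_expanding_rolling_median(s))  # gaps at positions 1 and 4 get filled
```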
@@ -229,18 +240,17 @@ def showplot(self, maxflux: float = None):


def example():
# Load data from pickle (much faster loading)
from diive.configs.exampledata import load_exampledata_EDDYPRO_FLUXNET_CSV_30MIN
df, _ = load_exampledata_EDDYPRO_FLUXNET_CSV_30MIN()
s = FluxStorageCorrectionSinglePointEddyPro(df=df, fluxcol='FC')
s.storage_correction()
# s.showplot(maxflux=20)
# print(s.storage)
s.report()

df = s.addresults()

# [print(c) for c in df.columns if "TAU" in c]
pass
# # Load data from pickle (much faster loading)
# from diive.configs.exampledata import load_exampledata_EDDYPRO_FLUXNET_CSV_30MIN
# df, _ = load_exampledata_EDDYPRO_FLUXNET_CSV_30MIN()
# s = FluxStorageCorrectionSinglePointEddyPro(df=df, fluxcol='FC')
# s.storage_correction()
# # s.showplot(maxflux=20)
# # print(s.storage)
# s.report()
# # df = s.addresults()
# # [print(c) for c in df.columns if "TAU" in c]


if __name__ == '__main__':
32 changes: 32 additions & 0 deletions docs/source/diive.core.plotting.rst
@@ -28,6 +28,14 @@ diive.core.plotting.cumulative module
:undoc-members:
:show-inheritance:

diive.core.plotting.dielcycle module
------------------------------------

.. automodule:: diive.core.plotting.dielcycle
:members:
:undoc-members:
:show-inheritance:

diive.core.plotting.fitplot module
----------------------------------

@@ -36,6 +44,14 @@ diive.core.plotting.fitplot module
:undoc-members:
:show-inheritance:

diive.core.plotting.heatmap\_base module
----------------------------------------

.. automodule:: diive.core.plotting.heatmap_base
:members:
:undoc-members:
:show-inheritance:

diive.core.plotting.heatmap\_datetime module
--------------------------------------------

@@ -52,6 +68,22 @@ diive.core.plotting.heatmap\_xyz module
:undoc-members:
:show-inheritance:

diive.core.plotting.histogram module
------------------------------------

.. automodule:: diive.core.plotting.histogram
:members:
:undoc-members:
:show-inheritance:

diive.core.plotting.outlier\_dtnt module
----------------------------------------

.. automodule:: diive.core.plotting.outlier_dtnt
:members:
:undoc-members:
:show-inheritance:

diive.core.plotting.plotfuncs module
------------------------------------

16 changes: 0 additions & 16 deletions docs/source/diive.core.utils.rst
@@ -4,22 +4,6 @@ diive.core.utils package
Submodules
----------

diive.core.utils.dirs module
----------------------------

.. automodule:: diive.core.utils.dirs
:members:
:undoc-members:
:show-inheritance:

diive.core.utils.insert module
------------------------------

.. automodule:: diive.core.utils.insert
:members:
:undoc-members:
:show-inheritance:

diive.core.utils.prints module
------------------------------

8 changes: 8 additions & 0 deletions docs/source/diive.pkgs.corrections.rst
@@ -4,6 +4,14 @@ diive.pkgs.corrections package
Submodules
----------

diive.pkgs.corrections.measurementoffset module
-----------------------------------------------

.. automodule:: diive.pkgs.corrections.measurementoffset
:members:
:undoc-members:
:show-inheritance:

diive.pkgs.corrections.offsetcorrection module
----------------------------------------------

24 changes: 24 additions & 0 deletions docs/source/diive.pkgs.createvar.rst
@@ -12,6 +12,22 @@ diive.pkgs.createvar.daynightflag module
:undoc-members:
:show-inheritance:

diive.pkgs.createvar.events module
----------------------------------

.. automodule:: diive.pkgs.createvar.events
:members:
:undoc-members:
:show-inheritance:

diive.pkgs.createvar.noise module
---------------------------------

.. automodule:: diive.pkgs.createvar.noise
:members:
:undoc-members:
:show-inheritance:

diive.pkgs.createvar.potentialradiation module
----------------------------------------------

@@ -20,6 +36,14 @@ diive.pkgs.createvar.potentialradiation module
:undoc-members:
:show-inheritance:

diive.pkgs.createvar.timesince module
-------------------------------------

.. automodule:: diive.pkgs.createvar.timesince
:members:
:undoc-members:
:show-inheritance:

diive.pkgs.createvar.vpd module
-------------------------------

12 changes: 6 additions & 6 deletions docs/source/diive.pkgs.flux.rst
@@ -36,18 +36,18 @@ diive.pkgs.flux.criticalheatdays module
:undoc-members:
:show-inheritance:

diive.pkgs.flux.uncertainty module
----------------------------------
diive.pkgs.flux.hqflux module
-----------------------------

.. automodule:: diive.pkgs.flux.uncertainty
.. automodule:: diive.pkgs.flux.hqflux
:members:
:undoc-members:
:show-inheritance:

diive.pkgs.flux.ustar\_detection module
---------------------------------------
diive.pkgs.flux.uncertainty module
----------------------------------

.. automodule:: diive.pkgs.flux.ustar_detection
.. automodule:: diive.pkgs.flux.uncertainty
:members:
:undoc-members:
:show-inheritance: