Merge pull request #54 from holukas/bugfixes
v0.70.0
holukas authored Feb 28, 2024
2 parents fbea42b + e9a2fec commit f1976fa
Showing 11 changed files with 1,353 additions and 2,336 deletions.
35 changes: 35 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,41 @@

![DIIVE](images/logo_diive1_256px.png)

## v0.70.0 | 28 Feb 2024


### New features

- In `StepwiseOutlierDetection`, it is now possible to re-run an outlier detection method. A re-run
  would produce a flag with the same name as the first (original) run, so an integer is added to the
  flag name. For example, when the z-score daytime/nighttime test is run the first time, it produces
  the flag `FLAG_TA_T1_2_1_OUTLIER_ZSCOREDTNT_TEST`. When the test is run again (e.g., with different
  settings), the flag of this second run is named `FLAG_TA_T1_2_1_OUTLIER_ZSCOREDTNT_2_TEST`, and so on.
  The script checks whether a flag of the same name was already created and, if so, adds an integer
  to the flag name. These re-runs are available in addition to the `repeat=True` keyword.
  (`diive.pkgs.outlierdetection.stepwiseoutlierdetection.StepwiseOutlierDetection.addflag`)
Example:
    - `METHOD` with `SETTINGS` is applied with `repeat=True` and is therefore repeated until no more
      outliers are found with these settings. The name of the flag produced is `TEST_METHOD_FLAG`.
    - Next, `METHOD` is applied again with `repeat=True`, but this time with different `SETTINGS`. As
      before, the test is repeated until no more outliers are found with the new settings. The name of
      the flag produced is `TEST_METHOD_2_FLAG`.
- `METHOD` can be re-run any number of times, each time producing a new
flag: `TEST_METHOD_3_FLAG`, `TEST_METHOD_4_FLAG`, ...
- Added new function to format timestamps to FLUXNET ISO
format (`YYYYMMDDhhmm`) (`diive.core.times.times.format_timestamp_to_fluxnet_format`)
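The flag-naming scheme described above can be sketched in a few lines. Note this is a hypothetical standalone helper for illustration, not `diive` code; the `_TEST` suffix convention follows the changelog text:

```python
def next_flag_name(flag_name: str, existing: set) -> str:
    """Return a unique flag name by inserting a run counter before '_TEST'.

    If the name already exists (the method was re-run), '_2_TEST', '_3_TEST',
    etc. are tried until an unused name is found.
    """
    new_name = flag_name
    rerun = 1
    while new_name in existing:
        rerun += 1
        new_name = flag_name.replace('_TEST', f'_{rerun}_TEST')
    return new_name


existing = {'FLAG_TA_T1_2_1_OUTLIER_ZSCOREDTNT_TEST'}
print(next_flag_name('FLAG_TA_T1_2_1_OUTLIER_ZSCOREDTNT_TEST', existing))
# FLAG_TA_T1_2_1_OUTLIER_ZSCOREDTNT_2_TEST
```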

### Bugfixes

- Refactored and fixed class to reformat data for FLUXNET
upload (`diive.pkgs.formats.fluxnet.FormatEddyProFluxnetFileForUpload`)
- Fixed `None` error when reading data files (`diive.core.io.filereader.DataFileReader._parse_file`)

### Notebooks

- Updated notebook `FormatEddyProFluxnetFileForUpload.ipynb`

## v0.69.0 | 23 Feb 2024

### New features
6 changes: 4 additions & 2 deletions README.md
@@ -30,7 +30,8 @@ More notebooks are added constantly.

### Create variable

- Calculate daytime flag, nighttime flag and potential radiation from latitude and longitude ([notebook example](notebooks/CalculateVariable/Daytime_and_nighttime_flag.ipynb))
- Calculate daytime flag, nighttime flag and potential radiation from latitude and
longitude ([notebook example](notebooks/CalculateVariable/Daytime_and_nighttime_flag.ipynb))
- Day/night flag from sun angle
- VPD from air temperature and RH ([notebook example](notebooks/CalculateVariable/Calculate_VPD_from_TA_and_RH.ipynb))

@@ -123,7 +124,8 @@ One way to install and use `diive` with a specific Python version on a local mac
`conda create --name diive-env python=3.9.7`
- Activate the new environment: `conda activate diive-env`
- Install `diive` version directly from source code:
`pip install https://github.com/holukas/diive/archive/refs/tags/v0.63.1.tar.gz`
`pip install https://github.com/holukas/diive/archive/refs/tags/v0.63.1.tar.gz` (select .tar.gz file of the desired
version)
- If you want to use `diive` in Jupyter notebooks, you can install Jupyterlab.
In this example Jupyterlab is installed from the `conda` distribution channel `conda-forge`:
`conda install -c conda-forge jupyterlab`
47 changes: 35 additions & 12 deletions diive/core/io/filereader.py
@@ -492,8 +492,7 @@ def _configure_timestamp_parsing(self):
parse_dates = {_temp_parsed_index_col: parse_dates}
# date_parser = lambda x: dt.datetime.strptime(x, self.timestamp_datetime_format)
# date_parser = lambda x: pd.to_datetime(x, format=self.timestamp_datetime_format, errors='coerce')
return parse_dates

return parse_dates, parsed_index_col, _temp_parsed_index_col

def _parse_file(self, headercols_list):
"""Parse data file without header"""
@@ -502,10 +501,8 @@
parsed_index_col = None
_temp_parsed_index_col = None


if self.timestamp_idx_col:
parse_dates = self._configure_timestamp_parsing()

parse_dates, parsed_index_col, _temp_parsed_index_col = self._configure_timestamp_parsing()

data_df = pd.read_csv(
self.filepath,
@@ -577,15 +574,41 @@ def example_icosfile():


def example_ep_fluxnet():
FOLDER = r"Z:\CH-FRU_Fruebuel\20_ec_fluxes\2023\Level-0"
from diive.core.times.times import insert_timestamp, format_timestamp_to_fluxnet_format

FOLDER = r"L:\Sync\luhk_work\CURRENT\fru_prep"
OUTDIR = r"L:\Sync\luhk_work\CURRENT\fru_prep"

filepaths = search_files(FOLDER, "*.csv")
filepaths = [fp for fp in filepaths
if fp.stem.startswith("eddypro_")
and "_fluxnet_" in fp.stem
and fp.stem.endswith("_adv")]
# filepaths = [fp for fp in filepaths
# if fp.stem.startswith("eddypro_")
# and "_fluxnet_" in fp.stem
# and fp.stem.endswith("_adv")]
print(filepaths)

loaddatafile = MultiDataFileReader(filetype='EDDYPRO_FLUXNET_30MIN', filepaths=filepaths)
df = loaddatafile.data_df

# # Store original column order
# orig_col_order = df.columns

# Set all missing values to -9999 as required by FLUXNET
df = df.fillna(-9999)

# Add timestamp column TIMESTAMP_END
df = insert_timestamp(data=df, convention='end', insert_as_first_col=True, verbose=True)
# Add timestamp column TIMESTAMP_START
df = insert_timestamp(data=df, convention='start', insert_as_first_col=True, verbose=True)

print("\nAdjusting timestamp formats of TIMESTAMP_START and TIMESTAMP_END to %Y%m%d%H%M ...")
df['TIMESTAMP_END'] = format_timestamp_to_fluxnet_format(df=df, timestamp_col='TIMESTAMP_END')
df['TIMESTAMP_START'] = format_timestamp_to_fluxnet_format(df=df, timestamp_col='TIMESTAMP_START')

# # Restore original column order
# df = df[orig_col_order].copy()

outpath = Path(OUTDIR) / 'merged.csv'
df.to_csv(outpath, index=False)


def example_toa5():
@@ -617,6 +640,6 @@ def example_toa5():


if __name__ == '__main__':
# example_ep_fluxnet()
example_ep_fluxnet()
# example_icosfile()
example_toa5()
# example_toa5()
10 changes: 10 additions & 0 deletions diive/core/times/times.py
@@ -9,6 +9,16 @@
from pandas.tseries.frequencies import to_offset


def format_timestamp_to_fluxnet_format(df: DataFrame, timestamp_col: str) -> Series:
"""Apply FLUXNET timestamp format (YYYYMMDDhhmm) to timestamp columns (not index).
Timestamp must be available as data column.
"""
print(f"\nFormatting timestamp column {timestamp_col} to %Y%m%d%H%M ...")
timestamp = df[timestamp_col].dt.strftime('%Y%m%d%H%M')
return timestamp


def detect_freq_groups(index: DatetimeIndex) -> Series:
"""
Analyze timestamp for records where the time resolution is absolutely certain
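The new `format_timestamp_to_fluxnet_format` function boils down to a pandas `strftime` call. A minimal standalone sketch (column name and data are illustrative, not taken from the commit):

```python
import pandas as pd

# Illustrative half-hourly timestamps stored as a data column (not the index)
df = pd.DataFrame({
    'TIMESTAMP_END': pd.to_datetime(['2024-02-28 00:30', '2024-02-28 01:00'])
})

# Apply the FLUXNET timestamp format YYYYMMDDhhmm
df['TIMESTAMP_END'] = df['TIMESTAMP_END'].dt.strftime('%Y%m%d%H%M')
print(df['TIMESTAMP_END'].tolist())  # ['202402280030', '202402280100']
```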
24 changes: 15 additions & 9 deletions diive/pkgs/fluxprocessingchain/level2_qualityflags.py
@@ -14,28 +14,34 @@


class FluxQualityFlagsEddyPro:
"""
XXX
"""

def __init__(self,
dfin: DataFrame,
units: dict,
fluxcol: str,
basevar: str,
filetype: Literal['EDDYPRO_FLUXNET_30MIN', 'EDDYPRO_FULL_OUTPUT_30MIN'],
idstr: str = None):
idstr: str = None,
units: dict = None):
"""
Create QCF (quality-control flag) for selected flags, calculated
from EddyPro's _fluxnet_ output files
from EddyPro's _fluxnet_ or _full_output_ results files.
Args:
dfin: Dataframe containing data from EddyPro's _fluxnet_ file
fluxcol: Name of the flux variable in *df*
idstr: Suffix added to output variable names
dfin: Dataframe containing EddyPro flux calculation results.
fluxcol: Name of the flux variable in *dfin*.
idstr: Suffix added to output variable names.
filetype: Filetype of the input file.
basevar: Name of the variable that was used to calculate the flux, e.g. 'CO2_CONC' for CO2 flux.
units: Dictionary of columns names and their units, only needed
when *filetype='EDDYPRO_FULL_OUTPUT_30MIN'*.
"""
self.fluxcol = fluxcol
self.dfin = dfin.copy()

if not units and filetype == 'EDDYPRO_FULL_OUTPUT_30MIN':
raise Exception("ERROR: No units found. Units are needed when working "
"with filetype EDDYPRO_FULL_OUTPUT_30MIN.")

self.units = units
self.idstr = validate_id_string(idstr=idstr)
self.basevar = basevar
99 changes: 50 additions & 49 deletions diive/pkgs/formats/fluxnet.py
@@ -1,14 +1,15 @@
import re
from pathlib import Path

import numpy as np
from pandas import DataFrame

from diive.core.io.files import loadfiles
from diive.core.times.times import current_date_str_condensed
from diive.core.times.times import format_timestamp_to_fluxnet_format
from diive.core.times.times import insert_timestamp
from diive.pkgs.fluxprocessingchain.level2_qualityflags import FluxQualityFlagsEddyPro
from diive.pkgs.outlierdetection.manualremoval import ManualRemoval
from diive.pkgs.qaqc.qcf import FlagQCF
from diive.pkgs.qaqc.eddyproflags import flag_signal_strength_eddypro_test

# Names of variables in the EddyPro _fluxnet_ output file
VARS_CO2 = ['FC', 'FC_SSITC_TEST', 'SC_SINGLE', 'CO2']
@@ -153,18 +154,39 @@ def remove_erroneous_data(self, var: str, remove_dates: list, showplot: bool):
elif isinstance(d, list):
print(f" REMOVING data for {var} time range between {d} (dates are inclusive)")
series = self.merged_df[var].copy()
mr = ManualRemoval(series=series)
mr._calc(remove_dates=remove_dates, showplot=showplot)
self._merged_df[var] = mr.filteredseries.copy()
print(" Done.")
n_vals_before = series.dropna().count()
flagtest = ManualRemoval(series=series, remove_dates=remove_dates,
showplot=showplot, verbose=True)
flagtest.calc(repeat=False)
flag = flagtest.get_flag()

# Locations where flag is > 0
reject = flag > 0

# Remove rejected series values from series (i.e., set to missing values)
series.loc[reject] = np.nan

# Insert filtered series in dataset
self._merged_df[var] = series.copy()

# Info number of rejected values
n_vals_after = self.merged_df[var].dropna().count()
n_rejected = reject.sum()
print(f"Manual removal rejected {n_rejected} values of {var}, all rejected "
f"value were removed from the dataset.")
print(f"\nAvailable values of {var} before removing fluxes: {n_vals_before}")
print(f"Available values of {var} after removing fluxes: {n_vals_after}")

def apply_fluxnet_format(self):
self._subset_fluxnet = self._make_subset(df=self.merged_df)
self._subset_fluxnet = self._missing_values(df=self._subset_fluxnet)
self._subset_fluxnet = self._rename_to_variable_codes(df=self._subset_fluxnet)
self._subset_fluxnet = self._rename_add_suffix(df=self._subset_fluxnet)
self._subset_fluxnet = self._insert_timestamp_columns(df=self._subset_fluxnet)
self._subset_fluxnet = self._adjust_timestamp_formats(df=self._subset_fluxnet)
self._subset_fluxnet['TIMESTAMP_END'] = \
format_timestamp_to_fluxnet_format(df=self._subset_fluxnet, timestamp_col='TIMESTAMP_END')
self._subset_fluxnet['TIMESTAMP_START'] = \
format_timestamp_to_fluxnet_format(df=self._subset_fluxnet, timestamp_col='TIMESTAMP_START')

def export_yearly_files(self):
"""Create one file per year"""
@@ -192,14 +214,6 @@ def _missing_values(df: DataFrame):
print("\nSetting all missing values to -9999 ...")
return df.fillna(-9999)
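As a quick illustration of the FLUXNET missing-value convention applied above (a minimal sketch with made-up data, not `diive` code):

```python
import numpy as np
import pandas as pd

# FLUXNET uploads require missing values to be encoded as -9999
df = pd.DataFrame({'FC': [1.5, np.nan, 2.0]})
df = df.fillna(-9999)
print(df['FC'].tolist())  # [1.5, -9999.0, 2.0]
```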

@staticmethod
def _adjust_timestamp_formats(df: DataFrame):
"""Apply FLUXNET timestamp format (YYYYMMDDhhmm) to timestamp columns (not index)"""
print("\nAdjusting timestamp formats of TIMESTAMP_START and TIMESTAMP_END to %Y%m%d%H%M ...")
df['TIMESTAMP_END'] = df['TIMESTAMP_END'].dt.strftime('%Y%m%d%H%M')
df['TIMESTAMP_START'] = df['TIMESTAMP_START'].dt.strftime('%Y%m%d%H%M')
return df

@staticmethod
def _insert_timestamp_columns(df: DataFrame):
"""Insert timestamp columns denoting start and end of averaging interval"""
@@ -278,39 +292,26 @@ def remove_low_signal_data(self,
levelid = 'L2' # ID to identify newly created columns
df = self.merged_df.copy()
keepcols = df.columns.copy() # Original columns in df, used to keep only original variable names

n_vals_before = df[fluxcol].dropna().count()

# Perform quality tests
# Here, only the signal strength test is specifically needed, but the creation
# of the overall quality flag QCF also requires the missing values test
print(f"\nPerforming quality checks ...\n")
fluxqc = FluxQualityFlagsEddyPro(fluxcol=fluxcol, dfin=df, idstr=levelid)
fluxqc.missing_vals_test()
fluxqc.signal_strength_test(signal_strength_col=signal_strength_col,
method=method,
threshold=threshold)
df = fluxqc.addflags() # Dataframe with flag columns added

# Calculate overall quality flag QCF
print(f"\nGenerating overall quality flag QCF ...")
qcf = FlagQCF(series=df[f'{fluxcol}'], df=df, idstr=levelid, swinpot=df['SW_IN_POT'], nighttime_threshold=50)
qcf.calculate(daytime_accept_qcf_below=2, nighttimetime_accept_qcf_below=2)
qcf.showplot_qcf_heatmaps(maxabsval=50)
qcf.report_qcf_evolution()
# qcf.report_qcf_flags()
qcf.report_qcf_series()
# qcf.showplot_qcf_timeseries()
df = qcf.get()

# Overwrite the original flux data with the quality-controlled flux data
fluxcolqcf = f'{fluxcol}_L2_QCF' # Name of the quality-controlled flux variable, with bad values removed
print(f"\nReplacing values of original flux variable {fluxcol} "
f"with quality-controlled {fluxcolqcf} (bad values removed) data ... ")
df[fluxcol] = df[fluxcolqcf].copy()
print(f"\nPerforming signal strength / AGC quality check ...\n")
flag = flag_signal_strength_eddypro_test(df=df,
signal_strength_col=signal_strength_col,
var_col=fluxcol,
method=method,
threshold=threshold,
idstr=levelid)
# Locations where flag is > 0
reject = flag > 0

n_vals_after = df[fluxcol].dropna().count()
# Remove rejected fluxcol values from dataset (i.e., set to missing values)
df.loc[reject, fluxcol] = np.nan

# Info number of rejected values
n_vals_after = df[fluxcol].dropna().count()
n_rejected = reject.sum()
print(f"{signal_strength_col} rejected {n_rejected} values of {fluxcol}, all rejected "
f"value were removed from the dataset.")
print(f"\nAvailable values of {fluxcol} before removing low signal fluxes: {n_vals_before}")
print(f"Available values of {fluxcol} after removing low signal fluxes: {n_vals_after}")

@@ -325,8 +326,8 @@ def example():
# data_df, metadata_df = load_exampledata_eddypro_fluxnet_CSV_30MIN()

# Setup
SOURCE = r"F:\Sync\luhk_work\CURRENT\fru\Level-1_results_fluxnet\0-eddypro_fluxnet_files"
OUTDIR = r"F:\Sync\luhk_work\CURRENT\fru\Level-1_results_fluxnet\1-formatted_for_upload"
SOURCE = r"F:\Sync\luhk_work\CURRENT\fru\Level-1_results_fluxnet_2023\0-eddypro_fluxnet_files" # This is the folder where datafiles are searched
OUTDIR = r"F:\Sync\luhk_work\CURRENT\fru\Level-1_results_fluxnet_2023\1-formatted_for_upload" # Output yearly CSV to this folder

# Imports
import importlib.metadata
@@ -364,9 +365,9 @@ def example():
# Remove problematic time periods
fxn.remove_erroneous_data(var='FC',
remove_dates=[
'2005-11-01 23:58:15',
['2005-11-05 00:00:15', '2005-12-07 14:15:00'],
['2005-06-01', '2005-08-15']
'2023-11-01 23:58:15',
['2023-11-05 00:00:15', '2023-12-07 14:15:00'],
['2023-06-01', '2023-08-15']
],
showplot=True)

15 changes: 14 additions & 1 deletion diive/pkgs/outlierdetection/stepwiseoutlierdetection.py
@@ -239,7 +239,20 @@ def addflag(self):
self._flags[flag.name] = flag.copy()
print(f"++Added flag column {flag.name} to flag data")
else:
pass # todo check(?)
# It is possible to re-run an outlier method, which produces a flag
# with the same name as for the first (original) run. In this case
# an integer is added to the flag name. For example, if the test
# z-score daytime/nighttime is run the first time, it produces the flag with the name
# FLAG_TA_T1_2_1_OUTLIER_ZSCOREDTNT_TEST. When the test is run again
# (e.g. with different settings) then the name of the flag of this second
# run is FLAG_TA_T1_2_1_OUTLIER_ZSCOREDTNT_2_TEST, etc ...
new_flagname = flag.name
rerun = 1
while new_flagname in self.flags.columns:
rerun += 1
new_flagname = flag.name.replace('_TEST', f'_{rerun}_TEST')
self._flags[new_flagname] = flag.copy()
print(f"++Added flag column {new_flagname} to flag data")

# # Name of filtered series in last results is the same as the original name
# self._series_hires_cleaned = self.last_flag[self.series_hires_orig.name]
