Finish series_id loader #672

Open
isaacmg wants to merge 61 commits into master from series_id_finish_loader

61 commits
9963a0b
re-add file and stuff
isaacmg Nov 7, 2022
6208bec
Revert "re-add file and stuff"
isaacmg Nov 7, 2022
9a5807a
test and re-add
isaacmg Nov 7, 2022
7a97ba0
add stuff
isaacmg Nov 8, 2022
3e243e5
fixing stuff 2
isaacmg Nov 8, 2022
9ff1cc5
a
isaacmg Nov 14, 2022
e1980c0
f
isaacmg Nov 14, 2022
f4872c5
changing config 1uration file
isaacmg Nov 14, 2022
30ba665
remove duplicates 1
isaacmg Nov 15, 2022
5603396
fixing solar small and linter 3
isaacmg Nov 15, 2022
c704f99
fixing stupid shit 3
isaacmg Nov 15, 2022
cc52dfa
reset forcibly index
isaacmg Nov 15, 2022
328d8c3
f-u pandas index shit
isaacmg Nov 15, 2022
2146c2e
STOP THE FUCKING REDIN3DEX BULLSHIT PANDAS
isaacmg Nov 15, 2022
673b067
R lint
isaacmg Nov 15, 2022
d70e713
b as 2
isaacmg Nov 15, 2022
cb5948c
fuck little copying and pasting FUUUUUUUUU
isaacmg Nov 15, 2022
4447c87
Revert "fuck little copying and pasting FUUUUUUUUU"
isaacmg Nov 15, 2022
35c8dc1
DEEPCOPY 1
isaacmg Nov 15, 2022
cbefc32
Revert "DEEPCOPY 1"
isaacmg Nov 15, 2022
47e81f8
a
isaacmg Nov 15, 2022
1cacf07
equalize length and shit 2
isaacmg Nov 15, 2022
cf5800a
raise a value error 1
isaacmg Nov 15, 2022
52e6c73
Merge branch 'master' into series_id_finish_loader
isaacmg Dec 21, 2022
bd1a3ed
Merge branch 'master' into series_id_finish_loader
isaacmg Jul 4, 2023
1ca7a8b
chang 23 3
isaacmg Jul 8, 2023
a99841b
ra
isaacmg Jul 8, 2023
ceff4df
see if the mixed format works
isaacmg Jul 10, 2023
dc24c8f
Revert "see if the mixed format works"
isaacmg Jul 10, 2023
ef028c4
fixing 2
isaacmg Jul 10, 2023
bbd3e53
fixing 2
isaacmg Jul 13, 2023
9b752fb
d
isaacmg Jul 14, 2023
28dd911
2
isaacmg Jul 14, 2023
9ad4986
more shit
isaacmg Jul 14, 2023
ab14eb0
r
isaacmg Jul 14, 2023
725195a
move 2 9
isaacmg Jul 14, 2023
37de122
series id more loader
isaacmg Jul 16, 2023
e5f465c
fixing to correct shape
isaacmg Jul 18, 2023
ac73d6d
remove weird shit
isaacmg Jul 19, 2023
8307c94
fixing code 1 2
isaacmg Jul 19, 2023
82b0230
r
isaacmg Jul 20, 2023
f35f482
e
isaacmg Jul 21, 2023
b644997
r r
isaacmg Jul 21, 2023
590b720
fixing the transformer loss
isaacmg Jul 23, 2023
2cb1999
Revert "fixing the transformer loss"
isaacmg Jul 23, 2023
57e3555
fixing computing of the loss
isaacmg Jul 23, 2023
a33b221
Revert "fixing computing of the loss"
isaacmg Jul 23, 2023
e7bd68f
fixing stuff
isaacmg Jul 23, 2023
ed61ca6
fixing code to run
isaacmg Jul 24, 2023
acbf448
adding fixes to path + increment
isaacmg Jul 24, 2023
3968720
Revert "adding fixes to path + increment"
isaacmg Jul 24, 2023
13f1a65
fixing stuff
isaacmg Jul 24, 2023
3e782f4
remove print debugging 3 4 5
isaacmg Jul 24, 2023
6dea9bd
fixing the tests more 2
isaacmg Jul 24, 2023
ae1955d
fixing the unit tests
isaacmg Jul 25, 2023
a2f9d5c
fixing 4
isaacmg Jul 25, 2023
90b5710
fixing the code 4
isaacmg Aug 1, 2023
5d45451
FUCK OFF LINTER FUCK LINTER
isaacmg Aug 1, 2023
ca4ef37
fixing kind of works here
isaacmg Aug 1, 2023
c3931b4
fixing r
isaacmg Aug 1, 2023
b38c43d
r
isaacmg Aug 2, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .circleci/config.yml
@@ -252,8 +252,8 @@ jobs:
name: Trainer tests
when: always
command: |
coverage run flood_forecast/trainer.py -p tests/gru_vanilla.json
echo -e 'GRU Vanilla test'
coverage run flood_forecast/trainer.py -p tests/gru_vanilla.json
coverage run flood_forecast/trainer.py -p tests/classification_test.json
coverage run flood_forecast/trainer.py -p tests/test_inf_single.json
echo -e 'test informer single target'
@@ -324,6 +324,7 @@ jobs:
name: Trainer1 tests
when: always
command: |
coverage run flood_forecast/trainer.py -p tests/transformer_b_series.json
coverage run flood_forecast/trainer.py -p tests/cross_former.json
coverage run flood_forecast/trainer.py -p tests/nlinear.json
coverage run flood_forecast/trainer.py -p tests/dsanet_3.json
2 changes: 1 addition & 1 deletion .flake8
@@ -1,4 +1,4 @@
[flake8]
max_line_length=121
ignore=E305,W504,E126,E401
ignore=E305,W504,E126,E401,E721
max-complexity=19
10 changes: 9 additions & 1 deletion flood_forecast/basic/base_line_methods.py
@@ -5,7 +5,7 @@ class NaiveBase(torch.nn.Module):
"""
A very simple baseline model that returns
the fixed value based on the input sequence.
No learning used at all a
No learning used at all.
"""

def __init__(self, seq_length: int, n_time_series: int, output_seq_len=1, metric: str = "last"):
@@ -19,6 +19,14 @@ def __init__(self, seq_length: int, n_time_series: int, output_seq_len=1, metric
self.metric_function = self.metric_dict[metric]

def forward(self, x: torch.Tensor) -> torch.Tensor:
"""_summary_

Args:
x (torch.Tensor): _description_

Returns:
torch.Tensor: _description_
"""
return self.metric_function(x, self.output_seq_len)


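For context, here is a minimal sketch of how the NaiveBase baseline above might be exercised. The tensor shapes and metric name are assumptions inferred from the constructor signature in this diff, not a verbatim test from the repository:

import torch
from flood_forecast.basic.base_line_methods import NaiveBase

# A batch of 4 sequences with 20 historical steps and 3 features;
# "last" simply repeats the final observed value as the forecast.
model = NaiveBase(seq_length=20, n_time_series=3, output_seq_len=5, metric="last")
x = torch.randn(4, 20, 3)
pred = model(x)  # a fixed-value baseline forecast; no learning involved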
4 changes: 2 additions & 2 deletions flood_forecast/evaluator.py
@@ -204,7 +204,7 @@ def infer_on_torch_model(
) -> Tuple[pd.DataFrame, torch.Tensor, int, int, CSVTestLoader, List[pd.DataFrame]]:
"""
Function to handle both test evaluation and inference on a test data-frame.
:param model: The time series model present
:param model: The time series model present in the model zoo
:param test_csv_path: The path to the test data-frame
:return:
df: df including training and test data
@@ -339,7 +339,7 @@ def handle_ci_multi(prediction_samples: torch.Tensor, csv_test_loader: CSVTestLo
:type df_pred: [type]
:param decoder_param: [description]
:type decoder_param: bool
:param history_length: [description]
:param history_length: The number of historical time-steps
:type history_length: int
:param num_samples: The number of samples to generate (i.e. larger ci)
:type num_samples: int
2 changes: 1 addition & 1 deletion flood_forecast/pre_dict.py
@@ -1,7 +1,7 @@
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
from flood_forecast.preprocessing.interpolate_preprocess import (interpolate_missing_values,
back_forward_generic, forward_back_generic)

# SAMMY IS TOO LITTLE TO BE REAL DOG
scaler_dict = {
"StandardScaler": StandardScaler,
"RobustScaler": RobustScaler,
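The scaler_dict above lets configuration files refer to sklearn scalers by name. A minimal sketch of the lookup pattern, assuming a config that names one of the keys shown in this diff:

from flood_forecast.pre_dict import scaler_dict

# A config string such as "StandardScaler" resolves to the sklearn class,
# which is then instantiated and fit on the scaled columns.
scaler_class = scaler_dict["StandardScaler"]
scaler = scaler_class()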
4 changes: 3 additions & 1 deletion flood_forecast/preprocessing/process_usgs.py
@@ -7,7 +7,9 @@


def make_usgs_data(start_date: datetime, end_date: datetime, site_number: str) -> pd.DataFrame:
""""""
"""

"""
base_url = "https://nwis.waterdata.usgs.gov/usa/nwis/uv/?cb_00060=on&cb_00065&format=rdb&"
full_url = base_url + "site_no=" + site_number + "&period=&begin_date=" + \
start_date.strftime("%Y-%m-%d") + "&end_date=" + end_date.strftime("%Y-%m-%d")
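A quick sketch of calling make_usgs_data as defined above; the site number is a hypothetical placeholder:

from datetime import datetime
from flood_forecast.preprocessing.process_usgs import make_usgs_data

# Fetch about one month of discharge/gage-height readings for a hypothetical site.
df = make_usgs_data(datetime(2022, 1, 1), datetime(2022, 2, 1), "01646500")
print(df.head())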
51 changes: 42 additions & 9 deletions flood_forecast/preprocessing/pytorch_loaders.py
@@ -7,6 +7,7 @@
from flood_forecast.preprocessing.buil_dataset import get_data
from datetime import datetime
from flood_forecast.preprocessing.temporal_feats import feature_fix
from copy import deepcopy


class CSVDataLoader(Dataset):
@@ -71,6 +72,8 @@ def __init__(
self.scale = None
if scaled_cols is None:
scaled_cols = relevant_cols
print("scaled cols are")
print(scaled_cols)
if start_stamp != 0 and end_stamp is not None:
self.df = self.df[start_stamp:end_stamp]
elif start_stamp != 0:
@@ -168,21 +171,44 @@ def __init__(self, series_id_col: str, main_params: dict, return_method: str, re
:param return_all: Whether to return all items, defaults to True
:type return_all: bool, optional
"""
main_params["relevant_cols"].append(series_id_col)
super().__init__(**main_params)
main_params1 = deepcopy(main_params)
if "scaled_cols" not in main_params1:
main_params1["scaled_cols"] = main_params1["relevant_cols"].copy()
print("The scaled cols are below")
print(main_params1["scaled_cols"])
main_params1["relevant_cols"].append(series_id_col)
super().__init__(**main_params1)
self.series_id_col = series_id_col
self.return_method = return_method
self.return_all_series = return_all
self.unique_cols = self.original_df[series_id_col].dropna().unique().tolist()
df_list = []
self.df = self.df.reset_index()
self.unique_dict = {}
print("The series id column is below:")
print(self.series_id_col)
for col in self.unique_cols:
df_list.append(self.df[self.df[self.series_id_col] == col])
new_df = self.df[self.df[self.series_id_col] == col]
df_list.append(new_df)
print(new_df.columns)
self.listed_vals = df_list
self.__make_unique_dict__()
self.__validate_data__in_df()
print(self.unique_dict)
print("unique dict")

def __validate_data__in_df(self):
"""Makes sure the data in the data-frame is the proper length for each series e
"""
if self.return_all_series:
len_first = len(self.listed_vals[0])
print("Length of first series is:" + str(len_first))
for series in self.listed_vals:
print("Length of first series is:" + str(len(series)))
series_bool = len(series) == len_first
if not series_bool:
raise IndexError("The length of sub-series data-frames are not equal.")

def __make_unique_dict__(self):
for i in range(0, len(self.unique_cols)):
self.unique_dict[self.unique_cols[i]] = i
@@ -198,12 +224,13 @@ def __getitem__(self, idx: int) -> Tuple[Dict, Dict]:
if self.return_all_series:
src_list = {}
targ_list = {}
print(self.unique_cols)
for va in self.listed_vals:
t = torch.Tensor(va.iloc[idx: self.forecast_history + idx].values)[:, :len(self.relevant_cols3) - 1]
# We need to exclude the index column on one end and the series id column on the other
t = torch.Tensor(va.iloc[idx: self.forecast_history + idx].values)[:, 1:-1]
print(t.shape)
targ_start_idx = idx + self.forecast_history
idx2 = va[self.series_id_col].iloc[0]
targ = torch.Tensor(va.iloc[targ_start_idx: targ_start_idx + self.forecast_length].to_numpy())
targ = torch.Tensor(va.iloc[targ_start_idx: targ_start_idx + self.forecast_length].to_numpy())[:, 1:-1]
src_list[self.unique_dict[idx2]] = t
targ_list[self.unique_dict[idx2]] = targ
return src_list, targ_list
@@ -214,6 +241,12 @@ def __getitem__(self, idx: int) -> Tuple[Dict, Dict]:
def __sample_series_id__(idx, series_id):
pass

def __len__(self) -> int:
if self.return_all_series:
return len(self.listed_vals[0]) - self.forecast_history - self.forecast_length - 1
else:
raise NotImplementedError("Current code only supports returning all the series at each iteration")


class CSVTestLoader(CSVDataLoader):
def __init__(
@@ -334,7 +367,7 @@ def __init__(

:param file_path: The path to the file
:type file_path: str
:param relevant_cols: d
:param relevant_cols: The relevant columns
:type relevant_cols: List
:param scaling: [description], defaults to None
:type scaling: [type], optional
@@ -488,7 +521,7 @@ def __len__(self) -> int:


class TemporalTestLoader(CSVTestLoader):
def __init__(self, time_feats, kwargs={}, decoder_step_len=None):
def __init__(self, time_feats: List[str], kwargs={}, decoder_step_len=None):
"""A test data-loader class for data in the format of the TemporalLoader.

:param time_feats: The temporal features to use in encoding.
@@ -567,7 +600,7 @@ def __init__(self, series_marker_column: str, csv_loader_params: Dict, pad_lengt
self.grouped_df = self.df.groupby(series_marker_column)
self.n_classes = n_classes

def get_item_forecast(self, idx):
def get_item_forecast(self, idx: int):
pass

def get_item_classification(self, idx: int):
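To make the new loader's contract concrete, a minimal usage sketch follows. The file path, column names, return_method string, and parameter values are assumptions for illustration; main_params mirrors the CSVDataLoader keyword arguments referenced earlier in this file:

from flood_forecast.preprocessing.pytorch_loaders import CSVSeriesIDLoader

main_params = {
    "file_path": "data/multi_series.csv",  # hypothetical CSV with several series stacked
    "forecast_history": 20,
    "forecast_length": 5,
    "relevant_cols": ["precip", "temp", "flow"],
    "target_col": ["flow"],
    "scaling": "StandardScaler",
}
# "station_id" is a hypothetical series id column; return_all defaults to True.
loader = CSVSeriesIDLoader("station_id", main_params, return_method="all_series")
src_dict, targ_dict = loader[0]
# Both dicts are keyed by the integer index assigned to each unique series id;
# each value holds that series' history window or forecast window as a tensor.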
89 changes: 48 additions & 41 deletions flood_forecast/pytorch_training.py
@@ -11,6 +11,7 @@
from flood_forecast.basic.linear_regression import simple_decode
from flood_forecast.training_utils import EarlyStopper
from flood_forecast.custom.custom_opt import GaussianLoss, MASELoss
from flood_forecast.series_id_helper import handle_csv_id_output, handle_csv_id_validation
from torch.nn import CrossEntropyLoss


@@ -356,7 +357,7 @@ def torch_single_train(model: PyTorchForecast,
:type model: PyTorchForecast
:param opt: The optimizer to use in the code
:type opt: optim.Optimizer
:param criterion: [description]
:param criterion: The loss function (or list of loss functions) used for training
:type criterion: Type[torch.nn.modules.loss._Loss]
:param data_loader: [description]
:type data_loader: DataLoader
@@ -405,50 +406,50 @@
trg = trg[0]
trg[:, -pred_len:, :] = torch.zeros_like(trg[:, -pred_len:, :].long()).float().to(model.device)
# Assign to avoid other if statement
elif "SeriesIDLoader" == model.params["dataset_params"]["class"]:
pass
src = src.to(model.device)
trg = trg.to(model.device)
output = model.model(src, **forward_params)
if hasattr(model.model, "pred_len"):
multi_targets = mulit_targets_copy
pred_len = model.model.pred_len
output = output[:, :, 0:multi_targets]
labels = trg[:, -pred_len:, 0:multi_targets]
multi_targets = False
if model.params["dataset_params"]["class"] == "GeneralClassificationLoader":
labels = trg
elif multi_targets == 1:
labels = trg[:, :, 0]
elif multi_targets > 1:
labels = trg[:, :, 0:multi_targets]
if probablistic:
output1 = output
output = output.mean
output_std = output1.stddev
if type(criterion) == list:
loss = multi_crit(criterion, output, labels, None)
if "SeriesIDLoader" == model.params["dataset_params"]["class"]:
running_loss += handle_csv_id_output(src, trg, model, criterion, opt, False, multi_targets)
i += 1
else:
loss = compute_loss(labels, output, src, criterion, None, probablistic, output_std, m=multi_targets)
if loss > 100:
print("Warning: high loss detected")
loss.backward()
opt.step()
if torch.isnan(loss) or loss == float('inf'):
raise ValueError("Error infinite or NaN loss detected. Try normalizing data or performing interpolation")
running_loss += loss.item()
i += 1
src = src.to(model.device)
trg = trg.to(model.device)
output = model.model(src, **forward_params)
if hasattr(model.model, "pred_len"):
multi_targets = mulit_targets_copy
pred_len = model.model.pred_len
output = output[:, :, 0:multi_targets]
labels = trg[:, -pred_len:, 0:multi_targets]
multi_targets = False
if model.params["dataset_params"]["class"] == "GeneralClassificationLoader":
labels = trg
elif model.params["dataset_params"]["class"] == "CSVSeriesIDLoader":
labels = trg
elif multi_targets == 1:
labels = trg[:, :, 0]
elif multi_targets > 1:
labels = trg[:, :, 0:multi_targets]
if probablistic:
output1 = output
output = output.mean
output_std = output1.stddev
if type(criterion) == list:
loss = multi_crit(criterion, output, labels, None)
else:
loss = compute_loss(labels, output, src, criterion, None, probablistic, output_std, m=multi_targets)
if loss > 100:
print("Warning: high loss detected")
loss.backward()
opt.step()
if torch.isnan(loss) or loss == float('inf'):
raise ValueError("Error infinite or NaN loss detected. Try normalizing data or performing interpolation")
running_loss += loss.item()
i += 1
print("The running loss is: ")
print(running_loss)
print("The number of items in train is: " + str(i))
total_loss = running_loss / float(i)
return total_loss


def handle_crit_list():
pass


def compute_validation(validation_loader: DataLoader,
model,
epoch: int,
@@ -509,6 +510,10 @@ def compute_validation(validation_loader: DataLoader,
label_list = []
mod_output_list = []
for src, targ in validation_loader:
if validation_loader.dataset.__class__.__name__ == "CSVSeriesIDLoader":
scaled_crit = handle_csv_id_validation(src, targ, model, criterion, False, multi_targets)
unscaled_crit = {}
continue
src = src if isinstance(src, list) else src.to(device)
targ = targ if isinstance(targ, list) else targ.to(device)
# targ = targ if isinstance(targ, list) else targ.to(device)
@@ -591,12 +596,14 @@
print("Plotting test classification metrics")
label_list = torch.cat(label_list)
label_list = label_list[:, 0, :].detach().cpu()
mod_output1 = torch.cat(mod_output_list)[:, 0, :].detach().cpu().numpy()
mod_output1 = torch.cat(mod_output_list)[:, 0, :].detach().cpu()
d = torch.nn.Softmax(dim=1)
mod_output_final = d(mod_output1).numpy()
fin = label_list.max(dim=1)[1]
wandb.log({"roc_" + str(epoch): wandb.plot.roc_curve(fin, mod_output1, classes_to_plot=None, labels=None,
wandb.log({"roc_" + str(epoch): wandb.plot.roc_curve(fin, mod_output_final, classes_to_plot=None, labels=None,
title="roc_" + str(epoch))})
wandb.log({"pr": wandb.plot.pr_curve(fin, mod_output1)})
wandb.log({"conf_": wandb.plot.confusion_matrix(probs=mod_output1,
wandb.log({"pr": wandb.plot.pr_curve(fin, mod_output_final)})
wandb.log({"conf_": wandb.plot.confusion_matrix(probs=mod_output_final,
y_true=fin.detach().cpu().numpy(), class_names=None)})
model.train()
return list(scaled_crit.values())[0]
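The series_id_helper module itself is not part of this diff, so the exact behavior of handle_csv_id_output is not visible here. As a rough, purely illustrative sketch of what a per-series training step could look like, given that CSVSeriesIDLoader yields dicts of tensors keyed by series index (all of the below is an assumption, not the PR's implementation):

import torch

def handle_csv_id_output_sketch(src: dict, trg: dict, model, criterion, opt) -> float:
    """Hypothetical per-series training step: accumulate the loss across series, backprop once."""
    opt.zero_grad()
    total_loss = 0.0
    for series_idx, series_src in src.items():
        output = model.model(series_src)  # forward pass for one series
        total_loss = total_loss + criterion(output, trg[series_idx])
    total_loss.backward()
    opt.step()
    return total_loss.item()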