diff --git a/README.md b/README.md
index d627918..165a207 100644
--- a/README.md
+++ b/README.md
@@ -70,6 +70,7 @@ run_forecast(
     prediction_length=10,
     backtest_months=1,
     stride=10,
+    metric="smape",
     train_predict_ratio=2,
     data_quality_check=True,
     resample=False,
@@ -81,29 +82,30 @@ run_forecast(
 
 #### Parameters description:
 
-- ```train_data``` is a delta table name that stores the input dataset.
-- ```scoring_data``` is a delta table name that stores the [dynamic future regressors](https://nixtlaverse.nixtla.io/neuralforecast/examples/exogenous_variables.html#3-training-with-exogenous-variables). If not provided or if the same name as ```train_data``` is provided, the models will ignore the future dynamical regressors.
-- ```scoring_output``` is a delta table where you write your forecasting output. This table will be created if does not exist
-- ```evaluation_output``` is a delta table where you write the evalution results from all backtesting trials from all time series and all models. This table will be created if does not exist.
-- ```group_id``` is a column storing the unique id that groups your dataset to each time series.
-- ```date_col``` is your time column name.
-- ```target``` is your target column name.
-- ```freq``` is your prediction frequency. Currently, "D" for daily and "M" for monthly are supported. Note that ```freq``` supported is as per the model basis, hence check the model documentation carefully.
-- ```prediction_length``` is your forecasting horizon in the number of steps.
-- ```backtest_months``` specifies how many previous months you use for backtesting.
-- ```stride``` is the number of steps in which you update your backtesting trial start date when going from one trial to the next.
-- ```train_predict_ratio``` specifies the minimum length required for your training dataset with respect to ```prediction_length```. If ```train_predict_ratio```=2, you need to have training dataset that is at least twice as long as ```prediciton_length```.
-- ```data_quality_check``` checks the quality of the input data if set to True. See [data_quality_checks.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/mmf_sa/data_quality_checks.py) for the full details of the checks.
-- ```resample``` backfills skipped entries with 0 if set to True. Default is False.
-- ```active_models``` is a list of models you want to use.
-- ```experiment_path``` to keep metrics under the MLFlow.
-- ```use_case_name``` a new column will be created under the delta Table, in case you save multiple trials under 1 table.
+- ```train_data``` is a delta table name that stores the input dataset.
+- ```scoring_data``` is a delta table name that stores the [dynamic future regressors](https://nixtlaverse.nixtla.io/neuralforecast/examples/exogenous_variables.html#3-training-with-exogenous-variables). If not provided, or if the same name as ```train_data``` is provided, the models will ignore the future dynamical regressors.
+- ```scoring_output``` is a delta table where you write your forecasting output. This table will be created if it does not exist.
+- ```evaluation_output``` is a delta table where you write the evaluation results from all backtesting trials from all time series and all models. This table will be created if it does not exist.
+- ```group_id``` is a column storing the unique id that groups your dataset to each time series.
+- ```date_col``` is your time column name.
+- ```target``` is your target column name.
+- ```freq``` is your prediction frequency. Currently, "D" for daily and "M" for monthly are supported. Note that the supported ```freq``` values vary by model, so check the model documentation carefully.
+- ```prediction_length``` is your forecasting horizon in the number of steps.
+- ```backtest_months``` specifies how many previous months you use for backtesting.
+- ```stride``` is the number of steps by which you shift the backtesting trial start date when going from one trial to the next.
+- ```metric``` is the metric to log in the evaluation table and MLflow. Supported metrics are mape and smape. Default is smape.
+- ```train_predict_ratio``` specifies the minimum length required for your training dataset with respect to ```prediction_length```. If ```train_predict_ratio```=2, your training dataset must be at least twice as long as ```prediction_length```.
+- ```data_quality_check``` checks the quality of the input data if set to True (default False). See [data_quality_checks.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/mmf_sa/data_quality_checks.py) for the full details of the checks.
+- ```resample``` backfills skipped entries with 0 if set to True. Only relevant when data_quality_check is True. Default is False. If data_quality_check is True and resample is False, the check removes all time series with skipped dates.
+- ```active_models``` is a list of models you want to use.
+- ```experiment_path``` is the MLflow experiment path under which the metrics are logged.
+- ```use_case_name``` adds a column with this value to the output delta tables, in case you save multiple use cases to the same table.
 
 To modify the model hyperparameters, change the values in [mmf_sa/models/models_conf.yaml](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/mmf_sa/models/models_conf.yaml) or overwrite these values in [mmf_sa/forecasting_conf.yaml](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/mmf_sa/forecasting_conf.yaml).
 
-MMF is fully integrated with MLflow, so once the training kicks off, the experiments will be visible in the MLflow Tracking UI with the corresponding metrics and parameters (note that we do not log all local models in MLFlow but we store the binaries in the tables ```evaluation_output``` and ```scoring_output```). The metric you see in the MLflow Tracking UI is a simple mean over backtesting trials over all time series.
+MMF is fully integrated with MLflow, so once the training kicks off, the experiments will be visible in the MLflow Tracking UI with the corresponding metrics and parameters (note that we do not log all local models in MLflow, but we store the binaries in the tables ```evaluation_output``` and ```scoring_output```). The metric you see in the MLflow Tracking UI is a simple mean over backtesting trials across all time series.
 
-We encourage you to reading through [examples/local_univariate_daily.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/local_univariate_daily.py) notebook to better understand how local models can be applied to your time series using MMF. Other example notebooks for monthly forecasting and forecasting with exogenous regressors can be found in [examples/local_univariate_monthly.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/local_univariate_monthly.py) and [examples/local_univariate_external_regressors_daily.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/local_univariate_external_regressors_daily.py).
+We encourage you to read through the [examples/local_univariate_daily.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/local_univariate_daily.py) notebook to better understand how local models can be applied to your time series using MMF. Other example notebooks for monthly forecasting and forecasting with exogenous regressors can be found in [examples/local_univariate_monthly.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/local_univariate_monthly.py) and [examples/local_univariate_external_regressors_daily.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/local_univariate_external_regressors_daily.py).
 
 ### Global Models
 
@@ -157,6 +159,7 @@ run_forecast(
     prediction_length=10,
     backtest_months=1,
     stride=10,
+    metric="smape",
     train_predict_ratio=2,
     data_quality_check=True,
     resample=False,
@@ -179,7 +182,7 @@ To modify the model hyperparameters or reset the range of the hyperparameter sea
 
 MMF is fully integrated with MLflow and so once the training kicks off, the experiments will be visible in the MLflow Tracking UI with the corresponding metrics and parameters. Once the training is complete the models will be logged to MLFlow and registered to Unity Catalog.
 
-We encourage you to reading through [examples/global_daily.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/global_daily.py) notebook to better understand how global models can be applied to your time series using MMF. Other example notebooks for monthly forecasting and forecasting with exogenous regressors can be found in [examples/global_monthly.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/global_monthly.py) and [examples/global_external_regressors_daily.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/global_external_regressors_daily.py) respectively.
+We encourage you to read through the [examples/global_daily.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/global_daily.py) notebook to better understand how global models can be applied to your time series using MMF. Other example notebooks for monthly forecasting and forecasting with exogenous regressors can be found in [examples/global_monthly.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/global_monthly.py) and [examples/global_external_regressors_daily.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/global_external_regressors_daily.py) respectively.
 
 ### Foundation Models
 
@@ -221,7 +224,7 @@ To modify the model hyperparameters, change the values in [mmf_sa/models/models_
 
 MMF is fully integrated with MLflow and so once the training kicks off, the experiments will be visible in the MLflow Tracking UI with the corresponding metrics and parameters.
During the evaluation, the models are logged and registered to Unity Catalog. -We encourage you to reading through [examples/foundation_daily.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/foundation_daily.py) notebook to better understand how foundation models can be applied to your time series using MMF. An example notebook for monthly forecasting can be found in [examples/foundation_monthly.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/foundation_monthly.py). +We encourage you to read through [examples/foundation_daily.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/foundation_daily.py) notebook to better understand how foundation models can be applied to your time series using MMF. An example notebook for monthly forecasting can be found in [examples/foundation_monthly.py](https://github.com/databricks-industry-solutions/many-model-forecasting/blob/main/examples/foundation_monthly.py). #### Using Foundation Models on Databricks diff --git a/examples/local_univariate_daily.py b/examples/local_univariate_daily.py index c4c1688..07b48df 100644 --- a/examples/local_univariate_daily.py +++ b/examples/local_univariate_daily.py @@ -181,6 +181,7 @@ def transform_group(df): prediction_length=10, backtest_months=1, stride=10, + metric="smape", train_predict_ratio=1, data_quality_check=False, resample=False, diff --git a/examples/local_univariate_external_regressors_daily.py b/examples/local_univariate_external_regressors_daily.py index a3907a3..4e5b4a4 100644 --- a/examples/local_univariate_external_regressors_daily.py +++ b/examples/local_univariate_external_regressors_daily.py @@ -151,6 +151,7 @@ prediction_length=10, backtest_months=1, stride=10, + metric="smape", train_predict_ratio=1, active_models=active_models, data_quality_check=False, diff --git a/examples/local_univariate_monthly.py b/examples/local_univariate_monthly.py index dead4fb..556f8fe 100644 --- a/examples/local_univariate_monthly.py +++ b/examples/local_univariate_monthly.py @@ -179,6 +179,7 @@ def transform_group(df): prediction_length=3, backtest_months=12, stride=1, + metric="smape", train_predict_ratio=1, data_quality_check=False, resample=False, diff --git a/examples/m5-examples/data_preparation_m5.py b/examples/m5-examples/data_preparation_m5.py index 34f8c8a..c7d8560 100644 --- a/examples/m5-examples/data_preparation_m5.py +++ b/examples/m5-examples/data_preparation_m5.py @@ -53,14 +53,26 @@ random.seed(7) unique_ids = list(daily_train["unique_id"].unique()) + +unique_id_100 = sorted(random.sample(unique_ids, 100)) unique_id_1000 = sorted(random.sample(unique_ids, 1000)) unique_id_10000 = sorted(random.sample(unique_ids, 10000)) +daily_train_100 = daily_train[daily_train["unique_id"].isin(unique_id_100)] daily_train_1000 = daily_train[daily_train["unique_id"].isin(unique_id_1000)] daily_train_10000 = daily_train[daily_train["unique_id"].isin(unique_id_10000)] # COMMAND ---------- +( + spark.createDataFrame(daily_train_100) + .write.format("delta").mode("overwrite") + .saveAsTable(f"{catalog}.{db}.daily_train_100") +) +print(f"Saved data to {catalog}.{db}.daily_train_100") + +# COMMAND ---------- + ( spark.createDataFrame(daily_train_1000) .write.format("delta").mode("overwrite") @@ -76,11 +88,3 @@ .saveAsTable(f"{catalog}.{db}.daily_train_10000") ) print(f"Saved data to {catalog}.{db}.daily_train_10000") - -# COMMAND ---------- - -display(spark.sql(f"select * from 
{catalog}.{db}.daily_train_1000")) - -# COMMAND ---------- - - diff --git a/examples/m5-examples/foundation_daily_m5.py b/examples/m5-examples/foundation_daily_m5.py index b081a44..6d1c2a8 100644 --- a/examples/m5-examples/foundation_daily_m5.py +++ b/examples/m5-examples/foundation_daily_m5.py @@ -20,7 +20,7 @@ catalog = "mmf" # Name of the catalog we use to manage our assets db = "m5" # Name of the schema we use to manage our assets (e.g. datasets) -n = 1000 # Number of items: choose from [1000, 10000, 'full']. full is 35k +n = 1000 # Number of items: choose from [100, 1000, 10000, 'full']. full is 35k table = f"daily_train_{n}" # Training table name user_email = spark.sql('select current_user() as user').collect()[0]['user'] # User email diff --git a/examples/m5-examples/global_daily_m5.py b/examples/m5-examples/global_daily_m5.py index 197518f..bf7de03 100644 --- a/examples/m5-examples/global_daily_m5.py +++ b/examples/m5-examples/global_daily_m5.py @@ -21,7 +21,7 @@ catalog = "mmf" # Name of the catalog we use to manage our assets db = "m5" # Name of the schema we use to manage our assets (e.g. datasets) -n = 1000 # Number of items: choose from [1000, 10000, 'full']. full is 35k +n = 1000 # Number of items: choose from [100, 1000, 10000, 'full']. full is 35k table = f"daily_train_{n}" # Training table name user_email = spark.sql('select current_user() as user').collect()[0]['user'] # User email diff --git a/examples/m5-examples/local_univariate_daily_m5.py b/examples/m5-examples/local_univariate_daily_m5.py index d27c345..812a936 100644 --- a/examples/m5-examples/local_univariate_daily_m5.py +++ b/examples/m5-examples/local_univariate_daily_m5.py @@ -77,6 +77,7 @@ prediction_length=28, backtest_months=3, stride=7, + metric="smape", train_predict_ratio=1, data_quality_check=True, resample=False, diff --git a/examples/m5-examples/run_daily_m5.py b/examples/m5-examples/run_daily_m5.py index 5018903..6818244 100644 --- a/examples/m5-examples/run_daily_m5.py +++ b/examples/m5-examples/run_daily_m5.py @@ -41,6 +41,7 @@ prediction_length=28, backtest_months=3, stride=7, + metric="smape", train_predict_ratio=1, data_quality_check=True, resample=False, diff --git a/examples/run_daily.py b/examples/run_daily.py index 5581ad7..3ce64ea 100644 --- a/examples/run_daily.py +++ b/examples/run_daily.py @@ -37,6 +37,7 @@ prediction_length=10, backtest_months=1, stride=10, + metric="smape", train_predict_ratio=1, data_quality_check=True, resample=False, diff --git a/examples/run_external_regressors_daily.py b/examples/run_external_regressors_daily.py index ff4b3d6..5999613 100644 --- a/examples/run_external_regressors_daily.py +++ b/examples/run_external_regressors_daily.py @@ -38,6 +38,7 @@ prediction_length=10, backtest_months=1, stride=10, + metric="smape", train_predict_ratio=1, active_models=[model], data_quality_check=True, diff --git a/examples/run_monthly.py b/examples/run_monthly.py index 2119ab8..5da624d 100644 --- a/examples/run_monthly.py +++ b/examples/run_monthly.py @@ -37,6 +37,7 @@ prediction_length=3, backtest_months=12, stride=1, + metric="smape", train_predict_ratio=1, data_quality_check=True, resample=False, diff --git a/mmf_sa/Forecaster.py b/mmf_sa/Forecaster.py index 0dc8c75..99c0a7d 100644 --- a/mmf_sa/Forecaster.py +++ b/mmf_sa/Forecaster.py @@ -197,60 +197,60 @@ def evaluate_local_model(self, model_conf): self (Forecaster): A Forecaster object. model_conf (dict): A dictionary specifying the model configuration. 
""" - src_df = self.resolve_source("train_data") - src_df, removed = DataQualityChecks(src_df, self.conf, self.spark).run() + with mlflow.start_run(experiment_id=self.experiment_id): + src_df = self.resolve_source("train_data") + src_df, removed = DataQualityChecks(src_df, self.conf, self.spark).run() - # Specifying the output schema for Pandas UDF - output_schema = StructType( - [ - StructField( - self.conf["group_id"], src_df.schema[self.conf["group_id"]].dataType - ), - StructField("backtest_window_start_date", DateType()), - StructField("metric_name", StringType()), - StructField("metric_value", DoubleType()), - StructField("forecast", ArrayType(DoubleType())), - StructField("actual", ArrayType(DoubleType())), - StructField("model_pickle", BinaryType()), - ] - ) - - model = self.model_registry.get_model(model_conf["name"]) + # Specifying the output schema for Pandas UDF + output_schema = StructType( + [ + StructField( + self.conf["group_id"], src_df.schema[self.conf["group_id"]].dataType + ), + StructField("backtest_window_start_date", DateType()), + StructField("metric_name", StringType()), + StructField("metric_value", DoubleType()), + StructField("forecast", ArrayType(DoubleType())), + StructField("actual", ArrayType(DoubleType())), + StructField("model_pickle", BinaryType()), + ] + ) - # Use Pandas UDF to forecast individual group - evaluate_one_local_model_fn = functools.partial( - Forecaster.evaluate_one_local_model, model=model - ) - res_sdf = ( - src_df.groupby(self.conf["group_id"]) - .applyInPandas(evaluate_one_local_model_fn, schema=output_schema) - ) + model = self.model_registry.get_model(model_conf["name"]) - # Write evaluation result to a delta table - if self.conf.get("evaluation_output", None) is not None: - ( - res_sdf.withColumn(self.conf["group_id"], col(self.conf["group_id"]).cast(StringType())) - .withColumn("run_id", lit(self.run_id)) - .withColumn("run_date", lit(self.run_date)) - .withColumn("model", lit(model_conf["name"])) - .withColumn("use_case", lit(self.conf["use_case_name"])) - .withColumn("model_uri", lit("")) - .write.mode("append") - .saveAsTable(self.conf.get("evaluation_output")) + # Use Pandas UDF to forecast individual group + evaluate_one_local_model_fn = functools.partial( + Forecaster.evaluate_one_local_model, model=model + ) + res_sdf = ( + src_df.groupby(self.conf["group_id"]) + .applyInPandas(evaluate_one_local_model_fn, schema=output_schema) ) - # Compute aggregated metrics - res_df = ( - res_sdf.groupby(["metric_name"]) - .mean("metric_value") - .withColumnRenamed("avg(metric_value)", "metric_value") - .toPandas() - ) - # Print out aggregated metrics - print(res_df) + # Write evaluation result to a delta table + if self.conf.get("evaluation_output", None) is not None: + ( + res_sdf.withColumn(self.conf["group_id"], col(self.conf["group_id"]).cast(StringType())) + .withColumn("run_id", lit(self.run_id)) + .withColumn("run_date", lit(self.run_date)) + .withColumn("model", lit(model_conf["name"])) + .withColumn("use_case", lit(self.conf["use_case_name"])) + .withColumn("model_uri", lit("")) + .write.mode("append") + .saveAsTable(self.conf.get("evaluation_output")) + ) - # Log aggregated metrics to MLflow - with mlflow.start_run(experiment_id=self.experiment_id): + # Compute aggregated metrics + res_df = ( + res_sdf.groupby(["metric_name"]) + .mean("metric_value") + .withColumnRenamed("avg(metric_value)", "metric_value") + .toPandas() + ) + # Print out aggregated metrics + print(res_df) + + # Log aggregated metrics to MLflow for rec in 
res_df.values: metric_name, metric_value = rec mlflow.log_metric(metric_name, metric_value) @@ -276,9 +276,9 @@ def evaluate_one_local_model( ) group_id = pdf[model.params["group_id"]].iloc[0] try: - pdf = pdf.fillna(0.1) + pdf = pdf.fillna(0) # Fix here - pdf[model.params["target"]] = pdf[model.params["target"]].clip(0.1) + pdf[model.params["target"]] = pdf[model.params["target"]].clip(0) metrics_df = model.backtest(pdf, start=split_date, group_id=group_id) return metrics_df except Exception as err: diff --git a/mmf_sa/__init__.py b/mmf_sa/__init__.py index fec9b94..52438ee 100644 --- a/mmf_sa/__init__.py +++ b/mmf_sa/__init__.py @@ -22,7 +22,6 @@ def run_forecast( backtest_months: int, stride: int, metric: str = "smape", - resample: bool = False, scoring_data: Union[str, pd.DataFrame, DataFrame] = None, scoring_output: str = None, evaluation_output: str = None, @@ -35,7 +34,8 @@ def run_forecast( accelerator: str = "cpu", backtest_retrain: bool = None, train_predict_ratio: int = None, - data_quality_check: bool = None, + data_quality_check: bool = False, + resample: bool = False, experiment_path: str = None, run_id: str = None, conf: Union[str, Dict[str, Any], OmegaConf] = None, @@ -56,8 +56,7 @@ def run_forecast( prediction_length (int): An integer specifying the prediction length: i.e. forecasting horizon. backtest_months (int): An integer specifying the number of backtest months. stride (int): An integer specifying the stride length. - metric (str): A string specifying the metric to use for evaluation. Default is smape. - resample (bool): A boolean specifying whether to back-fill skipped entries with 0. Default is False. + metric (str): A string specifying the metric to use for evaluation. Supported metrics are mape and smape. Default is smape. scoring_data (Union[str, pd.DataFrame, DataFrame]): Scoring data as a string of delta table name, pandas DataFrame, or Spark DataFrame. scoring_output (str): A string specifying the output table name for scoring. evaluation_output (str): A string specifying the output table name for evaluation. @@ -70,7 +69,8 @@ def run_forecast( accelerator (str): A string specifying the accelerator to use: cpu or gpu. Default is cpu. backtest_retrain (bool): A boolean specifying whether to retrain the model during backtesting. Currently, not supported. train_predict_ratio (int): An integer specifying the train predict ratio. - data_quality_check (bool): A boolean specifying whether to check the data quality. + data_quality_check (bool): A boolean specifying whether to check the data quality. Default is False. + resample (bool): A boolean specifying whether to back-fill skipped entries with 0. Only relevant when data_quality_check is True. Default is False. experiment_path (str): A string specifying the experiment path. run_id (str): A string specifying the run id. If not provided a random string is generated and assigned to each run. conf (Union[str, Dict[str, Any], OmegaConf]): A configuration object. diff --git a/mmf_sa/models/abstract_model.py b/mmf_sa/models/abstract_model.py index 6b14153..0264e2b 100644 --- a/mmf_sa/models/abstract_model.py +++ b/mmf_sa/models/abstract_model.py @@ -133,13 +133,19 @@ def calculate_metrics( Returns: metrics (Dict[str, Union[str, float, bytes]]): A dictionary specifying the metrics. 
""" pred_df, model_fitted = self.predict(hist_df, val_df) - smape = mean_absolute_percentage_error( - val_df[self.params["target"]], - pred_df[self.params["target"]], - symmetric=True, - ) + if self.params["metric"] == "smape": - metric_value = smape + metric_value = mean_absolute_percentage_error( + val_df[self.params["target"]], + pred_df[self.params["target"]], + symmetric=True, + ) + elif self.params["metric"] == "mape": + metric_value = mean_absolute_percentage_error( + val_df[self.params["target"]], + pred_df[self.params["target"]], + symmetric=False, + ) else: raise Exception(f"Metric {self.params['metric']} not supported!") diff --git a/mmf_sa/models/chronosforecast/ChronosPipeline.py b/mmf_sa/models/chronosforecast/ChronosPipeline.py index 832f3f2..1b79df9 100644 --- a/mmf_sa/models/chronosforecast/ChronosPipeline.py +++ b/mmf_sa/models/chronosforecast/ChronosPipeline.py @@ -115,6 +115,8 @@ def calculate_metrics( metrics = [] if self.params["metric"] == "smape": metric_name = "smape" + elif self.params["metric"] == "mape": + metric_name = "mape" else: raise Exception(f"Metric {self.params['metric']} not supported!") for key in keys: @@ -123,14 +125,16 @@ def calculate_metrics( try: if metric_name == "smape": metric_value = mean_absolute_percentage_error(actual, forecast, symmetric=True) + elif metric_name == "mape": + metric_value = mean_absolute_percentage_error(actual, forecast, symmetric=False) metrics.extend( [( key, curr_date, metric_name, metric_value, - actual, forecast, + actual, b'', )]) except: diff --git a/mmf_sa/models/models_conf.yaml b/mmf_sa/models/models_conf.yaml index 33f48a1..6744024 100644 --- a/mmf_sa/models/models_conf.yaml +++ b/mmf_sa/models/models_conf.yaml @@ -191,7 +191,7 @@ models: model_class: NeuralFcRNN framework: NeuralForecast model_type: global - max_steps: 200 + max_steps: 500 input_size_factor: 2 loss: smape learning_rate: 0.001 @@ -209,7 +209,7 @@ models: model_class: NeuralFcLSTM framework: NeuralForecast model_type: global - max_steps: 200 + max_steps: 500 input_size_factor: 2 loss: smape learning_rate: 0.001 @@ -226,7 +226,7 @@ models: model_class: NeuralFcNBEATSx framework: NeuralForecast model_type: global - max_steps: 100 + max_steps: 500 input_size_factor: 2 loss: smape learning_rate: 0.001 @@ -240,7 +240,7 @@ models: model_class: NeuralFcNHITS framework: NeuralForecast model_type: global - max_steps: 200 + max_steps: 500 input_size_factor: 2 loss: smape learning_rate: 0.001 @@ -258,7 +258,7 @@ models: model_class: NeuralFcAutoRNN framework: NeuralForecast model_type: global - max_steps: 200 + max_steps: 500 num_samples: 20 loss: smape encoder_hidden_size: [ 50, 100, 200, 300 ] @@ -271,7 +271,7 @@ models: model_class: NeuralFcAutoLSTM framework: NeuralForecast model_type: global - max_steps: 200 + max_steps: 500 num_samples: 20 loss: smape encoder_hidden_size: [ 50, 100, 200, 300 ] @@ -284,7 +284,7 @@ models: model_class: NeuralFcAutoNBEATSx framework: NeuralForecast model_type: global - max_steps: 200 + max_steps: 500 num_samples: 20 loss: smape input_size: 4 @@ -295,7 +295,7 @@ models: model_class: NeuralFcAutoNHITS framework: NeuralForecast model_type: global - max_steps: 200 + max_steps: 500 num_samples: 20 loss: smape input_size: 4 @@ -308,7 +308,7 @@ models: model_class: NeuralFcAutoTiDE framework: NeuralForecast model_type: global - max_steps: 200 + max_steps: 500 num_samples: 20 loss: smape input_size: 4 @@ -326,7 +326,7 @@ models: model_class: NeuralFcAutoPatchTST framework: NeuralForecast model_type: global - max_steps: 200 + 
max_steps: 500 num_samples: 20 loss: smape input_size: 4 diff --git a/mmf_sa/models/moiraiforecast/MoiraiPipeline.py b/mmf_sa/models/moiraiforecast/MoiraiPipeline.py index 861444c..59860ee 100644 --- a/mmf_sa/models/moiraiforecast/MoiraiPipeline.py +++ b/mmf_sa/models/moiraiforecast/MoiraiPipeline.py @@ -115,6 +115,8 @@ def calculate_metrics( metrics = [] if self.params["metric"] == "smape": metric_name = "smape" + elif self.params["metric"] == "mape": + metric_name = "mape" else: raise Exception(f"Metric {self.params['metric']} not supported!") for key in keys: @@ -123,14 +125,16 @@ def calculate_metrics( try: if metric_name == "smape": metric_value = mean_absolute_percentage_error(actual, forecast, symmetric=True) + elif metric_name == "mape": + metric_value = mean_absolute_percentage_error(actual, forecast, symmetric=False) metrics.extend( [( key, curr_date, metric_name, metric_value, - actual, forecast, + actual, b'', )]) except: diff --git a/mmf_sa/models/momentforecast/MomentPipeline.py b/mmf_sa/models/momentforecast/MomentPipeline.py index b72f975..9ec2267 100644 --- a/mmf_sa/models/momentforecast/MomentPipeline.py +++ b/mmf_sa/models/momentforecast/MomentPipeline.py @@ -113,6 +113,8 @@ def calculate_metrics( metrics = [] if self.params["metric"] == "smape": metric_name = "smape" + elif self.params["metric"] == "mape": + metric_name = "mape" else: raise Exception(f"Metric {self.params['metric']} not supported!") for key in keys: @@ -121,14 +123,16 @@ def calculate_metrics( try: if metric_name == "smape": metric_value = mean_absolute_percentage_error(actual, forecast, symmetric=True) + elif metric_name == "mape": + metric_value = mean_absolute_percentage_error(actual, forecast, symmetric=False) metrics.extend( [( key, curr_date, metric_name, metric_value, - actual, forecast, + actual, b'', )]) except: diff --git a/mmf_sa/models/neuralforecast/NeuralForecastPipeline.py b/mmf_sa/models/neuralforecast/NeuralForecastPipeline.py index 38c061a..68037d2 100644 --- a/mmf_sa/models/neuralforecast/NeuralForecastPipeline.py +++ b/mmf_sa/models/neuralforecast/NeuralForecastPipeline.py @@ -49,7 +49,7 @@ def prepare_data(self, df: pd.DataFrame, future: bool = False) -> pd.DataFrame: ) else: # Prepare historical dataframe with or without exogenous regressors for training - df[self.params.target] = df[self.params.target].clip(0.1) + df[self.params.target] = df[self.params.target].clip(0) if 'dynamic_future' in self.params.keys(): try: _df = ( @@ -122,7 +122,7 @@ def predict(self, hist_df: pd.DataFrame, val_df: pd.DataFrame = None): target: self.params.target, } ) - forecast_df[self.params.target] = forecast_df[self.params.target].clip(0.01) + forecast_df[self.params.target] = forecast_df[self.params.target].clip(0) return forecast_df, self.model @@ -155,7 +155,7 @@ def forecast(self, df: pd.DataFrame, spark=None): target: self.params.target, } ) - forecast_df[self.params.target] = forecast_df[self.params.target].clip(0.01) + forecast_df[self.params.target] = forecast_df[self.params.target].clip(0) return forecast_df, self.model def calculate_metrics( @@ -166,6 +166,8 @@ def calculate_metrics( metrics = [] if self.params["metric"] == "smape": metric_name = "smape" + elif self.params["metric"] == "mape": + metric_name = "mape" else: raise Exception(f"Metric {self.params['metric']} not supported!") for key in keys: @@ -175,14 +177,17 @@ def calculate_metrics( try: if metric_name == "smape": metric_value = mean_absolute_percentage_error(actual, forecast, symmetric=True) + elif metric_name == "mape": + 
metric_value = mean_absolute_percentage_error(actual, forecast, symmetric=False) + metrics.extend( [( key, curr_date, metric_name, metric_value, - actual.to_numpy(), forecast.to_numpy(), + actual.to_numpy(), b'', )]) except: diff --git a/mmf_sa/models/sktime/SKTimeForecastingPipeline.py b/mmf_sa/models/sktime/SKTimeForecastingPipeline.py index 11b2bc0..fef00d6 100644 --- a/mmf_sa/models/sktime/SKTimeForecastingPipeline.py +++ b/mmf_sa/models/sktime/SKTimeForecastingPipeline.py @@ -32,8 +32,8 @@ def create_param_grid(self) -> Dict[str, Any]: return {} def prepare_data(self, df: pd.DataFrame) -> pd.DataFrame: - df = df.copy().fillna(0.1) - df[self.params.target] = df[self.params.target].clip(0.1) + df = df.copy().fillna(0) + df[self.params.target] = df[self.params.target].clip(0) date_idx = pd.date_range( start=df[self.params.date_col].min(), end=df[self.params.date_col].max(), @@ -75,7 +75,7 @@ def predict(self, hist_df: pd.DataFrame, val_df: pd.DataFrame = None): ) forecast_df = pd.DataFrame(data=[], index=date_idx).reset_index() forecast_df[self.params.target] = pred_df.y.values - forecast_df[self.params.target] = forecast_df[self.params.target].clip(0.01) + forecast_df[self.params.target] = forecast_df[self.params.target].clip(0) return forecast_df, self.model def forecast(self, x, spark=None): diff --git a/mmf_sa/models/statsforecast/StatsFcForecastingPipeline.py b/mmf_sa/models/statsforecast/StatsFcForecastingPipeline.py index 665ade0..64cdc90 100644 --- a/mmf_sa/models/statsforecast/StatsFcForecastingPipeline.py +++ b/mmf_sa/models/statsforecast/StatsFcForecastingPipeline.py @@ -31,7 +31,7 @@ def prepare_data(self, df: pd.DataFrame, future: bool = False) -> pd.DataFrame: if not future: # Prepare historical dataframe with/out exogenous regressors for training # Fix here - df[self.params.target] = df[self.params.target].clip(0.1) + df[self.params.target] = df[self.params.target].clip(0) if 'dynamic_future' in self.params.keys(): try: df_statsfc = ( @@ -97,7 +97,7 @@ def predict(self, hist_df: pd.DataFrame, val_df: pd.DataFrame = None): } ) # Fix here - forecast_df[self.params.target] = forecast_df[self.params.target].clip(0.01) + forecast_df[self.params.target] = forecast_df[self.params.target].clip(0) return forecast_df, self.model def forecast(self, df: pd.DataFrame, spark=None): @@ -134,7 +134,7 @@ def forecast(self, df: pd.DataFrame, spark=None): } ) # Fix here - forecast_df[self.params.target] = forecast_df[self.params.target].clip(0.01) + forecast_df[self.params.target] = forecast_df[self.params.target].clip(0) return forecast_df, self.model diff --git a/mmf_sa/models/timesfmforecast/TimesFMPipeline.py b/mmf_sa/models/timesfmforecast/TimesFMPipeline.py index 6f07b1c..0385f14 100644 --- a/mmf_sa/models/timesfmforecast/TimesFMPipeline.py +++ b/mmf_sa/models/timesfmforecast/TimesFMPipeline.py @@ -68,6 +68,8 @@ def calculate_metrics( metrics = [] if self.params["metric"] == "smape": metric_name = "smape" + elif self.params["metric"] == "mape": + metric_name = "mape" else: raise Exception(f"Metric {self.params['metric']} not supported!") for key in keys: @@ -76,14 +78,17 @@ def calculate_metrics( try: if metric_name == "smape": metric_value = mean_absolute_percentage_error(actual, forecast, symmetric=True) + elif metric_name == "mape": + metric_value = mean_absolute_percentage_error(actual, forecast, symmetric=False) + metrics.extend( [( key, curr_date, metric_name, metric_value, - actual, forecast, + actual, b'', )]) except:
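For reference, here is a minimal standalone sketch (not part of the diff above) of what the new ```metric``` option selects. It assumes sktime's ```mean_absolute_percentage_error```, i.e. the function whose ```symmetric``` keyword the pipelines in this diff pass through; the sample numbers are illustrative only.

```python
# Standalone sketch: how metric="smape" vs. metric="mape" maps onto
# mean_absolute_percentage_error as wired up in the pipelines above.
# Assumes sktime provides the metric with the `symmetric` keyword.
import numpy as np
from sktime.performance_metrics.forecasting import mean_absolute_percentage_error

actual = np.array([10.0, 20.0, 30.0, 40.0])    # backtest actuals for one series/window
forecast = np.array([12.0, 18.0, 33.0, 36.0])  # model forecast for the same window

# metric="smape" -> symmetric=True:  mean of |y - yhat| / ((|y| + |yhat|) / 2)
smape = mean_absolute_percentage_error(actual, forecast, symmetric=True)

# metric="mape"  -> symmetric=False: mean of |y - yhat| / |y|
mape = mean_absolute_percentage_error(actual, forecast, symmetric=False)

print(f"smape={smape:.4f}  mape={mape:.4f}")
```

With this wiring, passing ```metric="mape"``` to ```run_forecast``` instead of the default ```"smape"``` changes both the per-series values written to ```evaluation_output``` and the aggregated metric logged to MLflow.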