diff --git a/environment.yml b/environment.yml index d34d498..a78fab3 100644 --- a/environment.yml +++ b/environment.yml @@ -11,7 +11,7 @@ dependencies: - psutil - birdy #- tigramite -#- tefs +#- tefs==0.3.1 - pandas - scikit-learn - numpy diff --git a/hawk/analysis/main.py b/hawk/analysis/main.py index ac423ae..2d51add 100644 --- a/hawk/analysis/main.py +++ b/hawk/analysis/main.py @@ -2,11 +2,11 @@ import os import pandas as pd +from tefs.metrics import regression_analysis from tigramite.independence_tests.cmiknn import CMIknn from tigramite.independence_tests.parcorr import ParCorr from .file_management import save_to_pkl_file -from .metrics import regression_analysis from .pcmci_tools import initialize_tigramite_df from .postprocessing import ( run_postprocessing_pcmci, diff --git a/hawk/analysis/metrics.py b/hawk/analysis/metrics.py deleted file mode 100644 index 066f0c6..0000000 --- a/hawk/analysis/metrics.py +++ /dev/null @@ -1,124 +0,0 @@ -from typing import Any, Dict, Optional, Tuple - -import pandas as pd -from sklearn.linear_model import LinearRegression -from sklearn.metrics import r2_score -from sklearn.model_selection import BaseCrossValidator, cross_val_score - -inputs_names_lags_doc = """ -:param inputs_names_lags: A dictionary mapping input feature names to their corresponding list of lags. - For example, {'feature1': [1, 2], 'feature2': [1]} indicates 'feature1' should be lagged by 1 and 2 periods, - and 'feature2' by 1 period. -""" - -target_name_doc = """ -:param target_name: The name of the target variable in the DataFrame. -""" - - -def prepare_data_with_lags( - df: pd.DataFrame, - inputs_names_lags: Dict[str, list[int]], - target_name: str, -) -> Tuple[pd.DataFrame, pd.Series]: - f""" - Prepares data for regression by generating lagged features for specified variables and targets. - - :param df: The pandas DataFrame containing the time series data. - {inputs_names_lags_doc} - {target_name_doc} - :return: A tuple containing the lagged features DataFrame and the target variable Series. - """ - - required_columns = set([*inputs_names_lags.keys(), target_name]) - if not required_columns.issubset(set(df.columns)): - raise ValueError( - "DataFrame 'df' must contain all the columns specified in 'features_names' and 'targets_names'." - ) - - for lags in inputs_names_lags.values(): - if lags and min(lags) < 0: - raise ValueError("Lag for independent variables must be a non-negative integer.") - - # Initialize a list to hold all DataFrame chunks - lagged_chunks = [] - - # Generate lagged inputs for the independent variables - for input, lags in inputs_names_lags.items(): - for lag in lags: - lagged_chunk = df[input].shift(lag).to_frame(f"{input}_t-{lag}") - lagged_chunks.append(lagged_chunk) - - # Adding target column - lagged_chunks.append(df[target_name].to_frame(target_name)) - - # Concatenate chunks - df_lagged = pd.concat(lagged_chunks, axis=1) - - # Dropping rows with NaN values caused by shifting - df_lagged = df_lagged.dropna() - - return df_lagged.drop(columns=target_name), df_lagged[target_name] - - -def regression_analysis( - inputs_names_lags: Dict[str, list[int]], - target_name: str, - df: Optional[pd.DataFrame] = None, - cv_scheme: Optional[BaseCrossValidator] = None, - df_train: Optional[pd.DataFrame] = None, - df_test: Optional[pd.DataFrame] = None, -) -> Any: - f""" - Performs regression analysis with support for either cross-validation or a train-test split, - based on the arguments provided. - - {inputs_names_lags_doc} - {target_name_doc} - :param df: DataFrame for cross-validation mode. If specified, cv_scheme must also be provided. - :param cv_scheme: Cross-validator object for cross-validation mode. If specified, df must also be provided. - :param df_train: Training DataFrame for train-test split mode. Required if df_test is provided. - :param df_test: Testing DataFrame for train-test split mode. Requires df_train to be specified. - :return: Cross-validated scores or R-squared scores from train-test evaluation. - """ - - # Check that exactly one mode is specified - cross_val_mode = bool(df is not None and cv_scheme is not None) - train_test_mode = bool(df_train is not None and df_test is not None) - if not (cross_val_mode ^ train_test_mode): - raise ValueError( - "Specify either a 'cv_scheme' and 'df', or a train-test split with 'df_train' and 'df_test', not both." - ) - - if cross_val_mode: - if df is None or cv_scheme is None: - raise ValueError("Both 'df' and 'cv_scheme' must be specified for cross-validation mode.") - - X, y = prepare_data_with_lags( - df, - inputs_names_lags, - target_name, - ) - - model = LinearRegression() - return cross_val_score(model, X, y, cv=cv_scheme) - - elif train_test_mode: - if df_train is None or df_test is None: - raise ValueError("Both 'df_train' and 'df_test' must be specified for train-test split mode.") - - X_train, y_train = prepare_data_with_lags( - df_train, - inputs_names_lags, - target_name, - ) - - X_test, y_test = prepare_data_with_lags( - df_test, - inputs_names_lags, - target_name, - ) - - model = LinearRegression().fit(X_train, y_train) - y_pred = model.predict(X_test) - return r2_score(y_test, y_pred) diff --git a/hawk/analysis/postprocessing.py b/hawk/analysis/postprocessing.py index 8928932..bff5452 100644 --- a/hawk/analysis/postprocessing.py +++ b/hawk/analysis/postprocessing.py @@ -6,10 +6,10 @@ import numpy as np import pandas as pd import seaborn as sns +from tefs.metrics import regression_analysis # from tigramite import plotting as tp from .file_management import save_to_pkl_file -from .metrics import regression_analysis from .pcmci_tools import get_connected_variables @@ -257,7 +257,7 @@ def run_postprocessing_pcmci( target_file_plots = {} for image_format in image_formats: target_file_plot = os.path.join( - destination_path, "algorithm_results", "pcmci", f"feature_presence.{image_format}" + destination_path, "algorithm_results", "pcmci", f"feature_presence_pcmci.{image_format}" ) os.makedirs(os.path.dirname(target_file_plot), exist_ok=True) plt.savefig(target_file_plot, bbox_inches="tight") @@ -384,7 +384,9 @@ def run_postprocessing_tefs( ) target_file_plots = {} for image_format in image_formats: - target_file_plot = os.path.join(destination_path, "algorithm_results", "te", f"feature_presence.{image_format}") + target_file_plot = os.path.join( + destination_path, "algorithm_results", "te", f"feature_presence_tefs.{image_format}" + ) os.makedirs(os.path.dirname(target_file_plot), exist_ok=True) plt.savefig(target_file_plot, bbox_inches="tight") target_file_plots[image_format] = target_file_plot diff --git a/hawk/processes/simulation_interactive.py b/hawk/processes/simulation_interactive.py index 2d3bd68..e833b84 100644 --- a/hawk/processes/simulation_interactive.py +++ b/hawk/processes/simulation_interactive.py @@ -1,9 +1,5 @@ from birdy import WPSClient -train_file_path = "Emiliani1_train.csv" -test_file_path = "Emiliani1_test.csv" -target_column_name = "cyclostationary_mean_rr_4w_1" - # ----------------- WPS ----------------- wps = WPSClient("http://localhost:5002/wps", verify=False) @@ -11,14 +7,14 @@ # Input some data for the causal process resp = wps.causal( - dataset_train=open(train_file_path), - dataset_test=open(test_file_path), - target_column_name=target_column_name, + dataset_train="https://raw.githubusercontent.com/climateintelligence/hawk/main/hawk/demo/Ticino_train.csv", + dataset_test="https://raw.githubusercontent.com/climateintelligence/hawk/main/hawk/demo/Ticino_train.csv", + target_column_name="target", pcmci_test_choice="ParCorr", - pcmci_max_lag="0", - tefs_direction="both", - tefs_use_contemporary_features="Yes", - tefs_max_lag_features="1", + pcmci_max_lag="1", + tefs_direction="forward", + tefs_use_contemporary_features=True, + tefs_max_lag_features="2", tefs_max_lag_target="1", ) diff --git a/hawk/processes/wps_causal.py b/hawk/processes/wps_causal.py index c411275..4f549ae 100644 --- a/hawk/processes/wps_causal.py +++ b/hawk/processes/wps_causal.py @@ -10,7 +10,7 @@ FORMAT_PNG = Format("image/png", extension=".png", encoding="base64") FORMAT_PDF = Format("application/pdf", extension=".pdf", encoding="utf-8") -FORMAT_PICKLE = Format("application/octet-stream", extension=".pkl", encoding="utf-8") +FORMAT_PICKLE = Format("application/octet-stream", extension=".pkl") class Causal(Process): @@ -39,12 +39,14 @@ def __init__(self): "Target Column Name", data_type="string", abstract="Please enter the case-specific name of the target variable in the dataframe.", + default="target", ), LiteralInput( "pcmci_test_choice", "PCMCI Test Choice", data_type="string", abstract="Choose the independence test to be used in PCMCI.", + default="ParCorr", allowed_values=[ "ParCorr", "CMIknn", @@ -55,6 +57,7 @@ def __init__(self): "PCMCI Max Lag", data_type="string", abstract="Choose the maximum lag to test used in PCMCI.", + default="1", allowed_values=[ "0", "1", @@ -69,6 +72,7 @@ def __init__(self): "TEFS Direction", data_type="string", abstract="Choose the direction of the TEFS algorithm.", + default="both", allowed_values=[ "forward", "backward", @@ -79,14 +83,15 @@ def __init__(self): "tefs_use_contemporary_features", "TEFS Use Contemporary Features", data_type="boolean", - abstract="Choose whether to use comtemporary features in the TEFS algorithm.", - default="Yes", + abstract="Choose whether to use contemporary features in the TEFS algorithm.", + default=True, ), LiteralInput( "tefs_max_lag_features", "TEFS Max Lag Features", data_type="string", abstract="Choose the maximum lag of the features in the TEFS algorithm.", + default="1", allowed_values=[ "no_lag", "1", @@ -101,6 +106,7 @@ def __init__(self): "TEFS Max Lag Target", data_type="string", abstract="Choose the maximum lag of the target in the TEFS algorithm.", + default="1", allowed_values=[ "1", "2", diff --git a/requirements.txt b/requirements.txt index f89c934..2bcfac0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ psutil pywps>=4.5.1,<4.6 birdhouse-birdy tigramite>=5.2.5.1 -tefs +tefs==0.3.1 pandas scikit-learn numpy \ No newline at end of file diff --git a/tests/test_causal_analysis.py b/tests/test_causal_analysis.py index a9a0ae2..64948a4 100644 --- a/tests/test_causal_analysis.py +++ b/tests/test_causal_analysis.py @@ -34,8 +34,8 @@ def test_causal_analysis(): df_test = pd.read_csv("hawk/demo/Ticino_test.csv", header=0) target_column_name = "target" pcmci_test_choice = "ParCorr" - pcmci_max_lag = 0 - tefs_direction = "forward" + pcmci_max_lag = 2 + tefs_direction = "both" tefs_use_contemporary_features = True tefs_max_lag_features = 1 tefs_max_lag_target = 1