From aafe8f481a332b351a3eb6e70dea14bf23b0eb3b Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Fri, 22 Mar 2024 23:29:43 +0100 Subject: [PATCH 01/51] Update dependencies --- environment.yml | 4 ++++ requirements.txt | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/environment.yml b/environment.yml index 3a54c15..120e234 100644 --- a/environment.yml +++ b/environment.yml @@ -9,5 +9,9 @@ dependencies: - jinja2 - click - psutil +- birdhouse-birdy +- tigramite +- tefs +- pandas # tests - pytest diff --git a/requirements.txt b/requirements.txt index 01cdf08..e0d398c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,7 @@ click jinja2 psutil pywps>=4.5.1,<4.6 +birdhouse-birdy +tigramite +tefs +pandas \ No newline at end of file From 4d5a1cf84cec65f30e99a25dc2ead517292f717f Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Fri, 22 Mar 2024 23:30:03 +0100 Subject: [PATCH 02/51] Dump code from thesis --- hawk/analysis/__init__.py | 0 hawk/analysis/config_pcmci.py | 78 + hawk/analysis/config_te.py | 54 + hawk/analysis/metrics.py | 117 + hawk/analysis/pcmci_tools.py | 53 + hawk/analysis/run_postprocessing.py | 431 ++ .../run_postprocessing_followup.ipynb | 4860 +++++++++++++++++ hawk/analysis/run_simulation_pcmci.py | 109 + hawk/analysis/run_simulation_te.py | 85 + hawk/processes/simulation_interactive.py | 19 + 10 files changed, 5806 insertions(+) create mode 100644 hawk/analysis/__init__.py create mode 100644 hawk/analysis/config_pcmci.py create mode 100644 hawk/analysis/config_te.py create mode 100644 hawk/analysis/metrics.py create mode 100644 hawk/analysis/pcmci_tools.py create mode 100644 hawk/analysis/run_postprocessing.py create mode 100644 hawk/analysis/run_postprocessing_followup.ipynb create mode 100644 hawk/analysis/run_simulation_pcmci.py create mode 100644 hawk/analysis/run_simulation_te.py create mode 100644 hawk/processes/simulation_interactive.py diff --git a/hawk/analysis/__init__.py b/hawk/analysis/__init__.py new file mode 100644 index 
0000000..e69de29 diff --git a/hawk/analysis/config_pcmci.py b/hawk/analysis/config_pcmci.py new file mode 100644 index 0000000..efe3cfd --- /dev/null +++ b/hawk/analysis/config_pcmci.py @@ -0,0 +1,78 @@ +import itertools + +import numpy as np +from tigramite.independence_tests.cmiknn import CMIknn +from tigramite.independence_tests.parcorr import ParCorr + +from ..data.data_ticino import ( + df_ticino, + df_ticino_snowlakes, + df_ticino_snowlakes_test, + df_ticino_snowlakes_tigramite, + df_ticino_snowlakes_train, + df_ticino_test, + df_ticino_tigramite, + df_ticino_train, + var_names_ticino, + var_names_ticino_snowlakes, +) + +seed = 42 +np.random.seed(seed) + +# Define the tests +parcorr = ParCorr(significance="analytic") +cmiknn = CMIknn(significance="shuffle_test", knn=0.1, shuffle_neighbors=5, transform="ranks", sig_samples=200) + +# Create the dictionary of tests +independence_tests = { + "parcorr": parcorr, + "cmiknn": cmiknn, +} + +# Create the dictionary of datasets +datasets = { + "snowlakes": { + "full_tigramite": df_ticino_snowlakes_tigramite, + "full": df_ticino_snowlakes, + "train": df_ticino_snowlakes_train, + "test": df_ticino_snowlakes_test, + "var_names": var_names_ticino_snowlakes, + }, +} + +# Variables +lag_options = [ + 0, + 1, +] +independence_tests_options = [ + "parcorr", + "cmiknn", +] +# NOTE add here if you want the base algorithm as well +algorithm_options = [ + "pcmci_plus", +] +dataset_options = [ + "normal", + "snowlakes", +] + +# Generating the configurations +configurations = [] + +for lag, independencetest, algorithm, dataset_name in itertools.product(lag_options, independence_tests_options, algorithm_options, dataset_options): + configuration = { + "params": { + "lag": lag, + "independencetest": independencetest, + "algorithm": algorithm, + }, + "dataset_name": dataset_name, + } + configurations.append(configuration) + + +def load_ticino(): + return datasets, configurations, independence_tests diff --git 
a/hawk/analysis/config_te.py b/hawk/analysis/config_te.py new file mode 100644 index 0000000..c2bd14d --- /dev/null +++ b/hawk/analysis/config_te.py @@ -0,0 +1,54 @@ +import itertools + +import numpy as np + +# Load here the dataset +# ... + +np.random.seed(42) + +# Define the different dataframes to use +datasets = { + "normal": { + "full": df_ticino, + "train": df_ticino_train, + "test": df_ticino_test, + "var_names": df_ticino.columns, + }, +} + +# Constants +# - `threshold` is set to be large in the forward direction (give me all the information) and 0 in the backward direction. +# - `k` rule of thumb: $1/20$ of the number of samples (try 5,10,20,30...) (TODO) +lagtarget = [1] +threshold_forward = float("inf") +threshold_backward = 0 +k = 10 + +# Variables set by the configuration +lagfeatures_options = [[0], [0, 1]] +directions = ["forward", "backward"] +dataset_names = [ + "normal", +] + +# Generating the configurations +configurations = [] + +for lagfeatures, direction, dataset_name in itertools.product(lagfeatures_options, directions, dataset_names): + threshold = threshold_forward if direction == "forward" else threshold_backward + configuration = { + "params": { + "lagfeatures": lagfeatures, + "lagtarget": lagtarget, + "direction": direction, + "threshold": threshold, # NOTE: the threshold is set here, although it is not used during the simulation, but only during the postprocessing, might be better to change this behavior + "k": k, + }, + "dataset_name": dataset_name, + } + configurations.append(configuration) + + +def load_te(): + return datasets, configurations diff --git a/hawk/analysis/metrics.py b/hawk/analysis/metrics.py new file mode 100644 index 0000000..a049508 --- /dev/null +++ b/hawk/analysis/metrics.py @@ -0,0 +1,117 @@ +from typing import Any, Dict, Optional + +import pandas as pd +from sklearn.linear_model import LinearRegression +from sklearn.metrics import r2_score +from sklearn.model_selection import BaseCrossValidator, cross_val_score + 
+inputs_names_lags_doc = """ +:param inputs_names_lags: A dictionary mapping input feature names to their corresponding list of lags. + For example, {'feature1': [1, 2], 'feature2': [1]} indicates 'feature1' should be lagged by 1 and 2 periods, + and 'feature2' by 1 period. +""" + +target_name_doc = """ +:param target_name: The name of the target variable in the DataFrame. +""" + +def prepare_data_with_lags( + df: pd.DataFrame, + inputs_names_lags: Dict[str, list[int]], + target_name: str, +) -> pd.DataFrame: + f""" + Prepares data for regression by generating lagged features for specified variables and targets. + + :param df: The pandas DataFrame containing the time series data. + {inputs_names_lags_doc} + {target_name_doc} + :return: A tuple containing the lagged features DataFrame and the target variable Series. + """ + + required_columns = set([*inputs_names_lags.keys(), target_name]) + if not required_columns.issubset(set(df.columns)): + raise ValueError("DataFrame 'df' must contain all the columns specified in 'features_names' and 'targets_names'.") + + for lags in inputs_names_lags.values(): + if min(lags) < 0: + raise ValueError("Lag for independent variables must be a non-negative integer.") + + # Initialize a list to hold all DataFrame chunks + lagged_chunks = [] + + # Generate lagged inputs for the independent variables + for input, lags in inputs_names_lags.items(): + for lag in lags: + lagged_chunk = df[input].shift(lag).to_frame(f"{input}_t-{lag}") + lagged_chunks.append(lagged_chunk) + + # Adding target column + lagged_chunks.append(df[target_name].to_frame(target_name)) + + # Concatenate chunks + df_lagged = pd.concat(lagged_chunks, axis=1) + + # Dropping rows with NaN values caused by shifting + df_lagged = df_lagged.dropna() + + return df_lagged.drop(columns=target_name), df_lagged[target_name] + + + + +def regression_analysis( + inputs_names_lags: Dict[str, list[int]], + target_name: str, + df: Optional[pd.DataFrame] = None, + cv_scheme: 
Optional[BaseCrossValidator] = None, + df_train: Optional[pd.DataFrame] = None, + df_test: Optional[pd.DataFrame] = None +) -> Any: + f""" + Performs regression analysis with support for either cross-validation or a train-test split, + based on the arguments provided. + + {inputs_names_lags_doc} + {target_name_doc} + :param df: DataFrame for cross-validation mode. If specified, cv_scheme must also be provided. + :param cv_scheme: Cross-validator object for cross-validation mode. If specified, df must also be provided. + :param df_train: Training DataFrame for train-test split mode. Required if df_test is provided. + :param df_test: Testing DataFrame for train-test split mode. Requires df_train to be specified. + :return: Cross-validated scores or R-squared scores from train-test evaluation. + """ + + # Check that exactly one mode is specified + cross_val_mode = bool(df is not None and cv_scheme is not None) + train_test_mode = bool(df_train is not None and df_test is not None) + if not (cross_val_mode ^ train_test_mode): + raise ValueError("Specify either cross-validation with 'cv_scheme' and 'df', or a train-test split with 'df_train' and 'df_test', not both.") + + if cross_val_mode: + + X, y = prepare_data_with_lags( + df, + inputs_names_lags, + target_name, + ) + + model = LinearRegression() + return cross_val_score(model, X, y, cv=cv_scheme) + + elif train_test_mode: + + X_train, y_train = prepare_data_with_lags( + df_train, + inputs_names_lags, + target_name, + ) + + X_test, y_test = prepare_data_with_lags( + df_test, + inputs_names_lags, + target_name, + ) + + model = LinearRegression().fit(X_train, y_train) + y_pred = model.predict(X_test) + return r2_score(y_test, y_pred) diff --git a/hawk/analysis/pcmci_tools.py b/hawk/analysis/pcmci_tools.py new file mode 100644 index 0000000..63a3859 --- /dev/null +++ b/hawk/analysis/pcmci_tools.py @@ -0,0 +1,53 @@ +import numpy as np +import pandas as pd +from tigramite import data_processing as pp + + +def 
get_connected_variables(graph: np.ndarray, var_names: list[str]) -> list[str]: + """ + Get the variables connected to the target in the graph. + The target is assumed to be the last variable. + The connection is considered of any type: from, to, or undefined. + + :param graph: the graph of the PCMCI algorithm, i.e. what's returned by PCMCI.run_pcmci(), array of shape [N, N, tau_max+1] + :param var_names: the names of the variables + """ + + assert len(graph.shape) == 3, "The graph must be a 3D array" + assert graph.shape[0] == graph.shape[1], "The graph must be square" + + # Inspecting the results object + # results['p_matrix'] + # results['val_matrix'] + # results['graph'] + # results['graph'][-1] # last element (target connections, target is always last) + + # in the array replace the empty with 0, otherwise it's 1 (there's a connection) + np.where(graph[-1] == "", 0, 1) + + # transpose it and add it to a dataframe with the variable names + # each row is a lag (when lag is 1, I have a row for lag 0 and one for lag 1) + target_connections = pd.DataFrame(np.where(graph[-1] == "", 0, 1).T, columns=var_names) + + # ignore autocorrelation (a connection from a variable to itself) + target_connections = target_connections.drop(var_names[-1], axis=1) + + # drop all columns with only zeros (no connection) and keep the names + connected_variables = list(target_connections.loc[:, (target_connections != 0).any(axis=0)].columns.values) + + return connected_variables + + +def initialize_tigramite_df(df: pd.DataFrame): + """ + Initialize a tigramite dataframe from a pandas dataframe + + :param df: pandas dataframe + :return: tigramite dataframe and variable names tuple + """ + + var_names = df.columns + + dataframe = pp.DataFrame(df.values, datatime={0: np.arange(len(df))}, var_names=var_names) + + return dataframe, var_names diff --git a/hawk/analysis/run_postprocessing.py b/hawk/analysis/run_postprocessing.py new file mode 100644 index 0000000..2943cce --- /dev/null +++ 
b/hawk/analysis/run_postprocessing.py @@ -0,0 +1,431 @@ +import os +import re + +import matplotlib.colors as mcolors +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +import thesis.constants as constants +import thesis.file_management as file_management +from tefs.metrics import regression_analysis +from thesis import datasets_and_configurations_loaders, pcmci_tools +from tigramite import plotting as tp + +plt.rc("text", usetex=True) + + +# Adjusted custom sort key function to handle lag sequences and replace them with the last lag +def general_custom_sort_key(s): + # Find all sequences like [0,1], [1,2,3,5,6], etc. + sequences = re.findall(r"\[\d+(?:,\d+)*\]", s) + + # Process each found sequence + for seq in sequences: + # Split the sequence into individual numbers, convert to int, and take the last number + last_num = seq.strip("[]").split(",")[-1] + + # Replace the original sequence with a format that ensures correct sorting + s = s.replace(seq, f"[{last_num}]", 1) # Replace only the first occurrence to maintain structure + + return s + + +def plot_feature_presence_and_r2(df_presence, scores_values, scores_labels): + # Create a figure with two subplots + fig, (ax_bar, ax_heatmap) = plt.subplots( + 2, + gridspec_kw={"height_ratios": [3, 10]}, + sharex=True, + figsize=(12, 5), + ) + + # Plot the heatmap of R2 values + sns.heatmap( + np.stack(scores_values), + annot=True, + fmt=".3f", + ax=ax_bar, + cmap="Greens", + vmin=0, + vmax=0.5, + cbar=False, + linewidths=0.2, + linecolor="black", + clip_on=False, + annot_kws={"size": 8}, + yticklabels=scores_labels, + ) + # ax_bar.set_yticks([]) + ax_bar.set_title(r"$R^2$ Value of the configuration") + ax_bar.tick_params(left=True, bottom=False) + ax_bar.set_yticklabels(ax_bar.get_yticklabels(), rotation=0) + + cmap_presence = mcolors.ListedColormap(["white", "black", "#851010"]) + bounds = [0, 1, 2, 3] # Boundaries for 0 -> white, 1 -> black, 2 -> red (#851010) + norm = 
mcolors.BoundaryNorm(bounds, cmap_presence.N) + + # Plot the heatmap of presences + sns.heatmap( + df_presence, + cmap=cmap_presence, + norm=norm, + cbar=False, + ax=ax_heatmap, + linewidths=0.2, + linecolor="black", + clip_on=False, + ) + ax_heatmap.set_ylabel("Feature name") + ax_heatmap.set_xlabel("Simulation ID") + ax_heatmap.tick_params(left=True, bottom=False) + + return fig, (ax_bar, ax_heatmap) + + +def main(): + # List all files in the results folder ending with .pkl + results_files = sorted([file for file in os.listdir(constants.path_results) if file.endswith(".pkl")], key=general_custom_sort_key) + + results_pcmci = {} + results_te = {} + + for file in results_files: + parts = file.split("_") + algorithm = parts[0] + basin = parts[1] + key = file.split(basin)[1][1:-4] + + if algorithm == "pcmci": + if basin not in results_pcmci: + results_pcmci[basin] = {} + + results_pcmci[basin][key] = file_management.load_from_pkl_file(os.path.join(constants.path_results, file)) + + elif algorithm == "te": + + if basin not in results_te: + results_te[basin] = {} + + results_te[basin][key] = file_management.load_from_pkl_file(os.path.join(constants.path_results, file)) + + # -------------------- PCMCI -------------------- + + for basin_name, basin_results in results_pcmci.items(): + datasets, _, _ = datasets_and_configurations_loaders["pcmci"].get(basin_name)() + + all_basin_variables = set() + results_table_pcmci = [] + for key, simulation in basin_results.items(): + dataframe = datasets[simulation["dataset_name"]] + var_names = dataframe["var_names"] + all_basin_variables.update(var_names.values) + + results = simulation["results"] + + # Plot only the connections to any of the target variables + temp_graph = results["graph"].copy() + + # Show only the connections to the target variables + # Identify the indexes of the target variables + # target_vars = np.where(["target" in var for var in var_names.values])[0] + # for i in range(temp_graph.shape[0]): + # for j in 
range(temp_graph.shape[1]): + # # if the edge is not connected to the target variables + # if i not in target_vars and j not in target_vars: + # # remove the edge + # temp_graph[i, j, :] = '' + # temp_graph[j, i, :] = '' + + # Base arguments for tp.plot_graph + plot_args = { + "val_matrix": results["val_matrix"], + "graph": temp_graph, + "var_names": var_names, + "link_colorbar_label": "cross-MCI", + "node_colorbar_label": "auto-MCI", + "show_autodependency_lags": False, + } + + # Additional arguments to include if the independence_test is CMIknn + if simulation["params"]["independencetest"] == "cmiknn": + plot_args.update( + { + "vmin_edges": 0.0, + "vmax_edges": 0.1, + "edge_ticks": 0.05, + "cmap_edges": "OrRd", + "vmin_nodes": 0, + "vmax_nodes": 0.1, + "node_ticks": 0.1, + "cmap_nodes": "OrRd", + } + ) + + # Plot causal graph + target_file = os.path.join(constants.path_figures, "algorithm_results", basin_name, "pcmci", key + ".pdf") + if not os.path.exists(target_file): + fig, ax = plt.subplots() + tp.plot_graph(**plot_args, fig_ax=(fig, ax)) + os.makedirs(os.path.dirname(target_file), exist_ok=True) + plt.savefig(target_file, bbox_inches="tight") + plt.close(fig) + + # Plot time series graph if lag > 0 + if simulation["params"]["lag"] > 0: + target_file = os.path.join(constants.path_figures, "algorithm_results", basin_name, "pcmci", key + "_timeseries.pdf") + if not os.path.exists(target_file): + fig, ax = plt.subplots() + tp.plot_time_series_graph( + figsize=(6, 4), + fig_ax=(fig, ax), + val_matrix=results["val_matrix"], + graph=results["graph"], + var_names=var_names, + link_colorbar_label="MCI", + ) + os.makedirs(os.path.dirname(target_file), exist_ok=True) + plt.savefig(target_file, bbox_inches="tight") + plt.close(fig) + + # Extract the selected features + selected_features = pcmci_tools.get_connected_variables(results["graph"], var_names) + basin_results[key]["selected_features"] = selected_features + + # Compute the R2 scores + inputs_names_lags = 
{feature: [0] for feature in selected_features} + score_r2 = ( + regression_analysis( + inputs_names_lags=inputs_names_lags, + target_name="target", + df_train=dataframe["train"], + df_test=dataframe["test"], + ) + if len(selected_features) > 0 + else np.nan + ) + basin_results[key]["score_r2"] = score_r2 + + inputs_names_lags = {feature: list(range(0, simulation["params"]["lag"] + 1)) for feature in selected_features} + score_r2_lag = ( + regression_analysis( + inputs_names_lags=inputs_names_lags, + target_name="target", + df_train=dataframe["train"], + df_test=dataframe["test"], + ) + if len(selected_features) > 0 + else np.nan + ) + basin_results[key]["score_r2_lag"] = score_r2_lag + + inputs_names_lags = {feature: list(range(0, simulation["params"]["lag"] + 1)) for feature in selected_features} + inputs_names_lags["target"] = list(range(1, simulation["params"]["lag"] + 1)) + score_r2_lag_ar = regression_analysis( + inputs_names_lags=inputs_names_lags, + target_name="target", + df_train=dataframe["train"], + df_test=dataframe["test"], + ) + basin_results[key]["score_r2_lag_ar"] = score_r2_lag_ar + + # Table of results + results_table_pcmci.append( + { + "selected_features": " ".join(selected_features), + "score_r2": score_r2, + "score_r2_lag": score_r2_lag, + "score_r2_lag_ar": score_r2_lag_ar, + "dataset": simulation["dataset_name"], + "algorithm": simulation["params"]["algorithm"], + "independencetest": simulation["params"]["independencetest"], + "lag": simulation["params"]["lag"], + "execution_time": simulation["execution_time"], + } + ) + + results_table_pcmci = pd.DataFrame.from_records(results_table_pcmci) + + # Export the file to pkl + file_management.save_to_pkl_file(os.path.join(constants.path_table_objects, f"results_table_{basin_name}_pcmci.pkl"), results_table_pcmci) + + # Feature presences heatmap + if "target" in all_basin_variables: + all_basin_variables.remove("target") + all_basin_variables = sorted(list(all_basin_variables)) + df_presence = 
pd.DataFrame(index=all_basin_variables, columns=range(len(basin_results))) + scores = [] + scores_lag = [] + scores_lag_ar = [] + + for index, key in enumerate(basin_results): + simulation = basin_results[key] + scores.append(simulation["score_r2"]) + scores_lag.append(simulation["score_r2_lag"]) + scores_lag_ar.append(simulation["score_r2_lag_ar"]) + + # loop through the rows of the df, if the feature is in the list of selected features, put a 1 + for feature in df_presence.index: + if feature in simulation["selected_features"]: + df_presence.loc[feature, index] = 1 + else: + df_presence.loc[feature, index] = 0 + if feature not in datasets[simulation["dataset_name"]]["var_names"]: + df_presence.loc[feature, index] = 2 + + df_presence = df_presence.astype(float) + scores = np.array(scores) + scores_lag = np.array(scores_lag) + scores_lag_ar = np.array(scores_lag_ar) + + fig, ax = plot_feature_presence_and_r2( + df_presence=df_presence, + scores_values=[scores, scores_lag, scores_lag_ar], + scores_labels=[r"$R^2$", r"$R^2$ (lag)", r"$R^2$ (lag + AR)"], + ) + target_file = os.path.join(constants.path_figures, "algorithm_results", basin_name, "pcmci", "feature_presence.pdf") + os.makedirs(os.path.dirname(target_file), exist_ok=True) + plt.savefig(target_file, bbox_inches="tight") + plt.close(fig) + + # -------------------- TRANSFER ENTROPY -------------------- + + for basin_name, basin_results in results_te.items(): + datasets, _ = datasets_and_configurations_loaders["te"].get(basin_name)() + + all_basin_variables = set() + results_table_te = [] + for key, simulation in basin_results.items(): + dataset_name = simulation["dataset_name"] + dataframe = datasets[dataset_name] + var_names = dataframe["var_names"] + all_basin_variables.update(var_names) + + results = simulation["results"] + lagfeatures = simulation["params"]["lagfeatures"] + lagtarget = simulation["params"]["lagtarget"] + + # Plot the results + fig, ax = plt.subplots() + results.plot_te_results(ax=ax) + 
target_dir = os.path.join(constants.path_figures, "algorithm_results", basin_name, "te", key + ".pdf") + os.makedirs(os.path.dirname(target_dir), exist_ok=True) + plt.savefig(target_dir, bbox_inches="tight") + plt.close(fig) + + # Extract the selected features + selected_features_names = results.select_features(simulation["params"]["threshold"]) + basin_results[key]["selected_features"] = selected_features_names + + # get the r2 score on the test set + inputs_names_lags = {feature: [0] for feature in selected_features_names} + score_r2 = ( + regression_analysis( + inputs_names_lags=inputs_names_lags, + target_name="target", + df_train=dataframe["train"], + df_test=dataframe["test"], + ) + if len(selected_features_names) > 0 + else np.nan + ) + basin_results[key]["score_r2"] = score_r2 + + inputs_names_lags = {feature: lagfeatures for feature in selected_features_names} + score_r2_lag = ( + regression_analysis( + inputs_names_lags=inputs_names_lags, + target_name="target", + df_train=dataframe["train"], + df_test=dataframe["test"], + ) + if len(selected_features_names) > 0 + else np.nan + ) + basin_results[key]["score_r2_lag"] = score_r2_lag + + inputs_names_lags = {feature: lagfeatures for feature in selected_features_names} + inputs_names_lags["target"] = lagtarget + score_r2_lag_ar = regression_analysis( + inputs_names_lags=inputs_names_lags, + target_name="target", + df_train=dataframe["train"], + df_test=dataframe["test"], + ) + basin_results[key]["score_r2_lag_ar"] = score_r2_lag_ar + + # Table of results + results_table_te.append( + { + "selected_features": " ".join(selected_features_names), + "score_r2": score_r2, + "score_r2_lag": score_r2_lag, + "score_r2_lag_ar": score_r2_lag_ar, + "dataset": dataset_name, + "lagfeatures": simulation["params"]["lagfeatures"], + "lagtarget": simulation["params"]["lagtarget"], + "direction": simulation["params"]["direction"], # not putting threshold and k + "execution_time": simulation["execution_time"], + } + ) + + 
results_table_te = pd.DataFrame.from_records(results_table_te) + + # Export the file to pkl + file_management.save_to_pkl_file(os.path.join(constants.path_table_objects, f"results_table_{basin_name}_te.pkl"), results_table_te) + + # Feature presences heatmap + if "target" in all_basin_variables: + all_basin_variables.remove("target") + all_basin_variables = sorted(list(all_basin_variables)) + df_presence = pd.DataFrame(index=all_basin_variables, columns=range(len(basin_results))) + scores = [] + scores_lag = [] + scores_lag_ar = [] + + for index, key in enumerate(basin_results): + simulation = basin_results[key] + scores.append(simulation["score_r2"]) + scores_lag.append(simulation["score_r2_lag"]) + scores_lag_ar.append(simulation["score_r2_lag_ar"]) + + # loop through the rows of the df, if the feature is in the list of selected features, put a 1 + for feature in df_presence.index: + if feature in simulation["selected_features"]: + df_presence.loc[feature, index] = 1 + else: + df_presence.loc[feature, index] = 0 + if feature not in datasets[simulation["dataset_name"]]["var_names"]: + df_presence.loc[feature, index] = 2 + + df_presence = df_presence.astype(float) + scores = np.array(scores) + scores_lag = np.array(scores_lag) + scores_lag_ar = np.array(scores_lag_ar) + + fig, ax = plot_feature_presence_and_r2( + df_presence=df_presence, + scores_values=[scores, scores_lag, scores_lag_ar], + scores_labels=[r"$R^2$", r"$R^2$ (lag)", r"$R^2$ (lag + AR)"], + ) + target_file = os.path.join(constants.path_figures, "algorithm_results", basin_name, "te", "feature_presence.pdf") + os.makedirs(os.path.dirname(target_file), exist_ok=True) + plt.savefig(target_file, bbox_inches="tight") + plt.close(fig) + + +if __name__ == "__main__": + main() + + # Optional execution of the notebook + + # import nbformat + # from nbconvert.preprocessors import ExecutePreprocessor + + # filename = 'droughts_postprocessing_pt2_and_wrapper.ipynb' + # with open(filename) as ff: + # nb_in = 
nbformat.read(ff, nbformat.NO_CONVERT) + + # ep = ExecutePreprocessor(timeout=600, kernel_name='thesis') + + # nb_out = ep.preprocess(nb_in) diff --git a/hawk/analysis/run_postprocessing_followup.ipynb b/hawk/analysis/run_postprocessing_followup.ipynb new file mode 100644 index 0000000..72dd614 --- /dev/null +++ b/hawk/analysis/run_postprocessing_followup.ipynb @@ -0,0 +1,4860 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3584ce0f", + "metadata": {}, + "source": [ + "# [Post-processing of results (second part)](#toc0_)" + ] + }, + { + "cell_type": "markdown", + "id": "8b66567c", + "metadata": {}, + "source": [ + "**Table of contents** \n", + "- [Post-processing of results (second part)](#toc1_) \n", + " - [Preliminaries](#toc1_1_) \n", + " - [Import libraries](#toc1_1_1_) \n", + " - [Utility functions](#toc1_1_2_) \n", + " - [Utilities for the summarized version](#toc1_1_3_) \n", + " - [Utilities for the full version](#toc1_1_4_) \n", + " - [Load the results](#toc1_1_5_) \n", + " - [Basin: E12GM](#toc1_2_) \n", + " - [Versione full](#toc1_2_1_) \n", + " - [PCMCI](#toc1_2_1_1_) \n", + " - [TEFS](#toc1_2_1_2_) \n", + " - [Versione summarized](#toc1_2_2_) \n", + " - [PCMCI](#toc1_2_2_1_) \n", + " - [TEFS](#toc1_2_2_2_) \n", + " - [Versione full senza CMI](#toc1_2_3_) \n", + " - [TEFS](#toc1_2_3_1_) \n", + " - [TEFS as wrapper on E12GM](#toc1_2_4_) \n", + " - [Linking the wrapper to the original filter method](#toc1_2_5_) \n", + " - [Basin: Ticino](#toc1_3_) \n", + " - [Versione full](#toc1_3_1_) \n", + " - [PCMCI](#toc1_3_1_1_) \n", + " - [TEFS](#toc1_3_1_2_) \n", + "\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "30e07d51", + "metadata": {}, + "source": [ + "## [Preliminaries](#toc0_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### [Import libraries](#toc0_)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "24d19ad5", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + 
"\n", + "os.chdir(os.path.dirname(os.path.abspath(__file__)))\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import scipy\n", + "import thesis.constants as constants\n", + "import thesis.file_management as file_management\n", + "from sklearn.model_selection import (\n", + " KFold,\n", + " TimeSeriesSplit,\n", + ")\n", + "from tefs.metrics import regression_analysis\n", + "from thesis import datasets_and_configurations_loaders" + ] + }, + { + "cell_type": "markdown", + "id": "d46401c9", + "metadata": {}, + "source": [ + "Set the retina resolution" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b047fdf8", + "metadata": {}, + "outputs": [], + "source": [ + "%config InlineBackend.figure_format = 'retina'" + ] + }, + { + "cell_type": "markdown", + "id": "2f374adf", + "metadata": {}, + "source": [ + "Enable the use of LaTeX for plots." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9be4a454", + "metadata": {}, + "outputs": [], + "source": [ + "plt.rc(\"text\", usetex=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### [Utility functions](#toc0_)\n", + "\n", + "General purpose formatter functions, valid for all stylers." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e4b8f2ce", + "metadata": {}, + "outputs": [], + "source": [ + "def makecell_code_formatter(x):\n", + " \"\"\"\n", + " Format a string to be used in a LaTeX table cell. 
Specifically, given a list of names, it will format them as a\n", + " single column with each name in a separate row and in a monospaced font.\n", + " \"\"\"\n", + " elements = x.split(\" \")\n", + " formatted_elements = [f\"\\\\texttt{{{element}}}\" for element in elements]\n", + " return \"\\\\makecell[l]{\" + \"\\\\\\\\ \".join(formatted_elements) + \"}\"\n", + "\n", + "\n", + "def format_time(seconds):\n", + " \"\"\"\n", + " Format a time in seconds to a human-readable format.\n", + " \"\"\"\n", + " return f\"{seconds:.3f}s\"\n", + "\n", + "\n", + "def highlight_row(s, row_indexes, color):\n", + " \"\"\"\n", + " Highlight the given row indexes of Series s with the given color.\n", + " \"\"\"\n", + " if \"test\" in s.name.lower():\n", + " return [\"\" for _ in s] # No styling for columns with \"test\" in their name\n", + " return [\"background-color: \" + color if i in row_indexes else \"\" for i in range(len(s))]\n", + "\n", + "\n", + "def color_direction(v):\n", + " \"\"\"\n", + " Color the text of a cell according to the direction of the value.\n", + " \"\"\"\n", + " color = \"black\"\n", + " if v == \"backward\":\n", + " color = \"red\"\n", + " elif v == \"forward\":\n", + " color = \"blue\"\n", + " return f\"color: {color}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### [Utilities for the summarized version](#toc0_)\n", + "\n", + "Some utilities are specific to the summarized version of the table of results." + ] + }, + { + "cell_type": "markdown", + "id": "d8369205", + "metadata": {}, + "source": [ + "For the PCMCI version." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5b725bd8", + "metadata": {}, + "outputs": [], + "source": [ + "def make_pcmci_pretty(styler):\n", + " styler.format(subset=[\"score_r2_lag\", \"score_r2_lag_ar\"], precision=3)\n", + " styler.background_gradient(cmap=\"Greens\", subset=[\"score_r2_lag\", \"score_r2_lag_ar\"], vmin=0, vmax=0.5)\n", + " styler.format(subset=[\"execution_time\"], precision=2)\n", + " # styler.format(formatter=format_time, subset=[\"execution_time\"])\n", + " styler.background_gradient(cmap=\"Reds\", subset=[\"execution_time\"], vmax=8000)\n", + " return styler\n", + "\n", + "\n", + "from pandas.io.formats.style_render import _escape_latex\n", + "\n", + "# I lost too much time trying to figure out why I can't format the \"names\" of the indexes (to escape them)\n", + "# The guy who wrote it didn't allow for this possibility, but thankfully (https://stackoverflow.com/questions/72716879/is-there-a-function-to-format-the-index-name-in-a-pandas-styler-dataframe-style)\n", + "# he proposed a workaround and opened a github issue (https://github.com/pandas-dev/pandas/issues/47489)\n", + "# But he didn't realize that this doesn't work on multi-indexes, so I had to modify his code a bit\n", + "\n", + "\n", + "def export_pcmci_df_to_latex(df, target_file, code_escaped_columns=[]):\n", + " temp_df = df.copy()\n", + " for level in range(temp_df.index.nlevels):\n", + " if temp_df.index.get_level_values(level).name is not None:\n", + " temp_df.index.set_names(_escape_latex(temp_df.index.get_level_values(level).name), level=level, inplace=True)\n", + " for level in range(temp_df.columns.nlevels):\n", + " if temp_df.columns.get_level_values(level).name is not None:\n", + " temp_df.columns.set_names(_escape_latex(temp_df.columns.get_level_values(level).name), level=level, inplace=True)\n", + "\n", + " with open(target_file, \"w\") as f:\n", + " f.write(\n", + " temp_df.style.pipe(make_pcmci_pretty)\n", + " 
.format_index(escape=\"latex\", axis=\"index\")\n", + " .format_index(escape=\"latex\", axis=\"columns\")\n", + " .format(formatter=makecell_code_formatter, escape=\"latex\", subset=code_escaped_columns)\n", + " .to_latex(hrules=True, clines=\"all;index\", convert_css=True, column_format=\"cclccccrr\")\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "580cc079", + "metadata": {}, + "source": [ + "For the TEFS version." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3406e415", + "metadata": {}, + "outputs": [], + "source": [ + "def make_te_pretty(styler):\n", + " styler.format(subset=[\"score_r2_lag\", \"score_r2_lag_ar\"], precision=3)\n", + " styler.background_gradient(cmap=\"Greens\", subset=[\"score_r2_lag\", \"score_r2_lag_ar\"], vmin=0, vmax=0.5)\n", + " styler.format(subset=[\"execution_time\"], precision=2)\n", + " styler.background_gradient(cmap=\"Reds\", subset=[\"execution_time\"], vmax=8000)\n", + " return styler\n", + "\n", + "\n", + "def export_te_df_to_latex(df, target_file, code_escaped_columns=[]):\n", + " temp_df = df.copy()\n", + " for level in range(temp_df.index.nlevels):\n", + " if temp_df.index.get_level_values(level).name is not None:\n", + " temp_df.index.set_names(_escape_latex(temp_df.index.get_level_values(level).name), level=level, inplace=True)\n", + " for level in range(temp_df.columns.nlevels):\n", + " if temp_df.columns.get_level_values(level).name is not None:\n", + " temp_df.columns.set_names(_escape_latex(temp_df.columns.get_level_values(level).name), level=level, inplace=True)\n", + "\n", + " with open(target_file, \"w\") as f:\n", + " f.write(\n", + " temp_df.style.pipe(make_te_pretty)\n", + " .format_index(escape=\"latex\", axis=\"index\")\n", + " .format_index(escape=\"latex\", axis=\"columns\")\n", + " .format(formatter=makecell_code_formatter, escape=\"latex\", subset=code_escaped_columns)\n", + " .to_latex(hrules=True, clines=\"all;index\", convert_css=True, column_format=\"cclccccrr\")\n", + 
" )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### [Utilities for the full version](#toc0_)\n", + "\n", + "Some utilities are specific to the full version of the table of results." + ] + }, + { + "cell_type": "markdown", + "id": "f5c023e4", + "metadata": {}, + "source": [ + "For the PCMCI version." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0ec91349", + "metadata": {}, + "outputs": [], + "source": [ + "def make_pcmci_all_pretty(styler):\n", + " styler.format(subset=[\"score_r2\", \"score_r2_lag\", \"score_r2_lag_ar\"], precision=3)\n", + " styler.background_gradient(cmap=\"Greens\", subset=[\"score_r2\", \"score_r2_lag\", \"score_r2_lag_ar\"], vmin=0, vmax=0.5)\n", + " styler.format(formatter=format_time, subset=[\"execution_time\"])\n", + " styler.background_gradient(cmap=\"Reds\", subset=[\"execution_time\"], vmax=8000)\n", + " return styler\n", + "\n", + "\n", + "def export_results_dataframe_pcmci(df: pd.DataFrame, target_file: str):\n", + " with open(target_file, \"w\") as f:\n", + " f.write(\n", + " df.style.pipe(make_pcmci_all_pretty)\n", + " .format_index(escape=\"latex\", axis=1)\n", + " .format(formatter=makecell_code_formatter, escape=\"latex\", subset=[\"selected_features\", \"dataset\", \"algorithm\", \"independencetest\"])\n", + " .to_latex(hrules=True, clines=\"all;data\", convert_css=True, column_format=\"llccclllcr\")\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "e563bd7f", + "metadata": {}, + "source": [ + "For the TEFS version." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ece2aa88", + "metadata": {}, + "outputs": [], + "source": [ + "# https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.html\n", + "# https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.format.html\n", + "# https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.to_latex.html\n", + "# https://www.youtube.com/watch?v=JGefS6WPm1E\n", + "# https://tex.stackexchange.com/questions/2441/how-to-add-a-forced-line-break-inside-a-table-cell\n", + "def make_te_all_pretty(styler):\n", + " styler.format(subset=[\"score_r2\", \"score_r2_lag\", \"score_r2_lag_ar\"], precision=3)\n", + " styler.background_gradient(cmap=\"Greens\", subset=[\"score_r2\", \"score_r2_lag\", \"score_r2_lag_ar\"], vmin=0, vmax=0.5)\n", + " #styler.map(color_direction, subset=[\"direction\"])\n", + " styler.format(formatter=format_time, subset=[\"execution_time\"])\n", + " styler.background_gradient(cmap=\"Reds\", subset=[\"execution_time\"], vmax=8000)\n", + " # styler.apply(highlight_row, row_indexes=[5,7], color='yellow', axis=0)\n", + " return styler\n", + "\n", + "\n", + "def export_results_dataframe_te(df: pd.DataFrame, target_file: str):\n", + " with open(target_file, \"w\") as f:\n", + " f.write(\n", + " df.style.pipe(make_te_all_pretty)\n", + " .format_index(escape=\"latex\", axis=1)\n", + " .format(formatter=makecell_code_formatter, escape=\"latex\", subset=[\"selected_features\", \"dataset\", \"direction\"])\n", + " .to_latex(hrules=True, clines=\"all;data\", convert_css=True, column_format=\"llccclcclr\")\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### [Load the results](#toc0_)\n", + "\n", + "Load the previously exported pandas dataframes containing the results of the analysis." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9aae6d86", + "metadata": {}, + "outputs": [], + "source": [ + "results_e12gm_pcmci = file_management.load_from_pkl_file(os.path.join(constants.path_table_objects, \"results_table_e12gm_pcmci.pkl\"))\n", + "results_e12gm_te = file_management.load_from_pkl_file(os.path.join(constants.path_table_objects, \"results_table_e12gm_te.pkl\"))\n", + "results_ticino_pcmci = file_management.load_from_pkl_file(os.path.join(constants.path_table_objects, \"results_table_ticino_pcmci.pkl\"))\n", + "results_ticino_te = file_management.load_from_pkl_file(os.path.join(constants.path_table_objects, \"results_table_ticino_te.pkl\"))\n", + "\n", + "results_e12gm_noCMI_te = file_management.load_from_pkl_file(os.path.join(constants.path_table_objects, \"results_table_e12gm_noCMI_te.pkl\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7377b8af", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Emiliani1Emiliani2GardaMincioTicino
solo (lag=0)0.2860710.2435340.1713070.154807
solo (lag=0,1)0.2840430.3061110.1919330.177646
solo + extended (lag=0)0.3444480.2924310.1854930.162634
solo + extended (lag=0,1)0.3732610.2927330.1695520.167326
ar(1)0.2752320.2920350.2896530.199807
ar(1) + solo (lag=0,1)0.4249010.4325530.4377950.331817
ar(1) + solo + extended (lag=0,1)0.4687670.4304170.4136160.330220
\n", + "
" + ], + "text/plain": [ + " Emiliani1 Emiliani2 GardaMincio Ticino\n", + "solo (lag=0) 0.286071 0.243534 0.171307 0.154807\n", + "solo (lag=0,1) 0.284043 0.306111 0.191933 0.177646\n", + "solo + extended (lag=0) 0.344448 0.292431 0.185493 0.162634\n", + "solo + extended (lag=0,1) 0.373261 0.292733 0.169552 0.167326\n", + "ar(1) 0.275232 0.292035 0.289653 0.199807\n", + "ar(1) + solo (lag=0,1) 0.424901 0.432553 0.437795 0.331817\n", + "ar(1) + solo + extended (lag=0,1) 0.468767 0.430417 0.413616 0.330220" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from scripts.run_benchmark import baseline\n", + "\n", + "baseline" + ] + }, + { + "cell_type": "markdown", + "id": "6d0bfac1", + "metadata": {}, + "source": [ + "## [Basin: Ticino](#toc0_)" + ] + }, + { + "cell_type": "markdown", + "id": "6477aa9b", + "metadata": {}, + "source": [ + "### [Full version](#toc0_)" + ] + }, + { + "cell_type": "markdown", + "id": "7223c7a0", + "metadata": {}, + "source": [ + "#### [PCMCI](#toc0_)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "1afbe3b5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 selected_featuresscore_r2score_r2_lagscore_r2_lag_ardatasetalgorithmindependencetestlagexecution_time
0cyclostationary_mean_tg_0 cyclostationary_mean_rr_4w_00.1490.1490.149normalpcmci_pluscmiknn032.662s
1cyclostationary_mean_tg_0 cyclostationary_mean_rr_4w_00.1490.1490.149normalpcmci_plusparcorr00.018s
2cyclostationary_mean_tg_00.1380.1880.331normalpcmci_pluscmiknn1112.996s
3cyclostationary_mean_tg_00.1380.1880.331normalpcmci_plusparcorr10.069s
4cyclostationary_mean_HS_0 cyclostationary_mean_tg_0 cyclostationary_mean_rr_4w_00.1490.1490.149snowlakespcmci_pluscmiknn0264.168s
5cyclostationary_mean_tg_0 cyclostationary_mean_rr_4w_00.1490.1490.149snowlakespcmci_plusparcorr00.146s
6cyclostationary_mean_tg_00.1380.1880.331snowlakespcmci_pluscmiknn1347.146s
7cyclostationary_mean_tg_00.1380.1880.331snowlakespcmci_plusparcorr10.285s
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_file = os.path.join(constants.path_table_tex, \"ticino_pcmci_full.tex\")\n", + "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", + "export_results_dataframe_pcmci(results_ticino_pcmci, target_file)\n", + "results_ticino_pcmci.style.pipe(make_pcmci_all_pretty)" + ] + }, + { + "cell_type": "markdown", + "id": "81df3268", + "metadata": {}, + "source": [ + "#### [TEFS](#toc0_)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "62939c64", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 selected_featuresscore_r2score_r2_lagscore_r2_lag_ardatasetlagfeatureslagtargetdirectionexecution_time
0cyclostationary_mean_tg_00.1380.1380.346normal[0][1]backward0.401s
1cyclostationary_mean_tg_00.1380.1380.346normal[0][1]forward0.372s
2cyclostationary_mean_tg_00.1380.1880.331normal[0, 1][1]backward0.415s
3cyclostationary_mean_tg_00.1380.1880.331normal[0, 1][1]forward0.388s
4cyclostat_level_Lugano cyclostationary_mean_tg_00.1940.1940.360snowlakes[0][1]backward1.650s
5cyclostat_level_Lugano cyclostationary_mean_tg_00.1940.1940.360snowlakes[0][1]forward1.612s
6cyclostationary_mean_HS_0 cyclostationary_mean_tg_20.0670.0830.309snowlakes[0, 1][1]backward1.777s
7cyclostat_level_Lugano cyclostationary_mean_tg_00.1940.2380.345snowlakes[0, 1][1]forward1.499s
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_file = os.path.join(constants.path_table_tex, \"ticino_te_full.tex\")\n", + "export_results_dataframe_te(results_ticino_te, target_file)\n", + "results_ticino_te.style.pipe(make_te_all_pretty)" + ] + }, + { + "cell_type": "markdown", + "id": "f465bea2", + "metadata": {}, + "source": [ + "## [Basin: E12GM](#toc0_)" + ] + }, + { + "cell_type": "markdown", + "id": "e78c4399", + "metadata": {}, + "source": [ + "### [Full version](#toc0_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### [PCMCI](#toc0_)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "48873e13", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 selected_featuresscore_r2score_r2_lagscore_r2_lag_ardatasetalgorithmindependencetestlagexecution_time
0E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_0 E1cyclostationary_mean_rr_1w_16 E1cyclostationary_mean_rr_24w_20.2860.2860.286df_E1pcmci_pluscmiknn0384.152s
1E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_0 E1cyclostationary_mean_rr_24w_20.2890.2890.289df_E1pcmci_plusparcorr00.046s
2E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_0 E1cyclostationary_mean_rr_1w_160.2510.2550.420df_E1pcmci_pluscmiknn11308.914s
3E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_00.2570.2650.421df_E1pcmci_plusparcorr10.274s
4E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_rr_1w_16 E1cyclostationary_mean_rr_24w_20.1810.1810.181df_E1allfeaturespcmci_pluscmiknn05532.618s
5E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_rr_24w_2 E2cyclostationary_mean_tg_00.2540.2540.254df_E1allfeaturespcmci_plusparcorr00.678s
6E1cyclostationary_mean_rr_4w_10.1510.1480.368df_E1allfeaturespcmci_pluscmiknn17726.129s
7E1cyclostationary_mean_rr_4w_10.1510.1480.368df_E1allfeaturespcmci_plusparcorr110.819s
8E2cyclostationary_mean_tg_0 E2cyclostationary_mean_rr_8w_0 E2cyclostationary_mean_rr_4w_50.2300.2300.230df_E2pcmci_pluscmiknn0810.721s
9E2cyclostationary_mean_tg_0 E2cyclostationary_mean_rr_4w_50.2220.2220.222df_E2pcmci_plusparcorr00.058s
10E2cyclostationary_mean_tg_0 E2cyclostationary_mean_rr_4w_50.2220.2390.416df_E2pcmci_pluscmiknn11596.577s
11E2cyclostationary_mean_tg_0 E2cyclostationary_mean_rr_8w_0 E2cyclostationary_mean_rr_4w_50.2300.2480.402df_E2pcmci_plusparcorr10.315s
12E1cyclostationary_mean_rr_24w_2 E2cyclostationary_mean_tg_0 GMcyclostationary_mean_rr_4w_10.2980.2980.298df_E2allfeaturespcmci_pluscmiknn05081.932s
13E1cyclostationary_mean_rr_4w_1 E2cyclostationary_mean_tg_00.1910.1910.191df_E2allfeaturespcmci_plusparcorr00.341s
14E2cyclostationary_mean_tg_00.1120.1460.391df_E2allfeaturespcmci_pluscmiknn15268.379s
15E2cyclostationary_mean_tg_00.1120.1460.391df_E2allfeaturespcmci_plusparcorr18.094s
16GMcyclostationary_mean_tg_1w_0 GMcyclostationary_mean_rr_4w_10.1770.1770.177df_GMpcmci_pluscmiknn087.161s
17GMcyclostationary_mean_tg_1w_0 GMcyclostationary_mean_rr_4w_10.1770.1770.177df_GMpcmci_plusparcorr00.016s
18GMcyclostationary_mean_tg_1w_0 GMcyclostationary_mean_rr_4w_10.1770.2030.443df_GMpcmci_pluscmiknn1504.896s
19GMcyclostationary_mean_tg_1w_0 GMcyclostationary_mean_rr_4w_10.1770.2030.443df_GMpcmci_plusparcorr10.089s
20E1cyclostationary_mean_rr_1w_16 E1cyclostationary_mean_rr_24w_2 E2cyclostationary_mean_tg_00.0880.0880.088df_GMallfeaturespcmci_pluscmiknn01826.917s
21E1cyclostationary_mean_rr_4w_1 E2cyclostationary_mean_tg_00.1100.1100.110df_GMallfeaturespcmci_plusparcorr00.345s
22E1cyclostationary_mean_rr_4w_1 E2cyclostationary_mean_tg_00.1100.1200.406df_GMallfeaturespcmci_pluscmiknn14688.385s
23E1cyclostationary_mean_rr_4w_1 E2cyclostationary_mean_tg_00.1100.1200.406df_GMallfeaturespcmci_plusparcorr19.835s
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_file = os.path.join(constants.path_table_tex, \"e12gm_pcmci_full.tex\")\n", + "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", + "export_results_dataframe_pcmci(results_e12gm_pcmci, target_file)\n", + "results_e12gm_pcmci.style.pipe(make_pcmci_all_pretty)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### [TEFS](#toc0_)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "0dcb9a0b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 selected_featuresscore_r2score_r2_lagscore_r2_lag_ardatasetlagfeatureslagtargetdirectionexecution_time
0E1cyclostationary_mean_rr_1w_16 E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_00.2510.2510.457df_E1[0][1]backward0.688s
1E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_0 E1cyclostationary_mean_rr_1w_160.2510.2510.457df_E1[0][1]forward0.633s
2E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_00.2570.2650.421df_E1[0, 1][1]backward0.757s
3E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_00.2570.2650.421df_E1[0, 1][1]forward0.659s
4GMcyclostationary_mean_tg_1w_0 E2cyclostationary_mean_tg_0 E1cyclostationary_mean_rr_4w_10.2310.2310.405df_E1allfeatures[0][1]backward6.806s
5E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_0 E1cyclostationary_mean_rr_1w_160.2510.2510.457df_E1allfeatures[0][1]forward6.430s
6E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_00.2570.2650.421df_E1allfeatures[0, 1][1]backward7.315s
7E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_00.2570.2650.421df_E1allfeatures[0, 1][1]forward6.634s
8E2cyclostationary_mean_rr_8w_0 E2cyclostationary_mean_tg_00.1900.1900.423df_E2[0][1]backward1.123s
9E2cyclostationary_mean_tg_0 E2cyclostationary_mean_rr_8w_00.1900.1900.423df_E2[0][1]forward0.999s
10E2cyclostationary_mean_tg_00.1120.1460.391df_E2[0, 1][1]backward1.220s
11E2cyclostationary_mean_tg_00.1120.1460.391df_E2[0, 1][1]forward1.053s
12E2cyclostationary_mean_rr_8w_0 E2cyclostationary_mean_tg_00.1900.1900.423df_E2allfeatures[0][1]backward7.055s
13E2cyclostationary_mean_tg_0 E2cyclostationary_mean_rr_8w_00.1900.1900.423df_E2allfeatures[0][1]forward5.996s
14E2cyclostationary_mean_tg_00.1120.1460.391df_E2allfeatures[0, 1][1]backward7.272s
15GMcyclostationary_mean_tg_1w_00.1340.1460.422df_E2allfeatures[0, 1][1]forward6.608s
16GMcyclostationary_mean_rr_4w_1 GMcyclostationary_mean_tg_1w_00.1770.1770.359df_GM[0][1]backward0.357s
17GMcyclostationary_mean_tg_1w_0 GMcyclostationary_mean_rr_4w_10.1770.1770.359df_GM[0][1]forward0.350s
18GMcyclostationary_mean_tg_1w_00.0340.0500.411df_GM[0, 1][1]backward0.399s
19GMcyclostationary_mean_tg_1w_00.0340.0500.411df_GM[0, 1][1]forward0.356s
20E2cyclostationary_mean_tg_4w_0 GMcyclostationary_mean_rr_4w_1 E2cyclostationary_mean_tg_00.1580.1580.457df_GMallfeatures[0][1]backward6.943s
21E1cyclostationary_mean_rr_1w_16 E2cyclostationary_mean_tg_00.0130.0130.392df_GMallfeatures[0][1]forward6.280s
22E2cyclostationary_mean_rr_8w_0 E2cyclostationary_mean_tg_00.0640.0850.377df_GMallfeatures[0, 1][1]backward7.320s
23GMcyclostationary_mean_tg_1w_00.0340.0500.411df_GMallfeatures[0, 1][1]forward6.558s
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_file = os.path.join(constants.path_table_tex, \"e12gm_te_full.tex\")\n", + "export_results_dataframe_te(results_e12gm_te, target_file)\n", + "results_e12gm_te.style.pipe(make_te_all_pretty)" + ] + }, + { + "cell_type": "markdown", + "id": "b0494047", + "metadata": {}, + "source": [ + "### [Summarized version](#toc0_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### [PCMCI](#toc0_)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "ab8ddcde", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
selected_featuresscore_r2score_r2_lagscore_r2_lag_ardatasetalgorithmindependencetestlagexecution_time
0E1cyclostationary_mean_rr_4w_1 E1cyclostationa...0.2860710.2860710.286071df_E1pcmci_pluscmiknn0384.151514
1E1cyclostationary_mean_rr_4w_1 E1cyclostationa...0.2892500.2892500.289250df_E1pcmci_plusparcorr00.045593
2E1cyclostationary_mean_rr_4w_1 E1cyclostationa...0.2510600.2554660.419614df_E1pcmci_pluscmiknn11308.913636
3E1cyclostationary_mean_rr_4w_1 E1cyclostationa...0.2566390.2651900.420932df_E1pcmci_plusparcorr10.274470
4E1cyclostationary_mean_rr_4w_1 E1cyclostationa...0.1814240.1814240.181424df_E1allfeaturespcmci_pluscmiknn05532.617838
5E1cyclostationary_mean_rr_4w_1 E1cyclostationa...0.2542810.2542810.254281df_E1allfeaturespcmci_plusparcorr00.678409
6E1cyclostationary_mean_rr_4w_10.1505580.1476860.367535df_E1allfeaturespcmci_pluscmiknn17726.128555
7E1cyclostationary_mean_rr_4w_10.1505580.1476860.367535df_E1allfeaturespcmci_plusparcorr110.818769
8E2cyclostationary_mean_tg_0 E2cyclostationary_...0.2303890.2303890.230389df_E2pcmci_pluscmiknn0810.720993
9E2cyclostationary_mean_tg_0 E2cyclostationary_...0.2215220.2215220.221522df_E2pcmci_plusparcorr00.057513
10E2cyclostationary_mean_tg_0 E2cyclostationary_...0.2215220.2391270.415827df_E2pcmci_pluscmiknn11596.576730
11E2cyclostationary_mean_tg_0 E2cyclostationary_...0.2303890.2484500.401565df_E2pcmci_plusparcorr10.314501
12E1cyclostationary_mean_rr_24w_2 E2cyclostation...0.2977710.2977710.297771df_E2allfeaturespcmci_pluscmiknn05081.931851
13E1cyclostationary_mean_rr_4w_1 E2cyclostationa...0.1910820.1910820.191082df_E2allfeaturespcmci_plusparcorr00.341040
14E2cyclostationary_mean_tg_00.1115140.1461930.390651df_E2allfeaturespcmci_pluscmiknn15268.378825
15E2cyclostationary_mean_tg_00.1115140.1461930.390651df_E2allfeaturespcmci_plusparcorr18.094075
16GMcyclostationary_mean_tg_1w_0 GMcyclostationa...0.1765630.1765630.176563df_GMpcmci_pluscmiknn087.160898
17GMcyclostationary_mean_tg_1w_0 GMcyclostationa...0.1765630.1765630.176563df_GMpcmci_plusparcorr00.016086
18GMcyclostationary_mean_tg_1w_0 GMcyclostationa...0.1765630.2034890.442869df_GMpcmci_pluscmiknn1504.895575
19GMcyclostationary_mean_tg_1w_0 GMcyclostationa...0.1765630.2034890.442869df_GMpcmci_plusparcorr10.089332
20E1cyclostationary_mean_rr_1w_16 E1cyclostation...0.0879250.0879250.087925df_GMallfeaturespcmci_pluscmiknn01826.917106
21E1cyclostationary_mean_rr_4w_1 E2cyclostationa...0.1097710.1097710.109771df_GMallfeaturespcmci_plusparcorr00.345339
22E1cyclostationary_mean_rr_4w_1 E2cyclostationa...0.1097710.1199790.406443df_GMallfeaturespcmci_pluscmiknn14688.384780
23E1cyclostationary_mean_rr_4w_1 E2cyclostationa...0.1097710.1199790.406443df_GMallfeaturespcmci_plusparcorr19.835356
\n", + "
" + ], + "text/plain": [ + " selected_features score_r2 score_r2_lag \\\n", + "0 E1cyclostationary_mean_rr_4w_1 E1cyclostationa... 0.286071 0.286071 \n", + "1 E1cyclostationary_mean_rr_4w_1 E1cyclostationa... 0.289250 0.289250 \n", + "2 E1cyclostationary_mean_rr_4w_1 E1cyclostationa... 0.251060 0.255466 \n", + "3 E1cyclostationary_mean_rr_4w_1 E1cyclostationa... 0.256639 0.265190 \n", + "4 E1cyclostationary_mean_rr_4w_1 E1cyclostationa... 0.181424 0.181424 \n", + "5 E1cyclostationary_mean_rr_4w_1 E1cyclostationa... 0.254281 0.254281 \n", + "6 E1cyclostationary_mean_rr_4w_1 0.150558 0.147686 \n", + "7 E1cyclostationary_mean_rr_4w_1 0.150558 0.147686 \n", + "8 E2cyclostationary_mean_tg_0 E2cyclostationary_... 0.230389 0.230389 \n", + "9 E2cyclostationary_mean_tg_0 E2cyclostationary_... 0.221522 0.221522 \n", + "10 E2cyclostationary_mean_tg_0 E2cyclostationary_... 0.221522 0.239127 \n", + "11 E2cyclostationary_mean_tg_0 E2cyclostationary_... 0.230389 0.248450 \n", + "12 E1cyclostationary_mean_rr_24w_2 E2cyclostation... 0.297771 0.297771 \n", + "13 E1cyclostationary_mean_rr_4w_1 E2cyclostationa... 0.191082 0.191082 \n", + "14 E2cyclostationary_mean_tg_0 0.111514 0.146193 \n", + "15 E2cyclostationary_mean_tg_0 0.111514 0.146193 \n", + "16 GMcyclostationary_mean_tg_1w_0 GMcyclostationa... 0.176563 0.176563 \n", + "17 GMcyclostationary_mean_tg_1w_0 GMcyclostationa... 0.176563 0.176563 \n", + "18 GMcyclostationary_mean_tg_1w_0 GMcyclostationa... 0.176563 0.203489 \n", + "19 GMcyclostationary_mean_tg_1w_0 GMcyclostationa... 0.176563 0.203489 \n", + "20 E1cyclostationary_mean_rr_1w_16 E1cyclostation... 0.087925 0.087925 \n", + "21 E1cyclostationary_mean_rr_4w_1 E2cyclostationa... 0.109771 0.109771 \n", + "22 E1cyclostationary_mean_rr_4w_1 E2cyclostationa... 0.109771 0.119979 \n", + "23 E1cyclostationary_mean_rr_4w_1 E2cyclostationa... 
0.109771 0.119979 \n", + "\n", + " score_r2_lag_ar dataset algorithm independencetest lag \\\n", + "0 0.286071 df_E1 pcmci_plus cmiknn 0 \n", + "1 0.289250 df_E1 pcmci_plus parcorr 0 \n", + "2 0.419614 df_E1 pcmci_plus cmiknn 1 \n", + "3 0.420932 df_E1 pcmci_plus parcorr 1 \n", + "4 0.181424 df_E1allfeatures pcmci_plus cmiknn 0 \n", + "5 0.254281 df_E1allfeatures pcmci_plus parcorr 0 \n", + "6 0.367535 df_E1allfeatures pcmci_plus cmiknn 1 \n", + "7 0.367535 df_E1allfeatures pcmci_plus parcorr 1 \n", + "8 0.230389 df_E2 pcmci_plus cmiknn 0 \n", + "9 0.221522 df_E2 pcmci_plus parcorr 0 \n", + "10 0.415827 df_E2 pcmci_plus cmiknn 1 \n", + "11 0.401565 df_E2 pcmci_plus parcorr 1 \n", + "12 0.297771 df_E2allfeatures pcmci_plus cmiknn 0 \n", + "13 0.191082 df_E2allfeatures pcmci_plus parcorr 0 \n", + "14 0.390651 df_E2allfeatures pcmci_plus cmiknn 1 \n", + "15 0.390651 df_E2allfeatures pcmci_plus parcorr 1 \n", + "16 0.176563 df_GM pcmci_plus cmiknn 0 \n", + "17 0.176563 df_GM pcmci_plus parcorr 0 \n", + "18 0.442869 df_GM pcmci_plus cmiknn 1 \n", + "19 0.442869 df_GM pcmci_plus parcorr 1 \n", + "20 0.087925 df_GMallfeatures pcmci_plus cmiknn 0 \n", + "21 0.109771 df_GMallfeatures pcmci_plus parcorr 0 \n", + "22 0.406443 df_GMallfeatures pcmci_plus cmiknn 1 \n", + "23 0.406443 df_GMallfeatures pcmci_plus parcorr 1 \n", + "\n", + " execution_time \n", + "0 384.151514 \n", + "1 0.045593 \n", + "2 1308.913636 \n", + "3 0.274470 \n", + "4 5532.617838 \n", + "5 0.678409 \n", + "6 7726.128555 \n", + "7 10.818769 \n", + "8 810.720993 \n", + "9 0.057513 \n", + "10 1596.576730 \n", + "11 0.314501 \n", + "12 5081.931851 \n", + "13 0.341040 \n", + "14 5268.378825 \n", + "15 8.094075 \n", + "16 87.160898 \n", + "17 0.016086 \n", + "18 504.895575 \n", + "19 0.089332 \n", + "20 1826.917106 \n", + "21 0.345339 \n", + "22 4688.384780 \n", + "23 9.835356 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results_e12gm_pcmci" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "a8dcdca7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
score_r2_lag_arscore_r2_lagexecution_time
features_setsingleallsingleallsingleall
datasetindependencetestfeatures_lag
E1cmiknncontemporary0.2860710.1814240.2860710.181424384.1515145532.617838
contemporary + 1-lagged0.4196140.3675350.2554660.1476861308.9136367726.128555
parcorrcontemporary0.2892500.2542810.2892500.2542810.0455930.678409
contemporary + 1-lagged0.4209320.3675350.2651900.1476860.27447010.818769
E2cmiknncontemporary0.2303890.2977710.2303890.297771810.7209935081.931851
contemporary + 1-lagged0.4158270.3906510.2391270.1461931596.5767305268.378825
parcorrcontemporary0.2215220.1910820.2215220.1910820.0575130.341040
contemporary + 1-lagged0.4015650.3906510.2484500.1461930.3145018.094075
GMcmiknncontemporary0.1765630.0879250.1765630.08792587.1608981826.917106
contemporary + 1-lagged0.4428690.4064430.2034890.119979504.8955754688.384780
parcorrcontemporary0.1765630.1097710.1765630.1097710.0160860.345339
contemporary + 1-lagged0.4428690.4064430.2034890.1199790.0893329.835356
\n", + "
" + ], + "text/plain": [ + " score_r2_lag_ar \\\n", + "features_set single all \n", + "dataset independencetest features_lag \n", + "E1 cmiknn contemporary 0.286071 0.181424 \n", + " contemporary + 1-lagged 0.419614 0.367535 \n", + " parcorr contemporary 0.289250 0.254281 \n", + " contemporary + 1-lagged 0.420932 0.367535 \n", + "E2 cmiknn contemporary 0.230389 0.297771 \n", + " contemporary + 1-lagged 0.415827 0.390651 \n", + " parcorr contemporary 0.221522 0.191082 \n", + " contemporary + 1-lagged 0.401565 0.390651 \n", + "GM cmiknn contemporary 0.176563 0.087925 \n", + " contemporary + 1-lagged 0.442869 0.406443 \n", + " parcorr contemporary 0.176563 0.109771 \n", + " contemporary + 1-lagged 0.442869 0.406443 \n", + "\n", + " score_r2_lag \\\n", + "features_set single all \n", + "dataset independencetest features_lag \n", + "E1 cmiknn contemporary 0.286071 0.181424 \n", + " contemporary + 1-lagged 0.255466 0.147686 \n", + " parcorr contemporary 0.289250 0.254281 \n", + " contemporary + 1-lagged 0.265190 0.147686 \n", + "E2 cmiknn contemporary 0.230389 0.297771 \n", + " contemporary + 1-lagged 0.239127 0.146193 \n", + " parcorr contemporary 0.221522 0.191082 \n", + " contemporary + 1-lagged 0.248450 0.146193 \n", + "GM cmiknn contemporary 0.176563 0.087925 \n", + " contemporary + 1-lagged 0.203489 0.119979 \n", + " parcorr contemporary 0.176563 0.109771 \n", + " contemporary + 1-lagged 0.203489 0.119979 \n", + "\n", + " execution_time \n", + "features_set single all \n", + "dataset independencetest features_lag \n", + "E1 cmiknn contemporary 384.151514 5532.617838 \n", + " contemporary + 1-lagged 1308.913636 7726.128555 \n", + " parcorr contemporary 0.045593 0.678409 \n", + " contemporary + 1-lagged 0.274470 10.818769 \n", + "E2 cmiknn contemporary 810.720993 5081.931851 \n", + " contemporary + 1-lagged 1596.576730 5268.378825 \n", + " parcorr contemporary 0.057513 0.341040 \n", + " contemporary + 1-lagged 0.314501 8.094075 \n", + "GM cmiknn contemporary 
87.160898 1826.917106 \n", + " contemporary + 1-lagged 504.895575 4688.384780 \n", + " parcorr contemporary 0.016086 0.345339 \n", + " contemporary + 1-lagged 0.089332 9.835356 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results_e12gm_pcmci[\"features_lag\"] = results_e12gm_pcmci[\"lag\"].map(\n", + " {\n", + " 0: \"contemporary\",\n", + " 1: \"contemporary + 1-lagged\",\n", + " }\n", + ")\n", + "results_e12gm_pcmci[\"features_set\"] = results_e12gm_pcmci[\"dataset\"].apply(lambda x: \"all\" if \"all\" in x else \"single\")\n", + "\n", + "results_e12gm_pcmci[\"dataset\"] = results_e12gm_pcmci[\"dataset\"].apply(lambda x: x[3:5])\n", + "\n", + "results_e12gm_pcmci = results_e12gm_pcmci\\\n", + " .drop(columns=[\"selected_features\", \"algorithm\", \"lag\", \"score_r2\"])\\\n", + " .set_index([\"dataset\", \"independencetest\", \"features_lag\", \"features_set\"])\\\n", + " .unstack(\"features_set\").sort_index(axis=1, ascending=False) # fmt: off\n", + "\n", + "results_e12gm_pcmci" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "fd05d126-994e-4054-8772-796ecea12b36", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
   score_r2_lag_arscore_r2_lagexecution_time
  features_setsingleallsingleallsingleall
datasetindependencetestfeatures_lag      
E1cmiknncontemporary0.2860.1810.2860.181384.155532.62
contemporary + 1-lagged0.4200.3680.2550.1481308.917726.13
parcorrcontemporary0.2890.2540.2890.2540.050.68
contemporary + 1-lagged0.4210.3680.2650.1480.2710.82
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_file = os.path.join(constants.path_table_tex, \"e12gm_E1_pcmci.tex\")\n", + "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", + "export_pcmci_df_to_latex(results_e12gm_pcmci.iloc[:4, :], target_file)\n", + "results_e12gm_pcmci.iloc[:4, :].style.pipe(make_pcmci_pretty)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "7531accd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
   score_r2_lag_arscore_r2_lagexecution_time
  features_setsingleallsingleallsingleall
datasetindependencetestfeatures_lag      
E2cmiknncontemporary0.2300.2980.2300.298810.725081.93
contemporary + 1-lagged0.4160.3910.2390.1461596.585268.38
parcorrcontemporary0.2220.1910.2220.1910.060.34
contemporary + 1-lagged0.4020.3910.2480.1460.318.09
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_file = os.path.join(constants.path_table_tex, \"e12gm_E2_pcmci.tex\")\n", + "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", + "export_pcmci_df_to_latex(results_e12gm_pcmci.iloc[4:8, :], target_file)\n", + "results_e12gm_pcmci.iloc[4:8, :].style.pipe(make_pcmci_pretty)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "382a776d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
   score_r2_lag_arscore_r2_lagexecution_time
  features_setsingleallsingleallsingleall
datasetindependencetestfeatures_lag      
GMcmiknncontemporary0.1770.0880.1770.08887.161826.92
contemporary + 1-lagged0.4430.4060.2030.120504.904688.38
parcorrcontemporary0.1770.1100.1770.1100.020.35
contemporary + 1-lagged0.4430.4060.2030.1200.099.84
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_file = os.path.join(constants.path_table_tex, \"e12gm_GM_pcmci.tex\")\n", + "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", + "export_pcmci_df_to_latex(results_e12gm_pcmci.iloc[8:, :], target_file)\n", + "results_e12gm_pcmci.iloc[8:, :].style.pipe(make_pcmci_pretty)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### [TEFS](#toc0_)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "340b625d", + "metadata": {}, + "outputs": [], + "source": [ + "def check_list(cell):\n", + " if cell == [0]:\n", + " return \"contemporary\"\n", + " elif cell == [0, 1]:\n", + " return \"contemporary + 1-lagged\"\n", + "\n", + "results_e12gm_te[\"features_lag\"] = results_e12gm_te[\"lagfeatures\"].apply(check_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "fb7a85b6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
score_r2_lag_arscore_r2_lagexecution_time
features_setsingleallsingleallsingleall
datasetdirectionfeatures_lag
E1backwardcontemporary0.4568400.4050680.2510600.2314620.6881956.806478
contemporary + 1-lagged0.4209320.4209320.2651900.2651900.7572997.315034
forwardcontemporary0.4568400.4568400.2510600.2510600.6331476.430187
contemporary + 1-lagged0.4209320.4209320.2651900.2651900.6592326.633933
E2backwardcontemporary0.4233740.4233740.1903340.1903341.1225267.054950
contemporary + 1-lagged0.3906510.3906510.1461930.1461931.2195917.271728
forwardcontemporary0.4233740.4233740.1903340.1903340.9987145.996039
contemporary + 1-lagged0.3906510.4218090.1461930.1461691.0533896.607501
GMbackwardcontemporary0.3591310.4572280.1765630.1578900.3567086.942717
contemporary + 1-lagged0.4105810.3767460.0500870.0853620.3992917.320499
forwardcontemporary0.3591310.3916540.1765630.0129590.3497546.280308
contemporary + 1-lagged0.4105810.4105810.0500870.0500870.3560366.558476
\n", + "
" + ], + "text/plain": [ + " score_r2_lag_ar \\\n", + "features_set single all \n", + "dataset direction features_lag \n", + "E1 backward contemporary 0.456840 0.405068 \n", + " contemporary + 1-lagged 0.420932 0.420932 \n", + " forward contemporary 0.456840 0.456840 \n", + " contemporary + 1-lagged 0.420932 0.420932 \n", + "E2 backward contemporary 0.423374 0.423374 \n", + " contemporary + 1-lagged 0.390651 0.390651 \n", + " forward contemporary 0.423374 0.423374 \n", + " contemporary + 1-lagged 0.390651 0.421809 \n", + "GM backward contemporary 0.359131 0.457228 \n", + " contemporary + 1-lagged 0.410581 0.376746 \n", + " forward contemporary 0.359131 0.391654 \n", + " contemporary + 1-lagged 0.410581 0.410581 \n", + "\n", + " score_r2_lag \\\n", + "features_set single all \n", + "dataset direction features_lag \n", + "E1 backward contemporary 0.251060 0.231462 \n", + " contemporary + 1-lagged 0.265190 0.265190 \n", + " forward contemporary 0.251060 0.251060 \n", + " contemporary + 1-lagged 0.265190 0.265190 \n", + "E2 backward contemporary 0.190334 0.190334 \n", + " contemporary + 1-lagged 0.146193 0.146193 \n", + " forward contemporary 0.190334 0.190334 \n", + " contemporary + 1-lagged 0.146193 0.146169 \n", + "GM backward contemporary 0.176563 0.157890 \n", + " contemporary + 1-lagged 0.050087 0.085362 \n", + " forward contemporary 0.176563 0.012959 \n", + " contemporary + 1-lagged 0.050087 0.050087 \n", + "\n", + " execution_time \n", + "features_set single all \n", + "dataset direction features_lag \n", + "E1 backward contemporary 0.688195 6.806478 \n", + " contemporary + 1-lagged 0.757299 7.315034 \n", + " forward contemporary 0.633147 6.430187 \n", + " contemporary + 1-lagged 0.659232 6.633933 \n", + "E2 backward contemporary 1.122526 7.054950 \n", + " contemporary + 1-lagged 1.219591 7.271728 \n", + " forward contemporary 0.998714 5.996039 \n", + " contemporary + 1-lagged 1.053389 6.607501 \n", + "GM backward contemporary 0.356708 6.942717 \n", + " 
contemporary + 1-lagged 0.399291 7.320499 \n", + " forward contemporary 0.349754 6.280308 \n", + " contemporary + 1-lagged 0.356036 6.558476 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results_e12gm_te[\"features_set\"] = results_e12gm_te[\"dataset\"].apply(lambda x: \"all\" if \"all\" in x else \"single\")\n", + "# df_e12gm_te[\"CMI\"] = df_e12gm_te[\"dataset\"].apply(lambda x: \"noCMI\" if \"noCMI\" in x else \"yesCMI\")\n", + "results_e12gm_te[\"dataset\"] = results_e12gm_te[\"dataset\"].apply(lambda x: x[3:5])\n", + "\n", + "results_e12gm_te = results_e12gm_te\\\n", + " .drop(columns=[\"selected_features\", \"lagtarget\", \"lagfeatures\", \"score_r2\"])\\\n", + " .set_index([\"dataset\", \"direction\", \"features_lag\", \"features_set\"])\\\n", + " .unstack(\"features_set\").sort_index(axis=1, ascending=False) # fmt: off\n", + "\n", + "results_e12gm_te" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "ec374849", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
   score_r2_lag_arscore_r2_lagexecution_time
  features_setsingleallsingleallsingleall
datasetdirectionfeatures_lag      
E1backwardcontemporary0.4570.4050.2510.2310.696.81
contemporary + 1-lagged0.4210.4210.2650.2650.767.32
forwardcontemporary0.4570.4570.2510.2510.636.43
contemporary + 1-lagged0.4210.4210.2650.2650.666.63
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_file = os.path.join(constants.path_table_tex, \"e12gm_E1_te.tex\")\n", + "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", + "export_te_df_to_latex(results_e12gm_te.iloc[:4, :], target_file)\n", + "results_e12gm_te.iloc[:4, :].style.pipe(make_te_pretty)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "40017b46", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
   score_r2_lag_arscore_r2_lagexecution_time
  features_setsingleallsingleallsingleall
datasetdirectionfeatures_lag      
E2backwardcontemporary0.4230.4230.1900.1901.127.05
contemporary + 1-lagged0.3910.3910.1460.1461.227.27
forwardcontemporary0.4230.4230.1900.1901.006.00
contemporary + 1-lagged0.3910.4220.1460.1461.056.61
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_file = os.path.join(constants.path_table_tex, \"e12gm_E2_te.tex\")\n", + "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", + "export_te_df_to_latex(results_e12gm_te.iloc[4:8, :], target_file)\n", + "results_e12gm_te.iloc[4:8, :].style.pipe(make_te_pretty)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "186abb37", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
   score_r2_lag_arscore_r2_lagexecution_time
  features_setsingleallsingleallsingleall
datasetdirectionfeatures_lag      
GMbackwardcontemporary0.3590.4570.1770.1580.366.94
contemporary + 1-lagged0.4110.3770.0500.0850.407.32
forwardcontemporary0.3590.3920.1770.0130.356.28
contemporary + 1-lagged0.4110.4110.0500.0500.366.56
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_file = os.path.join(constants.path_table_tex, \"e12gm_GM_te.tex\")\n", + "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", + "export_te_df_to_latex(results_e12gm_te.iloc[8:, :], target_file)\n", + "results_e12gm_te.iloc[8:, :].style.pipe(make_te_pretty)" + ] + }, + { + "cell_type": "markdown", + "id": "7a1a352c", + "metadata": {}, + "source": [ + "### [Full version without CMI](#toc0_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### [TEFS](#toc0_)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "1be8b53b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 selected_featuresscore_r2score_r2_lagscore_r2_lag_ardatasetlagfeatureslagtargetdirectionexecution_time
0E1cyclostationary_mean_tg_12w_3 E1cyclostationary_mean_rr_1w_10 E1cyclostationary_mean_tg_8w_1 E1cyclostationary_mean_tg_6 E1cyclostationary_mean_tg_12w_2 E1cyclostationary_mean_rr_1w_9 E1cyclostationary_mean_tg_0 E1cyclostationary_mean_rr_4w_10.2760.2760.482df_E1_noCMI[0][1]backward717.581s
1E1cyclostationary_mean_rr_12w_0 E1cyclostationary_mean_rr_1w_5 E1cyclostationary_mean_tg_60.2540.2540.471df_E1_noCMI[0][1]forward641.229s
2E1cyclostationary_mean_rr_8w_4 E1cyclostationary_mean_tg_00.2610.2620.402df_E1_noCMI[0, 1][1]backward766.952s
3E1cyclostationary_mean_rr_12w_0 E1cyclostationary_mean_tg_6 E1cyclostationary_mean_rr_8w_40.2150.2120.395df_E1_noCMI[0, 1][1]forward633.929s
4E1cyclostationary_mean_rr_8w_4 E1cyclostationary_mean_tg_60.2040.2040.424df_E1allfeatures_noCMI[0][1]backward2451.030s
5E1cyclostationary_mean_rr_12w_0 GMcyclostationary_mean_tg_0 E1cyclostationary_mean_rr_1w_50.2810.2810.484df_E1allfeatures_noCMI[0][1]forward2047.873s
6E1cyclostationary_mean_rr_12w_4 E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_12w_0 E1cyclostationary_mean_tg_8w_4 E1cyclostationary_mean_tg_12w_7 E2cyclostationary_mean_tg_8w_2 E1cyclostationary_mean_tg_60.2570.3060.428df_E1allfeatures_noCMI[0, 1][1]backward2612.832s
7E1cyclostationary_mean_rr_12w_0 E1cyclostationary_mean_tg_6 E1cyclostationary_mean_rr_8w_40.2150.2120.395df_E1allfeatures_noCMI[0, 1][1]forward2190.678s
8E2cyclostationary_mean_tg_1 E2cyclostationary_mean_tg_3 E2cyclostationary_mean_tg_4 E2cyclostationary_mean_tg_8w_20.1740.1740.461df_E2_noCMI[0][1]backward148.270s
9E2cyclostationary_mean_tg_1 E2cyclostationary_mean_rr_1w_30.1980.1980.440df_E2_noCMI[0][1]forward124.161s
10E2cyclostationary_mean_tg_4 E2cyclostationary_mean_tg_0 E2cyclostationary_mean_tg_1w_1 E2cyclostationary_mean_rr_8w_00.1900.2560.440df_E2_noCMI[0, 1][1]backward160.542s
11E2cyclostationary_mean_tg_1w_00.1700.1800.451df_E2_noCMI[0, 1][1]forward135.600s
12E2cyclostationary_mean_tg_00.1120.1120.408df_E2allfeatures_noCMI[0][1]backward2488.332s
13E2cyclostationary_mean_tg_1 E1cyclostationary_mean_rr_1w_150.1560.1560.421df_E2allfeatures_noCMI[0][1]forward2019.690s
14E2cyclostationary_mean_tg_1w_00.1700.1800.451df_E2allfeatures_noCMI[0, 1][1]backward3022.579s
15E2cyclostationary_mean_tg_1w_00.1700.1800.451df_E2allfeatures_noCMI[0, 1][1]forward2181.964s
16GMcyclostationary_mean_tg_8w_0 GMcyclostationary_mean_rr_24w_0 GMcyclostationary_mean_tg_4w_0 GMcyclostationary_mean_rr_12w_1 GMcyclostationary_mean_tg_00.2130.2130.451df_GM_noCMI[0][1]backward47.713s
17GMcyclostationary_mean_tg_0 GMcyclostationary_mean_rr_1w_1 GMcyclostationary_mean_rr_1w_30.1380.1380.440df_GM_noCMI[0][1]forward41.189s
18GMcyclostationary_mean_rr_12w_0 GMcyclostationary_mean_tg_8w_0 GMcyclostationary_mean_rr_12w_1 GMcyclostationary_mean_tg_1w_10.2460.2340.446df_GM_noCMI[0, 1][1]backward55.535s
19GMcyclostationary_mean_tg_1w_10.0580.0590.430df_GM_noCMI[0, 1][1]forward42.982s
20E2cyclostationary_mean_tg_1w_1 E2cyclostationary_mean_tg_40.0400.0400.379df_GMallfeatures_noCMI[0][1]backward2462.089s
21E1cyclostationary_mean_tg_8w_4 E1cyclostationary_mean_rr_8w_0 E2cyclostationary_mean_tg_40.1120.1120.427df_GMallfeatures_noCMI[0][1]forward2019.530s
22E2cyclostationary_mean_tg_40.0160.0440.376df_GMallfeatures_noCMI[0, 1][1]backward2700.420s
23E2cyclostationary_mean_tg_1w_00.0490.0660.439df_GMallfeatures_noCMI[0, 1][1]forward2170.156s
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_file = os.path.join(constants.path_table_tex, \"e12gm_noCMI_te_full.tex\")\n", + "export_results_dataframe_te(results_e12gm_noCMI_te, target_file)\n", + "results_e12gm_noCMI_te.style.pipe(make_te_all_pretty)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### [TEFS as wrapper on E12GM](#toc0_)" + ] + }, + { + "cell_type": "markdown", + "id": "9dfb99d4", + "metadata": {}, + "source": [ + "In this way the method becomes a wrapper because we are making a selection solely by looking at the performance in regression." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "93010e78", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1allfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1allfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1allfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + 
" 'te_e12gm_datasetdf_E1allfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E2_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_E2_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E2_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_E2_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E2_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_E2_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E2_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_E2_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E2allfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_E2allfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E2allfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_E2allfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 
'te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_GM_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_GM_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_GM_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_GM_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_GM_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_GM_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_GM_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_GM_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_GMallfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_GMallfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_GMallfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_GMallfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 
'te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_ticino_datasetnormal_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_ticino_datasetnormal_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_ticino_datasetnormal_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_ticino_datasetnormal_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_ticino_datasetsnowlakes_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_ticino_datasetsnowlakes_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_ticino_datasetsnowlakes_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_ticino_datasetsnowlakes_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl']" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load all TEFS simulations\n", + "results_files = sorted([file for file in os.listdir(constants.path_results) if file.endswith(\".pkl\")])\n", + "config_list = [file for file in results_files if file.split(\"_\")[0] == \"te\"]\n", + "config_list" + ] + }, + { + "cell_type": "markdown", + "id": "cdb4f193", + "metadata": {}, + "source": [ + "Here I make two plots, one with a single line using fixed train and test, and one with cross-validation, in this case with `KFold`, but a version with `TimeSeriesSplit` is also available." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "f9444671", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Skipping te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1allfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1allfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1allfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1allfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping 
te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping 
te_e12gm_datasetdf_GM_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GM_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GM_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GM_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GM_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GM_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GM_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GM_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_ticino_datasetnormal_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping 
te_ticino_datasetnormal_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_ticino_datasetnormal_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_ticino_datasetnormal_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_ticino_datasetsnowlakes_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_ticino_datasetsnowlakes_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_ticino_datasetsnowlakes_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_ticino_datasetsnowlakes_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n" + ] + } + ], + "source": [ + "for config_name in config_list:\n", + " basename = os.path.splitext(os.path.basename(config_name))[0]\n", + " target_file_train_test = os.path.join(constants.path_figures, \"tefs_as_wrapper\", f\"{basename}_wrapper.pdf\")\n", + " target_file_cv = os.path.join(constants.path_figures, \"tefs_as_wrapper_cv\", f\"{basename}_wrapper_cv.pdf\")\n", + " if os.path.exists(target_file_train_test) and os.path.exists(target_file_cv):\n", + " print(f\"Skipping {config_name}...\")\n", + " continue\n", + "\n", + " print(f\"Processing {config_name}...\")\n", + "\n", + " # --------------------- Load simulation ---------------------\n", + " simulation = file_management.load_from_pkl_file(\n", + " os.path.join(\n", + " constants.path_results,\n", + " config_name,\n", + " )\n", + " )\n", + "\n", + " # --------------------- Load corresponding dataset ---------------------\n", + " basin_name = config_name.split(\"_\")[1]\n", + " datasets, _ = datasets_and_configurations_loaders[\"te\"].get(basin_name)()\n", + " dataset_name = simulation[\"dataset_name\"]\n", + " dataframe = datasets[dataset_name]\n", + "\n", + " target_columns = [\"target\"]\n", + " features_columns = 
dataframe[\"full\"].drop(columns=target_columns).columns\n", + "\n", + " # --------------------- Select features using threshold (conservative) ---------------------\n", + " selected_features_names_with_threshold = simulation[\"results\"].select_features(simulation[\"params\"][\"threshold\"])\n", + " n_features_selected_with_threshold = len(selected_features_names_with_threshold)\n", + "\n", + " # --------------------- Compute test R2 for each number of features ---------------------\n", + " test_r2_train_test = []\n", + " test_r2_cv = []\n", + " num_total_features = len(dataframe[\"full\"].columns) - 1 # -1 because the last column is the target\n", + " for num_features in range(0, num_total_features + 1):\n", + " if num_features == 0:\n", + " selected_features_names = []\n", + " else:\n", + " selected_features_names = simulation[\"results\"].select_n_features(num_features)\n", + "\n", + " lagfeatures = simulation[\"params\"][\"lagfeatures\"]\n", + " lagtarget = simulation[\"params\"][\"lagtarget\"]\n", + "\n", + " inputs_names_lags = {feature: lagfeatures for feature in selected_features_names}\n", + " inputs_names_lags[\"target\"] = lagtarget\n", + "\n", + " # --- Compute the train_test version ---\n", + " test_r2_train_test.append(\n", + " regression_analysis(\n", + " inputs_names_lags=inputs_names_lags,\n", + " target_name=target_columns[0],\n", + " df_train=dataframe[\"train\"],\n", + " df_test=dataframe[\"test\"],\n", + " )\n", + " )\n", + "\n", + " # --- Compute the cross-validation version ---\n", + " # To perform a cross-validation, we need to concatenate the train and test sets\n", + " unified_df = pd.concat([dataframe[\"train\"], dataframe[\"test\"]], axis=0).reset_index(drop=True)\n", + "\n", + " # Fixed window size\n", + " # n_samples = unified_df.shape[0]\n", + " # n_splits = 5\n", + " # cv_scheme = TimeSeriesSplit(\n", + " # n_splits=n_splits,\n", + " # max_train_size=n_samples // (n_splits + 1),\n", + " # )\n", + "\n", + " # Regular KFold\n", + " 
cv_scheme = KFold(n_splits=4) # 4 splits is about using the same test set size\n", + "\n", + " test_r2_cv.append(\n", + " regression_analysis(\n", + " inputs_names_lags=inputs_names_lags,\n", + " target_name=target_columns[0],\n", + " df=unified_df,\n", + " cv_scheme=cv_scheme,\n", + " )\n", + " )\n", + "\n", + " test_r2_train_test = np.array(test_r2_train_test)\n", + " test_r2_cv = np.array(test_r2_cv)\n", + "\n", + " # --------------------- Plot train test version ---------------------\n", + " fig, ax = plt.subplots(figsize=(10, 5))\n", + " ax.plot(test_r2_train_test, marker=\"o\", label=\"Fixed train-test\")\n", + " maxima = np.where(test_r2_train_test == test_r2_train_test.max())[0]\n", + " ax.plot(maxima, test_r2_train_test[maxima], marker=\"o\", color=\"red\", linestyle=\"None\", label=\"Maximum\", markersize=10)\n", + " ax.plot(n_features_selected_with_threshold, test_r2_train_test[n_features_selected_with_threshold], marker=\"o\", color=\"green\", linestyle=\"None\", label=\"TEFS (conservative)\", markersize=10)\n", + " ax.set_xlabel(\"Number of features\")\n", + " ax.set_ylabel(\"Test $R^2$\")\n", + "\n", + " if simulation[\"params\"][\"threshold\"] == np.inf:\n", + " threshold_text = \"\\infty\"\n", + " elif simulation[\"params\"][\"threshold\"] == -np.inf:\n", + " threshold_text = \"-\\infty\"\n", + " else:\n", + " threshold_text = simulation[\"params\"][\"threshold\"]\n", + "\n", + " title_text = f\"TEFS on basin {basin_name.upper()} with dataset {dataset_name}\\n[lagfeatures $={simulation['params']['lagfeatures']}$, lagtarget $={simulation['params']['lagtarget']}$, direction = {simulation['params']['direction']}, threshold $={threshold_text}]$\"\n", + " ax.set_title(title_text)\n", + " ax.legend()\n", + " if num_total_features < 30:\n", + " step = 1\n", + " elif num_total_features < 80:\n", + " step = 5\n", + " else:\n", + " step = 10\n", + " ax.set_xticks(range(0, num_total_features + 1, step))\n", + " ax.set_xticklabels(range(0, num_total_features + 
1, step))\n", + " ax.set_ylim(-0.1, 0.55)\n", + " ax.grid()\n", + "\n", + " os.makedirs(os.path.dirname(target_file_train_test), exist_ok=True)\n", + " plt.savefig(target_file_train_test, bbox_inches=\"tight\")\n", + " plt.close(fig)\n", + "\n", + " # --------------------- Plot cross-validation version ---------------------\n", + " fig, ax = plt.subplots(figsize=(10, 5))\n", + " ax.plot(test_r2_cv.mean(axis=1), marker=\"o\", label=\"Cross-validation\")\n", + " maxima = np.where(test_r2_cv.mean(axis=1) == test_r2_cv.mean(axis=1).max())[0]\n", + " ax.plot(maxima, test_r2_cv.mean(axis=1)[maxima], marker=\"o\", color=\"red\", linestyle=\"None\", label=\"Maximum\", markersize=10)\n", + " ax.plot(n_features_selected_with_threshold, test_r2_cv.mean(axis=1)[n_features_selected_with_threshold], marker=\"o\", color=\"green\", linestyle=\"None\", label=\"TEFS (conservative)\", markersize=10)\n", + "\n", + " # plot confidence interval bands from cross-validation based on mean and standard deviation (90% confidence)\n", + " alpha = 0.1\n", + " quantile = scipy.stats.norm.ppf(1 - alpha / 2)\n", + " ax.fill_between(range(test_r2_cv.shape[0]), test_r2_cv.mean(axis=1) - test_r2_cv.std(axis=1) * quantile / np.sqrt(test_r2_cv.shape[1]), test_r2_cv.mean(axis=1) + test_r2_cv.std(axis=1) * quantile / np.sqrt(test_r2_cv.shape[1]), alpha=0.3)\n", + "\n", + " ax.set_xlabel(\"Number of features\")\n", + " ax.set_ylabel(\"Test $R^2$\")\n", + "\n", + " if simulation[\"params\"][\"threshold\"] == np.inf:\n", + " threshold_text = \"\\infty\"\n", + " elif simulation[\"params\"][\"threshold\"] == -np.inf:\n", + " threshold_text = \"-\\infty\"\n", + " else:\n", + " threshold_text = simulation[\"params\"][\"threshold\"]\n", + "\n", + " title_text = f\"TEFS on basin {basin_name.upper()} with dataset {dataset_name}\\n[lagfeatures $={simulation['params']['lagfeatures']}$, lagtarget $={simulation['params']['lagtarget']}$, direction = {simulation['params']['direction']}, threshold 
$={threshold_text}]$\"\n", + " ax.set_title(title_text)\n", + " ax.legend()\n", + " if num_total_features < 30:\n", + " step = 1\n", + " elif num_total_features < 80:\n", + " step = 5\n", + " else:\n", + " step = 10\n", + " ax.set_xticks(range(0, num_total_features + 1, step))\n", + " ax.set_xticklabels(range(0, num_total_features + 1, step))\n", + " ax.set_ylim(-0.1, 0.55)\n", + " ax.grid()\n", + "\n", + " os.makedirs(os.path.dirname(target_file_cv), exist_ok=True)\n", + " plt.savefig(target_file_cv, bbox_inches=\"tight\")\n", + " plt.close(fig)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### [Linking the wrapper to the original filter method](#toc0_)" + ] + }, + { + "cell_type": "markdown", + "id": "b5ce711d", + "metadata": {}, + "source": [ + "Here I look for all configurations without CMI and match them to those with CMI. I show the plot above where I see the algorithm as a wrapper and highlight with vertical bars the points at which the variables chosen in the version with CMI were added/removed. There is also the option to choose variables manually (ideally the most common ones \"by eye\")." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "4fc3f237", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('te_e12gm_datasetdf_E1_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl'),\n", + " ('te_e12gm_datasetdf_E1_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", + " 'te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl')]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "config_matches = []\n", + "for config_name in config_list:\n", + " if \"noCMI\" in config_name and config_name.replace(\"_noCMI\", \"\") in config_list:\n", + " config_matches.append((config_name, config_name.replace(\"_noCMI\", \"\")))\n", + "\n", + "config_matches[:2]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "8e0836bb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Skipping te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1allfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1allfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E1allfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping 
te_e12gm_datasetdf_E1allfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GM_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GM_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GM_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GM_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", + "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n" + ] + } + ], + "source": [ + "for config_name_noCMI, config_name in 
config_matches:\n", + " basename = os.path.splitext(os.path.basename(config_name_noCMI))[0]\n", + " target_file = os.path.join(constants.path_figures, \"tefs_as_wrapper_mapping_filter\", f\"{basename}_wrapper_mapping_filter.pdf\")\n", + " if os.path.exists(target_file):\n", + " print(f\"Skipping {config_name}...\")\n", + " continue\n", + "\n", + " print(f\"Processing {config_name}...\")\n", + "\n", + " simulation_noCMI = file_management.load_from_pkl_file(\n", + " os.path.join(\n", + " constants.path_results,\n", + " config_name_noCMI,\n", + " )\n", + " )\n", + "\n", + " simulation = file_management.load_from_pkl_file(\n", + " os.path.join(\n", + " constants.path_results,\n", + " config_name,\n", + " )\n", + " )\n", + "\n", + " # feature selected with CMI\n", + " basin_name = config_name.split(\"_\")[1]\n", + " datasets, _ = datasets_and_configurations_loaders[\"te\"].get(basin_name)()\n", + " dataset_name = simulation[\"dataset_name\"]\n", + " dataframe = datasets[dataset_name]\n", + " target_columns = [\"target\"]\n", + " features_columns = dataframe[\"full\"].drop(columns=target_columns).columns\n", + "\n", + " selected_features_names_with_threshold = simulation[\"results\"].select_features(simulation[\"params\"][\"threshold\"])\n", + " n_features_selected_with_threshold = len(selected_features_names_with_threshold)\n", + "\n", + " # choose manually\n", + " selected_features_names_with_threshold = [\"E1cyclostationary_mean_rr_4w_1\", \"E2cyclostationary_mean_tg_0\"]\n", + "\n", + " # Load the noCMI version and process it\n", + " basin_name = config_name_noCMI.split(\"_\")[1]\n", + " datasets, _ = datasets_and_configurations_loaders[\"te\"].get(basin_name)()\n", + " dataset_name_noCMI = simulation_noCMI[\"dataset_name\"]\n", + " dataframe_noCMI = datasets[dataset_name_noCMI]\n", + " target_columns = [\"target\"]\n", + " features_columns_noCMI = dataframe_noCMI[\"full\"].drop(columns=target_columns).columns\n", + "\n", + " test_r2_train_test = []\n", + "\n", + " 
selected_features_names_previous = [] # new part\n", + " corresponding_features_indexes = {} # new part\n", + "\n", + " num_total_features = len(dataframe_noCMI[\"full\"].columns) - 1 # -1 because the last column is the target\n", + " for num_features in range(0, num_total_features + 1):\n", + " if num_features == 0:\n", + " selected_features_names = []\n", + " else:\n", + " selected_features_names_previous = selected_features_names.copy() # new part\n", + " selected_features_names = simulation_noCMI[\"results\"].select_n_features(num_features)\n", + "\n", + " # if the feature that has been just added is in selected_features_names_with_threshold, add num_features to corresponding_features_indexes\n", + " # looking at the set difference\n", + " new_feature_name = list(set(selected_features_names).difference(set(selected_features_names_previous)))[0]\n", + " if new_feature_name in selected_features_names_with_threshold:\n", + " corresponding_features_indexes[num_features] = new_feature_name\n", + "\n", + " lagfeatures = simulation_noCMI[\"params\"][\"lagfeatures\"]\n", + " lagtarget = simulation_noCMI[\"params\"][\"lagtarget\"]\n", + "\n", + " inputs_names_lags = {feature: lagfeatures for feature in selected_features_names}\n", + " inputs_names_lags[\"target\"] = lagtarget\n", + "\n", + " # --- Compute the train_test version ---\n", + " test_r2_train_test.append(\n", + " regression_analysis(\n", + " inputs_names_lags=inputs_names_lags,\n", + " target_name=target_columns[0],\n", + " df_train=dataframe_noCMI[\"train\"],\n", + " df_test=dataframe_noCMI[\"test\"],\n", + " )\n", + " )\n", + "\n", + " test_r2_train_test = np.array(test_r2_train_test)\n", + "\n", + " # --------------------- Plot ---------------------\n", + " fig, ax = plt.subplots(figsize=(10, 5))\n", + " ax.plot(test_r2_train_test, marker=\"o\", label=\"Fixed train-test\")\n", + "\n", + " # Get the default color cycle\n", + " color_cycle = plt.rcParams[\"axes.prop_cycle\"].by_key()[\"color\"]\n", + "\n", + 
" # plot vertical lines in corresponding_features_indexes\n", + " for i, (key, value) in enumerate(corresponding_features_indexes.items()):\n", + " ax.axvline(x=key, linestyle=\"--\", color=color_cycle[i + 1 % len(color_cycle)], label=f\"{value}\")\n", + "\n", + " maxima = np.where(test_r2_train_test == test_r2_train_test.max())[0]\n", + " ax.plot(maxima, test_r2_train_test[maxima], marker=\"o\", color=\"red\", linestyle=\"None\", label=\"Maximum\", markersize=10)\n", + " ax.plot(n_features_selected_with_threshold, test_r2_train_test[n_features_selected_with_threshold], marker=\"o\", color=\"green\", linestyle=\"None\", label=\"TEFS (conservative)\", markersize=10)\n", + " ax.set_xlabel(\"Number of features\")\n", + " ax.set_ylabel(\"Test $R^2$\")\n", + "\n", + " if simulation[\"params\"][\"threshold\"] == np.inf:\n", + " threshold_text = \"\\infty\"\n", + " elif simulation[\"params\"][\"threshold\"] == -np.inf:\n", + " threshold_text = \"-\\infty\"\n", + " else:\n", + " threshold_text = simulation[\"params\"][\"threshold\"]\n", + "\n", + " title_text = f\"TEFS on basin {basin_name.upper()} with dataset {dataset_name}\\n[lagfeatures $={simulation['params']['lagfeatures']}$, lagtarget $={simulation['params']['lagtarget']}$, direction = {simulation['params']['direction']}, threshold $={threshold_text}]$\"\n", + " ax.set_title(title_text)\n", + " ax.legend()\n", + " if num_total_features < 30:\n", + " step = 1\n", + " elif num_total_features < 80:\n", + " step = 5\n", + " else:\n", + " step = 10\n", + "\n", + " ax.set_xticks(range(0, num_total_features + 1, step))\n", + " ax.set_xticklabels(range(0, num_total_features + 1, step))\n", + " ax.set_ylim(-0.1, 0.55)\n", + " ax.grid()\n", + "\n", + " os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", + " plt.savefig(target_file, bbox_inches=\"tight\")\n", + " plt.close(fig)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + 
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hawk/analysis/run_simulation_pcmci.py b/hawk/analysis/run_simulation_pcmci.py new file mode 100644 index 0000000..85ac217 --- /dev/null +++ b/hawk/analysis/run_simulation_pcmci.py @@ -0,0 +1,109 @@ +import argparse +import os +import time + +import thesis.constants as constants +import thesis.file_management as file_management +from thesis import datasets_and_configurations_loaders +from tigramite.pcmci import PCMCI + + +def main(): + # Argument parsing + parser = argparse.ArgumentParser(description="Run simulation script for Transfer Entropy analysis.") + parser.add_argument("--basin", type=str, help="Name of the basin") + args = parser.parse_args() + + loader = datasets_and_configurations_loaders["pcmci"].get(args.basin) + + if not loader: + raise ValueError("Invalid basin name") + + datasets, configurations, independence_tests = loader() + + # Run experiments + for config in configurations: + params = config["params"] + dataset_name = config["dataset_name"] + dataframe = datasets[dataset_name] + + independence_test = independence_tests[params["independencetest"]] + algorithm = params["algorithm"] + lag = params["lag"] + + # Construct a unique identifier for the configuration + param_str = "_".join(f"{k}{v}" for k, v in params.items()) + config_id = f"dataset{dataset_name}_{param_str}" + target_file = os.path.join(constants.path_results, f"pcmci_{args.basin}_{config_id}.pkl") + if os.path.exists(target_file): + print(f"Skipping config {config_id} because results already exist") + continue + + print(f"Running experiment with config: {config}") + + pcmci = PCMCI(dataframe=dataframe["full_tigramite"], cond_ind_test=independence_test, verbosity=2) + + # if 
inspect_data: + # # Investigating data dependencies and lag functions + # correlations = pcmci.get_lagged_dependencies(tau_max=20, val_only=True)['val_matrix'] + + # matrix_lags = None #np.argmax(np.abs(correlations), axis=2) + # tp.plot_scatterplots(dataframe=dataframe, add_scatterplot_args={'matrix_lags':matrix_lags}); plt.show() + + # tp.plot_densityplots(dataframe=dataframe, add_densityplot_args={'matrix_lags':matrix_lags}); plt.show() + + start_time = time.time() + if algorithm == "pcmci": + results = pcmci.run_pcmci(tau_max=lag, pc_alpha=0.05, alpha_level=0.01) + elif algorithm == "pcmci_plus": + results = pcmci.run_pcmciplus(tau_min=0, tau_max=lag) + else: + raise ValueError(f"Invalid algorithm {algorithm}") + end_time = time.time() + execution_time = end_time - start_time + + # if show_p_val: + # print("p-values") + # print(results['p_matrix'].round(3)) + # print("MCI partial correlations") + # print(results['val_matrix'].round(2)) + + q_matrix = pcmci.get_corrected_pvalues( + p_matrix=results["p_matrix"], + tau_max=lag, + fdr_method="fdr_bh", + ) + + # if print_significant_links: + # pcmci.print_significant_links( + # p_matrix = q_matrix, + # val_matrix = results['val_matrix'], + # alpha_level = 0.01) + + graph = pcmci.get_graph_from_pmatrix( + p_matrix=q_matrix, + alpha_level=0.01, + tau_min=0, + tau_max=lag, + link_assumptions=None, + ) + + results["graph"] = graph + + # Save results to the dictionary + current_result = { + "results": results, + "params": params, + "dataset_name": dataset_name, + "basin": args.basin, + "execution_time": execution_time, + } + + # Save the object to a pickle file + file_management.save_to_pkl_file(target_file=target_file, data=current_result) + + print("-" * 80) + + +if __name__ == "__main__": + main() diff --git a/hawk/analysis/run_simulation_te.py b/hawk/analysis/run_simulation_te.py new file mode 100644 index 0000000..c9144f3 --- /dev/null +++ b/hawk/analysis/run_simulation_te.py @@ -0,0 +1,85 @@ +import argparse 
+import os +import time + +import thesis.constants as constants +import thesis.file_management as file_management +from tefs import TEFS +from thesis import datasets_and_configurations_loaders + + +def main(): + # Argument parsing + parser = argparse.ArgumentParser(description="Run simulation script for Transfer Entropy analysis.") + parser.add_argument("--basin", type=str, help="Name of the basin") + parser.add_argument("--n_jobs", type=int, default=1, help="Number of parallel jobs to run") + args = parser.parse_args() + + loader = datasets_and_configurations_loaders["te"].get(args.basin) + + if not loader: + raise ValueError("Invalid basin name") + + datasets, configurations = loader() + + # Run experiments + for config in configurations: + params = config["params"] + dataset_name = config["dataset_name"] + dataframe = datasets[dataset_name] + + # extract the parameters + direction = params["direction"] + lagfeatures = params["lagfeatures"] + lagtarget = params["lagtarget"] + k = params["k"] + + # Construct a unique identifier for the configuration + param_str = "_".join(f"{k}{v}" for k, v in params.items()) + param_str = param_str.replace(" ", "") + config_id = f"dataset{dataset_name}_{param_str}" + target_file = os.path.join(constants.path_results, f"te_{args.basin}_{config_id}.pkl") + if os.path.exists(target_file): + print(f"Skipping config {config_id} because results already exist") + continue + + print(f"Running experiment with config: {config}") + + features = dataframe["full"].drop(columns=["target"]) + target = dataframe["full"]["target"] + var_names = list(features.columns) + + # run the feature selection algorithm + start_time = time.time() + fs = TEFS( + features=features.values, + target=target.values, + k=k, + lag_features=lagfeatures, + lag_target=lagtarget, + direction=direction, + verbose=1, + var_names=var_names, + n_jobs=args.n_jobs, + ) + fs.fit() + end_time = time.time() + execution_time = end_time - start_time + + # Save results to the 
dictionary + current_result = { + "results": fs, + "params": params, + "dataset_name": dataset_name, + "basin": args.basin, + "execution_time": execution_time, + } + + # Save the object to a pickle file + file_management.save_to_pkl_file(target_file=target_file, data=current_result) + + print("-" * 80) + + +if __name__ == "__main__": + main() diff --git a/hawk/processes/simulation_interactive.py b/hawk/processes/simulation_interactive.py new file mode 100644 index 0000000..7042bb3 --- /dev/null +++ b/hawk/processes/simulation_interactive.py @@ -0,0 +1,19 @@ +import numpy as np +import pandas as pd +from birdy import WPSClient +#from keras import models + + +url = "http://localhost:5000/wps" +wps = WPSClient(url, verify=False) +help(wps) + + +resp = wps.hello(name="Pluto") +print(resp) +resp.get() + + +resp = wps.cyclone(start_day="2019-01-04", end_day="2019-01-06", area="Sindian") +print(resp) +resp.get() From 6bef9be9fb4bdd523441f148f627c537d9a05f5d Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Fri, 22 Mar 2024 23:30:39 +0100 Subject: [PATCH 03/51] Add first draft of the process --- hawk/processes/wps_causal.py | 142 +++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 hawk/processes/wps_causal.py diff --git a/hawk/processes/wps_causal.py b/hawk/processes/wps_causal.py new file mode 100644 index 0000000..42b29c6 --- /dev/null +++ b/hawk/processes/wps_causal.py @@ -0,0 +1,142 @@ +from pywps import Process, LiteralInput, LiteralOutput, UOM, ComplexInput, ComplexOutput +from pywps.app.Common import Metadata +from pywps import FORMATS, Format +from pathlib import Path +import logging + +LOGGER = logging.getLogger("PYWPS") + + +class Causal(Process): + """A nice process saying 'hello'.""" + + def __init__(self): + inputs = [ + LiteralInput( + "target_column_name", + "Target Column Name", + data_type="string", + abstract="Please enter the case-specific name of the target variable in the dataframe.", + ), + LiteralInput( + 
"pcmci_test_choice", + "PCMCI Test Choice", + data_type="string", + abstract="Choose the independence test to be used in PCMCI.", + allowed_values=[ + "ParCorr", + "CMIknn", + ], + ), + LiteralInput( + "pcmci_max_lag", + "PCMCI Max Lag", + data_type="string", + abstract="Choose the maximum lag to test used in PCMCI.", + allowed_values=[ + "0", + "1", + "2", + "3", + "4", + "5", + ], + ), + LiteralInput( + "tefs_direction", + "TEFS Direction", + data_type="string", + abstract="Choose the direction of the TEFS algorithm.", + allowed_values=[ + "forward", + "backward", + "both", + ], + ), + LiteralInput( + "tefs_use_comtemporary_features", + "TEFS Use Comtemporary Features", + data_type="boolean", + abstract="Choose whether to use comtemporary features in the TEFS algorithm.", + default=False, + ), + LiteralInput( + "tefs_max_lag_features", + "TEFS Max Lag Features", + data_type="string", + abstract="Choose the maximum lag of the features in the TEFS algorithm.", + allowed_values=[ + "no_lag" "1", + "2", + "3", + "4", + "5", + ], + ), + LiteralInput( + "tefs_max_lag_target", + "TEFS Max Lag Target", + data_type="string", + abstract="Choose the maximum lag of the target in the TEFS algorithm.", + allowed_values=[ + "1", + "2", + "3", + "4", + "5", + ], + ), + ] + outputs = [ + LiteralOutput( + "output", + "Output response", + abstract="A friendly Hello from us.", + keywords=["output", "result", "response"], + data_type="string", + ) + ] + + super(Causal, self).__init__( + self._handler, + identifier="hello", + title="Say Hello", + abstract="Just says a friendly Hello." 
+ "Returns a literal string output with Hello plus the inputed name.", + keywords=["hello", "demo"], + metadata=[ + Metadata("PyWPS", "https://pywps.org/"), + Metadata("Birdhouse", "http://bird-house.github.io/"), + Metadata("PyWPS Demo", "https://pywps-demo.readthedocs.io/en/latest/"), + Metadata( + "Emu: PyWPS examples", "https://emu.readthedocs.io/en/latest/" + ), + ], + version="1.5", + inputs=inputs, + outputs=outputs, + store_supported=True, + status_supported=True, + ) + + def _handler(self, request, response): + response.update_status("Processing started", 0) + + # read the respons + target_column_name = request.inputs["target_column_name"][0].data + pcmci_test_choice = request.inputs["pcmci_test_choice"][0].data + pcmci_max_lag = request.inputs["pcmci_max_lag"][0].data + tefs_direction = request.inputs["tefs_direction"][0].data + tefs_use_comtemporary_features = request.inputs[ + "tefs_use_comtemporary_features" + ][0].data + tefs_max_lag_features = request.inputs["tefs_max_lag_features"][0].data + tefs_max_lag_target = request.inputs["tefs_max_lag_target"][0].data + + workdir = Path(self.workdir) + + # connect to the analysis class + + response.outputs["output"].data = "Hello " + request.inputs["name"][0].data + response.outputs["output"].uom = UOM("unity") + return response From edc4e29b2baad7931a73eca621e9d9fecb44b57c Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Sat, 23 Mar 2024 16:57:39 +0100 Subject: [PATCH 04/51] Draft skeleton of inputs and outputs --- hawk/processes/wps_causal.py | 113 +++++++++++++++++++++++++++++++---- 1 file changed, 100 insertions(+), 13 deletions(-) diff --git a/hawk/processes/wps_causal.py b/hawk/processes/wps_causal.py index 42b29c6..1409db5 100644 --- a/hawk/processes/wps_causal.py +++ b/hawk/processes/wps_causal.py @@ -3,15 +3,35 @@ from pywps import FORMATS, Format from pathlib import Path import logging +import pandas as pd +from hawk.analysis import CausalAnalysis LOGGER = logging.getLogger("PYWPS") +FORMAT_PNG = 
Format("image/png", extension=".png", encoding="base64") +FORMAT_PICKLE = Format("application/octet-stream", extension=".pkl", encoding="utf-8") class Causal(Process): """A nice process saying 'hello'.""" def __init__(self): inputs = [ + ComplexInput( + "dataset_train", + "Train Dataset", + abstract="Please add the train csv file here.", + min_occurs=1, + max_occurs=1, + supported_formats=[FORMATS.CSV], + ), + ComplexInput( + "dataset_test", + "Test Dataset", + abstract="Please add the test csv file here.", + min_occurs=1, + max_occurs=1, + supported_formats=[FORMATS.CSV], + ), LiteralInput( "target_column_name", "Target Column Name", @@ -88,13 +108,56 @@ def __init__(self): ), ] outputs = [ - LiteralOutput( - "output", - "Output response", - abstract="A friendly Hello from us.", - keywords=["output", "result", "response"], - data_type="string", - ) + ComplexOutput( + "pkl_baseline", + "Baseline Scores", + abstract="The baseline scores on the initial data.", + as_reference=True, + supported_formats=[FORMAT_PICKLE], + ), + ComplexOutput( + "png_pcmci", + "Selected features by PCMCI", + abstract="The selected features by PCMCI.", + as_reference=True, + supported_formats=[FORMAT_PNG], + ), + ComplexOutput( + "pkl_pcmci", + "PCMCI Results Details", + abstract="The PCMCI results details.", + as_reference=True, + supported_formats=[FORMAT_PICKLE], + ), + ComplexOutput( + "png_tefs", + "Selected features by TEFS", + abstract="The selected features by TEFS.", + as_reference=True, + supported_formats=[FORMAT_PNG], + ), + ComplexOutput( + "pkl_tefs", + "TEFS Results", + abstract="The TEFS results.", + as_reference=True, + supported_formats=[FORMAT_PICKLE], + ), + ComplexOutput( + "png_tefs_wrapper", + "Wrapper scores by TEFS", + abstract="The wrapper scores evolution by TEFS.", + as_reference=True, + supported_formats=[FORMAT_PNG], + ), + ComplexOutput( + "pkl_tefs_wrapper", + "TEFS Wrapper Scores Evolution details", + abstract="The TEFS wrapper scores evolution details.", + 
as_reference=True, + supported_formats=[FORMAT_PICKLE], + ), + ] super(Causal, self).__init__( @@ -122,14 +185,17 @@ def __init__(self): def _handler(self, request, response): response.update_status("Processing started", 0) - # read the respons + # Read the inputs target_column_name = request.inputs["target_column_name"][0].data + + df_train = pd.read_csv(request.inputs["dataset_train"][0].file) + df_test = pd.read_csv(request.inputs["dataset_test"][0].file) + pcmci_test_choice = request.inputs["pcmci_test_choice"][0].data pcmci_max_lag = request.inputs["pcmci_max_lag"][0].data + tefs_direction = request.inputs["tefs_direction"][0].data - tefs_use_comtemporary_features = request.inputs[ - "tefs_use_comtemporary_features" - ][0].data + tefs_use_comtemporary_features = request.inputs["tefs_use_comtemporary_features"][0].data tefs_max_lag_features = request.inputs["tefs_max_lag_features"][0].data tefs_max_lag_target = request.inputs["tefs_max_lag_target"][0].data @@ -137,6 +203,27 @@ def _handler(self, request, response): # connect to the analysis class - response.outputs["output"].data = "Hello " + request.inputs["name"][0].data - response.outputs["output"].uom = UOM("unity") + causal_analysis = CausalAnalysis( + df_train, + df_test, + target_column_name, + pcmci_test_choice, + pcmci_max_lag, + tefs_direction, + tefs_use_comtemporary_features, + tefs_max_lag_features, + tefs_max_lag_target, + workdir, + ) + + causal_analysis.run() + + response.outputs["pkl_baseline"].file = causal_analysis.baseline + response.outputs["png_pcmci"].file = causal_analysis.plot_pcmci + response.outputs["pkl_pcmci"].file = causal_analysis.details_pcmci + response.outputs["png_tefs"].file = causal_analysis.plot_tefs + response.outputs["pkl_tefs"].file = causal_analysis.details_tefs + response.outputs["png_tefs_wrapper"].file = causal_analysis.plot_tefs_wrapper + response.outputs["pkl_tefs_wrapper"].file = causal_analysis.details_tefs_wrapper + return response From 
d0746a2a0634596ed7e441ba7097630839624f48 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Sat, 23 Mar 2024 16:57:53 +0100 Subject: [PATCH 05/51] Create CausalAnalysis class and add baseline run --- hawk/analysis/__init__.py | 1 + hawk/analysis/main.py | 74 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 hawk/analysis/main.py diff --git a/hawk/analysis/__init__.py b/hawk/analysis/__init__.py index e69de29..a18465f 100644 --- a/hawk/analysis/__init__.py +++ b/hawk/analysis/__init__.py @@ -0,0 +1 @@ +from .main import CausalAnalysis # noqa \ No newline at end of file diff --git a/hawk/analysis/main.py b/hawk/analysis/main.py new file mode 100644 index 0000000..288c320 --- /dev/null +++ b/hawk/analysis/main.py @@ -0,0 +1,74 @@ +import hawk.analysis.pcmci_tools as pcmci_tools +from hawk.analysis.metrics import regression_analysis + +class CausalAnalysis: + def __init__( + self, + df_train, + df_test, + target_column_name, + pcmci_test_choice, + pcmci_max_lag, + tefs_direction, + tefs_use_comtemporary_features, + tefs_max_lag_features, + tefs_max_lag_target, + workdir, + ): + self.df_train = df_train + self.df_test = df_test + self.target_column_name = target_column_name + self.pcmci_test_choice = pcmci_test_choice + self.pcmci_max_lag = pcmci_max_lag + self.tefs_direction = tefs_direction + self.tefs_use_comtemporary_features = tefs_use_comtemporary_features + self.tefs_max_lag_features = tefs_max_lag_features + self.tefs_max_lag_target = tefs_max_lag_target + self.workdir = workdir + + self.tefs_features_lags = [] + if self.tefs_use_comtemporary_features: + self.tefs_features_lags.append(0) + self.tefs_features_lags.extend( + list(range(1, self.tefs_max_lag_features + 1)) + ) + + self.baseline = None + self.plot_pcmci = None + self.details_pcmci = None + self.plot_tefs = None + self.details_tefs = None + self.plot_tefs_wrapper = None + self.details_tefs_wrapper = None + + def run_baseline_analysis(self): + + baseline = {} + + 
features_names = self.df_train.columns.tolist() + + configs = [] + + # Autoregressive baselines + for i in range(1, self.tefs_max_lag_target): + configs.append((f"AR({i})", {self.target_column_name: list(range(1, i + 1))})) + + # With all features + configs.append(("All features", {feature: self.tefs_features_lags for feature in features_names})) + + + for label, inputs_names_lags in configs: + baseline[label] = { + "inputs": inputs_names_lags, + "r2": regression_analysis( + inputs_names_lags=inputs_names_lags, + target_name=self.target_column_name, + df_train=self.df_train, + df_test=self.df_test, + ) + } + + return baseline + + def run(self): + self.baseline = self.run_baseline_analysis() From 43532a209fba3f5fc6ab892f7260e426e3d24bd7 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 15:08:44 +0100 Subject: [PATCH 06/51] Cleanup run simulation file TEFS --- hawk/analysis/run_simulation_te.py | 85 ------------------------------ hawk/analysis/simulation_tefs.py | 53 +++++++++++++++++++ 2 files changed, 53 insertions(+), 85 deletions(-) delete mode 100644 hawk/analysis/run_simulation_te.py create mode 100644 hawk/analysis/simulation_tefs.py diff --git a/hawk/analysis/run_simulation_te.py b/hawk/analysis/run_simulation_te.py deleted file mode 100644 index c9144f3..0000000 --- a/hawk/analysis/run_simulation_te.py +++ /dev/null @@ -1,85 +0,0 @@ -import argparse -import os -import time - -import thesis.constants as constants -import thesis.file_management as file_management -from tefs import TEFS -from thesis import datasets_and_configurations_loaders - - -def main(): - # Argument parsing - parser = argparse.ArgumentParser(description="Run simulation script for Transfer Entropy analysis.") - parser.add_argument("--basin", type=str, help="Name of the basin") - parser.add_argument("--n_jobs", type=int, default=1, help="Number of parallel jobs to run") - args = parser.parse_args() - - loader = datasets_and_configurations_loaders["te"].get(args.basin) - - if 
not loader: - raise ValueError("Invalid basin name") - - datasets, configurations = loader() - - # Run experiments - for config in configurations: - params = config["params"] - dataset_name = config["dataset_name"] - dataframe = datasets[dataset_name] - - # extract the parameters - direction = params["direction"] - lagfeatures = params["lagfeatures"] - lagtarget = params["lagtarget"] - k = params["k"] - - # Construct a unique identifier for the configuration - param_str = "_".join(f"{k}{v}" for k, v in params.items()) - param_str = param_str.replace(" ", "") - config_id = f"dataset{dataset_name}_{param_str}" - target_file = os.path.join(constants.path_results, f"te_{args.basin}_{config_id}.pkl") - if os.path.exists(target_file): - print(f"Skipping config {config_id} because results already exist") - continue - - print(f"Running experiment with config: {config}") - - features = dataframe["full"].drop(columns=["target"]) - target = dataframe["full"]["target"] - var_names = list(features.columns) - - # run the feature selection algorithm - start_time = time.time() - fs = TEFS( - features=features.values, - target=target.values, - k=k, - lag_features=lagfeatures, - lag_target=lagtarget, - direction=direction, - verbose=1, - var_names=var_names, - n_jobs=args.n_jobs, - ) - fs.fit() - end_time = time.time() - execution_time = end_time - start_time - - # Save results to the dictionary - current_result = { - "results": fs, - "params": params, - "dataset_name": dataset_name, - "basin": args.basin, - "execution_time": execution_time, - } - - # Save the object to a pickle file - file_management.save_to_pkl_file(target_file=target_file, data=current_result) - - print("-" * 80) - - -if __name__ == "__main__": - main() diff --git a/hawk/analysis/simulation_tefs.py b/hawk/analysis/simulation_tefs.py new file mode 100644 index 0000000..cc72f0b --- /dev/null +++ b/hawk/analysis/simulation_tefs.py @@ -0,0 +1,53 @@ +import time + +from tefs import TEFS + + +def run( + datasets, + 
config, + n_jobs=1, +): + params = config["params"] + dataset_name = config["dataset_name"] + dataframe = datasets[dataset_name] + + # extract the parameters + direction = params["direction"] + lagfeatures = params["lagfeatures"] + lagtarget = params["lagtarget"] + k = params["k"] + + # Construct a unique identifier for the configuration + # param_str = "_".join(f"{k}{v}" for k, v in params.items()) + # param_str = param_str.replace(" ", "") + # config_id = f"dataset{dataset_name}_{param_str}" + + features = dataframe["full"].drop(columns=["target"]) + target = dataframe["full"]["target"] + var_names = list(features.columns) + + # run the feature selection algorithm + start_time = time.time() + fs = TEFS( + features=features.values, + target=target.values, + k=k, + lag_features=lagfeatures, + lag_target=lagtarget, + direction=direction, + verbose=1, + var_names=var_names, + n_jobs=n_jobs, + ) + fs.fit() + end_time = time.time() + execution_time = end_time - start_time + + # Save results to the dictionary + return { + "results": fs, + "params": params, + "dataset_name": dataset_name, + "execution_time": execution_time, + } From 6bcdccf78ed6ea5eba79abb2ac87c403293359cf Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 15:08:55 +0100 Subject: [PATCH 07/51] Cleanup run simulation file PCMCI --- hawk/analysis/run_simulation_pcmci.py | 109 -------------------------- hawk/analysis/simulation_pcmci.py | 58 ++++++++++++++ 2 files changed, 58 insertions(+), 109 deletions(-) delete mode 100644 hawk/analysis/run_simulation_pcmci.py create mode 100644 hawk/analysis/simulation_pcmci.py diff --git a/hawk/analysis/run_simulation_pcmci.py b/hawk/analysis/run_simulation_pcmci.py deleted file mode 100644 index 85ac217..0000000 --- a/hawk/analysis/run_simulation_pcmci.py +++ /dev/null @@ -1,109 +0,0 @@ -import argparse -import os -import time - -import thesis.constants as constants -import thesis.file_management as file_management -from thesis import 
datasets_and_configurations_loaders -from tigramite.pcmci import PCMCI - - -def main(): - # Argument parsing - parser = argparse.ArgumentParser(description="Run simulation script for Transfer Entropy analysis.") - parser.add_argument("--basin", type=str, help="Name of the basin") - args = parser.parse_args() - - loader = datasets_and_configurations_loaders["pcmci"].get(args.basin) - - if not loader: - raise ValueError("Invalid basin name") - - datasets, configurations, independence_tests = loader() - - # Run experiments - for config in configurations: - params = config["params"] - dataset_name = config["dataset_name"] - dataframe = datasets[dataset_name] - - independence_test = independence_tests[params["independencetest"]] - algorithm = params["algorithm"] - lag = params["lag"] - - # Construct a unique identifier for the configuration - param_str = "_".join(f"{k}{v}" for k, v in params.items()) - config_id = f"dataset{dataset_name}_{param_str}" - target_file = os.path.join(constants.path_results, f"pcmci_{args.basin}_{config_id}.pkl") - if os.path.exists(target_file): - print(f"Skipping config {config_id} because results already exist") - continue - - print(f"Running experiment with config: {config}") - - pcmci = PCMCI(dataframe=dataframe["full_tigramite"], cond_ind_test=independence_test, verbosity=2) - - # if inspect_data: - # # Investigating data dependencies and lag functions - # correlations = pcmci.get_lagged_dependencies(tau_max=20, val_only=True)['val_matrix'] - - # matrix_lags = None #np.argmax(np.abs(correlations), axis=2) - # tp.plot_scatterplots(dataframe=dataframe, add_scatterplot_args={'matrix_lags':matrix_lags}); plt.show() - - # tp.plot_densityplots(dataframe=dataframe, add_densityplot_args={'matrix_lags':matrix_lags}); plt.show() - - start_time = time.time() - if algorithm == "pcmci": - results = pcmci.run_pcmci(tau_max=lag, pc_alpha=0.05, alpha_level=0.01) - elif algorithm == "pcmci_plus": - results = pcmci.run_pcmciplus(tau_min=0, tau_max=lag) - 
else: - raise ValueError(f"Invalid algorithm {algorithm}") - end_time = time.time() - execution_time = end_time - start_time - - # if show_p_val: - # print("p-values") - # print(results['p_matrix'].round(3)) - # print("MCI partial correlations") - # print(results['val_matrix'].round(2)) - - q_matrix = pcmci.get_corrected_pvalues( - p_matrix=results["p_matrix"], - tau_max=lag, - fdr_method="fdr_bh", - ) - - # if print_significant_links: - # pcmci.print_significant_links( - # p_matrix = q_matrix, - # val_matrix = results['val_matrix'], - # alpha_level = 0.01) - - graph = pcmci.get_graph_from_pmatrix( - p_matrix=q_matrix, - alpha_level=0.01, - tau_min=0, - tau_max=lag, - link_assumptions=None, - ) - - results["graph"] = graph - - # Save results to the dictionary - current_result = { - "results": results, - "params": params, - "dataset_name": dataset_name, - "basin": args.basin, - "execution_time": execution_time, - } - - # Save the object to a pickle file - file_management.save_to_pkl_file(target_file=target_file, data=current_result) - - print("-" * 80) - - -if __name__ == "__main__": - main() diff --git a/hawk/analysis/simulation_pcmci.py b/hawk/analysis/simulation_pcmci.py new file mode 100644 index 0000000..04626b8 --- /dev/null +++ b/hawk/analysis/simulation_pcmci.py @@ -0,0 +1,58 @@ +import time + +from tigramite.pcmci import PCMCI + + +def run( + datasets, + config, + independence_tests, +): + params = config["params"] + dataset_name = config["dataset_name"] + dataframe = datasets[dataset_name] + + independence_test = independence_tests[params["independencetest"]] + algorithm = params["algorithm"] + lag = params["lag"] + + # Construct a unique identifier for the configuration + # param_str = "_".join(f"{k}{v}" for k, v in params.items()) + # config_id = f"dataset{dataset_name}_{param_str}" + + print(f"Running experiment with config: {config}") + + pcmci = PCMCI(dataframe=dataframe["full_tigramite"], cond_ind_test=independence_test, verbosity=2) + + start_time = 
time.time() + if algorithm == "pcmci": + results = pcmci.run_pcmci(tau_max=lag, pc_alpha=0.05, alpha_level=0.01) + elif algorithm == "pcmci_plus": + results = pcmci.run_pcmciplus(tau_min=0, tau_max=lag) + else: + raise ValueError(f"Invalid algorithm {algorithm}") + end_time = time.time() + execution_time = end_time - start_time + + q_matrix = pcmci.get_corrected_pvalues( + p_matrix=results["p_matrix"], + tau_max=lag, + fdr_method="fdr_bh", + ) + + graph = pcmci.get_graph_from_pmatrix( + p_matrix=q_matrix, + alpha_level=0.01, + tau_min=0, + tau_max=lag, + link_assumptions=None, + ) + + results["graph"] = graph + + return { + "results": results, + "params": params, + "dataset_name": dataset_name, + "execution_time": execution_time, + } From c469808c7363a685178539a209266f6cbc33cd0d Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 15:09:21 +0100 Subject: [PATCH 08/51] Fix typo --- hawk/processes/wps_causal.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hawk/processes/wps_causal.py b/hawk/processes/wps_causal.py index 1409db5..193bea6 100644 --- a/hawk/processes/wps_causal.py +++ b/hawk/processes/wps_causal.py @@ -74,8 +74,8 @@ def __init__(self): ], ), LiteralInput( - "tefs_use_comtemporary_features", - "TEFS Use Comtemporary Features", + "tefs_use_contemporary_features", + "TEFS Use Contemporary Features", data_type="boolean", abstract="Choose whether to use comtemporary features in the TEFS algorithm.", default=False, @@ -162,8 +162,8 @@ def __init__(self): super(Causal, self).__init__( self._handler, - identifier="hello", - title="Say Hello", + identifier="causal", + title="Causal Analysis", abstract="Just says a friendly Hello." 
"Returns a literal string output with Hello plus the inputed name.", keywords=["hello", "demo"], @@ -195,7 +195,7 @@ def _handler(self, request, response): pcmci_max_lag = request.inputs["pcmci_max_lag"][0].data tefs_direction = request.inputs["tefs_direction"][0].data - tefs_use_comtemporary_features = request.inputs["tefs_use_comtemporary_features"][0].data + tefs_use_contemporary_features = request.inputs["tefs_use_contemporary_features"][0].data tefs_max_lag_features = request.inputs["tefs_max_lag_features"][0].data tefs_max_lag_target = request.inputs["tefs_max_lag_target"][0].data @@ -210,7 +210,7 @@ def _handler(self, request, response): pcmci_test_choice, pcmci_max_lag, tefs_direction, - tefs_use_comtemporary_features, + tefs_use_contemporary_features, tefs_max_lag_features, tefs_max_lag_target, workdir, From 49c88d6ee25dac66472c13e8a22403497a12aec3 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 15:11:06 +0100 Subject: [PATCH 09/51] Add configuration builder to main file --- hawk/analysis/config_pcmci.py | 78 ----------------- hawk/analysis/config_te.py | 54 ------------ hawk/analysis/main.py | 159 +++++++++++++++++++++++++++++++--- 3 files changed, 147 insertions(+), 144 deletions(-) delete mode 100644 hawk/analysis/config_pcmci.py delete mode 100644 hawk/analysis/config_te.py diff --git a/hawk/analysis/config_pcmci.py b/hawk/analysis/config_pcmci.py deleted file mode 100644 index efe3cfd..0000000 --- a/hawk/analysis/config_pcmci.py +++ /dev/null @@ -1,78 +0,0 @@ -import itertools - -import numpy as np -from tigramite.independence_tests.cmiknn import CMIknn -from tigramite.independence_tests.parcorr import ParCorr - -from ..data.data_ticino import ( - df_ticino, - df_ticino_snowlakes, - df_ticino_snowlakes_test, - df_ticino_snowlakes_tigramite, - df_ticino_snowlakes_train, - df_ticino_test, - df_ticino_tigramite, - df_ticino_train, - var_names_ticino, - var_names_ticino_snowlakes, -) - -seed = 42 -np.random.seed(seed) - -# Define the tests 
-parcorr = ParCorr(significance="analytic") -cmiknn = CMIknn(significance="shuffle_test", knn=0.1, shuffle_neighbors=5, transform="ranks", sig_samples=200) - -# Create the dictionary of tests -independence_tests = { - "parcorr": parcorr, - "cmiknn": cmiknn, -} - -# Create the dictionary of datasets -datasets = { - "snowlakes": { - "full_tigramite": df_ticino_snowlakes_tigramite, - "full": df_ticino_snowlakes, - "train": df_ticino_snowlakes_train, - "test": df_ticino_snowlakes_test, - "var_names": var_names_ticino_snowlakes, - }, -} - -# Variables -lag_options = [ - 0, - 1, -] -independence_tests_options = [ - "parcorr", - "cmiknn", -] -# NOTE add here if you want the base algorithm as well -algorithm_options = [ - "pcmci_plus", -] -dataset_options = [ - "normal", - "snowlakes", -] - -# Generating the configurations -configurations = [] - -for lag, independencetest, algorithm, dataset_name in itertools.product(lag_options, independence_tests_options, algorithm_options, dataset_options): - configuration = { - "params": { - "lag": lag, - "independencetest": independencetest, - "algorithm": algorithm, - }, - "dataset_name": dataset_name, - } - configurations.append(configuration) - - -def load_ticino(): - return datasets, configurations, independence_tests diff --git a/hawk/analysis/config_te.py b/hawk/analysis/config_te.py deleted file mode 100644 index c2bd14d..0000000 --- a/hawk/analysis/config_te.py +++ /dev/null @@ -1,54 +0,0 @@ -import itertools - -import numpy as np - -# Load here the dataset -# ... - -np.random.seed(42) - -# Define the different dataframes to use -datasets = { - "normal": { - "full": df_ticino, - "train": df_ticino_train, - "test": df_ticino_test, - "var_names": df_ticino.columns, - }, -} - -# Constants -# - `threshold` is set to be large in the forward direction (give me all the information) and 0 in the backward direction. -# - `k` rule of thumb: $1/20$ of the number of samples (try 5,10,20,30...) 
(TODO) -lagtarget = [1] -threshold_forward = float("inf") -threshold_backward = 0 -k = 10 - -# Variables set by the configuration -lagfeatures_options = [[0], [0, 1]] -directions = ["forward", "backward"] -dataset_names = [ - "normal", -] - -# Generating the configurations -configurations = [] - -for lagfeatures, direction, dataset_name in itertools.product(lagfeatures_options, directions, dataset_names): - threshold = threshold_forward if direction == "forward" else threshold_backward - configuration = { - "params": { - "lagfeatures": lagfeatures, - "lagtarget": lagtarget, - "direction": direction, - "threshold": threshold, # NOTE: the threshold is set here, although it is not used during the simulation, but only during the postprocessing, might be better to change this behavior - "k": k, - }, - "dataset_name": dataset_name, - } - configurations.append(configuration) - - -def load_te(): - return datasets, configurations diff --git a/hawk/analysis/main.py b/hawk/analysis/main.py index 288c320..82321ac 100644 --- a/hawk/analysis/main.py +++ b/hawk/analysis/main.py @@ -1,6 +1,14 @@ +import itertools + +from tigramite.independence_tests.cmiknn import CMIknn +from tigramite.independence_tests.parcorr import ParCorr + import hawk.analysis.pcmci_tools as pcmci_tools +import hawk.analysis.simulation_pcmci as simulation_pcmci +import hawk.analysis.simulation_tefs as simulation_tefs from hawk.analysis.metrics import regression_analysis + class CausalAnalysis: def __init__( self, @@ -10,7 +18,7 @@ def __init__( pcmci_test_choice, pcmci_max_lag, tefs_direction, - tefs_use_comtemporary_features, + tefs_use_contemporary_features, tefs_max_lag_features, tefs_max_lag_target, workdir, @@ -21,17 +29,19 @@ def __init__( self.pcmci_test_choice = pcmci_test_choice self.pcmci_max_lag = pcmci_max_lag self.tefs_direction = tefs_direction - self.tefs_use_comtemporary_features = tefs_use_comtemporary_features + self.tefs_use_contemporary_features = tefs_use_contemporary_features 
self.tefs_max_lag_features = tefs_max_lag_features self.tefs_max_lag_target = tefs_max_lag_target self.workdir = workdir self.tefs_features_lags = [] - if self.tefs_use_comtemporary_features: + if self.tefs_use_contemporary_features: self.tefs_features_lags.append(0) - self.tefs_features_lags.extend( - list(range(1, self.tefs_max_lag_features + 1)) - ) + self.tefs_features_lags.extend(list(range(1, self.tefs_max_lag_features + 1))) + + self.tefs_target_lags = list(range(1, self.tefs_max_lag_target + 1)) + + self.pcmci_features_lags = list(range(0, self.pcmci_max_lag + 1)) self.baseline = None self.plot_pcmci = None @@ -40,9 +50,8 @@ def __init__( self.details_tefs = None self.plot_tefs_wrapper = None self.details_tefs_wrapper = None - + def run_baseline_analysis(self): - baseline = {} features_names = self.df_train.columns.tolist() @@ -54,8 +63,12 @@ def run_baseline_analysis(self): configs.append((f"AR({i})", {self.target_column_name: list(range(1, i + 1))})) # With all features - configs.append(("All features", {feature: self.tefs_features_lags for feature in features_names})) - + configs.append( + ( + "All features", + {feature: self.tefs_features_lags for feature in features_names}, + ) + ) for label, inputs_names_lags in configs: baseline[label] = { @@ -65,10 +78,132 @@ def run_baseline_analysis(self): target_name=self.target_column_name, df_train=self.df_train, df_test=self.df_test, - ) + ), } return baseline - + + def run_tefs_analysis( + self, + k=10, + threshold_forward=float("inf"), + threshold_backward=0, + ): + # Grid of options + + lagtarget_options = [self.tefs_target_lags[: i + 1] for i in range(len(self.tefs_target_lags))] + + lagfeatures_options = [self.tefs_features_lags[: i + 1] for i in range(len(self.tefs_features_lags))] + + if self.tefs_direction == "both": + directions = ["forward", "backward"] + else: + directions = [self.tefs_direction] + + # Define the different dataframes to use + datasets = { + "normal": { + "full": df_ticino, + 
"train": df_ticino_train, + "test": df_ticino_test, + "var_names": df_ticino.columns, + }, + } + + dataset_names = [ + "normal", + ] + + # Create the configurations + configurations = [] + + for lagfeatures, lagtarget, direction, dataset_name in itertools.product( + lagfeatures_options, lagtarget_options, directions, dataset_names + ): + threshold = threshold_forward if direction == "forward" else threshold_backward + configuration = { + "params": { + "lagfeatures": lagfeatures, + "lagtarget": lagtarget, + "direction": direction, + "threshold": threshold, + "k": k, + }, + "dataset_name": dataset_name, + } + configurations.append(configuration) + + # Run the analysis + for config in configurations: + simulation_tefs.run( + datasets=datasets, + config=config, + ) + + def run_pcmci_analysis( + self, + ): + lag_options = [self.pcmci_features_lags[: i + 1] for i in range(len(self.pcmci_features_lags))] + + # Define the tests + parcorr = ParCorr(significance="analytic") + cmiknn = CMIknn(significance="shuffle_test", knn=0.1, shuffle_neighbors=5, transform="ranks", sig_samples=200) + + # Create the dictionary of tests + independence_tests = { + "parcorr": parcorr, + "cmiknn": cmiknn, + } + + # Create the dictionary of datasets + datasets = { + "snowlakes": { + "full_tigramite": df_ticino_snowlakes_tigramite, + "full": df_ticino_snowlakes, + "train": df_ticino_snowlakes_train, + "test": df_ticino_snowlakes_test, + "var_names": var_names_ticino_snowlakes, + }, + } + + independence_tests_options = [ + "parcorr", + "cmiknn", + ] + + algorithm_options = [ + "pcmci_plus", + ] + + dataset_options = [ + "normal", + ] + + # Generating the configurations + configurations = [] + + for lag, independencetest, algorithm, dataset_name in itertools.product( + lag_options, independence_tests_options, algorithm_options, dataset_options + ): + configuration = { + "params": { + "lag": lag, + "independencetest": independencetest, + "algorithm": algorithm, + }, + "dataset_name": dataset_name, 
+ } + configurations.append(configuration) + + results = [] + for config in configurations: + results.append( + simulation_pcmci.run( + datasets=datasets, + config=config, + independence_tests=independence_tests, + ) + ) + def run(self): self.baseline = self.run_baseline_analysis() From ac80116647aab3a3ba812fa2ef344fa188444899 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 15:23:39 +0100 Subject: [PATCH 10/51] Implement datasets handling --- hawk/analysis/main.py | 63 ++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/hawk/analysis/main.py b/hawk/analysis/main.py index 82321ac..ae890f4 100644 --- a/hawk/analysis/main.py +++ b/hawk/analysis/main.py @@ -1,5 +1,6 @@ import itertools +import pandas as pd from tigramite.independence_tests.cmiknn import CMIknn from tigramite.independence_tests.parcorr import ParCorr @@ -23,8 +24,19 @@ def __init__( tefs_max_lag_target, workdir, ): - self.df_train = df_train - self.df_test = df_test + df_full = pd.concat([df_train, df_test], axis=1).reset_index(drop=True) + df_full_tigramite = pcmci_tools.initialize_tigramite_df(df_full) + + self.datasets = { + "normal": { + "full_tigramite": df_full_tigramite, + "full": df_full, + "train": df_train, + "test": df_test, + "var_names": df_train.columns.tolist(), + }, + } + self.target_column_name = target_column_name self.pcmci_test_choice = pcmci_test_choice self.pcmci_max_lag = pcmci_max_lag @@ -54,7 +66,7 @@ def __init__( def run_baseline_analysis(self): baseline = {} - features_names = self.df_train.columns.tolist() + features_names = self.datasets["normal"]["var_names"] configs = [] @@ -76,8 +88,8 @@ def run_baseline_analysis(self): "r2": regression_analysis( inputs_names_lags=inputs_names_lags, target_name=self.target_column_name, - df_train=self.df_train, - df_test=self.df_test, + df_train=self.datasets["normale"]["train"], + df_test=self.datasets["normale"]["test"], ), } @@ -100,16 +112,6 @@ def 
run_tefs_analysis( else: directions = [self.tefs_direction] - # Define the different dataframes to use - datasets = { - "normal": { - "full": df_ticino, - "train": df_ticino_train, - "test": df_ticino_test, - "var_names": df_ticino.columns, - }, - } - dataset_names = [ "normal", ] @@ -134,12 +136,17 @@ def run_tefs_analysis( configurations.append(configuration) # Run the analysis + results = [] for config in configurations: - simulation_tefs.run( - datasets=datasets, - config=config, + results.append( + simulation_tefs.run( + datasets=self.datasets, + config=config, + ) ) + return results + def run_pcmci_analysis( self, ): @@ -155,17 +162,6 @@ def run_pcmci_analysis( "cmiknn": cmiknn, } - # Create the dictionary of datasets - datasets = { - "snowlakes": { - "full_tigramite": df_ticino_snowlakes_tigramite, - "full": df_ticino_snowlakes, - "train": df_ticino_snowlakes_train, - "test": df_ticino_snowlakes_test, - "var_names": var_names_ticino_snowlakes, - }, - } - independence_tests_options = [ "parcorr", "cmiknn", @@ -195,15 +191,22 @@ def run_pcmci_analysis( } configurations.append(configuration) + # Run the analysis results = [] for config in configurations: results.append( simulation_pcmci.run( - datasets=datasets, + datasets=self.datasets, config=config, independence_tests=independence_tests, ) ) + return results + def run(self): self.baseline = self.run_baseline_analysis() + tefs_results = self.run_tefs_analysis() + pcmci_results = self.run_pcmci_analysis() + + # post-processing From 7c21f223f08750c664bfc93cf05b76fac5265f9c Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 15:23:56 +0100 Subject: [PATCH 11/51] Simplify tigramite helper function --- hawk/analysis/pcmci_tools.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/hawk/analysis/pcmci_tools.py b/hawk/analysis/pcmci_tools.py index 63a3859..67357c4 100644 --- a/hawk/analysis/pcmci_tools.py +++ b/hawk/analysis/pcmci_tools.py @@ -46,8 +46,10 @@ def 
initialize_tigramite_df(df: pd.DataFrame): :return: tigramite dataframe and variable names tuple """ - var_names = df.columns + dataframe = pp.DataFrame( + df.values, + datatime={0: np.arange(len(df))}, + var_names=df.columns, + ) - dataframe = pp.DataFrame(df.values, datatime={0: np.arange(len(df))}, var_names=var_names) - - return dataframe, var_names + return dataframe From 12a0901fec64337e44b6b56e6c52450939d044b8 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 21:13:06 +0100 Subject: [PATCH 12/51] First draft restructure postprocessing --- hawk/analysis/run_postprocessing.py | 617 +++++++++++++--------------- 1 file changed, 290 insertions(+), 327 deletions(-) diff --git a/hawk/analysis/run_postprocessing.py b/hawk/analysis/run_postprocessing.py index 2943cce..4fa3c50 100644 --- a/hawk/analysis/run_postprocessing.py +++ b/hawk/analysis/run_postprocessing.py @@ -6,13 +6,10 @@ import numpy as np import pandas as pd import seaborn as sns -import thesis.constants as constants -import thesis.file_management as file_management -from tefs.metrics import regression_analysis -from thesis import datasets_and_configurations_loaders, pcmci_tools from tigramite import plotting as tp -plt.rc("text", usetex=True) +from .metrics import regression_analysis +from .pcmci_tools import get_connected_variables # Adjusted custom sort key function to handle lag sequences and replace them with the last lag @@ -83,349 +80,315 @@ def plot_feature_presence_and_r2(df_presence, scores_values, scores_labels): return fig, (ax_bar, ax_heatmap) -def main(): - # List all files in the results folder ending with .pkl - results_files = sorted([file for file in os.listdir(constants.path_results) if file.endswith(".pkl")], key=general_custom_sort_key) - - results_pcmci = {} - results_te = {} - - for file in results_files: - parts = file.split("_") - algorithm = parts[0] - basin = parts[1] - key = file.split(basin)[1][1:-4] - - if algorithm == "pcmci": - if basin not in results_pcmci: 
- results_pcmci[basin] = {} - - results_pcmci[basin][key] = file_management.load_from_pkl_file(os.path.join(constants.path_results, file)) - - elif algorithm == "te": - - if basin not in results_te: - results_te[basin] = {} - - results_te[basin][key] = file_management.load_from_pkl_file(os.path.join(constants.path_results, file)) - - # -------------------- PCMCI -------------------- - - for basin_name, basin_results in results_pcmci.items(): - datasets, _, _ = datasets_and_configurations_loaders["pcmci"].get(basin_name)() - - all_basin_variables = set() - results_table_pcmci = [] - for key, simulation in basin_results.items(): - dataframe = datasets[simulation["dataset_name"]] - var_names = dataframe["var_names"] - all_basin_variables.update(var_names.values) - - results = simulation["results"] - - # Plot only the connections to any of the target variables - temp_graph = results["graph"].copy() - - # Show only the connections to the target variables - # Identify the indexes of the target variables - # target_vars = np.where(["target" in var for var in var_names.values])[0] - # for i in range(temp_graph.shape[0]): - # for j in range(temp_graph.shape[1]): - # # if the edge is not connected to the target variables - # if i not in target_vars and j not in target_vars: - # # remove the edge - # temp_graph[i, j, :] = '' - # temp_graph[j, i, :] = '' - - # Base arguments for tp.plot_graph - plot_args = { - "val_matrix": results["val_matrix"], - "graph": temp_graph, - "var_names": var_names, - "link_colorbar_label": "cross-MCI", - "node_colorbar_label": "auto-MCI", - "show_autodependency_lags": False, - } - - # Additional arguments to include if the independence_test is CMIknn - if simulation["params"]["independencetest"] == "cmiknn": - plot_args.update( - { - "vmin_edges": 0.0, - "vmax_edges": 0.1, - "edge_ticks": 0.05, - "cmap_edges": "OrRd", - "vmin_nodes": 0, - "vmax_nodes": 0.1, - "node_ticks": 0.1, - "cmap_nodes": "OrRd", - } - ) - - # Plot causal graph - target_file 
= os.path.join(constants.path_figures, "algorithm_results", basin_name, "pcmci", key + ".pdf") - if not os.path.exists(target_file): - fig, ax = plt.subplots() - tp.plot_graph(**plot_args, fig_ax=(fig, ax)) - os.makedirs(os.path.dirname(target_file), exist_ok=True) - plt.savefig(target_file, bbox_inches="tight") - plt.close(fig) - - # Plot time series graph if lag > 0 - if simulation["params"]["lag"] > 0: - target_file = os.path.join(constants.path_figures, "algorithm_results", basin_name, "pcmci", key + "_timeseries.pdf") - if not os.path.exists(target_file): - fig, ax = plt.subplots() - tp.plot_time_series_graph( - figsize=(6, 4), - fig_ax=(fig, ax), - val_matrix=results["val_matrix"], - graph=results["graph"], - var_names=var_names, - link_colorbar_label="MCI", - ) - os.makedirs(os.path.dirname(target_file), exist_ok=True) - plt.savefig(target_file, bbox_inches="tight") - plt.close(fig) - - # Extract the selected features - selected_features = pcmci_tools.get_connected_variables(results["graph"], var_names) - basin_results[key]["selected_features"] = selected_features - - # Compute the R2 scores - inputs_names_lags = {feature: [0] for feature in selected_features} - score_r2 = ( - regression_analysis( - inputs_names_lags=inputs_names_lags, - target_name="target", - df_train=dataframe["train"], - df_test=dataframe["test"], - ) - if len(selected_features) > 0 - else np.nan - ) - basin_results[key]["score_r2"] = score_r2 - - inputs_names_lags = {feature: list(range(0, simulation["params"]["lag"] + 1)) for feature in selected_features} - score_r2_lag = ( - regression_analysis( - inputs_names_lags=inputs_names_lags, - target_name="target", - df_train=dataframe["train"], - df_test=dataframe["test"], - ) - if len(selected_features) > 0 - else np.nan +def run_postprocessing_pcmci( + results_pcmci, + datasets, + destination_path, +): + all_basin_variables = set() + results_table_pcmci = [] + for simulation in results_pcmci: + dataframe = 
datasets[simulation["dataset_name"]] + var_names = dataframe["var_names"] + all_basin_variables.update(var_names.values) + + results = simulation["results"] + + # Plot only the connections to any of the target variables + temp_graph = results["graph"].copy() + + # Show only the connections to the target variables + # Identify the indexes of the target variables + # target_vars = np.where(["target" in var for var in var_names.values])[0] + # for i in range(temp_graph.shape[0]): + # for j in range(temp_graph.shape[1]): + # # if the edge is not connected to the target variables + # if i not in target_vars and j not in target_vars: + # # remove the edge + # temp_graph[i, j, :] = '' + # temp_graph[j, i, :] = '' + + # Base arguments for tp.plot_graph + plot_args = { + "val_matrix": results["val_matrix"], + "graph": temp_graph, + "var_names": var_names, + "link_colorbar_label": "cross-MCI", + "node_colorbar_label": "auto-MCI", + "show_autodependency_lags": False, + } + + # Additional arguments to include if the independence_test is CMIknn + if simulation["params"]["independencetest"] == "cmiknn": + plot_args.update( + { + "vmin_edges": 0.0, + "vmax_edges": 0.1, + "edge_ticks": 0.05, + "cmap_edges": "OrRd", + "vmin_nodes": 0, + "vmax_nodes": 0.1, + "node_ticks": 0.1, + "cmap_nodes": "OrRd", + } ) - basin_results[key]["score_r2_lag"] = score_r2_lag - inputs_names_lags = {feature: list(range(0, simulation["params"]["lag"] + 1)) for feature in selected_features} - inputs_names_lags["target"] = list(range(1, simulation["params"]["lag"] + 1)) - score_r2_lag_ar = regression_analysis( + # Plot causal graph + # target_file = os.path.join(constants.path_figures, "algorithm_results", basin_name, "pcmci", key + ".pdf") + # if not os.path.exists(target_file): + # fig, ax = plt.subplots() + # tp.plot_graph(**plot_args, fig_ax=(fig, ax)) + # os.makedirs(os.path.dirname(target_file), exist_ok=True) + # plt.savefig(target_file, bbox_inches="tight") + # plt.close(fig) + + # # Plot time 
series graph if lag > 0 + # if simulation["params"]["lag"] > 0: + # target_file = os.path.join( + # constants.path_figures, "algorithm_results", basin_name, "pcmci", key + "_timeseries.pdf" + # ) + # if not os.path.exists(target_file): + # fig, ax = plt.subplots() + # tp.plot_time_series_graph( + # figsize=(6, 4), + # fig_ax=(fig, ax), + # val_matrix=results["val_matrix"], + # graph=results["graph"], + # var_names=var_names, + # link_colorbar_label="MCI", + # ) + # os.makedirs(os.path.dirname(target_file), exist_ok=True) + # plt.savefig(target_file, bbox_inches="tight") + # plt.close(fig) + + # Extract the selected features + selected_features = get_connected_variables(results["graph"], var_names) + basin_results[key]["selected_features"] = selected_features + + # Compute the R2 scores + inputs_names_lags = {feature: [0] for feature in selected_features} + score_r2 = ( + regression_analysis( inputs_names_lags=inputs_names_lags, target_name="target", df_train=dataframe["train"], df_test=dataframe["test"], ) - basin_results[key]["score_r2_lag_ar"] = score_r2_lag_ar + if len(selected_features) > 0 + else np.nan + ) + basin_results[key]["score_r2"] = score_r2 - # Table of results - results_table_pcmci.append( - { - "selected_features": " ".join(selected_features), - "score_r2": score_r2, - "score_r2_lag": score_r2_lag, - "score_r2_lag_ar": score_r2_lag_ar, - "dataset": simulation["dataset_name"], - "algorithm": simulation["params"]["algorithm"], - "independencetest": simulation["params"]["independencetest"], - "lag": simulation["params"]["lag"], - "execution_time": simulation["execution_time"], - } + inputs_names_lags = {feature: list(range(0, simulation["params"]["lag"] + 1)) for feature in selected_features} + score_r2_lag = ( + regression_analysis( + inputs_names_lags=inputs_names_lags, + target_name="target", + df_train=dataframe["train"], + df_test=dataframe["test"], ) - - results_table_pcmci = pd.DataFrame.from_records(results_table_pcmci) - - # Export the file 
to pkl - file_management.save_to_pkl_file(os.path.join(constants.path_table_objects, f"results_table_{basin_name}_pcmci.pkl"), results_table_pcmci) - - # Feature presences heatmap - if "target" in all_basin_variables: - all_basin_variables.remove("target") - all_basin_variables = sorted(list(all_basin_variables)) - df_presence = pd.DataFrame(index=all_basin_variables, columns=range(len(basin_results))) - scores = [] - scores_lag = [] - scores_lag_ar = [] - - for index, key in enumerate(basin_results): - simulation = basin_results[key] - scores.append(simulation["score_r2"]) - scores_lag.append(simulation["score_r2_lag"]) - scores_lag_ar.append(simulation["score_r2_lag_ar"]) - - # loop through the rows of the df, if the feature is in the list of selected features, put a 1 - for feature in df_presence.index: - if feature in simulation["selected_features"]: - df_presence.loc[feature, index] = 1 - else: - df_presence.loc[feature, index] = 0 - if feature not in datasets[simulation["dataset_name"]]["var_names"]: - df_presence.loc[feature, index] = 2 - - df_presence = df_presence.astype(float) - scores = np.array(scores) - scores_lag = np.array(scores_lag) - scores_lag_ar = np.array(scores_lag_ar) - - fig, ax = plot_feature_presence_and_r2( - df_presence=df_presence, - scores_values=[scores, scores_lag, scores_lag_ar], - scores_labels=[r"$R^2$", r"$R^2$ (lag)", r"$R^2$ (lag + AR)"], + if len(selected_features) > 0 + else np.nan + ) + basin_results[key]["score_r2_lag"] = score_r2_lag + + inputs_names_lags = {feature: list(range(0, simulation["params"]["lag"] + 1)) for feature in selected_features} + inputs_names_lags["target"] = list(range(1, simulation["params"]["lag"] + 1)) + score_r2_lag_ar = regression_analysis( + inputs_names_lags=inputs_names_lags, + target_name="target", + df_train=dataframe["train"], + df_test=dataframe["test"], ) - target_file = os.path.join(constants.path_figures, "algorithm_results", basin_name, "pcmci", "feature_presence.pdf") - 
os.makedirs(os.path.dirname(target_file), exist_ok=True) - plt.savefig(target_file, bbox_inches="tight") + basin_results[key]["score_r2_lag_ar"] = score_r2_lag_ar + + # Table of results + results_table_pcmci.append( + { + "selected_features": " ".join(selected_features), + "score_r2": score_r2, + "score_r2_lag": score_r2_lag, + "score_r2_lag_ar": score_r2_lag_ar, + "dataset": simulation["dataset_name"], + "algorithm": simulation["params"]["algorithm"], + "independencetest": simulation["params"]["independencetest"], + "lag": simulation["params"]["lag"], + "execution_time": simulation["execution_time"], + } + ) + + results_table_pcmci = pd.DataFrame.from_records(results_table_pcmci) + + # Export the file to pkl + file_management.save_to_pkl_file( + os.path.join(constants.path_table_objects, f"results_table_{basin_name}_pcmci.pkl"), results_table_pcmci + ) + + # Feature presences heatmap + if "target" in all_basin_variables: + all_basin_variables.remove("target") + all_basin_variables = sorted(list(all_basin_variables)) + df_presence = pd.DataFrame(index=all_basin_variables, columns=range(len(basin_results))) + scores = [] + scores_lag = [] + scores_lag_ar = [] + + for index, key in enumerate(basin_results): + simulation = basin_results[key] + scores.append(simulation["score_r2"]) + scores_lag.append(simulation["score_r2_lag"]) + scores_lag_ar.append(simulation["score_r2_lag_ar"]) + + # loop through the rows of the df, if the feature is in the list of selected features, put a 1 + for feature in df_presence.index: + if feature in simulation["selected_features"]: + df_presence.loc[feature, index] = 1 + else: + df_presence.loc[feature, index] = 0 + if feature not in datasets[simulation["dataset_name"]]["var_names"]: + df_presence.loc[feature, index] = 2 + + df_presence = df_presence.astype(float) + scores = np.array(scores) + scores_lag = np.array(scores_lag) + scores_lag_ar = np.array(scores_lag_ar) + + fig, ax = plot_feature_presence_and_r2( + df_presence=df_presence, 
+ scores_values=[scores, scores_lag, scores_lag_ar], + scores_labels=[r"$R^2$", r"$R^2$ (lag)", r"$R^2$ (lag + AR)"], + ) + target_file = os.path.join(destination_path, "algorithm_results", "pcmci", "feature_presence.pdf") + os.makedirs(os.path.dirname(target_file), exist_ok=True) + plt.savefig(target_file, bbox_inches="tight") + plt.close(fig) + + return target_file + + +def run_postprocessing_tefs( + results_tefs, + datasets, + destination_path, +): + all_basin_variables = set() + results_table_te = [] + for simulation in results_tefs: + dataset_name = simulation["dataset_name"] + dataframe = datasets[dataset_name] + var_names = dataframe["var_names"] + all_basin_variables.update(var_names) + + results = simulation["results"] + lagfeatures = simulation["params"]["lagfeatures"] + lagtarget = simulation["params"]["lagtarget"] + + # Plot the results + fig, ax = plt.subplots() + results.plot_te_results(ax=ax) + target_dir = os.path.join(constants.path_figures, "algorithm_results", basin_name, "te", key + ".pdf") + os.makedirs(os.path.dirname(target_dir), exist_ok=True) + plt.savefig(target_dir, bbox_inches="tight") plt.close(fig) - # -------------------- TRANSFER ENTROPY -------------------- - - for basin_name, basin_results in results_te.items(): - datasets, _ = datasets_and_configurations_loaders["te"].get(basin_name)() - - all_basin_variables = set() - results_table_te = [] - for key, simulation in basin_results.items(): - dataset_name = simulation["dataset_name"] - dataframe = datasets[dataset_name] - var_names = dataframe["var_names"] - all_basin_variables.update(var_names) - - results = simulation["results"] - lagfeatures = simulation["params"]["lagfeatures"] - lagtarget = simulation["params"]["lagtarget"] - - # Plot the results - fig, ax = plt.subplots() - results.plot_te_results(ax=ax) - target_dir = os.path.join(constants.path_figures, "algorithm_results", basin_name, "te", key + ".pdf") - os.makedirs(os.path.dirname(target_dir), exist_ok=True) - 
plt.savefig(target_dir, bbox_inches="tight") - plt.close(fig) - - # Extract the selected features - selected_features_names = results.select_features(simulation["params"]["threshold"]) - basin_results[key]["selected_features"] = selected_features_names - - # get the r2 score on the test set - inputs_names_lags = {feature: [0] for feature in selected_features_names} - score_r2 = ( - regression_analysis( - inputs_names_lags=inputs_names_lags, - target_name="target", - df_train=dataframe["train"], - df_test=dataframe["test"], - ) - if len(selected_features_names) > 0 - else np.nan - ) - basin_results[key]["score_r2"] = score_r2 - - inputs_names_lags = {feature: lagfeatures for feature in selected_features_names} - score_r2_lag = ( - regression_analysis( - inputs_names_lags=inputs_names_lags, - target_name="target", - df_train=dataframe["train"], - df_test=dataframe["test"], - ) - if len(selected_features_names) > 0 - else np.nan - ) - basin_results[key]["score_r2_lag"] = score_r2_lag + # Extract the selected features + selected_features_names = results.select_features(simulation["params"]["threshold"]) + basin_results[key]["selected_features"] = selected_features_names - inputs_names_lags = {feature: lagfeatures for feature in selected_features_names} - inputs_names_lags["target"] = lagtarget - score_r2_lag_ar = regression_analysis( + # get the r2 score on the test set + inputs_names_lags = {feature: [0] for feature in selected_features_names} + score_r2 = ( + regression_analysis( inputs_names_lags=inputs_names_lags, target_name="target", df_train=dataframe["train"], df_test=dataframe["test"], ) - basin_results[key]["score_r2_lag_ar"] = score_r2_lag_ar + if len(selected_features_names) > 0 + else np.nan + ) + basin_results[key]["score_r2"] = score_r2 - # Table of results - results_table_te.append( - { - "selected_features": " ".join(selected_features_names), - "score_r2": score_r2, - "score_r2_lag": score_r2_lag, - "score_r2_lag_ar": score_r2_lag_ar, - "dataset": 
dataset_name, - "lagfeatures": simulation["params"]["lagfeatures"], - "lagtarget": simulation["params"]["lagtarget"], - "direction": simulation["params"]["direction"], # not putting threshold and k - "execution_time": simulation["execution_time"], - } + inputs_names_lags = {feature: lagfeatures for feature in selected_features_names} + score_r2_lag = ( + regression_analysis( + inputs_names_lags=inputs_names_lags, + target_name="target", + df_train=dataframe["train"], + df_test=dataframe["test"], ) - - results_table_te = pd.DataFrame.from_records(results_table_te) - - # Export the file to pkl - file_management.save_to_pkl_file(os.path.join(constants.path_table_objects, f"results_table_{basin_name}_te.pkl"), results_table_te) - - # Feature presences heatmap - if "target" in all_basin_variables: - all_basin_variables.remove("target") - all_basin_variables = sorted(list(all_basin_variables)) - df_presence = pd.DataFrame(index=all_basin_variables, columns=range(len(basin_results))) - scores = [] - scores_lag = [] - scores_lag_ar = [] - - for index, key in enumerate(basin_results): - simulation = basin_results[key] - scores.append(simulation["score_r2"]) - scores_lag.append(simulation["score_r2_lag"]) - scores_lag_ar.append(simulation["score_r2_lag_ar"]) - - # loop through the rows of the df, if the feature is in the list of selected features, put a 1 - for feature in df_presence.index: - if feature in simulation["selected_features"]: - df_presence.loc[feature, index] = 1 - else: - df_presence.loc[feature, index] = 0 - if feature not in datasets[simulation["dataset_name"]]["var_names"]: - df_presence.loc[feature, index] = 2 - - df_presence = df_presence.astype(float) - scores = np.array(scores) - scores_lag = np.array(scores_lag) - scores_lag_ar = np.array(scores_lag_ar) - - fig, ax = plot_feature_presence_and_r2( - df_presence=df_presence, - scores_values=[scores, scores_lag, scores_lag_ar], - scores_labels=[r"$R^2$", r"$R^2$ (lag)", r"$R^2$ (lag + AR)"], + if 
len(selected_features_names) > 0 + else np.nan + ) + basin_results[key]["score_r2_lag"] = score_r2_lag + + inputs_names_lags = {feature: lagfeatures for feature in selected_features_names} + inputs_names_lags["target"] = lagtarget + score_r2_lag_ar = regression_analysis( + inputs_names_lags=inputs_names_lags, + target_name="target", + df_train=dataframe["train"], + df_test=dataframe["test"], + ) + basin_results[key]["score_r2_lag_ar"] = score_r2_lag_ar + + # Table of results + results_table_te.append( + { + "selected_features": " ".join(selected_features_names), + "score_r2": score_r2, + "score_r2_lag": score_r2_lag, + "score_r2_lag_ar": score_r2_lag_ar, + "dataset": dataset_name, + "lagfeatures": simulation["params"]["lagfeatures"], + "lagtarget": simulation["params"]["lagtarget"], + "direction": simulation["params"]["direction"], # not putting threshold and k + "execution_time": simulation["execution_time"], + } ) - target_file = os.path.join(constants.path_figures, "algorithm_results", basin_name, "te", "feature_presence.pdf") - os.makedirs(os.path.dirname(target_file), exist_ok=True) - plt.savefig(target_file, bbox_inches="tight") - plt.close(fig) - - -if __name__ == "__main__": - main() - - # Optional execution of the notebook - - # import nbformat - # from nbconvert.preprocessors import ExecutePreprocessor - # filename = 'droughts_postprocessing_pt2_and_wrapper.ipynb' - # with open(filename) as ff: - # nb_in = nbformat.read(ff, nbformat.NO_CONVERT) + results_table_te = pd.DataFrame.from_records(results_table_te) - # ep = ExecutePreprocessor(timeout=600, kernel_name='thesis') + # Export the file to pkl + file_management.save_to_pkl_file( + os.path.join(destination_path, f"results_table_{basin_name}_te.pkl"), results_table_te + ) - # nb_out = ep.preprocess(nb_in) + # Feature presences heatmap + if "target" in all_basin_variables: + all_basin_variables.remove("target") + all_basin_variables = sorted(list(all_basin_variables)) + df_presence = 
pd.DataFrame(index=all_basin_variables, columns=range(len(basin_results))) + scores = [] + scores_lag = [] + scores_lag_ar = [] + + for index, key in enumerate(basin_results): + simulation = basin_results[key] + scores.append(simulation["score_r2"]) + scores_lag.append(simulation["score_r2_lag"]) + scores_lag_ar.append(simulation["score_r2_lag_ar"]) + + # loop through the rows of the df, if the feature is in the list of selected features, put a 1 + for feature in df_presence.index: + if feature in simulation["selected_features"]: + df_presence.loc[feature, index] = 1 + else: + df_presence.loc[feature, index] = 0 + if feature not in datasets[simulation["dataset_name"]]["var_names"]: + df_presence.loc[feature, index] = 2 + + df_presence = df_presence.astype(float) + scores = np.array(scores) + scores_lag = np.array(scores_lag) + scores_lag_ar = np.array(scores_lag_ar) + + fig, ax = plot_feature_presence_and_r2( + df_presence=df_presence, + scores_values=[scores, scores_lag, scores_lag_ar], + scores_labels=[r"$R^2$", r"$R^2$ (lag)", r"$R^2$ (lag + AR)"], + ) + target_file = os.path.join(destination_path, "algorithm_results", basin_name, "te", "feature_presence.pdf") + os.makedirs(os.path.dirname(target_file), exist_ok=True) + plt.savefig(target_file, bbox_inches="tight") + plt.close(fig) From b2ce9ff05530e13113aab5d0c77f621aa961c0e7 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 21:14:28 +0100 Subject: [PATCH 13/51] Rename file --- hawk/analysis/{run_postprocessing.py => postprocessing.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename hawk/analysis/{run_postprocessing.py => postprocessing.py} (100%) diff --git a/hawk/analysis/run_postprocessing.py b/hawk/analysis/postprocessing.py similarity index 100% rename from hawk/analysis/run_postprocessing.py rename to hawk/analysis/postprocessing.py From b9a834f797e59b0d2f082785a27ed55eb70e53e5 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 21:38:33 +0100 Subject: [PATCH 14/51] 
Add file management helper file --- hawk/analysis/file_management.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 hawk/analysis/file_management.py diff --git a/hawk/analysis/file_management.py b/hawk/analysis/file_management.py new file mode 100644 index 0000000..986492f --- /dev/null +++ b/hawk/analysis/file_management.py @@ -0,0 +1,28 @@ +import os +import pickle +from typing import Any + + +def save_to_pkl_file(target_file: str, data: Any, overwrite: bool = True) -> None: + + # Check if the file already exists + if os.path.exists(target_file) and not overwrite: + raise ValueError(f"File {target_file} already exists.") + + # Create the directory and parent directories if they don't exist + os.makedirs(os.path.dirname(target_file), exist_ok=True) + + # Save the data to the file + with open(target_file, "wb") as f: + pickle.dump(data, f) + +def load_from_pkl_file(source_file: str) -> Any: + # Check if the file exists + if not os.path.exists(source_file): + raise ValueError(f"File {source_file} does not exist.") + + # Load the data from the file + with open(source_file, "rb") as f: + data = pickle.load(f) + + return data \ No newline at end of file From f8be6d051db8cd19263548c0c617f61f4f60fa14 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 21:38:50 +0100 Subject: [PATCH 15/51] Fine tune postprocessing file --- hawk/analysis/postprocessing.py | 61 +++++++++++++-------------------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/hawk/analysis/postprocessing.py b/hawk/analysis/postprocessing.py index 4fa3c50..3fa2d23 100644 --- a/hawk/analysis/postprocessing.py +++ b/hawk/analysis/postprocessing.py @@ -8,6 +8,7 @@ import seaborn as sns from tigramite import plotting as tp +from .file_management import save_to_pkl_file from .metrics import regression_analysis from .pcmci_tools import get_connected_variables @@ -163,7 +164,6 @@ def run_postprocessing_pcmci( # Extract the selected features 
selected_features = get_connected_variables(results["graph"], var_names) - basin_results[key]["selected_features"] = selected_features # Compute the R2 scores inputs_names_lags = {feature: [0] for feature in selected_features} @@ -177,7 +177,6 @@ def run_postprocessing_pcmci( if len(selected_features) > 0 else np.nan ) - basin_results[key]["score_r2"] = score_r2 inputs_names_lags = {feature: list(range(0, simulation["params"]["lag"] + 1)) for feature in selected_features} score_r2_lag = ( @@ -190,7 +189,6 @@ def run_postprocessing_pcmci( if len(selected_features) > 0 else np.nan ) - basin_results[key]["score_r2_lag"] = score_r2_lag inputs_names_lags = {feature: list(range(0, simulation["params"]["lag"] + 1)) for feature in selected_features} inputs_names_lags["target"] = list(range(1, simulation["params"]["lag"] + 1)) @@ -200,7 +198,6 @@ def run_postprocessing_pcmci( df_train=dataframe["train"], df_test=dataframe["test"], ) - basin_results[key]["score_r2_lag_ar"] = score_r2_lag_ar # Table of results results_table_pcmci.append( @@ -217,24 +214,20 @@ def run_postprocessing_pcmci( } ) - results_table_pcmci = pd.DataFrame.from_records(results_table_pcmci) - # Export the file to pkl - file_management.save_to_pkl_file( - os.path.join(constants.path_table_objects, f"results_table_{basin_name}_pcmci.pkl"), results_table_pcmci - ) + target_file_results_details = os.path.join(destination_path, "results_details_pcmci.pkl") + save_to_pkl_file(target_file_results_details, results_table_pcmci) # Feature presences heatmap if "target" in all_basin_variables: all_basin_variables.remove("target") all_basin_variables = sorted(list(all_basin_variables)) - df_presence = pd.DataFrame(index=all_basin_variables, columns=range(len(basin_results))) + df_presence = pd.DataFrame(index=all_basin_variables, columns=range(len(results_pcmci))) scores = [] scores_lag = [] scores_lag_ar = [] - for index, key in enumerate(basin_results): - simulation = basin_results[key] + for index, simulation in 
enumerate(results_pcmci): scores.append(simulation["score_r2"]) scores_lag.append(simulation["score_r2_lag"]) scores_lag_ar.append(simulation["score_r2_lag_ar"]) @@ -258,12 +251,12 @@ def run_postprocessing_pcmci( scores_values=[scores, scores_lag, scores_lag_ar], scores_labels=[r"$R^2$", r"$R^2$ (lag)", r"$R^2$ (lag + AR)"], ) - target_file = os.path.join(destination_path, "algorithm_results", "pcmci", "feature_presence.pdf") - os.makedirs(os.path.dirname(target_file), exist_ok=True) - plt.savefig(target_file, bbox_inches="tight") + target_file_plot = os.path.join(destination_path, "algorithm_results", "pcmci", "feature_presence.pdf") + os.makedirs(os.path.dirname(target_file_plot), exist_ok=True) + plt.savefig(target_file_plot, bbox_inches="tight") plt.close(fig) - return target_file + return target_file_plot, target_file_results_details def run_postprocessing_tefs( @@ -284,16 +277,15 @@ def run_postprocessing_tefs( lagtarget = simulation["params"]["lagtarget"] # Plot the results - fig, ax = plt.subplots() - results.plot_te_results(ax=ax) - target_dir = os.path.join(constants.path_figures, "algorithm_results", basin_name, "te", key + ".pdf") - os.makedirs(os.path.dirname(target_dir), exist_ok=True) - plt.savefig(target_dir, bbox_inches="tight") - plt.close(fig) + # fig, ax = plt.subplots() + # results.plot_te_results(ax=ax) + # target_dir = os.path.join(constants.path_figures, "algorithm_results", basin_name, "te", key + ".pdf") + # os.makedirs(os.path.dirname(target_dir), exist_ok=True) + # plt.savefig(target_dir, bbox_inches="tight") + # plt.close(fig) # Extract the selected features selected_features_names = results.select_features(simulation["params"]["threshold"]) - basin_results[key]["selected_features"] = selected_features_names # get the r2 score on the test set inputs_names_lags = {feature: [0] for feature in selected_features_names} @@ -307,7 +299,6 @@ def run_postprocessing_tefs( if len(selected_features_names) > 0 else np.nan ) - 
basin_results[key]["score_r2"] = score_r2 inputs_names_lags = {feature: lagfeatures for feature in selected_features_names} score_r2_lag = ( @@ -320,7 +311,6 @@ def run_postprocessing_tefs( if len(selected_features_names) > 0 else np.nan ) - basin_results[key]["score_r2_lag"] = score_r2_lag inputs_names_lags = {feature: lagfeatures for feature in selected_features_names} inputs_names_lags["target"] = lagtarget @@ -330,7 +320,6 @@ def run_postprocessing_tefs( df_train=dataframe["train"], df_test=dataframe["test"], ) - basin_results[key]["score_r2_lag_ar"] = score_r2_lag_ar # Table of results results_table_te.append( @@ -347,24 +336,20 @@ def run_postprocessing_tefs( } ) - results_table_te = pd.DataFrame.from_records(results_table_te) - # Export the file to pkl - file_management.save_to_pkl_file( - os.path.join(destination_path, f"results_table_{basin_name}_te.pkl"), results_table_te - ) + target_file_results_details = os.path.join(destination_path, "results_details_te.pkl") + save_to_pkl_file(target_file_results_details, results_table_te) # Feature presences heatmap if "target" in all_basin_variables: all_basin_variables.remove("target") all_basin_variables = sorted(list(all_basin_variables)) - df_presence = pd.DataFrame(index=all_basin_variables, columns=range(len(basin_results))) + df_presence = pd.DataFrame(index=all_basin_variables, columns=range(len(results_tefs))) scores = [] scores_lag = [] scores_lag_ar = [] - for index, key in enumerate(basin_results): - simulation = basin_results[key] + for index, simulation in enumerate(results_tefs): scores.append(simulation["score_r2"]) scores_lag.append(simulation["score_r2_lag"]) scores_lag_ar.append(simulation["score_r2_lag_ar"]) @@ -388,7 +373,9 @@ def run_postprocessing_tefs( scores_values=[scores, scores_lag, scores_lag_ar], scores_labels=[r"$R^2$", r"$R^2$ (lag)", r"$R^2$ (lag + AR)"], ) - target_file = os.path.join(destination_path, "algorithm_results", basin_name, "te", "feature_presence.pdf") - 
os.makedirs(os.path.dirname(target_file), exist_ok=True) - plt.savefig(target_file, bbox_inches="tight") + target_file_plot = os.path.join(destination_path, "algorithm_results", "te", "feature_presence.pdf") + os.makedirs(os.path.dirname(target_file_plot), exist_ok=True) + plt.savefig(target_file_plot, bbox_inches="tight") plt.close(fig) + + return target_file_plot, target_file_results_details From a0af0e4c7b8d918c7736aad1a289013a6de1a558 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 21:39:15 +0100 Subject: [PATCH 16/51] Comment unused import --- hawk/analysis/postprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hawk/analysis/postprocessing.py b/hawk/analysis/postprocessing.py index 3fa2d23..98fcbec 100644 --- a/hawk/analysis/postprocessing.py +++ b/hawk/analysis/postprocessing.py @@ -6,8 +6,8 @@ import numpy as np import pandas as pd import seaborn as sns -from tigramite import plotting as tp +# from tigramite import plotting as tp from .file_management import save_to_pkl_file from .metrics import regression_analysis from .pcmci_tools import get_connected_variables From 8619f9198b904891d88710b1f3713000a5276e89 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 21:39:41 +0100 Subject: [PATCH 17/51] Fix imports using relative --- hawk/analysis/main.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/hawk/analysis/main.py b/hawk/analysis/main.py index ae890f4..74f68d2 100644 --- a/hawk/analysis/main.py +++ b/hawk/analysis/main.py @@ -4,10 +4,12 @@ from tigramite.independence_tests.cmiknn import CMIknn from tigramite.independence_tests.parcorr import ParCorr -import hawk.analysis.pcmci_tools as pcmci_tools import hawk.analysis.simulation_pcmci as simulation_pcmci import hawk.analysis.simulation_tefs as simulation_tefs -from hawk.analysis.metrics import regression_analysis + +from .metrics import regression_analysis +from .pcmci_tools import initialize_tigramite_df 
+from .postprocessing import run_postprocessing_pcmci, run_postprocessing_tefs class CausalAnalysis: @@ -25,7 +27,7 @@ def __init__( workdir, ): df_full = pd.concat([df_train, df_test], axis=1).reset_index(drop=True) - df_full_tigramite = pcmci_tools.initialize_tigramite_df(df_full) + df_full_tigramite = initialize_tigramite_df(df_full) self.datasets = { "normal": { @@ -154,7 +156,13 @@ def run_pcmci_analysis( # Define the tests parcorr = ParCorr(significance="analytic") - cmiknn = CMIknn(significance="shuffle_test", knn=0.1, shuffle_neighbors=5, transform="ranks", sig_samples=200) + cmiknn = CMIknn( + significance="shuffle_test", + knn=0.1, + shuffle_neighbors=5, + transform="ranks", + sig_samples=200, + ) # Create the dictionary of tests independence_tests = { @@ -209,4 +217,6 @@ def run(self): tefs_results = self.run_tefs_analysis() pcmci_results = self.run_pcmci_analysis() - # post-processing + # post-processing passing self.workdir + self.plot_pcmci, self.details_pcmci = run_postprocessing_pcmci(pcmci_results, self.datasets, self.workdir) + self.plot_tefs, self.details_tefs = run_postprocessing_tefs(tefs_results, self.datasets, self.workdir) From 544b7d2ace85d4946ae7b05007e196ab26f03b8f Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 21:42:02 +0100 Subject: [PATCH 18/51] Merge simulation files --- hawk/analysis/main.py | 8 ++- .../{simulation_pcmci.py => simulation.py} | 53 ++++++++++++++++++- hawk/analysis/simulation_tefs.py | 53 ------------------- 3 files changed, 55 insertions(+), 59 deletions(-) rename hawk/analysis/{simulation_pcmci.py => simulation.py} (53%) delete mode 100644 hawk/analysis/simulation_tefs.py diff --git a/hawk/analysis/main.py b/hawk/analysis/main.py index 74f68d2..1a35e8d 100644 --- a/hawk/analysis/main.py +++ b/hawk/analysis/main.py @@ -4,12 +4,10 @@ from tigramite.independence_tests.cmiknn import CMIknn from tigramite.independence_tests.parcorr import ParCorr -import hawk.analysis.simulation_pcmci as simulation_pcmci 
-import hawk.analysis.simulation_tefs as simulation_tefs - from .metrics import regression_analysis from .pcmci_tools import initialize_tigramite_df from .postprocessing import run_postprocessing_pcmci, run_postprocessing_tefs +from .simulation import run_simulation_pcmci, run_simulation_tefs class CausalAnalysis: @@ -141,7 +139,7 @@ def run_tefs_analysis( results = [] for config in configurations: results.append( - simulation_tefs.run( + run_simulation_tefs( datasets=self.datasets, config=config, ) @@ -203,7 +201,7 @@ def run_pcmci_analysis( results = [] for config in configurations: results.append( - simulation_pcmci.run( + run_simulation_pcmci( datasets=self.datasets, config=config, independence_tests=independence_tests, diff --git a/hawk/analysis/simulation_pcmci.py b/hawk/analysis/simulation.py similarity index 53% rename from hawk/analysis/simulation_pcmci.py rename to hawk/analysis/simulation.py index 04626b8..d048f28 100644 --- a/hawk/analysis/simulation_pcmci.py +++ b/hawk/analysis/simulation.py @@ -1,9 +1,10 @@ import time +from tefs import TEFS from tigramite.pcmci import PCMCI -def run( +def run_simulation_pcmci( datasets, config, independence_tests, @@ -56,3 +57,53 @@ def run( "dataset_name": dataset_name, "execution_time": execution_time, } + + +def run_simulation_tefs( + datasets, + config, + n_jobs=1, +): + params = config["params"] + dataset_name = config["dataset_name"] + dataframe = datasets[dataset_name] + + # extract the parameters + direction = params["direction"] + lagfeatures = params["lagfeatures"] + lagtarget = params["lagtarget"] + k = params["k"] + + # Construct a unique identifier for the configuration + # param_str = "_".join(f"{k}{v}" for k, v in params.items()) + # param_str = param_str.replace(" ", "") + # config_id = f"dataset{dataset_name}_{param_str}" + + features = dataframe["full"].drop(columns=["target"]) + target = dataframe["full"]["target"] + var_names = list(features.columns) + + # run the feature selection algorithm + 
start_time = time.time() + fs = TEFS( + features=features.values, + target=target.values, + k=k, + lag_features=lagfeatures, + lag_target=lagtarget, + direction=direction, + verbose=1, + var_names=var_names, + n_jobs=n_jobs, + ) + fs.fit() + end_time = time.time() + execution_time = end_time - start_time + + # Save results to the dictionary + return { + "results": fs, + "params": params, + "dataset_name": dataset_name, + "execution_time": execution_time, + } diff --git a/hawk/analysis/simulation_tefs.py b/hawk/analysis/simulation_tefs.py deleted file mode 100644 index cc72f0b..0000000 --- a/hawk/analysis/simulation_tefs.py +++ /dev/null @@ -1,53 +0,0 @@ -import time - -from tefs import TEFS - - -def run( - datasets, - config, - n_jobs=1, -): - params = config["params"] - dataset_name = config["dataset_name"] - dataframe = datasets[dataset_name] - - # extract the parameters - direction = params["direction"] - lagfeatures = params["lagfeatures"] - lagtarget = params["lagtarget"] - k = params["k"] - - # Construct a unique identifier for the configuration - # param_str = "_".join(f"{k}{v}" for k, v in params.items()) - # param_str = param_str.replace(" ", "") - # config_id = f"dataset{dataset_name}_{param_str}" - - features = dataframe["full"].drop(columns=["target"]) - target = dataframe["full"]["target"] - var_names = list(features.columns) - - # run the feature selection algorithm - start_time = time.time() - fs = TEFS( - features=features.values, - target=target.values, - k=k, - lag_features=lagfeatures, - lag_target=lagtarget, - direction=direction, - verbose=1, - var_names=var_names, - n_jobs=n_jobs, - ) - fs.fit() - end_time = time.time() - execution_time = end_time - start_time - - # Save results to the dictionary - return { - "results": fs, - "params": params, - "dataset_name": dataset_name, - "execution_time": execution_time, - } From e5e61fc450ef630779011f2963a47eb2306b08ee Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 22:03:08 +0100 
Subject: [PATCH 19/51] Add wrapper analysis --- hawk/analysis/main.py | 10 +- hawk/analysis/postprocessing.py | 168 +++++++++++++++++++++++++++++++- 2 files changed, 175 insertions(+), 3 deletions(-) diff --git a/hawk/analysis/main.py b/hawk/analysis/main.py index 1a35e8d..4ed9f07 100644 --- a/hawk/analysis/main.py +++ b/hawk/analysis/main.py @@ -6,7 +6,11 @@ from .metrics import regression_analysis from .pcmci_tools import initialize_tigramite_df -from .postprocessing import run_postprocessing_pcmci, run_postprocessing_tefs +from .postprocessing import ( + run_postprocessing_pcmci, + run_postprocessing_tefs, + run_postprocessing_tefs_wrapper, +) from .simulation import run_simulation_pcmci, run_simulation_tefs @@ -215,6 +219,8 @@ def run(self): tefs_results = self.run_tefs_analysis() pcmci_results = self.run_pcmci_analysis() - # post-processing passing self.workdir self.plot_pcmci, self.details_pcmci = run_postprocessing_pcmci(pcmci_results, self.datasets, self.workdir) self.plot_tefs, self.details_tefs = run_postprocessing_tefs(tefs_results, self.datasets, self.workdir) + self.plot_tefs_wrapper, self.details_tefs_wrapper = run_postprocessing_tefs_wrapper( + tefs_results, self.datasets, self.workdir + ) diff --git a/hawk/analysis/postprocessing.py b/hawk/analysis/postprocessing.py index 98fcbec..009e150 100644 --- a/hawk/analysis/postprocessing.py +++ b/hawk/analysis/postprocessing.py @@ -316,7 +316,7 @@ def run_postprocessing_tefs( inputs_names_lags["target"] = lagtarget score_r2_lag_ar = regression_analysis( inputs_names_lags=inputs_names_lags, - target_name="target", + target_name="target", # TODO change to use the target column name given by the user df_train=dataframe["train"], df_test=dataframe["test"], ) @@ -379,3 +379,169 @@ def run_postprocessing_tefs( plt.close(fig) return target_file_plot, target_file_results_details + + +def run_postprocessing_tefs_wrapper( + results_tefs, + datasets, + destination_path, +): + results_table_tefs_wrapper = [] + 
target_file_train_test = os.path.join(destination_path, "tefs_as_wrapper", "wrapper.pdf") + # target_file_cv = os.path.join(constants.path_figures, "tefs_as_wrapper_cv", f"{basename}_wrapper_cv.pdf") + + fig, ax = plt.subplots(figsize=(10, 5)) + + for simulation in results_tefs: + # --------------------- Load corresponding dataset --------------------- + dataset_name = simulation["dataset_name"] + dataframe = datasets[dataset_name] + + target_columns = ["target"] + features_columns = dataframe["full"].drop(columns=target_columns).columns + + # --------------------- Select features using threshold (conservative) --------------------- + # selected_features_names_with_threshold = simulation["results"].select_features(simulation["params"]["threshold"]) + # n_features_selected_with_threshold = len(selected_features_names_with_threshold) + + # --------------------- Compute test R2 for each number of features --------------------- + test_r2_train_test = [] + # test_r2_cv = [] + num_total_features = len(features_columns) + for num_features in range(0, num_total_features + 1): + if num_features == 0: + selected_features_names = [] + else: + selected_features_names = simulation["results"].select_n_features(num_features) + + lagfeatures = simulation["params"]["lagfeatures"] + lagtarget = simulation["params"]["lagtarget"] + + inputs_names_lags = {feature: lagfeatures for feature in selected_features_names} + inputs_names_lags["target"] = lagtarget + + # --- Compute the train_test version --- + test_r2_train_test.append( + regression_analysis( + inputs_names_lags=inputs_names_lags, + target_name=target_columns[0], + df_train=dataframe["train"], + df_test=dataframe["test"], + ) + ) + + # # --- Compute the cross-validation version --- + # # To perform a cross-validation, we need to concatenate the train and test sets + # unified_df = pd.concat([dataframe["train"], dataframe["test"]], axis=0).reset_index(drop=True) + + # # Fixed window size + # # n_samples = unified_df.shape[0] + 
# # n_splits = 5 + # # cv_scheme = TimeSeriesSplit( + # # n_splits=n_splits, + # # max_train_size=n_samples // (n_splits + 1), + # # ) + + # # Regular KFold + # cv_scheme = KFold(n_splits=4) # 4 splits is about using the same test set size + + # test_r2_cv.append( + # regression_analysis( + # inputs_names_lags=inputs_names_lags, + # target_name=target_columns[0], + # df=unified_df, + # cv_scheme=cv_scheme, + # ) + # ) + + test_r2_train_test = np.array(test_r2_train_test) + # test_r2_cv = np.array(test_r2_cv) + + results_table_tefs_wrapper.append({"test_r2_train_test": test_r2_train_test}) + + # Export the file to pkl + target_file_results_details = os.path.join(destination_path, "results_details_tefs_wrapper.pkl") + save_to_pkl_file(target_file_results_details, results_table_tefs_wrapper) + + param_str = "_".join(f"{k}{v}" for k, v in simulation["params"].items()) + ax.plot(test_r2_train_test, marker="o", label=param_str) + maxima = np.where(test_r2_train_test == test_r2_train_test.max())[0] + ax.plot( + maxima, + test_r2_train_test[maxima], + marker="o", + color="red", + linestyle="None", + label="Maximum", + markersize=6, + ) + # ax.plot( + # n_features_selected_with_threshold, + # test_r2_train_test[n_features_selected_with_threshold], + # marker="o", + # color="green", + # linestyle="None", + # label="TEFS (conservative)", + # markersize=10, + # ) + + ax.set_xlabel("Number of features") + ax.set_ylabel("Test $R^2$") + ax.set_title("TEFS Wrapper") + ax.legend() + if num_total_features < 30: + step = 1 + elif num_total_features < 80: + step = 5 + else: + step = 10 + ax.set_xticks(range(0, num_total_features + 1, step)) + ax.set_xticklabels(range(0, num_total_features + 1, step)) + ax.set_ylim(-0.1, 0.55) + ax.grid() + + os.makedirs(os.path.dirname(target_file_train_test), exist_ok=True) + plt.savefig(target_file_train_test, bbox_inches="tight") + plt.close(fig) + + return target_file_train_test, target_file_results_details + + # # --------------------- Plot 
cross-validation version --------------------- + # fig, ax = plt.subplots(figsize=(10, 5)) + # ax.plot(test_r2_cv.mean(axis=1), marker="o", label="Cross-validation") + # maxima = np.where(test_r2_cv.mean(axis=1) == test_r2_cv.mean(axis=1).max())[0] + # ax.plot(maxima, test_r2_cv.mean(axis=1)[maxima], marker="o", color="red", linestyle="None", label="Maximum", markersize=10) + # ax.plot(n_features_selected_with_threshold, test_r2_cv.mean(axis=1)[n_features_selected_with_threshold], marker="o", color="green", linestyle="None", label="TEFS (conservative)", markersize=10) + + # # plot confidence interval bands from cross-validation based on mean and standard deviation (90% confidence) + # alpha = 0.1 + # quantile = scipy.stats.norm.ppf(1 - alpha / 2) + # ax.fill_between(range(test_r2_cv.shape[0]), test_r2_cv.mean(axis=1) - test_r2_cv.std(axis=1) * quantile / np.sqrt(test_r2_cv.shape[1]), test_r2_cv.mean(axis=1) + test_r2_cv.std(axis=1) * quantile / np.sqrt(test_r2_cv.shape[1]), alpha=0.3) + + # ax.set_xlabel("Number of features") + # ax.set_ylabel("Test $R^2$") + + # if simulation["params"]["threshold"] == np.inf: + # threshold_text = "\infty" + # elif simulation["params"]["threshold"] == -np.inf: + # threshold_text = "-\infty" + # else: + # threshold_text = simulation["params"]["threshold"] + + # title_text = f"TEFS on basin {basin_name.upper()} with dataset {dataset_name}\n[lagfeatures $={simulation['params']['lagfeatures']}$, lagtarget $={simulation['params']['lagtarget']}$, direction = {simulation['params']['direction']}, threshold $={threshold_text}]$" + # ax.set_title(title_text) + # ax.legend() + # if num_total_features < 30: + # step = 1 + # elif num_total_features < 80: + # step = 5 + # else: + # step = 10 + # ax.set_xticks(range(0, num_total_features + 1, step)) + # ax.set_xticklabels(range(0, num_total_features + 1, step)) + # ax.set_ylim(-0.1, 0.55) + # ax.grid() + + # os.makedirs(os.path.dirname(target_file_cv), exist_ok=True) + # 
plt.savefig(target_file_cv, bbox_inches="tight") + # plt.close(fig) From 58b4afddcf197e969e56af2643c8b77f9d4e7fe3 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 22:05:17 +0100 Subject: [PATCH 20/51] Format file management util --- hawk/analysis/file_management.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hawk/analysis/file_management.py b/hawk/analysis/file_management.py index 986492f..80cf141 100644 --- a/hawk/analysis/file_management.py +++ b/hawk/analysis/file_management.py @@ -4,11 +4,10 @@ def save_to_pkl_file(target_file: str, data: Any, overwrite: bool = True) -> None: - # Check if the file already exists if os.path.exists(target_file) and not overwrite: raise ValueError(f"File {target_file} already exists.") - + # Create the directory and parent directories if they don't exist os.makedirs(os.path.dirname(target_file), exist_ok=True) @@ -16,6 +15,7 @@ def save_to_pkl_file(target_file: str, data: Any, overwrite: bool = True) -> Non with open(target_file, "wb") as f: pickle.dump(data, f) + def load_from_pkl_file(source_file: str) -> Any: # Check if the file exists if not os.path.exists(source_file): @@ -24,5 +24,5 @@ def load_from_pkl_file(source_file: str) -> Any: # Load the data from the file with open(source_file, "rb") as f: data = pickle.load(f) - - return data \ No newline at end of file + + return data From 2f9844e7237672d94f9a851d498e24010a024332 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 22:13:46 +0100 Subject: [PATCH 21/51] Format metrics file --- hawk/analysis/metrics.py | 43 +++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/hawk/analysis/metrics.py b/hawk/analysis/metrics.py index a049508..788a29e 100644 --- a/hawk/analysis/metrics.py +++ b/hawk/analysis/metrics.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Tuple import pandas as pd from sklearn.linear_model import 
LinearRegression @@ -15,11 +15,12 @@ :param target_name: The name of the target variable in the DataFrame. """ + def prepare_data_with_lags( - df: pd.DataFrame, + df: pd.DataFrame, inputs_names_lags: Dict[str, list[int]], target_name: str, -) -> pd.DataFrame: +) -> Tuple[pd.DataFrame, pd.Series]: f""" Prepares data for regression by generating lagged features for specified variables and targets. @@ -31,33 +32,33 @@ def prepare_data_with_lags( required_columns = set([*inputs_names_lags.keys(), target_name]) if not required_columns.issubset(set(df.columns)): - raise ValueError("DataFrame 'df' must contain all the columns specified in 'features_names' and 'targets_names'.") + raise ValueError( + "DataFrame 'df' must contain all the columns specified in 'features_names' and 'targets_names'." + ) for lags in inputs_names_lags.values(): if min(lags) < 0: raise ValueError("Lag for independent variables must be a non-negative integer.") - + # Initialize a list to hold all DataFrame chunks lagged_chunks = [] - + # Generate lagged inputs for the independent variables for input, lags in inputs_names_lags.items(): for lag in lags: lagged_chunk = df[input].shift(lag).to_frame(f"{input}_t-{lag}") lagged_chunks.append(lagged_chunk) - + # Adding target column lagged_chunks.append(df[target_name].to_frame(target_name)) # Concatenate chunks df_lagged = pd.concat(lagged_chunks, axis=1) - + # Dropping rows with NaN values caused by shifting df_lagged = df_lagged.dropna() - - return df_lagged.drop(columns=target_name), df_lagged[target_name] - + return df_lagged.drop(columns=target_name), df_lagged[target_name] def regression_analysis( @@ -66,7 +67,7 @@ def regression_analysis( df: Optional[pd.DataFrame] = None, cv_scheme: Optional[BaseCrossValidator] = None, df_train: Optional[pd.DataFrame] = None, - df_test: Optional[pd.DataFrame] = None + df_test: Optional[pd.DataFrame] = None, ) -> Any: f""" Performs regression analysis with support for either cross-validation or a train-test 
split, @@ -85,21 +86,27 @@ def regression_analysis( cross_val_mode = bool(df is not None and cv_scheme is not None) train_test_mode = bool(df_train is not None and df_test is not None) if not (cross_val_mode ^ train_test_mode): - raise ValueError("Specify either cross-validation with 'cv_scheme' and 'df', or a train-test split with 'df_train' and 'df_test', not both.") - + raise ValueError( + "Specify either cross-validation with 'cv_scheme' and 'df', or a train-test split with 'df_train' and 'df_test', not both." + ) + if cross_val_mode: + if df is None or cv_scheme is None: + raise ValueError("Both 'df' and 'cv_scheme' must be specified for cross-validation mode.") X, y = prepare_data_with_lags( df, inputs_names_lags, target_name, ) - + model = LinearRegression() return cross_val_score(model, X, y, cv=cv_scheme) - + elif train_test_mode: - + if df_train is None or df_test is None: + raise ValueError("Both 'df_train' and 'df_test' must be specified for train-test split mode.") + X_train, y_train = prepare_data_with_lags( df_train, inputs_names_lags, @@ -111,7 +118,7 @@ def regression_analysis( inputs_names_lags, target_name, ) - + model = LinearRegression().fit(X_train, y_train) y_pred = model.predict(X_test) return r2_score(y_test, y_pred) From 78de313f43e1f0f9deb7daf2cb0f9fd997e25523 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 22:13:56 +0100 Subject: [PATCH 22/51] Add dependencies --- environment.yml | 2 ++ requirements.txt | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 120e234..a70b750 100644 --- a/environment.yml +++ b/environment.yml @@ -13,5 +13,7 @@ dependencies: - tigramite - tefs - pandas +- scikit-learn +- numpy # tests - pytest diff --git a/requirements.txt b/requirements.txt index e0d398c..28a8093 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,6 @@ pywps>=4.5.1,<4.6 birdhouse-birdy tigramite tefs -pandas \ No newline at end of file +pandas +scikit-learn 
+numpy \ No newline at end of file From 811fb7b3639ad0175be19ab776b6502ffef4726f Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 22:17:10 +0100 Subject: [PATCH 23/51] Run formats --- hawk/analysis/metrics.py | 10 +++++----- hawk/analysis/pcmci_tools.py | 2 +- hawk/analysis/postprocessing.py | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/hawk/analysis/metrics.py b/hawk/analysis/metrics.py index 788a29e..fe56201 100644 --- a/hawk/analysis/metrics.py +++ b/hawk/analysis/metrics.py @@ -6,8 +6,8 @@ from sklearn.model_selection import BaseCrossValidator, cross_val_score inputs_names_lags_doc = """ -:param inputs_names_lags: A dictionary mapping input feature names to their corresponding list of lags. - For example, {'feature1': [1, 2], 'feature2': [1]} indicates 'feature1' should be lagged by 1 and 2 periods, +:param inputs_names_lags: A dictionary mapping input feature names to their corresponding list of lags. + For example, {'feature1': [1, 2], 'feature2': [1]} indicates 'feature1' should be lagged by 1 and 2 periods, and 'feature2' by 1 period. """ @@ -23,7 +23,7 @@ def prepare_data_with_lags( ) -> Tuple[pd.DataFrame, pd.Series]: f""" Prepares data for regression by generating lagged features for specified variables and targets. - + :param df: The pandas DataFrame containing the time series data. {inputs_names_lags_doc} {target_name_doc} @@ -72,7 +72,7 @@ def regression_analysis( f""" Performs regression analysis with support for either cross-validation or a train-test split, based on the arguments provided. - + {inputs_names_lags_doc} {target_name_doc} :param df: DataFrame for cross-validation mode. If specified, cv_scheme must also be provided. 
@@ -87,7 +87,7 @@ def regression_analysis( train_test_mode = bool(df_train is not None and df_test is not None) if not (cross_val_mode ^ train_test_mode): raise ValueError( - "Specify either cross-validation with 'cv_scheme' and 'df', or a train-test split with 'df_train' and 'df_test', not both." + "Specify either a 'cv_scheme' and 'df', or a train-test split with 'df_train' and 'df_test', not both." ) if cross_val_mode: diff --git a/hawk/analysis/pcmci_tools.py b/hawk/analysis/pcmci_tools.py index 67357c4..6ce7a7c 100644 --- a/hawk/analysis/pcmci_tools.py +++ b/hawk/analysis/pcmci_tools.py @@ -9,7 +9,7 @@ def get_connected_variables(graph: np.ndarray, var_names: list[str]) -> list[str The target is assumed to be the last variable. The connection is considered of any type: from, to, or undefined. - :param graph: the graph of the PCMCI algorithm, i.e. what's returned by PCMCI.run_pcmci(), array of shape [N, N, tau_max+1] + :param graph: the graph of the PCMCI algorithm, i.e. what's returned by PCMCI.run_pcmci() :param var_names: the names of the variables """ diff --git a/hawk/analysis/postprocessing.py b/hawk/analysis/postprocessing.py index 009e150..8f6037f 100644 --- a/hawk/analysis/postprocessing.py +++ b/hawk/analysis/postprocessing.py @@ -401,8 +401,8 @@ def run_postprocessing_tefs_wrapper( features_columns = dataframe["full"].drop(columns=target_columns).columns # --------------------- Select features using threshold (conservative) --------------------- - # selected_features_names_with_threshold = simulation["results"].select_features(simulation["params"]["threshold"]) - # n_features_selected_with_threshold = len(selected_features_names_with_threshold) + # selected_features_names_with_threshold = simulation["results"].select_features(simulation["params"]["threshold"]) # noqa + # n_features_selected_with_threshold = len(selected_features_names_with_threshold) # noqa # --------------------- Compute test R2 for each number of features --------------------- 
test_r2_train_test = [] @@ -510,13 +510,13 @@ def run_postprocessing_tefs_wrapper( # fig, ax = plt.subplots(figsize=(10, 5)) # ax.plot(test_r2_cv.mean(axis=1), marker="o", label="Cross-validation") # maxima = np.where(test_r2_cv.mean(axis=1) == test_r2_cv.mean(axis=1).max())[0] - # ax.plot(maxima, test_r2_cv.mean(axis=1)[maxima], marker="o", color="red", linestyle="None", label="Maximum", markersize=10) - # ax.plot(n_features_selected_with_threshold, test_r2_cv.mean(axis=1)[n_features_selected_with_threshold], marker="o", color="green", linestyle="None", label="TEFS (conservative)", markersize=10) + # ax.plot(maxima, test_r2_cv.mean(axis=1)[maxima], marker="o", color="red", linestyle="None", label="Maximum", markersize=10) # noqa + # ax.plot(n_features_selected_with_threshold, test_r2_cv.mean(axis=1)[n_features_selected_with_threshold], marker="o", color="green", linestyle="None", label="TEFS (conservative)", markersize=10) # noqa # # plot confidence interval bands from cross-validation based on mean and standard deviation (90% confidence) # alpha = 0.1 # quantile = scipy.stats.norm.ppf(1 - alpha / 2) - # ax.fill_between(range(test_r2_cv.shape[0]), test_r2_cv.mean(axis=1) - test_r2_cv.std(axis=1) * quantile / np.sqrt(test_r2_cv.shape[1]), test_r2_cv.mean(axis=1) + test_r2_cv.std(axis=1) * quantile / np.sqrt(test_r2_cv.shape[1]), alpha=0.3) + # ax.fill_between(range(test_r2_cv.shape[0]), test_r2_cv.mean(axis=1) - test_r2_cv.std(axis=1) * quantile / np.sqrt(test_r2_cv.shape[1]), test_r2_cv.mean(axis=1) + test_r2_cv.std(axis=1) * quantile / np.sqrt(test_r2_cv.shape[1]), alpha=0.3) # noqa # ax.set_xlabel("Number of features") # ax.set_ylabel("Test $R^2$") @@ -528,7 +528,7 @@ def run_postprocessing_tefs_wrapper( # else: # threshold_text = simulation["params"]["threshold"] - # title_text = f"TEFS on basin {basin_name.upper()} with dataset {dataset_name}\n[lagfeatures $={simulation['params']['lagfeatures']}$, lagtarget $={simulation['params']['lagtarget']}$, direction = 
{simulation['params']['direction']}, threshold $={threshold_text}]$" + # title_text = f"TEFS on basin {basin_name.upper()} with dataset {dataset_name}\n[lagfeatures $={simulation['params']['lagfeatures']}$, lagtarget $={simulation['params']['lagtarget']}$, direction = {simulation['params']['direction']}, threshold $={threshold_text}]$" # noqa # ax.set_title(title_text) # ax.legend() # if num_total_features < 30: From 38e1bdcfef3d00a78510ec52de9363546cbecd82 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 22:17:48 +0100 Subject: [PATCH 24/51] Format wps --- hawk/processes/wps_causal.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/hawk/processes/wps_causal.py b/hawk/processes/wps_causal.py index 193bea6..d66b020 100644 --- a/hawk/processes/wps_causal.py +++ b/hawk/processes/wps_causal.py @@ -1,4 +1,4 @@ -from pywps import Process, LiteralInput, LiteralOutput, UOM, ComplexInput, ComplexOutput +from pywps import Process, LiteralInput, LiteralOutput, UOM, ComplexInput, ComplexOutput # noqa from pywps.app.Common import Metadata from pywps import FORMATS, Format from pathlib import Path @@ -11,6 +11,7 @@ FORMAT_PNG = Format("image/png", extension=".png", encoding="base64") FORMAT_PICKLE = Format("application/octet-stream", extension=".pkl", encoding="utf-8") + class Causal(Process): """A nice process saying 'hello'.""" @@ -157,23 +158,19 @@ def __init__(self): as_reference=True, supported_formats=[FORMAT_PICKLE], ), - ] super(Causal, self).__init__( self._handler, identifier="causal", title="Causal Analysis", - abstract="Just says a friendly Hello." - "Returns a literal string output with Hello plus the inputed name.", + abstract="Just says a friendly Hello. 
Returns a literal string output with Hello plus the inputed name.", keywords=["hello", "demo"], metadata=[ Metadata("PyWPS", "https://pywps.org/"), Metadata("Birdhouse", "http://bird-house.github.io/"), Metadata("PyWPS Demo", "https://pywps-demo.readthedocs.io/en/latest/"), - Metadata( - "Emu: PyWPS examples", "https://emu.readthedocs.io/en/latest/" - ), + Metadata("Emu: PyWPS examples", "https://emu.readthedocs.io/en/latest/"), ], version="1.5", inputs=inputs, From 74385a92f90f19ebb5dbda554f043e35f04fd5b4 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 22:38:15 +0100 Subject: [PATCH 25/51] Removing Python 3.11 and 3.12 since not yet supported by tigramite --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b9fd5f1..39435d4 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10"] steps: - name: Checkout repository and submodules uses: actions/checkout@v4 From 17524793c69c09544fcb1fa0bb7f32b7b7a75cf5 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Mon, 25 Mar 2024 23:11:01 +0100 Subject: [PATCH 26/51] Add fake df and lint --- hawk/processes/simulation_interactive.py | 32 ++++++++++++++++++------ 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/hawk/processes/simulation_interactive.py b/hawk/processes/simulation_interactive.py index 7042bb3..f4882cd 100644 --- a/hawk/processes/simulation_interactive.py +++ b/hawk/processes/simulation_interactive.py @@ -1,19 +1,35 @@ import numpy as np import pandas as pd from birdy import WPSClient -#from keras import models +# from keras import models -url = "http://localhost:5000/wps" -wps = WPSClient(url, verify=False) -help(wps) +np.random.seed(0) +n = 1000 # number of samples +m = 15 # number of features -resp = wps.hello(name="Pluto") 
-print(resp) -resp.get() +data = {} +for i in range(1, m + 1): + data[f"x{i}"] = np.random.normal(size=n) + +data["y"] = sum(data.values()) + np.random.normal(size=n) + +data = pd.DataFrame(data) +n_test = int(0.20 * n) +n_train = n - n_test +data_test = data[n_train:] +data = data[:n_train] + +data.head() + +target_name = "y" + +url = "http://localhost:5000/wps" +wps = WPSClient(url, verify=False) +help(wps) -resp = wps.cyclone(start_day="2019-01-04", end_day="2019-01-06", area="Sindian") +resp = wps.causal() print(resp) resp.get() From c113befbd93c2052a64f35082f7f8e99b1ea9af0 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Tue, 26 Mar 2024 16:56:50 +0100 Subject: [PATCH 27/51] Delete notebook --- .../run_postprocessing_followup.ipynb | 4860 ----------------- 1 file changed, 4860 deletions(-) delete mode 100644 hawk/analysis/run_postprocessing_followup.ipynb diff --git a/hawk/analysis/run_postprocessing_followup.ipynb b/hawk/analysis/run_postprocessing_followup.ipynb deleted file mode 100644 index 72dd614..0000000 --- a/hawk/analysis/run_postprocessing_followup.ipynb +++ /dev/null @@ -1,4860 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "3584ce0f", - "metadata": {}, - "source": [ - "# [Post-processing of results (second part)](#toc0_)" - ] - }, - { - "cell_type": "markdown", - "id": "8b66567c", - "metadata": {}, - "source": [ - "**Table of contents** \n", - "- [Post-processing of results (second part)](#toc1_) \n", - " - [Preliminaries](#toc1_1_) \n", - " - [Import libraries](#toc1_1_1_) \n", - " - [Utility functions](#toc1_1_2_) \n", - " - [Utilities for the summarized version](#toc1_1_3_) \n", - " - [Utilities for the full version](#toc1_1_4_) \n", - " - [Load the results](#toc1_1_5_) \n", - " - [Basin: E12GM](#toc1_2_) \n", - " - [Versione full](#toc1_2_1_) \n", - " - [PCMCI](#toc1_2_1_1_) \n", - " - [TEFS](#toc1_2_1_2_) \n", - " - [Versione summarized](#toc1_2_2_) \n", - " - [PCMCI](#toc1_2_2_1_) \n", - " - [TEFS](#toc1_2_2_2_) \n", - " - 
[Versione full senza CMI](#toc1_2_3_) \n", - " - [TEFS](#toc1_2_3_1_) \n", - " - [TEFS as wrapper on E12GM](#toc1_2_4_) \n", - " - [Linking the wrapper to the original filter method](#toc1_2_5_) \n", - " - [Basin: Ticino](#toc1_3_) \n", - " - [Versione full](#toc1_3_1_) \n", - " - [PCMCI](#toc1_3_1_1_) \n", - " - [TEFS](#toc1_3_1_2_) \n", - "\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "id": "30e07d51", - "metadata": {}, - "source": [ - "## [Preliminaries](#toc0_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### [Import libraries](#toc0_)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "24d19ad5", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "os.chdir(os.path.dirname(os.path.abspath(__file__)))\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "import pandas as pd\n", - "import scipy\n", - "import thesis.constants as constants\n", - "import thesis.file_management as file_management\n", - "from sklearn.model_selection import (\n", - " KFold,\n", - " TimeSeriesSplit,\n", - ")\n", - "from tefs.metrics import regression_analysis\n", - "from thesis import datasets_and_configurations_loaders" - ] - }, - { - "cell_type": "markdown", - "id": "d46401c9", - "metadata": {}, - "source": [ - "Set the retina resolution" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "b047fdf8", - "metadata": {}, - "outputs": [], - "source": [ - "%config InlineBackend.figure_format = 'retina'" - ] - }, - { - "cell_type": "markdown", - "id": "2f374adf", - "metadata": {}, - "source": [ - "Enable the use of LaTeX for plots." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "9be4a454", - "metadata": {}, - "outputs": [], - "source": [ - "plt.rc(\"text\", usetex=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### [Utility functions](#toc0_)\n", - "\n", - "General purpose formatter functions, valid for all stylers." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e4b8f2ce", - "metadata": {}, - "outputs": [], - "source": [ - "def makecell_code_formatter(x):\n", - " \"\"\"\n", - " Format a string to be used in a LaTeX table cell. Specifically, given a list of names, it will format them as a\n", - " single column with each name in a separate row and in a monospaced font.\n", - " \"\"\"\n", - " elements = x.split(\" \")\n", - " formatted_elements = [f\"\\\\texttt{{{element}}}\" for element in elements]\n", - " return \"\\\\makecell[l]{\" + \"\\\\\\\\ \".join(formatted_elements) + \"}\"\n", - "\n", - "\n", - "def format_time(seconds):\n", - " \"\"\"\n", - " Format a time in seconds to a human-readable format.\n", - " \"\"\"\n", - " return f\"{seconds:.3f}s\"\n", - "\n", - "\n", - "def highlight_row(s, row_indexes, color):\n", - " \"\"\"\n", - " Highlight the given row indexes of Series s with the given color.\n", - " \"\"\"\n", - " if \"test\" in s.name.lower():\n", - " return [\"\" for _ in s] # No styling for columns with \"test\" in their name\n", - " return [\"background-color: \" + color if i in row_indexes else \"\" for i in range(len(s))]\n", - "\n", - "\n", - "def color_direction(v):\n", - " \"\"\"\n", - " Color the text of a cell according to the direction of the value.\n", - " \"\"\"\n", - " color = \"black\"\n", - " if v == \"backward\":\n", - " color = \"red\"\n", - " elif v == \"forward\":\n", - " color = \"blue\"\n", - " return f\"color: {color}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### [Utilities for the summarized version](#toc0_)\n", - "\n", - "Some utilities are 
specific to the summarized version of the table of results." - ] - }, - { - "cell_type": "markdown", - "id": "d8369205", - "metadata": {}, - "source": [ - "For the PCMCI version." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "5b725bd8", - "metadata": {}, - "outputs": [], - "source": [ - "def make_pcmci_pretty(styler):\n", - " styler.format(subset=[\"score_r2_lag\", \"score_r2_lag_ar\"], precision=3)\n", - " styler.background_gradient(cmap=\"Greens\", subset=[\"score_r2_lag\", \"score_r2_lag_ar\"], vmin=0, vmax=0.5)\n", - " styler.format(subset=[\"execution_time\"], precision=2)\n", - " # styler.format(formatter=format_time, subset=[\"execution_time\"])\n", - " styler.background_gradient(cmap=\"Reds\", subset=[\"execution_time\"], vmax=8000)\n", - " return styler\n", - "\n", - "\n", - "from pandas.io.formats.style_render import _escape_latex\n", - "\n", - "# I lost too much time trying to figure out why I can't format the \"names\" of the indexes (to escape them)\n", - "# The guy who wrote it didn't allow for this possibility, but thankfully (https://stackoverflow.com/questions/72716879/is-there-a-function-to-format-the-index-name-in-a-pandas-styler-dataframe-style)\n", - "# he proposed a workaround and opened a github issue (https://github.com/pandas-dev/pandas/issues/47489)\n", - "# But he didn't realize that this doesn't work on multi-indexes, so I had to modify his code a bit\n", - "\n", - "\n", - "def export_pcmci_df_to_latex(df, target_file, code_escaped_columns=[]):\n", - " temp_df = df.copy()\n", - " for level in range(temp_df.index.nlevels):\n", - " if temp_df.index.get_level_values(level).name is not None:\n", - " temp_df.index.set_names(_escape_latex(temp_df.index.get_level_values(level).name), level=level, inplace=True)\n", - " for level in range(temp_df.columns.nlevels):\n", - " if temp_df.columns.get_level_values(level).name is not None:\n", - " 
temp_df.columns.set_names(_escape_latex(temp_df.columns.get_level_values(level).name), level=level, inplace=True)\n", - "\n", - " with open(target_file, \"w\") as f:\n", - " f.write(\n", - " temp_df.style.pipe(make_pcmci_pretty)\n", - " .format_index(escape=\"latex\", axis=\"index\")\n", - " .format_index(escape=\"latex\", axis=\"columns\")\n", - " .format(formatter=makecell_code_formatter, escape=\"latex\", subset=code_escaped_columns)\n", - " .to_latex(hrules=True, clines=\"all;index\", convert_css=True, column_format=\"cclccccrr\")\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "580cc079", - "metadata": {}, - "source": [ - "For the TEFS version." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "3406e415", - "metadata": {}, - "outputs": [], - "source": [ - "def make_te_pretty(styler):\n", - " styler.format(subset=[\"score_r2_lag\", \"score_r2_lag_ar\"], precision=3)\n", - " styler.background_gradient(cmap=\"Greens\", subset=[\"score_r2_lag\", \"score_r2_lag_ar\"], vmin=0, vmax=0.5)\n", - " styler.format(subset=[\"execution_time\"], precision=2)\n", - " styler.background_gradient(cmap=\"Reds\", subset=[\"execution_time\"], vmax=8000)\n", - " return styler\n", - "\n", - "\n", - "def export_te_df_to_latex(df, target_file, code_escaped_columns=[]):\n", - " temp_df = df.copy()\n", - " for level in range(temp_df.index.nlevels):\n", - " if temp_df.index.get_level_values(level).name is not None:\n", - " temp_df.index.set_names(_escape_latex(temp_df.index.get_level_values(level).name), level=level, inplace=True)\n", - " for level in range(temp_df.columns.nlevels):\n", - " if temp_df.columns.get_level_values(level).name is not None:\n", - " temp_df.columns.set_names(_escape_latex(temp_df.columns.get_level_values(level).name), level=level, inplace=True)\n", - "\n", - " with open(target_file, \"w\") as f:\n", - " f.write(\n", - " temp_df.style.pipe(make_te_pretty)\n", - " .format_index(escape=\"latex\", axis=\"index\")\n", - " 
.format_index(escape=\"latex\", axis=\"columns\")\n", - " .format(formatter=makecell_code_formatter, escape=\"latex\", subset=code_escaped_columns)\n", - " .to_latex(hrules=True, clines=\"all;index\", convert_css=True, column_format=\"cclccccrr\")\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### [Utilities for the full version](#toc0_)\n", - "\n", - "Some utilities are specific to the full version of the table of results." - ] - }, - { - "cell_type": "markdown", - "id": "f5c023e4", - "metadata": {}, - "source": [ - "For the PCMCI version." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "0ec91349", - "metadata": {}, - "outputs": [], - "source": [ - "def make_pcmci_all_pretty(styler):\n", - " styler.format(subset=[\"score_r2\", \"score_r2_lag\", \"score_r2_lag_ar\"], precision=3)\n", - " styler.background_gradient(cmap=\"Greens\", subset=[\"score_r2\", \"score_r2_lag\", \"score_r2_lag_ar\"], vmin=0, vmax=0.5)\n", - " styler.format(formatter=format_time, subset=[\"execution_time\"])\n", - " styler.background_gradient(cmap=\"Reds\", subset=[\"execution_time\"], vmax=8000)\n", - " return styler\n", - "\n", - "\n", - "def export_results_dataframe_pcmci(df: pd.DataFrame, target_file: str):\n", - " with open(target_file, \"w\") as f:\n", - " f.write(\n", - " df.style.pipe(make_pcmci_all_pretty)\n", - " .format_index(escape=\"latex\", axis=1)\n", - " .format(formatter=makecell_code_formatter, escape=\"latex\", subset=[\"selected_features\", \"dataset\", \"algorithm\", \"independencetest\"])\n", - " .to_latex(hrules=True, clines=\"all;data\", convert_css=True, column_format=\"llccclllcr\")\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "e563bd7f", - "metadata": {}, - "source": [ - "For the TEFS version." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "ece2aa88", - "metadata": {}, - "outputs": [], - "source": [ - "# https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.html\n", - "# https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.format.html\n", - "# https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.to_latex.html\n", - "# https://www.youtube.com/watch?v=JGefS6WPm1E\n", - "# https://tex.stackexchange.com/questions/2441/how-to-add-a-forced-line-break-inside-a-table-cell\n", - "def make_te_all_pretty(styler):\n", - " styler.format(subset=[\"score_r2\", \"score_r2_lag\", \"score_r2_lag_ar\"], precision=3)\n", - " styler.background_gradient(cmap=\"Greens\", subset=[\"score_r2\", \"score_r2_lag\", \"score_r2_lag_ar\"], vmin=0, vmax=0.5)\n", - " #styler.map(color_direction, subset=[\"direction\"])\n", - " styler.format(formatter=format_time, subset=[\"execution_time\"])\n", - " styler.background_gradient(cmap=\"Reds\", subset=[\"execution_time\"], vmax=8000)\n", - " # styler.apply(highlight_row, row_indexes=[5,7], color='yellow', axis=0)\n", - " return styler\n", - "\n", - "\n", - "def export_results_dataframe_te(df: pd.DataFrame, target_file: str):\n", - " with open(target_file, \"w\") as f:\n", - " f.write(\n", - " df.style.pipe(make_te_all_pretty)\n", - " .format_index(escape=\"latex\", axis=1)\n", - " .format(formatter=makecell_code_formatter, escape=\"latex\", subset=[\"selected_features\", \"dataset\", \"direction\"])\n", - " .to_latex(hrules=True, clines=\"all;data\", convert_css=True, column_format=\"llccclcclr\")\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### [Load the results](#toc0_)\n", - "\n", - "Load the previously exported pandas dataframes containing the results of the analysis." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "9aae6d86", - "metadata": {}, - "outputs": [], - "source": [ - "results_e12gm_pcmci = file_management.load_from_pkl_file(os.path.join(constants.path_table_objects, \"results_table_e12gm_pcmci.pkl\"))\n", - "results_e12gm_te = file_management.load_from_pkl_file(os.path.join(constants.path_table_objects, \"results_table_e12gm_te.pkl\"))\n", - "results_ticino_pcmci = file_management.load_from_pkl_file(os.path.join(constants.path_table_objects, \"results_table_ticino_pcmci.pkl\"))\n", - "results_ticino_te = file_management.load_from_pkl_file(os.path.join(constants.path_table_objects, \"results_table_ticino_te.pkl\"))\n", - "\n", - "results_e12gm_noCMI_te = file_management.load_from_pkl_file(os.path.join(constants.path_table_objects, \"results_table_e12gm_noCMI_te.pkl\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "7377b8af", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Emiliani1Emiliani2GardaMincioTicino
solo (lag=0)0.2860710.2435340.1713070.154807
solo (lag=0,1)0.2840430.3061110.1919330.177646
solo + extended (lag=0)0.3444480.2924310.1854930.162634
solo + extended (lag=0,1)0.3732610.2927330.1695520.167326
ar(1)0.2752320.2920350.2896530.199807
ar(1) + solo (lag=0,1)0.4249010.4325530.4377950.331817
ar(1) + solo + extended (lag=0,1)0.4687670.4304170.4136160.330220
\n", - "
" - ], - "text/plain": [ - " Emiliani1 Emiliani2 GardaMincio Ticino\n", - "solo (lag=0) 0.286071 0.243534 0.171307 0.154807\n", - "solo (lag=0,1) 0.284043 0.306111 0.191933 0.177646\n", - "solo + extended (lag=0) 0.344448 0.292431 0.185493 0.162634\n", - "solo + extended (lag=0,1) 0.373261 0.292733 0.169552 0.167326\n", - "ar(1) 0.275232 0.292035 0.289653 0.199807\n", - "ar(1) + solo (lag=0,1) 0.424901 0.432553 0.437795 0.331817\n", - "ar(1) + solo + extended (lag=0,1) 0.468767 0.430417 0.413616 0.330220" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from scripts.run_benchmark import baseline\n", - "\n", - "baseline" - ] - }, - { - "cell_type": "markdown", - "id": "6d0bfac1", - "metadata": {}, - "source": [ - "## [Basin: Ticino](#toc0_)" - ] - }, - { - "cell_type": "markdown", - "id": "6477aa9b", - "metadata": {}, - "source": [ - "### [Full version](#toc0_)" - ] - }, - { - "cell_type": "markdown", - "id": "7223c7a0", - "metadata": {}, - "source": [ - "#### [PCMCI](#toc0_)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "1afbe3b5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 selected_featuresscore_r2score_r2_lagscore_r2_lag_ardatasetalgorithmindependencetestlagexecution_time
0cyclostationary_mean_tg_0 cyclostationary_mean_rr_4w_00.1490.1490.149normalpcmci_pluscmiknn032.662s
1cyclostationary_mean_tg_0 cyclostationary_mean_rr_4w_00.1490.1490.149normalpcmci_plusparcorr00.018s
2cyclostationary_mean_tg_00.1380.1880.331normalpcmci_pluscmiknn1112.996s
3cyclostationary_mean_tg_00.1380.1880.331normalpcmci_plusparcorr10.069s
4cyclostationary_mean_HS_0 cyclostationary_mean_tg_0 cyclostationary_mean_rr_4w_00.1490.1490.149snowlakespcmci_pluscmiknn0264.168s
5cyclostationary_mean_tg_0 cyclostationary_mean_rr_4w_00.1490.1490.149snowlakespcmci_plusparcorr00.146s
6cyclostationary_mean_tg_00.1380.1880.331snowlakespcmci_pluscmiknn1347.146s
7cyclostationary_mean_tg_00.1380.1880.331snowlakespcmci_plusparcorr10.285s
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_file = os.path.join(constants.path_table_tex, \"ticino_pcmci_full.tex\")\n", - "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", - "export_results_dataframe_pcmci(results_ticino_pcmci, target_file)\n", - "results_ticino_pcmci.style.pipe(make_pcmci_all_pretty)" - ] - }, - { - "cell_type": "markdown", - "id": "81df3268", - "metadata": {}, - "source": [ - "#### [TEFS](#toc0_)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "62939c64", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 selected_featuresscore_r2score_r2_lagscore_r2_lag_ardatasetlagfeatureslagtargetdirectionexecution_time
0cyclostationary_mean_tg_00.1380.1380.346normal[0][1]backward0.401s
1cyclostationary_mean_tg_00.1380.1380.346normal[0][1]forward0.372s
2cyclostationary_mean_tg_00.1380.1880.331normal[0, 1][1]backward0.415s
3cyclostationary_mean_tg_00.1380.1880.331normal[0, 1][1]forward0.388s
4cyclostat_level_Lugano cyclostationary_mean_tg_00.1940.1940.360snowlakes[0][1]backward1.650s
5cyclostat_level_Lugano cyclostationary_mean_tg_00.1940.1940.360snowlakes[0][1]forward1.612s
6cyclostationary_mean_HS_0 cyclostationary_mean_tg_20.0670.0830.309snowlakes[0, 1][1]backward1.777s
7cyclostat_level_Lugano cyclostationary_mean_tg_00.1940.2380.345snowlakes[0, 1][1]forward1.499s
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_file = os.path.join(constants.path_table_tex, \"ticino_te_full.tex\")\n", - "export_results_dataframe_te(results_ticino_te, target_file)\n", - "results_ticino_te.style.pipe(make_te_all_pretty)" - ] - }, - { - "cell_type": "markdown", - "id": "f465bea2", - "metadata": {}, - "source": [ - "## [Basin: E12GM](#toc0_)" - ] - }, - { - "cell_type": "markdown", - "id": "e78c4399", - "metadata": {}, - "source": [ - "### [Full version](#toc0_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### [PCMCI](#toc0_)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "48873e13", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 selected_featuresscore_r2score_r2_lagscore_r2_lag_ardatasetalgorithmindependencetestlagexecution_time
0E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_0 E1cyclostationary_mean_rr_1w_16 E1cyclostationary_mean_rr_24w_20.2860.2860.286df_E1pcmci_pluscmiknn0384.152s
1E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_0 E1cyclostationary_mean_rr_24w_20.2890.2890.289df_E1pcmci_plusparcorr00.046s
2E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_0 E1cyclostationary_mean_rr_1w_160.2510.2550.420df_E1pcmci_pluscmiknn11308.914s
3E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_00.2570.2650.421df_E1pcmci_plusparcorr10.274s
4E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_rr_1w_16 E1cyclostationary_mean_rr_24w_20.1810.1810.181df_E1allfeaturespcmci_pluscmiknn05532.618s
5E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_rr_24w_2 E2cyclostationary_mean_tg_00.2540.2540.254df_E1allfeaturespcmci_plusparcorr00.678s
6E1cyclostationary_mean_rr_4w_10.1510.1480.368df_E1allfeaturespcmci_pluscmiknn17726.129s
7E1cyclostationary_mean_rr_4w_10.1510.1480.368df_E1allfeaturespcmci_plusparcorr110.819s
8E2cyclostationary_mean_tg_0 E2cyclostationary_mean_rr_8w_0 E2cyclostationary_mean_rr_4w_50.2300.2300.230df_E2pcmci_pluscmiknn0810.721s
9E2cyclostationary_mean_tg_0 E2cyclostationary_mean_rr_4w_50.2220.2220.222df_E2pcmci_plusparcorr00.058s
10E2cyclostationary_mean_tg_0 E2cyclostationary_mean_rr_4w_50.2220.2390.416df_E2pcmci_pluscmiknn11596.577s
11E2cyclostationary_mean_tg_0 E2cyclostationary_mean_rr_8w_0 E2cyclostationary_mean_rr_4w_50.2300.2480.402df_E2pcmci_plusparcorr10.315s
12E1cyclostationary_mean_rr_24w_2 E2cyclostationary_mean_tg_0 GMcyclostationary_mean_rr_4w_10.2980.2980.298df_E2allfeaturespcmci_pluscmiknn05081.932s
13E1cyclostationary_mean_rr_4w_1 E2cyclostationary_mean_tg_00.1910.1910.191df_E2allfeaturespcmci_plusparcorr00.341s
14E2cyclostationary_mean_tg_00.1120.1460.391df_E2allfeaturespcmci_pluscmiknn15268.379s
15E2cyclostationary_mean_tg_00.1120.1460.391df_E2allfeaturespcmci_plusparcorr18.094s
16GMcyclostationary_mean_tg_1w_0 GMcyclostationary_mean_rr_4w_10.1770.1770.177df_GMpcmci_pluscmiknn087.161s
17GMcyclostationary_mean_tg_1w_0 GMcyclostationary_mean_rr_4w_10.1770.1770.177df_GMpcmci_plusparcorr00.016s
18GMcyclostationary_mean_tg_1w_0 GMcyclostationary_mean_rr_4w_10.1770.2030.443df_GMpcmci_pluscmiknn1504.896s
19GMcyclostationary_mean_tg_1w_0 GMcyclostationary_mean_rr_4w_10.1770.2030.443df_GMpcmci_plusparcorr10.089s
20E1cyclostationary_mean_rr_1w_16 E1cyclostationary_mean_rr_24w_2 E2cyclostationary_mean_tg_00.0880.0880.088df_GMallfeaturespcmci_pluscmiknn01826.917s
21E1cyclostationary_mean_rr_4w_1 E2cyclostationary_mean_tg_00.1100.1100.110df_GMallfeaturespcmci_plusparcorr00.345s
22E1cyclostationary_mean_rr_4w_1 E2cyclostationary_mean_tg_00.1100.1200.406df_GMallfeaturespcmci_pluscmiknn14688.385s
23E1cyclostationary_mean_rr_4w_1 E2cyclostationary_mean_tg_00.1100.1200.406df_GMallfeaturespcmci_plusparcorr19.835s
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_file = os.path.join(constants.path_table_tex, \"e12gm_pcmci_full.tex\")\n", - "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", - "export_results_dataframe_pcmci(results_e12gm_pcmci, target_file)\n", - "results_e12gm_pcmci.style.pipe(make_pcmci_all_pretty)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### [TEFS](#toc0_)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "0dcb9a0b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 selected_featuresscore_r2score_r2_lagscore_r2_lag_ardatasetlagfeatureslagtargetdirectionexecution_time
0E1cyclostationary_mean_rr_1w_16 E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_00.2510.2510.457df_E1[0][1]backward0.688s
1E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_0 E1cyclostationary_mean_rr_1w_160.2510.2510.457df_E1[0][1]forward0.633s
2E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_00.2570.2650.421df_E1[0, 1][1]backward0.757s
3E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_00.2570.2650.421df_E1[0, 1][1]forward0.659s
4GMcyclostationary_mean_tg_1w_0 E2cyclostationary_mean_tg_0 E1cyclostationary_mean_rr_4w_10.2310.2310.405df_E1allfeatures[0][1]backward6.806s
5E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_0 E1cyclostationary_mean_rr_1w_160.2510.2510.457df_E1allfeatures[0][1]forward6.430s
6E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_00.2570.2650.421df_E1allfeatures[0, 1][1]backward7.315s
7E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_00.2570.2650.421df_E1allfeatures[0, 1][1]forward6.634s
8E2cyclostationary_mean_rr_8w_0 E2cyclostationary_mean_tg_00.1900.1900.423df_E2[0][1]backward1.123s
9E2cyclostationary_mean_tg_0 E2cyclostationary_mean_rr_8w_00.1900.1900.423df_E2[0][1]forward0.999s
10E2cyclostationary_mean_tg_00.1120.1460.391df_E2[0, 1][1]backward1.220s
11E2cyclostationary_mean_tg_00.1120.1460.391df_E2[0, 1][1]forward1.053s
12E2cyclostationary_mean_rr_8w_0 E2cyclostationary_mean_tg_00.1900.1900.423df_E2allfeatures[0][1]backward7.055s
13E2cyclostationary_mean_tg_0 E2cyclostationary_mean_rr_8w_00.1900.1900.423df_E2allfeatures[0][1]forward5.996s
14E2cyclostationary_mean_tg_00.1120.1460.391df_E2allfeatures[0, 1][1]backward7.272s
15GMcyclostationary_mean_tg_1w_00.1340.1460.422df_E2allfeatures[0, 1][1]forward6.608s
16GMcyclostationary_mean_rr_4w_1 GMcyclostationary_mean_tg_1w_00.1770.1770.359df_GM[0][1]backward0.357s
17GMcyclostationary_mean_tg_1w_0 GMcyclostationary_mean_rr_4w_10.1770.1770.359df_GM[0][1]forward0.350s
18GMcyclostationary_mean_tg_1w_00.0340.0500.411df_GM[0, 1][1]backward0.399s
19GMcyclostationary_mean_tg_1w_00.0340.0500.411df_GM[0, 1][1]forward0.356s
20E2cyclostationary_mean_tg_4w_0 GMcyclostationary_mean_rr_4w_1 E2cyclostationary_mean_tg_00.1580.1580.457df_GMallfeatures[0][1]backward6.943s
21E1cyclostationary_mean_rr_1w_16 E2cyclostationary_mean_tg_00.0130.0130.392df_GMallfeatures[0][1]forward6.280s
22E2cyclostationary_mean_rr_8w_0 E2cyclostationary_mean_tg_00.0640.0850.377df_GMallfeatures[0, 1][1]backward7.320s
23GMcyclostationary_mean_tg_1w_00.0340.0500.411df_GMallfeatures[0, 1][1]forward6.558s
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_file = os.path.join(constants.path_table_tex, \"e12gm_te_full.tex\")\n", - "export_results_dataframe_te(results_e12gm_te, target_file)\n", - "results_e12gm_te.style.pipe(make_te_all_pretty)" - ] - }, - { - "cell_type": "markdown", - "id": "b0494047", - "metadata": {}, - "source": [ - "### [Summarized version](#toc0_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### [PCMCI](#toc0_)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "ab8ddcde", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
selected_featuresscore_r2score_r2_lagscore_r2_lag_ardatasetalgorithmindependencetestlagexecution_time
0E1cyclostationary_mean_rr_4w_1 E1cyclostationa...0.2860710.2860710.286071df_E1pcmci_pluscmiknn0384.151514
1E1cyclostationary_mean_rr_4w_1 E1cyclostationa...0.2892500.2892500.289250df_E1pcmci_plusparcorr00.045593
2E1cyclostationary_mean_rr_4w_1 E1cyclostationa...0.2510600.2554660.419614df_E1pcmci_pluscmiknn11308.913636
3E1cyclostationary_mean_rr_4w_1 E1cyclostationa...0.2566390.2651900.420932df_E1pcmci_plusparcorr10.274470
4E1cyclostationary_mean_rr_4w_1 E1cyclostationa...0.1814240.1814240.181424df_E1allfeaturespcmci_pluscmiknn05532.617838
5E1cyclostationary_mean_rr_4w_1 E1cyclostationa...0.2542810.2542810.254281df_E1allfeaturespcmci_plusparcorr00.678409
6E1cyclostationary_mean_rr_4w_10.1505580.1476860.367535df_E1allfeaturespcmci_pluscmiknn17726.128555
7E1cyclostationary_mean_rr_4w_10.1505580.1476860.367535df_E1allfeaturespcmci_plusparcorr110.818769
8E2cyclostationary_mean_tg_0 E2cyclostationary_...0.2303890.2303890.230389df_E2pcmci_pluscmiknn0810.720993
9E2cyclostationary_mean_tg_0 E2cyclostationary_...0.2215220.2215220.221522df_E2pcmci_plusparcorr00.057513
10E2cyclostationary_mean_tg_0 E2cyclostationary_...0.2215220.2391270.415827df_E2pcmci_pluscmiknn11596.576730
11E2cyclostationary_mean_tg_0 E2cyclostationary_...0.2303890.2484500.401565df_E2pcmci_plusparcorr10.314501
12E1cyclostationary_mean_rr_24w_2 E2cyclostation...0.2977710.2977710.297771df_E2allfeaturespcmci_pluscmiknn05081.931851
13E1cyclostationary_mean_rr_4w_1 E2cyclostationa...0.1910820.1910820.191082df_E2allfeaturespcmci_plusparcorr00.341040
14E2cyclostationary_mean_tg_00.1115140.1461930.390651df_E2allfeaturespcmci_pluscmiknn15268.378825
15E2cyclostationary_mean_tg_00.1115140.1461930.390651df_E2allfeaturespcmci_plusparcorr18.094075
16GMcyclostationary_mean_tg_1w_0 GMcyclostationa...0.1765630.1765630.176563df_GMpcmci_pluscmiknn087.160898
17GMcyclostationary_mean_tg_1w_0 GMcyclostationa...0.1765630.1765630.176563df_GMpcmci_plusparcorr00.016086
18GMcyclostationary_mean_tg_1w_0 GMcyclostationa...0.1765630.2034890.442869df_GMpcmci_pluscmiknn1504.895575
19GMcyclostationary_mean_tg_1w_0 GMcyclostationa...0.1765630.2034890.442869df_GMpcmci_plusparcorr10.089332
20E1cyclostationary_mean_rr_1w_16 E1cyclostation...0.0879250.0879250.087925df_GMallfeaturespcmci_pluscmiknn01826.917106
21E1cyclostationary_mean_rr_4w_1 E2cyclostationa...0.1097710.1097710.109771df_GMallfeaturespcmci_plusparcorr00.345339
22E1cyclostationary_mean_rr_4w_1 E2cyclostationa...0.1097710.1199790.406443df_GMallfeaturespcmci_pluscmiknn14688.384780
23E1cyclostationary_mean_rr_4w_1 E2cyclostationa...0.1097710.1199790.406443df_GMallfeaturespcmci_plusparcorr19.835356
\n", - "
" - ], - "text/plain": [ - " selected_features score_r2 score_r2_lag \\\n", - "0 E1cyclostationary_mean_rr_4w_1 E1cyclostationa... 0.286071 0.286071 \n", - "1 E1cyclostationary_mean_rr_4w_1 E1cyclostationa... 0.289250 0.289250 \n", - "2 E1cyclostationary_mean_rr_4w_1 E1cyclostationa... 0.251060 0.255466 \n", - "3 E1cyclostationary_mean_rr_4w_1 E1cyclostationa... 0.256639 0.265190 \n", - "4 E1cyclostationary_mean_rr_4w_1 E1cyclostationa... 0.181424 0.181424 \n", - "5 E1cyclostationary_mean_rr_4w_1 E1cyclostationa... 0.254281 0.254281 \n", - "6 E1cyclostationary_mean_rr_4w_1 0.150558 0.147686 \n", - "7 E1cyclostationary_mean_rr_4w_1 0.150558 0.147686 \n", - "8 E2cyclostationary_mean_tg_0 E2cyclostationary_... 0.230389 0.230389 \n", - "9 E2cyclostationary_mean_tg_0 E2cyclostationary_... 0.221522 0.221522 \n", - "10 E2cyclostationary_mean_tg_0 E2cyclostationary_... 0.221522 0.239127 \n", - "11 E2cyclostationary_mean_tg_0 E2cyclostationary_... 0.230389 0.248450 \n", - "12 E1cyclostationary_mean_rr_24w_2 E2cyclostation... 0.297771 0.297771 \n", - "13 E1cyclostationary_mean_rr_4w_1 E2cyclostationa... 0.191082 0.191082 \n", - "14 E2cyclostationary_mean_tg_0 0.111514 0.146193 \n", - "15 E2cyclostationary_mean_tg_0 0.111514 0.146193 \n", - "16 GMcyclostationary_mean_tg_1w_0 GMcyclostationa... 0.176563 0.176563 \n", - "17 GMcyclostationary_mean_tg_1w_0 GMcyclostationa... 0.176563 0.176563 \n", - "18 GMcyclostationary_mean_tg_1w_0 GMcyclostationa... 0.176563 0.203489 \n", - "19 GMcyclostationary_mean_tg_1w_0 GMcyclostationa... 0.176563 0.203489 \n", - "20 E1cyclostationary_mean_rr_1w_16 E1cyclostation... 0.087925 0.087925 \n", - "21 E1cyclostationary_mean_rr_4w_1 E2cyclostationa... 0.109771 0.109771 \n", - "22 E1cyclostationary_mean_rr_4w_1 E2cyclostationa... 0.109771 0.119979 \n", - "23 E1cyclostationary_mean_rr_4w_1 E2cyclostationa... 
0.109771 0.119979 \n", - "\n", - " score_r2_lag_ar dataset algorithm independencetest lag \\\n", - "0 0.286071 df_E1 pcmci_plus cmiknn 0 \n", - "1 0.289250 df_E1 pcmci_plus parcorr 0 \n", - "2 0.419614 df_E1 pcmci_plus cmiknn 1 \n", - "3 0.420932 df_E1 pcmci_plus parcorr 1 \n", - "4 0.181424 df_E1allfeatures pcmci_plus cmiknn 0 \n", - "5 0.254281 df_E1allfeatures pcmci_plus parcorr 0 \n", - "6 0.367535 df_E1allfeatures pcmci_plus cmiknn 1 \n", - "7 0.367535 df_E1allfeatures pcmci_plus parcorr 1 \n", - "8 0.230389 df_E2 pcmci_plus cmiknn 0 \n", - "9 0.221522 df_E2 pcmci_plus parcorr 0 \n", - "10 0.415827 df_E2 pcmci_plus cmiknn 1 \n", - "11 0.401565 df_E2 pcmci_plus parcorr 1 \n", - "12 0.297771 df_E2allfeatures pcmci_plus cmiknn 0 \n", - "13 0.191082 df_E2allfeatures pcmci_plus parcorr 0 \n", - "14 0.390651 df_E2allfeatures pcmci_plus cmiknn 1 \n", - "15 0.390651 df_E2allfeatures pcmci_plus parcorr 1 \n", - "16 0.176563 df_GM pcmci_plus cmiknn 0 \n", - "17 0.176563 df_GM pcmci_plus parcorr 0 \n", - "18 0.442869 df_GM pcmci_plus cmiknn 1 \n", - "19 0.442869 df_GM pcmci_plus parcorr 1 \n", - "20 0.087925 df_GMallfeatures pcmci_plus cmiknn 0 \n", - "21 0.109771 df_GMallfeatures pcmci_plus parcorr 0 \n", - "22 0.406443 df_GMallfeatures pcmci_plus cmiknn 1 \n", - "23 0.406443 df_GMallfeatures pcmci_plus parcorr 1 \n", - "\n", - " execution_time \n", - "0 384.151514 \n", - "1 0.045593 \n", - "2 1308.913636 \n", - "3 0.274470 \n", - "4 5532.617838 \n", - "5 0.678409 \n", - "6 7726.128555 \n", - "7 10.818769 \n", - "8 810.720993 \n", - "9 0.057513 \n", - "10 1596.576730 \n", - "11 0.314501 \n", - "12 5081.931851 \n", - "13 0.341040 \n", - "14 5268.378825 \n", - "15 8.094075 \n", - "16 87.160898 \n", - "17 0.016086 \n", - "18 504.895575 \n", - "19 0.089332 \n", - "20 1826.917106 \n", - "21 0.345339 \n", - "22 4688.384780 \n", - "23 9.835356 " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results_e12gm_pcmci" 
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "a8dcdca7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
score_r2_lag_arscore_r2_lagexecution_time
features_setsingleallsingleallsingleall
datasetindependencetestfeatures_lag
E1cmiknncontemporary0.2860710.1814240.2860710.181424384.1515145532.617838
contemporary + 1-lagged0.4196140.3675350.2554660.1476861308.9136367726.128555
parcorrcontemporary0.2892500.2542810.2892500.2542810.0455930.678409
contemporary + 1-lagged0.4209320.3675350.2651900.1476860.27447010.818769
E2cmiknncontemporary0.2303890.2977710.2303890.297771810.7209935081.931851
contemporary + 1-lagged0.4158270.3906510.2391270.1461931596.5767305268.378825
parcorrcontemporary0.2215220.1910820.2215220.1910820.0575130.341040
contemporary + 1-lagged0.4015650.3906510.2484500.1461930.3145018.094075
GMcmiknncontemporary0.1765630.0879250.1765630.08792587.1608981826.917106
contemporary + 1-lagged0.4428690.4064430.2034890.119979504.8955754688.384780
parcorrcontemporary0.1765630.1097710.1765630.1097710.0160860.345339
contemporary + 1-lagged0.4428690.4064430.2034890.1199790.0893329.835356
\n", - "
" - ], - "text/plain": [ - " score_r2_lag_ar \\\n", - "features_set single all \n", - "dataset independencetest features_lag \n", - "E1 cmiknn contemporary 0.286071 0.181424 \n", - " contemporary + 1-lagged 0.419614 0.367535 \n", - " parcorr contemporary 0.289250 0.254281 \n", - " contemporary + 1-lagged 0.420932 0.367535 \n", - "E2 cmiknn contemporary 0.230389 0.297771 \n", - " contemporary + 1-lagged 0.415827 0.390651 \n", - " parcorr contemporary 0.221522 0.191082 \n", - " contemporary + 1-lagged 0.401565 0.390651 \n", - "GM cmiknn contemporary 0.176563 0.087925 \n", - " contemporary + 1-lagged 0.442869 0.406443 \n", - " parcorr contemporary 0.176563 0.109771 \n", - " contemporary + 1-lagged 0.442869 0.406443 \n", - "\n", - " score_r2_lag \\\n", - "features_set single all \n", - "dataset independencetest features_lag \n", - "E1 cmiknn contemporary 0.286071 0.181424 \n", - " contemporary + 1-lagged 0.255466 0.147686 \n", - " parcorr contemporary 0.289250 0.254281 \n", - " contemporary + 1-lagged 0.265190 0.147686 \n", - "E2 cmiknn contemporary 0.230389 0.297771 \n", - " contemporary + 1-lagged 0.239127 0.146193 \n", - " parcorr contemporary 0.221522 0.191082 \n", - " contemporary + 1-lagged 0.248450 0.146193 \n", - "GM cmiknn contemporary 0.176563 0.087925 \n", - " contemporary + 1-lagged 0.203489 0.119979 \n", - " parcorr contemporary 0.176563 0.109771 \n", - " contemporary + 1-lagged 0.203489 0.119979 \n", - "\n", - " execution_time \n", - "features_set single all \n", - "dataset independencetest features_lag \n", - "E1 cmiknn contemporary 384.151514 5532.617838 \n", - " contemporary + 1-lagged 1308.913636 7726.128555 \n", - " parcorr contemporary 0.045593 0.678409 \n", - " contemporary + 1-lagged 0.274470 10.818769 \n", - "E2 cmiknn contemporary 810.720993 5081.931851 \n", - " contemporary + 1-lagged 1596.576730 5268.378825 \n", - " parcorr contemporary 0.057513 0.341040 \n", - " contemporary + 1-lagged 0.314501 8.094075 \n", - "GM cmiknn contemporary 
87.160898 1826.917106 \n", - " contemporary + 1-lagged 504.895575 4688.384780 \n", - " parcorr contemporary 0.016086 0.345339 \n", - " contemporary + 1-lagged 0.089332 9.835356 " - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results_e12gm_pcmci[\"features_lag\"] = results_e12gm_pcmci[\"lag\"].map(\n", - " {\n", - " 0: \"contemporary\",\n", - " 1: \"contemporary + 1-lagged\",\n", - " }\n", - ")\n", - "results_e12gm_pcmci[\"features_set\"] = results_e12gm_pcmci[\"dataset\"].apply(lambda x: \"all\" if \"all\" in x else \"single\")\n", - "\n", - "results_e12gm_pcmci[\"dataset\"] = results_e12gm_pcmci[\"dataset\"].apply(lambda x: x[3:5])\n", - "\n", - "results_e12gm_pcmci = results_e12gm_pcmci\\\n", - " .drop(columns=[\"selected_features\", \"algorithm\", \"lag\", \"score_r2\"])\\\n", - " .set_index([\"dataset\", \"independencetest\", \"features_lag\", \"features_set\"])\\\n", - " .unstack(\"features_set\").sort_index(axis=1, ascending=False) # fmt: off\n", - "\n", - "results_e12gm_pcmci" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "fd05d126-994e-4054-8772-796ecea12b36", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
   score_r2_lag_arscore_r2_lagexecution_time
  features_setsingleallsingleallsingleall
datasetindependencetestfeatures_lag      
E1cmiknncontemporary0.2860.1810.2860.181384.155532.62
contemporary + 1-lagged0.4200.3680.2550.1481308.917726.13
parcorrcontemporary0.2890.2540.2890.2540.050.68
contemporary + 1-lagged0.4210.3680.2650.1480.2710.82
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_file = os.path.join(constants.path_table_tex, \"e12gm_E1_pcmci.tex\")\n", - "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", - "export_pcmci_df_to_latex(results_e12gm_pcmci.iloc[:4, :], target_file)\n", - "results_e12gm_pcmci.iloc[:4, :].style.pipe(make_pcmci_pretty)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "7531accd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
   score_r2_lag_arscore_r2_lagexecution_time
  features_setsingleallsingleallsingleall
datasetindependencetestfeatures_lag      
E2cmiknncontemporary0.2300.2980.2300.298810.725081.93
contemporary + 1-lagged0.4160.3910.2390.1461596.585268.38
parcorrcontemporary0.2220.1910.2220.1910.060.34
contemporary + 1-lagged0.4020.3910.2480.1460.318.09
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_file = os.path.join(constants.path_table_tex, \"e12gm_E2_pcmci.tex\")\n", - "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", - "export_pcmci_df_to_latex(results_e12gm_pcmci.iloc[4:8, :], target_file)\n", - "results_e12gm_pcmci.iloc[4:8, :].style.pipe(make_pcmci_pretty)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "382a776d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
   score_r2_lag_arscore_r2_lagexecution_time
  features_setsingleallsingleallsingleall
datasetindependencetestfeatures_lag      
GMcmiknncontemporary0.1770.0880.1770.08887.161826.92
contemporary + 1-lagged0.4430.4060.2030.120504.904688.38
parcorrcontemporary0.1770.1100.1770.1100.020.35
contemporary + 1-lagged0.4430.4060.2030.1200.099.84
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_file = os.path.join(constants.path_table_tex, \"e12gm_GM_pcmci.tex\")\n", - "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", - "export_pcmci_df_to_latex(results_e12gm_pcmci.iloc[8:, :], target_file)\n", - "results_e12gm_pcmci.iloc[8:, :].style.pipe(make_pcmci_pretty)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### [TEFS](#toc0_)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "340b625d", - "metadata": {}, - "outputs": [], - "source": [ - "def check_list(cell):\n", - " if cell == [0]:\n", - " return \"contemporary\"\n", - " elif cell == [0, 1]:\n", - " return \"contemporary + 1-lagged\"\n", - "\n", - "results_e12gm_te[\"features_lag\"] = results_e12gm_te[\"lagfeatures\"].apply(check_list)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "fb7a85b6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
score_r2_lag_arscore_r2_lagexecution_time
features_setsingleallsingleallsingleall
datasetdirectionfeatures_lag
E1backwardcontemporary0.4568400.4050680.2510600.2314620.6881956.806478
contemporary + 1-lagged0.4209320.4209320.2651900.2651900.7572997.315034
forwardcontemporary0.4568400.4568400.2510600.2510600.6331476.430187
contemporary + 1-lagged0.4209320.4209320.2651900.2651900.6592326.633933
E2backwardcontemporary0.4233740.4233740.1903340.1903341.1225267.054950
contemporary + 1-lagged0.3906510.3906510.1461930.1461931.2195917.271728
forwardcontemporary0.4233740.4233740.1903340.1903340.9987145.996039
contemporary + 1-lagged0.3906510.4218090.1461930.1461691.0533896.607501
GMbackwardcontemporary0.3591310.4572280.1765630.1578900.3567086.942717
contemporary + 1-lagged0.4105810.3767460.0500870.0853620.3992917.320499
forwardcontemporary0.3591310.3916540.1765630.0129590.3497546.280308
contemporary + 1-lagged0.4105810.4105810.0500870.0500870.3560366.558476
\n", - "
" - ], - "text/plain": [ - " score_r2_lag_ar \\\n", - "features_set single all \n", - "dataset direction features_lag \n", - "E1 backward contemporary 0.456840 0.405068 \n", - " contemporary + 1-lagged 0.420932 0.420932 \n", - " forward contemporary 0.456840 0.456840 \n", - " contemporary + 1-lagged 0.420932 0.420932 \n", - "E2 backward contemporary 0.423374 0.423374 \n", - " contemporary + 1-lagged 0.390651 0.390651 \n", - " forward contemporary 0.423374 0.423374 \n", - " contemporary + 1-lagged 0.390651 0.421809 \n", - "GM backward contemporary 0.359131 0.457228 \n", - " contemporary + 1-lagged 0.410581 0.376746 \n", - " forward contemporary 0.359131 0.391654 \n", - " contemporary + 1-lagged 0.410581 0.410581 \n", - "\n", - " score_r2_lag \\\n", - "features_set single all \n", - "dataset direction features_lag \n", - "E1 backward contemporary 0.251060 0.231462 \n", - " contemporary + 1-lagged 0.265190 0.265190 \n", - " forward contemporary 0.251060 0.251060 \n", - " contemporary + 1-lagged 0.265190 0.265190 \n", - "E2 backward contemporary 0.190334 0.190334 \n", - " contemporary + 1-lagged 0.146193 0.146193 \n", - " forward contemporary 0.190334 0.190334 \n", - " contemporary + 1-lagged 0.146193 0.146169 \n", - "GM backward contemporary 0.176563 0.157890 \n", - " contemporary + 1-lagged 0.050087 0.085362 \n", - " forward contemporary 0.176563 0.012959 \n", - " contemporary + 1-lagged 0.050087 0.050087 \n", - "\n", - " execution_time \n", - "features_set single all \n", - "dataset direction features_lag \n", - "E1 backward contemporary 0.688195 6.806478 \n", - " contemporary + 1-lagged 0.757299 7.315034 \n", - " forward contemporary 0.633147 6.430187 \n", - " contemporary + 1-lagged 0.659232 6.633933 \n", - "E2 backward contemporary 1.122526 7.054950 \n", - " contemporary + 1-lagged 1.219591 7.271728 \n", - " forward contemporary 0.998714 5.996039 \n", - " contemporary + 1-lagged 1.053389 6.607501 \n", - "GM backward contemporary 0.356708 6.942717 \n", - " 
contemporary + 1-lagged 0.399291 7.320499 \n", - " forward contemporary 0.349754 6.280308 \n", - " contemporary + 1-lagged 0.356036 6.558476 " - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results_e12gm_te[\"features_set\"] = results_e12gm_te[\"dataset\"].apply(lambda x: \"all\" if \"all\" in x else \"single\")\n", - "# df_e12gm_te[\"CMI\"] = df_e12gm_te[\"dataset\"].apply(lambda x: \"noCMI\" if \"noCMI\" in x else \"yesCMI\")\n", - "results_e12gm_te[\"dataset\"] = results_e12gm_te[\"dataset\"].apply(lambda x: x[3:5])\n", - "\n", - "results_e12gm_te = results_e12gm_te\\\n", - " .drop(columns=[\"selected_features\", \"lagtarget\", \"lagfeatures\", \"score_r2\"])\\\n", - " .set_index([\"dataset\", \"direction\", \"features_lag\", \"features_set\"])\\\n", - " .unstack(\"features_set\").sort_index(axis=1, ascending=False) # fmt: off\n", - "\n", - "results_e12gm_te" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "ec374849", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
   score_r2_lag_arscore_r2_lagexecution_time
  features_setsingleallsingleallsingleall
datasetdirectionfeatures_lag      
E1backwardcontemporary0.4570.4050.2510.2310.696.81
contemporary + 1-lagged0.4210.4210.2650.2650.767.32
forwardcontemporary0.4570.4570.2510.2510.636.43
contemporary + 1-lagged0.4210.4210.2650.2650.666.63
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_file = os.path.join(constants.path_table_tex, \"e12gm_E1_te.tex\")\n", - "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", - "export_te_df_to_latex(results_e12gm_te.iloc[:4, :], target_file)\n", - "results_e12gm_te.iloc[:4, :].style.pipe(make_te_pretty)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "40017b46", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
   score_r2_lag_arscore_r2_lagexecution_time
  features_setsingleallsingleallsingleall
datasetdirectionfeatures_lag      
E2backwardcontemporary0.4230.4230.1900.1901.127.05
contemporary + 1-lagged0.3910.3910.1460.1461.227.27
forwardcontemporary0.4230.4230.1900.1901.006.00
contemporary + 1-lagged0.3910.4220.1460.1461.056.61
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_file = os.path.join(constants.path_table_tex, \"e12gm_E2_te.tex\")\n", - "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", - "export_te_df_to_latex(results_e12gm_te.iloc[4:8, :], target_file)\n", - "results_e12gm_te.iloc[4:8, :].style.pipe(make_te_pretty)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "186abb37", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
   score_r2_lag_arscore_r2_lagexecution_time
  features_setsingleallsingleallsingleall
datasetdirectionfeatures_lag      
GMbackwardcontemporary0.3590.4570.1770.1580.366.94
contemporary + 1-lagged0.4110.3770.0500.0850.407.32
forwardcontemporary0.3590.3920.1770.0130.356.28
contemporary + 1-lagged0.4110.4110.0500.0500.366.56
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_file = os.path.join(constants.path_table_tex, \"e12gm_GM_te.tex\")\n", - "os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", - "export_te_df_to_latex(results_e12gm_te.iloc[8:, :], target_file)\n", - "results_e12gm_te.iloc[8:, :].style.pipe(make_te_pretty)" - ] - }, - { - "cell_type": "markdown", - "id": "7a1a352c", - "metadata": {}, - "source": [ - "### [Full version without CMI](#toc0_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### [TEFS](#toc0_)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "1be8b53b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 selected_featuresscore_r2score_r2_lagscore_r2_lag_ardatasetlagfeatureslagtargetdirectionexecution_time
0E1cyclostationary_mean_tg_12w_3 E1cyclostationary_mean_rr_1w_10 E1cyclostationary_mean_tg_8w_1 E1cyclostationary_mean_tg_6 E1cyclostationary_mean_tg_12w_2 E1cyclostationary_mean_rr_1w_9 E1cyclostationary_mean_tg_0 E1cyclostationary_mean_rr_4w_10.2760.2760.482df_E1_noCMI[0][1]backward717.581s
1E1cyclostationary_mean_rr_12w_0 E1cyclostationary_mean_rr_1w_5 E1cyclostationary_mean_tg_60.2540.2540.471df_E1_noCMI[0][1]forward641.229s
2E1cyclostationary_mean_rr_8w_4 E1cyclostationary_mean_tg_00.2610.2620.402df_E1_noCMI[0, 1][1]backward766.952s
3E1cyclostationary_mean_rr_12w_0 E1cyclostationary_mean_tg_6 E1cyclostationary_mean_rr_8w_40.2150.2120.395df_E1_noCMI[0, 1][1]forward633.929s
4E1cyclostationary_mean_rr_8w_4 E1cyclostationary_mean_tg_60.2040.2040.424df_E1allfeatures_noCMI[0][1]backward2451.030s
5E1cyclostationary_mean_rr_12w_0 GMcyclostationary_mean_tg_0 E1cyclostationary_mean_rr_1w_50.2810.2810.484df_E1allfeatures_noCMI[0][1]forward2047.873s
6E1cyclostationary_mean_rr_12w_4 E1cyclostationary_mean_rr_4w_1 E1cyclostationary_mean_tg_12w_0 E1cyclostationary_mean_tg_8w_4 E1cyclostationary_mean_tg_12w_7 E2cyclostationary_mean_tg_8w_2 E1cyclostationary_mean_tg_60.2570.3060.428df_E1allfeatures_noCMI[0, 1][1]backward2612.832s
7E1cyclostationary_mean_rr_12w_0 E1cyclostationary_mean_tg_6 E1cyclostationary_mean_rr_8w_40.2150.2120.395df_E1allfeatures_noCMI[0, 1][1]forward2190.678s
8E2cyclostationary_mean_tg_1 E2cyclostationary_mean_tg_3 E2cyclostationary_mean_tg_4 E2cyclostationary_mean_tg_8w_20.1740.1740.461df_E2_noCMI[0][1]backward148.270s
9E2cyclostationary_mean_tg_1 E2cyclostationary_mean_rr_1w_30.1980.1980.440df_E2_noCMI[0][1]forward124.161s
10E2cyclostationary_mean_tg_4 E2cyclostationary_mean_tg_0 E2cyclostationary_mean_tg_1w_1 E2cyclostationary_mean_rr_8w_00.1900.2560.440df_E2_noCMI[0, 1][1]backward160.542s
11E2cyclostationary_mean_tg_1w_00.1700.1800.451df_E2_noCMI[0, 1][1]forward135.600s
12E2cyclostationary_mean_tg_00.1120.1120.408df_E2allfeatures_noCMI[0][1]backward2488.332s
13E2cyclostationary_mean_tg_1 E1cyclostationary_mean_rr_1w_150.1560.1560.421df_E2allfeatures_noCMI[0][1]forward2019.690s
14E2cyclostationary_mean_tg_1w_00.1700.1800.451df_E2allfeatures_noCMI[0, 1][1]backward3022.579s
15E2cyclostationary_mean_tg_1w_00.1700.1800.451df_E2allfeatures_noCMI[0, 1][1]forward2181.964s
16GMcyclostationary_mean_tg_8w_0 GMcyclostationary_mean_rr_24w_0 GMcyclostationary_mean_tg_4w_0 GMcyclostationary_mean_rr_12w_1 GMcyclostationary_mean_tg_00.2130.2130.451df_GM_noCMI[0][1]backward47.713s
17GMcyclostationary_mean_tg_0 GMcyclostationary_mean_rr_1w_1 GMcyclostationary_mean_rr_1w_30.1380.1380.440df_GM_noCMI[0][1]forward41.189s
18GMcyclostationary_mean_rr_12w_0 GMcyclostationary_mean_tg_8w_0 GMcyclostationary_mean_rr_12w_1 GMcyclostationary_mean_tg_1w_10.2460.2340.446df_GM_noCMI[0, 1][1]backward55.535s
19GMcyclostationary_mean_tg_1w_10.0580.0590.430df_GM_noCMI[0, 1][1]forward42.982s
20E2cyclostationary_mean_tg_1w_1 E2cyclostationary_mean_tg_40.0400.0400.379df_GMallfeatures_noCMI[0][1]backward2462.089s
21E1cyclostationary_mean_tg_8w_4 E1cyclostationary_mean_rr_8w_0 E2cyclostationary_mean_tg_40.1120.1120.427df_GMallfeatures_noCMI[0][1]forward2019.530s
22E2cyclostationary_mean_tg_40.0160.0440.376df_GMallfeatures_noCMI[0, 1][1]backward2700.420s
23E2cyclostationary_mean_tg_1w_00.0490.0660.439df_GMallfeatures_noCMI[0, 1][1]forward2170.156s
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_file = os.path.join(constants.path_table_tex, \"e12gm_noCMI_te_full.tex\")\n", - "export_results_dataframe_te(results_e12gm_noCMI_te, target_file)\n", - "results_e12gm_noCMI_te.style.pipe(make_te_all_pretty)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### [TEFS as wrapper on E12GM](#toc0_)" - ] - }, - { - "cell_type": "markdown", - "id": "9dfb99d4", - "metadata": {}, - "source": [ - "In this way the method becomes a wrapper because we are making a selection solely by looking at the performance in regression." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "93010e78", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1allfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1allfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1allfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - 
" 'te_e12gm_datasetdf_E1allfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E2_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_E2_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E2_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_E2_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E2_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_E2_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E2_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_E2_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E2allfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_E2allfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E2allfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_E2allfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 
'te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_GM_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_GM_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_GM_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_GM_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_GM_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_GM_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_GM_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_GM_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_GMallfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_GMallfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_GMallfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_GMallfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 
'te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_ticino_datasetnormal_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_ticino_datasetnormal_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_ticino_datasetnormal_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_ticino_datasetnormal_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_ticino_datasetsnowlakes_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_ticino_datasetsnowlakes_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_ticino_datasetsnowlakes_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_ticino_datasetsnowlakes_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl']" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Load all TEFS simulations\n", - "results_files = sorted([file for file in os.listdir(constants.path_results) if file.endswith(\".pkl\")])\n", - "config_list = [file for file in results_files if file.split(\"_\")[0] == \"te\"]\n", - "config_list" - ] - }, - { - "cell_type": "markdown", - "id": "cdb4f193", - "metadata": {}, - "source": [ - "Here I make two plots, one with a single line using fixed train and test, and one with cross-validation, in this case with `KFold`, but a version with `TimeSeriesSplit` is also available." 
- ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "f9444671", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Skipping te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1allfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1allfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1allfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1allfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping 
te_e12gm_datasetdf_E1allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2allfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping 
te_e12gm_datasetdf_GM_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GM_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GM_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GM_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GM_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GM_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GM_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GM_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GMallfeatures_noCMI_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_ticino_datasetnormal_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping 
te_ticino_datasetnormal_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_ticino_datasetnormal_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_ticino_datasetnormal_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_ticino_datasetsnowlakes_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_ticino_datasetsnowlakes_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_ticino_datasetsnowlakes_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_ticino_datasetsnowlakes_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n" - ] - } - ], - "source": [ - "for config_name in config_list:\n", - " basename = os.path.splitext(os.path.basename(config_name))[0]\n", - " target_file_train_test = os.path.join(constants.path_figures, \"tefs_as_wrapper\", f\"{basename}_wrapper.pdf\")\n", - " target_file_cv = os.path.join(constants.path_figures, \"tefs_as_wrapper_cv\", f\"{basename}_wrapper_cv.pdf\")\n", - " if os.path.exists(target_file_train_test) and os.path.exists(target_file_cv):\n", - " print(f\"Skipping {config_name}...\")\n", - " continue\n", - "\n", - " print(f\"Processing {config_name}...\")\n", - "\n", - " # --------------------- Load simulation ---------------------\n", - " simulation = file_management.load_from_pkl_file(\n", - " os.path.join(\n", - " constants.path_results,\n", - " config_name,\n", - " )\n", - " )\n", - "\n", - " # --------------------- Load corresponding dataset ---------------------\n", - " basin_name = config_name.split(\"_\")[1]\n", - " datasets, _ = datasets_and_configurations_loaders[\"te\"].get(basin_name)()\n", - " dataset_name = simulation[\"dataset_name\"]\n", - " dataframe = datasets[dataset_name]\n", - "\n", - " target_columns = [\"target\"]\n", - " features_columns = 
dataframe[\"full\"].drop(columns=target_columns).columns\n", - "\n", - " # --------------------- Select features using threshold (conservative) ---------------------\n", - " selected_features_names_with_threshold = simulation[\"results\"].select_features(simulation[\"params\"][\"threshold\"])\n", - " n_features_selected_with_threshold = len(selected_features_names_with_threshold)\n", - "\n", - " # --------------------- Compute test R2 for each number of features ---------------------\n", - " test_r2_train_test = []\n", - " test_r2_cv = []\n", - " num_total_features = len(dataframe[\"full\"].columns) - 1 # -1 because the last column is the target\n", - " for num_features in range(0, num_total_features + 1):\n", - " if num_features == 0:\n", - " selected_features_names = []\n", - " else:\n", - " selected_features_names = simulation[\"results\"].select_n_features(num_features)\n", - "\n", - " lagfeatures = simulation[\"params\"][\"lagfeatures\"]\n", - " lagtarget = simulation[\"params\"][\"lagtarget\"]\n", - "\n", - " inputs_names_lags = {feature: lagfeatures for feature in selected_features_names}\n", - " inputs_names_lags[\"target\"] = lagtarget\n", - "\n", - " # --- Compute the train_test version ---\n", - " test_r2_train_test.append(\n", - " regression_analysis(\n", - " inputs_names_lags=inputs_names_lags,\n", - " target_name=target_columns[0],\n", - " df_train=dataframe[\"train\"],\n", - " df_test=dataframe[\"test\"],\n", - " )\n", - " )\n", - "\n", - " # --- Compute the cross-validation version ---\n", - " # To perform a cross-validation, we need to concatenate the train and test sets\n", - " unified_df = pd.concat([dataframe[\"train\"], dataframe[\"test\"]], axis=0).reset_index(drop=True)\n", - "\n", - " # Fixed window size\n", - " # n_samples = unified_df.shape[0]\n", - " # n_splits = 5\n", - " # cv_scheme = TimeSeriesSplit(\n", - " # n_splits=n_splits,\n", - " # max_train_size=n_samples // (n_splits + 1),\n", - " # )\n", - "\n", - " # Regular KFold\n", - " 
cv_scheme = KFold(n_splits=4) # 4 splits is about using the same test set size\n", - "\n", - " test_r2_cv.append(\n", - " regression_analysis(\n", - " inputs_names_lags=inputs_names_lags,\n", - " target_name=target_columns[0],\n", - " df=unified_df,\n", - " cv_scheme=cv_scheme,\n", - " )\n", - " )\n", - "\n", - " test_r2_train_test = np.array(test_r2_train_test)\n", - " test_r2_cv = np.array(test_r2_cv)\n", - "\n", - " # --------------------- Plot train test version ---------------------\n", - " fig, ax = plt.subplots(figsize=(10, 5))\n", - " ax.plot(test_r2_train_test, marker=\"o\", label=\"Fixed train-test\")\n", - " maxima = np.where(test_r2_train_test == test_r2_train_test.max())[0]\n", - " ax.plot(maxima, test_r2_train_test[maxima], marker=\"o\", color=\"red\", linestyle=\"None\", label=\"Maximum\", markersize=10)\n", - " ax.plot(n_features_selected_with_threshold, test_r2_train_test[n_features_selected_with_threshold], marker=\"o\", color=\"green\", linestyle=\"None\", label=\"TEFS (conservative)\", markersize=10)\n", - " ax.set_xlabel(\"Number of features\")\n", - " ax.set_ylabel(\"Test $R^2$\")\n", - "\n", - " if simulation[\"params\"][\"threshold\"] == np.inf:\n", - " threshold_text = \"\\infty\"\n", - " elif simulation[\"params\"][\"threshold\"] == -np.inf:\n", - " threshold_text = \"-\\infty\"\n", - " else:\n", - " threshold_text = simulation[\"params\"][\"threshold\"]\n", - "\n", - " title_text = f\"TEFS on basin {basin_name.upper()} with dataset {dataset_name}\\n[lagfeatures $={simulation['params']['lagfeatures']}$, lagtarget $={simulation['params']['lagtarget']}$, direction = {simulation['params']['direction']}, threshold $={threshold_text}]$\"\n", - " ax.set_title(title_text)\n", - " ax.legend()\n", - " if num_total_features < 30:\n", - " step = 1\n", - " elif num_total_features < 80:\n", - " step = 5\n", - " else:\n", - " step = 10\n", - " ax.set_xticks(range(0, num_total_features + 1, step))\n", - " ax.set_xticklabels(range(0, num_total_features + 
1, step))\n", - " ax.set_ylim(-0.1, 0.55)\n", - " ax.grid()\n", - "\n", - " os.makedirs(os.path.dirname(target_file_train_test), exist_ok=True)\n", - " plt.savefig(target_file_train_test, bbox_inches=\"tight\")\n", - " plt.close(fig)\n", - "\n", - " # --------------------- Plot cross-validation version ---------------------\n", - " fig, ax = plt.subplots(figsize=(10, 5))\n", - " ax.plot(test_r2_cv.mean(axis=1), marker=\"o\", label=\"Cross-validation\")\n", - " maxima = np.where(test_r2_cv.mean(axis=1) == test_r2_cv.mean(axis=1).max())[0]\n", - " ax.plot(maxima, test_r2_cv.mean(axis=1)[maxima], marker=\"o\", color=\"red\", linestyle=\"None\", label=\"Maximum\", markersize=10)\n", - " ax.plot(n_features_selected_with_threshold, test_r2_cv.mean(axis=1)[n_features_selected_with_threshold], marker=\"o\", color=\"green\", linestyle=\"None\", label=\"TEFS (conservative)\", markersize=10)\n", - "\n", - " # plot confidence interval bands from cross-validation based on mean and standard deviation (90% confidence)\n", - " alpha = 0.1\n", - " quantile = scipy.stats.norm.ppf(1 - alpha / 2)\n", - " ax.fill_between(range(test_r2_cv.shape[0]), test_r2_cv.mean(axis=1) - test_r2_cv.std(axis=1) * quantile / np.sqrt(test_r2_cv.shape[1]), test_r2_cv.mean(axis=1) + test_r2_cv.std(axis=1) * quantile / np.sqrt(test_r2_cv.shape[1]), alpha=0.3)\n", - "\n", - " ax.set_xlabel(\"Number of features\")\n", - " ax.set_ylabel(\"Test $R^2$\")\n", - "\n", - " if simulation[\"params\"][\"threshold\"] == np.inf:\n", - " threshold_text = \"\\infty\"\n", - " elif simulation[\"params\"][\"threshold\"] == -np.inf:\n", - " threshold_text = \"-\\infty\"\n", - " else:\n", - " threshold_text = simulation[\"params\"][\"threshold\"]\n", - "\n", - " title_text = f\"TEFS on basin {basin_name.upper()} with dataset {dataset_name}\\n[lagfeatures $={simulation['params']['lagfeatures']}$, lagtarget $={simulation['params']['lagtarget']}$, direction = {simulation['params']['direction']}, threshold 
$={threshold_text}]$\"\n", - " ax.set_title(title_text)\n", - " ax.legend()\n", - " if num_total_features < 30:\n", - " step = 1\n", - " elif num_total_features < 80:\n", - " step = 5\n", - " else:\n", - " step = 10\n", - " ax.set_xticks(range(0, num_total_features + 1, step))\n", - " ax.set_xticklabels(range(0, num_total_features + 1, step))\n", - " ax.set_ylim(-0.1, 0.55)\n", - " ax.grid()\n", - "\n", - " os.makedirs(os.path.dirname(target_file_cv), exist_ok=True)\n", - " plt.savefig(target_file_cv, bbox_inches=\"tight\")\n", - " plt.close(fig)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### [Linking the wrapper to the original filter method](#toc0_)" - ] - }, - { - "cell_type": "markdown", - "id": "b5ce711d", - "metadata": {}, - "source": [ - "Here I look for all configurations without CMI and match them to those with CMI. I show the plot above where I see the algorithm as a wrapper and highlight with vertical bars the points at which the variables chosen in the version with CMI were added/removed. There is also the option to choose variables manually (ideally the most common ones \"by eye\")." 
- ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "4fc3f237", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('te_e12gm_datasetdf_E1_noCMI_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl'),\n", - " ('te_e12gm_datasetdf_E1_noCMI_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl',\n", - " 'te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl')]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "config_matches = []\n", - "for config_name in config_list:\n", - " if \"noCMI\" in config_name and config_name.replace(\"_noCMI\", \"\") in config_list:\n", - " config_matches.append((config_name, config_name.replace(\"_noCMI\", \"\")))\n", - "\n", - "config_matches[:2]" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "8e0836bb", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Skipping te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1allfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1allfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E1allfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping 
te_e12gm_datasetdf_E1allfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_E2allfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GM_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GM_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GM_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GM_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0,1]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0,1]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0]_lagtarget[1]_directionbackward_threshold0_k10.pkl...\n", - "Skipping te_e12gm_datasetdf_GMallfeatures_lagfeatures[0]_lagtarget[1]_directionforward_thresholdinf_k10.pkl...\n" - ] - } - ], - "source": [ - "for config_name_noCMI, config_name in 
config_matches:\n", - " basename = os.path.splitext(os.path.basename(config_name_noCMI))[0]\n", - " target_file = os.path.join(constants.path_figures, \"tefs_as_wrapper_mapping_filter\", f\"{basename}_wrapper_mapping_filter.pdf\")\n", - " if os.path.exists(target_file):\n", - " print(f\"Skipping {config_name}...\")\n", - " continue\n", - "\n", - " print(f\"Processing {config_name}...\")\n", - "\n", - " simulation_noCMI = file_management.load_from_pkl_file(\n", - " os.path.join(\n", - " constants.path_results,\n", - " config_name_noCMI,\n", - " )\n", - " )\n", - "\n", - " simulation = file_management.load_from_pkl_file(\n", - " os.path.join(\n", - " constants.path_results,\n", - " config_name,\n", - " )\n", - " )\n", - "\n", - " # feature selected with CMI\n", - " basin_name = config_name.split(\"_\")[1]\n", - " datasets, _ = datasets_and_configurations_loaders[\"te\"].get(basin_name)()\n", - " dataset_name = simulation[\"dataset_name\"]\n", - " dataframe = datasets[dataset_name]\n", - " target_columns = [\"target\"]\n", - " features_columns = dataframe[\"full\"].drop(columns=target_columns).columns\n", - "\n", - " selected_features_names_with_threshold = simulation[\"results\"].select_features(simulation[\"params\"][\"threshold\"])\n", - " n_features_selected_with_threshold = len(selected_features_names_with_threshold)\n", - "\n", - " # choose manually\n", - " selected_features_names_with_threshold = [\"E1cyclostationary_mean_rr_4w_1\", \"E2cyclostationary_mean_tg_0\"]\n", - "\n", - " # Load the noCMI version and process it\n", - " basin_name = config_name_noCMI.split(\"_\")[1]\n", - " datasets, _ = datasets_and_configurations_loaders[\"te\"].get(basin_name)()\n", - " dataset_name_noCMI = simulation_noCMI[\"dataset_name\"]\n", - " dataframe_noCMI = datasets[dataset_name_noCMI]\n", - " target_columns = [\"target\"]\n", - " features_columns_noCMI = dataframe_noCMI[\"full\"].drop(columns=target_columns).columns\n", - "\n", - " test_r2_train_test = []\n", - "\n", - " 
selected_features_names_previous = [] # new part\n", - " corresponding_features_indexes = {} # new part\n", - "\n", - " num_total_features = len(dataframe_noCMI[\"full\"].columns) - 1 # -1 because the last column is the target\n", - " for num_features in range(0, num_total_features + 1):\n", - " if num_features == 0:\n", - " selected_features_names = []\n", - " else:\n", - " selected_features_names_previous = selected_features_names.copy() # new part\n", - " selected_features_names = simulation_noCMI[\"results\"].select_n_features(num_features)\n", - "\n", - " # if the feature that has been just added is in selected_features_names_with_threshold, add num_features to corresponding_features_indexes\n", - " # looking at the set difference\n", - " new_feature_name = list(set(selected_features_names).difference(set(selected_features_names_previous)))[0]\n", - " if new_feature_name in selected_features_names_with_threshold:\n", - " corresponding_features_indexes[num_features] = new_feature_name\n", - "\n", - " lagfeatures = simulation_noCMI[\"params\"][\"lagfeatures\"]\n", - " lagtarget = simulation_noCMI[\"params\"][\"lagtarget\"]\n", - "\n", - " inputs_names_lags = {feature: lagfeatures for feature in selected_features_names}\n", - " inputs_names_lags[\"target\"] = lagtarget\n", - "\n", - " # --- Compute the train_test version ---\n", - " test_r2_train_test.append(\n", - " regression_analysis(\n", - " inputs_names_lags=inputs_names_lags,\n", - " target_name=target_columns[0],\n", - " df_train=dataframe_noCMI[\"train\"],\n", - " df_test=dataframe_noCMI[\"test\"],\n", - " )\n", - " )\n", - "\n", - " test_r2_train_test = np.array(test_r2_train_test)\n", - "\n", - " # --------------------- Plot ---------------------\n", - " fig, ax = plt.subplots(figsize=(10, 5))\n", - " ax.plot(test_r2_train_test, marker=\"o\", label=\"Fixed train-test\")\n", - "\n", - " # Get the default color cycle\n", - " color_cycle = plt.rcParams[\"axes.prop_cycle\"].by_key()[\"color\"]\n", - "\n", - 
" # plot vertical lines in corresponding_features_indexes\n", - " for i, (key, value) in enumerate(corresponding_features_indexes.items()):\n", - " ax.axvline(x=key, linestyle=\"--\", color=color_cycle[i + 1 % len(color_cycle)], label=f\"{value}\")\n", - "\n", - " maxima = np.where(test_r2_train_test == test_r2_train_test.max())[0]\n", - " ax.plot(maxima, test_r2_train_test[maxima], marker=\"o\", color=\"red\", linestyle=\"None\", label=\"Maximum\", markersize=10)\n", - " ax.plot(n_features_selected_with_threshold, test_r2_train_test[n_features_selected_with_threshold], marker=\"o\", color=\"green\", linestyle=\"None\", label=\"TEFS (conservative)\", markersize=10)\n", - " ax.set_xlabel(\"Number of features\")\n", - " ax.set_ylabel(\"Test $R^2$\")\n", - "\n", - " if simulation[\"params\"][\"threshold\"] == np.inf:\n", - " threshold_text = \"\\infty\"\n", - " elif simulation[\"params\"][\"threshold\"] == -np.inf:\n", - " threshold_text = \"-\\infty\"\n", - " else:\n", - " threshold_text = simulation[\"params\"][\"threshold\"]\n", - "\n", - " title_text = f\"TEFS on basin {basin_name.upper()} with dataset {dataset_name}\\n[lagfeatures $={simulation['params']['lagfeatures']}$, lagtarget $={simulation['params']['lagtarget']}$, direction = {simulation['params']['direction']}, threshold $={threshold_text}]$\"\n", - " ax.set_title(title_text)\n", - " ax.legend()\n", - " if num_total_features < 30:\n", - " step = 1\n", - " elif num_total_features < 80:\n", - " step = 5\n", - " else:\n", - " step = 10\n", - "\n", - " ax.set_xticks(range(0, num_total_features + 1, step))\n", - " ax.set_xticklabels(range(0, num_total_features + 1, step))\n", - " ax.set_ylim(-0.1, 0.55)\n", - " ax.grid()\n", - "\n", - " os.makedirs(os.path.dirname(target_file), exist_ok=True)\n", - " plt.savefig(target_file, bbox_inches=\"tight\")\n", - " plt.close(fig)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - 
"language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 48b2100f00d4c82dcde7c895b90b88600d774609 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Tue, 26 Mar 2024 16:57:14 +0100 Subject: [PATCH 28/51] Few changes to the interactive test file (still not working) --- hawk/processes/simulation_interactive.py | 32 ++++++++++++++++++------ 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/hawk/processes/simulation_interactive.py b/hawk/processes/simulation_interactive.py index f4882cd..e0f6eeb 100644 --- a/hawk/processes/simulation_interactive.py +++ b/hawk/processes/simulation_interactive.py @@ -2,9 +2,7 @@ import pandas as pd from birdy import WPSClient -# from keras import models - - +# ----------- Generate some data ----------- np.random.seed(0) n = 1000 # number of samples m = 15 # number of features @@ -13,7 +11,8 @@ for i in range(1, m + 1): data[f"x{i}"] = np.random.normal(size=n) -data["y"] = sum(data.values()) + np.random.normal(size=n) +target_name = "target" +data[target_name] = sum(data.values()) + np.random.normal(size=n) data = pd.DataFrame(data) @@ -24,12 +23,29 @@ data.head() -target_name = "y" -url = "http://localhost:5000/wps" -wps = WPSClient(url, verify=False) +train_file_path = "./train_dataset.csv" +test_file_path = "./test_dataset.csv" +data.to_csv(train_file_path, index=False) +data_test.to_csv(test_file_path, index=False) + +# ----------------- WPS ----------------- + +wps = WPSClient("http://localhost:5000/wps", verify=False) help(wps) -resp = wps.causal() +# Input some data for the causal process +resp = wps.causal( + dataset_train=train_file_path, + dataset_test=test_file_path, + target_column_name=target_name, + pcmci_test_choice="parcorr", + pcmci_max_lag=1, + tefs_direction="both", + 
tefs_use_contemporary_features=True, + tefs_max_lag_features=1, + tefs_max_lag_target=1, +) + print(resp) resp.get() From 758ac8b94d690dcaf1f861639c8e173771318aa7 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Tue, 26 Mar 2024 16:57:28 +0100 Subject: [PATCH 29/51] Import causal in the processes --- hawk/processes/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hawk/processes/__init__.py b/hawk/processes/__init__.py index 437441e..e8a8308 100644 --- a/hawk/processes/__init__.py +++ b/hawk/processes/__init__.py @@ -1,5 +1,7 @@ +from .wps_causal import Causal from .wps_say_hello import SayHello processes = [ SayHello(), + Causal(), ] From f78755969a2bae820ca1fc82af12c39f5471186a Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Tue, 26 Mar 2024 19:35:56 +0100 Subject: [PATCH 30/51] Possible fix of the test --- tests/test_wps_caps.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_wps_caps.py b/tests/test_wps_caps.py index 8613d73..9c638c8 100644 --- a/tests/test_wps_caps.py +++ b/tests/test_wps_caps.py @@ -1,8 +1,9 @@ from pywps import Service -from .common import client_for from hawk.processes import processes +from .common import client_for + def test_wps_caps(): client = client_for(Service(processes=processes)) @@ -12,5 +13,6 @@ def test_wps_caps(): '/wps:Process' '/ows:Identifier') assert sorted(names.split()) == [ + 'causal', 'hello', ] From 76aaa815c60517b64a4752adcc35696c9b3e865c Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Tue, 26 Mar 2024 19:41:28 +0100 Subject: [PATCH 31/51] Raise error in edge case --- hawk/processes/wps_causal.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hawk/processes/wps_causal.py b/hawk/processes/wps_causal.py index d66b020..72953ff 100644 --- a/hawk/processes/wps_causal.py +++ b/hawk/processes/wps_causal.py @@ -198,7 +198,8 @@ def _handler(self, request, response): workdir = Path(self.workdir) - # connect to the analysis class + if tefs_use_contemporary_features 
== False and tefs_max_lag_features == "no_lag": + raise ValueError("You cannot use no lag features and not use contemporary features in TEFS.") causal_analysis = CausalAnalysis( df_train, From 4f03dc8f7188c98ddeb108c0704090b09c58b621 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Tue, 26 Mar 2024 19:47:04 +0100 Subject: [PATCH 32/51] Use target input by user --- hawk/analysis/main.py | 20 ++++++++++++++++--- hawk/analysis/postprocessing.py | 34 +++++++++++++++++---------------- hawk/analysis/simulation.py | 5 +++-- 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/hawk/analysis/main.py b/hawk/analysis/main.py index 4ed9f07..422964b 100644 --- a/hawk/analysis/main.py +++ b/hawk/analysis/main.py @@ -145,6 +145,7 @@ def run_tefs_analysis( results.append( run_simulation_tefs( datasets=self.datasets, + target_column_name=self.target_column_name, config=config, ) ) @@ -219,8 +220,21 @@ def run(self): tefs_results = self.run_tefs_analysis() pcmci_results = self.run_pcmci_analysis() - self.plot_pcmci, self.details_pcmci = run_postprocessing_pcmci(pcmci_results, self.datasets, self.workdir) - self.plot_tefs, self.details_tefs = run_postprocessing_tefs(tefs_results, self.datasets, self.workdir) + self.plot_pcmci, self.details_pcmci = run_postprocessing_pcmci( + results_pcmci=pcmci_results, + target_column_name=self.target_column_name, + datasets=self.datasets, + destination_path=self.workdir, + ) + self.plot_tefs, self.details_tefs = run_postprocessing_tefs( + results_tefs=tefs_results, + target_column_name=self.target_column_name, + datasets=self.datasets, + destination_path=self.workdir, + ) self.plot_tefs_wrapper, self.details_tefs_wrapper = run_postprocessing_tefs_wrapper( - tefs_results, self.datasets, self.workdir + results_tefs=tefs_results, + target_column_name=self.target_column_name, + datasets=self.datasets, + destination_path=self.workdir, ) diff --git a/hawk/analysis/postprocessing.py b/hawk/analysis/postprocessing.py index 8f6037f..56436bd 100644 
--- a/hawk/analysis/postprocessing.py +++ b/hawk/analysis/postprocessing.py @@ -83,6 +83,7 @@ def plot_feature_presence_and_r2(df_presence, scores_values, scores_labels): def run_postprocessing_pcmci( results_pcmci, + target_column_name, datasets, destination_path, ): @@ -170,7 +171,7 @@ def run_postprocessing_pcmci( score_r2 = ( regression_analysis( inputs_names_lags=inputs_names_lags, - target_name="target", + target_name=target_column_name, df_train=dataframe["train"], df_test=dataframe["test"], ) @@ -182,7 +183,7 @@ def run_postprocessing_pcmci( score_r2_lag = ( regression_analysis( inputs_names_lags=inputs_names_lags, - target_name="target", + target_name=target_column_name, df_train=dataframe["train"], df_test=dataframe["test"], ) @@ -191,10 +192,10 @@ def run_postprocessing_pcmci( ) inputs_names_lags = {feature: list(range(0, simulation["params"]["lag"] + 1)) for feature in selected_features} - inputs_names_lags["target"] = list(range(1, simulation["params"]["lag"] + 1)) + inputs_names_lags[target_column_name] = list(range(1, simulation["params"]["lag"] + 1)) score_r2_lag_ar = regression_analysis( inputs_names_lags=inputs_names_lags, - target_name="target", + target_name=target_column_name, df_train=dataframe["train"], df_test=dataframe["test"], ) @@ -219,8 +220,8 @@ def run_postprocessing_pcmci( save_to_pkl_file(target_file_results_details, results_table_pcmci) # Feature presences heatmap - if "target" in all_basin_variables: - all_basin_variables.remove("target") + if target_column_name in all_basin_variables: + all_basin_variables.remove(target_column_name) all_basin_variables = sorted(list(all_basin_variables)) df_presence = pd.DataFrame(index=all_basin_variables, columns=range(len(results_pcmci))) scores = [] @@ -261,6 +262,7 @@ def run_postprocessing_pcmci( def run_postprocessing_tefs( results_tefs, + target_column_name, datasets, destination_path, ): @@ -292,7 +294,7 @@ def run_postprocessing_tefs( score_r2 = ( regression_analysis( 
inputs_names_lags=inputs_names_lags, - target_name="target", + target_name=target_column_name, df_train=dataframe["train"], df_test=dataframe["test"], ) @@ -304,7 +306,7 @@ def run_postprocessing_tefs( score_r2_lag = ( regression_analysis( inputs_names_lags=inputs_names_lags, - target_name="target", + target_name=target_column_name, df_train=dataframe["train"], df_test=dataframe["test"], ) @@ -313,10 +315,10 @@ def run_postprocessing_tefs( ) inputs_names_lags = {feature: lagfeatures for feature in selected_features_names} - inputs_names_lags["target"] = lagtarget + inputs_names_lags[target_column_name] = lagtarget score_r2_lag_ar = regression_analysis( inputs_names_lags=inputs_names_lags, - target_name="target", # TODO change to use the target column name given by the user + target_name=target_column_name, # TODO change to use the target column name given by the user df_train=dataframe["train"], df_test=dataframe["test"], ) @@ -341,8 +343,8 @@ def run_postprocessing_tefs( save_to_pkl_file(target_file_results_details, results_table_te) # Feature presences heatmap - if "target" in all_basin_variables: - all_basin_variables.remove("target") + if target_column_name in all_basin_variables: + all_basin_variables.remove(target_column_name) all_basin_variables = sorted(list(all_basin_variables)) df_presence = pd.DataFrame(index=all_basin_variables, columns=range(len(results_tefs))) scores = [] @@ -383,6 +385,7 @@ def run_postprocessing_tefs( def run_postprocessing_tefs_wrapper( results_tefs, + target_column_name, datasets, destination_path, ): @@ -397,8 +400,7 @@ def run_postprocessing_tefs_wrapper( dataset_name = simulation["dataset_name"] dataframe = datasets[dataset_name] - target_columns = ["target"] - features_columns = dataframe["full"].drop(columns=target_columns).columns + features_columns = dataframe["full"].drop(columns=[target_column_name]).columns # --------------------- Select features using threshold (conservative) --------------------- # 
selected_features_names_with_threshold = simulation["results"].select_features(simulation["params"]["threshold"]) # noqa @@ -418,13 +420,13 @@ def run_postprocessing_tefs_wrapper( lagtarget = simulation["params"]["lagtarget"] inputs_names_lags = {feature: lagfeatures for feature in selected_features_names} - inputs_names_lags["target"] = lagtarget + inputs_names_lags[target_column_name] = lagtarget # --- Compute the train_test version --- test_r2_train_test.append( regression_analysis( inputs_names_lags=inputs_names_lags, - target_name=target_columns[0], + target_name=target_column_name, df_train=dataframe["train"], df_test=dataframe["test"], ) diff --git a/hawk/analysis/simulation.py b/hawk/analysis/simulation.py index d048f28..ec21c8b 100644 --- a/hawk/analysis/simulation.py +++ b/hawk/analysis/simulation.py @@ -62,6 +62,7 @@ def run_simulation_pcmci( def run_simulation_tefs( datasets, config, + target_column_name, n_jobs=1, ): params = config["params"] @@ -79,8 +80,8 @@ def run_simulation_tefs( # param_str = param_str.replace(" ", "") # config_id = f"dataset{dataset_name}_{param_str}" - features = dataframe["full"].drop(columns=["target"]) - target = dataframe["full"]["target"] + features = dataframe["full"].drop(columns=[target_column_name]) + target = dataframe["full"][target_column_name] var_names = list(features.columns) # run the feature selection algorithm From 80931204debaae5414cdb0bcfd50a346b3e76e49 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Tue, 26 Mar 2024 19:48:20 +0100 Subject: [PATCH 33/51] Make linting happy --- hawk/processes/wps_causal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hawk/processes/wps_causal.py b/hawk/processes/wps_causal.py index 72953ff..25bccc6 100644 --- a/hawk/processes/wps_causal.py +++ b/hawk/processes/wps_causal.py @@ -198,7 +198,7 @@ def _handler(self, request, response): workdir = Path(self.workdir) - if tefs_use_contemporary_features == False and tefs_max_lag_features == "no_lag": + if not 
tefs_use_contemporary_features and tefs_max_lag_features == "no_lag": raise ValueError("You cannot use no lag features and not use contemporary features in TEFS.") causal_analysis = CausalAnalysis( From e22c28e75c503c9d23107ae684629ae666196f8e Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Tue, 26 Mar 2024 23:26:59 +0100 Subject: [PATCH 34/51] Change inputs hoping to make it work --- hawk/processes/simulation_interactive.py | 10 +++++----- hawk/processes/wps_causal.py | 5 +++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/hawk/processes/simulation_interactive.py b/hawk/processes/simulation_interactive.py index e0f6eeb..e189b3e 100644 --- a/hawk/processes/simulation_interactive.py +++ b/hawk/processes/simulation_interactive.py @@ -39,12 +39,12 @@ dataset_train=train_file_path, dataset_test=test_file_path, target_column_name=target_name, - pcmci_test_choice="parcorr", - pcmci_max_lag=1, + pcmci_test_choice="ParCorr", + pcmci_max_lag="1", tefs_direction="both", - tefs_use_contemporary_features=True, - tefs_max_lag_features=1, - tefs_max_lag_target=1, + tefs_use_contemporary_features="Yes", + tefs_max_lag_features="1", + tefs_max_lag_target="1", ) print(resp) diff --git a/hawk/processes/wps_causal.py b/hawk/processes/wps_causal.py index 25bccc6..6515ba7 100644 --- a/hawk/processes/wps_causal.py +++ b/hawk/processes/wps_causal.py @@ -79,7 +79,7 @@ def __init__(self): "TEFS Use Contemporary Features", data_type="boolean", abstract="Choose whether to use comtemporary features in the TEFS algorithm.", - default=False, + default="Yes", ), LiteralInput( "tefs_max_lag_features", @@ -87,7 +87,8 @@ def __init__(self): data_type="string", abstract="Choose the maximum lag of the features in the TEFS algorithm.", allowed_values=[ - "no_lag" "1", + "no_lag", + "1", "2", "3", "4", From 8f2075873cd999afbac727caa471dead9bbf1994 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 17:54:23 +0100 Subject: [PATCH 35/51] Fix axis of concatenation --- 
hawk/analysis/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hawk/analysis/main.py b/hawk/analysis/main.py index 422964b..0c682ae 100644 --- a/hawk/analysis/main.py +++ b/hawk/analysis/main.py @@ -28,7 +28,7 @@ def __init__( tefs_max_lag_target, workdir, ): - df_full = pd.concat([df_train, df_test], axis=1).reset_index(drop=True) + df_full = pd.concat([df_train, df_test], axis=0).reset_index(drop=True) df_full_tigramite = initialize_tigramite_df(df_full) self.datasets = { From 265acd93bf32a127aea4e1be4feba70c2b0be277 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 17:54:35 +0100 Subject: [PATCH 36/51] Fix typo --- hawk/analysis/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hawk/analysis/main.py b/hawk/analysis/main.py index 0c682ae..fc3d533 100644 --- a/hawk/analysis/main.py +++ b/hawk/analysis/main.py @@ -92,8 +92,8 @@ def run_baseline_analysis(self): "r2": regression_analysis( inputs_names_lags=inputs_names_lags, target_name=self.target_column_name, - df_train=self.datasets["normale"]["train"], - df_test=self.datasets["normale"]["test"], + df_train=self.datasets["normal"]["train"], + df_test=self.datasets["normal"]["test"], ), } From 6b12d196010732aeb04f6ee4a5d9a707199bdae3 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 17:55:01 +0100 Subject: [PATCH 37/51] Select PCMCI test based on user input --- hawk/analysis/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hawk/analysis/main.py b/hawk/analysis/main.py index fc3d533..10d5ac5 100644 --- a/hawk/analysis/main.py +++ b/hawk/analysis/main.py @@ -173,10 +173,10 @@ def run_pcmci_analysis( "cmiknn": cmiknn, } - independence_tests_options = [ - "parcorr", - "cmiknn", - ] + if self.pcmci_test_choice == "ParCorr": + independence_tests_options = ["parcorr"] + elif self.pcmci_test_choice == "CMIknn": + independence_tests_options = ["cmiknn"] algorithm_options = [ "pcmci_plus", From 
7f4696083ac004d1253c01b90b767d087e3c0c0b Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 17:55:24 +0100 Subject: [PATCH 38/51] Fix lag options PCMCI --- hawk/analysis/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hawk/analysis/main.py b/hawk/analysis/main.py index 10d5ac5..7f5d06b 100644 --- a/hawk/analysis/main.py +++ b/hawk/analysis/main.py @@ -155,7 +155,7 @@ def run_tefs_analysis( def run_pcmci_analysis( self, ): - lag_options = [self.pcmci_features_lags[: i + 1] for i in range(len(self.pcmci_features_lags))] + lag_options = self.pcmci_features_lags # max lag # Define the tests parcorr = ParCorr(significance="analytic") From d6f571888fa12867baea2ad4388f2c6fbe4c8a9a Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 17:55:49 +0100 Subject: [PATCH 39/51] Fix basin variable updates --- hawk/analysis/postprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hawk/analysis/postprocessing.py b/hawk/analysis/postprocessing.py index 56436bd..093c837 100644 --- a/hawk/analysis/postprocessing.py +++ b/hawk/analysis/postprocessing.py @@ -92,7 +92,7 @@ def run_postprocessing_pcmci( for simulation in results_pcmci: dataframe = datasets[simulation["dataset_name"]] var_names = dataframe["var_names"] - all_basin_variables.update(var_names.values) + all_basin_variables.update(var_names) results = simulation["results"] From b41668ca24d653a2ea2528ffa1b4042eaf7e5f89 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 17:56:18 +0100 Subject: [PATCH 40/51] Cast input data --- hawk/processes/wps_causal.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hawk/processes/wps_causal.py b/hawk/processes/wps_causal.py index 6515ba7..3a1e337 100644 --- a/hawk/processes/wps_causal.py +++ b/hawk/processes/wps_causal.py @@ -190,12 +190,12 @@ def _handler(self, request, response): df_test = pd.read_csv(request.inputs["dataset_test"][0].file) pcmci_test_choice = 
request.inputs["pcmci_test_choice"][0].data - pcmci_max_lag = request.inputs["pcmci_max_lag"][0].data + pcmci_max_lag = int(request.inputs["pcmci_max_lag"][0].data) tefs_direction = request.inputs["tefs_direction"][0].data tefs_use_contemporary_features = request.inputs["tefs_use_contemporary_features"][0].data - tefs_max_lag_features = request.inputs["tefs_max_lag_features"][0].data - tefs_max_lag_target = request.inputs["tefs_max_lag_target"][0].data + tefs_max_lag_features = int(request.inputs["tefs_max_lag_features"][0].data) + tefs_max_lag_target = int(request.inputs["tefs_max_lag_target"][0].data) workdir = Path(self.workdir) From 2de2d6c4061e68c791508a19fd42398a65c3b6d1 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 17:56:51 +0100 Subject: [PATCH 41/51] Require last version of tigramite --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 28a8093..f89c934 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ jinja2 psutil pywps>=4.5.1,<4.6 birdhouse-birdy -tigramite +tigramite>=5.2.5.1 tefs pandas scikit-learn From 83e7a57c3a3b86843e6718865dea8b8f50514fb3 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 20:15:57 +0100 Subject: [PATCH 42/51] Expand y axis limits --- hawk/analysis/postprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hawk/analysis/postprocessing.py b/hawk/analysis/postprocessing.py index 093c837..8897913 100644 --- a/hawk/analysis/postprocessing.py +++ b/hawk/analysis/postprocessing.py @@ -499,7 +499,7 @@ def run_postprocessing_tefs_wrapper( step = 10 ax.set_xticks(range(0, num_total_features + 1, step)) ax.set_xticklabels(range(0, num_total_features + 1, step)) - ax.set_ylim(-0.1, 0.55) + ax.set_ylim(-0.1, 1.1) ax.grid() os.makedirs(os.path.dirname(target_file_train_test), exist_ok=True) From 46d7c6203dc751ec92e74853df80f2232e416579 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 
2024 20:16:09 +0100 Subject: [PATCH 43/51] Move legend outside the plot --- hawk/analysis/postprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hawk/analysis/postprocessing.py b/hawk/analysis/postprocessing.py index 8897913..9481b10 100644 --- a/hawk/analysis/postprocessing.py +++ b/hawk/analysis/postprocessing.py @@ -490,7 +490,7 @@ def run_postprocessing_tefs_wrapper( ax.set_xlabel("Number of features") ax.set_ylabel("Test $R^2$") ax.set_title("TEFS Wrapper") - ax.legend() + ax.legend(title="Configurations", bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0) if num_total_features < 30: step = 1 elif num_total_features < 80: From 5046c1ef3a86101d296e920089b8e74e2ac2e2fa Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 20:16:23 +0100 Subject: [PATCH 44/51] Hide maximum point (too confusing) --- hawk/analysis/postprocessing.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/hawk/analysis/postprocessing.py b/hawk/analysis/postprocessing.py index 9481b10..e0e0e89 100644 --- a/hawk/analysis/postprocessing.py +++ b/hawk/analysis/postprocessing.py @@ -467,16 +467,16 @@ def run_postprocessing_tefs_wrapper( param_str = "_".join(f"{k}{v}" for k, v in simulation["params"].items()) ax.plot(test_r2_train_test, marker="o", label=param_str) - maxima = np.where(test_r2_train_test == test_r2_train_test.max())[0] - ax.plot( - maxima, - test_r2_train_test[maxima], - marker="o", - color="red", - linestyle="None", - label="Maximum", - markersize=6, - ) + # maxima = np.where(test_r2_train_test == test_r2_train_test.max())[0] + # ax.plot( + # maxima, + # test_r2_train_test[maxima], + # marker="o", + # color="red", + # linestyle="None", + # label="Maximum", + # markersize=6, + # ) # ax.plot( # n_features_selected_with_threshold, # test_r2_train_test[n_features_selected_with_threshold], From 236090a0d26f76b57fdf4cdff009acaa31f85796 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 
20:18:58 +0100 Subject: [PATCH 45/51] Fix key errors in postprocessing --- hawk/analysis/postprocessing.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/hawk/analysis/postprocessing.py b/hawk/analysis/postprocessing.py index e0e0e89..8d27db5 100644 --- a/hawk/analysis/postprocessing.py +++ b/hawk/analysis/postprocessing.py @@ -207,7 +207,7 @@ def run_postprocessing_pcmci( "score_r2": score_r2, "score_r2_lag": score_r2_lag, "score_r2_lag_ar": score_r2_lag_ar, - "dataset": simulation["dataset_name"], + "dataset_name": simulation["dataset_name"], "algorithm": simulation["params"]["algorithm"], "independencetest": simulation["params"]["independencetest"], "lag": simulation["params"]["lag"], @@ -228,18 +228,18 @@ def run_postprocessing_pcmci( scores_lag = [] scores_lag_ar = [] - for index, simulation in enumerate(results_pcmci): - scores.append(simulation["score_r2"]) - scores_lag.append(simulation["score_r2_lag"]) - scores_lag_ar.append(simulation["score_r2_lag_ar"]) + for index, result in enumerate(results_table_pcmci): + scores.append(result["score_r2"]) + scores_lag.append(result["score_r2_lag"]) + scores_lag_ar.append(result["score_r2_lag_ar"]) # loop through the rows of the df, if the feature is in the list of selected features, put a 1 for feature in df_presence.index: - if feature in simulation["selected_features"]: + if feature in result["selected_features"]: df_presence.loc[feature, index] = 1 else: df_presence.loc[feature, index] = 0 - if feature not in datasets[simulation["dataset_name"]]["var_names"]: + if feature not in datasets[result["dataset_name"]]["var_names"]: df_presence.loc[feature, index] = 2 df_presence = df_presence.astype(float) @@ -269,8 +269,7 @@ def run_postprocessing_tefs( all_basin_variables = set() results_table_te = [] for simulation in results_tefs: - dataset_name = simulation["dataset_name"] - dataframe = datasets[dataset_name] + dataframe = datasets[simulation["dataset_name"]]
var_names = dataframe["var_names"] all_basin_variables.update(var_names) @@ -330,7 +329,7 @@ def run_postprocessing_tefs( "score_r2": score_r2, "score_r2_lag": score_r2_lag, "score_r2_lag_ar": score_r2_lag_ar, - "dataset": dataset_name, + "dataset_name": simulation["dataset_name"], "lagfeatures": simulation["params"]["lagfeatures"], "lagtarget": simulation["params"]["lagtarget"], "direction": simulation["params"]["direction"], # not putting threshold and k @@ -351,18 +350,18 @@ def run_postprocessing_tefs( scores_lag = [] scores_lag_ar = [] - for index, simulation in enumerate(results_tefs): - scores.append(simulation["score_r2"]) - scores_lag.append(simulation["score_r2_lag"]) - scores_lag_ar.append(simulation["score_r2_lag_ar"]) + for index, result in enumerate(results_table_te): + scores.append(result["score_r2"]) + scores_lag.append(result["score_r2_lag"]) + scores_lag_ar.append(result["score_r2_lag_ar"]) # loop through the rows of the df, if the feature is in the list of selected features, put a 1 for feature in df_presence.index: - if feature in simulation["selected_features"]: + if feature in result["selected_features"]: df_presence.loc[feature, index] = 1 else: df_presence.loc[feature, index] = 0 - if feature not in datasets[simulation["dataset_name"]]["var_names"]: + if feature not in datasets[result["dataset_name"]]["var_names"]: df_presence.loc[feature, index] = 2 df_presence = df_presence.astype(float) From eb05931d000f66ee3c59aa0995170b77f0c9995f Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 20:20:14 +0100 Subject: [PATCH 46/51] Check that list is not empty before taking the min --- hawk/analysis/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hawk/analysis/metrics.py b/hawk/analysis/metrics.py index fe56201..066f0c6 100644 --- a/hawk/analysis/metrics.py +++ b/hawk/analysis/metrics.py @@ -37,7 +37,7 @@ def prepare_data_with_lags( ) for lags in inputs_names_lags.values(): - if min(lags) < 0: + if lags and 
min(lags) < 0: raise ValueError("Lag for independent variables must be a non-negative integer.") # Initialize a list to hold all DataFrame chunks From 1be1cdcf47d9b57be588c10a9b30b7bc136c8cbb Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 20:24:57 +0100 Subject: [PATCH 47/51] In the baseline analysis, return the saved pickle file path --- hawk/analysis/main.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hawk/analysis/main.py b/hawk/analysis/main.py index 7f5d06b..cfdce88 100644 --- a/hawk/analysis/main.py +++ b/hawk/analysis/main.py @@ -1,9 +1,11 @@ import itertools +import os import pandas as pd from tigramite.independence_tests.cmiknn import CMIknn from tigramite.independence_tests.parcorr import ParCorr +from .file_management import save_to_pkl_file from .metrics import regression_analysis from .pcmci_tools import initialize_tigramite_df from .postprocessing import ( @@ -97,7 +99,10 @@ def run_baseline_analysis(self): ), } - return baseline + target_file = os.path.join(self.workdir, "baseline.pkl") + save_to_pkl_file(target_file, baseline) + + return target_file def run_tefs_analysis( self, From 1670270cf491a233c7d423b5cbb025565d76e189 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 20:25:40 +0100 Subject: [PATCH 48/51] Implement response update --- hawk/analysis/main.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/hawk/analysis/main.py b/hawk/analysis/main.py index cfdce88..ccc504d 100644 --- a/hawk/analysis/main.py +++ b/hawk/analysis/main.py @@ -29,7 +29,10 @@ def __init__( tefs_max_lag_features, tefs_max_lag_target, workdir, + response=None, ): + self.response = response + df_full = pd.concat([df_train, df_test], axis=0).reset_index(drop=True) df_full_tigramite = initialize_tigramite_df(df_full) @@ -221,22 +224,34 @@ def run_pcmci_analysis( return results def run(self): + if self.response: + self.response.update_status("Performing baseline analysis", 16) self.baseline = 
self.run_baseline_analysis() + if self.response: + self.response.update_status("Performing TEFS analysis", 33) tefs_results = self.run_tefs_analysis() + if self.response: + self.response.update_status("Performing PCMCI analysis", 66) pcmci_results = self.run_pcmci_analysis() + if self.response: + self.response.update_status("Postprocessing PCMCI", 80) self.plot_pcmci, self.details_pcmci = run_postprocessing_pcmci( results_pcmci=pcmci_results, target_column_name=self.target_column_name, datasets=self.datasets, destination_path=self.workdir, ) + if self.response: + self.response.update_status("Postprocessing TEFS", 90) self.plot_tefs, self.details_tefs = run_postprocessing_tefs( results_tefs=tefs_results, target_column_name=self.target_column_name, datasets=self.datasets, destination_path=self.workdir, ) + if self.response: + self.response.update_status("Postprocessing TEFS Wrapper", 95) self.plot_tefs_wrapper, self.details_tefs_wrapper = run_postprocessing_tefs_wrapper( results_tefs=tefs_results, target_column_name=self.target_column_name, From f987e2717d75e08617837ecfd924aa946fe71617 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 20:26:09 +0100 Subject: [PATCH 49/51] Temporary fix for making PCMCI selection work --- hawk/analysis/main.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hawk/analysis/main.py b/hawk/analysis/main.py index ccc504d..b311416 100644 --- a/hawk/analysis/main.py +++ b/hawk/analysis/main.py @@ -33,6 +33,11 @@ def __init__( ): self.response = response + # Move target column as last column for the get_connected_variables + # function which requires it (TODO this would be interesting to be fixed) + df_train = df_train[[col for col in df_train.columns if col != target_column_name] + [target_column_name]] + df_test = df_test[[col for col in df_test.columns if col != target_column_name] + [target_column_name]] + df_full = pd.concat([df_train, df_test], axis=0).reset_index(drop=True) df_full_tigramite = 
initialize_tigramite_df(df_full) From 9f228e794d1783553be1f1f3f7d5e6c8471b4bcd Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 20:26:28 +0100 Subject: [PATCH 50/51] Update response in the wps file as well --- hawk/processes/wps_causal.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hawk/processes/wps_causal.py b/hawk/processes/wps_causal.py index 3a1e337..1f25c35 100644 --- a/hawk/processes/wps_causal.py +++ b/hawk/processes/wps_causal.py @@ -213,6 +213,7 @@ def _handler(self, request, response): tefs_max_lag_features, tefs_max_lag_target, workdir, + response, ) causal_analysis.run() @@ -225,4 +226,6 @@ def _handler(self, request, response): response.outputs["png_tefs_wrapper"].file = causal_analysis.plot_tefs_wrapper response.outputs["pkl_tefs_wrapper"].file = causal_analysis.details_tefs_wrapper + response.update_status("Processing completed", 100) + return response From c4be4e24cc408eb86667d253155b2d342a18e9a1 Mon Sep 17 00:00:00 2001 From: Teo Bucci Date: Wed, 27 Mar 2024 20:32:38 +0100 Subject: [PATCH 51/51] Simplify testing file --- hawk/processes/simulation_interactive.py | 41 +++++------------------- 1 file changed, 8 insertions(+), 33 deletions(-) diff --git a/hawk/processes/simulation_interactive.py b/hawk/processes/simulation_interactive.py index e189b3e..2d3bd68 100644 --- a/hawk/processes/simulation_interactive.py +++ b/hawk/processes/simulation_interactive.py @@ -1,46 +1,21 @@ -import numpy as np -import pandas as pd from birdy import WPSClient -# ----------- Generate some data ----------- -np.random.seed(0) -n = 1000 # number of samples -m = 15 # number of features - -data = {} -for i in range(1, m + 1): - data[f"x{i}"] = np.random.normal(size=n) - -target_name = "target" -data[target_name] = sum(data.values()) + np.random.normal(size=n) - -data = pd.DataFrame(data) - -n_test = int(0.20 * n) -n_train = n - n_test -data_test = data[n_train:] -data = data[:n_train] - -data.head() - - -train_file_path = "./train_dataset.csv" 
-test_file_path = "./test_dataset.csv" -data.to_csv(train_file_path, index=False) -data_test.to_csv(test_file_path, index=False) +train_file_path = "Emiliani1_train.csv" +test_file_path = "Emiliani1_test.csv" +target_column_name = "cyclostationary_mean_rr_4w_1" # ----------------- WPS ----------------- -wps = WPSClient("http://localhost:5000/wps", verify=False) +wps = WPSClient("http://localhost:5002/wps", verify=False) help(wps) # Input some data for the causal process resp = wps.causal( - dataset_train=train_file_path, - dataset_test=test_file_path, - target_column_name=target_name, + dataset_train=open(train_file_path), + dataset_test=open(test_file_path), + target_column_name=target_column_name, pcmci_test_choice="ParCorr", - pcmci_max_lag="1", + pcmci_max_lag="0", tefs_direction="both", tefs_use_contemporary_features="Yes", tefs_max_lag_features="1",