diff --git a/model.pth b/model.pth
index 944b69a4..4786a139 100644
Binary files a/model.pth and b/model.pth differ
diff --git a/src/config/config.json b/src/config/config.json
index bf5d4ec8..44ae91d7 100644
--- a/src/config/config.json
+++ b/src/config/config.json
@@ -2,7 +2,7 @@
     "logger" : {
         "level" : "INFO",
         "dir" : "./logs",
-        "tofile" : false,
+        "tofile" : true,
         "datefmt" : "%Y-%m-%d, %H:%M:%S",
         "filename_datefmt" : "%Y-%m-%d"
     }
diff --git a/src/neural_networks/preprocessing.py b/src/neural_networks/preprocessing.py
index 9e0d3c44..0d7dafb3 100644
--- a/src/neural_networks/preprocessing.py
+++ b/src/neural_networks/preprocessing.py
@@ -1,11 +1,18 @@
-from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
-from typing import Any
+from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler, OneHotEncoder
+from functools import lru_cache
+from typing import Any, Callable
 import pandas as pd
+import inspect
+
+from utils.terminal import getlogger
+
+logger = getlogger()
 
 
 def encode_label(df : pd.DataFrame, column : str, inplace : bool = True) -> LabelEncoder:
     encoder = LabelEncoder()
     df[f"{column}_encoded" if not inplace else column] = encoder.fit_transform(df[column])
+    logger.info(f"Encoded column '{column}' with LabelEncoder")
     return encoder
 
 def apply_scaler(df : pd.DataFrame, columns : list[str], scaler : MinMaxScaler | StandardScaler | MaxAbsScaler | RobustScaler, inplace : bool = True) -> MinMaxScaler | StandardScaler | MaxAbsScaler | RobustScaler:
@@ -13,4 +20,37 @@ def apply_scaler(df : pd.DataFrame, columns : list[str], scaler : MinMaxScaler |
 
     for column in columns:
         df[f"{column}_scaled" if not inplace else column] = scaler.fit_transform(df[[column]])
+    logger.info(f"Applied scaler '{scaler.__class__.__name__}' to columns: {columns}")
     return scaler
+
+def apply_onehot_encoder(df: pd.DataFrame, columns: list[str], inplace: bool = True) -> OneHotEncoder:
+    encoder = OneHotEncoder(handle_unknown="error")
+
+    for column in columns:
+        onehot_encoded_df = pd.DataFrame(encoder.fit_transform(df[[column]]).toarray())
+        # copy the encoded values (not the column labels) into the new columns
+        for i, col in enumerate(onehot_encoded_df.columns):
+            df[f"{column}_onehot_{i}"] = onehot_encoded_df[col].values
+        # when encoding in place, drop the source column that was expanded
+        if inplace:
+            df.drop(columns=[column], inplace=True)
+
+    logger.info(f"Applied OneHotEncoder to columns: {columns}")
+    return encoder
+
+def create_feature(df: pd.DataFrame, feature_name: str, func: Callable[[pd.Series], Any]) -> None:
+    """
+    Create a new feature by applying a function row-wise to the input DataFrame.
+
+    Parameters:
+        df (pandas.DataFrame): Input DataFrame
+        feature_name (str): Name of the new feature column
+        func (callable): Function applied to each row to compute the feature value
+    """
+    #@lru_cache(maxsize=1000)
+    #def apply_lambda(row : pd.Series):
+    #    return func(row)
+
+    df[feature_name] = df.apply(func, axis=1)
+
+    logger.info(f"Created feature '{feature_name}': {inspect.getsource(func).strip()}")
\ No newline at end of file
diff --git a/src/train_neural_network.py b/src/train_neural_network.py
index 86e0e66a..42e21705 100644
--- a/src/train_neural_network.py
+++ b/src/train_neural_network.py
@@ -1,5 +1,6 @@
 from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
 from sklearn.model_selection import train_test_split
+from datetime import datetime
 
 import pandas as pd
 import numpy as np
@@ -8,7 +9,12 @@
 import torch.nn as nn
 import torch
 
-from neural_networks.preprocessing import encode_label, apply_scaler
+from neural_networks.preprocessing import (
+    encode_label,
+    apply_scaler,
+    create_feature,
+    apply_onehot_encoder
+)
 from neural_networks.models.lstm import LSTM
 from neural_networks.training import train, test
 from utils.terminal import getlogger
@@ -34,15 +40,71 @@
 region_encoder = encode_label(df, "region")
 city_encoder = encode_label(df, "city")
 
-logger.info(f"Applying scalers")
-min_max_scaler = apply_scaler(df, ["feelsLike", "cloudcover", "humidity", "precip", "pressure", "temp", "uvIndex", "visibility", "windspeed", "weatherDescription", "winddir16Point"], MinMaxScaler())
-standard_scaler = apply_scaler(df, ["feelsLike", "cloudcover", "humidity", "precip", "pressure", "temp", "uvIndex", "visibility", "windspeed", "weatherDescription", "winddir16Point"], StandardScaler())
+logger.info("Creating new features")
+create_feature(df, "heat_index", lambda row: (row["feelsLike"] + row["temp"]) / 2 * row["humidity"])
+create_feature(df, "weather_index", lambda row: (row["precip"] + row["pressure"] + row["windspeed"]) / 3)
+create_feature(df, "sky_index", lambda row: (row["cloudcover"] * row["visibility"] + row["uvIndex"]) / 2)
+
+create_feature(df, "date_of_year", lambda row: (datetime(int(row["year"]), int(row["month"]), int(row["day"])).timetuple().tm_yday - 1) / 365 * 2 - 1)
+create_feature(df, "time_of_day", lambda row: (row["hour"] + row["minute"] / 60) % 24)  # fractional hours since midnight
+create_feature(df, "lat_to_eq", lambda row: abs(row["latitude"] - 23.4368))
+create_feature(df, "lon_to_me", lambda row: abs(row["longitude"]))
+create_feature(df, "lat_band", lambda row: int((row["latitude"] + 90) / 30))
+create_feature(df, "lon_range", lambda row: int((row["longitude"] / 60) % 2))
 
-X = df[['year', 'month', 'day','hour', 'minute', 'latitude', 'longitude']]
-y = df[["feelsLike", "cloudcover", "humidity", "precip", "pressure", "temp", "uvIndex", "visibility", "windspeed", "weatherDescription", "winddir16Point"]]
-#y = df[["feelsLike", "cloudcover", "humidity", "precip", "pressure", "temp", "uvIndex", "visibility", "windspeed"]]
+logger.info("Applying scalers")
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
+min_max_scaler = apply_scaler(df,
+    [
+        #"date_of_year", "time_of_day",
+        #"lat_to_eq", "lon_to_me",
+        #"lat_band", "lon_range",
+        "heat_index", "weather_index", "sky_index",
+        "feelsLike", "cloudcover", "humidity", "precip", "pressure", "temp",
+        "uvIndex", "visibility", "windspeed", "weatherDescription", "winddir16Point"],
+    MinMaxScaler()
+)
+#standard_scaler = apply_scaler(df, ["feelsLike", "cloudcover", "humidity", "precip", "pressure", "temp", "uvIndex", "visibility", "windspeed", "weatherDescription", "winddir16Point"], StandardScaler())
"uvIndex", "visibility", "windspeed", "weatherDescription", "winddir16Point"], StandardScaler()) + +X = df[[ + 'year', + 'month', + 'day', + 'hour', + 'minute', + 'latitude', + 'longitude', + 'date_of_year', + 'time_of_day', + #'lat_to_eq', + #'lon_to_me', + 'lat_band', + #'lon_range' +]] + +y = df[[ + "feelsLike", + "cloudcover", + "humidity", + "precip", + "pressure", + "temp", + "uvIndex", + "visibility", + "windspeed", + #"weatherDescription", + #"winddir16Point", + "heat_index", + "weather_index", + "sky_index" +]] + +print(X.head(50)) +print(y.head(50)) + +input("Press Enter to continue...") + +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device) X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)