Features and preprocessing logging
GitGinocchio committed Jan 2, 2025
1 parent a0428ac commit f8eb3ce
Showing 4 changed files with 111 additions and 11 deletions.
Binary file modified model.pth
Binary file not shown.
2 changes: 1 addition & 1 deletion src/config/config.json
@@ -2,7 +2,7 @@
"logger" : {
"level" : "INFO",
"dir" : "./logs",
"tofile" : false,
"tofile" : true,
"datefmt" : "%Y-%m-%d, %H:%M:%S",
"filename_datefmt" : "%Y-%m-%d"
}
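
This config change switches file logging on. As a rough sketch only (the project's utils.terminal.getlogger is not included in this commit, so the function below and its behaviour are assumptions), a "logger" block like this could be consumed roughly as follows:

# Hypothetical sketch; the real utils.terminal.getlogger is not shown in this commit.
import json, logging, os
from datetime import datetime

def getlogger(config_path: str = "./src/config/config.json") -> logging.Logger:
    with open(config_path) as fp:
        cfg = json.load(fp)["logger"]

    logger = logging.getLogger("app")
    logger.setLevel(cfg["level"])

    formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s", datefmt=cfg["datefmt"])
    handlers: list[logging.Handler] = [logging.StreamHandler()]

    if cfg["tofile"]:  # now true, so logs also go to a dated file under cfg["dir"]
        os.makedirs(cfg["dir"], exist_ok=True)
        filename = datetime.now().strftime(cfg["filename_datefmt"]) + ".log"
        handlers.append(logging.FileHandler(os.path.join(cfg["dir"], filename)))

    for handler in handlers:
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger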
43 changes: 41 additions & 2 deletions src/neural_networks/preprocessing.py
@@ -1,16 +1,55 @@
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from typing import Any
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler, OneHotEncoder
from functools import lru_cache
from typing import Any, Callable
import pandas as pd
import inspect

from utils.terminal import getlogger

logger = getlogger()

def encode_label(df : pd.DataFrame, column : str, inplace : bool = True) -> LabelEncoder:
    encoder = LabelEncoder()
    df[f"{column}_encoded" if not inplace else column] = encoder.fit_transform(df[column])

    logger.info(f"Encoded Column '{column}' with LabelEncoder")
    return encoder

def apply_scaler(df : pd.DataFrame, columns : list[str], scaler : MinMaxScaler | StandardScaler | MaxAbsScaler | RobustScaler, inplace : bool = True) -> MinMaxScaler | StandardScaler | MaxAbsScaler | RobustScaler:

    # Note: the same scaler instance is refit for every column, so the returned
    # object holds the fit parameters of the last column only.
    for column in columns:
        df[f"{column}_scaled" if not inplace else column] = scaler.fit_transform(df[[column]])

    logger.info(f"Applied Scaler '{scaler.__class__}' to columns: {columns}")
    return scaler

def apply_onehot_encoder(df: pd.DataFrame, columns: list[str], inplace: bool = True) -> OneHotEncoder:
    encoder = OneHotEncoder(handle_unknown="error")

    for column in columns:
        # fit_transform returns a sparse matrix; densify it and keep the original row index
        onehot_encoded_df = pd.DataFrame(encoder.fit_transform(df[[column]]).toarray(), index=df.index)
        # Assign the encoded values (not the column label) to new "<column>_onehot_<i>" columns.
        # Both the inplace and non-inplace paths create new columns, since a single column
        # cannot be replaced by several one-hot columns in place.
        for i, col in enumerate(onehot_encoded_df.columns):
            df[f"{column}_onehot_{i}"] = onehot_encoded_df[col]

    logger.info(f"Applied OneHotEncoder to columns: {columns}")
    return encoder

def create_feature(df: pd.DataFrame, feature_name: str, func: Callable[[pd.Series], pd.Series]) -> None:
    """
    Create a new feature by applying a function row-wise to the input DataFrame.

    Parameters:
        df (pandas.DataFrame): Input DataFrame
        feature_name (str): Name of the new column to create
        func (Callable): Function applied to each row to compute the feature value
    """
    #@lru_cache(maxsize=1000)
    #def apply_lambda(row : pd.Series):
    #    return func(row)

    df[feature_name] = df.apply(lambda row: func(row), axis=1)

    logger.info(f"Created Feature named '{feature_name}': {inspect.getsource(func).strip()}")
77 changes: 69 additions & 8 deletions src/train_neural_network.py
@@ -1,5 +1,6 @@
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from sklearn.model_selection import train_test_split
from datetime import datetime
import pandas as pd
import numpy as np

@@ -8,7 +9,11 @@
import torch.nn as nn
import torch

from neural_networks.preprocessing import encode_label, apply_scaler
from neural_networks.preprocessing import \
    encode_label, \
    apply_scaler, \
    create_feature, \
    apply_onehot_encoder
from neural_networks.models.lstm import LSTM
from neural_networks.training import train, test
from utils.terminal import getlogger
@@ -34,15 +39,71 @@
region_encoder = encode_label(df, "region")
city_encoder = encode_label(df, "city")

logger.info(f"Applying scalers")
min_max_scaler = apply_scaler(df, ["feelsLike", "cloudcover", "humidity", "precip", "pressure", "temp", "uvIndex", "visibility", "windspeed", "weatherDescription", "winddir16Point"], MinMaxScaler())
standard_scaler = apply_scaler(df, ["feelsLike", "cloudcover", "humidity", "precip", "pressure", "temp", "uvIndex", "visibility", "windspeed", "weatherDescription", "winddir16Point"], StandardScaler())
logger.info(f"Creating new Features")
create_feature(df, "heat_index", lambda row: (row["feelsLike"] + row["temp"]) / 2 * row["humidity"])
create_feature(df,"weather_index", lambda row: (row["precip"] + row["pressure"] + row["windspeed"]) / 3)
create_feature(df, "sky_index", lambda row: (row["cloudcover"] * row["visibility"] + row["uvIndex"]) / 2)

create_feature(df, "date_of_year", lambda row: (datetime(int(row["year"]), int(row["month"]), int(row["day"])).timetuple().tm_yday - 1) / 365 * 2 - 1)
create_feature(df, "time_of_day", lambda row: ((row["hour"] + row["minute"]) / 60) % 24)
create_feature(df, "lat_to_eq", lambda row: abs(row["latitude"] - 23.4368))
create_feature(df, "lon_to_me", lambda row: abs(row["longitude"]))
create_feature(df, "lat_band", lambda row: int((row["latitude"] + 90) / 30))
create_feature(df, "lon_range", lambda row: int((row["longitude"] / 60) % 2))

X = df[['year', 'month', 'day','hour', 'minute', 'latitude', 'longitude']]
y = df[["feelsLike", "cloudcover", "humidity", "precip", "pressure", "temp", "uvIndex", "visibility", "windspeed", "weatherDescription", "winddir16Point"]]
#y = df[["feelsLike", "cloudcover", "humidity", "precip", "pressure", "temp", "uvIndex", "visibility", "windspeed"]]
logger.info(f"Applying scalers")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
min_max_scaler = apply_scaler(df,
    [
        #"date_of_year", "time_of_day",
        #"lat_to_eq", "lon_to_me",
        #"lat_band", "lon_range",
        "heat_index", "weather_index", "sky_index",
        "feelsLike", "cloudcover", "humidity", "precip", "pressure", "temp",
        "uvIndex", "visibility", "windspeed", "weatherDescription", "winddir16Point"],
    MinMaxScaler()
)
#standard_scaler = apply_scaler(df, ["feelsLike", "cloudcover", "humidity", "precip", "pressure", "temp", "uvIndex", "visibility", "windspeed", "weatherDescription", "winddir16Point"], StandardScaler())

X = df[[
    'year',
    'month',
    'day',
    'hour',
    'minute',
    'latitude',
    'longitude',
    'date_of_year',
    'time_of_day',
    #'lat_to_eq',
    #'lon_to_me',
    'lat_band',
    #'lon_range'
]]

y = df[[
    "feelsLike",
    "cloudcover",
    "humidity",
    "precip",
    "pressure",
    "temp",
    "uvIndex",
    "visibility",
    "windspeed",
    #"weatherDescription",
    #"winddir16Point",
    "heat_index",
    "weather_index",
    "sky_index"
]]

print(X.head(50))
print(y.head(50))

input("Press Enter to continue...")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)
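
As a quick standalone check of what some of the new feature lambdas compute (illustrative only, using one made-up row; not part of the commit):

from datetime import datetime

# One made-up input row, just to trace the feature formulas above.
row = {"year": 2025, "month": 1, "day": 2,
       "feelsLike": 3.0, "temp": 4.0, "humidity": 80.0,
       "latitude": 45.46}

heat_index   = (row["feelsLike"] + row["temp"]) / 2 * row["humidity"]            # (3 + 4) / 2 * 80 = 280.0
date_of_year = (datetime(int(row["year"]), int(row["month"]), int(row["day"]))
                .timetuple().tm_yday - 1) / 365 * 2 - 1                          # Jan 2 -> approx -0.995
lat_band     = int((row["latitude"] + 90) / 30)                                  # 45.46 -> band 4
print(heat_index, date_of_year, lat_band)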
