Skip to content

Commit

Permalink
dataloading in separate file
Browse files Browse the repository at this point in the history
  • Loading branch information
Kirill456Z committed Dec 20, 2024
1 parent 48b7955 commit 47b85ff
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 90 deletions.
60 changes: 60 additions & 0 deletions src/scripts/load_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go


# Maps the verbose column headers of character_processed.csv to the
# snake_case names used throughout the rest of this script.
_CHARACTER_COLUMN_NAMES = {
    "Wikipedia movie ID": "wikipedia_movie_id",
    "Freebase movie ID": "fb_movie_id",
    "Character name": "character_name",
    "Actor gender": "actor_gender",
    "Actor height (in meters)": "actor_height",
    "Actor ethnicity (Freebase ID)": "fb_actor_eth_id",
    "Actor name": "actor_name",
    "Freebase character/actor map ID": "fb_char_actor_map_id",
    "Freebase character ID": "fb_char_id",
    "Freebase actor ID": "fb_actor_id",
}

# Archetype predictions table. Paths are relative to the current working
# directory, so this must be run from a directory two levels below the
# repository root.
archetype_data = pd.read_csv(
    "../../data/enriched/persona_identification/archetype_predictions_joined.csv"
)

# Character metadata: normalise the column names, then keep only the first row
# for every (movie, actor, character name) combination.
character_data = (
    pd.read_csv("../../data/MovieSummaries/character_processed.csv")
    .rename(columns=_CHARACTER_COLUMN_NAMES)
    .drop_duplicates(subset=["fb_movie_id", "fb_actor_id", "character_name"])
)

# Actor-level attributes kept from the Freebase-enriched actors file; every
# other column in the CSV is discarded. `id` is the actor identifier column.
_ACTOR_COLUMNS = [
    "education",
    "professions_num",
    "date_of_birth",
    "nationality",
    "gender",
    "place_of_birth",
    "height",
    "weight",
    "religion",
    "id",
]

actor_data = pd.read_csv("../../data/enriched/actors/actors_freebase.csv").loc[:, _ACTOR_COLUMNS]

# Attach character metadata to every archetype prediction. The inner join keeps
# only predictions whose (actor, movie, character name) triple is also present
# in character_data.
merged = pd.merge(
    archetype_data,
    character_data,
    how="inner",
    left_on=["actor_fb_id", "movie_fb_id", "character_name"],
    right_on=["fb_actor_id", "fb_movie_id", "character_name"],
)

# Add the actor-level attributes. The left join keeps rows whose actor has no
# entry in actor_data (their actor-level columns become NaN).
# pd.merge already returns a new frame, so no extra copy is needed.
merged = pd.merge(merged, actor_data, how="left", left_on="actor_fb_id", right_on="id")

# Back-fill gaps in the character-level height/gender with the actor-level
# values. fillna with a Series aligns on the index and only touches entries
# that are missing, so existing character-level values always win.
merged["actor_height"] = merged["actor_height"].fillna(merged["height"])
merged["actor_gender"] = merged["actor_gender"].fillna(merged["gender"])

# Columns carried over from the merged table into the final analysis table.
_ANALYSIS_COLUMNS = [
    "prediction",
    "character_name",
    "movie_name",
    "actor_gender",
    "actor_height",
    "actor_name",
    "actor_date_of_birth",
    "movie_release_date",
    "ethn_name",
    "race",
    "education",
    "professions_num",
    "nationality",
    "gender",
    "place_of_birth",
    "weight",
    "religion",
    "fb_movie_id",
    "fb_actor_id",
]
data = merged[_ANALYSIS_COLUMNS].copy()

# Remove height outliers (bounds picked by looking at the histogram). Rows with
# an unknown height are kept; both bounds are inclusive.
MIN_HEIGHT = 0.8
MAX_HEIGHT = 2.7  # Max Palmen had height 249 cm
data = data[data.actor_height.between(MIN_HEIGHT, MAX_HEIGHT) | data.actor_height.isna()].copy()

# Time elapsed between the actor's birth and the movie's release, in years
# (day difference divided by 365.25).
_release_date = pd.to_datetime(data.movie_release_date)
_birth_date = pd.to_datetime(data.actor_date_of_birth)
data["years_in_film"] = (_release_date - _birth_date).dt.days / 365.25

# Weight divided by squared height (height is in meters per the source column
# header; assumes weight is in kilograms — TODO confirm).
data["actor_bmi"] = data.weight / data.actor_height.pow(2)

# Cast the known education values to int.
# NOTE(review): if `education` is float64 because it contains NaN, the values
# are cast back to float on assignment and this has no visible effect —
# confirm whether a nullable integer dtype was intended.
_has_education = data.education.notna()
data.loc[_has_education, "education"] = data.loc[_has_education, "education"].astype(int)

# Normalise the spelled-out gender labels to the single-letter codes.
data["actor_gender"] = data["actor_gender"].replace({"Male": "M", "Female": "F"})

data = data.rename(columns={"prediction": "archetype"})
111 changes: 21 additions & 90 deletions src/story/Question_7.ipynb

Large diffs are not rendered by default.

0 comments on commit 47b85ff

Please sign in to comment.