-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpostprocess_data.py
38 lines (31 loc) · 1.36 KB
/
postprocess_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:light
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.3'
#       jupytext_version: 1.0.5
#   kernelspec:
#     display_name: altmetrics
#     language: python
#     name: altmetrics
# ---
import pandas as pd
from pathlib import Path
# Load the three exported tables from the output directory. Each table uses
# its "id" column as the row index.
output_dir = Path("output/")
researchers = pd.read_csv(
    output_dir.joinpath("researchers.csv"),
    index_col="id",
    dtype={"ID_number": str},  # read ID_number as text, not an inferred dtype
)
phds = pd.read_csv(output_dir.joinpath("phds.csv"), index_col="id")
postdocs = pd.read_csv(output_dir.joinpath("postdocs.csv"), index_col="id")
# Collapse multiple rows per researcher into one row: within every "rid"
# group, the values of each selected column are converted with str() and
# joined with " | ". Missing values go through str() as well, so they end up
# as literal text (e.g. "nan") in the joined string.
def _pipe_join(values):
    """Return the items of *values* as strings joined by " | "."""
    return " | ".join(map(str, values))


def _collapse_group(group):
    """Pipe-join every column of one group's frame into a single value."""
    return group.apply(_pipe_join)


# One row per researcher for PhDs.
merged_phds = phds.groupby("rid")[
    ["phd_institution", "phd_institution_other", "phd_year", "phd_type"]
].apply(_collapse_group)
# One row per researcher for postdocs.
merged_postdocs = postdocs.groupby("rid")[
    ["postdoc_year", "postdoc_status", "postdoc_institution"]
].apply(_collapse_group)
# Attach the collapsed PhD and postdoc columns to the researcher table.
# Both merges are left joins on the index, so every researcher row is kept;
# researchers with no matching "rid" group get missing values in the added
# columns.
researchers = researchers.merge(
    merged_phds, left_index=True, right_index=True, how="left"
).merge(merged_postdocs, left_index=True, right_index=True, how="left")
# "id" is the index of this frame (researchers.csv is read with
# index_col="id"), not a column, so rename(columns={'id': 'rid'}) matched
# nothing and silently did nothing. Rename the index axis instead so the
# identifier is written to the CSV under the "rid" header.
researchers = researchers.rename_axis("rid")
researchers.to_csv(output_dir / "researchers_phds_postdocs.csv")