-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmess_data.py
67 lines (48 loc) · 1.9 KB
/
mess_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import random
import pandas as pd
import numpy as np
# DataFrame.sample() draws from NumPy's global random state, not from the
# stdlib `random` module, so NumPy must be seeded as well for the generated
# file to be reproducible from run to run.
random.seed(2019)
np.random.seed(2019)
def add_tall_people(df, frac=0.001):
    """Add 100 to the number in column 10 for a random fraction of rows.

    Column 10 is assumed to hold strings such as ``'185 cm'`` (presumably a
    height); the new value is written back as ``'<number> cm'``.  The frame
    is modified in place and also returned.

    Args:
        df: DataFrame with at least 11 columns.
        frac: Fraction of rows to sample.

    Returns:
        The same ``df`` object.
    """
    # Sample row *positions*: ``df.sample(...).index`` yields labels, which
    # are only valid for ``iloc`` when the frame has a default RangeIndex.
    rows = pd.Series(range(len(df))).sample(frac=frac).to_numpy()
    # Parse the leading integer rather than a fixed-width slice so values of
    # any digit count work; missing or unparseable cells become NaN.
    digits = df.iloc[rows, 10].str.extract(r'^\s*(-?\d+)', expand=False)
    numbers = pd.to_numeric(digits, errors='coerce')
    usable = numbers.notna().to_numpy()
    if not usable.any():
        return df
    # Skip missing cells (they would otherwise become the string 'nan cm')
    # and format as integers so no '285.0 cm' is produced.
    updated = (numbers[usable].astype(int) + 100).astype(str) + ' cm'
    df.iloc[rows[usable], 10] = updated.to_numpy()
    return df
def add_short_people(df, frac=0.001):
    """Subtract 100 from the number in column 10 for a random fraction of rows.

    Column 10 is assumed to hold strings such as ``'185 cm'`` (presumably a
    height); the new value is written back as ``'<number> cm'``.  The frame
    is modified in place and also returned.

    Args:
        df: DataFrame with at least 11 columns.
        frac: Fraction of rows to sample.

    Returns:
        The same ``df`` object.
    """
    # Sample row *positions*: ``df.sample(...).index`` yields labels, which
    # are only valid for ``iloc`` when the frame has a default RangeIndex.
    rows = pd.Series(range(len(df))).sample(frac=frac).to_numpy()
    # Parse the leading integer rather than a fixed-width slice so values of
    # any digit count work; missing or unparseable cells become NaN.
    digits = df.iloc[rows, 10].str.extract(r'^\s*(-?\d+)', expand=False)
    numbers = pd.to_numeric(digits, errors='coerce')
    usable = numbers.notna().to_numpy()
    if not usable.any():
        return df
    # Skip missing cells (they would otherwise become the string 'nan cm')
    # and format as integers so no '85.0 cm' is produced.
    updated = (numbers[usable].astype(int) - 100).astype(str) + ' cm'
    df.iloc[rows[usable], 10] = updated.to_numpy()
    return df
def add_heavy_people(df, frac=0.001):
    """Add 105 to the number in column 11 for a random fraction of rows.

    Column 11 is assumed to hold strings such as ``'72 kg'`` (presumably a
    weight); the new value is written back as ``'<number> kg'``.  The frame
    is modified in place and also returned.

    Args:
        df: DataFrame with at least 12 columns.
        frac: Fraction of rows to sample.

    Returns:
        The same ``df`` object.
    """
    # Sample row *positions*: ``df.sample(...).index`` yields labels, which
    # are only valid for ``iloc`` when the frame has a default RangeIndex.
    rows = pd.Series(range(len(df))).sample(frac=frac).to_numpy()
    # Parse the whole leading integer: a two-character slice would read
    # '100 kg' as 10.  Missing or unparseable cells become NaN.
    digits = df.iloc[rows, 11].str.extract(r'^\s*(-?\d+)', expand=False)
    numbers = pd.to_numeric(digits, errors='coerce')
    usable = numbers.notna().to_numpy()
    if not usable.any():
        return df
    # Skip missing cells (they would otherwise become the string 'nan kg')
    # and format as integers so no '177.0 kg' is produced.
    updated = (numbers[usable].astype(int) + 105).astype(str) + ' kg'
    df.iloc[rows[usable], 11] = updated.to_numpy()
    return df
def add_light_people(df, frac=0.001):
    """Subtract 50 from the number in column 11 for a random fraction of rows.

    Column 11 is assumed to hold strings such as ``'72 kg'`` (presumably a
    weight); the new value is written back as ``'<number> kg'``.  The frame
    is modified in place and also returned.

    Args:
        df: DataFrame with at least 12 columns.
        frac: Fraction of rows to sample.

    Returns:
        The same ``df`` object.
    """
    # Sample row *positions*: ``df.sample(...).index`` yields labels, which
    # are only valid for ``iloc`` when the frame has a default RangeIndex.
    rows = pd.Series(range(len(df))).sample(frac=frac).to_numpy()
    # Parse the whole leading integer: a two-character slice would read
    # '100 kg' as 10.  Missing or unparseable cells become NaN.
    digits = df.iloc[rows, 11].str.extract(r'^\s*(-?\d+)', expand=False)
    numbers = pd.to_numeric(digits, errors='coerce')
    usable = numbers.notna().to_numpy()
    if not usable.any():
        return df
    # Skip missing cells (they would otherwise become the string 'nan kg')
    # and format as integers so no '22.0 kg' is produced.
    updated = (numbers[usable].astype(int) - 50).astype(str) + ' kg'
    df.iloc[rows[usable], 11] = updated.to_numpy()
    return df
def add_nan(df, frac=0.1):
    """Blank out every column of a random fraction of rows with NaN.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame to modify.
        frac: Fraction of rows to overwrite with NaN.

    Returns:
        The same ``df`` object.
    """
    # Sample row *positions*: ``df.sample(...).index`` yields labels, which
    # are only valid for ``iloc`` when the frame has a default RangeIndex.
    rows = pd.Series(range(len(df))).sample(frac=frac).to_numpy()
    df.iloc[rows, :] = np.nan
    return df
def change_case(df, frac=0.3):
    """Upper-case, then lower-case, random fractions of the strings in column 1.

    Two independent samples are drawn, so a row picked by both ends up
    lower-case.  The frame is modified in place and also returned.

    Args:
        df: DataFrame whose column at position 1 holds strings.
        frac: Fraction of rows drawn for each of the two passes.

    Returns:
        The same ``df`` object.
    """
    # Sample row *positions*: ``df.sample(...).index`` yields labels, which
    # are only valid for ``iloc`` when the frame has a default RangeIndex.
    upper_rows = pd.Series(range(len(df))).sample(frac=frac).to_numpy()
    df.iloc[upper_rows, 1] = df.iloc[upper_rows, 1].str.upper().to_numpy()

    lower_rows = pd.Series(range(len(df))).sample(frac=frac).to_numpy()
    df.iloc[lower_rows, 1] = df.iloc[lower_rows, 1].str.lower().to_numpy()
    return df
def add_impossible_data(df, frac=0.01):
    """Subtract 400 from columns 10 and 11 for random fractions of rows.

    ``frac / 2`` of the rows get column 10 rewritten as ``'<number> cm'`` and
    an independent ``frac / 4`` of the rows get column 11 rewritten as
    ``'<number> kg'``.  Both columns are assumed to hold strings such as
    ``'185 cm'`` / ``'72 kg'``.  The frame is modified in place and also
    returned.

    Args:
        df: DataFrame with at least 12 columns.
        frac: Base fraction; halved for column 10 and quartered for column 11.

    Returns:
        The same ``df`` object.
    """
    for column, divisor, unit in ((10, 2, ' cm'), (11, 4, ' kg')):
        # Sample row *positions*: ``df.sample(...).index`` yields labels,
        # which are only valid for ``iloc`` with a default RangeIndex.
        rows = pd.Series(range(len(df))).sample(frac=frac / divisor).to_numpy()
        # Parse the leading integer; missing or unparseable cells become NaN.
        digits = df.iloc[rows, column].str.extract(r'^\s*(-?\d+)', expand=False)
        numbers = pd.to_numeric(digits, errors='coerce')
        usable = numbers.notna().to_numpy()
        if not usable.any():
            continue
        # Skip missing cells (they would otherwise become 'nan cm'/'nan kg')
        # and format as integers so no '-215.0 cm' is produced.
        updated = (numbers[usable].astype(int) - 400).astype(str) + unit
        df.iloc[rows[usable], column] = updated.to_numpy()
    return df
def perform_modifs(df):
    """Run every data-corrupting step on ``df`` and return the result.

    The steps run in a fixed order: tall, short, heavy and light outliers
    first, then NaN rows, then case changes, then the -400 "impossible"
    values.  Each step receives the frame returned by the previous one.
    """
    pipeline = (
        add_tall_people,
        add_short_people,
        add_heavy_people,
        add_light_people,
        add_nan,
        change_case,
        add_impossible_data,
    )
    for step in pipeline:
        df = step(df)
    return df
# Load the clean dataset, corrupt it, and write the messy copy next to it
# (without the index column).
data = perform_modifs(pd.read_csv('OGFullData.csv'))
data.to_csv('FullData.csv', index=False)