TextPreProcessing.py
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer

try:
    import json
except ImportError:
    import simplejson as json

# Fetch the NLTK resources the script relies on.
nltk.download('stopwords')
nltk.download('punkt')

jsonFile = "data/DavidsPruned.json"
with open(jsonFile) as infile:
    data = json.load(infile)

tokenizer = RegexpTokenizer(r'\w+')  # word characters only; punctuation is dropped
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()


def preprocess(text):
    """Tokenize, drop English stop words, and Porter-stem what remains."""
    tokens = tokenizer.tokenize(text)
    # The stop-word list is all lowercase, so compare case-insensitively;
    # PorterStemmer.stem() lowercases its output by default.
    return " ".join(porter.stem(w) for w in tokens if w.lower() not in stop_words)


# Stop-word removal and stemming for every text field of every record.
copy = []
for d in data:
    d['tweet'] = preprocess(d['tweet'])
    d['name'] = preprocess(d['name'])
    d['description'] = preprocess(d['description'])
    d['screen_name'] = preprocess(d['screen_name'])
    copy.append(d)

print(len(copy))  # number of records processed

with open('data/new_twitter_gender_data.json', 'w') as output1:
    json.dump(copy, output1)
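
# --- Usage sketch (illustrative assumptions, not part of the original script) ---
# The input data/DavidsPruned.json is assumed to be a list of records, each
# carrying the four string fields handled above, e.g.:
#
#   [{"tweet": "running with the dogs", "name": "Jane Doe",
#     "description": "runner and reader", "screen_name": "janedoe42"}]
#
# A quick way to eyeball the output after a run (hypothetical check):
#
#   with open('data/new_twitter_gender_data.json') as check:
#       print(json.load(check)[0]['tweet'])  # -> "run dog": stop words
#                                            #    removed, remaining tokens stemmed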