-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplitting.py
51 lines (38 loc) · 1.04 KB
/
splitting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
'''
Separating clauses that define Power of Director
'''
def powerDir(txtfile):
    """Cluster the 'Director' clauses of a text file into two output files.

    The file is read in full and split into clauses wherever a period is
    followed by a blank line. Clauses containing the case-sensitive
    substring 'Director' are turned into TF-IDF vectors (lowercased,
    English stop words removed) and partitioned into two clusters with
    K-means. Clauses in the cluster labelled 1 are appended to
    "notpower.txt"; all other clauses are appended to "power.txt".

    Args:
        txtfile: Path of the text file to read.

    Raises:
        ValueError: If fewer than two clauses contain 'Director', since
            two clusters cannot be formed from fewer than two samples.
        OSError: If the input or output files cannot be opened.
    """
    # The context manager closes the input handle even if reading fails.
    with open(txtfile, "r") as source:
        clauses = source.read().split(".\n\n")

    # Keep only the clauses that mention a Director (case-sensitive match).
    director_clauses = [clause for clause in clauses if 'Director' in clause]
    if len(director_clauses) < 2:
        # Fail early with a clear message instead of an opaque error from
        # the vectorizer / K-means further down; nothing is written.
        raise ValueError(
            "need at least two clauses containing 'Director' to form two "
            "clusters, found %d" % len(director_clauses)
        )

    # Remove stop words and build the TF-IDF matrix.
    vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, stop_words='english')
    features = vectorizer.fit_transform(director_clauses)

    # NOTE(review): no random_state is passed, so which cluster receives
    # label 1 may differ between runs -- confirm the label-to-file mapping
    # below is what is intended.
    model = KMeans(n_clusters=2, init='k-means++', max_iter=100).fit(features)
    labels = model.labels_

    # Both outputs are opened in append mode, so repeated calls accumulate.
    # NOTE(review): split() dropped the ".\n\n" separator and write() adds
    # none, so clauses end up back-to-back in the output files.
    with open("notpower.txt", "a") as not_power_out, \
            open("power.txt", "a") as power_out:
        for clause, label in zip(director_clauses, labels):
            if label == 1:
                not_power_out.write(clause)
            else:
                power_out.write(clause)