-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlemmatize.py
80 lines (63 loc) · 2.47 KB
/
lemmatize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
##################################
## Author: Pranav ################
##################################
from nltk.corpus import wordnet as wn
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag
from collections import Counter
import pickle
# load the frequency dictionary
# Maps a lowercase word -> corpus frequency; used by nounify() to rank
# candidate derived forms.
# NOTE(review): pickle.load executes arbitrary code if the file is untrusted —
# assumes freqdict.pkl is a trusted local artifact.
with open('freqdict.pkl', 'rb') as freqdict:
    frequency_dict = pickle.load(freqdict)
# Single-letter WordNet part-of-speech codes as used by nltk's wordnet API.
WN_NOUN = 'n'
WN_VERB = 'v'
WN_ADJECTIVE = 'a'
WN_ADJECTIVE_SATELLITE = 's'  # "satellite" adjective; treated as equivalent to 'a'
WN_ADVERB = 'r'
# Shared lemmatizer instance reused by nounify().
wordnet_lemmatizer = WordNetLemmatizer()
def nounify(word, from_pos, to_pos = "n"):
    """Convert *word* from one part of speech to another via WordNet.

    Looks up derivationally related forms of *word* (interpreted with POS
    ``from_pos``) and returns the related form with POS ``to_pos`` that has
    the highest count in the module-level ``frequency_dict``.

    Parameters:
        word: the surface form to convert.
        from_pos: single-letter WordNet POS of *word* ('n', 'v', 'a', 's', 'r').
        to_pos: single-letter WordNet POS of the desired form (default 'n').

    Returns:
        The most frequent related form (lowercased), or None when the word is
        unknown to WordNet or no related form of the requested POS exists.
    """
    # Convert to the lemma (e.g. singular form) before the WordNet lookup.
    word = wordnet_lemmatizer.lemmatize(word)
    synsets = wn.synsets(word, pos=from_pos)
    if not synsets:
        # Word not found in WordNet.
        return None

    def _pos_matches(pos, wanted):
        # WordNet treats adjectives ('a') and satellite adjectives ('s')
        # as interchangeable for this purpose.
        adj = (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE)
        return pos == wanted or (wanted in adj and pos in adj)

    # Collect lemmas of the word whose synset POS matches from_pos.
    lemmas = [l for s in synsets
              if _pos_matches(s.name().split('.')[1], from_pos)
              for l in s.lemmas()]
    # Gather derivationally related forms, keeping only those with to_pos.
    related = [drf for l in lemmas
               for drf in l.derivationally_related_forms()
               if _pos_matches(drf.synset().name().split('.')[1], to_pos)]
    # Lowercase, drop the input word itself, and de-duplicate.
    candidates = {x.lower() for x in (l.name() for l in related)
                  if x.lower() != word.lower()}
    # Rank candidates by corpus frequency.
    # BUG FIX: use .get(x, 0) — the original frequency_dict[x] raised
    # KeyError for any candidate absent from the frequency dictionary.
    scored = [(x, frequency_dict.get(x, 0)) for x in candidates]
    scored.sort(key=lambda w: -w[1])
    if scored and scored[0][0]:
        return scored[0][0]
    return None
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to its WordNet POS constant.

    Returns ``wn.ADJ`` / ``wn.VERB`` / ``wn.NOUN`` / ``wn.ADV`` depending on
    the tag's leading letter, or ``None`` when the tag has no WordNet
    counterpart (handy for a simple truthiness check at the call site).
    """
    # Only the first letter of a treebank tag decides the coarse category
    # (e.g. 'JJ', 'JJR', 'JJS' are all adjectives).
    first_letter_to_pos = {
        'J': wn.ADJ,
        'V': wn.VERB,
        'N': wn.NOUN,
        'R': wn.ADV,
    }
    return first_letter_to_pos.get(treebank_tag[:1])