-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdgmDict-pvlists.py
183 lines (171 loc) · 7.76 KB
/
pdgmDict-pvlists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/local/bin/python3
'''
Makes two 'list' files and two 'dict' files, and associated mdb files
out of the termcluster's 'common' sections :
pdgm-values: for common value display in list boxes. Uses prop-
list given in pdgmPropOrder if given, otherwise
'default'.
pdgm-props: gives common prop:val pairs, plus '%' and pdgm-prop vars
pdgm-propvals: maps pdgm-values strings onto pdgm-props
(plus '%' + prop-vars if NOT number-person-gender-token)
pdgmdb: db file associated with pdgm-propvals
pdgm-label: gives propval list associated with each pdgm label
labldb: db file associated with pdgm-label
pdgmdb: db file for mapping of val-string to propval string,
'''
import json
import shelve
import sys
# Selector properties of the default number/person/gender paradigm.  A
# termcluster whose selector row uses only these gets no '%...%' suffix.
PNG_SELECTORS = frozenset(
    ('number', 'person', 'gender', 'token', 'token-note'))


def display_value(prop, value):
    """Return the list-box display string for one common prop/value pair.

    A 'morphClass' value is written with a 'morph' prefix; every other
    value is written as-is.
    """
    if prop == 'morphClass':
        return 'morph' + value
    return str(value)


def common_lists(lang, common, tcprops):
    """Build the value list and prop:val list for one 'common' section.

    lang    -- language name; always contributes a 'language:<lang>' entry
               to the prop:val list.
    common  -- the termcluster's 'common' dict (prop -> value).
    tcprops -- ordered property list read from 'pdgmPropOrder'.  Unless it
               is exactly ["default"], it is EXTENDED IN PLACE with any
               prop of `common` it does not yet contain, so the addition
               also applies to later termclusters of the same language.

    Returns (pdgmvals, pdgmpropvals).
    """
    # Check that all props in common are covered by tcprops.
    # If not, add them at the end.
    if tcprops != ["default"]:
        for prop in common:
            if prop not in tcprops:
                tcprops.append(prop)
                print('NEW TCPROP!: ' + str(prop))

    pdgmvals = []
    # NEED TO DECIDE IF FOLLOWING IS NECESSARY [101822]
    # If so have to change conventions about '-'
    # Initialize pdgmpropvals list with language.
    # Could also add lang to pdgmvals, but not clear
    # at this point that that would be necessary.
    pdgmpropvals = ["language:" + lang]

    if tcprops != ["default"]:
        # Non-default option: simply put (prop:)value in tcprops order.
        for prop in tcprops:
            if prop in common:
                value = common[prop]
                pdgmvals.append(display_value(prop, value))
                pdgmpropvals.append(prop + ":" + value)
    else:
        # Default option: keep the order of the 'common' section, but put
        # pos at the head and lexeme at the tail.  Note that pos is
        # inserted at index 0, i.e. ahead of the 'language:' entry.
        lexval = ''
        for prop, value in common.items():
            if prop == 'pos':
                pdgmvals.insert(0, str(value))
                pdgmpropvals.insert(0, prop + ":" + value)
            elif prop == 'lexeme':
                lexval = str(value)
            else:
                pdgmvals.append(display_value(prop, value))
                pdgmpropvals.append(prop + ":" + value)
        if lexval:
            pdgmvals.append(lexval)
            pdgmpropvals.append('lexeme:' + lexval)
    return pdgmvals, pdgmpropvals


def selector_suffix(sel):
    """Return '%p1,p2,...%' for a non-default selector row, else ''.

    sel is row 0 of a termcluster's 'terms'.  The suffix is empty when
    every entry of sel is one of PNG_SELECTORS.
    """
    if set(sel) <= PNG_SELECTORS:
        return ''
    return "%" + ",".join(sel) + "%"


def pdgm_strings(lang, termcluster, tcprops):
    """Return (pvalstring, ppvstring) for one termcluster.

    pvalstring is the comma-joined value list, ppvstring the comma-joined
    prop:val list; both carry the same selector suffix (only non-default
    selectors are marked with '%').  tcprops may be extended in place, see
    common_lists().
    """
    pdgmvals, pdgmpropvals = common_lists(
        lang, termcluster['common'], tcprops)
    # read sel from row-0 of 'terms'
    selprops = selector_suffix(termcluster['terms'][0])
    pvalstring = ','.join(pdgmvals) + selprops
    ppvstring = ','.join(pdgmpropvals) + selprops
    return pvalstring, ppvstring


def write_pvlists(lang):
    """Generate the pdgm-values list and the two db files for one language.

    Reads  ../aama-data/data/<lang>/<lang>-pdgms.json
    Writes pvlists/<lang>-pdgm-values.txt  (sorted value strings, one/line)
           pvlists/<lang>-pdgmdb           (shelve: value string -> prop:vals)
           pvlists/<lang>-labldb           (shelve: pdgm label -> prop:vals)

    Have decided to put all pdgm labels with full pdgm info in
    central db file. Other options:
    1) make label->pdgm dict for each language
    2) put label->pdgm in SQL database
    """
    print('LANG: ' + lang)
    lfile = '../aama-data/data/' + lang + '/' + lang + '-pdgms.json'
    # Close the input promptly and decode as UTF-8 regardless of locale.
    with open(lfile, encoding='utf-8') as infile:
        jdata = json.load(infile)

    outfile1 = 'pvlists/' + lang + '-pdgm-values.txt'
    mdbfile = 'pvlists/' + lang + '-pdgmdb'
    ldbfile = 'pvlists/' + lang + '-labldb'

    pvallist = []
    # 'with' guarantees both shelves are closed (and flushed) even if a
    # malformed termcluster raises part-way through.
    with shelve.open(mdbfile) as pdgmdb, shelve.open(ldbfile) as labldb:
        # 'tcprops' = ordered list of all properties, formal +
        # morphosyntactic, occurring in termcluster.common section.
        tcprops = jdata['pdgmPropOrder']
        print('tcprops:')
        print(str(tcprops))
        termclusters = jdata['termclusters']
        print('tccount:' + str(len(termclusters)))
        for termcluster in termclusters:
            plabel = termcluster['label']
            pvalstring, ppvstring = pdgm_strings(lang, termcluster, tcprops)
            pvallist.append(pvalstring)
            pdgmdb[pvalstring] = ppvstring
            labldb[plabel] = ppvstring

    # write file with sorted pvals
    with open(outfile1, "w", encoding='utf-8') as outfile:
        outfile.write('\n'.join(sorted(pvallist)))


if __name__ == "__main__":
    # For CL argument
    if len(sys.argv) < 2:
        sys.exit('usage: ' + sys.argv[0] + ' LANGUAGE')
    language = sys.argv[1]
    # For single lang:
    #language = input('Type language name: ')
    languagenames = (language, )
    # For corpus:
    #languagenames = ('aari', 'afar', 'alaaba', 'alagwa', 'akkadian-ob', 'arabic', 'arbore', 'awngi', 'bayso', 'beja-alm', 'beja-hud', 'beja-rei', 'beja-rop', 'beja-van', 'beja-wed', 'berber-ghadames', 'bilin', 'boni-jara', 'boni-kijee-bala', 'boni-kilii', 'burji', 'burunge', 'coptic-sahidic', 'dahalo', 'dhaasanac', 'dizi', 'egyptian-middle', 'elmolo', 'gawwada', 'gedeo', 'geez', 'hadiyya', 'hausa', 'hdi', 'hebrew', 'iraqw', 'kambaata', 'kemant', 'khamtanga', 'koorete', 'maale', 'mubi', 'oromo', 'rendille', 'saho', 'shinassha', 'sidaama', 'somali', 'syriac', 'tsamakko', 'wolaytta', 'yaaku', 'yemsa')
    for lang in languagenames:
        write_pvlists(lang)
'''
Edit this file to eliminate vars that only contribute to outfiles 2,3,4
#print(str("pvals: " + lang))
#print(str(pvals))
#print(str("pdgmdict: " + lang))
#print(str(pdgmdict))
# pdgm-props
file = open(outfile2, "w")
file.write(str(pprops))
file.close
# pdgm-propvals
file = open(outfile3, "w")
file.write(str(pdgmdict))
file.close()
# pdgm labels
file = open(outfile4, "w")
file.write(str(pdgmlabels))
file.close
NOTE: the script which generates these files is script-bck/pdgmDict-newlists.py
'''