forked from koskenni/pytwolc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtwexamp.py
121 lines (108 loc) · 4.68 KB
/
twexamp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""A module for reading two-level examples
The examples are assumed to be given in a space-separated one-level
representation, and they are compiled into a single automaton.
At the same time, the alphabet used in the examples is
collected in several forms.
cfg.examples_fst -- the transducer which accepts exactly the examples
cfg.symbol_pair_set -- a set of (input, output) string pairs, suitable for e.g.
hfst.rules.restriction
"""
import re
import cfg
import twbt
def read_fst(filename="examples.fst"):
    """Read a previously stored example FST and rebuild the alphabet sets.

    The transducer read from the file is stored in ``cfg.examples_fst``.
    Its ``x-pair_symbols`` property (a space-separated list of pair
    symbols) is used to fill ``cfg.pair_symbol_set``,
    ``cfg.symbol_pair_set``, ``cfg.input_symbol_set`` and
    ``cfg.output_symbol_set``.  Finally ``cfg.all_pairs_fst`` is built as
    the union of all symbol pairs found in ``cfg.symbol_pair_set``.

    filename -- name of the file which contains the stored example FST
    """
    import hfst

    exfile = hfst.HfstInputStream(filename)
    try:
        cfg.examples_fst = exfile.read()
    finally:
        # Release the underlying file even when reading fails.
        exfile.close()

    pair_symbols = cfg.examples_fst.get_property("x-pair_symbols")
    # An empty property, or one padded with spaces, would otherwise yield
    # empty tokens which are not valid pair symbols.
    pair_symbol_lst = [pair for pair in re.split(r" +", pair_symbols) if pair]
    for pair in pair_symbol_lst:
        cfg.pair_symbol_set.add(pair)
        insym, outsym = cfg.pairsym2sympair(pair)
        cfg.symbol_pair_set.add((insym, outsym))
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)

    cfg.all_pairs_fst = hfst.empty_fst()
    for insym, outsym in cfg.symbol_pair_set:
        # Curly braces are prefixed with '%' before the symbol is handed to
        # hfst.regex.  NOTE(review): other special characters in insym and
        # any in outsym are passed through unquoted and may still break
        # the regex compilation.
        in_quoted = re.sub(r"([{}])", r"%\1", insym)
        pair_fst = hfst.regex(in_quoted + ':' + outsym)
        cfg.all_pairs_fst.disjunct(pair_fst)
    cfg.all_pairs_fst.remove_epsilons()
    cfg.all_pairs_fst.minimize()

    if cfg.verbosity >= 30:
        twbt.ppfst(cfg.all_pairs_fst, title="cfg.all_pairs_fst")
    return
def read_examples(filename="test.pstr", build_fsts=True):
    """Read the examples from the file whose name is 'filename'.

    The file must contain one example per line and each line consists of
    a whitespace-separated sequence of pair symbols.  Empty lines and
    lines starting with "!" are skipped.

    The examples are processed into:

    cfg.example_lst -- list of the examples as normalized pair-symbol strings
    cfg.example_set -- the same strings as a set
    cfg.symbol_pair_set -- all (insym, outsym) pairs occurring in the examples
    cfg.input_symbol_set, cfg.output_symbol_set -- the two sides of the pairs
    cfg.pair_symbol_set -- the pairs as pair-symbol strings
    cfg.examples_fst -- FST built from the examples (only if build_fsts)

    filename -- name of the example file
    build_fsts -- if true, hfst is imported and cfg.examples_fst is built;
        its "x-pair_symbols" property is set to the sorted, space-separated
        list of the symbol pairs
    """
    if build_fsts:
        import hfst
        examples_bfst = hfst.HfstBasicTransducer()

    # The context manager closes the file even if a line fails to parse.
    with open(filename, "r") as exfile:
        for line_nl in exfile:
            line = line_nl.strip()
            if not line or line.startswith("!"):
                continue
            pairsym_lst = re.split(r"\s+", line)
            symbol_pair_lst = [cfg.pairsym2sympair(pairsym)
                               for pairsym in pairsym_lst]
            # Converting back and forth normalizes both the spacing and the
            # form of the pair symbols.
            pair_symbol_str = " ".join([cfg.sympair2pairsym(insym, outsym)
                                        for insym, outsym
                                        in symbol_pair_lst])
            cfg.example_lst.append(pair_symbol_str)
            cfg.example_set.add(pair_symbol_str)
            if build_fsts:
                examples_bfst.disjunct(symbol_pair_lst, 0)
            for insym, outsym in symbol_pair_lst:
                cfg.symbol_pair_set.add((insym, outsym))

    if cfg.verbosity >= 30:
        print("List of examples:", cfg.example_lst)
        print("List of alphabet symbol pairs:", sorted(cfg.symbol_pair_set))

    if build_fsts:
        cfg.examples_fst = hfst.HfstTransducer(examples_bfst)
        cfg.examples_fst.set_name(filename)
        cfg.examples_fst.minimize()
        if cfg.verbosity >= 30:
            twbt.ppfst(cfg.examples_fst, False, title="Example file as FST")

    # Derive the remaining alphabet views from the collected symbol pairs.
    for insym, outsym in cfg.symbol_pair_set:
        cfg.input_symbol_set.add(insym)
        cfg.output_symbol_set.add(outsym)
        cfg.pair_symbol_set.add(cfg.sympair2pairsym(insym, outsym))

    if build_fsts:
        # Store the alphabet inside the FST so that read_fst() can restore
        # it later without access to the original example file.
        pair_symbol_lst = [insym + ':' + outsym
                           for insym, outsym in cfg.symbol_pair_set]
        pair_symbol_str = " ".join(sorted(pair_symbol_lst))
        cfg.examples_fst.set_property("x-pair_symbols", pair_symbol_str)
    return
if __name__ == "__main__":
    # Command line use: compile an example file and optionally store the
    # resulting example FST in a file.
    import hfst
    import argparse
    arpar = argparse.ArgumentParser("python3 twexamp.py")
    # nargs="?" makes the positional arguments optional.  Without it
    # argparse requires both of them and silently ignores the defaults,
    # so the test for an empty output name below could never be false.
    arpar.add_argument("examples", nargs="?",
                       help="example pair strings file",
                       default="examples.pstr")
    arpar.add_argument("output", nargs="?",
                       help="file to which write the example FST",
                       default="")
    arpar.add_argument("-v", "--verbosity",
                       help="level of diagnostic output",
                       type=int, default=0)
    args = arpar.parse_args()
    cfg.verbosity = args.verbosity
    read_examples(args.examples, build_fsts=True)
    # The FST is written only when an output file name was given.
    if args.output:
        exfile = hfst.HfstOutputStream(filename=args.output)
        exfile.write(cfg.examples_fst)
        exfile.flush()
        exfile.close()
        print("--- example fst written to ", args.output ," ---")