-
Notifications
You must be signed in to change notification settings - Fork 0
/
uroman.py
42 lines (32 loc) · 1.09 KB
/
uroman.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/env python3
import codecs
import os
import os.path
import subprocess

from tqdm import tqdm
class Uroman:
    """Thin wrapper that romanizes text by running the ``uroman.pl`` script.

    Sentences are written to a temporary file, piped through ``uroman.pl``,
    and the script's output is read back one line per sentence.
    """

    # Directory containing ``uroman.pl``. Kept as a class attribute so that
    # ``Uroman()`` with no arguments behaves as before; pass ``uroman_path``
    # to the constructor to point at a different installation.
    uroman_path = "/mounts/Users/student/yihong/Documents/concept_align/uroman/bin/"

    def __init__(self, uroman_path=None):
        """Create a romanizer.

        Args:
            uroman_path: Optional directory containing ``uroman.pl``. When
                omitted, the class-level default path is used.
        """
        if uroman_path is not None:
            self.uroman_path = uroman_path

    def romanize(self, sentences, temp_path='./temp', lang=None):
        """Romanize ``sentences`` with ``uroman.pl``.

        Args:
            sentences: Iterable of strings. Each one is written on its own
                line, so a sentence containing a newline produces more than
                one output entry.
            temp_path: Directory for the intermediate input/output files. It
                is created if missing; the two files are removed afterwards.
            lang: Optional value forwarded to ``uroman.pl`` via ``-l``
                (presumably a language code -- confirm against uroman docs).

        Returns:
            A list with one whitespace-stripped string per line of output
            produced by ``uroman.pl``. Empty input returns ``[]`` without
            invoking the script.

        Raises:
            OSError: If ``uroman.pl`` cannot be found/executed or a
                temporary file cannot be accessed.
            subprocess.CalledProcessError: If ``uroman.pl`` exits with a
                non-zero status.
        """
        # Materialize once so generators work and emptiness can be tested.
        sentences = list(sentences)
        if not sentences:
            return []

        in_path = os.path.join(temp_path, 'sentence.txt')
        out_path = os.path.join(temp_path, 'sentence_roman.txt')
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(temp_path, exist_ok=True)

        # Argument list (no shell): paths or language values containing
        # spaces or shell metacharacters can neither break nor inject
        # into the command.
        command = [os.path.join(self.uroman_path, "uroman.pl")]
        if lang is not None:
            command += ["-l", str(lang)]

        try:
            # newline="\n" keeps the bytes identical on every platform.
            with open(in_path, "w", encoding="utf-8", newline="\n") as in_file:
                in_file.writelines(sentence + "\n" for sentence in sentences)

            with open(in_path, "rb") as script_in, open(out_path, "wb") as script_out:
                # check=True surfaces a failed run instead of silently
                # returning an empty or truncated result.
                subprocess.run(command, stdin=script_in, stdout=script_out, check=True)

            print("Transliteration starts ...")
            with open(out_path, "r", encoding="utf-8") as out_file:
                # readlines() gives tqdm a known total for the progress bar.
                return [line.strip() for line in tqdm(out_file.readlines())]
        finally:
            # Always remove the intermediate files, even on failure.
            for path in (in_path, out_path):
                try:
                    os.remove(path)
                except FileNotFoundError:
                    # The file was never created (failure happened earlier).
                    pass