-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_precleaner.py
108 lines (91 loc) · 4.5 KB
/
text_precleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import re
def _preclean_text(raw_text, do_x):
    """Return *raw_text* normalised into ``<start>…<end>`` records.

    The substitutions below are order-dependent and are applied exactly in
    the listed sequence:

    * the text is lower-cased and the listed punctuation characters removed;
    * on every line containing a digit, everything up to and including the
      last digit is dropped; lines containing ``• canto`` are emptied;
    * runs of newlines, spaces following a newline, and runs of spaces are
      collapsed;
    * ``|`` is removed when *do_x* is truthy and replaced by ``<syl>``
      otherwise (presumably ``|`` marks syllable boundaries in the source
      text -- confirm against the corpus);
    * spaces become ``<s>``, every line is prefixed with ``<start>`` and
      every newline is preceded by ``<end>``.

    NOTE: earlier experiments (diacritic folding, k/j/x/y transliteration,
    apostrophe re-joining) were disabled in the original script and are not
    applied here.
    NOTE(review): when the text ends with a newline, the `` $`` step turns
    the trailing space into a second newline, so the output ends with an
    empty ``<start><end>`` record -- confirm whether downstream expects it.
    """
    text = re.sub(r'([,])|([.])|([;])|([:])|([!])|([?])|'
                  r'([-])|([«])|([»])|(["])|([“])|([‟])|'
                  r'([”])|(\()|(\))|(\[)|(\])|(—)',
                  "", raw_text.lower())

    steps = (
        (r' ’ ', ' i '),
        (r'.*\d', ''),
        (r'.*• canto.*', ''),
        # Blank-line collapsing is done with a fixed number of passes; the
        # repetition is intentional and kept as in the original pipeline.
        (r'\n\n\n', '\n'),
        (r'\n\n\n', '\n'),
        (r'\n\n', '\n'),
        (r'\n\n', '\n'),
        # Strip up to four leading spaces per line, one per pass.
        (r'\n ', '\n'),
        (r'\n ', '\n'),
        (r'\n ', '\n'),
        (r'\n ', '\n'),
        (r'^\n', ''),
        (r'\n', ' \n'),
        (r'[ ]+', r' '),
        (r' $', r'\n'),
        # The only step that differs between the X and y outputs.
        (r'\|', r'' if do_x else r'<syl>'),
        (r' \n', r'\n'),
        (r' ', r'<s>'),
        (r'^', r'<start>'),
        (r'\n', r'\n<start>'),
        (r'<start>$', r''),
        (r'\n', r'<end>\n'),
    )
    for pattern, replacement in steps:
        text = re.sub(pattern, replacement, text)
    return text


def main(do_x):
    """Pre-clean ``resources/orig/full_divina.txt`` and write the result.

    :param do_x: when truthy the output goes to ``resources/X.csv`` and the
        ``|`` markers are removed; otherwise the output goes to
        ``resources/y.csv`` and each ``|`` becomes ``<syl>``.
    :raises OSError: if the source file cannot be read or the destination
        cannot be written.
    """
    # Read-only access is enough for the source; the context manager
    # guarantees the handle is released even if reading fails.
    with open('resources/orig/full_divina.txt', 'r', encoding='utf-8') as file_orig:
        raw_text = file_orig.read()

    # Process before opening the destination so that a failure in the
    # pipeline does not leave a truncated output file behind.
    cleaned = _preclean_text(raw_text, do_x)

    dest_path = 'resources/X.csv' if do_x else 'resources/y.csv'
    with open(dest_path, 'w', encoding='utf-8') as file_dest:
        file_dest.write(cleaned)
if __name__ == '__main__':
    # Write resources/y.csv first (do_x=False), then resources/X.csv
    # (do_x=True).
    for build_x in (False, True):
        main(do_x=build_x)
def sub_cleaner(raw_line):
    """Clean a single line of text and wrap it in ``<start>``/``<end>``.

    The line is lower-cased and the listed punctuation characters are
    removed. Everything up to and including the last digit is dropped, and
    a line containing ``• canto`` is emptied. Runs of spaces are collapsed,
    ``|`` markers are removed and the remaining spaces become ``<s>``.

    A non-empty result is prefixed with ``<start>``; ``<end>`` is always
    appended, so an empty input yields just ``<end>``. A trailing space is
    turned into a newline, which therefore ends up before ``<end>``.

    :param raw_line: the text to clean.
    :return: the cleaned, tagged string.
    """
    cleaned = re.sub(r'([,])|([.])|([;])|([:])|([!])|([?])|'
                     r'([-])|([«])|([»])|(["])|([“])|([‟])|'
                     r'([”])|(\()|(\))|(\[)|(\])|(—)',
                     "", raw_line.lower())

    # Order matters: each rule operates on the output of the previous one.
    rules = (
        (r' ’ ', ' i '),
        (r'.*\d', ''),
        (r'.*• canto.*', ''),
        (r'[ ]+', r' '),
        (r' $', r'\n'),
        (r'\|', r''),
        (r' ', r'<s>'),
        (r'^', r'<start>'),
        # Undo the prefix when nothing else is left on the line.
        (r'<start>$', r''),
    )
    for pattern, replacement in rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned + '<end>'