-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
issue #72 including Python module for the extraction of noun and adjective bases
- Loading branch information
Showing
2 changed files
with
414 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
#! /usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
|
||
# Author: Leonel Figueiredo de Alencar | ||
# leonel.de.alencar@ufc.br | ||
# Date: April 20, 2018, updated February 18, 2020 | ||
|
||
"""This module is the first component of the architecture of a generator of Portuguese diminutives. It extracts possible diminutive formation bases from existing nouns and adjectives encoded in MorphoBr's format, as described in the following paper: | ||
ALENCAR, Leonel Figueiredo de; CUCONATO , Bruno; RADEMAKER, Alexandre. MorphoBr: an open source large-coverage full-form lexicon for morphological analysis of Portuguese. Texto Livre: Linguagem e Tecnologia, Belo Horizonte, v. 11, n. 3, p. 1-25, set.- dez. 2018. | ||
ISSN 1983-3652 | ||
DOI: 10.17851/1983-3652.11.3.1-25 | ||
http://www.periodicos.letras.ufmg.br/index.php/textolivre/article/view/14294. | ||
Unplausible bases are filtered out, see details below. The extracted bases are converted to spaced-text format and written to different files, according to the classification expected by the finite-state grammar in the morphotactic-grammar.lexc file. Some examples may help clarify the pipeline: | ||
Input in MorphoBr' format: | ||
agulhão agulha+N+AUG+M+SG | ||
agulhões agulha+N+AUG+M+PL | ||
agulhona agulha+N+AUG+F+SG | ||
agulhonas agulha+N+AUG+F+PL | ||
Output generated by this module (written to different files): | ||
a g u l h a +N +AUG | ||
a g u l h ã o | ||
a g u l h a +N +AUG | ||
a g u l h õ e s | ||
a g u l h a +N +AUG | ||
a g u l h o n a | ||
a g u l h a +N +AUG | ||
a g u l h o n a s | ||
""" | ||
import os, sys, re | ||
|
||
# Entries carrying these tags are skipped: diminutive and superlative forms
# cannot themselves serve as bases for diminutive formation.
EXCLUDE_TAGS=["+DIM","+SUPER"]
# Extension of the spaced-text output files consumed by the lexc grammar.
EXTENSION=".stxt"

"""Regex pattern matching itens that can not function as bases for
morphological derivations. This includes one or more consonants before a space
at the beginning of a line, for example:
b b+N+M+SG
c c+N+M+SG
d d+N+M+SG
These itens are in fact abbreviations. As such, they cannot feed diminutive formation,
e.g. *bzinho 'little b' is ungrammatical (the correct form is 'bezinho', from 'bê', the name of
letter b).
The regex pattern also matches abbreviations such as 'ha' (for hectare) and chemical symbols ('Ba', 'Ca', etc.).
"""
# Character class covering the consonant letters (matched case-insensitively below).
CONS="[bcdfghjklmnpqrstvwxyz]"
# Matches, at the start of an entry, a word made up only of consonants, or a
# single consonant followed by a/e/o, delimited by the whitespace that
# separates the word-form field from the parse field.
ABB=re.compile(r"(?i)(%s{1,}|%s[aeo])\s" % (CONS,CONS))
|
||
# Output streams, one per base class expected by the finite-state grammar in
# morphotactic-grammar.lexc. All handles are opened at import time in text
# mode and closed in main().
# NOTE(review): opening in "w" truncates any previous output as a side effect
# of importing this module — confirm this is intended before reusing it as a
# library.

# Augmentative forms (e.g. 'agulhão'), split by gender and number.
aug_m_sg = open("aug_m_sg%s" % EXTENSION,"w")
aug_m_pl = open("aug_m_pl%s" % EXTENSION,"w")
aug_f_sg = open("aug_f_sg%s" % EXTENSION,"w")
aug_f_pl = open("aug_f_pl%s" % EXTENSION,"w")

# Entries whose word form equals the lemma and ends in 's' (see WordLemmaInS).
wdlm_in_s_m_sg = open("wdlm_in_s_m_sg%s" % EXTENSION,"w")
wdlm_in_s_m_pl = open("wdlm_in_s_m_pl%s" % EXTENSION,"w")
wdlm_in_s_f_sg = open("wdlm_in_s_f_sg%s" % EXTENSION,"w")
wdlm_in_s_f_pl = open("wdlm_in_s_f_pl%s" % EXTENSION,"w")

# Forms with a non-canonical gender marker (masculine in -a, feminine in -o;
# see NonCanonGendMarker).
masc_in_a_sg = open("masc_in_a_sg%s" % EXTENSION,"w")
fem_in_o_sg = open("fem_in_o_sg%s" % EXTENSION,"w")
masc_in_a_pl = open("masc_in_a_pl%s" % EXTENSION,"w")
fem_in_o_pl = open("fem_in_o_pl%s" % EXTENSION,"w")

# Everything else, split by gender and number.
other_m_sg = open("other_m_sg%s" % EXTENSION,"w")
other_m_pl = open("other_m_pl%s" % EXTENSION,"w")
other_f_sg = open("other_f_sg%s" % EXTENSION,"w")
other_f_pl = open("other_f_pl%s" % EXTENSION,"w")
|
||
def extract_entries(infile):
    """Read a MorphoBr lexicon file and return its relevant entries.

    Each line holds one entry ('word-form<whitespace>lemma+TAGS'). Lines
    that are empty, carry a tag from EXCLUDE_TAGS, or look like
    abbreviations are filtered out by ignore_entry(). Kept entries are
    stripped and decoded to unicode (this module targets Python 2).

    Fixes over the original: the file handle is closed deterministically
    via a context manager (it was previously left to the garbage
    collector), the file is streamed line by line instead of materialized
    with readlines(), and strip() is computed once per line instead of
    twice.
    """
    entries = []
    with open(infile, "rU") as stream:
        for line in stream:
            entry = line.strip()
            if ignore_entry(entry):
                entries.append(entry.decode("utf-8"))
    return entries
|
||
def split_entry(entry):
    """Return the whitespace-separated fields of *entry* as a list.

    For a well-formed entry this yields [word_form, parse].
    """
    whitespace = re.compile(r"\s+")
    return whitespace.split(entry)
|
||
def exclude_abbr(entry):
    """Return True if the entry's word form is an abbreviation.

    Abbreviations (all-consonant words, or consonant + a/e/o, per the
    module-level ABB pattern) cannot feed diminutive formation.
    """
    return ABB.match(entry) is not None
|
||
def exclude_tag(entry):
    """Return True if the entry carries any tag from EXCLUDE_TAGS."""
    return any(tag in entry for tag in EXCLUDE_TAGS)
|
||
def ignore_entry(entry):
    """Return True when the entry should be KEPT, False when discarded.

    NOTE(review): despite its name, this is a keep-predicate — callers
    use it as a filter condition (see extract_entries).
    """
    keep = not (entry == "" or exclude_tag(entry) or exclude_abbr(entry))
    return keep
|
||
def space(word):
    """Return *word* with its characters separated by single spaces.

    This is the spaced-text (stxt) rendering, e.g. 'agulha' -> 'a g u l h a'.
    """
    return " ".join(word)
|
||
def convert_entry(word,lemma,tags):
    """Render an entry in spaced-text format.

    Produces the spaced lemma followed by the '+'-prefixed tags on one
    line, and the spaced word form on the next, e.g.
    'a g u l h a +N +AUG\na g u l h ã o'.
    """
    tag_string = "+" + " +".join(tags)
    return "%s %s\n%s" % (space(lemma), tag_string, space(word))
|
||
def parse_entry(entry):
    """Split an entry into (word form, lemma, tag string).

    The tag string is everything after the first '+' in the parse field,
    e.g. 'N+AUG+M+SG' for 'agulhão agulha+N+AUG+M+SG'.
    """
    word, parse = split_entry(entry)
    lemma, tags = parse.split("+", 1)
    return word, lemma, tags
|
||
def WordLemmaInS(word,lemma):
    """Return True when the word form equals its lemma and ends in 's'.

    (If word == lemma, the lemma necessarily shares the final 's', so a
    single endswith check suffices.)
    """
    return word == lemma and word.endswith("s")
|
||
|
||
def NonCanonGendMarker(word,tags):
    """Return True for forms whose final vowel contradicts the canonical
    gender marker (masculine in -a, feminine in -o), including hyphenated
    compounds.
    """
    hyphenated = "-" in word
    cases = (
        hyphenated and "+M+PL" in tags and word.endswith("a"),
        hyphenated and "+F+PL" in tags and word.endswith("o"),
        "+M+SG" in tags and word.endswith("a"),
        "+M+PL" in tags and word.endswith("as"),  # N-N compounds like 'aços-liga'
        "+F+SG" in tags and word.endswith("o"),
        "+F+PL" in tags and word.endswith("os"),  # N-N compounds like 'amostras-tipo'
    )
    return any(cases)
|
||
def write_entries(entries):
    """Dispatch each parsed entry to the proper spaced-text output file.

    Entries fall into four classes, tested in order: augmentatives
    (+AUG), word forms identical to their lemma ending in 's',
    non-canonical gender markers, and everything else. Within each
    class, the gender/number tags select one of four files.

    NOTE(review): `tags` is a string like 'N+AUG+M+SG', so `tags[0]` is
    its first CHARACTER, not the first tag. This yields the bare
    category tag only because the categories here are single letters
    (N, A) — a multi-letter category would be truncated; confirm before
    extending to other categories.
    """
    for entry in entries:
        word,lemma,tags=parse_entry(entry)
        if "+AUG" in tags:
            # Augmentatives: keep the first two tags (category and +AUG).
            stxt=convert_entry(word,lemma,re.split(r"\+",tags)[:2]).encode("utf-8")
            if "+M+SG" in tags:
                aug_m_sg.write("%s\n\n" % stxt)
            elif "+M+PL" in tags:
                aug_m_pl.write("%s\n\n" % stxt)
            elif "+F+SG" in tags:
                aug_f_sg.write("%s\n\n" % stxt)
            else:
                # Fallback: no M/SG tag matched, assume feminine plural.
                aug_f_pl.write("%s\n\n" % stxt)

        elif WordLemmaInS(word,lemma): # TODO: use re.split(r"\+",tags)[:-2], excluding gender and number tags,
            # but including other tags besides the category tag (this may be useful in the future)
            stxt=convert_entry(word,lemma,tags[0]).encode("utf-8")
            if "+M+SG" in tags:
                wdlm_in_s_m_sg.write("%s\n\n" % stxt)
            elif "+M+PL" in tags:
                wdlm_in_s_m_pl.write("%s\n\n" % stxt)
            elif "+F+SG" in tags:
                wdlm_in_s_f_sg.write("%s\n\n" % stxt)
            else:
                wdlm_in_s_f_pl.write("%s\n\n" % stxt)

        elif NonCanonGendMarker(word,tags): # TODO: see the above comment
            stxt=convert_entry(word,lemma,tags[0]).encode("utf-8")
            if "+M+SG" in tags:
                masc_in_a_sg.write("%s\n\n" % stxt)
            elif "+F+SG" in tags:
                fem_in_o_sg.write("%s\n\n" % stxt)
            # Plural forms were once discarded here; that generated
            # incorrect plurals of compounds like 'cabeça-chata', so
            # explicit plural branches were added instead (23/01/2020).
            # Note: an entry matching none of the four branches is
            # silently dropped in this class.
            elif "+F+PL" in tags:
                fem_in_o_pl.write("%s\n\n" % stxt)
            elif "+M+PL" in tags:
                masc_in_a_pl.write("%s\n\n" % stxt)
        else:
            stxt=convert_entry(word,lemma,tags[0]).encode("utf-8") # TODO: tags[:-2] (see above)
            if "+M+SG" in tags:
                other_m_sg.write("%s\n\n" % stxt)
            elif "+M+PL" in tags:
                other_m_pl.write("%s\n\n" % stxt)
            elif "+F+SG" in tags:
                other_f_sg.write("%s\n\n" % stxt)
            else:
                other_f_pl.write("%s\n\n" % stxt)
|
||
def main():
    """Extract and write bases for every input file named on the command
    line, then close all output streams."""
    for infile in sys.argv[1:]:
        write_entries(extract_entries(infile))
    output_streams = (
        aug_m_sg, aug_m_pl, aug_f_sg, aug_f_pl,
        wdlm_in_s_m_sg, wdlm_in_s_m_pl, wdlm_in_s_f_sg, wdlm_in_s_f_pl,
        masc_in_a_sg, fem_in_o_sg, masc_in_a_pl, fem_in_o_pl,
        other_m_sg, other_m_pl, other_f_sg, other_f_pl,
    )
    for stream in output_streams:
        stream.close()
|
||
# Run as a script: process the lexicon files named on the command line.
if __name__ == '__main__':
    main()
Oops, something went wrong.