Skip to content

Commit

Permalink
issue #72 including Python module for the extraction of noun and adje…
Browse files Browse the repository at this point in the history
…ctive bases
  • Loading branch information
leoalenc committed Feb 19, 2020
1 parent 48f653e commit ee896cf
Show file tree
Hide file tree
Showing 2 changed files with 414 additions and 0 deletions.
207 changes: 207 additions & 0 deletions ExtractWordLemmaPairs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-

# Author: Leonel Figueiredo de Alencar
# leonel.de.alencar@ufc.br
# Date: April 20, 2018, updated February 18, 2020

"""This module is the first component of the architecture of a generator of Portuguese diminutives. It extracts possible diminutive formation bases from existing nouns and adjectives encoded in MorphoBr's format, as described in the following paper:
ALENCAR, Leonel Figueiredo de; CUCONATO, Bruno; RADEMAKER, Alexandre. MorphoBr: an open source large-coverage full-form lexicon for morphological analysis of Portuguese. Texto Livre: Linguagem e Tecnologia, Belo Horizonte, v. 11, n. 3, p. 1-25, set.-dez. 2018.
ISSN 1983-3652
DOI: 10.17851/1983-3652.11.3.1-25
http://www.periodicos.letras.ufmg.br/index.php/textolivre/article/view/14294.
Implausible bases are filtered out (see details below). The extracted bases are converted to spaced-text format and written to different files, according to the classification expected by the finite-state grammar in the morphotactic-grammar.lexc file. Some examples may help clarify the pipeline:
Input in MorphoBr's format:
agulhão agulha+N+AUG+M+SG
agulhões agulha+N+AUG+M+PL
agulhona agulha+N+AUG+F+SG
agulhonas agulha+N+AUG+F+PL
Output generated by this module (written to different files):
a g u l h a +N +AUG
a g u l h ã o
a g u l h a +N +AUG
a g u l h õ e s
a g u l h a +N +AUG
a g u l h o n a
a g u l h a +N +AUG
a g u l h o n a s
"""
import os, sys, re

# Entries whose analysis contains any of these tags are filtered out and
# never used as derivation bases.
EXCLUDE_TAGS = ["+DIM", "+SUPER"]

# Suffix appended to the name of every output file.
EXTENSION = ".stxt"

# ABB is a regex pattern matching items that cannot function as bases for
# morphological derivations. This includes one or more consonants before a
# whitespace character at the beginning of a line, for example:
#     b b+N+M+SG
#     c c+N+M+SG
#     d d+N+M+SG
# These items are in fact abbreviations. As such, they cannot feed diminutive
# formation, e.g. *bzinho 'little b' is ungrammatical (the correct form is
# 'bezinho', from 'be', the name of letter b).
# The pattern also matches abbreviations such as 'ha' (for hectare) and
# chemical symbols ('Ba', 'Ca', etc.): a single consonant followed by
# 'a', 'e' or 'o'. Matching is case-insensitive.
CONS = "[bcdfghjklmnpqrstvwxyz]"
ABB = re.compile(
    r"(?i)(%s{1,}|%s[aeo])\s" % (CONS, CONS)
)

# Output files, opened for writing in the current working directory as soon
# as the module is loaded. Each name encodes the entry class and the
# gender/number combination (m/f, sg/pl) routed to it by write_entries().

# Entries whose tags contain "+AUG".
aug_m_sg = open("aug_m_sg%s" % EXTENSION, "w")
aug_m_pl = open("aug_m_pl%s" % EXTENSION, "w")
aug_f_sg = open("aug_f_sg%s" % EXTENSION, "w")
aug_f_pl = open("aug_f_pl%s" % EXTENSION, "w")

# Entries whose word form equals the lemma and ends in "s".
wdlm_in_s_m_sg = open("wdlm_in_s_m_sg%s" % EXTENSION, "w")
wdlm_in_s_m_pl = open("wdlm_in_s_m_pl%s" % EXTENSION, "w")
wdlm_in_s_f_sg = open("wdlm_in_s_f_sg%s" % EXTENSION, "w")
wdlm_in_s_f_pl = open("wdlm_in_s_f_pl%s" % EXTENSION, "w")

# Entries accepted by NonCanonGendMarker(): masculines in -a, feminines in -o.
masc_in_a_sg = open("masc_in_a_sg%s" % EXTENSION, "w")
fem_in_o_sg = open("fem_in_o_sg%s" % EXTENSION, "w")
masc_in_a_pl = open("masc_in_a_pl%s" % EXTENSION, "w")
fem_in_o_pl = open("fem_in_o_pl%s" % EXTENSION, "w")

# Every remaining entry.
other_m_sg = open("other_m_sg%s" % EXTENSION, "w")
other_m_pl = open("other_m_pl%s" % EXTENSION, "w")
other_f_sg = open("other_f_sg%s" % EXTENSION, "w")
other_f_pl = open("other_f_pl%s" % EXTENSION, "w")

def extract_entries(infile):
    """Return the usable entries of *infile* as a list of text strings.

    The file is read as UTF-8. Every line is stripped of surrounding
    whitespace and kept only when ignore_entry() returns True for it, i.e.
    when it is neither empty, nor tagged with an excluded tag, nor an
    abbreviation.

    infile -- path of a lexicon file with one "word analysis" pair per line.
    """
    # Local import: io.open accepts `encoding` on both Python 2 and Python 3,
    # whereas the former open(infile, "rU") + str.decode only ran on Python 2.
    import io

    # The context manager closes the input file, which used to be leaked.
    # The default newline handling of io.open is universal-newlines mode,
    # matching the former "rU" flag.
    with io.open(infile, "r", encoding="utf-8") as handle:
        stripped_lines = (line.strip() for line in handle)
        return [entry for entry in stripped_lines if ignore_entry(entry)]

def split_entry(entry):
    """Split *entry* at every run of whitespace and return the list of fields.

    Leading or trailing whitespace yields empty strings at the corresponding
    end of the list.
    """
    whitespace_run = r"\s+"
    fields = re.split(whitespace_run, entry)
    return fields

def exclude_abbr(entry):
    """Return True when *entry* starts with an abbreviation matched by ABB."""
    return ABB.match(entry) is not None

def exclude_tag(entry):
    """Return True when *entry* contains at least one tag of EXCLUDE_TAGS."""
    return any(tag in entry for tag in EXCLUDE_TAGS)

def ignore_entry(entry):
    """Return True when *entry* must be KEPT, False when it must be dropped.

    Despite its name, this predicate answers "is the entry usable?": it
    returns False for an empty string, for an entry carrying an excluded tag
    and for an abbreviation, and True for everything else.
    """
    is_empty = entry == ""
    # Same short-circuit order as before: emptiness, then tags, then ABB.
    discard = is_empty or exclude_tag(entry) or exclude_abbr(entry)
    return not discard

def space(word):
    """Return *word* with a single blank inserted between its characters."""
    separator = " "
    return separator.join(word)

def convert_entry(word, lemma, tags):
    """Build the two-line spaced-text representation of an entry.

    The first line holds the spaced lemma followed by the tags, each
    prefixed with "+" and separated by blanks; the second line holds the
    spaced word form. No trailing newline is added.

    tags -- iterable of tag names without the leading "+".
    """
    tag_field = "+%s" % " +".join(tags)
    upper_line = space(lemma)
    lower_line = space(word)
    return "%s %s\n%s" % (upper_line, tag_field, lower_line)

def parse_entry(entry):
    """Decompose *entry* into a (word, lemma, tags) tuple.

    *entry* must consist of exactly two whitespace-separated fields, the
    word form and its analysis. The analysis is cut at its first "+": the
    part before it is the lemma, the remainder is returned as one string of
    "+"-joined tags (without the leading "+").

    Raises ValueError when the entry does not have exactly two fields or the
    analysis contains no "+".
    """
    word, analysis = split_entry(entry)
    lemma, tags = analysis.split("+", 1)
    return word, lemma, tags

def WordLemmaInS(word, lemma):
    """Return True when *word* and *lemma* are identical and end in "s"."""
    if word != lemma:
        return False
    # Both strings are equal here, so testing one ending covers both.
    return word.endswith("s")


def NonCanonGendMarker(word, tags):
    """Return True when the ending of *word* disagrees with its gender tag.

    This covers masculine forms ending in "a" (singular) or "as" (plural)
    and feminine forms ending in "o" (singular) or "os" (plural). For
    hyphenated words, a masculine plural ending in "a" or a feminine plural
    ending in "o" also qualifies (N-N compounds like 'acos-liga' and
    'amostras-tipo', whose second member stays uninflected).

    tags -- the "+"-joined tag string of the entry.
    """
    hyphenated = "-" in word
    masc_sg = "+M+SG" in tags
    masc_pl = "+M+PL" in tags
    fem_sg = "+F+SG" in tags
    fem_pl = "+F+PL" in tags

    if masc_sg and word.endswith("a"):
        return True
    if fem_sg and word.endswith("o"):
        return True
    if masc_pl and (word.endswith("as") or (hyphenated and word.endswith("a"))):
        return True
    if fem_pl and (word.endswith("os") or (hyphenated and word.endswith("o"))):
        return True
    return False

def _select_output(tags, routes, default=None):
    """Return the handle paired with the first marker that occurs in *tags*.

    routes  -- ordered sequence of (marker, file handle) pairs.
    default -- value returned when no marker occurs in *tags*.
    """
    for marker, handle in routes:
        if marker in tags:
            return handle
    return default


def _category_tag(tags):
    """Return the first "+"-separated tag of *tags* as a one-element list."""
    # Indexing the tag string (tags[0]) yields its first CHARACTER, which is
    # only correct for one-letter category tags; splitting is correct for
    # tags of any length.
    return tags.split("+", 1)[:1]


def _as_writable(text):
    """Return *text* in the form expected by "%s" formatting and write()."""
    # Python 2: entries are `unicode` and must be encoded to UTF-8 bytes
    # before being written to files created with the built-in open().
    # Python 3: entries are already `str`; encoding them would make "%s"
    # render the bytes repr (b'...') into the output file.
    # NOTE(review): on Python 3 the on-disk encoding is then whatever the
    # output files were opened with -- confirm it is UTF-8.
    if isinstance(text, str):
        return text
    return text.encode("utf-8")


def write_entries(entries):
    """Convert each entry to spaced text and append it to its output file.

    Every entry is parsed with parse_entry() and classified, in this order:
      1. augmentatives ("+AUG" in the tags), written with their first two
         tags to the aug_* files;
      2. forms identical to their lemma and ending in "s" (WordLemmaInS),
         written to the wdlm_in_s_* files;
      3. forms with a non-canonical gender marker (NonCanonGendMarker),
         written to the masc_in_a_* / fem_in_o_* files;
      4. everything else, written to the other_* files.
    Classes 2-4 keep only the category tag. Within a class the file is
    chosen by the gender/number markers found in the tags. Each record is
    followed by an empty line.

    entries -- iterable of "word analysis" strings, e.g. as returned by
               extract_entries().
    """
    # TODO: for the non-augmentative classes use re.split(r"\+", tags)[:-2],
    # excluding gender and number tags but including other tags besides the
    # category tag (this may be useful in the future).
    for entry in entries:
        word, lemma, tags = parse_entry(entry)

        if "+AUG" in tags:
            kept_tags = re.split(r"\+", tags)[:2]
            # Anything not tagged +M+SG, +M+PL or +F+SG falls through to the
            # feminine-plural file (same fallback in classes 2 and 4 below).
            target = _select_output(
                tags,
                (("+M+SG", aug_m_sg),
                 ("+M+PL", aug_m_pl),
                 ("+F+SG", aug_f_sg)),
                aug_f_pl,
            )
        elif WordLemmaInS(word, lemma):
            kept_tags = _category_tag(tags)
            target = _select_output(
                tags,
                (("+M+SG", wdlm_in_s_m_sg),
                 ("+M+PL", wdlm_in_s_m_pl),
                 ("+F+SG", wdlm_in_s_f_sg)),
                wdlm_in_s_f_pl,
            )
        elif NonCanonGendMarker(word, tags):
            kept_tags = _category_tag(tags)
            # No fallback file here: an entry matching none of the four
            # markers is not written at all.
            # NOTE(review): a 23/01/2020 remark in the previous revision
            # mentions incorrect plurals of compounds in connection with
            # this branch; plural forms are currently kept -- confirm.
            target = _select_output(
                tags,
                (("+M+SG", masc_in_a_sg),
                 ("+F+SG", fem_in_o_sg),
                 ("+F+PL", fem_in_o_pl),
                 ("+M+PL", masc_in_a_pl)),
            )
        else:
            kept_tags = _category_tag(tags)
            target = _select_output(
                tags,
                (("+M+SG", other_m_sg),
                 ("+M+PL", other_m_pl),
                 ("+F+SG", other_f_sg)),
                other_f_pl,
            )

        if target is None:
            continue
        record = _as_writable(convert_entry(word, lemma, kept_tags))
        target.write("%s\n\n" % record)

def main():
    """Process every file named on the command line, then close all outputs.

    Each path in sys.argv[1:] is read with extract_entries() and its entries
    are dispatched to the output files by write_entries().
    """
    output_files = (
        aug_m_sg, aug_m_pl, aug_f_sg, aug_f_pl,
        wdlm_in_s_m_sg, wdlm_in_s_m_pl, wdlm_in_s_f_sg, wdlm_in_s_f_pl,
        masc_in_a_sg, fem_in_o_sg, masc_in_a_pl, fem_in_o_pl,
        other_m_sg, other_m_pl, other_f_sg, other_f_pl,
    )
    try:
        for infile in sys.argv[1:]:
            write_entries(extract_entries(infile))
    finally:
        # Close (and thereby flush) the outputs even when an input file is
        # missing or malformed, so that what was already written is kept.
        for handle in output_files:
            handle.close()

if __name__ == '__main__':
main()
Loading

0 comments on commit ee896cf

Please sign in to comment.