
Commit

parser Intents() clean() now has a "deep" cleaning function that also removes Intents found in the "inc" attribute

- fixed major bug in parser Intents() clean()
- added a deep cleaning option for parser Intents() clean() that also removes Intents in the "inc" attribute if both the parent Intent and at least one of the "inc" Intents were found (see the usage sketch after this list)
- parser Intents() clean() now removes "typo_stem" matches as well and returns the merged results
- test_entities now checks for accented intent keys, improves its regex declaration checks, detects accidental regex declarations as well as affixes and prefixes not declared as lists, and prints a warning message for each issue
- increased version number to 1.1.0 due to major update
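
A minimal usage sketch of the new option. The intent declaration below is hypothetical and only for illustration; the clean() signature and the deep behavior come from this commit:

```python
from lara import parser

# Hypothetical intent declaration; the real entity format is the
# stem/inc structure validated in tests/test_entities.py.
intents = parser.Intents({
    'koszones': [{'stem': 'szia', 'inc': [{'stem': 'uram'}]}]
})

text = 'szia uram'
print(intents.clean(text))             # removes matches of the parent stem only
print(intents.clean(text, deep=True))  # also removes matches found via "inc",
                                       # since both the parent Intent and an
                                       # "inc" Intent were found in the text
```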
sedthh committed May 4, 2018
1 parent 43aeb77 commit 1c5bf66
Showing 3 changed files with 63 additions and 21 deletions.
lara/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -3,7 +3,7 @@
 # Lara - Linguistic Aim Recognizer API
 
 __all__ = 'nlp','parser','stemmer','entities'
-__version__ = '1.0.8'
+__version__ = '1.1.0'
 __version_info__ = tuple(int(num) for num in __version__.split('.'))
 
 import sys
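
For reference, the derived __version_info__ tuple after this commit, a trivial check using the exact expression from the diff above:

```python
# What the derived tuple evaluates to after the version bump:
__version__ = '1.1.0'
__version_info__ = tuple(int(num) for num in __version__.split('.'))
assert __version_info__ == (1, 1, 0)
```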
lara/parser.py (45 changes: 30 additions & 15 deletions)
@@ -271,21 +271,22 @@ def match_set(self, text=""):
         return set()
 
     # Remove matches from text
-    def clean(self, text=""):
+    def clean(self, text="", deep=False):
         if text:
-            return self._get_clean_text(text)
+            return self._get_clean_text(text,deep)
         else:
             return ""
 
     # Returns text without the inflected forms of matched intents
-    def _get_clean_text(self, text):
-        text     = lara.nlp.trim(text)
-        fix_text = text
+    def _get_clean_text(self, text, deep):
+        text        = lara.nlp.trim(text)
+        typo_text   = lara.nlp.strip_accents(lara.nlp.remove_double_letters(text))
+        c_text      = text
+        c_typo_text = typo_text
         if text:
             for key, value in self.intents.items():
                 ignore = False
                 allow  = -1
                 for item in self.intents[key]:
                     if 'exc' in item and item['exc']:
                         for exc in item['exc']:
@@ -301,14 +302,25 @@ def _get_clean_text(self, text):
                             allow = 1
                         elif self._match_pattern(typo_text,inc,True)[0]: # typo_stem
                             allow = 1
-                if not ignore and allow in (-1,1):
-                    max_words = _re.words(text)
                 for item in self.intents[key]:
+                    if not ignore and allow in (-1,1):
+                        max_words = _re.words(text)
                     if item['max_words'] <= max_words:
-                        fix_text = self._match_pattern(fix_text,item,False,True) # stem
-                        fix_text = self._match_pattern(fix_text,item,True,True) # typo_stem
-            return fix_text
+                        c_text      = self._match_pattern(c_text,item,False,True,deep) # stem
+                        c_typo_text = self._match_pattern(c_typo_text,item,True,True,deep) # typo_stem
+            # attempt to merge results
+            c_text      = lara.nlp.trim(c_text).split()
+            c_typo_text = lara.nlp.trim(c_typo_text).split()
+            fix_text    = []
+            last = 0
+            for word in c_text:
+                x_word = lara.nlp.strip_accents(lara.nlp.remove_double_letters(word))
+                for i in range(last,len(c_typo_text)):
+                    if x_word==c_typo_text[i]:
+                        fix_text.append(word)
+                        last = i
+                        break
+            return ' '.join(fix_text)
         return text
 
     # Get score for intents in text
     def _get_score(self, text, greedy=True):
@@ -355,12 +367,12 @@ def _get_score(self, text, greedy=True):
         return score
 
     # Find an intent in text
-    def _match_pattern(self, text, item, is_clean=False, delete=False):
+    def _match_pattern(self, text, item, is_clean=False, delete=False, deep=False):
         if text:
             if not delete and item['max_words']:
                 if _re.words(text)>item['max_words']:
                     return (False,0)
 
             if is_clean:
                 select = 'typo_'
             else:
@@ -383,6 +395,9 @@ def _match_pattern(self, text, item, is_clean=False, delete=False):
                     match = match[0]
                 if item['match_stem'] or (item['ignorecase'] and match.lower() != item[select+'stem'].lower()) or (match.lower() != item[select+'stem']):
                     tmp = _re.sub(boundary+r'('+re.escape(match)+r')'+boundary,re.IGNORECASE,'',tmp)
+                if deep and 'inc' in item:
+                    for inc in item['inc']:
+                        tmp = self._match_pattern(tmp, inc, is_clean, delete, deep)
                 return tmp
             else:
                 if not item['match_stem']:
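
The merge step added at the end of _get_clean_text keeps a word from the accented cleaning pass only if its accent-stripped, de-doubled form also survived the typo-tolerant pass. A standalone sketch of that logic, with simplified stand-ins for lara.nlp.strip_accents and lara.nlp.remove_double_letters (the real helpers may behave differently):

```python
import unicodedata

def strip_accents(word):
    # stand-in for lara.nlp.strip_accents: drop combining marks after NFD
    return ''.join(c for c in unicodedata.normalize('NFD', word)
                   if unicodedata.category(c) != 'Mn')

def remove_double_letters(word):
    # stand-in for lara.nlp.remove_double_letters: collapse repeated letters
    out = []
    for c in word:
        if not out or out[-1] != c:
            out.append(c)
    return ''.join(out)

def merge_clean_results(c_text, c_typo_text):
    # mirror of the merge loop in the diff: keep a word only if its
    # normalized form also survived the typo-tolerant cleaning pass
    c_text, c_typo_text = c_text.split(), c_typo_text.split()
    fix_text, last = [], 0
    for word in c_text:
        x_word = strip_accents(remove_double_letters(word))
        for i in range(last, len(c_typo_text)):
            if x_word == c_typo_text[i]:
                fix_text.append(word)
                last = i
                break
    return ' '.join(fix_text)

print(merge_clean_results('szia jó uram', 'szia jo uram'))  # -> 'szia jó uram'
```

The last index keeps the two token streams aligned, so each word is matched against the typo-pass tokens in order rather than anywhere in the sentence.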
tests/test_entities.py (37 changes: 32 additions & 5 deletions)
@@ -8,17 +8,22 @@
 valid_keys = set(['stem','clean_stem','affix','clean_affix','prefix','clean_prefix','wordclass','inc','exc','score','clean_score','match_stem','ignorecase','boundary','max_words'])
 valid_class = set(['noun','verb','adjective','regex','emoji','special'])
 is_regex = set(['|','(',')','+','*','+','?','\\','[',']','{','}'])
+accents = set(['á','Á','é','É','í','Í','ü','Ü','ű','Ű','ú','Ú','ö','Ö','ő','Ő','ó','Ó'])
 
 def validate_intent(intents):
     for intent,declaration in intents.items():
+        for char in intent:
+            if char in accents:
+                print(intent,"key has accents in declaration")
+                break
         for item in declaration:
             validate_intent_item(item,intent)
             if 'inc' in item:
                 for sub_item in item['inc']:
                     validate_intent_item(sub_item,intent)
             if 'exc' in item:
                 for sub_item in item['exc']:
                     validate_intent_item(sub_item,intent)
 
 def validate_intent_item(item,intent):
     for key in item:
@@ -27,6 +32,12 @@ def validate_intent_item(item,intent):
     if 'wordclass' in item:
         if item['wordclass'] not in valid_class:
             print(intent,'has invalid "wordclass" declared')
+    if 'affix' in item:
+        if not isinstance(item['affix'], list) and not isinstance(item['affix'], tuple):
+            print(intent,'has "affix" declared, but not as a list:',item['stem'])
+    if 'prefix' in item:
+        if not isinstance(item['prefix'], list) and not isinstance(item['prefix'], tuple):
+            print(intent,'has "prefix" declared, but not as a list:',item['stem'])
     if 'stem' not in item:
         print(intent,'missing "stem" key')
     else:
@@ -40,7 +51,7 @@ def validate_intent_item(item,intent):
                     switch = True
                 elif char==']':
                     switch = False
-                elif char in ('á','Á','é','É','í','Í','ü','Ü','ű','Ű','ú','Ú','ö','Ö','ő','Ő','ó','Ó'):
+                elif char in accents:
                     if not switch:
                         print(intent,'has accents declared in regular expression without counterparts:',item['stem'])
                         break
@@ -51,13 +62,29 @@ def validate_intent_item(item,intent):
                         last = last+char
                     else:
                         last = char
+        elif 'wordclass' in item and item['wordclass']=='regex':
+            really = False
+            for char in item['stem']:
+                if not char.isalnum() and char not in (' ','-'):
+                    really = True
+                    break
+            if not really:
+                print(intent,'probably has a regex "wordclass" declared by accident in',item['stem'])
+        if any(test in item['stem'] for test in is_regex):
+            if 'wordclass' not in item or item['wordclass']!='regex':
+                print(intent,'probably has a regex "wordclass" declared otherwise in',item['stem'])
 
 @pytest.mark.parametrize("entity", [
     "common","commands","counties","dow","smalltalk","emoji","disallow","tone"
 ])
 def test_entities(entity):
     parenthesis_check = eval('parser.Intents(entities.'+entity+'()).match_set("test")')
     eval('validate_intent(entities.'+entity+'())')
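
For reference, a few hypothetical declarations that would trip the new warnings (all names invented for illustration; the exact print output follows the checks in the diff above):

```python
# Each item below triggers one of the newly added checks:
bad_intents = {
    'állat': [                                        # accented key -> warning
        {'stem': 'allat', 'affix': 'ok'},             # "affix" is a string, not a list -> warning
        {'stem': 'hello ott', 'wordclass': 'regex'},  # no regex metacharacters -> likely accidental regex
        {'stem': 'kutya|macska'},                     # metacharacters without a regex wordclass -> warning
    ]
}
validate_intent(bad_intents)
```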


