Skip to content

Commit

Permalink
emails
Browse files Browse the repository at this point in the history
  • Loading branch information
philipperemy committed Oct 9, 2024
1 parent a8216a6 commit 6d79fee
Show file tree
Hide file tree
Showing 3 changed files with 261 additions and 28 deletions.
35 changes: 20 additions & 15 deletions api/server.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import json
import logging
import sys
from typing import Union
from typing import Union, Optional, Dict

from flask import Flask, request
from paste.translogger import TransLogger
from waitress import serve

from names_dataset import NameDataset, NameWrapper
from names_dataset.emails import extract_names_from_email
from names_dataset.emails import extract_names_from_email, try_to_split_with_two_last_names

logger = logging.getLogger(__name__)
logging.basicConfig(
Expand Down Expand Up @@ -47,6 +47,16 @@ def str2bool(s: Union[bool, str]) -> bool:
return False


def package_name(name: Optional[str], identifier: str) -> Optional[Dict]:
    """Look up *name* in the module-level dataset and label the match.

    Args:
        name: The name to search for. ``None`` is accepted and simply yields
            ``None``, so callers can forward an optional name part as-is.
        identifier: Key selecting which entry of the search result to return
            (the callers in this file pass ``'first_name'`` or
            ``'last_name'``).

    Returns:
        The entry stored under ``identifier`` in ``nd.search(name)``, with a
        ``'name'`` key set to the title-cased input, or ``None`` when *name*
        is ``None`` or the dataset has no entry for it.
    """
    if name is None:
        return None

    result = nd.search(name)[identifier]
    if result is not None:
        # NOTE(review): this writes into the dict handed back by nd.search();
        # confirm that search() returns a fresh dict on every call.
        result['name'] = name.title()
    return result


@app.route('/split', methods=['GET'])
def split():
try:
Expand All @@ -59,21 +69,16 @@ def split():
)
else:
first_name, last_name = extract_names_from_email(nd, q)
if first_name is not None:
result_first_name = nd.search(first_name)['first_name']
if result_first_name is not None:
result_first_name['name'] = first_name
else:
result_first_name = None
if last_name is not None:
result_last_name = nd.search(last_name)['last_name']
if result_last_name is not None:
result_last_name['name'] = last_name
else:
result_last_name = None
last_name2 = None
if first_name is None or last_name is None:
first_name, last_name, last_name2 = try_to_split_with_two_last_names(nd, q)
result_first_name = package_name(first_name, 'first_name')
result_last_name = package_name(last_name, 'last_name')
result_last_name2 = package_name(last_name2, 'last_name')
result = {
'first_name': result_first_name,
'last_name': result_last_name
'last_name': result_last_name,
'last_name2': result_last_name2
}
return generate_output({'result': result}, status=True)
except Exception as e:
Expand Down
55 changes: 44 additions & 11 deletions names_dataset/emails.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
from collections import Counter
from typing import Dict

import numpy as np
Expand Down Expand Up @@ -60,10 +61,37 @@ def _general_score(nd: NameDataset, candidate: str):
return float('-inf')


def try_to_split_with_two_last_names(nd: NameDataset, email: str):
    """Split *email* into one first name and two last names.

    The function works in three steps:

    1. Run ``extract_names_from_email`` on every proper prefix of *email* and
       count how often each returned name appears; the most frequent name is
       the first candidate.
    2. Remove that candidate from *email* and run the extractor on what is
       left to obtain the two remaining candidates.
    3. Run ``_infer_first_and_last_names`` on each pair of candidates and take
       the name most often returned in the first-name slot as the first name.
       The other two candidates are the last names, ordered by where they
       appear in *email*.

    Args:
        nd: Dataset handed through to the extraction and inference helpers.
        email: The string to split. The tests in this project pass the part
            before the ``@`` only.

    Returns:
        ``(first_name, last_name1, last_name2)``, or ``(None, None, None)``
        when three usable candidates cannot be found.
    """
    no_match = (None, None, None)

    votes = Counter()
    for end in range(1, len(email)):
        first_name, last_name = extract_names_from_email(nd, email[0:end])
        if first_name is not None:
            votes[first_name] += 1
        if last_name is not None:
            votes[last_name] += 1
    if not votes:
        return no_match

    candidate1 = votes.most_common(1)[0][0]
    candidate2, candidate3 = extract_names_from_email(nd, email.replace(candidate1, ''))
    if candidate2 is None or candidate3 is None:
        # A missing candidate used to be passed on to the dataset lookup and
        # to email.index(), which raised instead of reporting "no match".
        return no_match

    fn1, _ = _infer_first_and_last_names(candidate1, candidate2, nd)
    fn2, _ = _infer_first_and_last_names(candidate1, candidate3, nd)
    fn3, _ = _infer_first_and_last_names(candidate2, candidate3, nd)
    real_first_name = Counter([fn1, fn2, fn3]).most_common(1)[0][0]

    last_names = list({candidate1, candidate2, candidate3} - {real_first_name})
    if len(last_names) < 2:
        # Duplicate candidates collapse in the set; there is no second surname.
        return no_match

    # Report the surnames in the order they occur in the address. str.find is
    # used instead of str.index so that a name that is not a verbatim
    # substring sorts first rather than raising ValueError.
    last_names.sort(key=email.find)
    return real_first_name, last_names[0], last_names[1]


def extract_names_from_email(nd: NameDataset, email: str):
email = email.strip()
if '' in email:
email = email.split(' ')[0]
if '@' not in email:
email += '@gmail.com'

Expand Down Expand Up @@ -116,15 +144,7 @@ def extract_names_from_email(nd: NameDataset, email: str):
last_name = None

if first_name is not None and last_name is not None:
fn_1 = nd.search(first_name)['first_name']
ln_1 = nd.search(last_name)['last_name']
fn_2 = nd.search(first_name)['last_name']
ln_2 = nd.search(last_name)['first_name']
if fn_1 is not None and ln_1 is not None and fn_2 is not None and ln_2 is not None:
score_1 = _compute_score(fn_1) + _compute_score(ln_1)
score_2 = _compute_score(fn_2) + _compute_score(ln_2)
if score_2 > score_1:
first_name, last_name = last_name, first_name
first_name, last_name = _infer_first_and_last_names(first_name, last_name, nd)

if first_name is not None:
first_name = first_name.lower()
Expand All @@ -133,3 +153,16 @@ def extract_names_from_email(nd: NameDataset, email: str):
last_name = last_name.lower()

return first_name, last_name


def _infer_first_and_last_names(first_name, last_name, nd):
fn_1 = nd.search(first_name)['first_name']
ln_1 = nd.search(last_name)['last_name']
fn_2 = nd.search(first_name)['last_name']
ln_2 = nd.search(last_name)['first_name']
if fn_1 is not None and ln_1 is not None and fn_2 is not None and ln_2 is not None:
score_1 = _compute_score(fn_1) + _compute_score(ln_1)
score_2 = _compute_score(fn_2) + _compute_score(ln_2)
if score_2 > score_1:
first_name, last_name = last_name, first_name
return first_name, last_name
199 changes: 197 additions & 2 deletions tests/test_from_emails.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,205 @@
import unittest

from names_dataset import NameDataset
from names_dataset.emails import extract_names_from_email
from names_dataset.emails import extract_names_from_email, try_to_split_with_two_last_names


class TestEmail(unittest.TestCase):

def test_1(self):
def test_with_three_3(self):
    """Check the three-part split on two orderings per name triple.

    ``inputs`` and ``outputs`` are parallel lists: ``outputs[k]`` holds the
    expected ``[first_name, last_name, last_name2]`` for ``inputs[k]``.
    """
    inputs = [
        'perezmartiisabel',
        'isabelmartiperez',
        'martiperezisabel',
        'isabelperezmarti',

        'garciafernandezmaria',
        'mariafernandezgarcia',

        'gonzalezlopezana',
        'analopezgonzalez',

        'rodriguezhernandezjuan',
        'juanhernandezrodriguez',

        'suarezdominguezcarlos',
        'carlosdominguezsuarez',

        'sanchezruizlucia',
        'luciaruizsanchez',

        'gomeznunezmiguel',
        'miguelnunezgomez',
    ]

    outputs = [
        ['isabel', 'perez', 'marti'],
        ['isabel', 'marti', 'perez'],
        ['isabel', 'marti', 'perez'],
        ['isabel', 'perez', 'marti'],

        ['maria', 'garcia', 'fernandez'],
        ['maria', 'fernandez', 'garcia'],

        ['ana', 'gonzalez', 'lopez'],
        ['ana', 'lopez', 'gonzalez'],

        ['juan', 'rodriguez', 'hernandez'],
        ['juan', 'hernandez', 'rodriguez'],

        ['carlos', 'suarez', 'dominguez'],
        ['carlos', 'dominguez', 'suarez'],

        ['lucia', 'sanchez', 'ruiz'],
        ['lucia', 'ruiz', 'sanchez'],

        ['miguel', 'gomez', 'nunez'],
        ['miguel', 'nunez', 'gomez'],
    ]
    # zip() stops at the shorter list, so a missing expectation would
    # otherwise go unnoticed.
    self.assertEqual(len(inputs), len(outputs))

    nd = NameDataset()
    for address, expected in zip(inputs, outputs):
        local_part = address.split('@')[0]
        # subTest names the failing address and lets the other cases run.
        with self.subTest(email=local_part):
            actual = try_to_split_with_two_last_names(nd, local_part)
            self.assertEqual(expected, list(actual))

def test_with_three_2(self):
    """Check the three-part split on four orderings per name triple.

    ``inputs`` and ``outputs`` are parallel lists: ``outputs[k]`` holds the
    expected ``[first_name, last_name, last_name2]`` for ``inputs[k]``.
    """
    inputs = [
        'torresmoralesines',

        'perezmartiisabel',
        'isabelmartiperez',
        'martiperezisabel',
        'isabelperezmarti',

        'garciafernandezmaria',
        'mariafernandezgarcia',
        'fernandezgarciamaria',
        'mariagarciafernandez',

        'gonzalezlopezana',
        'analopezgonzalez',
        'lopezgonzalezana',
        'anagonzalezlopez',

        'rodriguezhernandezjuan',
        'juanhernandezrodriguez',
        'hernandezrodriguezjuan',
        'juanrodriguezhernandez',

        'suarezdominguezcarlos',
        'carlosdominguezsuarez',
        'dominguezsuarezcarlos',
        'carlossuarezdominguez',

        'sanchezruizlucia',
        'luciaruizsanchez',
        'ruizsanchezlucia',
        'luciasanchezruiz',

        'gomeznunezmiguel',
        'miguelnunezgomez',
        'nunezgomezmiguel',
        'miguelgomeznunez',

        'moralestorresines',
        'inestorresmorales',
    ]

    outputs = [
        ['ines', 'torres', 'morales'],

        ['isabel', 'perez', 'marti'],
        ['isabel', 'marti', 'perez'],
        ['isabel', 'marti', 'perez'],
        ['isabel', 'perez', 'marti'],

        ['maria', 'garcia', 'fernandez'],
        ['maria', 'fernandez', 'garcia'],
        ['maria', 'fernandez', 'garcia'],
        ['maria', 'garcia', 'fernandez'],

        ['ana', 'gonzalez', 'lopez'],
        ['ana', 'lopez', 'gonzalez'],
        ['ana', 'lopez', 'gonzalez'],
        ['ana', 'gonzalez', 'lopez'],

        ['juan', 'rodriguez', 'hernandez'],
        ['juan', 'hernandez', 'rodriguez'],
        ['juan', 'hernandez', 'rodriguez'],
        ['juan', 'rodriguez', 'hernandez'],

        ['carlos', 'suarez', 'dominguez'],
        ['carlos', 'dominguez', 'suarez'],
        ['carlos', 'dominguez', 'suarez'],
        ['carlos', 'suarez', 'dominguez'],

        ['lucia', 'sanchez', 'ruiz'],
        ['lucia', 'ruiz', 'sanchez'],
        ['lucia', 'ruiz', 'sanchez'],
        ['lucia', 'sanchez', 'ruiz'],

        ['miguel', 'gomez', 'nunez'],
        ['miguel', 'nunez', 'gomez'],
        ['miguel', 'nunez', 'gomez'],
        ['miguel', 'gomez', 'nunez'],

        ['ines', 'morales', 'torres'],
        ['ines', 'torres', 'morales'],
    ]
    # zip() stops at the shorter list, so a missing expectation would
    # otherwise go unnoticed.
    self.assertEqual(len(inputs), len(outputs))

    nd = NameDataset()
    for address, expected in zip(inputs, outputs):
        local_part = address.split('@')[0]
        # subTest names the failing address and lets the other cases run.
        with self.subTest(email=local_part):
            actual = try_to_split_with_two_last_names(nd, local_part)
            self.assertEqual(expected, list(actual))

def test_with_three_1(self):
    """Check the three-part split on the four orderings of one name triple.

    ``inputs`` and ``outputs`` are parallel lists: ``outputs[k]`` holds the
    expected ``[first_name, last_name, last_name2]`` for ``inputs[k]``.
    """
    inputs = [
        'perezmartiisabel',
        'isabelmartiperez',
        'martiperezisabel',
        'isabelperezmarti',
    ]
    outputs = [
        ['isabel', 'perez', 'marti'],
        ['isabel', 'marti', 'perez'],

        ['isabel', 'marti', 'perez'],
        ['isabel', 'perez', 'marti'],
    ]
    # zip() stops at the shorter list, so a missing expectation would
    # otherwise go unnoticed.
    self.assertEqual(len(inputs), len(outputs))

    nd = NameDataset()
    for address, expected in zip(inputs, outputs):
        local_part = address.split('@')[0]
        # subTest names the failing address and lets the other cases run.
        with self.subTest(email=local_part):
            actual = try_to_split_with_two_last_names(nd, local_part)
            self.assertEqual(expected, list(actual))

def test_with_two(self):
inputs = [
'info@skysense.jp',
'isabelle.remy.fr@gmail.com',
Expand All @@ -28,6 +221,7 @@ def test_1(self):
'remy_j@example.com',
'j_remy123@example.com',
'philippe.remy1@example.com',
'perezmarti',
]
inputs2 = []
for i in inputs:
Expand All @@ -54,6 +248,7 @@ def test_1(self):
['remy', None],
[None, 'remy'],
['philippe', 'remy'],
['perez', 'marti'],
]

nd = NameDataset()
Expand Down

0 comments on commit 6d79fee

Please sign in to comment.