Skip to content

Commit

Permalink
emails
Browse files Browse the repository at this point in the history
  • Loading branch information
philipperemy committed Oct 1, 2024
1 parent 6ae42a6 commit abf83fc
Show file tree
Hide file tree
Showing 2 changed files with 189 additions and 0 deletions.
129 changes: 129 additions & 0 deletions names_dataset/emails.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import re
from typing import Dict

import numpy as np

from names_dataset import NameDataset

nd = NameDataset()


def _compute_score(ranks: Dict):
values = {a: b for a, b in ranks['rank'].items() if b is not None}.values()
if len(values) == 0:
return float('-inf')
return -min(values)


def _score(candidate: str):
if len(candidate) == 0:
return float('-inf')
first_name = nd.search(candidate)['first_name']
last_name = nd.search(candidate)['last_name']
if first_name is None and last_name is None:
return float('-inf')
if first_name is not None and last_name is not None:
s1 = _compute_score(first_name)
s2 = _compute_score(last_name)
return max(s1, s2)
if first_name is not None:
return _compute_score(first_name)
if last_name is not None:
return _compute_score(last_name)


# Function to infer the best split between first and last name
def _infer_best_split(full_name: str):
max_score = _score(full_name)
best_split = (full_name, None)

# Try all possible ways to split the full_name
for i in range(0, len(full_name)): # Start at 1 to ensure both parts have characters
first = full_name[:i]
last = full_name[i:]

# Calculate total score for the split
total_score = _score(first) + _score(last)

# If this split has a higher score, update the best split
if total_score > max_score:
max_score = total_score
best_split = (first, last)

return best_split, max_score


def _general_score(candidate: str):
c = nd.search(candidate)
s1 = _compute_score(c['first_name'])
s2 = _compute_score(c['last_name'])
return max(s1, s2)


def extract_names_from_email(email: str):
email = ''.join([e for e in list(email) if not e.isnumeric()])

prefix, suffix = email.split('@')

no_names = ['contact', 'sales', 'info', 'hello', 'reply']
for no_name in no_names:
if no_name in prefix:
return None, None

if 'contact' in prefix:
return None, None

for e in ['.', '_', '-']:
if prefix.count(e) >= 2:
c_list = prefix.split(e)
scores = [_general_score(c) for c in c_list]
a, b = np.array(c_list)[np.argsort(scores)][-2:]
email = f'{a}.{b}@{suffix}'

patterns = [
r"(?P<first>[a-zA-Z]+)\.(?P<last>[a-zA-Z]+)@", # first.last@example.com
r"(?P<first>[a-zA-Z]+)_(?P<last>[a-zA-Z]+)@", # first_last@example.com
r"(?P<last>[a-zA-Z]+)\.(?P<first>[a-zA-Z]+)@", # last.first@example.com
r"(?P<first>[a-zA-Z]+)-(?P<last>[a-zA-Z]+)@", # first-last@example.com
r"(?P<first>[a-zA-Z])[._](?P<last>[a-zA-Z]+)@", # f.last@example.com or f_last@example.com
r"(?P<first>[a-zA-Z]+)[._](?P<last>[a-zA-Z])[._]@", # first.l@example.com or first_l@example.com
]

# Try matching each pattern with the email
first_name, last_name = None, None
had_matched = False
for pattern in patterns:
match = re.match(pattern, email)
if match:
first_name = match.group('first').capitalize()
last_name = match.group('last').capitalize()
had_matched = True
break

if not had_matched:
prefix = email.split('@')[0]
(first_name, last_name), max_score = _infer_best_split(prefix)

if first_name is not None and len(first_name) == 1:
first_name = None
if last_name is not None and len(last_name) == 1:
last_name = None

if first_name is not None and last_name is not None:
fn_1 = nd.search(first_name)['first_name']
ln_1 = nd.search(last_name)['last_name']
fn_2 = nd.search(first_name)['last_name']
ln_2 = nd.search(last_name)['first_name']
if fn_1 is not None and ln_1 is not None and fn_2 is not None and ln_2 is not None:
score_1 = _compute_score(fn_1) + _compute_score(ln_1)
score_2 = _compute_score(fn_2) + _compute_score(ln_2)
if score_2 > score_1:
first_name, last_name = last_name, first_name

if first_name is not None:
first_name = first_name.lower()

if last_name is not None:
last_name = last_name.lower()

return first_name, last_name
60 changes: 60 additions & 0 deletions tests/test_from_emails.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import unittest

from names_dataset.emails import extract_names_from_email


class TestEmail(unittest.TestCase):

def test_1(self):
inputs = [
'info@skysense.jp',
'isabelle.remy.fr@gmail.com',
'philippe.remy@example.com',
'philipperemy@example.com',
'philippe.d@example.com',
'p.remy123@example.com',
'philippe@example.com',
'philippe_remy@example.com',
'remy.philippe@example.com',
'remyphilippe@example.com',
'j_remy@example.com',
'philippe.remy123@example.com',
'philippe.d@example.com',
'philippe.d@example.com',
'j.remy@example.com',
'remyphilippe123@example.com',
'philippe-d@example.com',
'remy_j@example.com',
'j_remy123@example.com',
'philippe.remy1@example.com',
]

outputs = [
[None, None],
['isabelle', 'remy'],
['philippe', 'remy'],
['philippe', 'remy'],
['philippe', None],
[None, 'remy'],
['philippe', None],
['philippe', 'remy'],
['philippe', 'remy'],
['philippe', 'remy'],
[None, 'remy'],
['philippe', 'remy'],
['philippe', None],
['philippe', None],
[None, 'remy'],
['philippe', 'remy'],
['philippe', None],
['remy', None],
[None, 'remy'],
['philippe', 'remy'],
]

for input_, output_ in zip(inputs, outputs):
first_name, last_name = extract_names_from_email(input_)
print(input_)
self.assertEqual(output_[0], first_name)
self.assertEqual(output_[1], last_name)
print('[OK]')

0 comments on commit abf83fc

Please sign in to comment.