# File: names_dataset/emails.py
"""Heuristics for guessing a first and last name from an email address.

Every candidate string is looked up in the module-level ``NameDataset``
instance; the rank values returned by ``nd.search`` decide how the local
part of the address is split and which half is the first name.
"""
import re
from typing import Dict, Optional, Tuple

import numpy as np

from names_dataset import NameDataset

nd = NameDataset()

_NEG_INF = float('-inf')

# Local parts containing any of these substrings are treated as generic
# mailboxes rather than personal addresses.
_GENERIC_PREFIXES = ('contact', 'sales', 'info', 'hello', 'reply')

# One pattern covers every "<first><sep><last>@" layout: first.last,
# first_last, first-last, as well as the single-letter forms f.last and
# first.l. A "last.first" address has the same shape; the rank-based swap in
# extract_names_from_email() is what puts the two halves in the right order.
_NAME_PATTERN = re.compile(r"(?P<first>[a-zA-Z]+)[._-](?P<last>[a-zA-Z]+)@")


def _compute_score(ranks: Optional[Dict]) -> float:
    """Turn one ``nd.search`` entry into a score (higher is better).

    :param ranks: the ``'first_name'`` or ``'last_name'`` entry of an
        ``nd.search`` result, or ``None`` when the name was not found. Its
        ``'rank'`` mapping may contain ``None`` values, which are ignored.
    :return: the negated smallest rank, or ``-inf`` when there is no entry or
        no usable rank.
    """
    if ranks is None:
        return _NEG_INF
    values = [rank for rank in ranks['rank'].values() if rank is not None]
    if not values:
        return _NEG_INF
    # A small rank is the better one, so negate it to make "higher is better".
    return -min(values)


def _score(candidate: str) -> float:
    """Score ``candidate`` as a name, whichever of first/last fits best.

    :param candidate: the string to look up; an empty string scores ``-inf``
        without querying the dataset.
    :return: the larger of the first-name and last-name scores.
    """
    if not candidate:
        return _NEG_INF
    found = nd.search(candidate)  # single lookup serves both readings
    return max(_compute_score(found['first_name']), _compute_score(found['last_name']))


def _infer_best_split(full_name: str) -> Tuple[Tuple[str, Optional[str]], float]:
    """Find the best way to cut ``full_name`` into two names.

    The unsplit string is the baseline; a split only wins when the sum of the
    scores of its two halves is strictly higher.

    :param full_name: a separator-free local part such as ``'philipperemy'``.
    :return: ``((first, last), score)`` where ``last`` is ``None`` when no
        split beats the unsplit string.
    """
    max_score = _score(full_name)
    best_split = (full_name, None)

    # Start at 1 so that both halves are non-empty; an empty half scores -inf
    # and could never win anyway.
    for i in range(1, len(full_name)):
        first, last = full_name[:i], full_name[i:]
        total_score = _score(first) + _score(last)
        if total_score > max_score:
            max_score = total_score
            best_split = (first, last)

    return best_split, max_score


def _general_score(candidate: str) -> float:
    """Score one separator-delimited chunk of a local part.

    Delegates to ``_score`` so that empty chunks and chunks missing from the
    dataset score ``-inf`` instead of raising ``TypeError`` on a ``None``
    search entry.
    """
    return _score(candidate)


def extract_names_from_email(email: str) -> Tuple[Optional[str], Optional[str]]:
    """Guess the first and last name encoded in an email address.

    :param email: an address containing exactly one ``'@'``.
    :return: ``(first_name, last_name)`` in lower case. Either element is
        ``None`` when it could not be determined or is a single initial; both
        are ``None`` for generic mailboxes such as ``info@`` or ``sales@``.
    :raises ValueError: if ``email`` does not contain exactly one ``'@'``.
    """
    address = email
    # Digits never belong to a name (e.g. "remy123"), so drop them up front.
    email = ''.join(ch for ch in email if not ch.isnumeric())

    if email.count('@') != 1:
        raise ValueError(f"expected exactly one '@' in email address: {address!r}")
    prefix, suffix = email.split('@')

    lowered_prefix = prefix.lower()
    if any(word in lowered_prefix for word in _GENERIC_PREFIXES):
        return None, None

    # With three or more chunks (e.g. "isabelle.remy.fr") keep only the two
    # chunks that look most like names and rebuild a two-part address.
    for sep in ('.', '_', '-'):
        if prefix.count(sep) >= 2:
            chunks = prefix.split(sep)
            scores = [_general_score(chunk) for chunk in chunks]
            a, b = np.array(chunks)[np.argsort(scores)][-2:]
            email = f'{a}.{b}@{suffix}'

    match = _NAME_PATTERN.match(email)
    if match:
        first_name = match.group('first').capitalize()
        last_name = match.group('last').capitalize()
    else:
        # No separator to rely on: let the dataset decide where to cut.
        (first_name, last_name), _ = _infer_best_split(email.split('@')[0])

    # A lone initial (or an empty string) is not a usable name.
    if first_name is not None and len(first_name) <= 1:
        first_name = None
    if last_name is not None and len(last_name) <= 1:
        last_name = None

    if first_name is not None and last_name is not None:
        first_hit = nd.search(first_name)
        last_hit = nd.search(last_name)
        fn_1, ln_1 = first_hit['first_name'], last_hit['last_name']
        fn_2, ln_2 = first_hit['last_name'], last_hit['first_name']
        # Only reorder when both strings are known in both roles; then prefer
        # whichever assignment of roles has the better combined score.
        if all(entry is not None for entry in (fn_1, ln_1, fn_2, ln_2)):
            score_as_written = _compute_score(fn_1) + _compute_score(ln_1)
            score_swapped = _compute_score(fn_2) + _compute_score(ln_2)
            if score_swapped > score_as_written:
                first_name, last_name = last_name, first_name

    if first_name is not None:
        first_name = first_name.lower()
    if last_name is not None:
        last_name = last_name.lower()

    return first_name, last_name
# File: tests/test_from_emails.py
"""Tests for extracting first and last names from email addresses."""
import unittest

from names_dataset.emails import extract_names_from_email


class TestEmail(unittest.TestCase):
    """Checks ``extract_names_from_email`` against known addresses."""

    def test_1(self):
        """Each address must yield the expected ``(first_name, last_name)``."""
        # Each input is paired with its expectation so the two cannot drift
        # apart, which two parallel lists joined by zip() would allow.
        cases = [
            ('info@skysense.jp', (None, None)),
            ('isabelle.remy.fr@gmail.com', ('isabelle', 'remy')),
            ('philippe.remy@example.com', ('philippe', 'remy')),
            ('philipperemy@example.com', ('philippe', 'remy')),
            ('philippe.d@example.com', ('philippe', None)),
            ('p.remy123@example.com', (None, 'remy')),
            ('philippe@example.com', ('philippe', None)),
            ('philippe_remy@example.com', ('philippe', 'remy')),
            ('remy.philippe@example.com', ('philippe', 'remy')),
            ('remyphilippe@example.com', ('philippe', 'remy')),
            ('j_remy@example.com', (None, 'remy')),
            ('philippe.remy123@example.com', ('philippe', 'remy')),
            ('philippe.d@example.com', ('philippe', None)),
            ('philippe.d@example.com', ('philippe', None)),
            ('j.remy@example.com', (None, 'remy')),
            ('remyphilippe123@example.com', ('philippe', 'remy')),
            ('philippe-d@example.com', ('philippe', None)),
            ('remy_j@example.com', ('remy', None)),
            ('j_remy123@example.com', (None, 'remy')),
            ('philippe.remy1@example.com', ('philippe', 'remy')),
        ]

        for email, expected in cases:
            # subTest reports every failing address instead of stopping at
            # the first one.
            with self.subTest(email=email):
                first_name, last_name = extract_names_from_email(email)
                self.assertEqual(expected[0], first_name)
                self.assertEqual(expected[1], last_name)