emails

philipperemy · Oct 1, 2024 · abf83fc · abf83fc
1 parent 6ae42a6
commit abf83fc
Show file tree

Hide file tree

Showing 2 changed files with 189 additions and 0 deletions.
diff --git a/names_dataset/emails.py b/names_dataset/emails.py
@@ -0,0 +1,129 @@
+import re
+from typing import Dict
+
+import numpy as np
+
+from names_dataset import NameDataset
+
+# Single dataset instance, created when this module is imported and queried
+# by the scoring helpers and by extract_names_from_email below.
+nd = NameDataset()
+
+
+def _compute_score(ranks: Dict):
+    """Turn one name entry of an ``nd.search`` result into a comparable score.
+
+    :param ranks: The ``'first_name'`` or ``'last_name'`` entry of a search
+        result, i.e. a dict whose ``'rank'`` item maps keys to a rank or to
+        ``None``. ``None`` itself is accepted and means "no entry".
+    :return: The negated smallest rank, so the smallest rank yields the
+        largest score; ``float('-inf')`` when there is no entry or no rank.
+    """
+    # Search results use None for a name that is not in the dataset; treat it
+    # like an entry without any rank instead of failing on ranks['rank'].
+    if ranks is None:
+        return float('-inf')
+    known_ranks = [rank for rank in ranks['rank'].values() if rank is not None]
+    if not known_ranks:
+        return float('-inf')
+    return -min(known_ranks)
+
+
+def _score(candidate: str):
+    """Score how much ``candidate`` looks like a first name or a last name.
+
+    :param candidate: String to look up in the names dataset.
+    :return: The better of the first-name and last-name scores given by
+        ``_compute_score``; ``float('-inf')`` for an empty string or when the
+        dataset has neither a first-name nor a last-name entry for it.
+    """
+    if len(candidate) == 0:
+        return float('-inf')
+    # A single lookup provides both entries; searching twice for the same
+    # string only repeated the work.
+    found = nd.search(candidate)
+    scores = [
+        _compute_score(found[key])
+        for key in ('first_name', 'last_name')
+        if found[key] is not None
+    ]
+    return max(scores, default=float('-inf'))
+
+
+# Infer the best split between first and last name.
+def _infer_best_split(full_name: str):
+    """Find the cut of ``full_name`` whose two halves score best as names.
+
+    The unsplit string is the baseline; a split replaces it only when the sum
+    of the ``_score`` of both halves is strictly higher.
+
+    :param full_name: Concatenated name without separator, e.g. the local
+        part of an email address.
+    :return: ``((first, last), score)``. ``last`` is ``None`` and ``first`` is
+        the whole input when no split beats the unsplit string.
+    """
+    max_score = _score(full_name)
+    best_split = (full_name, None)
+
+    # Start at 1 so that both parts have characters. An empty first part
+    # scores -inf and could never beat the baseline, so skipping it does not
+    # change the result.
+    for i in range(1, len(full_name)):
+        first, last = full_name[:i], full_name[i:]
+
+        # Total score for this split.
+        total_score = _score(first) + _score(last)
+
+        # Strict comparison: on ties the earliest candidate is kept.
+        if total_score > max_score:
+            max_score = total_score
+            best_split = (first, last)
+
+    return best_split, max_score
+
+
+def _general_score(candidate: str):
+    """Score one separator-delimited part of an email prefix.
+
+    :param candidate: Part of the email prefix to look up in the dataset.
+    :return: The better of the first-name and last-name scores given by
+        ``_compute_score``, or ``float('-inf')`` when the dataset has no
+        entry at all for ``candidate``.
+    """
+    found = nd.search(candidate)
+    # A missing entry is None in the search result; skip it here rather than
+    # handing None to _compute_score, which indexes its argument.
+    scores = [
+        _compute_score(entry)
+        for entry in (found['first_name'], found['last_name'])
+        if entry is not None
+    ]
+    return max(scores, default=float('-inf'))
+
+
+def extract_names_from_email(email: str):
+    """Guess the first and last name encoded in an email address.
+
+    Digits are removed, generic mailboxes (``info``, ``sales``, ...) are
+    rejected, and the part before ``@`` is then split on a ``.``, ``_`` or
+    ``-`` separator. Without a usable separator the split position is
+    inferred by scoring both halves against the names dataset.
+
+    :param email: Address such as ``'first.last@example.com'``.
+    :return: ``(first_name, last_name)`` in lower case. An item is ``None``
+        when it could not be determined or is a single letter. Both are
+        ``None`` for generic mailboxes and for input that does not contain
+        exactly one ``@``.
+    """
+    email = ''.join(ch for ch in email if not ch.isnumeric())
+
+    # Unpacking email.split('@') needs exactly one '@'; anything else is not
+    # an address we can analyse, so report "no names" instead of raising.
+    if email.count('@') != 1:
+        return None, None
+    prefix, suffix = email.split('@')
+
+    no_names = ['contact', 'sales', 'info', 'hello', 'reply']
+    if any(no_name in prefix for no_name in no_names):
+        return None, None
+
+    for sep in ['.', '_', '-']:
+        if prefix.count(sep) >= 2:
+            # Three or more parts: keep the two with the highest scores and
+            # rebuild a two-part address for the patterns below.
+            parts = prefix.split(sep)
+            scores = [_general_score(part) for part in parts]
+            a, b = np.array(parts)[np.argsort(scores)][-2:]
+            email = f'{a}.{b}@{suffix}'
+
+    patterns = [
+        r"(?P<first>[a-zA-Z]+)\.(?P<last>[a-zA-Z]+)@",  # first.last@example.com
+        r"(?P<first>[a-zA-Z]+)_(?P<last>[a-zA-Z]+)@",  # first_last@example.com
+        r"(?P<last>[a-zA-Z]+)\.(?P<first>[a-zA-Z]+)@",  # last.first@example.com
+        r"(?P<first>[a-zA-Z]+)-(?P<last>[a-zA-Z]+)@",  # first-last@example.com
+        r"(?P<first>[a-zA-Z])[._](?P<last>[a-zA-Z]+)@",  # f.last@example.com or f_last@example.com
+        # The stray separator that used to precede '@' made this pattern
+        # unable to match the form it documents.
+        r"(?P<first>[a-zA-Z]+)[._](?P<last>[a-zA-Z])@",  # first.l@example.com or first_l@example.com
+    ]
+
+    # Try each pattern in order; the first one that matches wins.
+    first_name, last_name = None, None
+    for pattern in patterns:
+        match = re.match(pattern, email)
+        if match:
+            first_name = match.group('first').capitalize()
+            last_name = match.group('last').capitalize()
+            break
+    else:
+        # No separator pattern matched: infer where the prefix should be cut.
+        prefix = email.split('@')[0]
+        (first_name, last_name), _ = _infer_best_split(prefix)
+
+    # A single letter is an initial, not a name.
+    if first_name is not None and len(first_name) == 1:
+        first_name = None
+    if last_name is not None and len(last_name) == 1:
+        last_name = None
+
+    if first_name is not None and last_name is not None:
+        # Compare "as given" against "swapped" and keep the better reading.
+        # One lookup per name provides all four entries.
+        first_hit = nd.search(first_name)
+        last_hit = nd.search(last_name)
+        fn_1, fn_2 = first_hit['first_name'], first_hit['last_name']
+        ln_1, ln_2 = last_hit['last_name'], last_hit['first_name']
+        if all(entry is not None for entry in (fn_1, ln_1, fn_2, ln_2)):
+            score_1 = _compute_score(fn_1) + _compute_score(ln_1)
+            score_2 = _compute_score(fn_2) + _compute_score(ln_2)
+            if score_2 > score_1:
+                first_name, last_name = last_name, first_name
+
+    if first_name is not None:
+        first_name = first_name.lower()
+
+    if last_name is not None:
+        last_name = last_name.lower()
+
+    return first_name, last_name
diff --git a/tests/test_from_emails.py b/tests/test_from_emails.py
@@ -0,0 +1,60 @@
+import unittest
+
+from names_dataset.emails import extract_names_from_email
+
+
+class TestEmail(unittest.TestCase):
+    """Checks ``extract_names_from_email`` on representative addresses."""
+
+    def test_1(self):
+        # Each case keeps the address next to its expected (first, last)
+        # result, so the two cannot drift apart the way parallel lists can.
+        cases = [
+            ('info@skysense.jp', (None, None)),
+            ('isabelle.remy.fr@gmail.com', ('isabelle', 'remy')),
+            ('philippe.remy@example.com', ('philippe', 'remy')),
+            ('philipperemy@example.com', ('philippe', 'remy')),
+            ('philippe.d@example.com', ('philippe', None)),
+            ('p.remy123@example.com', (None, 'remy')),
+            ('philippe@example.com', ('philippe', None)),
+            ('philippe_remy@example.com', ('philippe', 'remy')),
+            ('remy.philippe@example.com', ('philippe', 'remy')),
+            ('remyphilippe@example.com', ('philippe', 'remy')),
+            ('j_remy@example.com', (None, 'remy')),
+            ('philippe.remy123@example.com', ('philippe', 'remy')),
+            ('philippe.d@example.com', ('philippe', None)),
+            ('philippe.d@example.com', ('philippe', None)),
+            ('j.remy@example.com', (None, 'remy')),
+            ('remyphilippe123@example.com', ('philippe', 'remy')),
+            ('philippe-d@example.com', ('philippe', None)),
+            ('remy_j@example.com', ('remy', None)),
+            ('j_remy123@example.com', (None, 'remy')),
+            ('philippe.remy1@example.com', ('philippe', 'remy')),
+        ]
+
+        for email, (expected_first, expected_last) in cases:
+            # subTest names the failing address and lets the remaining
+            # cases run instead of stopping at the first failure.
+            with self.subTest(email=email):
+                first_name, last_name = extract_names_from_email(email)
+                self.assertEqual(expected_first, first_name)
+                self.assertEqual(expected_last, last_name)