diff --git a/api/server.py b/api/server.py index 01c00f3..2a1f7db 100644 --- a/api/server.py +++ b/api/server.py @@ -1,11 +1,11 @@ import json import logging import sys -from typing import Union +from typing import Union, Optional, Any, List from flask import Flask, request from paste.translogger import TransLogger -from waitress import serve +from waitress import serve as _serve from names_dataset import NameDataset, NameWrapper @@ -20,41 +20,74 @@ nd = NameDataset() -def generate_output(d: Union[str, dict], status: bool) -> str: +def _generate_output(d: Union[str, dict]) -> str: + status = 'error' not in d return json.dumps({'status': status, 'message': d}, ensure_ascii=False, default=str) -@app.errorhandler(404) -def invalid_route(e): - return generate_output('invalid endpoint', status=False) +def _validate_input( + req, names: Union[str, List[str]], + required: bool = True, + default: Optional[Any] = None, + var_type: Any = str +): + if isinstance(names, str): + names = [names] + var = None + for name in names: + var = req.args.get(name) + if var is not None: + break + if var is None and required: + raise ValueError(f'Provide a parameter for [{names[0]}].') + elif var is None: + var = default + return var_type(var) if var is not None else None -@app.route('/') -def main(): - return generate_output('Welcome user! Name dataset api. query /search to perform a search.', status=True) +@app.errorhandler(404) +def _invalid_route(e): + return _generate_output('invalid endpoint') -def str2bool(s: Union[bool, str]) -> bool: +def _str2bool(s: Union[bool, str]) -> bool: if isinstance(s, bool): return s # noinspection PyBroadException try: return bool(eval(s)) except Exception: - if s.lower() in ['1', '0', 'true', 'y']: + if s.lower() in {'1', 'true', 'y', 't', 'yes', 'on'}: return True - return False + elif s.lower() in {'0', 'false', 'n', 'no', 'off'}: + return False + raise ValueError(f'Cannot convert to boolean: [{s}].') + + +def _process_inputs(req): + name = _validate_input(req, 'name', required=True) + n = _validate_input(req, 'n', required=False, default=5, var_type=int) + use_first_names = _validate_input(req, 'use_first_names', required=True, var_type=_str2bool) + gender = _validate_input(req, 'gender', required=False) + country_alpha2 = _validate_input(req, 'country_alpha2', required=False) + return name, n, use_first_names, gender, country_alpha2 + + +@app.route('/') +def _main(): + endpoints = [a for a, b in globals().items() if not str(a).startswith('_') and 'function' in str(b)] + return _generate_output(f'Welcome to the Name Search API! List of endpoints: [{", ".join(sorted(endpoints))}].') @app.route('/country_codes', methods=['GET']) def country_codes(): try: req = request - alpha_2 = str2bool(req.args.get('alpha_2', False)) + alpha_2 = _str2bool(req.args.get('alpha_2', False)) result = nd.get_country_codes(alpha_2=alpha_2) - return generate_output({'result': result}, status=True) + return _generate_output({'result': result}) except Exception as e: - return generate_output({'error': str(e)}, status=True) + return _generate_output({'error': str(e)}) @app.route('/top', methods=['GET']) @@ -62,29 +95,52 @@ def top(): try: req = request n = int(req.args.get('n', 100)) - use_first_names = str2bool(req.args.get('use_first_names', True)) + use_first_names = _str2bool(req.args.get('use_first_names', True)) country_alpha2 = req.args.get('country_alpha2', None) gender = req.args.get('gender', None) result = nd.get_top_names(n, use_first_names, country_alpha2, gender) - return generate_output({'result': result}, status=True) + return _generate_output({'result': result}) except Exception as e: - return generate_output({'error': str(e)}, status=True) + return _generate_output({'error': str(e)}) @app.route('/search', methods=['GET']) -def search(): +def search(): # legacy. try: - req = request - q = req.args.get('q') - if q is None: - return generate_output('provide a parameter q, for example q=Mike', status=False) + name = request.args.get('q') + if name is None: + return _generate_output('provide a parameter q, for example q=Mike') else: - result = nd.search(q) + result = nd.search(name) result['describe'] = NameWrapper(result).describe - return generate_output({'result': result}, status=True) + return _generate_output({'result': result}) + except Exception as e: + return _generate_output({'error': str(e)}) + + +@app.route('/fuzzy_search', methods=['GET']) +def fuzzy_search(): + try: + name, n, use_first_names, gender, country_alpha2 = _process_inputs(request) + result = nd.fuzzy_search( + name=name, n=n, use_first_names=use_first_names, country_alpha2=country_alpha2, gender=gender + ) + return _generate_output({'result': result}) + except Exception as e: + return _generate_output({'error': str(e)}) + + +@app.route('/autocomplete', methods=['GET']) +def autocomplete(): + try: + name, n, use_first_names, gender, country_alpha2 = _process_inputs(request) + result = nd.auto_complete( + name=name, n=n, use_first_names=use_first_names, country_alpha2=country_alpha2, gender=gender + ) + return _generate_output({'result': result}) except Exception as e: - return generate_output({'error': str(e)}, status=True) + return _generate_output({'error': str(e)}) if __name__ == '__main__': - serve(TransLogger(app, setup_console_handler=False), port=8888, threads=4) + _serve(TransLogger(app, setup_console_handler=False), port=8888, threads=4) diff --git a/names_dataset/nd_v3.py b/names_dataset/nd_v3.py index 388c3a1..e8bc13a 100644 --- a/names_dataset/nd_v3.py +++ b/names_dataset/nd_v3.py @@ -5,7 +5,7 @@ import zipfile from collections import defaultdict from pathlib import Path -from typing import Optional +from typing import Optional, Dict, List import pycountry @@ -49,6 +49,68 @@ def describe(self): return f'{self.gender}, {self.country}' +def _autocomplete_search( + prefix: str, + names_dict: Dict[str, Dict], + n: int = 5, + gender: Optional[str] = None, + country_alpha2: Optional[str] = None, + max_rank: int = 5000 +) -> List[Dict]: + matching_names = [] + for name, info in names_dict.items(): + if name.startswith(prefix) and not name.startswith(prefix + ' '): + if gender is None or (len(info['gender']) > 0 and gender == max(info['gender'], key=info['gender'].get)): + matching_names.append(name) + result = [] + for name in matching_names: + attrs = names_dict[name] + ranks = attrs['rank'] + if len(ranks) <= 1: + continue + if country_alpha2 is not None: + if country_alpha2 not in ranks: + continue + rank = ranks[country_alpha2] + else: + rank = int(sum(ranks.values()) / len(ranks)) + result.append({'name': name, 'rank': rank}) + result = sorted(result, key=lambda x: x['rank']) + result = [r for r in result if r['rank'] < max_rank][:n] + return result + + +def _fuzzy_search( + fuzzy_name: str, + names_dict: Dict[str, Dict], + n: int = 5, + gender: Optional[str] = None, + country_alpha2: Optional[str] = None, +) -> List[Dict]: + from fuzzywuzzy import fuzz + closest_names = [] + for name, info in names_dict.items(): + similarity = fuzz.ratio(fuzzy_name, name) + if gender is None or (len(info['gender']) > 0 and gender == max(info['gender'], key=info['gender'].get)): + closest_names.append((name, similarity)) + closest_names.sort(key=lambda x: x[1], reverse=True) + result = [] + for name in closest_names[0:n * 5]: + attrs = names_dict[name[0]] + ranks = attrs['rank'] + if len(ranks) == 0: + continue + if country_alpha2 is not None: + if country_alpha2 not in ranks: + continue + measure = ranks[country_alpha2] + else: + measure = int(sum(ranks.values()) / len(ranks)) + result.append({'name': name[0], 'measure': measure}) + result = sorted(result, key=lambda x: x['measure'])[0:n] + return result + + class NameDataset: def __init__(self, load_first_names=True, load_last_names=True): @@ -58,6 +120,36 @@ def __init__(self, load_first_names=True, load_last_names=True): last_names_filename = Path(os.path.dirname(__file__)) / 'v3/last_names.zip' self.first_names = self._read_json_from_zip(first_names_filename) if load_first_names else None self.last_names = self._read_json_from_zip(last_names_filename) if load_last_names else None + self.country_codes = self.get_country_codes(alpha_2=True) + + def auto_complete( + self, + name: str, + n: int = 5, + use_first_names: bool = True, + country_alpha2: Optional[str] = None, + gender: Optional[str] = None, + *args, **kwargs + ) -> List[Dict]: + name, gender = self._process_inputs(name, use_first_names, gender, country_alpha2) + names_dict = self.first_names if use_first_names else self.last_names + return _autocomplete_search( + n=n, prefix=name, names_dict=names_dict, gender=gender, country_alpha2=country_alpha2, *args, **kwargs + ) + + def fuzzy_search( + self, + name: str, + n: int = 5, + use_first_names: bool = True, + country_alpha2: Optional[str] = None, + gender: Optional[str] = None, + ) -> List[Dict]: + name, gender = self._process_inputs(name, use_first_names, gender, country_alpha2) + names_dict = self.first_names if use_first_names else self.last_names + return _fuzzy_search( + n=n, fuzzy_name=name, names_dict=names_dict, gender=gender, country_alpha2=country_alpha2 + ) @staticmethod def _read_json_from_zip(zip_file): @@ -65,6 +157,29 @@ def _read_json_from_zip(zip_file): with z.open(z.filelist[0]) as f: return json.load(f) + def _process_inputs( + self, + name: str, + use_first_names: bool, + gender: Optional[str] = None, + country_alpha2: Optional[str] = None + ): + q_name = name.strip().title() + if use_first_names and self.first_names is None: + raise ValueError('Select [load_first_names=True] at init.') + if not use_first_names and self.last_names is None: + raise ValueError('Select [load_last_names=True] at init.') + if gender is not None: + if gender.title() in {'M', 'Male'}: + gender = 'M' + elif gender.title() in {'F', 'Female'}: + gender = 'F' + else: + raise ValueError('Invalid gender value.') + if country_alpha2 is not None and country_alpha2 not in self.country_codes: + raise ValueError(f'Invalid Country alpha-2 code. Valid are: {",".join(self.country_codes)}.') + return q_name, gender + def search(self, name: str): key = name.strip().title() fn = self._post_process(self.first_names.get(key)) if self.first_names is not None else None