From 23f4407004c978884961f8f3f6c3823571c9df99 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Mon, 13 Jan 2025 17:46:20 +0100 Subject: [PATCH 1/3] Query param to enable AND filtering for facets --- tests/test_search.py | 14 ++++++++++++++ yente/routers/match.py | 1 + yente/routers/search.py | 32 ++++++++++++++++++++++++-------- yente/routers/util.py | 2 +- yente/search/queries.py | 41 ++++++++++++++++++++++++++++------------- 5 files changed, 68 insertions(+), 22 deletions(-) diff --git a/tests/test_search.py b/tests/test_search.py index e5381fa1..df53571e 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -119,6 +119,20 @@ def test_search_filter_countries_remove(): assert len(results) == 0, results +def test_search_filter_countries_operator(): + res = client.get("/search/default?q=vladimir putin&countries=ke&countries=ru") + assert res.status_code == 200, res + results = res.json()["results"] + assert len(results) > 0, results + + res = client.get( + "/search/default?q=vladimir putin&filter_op=and&countries=ke&countries=ru" + ) + assert res.status_code == 200, res + results = res.json()["results"] + assert len(results) == 0, results + + def test_search_facet_datasets_default(): res = client.get("/search/default") assert res.status_code == 200, res diff --git a/yente/routers/match.py b/yente/routers/match.py index b9653817..6c386290 100644 --- a/yente/routers/match.py +++ b/yente/routers/match.py @@ -63,6 +63,7 @@ async def match( fuzzy: bool = Query( settings.MATCH_FUZZY, title="Use slow matching for candidate generation, does not affect scores", + deprecated=True, ), changed_since: Optional[str] = Query( None, diff --git a/yente/routers/search.py b/yente/routers/search.py index 1765be78..e46d2a3a 100644 --- a/yente/routers/search.py +++ b/yente/routers/search.py @@ -11,7 +11,7 @@ from yente.provider import SearchProvider, get_provider from yente.search.queries import parse_sorts, text_query from yente.search.queries import facet_aggregations -from yente.search.queries import FilterDict +from yente.search.queries import FilterDict, Operator from yente.search.search import get_entity, search_entities from yente.search.search import result_entities, result_facets, result_total from yente.search.nested import serialize_entity @@ -28,7 +28,6 @@ class Facet(str, enum.Enum): DATASETS = "datasets" SCHEMA = "schema" COUNTRIES = "countries" - NAMES = "names" IDENTIFIERS = "identifiers" TOPICS = "topics" GENDERS = "genders" @@ -55,10 +54,13 @@ async def search( settings.BASE_SCHEMA, title="Types of entities that can match the search" ), include_dataset: List[str] = Query( - [], title="Only include the given datasets in results" + [], + title="Restrict the search scope to datasets (that are in the given scope) to search entities within.", + description="Limit the results to entities that are part of at least one of the given datasets.", ), exclude_dataset: List[str] = Query( - [], title="Remove the given datasets from results" + [], + title="Remove specific datasets (that are in the given scope) from the search scope.", ), exclude_schema: List[str] = Query( [], title="Remove the given types of entities from results" @@ -72,7 +74,10 @@ async def search( topics: List[str] = Query( [], title="Filter by entity topics (e.g. sanction, role.pep)" ), - datasets: List[str] = Query([], title="Use `include_dataset` instead"), + datasets: List[str] = Query( + [], + title="Filter by dataset names, for faceting use (respects operator choice).", + ), limit: int = Query( settings.DEFAULT_PAGE, title="Number of results to return", le=settings.MAX_PAGE ), @@ -80,13 +85,23 @@ async def search( 0, title="Start at result with given offset", le=settings.MAX_OFFSET ), sort: List[str] = Query([], title="Sorting criteria"), - target: Optional[bool] = Query(None, title="Include only targeted entities"), + target: Optional[bool] = Query( + None, + title="Include only targeted entities", + description="Please specify a list of topics of concern, instead.", + deprecated=True, + ), fuzzy: bool = Query(False, title="Allow fuzzy query syntax"), simple: bool = Query(False, title="Use simple syntax for user-facing query boxes"), facets: List[Facet] = Query( DEFAULT_FACETS, title="Facet counts to include in response.", ), + filter_op: Operator = Query( + "OR", + title="Define behaviour of multiple filters on one field", + description="Logic to use when combining multiple filters on the same field (topics, countries, datasets). Please specify AND for new integrations (to override a legacy default) and when building a faceted user interface.", + ), provider: SearchProvider = Depends(get_provider), ) -> SearchResponse: """Search endpoint for matching entities based on a simple piece of text, e.g. @@ -105,8 +120,8 @@ async def search( filters: FilterDict = { "countries": countries, "topics": topics, + "datasets": datasets, } - include_dataset.extend(datasets) if target is not None: filters["target"] = target query = text_query( @@ -117,9 +132,10 @@ async def search( fuzzy=fuzzy, simple=simple, include_dataset=include_dataset, - exclude_schema=exclude_schema, exclude_dataset=exclude_dataset, + exclude_schema=exclude_schema, changed_since=changed_since, + filter_op=filter_op, ) aggregations = facet_aggregations([f.value for f in facets]) resp = await search_entities( diff --git a/yente/routers/util.py b/yente/routers/util.py index 8fb77ce5..afb9cfe6 100644 --- a/yente/routers/util.py +++ b/yente/routers/util.py @@ -9,7 +9,7 @@ PATH_DATASET = Path( - description="Data source or collection name to be queries", + description="Data source or collection name to scope the query to.", examples=["default"], ) QUERY_PREFIX = Query("", min_length=0, description="Search prefix") diff --git a/yente/search/queries.py b/yente/search/queries.py index 7bc2fbe3..5cb51722 100644 --- a/yente/search/queries.py +++ b/yente/search/queries.py @@ -1,5 +1,6 @@ +import enum from pprint import pprint # noqa -from typing import Any, Dict, Generator, List, Tuple, Union, Optional +from typing import Any, Dict, Generator, List, Set, Tuple, Union, Optional from followthemoney.schema import Schema from followthemoney.proxy import EntityProxy from followthemoney.types import registry @@ -16,31 +17,38 @@ Clause = Dict[str, Any] +class Operator(str, enum.Enum): + AND = "AND" + OR = "OR" + + def filter_query( shoulds: List[Clause], - dataset: Optional[Dataset] = None, + scope_dataset: Optional[Dataset] = None, schema: Optional[Schema] = None, filters: FilterDict = {}, include_dataset: List[str] = [], exclude_schema: List[str] = [], exclude_dataset: List[str] = [], changed_since: Optional[str] = None, + filter_op: Operator = Operator.AND, ) -> Clause: filterqs: List[Clause] = [] must_not: List[Clause] = [] - datasets: List[str] = include_dataset - if not len(datasets) and dataset is not None: - datasets = dataset.dataset_names - for exclude_ds in exclude_dataset: + + datasets: Set[str] = set(scope_dataset.dataset_names) + if len(include_dataset): + datasets = datasets.intersection(include_dataset) + if len(exclude_dataset): # This is logically a bit more consistent, but doesn't describe the use # case of wanting to screen all the entities from datasets X, Y but not Z: # must_not.append({"term": {"datasets": exclude_ds}}) - if exclude_ds in datasets: - datasets.remove(exclude_ds) + datasets = datasets.difference(exclude_dataset) if len(datasets): - filterqs.append({"terms": {"datasets": datasets}}) + filterqs.append({"terms": {"datasets": list(datasets)}}) else: filterqs.append({"match_none": {}}) + if schema is not None: schemata = schema.matchable_schemata if not schema.matchable: @@ -53,7 +61,12 @@ def filter_query( continue values = [v for v in values if len(v)] if len(values): - filterqs.append({"terms": {field: values}}) + if filter_op == Operator.OR: + filterqs.append({"terms": {field: values}}) + continue + elif filter_op == Operator.AND: + for v in values: + filterqs.append({"term": {field: v}}) if changed_since is not None: filterqs.append({"range": {"last_change": {"gt": changed_since}}}) @@ -121,7 +134,7 @@ def entity_query( return filter_query( shoulds, filters=filters, - dataset=dataset, + scope_dataset=dataset, schema=entity.schema, include_dataset=include_dataset, exclude_schema=exclude_schema, @@ -141,6 +154,7 @@ def text_query( exclude_schema: List[str] = [], exclude_dataset: List[str] = [], changed_since: Optional[str] = None, + filter_op: Operator = Operator.AND, ) -> Clause: if not len(query.strip()): should: Clause = {"match_all": {}} @@ -168,13 +182,14 @@ def text_query( # log.info("Query", should=should) return filter_query( [should], - dataset=dataset, + scope_dataset=dataset, schema=schema, filters=filters, include_dataset=include_dataset, exclude_schema=exclude_schema, exclude_dataset=exclude_dataset, changed_since=changed_since, + filter_op=filter_op, ) @@ -186,7 +201,7 @@ def prefix_query( should: Clause = {"match_none": {}} else: should = {"match_phrase_prefix": {"names": {"query": prefix, "slop": 2}}} - return filter_query([should], dataset=dataset) + return filter_query([should], scope_dataset=dataset) def facet_aggregations(fields: List[str] = []) -> Clause: From 6b28ba49b63ae2a214c8bc80d4893ead3c02d0bb Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Mon, 13 Jan 2025 18:40:30 +0100 Subject: [PATCH 2/3] change case --- tests/test_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_search.py b/tests/test_search.py index df53571e..8f3912e8 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -126,7 +126,7 @@ def test_search_filter_countries_operator(): assert len(results) > 0, results res = client.get( - "/search/default?q=vladimir putin&filter_op=and&countries=ke&countries=ru" + "/search/default?q=vladimir putin&filter_op=AND&countries=ke&countries=ru" ) assert res.status_code == 200, res results = res.json()["results"] From 06fae5696577c89bcd730c74164bcd1456dba641 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Mon, 13 Jan 2025 18:59:58 +0100 Subject: [PATCH 3/3] Force dataset argument to filter_query --- yente/search/queries.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yente/search/queries.py b/yente/search/queries.py index 5cb51722..978d71a1 100644 --- a/yente/search/queries.py +++ b/yente/search/queries.py @@ -23,8 +23,8 @@ class Operator(str, enum.Enum): def filter_query( + scope_dataset: Dataset, shoulds: List[Clause], - scope_dataset: Optional[Dataset] = None, schema: Optional[Schema] = None, filters: FilterDict = {}, include_dataset: List[str] = [], @@ -132,9 +132,9 @@ def entity_query( shoulds.append({"match": {"text": value}}) return filter_query( + dataset, shoulds, filters=filters, - scope_dataset=dataset, schema=entity.schema, include_dataset=include_dataset, exclude_schema=exclude_schema, @@ -181,8 +181,8 @@ def text_query( } # log.info("Query", should=should) return filter_query( + dataset, [should], - scope_dataset=dataset, schema=schema, filters=filters, include_dataset=include_dataset, @@ -201,7 +201,7 @@ def prefix_query( should: Clause = {"match_none": {}} else: should = {"match_phrase_prefix": {"names": {"query": prefix, "slop": 2}}} - return filter_query([should], scope_dataset=dataset) + return filter_query(dataset, [should]) def facet_aggregations(fields: List[str] = []) -> Clause: