From 193468872f387943d2524422ec1c64d83c36a716 Mon Sep 17 00:00:00 2001 From: Madison Swain-Bowden Date: Wed, 6 Mar 2024 15:47:16 -0800 Subject: [PATCH] Preemptively fail unparsable/unusable Europeana URLs (#3845) --- .../providers/provider_api_scripts/europeana.py | 10 +++++++++- .../provider_api_scripts/test_europeana.py | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/catalog/dags/providers/provider_api_scripts/europeana.py b/catalog/dags/providers/provider_api_scripts/europeana.py index 63f44685da0..e96c2d2720d 100644 --- a/catalog/dags/providers/provider_api_scripts/europeana.py +++ b/catalog/dags/providers/provider_api_scripts/europeana.py @@ -98,7 +98,15 @@ def get_record_data(self, data: dict) -> dict | None: @raise_if_empty def _get_image_url(self, data: dict) -> str | None: group = data.get("edmIsShownBy") - return group[0] if group else None + if not group: + return None + url = group[0] + # Some Europeana URLs may have prefixes, or reference Dropbox (which we can't + # include in our catalog because we cannot access them directly ourselves). + # E.g.: L-APC248-https://www.dropbox.com/s/i1pqizm1joof8y1/Belgium_Diptyque%20_MAR-SGP-CO1.jpg?raw=1 + if "dropbox.com" in url: + return None + return url @raise_if_empty def _get_foreign_identifier(self, data: dict) -> str | None: diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_europeana.py b/catalog/tests/dags/providers/provider_api_scripts/test_europeana.py index 31e0d422abf..a87aab5d7c4 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_europeana.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_europeana.py @@ -9,6 +9,7 @@ from common.loader import provider_details as prov from common.storage.image import ImageStore from providers.provider_api_scripts.europeana import ( + EmptyRequiredFieldException, EuropeanaDataIngester, EuropeanaRecordBuilder, ) @@ -254,6 +255,19 @@ def test_get_foreign_landing_url_without_edmIsShownAt(record_builder): ) +@pytest.mark.parametrize( + "data", + [ + {}, + {"edmIsShownBy": None}, + {"edmIsShownBy": ["dropbox.com/value"]}, + ], +) +def test_get_image_url_empty(data, record_builder): + with pytest.raises(EmptyRequiredFieldException): + assert record_builder._get_image_url(data) + + @pytest.mark.parametrize( "item_data, expected", [