From 9062ad8af9b72dac3709196d3651ca8534e59cf7 Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Fri, 24 May 2024 09:55:22 -0400 Subject: [PATCH] Skip suppressed OGM records during harvest Why these changes are being introduced: It was pointed out that OGM records, both GBL1 and Aardvark, were getting returned by the GeoHarvester even when the source record has a version of suppressed=true. How this addresses that need: * Adds SourceRecord.is_suppressed property that is extended by source specific classes * This is checked during OGMHarvester loop for source records and those that are suppressed are skipped, thereby not getting returned by harvester Side effects of this change: * Suppressed OGM records will not get returned by GeoHarvester Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/GDT-329 --- harvester/harvest/ogm.py | 21 ++++++++++++------- harvester/records/formats/aardvark.py | 4 ++++ harvester/records/formats/gbl1.py | 4 ++++ harvester/records/record.py | 5 +++++ tests/conftest.py | 3 +-- .../fixtures/ogm/files/edu.earth/record2.json | 3 ++- .../fixtures/ogm/files/edu.venus/record2.json | 3 ++- tests/test_harvest/test_ogm_harvester.py | 4 ++-- 8 files changed, 34 insertions(+), 13 deletions(-) diff --git a/harvester/harvest/ogm.py b/harvester/harvest/ogm.py index 3831495..a179235 100644 --- a/harvester/harvest/ogm.py +++ b/harvester/harvest/ogm.py @@ -90,6 +90,8 @@ def _get_source_records( used to return the files in scope. The appropriate args are passed to this method once identified. + If the OGM source record self-identifies as suppressed, it will be skipped here. + Args: retrieve_records_func: one of two possible methods from OGMRepository - get_current_records() @@ -105,15 +107,20 @@ def _get_source_records( ogm_records_iterator = repo.filter_records(retrieve_records_func(repo, *args)) for ogm_record in ogm_records_iterator: + source_record = self.create_source_record( + repo.metadata_format, + ogm_record.identifier, + ogm_record.harvest_event, + ogm_record.read(), + repo_config, + ) + + if source_record.is_suppressed: + continue + yield Record( identifier=ogm_record.identifier, - source_record=self.create_source_record( - repo.metadata_format, - ogm_record.identifier, - ogm_record.harvest_event, - ogm_record.read(), - repo_config, - ), + source_record=source_record, ) if self.remove_local_repos: diff --git a/harvester/records/formats/aardvark.py b/harvester/records/formats/aardvark.py index 6b3d4ea..fa55892 100644 --- a/harvester/records/formats/aardvark.py +++ b/harvester/records/formats/aardvark.py @@ -16,6 +16,10 @@ class Aardvark(JSONSourceRecord): metadata_format: Literal["aardvark"] = field(default="aardvark") + @property + def is_suppressed(self) -> bool | None: + return self.parsed_data.get("gbl_suppressed_b") + ########################## # Required Field Methods ########################## diff --git a/harvester/records/formats/gbl1.py b/harvester/records/formats/gbl1.py index 1df6e4c..4001320 100644 --- a/harvester/records/formats/gbl1.py +++ b/harvester/records/formats/gbl1.py @@ -16,6 +16,10 @@ class GBL1(JSONSourceRecord): metadata_format: Literal["gbl1"] = field(default="gbl1") + @property + def is_suppressed(self) -> bool | None: + return self.parsed_data.get("suppressed_b") + def _convert_scalar_to_array(self, field_name: str) -> list[str]: """Convert a single, scalar GBL1 value to Aardvark array.""" if value := self.parsed_data.get(field_name): diff --git a/harvester/records/record.py b/harvester/records/record.py index e908796..00cad03 100644 --- a/harvester/records/record.py +++ b/harvester/records/record.py @@ -219,6 +219,11 @@ def is_deleted(self) -> bool: return True return False + @property + def is_suppressed(self) -> bool | None: + """Property to indicate if source record self-identified as suppressed.""" + return False + def get_controlled_dct_format_s_term(self, value: str | None) -> str | None: """Get a single controlled term for dct_format_s from original value. diff --git a/tests/conftest.py b/tests/conftest.py index 507f611..4b749cd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -586,10 +586,9 @@ def ogm_incremental_harvester(): @pytest.fixture def ogm_full_record_set(): + """Full set of identifiers after suppressed and invalid records skipped.""" return { - "edu.earth:5f5ac295b365", "edu.earth:3072f18cdeb5", - "edu.venus:996864ca615e", "edu.venus:7fe1e637995f", "edu.pluto:83509b6d7e03", "edu.pluto:83fd37f6a879", diff --git a/tests/fixtures/ogm/files/edu.earth/record2.json b/tests/fixtures/ogm/files/edu.earth/record2.json index 93686c4..75d90df 100644 --- a/tests/fixtures/ogm/files/edu.earth/record2.json +++ b/tests/fixtures/ogm/files/edu.earth/record2.json @@ -28,5 +28,6 @@ "layer_modified_dt": "2012-11-28T23:35:27.343Z", "layer_slug_s": "ark28722-s70026", "solr_geom": "ENVELOPE(-122.653308, -122.18269, 38.674384, 38.148358)", - "solr_year_i": 2010 + "solr_year_i": 2010, + "suppressed_b": true } \ No newline at end of file diff --git a/tests/fixtures/ogm/files/edu.venus/record2.json b/tests/fixtures/ogm/files/edu.venus/record2.json index 64bf52d..c7d308d 100644 --- a/tests/fixtures/ogm/files/edu.venus/record2.json +++ b/tests/fixtures/ogm/files/edu.venus/record2.json @@ -54,5 +54,6 @@ "39015091195514_01" ], "gbl_mdModified_dt": "2023-02-03T21:22:59Z", - "gbl_mdVersion_s": "Aardvark" + "gbl_mdVersion_s": "Aardvark", + "gbl_suppressed_b": true } \ No newline at end of file diff --git a/tests/test_harvest/test_ogm_harvester.py b/tests/test_harvest/test_ogm_harvester.py index 5dc3e0f..dd0d54d 100644 --- a/tests/test_harvest/test_ogm_harvester.py +++ b/tests/test_harvest/test_ogm_harvester.py @@ -209,7 +209,7 @@ def test_ogm_record_read_from_git_history(ogm_record_from_git_history): def test_ogm_harvester_get_full_source_records(ogm_full_harvester, ogm_full_record_set): records = list(ogm_full_harvester.get_source_records()) - assert len(records) == 6 + assert len(records) == 4 assert {record.identifier for record in records} == ogm_full_record_set @@ -218,7 +218,7 @@ def test_ogm_harvester_get_incremental_source_records_early_date( ): ogm_incremental_harvester.from_date = "1995-01-01" records = list(ogm_incremental_harvester.get_source_records()) - assert len(records) == 6 + assert len(records) == 4 assert {record.identifier for record in records} == ogm_full_record_set