Skip to content

Commit

Permalink
Skip suppressed OGM records during harvest
Browse files Browse the repository at this point in the history
Why these changes are being introduced:

It was pointed out that OGM records, both GBL1 and Aardvark,
were being returned by the GeoHarvester even when the source
record marks itself as suppressed (e.g. `suppressed_b` or `gbl_suppressed_b` set to true).

How this addresses that need:
* Adds a SourceRecord.is_suppressed property that is overridden
by source-specific classes
* This property is checked in the OGMHarvester loop over source records;
records that are suppressed are skipped and are therefore not
returned by the harvester

Side effects of this change:
* Suppressed OGM records will not get returned by GeoHarvester

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/GDT-329
  • Loading branch information
ghukill committed May 24, 2024
1 parent e6b7e62 commit 9062ad8
Show file tree
Hide file tree
Showing 8 changed files with 34 additions and 13 deletions.
21 changes: 14 additions & 7 deletions harvester/harvest/ogm.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ def _get_source_records(
used to return the files in scope. The appropriate args are passed to this
method once identified.
If the OGM source record self-identifies as suppressed, it will be skipped here.
Args:
retrieve_records_func: one of two possible methods from OGMRepository
- get_current_records()
Expand All @@ -105,15 +107,20 @@ def _get_source_records(
ogm_records_iterator = repo.filter_records(retrieve_records_func(repo, *args))

for ogm_record in ogm_records_iterator:
source_record = self.create_source_record(
repo.metadata_format,
ogm_record.identifier,
ogm_record.harvest_event,
ogm_record.read(),
repo_config,
)

if source_record.is_suppressed:
continue

yield Record(
identifier=ogm_record.identifier,
source_record=self.create_source_record(
repo.metadata_format,
ogm_record.identifier,
ogm_record.harvest_event,
ogm_record.read(),
repo_config,
),
source_record=source_record,
)

if self.remove_local_repos:
Expand Down
4 changes: 4 additions & 0 deletions harvester/records/formats/aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ class Aardvark(JSONSourceRecord):

metadata_format: Literal["aardvark"] = field(default="aardvark")

# Suppression flag for an Aardvark source record.
# Returns whatever value is stored under "gbl_suppressed_b" in the parsed
# record; presumably parsed_data is a dict-like mapping, in which case a
# missing key yields None -- TODO confirm against JSONSourceRecord.
@property
def is_suppressed(self) -> bool | None:
return self.parsed_data.get("gbl_suppressed_b")

##########################
# Required Field Methods
##########################
Expand Down
4 changes: 4 additions & 0 deletions harvester/records/formats/gbl1.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ class GBL1(JSONSourceRecord):

metadata_format: Literal["gbl1"] = field(default="gbl1")

# Suppression flag for a GBL1 source record.
# Returns whatever value is stored under "suppressed_b" in the parsed
# record; presumably parsed_data is a dict-like mapping, in which case a
# missing key yields None -- TODO confirm against JSONSourceRecord.
@property
def is_suppressed(self) -> bool | None:
return self.parsed_data.get("suppressed_b")

def _convert_scalar_to_array(self, field_name: str) -> list[str]:
"""Convert a single, scalar GBL1 value to Aardvark array."""
if value := self.parsed_data.get(field_name):
Expand Down
5 changes: 5 additions & 0 deletions harvester/records/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,11 @@ def is_deleted(self) -> bool:
return True
return False

# Base-class default: a source record is reported as not suppressed.
# Format-specific record classes can override this property to read a
# suppression flag from their own metadata.
@property
def is_suppressed(self) -> bool | None:
"""Property to indicate if source record self-identified as suppressed."""
return False

def get_controlled_dct_format_s_term(self, value: str | None) -> str | None:
"""Get a single controlled term for dct_format_s from original value.
Expand Down
3 changes: 1 addition & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,10 +586,9 @@ def ogm_incremental_harvester():

@pytest.fixture
def ogm_full_record_set():
"""Full set of identifiers after suppressed and invalid records skipped."""
return {
"edu.earth:5f5ac295b365",
"edu.earth:3072f18cdeb5",
"edu.venus:996864ca615e",
"edu.venus:7fe1e637995f",
"edu.pluto:83509b6d7e03",
"edu.pluto:83fd37f6a879",
Expand Down
3 changes: 2 additions & 1 deletion tests/fixtures/ogm/files/edu.earth/record2.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,6 @@
"layer_modified_dt": "2012-11-28T23:35:27.343Z",
"layer_slug_s": "ark28722-s70026",
"solr_geom": "ENVELOPE(-122.653308, -122.18269, 38.674384, 38.148358)",
"solr_year_i": 2010
"solr_year_i": 2010,
"suppressed_b": true
}
3 changes: 2 additions & 1 deletion tests/fixtures/ogm/files/edu.venus/record2.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,5 +54,6 @@
"39015091195514_01"
],
"gbl_mdModified_dt": "2023-02-03T21:22:59Z",
"gbl_mdVersion_s": "Aardvark"
"gbl_mdVersion_s": "Aardvark",
"gbl_suppressed_b": true
}
4 changes: 2 additions & 2 deletions tests/test_harvest/test_ogm_harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def test_ogm_record_read_from_git_history(ogm_record_from_git_history):

def test_ogm_harvester_get_full_source_records(ogm_full_harvester, ogm_full_record_set):
records = list(ogm_full_harvester.get_source_records())
assert len(records) == 6
assert len(records) == 4
assert {record.identifier for record in records} == ogm_full_record_set


Expand All @@ -218,7 +218,7 @@ def test_ogm_harvester_get_incremental_source_records_early_date(
):
ogm_incremental_harvester.from_date = "1995-01-01"
records = list(ogm_incremental_harvester.get_source_records())
assert len(records) == 6
assert len(records) == 4
assert {record.identifier for record in records} == ogm_full_record_set


Expand Down

0 comments on commit 9062ad8

Please sign in to comment.