From 8374cf9c12120af5f03d2f838164b9f751d6e286 Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 24 Oct 2024 11:21:22 +0200 Subject: [PATCH 01/32] Outfactor generic CERL part of SBTI API --- edpop_explorer/__init__.py | 3 +- edpop_explorer/cerl.py | 107 +++++++++++++++++++++++++++ edpop_explorer/readers/sbtireader.py | 91 ++--------------------- 3 files changed, 116 insertions(+), 85 deletions(-) create mode 100644 edpop_explorer/cerl.py diff --git a/edpop_explorer/__init__.py b/edpop_explorer/__init__.py index 5a23a45..ced924c 100644 --- a/edpop_explorer/__init__.py +++ b/edpop_explorer/__init__.py @@ -6,7 +6,7 @@ 'BasePreparedQuery', 'PreparedQueryType', 'Record', 'RawData', 'RecordError', 'BibliographicalRecord', 'BiographicalRecord', 'LazyRecordMixin', - 'SRUReader', + 'SRUReader', 'CERLReader', 'Marc21Data', 'Marc21Field', 'Marc21BibliographicalRecord', 'Marc21DataMixin', 'SRUMarc21Reader', 'SRUMarc21BibliographicalReader', 'BIBLIOGRAPHICAL', 'BIOGRAPHICAL' @@ -32,4 +32,5 @@ Marc21Data, Marc21Field, Marc21BibliographicalRecord, Marc21DataMixin, SRUMarc21Reader, SRUMarc21BibliographicalReader ) +from .cerl import CERLReader diff --git a/edpop_explorer/cerl.py b/edpop_explorer/cerl.py new file mode 100644 index 0000000..600462d --- /dev/null +++ b/edpop_explorer/cerl.py @@ -0,0 +1,107 @@ +from abc import abstractmethod + +from rdflib import URIRef +import requests +from typing import List, Dict, Optional + +from edpop_explorer import ( + Reader, Record, ReaderError, BiographicalRecord, Field, BIOGRAPHICAL +) + + +class CERLReader(Reader): + """A generic reader class for the CERL databases on the ``data.cerl.org`` + platform. + + This is an abstract class -- to use, derive from this class, set the + ``API_URL``, ``API_BY_ID_BASE_URL`` and ``LINK_BASE_URL`` constant + attributes, and implement the ``_convert_record`` class method.""" + API_URL: str + """The base URL of the search API, of the form ``https://data.cerl.org//_search``.""" + API_BY_ID_BASE_URL: str + """The base URL of the API for retrieving single records, of the form ``https://data.cerl.org//``.""" + LINK_BASE_URL: str + """The base URL for userfriendly representations of single records.""" + additional_params: Optional[Dict[str, str]] = None + DEFAULT_RECORDS_PER_PAGE = 10 + + @classmethod + def get_by_id(cls, identifier: str) -> Record: + try: + response = requests.get( + cls.API_BY_ID_BASE_URL + identifier, + headers={ + 'Accept': 'application/json' + }, + ).json() + except requests.exceptions.JSONDecodeError: + raise ReaderError(f"Item with id {identifier} does not exist.") + except requests.exceptions.RequestException as err: + raise ReaderError(f"Error during server request: {err}") + return cls._convert_record(response) + + + @classmethod + @abstractmethod + def _convert_record(cls, rawrecord: dict) -> Record: + pass + + def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]: + assert isinstance(self.prepared_query, str) + if maximum_records is None: + maximum_records = self.DEFAULT_RECORDS_PER_PAGE + print(f'The query is: {self.prepared_query}') + try: + response = requests.get( + self.API_URL, + params={ + 'query': self.prepared_query, + 'from': start_record, + 'size': maximum_records, + 'mode': 'default', + 'sort': 'default' + }, + headers={ + 'Accept': 'application/json' + } + ).json() + except ( + requests.exceptions.RequestException + ) as err: + raise ReaderError('Error during server request: ' + str(err)) + + # TODO: check for error responses + try: + if response['hits'] is None: + self.number_of_results = 0 + else: + self.number_of_results = response['hits']['value'] + except KeyError: + raise ReaderError('Number of hits not given in server response') + + if 'rows' not in response: + # There are no rows in the response, so stop here + return [] + + records: List[Record] = [] + for rawrecord in response['rows']: + record = self._convert_record(rawrecord) + records.append(record) + + return records + + @classmethod + def transform_query(cls, query) -> str: + # No transformation needed + return query + + def fetch_range(self, range_to_fetch: range) -> range: + if self.prepared_query is None: + raise ReaderError('First call prepare_query') + start_record = range_to_fetch.start + number_to_fetch = range_to_fetch.stop - start_record + results = self._perform_query(start_record, number_to_fetch) + for i, result in enumerate(results): + self.records[i] = result + return range(start_record, start_record + len(results)) + diff --git a/edpop_explorer/readers/sbtireader.py b/edpop_explorer/readers/sbtireader.py index 148a5bf..14f77d7 100644 --- a/edpop_explorer/readers/sbtireader.py +++ b/edpop_explorer/readers/sbtireader.py @@ -1,17 +1,16 @@ from rdflib import URIRef -import requests from typing import List, Dict, Optional from edpop_explorer import ( - Reader, Record, ReaderError, BiographicalRecord, Field, BIOGRAPHICAL + BiographicalRecord, Field, BIOGRAPHICAL ) +from edpop_explorer.cerl import CERLReader -class SBTIReader(Reader): - api_url = 'https://data.cerl.org/sbti/_search' - api_by_id_base_url = 'https://data.cerl.org/sbti/' - link_base_url = 'https://data.cerl.org/sbti/' - fetching_exhausted: bool = False +class SBTIReader(CERLReader): + API_URL = 'https://data.cerl.org/sbti/_search' + API_BY_ID_BASE_URL = 'https://data.cerl.org/sbti/' + LINK_BASE_URL = 'https://data.cerl.org/sbti/' additional_params: Optional[Dict[str, str]] = None CATALOG_URIREF = URIRef( 'https://edpop.hum.uu.nl/readers/sbti' @@ -34,21 +33,6 @@ def _get_name_field(cls, data: dict) -> Optional[Field]: field = Field(f"{name}") return field - @classmethod - def get_by_id(cls, identifier: str) -> BiographicalRecord: - try: - response = requests.get( - cls.api_by_id_base_url + identifier, - headers={ - 'Accept': 'application/json' - }, - ).json() - except requests.exceptions.JSONDecodeError: - raise ReaderError(f"Item with id {identifier} does not exist.") - except requests.exceptions.RequestException as err: - raise ReaderError(f"Error during server request: {err}") - return cls._convert_record(response) - @classmethod def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: @@ -58,7 +42,7 @@ def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: if not record.identifier: record.identifier = rawrecord.get('_id', None) if record.identifier: - record.link = cls.link_base_url + record.identifier + record.link = cls.LINK_BASE_URL + record.identifier # Add fields heading = rawrecord.get("heading", None) @@ -83,64 +67,3 @@ def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: return record - def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]: - assert isinstance(self.prepared_query, str) - if maximum_records is None: - maximum_records = self.DEFAULT_RECORDS_PER_PAGE - print(f'The query is: {self.prepared_query}') - try: - response = requests.get( - self.api_url, - params={ - 'query': self.prepared_query, - 'from': start_record, - 'size': maximum_records, - 'mode': 'default', - 'sort': 'default' - }, - headers={ - 'Accept': 'application/json' - } - ).json() - except ( - requests.exceptions.RequestException - ) as err: - raise ReaderError('Error during server request: ' + str(err)) - - # TODO: check for error responses - try: - if response['hits'] is None: - self.number_of_results = 0 - else: - self.number_of_results = response['hits']['value'] - except KeyError: - raise ReaderError('Number of hits not given in server response') - - if 'rows' not in response: - # There are no rows in the response, so stop here - return [] - - records: List[Record] = [] - for rawrecord in response['rows']: - record = self._convert_record(rawrecord) - records.append(record) - - return records - - @classmethod - def transform_query(cls, query) -> str: - # No transformation needed - return query - - def fetch_range(self, range_to_fetch: range) -> range: - if self.prepared_query is None: - raise ReaderError('First call prepare_query') - if self.fetching_exhausted: - return range(0) - start_record = range_to_fetch.start - number_to_fetch = range_to_fetch.stop - start_record - results = self._perform_query(start_record, number_to_fetch) - for i, result in enumerate(results): - self.records[i] = result - return range(start_record, start_record + len(results)) - From e386722282bf6eea58ebbad770ce1ad5e20c1826 Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 24 Oct 2024 16:33:17 +0200 Subject: [PATCH 02/32] Fix bug: previous records were being overwritten --- edpop_explorer/cerl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edpop_explorer/cerl.py b/edpop_explorer/cerl.py index 600462d..61dbf01 100644 --- a/edpop_explorer/cerl.py +++ b/edpop_explorer/cerl.py @@ -102,6 +102,6 @@ def fetch_range(self, range_to_fetch: range) -> range: number_to_fetch = range_to_fetch.stop - start_record results = self._perform_query(start_record, number_to_fetch) for i, result in enumerate(results): - self.records[i] = result + self.records[i + range_to_fetch.start] = result return range(start_record, start_record + len(results)) From 6488e560817042514898c74aa9827ccfaf26994a Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 24 Oct 2024 16:59:13 +0200 Subject: [PATCH 03/32] Add fields that belong to STCN --- edpop_explorer/fields.py | 47 +++- edpop_explorer/normalization/relators.py | 307 +++++++++++++++++++++++ edpop_explorer/normalizers.py | 1 - edpop_explorer/record.py | 10 + tests/test_field.py | 31 --- 5 files changed, 360 insertions(+), 36 deletions(-) create mode 100644 edpop_explorer/normalization/relators.py diff --git a/edpop_explorer/fields.py b/edpop_explorer/fields.py index cd80892..cfd5336 100644 --- a/edpop_explorer/fields.py +++ b/edpop_explorer/fields.py @@ -4,11 +4,14 @@ from typing import Optional, Callable, List, Tuple +from iso639 import Lang +from iso639.exceptions import InvalidLanguageValue from rdflib import Graph, Literal, BNode, RDF, URIRef from rdflib.term import Node from edpop_explorer import EDPOPREC, normalizers from edpop_explorer.normalizers import NormalizationResult +from edpop_explorer.normalization import relators DATATYPES = { 'string': { @@ -70,7 +73,6 @@ class Field: #: by default. subject_node: Node _subfields: List[Tuple[str, URIRef, str]] - normalized_text: Optional[str] = None #: Subfield -- indicates whether the value of this field is explicitly #: marked as unknown in the original record. unknown: Optional[bool] = None @@ -88,7 +90,7 @@ def __init__(self, original_text: str) -> None: self.original_text = original_text self._subfields = [ ('original_text', EDPOPREC.originalText, 'string'), - ('normalized_text', EDPOPREC.normalizedText, 'string'), + ('summary_text', EDPOPREC.summaryText, 'string'), ('unknown', EDPOPREC.unknown, 'boolean'), ('authority_record', EDPOPREC.authorityRecord, 'string'), ] @@ -140,9 +142,13 @@ def to_graph(self) -> Graph: )) return graph + @property + def summary_text(self) -> Optional[str]: + return None + def __str__(self) -> str: - if self.normalized_text is not None: - return self.normalized_text + if self.summary_text is not None: + return self.summary_text else: return self.original_text @@ -171,3 +177,36 @@ def __init__(self, original_text: str) -> None: ('language_code', EDPOPREC.languageCode, 'string') ) + @property + def summary_text(self) -> Optional[str]: + try: + language = Lang(self.language_code) + return language.name + except InvalidLanguageValue: + return None + + +class ContributorField(Field): + _rdf_class = EDPOPREC.ContributorField + role: Optional[str] = None + name: Optional[str] = None + + def __init__(self, original_text: str) -> None: + super().__init__(original_text) + self._subfields.extend(( + ('name', EDPOPREC.name, 'string'), + ('role', EDPOPREC.role, 'string'), + )) + + @property + def summary_text(self) -> Optional[str]: + role = relators.relator_dict.get(self.role, self.role) + name = self.name if self.name is not None else self.original_text + if role is not None: + return f"{name} ({role})" + else: + return name + + + + diff --git a/edpop_explorer/normalization/relators.py b/edpop_explorer/normalization/relators.py new file mode 100644 index 0000000..da6cd33 --- /dev/null +++ b/edpop_explorer/normalization/relators.py @@ -0,0 +1,307 @@ +# Relator dictionary taken from https://www.loc.gov/marc/relators/relacode.html + +relator_dict = { + "abr": "abridger", + "acp": "art copyist", + "act": "actor", + "adi": "art director", + "adp": "adapter", + "aft": "author of afterword, colophon, etc.", + "anc": "announcer", + "anl": "analyst", + "anm": "animator", + "ann": "annotator", + "ant": "bibliographic antecedent", + "ape": "appellee", + "apl": "appellant", + "app": "applicant", + "aqt": "author in quotations or text abstracts", + "arc": "architect", + "ard": "artistic director", + "arr": "arranger", + "art": "artist", + "asg": "assignee", + "asn": "associated name", + "ato": "autographer", + "att": "attributed name", + "auc": "auctioneer", + "aud": "author of dialog", + "aue": "audio engineer", + "aui": "author of introduction, etc.", + "aup": "audio producer", + "aus": "screenwriter", + "aut": "author", + "bdd": "binding designer", + "bjd": "bookjacket designer", + "bka": "book artist", + "bkd": "book designer", + "bkp": "book producer", + "blw": "blurb writer", + "bnd": "binder", + "bpd": "bookplate designer", + "brd": "broadcaster", + "brl": "braille embosser", + "bsl": "bookseller", + "cad": "casting director", + "cas": "caster", + "ccp": "conceptor", + "chrc": "choreographer", + "-clb": "collaborator", + "cli": "client", + "cll": "calligrapher", + "clr": "colorist", + "clt": "collotyper", + "cmm": "commentator", + "cmp": "composer", + "cmt": "compositor", + "cnd": "conductor", + "cng": "cinematographer", + "cns": "censor", + "coe": "contestant-appellee", + "col": "collector", + "com": "compiler", + "con": "conservator", + "cop": "camera operator", + "cor": "collection registrar", + "cos": "contestant", + "cot": "contestant-appellant", + "cou": "court governed", + "cov": "cover designer", + "cpc": "copyright claimant", + "cpe": "complainant-appellee", + "cph": "copyright holder", + "cpl": "complainant", + "cpt": "complainant-appellant", + "cre": "creator", + "crp": "correspondent", + "crr": "corrector", + "crt": "court reporter", + "csl": "consultant", + "csp": "consultant to a project", + "cst": "costume designer", + "ctb": "contributor", + "cte": "contestee-appellee", + "ctg": "cartographer", + "ctr": "contractor", + "cts": "contestee", + "ctt": "contestee-appellant", + "cur": "curator", + "cwt": "commentator for written text", + "dbd": "dubbing director", + "dbp": "distribution place", + "dfd": "defendant", + "dfe": "defendant-appellee", + "dft": "defendant-appellant", + "dgc": "degree committee member", + "dgg": "degree granting institution", + "dgs": "degree supervisor", + "dis": "dissertant", + "djo": "dj", + "dln": "delineator", + "dnc": "dancer", + "dnr": "donor", + "dpc": "depicted", + "dpt": "depositor", + "drm": "draftsman", + "drt": "director", + "dsr": "designer", + "dst": "distributor", + "dtc": "data contributor", + "dte": "dedicatee", + "dtm": "data manager", + "dto": "dedicator", + "dub": "dubious author", + "edc": "editor of compilation", + "edd": "editorial director", + "edm": "editor of moving image work", + "edt": "editor", + "egr": "engraver", + "elg": "electrician", + "elt": "electrotyper", + "eng": "engineer", + "enj": "enacting jurisdiction", + "etr": "etcher", + "evp": "event place", + "exp": "expert", + "fac": "facsimilist", + "fds": "film distributor", + "fld": "field director", + "flm": "film editor", + "fmd": "film director", + "fmk": "filmmaker", + "fmo": "former owner", + "fmp": "film producer", + "fnd": "funder", + "fon": "founder", + "fpy": "first party", + "frg": "forger", + "gdv": "game developer", + "gis": "geographic information specialist", + "-grt": "graphic technician", + "his": "host institution", + "hnr": "honoree", + "hst": "host", + "ill": "illustrator", + "ilu": "illuminator", + "ins": "inscriber", + "inv": "inventor", + "isb": "issuing body", + "itr": "instrumentalist", + "ive": "interviewee", + "ivr": "interviewer", + "jud": "judge", + "jug": "jurisdiction governed", + "lbr": "laboratory", + "lbt": "librettist", + "ldr": "laboratory director", + "led": "lead", + "lee": "libelee-appellee", + "lel": "libelee", + "len": "lender", + "let": "libelee-appellant", + "lgd": "lighting designer", + "lie": "libelant-appellee", + "lil": "libelant", + "lit": "libelant-appellant", + "lsa": "landscape architect", + "lse": "licensee", + "lso": "licensor", + "ltg": "lithographer", + "ltr": "letterer", + "lyr": "lyricist", + "mcp": "music copyist", + "mdc": "metadata contact", + "med": "medium", + "mfp": "manufacture place", + "mfr": "manufacturer", + "mka": "makeup artist", + "mod": "moderator", + "mon": "monitor", + "mrb": "marbler", + "mrk": "markup editor", + "msd": "musical director", + "mte": "metal-engraver", + "mtk": "minute taker", + "mup": "music programmer", + "mus": "musician", + "mxe": "mixing engineer", + "nan": "news anchor", + "nrt": "narrator", + "onp": "onscreen participant", + "opn": "opponent", + "org": "originator", + "orm": "organizer", + "osp": "onscreen presenter", + "oth": "other", + "own": "owner", + "pad": "place of address", + "pan": "panelist", + "pat": "patron", + "pbd": "publishing director", + "pbl": "publisher", + "pdr": "project director", + "pfr": "proofreader", + "pht": "photographer", + "plt": "platemaker", + "pma": "permitting agency", + "pmn": "production manager", + "pop": "printer of plates", + "ppm": "papermaker", + "ppt": "puppeteer", + "pra": "praeses", + "prc": "process contact", + "prd": "production personnel", + "pre": "presenter", + "prf": "performer", + "prg": "programmer", + "prm": "printmaker", + "prn": "production company", + "pro": "producer", + "prp": "production place", + "prs": "production designer", + "prt": "printer", + "prv": "provider", + "pta": "patent applicant", + "pte": "plaintiff-appellee", + "ptf": "plaintiff", + "pth": "patent holder", + "ptt": "plaintiff-appellant", + "pup": "publication place", + "rap": "rapporteur", + "rbr": "rubricator", + "rcd": "recordist", + "rce": "recording engineer", + "rcp": "addressee", + "rdd": "radio director", + "red": "redaktor", + "ren": "renderer", + "res": "researcher", + "rev": "reviewer", + "rpc": "radio producer", + "rps": "repository", + "rpt": "reporter", + "rpy": "responsible party", + "rse": "respondent-appellee", + "rsg": "restager", + "rsp": "respondent", + "rsr": "restorationist", + "rst": "respondent-appellant", + "rth": "research team head", + "rtm": "research team member", + "rxa": "remix artist", + "sad": "scientific advisor", + "sce": "scenarist", + "scl": "sculptor", + "scr": "scribe", + "sde": "sound engineer", + "sds": "sound designer", + "sec": "secretary", + "sfx": "special effects provider", + "sgd": "stage director", + "sgn": "signer", + "sht": "spporting host", + "sll": "seller", + "sng": "singer", + "spk": "speaker", + "spn": "sponsor", + "spy": "second party", + "srv": "surveyor", + "std": "set designer", + "stg": "setting", + "stl": "storyteller", + "stm": "stage manager", + "stn": "standards body", + "str": "stereotyper", + "swd": "software developer", + "tad": "technical advisor", + "tau": "television writer", + "tcd": "technical director", + "tch": "teacher", + "ths": "thesis advisor", + "tld": "television director", + "tlg": "television guest", + "tlh": "television host", + "tlp": "television producer", + "trc": "transcriber", + "trl": "translator", + "tyd": "type designer", + "tyg": "typographer", + "uvp": "university place", + "vac": "voice actor", + "vdg": "videographer", + "vfx": "visual effects provider", + "voc": "vocalist", + "wac": "writer of added commentary", + "wal": "writer of added lyrics", + "wam": "writer of accompanying material", + "wat": "writer of added text", + "wdc": "woodcutter", + "wde": "wood engraver", + "wfs": "writer of film story", + "wft": "writer of intertitles", + "win": "writer of introduction", + "wit": "witness", + "wpr": "writer of preface", + "wst": "writer of supplementary textual content", + "wts": "writer of television story" +} diff --git a/edpop_explorer/normalizers.py b/edpop_explorer/normalizers.py index 8cdf415..1e89a8d 100644 --- a/edpop_explorer/normalizers.py +++ b/edpop_explorer/normalizers.py @@ -18,7 +18,6 @@ def normalize_by_language_code(field) -> NormalizationResult: try: language = Lang(field.original_text) field.language_code = language.pt3 - field.normalized_text = language.name return NormalizationResult.SUCCESS except InvalidLanguageValue: return NormalizationResult.FAIL diff --git a/edpop_explorer/record.py b/edpop_explorer/record.py index 10fecb3..9d6d16b 100644 --- a/edpop_explorer/record.py +++ b/edpop_explorer/record.py @@ -215,6 +215,11 @@ class BibliographicalRecord(Record): physical_description: Optional[Field] = None bookseller: Optional[Field] = None location: Optional[Field] = None + format: Optional[Field] = None + fingerprint: Optional[Field] = None + collation_formula: Optional[Field] = None + genres: Optional[List[Field]] = None + holdings: Optional[List[Field]] = None def __init__(self, from_reader: Type["Reader"]): super().__init__(from_reader) @@ -232,6 +237,11 @@ def __init__(self, from_reader: Type["Reader"]): ('physical_description', EDPOPREC.physicalDescription, Field), ('bookseller', EDPOPREC.bookseller, Field), ('location', EDPOPREC.location, Field), + ('format', EDPOPREC.format, Field), + ('fingerprint', EDPOPREC.fingerprint, Field), + ('collation_formula', EDPOPREC.collationFormula, Field), + ('genres', EDPOPREC.genre, Field), + ('holdings', EDPOPREC.holdings, Field), ] def __str__(self) -> str: diff --git a/tests/test_field.py b/tests/test_field.py index c2a5cb5..5860274 100644 --- a/tests/test_field.py +++ b/tests/test_field.py @@ -32,14 +32,6 @@ def test_to_graph(self, basic_field: Field): EDPOPREC.originalText, Literal(basic_field.original_text) ) in graph - # Test string from property - basic_field.normalized_text = 'normalized' - graph = basic_field.to_graph() - assert ( - basic_field.subject_node, - EDPOPREC.normalizedText, - Literal(basic_field.normalized_text) - ) in graph # Test boolean basic_field.unknown = True graph = basic_field.to_graph() @@ -62,29 +54,6 @@ def test_to_graph(self, basic_field: Field): with raises(FieldError): basic_field.to_graph() - def test_normalized_text(self, basic_field: Field): - # If nothing is set, this should be None - assert basic_field.normalized_text is None - # Set normalized text by hand - text = 'normalized' - basic_field.normalized_text = text - assert basic_field.normalized_text == text - # Now test a class with automatic normalized text creation - - def complex_normalizer(field): - field.normalized_text = field.original_text.capitalize() - return NormalizationResult.SUCCESS - - class ComplexField(Field): - normalizer = complex_normalizer - title = 'title' - complex_field = ComplexField(title) - complex_field.normalize() - assert complex_field.normalized_text == title.capitalize() - # A manual normalized text should override this - complex_field.normalized_text = text - assert complex_field.normalized_text == text - class TestLocationField: def test_basic_form(self, basic_location_field: LocationField): From 3cc511b289e885f797c94509032b1cdd2362f7a2 Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 24 Oct 2024 16:59:26 +0200 Subject: [PATCH 04/32] Remove redundant expression --- edpop_explorer/edpopxshell.py | 1 - 1 file changed, 1 deletion(-) diff --git a/edpop_explorer/edpopxshell.py b/edpop_explorer/edpopxshell.py index 14656ab..2ac1c2e 100644 --- a/edpop_explorer/edpopxshell.py +++ b/edpop_explorer/edpopxshell.py @@ -95,7 +95,6 @@ def show_record(self, record: Record) -> None: )) recordtype = str(record._rdf_class).rsplit('/',1)[1] self.poutput(f'Record type: {recordtype}') - self.poutput if record.identifier: self.poutput(f'Identifier: {record.identifier}') if record.link: From 9ac0a31f6e9cdda2fa9134d3ce31b496a720b548 Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 24 Oct 2024 16:59:37 +0200 Subject: [PATCH 05/32] Formatting --- edpop_explorer/readers/sbtireader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/edpop_explorer/readers/sbtireader.py b/edpop_explorer/readers/sbtireader.py index 14f77d7..f37c692 100644 --- a/edpop_explorer/readers/sbtireader.py +++ b/edpop_explorer/readers/sbtireader.py @@ -33,7 +33,6 @@ def _get_name_field(cls, data: dict) -> Optional[Field]: field = Field(f"{name}") return field - @classmethod def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: record = BiographicalRecord(from_reader=cls) From 497618397e097f0741d3d3ea0531b41002c90519 Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 24 Oct 2024 17:00:01 +0200 Subject: [PATCH 06/32] Replace STCN reader: use different API --- edpop_explorer/readers/stcn.py | 237 ++++++++++++++++++++++----------- 1 file changed, 156 insertions(+), 81 deletions(-) diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index ef1bab2..527c775 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -2,31 +2,36 @@ from rdflib.term import Node from typing import List, Optional, Tuple -from edpop_explorer import Field, BIBLIOGRAPHICAL -from edpop_explorer.fields import LanguageField +from edpop_explorer import Field, BIBLIOGRAPHICAL, BibliographicalRecord, LocationField +from edpop_explorer.cerl import CERLReader +from edpop_explorer.fields import LanguageField, ContributorField from edpop_explorer.sparqlreader import ( SparqlReader, BibliographicalRDFRecord ) -def _get_properties_from_iri(iri: str, properties: List[Node]) -> \ - Tuple[List[Node], Graph]: - '''Get the first objects of the requested properties of a certain IRI - as strings.''' - subject_graph = Graph() - subject_graph.parse(iri) - objects: List[Node] = [] - for prop in properties: - for obj in subject_graph.objects(URIRef(iri), prop): - objects.append(obj) - return objects, subject_graph - - -class STCNReader(SparqlReader): - endpoint = 'http://data.bibliotheken.nl/sparql' - filter = '?s schema:mainEntityOfPage/schema:isPartOf ' \ - ' .' - name_predicate = '' +def _remove_markup(input_str: str) -> str: + """Remove STCN-specific markup""" + return input_str.replace('`IT`', '').replace('`LO`', '') + + +def safeget(dictionary: Optional[dict], attribute_chain: tuple, first: bool = False): + attribute = attribute_chain[0] + if dictionary is None or attribute not in dictionary: + return None + value = dictionary[attribute] + if first and isinstance(value, list): + value = value[0] + if len(attribute_chain) == 1: + return value + else: + return safeget(value, attribute_chain[1:], first) + + +class STCNReader(CERLReader): + API_URL = 'https://data.cerl.org/stcn/_search' + API_BY_ID_BASE_URL = 'https://data.cerl.org/stcn/' + LINK_BASE_URL = 'https://data.cerl.org/stcn/' CATALOG_URIREF = URIRef( 'https://edpop.hum.uu.nl/readers/stcn' ) @@ -35,70 +40,140 @@ class STCNReader(SparqlReader): SHORT_NAME = "Short-Title Catalogue Netherlands (STCN)" DESCRIPTION = "National biography of The Netherlands until 1801" - def __init__(self): - super().__init__() + @classmethod + def _get_title(cls, rawrecord: dict) -> Optional[Field]: + title = safeget(rawrecord, ("display", "title")) + if isinstance(title, str): + title = _remove_markup(title) + return Field(title) + + @classmethod + def _get_contributors(cls, rawrecord: dict) -> list[Field]: + actors = safeget(rawrecord, ("data", "agent")) + if not actors: + return [] + contributors = [] + for actor in actors: + name = actor.get("preferred", None) + if name is None: + continue + contributor = ContributorField(name) + contributor.name = name + contributor.role = safeget(actor, ('role',), first=True) + contributors.append(contributor) + return contributors @classmethod - def convert_record( - cls, graph: Graph, record: BibliographicalRDFRecord - ) -> None: - SCHEMA = Namespace('http://schema.org/') - # First get the title and languages fields, which are simple - # properties - assert record.identifier is not None - subject_node = URIRef(record.identifier) - for name in graph.objects(subject_node, SCHEMA.name): - record.title = Field(str(name)) - break - record.languages = [] - for language in graph.objects(subject_node, SCHEMA.inLanguage): - field = LanguageField(str(language)) + def _get_publisher_or_printer(cls, rawrecord: dict) -> Optional[Field]: + # TODO: support multiple publishers/printers + provision_agent = safeget(rawrecord, ("data", "provisionAgent"), first=True) + if provision_agent is None: + return None + name = provision_agent.get("preferred", None) + if name is None: + return None + field = Field(name) + return field + + @classmethod + def _get_place_of_publication(cls, rawrecord: dict) -> Optional[Field]: + place = safeget(rawrecord, ("data", "provisionAgent", "place"), first=True) + if place is None: + return None + else: + field = LocationField(place) + field.location_type = LocationField.LOCALITY + return field + + @classmethod + def _get_languages(cls, rawrecord: dict) -> list[Field]: + languages = safeget(rawrecord, ("data", "language")) + if languages is None: + return [] + fields = [] + for language in languages: + field = LanguageField(language) field.normalize() - record.languages.append(field) - # Now get the information from blank nodes - record.contributors = [] - for author in graph.objects(subject_node, SCHEMA.author): - name_field = None - for name in graph.objects(author, SCHEMA.name): - name_field = Field(str(name)) - # TODO: add role and authority record - if name_field: - record.contributors.append(name_field) - for publication in graph.objects(subject_node, SCHEMA.publication): - year_field = None - for startDate in graph.objects(publication, SCHEMA.startDate): - year_field = Field(str(startDate)) - if year_field: - record.dating = year_field - # TODO: publisher and location (not a blank node) - published_by_iri = None - for publishedBy in graph.objects(publication, SCHEMA.publishedBy): - published_by_iri = str(publishedBy) - break - if published_by_iri: - [name, location_node], pubgraph = _get_properties_from_iri( - published_by_iri, [SCHEMA.name, SCHEMA.location] - ) - record.publisher_or_printer = Field(str(name)) - address_node = None - for address in pubgraph.objects(location_node, SCHEMA.address): - address_node = address - break - if address_node: - for addressLocality in pubgraph.objects( - address_node, SCHEMA.addressLocality - ): - record.place_of_publication = Field( - str(addressLocality) - ) - break + fields.append(field) + return fields + + @classmethod + def _get_dating(cls, rawrecord: dict) -> Optional[Field]: + dating = safeget(rawrecord, ("data", "date")) + if dating is not None: + return Field(dating) + + @classmethod + def _get_extent(cls, rawrecord: dict) -> Optional[Field]: + sheets = safeget(rawrecord, ("data", "extent", "sheets")) + if sheets is None: + return None + extent = f"{sheets} sheets" + return Field(extent) + + @classmethod + def _get_format(cls, rawrecord: dict) -> Optional[Field]: + format = safeget(rawrecord, ("data", "format", "format")) + if format is None: + return None + return Field(format) + + @classmethod + def _get_collation_formula(cls, rawrecord: dict) -> Optional[Field]: + collations = safeget(rawrecord, ("data", "extent", "collation")) + if not collations: + return None + # Multiple collation formulas are possible, but this seems to be rare. + collation_string = ' ; '.join([x.get("value") for x in collations if "value" in x]) + return Field(collation_string) + + @classmethod + def _get_fingerprint(cls, rawrecord: dict) -> Optional[Field]: + fingerprints = safeget(rawrecord, ("data", "fingerprint")) + if not fingerprints: + return None + # Multiple fingerprints are possible, but this seems to be rare + fingerprint_string = ' ; '.join([x.get("fingerprint") for x in fingerprints if "fingerprint" in x]) + return Field(fingerprint_string) + + @classmethod + def _get_genres(cls, rawrecord: dict) -> list[Field]: + subjecttopics = safeget(rawrecord, ("data", "subjectTopic")) + if subjecttopics is None: + return [] + fields = [Field(x["preferred"]) for x in subjecttopics if "preferred" in x] + return fields + + @classmethod + def _get_holdings(cls, rawrecord: dict) -> list[Field]: + holdings = safeget(rawrecord, ("data", "holdings")) + if holdings is None: + return [] + fields = [] + for holding in holdings: + institution = safeget(holding, ("data", "institutionName")) + shelfmark = safeget(holding, ("data", "shelfmark")) + summary = f"{institution} - {shelfmark}" + fields.append(Field(summary)) + return fields @classmethod - def _create_lazy_record( - cls, iri: str, name: Optional[str]=None - ) -> BibliographicalRDFRecord: - record = BibliographicalRDFRecord(cls) - record.identifier = iri - record.link = iri - record.title = Field(name) if name else None + def _convert_record(cls, rawrecord: dict) -> BibliographicalRecord: + record = BibliographicalRecord(from_reader=cls) + record.data = rawrecord + record.identifier = rawrecord.get('id', None) + if record.identifier: + record.link = cls.LINK_BASE_URL + record.identifier + record.title = cls._get_title(rawrecord) + record.contributors = cls._get_contributors(rawrecord) + record.publisher_or_printer = cls._get_publisher_or_printer(rawrecord) + record.place_of_publication = cls._get_place_of_publication(rawrecord) + record.dating = cls._get_dating(rawrecord) + record.languages = cls._get_languages(rawrecord) + record.extent = cls._get_extent(rawrecord) + record.format = cls._get_format(rawrecord) + record.collation_formula = cls._get_collation_formula(rawrecord) + record.fingerprint = cls._get_fingerprint(rawrecord) + record.genres = cls._get_genres(rawrecord) + record.holdings = cls._get_holdings(rawrecord) return record From 4a8a7dcaa03abe2cac7f0637df817d6eec73f773 Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 24 Oct 2024 17:04:20 +0200 Subject: [PATCH 07/32] Black formatting --- docs/conf.py | 19 ++- edpop_explorer/__init__.py | 66 +++++++--- edpop_explorer/__main__.py | 5 +- edpop_explorer/cerl.py | 52 ++++---- edpop_explorer/edpopxshell.py | 132 ++++++++++--------- edpop_explorer/fields.py | 91 ++++++-------- edpop_explorer/normalization/relators.py | 2 +- edpop_explorer/normalizers.py | 6 +- edpop_explorer/rdf.py | 5 +- edpop_explorer/reader.py | 66 +++++----- edpop_explorer/readers/__init__.py | 4 +- edpop_explorer/readers/bibliopolis.py | 14 +-- edpop_explorer/readers/bnf.py | 28 ++--- edpop_explorer/readers/cerl_thesaurus.py | 46 ++++--- edpop_explorer/readers/dutch_almanacs.py | 54 ++++---- edpop_explorer/readers/fbtee.py | 71 +++++------ edpop_explorer/readers/gallica.py | 57 ++++----- edpop_explorer/readers/hpb.py | 24 ++-- edpop_explorer/readers/kb.py | 33 +++-- edpop_explorer/readers/kvcs.py | 54 ++++---- edpop_explorer/readers/pierre_belle.py | 56 +++++---- edpop_explorer/readers/sbtireader.py | 23 ++-- edpop_explorer/readers/stcn.py | 28 ++--- edpop_explorer/readers/ustc.py | 79 ++++++------ edpop_explorer/readers/vd.py | 67 +++++----- edpop_explorer/record.py | 153 +++++++++++------------ edpop_explorer/sparqlreader.py | 68 +++++----- edpop_explorer/srumarc21reader.py | 130 +++++++++---------- edpop_explorer/srureader.py | 31 ++--- tests/conftest.py | 12 +- tests/test_allreaders.py | 6 +- tests/test_field.py | 34 ++--- tests/test_reader.py | 5 +- tests/test_record.py | 52 ++++---- tests/test_srureader.py | 29 ++--- 35 files changed, 802 insertions(+), 800 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 522f359..31cbdcd 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,9 +13,9 @@ # -- Project information ----------------------------------------------------- -project = 'EDPOP Explorer' -copyright = '2023' -author = 'Utrecht University' +project = "EDPOP Explorer" +copyright = "2023" +author = "Utrecht University" # -- General configuration --------------------------------------------------- @@ -24,17 +24,17 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # -- Options for HTML output ------------------------------------------------- @@ -42,10 +42,9 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = "alabaster" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - +html_static_path = ["_static"] diff --git a/edpop_explorer/__init__.py b/edpop_explorer/__init__.py index ced924c..6b07025 100644 --- a/edpop_explorer/__init__.py +++ b/edpop_explorer/__init__.py @@ -1,15 +1,33 @@ __all__ = [ - 'EDPOPREC', 'RELATORS', 'bind_common_namespaces', - 'Field', 'FieldError', 'LocationField', - 'Reader', 'ReaderError', 'NotFoundError', - 'GetByIdBasedOnQueryMixin', 'DatabaseFileMixin', - 'BasePreparedQuery', 'PreparedQueryType', - 'Record', 'RawData', 'RecordError', 'BibliographicalRecord', - 'BiographicalRecord', 'LazyRecordMixin', - 'SRUReader', 'CERLReader', - 'Marc21Data', 'Marc21Field', 'Marc21BibliographicalRecord', - 'Marc21DataMixin', 'SRUMarc21Reader', 'SRUMarc21BibliographicalReader', - 'BIBLIOGRAPHICAL', 'BIOGRAPHICAL' + "EDPOPREC", + "RELATORS", + "bind_common_namespaces", + "Field", + "FieldError", + "LocationField", + "Reader", + "ReaderError", + "NotFoundError", + "GetByIdBasedOnQueryMixin", + "DatabaseFileMixin", + "BasePreparedQuery", + "PreparedQueryType", + "Record", + "RawData", + "RecordError", + "BibliographicalRecord", + "BiographicalRecord", + "LazyRecordMixin", + "SRUReader", + "CERLReader", + "Marc21Data", + "Marc21Field", + "Marc21BibliographicalRecord", + "Marc21DataMixin", + "SRUMarc21Reader", + "SRUMarc21BibliographicalReader", + "BIBLIOGRAPHICAL", + "BIOGRAPHICAL", ] # Define here to avoid circular imports @@ -20,17 +38,29 @@ from .rdf import EDPOPREC, RELATORS, bind_common_namespaces from .fields import Field, FieldError, LocationField from .reader import ( - Reader, ReaderError, GetByIdBasedOnQueryMixin, BasePreparedQuery, - PreparedQueryType, NotFoundError, DatabaseFileMixin + Reader, + ReaderError, + GetByIdBasedOnQueryMixin, + BasePreparedQuery, + PreparedQueryType, + NotFoundError, + DatabaseFileMixin, ) from .record import ( - Record, RawData, RecordError, BibliographicalRecord, BiographicalRecord, - LazyRecordMixin + Record, + RawData, + RecordError, + BibliographicalRecord, + BiographicalRecord, + LazyRecordMixin, ) from .srureader import SRUReader from .srumarc21reader import ( - Marc21Data, Marc21Field, Marc21BibliographicalRecord, Marc21DataMixin, - SRUMarc21Reader, SRUMarc21BibliographicalReader + Marc21Data, + Marc21Field, + Marc21BibliographicalRecord, + Marc21DataMixin, + SRUMarc21Reader, + SRUMarc21BibliographicalReader, ) from .cerl import CERLReader - diff --git a/edpop_explorer/__main__.py b/edpop_explorer/__main__.py index d8fd652..7e42a46 100644 --- a/edpop_explorer/__main__.py +++ b/edpop_explorer/__main__.py @@ -7,11 +7,12 @@ try: from colorama import just_fix_windows_console + just_fix_windows_console() except ImportError: pass -historyfile = Path(AppDirs('edpop-explorer', 'cdh').user_data_dir) / 'history' +historyfile = Path(AppDirs("edpop-explorer", "cdh").user_data_dir) / "history" def save_history() -> None: @@ -27,5 +28,5 @@ def main() -> None: save_history() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/edpop_explorer/cerl.py b/edpop_explorer/cerl.py index 61dbf01..d10ba4a 100644 --- a/edpop_explorer/cerl.py +++ b/edpop_explorer/cerl.py @@ -5,7 +5,12 @@ from typing import List, Dict, Optional from edpop_explorer import ( - Reader, Record, ReaderError, BiographicalRecord, Field, BIOGRAPHICAL + Reader, + Record, + ReaderError, + BiographicalRecord, + Field, + BIOGRAPHICAL, ) @@ -16,6 +21,7 @@ class CERLReader(Reader): This is an abstract class -- to use, derive from this class, set the ``API_URL``, ``API_BY_ID_BASE_URL`` and ``LINK_BASE_URL`` constant attributes, and implement the ``_convert_record`` class method.""" + API_URL: str """The base URL of the search API, of the form ``https://data.cerl.org//_search``.""" API_BY_ID_BASE_URL: str @@ -30,9 +36,7 @@ def get_by_id(cls, identifier: str) -> Record: try: response = requests.get( cls.API_BY_ID_BASE_URL + identifier, - headers={ - 'Accept': 'application/json' - }, + headers={"Accept": "application/json"}, ).json() except requests.exceptions.JSONDecodeError: raise ReaderError(f"Item with id {identifier} does not exist.") @@ -40,51 +44,48 @@ def get_by_id(cls, identifier: str) -> Record: raise ReaderError(f"Error during server request: {err}") return cls._convert_record(response) - @classmethod @abstractmethod def _convert_record(cls, rawrecord: dict) -> Record: pass - def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]: + def _perform_query( + self, start_record: int, maximum_records: Optional[int] + ) -> List[Record]: assert isinstance(self.prepared_query, str) if maximum_records is None: maximum_records = self.DEFAULT_RECORDS_PER_PAGE - print(f'The query is: {self.prepared_query}') + print(f"The query is: {self.prepared_query}") try: response = requests.get( self.API_URL, params={ - 'query': self.prepared_query, - 'from': start_record, - 'size': maximum_records, - 'mode': 'default', - 'sort': 'default' + "query": self.prepared_query, + "from": start_record, + "size": maximum_records, + "mode": "default", + "sort": "default", }, - headers={ - 'Accept': 'application/json' - } + headers={"Accept": "application/json"}, ).json() - except ( - requests.exceptions.RequestException - ) as err: - raise ReaderError('Error during server request: ' + str(err)) + except requests.exceptions.RequestException as err: + raise ReaderError("Error during server request: " + str(err)) # TODO: check for error responses try: - if response['hits'] is None: + if response["hits"] is None: self.number_of_results = 0 else: - self.number_of_results = response['hits']['value'] + self.number_of_results = response["hits"]["value"] except KeyError: - raise ReaderError('Number of hits not given in server response') + raise ReaderError("Number of hits not given in server response") - if 'rows' not in response: + if "rows" not in response: # There are no rows in the response, so stop here return [] records: List[Record] = [] - for rawrecord in response['rows']: + for rawrecord in response["rows"]: record = self._convert_record(rawrecord) records.append(record) @@ -97,11 +98,10 @@ def transform_query(cls, query) -> str: def fetch_range(self, range_to_fetch: range) -> range: if self.prepared_query is None: - raise ReaderError('First call prepare_query') + raise ReaderError("First call prepare_query") start_record = range_to_fetch.start number_to_fetch = range_to_fetch.stop - start_record results = self._perform_query(start_record, number_to_fetch) for i, result in enumerate(results): self.records[i + range_to_fetch.start] = result return range(start_record, start_record + len(results)) - diff --git a/edpop_explorer/edpopxshell.py b/edpop_explorer/edpopxshell.py index 2ac1c2e..bb7479f 100644 --- a/edpop_explorer/edpopxshell.py +++ b/edpop_explorer/edpopxshell.py @@ -30,12 +30,12 @@ class EDPOPXShell(cmd2.Cmd): intro = ( - 'Welcome to the EDPOP explorer!\n' - 'Type to perform a query.\n' - 'Type identifier to retrieve a specific record.\n' - 'Type ‘help’ for all commands.\n' + "Welcome to the EDPOP explorer!\n" + "Type to perform a query.\n" + "Type identifier to retrieve a specific record.\n" + "Type ‘help’ for all commands.\n" ) - prompt = '[edpop-explorer] # ' + prompt = "[edpop-explorer] # " reader: Optional[Reader] = None shown: int = 0 RECORDS_PER_PAGE = 10 @@ -44,78 +44,78 @@ def __init__(self): super().__init__() self.exact = False - self.add_settable(cmd2.Settable( - 'exact', bool, 'use exact queries without preprocessing', self - )) + self.add_settable( + cmd2.Settable( + "exact", bool, "use exact queries without preprocessing", self + ) + ) def get_record_from_argument(self, args) -> Optional[Record]: """Get the record requested by the user; show error message and return None if this fails""" if self.reader is None: - self.perror('First perform an initial search') + self.perror("First perform an initial search") return try: # TODO: consider using argparse index = int(args) - 1 except (TypeError, ValueError): - self.perror('Please provide a valid number') + self.perror("Please provide a valid number") return try: record = self.reader.records[index] except IndexError: - self.perror('Please provide a record number that has been loaded') + self.perror("Please provide a record number that has been loaded") return return record def do_next(self, args) -> None: if self.reader is None: - self.perror('First perform an initial search') + self.perror("First perform an initial search") return assert self.reader.number_of_results is not None assert self.reader.number_fetched is not None if self.shown >= self.reader.number_of_results: - self.perror('All records have been shown') + self.perror("All records have been shown") else: if self.reader.number_fetched - self.shown < self.RECORDS_PER_PAGE: self.reader.fetch() - self.shown += self._show_records(self.reader.records, - self.shown, - self.RECORDS_PER_PAGE) + self.shown += self._show_records( + self.reader.records, self.shown, self.RECORDS_PER_PAGE + ) def do_show(self, args) -> None: - '''Show a normalized version of the record with the given number.''' + """Show a normalized version of the record with the given number.""" record = self.get_record_from_argument(args) if record is not None: self.show_record(record) def show_record(self, record: Record) -> None: record.fetch() # Necessary in case this is a lazy record - self.poutput(cmd2.ansi.style_success( - record, bold=True - )) - recordtype = str(record._rdf_class).rsplit('/',1)[1] - self.poutput(f'Record type: {recordtype}') + self.poutput(cmd2.ansi.style_success(record, bold=True)) + recordtype = str(record._rdf_class).rsplit("/", 1)[1] + self.poutput(f"Record type: {recordtype}") if record.identifier: - self.poutput(f'Identifier: {record.identifier}') + self.poutput(f"Identifier: {record.identifier}") if record.link: - self.poutput('URL: ' + str(record.link)) - self.poutput(cmd2.ansi.style('Fields:', bold=True)) + self.poutput("URL: " + str(record.link)) + self.poutput(cmd2.ansi.style("Fields:", bold=True)) for fieldname, _, _ in record._fields: - fieldname_human = fieldname.capitalize().replace('_', ' ') + fieldname_human = fieldname.capitalize().replace("_", " ") # TODO: make a field iterator for Record value = getattr(record, fieldname) if value: if isinstance(value, list): - text = '\n' + '\n'.join([(' - ' + str(x)) for x in value]) + text = "\n" + "\n".join([(" - " + str(x)) for x in value]) else: text = str(value) self.poutput( - cmd2.ansi.style(f'- {fieldname_human}: ', bold=True) + text + cmd2.ansi.style(f"- {fieldname_human}: ", bold=True) + text ) def do_showrdf(self, args) -> None: - '''Show an RDF representation of the record with the given number - in Turtle format.''' + """Show an RDF representation of the record with the given number + in Turtle format.""" record = self.get_record_from_argument(args) if record is None: return @@ -123,27 +123,27 @@ def do_showrdf(self, args) -> None: graph = record.to_graph() ttl = graph.serialize() highlighted = highlight( - ttl, TurtleLexer(), Terminal256Formatter(style='vim') + ttl, TurtleLexer(), Terminal256Formatter(style="vim") ) self.poutput(highlighted) except ReaderError as err: - self.perror('Cannot generate RDF: {}'.format(err)) + self.perror("Cannot generate RDF: {}".format(err)) def do_showraw(self, args) -> None: - '''Show the raw data of the record with the given number in the - source catalog.''' + """Show the raw data of the record with the given number in the + source catalog.""" record = self.get_record_from_argument(args) if record is None: return data = record.get_data_dict() yaml_data = yaml.dump(data, allow_unicode=True) highlighted = highlight( - yaml_data, YamlLexer(), Terminal256Formatter(style='vim') + yaml_data, YamlLexer(), Terminal256Formatter(style="vim") ) self.poutput(highlighted) def do_hpb(self, args) -> None: - 'CERL\'s Heritage of the Printed Book Database' + "CERL's Heritage of the Printed Book Database" self._query(HPBReader, args) def do_vd16(self, args) -> None: @@ -164,54 +164,54 @@ def do_vd18(self, args) -> None: def do_vdlied(self, args) -> None: """Verzeichnis der deutschsprachigen Liedflugschriften""" self._query(VDLiedReader, args) - + def do_bnf(self, args) -> None: """Bibliothèque nationale de France""" self._query(BnFReader, args) - + def do_gallica(self, args) -> None: - 'Gallica' + "Gallica" self._query(GallicaReader, args) - + def do_ct(self, args) -> None: - 'CERL Thesaurus' + "CERL Thesaurus" self._query(CERLThesaurusReader, args) - + def do_stcn(self, args) -> None: - 'Short Title Catalogue Netherlands' + "Short Title Catalogue Netherlands" self._query(STCNReader, args) - + def do_sbti(self, args) -> None: - 'Scottish Book Trade Index' + "Scottish Book Trade Index" self._query(SBTIReader, args) - + def do_fbtee(self, args) -> None: - 'French Book Trade in Enlightenment Europe' + "French Book Trade in Enlightenment Europe" self._query(FBTEEReader, args) - + def do_ustc(self, args) -> None: - 'Universal Short Title Catalogue' + "Universal Short Title Catalogue" self._query(USTCReader, args) - + def do_kb(self, args) -> None: - 'Koninklijke Bibliotheek' + "Koninklijke Bibliotheek" self._query(KBReader, args) def do_kvcs(self, args) -> None: - 'Drukkers & Uitgevers in KVCS' + "Drukkers & Uitgevers in KVCS" self._query(KVCSReader, args) def do_dutalm(self, args) -> None: - 'Bibliography of Dutch Almanacs 1570-1710' + "Bibliography of Dutch Almanacs 1570-1710" self._query(DutchAlmanacsReader, args) def do_pb(self, args) -> None: - 'BIBLIOGRAPHY OF EARLY MODERN EDITIONS OF PIERRE DE PROVENCE ET LA BELLE MAGUELONNE (CA. 1470–CA. 1800)' + "BIBLIOGRAPHY OF EARLY MODERN EDITIONS OF PIERRE DE PROVENCE ET LA BELLE MAGUELONNE (CA. 1470–CA. 1800)" self._query(PierreBelleReader, args) - def _show_records(self, records: List[Optional[Record]], - start: int, - limit=math.inf) -> int: + def _show_records( + self, records: List[Optional[Record]], start: int, limit=math.inf + ) -> int: """Show the records from start, with limit as the maximum number of records to show. Return the number of records shown.""" total = len(records) @@ -222,15 +222,13 @@ def _show_records(self, records: List[Optional[Record]], count = int(min(remaining, limit)) digits = len(str(total)) for i in range(start, start + count): - print('{:{digits}} - {}'.format( - i + 1, str(records[i]), digits=digits - )) + print("{:{digits}} - {}".format(i + 1, str(records[i]), digits=digits)) return count def _query(self, readerclass: Type[Reader], query: str): IDENTIFIER_PREFIX = "identifier " if query.startswith(IDENTIFIER_PREFIX): - identifier = query[len(IDENTIFIER_PREFIX):] + identifier = query[len(IDENTIFIER_PREFIX) :] try: record = readerclass.get_by_id(identifier) except ReaderError as err: @@ -244,22 +242,18 @@ def _query(self, readerclass: Type[Reader], query: str): if not self.exact: self.reader.prepare_query(query) self.pfeedback( - 'Performing query: {}'.format(self.reader.prepared_query) + "Performing query: {}".format(self.reader.prepared_query) ) else: self.reader.set_query(query) - self.pfeedback( - 'Performing exact query: {}'.format(query) - ) + self.pfeedback("Performing exact query: {}".format(query)) self.reader.fetch() except ReaderError as err: - self.perror('Error while fetching results: {}'.format(err)) + self.perror("Error while fetching results: {}".format(err)) self.reader = None self.shown = 0 return - self.pfeedback( - '{} records found.'.format(self.reader.number_of_results) - ) + self.pfeedback("{} records found.".format(self.reader.number_of_results)) self.shown += self._show_records( self.reader.records, self.shown, self.RECORDS_PER_PAGE ) diff --git a/edpop_explorer/fields.py b/edpop_explorer/fields.py index cfd5336..c4fc8ba 100644 --- a/edpop_explorer/fields.py +++ b/edpop_explorer/fields.py @@ -14,25 +14,23 @@ from edpop_explorer.normalization import relators DATATYPES = { - 'string': { - 'input_type': str, - 'converter': (lambda x: Literal(x)), + "string": { + "input_type": str, + "converter": (lambda x: Literal(x)), }, - 'boolean': { - 'input_type': bool, - 'converter': (lambda x: Literal(x)), + "boolean": { + "input_type": bool, + "converter": (lambda x: Literal(x)), }, - 'edtf': { - 'input_type': str, - 'converter': ( - lambda x: Literal( - x, datatype=URIRef("http://id.loc.gov/datatypes/edtf") - ) - ) + "edtf": { + "input_type": str, + "converter": ( + lambda x: Literal(x, datatype=URIRef("http://id.loc.gov/datatypes/edtf")) + ), }, - 'uriref': { - 'input_type': URIRef, - 'converter': lambda x: x, + "uriref": { + "input_type": URIRef, + "converter": lambda x: x, }, } @@ -53,8 +51,8 @@ class Field: not the case for this base class. In those cases, it is still possible to set this field using the ``set_normalized_text`` method. Except ``original_text``, all subfields are optional and are None by default. - Use ``to_graph()`` to obtain an RDF graph. The subject node is by default - a blank node, but this may be overridden by setting the subject_node + Use ``to_graph()`` to obtain an RDF graph. The subject node is by default + a blank node, but this may be overridden by setting the subject_node attribute. Subclasses should override the ``_rdf_class`` attribute to the corresponding @@ -65,8 +63,9 @@ class Field: by one using ``self.SUBFIELDS.append(('', EDPOPREC., ''))``, where is any of the datatypes defined in the ``DATATYPES`` constant of this module. - Subclasses may furthermore define the ``_normalized_text`` private + Subclasses may furthermore define the ``_normalized_text`` private method.""" + #: Subfield -- text of this field according to the original record. original_text: str #: This field's subject node if converted to RDF. This is a blank node @@ -80,19 +79,17 @@ class Field: authority_record: Optional[str] = None normalizer: Optional[Callable] = None _rdf_class: Node = EDPOPREC.Field - + def __init__(self, original_text: str) -> None: if not isinstance(original_text, str): - raise FieldError( - f'original_text should be str, not {type(original_text)}' - ) + raise FieldError(f"original_text should be str, not {type(original_text)}") self.subject_node = BNode() self.original_text = original_text self._subfields = [ - ('original_text', EDPOPREC.originalText, 'string'), - ('summary_text', EDPOPREC.summaryText, 'string'), - ('unknown', EDPOPREC.unknown, 'boolean'), - ('authority_record', EDPOPREC.authorityRecord, 'string'), + ("original_text", EDPOPREC.originalText, "string"), + ("summary_text", EDPOPREC.summaryText, "string"), + ("unknown", EDPOPREC.unknown, "boolean"), + ("authority_record", EDPOPREC.authorityRecord, "string"), ] def normalize(self) -> NormalizationResult: @@ -102,14 +99,10 @@ def normalize(self) -> NormalizationResult: return self.normalizer() def to_graph(self) -> Graph: - '''Create an ``rdflib`` RDF graph according to the current data.''' + """Create an ``rdflib`` RDF graph according to the current data.""" assert isinstance(self.subject_node, Node) graph = Graph() - graph.add(( - self.subject_node, - RDF.type, - self._rdf_class - )) + graph.add((self.subject_node, RDF.type, self._rdf_class)) for subfield in self._subfields: attrname, propref, datatype = subfield value = getattr(self, attrname, None) @@ -125,21 +118,17 @@ def to_graph(self) -> Graph: "{self.__class__} but it does not exist" ) else: - input_type = typedef['input_type'] + input_type = typedef["input_type"] if not isinstance(value, input_type): raise FieldError( f"Subfield {attrname} should be of type {str(input_type)} but " "it is {str(type(value))}" ) else: - converter = typedef['converter'] + converter = typedef["converter"] converted = converter(value) assert isinstance(converted, Node) - graph.add(( - self.subject_node, - propref, - converted - )) + graph.add((self.subject_node, propref, converted)) return graph @property @@ -161,9 +150,7 @@ class LocationField(Field): def __init__(self, original_text: str) -> None: super().__init__(original_text) - self._subfields.append( - ('location_type', EDPOPREC.locationType, 'uriref') - ) + self._subfields.append(("location_type", EDPOPREC.locationType, "uriref")) class LanguageField(Field): @@ -173,9 +160,7 @@ class LanguageField(Field): def __init__(self, original_text: str) -> None: super().__init__(original_text) - self._subfields.append( - ('language_code', EDPOPREC.languageCode, 'string') - ) + self._subfields.append(("language_code", EDPOPREC.languageCode, "string")) @property def summary_text(self) -> Optional[str]: @@ -193,10 +178,12 @@ class ContributorField(Field): def __init__(self, original_text: str) -> None: super().__init__(original_text) - self._subfields.extend(( - ('name', EDPOPREC.name, 'string'), - ('role', EDPOPREC.role, 'string'), - )) + self._subfields.extend( + ( + ("name", EDPOPREC.name, "string"), + ("role", EDPOPREC.role, "string"), + ) + ) @property def summary_text(self) -> Optional[str]: @@ -206,7 +193,3 @@ def summary_text(self) -> Optional[str]: return f"{name} ({role})" else: return name - - - - diff --git a/edpop_explorer/normalization/relators.py b/edpop_explorer/normalization/relators.py index da6cd33..8e1bc4c 100644 --- a/edpop_explorer/normalization/relators.py +++ b/edpop_explorer/normalization/relators.py @@ -303,5 +303,5 @@ "wit": "witness", "wpr": "writer of preface", "wst": "writer of supplementary textual content", - "wts": "writer of television story" + "wts": "writer of television story", } diff --git a/edpop_explorer/normalizers.py b/edpop_explorer/normalizers.py index 1e89a8d..cad75e2 100644 --- a/edpop_explorer/normalizers.py +++ b/edpop_explorer/normalizers.py @@ -4,9 +4,9 @@ class NormalizationResult(Enum): - SUCCESS = 'success' - NO_DATA = 'nodata' - FAIL = 'fail' + SUCCESS = "success" + NO_DATA = "nodata" + FAIL = "fail" def normalize_by_language_code(field) -> NormalizationResult: diff --git a/edpop_explorer/rdf.py b/edpop_explorer/rdf.py index 25f462a..e8bd50b 100644 --- a/edpop_explorer/rdf.py +++ b/edpop_explorer/rdf.py @@ -3,10 +3,10 @@ from rdflib.namespace import Namespace from rdflib import Graph, RDF, RDFS -EDPOPREC = Namespace('https://dhstatic.hum.uu.nl/edpop-records/0.1.0/') +EDPOPREC = Namespace("https://dhstatic.hum.uu.nl/edpop-records/0.1.0/") """EDPOP Record Ontology""" -RELATORS = Namespace('http://id.loc.gov/vocabulary/relators/') +RELATORS = Namespace("http://id.loc.gov/vocabulary/relators/") """Library of Congress relators. See: https://id.loc.gov/vocabulary/relators.html""" @@ -19,4 +19,3 @@ def bind_common_namespaces(graph: Graph) -> None: graph.bind("rdf", RDF) graph.bind("rdfs", RDFS) graph.bind("edpoprec", EDPOPREC) - diff --git a/edpop_explorer/reader.py b/edpop_explorer/reader.py index e6fd385..51c6b62 100644 --- a/edpop_explorer/reader.py +++ b/edpop_explorer/reader.py @@ -12,7 +12,10 @@ from edpop_explorer import ( - EDPOPREC, BIBLIOGRAPHICAL, BIOGRAPHICAL, bind_common_namespaces + EDPOPREC, + BIBLIOGRAPHICAL, + BIOGRAPHICAL, + bind_common_namespaces, ) from .record import Record @@ -22,6 +25,7 @@ class BasePreparedQuery: """Empty base dataclass for prepared queries. For prepared queries that can be represented by a single string, do not inherit from this class but use a simple string instead.""" + pass @@ -45,6 +49,7 @@ class Reader(ABC): ``fetch_range()`` should populate the ``records``, ``number_of_results``, ``number_fetched`` and ``range_fetched`` attributes. """ + number_of_results: Optional[int] = None """The total number of results for the query, or None if fetching has not yet started and the number is not yet known.""" @@ -111,9 +116,7 @@ def adjust_start_record(self, start_number: int) -> None: records.""" self._fetch_position = start_number - def fetch( - self, number: Optional[int] = None - ) -> range: + def fetch(self, number: Optional[int] = None) -> range: """Perform an initial or subsequent query. Most readers fetch a limited number of records at once -- this number depends on the reader but it may be adjusted using the ``number`` parameter. @@ -126,8 +129,9 @@ def fetch( return range(0) if number is None: number = self.DEFAULT_RECORDS_PER_PAGE - resulting_range = self.fetch_range(range(self._fetch_position, - self._fetch_position + number)) + resulting_range = self.fetch_range( + range(self._fetch_position, self._fetch_position + number) + ) self._fetch_position = resulting_range.stop return resulting_range @@ -160,9 +164,9 @@ def get(self, index: int, allow_fetching: bool = True) -> Record: # Try to fetch, if it is allowed, and if there is a chance that # it is successful (by verifying that index is not out of # available range, if known) - if (allow_fetching and - (self.number_of_results is None - or self.number_of_results <= index)): + if allow_fetching and ( + self.number_of_results is None or self.number_of_results <= index + ): # Fetch and try again self.fetch_range(range(index, index + 1)) record = self.records.get(index) @@ -200,7 +204,7 @@ def iri_to_identifier(cls, iri: str) -> str: "not a string." ) if iri.startswith(cls.IRI_PREFIX): - return unquote(iri[len(cls.IRI_PREFIX):]) + return unquote(iri[len(cls.IRI_PREFIX) :]) else: raise ReaderError( f"Cannot convert IRI {iri} to identifier: IRI does not start " @@ -209,13 +213,13 @@ def iri_to_identifier(cls, iri: str) -> str: @classmethod def catalog_to_graph(cls) -> Graph: - '''Create an RDF representation of the catalog that this reader - supports as an instance of EDPOPREC:Catalog.''' + """Create an RDF representation of the catalog that this reader + supports as an instance of EDPOPREC:Catalog.""" g = Graph() if not cls.CATALOG_URIREF: raise ReaderError( - 'Cannot create graph because catalog IRI has not been set. ' - 'This should have been done on class level.' + "Cannot create graph because catalog IRI has not been set. " + "This should have been done on class level." ) # Set reader class @@ -293,8 +297,9 @@ class GetByIdBasedOnQueryMixin(ABC): @classmethod def get_by_id(cls, identifier: str) -> Record: reader = cls() - assert isinstance(reader, Reader), \ - "GetByIdBasedOnQueryMixin should be used on Reader subclass" + assert isinstance( + reader, Reader + ), "GetByIdBasedOnQueryMixin should be used on Reader subclass" reader.set_query(cls._prepare_get_by_id_query(identifier)) reader.fetch() if reader.number_of_results == 0: @@ -326,6 +331,7 @@ class DatabaseFileMixin: using the filename specified in the constant attribute ``DATABASE_FILENAME``, which has to be specified by the user of this mixin.""" + DATABASE_URL: Optional[str] = None """The URL to download the database file from. If this attribute is ``None``, automatically downloading the database file is not supported.""" @@ -341,9 +347,10 @@ class DatabaseFileMixin: def prepare_data(self) -> None: """Prepare the database file by confirming that it is available, and if not, by attempting to download it.""" - self.database_path = Path( - AppDirs('edpop-explorer', 'cdh').user_data_dir - ) / self.DATABASE_FILENAME + self.database_path = ( + Path(AppDirs("edpop-explorer", "cdh").user_data_dir) + / self.DATABASE_FILENAME + ) if not self.database_path.exists(): if self.DATABASE_URL is None: # No database URL is given, so the user has to get the database @@ -353,37 +360,36 @@ def prepare_data(self) -> None: # the Windows Store... db_dir = self.database_path.parent.resolve() error_message = ( - f'{self.__class__.__name__} database not found. Please obtain the file ' - f'{self.DATABASE_FILENAME} from the project team and add it ' - f'to the following directory: {db_dir}' + f"{self.__class__.__name__} database not found. Please obtain the file " + f"{self.DATABASE_FILENAME} from the project team and add it " + f"to the following directory: {db_dir}" ) raise ReaderError(error_message) else: self._download_database() def _download_database(self) -> None: - print('Downloading database...') + print("Downloading database...") response = requests.get(self.DATABASE_URL) if response.ok: try: self.database_path.parent.mkdir(exist_ok=True, parents=True) - with open(self.database_path, 'wb') as f: + with open(self.database_path, "wb") as f: f.write(response.content) except OSError as err: - raise ReaderError( - f'Error writing database file to disk: {err}' - ) + raise ReaderError(f"Error writing database file to disk: {err}") else: raise ReaderError( - f'Error downloading database file from {self.DATABASE_URL}' + f"Error downloading database file from {self.DATABASE_URL}" ) - print(f'Successfully saved database to {self.database_path}.') - print(f'See license: {self.DATABASE_LICENSE}') + print(f"Successfully saved database to {self.database_path}.") + print(f"See license: {self.DATABASE_LICENSE}") class ReaderError(Exception): """Generic exception for failures in ``Reader`` class. More specific errors derive from this class.""" + pass diff --git a/edpop_explorer/readers/__init__.py b/edpop_explorer/readers/__init__.py index 3acfd63..ad776e2 100644 --- a/edpop_explorer/readers/__init__.py +++ b/edpop_explorer/readers/__init__.py @@ -1,4 +1,4 @@ -'''This package contains concrete subclasses of ``Reader``.''' +"""This package contains concrete subclasses of ``Reader``.""" __all__ = [ "BnFReader", @@ -51,5 +51,5 @@ def _get_all_readers() -> List[Type[Reader]]: all_readers.append(cls) return all_readers -ALL_READERS = _get_all_readers() +ALL_READERS = _get_all_readers() diff --git a/edpop_explorer/readers/bibliopolis.py b/edpop_explorer/readers/bibliopolis.py index 3d37146..929b76e 100644 --- a/edpop_explorer/readers/bibliopolis.py +++ b/edpop_explorer/readers/bibliopolis.py @@ -6,20 +6,16 @@ class BibliopolisReader(SRUReader): # Note that this reader is currently deactivated by default because # the API is not working. It is not possible for the moment to # test this reader. - sru_url = 'http://jsru.kb.nl/sru/sru' - sru_version = '1.2' - HPB_LINK = 'http://hpb.cerl.org/record/{}' - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/bibliopolis' - ) + sru_url = "http://jsru.kb.nl/sru/sru" + sru_version = "1.2" + HPB_LINK = "http://hpb.cerl.org/record/{}" + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/bibliopolis") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/bibliopolis/" SHORT_NAME = "Bibliopolis" def __init__(self): super().__init__() - self.additional_params = { - 'x-collection': 'Bibliopolis' - } + self.additional_params = {"x-collection": "Bibliopolis"} def _convert_record(self, sruthirecord: dict) -> Record: record = Record(from_reader=self.__class__) diff --git a/edpop_explorer/readers/bnf.py b/edpop_explorer/readers/bnf.py index 39c3366..d0bbc5c 100644 --- a/edpop_explorer/readers/bnf.py +++ b/edpop_explorer/readers/bnf.py @@ -6,32 +6,30 @@ class BnFReader(SRUMarc21BibliographicalReader): - sru_url = 'http://catalogue.bnf.fr/api/SRU' - sru_version = '1.2' - HPB_LINK = 'http://hpb.cerl.org/record/{}' - marcxchange_prefix = 'info:lc/xmlns/marcxchange-v2:' - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/bnf' - ) + sru_url = "http://catalogue.bnf.fr/api/SRU" + sru_version = "1.2" + HPB_LINK = "http://hpb.cerl.org/record/{}" + marcxchange_prefix = "info:lc/xmlns/marcxchange-v2:" + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/bnf") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/bnf/" SHORT_NAME = "Bibliothèque nationale de France (BnF)" DESCRIPTION = "General catalogue of the French National Library" - _title_field_subfield = ('200', 'a') - _alternative_title_field_subfield = ('500', 'a') - _publisher_field_subfield = ('201', 'c') - _place_field_subfield = ('210', 'a') - _dating_field_subfield = ('210', 'd') - _language_field_subfield = ('101', 'a') + _title_field_subfield = ("200", "a") + _alternative_title_field_subfield = ("500", "a") + _publisher_field_subfield = ("201", "c") + _place_field_subfield = ("210", "a") + _dating_field_subfield = ("210", "d") + _language_field_subfield = ("101", "a") # TODO: add format etc @classmethod def transform_query(cls, query: str) -> str: - return 'bib.anywhere all ({})'.format(query) + return "bib.anywhere all ({})".format(query) @classmethod def _get_link(cls, data: Marc21Data) -> Optional[str]: # The link can be found in control field 003 - return data.controlfields.get('003', None) + return data.controlfields.get("003", None) @classmethod def _prepare_get_by_id_query(cls, identifier: str) -> str: diff --git a/edpop_explorer/readers/cerl_thesaurus.py b/edpop_explorer/readers/cerl_thesaurus.py index 8f7cdb2..3cb455d 100644 --- a/edpop_explorer/readers/cerl_thesaurus.py +++ b/edpop_explorer/readers/cerl_thesaurus.py @@ -6,36 +6,34 @@ class CERLThesaurusReader(SRUReader): - sru_url = 'https://data.cerl.org/thesaurus/_sru' - sru_version = '1.2' - CERL_LINK = 'https://data.cerl.org/thesaurus/{}' - CTAS_PREFIX = 'http://sru.cerl.org/ctas/dtd/1.1:' - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/cerlthesaurus' - ) + sru_url = "https://data.cerl.org/thesaurus/_sru" + sru_version = "1.2" + CERL_LINK = "https://data.cerl.org/thesaurus/{}" + CTAS_PREFIX = "http://sru.cerl.org/ctas/dtd/1.1:" + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/cerlthesaurus") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/cerlthesaurus/" READERTYPE = BIOGRAPHICAL SHORT_NAME = "CERL Thesaurus" - DESCRIPTION = "The CERL Thesaurus file contains forms of imprint " \ - "places, imprint names, personal names and corporate names as "\ - "found in material printed before the middle of the nineteenth "\ - "century - including variant spellings, forms in Latin and "\ + DESCRIPTION = ( + "The CERL Thesaurus file contains forms of imprint " + "places, imprint names, personal names and corporate names as " + "found in material printed before the middle of the nineteenth " + "century - including variant spellings, forms in Latin and " "other languages, and fictitious names." + ) @classmethod - def _get_acceptable_names( - cls, namelist: List[Dict[str, str]] - ) -> List[str]: + def _get_acceptable_names(cls, namelist: List[Dict[str, str]]) -> List[str]: names = [] for name in namelist: - if name['name'] in ['single', 'full']: - names.append(name['text']) + if name["name"] in ["single", "full"]: + names.append(name["text"]) return names - + @classmethod def _convert_record(cls, sruthirecord: dict) -> Record: record = BiographicalRecord(from_reader=cls) - record.identifier = sruthirecord['id'] + record.identifier = sruthirecord["id"] record.link = cls.CERL_LINK.format(record.identifier) record.data = sruthirecord @@ -47,17 +45,17 @@ def _convert_record(cls, sruthirecord: dict) -> Record: # display name) and variantForm (multiple variant names). We will # use these respectively for name and variantName. PREFIX = cls.CTAS_PREFIX - headingform = sruthirecord.get(PREFIX + 'headingForm', None) + headingform = sruthirecord.get(PREFIX + "headingForm", None) if headingform and isinstance(headingform, list): names = cls._get_acceptable_names(headingform) if len(names): record.name = Field(names[0]) # If no headingForm was defined, try display if not record.name: - display = sruthirecord.get(PREFIX + 'display', None) + display = sruthirecord.get(PREFIX + "display", None) if display: record.name = Field(display) - variantform = sruthirecord.get(PREFIX + 'variantForm', None) + variantform = sruthirecord.get(PREFIX + "variantForm", None) if variantform and isinstance(variantform, list): names = cls._get_acceptable_names(variantform) record.variant_names = [Field(x) for x in names] @@ -65,17 +63,17 @@ def _convert_record(cls, sruthirecord: dict) -> Record: # Add activityNote. This field can have only one value in CT. # NB: this data is very inconsistent and often includes other information # than somebody's activity - consider ignoring - activitynote = sruthirecord.get(PREFIX + 'activityNote') + activitynote = sruthirecord.get(PREFIX + "activityNote") if activitynote: record.activities = [Field(activitynote)] # Add biographicalData, which appears to be in all cases the years # that somebody was alive or that an entity existed - biographicaldata = sruthirecord.get(PREFIX + 'biographicalData') + biographicaldata = sruthirecord.get(PREFIX + "biographicalData") if biographicaldata: record.timespan = Field(biographicaldata) # Add geographicalNote, which appears to be a country in all cases. # Add it to places of activity. - geographicalnote = sruthirecord.get(PREFIX + 'geographicalNote') + geographicalnote = sruthirecord.get(PREFIX + "geographicalNote") if geographicalnote: field = LocationField(geographicalnote) field.location_type = LocationField.COUNTRY diff --git a/edpop_explorer/readers/dutch_almanacs.py b/edpop_explorer/readers/dutch_almanacs.py index fae77ab..eff1b8d 100644 --- a/edpop_explorer/readers/dutch_almanacs.py +++ b/edpop_explorer/readers/dutch_almanacs.py @@ -1,16 +1,22 @@ import csv from typing import List -from edpop_explorer import Reader, ReaderError, Field, BibliographicalRecord, BIBLIOGRAPHICAL, DatabaseFileMixin +from edpop_explorer import ( + Reader, + ReaderError, + Field, + BibliographicalRecord, + BIBLIOGRAPHICAL, + DatabaseFileMixin, +) from rdflib import URIRef class DutchAlmanacsReader(DatabaseFileMixin, Reader): - """ Dutch Almanacs database reader. Access with command 'dutalm'.""" - DATABASE_URL = 'https://dhstatic.hum.uu.nl/edpop/biblio_dutchalmanacs.csv' - DATABASE_FILENAME = 'biblio_dutchalmanacs.csv' - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/dutch_almanacs' - ) + """Dutch Almanacs database reader. Access with command 'dutalm'.""" + + DATABASE_URL = "https://dhstatic.hum.uu.nl/edpop/biblio_dutchalmanacs.csv" + DATABASE_FILENAME = "biblio_dutchalmanacs.csv" + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/dutch_almanacs") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/dutch_almanacs/" FETCH_ALL_AT_ONCE = True SHORT_NAME = "Dutch Almanacs" @@ -21,15 +27,17 @@ class DutchAlmanacsReader(DatabaseFileMixin, Reader): def _convert_record(cls, rawrecord: dict) -> BibliographicalRecord: record = BibliographicalRecord(from_reader=cls) record.data = rawrecord - record.identifier = rawrecord['ID'] - record.dating = Field(rawrecord['Jaar']) - record.place_of_publication = Field(rawrecord['Plaats uitgave']) - record.bookseller = Field(rawrecord['Boekverkoper']) - record.contributors = [Field(author.strip()) for author in rawrecord['Auteur'].split('/')] - record.title = Field(rawrecord['Titel']) - record.physical_description = Field(rawrecord['Formaat']) - record.location = Field(rawrecord['Vindplaats']) - record.publisher_or_printer = Field(rawrecord['Drukker']) + record.identifier = rawrecord["ID"] + record.dating = Field(rawrecord["Jaar"]) + record.place_of_publication = Field(rawrecord["Plaats uitgave"]) + record.bookseller = Field(rawrecord["Boekverkoper"]) + record.contributors = [ + Field(author.strip()) for author in rawrecord["Auteur"].split("/") + ] + record.title = Field(rawrecord["Titel"]) + record.physical_description = Field(rawrecord["Formaat"]) + record.location = Field(rawrecord["Vindplaats"]) + record.publisher_or_printer = Field(rawrecord["Drukker"]) return record @classmethod @@ -41,10 +49,10 @@ def transform_query(cls, query) -> str: def get_by_id(cls, identifier: str) -> BibliographicalRecord: reader = cls() reader.prepare_data() - with open(reader.database_path, 'r', encoding='utf-8-sig') as file: - reader = csv.DictReader(file, delimiter=';') + with open(reader.database_path, "r", encoding="utf-8-sig") as file: + reader = csv.DictReader(file, delimiter=";") for row in reader: - if row['ID'] == identifier: + if row["ID"] == identifier: return cls._convert_record(row) raise ReaderError(f"Item with id {identifier} does not exist.") @@ -54,8 +62,8 @@ def _perform_query(self) -> List[BibliographicalRecord]: # Search query in all columns, and fetch results based on query results = [] - with open(self.database_path, 'r', encoding='utf-8-sig') as file: - reader = csv.DictReader(file, delimiter=';') + with open(self.database_path, "r", encoding="utf-8-sig") as file: + reader = csv.DictReader(file, delimiter=";") for row in reader: for key in row.keys(): if self.prepared_query.lower() in row[key].lower(): @@ -72,11 +80,11 @@ def _perform_query(self) -> List[BibliographicalRecord]: def fetch_range(self, range_to_fetch: range) -> range: if self.prepared_query is None: - raise ReaderError('First call prepare_query') + raise ReaderError("First call prepare_query") if self.fetching_exhausted: return range(0) start_record = range_to_fetch.start results = self._perform_query() for i, result in enumerate(results): self.records[i] = result - return range(start_record, start_record + len(results)) \ No newline at end of file + return range(start_record, start_record + len(results)) diff --git a/edpop_explorer/readers/fbtee.py b/edpop_explorer/readers/fbtee.py index 6bc9d87..2bf2d5f 100644 --- a/edpop_explorer/readers/fbtee.py +++ b/edpop_explorer/readers/fbtee.py @@ -3,7 +3,12 @@ from typing import Optional from edpop_explorer import ( - Reader, BibliographicalRecord, ReaderError, Field, BIBLIOGRAPHICAL, DatabaseFileMixin + Reader, + BibliographicalRecord, + ReaderError, + Field, + BIBLIOGRAPHICAL, + DatabaseFileMixin, ) from edpop_explorer.fields import LanguageField from edpop_explorer.reader import GetByIdBasedOnQueryMixin @@ -11,58 +16,54 @@ class FBTEEReader(DatabaseFileMixin, GetByIdBasedOnQueryMixin, Reader): - DATABASE_URL = 'https://dhstatic.hum.uu.nl/edpop/cl.sqlite3' - DATABASE_FILENAME = 'cl.sqlite3' - DATABASE_LICENSE = 'https://dhstatic.hum.uu.nl/edpop/LICENSE.txt' - FBTEE_LINK = 'http://fbtee.uws.edu.au/stn/interface/browse.php?t=book&' \ - 'id={}' + DATABASE_URL = "https://dhstatic.hum.uu.nl/edpop/cl.sqlite3" + DATABASE_FILENAME = "cl.sqlite3" + DATABASE_LICENSE = "https://dhstatic.hum.uu.nl/edpop/LICENSE.txt" + FBTEE_LINK = "http://fbtee.uws.edu.au/stn/interface/browse.php?t=book&" "id={}" READERTYPE = BIBLIOGRAPHICAL - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/fbtee' - ) + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/fbtee") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/fbtee/" prepared_query: Optional[SQLPreparedQuery] = None FETCH_ALL_AT_ONCE = True SHORT_NAME = "French Book Trade in Enlightenment Europe (FBTEE)" - DESCRIPTION = "Mapping the Trade of the Société Typographique de " \ - "Neuchâtel, 1769-1794" + DESCRIPTION = ( + "Mapping the Trade of the Société Typographique de " "Neuchâtel, 1769-1794" + ) @classmethod def _prepare_get_by_id_query(cls, identifier: str) -> SQLPreparedQuery: return SQLPreparedQuery( - where_statement="WHERE book_code = ?", - arguments=[identifier] + where_statement="WHERE book_code = ?", arguments=[identifier] ) @classmethod def transform_query(cls, query: str) -> SQLPreparedQuery: return SQLPreparedQuery( - where_statement='WHERE full_book_title LIKE ?', - arguments=[f'%{query}%'] + where_statement="WHERE full_book_title LIKE ?", arguments=[f"%{query}%"] ) @classmethod def _add_fields(cls, record: BibliographicalRecord) -> None: assert isinstance(record.data, dict) - record.title = Field(record.data['full_book_title']) - if record.data['languages']: - languages = record.data['languages'].split(sep=', ') + record.title = Field(record.data["full_book_title"]) + if record.data["languages"]: + languages = record.data["languages"].split(sep=", ") record.languages = [LanguageField(x) for x in languages] [x.normalize() for x in record.languages] - pages = record.data['pages'] + pages = record.data["pages"] if pages: record.extent = Field(pages) - place = record.data['stated_publication_places'] + place = record.data["stated_publication_places"] if place: record.place_of_publication = Field(place) - year = record.data['stated_publication_years'] + year = record.data["stated_publication_years"] if year: record.dating = Field(year) - publisher = record.data['stated_publishers'] + publisher = record.data["stated_publishers"] if publisher: record.publisher_or_printer = Field(publisher) record.contributors = [] - for author in record.data['authors']: + for author in record.data["authors"]: # author is tuple of author code and author name record.contributors.append(Field(author[1])) @@ -72,26 +73,26 @@ def fetch_range(self, range_to_fetch: range) -> range: # the dataset is small. self.prepare_data() if not self.prepared_query: - raise ReaderError('First call prepare_query method') + raise ReaderError("First call prepare_query method") if self.fetching_exhausted: return range(0) with sqlite3.connect(str(self.database_path)) as con: cur = con.cursor() - columns = [x[1] for x in cur.execute('PRAGMA table_info(books)')] + columns = [x[1] for x in cur.execute("PRAGMA table_info(books)")] res = cur.execute( - 'SELECT B.*, BA.author_code, A.author_name FROM books B ' - 'LEFT OUTER JOIN books_authors BA on B.book_code=BA.book_code ' - 'JOIN authors A on BA.author_code=A.author_code ' - f'{self.prepared_query.where_statement} ' - 'ORDER BY B.book_code', - self.prepared_query.arguments + "SELECT B.*, BA.author_code, A.author_name FROM books B " + "LEFT OUTER JOIN books_authors BA on B.book_code=BA.book_code " + "JOIN authors A on BA.author_code=A.author_code " + f"{self.prepared_query.where_statement} " + "ORDER BY B.book_code", + self.prepared_query.arguments, ) - last_book_code = '' + last_book_code = "" i = -1 for row in res: # Since we are joining with another table, a book may be repeated, # so check if this is a new item - book_code: str = row[columns.index('book_code')] + book_code: str = row[columns.index("book_code")] if last_book_code != book_code: # We have a new book, so update i i += 1 @@ -101,7 +102,7 @@ def fetch_range(self, range_to_fetch: range) -> range: record.data[columns[j]] = row[j] record.identifier = book_code record.link = self.FBTEE_LINK.format(book_code) - record.data['authors'] = [] + record.data["authors"] = [] self.records[i] = record last_book_code = book_code # Add author_code and author_name to the last record @@ -109,7 +110,7 @@ def fetch_range(self, range_to_fetch: range) -> range: author_code = row[len(columns)] author_name = row[len(columns) + 1] assert isinstance(self.records[i].data, dict) - self.records[i].data['authors'].append((author_code, author_name)) + self.records[i].data["authors"].append((author_code, author_name)) for record_number in self.records: record = self.records[record_number] assert isinstance(record, BibliographicalRecord) diff --git a/edpop_explorer/readers/gallica.py b/edpop_explorer/readers/gallica.py index 02a36f9..c9137d5 100644 --- a/edpop_explorer/readers/gallica.py +++ b/edpop_explorer/readers/gallica.py @@ -18,11 +18,11 @@ def _force_list(data) -> list: def _force_string(data) -> Optional[str]: - '''Transform data into one string or None. Can be used if a single + """Transform data into one string or None. Can be used if a single string is expected, but if there is a possibility that it is a - list.''' + list.""" if isinstance(data, list): - return ' ; '.join([str(x) for x in data]) + return " ; ".join([str(x) for x in data]) elif data is None: return None else: @@ -30,20 +30,19 @@ def _force_string(data) -> Optional[str]: class GallicaReader(SRUReader): - sru_url = 'https://gallica.bnf.fr/SRU' - sru_version = '1.2' - CERL_LINK = 'https://data.cerl.org/thesaurus/{}' - CTAS_PREFIX = 'http://sru.cerl.org/ctas/dtd/1.1:' - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/gallica' - ) + sru_url = "https://gallica.bnf.fr/SRU" + sru_version = "1.2" + CERL_LINK = "https://data.cerl.org/thesaurus/{}" + CTAS_PREFIX = "http://sru.cerl.org/ctas/dtd/1.1:" + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/gallica") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/gallica/" DOCUMENT_API_URL = "https://gallica.bnf.fr/services/OAIRecord?ark={}" IDENTIFIER_PREFIX = "https://gallica.bnf.fr/" READERTYPE = BIBLIOGRAPHICAL SHORT_NAME = "Gallica" - DESCRIPTION = "Digital library of the Bibliothèque nationale de France " \ - "and its partners" + DESCRIPTION = ( + "Digital library of the Bibliothèque nationale de France " "and its partners" + ) @classmethod def _convert_record(cls, sruthirecord: dict) -> BibliographicalRecord: @@ -53,30 +52,30 @@ def _convert_record(cls, sruthirecord: dict) -> BibliographicalRecord: # string from sruthi, in the latter case as a list of strings. # Take the first string starting with https:// as the identifier # and as the link. - identifiers = _force_list(sruthirecord.get('identifier', None)) + identifiers = _force_list(sruthirecord.get("identifier", None)) for identifier in identifiers: if identifier.startswith(cls.IDENTIFIER_PREFIX): - record.identifier = identifier[len(cls.IDENTIFIER_PREFIX):] + record.identifier = identifier[len(cls.IDENTIFIER_PREFIX) :] record.link = identifier record.data = {} for key in sruthirecord: - if key in ['schema', 'id']: + if key in ["schema", "id"]: continue - showkey: str = key.replace(cls.CTAS_PREFIX, 'ctas:') + showkey: str = key.replace(cls.CTAS_PREFIX, "ctas:") record.data[showkey] = sruthirecord[key] record.data = sruthirecord - title = _force_string(sruthirecord.get('title', None)) + title = _force_string(sruthirecord.get("title", None)) if title: record.title = Field(title) - creators = _force_list(sruthirecord.get('creator', None)) + creators = _force_list(sruthirecord.get("creator", None)) record.contributors = [Field(x) for x in creators] - dating = _force_string(sruthirecord.get('date', None)) + dating = _force_string(sruthirecord.get("date", None)) if dating: record.dating = Field(dating) - languages = _force_list(sruthirecord.get('language', None)) + languages = _force_list(sruthirecord.get("language", None)) record.languages = [LanguageField(x) for x in languages] [x.normalize() for x in record.languages] - publisher = _force_string(sruthirecord.get('publisher', None)) + publisher = _force_string(sruthirecord.get("publisher", None)) if publisher: record.publisher_or_printer = Field(publisher) @@ -84,10 +83,12 @@ def _convert_record(cls, sruthirecord: dict) -> BibliographicalRecord: # the number of views, the MIME type and the extent. # Try finding the extent by filtering out the other two. # This seems to work correctly. - format_strings = _force_list(sruthirecord.get('format', None)) + format_strings = _force_list(sruthirecord.get("format", None)) for formatstr in format_strings: - if not (formatstr.startswith('Nombre total de vues') or - re.match('$[a-z]+/[a-z]+^', formatstr)): + if not ( + formatstr.startswith("Nombre total de vues") + or re.match("$[a-z]+/[a-z]+^", formatstr) + ): record.extent = Field(formatstr) break @@ -95,7 +96,7 @@ def _convert_record(cls, sruthirecord: dict) -> BibliographicalRecord: @classmethod def get_by_id(cls, identifier: str) -> BibliographicalRecord: - # Getting by id works via another interface (a simple XML API), but the + # Getting by id works via another interface (a simple XML API), but the # returned data is the same in a slightly different format. Hence, # convert it to JSON just like sruthi does and extract the right piece # of data. @@ -115,8 +116,8 @@ def get_by_id(cls, identifier: str) -> BibliographicalRecord: data = response_as_dict["results"]["notice"]["record"]["metadata"]["dc"] # The returned XML has elements with attributes, while these attributes # are missing from the XML that is sent back by the SRU interface. - # An attribute-less element is represented as a simple string by - # xmltodict, while an attribute with elements is represented as a + # An attribute-less element is represented as a simple string by + # xmltodict, while an attribute with elements is represented as a # dict where the contents is in the value of "text". Replace these # dicts with simple strings. (Not a very clean solution but refactoring # is not worth the time at this point.) @@ -131,4 +132,4 @@ def get_by_id(cls, identifier: str) -> BibliographicalRecord: @classmethod def transform_query(cls, query: str) -> str: - return 'gallica all {}'.format(query) + return "gallica all {}".format(query) diff --git a/edpop_explorer/readers/hpb.py b/edpop_explorer/readers/hpb.py index b98f75c..c905f67 100644 --- a/edpop_explorer/readers/hpb.py +++ b/edpop_explorer/readers/hpb.py @@ -1,18 +1,14 @@ from rdflib import URIRef from typing import Optional -from edpop_explorer import ( - SRUMarc21BibliographicalReader, Marc21Data, BIBLIOGRAPHICAL -) +from edpop_explorer import SRUMarc21BibliographicalReader, Marc21Data, BIBLIOGRAPHICAL class HPBReader(SRUMarc21BibliographicalReader): - sru_url = 'http://sru.k10plus.de/hpb' - sru_version = '1.1' - HPB_LINK = 'http://hpb.cerl.org/record/{}' - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/hpb' - ) + sru_url = "http://sru.k10plus.de/hpb" + sru_version = "1.1" + HPB_LINK = "http://hpb.cerl.org/record/{}" + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/hpb") READERTYPE = BIBLIOGRAPHICAL IRI_PREFIX = "https://edpop.hum.uu.nl/readers/hpb/" SHORT_NAME = "Heritage of the Printed Book (HPB)" @@ -33,17 +29,15 @@ def _prepare_get_by_id_query(cls, identifier: str) -> str: return f"pica.cid={identifier}" @classmethod - def _get_identifier(cls, data:Marc21Data) -> Optional[str]: + def _get_identifier(cls, data: Marc21Data) -> Optional[str]: # The record id can be found in field 035 in subfield a starting # with (CERL), like this: (CERL)HU-SzSEK.01.bibJAT603188. # The URI can then be created using HPB_URI. # HPB records have field 035 two times. - fields035 = data.get_fields('035') + fields035 = data.get_fields("035") for field in fields035: - if 'a' in field.subfields and \ - field.subfields['a'].startswith('(CERL)'): - return field.subfields['a'][len('(CERL)'):] - + if "a" in field.subfields and field.subfields["a"].startswith("(CERL)"): + return field.subfields["a"][len("(CERL)") :] @classmethod def _get_link(cls, data: Marc21Data) -> Optional[str]: diff --git a/edpop_explorer/readers/kb.py b/edpop_explorer/readers/kb.py index b383388..1dd5552 100644 --- a/edpop_explorer/readers/kb.py +++ b/edpop_explorer/readers/kb.py @@ -6,12 +6,10 @@ class KBReader(SRUReader): - sru_url = 'http://jsru.kb.nl/sru' - sru_version = '1.2' - KB_LINK = 'https://webggc.oclc.org/cbs/DB=2.37/PPN?PPN={}' - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/kb' - ) + sru_url = "http://jsru.kb.nl/sru" + sru_version = "1.2" + KB_LINK = "https://webggc.oclc.org/cbs/DB=2.37/PPN?PPN={}" + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/kb") READERTYPE = BIBLIOGRAPHICAL IRI_PREFIX = "https://edpop.hum.uu.nl/readers/kb/" SHORT_NAME = "Koninklijke Bibliotheek (KB)" @@ -20,9 +18,7 @@ class KBReader(SRUReader): def __init__(self): super().__init__() # The KB SRU requires 'x-collection' as an additional GET parameter - self.session.params = { - 'x-collection': 'GGC' - } + self.session.params = {"x-collection": "GGC"} @classmethod def transform_query(cls, query: str) -> str: @@ -32,12 +28,12 @@ def _find_ppn(self, data: dict): """Try to find the PPN given the data that comes from the SRU server; return None if PPN cannot be found""" # This seems to work fine; not thoroughly tested. - oai_pmh_identifier = data.get('OaiPmhIdentifier', None) + oai_pmh_identifier = data.get("OaiPmhIdentifier", None) if not isinstance(oai_pmh_identifier, str): return None - PREFIX = 'GGC:AC:' + PREFIX = "GGC:AC:" if oai_pmh_identifier and oai_pmh_identifier.startswith(PREFIX): - return oai_pmh_identifier[len(PREFIX):] + return oai_pmh_identifier[len(PREFIX) :] return None def _convert_record(self, sruthirecord: dict) -> BibliographicalRecord: @@ -56,14 +52,14 @@ def _convert_record(self, sruthirecord: dict) -> BibliographicalRecord: record.languages = self._get_languages(sruthirecord) # TODO: add the other fields return record - + def _get_title(self, data) -> Optional[Field]: - if 'title' in data: - title = data['title'] + if "title" in data: + title = data["title"] if isinstance(title, list): # Title contains a list of strings if it consists of multiple # parts - return Field(' : '.join(title)) + return Field(" : ".join(title)) else: return Field(title) else: @@ -75,10 +71,11 @@ def _get_languages(self, data) -> Optional[List[Field]]: # One of them is always a three-letter language code, so only # pass on these. NB: there is a possibility that not all entries # consisting of three characters are language codes. - if 'language' not in data: + if "language" not in data: return [] fields = [ - LanguageField(x) for x in data['language'] + LanguageField(x) + for x in data["language"] if isinstance(x, str) and len(x) == 3 ] for field in fields: diff --git a/edpop_explorer/readers/kvcs.py b/edpop_explorer/readers/kvcs.py index c7834f1..b17291f 100644 --- a/edpop_explorer/readers/kvcs.py +++ b/edpop_explorer/readers/kvcs.py @@ -1,16 +1,22 @@ import csv from typing import List -from edpop_explorer import Reader, ReaderError, Field, BiographicalRecord, BIOGRAPHICAL, DatabaseFileMixin +from edpop_explorer import ( + Reader, + ReaderError, + Field, + BiographicalRecord, + BIOGRAPHICAL, + DatabaseFileMixin, +) from rdflib import URIRef class KVCSReader(DatabaseFileMixin, Reader): - """ KVCS database reader. Access with command 'kvcs'.""" - DATABASE_URL = 'https://dhstatic.hum.uu.nl/edpop/biblio_kvcs.csv' - DATABASE_FILENAME = 'biblio_kvcs.csv' - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/kvcs' - ) + """KVCS database reader. Access with command 'kvcs'.""" + + DATABASE_URL = "https://dhstatic.hum.uu.nl/edpop/biblio_kvcs.csv" + DATABASE_FILENAME = "biblio_kvcs.csv" + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/kvcs") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/kvcs/" FETCH_ALL_AT_ONCE = True SHORT_NAME = "KVCS" @@ -21,13 +27,13 @@ class KVCSReader(DatabaseFileMixin, Reader): def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: record = BiographicalRecord(from_reader=cls) record.data = rawrecord - record.identifier = rawrecord['ID'] - record.name = Field(rawrecord['Name']) - record.gender = Field(rawrecord['Gender']) - record.lifespan = Field(rawrecord['Years of life']) - record.places_of_activity = Field(rawrecord['City']) - record.activity_timespan = Field(rawrecord['Years of activity']) - record.activities = Field(rawrecord['Kind of print and sales activities']) + record.identifier = rawrecord["ID"] + record.name = Field(rawrecord["Name"]) + record.gender = Field(rawrecord["Gender"]) + record.lifespan = Field(rawrecord["Years of life"]) + record.places_of_activity = Field(rawrecord["City"]) + record.activity_timespan = Field(rawrecord["Years of activity"]) + record.activities = Field(rawrecord["Kind of print and sales activities"]) return record @classmethod @@ -39,27 +45,27 @@ def transform_query(cls, query) -> str: def get_by_id(cls, identifier: str) -> BiographicalRecord: reader = cls() reader.prepare_data() - with open(reader.database_path, 'r', encoding='utf-8-sig') as file: - reader = csv.DictReader(file, delimiter=';') + with open(reader.database_path, "r", encoding="utf-8-sig") as file: + reader = csv.DictReader(file, delimiter=";") for row in reader: - if row['ID'] == identifier: + if row["ID"] == identifier: return cls._convert_record(row) raise ReaderError(f"Item with id {identifier} does not exist.") - + def _perform_query(self) -> List[BiographicalRecord]: assert isinstance(self.prepared_query, str) self.prepare_data() - + # Search query in all columns, and fetch results based on query results = [] - with open(self.database_path, 'r', encoding='utf-8-sig') as file: - reader = csv.DictReader(file, delimiter=';') + with open(self.database_path, "r", encoding="utf-8-sig") as file: + reader = csv.DictReader(file, delimiter=";") for row in reader: for key in row.keys(): if self.prepared_query.lower() in row[key].lower(): results.append(row) break - + self.number_of_results = len(results) records = [] for result in results: @@ -70,11 +76,11 @@ def _perform_query(self) -> List[BiographicalRecord]: def fetch_range(self, range_to_fetch: range) -> range: if self.prepared_query is None: - raise ReaderError('First call prepare_query') + raise ReaderError("First call prepare_query") if self.fetching_exhausted: return range(0) start_record = range_to_fetch.start results = self._perform_query() for i, result in enumerate(results): self.records[i] = result - return range(start_record, start_record + len(results)) \ No newline at end of file + return range(start_record, start_record + len(results)) diff --git a/edpop_explorer/readers/pierre_belle.py b/edpop_explorer/readers/pierre_belle.py index 0ba6af0..0d5c1fd 100644 --- a/edpop_explorer/readers/pierre_belle.py +++ b/edpop_explorer/readers/pierre_belle.py @@ -1,36 +1,44 @@ import csv from typing import List -from edpop_explorer import Reader, ReaderError, BibliographicalRecord, Field, DatabaseFileMixin, BIBLIOGRAPHICAL +from edpop_explorer import ( + Reader, + ReaderError, + BibliographicalRecord, + Field, + DatabaseFileMixin, + BIBLIOGRAPHICAL, +) from rdflib import URIRef from edpop_explorer.fields import LanguageField class PierreBelleReader(DatabaseFileMixin, Reader): - """ Pierre-Belle database reader. Access with command 'pb'.""" - DATABASE_URL = 'https://dhstatic.hum.uu.nl/edpop/biblio_pierrebelle.csv' - DATABASE_FILENAME = 'biblio_pierrebelle.csv' - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/pierre_belle' - ) + """Pierre-Belle database reader. Access with command 'pb'.""" + + DATABASE_URL = "https://dhstatic.hum.uu.nl/edpop/biblio_pierrebelle.csv" + DATABASE_FILENAME = "biblio_pierrebelle.csv" + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/pierre_belle") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/pierre_belle/" FETCH_ALL_AT_ONCE = True READERTYPE = BIBLIOGRAPHICAL SHORT_NAME = "Pierre and Belle" - DESCRIPTION = "Bibliography of early modern editions of Pierre de " \ + DESCRIPTION = ( + "Bibliography of early modern editions of Pierre de " "Provence et la Belle Maguelonne (ca. 1470-ca. 1800)" + ) @classmethod def _convert_record(cls, rawrecord: dict) -> BibliographicalRecord: record = BibliographicalRecord(from_reader=cls) record.data = rawrecord - record.identifier = rawrecord['ID'] - record.title = Field(rawrecord['Shortened title']) - record.languages = [LanguageField(rawrecord['Language'])] + record.identifier = rawrecord["ID"] + record.title = Field(rawrecord["Shortened title"]) + record.languages = [LanguageField(rawrecord["Language"])] [x.normalize() for x in record.languages] - record.publisher_or_printer = Field(rawrecord['Publisher']) - record.place_of_publication = Field(rawrecord['Place of publication']) - record.dating = Field(rawrecord['Date']) + record.publisher_or_printer = Field(rawrecord["Publisher"]) + record.place_of_publication = Field(rawrecord["Place of publication"]) + record.dating = Field(rawrecord["Date"]) return record @classmethod @@ -42,27 +50,27 @@ def transform_query(cls, query) -> str: def get_by_id(cls, identifier: str) -> BibliographicalRecord: reader = cls() reader.prepare_data() - with open(reader.database_path, 'r', encoding='utf-8-sig') as file: - reader = csv.DictReader(file, delimiter=';') + with open(reader.database_path, "r", encoding="utf-8-sig") as file: + reader = csv.DictReader(file, delimiter=";") for row in reader: - if row['ID'] == identifier: + if row["ID"] == identifier: return cls._convert_record(row) raise ReaderError(f"Item with id {identifier} does not exist.") - + def _perform_query(self) -> List[BibliographicalRecord]: assert isinstance(self.prepared_query, str) self.prepare_data() - + # Search query in all columns, and fetch results based on query results = [] - with open(self.database_path, 'r', encoding='utf-8-sig') as file: - reader = csv.DictReader(file, delimiter=';') + with open(self.database_path, "r", encoding="utf-8-sig") as file: + reader = csv.DictReader(file, delimiter=";") for row in reader: for key in row.keys(): if self.prepared_query in row[key]: results.append(row) break - + self.number_of_results = len(results) records = [] for result in results: @@ -73,11 +81,11 @@ def _perform_query(self) -> List[BibliographicalRecord]: def fetch_range(self, range_to_fetch: range) -> range: if self.prepared_query is None: - raise ReaderError('First call prepare_query') + raise ReaderError("First call prepare_query") if self.fetching_exhausted: return range(0) start_record = range_to_fetch.start results = self._perform_query() for i, result in enumerate(results): self.records[i] = result - return range(start_record, start_record + len(results)) \ No newline at end of file + return range(start_record, start_record + len(results)) diff --git a/edpop_explorer/readers/sbtireader.py b/edpop_explorer/readers/sbtireader.py index f37c692..7971cc3 100644 --- a/edpop_explorer/readers/sbtireader.py +++ b/edpop_explorer/readers/sbtireader.py @@ -1,26 +1,24 @@ from rdflib import URIRef from typing import List, Dict, Optional -from edpop_explorer import ( - BiographicalRecord, Field, BIOGRAPHICAL -) +from edpop_explorer import BiographicalRecord, Field, BIOGRAPHICAL from edpop_explorer.cerl import CERLReader class SBTIReader(CERLReader): - API_URL = 'https://data.cerl.org/sbti/_search' - API_BY_ID_BASE_URL = 'https://data.cerl.org/sbti/' - LINK_BASE_URL = 'https://data.cerl.org/sbti/' + API_URL = "https://data.cerl.org/sbti/_search" + API_BY_ID_BASE_URL = "https://data.cerl.org/sbti/" + LINK_BASE_URL = "https://data.cerl.org/sbti/" additional_params: Optional[Dict[str, str]] = None - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/sbti' - ) + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/sbti") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/sbti/" DEFAULT_RECORDS_PER_PAGE = 10 READERTYPE = BIOGRAPHICAL SHORT_NAME = "Scottish Book Trade Index (SBTI)" - DESCRIPTION = "An index of the names, trades and addresses of people "\ + DESCRIPTION = ( + "An index of the names, trades and addresses of people " "involved in printing in Scotland up to 1850" + ) @classmethod def _get_name_field(cls, data: dict) -> Optional[Field]: @@ -37,9 +35,9 @@ def _get_name_field(cls, data: dict) -> Optional[Field]: def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: record = BiographicalRecord(from_reader=cls) record.data = rawrecord - record.identifier = rawrecord.get('id', None) + record.identifier = rawrecord.get("id", None) if not record.identifier: - record.identifier = rawrecord.get('_id', None) + record.identifier = rawrecord.get("_id", None) if record.identifier: record.link = cls.LINK_BASE_URL + record.identifier @@ -65,4 +63,3 @@ def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: record.places_of_activity.append(field) return record - diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index 527c775..c9476ce 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -5,14 +5,12 @@ from edpop_explorer import Field, BIBLIOGRAPHICAL, BibliographicalRecord, LocationField from edpop_explorer.cerl import CERLReader from edpop_explorer.fields import LanguageField, ContributorField -from edpop_explorer.sparqlreader import ( - SparqlReader, BibliographicalRDFRecord -) +from edpop_explorer.sparqlreader import SparqlReader, BibliographicalRDFRecord def _remove_markup(input_str: str) -> str: """Remove STCN-specific markup""" - return input_str.replace('`IT`', '').replace('`LO`', '') + return input_str.replace("`IT`", "").replace("`LO`", "") def safeget(dictionary: Optional[dict], attribute_chain: tuple, first: bool = False): @@ -29,12 +27,10 @@ def safeget(dictionary: Optional[dict], attribute_chain: tuple, first: bool = Fa class STCNReader(CERLReader): - API_URL = 'https://data.cerl.org/stcn/_search' - API_BY_ID_BASE_URL = 'https://data.cerl.org/stcn/' - LINK_BASE_URL = 'https://data.cerl.org/stcn/' - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/stcn' - ) + API_URL = "https://data.cerl.org/stcn/_search" + API_BY_ID_BASE_URL = "https://data.cerl.org/stcn/" + LINK_BASE_URL = "https://data.cerl.org/stcn/" + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/stcn") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/stcn/" READERTYPE = BIBLIOGRAPHICAL SHORT_NAME = "Short-Title Catalogue Netherlands (STCN)" @@ -59,7 +55,7 @@ def _get_contributors(cls, rawrecord: dict) -> list[Field]: continue contributor = ContributorField(name) contributor.name = name - contributor.role = safeget(actor, ('role',), first=True) + contributor.role = safeget(actor, ("role",), first=True) contributors.append(contributor) return contributors @@ -124,7 +120,9 @@ def _get_collation_formula(cls, rawrecord: dict) -> Optional[Field]: if not collations: return None # Multiple collation formulas are possible, but this seems to be rare. - collation_string = ' ; '.join([x.get("value") for x in collations if "value" in x]) + collation_string = " ; ".join( + [x.get("value") for x in collations if "value" in x] + ) return Field(collation_string) @classmethod @@ -133,7 +131,9 @@ def _get_fingerprint(cls, rawrecord: dict) -> Optional[Field]: if not fingerprints: return None # Multiple fingerprints are possible, but this seems to be rare - fingerprint_string = ' ; '.join([x.get("fingerprint") for x in fingerprints if "fingerprint" in x]) + fingerprint_string = " ; ".join( + [x.get("fingerprint") for x in fingerprints if "fingerprint" in x] + ) return Field(fingerprint_string) @classmethod @@ -161,7 +161,7 @@ def _get_holdings(cls, rawrecord: dict) -> list[Field]: def _convert_record(cls, rawrecord: dict) -> BibliographicalRecord: record = BibliographicalRecord(from_reader=cls) record.data = rawrecord - record.identifier = rawrecord.get('id', None) + record.identifier = rawrecord.get("id", None) if record.identifier: record.link = cls.LINK_BASE_URL + record.identifier record.title = cls._get_title(rawrecord) diff --git a/edpop_explorer/readers/ustc.py b/edpop_explorer/readers/ustc.py index 231a43a..2a5f360 100644 --- a/edpop_explorer/readers/ustc.py +++ b/edpop_explorer/readers/ustc.py @@ -3,20 +3,23 @@ from rdflib import URIRef from edpop_explorer import ( - Reader, BibliographicalRecord, ReaderError, Field, BIBLIOGRAPHICAL, - GetByIdBasedOnQueryMixin, DatabaseFileMixin + Reader, + BibliographicalRecord, + ReaderError, + Field, + BIBLIOGRAPHICAL, + GetByIdBasedOnQueryMixin, + DatabaseFileMixin, ) from edpop_explorer.fields import LanguageField from edpop_explorer.sql import SQLPreparedQuery class USTCReader(DatabaseFileMixin, GetByIdBasedOnQueryMixin, Reader): - DATABASE_FILENAME = 'ustc.sqlite3' - USTC_LINK = 'https://www.ustc.ac.uk/editions/{}' + DATABASE_FILENAME = "ustc.sqlite3" + USTC_LINK = "https://www.ustc.ac.uk/editions/{}" READERTYPE = BIBLIOGRAPHICAL - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/ustc' - ) + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/ustc") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/ustc/" prepared_query: Optional[SQLPreparedQuery] = None FETCH_ALL_AT_ONCE = True @@ -27,19 +30,19 @@ class USTCReader(DatabaseFileMixin, GetByIdBasedOnQueryMixin, Reader): def transform_query(cls, query: str) -> SQLPreparedQuery: if len(query.strip()) < 3: # Do not allow very short USTC queries because they are very slow - raise ReaderError('USTC query must have at least 3 characters.') - where_statement = ( - 'WHERE E.std_title LIKE ? ' - 'OR E.author_name_1 LIKE ? ' - 'OR E.author_name_2 LIKE ? ' - 'OR E.author_name_3 LIKE ? ' - 'OR E.author_name_4 LIKE ? ' - 'OR E.author_name_5 LIKE ? ' - 'OR E.author_name_6 LIKE ? ' - 'OR E.author_name_7 LIKE ? ' - 'OR E.author_name_8 LIKE ? ' + raise ReaderError("USTC query must have at least 3 characters.") + where_statement = ( + "WHERE E.std_title LIKE ? " + "OR E.author_name_1 LIKE ? " + "OR E.author_name_2 LIKE ? " + "OR E.author_name_3 LIKE ? " + "OR E.author_name_4 LIKE ? " + "OR E.author_name_5 LIKE ? " + "OR E.author_name_6 LIKE ? " + "OR E.author_name_7 LIKE ? " + "OR E.author_name_8 LIKE ? " ) - like_argument = '%' + query + '%' + like_argument = "%" + query + "%" arguments: List[Union[str, int]] = [like_argument for _ in range(9)] return SQLPreparedQuery(where_statement, arguments) @@ -50,8 +53,7 @@ def _prepare_get_by_id_query(cls, identifier: str) -> SQLPreparedQuery: except ValueError: raise ReaderError(f"Identifier {identifier} is not an integer") return SQLPreparedQuery( - where_statement="WHERE E.sn = ?", - arguments=[identifier_int] + where_statement="WHERE E.sn = ?", arguments=[identifier_int] ) def fetch_range(self, range_to_fetch: range) -> range: @@ -62,19 +64,19 @@ def fetch_range(self, range_to_fetch: range) -> range: # locally stored. if not self.prepared_query: - raise ReaderError('No query has been set') + raise ReaderError("No query has been set") if self.fetching_exhausted: return range(0) cur = con.cursor() - columns = [x[1] for x in cur.execute('PRAGMA table_info(editions)')] + columns = [x[1] for x in cur.execute("PRAGMA table_info(editions)")] # This kind of query is far from ideal, but the alternative is to # implement SQLite full text search which is probably too much work # for our current goal (i.e. getting insight in the data structures) res = cur.execute( - 'SELECT E.* FROM editions E ' + "SELECT E.* FROM editions E " + self.prepared_query.where_statement - + ' ORDER BY E.id', + + " ORDER BY E.id", self.prepared_query.arguments, ) for i, row in enumerate(res): @@ -89,29 +91,28 @@ def fetch_range(self, range_to_fetch: range) -> range: def _convert_record(self, data: dict) -> BibliographicalRecord: record = BibliographicalRecord(from_reader=self.__class__) record.data = data - record.identifier = data['sn'] - record.link = self.USTC_LINK.format(data['sn']) - record.title = Field(data['std_title']) + record.identifier = data["sn"] + record.link = self.USTC_LINK.format(data["sn"]) + record.title = Field(data["std_title"]) record.contributors = [] for i in range(8): - fieldname = f'author_name_{i + 1}' + fieldname = f"author_name_{i + 1}" if data[fieldname]: record.contributors.append(Field(data[fieldname])) - if data['printer_name_1']: + if data["printer_name_1"]: # TODO: support for multiple printers - record.publisher_or_printer = Field(data['printer_name_1']) - if data['place']: - record.place_of_publication = Field(data['place']) - if data['year']: - record.dating = Field(data['year']) + record.publisher_or_printer = Field(data["printer_name_1"]) + if data["place"]: + record.place_of_publication = Field(data["place"]) + if data["year"]: + record.dating = Field(data["year"]) record.languages = [] for i in range(4): - fieldname = f'language_{i + 1}' + fieldname = f"language_{i + 1}" if data[fieldname]: field = LanguageField(data[fieldname]) field.normalize() record.languages.append(field) - if data['pagination']: - record.extent = Field(data['pagination']) + if data["pagination"]: + record.extent = Field(data["pagination"]) return record - diff --git a/edpop_explorer/readers/vd.py b/edpop_explorer/readers/vd.py index 260e805..8cd16e3 100644 --- a/edpop_explorer/readers/vd.py +++ b/edpop_explorer/readers/vd.py @@ -5,14 +5,14 @@ from edpop_explorer import SRUMarc21BibliographicalReader, Marc21Data -class VDCommonMixin(): +class VDCommonMixin: LINK_FORMAT: str @classmethod def _get_identifier(cls, data: Marc21Data) -> Optional[str]: - field024 = data.get_first_field('024') + field024 = data.get_first_field("024") if field024: - return field024.subfields.get('a', None) + return field024.subfields.get("a", None) else: return None @@ -20,16 +20,14 @@ def _get_identifier(cls, data: Marc21Data) -> Optional[str]: def _get_link(cls, record: Marc21Data) -> Optional[str]: identifier = cls._get_identifier(record) if identifier: - return cls.LINK_FORMAT.format(identifier).replace(' ', '+') + return cls.LINK_FORMAT.format(identifier).replace(" ", "+") class VD16Reader(VDCommonMixin, SRUMarc21BibliographicalReader): - sru_url = 'http://bvbr.bib-bvb.de:5661/bvb01sru' - sru_version = '1.1' - LINK_FORMAT = 'http://gateway-bayern.de/{}' # Spaces should be replaced by + - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/vd16' - ) + sru_url = "http://bvbr.bib-bvb.de:5661/bvb01sru" + sru_version = "1.1" + LINK_FORMAT = "http://gateway-bayern.de/{}" # Spaces should be replaced by + + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/vd16") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/vd16/" SHORT_NAME = "VD16" DESCRIPTION = "Verzeichnis der im deutschen Sprachbereich erschienenen Drucke des 16. Jahrhunderts" @@ -38,17 +36,14 @@ class VD16Reader(VDCommonMixin, SRUMarc21BibliographicalReader): def transform_query(cls, query: str) -> str: # This SRU URL combines multiple databases, so make sure only VD16 is # queried - return 'VD16 and ({})'.format(query) + return "VD16 and ({})".format(query) class VD17Reader(VDCommonMixin, SRUMarc21BibliographicalReader): - sru_url = 'http://sru.k10plus.de/vd17' - sru_version = '1.1' - LINK_FORMAT = \ - 'https://kxp.k10plus.de/DB=1.28/CMD?ACT=SRCHA&IKT=8079&TRM=%27{}%27' - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/vd17' - ) + sru_url = "http://sru.k10plus.de/vd17" + sru_version = "1.1" + LINK_FORMAT = "https://kxp.k10plus.de/DB=1.28/CMD?ACT=SRCHA&IKT=8079&TRM=%27{}%27" + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/vd17") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/vd17/" SHORT_NAME = "VD17" DESCRIPTION = "Verzeichnis der im deutschen Sprachbereich erschienenen Drucke des 17. Jahrhunderts" @@ -59,14 +54,14 @@ def transform_query(cls, query: str) -> str: class VD18Reader(VDCommonMixin, SRUMarc21BibliographicalReader): - sru_url = 'http://sru.k10plus.de/vd18' - sru_version = '1.1' - LINK_FORMAT = 'https://kxp.k10plus.de/DB=1.65/SET=1/TTL=1/CMD?ACT=SRCHA&' \ - 'IKT=1016&SRT=YOP&TRM={}&ADI_MAT=B&MATCFILTER=Y&MATCSET=Y&ADI_MAT=T&' \ - 'REC=*' - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/vd18' + sru_url = "http://sru.k10plus.de/vd18" + sru_version = "1.1" + LINK_FORMAT = ( + "https://kxp.k10plus.de/DB=1.65/SET=1/TTL=1/CMD?ACT=SRCHA&" + "IKT=1016&SRT=YOP&TRM={}&ADI_MAT=B&MATCFILTER=Y&MATCSET=Y&ADI_MAT=T&" + "REC=*" ) + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/vd18") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/vd18/" SHORT_NAME = "VD18" DESCRIPTION = "Verzeichnis der im deutschen Sprachbereich erschienenen Drucke des 18. Jahrhunderts" @@ -79,22 +74,22 @@ def transform_query(cls, query: str) -> str: def _get_identifier(cls, record: Marc21Data): # The record id is in field 024 for which subfield 2 is vd18. There # may be more than one occurance of field 024. - fields024 = record.get_fields('024') + fields024 = record.get_fields("024") for field in fields024: - if '2' in field.subfields and \ - 'a' in field.subfields and \ - field.subfields['2'] == 'vd18': - return field.subfields['a'][5:] + if ( + "2" in field.subfields + and "a" in field.subfields + and field.subfields["2"] == "vd18" + ): + return field.subfields["a"][5:] return None class VDLiedReader(VDCommonMixin, SRUMarc21BibliographicalReader): - sru_url = 'http://sru.k10plus.de/vdlied' - sru_version = '1.1' - LINK_FORMAT = 'https://gso.gbv.de/DB=1.60/PPNSET?PPN={}' - CATALOG_URIREF = URIRef( - 'https://edpop.hum.uu.nl/readers/vdlied' - ) + sru_url = "http://sru.k10plus.de/vdlied" + sru_version = "1.1" + LINK_FORMAT = "https://gso.gbv.de/DB=1.60/PPNSET?PPN={}" + CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/vdlied") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/vdlied/" SHORT_NAME = "VDLied" DESCRIPTION = "Das Verzeichnis der deutschsprachigen Liedflugschriften" diff --git a/edpop_explorer/record.py b/edpop_explorer/record.py index 9d6d16b..8c38d6f 100644 --- a/edpop_explorer/record.py +++ b/edpop_explorer/record.py @@ -4,7 +4,11 @@ from rdflib import URIRef, Graph, BNode, RDF, Literal from edpop_explorer import ( - EDPOPREC, Field, BIBLIOGRAPHICAL, BIOGRAPHICAL, bind_common_namespaces + EDPOPREC, + Field, + BIBLIOGRAPHICAL, + BIOGRAPHICAL, + bind_common_namespaces, ) if TYPE_CHECKING: @@ -43,7 +47,7 @@ class Record: basic attributes and the fields are ``None`` by default. Subclasses should override the ``_rdf_class`` attribute to - the corresponding RDF class. They should define additional + the corresponding RDF class. They should define additional fields by adding additional public attributes defaulting to ``None`` and by registring them in the ``_fields`` attribute. For registring, a constructor ``__init__`` should be defined @@ -52,17 +56,18 @@ class Record: ``('', EDPOPREC., )``. """ + #: The raw original data of a record. data: Union[None, dict, RawData] = None _fields: List[Tuple[str, URIRef, Type[Field]]] _rdf_class: Node = EDPOPREC.Record link: Optional[str] = None - '''A user-friendly link where the user can find the record.''' + """A user-friendly link where the user can find the record.""" identifier: Optional[str] = None - '''Unique identifier used by the source catalog.''' + """Unique identifier used by the source catalog.""" from_reader: Type["Reader"] - '''The subject node, which will be used to convert the record to - RDF. This is a blank node by default.''' + """The subject node, which will be used to convert the record to + RDF. This is a blank node by default.""" _graph: Optional[Graph] = None _bnode: Optional[BNode] = None @@ -71,10 +76,10 @@ def __init__(self, from_reader: Type["Reader"]): self.from_reader = from_reader def to_graph(self) -> Graph: - '''Return an RDF graph for this record.''' + """Return an RDF graph for this record.""" self.fetch() g = Graph() - + # Set basic properties rdfclass = EDPOPREC.Record if self.from_reader: @@ -82,37 +87,28 @@ def to_graph(self) -> Graph: rdfclass = EDPOPREC.BiographicalRecord elif self.from_reader.READERTYPE == BIBLIOGRAPHICAL: rdfclass = EDPOPREC.BibliographicalRecord - g.add(( - self.subject_node, - RDF.type, - rdfclass - )) - if self.from_reader is not None and \ - self.from_reader.CATALOG_URIREF is not None: - g.add(( - self.subject_node, - EDPOPREC.fromCatalog, - self.from_reader.CATALOG_URIREF - )) + g.add((self.subject_node, RDF.type, rdfclass)) + if self.from_reader is not None and self.from_reader.CATALOG_URIREF is not None: + g.add( + ( + self.subject_node, + EDPOPREC.fromCatalog, + self.from_reader.CATALOG_URIREF, + ) + ) if self.identifier: - g.add(( - self.subject_node, - EDPOPREC.identifier, - Literal(self.identifier) - )) + g.add((self.subject_node, EDPOPREC.identifier, Literal(self.identifier))) if self.link: - g.add(( - self.subject_node, - EDPOPREC.publicURL, - Literal(self.link) - )) + g.add((self.subject_node, EDPOPREC.publicURL, Literal(self.link))) original_data = self.get_data_dict() if original_data is not None: - g.add(( - self.subject_node, - EDPOPREC.originalData, - Literal(original_data, datatype=RDF.JSON) - )) + g.add( + ( + self.subject_node, + EDPOPREC.originalData, + Literal(original_data, datatype=RDF.JSON), + ) + ) # Put all fields from self.FIELDS in the graph by accessing # the associated attributes or properties. If they contain a @@ -162,21 +158,21 @@ def get_data_dict(self) -> Optional[dict]: def __str__(self): if self.identifier: - return f'{self.__class__} object ({self.identifier})' + return f"{self.__class__} object ({self.identifier})" else: - return f'{self.__class__} object' + return f"{self.__class__} object" def fetch(self) -> None: - '''Fetch the full contents of the record if this record works with + """Fetch the full contents of the record if this record works with lazy loading (i.e., if the record's class derives from ``RDFRecordMixin``). If the record is not lazy, this method does - nothing.''' + nothing.""" pass @property def iri(self) -> Optional[str]: - '''A stable IRI based on the `identifier` attribute. `None` if - the `identifier` attribute is not set.''' + """A stable IRI based on the `identifier` attribute. `None` if + the `identifier` attribute is not set.""" if self.identifier: return self.from_reader.identifier_to_iri(self.identifier) else: @@ -184,8 +180,8 @@ def iri(self) -> Optional[str]: @property def subject_node(self) -> Node: - '''A subject node based on the `identifier` attribute. If the - `identifier` attribute is not set, a blank node.''' + """A subject node based on the `identifier` attribute. If the + `identifier` attribute is not set, a blank node.""" iri = self.iri if iri is not None: return URIRef(iri) @@ -197,11 +193,12 @@ def subject_node(self) -> Node: class BibliographicalRecord(Record): - '''Python representation of edpoprec:BibliographicalRecord. + """Python representation of edpoprec:BibliographicalRecord. This subclass adds fields that are specific for bibliographical records. - ''' + """ + _rdf_class = EDPOPREC.BibliographicalRecord title: Optional[Field] = None alternative_title: Optional[Field] = None @@ -225,23 +222,23 @@ def __init__(self, from_reader: Type["Reader"]): super().__init__(from_reader) assert isinstance(self._fields, list) self._fields += [ - ('title', EDPOPREC.title, Field), - ('alternative_title', EDPOPREC.alternativeTitle, Field), - ('contributors', EDPOPREC.contributor, Field), - ('publisher_or_printer', EDPOPREC.publisherOrPrinter, Field), - ('place_of_publication', EDPOPREC.placeOfPublication, Field), - ('dating', EDPOPREC.dating, Field), - ('languages', EDPOPREC.language, Field), - ('extent', EDPOPREC.extent, Field), - ('size', EDPOPREC.size, Field), - ('physical_description', EDPOPREC.physicalDescription, Field), - ('bookseller', EDPOPREC.bookseller, Field), - ('location', EDPOPREC.location, Field), - ('format', EDPOPREC.format, Field), - ('fingerprint', EDPOPREC.fingerprint, Field), - ('collation_formula', EDPOPREC.collationFormula, Field), - ('genres', EDPOPREC.genre, Field), - ('holdings', EDPOPREC.holdings, Field), + ("title", EDPOPREC.title, Field), + ("alternative_title", EDPOPREC.alternativeTitle, Field), + ("contributors", EDPOPREC.contributor, Field), + ("publisher_or_printer", EDPOPREC.publisherOrPrinter, Field), + ("place_of_publication", EDPOPREC.placeOfPublication, Field), + ("dating", EDPOPREC.dating, Field), + ("languages", EDPOPREC.language, Field), + ("extent", EDPOPREC.extent, Field), + ("size", EDPOPREC.size, Field), + ("physical_description", EDPOPREC.physicalDescription, Field), + ("bookseller", EDPOPREC.bookseller, Field), + ("location", EDPOPREC.location, Field), + ("format", EDPOPREC.format, Field), + ("fingerprint", EDPOPREC.fingerprint, Field), + ("collation_formula", EDPOPREC.collationFormula, Field), + ("genres", EDPOPREC.genre, Field), + ("holdings", EDPOPREC.holdings, Field), ] def __str__(self) -> str: @@ -252,10 +249,11 @@ def __str__(self) -> str: class BiographicalRecord(Record): - '''Python representation of edpoprec:BiographicalRecord. + """Python representation of edpoprec:BiographicalRecord. This subclass adds fields that are specific for biographical records. - ''' + """ + _rdf_class = EDPOPREC.BiographicalRecord name: Optional[Field] = None variant_names: Optional[List[Field]] = None @@ -271,15 +269,15 @@ def __init__(self, from_reader: Type["Reader"]): super().__init__(from_reader) assert isinstance(self._fields, list) self._fields += [ - ('name', EDPOPREC.title, Field), - ('variant_names', EDPOPREC.variantName, Field), - ('place_of_birth', EDPOPREC.placeOfBirth, Field), - ('place_of_death', EDPOPREC.placeOfDeath, Field), - ('places_of_activity', EDPOPREC.placeOfActivity, Field), - ('activity_timespan', EDPOPREC.timespan, Field), - ('activities', EDPOPREC.activity, Field), - ('gender', EDPOPREC.gender, Field), - ('lifespan', EDPOPREC.lifespan, Field), + ("name", EDPOPREC.title, Field), + ("variant_names", EDPOPREC.variantName, Field), + ("place_of_birth", EDPOPREC.placeOfBirth, Field), + ("place_of_death", EDPOPREC.placeOfDeath, Field), + ("places_of_activity", EDPOPREC.placeOfActivity, Field), + ("activity_timespan", EDPOPREC.timespan, Field), + ("activities", EDPOPREC.activity, Field), + ("gender", EDPOPREC.gender, Field), + ("lifespan", EDPOPREC.lifespan, Field), ] def __str__(self) -> str: @@ -290,13 +288,14 @@ def __str__(self) -> str: class LazyRecordMixin(ABC): - '''Abstract mixin that adds an interface for lazy loading to a Record. + """Abstract mixin that adds an interface for lazy loading to a Record. To use, implement the ``fetch()`` method and make sure that it fills - the record's ``data`` attributes and its Fields and that the - ``fetched`` attribute is set to ``True``.''' + the record's ``data`` attributes and its Fields and that the + ``fetched`` attribute is set to ``True``.""" + fetched: bool = False - + @abstractmethod def fetch(self) -> None: pass diff --git a/edpop_explorer/sparqlreader.py b/edpop_explorer/sparqlreader.py index 2d6f969..09fb648 100644 --- a/edpop_explorer/sparqlreader.py +++ b/edpop_explorer/sparqlreader.py @@ -7,26 +7,28 @@ from typing_extensions import override from edpop_explorer import ( - Reader, Record, BibliographicalRecord, ReaderError, RecordError, - LazyRecordMixin + Reader, + Record, + BibliographicalRecord, + ReaderError, + RecordError, + LazyRecordMixin, ) PREFIXES = { - 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', - 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', - 'schema': 'http://schema.org/', - 'owl': 'http://www.w3.org/2002/07/owl#', + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "rdfs": "http://www.w3.org/2000/01/rdf-schema#", + "schema": "http://schema.org/", + "owl": "http://www.w3.org/2002/07/owl#", } -PREFIXES_REVERSE_REPLACEMENT_TABLE = { - PREFIXES[key]: (key + ':') for key in PREFIXES -} +PREFIXES_REVERSE_REPLACEMENT_TABLE = {PREFIXES[key]: (key + ":") for key in PREFIXES} -PREFIX_DEFINITIONS = '\n'.join([ - f'prefix {key}: <{PREFIXES[key]}>' for key in PREFIXES -]) +PREFIX_DEFINITIONS = "\n".join([f"prefix {key}: <{PREFIXES[key]}>" for key in PREFIXES]) -prepare_listing_query = (PREFIX_DEFINITIONS + """ +prepare_listing_query = ( + PREFIX_DEFINITIONS + + """ select ?s ?name where {{ ?s ?p ?o . @@ -35,7 +37,8 @@ FILTER (regex(?o, "{query}","i")) }} order by ?s -""").format +""" +).format prepare_lookup_query = """ prefix schema: @@ -47,8 +50,8 @@ def replace_fqu_with_prefixed_uris(inputstring: str) -> str: - '''Replace fully qualified URIs to prefixed URIs if they occur in - the prefix table in the prefixes attribute''' + """Replace fully qualified URIs to prefixed URIs if they occur in + the prefix table in the prefixes attribute""" for key in PREFIXES_REVERSE_REPLACEMENT_TABLE: inputstring = inputstring.replace( key, PREFIXES_REVERSE_REPLACEMENT_TABLE[key], 1 @@ -57,7 +60,8 @@ def replace_fqu_with_prefixed_uris(inputstring: str) -> str: class RDFRecordMixin(LazyRecordMixin): - '''Mixin that adds lazy RDF fetching functionality to a Record.''' + """Mixin that adds lazy RDF fetching functionality to a Record.""" + identifier: Optional[str] = None fetched: bool = False data: Optional[dict] = None @@ -70,9 +74,7 @@ def fetch(self) -> None: # as data that rdflib can process. We might need to support # IRIs that can only be accessed via an endpoint as well. if not self.identifier: - raise RecordError( - 'identifier (subject IRI) has not been set' - ) + raise RecordError("identifier (subject IRI) has not been set") if self.fetched: return try: @@ -86,9 +88,7 @@ def fetch(self) -> None: f"{self.identifier}: {err}" ) # Convert to JSON for raw data attribute - self.data = json.loads( - self.original_graph.serialize(format="json-ld") - ) + self.data = json.loads(self.original_graph.serialize(format="json-ld")) # Call Reader's data conversion method to fill the record's Fields assert isinstance(self, Record) assert issubclass(self.from_reader, SparqlReader) @@ -112,9 +112,7 @@ class SparqlReader(Reader): @override def transform_query(cls, query: str): return prepare_listing_query( - name_predicate=cls.name_predicate, - filter=cls.filter, - query=query + name_predicate=cls.name_predicate, filter=cls.filter, query=query ) @classmethod @@ -126,7 +124,7 @@ def get_by_id(cls, identifier: str) -> Record: def fetch_range(self, range_to_fetch: range) -> range: # Fetch all records at one, because this is an expensive operation. if not self.prepared_query: - raise ReaderError('First call prepare_query method') + raise ReaderError("First call prepare_query method") if self.fetching_exhausted: return range(0, 0) wrapper = SPARQLWrapper(self.endpoint) @@ -135,29 +133,27 @@ def fetch_range(self, range_to_fetch: range) -> range: try: response = wrapper.queryAndConvert() except SPARQLExceptions.QueryBadFormed as err: - raise ReaderError( - 'Malformed SPARQL query: {}'.format(err) - ) + raise ReaderError("Malformed SPARQL query: {}".format(err)) assert isinstance(response, dict) - results = response['results']['bindings'] + results = response["results"]["bindings"] self.records = {} self.number_of_results = len(results) for i, result in enumerate(results): - iri = result['s']['value'] - name = result['name']['value'] + iri = result["s"]["value"] + name = result["name"]["value"] self.records[i] = self._create_lazy_record(iri, name) return range(0, self.number_of_results) @classmethod @abstractmethod def convert_record(cls, graph: Graph, record: Record) -> None: - '''Convert data from an RDF graph to Fields in a Record. The - Record is changed in-place.''' + """Convert data from an RDF graph to Fields in a Record. The + Record is changed in-place.""" pass @classmethod @abstractmethod - def _create_lazy_record(cls, iri: str, name: Optional[str]=None) -> Record: + def _create_lazy_record(cls, iri: str, name: Optional[str] = None) -> Record: """Create a Record/LazyRecordMixin record object. This is the lazy record that is created after running the SPARQL diff --git a/edpop_explorer/srumarc21reader.py b/edpop_explorer/srumarc21reader.py index f02a80a..6549c3a 100644 --- a/edpop_explorer/srumarc21reader.py +++ b/edpop_explorer/srumarc21reader.py @@ -5,22 +5,26 @@ from abc import abstractmethod from edpop_explorer import ( - BibliographicalRecord, RawData, SRUReader, Field, BIBLIOGRAPHICAL + BibliographicalRecord, + RawData, + SRUReader, + Field, + BIBLIOGRAPHICAL, ) from edpop_explorer.fields import LanguageField -READABLE_FIELDS_FILE = Path(__file__).parent / 'M21_fields.csv' +READABLE_FIELDS_FILE = Path(__file__).parent / "M21_fields.csv" translation_dictionary: Dict[str, str] = {} with open(READABLE_FIELDS_FILE) as dictionary_file: reader = csv.DictReader(dictionary_file) for row in reader: - translation_dictionary[row['Tag number']] = \ - row[' Tag description'].strip() + translation_dictionary[row["Tag number"]] = row[" Tag description"].strip() @dataclass class Marc21Field: """Python representation of a single field in a Marc21 record""" + fieldnumber: str indicator1: str indicator2: str @@ -28,28 +32,24 @@ class Marc21Field: description: Optional[str] = None def __str__(self): - ''' + """ Return the usual marc21 representation - ''' + """ sf = [] - ind1 = self.indicator1 if self.indicator1.rstrip() != '' else '#' - ind2 = self.indicator1 if self.indicator2.rstrip() != '' else '#' - description = ' ({})'.format(self.description) \ - if self.description else '' + ind1 = self.indicator1 if self.indicator1.rstrip() != "" else "#" + ind2 = self.indicator1 if self.indicator2.rstrip() != "" else "#" + description = " ({})".format(self.description) if self.description else "" for subfield in self.subfields: - sf.append('$${} {}'.format(subfield, self.subfields[subfield])) - return '{}{}: {} {} {}'.format( - self.fieldnumber, - description, - ind1, - ind2, - ' '.join(sf) + sf.append("$${} {}".format(subfield, self.subfields[subfield])) + return "{}{}: {} {} {}".format( + self.fieldnumber, description, ind1, ind2, " ".join(sf) ) @dataclass class Marc21Data(RawData): """Python representation of the data inside a Marc21 record""" + # We use a list for the fields and not a dictionary because they may # appear more than once fields: List[Marc21Field] = dataclass_field(default_factory=list) @@ -57,18 +57,18 @@ class Marc21Data(RawData): raw: dict = dataclass_field(default_factory=dict) def get_first_field(self, fieldnumber: str) -> Optional[Marc21Field]: - '''Return the first occurance of a field with a given field number. + """Return the first occurance of a field with a given field number. May be useful for fields that appear only once, such as 245. - Return None if field is not found.''' + Return None if field is not found.""" for field in self.fields: if field.fieldnumber == fieldnumber: return field return None def get_first_subfield(self, fieldnumber: str, subfield: str) -> Optional[str]: - '''Return the requested subfield of the first occurance of a field with + """Return the requested subfield of the first occurance of a field with the given field number. Return None if field is not found or if the - subfield is not present on the first occurance of the field.''' + subfield is not present on the first occurance of the field.""" field = self.get_first_field(fieldnumber) if field is not None: return field.subfields.get(subfield, None) @@ -76,8 +76,8 @@ def get_first_subfield(self, fieldnumber: str, subfield: str) -> Optional[str]: return None def get_fields(self, fieldnumber: str) -> List[Marc21Field]: - '''Return a list of fields with a given field number. May return an - empty list if field does not occur.''' + """Return a list of fields with a given field number. May return an + empty list if field does not occur.""" returned_fields: List[Marc21Field] = [] for field in self.fields: if field.fieldnumber == fieldnumber: @@ -85,9 +85,9 @@ def get_fields(self, fieldnumber: str) -> List[Marc21Field]: return returned_fields def get_all_subfields(self, fieldnumber: str, subfield: str) -> List[str]: - '''Return a list of subfields that matches the requested field number + """Return a list of subfields that matches the requested field number and subfield. May return an empty list if the field and subfield do not - occur.''' + occur.""" fields = self.get_fields(fieldnumber) returned_subfields: List[str] = [] for field in fields: @@ -99,10 +99,11 @@ def to_dict(self) -> dict: return self.raw -class Marc21DataMixin(): +class Marc21DataMixin: """A mixin that adds a ``data`` attribute to a Record class to contain an instance of ``Marc21Data``. """ + data: Optional[Marc21Data] = None def show_record(self) -> str: @@ -111,26 +112,28 @@ def show_record(self) -> str: field_strings = [] for field in self.data.fields: field_strings.append(str(field)) - return '\n'.join(field_strings) + return "\n".join(field_strings) + class SRUMarc21Reader(SRUReader): - '''Subclass of ``SRUReader`` that adds Marc21 functionality. + """Subclass of ``SRUReader`` that adds Marc21 functionality. This class is still abstract and to create concrete readers - the ``_get_link()``, ``_get_identifier()`` + the ``_get_link()``, ``_get_identifier()`` and ``_convert_record`` methods should be implemented. .. automethod:: _convert_record .. automethod:: _get_link - .. automethod:: _get_identifier''' - marcxchange_prefix: str = '' + .. automethod:: _get_identifier""" + + marcxchange_prefix: str = "" @classmethod def _get_subfields(cls, sruthifield) -> list: # If there is only one subfield, sruthi puts it directly in # a dict, otherwise it uses a list of dicts. Make sure that # we always have a list. - subfielddata = sruthifield[f'{cls.marcxchange_prefix}subfield'] + subfielddata = sruthifield[f"{cls.marcxchange_prefix}subfield"] if isinstance(subfielddata, dict): sruthisubfields = [subfielddata] else: @@ -146,21 +149,20 @@ def _convert_to_marc21data(cls, sruthirecord: dict) -> Marc21Data: # The controlfield and the datafield contain multiple fields. # The controlfield consists of simple pairs of tags (field numbers) # and texts (field values). - for sruthicontrolfield in \ - sruthirecord[f'{cls.marcxchange_prefix}controlfield']: - tag = sruthicontrolfield['tag'] - text = sruthicontrolfield['text'] + for sruthicontrolfield in sruthirecord[f"{cls.marcxchange_prefix}controlfield"]: + tag = sruthicontrolfield["tag"] + text = sruthicontrolfield["text"] data.controlfields[tag] = text # The datafield is more complex; these fields also have two indicators, # one-digit numbers that carry special meanings, and multiple subfields # that each have a one-character code. - for sruthifield in sruthirecord[f'{cls.marcxchange_prefix}datafield']: - fieldnumber = sruthifield['tag'] + for sruthifield in sruthirecord[f"{cls.marcxchange_prefix}datafield"]: + fieldnumber = sruthifield["tag"] field = Marc21Field( fieldnumber=fieldnumber, - indicator1=sruthifield['ind1'], - indicator2=sruthifield['ind2'], - subfields={} + indicator1=sruthifield["ind1"], + indicator2=sruthifield["ind2"], + subfields={}, ) # The translation_dictionary contains descriptions for a number # of important fields. Include them so that the user can more @@ -170,52 +172,53 @@ def _convert_to_marc21data(cls, sruthirecord: dict) -> Marc21Data: sruthisubfields = cls._get_subfields(sruthifield) for sruthisubfield in sruthisubfields: - field.subfields[sruthisubfield['code']] = \ - sruthisubfield['text'] + field.subfields[sruthisubfield["code"]] = sruthisubfield["text"] data.fields.append(field) return data - + @classmethod @abstractmethod def _get_link(cls, data: Marc21Data) -> Optional[str]: - '''Get a public URL according to the Marc21 data or ``None`` if it - is not available.''' + """Get a public URL according to the Marc21 data or ``None`` if it + is not available.""" pass @classmethod @abstractmethod def _get_identifier(cls, data: Marc21Data) -> Optional[str]: - '''Get the unique identifier from the Marc21 data or ``None`` if it - is not available.''' + """Get the unique identifier from the Marc21 data or ``None`` if it + is not available.""" pass class Marc21BibliographicalRecord(Marc21DataMixin, BibliographicalRecord): - '''A combination of ``BibliographicalRecord`` and ``Marc21DataMixin``.''' + """A combination of ``BibliographicalRecord`` and ``Marc21DataMixin``.""" + pass class SRUMarc21BibliographicalReader(SRUMarc21Reader): - '''Subclass of ``SRUMarc21Reader`` that adds functionality to create + """Subclass of ``SRUMarc21Reader`` that adds functionality to create instances of ``BibliographicRecord``. This subclass assumes that the Marc21 data is according to the standard format of Marc21 for bibliographical data. See: https://www.loc.gov/marc/bibliographic/ - ''' - _title_field_subfield = ('245', 'a') - _alternative_title_field_subfield = ('246', 'a') - _publisher_field_subfield = ('264', 'b') - _language_field_subfield = ('041', 'a') - _place_field_subfield = ('264', 'a') - _dating_field_subfield = ('264', 'c') - _extent_field_subfield = ('300', 'a') - _physical_description_field_subfield = ('300', 'b') - _size_field_subfield = ('300', 'c') + """ + + _title_field_subfield = ("245", "a") + _alternative_title_field_subfield = ("246", "a") + _publisher_field_subfield = ("264", "b") + _language_field_subfield = ("041", "a") + _place_field_subfield = ("264", "a") + _dating_field_subfield = ("264", "c") + _extent_field_subfield = ("300", "a") + _physical_description_field_subfield = ("300", "b") + _size_field_subfield = ("300", "c") records: List[Marc21BibliographicalRecord] READERTYPE = BIBLIOGRAPHICAL - + @classmethod def _convert_record(cls, sruthirecord: dict) -> Marc21BibliographicalRecord: record = Marc21BibliographicalRecord(from_reader=cls) @@ -270,10 +273,9 @@ def _convert_record(cls, sruthirecord: dict) -> Marc21BibliographicalRecord: @classmethod def _get_contributors(cls, data: Marc21Data) -> List[Field]: contributors: List[Field] = [] - contributor_fields = data.get_fields('100') + contributor_fields = data.get_fields("100") for field in contributor_fields: - name = field.subfields.get('a') + name = field.subfields.get("a") if name: contributors.append(Field(name)) return contributors - diff --git a/edpop_explorer/srureader.py b/edpop_explorer/srureader.py index 0017a7d..36b8503 100644 --- a/edpop_explorer/srureader.py +++ b/edpop_explorer/srureader.py @@ -8,25 +8,26 @@ class SRUReader(GetByIdBasedOnQueryMixin, Reader): - '''Subclass of ``Reader`` that adds basic SRU functionality + """Subclass of ``Reader`` that adds basic SRU functionality using the ``sruthi`` library. This class is still abstract and subclasses should implement the ``transform_query()`` and ``_convert_record()`` methods and set the attributes ``sru_url`` and ``sru_version``. - + The ``_prepare_get_by_id_query()`` method by default returns the transformed version of the identifier as a query, which normally works, but this may be optimised by overriding it. - .. automethod:: _convert_record''' + .. automethod:: _convert_record""" + sru_url: str - '''URL of the SRU API.''' + """URL of the SRU API.""" sru_version: str - '''Version of the SRU protocol. Can be '1.1' or '1.2'.''' + """Version of the SRU protocol. Can be '1.1' or '1.2'.""" query: Optional[str] = None session: requests.Session - '''The ``Session`` object of the ``requests`` library.''' + """The ``Session`` object of the ``requests`` library.""" def __init__(self): # Set a session to allow reuse of HTTP sessions and to set additional @@ -43,15 +44,17 @@ def transform_query(cls, query: str) -> str: @classmethod @abstractmethod def _convert_record(cls, sruthirecord: dict) -> Record: - '''Convert the output of ``sruthi`` into an instance of - (a subclass of) ``Record``.''' + """Convert the output of ``sruthi`` into an instance of + (a subclass of) ``Record``.""" pass @classmethod def _prepare_get_by_id_query(cls, identifier: str) -> str: return cls.transform_query(identifier) - def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]: + def _perform_query( + self, start_record: int, maximum_records: Optional[int] + ) -> List[Record]: if maximum_records is None: maximum_records = self.DEFAULT_RECORDS_PER_PAGE try: @@ -61,12 +64,10 @@ def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> L start_record=start_record, maximum_records=maximum_records, sru_version=self.sru_version, - session=self.session + session=self.session, ) - except ( - sruthi.errors.SruError - ) as err: - raise ReaderError('Server returned error: ' + str(err)) + except sruthi.errors.SruError as err: + raise ReaderError("Server returned error: " + str(err)) self.number_of_results = response.count @@ -85,7 +86,7 @@ def fetch_range(self, range_to_fetch: range) -> range: if self.fetching_exhausted: return range(0, 0) if self.prepared_query is None: - raise ReaderError('First call prepare_query') + raise ReaderError("First call prepare_query") start_number = range_to_fetch.start start_number_sru = start_number + 1 # SRU starts at 1 records_to_fetch = range_to_fetch.stop - range_to_fetch.start diff --git a/tests/conftest.py b/tests/conftest.py index 7134a1f..c6be388 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,13 @@ def pytest_addoption(parser): - parser.addoption('--requests', action='store_true', dest="requests", - default=False, help="enable tests with real API requests") + parser.addoption( + "--requests", + action="store_true", + dest="requests", + default=False, + help="enable tests with real API requests", + ) + def pytest_configure(config): if not config.option.requests: - setattr(config.option, 'markexpr', 'not requests') + setattr(config.option, "markexpr", "not requests") diff --git a/tests/test_allreaders.py b/tests/test_allreaders.py index fa58b11..60b180e 100644 --- a/tests/test_allreaders.py +++ b/tests/test_allreaders.py @@ -92,7 +92,11 @@ def test_realrequest(readercls: Type[Reader]): # there was a mistake with the offsets. But just give a warning, # because there are APIs that (by mistake?) return duplicated # records. - warnings.warn(UserWarning("Last record from first fetch is same as first record from second fetch")) + warnings.warn( + UserWarning( + "Last record from first fetch is same as first record from second fetch" + ) + ) else: assert reader.number_fetched == fetched_before assert rng2 == range(0) diff --git a/tests/test_field.py b/tests/test_field.py index 5860274..e4196cc 100644 --- a/tests/test_field.py +++ b/tests/test_field.py @@ -9,18 +9,18 @@ @fixture def basic_field() -> Field: - return Field('Dit is een boektitel') + return Field("Dit is een boektitel") @fixture def basic_location_field() -> LocationField: - field = LocationField('Voorschoten') + field = LocationField("Voorschoten") return field class TestField: def test_init(self, basic_field: Field): - assert basic_field.original_text == 'Dit is een boektitel' + assert basic_field.original_text == "Dit is een boektitel" assert isinstance(basic_field.subject_node, Node) def test_to_graph(self, basic_field: Field): @@ -30,27 +30,21 @@ def test_to_graph(self, basic_field: Field): assert ( basic_field.subject_node, EDPOPREC.originalText, - Literal(basic_field.original_text) + Literal(basic_field.original_text), ) in graph # Test boolean basic_field.unknown = True graph = basic_field.to_graph() - assert ( - basic_field.subject_node, - EDPOPREC.unknown, - Literal(True) - ) in graph + assert (basic_field.subject_node, EDPOPREC.unknown, Literal(True)) in graph # Invalid type on object should give exception - basic_field.unknown = 'other value' # type: ignore + basic_field.unknown = "other value" # type: ignore with raises(FieldError): basic_field.to_graph() # Nonexisting datatype defined in class on SUBFIELDS should give # exception basic_field._subfields = basic_field._subfields.copy() - basic_field._subfields.append( - ('other', EDPOPREC.other, 'othertype') - ) - basic_field.other = 'text' # type: ignore + basic_field._subfields.append(("other", EDPOPREC.other, "othertype")) + basic_field.other = "text" # type: ignore with raises(FieldError): basic_field.to_graph() @@ -59,18 +53,10 @@ class TestLocationField: def test_basic_form(self, basic_location_field: LocationField): field = basic_location_field graph = field.to_graph() - assert ( - field.subject_node, - EDPOPREC.locationType, - None - ) not in graph + assert (field.subject_node, EDPOPREC.locationType, None) not in graph def test_location_type(self, basic_location_field: LocationField): field = basic_location_field field.location_type = LocationField.LOCALITY graph = field.to_graph() - assert ( - field.subject_node, - EDPOPREC.locationType, - EDPOPREC.locality - ) in graph + assert (field.subject_node, EDPOPREC.locationType, EDPOPREC.locality) in graph diff --git a/tests/test_reader.py b/tests/test_reader.py index b2e77f7..791d3f8 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1,4 +1,3 @@ - from typing_extensions import override import pytest @@ -69,9 +68,7 @@ def test_iri_to_identifier_invalid(): def test_iri_to_identifier_and_vv_noprefixset(): with pytest.raises(ReaderError): - SimpleReaderNoIRIPrefix.iri_to_identifier( - "http://example.com/records/reader/1" - ) + SimpleReaderNoIRIPrefix.iri_to_identifier("http://example.com/records/reader/1") with pytest.raises(ReaderError): SimpleReaderNoIRIPrefix.identifier_to_iri("1") diff --git a/tests/test_record.py b/tests/test_record.py index 2118b1b..832727f 100644 --- a/tests/test_record.py +++ b/tests/test_record.py @@ -8,7 +8,7 @@ class SimpleReader(Reader): - CATALOG_URIREF = URIRef('http://example.com/reader') + CATALOG_URIREF = URIRef("http://example.com/reader") IRI_PREFIX = "http://example.com/records/reader/" @@ -19,17 +19,19 @@ class SimpleRecord(Record): def __init__(self, from_reader): super().__init__(from_reader) - self._fields.extend([ - ('testfield', EDPOPREC.testField, Field), - ('multiplefield', EDPOPREC.multipleField, Field) - ]) + self._fields.extend( + [ + ("testfield", EDPOPREC.testField, Field), + ("multiplefield", EDPOPREC.multipleField, Field), + ] + ) @pytest.fixture def basic_record(): record = SimpleRecord(SimpleReader) - record.link = 'http://example.com' - record.identifier = '123' + record.link = "http://example.com" + record.identifier = "123" return record @@ -43,26 +45,27 @@ def test_iri_empty(basic_record: SimpleRecord): def test_subject_node(basic_record: SimpleRecord): - assert basic_record.subject_node == \ - URIRef("http://example.com/records/reader/123") + assert basic_record.subject_node == URIRef("http://example.com/records/reader/123") def test_to_graph_empty(): # Test if it works with an empty record record = Record(SimpleReader) g = record.to_graph() - assert ( - record.subject_node, EDPOPREC.fromCatalog, SimpleReader.CATALOG_URIREF - ) in g - + assert (record.subject_node, EDPOPREC.fromCatalog, SimpleReader.CATALOG_URIREF) in g + def test_to_graph_basic_attributes(basic_record): g = basic_record.to_graph() assert ( - basic_record.subject_node, EDPOPREC.publicURL, Literal(basic_record.link) + basic_record.subject_node, + EDPOPREC.publicURL, + Literal(basic_record.link), ) in g assert ( - basic_record.subject_node, EDPOPREC.identifier, Literal(basic_record.identifier) + basic_record.subject_node, + EDPOPREC.identifier, + Literal(basic_record.identifier), ) in g @@ -73,25 +76,23 @@ def test_to_graph_empty_field(basic_record): def test_to_graph_field_normal_value(basic_record): - basic_record.testfield = Field('test') + basic_record.testfield = Field("test") g = basic_record.to_graph() assert (basic_record.subject_node, EDPOPREC.testField, None) in g - + def test_to_graph_string_in_field(basic_record): - basic_record.testfield = 'test' # type: ignore + basic_record.testfield = "test" # type: ignore with pytest.raises(RecordError): basic_record.to_graph() - + + def test_to_graph_field_multiple_values(basic_record): # Try a field that accepts multiple values - basic_record.multiplefield = [ - Field('v1'), Field('v2') - ] + basic_record.multiplefield = [Field("v1"), Field("v2")] g = basic_record.to_graph() - assert len(list( - g.objects(basic_record.subject_node, EDPOPREC.multipleField) - )) == 2 + assert len(list(g.objects(basic_record.subject_node, EDPOPREC.multipleField))) == 2 + def test_biographicalrecord(): # Basic test to check if class definition is sane; the logic should be @@ -99,6 +100,7 @@ def test_biographicalrecord(): record = BiographicalRecord(SimpleReader) record.to_graph() + def test_biographicalrecord_str(): record = BiographicalRecord(SimpleReader) personname = "Person" diff --git a/tests/test_srureader.py b/tests/test_srureader.py index 649d15e..44cffb7 100644 --- a/tests/test_srureader.py +++ b/tests/test_srureader.py @@ -6,7 +6,7 @@ from edpop_explorer import SRUMarc21BibliographicalReader, Marc21Data -TESTDATA = json.load(open(Path(__file__).parent / 'TESTDATA', 'r')) +TESTDATA = json.load(open(Path(__file__).parent / "TESTDATA", "r")) class MockReader(SRUMarc21BibliographicalReader): @@ -20,35 +20,32 @@ def _get_link(cls, data: Marc21Data) -> Optional[str]: @classmethod def _get_identifier(cls, data: Marc21Data) -> Optional[str]: - return 'id' + return "id" class TestSRUMarc21BibliographicalReader: - @patch('edpop_explorer.srureader.sruthi') + @patch("edpop_explorer.srureader.sruthi") def test_fetch(self, mock_sruthi): mock_sruthi.searchretrieve.return_value = TESTDATA reader = MockReader() - reader.sru_url = '' - reader.sru_version = '1.1' - reader.prepare_query('testquery') + reader.sru_url = "" + reader.sru_version = "1.1" + reader.prepare_query("testquery") reader.fetch() results = reader.records # Field with multiple subfields data = results[0].data assert data is not None - firstfield = data.get_first_field('245') + firstfield = data.get_first_field("245") assert firstfield is not None - assert firstfield.subfields['a'] == \ - 'Aeschylus: Eumenides.' + assert firstfield.subfields["a"] == "Aeschylus: Eumenides." # Field with a single subfield - firstfield = data.get_first_field('650') + firstfield = data.get_first_field("650") assert firstfield is not None - assert firstfield.subfields['a'] == \ - 'Aeschylus Eumenides.' + assert firstfield.subfields["a"] == "Aeschylus Eumenides." # Field's description - assert firstfield.description == \ - 'Subject Added Entry - Topical Term' + assert firstfield.description == "Subject Added Entry - Topical Term" # Field that occurs multiple times - assert len(data.get_fields('500')) == 5 + assert len(data.get_fields("500")) == 5 # Control field - assert data.controlfields['007'] == 'tu' + assert data.controlfields["007"] == "tu" From 8d7805d94eb5177e4699301ab4b050026da21e61 Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 24 Oct 2024 17:08:08 +0200 Subject: [PATCH 08/32] Revert "Black formatting" This reverts commit 4a8a7dcaa03abe2cac7f0637df817d6eec73f773. --- docs/conf.py | 19 +-- edpop_explorer/__init__.py | 66 +++------- edpop_explorer/__main__.py | 5 +- edpop_explorer/cerl.py | 52 ++++---- edpop_explorer/edpopxshell.py | 132 +++++++++---------- edpop_explorer/fields.py | 91 ++++++++------ edpop_explorer/normalization/relators.py | 2 +- edpop_explorer/normalizers.py | 6 +- edpop_explorer/rdf.py | 5 +- edpop_explorer/reader.py | 66 +++++----- edpop_explorer/readers/__init__.py | 4 +- edpop_explorer/readers/bibliopolis.py | 14 ++- edpop_explorer/readers/bnf.py | 28 +++-- edpop_explorer/readers/cerl_thesaurus.py | 46 +++---- edpop_explorer/readers/dutch_almanacs.py | 54 ++++---- edpop_explorer/readers/fbtee.py | 71 ++++++----- edpop_explorer/readers/gallica.py | 57 +++++---- edpop_explorer/readers/hpb.py | 24 ++-- edpop_explorer/readers/kb.py | 33 ++--- edpop_explorer/readers/kvcs.py | 54 ++++---- edpop_explorer/readers/pierre_belle.py | 56 ++++----- edpop_explorer/readers/sbtireader.py | 23 ++-- edpop_explorer/readers/stcn.py | 28 ++--- edpop_explorer/readers/ustc.py | 79 ++++++------ edpop_explorer/readers/vd.py | 67 +++++----- edpop_explorer/record.py | 153 ++++++++++++----------- edpop_explorer/sparqlreader.py | 68 +++++----- edpop_explorer/srumarc21reader.py | 130 ++++++++++--------- edpop_explorer/srureader.py | 31 +++-- tests/conftest.py | 12 +- tests/test_allreaders.py | 6 +- tests/test_field.py | 34 +++-- tests/test_reader.py | 5 +- tests/test_record.py | 52 ++++---- tests/test_srureader.py | 29 +++-- 35 files changed, 800 insertions(+), 802 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 31cbdcd..522f359 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,9 +13,9 @@ # -- Project information ----------------------------------------------------- -project = "EDPOP Explorer" -copyright = "2023" -author = "Utrecht University" +project = 'EDPOP Explorer' +copyright = '2023' +author = 'Utrecht University' # -- General configuration --------------------------------------------------- @@ -24,17 +24,17 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', ] # Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] +templates_path = ['_templates'] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # -- Options for HTML output ------------------------------------------------- @@ -42,9 +42,10 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = "alabaster" +html_theme = 'alabaster' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] +html_static_path = ['_static'] + diff --git a/edpop_explorer/__init__.py b/edpop_explorer/__init__.py index 6b07025..ced924c 100644 --- a/edpop_explorer/__init__.py +++ b/edpop_explorer/__init__.py @@ -1,33 +1,15 @@ __all__ = [ - "EDPOPREC", - "RELATORS", - "bind_common_namespaces", - "Field", - "FieldError", - "LocationField", - "Reader", - "ReaderError", - "NotFoundError", - "GetByIdBasedOnQueryMixin", - "DatabaseFileMixin", - "BasePreparedQuery", - "PreparedQueryType", - "Record", - "RawData", - "RecordError", - "BibliographicalRecord", - "BiographicalRecord", - "LazyRecordMixin", - "SRUReader", - "CERLReader", - "Marc21Data", - "Marc21Field", - "Marc21BibliographicalRecord", - "Marc21DataMixin", - "SRUMarc21Reader", - "SRUMarc21BibliographicalReader", - "BIBLIOGRAPHICAL", - "BIOGRAPHICAL", + 'EDPOPREC', 'RELATORS', 'bind_common_namespaces', + 'Field', 'FieldError', 'LocationField', + 'Reader', 'ReaderError', 'NotFoundError', + 'GetByIdBasedOnQueryMixin', 'DatabaseFileMixin', + 'BasePreparedQuery', 'PreparedQueryType', + 'Record', 'RawData', 'RecordError', 'BibliographicalRecord', + 'BiographicalRecord', 'LazyRecordMixin', + 'SRUReader', 'CERLReader', + 'Marc21Data', 'Marc21Field', 'Marc21BibliographicalRecord', + 'Marc21DataMixin', 'SRUMarc21Reader', 'SRUMarc21BibliographicalReader', + 'BIBLIOGRAPHICAL', 'BIOGRAPHICAL' ] # Define here to avoid circular imports @@ -38,29 +20,17 @@ from .rdf import EDPOPREC, RELATORS, bind_common_namespaces from .fields import Field, FieldError, LocationField from .reader import ( - Reader, - ReaderError, - GetByIdBasedOnQueryMixin, - BasePreparedQuery, - PreparedQueryType, - NotFoundError, - DatabaseFileMixin, + Reader, ReaderError, GetByIdBasedOnQueryMixin, BasePreparedQuery, + PreparedQueryType, NotFoundError, DatabaseFileMixin ) from .record import ( - Record, - RawData, - RecordError, - BibliographicalRecord, - BiographicalRecord, - LazyRecordMixin, + Record, RawData, RecordError, BibliographicalRecord, BiographicalRecord, + LazyRecordMixin ) from .srureader import SRUReader from .srumarc21reader import ( - Marc21Data, - Marc21Field, - Marc21BibliographicalRecord, - Marc21DataMixin, - SRUMarc21Reader, - SRUMarc21BibliographicalReader, + Marc21Data, Marc21Field, Marc21BibliographicalRecord, Marc21DataMixin, + SRUMarc21Reader, SRUMarc21BibliographicalReader ) from .cerl import CERLReader + diff --git a/edpop_explorer/__main__.py b/edpop_explorer/__main__.py index 7e42a46..d8fd652 100644 --- a/edpop_explorer/__main__.py +++ b/edpop_explorer/__main__.py @@ -7,12 +7,11 @@ try: from colorama import just_fix_windows_console - just_fix_windows_console() except ImportError: pass -historyfile = Path(AppDirs("edpop-explorer", "cdh").user_data_dir) / "history" +historyfile = Path(AppDirs('edpop-explorer', 'cdh').user_data_dir) / 'history' def save_history() -> None: @@ -28,5 +27,5 @@ def main() -> None: save_history() -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/edpop_explorer/cerl.py b/edpop_explorer/cerl.py index d10ba4a..61dbf01 100644 --- a/edpop_explorer/cerl.py +++ b/edpop_explorer/cerl.py @@ -5,12 +5,7 @@ from typing import List, Dict, Optional from edpop_explorer import ( - Reader, - Record, - ReaderError, - BiographicalRecord, - Field, - BIOGRAPHICAL, + Reader, Record, ReaderError, BiographicalRecord, Field, BIOGRAPHICAL ) @@ -21,7 +16,6 @@ class CERLReader(Reader): This is an abstract class -- to use, derive from this class, set the ``API_URL``, ``API_BY_ID_BASE_URL`` and ``LINK_BASE_URL`` constant attributes, and implement the ``_convert_record`` class method.""" - API_URL: str """The base URL of the search API, of the form ``https://data.cerl.org//_search``.""" API_BY_ID_BASE_URL: str @@ -36,7 +30,9 @@ def get_by_id(cls, identifier: str) -> Record: try: response = requests.get( cls.API_BY_ID_BASE_URL + identifier, - headers={"Accept": "application/json"}, + headers={ + 'Accept': 'application/json' + }, ).json() except requests.exceptions.JSONDecodeError: raise ReaderError(f"Item with id {identifier} does not exist.") @@ -44,48 +40,51 @@ def get_by_id(cls, identifier: str) -> Record: raise ReaderError(f"Error during server request: {err}") return cls._convert_record(response) + @classmethod @abstractmethod def _convert_record(cls, rawrecord: dict) -> Record: pass - def _perform_query( - self, start_record: int, maximum_records: Optional[int] - ) -> List[Record]: + def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]: assert isinstance(self.prepared_query, str) if maximum_records is None: maximum_records = self.DEFAULT_RECORDS_PER_PAGE - print(f"The query is: {self.prepared_query}") + print(f'The query is: {self.prepared_query}') try: response = requests.get( self.API_URL, params={ - "query": self.prepared_query, - "from": start_record, - "size": maximum_records, - "mode": "default", - "sort": "default", + 'query': self.prepared_query, + 'from': start_record, + 'size': maximum_records, + 'mode': 'default', + 'sort': 'default' }, - headers={"Accept": "application/json"}, + headers={ + 'Accept': 'application/json' + } ).json() - except requests.exceptions.RequestException as err: - raise ReaderError("Error during server request: " + str(err)) + except ( + requests.exceptions.RequestException + ) as err: + raise ReaderError('Error during server request: ' + str(err)) # TODO: check for error responses try: - if response["hits"] is None: + if response['hits'] is None: self.number_of_results = 0 else: - self.number_of_results = response["hits"]["value"] + self.number_of_results = response['hits']['value'] except KeyError: - raise ReaderError("Number of hits not given in server response") + raise ReaderError('Number of hits not given in server response') - if "rows" not in response: + if 'rows' not in response: # There are no rows in the response, so stop here return [] records: List[Record] = [] - for rawrecord in response["rows"]: + for rawrecord in response['rows']: record = self._convert_record(rawrecord) records.append(record) @@ -98,10 +97,11 @@ def transform_query(cls, query) -> str: def fetch_range(self, range_to_fetch: range) -> range: if self.prepared_query is None: - raise ReaderError("First call prepare_query") + raise ReaderError('First call prepare_query') start_record = range_to_fetch.start number_to_fetch = range_to_fetch.stop - start_record results = self._perform_query(start_record, number_to_fetch) for i, result in enumerate(results): self.records[i + range_to_fetch.start] = result return range(start_record, start_record + len(results)) + diff --git a/edpop_explorer/edpopxshell.py b/edpop_explorer/edpopxshell.py index bb7479f..2ac1c2e 100644 --- a/edpop_explorer/edpopxshell.py +++ b/edpop_explorer/edpopxshell.py @@ -30,12 +30,12 @@ class EDPOPXShell(cmd2.Cmd): intro = ( - "Welcome to the EDPOP explorer!\n" - "Type to perform a query.\n" - "Type identifier to retrieve a specific record.\n" - "Type ‘help’ for all commands.\n" + 'Welcome to the EDPOP explorer!\n' + 'Type to perform a query.\n' + 'Type identifier to retrieve a specific record.\n' + 'Type ‘help’ for all commands.\n' ) - prompt = "[edpop-explorer] # " + prompt = '[edpop-explorer] # ' reader: Optional[Reader] = None shown: int = 0 RECORDS_PER_PAGE = 10 @@ -44,78 +44,78 @@ def __init__(self): super().__init__() self.exact = False - self.add_settable( - cmd2.Settable( - "exact", bool, "use exact queries without preprocessing", self - ) - ) + self.add_settable(cmd2.Settable( + 'exact', bool, 'use exact queries without preprocessing', self + )) def get_record_from_argument(self, args) -> Optional[Record]: """Get the record requested by the user; show error message and return None if this fails""" if self.reader is None: - self.perror("First perform an initial search") + self.perror('First perform an initial search') return try: # TODO: consider using argparse index = int(args) - 1 except (TypeError, ValueError): - self.perror("Please provide a valid number") + self.perror('Please provide a valid number') return try: record = self.reader.records[index] except IndexError: - self.perror("Please provide a record number that has been loaded") + self.perror('Please provide a record number that has been loaded') return return record def do_next(self, args) -> None: if self.reader is None: - self.perror("First perform an initial search") + self.perror('First perform an initial search') return assert self.reader.number_of_results is not None assert self.reader.number_fetched is not None if self.shown >= self.reader.number_of_results: - self.perror("All records have been shown") + self.perror('All records have been shown') else: if self.reader.number_fetched - self.shown < self.RECORDS_PER_PAGE: self.reader.fetch() - self.shown += self._show_records( - self.reader.records, self.shown, self.RECORDS_PER_PAGE - ) + self.shown += self._show_records(self.reader.records, + self.shown, + self.RECORDS_PER_PAGE) def do_show(self, args) -> None: - """Show a normalized version of the record with the given number.""" + '''Show a normalized version of the record with the given number.''' record = self.get_record_from_argument(args) if record is not None: self.show_record(record) def show_record(self, record: Record) -> None: record.fetch() # Necessary in case this is a lazy record - self.poutput(cmd2.ansi.style_success(record, bold=True)) - recordtype = str(record._rdf_class).rsplit("/", 1)[1] - self.poutput(f"Record type: {recordtype}") + self.poutput(cmd2.ansi.style_success( + record, bold=True + )) + recordtype = str(record._rdf_class).rsplit('/',1)[1] + self.poutput(f'Record type: {recordtype}') if record.identifier: - self.poutput(f"Identifier: {record.identifier}") + self.poutput(f'Identifier: {record.identifier}') if record.link: - self.poutput("URL: " + str(record.link)) - self.poutput(cmd2.ansi.style("Fields:", bold=True)) + self.poutput('URL: ' + str(record.link)) + self.poutput(cmd2.ansi.style('Fields:', bold=True)) for fieldname, _, _ in record._fields: - fieldname_human = fieldname.capitalize().replace("_", " ") + fieldname_human = fieldname.capitalize().replace('_', ' ') # TODO: make a field iterator for Record value = getattr(record, fieldname) if value: if isinstance(value, list): - text = "\n" + "\n".join([(" - " + str(x)) for x in value]) + text = '\n' + '\n'.join([(' - ' + str(x)) for x in value]) else: text = str(value) self.poutput( - cmd2.ansi.style(f"- {fieldname_human}: ", bold=True) + text + cmd2.ansi.style(f'- {fieldname_human}: ', bold=True) + text ) def do_showrdf(self, args) -> None: - """Show an RDF representation of the record with the given number - in Turtle format.""" + '''Show an RDF representation of the record with the given number + in Turtle format.''' record = self.get_record_from_argument(args) if record is None: return @@ -123,27 +123,27 @@ def do_showrdf(self, args) -> None: graph = record.to_graph() ttl = graph.serialize() highlighted = highlight( - ttl, TurtleLexer(), Terminal256Formatter(style="vim") + ttl, TurtleLexer(), Terminal256Formatter(style='vim') ) self.poutput(highlighted) except ReaderError as err: - self.perror("Cannot generate RDF: {}".format(err)) + self.perror('Cannot generate RDF: {}'.format(err)) def do_showraw(self, args) -> None: - """Show the raw data of the record with the given number in the - source catalog.""" + '''Show the raw data of the record with the given number in the + source catalog.''' record = self.get_record_from_argument(args) if record is None: return data = record.get_data_dict() yaml_data = yaml.dump(data, allow_unicode=True) highlighted = highlight( - yaml_data, YamlLexer(), Terminal256Formatter(style="vim") + yaml_data, YamlLexer(), Terminal256Formatter(style='vim') ) self.poutput(highlighted) def do_hpb(self, args) -> None: - "CERL's Heritage of the Printed Book Database" + 'CERL\'s Heritage of the Printed Book Database' self._query(HPBReader, args) def do_vd16(self, args) -> None: @@ -164,54 +164,54 @@ def do_vd18(self, args) -> None: def do_vdlied(self, args) -> None: """Verzeichnis der deutschsprachigen Liedflugschriften""" self._query(VDLiedReader, args) - + def do_bnf(self, args) -> None: """Bibliothèque nationale de France""" self._query(BnFReader, args) - + def do_gallica(self, args) -> None: - "Gallica" + 'Gallica' self._query(GallicaReader, args) - + def do_ct(self, args) -> None: - "CERL Thesaurus" + 'CERL Thesaurus' self._query(CERLThesaurusReader, args) - + def do_stcn(self, args) -> None: - "Short Title Catalogue Netherlands" + 'Short Title Catalogue Netherlands' self._query(STCNReader, args) - + def do_sbti(self, args) -> None: - "Scottish Book Trade Index" + 'Scottish Book Trade Index' self._query(SBTIReader, args) - + def do_fbtee(self, args) -> None: - "French Book Trade in Enlightenment Europe" + 'French Book Trade in Enlightenment Europe' self._query(FBTEEReader, args) - + def do_ustc(self, args) -> None: - "Universal Short Title Catalogue" + 'Universal Short Title Catalogue' self._query(USTCReader, args) - + def do_kb(self, args) -> None: - "Koninklijke Bibliotheek" + 'Koninklijke Bibliotheek' self._query(KBReader, args) def do_kvcs(self, args) -> None: - "Drukkers & Uitgevers in KVCS" + 'Drukkers & Uitgevers in KVCS' self._query(KVCSReader, args) def do_dutalm(self, args) -> None: - "Bibliography of Dutch Almanacs 1570-1710" + 'Bibliography of Dutch Almanacs 1570-1710' self._query(DutchAlmanacsReader, args) def do_pb(self, args) -> None: - "BIBLIOGRAPHY OF EARLY MODERN EDITIONS OF PIERRE DE PROVENCE ET LA BELLE MAGUELONNE (CA. 1470–CA. 1800)" + 'BIBLIOGRAPHY OF EARLY MODERN EDITIONS OF PIERRE DE PROVENCE ET LA BELLE MAGUELONNE (CA. 1470–CA. 1800)' self._query(PierreBelleReader, args) - def _show_records( - self, records: List[Optional[Record]], start: int, limit=math.inf - ) -> int: + def _show_records(self, records: List[Optional[Record]], + start: int, + limit=math.inf) -> int: """Show the records from start, with limit as the maximum number of records to show. Return the number of records shown.""" total = len(records) @@ -222,13 +222,15 @@ def _show_records( count = int(min(remaining, limit)) digits = len(str(total)) for i in range(start, start + count): - print("{:{digits}} - {}".format(i + 1, str(records[i]), digits=digits)) + print('{:{digits}} - {}'.format( + i + 1, str(records[i]), digits=digits + )) return count def _query(self, readerclass: Type[Reader], query: str): IDENTIFIER_PREFIX = "identifier " if query.startswith(IDENTIFIER_PREFIX): - identifier = query[len(IDENTIFIER_PREFIX) :] + identifier = query[len(IDENTIFIER_PREFIX):] try: record = readerclass.get_by_id(identifier) except ReaderError as err: @@ -242,18 +244,22 @@ def _query(self, readerclass: Type[Reader], query: str): if not self.exact: self.reader.prepare_query(query) self.pfeedback( - "Performing query: {}".format(self.reader.prepared_query) + 'Performing query: {}'.format(self.reader.prepared_query) ) else: self.reader.set_query(query) - self.pfeedback("Performing exact query: {}".format(query)) + self.pfeedback( + 'Performing exact query: {}'.format(query) + ) self.reader.fetch() except ReaderError as err: - self.perror("Error while fetching results: {}".format(err)) + self.perror('Error while fetching results: {}'.format(err)) self.reader = None self.shown = 0 return - self.pfeedback("{} records found.".format(self.reader.number_of_results)) + self.pfeedback( + '{} records found.'.format(self.reader.number_of_results) + ) self.shown += self._show_records( self.reader.records, self.shown, self.RECORDS_PER_PAGE ) diff --git a/edpop_explorer/fields.py b/edpop_explorer/fields.py index c4fc8ba..cfd5336 100644 --- a/edpop_explorer/fields.py +++ b/edpop_explorer/fields.py @@ -14,23 +14,25 @@ from edpop_explorer.normalization import relators DATATYPES = { - "string": { - "input_type": str, - "converter": (lambda x: Literal(x)), + 'string': { + 'input_type': str, + 'converter': (lambda x: Literal(x)), }, - "boolean": { - "input_type": bool, - "converter": (lambda x: Literal(x)), + 'boolean': { + 'input_type': bool, + 'converter': (lambda x: Literal(x)), }, - "edtf": { - "input_type": str, - "converter": ( - lambda x: Literal(x, datatype=URIRef("http://id.loc.gov/datatypes/edtf")) - ), + 'edtf': { + 'input_type': str, + 'converter': ( + lambda x: Literal( + x, datatype=URIRef("http://id.loc.gov/datatypes/edtf") + ) + ) }, - "uriref": { - "input_type": URIRef, - "converter": lambda x: x, + 'uriref': { + 'input_type': URIRef, + 'converter': lambda x: x, }, } @@ -51,8 +53,8 @@ class Field: not the case for this base class. In those cases, it is still possible to set this field using the ``set_normalized_text`` method. Except ``original_text``, all subfields are optional and are None by default. - Use ``to_graph()`` to obtain an RDF graph. The subject node is by default - a blank node, but this may be overridden by setting the subject_node + Use ``to_graph()`` to obtain an RDF graph. The subject node is by default + a blank node, but this may be overridden by setting the subject_node attribute. Subclasses should override the ``_rdf_class`` attribute to the corresponding @@ -63,9 +65,8 @@ class Field: by one using ``self.SUBFIELDS.append(('', EDPOPREC., ''))``, where is any of the datatypes defined in the ``DATATYPES`` constant of this module. - Subclasses may furthermore define the ``_normalized_text`` private + Subclasses may furthermore define the ``_normalized_text`` private method.""" - #: Subfield -- text of this field according to the original record. original_text: str #: This field's subject node if converted to RDF. This is a blank node @@ -79,17 +80,19 @@ class Field: authority_record: Optional[str] = None normalizer: Optional[Callable] = None _rdf_class: Node = EDPOPREC.Field - + def __init__(self, original_text: str) -> None: if not isinstance(original_text, str): - raise FieldError(f"original_text should be str, not {type(original_text)}") + raise FieldError( + f'original_text should be str, not {type(original_text)}' + ) self.subject_node = BNode() self.original_text = original_text self._subfields = [ - ("original_text", EDPOPREC.originalText, "string"), - ("summary_text", EDPOPREC.summaryText, "string"), - ("unknown", EDPOPREC.unknown, "boolean"), - ("authority_record", EDPOPREC.authorityRecord, "string"), + ('original_text', EDPOPREC.originalText, 'string'), + ('summary_text', EDPOPREC.summaryText, 'string'), + ('unknown', EDPOPREC.unknown, 'boolean'), + ('authority_record', EDPOPREC.authorityRecord, 'string'), ] def normalize(self) -> NormalizationResult: @@ -99,10 +102,14 @@ def normalize(self) -> NormalizationResult: return self.normalizer() def to_graph(self) -> Graph: - """Create an ``rdflib`` RDF graph according to the current data.""" + '''Create an ``rdflib`` RDF graph according to the current data.''' assert isinstance(self.subject_node, Node) graph = Graph() - graph.add((self.subject_node, RDF.type, self._rdf_class)) + graph.add(( + self.subject_node, + RDF.type, + self._rdf_class + )) for subfield in self._subfields: attrname, propref, datatype = subfield value = getattr(self, attrname, None) @@ -118,17 +125,21 @@ def to_graph(self) -> Graph: "{self.__class__} but it does not exist" ) else: - input_type = typedef["input_type"] + input_type = typedef['input_type'] if not isinstance(value, input_type): raise FieldError( f"Subfield {attrname} should be of type {str(input_type)} but " "it is {str(type(value))}" ) else: - converter = typedef["converter"] + converter = typedef['converter'] converted = converter(value) assert isinstance(converted, Node) - graph.add((self.subject_node, propref, converted)) + graph.add(( + self.subject_node, + propref, + converted + )) return graph @property @@ -150,7 +161,9 @@ class LocationField(Field): def __init__(self, original_text: str) -> None: super().__init__(original_text) - self._subfields.append(("location_type", EDPOPREC.locationType, "uriref")) + self._subfields.append( + ('location_type', EDPOPREC.locationType, 'uriref') + ) class LanguageField(Field): @@ -160,7 +173,9 @@ class LanguageField(Field): def __init__(self, original_text: str) -> None: super().__init__(original_text) - self._subfields.append(("language_code", EDPOPREC.languageCode, "string")) + self._subfields.append( + ('language_code', EDPOPREC.languageCode, 'string') + ) @property def summary_text(self) -> Optional[str]: @@ -178,12 +193,10 @@ class ContributorField(Field): def __init__(self, original_text: str) -> None: super().__init__(original_text) - self._subfields.extend( - ( - ("name", EDPOPREC.name, "string"), - ("role", EDPOPREC.role, "string"), - ) - ) + self._subfields.extend(( + ('name', EDPOPREC.name, 'string'), + ('role', EDPOPREC.role, 'string'), + )) @property def summary_text(self) -> Optional[str]: @@ -193,3 +206,7 @@ def summary_text(self) -> Optional[str]: return f"{name} ({role})" else: return name + + + + diff --git a/edpop_explorer/normalization/relators.py b/edpop_explorer/normalization/relators.py index 8e1bc4c..da6cd33 100644 --- a/edpop_explorer/normalization/relators.py +++ b/edpop_explorer/normalization/relators.py @@ -303,5 +303,5 @@ "wit": "witness", "wpr": "writer of preface", "wst": "writer of supplementary textual content", - "wts": "writer of television story", + "wts": "writer of television story" } diff --git a/edpop_explorer/normalizers.py b/edpop_explorer/normalizers.py index cad75e2..1e89a8d 100644 --- a/edpop_explorer/normalizers.py +++ b/edpop_explorer/normalizers.py @@ -4,9 +4,9 @@ class NormalizationResult(Enum): - SUCCESS = "success" - NO_DATA = "nodata" - FAIL = "fail" + SUCCESS = 'success' + NO_DATA = 'nodata' + FAIL = 'fail' def normalize_by_language_code(field) -> NormalizationResult: diff --git a/edpop_explorer/rdf.py b/edpop_explorer/rdf.py index e8bd50b..25f462a 100644 --- a/edpop_explorer/rdf.py +++ b/edpop_explorer/rdf.py @@ -3,10 +3,10 @@ from rdflib.namespace import Namespace from rdflib import Graph, RDF, RDFS -EDPOPREC = Namespace("https://dhstatic.hum.uu.nl/edpop-records/0.1.0/") +EDPOPREC = Namespace('https://dhstatic.hum.uu.nl/edpop-records/0.1.0/') """EDPOP Record Ontology""" -RELATORS = Namespace("http://id.loc.gov/vocabulary/relators/") +RELATORS = Namespace('http://id.loc.gov/vocabulary/relators/') """Library of Congress relators. See: https://id.loc.gov/vocabulary/relators.html""" @@ -19,3 +19,4 @@ def bind_common_namespaces(graph: Graph) -> None: graph.bind("rdf", RDF) graph.bind("rdfs", RDFS) graph.bind("edpoprec", EDPOPREC) + diff --git a/edpop_explorer/reader.py b/edpop_explorer/reader.py index 51c6b62..e6fd385 100644 --- a/edpop_explorer/reader.py +++ b/edpop_explorer/reader.py @@ -12,10 +12,7 @@ from edpop_explorer import ( - EDPOPREC, - BIBLIOGRAPHICAL, - BIOGRAPHICAL, - bind_common_namespaces, + EDPOPREC, BIBLIOGRAPHICAL, BIOGRAPHICAL, bind_common_namespaces ) from .record import Record @@ -25,7 +22,6 @@ class BasePreparedQuery: """Empty base dataclass for prepared queries. For prepared queries that can be represented by a single string, do not inherit from this class but use a simple string instead.""" - pass @@ -49,7 +45,6 @@ class Reader(ABC): ``fetch_range()`` should populate the ``records``, ``number_of_results``, ``number_fetched`` and ``range_fetched`` attributes. """ - number_of_results: Optional[int] = None """The total number of results for the query, or None if fetching has not yet started and the number is not yet known.""" @@ -116,7 +111,9 @@ def adjust_start_record(self, start_number: int) -> None: records.""" self._fetch_position = start_number - def fetch(self, number: Optional[int] = None) -> range: + def fetch( + self, number: Optional[int] = None + ) -> range: """Perform an initial or subsequent query. Most readers fetch a limited number of records at once -- this number depends on the reader but it may be adjusted using the ``number`` parameter. @@ -129,9 +126,8 @@ def fetch(self, number: Optional[int] = None) -> range: return range(0) if number is None: number = self.DEFAULT_RECORDS_PER_PAGE - resulting_range = self.fetch_range( - range(self._fetch_position, self._fetch_position + number) - ) + resulting_range = self.fetch_range(range(self._fetch_position, + self._fetch_position + number)) self._fetch_position = resulting_range.stop return resulting_range @@ -164,9 +160,9 @@ def get(self, index: int, allow_fetching: bool = True) -> Record: # Try to fetch, if it is allowed, and if there is a chance that # it is successful (by verifying that index is not out of # available range, if known) - if allow_fetching and ( - self.number_of_results is None or self.number_of_results <= index - ): + if (allow_fetching and + (self.number_of_results is None + or self.number_of_results <= index)): # Fetch and try again self.fetch_range(range(index, index + 1)) record = self.records.get(index) @@ -204,7 +200,7 @@ def iri_to_identifier(cls, iri: str) -> str: "not a string." ) if iri.startswith(cls.IRI_PREFIX): - return unquote(iri[len(cls.IRI_PREFIX) :]) + return unquote(iri[len(cls.IRI_PREFIX):]) else: raise ReaderError( f"Cannot convert IRI {iri} to identifier: IRI does not start " @@ -213,13 +209,13 @@ def iri_to_identifier(cls, iri: str) -> str: @classmethod def catalog_to_graph(cls) -> Graph: - """Create an RDF representation of the catalog that this reader - supports as an instance of EDPOPREC:Catalog.""" + '''Create an RDF representation of the catalog that this reader + supports as an instance of EDPOPREC:Catalog.''' g = Graph() if not cls.CATALOG_URIREF: raise ReaderError( - "Cannot create graph because catalog IRI has not been set. " - "This should have been done on class level." + 'Cannot create graph because catalog IRI has not been set. ' + 'This should have been done on class level.' ) # Set reader class @@ -297,9 +293,8 @@ class GetByIdBasedOnQueryMixin(ABC): @classmethod def get_by_id(cls, identifier: str) -> Record: reader = cls() - assert isinstance( - reader, Reader - ), "GetByIdBasedOnQueryMixin should be used on Reader subclass" + assert isinstance(reader, Reader), \ + "GetByIdBasedOnQueryMixin should be used on Reader subclass" reader.set_query(cls._prepare_get_by_id_query(identifier)) reader.fetch() if reader.number_of_results == 0: @@ -331,7 +326,6 @@ class DatabaseFileMixin: using the filename specified in the constant attribute ``DATABASE_FILENAME``, which has to be specified by the user of this mixin.""" - DATABASE_URL: Optional[str] = None """The URL to download the database file from. If this attribute is ``None``, automatically downloading the database file is not supported.""" @@ -347,10 +341,9 @@ class DatabaseFileMixin: def prepare_data(self) -> None: """Prepare the database file by confirming that it is available, and if not, by attempting to download it.""" - self.database_path = ( - Path(AppDirs("edpop-explorer", "cdh").user_data_dir) - / self.DATABASE_FILENAME - ) + self.database_path = Path( + AppDirs('edpop-explorer', 'cdh').user_data_dir + ) / self.DATABASE_FILENAME if not self.database_path.exists(): if self.DATABASE_URL is None: # No database URL is given, so the user has to get the database @@ -360,36 +353,37 @@ def prepare_data(self) -> None: # the Windows Store... db_dir = self.database_path.parent.resolve() error_message = ( - f"{self.__class__.__name__} database not found. Please obtain the file " - f"{self.DATABASE_FILENAME} from the project team and add it " - f"to the following directory: {db_dir}" + f'{self.__class__.__name__} database not found. Please obtain the file ' + f'{self.DATABASE_FILENAME} from the project team and add it ' + f'to the following directory: {db_dir}' ) raise ReaderError(error_message) else: self._download_database() def _download_database(self) -> None: - print("Downloading database...") + print('Downloading database...') response = requests.get(self.DATABASE_URL) if response.ok: try: self.database_path.parent.mkdir(exist_ok=True, parents=True) - with open(self.database_path, "wb") as f: + with open(self.database_path, 'wb') as f: f.write(response.content) except OSError as err: - raise ReaderError(f"Error writing database file to disk: {err}") + raise ReaderError( + f'Error writing database file to disk: {err}' + ) else: raise ReaderError( - f"Error downloading database file from {self.DATABASE_URL}" + f'Error downloading database file from {self.DATABASE_URL}' ) - print(f"Successfully saved database to {self.database_path}.") - print(f"See license: {self.DATABASE_LICENSE}") + print(f'Successfully saved database to {self.database_path}.') + print(f'See license: {self.DATABASE_LICENSE}') class ReaderError(Exception): """Generic exception for failures in ``Reader`` class. More specific errors derive from this class.""" - pass diff --git a/edpop_explorer/readers/__init__.py b/edpop_explorer/readers/__init__.py index ad776e2..3acfd63 100644 --- a/edpop_explorer/readers/__init__.py +++ b/edpop_explorer/readers/__init__.py @@ -1,4 +1,4 @@ -"""This package contains concrete subclasses of ``Reader``.""" +'''This package contains concrete subclasses of ``Reader``.''' __all__ = [ "BnFReader", @@ -51,5 +51,5 @@ def _get_all_readers() -> List[Type[Reader]]: all_readers.append(cls) return all_readers - ALL_READERS = _get_all_readers() + diff --git a/edpop_explorer/readers/bibliopolis.py b/edpop_explorer/readers/bibliopolis.py index 929b76e..3d37146 100644 --- a/edpop_explorer/readers/bibliopolis.py +++ b/edpop_explorer/readers/bibliopolis.py @@ -6,16 +6,20 @@ class BibliopolisReader(SRUReader): # Note that this reader is currently deactivated by default because # the API is not working. It is not possible for the moment to # test this reader. - sru_url = "http://jsru.kb.nl/sru/sru" - sru_version = "1.2" - HPB_LINK = "http://hpb.cerl.org/record/{}" - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/bibliopolis") + sru_url = 'http://jsru.kb.nl/sru/sru' + sru_version = '1.2' + HPB_LINK = 'http://hpb.cerl.org/record/{}' + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/bibliopolis' + ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/bibliopolis/" SHORT_NAME = "Bibliopolis" def __init__(self): super().__init__() - self.additional_params = {"x-collection": "Bibliopolis"} + self.additional_params = { + 'x-collection': 'Bibliopolis' + } def _convert_record(self, sruthirecord: dict) -> Record: record = Record(from_reader=self.__class__) diff --git a/edpop_explorer/readers/bnf.py b/edpop_explorer/readers/bnf.py index d0bbc5c..39c3366 100644 --- a/edpop_explorer/readers/bnf.py +++ b/edpop_explorer/readers/bnf.py @@ -6,30 +6,32 @@ class BnFReader(SRUMarc21BibliographicalReader): - sru_url = "http://catalogue.bnf.fr/api/SRU" - sru_version = "1.2" - HPB_LINK = "http://hpb.cerl.org/record/{}" - marcxchange_prefix = "info:lc/xmlns/marcxchange-v2:" - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/bnf") + sru_url = 'http://catalogue.bnf.fr/api/SRU' + sru_version = '1.2' + HPB_LINK = 'http://hpb.cerl.org/record/{}' + marcxchange_prefix = 'info:lc/xmlns/marcxchange-v2:' + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/bnf' + ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/bnf/" SHORT_NAME = "Bibliothèque nationale de France (BnF)" DESCRIPTION = "General catalogue of the French National Library" - _title_field_subfield = ("200", "a") - _alternative_title_field_subfield = ("500", "a") - _publisher_field_subfield = ("201", "c") - _place_field_subfield = ("210", "a") - _dating_field_subfield = ("210", "d") - _language_field_subfield = ("101", "a") + _title_field_subfield = ('200', 'a') + _alternative_title_field_subfield = ('500', 'a') + _publisher_field_subfield = ('201', 'c') + _place_field_subfield = ('210', 'a') + _dating_field_subfield = ('210', 'd') + _language_field_subfield = ('101', 'a') # TODO: add format etc @classmethod def transform_query(cls, query: str) -> str: - return "bib.anywhere all ({})".format(query) + return 'bib.anywhere all ({})'.format(query) @classmethod def _get_link(cls, data: Marc21Data) -> Optional[str]: # The link can be found in control field 003 - return data.controlfields.get("003", None) + return data.controlfields.get('003', None) @classmethod def _prepare_get_by_id_query(cls, identifier: str) -> str: diff --git a/edpop_explorer/readers/cerl_thesaurus.py b/edpop_explorer/readers/cerl_thesaurus.py index 3cb455d..8f7cdb2 100644 --- a/edpop_explorer/readers/cerl_thesaurus.py +++ b/edpop_explorer/readers/cerl_thesaurus.py @@ -6,34 +6,36 @@ class CERLThesaurusReader(SRUReader): - sru_url = "https://data.cerl.org/thesaurus/_sru" - sru_version = "1.2" - CERL_LINK = "https://data.cerl.org/thesaurus/{}" - CTAS_PREFIX = "http://sru.cerl.org/ctas/dtd/1.1:" - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/cerlthesaurus") + sru_url = 'https://data.cerl.org/thesaurus/_sru' + sru_version = '1.2' + CERL_LINK = 'https://data.cerl.org/thesaurus/{}' + CTAS_PREFIX = 'http://sru.cerl.org/ctas/dtd/1.1:' + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/cerlthesaurus' + ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/cerlthesaurus/" READERTYPE = BIOGRAPHICAL SHORT_NAME = "CERL Thesaurus" - DESCRIPTION = ( - "The CERL Thesaurus file contains forms of imprint " - "places, imprint names, personal names and corporate names as " - "found in material printed before the middle of the nineteenth " - "century - including variant spellings, forms in Latin and " + DESCRIPTION = "The CERL Thesaurus file contains forms of imprint " \ + "places, imprint names, personal names and corporate names as "\ + "found in material printed before the middle of the nineteenth "\ + "century - including variant spellings, forms in Latin and "\ "other languages, and fictitious names." - ) @classmethod - def _get_acceptable_names(cls, namelist: List[Dict[str, str]]) -> List[str]: + def _get_acceptable_names( + cls, namelist: List[Dict[str, str]] + ) -> List[str]: names = [] for name in namelist: - if name["name"] in ["single", "full"]: - names.append(name["text"]) + if name['name'] in ['single', 'full']: + names.append(name['text']) return names - + @classmethod def _convert_record(cls, sruthirecord: dict) -> Record: record = BiographicalRecord(from_reader=cls) - record.identifier = sruthirecord["id"] + record.identifier = sruthirecord['id'] record.link = cls.CERL_LINK.format(record.identifier) record.data = sruthirecord @@ -45,17 +47,17 @@ def _convert_record(cls, sruthirecord: dict) -> Record: # display name) and variantForm (multiple variant names). We will # use these respectively for name and variantName. PREFIX = cls.CTAS_PREFIX - headingform = sruthirecord.get(PREFIX + "headingForm", None) + headingform = sruthirecord.get(PREFIX + 'headingForm', None) if headingform and isinstance(headingform, list): names = cls._get_acceptable_names(headingform) if len(names): record.name = Field(names[0]) # If no headingForm was defined, try display if not record.name: - display = sruthirecord.get(PREFIX + "display", None) + display = sruthirecord.get(PREFIX + 'display', None) if display: record.name = Field(display) - variantform = sruthirecord.get(PREFIX + "variantForm", None) + variantform = sruthirecord.get(PREFIX + 'variantForm', None) if variantform and isinstance(variantform, list): names = cls._get_acceptable_names(variantform) record.variant_names = [Field(x) for x in names] @@ -63,17 +65,17 @@ def _convert_record(cls, sruthirecord: dict) -> Record: # Add activityNote. This field can have only one value in CT. # NB: this data is very inconsistent and often includes other information # than somebody's activity - consider ignoring - activitynote = sruthirecord.get(PREFIX + "activityNote") + activitynote = sruthirecord.get(PREFIX + 'activityNote') if activitynote: record.activities = [Field(activitynote)] # Add biographicalData, which appears to be in all cases the years # that somebody was alive or that an entity existed - biographicaldata = sruthirecord.get(PREFIX + "biographicalData") + biographicaldata = sruthirecord.get(PREFIX + 'biographicalData') if biographicaldata: record.timespan = Field(biographicaldata) # Add geographicalNote, which appears to be a country in all cases. # Add it to places of activity. - geographicalnote = sruthirecord.get(PREFIX + "geographicalNote") + geographicalnote = sruthirecord.get(PREFIX + 'geographicalNote') if geographicalnote: field = LocationField(geographicalnote) field.location_type = LocationField.COUNTRY diff --git a/edpop_explorer/readers/dutch_almanacs.py b/edpop_explorer/readers/dutch_almanacs.py index eff1b8d..fae77ab 100644 --- a/edpop_explorer/readers/dutch_almanacs.py +++ b/edpop_explorer/readers/dutch_almanacs.py @@ -1,22 +1,16 @@ import csv from typing import List -from edpop_explorer import ( - Reader, - ReaderError, - Field, - BibliographicalRecord, - BIBLIOGRAPHICAL, - DatabaseFileMixin, -) +from edpop_explorer import Reader, ReaderError, Field, BibliographicalRecord, BIBLIOGRAPHICAL, DatabaseFileMixin from rdflib import URIRef class DutchAlmanacsReader(DatabaseFileMixin, Reader): - """Dutch Almanacs database reader. Access with command 'dutalm'.""" - - DATABASE_URL = "https://dhstatic.hum.uu.nl/edpop/biblio_dutchalmanacs.csv" - DATABASE_FILENAME = "biblio_dutchalmanacs.csv" - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/dutch_almanacs") + """ Dutch Almanacs database reader. Access with command 'dutalm'.""" + DATABASE_URL = 'https://dhstatic.hum.uu.nl/edpop/biblio_dutchalmanacs.csv' + DATABASE_FILENAME = 'biblio_dutchalmanacs.csv' + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/dutch_almanacs' + ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/dutch_almanacs/" FETCH_ALL_AT_ONCE = True SHORT_NAME = "Dutch Almanacs" @@ -27,17 +21,15 @@ class DutchAlmanacsReader(DatabaseFileMixin, Reader): def _convert_record(cls, rawrecord: dict) -> BibliographicalRecord: record = BibliographicalRecord(from_reader=cls) record.data = rawrecord - record.identifier = rawrecord["ID"] - record.dating = Field(rawrecord["Jaar"]) - record.place_of_publication = Field(rawrecord["Plaats uitgave"]) - record.bookseller = Field(rawrecord["Boekverkoper"]) - record.contributors = [ - Field(author.strip()) for author in rawrecord["Auteur"].split("/") - ] - record.title = Field(rawrecord["Titel"]) - record.physical_description = Field(rawrecord["Formaat"]) - record.location = Field(rawrecord["Vindplaats"]) - record.publisher_or_printer = Field(rawrecord["Drukker"]) + record.identifier = rawrecord['ID'] + record.dating = Field(rawrecord['Jaar']) + record.place_of_publication = Field(rawrecord['Plaats uitgave']) + record.bookseller = Field(rawrecord['Boekverkoper']) + record.contributors = [Field(author.strip()) for author in rawrecord['Auteur'].split('/')] + record.title = Field(rawrecord['Titel']) + record.physical_description = Field(rawrecord['Formaat']) + record.location = Field(rawrecord['Vindplaats']) + record.publisher_or_printer = Field(rawrecord['Drukker']) return record @classmethod @@ -49,10 +41,10 @@ def transform_query(cls, query) -> str: def get_by_id(cls, identifier: str) -> BibliographicalRecord: reader = cls() reader.prepare_data() - with open(reader.database_path, "r", encoding="utf-8-sig") as file: - reader = csv.DictReader(file, delimiter=";") + with open(reader.database_path, 'r', encoding='utf-8-sig') as file: + reader = csv.DictReader(file, delimiter=';') for row in reader: - if row["ID"] == identifier: + if row['ID'] == identifier: return cls._convert_record(row) raise ReaderError(f"Item with id {identifier} does not exist.") @@ -62,8 +54,8 @@ def _perform_query(self) -> List[BibliographicalRecord]: # Search query in all columns, and fetch results based on query results = [] - with open(self.database_path, "r", encoding="utf-8-sig") as file: - reader = csv.DictReader(file, delimiter=";") + with open(self.database_path, 'r', encoding='utf-8-sig') as file: + reader = csv.DictReader(file, delimiter=';') for row in reader: for key in row.keys(): if self.prepared_query.lower() in row[key].lower(): @@ -80,11 +72,11 @@ def _perform_query(self) -> List[BibliographicalRecord]: def fetch_range(self, range_to_fetch: range) -> range: if self.prepared_query is None: - raise ReaderError("First call prepare_query") + raise ReaderError('First call prepare_query') if self.fetching_exhausted: return range(0) start_record = range_to_fetch.start results = self._perform_query() for i, result in enumerate(results): self.records[i] = result - return range(start_record, start_record + len(results)) + return range(start_record, start_record + len(results)) \ No newline at end of file diff --git a/edpop_explorer/readers/fbtee.py b/edpop_explorer/readers/fbtee.py index 2bf2d5f..6bc9d87 100644 --- a/edpop_explorer/readers/fbtee.py +++ b/edpop_explorer/readers/fbtee.py @@ -3,12 +3,7 @@ from typing import Optional from edpop_explorer import ( - Reader, - BibliographicalRecord, - ReaderError, - Field, - BIBLIOGRAPHICAL, - DatabaseFileMixin, + Reader, BibliographicalRecord, ReaderError, Field, BIBLIOGRAPHICAL, DatabaseFileMixin ) from edpop_explorer.fields import LanguageField from edpop_explorer.reader import GetByIdBasedOnQueryMixin @@ -16,54 +11,58 @@ class FBTEEReader(DatabaseFileMixin, GetByIdBasedOnQueryMixin, Reader): - DATABASE_URL = "https://dhstatic.hum.uu.nl/edpop/cl.sqlite3" - DATABASE_FILENAME = "cl.sqlite3" - DATABASE_LICENSE = "https://dhstatic.hum.uu.nl/edpop/LICENSE.txt" - FBTEE_LINK = "http://fbtee.uws.edu.au/stn/interface/browse.php?t=book&" "id={}" + DATABASE_URL = 'https://dhstatic.hum.uu.nl/edpop/cl.sqlite3' + DATABASE_FILENAME = 'cl.sqlite3' + DATABASE_LICENSE = 'https://dhstatic.hum.uu.nl/edpop/LICENSE.txt' + FBTEE_LINK = 'http://fbtee.uws.edu.au/stn/interface/browse.php?t=book&' \ + 'id={}' READERTYPE = BIBLIOGRAPHICAL - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/fbtee") + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/fbtee' + ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/fbtee/" prepared_query: Optional[SQLPreparedQuery] = None FETCH_ALL_AT_ONCE = True SHORT_NAME = "French Book Trade in Enlightenment Europe (FBTEE)" - DESCRIPTION = ( - "Mapping the Trade of the Société Typographique de " "Neuchâtel, 1769-1794" - ) + DESCRIPTION = "Mapping the Trade of the Société Typographique de " \ + "Neuchâtel, 1769-1794" @classmethod def _prepare_get_by_id_query(cls, identifier: str) -> SQLPreparedQuery: return SQLPreparedQuery( - where_statement="WHERE book_code = ?", arguments=[identifier] + where_statement="WHERE book_code = ?", + arguments=[identifier] ) @classmethod def transform_query(cls, query: str) -> SQLPreparedQuery: return SQLPreparedQuery( - where_statement="WHERE full_book_title LIKE ?", arguments=[f"%{query}%"] + where_statement='WHERE full_book_title LIKE ?', + arguments=[f'%{query}%'] ) @classmethod def _add_fields(cls, record: BibliographicalRecord) -> None: assert isinstance(record.data, dict) - record.title = Field(record.data["full_book_title"]) - if record.data["languages"]: - languages = record.data["languages"].split(sep=", ") + record.title = Field(record.data['full_book_title']) + if record.data['languages']: + languages = record.data['languages'].split(sep=', ') record.languages = [LanguageField(x) for x in languages] [x.normalize() for x in record.languages] - pages = record.data["pages"] + pages = record.data['pages'] if pages: record.extent = Field(pages) - place = record.data["stated_publication_places"] + place = record.data['stated_publication_places'] if place: record.place_of_publication = Field(place) - year = record.data["stated_publication_years"] + year = record.data['stated_publication_years'] if year: record.dating = Field(year) - publisher = record.data["stated_publishers"] + publisher = record.data['stated_publishers'] if publisher: record.publisher_or_printer = Field(publisher) record.contributors = [] - for author in record.data["authors"]: + for author in record.data['authors']: # author is tuple of author code and author name record.contributors.append(Field(author[1])) @@ -73,26 +72,26 @@ def fetch_range(self, range_to_fetch: range) -> range: # the dataset is small. self.prepare_data() if not self.prepared_query: - raise ReaderError("First call prepare_query method") + raise ReaderError('First call prepare_query method') if self.fetching_exhausted: return range(0) with sqlite3.connect(str(self.database_path)) as con: cur = con.cursor() - columns = [x[1] for x in cur.execute("PRAGMA table_info(books)")] + columns = [x[1] for x in cur.execute('PRAGMA table_info(books)')] res = cur.execute( - "SELECT B.*, BA.author_code, A.author_name FROM books B " - "LEFT OUTER JOIN books_authors BA on B.book_code=BA.book_code " - "JOIN authors A on BA.author_code=A.author_code " - f"{self.prepared_query.where_statement} " - "ORDER BY B.book_code", - self.prepared_query.arguments, + 'SELECT B.*, BA.author_code, A.author_name FROM books B ' + 'LEFT OUTER JOIN books_authors BA on B.book_code=BA.book_code ' + 'JOIN authors A on BA.author_code=A.author_code ' + f'{self.prepared_query.where_statement} ' + 'ORDER BY B.book_code', + self.prepared_query.arguments ) - last_book_code = "" + last_book_code = '' i = -1 for row in res: # Since we are joining with another table, a book may be repeated, # so check if this is a new item - book_code: str = row[columns.index("book_code")] + book_code: str = row[columns.index('book_code')] if last_book_code != book_code: # We have a new book, so update i i += 1 @@ -102,7 +101,7 @@ def fetch_range(self, range_to_fetch: range) -> range: record.data[columns[j]] = row[j] record.identifier = book_code record.link = self.FBTEE_LINK.format(book_code) - record.data["authors"] = [] + record.data['authors'] = [] self.records[i] = record last_book_code = book_code # Add author_code and author_name to the last record @@ -110,7 +109,7 @@ def fetch_range(self, range_to_fetch: range) -> range: author_code = row[len(columns)] author_name = row[len(columns) + 1] assert isinstance(self.records[i].data, dict) - self.records[i].data["authors"].append((author_code, author_name)) + self.records[i].data['authors'].append((author_code, author_name)) for record_number in self.records: record = self.records[record_number] assert isinstance(record, BibliographicalRecord) diff --git a/edpop_explorer/readers/gallica.py b/edpop_explorer/readers/gallica.py index c9137d5..02a36f9 100644 --- a/edpop_explorer/readers/gallica.py +++ b/edpop_explorer/readers/gallica.py @@ -18,11 +18,11 @@ def _force_list(data) -> list: def _force_string(data) -> Optional[str]: - """Transform data into one string or None. Can be used if a single + '''Transform data into one string or None. Can be used if a single string is expected, but if there is a possibility that it is a - list.""" + list.''' if isinstance(data, list): - return " ; ".join([str(x) for x in data]) + return ' ; '.join([str(x) for x in data]) elif data is None: return None else: @@ -30,19 +30,20 @@ def _force_string(data) -> Optional[str]: class GallicaReader(SRUReader): - sru_url = "https://gallica.bnf.fr/SRU" - sru_version = "1.2" - CERL_LINK = "https://data.cerl.org/thesaurus/{}" - CTAS_PREFIX = "http://sru.cerl.org/ctas/dtd/1.1:" - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/gallica") + sru_url = 'https://gallica.bnf.fr/SRU' + sru_version = '1.2' + CERL_LINK = 'https://data.cerl.org/thesaurus/{}' + CTAS_PREFIX = 'http://sru.cerl.org/ctas/dtd/1.1:' + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/gallica' + ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/gallica/" DOCUMENT_API_URL = "https://gallica.bnf.fr/services/OAIRecord?ark={}" IDENTIFIER_PREFIX = "https://gallica.bnf.fr/" READERTYPE = BIBLIOGRAPHICAL SHORT_NAME = "Gallica" - DESCRIPTION = ( - "Digital library of the Bibliothèque nationale de France " "and its partners" - ) + DESCRIPTION = "Digital library of the Bibliothèque nationale de France " \ + "and its partners" @classmethod def _convert_record(cls, sruthirecord: dict) -> BibliographicalRecord: @@ -52,30 +53,30 @@ def _convert_record(cls, sruthirecord: dict) -> BibliographicalRecord: # string from sruthi, in the latter case as a list of strings. # Take the first string starting with https:// as the identifier # and as the link. - identifiers = _force_list(sruthirecord.get("identifier", None)) + identifiers = _force_list(sruthirecord.get('identifier', None)) for identifier in identifiers: if identifier.startswith(cls.IDENTIFIER_PREFIX): - record.identifier = identifier[len(cls.IDENTIFIER_PREFIX) :] + record.identifier = identifier[len(cls.IDENTIFIER_PREFIX):] record.link = identifier record.data = {} for key in sruthirecord: - if key in ["schema", "id"]: + if key in ['schema', 'id']: continue - showkey: str = key.replace(cls.CTAS_PREFIX, "ctas:") + showkey: str = key.replace(cls.CTAS_PREFIX, 'ctas:') record.data[showkey] = sruthirecord[key] record.data = sruthirecord - title = _force_string(sruthirecord.get("title", None)) + title = _force_string(sruthirecord.get('title', None)) if title: record.title = Field(title) - creators = _force_list(sruthirecord.get("creator", None)) + creators = _force_list(sruthirecord.get('creator', None)) record.contributors = [Field(x) for x in creators] - dating = _force_string(sruthirecord.get("date", None)) + dating = _force_string(sruthirecord.get('date', None)) if dating: record.dating = Field(dating) - languages = _force_list(sruthirecord.get("language", None)) + languages = _force_list(sruthirecord.get('language', None)) record.languages = [LanguageField(x) for x in languages] [x.normalize() for x in record.languages] - publisher = _force_string(sruthirecord.get("publisher", None)) + publisher = _force_string(sruthirecord.get('publisher', None)) if publisher: record.publisher_or_printer = Field(publisher) @@ -83,12 +84,10 @@ def _convert_record(cls, sruthirecord: dict) -> BibliographicalRecord: # the number of views, the MIME type and the extent. # Try finding the extent by filtering out the other two. # This seems to work correctly. - format_strings = _force_list(sruthirecord.get("format", None)) + format_strings = _force_list(sruthirecord.get('format', None)) for formatstr in format_strings: - if not ( - formatstr.startswith("Nombre total de vues") - or re.match("$[a-z]+/[a-z]+^", formatstr) - ): + if not (formatstr.startswith('Nombre total de vues') or + re.match('$[a-z]+/[a-z]+^', formatstr)): record.extent = Field(formatstr) break @@ -96,7 +95,7 @@ def _convert_record(cls, sruthirecord: dict) -> BibliographicalRecord: @classmethod def get_by_id(cls, identifier: str) -> BibliographicalRecord: - # Getting by id works via another interface (a simple XML API), but the + # Getting by id works via another interface (a simple XML API), but the # returned data is the same in a slightly different format. Hence, # convert it to JSON just like sruthi does and extract the right piece # of data. @@ -116,8 +115,8 @@ def get_by_id(cls, identifier: str) -> BibliographicalRecord: data = response_as_dict["results"]["notice"]["record"]["metadata"]["dc"] # The returned XML has elements with attributes, while these attributes # are missing from the XML that is sent back by the SRU interface. - # An attribute-less element is represented as a simple string by - # xmltodict, while an attribute with elements is represented as a + # An attribute-less element is represented as a simple string by + # xmltodict, while an attribute with elements is represented as a # dict where the contents is in the value of "text". Replace these # dicts with simple strings. (Not a very clean solution but refactoring # is not worth the time at this point.) @@ -132,4 +131,4 @@ def get_by_id(cls, identifier: str) -> BibliographicalRecord: @classmethod def transform_query(cls, query: str) -> str: - return "gallica all {}".format(query) + return 'gallica all {}'.format(query) diff --git a/edpop_explorer/readers/hpb.py b/edpop_explorer/readers/hpb.py index c905f67..b98f75c 100644 --- a/edpop_explorer/readers/hpb.py +++ b/edpop_explorer/readers/hpb.py @@ -1,14 +1,18 @@ from rdflib import URIRef from typing import Optional -from edpop_explorer import SRUMarc21BibliographicalReader, Marc21Data, BIBLIOGRAPHICAL +from edpop_explorer import ( + SRUMarc21BibliographicalReader, Marc21Data, BIBLIOGRAPHICAL +) class HPBReader(SRUMarc21BibliographicalReader): - sru_url = "http://sru.k10plus.de/hpb" - sru_version = "1.1" - HPB_LINK = "http://hpb.cerl.org/record/{}" - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/hpb") + sru_url = 'http://sru.k10plus.de/hpb' + sru_version = '1.1' + HPB_LINK = 'http://hpb.cerl.org/record/{}' + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/hpb' + ) READERTYPE = BIBLIOGRAPHICAL IRI_PREFIX = "https://edpop.hum.uu.nl/readers/hpb/" SHORT_NAME = "Heritage of the Printed Book (HPB)" @@ -29,15 +33,17 @@ def _prepare_get_by_id_query(cls, identifier: str) -> str: return f"pica.cid={identifier}" @classmethod - def _get_identifier(cls, data: Marc21Data) -> Optional[str]: + def _get_identifier(cls, data:Marc21Data) -> Optional[str]: # The record id can be found in field 035 in subfield a starting # with (CERL), like this: (CERL)HU-SzSEK.01.bibJAT603188. # The URI can then be created using HPB_URI. # HPB records have field 035 two times. - fields035 = data.get_fields("035") + fields035 = data.get_fields('035') for field in fields035: - if "a" in field.subfields and field.subfields["a"].startswith("(CERL)"): - return field.subfields["a"][len("(CERL)") :] + if 'a' in field.subfields and \ + field.subfields['a'].startswith('(CERL)'): + return field.subfields['a'][len('(CERL)'):] + @classmethod def _get_link(cls, data: Marc21Data) -> Optional[str]: diff --git a/edpop_explorer/readers/kb.py b/edpop_explorer/readers/kb.py index 1dd5552..b383388 100644 --- a/edpop_explorer/readers/kb.py +++ b/edpop_explorer/readers/kb.py @@ -6,10 +6,12 @@ class KBReader(SRUReader): - sru_url = "http://jsru.kb.nl/sru" - sru_version = "1.2" - KB_LINK = "https://webggc.oclc.org/cbs/DB=2.37/PPN?PPN={}" - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/kb") + sru_url = 'http://jsru.kb.nl/sru' + sru_version = '1.2' + KB_LINK = 'https://webggc.oclc.org/cbs/DB=2.37/PPN?PPN={}' + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/kb' + ) READERTYPE = BIBLIOGRAPHICAL IRI_PREFIX = "https://edpop.hum.uu.nl/readers/kb/" SHORT_NAME = "Koninklijke Bibliotheek (KB)" @@ -18,7 +20,9 @@ class KBReader(SRUReader): def __init__(self): super().__init__() # The KB SRU requires 'x-collection' as an additional GET parameter - self.session.params = {"x-collection": "GGC"} + self.session.params = { + 'x-collection': 'GGC' + } @classmethod def transform_query(cls, query: str) -> str: @@ -28,12 +32,12 @@ def _find_ppn(self, data: dict): """Try to find the PPN given the data that comes from the SRU server; return None if PPN cannot be found""" # This seems to work fine; not thoroughly tested. - oai_pmh_identifier = data.get("OaiPmhIdentifier", None) + oai_pmh_identifier = data.get('OaiPmhIdentifier', None) if not isinstance(oai_pmh_identifier, str): return None - PREFIX = "GGC:AC:" + PREFIX = 'GGC:AC:' if oai_pmh_identifier and oai_pmh_identifier.startswith(PREFIX): - return oai_pmh_identifier[len(PREFIX) :] + return oai_pmh_identifier[len(PREFIX):] return None def _convert_record(self, sruthirecord: dict) -> BibliographicalRecord: @@ -52,14 +56,14 @@ def _convert_record(self, sruthirecord: dict) -> BibliographicalRecord: record.languages = self._get_languages(sruthirecord) # TODO: add the other fields return record - + def _get_title(self, data) -> Optional[Field]: - if "title" in data: - title = data["title"] + if 'title' in data: + title = data['title'] if isinstance(title, list): # Title contains a list of strings if it consists of multiple # parts - return Field(" : ".join(title)) + return Field(' : '.join(title)) else: return Field(title) else: @@ -71,11 +75,10 @@ def _get_languages(self, data) -> Optional[List[Field]]: # One of them is always a three-letter language code, so only # pass on these. NB: there is a possibility that not all entries # consisting of three characters are language codes. - if "language" not in data: + if 'language' not in data: return [] fields = [ - LanguageField(x) - for x in data["language"] + LanguageField(x) for x in data['language'] if isinstance(x, str) and len(x) == 3 ] for field in fields: diff --git a/edpop_explorer/readers/kvcs.py b/edpop_explorer/readers/kvcs.py index b17291f..c7834f1 100644 --- a/edpop_explorer/readers/kvcs.py +++ b/edpop_explorer/readers/kvcs.py @@ -1,22 +1,16 @@ import csv from typing import List -from edpop_explorer import ( - Reader, - ReaderError, - Field, - BiographicalRecord, - BIOGRAPHICAL, - DatabaseFileMixin, -) +from edpop_explorer import Reader, ReaderError, Field, BiographicalRecord, BIOGRAPHICAL, DatabaseFileMixin from rdflib import URIRef class KVCSReader(DatabaseFileMixin, Reader): - """KVCS database reader. Access with command 'kvcs'.""" - - DATABASE_URL = "https://dhstatic.hum.uu.nl/edpop/biblio_kvcs.csv" - DATABASE_FILENAME = "biblio_kvcs.csv" - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/kvcs") + """ KVCS database reader. Access with command 'kvcs'.""" + DATABASE_URL = 'https://dhstatic.hum.uu.nl/edpop/biblio_kvcs.csv' + DATABASE_FILENAME = 'biblio_kvcs.csv' + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/kvcs' + ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/kvcs/" FETCH_ALL_AT_ONCE = True SHORT_NAME = "KVCS" @@ -27,13 +21,13 @@ class KVCSReader(DatabaseFileMixin, Reader): def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: record = BiographicalRecord(from_reader=cls) record.data = rawrecord - record.identifier = rawrecord["ID"] - record.name = Field(rawrecord["Name"]) - record.gender = Field(rawrecord["Gender"]) - record.lifespan = Field(rawrecord["Years of life"]) - record.places_of_activity = Field(rawrecord["City"]) - record.activity_timespan = Field(rawrecord["Years of activity"]) - record.activities = Field(rawrecord["Kind of print and sales activities"]) + record.identifier = rawrecord['ID'] + record.name = Field(rawrecord['Name']) + record.gender = Field(rawrecord['Gender']) + record.lifespan = Field(rawrecord['Years of life']) + record.places_of_activity = Field(rawrecord['City']) + record.activity_timespan = Field(rawrecord['Years of activity']) + record.activities = Field(rawrecord['Kind of print and sales activities']) return record @classmethod @@ -45,27 +39,27 @@ def transform_query(cls, query) -> str: def get_by_id(cls, identifier: str) -> BiographicalRecord: reader = cls() reader.prepare_data() - with open(reader.database_path, "r", encoding="utf-8-sig") as file: - reader = csv.DictReader(file, delimiter=";") + with open(reader.database_path, 'r', encoding='utf-8-sig') as file: + reader = csv.DictReader(file, delimiter=';') for row in reader: - if row["ID"] == identifier: + if row['ID'] == identifier: return cls._convert_record(row) raise ReaderError(f"Item with id {identifier} does not exist.") - + def _perform_query(self) -> List[BiographicalRecord]: assert isinstance(self.prepared_query, str) self.prepare_data() - + # Search query in all columns, and fetch results based on query results = [] - with open(self.database_path, "r", encoding="utf-8-sig") as file: - reader = csv.DictReader(file, delimiter=";") + with open(self.database_path, 'r', encoding='utf-8-sig') as file: + reader = csv.DictReader(file, delimiter=';') for row in reader: for key in row.keys(): if self.prepared_query.lower() in row[key].lower(): results.append(row) break - + self.number_of_results = len(results) records = [] for result in results: @@ -76,11 +70,11 @@ def _perform_query(self) -> List[BiographicalRecord]: def fetch_range(self, range_to_fetch: range) -> range: if self.prepared_query is None: - raise ReaderError("First call prepare_query") + raise ReaderError('First call prepare_query') if self.fetching_exhausted: return range(0) start_record = range_to_fetch.start results = self._perform_query() for i, result in enumerate(results): self.records[i] = result - return range(start_record, start_record + len(results)) + return range(start_record, start_record + len(results)) \ No newline at end of file diff --git a/edpop_explorer/readers/pierre_belle.py b/edpop_explorer/readers/pierre_belle.py index 0d5c1fd..0ba6af0 100644 --- a/edpop_explorer/readers/pierre_belle.py +++ b/edpop_explorer/readers/pierre_belle.py @@ -1,44 +1,36 @@ import csv from typing import List -from edpop_explorer import ( - Reader, - ReaderError, - BibliographicalRecord, - Field, - DatabaseFileMixin, - BIBLIOGRAPHICAL, -) +from edpop_explorer import Reader, ReaderError, BibliographicalRecord, Field, DatabaseFileMixin, BIBLIOGRAPHICAL from rdflib import URIRef from edpop_explorer.fields import LanguageField class PierreBelleReader(DatabaseFileMixin, Reader): - """Pierre-Belle database reader. Access with command 'pb'.""" - - DATABASE_URL = "https://dhstatic.hum.uu.nl/edpop/biblio_pierrebelle.csv" - DATABASE_FILENAME = "biblio_pierrebelle.csv" - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/pierre_belle") + """ Pierre-Belle database reader. Access with command 'pb'.""" + DATABASE_URL = 'https://dhstatic.hum.uu.nl/edpop/biblio_pierrebelle.csv' + DATABASE_FILENAME = 'biblio_pierrebelle.csv' + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/pierre_belle' + ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/pierre_belle/" FETCH_ALL_AT_ONCE = True READERTYPE = BIBLIOGRAPHICAL SHORT_NAME = "Pierre and Belle" - DESCRIPTION = ( - "Bibliography of early modern editions of Pierre de " + DESCRIPTION = "Bibliography of early modern editions of Pierre de " \ "Provence et la Belle Maguelonne (ca. 1470-ca. 1800)" - ) @classmethod def _convert_record(cls, rawrecord: dict) -> BibliographicalRecord: record = BibliographicalRecord(from_reader=cls) record.data = rawrecord - record.identifier = rawrecord["ID"] - record.title = Field(rawrecord["Shortened title"]) - record.languages = [LanguageField(rawrecord["Language"])] + record.identifier = rawrecord['ID'] + record.title = Field(rawrecord['Shortened title']) + record.languages = [LanguageField(rawrecord['Language'])] [x.normalize() for x in record.languages] - record.publisher_or_printer = Field(rawrecord["Publisher"]) - record.place_of_publication = Field(rawrecord["Place of publication"]) - record.dating = Field(rawrecord["Date"]) + record.publisher_or_printer = Field(rawrecord['Publisher']) + record.place_of_publication = Field(rawrecord['Place of publication']) + record.dating = Field(rawrecord['Date']) return record @classmethod @@ -50,27 +42,27 @@ def transform_query(cls, query) -> str: def get_by_id(cls, identifier: str) -> BibliographicalRecord: reader = cls() reader.prepare_data() - with open(reader.database_path, "r", encoding="utf-8-sig") as file: - reader = csv.DictReader(file, delimiter=";") + with open(reader.database_path, 'r', encoding='utf-8-sig') as file: + reader = csv.DictReader(file, delimiter=';') for row in reader: - if row["ID"] == identifier: + if row['ID'] == identifier: return cls._convert_record(row) raise ReaderError(f"Item with id {identifier} does not exist.") - + def _perform_query(self) -> List[BibliographicalRecord]: assert isinstance(self.prepared_query, str) self.prepare_data() - + # Search query in all columns, and fetch results based on query results = [] - with open(self.database_path, "r", encoding="utf-8-sig") as file: - reader = csv.DictReader(file, delimiter=";") + with open(self.database_path, 'r', encoding='utf-8-sig') as file: + reader = csv.DictReader(file, delimiter=';') for row in reader: for key in row.keys(): if self.prepared_query in row[key]: results.append(row) break - + self.number_of_results = len(results) records = [] for result in results: @@ -81,11 +73,11 @@ def _perform_query(self) -> List[BibliographicalRecord]: def fetch_range(self, range_to_fetch: range) -> range: if self.prepared_query is None: - raise ReaderError("First call prepare_query") + raise ReaderError('First call prepare_query') if self.fetching_exhausted: return range(0) start_record = range_to_fetch.start results = self._perform_query() for i, result in enumerate(results): self.records[i] = result - return range(start_record, start_record + len(results)) + return range(start_record, start_record + len(results)) \ No newline at end of file diff --git a/edpop_explorer/readers/sbtireader.py b/edpop_explorer/readers/sbtireader.py index 7971cc3..f37c692 100644 --- a/edpop_explorer/readers/sbtireader.py +++ b/edpop_explorer/readers/sbtireader.py @@ -1,24 +1,26 @@ from rdflib import URIRef from typing import List, Dict, Optional -from edpop_explorer import BiographicalRecord, Field, BIOGRAPHICAL +from edpop_explorer import ( + BiographicalRecord, Field, BIOGRAPHICAL +) from edpop_explorer.cerl import CERLReader class SBTIReader(CERLReader): - API_URL = "https://data.cerl.org/sbti/_search" - API_BY_ID_BASE_URL = "https://data.cerl.org/sbti/" - LINK_BASE_URL = "https://data.cerl.org/sbti/" + API_URL = 'https://data.cerl.org/sbti/_search' + API_BY_ID_BASE_URL = 'https://data.cerl.org/sbti/' + LINK_BASE_URL = 'https://data.cerl.org/sbti/' additional_params: Optional[Dict[str, str]] = None - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/sbti") + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/sbti' + ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/sbti/" DEFAULT_RECORDS_PER_PAGE = 10 READERTYPE = BIOGRAPHICAL SHORT_NAME = "Scottish Book Trade Index (SBTI)" - DESCRIPTION = ( - "An index of the names, trades and addresses of people " + DESCRIPTION = "An index of the names, trades and addresses of people "\ "involved in printing in Scotland up to 1850" - ) @classmethod def _get_name_field(cls, data: dict) -> Optional[Field]: @@ -35,9 +37,9 @@ def _get_name_field(cls, data: dict) -> Optional[Field]: def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: record = BiographicalRecord(from_reader=cls) record.data = rawrecord - record.identifier = rawrecord.get("id", None) + record.identifier = rawrecord.get('id', None) if not record.identifier: - record.identifier = rawrecord.get("_id", None) + record.identifier = rawrecord.get('_id', None) if record.identifier: record.link = cls.LINK_BASE_URL + record.identifier @@ -63,3 +65,4 @@ def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: record.places_of_activity.append(field) return record + diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index c9476ce..527c775 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -5,12 +5,14 @@ from edpop_explorer import Field, BIBLIOGRAPHICAL, BibliographicalRecord, LocationField from edpop_explorer.cerl import CERLReader from edpop_explorer.fields import LanguageField, ContributorField -from edpop_explorer.sparqlreader import SparqlReader, BibliographicalRDFRecord +from edpop_explorer.sparqlreader import ( + SparqlReader, BibliographicalRDFRecord +) def _remove_markup(input_str: str) -> str: """Remove STCN-specific markup""" - return input_str.replace("`IT`", "").replace("`LO`", "") + return input_str.replace('`IT`', '').replace('`LO`', '') def safeget(dictionary: Optional[dict], attribute_chain: tuple, first: bool = False): @@ -27,10 +29,12 @@ def safeget(dictionary: Optional[dict], attribute_chain: tuple, first: bool = Fa class STCNReader(CERLReader): - API_URL = "https://data.cerl.org/stcn/_search" - API_BY_ID_BASE_URL = "https://data.cerl.org/stcn/" - LINK_BASE_URL = "https://data.cerl.org/stcn/" - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/stcn") + API_URL = 'https://data.cerl.org/stcn/_search' + API_BY_ID_BASE_URL = 'https://data.cerl.org/stcn/' + LINK_BASE_URL = 'https://data.cerl.org/stcn/' + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/stcn' + ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/stcn/" READERTYPE = BIBLIOGRAPHICAL SHORT_NAME = "Short-Title Catalogue Netherlands (STCN)" @@ -55,7 +59,7 @@ def _get_contributors(cls, rawrecord: dict) -> list[Field]: continue contributor = ContributorField(name) contributor.name = name - contributor.role = safeget(actor, ("role",), first=True) + contributor.role = safeget(actor, ('role',), first=True) contributors.append(contributor) return contributors @@ -120,9 +124,7 @@ def _get_collation_formula(cls, rawrecord: dict) -> Optional[Field]: if not collations: return None # Multiple collation formulas are possible, but this seems to be rare. - collation_string = " ; ".join( - [x.get("value") for x in collations if "value" in x] - ) + collation_string = ' ; '.join([x.get("value") for x in collations if "value" in x]) return Field(collation_string) @classmethod @@ -131,9 +133,7 @@ def _get_fingerprint(cls, rawrecord: dict) -> Optional[Field]: if not fingerprints: return None # Multiple fingerprints are possible, but this seems to be rare - fingerprint_string = " ; ".join( - [x.get("fingerprint") for x in fingerprints if "fingerprint" in x] - ) + fingerprint_string = ' ; '.join([x.get("fingerprint") for x in fingerprints if "fingerprint" in x]) return Field(fingerprint_string) @classmethod @@ -161,7 +161,7 @@ def _get_holdings(cls, rawrecord: dict) -> list[Field]: def _convert_record(cls, rawrecord: dict) -> BibliographicalRecord: record = BibliographicalRecord(from_reader=cls) record.data = rawrecord - record.identifier = rawrecord.get("id", None) + record.identifier = rawrecord.get('id', None) if record.identifier: record.link = cls.LINK_BASE_URL + record.identifier record.title = cls._get_title(rawrecord) diff --git a/edpop_explorer/readers/ustc.py b/edpop_explorer/readers/ustc.py index 2a5f360..231a43a 100644 --- a/edpop_explorer/readers/ustc.py +++ b/edpop_explorer/readers/ustc.py @@ -3,23 +3,20 @@ from rdflib import URIRef from edpop_explorer import ( - Reader, - BibliographicalRecord, - ReaderError, - Field, - BIBLIOGRAPHICAL, - GetByIdBasedOnQueryMixin, - DatabaseFileMixin, + Reader, BibliographicalRecord, ReaderError, Field, BIBLIOGRAPHICAL, + GetByIdBasedOnQueryMixin, DatabaseFileMixin ) from edpop_explorer.fields import LanguageField from edpop_explorer.sql import SQLPreparedQuery class USTCReader(DatabaseFileMixin, GetByIdBasedOnQueryMixin, Reader): - DATABASE_FILENAME = "ustc.sqlite3" - USTC_LINK = "https://www.ustc.ac.uk/editions/{}" + DATABASE_FILENAME = 'ustc.sqlite3' + USTC_LINK = 'https://www.ustc.ac.uk/editions/{}' READERTYPE = BIBLIOGRAPHICAL - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/ustc") + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/ustc' + ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/ustc/" prepared_query: Optional[SQLPreparedQuery] = None FETCH_ALL_AT_ONCE = True @@ -30,19 +27,19 @@ class USTCReader(DatabaseFileMixin, GetByIdBasedOnQueryMixin, Reader): def transform_query(cls, query: str) -> SQLPreparedQuery: if len(query.strip()) < 3: # Do not allow very short USTC queries because they are very slow - raise ReaderError("USTC query must have at least 3 characters.") - where_statement = ( - "WHERE E.std_title LIKE ? " - "OR E.author_name_1 LIKE ? " - "OR E.author_name_2 LIKE ? " - "OR E.author_name_3 LIKE ? " - "OR E.author_name_4 LIKE ? " - "OR E.author_name_5 LIKE ? " - "OR E.author_name_6 LIKE ? " - "OR E.author_name_7 LIKE ? " - "OR E.author_name_8 LIKE ? " + raise ReaderError('USTC query must have at least 3 characters.') + where_statement = ( + 'WHERE E.std_title LIKE ? ' + 'OR E.author_name_1 LIKE ? ' + 'OR E.author_name_2 LIKE ? ' + 'OR E.author_name_3 LIKE ? ' + 'OR E.author_name_4 LIKE ? ' + 'OR E.author_name_5 LIKE ? ' + 'OR E.author_name_6 LIKE ? ' + 'OR E.author_name_7 LIKE ? ' + 'OR E.author_name_8 LIKE ? ' ) - like_argument = "%" + query + "%" + like_argument = '%' + query + '%' arguments: List[Union[str, int]] = [like_argument for _ in range(9)] return SQLPreparedQuery(where_statement, arguments) @@ -53,7 +50,8 @@ def _prepare_get_by_id_query(cls, identifier: str) -> SQLPreparedQuery: except ValueError: raise ReaderError(f"Identifier {identifier} is not an integer") return SQLPreparedQuery( - where_statement="WHERE E.sn = ?", arguments=[identifier_int] + where_statement="WHERE E.sn = ?", + arguments=[identifier_int] ) def fetch_range(self, range_to_fetch: range) -> range: @@ -64,19 +62,19 @@ def fetch_range(self, range_to_fetch: range) -> range: # locally stored. if not self.prepared_query: - raise ReaderError("No query has been set") + raise ReaderError('No query has been set') if self.fetching_exhausted: return range(0) cur = con.cursor() - columns = [x[1] for x in cur.execute("PRAGMA table_info(editions)")] + columns = [x[1] for x in cur.execute('PRAGMA table_info(editions)')] # This kind of query is far from ideal, but the alternative is to # implement SQLite full text search which is probably too much work # for our current goal (i.e. getting insight in the data structures) res = cur.execute( - "SELECT E.* FROM editions E " + 'SELECT E.* FROM editions E ' + self.prepared_query.where_statement - + " ORDER BY E.id", + + ' ORDER BY E.id', self.prepared_query.arguments, ) for i, row in enumerate(res): @@ -91,28 +89,29 @@ def fetch_range(self, range_to_fetch: range) -> range: def _convert_record(self, data: dict) -> BibliographicalRecord: record = BibliographicalRecord(from_reader=self.__class__) record.data = data - record.identifier = data["sn"] - record.link = self.USTC_LINK.format(data["sn"]) - record.title = Field(data["std_title"]) + record.identifier = data['sn'] + record.link = self.USTC_LINK.format(data['sn']) + record.title = Field(data['std_title']) record.contributors = [] for i in range(8): - fieldname = f"author_name_{i + 1}" + fieldname = f'author_name_{i + 1}' if data[fieldname]: record.contributors.append(Field(data[fieldname])) - if data["printer_name_1"]: + if data['printer_name_1']: # TODO: support for multiple printers - record.publisher_or_printer = Field(data["printer_name_1"]) - if data["place"]: - record.place_of_publication = Field(data["place"]) - if data["year"]: - record.dating = Field(data["year"]) + record.publisher_or_printer = Field(data['printer_name_1']) + if data['place']: + record.place_of_publication = Field(data['place']) + if data['year']: + record.dating = Field(data['year']) record.languages = [] for i in range(4): - fieldname = f"language_{i + 1}" + fieldname = f'language_{i + 1}' if data[fieldname]: field = LanguageField(data[fieldname]) field.normalize() record.languages.append(field) - if data["pagination"]: - record.extent = Field(data["pagination"]) + if data['pagination']: + record.extent = Field(data['pagination']) return record + diff --git a/edpop_explorer/readers/vd.py b/edpop_explorer/readers/vd.py index 8cd16e3..260e805 100644 --- a/edpop_explorer/readers/vd.py +++ b/edpop_explorer/readers/vd.py @@ -5,14 +5,14 @@ from edpop_explorer import SRUMarc21BibliographicalReader, Marc21Data -class VDCommonMixin: +class VDCommonMixin(): LINK_FORMAT: str @classmethod def _get_identifier(cls, data: Marc21Data) -> Optional[str]: - field024 = data.get_first_field("024") + field024 = data.get_first_field('024') if field024: - return field024.subfields.get("a", None) + return field024.subfields.get('a', None) else: return None @@ -20,14 +20,16 @@ def _get_identifier(cls, data: Marc21Data) -> Optional[str]: def _get_link(cls, record: Marc21Data) -> Optional[str]: identifier = cls._get_identifier(record) if identifier: - return cls.LINK_FORMAT.format(identifier).replace(" ", "+") + return cls.LINK_FORMAT.format(identifier).replace(' ', '+') class VD16Reader(VDCommonMixin, SRUMarc21BibliographicalReader): - sru_url = "http://bvbr.bib-bvb.de:5661/bvb01sru" - sru_version = "1.1" - LINK_FORMAT = "http://gateway-bayern.de/{}" # Spaces should be replaced by + - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/vd16") + sru_url = 'http://bvbr.bib-bvb.de:5661/bvb01sru' + sru_version = '1.1' + LINK_FORMAT = 'http://gateway-bayern.de/{}' # Spaces should be replaced by + + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/vd16' + ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/vd16/" SHORT_NAME = "VD16" DESCRIPTION = "Verzeichnis der im deutschen Sprachbereich erschienenen Drucke des 16. Jahrhunderts" @@ -36,14 +38,17 @@ class VD16Reader(VDCommonMixin, SRUMarc21BibliographicalReader): def transform_query(cls, query: str) -> str: # This SRU URL combines multiple databases, so make sure only VD16 is # queried - return "VD16 and ({})".format(query) + return 'VD16 and ({})'.format(query) class VD17Reader(VDCommonMixin, SRUMarc21BibliographicalReader): - sru_url = "http://sru.k10plus.de/vd17" - sru_version = "1.1" - LINK_FORMAT = "https://kxp.k10plus.de/DB=1.28/CMD?ACT=SRCHA&IKT=8079&TRM=%27{}%27" - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/vd17") + sru_url = 'http://sru.k10plus.de/vd17' + sru_version = '1.1' + LINK_FORMAT = \ + 'https://kxp.k10plus.de/DB=1.28/CMD?ACT=SRCHA&IKT=8079&TRM=%27{}%27' + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/vd17' + ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/vd17/" SHORT_NAME = "VD17" DESCRIPTION = "Verzeichnis der im deutschen Sprachbereich erschienenen Drucke des 17. Jahrhunderts" @@ -54,14 +59,14 @@ def transform_query(cls, query: str) -> str: class VD18Reader(VDCommonMixin, SRUMarc21BibliographicalReader): - sru_url = "http://sru.k10plus.de/vd18" - sru_version = "1.1" - LINK_FORMAT = ( - "https://kxp.k10plus.de/DB=1.65/SET=1/TTL=1/CMD?ACT=SRCHA&" - "IKT=1016&SRT=YOP&TRM={}&ADI_MAT=B&MATCFILTER=Y&MATCSET=Y&ADI_MAT=T&" - "REC=*" + sru_url = 'http://sru.k10plus.de/vd18' + sru_version = '1.1' + LINK_FORMAT = 'https://kxp.k10plus.de/DB=1.65/SET=1/TTL=1/CMD?ACT=SRCHA&' \ + 'IKT=1016&SRT=YOP&TRM={}&ADI_MAT=B&MATCFILTER=Y&MATCSET=Y&ADI_MAT=T&' \ + 'REC=*' + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/vd18' ) - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/vd18") IRI_PREFIX = "https://edpop.hum.uu.nl/readers/vd18/" SHORT_NAME = "VD18" DESCRIPTION = "Verzeichnis der im deutschen Sprachbereich erschienenen Drucke des 18. Jahrhunderts" @@ -74,22 +79,22 @@ def transform_query(cls, query: str) -> str: def _get_identifier(cls, record: Marc21Data): # The record id is in field 024 for which subfield 2 is vd18. There # may be more than one occurance of field 024. - fields024 = record.get_fields("024") + fields024 = record.get_fields('024') for field in fields024: - if ( - "2" in field.subfields - and "a" in field.subfields - and field.subfields["2"] == "vd18" - ): - return field.subfields["a"][5:] + if '2' in field.subfields and \ + 'a' in field.subfields and \ + field.subfields['2'] == 'vd18': + return field.subfields['a'][5:] return None class VDLiedReader(VDCommonMixin, SRUMarc21BibliographicalReader): - sru_url = "http://sru.k10plus.de/vdlied" - sru_version = "1.1" - LINK_FORMAT = "https://gso.gbv.de/DB=1.60/PPNSET?PPN={}" - CATALOG_URIREF = URIRef("https://edpop.hum.uu.nl/readers/vdlied") + sru_url = 'http://sru.k10plus.de/vdlied' + sru_version = '1.1' + LINK_FORMAT = 'https://gso.gbv.de/DB=1.60/PPNSET?PPN={}' + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/vdlied' + ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/vdlied/" SHORT_NAME = "VDLied" DESCRIPTION = "Das Verzeichnis der deutschsprachigen Liedflugschriften" diff --git a/edpop_explorer/record.py b/edpop_explorer/record.py index 8c38d6f..9d6d16b 100644 --- a/edpop_explorer/record.py +++ b/edpop_explorer/record.py @@ -4,11 +4,7 @@ from rdflib import URIRef, Graph, BNode, RDF, Literal from edpop_explorer import ( - EDPOPREC, - Field, - BIBLIOGRAPHICAL, - BIOGRAPHICAL, - bind_common_namespaces, + EDPOPREC, Field, BIBLIOGRAPHICAL, BIOGRAPHICAL, bind_common_namespaces ) if TYPE_CHECKING: @@ -47,7 +43,7 @@ class Record: basic attributes and the fields are ``None`` by default. Subclasses should override the ``_rdf_class`` attribute to - the corresponding RDF class. They should define additional + the corresponding RDF class. They should define additional fields by adding additional public attributes defaulting to ``None`` and by registring them in the ``_fields`` attribute. For registring, a constructor ``__init__`` should be defined @@ -56,18 +52,17 @@ class Record: ``('', EDPOPREC., )``. """ - #: The raw original data of a record. data: Union[None, dict, RawData] = None _fields: List[Tuple[str, URIRef, Type[Field]]] _rdf_class: Node = EDPOPREC.Record link: Optional[str] = None - """A user-friendly link where the user can find the record.""" + '''A user-friendly link where the user can find the record.''' identifier: Optional[str] = None - """Unique identifier used by the source catalog.""" + '''Unique identifier used by the source catalog.''' from_reader: Type["Reader"] - """The subject node, which will be used to convert the record to - RDF. This is a blank node by default.""" + '''The subject node, which will be used to convert the record to + RDF. This is a blank node by default.''' _graph: Optional[Graph] = None _bnode: Optional[BNode] = None @@ -76,10 +71,10 @@ def __init__(self, from_reader: Type["Reader"]): self.from_reader = from_reader def to_graph(self) -> Graph: - """Return an RDF graph for this record.""" + '''Return an RDF graph for this record.''' self.fetch() g = Graph() - + # Set basic properties rdfclass = EDPOPREC.Record if self.from_reader: @@ -87,28 +82,37 @@ def to_graph(self) -> Graph: rdfclass = EDPOPREC.BiographicalRecord elif self.from_reader.READERTYPE == BIBLIOGRAPHICAL: rdfclass = EDPOPREC.BibliographicalRecord - g.add((self.subject_node, RDF.type, rdfclass)) - if self.from_reader is not None and self.from_reader.CATALOG_URIREF is not None: - g.add( - ( - self.subject_node, - EDPOPREC.fromCatalog, - self.from_reader.CATALOG_URIREF, - ) - ) + g.add(( + self.subject_node, + RDF.type, + rdfclass + )) + if self.from_reader is not None and \ + self.from_reader.CATALOG_URIREF is not None: + g.add(( + self.subject_node, + EDPOPREC.fromCatalog, + self.from_reader.CATALOG_URIREF + )) if self.identifier: - g.add((self.subject_node, EDPOPREC.identifier, Literal(self.identifier))) + g.add(( + self.subject_node, + EDPOPREC.identifier, + Literal(self.identifier) + )) if self.link: - g.add((self.subject_node, EDPOPREC.publicURL, Literal(self.link))) + g.add(( + self.subject_node, + EDPOPREC.publicURL, + Literal(self.link) + )) original_data = self.get_data_dict() if original_data is not None: - g.add( - ( - self.subject_node, - EDPOPREC.originalData, - Literal(original_data, datatype=RDF.JSON), - ) - ) + g.add(( + self.subject_node, + EDPOPREC.originalData, + Literal(original_data, datatype=RDF.JSON) + )) # Put all fields from self.FIELDS in the graph by accessing # the associated attributes or properties. If they contain a @@ -158,21 +162,21 @@ def get_data_dict(self) -> Optional[dict]: def __str__(self): if self.identifier: - return f"{self.__class__} object ({self.identifier})" + return f'{self.__class__} object ({self.identifier})' else: - return f"{self.__class__} object" + return f'{self.__class__} object' def fetch(self) -> None: - """Fetch the full contents of the record if this record works with + '''Fetch the full contents of the record if this record works with lazy loading (i.e., if the record's class derives from ``RDFRecordMixin``). If the record is not lazy, this method does - nothing.""" + nothing.''' pass @property def iri(self) -> Optional[str]: - """A stable IRI based on the `identifier` attribute. `None` if - the `identifier` attribute is not set.""" + '''A stable IRI based on the `identifier` attribute. `None` if + the `identifier` attribute is not set.''' if self.identifier: return self.from_reader.identifier_to_iri(self.identifier) else: @@ -180,8 +184,8 @@ def iri(self) -> Optional[str]: @property def subject_node(self) -> Node: - """A subject node based on the `identifier` attribute. If the - `identifier` attribute is not set, a blank node.""" + '''A subject node based on the `identifier` attribute. If the + `identifier` attribute is not set, a blank node.''' iri = self.iri if iri is not None: return URIRef(iri) @@ -193,12 +197,11 @@ def subject_node(self) -> Node: class BibliographicalRecord(Record): - """Python representation of edpoprec:BibliographicalRecord. + '''Python representation of edpoprec:BibliographicalRecord. This subclass adds fields that are specific for bibliographical records. - """ - + ''' _rdf_class = EDPOPREC.BibliographicalRecord title: Optional[Field] = None alternative_title: Optional[Field] = None @@ -222,23 +225,23 @@ def __init__(self, from_reader: Type["Reader"]): super().__init__(from_reader) assert isinstance(self._fields, list) self._fields += [ - ("title", EDPOPREC.title, Field), - ("alternative_title", EDPOPREC.alternativeTitle, Field), - ("contributors", EDPOPREC.contributor, Field), - ("publisher_or_printer", EDPOPREC.publisherOrPrinter, Field), - ("place_of_publication", EDPOPREC.placeOfPublication, Field), - ("dating", EDPOPREC.dating, Field), - ("languages", EDPOPREC.language, Field), - ("extent", EDPOPREC.extent, Field), - ("size", EDPOPREC.size, Field), - ("physical_description", EDPOPREC.physicalDescription, Field), - ("bookseller", EDPOPREC.bookseller, Field), - ("location", EDPOPREC.location, Field), - ("format", EDPOPREC.format, Field), - ("fingerprint", EDPOPREC.fingerprint, Field), - ("collation_formula", EDPOPREC.collationFormula, Field), - ("genres", EDPOPREC.genre, Field), - ("holdings", EDPOPREC.holdings, Field), + ('title', EDPOPREC.title, Field), + ('alternative_title', EDPOPREC.alternativeTitle, Field), + ('contributors', EDPOPREC.contributor, Field), + ('publisher_or_printer', EDPOPREC.publisherOrPrinter, Field), + ('place_of_publication', EDPOPREC.placeOfPublication, Field), + ('dating', EDPOPREC.dating, Field), + ('languages', EDPOPREC.language, Field), + ('extent', EDPOPREC.extent, Field), + ('size', EDPOPREC.size, Field), + ('physical_description', EDPOPREC.physicalDescription, Field), + ('bookseller', EDPOPREC.bookseller, Field), + ('location', EDPOPREC.location, Field), + ('format', EDPOPREC.format, Field), + ('fingerprint', EDPOPREC.fingerprint, Field), + ('collation_formula', EDPOPREC.collationFormula, Field), + ('genres', EDPOPREC.genre, Field), + ('holdings', EDPOPREC.holdings, Field), ] def __str__(self) -> str: @@ -249,11 +252,10 @@ def __str__(self) -> str: class BiographicalRecord(Record): - """Python representation of edpoprec:BiographicalRecord. + '''Python representation of edpoprec:BiographicalRecord. This subclass adds fields that are specific for biographical records. - """ - + ''' _rdf_class = EDPOPREC.BiographicalRecord name: Optional[Field] = None variant_names: Optional[List[Field]] = None @@ -269,15 +271,15 @@ def __init__(self, from_reader: Type["Reader"]): super().__init__(from_reader) assert isinstance(self._fields, list) self._fields += [ - ("name", EDPOPREC.title, Field), - ("variant_names", EDPOPREC.variantName, Field), - ("place_of_birth", EDPOPREC.placeOfBirth, Field), - ("place_of_death", EDPOPREC.placeOfDeath, Field), - ("places_of_activity", EDPOPREC.placeOfActivity, Field), - ("activity_timespan", EDPOPREC.timespan, Field), - ("activities", EDPOPREC.activity, Field), - ("gender", EDPOPREC.gender, Field), - ("lifespan", EDPOPREC.lifespan, Field), + ('name', EDPOPREC.title, Field), + ('variant_names', EDPOPREC.variantName, Field), + ('place_of_birth', EDPOPREC.placeOfBirth, Field), + ('place_of_death', EDPOPREC.placeOfDeath, Field), + ('places_of_activity', EDPOPREC.placeOfActivity, Field), + ('activity_timespan', EDPOPREC.timespan, Field), + ('activities', EDPOPREC.activity, Field), + ('gender', EDPOPREC.gender, Field), + ('lifespan', EDPOPREC.lifespan, Field), ] def __str__(self) -> str: @@ -288,14 +290,13 @@ def __str__(self) -> str: class LazyRecordMixin(ABC): - """Abstract mixin that adds an interface for lazy loading to a Record. + '''Abstract mixin that adds an interface for lazy loading to a Record. To use, implement the ``fetch()`` method and make sure that it fills - the record's ``data`` attributes and its Fields and that the - ``fetched`` attribute is set to ``True``.""" - + the record's ``data`` attributes and its Fields and that the + ``fetched`` attribute is set to ``True``.''' fetched: bool = False - + @abstractmethod def fetch(self) -> None: pass diff --git a/edpop_explorer/sparqlreader.py b/edpop_explorer/sparqlreader.py index 09fb648..2d6f969 100644 --- a/edpop_explorer/sparqlreader.py +++ b/edpop_explorer/sparqlreader.py @@ -7,28 +7,26 @@ from typing_extensions import override from edpop_explorer import ( - Reader, - Record, - BibliographicalRecord, - ReaderError, - RecordError, - LazyRecordMixin, + Reader, Record, BibliographicalRecord, ReaderError, RecordError, + LazyRecordMixin ) PREFIXES = { - "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", - "rdfs": "http://www.w3.org/2000/01/rdf-schema#", - "schema": "http://schema.org/", - "owl": "http://www.w3.org/2002/07/owl#", + 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', + 'rdfs': 'http://www.w3.org/2000/01/rdf-schema#', + 'schema': 'http://schema.org/', + 'owl': 'http://www.w3.org/2002/07/owl#', } -PREFIXES_REVERSE_REPLACEMENT_TABLE = {PREFIXES[key]: (key + ":") for key in PREFIXES} +PREFIXES_REVERSE_REPLACEMENT_TABLE = { + PREFIXES[key]: (key + ':') for key in PREFIXES +} -PREFIX_DEFINITIONS = "\n".join([f"prefix {key}: <{PREFIXES[key]}>" for key in PREFIXES]) +PREFIX_DEFINITIONS = '\n'.join([ + f'prefix {key}: <{PREFIXES[key]}>' for key in PREFIXES +]) -prepare_listing_query = ( - PREFIX_DEFINITIONS - + """ +prepare_listing_query = (PREFIX_DEFINITIONS + """ select ?s ?name where {{ ?s ?p ?o . @@ -37,8 +35,7 @@ FILTER (regex(?o, "{query}","i")) }} order by ?s -""" -).format +""").format prepare_lookup_query = """ prefix schema: @@ -50,8 +47,8 @@ def replace_fqu_with_prefixed_uris(inputstring: str) -> str: - """Replace fully qualified URIs to prefixed URIs if they occur in - the prefix table in the prefixes attribute""" + '''Replace fully qualified URIs to prefixed URIs if they occur in + the prefix table in the prefixes attribute''' for key in PREFIXES_REVERSE_REPLACEMENT_TABLE: inputstring = inputstring.replace( key, PREFIXES_REVERSE_REPLACEMENT_TABLE[key], 1 @@ -60,8 +57,7 @@ def replace_fqu_with_prefixed_uris(inputstring: str) -> str: class RDFRecordMixin(LazyRecordMixin): - """Mixin that adds lazy RDF fetching functionality to a Record.""" - + '''Mixin that adds lazy RDF fetching functionality to a Record.''' identifier: Optional[str] = None fetched: bool = False data: Optional[dict] = None @@ -74,7 +70,9 @@ def fetch(self) -> None: # as data that rdflib can process. We might need to support # IRIs that can only be accessed via an endpoint as well. if not self.identifier: - raise RecordError("identifier (subject IRI) has not been set") + raise RecordError( + 'identifier (subject IRI) has not been set' + ) if self.fetched: return try: @@ -88,7 +86,9 @@ def fetch(self) -> None: f"{self.identifier}: {err}" ) # Convert to JSON for raw data attribute - self.data = json.loads(self.original_graph.serialize(format="json-ld")) + self.data = json.loads( + self.original_graph.serialize(format="json-ld") + ) # Call Reader's data conversion method to fill the record's Fields assert isinstance(self, Record) assert issubclass(self.from_reader, SparqlReader) @@ -112,7 +112,9 @@ class SparqlReader(Reader): @override def transform_query(cls, query: str): return prepare_listing_query( - name_predicate=cls.name_predicate, filter=cls.filter, query=query + name_predicate=cls.name_predicate, + filter=cls.filter, + query=query ) @classmethod @@ -124,7 +126,7 @@ def get_by_id(cls, identifier: str) -> Record: def fetch_range(self, range_to_fetch: range) -> range: # Fetch all records at one, because this is an expensive operation. if not self.prepared_query: - raise ReaderError("First call prepare_query method") + raise ReaderError('First call prepare_query method') if self.fetching_exhausted: return range(0, 0) wrapper = SPARQLWrapper(self.endpoint) @@ -133,27 +135,29 @@ def fetch_range(self, range_to_fetch: range) -> range: try: response = wrapper.queryAndConvert() except SPARQLExceptions.QueryBadFormed as err: - raise ReaderError("Malformed SPARQL query: {}".format(err)) + raise ReaderError( + 'Malformed SPARQL query: {}'.format(err) + ) assert isinstance(response, dict) - results = response["results"]["bindings"] + results = response['results']['bindings'] self.records = {} self.number_of_results = len(results) for i, result in enumerate(results): - iri = result["s"]["value"] - name = result["name"]["value"] + iri = result['s']['value'] + name = result['name']['value'] self.records[i] = self._create_lazy_record(iri, name) return range(0, self.number_of_results) @classmethod @abstractmethod def convert_record(cls, graph: Graph, record: Record) -> None: - """Convert data from an RDF graph to Fields in a Record. The - Record is changed in-place.""" + '''Convert data from an RDF graph to Fields in a Record. The + Record is changed in-place.''' pass @classmethod @abstractmethod - def _create_lazy_record(cls, iri: str, name: Optional[str] = None) -> Record: + def _create_lazy_record(cls, iri: str, name: Optional[str]=None) -> Record: """Create a Record/LazyRecordMixin record object. This is the lazy record that is created after running the SPARQL diff --git a/edpop_explorer/srumarc21reader.py b/edpop_explorer/srumarc21reader.py index 6549c3a..f02a80a 100644 --- a/edpop_explorer/srumarc21reader.py +++ b/edpop_explorer/srumarc21reader.py @@ -5,26 +5,22 @@ from abc import abstractmethod from edpop_explorer import ( - BibliographicalRecord, - RawData, - SRUReader, - Field, - BIBLIOGRAPHICAL, + BibliographicalRecord, RawData, SRUReader, Field, BIBLIOGRAPHICAL ) from edpop_explorer.fields import LanguageField -READABLE_FIELDS_FILE = Path(__file__).parent / "M21_fields.csv" +READABLE_FIELDS_FILE = Path(__file__).parent / 'M21_fields.csv' translation_dictionary: Dict[str, str] = {} with open(READABLE_FIELDS_FILE) as dictionary_file: reader = csv.DictReader(dictionary_file) for row in reader: - translation_dictionary[row["Tag number"]] = row[" Tag description"].strip() + translation_dictionary[row['Tag number']] = \ + row[' Tag description'].strip() @dataclass class Marc21Field: """Python representation of a single field in a Marc21 record""" - fieldnumber: str indicator1: str indicator2: str @@ -32,24 +28,28 @@ class Marc21Field: description: Optional[str] = None def __str__(self): - """ + ''' Return the usual marc21 representation - """ + ''' sf = [] - ind1 = self.indicator1 if self.indicator1.rstrip() != "" else "#" - ind2 = self.indicator1 if self.indicator2.rstrip() != "" else "#" - description = " ({})".format(self.description) if self.description else "" + ind1 = self.indicator1 if self.indicator1.rstrip() != '' else '#' + ind2 = self.indicator1 if self.indicator2.rstrip() != '' else '#' + description = ' ({})'.format(self.description) \ + if self.description else '' for subfield in self.subfields: - sf.append("$${} {}".format(subfield, self.subfields[subfield])) - return "{}{}: {} {} {}".format( - self.fieldnumber, description, ind1, ind2, " ".join(sf) + sf.append('$${} {}'.format(subfield, self.subfields[subfield])) + return '{}{}: {} {} {}'.format( + self.fieldnumber, + description, + ind1, + ind2, + ' '.join(sf) ) @dataclass class Marc21Data(RawData): """Python representation of the data inside a Marc21 record""" - # We use a list for the fields and not a dictionary because they may # appear more than once fields: List[Marc21Field] = dataclass_field(default_factory=list) @@ -57,18 +57,18 @@ class Marc21Data(RawData): raw: dict = dataclass_field(default_factory=dict) def get_first_field(self, fieldnumber: str) -> Optional[Marc21Field]: - """Return the first occurance of a field with a given field number. + '''Return the first occurance of a field with a given field number. May be useful for fields that appear only once, such as 245. - Return None if field is not found.""" + Return None if field is not found.''' for field in self.fields: if field.fieldnumber == fieldnumber: return field return None def get_first_subfield(self, fieldnumber: str, subfield: str) -> Optional[str]: - """Return the requested subfield of the first occurance of a field with + '''Return the requested subfield of the first occurance of a field with the given field number. Return None if field is not found or if the - subfield is not present on the first occurance of the field.""" + subfield is not present on the first occurance of the field.''' field = self.get_first_field(fieldnumber) if field is not None: return field.subfields.get(subfield, None) @@ -76,8 +76,8 @@ def get_first_subfield(self, fieldnumber: str, subfield: str) -> Optional[str]: return None def get_fields(self, fieldnumber: str) -> List[Marc21Field]: - """Return a list of fields with a given field number. May return an - empty list if field does not occur.""" + '''Return a list of fields with a given field number. May return an + empty list if field does not occur.''' returned_fields: List[Marc21Field] = [] for field in self.fields: if field.fieldnumber == fieldnumber: @@ -85,9 +85,9 @@ def get_fields(self, fieldnumber: str) -> List[Marc21Field]: return returned_fields def get_all_subfields(self, fieldnumber: str, subfield: str) -> List[str]: - """Return a list of subfields that matches the requested field number + '''Return a list of subfields that matches the requested field number and subfield. May return an empty list if the field and subfield do not - occur.""" + occur.''' fields = self.get_fields(fieldnumber) returned_subfields: List[str] = [] for field in fields: @@ -99,11 +99,10 @@ def to_dict(self) -> dict: return self.raw -class Marc21DataMixin: +class Marc21DataMixin(): """A mixin that adds a ``data`` attribute to a Record class to contain an instance of ``Marc21Data``. """ - data: Optional[Marc21Data] = None def show_record(self) -> str: @@ -112,28 +111,26 @@ def show_record(self) -> str: field_strings = [] for field in self.data.fields: field_strings.append(str(field)) - return "\n".join(field_strings) - + return '\n'.join(field_strings) class SRUMarc21Reader(SRUReader): - """Subclass of ``SRUReader`` that adds Marc21 functionality. + '''Subclass of ``SRUReader`` that adds Marc21 functionality. This class is still abstract and to create concrete readers - the ``_get_link()``, ``_get_identifier()`` + the ``_get_link()``, ``_get_identifier()`` and ``_convert_record`` methods should be implemented. .. automethod:: _convert_record .. automethod:: _get_link - .. automethod:: _get_identifier""" - - marcxchange_prefix: str = "" + .. automethod:: _get_identifier''' + marcxchange_prefix: str = '' @classmethod def _get_subfields(cls, sruthifield) -> list: # If there is only one subfield, sruthi puts it directly in # a dict, otherwise it uses a list of dicts. Make sure that # we always have a list. - subfielddata = sruthifield[f"{cls.marcxchange_prefix}subfield"] + subfielddata = sruthifield[f'{cls.marcxchange_prefix}subfield'] if isinstance(subfielddata, dict): sruthisubfields = [subfielddata] else: @@ -149,20 +146,21 @@ def _convert_to_marc21data(cls, sruthirecord: dict) -> Marc21Data: # The controlfield and the datafield contain multiple fields. # The controlfield consists of simple pairs of tags (field numbers) # and texts (field values). - for sruthicontrolfield in sruthirecord[f"{cls.marcxchange_prefix}controlfield"]: - tag = sruthicontrolfield["tag"] - text = sruthicontrolfield["text"] + for sruthicontrolfield in \ + sruthirecord[f'{cls.marcxchange_prefix}controlfield']: + tag = sruthicontrolfield['tag'] + text = sruthicontrolfield['text'] data.controlfields[tag] = text # The datafield is more complex; these fields also have two indicators, # one-digit numbers that carry special meanings, and multiple subfields # that each have a one-character code. - for sruthifield in sruthirecord[f"{cls.marcxchange_prefix}datafield"]: - fieldnumber = sruthifield["tag"] + for sruthifield in sruthirecord[f'{cls.marcxchange_prefix}datafield']: + fieldnumber = sruthifield['tag'] field = Marc21Field( fieldnumber=fieldnumber, - indicator1=sruthifield["ind1"], - indicator2=sruthifield["ind2"], - subfields={}, + indicator1=sruthifield['ind1'], + indicator2=sruthifield['ind2'], + subfields={} ) # The translation_dictionary contains descriptions for a number # of important fields. Include them so that the user can more @@ -172,53 +170,52 @@ def _convert_to_marc21data(cls, sruthirecord: dict) -> Marc21Data: sruthisubfields = cls._get_subfields(sruthifield) for sruthisubfield in sruthisubfields: - field.subfields[sruthisubfield["code"]] = sruthisubfield["text"] + field.subfields[sruthisubfield['code']] = \ + sruthisubfield['text'] data.fields.append(field) return data - + @classmethod @abstractmethod def _get_link(cls, data: Marc21Data) -> Optional[str]: - """Get a public URL according to the Marc21 data or ``None`` if it - is not available.""" + '''Get a public URL according to the Marc21 data or ``None`` if it + is not available.''' pass @classmethod @abstractmethod def _get_identifier(cls, data: Marc21Data) -> Optional[str]: - """Get the unique identifier from the Marc21 data or ``None`` if it - is not available.""" + '''Get the unique identifier from the Marc21 data or ``None`` if it + is not available.''' pass class Marc21BibliographicalRecord(Marc21DataMixin, BibliographicalRecord): - """A combination of ``BibliographicalRecord`` and ``Marc21DataMixin``.""" - + '''A combination of ``BibliographicalRecord`` and ``Marc21DataMixin``.''' pass class SRUMarc21BibliographicalReader(SRUMarc21Reader): - """Subclass of ``SRUMarc21Reader`` that adds functionality to create + '''Subclass of ``SRUMarc21Reader`` that adds functionality to create instances of ``BibliographicRecord``. This subclass assumes that the Marc21 data is according to the standard format of Marc21 for bibliographical data. See: https://www.loc.gov/marc/bibliographic/ - """ - - _title_field_subfield = ("245", "a") - _alternative_title_field_subfield = ("246", "a") - _publisher_field_subfield = ("264", "b") - _language_field_subfield = ("041", "a") - _place_field_subfield = ("264", "a") - _dating_field_subfield = ("264", "c") - _extent_field_subfield = ("300", "a") - _physical_description_field_subfield = ("300", "b") - _size_field_subfield = ("300", "c") + ''' + _title_field_subfield = ('245', 'a') + _alternative_title_field_subfield = ('246', 'a') + _publisher_field_subfield = ('264', 'b') + _language_field_subfield = ('041', 'a') + _place_field_subfield = ('264', 'a') + _dating_field_subfield = ('264', 'c') + _extent_field_subfield = ('300', 'a') + _physical_description_field_subfield = ('300', 'b') + _size_field_subfield = ('300', 'c') records: List[Marc21BibliographicalRecord] READERTYPE = BIBLIOGRAPHICAL - + @classmethod def _convert_record(cls, sruthirecord: dict) -> Marc21BibliographicalRecord: record = Marc21BibliographicalRecord(from_reader=cls) @@ -273,9 +270,10 @@ def _convert_record(cls, sruthirecord: dict) -> Marc21BibliographicalRecord: @classmethod def _get_contributors(cls, data: Marc21Data) -> List[Field]: contributors: List[Field] = [] - contributor_fields = data.get_fields("100") + contributor_fields = data.get_fields('100') for field in contributor_fields: - name = field.subfields.get("a") + name = field.subfields.get('a') if name: contributors.append(Field(name)) return contributors + diff --git a/edpop_explorer/srureader.py b/edpop_explorer/srureader.py index 36b8503..0017a7d 100644 --- a/edpop_explorer/srureader.py +++ b/edpop_explorer/srureader.py @@ -8,26 +8,25 @@ class SRUReader(GetByIdBasedOnQueryMixin, Reader): - """Subclass of ``Reader`` that adds basic SRU functionality + '''Subclass of ``Reader`` that adds basic SRU functionality using the ``sruthi`` library. This class is still abstract and subclasses should implement the ``transform_query()`` and ``_convert_record()`` methods and set the attributes ``sru_url`` and ``sru_version``. - + The ``_prepare_get_by_id_query()`` method by default returns the transformed version of the identifier as a query, which normally works, but this may be optimised by overriding it. - .. automethod:: _convert_record""" - + .. automethod:: _convert_record''' sru_url: str - """URL of the SRU API.""" + '''URL of the SRU API.''' sru_version: str - """Version of the SRU protocol. Can be '1.1' or '1.2'.""" + '''Version of the SRU protocol. Can be '1.1' or '1.2'.''' query: Optional[str] = None session: requests.Session - """The ``Session`` object of the ``requests`` library.""" + '''The ``Session`` object of the ``requests`` library.''' def __init__(self): # Set a session to allow reuse of HTTP sessions and to set additional @@ -44,17 +43,15 @@ def transform_query(cls, query: str) -> str: @classmethod @abstractmethod def _convert_record(cls, sruthirecord: dict) -> Record: - """Convert the output of ``sruthi`` into an instance of - (a subclass of) ``Record``.""" + '''Convert the output of ``sruthi`` into an instance of + (a subclass of) ``Record``.''' pass @classmethod def _prepare_get_by_id_query(cls, identifier: str) -> str: return cls.transform_query(identifier) - def _perform_query( - self, start_record: int, maximum_records: Optional[int] - ) -> List[Record]: + def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]: if maximum_records is None: maximum_records = self.DEFAULT_RECORDS_PER_PAGE try: @@ -64,10 +61,12 @@ def _perform_query( start_record=start_record, maximum_records=maximum_records, sru_version=self.sru_version, - session=self.session, + session=self.session ) - except sruthi.errors.SruError as err: - raise ReaderError("Server returned error: " + str(err)) + except ( + sruthi.errors.SruError + ) as err: + raise ReaderError('Server returned error: ' + str(err)) self.number_of_results = response.count @@ -86,7 +85,7 @@ def fetch_range(self, range_to_fetch: range) -> range: if self.fetching_exhausted: return range(0, 0) if self.prepared_query is None: - raise ReaderError("First call prepare_query") + raise ReaderError('First call prepare_query') start_number = range_to_fetch.start start_number_sru = start_number + 1 # SRU starts at 1 records_to_fetch = range_to_fetch.stop - range_to_fetch.start diff --git a/tests/conftest.py b/tests/conftest.py index c6be388..7134a1f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,13 +1,7 @@ def pytest_addoption(parser): - parser.addoption( - "--requests", - action="store_true", - dest="requests", - default=False, - help="enable tests with real API requests", - ) - + parser.addoption('--requests', action='store_true', dest="requests", + default=False, help="enable tests with real API requests") def pytest_configure(config): if not config.option.requests: - setattr(config.option, "markexpr", "not requests") + setattr(config.option, 'markexpr', 'not requests') diff --git a/tests/test_allreaders.py b/tests/test_allreaders.py index 60b180e..fa58b11 100644 --- a/tests/test_allreaders.py +++ b/tests/test_allreaders.py @@ -92,11 +92,7 @@ def test_realrequest(readercls: Type[Reader]): # there was a mistake with the offsets. But just give a warning, # because there are APIs that (by mistake?) return duplicated # records. - warnings.warn( - UserWarning( - "Last record from first fetch is same as first record from second fetch" - ) - ) + warnings.warn(UserWarning("Last record from first fetch is same as first record from second fetch")) else: assert reader.number_fetched == fetched_before assert rng2 == range(0) diff --git a/tests/test_field.py b/tests/test_field.py index e4196cc..5860274 100644 --- a/tests/test_field.py +++ b/tests/test_field.py @@ -9,18 +9,18 @@ @fixture def basic_field() -> Field: - return Field("Dit is een boektitel") + return Field('Dit is een boektitel') @fixture def basic_location_field() -> LocationField: - field = LocationField("Voorschoten") + field = LocationField('Voorschoten') return field class TestField: def test_init(self, basic_field: Field): - assert basic_field.original_text == "Dit is een boektitel" + assert basic_field.original_text == 'Dit is een boektitel' assert isinstance(basic_field.subject_node, Node) def test_to_graph(self, basic_field: Field): @@ -30,21 +30,27 @@ def test_to_graph(self, basic_field: Field): assert ( basic_field.subject_node, EDPOPREC.originalText, - Literal(basic_field.original_text), + Literal(basic_field.original_text) ) in graph # Test boolean basic_field.unknown = True graph = basic_field.to_graph() - assert (basic_field.subject_node, EDPOPREC.unknown, Literal(True)) in graph + assert ( + basic_field.subject_node, + EDPOPREC.unknown, + Literal(True) + ) in graph # Invalid type on object should give exception - basic_field.unknown = "other value" # type: ignore + basic_field.unknown = 'other value' # type: ignore with raises(FieldError): basic_field.to_graph() # Nonexisting datatype defined in class on SUBFIELDS should give # exception basic_field._subfields = basic_field._subfields.copy() - basic_field._subfields.append(("other", EDPOPREC.other, "othertype")) - basic_field.other = "text" # type: ignore + basic_field._subfields.append( + ('other', EDPOPREC.other, 'othertype') + ) + basic_field.other = 'text' # type: ignore with raises(FieldError): basic_field.to_graph() @@ -53,10 +59,18 @@ class TestLocationField: def test_basic_form(self, basic_location_field: LocationField): field = basic_location_field graph = field.to_graph() - assert (field.subject_node, EDPOPREC.locationType, None) not in graph + assert ( + field.subject_node, + EDPOPREC.locationType, + None + ) not in graph def test_location_type(self, basic_location_field: LocationField): field = basic_location_field field.location_type = LocationField.LOCALITY graph = field.to_graph() - assert (field.subject_node, EDPOPREC.locationType, EDPOPREC.locality) in graph + assert ( + field.subject_node, + EDPOPREC.locationType, + EDPOPREC.locality + ) in graph diff --git a/tests/test_reader.py b/tests/test_reader.py index 791d3f8..b2e77f7 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1,3 +1,4 @@ + from typing_extensions import override import pytest @@ -68,7 +69,9 @@ def test_iri_to_identifier_invalid(): def test_iri_to_identifier_and_vv_noprefixset(): with pytest.raises(ReaderError): - SimpleReaderNoIRIPrefix.iri_to_identifier("http://example.com/records/reader/1") + SimpleReaderNoIRIPrefix.iri_to_identifier( + "http://example.com/records/reader/1" + ) with pytest.raises(ReaderError): SimpleReaderNoIRIPrefix.identifier_to_iri("1") diff --git a/tests/test_record.py b/tests/test_record.py index 832727f..2118b1b 100644 --- a/tests/test_record.py +++ b/tests/test_record.py @@ -8,7 +8,7 @@ class SimpleReader(Reader): - CATALOG_URIREF = URIRef("http://example.com/reader") + CATALOG_URIREF = URIRef('http://example.com/reader') IRI_PREFIX = "http://example.com/records/reader/" @@ -19,19 +19,17 @@ class SimpleRecord(Record): def __init__(self, from_reader): super().__init__(from_reader) - self._fields.extend( - [ - ("testfield", EDPOPREC.testField, Field), - ("multiplefield", EDPOPREC.multipleField, Field), - ] - ) + self._fields.extend([ + ('testfield', EDPOPREC.testField, Field), + ('multiplefield', EDPOPREC.multipleField, Field) + ]) @pytest.fixture def basic_record(): record = SimpleRecord(SimpleReader) - record.link = "http://example.com" - record.identifier = "123" + record.link = 'http://example.com' + record.identifier = '123' return record @@ -45,27 +43,26 @@ def test_iri_empty(basic_record: SimpleRecord): def test_subject_node(basic_record: SimpleRecord): - assert basic_record.subject_node == URIRef("http://example.com/records/reader/123") + assert basic_record.subject_node == \ + URIRef("http://example.com/records/reader/123") def test_to_graph_empty(): # Test if it works with an empty record record = Record(SimpleReader) g = record.to_graph() - assert (record.subject_node, EDPOPREC.fromCatalog, SimpleReader.CATALOG_URIREF) in g - + assert ( + record.subject_node, EDPOPREC.fromCatalog, SimpleReader.CATALOG_URIREF + ) in g + def test_to_graph_basic_attributes(basic_record): g = basic_record.to_graph() assert ( - basic_record.subject_node, - EDPOPREC.publicURL, - Literal(basic_record.link), + basic_record.subject_node, EDPOPREC.publicURL, Literal(basic_record.link) ) in g assert ( - basic_record.subject_node, - EDPOPREC.identifier, - Literal(basic_record.identifier), + basic_record.subject_node, EDPOPREC.identifier, Literal(basic_record.identifier) ) in g @@ -76,23 +73,25 @@ def test_to_graph_empty_field(basic_record): def test_to_graph_field_normal_value(basic_record): - basic_record.testfield = Field("test") + basic_record.testfield = Field('test') g = basic_record.to_graph() assert (basic_record.subject_node, EDPOPREC.testField, None) in g - + def test_to_graph_string_in_field(basic_record): - basic_record.testfield = "test" # type: ignore + basic_record.testfield = 'test' # type: ignore with pytest.raises(RecordError): basic_record.to_graph() - - + def test_to_graph_field_multiple_values(basic_record): # Try a field that accepts multiple values - basic_record.multiplefield = [Field("v1"), Field("v2")] + basic_record.multiplefield = [ + Field('v1'), Field('v2') + ] g = basic_record.to_graph() - assert len(list(g.objects(basic_record.subject_node, EDPOPREC.multipleField))) == 2 - + assert len(list( + g.objects(basic_record.subject_node, EDPOPREC.multipleField) + )) == 2 def test_biographicalrecord(): # Basic test to check if class definition is sane; the logic should be @@ -100,7 +99,6 @@ def test_biographicalrecord(): record = BiographicalRecord(SimpleReader) record.to_graph() - def test_biographicalrecord_str(): record = BiographicalRecord(SimpleReader) personname = "Person" diff --git a/tests/test_srureader.py b/tests/test_srureader.py index 44cffb7..649d15e 100644 --- a/tests/test_srureader.py +++ b/tests/test_srureader.py @@ -6,7 +6,7 @@ from edpop_explorer import SRUMarc21BibliographicalReader, Marc21Data -TESTDATA = json.load(open(Path(__file__).parent / "TESTDATA", "r")) +TESTDATA = json.load(open(Path(__file__).parent / 'TESTDATA', 'r')) class MockReader(SRUMarc21BibliographicalReader): @@ -20,32 +20,35 @@ def _get_link(cls, data: Marc21Data) -> Optional[str]: @classmethod def _get_identifier(cls, data: Marc21Data) -> Optional[str]: - return "id" + return 'id' class TestSRUMarc21BibliographicalReader: - @patch("edpop_explorer.srureader.sruthi") + @patch('edpop_explorer.srureader.sruthi') def test_fetch(self, mock_sruthi): mock_sruthi.searchretrieve.return_value = TESTDATA reader = MockReader() - reader.sru_url = "" - reader.sru_version = "1.1" - reader.prepare_query("testquery") + reader.sru_url = '' + reader.sru_version = '1.1' + reader.prepare_query('testquery') reader.fetch() results = reader.records # Field with multiple subfields data = results[0].data assert data is not None - firstfield = data.get_first_field("245") + firstfield = data.get_first_field('245') assert firstfield is not None - assert firstfield.subfields["a"] == "Aeschylus: Eumenides." + assert firstfield.subfields['a'] == \ + 'Aeschylus: Eumenides.' # Field with a single subfield - firstfield = data.get_first_field("650") + firstfield = data.get_first_field('650') assert firstfield is not None - assert firstfield.subfields["a"] == "Aeschylus Eumenides." + assert firstfield.subfields['a'] == \ + 'Aeschylus Eumenides.' # Field's description - assert firstfield.description == "Subject Added Entry - Topical Term" + assert firstfield.description == \ + 'Subject Added Entry - Topical Term' # Field that occurs multiple times - assert len(data.get_fields("500")) == 5 + assert len(data.get_fields('500')) == 5 # Control field - assert data.controlfields["007"] == "tu" + assert data.controlfields['007'] == 'tu' From 466a418279eb3e5da6ba7441400d0e754e78e4d4 Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 24 Oct 2024 17:19:03 +0200 Subject: [PATCH 09/32] Avoid format attribute name --- edpop_explorer/readers/stcn.py | 8 ++++---- edpop_explorer/record.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index 527c775..313a2e2 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -113,10 +113,10 @@ def _get_extent(cls, rawrecord: dict) -> Optional[Field]: @classmethod def _get_format(cls, rawrecord: dict) -> Optional[Field]: - format = safeget(rawrecord, ("data", "format", "format")) - if format is None: + format_ = safeget(rawrecord, ("data", "format", "format")) + if format_ is None: return None - return Field(format) + return Field(format_) @classmethod def _get_collation_formula(cls, rawrecord: dict) -> Optional[Field]: @@ -171,7 +171,7 @@ def _convert_record(cls, rawrecord: dict) -> BibliographicalRecord: record.dating = cls._get_dating(rawrecord) record.languages = cls._get_languages(rawrecord) record.extent = cls._get_extent(rawrecord) - record.format = cls._get_format(rawrecord) + record.bibliographical_format = cls._get_format(rawrecord) record.collation_formula = cls._get_collation_formula(rawrecord) record.fingerprint = cls._get_fingerprint(rawrecord) record.genres = cls._get_genres(rawrecord) diff --git a/edpop_explorer/record.py b/edpop_explorer/record.py index 9d6d16b..3451b6b 100644 --- a/edpop_explorer/record.py +++ b/edpop_explorer/record.py @@ -215,7 +215,7 @@ class BibliographicalRecord(Record): physical_description: Optional[Field] = None bookseller: Optional[Field] = None location: Optional[Field] = None - format: Optional[Field] = None + bibliographical_format: Optional[Field] = None fingerprint: Optional[Field] = None collation_formula: Optional[Field] = None genres: Optional[List[Field]] = None @@ -237,7 +237,7 @@ def __init__(self, from_reader: Type["Reader"]): ('physical_description', EDPOPREC.physicalDescription, Field), ('bookseller', EDPOPREC.bookseller, Field), ('location', EDPOPREC.location, Field), - ('format', EDPOPREC.format, Field), + ('bibliographical_format', EDPOPREC.bibliographicalFormat, Field), ('fingerprint', EDPOPREC.fingerprint, Field), ('collation_formula', EDPOPREC.collationFormula, Field), ('genres', EDPOPREC.genre, Field), From 87becc48f5ac4ef97079d3f9200caa4bfbfeda9f Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 7 Nov 2024 15:14:13 +0100 Subject: [PATCH 10/32] Fix name of "name" field in bibliographical record --- edpop_explorer/record.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edpop_explorer/record.py b/edpop_explorer/record.py index 3451b6b..f4abed9 100644 --- a/edpop_explorer/record.py +++ b/edpop_explorer/record.py @@ -271,7 +271,7 @@ def __init__(self, from_reader: Type["Reader"]): super().__init__(from_reader) assert isinstance(self._fields, list) self._fields += [ - ('name', EDPOPREC.title, Field), + ('name', EDPOPREC.name, Field), ('variant_names', EDPOPREC.variantName, Field), ('place_of_birth', EDPOPREC.placeOfBirth, Field), ('place_of_death', EDPOPREC.placeOfDeath, Field), From 9390bc97999feb9122f7d19f13d1e6662a8b6b4a Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 19 Dec 2024 15:05:58 +0100 Subject: [PATCH 11/32] Create a separate persons reader for STCN --- edpop_explorer/readers/stcn.py | 74 ++++++++++++++++++++++++++++++++-- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index 313a2e2..0237875 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -2,9 +2,10 @@ from rdflib.term import Node from typing import List, Optional, Tuple -from edpop_explorer import Field, BIBLIOGRAPHICAL, BibliographicalRecord, LocationField +from edpop_explorer import Field, BIBLIOGRAPHICAL, BibliographicalRecord, LocationField, BIOGRAPHICAL, \ + BiographicalRecord from edpop_explorer.cerl import CERLReader -from edpop_explorer.fields import LanguageField, ContributorField +from edpop_explorer.fields import LanguageField, ContributorField, ActivityField from edpop_explorer.sparqlreader import ( SparqlReader, BibliographicalRDFRecord ) @@ -28,8 +29,68 @@ def safeget(dictionary: Optional[dict], attribute_chain: tuple, first: bool = Fa return safeget(value, attribute_chain[1:], first) -class STCNReader(CERLReader): +class STCNBaseReader(CERLReader): + """STCN uses the same search API for its bibliographical records and + its biographical records (persons and publishers/printers), but the + data format as well as detail pages are different. This base class + builds on CERLReader and adds the API URL.""" API_URL = 'https://data.cerl.org/stcn/_search' + + +class STCNPersonsReader(STCNBaseReader): + API_BY_ID_BASE_URL = 'https://data.cerl.org/stcn_persons/' + LINK_BASE_URL = 'https://data.cerl.org/stcn_persons/' + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/stcn' + ) + IRI_PREFIX = "https://edpop.hum.uu.nl/readers/stcn-persons/" + READERTYPE = BIOGRAPHICAL + SHORT_NAME = "STCN Persons" + DESCRIPTION = "National bibliography of The Netherlands until 1801 – persons" + + @classmethod + def transform_query(cls, query) -> str: + # Only person records + return f"({query}) AND data.type:pers" + + @classmethod + def _get_names(cls, rawrecord: dict) -> tuple[Optional[Field], Optional[List[Field]]]: + preferred_name = safeget(rawrecord, ('shortDisplay',)) + namelist = safeget(rawrecord, ('data', 'agent')) + alternative_names = None + if namelist: + alternative_names = [x["variants"] for x in namelist if x["variants"] != preferred_name] + preferred_name_field = Field(preferred_name) if preferred_name else None + alternative_names_field = [Field(x) for x in alternative_names] if alternative_names else None + return preferred_name_field, alternative_names_field + + @classmethod + def _get_timespan(cls, rawrecord: dict) -> Optional[Field]: + timespan = safeget(rawrecord, ("dates",)) + if timespan: + return Field(timespan) + + @classmethod + def _get_activities(cls, rawrecord: dict) -> Optional[list[Field]]: + profession_notes = safeget(rawrecord, ("data", "professionNote",)) + if not profession_notes: + return None + return [Field(x) for x in profession_notes] + + @classmethod + def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: + record = BiographicalRecord(from_reader=cls) + record.data = rawrecord + record.identifier = rawrecord.get('id', None) + if record.identifier: + record.link = cls.LINK_BASE_URL + record.identifier + record.name, record.variant_names = cls._get_names(rawrecord) + record.timespan = cls._get_timespan(rawrecord) + record.activities = cls._get_activities(rawrecord) + return record + + +class STCNReader(STCNBaseReader): API_BY_ID_BASE_URL = 'https://data.cerl.org/stcn/' LINK_BASE_URL = 'https://data.cerl.org/stcn/' CATALOG_URIREF = URIRef( @@ -38,7 +99,12 @@ class STCNReader(CERLReader): IRI_PREFIX = "https://edpop.hum.uu.nl/readers/stcn/" READERTYPE = BIBLIOGRAPHICAL SHORT_NAME = "Short-Title Catalogue Netherlands (STCN)" - DESCRIPTION = "National biography of The Netherlands until 1801" + DESCRIPTION = "National bibliography of The Netherlands until 1801" + + @classmethod + def transform_query(cls, query) -> str: + # Filter out bibliographical records + return f"({query}) NOT data.type:pers NOT data.type:impr" @classmethod def _get_title(cls, rawrecord: dict) -> Optional[Field]: From 2f15e2b949bd6a17a920038d8b98d71cc1f40441 Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 19 Dec 2024 15:08:08 +0100 Subject: [PATCH 12/32] Remove unused and nonexisting ActivityField import --- edpop_explorer/fields.py | 3 --- edpop_explorer/readers/stcn.py | 2 +- edpop_explorer/record.py | 4 ++-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/edpop_explorer/fields.py b/edpop_explorer/fields.py index cfd5336..64da99a 100644 --- a/edpop_explorer/fields.py +++ b/edpop_explorer/fields.py @@ -207,6 +207,3 @@ def summary_text(self) -> Optional[str]: else: return name - - - diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index 0237875..fa3f5fe 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -5,7 +5,7 @@ from edpop_explorer import Field, BIBLIOGRAPHICAL, BibliographicalRecord, LocationField, BIOGRAPHICAL, \ BiographicalRecord from edpop_explorer.cerl import CERLReader -from edpop_explorer.fields import LanguageField, ContributorField, ActivityField +from edpop_explorer.fields import LanguageField, ContributorField from edpop_explorer.sparqlreader import ( SparqlReader, BibliographicalRDFRecord ) diff --git a/edpop_explorer/record.py b/edpop_explorer/record.py index f4abed9..9447905 100644 --- a/edpop_explorer/record.py +++ b/edpop_explorer/record.py @@ -265,7 +265,7 @@ class BiographicalRecord(Record): activity_timespan: Optional[Field] = None activities: Optional[List[Field]] = None gender: Optional[Field] = None - lifespan: Optional[Field] = None + timespan: Optional[Field] = None def __init__(self, from_reader: Type["Reader"]): super().__init__(from_reader) @@ -279,7 +279,7 @@ def __init__(self, from_reader: Type["Reader"]): ('activity_timespan', EDPOPREC.timespan, Field), ('activities', EDPOPREC.activity, Field), ('gender', EDPOPREC.gender, Field), - ('lifespan', EDPOPREC.lifespan, Field), + ('timespan', EDPOPREC.timespan, Field), ] def __str__(self) -> str: From facfbf1be80e47ee9134e90493ecfa956ef9f872 Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 19 Dec 2024 15:09:01 +0100 Subject: [PATCH 13/32] Align KVCS to new BiographicalRecord fields --- edpop_explorer/readers/kvcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edpop_explorer/readers/kvcs.py b/edpop_explorer/readers/kvcs.py index c7834f1..237e7ea 100644 --- a/edpop_explorer/readers/kvcs.py +++ b/edpop_explorer/readers/kvcs.py @@ -24,7 +24,7 @@ def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: record.identifier = rawrecord['ID'] record.name = Field(rawrecord['Name']) record.gender = Field(rawrecord['Gender']) - record.lifespan = Field(rawrecord['Years of life']) + record.timespan = Field(rawrecord['Years of life']) record.places_of_activity = Field(rawrecord['City']) record.activity_timespan = Field(rawrecord['Years of activity']) record.activities = Field(rawrecord['Kind of print and sales activities']) From e603efebca86be50d6ae16f5bf9d76ee031a14aa Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 19 Dec 2024 15:09:21 +0100 Subject: [PATCH 14/32] Add STCNPersonsReader to shell --- edpop_explorer/edpopxshell.py | 5 +++++ edpop_explorer/readers/__init__.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/edpop_explorer/edpopxshell.py b/edpop_explorer/edpopxshell.py index 2ac1c2e..d0c4552 100644 --- a/edpop_explorer/edpopxshell.py +++ b/edpop_explorer/edpopxshell.py @@ -19,6 +19,7 @@ VDLiedReader, KBReader, STCNReader, + STCNPersonsReader, SBTIReader, USTCReader, BnFReader, @@ -180,6 +181,10 @@ def do_ct(self, args) -> None: def do_stcn(self, args) -> None: 'Short Title Catalogue Netherlands' self._query(STCNReader, args) + + def do_stcnpers(self, args) -> None: + 'Short Title Catalogue Netherlands – Persons' + self._query(STCNPersonsReader, args) def do_sbti(self, args) -> None: 'Scottish Book Trade Index' diff --git a/edpop_explorer/readers/__init__.py b/edpop_explorer/readers/__init__.py index 3acfd63..fe8a645 100644 --- a/edpop_explorer/readers/__init__.py +++ b/edpop_explorer/readers/__init__.py @@ -13,6 +13,7 @@ "VD18Reader", "VDLiedReader", "STCNReader", + "STCNPersonsReader", "USTCReader", "KVCSReader", "DutchAlmanacsReader", @@ -29,7 +30,7 @@ from .hpb import HPBReader from .kb import KBReader from .sbtireader import SBTIReader -from .stcn import STCNReader +from .stcn import STCNReader, STCNPersonsReader from .ustc import USTCReader from .vd import VD16Reader, VD17Reader, VD18Reader, VDLiedReader from .kvcs import KVCSReader From 3e579c1ea5acd96a45e47ac3bb466c307b42ebec Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 19 Dec 2024 15:28:21 +0100 Subject: [PATCH 15/32] Fix errors detected by ruff --- edpop_explorer/cerl.py | 3 +-- edpop_explorer/readers/sbtireader.py | 2 +- edpop_explorer/readers/stcn.py | 8 ++------ tests/test_field.py | 1 - 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/edpop_explorer/cerl.py b/edpop_explorer/cerl.py index 61dbf01..ecdc00f 100644 --- a/edpop_explorer/cerl.py +++ b/edpop_explorer/cerl.py @@ -1,11 +1,10 @@ from abc import abstractmethod -from rdflib import URIRef import requests from typing import List, Dict, Optional from edpop_explorer import ( - Reader, Record, ReaderError, BiographicalRecord, Field, BIOGRAPHICAL + Reader, Record, ReaderError ) diff --git a/edpop_explorer/readers/sbtireader.py b/edpop_explorer/readers/sbtireader.py index f37c692..a23e6ef 100644 --- a/edpop_explorer/readers/sbtireader.py +++ b/edpop_explorer/readers/sbtireader.py @@ -1,5 +1,5 @@ from rdflib import URIRef -from typing import List, Dict, Optional +from typing import Dict, Optional from edpop_explorer import ( BiographicalRecord, Field, BIOGRAPHICAL diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index fa3f5fe..31499a3 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -1,14 +1,10 @@ -from rdflib import Graph, Namespace, URIRef -from rdflib.term import Node -from typing import List, Optional, Tuple +from rdflib import URIRef +from typing import List, Optional from edpop_explorer import Field, BIBLIOGRAPHICAL, BibliographicalRecord, LocationField, BIOGRAPHICAL, \ BiographicalRecord from edpop_explorer.cerl import CERLReader from edpop_explorer.fields import LanguageField, ContributorField -from edpop_explorer.sparqlreader import ( - SparqlReader, BibliographicalRDFRecord -) def _remove_markup(input_str: str) -> str: diff --git a/tests/test_field.py b/tests/test_field.py index 5860274..7cf93ec 100644 --- a/tests/test_field.py +++ b/tests/test_field.py @@ -4,7 +4,6 @@ from edpop_explorer import Field, FieldError, LocationField from edpop_explorer import EDPOPREC -from edpop_explorer.normalizers import NormalizationResult @fixture From 6dd7dd5ec3faf8baa1c225e8748c2f201d13922a Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 19 Dec 2024 15:32:46 +0100 Subject: [PATCH 16/32] Python 3.8 compatibility --- edpop_explorer/readers/stcn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index 31499a3..44c8114 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -1,5 +1,5 @@ from rdflib import URIRef -from typing import List, Optional +from typing import List, Optional, Tuple from edpop_explorer import Field, BIBLIOGRAPHICAL, BibliographicalRecord, LocationField, BIOGRAPHICAL, \ BiographicalRecord @@ -50,7 +50,7 @@ def transform_query(cls, query) -> str: return f"({query}) AND data.type:pers" @classmethod - def _get_names(cls, rawrecord: dict) -> tuple[Optional[Field], Optional[List[Field]]]: + def _get_names(cls, rawrecord: dict) -> Tuple[Optional[Field], Optional[List[Field]]]: preferred_name = safeget(rawrecord, ('shortDisplay',)) namelist = safeget(rawrecord, ('data', 'agent')) alternative_names = None From dc1fc0cecb3bd490c7764312fb497fd852561894 Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 19 Dec 2024 15:36:49 +0100 Subject: [PATCH 17/32] More Python 3.8 compatibility --- edpop_explorer/readers/stcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index 44c8114..eddd6ca 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -67,7 +67,7 @@ def _get_timespan(cls, rawrecord: dict) -> Optional[Field]: return Field(timespan) @classmethod - def _get_activities(cls, rawrecord: dict) -> Optional[list[Field]]: + def _get_activities(cls, rawrecord: dict) -> Optional[List[Field]]: profession_notes = safeget(rawrecord, ("data", "professionNote",)) if not profession_notes: return None From 8b6c45b93559243a7da2b100bef4ca0495463fda Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 19 Dec 2024 15:39:23 +0100 Subject: [PATCH 18/32] More Python 3.8 compatibility --- edpop_explorer/readers/stcn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index eddd6ca..da4d9d1 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -110,7 +110,7 @@ def _get_title(cls, rawrecord: dict) -> Optional[Field]: return Field(title) @classmethod - def _get_contributors(cls, rawrecord: dict) -> list[Field]: + def _get_contributors(cls, rawrecord: dict) -> List[Field]: actors = safeget(rawrecord, ("data", "agent")) if not actors: return [] @@ -199,7 +199,7 @@ def _get_fingerprint(cls, rawrecord: dict) -> Optional[Field]: return Field(fingerprint_string) @classmethod - def _get_genres(cls, rawrecord: dict) -> list[Field]: + def _get_genres(cls, rawrecord: dict) -> List[Field]: subjecttopics = safeget(rawrecord, ("data", "subjectTopic")) if subjecttopics is None: return [] @@ -207,7 +207,7 @@ def _get_genres(cls, rawrecord: dict) -> list[Field]: return fields @classmethod - def _get_holdings(cls, rawrecord: dict) -> list[Field]: + def _get_holdings(cls, rawrecord: dict) -> List[Field]: holdings = safeget(rawrecord, ("data", "holdings")) if holdings is None: return [] From e03e407bf3072bb707bb58901e64f63d1a14768c Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Thu, 19 Dec 2024 15:41:18 +0100 Subject: [PATCH 19/32] More Python 3.8 compatibility --- edpop_explorer/readers/stcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index da4d9d1..5f1abe1 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -148,7 +148,7 @@ def _get_place_of_publication(cls, rawrecord: dict) -> Optional[Field]: return field @classmethod - def _get_languages(cls, rawrecord: dict) -> list[Field]: + def _get_languages(cls, rawrecord: dict) -> List[Field]: languages = safeget(rawrecord, ("data", "language")) if languages is None: return [] From 04575d555700019d5cec4d57282ced0d0a6bf07e Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Mon, 23 Dec 2024 09:51:10 +0100 Subject: [PATCH 20/32] Clarify difference between STCN Persons and STCN Printers databases --- edpop_explorer/readers/stcn.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index 5f1abe1..0c17927 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -34,6 +34,8 @@ class STCNBaseReader(CERLReader): class STCNPersonsReader(STCNBaseReader): + """STCN Persons reader. This reader does not include printers and + publishers, because they are in a separate database.""" API_BY_ID_BASE_URL = 'https://data.cerl.org/stcn_persons/' LINK_BASE_URL = 'https://data.cerl.org/stcn_persons/' CATALOG_URIREF = URIRef( From 29969e6d3c045a6b122eeb382aa9a9980a77c373 Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Mon, 23 Dec 2024 09:51:20 +0100 Subject: [PATCH 21/32] Add spacing after docstring --- edpop_explorer/cerl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/edpop_explorer/cerl.py b/edpop_explorer/cerl.py index ecdc00f..8cd4ed7 100644 --- a/edpop_explorer/cerl.py +++ b/edpop_explorer/cerl.py @@ -15,6 +15,7 @@ class CERLReader(Reader): This is an abstract class -- to use, derive from this class, set the ``API_URL``, ``API_BY_ID_BASE_URL`` and ``LINK_BASE_URL`` constant attributes, and implement the ``_convert_record`` class method.""" + API_URL: str """The base URL of the search API, of the form ``https://data.cerl.org//_search``.""" API_BY_ID_BASE_URL: str From 28962d044671acbd29a52279ed6f10e5355d8e56 Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Mon, 23 Dec 2024 09:55:04 +0100 Subject: [PATCH 22/32] Clarify what the STCN Persons database contains --- edpop_explorer/edpopxshell.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edpop_explorer/edpopxshell.py b/edpop_explorer/edpopxshell.py index d0c4552..bb5758a 100644 --- a/edpop_explorer/edpopxshell.py +++ b/edpop_explorer/edpopxshell.py @@ -183,7 +183,7 @@ def do_stcn(self, args) -> None: self._query(STCNReader, args) def do_stcnpers(self, args) -> None: - 'Short Title Catalogue Netherlands – Persons' + 'Short Title Catalogue Netherlands – Persons (authors and other contributors)' self._query(STCNPersonsReader, args) def do_sbti(self, args) -> None: From 9fb1c5ff8a988ff5a8cab9f01597993bcb59c69e Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Mon, 23 Dec 2024 09:55:11 +0100 Subject: [PATCH 23/32] Remove leftover print --- edpop_explorer/cerl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/edpop_explorer/cerl.py b/edpop_explorer/cerl.py index 8cd4ed7..e124134 100644 --- a/edpop_explorer/cerl.py +++ b/edpop_explorer/cerl.py @@ -50,7 +50,6 @@ def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> L assert isinstance(self.prepared_query, str) if maximum_records is None: maximum_records = self.DEFAULT_RECORDS_PER_PAGE - print(f'The query is: {self.prepared_query}') try: response = requests.get( self.API_URL, From c150a6d08361b901f141c3c1467a3c9456a248eb Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Mon, 23 Dec 2024 10:05:52 +0100 Subject: [PATCH 24/32] Add a check for the length of attribute_chain --- edpop_explorer/readers/stcn.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index 0c17927..b2e6abb 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -13,6 +13,8 @@ def _remove_markup(input_str: str) -> str: def safeget(dictionary: Optional[dict], attribute_chain: tuple, first: bool = False): + if len(attribute_chain) == 0: + raise ValueError("The attribute_chain argument cannot be empty") attribute = attribute_chain[0] if dictionary is None or attribute not in dictionary: return None From c6e9228208aaf250d89d0aa5c1cfe4acfe229a5c Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Mon, 23 Dec 2024 11:57:48 +0100 Subject: [PATCH 25/32] Add unit tests for safeget function --- tests/readers/__init__.py | 0 tests/readers/test_stcn.py | 41 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 tests/readers/__init__.py create mode 100644 tests/readers/test_stcn.py diff --git a/tests/readers/__init__.py b/tests/readers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/readers/test_stcn.py b/tests/readers/test_stcn.py new file mode 100644 index 0000000..ab6b45c --- /dev/null +++ b/tests/readers/test_stcn.py @@ -0,0 +1,41 @@ +import pytest + +from edpop_explorer.readers.stcn import safeget + + +def test_safeget_empty_attribute_chain(): + with pytest.raises(ValueError): + safeget(None, ()) + +def test_safeget_empty_dict(): + assert safeget({}, ("attribute",)) is None + +def test_safeget_none(): + assert safeget(None, ("attribute",)) is None + +def test_safeget_simple(): + assert safeget({"attribute": "value"}, ("attribute",)) == "value" + +def test_safeget_nested(): + assert safeget( + { + "attribute": {"attribute2": "value"} + }, ("attribute", "attribute2") + ) == "value" + +def test_safeget_nested_first_attribute_none(): + assert safeget({ + "attribute": None + }, ("attribute", "attribute2")) is None + +def test_safeget_nested_first_attribute_nonexistent(): + assert safeget({ + "other_attribute": None + }, ("attribute", "attribute2")) is None + +def test_safeget_nested_second_attribute_nonexistent(): + assert safeget({ + "attribute": { + "other_attribute": "value" + } + }, ("attribute", "attribute2")) is None \ No newline at end of file From 721e9a627133de46729cdd3ebb49bfcdf592799f Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Mon, 23 Dec 2024 12:00:09 +0100 Subject: [PATCH 26/32] Simpler formatting --- edpop_explorer/cerl.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/edpop_explorer/cerl.py b/edpop_explorer/cerl.py index e124134..8308c77 100644 --- a/edpop_explorer/cerl.py +++ b/edpop_explorer/cerl.py @@ -64,9 +64,7 @@ def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> L 'Accept': 'application/json' } ).json() - except ( - requests.exceptions.RequestException - ) as err: + except requests.exceptions.RequestException as err: raise ReaderError('Error during server request: ' + str(err)) # TODO: check for error responses From 145a6fcf94865ddcb61e144f5595f77acaf79b52 Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Mon, 23 Dec 2024 12:54:01 +0100 Subject: [PATCH 27/32] Simplify _get_contributors method --- edpop_explorer/readers/stcn.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index b2e6abb..bdc10c1 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -1,3 +1,5 @@ +from operator import methodcaller + from rdflib import URIRef from typing import List, Optional, Tuple @@ -27,6 +29,13 @@ def safeget(dictionary: Optional[dict], attribute_chain: tuple, first: bool = Fa return safeget(value, attribute_chain[1:], first) +def _wrap_contributor(actor_data: dict) -> ContributorField: + field = ContributorField(actor_data['preferred']) + field.name = actor_data['preferred'] + field.role = safeget(actor_data, ('role',), first=True) + return field + + class STCNBaseReader(CERLReader): """STCN uses the same search API for its bibliographical records and its biographical records (persons and publishers/printers), but the @@ -118,16 +127,7 @@ def _get_contributors(cls, rawrecord: dict) -> List[Field]: actors = safeget(rawrecord, ("data", "agent")) if not actors: return [] - contributors = [] - for actor in actors: - name = actor.get("preferred", None) - if name is None: - continue - contributor = ContributorField(name) - contributor.name = name - contributor.role = safeget(actor, ('role',), first=True) - contributors.append(contributor) - return contributors + return [_wrap_contributor(x) for x in actors if x.get('preferred')] @classmethod def _get_publisher_or_printer(cls, rawrecord: dict) -> Optional[Field]: From ede0473e8a4a8b721d418713ebbd0c7b49d3422f Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Mon, 23 Dec 2024 12:56:47 +0100 Subject: [PATCH 28/32] Add docstring to safeget method and an additional test --- edpop_explorer/readers/stcn.py | 3 +++ tests/readers/test_stcn.py | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index bdc10c1..9ccc0ec 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -15,6 +15,9 @@ def _remove_markup(input_str: str) -> str: def safeget(dictionary: Optional[dict], attribute_chain: tuple, first: bool = False): + """Safely get a (nested) attribute in a JSON-like structure. If the + result is a list and ``first`` is ``True``, return the first item + of the list.""" if len(attribute_chain) == 0: raise ValueError("The attribute_chain argument cannot be empty") attribute = attribute_chain[0] diff --git a/tests/readers/test_stcn.py b/tests/readers/test_stcn.py index ab6b45c..4a5ab51 100644 --- a/tests/readers/test_stcn.py +++ b/tests/readers/test_stcn.py @@ -38,4 +38,9 @@ def test_safeget_nested_second_attribute_nonexistent(): "attribute": { "other_attribute": "value" } - }, ("attribute", "attribute2")) is None \ No newline at end of file + }, ("attribute", "attribute2")) is None + +def test_safeget_first(): + assert safeget({ + "attribute": ["value1", "value2"] + }, ("attribute",), True) == "value1" \ No newline at end of file From d34f9a4af8abfef9bcdb5406f32a7fd6960d19ab Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Mon, 23 Dec 2024 13:14:19 +0100 Subject: [PATCH 29/32] Simplify code for record creation --- edpop_explorer/cerl.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/edpop_explorer/cerl.py b/edpop_explorer/cerl.py index 8308c77..a28e463 100644 --- a/edpop_explorer/cerl.py +++ b/edpop_explorer/cerl.py @@ -76,16 +76,7 @@ def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> L except KeyError: raise ReaderError('Number of hits not given in server response') - if 'rows' not in response: - # There are no rows in the response, so stop here - return [] - - records: List[Record] = [] - for rawrecord in response['rows']: - record = self._convert_record(rawrecord) - records.append(record) - - return records + return [self._convert_record(x) for x in response['rows']] if 'rows' in response else [] @classmethod def transform_query(cls, query) -> str: From 4ad8f6b316a8cb2248861b8913a66a97eafbe0b8 Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Mon, 23 Dec 2024 13:18:25 +0100 Subject: [PATCH 30/32] Use list comprehension for holdings list --- edpop_explorer/readers/stcn.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index 9ccc0ec..0052670 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -39,6 +39,13 @@ def _wrap_contributor(actor_data: dict) -> ContributorField: return field +def _wrap_holding(holding_data: dict) -> Field: + institution = safeget(holding_data, ("data", "institutionName")) + shelfmark = safeget(holding_data, ("data", "shelfmark")) + summary = f"{institution} - {shelfmark}" + return Field(summary) + + class STCNBaseReader(CERLReader): """STCN uses the same search API for its bibliographical records and its biographical records (persons and publishers/printers), but the @@ -218,13 +225,7 @@ def _get_holdings(cls, rawrecord: dict) -> List[Field]: holdings = safeget(rawrecord, ("data", "holdings")) if holdings is None: return [] - fields = [] - for holding in holdings: - institution = safeget(holding, ("data", "institutionName")) - shelfmark = safeget(holding, ("data", "shelfmark")) - summary = f"{institution} - {shelfmark}" - fields.append(Field(summary)) - return fields + return [_wrap_holding(x) for x in holdings] @classmethod def _convert_record(cls, rawrecord: dict) -> BibliographicalRecord: From 33187049d88270d133d8eb490127518798b7e2db Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Mon, 23 Dec 2024 13:35:22 +0100 Subject: [PATCH 31/32] Consistently use safeget --- edpop_explorer/readers/stcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index 0052670..523a38c 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -145,7 +145,7 @@ def _get_publisher_or_printer(cls, rawrecord: dict) -> Optional[Field]: provision_agent = safeget(rawrecord, ("data", "provisionAgent"), first=True) if provision_agent is None: return None - name = provision_agent.get("preferred", None) + name = safeget(provision_agent, ("preferred",)) if name is None: return None field = Field(name) From edc52ef0077072968f9b3bc33d75020657c122cc Mon Sep 17 00:00:00 2001 From: Tijmen Baarda Date: Mon, 23 Dec 2024 13:36:56 +0100 Subject: [PATCH 32/32] Remove unused import --- edpop_explorer/readers/stcn.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index 523a38c..e8c847d 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -1,5 +1,3 @@ -from operator import methodcaller - from rdflib import URIRef from typing import List, Optional, Tuple