diff --git a/edpop_explorer/__init__.py b/edpop_explorer/__init__.py index 5a23a45..ced924c 100644 --- a/edpop_explorer/__init__.py +++ b/edpop_explorer/__init__.py @@ -6,7 +6,7 @@ 'BasePreparedQuery', 'PreparedQueryType', 'Record', 'RawData', 'RecordError', 'BibliographicalRecord', 'BiographicalRecord', 'LazyRecordMixin', - 'SRUReader', + 'SRUReader', 'CERLReader', 'Marc21Data', 'Marc21Field', 'Marc21BibliographicalRecord', 'Marc21DataMixin', 'SRUMarc21Reader', 'SRUMarc21BibliographicalReader', 'BIBLIOGRAPHICAL', 'BIOGRAPHICAL' @@ -32,4 +32,5 @@ Marc21Data, Marc21Field, Marc21BibliographicalRecord, Marc21DataMixin, SRUMarc21Reader, SRUMarc21BibliographicalReader ) +from .cerl import CERLReader diff --git a/edpop_explorer/cerl.py b/edpop_explorer/cerl.py new file mode 100644 index 0000000..a28e463 --- /dev/null +++ b/edpop_explorer/cerl.py @@ -0,0 +1,95 @@ +from abc import abstractmethod + +import requests +from typing import List, Dict, Optional + +from edpop_explorer import ( + Reader, Record, ReaderError +) + + +class CERLReader(Reader): + """A generic reader class for the CERL databases on the ``data.cerl.org`` + platform. 
+ + This is an abstract class -- to use, derive from this class, set the + ``API_URL``, ``API_BY_ID_BASE_URL`` and ``LINK_BASE_URL`` constant + attributes, and implement the ``_convert_record`` class method.""" + + API_URL: str + """The base URL of the search API, of the form ``https://data.cerl.org/<database>/_search``.""" + API_BY_ID_BASE_URL: str + """The base URL of the API for retrieving single records, of the form ``https://data.cerl.org/<database>/``.""" + LINK_BASE_URL: str + """The base URL for user-friendly representations of single records.""" + additional_params: Optional[Dict[str, str]] = None + DEFAULT_RECORDS_PER_PAGE = 10 + + @classmethod + def get_by_id(cls, identifier: str) -> Record: + try: + response = requests.get( + cls.API_BY_ID_BASE_URL + identifier, + headers={ + 'Accept': 'application/json' + }, + ).json() + except requests.exceptions.JSONDecodeError: + raise ReaderError(f"Item with id {identifier} does not exist.") + except requests.exceptions.RequestException as err: + raise ReaderError(f"Error during server request: {err}") + return cls._convert_record(response) + + + @classmethod + @abstractmethod + def _convert_record(cls, rawrecord: dict) -> Record: + pass + + def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]: + assert isinstance(self.prepared_query, str) + if maximum_records is None: + maximum_records = self.DEFAULT_RECORDS_PER_PAGE + try: + response = requests.get( + self.API_URL, + params={ + 'query': self.prepared_query, + 'from': start_record, + 'size': maximum_records, + 'mode': 'default', + 'sort': 'default' + }, + headers={ + 'Accept': 'application/json' + } + ).json() + except requests.exceptions.RequestException as err: + raise ReaderError('Error during server request: ' + str(err)) + + # TODO: check for error responses + try: + if response['hits'] is None: + self.number_of_results = 0 + else: + self.number_of_results = response['hits']['value'] + except KeyError: + raise ReaderError('Number of hits not 
given in server response') + + return [self._convert_record(x) for x in response['rows']] if 'rows' in response else [] + + @classmethod + def transform_query(cls, query) -> str: + # No transformation needed + return query + + def fetch_range(self, range_to_fetch: range) -> range: + if self.prepared_query is None: + raise ReaderError('First call prepare_query') + start_record = range_to_fetch.start + number_to_fetch = range_to_fetch.stop - start_record + results = self._perform_query(start_record, number_to_fetch) + for i, result in enumerate(results): + self.records[i + range_to_fetch.start] = result + return range(start_record, start_record + len(results)) + diff --git a/edpop_explorer/edpopxshell.py b/edpop_explorer/edpopxshell.py index 14656ab..bb5758a 100644 --- a/edpop_explorer/edpopxshell.py +++ b/edpop_explorer/edpopxshell.py @@ -19,6 +19,7 @@ VDLiedReader, KBReader, STCNReader, + STCNPersonsReader, SBTIReader, USTCReader, BnFReader, @@ -95,7 +96,6 @@ def show_record(self, record: Record) -> None: )) recordtype = str(record._rdf_class).rsplit('/',1)[1] self.poutput(f'Record type: {recordtype}') - self.poutput if record.identifier: self.poutput(f'Identifier: {record.identifier}') if record.link: @@ -181,6 +181,10 @@ def do_ct(self, args) -> None: def do_stcn(self, args) -> None: 'Short Title Catalogue Netherlands' self._query(STCNReader, args) + + def do_stcnpers(self, args) -> None: + 'Short Title Catalogue Netherlands – Persons (authors and other contributors)' + self._query(STCNPersonsReader, args) def do_sbti(self, args) -> None: 'Scottish Book Trade Index' diff --git a/edpop_explorer/fields.py b/edpop_explorer/fields.py index cd80892..64da99a 100644 --- a/edpop_explorer/fields.py +++ b/edpop_explorer/fields.py @@ -4,11 +4,14 @@ from typing import Optional, Callable, List, Tuple +from iso639 import Lang +from iso639.exceptions import InvalidLanguageValue from rdflib import Graph, Literal, BNode, RDF, URIRef from rdflib.term import Node from 
edpop_explorer import EDPOPREC, normalizers from edpop_explorer.normalizers import NormalizationResult +from edpop_explorer.normalization import relators DATATYPES = { 'string': { @@ -70,7 +73,6 @@ class Field: #: by default. subject_node: Node _subfields: List[Tuple[str, URIRef, str]] - normalized_text: Optional[str] = None #: Subfield -- indicates whether the value of this field is explicitly #: marked as unknown in the original record. unknown: Optional[bool] = None @@ -88,7 +90,7 @@ def __init__(self, original_text: str) -> None: self.original_text = original_text self._subfields = [ ('original_text', EDPOPREC.originalText, 'string'), - ('normalized_text', EDPOPREC.normalizedText, 'string'), + ('summary_text', EDPOPREC.summaryText, 'string'), ('unknown', EDPOPREC.unknown, 'boolean'), ('authority_record', EDPOPREC.authorityRecord, 'string'), ] @@ -140,9 +142,13 @@ def to_graph(self) -> Graph: )) return graph + @property + def summary_text(self) -> Optional[str]: + return None + def __str__(self) -> str: - if self.normalized_text is not None: - return self.normalized_text + if self.summary_text is not None: + return self.summary_text else: return self.original_text @@ -171,3 +177,33 @@ def __init__(self, original_text: str) -> None: ('language_code', EDPOPREC.languageCode, 'string') ) + @property + def summary_text(self) -> Optional[str]: + try: + language = Lang(self.language_code) + return language.name + except InvalidLanguageValue: + return None + + +class ContributorField(Field): + _rdf_class = EDPOPREC.ContributorField + role: Optional[str] = None + name: Optional[str] = None + + def __init__(self, original_text: str) -> None: + super().__init__(original_text) + self._subfields.extend(( + ('name', EDPOPREC.name, 'string'), + ('role', EDPOPREC.role, 'string'), + )) + + @property + def summary_text(self) -> Optional[str]: + role = relators.relator_dict.get(self.role, self.role) + name = self.name if self.name is not None else self.original_text + if role is 
not None: + return f"{name} ({role})" + else: + return name + diff --git a/edpop_explorer/normalization/relators.py b/edpop_explorer/normalization/relators.py new file mode 100644 index 0000000..da6cd33 --- /dev/null +++ b/edpop_explorer/normalization/relators.py @@ -0,0 +1,307 @@ +# Relator dictionary taken from https://www.loc.gov/marc/relators/relacode.html + +relator_dict = { + "abr": "abridger", + "acp": "art copyist", + "act": "actor", + "adi": "art director", + "adp": "adapter", + "aft": "author of afterword, colophon, etc.", + "anc": "announcer", + "anl": "analyst", + "anm": "animator", + "ann": "annotator", + "ant": "bibliographic antecedent", + "ape": "appellee", + "apl": "appellant", + "app": "applicant", + "aqt": "author in quotations or text abstracts", + "arc": "architect", + "ard": "artistic director", + "arr": "arranger", + "art": "artist", + "asg": "assignee", + "asn": "associated name", + "ato": "autographer", + "att": "attributed name", + "auc": "auctioneer", + "aud": "author of dialog", + "aue": "audio engineer", + "aui": "author of introduction, etc.", + "aup": "audio producer", + "aus": "screenwriter", + "aut": "author", + "bdd": "binding designer", + "bjd": "bookjacket designer", + "bka": "book artist", + "bkd": "book designer", + "bkp": "book producer", + "blw": "blurb writer", + "bnd": "binder", + "bpd": "bookplate designer", + "brd": "broadcaster", + "brl": "braille embosser", + "bsl": "bookseller", + "cad": "casting director", + "cas": "caster", + "ccp": "conceptor", + "chrc": "choreographer", + "-clb": "collaborator", + "cli": "client", + "cll": "calligrapher", + "clr": "colorist", + "clt": "collotyper", + "cmm": "commentator", + "cmp": "composer", + "cmt": "compositor", + "cnd": "conductor", + "cng": "cinematographer", + "cns": "censor", + "coe": "contestant-appellee", + "col": "collector", + "com": "compiler", + "con": "conservator", + "cop": "camera operator", + "cor": "collection registrar", + "cos": "contestant", + "cot": 
"contestant-appellant", + "cou": "court governed", + "cov": "cover designer", + "cpc": "copyright claimant", + "cpe": "complainant-appellee", + "cph": "copyright holder", + "cpl": "complainant", + "cpt": "complainant-appellant", + "cre": "creator", + "crp": "correspondent", + "crr": "corrector", + "crt": "court reporter", + "csl": "consultant", + "csp": "consultant to a project", + "cst": "costume designer", + "ctb": "contributor", + "cte": "contestee-appellee", + "ctg": "cartographer", + "ctr": "contractor", + "cts": "contestee", + "ctt": "contestee-appellant", + "cur": "curator", + "cwt": "commentator for written text", + "dbd": "dubbing director", + "dbp": "distribution place", + "dfd": "defendant", + "dfe": "defendant-appellee", + "dft": "defendant-appellant", + "dgc": "degree committee member", + "dgg": "degree granting institution", + "dgs": "degree supervisor", + "dis": "dissertant", + "djo": "dj", + "dln": "delineator", + "dnc": "dancer", + "dnr": "donor", + "dpc": "depicted", + "dpt": "depositor", + "drm": "draftsman", + "drt": "director", + "dsr": "designer", + "dst": "distributor", + "dtc": "data contributor", + "dte": "dedicatee", + "dtm": "data manager", + "dto": "dedicator", + "dub": "dubious author", + "edc": "editor of compilation", + "edd": "editorial director", + "edm": "editor of moving image work", + "edt": "editor", + "egr": "engraver", + "elg": "electrician", + "elt": "electrotyper", + "eng": "engineer", + "enj": "enacting jurisdiction", + "etr": "etcher", + "evp": "event place", + "exp": "expert", + "fac": "facsimilist", + "fds": "film distributor", + "fld": "field director", + "flm": "film editor", + "fmd": "film director", + "fmk": "filmmaker", + "fmo": "former owner", + "fmp": "film producer", + "fnd": "funder", + "fon": "founder", + "fpy": "first party", + "frg": "forger", + "gdv": "game developer", + "gis": "geographic information specialist", + "-grt": "graphic technician", + "his": "host institution", + "hnr": "honoree", + "hst": 
"host", + "ill": "illustrator", + "ilu": "illuminator", + "ins": "inscriber", + "inv": "inventor", + "isb": "issuing body", + "itr": "instrumentalist", + "ive": "interviewee", + "ivr": "interviewer", + "jud": "judge", + "jug": "jurisdiction governed", + "lbr": "laboratory", + "lbt": "librettist", + "ldr": "laboratory director", + "led": "lead", + "lee": "libelee-appellee", + "lel": "libelee", + "len": "lender", + "let": "libelee-appellant", + "lgd": "lighting designer", + "lie": "libelant-appellee", + "lil": "libelant", + "lit": "libelant-appellant", + "lsa": "landscape architect", + "lse": "licensee", + "lso": "licensor", + "ltg": "lithographer", + "ltr": "letterer", + "lyr": "lyricist", + "mcp": "music copyist", + "mdc": "metadata contact", + "med": "medium", + "mfp": "manufacture place", + "mfr": "manufacturer", + "mka": "makeup artist", + "mod": "moderator", + "mon": "monitor", + "mrb": "marbler", + "mrk": "markup editor", + "msd": "musical director", + "mte": "metal-engraver", + "mtk": "minute taker", + "mup": "music programmer", + "mus": "musician", + "mxe": "mixing engineer", + "nan": "news anchor", + "nrt": "narrator", + "onp": "onscreen participant", + "opn": "opponent", + "org": "originator", + "orm": "organizer", + "osp": "onscreen presenter", + "oth": "other", + "own": "owner", + "pad": "place of address", + "pan": "panelist", + "pat": "patron", + "pbd": "publishing director", + "pbl": "publisher", + "pdr": "project director", + "pfr": "proofreader", + "pht": "photographer", + "plt": "platemaker", + "pma": "permitting agency", + "pmn": "production manager", + "pop": "printer of plates", + "ppm": "papermaker", + "ppt": "puppeteer", + "pra": "praeses", + "prc": "process contact", + "prd": "production personnel", + "pre": "presenter", + "prf": "performer", + "prg": "programmer", + "prm": "printmaker", + "prn": "production company", + "pro": "producer", + "prp": "production place", + "prs": "production designer", + "prt": "printer", + "prv": "provider", + 
"pta": "patent applicant", + "pte": "plaintiff-appellee", + "ptf": "plaintiff", + "pth": "patent holder", + "ptt": "plaintiff-appellant", + "pup": "publication place", + "rap": "rapporteur", + "rbr": "rubricator", + "rcd": "recordist", + "rce": "recording engineer", + "rcp": "addressee", + "rdd": "radio director", + "red": "redaktor", + "ren": "renderer", + "res": "researcher", + "rev": "reviewer", + "rpc": "radio producer", + "rps": "repository", + "rpt": "reporter", + "rpy": "responsible party", + "rse": "respondent-appellee", + "rsg": "restager", + "rsp": "respondent", + "rsr": "restorationist", + "rst": "respondent-appellant", + "rth": "research team head", + "rtm": "research team member", + "rxa": "remix artist", + "sad": "scientific advisor", + "sce": "scenarist", + "scl": "sculptor", + "scr": "scribe", + "sde": "sound engineer", + "sds": "sound designer", + "sec": "secretary", + "sfx": "special effects provider", + "sgd": "stage director", + "sgn": "signer", + "sht": "supporting host", + "sll": "seller", + "sng": "singer", + "spk": "speaker", + "spn": "sponsor", + "spy": "second party", + "srv": "surveyor", + "std": "set designer", + "stg": "setting", + "stl": "storyteller", + "stm": "stage manager", + "stn": "standards body", + "str": "stereotyper", + "swd": "software developer", + "tad": "technical advisor", + "tau": "television writer", + "tcd": "technical director", + "tch": "teacher", + "ths": "thesis advisor", + "tld": "television director", + "tlg": "television guest", + "tlh": "television host", + "tlp": "television producer", + "trc": "transcriber", + "trl": "translator", + "tyd": "type designer", + "tyg": "typographer", + "uvp": "university place", + "vac": "voice actor", + "vdg": "videographer", + "vfx": "visual effects provider", + "voc": "vocalist", + "wac": "writer of added commentary", + "wal": "writer of added lyrics", + "wam": "writer of accompanying material", + "wat": "writer of added text", + "wdc": "woodcutter", + "wde": "wood engraver", 
+ "wfs": "writer of film story", + "wft": "writer of intertitles", + "win": "writer of introduction", + "wit": "witness", + "wpr": "writer of preface", + "wst": "writer of supplementary textual content", + "wts": "writer of television story" +} diff --git a/edpop_explorer/normalizers.py b/edpop_explorer/normalizers.py index 8cdf415..1e89a8d 100644 --- a/edpop_explorer/normalizers.py +++ b/edpop_explorer/normalizers.py @@ -18,7 +18,6 @@ def normalize_by_language_code(field) -> NormalizationResult: try: language = Lang(field.original_text) field.language_code = language.pt3 - field.normalized_text = language.name return NormalizationResult.SUCCESS except InvalidLanguageValue: return NormalizationResult.FAIL diff --git a/edpop_explorer/readers/__init__.py b/edpop_explorer/readers/__init__.py index 3acfd63..fe8a645 100644 --- a/edpop_explorer/readers/__init__.py +++ b/edpop_explorer/readers/__init__.py @@ -13,6 +13,7 @@ "VD18Reader", "VDLiedReader", "STCNReader", + "STCNPersonsReader", "USTCReader", "KVCSReader", "DutchAlmanacsReader", @@ -29,7 +30,7 @@ from .hpb import HPBReader from .kb import KBReader from .sbtireader import SBTIReader -from .stcn import STCNReader +from .stcn import STCNReader, STCNPersonsReader from .ustc import USTCReader from .vd import VD16Reader, VD17Reader, VD18Reader, VDLiedReader from .kvcs import KVCSReader diff --git a/edpop_explorer/readers/kvcs.py b/edpop_explorer/readers/kvcs.py index c7834f1..237e7ea 100644 --- a/edpop_explorer/readers/kvcs.py +++ b/edpop_explorer/readers/kvcs.py @@ -24,7 +24,7 @@ def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: record.identifier = rawrecord['ID'] record.name = Field(rawrecord['Name']) record.gender = Field(rawrecord['Gender']) - record.lifespan = Field(rawrecord['Years of life']) + record.timespan = Field(rawrecord['Years of life']) record.places_of_activity = Field(rawrecord['City']) record.activity_timespan = Field(rawrecord['Years of activity']) record.activities = 
Field(rawrecord['Kind of print and sales activities']) diff --git a/edpop_explorer/readers/sbtireader.py b/edpop_explorer/readers/sbtireader.py index 148a5bf..a23e6ef 100644 --- a/edpop_explorer/readers/sbtireader.py +++ b/edpop_explorer/readers/sbtireader.py @@ -1,17 +1,16 @@ from rdflib import URIRef -import requests -from typing import List, Dict, Optional +from typing import Dict, Optional from edpop_explorer import ( - Reader, Record, ReaderError, BiographicalRecord, Field, BIOGRAPHICAL + BiographicalRecord, Field, BIOGRAPHICAL ) +from edpop_explorer.cerl import CERLReader -class SBTIReader(Reader): - api_url = 'https://data.cerl.org/sbti/_search' - api_by_id_base_url = 'https://data.cerl.org/sbti/' - link_base_url = 'https://data.cerl.org/sbti/' - fetching_exhausted: bool = False +class SBTIReader(CERLReader): + API_URL = 'https://data.cerl.org/sbti/_search' + API_BY_ID_BASE_URL = 'https://data.cerl.org/sbti/' + LINK_BASE_URL = 'https://data.cerl.org/sbti/' additional_params: Optional[Dict[str, str]] = None CATALOG_URIREF = URIRef( 'https://edpop.hum.uu.nl/readers/sbti' @@ -34,22 +33,6 @@ def _get_name_field(cls, data: dict) -> Optional[Field]: field = Field(f"{name}") return field - @classmethod - def get_by_id(cls, identifier: str) -> BiographicalRecord: - try: - response = requests.get( - cls.api_by_id_base_url + identifier, - headers={ - 'Accept': 'application/json' - }, - ).json() - except requests.exceptions.JSONDecodeError: - raise ReaderError(f"Item with id {identifier} does not exist.") - except requests.exceptions.RequestException as err: - raise ReaderError(f"Error during server request: {err}") - return cls._convert_record(response) - - @classmethod def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: record = BiographicalRecord(from_reader=cls) @@ -58,7 +41,7 @@ def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: if not record.identifier: record.identifier = rawrecord.get('_id', None) if record.identifier: - record.link = 
cls.link_base_url + record.identifier + record.link = cls.LINK_BASE_URL + record.identifier # Add fields heading = rawrecord.get("heading", None) @@ -83,64 +66,3 @@ def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: return record - def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]: - assert isinstance(self.prepared_query, str) - if maximum_records is None: - maximum_records = self.DEFAULT_RECORDS_PER_PAGE - print(f'The query is: {self.prepared_query}') - try: - response = requests.get( - self.api_url, - params={ - 'query': self.prepared_query, - 'from': start_record, - 'size': maximum_records, - 'mode': 'default', - 'sort': 'default' - }, - headers={ - 'Accept': 'application/json' - } - ).json() - except ( - requests.exceptions.RequestException - ) as err: - raise ReaderError('Error during server request: ' + str(err)) - - # TODO: check for error responses - try: - if response['hits'] is None: - self.number_of_results = 0 - else: - self.number_of_results = response['hits']['value'] - except KeyError: - raise ReaderError('Number of hits not given in server response') - - if 'rows' not in response: - # There are no rows in the response, so stop here - return [] - - records: List[Record] = [] - for rawrecord in response['rows']: - record = self._convert_record(rawrecord) - records.append(record) - - return records - - @classmethod - def transform_query(cls, query) -> str: - # No transformation needed - return query - - def fetch_range(self, range_to_fetch: range) -> range: - if self.prepared_query is None: - raise ReaderError('First call prepare_query') - if self.fetching_exhausted: - return range(0) - start_record = range_to_fetch.start - number_to_fetch = range_to_fetch.stop - start_record - results = self._perform_query(start_record, number_to_fetch) - for i, result in enumerate(results): - self.records[i] = result - return range(start_record, start_record + len(results)) - diff --git 
a/edpop_explorer/readers/stcn.py b/edpop_explorer/readers/stcn.py index ef1bab2..e8c847d 100644 --- a/edpop_explorer/readers/stcn.py +++ b/edpop_explorer/readers/stcn.py @@ -1,104 +1,247 @@ -from rdflib import Graph, Namespace, URIRef -from rdflib.term import Node +from rdflib import URIRef from typing import List, Optional, Tuple -from edpop_explorer import Field, BIBLIOGRAPHICAL -from edpop_explorer.fields import LanguageField -from edpop_explorer.sparqlreader import ( - SparqlReader, BibliographicalRDFRecord -) - - -def _get_properties_from_iri(iri: str, properties: List[Node]) -> \ - Tuple[List[Node], Graph]: - '''Get the first objects of the requested properties of a certain IRI - as strings.''' - subject_graph = Graph() - subject_graph.parse(iri) - objects: List[Node] = [] - for prop in properties: - for obj in subject_graph.objects(URIRef(iri), prop): - objects.append(obj) - return objects, subject_graph - - -class STCNReader(SparqlReader): - endpoint = 'http://data.bibliotheken.nl/sparql' - filter = '?s schema:mainEntityOfPage/schema:isPartOf ' \ - ' .' - name_predicate = '' +from edpop_explorer import Field, BIBLIOGRAPHICAL, BibliographicalRecord, LocationField, BIOGRAPHICAL, \ + BiographicalRecord +from edpop_explorer.cerl import CERLReader +from edpop_explorer.fields import LanguageField, ContributorField + + +def _remove_markup(input_str: str) -> str: + """Remove STCN-specific markup""" + return input_str.replace('`IT`', '').replace('`LO`', '') + + +def safeget(dictionary: Optional[dict], attribute_chain: tuple, first: bool = False): + """Safely get a (nested) attribute in a JSON-like structure. 
If the + result is a list and ``first`` is ``True``, return the first item + of the list.""" + if len(attribute_chain) == 0: + raise ValueError("The attribute_chain argument cannot be empty") + attribute = attribute_chain[0] + if dictionary is None or attribute not in dictionary: + return None + value = dictionary[attribute] + if first and isinstance(value, list): + value = value[0] + if len(attribute_chain) == 1: + return value + else: + return safeget(value, attribute_chain[1:], first) + + +def _wrap_contributor(actor_data: dict) -> ContributorField: + field = ContributorField(actor_data['preferred']) + field.name = actor_data['preferred'] + field.role = safeget(actor_data, ('role',), first=True) + return field + + +def _wrap_holding(holding_data: dict) -> Field: + institution = safeget(holding_data, ("data", "institutionName")) + shelfmark = safeget(holding_data, ("data", "shelfmark")) + summary = f"{institution} - {shelfmark}" + return Field(summary) + + +class STCNBaseReader(CERLReader): + """STCN uses the same search API for its bibliographical records and + its biographical records (persons and publishers/printers), but the + data format as well as detail pages are different. This base class + builds on CERLReader and adds the API URL.""" + API_URL = 'https://data.cerl.org/stcn/_search' + + +class STCNPersonsReader(STCNBaseReader): + """STCN Persons reader. 
This reader does not include printers and + publishers, because they are in a separate database.""" + API_BY_ID_BASE_URL = 'https://data.cerl.org/stcn_persons/' + LINK_BASE_URL = 'https://data.cerl.org/stcn_persons/' + CATALOG_URIREF = URIRef( + 'https://edpop.hum.uu.nl/readers/stcn' + ) + IRI_PREFIX = "https://edpop.hum.uu.nl/readers/stcn-persons/" + READERTYPE = BIOGRAPHICAL + SHORT_NAME = "STCN Persons" + DESCRIPTION = "National bibliography of The Netherlands until 1801 – persons" + + @classmethod + def transform_query(cls, query) -> str: + # Only person records + return f"({query}) AND data.type:pers" + + @classmethod + def _get_names(cls, rawrecord: dict) -> Tuple[Optional[Field], Optional[List[Field]]]: + preferred_name = safeget(rawrecord, ('shortDisplay',)) + namelist = safeget(rawrecord, ('data', 'agent')) + alternative_names = None + if namelist: + alternative_names = [x["variants"] for x in namelist if x["variants"] != preferred_name] + preferred_name_field = Field(preferred_name) if preferred_name else None + alternative_names_field = [Field(x) for x in alternative_names] if alternative_names else None + return preferred_name_field, alternative_names_field + + @classmethod + def _get_timespan(cls, rawrecord: dict) -> Optional[Field]: + timespan = safeget(rawrecord, ("dates",)) + if timespan: + return Field(timespan) + + @classmethod + def _get_activities(cls, rawrecord: dict) -> Optional[List[Field]]: + profession_notes = safeget(rawrecord, ("data", "professionNote",)) + if not profession_notes: + return None + return [Field(x) for x in profession_notes] + + @classmethod + def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: + record = BiographicalRecord(from_reader=cls) + record.data = rawrecord + record.identifier = rawrecord.get('id', None) + if record.identifier: + record.link = cls.LINK_BASE_URL + record.identifier + record.name, record.variant_names = cls._get_names(rawrecord) + record.timespan = cls._get_timespan(rawrecord) + 
record.activities = cls._get_activities(rawrecord) + return record + + +class STCNReader(STCNBaseReader): + API_BY_ID_BASE_URL = 'https://data.cerl.org/stcn/' + LINK_BASE_URL = 'https://data.cerl.org/stcn/' CATALOG_URIREF = URIRef( 'https://edpop.hum.uu.nl/readers/stcn' ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/stcn/" READERTYPE = BIBLIOGRAPHICAL SHORT_NAME = "Short-Title Catalogue Netherlands (STCN)" - DESCRIPTION = "National biography of The Netherlands until 1801" - - def __init__(self): - super().__init__() - - @classmethod - def convert_record( - cls, graph: Graph, record: BibliographicalRDFRecord - ) -> None: - SCHEMA = Namespace('http://schema.org/') - # First get the title and languages fields, which are simple - # properties - assert record.identifier is not None - subject_node = URIRef(record.identifier) - for name in graph.objects(subject_node, SCHEMA.name): - record.title = Field(str(name)) - break - record.languages = [] - for language in graph.objects(subject_node, SCHEMA.inLanguage): - field = LanguageField(str(language)) + DESCRIPTION = "National bibliography of The Netherlands until 1801" + + @classmethod + def transform_query(cls, query) -> str: + # Keep only bibliographical records (exclude pers and impr record types) + return f"({query}) NOT data.type:pers NOT data.type:impr" + + @classmethod + def _get_title(cls, rawrecord: dict) -> Optional[Field]: + title = safeget(rawrecord, ("display", "title")) + if isinstance(title, str): + title = _remove_markup(title) + return Field(title) + + @classmethod + def _get_contributors(cls, rawrecord: dict) -> List[Field]: + actors = safeget(rawrecord, ("data", "agent")) + if not actors: + return [] + return [_wrap_contributor(x) for x in actors if x.get('preferred')] + + @classmethod + def _get_publisher_or_printer(cls, rawrecord: dict) -> Optional[Field]: + # TODO: support multiple publishers/printers + provision_agent = safeget(rawrecord, ("data", "provisionAgent"), first=True) + if provision_agent is None: + return None + name = 
safeget(provision_agent, ("preferred",)) + if name is None: + return None + field = Field(name) + return field + + @classmethod + def _get_place_of_publication(cls, rawrecord: dict) -> Optional[Field]: + place = safeget(rawrecord, ("data", "provisionAgent", "place"), first=True) + if place is None: + return None + else: + field = LocationField(place) + field.location_type = LocationField.LOCALITY + return field + + @classmethod + def _get_languages(cls, rawrecord: dict) -> List[Field]: + languages = safeget(rawrecord, ("data", "language")) + if languages is None: + return [] + fields = [] + for language in languages: + field = LanguageField(language) field.normalize() - record.languages.append(field) - # Now get the information from blank nodes - record.contributors = [] - for author in graph.objects(subject_node, SCHEMA.author): - name_field = None - for name in graph.objects(author, SCHEMA.name): - name_field = Field(str(name)) - # TODO: add role and authority record - if name_field: - record.contributors.append(name_field) - for publication in graph.objects(subject_node, SCHEMA.publication): - year_field = None - for startDate in graph.objects(publication, SCHEMA.startDate): - year_field = Field(str(startDate)) - if year_field: - record.dating = year_field - # TODO: publisher and location (not a blank node) - published_by_iri = None - for publishedBy in graph.objects(publication, SCHEMA.publishedBy): - published_by_iri = str(publishedBy) - break - if published_by_iri: - [name, location_node], pubgraph = _get_properties_from_iri( - published_by_iri, [SCHEMA.name, SCHEMA.location] - ) - record.publisher_or_printer = Field(str(name)) - address_node = None - for address in pubgraph.objects(location_node, SCHEMA.address): - address_node = address - break - if address_node: - for addressLocality in pubgraph.objects( - address_node, SCHEMA.addressLocality - ): - record.place_of_publication = Field( - str(addressLocality) - ) - break - - @classmethod - def 
_create_lazy_record( - cls, iri: str, name: Optional[str]=None - ) -> BibliographicalRDFRecord: - record = BibliographicalRDFRecord(cls) - record.identifier = iri - record.link = iri - record.title = Field(name) if name else None + fields.append(field) + return fields + + @classmethod + def _get_dating(cls, rawrecord: dict) -> Optional[Field]: + dating = safeget(rawrecord, ("data", "date")) + if dating is not None: + return Field(dating) + + @classmethod + def _get_extent(cls, rawrecord: dict) -> Optional[Field]: + sheets = safeget(rawrecord, ("data", "extent", "sheets")) + if sheets is None: + return None + extent = f"{sheets} sheets" + return Field(extent) + + @classmethod + def _get_format(cls, rawrecord: dict) -> Optional[Field]: + format_ = safeget(rawrecord, ("data", "format", "format")) + if format_ is None: + return None + return Field(format_) + + @classmethod + def _get_collation_formula(cls, rawrecord: dict) -> Optional[Field]: + collations = safeget(rawrecord, ("data", "extent", "collation")) + if not collations: + return None + # Multiple collation formulas are possible, but this seems to be rare. 
+ collation_string = ' ; '.join([x.get("value") for x in collations if "value" in x]) + return Field(collation_string) + + @classmethod + def _get_fingerprint(cls, rawrecord: dict) -> Optional[Field]: + fingerprints = safeget(rawrecord, ("data", "fingerprint")) + if not fingerprints: + return None + # Multiple fingerprints are possible, but this seems to be rare + fingerprint_string = ' ; '.join([x.get("fingerprint") for x in fingerprints if "fingerprint" in x]) + return Field(fingerprint_string) + + @classmethod + def _get_genres(cls, rawrecord: dict) -> List[Field]: + subjecttopics = safeget(rawrecord, ("data", "subjectTopic")) + if subjecttopics is None: + return [] + fields = [Field(x["preferred"]) for x in subjecttopics if "preferred" in x] + return fields + + @classmethod + def _get_holdings(cls, rawrecord: dict) -> List[Field]: + holdings = safeget(rawrecord, ("data", "holdings")) + if holdings is None: + return [] + return [_wrap_holding(x) for x in holdings] + + @classmethod + def _convert_record(cls, rawrecord: dict) -> BibliographicalRecord: + record = BibliographicalRecord(from_reader=cls) + record.data = rawrecord + record.identifier = rawrecord.get('id', None) + if record.identifier: + record.link = cls.LINK_BASE_URL + record.identifier + record.title = cls._get_title(rawrecord) + record.contributors = cls._get_contributors(rawrecord) + record.publisher_or_printer = cls._get_publisher_or_printer(rawrecord) + record.place_of_publication = cls._get_place_of_publication(rawrecord) + record.dating = cls._get_dating(rawrecord) + record.languages = cls._get_languages(rawrecord) + record.extent = cls._get_extent(rawrecord) + record.bibliographical_format = cls._get_format(rawrecord) + record.collation_formula = cls._get_collation_formula(rawrecord) + record.fingerprint = cls._get_fingerprint(rawrecord) + record.genres = cls._get_genres(rawrecord) + record.holdings = cls._get_holdings(rawrecord) return record diff --git a/edpop_explorer/record.py 
b/edpop_explorer/record.py index 10fecb3..9447905 100644 --- a/edpop_explorer/record.py +++ b/edpop_explorer/record.py @@ -215,6 +215,11 @@ class BibliographicalRecord(Record): physical_description: Optional[Field] = None bookseller: Optional[Field] = None location: Optional[Field] = None + bibliographical_format: Optional[Field] = None + fingerprint: Optional[Field] = None + collation_formula: Optional[Field] = None + genres: Optional[List[Field]] = None + holdings: Optional[List[Field]] = None def __init__(self, from_reader: Type["Reader"]): super().__init__(from_reader) @@ -232,6 +237,11 @@ def __init__(self, from_reader: Type["Reader"]): ('physical_description', EDPOPREC.physicalDescription, Field), ('bookseller', EDPOPREC.bookseller, Field), ('location', EDPOPREC.location, Field), + ('bibliographical_format', EDPOPREC.bibliographicalFormat, Field), + ('fingerprint', EDPOPREC.fingerprint, Field), + ('collation_formula', EDPOPREC.collationFormula, Field), + ('genres', EDPOPREC.genre, Field), + ('holdings', EDPOPREC.holdings, Field), ] def __str__(self) -> str: @@ -255,13 +265,13 @@ class BiographicalRecord(Record): activity_timespan: Optional[Field] = None activities: Optional[List[Field]] = None gender: Optional[Field] = None - lifespan: Optional[Field] = None + timespan: Optional[Field] = None def __init__(self, from_reader: Type["Reader"]): super().__init__(from_reader) assert isinstance(self._fields, list) self._fields += [ - ('name', EDPOPREC.title, Field), + ('name', EDPOPREC.name, Field), ('variant_names', EDPOPREC.variantName, Field), ('place_of_birth', EDPOPREC.placeOfBirth, Field), ('place_of_death', EDPOPREC.placeOfDeath, Field), @@ -269,7 +279,7 @@ def __init__(self, from_reader: Type["Reader"]): ('activity_timespan', EDPOPREC.timespan, Field), ('activities', EDPOPREC.activity, Field), ('gender', EDPOPREC.gender, Field), - ('lifespan', EDPOPREC.lifespan, Field), + ('timespan', EDPOPREC.timespan, Field), ] def __str__(self) -> str: diff --git 
a/tests/readers/__init__.py b/tests/readers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/readers/test_stcn.py b/tests/readers/test_stcn.py new file mode 100644 index 0000000..4a5ab51 --- /dev/null +++ b/tests/readers/test_stcn.py @@ -0,0 +1,46 @@ +import pytest + +from edpop_explorer.readers.stcn import safeget + + +def test_safeget_empty_attribute_chain(): + with pytest.raises(ValueError): + safeget(None, ()) + +def test_safeget_empty_dict(): + assert safeget({}, ("attribute",)) is None + +def test_safeget_none(): + assert safeget(None, ("attribute",)) is None + +def test_safeget_simple(): + assert safeget({"attribute": "value"}, ("attribute",)) == "value" + +def test_safeget_nested(): + assert safeget( + { + "attribute": {"attribute2": "value"} + }, ("attribute", "attribute2") + ) == "value" + +def test_safeget_nested_first_attribute_none(): + assert safeget({ + "attribute": None + }, ("attribute", "attribute2")) is None + +def test_safeget_nested_first_attribute_nonexistent(): + assert safeget({ + "other_attribute": None + }, ("attribute", "attribute2")) is None + +def test_safeget_nested_second_attribute_nonexistent(): + assert safeget({ + "attribute": { + "other_attribute": "value" + } + }, ("attribute", "attribute2")) is None + +def test_safeget_first(): + assert safeget({ + "attribute": ["value1", "value2"] + }, ("attribute",), True) == "value1" \ No newline at end of file diff --git a/tests/test_field.py b/tests/test_field.py index c2a5cb5..7cf93ec 100644 --- a/tests/test_field.py +++ b/tests/test_field.py @@ -4,7 +4,6 @@ from edpop_explorer import Field, FieldError, LocationField from edpop_explorer import EDPOPREC -from edpop_explorer.normalizers import NormalizationResult @fixture @@ -32,14 +31,6 @@ def test_to_graph(self, basic_field: Field): EDPOPREC.originalText, Literal(basic_field.original_text) ) in graph - # Test string from property - basic_field.normalized_text = 'normalized' - graph = basic_field.to_graph() - 
assert ( - basic_field.subject_node, - EDPOPREC.normalizedText, - Literal(basic_field.normalized_text) - ) in graph # Test boolean basic_field.unknown = True graph = basic_field.to_graph() @@ -62,29 +53,6 @@ def test_to_graph(self, basic_field: Field): with raises(FieldError): basic_field.to_graph() - def test_normalized_text(self, basic_field: Field): - # If nothing is set, this should be None - assert basic_field.normalized_text is None - # Set normalized text by hand - text = 'normalized' - basic_field.normalized_text = text - assert basic_field.normalized_text == text - # Now test a class with automatic normalized text creation - - def complex_normalizer(field): - field.normalized_text = field.original_text.capitalize() - return NormalizationResult.SUCCESS - - class ComplexField(Field): - normalizer = complex_normalizer - title = 'title' - complex_field = ComplexField(title) - complex_field.normalize() - assert complex_field.normalized_text == title.capitalize() - # A manual normalized text should override this - complex_field.normalized_text = text - assert complex_field.normalized_text == text - class TestLocationField: def test_basic_form(self, basic_location_field: LocationField):