Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Normalize the language field #48

Merged
merged 4 commits into from
Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 22 additions & 27 deletions edpop_explorer/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
"""

from typing import Optional, Callable, List, Tuple

from rdflib import Graph, Literal, BNode, RDF, URIRef
from rdflib.term import Node

from edpop_explorer import EDPOPREC
from edpop_explorer import EDPOPREC, normalizers
from edpop_explorer.normalizers import NormalizationResult

DATATYPES = {
'string': {
Expand Down Expand Up @@ -68,13 +70,13 @@ class Field:
#: by default.
subject_node: Node
_subfields: List[Tuple[str, URIRef, str]]
_normalized_text: Optional[str] = None
normalized_text: Optional[str] = None
#: Subfield -- indicates whether the value of this field is explicitly
#: marked as unknown in the original record.
unknown: Optional[bool] = None
#: Subfield -- may contain the URI of an authority record
authority_record: Optional[str] = None
_create_normalized_text: Optional[Callable] = None
normalizer: Optional[Callable] = None
_rdf_class: Node = EDPOPREC.Field

def __init__(self, original_text: str) -> None:
Expand All @@ -91,30 +93,11 @@ def __init__(self, original_text: str) -> None:
('authority_record', EDPOPREC.authorityRecord, 'string'),
]

def set_normalized_text(self, text: Optional[str]):
"""Manually set the normalized text.

In case of subclasses that support automatic creation of the
normalized text, this method will override the automatic version.
Give None as an argument to reset the normalized text."""
self._normalized_text = text

@property
def normalized_text(self) -> Optional[str]:
"""Subfield -- a human-readable string representation of the normalized
field.

Should be set manually in the basic ``Field`` class with
``set_normalized_text`` or is automatically created in more complex
subclasses. Contains ``None`` in case there is no normalization."""
if self._normalized_text is not None:
return self._normalized_text
if callable(self._create_normalized_text):
text = self._create_normalized_text()
assert isinstance(text, str)
return text
else:
return None
def normalize(self) -> NormalizationResult:
    """Perform normalization on this field, based on the ``normalizer``
    attribute.

    Subclasses of ``Field`` may predefine a normalizer function, but this
    can always be overridden. The normalizer is invoked as
    ``self.normalizer()``; a function assigned at class level is therefore
    called as a bound method and receives this field as its only argument.

    Returns the ``NormalizationResult`` reported by the normalizer, or
    ``NormalizationResult.FAIL`` if no normalizer is configured."""
    if self.normalizer is None:
        # ``normalizer`` defaults to None on the basic ``Field`` class;
        # calling it unguarded would raise
        # "TypeError: 'NoneType' object is not callable". Report that no
        # normalization took place instead of crashing the caller.
        return NormalizationResult.FAIL
    return self.normalizer()

def to_graph(self) -> Graph:
'''Create an ``rdflib`` RDF graph according to the current data.'''
Expand Down Expand Up @@ -176,3 +159,15 @@ def __init__(self, original_text: str) -> None:
('location_type', EDPOPREC.locationType, 'uriref')
)


class LanguageField(Field):
    """A ``Field`` for languages, extended with a ``language_code``
    subfield.

    The class-level ``normalizer`` is
    ``normalizers.normalize_by_language_code``, so ``normalize()`` on an
    instance runs that function on the instance itself."""
    _rdf_class = EDPOPREC.LanguageField
    #: Subfield -- language code; ``None`` until it is set (for instance
    #: by the normalizer).
    language_code: Optional[str] = None
    normalizer = normalizers.normalize_by_language_code

    def __init__(self, original_text: str) -> None:
        super().__init__(original_text)
        # Register the extra subfield next to the ones set up by ``Field``.
        # Presumably (attribute name, RDF predicate, datatype key) -- this
        # matches the declared type of ``_subfields``.
        language_code_subfield = (
            'language_code', EDPOPREC.languageCode, 'string'
        )
        self._subfields.append(language_code_subfield)

24 changes: 24 additions & 0 deletions edpop_explorer/normalizers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from iso639 import Lang
from iso639.exceptions import InvalidLanguageValue
from enum import Enum


class NormalizationResult(Enum):
    """Outcome reported by a normalizer function."""

    # The normalizer produced normalized data for the field.
    SUCCESS = 'success'
    # The field had no original data to normalize.
    NO_DATA = 'nodata'
    # The original data could not be normalized.
    FAIL = 'fail'


def normalize_by_language_code(field) -> NormalizationResult:
    """Normalize a language field using the iso639-lang package.

    The package allows the name of the language in English as input, as
    well as one of the ISO-639 language codes. On success,
    ``field.language_code`` is set to the ``pt3`` attribute of the matched
    language and ``field.normalized_text`` to its ``name``.

    Returns ``NO_DATA`` if ``field.original_text`` is ``None``, ``FAIL`` if
    iso639-lang rejects the value with ``InvalidLanguageValue`` (the field
    is left untouched in both cases), and ``SUCCESS`` otherwise."""
    original = field.original_text
    if original is None:
        return NormalizationResult.NO_DATA
    # NOTE(review): iso639-lang appears to raise a separate
    # ``DeprecatedLanguageValue`` for retired codes, which is not caught
    # here -- confirm against the installed version.
    try:
        language = Lang(original)
    except InvalidLanguageValue:
        return NormalizationResult.FAIL
    field.language_code = language.pt3
    field.normalized_text = language.name
    return NormalizationResult.SUCCESS
4 changes: 3 additions & 1 deletion edpop_explorer/readers/fbtee.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from edpop_explorer import (
Reader, BibliographicalRecord, ReaderError, Field, BIBLIOGRAPHICAL, DatabaseFileMixin
)
from edpop_explorer.fields import LanguageField
from edpop_explorer.reader import GetByIdBasedOnQueryMixin
from edpop_explorer.sql import SQLPreparedQuery

Expand Down Expand Up @@ -46,7 +47,8 @@ def _add_fields(cls, record: BibliographicalRecord) -> None:
record.title = Field(record.data['full_book_title'])
if record.data['languages']:
languages = record.data['languages'].split(sep=', ')
record.languages = [Field(x) for x in languages]
record.languages = [LanguageField(x) for x in languages]
[x.normalize() for x in record.languages]
pages = record.data['pages']
if pages:
record.extent = Field(pages)
Expand Down
5 changes: 4 additions & 1 deletion edpop_explorer/readers/gallica.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import requests
import xmltodict

from edpop_explorer.fields import LanguageField


def _force_list(data) -> list:
if isinstance(data, list):
Expand Down Expand Up @@ -72,7 +74,8 @@ def _convert_record(cls, sruthirecord: dict) -> BibliographicalRecord:
if dating:
record.dating = Field(dating)
languages = _force_list(sruthirecord.get('language', None))
record.languages = [Field(x) for x in languages]
record.languages = [LanguageField(x) for x in languages]
[x.normalize() for x in record.languages]
publisher = _force_string(sruthirecord.get('publisher', None))
if publisher:
record.publisher_or_printer = Field(publisher)
Expand Down
8 changes: 6 additions & 2 deletions edpop_explorer/readers/kb.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from rdflib import URIRef
from edpop_explorer import SRUReader, BibliographicalRecord, BIBLIOGRAPHICAL
from edpop_explorer import Field
from edpop_explorer.fields import LanguageField


class KBReader(SRUReader):
Expand Down Expand Up @@ -76,7 +77,10 @@ def _get_languages(self, data) -> Optional[List[Field]]:
# consisting of three characters are language codes.
if 'language' not in data:
return []
return [
Field(x) for x in data['language']
fields = [
LanguageField(x) for x in data['language']
if isinstance(x, str) and len(x) == 3
]
for field in fields:
field.normalize()
return fields
5 changes: 4 additions & 1 deletion edpop_explorer/readers/pierre_belle.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from edpop_explorer import Reader, ReaderError, BibliographicalRecord, Field, DatabaseFileMixin, BIBLIOGRAPHICAL
from rdflib import URIRef

from edpop_explorer.fields import LanguageField


class PierreBelleReader(DatabaseFileMixin, Reader):
""" Pierre-Belle database reader. Access with command 'pb'."""
Expand All @@ -24,7 +26,8 @@ def _convert_record(cls, rawrecord: dict) -> BibliographicalRecord:
record.data = rawrecord
record.identifier = rawrecord['ID']
record.title = Field(rawrecord['Shortened title'])
record.languages = [Field(rawrecord['Language'])]
record.languages = [LanguageField(rawrecord['Language'])]
[x.normalize() for x in record.languages]
record.publisher_or_printer = Field(rawrecord['Publisher'])
record.place_of_publication = Field(rawrecord['Place of publication'])
record.dating = Field(rawrecord['Date'])
Expand Down
5 changes: 4 additions & 1 deletion edpop_explorer/readers/stcn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import List, Optional, Tuple

from edpop_explorer import Field, BIBLIOGRAPHICAL
from edpop_explorer.fields import LanguageField
from edpop_explorer.sparqlreader import (
SparqlReader, BibliographicalRDFRecord
)
Expand Down Expand Up @@ -51,7 +52,9 @@ def convert_record(
break
record.languages = []
for language in graph.objects(subject_node, SCHEMA.inLanguage):
record.languages.append(Field(str(language)))
field = LanguageField(str(language))
field.normalize()
record.languages.append(field)
# Now get the information from blank nodes
record.contributors = []
for author in graph.objects(subject_node, SCHEMA.author):
Expand Down
5 changes: 4 additions & 1 deletion edpop_explorer/readers/ustc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
Reader, BibliographicalRecord, ReaderError, Field, BIBLIOGRAPHICAL,
GetByIdBasedOnQueryMixin, DatabaseFileMixin
)
from edpop_explorer.fields import LanguageField
from edpop_explorer.sql import SQLPreparedQuery


Expand Down Expand Up @@ -107,7 +108,9 @@ def _convert_record(self, data: dict) -> BibliographicalRecord:
for i in range(4):
fieldname = f'language_{i + 1}'
if data[fieldname]:
record.languages.append(Field(data[fieldname]))
field = LanguageField(data[fieldname])
field.normalize()
record.languages.append(field)
if data['pagination']:
record.extent = Field(data['pagination'])
return record
Expand Down
6 changes: 4 additions & 2 deletions edpop_explorer/srumarc21reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from edpop_explorer import (
BibliographicalRecord, RawData, SRUReader, Field, BIBLIOGRAPHICAL
)

from edpop_explorer.fields import LanguageField

READABLE_FIELDS_FILE = Path(__file__).parent / 'M21_fields.csv'
translation_dictionary: Dict[str, str] = {}
Expand Down Expand Up @@ -244,7 +244,9 @@ def _convert_record(cls, sruthirecord: dict) -> Marc21BibliographicalRecord:
# TODO: look up if this field is repeatable - if so support multiple
# languages
if language:
record.languages = [Field(language)]
language_field = LanguageField(language)
language_field.normalize()
record.languages = [language_field]
dating = data.get_first_subfield(*cls._dating_field_subfield)
if dating:
record.dating = Field(dating)
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ dependencies = [
'Pygments',
'xmltodict>=0.13.0',
'typing_extensions',
'iso639-lang',
]

description = "Common interface to multiple library catalogues and bibliographical databases"
Expand Down
15 changes: 10 additions & 5 deletions tests/test_field.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from edpop_explorer import Field, FieldError, LocationField
from edpop_explorer import EDPOPREC
from edpop_explorer.normalizers import NormalizationResult


@fixture
Expand Down Expand Up @@ -32,7 +33,7 @@ def test_to_graph(self, basic_field: Field):
Literal(basic_field.original_text)
) in graph
# Test string from property
basic_field.set_normalized_text('normalized')
basic_field.normalized_text = 'normalized'
graph = basic_field.to_graph()
assert (
basic_field.subject_node,
Expand Down Expand Up @@ -66,18 +67,22 @@ def test_normalized_text(self, basic_field: Field):
assert basic_field.normalized_text is None
# Set normalized text by hand
text = 'normalized'
basic_field.set_normalized_text(text)
basic_field.normalized_text = text
assert basic_field.normalized_text == text
# Now test a class with automatic normalized text creation

def complex_normalizer(field):
field.normalized_text = field.original_text.capitalize()
return NormalizationResult.SUCCESS

class ComplexField(Field):
def _create_normalized_text(self):
return self.original_text.capitalize()
normalizer = complex_normalizer
title = 'title'
complex_field = ComplexField(title)
complex_field.normalize()
assert complex_field.normalized_text == title.capitalize()
# A manual normalized text should override this
complex_field.set_normalized_text(text)
complex_field.normalized_text = text
assert complex_field.normalized_text == text


Expand Down
Loading