From 06fec52fb5592f978ad70998c64da54ac9498d85 Mon Sep 17 00:00:00 2001 From: Sheean Spoel Date: Fri, 5 Aug 2022 14:10:49 +0200 Subject: [PATCH 1/8] More error logging --- corpus2alpino/annotators/alpino.py | 20 +++++++++++++++----- corpus2alpino/writers/lassy.py | 5 ++--- setup.py | 2 +- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/corpus2alpino/annotators/alpino.py b/corpus2alpino/annotators/alpino.py index 97fc5e2..8b9137d 100644 --- a/corpus2alpino/annotators/alpino.py +++ b/corpus2alpino/annotators/alpino.py @@ -3,6 +3,9 @@ Wrapper for the Alpino parser. """ +from .alpino_client import AlpinoProcessClient, AlpinoServerClient +from corpus2alpino.models import Document, MetadataValue +from corpus2alpino.abstracts import Annotator import re import os import logging @@ -11,10 +14,6 @@ ANNOTATION_KEY = 'alpino' -from corpus2alpino.abstracts import Annotator -from corpus2alpino.models import Document, MetadataValue - -from .alpino_client import AlpinoProcessClient, AlpinoServerClient timealign_symbol = re.compile(r'\u0015') @@ -52,4 +51,15 @@ def annotate(self, document: Document): self.client.version_date.isoformat(), 'date') except Exception as exception: logging.getLogger().error( - Exception("Problem parsing: {0}|{1}\n{2}".format(utterance.id, utterance.text, exception))) + Exception("Problem parsing: {0}:{1}|{2}\n{3}".format(self.__document_path(document), utterance.id, utterance.text, exception))) + + def __document_path(self, document: Document): + value = document.collected_file.filename + + if document.collected_file.relpath: + value = document.collected_file.relpath + '/' + value + + if document.subpath: + value += '//' + document.subpath + + return value diff --git a/corpus2alpino/writers/lassy.py b/corpus2alpino/writers/lassy.py index 20d3570..a72248e 100644 --- a/corpus2alpino/writers/lassy.py +++ b/corpus2alpino/writers/lassy.py @@ -41,11 +41,10 @@ def write_utterance(self, document: Document, target: Target, utterance: Utteran return target.write(document, self.render_annotation( - document, utterance, not filename), filename) + document, utterance, annotation, not filename), filename) - def render_annotation(self, document: Document, utterance: Utterance, remove_header=False) -> str: + def render_annotation(self, document: Document, utterance: Utterance, annotation: str, remove_header=False) -> str: metadata = {**document.metadata, **utterance.metadata} - annotation = utterance.annotations[ANNOTATION_KEY] if not metadata and remove_header == False: return annotation diff --git a/setup.py b/setup.py index 9a92174..eb1258e 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ ], install_requires=['argparse', 'chamd>=0.5.8', 'folia', 'spacy', 'tei-reader', 'tqdm'], - python_requires='>=3.6', + python_requires='>=3.7', zip_safe=True, entry_points={ 'console_scripts': [ From 8d9db05480449f513205d198db1f421b8684a414 Mon Sep 17 00:00:00 2001 From: Sheean Spoel Date: Wed, 3 Apr 2024 10:40:17 +0200 Subject: [PATCH 2/8] FoLiA edge cases --- .github/workflows/test.yml | 4 +- corpus2alpino/__main__.py | 4 +- corpus2alpino/readers/folia.py | 68 +++++++++++------- requirements.txt | 121 +++++++++++++++++---------------- 4 files changed, 110 insertions(+), 87 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 357446d..baa0261 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,10 +5,10 @@ on: [push] jobs: build: - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.7', '3.10'] + python-version: ['3.8', '3.10'] steps: - uses: actions/checkout@v3 diff --git a/corpus2alpino/__main__.py b/corpus2alpino/__main__.py index 185a9e6..e9e8f01 100644 --- a/corpus2alpino/__main__.py +++ b/corpus2alpino/__main__.py @@ -51,7 +51,7 @@ def main(args=None): parser.add_argument( '-p', '--progress', action='store_true', - help='Show progress bar, automatically turned on file output') + help='Show progress bar, automatically turned on for file output') parser.add_argument('-t', '--split_treebanks', action='store_true', help='Split treebanks to separate files') @@ -88,7 +88,7 @@ def main(args=None): converter.target = FilesystemTarget( options.output_path, not options.split_treebanks) - show_progress = options.progress if options.progress != None else options.output_path != None + show_progress = options.output_path != None or options.progress if show_progress: with tqdm(converter.convert(), total=len(options.file_names), unit='file') as progress: diff --git a/corpus2alpino/readers/folia.py b/corpus2alpino/readers/folia.py index 2f2b76d..168d101 100644 --- a/corpus2alpino/readers/folia.py +++ b/corpus2alpino/readers/folia.py @@ -6,14 +6,12 @@ from typing import Iterable from corpus2alpino.abstracts import Reader -from corpus2alpino.models import (CollectedFile, Document, MetadataValue, - Utterance) +from corpus2alpino.models import CollectedFile, Document, MetadataValue, Utterance from corpus2alpino.readers.tokenizer import Tokenizer import folia.main as folia -from .alpino_brackets import (escape_id, escape_word, format_add_lex, - format_folia) +from .alpino_brackets import escape_id, escape_word, format_add_lex, format_folia class FoliaReader(Reader): @@ -26,41 +24,54 @@ def __init__(self, custom_tokenizer=None) -> None: def read(self, collected_file: CollectedFile) -> Iterable[Document]: try: - doc = folia.Document(string=collected_file.content, - autodeclare=True, - loadsetdefinitions=False) + doc = folia.Document( + string=collected_file.content, + autodeclare=True, + loadsetdefinitions=False, + ) self.tokenize(doc) doc_metadata = self.get_metadata_dict(doc.metadata.items()) - yield Document(collected_file, - list(self.get_utterances(doc, doc_metadata)), - doc_metadata) + yield Document( + collected_file, + list(self.get_utterances(doc, doc_metadata)), + doc_metadata, + ) except Exception as e: - raise Exception(collected_file.relpath + "/" + - collected_file.filename) from e + raise Exception( + collected_file.relpath + "/" + collected_file.filename + ) from e def tokenize(self, element): """ Tokenizes all the text which isn't tokenized yet. """ + if len(element) == 0: + # no sub elements + if isinstance(element, folia.Text): + self.tokenize_element(element.text(), element) + return for item in element: if isinstance(item, folia.AbstractElement): if isinstance(item, folia.Paragraph): - for sentence in item.sentences(): + for _ in item.sentences(): break else: self.tokenize_paragraph(item) else: self.tokenize(item) - def tokenize_paragraph(self, paragraph): - text = '' - for textContent in paragraph.select(folia.TextContent): - text += textContent.text() + def tokenize_paragraph(self, paragraph: folia.Paragraph): + text = "" + for text_content in paragraph.select(folia.TextContent): + text += text_content.text() + self.tokenize_element(text, paragraph) + + def tokenize_element(self, text: str, element: folia.AbstractElement): sentences = self.tokenizer.process(text) for line in sentences: - sentence = paragraph.add(folia.Sentence) + sentence = element.add(folia.Sentence) for word in line.tokens(): if word: sentence.add(folia.Word, word) @@ -88,7 +99,9 @@ def get_utterances(self, doc, doc_metadata): if word_sentence != sentence or word_paragraph != paragraph: if words: if sentence or paragraph: - yield self.create_utterance(paragraph, sentence, words, doc_metadata) + yield self.create_utterance( + paragraph, sentence, words, doc_metadata + ) words = [] sentence = word_sentence paragraph = word_paragraph @@ -104,7 +117,7 @@ def create_utterance(self, paragraph, sentence, words, doc_metadata): """ word_strings = map(lambda word: self.get_word_string(word), words) - line = " ".join(filter(lambda word: word != '', word_strings)) + line = " ".join(filter(lambda word: word != "", word_strings)) if sentence: container = sentence @@ -113,8 +126,8 @@ def create_utterance(self, paragraph, sentence, words, doc_metadata): sentence_id = escape_id(container.id) sentence_metadata = self.get_metadata_dict( - container.getmetadata().items(), - doc_metadata) + container.getmetadata().items(), doc_metadata + ) return Utterance(line, sentence_id, sentence_metadata, line) @@ -135,7 +148,7 @@ def get_word_string(self, word): text = item.text() break else: - return '' + return "" try: correction = word.getcorrection() @@ -159,8 +172,11 @@ def get_word_string(self, word): def get_metadata_dict(self, native_metadata, filter_by=None): metadata = {} for key, value in native_metadata: - if filter_by == None or not key in filter_by \ - or filter_by[key].value != value: + if ( + filter_by == None + or key not in filter_by + or filter_by[key].value != value + ): metadata[key] = MetadataValue(value) return metadata @@ -169,4 +185,4 @@ def test_file(self, file: CollectedFile): Determine whether this is a FoLiA XML file """ - return ' Date: Wed, 3 Apr 2024 10:51:20 +0200 Subject: [PATCH 3/8] Fixed typing --- README.md | 2 +- corpus2alpino/converter.py | 19 +++++++------- corpus2alpino/models.py | 39 ++++++++++++++++------------- corpus2alpino/targets/console.py | 17 +++++++------ corpus2alpino/targets/filesystem.py | 35 ++++++++++++++------------ corpus2alpino/targets/memory.py | 20 ++++++++------- 6 files changed, 71 insertions(+), 61 deletions(-) diff --git a/README.md b/README.md index 1927eb9..2b7a846 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [![Actions Status](https://github.com/UUDigitalHumanitiesLab/corpus2alpino/workflows/Unit%20tests/badge.svg)](https://github.com/UUDigitalHumanitiesLab/corpus2alpino/actions) -[PyPi/corpus2alpino](https://pypi.org/project/corpus2alpino/) +[![PyPi/corpus2alpino](https://img.shields.io/pypi/v/corpus2alpino)](https://pypi.org/project/corpus2alpino/) # CHAT, FoLiA, PaQu metadata, plaintext and TEI to Alpino XML or PaQu metadata format diff --git a/corpus2alpino/converter.py b/corpus2alpino/converter.py index 206b6d7..a4fc516 100644 --- a/corpus2alpino/converter.py +++ b/corpus2alpino/converter.py @@ -1,11 +1,8 @@ #!/usr/bin/env python3 -from typing import List +from typing import List, Optional -from corpus2alpino.collectors.filesystem import FilesystemCollector from corpus2alpino.readers.auto import AutoReader from corpus2alpino.targets.console import ConsoleTarget -from corpus2alpino.targets.filesystem import FilesystemTarget -from corpus2alpino.writers.lassy import LassyWriter from corpus2alpino.writers.paqu import PaQuWriter from corpus2alpino.abstracts import Annotator, Collector, Reader, Target, Writer @@ -16,12 +13,14 @@ class Converter: Class for converting files to Alpino XML (input) files. """ - def __init__(self, - collector: Collector, - annotators: List[Annotator] = None, - reader: Reader = AutoReader(), - writer: Writer = PaQuWriter(), - target: Target = ConsoleTarget()) -> None: + def __init__( + self, + collector: Collector, + annotators: Optional[List[Annotator]] = None, + reader: Reader = AutoReader(), + writer: Writer = PaQuWriter(), + target: Target = ConsoleTarget(), + ) -> None: self.collector = collector self.annotators = annotators or [] self.reader = reader diff --git a/corpus2alpino/models.py b/corpus2alpino/models.py index 0d60695..154953a 100644 --- a/corpus2alpino/models.py +++ b/corpus2alpino/models.py @@ -1,10 +1,11 @@ #!/usr/bin/env python3 -from typing import Dict, List, Iterable +from typing import Dict, Iterable, Optional class CollectedFile: - def __init__(self, relpath: str, filename: str, mimetype: str, - content: str) -> None: + def __init__( + self, relpath: str, filename: str, mimetype: str, content: str + ) -> None: self.relpath = relpath self.filename = filename self.mimetype = mimetype @@ -12,18 +13,20 @@ def __init__(self, relpath: str, filename: str, mimetype: str, class MetadataValue: - def __init__(self, value: str, type: str='text') -> None: + def __init__(self, value: str, type: str = "text") -> None: self.value = value self.type = type class Utterance: - def __init__(self, - text: str, - id: str, - metadata: Dict[str, MetadataValue] = None, - line: int = 0, - annotations: Dict[str, str] = None) -> None: + def __init__( + self, + text: str, + id: str, + metadata: Optional[Dict[str, MetadataValue]] = None, + line: int = 0, + annotations: Optional[Dict[str, str]] = None, + ) -> None: self.text = text self.id = id self.metadata = metadata or {} @@ -32,19 +35,21 @@ def __init__(self, class Document: - def __init__(self, - collected_file: CollectedFile, - utterances: Iterable[Utterance], - metadata: Dict[str, MetadataValue] = None, - subpath: str = '', - annotations: Dict[str, str] = None) -> None: + def __init__( + self, + collected_file: CollectedFile, + utterances: Iterable[Utterance], + metadata: Optional[Dict[str, MetadataValue]] = None, + subpath: str = "", + annotations: Optional[Dict[str, str]] = None, + ) -> None: """ A document found in a file. subpath: if a file has an internal structure, this contains a string representation of that relative to the file. E.g. if a tei.xml contains a document A at the - root and a document B + root and a document B """ self.collected_file = collected_file diff --git a/corpus2alpino/targets/console.py b/corpus2alpino/targets/console.py index 11455ae..f41b6ac 100644 --- a/corpus2alpino/targets/console.py +++ b/corpus2alpino/targets/console.py @@ -1,5 +1,4 @@ -from os import path -from pathlib import Path +from typing import Optional from corpus2alpino.abstracts import Target from corpus2alpino.models import Document @@ -10,15 +9,17 @@ class ConsoleTarget(Target): Output chunks to the console on separate lines. """ - def write(self, - document: Document, - content: str, - filename: str = None, - suffix: str = None): + def write( + self, + document: Document, + content: str, + filename: Optional[str] = None, + suffix: Optional[str] = None, + ): """ Write all lines to stdout. """ - print(content, end='') + print(content, end="") def flush(self): return diff --git a/corpus2alpino/targets/filesystem.py b/corpus2alpino/targets/filesystem.py index 2009502..7425460 100644 --- a/corpus2alpino/targets/filesystem.py +++ b/corpus2alpino/targets/filesystem.py @@ -4,7 +4,7 @@ from os import path, makedirs from pathlib import Path -from typing import cast, Any +from typing import Optional, cast class FilesystemTarget(Target): @@ -14,14 +14,16 @@ class FilesystemTarget(Target): __current_output_path = None - def __open_file(self, document: Document, filename: str = None, suffix: str = None): + def __open_file(self, document: Document, filename: Optional[str] = None, suffix: Optional[str] = None): if self.merge_files: # when merge_files = True, a file is already open - return - - output_path = path.join(self.output_path, - document.collected_file.relpath, - document.collected_file.filename) + return + + output_path = path.join( + self.output_path, + document.collected_file.relpath, + document.collected_file.filename, + ) if document.subpath: output_path = path.join(output_path, document.subpath) @@ -29,8 +31,7 @@ def __open_file(self, document: Document, filename: str = None, suffix: str = No if filename != None: output_path = path.join(output_path, cast(str, filename)) if suffix != None: - output_path = str( - Path(output_path).with_suffix(cast(str, suffix))) + output_path = str(Path(output_path).with_suffix(cast(str, suffix))) # always open a new file when splitting in separate files self.__current_output_path = None @@ -53,7 +54,7 @@ def __open_unique(self, directory: str, filename: str): target = Path(path.join(directory, prefix + filename)) if not target.is_file(): # new file! - return target.open('w', encoding='utf-8') + return target.open("w", encoding="utf-8") attempts += 1 def __init__(self, output_path: str, merge_files=False) -> None: @@ -63,15 +64,17 @@ def __init__(self, output_path: str, merge_files=False) -> None: if self.merge_files: # using a single file makedirs(path.dirname(output_path), exist_ok=True) - self.file = open(output_path, 'w', encoding='utf-8') + self.file = open(output_path, "w", encoding="utf-8") else: self.file = None # type: ignore - def write(self, - document: Document, - content: str, - filename: str = None, - suffix: str = None): + def write( + self, + document: Document, + content: str, + filename: Optional[str] = None, + suffix: Optional[str] = None, + ): self.__open_file(document, filename, suffix) if self.file: self.file.write(content) diff --git a/corpus2alpino/targets/memory.py b/corpus2alpino/targets/memory.py index 8c76db5..054f213 100644 --- a/corpus2alpino/targets/memory.py +++ b/corpus2alpino/targets/memory.py @@ -1,5 +1,4 @@ -from os import path -from pathlib import Path +from typing import Optional from corpus2alpino.abstracts import Target from corpus2alpino.models import Document @@ -9,13 +8,16 @@ class MemoryTarget(Target): """ Combine output in memory. """ - buffer = '' - def write(self, - document: Document, - content: str, - filename: str = None, - suffix: str = None): + buffer = "" + + def write( + self, + document: Document, + content: str, + filename: Optional[str] = None, + suffix: Optional[str] = None, + ): """ Write all lines to stdout. """ @@ -25,7 +27,7 @@ def flush(self): try: return self.buffer finally: - self.buffer = '' + self.buffer = "" def close(self): return From b41d45e8a266d3250a4acf6641f72c609671153f Mon Sep 17 00:00:00 2001 From: Sheean Spoel Date: Wed, 3 Apr 2024 15:50:55 +0200 Subject: [PATCH 4/8] Create python-publish.yml --- .github/workflows/python-publish.yml | 39 ++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 .github/workflows/python-publish.yml diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 0000000..fca78d7 --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,39 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.8' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} From eec5ae275254a52d8817a0f01e08fe4fc218b35b Mon Sep 17 00:00:00 2001 From: Sheean Spoel Date: Fri, 5 Aug 2022 14:10:49 +0200 Subject: [PATCH 5/8] More error logging --- corpus2alpino/annotators/alpino.py | 20 +++++++++++++++----- corpus2alpino/writers/lassy.py | 5 ++--- setup.py | 2 +- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/corpus2alpino/annotators/alpino.py b/corpus2alpino/annotators/alpino.py index 97fc5e2..8b9137d 100644 --- a/corpus2alpino/annotators/alpino.py +++ b/corpus2alpino/annotators/alpino.py @@ -3,6 +3,9 @@ Wrapper for the Alpino parser. """ +from .alpino_client import AlpinoProcessClient, AlpinoServerClient +from corpus2alpino.models import Document, MetadataValue +from corpus2alpino.abstracts import Annotator import re import os import logging @@ -11,10 +14,6 @@ ANNOTATION_KEY = 'alpino' -from corpus2alpino.abstracts import Annotator -from corpus2alpino.models import Document, MetadataValue - -from .alpino_client import AlpinoProcessClient, AlpinoServerClient timealign_symbol = re.compile(r'\u0015') @@ -52,4 +51,15 @@ def annotate(self, document: Document): self.client.version_date.isoformat(), 'date') except Exception as exception: logging.getLogger().error( - Exception("Problem parsing: {0}|{1}\n{2}".format(utterance.id, utterance.text, exception))) + Exception("Problem parsing: {0}:{1}|{2}\n{3}".format(self.__document_path(document), utterance.id, utterance.text, exception))) + + def __document_path(self, document: Document): + value = document.collected_file.filename + + if document.collected_file.relpath: + value = document.collected_file.relpath + '/' + value + + if document.subpath: + value += '//' + document.subpath + + return value diff --git a/corpus2alpino/writers/lassy.py b/corpus2alpino/writers/lassy.py index 20d3570..a72248e 100644 --- a/corpus2alpino/writers/lassy.py +++ b/corpus2alpino/writers/lassy.py @@ -41,11 +41,10 @@ def write_utterance(self, document: Document, target: Target, utterance: Utteran return target.write(document, self.render_annotation( - document, utterance, not filename), filename) + document, utterance, annotation, not filename), filename) - def render_annotation(self, document: Document, utterance: Utterance, remove_header=False) -> str: + def render_annotation(self, document: Document, utterance: Utterance, annotation: str, remove_header=False) -> str: metadata = {**document.metadata, **utterance.metadata} - annotation = utterance.annotations[ANNOTATION_KEY] if not metadata and remove_header == False: return annotation diff --git a/setup.py b/setup.py index 9a92174..eb1258e 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ ], install_requires=['argparse', 'chamd>=0.5.8', 'folia', 'spacy', 'tei-reader', 'tqdm'], - python_requires='>=3.6', + python_requires='>=3.7', zip_safe=True, entry_points={ 'console_scripts': [ From a441b7b3a653f0e21d2578158f59e860a94e6e65 Mon Sep 17 00:00:00 2001 From: Sheean Spoel Date: Wed, 3 Apr 2024 10:40:17 +0200 Subject: [PATCH 6/8] FoLiA edge cases --- .github/workflows/test.yml | 4 +- corpus2alpino/__main__.py | 4 +- corpus2alpino/readers/folia.py | 68 +++++++++++------- requirements.txt | 121 +++++++++++++++++---------------- 4 files changed, 110 insertions(+), 87 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 357446d..baa0261 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,10 +5,10 @@ on: [push] jobs: build: - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.7', '3.10'] + python-version: ['3.8', '3.10'] steps: - uses: actions/checkout@v3 diff --git a/corpus2alpino/__main__.py b/corpus2alpino/__main__.py index 185a9e6..e9e8f01 100644 --- a/corpus2alpino/__main__.py +++ b/corpus2alpino/__main__.py @@ -51,7 +51,7 @@ def main(args=None): parser.add_argument( '-p', '--progress', action='store_true', - help='Show progress bar, automatically turned on file output') + help='Show progress bar, automatically turned on for file output') parser.add_argument('-t', '--split_treebanks', action='store_true', help='Split treebanks to separate files') @@ -88,7 +88,7 @@ def main(args=None): converter.target = FilesystemTarget( options.output_path, not options.split_treebanks) - show_progress = options.progress if options.progress != None else options.output_path != None + show_progress = options.output_path != None or options.progress if show_progress: with tqdm(converter.convert(), total=len(options.file_names), unit='file') as progress: diff --git a/corpus2alpino/readers/folia.py b/corpus2alpino/readers/folia.py index 2f2b76d..168d101 100644 --- a/corpus2alpino/readers/folia.py +++ b/corpus2alpino/readers/folia.py @@ -6,14 +6,12 @@ from typing import Iterable from corpus2alpino.abstracts import Reader -from corpus2alpino.models import (CollectedFile, Document, MetadataValue, - Utterance) +from corpus2alpino.models import CollectedFile, Document, MetadataValue, Utterance from corpus2alpino.readers.tokenizer import Tokenizer import folia.main as folia -from .alpino_brackets import (escape_id, escape_word, format_add_lex, - format_folia) +from .alpino_brackets import escape_id, escape_word, format_add_lex, format_folia class FoliaReader(Reader): @@ -26,41 +24,54 @@ def __init__(self, custom_tokenizer=None) -> None: def read(self, collected_file: CollectedFile) -> Iterable[Document]: try: - doc = folia.Document(string=collected_file.content, - autodeclare=True, - loadsetdefinitions=False) + doc = folia.Document( + string=collected_file.content, + autodeclare=True, + loadsetdefinitions=False, + ) self.tokenize(doc) doc_metadata = self.get_metadata_dict(doc.metadata.items()) - yield Document(collected_file, - list(self.get_utterances(doc, doc_metadata)), - doc_metadata) + yield Document( + collected_file, + list(self.get_utterances(doc, doc_metadata)), + doc_metadata, + ) except Exception as e: - raise Exception(collected_file.relpath + "/" + - collected_file.filename) from e + raise Exception( + collected_file.relpath + "/" + collected_file.filename + ) from e def tokenize(self, element): """ Tokenizes all the text which isn't tokenized yet. """ + if len(element) == 0: + # no sub elements + if isinstance(element, folia.Text): + self.tokenize_element(element.text(), element) + return for item in element: if isinstance(item, folia.AbstractElement): if isinstance(item, folia.Paragraph): - for sentence in item.sentences(): + for _ in item.sentences(): break else: self.tokenize_paragraph(item) else: self.tokenize(item) - def tokenize_paragraph(self, paragraph): - text = '' - for textContent in paragraph.select(folia.TextContent): - text += textContent.text() + def tokenize_paragraph(self, paragraph: folia.Paragraph): + text = "" + for text_content in paragraph.select(folia.TextContent): + text += text_content.text() + self.tokenize_element(text, paragraph) + + def tokenize_element(self, text: str, element: folia.AbstractElement): sentences = self.tokenizer.process(text) for line in sentences: - sentence = paragraph.add(folia.Sentence) + sentence = element.add(folia.Sentence) for word in line.tokens(): if word: sentence.add(folia.Word, word) @@ -88,7 +99,9 @@ def get_utterances(self, doc, doc_metadata): if word_sentence != sentence or word_paragraph != paragraph: if words: if sentence or paragraph: - yield self.create_utterance(paragraph, sentence, words, doc_metadata) + yield self.create_utterance( + paragraph, sentence, words, doc_metadata + ) words = [] sentence = word_sentence paragraph = word_paragraph @@ -104,7 +117,7 @@ def create_utterance(self, paragraph, sentence, words, doc_metadata): """ word_strings = map(lambda word: self.get_word_string(word), words) - line = " ".join(filter(lambda word: word != '', word_strings)) + line = " ".join(filter(lambda word: word != "", word_strings)) if sentence: container = sentence @@ -113,8 +126,8 @@ def create_utterance(self, paragraph, sentence, words, doc_metadata): sentence_id = escape_id(container.id) sentence_metadata = self.get_metadata_dict( - container.getmetadata().items(), - doc_metadata) + container.getmetadata().items(), doc_metadata + ) return Utterance(line, sentence_id, sentence_metadata, line) @@ -135,7 +148,7 @@ def get_word_string(self, word): text = item.text() break else: - return '' + return "" try: correction = word.getcorrection() @@ -159,8 +172,11 @@ def get_word_string(self, word): def get_metadata_dict(self, native_metadata, filter_by=None): metadata = {} for key, value in native_metadata: - if filter_by == None or not key in filter_by \ - or filter_by[key].value != value: + if ( + filter_by == None + or key not in filter_by + or filter_by[key].value != value + ): metadata[key] = MetadataValue(value) return metadata @@ -169,4 +185,4 @@ def test_file(self, file: CollectedFile): Determine whether this is a FoLiA XML file """ - return ' Date: Wed, 3 Apr 2024 10:51:20 +0200 Subject: [PATCH 7/8] Fixed typing --- README.md | 2 +- corpus2alpino/converter.py | 19 +++++++------- corpus2alpino/models.py | 39 ++++++++++++++++------------- corpus2alpino/targets/console.py | 17 +++++++------ corpus2alpino/targets/filesystem.py | 35 ++++++++++++++------------ corpus2alpino/targets/memory.py | 20 ++++++++------- 6 files changed, 71 insertions(+), 61 deletions(-) diff --git a/README.md b/README.md index 1927eb9..2b7a846 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [![Actions Status](https://github.com/UUDigitalHumanitiesLab/corpus2alpino/workflows/Unit%20tests/badge.svg)](https://github.com/UUDigitalHumanitiesLab/corpus2alpino/actions) -[PyPi/corpus2alpino](https://pypi.org/project/corpus2alpino/) +[![PyPi/corpus2alpino](https://img.shields.io/pypi/v/corpus2alpino)](https://pypi.org/project/corpus2alpino/) # CHAT, FoLiA, PaQu metadata, plaintext and TEI to Alpino XML or PaQu metadata format diff --git a/corpus2alpino/converter.py b/corpus2alpino/converter.py index 206b6d7..a4fc516 100644 --- a/corpus2alpino/converter.py +++ b/corpus2alpino/converter.py @@ -1,11 +1,8 @@ #!/usr/bin/env python3 -from typing import List +from typing import List, Optional -from corpus2alpino.collectors.filesystem import FilesystemCollector from corpus2alpino.readers.auto import AutoReader from corpus2alpino.targets.console import ConsoleTarget -from corpus2alpino.targets.filesystem import FilesystemTarget -from corpus2alpino.writers.lassy import LassyWriter from corpus2alpino.writers.paqu import PaQuWriter from corpus2alpino.abstracts import Annotator, Collector, Reader, Target, Writer @@ -16,12 +13,14 @@ class Converter: Class for converting files to Alpino XML (input) files. """ - def __init__(self, - collector: Collector, - annotators: List[Annotator] = None, - reader: Reader = AutoReader(), - writer: Writer = PaQuWriter(), - target: Target = ConsoleTarget()) -> None: + def __init__( + self, + collector: Collector, + annotators: Optional[List[Annotator]] = None, + reader: Reader = AutoReader(), + writer: Writer = PaQuWriter(), + target: Target = ConsoleTarget(), + ) -> None: self.collector = collector self.annotators = annotators or [] self.reader = reader diff --git a/corpus2alpino/models.py b/corpus2alpino/models.py index 0d60695..154953a 100644 --- a/corpus2alpino/models.py +++ b/corpus2alpino/models.py @@ -1,10 +1,11 @@ #!/usr/bin/env python3 -from typing import Dict, List, Iterable +from typing import Dict, Iterable, Optional class CollectedFile: - def __init__(self, relpath: str, filename: str, mimetype: str, - content: str) -> None: + def __init__( + self, relpath: str, filename: str, mimetype: str, content: str + ) -> None: self.relpath = relpath self.filename = filename self.mimetype = mimetype @@ -12,18 +13,20 @@ def __init__(self, relpath: str, filename: str, mimetype: str, class MetadataValue: - def __init__(self, value: str, type: str='text') -> None: + def __init__(self, value: str, type: str = "text") -> None: self.value = value self.type = type class Utterance: - def __init__(self, - text: str, - id: str, - metadata: Dict[str, MetadataValue] = None, - line: int = 0, - annotations: Dict[str, str] = None) -> None: + def __init__( + self, + text: str, + id: str, + metadata: Optional[Dict[str, MetadataValue]] = None, + line: int = 0, + annotations: Optional[Dict[str, str]] = None, + ) -> None: self.text = text self.id = id self.metadata = metadata or {} @@ -32,19 +35,21 @@ def __init__(self, class Document: - def __init__(self, - collected_file: CollectedFile, - utterances: Iterable[Utterance], - metadata: Dict[str, MetadataValue] = None, - subpath: str = '', - annotations: Dict[str, str] = None) -> None: + def __init__( + self, + collected_file: CollectedFile, + utterances: Iterable[Utterance], + metadata: Optional[Dict[str, MetadataValue]] = None, + subpath: str = "", + annotations: Optional[Dict[str, str]] = None, + ) -> None: """ A document found in a file. subpath: if a file has an internal structure, this contains a string representation of that relative to the file. E.g. if a tei.xml contains a document A at the - root and a document B + root and a document B """ self.collected_file = collected_file diff --git a/corpus2alpino/targets/console.py b/corpus2alpino/targets/console.py index 11455ae..f41b6ac 100644 --- a/corpus2alpino/targets/console.py +++ b/corpus2alpino/targets/console.py @@ -1,5 +1,4 @@ -from os import path -from pathlib import Path +from typing import Optional from corpus2alpino.abstracts import Target from corpus2alpino.models import Document @@ -10,15 +9,17 @@ class ConsoleTarget(Target): Output chunks to the console on separate lines. """ - def write(self, - document: Document, - content: str, - filename: str = None, - suffix: str = None): + def write( + self, + document: Document, + content: str, + filename: Optional[str] = None, + suffix: Optional[str] = None, + ): """ Write all lines to stdout. """ - print(content, end='') + print(content, end="") def flush(self): return diff --git a/corpus2alpino/targets/filesystem.py b/corpus2alpino/targets/filesystem.py index 2009502..7425460 100644 --- a/corpus2alpino/targets/filesystem.py +++ b/corpus2alpino/targets/filesystem.py @@ -4,7 +4,7 @@ from os import path, makedirs from pathlib import Path -from typing import cast, Any +from typing import Optional, cast class FilesystemTarget(Target): @@ -14,14 +14,16 @@ class FilesystemTarget(Target): __current_output_path = None - def __open_file(self, document: Document, filename: str = None, suffix: str = None): + def __open_file(self, document: Document, filename: Optional[str] = None, suffix: Optional[str] = None): if self.merge_files: # when merge_files = True, a file is already open - return - - output_path = path.join(self.output_path, - document.collected_file.relpath, - document.collected_file.filename) + return + + output_path = path.join( + self.output_path, + document.collected_file.relpath, + document.collected_file.filename, + ) if document.subpath: output_path = path.join(output_path, document.subpath) @@ -29,8 +31,7 @@ def __open_file(self, document: Document, filename: str = None, suffix: str = No if filename != None: output_path = path.join(output_path, cast(str, filename)) if suffix != None: - output_path = str( - Path(output_path).with_suffix(cast(str, suffix))) + output_path = str(Path(output_path).with_suffix(cast(str, suffix))) # always open a new file when splitting in separate files self.__current_output_path = None @@ -53,7 +54,7 @@ def __open_unique(self, directory: str, filename: str): target = Path(path.join(directory, prefix + filename)) if not target.is_file(): # new file! - return target.open('w', encoding='utf-8') + return target.open("w", encoding="utf-8") attempts += 1 def __init__(self, output_path: str, merge_files=False) -> None: @@ -63,15 +64,17 @@ def __init__(self, output_path: str, merge_files=False) -> None: if self.merge_files: # using a single file makedirs(path.dirname(output_path), exist_ok=True) - self.file = open(output_path, 'w', encoding='utf-8') + self.file = open(output_path, "w", encoding="utf-8") else: self.file = None # type: ignore - def write(self, - document: Document, - content: str, - filename: str = None, - suffix: str = None): + def write( + self, + document: Document, + content: str, + filename: Optional[str] = None, + suffix: Optional[str] = None, + ): self.__open_file(document, filename, suffix) if self.file: self.file.write(content) diff --git a/corpus2alpino/targets/memory.py b/corpus2alpino/targets/memory.py index 8c76db5..054f213 100644 --- a/corpus2alpino/targets/memory.py +++ b/corpus2alpino/targets/memory.py @@ -1,5 +1,4 @@ -from os import path -from pathlib import Path +from typing import Optional from corpus2alpino.abstracts import Target from corpus2alpino.models import Document @@ -9,13 +8,16 @@ class MemoryTarget(Target): """ Combine output in memory. """ - buffer = '' - def write(self, - document: Document, - content: str, - filename: str = None, - suffix: str = None): + buffer = "" + + def write( + self, + document: Document, + content: str, + filename: Optional[str] = None, + suffix: Optional[str] = None, + ): """ Write all lines to stdout. """ @@ -25,7 +27,7 @@ def flush(self): try: return self.buffer finally: - self.buffer = '' + self.buffer = "" def close(self): return From 70b81f62bd7c8def2d41e99e402003d1900b2f00 Mon Sep 17 00:00:00 2001 From: Sheean Spoel Date: Wed, 3 Apr 2024 15:56:42 +0200 Subject: [PATCH 8/8] 0.3.11 --- README.md | 5 +++-- setup.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2b7a846..80ae3b7 100644 --- a/README.md +++ b/README.md @@ -71,14 +71,15 @@ See: https://packaging.python.org/tutorials/packaging-projects/#generating-distr Make sure `setuptools` and `wheel` are installed. Then from the virtualenv: ```bash -python setup.py sdist bdist_wheel +pip install build +python -m build twine upload dist/* ``` ## Requirements * [Alpino parser](http://www.let.rug.nl/vannoord/alp/Alpino) running as a server: `Alpino batch_command=alpino_server -notk server_port=7001` -* Python 3.7 or higher +* Python 3.8 or higher * [libfolia-dev](https://packages.ubuntu.com/bionic/libfolia-dev) * [libxml2-dev](https://packages.ubuntu.com/bionic/libxml2-dev) diff --git a/setup.py b/setup.py index eb1258e..4b2d94a 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ long_description = f.read() setuptools.setup(name='corpus2alpino', - version='0.3.10', + version='0.3.11', description='Converts FoLiA and TEI files to Alpino XML files', long_description=long_description, long_description_content_type='text/markdown', @@ -23,7 +23,7 @@ ], install_requires=['argparse', 'chamd>=0.5.8', 'folia', 'spacy', 'tei-reader', 'tqdm'], - python_requires='>=3.7', + python_requires='>=3.8', zip_safe=True, entry_points={ 'console_scripts': [