Merge branch 'master' into master

UUDigitalHumanitieslab · Apr 3, 2024 · 085c4a1 · 085c4a1
2 parents 0553d0c + 46e6592
commit 085c4a1
Show file tree

Hide file tree

Showing 14 changed files with 241 additions and 159 deletions.
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -0,0 +1,39 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [published]
+
+permissions:
+  contents: read
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.8'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build
+    - name: Build package
+      run: python -m build
+    - name: Publish package
+      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
+      with:
+        user: __token__
+        password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -5,10 +5,10 @@ on: [push]
 jobs:
   build:
 
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.7', '3.10']
+        python-version: ['3.8', '3.10']
 
     steps:
     - uses: actions/checkout@v3

diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 [![Actions Status](https://github.com/UUDigitalHumanitiesLab/corpus2alpino/workflows/Unit%20tests/badge.svg)](https://github.com/UUDigitalHumanitiesLab/corpus2alpino/actions)
 
-[PyPi/corpus2alpino](https://pypi.org/project/corpus2alpino/)
+[![PyPi/corpus2alpino](https://img.shields.io/pypi/v/corpus2alpino)](https://pypi.org/project/corpus2alpino/)
 
 # CHAT, FoLiA, PaQu metadata, plaintext and TEI to Alpino XML or PaQu metadata format
 
@@ -71,14 +71,15 @@ See: https://packaging.python.org/tutorials/packaging-projects/#generating-distr
 Make sure `setuptools` and `wheel` are installed. Then from the virtualenv:
 
 ```bash
-python setup.py sdist bdist_wheel
+pip install build
+python -m build
 twine upload dist/*
 ```
 
 ## Requirements
 
 * [Alpino parser](http://www.let.rug.nl/vannoord/alp/Alpino) running as a server: `Alpino batch_command=alpino_server -notk server_port=7001`
-* Python 3.7 or higher
+* Python 3.8 or higher
 * [libfolia-dev](https://packages.ubuntu.com/bionic/libfolia-dev)
 * [libxml2-dev](https://packages.ubuntu.com/bionic/libxml2-dev)
 

diff --git a/corpus2alpino/__main__.py b/corpus2alpino/__main__.py
@@ -51,7 +51,7 @@ def main(args=None):
         parser.add_argument(
             '-p', '--progress',
             action='store_true',
-            help='Show progress bar, automatically turned on file output')
+            help='Show progress bar, automatically turned on for file output')
         parser.add_argument('-t', '--split_treebanks',
                             action='store_true',
                             help='Split treebanks to separate files')
@@ -88,7 +88,7 @@ def main(args=None):
             converter.target = FilesystemTarget(
                 options.output_path, not options.split_treebanks)
 
-        show_progress = options.progress if options.progress != None else options.output_path != None
+        show_progress = options.output_path != None or options.progress
 
         if show_progress:
             with tqdm(converter.convert(), total=len(options.file_names), unit='file') as progress:

diff --git a/corpus2alpino/annotators/alpino.py b/corpus2alpino/annotators/alpino.py
@@ -3,6 +3,9 @@
 Wrapper for the Alpino parser.
 """
 
+from .alpino_client import AlpinoProcessClient, AlpinoServerClient
+from corpus2alpino.models import Document, MetadataValue
+from corpus2alpino.abstracts import Annotator
 import re
 import os
 import logging
@@ -11,10 +14,6 @@
 
 ANNOTATION_KEY = 'alpino'
 
-from corpus2alpino.abstracts import Annotator
-from corpus2alpino.models import Document, MetadataValue
-
-from .alpino_client import AlpinoProcessClient, AlpinoServerClient
 
 timealign_symbol = re.compile(r'\u0015')
 
@@ -52,4 +51,15 @@ def annotate(self, document: Document):
                             self.client.version_date.isoformat(), 'date')
             except Exception as exception:
                 logging.getLogger().error(
-                    Exception("Problem parsing: {0}|{1}\n{2}".format(utterance.id, utterance.text, exception)))
+                    Exception("Problem parsing: {0}:{1}|{2}\n{3}".format(self.__document_path(document), utterance.id, utterance.text, exception)))
+
+    def __document_path(self, document: Document):
+        value = document.collected_file.filename
+
+        if document.collected_file.relpath:
+            value = document.collected_file.relpath + '/' + value
+
+        if document.subpath:
+            value += '//' + document.subpath
+
+        return value
diff --git a/corpus2alpino/converter.py b/corpus2alpino/converter.py
@@ -1,11 +1,8 @@
 #!/usr/bin/env python3
-from typing import List
+from typing import List, Optional
 
-from corpus2alpino.collectors.filesystem import FilesystemCollector
 from corpus2alpino.readers.auto import AutoReader
 from corpus2alpino.targets.console import ConsoleTarget
-from corpus2alpino.targets.filesystem import FilesystemTarget
-from corpus2alpino.writers.lassy import LassyWriter
 from corpus2alpino.writers.paqu import PaQuWriter
 
 from corpus2alpino.abstracts import Annotator, Collector, Reader, Target, Writer
@@ -16,12 +13,14 @@ class Converter:
     Class for converting files to Alpino XML (input) files.
     """
 
-    def __init__(self,
-                 collector: Collector,
-                 annotators: List[Annotator] = None,
-                 reader: Reader = AutoReader(),
-                 writer: Writer = PaQuWriter(),
-                 target: Target = ConsoleTarget()) -> None:
+    def __init__(
+        self,
+        collector: Collector,
+        annotators: Optional[List[Annotator]] = None,
+        reader: Reader = AutoReader(),
+        writer: Writer = PaQuWriter(),
+        target: Target = ConsoleTarget(),
+    ) -> None:
         self.collector = collector
         self.annotators = annotators or []
         self.reader = reader

diff --git a/corpus2alpino/models.py b/corpus2alpino/models.py
@@ -1,29 +1,32 @@
 #!/usr/bin/env python3
-from typing import Dict, List, Iterable
+from typing import Dict, Iterable, Optional
 
 
 class CollectedFile:
-    def __init__(self, relpath: str, filename: str, mimetype: str,
-                 content: str) -> None:
+    def __init__(
+        self, relpath: str, filename: str, mimetype: str, content: str
+    ) -> None:
         self.relpath = relpath
         self.filename = filename
         self.mimetype = mimetype
         self.content = content
 
 
 class MetadataValue:
-    def __init__(self, value: str, type: str='text') -> None:
+    def __init__(self, value: str, type: str = "text") -> None:
         self.value = value
         self.type = type
 
 
 class Utterance:
-    def __init__(self,
-                 text: str,
-                 id: str,
-                 metadata: Dict[str, MetadataValue] = None,
-                 line: int = 0,
-                 annotations: Dict[str, str] = None) -> None:
+    def __init__(
+        self,
+        text: str,
+        id: str,
+        metadata: Optional[Dict[str, MetadataValue]] = None,
+        line: int = 0,
+        annotations: Optional[Dict[str, str]] = None,
+    ) -> None:
         self.text = text
         self.id = id
         self.metadata = metadata or {}
@@ -32,19 +35,21 @@ def __init__(self,
 
 
 class Document:
-    def __init__(self,
-                 collected_file: CollectedFile,
-                 utterances: Iterable[Utterance],
-                 metadata: Dict[str, MetadataValue] = None,
-                 subpath: str = '',
-                 annotations: Dict[str, str] = None) -> None:
+    def __init__(
+        self,
+        collected_file: CollectedFile,
+        utterances: Iterable[Utterance],
+        metadata: Optional[Dict[str, MetadataValue]] = None,
+        subpath: str = "",
+        annotations: Optional[Dict[str, str]] = None,
+    ) -> None:
         """
         A document found in a file.
 
         subpath: if a file has an internal structure, this
             contains a string representation of that relative to
             the file. E.g. if a tei.xml contains a document A at the
-            root and a document B 
+            root and a document B
 
         """
         self.collected_file = collected_file

diff --git a/corpus2alpino/readers/folia.py b/corpus2alpino/readers/folia.py
@@ -6,14 +6,12 @@
 from typing import Iterable
 
 from corpus2alpino.abstracts import Reader
-from corpus2alpino.models import (CollectedFile, Document, MetadataValue,
-                                  Utterance)
+from corpus2alpino.models import CollectedFile, Document, MetadataValue, Utterance
 from corpus2alpino.readers.tokenizer import Tokenizer
 
 import folia.main as folia
 
-from .alpino_brackets import (escape_id, escape_word, format_add_lex,
-                              format_folia)
+from .alpino_brackets import escape_id, escape_word, format_add_lex, format_folia
 
 
 class FoliaReader(Reader):
@@ -26,41 +24,54 @@ def __init__(self, custom_tokenizer=None) -> None:
 
     def read(self, collected_file: CollectedFile) -> Iterable[Document]:
         try:
-            doc = folia.Document(string=collected_file.content,
-                                 autodeclare=True,
-                                 loadsetdefinitions=False)
+            doc = folia.Document(
+                string=collected_file.content,
+                autodeclare=True,
+                loadsetdefinitions=False,
+            )
             self.tokenize(doc)
             doc_metadata = self.get_metadata_dict(doc.metadata.items())
 
-            yield Document(collected_file,
-                           list(self.get_utterances(doc, doc_metadata)),
-                           doc_metadata)
+            yield Document(
+                collected_file,
+                list(self.get_utterances(doc, doc_metadata)),
+                doc_metadata,
+            )
         except Exception as e:
-            raise Exception(collected_file.relpath + "/" +
-                            collected_file.filename) from e
+            raise Exception(
+                collected_file.relpath + "/" + collected_file.filename
+            ) from e
 
     def tokenize(self, element):
         """
         Tokenizes all the text which isn't tokenized yet.
         """
+        if len(element) == 0:
+            # no sub elements
+            if isinstance(element, folia.Text):
+                self.tokenize_element(element.text(), element)
+            return
 
         for item in element:
             if isinstance(item, folia.AbstractElement):
                 if isinstance(item, folia.Paragraph):
-                    for sentence in item.sentences():
+                    for _ in item.sentences():
                         break
                     else:
                         self.tokenize_paragraph(item)
                 else:
                     self.tokenize(item)
 
-    def tokenize_paragraph(self, paragraph):
-        text = ''
-        for textContent in paragraph.select(folia.TextContent):
-            text += textContent.text()
+    def tokenize_paragraph(self, paragraph: folia.Paragraph):
+        text = ""
+        for text_content in paragraph.select(folia.TextContent):
+            text += text_content.text()
+        self.tokenize_element(text, paragraph)
+
+    def tokenize_element(self, text: str, element: folia.AbstractElement):
         sentences = self.tokenizer.process(text)
         for line in sentences:
-            sentence = paragraph.add(folia.Sentence)
+            sentence = element.add(folia.Sentence)
             for word in line.tokens():
                 if word:
                     sentence.add(folia.Word, word)
@@ -88,7 +99,9 @@ def get_utterances(self, doc, doc_metadata):
             if word_sentence != sentence or word_paragraph != paragraph:
                 if words:
                     if sentence or paragraph:
-                        yield self.create_utterance(paragraph, sentence, words, doc_metadata)
+                        yield self.create_utterance(
+                            paragraph, sentence, words, doc_metadata
+                        )
                     words = []
                 sentence = word_sentence
                 paragraph = word_paragraph
@@ -104,7 +117,7 @@ def create_utterance(self, paragraph, sentence, words, doc_metadata):
         """
 
         word_strings = map(lambda word: self.get_word_string(word), words)
-        line = " ".join(filter(lambda word: word != '', word_strings))
+        line = " ".join(filter(lambda word: word != "", word_strings))
 
         if sentence:
             container = sentence
@@ -113,8 +126,8 @@ def create_utterance(self, paragraph, sentence, words, doc_metadata):
 
         sentence_id = escape_id(container.id)
         sentence_metadata = self.get_metadata_dict(
-            container.getmetadata().items(),
-            doc_metadata)
+            container.getmetadata().items(), doc_metadata
+        )
 
         return Utterance(line, sentence_id, sentence_metadata, line)
 
@@ -135,7 +148,7 @@ def get_word_string(self, word):
                     text = item.text()
                     break
             else:
-                return ''
+                return ""
 
         try:
             correction = word.getcorrection()
@@ -159,8 +172,11 @@ def get_word_string(self, word):
     def get_metadata_dict(self, native_metadata, filter_by=None):
         metadata = {}
         for key, value in native_metadata:
-            if filter_by == None or not key in filter_by \
-                    or filter_by[key].value != value:
+            if (
+                filter_by == None
+                or key not in filter_by
+                or filter_by[key].value != value
+            ):
                 metadata[key] = MetadataValue(value)
         return metadata
 
@@ -169,4 +185,4 @@ def test_file(self, file: CollectedFile):
         Determine whether this is a FoLiA XML file
         """
 
-        return '<FoLiA' in file.content[0:400]
+        return "<FoLiA" in file.content[0:400]