From 06fec52fb5592f978ad70998c64da54ac9498d85 Mon Sep 17 00:00:00 2001
From: Sheean Spoel <s.j.j.spoel@uu.nl>
Date: Fri, 5 Aug 2022 14:10:49 +0200
Subject: [PATCH 1/8] More error logging

---
 corpus2alpino/annotators/alpino.py | 20 +++++++++++++++-----
 corpus2alpino/writers/lassy.py     |  5 ++---
 setup.py                           |  2 +-
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/corpus2alpino/annotators/alpino.py b/corpus2alpino/annotators/alpino.py
index 97fc5e2..8b9137d 100644
--- a/corpus2alpino/annotators/alpino.py
+++ b/corpus2alpino/annotators/alpino.py
@@ -3,6 +3,9 @@
 Wrapper for the Alpino parser.
 """
 
+from .alpino_client import AlpinoProcessClient, AlpinoServerClient
+from corpus2alpino.models import Document, MetadataValue
+from corpus2alpino.abstracts import Annotator
 import re
 import os
 import logging
@@ -11,10 +14,6 @@
 
 ANNOTATION_KEY = 'alpino'
 
-from corpus2alpino.abstracts import Annotator
-from corpus2alpino.models import Document, MetadataValue
-
-from .alpino_client import AlpinoProcessClient, AlpinoServerClient
 
 timealign_symbol = re.compile(r'\u0015')
 
@@ -52,4 +51,15 @@ def annotate(self, document: Document):
                             self.client.version_date.isoformat(), 'date')
             except Exception as exception:
                 logging.getLogger().error(
-                    Exception("Problem parsing: {0}|{1}\n{2}".format(utterance.id, utterance.text, exception)))
+                    Exception("Problem parsing: {0}:{1}|{2}\n{3}".format(self.__document_path(document), utterance.id, utterance.text, exception)))
+
+    def __document_path(self, document: Document):
+        value = document.collected_file.filename
+
+        if document.collected_file.relpath:
+            value = document.collected_file.relpath + '/' + value
+
+        if document.subpath:
+            value += '//' + document.subpath
+
+        return value
diff --git a/corpus2alpino/writers/lassy.py b/corpus2alpino/writers/lassy.py
index 20d3570..a72248e 100644
--- a/corpus2alpino/writers/lassy.py
+++ b/corpus2alpino/writers/lassy.py
@@ -41,11 +41,10 @@ def write_utterance(self, document: Document, target: Target, utterance: Utteran
             return
 
         target.write(document, self.render_annotation(
-            document, utterance, not filename), filename)
+            document, utterance, annotation, not filename), filename)
 
-    def render_annotation(self, document: Document, utterance: Utterance, remove_header=False) -> str:
+    def render_annotation(self, document: Document, utterance: Utterance, annotation: str, remove_header=False) -> str:
         metadata = {**document.metadata, **utterance.metadata}
-        annotation = utterance.annotations[ANNOTATION_KEY]
 
         if not metadata and remove_header == False:
             return annotation
diff --git a/setup.py b/setup.py
index 9a92174..eb1258e 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@
       ],
       install_requires=['argparse', 'chamd>=0.5.8', 'folia',
                         'spacy', 'tei-reader', 'tqdm'],
-      python_requires='>=3.6',
+      python_requires='>=3.7',
       zip_safe=True,
       entry_points={
           'console_scripts': [

From 8d9db05480449f513205d198db1f421b8684a414 Mon Sep 17 00:00:00 2001
From: Sheean Spoel <s.j.j.spoel@uu.nl>
Date: Wed, 3 Apr 2024 10:40:17 +0200
Subject: [PATCH 2/8] FoLiA edge cases

---
 .github/workflows/test.yml     |   4 +-
 corpus2alpino/__main__.py      |   4 +-
 corpus2alpino/readers/folia.py |  68 +++++++++++-------
 requirements.txt               | 121 +++++++++++++++++----------------
 4 files changed, 110 insertions(+), 87 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 357446d..baa0261 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -5,10 +5,10 @@ on: [push]
 jobs:
   build:
 
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.7', '3.10']
+        python-version: ['3.8', '3.10']
 
     steps:
     - uses: actions/checkout@v3
diff --git a/corpus2alpino/__main__.py b/corpus2alpino/__main__.py
index 185a9e6..e9e8f01 100644
--- a/corpus2alpino/__main__.py
+++ b/corpus2alpino/__main__.py
@@ -51,7 +51,7 @@ def main(args=None):
         parser.add_argument(
             '-p', '--progress',
             action='store_true',
-            help='Show progress bar, automatically turned on file output')
+            help='Show progress bar, automatically turned on for file output')
         parser.add_argument('-t', '--split_treebanks',
                             action='store_true',
                             help='Split treebanks to separate files')
@@ -88,7 +88,7 @@ def main(args=None):
             converter.target = FilesystemTarget(
                 options.output_path, not options.split_treebanks)
 
-        show_progress = options.progress if options.progress != None else options.output_path != None
+        show_progress = options.output_path != None or options.progress
 
         if show_progress:
             with tqdm(converter.convert(), total=len(options.file_names), unit='file') as progress:
diff --git a/corpus2alpino/readers/folia.py b/corpus2alpino/readers/folia.py
index 2f2b76d..168d101 100644
--- a/corpus2alpino/readers/folia.py
+++ b/corpus2alpino/readers/folia.py
@@ -6,14 +6,12 @@
 from typing import Iterable
 
 from corpus2alpino.abstracts import Reader
-from corpus2alpino.models import (CollectedFile, Document, MetadataValue,
-                                  Utterance)
+from corpus2alpino.models import CollectedFile, Document, MetadataValue, Utterance
 from corpus2alpino.readers.tokenizer import Tokenizer
 
 import folia.main as folia
 
-from .alpino_brackets import (escape_id, escape_word, format_add_lex,
-                              format_folia)
+from .alpino_brackets import escape_id, escape_word, format_add_lex, format_folia
 
 
 class FoliaReader(Reader):
@@ -26,41 +24,54 @@ def __init__(self, custom_tokenizer=None) -> None:
 
     def read(self, collected_file: CollectedFile) -> Iterable[Document]:
         try:
-            doc = folia.Document(string=collected_file.content,
-                                 autodeclare=True,
-                                 loadsetdefinitions=False)
+            doc = folia.Document(
+                string=collected_file.content,
+                autodeclare=True,
+                loadsetdefinitions=False,
+            )
             self.tokenize(doc)
             doc_metadata = self.get_metadata_dict(doc.metadata.items())
 
-            yield Document(collected_file,
-                           list(self.get_utterances(doc, doc_metadata)),
-                           doc_metadata)
+            yield Document(
+                collected_file,
+                list(self.get_utterances(doc, doc_metadata)),
+                doc_metadata,
+            )
         except Exception as e:
-            raise Exception(collected_file.relpath + "/" +
-                            collected_file.filename) from e
+            raise Exception(
+                collected_file.relpath + "/" + collected_file.filename
+            ) from e
 
     def tokenize(self, element):
         """
         Tokenizes all the text which isn't tokenized yet.
         """
+        if len(element) == 0:
+            # no sub elements
+            if isinstance(element, folia.Text):
+                self.tokenize_element(element.text(), element)
+            return
 
         for item in element:
             if isinstance(item, folia.AbstractElement):
                 if isinstance(item, folia.Paragraph):
-                    for sentence in item.sentences():
+                    for _ in item.sentences():
                         break
                     else:
                         self.tokenize_paragraph(item)
                 else:
                     self.tokenize(item)
 
-    def tokenize_paragraph(self, paragraph):
-        text = ''
-        for textContent in paragraph.select(folia.TextContent):
-            text += textContent.text()
+    def tokenize_paragraph(self, paragraph: folia.Paragraph):
+        text = ""
+        for text_content in paragraph.select(folia.TextContent):
+            text += text_content.text()
+        self.tokenize_element(text, paragraph)
+
+    def tokenize_element(self, text: str, element: folia.AbstractElement):
         sentences = self.tokenizer.process(text)
         for line in sentences:
-            sentence = paragraph.add(folia.Sentence)
+            sentence = element.add(folia.Sentence)
             for word in line.tokens():
                 if word:
                     sentence.add(folia.Word, word)
@@ -88,7 +99,9 @@ def get_utterances(self, doc, doc_metadata):
             if word_sentence != sentence or word_paragraph != paragraph:
                 if words:
                     if sentence or paragraph:
-                        yield self.create_utterance(paragraph, sentence, words, doc_metadata)
+                        yield self.create_utterance(
+                            paragraph, sentence, words, doc_metadata
+                        )
                     words = []
                 sentence = word_sentence
                 paragraph = word_paragraph
@@ -104,7 +117,7 @@ def create_utterance(self, paragraph, sentence, words, doc_metadata):
         """
 
         word_strings = map(lambda word: self.get_word_string(word), words)
-        line = " ".join(filter(lambda word: word != '', word_strings))
+        line = " ".join(filter(lambda word: word != "", word_strings))
 
         if sentence:
             container = sentence
@@ -113,8 +126,8 @@ def create_utterance(self, paragraph, sentence, words, doc_metadata):
 
         sentence_id = escape_id(container.id)
         sentence_metadata = self.get_metadata_dict(
-            container.getmetadata().items(),
-            doc_metadata)
+            container.getmetadata().items(), doc_metadata
+        )
 
         return Utterance(line, sentence_id, sentence_metadata, line)
 
@@ -135,7 +148,7 @@ def get_word_string(self, word):
                     text = item.text()
                     break
             else:
-                return ''
+                return ""
 
         try:
             correction = word.getcorrection()
@@ -159,8 +172,11 @@ def get_word_string(self, word):
     def get_metadata_dict(self, native_metadata, filter_by=None):
         metadata = {}
         for key, value in native_metadata:
-            if filter_by == None or not key in filter_by \
-                    or filter_by[key].value != value:
+            if (
+                filter_by == None
+                or key not in filter_by
+                or filter_by[key].value != value
+            ):
                 metadata[key] = MetadataValue(value)
         return metadata
 
@@ -169,4 +185,4 @@ def test_file(self, file: CollectedFile):
         Determine whether this is a FoLiA XML file
         """
 
-        return '<FoLiA' in file.content[0:400]
+        return "<FoLiA" in file.content[0:400]
diff --git a/requirements.txt b/requirements.txt
index 9d80d95..7d439e7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,133 +1,140 @@
 #
-# This file is autogenerated by pip-compile with python 3.7
+# This file is autogenerated by pip-compile with python 3.10
 # To update, run:
 #
 #    pip-compile
 #
+annotated-types==0.6.0
+    # via pydantic
 argparse==1.4.0
     # via corpus2alpino (setup.py)
-beautifulsoup4==4.10.0
+beautifulsoup4==4.12.3
     # via tei-reader
-blis==0.7.5
-    # via
-    #   spacy
-    #   thinc
-catalogue==2.0.6
+blis==0.7.11
+    # via thinc
+catalogue==2.0.10
     # via
     #   spacy
     #   srsly
     #   thinc
-certifi==2021.10.8
+certifi==2024.2.2
     # via requests
-chamd==0.5.8
+chamd==0.5.12
     # via corpus2alpino (setup.py)
-charset-normalizer==2.0.10
+charset-normalizer==3.3.2
     # via requests
-click==8.0.3
+click==8.1.7
     # via typer
-cymem==2.0.6
+cloudpathlib==0.16.0
+    # via weasel
+confection==0.1.4
+    # via
+    #   thinc
+    #   weasel
+cymem==2.0.8
     # via
     #   preshed
     #   spacy
     #   thinc
-folia==2.5.7
+folia==2.5.11
     # via corpus2alpino (setup.py)
-idna==3.3
+idna==3.6
     # via requests
-importlib-metadata==4.12.0
-    # via
-    #   click
-    #   rdflib
 isodate==0.6.1
     # via rdflib
-jinja2==3.0.3
+jinja2==3.1.3
     # via spacy
 langcodes==3.3.0
     # via spacy
-lxml==4.7.1
+lxml==5.1.0
     # via
     #   folia
     #   tei-reader
-markupsafe==2.0.1
+markupsafe==2.1.5
     # via jinja2
-murmurhash==1.0.6
+murmurhash==1.0.10
     # via
     #   preshed
     #   spacy
     #   thinc
-numpy==1.21.5
+numpy==1.24.4
     # via
     #   blis
     #   spacy
     #   thinc
-packaging==21.3
-    # via spacy
-pathy==0.6.1
-    # via spacy
-preshed==3.0.6
+packaging==24.0
     # via
     #   spacy
     #   thinc
-pydantic==1.8.2
+    #   weasel
+preshed==3.0.9
     # via
     #   spacy
     #   thinc
-pyparsing==3.0.7
+pydantic==2.6.4
     # via
-    #   packaging
-    #   rdflib
-rdflib==6.1.1
+    #   confection
+    #   spacy
+    #   thinc
+    #   weasel
+pydantic-core==2.16.3
+    # via pydantic
+pyparsing==3.1.2
+    # via rdflib
+rdflib==7.0.0
     # via folia
-requests==2.27.1
+requests==2.31.0
     # via
     #   folia
     #   spacy
+    #   weasel
 six==1.16.0
     # via isodate
-smart-open==5.2.1
-    # via pathy
-soupsieve==2.3.1
+smart-open==6.4.0
+    # via
+    #   spacy
+    #   weasel
+soupsieve==2.5
     # via beautifulsoup4
-spacy==3.2.1
+spacy==3.7.4
     # via corpus2alpino (setup.py)
-spacy-legacy==3.0.8
+spacy-legacy==3.0.12
     # via spacy
-spacy-loggers==1.0.1
+spacy-loggers==1.0.5
     # via spacy
-srsly==2.4.2
+srsly==2.4.8
     # via
+    #   confection
     #   spacy
     #   thinc
+    #   weasel
 tei-reader==0.0.17
     # via corpus2alpino (setup.py)
-thinc==8.0.13
+thinc==8.2.3
     # via spacy
-tqdm==4.62.3
+tqdm==4.66.2
     # via
     #   corpus2alpino (setup.py)
     #   spacy
-typer==0.4.0
+typer==0.9.0
     # via
-    #   pathy
     #   spacy
-typing-extensions==3.10.0.2
+    #   weasel
+typing-extensions==4.10.0
     # via
-    #   catalogue
-    #   importlib-metadata
+    #   cloudpathlib
     #   pydantic
-    #   spacy
-    #   thinc
-urllib3==1.26.8
+    #   pydantic-core
+    #   typer
+urllib3==2.2.1
     # via requests
-wasabi==0.9.0
+wasabi==1.1.2
     # via
     #   spacy
-    #   spacy-loggers
     #   thinc
-zipp==3.8.0
-    # via
-    #   catalogue
-    #   importlib-metadata
+    #   weasel
+weasel==0.3.4
+    # via spacy
 
 # The following packages are considered to be unsafe in a requirements file:
 # setuptools

From e52c30107282013e55c5545eae0d33a076608874 Mon Sep 17 00:00:00 2001
From: Sheean Spoel <s.j.j.spoel@uu.nl>
Date: Wed, 3 Apr 2024 10:51:20 +0200
Subject: [PATCH 3/8] Fixed typing

---
 README.md                           |  2 +-
 corpus2alpino/converter.py          | 19 +++++++-------
 corpus2alpino/models.py             | 39 ++++++++++++++++-------------
 corpus2alpino/targets/console.py    | 17 +++++++------
 corpus2alpino/targets/filesystem.py | 35 ++++++++++++++------------
 corpus2alpino/targets/memory.py     | 20 ++++++++-------
 6 files changed, 71 insertions(+), 61 deletions(-)

diff --git a/README.md b/README.md
index 1927eb9..2b7a846 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 [![Actions Status](https://github.com/UUDigitalHumanitiesLab/corpus2alpino/workflows/Unit%20tests/badge.svg)](https://github.com/UUDigitalHumanitiesLab/corpus2alpino/actions)
 
-[PyPi/corpus2alpino](https://pypi.org/project/corpus2alpino/)
+[![PyPi/corpus2alpino](https://img.shields.io/pypi/v/corpus2alpino)](https://pypi.org/project/corpus2alpino/)
 
 # CHAT, FoLiA, PaQu metadata, plaintext and TEI to Alpino XML or PaQu metadata format
 
diff --git a/corpus2alpino/converter.py b/corpus2alpino/converter.py
index 206b6d7..a4fc516 100644
--- a/corpus2alpino/converter.py
+++ b/corpus2alpino/converter.py
@@ -1,11 +1,8 @@
 #!/usr/bin/env python3
-from typing import List
+from typing import List, Optional
 
-from corpus2alpino.collectors.filesystem import FilesystemCollector
 from corpus2alpino.readers.auto import AutoReader
 from corpus2alpino.targets.console import ConsoleTarget
-from corpus2alpino.targets.filesystem import FilesystemTarget
-from corpus2alpino.writers.lassy import LassyWriter
 from corpus2alpino.writers.paqu import PaQuWriter
 
 from corpus2alpino.abstracts import Annotator, Collector, Reader, Target, Writer
@@ -16,12 +13,14 @@ class Converter:
     Class for converting files to Alpino XML (input) files.
     """
 
-    def __init__(self,
-                 collector: Collector,
-                 annotators: List[Annotator] = None,
-                 reader: Reader = AutoReader(),
-                 writer: Writer = PaQuWriter(),
-                 target: Target = ConsoleTarget()) -> None:
+    def __init__(
+        self,
+        collector: Collector,
+        annotators: Optional[List[Annotator]] = None,
+        reader: Reader = AutoReader(),
+        writer: Writer = PaQuWriter(),
+        target: Target = ConsoleTarget(),
+    ) -> None:
         self.collector = collector
         self.annotators = annotators or []
         self.reader = reader
diff --git a/corpus2alpino/models.py b/corpus2alpino/models.py
index 0d60695..154953a 100644
--- a/corpus2alpino/models.py
+++ b/corpus2alpino/models.py
@@ -1,10 +1,11 @@
 #!/usr/bin/env python3
-from typing import Dict, List, Iterable
+from typing import Dict, Iterable, Optional
 
 
 class CollectedFile:
-    def __init__(self, relpath: str, filename: str, mimetype: str,
-                 content: str) -> None:
+    def __init__(
+        self, relpath: str, filename: str, mimetype: str, content: str
+    ) -> None:
         self.relpath = relpath
         self.filename = filename
         self.mimetype = mimetype
@@ -12,18 +13,20 @@ def __init__(self, relpath: str, filename: str, mimetype: str,
 
 
 class MetadataValue:
-    def __init__(self, value: str, type: str='text') -> None:
+    def __init__(self, value: str, type: str = "text") -> None:
         self.value = value
         self.type = type
 
 
 class Utterance:
-    def __init__(self,
-                 text: str,
-                 id: str,
-                 metadata: Dict[str, MetadataValue] = None,
-                 line: int = 0,
-                 annotations: Dict[str, str] = None) -> None:
+    def __init__(
+        self,
+        text: str,
+        id: str,
+        metadata: Optional[Dict[str, MetadataValue]] = None,
+        line: int = 0,
+        annotations: Optional[Dict[str, str]] = None,
+    ) -> None:
         self.text = text
         self.id = id
         self.metadata = metadata or {}
@@ -32,19 +35,21 @@ def __init__(self,
 
 
 class Document:
-    def __init__(self,
-                 collected_file: CollectedFile,
-                 utterances: Iterable[Utterance],
-                 metadata: Dict[str, MetadataValue] = None,
-                 subpath: str = '',
-                 annotations: Dict[str, str] = None) -> None:
+    def __init__(
+        self,
+        collected_file: CollectedFile,
+        utterances: Iterable[Utterance],
+        metadata: Optional[Dict[str, MetadataValue]] = None,
+        subpath: str = "",
+        annotations: Optional[Dict[str, str]] = None,
+    ) -> None:
         """
         A document found in a file.
 
         subpath: if a file has an internal structure, this
             contains a string representation of that relative to
             the file. E.g. if a tei.xml contains a document A at the
-            root and a document B 
+            root and a document B
 
         """
         self.collected_file = collected_file
diff --git a/corpus2alpino/targets/console.py b/corpus2alpino/targets/console.py
index 11455ae..f41b6ac 100644
--- a/corpus2alpino/targets/console.py
+++ b/corpus2alpino/targets/console.py
@@ -1,5 +1,4 @@
-from os import path
-from pathlib import Path
+from typing import Optional
 
 from corpus2alpino.abstracts import Target
 from corpus2alpino.models import Document
@@ -10,15 +9,17 @@ class ConsoleTarget(Target):
     Output chunks to the console on separate lines.
     """
 
-    def write(self,
-              document: Document,
-              content: str,
-              filename: str = None,
-              suffix: str = None):
+    def write(
+        self,
+        document: Document,
+        content: str,
+        filename: Optional[str] = None,
+        suffix: Optional[str] = None,
+    ):
         """
         Write all lines to stdout.
         """
-        print(content, end='')
+        print(content, end="")
 
     def flush(self):
         return
diff --git a/corpus2alpino/targets/filesystem.py b/corpus2alpino/targets/filesystem.py
index 2009502..7425460 100644
--- a/corpus2alpino/targets/filesystem.py
+++ b/corpus2alpino/targets/filesystem.py
@@ -4,7 +4,7 @@
 
 from os import path, makedirs
 from pathlib import Path
-from typing import cast, Any
+from typing import Optional, cast
 
 
 class FilesystemTarget(Target):
@@ -14,14 +14,16 @@ class FilesystemTarget(Target):
 
     __current_output_path = None
 
-    def __open_file(self, document: Document, filename: str = None, suffix: str = None):
+    def __open_file(self, document: Document, filename: Optional[str] = None, suffix: Optional[str] = None):
         if self.merge_files:
             # when merge_files = True, a file is already open
-            return 
-        
-        output_path = path.join(self.output_path,
-                                document.collected_file.relpath,
-                                document.collected_file.filename)
+            return
+
+        output_path = path.join(
+            self.output_path,
+            document.collected_file.relpath,
+            document.collected_file.filename,
+        )
 
         if document.subpath:
             output_path = path.join(output_path, document.subpath)
@@ -29,8 +31,7 @@ def __open_file(self, document: Document, filename: str = None, suffix: str = No
         if filename != None:
             output_path = path.join(output_path, cast(str, filename))
         if suffix != None:
-            output_path = str(
-                Path(output_path).with_suffix(cast(str, suffix)))
+            output_path = str(Path(output_path).with_suffix(cast(str, suffix)))
 
         # always open a new file when splitting in separate files
         self.__current_output_path = None
@@ -53,7 +54,7 @@ def __open_unique(self, directory: str, filename: str):
             target = Path(path.join(directory, prefix + filename))
             if not target.is_file():
                 # new file!
-                return target.open('w', encoding='utf-8')
+                return target.open("w", encoding="utf-8")
             attempts += 1
 
     def __init__(self, output_path: str, merge_files=False) -> None:
@@ -63,15 +64,17 @@ def __init__(self, output_path: str, merge_files=False) -> None:
         if self.merge_files:
             # using a single file
             makedirs(path.dirname(output_path), exist_ok=True)
-            self.file = open(output_path, 'w', encoding='utf-8')
+            self.file = open(output_path, "w", encoding="utf-8")
         else:
             self.file = None  # type: ignore
 
-    def write(self,
-              document: Document,
-              content: str,
-              filename: str = None,
-              suffix: str = None):
+    def write(
+        self,
+        document: Document,
+        content: str,
+        filename: Optional[str] = None,
+        suffix: Optional[str] = None,
+    ):
         self.__open_file(document, filename, suffix)
         if self.file:
             self.file.write(content)
diff --git a/corpus2alpino/targets/memory.py b/corpus2alpino/targets/memory.py
index 8c76db5..054f213 100644
--- a/corpus2alpino/targets/memory.py
+++ b/corpus2alpino/targets/memory.py
@@ -1,5 +1,4 @@
-from os import path
-from pathlib import Path
+from typing import Optional
 
 from corpus2alpino.abstracts import Target
 from corpus2alpino.models import Document
@@ -9,13 +8,16 @@ class MemoryTarget(Target):
     """
     Combine output in memory.
     """
-    buffer = ''
 
-    def write(self,
-              document: Document,
-              content: str,
-              filename: str = None,
-              suffix: str = None):
+    buffer = ""
+
+    def write(
+        self,
+        document: Document,
+        content: str,
+        filename: Optional[str] = None,
+        suffix: Optional[str] = None,
+    ):
         """
         Write all lines to stdout.
         """
@@ -25,7 +27,7 @@ def flush(self):
         try:
             return self.buffer
         finally:
-            self.buffer = ''
+            self.buffer = ""
 
     def close(self):
         return

From b41d45e8a266d3250a4acf6641f72c609671153f Mon Sep 17 00:00:00 2001
From: Sheean Spoel <s.j.j.spoel@uu.nl>
Date: Wed, 3 Apr 2024 15:50:55 +0200
Subject: [PATCH 4/8] Create python-publish.yml

---
 .github/workflows/python-publish.yml | 39 ++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 .github/workflows/python-publish.yml

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
new file mode 100644
index 0000000..fca78d7
--- /dev/null
+++ b/.github/workflows/python-publish.yml
@@ -0,0 +1,39 @@
+# This workflow builds the Python package and uploads it to PyPI when a release is published
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [published]
+
+permissions:
+  contents: read
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.8'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build
+    - name: Build package
+      run: python -m build
+    - name: Publish package
+      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
+      with:
+        user: __token__
+        password: ${{ secrets.PYPI_API_TOKEN }}

From eec5ae275254a52d8817a0f01e08fe4fc218b35b Mon Sep 17 00:00:00 2001
From: Sheean Spoel <s.j.j.spoel@uu.nl>
Date: Fri, 5 Aug 2022 14:10:49 +0200
Subject: [PATCH 5/8] More error logging

---
 corpus2alpino/annotators/alpino.py | 20 +++++++++++++++-----
 corpus2alpino/writers/lassy.py     |  5 ++---
 setup.py                           |  2 +-
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/corpus2alpino/annotators/alpino.py b/corpus2alpino/annotators/alpino.py
index 97fc5e2..8b9137d 100644
--- a/corpus2alpino/annotators/alpino.py
+++ b/corpus2alpino/annotators/alpino.py
@@ -3,6 +3,9 @@
 Wrapper for the Alpino parser.
 """
 
+from .alpino_client import AlpinoProcessClient, AlpinoServerClient
+from corpus2alpino.models import Document, MetadataValue
+from corpus2alpino.abstracts import Annotator
 import re
 import os
 import logging
@@ -11,10 +14,6 @@
 
 ANNOTATION_KEY = 'alpino'
 
-from corpus2alpino.abstracts import Annotator
-from corpus2alpino.models import Document, MetadataValue
-
-from .alpino_client import AlpinoProcessClient, AlpinoServerClient
 
 timealign_symbol = re.compile(r'\u0015')
 
@@ -52,4 +51,15 @@ def annotate(self, document: Document):
                             self.client.version_date.isoformat(), 'date')
             except Exception as exception:
                 logging.getLogger().error(
-                    Exception("Problem parsing: {0}|{1}\n{2}".format(utterance.id, utterance.text, exception)))
+                    Exception("Problem parsing: {0}:{1}|{2}\n{3}".format(self.__document_path(document), utterance.id, utterance.text, exception)))
+
+    def __document_path(self, document: Document):
+        value = document.collected_file.filename
+
+        if document.collected_file.relpath:
+            value = document.collected_file.relpath + '/' + value
+
+        if document.subpath:
+            value += '//' + document.subpath
+
+        return value
diff --git a/corpus2alpino/writers/lassy.py b/corpus2alpino/writers/lassy.py
index 20d3570..a72248e 100644
--- a/corpus2alpino/writers/lassy.py
+++ b/corpus2alpino/writers/lassy.py
@@ -41,11 +41,10 @@ def write_utterance(self, document: Document, target: Target, utterance: Utteran
             return
 
         target.write(document, self.render_annotation(
-            document, utterance, not filename), filename)
+            document, utterance, annotation, not filename), filename)
 
-    def render_annotation(self, document: Document, utterance: Utterance, remove_header=False) -> str:
+    def render_annotation(self, document: Document, utterance: Utterance, annotation: str, remove_header=False) -> str:
         metadata = {**document.metadata, **utterance.metadata}
-        annotation = utterance.annotations[ANNOTATION_KEY]
 
         if not metadata and remove_header == False:
             return annotation
diff --git a/setup.py b/setup.py
index 9a92174..eb1258e 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@
       ],
       install_requires=['argparse', 'chamd>=0.5.8', 'folia',
                         'spacy', 'tei-reader', 'tqdm'],
-      python_requires='>=3.6',
+      python_requires='>=3.7',
       zip_safe=True,
       entry_points={
           'console_scripts': [

From a441b7b3a653f0e21d2578158f59e860a94e6e65 Mon Sep 17 00:00:00 2001
From: Sheean Spoel <s.j.j.spoel@uu.nl>
Date: Wed, 3 Apr 2024 10:40:17 +0200
Subject: [PATCH 6/8] FoLiA edge cases

---
 .github/workflows/test.yml     |   4 +-
 corpus2alpino/__main__.py      |   4 +-
 corpus2alpino/readers/folia.py |  68 +++++++++++-------
 requirements.txt               | 121 +++++++++++++++++----------------
 4 files changed, 110 insertions(+), 87 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 357446d..baa0261 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -5,10 +5,10 @@ on: [push]
 jobs:
   build:
 
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.7', '3.10']
+        python-version: ['3.8', '3.10']
 
     steps:
     - uses: actions/checkout@v3
diff --git a/corpus2alpino/__main__.py b/corpus2alpino/__main__.py
index 185a9e6..e9e8f01 100644
--- a/corpus2alpino/__main__.py
+++ b/corpus2alpino/__main__.py
@@ -51,7 +51,7 @@ def main(args=None):
         parser.add_argument(
             '-p', '--progress',
             action='store_true',
-            help='Show progress bar, automatically turned on file output')
+            help='Show progress bar, automatically turned on for file output')
         parser.add_argument('-t', '--split_treebanks',
                             action='store_true',
                             help='Split treebanks to separate files')
@@ -88,7 +88,7 @@ def main(args=None):
             converter.target = FilesystemTarget(
                 options.output_path, not options.split_treebanks)
 
-        show_progress = options.progress if options.progress != None else options.output_path != None
+        show_progress = options.output_path != None or options.progress
 
         if show_progress:
             with tqdm(converter.convert(), total=len(options.file_names), unit='file') as progress:
diff --git a/corpus2alpino/readers/folia.py b/corpus2alpino/readers/folia.py
index 2f2b76d..168d101 100644
--- a/corpus2alpino/readers/folia.py
+++ b/corpus2alpino/readers/folia.py
@@ -6,14 +6,12 @@
 from typing import Iterable
 
 from corpus2alpino.abstracts import Reader
-from corpus2alpino.models import (CollectedFile, Document, MetadataValue,
-                                  Utterance)
+from corpus2alpino.models import CollectedFile, Document, MetadataValue, Utterance
 from corpus2alpino.readers.tokenizer import Tokenizer
 
 import folia.main as folia
 
-from .alpino_brackets import (escape_id, escape_word, format_add_lex,
-                              format_folia)
+from .alpino_brackets import escape_id, escape_word, format_add_lex, format_folia
 
 
 class FoliaReader(Reader):
@@ -26,41 +24,54 @@ def __init__(self, custom_tokenizer=None) -> None:
 
     def read(self, collected_file: CollectedFile) -> Iterable[Document]:
         try:
-            doc = folia.Document(string=collected_file.content,
-                                 autodeclare=True,
-                                 loadsetdefinitions=False)
+            doc = folia.Document(
+                string=collected_file.content,
+                autodeclare=True,
+                loadsetdefinitions=False,
+            )
             self.tokenize(doc)
             doc_metadata = self.get_metadata_dict(doc.metadata.items())
 
-            yield Document(collected_file,
-                           list(self.get_utterances(doc, doc_metadata)),
-                           doc_metadata)
+            yield Document(
+                collected_file,
+                list(self.get_utterances(doc, doc_metadata)),
+                doc_metadata,
+            )
         except Exception as e:
-            raise Exception(collected_file.relpath + "/" +
-                            collected_file.filename) from e
+            raise Exception(
+                collected_file.relpath + "/" + collected_file.filename
+            ) from e
 
     def tokenize(self, element):
         """
         Tokenizes all the text which isn't tokenized yet.
         """
+        if len(element) == 0:
+            # no sub elements
+            if isinstance(element, folia.Text):
+                self.tokenize_element(element.text(), element)
+            return
 
         for item in element:
             if isinstance(item, folia.AbstractElement):
                 if isinstance(item, folia.Paragraph):
-                    for sentence in item.sentences():
+                    for _ in item.sentences():
                         break
                     else:
                         self.tokenize_paragraph(item)
                 else:
                     self.tokenize(item)
 
-    def tokenize_paragraph(self, paragraph):
-        text = ''
-        for textContent in paragraph.select(folia.TextContent):
-            text += textContent.text()
+    def tokenize_paragraph(self, paragraph: folia.Paragraph):
+        text = ""
+        for text_content in paragraph.select(folia.TextContent):
+            text += text_content.text()
+        self.tokenize_element(text, paragraph)
+
+    def tokenize_element(self, text: str, element: folia.AbstractElement):
         sentences = self.tokenizer.process(text)
         for line in sentences:
-            sentence = paragraph.add(folia.Sentence)
+            sentence = element.add(folia.Sentence)
             for word in line.tokens():
                 if word:
                     sentence.add(folia.Word, word)
@@ -88,7 +99,9 @@ def get_utterances(self, doc, doc_metadata):
             if word_sentence != sentence or word_paragraph != paragraph:
                 if words:
                     if sentence or paragraph:
-                        yield self.create_utterance(paragraph, sentence, words, doc_metadata)
+                        yield self.create_utterance(
+                            paragraph, sentence, words, doc_metadata
+                        )
                     words = []
                 sentence = word_sentence
                 paragraph = word_paragraph
@@ -104,7 +117,7 @@ def create_utterance(self, paragraph, sentence, words, doc_metadata):
         """
 
         word_strings = map(lambda word: self.get_word_string(word), words)
-        line = " ".join(filter(lambda word: word != '', word_strings))
+        line = " ".join(filter(lambda word: word != "", word_strings))
 
         if sentence:
             container = sentence
@@ -113,8 +126,8 @@ def create_utterance(self, paragraph, sentence, words, doc_metadata):
 
         sentence_id = escape_id(container.id)
         sentence_metadata = self.get_metadata_dict(
-            container.getmetadata().items(),
-            doc_metadata)
+            container.getmetadata().items(), doc_metadata
+        )
 
         return Utterance(line, sentence_id, sentence_metadata, line)
 
@@ -135,7 +148,7 @@ def get_word_string(self, word):
                     text = item.text()
                     break
             else:
-                return ''
+                return ""
 
         try:
             correction = word.getcorrection()
@@ -159,8 +172,11 @@ def get_word_string(self, word):
     def get_metadata_dict(self, native_metadata, filter_by=None):
         metadata = {}
         for key, value in native_metadata:
-            if filter_by == None or not key in filter_by \
-                    or filter_by[key].value != value:
+            if (
+                filter_by == None
+                or key not in filter_by
+                or filter_by[key].value != value
+            ):
                 metadata[key] = MetadataValue(value)
         return metadata
 
@@ -169,4 +185,4 @@ def test_file(self, file: CollectedFile):
         Determine whether this is a FoLiA XML file
         """
 
-        return '<FoLiA' in file.content[0:400]
+        return "<FoLiA" in file.content[0:400]
diff --git a/requirements.txt b/requirements.txt
index 9d80d95..7d439e7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,133 +1,140 @@
 #
-# This file is autogenerated by pip-compile with python 3.7
+# This file is autogenerated by pip-compile with python 3.10
 # To update, run:
 #
 #    pip-compile
 #
+annotated-types==0.6.0
+    # via pydantic
 argparse==1.4.0
     # via corpus2alpino (setup.py)
-beautifulsoup4==4.10.0
+beautifulsoup4==4.12.3
     # via tei-reader
-blis==0.7.5
-    # via
-    #   spacy
-    #   thinc
-catalogue==2.0.6
+blis==0.7.11
+    # via thinc
+catalogue==2.0.10
     # via
     #   spacy
     #   srsly
     #   thinc
-certifi==2021.10.8
+certifi==2024.2.2
     # via requests
-chamd==0.5.8
+chamd==0.5.12
     # via corpus2alpino (setup.py)
-charset-normalizer==2.0.10
+charset-normalizer==3.3.2
     # via requests
-click==8.0.3
+click==8.1.7
     # via typer
-cymem==2.0.6
+cloudpathlib==0.16.0
+    # via weasel
+confection==0.1.4
+    # via
+    #   thinc
+    #   weasel
+cymem==2.0.8
     # via
     #   preshed
     #   spacy
     #   thinc
-folia==2.5.7
+folia==2.5.11
     # via corpus2alpino (setup.py)
-idna==3.3
+idna==3.6
     # via requests
-importlib-metadata==4.12.0
-    # via
-    #   click
-    #   rdflib
 isodate==0.6.1
     # via rdflib
-jinja2==3.0.3
+jinja2==3.1.3
     # via spacy
 langcodes==3.3.0
     # via spacy
-lxml==4.7.1
+lxml==5.1.0
     # via
     #   folia
     #   tei-reader
-markupsafe==2.0.1
+markupsafe==2.1.5
     # via jinja2
-murmurhash==1.0.6
+murmurhash==1.0.10
     # via
     #   preshed
     #   spacy
     #   thinc
-numpy==1.21.5
+numpy==1.24.4
     # via
     #   blis
     #   spacy
     #   thinc
-packaging==21.3
-    # via spacy
-pathy==0.6.1
-    # via spacy
-preshed==3.0.6
+packaging==24.0
     # via
     #   spacy
     #   thinc
-pydantic==1.8.2
+    #   weasel
+preshed==3.0.9
     # via
     #   spacy
     #   thinc
-pyparsing==3.0.7
+pydantic==2.6.4
     # via
-    #   packaging
-    #   rdflib
-rdflib==6.1.1
+    #   confection
+    #   spacy
+    #   thinc
+    #   weasel
+pydantic-core==2.16.3
+    # via pydantic
+pyparsing==3.1.2
+    # via rdflib
+rdflib==7.0.0
     # via folia
-requests==2.27.1
+requests==2.31.0
     # via
     #   folia
     #   spacy
+    #   weasel
 six==1.16.0
     # via isodate
-smart-open==5.2.1
-    # via pathy
-soupsieve==2.3.1
+smart-open==6.4.0
+    # via
+    #   spacy
+    #   weasel
+soupsieve==2.5
     # via beautifulsoup4
-spacy==3.2.1
+spacy==3.7.4
     # via corpus2alpino (setup.py)
-spacy-legacy==3.0.8
+spacy-legacy==3.0.12
     # via spacy
-spacy-loggers==1.0.1
+spacy-loggers==1.0.5
     # via spacy
-srsly==2.4.2
+srsly==2.4.8
     # via
+    #   confection
     #   spacy
     #   thinc
+    #   weasel
 tei-reader==0.0.17
     # via corpus2alpino (setup.py)
-thinc==8.0.13
+thinc==8.2.3
     # via spacy
-tqdm==4.62.3
+tqdm==4.66.2
     # via
     #   corpus2alpino (setup.py)
     #   spacy
-typer==0.4.0
+typer==0.9.0
     # via
-    #   pathy
     #   spacy
-typing-extensions==3.10.0.2
+    #   weasel
+typing-extensions==4.10.0
     # via
-    #   catalogue
-    #   importlib-metadata
+    #   cloudpathlib
     #   pydantic
-    #   spacy
-    #   thinc
-urllib3==1.26.8
+    #   pydantic-core
+    #   typer
+urllib3==2.2.1
     # via requests
-wasabi==0.9.0
+wasabi==1.1.2
     # via
     #   spacy
-    #   spacy-loggers
     #   thinc
-zipp==3.8.0
-    # via
-    #   catalogue
-    #   importlib-metadata
+    #   weasel
+weasel==0.3.4
+    # via spacy
 
 # The following packages are considered to be unsafe in a requirements file:
 # setuptools

From 3534b6d9a4e5fee930512867b75788d6a1c4bb71 Mon Sep 17 00:00:00 2001
From: Sheean Spoel <s.j.j.spoel@uu.nl>
Date: Wed, 3 Apr 2024 10:51:20 +0200
Subject: [PATCH 7/8] Fixed typing

---
 README.md                           |  2 +-
 corpus2alpino/converter.py          | 19 +++++++-------
 corpus2alpino/models.py             | 39 ++++++++++++++++-------------
 corpus2alpino/targets/console.py    | 17 +++++++------
 corpus2alpino/targets/filesystem.py | 35 ++++++++++++++------------
 corpus2alpino/targets/memory.py     | 20 ++++++++-------
 6 files changed, 71 insertions(+), 61 deletions(-)

diff --git a/README.md b/README.md
index 1927eb9..2b7a846 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 [![Actions Status](https://github.com/UUDigitalHumanitiesLab/corpus2alpino/workflows/Unit%20tests/badge.svg)](https://github.com/UUDigitalHumanitiesLab/corpus2alpino/actions)
 
-[PyPi/corpus2alpino](https://pypi.org/project/corpus2alpino/)
+[![PyPi/corpus2alpino](https://img.shields.io/pypi/v/corpus2alpino)](https://pypi.org/project/corpus2alpino/)
 
 # CHAT, FoLiA, PaQu metadata, plaintext and TEI to Alpino XML or PaQu metadata format
 
diff --git a/corpus2alpino/converter.py b/corpus2alpino/converter.py
index 206b6d7..a4fc516 100644
--- a/corpus2alpino/converter.py
+++ b/corpus2alpino/converter.py
@@ -1,11 +1,8 @@
 #!/usr/bin/env python3
-from typing import List
+from typing import List, Optional
 
-from corpus2alpino.collectors.filesystem import FilesystemCollector
 from corpus2alpino.readers.auto import AutoReader
 from corpus2alpino.targets.console import ConsoleTarget
-from corpus2alpino.targets.filesystem import FilesystemTarget
-from corpus2alpino.writers.lassy import LassyWriter
 from corpus2alpino.writers.paqu import PaQuWriter
 
 from corpus2alpino.abstracts import Annotator, Collector, Reader, Target, Writer
@@ -16,12 +13,14 @@ class Converter:
     Class for converting files to Alpino XML (input) files.
     """
 
-    def __init__(self,
-                 collector: Collector,
-                 annotators: List[Annotator] = None,
-                 reader: Reader = AutoReader(),
-                 writer: Writer = PaQuWriter(),
-                 target: Target = ConsoleTarget()) -> None:
+    def __init__(
+        self,
+        collector: Collector,
+        annotators: Optional[List[Annotator]] = None,
+        reader: Reader = AutoReader(),
+        writer: Writer = PaQuWriter(),
+        target: Target = ConsoleTarget(),
+    ) -> None:
         self.collector = collector
         self.annotators = annotators or []
         self.reader = reader
diff --git a/corpus2alpino/models.py b/corpus2alpino/models.py
index 0d60695..154953a 100644
--- a/corpus2alpino/models.py
+++ b/corpus2alpino/models.py
@@ -1,10 +1,11 @@
 #!/usr/bin/env python3
-from typing import Dict, List, Iterable
+from typing import Dict, Iterable, Optional
 
 
 class CollectedFile:
-    def __init__(self, relpath: str, filename: str, mimetype: str,
-                 content: str) -> None:
+    def __init__(
+        self, relpath: str, filename: str, mimetype: str, content: str
+    ) -> None:
         self.relpath = relpath
         self.filename = filename
         self.mimetype = mimetype
@@ -12,18 +13,20 @@ def __init__(self, relpath: str, filename: str, mimetype: str,
 
 
 class MetadataValue:
-    def __init__(self, value: str, type: str='text') -> None:
+    def __init__(self, value: str, type: str = "text") -> None:
         self.value = value
         self.type = type
 
 
 class Utterance:
-    def __init__(self,
-                 text: str,
-                 id: str,
-                 metadata: Dict[str, MetadataValue] = None,
-                 line: int = 0,
-                 annotations: Dict[str, str] = None) -> None:
+    def __init__(
+        self,
+        text: str,
+        id: str,
+        metadata: Optional[Dict[str, MetadataValue]] = None,
+        line: int = 0,
+        annotations: Optional[Dict[str, str]] = None,
+    ) -> None:
         self.text = text
         self.id = id
         self.metadata = metadata or {}
@@ -32,19 +35,21 @@ def __init__(self,
 
 
 class Document:
-    def __init__(self,
-                 collected_file: CollectedFile,
-                 utterances: Iterable[Utterance],
-                 metadata: Dict[str, MetadataValue] = None,
-                 subpath: str = '',
-                 annotations: Dict[str, str] = None) -> None:
+    def __init__(
+        self,
+        collected_file: CollectedFile,
+        utterances: Iterable[Utterance],
+        metadata: Optional[Dict[str, MetadataValue]] = None,
+        subpath: str = "",
+        annotations: Optional[Dict[str, str]] = None,
+    ) -> None:
         """
         A document found in a file.
 
         subpath: if a file has an internal structure, this
             contains a string representation of that relative to
             the file. E.g. if a tei.xml contains a document A at the
-            root and a document B 
+            root and a document B
 
         """
         self.collected_file = collected_file
diff --git a/corpus2alpino/targets/console.py b/corpus2alpino/targets/console.py
index 11455ae..f41b6ac 100644
--- a/corpus2alpino/targets/console.py
+++ b/corpus2alpino/targets/console.py
@@ -1,5 +1,4 @@
-from os import path
-from pathlib import Path
+from typing import Optional
 
 from corpus2alpino.abstracts import Target
 from corpus2alpino.models import Document
@@ -10,15 +9,17 @@ class ConsoleTarget(Target):
     Output chunks to the console on separate lines.
     """
 
-    def write(self,
-              document: Document,
-              content: str,
-              filename: str = None,
-              suffix: str = None):
+    def write(
+        self,
+        document: Document,
+        content: str,
+        filename: Optional[str] = None,
+        suffix: Optional[str] = None,
+    ):
         """
         Write all lines to stdout.
         """
-        print(content, end='')
+        print(content, end="")
 
     def flush(self):
         return
diff --git a/corpus2alpino/targets/filesystem.py b/corpus2alpino/targets/filesystem.py
index 2009502..7425460 100644
--- a/corpus2alpino/targets/filesystem.py
+++ b/corpus2alpino/targets/filesystem.py
@@ -4,7 +4,7 @@
 
 from os import path, makedirs
 from pathlib import Path
-from typing import cast, Any
+from typing import Optional, cast
 
 
 class FilesystemTarget(Target):
@@ -14,14 +14,16 @@ class FilesystemTarget(Target):
 
     __current_output_path = None
 
-    def __open_file(self, document: Document, filename: str = None, suffix: str = None):
+    def __open_file(self, document: Document, filename: Optional[str] = None, suffix: Optional[str] = None):
         if self.merge_files:
             # when merge_files = True, a file is already open
-            return 
-        
-        output_path = path.join(self.output_path,
-                                document.collected_file.relpath,
-                                document.collected_file.filename)
+            return
+
+        output_path = path.join(
+            self.output_path,
+            document.collected_file.relpath,
+            document.collected_file.filename,
+        )
 
         if document.subpath:
             output_path = path.join(output_path, document.subpath)
@@ -29,8 +31,7 @@ def __open_file(self, document: Document, filename: str = None, suffix: str = No
         if filename != None:
             output_path = path.join(output_path, cast(str, filename))
         if suffix != None:
-            output_path = str(
-                Path(output_path).with_suffix(cast(str, suffix)))
+            output_path = str(Path(output_path).with_suffix(cast(str, suffix)))
 
         # always open a new file when splitting in separate files
         self.__current_output_path = None
@@ -53,7 +54,7 @@ def __open_unique(self, directory: str, filename: str):
             target = Path(path.join(directory, prefix + filename))
             if not target.is_file():
                 # new file!
-                return target.open('w', encoding='utf-8')
+                return target.open("w", encoding="utf-8")
             attempts += 1
 
     def __init__(self, output_path: str, merge_files=False) -> None:
@@ -63,15 +64,17 @@ def __init__(self, output_path: str, merge_files=False) -> None:
         if self.merge_files:
             # using a single file
             makedirs(path.dirname(output_path), exist_ok=True)
-            self.file = open(output_path, 'w', encoding='utf-8')
+            self.file = open(output_path, "w", encoding="utf-8")
         else:
             self.file = None  # type: ignore
 
-    def write(self,
-              document: Document,
-              content: str,
-              filename: str = None,
-              suffix: str = None):
+    def write(
+        self,
+        document: Document,
+        content: str,
+        filename: Optional[str] = None,
+        suffix: Optional[str] = None,
+    ):
         self.__open_file(document, filename, suffix)
         if self.file:
             self.file.write(content)
diff --git a/corpus2alpino/targets/memory.py b/corpus2alpino/targets/memory.py
index 8c76db5..054f213 100644
--- a/corpus2alpino/targets/memory.py
+++ b/corpus2alpino/targets/memory.py
@@ -1,5 +1,4 @@
-from os import path
-from pathlib import Path
+from typing import Optional
 
 from corpus2alpino.abstracts import Target
 from corpus2alpino.models import Document
@@ -9,13 +8,16 @@ class MemoryTarget(Target):
     """
     Combine output in memory.
     """
-    buffer = ''
 
-    def write(self,
-              document: Document,
-              content: str,
-              filename: str = None,
-              suffix: str = None):
+    buffer = ""
+
+    def write(
+        self,
+        document: Document,
+        content: str,
+        filename: Optional[str] = None,
+        suffix: Optional[str] = None,
+    ):
         """
         Write all lines to stdout.
         """
@@ -25,7 +27,7 @@ def flush(self):
         try:
             return self.buffer
         finally:
-            self.buffer = ''
+            self.buffer = ""
 
     def close(self):
         return

From 70b81f62bd7c8def2d41e99e402003d1900b2f00 Mon Sep 17 00:00:00 2001
From: Sheean Spoel <s.j.j.spoel@uu.nl>
Date: Wed, 3 Apr 2024 15:56:42 +0200
Subject: [PATCH 8/8] 0.3.11

---
 README.md | 5 +++--
 setup.py  | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 2b7a846..80ae3b7 100644
--- a/README.md
+++ b/README.md
@@ -71,14 +71,15 @@ See: https://packaging.python.org/tutorials/packaging-projects/#generating-distr
 Make sure `setuptools` and `wheel` are installed. Then from the virtualenv:
 
 ```bash
-python setup.py sdist bdist_wheel
+pip install build
+python -m build
 twine upload dist/*
 ```
 
 ## Requirements
 
 * [Alpino parser](http://www.let.rug.nl/vannoord/alp/Alpino) running as a server: `Alpino batch_command=alpino_server -notk server_port=7001`
-* Python 3.7 or higher
+* Python 3.8 or higher
 * [libfolia-dev](https://packages.ubuntu.com/bionic/libfolia-dev)
 * [libxml2-dev](https://packages.ubuntu.com/bionic/libxml2-dev)
 
diff --git a/setup.py b/setup.py
index eb1258e..4b2d94a 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@
     long_description = f.read()
 
 setuptools.setup(name='corpus2alpino',
-                 version='0.3.10',
+                 version='0.3.11',
       description='Converts FoLiA and TEI files to Alpino XML files',
       long_description=long_description,
       long_description_content_type='text/markdown',
@@ -23,7 +23,7 @@
       ],
       install_requires=['argparse', 'chamd>=0.5.8', 'folia',
                         'spacy', 'tei-reader', 'tqdm'],
-      python_requires='>=3.7',
+      python_requires='>=3.8',
       zip_safe=True,
       entry_points={
           'console_scripts': [