Skip to content

Commit

Permalink
Merge branch 'master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
oktaal authored Apr 3, 2024
2 parents 0553d0c + 46e6592 commit 085c4a1
Show file tree
Hide file tree
Showing 14 changed files with 241 additions and 159 deletions.
39 changes: 39 additions & 0 deletions .github/workflows/python-publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

# Runs only when a GitHub release is published.
on:
  release:
    types: [published]

permissions:
  contents: read

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python
      uses: actions/setup-python@v3
      with:
        python-version: '3.8'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install build
    - name: Build package
      run: python -m build
    - name: Publish package
      # Pinned to a full commit SHA rather than a tag.
      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
      with:
        user: __token__
        password: ${{ secrets.PYPI_API_TOKEN }}
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ on: [push]
jobs:
build:

runs-on: ubuntu-18.04
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.7', '3.10']
python-version: ['3.8', '3.10']

steps:
- uses: actions/checkout@v3
Expand Down
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[![Actions Status](https://github.com/UUDigitalHumanitiesLab/corpus2alpino/workflows/Unit%20tests/badge.svg)](https://github.com/UUDigitalHumanitiesLab/corpus2alpino/actions)

[PyPi/corpus2alpino](https://pypi.org/project/corpus2alpino/)
[![PyPi/corpus2alpino](https://img.shields.io/pypi/v/corpus2alpino)](https://pypi.org/project/corpus2alpino/)

# CHAT, FoLiA, PaQu metadata, plaintext and TEI to Alpino XML or PaQu metadata format

Expand Down Expand Up @@ -71,14 +71,15 @@ See: https://packaging.python.org/tutorials/packaging-projects/#generating-distr
Make sure `setuptools` and `wheel` are installed. Then from the virtualenv:

```bash
python setup.py sdist bdist_wheel
pip install build
python -m build
twine upload dist/*
```

## Requirements

* [Alpino parser](http://www.let.rug.nl/vannoord/alp/Alpino) running as a server: `Alpino batch_command=alpino_server -notk server_port=7001`
* Python 3.7 or higher
* Python 3.8 or higher
* [libfolia-dev](https://packages.ubuntu.com/bionic/libfolia-dev)
* [libxml2-dev](https://packages.ubuntu.com/bionic/libxml2-dev)

Expand Down
4 changes: 2 additions & 2 deletions corpus2alpino/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def main(args=None):
parser.add_argument(
'-p', '--progress',
action='store_true',
help='Show progress bar, automatically turned on file output')
help='Show progress bar, automatically turned on for file output')
parser.add_argument('-t', '--split_treebanks',
action='store_true',
help='Split treebanks to separate files')
Expand Down Expand Up @@ -88,7 +88,7 @@ def main(args=None):
converter.target = FilesystemTarget(
options.output_path, not options.split_treebanks)

show_progress = options.progress if options.progress != None else options.output_path != None
show_progress = options.output_path != None or options.progress

if show_progress:
with tqdm(converter.convert(), total=len(options.file_names), unit='file') as progress:
Expand Down
20 changes: 15 additions & 5 deletions corpus2alpino/annotators/alpino.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
Wrapper for the Alpino parser.
"""

from .alpino_client import AlpinoProcessClient, AlpinoServerClient
from corpus2alpino.models import Document, MetadataValue
from corpus2alpino.abstracts import Annotator
import re
import os
import logging
Expand All @@ -11,10 +14,6 @@

ANNOTATION_KEY = 'alpino'

from corpus2alpino.abstracts import Annotator
from corpus2alpino.models import Document, MetadataValue

from .alpino_client import AlpinoProcessClient, AlpinoServerClient

timealign_symbol = re.compile(r'\u0015')

Expand Down Expand Up @@ -52,4 +51,15 @@ def annotate(self, document: Document):
self.client.version_date.isoformat(), 'date')
except Exception as exception:
logging.getLogger().error(
Exception("Problem parsing: {0}|{1}\n{2}".format(utterance.id, utterance.text, exception)))
Exception("Problem parsing: {0}:{1}|{2}\n{3}".format(self.__document_path(document), utterance.id, utterance.text, exception)))

def __document_path(self, document: Document):
    """
    Build a readable location string for a document.

    The result starts from the collected file's name. When the collected
    file has a non-empty ``relpath`` it is prefixed as ``relpath + '/'``;
    when the document has a non-empty ``subpath`` it is appended as
    ``'//' + subpath``.

    Returns:
        The assembled string, e.g. ``dir/file.xml//part``.
    """
    collected = document.collected_file
    value = collected.filename

    if collected.relpath:
        value = collected.relpath + '/' + value

    if document.subpath:
        # Double slash separates the file from the location inside it.
        value += '//' + document.subpath

    return value
19 changes: 9 additions & 10 deletions corpus2alpino/converter.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
#!/usr/bin/env python3
from typing import List
from typing import List, Optional

from corpus2alpino.collectors.filesystem import FilesystemCollector
from corpus2alpino.readers.auto import AutoReader
from corpus2alpino.targets.console import ConsoleTarget
from corpus2alpino.targets.filesystem import FilesystemTarget
from corpus2alpino.writers.lassy import LassyWriter
from corpus2alpino.writers.paqu import PaQuWriter

from corpus2alpino.abstracts import Annotator, Collector, Reader, Target, Writer
Expand All @@ -16,12 +13,14 @@ class Converter:
Class for converting files to Alpino XML (input) files.
"""

def __init__(self,
collector: Collector,
annotators: List[Annotator] = None,
reader: Reader = AutoReader(),
writer: Writer = PaQuWriter(),
target: Target = ConsoleTarget()) -> None:
def __init__(
self,
collector: Collector,
annotators: Optional[List[Annotator]] = None,
reader: Reader = AutoReader(),
writer: Writer = PaQuWriter(),
target: Target = ConsoleTarget(),
) -> None:
self.collector = collector
self.annotators = annotators or []
self.reader = reader
Expand Down
39 changes: 22 additions & 17 deletions corpus2alpino/models.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,32 @@
#!/usr/bin/env python3
from typing import Dict, List, Iterable
from typing import Dict, Iterable, Optional


class CollectedFile:
    """
    Plain record describing one input file.

    Stores the four constructor arguments unchanged as attributes:
    ``relpath``, ``filename``, ``mimetype`` and ``content``.
    """

    def __init__(
        self, relpath: str, filename: str, mimetype: str, content: str
    ) -> None:
        self.relpath = relpath
        self.filename = filename
        self.mimetype = mimetype
        self.content = content


class MetadataValue:
    """
    A metadata value together with a type label.

    ``type`` defaults to ``"text"``; both arguments are stored unchanged
    as the ``value`` and ``type`` attributes.
    """

    # NOTE: the parameter name ``type`` shadows the builtin, but it is part of
    # the public signature (callers may pass it by keyword), so it is kept.
    def __init__(self, value: str, type: str = "text") -> None:
        self.value = value
        self.type = type


class Utterance:
def __init__(self,
text: str,
id: str,
metadata: Dict[str, MetadataValue] = None,
line: int = 0,
annotations: Dict[str, str] = None) -> None:
def __init__(
self,
text: str,
id: str,
metadata: Optional[Dict[str, MetadataValue]] = None,
line: int = 0,
annotations: Optional[Dict[str, str]] = None,
) -> None:
self.text = text
self.id = id
self.metadata = metadata or {}
Expand All @@ -32,19 +35,21 @@ def __init__(self,


class Document:
def __init__(self,
collected_file: CollectedFile,
utterances: Iterable[Utterance],
metadata: Dict[str, MetadataValue] = None,
subpath: str = '',
annotations: Dict[str, str] = None) -> None:
def __init__(
self,
collected_file: CollectedFile,
utterances: Iterable[Utterance],
metadata: Optional[Dict[str, MetadataValue]] = None,
subpath: str = "",
annotations: Optional[Dict[str, str]] = None,
) -> None:
"""
A document found in a file.
subpath: if a file has an internal structure, this
contains a string representation of that relative to
the file. E.g. if a tei.xml contains a document A at the
root and a document B
root and a document B
"""
self.collected_file = collected_file
Expand Down
68 changes: 42 additions & 26 deletions corpus2alpino/readers/folia.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,12 @@
from typing import Iterable

from corpus2alpino.abstracts import Reader
from corpus2alpino.models import (CollectedFile, Document, MetadataValue,
Utterance)
from corpus2alpino.models import CollectedFile, Document, MetadataValue, Utterance
from corpus2alpino.readers.tokenizer import Tokenizer

import folia.main as folia

from .alpino_brackets import (escape_id, escape_word, format_add_lex,
format_folia)
from .alpino_brackets import escape_id, escape_word, format_add_lex, format_folia


class FoliaReader(Reader):
Expand All @@ -26,41 +24,54 @@ def __init__(self, custom_tokenizer=None) -> None:

def read(self, collected_file: CollectedFile) -> Iterable[Document]:
    """
    Parse the collected file's content as FoLiA and yield one Document.

    The content is loaded into a ``folia.Document``, passed through
    ``self.tokenize``, and its metadata items are converted with
    ``self.get_metadata_dict``. The utterances are materialized into a
    list before the Document is yielded.

    Raises:
        Exception: any exception raised inside the ``try`` block is
            re-raised as a new ``Exception`` whose message is
            ``relpath + "/" + filename``, chained to the original cause.
    """
    try:
        doc = folia.Document(
            string=collected_file.content,
            autodeclare=True,
            loadsetdefinitions=False,
        )
        self.tokenize(doc)
        doc_metadata = self.get_metadata_dict(doc.metadata.items())

        yield Document(
            collected_file,
            list(self.get_utterances(doc, doc_metadata)),
            doc_metadata,
        )
    except Exception as e:
        # Wrap with the file location so the failing input can be identified.
        raise Exception(
            collected_file.relpath + "/" + collected_file.filename
        ) from e

def tokenize(self, element):
    """
    Tokenizes all the text which isn't tokenized yet.

    An element without sub elements is tokenized directly when it is a
    ``folia.Text``. Otherwise the children are walked: a paragraph that
    yields no sentences is handed to ``tokenize_paragraph``, a paragraph
    that already has sentences is left alone, and any other
    ``folia.AbstractElement`` is processed recursively.
    """
    if len(element) == 0:
        # no sub elements
        if isinstance(element, folia.Text):
            self.tokenize_element(element.text(), element)
        return

    for item in element:
        if not isinstance(item, folia.AbstractElement):
            continue

        if isinstance(item, folia.Paragraph):
            # Only probe for the first sentence; stops at the first hit.
            has_sentences = any(True for _ in item.sentences())
            if not has_sentences:
                self.tokenize_paragraph(item)
        else:
            self.tokenize(item)

def tokenize_paragraph(self, paragraph: folia.Paragraph):
    """
    Tokenize a paragraph from the concatenation of its text content.

    The texts of all ``folia.TextContent`` elements selected from the
    paragraph are joined without a separator and passed, together with
    the paragraph itself, to ``tokenize_element``.
    """
    text = "".join(
        text_content.text()
        for text_content in paragraph.select(folia.TextContent)
    )
    self.tokenize_element(text, paragraph)

def tokenize_element(self, text: str, element: folia.AbstractElement):
sentences = self.tokenizer.process(text)
for line in sentences:
sentence = paragraph.add(folia.Sentence)
sentence = element.add(folia.Sentence)
for word in line.tokens():
if word:
sentence.add(folia.Word, word)
Expand Down Expand Up @@ -88,7 +99,9 @@ def get_utterances(self, doc, doc_metadata):
if word_sentence != sentence or word_paragraph != paragraph:
if words:
if sentence or paragraph:
yield self.create_utterance(paragraph, sentence, words, doc_metadata)
yield self.create_utterance(
paragraph, sentence, words, doc_metadata
)
words = []
sentence = word_sentence
paragraph = word_paragraph
Expand All @@ -104,7 +117,7 @@ def create_utterance(self, paragraph, sentence, words, doc_metadata):
"""

word_strings = map(lambda word: self.get_word_string(word), words)
line = " ".join(filter(lambda word: word != '', word_strings))
line = " ".join(filter(lambda word: word != "", word_strings))

if sentence:
container = sentence
Expand All @@ -113,8 +126,8 @@ def create_utterance(self, paragraph, sentence, words, doc_metadata):

sentence_id = escape_id(container.id)
sentence_metadata = self.get_metadata_dict(
container.getmetadata().items(),
doc_metadata)
container.getmetadata().items(), doc_metadata
)

return Utterance(line, sentence_id, sentence_metadata, line)

Expand All @@ -135,7 +148,7 @@ def get_word_string(self, word):
text = item.text()
break
else:
return ''
return ""

try:
correction = word.getcorrection()
Expand All @@ -159,8 +172,11 @@ def get_word_string(self, word):
def get_metadata_dict(self, native_metadata, filter_by=None):
    """
    Convert (key, value) pairs into a dict of MetadataValue objects.

    Args:
        native_metadata: iterable of ``(key, value)`` pairs.
        filter_by: optional mapping of key to an object with a ``value``
            attribute. A pair is left out when ``filter_by`` holds the
            same key with an equal value.

    Returns:
        A dict mapping each retained key to ``MetadataValue(value)``.
    """
    metadata = {}
    for key, value in native_metadata:
        if (
            filter_by is None
            or key not in filter_by
            or filter_by[key].value != value
        ):
            metadata[key] = MetadataValue(value)
    return metadata

Expand All @@ -169,4 +185,4 @@ def test_file(self, file: CollectedFile):
Determine whether this is a FoLiA XML file
"""

return '<FoLiA' in file.content[0:400]
return "<FoLiA" in file.content[0:400]
Loading

0 comments on commit 085c4a1

Please sign in to comment.