deps: poetry update (#208)
* deps: poetry update

* fix: ruff format

* misc: poetry lock

* misc: update ruff

* misc: ruff format with latest ruff
himkt authored Mar 13, 2024
1 parent 839ce44 commit 4ad2139
Showing 10 changed files with 335 additions and 328 deletions.
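
The exact commands behind this commit are not recorded on this page; the sketch below is a plausible reconstruction based only on the commit message above and the pyproject.toml change further down. The poetry add line in particular is an assumption, since the ruff constraint (^0.1.14 to ^0.3.0) may simply have been edited by hand before locking.

    poetry update                          # deps: poetry update (refresh locked versions within existing constraints)
    poetry add --group dev "ruff@^0.3.0"   # misc: update ruff (assumed; the constraint may have been edited by hand)
    poetry lock                            # misc: poetry lock (regenerate poetry.lock)
    poetry run ruff format                 # fix/misc: ruff format (re-format the code base with the new ruff)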
16 changes: 8 additions & 8 deletions docs/source/conf.py
@@ -15,13 +15,13 @@

import os

- on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
+ on_rtd = os.environ.get("READTHEDOCS", None) == "True"

# -- Project information -----------------------------------------------------

- project = 'konoha'
- copyright = '2020, himkt'
- author = 'himkt'
+ project = "konoha"
+ copyright = "2020, himkt"
+ author = "himkt"


# -- General configuration ---------------------------------------------------
@@ -30,11 +30,11 @@
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
- 'sphinx.ext.autodoc',
+ "sphinx.ext.autodoc",
]

# Add any paths that contain templates here, relative to this directory.
- templates_path = ['_templates']
+ templates_path = ["_templates"]

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
@@ -48,9 +48,9 @@
# a list of builtin themes.
#
if not on_rtd:
- html_theme = 'pydata_sphinx_theme'
+ html_theme = "pydata_sphinx_theme"

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
- html_static_path = ['_static']
+ html_static_path = ["_static"]
4 changes: 1 addition & 3 deletions example/tokenize_demo.py
@@ -22,9 +22,7 @@
print("Skip: ", word_tokenizer_name)

try:
- _tokenizer = WordTokenizer(
- "Sentencepiece", model_path="./data/model.spm"
- ) # NOQA
+ _tokenizer = WordTokenizer("Sentencepiece", model_path="./data/model.spm") # NOQA
word_tokenizers.append(_tokenizer)

except (ImportError, OSError, RuntimeError):
592 changes: 296 additions & 296 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -39,7 +39,7 @@ all = ["janome", "natto-py", "kytea", "sudachipy", "sudachidict-core", "nagisa",
[tool.poetry.group.dev.dependencies]
pytest = "^7.4.4"
httpx = "^0.26.0"
- ruff = "^0.1.14"
+ ruff = "^0.3.0"
mypy = "^1.8.0"
sphinx = "<7.2.0"
pydata-sphinx-theme = "<0.15.0"
1 change: 1 addition & 0 deletions src/konoha/__init__.py
@@ -1,4 +1,5 @@
"""__init__.py."""
+
from importlib.metadata import version

from konoha.sentence_tokenizer import SentenceTokenizer # NOQA
1 change: 1 addition & 0 deletions src/konoha/data/token.py
@@ -1,4 +1,5 @@
"""Token class."""
+
from typing import Dict
from typing import List
from typing import Optional
1 change: 1 addition & 0 deletions src/konoha/word_tokenizer.py
@@ -1,4 +1,5 @@
"""Word Level Tokenizer."""
+
import warnings
from typing import Dict
from typing import List
10 changes: 6 additions & 4 deletions tests/api/v1/test_batch_tokenization.py
@@ -12,7 +12,8 @@


@pytest.mark.parametrize(
- "tokenizer_params", [
+ "tokenizer_params",
+ [
{"tokenizer": "mecab"},
{"tokenizer": "sudachi", "mode": "A"},
{"tokenizer": "sudachi", "mode": "B"},
@@ -22,7 +23,7 @@
{"tokenizer": "character"},
{"tokenizer": "nagisa"},
{"tokenizer": "janome"},
- ]
+ ],
)
def test_tokenization(tokenizer_params: Dict):
if tokenizer_params["tokenizer"] == "kytea" and sys.version_info < (3, 7):
@@ -36,9 +37,10 @@ def test_tokenization(tokenizer_params: Dict):


@pytest.mark.parametrize(
- "tokenizer_params", [
+ "tokenizer_params",
+ [
{"tokenizer": "mecab", "system_dictionary_path": "s3://konoha-demo/mecab/ipadic"},
- ]
+ ],
)
def test_tokenization_with_remote_resource(tokenizer_params: Dict):
if "AWS_ACCESS_KEY_ID" not in os.environ and tokenizer_params["system_dictionary_path"].startswith("s3://"):
10 changes: 6 additions & 4 deletions tests/api/v1/test_tokenization.py
@@ -12,7 +12,8 @@


@pytest.mark.parametrize(
- "tokenizer_params", [
+ "tokenizer_params",
+ [
{"tokenizer": "mecab"},
{"tokenizer": "mecab", "with_postag": True},
{"tokenizer": "sudachi", "mode": "A"},
@@ -23,7 +24,7 @@
{"tokenizer": "character"},
{"tokenizer": "nagisa"},
{"tokenizer": "janome"},
- ]
+ ],
)
def test_tokenization(tokenizer_params: Dict):
if tokenizer_params["tokenizer"] == "kytea" and sys.version_info < (3, 7):
@@ -37,9 +38,10 @@ def test_tokenization(tokenizer_params: Dict):


@pytest.mark.parametrize(
- "tokenizer_params", [
+ "tokenizer_params",
+ [
{"tokenizer": "mecab", "system_dictionary_path": "s3://konoha-demo/mecab/ipadic"},
- ]
+ ],
)
def test_tokenization_with_remote_resoruce(tokenizer_params: Dict):
if "AWS_ACCESS_KEY_ID" not in os.environ and tokenizer_params["system_dictionary_path"].startswith("s3://"):
26 changes: 14 additions & 12 deletions tests/test_word_tokenizer.py
@@ -26,7 +26,8 @@ def read_lines(tokenizer: str):


@pytest.mark.parametrize(
- "tokenizer_params", [
+ "tokenizer_params",
+ [
{"tokenizer": "mecab"},
{"tokenizer": "sudachi", "mode": "A"},
{"tokenizer": "sudachi", "mode": "A"},
@@ -36,7 +37,7 @@ def read_lines(tokenizer: str):
{"tokenizer": "character"},
{"tokenizer": "whitespace"},
{"tokenizer": "sentencepiece", "model_path": "data/model.spm"},
- ]
+ ],
)
def test_tokenize_with_character(raw_texts: List[str], tokenizer_params: Dict):
if tokenizer_params["tokenizer"] == "kytea" and sys.version_info < (3, 7):
@@ -50,9 +51,10 @@ def test_tokenize_with_character(raw_texts: List[str], tokenizer_params: Dict):


@pytest.mark.parametrize(
- "tokenizer_params", [
+ "tokenizer_params",
+ [
{"tokenizer": "mecab", "system_dictionary_path": "s3://konoha-demo/mecab/ipadic"},
- ]
+ ],
)
def test_tokenize(raw_texts: List[str], tokenizer_params: Dict):
tokenizer_name = tokenizer_params["tokenizer"]
@@ -63,7 +65,8 @@ def test_tokenize(raw_texts: List[str], tokenizer_params: Dict):


@pytest.mark.parametrize(
- "tokenizer_params", [
+ "tokenizer_params",
+ [
{"tokenizer": "mecab"},
{"tokenizer": "sudachi", "mode": "A"},
{"tokenizer": "sudachi", "mode": "A"},
@@ -73,7 +76,7 @@ def test_tokenize(raw_texts: List[str], tokenizer_params: Dict):
{"tokenizer": "character"},
{"tokenizer": "whitespace"},
{"tokenizer": "sentencepiece", "model_path": "data/model.spm"},
- ]
+ ],
)
def test_batch_tokenize_with_character(raw_texts: List[str], tokenizer_params: Dict):
if tokenizer_params["tokenizer"] == "kytea" and sys.version_info < (3, 7):
@@ -82,17 +85,17 @@ def test_batch_tokenize_with_character(raw_texts: List[str], tokenizer_params: Dict):
tokenizer_name = tokenizer_params["tokenizer"]
tokenizer = WordTokenizer(**tokenizer_params)
expect = [
- [Token.from_dict(token_param) for token_param in token_params]
- for token_params in read_lines(tokenizer_name)
+ [Token.from_dict(token_param) for token_param in token_params] for token_params in read_lines(tokenizer_name)
]
result = tokenizer.batch_tokenize(raw_texts)
assert expect == result


@pytest.mark.parametrize(
- "tokenizer_params", [
+ "tokenizer_params",
+ [
{"tokenizer": "mecab", "system_dictionary_path": "s3://konoha-demo/mecab/ipadic"},
- ]
+ ],
)
def test_batch_tokenize(raw_texts: List[str], tokenizer_params: Dict):
if "AWS_ACCESS_KEY_ID" not in os.environ and tokenizer_params["system_dictionary_path"].startswith("s3://"):
@@ -101,8 +104,7 @@ def test_batch_tokenize(raw_texts: List[str], tokenizer_params: Dict):
tokenizer_name = tokenizer_params["tokenizer"]
tokenizer = WordTokenizer(**tokenizer_params)
expect = [
- [Token.from_dict(token_param) for token_param in token_params]
- for token_params in read_lines(tokenizer_name)
+ [Token.from_dict(token_param) for token_param in token_params] for token_params in read_lines(tokenizer_name)
]
result = tokenizer.batch_tokenize(raw_texts)
assert expect == result
