
Commit

Merge branch 'master' into datapipe-ml-opts
bobokvsky committed Aug 12, 2024
2 parents 208ddf8 + b5a40d9 commit f46cc66
Showing 36 changed files with 1,225 additions and 541 deletions.
57 changes: 57 additions & 0 deletions .github/workflows/docs.yaml
@@ -0,0 +1,57 @@
name: Build and publish docs

on:
push:
branches:
- "master"
paths:
- ".github/workflows/docs.yaml"
- "docs/**"
- "pyproject.toml"

jobs:
# Build job
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
cache: pip

- name: Install dependencies
run: |
pip install ".[docs]"
- name: Setup Pages
id: pages
uses: actions/configure-pages@v5

- name: Build docs
run: |
cd docs; make html
- name: Upload artifact
uses: actions/upload-pages-artifact@v3.0.1
with:
path: docs/build/html

# Deployment job
deploy:
environment:
name: github-pages
url: ${{steps.deployment.outputs.page_url}}
runs-on: ubuntu-latest
needs: build

permissions:
pages: write # to deploy to Pages
id-token: write # to verify the deployment originates from an appropriate source

steps:
- name: Deploy to GitHub Pages
id: deployment
uses: actions/deploy-pages@v4
52 changes: 31 additions & 21 deletions .github/workflows/pytest.yaml
@@ -2,10 +2,11 @@ name: Run tests

on:
push:
paths-ignore:
- "docs/**"
- "**/*.md"
- "examples/**"
paths:
- ".github/workflows/pytest.yaml"
- "datapipe/**"
- "tests/**"
- "pyproject.toml"


jobs:
@@ -15,14 +16,14 @@ jobs:
matrix:
python-version:
# - "3.8"
# - "3.9"
- "3.9"
- "3.10"

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: pip
@@ -50,31 +51,40 @@ jobs:
strategy:
matrix:
include:
- python-version: "3.8"
test-db-env: "postgres"
pip-extra: "sqlalchemy <2"
- python-version: "3.8"
test-db-env: "postgres"
pip-extra: "sqlalchemy >2"
# - python-version: "3.8"
# test-db-env: "sqlite"
# - python-version: "3.9"
# test-db-env: "postgres"
# pip-extra: "sqlalchemy <2"
# - python-version: "3.8"
# test-db-env: "postgres"
# pip-extra: "'sqlalchemy>2'"
# - python-version: "3.8"
# test-db-env: "sqlite"

- python-version: "3.9"
test-db-env: "postgres"
pip-extra: '"sqlalchemy>2"'
# - python-version: "3.9"
# test-db-env: "sqlite"

# - python-version: "3.10"
# test-db-env: "postgres"
# - python-version: "3.10"
# test-db-env: "sqlite"

- python-version: "3.11"
test-db-env: "postgres"
pip-extra: "sqlalchemy <2"
pip-extra: '"sqlalchemy<2" "pandas<2.2"'
- python-version: "3.11"
test-db-env: "postgres"
pip-extra: "sqlalchemy >2"
pip-extra: '"sqlalchemy>2"'
- python-version: "3.11"
test-db-env: "sqlite"
pip-extra: "sqlalchemy >2"
pip-extra: '"sqlalchemy>2"'

- python-version: "3.12"
test-db-env: "postgres"
- python-version: "3.12"
test-db-env: "sqlite"

services:
# Label used to access the service container
@@ -107,17 +117,17 @@ jobs:
- 6333:6333

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: pip

- name: Install dependencies
run: |
pip install "${{ matrix.pip-extra }}" ".[sqlite,excel,milvus,gcsfs,s3fs,redis,qdrant,gcp]" "pytest" "pytest_cases"
pip install ${{ matrix.pip-extra }} ".[sqlite,excel,milvus,gcsfs,s3fs,redis,qdrant,gcp]" "pytest<8" "pytest_cases"
- name: Test with pytest
run: |
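The `pip-extra` handling above changed in two coordinated places: each matrix value now embeds its own double quotes (e.g. `'"sqlalchemy<2" "pandas<2.2"'`), and the install step no longer wraps `${{ matrix.pip-extra }}` in quotes. This lets a single matrix entry expand to several requirement specifiers while `<` and `>` still reach pip as literal characters rather than shell redirections. The snippet below is only an illustration of that word-splitting behavior (standard library only, not code from the repository):

```python
import shlex

# How the install command is tokenized after GitHub Actions substitutes
# matrix.pip-extra. The embedded quotes keep every requirement specifier
# as one word and protect "<" / ">" from being parsed as redirections.
expanded = 'pip install "sqlalchemy<2" "pandas<2.2" ".[sqlite,excel]" "pytest<8"'
print(shlex.split(expanded))
# ['pip', 'install', 'sqlalchemy<2', 'pandas<2.2', '.[sqlite,excel]', 'pytest<8']

# Without the embedded quotes a POSIX shell would treat "<2" as input
# redirection from a file literally named "2", so pip would never see
# the version constraint.
```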
16 changes: 9 additions & 7 deletions .github/workflows/test_examples.yaml
@@ -2,10 +2,11 @@ name: Test examples

on:
push:
paths-ignore:
- "docs/**"
- "**/*.md"
- "tests/**"
paths:
- ".github/workflows/test_examples.yaml"
- "examples/**"
- "datapipe/**"
- "pyproject.toml"

jobs:
test-examples:
@@ -14,9 +15,10 @@ jobs:
matrix:
python-version:
# - "3.8"
# - "3.9"
- "3.9"
- "3.10"
# - "3.11"
- "3.12"
example:
- datatable_batch_transform
- image_resize
@@ -28,10 +30,10 @@
- RayExecutor

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: pip
2 changes: 1 addition & 1 deletion .vscode/tasks.json
@@ -6,7 +6,7 @@
{
"label": "mypy-whole-project",
"type": "shell",
"command": "source .venv/bin/activate; mypy -p datapipe --show-column-numbers --show-error-codes --ignore-missing-imports --namespace-packages",
"command": "poetry run mypy -p datapipe --show-column-numbers --show-error-codes --ignore-missing-imports --namespace-packages",
"presentation": {
"echo": true,
"reveal": "never",
52 changes: 52 additions & 0 deletions CHANGELOG.md
@@ -1,7 +1,37 @@
# WIP 0.14.0

Changes:
* Enable Python 3.12 support
* `DatatableTransform` can become `BatchTransform` with empty indices
* SQLAlchemy tables can be used directly without duplication in Catalog
* `datapipe.compute.Table` can be used directly without Catalog

See "Migration from v0.13 to v0.14" for more details

# 0.13.14

* Fix [#334](https://github.com/epoch8/datapipe/issues/334)

# 0.13.13

* Add `ComputeStep.get_status` method (usage sketch below)
* Remove restriction for Pandas < 2.2
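A minimal sketch of calling the new `get_status` API from code. It mirrors the fields used by `datapipe step list --status` in `datapipe/cli.py` (`total_idx_count`, `changed_idx_count`); the `ds` and `steps` objects are assumed to be built elsewhere, and the helper name is illustrative:

```python
from typing import List


def print_step_statuses(ds, steps: List["ComputeStep"]) -> None:
    # `ds` is the DataStore and `steps` the list of ComputeStep objects that
    # the CLI keeps in ctx.obj["steps"]; both come from an existing DatapipeApp.
    for step in steps:
        try:
            status = step.get_status(ds=ds)
            print(f"{step}: {status.changed_idx_count} of {status.total_idx_count} ids changed")
        except NotImplementedError:
            # Steps with empty join_keys cannot report counts yet
            # (same fallback as in datapipe/cli.py).
            print(f"{step}: status not available")
```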

# 0.13.12

* Add processing of an empty response in `QdrantStore`
* Add optional `index_schema` to `QdrantStore`
* Add redis cluster mode support in `RedisStore`

# 0.13.11

* Remove logging to database (`datapipe_events` table) from `EventLogger`

# 0.13.10

* Fix compatibility with SQLAlchemy < 2 (ColumnClause in typing)
* Fix compatibility with Ray and SQLAlchemy > 2 (serialization of Table)
* (post.1) Fix dependencies for macOS; deprecate Python 3.8

# 0.13.9

@@ -541,6 +571,28 @@

After: `DataTable(name, meta_store.create_meta_table(name), data_store)`

# 0.5.0

* Added a prototype debug UI, launched with the `python script.py ui` command

# 0.4.0

**Major refactoring**

* The root module was renamed from `c12n_pipe` to `datapipe`
* A new `datapipe.dsl` module was implemented for declarative pipeline definitions.
  Usage example: `examples/goal_v0.4.0.py`
* The pipeline definition is decoupled from the way it is executed, so a converter
  to another runtime (for example, Prefect) can be written
* The `MetaStore` logic was rewritten: `MetaStore` now owns all meta-tables
  explicitly, and consumers access them by name
* Table data can now be kept in detachable stores (a database is just one of the
  options)
* `database.TableStoreDB` - stores data in a database table
* `filedir.TableStoreFiledir` - stores data as a directory of files (with adapters
  for different file types)
* `pandas.TableStoreJsonLine` - stores data as a single JSON Lines file

# 0.2.0 (2021-02-01)
- Add major code with Label Studio implementation
- Add Data catalogs and Nodes
9 changes: 8 additions & 1 deletion README.md
@@ -1,6 +1,13 @@
# Datapipe

`datapipe` is a realtime incremental ETL library for Python application.
[Datapipe](https://datapipe.dev/) is a real-time, incremental ETL library for Python with record-level dependency tracking.

The library is designed for describing data processing pipelines and is capable
of tracking dependencies for each record in the pipeline. This ensures that
tasks within the pipeline receive only the data that has been modified, thereby
improving the overall efficiency of data handling.

https://datapipe.dev/
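To make the record-level tracking concrete, a rough sketch of a pipeline definition follows. Import paths and constructor signatures are assumptions pieced together from names that appear elsewhere in this changeset (`Catalog`, `Table`, `BatchTransform`, `TableStoreJsonLine`) and are not verified against a specific datapipe release:

```python
# Rough sketch only: module paths and signatures below are assumptions, not
# the verified datapipe API. The point is the overall shape: tables backed by
# pluggable stores, plus transforms whose inputs/outputs are tracked per record.
import pandas as pd

from datapipe.compute import Catalog, Pipeline, Table  # assumed import path
from datapipe.core_steps import BatchTransform          # assumed import path
from datapipe.store.pandas import TableStoreJsonLine    # assumed import path


def clean_events(df: pd.DataFrame) -> pd.DataFrame:
    # Only records that changed since the previous run are passed in here;
    # unchanged rows are skipped thanks to per-record metadata.
    df["name"] = df["name"].str.strip().str.lower()
    return df


catalog = Catalog({
    "raw_events": Table(store=TableStoreJsonLine("data/raw_events.jsonline")),
    "clean_events": Table(store=TableStoreJsonLine("data/clean_events.jsonline")),
})

pipeline = Pipeline([
    BatchTransform(clean_events, inputs=["raw_events"], outputs=["clean_events"]),
])
```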

# Development

41 changes: 22 additions & 19 deletions datapipe/cli.py
@@ -198,9 +198,9 @@ def table():
pass


@table.command()
@table.command(name="list")
@click.pass_context
def list(ctx: click.Context) -> None:
def table_list(ctx: click.Context) -> None:
app: DatapipeApp = ctx.obj["pipeline"]

for table in sorted(app.catalog.catalog.keys()):
@@ -339,30 +339,27 @@ def to_human_repr(step: ComputeStep, extra_args: Optional[Dict] = None) -> str:
return "\n".join(res)


@step.command() # type: ignore
@step.command(name="list") # type: ignore
@click.option("--status", is_flag=True, type=click.BOOL, default=False)
@click.pass_context
def list(ctx: click.Context, status: bool) -> None: # noqa
def step_list(ctx: click.Context, status: bool) -> None: # noqa
app: DatapipeApp = ctx.obj["pipeline"]
steps: List[ComputeStep] = ctx.obj["steps"]

for step in steps:
extra_args = {}

if status:
if len(step.input_dts) > 0:
try:
if isinstance(step, BaseBatchTransformStep):
changed_idx_count = step.get_changed_idx_count(ds=app.ds)

if changed_idx_count > 0:
extra_args[
"changed_idx_count"
] = f"[red]{changed_idx_count}[/red]"

except NotImplementedError:
# Currently we do not support empty join_keys
extra_args["changed_idx_count"] = "[red]N/A[/red]"
try:
step_status = step.get_status(ds=app.ds)
extra_args["total_idx_count"] = str(step_status.total_idx_count)
extra_args["changed_idx_count"] = (
f"[red]{step_status.changed_idx_count}[/red]"
)
except NotImplementedError:
# Currently we do not support empty join_keys
extra_args["total_idx_count"] = "[red]N/A[/red]"
extra_args["changed_idx_count"] = "[red]N/A[/red]"

rprint(to_human_repr(step, extra_args=extra_args))
rprint("")
@@ -519,8 +516,14 @@ def migrate_transform_tables(ctx: click.Context, labels: str, name: str) -> None
return migrations_v013.migrate_transform_tables(app, batch_transforms_steps)


for entry_point in metadata.entry_points().get("datapipe.cli", []):
register_commands = entry_point.load()
try:
entry_points = metadata.entry_points(group="datapipe.cli") # type: ignore
except TypeError:
# Compatibility with older versions of importlib.metadata (Python 3.8-3.9)
entry_points = metadata.entry_points().get("datapipe.cli", []) # type: ignore

for entry_point in entry_points:
register_commands = entry_point.load() # type: ignore
register_commands(cli)


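The plugin-loading loop above calls each entry point in the `datapipe.cli` group with the root click group, so a third-party package can register its own subcommands. A hypothetical plugin module (all names here are illustrative, not from the repository) could look like this:

```python
# Hypothetical plugin module, e.g. my_datapipe_plugin/cli.py.
# The only contract implied by the loader loop is that the entry point in the
# "datapipe.cli" group resolves to a callable which receives the root click
# group and may attach commands to it.
#
# The plugin's pyproject.toml would then declare (illustrative):
#   [project.entry-points."datapipe.cli"]
#   my_plugin = "my_datapipe_plugin.cli:register_commands"
import click


def register_commands(cli: click.Group) -> None:
    @cli.command(name="hello-plugin")
    def hello_plugin() -> None:
        click.echo("registered via the datapipe.cli entry point group")
```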
