diff --git a/README.md b/README.md index d513810..e6ac082 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,21 @@ # agct: Another Genome Conversion Tool -Drop-in replacement for the [pyliftover](https://github.com/konstantint/pyliftover) tool, using the St. Jude's [chainfile](https://docs.rs/chainfile/latest/chainfile/) crate. Enables significantly faster chainfile loading from cold start (see `analysis/`). +[![image](https://img.shields.io/pypi/v/agct.svg)](https://pypi.python.org/pypi/agct) +[![image](https://img.shields.io/pypi/l/agct.svg)](https://pypi.python.org/pypi/agct) +[![image](https://img.shields.io/pypi/pyversions/agct.svg)](https://pypi.python.org/pypi/agct) +[![Actions status](https://github.com/genomicmedlab/agct/workflows/CI/badge.svg)](https://github.com/genomicmedlab/agct/actions) -Status: alpha. + +A drop-in replacement for the [pyliftover](https://github.com/konstantint/pyliftover) tool, using the St. Jude's [chainfile](https://docs.rs/chainfile/latest/chainfile/) crate. Enables significantly faster chainfile loading from cold start (see `analysis/`). + + +## Installation + +Install from [PyPI](https://pypi.org/project/agct/): + +```shell +python3 -m pip install agct +``` ## Usage @@ -13,11 +26,13 @@ from agct import Converter c = Converter("hg38", "hg19") ``` +> If a chainfile is unavailable locally, it's downloaded from UCSC and saved using the `wags-tails` package -- see the [configuration instructions](https://github.com/GenomicMedLab/wags-tails?tab=readme-ov-file#configuration) for information on how to designate a non-default storage location. + Call ``convert_coordinate()``: ```python3 c.convert_coordinate("chr7", 140453136, "+") -# [['chr7', '140152936', '+']] +# [['chr7', 140152936, '+']] ``` ## Development @@ -44,6 +59,19 @@ This installs Python code as editable, but after any changes to Rust code, ``mat maturin develop ``` +Check Python style with `ruff`: + +```shell +python3 -m ruff format . && python3 -m ruff check --fix . 
+``` + +Use `cargo fmt` to check Rust style (must be run from within the `rust/` subdirectory): + +```shell +cd rust/ +cargo fmt +``` + Run tests with `pytest`: ```shell diff --git a/pyproject.toml b/pyproject.toml index e12da67..42c00c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,11 @@ [project] name = "agct" -version = "0.1.0-dev0" +version = "0.1.0-dev1" authors = [ - {name = "James Stevenson"} + {name = "James Stevenson"}, + {name = "Kori Kuzma"}, ] -description = "Another Genome Conversion Tool: Python frontend to Rust chainfile crate" readme = "README.md" -license = {file = "LICENSE"} -requires-python = ">=3.8" classifiers = [ "Development Status :: 3 - Alpha", "Programming Language :: Rust", @@ -18,10 +16,14 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Scientific/Engineering :: Bio-Informatics", "Intended Audience :: Science/Research", "License :: OSI Approved :: MIT License", ] +requires-python = ">=3.8" +description = "Another Genome Conversion Tool: Python frontend to Rust chainfile crate" +license = {file = "LICENSE"} dependencies = ["wags-tails"] [project.optional-dependencies] @@ -31,7 +33,7 @@ tests = [ ] dev = [ "maturin", - "ruff>=0.1.12", + "ruff==0.2.0", "pre-commit", ] @@ -60,15 +62,51 @@ branch = true [tool.ruff] src = ["src"] -# pycodestyle (E, W) -# Pyflakes (F) -# flake8-annotations (ANN) -# pydocstyle (D) -# pep8-naming (N) -# isort (I) -select = ["E", "W", "F", "ANN", "D", "N", "I"] -fixable = ["I", "F401"] +[tool.ruff.lint] +select = [ + "F", # https://docs.astral.sh/ruff/rules/#pyflakes-f + "E", "W", # https://docs.astral.sh/ruff/rules/#pycodestyle-e-w + "I", # https://docs.astral.sh/ruff/rules/#isort-i + "N", # https://docs.astral.sh/ruff/rules/#pep8-naming-n + "D", # https://docs.astral.sh/ruff/rules/#pydocstyle-d + "UP", # 
https://docs.astral.sh/ruff/rules/#pyupgrade-up + "ANN", # https://docs.astral.sh/ruff/rules/#flake8-annotations-ann + "ASYNC", # https://docs.astral.sh/ruff/rules/#flake8-async-async + "S", # https://docs.astral.sh/ruff/rules/#flake8-bandit-s + "B", # https://docs.astral.sh/ruff/rules/#flake8-bugbear-b + "A", # https://docs.astral.sh/ruff/rules/#flake8-builtins-a + "C4", # https://docs.astral.sh/ruff/rules/#flake8-comprehensions-c4 + "DTZ", # https://docs.astral.sh/ruff/rules/#flake8-datetimez-dtz + "T10", # https://docs.astral.sh/ruff/rules/#flake8-debugger-t10 + "EM", # https://docs.astral.sh/ruff/rules/#flake8-errmsg-em + "G", # https://docs.astral.sh/ruff/rules/#flake8-logging-format-g + "PIE", # https://docs.astral.sh/ruff/rules/#flake8-pie-pie + "T20", # https://docs.astral.sh/ruff/rules/#flake8-print-t20 + "PT", # https://docs.astral.sh/ruff/rules/#flake8-pytest-style-pt + "Q", # https://docs.astral.sh/ruff/rules/#flake8-quotes-q + "RSE", # https://docs.astral.sh/ruff/rules/#flake8-raise-rse + "RET", # https://docs.astral.sh/ruff/rules/#flake8-return-ret + "SIM", # https://docs.astral.sh/ruff/rules/#flake8-simplify-sim + "PTH", # https://docs.astral.sh/ruff/rules/#flake8-use-pathlib-pth + "PGH", # https://docs.astral.sh/ruff/rules/#pygrep-hooks-pgh + "RUF", # https://docs.astral.sh/ruff/rules/#ruff-specific-rules-ruf +] +fixable = [ + "I", + "F401", + "D", + "UP", + "ANN", + "B", + "C4", + "G", + "PIE", + "PT", + "RSE", + "SIM", + "RUF" +] # ANN101 - missing-type-self # ANN003 - missing-type-kwargs # D203 - one-blank-line-before-class @@ -83,16 +121,20 @@ fixable = ["I", "F401"] # E117 - over-indented* # E501 - line-too-long* # W191 - tab-indentation* +# S321 - suspicious-ftp-lib-usage # *ignored for compatibility with formatter ignore = [ "ANN101", "ANN003", "D203", "D205", "D206", "D213", "D300", "D400", "D415", "E111", "E114", "E117", "E501", - "W191" + "W191", + "S321", ] -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] # ANN001 - 
missing-type-function-argument -# ANN102 - missing-type-cls # ANN2 - missing-return-type -"tests/*" = ["ANN001", "ANN102", "ANN2"] +# ANN102 - missing-type-cls +# S101 - assert +# B011 - assert-false +"tests/*" = ["ANN001", "ANN2", "ANN102", "S101", "B011"] diff --git a/src/agct/converter.py b/src/agct/converter.py index e7567cd..0a7b145 100644 --- a/src/agct/converter.py +++ b/src/agct/converter.py @@ -2,7 +2,7 @@ import logging from enum import Enum from pathlib import Path -from typing import Callable, List, Tuple +from typing import Callable, List, Optional, Tuple from wags_tails import CustomData from wags_tails.utils.downloads import download_http, handle_gzip @@ -36,35 +36,51 @@ class Converter: association. """ - def __init__(self, from_db: Genome, to_db: Genome) -> None: + def __init__( + self, + from_db: Optional[Genome] = None, + to_db: Optional[Genome] = None, + chainfile: Optional[str] = None, + ) -> None: """Initialize liftover instance. - :param from_db: database name, e.g. ``"19"`` - :param to_db: database name, e.g. ``"38"`` + :param from_db: database name, e.g. ``"19"``. Must be different than ``to_db`` + If ``chainfile`` is provided, will ignore this argument + :param to_db: database name, e.g. ``"38"``. Must be different than ``from_db`` + If ``chainfile`` is provided, will ignore this argument + :param chainfile: Path to chainfile + If not provided, must provide both ``from_db`` and ``to_db`` so that + ``wags-tails`` can download the corresponding chainfile + :raise ValueError: if required arguments are not passed or are invalid :raise FileNotFoundError: if unable to open corresponding chainfile :raise _core.ChainfileError: if unable to read chainfile (i.e. 
it's invalid) """ - if from_db == to_db: - raise ValueError("Liftover must be to/from different sources.") - if not isinstance(from_db, Genome): - from_db = Genome(from_db) - if not isinstance(to_db, Genome): - to_db = Genome(to_db) - data_handler = CustomData( - f"chainfile_{from_db.value}_to_{to_db.value}", - "chain", - lambda: "", - self._download_function_builder(from_db, to_db), - data_dir=get_data_dir() / "ucsc-chainfile", - ) - file, _ = data_handler.get_latest() + if not chainfile: + if from_db is None and to_db is None: + msg = "Must provide both `from_db` and `to_db`" + raise ValueError(msg) + + if from_db == to_db: + msg = "Liftover must be to/from different sources." + raise ValueError(msg) + + data_handler = CustomData( + f"chainfile_{from_db.value}_to_{to_db.value}", + "chain", + lambda: "", + self._download_function_builder(from_db, to_db), + data_dir=get_data_dir() / "ucsc-chainfile", + ) + file, _ = data_handler.get_latest() + chainfile = str(file.absolute()) + try: - self._converter = _core.Converter(str(file.absolute())) + self._converter = _core.Converter(chainfile) except FileNotFoundError as e: - _logger.error("Unable to open chainfile located at %s", file.absolute()) + _logger.error("Unable to open chainfile located at %s", chainfile) raise e except _core.ChainfileError as e: - _logger.error("Error reading chainfile located at %s", file.absolute()) + _logger.error("Error reading chainfile located at %s", chainfile) raise e @staticmethod @@ -104,7 +120,7 @@ def convert_coordinate( c = Converter("hg19", "hg38") c.convert_coordinate("chr7", 140453136, Strand.POSITIVE) - # returns [['chr7', '140753336', '+']] + # returns [['chr7', 140753336, '+']] :param chrom: chromosome name as given in chainfile. Usually e.g. ``"chr7"``. 
diff --git a/tests/test_converter.py b/tests/test_converter.py new file mode 100644 index 0000000..6db2ce7 --- /dev/null +++ b/tests/test_converter.py @@ -0,0 +1,21 @@ +"""Module for testing Converter initialization""" +import pytest +from tests.conftest import DATA_DIR + +from agct import Converter, Genome + + +def test_valid(): + """Test valid initialization""" + assert Converter( + chainfile=str(DATA_DIR / "ucsc-chainfile" / "chainfile_hg19_to_hg38_.chain") + ) + + +def test_invalid(): + """Test invalid initialization""" + with pytest.raises(ValueError, match="Must provide both `from_db` and `to_db`"): + Converter() + + with pytest.raises(ValueError, match="Liftover must be to/from different sources."): + Converter(Genome.HG19, Genome.HG19)