Skip to content

Commit

Permalink
Merge branch 'interface-updates-epic' into cli-refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson committed Dec 13, 2023
2 parents 1abfdde + e64953d commit 71c51da
Show file tree
Hide file tree
Showing 14 changed files with 131 additions and 558 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ boto3 = "*"
gene = {editable = true, path = "."}
gffutils = "*"
"biocommons.seqrepo" = "*"
wags-tails = ">=0.1.1"
psycopg = {version = "*", extras=["binary"]}
pytest = "*"
pre-commit = "*"
Expand Down
27 changes: 18 additions & 9 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,23 +88,32 @@ def linkcode_resolve(domain, info):
# -- sphinx-click ------------------------------------------------------------
from typing import List
import re

CMD_PATTERN = r"--[^ ]+"
STR_PATTERN = r"\"[^ ]+\""
SNAKE_PATTERN = r"[A-Z]+_[A-Z_]*[A-Z]"


def _add_formatting_to_string(line: str) -> str:
"""Add fixed-width code formatting to span sections in lines:
* shell options, eg `--update_all`
* strings, eg `"HGNC"`
* env vars, eg `GENE_NORM_REMOTE_DB_URL`
* double-quoted strings, eg `"HGNC"`
* all caps SNAKE_CASE env vars, eg `GENE_NORM_REMOTE_DB_URL`
"""
for pattern in (CMD_PATTERN, STR_PATTERN, SNAKE_PATTERN):
line = re.sub(pattern, lambda x: f"``{x.group()}``", line)
return line


def process_description(app, ctx, lines: List[str]):
"""Add custom formatting to sphinx-click autodocs"""
"""Add custom formatting to sphinx-click autodocs. This lets us write Click
docstrings in ways that look presentable in the CLI, but adds extra formatting when
generating Sphinx docs.
* add fixed-width (code) font to certain words
* add code block formatting to example shell commands
"""
# chop off params
param_boundary = None
for i, line in enumerate(lines):
Expand All @@ -118,7 +127,7 @@ def process_description(app, ctx, lines: List[str]):
# add code formatting to strings, commands, and env vars
lines_to_fmt = []
for i, line in enumerate(lines):
if line.startswith("% "):
if line.startswith(" ") or line.startswith(">>> "):
continue # skip example code blocks
if any([
re.findall(CMD_PATTERN, line),
Expand All @@ -129,13 +138,13 @@ def process_description(app, ctx, lines: List[str]):
for line_num in lines_to_fmt:
lines[line_num] = _add_formatting_to_string(lines[line_num])

# add code block formatting to example commands
# add code block formatting to example console commands
for i in range(len(lines) - 1, -1, -1):
if lines[i].startswith('% '):
lines[i] = " " + lines[i]
lines.insert(i, "")
lines.insert(i, ".. code-block:: sh")
if lines[i].startswith(' '):
lines.insert(i + 2, "")
if i == 0 or not lines[i - 1].startswith(" "):
lines.insert(i, "")
lines.insert(i, ".. code-block:: console")


def setup(app):
Expand Down
54 changes: 2 additions & 52 deletions docs/source/managing_data/loading_and_updating_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,63 +3,13 @@
Loading and updating data
=========================

The Gene Normalizer defines a command line tool for data management. It includes functions for refreshing data, checking database status, and for the PostgreSQL data backend, dumping to a local file and updating from a remote backup.

.. note::

See the :ref:`ETL API documentation<etl-api>` for information on programmatic access to the data loader classes.


Refreshing all data
-------------------

Calling the Gene Normalizer update command with the ``--all`` and ``--normalize`` flags will delete all existing data, fetch new source data if available, and then perform a complete reload of the database (including merged records):

.. code-block:: shell

    gene-normalizer update --all --normalize

Reload individual sources
-------------------------

To update specific sources, provide them as arguments to the ``update`` command. While it is possible to update individual source data without also updating the normalized record data, that may affect the proper function of the normalized query endpoints, so it is recommended to include the ``--normalize`` flag as well.

.. code-block:: shell

    gene-normalizer update --normalize HGNC NCBI

Use local data
--------------

The Gene Normalizer will fetch the latest available data from all sources if local data is out-of-date. To suppress this and force usage of local files, use the ``--use_existing`` flag:

.. code-block:: shell

    gene-normalizer update --all --use_existing

Check DB health
---------------

The command ``check-db`` performs a basic check on the database status. It first confirms that the database's schema exists, and then identifies whether metadata is available for each source, and whether gene record and normalized concept tables are non-empty. Check the process's exit code for the result (per the UNIX standard, ``0`` means success, and any other return code means failure).

.. code-block:: console

    $ gene-normalizer check-db
    $ echo $?
    1 # indicates failure

This command is equivalent to the combination of the database classes' ``check_schema_initialized`` and ``check_tables_populated`` methods:

.. code-block:: python

    from gene.database import create_db

    db = create_db()
    db_is_healthy = db.check_schema_initialized() and db.check_tables_populated()

Complete CLI documentation
--------------------------

.. click:: gene.cli:cli
:prog: gene-normalizer
:nested: full
6 changes: 1 addition & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,9 @@ dynamic = ["version"]

[project.optional-dependencies]
pg = ["psycopg[binary]"]

etl = ["gffutils", "biocommons.seqrepo"]

etl = ["gffutils", "biocommons.seqrepo", "wags-tails>=0.1.1"]
test = ["pytest>=6.0", "pytest-cov", "mock", "httpx"]

dev = ["pre-commit", "ruff>=0.1.2"]

docs = [
"sphinx==6.1.3",
"sphinx-autodoc-typehints==1.22.0",
Expand Down
14 changes: 12 additions & 2 deletions src/gene/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,18 @@ def update(
For example, the following command will update NCBI and HGNC source records:
% gene-normalizer update HGNC NCBI
$ gene-normalizer update HGNC NCBI
To completely reload all source records and construct normalized concepts, use the
--all and --normalize options:
% gene-normalizer update --all --normalize
$ gene-normalizer update --all --normalize
The Gene Normalizer will fetch the latest available data from all sources if local
data is out-of-date. To suppress this and force usage of local files only, use the
--use_existing flag:
$ gene-normalizer update --all --use_existing
\f
:param sources: tuple of raw names of sources to update
Expand Down Expand Up @@ -137,6 +143,10 @@ def check_db(db_url: str, verbose: bool = False) -> None:
"""Perform basic checks on DB health and population. Exits with status code 1
if DB schema is uninitialized or if critical tables appear to be empty.
$ gene-normalizer check-db
$ echo $?
1 # indicates failure
This command is equivalent to the combination of the database classes'
``check_schema_initialized()`` and ``check_tables_populated()`` methods:
Expand Down
149 changes: 47 additions & 102 deletions src/gene/etl/base.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
"""A base class for extraction, transformation, and loading of data."""
import datetime
import gzip
import logging
import re
import shutil
from abc import ABC, abstractmethod
from ftplib import FTP
from os import environ, remove
from os import environ
from pathlib import Path
from typing import Callable, Dict, List, Optional
from typing import Dict, List, Optional, Union

import click
import pydantic
from biocommons.seqrepo import SeqRepo
from dateutil import parser
from gffutils.feature import Feature
from wags_tails import EnsemblData, HgncData, NcbiGeneData

from gene.database import AbstractDatabase
from gene.schemas import ITEM_TYPES, Gene, GeneSequenceLocation, MatchType, SourceName
Expand All @@ -28,50 +25,75 @@
)


# Maps each gene source to its wags-tails data handler class; consumed by
# Base._get_data_handler to construct the appropriate fetcher for a source.
DATA_DISPATCH = {
SourceName.HGNC: HgncData,
SourceName.ENSEMBL: EnsemblData,
SourceName.NCBI: NcbiGeneData,
}


class Base(ABC):
"""The ETL base class."""

def __init__(
    self,
    database: AbstractDatabase,
    seqrepo_dir: Path = SEQREPO_ROOT_DIR,
    data_path: Optional[Path] = None,
    silent: bool = True,
) -> None:
    """Instantiate Base class.

    :param database: database instance
    :param seqrepo_dir: Path to seqrepo directory
    :param data_path: path to app data directory
    :param silent: if True, don't print ETL result to console
    """
    self._silent = silent
    # Source name is inferred from the concrete subclass's name.
    self._src_name = SourceName(self.__class__.__name__)
    self._data_source = self._get_data_handler(data_path)
    self._database = database
    self.seqrepo = self.get_seqrepo(seqrepo_dir)
    # Concept IDs successfully loaded during ETL; returned by perform_etl().
    self._processed_ids = list()

def _get_data_handler(
    self, data_path: Optional[Path] = None
) -> Union[HgncData, EnsemblData, NcbiGeneData]:
    """Construct data handler instance for source. Overwrite for edge-case sources.

    :param data_path: location of data storage
    :return: instance of wags_tails.DataSource to manage source file(s)
    """
    # Look up the handler class registered for this source, then build it.
    handler_class = DATA_DISPATCH[self._src_name]
    return handler_class(data_dir=data_path, silent=self._silent)

def perform_etl(self, use_existing: bool = False) -> List[str]:
    """Public-facing method to begin ETL procedures on given data.

    Returned concept IDs can be passed to Merge method for computing
    merged concepts.

    :param use_existing: if True, don't try to retrieve latest source data
    :return: list of concept IDs which were successfully processed and
        uploaded.
    """
    self._extract_data(use_existing)
    if not self._silent:
        click.echo("Transforming and loading data to DB...")
    self._add_meta()
    self._transform_data()
    # Commit everything written during _add_meta/_transform_data.
    self._database.complete_write_transaction()
    return self._processed_ids

def _extract_data(self, use_existing: bool) -> None:
    """Acquire source data.

    This method is responsible for initializing an instance of a data handler and,
    in most cases, setting ``self._data_file`` and ``self._version``.

    :param use_existing: if True, don't try to fetch latest source data
    """
    self._data_file, self._version = self._data_source.get_latest(
        from_local=use_existing
    )

@abstractmethod
def _transform_data(self) -> None:
Expand All @@ -83,40 +105,6 @@ def _add_meta(self) -> None:
"""Add source meta to database source info."""
raise NotImplementedError

def _acquire_data_file(
    self,
    file_glob: str,
    use_existing: bool,
    check_latest_callback: Callable[[Path], bool],
    download_callback: Callable[[], Path],
) -> Path:
    """Locate a source data file, preferring local copies where permitted.

    :param file_glob: pattern to match relevant files against
    :param use_existing: don't fetch from remote origin if local versions are
        available
    :param check_latest_callback: function to check whether local data is up-to-date
    :param download_callback: function to download from remote
    :return: path to acquired data file
    :raise FileNotFoundError: if unable to find any files matching the pattern
    """
    candidates = sorted(self.src_data_dir.glob(file_glob))
    if not candidates:
        # Nothing local: either fail (local-only mode) or fetch from remote.
        if use_existing:
            raise FileNotFoundError(
                f"No local files matching pattern {self.src_data_dir.absolute().as_uri() + file_glob}"
            )
        return download_callback()
    # Filenames sort such that the lexicographically greatest is newest.
    newest = candidates[-1]
    if use_existing or check_latest_callback(newest):
        return newest
    return download_callback()

def _load_gene(self, gene: Dict) -> None:
"""Load a gene record into database. This method takes responsibility for:
* validating structure correctness
Expand Down Expand Up @@ -147,49 +135,6 @@ def _load_gene(self, gene: Dict) -> None:
self._database.add_record(gene, self._src_name)
self._processed_ids.append(concept_id)

def _ftp_download(
    self, host: str, data_dir: str, fn: str, source_dir: Path, data_fn: str
) -> Optional[str]:
    """Download data file from FTP site.

    :param host: Source's FTP host name
    :param data_dir: Data directory located on FTP site
    :param fn: Filename for downloaded file
    :param source_dir: Source's data directory
    :param data_fn: Filename on FTP site to be downloaded
    :return: Date file was last updated, as a ``YYYYMMDD`` version string
    """
    with FTP(host) as ftp:
        ftp.login()
        # MDTM responses carry a status prefix; strip it to get the timestamp.
        mdtm_stamp = ftp.voidcmd(f"MDTM {data_dir}{data_fn}")[4:].strip()
        mod_date = str(parser.parse(mdtm_stamp)).split()[0]
        version = datetime.datetime.strptime(mod_date, "%Y-%m-%d").strftime("%Y%m%d")
        ftp.cwd(data_dir)
        self._ftp_download_file(ftp, data_fn, source_dir, fn)
        return version

def _ftp_download_file(
    self, ftp: FTP, data_fn: str, source_dir: Path, fn: str
) -> None:
    """Download data file from FTP

    :param ftp: FTP instance
    :param data_fn: Filename on FTP site to be downloaded
    :param source_dir: Source's data directory
    :param fn: Filename for downloaded file
    """
    gzipped = data_fn.endswith(".gz")
    # Keep the .gz suffix on the intermediate download so decompression
    # below writes the final file under the requested name.
    target = source_dir / (f"{fn}.gz" if gzipped else fn)
    with open(target, "wb") as fp:
        ftp.retrbinary(f"RETR {data_fn}", fp.write)
    if gzipped:
        with gzip.open(target, "rb") as f_in:
            with open(source_dir / fn, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        remove(target)

def get_seqrepo(self, seqrepo_dir: Path) -> SeqRepo:
"""Return SeqRepo instance if seqrepo_dir exists.
Expand Down
Loading

0 comments on commit 71c51da

Please sign in to comment.