Skip to content

Commit

Permalink
Merge branch 'interface-updates-epic' into cli-refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson committed Dec 13, 2023
2 parents 1abfdde + e64953d commit 71c51da
Show file tree
Hide file tree
Showing 14 changed files with 131 additions and 558 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ boto3 = "*"
gene = {editable = true, path = "."}
gffutils = "*"
"biocommons.seqrepo" = "*"
wags-tails = ">=0.1.1"
psycopg = {version = "*", extras=["binary"]}
pytest = "*"
pre-commit = "*"
Expand Down
27 changes: 18 additions & 9 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,23 +88,32 @@ def linkcode_resolve(domain, info):
# -- sphinx-click ------------------------------------------------------------
from typing import List
import re

CMD_PATTERN = r"--[^ ]+"
STR_PATTERN = r"\"[^ ]+\""
SNAKE_PATTERN = r"[A-Z]+_[A-Z_]*[A-Z]"


def _add_formatting_to_string(line: str) -> str:
"""Add fixed-width code formatting to span sections in lines:
* shell options, eg `--update_all`
* strings, eg `"HGNC"`
* env vars, eg `GENE_NORM_REMOTE_DB_URL`
* double-quoted strings, eg `"HGNC"`
* all caps SNAKE_CASE env vars, eg `GENE_NORM_REMOTE_DB_URL`
"""
for pattern in (CMD_PATTERN, STR_PATTERN, SNAKE_PATTERN):
line = re.sub(pattern, lambda x: f"``{x.group()}``", line)
return line


def process_description(app, ctx, lines: List[str]):
"""Add custom formatting to sphinx-click autodocs"""
"""Add custom formatting to sphinx-click autodocs. This lets us write Click
docstrings in ways that look presentable in the CLI, but adds extra formatting when
generating Sphinx docs.
* add fixed-width (code) font to certain words
* add code block formatting to example shell commands
"""
# chop off params
param_boundary = None
for i, line in enumerate(lines):
Expand All @@ -118,7 +127,7 @@ def process_description(app, ctx, lines: List[str]):
# add code formatting to strings, commands, and env vars
lines_to_fmt = []
for i, line in enumerate(lines):
if line.startswith("% "):
if line.startswith(" ") or line.startswith(">>> "):
continue # skip example code blocks
if any([
re.findall(CMD_PATTERN, line),
Expand All @@ -129,13 +138,13 @@ def process_description(app, ctx, lines: List[str]):
for line_num in lines_to_fmt:
lines[line_num] = _add_formatting_to_string(lines[line_num])

# add code block formatting to example commands
# add code block formatting to example console commands
for i in range(len(lines) - 1, -1, -1):
if lines[i].startswith('% '):
lines[i] = " " + lines[i]
lines.insert(i, "")
lines.insert(i, ".. code-block:: sh")
if lines[i].startswith(' '):
lines.insert(i + 2, "")
if i == 0 or not lines[i - 1].startswith(" "):
lines.insert(i, "")
lines.insert(i, ".. code-block:: console")


def setup(app):
Expand Down
54 changes: 2 additions & 52 deletions docs/source/managing_data/loading_and_updating_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,63 +3,13 @@
Loading and updating data
=========================

The Gene Normalizer defines a command line tool for data management. It includes functions for refreshing data, checking database status, and for the PostgreSQL data backend, dumping to a local file and updating from a remote backup.

.. note::

See the :ref:`ETL API documentation<etl-api>` for information on programmatic access to the data loader classes.


Refreshing all data
-------------------

Calling the Gene Normalizer update command with the ``--all`` and ``--normalize`` flags will delete all existing data, fetch new source data if available, and then perform a complete reload of the database (including merged records):

.. code-block:: shell

    gene-normalizer update --all --normalize

Reload individual sources
-------------------------

To update specific sources, provide them as arguments to the ``update`` command. While it is possible to update individual source data without also updating the normalized record data, that may affect the proper function of the normalized query endpoints, so it is recommended to include the ``--normalize`` flag as well.

.. code-block:: shell

    gene-normalizer update --normalize HGNC NCBI

Use local data
--------------

The Gene Normalizer will fetch the latest available data from all sources if local data is out-of-date. To suppress this and force usage of local files, use the ``--use_existing`` flag:

.. code-block:: shell

    gene-normalizer update --all --use_existing

Check DB health
---------------

The command ``check-db`` performs a basic check on the database status. It first confirms that the database's schema exists, and then identifies whether metadata is available for each source, and whether gene record and normalized concept tables are non-empty. Check the process's exit code for the result (per the UNIX standard, ``0`` means success, and any other return code means failure).

.. code-block:: console

    $ gene-normalizer check-db
    $ echo $?
    1 # indicates failure

This command is equivalent to the combination of the database classes' ``check_schema_initialized`` and ``check_tables_populated`` methods:

.. code-block:: python

    from gene.database import create_db

    db = create_db()
    db_is_healthy = db.check_schema_initialized() and db.check_tables_populated()

Complete CLI documentation
--------------------------

.. click:: gene.cli:cli
:prog: gene-normalizer
:nested: full
6 changes: 1 addition & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,9 @@ dynamic = ["version"]

[project.optional-dependencies]
pg = ["psycopg[binary]"]

etl = ["gffutils", "biocommons.seqrepo"]

etl = ["gffutils", "biocommons.seqrepo", "wags-tails>=0.1.1"]
test = ["pytest>=6.0", "pytest-cov", "mock", "httpx"]

dev = ["pre-commit", "ruff>=0.1.2"]

docs = [
"sphinx==6.1.3",
"sphinx-autodoc-typehints==1.22.0",
Expand Down
14 changes: 12 additions & 2 deletions src/gene/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,18 @@ def update(
For example, the following command will update NCBI and HGNC source records:
% gene-normalizer update HGNC NCBI
$ gene-normalizer update HGNC NCBI
To completely reload all source records and construct normalized concepts, use the
--all and --normalize options:
% gene-normalizer update --all --normalize
$ gene-normalizer update --all --normalize
The Gene Normalizer will fetch the latest available data from all sources if local
data is out-of-date. To suppress this and force usage of local files only, use the
--use_existing flag:
$ gene-normalizer update --all --use_existing
\f
:param sources: tuple of raw names of sources to update
Expand Down Expand Up @@ -137,6 +143,10 @@ def check_db(db_url: str, verbose: bool = False) -> None:
"""Perform basic checks on DB health and population. Exits with status code 1
if DB schema is uninitialized or if critical tables appear to be empty.
$ gene-normalizer check-db
$ echo $?
1 # indicates failure
This command is equivalent to the combination of the database classes'
``check_schema_initialized()`` and ``check_tables_populated()`` methods:
Expand Down
149 changes: 47 additions & 102 deletions src/gene/etl/base.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
"""A base class for extraction, transformation, and loading of data."""
import datetime
import gzip
import logging
import re
import shutil
from abc import ABC, abstractmethod
from ftplib import FTP
from os import environ, remove
from os import environ
from pathlib import Path
from typing import Callable, Dict, List, Optional
from typing import Dict, List, Optional, Union

import click
import pydantic
from biocommons.seqrepo import SeqRepo
from dateutil import parser
from gffutils.feature import Feature
from wags_tails import EnsemblData, HgncData, NcbiGeneData

from gene.database import AbstractDatabase
from gene.schemas import ITEM_TYPES, Gene, GeneSequenceLocation, MatchType, SourceName
Expand All @@ -28,50 +25,75 @@
)


# Maps each gene source to its wags-tails data handler class; consumed by
# Base._get_data_handler to construct the appropriate fetcher for a source.
DATA_DISPATCH = {
SourceName.HGNC: HgncData,
SourceName.ENSEMBL: EnsemblData,
SourceName.NCBI: NcbiGeneData,
}


class Base(ABC):
"""The ETL base class."""

def __init__(
    self,
    database: AbstractDatabase,
    seqrepo_dir: Path = SEQREPO_ROOT_DIR,
    data_path: Optional[Path] = None,
    silent: bool = True,
) -> None:
    """Instantiate Base class.

    :param database: database instance
    :param seqrepo_dir: Path to seqrepo directory
    :param data_path: path to app data directory
    :param silent: if True, don't print ETL result to console
    """
    self._silent = silent
    # Source name is inferred from the concrete subclass's name.
    self._src_name = SourceName(self.__class__.__name__)
    self._data_source = self._get_data_handler(data_path)
    self._database = database
    self.seqrepo = self.get_seqrepo(seqrepo_dir)
    # Concept IDs successfully loaded during ETL; returned by perform_etl().
    self._processed_ids = list()

def _get_data_handler(
    self, data_path: Optional[Path] = None
) -> Union[HgncData, EnsemblData, NcbiGeneData]:
    """Construct data handler instance for source. Overwrite for edge-case sources.

    :param data_path: location of data storage
    :return: instance of wags_tails.DataSource to manage source file(s)
    """
    # Look up the handler class registered for this source, then build it.
    handler_class = DATA_DISPATCH[self._src_name]
    return handler_class(data_dir=data_path, silent=self._silent)

def perform_etl(self, use_existing: bool = False) -> List[str]:
    """Public-facing method to begin ETL procedures on given data.

    Returned concept IDs can be passed to Merge method for computing
    merged concepts.

    :param use_existing: if True, don't try to retrieve latest source data
    :return: list of concept IDs which were successfully processed and
        uploaded.
    """
    self._extract_data(use_existing)
    if not self._silent:
        click.echo("Transforming and loading data to DB...")
    self._add_meta()
    self._transform_data()
    # Commit everything written during _add_meta/_transform_data.
    self._database.complete_write_transaction()
    return self._processed_ids

def _extract_data(self, use_existing: bool) -> None:
    """Acquire source data.

    This method is responsible for initializing an instance of a data handler and,
    in most cases, setting ``self._data_file`` and ``self._version``.

    :param use_existing: if True, don't try to fetch latest source data
    """
    self._data_file, self._version = self._data_source.get_latest(
        from_local=use_existing
    )

@abstractmethod
def _transform_data(self) -> None:
Expand All @@ -83,40 +105,6 @@ def _add_meta(self) -> None:
"""Add source meta to database source info."""
raise NotImplementedError

def _acquire_data_file(
    self,
    file_glob: str,
    use_existing: bool,
    check_latest_callback: Callable[[Path], bool],
    download_callback: Callable[[], Path],
) -> Path:
    """Locate a source data file, preferring local copies where permitted.

    :param file_glob: pattern to match relevant files against
    :param use_existing: don't fetch from remote origin if local versions are
        available
    :param check_latest_callback: function to check whether local data is up-to-date
    :param download_callback: function to download from remote
    :return: path to acquired data file
    :raise FileNotFoundError: if unable to find any files matching the pattern
    """
    candidates = sorted(self.src_data_dir.glob(file_glob))
    if not candidates:
        # Nothing local: either fail (local-only mode) or fetch from remote.
        if use_existing:
            raise FileNotFoundError(
                f"No local files matching pattern {self.src_data_dir.absolute().as_uri() + file_glob}"
            )
        return download_callback()
    # Filenames sort such that the lexicographically greatest is newest.
    newest = candidates[-1]
    if use_existing or check_latest_callback(newest):
        return newest
    return download_callback()

def _load_gene(self, gene: Dict) -> None:
"""Load a gene record into database. This method takes responsibility for:
* validating structure correctness
Expand Down Expand Up @@ -147,49 +135,6 @@ def _load_gene(self, gene: Dict) -> None:
self._database.add_record(gene, self._src_name)
self._processed_ids.append(concept_id)

def _ftp_download(
    self, host: str, data_dir: str, fn: str, source_dir: Path, data_fn: str
) -> Optional[str]:
    """Download data file from FTP site.

    :param host: Source's FTP host name
    :param data_dir: Data directory located on FTP site
    :param fn: Filename for downloaded file
    :param source_dir: Source's data directory
    :param data_fn: Filename on FTP site to be downloaded
    :return: Date file was last updated, as a ``YYYYMMDD`` version string
    """
    with FTP(host) as ftp:
        ftp.login()
        # MDTM responses carry a status prefix; strip it to get the timestamp.
        mdtm_stamp = ftp.voidcmd(f"MDTM {data_dir}{data_fn}")[4:].strip()
        mod_date = str(parser.parse(mdtm_stamp)).split()[0]
        version = datetime.datetime.strptime(mod_date, "%Y-%m-%d").strftime("%Y%m%d")
        ftp.cwd(data_dir)
        self._ftp_download_file(ftp, data_fn, source_dir, fn)
        return version

def _ftp_download_file(
    self, ftp: FTP, data_fn: str, source_dir: Path, fn: str
) -> None:
    """Download data file from FTP

    :param ftp: FTP instance
    :param data_fn: Filename on FTP site to be downloaded
    :param source_dir: Source's data directory
    :param fn: Filename for downloaded file
    """
    gzipped = data_fn.endswith(".gz")
    # Keep the .gz suffix on the intermediate download so decompression
    # below writes the final file under the requested name.
    target = source_dir / (f"{fn}.gz" if gzipped else fn)
    with open(target, "wb") as fp:
        ftp.retrbinary(f"RETR {data_fn}", fp.write)
    if gzipped:
        with gzip.open(target, "rb") as f_in:
            with open(source_dir / fn, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        remove(target)

def get_seqrepo(self, seqrepo_dir: Path) -> SeqRepo:
"""Return SeqRepo instance if seqrepo_dir exists.
Expand Down
Loading

0 comments on commit 71c51da

Please sign in to comment.