Skip to content

Commit

Permalink
Merge pull request #73 from cancervariants/issue-57
Browse files Browse the repository at this point in the history
Issue 57
  • Loading branch information
korikuzma authored Aug 17, 2021
2 parents 7d40990 + 3ab1656 commit 6dd633c
Show file tree
Hide file tree
Showing 20 changed files with 1,920 additions and 3,324 deletions.
4 changes: 0 additions & 4 deletions .github/workflows/github-actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,5 @@ jobs:
chmod +x ./tests/unit/dynamodb_build.bash
./tests/unit/dynamodb_build.bash
- name: Load and Test DynamoDB
run: |
pipenv run pytest tests/unit/test_database.py
- run: pipenv run flake8
- run: pipenv run pytest tests/
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ coverage = "*"
coveralls = "*"
pytest-cov = "*"
jupyterlab = "*"
mock = "*"
25 changes: 19 additions & 6 deletions gene/etl/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,24 @@
class Base(ABC):
"""The ETL base class."""

def __init__(self, database: Database, host: str, data_dir: str, *args,
**kwargs) -> None:
def __init__(self, database: Database, host: str, data_dir: str,
src_data_dir: Path,
seqrepo_dir=PROJECT_ROOT / 'data' / 'seqrepo' / 'latest',
*args, **kwargs) -> None:
"""Instantiate Base class.
:param Database database: DynamoDB database
:param str host: Hostname of FTP site
:param str data_dir: Data directory of FTP site to look at
:param Path src_data_dir: Data directory for source
:param Path seqrepo_dir: Path to seqrepo directory
"""
self._database = database
self._host = host
self._data_dir = data_dir
self.src_data_dir = src_data_dir
self._processed_ids = list()
self.seqrepo = self.get_seqrepo(seqrepo_dir)

@abstractmethod
def perform_etl(self) -> List[str]:
Expand All @@ -59,6 +65,10 @@ def _add_meta(self, *args, **kwargs) -> None:
"""Add source meta to DynamoDB table."""
raise NotImplementedError

def _create_data_directory(self):
    """Make sure the data directory for this source exists.

    Calls ``mkdir`` on ``self.src_data_dir`` with ``parents=True`` and
    ``exist_ok=True``.
    """
    target_dir = self.src_data_dir
    target_dir.mkdir(parents=True, exist_ok=True)

def _load_meta(self, db, metadata, source_name) -> None:
"""Load source metadata into database.
Expand Down Expand Up @@ -148,9 +158,12 @@ def _ftp_download(self, host: str, data_dir: str, fn: str,
remove(filepath)
return version

def get_seqrepo(self) -> SeqRepo:
"""Return SeqRepo instance."""
seqrepo_dir = PROJECT_ROOT / 'data' / 'seqrepo' / 'latest'
def get_seqrepo(self, seqrepo_dir) -> SeqRepo:
"""Return SeqRepo instance if seqrepo_dir exists.
:param Path seqrepo_dir: Path to seqrepo directory
:return: SeqRepo instance
"""
if not seqrepo_dir.exists():
raise NotADirectoryError("Could not find gene/data/seqrepo/latest")
raise NotADirectoryError(f"Could not find {seqrepo_dir}")
return SeqRepo(seqrepo_dir)
19 changes: 9 additions & 10 deletions gene/etl/ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,18 @@ def __init__(self,
database: Database,
host='ftp.ensembl.org',
data_dir='pub/',
src_data_dir=PROJECT_ROOT / 'data' / 'ensembl',
version='104'
):
"""Initialize Ensembl ETL class.
:param Database database: DynamoDB database
:param str host: FTP host name
:param str data_dir: FTP data directory to use
:param Path src_data_dir: Data directory for Ensembl
:param int version: Version for fn
"""
super().__init__(database, host, data_dir)
super().__init__(database, host, data_dir, src_data_dir)
self._sequence_location = SequenceLocation()
self._host = host
self._data_dir = data_dir
Expand All @@ -41,15 +43,14 @@ def __init__(self,
def _download_data(self):
"""Download Ensembl GFF3 data file."""
logger.info('Downloading Ensembl data file...')
ens_dir = PROJECT_ROOT / 'data' / 'ensembl'
ens_dir.mkdir(exist_ok=True, parents=True)
self._create_data_directory()
new_fn = f'ensembl_{self._version}.gff3'
if not (ens_dir / new_fn).exists():
if not (self.src_data_dir / new_fn).exists():
self._ftp_download(self._host,
f'{self._data_dir}release-{self._version}'
f'/gff3/homo_sapiens/',
new_fn,
ens_dir,
self.src_data_dir,
self._fn)
logger.info('Successfully downloaded Ensembl data file.')

Expand All @@ -58,8 +59,7 @@ def _extract_data(self, *args, **kwargs):
if 'data_path' in kwargs:
self._data_src = kwargs['data_path']
else:
ensembl_dir = PROJECT_ROOT / 'data' / 'ensembl'
self._data_src = sorted(list(ensembl_dir.iterdir()))[-1]
self._data_src = sorted(list(self.src_data_dir.iterdir()))[-1]

def _transform_data(self, *args, **kwargs):
"""Transform the Ensembl source."""
Expand All @@ -70,8 +70,6 @@ def _transform_data(self, *args, **kwargs):
merge_strategy="create_unique",
keep_order=True)

sr = self.get_seqrepo()

# Get accession numbers
accession_numbers = dict()
for item in db.features_of_type('scaffold'):
Expand All @@ -84,7 +82,8 @@ def _transform_data(self, *args, **kwargs):
if f.attributes.get('ID'):
f_id = f.attributes.get('ID')[0].split(':')[0]
if f_id == 'gene':
gene = self._add_gene(f, sr, accession_numbers)
gene = \
self._add_gene(f, self.seqrepo, accession_numbers)
if gene:
self._load_gene(gene, batch)
logger.info('Successfully transformed Ensembl.')
Expand Down
16 changes: 8 additions & 8 deletions gene/etl/hgnc.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,18 @@ def __init__(self,
database: Database,
host='ftp.ebi.ac.uk',
data_dir='pub/databases/genenames/hgnc/json/',
src_data_dir=PROJECT_ROOT / 'data' / 'hgnc',
fn='hgnc_complete_set.json'
):
"""Initialize HGNC ETL class.
:param Database database: DynamoDB database
:param str host: FTP host name
:param str data_dir: FTP data directory to use
:param Path src_data_dir: Data directory for HGNC
:param str fn: Data file to download
"""
super().__init__(database, host, data_dir)
super().__init__(database, host, data_dir, src_data_dir)
self._chromosome_location = ChromosomeLocation()
self._data_url = f"ftp://{host}/{data_dir}{fn}"
self._fn = fn
Expand All @@ -39,23 +41,21 @@ def __init__(self,
def _download_data(self, *args, **kwargs):
"""Download HGNC JSON data file."""
logger.info('Downloading HGNC data file...')
hgnc_data_dir = PROJECT_ROOT / 'data' / 'hgnc'
hgnc_data_dir.mkdir(exist_ok=True, parents=True)
self._create_data_directory()
tmp_fn = 'hgnc_version.json'
self._version = \
self._ftp_download(self._host, self._data_dir, tmp_fn,
hgnc_data_dir, self._fn)
shutil.move(f"{hgnc_data_dir}/{tmp_fn}",
f"{hgnc_data_dir}/hgnc_{self._version}.json")
self.src_data_dir, self._fn)
shutil.move(f"{self.src_data_dir}/{tmp_fn}",
f"{self.src_data_dir}/hgnc_{self._version}.json")
logger.info('Successfully downloaded HGNC data file.')

def _extract_data(self, *args, **kwargs):
"""Extract data from the HGNC source."""
if 'data_path' in kwargs:
self._data_src = kwargs['data_path']
else:
hgnc_dir = PROJECT_ROOT / 'data' / 'hgnc'
self._data_src = sorted(list(hgnc_dir.iterdir()))[-1]
self._data_src = sorted(list(self.src_data_dir.iterdir()))[-1]

def _transform_data(self, *args, **kwargs):
"""Transform the HGNC source."""
Expand Down
17 changes: 8 additions & 9 deletions gene/etl/ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,17 @@ def __init__(self,
database: Database,
host='ftp.ncbi.nlm.nih.gov',
data_dir='gene/DATA/',
src_data_dir=PROJECT_ROOT / 'data' / 'ncbi',
assembly: str = 'GRCh38.p13'):
"""Construct the NCBI ETL instance.
:param Database database: gene database for adding new data
:param str host: FTP host name
:param str data_dir: FTP data directory to use
:param Path src_data_dir: Data directory for NCBI
:param str assembly: The genome assembly
"""
super().__init__(database, host, data_dir)
super().__init__(database, host, data_dir, src_data_dir)
self._sequence_location = SequenceLocation()
self._chromosome_location = ChromosomeLocation()
self._data_url = f"ftp://{host}"
Expand Down Expand Up @@ -105,11 +107,10 @@ def _extract_data(self):
- Data is expected to be in <PROJECT ROOT>/data/ncbi.
- For now, data files should all be from the same source data version.
"""
local_data_dir = PROJECT_ROOT / 'data' / 'ncbi'
local_data_dir.mkdir(exist_ok=True, parents=True)
if not self._files_downloaded(local_data_dir):
self._download_data(local_data_dir)
local_files = [f for f in local_data_dir.iterdir()
self._create_data_directory()
if not self._files_downloaded(self.src_data_dir):
self._download_data(self.src_data_dir)
local_files = [f for f in self.src_data_dir.iterdir()
if f.name.startswith('ncbi')]
local_files.sort(key=lambda f: f.name.split('_')[-1], reverse=True)
self._info_src = [f for f in local_files
Expand Down Expand Up @@ -514,16 +515,14 @@ def _transform_data(self):
prev_symbols = self._get_prev_symbols()
info_genes = self._get_gene_info(prev_symbols)

sr = self.get_seqrepo()

# create db for gff file
db = gffutils.create_db(str(self._gff_src),
dbfn=":memory:",
force=True,
merge_strategy="create_unique",
keep_order=True)

self._get_gene_gff(db, info_genes, sr)
self._get_gene_gff(db, info_genes, self.seqrepo)

with self._database.genes.batch_writer() as batch:
for gene in info_genes.keys():
Expand Down
12 changes: 11 additions & 1 deletion gene/vrs_locations/sequence_location.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""This module defines GA4GH Sequence Location."""
from typing import List
from ga4gh.vrs import models
from ga4gh.core import ga4gh_identify
import logging
Expand All @@ -10,6 +11,15 @@
class SequenceLocation:
"""The class for GA4GH Sequence Location."""

def get_aliases(self, sr, seqid) -> List[str]:
    """Return the aliases that ``sr.translate_alias`` reports for a sequence ID.

    :param SeqRepo sr: seqrepo instance
    :param str seqid: Sequence ID accession
    :return: List of aliases for seqid
    """
    aliases = sr.translate_alias(seqid)
    return aliases

def add_location(self, seqid, gene, params, sr):
"""Get a gene's Sequence Location.
Expand All @@ -20,7 +30,7 @@ def add_location(self, seqid, gene, params, sr):
:return: A dictionary of a GA4GH VRS SequenceLocation.
"""
location = dict()
aliases = sr.translate_alias(seqid)
aliases = self.get_aliases(sr, seqid)
sequence_id = [a for a in aliases if a.startswith('ga4gh')][0]

if gene.start != '.' and gene.end != '.' and sequence_id:
Expand Down
Loading

0 comments on commit 6dd633c

Please sign in to comment.