Skip to content

Commit

Permalink
Merge pull request #55 from cancervariants/updates
Browse files Browse the repository at this point in the history
Updates to sources in prod
  • Loading branch information
korikuzma authored Aug 4, 2021
2 parents e9e7119 + 1252490 commit 4ae9ee7
Show file tree
Hide file tree
Showing 7 changed files with 43 additions and 154 deletions.
28 changes: 19 additions & 9 deletions gene/etl/ensembl.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
"""This module defines the Ensembl ETL methods."""
import pydantic

from .base import Base
from gene import PROJECT_ROOT
from gene.schemas import SourceName, NamespacePrefix, Strand, Gene, SourceMeta
Expand All @@ -19,21 +21,23 @@ def __init__(self,
database: Database,
host='ftp.ensembl.org',
data_dir='pub/',
fn='Homo_sapiens.GRCh38.102.gff3.gz'
version=104
):
"""Initialize Ensembl ETL class.
:param Database database: DynamoDB database
:param str host: FTP host name
:param str data_dir: FTP data directory to use
:param str fn: Data file to download
:param int version: Version for fn
"""
super().__init__(database, host, data_dir)
self._sequence_location = SequenceLocation()
self._data_url = f"ftp://{host}/{data_dir}{fn}"
self._fn = fn
self._host = host
self._data_dir = data_dir
self._version = version
self._fn = f'Homo_sapiens.GRCh38.{self._version}.gff3.gz'
self._data_url = f"ftp://{self._host}/{self._data_dir}{self._fn}"
self._data_file_url = None
self._version = '102'
self._assembly = 'GRCh38'

def _download_data(self):
Expand All @@ -42,8 +46,9 @@ def _download_data(self):
ens_dir = PROJECT_ROOT / 'data' / 'ensembl'
ens_dir.mkdir(exist_ok=True, parents=True)
self._ftp_download(self._host,
f'{self._data_dir}release-102/gff3/homo_sapiens/',
'ensembl_102.gff3',
f'{self._data_dir}release-{self._version}'
f'/gff3/homo_sapiens/',
f'ensembl_{self._version}.gff3',
ens_dir,
self._fn)
logger.info('Successfully downloaded Ensembl data file.')
Expand Down Expand Up @@ -81,8 +86,13 @@ def _transform_data(self, *args, **kwargs):
if f_id == 'gene':
gene = self._add_gene(f, sr, accession_numbers)
if gene:
assert Gene(**gene)
self._load_gene(gene, batch)
try:
assert Gene(**gene)
except pydantic.error_wrappers.ValidationError:
logger.warning(f"Unable to load gene due to "
f"validation error: {gene}")
else:
self._load_gene(gene, batch)
logger.info('Successfully transformed Ensembl.')

def _add_gene(self, f, sr, accession_numbers):
Expand Down
18 changes: 9 additions & 9 deletions gene/etl/ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def __init__(self,
self._chromosome_location = ChromosomeLocation()
self._data_url = f"ftp://{host}"
self._assembly = assembly
self._date_today = datetime.today().strftime('%Y%m%d')

def perform_etl(self):
"""Perform ETL methods.
Expand All @@ -53,28 +54,27 @@ def _download_data(self, ncbi_dir: Path):
:param Path ncbi_dir: The NCBI data directory
"""
version = datetime.today().strftime('%Y%m%d')

# Download info
data_dir = f'{self._data_dir}GENE_INFO/Mammalia/'
fn = f'ncbi_info_{version}.tsv'
fn = f'ncbi_info_{self._date_today}.tsv'
data_fn = 'Homo_sapiens.gene_info.gz'
logger.info('Downloading NCBI gene_info....')
self._ftp_download(self._host, data_dir, fn, ncbi_dir, data_fn)
logger.info('Successfully downloaded NCBI gene_info.')

# Download history
fn = f'ncbi_history_{version}.tsv'
fn = f'ncbi_history_{self._date_today}.tsv'
data_fn = 'gene_history.gz'
logger.info('Downloading NCBI gene_history...')
self._ftp_download(self._host, self._data_dir, fn, ncbi_dir, data_fn)
logger.info('Successfully downloaded NCBI gene_history.')

# Download gff
og_fn = 'GCF_000001405.39_GRCh38.p13'
data_dir = 'genomes/refseq/vertebrate_mammalian/Homo_sapiens/' \
'latest_assembly_versions/GCF_000001405.39_GRCh38.p13/'
f'latest_assembly_versions/{og_fn}/'
fn = f'ncbi_{self._assembly}.gff'
data_fn = 'GCF_000001405.39_GRCh38.p13_genomic.gff.gz'
data_fn = f'{og_fn}_genomic.gff.gz'
logger.info('Downloading NCBI gff data file...')
self._ftp_download(self._host, data_dir, fn, ncbi_dir, data_fn)
logger.info('Successfully downloaded NCBI gff data file.')
Expand All @@ -92,11 +92,11 @@ def _files_downloaded(self, data_dir: Path) -> bool:
gff_downloaded: bool = False

for f in files:
if f.name.startswith('ncbi_info'):
if f.name.startswith(f'ncbi_info_{self._date_today}'):
info_downloaded = True
elif f.name.startswith('ncbi_history'):
elif f.name.startswith(f'ncbi_history_{self._date_today}'):
history_downloaded = True
elif f.name.startswith('ncbi_GRCh38'):
elif f.name.startswith('ncbi_GRCh38.p13'):
gff_downloaded = True
return info_downloaded and history_downloaded and gff_downloaded

Expand Down
62 changes: 0 additions & 62 deletions tests/unit/data/ensembl_genes.json
Original file line number Diff line number Diff line change
Expand Up @@ -119,42 +119,6 @@
"src_name": "Ensembl",
"item_type": "symbol"
},
{
"label_and_type": "ensembl:ensg00000284906##identity",
"concept_id": "ensembl:ENSG00000284906",
"symbol": "AC091057.5",
"label": "Rho GTPase-activating protein 11B",
"locations": [
{
"_id": "ga4gh:VSL.UJx3xHRkDuoALaGxyic-cPQNQnXYiAM8",
"interval": {
"end": 30685606,
"start": 30624548,
"type": "SimpleInterval"
},
"sequence_id": "ga4gh:SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6",
"type": "SequenceLocation"
}
],
"strand": "+",
"associated_with": [
"uniprot:Q3KRB8"
],
"src_name": "Ensembl",
"item_type": "identity"
},
{
"label_and_type": "uniprot:q3krb8##associated_with",
"concept_id": "ensembl:ensg00000284906",
"src_name": "Ensembl",
"item_type": "associated_with"
},
{
"label_and_type": "ac091057.5##symbol",
"concept_id": "ensembl:ensg00000284906",
"src_name": "Ensembl",
"item_type": "symbol"
},
{
"label_and_type": "ensembl:ensg00000272920##identity",
"concept_id": "ensembl:ENSG00000272920",
Expand Down Expand Up @@ -221,32 +185,6 @@
"src_name": "Ensembl",
"item_type": "symbol"
},
{
"label_and_type": "ensembl:ensg00000278704##identity",
"concept_id": "ensembl:ENSG00000278704",
"symbol": "BX004987.1",
"locations": [
{
"_id": "ga4gh:VSL.0JJsYiFwwNH2-7rYKj1ZitEcFRxIGwdQ",
"interval": {
"end": 58376,
"start": 56140,
"type": "SimpleInterval"
},
"sequence_id": "ga4gh:SQ.K_ieIfNIy1Ktulg8QSlhvJvm_1uQOtjD",
"type": "SequenceLocation"
}
],
"strand": "-",
"src_name": "Ensembl",
"item_type": "identity"
},
{
"label_and_type": "bx004987.1##symbol",
"concept_id": "ensembl:ensg00000278704",
"src_name": "Ensembl",
"item_type": "symbol"
},
{
"label_and_type": "ensembl:ensg00000087085##identity",
"concept_id": "ensembl:ENSG00000087085",
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/data/metadata.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
[
{
"src_name": "Ensembl",
"version": "102",
"data_url": "ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.102.gff3.gz",
"version": "104",
"data_url": "ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.104.gff3.gz",
"data_license_attributes": {
"non_commercial": false,
"share_alike": false,
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/data/ncbi_genes.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
"aliases": [
"BAF45b",
"NEUD4",
"neuro-d4"
"neuro-d4",
"SMARCG1"
],
"xrefs": [
"hgnc:20225",
Expand Down
80 changes: 10 additions & 70 deletions tests/unit/test_ensembl_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,36 +230,6 @@ def spry3():
return Gene(**params)


@pytest.fixture(scope='module')
def bx004987_1():
    """Build the expected Gene record for Ensembl gene BX004987.1.

    :return: Gene for concept ``ensembl:ENSG00000278704`` with a single
        sequence location on the ``-`` strand
    """
    # The only location attached to this record.
    sequence_location = {
        '_id': 'ga4gh:VSL.0JJsYiFwwNH2-7rYKj1ZitEcFRxIGwdQ',
        'interval': {
            'end': 58376,
            'start': 56140,
            'type': 'SimpleInterval'
        },
        'sequence_id': 'ga4gh:SQ.K_ieIfNIy1Ktulg8QSlhvJvm_1uQOtjD',
        'type': 'SequenceLocation'
    }
    return Gene(
        concept_id='ensembl:ENSG00000278704',
        symbol='BX004987.1',
        label=None,
        previous_symbols=[],
        aliases=[],
        xrefs=[],
        symbol_status=None,
        location_annotations=[],
        locations=[sequence_location],
        strand='-',
        associated_with=[]
    )


def test_ddx11l1(ensembl, ddx11l1):
"""Test that DDX11L1 normalizes to correct gene concept."""
# Concept ID
Expand Down Expand Up @@ -321,27 +291,6 @@ def test_CH17_340M24_3(ensembl, CH17_340M24_3):
MatchType.SYMBOL)


def test_AC091057_5(ensembl, AC091057_5):
    """Check that each AC091057.5 query resolves to the expected record.

    :param ensembl: Fixture exposing a ``search`` method for Ensembl
    :param AC091057_5: Expected gene record for AC091057.5
    """
    expected_matches = (
        # Concept ID: lower-case prefix, upper-case prefix, bare accession
        ('ensembl:ENSG00000284906', MatchType.CONCEPT_ID),
        ('ENSEMBL:ENSG00000284906', MatchType.CONCEPT_ID),
        ('ENSG00000284906', MatchType.CONCEPT_ID),
        # Symbol
        ('AC091057.5', MatchType.SYMBOL),
        # associated_with
        ('uniprot:Q3KRB8', MatchType.ASSOCIATED_WITH),
    )
    for query, match_type in expected_matches:
        response = ensembl.search(query)
        assertion_checks(response, AC091057_5, 1, match_type)


def test_hsa_mir_1253(ensembl, hsa_mir_1253):
"""Test that hsa-mir-1253 normalizes to correct gene concept."""
# Concept ID
Expand Down Expand Up @@ -385,23 +334,6 @@ def test_spry3(ensembl, spry3):
assertion_checks(normalizer_response, spry3, 1, MatchType.SYMBOL)


def test_bx004987_1(ensembl, bx004987_1):
    """Test that BX004987.1 normalizes to correct gene concept.

    :param ensembl: Fixture exposing a ``search`` method for Ensembl
    :param bx004987_1: Expected gene record for BX004987.1
    """
    # Concept ID: lower-case prefix, upper-case prefix, bare accession
    normalizer_response = ensembl.search('ensembl:ENSG00000278704')
    assertion_checks(normalizer_response, bx004987_1, 1, MatchType.CONCEPT_ID)

    normalizer_response = ensembl.search('ENSEMBL:ENSG00000278704')
    assertion_checks(normalizer_response, bx004987_1, 1, MatchType.CONCEPT_ID)

    normalizer_response = ensembl.search('ENSG00000278704')
    assertion_checks(normalizer_response, bx004987_1, 1, MatchType.CONCEPT_ID)

    # Symbol
    normalizer_response = ensembl.search('BX004987.1')
    assertion_checks(normalizer_response, bx004987_1, 1, MatchType.SYMBOL)


def test_no_match(ensembl):
"""Test that a term normalizes to correct gene concept as a NO match."""
normalizer_response = ensembl.search('A1BG - AS1')
Expand All @@ -421,16 +353,24 @@ def test_no_match(ensembl):
assert normalizer_response['match_type'] == MatchType.NO_MATCH
assert len(normalizer_response['records']) == 0

normalizer_response = ensembl.search('ensembl:ENSG00000278704')
assert normalizer_response['match_type'] == MatchType.NO_MATCH
assert len(normalizer_response['records']) == 0

normalizer_response = ensembl.search('ensembl:ENSG00000284906')
assert normalizer_response['match_type'] == MatchType.NO_MATCH
assert len(normalizer_response['records']) == 0.


def test_meta_info(ddx11l1, ensembl):
"""Test that the meta field is correct."""
normalizer_response = ensembl.search('chromosome:1')
assert normalizer_response['source_meta_'].data_license == 'custom'
assert normalizer_response['source_meta_'].data_license_url ==\
'https://useast.ensembl.org/info/about/legal/disclaimer.html'
assert normalizer_response['source_meta_'].version == '102'
assert normalizer_response['source_meta_'].version == '104'
assert normalizer_response['source_meta_'].data_url == \
'ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.102.gff3.gz'
'ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.104.gff3.gz'
assert normalizer_response['source_meta_'].rdp_url is None
assert normalizer_response['source_meta_'].genome_assemblies == ['GRCh38']
assert normalizer_response['source_meta_'].data_license_attributes == {
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/test_ncbi_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def dpf1():
'label': 'double PHD fingers 1',
'concept_id': 'ncbigene:8193',
'symbol': 'DPF1',
'aliases': ['BAF45b', 'NEUD4', 'neuro-d4'],
'aliases': ['BAF45b', 'NEUD4', 'neuro-d4', 'SMARCG1'],
'xrefs': ['hgnc:20225', 'ensembl:ENSG00000011332'],
'previous_symbols': [],
'associated_with': ['omim:601670'],
Expand Down

0 comments on commit 4ae9ee7

Please sign in to comment.