From 098cfa52a777a14b4c08850a729d6847fff59fa3 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Tue, 3 Aug 2021 11:08:29 -0400 Subject: [PATCH 1/4] Update sources --- gene/etl/ensembl.py | 26 ++++++---- tests/unit/data/genes.json | 60 +---------------------- tests/unit/test_ensembl_source.py | 80 ++++--------------------------- tests/unit/test_ncbi_source.py | 2 +- 4 files changed, 30 insertions(+), 138 deletions(-) diff --git a/gene/etl/ensembl.py b/gene/etl/ensembl.py index 6836a02f..13dd9e7c 100644 --- a/gene/etl/ensembl.py +++ b/gene/etl/ensembl.py @@ -1,4 +1,6 @@ """This module defines the Ensembl ETL methods.""" +import pydantic + from .base import Base from gene import PROJECT_ROOT from gene.schemas import SourceName, NamespacePrefix, Strand, Gene, SourceMeta @@ -20,23 +22,23 @@ def __init__(self, database: Database, host='ftp.ensembl.org', data_dir='pub/', - fn='Homo_sapiens.GRCh38.102.gff3.gz' + version=104 ): """Initialize Ensembl ETL class. :param Database database: DynamoDB database :param str host: FTP host name :param str data_dir: FTP data directory to use - :param str fn: Data file to download + :param int version: Version for fn """ self._database = database self._sequence_location = SequenceLocation() - self._data_url = f"ftp://{host}/{data_dir}{fn}" self._host = host self._data_dir = data_dir - self._fn = fn + self._version = version + self._fn = f'Homo_sapiens.GRCh38.{self._version}.gff3.gz' + self._data_url = f"ftp://{self._host}/{self._data_dir}{self._fn}" self._data_file_url = None - self._version = '102' self._assembly = 'GRCh38' self._load_data() @@ -46,8 +48,9 @@ def _download_data(self): ens_dir = PROJECT_ROOT / 'data' / 'ensembl' ens_dir.mkdir(exist_ok=True, parents=True) self._ftp_download(self._host, - f'{self._data_dir}release-102/gff3/homo_sapiens/', - 'ensembl_102.gff3', + f'{self._data_dir}release-{self._version}' + f'/gff3/homo_sapiens/', + f'ensembl_{self._version}.gff3', ens_dir, self._fn) logger.info('Successfully downloaded Ensembl data file.') @@ -90,8 +93,13 @@ def _transform_data(self, *args, **kwargs): if f_id == 'gene': gene = self._add_gene(f, sr, accession_numbers) if gene: - assert Gene(**gene) - self._load_gene(gene, batch) + try: + assert Gene(**gene) + except pydantic.error_wrappers.ValidationError: + logger.warning(f"Unable to load gene due to " + f"validation error: {gene}") + else: + self._load_gene(gene, batch) logger.info('Successfully transformed Ensembl.') def _add_gene(self, f, sr, accession_numbers): diff --git a/tests/unit/data/genes.json b/tests/unit/data/genes.json index 0442cf69..8fb41b60 100644 --- a/tests/unit/data/genes.json +++ b/tests/unit/data/genes.json @@ -111,39 +111,6 @@ "concept_id": "ensembl:ensg00000197180", "src_name": "Ensembl" }, - { - "label_and_type": "ensembl:ensg00000284906##identity", - "concept_id": "ensembl:ENSG00000284906", - "symbol": "AC091057.5", - "label": "Rho GTPase-activating protein 11B", - "locations": [ - { - "_id": "ga4gh:VSL.UJx3xHRkDuoALaGxyic-cPQNQnXYiAM8", - "interval": { - "end": 30685606, - "start": 30624548, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6", - "type": "SequenceLocation" - } - ], - "strand": "+", - "associated_with": [ - "uniprot:Q3KRB8" - ], - "src_name": "Ensembl" - }, - { - "label_and_type": "uniprot:q3krb8##associated_with", - "concept_id": "ensembl:ensg00000284906", - "src_name": "Ensembl" - }, - { - "label_and_type": "ac091057.5##symbol", - "concept_id": "ensembl:ensg00000284906", - "src_name": "Ensembl" - }, { "label_and_type": "ensembl:ensg00000272920##identity", "concept_id": "ensembl:ENSG00000272920", @@ -205,30 +172,6 @@ "concept_id": "ensembl:ensg00000168939", "src_name": "Ensembl" }, - { - "label_and_type": "ensembl:ensg00000278704##identity", - "concept_id": "ensembl:ENSG00000278704", - "symbol": "BX004987.1", - "locations": [ - { - "_id": "ga4gh:VSL.0JJsYiFwwNH2-7rYKj1ZitEcFRxIGwdQ", - "interval": { - "end": 58376, - "start": 56140, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.K_ieIfNIy1Ktulg8QSlhvJvm_1uQOtjD", - "type": "SequenceLocation" - } - ], - "strand": "-", - "src_name": "Ensembl" - }, - { - "label_and_type": "bx004987.1##symbol", - "concept_id": "ensembl:ensg00000278704", - "src_name": "Ensembl" - }, { "label_and_type": "ensembl:ensg00000087085##identity", "concept_id": "ensembl:ENSG00000087085", @@ -1162,7 +1105,8 @@ "aliases": [ "BAF45b", "NEUD4", - "neuro-d4" + "neuro-d4", + "SMARCG1" ], "xrefs": [ "hgnc:20225", diff --git a/tests/unit/test_ensembl_source.py b/tests/unit/test_ensembl_source.py index 6fc1f4eb..518fc5ac 100644 --- a/tests/unit/test_ensembl_source.py +++ b/tests/unit/test_ensembl_source.py @@ -231,36 +231,6 @@ def spry3(): return Gene(**params) -@pytest.fixture(scope='module') -def bx004987_1(): - """Create a BX004987.1 test fixture.""" - params = { - 'concept_id': 'ensembl:ENSG00000278704', - 'symbol': 'BX004987.1', - 'label': None, - 'previous_symbols': [], - 'aliases': [], - 'xrefs': [], - 'symbol_status': None, - 'location_annotations': [], - 'locations': [ - { - '_id': 'ga4gh:VSL.0JJsYiFwwNH2-7rYKj1ZitEcFRxIGwdQ', - 'interval': { - 'end': 58376, - 'start': 56140, - 'type': 'SimpleInterval' - }, - 'sequence_id': 'ga4gh:SQ.K_ieIfNIy1Ktulg8QSlhvJvm_1uQOtjD', - 'type': 'SequenceLocation' - } - ], - 'strand': '-', - 'associated_with': [] - } - return Gene(**params) - - def test_ddx11l1(ensembl, ddx11l1): """Test that DDX11L1 normalizes to correct gene concept.""" # Concept ID @@ -322,27 +292,6 @@ def test_CH17_340M24_3(ensembl, CH17_340M24_3): MatchType.SYMBOL) -def test_AC091057_5(ensembl, AC091057_5): - """Test that AC091057.5 normalizes to correct gene concept.""" - # Concept ID - normalizer_response = ensembl.search('ensembl:ENSG00000284906') - assertion_checks(normalizer_response, AC091057_5, 1, MatchType.CONCEPT_ID) - - normalizer_response = ensembl.search('ENSEMBL:ENSG00000284906') - assertion_checks(normalizer_response, AC091057_5, 1, MatchType.CONCEPT_ID) - - normalizer_response = ensembl.search('ENSG00000284906') - assertion_checks(normalizer_response, AC091057_5, 1, MatchType.CONCEPT_ID) - - # Symbol - normalizer_response = ensembl.search('AC091057.5') - assertion_checks(normalizer_response, AC091057_5, 1, MatchType.SYMBOL) - - # associated_with - normalizer_response = ensembl.search('uniprot:Q3KRB8') - assertion_checks(normalizer_response, AC091057_5, 1, MatchType.ASSOCIATED_WITH) # noqa: E501 - - def test_hsa_mir_1253(ensembl, hsa_mir_1253): """Test that hsa-mir-1253 normalizes to correct gene concept.""" # Concept ID @@ -386,23 +335,6 @@ def test_spry3(ensembl, spry3): assertion_checks(normalizer_response, spry3, 1, MatchType.SYMBOL) -def test_bx004987_1(ensembl, bx004987_1): - """Test that tp53 normalizes to correct gene concept.""" - # Concept ID - normalizer_response = ensembl.search('ensembl:ENSG00000278704') - assertion_checks(normalizer_response, bx004987_1, 1, MatchType.CONCEPT_ID) - - normalizer_response = ensembl.search('ENSEMBL:ENSG00000278704') - assertion_checks(normalizer_response, bx004987_1, 1, MatchType.CONCEPT_ID) - - normalizer_response = ensembl.search('ENSG00000278704') - assertion_checks(normalizer_response, bx004987_1, 1, MatchType.CONCEPT_ID) - - # Symbol - normalizer_response = ensembl.search('BX004987.1') - assertion_checks(normalizer_response, bx004987_1, 1, MatchType.SYMBOL) - - def test_no_match(ensembl): """Test that a term normalizes to correct gene concept as a NO match.""" normalizer_response = ensembl.search('A1BG - AS1') @@ -422,6 +354,14 @@ def test_no_match(ensembl): assert normalizer_response['match_type'] == MatchType.NO_MATCH assert len(normalizer_response['records']) == 0 + normalizer_response = ensembl.search('ensembl:ENSG00000278704') + assert normalizer_response['match_type'] == MatchType.NO_MATCH + assert len(normalizer_response['records']) == 0 + + normalizer_response = ensembl.search('ensembl:ENSG00000284906') + assert normalizer_response['match_type'] == MatchType.NO_MATCH + assert len(normalizer_response['records']) == 0. + def test_meta_info(ddx11l1, ensembl): """Test that the meta field is correct.""" @@ -429,9 +369,9 @@ def test_meta_info(ddx11l1, ensembl): assert normalizer_response['source_meta_'].data_license == 'custom' assert normalizer_response['source_meta_'].data_license_url ==\ 'https://useast.ensembl.org/info/about/legal/disclaimer.html' - assert normalizer_response['source_meta_'].version == '102' + assert normalizer_response['source_meta_'].version == '104' assert normalizer_response['source_meta_'].data_url == \ - 'ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.102.gff3.gz' + 'ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.104.gff3.gz' assert normalizer_response['source_meta_'].rdp_url is None assert normalizer_response['source_meta_'].genome_assemblies == ['GRCh38'] assert normalizer_response['source_meta_'].data_license_attributes == { diff --git a/tests/unit/test_ncbi_source.py b/tests/unit/test_ncbi_source.py index f84d2562..e9cc997c 100644 --- a/tests/unit/test_ncbi_source.py +++ b/tests/unit/test_ncbi_source.py @@ -29,7 +29,7 @@ def dpf1(): 'label': 'double PHD fingers 1', 'concept_id': 'ncbigene:8193', 'symbol': 'DPF1', - 'aliases': ['BAF45b', 'NEUD4', 'neuro-d4'], + 'aliases': ['BAF45b', 'NEUD4', 'neuro-d4', 'SMARCG1'], 'xrefs': ['hgnc:20225', 'ensembl:ENSG00000011332'], 'previous_symbols': [], 'associated_with': ['omim:601670'], From 191921b5a6dcde482a247926fd6d9e2205d4c282 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Tue, 3 Aug 2021 11:23:48 -0400 Subject: [PATCH 2/4] Forgot to update metadata --- tests/unit/data/metadata.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/data/metadata.json b/tests/unit/data/metadata.json index e63e7169..af9cd24d 100644 --- a/tests/unit/data/metadata.json +++ b/tests/unit/data/metadata.json @@ -1,8 +1,8 @@ [ { "src_name": "Ensembl", - "version": "102", - "data_url": "ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.102.gff3.gz", + "version": "104", + "data_url": "ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.104.gff3.gz", "data_license_attributes": { "non_commercial": false, "share_alike": false, From 2d2268ddd82d9ec8fbf7ba6550e0273a0c757be8 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Tue, 3 Aug 2021 13:09:17 -0400 Subject: [PATCH 3/4] increment version --- gene/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gene/version.py b/gene/version.py index 569b1212..0c5c3007 100644 --- a/gene/version.py +++ b/gene/version.py @@ -1 +1 @@ -__version__ = "0.1.10" +__version__ = "0.1.11" From d364be1f508184b59ca2b1cee0ceb704f056e89e Mon Sep 17 00:00:00 2001 From: korikuzma Date: Tue, 3 Aug 2021 14:33:50 -0400 Subject: [PATCH 4/4] NCBI files were not being updated to newest version --- gene/etl/ncbi.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/gene/etl/ncbi.py b/gene/etl/ncbi.py index e37cb83c..14909e0d 100644 --- a/gene/etl/ncbi.py +++ b/gene/etl/ncbi.py @@ -40,6 +40,7 @@ def __init__(self, self._host = host self._data_dir = data_dir self._assembly = assembly + self._date_today = datetime.today().strftime('%Y%m%d') self._extract_data() self._transform_data() @@ -48,28 +49,27 @@ def _download_data(self, ncbi_dir: Path): :param str ncbi_dir: The NCBI data directory """ - version = datetime.today().strftime('%Y%m%d') - # Download info data_dir = f'{self._data_dir}GENE_INFO/Mammalia/' - fn = f'ncbi_info_{version}.tsv' + fn = f'ncbi_info_{self._date_today}.tsv' data_fn = 'Homo_sapiens.gene_info.gz' logger.info('Downloading NCBI gene_info....') self._ftp_download(self._host, data_dir, fn, ncbi_dir, data_fn) logger.info('Successfully downloaded NCBI gene_info.') # Download history - fn = f'ncbi_history_{version}.tsv' + fn = f'ncbi_history_{self._date_today}.tsv' data_fn = 'gene_history.gz' logger.info('Downloading NCBI gene_history...') self._ftp_download(self._host, self._data_dir, fn, ncbi_dir, data_fn) logger.info('Successfully downloaded NCBI gene_history.') # Download gff + og_fn = 'GCF_000001405.39_GRCh38.p13' data_dir = 'genomes/refseq/vertebrate_mammalian/Homo_sapiens/' \ - 'latest_assembly_versions/GCF_000001405.39_GRCh38.p13/' + f'latest_assembly_versions/{og_fn}/' fn = f'ncbi_{self._assembly}.gff' - data_fn = 'GCF_000001405.39_GRCh38.p13_genomic.gff.gz' + data_fn = f'{og_fn}_genomic.gff.gz' logger.info('Downloading NCBI gff data file...') self._ftp_download(self._host, data_dir, fn, ncbi_dir, data_fn) logger.info('Successfully downloaded NCBI gff data file.') @@ -87,11 +87,11 @@ def _files_downloaded(self, data_dir: Path) -> bool: gff_downloaded: bool = False for f in files: - if f.name.startswith('ncbi_info'): + if f.name.startswith(f'ncbi_info_{self._date_today}'): info_downloaded = True - elif f.name.startswith('ncbi_history'): + elif f.name.startswith(f'ncbi_history_{self._date_today}'): history_downloaded = True - elif f.name.startswith('ncbi_GRCh38'): + elif f.name.startswith('ncbi_GRCh38.p13'): gff_downloaded = True return info_downloaded and history_downloaded and gff_downloaded