Merge pull request #55 from cancervariants/updates

Updates to sources in prod
cancervariants · Aug 4, 2021 · 4ae9ee7
2 parents e9e7119 + 1252490
commit 4ae9ee7
Show file tree

Hide file tree

Showing 7 changed files with 43 additions and 154 deletions.
diff --git a/gene/etl/ensembl.py b/gene/etl/ensembl.py
@@ -1,4 +1,6 @@
 """This module defines the Ensembl ETL methods."""
+import pydantic
+
 from .base import Base
 from gene import PROJECT_ROOT
 from gene.schemas import SourceName, NamespacePrefix, Strand, Gene, SourceMeta
@@ -19,21 +21,23 @@ def __init__(self,
                  database: Database,
                  host='ftp.ensembl.org',
                  data_dir='pub/',
-                 fn='Homo_sapiens.GRCh38.102.gff3.gz'
+                 version=104
                  ):
         """Initialize Ensembl ETL class.
 
         :param Database database: DynamoDB database
         :param str host: FTP host name
         :param str data_dir: FTP data directory to use
-        :param str fn: Data file to download
+        :param int version: Version for fn
         """
         super().__init__(database, host, data_dir)
         self._sequence_location = SequenceLocation()
-        self._data_url = f"ftp://{host}/{data_dir}{fn}"
-        self._fn = fn
+        self._host = host
+        self._data_dir = data_dir
+        self._version = version
+        self._fn = f'Homo_sapiens.GRCh38.{self._version}.gff3.gz'
+        self._data_url = f"ftp://{self._host}/{self._data_dir}{self._fn}"
         self._data_file_url = None
-        self._version = '102'
         self._assembly = 'GRCh38'
 
     def _download_data(self):
@@ -42,8 +46,9 @@ def _download_data(self):
         ens_dir = PROJECT_ROOT / 'data' / 'ensembl'
         ens_dir.mkdir(exist_ok=True, parents=True)
         self._ftp_download(self._host,
-                           f'{self._data_dir}release-102/gff3/homo_sapiens/',
-                           'ensembl_102.gff3',
+                           f'{self._data_dir}release-{self._version}'
+                           f'/gff3/homo_sapiens/',
+                           f'ensembl_{self._version}.gff3',
                            ens_dir,
                            self._fn)
         logger.info('Successfully downloaded Ensembl data file.')
@@ -81,8 +86,13 @@ def _transform_data(self, *args, **kwargs):
                     if f_id == 'gene':
                         gene = self._add_gene(f, sr, accession_numbers)
                         if gene:
-                            assert Gene(**gene)
-                            self._load_gene(gene, batch)
+                            try:
+                                assert Gene(**gene)
+                            except pydantic.error_wrappers.ValidationError:
+                                logger.warning(f"Unable to load gene due to "
+                                               f"validation error: {gene}")
+                            else:
+                                self._load_gene(gene, batch)
         logger.info('Successfully transformed Ensembl.')
 
     def _add_gene(self, f, sr, accession_numbers):

diff --git a/gene/etl/ncbi.py b/gene/etl/ncbi.py
@@ -37,6 +37,7 @@ def __init__(self,
         self._chromosome_location = ChromosomeLocation()
         self._data_url = f"ftp://{host}"
         self._assembly = assembly
+        self._date_today = datetime.today().strftime('%Y%m%d')
 
     def perform_etl(self):
         """Perform ETL methods.
@@ -53,28 +54,27 @@ def _download_data(self, ncbi_dir: Path):
 
         :param str ncbi_dir: The NCBI data directory
         """
-        version = datetime.today().strftime('%Y%m%d')
-
         # Download info
         data_dir = f'{self._data_dir}GENE_INFO/Mammalia/'
-        fn = f'ncbi_info_{version}.tsv'
+        fn = f'ncbi_info_{self._date_today}.tsv'
         data_fn = 'Homo_sapiens.gene_info.gz'
         logger.info('Downloading NCBI gene_info....')
         self._ftp_download(self._host, data_dir, fn, ncbi_dir, data_fn)
         logger.info('Successfully downloaded NCBI gene_info.')
 
         # Download history
-        fn = f'ncbi_history_{version}.tsv'
+        fn = f'ncbi_history_{self._date_today}.tsv'
         data_fn = 'gene_history.gz'
         logger.info('Downloading NCBI gene_history...')
         self._ftp_download(self._host, self._data_dir, fn, ncbi_dir, data_fn)
         logger.info('Successfully downloaded NCBI gene_history.')
 
         # Download gff
+        og_fn = 'GCF_000001405.39_GRCh38.p13'
         data_dir = 'genomes/refseq/vertebrate_mammalian/Homo_sapiens/' \
-                   'latest_assembly_versions/GCF_000001405.39_GRCh38.p13/'
+                   f'latest_assembly_versions/{og_fn}/'
         fn = f'ncbi_{self._assembly}.gff'
-        data_fn = 'GCF_000001405.39_GRCh38.p13_genomic.gff.gz'
+        data_fn = f'{og_fn}_genomic.gff.gz'
         logger.info('Downloading NCBI gff data file...')
         self._ftp_download(self._host, data_dir, fn, ncbi_dir, data_fn)
         logger.info('Successfully downloaded NCBI gff data file.')
@@ -92,11 +92,11 @@ def _files_downloaded(self, data_dir: Path) -> bool:
         gff_downloaded: bool = False
 
         for f in files:
-            if f.name.startswith('ncbi_info'):
+            if f.name.startswith(f'ncbi_info_{self._date_today}'):
                 info_downloaded = True
-            elif f.name.startswith('ncbi_history'):
+            elif f.name.startswith(f'ncbi_history_{self._date_today}'):
                 history_downloaded = True
-            elif f.name.startswith('ncbi_GRCh38'):
+            elif f.name.startswith('ncbi_GRCh38.p13'):
                 gff_downloaded = True
         return info_downloaded and history_downloaded and gff_downloaded
 

diff --git a/tests/unit/data/ensembl_genes.json b/tests/unit/data/ensembl_genes.json
@@ -119,42 +119,6 @@
     "src_name": "Ensembl",
     "item_type": "symbol"
   },
-  {
-    "label_and_type": "ensembl:ensg00000284906##identity",
-    "concept_id": "ensembl:ENSG00000284906",
-    "symbol": "AC091057.5",
-    "label": "Rho GTPase-activating protein 11B",
-    "locations": [
-      {
-        "_id": "ga4gh:VSL.UJx3xHRkDuoALaGxyic-cPQNQnXYiAM8",
-        "interval": {
-          "end": 30685606,
-          "start": 30624548,
-          "type": "SimpleInterval"
-        },
-        "sequence_id": "ga4gh:SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6",
-        "type": "SequenceLocation"
-      }
-    ],
-    "strand": "+",
-    "associated_with": [
-      "uniprot:Q3KRB8"
-    ],
-    "src_name": "Ensembl",
-    "item_type": "identity"
-  },
-  {
-    "label_and_type": "uniprot:q3krb8##associated_with",
-    "concept_id": "ensembl:ensg00000284906",
-    "src_name": "Ensembl",
-    "item_type": "associated_with"
-  },
-  {
-    "label_and_type": "ac091057.5##symbol",
-    "concept_id": "ensembl:ensg00000284906",
-    "src_name": "Ensembl",
-    "item_type": "symbol"
-  },
   {
     "label_and_type": "ensembl:ensg00000272920##identity",
     "concept_id": "ensembl:ENSG00000272920",
@@ -221,32 +185,6 @@
     "src_name": "Ensembl",
     "item_type": "symbol"
   },
-  {
-    "label_and_type": "ensembl:ensg00000278704##identity",
-    "concept_id": "ensembl:ENSG00000278704",
-    "symbol": "BX004987.1",
-    "locations": [
-      {
-        "_id": "ga4gh:VSL.0JJsYiFwwNH2-7rYKj1ZitEcFRxIGwdQ",
-        "interval": {
-          "end": 58376,
-          "start": 56140,
-          "type": "SimpleInterval"
-        },
-        "sequence_id": "ga4gh:SQ.K_ieIfNIy1Ktulg8QSlhvJvm_1uQOtjD",
-        "type": "SequenceLocation"
-      }
-    ],
-    "strand": "-",
-    "src_name": "Ensembl",
-    "item_type": "identity"
-  },
-  {
-    "label_and_type": "bx004987.1##symbol",
-    "concept_id": "ensembl:ensg00000278704",
-    "src_name": "Ensembl",
-    "item_type": "symbol"
-  },
   {
     "label_and_type": "ensembl:ensg00000087085##identity",
     "concept_id": "ensembl:ENSG00000087085",

diff --git a/tests/unit/data/metadata.json b/tests/unit/data/metadata.json
@@ -1,8 +1,8 @@
 [
   {
     "src_name": "Ensembl",
-    "version": "102",
-    "data_url": "ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.102.gff3.gz",
+    "version": "104",
+    "data_url": "ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.104.gff3.gz",
     "data_license_attributes": {
       "non_commercial": false,
       "share_alike": false,

diff --git a/tests/unit/data/ncbi_genes.json b/tests/unit/data/ncbi_genes.json
@@ -7,7 +7,8 @@
     "aliases": [
       "BAF45b",
       "NEUD4",
-      "neuro-d4"
+      "neuro-d4",
+      "SMARCG1"
     ],
     "xrefs": [
       "hgnc:20225",

diff --git a/tests/unit/test_ensembl_source.py b/tests/unit/test_ensembl_source.py
@@ -230,36 +230,6 @@ def spry3():
     return Gene(**params)
 
 
-@pytest.fixture(scope='module')
-def bx004987_1():
-    """Create a BX004987.1 test fixture."""
-    params = {
-        'concept_id': 'ensembl:ENSG00000278704',
-        'symbol': 'BX004987.1',
-        'label': None,
-        'previous_symbols': [],
-        'aliases': [],
-        'xrefs': [],
-        'symbol_status': None,
-        'location_annotations': [],
-        'locations': [
-            {
-                '_id': 'ga4gh:VSL.0JJsYiFwwNH2-7rYKj1ZitEcFRxIGwdQ',
-                'interval': {
-                    'end': 58376,
-                    'start': 56140,
-                    'type': 'SimpleInterval'
-                },
-                'sequence_id': 'ga4gh:SQ.K_ieIfNIy1Ktulg8QSlhvJvm_1uQOtjD',
-                'type': 'SequenceLocation'
-            }
-        ],
-        'strand': '-',
-        'associated_with': []
-    }
-    return Gene(**params)
-
-
 def test_ddx11l1(ensembl, ddx11l1):
     """Test that DDX11L1 normalizes to correct gene concept."""
     # Concept ID
@@ -321,27 +291,6 @@ def test_CH17_340M24_3(ensembl, CH17_340M24_3):
                      MatchType.SYMBOL)
 
 
-def test_AC091057_5(ensembl, AC091057_5):
-    """Test that AC091057.5 normalizes to correct gene concept."""
-    # Concept ID
-    normalizer_response = ensembl.search('ensembl:ENSG00000284906')
-    assertion_checks(normalizer_response, AC091057_5, 1, MatchType.CONCEPT_ID)
-
-    normalizer_response = ensembl.search('ENSEMBL:ENSG00000284906')
-    assertion_checks(normalizer_response, AC091057_5, 1, MatchType.CONCEPT_ID)
-
-    normalizer_response = ensembl.search('ENSG00000284906')
-    assertion_checks(normalizer_response, AC091057_5, 1, MatchType.CONCEPT_ID)
-
-    # Symbol
-    normalizer_response = ensembl.search('AC091057.5')
-    assertion_checks(normalizer_response, AC091057_5, 1, MatchType.SYMBOL)
-
-    # associated_with
-    normalizer_response = ensembl.search('uniprot:Q3KRB8')
-    assertion_checks(normalizer_response, AC091057_5, 1, MatchType.ASSOCIATED_WITH)  # noqa: E501
-
-
 def test_hsa_mir_1253(ensembl, hsa_mir_1253):
     """Test that hsa-mir-1253 normalizes to correct gene concept."""
     # Concept ID
@@ -385,23 +334,6 @@ def test_spry3(ensembl, spry3):
     assertion_checks(normalizer_response, spry3, 1, MatchType.SYMBOL)
 
 
-def test_bx004987_1(ensembl, bx004987_1):
-    """Test that tp53 normalizes to correct gene concept."""
-    # Concept ID
-    normalizer_response = ensembl.search('ensembl:ENSG00000278704')
-    assertion_checks(normalizer_response, bx004987_1, 1, MatchType.CONCEPT_ID)
-
-    normalizer_response = ensembl.search('ENSEMBL:ENSG00000278704')
-    assertion_checks(normalizer_response, bx004987_1, 1, MatchType.CONCEPT_ID)
-
-    normalizer_response = ensembl.search('ENSG00000278704')
-    assertion_checks(normalizer_response, bx004987_1, 1, MatchType.CONCEPT_ID)
-
-    # Symbol
-    normalizer_response = ensembl.search('BX004987.1')
-    assertion_checks(normalizer_response, bx004987_1, 1, MatchType.SYMBOL)
-
-
 def test_no_match(ensembl):
     """Test that a term normalizes to correct gene concept as a NO match."""
     normalizer_response = ensembl.search('A1BG - AS1')
@@ -421,16 +353,24 @@ def test_no_match(ensembl):
     assert normalizer_response['match_type'] == MatchType.NO_MATCH
     assert len(normalizer_response['records']) == 0
 
+    normalizer_response = ensembl.search('ensembl:ENSG00000278704')
+    assert normalizer_response['match_type'] == MatchType.NO_MATCH
+    assert len(normalizer_response['records']) == 0
+
+    normalizer_response = ensembl.search('ensembl:ENSG00000284906')
+    assert normalizer_response['match_type'] == MatchType.NO_MATCH
+    assert len(normalizer_response['records']) == 0.
+
 
 def test_meta_info(ddx11l1, ensembl):
     """Test that the meta field is correct."""
     normalizer_response = ensembl.search('chromosome:1')
     assert normalizer_response['source_meta_'].data_license == 'custom'
     assert normalizer_response['source_meta_'].data_license_url ==\
            'https://useast.ensembl.org/info/about/legal/disclaimer.html'
-    assert normalizer_response['source_meta_'].version == '102'
+    assert normalizer_response['source_meta_'].version == '104'
     assert normalizer_response['source_meta_'].data_url == \
-           'ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.102.gff3.gz'
+           'ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.104.gff3.gz'
     assert normalizer_response['source_meta_'].rdp_url is None
     assert normalizer_response['source_meta_'].genome_assemblies == ['GRCh38']
     assert normalizer_response['source_meta_'].data_license_attributes == {

diff --git a/tests/unit/test_ncbi_source.py b/tests/unit/test_ncbi_source.py
@@ -28,7 +28,7 @@ def dpf1():
         'label': 'double PHD fingers 1',
         'concept_id': 'ncbigene:8193',
         'symbol': 'DPF1',
-        'aliases': ['BAF45b', 'NEUD4', 'neuro-d4'],
+        'aliases': ['BAF45b', 'NEUD4', 'neuro-d4', 'SMARCG1'],
         'xrefs': ['hgnc:20225', 'ensembl:ENSG00000011332'],
         'previous_symbols': [],
         'associated_with': ['omim:601670'],