From 098cfa52a777a14b4c08850a729d6847fff59fa3 Mon Sep 17 00:00:00 2001
From: korikuzma <korikuzma@gmail.com>
Date: Tue, 3 Aug 2021 11:08:29 -0400
Subject: [PATCH 1/4] Update sources

---
 gene/etl/ensembl.py               | 26 ++++++----
 tests/unit/data/genes.json        | 60 +----------------------
 tests/unit/test_ensembl_source.py | 80 ++++---------------------------
 tests/unit/test_ncbi_source.py    |  2 +-
 4 files changed, 30 insertions(+), 138 deletions(-)

diff --git a/gene/etl/ensembl.py b/gene/etl/ensembl.py
index 6836a02f..13dd9e7c 100644
--- a/gene/etl/ensembl.py
+++ b/gene/etl/ensembl.py
@@ -1,4 +1,6 @@
 """This module defines the Ensembl ETL methods."""
+import pydantic
+
 from .base import Base
 from gene import PROJECT_ROOT
 from gene.schemas import SourceName, NamespacePrefix, Strand, Gene, SourceMeta
@@ -20,23 +22,23 @@ def __init__(self,
                  database: Database,
                  host='ftp.ensembl.org',
                  data_dir='pub/',
-                 fn='Homo_sapiens.GRCh38.102.gff3.gz'
+                 version=104
                  ):
         """Initialize Ensembl ETL class.
 
         :param Database database: DynamoDB database
         :param str host: FTP host name
         :param str data_dir: FTP data directory to use
-        :param str fn: Data file to download
+        :param int version: Ensembl release used for the file name and path
         """
         self._database = database
         self._sequence_location = SequenceLocation()
-        self._data_url = f"ftp://{host}/{data_dir}{fn}"
         self._host = host
         self._data_dir = data_dir
-        self._fn = fn
+        self._version = version
+        self._fn = f'Homo_sapiens.GRCh38.{self._version}.gff3.gz'
+        self._data_url = f"ftp://{self._host}/{self._data_dir}{self._fn}"
         self._data_file_url = None
-        self._version = '102'
         self._assembly = 'GRCh38'
         self._load_data()
 
@@ -46,8 +48,9 @@ def _download_data(self):
         ens_dir = PROJECT_ROOT / 'data' / 'ensembl'
         ens_dir.mkdir(exist_ok=True, parents=True)
         self._ftp_download(self._host,
-                           f'{self._data_dir}release-102/gff3/homo_sapiens/',
-                           'ensembl_102.gff3',
+                           f'{self._data_dir}release-{self._version}'
+                           f'/gff3/homo_sapiens/',
+                           f'ensembl_{self._version}.gff3',
                            ens_dir,
                            self._fn)
         logger.info('Successfully downloaded Ensembl data file.')
@@ -90,8 +93,13 @@ def _transform_data(self, *args, **kwargs):
                     if f_id == 'gene':
                         gene = self._add_gene(f, sr, accession_numbers)
                         if gene:
-                            assert Gene(**gene)
-                            self._load_gene(gene, batch)
+                            try:
+                                Gene(**gene)  # not assert: must run under -O
+                            except pydantic.ValidationError:
+                                logger.warning(f"Unable to load gene due to "
+                                               f"validation error: {gene}")
+                            else:
+                                self._load_gene(gene, batch)
         logger.info('Successfully transformed Ensembl.')
 
     def _add_gene(self, f, sr, accession_numbers):
diff --git a/tests/unit/data/genes.json b/tests/unit/data/genes.json
index 0442cf69..8fb41b60 100644
--- a/tests/unit/data/genes.json
+++ b/tests/unit/data/genes.json
@@ -111,39 +111,6 @@
     "concept_id": "ensembl:ensg00000197180",
     "src_name": "Ensembl"
   },
-  {
-    "label_and_type": "ensembl:ensg00000284906##identity",
-    "concept_id": "ensembl:ENSG00000284906",
-    "symbol": "AC091057.5",
-    "label": "Rho GTPase-activating protein 11B",
-    "locations": [
-      {
-        "_id": "ga4gh:VSL.UJx3xHRkDuoALaGxyic-cPQNQnXYiAM8",
-        "interval": {
-          "end": 30685606,
-          "start": 30624548,
-          "type": "SimpleInterval"
-        },
-        "sequence_id": "ga4gh:SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6",
-        "type": "SequenceLocation"
-      }
-    ],
-    "strand": "+",
-    "associated_with": [
-      "uniprot:Q3KRB8"
-    ],
-    "src_name": "Ensembl"
-  },
-  {
-    "label_and_type": "uniprot:q3krb8##associated_with",
-    "concept_id": "ensembl:ensg00000284906",
-    "src_name": "Ensembl"
-  },
-  {
-    "label_and_type": "ac091057.5##symbol",
-    "concept_id": "ensembl:ensg00000284906",
-    "src_name": "Ensembl"
-  },
   {
     "label_and_type": "ensembl:ensg00000272920##identity",
     "concept_id": "ensembl:ENSG00000272920",
@@ -205,30 +172,6 @@
     "concept_id": "ensembl:ensg00000168939",
     "src_name": "Ensembl"
   },
-  {
-    "label_and_type": "ensembl:ensg00000278704##identity",
-    "concept_id": "ensembl:ENSG00000278704",
-    "symbol": "BX004987.1",
-    "locations": [
-      {
-        "_id": "ga4gh:VSL.0JJsYiFwwNH2-7rYKj1ZitEcFRxIGwdQ",
-        "interval": {
-          "end": 58376,
-          "start": 56140,
-          "type": "SimpleInterval"
-        },
-        "sequence_id": "ga4gh:SQ.K_ieIfNIy1Ktulg8QSlhvJvm_1uQOtjD",
-        "type": "SequenceLocation"
-      }
-    ],
-    "strand": "-",
-    "src_name": "Ensembl"
-  },
-  {
-    "label_and_type": "bx004987.1##symbol",
-    "concept_id": "ensembl:ensg00000278704",
-    "src_name": "Ensembl"
-  },
   {
     "label_and_type": "ensembl:ensg00000087085##identity",
     "concept_id": "ensembl:ENSG00000087085",
@@ -1162,7 +1105,8 @@
     "aliases": [
       "BAF45b",
       "NEUD4",
-      "neuro-d4"
+      "neuro-d4",
+      "SMARCG1"
     ],
     "xrefs": [
       "hgnc:20225",
diff --git a/tests/unit/test_ensembl_source.py b/tests/unit/test_ensembl_source.py
index 6fc1f4eb..518fc5ac 100644
--- a/tests/unit/test_ensembl_source.py
+++ b/tests/unit/test_ensembl_source.py
@@ -231,36 +231,6 @@ def spry3():
     return Gene(**params)
 
 
-@pytest.fixture(scope='module')
-def bx004987_1():
-    """Create a BX004987.1 test fixture."""
-    params = {
-        'concept_id': 'ensembl:ENSG00000278704',
-        'symbol': 'BX004987.1',
-        'label': None,
-        'previous_symbols': [],
-        'aliases': [],
-        'xrefs': [],
-        'symbol_status': None,
-        'location_annotations': [],
-        'locations': [
-            {
-                '_id': 'ga4gh:VSL.0JJsYiFwwNH2-7rYKj1ZitEcFRxIGwdQ',
-                'interval': {
-                    'end': 58376,
-                    'start': 56140,
-                    'type': 'SimpleInterval'
-                },
-                'sequence_id': 'ga4gh:SQ.K_ieIfNIy1Ktulg8QSlhvJvm_1uQOtjD',
-                'type': 'SequenceLocation'
-            }
-        ],
-        'strand': '-',
-        'associated_with': []
-    }
-    return Gene(**params)
-
-
 def test_ddx11l1(ensembl, ddx11l1):
     """Test that DDX11L1 normalizes to correct gene concept."""
     # Concept ID
@@ -322,27 +292,6 @@ def test_CH17_340M24_3(ensembl, CH17_340M24_3):
                      MatchType.SYMBOL)
 
 
-def test_AC091057_5(ensembl, AC091057_5):
-    """Test that AC091057.5 normalizes to correct gene concept."""
-    # Concept ID
-    normalizer_response = ensembl.search('ensembl:ENSG00000284906')
-    assertion_checks(normalizer_response, AC091057_5, 1, MatchType.CONCEPT_ID)
-
-    normalizer_response = ensembl.search('ENSEMBL:ENSG00000284906')
-    assertion_checks(normalizer_response, AC091057_5, 1, MatchType.CONCEPT_ID)
-
-    normalizer_response = ensembl.search('ENSG00000284906')
-    assertion_checks(normalizer_response, AC091057_5, 1, MatchType.CONCEPT_ID)
-
-    # Symbol
-    normalizer_response = ensembl.search('AC091057.5')
-    assertion_checks(normalizer_response, AC091057_5, 1, MatchType.SYMBOL)
-
-    # associated_with
-    normalizer_response = ensembl.search('uniprot:Q3KRB8')
-    assertion_checks(normalizer_response, AC091057_5, 1, MatchType.ASSOCIATED_WITH)  # noqa: E501
-
-
 def test_hsa_mir_1253(ensembl, hsa_mir_1253):
     """Test that hsa-mir-1253 normalizes to correct gene concept."""
     # Concept ID
@@ -386,23 +335,6 @@ def test_spry3(ensembl, spry3):
     assertion_checks(normalizer_response, spry3, 1, MatchType.SYMBOL)
 
 
-def test_bx004987_1(ensembl, bx004987_1):
-    """Test that tp53 normalizes to correct gene concept."""
-    # Concept ID
-    normalizer_response = ensembl.search('ensembl:ENSG00000278704')
-    assertion_checks(normalizer_response, bx004987_1, 1, MatchType.CONCEPT_ID)
-
-    normalizer_response = ensembl.search('ENSEMBL:ENSG00000278704')
-    assertion_checks(normalizer_response, bx004987_1, 1, MatchType.CONCEPT_ID)
-
-    normalizer_response = ensembl.search('ENSG00000278704')
-    assertion_checks(normalizer_response, bx004987_1, 1, MatchType.CONCEPT_ID)
-
-    # Symbol
-    normalizer_response = ensembl.search('BX004987.1')
-    assertion_checks(normalizer_response, bx004987_1, 1, MatchType.SYMBOL)
-
-
 def test_no_match(ensembl):
     """Test that a term normalizes to correct gene concept as a NO match."""
     normalizer_response = ensembl.search('A1BG - AS1')
@@ -422,6 +354,14 @@ def test_no_match(ensembl):
     assert normalizer_response['match_type'] == MatchType.NO_MATCH
     assert len(normalizer_response['records']) == 0
 
+    normalizer_response = ensembl.search('ensembl:ENSG00000278704')
+    assert normalizer_response['match_type'] == MatchType.NO_MATCH
+    assert len(normalizer_response['records']) == 0
+
+    normalizer_response = ensembl.search('ensembl:ENSG00000284906')
+    assert normalizer_response['match_type'] == MatchType.NO_MATCH
+    assert len(normalizer_response['records']) == 0
+
 
 def test_meta_info(ddx11l1, ensembl):
     """Test that the meta field is correct."""
@@ -429,9 +369,9 @@ def test_meta_info(ddx11l1, ensembl):
     assert normalizer_response['source_meta_'].data_license == 'custom'
     assert normalizer_response['source_meta_'].data_license_url ==\
            'https://useast.ensembl.org/info/about/legal/disclaimer.html'
-    assert normalizer_response['source_meta_'].version == '102'
+    assert normalizer_response['source_meta_'].version == '104'
     assert normalizer_response['source_meta_'].data_url == \
-           'ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.102.gff3.gz'
+           'ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.104.gff3.gz'
     assert normalizer_response['source_meta_'].rdp_url is None
     assert normalizer_response['source_meta_'].genome_assemblies == ['GRCh38']
     assert normalizer_response['source_meta_'].data_license_attributes == {
diff --git a/tests/unit/test_ncbi_source.py b/tests/unit/test_ncbi_source.py
index f84d2562..e9cc997c 100644
--- a/tests/unit/test_ncbi_source.py
+++ b/tests/unit/test_ncbi_source.py
@@ -29,7 +29,7 @@ def dpf1():
         'label': 'double PHD fingers 1',
         'concept_id': 'ncbigene:8193',
         'symbol': 'DPF1',
-        'aliases': ['BAF45b', 'NEUD4', 'neuro-d4'],
+        'aliases': ['BAF45b', 'NEUD4', 'neuro-d4', 'SMARCG1'],
         'xrefs': ['hgnc:20225', 'ensembl:ENSG00000011332'],
         'previous_symbols': [],
         'associated_with': ['omim:601670'],

From 191921b5a6dcde482a247926fd6d9e2205d4c282 Mon Sep 17 00:00:00 2001
From: korikuzma <korikuzma@gmail.com>
Date: Tue, 3 Aug 2021 11:23:48 -0400
Subject: [PATCH 2/4] Forgot to update metadata

---
 tests/unit/data/metadata.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/data/metadata.json b/tests/unit/data/metadata.json
index e63e7169..af9cd24d 100644
--- a/tests/unit/data/metadata.json
+++ b/tests/unit/data/metadata.json
@@ -1,8 +1,8 @@
 [
   {
     "src_name": "Ensembl",
-    "version": "102",
-    "data_url": "ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.102.gff3.gz",
+    "version": "104",
+    "data_url": "ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.104.gff3.gz",
     "data_license_attributes": {
       "non_commercial": false,
       "share_alike": false,

From 2d2268ddd82d9ec8fbf7ba6550e0273a0c757be8 Mon Sep 17 00:00:00 2001
From: korikuzma <korikuzma@gmail.com>
Date: Tue, 3 Aug 2021 13:09:17 -0400
Subject: [PATCH 3/4] increment version

---
 gene/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gene/version.py b/gene/version.py
index 569b1212..0c5c3007 100644
--- a/gene/version.py
+++ b/gene/version.py
@@ -1 +1 @@
-__version__ = "0.1.10"
+__version__ = "0.1.11"

From d364be1f508184b59ca2b1cee0ceb704f056e89e Mon Sep 17 00:00:00 2001
From: korikuzma <korikuzma@gmail.com>
Date: Tue, 3 Aug 2021 14:33:50 -0400
Subject: [PATCH 4/4] NCBI files were not being updated to newest version

---
 gene/etl/ncbi.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/gene/etl/ncbi.py b/gene/etl/ncbi.py
index e37cb83c..14909e0d 100644
--- a/gene/etl/ncbi.py
+++ b/gene/etl/ncbi.py
@@ -40,6 +40,7 @@ def __init__(self,
         self._host = host
         self._data_dir = data_dir
         self._assembly = assembly
+        self._date_today = datetime.today().strftime('%Y%m%d')
         self._extract_data()
         self._transform_data()
 
@@ -48,28 +49,27 @@ def _download_data(self, ncbi_dir: Path):
 
         :param str ncbi_dir: The NCBI data directory
         """
-        version = datetime.today().strftime('%Y%m%d')
-
         # Download info
         data_dir = f'{self._data_dir}GENE_INFO/Mammalia/'
-        fn = f'ncbi_info_{version}.tsv'
+        fn = f'ncbi_info_{self._date_today}.tsv'
         data_fn = 'Homo_sapiens.gene_info.gz'
         logger.info('Downloading NCBI gene_info....')
         self._ftp_download(self._host, data_dir, fn, ncbi_dir, data_fn)
         logger.info('Successfully downloaded NCBI gene_info.')
 
         # Download history
-        fn = f'ncbi_history_{version}.tsv'
+        fn = f'ncbi_history_{self._date_today}.tsv'
         data_fn = 'gene_history.gz'
         logger.info('Downloading NCBI gene_history...')
         self._ftp_download(self._host, self._data_dir, fn, ncbi_dir, data_fn)
         logger.info('Successfully downloaded NCBI gene_history.')
 
         # Download gff
+        og_fn = 'GCF_000001405.39_GRCh38.p13'
         data_dir = 'genomes/refseq/vertebrate_mammalian/Homo_sapiens/' \
-                   'latest_assembly_versions/GCF_000001405.39_GRCh38.p13/'
+                   f'latest_assembly_versions/{og_fn}/'
         fn = f'ncbi_{self._assembly}.gff'
-        data_fn = 'GCF_000001405.39_GRCh38.p13_genomic.gff.gz'
+        data_fn = f'{og_fn}_genomic.gff.gz'
         logger.info('Downloading NCBI gff data file...')
         self._ftp_download(self._host, data_dir, fn, ncbi_dir, data_fn)
         logger.info('Successfully downloaded NCBI gff data file.')
@@ -87,11 +87,11 @@ def _files_downloaded(self, data_dir: Path) -> bool:
         gff_downloaded: bool = False
 
         for f in files:
-            if f.name.startswith('ncbi_info'):
+            if f.name.startswith(f'ncbi_info_{self._date_today}'):
                 info_downloaded = True
-            elif f.name.startswith('ncbi_history'):
+            elif f.name.startswith(f'ncbi_history_{self._date_today}'):
                 history_downloaded = True
-            elif f.name.startswith('ncbi_GRCh38'):
+            elif f.name.startswith(f'ncbi_{self._assembly}'):
                 gff_downloaded = True
         return info_downloaded and history_downloaded and gff_downloaded