From 593d05b9fef69bfcc11c4b9ba2c80f21724d57d4 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Mon, 16 Aug 2021 15:16:02 -0400 Subject: [PATCH 1/8] wip: create sample test data --- tests/unit/data/ensembl_genes.json | 423 ----- tests/unit/data/etl_data/ensembl_104.gff3 | 248 +++ tests/unit/data/etl_data/hgnc_20210810.json | 1175 ++++++++++++++ tests/unit/data/etl_data/ncbi_GRCh38.p13.gff | 81 + .../data/etl_data/ncbi_history_20210813.tsv | 15 + .../unit/data/etl_data/ncbi_info_20210813.tsv | 22 + tests/unit/data/hgnc_genes.json | 1359 ----------------- tests/unit/data/metadata.json | 42 - tests/unit/data/ncbi_genes.json | 1261 --------------- tests/unit/test_database.py | 89 -- tests/unit/test_database_and_etl.py | 127 ++ 11 files changed, 1668 insertions(+), 3174 deletions(-) delete mode 100644 tests/unit/data/ensembl_genes.json create mode 100644 tests/unit/data/etl_data/ensembl_104.gff3 create mode 100644 tests/unit/data/etl_data/hgnc_20210810.json create mode 100644 tests/unit/data/etl_data/ncbi_GRCh38.p13.gff create mode 100644 tests/unit/data/etl_data/ncbi_history_20210813.tsv create mode 100644 tests/unit/data/etl_data/ncbi_info_20210813.tsv delete mode 100644 tests/unit/data/hgnc_genes.json delete mode 100644 tests/unit/data/metadata.json delete mode 100644 tests/unit/data/ncbi_genes.json delete mode 100644 tests/unit/test_database.py create mode 100644 tests/unit/test_database_and_etl.py diff --git a/tests/unit/data/ensembl_genes.json b/tests/unit/data/ensembl_genes.json deleted file mode 100644 index 73a98e81..00000000 --- a/tests/unit/data/ensembl_genes.json +++ /dev/null @@ -1,423 +0,0 @@ -[ - { - "label_and_type": "ensembl:ensg00000223972##identity", - "concept_id": "ensembl:ENSG00000223972", - "symbol": "DDX11L1", - "label": "DEAD/H-box helicase 11 like 1 (pseudogene)", - "locations": [ - { - "_id": "ga4gh:VSL.iTXYEeSmSj73q-lpxtKLlnp_1OlX658F", - "interval": { - "end": 14409, - "start": 11869, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", - "type": "SequenceLocation" - } - ], - "strand": "+", - "xrefs": [ - "hgnc:37102" - ], - "src_name": "Ensembl", - "item_type": "identity" - }, - { - "label_and_type": "ddx11l1##symbol", - "concept_id": "ensembl:ensg00000223972", - "src_name": "Ensembl", - "item_type": "symbol" - }, - { - "label_and_type": "ensembl:ensg00000141510##identity", - "concept_id": "ensembl:ENSG00000141510", - "symbol": "TP53", - "label": "tumor protein p53", - "locations": [ - { - "_id": "ga4gh:VSL.FfERYK71L10OLwk6QGoG8OPLgl7PItgK", - "interval": { - "end": 7687538, - "start": 7661779, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7", - "type": "SequenceLocation" - } - ], - "strand": "-", - "xrefs": [ - "hgnc:11998" - ], - "src_name": "Ensembl", - "item_type": "identity" - }, - { - "label_and_type": "tp53##symbol", - "concept_id": "ensembl:ensg00000141510", - "src_name": "Ensembl", - "item_type": "symbol" - }, - { - "label_and_type": "ensembl:ensg00000278757##identity", - "concept_id": "ensembl:ENSG00000278757", - "symbol": "U6", - "label": "U6 spliceosomal RNA", - "locations": [ - { - "_id": "ga4gh:VSL.7Gh2fIORi69Fm1UMai49Ek-6HQNzuyqv", - "interval": { - "end": 516479, - "start": 516376, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", - "type": "SequenceLocation" - } - ], - "strand": "-", - "associated_with": [ - "rfam:RF00026" - ], - "src_name": "Ensembl", - "item_type": "identity" - }, - { - "label_and_type": "u6##symbol", - "concept_id": "ensembl:ensg00000278757", - "src_name": "Ensembl", - "item_type": "symbol" - }, - { - "label_and_type": "ensembl:ensg00000197180##identity", - "concept_id": "ensembl:ENSG00000197180", - "symbol": "CH17-340M24.3", - "label": "uncharacterized protein BC009467", - "locations": [ - { - "_id": "ga4gh:VSL.L5PJ5tioPr5ozAj1Ad0VIG-qHrGXnMUh", - "interval": { - "end": 154428479, - "start": 154424380, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", - "type": "SequenceLocation" - } - ], - "strand": "-", - "xrefs": [ - "ncbigene:158960" - ], - "src_name": "Ensembl", - "item_type": "identity" - }, - { - "label_and_type": "ch17-340m24.3##symbol", - "concept_id": "ensembl:ensg00000197180", - "src_name": "Ensembl", - "item_type": "symbol" - }, - { - "label_and_type": "ensembl:ensg00000272920##identity", - "concept_id": "ensembl:ENSG00000272920", - "symbol": "hsa-mir-1253", - "label": "hsa-mir-1253", - "locations": [ - { - "_id": "ga4gh:VSL.hS8fW7o0qqy15qOnQOKv1VqOZQDBswNI", - "interval": { - "end": 2748182, - "start": 2748078, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7", - "type": "SequenceLocation" - } - ], - "strand": "+", - "associated_with": [ - "mirbase:MI0006387" - ], - "src_name": "Ensembl", - "item_type": "identity" - }, - { - "label_and_type": "mirbase:mi0006387##associated_with", - "concept_id": "ensembl:ENSG00000272920", - "src_name": "Ensembl", - "item_type": "associated_with" - }, - { - "label_and_type": "hsa-mir-1253##symbol", - "concept_id": "ensembl:ENSG00000272920", - "src_name": "Ensembl", - "item_type": "symbol" - }, - { - "label_and_type": "ensembl:ensg00000168939##identity", - "concept_id": "ensembl:ENSG00000168939", - "symbol": "SPRY3", - "label": "sprouty RTK signaling antagonist 3", - "locations": [ - { - "_id": "ga4gh:VSL.h8YcFZq0v-Vwj6aGarOvh1R3LFNGD0YU", - "interval": { - "end": 155782459, - "start": 155612572, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", - "type": "SequenceLocation" - } - ], - "strand": "+", - "xrefs": [ - "hgnc:11271" - ], - "src_name": "Ensembl", - "item_type": "identity" - }, - { - "label_and_type": "spry3##symbol", - "concept_id": "ensembl:ensg00000168939", - "src_name": "Ensembl", - "item_type": "symbol" - }, - { - "label_and_type": "ensembl:ensg00000087085##identity", - "concept_id": "ensembl:ENSG00000087085", - "symbol": "ACHE", - "label": "acetylcholinesterase (Cartwright blood group)", - "locations": [ - { - "_id": "ga4gh:VSL.SWENJPb__SVgxImg-Ybyr2jzqXUahRDf", - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", - "interval": { - "end": 100896974, - "start": 100889994, - "type": "SimpleInterval" - } - } - ], - "strand": "-", - "xrefs": [ - "hgnc:108" - ], - "src_name": "Ensembl", - "item_type": "identity" - }, - { - "label_and_type": "ache##symbol", - "concept_id": "ensembl:ensg00000087085", - "src_name": "Ensembl", - "item_type": "symbol" - }, - { - "label_and_type": "acetylcholinesterase (cartwright blood group)##label", - "concept_id": "ensembl:ensg00000087085", - "src_name": "Ensembl", - "item_type": "label" - }, - { - "label_and_type": "hgnc:108##xref", - "concept_id": "ensembl:ensg00000087085", - "src_name": "Ensembl", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000097007##identity", - "concept_id": "ensembl:ENSG00000097007", - "symbol": "ABL1", - "label": "ABL proto-oncogene 1, non-receptor tyrosine kinase", - "strand": "+", - "locations": [ - { - "_id": "ga4gh:VSL.HqNwGZwH-66U56AkU0ia4QWN2ASpDRS3", - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", - "interval": { - "end": 130887675, - "start": 130713016, - "type": "SimpleInterval" - } - } - ], - "xrefs": [ - "hgnc:76" - ], - "src_name": "Ensembl", - "item_type": "identity" - }, - { - "label_and_type": "abl1##symbol", - "concept_id": "ensembl:ensg00000097007", - "src_name": "Ensembl", - "item_type": "symbol" - }, - { - "label_and_type": "abl proto-oncogene 1, non-receptor tyrosine kinase##label", - "concept_id": "ensembl:ensg00000097007", - "src_name": "Ensembl", - "item_type": "label" - }, - { - "label_and_type": "hgnc:76##xref", - "concept_id": "ensembl:ensg00000097007", - "src_name": "Ensembl", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000157764##identity", - "concept_id": "ensembl:ENSG00000157764", - "symbol": "BRAF", - "label": "B-Raf proto-oncogene, serine/threonine kinase", - "strand": "-", - "locations": [ - { - "_id": "ga4gh:VSL.yVv8OqIODws5WLrSMcydC7roEuhujq0w", - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", - "interval": { - "end": 140924929, - "start": 140719327, - "type": "SimpleInterval" - } - } - ], - "xrefs": [ - "hgnc:1097" - ], - "src_name": "Ensembl", - "item_type": "identity" - }, - { - "label_and_type": "braf##symbol", - "concept_id": "ensembl:ensg00000157764", - "src_name": "Ensembl", - "item_type": "symbol" - }, - { - "label_and_type": "b-raf proto-oncogene, serine/threonine kinase##label", - "concept_id": "ensembl:ensg00000157764", - "src_name": "Ensembl", - "item_type": "label" - }, - { - "label_and_type": "hgnc:1097##xref", - "concept_id": "ensembl:ensg00000157764", - "src_name": "Ensembl", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000167670##identity", - "concept_id": "ensembl:ENSG00000167670", - "symbol": "CHAF1A", - "label": "chromatin assembly factor 1 subunit A", - "xrefs": [ - "hgnc:1910" - ], - "src_name": "Ensembl", - "item_type": "identity" - }, - { - "label_and_type": "chaf1a##symbol", - "concept_id": "ensembl:ENSG00000167670", - "src_name": "Ensembl", - "item_type": "symbol" - }, - { - "label_and_type": "chromatin assembly factor 1 subunit a##label", - "concept_id": "ensembl:ENSG00000167670", - "src_name": "Ensembl", - "item_type": "label" - }, - { - "label_and_type": "hgnc:1910##xref", - "concept_id": "ensembl:ENSG00000167670", - "src_name": "Ensembl", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000198730##identity", - "concept_id": "ensembl:ENSG00000198730", - "symbol": "CTR9", - "label": "CTR9 homolog, Paf1/RNA polymerase II complex component", - "xrefs": [ - "hgnc:16850" - ], - "src_name": "Ensembl", - "item_type": "identity" - }, - { - "label_and_type": "hgnc:16850##xref", - "concept_id": "ensembl:ENSG00000198730", - "src_name": "Ensembl", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000166825##identity", - "concept_id": "ensembl:ENSG00000166825", - "symbol": "ANPEP", - "label": "alanyl aminopeptidase, membrane", - "xrefs": [ - "hgnc:500" - ], - "src_name": "Ensembl", - "item_type": "identity" - }, - { - "label_and_type": "hgnc:500##xref", - "concept_id": "ensembl:ENSG00000166825", - "src_name": "Ensembl", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000196455##identity", - "concept_id": "ensembl:ENSG00000196455", - "symbol": "PIK3R4", - "label": "chromatin assembly factor 1 subunit A", - "xrefs": [ - "hgnc:8982" - ], - "src_name": "Ensembl", - "item_type": "identity" - }, - { - "label_and_type": "pik3r4##symbol", - "concept_id": "ensembl:ENSG00000196455", - "src_name": "Ensembl", - "item_type": "symbol" - }, - { - "label_and_type": "chromatin assembly factor 1 subunit a##label", - "concept_id": "ensembl:ENSG00000196455", - "src_name": "Ensembl", - "item_type": "label" - }, - { - "label_and_type": "hgnc:8982##xref", - "concept_id": "ensembl:ENSG00000196455", - "src_name": "Ensembl", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000118873##identity", - "concept_id": "ensembl:ENSG00000118873", - "symbol": "RAB3GAP2", - "label": "RAB3 GTPase activating non-catalytic protein subunit 2", - "xrefs": [ - "hgnc:17168" - ], - "src_name": "Ensembl", - "item_type": "identity" - }, - { - "label_and_type": "hgnc:17168##xref", - "concept_id": "ensembl:ENSG00000118873", - "src_name": "Ensembl", - "item_type": "xref" - } -] \ No newline at end of file diff --git a/tests/unit/data/etl_data/ensembl_104.gff3 b/tests/unit/data/etl_data/ensembl_104.gff3 new file mode 100644 index 00000000..488e93b5 --- /dev/null +++ b/tests/unit/data/etl_data/ensembl_104.gff3 @@ -0,0 +1,248 @@ +##gff-version 3 +##sequence-region 1 1 248956422 +##sequence-region 10 1 133797422 +##sequence-region 11 1 135086622 +##sequence-region 12 1 133275309 +##sequence-region 13 1 114364328 +##sequence-region 14 1 107043718 +##sequence-region 15 1 101991189 +##sequence-region 16 1 90338345 +##sequence-region 17 1 83257441 +##sequence-region 18 1 80373285 +##sequence-region 19 1 58617616 +##sequence-region 2 1 242193529 +##sequence-region 20 1 64444167 +##sequence-region 21 1 46709983 +##sequence-region 22 1 50818468 +##sequence-region 3 1 198295559 +##sequence-region 4 1 190214555 +##sequence-region 5 1 181538259 +##sequence-region 6 1 170805979 +##sequence-region 7 1 159345973 +##sequence-region 8 1 145138636 +##sequence-region 9 1 138394717 +##sequence-region GL000008.2 1 209709 +##sequence-region GL000009.2 1 201709 +##sequence-region GL000194.1 1 191469 +##sequence-region GL000195.1 1 182896 +##sequence-region GL000205.2 1 185591 +##sequence-region GL000208.1 1 92689 +##sequence-region GL000213.1 1 164239 +##sequence-region GL000214.1 1 137718 +##sequence-region GL000216.2 1 176608 +##sequence-region GL000218.1 1 161147 +##sequence-region GL000219.1 1 179198 +##sequence-region GL000220.1 1 161802 +##sequence-region GL000221.1 1 155397 +##sequence-region GL000224.1 1 179693 +##sequence-region GL000225.1 1 211173 +##sequence-region GL000226.1 1 15008 +##sequence-region KI270302.1 1 2274 +##sequence-region KI270303.1 1 1942 +##sequence-region KI270304.1 1 2165 +##sequence-region KI270305.1 1 1472 +##sequence-region KI270310.1 1 1201 +##sequence-region KI270311.1 1 12399 +##sequence-region KI270312.1 1 998 +##sequence-region KI270315.1 1 2276 +##sequence-region KI270316.1 1 1444 +##sequence-region KI270317.1 1 37690 +##sequence-region KI270320.1 1 4416 +##sequence-region KI270322.1 1 21476 +##sequence-region KI270329.1 1 1040 +##sequence-region KI270330.1 1 1652 +##sequence-region KI270333.1 1 2699 +##sequence-region KI270334.1 1 1368 +##sequence-region KI270335.1 1 1048 +##sequence-region KI270336.1 1 1026 +##sequence-region KI270337.1 1 1121 +##sequence-region KI270338.1 1 1428 +##sequence-region KI270340.1 1 1428 +##sequence-region KI270362.1 1 3530 +##sequence-region KI270363.1 1 1803 +##sequence-region KI270364.1 1 2855 +##sequence-region KI270366.1 1 8320 +##sequence-region KI270371.1 1 2805 +##sequence-region KI270372.1 1 1650 +##sequence-region KI270373.1 1 1451 +##sequence-region KI270374.1 1 2656 +##sequence-region KI270375.1 1 2378 +##sequence-region KI270376.1 1 1136 +##sequence-region KI270378.1 1 1048 +##sequence-region KI270379.1 1 1045 +##sequence-region KI270381.1 1 1930 +##sequence-region KI270382.1 1 4215 +##sequence-region KI270383.1 1 1750 +##sequence-region KI270384.1 1 1658 +##sequence-region KI270385.1 1 990 +##sequence-region KI270386.1 1 1788 +##sequence-region KI270387.1 1 1537 +##sequence-region KI270388.1 1 1216 +##sequence-region KI270389.1 1 1298 +##sequence-region KI270390.1 1 2387 +##sequence-region KI270391.1 1 1484 +##sequence-region KI270392.1 1 971 +##sequence-region KI270393.1 1 1308 +##sequence-region KI270394.1 1 970 +##sequence-region KI270395.1 1 1143 +##sequence-region KI270396.1 1 1880 +##sequence-region KI270411.1 1 2646 +##sequence-region KI270412.1 1 1179 +##sequence-region KI270414.1 1 2489 +##sequence-region KI270417.1 1 2043 +##sequence-region KI270418.1 1 2145 +##sequence-region KI270419.1 1 1029 +##sequence-region KI270420.1 1 2321 +##sequence-region KI270422.1 1 1445 +##sequence-region KI270423.1 1 981 +##sequence-region KI270424.1 1 2140 +##sequence-region KI270425.1 1 1884 +##sequence-region KI270429.1 1 1361 +##sequence-region KI270435.1 1 92983 +##sequence-region KI270438.1 1 112505 +##sequence-region KI270442.1 1 392061 +##sequence-region KI270448.1 1 7992 +##sequence-region KI270465.1 1 1774 +##sequence-region KI270466.1 1 1233 +##sequence-region KI270467.1 1 3920 +##sequence-region KI270468.1 1 4055 +##sequence-region KI270507.1 1 5353 +##sequence-region KI270508.1 1 1951 +##sequence-region KI270509.1 1 2318 +##sequence-region KI270510.1 1 2415 +##sequence-region KI270511.1 1 8127 +##sequence-region KI270512.1 1 22689 +##sequence-region KI270515.1 1 6361 +##sequence-region KI270516.1 1 1300 +##sequence-region KI270517.1 1 3253 +##sequence-region KI270518.1 1 2186 +##sequence-region KI270519.1 1 138126 +##sequence-region KI270521.1 1 7642 +##sequence-region KI270522.1 1 5674 +##sequence-region KI270528.1 1 2983 +##sequence-region KI270529.1 1 1899 +##sequence-region KI270530.1 1 2168 +##sequence-region KI270538.1 1 91309 +##sequence-region KI270539.1 1 993 +##sequence-region KI270544.1 1 1202 +##sequence-region KI270548.1 1 1599 +##sequence-region KI270579.1 1 31033 +##sequence-region KI270580.1 1 1553 +##sequence-region KI270581.1 1 7046 +##sequence-region KI270582.1 1 6504 +##sequence-region KI270583.1 1 1400 +##sequence-region KI270584.1 1 4513 +##sequence-region KI270587.1 1 2969 +##sequence-region KI270588.1 1 6158 +##sequence-region KI270589.1 1 44474 +##sequence-region KI270590.1 1 4685 +##sequence-region KI270591.1 1 5796 +##sequence-region KI270593.1 1 3041 +##sequence-region KI270706.1 1 175055 +##sequence-region KI270707.1 1 32032 +##sequence-region KI270708.1 1 127682 +##sequence-region KI270709.1 1 66860 +##sequence-region KI270710.1 1 40176 +##sequence-region KI270711.1 1 42210 +##sequence-region KI270712.1 1 176043 +##sequence-region KI270713.1 1 40745 +##sequence-region KI270714.1 1 41717 +##sequence-region KI270715.1 1 161471 +##sequence-region KI270716.1 1 153799 +##sequence-region KI270717.1 1 40062 +##sequence-region KI270718.1 1 38054 +##sequence-region KI270719.1 1 176845 +##sequence-region KI270720.1 1 39050 +##sequence-region KI270721.1 1 100316 +##sequence-region KI270722.1 1 194050 +##sequence-region KI270723.1 1 38115 +##sequence-region KI270724.1 1 39555 +##sequence-region KI270725.1 1 172810 +##sequence-region KI270726.1 1 43739 +##sequence-region KI270727.1 1 448248 +##sequence-region KI270728.1 1 1872759 +##sequence-region KI270729.1 1 280839 +##sequence-region KI270730.1 1 112551 +##sequence-region KI270731.1 1 150754 +##sequence-region KI270732.1 1 41543 +##sequence-region KI270733.1 1 179772 +##sequence-region KI270734.1 1 165050 +##sequence-region KI270735.1 1 42811 +##sequence-region KI270736.1 1 181920 +##sequence-region KI270737.1 1 103838 +##sequence-region KI270738.1 1 99375 +##sequence-region KI270739.1 1 73985 +##sequence-region KI270740.1 1 37240 +##sequence-region KI270741.1 1 157432 +##sequence-region KI270742.1 1 186739 +##sequence-region KI270743.1 1 210658 +##sequence-region KI270744.1 1 168472 +##sequence-region KI270745.1 1 41891 +##sequence-region KI270746.1 1 66486 +##sequence-region KI270747.1 1 198735 +##sequence-region KI270748.1 1 93321 +##sequence-region KI270749.1 1 158759 +##sequence-region KI270750.1 1 148850 +##sequence-region KI270751.1 1 150742 +##sequence-region KI270752.1 1 27745 +##sequence-region KI270753.1 1 62944 +##sequence-region KI270754.1 1 40191 +##sequence-region KI270755.1 1 36723 +##sequence-region KI270756.1 1 79590 +##sequence-region KI270757.1 1 71251 +##sequence-region MT 1 16569 +##sequence-region X 1 156040895 +##sequence-region Y 2752083 56887902 +#!genome-build Genome Reference Consortium GRCh38.p13 +#!genome-version GRCh38 +#!genome-date 2013-12 +#!genome-build-accession GCA_000001405.28 +#!genebuild-last-updated 2021-03 +1 GRCh38 chromosome 1 248956422 . . . ID=chromosome:1;Alias=CM000663.2,chr1,NC_000001.11 +### +1 havana pseudogene 11869 14409 . + . ID=gene:ENSG00000223972;Name=DDX11L1;biotype=transcribed_unprocessed_pseudogene;description=DEAD/H-box helicase 11 like 1 (pseudogene) [Source:HGNC Symbol%3BAcc:HGNC:37102];gene_id=ENSG00000223972;logic_name=havana_homo_sapiens;version=5 +### +1 ensembl_havana gene 220148293 220272453 . - . ID=gene:ENSG00000118873;Name=RAB3GAP2;biotype=protein_coding;description=RAB3 GTPase activating non-catalytic protein subunit 2 [Source:HGNC Symbol%3BAcc:HGNC:17168];gene_id=ENSG00000118873;logic_name=ensembl_havana_gene_homo_sapiens;version=16 +### +2 GRCh38 chromosome 1 242193529 . . . ID=chromosome:2;Alias=CM000664.2,chr2,NC_000002.12 +### +2 ensembl ncRNA_gene 47781379 47781465 . + . ID=gene:ENSG00000283502;Name=U6;biotype=snRNA;description=U6 spliceosomal RNA [Source:RFAM%3BAcc:RF00026];gene_id=ENSG00000283502;logic_name=ncrna_homo_sapiens;version=1 +### +3 GRCh38 chromosome 1 198295559 . . . ID=chromosome:3;Alias=CM000665.2,chr3,NC_000003.12 +### +3 ensembl_havana gene 130678934 130746829 . - . ID=gene:ENSG00000196455;Name=PIK3R4;biotype=protein_coding;description=phosphoinositide-3-kinase regulatory subunit 4 [Source:HGNC Symbol%3BAcc:HGNC:8982];gene_id=ENSG00000196455;logic_name=ensembl_havana_gene_homo_sapiens;version=8 +### +7 GRCh38 chromosome 1 159345973 . . . ID=chromosome:7;Alias=CM000669.2,chr7,NC_000007.14 +### +7 ensembl_havana gene 100889994 100896974 . - . ID=gene:ENSG00000087085;Name=ACHE;biotype=protein_coding;description=acetylcholinesterase (Cartwright blood group) [Source:HGNC Symbol%3BAcc:HGNC:108];gene_id=ENSG00000087085;logic_name=ensembl_havana_gene_homo_sapiens;version=16 +### +7 ensembl_havana gene 140719327 140924929 . - . ID=gene:ENSG00000157764;Name=BRAF;biotype=protein_coding;description=B-Raf proto-oncogene%2C serine/threonine kinase [Source:HGNC Symbol%3BAcc:HGNC:1097];gene_id=ENSG00000157764;logic_name=ensembl_havana_gene_homo_sapiens;version=14 +### +9 GRCh38 chromosome 1 138394717 . . . ID=chromosome:9;Alias=CM000671.2,chr9,NC_000009.12 +### +9 ensembl_havana gene 130713016 130887675 . + . ID=gene:ENSG00000097007;Name=ABL1;biotype=protein_coding;description=ABL proto-oncogene 1%2C non-receptor tyrosine kinase [Source:HGNC Symbol%3BAcc:HGNC:76];gene_id=ENSG00000097007;logic_name=ensembl_havana_gene_homo_sapiens;version=19 +### +11 GRCh38 chromosome 1 135086622 . . . ID=chromosome:11;Alias=CM000673.2,chr11,NC_000011.10 +### +11 ensembl_havana gene 10751246 10801625 . + . ID=gene:ENSG00000198730;Name=CTR9;biotype=protein_coding;description=CTR9 homolog%2C Paf1/RNA polymerase II complex component [Source:HGNC Symbol%3BAcc:HGNC:16850];gene_id=ENSG00000198730;logic_name=ensembl_havana_gene_homo_sapiens;version=9 +### +15 GRCh38 chromosome 1 101991189 . . . ID=chromosome:15;Alias=CM000677.2,chr15,NC_000015.10 +### +15 ensembl_havana gene 89784895 89815401 . - . ID=gene:ENSG00000166825;Name=ANPEP;biotype=protein_coding;description=alanyl aminopeptidase%2C membrane [Source:HGNC Symbol%3BAcc:HGNC:500];gene_id=ENSG00000166825;logic_name=ensembl_havana_gene_homo_sapiens;version=15 +### +17 GRCh38 chromosome 1 83257441 . . . ID=chromosome:17;Alias=CM000679.2,chr17,NC_000017.11 +### +17 ensembl_havana gene 7661779 7687538 . - . ID=gene:ENSG00000141510;Name=TP53;biotype=protein_coding;description=tumor protein p53 [Source:HGNC Symbol%3BAcc:HGNC:11998];gene_id=ENSG00000141510;logic_name=ensembl_havana_gene_homo_sapiens;version=18 +### +17 havana ncRNA_gene 2748078 2748182 . + . ID=gene:ENSG00000272920;Name=hsa-mir-1253;biotype=lncRNA;description=hsa-mir-1253 [Source:miRBase%3BAcc:MI0006387];gene_id=ENSG00000272920;logic_name=havana_homo_sapiens;version=1 +### +19 GRCh38 chromosome 1 58617616 . . . ID=chromosome:19;Alias=CM000681.2,chr19,NC_000019.10 +### +19 ensembl_havana gene 4402640 4445018 . + . ID=gene:ENSG00000167670;Name=CHAF1A;biotype=protein_coding;description=chromatin assembly factor 1 subunit A [Source:HGNC Symbol%3BAcc:HGNC:1910];gene_id=ENSG00000167670;logic_name=ensembl_havana_gene_homo_sapiens;version=16 +### +X GRCh38 chromosome 1 156040895 . . . ID=chromosome:X;Alias=CM000685.2,chrX,NC_000023.11 +### +X havana ncRNA_gene 154424380 154428479 . - . ID=gene:ENSG00000197180;Name=CH17-340M24.3;biotype=lncRNA;description=uncharacterized protein BC009467 [Source:NCBI gene (formerly Entrezgene)%3BAcc:158960];gene_id=ENSG00000197180;logic_name=havana_homo_sapiens;version=2 +### +X ensembl_havana gene 155612572 155782459 . + . ID=gene:ENSG00000168939;Name=SPRY3;biotype=protein_coding;description=sprouty RTK signaling antagonist 3 [Source:HGNC Symbol%3BAcc:HGNC:11271];gene_id=ENSG00000168939;logic_name=ensembl_havana_gene_homo_sapiens;version=12 +### diff --git a/tests/unit/data/etl_data/hgnc_20210810.json b/tests/unit/data/etl_data/hgnc_20210810.json new file mode 100644 index 00000000..2d1e0024 --- /dev/null +++ b/tests/unit/data/etl_data/hgnc_20210810.json @@ -0,0 +1,1175 @@ +{ + "responseHeader": { + "status": 0, + "QTime": 21 + }, + "response": { + "numFound": 42759, + "docs": [ + { + "symbol_report_tag": [ + "Stable symbol" + ], + "vega_id": "OTTHUMG00000157033", + "locus_group": "protein-coding gene", + "mane_select": [ + "ENST00000241069.11", + "NM_000665.5" + ], + "_version_": 1707696195519905792, + "uuid": "ac1d39ff-c8fd-4ad1-b713-4731cad0ee18", + "prev_name": [ + "acetylcholinesterase (YT blood group)", + "acetylcholinesterase (Yt blood group)" + ], + "lsdb": [ + "Blood Group Antigen Mutation Database|http://www.ncbi.nlm.nih.gov/gv/mhc/xslcgi.cgi?cmd=bgmut/home", + "LRG_804|http://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_804.xml" + ], + "refseq_accession": [ + "NM_015831" + ], + "hgnc_id": "HGNC:108", + "entrez_id": "43", + "symbol": "ACHE", + "location": "7q22.1", + "name": "acetylcholinesterase (Cartwright blood group)", + "mgd_id": [ + "MGI:87876" + ], + "prev_symbol": [ + "YT" + ], + "alias_name": [ + "Yt blood group" + ], + "date_approved_reserved": "1989-06-02", + "status": "Approved", + "merops": "S09.979", + "locus_type": "gene with protein product", + "iuphar": "objectId:2465", + "agr": "HGNC:108", + "rgd_id": [ + "RGD:69313" + ], + "ensembl_gene_id": "ENSG00000087085", + "gene_group": [ + "Blood group antigens" + ], + "date_name_changed": "2016-03-30", + "omim_id": [ + "100740" + ], + "date_modified": "2021-05-26", + "ucsc_id": "uc003uxi.4", + "enzyme_id": [ + "3.1.1.7" + ], + "uniprot_ids": [ + "P22303" + ], + "ccds_id": [ + "CCDS64736", + "CCDS5709", + "CCDS5710" + ], + "pubmed_id": [ + 1380483 + ], + "gene_group_id": [ + 454 + ], + "location_sortable": "07q22.1" + }, + { + "symbol_report_tag": [ + "Stable symbol" + ], + "vega_id": "OTTHUMG00000157457", + "locus_group": "protein-coding gene", + "mane_select": [ + "ENST00000644969.2", + "NM_001374258.1" + ], + "alias_symbol": [ + "BRAF1" + ], + "_version_": 1707696197057118209, + "uuid": "fbac0c54-1e0e-47af-a39a-82949139939c", + "prev_name": [ + "v-raf murine sarcoma viral oncogene homolog B" + ], + "lsdb": [ + "LRG_299|http://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_299.xml" + ], + "refseq_accession": [ + "NM_004333" + ], + "cosmic": "BRAF", + "hgnc_id": "HGNC:1097", + "entrez_id": "673", + "symbol": "BRAF", + "location": "7q34", + "name": "B-Raf proto-oncogene, serine/threonine kinase", + "mgd_id": [ + "MGI:88190" + ], + "orphanet": 119066, + "date_approved_reserved": "1991-07-16", + "status": "Approved", + "locus_type": "gene with protein product", + "iuphar": "objectId:1943", + "agr": "HGNC:1097", + "rgd_id": [ + "RGD:619908" + ], + "ensembl_gene_id": "ENSG00000157764", + "gene_group": [ + "Mitogen-activated protein kinase kinase kinases", + "RAF family" + ], + "date_name_changed": "2014-06-26", + "omim_id": [ + "164757" + ], + "date_modified": "2021-05-26", + "ucsc_id": "uc003vwc.5", + "uniprot_ids": [ + "P15056" + ], + "ena": [ + "M95712" + ], + "ccds_id": [ + "CCDS87555", + "CCDS5863" + ], + "pubmed_id": [ + 2284096, + 1565476 + ], + "gene_group_id": [ + 654, + 1157 + ], + "location_sortable": "07q34" + }, + { + "symbol_report_tag": [ + "Stable symbol" + ], + "date_approved_reserved": "1986-01-01", + "alias_name": [ + "Li-Fraumeni syndrome" + ], + "vega_id": "OTTHUMG00000162125", + "locus_group": "protein-coding gene", + "mane_select": [ + "ENST00000269305.9", + "NM_000546.6" + ], + "status": "Approved", + "alias_symbol": [ + "p53", + "LFS1" + ], + "_version_": 1707696218478477314, + "uuid": "d674d23a-6ac1-4482-a2da-81a8837c07f3", + "lsdb": [ + "IARC TP53 Mutation Database|http://www-p53.iarc.fr/", + "p53 UMD TP53 mutation database|http://p53.fr/", + "Database of Germline p53 Mutations|http://www.lf2.cuni.cz/projects/germline_mut_p53.htm", + "MUTP53LOAD, Mutant p53 Loss Of Activity Database|http://www.umd.be:2072/", + "LRG_321|http://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_321.xml" + ], + "refseq_accession": [ + "NM_000546" + ], + "locus_type": "gene with protein product", + "agr": "HGNC:11998", + "cosmic": "TP53", + "hgnc_id": "HGNC:11998", + "rgd_id": [ + "RGD:3889" + ], + "ensembl_gene_id": "ENSG00000141510", + "entrez_id": "7157", + "omim_id": [ + "191170" + ], + "symbol": "TP53", + "date_name_changed": "2008-01-16", + "location": "17p13.1", + "name": "tumor protein p53", + "date_modified": "2021-05-26", + "mgd_id": [ + "MGI:98834" + ], + "ucsc_id": "uc060aur.1", + "uniprot_ids": [ + "P04637" + ], + "orphanet": 120204, + "ccds_id": [ + "CCDS73964", + "CCDS11118", + "CCDS45605", + "CCDS45606", + "CCDS73967", + "CCDS73963", + "CCDS73968", + "CCDS73971", + "CCDS73970", + "CCDS73966", + "CCDS73965", + "CCDS73969" + ], + "ena": [ + "AF307851" + ], + "pubmed_id": [ + 6396087, + 3456488, + 2047879 + ], + "location_sortable": "17p13.1" + }, + { + "date_approved_reserved": "2001-06-22", + "symbol": "CECR", + "locus_group": "other", + "location": "22pter-q11", + "status": "Approved", + "date_modified": "2012-10-02", + "name": "cat eye syndrome chromosome region", + "_version_": 1707696198085771265, + "uuid": "6c4dc1c7-6e7a-4775-a185-1d86d544e732", + "locus_type": "region", + "hgnc_id": "HGNC:1838", + "entrez_id": "1055", + "location_sortable": "22pter-q11" + }, + { + "date_approved_reserved": "1999-09-29", + "alias_name": [ + "chromatin assembly factor I (150 kDa)" + ], + "vega_id": "OTTHUMG00000181922", + "locus_group": "protein-coding gene", + "mane_select": [ + "ENST00000301280.10", + "NM_005483.3" + ], + "status": "Approved", + "alias_symbol": [ + "CAF1P150", + "CAF1B", + "CAF-1", + "CAF1", + "P150", + "MGC71229" + ], + "_version_": 1707696198253543425, + "uuid": "cbaac19b-6e86-4b58-9053-e34c3aa5d99e", + "prev_name": [ + "chromatin assembly factor 1, subunit A (p150)" + ], + "refseq_accession": [ + "NM_005483" + ], + "locus_type": "gene with protein product", + "agr": "HGNC:1910", + "hgnc_id": "HGNC:1910", + "rgd_id": [ + "RGD:1590865" + ], + "ensembl_gene_id": "ENSG00000167670", + "entrez_id": "10036", + "omim_id": [ + "601246" + ], + "symbol": "CHAF1A", + "date_name_changed": "2015-11-23", + "location": "19p13.3", + "name": "chromatin assembly factor 1 subunit A", + "date_modified": "2019-08-21", + "mgd_id": [ + "MGI:1351331" + ], + "ucsc_id": "uc002mal.4", + "uniprot_ids": [ + "Q13111" + ], + "ccds_id": [ + "CCDS32875" + ], + "ena": [ + "U20979" + ], + "pubmed_id": [ + 7600578 + ], + "location_sortable": "19p13.3" + }, + { + "date_approved_reserved": "2003-11-13", + "alias_name": [ + "CTF4, chromosome transmission fidelity factor 4 homolog (S. cerevisiae)" + ], + "vega_id": "OTTHUMG00000140304", + "locus_group": "protein-coding gene", + "mane_select": [ + "ENST00000360586.8", + "NM_007086.4" + ], + "status": "Approved", + "alias_symbol": [ + "AND-1", + "CTF4", + "CHTF4" + ], + "_version_": 1707696220276785153, + "uuid": "85c49e45-d6a8-483b-9531-927ec4786436", + "refseq_accession": [ + "NM_007086" + ], + "locus_type": "gene with protein product", + "agr": "HGNC:23170", + "hgnc_id": "HGNC:23170", + "rgd_id": [ + "RGD:1310200" + ], + "ensembl_gene_id": "ENSG00000198554", + "entrez_id": "11169", + "gene_group": [ + "WD repeat domain containing" + ], + "omim_id": [ + "608126" + ], + "symbol": "WDHD1", + "location": "14q22.2-q22.3", + "name": "WD repeat and HMG-box DNA binding protein 1", + "date_modified": "2018-02-13", + "mgd_id": [ + "MGI:2443514" + ], + "ucsc_id": "uc001xbm.3", + "uniprot_ids": [ + "O75717" + ], + "ccds_id": [ + "CCDS41955", + "CCDS9721" + ], + "ena": [ + "AJ006266" + ], + "gene_group_id": [ + 362 + ], + "pubmed_id": [ + 9175701, + 20028748 + ], + "location_sortable": "14q22.2-q22.3" + }, + { + "vega_id": "OTTHUMG00000012533", + "locus_group": "protein-coding gene", + "mane_select": [ + "ENST00000381529.9", + "NM_172245.4" + ], + "alias_symbol": [ + "CD116", + "alphaGMR" + ], + "_version_": 1707696198915194881, + "uuid": "3095e0d7-4533-41e7-a543-8ef63f9d2037", + "prev_name": [ + "colony stimulating factor 2 receptor, alpha, low-affinity (granulocyte-macrophage)" + ], + "lsdb": [ + "Global Variome shared LOVD|https://databases.lovd.nl/shared/genes/CSF2RA", + "LRG_186|http://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_186.xml" + ], + "refseq_accession": [ + "NM_001161529" + ], + "hgnc_id": "HGNC:2435", + "entrez_id": "1438", + "symbol": "CSF2RA", + "location": "Xp22.32 and Yp11.3", + "name": "colony stimulating factor 2 receptor subunit alpha", + "mgd_id": [ + "MGI:1339754" + ], + "prev_symbol": [ + "CSF2R" + ], + "orphanet": 209477, + "alias_name": [ + "alpha-GM-CSF receptor" + ], + "date_approved_reserved": "1990-07-03", + "status": "Approved", + "locus_type": "gene with protein product", + "iuphar": "objectId:1707", + "agr": "HGNC:2435", + "rgd_id": [ + "RGD:1594330" + ], + "ensembl_gene_id": "ENSG00000198223", + "gene_group": [ + "CD molecules", + "Pseudoautosomal region 1" + ], + "date_name_changed": "2019-11-26", + "cd": "CD116", + "omim_id": [ + "306250", + "425000" + ], + "date_modified": "2021-04-13", + "ucsc_id": "uc010nvv.3", + "uniprot_ids": [ + "P15509" + ], + "ena": [ + "M64445" + ], + "ccds_id": [ + "CCDS35193", + "CCDS35190", + "CCDS55361", + "CCDS35191", + "CCDS55359", + "CCDS35192", + "CCDS55360" + ], + "pubmed_id": [ + 1702217 + ], + "gene_group_id": [ + 471, + 715 + ], + "location_sortable": "Xp22.32 and Yp11.3" + }, + { + "date_approved_reserved": "2005-05-06", + "alias_name": [ + "iGb3 synthase", + "isoglobotriaosylceramide synthase" + ], + "vega_id": "OTTHUMG00000004125", + "locus_group": "protein-coding gene", + "mane_select": [ + "ENST00000442999.3", + "NM_001080438.1" + ], + "status": "Approved", + "alias_symbol": [ + "IGBS3S", + "IGB3S" + ], + "_version_": 1707696195380445184, + "uuid": "ec929101-693b-4afc-ae1b-bbe1d38f9c62", + "prev_name": [ + "alpha 1,3-galactosyltransferase 2, pseudogene" + ], + "refseq_accession": [ + "NM_001080438" + ], + "locus_type": "gene with protein product", + "agr": "HGNC:30005", + "hgnc_id": "HGNC:30005", + "rgd_id": [ + "RGD:727913" + ], + "ensembl_gene_id": "ENSG00000184389", + "entrez_id": "127550", + "gene_group": [ + "Glycosyltransferase family 6" + ], + "symbol": "A3GALT2", + "date_name_changed": "2013-03-11", + "location": "1p35.1", + "name": "alpha 1,3-galactosyltransferase 2", + "date_modified": "2018-02-08", + "mgd_id": [ + "MGI:2685279" + ], + "ucsc_id": "uc031plq.1", + "prev_symbol": [ + "A3GALT2P" + ], + "uniprot_ids": [ + "U3KPV4" + ], + "ccds_id": [ + "CCDS60080" + ], + "gene_group_id": [ + 429 + ], + "date_symbol_changed": "2013-03-11", + "pubmed_id": [ + 10854427, + 18630988 + ], + "location_sortable": "01p35.1" + }, + { + "date_approved_reserved": "2009-02-18", + "alias_name": [ + "tektin 4 pseudogene 4" + ], + "vega_id": "OTTHUMG00000188065", + "locus_group": "pseudogene", + "status": "Approved", + "alias_symbol": [ + "FLJ35473", + "FLJ00219", + "FLJ39633", + "MIP", + "pp5644", + "TEKT4P4" + ], + "_version_": 1707696205983645696, + "uuid": "3347c721-7e09-4350-a8e7-ed466ac4b616", + "prev_name": [ + "MAFF interacting protein" + ], + "refseq_accession": [ + "NR_046439" + ], + "locus_type": "pseudogene", + "hgnc_id": "HGNC:31102", + "ensembl_gene_id": "ENSG00000274847", + "entrez_id": "727764", + "symbol": "MAFIP", + "date_name_changed": "2012-03-26", + "location": "14 unplaced", + "name": "MAFF interacting protein (pseudogene)", + "date_modified": "2017-09-21", + "uniprot_ids": [ + "Q8WZ33" + ], + "ena": [ + "AK074146", + "AF289559" + ], + "pubmed_id": [ + 16549056, + 15881666 + ], + "location_sortable": "14 unplaced" + }, + { + "date_approved_reserved": "2008-08-29", + "locus_group": "non-coding RNA", + "status": "Approved", + "alias_symbol": [ + "tRNA-Leu-CAG-2-1" + ], + "_version_": 1707696219107622913, + "uuid": "6d4dfe64-1599-4570-8287-da84de9c772d", + "rna_central_id": [ + "URS00000FB60D" + ], + "prev_name": [ + "transfer RNA leucine 13 (anticodon CAG)", + "transfer RNA-Leu (CAG) 2-1" + ], + "locus_type": "RNA, transfer", + "agr": "HGNC:34692", + "hgnc_id": "HGNC:34692", + "gtrnadb": "tRNA-Leu-CAG-2-1", + "entrez_id": "100189130", + "gene_group": [ + "Cytoplasmic transfer RNAs" + ], + "symbol": "TRL-CAG2-1", + "date_name_changed": "2019-04-04", + "location": "16q13-21", + "name": "tRNA-Leu (anticodon CAG) 2-1", + "date_modified": "2019-04-04", + "prev_symbol": [ + "TRNAL13" + ], + "ena": [ + "HG983896" + ], + "gene_group_id": [ + 842 + ], + "date_symbol_changed": "2014-06-19", + "location_sortable": "16q13-21" + }, + { + "date_approved_reserved": "2009-03-06", + "symbol": "RPS24P5", + "locus_group": "pseudogene", + "location": "1p36.13-q41", + "status": "Approved", + "date_modified": "2009-03-11", + "name": "ribosomal protein S24 pseudogene 5", + "_version_": 1707696215183851521, + "uuid": "547b3b9d-dcae-45d3-b6dd-d344bc433681", + "refseq_accession": [ + "NG_011274" + ], + "locus_type": "pseudogene", + "agr": "HGNC:36026", + "hgnc_id": "HGNC:36026", + "pubmed_id": [ + 19123937 + ], + "entrez_id": "100271094", + "location_sortable": "01p36.13-q41" + }, + { + "date_approved_reserved": "2009-07-20", + "vega_id": "OTTHUMG00000183508", + "locus_group": "non-coding RNA", + "status": "Approved", + "alias_symbol": [ + "FLJ23569" + ], + "_version_": 1707696195375202304, + "uuid": "7241a986-d17e-4b06-9851-2f8e7dec7d01", + "rna_central_id": [ + "URS00007E4F6E" + ], + "prev_name": [ + "non-protein coding RNA 181", + "A1BG antisense RNA (non-protein coding)", + "A1BG antisense RNA 1 (non-protein coding)" + ], + "refseq_accession": [ + "NR_015380" + ], + "locus_type": "RNA, long non-coding", + "agr": "HGNC:37133", + "hgnc_id": "HGNC:37133", + "ensembl_gene_id": "ENSG00000268895", + "entrez_id": "503538", + "gene_group": [ + "Antisense RNAs" + ], + "symbol": "A1BG-AS1", + "date_name_changed": "2012-08-15", + "location": "19q13.43", + "lncipedia": "A1BG-AS1", + "name": "A1BG antisense RNA 1", + "date_modified": "2013-06-27", + "ucsc_id": "uc002qse.3", + "prev_symbol": [ + "NCRNA00181", + "A1BGAS", + "A1BG-AS" + ], + "ena": [ + "BC040926" + ], + "gene_group_id": [ + 1987 + ], + "date_symbol_changed": "2010-11-25", + "location_sortable": "19q13.43" + }, + { + "gene_group": [ + "Piwi-interacting RNA clusters" + ], + "date_approved_reserved": "2009-11-05", + "symbol": "PIRC24", + "locus_group": "non-coding RNA", + "location": "6", + "status": "Approved", + "date_modified": "2014-11-18", + "name": "piwi-interacting RNA cluster 24", + "_version_": 1707696210573262850, + "uuid": "fb149011-fbe7-454e-852b-9b694324ea30", + "locus_type": "RNA, cluster", + "hgnc_id": "HGNC:37528", + "gene_group_id": [ + 851 + ], + "pubmed_id": [ + 17881367 + ], + "entrez_id": "100313810", + "location_sortable": "06" + }, + { + "date_approved_reserved": "1998-07-15", + "locus_group": "other", + "status": "Approved", + "alias_symbol": [ + "GSD1aSP" + ], + "_version_": 1707696201536634880, + "uuid": "87b7050e-610d-4e4b-9359-e40f0d316d85", + "curator_notes": [ + "This gene has the locus type unknown because it has never been mapped to the human genome." + ], + "prev_name": [ + "glucose-6-phosphatase, regulatory" + ], + "locus_type": "unknown", + "hgnc_id": "HGNC:4059", + "entrez_id": "2541", + "symbol": "G6PR", + "date_name_changed": "2004-05-20", + "location": "reserved", + "name": "glucose-6-phosphatase regulator", + "date_modified": "2016-04-20", + "pubmed_id": [ + 2172641, + 7814621, + 2996501 + ], + "location_sortable": "reserved" + }, + { + "date_approved_reserved": "1995-05-10", + "alias_name": [ + "cancer/testis antigen family 4, member 4" + ], + "locus_group": "protein-coding gene", + "status": "Approved", + "alias_symbol": [ + "CT4.4" + ], + "_version_": 1707696201573335041, + "uuid": "81d15344-3ea2-4f99-8232-aca8f42c9830", + "refseq_accession": [ + "NM_001474" + ], + "locus_type": "gene with protein product", + "hgnc_id": "HGNC:4101", + "entrez_id": "2576", + "gene_group": [ + "GAGE family" + ], + "omim_id": [ + "300597" + ], + "symbol": "GAGE4", + "location": "Xp11.4-p11.2 not on reference assembly", + "name": "G antigen 4", + "date_modified": "2019-06-20", + "uniprot_ids": [ + "P0DSO3" + ], + "ena": [ + "U19145" + ], + "gene_group_id": [ + 1845 + ], + "pubmed_id": [ + 7544395 + ], + "location_sortable": "Xp11.4-p11.2 not on reference assembly" + }, + { + "date_approved_reserved": "1994-08-10", + "locus_group": "protein-coding gene", + "status": "Approved", + "_version_": 1707696202216112128, + "uuid": "1faed30e-867c-483f-830a-0d9e8c40ba74", + "refseq_accession": [ + "NM_000853" + ], + "locus_type": "gene with protein product", + "hgnc_id": "HGNC:4641", + "rgd_id": [ + "RGD:2765" + ], + "ensembl_gene_id": "ENSG00000277656", + "entrez_id": "2952", + "gene_group": [ + "Soluble glutathione S-transferases" + ], + "omim_id": [ + "600436" + ], + "symbol": "GSTT1", + "location": "22q11.23 alternate reference locus", + "name": "glutathione S-transferase theta 1", + "date_modified": "2015-07-31", + "mgd_id": [ + "MGI:107379" + ], + "ucsc_id": "uc002zze.4", + "enzyme_id": [ + "2.5.1.18" + ], + "uniprot_ids": [ + "P30711" + ], + "orphanet": 470418, + "ena": [ + "KI270879" + ], + "gene_group_id": [ + 567 + ], + "pubmed_id": [ + 8617495 + ], + "location_sortable": "22q11.23 alternate reference locus" + }, + { + "vega_id": "OTTHUMG00000149814", + "locus_group": "protein-coding gene", + "mane_select": [ + "ENST00000300060.7", + "NM_001150.3" + ], + "alias_symbol": [ + "LAP1", + "gp150", + "p150" + ], + "_version_": 1707696196168974337, + "uuid": "496dd55c-564d-4ec3-996c-4e08b2385543", + "prev_name": [ + "alanyl (membrane) aminopeptidase" + ], + "refseq_accession": [ + "NM_001150" + ], + "hgnc_id": "HGNC:500", + "entrez_id": "290", + "symbol": "ANPEP", + "location": "15q26.1", + "name": "alanyl aminopeptidase, membrane", + "mgd_id": [ + "MGI:5000466" + ], + "prev_symbol": [ + "CD13", + "PEPN" + ], + "alias_name": [ + "aminopeptidase N", + "aminopeptidase M", + "microsomal aminopeptidase", + "membrane alanyl aminopeptidase" + ], + "date_approved_reserved": "1989-02-28", + "status": "Approved", + "merops": "M01.001", + "locus_type": "gene with protein product", + "iuphar": "objectId:1560", + "agr": "HGNC:500", + "rgd_id": [ + "RGD:2991" + ], + "ensembl_gene_id": "ENSG00000166825", + "gene_group": [ + "Aminopeptidases", + "CD molecules", + "M1 metallopeptidases" + ], + "date_name_changed": "2016-01-06", + "cd": "CD13", + "omim_id": [ + "151530" + ], + "date_modified": "2020-09-17", + "ucsc_id": "uc002bop.5", + "enzyme_id": [ + "3.4.11.2" + ], + "uniprot_ids": [ + "P15144" + ], + "ena": [ + "M22324" + ], + "ccds_id": [ + "CCDS10356" + ], + "pubmed_id": [ + 2428842, + 1977688 + ], + "gene_group_id": [ + 104, + 471, + 1437 + ], + "location_sortable": "15q26.1" + }, + { + "date_approved_reserved": "1989-10-11", + "locus_group": "other", + "status": "Approved", + "_version_": 1707696207810265089, + "uuid": "9faf5b05-e788-4d19-b2a6-c467ec68d246", + "prev_name": [ + "7S DNA" + ], + "locus_type": "region", + "hgnc_id": "HGNC:7409", + "gene_group": [ + "Mitochondrially encoded regions" + ], + "symbol": "MT-7SDNA", + "date_name_changed": "2005-02-15", + "location": "mitochondria", + "name": "mitochondrially encoded 7S DNA", + "date_modified": "2019-10-09", + "prev_symbol": [ + "MT7SDNA" + ], + "pubmed_id": [ + 24709344, + 273237 + ], + "date_symbol_changed": "2019-10-09", + "gene_group_id": [ + 1973 + ], + "location_sortable": "mitochondria" + }, + { + "symbol_report_tag": [ + "Stable symbol" + ], + "vega_id": "OTTHUMG00000020813", + "locus_group": "protein-coding gene", + "mane_select": [ + "ENST00000318560.6", + "NM_005157.6" + ], + "alias_symbol": [ + "JTK7", + "c-ABL", + "p150" + ], + "_version_": 1707696195483205632, + "uuid": "f3c383ea-b6bd-4a03-bff0-aa3be952a670", + "prev_name": [ + "v-abl Abelson murine leukemia viral oncogene homolog 1", + "c-abl oncogene 1, receptor tyrosine kinase", + "c-abl oncogene 1, non-receptor tyrosine kinase" + ], + "lsdb": [ + "LRG_769|http://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_769.xml" + ], + "refseq_accession": [ + "NM_007313" + ], + "cosmic": "ABL1", + "hgnc_id": "HGNC:76", + "entrez_id": "25", + "symbol": "ABL1", + "location": "9q34.12", + "name": "ABL proto-oncogene 1, non-receptor tyrosine kinase", + "mgd_id": [ + "MGI:87859" + ], + "prev_symbol": [ + "ABL" + ], + "orphanet": 117691, + "date_approved_reserved": "1986-01-01", + "status": "Approved", + "locus_type": "gene with protein product", + "iuphar": "objectId:1923", + "agr": "HGNC:76", + "rgd_id": [ + "RGD:1584969" + ], + "ensembl_gene_id": "ENSG00000097007", + "gene_group": [ + "SH2 domain containing", + "Abl family tyrosine kinases" + ], + "date_name_changed": "2014-06-26", + "omim_id": [ + "189980" + ], + "date_modified": "2021-05-26", + "ucsc_id": "uc004bzv.4", + "uniprot_ids": [ + "P00519" + ], + "ena": [ + "M14752" + ], + "ccds_id": [ + "CCDS35165", + "CCDS35166" + ], + "pubmed_id": [ + 1857987, + 12626632 + ], + "gene_group_id": [ + 741, + 1463 + ], + "location_sortable": "09q34.12" + }, + { + "symbol_report_tag": [ + "Stable symbol" + ], + "date_approved_reserved": "1996-04-04", + "vega_id": "OTTHUMG00000179843", + "locus_group": "protein-coding gene", + "mane_select": [ + "ENST00000285039.12", + "NM_001080467.3" + ], + "status": "Approved", + "alias_symbol": [ + "KIAA1119" + ], + "_version_": 1707696208307290113, + "uuid": "e6878d55-46d4-406a-ab31-e1e51f3041f1", + "refseq_accession": [ + "NM_001080467" + ], + "locus_type": "gene with protein product", + "agr": "HGNC:7603", + "hgnc_id": "HGNC:7603", + "rgd_id": [ + "RGD:621347" + ], + "ensembl_gene_id": "ENSG00000167306", + "entrez_id": "4645", + "gene_group": [ + "Myosin heavy chains, class V", + "MicroRNA protein coding host genes" + ], + "omim_id": [ + "606540" + ], + "symbol": "MYO5B", + "location": "18q", + "name": "myosin VB", + "date_modified": "2021-05-26", + "mgd_id": [ + "MGI:106598" + ], + "ucsc_id": "uc002leb.3", + "uniprot_ids": [ + "Q9ULV0" + ], + "orphanet": 171089, + "ccds_id": [ + "CCDS42436" + ], + "ena": [ + "AB032945" + ], + "gene_group_id": [ + 1100, + 1691 + ], + "pubmed_id": [ + 8884266, + 17462998 + ], + "location_sortable": "18q" + }, + { + "date_approved_reserved": "2000-04-11", + "vega_id": "OTTHUMG00000159645", + "locus_group": "protein-coding gene", + "mane_select": [ + "ENST00000356763.8", + "NM_014602.3" + ], + "status": "Approved", + "alias_symbol": [ + "VPS15", + "p150" + ], + "_version_": 1707696210538659841, + "uuid": "870b459d-1527-4192-a8a3-f09cb4de0f65", + "prev_name": [ + "phosphoinositide-3-kinase, regulatory subunit 4" + ], + "refseq_accession": [ + "NM_014602" + ], + "locus_type": "gene with protein product", + "iuphar": "objectId:2157", + "agr": "HGNC:8982", + "hgnc_id": "HGNC:8982", + "rgd_id": [ + "RGD:1311809" + ], + "ensembl_gene_id": "ENSG00000196455", + "entrez_id": "30849", + "gene_group": [ + "WD repeat domain containing", + "Armadillo like helical domain containing", + "PIK3C3 complex subunits" + ], + "omim_id": [ + "602610" + ], + "symbol": "PIK3R4", + "date_name_changed": "2015-11-17", + "location": "3q22.1", + "name": "phosphoinositide-3-kinase regulatory subunit 4", + "date_modified": "2015-11-17", + "mgd_id": [ + "MGI:1922919" + ], + "ucsc_id": "uc003enj.4", + "uniprot_ids": [ + "Q99570" + ], + "ccds_id": [ + "CCDS3067" + ], + "ena": [ + "Y08991" + ], + "gene_group_id": [ + 362, + 1492, + 1596 + ], + "pubmed_id": [ + 8999962 + ], + "location_sortable": "03q22.1" + }, + { + "date_approved_reserved": "1986-01-01", + "symbol": "FRA10A", + "locus_group": "other", + "location": "10q23.3 or 10q24.2", + "status": "Approved", + "date_modified": "2020-04-20", + "name": "fragile site, folic acid type, rare, fra(10)(q23.3) or fra(10)(q24.2)", + "_version_": 1707696201387737088, + "uuid": "334225e1-0a3b-4407-afbc-47f2fd98bfc9", + "refseq_accession": [ + "NG_052564" + ], + "locus_type": "fragile site", + "hgnc_id": "HGNC:3829", + "pubmed_id": [ + 15203205 + ], + "entrez_id": "109280162", + "location_sortable": "10q23.3 or 10q24.2" + } + ], + "start": 0 + } +} \ No newline at end of file diff --git a/tests/unit/data/etl_data/ncbi_GRCh38.p13.gff b/tests/unit/data/etl_data/ncbi_GRCh38.p13.gff new file mode 100644 index 00000000..9fdf5abe --- /dev/null +++ b/tests/unit/data/etl_data/ncbi_GRCh38.p13.gff @@ -0,0 +1,81 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build GRCh38.p13 +#!genome-build-accession NCBI_Assembly:GCF_000001405.39 +#!annotation-date 05/14/2021 +#!annotation-source NCBI Homo sapiens Updated Annotation Release 109.20210514 +##sequence-region NC_000001.11 1 248956422 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NC_000001.11 RefSeq region 1 248956422 . + . ID=NC_000001.11:1..248956422;Dbxref=taxon:9606;Name=1;chromosome=1;gbkey=Src;genome=chromosome;mol_type=genomic DNA +NC_000001.11 BestRefSeq gene 220148293 220272453 . - . ID=gene-RAB3GAP2;Dbxref=GeneID:25782,HGNC:HGNC:17168,MIM:609275;Name=RAB3GAP2;description=RAB3 GTPase activating non-catalytic protein subunit 2;gbkey=Gene;gene=RAB3GAP2;gene_biotype=protein_coding;gene_synonym=p150,RAB3-GAP150,RAB3GAP150,SPG69,WARBM2 +##sequence-region NT_187370.1 1 161471 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NT_187370.1 RefSeq region 1 161471 . + . ID=NT_187370.1:1..161471;Dbxref=taxon:9606;Name=2;chromosome=2;gbkey=Src;genome=genomic;map=unlocalized;mol_type=genomic DNA +##sequence-region NT_187371.1 1 153799 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NT_187371.1 RefSeq region 1 153799 . + . ID=NT_187371.1:1..153799;Dbxref=taxon:9606;Name=2;chromosome=2;gbkey=Src;genome=genomic;map=unlocalized;mol_type=genomic DNA +##sequence-region NC_000003.12 1 198295559 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NC_000003.12 RefSeq region 1 198295559 . + . ID=NC_000003.12:1..198295559;Dbxref=taxon:9606;Name=3;chromosome=3;gbkey=Src;genome=chromosome;mol_type=genomic DNA +NC_000003.12 BestRefSeq gene 130678934 130746829 . - . ID=gene-PIK3R4;Dbxref=GeneID:30849,HGNC:HGNC:8982,MIM:602610;Name=PIK3R4;description=phosphoinositide-3-kinase regulatory subunit 4;gbkey=Gene;gene=PIK3R4;gene_biotype=protein_coding;gene_synonym=p150,VPS15 +##sequence-region NC_000007.14 1 159345973 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NC_000007.14 RefSeq region 1 159345973 . + . ID=NC_000007.14:1..159345973;Dbxref=taxon:9606;Name=7;chromosome=7;gbkey=Src;genome=chromosome;mol_type=genomic DNA +NC_000007.14 BestRefSeq gene 100889994 100896994 . - . ID=gene-ACHE;Dbxref=GeneID:43,HGNC:HGNC:108,MIM:100740;Name=ACHE;description=acetylcholinesterase (Cartwright blood group);gbkey=Gene;gene=ACHE;gene_biotype=protein_coding;gene_synonym=ACEE,ARACHE,N-ACHE,YT +NC_000007.14 BestRefSeq%2CGnomon gene 140713328 140924929 . - . ID=gene-BRAF;Dbxref=GeneID:673,HGNC:HGNC:1097,MIM:164757;Name=BRAF;description=B-Raf proto-oncogene%2C serine/threonine kinase;gbkey=Gene;gene=BRAF;gene_biotype=protein_coding;gene_synonym=B-raf,B-RAF1,BRAF1,NS7,RAFB1 +##sequence-region NC_000008.11 1 145138636 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NC_000008.11 RefSeq region 1 145138636 . + . ID=NC_000008.11:1..145138636;Dbxref=taxon:9606;Name=8;chromosome=8;gbkey=Src;genome=chromosome;mol_type=genomic DNA +NC_000008.11 BestRefSeq%2CGnomon gene 93916923 93926068 . + . ID=gene-PDP1;Dbxref=GeneID:54704,HGNC:HGNC:9279,MIM:605993;Name=PDP1;description=pyruvate dehydrogenase phosphatase catalytic subunit 1;gbkey=Gene;gene=PDP1;gene_biotype=protein_coding;gene_synonym=PDH,PDP,PDPC,PPM2A,PPM2C +##sequence-region NC_000009.12 1 138394717 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NC_000009.12 RefSeq region 1 138394717 . + . ID=NC_000009.12:1..138394717;Dbxref=taxon:9606;Name=9;chromosome=9;gbkey=Src;genome=chromosome;mol_type=genomic DNA +NC_000009.12 BestRefSeq gene 130713043 130887675 . + . ID=gene-ABL1;Dbxref=GeneID:25,HGNC:HGNC:76,MIM:189980;Name=ABL1;description=ABL proto-oncogene 1%2C non-receptor tyrosine kinase;gbkey=Gene;gene=ABL1;gene_biotype=protein_coding;gene_synonym=ABL,BCR-ABL,bcr/abl,c-ABL,c-ABL1,CHDSKM,JTK7,p150,v-abl +##sequence-region NC_000011.10 1 135086622 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NC_000011.10 RefSeq region 1 135086622 . + . ID=NC_000011.10:1..135086622;Dbxref=taxon:9606;Name=11;chromosome=11;gbkey=Src;genome=chromosome;mol_type=genomic DNA +NC_000011.10 BestRefSeq gene 10751246 10779746 . + . ID=gene-CTR9;Dbxref=GeneID:9646,HGNC:HGNC:16850,MIM:609366;Name=CTR9;description=CTR9 homolog%2C Paf1/RNA polymerase II complex component;gbkey=Gene;gene=CTR9;gene_biotype=protein_coding;gene_synonym=p150,p150TSP,SH2BP1,TSBP +##sequence-region NC_000012.12 1 133275309 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NC_000012.12 RefSeq region 1 133275309 . + . ID=NC_000012.12:1..133275309;Dbxref=taxon:9606;Name=12;chromosome=12;gbkey=Src;genome=chromosome;mol_type=genomic DNA +NC_000012.12 BestRefSeq%2CGnomon gene 133037301 133063299 . + . ID=gene-ZNF84;Dbxref=GeneID:7637,HGNC:HGNC:13159,MIM:618554;Name=ZNF84;description=zinc finger protein 84;gbkey=Gene;gene=ZNF84;gene_biotype=protein_coding;gene_synonym=HPF2 +##sequence-region NC_000015.10 1 101991189 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NC_000015.10 RefSeq region 1 101991189 . + . ID=NC_000015.10:1..101991189;Dbxref=taxon:9606;Name=15;chromosome=15;gbkey=Src;genome=chromosome;mol_type=genomic DNA +NC_000015.10 BestRefSeq gene 89784895 89814852 . - . ID=gene-ANPEP;Dbxref=GeneID:290,HGNC:HGNC:500,MIM:151530;Name=ANPEP;description=alanyl aminopeptidase%2C membrane;gbkey=Gene;gene=ANPEP;gene_biotype=protein_coding;gene_synonym=APN,CD13,GP150,LAP1,P150,PEPN +##sequence-region NC_000019.10 1 58617616 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NC_000019.10 RefSeq region 1 58617616 . + . ID=NC_000019.10:1..58617616;Dbxref=taxon:9606;Name=19;chromosome=19;gbkey=Src;genome=chromosome;mol_type=genomic DNA +NC_000019.10 BestRefSeq%2CGnomon gene 38211006 38229695 . - . ID=gene-DPF1;Dbxref=GeneID:8193,HGNC:HGNC:20225,MIM:601670;Name=DPF1;description=double PHD fingers 1;gbkey=Gene;gene=DPF1;gene_biotype=protein_coding;gene_synonym=BAF45b,NEUD4,neuro-d4 +NC_000019.10 BestRefSeq%2CGnomon gene 4402596 4448322 . + . ID=gene-CHAF1A;Dbxref=GeneID:10036,HGNC:HGNC:1910,MIM:601246;Name=CHAF1A;description=chromatin assembly factor 1 subunit A;gbkey=Gene;gene=CHAF1A;gene_biotype=protein_coding;gene_synonym=CAF-1,CAF1,CAF1B,CAF1P150,P150 +##sequence-region NT_187390.1 1 42811 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NT_187390.1 RefSeq region 1 42811 . + . ID=NT_187390.1:1..42811;Dbxref=taxon:9606;Name=22;chromosome=22;gbkey=Src;genome=genomic;map=unlocalized;mol_type=genomic DNA +##sequence-region NT_187391.1 1 181920 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NT_187391.1 RefSeq region 1 181920 . + . ID=NT_187391.1:1..181920;Dbxref=taxon:9606;Name=22;chromosome=22;gbkey=Src;genome=genomic;map=unlocalized;mol_type=genomic DNA +##sequence-region NT_187392.1 1 103838 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NT_187392.1 RefSeq region 1 103838 . + . ID=NT_187392.1:1..103838;Dbxref=taxon:9606;Name=22;chromosome=22;gbkey=Src;genome=genomic;map=unlocalized;mol_type=genomic DNA +##sequence-region NT_187393.1 1 99375 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NT_187393.1 RefSeq region 1 99375 . + . ID=NT_187393.1:1..99375;Dbxref=taxon:9606;Name=22;chromosome=22;gbkey=Src;genome=genomic;map=unlocalized;mol_type=genomic DNA +##sequence-region NT_187394.1 1 73985 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NT_187394.1 RefSeq region 1 73985 . + . ID=NT_187394.1:1..73985;Dbxref=taxon:9606;Name=22;chromosome=22;gbkey=Src;genome=genomic;map=unlocalized;mol_type=genomic DNA +##sequence-region NC_000023.11 1 156040895 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NC_000023.11 RefSeq region 1 156040895 . + . ID=NC_000023.11:1..156040895;Dbxref=taxon:9606;Name=X;chromosome=X;gbkey=Src;genome=chromosome;mol_type=genomic DNA +NC_000023.11 BestRefSeq gene 1386152 1392113 . - . ID=gene-SLC25A6;Dbxref=GeneID:293,HGNC:HGNC:10992,MIM:403000;Name=SLC25A6;description=solute carrier family 25 member 6;gbkey=Gene;gene=SLC25A6;gene_biotype=protein_coding;gene_synonym=AAC3,ANT,ANT 2,ANT 3,ANT3,ANT3Y +NC_000023.11 BestRefSeq gene 155612586 155782459 . + . ID=gene-SPRY3;Dbxref=GeneID:10251,HGNC:HGNC:11271,MIM:300531;Name=SPRY3;description=sprouty RTK signaling antagonist 3;gbkey=Gene;gene=SPRY3;gene_biotype=protein_coding;gene_synonym=spry-3 +##sequence-region NC_000024.10 1 57227415 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NC_000024.10 RefSeq region 1 57227415 . + . ID=NC_000024.10:1..57227415;Dbxref=taxon:9606;Name=Y;chromosome=Y;gbkey=Src;genome=chromosome;mol_type=genomic DNA +NC_000024.10 BestRefSeq gene 1386152 1392113 . - . ID=gene-SLC25A6-2;Dbxref=GeneID:293,HGNC:HGNC:10992,MIM:403000;Name=SLC25A6;description=solute carrier family 25 member 6;gbkey=Gene;gene=SLC25A6;gene_biotype=protein_coding;gene_synonym=AAC3,ANT,ANT 2,ANT 3,ANT3,ANT3Y +NC_000024.10 BestRefSeq gene 56923423 56968979 . + . ID=gene-SPRY3-2;Dbxref=GeneID:10251,HGNC:HGNC:11271,MIM:300531;Name=SPRY3;description=sprouty RTK signaling antagonist 3;gbkey=Gene;gene=SPRY3;gene_biotype=protein_coding;gene_synonym=spry-3;partial=true;start_range=.,56923423 +##sequence-region NT_167246.2 1 4677643 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 +NT_167246.2 RefSeq region 1 4677643 . + . ID=NT_167246.2:1..4677643;Dbxref=taxon:9606;Name=6;chromosome=6;gbkey=Src;genome=genomic;map=6p22.1-21.32;mol_type=genomic DNA +NT_167246.2 Curated Genomic pseudogene 3940270 3941874 . + . ID=gene-PRKRAP1;Dbxref=GeneID:731716,HGNC:HGNC:33447;Name=PRKRAP1;description=protein activator of interferon induced protein kinase EIF2AK2 pseudogene 1;gbkey=Gene;gene=PRKRAP1;gene_biotype=pseudogene;pseudo=true +NT_167249.2 Curated Genomic pseudogene 3930481 3932085 . + . ID=gene-PRKRAP1-2;Dbxref=GeneID:731716,HGNC:HGNC:33447;Name=PRKRAP1;description=protein activator of interferon induced protein kinase EIF2AK2 pseudogene 1;gbkey=Gene;gene=PRKRAP1;gene_biotype=pseudogene;pseudo=trueNC_000023.11 BestRefSeq gene 1386152 1392113 . - . ID=gene-SLC25A6;Dbxref=GeneID:293,HGNC:HGNC:10992,MIM:403000;Name=SLC25A6;description=solute carrier family 25 member 6;gbkey=Gene;gene=SLC25A6;gene_biotype=protein_coding;gene_synonym=AAC3,ANT,ANT 2,ANT 3,ANT3,ANT3Y diff --git a/tests/unit/data/etl_data/ncbi_history_20210813.tsv b/tests/unit/data/etl_data/ncbi_history_20210813.tsv new file mode 100644 index 00000000..a6d01a5d --- /dev/null +++ b/tests/unit/data/etl_data/ncbi_history_20210813.tsv @@ -0,0 +1,15 @@ +#tax_id GeneID Discontinued_GeneID Discontinued_Symbol Discontinue_Date +132 43 99999 FAKE_ACEE 20110915 +9606 10251 170187 LOC170187 20050510 +9606 10251 253479 LOC253479 20050510 +9606 25 112779 LOC112779 20050510 +9606 25 116063 LOC116063 20050510 +9606 25782 26114 DKFZP434D245 20050510 +9606 293 8283 ANT3Y 20050510 +9606 43 100187742 ACEE 20110915 +9606 54704 5497 PPM2C 20050510 +9606 54704 157663 LOC157663 20050507 +9606 731716 100289695 LOC100289695 20091015 +9606 7637 100287429 LOC100287429 20110803 +9606 - 103344718 HOTS 20200620 +9606 - 544580 AASTH23 20190503 diff --git a/tests/unit/data/etl_data/ncbi_info_20210813.tsv b/tests/unit/data/etl_data/ncbi_info_20210813.tsv new file mode 100644 index 00000000..dcd32c97 --- /dev/null +++ b/tests/unit/data/etl_data/ncbi_info_20210813.tsv @@ -0,0 +1,22 @@ +9606 25 ABL1 - ABL|BCR-ABL|CHDSKM|JTK7|bcr/abl|c-ABL|c-ABL1|p150|v-abl MIM:189980|HGNC:HGNC:76|Ensembl:ENSG00000097007 9 9q34.12 ABL proto-oncogene 1, non-receptor tyrosine kinase protein-coding ABL1 ABL proto-oncogene 1, non-receptor tyrosine kinase O tyrosine-protein kinase ABL1|ABL protooncogene 1 nonreceptor tyrosine kinase|Abelson tyrosine-protein kinase 1|bcr/c-abl oncogene protein|c-abl oncogene 1, receptor tyrosine kinase|proto-oncogene c-Abl|proto-oncogene tyrosine-protein kinase ABL1|truncated ABL protooncogene 1 nonreceptor tyrosine kinase|v-abl Abelson murine leukemia viral oncogene homolog 1 20210809 - +9606 43 ACHE - ACEE|ARACHE|N-ACHE|YT MIM:100740|HGNC:HGNC:108|Ensembl:ENSG00000087085 7 7q22.1 acetylcholinesterase (Cartwright blood group) protein-coding ACHE acetylcholinesterase (Cartwright blood group) O acetylcholinesterase|Yt blood group|acetylcholinesterase (Yt blood group)|apoptosis-related acetylcholinesterase 20210719 - +9606 106 ADCP1 - - HGNC:HGNC:229 6 - adenosine deaminase complexing protein 1 unknown ADCP1 adenosine deaminase complexing protein 1 O - 20190324 - +9606 170 AFA - - MIM:106250 - - ankyloblepharon filiforme adnatum unknown - - - - 20191002 - +9606 290 ANPEP - APN|CD13|GP150|LAP1|P150|PEPN MIM:151530|HGNC:HGNC:500|Ensembl:ENSG00000166825 15 15q26.1 alanyl aminopeptidase, membrane protein-coding ANPEP alanyl aminopeptidase, membrane O aminopeptidase N|AP-M|AP-N|alanyl (membrane) aminopeptidase|aminopeptidase M|hAPN|membrane alanyl aminopeptidase|microsomal aminopeptidase|myeloid plasma membrane glycoprotein CD13 20210708 - +9606 673 BRAF - B-RAF1|B-raf|BRAF1|NS7|RAFB1 MIM:164757|HGNC:HGNC:1097|Ensembl:ENSG00000157764 7 7q34 B-Raf proto-oncogene, serine/threonine kinase protein-coding BRAF B-Raf proto-oncogene, serine/threonine kinase O serine/threonine-protein kinase B-raf|94 kDa B-raf protein|B-Raf proto-oncogene serine/threonine-protein kinase (p94)|B-Raf serine/threonine-protein|murine sarcoma viral (v-raf) oncogene homolog B1|proto-oncogene B-Raf|v-raf murine sarcoma viral oncogene homolog B|v-raf murine sarcoma viral oncogene homolog B1 20210809 - +9606 10036 CHAF1A - CAF-1|CAF1|CAF1B|CAF1P150|P150 MIM:601246|HGNC:HGNC:1910|Ensembl:ENSG00000167670 19 19p13.3 chromatin assembly factor 1 subunit A protein-coding CHAF1A chromatin assembly factor 1 subunit A O chromatin assembly factor 1 subunit A|CAF-1 subunit A|CAF-I 150 kDa subunit|CAF-I p150|CTB-50L17.7|chromatin assembly factor I (150 kDa)|chromatin assembly factor I p150 subunit|hp150 20210808 - +9606 9646 CTR9 - SH2BP1|TSBP|p150|p150TSP MIM:609366|HGNC:HGNC:16850|Ensembl:ENSG00000198730 11 11p15.4 CTR9 homolog, Paf1/RNA polymerase II complex component protein-coding CTR9 CTR9 homolog, Paf1/RNA polymerase II complex component O RNA polymerase-associated protein CTR9 homolog|Ctr9, Paf1/RNA polymerase II complex component, homolog|SH2 domain binding protein 1 (tetratricopeptide repeat containing)|TPR-containing, SH2-binding phosphoprotein 20210708 - +9606 8193 DPF1 - BAF45b|NEUD4|SMARCG1|neuro-d4 MIM:601670|HGNC:HGNC:20225|Ensembl:ENSG00000011332 19 19q13.2 double PHD fingers 1 protein-coding DPF1 double PHD fingers 1 O zinc finger protein neuro-d4|BRG1-associated factor 45B|D4, zinc and double PHD fingers family 1|neuro-d4 homolog 20210726 - +9606 2722 GLC1B - - MIM:606689 2 2cen-q13 glaucoma 1, open angle, B (adult-onset) unknown - - - - 20191002 - +9606 50829 HDPA - - MIM:300221 X Xpter-p22.32 Hodgkin disease, susceptibility, pseudoautosomal unknown - - - - 20190816 - +9606 106783576 LOC106783576 - - - 10 10p nonconserved acetylation island sequence 68 enhancer biological-region - - - non-conserved AI 68 20210518 regulatory:enhancer +9606 619511 MHB - - MIM:255160 3 3p22.2-p21.32 myopathy, hyaline body, autosomal recessive unknown - - - - 20170408 - +9606 54704 PDP1 - PDH|PDP|PDPC|PPM2A|PPM2C MIM:605993|HGNC:HGNC:9279|Ensembl:ENSG00000164951 8 8q22.1 pyruvate dehydrogenase phosphatase catalytic subunit 1 protein-coding PDP1 pyruvate dehydrogenase phosphatase catalytic subunit 1 O pyruvate dehyrogenase phosphatase catalytic subunit 1|PDP 1|PDPC 1|[Pyruvate dehydrogenase [acetyl-transferring]]-phosphatase 1, mitochondrial|protein phosphatase 2C, magnesium-dependent, catalytic subunit|protein phosphatase, Mg2+/Mn2+ dependent 2A|pyruvate dehydrogenase (Lipoamide) phosphatase-phosphatase 20210708 - +9606 30849 PIK3R4 - VPS15|p150 MIM:602610|HGNC:HGNC:8982|Ensembl:ENSG00000196455 3 3q22.1 phosphoinositide-3-kinase regulatory subunit 4 protein-coding PIK3R4 phosphoinositide-3-kinase regulatory subunit 4 O phosphoinositide 3-kinase regulatory subunit 4|PI3-kinase p150 subunit|PI3-kinase regulatory subunit 4|phosphatidylinositol 3-kinase-associated p150|phosphoinositide 3-kinase adaptor protein|phosphoinositide-3-kinase, regulatory subunit 4, p150 20210808 - +9606 731716 PRKRAP1 - - HGNC:HGNC:33447 6 6p21.3 alternate reference locus protein activator of interferon induced protein kinase EIF2AK2 pseudogene 1 pseudo PRKRAP1 protein activator of interferon induced protein kinase EIF2AK2 pseudogene 1 O protein kinase, interferon-inducible double stranded RNA dependent activator pseudogene 1 20210611 - +9606 25782 RAB3GAP2 - MARTS1|RAB3-GAP150|RAB3GAP150|SPG69|WARBM2|p150 MIM:609275|HGNC:HGNC:17168|Ensembl:ENSG00000118873 1 1q41 RAB3 GTPase activating non-catalytic protein subunit 2 protein-coding RAB3GAP2 RAB3 GTPase activating non-catalytic protein subunit 2 O rab3 GTPase-activating protein non-catalytic subunit|RAB3 GTPase activating protein subunit 2 (non-catalytic)|RGAP-iso|rab3 GTPase-activating protein 150 kDa subunit|rab3-GAP p150|rab3-GAP regulatory subunit 20210709 - +9606 293 SLC25A6 - AAC3|ANT|ANT 2|ANT 3|ANT3|ANT3Y MIM:300151|MIM:403000|HGNC:HGNC:10992|Ensembl:ENSG00000169100 X|Y X;Y solute carrier family 25 member 6 protein-coding SLC25A6 solute carrier family 25 member 6 O ADP/ATP translocase 3|ADP,ATP carrier protein 3|ADP,ATP carrier protein, liver|ADP/ATP translocator of liver|adenine nucleotide translocator 3|epididymis secretory sperm binding protein|solute carrier family 25 (mitochondrial carrier; adenine nucleotide translocator), member 6 20210708 - +9606 100049159 SPG37 - - MIM:611945 8 8p21.2-q13.3 spastic paraplegia 37 (autosomal dominant) unknown - - - - 20191002 - +9606 10251 SPRY3 - spry-3 MIM:300531|HGNC:HGNC:11271|Ensembl:ENSG00000168939 X|Y Xq28 and Yq12 sprouty RTK signaling antagonist 3 protein-coding SPRY3 sprouty RTK signaling antagonist 3 O protein sprouty homolog 3|antagonist of FGF signaling|sprouty homolog 3|sprouty3 20210807 - +9606 7637 ZNF84 - HPF2 MIM:618554|HGNC:HGNC:13159|Ensembl:ENSG00000198040 12 12q24.33|map from Rosati ref via FISH [AFS] zinc finger protein 84 protein-coding ZNF84 zinc finger protein 84 O zinc finger protein 84|zinc finger protein HPF2 20210611 - +9606 619538 OMS - COME/ROM MIM:166760 10|19|3 10q26.3;19q13.42-q13.43;3p25.3 otitis media, susceptibility to unknown - - - chronic/recurrent otitis media 20170408 - \ No newline at end of file diff --git a/tests/unit/data/hgnc_genes.json b/tests/unit/data/hgnc_genes.json deleted file mode 100644 index 3701fd7b..00000000 --- a/tests/unit/data/hgnc_genes.json +++ /dev/null @@ -1,1359 +0,0 @@ -[ - { - "label_and_type": "hgnc:37133##identity", - "label": "A1BG antisense RNA 1", - "concept_id": "hgnc:37133", - "symbol": "A1BG-AS1", - "locations": [ - { - "_id": "ga4gh:VCL.3Zdz1Stgx8HdWf1cT1KaUHFUQjoKTTcD", - "chr": "19", - "interval": { - "end": "q13.43", - "start": "q13.43", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "previous_symbols": [ - "NCRNA00181", - "A1BGAS", - "A1BG-AS" - ], - "aliases": [ - "FLJ23569" - ], - "symbol_status": "approved", - "xrefs": [ - "ensembl:ENSG00000268895", - "ncbigene:503538" - ], - "associated_with": [ - "vega:OTTHUMG00000183508", - "ucsc:uc002qse.3", - "refseq:NR_015380", - "ena.embl:BC040926", - "refseq:NR_015380", - "ena.embl:BC040926" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "a1bg-as1##symbol", - "concept_id": "hgnc:37133", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "ncrna00181##prev_symbol", - "concept_id": "hgnc:37133", - "src_name": "HGNC", - "item_type": "prev_symbol" - }, - { - "label_and_type": "a1bgas##prev_symbol", - "concept_id": "hgnc:37133", - "src_name": "HGNC", - "item_type": "prev_symbol" - }, - { - "label_and_type": "a1bg-as##prev_symbol", - "concept_id": "hgnc:37133", - "src_name": "HGNC", - "item_type": "prev_symbol" - }, - { - "label_and_type": "flj23569##alias", - "concept_id": "hgnc:37133", - "src_name": "HGNC", - "item_type": "alias" - }, - { - "label_and_type": "hgnc:11998##identity", - "label": "tumor protein p53", - "concept_id": "hgnc:11998", - "symbol": "TP53", - "locations": [ - { - "_id": "ga4gh:VCL._Cl_XG2bfBUVG6uwi-jHtCHavOAyfPXN", - "chr": "17", - "interval": { - "end": "p13.1", - "start": "p13.1", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "aliases": [ - "p53", - "LFS1" - ], - "symbol_status": "approved", - "xrefs": [ - "ensembl:ENSG00000141510", - "ncbigene:7157" - ], - "associated_with": [ - "vega:OTTHUMG00000162125", - "refseq:NM_000546", - "cosmic:TP53", - "omim:191170", - "ucsc:uc060aur.1", - "uniprot:P04637", - "orphanet:120204", - "ccds:CCDS73968", - "ccds:CCDS73971", - "ccds:CCDS73970", - "ccds:CCDS73969", - "ccds:CCDS73967", - "ccds:CCDS73966", - "ccds:CCDS73965", - "ccds:CCDS73964", - "ccds:CCDS73963", - "ccds:CCDS11118", - "ccds:CCDS45605", - "ccds:CCDS45606", - "ena.embl:AF307851", - "pubmed:6396087", - "pubmed:3456488", - "pubmed:2047879" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "tp53##symbol", - "concept_id": "hgnc:11998", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "lfs1##alias", - "concept_id": "hgnc:11998", - "src_name": "HGNC", - "item_type": "alias" - }, - { - "label_and_type": "p53##alias", - "concept_id": "hgnc:11998", - "src_name": "HGNC", - "item_type": "alias" - }, - { - "label_and_type": "hgnc:30005##identity", - "label": "alpha 1,3-galactosyltransferase 2", - "concept_id": "hgnc:30005", - "symbol": "A3GALT2", - "locations": [ - { - "_id": "ga4gh:VCL.Rs8bogwClWoTYjhY9vI9J3wnPEXlao-U", - "chr": "1", - "interval": { - "end": "p35.1", - "start": "p35.1", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "previous_symbols": [ - "A3GALT2P" - ], - "aliases": [ - "IGBS3S", - "IGB3S" - ], - "symbol_status": "approved", - "xrefs": [ - "ensembl:ENSG00000184389", - "ncbigene:127550" - ], - "associated_with": [ - "vega:OTTHUMG00000004125", - "ucsc:uc031plq.1", - "uniprot:U3KPV4", - "ccds:CCDS60080", - "pubmed:10854427", - "pubmed:18630988", - "refseq:NM_001080438" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "a3galt2##symbol", - "concept_id": "hgnc:30005", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "a3galt2p##prev_symbol", - "concept_id": "hgnc:30005", - "src_name": "HGNC", - "item_type": "prev_symbol" - }, - { - "label_and_type": "igbs3s##alias", - "concept_id": "hgnc:30005", - "src_name": "HGNC", - "item_type": "alias" - }, - { - "label_and_type": "igb3s##alias", - "concept_id": "hgnc:30005", - "src_name": "HGNC", - "item_type": "alias" - }, - { - "label_and_type": "hgnc:23170##identity", - "label": "WD repeat and HMG-box DNA binding protein 1", - "concept_id": "hgnc:23170", - "symbol": "WDHD1", - "locations": [ - { - "_id": "ga4gh:VCL.R_izmPbRVtPQ2HwflIVh1XLXvRtVi-a7", - "chr": "14", - "interval": { - "end": "q22.2", - "start": "q22.3", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "aliases": [ - "AND-1", - "CTF4", - "CHTF4" - ], - "symbol_status": "approved", - "xrefs": [ - "ensembl:ENSG00000198554", - "ncbigene:11169" - ], - "associated_with": [ - "vega:OTTHUMG00000140304", - "refseq:NM_007086", - "omim:608126", - "ucsc:uc001xbm.3", - "uniprot:O75717", - "ccds:CCDS41955", - "ccds:CCDS9721", - "ena.embl:AJ006266", - "pubmed:9175701", - "pubmed:20028748" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "wdhd1##symbol", - "concept_id": "hgnc:23170", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "hgnc:4059##identity", - "label": "glucose-6-phosphatase regulator", - "concept_id": "hgnc:4059", - "symbol": "G6PR", - "location_annotations": [ - "reserved" - ], - "aliases": [ - "GSD1aSP" - ], - "symbol_status": "approved", - "xrefs": [ - "ncbigene:2541" - ], - "associated_with": [ - "pubmed:2172641", - "pubmed:7814621", - "pubmed:2996501" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "g6pr##symbol", - "concept_id": "hgnc:4059", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "hgnc:37528##identity", - "label": "piwi-interacting RNA cluster 24", - "concept_id": "hgnc:37528", - "symbol": "PIRC24", - "location_annotations": [ - "6" - ], - "symbol_status": "approved", - "xrefs": [ - "ncbigene:100313810" - ], - "associated_with": [ - "pubmed:17881367" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "pirc24##symbol", - "concept_id": "hgnc:37528", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "hgnc:4101##identity", - "label": "G antigen 4", - "concept_id": "hgnc:4101", - "symbol": "GAGE4", - "location_annotations": [ - "not on reference assembly" - ], - "locations": [ - { - "_id": "ga4gh:VCL.AlwtARlUTZiNX3NEEKab-X5eeayXd8v8", - "chr": "X", - "interval": { - "end": "p11.2", - "start": "p11.4", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "symbol_status": "approved", - "aliases": [ - "CT4.4" - ], - "xrefs": [ - "ncbigene:2576" - ], - "associated_with": [ - "refseq:NM_001474", - "omim:300597", - "uniprot:P0DSO3", - "ena.embl:U19145", - "pubmed:7544395" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "gage4##symbol", - "concept_id": "hgnc:4101", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "hgnc:31102##identity", - "label": "MAFF interacting protein (pseudogene)", - "concept_id": "hgnc:31102", - "symbol": "MAFIP", - "location_annotations": [ - "14", - "unplaced" - ], - "symbol_status": "approved", - "aliases": [ - "FLJ35473", - "FLJ00219", - "FLJ39633", - "MIP", - "pp5644", - "TEKT4P4" - ], - "xrefs": [ - "ensembl:ENSG00000274847", - "ncbigene:727764" - ], - "associated_with": [ - "vega:OTTHUMG00000188065", - "refseq:NR_046439", - "uniprot:Q8WZ33", - "ena.embl:AK074146", - "ena.embl:AF289559", - "pubmed:16549056", - "pubmed:15881666" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "mafip##symbol", - "concept_id": "hgnc:31102", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "hgnc:7409##identity", - "label": "mitochondrially encoded 7S DNA", - "concept_id": "hgnc:7409", - "symbol": "MT-7SDNA", - "location_annotations": [ - "MT" - ], - "previous_symbols": [ - "MT7SDNA" - ], - "symbol_status": "approved", - "associated_with": [ - "pubmed:24709344", - "pubmed:273237" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "mt-7sdna##symbol", - "concept_id": "hgnc:7409", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "hgnc:1838##identity", - "label": "cat eye syndrome chromosome region", - "concept_id": "hgnc:1838", - "symbol": "CECR", - "locations": [ - { - "_id": "ga4gh:VCL.-hT6Cp6B32GmZTD8BXh1xf6SJeLM1uN7", - "chr": "22", - "interval": { - "end": "q11", - "start": "pter", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "symbol_status": "approved", - "xrefs": [ - "ncbigene:1055" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "cecr##symbol", - "concept_id": "hgnc:1838", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "hgnc:2435##identity", - "label": "colony stimulating factor 2 receptor subunit alpha", - "concept_id": "hgnc:2435", - "symbol": "CSF2RA", - "locations": [ - { - "_id": "ga4gh:VCL.B5KOWxL8BQRpM2MOHP-RUmGlmm4ZtMAC", - "chr": "X", - "interval": { - "end": "p22.32", - "start": "p22.32", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - }, - { - "_id": "ga4gh:VCL.QzbpRniVtZz8V-7B7vKhGeX3A3huKacK", - "chr": "Y", - "interval": { - "end": "p11.3", - "start": "p11.3", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "symbol_status": "approved", - "aliases": [ - "CD116", - "alphaGMR" - ], - "previous_symbols": [ - "CSF2R" - ], - "xrefs": [ - "ensembl:ENSG00000198223", - "ncbigene:1438" - ], - "associated_with": [ - "vega:OTTHUMG00000012533", - "refseq:NM_001161529", - "orphanet:209477", - "iuphar:1707", - "hcdmdb:CD116", - "omim:306250", - "omim:425000", - "ucsc:uc010nvv.3", - "uniprot:P15509", - "ena.embl:M64445", - "ccds:CCDS35190", - "ccds:CCDS55360", - "ccds:CCDS35191", - "ccds:CCDS55361", - "ccds:CCDS55359", - "ccds:CCDS35192", - "ccds:CCDS35193", - "pubmed:1702217" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "csf2ra##symbol", - "concept_id": "hgnc:2435", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "hgnc:36026##identity", - "label": "ribosomal protein S24 pseudogene 5", - "concept_id": "hgnc:36026", - "symbol": "RPS24P5", - "locations": [ - { - "_id": "ga4gh:VCL.dLLTOKtFTnVd3ope5gTii1Gbdj7FxSfa", - "chr": "1", - "interval": { - "end": "q41", - "start": "p36.13", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "symbol_status": "approved", - "xrefs": [ - "ncbigene:100271094" - ], - "associated_with": [ - "refseq:NG_011274", - "pubmed:19123937" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "rps24p5##symbol", - "concept_id": "hgnc:36026", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "hgnc:34692##identity", - "label": "tRNA-Leu (anticodon CAG) 2-1", - "concept_id": "hgnc:34692", - "symbol": "TRL-CAG2-1", - "locations": [ - { - "_id": "ga4gh:VCL.r_iXu-FjXuJjmeNhmDEputf6tgjXRQIr", - "chr": "16", - "interval": { - "end": "q13", - "start": "q21", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "symbol_status": "approved", - "aliases": [ - "tRNA-Leu-CAG-2-1" - ], - "previous_symbols": [ - "TRNAL13" - ], - "xrefs": [ - "ncbigene:100189130" - ], - "associated_with": [ - "ena.embl:HG983896" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "trl-cag2-1##symbol", - "concept_id": "hgnc:34692", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "hgnc:7603##identity", - "label": "myosin VB", - "concept_id": "hgnc:7603", - "symbol": "MYO5B", - "locations": [ - { - "_id": "ga4gh:VCL.1vd9qlPiSSaDZC5X4jIKpapokxvKrITd", - "chr": "18", - "interval": { - "end": "qter", - "start": "cen", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "symbol_status": "approved", - "aliases": [ - "KIAA1119" - ], - "xrefs": [ - "ncbigene:4645", - "ensembl:ENSG00000167306" - ], - "associated_with": [ - "vega:OTTHUMG00000179843", - "refseq:NM_001080467", - "omim:606540", - "ucsc:uc002leb.3", - "uniprot:Q9ULV0", - "orphanet:171089", - "ccds:CCDS42436", - "ena.embl:AB032945", - "pubmed:8884266", - "pubmed:17462998" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "refseq:nm_001080467##associated_with", - "concept_id": "hgnc:7603", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "myo5b##symbol", - "concept_id": "hgnc:7603", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "hgnc:4641##identity", - "label": "glutathione S-transferase theta 1", - "concept_id": "hgnc:4641", - "symbol": "GSTT1", - "location_annotations": [ - "alternate reference locus" - ], - "locations": [ - { - "_id": "ga4gh:VCL.EfA-UFrmtjncDxutoiP6PWxu32UtH1Zu", - "chr": "22", - "interval": { - "end": "q11.23", - "start": "q11.23", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "symbol_status": "approved", - "aliases": [ - "2.5.1.18" - ], - "xrefs": [ - "ncbigene:2952", - "ensembl:ENSG00000277656" - ], - "associated_with": [ - "refseq:NM_000853", - "omim:600436", - "ucsc:uc002zze.4", - "uniprot:P30711", - "orphanet:470418", - "ena.embl:KI270879", - "pubmed:8617495" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "omim:600436##associated_with", - "concept_id": "hgnc:4641", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "gstt1##symbol", - "concept_id": "hgnc:4641", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "hgnc:108##identity", - "label": "acetylcholinesterase (Cartwright blood group)", - "concept_id": "hgnc:108", - "symbol": "ACHE", - "locations": [ - { - "_id": "ga4gh:VCL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", - "type": "ChromosomeLocation", - "species_id": "taxonomy:9606", - "chr": "7", - "interval": { - "end": "q22.1", - "start": "q22.1", - "type": "CytobandInterval" - } - } - ], - "symbol_status": "approved", - "aliases": [ - "3.1.1.7" - ], - "previous_symbols": [ - "YT" - ], - "xrefs": [ - "ncbigene:43", - "ensembl:ENSG00000087085" - ], - "associated_with": [ - "vega:OTTHUMG00000157033", - "ucsc:uc003uxi.4", - "ccds:CCDS5710", - "ccds:CCDS64736", - "ccds:CCDS5709", - "uniprot:P22303", - "pubmed:1380483", - "omim:100740", - "merops:S09.979", - "iuphar:2465", - "refseq:NM_015831" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "ache##symbol", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "acetylcholinesterase (cartwright blood group)##label", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "label" - }, - { - "label_and_type": "ncbigene:43##xref", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000087085##xref", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "xref" - }, - { - "label_and_type": "vega:otthumg00000157033##associated_with", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "ucsc:uc003uxi.4##associated_with", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "ccds:ccds5710##associated_with", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "ccds:ccds64736##associated_with", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "ccds:ccds5709##associated_with", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "uniprot:p22303##associated_with", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "pubmed:1380483##associated_with", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "omim:100740##associated_with", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "merops:s09.979##associated_with", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "iuphar:2465##associated_with", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "refseq:nm_015831##associated_with", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "3.1.1.7##alias", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "alias" - }, - { - "label_and_type": "yt##prev_symbol", - "concept_id": "hgnc:108", - "src_name": "HGNC", - "item_type": "prev_symbol" - }, - { - "label_and_type": "hgnc:1097##identity", - "concept_id": "hgnc:1097", - "symbol": "BRAF", - "symbol_status": "approved", - "label": "B-Raf proto-oncogene, serine/threonine kinase", - "locations": [ - { - "_id": "ga4gh:VCL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw", - "type": "ChromosomeLocation", - "species_id": "taxonomy:9606", - "chr": "7", - "interval": { - "end": "q34", - "start": "q34", - "type": "CytobandInterval" - } - } - ], - "aliases": [ - "BRAF1" - ], - "xrefs": [ - "ensembl:ENSG00000157764", - "ncbigene:673" - ], - "associated_with": [ - "vega:OTTHUMG00000157457", - "ucsc:uc003vwc.5", - "ccds:CCDS5863", - "ccds:CCDS87555", - "uniprot:P15056", - "pubmed:2284096", - "pubmed:1565476", - "cosmic:BRAF", - "omim:164757", - "orphanet:119066", - "iuphar:1943", - "ena.embl:M95712", - "refseq:NM_004333" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "braf##symbol", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "b-raf proto-oncogene, serine/threonine kinase##label", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "label" - }, - { - "label_and_type": "ncbigene:673##xref", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000157764##xref", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "xref" - }, - { - "label_and_type": "vega:otthumg00000157457##associated_with", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "ucsc:uc003vwc.5##associated_with", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "ccds:ccds5863##associated_with", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "ccds:ccds87555##associated_with", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "uniprot:p15056##associated_with", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "pubmed:2284096##associated_with", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "pubmed:1565476##associated_with", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "cosmic:braf##associated_with", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "omim:164757##associated_with", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "orphanet:119066##associated_with", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "iuphar:1943##associated_with", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "ena.embl:m95712##associated_with", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "refseq:nm_004333##associated_with", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "braf1##alias", - "concept_id": "hgnc:1097", - "src_name": "HGNC", - "item_type": "alias" - }, - { - "label_and_type": "hgnc:76##identity", - "concept_id": "hgnc:76", - "symbol": "ABL1", - "symbol_status": "approved", - "label": "ABL proto-oncogene 1, non-receptor tyrosine kinase", - "locations": [ - { - "_id": "ga4gh:VCL.WvMfE67KxSDAV8JaK593TI74yyJWIsMQ", - "type": "ChromosomeLocation", - "species_id": "taxonomy:9606", - "chr": "9", - "interval": { - "end": "q34.12", - "start": "q34.12", - "type": "CytobandInterval" - } - } - ], - "aliases": [ - "c-ABL", - "JTK7", - "p150" - ], - "previous_symbols": [ - "ABL" - ], - "xrefs": [ - "ensembl:ENSG00000097007", - "ncbigene:25" - ], - "associated_with": [ - "vega:OTTHUMG00000020813", - "ucsc:uc004bzv.4", - "ccds:CCDS35166", - "ccds:CCDS35165", - "uniprot:P00519", - "pubmed:1857987", - "pubmed:12626632", - "cosmic:ABL1", - "omim:189980", - "orphanet:117691", - "iuphar:1923", - "ena.embl:M14752", - "refseq:NM_007313" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "abl1##symbol", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "abl proto-oncogene 1, non-receptor tyrosine kinase##label", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "xref" - }, - { - "label_and_type": "ncbigene:25##xref", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000097007##xref", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "xref" - }, - { - "label_and_type": "vega:otthumg00000020813##associated_with", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "ucsc:uc004bzv.4##associated_with", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "ccds:ccds35166##associated_with", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "ccds:ccds35165##associated_with", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "uniprot:p00519##associated_with", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "pubmed:1857987##associated_with", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "pubmed:12626632##associated_with", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "cosmic:abl1##associated_with", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "omim:189980##associated_with", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "orphanet:117691##associated_with", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "iuphar:1923##associated_with", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "ena.embl:m14752##associated_with", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "refseq:nm_007313##associated_with", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "associated_with" - }, - { - "label_and_type": "c-abl1##alias", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "alias" - }, - { - "label_and_type": "jtk7##alias", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "alias" - }, - { - "label_and_type": "p150##alias", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "alias" - }, - { - "label_and_type": "abl##prev_symbol", - "concept_id": "hgnc:76", - "src_name": "HGNC", - "item_type": "prev_symbol" - }, - { - "label_and_type": "hgnc:500##identity", - "concept_id": "hgnc:500", - "symbol": "ANPEP", - "label": "alanyl aminopeptidase, membrane", - "aliases": [ - "gp150", - "p150", - "LAP1", - "3.4.11.2" - ], - "previous_symbols": [ - "PEPN", - "CD13" - ], - "xrefs": [ - "ncbigene:290", - "ensembl:ENSG00000166825" - ], - "associated_with": [ - "vega:OTTHUMG00000149814", - "ucsc:uc002bop.5", - "ccds:CCDS10356", - "uniprot:P15144", - "pubmed:2428842", - "pubmed:1977688", - "omim:151530", - "merops:M01.001", - "iuphar:1560", - "hcdmdb:CD13", - "ena.embl:M22324", - "refseq:NM_001150" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "p150##alias", - "concept_id": "hgnc:500", - "src_name": "HGNC", - "item_type": "alias" - }, - { - "label_and_type": "ncbigene:290##xref", - "concept_id": "hgnc:500", - "src_name": "HGNC", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000166825##xref", - "concept_id": "hgnc:500", - "src_name": "HGNC", - "item_type": "xref" - }, - { - "label_and_type": "hgnc:1910##identity", - "concept_id": "hgnc:1910", - "symbol": "CHAF1A", - "symbol_status": "approved", - "label": "chromatin assembly factor 1 subunit A", - "aliases": [ - "CAF-1", - "CAF1P150", - "CAF1", - "CAF1B", - "P150", - "MGC71229" - ], - "xrefs": [ - "ncbigene:10036", - "ensembl:ENSG00000167670" - ], - "locations": [ - { - "_id": "ga4gh:VCL.yF2TzeunqY92v3yhDsCR_t5X997mWriF", - "type": "ChromosomeLocation", - "species_id": "taxonomy:9606", - "chr": "19", - "interval": { - "end": "p13.3", - "start": "p13.3", - "type": "CytobandInterval" - } - } - ], - "associated_with": [ - "vega:OTTHUMG00000181922", - "ucsc:uc002mal.4", - "ccds:CCDS32875", - "uniprot:Q13111", - "pubmed:7600578", - "omim:601246", - "ena.embl:U20979", - "refseq:NM_005483" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "p150##alias", - "concept_id": "hgnc:1910", - "src_name": "HGNC", - "item_type": "alias" - }, - { - "label_and_type": "ncbigene:10036##xref", - "concept_id": "hgnc:1910", - "src_name": "HGNC", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000167670##xref", - "concept_id": "hgnc:1910", - "src_name": "HGNC", - "item_type": "xref" - }, - { - "label_and_type": "hgnc:8982##identity", - "concept_id": "hgnc:8982", - "symbol": "PIK3R4", - "label": "phosphoinositide-3-kinase regulatory subunit 4", - "aliases": [ - "p150", - "VPS15" - ], - "xrefs": [ - "ncbigene:30849", - "ensembl:ENSG00000196455" - ], - "associated_with": [ - "vega:OTTHUMG00000159645", - "ucsc:uc003enj.4", - "ccds:CCDS3067", - "uniprot:Q99570", - "pubmed:8999962", - "omim:602610", - "iuphar:2157", - "ena.embl:Y08991", - "refseq:NM_014602" - ], - "src_name": "HGNC", - "item_type": "identity" - }, - { - "label_and_type": "pik3r4##symbol", - "concept_id": "hgnc:8982", - "src_name": "HGNC", - "item_type": "symbol" - }, - { - "label_and_type": "phosphoinositide-3-kinase regulatory subunit 4##label", - "concept_id": "hgnc:8982", - "src_name": "HGNC", - "item_type": "label" - }, - { - "label_and_type": "p150##alias", - "concept_id": "hgnc:8982", - "src_name": "HGNC", - "item_type": "alias" - }, - { - "label_and_type": "vps15##alias", - "concept_id": "hgnc:8982", - "src_name": "HGNC", - "item_type": "alias" - }, - { - "label_and_type": "ncbigene:30849##xref", - "concept_id": "hgnc:8982", - "src_name": "HGNC", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000196455##xref", - "concept_id": "hgnc:8982", - "src_name": "HGNC", - "item_type": "xref" - } -] \ No newline at end of file diff --git a/tests/unit/data/metadata.json b/tests/unit/data/metadata.json deleted file mode 100644 index af9cd24d..00000000 --- a/tests/unit/data/metadata.json +++ /dev/null @@ -1,42 +0,0 @@ -[ - { - "src_name": "Ensembl", - "version": "104", - "data_url": "ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.104.gff3.gz", - "data_license_attributes": { - "non_commercial": false, - "share_alike": false, - "attribution": false - }, - "genome_assemblies": ["GRCh38"], - "data_license": "custom", - "data_license_url": "https://useast.ensembl.org/info/about/legal/disclaimer.html" - }, - { - "src_name": "HGNC", - "version": "20201208", - "data_url": "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json", - "data_license_attributes": { - "non_commercial": false, - "share_alike": false, - "attribution": false - }, - "genome_assemblies": [], - "data_license": "custom", - "data_license_url": "https://www.genenames.org/about/" - }, - { - "src_name": "NCBI", - "version": "20201215", - "data_url": "ftp://ftp.ncbi.nlm.nih.gov", - "data_license_attributes": { - "non_commercial": false, - "share_alike": false, - "attribution": false - }, - "genome_assemblies":[ "GRCh38.p13"], - "data_license": "custom", - "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", - "rdp_url": "https://reusabledata.org/ncbi-gene.html" - } -] diff --git a/tests/unit/data/ncbi_genes.json b/tests/unit/data/ncbi_genes.json deleted file mode 100644 index e5b9a93d..00000000 --- a/tests/unit/data/ncbi_genes.json +++ /dev/null @@ -1,1261 +0,0 @@ -[ - { - "label_and_type": "ncbigene:8193##identity", - "label": "double PHD fingers 1", - "concept_id": "ncbigene:8193", - "symbol": "DPF1", - "aliases": [ - "BAF45b", - "NEUD4", - "neuro-d4", - "SMARCG1" - ], - "xrefs": [ - "hgnc:20225", - "ensembl:ENSG00000011332" - ], - "associated_with": [ - "omim:601670" - ], - "strand": "-", - "locations": [ - { - "_id": "ga4gh:VCL.nEPKXzyfglrOMMFySOTQ8Om_f6xmr-pP", - "chr": "19", - "interval": { - "end": "q13.2", - "start": "q13.2", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - }, - { - "_id": "ga4gh:VSL.5ddLPk8gIdQm3YA4r4p5NAsty9SPwXvJ", - "interval": { - "end": 38229695, - "start": 38211006, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", - "type": "SequenceLocation" - } - ], - "src_name": "NCBI", - "item_type": "identity" - }, -{ - "label_and_type": "omim:601670##associated_with", - "concept_id": "ncbigene:8193", - "src_name": "NCBI", - "item_type": "associated_with" - }, - { - "label_and_type": "dpf1##symbol", - "concept_id": "ncbigene:8193", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "baf45b##alias", - "concept_id": "ncbigene:8193", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "neud4##alias", - "concept_id": "ncbigene:8193", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "neuro-d4##alias", - "concept_id": "ncbigene:8193", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "ncbigene:54704##identity", - "label": "pyruvate dehydrogenase phosphatase catalytic subunit 1", - "concept_id": "ncbigene:54704", - "symbol": "PDP1", - "aliases": [ - "PDH", - "PDP", - "PDPC", - "PPM2A", - "PPM2C" - ], - "xrefs": [ - "hgnc:9279", - "ensembl:ENSG00000164951" - ], - "previous_symbols": [ - "LOC157663", - "PPM2C" - ], - "associated_with": [ - "omim:605993" - ], - "strand": "+", - "locations": [ - { - "_id": "ga4gh:VCL.n9W_wjDCStQf29yPcjhkMnFmESG8wN9A", - "chr": "8", - "interval": { - "end": "q22.1", - "start": "q22.1", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - }, - { - "_id": "ga4gh:VSL.s8NNWWCxWeiOCUa90ge6x8wxFWpO34gV", - "interval": { - "end": 93926068, - "start": 93916923, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs", - "type": "SequenceLocation" - } - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "pdp1##symbol", - "concept_id": "ncbigene:54704", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "loc157663##prev_symbol", - "concept_id": "ncbigene:54704", - "src_name": "NCBI", - "item_type": "prev_symbol" - }, - { - "label_and_type": "ppm2c##prev_symbol", - "concept_id": "ncbigene:54704", - "src_name": "NCBI", - "item_type": "prev_symbol" - }, - { - "label_and_type": "pdh##alias", - "concept_id": "ncbigene:54704", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "pdp##alias", - "concept_id": "ncbigene:54704", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "pdpc##alias", - "concept_id": "ncbigene:54704", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "ppm2a##alias", - "concept_id": "ncbigene:54704", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "ppm2c##alias", - "concept_id": "ncbigene:54704", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "ncbigene:10251##identity", - "label": "sprouty RTK signaling antagonist 3", - "concept_id": "ncbigene:10251", - "symbol": "SPRY3", - "aliases": [ - "spry-3" - ], - "xrefs": [ - "hgnc:11271", - "ensembl:ENSG00000168939" - ], - "previous_symbols": [ - "LOC170187", - "LOC253479" - ], - "associated_with": [ - "omim:300531" - ], - "strand": "+", - "locations": [ - { - "_id": "ga4gh:VCL.A1s9hZY1tgmRi1WuXM1ETZOqJcpo4Ftx", - "chr": "Y", - "interval": { - "end": "q12", - "start": "q12", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - }, - { - "_id": "ga4gh:VCL.fEBeCyej0jVKsvjw4vxyW6j1h8UVLb5S", - "chr": "X", - "interval": { - "end": "q28", - "start": "q28", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - }, - { - "_id": "ga4gh:VSL.ILQGzmwapQ5xuIBoFPDTKUOuN9gJNSqp", - "interval": { - "end": 155782459, - "start": 155612586, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", - "type": "SequenceLocation" - }, - { - "_id": "ga4gh:VSL.R8x4-hQzGNmp654J83nUT2RJlIXOucno", - "interval": { - "end": 56968979, - "start": 56923423, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", - "type": "SequenceLocation" - } - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "spry3##symbol", - "concept_id": "ncbigene:10251", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "spry-3##alias", - "concept_id": "ncbigene:10251", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "ncbigene:106##identity", - "label": "adenosine deaminase complexing protein 1", - "concept_id": "ncbigene:106", - "symbol": "ADCP1", - "xrefs": [ - "hgnc:229" - ], - "location_annotations": [ - "6" - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "adcp1##symbol", - "concept_id": "ncbigene:106", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "ncbigene:170##identity", - "label": "ankyloblepharon filiforme adnatum", - "concept_id": "ncbigene:170", - "symbol": "AFA", - "associated_with": [ - "omim:106250" - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "afa##symbol", - "concept_id": "ncbigene:170", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "ncbigene:7637##identity", - "label": "zinc finger protein 84", - "concept_id": "ncbigene:7637", - "symbol": "ZNF84", - "aliases": [ - "HPF2" - ], - "xrefs": [ - "hgnc:13159", - "ensembl:ENSG00000198040" - ], - "previous_symbols": [ - "LOC100287429" - ], - "associated_with": [ - "omim:618554" - ], - "location_annotations": [ - "map from Rosati ref via FISH [AFS]" - ], - "strand": "+", - "locations": [ - { - "_id": "ga4gh:VCL.CusjBE-q66vf4v8VSHRhMxjR_4G688Ve", - "chr": "12", - "interval": { - "end": "q24.33", - "start": "q24.33", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - }, - { - "_id": "ga4gh:VSL.xuSNWLM7M4vru2gbt2hOfmjL7GD_UYdA", - "interval": { - "end": 133063299, - "start": 133037301, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl", - "type": "SequenceLocation" - } - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "znf84##symbol", - "concept_id": "ncbigene:7637", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "ncbigene:293##identity", - "label": "solute carrier family 25 member 6", - "concept_id": "ncbigene:293", - "symbol": "SLC25A6", - "aliases": [ - "AAC3", - "ANT", - "ANT 2", - "ANT 3", - "ANT3", - "ANT3Y" - ], - "xrefs": [ - "hgnc:10992", - "ensembl:ENSG00000169100" - ], - "previous_symbols": [ - "ANT3Y" - ], - "associated_with": [ - "omim:403000", - "omim:300151" - ], - "location_annotations": [ - "X", - "Y" - ], - "strand": "-", - "locations": [ - { - "_id": "ga4gh:VSL.8LvkImsib1GaFH8UQrz9DwjdrgkWNmdD", - "interval": { - "end": 1392113, - "start": 1386152, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", - "type": "SequenceLocation" - }, - { - "_id": "ga4gh:VSL.QLShSFmwPqUKTzjNS-x3Y3DZtOV4SK4J", - "interval": { - "end": 1392113, - "start": 1386152, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", - "type": "SequenceLocation" - } - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "slc25a6##symbol", - "concept_id": "ncbigene:293", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "ncbigene:106783576##identity", - "label": "nonconserved acetylation island sequence 68 enhancer", - "concept_id": "ncbigene:106783576", - "symbol": "LOC106783576", - "locations": [ - { - "_id": "ga4gh:VCL.RFN35KQMhqzhmo4QP7AxKAzlPtnh7slL", - "chr": "10", - "interval": { - "end": "cen", - "start": "pter", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "loc106783576##symbol", - "concept_id": "ncbigene:106783576", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "ncbigene:2722##identity", - "label": "glaucoma 1, open angle, B (adult-onset)", - "concept_id": "ncbigene:2722", - "symbol": "GLC1B", - "associated_with": [ - "omim:606689" - ], - "locations": [ - { - "_id": "ga4gh:VCL.HStPIl_6UkNQmbjZW1TeUmHFMptbIj6t", - "chr": "2", - "interval": { - "end": "q13", - "start": "cen", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "omim:606689##associated_with", - "concept_id": "ncbigene:2722", - "src_name": "NCBI", - "item_type": "associated_with" - }, - { - "label_and_type": "glc1b##symbol", - "concept_id": "ncbigene:2722", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "ncbigene:50829##identity", - "label": "Hodgkin disease, susceptibility, pseudoautosomal", - "concept_id": "ncbigene:50829", - "symbol": "HDPA", - "associated_with": [ - "omim:300221" - ], - "locations": [ - { - "_id": "ga4gh:VCL.faRHNO_VJMssbjYQ628mfdRgLqg9qK2b", - "chr": "X", - "interval": { - "end": "p22.32", - "start": "pter", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "hdpa##symbol", - "concept_id": "ncbigene:50829", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "ncbigene:731716##identity", - "label": "protein activator of interferon induced protein kinase EIF2AK2 pseudogene 1", - "concept_id": "ncbigene:731716", - "symbol": "PRKRAP1", - "xrefs": [ - "hgnc:33447" - ], - "previous_symbols": [ - "LOC100289695" - ], - "location_annotations": [ - "alternate reference locus" - ], - "strand": "+", - "locations": [ - { - "_id": "ga4gh:VCL.HeTd-jABCr22v4rUfVWJbkz2NkPyGScK", - "chr": "6", - "interval": { - "end": "p21.3", - "start": "p21.3", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - }, - { - "_id": "ga4gh:VSL.Sk6GHE3MekN_PaimBKBmsQR6pLqQ-SAN", - "interval": { - "end": 3941874, - "start": 3940270, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.MjujHSAsgNWRTX4w3ysM7b5OVhZpdXu1", - "type": "SequenceLocation" - }, - { - "_id": "ga4gh:VSL.WTVn8f6OmsdbiEniT8W4CHuHJdp78oBP", - "interval": { - "end": 3932085, - "start": 3930481, - "type": "SimpleInterval" - }, - "sequence_id": "ga4gh:SQ.Q8IworEhpLeXwpz1CHM7C3luysh-ltx-", - "type": "SequenceLocation" - } - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "hgnc:33447##xref", - "concept_id": "ncbigene:731716", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "prkrap1##symbol", - "concept_id": "ncbigene:731716", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "ncbigene:619511##identity", - "label": "myopathy, hyaline body, autosomal recessive", - "concept_id": "ncbigene:619511", - "symbol": "MHB", - "associated_with": [ - "omim:255160" - ], - "locations": [ - { - "_id": "ga4gh:VCL.2WDJu032Gc_9BN4qiNELb577XomiZv8z", - "chr": "3", - "interval": { - "end": "p21.32", - "start": "p22.2", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "omim:255160##associated_with", - "concept_id": "ncbigene:619511", - "src_name": "NCBI", - "item_type": "associated_with" - }, - { - "label_and_type": "mhb##symbol", - "concept_id": "ncbigene:619511", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "ncbigene:100049159##identity", - "label": "spastic paraplegia 37 (autosomal dominant)", - "concept_id": "ncbigene:100049159", - "symbol": "SPG37", - "associated_with": [ - "omim:611945" - ], - "locations": [ - { - "_id": "ga4gh:VCL.P5jAIluXneqHZMV9FBEQ2ZqOpO-8fqbP", - "chr": "8", - "interval": { - "end": "q13.3", - "start": "p21.2", - "type": "CytobandInterval" - }, - "species_id": "taxonomy:9606", - "type": "ChromosomeLocation" - } - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "omim:611945##associated_with", - "concept_id": "ncbigene:100049159", - "src_name": "NCBI", - "item_type": "associated_with" - }, - { - "label_and_type": "spg37##symbol", - "concept_id": "ncbigene:100049159", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "ncbigene:43##identity", - "item_type": "identity", - "label": "acetylcholinesterase (Cartwright blood group)", - "concept_id": "ncbigene:43", - "symbol": "ACHE", - "aliases": [ - "N-ACHE", - "YT", - "ARACHE", - "ACEE" - ], - "previous_symbols": [ - "ACEE" - ], - "xrefs": [ - "hgnc:108", - "ensembl:ENSG00000087085" - ], - "associated_with": [ - "omim:100740" - ], - "locations": [ - { - "_id": "ga4gh:VCL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", - "type": "ChromosomeLocation", - "species_id": "taxonomy:9606", - "chr": "7", - "interval": { - "end": "q22.1", - "start": "q22.1", - "type": "CytobandInterval" - } - }, - { - "_id": "ga4gh:VSL.jsbI0UoBvKoN6a_My1w9F4H1kd2kWVUb", - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", - "interval": { - "end": 100896994, - "start": 100889994, - "type": "SimpleInterval" - } - } - ], - "strand": "-", - "src_name": "NCBI" - }, - { - "label_and_type": "ache##symbol", - "concept_id": "ncbigene:43", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "acetylcholinesterase (cartwright blood group)##label", - "concept_id": "ncbigene:43", - "src_name": "NCBI", - "item_type": "label" - }, - { - "label_and_type": "hgnc:108##xref", - "concept_id": "ncbigene:43", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000087085##xref", - "concept_id": "ncbigene:43", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "omim:100740##associated_with", - "concept_id": "ncbigene:43", - "src_name": "NCBI", - "item_type": "associated_with" - }, - { - "label_and_type": "n-ache##alias", - "concept_id": "ncbigene:43", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "yt##alias", - "concept_id": "ncbigene:43", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "arache##alias", - "concept_id": "ncbigene:43", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "acee##alias", - "concept_id": "ncbigene:43", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "acee##prev_symbol", - "concept_id": "ncbigene:43", - "src_name": "NCBI", - "item_type": "prev_symbol" - }, - { - "label_and_type": "ncbigene:673##identity", - "concept_id": "ncbigene:673", - "symbol": "BRAF", - "label": "B-Raf proto-oncogene, serine/threonine kinase", - "strand": "-", - "locations": [ - { - "_id": "ga4gh:VCL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw", - "type": "ChromosomeLocation", - "species_id": "taxonomy:9606", - "chr": "7", - "interval": { - "end": "q34", - "start": "q34", - "type": "CytobandInterval" - } - }, - { - "_id": "ga4gh:VSL.pmvErcgG96dcFyrZ4YiiSnFa5jJCCZmp", - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", - "interval": { - "end": 140924929, - "start": 140713328, - "type": "SimpleInterval" - } - } - ], - "aliases": [ - "RAFB1", - "NS7", - "B-RAF1", - "BRAF1", - "B-raf" - ], - "xrefs": [ - "hgnc:1097", - "ensembl:ENSG00000157764" - ], - "associated_with": [ - "omim:164757" - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "braf##symbol", - "concept_id": "ncbigene:673", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "b-raf proto-oncogene, serine/threonine kinase##label", - "concept_id": "ncbigene:673", - "src_name": "NCBI", - "item_type": "label" - }, - { - "label_and_type": "hgnc:1097##xref", - "concept_id": "ncbigene:673", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000157764##xref", - "concept_id": "ncbigene:673", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "omim:164757##associated_with", - "concept_id": "ncbigene:673", - "src_name": "NCBI", - "item_type": "associated_with" - }, - { - "label_and_type": "rafb1##alias", - "concept_id": "ncbigene:673", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "ns7##alias", - "concept_id": "ncbigene:673", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "b-raf1##alias", - "concept_id": "ncbigene:673", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "braf1##alias", - "concept_id": "ncbigene:673", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "b-raf##alias", - "concept_id": "ncbigene:673", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "ncbigene:25##identity", - "concept_id": "ncbigene:25", - "symbol": "ABL1", - "label": "ABL proto-oncogene 1, non-receptor tyrosine kinase", - "strand": "+", - "locations": [ - { - "_id": "ga4gh:VCL.WvMfE67KxSDAV8JaK593TI74yyJWIsMQ", - "type": "ChromosomeLocation", - "species_id": "taxonomy:9606", - "chr": "9", - "interval": { - "end": "q34.12", - "start": "q34.12", - "type": "CytobandInterval" - } - }, - { - "_id": "ga4gh:VSL.r8YW4n2xNqifnz-dmlnRjmbCYqUGqnDk", - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", - "interval": { - "end": 130887675, - "start": 130713881, - "type": "SimpleInterval" - } - } - ], - "aliases": [ - "CHDSKM", - "BCR-ABL", - "ABL", - "v-abl", - "JTK7", - "c-ABL", - "p150", - "c-ABL1", - "bcr/abl" - ], - "previous_symbols": [ - "LOC116063", - "LOC112779" - ], - "xrefs": [ - "hgnc:76", - "ensembl:ENSG00000097007" - ], - "associated_with": [ - "omim:189980" - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "abl1##symbol", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "abl proto-oncogene 1, non-receptor tyrosine kinase##label", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "label" - }, - { - "label_and_type": "hgnc:76##xref", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000097007##xref", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "omim:189980##associated_with", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "chdskm##alias", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "bcr-abl##alias", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "abl##alias", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "v-abl##alias", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "jtk7##alias", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "c-abl##alias", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "p150##alias", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "c-abl1##alias", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "bcr/abl##alias", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "loc116063##prev_symbol", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "loc112779##prev_symbol", - "concept_id": "ncbigene:25", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "ncbigene:103344718##identity", - "concept_id": "ncbigene:103344718", - "symbol": "HOTS", - "symbol_status": "discontinued", - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "hots##symbol", - "concept_id": "ncbigene:103344718", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "ncbigene:544580##identity", - "concept_id": "ncbigene:544580", - "symbol": "AASTH23", - "symbol_status": "discontinued", - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "aasth23##symbol", - "concept_id": "ncbigene:544580", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "ncbigene:9646##identity", - "concept_id": "ncbigene:9646", - "symbol": "CTR9", - "label": "CTR9 homolog, Paf1/RNA polymerase II complex component", - "aliases": [ - "p150", - "SH2BP1", - "p150TSP", - "TSBP" - ], - "xrefs": [ - "hgnc:16850", - "ensembl:ENSG00000198730" - ], - "associated_with": [ - "omim:609366" - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "p150##alias", - "concept_id": "ncbigene:9646", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "hgnc:16850##xref", - "concept_id": "ncbigene:9646", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000198730##xref", - "concept_id": "ncbigene:9646", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "ncbigene:290##identity", - "concept_id": "ncbigene:290", - "symbol": "ANPEP", - "label": "alanyl aminopeptidase, membrane", - "aliases": [ - "GP150", - "LAP1", - "APN", - "CD13", - "PEPN", - "P150" - ], - "xrefs": [ - "hgnc:500", - "ensembl:ENSG00000166825" - ], - "associated_with": [ - "omim:151530" - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "p150##alias", - "concept_id": "ncbigene:290", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "hgnc:500##xref", - "concept_id": "ncbigene:290", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000166825##xref", - "concept_id": "ncbigene:290", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "ncbigene:10036##identity", - "concept_id": "ncbigene:10036", - "symbol": "CHAF1A", - "label": "chromatin assembly factor 1 subunit A", - "aliases": [ - "CAF1B", - "CAF1", - "CAF-1", - "CAF1P150", - "P150" - ], - "xrefs": [ - "hgnc:1910", - "ensembl:ENSG00000167670" - ], - "associated_with": [ - "omim:601246" - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "chaf1a##symbol", - "concept_id": "ncbigene:10036", - "src_name": "NCBI", - "item_type": "symbol" - }, - { - "label_and_type": "chromatin assembly factor 1 subunit a##label", - "concept_id": "ncbigene:10036", - "src_name": "NCBI", - "item_type": "label" - }, - { - "label_and_type": "p150##alias", - "concept_id": "ncbigene:10036", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "caf1b##alias", - "concept_id": "ncbigene:10036", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "caf1##alias", - "concept_id": "ncbigene:10036", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "caf-1##alias", - "concept_id": "ncbigene:10036", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "caf1p150##alias", - "concept_id": "ncbigene:10036", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "hgnc:1910##xref", - "concept_id": "ncbigene:10036", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000167670##xref", - "concept_id": "ncbigene:10036", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "ncbigene:30849##identity", - "concept_id": "ncbigene:30849", - "symbol": "PIK3R4", - "label": "phosphoinositide-3-kinase regulatory subunit 4", - "aliases": [ - "p150", - "VPS15" - ], - "xrefs": [ - "hgnc:8982", - "ensembl:ENSG00000196455" - ], - "associated_with": [ - "omim:602610" - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "p150##alias", - "concept_id": "ncbigene:30849", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "hgnc:8982##xref", - "concept_id": "ncbigene:30849", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ensg00000196455##xref", - "concept_id": "ensembl:ENSG00000196455", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "ncbigene:25782##identity", - "concept_id": "ncbigene:25782", - "symbol": "RAB3GAP2", - "label": "RAB3 GTPase activating non-catalytic protein subunit 2", - "aliases": [ - "RAB3GAP150", - "RAB3-GAP150", - "SPG69", - "WARBM2", - "p150" - ], - "previous_symbols": [ - "DKFZP434D245" - ], - "xrefs": [ - "hgnc:17168", - "ensembl:ENSG00000118873" - ], - "associated_with": [ - "omim:609275" - ], - "src_name": "NCBI", - "item_type": "identity" - }, - { - "label_and_type": "p150##alias", - "concept_id": "ncbigene:25782", - "src_name": "NCBI", - "item_type": "alias" - }, - { - "label_and_type": "hgnc:17168##xref", - "concept_id": "ncbigene:25782", - "src_name": "NCBI", - "item_type": "xref" - }, - { - "label_and_type": "ensembl:ENSG00000118873##xref", - "concept_id": "ENSG00000118873", - "src_name": "NCBI", - "item_type": "xref" - } -] diff --git a/tests/unit/test_database.py b/tests/unit/test_database.py deleted file mode 100644 index 58341a83..00000000 --- a/tests/unit/test_database.py +++ /dev/null @@ -1,89 +0,0 @@ -"""Test DynamoDB""" -import pytest -from gene import PREFIX_LOOKUP -from gene.database import Database -from gene.etl.merge import Merge -import json -import os -from pathlib import Path -from boto3.dynamodb.conditions import Key - -TEST_ROOT = Path(__file__).resolve().parents[2] - - -@pytest.fixture(scope='module') -def db(): - """Create a DynamoDB test fixture.""" - class DB: - def __init__(self): - self.db = Database() - self.merge = Merge(database=self.db) - if os.environ.get('TEST') is not None: - self.db.delete_all_db_tables() - self.db.create_db_tables() - processed_ids = self.load_test_data() - self.merge.create_merged_concepts(processed_ids) - - def load_test_data(self): - processed_ids = set() - for src in PREFIX_LOOKUP.values(): - with open(f'{TEST_ROOT}/tests/unit/' - f'data/{src.lower()}_genes.json', 'r') as f: - genes = json.load(f) - with self.db.genes.batch_writer() as batch: - for gene in genes: - if gene["label_and_type"].endswith("##identity"): - processed_ids.add(gene["concept_id"]) - batch.put_item(Item=gene) - f.close() - - with open(f'{TEST_ROOT}/tests/unit/' - f'data/metadata.json', 'r') as f: - metadata = json.load(f) - with self.db.metadata.batch_writer() as batch: - for m in metadata: - batch.put_item(Item=m) - f.close() - return processed_ids - - return DB().db - - -def test_tables_created(db): - """Check that gene_concepts and gene_metadata are created.""" - existing_tables = db.dynamodb_client.list_tables()['TableNames'] - assert 'gene_concepts' in existing_tables - assert 'gene_metadata' in existing_tables - - -def test_item_type(db): - """Check that items are tagged with item_type attribute.""" - filter_exp = Key('label_and_type').eq('ncbigene:43##identity') - item = db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] - assert 'item_type' in item - assert item['item_type'] == 'identity' - - filter_exp = Key('label_and_type').eq('prkrap1##symbol') - item = db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] - assert 'item_type' in item - assert item['item_type'] == 'symbol' - - filter_exp = Key('label_and_type').eq('a1bgas##prev_symbol') - item = db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] - assert 'item_type' in item - assert item['item_type'] == 'prev_symbol' - - filter_exp = Key('label_and_type').eq('flj23569##alias') - item = db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] - assert 'item_type' in item - assert item['item_type'] == 'alias' - - filter_exp = Key('label_and_type').eq('omim:606689##associated_with') - item = db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] - assert 'item_type' in item - assert item['item_type'] == 'associated_with' - - filter_exp = Key('label_and_type').eq('ensembl:ensg00000097007##xref') - item = db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] - assert 'item_type' in item - assert item['item_type'] == 'xref' diff --git a/tests/unit/test_database_and_etl.py b/tests/unit/test_database_and_etl.py new file mode 100644 index 00000000..de6a2ead --- /dev/null +++ b/tests/unit/test_database_and_etl.py @@ -0,0 +1,127 @@ +"""Test DynamoDB and ETL methods.""" +import pytest +from gene.etl import Ensembl, HGNC, NCBI +from gene.etl.merge import Merge +from gene.database import Database +import os +from pathlib import Path +from boto3.dynamodb.conditions import Key + + +@pytest.fixture(scope='module') +def is_test_env(): + """Test fixture to determine whether or not using test environment.""" + return os.environ.get('TEST') is not None + + +@pytest.fixture(scope='module') +def dynamodb(is_test_env): + """Create a DynamoDB test fixture.""" + class DB: + def __init__(self): + self.db = Database() + self.merge = Merge(database=self.db) + if is_test_env: + self.db.delete_all_db_tables() + self.db.create_db_tables() + return DB() + + +@pytest.fixture(scope='module') +def processed_ids(): + """Create a test fixture to store processed ids for merged concepts.""" + return list() + + +@pytest.fixture(scope='module') +def etl_data_path(): + """Create a test fixture to return etl data path.""" + project_root = Path().resolve().parents[1] + return project_root / 'tests' / 'unit' / 'data' / 'etl_data' + + +def test_tables_created(dynamodb): + """Check that gene_concepts and gene_metadata are created.""" + existing_tables = dynamodb.db.dynamodb_client.list_tables()['TableNames'] + assert 'gene_concepts' in existing_tables + assert 'gene_metadata' in existing_tables + + +def test_ensembl_transform(processed_ids, dynamodb, etl_data_path, + is_test_env): + """Test ensembl transform method.""" + if is_test_env: + e = Ensembl(dynamodb.db) + e._data_src = etl_data_path / 'ensembl_104.gff3' + e._transform_data() + e._add_meta() + processed_ids += e._processed_ids + + +def test_hgnc_transform(processed_ids, dynamodb, etl_data_path, is_test_env): + """Test hgnc transform method.""" + if is_test_env: + h = HGNC(dynamodb.db) + h._data_src = etl_data_path / 'hgnc_20210810.json' + h._version = '20210810' + h._transform_data() + h._add_meta() + processed_ids += h._processed_ids + + +def test_ncbi_transform(processed_ids, dynamodb, etl_data_path, is_test_env): + """Test ncbi transform method.""" + if is_test_env: + n = NCBI(dynamodb.db) + n._info_src = etl_data_path / 'ncbi_info_20210813.tsv' + n._history_src = etl_data_path / 'ncbi_history_20210813.tsv' + n._gff_src = etl_data_path / 'ncbi_GRCh38.p13.gff' + n._version = n._info_src.stem.split('_')[-1] + n._transform_data() + n._add_meta() + processed_ids += n._processed_ids + + +def test_merged_conecpts(processed_ids, dynamodb, is_test_env): + """Create merged concepts and load to db.""" + if is_test_env: + dynamodb.merge.create_merged_concepts(processed_ids) + + +def test_item_type(dynamodb): + """Check that items are tagged with item_type attribute.""" + filter_exp = Key('label_and_type').eq('ncbigene:43##identity') + item = \ + dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'identity' + + filter_exp = Key('label_and_type').eq('prkrap1##symbol') + item = \ + dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'symbol' + + filter_exp = Key('label_and_type').eq('a1bgas##prev_symbol') + item = \ + dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'prev_symbol' + + filter_exp = Key('label_and_type').eq('flj23569##alias') + item = \ + dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'alias' + + filter_exp = Key('label_and_type').eq('omim:606689##associated_with') + item = \ + dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'associated_with' + + filter_exp = Key('label_and_type').eq('ensembl:ensg00000097007##xref') + item = \ + dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'xref' From 18347f73fe2965949517987e838f736a147a80c8 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Mon, 16 Aug 2021 16:16:51 -0400 Subject: [PATCH 2/8] monkey patch seqrepo --- .github/workflows/github-actions.yml | 4 --- Pipfile | 1 + gene/etl/base.py | 11 +++--- gene/etl/ensembl.py | 5 ++- gene/etl/ncbi.py | 4 +-- gene/vrs_locations/sequence_location.py | 13 ++++++- tests/unit/test_database_and_etl.py | 48 ++++++++++++++++++++++--- 7 files changed, 66 insertions(+), 20 deletions(-) diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index 8bc9de81..e5a4df88 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -28,9 +28,5 @@ jobs: chmod +x ./tests/unit/dynamodb_build.bash ./tests/unit/dynamodb_build.bash - - name: Load and Test DynamoDB - run: | - pipenv run pytest tests/unit/test_database.py - - run: pipenv run flake8 - run: pipenv run pytest tests/ diff --git a/Pipfile b/Pipfile index 36b0e4c8..5de6f28d 100644 --- a/Pipfile +++ b/Pipfile @@ -30,3 +30,4 @@ coverage = "*" coveralls = "*" pytest-cov = "*" jupyterlab = "*" +mock = "*" diff --git a/gene/etl/base.py b/gene/etl/base.py index 64762db5..adda7334 100644 --- a/gene/etl/base.py +++ b/gene/etl/base.py @@ -23,8 +23,9 @@ class Base(ABC): """The ETL base class.""" - def __init__(self, database: Database, host: str, data_dir: str, *args, - **kwargs) -> None: + def __init__(self, database: Database, host: str, data_dir: str, + seqrepo_dir=PROJECT_ROOT / 'data' / 'seqrepo' / 'latest', + *args, **kwargs) -> None: """Instantiate Base class. :param Database database: DynamoDB database @@ -35,6 +36,7 @@ def __init__(self, database: Database, host: str, data_dir: str, *args, self._host = host self._data_dir = data_dir self._processed_ids = list() + self.seqrepo = self.get_seqrepo(seqrepo_dir) @abstractmethod def perform_etl(self) -> List[str]: @@ -148,9 +150,8 @@ def _ftp_download(self, host: str, data_dir: str, fn: str, remove(filepath) return version - def get_seqrepo(self) -> SeqRepo: + def get_seqrepo(self, seqrepo_dir) -> SeqRepo: """Return SeqRepo instance.""" - seqrepo_dir = PROJECT_ROOT / 'data' / 'seqrepo' / 'latest' if not seqrepo_dir.exists(): - raise NotADirectoryError("Could not find gene/data/seqrepo/latest") + raise NotADirectoryError(f"Could not find {seqrepo_dir}") return SeqRepo(seqrepo_dir) diff --git a/gene/etl/ensembl.py b/gene/etl/ensembl.py index 5673f3a5..6ec7cdb6 100644 --- a/gene/etl/ensembl.py +++ b/gene/etl/ensembl.py @@ -70,8 +70,6 @@ def _transform_data(self, *args, **kwargs): merge_strategy="create_unique", keep_order=True) - sr = self.get_seqrepo() - # Get accession numbers accession_numbers = dict() for item in db.features_of_type('scaffold'): @@ -84,7 +82,8 @@ def _transform_data(self, *args, **kwargs): if f.attributes.get('ID'): f_id = f.attributes.get('ID')[0].split(':')[0] if f_id == 'gene': - gene = self._add_gene(f, sr, accession_numbers) + gene = \ + self._add_gene(f, self.seqrepo, accession_numbers) if gene: self._load_gene(gene, batch) logger.info('Successfully transformed Ensembl.') diff --git a/gene/etl/ncbi.py b/gene/etl/ncbi.py index 76b21571..f0c8efb2 100644 --- a/gene/etl/ncbi.py +++ b/gene/etl/ncbi.py @@ -514,8 +514,6 @@ def _transform_data(self): prev_symbols = self._get_prev_symbols() info_genes = self._get_gene_info(prev_symbols) - sr = self.get_seqrepo() - # create db for gff file db = gffutils.create_db(str(self._gff_src), dbfn=":memory:", @@ -523,7 +521,7 @@ def _transform_data(self): merge_strategy="create_unique", keep_order=True) - self._get_gene_gff(db, info_genes, sr) + self._get_gene_gff(db, info_genes, self.seqrepo) with self._database.genes.batch_writer() as batch: for gene in info_genes.keys(): diff --git a/gene/vrs_locations/sequence_location.py b/gene/vrs_locations/sequence_location.py index 0578ca2a..4ad31709 100644 --- a/gene/vrs_locations/sequence_location.py +++ b/gene/vrs_locations/sequence_location.py @@ -1,4 +1,6 @@ """This module defines GA4GH Sequence Location.""" +from typing import List + from ga4gh.vrs import models from ga4gh.core import ga4gh_identify import logging @@ -10,6 +12,15 @@ class SequenceLocation: """The class for GA4GH Sequence Location.""" + def get_aliases(self, sr, seqid) -> List[str]: + """Get aliases for a sequence id + + :param SeqRepo sr: seqrepo instance + :param str seqid: Sequence ID accession + :return: List of aliases for seqid + """ + return sr.translate_alias(seqid) + def add_location(self, seqid, gene, params, sr): """Get a gene's Sequence Location. @@ -20,7 +31,7 @@ def add_location(self, seqid, gene, params, sr): :return: A dictionary of a GA4GH VRS SequenceLocation. """ location = dict() - aliases = sr.translate_alias(seqid) + aliases = self.get_aliases(sr, seqid) sequence_id = [a for a in aliases if a.startswith('ga4gh')][0] if gene.start != '.' and gene.end != '.' and sequence_id: diff --git a/tests/unit/test_database_and_etl.py b/tests/unit/test_database_and_etl.py index de6a2ead..97789b9c 100644 --- a/tests/unit/test_database_and_etl.py +++ b/tests/unit/test_database_and_etl.py @@ -6,6 +6,25 @@ import os from pathlib import Path from boto3.dynamodb.conditions import Key +from mock import patch + +ALIASES = { + "NC_000001.11": ["ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO"], + "NC_000002.12": ["ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g"], + "NC_000003.12": ["ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX"], + "NC_000007.14": ["ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul"], + "NC_000009.12": ["ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI"], + "NC_000011.10": ["ga4gh:SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1"], + "NC_000015.10": ["ga4gh:SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6"], + "NC_000017.11": ["ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7"], + "NC_000019.10": ["ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl"], + "NC_000023.11": ["ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP"], + "NC_000008.11": ["ga4gh:SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs"], + "NC_000012.12": ["ga4gh:SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl"], + "NC_000024.10": ["ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5"], + "NT_167246.2": ["ga4gh:SQ.MjujHSAsgNWRTX4w3ysM7b5OVhZpdXu1"], + "NT_167249.2": ["ga4gh:SQ.Q8IworEhpLeXwpz1CHM7C3luysh-ltx-"] +} @pytest.fixture(scope='module') @@ -33,6 +52,16 @@ def processed_ids(): return list() +def _get_aliases(sr, seqid): + """Monkey patch get aliases method + + :param SeqRepo sr: seqrepo instance + :param str seqid: Sequence ID accession + :return: List of aliases for seqid + """ + return ALIASES[seqid] + + @pytest.fixture(scope='module') def etl_data_path(): """Create a test fixture to return etl data path.""" @@ -47,20 +76,27 @@ def test_tables_created(dynamodb): assert 'gene_metadata' in existing_tables -def test_ensembl_transform(processed_ids, dynamodb, etl_data_path, - is_test_env): +@patch.object(Ensembl, 'get_seqrepo') +def test_ensembl_transform(test_get_seqrepo, + processed_ids, dynamodb, + etl_data_path, is_test_env): """Test ensembl transform method.""" if is_test_env: + test_get_seqrepo.return_value = None e = Ensembl(dynamodb.db) + e._sequence_location.get_aliases = _get_aliases e._data_src = etl_data_path / 'ensembl_104.gff3' e._transform_data() e._add_meta() processed_ids += e._processed_ids -def test_hgnc_transform(processed_ids, dynamodb, etl_data_path, is_test_env): +@patch.object(HGNC, 'get_seqrepo') +def test_hgnc_transform(test_get_seqrepo, processed_ids, dynamodb, + etl_data_path, is_test_env): """Test hgnc transform method.""" if is_test_env: + test_get_seqrepo.return_value = None h = HGNC(dynamodb.db) h._data_src = etl_data_path / 'hgnc_20210810.json' h._version = '20210810' @@ -69,10 +105,14 @@ def test_hgnc_transform(processed_ids, dynamodb, etl_data_path, is_test_env): processed_ids += h._processed_ids -def test_ncbi_transform(processed_ids, dynamodb, etl_data_path, is_test_env): +@patch.object(NCBI, 'get_seqrepo') +def test_ncbi_transform(test_get_seqrepo, processed_ids, dynamodb, + etl_data_path, is_test_env): """Test ncbi transform method.""" if is_test_env: + test_get_seqrepo.return_value = None n = NCBI(dynamodb.db) + n._sequence_location.get_aliases = _get_aliases n._info_src = etl_data_path / 'ncbi_info_20210813.tsv' n._history_src = etl_data_path / 'ncbi_history_20210813.tsv' n._gff_src = etl_data_path / 'ncbi_GRCh38.p13.gff' From dd039cdec4a04f65cac8dbe6b0a14db1bb6ae5e1 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Mon, 16 Aug 2021 16:20:16 -0400 Subject: [PATCH 3/8] update requirements --- requirements-dev.txt | 164 +++++++++++++++++++++++-------------------- requirements.txt | 86 ++++++++++++++--------- 2 files changed, 138 insertions(+), 112 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 036173f8..b7354771 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,146 +10,154 @@ -i https://pypi.org/simple -e . -anyio==2.2.0; python_full_version >= '3.6.2' +anyio==3.3.0; python_full_version >= '3.6.2' appdirs==1.4.4 -appnope==0.1.2; sys_platform == 'darwin' and platform_system == 'Darwin' -argcomplete==1.12.2 +appnope==0.1.2; sys_platform == 'darwin' +argcomplete==1.12.3 argh==0.26.2 argon2-cffi==20.1.0 -async-generator==1.10; python_version >= '3.5' -attrs==20.3.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -babel==2.9.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +asgiref==3.4.1; python_version >= '3.6' +attrs==21.2.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +babel==2.9.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' backcall==0.2.0 backports-datetime-fromisoformat==1.0.0 +backports.entry-points-selectable==1.1.0; python_version >= '2.7' beautifulsoup4==4.9.3 -biocommons.seqrepo==0.6.3 -bioutils==0.5.2.post3; python_version >= '3.6' -bleach==3.3.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -boto3==1.17.41 -botocore==1.20.41; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' +biocommons.seqrepo==0.6.4 +bioutils==0.5.5; python_version >= '3.6' +bleach==4.0.0; python_version >= '3.6' +boto3==1.18.22 +botocore==1.21.22; python_version >= '3.6' bs4==0.0.1 canonicaljson==1.4.0; python_version ~= '3.5' -certifi==2020.12.5 -cffi==1.14.5 -cfgv==3.2.0; python_full_version >= '3.6.1' -chardet==4.0.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +certifi==2021.5.30 +cffi==1.14.6 +cfgv==3.3.0; python_full_version >= '3.6.1' +charset-normalizer==2.0.4; python_version >= '3' civicpy==1.1.3; python_version >= '3.5' -click==7.1.2 -coloredlogs==15.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +click==8.0.1 +coloredlogs==15.0.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' coverage==5.5 -coveralls==3.0.1 +coveralls==3.2.0 cssselect==1.1.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -decorator==4.4.2 +debugpy==1.4.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +decorator==5.0.9; python_version >= '3.5' defusedxml==0.7.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' deprecation==2.1.0 -distlib==0.3.1 +distlib==0.3.2 docopt==0.6.2 entrypoints==0.3; python_version >= '2.7' fake-useragent==0.1.11 -fastapi==0.63.0 +fastapi==0.68.0 filelock==3.0.12 flake8-docstrings==1.6.0 -flake8==3.9.0 -frozendict==1.2 +flake8==3.9.2 +frozendict==2.0.6; python_version >= '3.6' ga4gh.vrs==0.6.3rc0 gffutils==0.10.1 h11==0.12.0; python_version >= '3.6' -httptools==0.1.1 -humanfriendly==9.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -identify==2.2.2; python_full_version >= '3.6.1' -idna==2.10; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +httptools==0.3.0 +humanfriendly==9.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +identify==2.2.13; python_full_version >= '3.6.1' +idna==3.2; python_version >= '3' +importlib-metadata==4.6.4; python_version >= '3.6' inflection==0.5.1; python_version >= '3.5' iniconfig==1.1.1 -ipykernel==5.5.3; python_version >= '3.5' +ipykernel==6.2.0; python_version >= '3.7' ipython-genutils==0.2.0 -ipython==7.22.0; python_version >= '3.7' +ipython==7.26.0; python_version >= '3.7' jedi==0.18.0; python_version >= '3.6' -jinja2==2.11.3; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +jinja2==3.0.1; python_version >= '3.6' jmespath==0.10.0; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3' -json5==0.9.5 +json5==0.9.6 jsonschema==3.2.0 jupyter-client==6.1.12; python_version >= '3.5' jupyter-core==4.7.1; python_version >= '3.6' -jupyter-packaging==0.7.12; python_version >= '3.6' -jupyter-server==1.5.1; python_version >= '3.6' +jupyter-server==1.10.2; python_version >= '3.6' jupyterlab-pygments==0.1.2 -jupyterlab-server==2.4.0; python_version >= '3.6' -jupyterlab==3.0.12 +jupyterlab-server==2.7.0; python_version >= '3.6' +jupyterlab==3.1.7 lxml==4.6.3; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' markdown==3.3.4; python_version >= '3.6' -markupsafe==1.1.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +markupsafe==2.0.1; python_version >= '3.6' +matplotlib-inline==0.1.2; python_version >= '3.5' mccabe==0.6.1 mistune==0.8.4 -nbclassic==0.2.6; python_version >= '3.6' -nbclient==0.5.3; python_full_version >= '3.6.1' -nbconvert==6.0.7; python_version >= '3.6' -nbformat==5.1.2; python_version >= '3.5' +mock==4.0.3 +nbclassic==0.3.1; python_version >= '3.6' +nbclient==0.5.4; python_full_version >= '3.6.1' +nbconvert==6.1.0; python_version >= '3.7' +nbformat==5.1.3; python_version >= '3.5' nest-asyncio==1.5.1; python_version >= '3.5' -networkx==2.5; python_version >= '3.6' -nodeenv==1.5.0 -notebook==6.3.0; python_version >= '3.6' -numpy==1.20.2; python_version >= '3.7' -obonet==0.2.6; python_version >= '3' -packaging==20.9; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -pandas==1.2.3; python_full_version >= '3.7.1' +networkx==2.6.2; python_version >= '3.7' +nodeenv==1.6.0 +notebook==6.4.3; python_version >= '3.6' +numpy==1.21.2; python_version < '3.11' and python_version >= '3.7' +obonet==0.3.0; python_version >= '3.5' +packaging==21.0; python_version >= '3.6' +pandas==1.3.2; python_full_version >= '3.7.1' pandocfilters==1.4.3 parse==1.19.0 parso==0.8.2; python_version >= '3.6' pexpect==4.8.0; sys_platform != 'win32' pickleshare==0.7.5 +platformdirs==2.2.0; python_version >= '3.6' pluggy==0.13.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -pre-commit==2.11.1 -prometheus-client==0.9.0 -prompt-toolkit==3.0.18; python_full_version >= '3.6.1' -ptyprocess==0.7.0; os_name != 'nt' +pre-commit==2.14.0 +prometheus-client==0.11.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +prompt-toolkit==3.0.19; python_full_version >= '3.6.1' +ptyprocess==0.7.0 py==1.10.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' pycodestyle==2.7.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' pycparser==2.20; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -pydantic==1.8.1 -pydocstyle==6.0.0; python_version >= '3.6' -pyee==8.1.0 -pyfaidx==0.5.9.5 +pydantic==1.8.2 +pydocstyle==6.1.1; python_version >= '3.6' +pyee==8.2.2 +pyfaidx==0.6.1 pyflakes==2.3.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -pygments==2.8.1; python_version >= '3.5' +pygments==2.10.0; python_version >= '3.5' pyparsing==2.4.7; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3' -pyppeteer==0.2.5; python_full_version >= '3.6.1' and python_full_version < '4.0.0' +pyppeteer==0.2.6; python_version < '4' and python_full_version >= '3.6.1' pyquery==1.4.3 -pyrsistent==0.17.3; python_version >= '3.5' +pyrsistent==0.18.0; python_version >= '3.6' pysam==0.16.0.1 -pytest-cov==2.11.1 -pytest==6.2.2 -python-dateutil==2.8.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +pytest-cov==2.12.1 +pytest==6.2.4 +python-dateutil==2.8.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' python-jsonschema-objects==0.3.10 pytz==2021.1 pyyaml==5.4.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' -pyzmq==22.0.3; python_version >= '3.6' +pyzmq==22.2.1; python_version >= '3.6' requests-html==0.10.0; python_version >= '3.6' -requests==2.25.1 -s3transfer==0.3.6 -send2trash==1.5.0 +requests-unixsocket==0.2.0 +requests==2.26.0 +s3transfer==0.5.0; python_version >= '3.6' +send2trash==1.8.0 seqrepo==0.0.0 -simplejson==3.17.2; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2, 3.3' -six==1.15.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +simplejson==3.17.3; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2, 3.3' +six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' sniffio==1.2.0; python_version >= '3.5' snowballstemmer==2.1.0 soupsieve==2.2.1; python_version >= '3' sqlparse==0.4.1; python_version >= '3.5' -starlette==0.13.6; python_version >= '3.6' +starlette==0.14.2; python_version >= '3.6' tabulate==0.8.9 -terminado==0.9.4; python_version >= '3.6' -testpath==0.4.4 +terminado==0.11.0; python_version >= '3.6' +testpath==0.5.0; python_version >= '3.5' toml==0.10.2; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3' tornado==6.1; python_version >= '3.5' -tqdm==4.59.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +tqdm==4.62.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' traitlets==5.0.5; python_version >= '3.7' -typing-extensions==3.7.4.3 -urllib3==1.26.4; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_full_version < '4.0.0' -uvicorn==0.13.4 -uvloop==0.15.2 +typing-extensions==3.10.0.0 +urllib3==1.26.6; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4' +uvicorn==0.15.0 +uvloop==0.16.0 vcfpy==0.13.3 -virtualenv==20.4.3; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +virtualenv==20.7.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' w3lib==1.22.0 wcwidth==0.2.5 webencodings==0.5.1 -websockets==8.1 -yoyo-migrations==7.3.1 +websocket-client==1.2.1; python_version >= '3.6' +websockets==9.1 +yoyo-migrations==7.3.2 +zipp==3.5.0; python_version >= '3.6' diff --git a/requirements.txt b/requirements.txt index 3c9074e4..c35f1db5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,62 +7,80 @@ -i https://pypi.org/simple appdirs==1.4.4 -argcomplete==1.12.2 +appnope==0.1.2; sys_platform == 'darwin' +argcomplete==1.12.3 argh==0.26.2 -attrs==20.3.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +asgiref==3.4.1; python_version >= '3.6' +attrs==21.2.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +backcall==0.2.0 beautifulsoup4==4.9.3 -biocommons.seqrepo==0.6.3 -bioutils==0.5.2.post3; python_version >= '3.6' -boto3==1.17.41 -botocore==1.20.41; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' +biocommons.seqrepo==0.6.4 +bioutils==0.5.5; python_version >= '3.6' +boto3==1.18.22 +botocore==1.21.22; python_version >= '3.6' bs4==0.0.1 canonicaljson==1.4.0; python_version ~= '3.5' -certifi==2020.12.5 -chardet==4.0.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -click==7.1.2 -coloredlogs==15.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +certifi==2021.5.30 +charset-normalizer==2.0.4; python_version >= '3' +click==8.0.1 +coloredlogs==15.0.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' cssselect==1.1.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +decorator==5.0.9; python_version >= '3.5' fake-useragent==0.1.11 -fastapi==0.63.0 -frozendict==1.2 +fastapi==0.68.0 +frozendict==2.0.6; python_version >= '3.6' ga4gh.vrs==0.6.3rc0 gffutils==0.10.1 h11==0.12.0; python_version >= '3.6' -httptools==0.1.1 -humanfriendly==9.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -idna==2.10; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +httptools==0.3.0 +humanfriendly==9.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +idna==3.2; python_version >= '3' +importlib-metadata==4.6.4; python_version >= '3.6' inflection==0.5.1; python_version >= '3.5' +ipython-genutils==0.2.0 +ipython==7.26.0; python_version >= '3.7' +jedi==0.18.0; python_version >= '3.6' jmespath==0.10.0; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3' jsonschema==3.2.0 lxml==4.6.3; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' markdown==3.3.4; python_version >= '3.6' -numpy==1.20.2; python_version >= '3.7' +matplotlib-inline==0.1.2; python_version >= '3.5' +numpy==1.21.2; python_version < '3.11' and python_version >= '3.7' parse==1.19.0 -pydantic==1.8.1 -pyee==8.1.0 -pyfaidx==0.5.9.5 -pyppeteer==0.2.5; python_full_version >= '3.6.1' and python_full_version < '4.0.0' +parso==0.8.2; python_version >= '3.6' +pexpect==4.8.0; sys_platform != 'win32' +pickleshare==0.7.5 +prompt-toolkit==3.0.19; python_full_version >= '3.6.1' +ptyprocess==0.7.0 +pydantic==1.8.2 +pyee==8.2.2 +pyfaidx==0.6.1 +pygments==2.10.0; python_version >= '3.5' +pyppeteer==0.2.6; python_version < '4' and python_full_version >= '3.6.1' pyquery==1.4.3 -pyrsistent==0.17.3; python_version >= '3.5' +pyrsistent==0.18.0; python_version >= '3.6' pysam==0.16.0.1 -python-dateutil==2.8.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +python-dateutil==2.8.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' python-jsonschema-objects==0.3.10 pyyaml==5.4.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' requests-html==0.10.0; python_version >= '3.6' -requests==2.25.1 -s3transfer==0.3.6 +requests==2.26.0 +s3transfer==0.5.0; python_version >= '3.6' seqrepo==0.0.0 -simplejson==3.17.2; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2, 3.3' -six==1.15.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +simplejson==3.17.3; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2, 3.3' +six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' soupsieve==2.2.1; python_version >= '3' sqlparse==0.4.1; python_version >= '3.5' -starlette==0.13.6; python_version >= '3.6' +starlette==0.14.2; python_version >= '3.6' tabulate==0.8.9 -tqdm==4.59.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -typing-extensions==3.7.4.3 -urllib3==1.26.4; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_full_version < '4.0.0' -uvicorn==0.13.4 -uvloop==0.15.2 +tqdm==4.62.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +traitlets==5.0.5; python_version >= '3.7' +typing-extensions==3.10.0.0 +urllib3==1.26.6; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4' +uvicorn==0.15.0 +uvloop==0.16.0 w3lib==1.22.0 -websockets==8.1 -yoyo-migrations==7.3.1 +wcwidth==0.2.5 +websockets==9.1 +yoyo-migrations==7.3.2 +zipp==3.5.0; python_version >= '3.6' From 5ca1f41b696c0dc3428d1d8865836d4e904203ef Mon Sep 17 00:00:00 2001 From: korikuzma Date: Mon, 16 Aug 2021 16:25:47 -0400 Subject: [PATCH 4/8] use diff path --- tests/unit/test_database_and_etl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_database_and_etl.py b/tests/unit/test_database_and_etl.py index 97789b9c..1bc900f9 100644 --- a/tests/unit/test_database_and_etl.py +++ b/tests/unit/test_database_and_etl.py @@ -65,8 +65,8 @@ def _get_aliases(sr, seqid): @pytest.fixture(scope='module') def etl_data_path(): """Create a test fixture to return etl data path.""" - project_root = Path().resolve().parents[1] - return project_root / 'tests' / 'unit' / 'data' / 'etl_data' + test_root = Path(__file__).resolve().parents[2] + return test_root / 'tests' / 'unit' / 'data' / 'etl_data' def test_tables_created(dynamodb): From 5b73b6e857ebb6917f1074a1fb0693ea1fc34234 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Mon, 16 Aug 2021 16:28:19 -0400 Subject: [PATCH 5/8] update docs --- gene/etl/base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gene/etl/base.py b/gene/etl/base.py index adda7334..8fa9f376 100644 --- a/gene/etl/base.py +++ b/gene/etl/base.py @@ -31,6 +31,7 @@ def __init__(self, database: Database, host: str, data_dir: str, :param Database database: DynamoDB database :param str host: Hostname of FTP site :param str data_dir: Data directory of FTP site to look at + :param Path seqrepo_dir: Path to seqrepo directory """ self._database = database self._host = host @@ -151,7 +152,11 @@ def _ftp_download(self, host: str, data_dir: str, fn: str, return version def get_seqrepo(self, seqrepo_dir) -> SeqRepo: - """Return SeqRepo instance.""" + """Return SeqRepo instance if seqrepo_dir exists. + + :param Path seqrepo_dir: Path to seqrepo directory + :return: SeqRepo instance + """ if not seqrepo_dir.exists(): raise NotADirectoryError(f"Could not find {seqrepo_dir}") return SeqRepo(seqrepo_dir) From 5b7a47035bc60e389850cc479f35df9567f88dd3 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Mon, 16 Aug 2021 16:41:49 -0400 Subject: [PATCH 6/8] forgot to include header in ncbi info --- tests/unit/data/etl_data/ncbi_info_20210813.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/data/etl_data/ncbi_info_20210813.tsv b/tests/unit/data/etl_data/ncbi_info_20210813.tsv index dcd32c97..aae1f466 100644 --- a/tests/unit/data/etl_data/ncbi_info_20210813.tsv +++ b/tests/unit/data/etl_data/ncbi_info_20210813.tsv @@ -1,3 +1,4 @@ +#tax_id GeneID Symbol LocusTag Synonyms dbXrefs chromosome map_location description type_of_gene Symbol_from_nomenclature_authority Full_name_from_nomenclature_authority Nomenclature_status Other_designations Modification_date Feature_type 9606 25 ABL1 - ABL|BCR-ABL|CHDSKM|JTK7|bcr/abl|c-ABL|c-ABL1|p150|v-abl MIM:189980|HGNC:HGNC:76|Ensembl:ENSG00000097007 9 9q34.12 ABL proto-oncogene 1, non-receptor tyrosine kinase protein-coding ABL1 ABL proto-oncogene 1, non-receptor tyrosine kinase O tyrosine-protein kinase ABL1|ABL protooncogene 1 nonreceptor tyrosine kinase|Abelson tyrosine-protein kinase 1|bcr/c-abl oncogene protein|c-abl oncogene 1, receptor tyrosine kinase|proto-oncogene c-Abl|proto-oncogene tyrosine-protein kinase ABL1|truncated ABL protooncogene 1 nonreceptor tyrosine kinase|v-abl Abelson murine leukemia viral oncogene homolog 1 20210809 - 9606 43 ACHE - ACEE|ARACHE|N-ACHE|YT MIM:100740|HGNC:HGNC:108|Ensembl:ENSG00000087085 7 7q22.1 acetylcholinesterase (Cartwright blood group) protein-coding ACHE acetylcholinesterase (Cartwright blood group) O acetylcholinesterase|Yt blood group|acetylcholinesterase (Yt blood group)|apoptosis-related acetylcholinesterase 20210719 - 9606 106 ADCP1 - - HGNC:HGNC:229 6 - adenosine deaminase complexing protein 1 unknown ADCP1 adenosine deaminase complexing protein 1 O - 20190324 - From b91bf2cf0c6d7e68a396262043b158e31415fd4e Mon Sep 17 00:00:00 2001 From: korikuzma Date: Mon, 16 Aug 2021 17:26:47 -0400 Subject: [PATCH 7/8] test etl methods --- gene/etl/ensembl.py | 9 +++--- gene/etl/hgnc.py | 10 +++---- gene/etl/ncbi.py | 3 +- gene/vrs_locations/sequence_location.py | 1 - tests/unit/test_database_and_etl.py | 37 ++++++++++++++++++------- 5 files changed, 37 insertions(+), 23 deletions(-) diff --git a/gene/etl/ensembl.py b/gene/etl/ensembl.py index 6ec7cdb6..9c7d5918 100644 --- a/gene/etl/ensembl.py +++ b/gene/etl/ensembl.py @@ -38,10 +38,9 @@ def __init__(self, self._data_file_url = None self._assembly = 'GRCh38' - def _download_data(self): + def _download_data(self, ens_dir=PROJECT_ROOT / 'data' / 'ensembl'): """Download Ensembl GFF3 data file.""" logger.info('Downloading Ensembl data file...') - ens_dir = PROJECT_ROOT / 'data' / 'ensembl' ens_dir.mkdir(exist_ok=True, parents=True) new_fn = f'ensembl_{self._version}.gff3' if not (ens_dir / new_fn).exists(): @@ -53,13 +52,13 @@ def _download_data(self): self._fn) logger.info('Successfully downloaded Ensembl data file.') - def _extract_data(self, *args, **kwargs): + def _extract_data(self, ens_dir=PROJECT_ROOT / 'data' / 'ensembl', + *args, **kwargs): """Extract data from the Ensembl source.""" if 'data_path' in kwargs: self._data_src = kwargs['data_path'] else: - ensembl_dir = PROJECT_ROOT / 'data' / 'ensembl' - self._data_src = sorted(list(ensembl_dir.iterdir()))[-1] + self._data_src = sorted(list(ens_dir.iterdir()))[-1] def _transform_data(self, *args, **kwargs): """Transform the Ensembl source.""" diff --git a/gene/etl/hgnc.py b/gene/etl/hgnc.py index 0e9fb74a..c10f6ce6 100644 --- a/gene/etl/hgnc.py +++ b/gene/etl/hgnc.py @@ -36,10 +36,10 @@ def __init__(self, self._fn = fn self._version = None - def _download_data(self, *args, **kwargs): + def _download_data(self, hgnc_data_dir=PROJECT_ROOT / 'data' / 'hgnc', + *args, **kwargs): """Download HGNC JSON data file.""" logger.info('Downloading HGNC data file...') - hgnc_data_dir = PROJECT_ROOT / 'data' / 'hgnc' hgnc_data_dir.mkdir(exist_ok=True, parents=True) tmp_fn = 'hgnc_version.json' self._version = \ @@ -49,13 +49,13 @@ def _download_data(self, *args, **kwargs): f"{hgnc_data_dir}/hgnc_{self._version}.json") logger.info('Successfully downloaded HGNC data file.') - def _extract_data(self, *args, **kwargs): + def _extract_data(self, hgnc_data_dir=PROJECT_ROOT / 'data' / 'hgnc', + *args, **kwargs): """Extract data from the HGNC source.""" if 'data_path' in kwargs: self._data_src = kwargs['data_path'] else: - hgnc_dir = PROJECT_ROOT / 'data' / 'hgnc' - self._data_src = sorted(list(hgnc_dir.iterdir()))[-1] + self._data_src = sorted(list(hgnc_data_dir.iterdir()))[-1] def _transform_data(self, *args, **kwargs): """Transform the HGNC source.""" diff --git a/gene/etl/ncbi.py b/gene/etl/ncbi.py index f0c8efb2..f03fa8d5 100644 --- a/gene/etl/ncbi.py +++ b/gene/etl/ncbi.py @@ -100,12 +100,11 @@ def _files_downloaded(self, data_dir: Path) -> bool: gff_downloaded = True return info_downloaded and history_downloaded and gff_downloaded - def _extract_data(self): + def _extract_data(self, local_data_dir=PROJECT_ROOT / 'data' / 'ncbi'): """Gather data from local files or download from source. - Data is expected to be in /data/ncbi. - For now, data files should all be from the same source data version. """ - local_data_dir = PROJECT_ROOT / 'data' / 'ncbi' local_data_dir.mkdir(exist_ok=True, parents=True) if not self._files_downloaded(local_data_dir): self._download_data(local_data_dir) diff --git a/gene/vrs_locations/sequence_location.py b/gene/vrs_locations/sequence_location.py index 4ad31709..49a6bb7d 100644 --- a/gene/vrs_locations/sequence_location.py +++ b/gene/vrs_locations/sequence_location.py @@ -1,6 +1,5 @@ """This module defines GA4GH Sequence Location.""" from typing import List - from ga4gh.vrs import models from ga4gh.core import ga4gh_identify import logging diff --git a/tests/unit/test_database_and_etl.py b/tests/unit/test_database_and_etl.py index 1bc900f9..cc7af053 100644 --- a/tests/unit/test_database_and_etl.py +++ b/tests/unit/test_database_and_etl.py @@ -7,6 +7,7 @@ from pathlib import Path from boto3.dynamodb.conditions import Key from mock import patch +import shutil ALIASES = { "NC_000001.11": ["ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO"], @@ -77,13 +78,18 @@ def test_tables_created(dynamodb): @patch.object(Ensembl, 'get_seqrepo') -def test_ensembl_transform(test_get_seqrepo, - processed_ids, dynamodb, - etl_data_path, is_test_env): - """Test ensembl transform method.""" +def test_ensembl_etl(test_get_seqrepo, processed_ids, dynamodb, etl_data_path, + is_test_env): + """Test that ensembl etl methods work correctly.""" if is_test_env: test_get_seqrepo.return_value = None e = Ensembl(dynamodb.db) + + tmp_dir = etl_data_path / 'ensembl' + e._download_data(tmp_dir) + e._extract_data(tmp_dir) + shutil.rmtree(tmp_dir) + e._sequence_location.get_aliases = _get_aliases e._data_src = etl_data_path / 'ensembl_104.gff3' e._transform_data() @@ -92,12 +98,18 @@ def test_ensembl_transform(test_get_seqrepo, @patch.object(HGNC, 'get_seqrepo') -def test_hgnc_transform(test_get_seqrepo, processed_ids, dynamodb, - etl_data_path, is_test_env): - """Test hgnc transform method.""" +def test_hgnc_etl(test_get_seqrepo, processed_ids, dynamodb, etl_data_path, + is_test_env): + """Test that hgnc etl methods work correctly.""" if is_test_env: test_get_seqrepo.return_value = None h = HGNC(dynamodb.db) + + tmp_dir = etl_data_path / 'hgnc' + h._download_data(tmp_dir) + h._extract_data(tmp_dir) + shutil.rmtree(tmp_dir) + h._data_src = etl_data_path / 'hgnc_20210810.json' h._version = '20210810' h._transform_data() @@ -106,12 +118,17 @@ def test_hgnc_transform(test_get_seqrepo, processed_ids, dynamodb, @patch.object(NCBI, 'get_seqrepo') -def test_ncbi_transform(test_get_seqrepo, processed_ids, dynamodb, - etl_data_path, is_test_env): - """Test ncbi transform method.""" +def test_ncbi_etl(test_get_seqrepo, processed_ids, dynamodb, etl_data_path, + is_test_env): + """Test that ncbi etl methods work correctly.""" if is_test_env: test_get_seqrepo.return_value = None n = NCBI(dynamodb.db) + + tmp_dir = etl_data_path / 'ncbi' + n._extract_data(tmp_dir) + shutil.rmtree(tmp_dir) + n._sequence_location.get_aliases = _get_aliases n._info_src = etl_data_path / 'ncbi_info_20210813.tsv' n._history_src = etl_data_path / 'ncbi_history_20210813.tsv' From 3ab1656c7e43c17bb2e546041850b581cb047ed8 Mon Sep 17 00:00:00 2001 From: korikuzma Date: Tue, 17 Aug 2021 09:54:41 -0400 Subject: [PATCH 8/8] clean up --- gene/etl/base.py | 7 +++++++ gene/etl/ensembl.py | 17 +++++++++-------- gene/etl/hgnc.py | 20 ++++++++++---------- gene/etl/ncbi.py | 14 ++++++++------ tests/unit/test_database_and_etl.py | 22 +++++++++++----------- 5 files changed, 45 insertions(+), 35 deletions(-) diff --git a/gene/etl/base.py b/gene/etl/base.py index 8fa9f376..d4dc77f2 100644 --- a/gene/etl/base.py +++ b/gene/etl/base.py @@ -24,6 +24,7 @@ class Base(ABC): """The ETL base class.""" def __init__(self, database: Database, host: str, data_dir: str, + src_data_dir: Path, seqrepo_dir=PROJECT_ROOT / 'data' / 'seqrepo' / 'latest', *args, **kwargs) -> None: """Instantiate Base class. @@ -31,11 +32,13 @@ def __init__(self, database: Database, host: str, data_dir: str, :param Database database: DynamoDB database :param str host: Hostname of FTP site :param str data_dir: Data directory of FTP site to look at + :param Path src_data_dir: Data directory for source :param Path seqrepo_dir: Path to seqrepo directory """ self._database = database self._host = host self._data_dir = data_dir + self.src_data_dir = src_data_dir self._processed_ids = list() self.seqrepo = self.get_seqrepo(seqrepo_dir) @@ -62,6 +65,10 @@ def _add_meta(self, *args, **kwargs) -> None: """Add source meta to DynamoDB table.""" raise NotImplementedError + def _create_data_directory(self): + """Create data directory for source.""" + self.src_data_dir.mkdir(exist_ok=True, parents=True) + def _load_meta(self, db, metadata, source_name) -> None: """Load source metadata into database. diff --git a/gene/etl/ensembl.py b/gene/etl/ensembl.py index 9c7d5918..9c6fb10f 100644 --- a/gene/etl/ensembl.py +++ b/gene/etl/ensembl.py @@ -19,6 +19,7 @@ def __init__(self, database: Database, host='ftp.ensembl.org', data_dir='pub/', + src_data_dir=PROJECT_ROOT / 'data' / 'ensembl', version='104' ): """Initialize Ensembl ETL class. @@ -26,9 +27,10 @@ def __init__(self, :param Database database: DynamoDB database :param str host: FTP host name :param str data_dir: FTP data directory to use + :param Path src_data_dir: Data directory for Ensembl :param int version: Version for fn """ - super().__init__(database, host, data_dir) + super().__init__(database, host, data_dir, src_data_dir) self._sequence_location = SequenceLocation() self._host = host self._data_dir = data_dir @@ -38,27 +40,26 @@ def __init__(self, self._data_file_url = None self._assembly = 'GRCh38' - def _download_data(self, ens_dir=PROJECT_ROOT / 'data' / 'ensembl'): + def _download_data(self): """Download Ensembl GFF3 data file.""" logger.info('Downloading Ensembl data file...') - ens_dir.mkdir(exist_ok=True, parents=True) + self._create_data_directory() new_fn = f'ensembl_{self._version}.gff3' - if not (ens_dir / new_fn).exists(): + if not (self.src_data_dir / new_fn).exists(): self._ftp_download(self._host, f'{self._data_dir}release-{self._version}' f'/gff3/homo_sapiens/', new_fn, - ens_dir, + self.src_data_dir, self._fn) logger.info('Successfully downloaded Ensembl data file.') - def _extract_data(self, ens_dir=PROJECT_ROOT / 'data' / 'ensembl', - *args, **kwargs): + def _extract_data(self, *args, **kwargs): """Extract data from the Ensembl source.""" if 'data_path' in kwargs: self._data_src = kwargs['data_path'] else: - self._data_src = sorted(list(ens_dir.iterdir()))[-1] + self._data_src = sorted(list(self.src_data_dir.iterdir()))[-1] def _transform_data(self, *args, **kwargs): """Transform the Ensembl source.""" diff --git a/gene/etl/hgnc.py b/gene/etl/hgnc.py index c10f6ce6..08cf735e 100644 --- a/gene/etl/hgnc.py +++ b/gene/etl/hgnc.py @@ -21,6 +21,7 @@ def __init__(self, database: Database, host='ftp.ebi.ac.uk', data_dir='pub/databases/genenames/hgnc/json/', + src_data_dir=PROJECT_ROOT / 'data' / 'hgnc', fn='hgnc_complete_set.json' ): """Initialize HGNC ETL class. @@ -28,34 +29,33 @@ def __init__(self, :param Database database: DynamoDB database :param str host: FTP host name :param str data_dir: FTP data directory to use + :param Path src_data_dir: Data directory for HGNC :param str fn: Data file to download """ - super().__init__(database, host, data_dir) + super().__init__(database, host, data_dir, src_data_dir) self._chromosome_location = ChromosomeLocation() self._data_url = f"ftp://{host}/{data_dir}{fn}" self._fn = fn self._version = None - def _download_data(self, hgnc_data_dir=PROJECT_ROOT / 'data' / 'hgnc', - *args, **kwargs): + def _download_data(self, *args, **kwargs): """Download HGNC JSON data file.""" logger.info('Downloading HGNC data file...') - hgnc_data_dir.mkdir(exist_ok=True, parents=True) + self._create_data_directory() tmp_fn = 'hgnc_version.json' self._version = \ self._ftp_download(self._host, self._data_dir, tmp_fn, - hgnc_data_dir, self._fn) - shutil.move(f"{hgnc_data_dir}/{tmp_fn}", - f"{hgnc_data_dir}/hgnc_{self._version}.json") + self.src_data_dir, self._fn) + shutil.move(f"{self.src_data_dir}/{tmp_fn}", + f"{self.src_data_dir}/hgnc_{self._version}.json") logger.info('Successfully downloaded HGNC data file.') - def _extract_data(self, hgnc_data_dir=PROJECT_ROOT / 'data' / 'hgnc', - *args, **kwargs): + def _extract_data(self, *args, **kwargs): """Extract data from the HGNC source.""" if 'data_path' in kwargs: self._data_src = kwargs['data_path'] else: - self._data_src = sorted(list(hgnc_data_dir.iterdir()))[-1] + self._data_src = sorted(list(self.src_data_dir.iterdir()))[-1] def _transform_data(self, *args, **kwargs): """Transform the HGNC source.""" diff --git a/gene/etl/ncbi.py b/gene/etl/ncbi.py index f03fa8d5..94336d60 100644 --- a/gene/etl/ncbi.py +++ b/gene/etl/ncbi.py @@ -24,15 +24,17 @@ def __init__(self, database: Database, host='ftp.ncbi.nlm.nih.gov', data_dir='gene/DATA/', + src_data_dir=PROJECT_ROOT / 'data' / 'ncbi', assembly: str = 'GRCh38.p13'): """Construct the NCBI ETL instance. :param Database database: gene database for adding new data :param str host: FTP host name :param str data_dir: FTP data directory to use + :param Path src_data_dir: Data directory for NCBI :param str assembly: The genome assembly """ - super().__init__(database, host, data_dir) + super().__init__(database, host, data_dir, src_data_dir) self._sequence_location = SequenceLocation() self._chromosome_location = ChromosomeLocation() self._data_url = f"ftp://{host}" @@ -100,15 +102,15 @@ def _files_downloaded(self, data_dir: Path) -> bool: gff_downloaded = True return info_downloaded and history_downloaded and gff_downloaded - def _extract_data(self, local_data_dir=PROJECT_ROOT / 'data' / 'ncbi'): + def _extract_data(self): """Gather data from local files or download from source. - Data is expected to be in /data/ncbi. - For now, data files should all be from the same source data version. """ - local_data_dir.mkdir(exist_ok=True, parents=True) - if not self._files_downloaded(local_data_dir): - self._download_data(local_data_dir) - local_files = [f for f in local_data_dir.iterdir() + self._create_data_directory() + if not self._files_downloaded(self.src_data_dir): + self._download_data(self.src_data_dir) + local_files = [f for f in self.src_data_dir.iterdir() if f.name.startswith('ncbi')] local_files.sort(key=lambda f: f.name.split('_')[-1], reverse=True) self._info_src = [f for f in local_files diff --git a/tests/unit/test_database_and_etl.py b/tests/unit/test_database_and_etl.py index cc7af053..9d8ef595 100644 --- a/tests/unit/test_database_and_etl.py +++ b/tests/unit/test_database_and_etl.py @@ -85,10 +85,10 @@ def test_ensembl_etl(test_get_seqrepo, processed_ids, dynamodb, etl_data_path, test_get_seqrepo.return_value = None e = Ensembl(dynamodb.db) - tmp_dir = etl_data_path / 'ensembl' - e._download_data(tmp_dir) - e._extract_data(tmp_dir) - shutil.rmtree(tmp_dir) + e.src_data_dir = etl_data_path / 'ensembl' + e._download_data() + e._extract_data() + shutil.rmtree(e.src_data_dir) e._sequence_location.get_aliases = _get_aliases e._data_src = etl_data_path / 'ensembl_104.gff3' @@ -105,10 +105,10 @@ def test_hgnc_etl(test_get_seqrepo, processed_ids, dynamodb, etl_data_path, test_get_seqrepo.return_value = None h = HGNC(dynamodb.db) - tmp_dir = etl_data_path / 'hgnc' - h._download_data(tmp_dir) - h._extract_data(tmp_dir) - shutil.rmtree(tmp_dir) + h.src_data_dir = etl_data_path / 'hgnc' + h._download_data() + h._extract_data() + shutil.rmtree(h.src_data_dir) h._data_src = etl_data_path / 'hgnc_20210810.json' h._version = '20210810' @@ -125,9 +125,9 @@ def test_ncbi_etl(test_get_seqrepo, processed_ids, dynamodb, etl_data_path, test_get_seqrepo.return_value = None n = NCBI(dynamodb.db) - tmp_dir = etl_data_path / 'ncbi' - n._extract_data(tmp_dir) - shutil.rmtree(tmp_dir) + n.src_data_dir = etl_data_path / 'ncbi' + n._extract_data() + shutil.rmtree(n.src_data_dir) n._sequence_location.get_aliases = _get_aliases n._info_src = etl_data_path / 'ncbi_info_20210813.tsv'