diff --git a/docs/source/index.rst b/docs/source/index.rst
index 7c6e5926..d64bbe4d 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -12,7 +12,7 @@ Gene Normalizer |version|
:alt: citation
:target: https://zenodo.org/badge/latestdoi/309797998
-The Gene Normalizer provides tools for resolving ambiguous human gene references to consistently-structured, normalized terms. For gene concepts extracted from `NCBI Gene `_, `Ensembl `_, and `HGNC `_, it designates a `CURIE `_, and provides additional metadata like current and previously-used symbols, aliases, database cross-references and associations, and coordinates.
+The Gene Normalizer provides tools for resolving ambiguous human gene references to consistently-structured, normalized terms. For gene concepts extracted from `NCBI Gene `_, `Ensembl `_, and `HGNC `_, it designates a `CURIE `_, and provides additional metadata like current and previously-used symbols, aliases, database cross-references, and coordinates.
A `public REST instance of the service `_ is available for programmatic queries:
diff --git a/docs/source/normalizing_data/sources.rst b/docs/source/normalizing_data/sources.rst
index 591e582c..0de9cf57 100644
--- a/docs/source/normalizing_data/sources.rst
+++ b/docs/source/normalizing_data/sources.rst
@@ -33,9 +33,7 @@ HGNC
"previous_symbols": [],
"xrefs": [
"ensembl:ENSG00000157764",
- "ncbigene:673"
- ],
- "associated_with": [
+ "ncbigene:673",
"uniprot:P15056",
"pubmed:2284096",
"omim:164757",
@@ -99,7 +97,6 @@ Ensembl
"xrefs": [
"hgnc:1097"
],
- "associated_with": [],
"gene_type": "protein_coding",
"match_type": 100
}
@@ -143,9 +140,7 @@ The `NCBI Gene Database `_ is a service prov
"previous_symbols": [],
"xrefs": [
"ensembl:ENSG00000157764",
- "hgnc:1097"
- ],
- "associated_with": [
+ "hgnc:1097",
"omim:164757"
],
"gene_type": "protein-coding",
diff --git a/src/gene/database/dynamodb.py b/src/gene/database/dynamodb.py
index 6f7b0ee7..629059c8 100644
--- a/src/gene/database/dynamodb.py
+++ b/src/gene/database/dynamodb.py
@@ -434,8 +434,7 @@ def _add_ref_record(
:param str term: referent term
:param str concept_id: concept ID to refer to
- :param str ref_type: one of {'alias', 'label', 'xref',
- 'associated_with'}
+ :param str ref_type: one of {'alias', 'label', 'xref'}
:param src_name: name of source for record
"""
label_and_type = f"{term.lower()}##{ref_type}"
diff --git a/src/gene/database/postgresql.py b/src/gene/database/postgresql.py
index 6638645c..66a43132 100644
--- a/src/gene/database/postgresql.py
+++ b/src/gene/database/postgresql.py
@@ -97,7 +97,6 @@ def list_tables(self) -> List[str]:
_drop_db_query = b"""
DROP MATERIALIZED VIEW IF EXISTS record_lookup_view;
DROP TABLE IF EXISTS
- gene_associations,
gene_symbols,
gene_previous_symbols,
gene_aliases,
@@ -324,12 +323,11 @@ def _format_source_record(self, source_row: Tuple) -> Dict:
"locations": source_row[5],
"gene_type": source_row[6],
"aliases": source_row[7],
- "associated_with": source_row[8],
- "previous_symbols": source_row[9],
- "symbol": source_row[10],
- "xrefs": source_row[11],
- "src_name": source_row[12],
- "merge_ref": source_row[13],
+ "previous_symbols": source_row[8],
+ "symbol": source_row[9],
+ "xrefs": source_row[10],
+ "src_name": source_row[11],
+ "merge_ref": source_row[12],
"item_type": RecordType.IDENTITY.value,
}
return {k: v for k, v in gene_record.items() if v}
@@ -373,8 +371,7 @@ def _format_merged_record(self, merged_row: Tuple) -> Dict:
"hgnc_locus_type": merged_row[11],
"ncbi_gene_type": merged_row[12],
"aliases": merged_row[13],
- "associated_with": merged_row[14],
- "xrefs": merged_row[15],
+ "xrefs": merged_row[14],
"item_type": RecordType.MERGER.value,
}
return {k: v for k, v in merged_record.items() if v}
@@ -421,7 +418,6 @@ def get_record_by_id(
RefType.PREVIOUS_SYMBOLS: b"SELECT concept_id FROM gene_previous_symbols WHERE lower(prev_symbol) = %s;", # noqa: E501
RefType.ALIASES: b"SELECT concept_id FROM gene_aliases WHERE lower(alias) = %s;", # noqa: E501
RefType.XREFS: b"SELECT concept_id FROM gene_xrefs WHERE lower(xref) = %s;",
- RefType.ASSOCIATED_WITH: b"SELECT concept_id FROM gene_associations WHERE lower(associated_with) = %s;", # noqa: E501
}
def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]:
@@ -558,9 +554,6 @@ def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None:
)
_ins_alias_query = b"INSERT INTO gene_aliases (alias, concept_id) VALUES (%s, %s);"
_ins_xref_query = b"INSERT INTO gene_xrefs (xref, concept_id) VALUES (%s, %s);"
- _ins_assoc_query = (
- b"INSERT INTO gene_associations (associated_with, concept_id) VALUES (%s, %s);"
- )
def add_record(self, record: Dict, src_name: SourceName) -> None:
"""Add new record to database.
@@ -591,8 +584,6 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
cur.execute(self._ins_alias_query, [a, concept_id])
for x in record.get("xrefs", []):
cur.execute(self._ins_xref_query, [x, concept_id])
- for a in record.get("associated_with", []):
- cur.execute(self._ins_assoc_query, [a, concept_id])
for p in record.get("previous_symbols", []):
cur.execute(self._ins_prev_symbol_query, [p, concept_id])
if record.get("symbol"):
@@ -606,10 +597,9 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
INSERT INTO gene_merged (
concept_id, symbol, symbol_status, previous_symbols, label, strand,
location_annotations, ensembl_locations, hgnc_locations, ncbi_locations,
- hgnc_locus_type, ensembl_biotype, ncbi_gene_type, aliases, associated_with,
- xrefs
+ hgnc_locus_type, ensembl_biotype, ncbi_gene_type, aliases, xrefs
)
- VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+ VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
"""
def add_merged_record(self, record: Dict) -> None:
@@ -644,7 +634,6 @@ def add_merged_record(self, record: Dict) -> None:
record.get("ensembl_biotype"),
record.get("ncbi_gene_type"),
record.get("aliases"),
- record.get("associated_with"),
record.get("xrefs"),
],
)
@@ -702,13 +691,6 @@ def delete_normalized_concepts(self) -> None:
WHERE gc.source = %s
);
"""
- _drop_associations_query = b"""
- DELETE FROM gene_associations WHERE id IN (
- SELECT ga.id FROM gene_associations ga LEFT JOIN gene_concepts gc
- ON gc.concept_id = ga.concept_id
- WHERE gc.source = %s
- );
- """
_drop_prev_symbols_query = b"""
DELETE FROM gene_previous_symbols WHERE id IN (
SELECT gps.id FROM gene_previous_symbols gps LEFT JOIN gene_concepts gc
@@ -750,7 +732,6 @@ def delete_source(self, src_name: SourceName) -> None:
"""
with self.conn.cursor() as cur:
cur.execute(self._drop_aliases_query, [src_name.value])
- cur.execute(self._drop_associations_query, [src_name.value])
cur.execute(self._drop_prev_symbols_query, [src_name.value])
cur.execute(self._drop_symbols_query, [src_name.value])
cur.execute(self._drop_xrefs_query, [src_name.value])
diff --git a/src/gene/database/postgresql/add_fkeys.sql b/src/gene/database/postgresql/add_fkeys.sql
index f93459b3..28e1a88f 100644
--- a/src/gene/database/postgresql/add_fkeys.sql
+++ b/src/gene/database/postgresql/add_fkeys.sql
@@ -1,7 +1,5 @@
ALTER TABLE gene_aliases ADD CONSTRAINT gene_aliases_concept_id_fkey
FOREIGN KEY (concept_id) REFERENCES gene_concepts (concept_id);
-ALTER TABLE gene_associations ADD CONSTRAINT gene_associations_concept_id_fkey
- FOREIGN KEY (concept_id) REFERENCES gene_concepts (concept_id);
ALTER TABLE gene_previous_symbols
ADD CONSTRAINT gene_previous_symbols_concept_id_fkey
FOREIGN KEY (concept_id) REFERENCES gene_concepts (concept_id);
diff --git a/src/gene/database/postgresql/add_indexes.sql b/src/gene/database/postgresql/add_indexes.sql
index b96df534..805ad71b 100644
--- a/src/gene/database/postgresql/add_indexes.sql
+++ b/src/gene/database/postgresql/add_indexes.sql
@@ -7,7 +7,5 @@ CREATE INDEX idx_gps_symbol_low
ON gene_previous_symbols (lower(prev_symbol));
CREATE INDEX idx_ga_alias_low ON gene_aliases (lower(alias));
CREATE INDEX idx_gx_xref_low ON gene_xrefs (lower(xref));
-CREATE INDEX idx_g_as_association_low
- ON gene_associations (lower(associated_with));
CREATE INDEX idx_rlv_concept_id_low
ON record_lookup_view (lower(concept_id));
diff --git a/src/gene/database/postgresql/create_record_lookup_view.sql b/src/gene/database/postgresql/create_record_lookup_view.sql
index 1e33977f..7474a07a 100644
--- a/src/gene/database/postgresql/create_record_lookup_view.sql
+++ b/src/gene/database/postgresql/create_record_lookup_view.sql
@@ -7,7 +7,6 @@ SELECT gc.concept_id,
gc.locations,
gc.gene_type,
ga.aliases,
- gas.associated_with,
gps.previous_symbols,
gs.symbol,
gx.xrefs,
@@ -20,11 +19,6 @@ FULL JOIN (
FROM gene_aliases ga_1
GROUP BY ga_1.concept_id
) ga ON gc.concept_id::text = ga.concept_id::text
-FULL JOIN (
- SELECT gas_1.concept_id, array_agg(gas_1.associated_with) AS associated_with
- FROM gene_associations gas_1
- GROUP BY gas_1.concept_id
-) gas ON gc.concept_id::text = gas.concept_id::text
FULL JOIN (
SELECT gps_1.concept_id, array_agg(gps_1.prev_symbol) AS previous_symbols
FROM gene_previous_symbols gps_1
diff --git a/src/gene/database/postgresql/create_tables.sql b/src/gene/database/postgresql/create_tables.sql
index 83198199..9100e553 100644
--- a/src/gene/database/postgresql/create_tables.sql
+++ b/src/gene/database/postgresql/create_tables.sql
@@ -26,7 +26,6 @@ CREATE TABLE gene_merged (
hgnc_locus_type TEXT [],
ncbi_gene_type TEXT [],
aliases TEXT [],
- associated_with TEXT [],
xrefs TEXT []
);
CREATE TABLE gene_concepts (
@@ -60,8 +59,3 @@ CREATE TABLE gene_xrefs (
xref TEXT NOT NULL,
concept_id VARCHAR(127) NOT NULL REFERENCES gene_concepts (concept_id)
);
-CREATE TABLE gene_associations (
- id SERIAL PRIMARY KEY,
- associated_with TEXT NOT NULL,
- concept_ID VARCHAR(127) NOT NULL REFERENCES gene_concepts (concept_id)
-);
diff --git a/src/gene/database/postgresql/delete_normalized_concepts.sql b/src/gene/database/postgresql/delete_normalized_concepts.sql
index 5141c841..e5e1bdce 100644
--- a/src/gene/database/postgresql/delete_normalized_concepts.sql
+++ b/src/gene/database/postgresql/delete_normalized_concepts.sql
@@ -19,7 +19,6 @@ CREATE TABLE gene_merged (
hgnc_locus_type TEXT [],
ncbi_gene_type TEXT [],
aliases TEXT [],
- associated_with TEXT [],
xrefs TEXT []
);
ALTER TABLE gene_concepts ADD CONSTRAINT gene_concepts_merge_ref_fkey
diff --git a/src/gene/database/postgresql/drop_fkeys.sql b/src/gene/database/postgresql/drop_fkeys.sql
index f804ca1e..ba2aeef5 100644
--- a/src/gene/database/postgresql/drop_fkeys.sql
+++ b/src/gene/database/postgresql/drop_fkeys.sql
@@ -1,5 +1,4 @@
ALTER TABLE gene_aliases DROP CONSTRAINT gene_aliases_concept_id_fkey;
-ALTER TABLE gene_associations DROP CONSTRAINT gene_associations_concept_id_fkey;
ALTER TABLE gene_previous_symbols
DROP CONSTRAINT gene_previous_symbols_concept_id_fkey;
ALTER TABLE gene_symbols DROP CONSTRAINT gene_symbols_concept_id_fkey;
diff --git a/src/gene/database/postgresql/drop_indexes.sql b/src/gene/database/postgresql/drop_indexes.sql
index 7c9743d0..dd9156dc 100644
--- a/src/gene/database/postgresql/drop_indexes.sql
+++ b/src/gene/database/postgresql/drop_indexes.sql
@@ -4,5 +4,4 @@ DROP INDEX IF EXISTS idx_gs_symbol_low;
DROP INDEX IF EXISTS idx_gps_symbol_low;
DROP INDEX IF EXISTS idx_gx_xref_low;
DROP INDEX IF EXISTS idx_ga_alias_low;
-DROP INDEX IF EXISTS idx_g_as_association_low;
DROP INDEX IF EXISTS idx_rlv_concept_id_low;
diff --git a/src/gene/etl/ensembl.py b/src/gene/etl/ensembl.py
index bb590047..aef9d388 100644
--- a/src/gene/etl/ensembl.py
+++ b/src/gene/etl/ensembl.py
@@ -1,7 +1,7 @@
"""Defines the Ensembl ETL methods."""
import logging
import re
-from typing import Dict
+from typing import Dict, Optional
import gffutils
from gffutils.feature import Feature
@@ -90,73 +90,68 @@ def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict:
return gene_params
def _add_attributes(self, f: Feature, gene: Dict) -> None:
- """Add concept_id, symbol, xrefs, and associated_with to a gene record.
+        """Add concept_id, symbol, label, gene_type, and xrefs to a gene record.
:param f: A gene from the data
:param gene: A transformed gene record
"""
- attributes = {
+        attributes_map = {
"ID": "concept_id",
"Name": "symbol",
"description": "xrefs",
"biotype": "gene_type",
}
- for attribute in f.attributes.items():
- key = attribute[0]
-
- if key in attributes.keys():
- val = attribute[1]
-
- if len(val) == 1:
- val = val[0]
- if key == "ID":
- if val.startswith("gene"):
- val = (
- f"{NamespacePrefix.ENSEMBL.value}:"
- f"{val.split(':')[1]}"
- )
-
- if key == "description":
- gene["label"] = val.split("[")[0].strip()
- if "Source:" in val:
- src_name = (
- val.split("[")[-1]
- .split("Source:")[-1]
- .split("Acc")[0]
- .split(";")[0]
- )
- src_id = val.split("Acc:")[-1].split("]")[0]
- if ":" in src_id:
- src_id = src_id.split(":")[-1]
- source = self._get_xref_associated_with(src_name, src_id)
- if "xrefs" in source:
- gene["xrefs"] = source["xrefs"]
- elif "associated_with" in source:
- gene["associated_with"] = source["associated_with"]
- continue
-
- gene[attributes[key]] = val
-
- def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict:
- """Get xref or associated_with concept.
+        for key, value in f.attributes.items():
+            if key not in attributes_map:
+                continue
+
+            # Attribute values arrive as lists; unwrap single-item lists so that
+            # scalar fields (symbol, gene_type, concept_id) are not stored as lists.
+            val = value[0] if len(value) == 1 else value
+
+            if key == "ID":
+                if isinstance(val, str) and val.startswith("gene"):
+                    val = f"{NamespacePrefix.ENSEMBL.value}:{val.split(':')[1]}"
+                gene["concept_id"] = val
+            elif key == "description":
+                description = value[0]
+                # Always keep the label, even when no source/accession is given.
+                gene["label"] = description.split("[")[0].strip()
+                if "Source:" in description:
+                    src_name = (
+                        description.split("[")[-1]
+                        .split("Source:")[-1]
+                        .split("Acc")[0]
+                        .split(";")[0]
+                    )
+                    # Accessions may be bare ("MI0006387") or prefixed ("HGNC:1100").
+                    accession = description.split("Acc:")[-1].split("]")[0]
+                    src_id = accession.split(":")[-1]
+                    xref = self._get_xref(src_name, src_id)
+                    if xref:
+                        gene["xrefs"] = [xref]
+            else:
+                gene[attributes_map[key]] = val
+
+ def _get_xref(self, src_name: str, src_id: str) -> Optional[str]:
+ """Get xref.
:param src_name: Source name
:param src_id: The source's accession number
- :return: A dict containing an other identifier or xref
+ :return: xref, if successfully parsed
"""
- source = dict()
- if src_name.startswith("HGNC"):
- source["xrefs"] = [f"{NamespacePrefix.HGNC.value}:{src_id}"]
- elif src_name.startswith("NCBI"):
- source["xrefs"] = [f"{NamespacePrefix.NCBI.value}:{src_id}"]
- elif src_name.startswith("UniProt"):
- source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"]
- elif src_name.startswith("miRBase"):
- source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"]
- elif src_name.startswith("RFAM"):
- source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"]
- return source
+ for prefix, constrained_prefix in (
+ ("HGNC", NamespacePrefix.HGNC),
+ ("NCBI", NamespacePrefix.NCBI),
+ ("UniProt", NamespacePrefix.UNIPROT),
+ ("miRBase", NamespacePrefix.MIRBASE),
+ ("RFAM", NamespacePrefix.RFAM),
+ ):
+ if src_name.startswith(prefix):
+ return f"{constrained_prefix.value}:{src_id}"
+ _logger.warning("Unrecognized source name: %s:%s", src_name, src_id)
+ return None
def _add_meta(self) -> None:
"""Add Ensembl metadata.
diff --git a/src/gene/etl/hgnc.py b/src/gene/etl/hgnc.py
index 1f060935..805fbe37 100644
--- a/src/gene/etl/hgnc.py
+++ b/src/gene/etl/hgnc.py
@@ -6,7 +6,6 @@
from gene.etl.base import Base, GeneNormalizerEtlError
from gene.schemas import (
- PREFIX_LOOKUP,
Annotation,
Chromosome,
DataLicenseAttributes,
@@ -42,9 +41,9 @@ def _transform_data(self) -> None:
elif r["status"] == "Entry Withdrawn":
gene["symbol_status"] = SymbolStatus.WITHDRAWN.value
- # store alias, xref, associated_with, prev_symbols, location
+ # store alias, xref, prev_symbols, location
self._get_aliases(r, gene)
- self._get_xrefs_associated_with(r, gene)
+ self._get_xrefs(r, gene)
if "prev_symbol" in r:
self._get_previous_symbols(r, gene)
if "location" in r:
@@ -81,14 +80,13 @@ def _get_previous_symbols(self, r: Dict, gene: Dict) -> None:
if prev_symbols:
gene["previous_symbols"] = list(set(prev_symbols))
- def _get_xrefs_associated_with(self, record: Dict, gene: Dict) -> None:
- """Store xrefs and/or associated_with refs in a gene record.
+ def _get_xrefs(self, record: Dict, gene: Dict) -> None:
+ """Store xrefs in a gene record.
:param record: A gene record in the HGNC data file
:param gene: A transformed gene record
"""
xrefs = list()
- associated_with = list()
sources = [
"entrez_id",
"ensembl_gene_id",
@@ -128,37 +126,28 @@ def _get_xrefs_associated_with(self, record: Dict, gene: Dict) -> None:
key = src
if key.upper() in NamespacePrefix.__members__:
- if NamespacePrefix[key.upper()].value in PREFIX_LOOKUP.keys():
- self._get_xref_associated_with(key, src, record, xrefs)
- else:
- self._get_xref_associated_with(
- key, src, record, associated_with
- )
+ self._get_xref(key, src, record, xrefs)
else:
_logger.warning(f"{key} not in schemas.py")
if xrefs:
gene["xrefs"] = xrefs
- if associated_with:
- gene["associated_with"] = associated_with
- def _get_xref_associated_with(
- self, key: str, src: str, r: Dict, src_type: List[str]
- ) -> None:
- """Add an xref or associated_with ref to a gene record.
+ def _get_xref(self, key: str, src: str, r: Dict, xrefs: List[str]) -> None:
+ """Add an xref to a gene record.
:param key: The source's name
:param src: HGNC's source field
:param r: A gene record in the HGNC data file
- :param src_type: Either xrefs or associated_with list
+ :param xrefs: xrefs list
"""
if isinstance(r[src], list):
for xref in r[src]:
- src_type.append(f"{NamespacePrefix[key.upper()].value}:{xref}")
+ xrefs.append(f"{NamespacePrefix[key.upper()].value}:{xref}")
else:
if isinstance(r[src], str) and ":" in r[src]:
r[src] = r[src].split(":")[-1].strip()
- src_type.append(f"{NamespacePrefix[key.upper()].value}" f":{r[src]}")
+ xrefs.append(f"{NamespacePrefix[key.upper()].value}" f":{r[src]}")
def _get_location(self, r: Dict, gene: Dict) -> None:
"""Store GA4GH VRS ChromosomeLocation in a gene record.
diff --git a/src/gene/etl/merge.py b/src/gene/etl/merge.py
index d065be73..57ca439c 100644
--- a/src/gene/etl/merge.py
+++ b/src/gene/etl/merge.py
@@ -5,7 +5,7 @@
from gene.database import AbstractDatabase
from gene.database.database import DatabaseWriteError
-from gene.schemas import GeneTypeFieldName, RecordType, SourcePriority
+from gene.schemas import GeneTypeFieldName, NamespacePrefix, RecordType, SourcePriority
_logger = logging.getLogger(__name__)
@@ -98,7 +98,14 @@ def _create_record_id_set(
if not record_xrefs:
return observed_id_set | {db_record["concept_id"]}
else:
- local_id_set = set(record_xrefs)
+ local_id_set = set()
+ for xref in record_xrefs:
+ if (
+ xref.startswith(NamespacePrefix.NCBI.value)
+ or xref.startswith(NamespacePrefix.ENSEMBL.value)
+ or xref.startswith(NamespacePrefix.HGNC.value)
+ ):
+ local_id_set.add(xref)
merged_id_set = {record_id} | observed_id_set
for local_record_id in local_id_set - observed_id_set:
merged_id_set |= self._create_record_id_set(
@@ -145,7 +152,6 @@ def record_order(record: Dict) -> Tuple:
merged_attrs = {
"concept_id": records[0]["concept_id"],
"aliases": set(),
- "associated_with": set(),
"previous_symbols": set(),
"hgnc_locus_type": set(),
"ncbi_gene_type": set(),
@@ -156,7 +162,7 @@ def record_order(record: Dict) -> Tuple:
merged_attrs["xrefs"] = list({r["concept_id"] for r in records[1:]})
# merge from constituent records
- set_fields = ["aliases", "associated_with", "previous_symbols", "strand"]
+ set_fields = ["aliases", "previous_symbols", "strand"]
scalar_fields = ["symbol", "symbol_status", "label", "location_annotations"]
for record in records:
for field in set_fields:
diff --git a/src/gene/etl/ncbi.py b/src/gene/etl/ncbi.py
index 427d57e6..a5954bbe 100644
--- a/src/gene/etl/ncbi.py
+++ b/src/gene/etl/ncbi.py
@@ -98,14 +98,13 @@ def _get_prev_symbols(self) -> Dict[str, str]:
history_file.close()
return prev_symbols
- def _add_xrefs_associated_with(self, val: List[str], params: Dict) -> None:
- """Add xrefs and associated_with refs to a transformed gene.
+ def _add_xrefs(self, val: List[str], params: Dict) -> None:
+ """Add xrefs to a transformed gene.
:param val: A list of source ids for a given gene
:param params: A transformed gene record
"""
params["xrefs"] = []
- params["associated_with"] = []
for src in val:
src_name = src.split(":")[0].upper()
src_id = src.split(":")[-1]
@@ -125,16 +124,12 @@ def _add_xrefs_associated_with(self, val: List[str], params: Dict) -> None:
prefix = NamespacePrefix.IMGT_GENE_DB.value
elif src_name.startswith("MIRBASE"):
prefix = NamespacePrefix.MIRBASE.value
- else:
- prefix = None
- if prefix:
- params["associated_with"].append(f"{prefix}:{src_id}")
else:
_logger.info(f"{src_name} is not in NameSpacePrefix.")
+ continue
+ params["xrefs"].append(f"{prefix}:{src_id}")
if not params["xrefs"]:
del params["xrefs"]
- if not params["associated_with"]:
- del params["associated_with"]
def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, str]:
"""Store genes from NCBI info file.
@@ -158,10 +153,10 @@ def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, str]:
params["aliases"] = row[4].split("|")
else:
params["aliases"] = []
- # get associated_with
+ # get xrefs
if row[5] != "-":
- associated_with = row[5].split("|")
- self._add_xrefs_associated_with(associated_with, params)
+ xrefs = row[5].split("|")
+ self._add_xrefs(xrefs, params)
# get chromosome location
vrs_chr_location = self._get_vrs_chr_location(row, params)
if "exclude" in vrs_chr_location:
@@ -223,7 +218,7 @@ def _add_gff_gene(
return params
def _add_attributes(self, f: gffutils.feature.Feature, gene: Dict) -> None:
- """Add concept_id, symbol, and xrefs/associated_with to a gene record.
+ """Add concept_id, symbol, and xrefs to a gene record.
:param gffutils.feature.Feature f: A gene from the data
:param gene: A transformed gene record
@@ -239,7 +234,7 @@ def _add_attributes(self, f: gffutils.feature.Feature, gene: Dict) -> None:
val = val[0]
if key == "Dbxref":
- self._add_xrefs_associated_with(val, gene)
+ self._add_xrefs(val, gene)
elif key == "Name":
gene["symbol"] = val
@@ -258,25 +253,24 @@ def _get_vrs_sq_location(
params["strand"] = gene.strand
return self._build_sequence_location(gene.seqid, gene, params["concept_id"])
- def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict:
- """Get xref or associated_with ref.
+ def _get_xref(self, src_name: str, src_id: str) -> Dict:
+ """Get xref.
:param src_name: Source name
:param src_id: The source's accession number
- :return: A dict containing an xref or associated_with ref
+ :return: A dict containing an xref
"""
- source = dict()
- if src_name.startswith("HGNC"):
- source["xrefs"] = [f"{NamespacePrefix.HGNC.value}:{src_id}"]
- elif src_name.startswith("NCBI"):
- source["xrefs"] = [f"{NamespacePrefix.NCBI.value}:{src_id}"]
- elif src_name.startswith("UniProt"):
- source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"]
- elif src_name.startswith("miRBase"):
- source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"]
- elif src_name.startswith("RFAM"):
- source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"]
- return source
+ for prefix, constrained_prefix in (
+ ("HGNC", NamespacePrefix.HGNC),
+ ("NCBI", NamespacePrefix.NCBI), # ?
+ ("UniProt", NamespacePrefix.UNIPROT),
+ ("miRBase", NamespacePrefix.MIRBASE),
+ ("RFAM", NamespacePrefix.RFAM),
+ ):
+ if src_name.startswith(prefix):
+ return {"xrefs": [f"{constrained_prefix.value}:{src_id}"]}
+ _logger.warning("Unrecognized source name: %s:%s", src_name, src_id)
+ return {}
def _get_vrs_chr_location(self, row: List[str], params: Dict) -> List:
"""Store GA4GH VRS ChromosomeLocation in a gene record.
diff --git a/src/gene/query.py b/src/gene/query.py
index 8c100446..0a57be43 100644
--- a/src/gene/query.py
+++ b/src/gene/query.py
@@ -375,9 +375,8 @@ def _add_gene(
)
# mappings
- source_ids = record.get("xrefs", []) + record.get("associated_with", [])
mappings = []
- for source_id in source_ids:
+ for source_id in record.get("xrefs", []):
system, code = source_id.split(":")
mappings.append(
core_models.Mapping(
diff --git a/src/gene/schemas.py b/src/gene/schemas.py
index d2c17d84..96b6fc4b 100644
--- a/src/gene/schemas.py
+++ b/src/gene/schemas.py
@@ -58,7 +58,6 @@ class MatchType(IntEnum):
PREV_SYMBOL = 80
ALIAS = 60
XREF = 60
- ASSOCIATED_WITH = 60
FUZZY_MATCH = 20
NO_MATCH = 0
@@ -102,7 +101,6 @@ class BaseGene(BaseModel):
aliases: List[StrictStr] = []
previous_symbols: List[StrictStr] = []
xrefs: List[CURIE] = []
- associated_with: List[CURIE] = []
gene_type: Optional[StrictStr] = None
@@ -242,7 +240,6 @@ class RefType(str, Enum):
PREVIOUS_SYMBOLS = "prev_symbol"
ALIASES = "alias"
XREFS = "xref"
- ASSOCIATED_WITH = "associated_with"
# collective name to singular name, e.g. {"previous_symbols": "prev_symbol"}
@@ -561,8 +558,9 @@ class UnmergedNormalizationService(BaseNormalizationService):
],
"aliases": ["3.1.1.7"],
"previous_symbols": ["YT"],
- "xrefs": ["ncbigene:43", "ensembl:ENSG00000087085"],
- "associated_with": [
+ "xrefs": [
+ "ncbigene:43",
+ "ensembl:ENSG00000087085",
"ucsc:uc003uxi.4",
"vega:OTTHUMG00000157033",
"merops:S09.979",
@@ -671,8 +669,6 @@ class UnmergedNormalizationService(BaseNormalizationService):
"xrefs": [
"hgnc:108",
"ensembl:ENSG00000087085",
- ],
- "associated_with": [
"omim:100740",
],
"gene_type": "protein-coding",
diff --git a/tests/conftest.py b/tests/conftest.py
index ad1a14a2..923d71ac 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -41,7 +41,6 @@ def _compare_records(normalized_gene, test_gene, match_type):
assert set(normalized_gene.xrefs) == set(test_gene.xrefs)
assert normalized_gene.symbol_status == test_gene.symbol_status
assert set(normalized_gene.previous_symbols) == set(test_gene.previous_symbols)
- assert set(normalized_gene.associated_with) == set(test_gene.associated_with)
assert normalized_gene.symbol == test_gene.symbol
assert len(normalized_gene.locations) == len(test_gene.locations)
for loc in normalized_gene.locations:
diff --git a/tests/unit/test_database_and_etl.py b/tests/unit/test_database_and_etl.py
index 092cc6c3..62a4154e 100644
--- a/tests/unit/test_database_and_etl.py
+++ b/tests/unit/test_database_and_etl.py
@@ -76,7 +76,6 @@ def test_tables_created(db_fixture):
existing_tables = db_fixture.db.list_tables()
if db_fixture.db_name == "PostgresDatabase":
assert set(existing_tables) == {
- "gene_associations",
"gene_symbols",
"gene_previous_symbols",
"gene_aliases",
@@ -150,11 +149,6 @@ def test_item_type(db_fixture):
assert "item_type" in item
assert item["item_type"] == "alias"
- filter_exp = Key("label_and_type").eq("omim:606689##associated_with")
- item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0]
- assert "item_type" in item
- assert item["item_type"] == "associated_with"
-
filter_exp = Key("label_and_type").eq("ensembl:ensg00000268895##xref")
item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0]
assert "item_type" in item
diff --git a/tests/unit/test_ensembl_source.py b/tests/unit/test_ensembl_source.py
index 1ab55430..e9980579 100644
--- a/tests/unit/test_ensembl_source.py
+++ b/tests/unit/test_ensembl_source.py
@@ -47,7 +47,6 @@ def ddx11l1():
}
],
"strand": "+",
- "associated_with": [],
"gene_type": "transcribed_unprocessed_pseudogene",
}
return Gene(**params)
@@ -79,7 +78,6 @@ def tp53():
}
],
"strand": "-",
- "associated_with": [],
"gene_type": "protein_coding",
}
return Gene(**params)
@@ -111,7 +109,6 @@ def ATP6AP1_DT(): # noqa: N802
}
],
"strand": "-",
- "associated_with": [],
"gene_type": "lncRNA",
}
return Gene(**params)
@@ -127,7 +124,6 @@ def hsa_mir_1253():
"label": "hsa-mir-1253",
"previous_symbols": [],
"aliases": [],
- "xrefs": [],
"symbol_status": None,
"location_annotations": [],
"locations": [
@@ -143,7 +139,7 @@ def hsa_mir_1253():
}
],
"strand": "+",
- "associated_with": ["mirbase:MI0006387"],
+ "xrefs": ["mirbase:MI0006387"],
"gene_type": "lncRNA",
}
return Gene(**params)
@@ -175,7 +171,6 @@ def spry3():
}
],
"strand": "+",
- "associated_with": [],
"gene_type": "protein_coding",
}
return Gene(**params)
@@ -254,9 +249,9 @@ def test_hsa_mir_1253(check_resp_single_record, ensembl, hsa_mir_1253):
resp = ensembl.search("hsa-mir-1253")
check_resp_single_record(resp, hsa_mir_1253, MatchType.SYMBOL)
- # associated_with
+ # xref
resp = ensembl.search("mirbase:MI0006387")
- check_resp_single_record(resp, hsa_mir_1253, MatchType.ASSOCIATED_WITH)
+ check_resp_single_record(resp, hsa_mir_1253, MatchType.XREF)
def test_spry3(check_resp_single_record, ensembl, spry3):
diff --git a/tests/unit/test_hgnc_source.py b/tests/unit/test_hgnc_source.py
index 185809fe..2d35c028 100644
--- a/tests/unit/test_hgnc_source.py
+++ b/tests/unit/test_hgnc_source.py
@@ -49,7 +49,9 @@ def a1bg_as1():
"previous_symbols": ["NCRNA00181", "A1BGAS", "A1BG-AS"],
"aliases": ["FLJ23569"],
"symbol_status": "approved",
- "associated_with": [
+ "xrefs": [
+ "ensembl:ENSG00000268895",
+ "ncbigene:503538",
"vega:OTTHUMG00000183508",
"ucsc:uc002qse.3",
"refseq:NR_015380",
@@ -57,7 +59,6 @@ def a1bg_as1():
"refseq:NR_015380",
"ena.embl:BC040926",
],
- "xrefs": ["ensembl:ENSG00000268895", "ncbigene:503538"],
"gene_type": "RNA, long non-coding",
}
return Gene(**params)
@@ -86,7 +87,7 @@ def tp53():
"previous_symbols": [],
"aliases": ["p53", "LFS1"],
"symbol_status": "approved",
- "associated_with": [
+ "xrefs": [
"vega:OTTHUMG00000162125",
"refseq:NM_000546",
"cosmic:TP53",
@@ -110,8 +111,9 @@ def tp53():
"pubmed:6396087",
"pubmed:3456488",
"pubmed:2047879",
+ "ensembl:ENSG00000141510",
+ "ncbigene:7157",
],
- "xrefs": ["ensembl:ENSG00000141510", "ncbigene:7157"],
"gene_type": "gene with protein product",
}
return Gene(**params)
@@ -140,8 +142,9 @@ def a3galt2():
"previous_symbols": ["A3GALT2P"],
"aliases": ["IGBS3S", "IGB3S"],
"symbol_status": "approved",
- "xrefs": ["ensembl:ENSG00000184389", "ncbigene:127550"],
- "associated_with": [
+ "xrefs": [
+ "ensembl:ENSG00000184389",
+ "ncbigene:127550",
"vega:OTTHUMG00000004125",
"vega:OTTHUMG00000004125",
"ucsc:uc031plq.1",
@@ -180,8 +183,9 @@ def wdhd1():
"previous_symbols": [],
"aliases": ["AND-1", "CTF4", "CHTF4"],
"symbol_status": "approved",
- "xrefs": ["ensembl:ENSG00000198554", "ncbigene:11169"],
- "associated_with": [
+ "xrefs": [
+ "ensembl:ENSG00000198554",
+ "ncbigene:11169",
"vega:OTTHUMG00000140304",
"refseq:NM_007086",
"omim:608126",
@@ -212,8 +216,12 @@ def g6pr():
"previous_symbols": [],
"aliases": ["GSD1aSP"],
"symbol_status": "approved",
- "xrefs": ["ncbigene:2541"],
- "associated_with": ["pubmed:2172641", "pubmed:7814621", "pubmed:2996501"],
+ "xrefs": [
+ "ncbigene:2541",
+ "pubmed:2172641",
+ "pubmed:7814621",
+ "pubmed:2996501",
+ ],
"gene_type": "unknown",
}
return Gene(**params)
@@ -233,8 +241,7 @@ def pirc24():
"previous_symbols": [],
"aliases": [],
"symbol_status": "approved",
- "xrefs": ["ncbigene:100313810"],
- "associated_with": ["pubmed:17881367"],
+ "xrefs": ["ncbigene:100313810", "pubmed:17881367"],
"gene_type": "RNA, cluster",
}
return Gene(**params)
@@ -263,8 +270,8 @@ def gage4():
"previous_symbols": [],
"aliases": ["CT4.4"],
"symbol_status": "approved",
- "xrefs": ["ncbigene:2576"],
- "associated_with": [
+ "xrefs": [
+ "ncbigene:2576",
"refseq:NM_001474",
"omim:300597",
"uniprot:P0DSO3",
@@ -290,8 +297,9 @@ def mafip():
"previous_symbols": [],
"aliases": ["FLJ35473", "FLJ00219", "FLJ39633", "MIP", "pp5644", "TEKT4P4"],
"symbol_status": "approved",
- "xrefs": ["ensembl:ENSG00000274847", "ncbigene:727764"],
- "associated_with": [
+ "xrefs": [
+ "ensembl:ENSG00000274847",
+ "ncbigene:727764",
"vega:OTTHUMG00000188065",
"refseq:NR_046439",
"uniprot:Q8WZ33",
@@ -319,8 +327,7 @@ def mt_7sdna():
"previous_symbols": ["MT7SDNA"],
"aliases": [],
"symbol_status": "approved",
- "xrefs": [],
- "associated_with": ["pubmed:24709344", "pubmed:273237"],
+ "xrefs": ["pubmed:24709344", "pubmed:273237"],
"gene_type": "region",
}
return Gene(**params)
@@ -350,7 +357,6 @@ def cecr():
"aliases": [],
"symbol_status": "approved",
"xrefs": ["ncbigene:1055"],
- "associated_with": [],
"gene_type": "region",
}
return Gene(**params)
@@ -387,8 +393,9 @@ def csf2ra():
"previous_symbols": ["CSF2R"],
"aliases": ["CD116", "alphaGMR"],
"symbol_status": "approved",
- "xrefs": ["ensembl:ENSG00000198223", "ncbigene:1438"],
- "associated_with": [
+ "xrefs": [
+ "ensembl:ENSG00000198223",
+ "ncbigene:1438",
"vega:OTTHUMG00000012533",
"refseq:NM_001161529",
"orphanet:209477",
@@ -435,8 +442,7 @@ def rps24p5():
"previous_symbols": [],
"aliases": [],
"symbol_status": "approved",
- "xrefs": ["ncbigene:100271094"],
- "associated_with": ["refseq:NG_011274", "pubmed:19123937"],
+ "xrefs": ["ncbigene:100271094", "refseq:NG_011274", "pubmed:19123937"],
"gene_type": "pseudogene",
}
return Gene(**params)
@@ -465,8 +471,7 @@ def trl_cag2_1():
"previous_symbols": ["TRNAL13"],
"aliases": ["tRNA-Leu-CAG-2-1"],
"symbol_status": "approved",
- "xrefs": ["ncbigene:100189130"],
- "associated_with": ["ena.embl:HG983896"],
+ "xrefs": ["ncbigene:100189130", "ena.embl:HG983896"],
"gene_type": "RNA, transfer",
}
return Gene(**params)
@@ -495,8 +500,9 @@ def myo5b():
"previous_symbols": [],
"aliases": ["KIAA1119"],
"symbol_status": "approved",
- "xrefs": ["ensembl:ENSG00000167306", "ncbigene:4645"],
- "associated_with": [
+ "xrefs": [
+ "ensembl:ENSG00000167306",
+ "ncbigene:4645",
"vega:OTTHUMG00000179843",
"refseq:NM_001080467",
"omim:606540",
@@ -539,7 +545,7 @@ def gstt1():
"previous_symbols": [],
"aliases": ["2.5.1.18"],
"symbol_status": "approved",
- "associated_with": [
+ "xrefs": [
"refseq:NM_000853",
"omim:600436",
"ucsc:uc002zze.4",
@@ -547,8 +553,9 @@ def gstt1():
"orphanet:470418",
"ena.embl:KI270879",
"pubmed:8617495",
+ "ensembl:ENSG00000277656",
+ "ncbigene:2952",
],
- "xrefs": ["ensembl:ENSG00000277656", "ncbigene:2952"],
"gene_type": "gene with protein product",
}
return Gene(**params)
@@ -772,9 +779,9 @@ def test_myo5b(check_resp_single_record, myo5b, hgnc):
resp = hgnc.search("MYO5B")
check_resp_single_record(resp, myo5b, MatchType.SYMBOL)
- # associated_with
+ # xref
resp = hgnc.search("refseq:NM_001080467")
- check_resp_single_record(resp, myo5b, MatchType.ASSOCIATED_WITH)
+ check_resp_single_record(resp, myo5b, MatchType.XREF)
def test_gstt1(check_resp_single_record, gstt1, hgnc):
@@ -787,9 +794,9 @@ def test_gstt1(check_resp_single_record, gstt1, hgnc):
resp = hgnc.search("GSTT1")
check_resp_single_record(resp, gstt1, MatchType.SYMBOL)
- # associated_with
+ # xref
resp = hgnc.search("omim:600436")
- check_resp_single_record(resp, gstt1, MatchType.ASSOCIATED_WITH)
+ check_resp_single_record(resp, gstt1, MatchType.XREF)
def test_no_match(hgnc):
diff --git a/tests/unit/test_ncbi_source.py b/tests/unit/test_ncbi_source.py
index f7b7508c..95504401 100644
--- a/tests/unit/test_ncbi_source.py
+++ b/tests/unit/test_ncbi_source.py
@@ -22,7 +22,6 @@ def check_ncbi_discontinued_gene(normalizer_response, concept_id, symbol, match_
assert resp.aliases == []
assert resp.previous_symbols == []
assert resp.xrefs == []
- assert resp.associated_with == []
@pytest.fixture(scope="module")
@@ -50,9 +49,8 @@ def dpf1():
"concept_id": "ncbigene:8193",
"symbol": "DPF1",
"aliases": ["BAF45b", "NEUD4", "neuro-d4", "SMARCG1"],
- "xrefs": ["hgnc:20225", "ensembl:ENSG00000011332"],
"previous_symbols": [],
- "associated_with": ["omim:601670"],
+ "xrefs": ["hgnc:20225", "ensembl:ENSG00000011332", "omim:601670"],
"symbol_status": None,
"location_annotations": [],
"strand": "-",
@@ -90,9 +88,8 @@ def pdp1_symbol():
"concept_id": "ncbigene:54704",
"symbol": "PDP1",
"aliases": ["PDH", "PDP", "PDPC", "PPM2A", "PPM2C"],
- "xrefs": ["hgnc:9279", "ensembl:ENSG00000164951"],
+ "xrefs": ["hgnc:9279", "ensembl:ENSG00000164951", "omim:605993"],
"previous_symbols": ["LOC157663", "PPM2C"],
- "associated_with": ["omim:605993"],
"symbol_status": None,
"location_annotations": [],
"strand": "+",
@@ -130,9 +127,8 @@ def pdp1_alias():
"concept_id": "ncbigene:403313",
"symbol": "PLPP6",
"aliases": ["PDP1", "PSDP", "PPAPDC2", "bA6J24.6", "LPRP-B", "PA-PSP"],
- "xrefs": ["hgnc:23682", "ensembl:ENSG00000205808"],
+ "xrefs": ["hgnc:23682", "ensembl:ENSG00000205808", "omim:611666"],
"previous_symbols": [],
- "associated_with": ["omim:611666"],
"symbol_status": None,
"location_annotations": [],
"strand": "+",
@@ -171,9 +167,8 @@ def spry3():
"concept_id": "ncbigene:10251",
"symbol": "SPRY3",
"aliases": ["spry-3"],
- "xrefs": ["hgnc:11271", "ensembl:ENSG00000168939"],
+ "xrefs": ["hgnc:11271", "ensembl:ENSG00000168939", "omim:300531"],
"previous_symbols": ["LOC170187", "LOC253479"],
- "associated_with": ["omim:300531"],
"symbol_status": None,
"location_annotations": [],
"strand": "+",
@@ -232,7 +227,6 @@ def adcp1():
"aliases": [],
"xrefs": ["hgnc:229"],
"previous_symbols": [],
- "associated_with": [],
"symbol_status": None,
"strand": None,
"location_annotations": ["6"],
@@ -252,9 +246,8 @@ def afa():
"concept_id": "ncbigene:170",
"symbol": "AFA",
"aliases": [],
- "xrefs": [],
"previous_symbols": [],
- "associated_with": ["omim:106250"],
+ "xrefs": ["omim:106250"],
"symbol_status": None,
"strand": None,
"location_annotations": [],
@@ -274,9 +267,8 @@ def znf84():
"concept_id": "ncbigene:7637",
"symbol": "ZNF84",
"aliases": ["HPF2"],
- "xrefs": ["hgnc:13159", "ensembl:ENSG00000198040"],
+ "xrefs": ["hgnc:13159", "ensembl:ENSG00000198040", "omim:618554"],
"previous_symbols": ["LOC100287429"],
- "associated_with": ["omim:618554"],
"symbol_status": None,
"location_annotations": ["map from Rosati ref via FISH [AFS]"],
"strand": "+",
@@ -315,9 +307,14 @@ def slc25a6():
"concept_id": "ncbigene:293",
"symbol": "SLC25A6",
"aliases": ["AAC3", "ANT", "ANT 2", "ANT 3", "ANT3", "ANT3Y"],
- "xrefs": ["hgnc:10992", "ensembl:ENSG00000169100", "ensembl:ENSG00000292334"],
+ "xrefs": [
+ "hgnc:10992",
+ "ensembl:ENSG00000169100",
+ "ensembl:ENSG00000292334",
+ "omim:300151",
+ "omim:403000",
+ ],
"previous_symbols": ["ANT3Y"],
- "associated_with": ["omim:300151", "omim:403000"],
"symbol_status": None,
"location_annotations": [],
"strand": "-",
@@ -376,7 +373,6 @@ def loc106783576():
"aliases": [],
"xrefs": [],
"previous_symbols": [],
- "associated_with": [],
"symbol_status": None,
"location_annotations": [],
"strand": None,
@@ -405,9 +401,8 @@ def glc1b():
"concept_id": "ncbigene:2722",
"symbol": "GLC1B",
"aliases": [],
- "xrefs": [],
"previous_symbols": [],
- "associated_with": ["omim:606689"],
+ "xrefs": ["omim:606689"],
"symbol_status": None,
"location_annotations": [],
"strand": None,
@@ -436,9 +431,8 @@ def hdpa():
"concept_id": "ncbigene:50829",
"symbol": "HDPA",
"aliases": [],
- "xrefs": [],
"previous_symbols": [],
- "associated_with": ["omim:300221"],
+ "xrefs": ["omim:300221"],
"symbol_status": None,
"location_annotations": [],
"strand": None,
@@ -470,7 +464,6 @@ def prkrap1():
"aliases": [],
"xrefs": ["hgnc:33447"],
"previous_symbols": ["LOC100289695"],
- "associated_with": [],
"symbol_status": None,
"location_annotations": ["alternate reference locus"],
"strand": "+",
@@ -519,9 +512,8 @@ def mhb():
"concept_id": "ncbigene:619511",
"symbol": "MHB",
"aliases": [],
- "xrefs": [],
"previous_symbols": [],
- "associated_with": ["omim:255160"],
+ "xrefs": ["omim:255160"],
"symbol_status": None,
"location_annotations": [],
"strand": None,
@@ -550,9 +542,8 @@ def spg37():
"concept_id": "ncbigene:100049159",
"symbol": "SPG37",
"aliases": [],
- "xrefs": [],
"previous_symbols": [],
- "associated_with": ["omim:611945"],
+ "xrefs": ["omim:611945"],
"symbol_status": None,
"location_annotations": [],
"strand": None,
@@ -607,9 +598,9 @@ def test_dpf1(check_resp_single_record, ncbi, dpf1):
resp = ncbi.search("neuro-d4")
check_resp_single_record(resp, dpf1, MatchType.ALIAS)
- # associated_with
+ # xref
resp = ncbi.search("omim:601670")
- check_resp_single_record(resp, dpf1, MatchType.ASSOCIATED_WITH)
+ check_resp_single_record(resp, dpf1, MatchType.XREF)
# No Match
resp = ncbi.search("DPF 1")
@@ -751,9 +742,9 @@ def test_glc1b(check_resp_single_record, ncbi, glc1b):
resp = ncbi.search("GLC1B")
check_resp_single_record(resp, glc1b, MatchType.SYMBOL)
- # associated_with
+ # xref
resp = ncbi.search("omim:606689")
- check_resp_single_record(resp, glc1b, MatchType.ASSOCIATED_WITH)
+ check_resp_single_record(resp, glc1b, MatchType.XREF)
def test_hdpa(check_resp_single_record, ncbi, hdpa):
@@ -792,9 +783,9 @@ def test_mhb(check_resp_single_record, ncbi, mhb):
resp = ncbi.search("MHB")
check_resp_single_record(resp, mhb, MatchType.SYMBOL)
- # associated_with
+ # xref
resp = ncbi.search("OMIM:255160")
- check_resp_single_record(resp, mhb, MatchType.ASSOCIATED_WITH)
+ check_resp_single_record(resp, mhb, MatchType.XREF)
def test_spg37(check_resp_single_record, ncbi, spg37):
@@ -807,9 +798,9 @@ def test_spg37(check_resp_single_record, ncbi, spg37):
resp = ncbi.search("SPG37")
check_resp_single_record(resp, spg37, MatchType.SYMBOL)
- # associated_with
+ # xref
resp = ncbi.search("omim:611945")
- check_resp_single_record(resp, spg37, MatchType.ASSOCIATED_WITH)
+ check_resp_single_record(resp, spg37, MatchType.XREF)
def test_discontinued_genes(ncbi):
diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py
index bfb11460..f9b08927 100644
--- a/tests/unit/test_query.py
+++ b/tests/unit/test_query.py
@@ -702,7 +702,6 @@ def normalize_unmerged_loc_653303():
"aliases": [],
"previous_symbols": ["LOC196266", "LOC731196", "LOC654080"],
"xrefs": [],
- "associated_with": [],
"gene_type": "pseudo",
}
]
@@ -745,8 +744,9 @@ def normalize_unmerged_chaf1a():
"CAF-1",
],
"previous_symbols": [],
- "xrefs": ["ensembl:ENSG00000167670", "ncbigene:10036"],
- "associated_with": [
+ "xrefs": [
+ "ensembl:ENSG00000167670",
+ "ncbigene:10036",
"vega:OTTHUMG00000181922",
"ccds:CCDS32875",
"ucsc:uc002mal.4",
@@ -784,7 +784,6 @@ def normalize_unmerged_chaf1a():
"aliases": [],
"previous_symbols": [],
"xrefs": ["hgnc:1910"],
- "associated_with": [],
"gene_type": "protein_coding",
}
],
@@ -820,8 +819,11 @@ def normalize_unmerged_chaf1a():
],
"aliases": ["CAF1P150", "P150", "CAF1", "CAF1B", "CAF-1"],
"previous_symbols": ["LOC107985297"],
- "xrefs": ["ensembl:ENSG00000167670", "hgnc:1910"],
- "associated_with": ["omim:601246"],
+ "xrefs": [
+ "ensembl:ENSG00000167670",
+ "hgnc:1910",
+ "omim:601246",
+ ],
"gene_type": "protein-coding",
}
]
@@ -867,8 +869,7 @@ def normalize_unmerged_ache():
],
"aliases": ["YT", "ARACHE", "ACEE", "N-ACHE"],
"previous_symbols": ["ACEE"],
- "xrefs": ["hgnc:108", "ensembl:ENSG00000087085"],
- "associated_with": ["omim:100740"],
+ "xrefs": ["hgnc:108", "ensembl:ENSG00000087085", "omim:100740"],
"gene_type": "protein-coding",
}
],
@@ -897,7 +898,6 @@ def normalize_unmerged_ache():
"aliases": [],
"previous_symbols": [],
"xrefs": ["hgnc:108"],
- "associated_with": [],
"gene_type": "protein_coding",
}
]
@@ -923,8 +923,9 @@ def normalize_unmerged_ache():
],
"aliases": ["3.1.1.7"],
"previous_symbols": ["YT"],
- "xrefs": ["ncbigene:43", "ensembl:ENSG00000087085"],
- "associated_with": [
+ "xrefs": [
+ "ncbigene:43",
+ "ensembl:ENSG00000087085",
"ucsc:uc003uxi.4",
"vega:OTTHUMG00000157033",
"merops:S09.979",
@@ -1050,7 +1051,6 @@ def compare_unmerged_record(gene, test_gene):
assert set(gene.xrefs) == set(test_gene.xrefs)
assert gene.symbol_status == test_gene.symbol_status
assert set(gene.previous_symbols) == set(test_gene.previous_symbols)
- assert set(gene.associated_with) == set(test_gene.associated_with)
assert gene.symbol == test_gene.symbol
assert len(gene.locations) == len(test_gene.locations)
for loc in gene.locations:
@@ -1259,7 +1259,7 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta):
compare_normalize_resp(
resp,
q,
- MatchType.ASSOCIATED_WITH,
+ MatchType.XREF,
normalized_ache,
expected_source_meta=source_meta,
)
@@ -1337,7 +1337,7 @@ def test_braf_query(query_handler, num_sources, normalized_braf, source_meta):
compare_normalize_resp(
resp,
q,
- MatchType.ASSOCIATED_WITH,
+ MatchType.XREF,
normalized_braf,
expected_source_meta=source_meta,
)
@@ -1439,7 +1439,7 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta):
compare_normalize_resp(
resp,
q,
- MatchType.ASSOCIATED_WITH,
+ MatchType.XREF,
normalized_abl1,
expected_source_meta=source_meta,
)
@@ -1572,18 +1572,14 @@ def test_normalize_unmerged(
resp = query_handler.normalize_unmerged(q)
compare_unmerged_response(resp, q, [], MatchType.ALIAS, normalize_unmerged_chaf1a)
- # assoc with
+ # xref
q = "omim:100740"
resp = query_handler.normalize_unmerged(q)
- compare_unmerged_response(
- resp, q, [], MatchType.ASSOCIATED_WITH, normalize_unmerged_ache
- )
+ compare_unmerged_response(resp, q, [], MatchType.XREF, normalize_unmerged_ache)
q = "uniprot:Q13111"
resp = query_handler.normalize_unmerged(q)
- compare_unmerged_response(
- resp, q, [], MatchType.ASSOCIATED_WITH, normalize_unmerged_chaf1a
- )
+ compare_unmerged_response(resp, q, [], MatchType.XREF, normalize_unmerged_chaf1a)
def test_invalid_queries(query_handler):
diff --git a/tests/unit/test_schemas.py b/tests/unit/test_schemas.py
index 3d5fceed..afe56b84 100644
--- a/tests/unit/test_schemas.py
+++ b/tests/unit/test_schemas.py
@@ -78,15 +78,6 @@ def test_gene(gene, sequence_location):
xrefs=["hgnc", "hgnc:1"],
)
- # associated_with not a valid curie
- with pytest.raises(pydantic.ValidationError):
- Gene(
- match_type=100,
- concept_id="hgnc:1096",
- symbol="BRAF",
- associated_with=["hgnc", "hgnc:1"],
- )
-
# symbol status invalid
with pytest.raises(pydantic.ValidationError):
Gene(