diff --git a/pyproject.toml b/pyproject.toml index 1db785b..1b27a20 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "uvicorn", "click", "boto3", - "ga4gh.vrs~=2.0.0a10", + "ga4gh.vrs==2.0.0a13", ] dynamic = ["version"] @@ -41,7 +41,7 @@ etl = [ "wags-tails~=0.2.1", "setuptools", # pinned for 3.12 because yoyo-migrations still uses pkg_resources ] -test = ["pytest>=6.0", "pytest-cov", "mock", "httpx"] +test = ["pytest>=6.0", "pytest-cov", "mock", "httpx", "deepdiff"] dev = ["pre-commit>=3.7.1", "ruff==0.5.0"] docs = [ "sphinx==6.1.3", diff --git a/src/gene/query.py b/src/gene/query.py index c466bd1..2b2e21e 100644 --- a/src/gene/query.py +++ b/src/gene/query.py @@ -6,8 +6,16 @@ from collections.abc import Callable from typing import Any, TypeVar -from ga4gh.core import domain_models, entity_models, ga4gh_identify -from ga4gh.vrs import models +from ga4gh.core import ga4gh_identify +from ga4gh.core.models import ( + Coding, + ConceptMapping, + Extension, + MappableConcept, + Relation, + code, +) +from ga4gh.vrs.models import SequenceLocation, SequenceReference from gene import ITEM_TYPES, NAMESPACE_LOOKUP, PREFIX_LOOKUP, __version__ from gene.database import AbstractDatabase, DatabaseReadException @@ -81,7 +89,7 @@ def _emit_warnings(query_str: str) -> list: return warnings @staticmethod - def _transform_sequence_location(loc: dict) -> models.SequenceLocation: + def _transform_sequence_location(loc: dict) -> SequenceLocation: """Transform a sequence location to VRS sequence location :param loc: GeneSequenceLocation represented as a dict @@ -89,8 +97,8 @@ def _transform_sequence_location(loc: dict) -> models.SequenceLocation: """ refget_ac = loc["sequence_id"].split("ga4gh:")[-1] - return models.SequenceLocation( - sequenceReference=models.SequenceReference(refgetAccession=refget_ac), + return SequenceLocation( + sequenceReference=SequenceReference(refgetAccession=refget_ac), start=int(loc["start"]), end=int(loc["end"]), ) @@ -390,27 +398,29 @@ def _add_gene( :param possible_concepts: List of other normalized concepts found :return: Response with core Gene """ - gene_obj = domain_models.Gene( + gene_obj = MappableConcept( id=f"normalize.gene.{record['concept_id']}", label=record["symbol"], + conceptType="Gene", ) # mappings source_ids = record.get("xrefs", []) + record.get("associated_with", []) mappings = [] for source_id in source_ids: - system, code = source_id.split(":") + system, system_code = source_id.split(":") mappings.append( - entity_models.ConceptMapping( - coding=entity_models.Coding( - code=entity_models.Code(code), system=system.lower() - ), - relation=entity_models.Relation.RELATED_MATCH, + ConceptMapping( + coding=Coding(code=code(system_code), system=system.lower()), + relation=Relation.RELATED_MATCH, ) ) if mappings: gene_obj.mappings = mappings + # extensions + extensions = [] + # aliases aliases = set() for key in ["previous_symbols", "aliases"]: @@ -420,10 +430,8 @@ def _add_gene( val = [val] aliases.update(val) if aliases: - gene_obj.alternativeLabels = list(aliases) + extensions.append(Extension(name="aliases", value=list(aliases))) - # extensions - extensions = [] extension_and_record_labels = [ ("symbol_status", "symbol_status"), ("approved_name", "label"), @@ -433,9 +441,7 @@ def _add_gene( ] for ext_label, record_label in extension_and_record_labels: if record.get(record_label): - extensions.append( - entity_models.Extension(name=ext_label, value=record[record_label]) - ) + extensions.append(Extension(name=ext_label, value=record[record_label])) record_locations = {} if record["item_type"] == RecordType.IDENTITY: @@ -455,16 +461,14 @@ def _add_gene( ] if transformed_locs: - extensions.append( - entity_models.Extension(name=loc_name, value=transformed_locs) - ) + extensions.append(Extension(name=loc_name, value=transformed_locs)) # handle gene types separately because they're wonky if record["item_type"] == RecordType.IDENTITY: gene_type = record.get("gene_type") if gene_type: extensions.append( - entity_models.Extension( + Extension( name=GeneTypeFieldName[record["src_name"].upper()].value, value=gene_type, ) @@ -474,8 +478,7 @@ def _add_gene( field_name = f.value values = record.get(field_name, []) extensions.extend( - entity_models.Extension(name=field_name, value=value) - for value in values + Extension(name=field_name, value=value) for value in values ) if extensions: gene_obj.extensions = extensions diff --git a/src/gene/schemas.py b/src/gene/schemas.py index 56febb9..9e68f9b 100644 --- a/src/gene/schemas.py +++ b/src/gene/schemas.py @@ -3,8 +3,8 @@ from enum import Enum, IntEnum from typing import Annotated, Literal -from ga4gh.core import domain_models -from ga4gh.vrs import models +from ga4gh.core.models import MappableConcept +from ga4gh.vrs.models import SequenceLocation from pydantic import ( BaseModel, ConfigDict, @@ -85,7 +85,7 @@ class BaseGene(BaseModel): label: StrictStr | None = None strand: Strand | None = None location_annotations: list[StrictStr] = [] - locations: list[models.SequenceLocation] | list[GeneSequenceLocation] = [] + locations: list[SequenceLocation] | list[GeneSequenceLocation] = [] aliases: list[StrictStr] = [] previous_symbols: list[StrictStr] = [] xrefs: list[Annotated[str, StringConstraints(pattern=CURIE_REGEX)]] = [] @@ -301,7 +301,7 @@ class NormalizeService(BaseNormalizationService): """Define model for returning normalized concept.""" normalized_id: str | None = None - gene: domain_models.Gene | None = None + gene: MappableConcept | None = None source_meta_: dict[SourceName, SourceMeta] = {} model_config = ConfigDict( diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index bf7a0e6..206bead 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -1,7 +1,8 @@ """Module to test the query module.""" import pytest -from ga4gh.core import domain_models +from deepdiff import DeepDiff +from ga4gh.core.models import MappableConcept from gene.query import InvalidParameterException, QueryHandler from gene.schemas import BaseGene, MatchType, SourceName @@ -31,7 +32,7 @@ def normalize_unmerged(self, query_str): def normalized_ache(): """Return normalized core Gene object for ACHE.""" params = { - "type": "Gene", + "conceptType": "Gene", "id": "normalize.gene.hgnc:108", "label": "ACHE", "mappings": [ @@ -88,8 +89,8 @@ def normalized_ache(): "relation": "relatedMatch", }, ], - "alternativeLabels": ["3.1.1.7", "YT", "N-ACHE", "ARACHE", "ACEE"], "extensions": [ + {"name": "aliases", "value": ["3.1.1.7", "YT", "N-ACHE", "ARACHE", "ACEE"]}, {"name": "previous_symbols", "value": ["ACEE", "YT"]}, { "name": "approved_name", @@ -133,14 +134,14 @@ def normalized_ache(): {"name": "strand", "value": "-"}, ], } - return domain_models.Gene(**params) + return MappableConcept(**params) @pytest.fixture(scope="module") def normalized_braf(): """Return normalized core Gene object for BRAF.""" params = { - "type": "Gene", + "conceptType": "Gene", "id": "normalize.gene.hgnc:1097", "label": "BRAF", "mappings": [ @@ -213,8 +214,11 @@ def normalized_braf(): "relation": "relatedMatch", }, ], - "alternativeLabels": ["BRAF1", "BRAF-1", "RAFB1", "NS7", "B-RAF1", "B-raf"], "extensions": [ + { + "name": "aliases", + "value": ["BRAF1", "BRAF-1", "RAFB1", "NS7", "B-RAF1", "B-raf"], + }, { "name": "approved_name", "value": "B-Raf proto-oncogene, serine/threonine kinase", @@ -257,14 +261,14 @@ def normalized_braf(): {"name": "symbol_status", "value": "approved"}, ], } - return domain_models.Gene(**params) + return MappableConcept(**params) @pytest.fixture(scope="module") def normalized_abl1(): """Return normalized core Gene object for ABL1.""" params = { - "type": "Gene", + "conceptType": "Gene", "id": "normalize.gene.hgnc:76", "label": "ABL1", "mappings": [ @@ -329,20 +333,23 @@ def normalized_abl1(): "relation": "relatedMatch", }, ], - "alternativeLabels": [ - "c-ABL", - "JTK7", - "p150", - "CHDSKM", - "BCR-ABL", - "v-abl", - "c-ABL1", - "bcr/abl", - "LOC116063", - "LOC112779", - "ABL", - ], "extensions": [ + { + "name": "aliases", + "value": [ + "c-ABL", + "JTK7", + "p150", + "CHDSKM", + "BCR-ABL", + "v-abl", + "c-ABL1", + "bcr/abl", + "LOC116063", + "LOC112779", + "ABL", + ], + }, { "name": "previous_symbols", "value": ["LOC116063", "LOC112779", "ABL"], @@ -389,14 +396,14 @@ def normalized_abl1(): {"name": "symbol_status", "value": "approved"}, ], } - return domain_models.Gene(**params) + return MappableConcept(**params) @pytest.fixture(scope="module") def normalized_p150(): """Return normalized core Gene object for p150.""" params = { - "type": "Gene", + "conceptType": "Gene", "id": "normalize.gene.hgnc:1910", "label": "CHAF1A", "mappings": [ @@ -441,16 +448,19 @@ def normalized_p150(): "relation": "relatedMatch", }, ], - "alternativeLabels": [ - "CAF1P150", - "MGC71229", - "CAF-1", - "P150", - "CAF1B", - "CAF1", - "LOC107985297", - ], "extensions": [ + { + "name": "aliases", + "value": [ + "CAF1P150", + "MGC71229", + "CAF-1", + "P150", + "CAF1B", + "CAF1", + "LOC107985297", + ], + }, { "name": "approved_name", "value": "chromatin assembly factor 1 subunit A", @@ -497,7 +507,7 @@ def normalized_p150(): {"name": "symbol_status", "value": "approved"}, ], } - return domain_models.Gene(**params) + return MappableConcept(**params) @pytest.fixture(scope="module") @@ -506,10 +516,13 @@ def normalized_loc_653303(): normalized results that don't merge records. """ params = { - "type": "Gene", + "conceptType": "Gene", "label": "LOC653303", - "alternativeLabels": ["LOC196266", "LOC654080", "LOC731196"], "extensions": [ + { + "name": "aliases", + "value": ["LOC196266", "LOC654080", "LOC731196"], + }, { "name": "approved_name", "value": "proprotein convertase subtilisin/kexin type 7 pseudogene", @@ -537,7 +550,7 @@ def normalized_loc_653303(): ], "id": "normalize.gene.ncbigene:653303", } - return domain_models.Gene(**params) + return MappableConcept(**params) @pytest.fixture(scope="module") @@ -780,7 +793,7 @@ def normalize_unmerged_ache(): def normalized_ifnr(): """Return normalized core Gene object for IFNR.""" params = { - "type": "Gene", + "conceptType": "Gene", "id": "normalize.gene.hgnc:5447", "label": "IFNR", "mappings": [ @@ -801,8 +814,11 @@ def normalized_ifnr(): "relation": "relatedMatch", }, ], - "alternativeLabels": ["IFNGM", "IFNGM2"], "extensions": [ + { + "name": "aliases", + "value": ["IFNGM", "IFNGM2"], + }, { "name": "approved_name", "value": "interferon production regulator", @@ -814,7 +830,7 @@ def normalized_ifnr(): {"name": "location_annotations", "value": ["16"]}, ], } - return domain_models.Gene(**params) + return MappableConcept(**params) @pytest.fixture(scope="module") @@ -924,64 +940,15 @@ def compare_service_meta(service_meta): def compare_gene(test, actual): """Test that actual and expected core gene objects match.""" - assert actual.id == test.id - assert actual.type == test.type - assert actual.label == test.label - - assert bool(actual.mappings) == bool(test.mappings) - if actual.mappings: - no_matches = [] - for actual_mapping in actual.mappings: - match = None - for fixture_mapping in test.mappings: - if actual_mapping == fixture_mapping: - match = actual_mapping - break - if not match: - no_matches.append(actual_mapping) - assert no_matches == [], no_matches - assert len(actual.mappings) == len(test.mappings) - - assert set(actual.alternativeLabels) == set( - test.alternativeLabels - ), "alternativeLabels" - extensions_present = "extensions" in test.model_fields - assert ("extensions" in actual.model_fields) == extensions_present - if extensions_present: - actual_ext_names = sorted([ext.name for ext in actual.extensions]) - unique_actual_ext_names = sorted(set(actual_ext_names)) - assert actual_ext_names == unique_actual_ext_names, "duplicate extension names" - test_ext_names = {ext.name for ext in test.extensions} - assert set(actual_ext_names) == test_ext_names, "extension names dont match" - n_ext_correct = 0 - for test_ext in test.extensions: - for actual_ext in actual.extensions: - if actual_ext.name == test_ext.name: - assert isinstance(actual_ext.value, type(test_ext.value)) - if isinstance(test_ext.value, list): - if test_ext.value: - if isinstance(test_ext.value[0], dict): - if test_ext.value[0].get("type") == "SequenceLocation": - actual_digest = ( - actual_ext.value[0] - .pop("id") - .split("ga4gh:SL.")[-1] - ) - assert ( - actual_ext.value[0].pop("digest") - == actual_digest - ) - assert actual_ext.value == test_ext.value - else: - assert set(actual_ext.value) == set( - test_ext.value - ), f"{test_ext.value} value" - else: - assert actual_ext.value == test_ext.value - else: - assert actual_ext.value == test_ext.value - n_ext_correct += 1 - assert n_ext_correct == len(test.extensions), "number of correct extensions" + for ext in actual.extensions: + if ext.name.endswith("_locations"): + for loc in ext.value: + loc_id = loc.pop("id") + loc_digest = loc.pop("digest") + assert loc_id.split("ga4gh:SL.")[-1] == loc_digest + + diff = DeepDiff(actual, test, ignore_order=True) + assert diff == {}, test.id def test_search_query(query_handler, num_sources): diff --git a/tests/unit/test_schemas.py b/tests/unit/test_schemas.py index 903cced..7dbc2a5 100644 --- a/tests/unit/test_schemas.py +++ b/tests/unit/test_schemas.py @@ -2,7 +2,7 @@ import pydantic import pytest -from ga4gh.vrs import models +from ga4gh.vrs.models import SequenceLocation, SequenceReference from gene.schemas import Gene @@ -10,8 +10,8 @@ @pytest.fixture(scope="module") def sequence_location(): """Create a valid sequence location test fixture.""" - return models.SequenceLocation( - sequenceReference=models.SequenceReference( + return SequenceLocation( + sequenceReference=SequenceReference( refgetAccession="SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul" ), start=140719327,