diff --git a/src/gene/database/dynamodb.py b/src/gene/database/dynamodb.py index 494d977a..6f7b0ee7 100644 --- a/src/gene/database/dynamodb.py +++ b/src/gene/database/dynamodb.py @@ -10,6 +10,7 @@ import click from boto3.dynamodb.conditions import Key from botocore.exceptions import ClientError +from pydantic import BaseModel from gene.database.database import ( AWS_ENV_VAR_NAME, @@ -382,6 +383,12 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: label_and_type = f"{concept_id.lower()}##identity" record["label_and_type"] = label_and_type record["item_type"] = "identity" + for i, location in enumerate(record.get("locations", [])): + if isinstance(location, BaseModel): + record["locations"][i] = location.model_dump( + mode="json", exclude_none=True + ) + try: self.batch.put_item(Item=record) except ClientError as e: diff --git a/src/gene/database/postgresql.py b/src/gene/database/postgresql.py index 9b5967d0..6638645c 100644 --- a/src/gene/database/postgresql.py +++ b/src/gene/database/postgresql.py @@ -535,9 +535,9 @@ def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None: meta.version, json.dumps(meta.data_url), meta.rdp_url, - meta.data_license_attributes["non_commercial"], - meta.data_license_attributes["attribution"], - meta.data_license_attributes["share_alike"], + meta.data_license_attributes.non_commercial, + meta.data_license_attributes.attribution, + meta.data_license_attributes.share_alike, meta.genome_assemblies, ], ) @@ -566,10 +566,10 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: """Add new record to database. :param record: record to upload - :param src_name: name of source for record. Not used by PostgreSQL instance. + :param src_name: name of source for record. """ concept_id = record["concept_id"] - locations = [json.dumps(loc) for loc in record.get("locations", [])] + locations = [loc.model_dump_json() for loc in record.get("locations", [])] if not locations: locations = None with self.conn.cursor() as cur: @@ -578,7 +578,7 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: self._add_record_query, [ concept_id, - record["src_name"], + src_name.value, record.get("symbol_status"), record.get("label"), record.get("strand"), diff --git a/src/gene/etl/base.py b/src/gene/etl/base.py index 93804f35..768165ae 100644 --- a/src/gene/etl/base.py +++ b/src/gene/etl/base.py @@ -12,7 +12,7 @@ from wags_tails import EnsemblData, HgncData, NcbiGeneData from gene.database import AbstractDatabase -from gene.schemas import ITEM_TYPES, Gene, GeneSequenceLocation, MatchType, SourceName +from gene.schemas import ITEM_TYPES, Gene, MatchType, SourceName, StoredSequenceLocation _logger = logging.getLogger(__name__) @@ -122,11 +122,6 @@ def _load_gene(self, gene: Dict) -> None: except pydantic.ValidationError as e: _logger.warning(f"Unable to load {gene} due to validation error: " f"{e}") else: - concept_id = gene["concept_id"] - gene["label_and_type"] = f"{concept_id.lower()}##identity" - gene["src_name"] = self._src_name.value - gene["item_type"] = "identity" - for attr_type in ITEM_TYPES: if attr_type in gene: value = gene[attr_type] @@ -137,7 +132,7 @@ def _load_gene(self, gene: Dict) -> None: gene[attr_type] = list(set(value)) self._database.add_record(gene, self._src_name) - self._processed_ids.append(concept_id) + self._processed_ids.append(gene["concept_id"]) def get_seqrepo(self, seqrepo_dir: Path) -> SeqRepo: """Return SeqRepo instance if seqrepo_dir exists. @@ -224,32 +219,31 @@ def _get_seq_id_aliases(self, seq_id: str) -> List[str]: _logger.warning(f"SeqRepo raised KeyError: {e}") return aliases - def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Dict: - """Get a gene's GeneSequenceLocation. + def _build_sequence_location( + self, seq_id: str, gene: Feature, concept_id: str + ) -> Optional[StoredSequenceLocation]: + """Construct a sequence location for storing in a DB. :param seq_id: The sequence ID. :param gene: A gene from the source file. - :param params: The transformed gene record. - :return: A dictionary of a GA4GH VRS SequenceLocation, if seq_id alias found. - Else, empty dictionary + :param concept_id: record ID from source + :return: A storable SequenceLocation containing relevant params for returning a + VRS SequenceLocation, or None if unable to retrieve valid parameters """ - location = {} aliases = self._get_seq_id_aliases(seq_id) - if not aliases: - return location + if not aliases or gene.start is None or gene.end is None: + return None sequence = aliases[0] if gene.start != "." and gene.end != "." and sequence: - if 0 <= gene.start <= gene.end: # type: ignore - location = GeneSequenceLocation( - start=gene.start - 1, # type: ignore - end=gene.end, # type: ignore + if 0 <= gene.start <= gene.end: + return StoredSequenceLocation( + start=gene.start - 1, + end=gene.end, sequence_id=sequence, - ).model_dump() # type: ignore + ) else: _logger.warning( - f"{params['concept_id']} has invalid interval:" - f"start={gene.start - 1} end={gene.end}" - ) # type: ignore - return location + f"{concept_id} has invalid interval: start={gene.start - 1} end={gene.end}" + ) diff --git a/src/gene/etl/ensembl.py b/src/gene/etl/ensembl.py index c640b6ac..bb590047 100644 --- a/src/gene/etl/ensembl.py +++ b/src/gene/etl/ensembl.py @@ -7,7 +7,12 @@ from gffutils.feature import Feature from gene.etl.base import Base, GeneNormalizerEtlError -from gene.schemas import NamespacePrefix, SourceMeta, SourceName, Strand +from gene.schemas import ( + DataLicenseAttributes, + NamespacePrefix, + SourceMeta, + Strand, +) _logger = logging.getLogger(__name__) @@ -66,22 +71,23 @@ def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict: :param accession_numbers: Accession numbers for each chromosome and scaffold :return: A gene dictionary containing data if the ID attribute exists. """ - gene = dict() + gene_params = dict() if f.strand == "-": - gene["strand"] = Strand.REVERSE.value + gene_params["strand"] = Strand.REVERSE.value elif f.strand == "+": - gene["strand"] = Strand.FORWARD.value - gene["src_name"] = SourceName.ENSEMBL.value + gene_params["strand"] = Strand.FORWARD.value - self._add_attributes(f, gene) - location = self._add_location(f, gene, accession_numbers) + self._add_attributes(f, gene_params) + location = self._build_sequence_location( + accession_numbers[f.seqid], f, gene_params["concept_id"] + ) if location: - gene["locations"] = [location] + gene_params["locations"] = [location] - gene["label_and_type"] = f"{gene['concept_id'].lower()}##identity" - gene["item_type"] = "identity" + gene_params["label_and_type"] = f"{gene_params['concept_id'].lower()}##identity" + gene_params["item_type"] = "identity" - return gene + return gene_params def _add_attributes(self, f: Feature, gene: Dict) -> None: """Add concept_id, symbol, xrefs, and associated_with to a gene record. @@ -132,17 +138,6 @@ def _add_attributes(self, f: Feature, gene: Dict) -> None: gene[attributes[key]] = val - def _add_location(self, f: Feature, gene: Dict, accession_numbers: Dict) -> Dict: - """Add GA4GH SequenceLocation to a gene record. - https://vr-spec.readthedocs.io/en/1.1/terms_and_model.html#sequencelocation - - :param f: A gene from the data - :param gene: A transformed gene record - :param accession_numbers: Accession numbers for each chromosome and scaffold - :return: gene record dictionary with location added - """ - return self._get_sequence_location(accession_numbers[f.seqid], f, gene) - def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict: """Get xref or associated_with concept. @@ -181,11 +176,11 @@ def _add_meta(self) -> None: "genome_annotations": f"ftp://ftp.ensembl.org/pub/release-{self._version}/gff3/homo_sapiens/Homo_sapiens.{self._assembly}.{self._version}.gff3.gz" }, rdp_url=None, - data_license_attributes={ - "non_commercial": False, - "share_alike": False, - "attribution": False, - }, + data_license_attributes=DataLicenseAttributes( + non_commercial=False, + share_alike=False, + attribution=False, + ), genome_assemblies=[self._assembly], ) diff --git a/src/gene/etl/hgnc.py b/src/gene/etl/hgnc.py index 39e20ffc..1f060935 100644 --- a/src/gene/etl/hgnc.py +++ b/src/gene/etl/hgnc.py @@ -2,13 +2,14 @@ import json import logging import re -from typing import Dict +from typing import Dict, List from gene.etl.base import Base, GeneNormalizerEtlError from gene.schemas import ( PREFIX_LOOKUP, Annotation, Chromosome, + DataLicenseAttributes, NamespacePrefix, SourceMeta, SourceName, @@ -30,19 +31,16 @@ def _transform_data(self) -> None: records = data["response"]["docs"] for r in records: - gene = dict() - gene["concept_id"] = r["hgnc_id"].lower() - gene["label_and_type"] = f"{gene['concept_id']}##identity" - gene["item_type"] = "identity" - gene["symbol"] = r["symbol"] - gene["label"] = r["name"] - gene["src_name"] = SourceName.HGNC.value + gene = { + "concept_id": r["hgnc_id"].lower(), + "symbol": r["symbol"], + "label": r["name"], + } if r["status"]: if r["status"] == "Approved": gene["symbol_status"] = SymbolStatus.APPROVED.value elif r["status"] == "Entry Withdrawn": gene["symbol_status"] = SymbolStatus.WITHDRAWN.value - gene["src_name"] = SourceName.HGNC.value # store alias, xref, associated_with, prev_symbols, location self._get_aliases(r, gene) @@ -83,10 +81,10 @@ def _get_previous_symbols(self, r: Dict, gene: Dict) -> None: if prev_symbols: gene["previous_symbols"] = list(set(prev_symbols)) - def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None: + def _get_xrefs_associated_with(self, record: Dict, gene: Dict) -> None: """Store xrefs and/or associated_with refs in a gene record. - :param r: A gene record in the HGNC data file + :param record: A gene record in the HGNC data file :param gene: A transformed gene record """ xrefs = list() @@ -119,7 +117,7 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None: ] for src in sources: - if src in r: + if src in record: if "-" in src: key = src.split("-")[0] elif "." in src: @@ -131,9 +129,11 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None: if key.upper() in NamespacePrefix.__members__: if NamespacePrefix[key.upper()].value in PREFIX_LOOKUP.keys(): - self._get_xref_associated_with(key, src, r, xrefs) + self._get_xref_associated_with(key, src, record, xrefs) else: - self._get_xref_associated_with(key, src, r, associated_with) + self._get_xref_associated_with( + key, src, record, associated_with + ) else: _logger.warning(f"{key} not in schemas.py") @@ -143,7 +143,7 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None: gene["associated_with"] = associated_with def _get_xref_associated_with( - self, key: str, src: str, r: Dict, src_type: Dict + self, key: str, src: str, r: Dict, src_type: List[str] ) -> None: """Add an xref or associated_with ref to a gene record. @@ -194,6 +194,8 @@ def _get_location(self, r: Dict, gene: Dict) -> None: if not gene["location_annotations"]: del gene["location_annotations"] + _annotation_types = {v.value for v in Annotation.__members__.values()} + def _set_annotation(self, loc: str, gene: Dict) -> None: """Set the annotations attribute if one is provided. Return `True` if a location is provided, `False` otherwise. @@ -202,9 +204,7 @@ def _set_annotation(self, loc: str, gene: Dict) -> None: :param gene: in-progress gene record :return: A bool whether or not a gene map location is provided """ - annotations = {v.value for v in Annotation.__members__.values()} - - for annotation in annotations: + for annotation in self._annotation_types: if annotation in loc: gene["location_annotations"].append(annotation) # Check if location is also included @@ -256,11 +256,11 @@ def _add_meta(self) -> None: "complete_set_archive": "ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json" }, rdp_url=None, - data_license_attributes={ - "non_commercial": False, - "share_alike": False, - "attribution": False, - }, + data_license_attributes=DataLicenseAttributes( + non_commercial=False, + share_alike=False, + attribution=False, + ), genome_assemblies=[], ) self._database.add_source_metadata(SourceName.HGNC, metadata) diff --git a/src/gene/etl/ncbi.py b/src/gene/etl/ncbi.py index da5d97e8..427d57e6 100644 --- a/src/gene/etl/ncbi.py +++ b/src/gene/etl/ncbi.py @@ -15,9 +15,11 @@ PREFIX_LOOKUP, Annotation, Chromosome, + DataLicenseAttributes, NamespacePrefix, SourceMeta, SourceName, + StoredSequenceLocation, SymbolStatus, ) @@ -77,22 +79,22 @@ def _get_prev_symbols(self) -> Dict[str, str]: next(history) prev_symbols = {} for row in history: - # Only interested in rows that have homo sapiens tax id - if row[0] == "9606": - if row[1] != "-": - gene_id = row[1] - if gene_id in prev_symbols.keys(): - prev_symbols[gene_id].append(row[3]) - else: - prev_symbols[gene_id] = [row[3]] + if row[0] != "9606": + continue # humans only + if row[1] != "-": + gene_id = row[1] + if gene_id in prev_symbols.keys(): + prev_symbols[gene_id].append(row[3]) else: - # Load discontinued genes - params = { - "concept_id": f"{NamespacePrefix.NCBI.value}:{row[2]}", - "symbol": row[3], - "symbol_status": SymbolStatus.DISCONTINUED.value, - } - self._load_gene(params) + prev_symbols[gene_id] = [row[3]] + else: + # Load discontinued genes + params = { + "concept_id": f"{NamespacePrefix.NCBI.value}:{row[2]}", + "symbol": row[3], + "symbol_status": SymbolStatus.DISCONTINUED.value, + } + self._load_gene(params) history_file.close() return prev_symbols @@ -138,7 +140,7 @@ def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, str]: """Store genes from NCBI info file. :param prev_symbols: A dictionary of a gene's previous symbols - :return: A dictionary of gene's from the NCBI info file. + :return: A dictionary of genes from the NCBI info file. """ # open info file, skip headers info_file = open(self._info_src, "r") @@ -191,11 +193,10 @@ def _get_gene_gff(self, db: gffutils.FeatureDB, info_genes: Dict) -> None: if f_id.startswith("gene"): symbol = f.attributes["Name"][0] if symbol in info_genes: - # Just need to add SequenceLocation - params = info_genes.get(symbol) + params: Dict = info_genes.get(symbol) # type: ignore vrs_sq_location = self._get_vrs_sq_location(db, params, f_id) if vrs_sq_location: - params["locations"].append(vrs_sq_location) # type: ignore + params["locations"].append(vrs_sq_location) else: # Need to add entire gene gene = self._add_gff_gene(db, f, f_id) @@ -212,7 +213,6 @@ def _add_gff_gene( :return: A gene dictionary if the ID attribute exists. Else return None. """ params = dict() - params["src_name"] = SourceName.NCBI.value self._add_attributes(f, params) sq_loc = self._get_vrs_sq_location(db, params, f_id) if sq_loc: @@ -245,18 +245,18 @@ def _add_attributes(self, f: gffutils.feature.Feature, gene: Dict) -> None: def _get_vrs_sq_location( self, db: gffutils.FeatureDB, params: Dict, f_id: str - ) -> Dict: + ) -> Optional[StoredSequenceLocation]: """Store GA4GH VRS SequenceLocation in a gene record. https://vr-spec.readthedocs.io/en/1.1/terms_and_model.html#sequencelocation :param db: GFF database :param params: A transformed gene record :param f_id: The feature's ID - :return: A GA4GH VRS SequenceLocation + :return: A storable set of SequenceLocation params """ gene = db[f_id] params["strand"] = gene.strand - return self._get_sequence_location(gene.seqid, gene, params) + return self._build_sequence_location(gene.seqid, gene, params["concept_id"]) def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict: """Get xref or associated_with ref. @@ -493,11 +493,11 @@ def _add_meta(self) -> None: "assembly_file": self._assembly_url, }, rdp_url="https://reusabledata.org/ncbi-gene.html", - data_license_attributes={ - "non_commercial": False, - "share_alike": False, - "attribution": False, - }, + data_license_attributes=DataLicenseAttributes( + non_commercial=False, + share_alike=False, + attribution=False, + ), genome_assemblies=[self._assembly], ) diff --git a/src/gene/schemas.py b/src/gene/schemas.py index e6cb5183..d2c17d84 100644 --- a/src/gene/schemas.py +++ b/src/gene/schemas.py @@ -63,8 +63,8 @@ class MatchType(IntEnum): NO_MATCH = 0 -class GeneSequenceLocation(BaseModel): - """Sequence Location model when storing in DynamoDB.""" +class StoredSequenceLocation(BaseModel): + """Sequence Location model when storing in database.""" type: Literal["SequenceLocation"] = "SequenceLocation" start: StrictInt @@ -73,7 +73,7 @@ class GeneSequenceLocation(BaseModel): # class GeneChromosomeLocation(BaseModel): -# """Chromosome Location model when storing in DynamDB.""" +# """Chromosome Location model when storing in database.""" # type: Literal["ChromosomeLocation"] = "ChromosomeLocation" # species_id: Literal["taxonomy:9606"] = "taxonomy:9606" @@ -92,9 +92,10 @@ class BaseGene(BaseModel): symbol_status: Optional[SymbolStatus] = None label: Optional[StrictStr] = None strand: Optional[Strand] = None - location_annotations: List[StrictStr] = [] + location_annotations: List[Union[Annotation, Chromosome, StrictStr]] = [] locations: Union[ - List[models.SequenceLocation], List[GeneSequenceLocation] + List[models.SequenceLocation], + List[StoredSequenceLocation], # List[Union[SequenceLocation, ChromosomeLocation]], # List[Union[GeneSequenceLocation, GeneChromosomeLocation]] # dynamodb ] = [] @@ -123,7 +124,6 @@ class Gene(BaseGene): "strand": "-", "locations": [], "location_annotations": [], - "associated_with": [], "gene_type": None, "match_type": 100, } @@ -257,7 +257,7 @@ class SourceMeta(BaseModel): version: StrictStr data_url: Dict[StrictStr, StrictStr] # TODO strictness necessary? rdp_url: Optional[StrictStr] = None - data_license_attributes: Dict[StrictStr, StrictBool] + data_license_attributes: DataLicenseAttributes genome_assemblies: List[StrictStr] = [] model_config = ConfigDict( @@ -618,7 +618,6 @@ class UnmergedNormalizationService(BaseNormalizationService): "aliases": [], "previous_symbols": [], "xrefs": ["hgnc:108"], - "associated_with": [], "gene_type": "protein_coding", } ], @@ -669,8 +668,13 @@ class UnmergedNormalizationService(BaseNormalizationService): ], "aliases": ["YT", "ARACHE", "ACEE", "N-ACHE"], "previous_symbols": ["ACEE"], - "xrefs": ["hgnc:108", "ensembl:ENSG00000087085"], - "associated_with": ["omim:100740"], + "xrefs": [ + "hgnc:108", + "ensembl:ENSG00000087085", + ], + "associated_with": [ + "omim:100740", + ], "gene_type": "protein-coding", } ], diff --git a/tests/unit/test_ensembl_source.py b/tests/unit/test_ensembl_source.py index 1cfe0547..1ab55430 100644 --- a/tests/unit/test_ensembl_source.py +++ b/tests/unit/test_ensembl_source.py @@ -313,8 +313,6 @@ def test_meta_info(ensembl): } assert resp.source_meta_.rdp_url is None assert resp.source_meta_.genome_assemblies == ["GRCh38"] - assert resp.source_meta_.data_license_attributes == { - "non_commercial": False, - "share_alike": False, - "attribution": False, - } + assert resp.source_meta_.data_license_attributes.non_commercial is False + assert resp.source_meta_.data_license_attributes.share_alike is False + assert resp.source_meta_.data_license_attributes.attribution is False diff --git a/tests/unit/test_hgnc_source.py b/tests/unit/test_hgnc_source.py index 1f143139..185809fe 100644 --- a/tests/unit/test_hgnc_source.py +++ b/tests/unit/test_hgnc_source.py @@ -822,8 +822,6 @@ def test_meta_info(hgnc): } assert resp.source_meta_.rdp_url is None assert resp.source_meta_.genome_assemblies == [] - assert resp.source_meta_.data_license_attributes == { - "non_commercial": False, - "share_alike": False, - "attribution": False, - } + assert resp.source_meta_.data_license_attributes.non_commercial is False + assert resp.source_meta_.data_license_attributes.share_alike is False + assert resp.source_meta_.data_license_attributes.attribution is False diff --git a/tests/unit/test_ncbi_source.py b/tests/unit/test_ncbi_source.py index 4e384b04..f7b7508c 100644 --- a/tests/unit/test_ncbi_source.py +++ b/tests/unit/test_ncbi_source.py @@ -860,9 +860,9 @@ def test_no_match(ncbi, source_urls): assert datetime.strptime(response.source_meta_.version, "%Y%m%d") assert response.source_meta_.data_url == source_urls assert response.source_meta_.rdp_url == "https://reusabledata.org/ncbi-gene.html" - assert not response.source_meta_.data_license_attributes["non_commercial"] - assert not response.source_meta_.data_license_attributes["share_alike"] - assert not response.source_meta_.data_license_attributes["attribution"] + assert response.source_meta_.data_license_attributes.non_commercial is False + assert response.source_meta_.data_license_attributes.share_alike is False + assert response.source_meta_.data_license_attributes.attribution is False # check blank response = ncbi.search("") @@ -907,8 +907,6 @@ def test_meta(ncbi, source_urls): assert response.source_meta_.data_url == source_urls assert response.source_meta_.rdp_url == "https://reusabledata.org/ncbi-gene.html" assert response.source_meta_.genome_assemblies == ["GRCh38.p14"] - assert response.source_meta_.data_license_attributes == { - "non_commercial": False, - "share_alike": False, - "attribution": False, - } + assert response.source_meta_.data_license_attributes.non_commercial is False + assert response.source_meta_.data_license_attributes.share_alike is False + assert response.source_meta_.data_license_attributes.attribution is False