From 6fdff5341f30c64dc19f8b76e16f840142a16f15 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 21 Jul 2023 08:13:44 -0400 Subject: [PATCH] feat: provide strand in normalized record (#226) --- .../source/normalizing_data/normalization.rst | 5 ++++ gene/etl/merge.py | 16 +++++++++--- gene/query.py | 3 ++- gene/version.py | 2 +- tests/unit/test_query.py | 26 +++++++++++++++++++ 5 files changed, 46 insertions(+), 6 deletions(-) diff --git a/docs/source/normalizing_data/normalization.rst b/docs/source/normalizing_data/normalization.rst index 80a57d3b..162a85d5 100644 --- a/docs/source/normalizing_data/normalization.rst +++ b/docs/source/normalizing_data/normalization.rst @@ -118,6 +118,11 @@ Normalized records are structured as `Gene Descriptors 1: merged_attrs['xrefs'] = list({r['concept_id'] for r in records[1:]}) # merge from constituent records - set_fields = ["aliases", "associated_with", "previous_symbols"] - scalar_fields = ["symbol", "symbol_status", "label", "strand", - "location_annotations"] + set_fields = ["aliases", "associated_with", "previous_symbols", "strand"] + scalar_fields = ["symbol", "symbol_status", "label", "location_annotations"] for record in records: for field in set_fields: merged_attrs[field] |= set(record.get(field, set())) @@ -174,5 +174,13 @@ def record_order(record): else: del merged_attrs[field] + # ensure no conflicting strands + unique_strand_values = set(merged_attrs.get("strand", [])) + num_unique_strand_values = len(unique_strand_values) + if num_unique_strand_values > 1: + del merged_attrs["strand"] + elif num_unique_strand_values == 1: + merged_attrs["strand"] = list(unique_strand_values)[0] + merged_attrs['item_type'] = 'merger' return merged_attrs diff --git a/gene/query.py b/gene/query.py index b8d8e4ef..752da694 100644 --- a/gene/query.py +++ b/gene/query.py @@ -449,7 +449,8 @@ def _add_gene_descriptor( ("approved_name", "label"), ("associated_with", "associated_with"), ("previous_symbols", "previous_symbols"), - ("location_annotations", "location_annotations") + ("location_annotations", "location_annotations"), + ("strand", "strand") ] for ext_label, record_label in extension_and_record_labels: if record_label in record and record[record_label]: diff --git a/gene/version.py b/gene/version.py index a2082e10..ebcc436c 100644 --- a/gene/version.py +++ b/gene/version.py @@ -1,2 +1,2 @@ """Gene normalizer version""" -__version__ = "0.1.37" +__version__ = "0.1.38" diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index 53080cd6..63b29837 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -154,6 +154,11 @@ def normalized_ache(): "name": "ensembl_biotype", "type": "Extension", "value": "protein_coding" + }, + { + "name": "strand", + "type": "Extension", + "value": "-" } ] } @@ -284,6 +289,11 @@ def normalized_braf(): "name": "ensembl_biotype", "type": "Extension", "value": "protein_coding" + }, + { + "name": "strand", + "type": "Extension", + "value": "-" } ] } @@ -426,6 +436,11 @@ def normalized_abl1(): "name": "ensembl_biotype", "type": "Extension", "value": "protein_coding" + }, + { + "name": "strand", + "type": "Extension", + "value": "+" } ] } @@ -553,7 +568,13 @@ def normalized_p150(): }, { "name": "previous_symbols", + "type": "Extension", "value": ["LOC107985297"] + }, + { + "name": "strand", + "type": "Extension", + "value": "+" } ] } @@ -620,6 +641,11 @@ def normalized_loc_653303(): "type": "Extension", "name": "ncbi_gene_type", "value": "pseudo" + }, + { + "name": "strand", + "type": "Extension", + "value": "+" } ], "gene_id": "ncbigene:653303"