feat: provide strand in normalized record (#226)

cancervariants · Jul 21, 2023 · 6fdff53
1 parent 2839deb
commit 6fdff53
Show file tree

Hide file tree

Showing 5 changed files with 46 additions and 6 deletions.
diff --git a/docs/source/normalizing_data/normalization.rst b/docs/source/normalizing_data/normalization.rst
@@ -118,6 +118,11 @@ Normalized records are structured as `Gene Descriptors <https://vrsatile.readthe
             }
           ]
         },
+        {
+          "type": "Extension",
+          "name": "strand",
+          "value": "-"
+        },
         {
           "type": "Extension",
           "name": "ensembl_locations",

diff --git a/gene/etl/merge.py b/gene/etl/merge.py
@@ -140,15 +140,15 @@ def record_order(record):
             "previous_symbols": set(),
             "hgnc_locus_type": set(),
             "ncbi_gene_type": set(),
-            "ensembl_biotype": set()
+            "ensembl_biotype": set(),
+            "strand": set(),
         }
         if len(records) > 1:
             merged_attrs['xrefs'] = list({r['concept_id'] for r in records[1:]})
 
         # merge from constituent records
-        set_fields = ["aliases", "associated_with", "previous_symbols"]
-        scalar_fields = ["symbol", "symbol_status", "label", "strand",
-                         "location_annotations"]
+        set_fields = ["aliases", "associated_with", "previous_symbols", "strand"]
+        scalar_fields = ["symbol", "symbol_status", "label", "location_annotations"]
         for record in records:
             for field in set_fields:
                 merged_attrs[field] |= set(record.get(field, set()))
@@ -174,5 +174,13 @@ def record_order(record):
             else:
                 del merged_attrs[field]
 
+        # ensure no conflicting strands
+        unique_strand_values = set(merged_attrs.get("strand", []))
+        num_unique_strand_values = len(unique_strand_values)
+        if num_unique_strand_values > 1:
+            del merged_attrs["strand"]
+        elif num_unique_strand_values == 1:
+            merged_attrs["strand"] = list(unique_strand_values)[0]
+
         merged_attrs['item_type'] = 'merger'
         return merged_attrs
diff --git a/gene/query.py b/gene/query.py
@@ -449,7 +449,8 @@ def _add_gene_descriptor(
             ("approved_name", "label"),
             ("associated_with", "associated_with"),
             ("previous_symbols", "previous_symbols"),
-            ("location_annotations", "location_annotations")
+            ("location_annotations", "location_annotations"),
+            ("strand", "strand")
         ]
         for ext_label, record_label in extension_and_record_labels:
             if record_label in record and record[record_label]:

diff --git a/gene/version.py b/gene/version.py
@@ -1,2 +1,2 @@
 """Gene normalizer version"""
-__version__ = "0.1.37"
+__version__ = "0.1.38"
diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py
@@ -154,6 +154,11 @@ def normalized_ache():
                 "name": "ensembl_biotype",
                 "type": "Extension",
                 "value": "protein_coding"
+            },
+            {
+                "name": "strand",
+                "type": "Extension",
+                "value": "-"
             }
         ]
     }
@@ -284,6 +289,11 @@ def normalized_braf():
                 "name": "ensembl_biotype",
                 "type": "Extension",
                 "value": "protein_coding"
+            },
+            {
+                "name": "strand",
+                "type": "Extension",
+                "value": "-"
             }
         ]
     }
@@ -426,6 +436,11 @@ def normalized_abl1():
                 "name": "ensembl_biotype",
                 "type": "Extension",
                 "value": "protein_coding"
+            },
+            {
+                "name": "strand",
+                "type": "Extension",
+                "value": "+"
             }
         ]
     }
@@ -553,7 +568,13 @@ def normalized_p150():
             },
             {
                 "name": "previous_symbols",
+                "type": "Extension",
                 "value": ["LOC107985297"]
+            },
+            {
+                "name": "strand",
+                "type": "Extension",
+                "value": "+"
             }
         ]
     }
@@ -620,6 +641,11 @@ def normalized_loc_653303():
                 "type": "Extension",
                 "name": "ncbi_gene_type",
                 "value": "pseudo"
+            },
+            {
+                "name": "strand",
+                "type": "Extension",
+                "value": "+"
             }
         ],
         "gene_id": "ncbigene:653303"