Skip to content

Commit

Permalink
feat: provide strand in normalized record (#226)
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson authored Jul 21, 2023
1 parent 2839deb commit 6fdff53
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 6 deletions.
5 changes: 5 additions & 0 deletions docs/source/normalizing_data/normalization.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,11 @@ Normalized records are structured as `Gene Descriptors <https://vrsatile.readthe
}
]
},
{
"type": "Extension",
"name": "strand",
"value": "-"
},
{
"type": "Extension",
"name": "ensembl_locations",
Expand Down
16 changes: 12 additions & 4 deletions gene/etl/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,15 +140,15 @@ def record_order(record):
"previous_symbols": set(),
"hgnc_locus_type": set(),
"ncbi_gene_type": set(),
-"ensembl_biotype": set()
+"ensembl_biotype": set(),
+"strand": set(),
}
if len(records) > 1:
merged_attrs['xrefs'] = list({r['concept_id'] for r in records[1:]})

# merge from constituent records
-set_fields = ["aliases", "associated_with", "previous_symbols"]
-scalar_fields = ["symbol", "symbol_status", "label", "strand",
-"location_annotations"]
+set_fields = ["aliases", "associated_with", "previous_symbols", "strand"]
+scalar_fields = ["symbol", "symbol_status", "label", "location_annotations"]
for record in records:
for field in set_fields:
merged_attrs[field] |= set(record.get(field, set()))
Expand All @@ -174,5 +174,13 @@ def record_order(record):
else:
del merged_attrs[field]

# ensure no conflicting strands
unique_strand_values = set(merged_attrs.get("strand", []))
num_unique_strand_values = len(unique_strand_values)
if num_unique_strand_values > 1:
del merged_attrs["strand"]
elif num_unique_strand_values == 1:
merged_attrs["strand"] = list(unique_strand_values)[0]

merged_attrs['item_type'] = 'merger'
return merged_attrs
3 changes: 2 additions & 1 deletion gene/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,8 @@ def _add_gene_descriptor(
("approved_name", "label"),
("associated_with", "associated_with"),
("previous_symbols", "previous_symbols"),
-("location_annotations", "location_annotations")
+("location_annotations", "location_annotations"),
+("strand", "strand")
]
for ext_label, record_label in extension_and_record_labels:
if record_label in record and record[record_label]:
Expand Down
2 changes: 1 addition & 1 deletion gene/version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
"""Gene normalizer version"""
-__version__ = "0.1.37"
+__version__ = "0.1.38"
26 changes: 26 additions & 0 deletions tests/unit/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,11 @@ def normalized_ache():
"name": "ensembl_biotype",
"type": "Extension",
"value": "protein_coding"
},
{
"name": "strand",
"type": "Extension",
"value": "-"
}
]
}
Expand Down Expand Up @@ -284,6 +289,11 @@ def normalized_braf():
"name": "ensembl_biotype",
"type": "Extension",
"value": "protein_coding"
},
{
"name": "strand",
"type": "Extension",
"value": "-"
}
]
}
Expand Down Expand Up @@ -426,6 +436,11 @@ def normalized_abl1():
"name": "ensembl_biotype",
"type": "Extension",
"value": "protein_coding"
},
{
"name": "strand",
"type": "Extension",
"value": "+"
}
]
}
Expand Down Expand Up @@ -553,7 +568,13 @@ def normalized_p150():
},
{
"name": "previous_symbols",
"type": "Extension",
"value": ["LOC107985297"]
},
{
"name": "strand",
"type": "Extension",
"value": "+"
}
]
}
Expand Down Expand Up @@ -620,6 +641,11 @@ def normalized_loc_653303():
"type": "Extension",
"name": "ncbi_gene_type",
"value": "pseudo"
},
{
"name": "strand",
"type": "Extension",
"value": "+"
}
],
"gene_id": "ncbigene:653303"
Expand Down

0 comments on commit 6fdff53

Please sign in to comment.