From c1d87ac115ef10305ebd3e7a5996eee39f0f6936 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Mon, 11 Dec 2023 20:10:14 -0500 Subject: [PATCH] style: fix ruff settings --- .pre-commit-config.yaml | 3 +- docs/scripts/generate_normalize_figure.py | 88 +- docs/source/conf.py | 8 +- docs/source/contributing.rst | 2 +- pyproject.toml | 36 +- src/gene/__init__.py | 22 +- src/gene/cli.py | 108 +- src/gene/database/__init__.py | 9 - src/gene/database/database.py | 32 +- src/gene/database/dynamodb.py | 232 ++-- src/gene/database/postgresql.py | 292 ++--- src/gene/etl/__init__.py | 12 +- src/gene/etl/base.py | 44 +- src/gene/etl/ensembl.py | 118 +- src/gene/etl/hgnc.py | 176 +-- src/gene/etl/merge.py | 80 +- src/gene/etl/ncbi.py | 278 ++--- src/gene/main.py | 64 +- src/gene/query.py | 164 +-- src/gene/schemas.py | 608 +++++------ src/gene/version.py | 2 +- tests/conftest.py | 20 +- tests/unit/test_database_and_etl.py | 144 +-- tests/unit/test_emit_warnings.py | 12 +- tests/unit/test_endpoints.py | 18 +- tests/unit/test_ensembl_source.py | 290 ++--- tests/unit/test_hgnc_source.py | 740 ++++++------- tests/unit/test_ncbi_source.py | 718 ++++++------- tests/unit/test_query.py | 1194 ++++++++++----------- tests/unit/test_schemas.py | 54 +- 30 files changed, 2782 insertions(+), 2786 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2b0ba8a9..1c696108 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,5 +10,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.1.2 hooks: - - id: ruff - id: ruff-format + - id: ruff + args: [ --fix, --exit-non-zero-on-fix ] diff --git a/docs/scripts/generate_normalize_figure.py b/docs/scripts/generate_normalize_figure.py index 1a39a085..d5863a38 100644 --- a/docs/scripts/generate_normalize_figure.py +++ b/docs/scripts/generate_normalize_figure.py @@ -18,9 +18,9 @@ from gene.schemas import UnmergedNormalizationService COLORS = [ - "#F8766D", - "#00BA38", - "#00B9E3", + '#F8766D', + '#00BA38', + '#00B9E3', ] @@ -30,50 +30,50 @@ def create_gjgf(result: UnmergedNormalizationService) -> Dict: :param result: result from Unmerged Normalization search """ graph = { - "graph": { - "label": "tmp", - "nodes": {}, - "edges": [], - "metadata": { - "arrow_size": 15, - "node_size": 15, - "node_label_size": 20, - "edge_size": 2, + 'graph': { + 'label': 'tmp', + 'nodes': {}, + 'edges': [], + 'metadata': { + 'arrow_size': 15, + 'node_size': 15, + 'node_label_size': 20, + 'edge_size': 2, }, } } - for i, (source, matches) in enumerate(result.source_matches.items()): + for i, (_, matches) in enumerate(result.source_matches.items()): for match in matches.records: - graph["graph"]["nodes"][match.concept_id] = { - "metadata": { - "color": COLORS[i], - "hover": f"{match.concept_id}\n{match.symbol}\n{match.label}", # noqa: E501 - "click": f"

{json.dumps(match.model_dump(), indent=2)}

", # noqa: E501 + graph['graph']['nodes'][match.concept_id] = { + 'metadata': { + 'color': COLORS[i], + 'hover': f'{match.concept_id}\n{match.symbol}\n{match.label}', + 'click': f"

{json.dumps(match.model_dump(), indent=2)}

", } } for xref in match.xrefs: - graph["graph"]["edges"].append( - {"source": match.concept_id, "target": xref} + graph['graph']['edges'].append( + {'source': match.concept_id, 'target': xref} ) included_edges = [] - for edge in graph["graph"]["edges"]: + for edge in graph['graph']['edges']: if ( - edge["target"] in graph["graph"]["nodes"] - and edge["source"] in graph["graph"]["nodes"] + edge['target'] in graph['graph']['nodes'] + and edge['source'] in graph['graph']['nodes'] ): included_edges.append(edge) - graph["graph"]["edges"] = included_edges + graph['graph']['edges'] = included_edges - included_nodes = {k["source"] for k in graph["graph"]["edges"]}.union( - {k["target"] for k in graph["graph"]["edges"]} + included_nodes = {k['source'] for k in graph['graph']['edges']}.union( + {k['target'] for k in graph['graph']['edges']} ) new_nodes = {} - for key, value in graph["graph"]["nodes"].items(): + for key, value in graph['graph']['nodes'].items(): if key in included_nodes: new_nodes[key] = value - graph["graph"]["nodes"] = new_nodes + graph['graph']['nodes'] = new_nodes return graph @@ -82,8 +82,8 @@ def gen_norm_figure() -> None: """Generate normalized graph figure for docs.""" q = QueryHandler(create_db()) - otx2p1 = "OTX2P1" - otx2p2 = "OTX2P2" + otx2p1 = 'OTX2P1' + otx2p2 = 'OTX2P2' otx2p1_result = q.normalize_unmerged(otx2p1) otx2p2_result = q.normalize_unmerged(otx2p2) @@ -91,15 +91,15 @@ def gen_norm_figure() -> None: otx2p1_graph = create_gjgf(otx2p1_result) otx2p2_graph = create_gjgf(otx2p2_result) - nodes = otx2p1_graph["graph"]["nodes"] - nodes.update(otx2p2_graph["graph"]["nodes"]) + nodes = otx2p1_graph['graph']['nodes'] + nodes.update(otx2p2_graph['graph']['nodes']) graph = { - "graph": { - "label": f"Reference network for {otx2p1} and {otx2p2}", - "metadata": otx2p1_graph["graph"]["metadata"], - "nodes": nodes, - "edges": otx2p1_graph["graph"]["edges"] + otx2p2_graph["graph"]["edges"], + 'graph': { + 'label': f'Reference network for {otx2p1} and {otx2p2}', + 'metadata': otx2p1_graph['graph']['metadata'], + 'nodes': nodes, + 'edges': otx2p1_graph['graph']['edges'] + otx2p2_graph['graph']['edges'], } } @@ -107,20 +107,20 @@ def gen_norm_figure() -> None: data=graph, graph_height=250, node_hover_neighborhood=True, - node_label_font="arial", + node_label_font='arial', ) fig.export_html( ( APP_ROOT.parents[0] - / "docs" - / "source" - / "_static" - / "html" - / "normalize_example.html" + / 'docs' + / 'source' + / '_static' + / 'html' + / 'normalize_example.html' ).absolute(), overwrite=True, ) -if __name__ == "__main__": +if __name__ == '__main__': gen_norm_figure() diff --git a/docs/source/conf.py b/docs/source/conf.py index 46a04cce..9526290b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -57,14 +57,14 @@ ], } # -- autodoc things ---------------------------------------------------------- -import os # noqa: E402 -import sys # noqa: E402 +import os +import sys sys.path.insert(0, os.path.abspath("../../gene")) autodoc_preserve_defaults = True # -- get version ------------------------------------------------------------- -from gene import __version__ # noqa: E402 +from gene import __version__ version = __version__ release = version @@ -77,7 +77,7 @@ def linkcode_resolve(domain, info): if not info["module"]: return None filename = info["module"].replace(".", "/") - return f"https://github.com/cancervariants/gene-normalization/blob/main/{filename}.py" # noqa: E501 + return f"https://github.com/cancervariants/gene-normalization/blob/main/{filename}.py" # -- code block style -------------------------------------------------------- diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst index 2ec80652..af533cba 100644 --- a/docs/source/contributing.rst +++ b/docs/source/contributing.rst @@ -48,7 +48,7 @@ When running the web server, enable hot-reloading on new code changes: :: Style ----- -Code style is managed by `Ruff `_ and `Black `_, and should be checked via pre-commit hook before commits. Final QC is applied with GitHub Actions to every pull request. +Code style is managed by `Ruff `_, and should be checked via pre-commit hook before commits. Final QC is applied with GitHub Actions to every pull request. Tests ----- diff --git a/pyproject.toml b/pyproject.toml index 41c8f7fd..f1981eb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,38 +92,44 @@ exclude = ["docs/source/conf.py"] # pycodestyle (E, W) # Pyflakes (F) # flake8-annotations (ANN) -# flake8-quotes (Q) # pydocstyle (D) # pep8-naming (N) # isort (I) -select = ["E", "W", "F", "ANN", "Q", "D", "N", "I"] - +select = ["E", "W", "F", "ANN", "D", "N", "I"] fixable = ["I", "F401"] # D203 - one-blank-line-before-class # D205 - blank-line-after-summary +# D206 - indent-with-spaces* # D213 - multi-line-summary-second-line +# D300 - triple-single-quotes # D400 - ends-in-period # D415 - ends-in-punctuation # ANN101 - missing-type-self # ANN003 - missing-type-kwargs -# E501 - line-too-long -ignore = ["D203", "D205", "D213", "D400", "D415", "ANN101", "ANN003", "E501"] +# E111 - indentation-with-invalid-multiple* +# E114 - indentation-with-invalid-multiple-comment* +# E117 - over-indented* +# E501 - line-too-long* +# W191 - tab-indentation* +# *ignored for compatibility with formatter +ignore = [ + "D203", "D205", "D206", "D213", "D300", "D400", "D415", + "ANN101", "ANN003", + "E111", "E114", "E117", "E501", + "W191" +] -[tool.ruff.flake8-quotes] -docstring-quotes = "double" +[tool.ruff.format] +quote-style = "single" [tool.ruff.per-file-ignores] # ANN001 - missing-type-function-argument +# ANN102 - missing-type-cls # ANN2 - missing-return-type # ANN201 - Missing type annotation -# ANN102 - missing-type-cls -# D103 - Missing docstring in public function -# F821 - undefined-name -# F401 - unused-import -# I001 - Import block unsorted or unformatted +# D301 - escape-sequence-in-docstring # N805 - invalid-first-argument-name-for-method "tests/*" = ["ANN001", "ANN102", "ANN2"] -"*__init__.py" = ["F401"] -"gene/schemas.py" = ["ANN001", "ANN201", "N805"] -"docs/source/conf.py" = ["D100", "I001", "D103", "ANN201", "ANN001"] +"src/gene/schemas.py" = ["ANN001", "ANN201", "N805"] +"src/gene/cli.py" = ["D301"] diff --git a/src/gene/__init__.py b/src/gene/__init__.py index 2c569554..9dd82081 100644 --- a/src/gene/__init__.py +++ b/src/gene/__init__.py @@ -3,27 +3,25 @@ from os import environ from pathlib import Path -from .version import __version__ # noqa: F401 - APP_ROOT = Path(__file__).resolve().parent logging.basicConfig( - filename="gene.log", format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s" + filename='gene.log', format='[%(asctime)s] - %(name)s - %(levelname)s : %(message)s' ) -logger = logging.getLogger("gene") +logger = logging.getLogger('gene') logger.setLevel(logging.DEBUG) logger.handlers = [] -logging.getLogger("boto3").setLevel(logging.INFO) -logging.getLogger("botocore").setLevel(logging.INFO) -logging.getLogger("urllib3").setLevel(logging.INFO) -logging.getLogger("python_jsonschema_objects").setLevel(logging.INFO) -logging.getLogger("biocommons.seqrepo.seqaliasdb.seqaliasdb").setLevel(logging.INFO) -logging.getLogger("biocommons.seqrepo.fastadir.fastadir").setLevel(logging.INFO) +logging.getLogger('boto3').setLevel(logging.INFO) +logging.getLogger('botocore').setLevel(logging.INFO) +logging.getLogger('urllib3').setLevel(logging.INFO) +logging.getLogger('python_jsonschema_objects').setLevel(logging.INFO) +logging.getLogger('biocommons.seqrepo.seqaliasdb.seqaliasdb').setLevel(logging.INFO) +logging.getLogger('biocommons.seqrepo.fastadir.fastadir').setLevel(logging.INFO) SEQREPO_ROOT_DIR = Path( - environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest") + environ.get('SEQREPO_ROOT_DIR', '/usr/local/share/seqrepo/latest') ) @@ -61,5 +59,5 @@ class DownloadException(Exception): # noqa: N818 NAMESPACE_LOOKUP = { v.value.lower(): NamespacePrefix[k].value for k, v in SourceIDAfterNamespace.__members__.items() - if v.value != "" + if v.value != '' } diff --git a/src/gene/cli.py b/src/gene/cli.py index a14ce952..f19a6537 100644 --- a/src/gene/cli.py +++ b/src/gene/cli.py @@ -17,13 +17,13 @@ from gene.database.database import DatabaseException from gene.schemas import SourceName -logger = logging.getLogger("gene") +logger = logging.getLogger('gene') logger.setLevel(logging.DEBUG) @click.command() -@click.option("--db_url", help="URL endpoint for the application database.") -@click.option("--verbose", "-v", is_flag=True, help="Print result to console if set.") +@click.option('--db_url', help='URL endpoint for the application database.') +@click.option('--verbose', '-v', is_flag=True, help='Print result to console if set.') def check_db(db_url: str, verbose: bool = False) -> None: """Perform basic checks on DB health and population. Exits with status code 1 if DB schema is uninitialized or if critical tables appear to be empty. @@ -31,25 +31,25 @@ def check_db(db_url: str, verbose: bool = False) -> None: \f :param db_url: URL to normalizer database :param verbose: if true, print result to console - """ # noqa: D301 + """ db = create_db(db_url, False) if not db.check_schema_initialized(): if verbose: - click.echo("Health check failed: DB schema uninitialized.") + click.echo('Health check failed: DB schema uninitialized.') click.get_current_context().exit(1) if not db.check_tables_populated(): if verbose: - click.echo("Health check failed: DB is incompletely populated.") + click.echo('Health check failed: DB is incompletely populated.') click.get_current_context().exit(1) if verbose: - click.echo("DB health check successful: tables appear complete.") + click.echo('DB health check successful: tables appear complete.') @click.command() -@click.option("--data_url", help="URL to data dump") -@click.option("--db_url", help="URL endpoint for the application database.") +@click.option('--data_url', help='URL to data dump') +@click.option('--db_url', help='URL endpoint for the application database.') def update_from_remote(data_url: Optional[str], db_url: str) -> None: """Update data from remotely-hosted DB dump. By default, fetches from latest available dump on VICC S3 bucket; specific URLs can be provided instead by @@ -58,52 +58,52 @@ def update_from_remote(data_url: Optional[str], db_url: str) -> None: \f :param data_url: user-specified location to pull DB dump from :param db_url: URL to normalizer database - """ # noqa: D301 - if not click.confirm("Are you sure you want to overwrite existing data?"): + """ + if not click.confirm('Are you sure you want to overwrite existing data?'): click.get_current_context().exit() if not data_url: - data_url = os.environ.get("GENE_NORM_REMOTE_DB_URL") + data_url = os.environ.get('GENE_NORM_REMOTE_DB_URL') db = create_db(db_url, False) try: db.load_from_remote(data_url) except NotImplementedError: click.echo( - f"Error: Fetching remote data dump not supported for {db.__class__.__name__}" - ) # noqa: E501 + f'Error: Fetching remote data dump not supported for {db.__class__.__name__}' + ) click.get_current_context().exit(1) except DatabaseException as e: - click.echo(f"Encountered exception during update: {str(e)}") + click.echo(f'Encountered exception during update: {str(e)}') click.get_current_context().exit(1) @click.command() @click.option( - "--output_directory", - "-o", - help="Output location to write to", + '--output_directory', + '-o', + help='Output location to write to', type=click.Path(exists=True, path_type=Path), ) -@click.option("--db_url", help="URL endpoint for the application database.") +@click.option('--db_url', help='URL endpoint for the application database.') def dump_database(output_directory: Path, db_url: str) -> None: """Dump data from database into file. \f :param output_directory: path to existing directory :param db_url: URL to normalizer database - """ # noqa: D301 + """ if not output_directory: - output_directory = Path(".") + output_directory = Path('.') db = create_db(db_url, False) try: db.export_db(output_directory) except NotImplementedError: click.echo( - f"Error: Dumping data to file not supported for {db.__class__.__name__}" - ) # noqa: E501 + f'Error: Dumping data to file not supported for {db.__class__.__name__}' + ) click.get_current_context().exit(1) except DatabaseException as e: - click.echo(f"Encountered exception during update: {str(e)}") + click.echo(f'Encountered exception during update: {str(e)}') click.get_current_context().exit(1) @@ -138,20 +138,20 @@ def _delete_source(n: SourceName, db: AbstractDatabase) -> float: :param db: database instance :return: time taken (in seconds) to delete """ - msg = f"Deleting {n.value}..." - click.echo(f"\n{msg}") + msg = f'Deleting {n.value}...' + click.echo(f'\n{msg}') logger.info(msg) start_delete = timer() db.delete_source(n) end_delete = timer() delete_time = end_delete - start_delete - msg = f"Deleted {n.value} in {delete_time:.5f} seconds." - click.echo(f"{msg}\n") + msg = f'Deleted {n.value} in {delete_time:.5f} seconds.' + click.echo(f'{msg}\n') logger.info(msg) return delete_time -_etl_dependency_help = "Are ETL dependencies installed? See the Installation page in the documentation for more info." +_etl_dependency_help = 'Are ETL dependencies installed? See the Installation page in the documentation for more info.' def _load_source( @@ -170,7 +170,7 @@ def _load_source( :param use_existing: if True, use most recent local data files instead of fetching from remote """ - msg = f"Loading {n.value}..." + msg = f'Loading {n.value}...' click.echo(msg) logger.info(msg) start_load = timer() @@ -181,7 +181,7 @@ def _load_source( from gene.etl.exceptions import GeneNormalizerEtlError except ModuleNotFoundError as e: click.echo( - f"Encountered ModuleNotFoundError attempting to import {e.name}. {_etl_dependency_help}" + f'Encountered ModuleNotFoundError attempting to import {e.name}. {_etl_dependency_help}' ) click.get_current_context().exit() SourceClass = eval(n.value) # noqa: N806 @@ -191,14 +191,14 @@ def _load_source( processed_ids += source.perform_etl(use_existing) except GeneNormalizerEtlError as e: logger.error(e) - click.echo(f"Encountered error while loading {n}: {e}.") + click.echo(f'Encountered error while loading {n}: {e}.') click.get_current_context().exit() end_load = timer() load_time = end_load - start_load - msg = f"Loaded {n.value} in {load_time:.5f} seconds." + msg = f'Loaded {n.value} in {load_time:.5f} seconds.' click.echo(msg) logger.info(msg) - msg = f"Total time for {n.value}: {(delete_time + load_time):.5f} seconds." + msg = f'Total time for {n.value}: {(delete_time + load_time):.5f} seconds.' click.echo(msg) logger.info(msg) @@ -208,15 +208,15 @@ def _delete_normalized_data(database: AbstractDatabase) -> None: :param database: DB instance """ - click.echo("\nDeleting normalized records...") + click.echo('\nDeleting normalized records...') start_delete = timer() try: database.delete_normalized_concepts() except (DatabaseReadException, DatabaseWriteException) as e: - click.echo(f"Encountered exception during normalized data deletion: {e}") + click.echo(f'Encountered exception during normalized data deletion: {e}') end_delete = timer() delete_time = end_delete - start_delete - click.echo(f"Deleted normalized records in {delete_time:.5f} seconds.") + click.echo(f'Deleted normalized records in {delete_time:.5f} seconds.') def _load_merge(db: AbstractDatabase, processed_ids: Set[str]) -> None: @@ -234,34 +234,34 @@ def _load_merge(db: AbstractDatabase, processed_ids: Set[str]) -> None: from gene.etl.merge import Merge except ModuleNotFoundError as e: click.echo( - f"Encountered ModuleNotFoundError attempting to import {e.name}. {_etl_dependency_help}" + f'Encountered ModuleNotFoundError attempting to import {e.name}. {_etl_dependency_help}' ) click.get_current_context().exit() merge = Merge(database=db) - click.echo("Constructing normalized records...") + click.echo('Constructing normalized records...') merge.create_merged_concepts(processed_ids) end = timer() click.echo( - f"Merged concept generation completed in " f"{(end - start):.5f} seconds" + f'Merged concept generation completed in ' f'{(end - start):.5f} seconds' ) @click.command() -@click.option("--sources", help="The source(s) you wish to update separated by spaces.") -@click.option("--aws_instance", is_flag=True, help="Using AWS DynamodDB instance.") -@click.option("--db_url", help="URL endpoint for the application database.") -@click.option("--update_all", is_flag=True, help="Update all normalizer sources.") +@click.option('--sources', help='The source(s) you wish to update separated by spaces.') +@click.option('--aws_instance', is_flag=True, help='Using AWS DynamodDB instance.') +@click.option('--db_url', help='URL endpoint for the application database.') +@click.option('--update_all', is_flag=True, help='Update all normalizer sources.') @click.option( - "--update_merged", + '--update_merged', is_flag=True, - help="Update concepts for normalize endpoint from accepted sources.", + help='Update concepts for normalize endpoint from accepted sources.', ) @click.option( - "--use_existing", + '--use_existing', is_flag=True, default=False, - help="Use most recent local source data instead of fetching latest version", + help='Use most recent local source data instead of fetching latest version', ) def update_normalizer_db( sources: str, @@ -285,7 +285,7 @@ def update_normalizer_db( :param update_all: if true, update all sources (ignore `normalizer` parameter) :param update_merged: if true, update normalized records :param use_existing: if True, use most recent local data instead of fetching latest version - """ # noqa: D301 + """ db = create_db(db_url, aws_instance) if update_all: @@ -296,24 +296,24 @@ def update_normalizer_db( else: ctx = click.get_current_context() click.echo( - "Must either enter 1 or more sources, or use `--update_all` parameter" - ) # noqa: E501 + 'Must either enter 1 or more sources, or use `--update_all` parameter' + ) click.echo(ctx.get_help()) ctx.exit() else: sources_split = sources.lower().split() if len(sources_split) == 0: - raise Exception("Must enter 1 or more source names to update") + raise Exception('Must enter 1 or more source names to update') non_sources = set(sources_split) - set(SOURCES) if len(non_sources) != 0: - raise Exception(f"Not valid source(s): {non_sources}") + raise Exception(f'Not valid source(s): {non_sources}') parsed_source_names = {SourceName(SOURCES[s]) for s in sources_split} _update_normalizer(parsed_source_names, db, update_merged, use_existing) -if __name__ == "__main__": +if __name__ == '__main__': update_normalizer_db() diff --git a/src/gene/database/__init__.py b/src/gene/database/__init__.py index 3a71e721..216d9fb2 100644 --- a/src/gene/database/__init__.py +++ b/src/gene/database/__init__.py @@ -1,10 +1 @@ """Provide database clients.""" -from .database import ( - AWS_ENV_VAR_NAME, - AbstractDatabase, - DatabaseException, - DatabaseInitializationException, - DatabaseReadException, - DatabaseWriteException, - create_db, -) diff --git a/src/gene/database/database.py b/src/gene/database/database.py index 67bcafd6..93ef3cdf 100644 --- a/src/gene/database/database.py +++ b/src/gene/database/database.py @@ -61,12 +61,12 @@ def _check_delete_okay() -> bool: :raise DatabaseWriteException: if skip confirmation variable is set -- manual approval is required. """ - if environ.get(AWS_ENV_VAR_NAME, "") == AwsEnvName.PRODUCTION: - if environ.get(SKIP_AWS_DB_ENV_NAME, "") == "true": + if environ.get(AWS_ENV_VAR_NAME, '') == AwsEnvName.PRODUCTION: + if environ.get(SKIP_AWS_DB_ENV_NAME, '') == 'true': raise DatabaseWriteException( - f"Must unset {SKIP_AWS_DB_ENV_NAME} env variable to enable drop_db()" # noqa: E501 + f'Must unset {SKIP_AWS_DB_ENV_NAME} env variable to enable drop_db()' ) - return click.confirm("Are you sure you want to delete existing data?") + return click.confirm('Are you sure you want to delete existing data?') else: return True @@ -242,19 +242,19 @@ def export_db(self, export_location: Path) -> None: # can be set to either `Dev`, `Staging`, or `Prod` # ONLY set when wanting to access aws instance -AWS_ENV_VAR_NAME = "GENE_NORM_ENV" +AWS_ENV_VAR_NAME = 'GENE_NORM_ENV' # Set to "true" if want to skip db confirmation check. Should ONLY be used for # deployment needs -SKIP_AWS_DB_ENV_NAME = "SKIP_AWS_CONFIRMATION" +SKIP_AWS_DB_ENV_NAME = 'SKIP_AWS_CONFIRMATION' class AwsEnvName(str, Enum): """AWS environment name that is being used""" - DEVELOPMENT = "Dev" - STAGING = "Staging" - PRODUCTION = "Prod" + DEVELOPMENT = 'Dev' + STAGING = 'Staging' + PRODUCTION = 'Prod' VALID_AWS_ENV_NAMES = {v.value for v in AwsEnvName.__members__.values()} @@ -263,11 +263,11 @@ class AwsEnvName(str, Enum): def confirm_aws_db_use(env_name: str) -> None: """Check to ensure that AWS instance should actually be used.""" if click.confirm( - f"Are you sure you want to use the AWS {env_name} database?", default=False + f'Are you sure you want to use the AWS {env_name} database?', default=False ): - click.echo(f"***GENE AWS {env_name.upper()} DATABASE IN USE***") + click.echo(f'***GENE AWS {env_name.upper()} DATABASE IN USE***') else: - click.echo("Exiting.") + click.echo('Exiting.') sys.exit() @@ -324,13 +324,13 @@ def create_db( else: if db_url: endpoint_url = db_url - elif "GENE_NORM_DB_URL" in environ.keys(): - endpoint_url = environ["GENE_NORM_DB_URL"] + elif 'GENE_NORM_DB_URL' in environ.keys(): + endpoint_url = environ['GENE_NORM_DB_URL'] else: - endpoint_url = "http://localhost:8000" + endpoint_url = 'http://localhost:8000' # prefer DynamoDB unless connection explicitly reads like a libpq URI - if endpoint_url.startswith("postgres"): + if endpoint_url.startswith('postgres'): from gene.database.postgresql import PostgresDatabase db = PostgresDatabase(endpoint_url) diff --git a/src/gene/database/dynamodb.py b/src/gene/database/dynamodb.py index 5df9e0d0..b7658aa2 100644 --- a/src/gene/database/dynamodb.py +++ b/src/gene/database/dynamodb.py @@ -39,48 +39,48 @@ def __init__(self, db_url: Optional[str] = None, **db_args) -> None: * region_name: AWS region (defaults to "us-east-2") :raise DatabaseInitializationException: if initial setup fails """ - self.gene_table = environ.get("GENE_DYNAMO_TABLE", "gene_normalizer") - region_name = db_args.get("region_name", "us-east-2") + self.gene_table = environ.get('GENE_DYNAMO_TABLE', 'gene_normalizer') + region_name = db_args.get('region_name', 'us-east-2') if AWS_ENV_VAR_NAME in environ: - if "GENE_TEST" in environ: + if 'GENE_TEST' in environ: raise DatabaseInitializationException( - f"Cannot have both GENE_TEST and {AWS_ENV_VAR_NAME} set." - ) # noqa: E501 + f'Cannot have both GENE_TEST and {AWS_ENV_VAR_NAME} set.' + ) aws_env = environ[AWS_ENV_VAR_NAME] if aws_env not in VALID_AWS_ENV_NAMES: raise DatabaseInitializationException( - f"{AWS_ENV_VAR_NAME} must be one of {VALID_AWS_ENV_NAMES}" - ) # noqa: E501 + f'{AWS_ENV_VAR_NAME} must be one of {VALID_AWS_ENV_NAMES}' + ) skip_confirmation = environ.get(SKIP_AWS_DB_ENV_NAME) if (not skip_confirmation) or ( - skip_confirmation and skip_confirmation != "true" - ): # noqa: E501 + skip_confirmation and skip_confirmation != 'true' + ): confirm_aws_db_use(environ[AWS_ENV_VAR_NAME]) - boto_params = {"region_name": region_name} + boto_params = {'region_name': region_name} if aws_env == AwsEnvName.DEVELOPMENT: self.gene_table = environ.get( - "GENE_DYNAMO_TABLE", "gene_normalizer_nonprod" + 'GENE_DYNAMO_TABLE', 'gene_normalizer_nonprod' ) else: if db_url: endpoint_url = db_url - elif "GENE_NORM_DB_URL" in environ: - endpoint_url = environ["GENE_NORM_DB_URL"] + elif 'GENE_NORM_DB_URL' in environ: + endpoint_url = environ['GENE_NORM_DB_URL'] else: - endpoint_url = "http://localhost:8000" - click.echo(f"***Using Gene Database Endpoint: {endpoint_url}***") - boto_params = {"region_name": region_name, "endpoint_url": endpoint_url} + endpoint_url = 'http://localhost:8000' + click.echo(f'***Using Gene Database Endpoint: {endpoint_url}***') + boto_params = {'region_name': region_name, 'endpoint_url': endpoint_url} - self.dynamodb = boto3.resource("dynamodb", **boto_params) - self.dynamodb_client = boto3.client("dynamodb", **boto_params) + self.dynamodb = boto3.resource('dynamodb', **boto_params) + self.dynamodb_client = boto3.client('dynamodb', **boto_params) # Only create tables for local instance - envs_do_not_create_tables = {AWS_ENV_VAR_NAME, "GENE_TEST"} + envs_do_not_create_tables = {AWS_ENV_VAR_NAME, 'GENE_TEST'} if not set(envs_do_not_create_tables) & set(environ): self.initialize_db() @@ -94,7 +94,7 @@ def list_tables(self) -> List[str]: :return: Table names in DynamoDB """ - return self.dynamodb_client.list_tables()["TableNames"] + return self.dynamodb_client.list_tables()['TableNames'] def drop_db(self) -> None: """Delete all tables from database. Requires manual confirmation. @@ -116,36 +116,36 @@ def _create_genes_table(self) -> None: self.dynamodb.create_table( TableName=self.gene_table, KeySchema=[ - {"AttributeName": "label_and_type", "KeyType": "HASH"}, # Partition key - {"AttributeName": "concept_id", "KeyType": "RANGE"}, # Sort key + {'AttributeName': 'label_and_type', 'KeyType': 'HASH'}, # Partition key + {'AttributeName': 'concept_id', 'KeyType': 'RANGE'}, # Sort key ], AttributeDefinitions=[ - {"AttributeName": "label_and_type", "AttributeType": "S"}, - {"AttributeName": "concept_id", "AttributeType": "S"}, - {"AttributeName": "src_name", "AttributeType": "S"}, - {"AttributeName": "item_type", "AttributeType": "S"}, + {'AttributeName': 'label_and_type', 'AttributeType': 'S'}, + {'AttributeName': 'concept_id', 'AttributeType': 'S'}, + {'AttributeName': 'src_name', 'AttributeType': 'S'}, + {'AttributeName': 'item_type', 'AttributeType': 'S'}, ], GlobalSecondaryIndexes=[ { - "IndexName": "src_index", - "KeySchema": [{"AttributeName": "src_name", "KeyType": "HASH"}], - "Projection": {"ProjectionType": "KEYS_ONLY"}, - "ProvisionedThroughput": { - "ReadCapacityUnits": 10, - "WriteCapacityUnits": 10, + 'IndexName': 'src_index', + 'KeySchema': [{'AttributeName': 'src_name', 'KeyType': 'HASH'}], + 'Projection': {'ProjectionType': 'KEYS_ONLY'}, + 'ProvisionedThroughput': { + 'ReadCapacityUnits': 10, + 'WriteCapacityUnits': 10, }, }, { - "IndexName": "item_type_index", - "KeySchema": [{"AttributeName": "item_type", "KeyType": "HASH"}], - "Projection": {"ProjectionType": "KEYS_ONLY"}, - "ProvisionedThroughput": { - "ReadCapacityUnits": 10, - "WriteCapacityUnits": 10, + 'IndexName': 'item_type_index', + 'KeySchema': [{'AttributeName': 'item_type', 'KeyType': 'HASH'}], + 'Projection': {'ProjectionType': 'KEYS_ONLY'}, + 'ProvisionedThroughput': { + 'ReadCapacityUnits': 10, + 'WriteCapacityUnits': 10, }, }, ], - ProvisionedThroughput={"ReadCapacityUnits": 10, "WriteCapacityUnits": 10}, + ProvisionedThroughput={'ReadCapacityUnits': 10, 'WriteCapacityUnits': 10}, ) def check_schema_initialized(self) -> bool: @@ -156,7 +156,7 @@ def check_schema_initialized(self) -> bool: existing_tables = self.list_tables() exists = self.gene_table in existing_tables if not exists: - logger.info(f"{self.gene_table} table is missing or unavailable.") + logger.info(f'{self.gene_table} table is missing or unavailable.') return exists def check_tables_populated(self) -> bool: @@ -169,29 +169,29 @@ def check_tables_populated(self) -> bool: :return: True if queries successful, false if DB appears empty """ sources = self.genes.query( - IndexName="item_type_index", - KeyConditionExpression=Key("item_type").eq("source"), - ).get("Items", []) + IndexName='item_type_index', + KeyConditionExpression=Key('item_type').eq('source'), + ).get('Items', []) if len(sources) < len(SourceName): - logger.info("Gene sources table is missing expected sources.") + logger.info('Gene sources table is missing expected sources.') return False records = self.genes.query( - IndexName="item_type_index", - KeyConditionExpression=Key("item_type").eq("identity"), + IndexName='item_type_index', + KeyConditionExpression=Key('item_type').eq('identity'), Limit=1, ) - if len(records.get("Items", [])) < 1: - logger.info("Gene records index is empty.") + if len(records.get('Items', [])) < 1: + logger.info('Gene records index is empty.') return False normalized_records = self.genes.query( - IndexName="item_type_index", - KeyConditionExpression=Key("item_type").eq(RecordType.MERGER.value), + IndexName='item_type_index', + KeyConditionExpression=Key('item_type').eq(RecordType.MERGER.value), Limit=1, ) - if len(normalized_records.get("Items", [])) < 1: - logger.info("Normalized gene records index is empty.") + if len(normalized_records.get('Items', [])) < 1: + logger.info('Normalized gene records index is empty.') return False return True @@ -211,14 +211,14 @@ def get_source_metadata(self, src_name: Union[str, SourceName]) -> Dict: if src_name in self._cached_sources: return self._cached_sources[src_name] else: - pk = f"{src_name.lower()}##source" - concept_id = f"source:{src_name.lower()}" + pk = f'{src_name.lower()}##source' + concept_id = f'source:{src_name.lower()}' metadata = self.genes.get_item( - Key={"label_and_type": pk, "concept_id": concept_id} - ).get("Item") + Key={'label_and_type': pk, 'concept_id': concept_id} + ).get('Item') if not metadata: raise DatabaseReadException( - f"Unable to retrieve data for source {src_name}" + f'Unable to retrieve data for source {src_name}' ) self._cached_sources[src_name] = metadata return metadata @@ -238,19 +238,19 @@ def get_record_by_id( """ try: if merge: - pk = f"{concept_id.lower()}##{RecordType.MERGER.value}" + pk = f'{concept_id.lower()}##{RecordType.MERGER.value}' else: - pk = f"{concept_id.lower()}##{RecordType.IDENTITY.value}" + pk = f'{concept_id.lower()}##{RecordType.IDENTITY.value}' if case_sensitive: match = self.genes.get_item( - Key={"label_and_type": pk, "concept_id": concept_id} + Key={'label_and_type': pk, 'concept_id': concept_id} ) - return match["Item"] + return match['Item'] else: - exp = Key("label_and_type").eq(pk) + exp = Key('label_and_type').eq(pk) response = self.genes.query(KeyConditionExpression=exp) - record = response["Items"][0] - del record["label_and_type"] + record = response['Items'][0] + del record['label_and_type'] return record except ClientError as e: logger.error( @@ -270,11 +270,11 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: :param ref_type: type of match to look for. :return: list of associated concept IDs. Empty if lookup fails. """ - pk = f"{search_term}##{ref_type.value.lower()}" - filter_exp = Key("label_and_type").eq(pk) + pk = f'{search_term}##{ref_type.value.lower()}' + filter_exp = Key('label_and_type').eq(pk) try: matches = self.genes.query(KeyConditionExpression=filter_exp) - return [m["concept_id"] for m in matches.get("Items", None)] + return [m['concept_id'] for m in matches.get('Items', None)] except ClientError as e: logger.error( f"boto3 client error on get_refs_by_type for " @@ -291,7 +291,7 @@ def get_all_concept_ids(self) -> Set[str]: last_evaluated_key = None concept_ids = [] params = { - "ProjectionExpression": "concept_id", + 'ProjectionExpression': 'concept_id', } while True: if last_evaluated_key: @@ -300,10 +300,10 @@ def get_all_concept_ids(self) -> Set[str]: ) else: response = self.genes.scan(**params) - records = response["Items"] + records = response['Items'] for record in records: - concept_ids.append(record["concept_id"]) - last_evaluated_key = response.get("LastEvaluatedKey") + concept_ids.append(record['concept_id']) + last_evaluated_key = response.get('LastEvaluatedKey') if not last_evaluated_key: break return set(concept_ids) @@ -332,19 +332,19 @@ def get_all_records(self, record_type: RecordType) -> Generator[Dict, None, None ) else: response = self.genes.scan() - records = response.get("Items", []) + records = response.get('Items', []) for record in records: - incoming_record_type = record.get("item_type") + incoming_record_type = record.get('item_type') if record_type == RecordType.IDENTITY: if incoming_record_type == record_type: yield record else: if ( incoming_record_type == RecordType.IDENTITY - and not record.get("merge_ref") # noqa: E501 + and not record.get('merge_ref') ) or incoming_record_type == RecordType.MERGER: yield record - last_evaluated_key = response.get("LastEvaluatedKey") + last_evaluated_key = response.get('LastEvaluatedKey') if not last_evaluated_key: break @@ -357,10 +357,10 @@ def add_source_metadata(self, src_name: SourceName, metadata: SourceMeta) -> Non """ src_name_value = src_name.value metadata_item = metadata.model_dump() - metadata_item["src_name"] = src_name_value - metadata_item["label_and_type"] = f"{str(src_name_value).lower()}##source" - metadata_item["concept_id"] = f"source:{str(src_name_value).lower()}" - metadata_item["item_type"] = "source" + metadata_item['src_name'] = src_name_value + metadata_item['label_and_type'] = f'{str(src_name_value).lower()}##source' + metadata_item['concept_id'] = f'source:{str(src_name_value).lower()}' + metadata_item['item_type'] = 'source' try: self.genes.put_item(Item=metadata_item) except ClientError as e: @@ -372,11 +372,11 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: :param Dict record: record to upload :param SourceName src_name: name of source for record """ - concept_id = record["concept_id"] - record["src_name"] = src_name.value - label_and_type = f"{concept_id.lower()}##identity" - record["label_and_type"] = label_and_type - record["item_type"] = "identity" + concept_id = record['concept_id'] + record['src_name'] = src_name.value + label_and_type = f'{concept_id.lower()}##identity' + record['label_and_type'] = label_and_type + record['item_type'] = 'identity' try: self.batch.put_item(Item=record) except ClientError as e: @@ -395,7 +395,7 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: items = {item.lower() for item in value} for item in items: self._add_ref_record( - item, record["concept_id"], item_type, src_name + item, record['concept_id'], item_type, src_name ) def add_merged_record(self, record: Dict) -> None: @@ -403,12 +403,12 @@ def add_merged_record(self, record: Dict) -> None: :param record: merged record to add """ - concept_id = record["concept_id"] - id_prefix = concept_id.split(":")[0].lower() - record["src_name"] = PREFIX_LOOKUP[id_prefix] - label_and_type = f"{concept_id.lower()}##{RecordType.MERGER.value}" - record["label_and_type"] = label_and_type - record["item_type"] = RecordType.MERGER.value + concept_id = record['concept_id'] + id_prefix = concept_id.split(':')[0].lower() + record['src_name'] = PREFIX_LOOKUP[id_prefix] + label_and_type = f'{concept_id.lower()}##{RecordType.MERGER.value}' + record['label_and_type'] = label_and_type + record['item_type'] = RecordType.MERGER.value try: self.batch.put_item(Item=record) except ClientError as e: @@ -428,12 +428,12 @@ def _add_ref_record( 'associated_with'} :param src_name: name of source for record """ - label_and_type = f"{term.lower()}##{ref_type}" + label_and_type = f'{term.lower()}##{ref_type}' record = { - "label_and_type": label_and_type, - "concept_id": concept_id.lower(), - "src_name": src_name.value, - "item_type": ref_type, + 'label_and_type': label_and_type, + 'concept_id': concept_id.lower(), + 'src_name': src_name.value, + 'item_type': ref_type, } try: self.batch.put_item(Item=record) @@ -451,11 +451,11 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN :param merge_ref: new ref value :raise DatabaseWriteException: if attempting to update non-existent record """ - label_and_type = f"{concept_id.lower()}##identity" - key = {"label_and_type": label_and_type, "concept_id": concept_id} - update_expression = "set merge_ref=:r" - update_values = {":r": merge_ref.lower()} - condition_expression = "attribute_exists(label_and_type)" + label_and_type = f'{concept_id.lower()}##identity' + key = {'label_and_type': label_and_type, 'concept_id': concept_id} + update_expression = 'set merge_ref=:r' + update_values = {':r': merge_ref.lower()} + condition_expression = 'attribute_exists(label_and_type)' try: self.genes.update_item( Key=key, @@ -464,10 +464,10 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN ConditionExpression=condition_expression, ) except ClientError as e: - code = e.response.get("Error", {}).get("Code") - if code == "ConditionalCheckFailedException": + code = e.response.get('Error', {}).get('Code') + if code == 'ConditionalCheckFailedException': raise DatabaseWriteException( - f"No such record exists for keys {label_and_type}, {concept_id}" + f'No such record exists for keys {label_and_type}, {concept_id}' ) else: logger.error( @@ -485,25 +485,25 @@ def delete_normalized_concepts(self) -> None: """ while True: with self.genes.batch_writer( - overwrite_by_pkeys=["label_and_type", "concept_id"] + overwrite_by_pkeys=['label_and_type', 'concept_id'] ) as batch: try: response = self.genes.query( - IndexName="item_type_index", - KeyConditionExpression=Key("item_type").eq( + IndexName='item_type_index', + KeyConditionExpression=Key('item_type').eq( RecordType.MERGER.value ), ) except ClientError as e: raise DatabaseReadException(e) - records = response["Items"] + records = response['Items'] if not records: break for record in records: batch.delete_item( Key={ - "label_and_type": record["label_and_type"], - "concept_id": record["concept_id"], + 'label_and_type': record['label_and_type'], + 'concept_id': record['concept_id'], } ) @@ -518,23 +518,23 @@ def delete_source(self, src_name: SourceName) -> None: while True: try: response = self.genes.query( - IndexName="src_index", - KeyConditionExpression=Key("src_name").eq(src_name.value), + IndexName='src_index', + KeyConditionExpression=Key('src_name').eq(src_name.value), ) except ClientError as e: raise DatabaseReadException(e) - records = response["Items"] + records = response['Items'] if not records: break with self.genes.batch_writer( - overwrite_by_pkeys=["label_and_type", "concept_id"] + overwrite_by_pkeys=['label_and_type', 'concept_id'] ) as batch: for record in records: try: batch.delete_item( Key={ - "label_and_type": record["label_and_type"], - "concept_id": record["concept_id"], + 'label_and_type': record['label_and_type'], + 'concept_id': record['concept_id'], } ) except ClientError as e: diff --git a/src/gene/database/postgresql.py b/src/gene/database/postgresql.py index f62a1819..90924c8c 100644 --- a/src/gene/database/postgresql.py +++ b/src/gene/database/postgresql.py @@ -29,7 +29,7 @@ logger = logging.getLogger(__name__) -SCRIPTS_DIR = Path(__file__).parent / "postgresql" +SCRIPTS_DIR = Path(__file__).parent / 'postgresql' class PostgresDatabase(AbstractDatabase): @@ -56,16 +56,16 @@ def __init__(self, db_url: Optional[str] = None, **db_args) -> None: """ if db_url: conninfo = db_url - elif "GENE_NORM_DB_URL" in os.environ: - conninfo = os.environ["GENE_NORM_DB_URL"] + elif 'GENE_NORM_DB_URL' in os.environ: + conninfo = os.environ['GENE_NORM_DB_URL'] else: - user = db_args.get("user", "postgres") - password = db_args.get("password", "") - db_name = db_args.get("db_name", "gene_normalizer") + user = db_args.get('user', 'postgres') + password = db_args.get('password', '') + db_name = db_args.get('db_name', 'gene_normalizer') if password: - conninfo = f"dbname={db_name} user={user} password={password}" + conninfo = f'dbname={db_name} user={user} password={password}' else: - conninfo = f"dbname={db_name} user={user}" + conninfo = f'dbname={db_name} user={user}' self.conn = psycopg.connect(conninfo) self.initialize_db() @@ -119,7 +119,7 @@ def drop_db(self) -> None: with self.conn.cursor() as cur: cur.execute(self._drop_db_query) self.conn.commit() - logger.info("Dropped all existing gene normalizer tables.") + logger.info('Dropped all existing gene normalizer tables.') def check_schema_initialized(self) -> bool: """Check if database schema is properly initialized. @@ -128,48 +128,48 @@ def check_schema_initialized(self) -> bool: """ try: with self.conn.cursor() as cur: - cur.execute((SCRIPTS_DIR / "create_tables.sql").read_bytes()) + cur.execute((SCRIPTS_DIR / 'create_tables.sql').read_bytes()) except DuplicateTable: self.conn.rollback() else: - logger.info("Gene table existence check failed.") + logger.info('Gene table existence check failed.') self.conn.rollback() return False try: with self.conn.cursor() as cur: - cur.execute((SCRIPTS_DIR / "add_fkeys.sql").read_bytes()) + cur.execute((SCRIPTS_DIR / 'add_fkeys.sql').read_bytes()) except DuplicateObject: self.conn.rollback() else: - logger.info("Gene foreign key existence check failed.") + logger.info('Gene foreign key existence check failed.') self.conn.rollback() return False try: with self.conn.cursor() as cur: cur.execute( - (SCRIPTS_DIR / "create_record_lookup_view.sql").read_bytes() + (SCRIPTS_DIR / 'create_record_lookup_view.sql').read_bytes() ) except DuplicateTable: self.conn.rollback() else: - logger.info("Gene normalized view lookup failed.") + logger.info('Gene normalized view lookup failed.') self.conn.rollback() return False try: with self.conn.cursor() as cur: - cur.execute((SCRIPTS_DIR / "add_indexes.sql").read_bytes()) + cur.execute((SCRIPTS_DIR / 'add_indexes.sql').read_bytes()) except DuplicateTable: self.conn.rollback() else: - logger.info("Gene indexes check failed.") + logger.info('Gene indexes check failed.') self.conn.rollback() return False return True - _check_sources_query = b"SELECT name FROM gene_sources;" - _check_concepts_query = b"SELECT COUNT(1) FROM gene_concepts LIMIT 1;" - _check_merged_query = b"SELECT COUNT(1) FROM gene_merged LIMIT 1;" + _check_sources_query = b'SELECT name FROM gene_sources;' + _check_concepts_query = b'SELECT COUNT(1) FROM gene_concepts LIMIT 1;' + _check_merged_query = b'SELECT COUNT(1) FROM gene_merged LIMIT 1;' def check_tables_populated(self) -> bool: """Perform rudimentary checks to see if tables are populated. @@ -184,21 +184,21 @@ def check_tables_populated(self) -> bool: cur.execute(self._check_sources_query) results = cur.fetchall() if len(results) < len(SourceName): - logger.info("Gene sources table is missing expected sources.") + logger.info('Gene sources table is missing expected sources.') return False with self.conn.cursor() as cur: cur.execute(self._check_concepts_query) result = cur.fetchone() if not result or result[0] < 1: - logger.info("Gene records table is empty.") + logger.info('Gene records table is empty.') return False with self.conn.cursor() as cur: cur.execute(self._check_merged_query) result = cur.fetchone() if not result or result[0] < 1: - logger.info("Normalized gene records table is empty.") + logger.info('Normalized gene records table is empty.') return False return True @@ -213,12 +213,12 @@ def initialize_db(self) -> None: def _create_views(self) -> None: """Create materialized views.""" - create_view_query = (SCRIPTS_DIR / "create_record_lookup_view.sql").read_bytes() + create_view_query = (SCRIPTS_DIR / 'create_record_lookup_view.sql').read_bytes() with self.conn.cursor() as cur: cur.execute(create_view_query) self.conn.commit() - _refresh_views_query = b"REFRESH MATERIALIZED VIEW record_lookup_view;" + _refresh_views_query = b'REFRESH MATERIALIZED VIEW record_lookup_view;' def _refresh_views(self) -> None: """Update materialized views. @@ -232,36 +232,36 @@ def _refresh_views(self) -> None: def _add_fkeys(self) -> None: """Add fkey relationships.""" - add_fkey_query = (SCRIPTS_DIR / "add_fkeys.sql").read_bytes() + add_fkey_query = (SCRIPTS_DIR / 'add_fkeys.sql').read_bytes() with self.conn.cursor() as cur: cur.execute(add_fkey_query) self.conn.commit() def _drop_fkeys(self) -> None: """Drop fkey relationships.""" - drop_fkey_query = (SCRIPTS_DIR / "drop_fkeys.sql").read_bytes() + drop_fkey_query = (SCRIPTS_DIR / 'drop_fkeys.sql').read_bytes() with self.conn.cursor() as cur: cur.execute(drop_fkey_query) self.conn.commit() def _add_indexes(self) -> None: """Create core search indexes.""" - add_indexes_query = (SCRIPTS_DIR / "add_indexes.sql").read_bytes() + add_indexes_query = (SCRIPTS_DIR / 'add_indexes.sql').read_bytes() with self.conn.cursor() as cur: cur.execute(add_indexes_query) self.conn.commit() def _drop_indexes(self) -> None: """Drop all custom indexes.""" - drop_indexes_query = (SCRIPTS_DIR / "drop_indexes.sql").read_bytes() + drop_indexes_query = (SCRIPTS_DIR / 'drop_indexes.sql').read_bytes() with self.conn.cursor() as cur: cur.execute(drop_indexes_query) self.conn.commit() def _create_tables(self) -> None: """Create all tables, indexes, and views.""" - logger.debug("Creating new gene normalizer tables.") - tables_query = (SCRIPTS_DIR / "create_tables.sql").read_bytes() + logger.debug('Creating new gene normalizer tables.') + tables_query = (SCRIPTS_DIR / 'create_tables.sql').read_bytes() with self.conn.cursor() as cur: cur.execute(tables_query) @@ -278,30 +278,30 @@ def get_source_metadata(self, src_name: SourceName) -> Dict: if src_name in self._cached_sources: return self._cached_sources[src_name] - metadata_query = "SELECT * FROM gene_sources WHERE name = %s;" + metadata_query = 'SELECT * FROM gene_sources WHERE name = %s;' with self.conn.cursor() as cur: cur.execute(metadata_query, [src_name]) metadata_result = cur.fetchone() if not metadata_result: - raise DatabaseReadException(f"{src_name} metadata lookup failed") + raise DatabaseReadException(f'{src_name} metadata lookup failed') metadata = { - "data_license": metadata_result[1], - "data_license_url": metadata_result[2], - "version": metadata_result[3], - "data_url": metadata_result[4], - "rdp_url": metadata_result[5], - "data_license_attributes": { - "non_commercial": metadata_result[6], - "attribution": metadata_result[7], - "share_alike": metadata_result[8], + 'data_license': metadata_result[1], + 'data_license_url': metadata_result[2], + 'version': metadata_result[3], + 'data_url': metadata_result[4], + 'rdp_url': metadata_result[5], + 'data_license_attributes': { + 'non_commercial': metadata_result[6], + 'attribution': metadata_result[7], + 'share_alike': metadata_result[8], }, - "genome_assemblies": metadata_result[9], + 'genome_assemblies': metadata_result[9], } self._cached_sources[src_name] = metadata return metadata _get_record_query = ( - b"SELECT * FROM record_lookup_view WHERE lower(concept_id) = %s;" # noqa: E501 + b'SELECT * FROM record_lookup_view WHERE lower(concept_id) = %s;' ) def _format_source_record(self, source_row: Tuple) -> Dict: @@ -311,21 +311,21 @@ def _format_source_record(self, source_row: Tuple) -> Dict: :return: reformatted dictionary keying gene properties to row values """ gene_record = { - "concept_id": source_row[0], - "symbol_status": source_row[1], - "label": source_row[2], - "strand": source_row[3], - "location_annotations": source_row[4], - "locations": source_row[5], - "gene_type": source_row[6], - "aliases": source_row[7], - "associated_with": source_row[8], - "previous_symbols": source_row[9], - "symbol": source_row[10], - "xrefs": source_row[11], - "src_name": source_row[12], - "merge_ref": source_row[13], - "item_type": RecordType.IDENTITY.value, + 'concept_id': source_row[0], + 'symbol_status': source_row[1], + 'label': source_row[2], + 'strand': source_row[3], + 'location_annotations': source_row[4], + 'locations': source_row[5], + 'gene_type': source_row[6], + 'aliases': source_row[7], + 'associated_with': source_row[8], + 'previous_symbols': source_row[9], + 'symbol': source_row[10], + 'xrefs': source_row[11], + 'src_name': source_row[12], + 'merge_ref': source_row[13], + 'item_type': RecordType.IDENTITY.value, } return {k: v for k, v in gene_record.items() if v} @@ -354,28 +354,28 @@ def _format_merged_record(self, merged_row: Tuple) -> Dict: :return: reformatted dictionary keying normalized gene properties to row values """ merged_record = { - "concept_id": merged_row[0], - "symbol": merged_row[1], - "symbol_status": merged_row[2], - "previous_symbols": merged_row[3], - "label": merged_row[4], - "strand": merged_row[5], - "ensembl_locations": merged_row[6], - "hgnc_locations": merged_row[7], - "ncbi_locations": merged_row[8], - "location_annotations": merged_row[9], - "ensembl_biotype": merged_row[10], - "hgnc_locus_type": merged_row[11], - "ncbi_gene_type": merged_row[12], - "aliases": merged_row[13], - "associated_with": merged_row[14], - "xrefs": merged_row[15], - "item_type": RecordType.MERGER.value, + 'concept_id': merged_row[0], + 'symbol': merged_row[1], + 'symbol_status': merged_row[2], + 'previous_symbols': merged_row[3], + 'label': merged_row[4], + 'strand': merged_row[5], + 'ensembl_locations': merged_row[6], + 'hgnc_locations': merged_row[7], + 'ncbi_locations': merged_row[8], + 'location_annotations': merged_row[9], + 'ensembl_biotype': merged_row[10], + 'hgnc_locus_type': merged_row[11], + 'ncbi_gene_type': merged_row[12], + 'aliases': merged_row[13], + 'associated_with': merged_row[14], + 'xrefs': merged_row[15], + 'item_type': RecordType.MERGER.value, } return {k: v for k, v in merged_record.items() if v} _get_merged_record_query = ( - b"SELECT * FROM gene_merged WHERE lower(concept_id) = %s;" # noqa: E501 + b'SELECT * FROM gene_merged WHERE lower(concept_id) = %s;' ) def _get_merged_record( @@ -412,11 +412,11 @@ def get_record_by_id( return self._get_record(concept_id, case_sensitive) _ref_types_query = { - RefType.SYMBOL: b"SELECT concept_id FROM gene_symbols WHERE lower(symbol) = %s;", # noqa: E501 - RefType.PREVIOUS_SYMBOLS: b"SELECT concept_id FROM gene_previous_symbols WHERE lower(prev_symbol) = %s;", # noqa: E501 - RefType.ALIASES: b"SELECT concept_id FROM gene_aliases WHERE lower(alias) = %s;", # noqa: E501 - RefType.XREFS: b"SELECT concept_id FROM gene_xrefs WHERE lower(xref) = %s;", - RefType.ASSOCIATED_WITH: b"SELECT concept_id FROM gene_associations WHERE lower(associated_with) = %s;", # noqa: E501 + RefType.SYMBOL: b'SELECT concept_id FROM gene_symbols WHERE lower(symbol) = %s;', + RefType.PREVIOUS_SYMBOLS: b'SELECT concept_id FROM gene_previous_symbols WHERE lower(prev_symbol) = %s;', + RefType.ALIASES: b'SELECT concept_id FROM gene_aliases WHERE lower(alias) = %s;', + RefType.XREFS: b'SELECT concept_id FROM gene_xrefs WHERE lower(xref) = %s;', + RefType.ASSOCIATED_WITH: b'SELECT concept_id FROM gene_associations WHERE lower(associated_with) = %s;', } def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: @@ -429,7 +429,7 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: """ query = self._ref_types_query.get(ref_type) if not query: - raise ValueError("invalid reference type") + raise ValueError('invalid reference type') with self.conn.cursor() as cur: cur.execute(query, (search_term.lower(),)) @@ -439,7 +439,7 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: else: return [] - _ids_query = b"SELECT concept_id FROM gene_concepts;" + _ids_query = b'SELECT concept_id FROM gene_concepts;' def get_all_concept_ids(self) -> Set[str]: """Retrieve concept IDs for use in generating normalized records. @@ -451,11 +451,11 @@ def get_all_concept_ids(self) -> Set[str]: ids_tuple = cur.fetchall() return {i[0] for i in ids_tuple} - _get_all_normalized_records_query = b"SELECT * FROM gene_merged;" + _get_all_normalized_records_query = b'SELECT * FROM gene_merged;' _get_all_unmerged_source_records_query = ( - b"SELECT * FROM record_lookup_view WHERE merge_ref IS NULL;" # noqa: E501 + b'SELECT * FROM record_lookup_view WHERE merge_ref IS NULL;' ) - _get_all_source_records_query = b"SELECT * FROM record_lookup_view;" + _get_all_source_records_query = b'SELECT * FROM record_lookup_view;' def get_all_records(self, record_type: RecordType) -> Generator[Dict, None, None]: """Retrieve all source or normalized records. Either return all source records, @@ -530,9 +530,9 @@ def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None: meta.version, json.dumps(meta.data_url), meta.rdp_url, - meta.data_license_attributes["non_commercial"], - meta.data_license_attributes["attribution"], - meta.data_license_attributes["share_alike"], + meta.data_license_attributes['non_commercial'], + meta.data_license_attributes['attribution'], + meta.data_license_attributes['share_alike'], meta.genome_assemblies, ], ) @@ -546,15 +546,15 @@ def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None: VALUES (%s, %s, %s, %s, %s, %s, %s, %s); """ _ins_symbol_query = ( - b"INSERT INTO gene_symbols (symbol, concept_id) VALUES (%s, %s);" + b'INSERT INTO gene_symbols (symbol, concept_id) VALUES (%s, %s);' ) _ins_prev_symbol_query = ( - b"INSERT INTO gene_previous_symbols (prev_symbol, concept_id) VALUES (%s, %s);" + b'INSERT INTO gene_previous_symbols (prev_symbol, concept_id) VALUES (%s, %s);' ) - _ins_alias_query = b"INSERT INTO gene_aliases (alias, concept_id) VALUES (%s, %s);" - _ins_xref_query = b"INSERT INTO gene_xrefs (xref, concept_id) VALUES (%s, %s);" + _ins_alias_query = b'INSERT INTO gene_aliases (alias, concept_id) VALUES (%s, %s);' + _ins_xref_query = b'INSERT INTO gene_xrefs (xref, concept_id) VALUES (%s, %s);' _ins_assoc_query = ( - b"INSERT INTO gene_associations (associated_with, concept_id) VALUES (%s, %s);" + b'INSERT INTO gene_associations (associated_with, concept_id) VALUES (%s, %s);' ) def add_record(self, record: Dict, src_name: SourceName) -> None: @@ -563,8 +563,8 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: :param record: record to upload :param src_name: name of source for record. Not used by PostgreSQL instance. """ - concept_id = record["concept_id"] - locations = [json.dumps(loc) for loc in record.get("locations", [])] + concept_id = record['concept_id'] + locations = [json.dumps(loc) for loc in record.get('locations', [])] if not locations: locations = None with self.conn.cursor() as cur: @@ -573,28 +573,28 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: self._add_record_query, [ concept_id, - record["src_name"], - record.get("symbol_status"), - record.get("label"), - record.get("strand"), - record.get("location_annotations"), + record['src_name'], + record.get('symbol_status'), + record.get('label'), + record.get('strand'), + record.get('location_annotations'), locations, - record.get("gene_type"), + record.get('gene_type'), ], ) - for a in record.get("aliases", []): + for a in record.get('aliases', []): cur.execute(self._ins_alias_query, [a, concept_id]) - for x in record.get("xrefs", []): + for x in record.get('xrefs', []): cur.execute(self._ins_xref_query, [x, concept_id]) - for a in record.get("associated_with", []): + for a in record.get('associated_with', []): cur.execute(self._ins_assoc_query, [a, concept_id]) - for p in record.get("previous_symbols", []): + for p in record.get('previous_symbols', []): cur.execute(self._ins_prev_symbol_query, [p, concept_id]) - if record.get("symbol"): - cur.execute(self._ins_symbol_query, [record["symbol"], concept_id]) + if record.get('symbol'): + cur.execute(self._ins_symbol_query, [record['symbol'], concept_id]) self.conn.commit() except UniqueViolation: - logger.error(f"Record with ID {concept_id} already exists") + logger.error(f'Record with ID {concept_id} already exists') self.conn.rollback() _add_merged_record_query = b""" @@ -612,35 +612,35 @@ def add_merged_record(self, record: Dict) -> None: :param record: merged record to add """ - ensembl_locations = record.get("ensembl_locations") + ensembl_locations = record.get('ensembl_locations') if ensembl_locations: ensembl_locations = [json.dumps(i) for i in ensembl_locations] - ncbi_locations = record.get("ncbi_locations") + ncbi_locations = record.get('ncbi_locations') if ncbi_locations: ncbi_locations = [json.dumps(i) for i in ncbi_locations] - hgnc_locations = record.get("hgnc_locations") + hgnc_locations = record.get('hgnc_locations') if hgnc_locations: hgnc_locations = [json.dumps(i) for i in hgnc_locations] with self.conn.cursor() as cur: cur.execute( self._add_merged_record_query, [ - record["concept_id"], - record.get("symbol"), - record.get("symbol_status"), - record.get("previous_symbols"), - record.get("label"), - record.get("strand"), - record.get("location_annotations"), + record['concept_id'], + record.get('symbol'), + record.get('symbol_status'), + record.get('previous_symbols'), + record.get('label'), + record.get('strand'), + record.get('location_annotations'), ensembl_locations, hgnc_locations, ncbi_locations, - record.get("hgnc_locus_type"), - record.get("ensembl_biotype"), - record.get("ncbi_gene_type"), - record.get("aliases"), - record.get("associated_with"), - record.get("xrefs"), + record.get('hgnc_locus_type'), + record.get('ensembl_biotype'), + record.get('ncbi_gene_type'), + record.get('aliases'), + record.get('associated_with'), + record.get('xrefs'), ], ) self.conn.commit() @@ -661,7 +661,7 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN with self.conn.cursor() as cur: cur.execute( self._update_merge_ref_query, - {"merge_ref": merge_ref, "concept_id": concept_id}, + {'merge_ref': merge_ref, 'concept_id': concept_id}, ) row_count = cur.rowcount self.conn.commit() @@ -669,7 +669,7 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN # UPDATE will fail silently unless we check the # of affected rows if row_count < 1: raise DatabaseWriteException( - f"No such record exists for primary key {concept_id}" + f'No such record exists for primary key {concept_id}' ) def delete_normalized_concepts(self) -> None: @@ -687,7 +687,7 @@ def delete_normalized_concepts(self) -> None: :raise DatabaseWriteException: if deletion call fails """ with self.conn.cursor() as cur: - cur.execute((SCRIPTS_DIR / "delete_normalized_concepts.sql").read_bytes()) + cur.execute((SCRIPTS_DIR / 'delete_normalized_concepts.sql').read_bytes()) self.conn.commit() _drop_aliases_query = b""" @@ -725,8 +725,8 @@ def delete_normalized_concepts(self) -> None: WHERE gc.source = %s ); """ - _drop_concepts_query = b"DELETE FROM gene_concepts WHERE source = %s;" - _drop_source_query = b"DELETE FROM gene_sources gs WHERE gs.name = %s;" + _drop_concepts_query = b'DELETE FROM gene_concepts WHERE source = %s;' + _drop_source_query = b'DELETE FROM gene_sources gs WHERE gs.name = %s;' def delete_source(self, src_name: SourceName) -> None: """Delete all data for a source. Use when updating source data. @@ -784,35 +784,35 @@ def load_from_remote(self, url: Optional[str]) -> None: command fails """ if not url: - url = "https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_latest.sql.tar.gz" # noqa: E501 + url = 'https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_latest.sql.tar.gz' with tempfile.TemporaryDirectory() as tempdir: tempdir_path = Path(tempdir) - temp_tarfile = tempdir_path / "gene_norm_latest.tar.gz" + temp_tarfile = tempdir_path / 'gene_norm_latest.tar.gz' with requests.get(url, stream=True) as r: try: r.raise_for_status() except requests.HTTPError: raise DatabaseException( - f"Unable to retrieve PostgreSQL dump file from {url}" + f'Unable to retrieve PostgreSQL dump file from {url}' ) - with open(temp_tarfile, "wb") as h: + with open(temp_tarfile, 'wb') as h: for chunk in r.iter_content(chunk_size=8192): if chunk: h.write(chunk) - tar = tarfile.open(temp_tarfile, "r:gz") + tar = tarfile.open(temp_tarfile, 'r:gz') tar_dump_file = [ - f for f in tar.getmembers() if f.name.startswith("gene_norm_") + f for f in tar.getmembers() if f.name.startswith('gene_norm_') ][0] tar.extractall(path=tempdir_path, members=[tar_dump_file]) dump_file = tempdir_path / tar_dump_file.name if self.conn.info.password: - pw_param = f"-W {self.conn.info.password}" + pw_param = f'-W {self.conn.info.password}' else: - pw_param = "-w" + pw_param = '-w' self.drop_db() - system_call = f"psql -d {self.conn.info.dbname} -U {self.conn.info.user} {pw_param} -f {dump_file.absolute()}" # noqa: E501 + system_call = f'psql -d {self.conn.info.dbname} -U {self.conn.info.user} {pw_param} -f {dump_file.absolute()}' result = os.system(system_call) if result != 0: raise DatabaseException( @@ -831,19 +831,19 @@ def export_db(self, output_directory: Path) -> None: if not output_directory.is_dir() or not output_directory.exists(): raise ValueError( f"Output location {output_directory} isn't a directory or doesn't exist" - ) # noqa: E501 - now = datetime.now().strftime("%Y%m%d%H%M%S") - output_location = output_directory / f"gene_norm_{now}.sql" + ) + now = datetime.now().strftime('%Y%m%d%H%M%S') + output_location = output_directory / f'gene_norm_{now}.sql' user = self.conn.info.user host = self.conn.info.host port = self.conn.info.port database_name = self.conn.info.dbname if self.conn.info.password: - pw_param = f"-W {self.conn.info.password}" + pw_param = f'-W {self.conn.info.password}' else: - pw_param = "-w" + pw_param = '-w' - system_call = f"pg_dump -E UTF8 -f {output_location} -U {user} {pw_param} -h {host} -p {port} {database_name}" # noqa: E501 + system_call = f'pg_dump -E UTF8 -f {output_location} -U {user} {pw_param} -h {host} -p {port} {database_name}' result = os.system(system_call) if result != 0: raise DatabaseException( diff --git a/src/gene/etl/__init__.py b/src/gene/etl/__init__.py index 569df1d7..1d7020b3 100644 --- a/src/gene/etl/__init__.py +++ b/src/gene/etl/__init__.py @@ -9,10 +9,10 @@ from .ncbi import NCBI __all__ = [ - "Ensembl", - "HGNC", - "NCBI", - "GeneNormalizerEtlError", - "GeneFileVersionError", - "GeneSourceFetchError", + 'Ensembl', + 'HGNC', + 'NCBI', + 'GeneNormalizerEtlError', + 'GeneFileVersionError', + 'GeneSourceFetchError', ] diff --git a/src/gene/etl/base.py b/src/gene/etl/base.py index 77e9eee1..771e2294 100644 --- a/src/gene/etl/base.py +++ b/src/gene/etl/base.py @@ -15,7 +15,7 @@ from gene.database import AbstractDatabase from gene.schemas import Gene, GeneSequenceLocation, MatchType, SourceName -logger = logging.getLogger("gene") +logger = logging.getLogger('gene') logger.setLevel(logging.DEBUG) @@ -71,7 +71,7 @@ def perform_etl(self, use_existing: bool = False) -> List[str]: """ self._extract_data(use_existing) if not self._silent: - click.echo("Transforming and loading data to DB...") + click.echo('Transforming and loading data to DB...') self._add_meta() self._transform_data() self._database.complete_write_transaction() @@ -110,12 +110,12 @@ def _load_gene(self, gene: Dict) -> None: try: assert Gene(match_type=MatchType.NO_MATCH, **gene) except pydantic.ValidationError as e: - logger.warning(f"Unable to load {gene} due to validation error: " f"{e}") + logger.warning(f'Unable to load {gene} due to validation error: ' f'{e}') else: - concept_id = gene["concept_id"] - gene["label_and_type"] = f"{concept_id.lower()}##identity" - gene["src_name"] = self._src_name.value - gene["item_type"] = "identity" + concept_id = gene['concept_id'] + gene['label_and_type'] = f'{concept_id.lower()}##identity' + gene['src_name'] = self._src_name.value + gene['item_type'] = 'identity' for attr_type in ITEM_TYPES: if attr_type in gene: @@ -136,7 +136,7 @@ def get_seqrepo(self, seqrepo_dir: Path) -> SeqRepo: :return: SeqRepo instance """ if not Path(seqrepo_dir).exists(): - raise NotADirectoryError(f"Could not find {seqrepo_dir}") + raise NotADirectoryError(f'Could not find {seqrepo_dir}') return SeqRepo(seqrepo_dir) def _set_cl_interval_range(self, loc: str, arm_ix: int, location: Dict) -> None: @@ -146,33 +146,33 @@ def _set_cl_interval_range(self, loc: str, arm_ix: int, location: Dict) -> None: :param arm_ix: The index of the q or p arm for a given location :param location: VRS chromosome location. This will be mutated. """ - range_ix = re.search("-", loc).start() # type: ignore + range_ix = re.search('-', loc).start() # type: ignore start = loc[arm_ix:range_ix] - start_arm_ix = re.search("[pq]", start).start() # type: ignore + start_arm_ix = re.search('[pq]', start).start() # type: ignore start_arm = start[start_arm_ix] end = loc[range_ix + 1 :] - end_arm_match = re.search("[pq]", end) + end_arm_match = re.search('[pq]', end) if not end_arm_match: # Does not specify the arm, so use the same as start"s - end = f"{start[0]}{end}" - end_arm_match = re.search("[pq]", end) + end = f'{start[0]}{end}' + end_arm_match = re.search('[pq]', end) end_arm_ix = end_arm_match.start() # type: ignore end_arm = end[end_arm_ix] if (start_arm == end_arm and start > end) or ( - start_arm != end_arm and start_arm == "p" and end_arm == "q" + start_arm != end_arm and start_arm == 'p' and end_arm == 'q' ): - location["start"] = start - location["end"] = end + location['start'] = start + location['end'] = end elif (start_arm == end_arm and start < end) or ( - start_arm != end_arm and start_arm == "q" and end_arm == "p" + start_arm != end_arm and start_arm == 'q' and end_arm == 'p' ): - location["start"] = end - location["end"] = start + location['start'] = end + location['end'] = start # Add back once VRS Chromosome Location is supported in 2.0-alpha # def _get_chromosome_location(self, location: Dict, gene: Dict) -> Optional[Dict]: @@ -209,9 +209,9 @@ def _get_seq_id_aliases(self, seq_id: str) -> List[str]: """ aliases = [] try: - aliases = self.seqrepo.translate_alias(seq_id, target_namespaces="ga4gh") + aliases = self.seqrepo.translate_alias(seq_id, target_namespaces='ga4gh') except KeyError as e: - logger.warning(f"SeqRepo raised KeyError: {e}") + logger.warning(f'SeqRepo raised KeyError: {e}') return aliases def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Dict: @@ -230,7 +230,7 @@ def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Di sequence = aliases[0] - if gene.start != "." and gene.end != "." and sequence: + if gene.start != '.' and gene.end != '.' and sequence: if 0 <= gene.start <= gene.end: # type: ignore location = GeneSequenceLocation( start=gene.start - 1, # type: ignore diff --git a/src/gene/etl/ensembl.py b/src/gene/etl/ensembl.py index 4a52975a..4e775afd 100644 --- a/src/gene/etl/ensembl.py +++ b/src/gene/etl/ensembl.py @@ -12,7 +12,7 @@ ) from gene.schemas import NamespacePrefix, SourceMeta, SourceName, Strand -logger = logging.getLogger("gene") +logger = logging.getLogger('gene') logger.setLevel(logging.DEBUG) @@ -30,36 +30,36 @@ def _extract_data(self, use_existing: bool) -> None: self._data_file, raw_version = self._data_source.get_latest( from_local=use_existing ) - match = re.match(r"(GRCh\d+)_(\d+)", raw_version) + match = re.match(r'(GRCh\d+)_(\d+)', raw_version) self._assembly = match.groups()[0] self._version = match.groups()[1] def _transform_data(self) -> None: """Transform the Ensembl source.""" - logger.info("Transforming Ensembl...") + logger.info('Transforming Ensembl...') db = gffutils.create_db( str(self._data_file), - dbfn=":memory:", + dbfn=':memory:', force=True, - merge_strategy="create_unique", + merge_strategy='create_unique', keep_order=True, ) # Get accession numbers accession_numbers = dict() - for item in db.features_of_type("scaffold"): - accession_numbers[item[0]] = item[8]["Alias"][-1] - for item in db.features_of_type("chromosome"): - accession_numbers[item[0]] = item[8]["Alias"][-1] + for item in db.features_of_type('scaffold'): + accession_numbers[item[0]] = item[8]['Alias'][-1] + for item in db.features_of_type('chromosome'): + accession_numbers[item[0]] = item[8]['Alias'][-1] for f in db.all_features(): - if f.attributes.get("ID"): - f_id = f.attributes.get("ID")[0].split(":")[0] - if f_id == "gene": + if f.attributes.get('ID'): + f_id = f.attributes.get('ID')[0].split(':')[0] + if f_id == 'gene': gene = self._add_gene(f, accession_numbers) if gene: self._load_gene(gene) - logger.info("Successfully transformed Ensembl.") + logger.info('Successfully transformed Ensembl.') def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict: """Create a transformed gene record. @@ -69,19 +69,19 @@ def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict: :return: A gene dictionary containing data if the ID attribute exists. """ gene = dict() - if f.strand == "-": - gene["strand"] = Strand.REVERSE.value - elif f.strand == "+": - gene["strand"] = Strand.FORWARD.value - gene["src_name"] = SourceName.ENSEMBL.value + if f.strand == '-': + gene['strand'] = Strand.REVERSE.value + elif f.strand == '+': + gene['strand'] = Strand.FORWARD.value + gene['src_name'] = SourceName.ENSEMBL.value self._add_attributes(f, gene) location = self._add_location(f, gene, accession_numbers) if location: - gene["locations"] = [location] + gene['locations'] = [location] - gene["label_and_type"] = f"{gene['concept_id'].lower()}##identity" - gene["item_type"] = "identity" + gene['label_and_type'] = f"{gene['concept_id'].lower()}##identity" + gene['item_type'] = 'identity' return gene @@ -92,10 +92,10 @@ def _add_attributes(self, f: Feature, gene: Dict) -> None: :param gene: A transformed gene record """ attributes = { - "ID": "concept_id", - "Name": "symbol", - "description": "xrefs", - "biotype": "gene_type", + 'ID': 'concept_id', + 'Name': 'symbol', + 'description': 'xrefs', + 'biotype': 'gene_type', } for attribute in f.attributes.items(): @@ -106,30 +106,30 @@ def _add_attributes(self, f: Feature, gene: Dict) -> None: if len(val) == 1: val = val[0] - if key == "ID": - if val.startswith("gene"): + if key == 'ID': + if val.startswith('gene'): val = ( f"{NamespacePrefix.ENSEMBL.value}:" f"{val.split(':')[1]}" ) - if key == "description": - gene["label"] = val.split("[")[0].strip() - if "Source:" in val: + if key == 'description': + gene['label'] = val.split('[')[0].strip() + if 'Source:' in val: src_name = ( - val.split("[")[-1] - .split("Source:")[-1] - .split("Acc")[0] - .split(";")[0] + val.split('[')[-1] + .split('Source:')[-1] + .split('Acc')[0] + .split(';')[0] ) - src_id = val.split("Acc:")[-1].split("]")[0] - if ":" in src_id: - src_id = src_id.split(":")[-1] + src_id = val.split('Acc:')[-1].split(']')[0] + if ':' in src_id: + src_id = src_id.split(':')[-1] source = self._get_xref_associated_with(src_name, src_id) - if "xrefs" in source: - gene["xrefs"] = source["xrefs"] - elif "associated_with" in source: - gene["associated_with"] = source["associated_with"] + if 'xrefs' in source: + gene['xrefs'] = source['xrefs'] + elif 'associated_with' in source: + gene['associated_with'] = source['associated_with'] continue gene[attributes[key]] = val @@ -153,16 +153,16 @@ def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict: :return: A dict containing an other identifier or xref """ source = dict() - if src_name.startswith("HGNC"): - source["xrefs"] = [f"{NamespacePrefix.HGNC.value}:{src_id}"] - elif src_name.startswith("NCBI"): - source["xrefs"] = [f"{NamespacePrefix.NCBI.value}:{src_id}"] - elif src_name.startswith("UniProt"): - source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"] - elif src_name.startswith("miRBase"): - source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"] - elif src_name.startswith("RFAM"): - source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"] + if src_name.startswith('HGNC'): + source['xrefs'] = [f'{NamespacePrefix.HGNC.value}:{src_id}'] + elif src_name.startswith('NCBI'): + source['xrefs'] = [f'{NamespacePrefix.NCBI.value}:{src_id}'] + elif src_name.startswith('UniProt'): + source['associated_with'] = [f'{NamespacePrefix.UNIPROT.value}:{src_id}'] + elif src_name.startswith('miRBase'): + source['associated_with'] = [f'{NamespacePrefix.MIRBASE.value}:{src_id}'] + elif src_name.startswith('RFAM'): + source['associated_with'] = [f'{NamespacePrefix.RFAM.value}:{src_id}'] return source def _add_meta(self) -> None: @@ -172,21 +172,21 @@ def _add_meta(self) -> None: """ if not self._version or not self._assembly: raise GeneNormalizerEtlError( - "Source metadata unavailable -- was data properly acquired before attempting to load DB?" + 'Source metadata unavailable -- was data properly acquired before attempting to load DB?' ) metadata = SourceMeta( - data_license="custom", - data_license_url="https://useast.ensembl.org/info/about" - "/legal/disclaimer.html", + data_license='custom', + data_license_url='https://useast.ensembl.org/info/about' + '/legal/disclaimer.html', version=self._version, data_url={ - "genome_annotations": f"ftp://ftp.ensembl.org/pub/release-{self._version}/gff3/homo_sapiens/Homo_sapiens.{self._assembly}.{self._version}.gff3.gz" + 'genome_annotations': f'ftp://ftp.ensembl.org/pub/release-{self._version}/gff3/homo_sapiens/Homo_sapiens.{self._assembly}.{self._version}.gff3.gz' }, rdp_url=None, data_license_attributes={ - "non_commercial": False, - "share_alike": False, - "attribution": False, + 'non_commercial': False, + 'share_alike': False, + 'attribution': False, }, genome_assemblies=[self._assembly], ) diff --git a/src/gene/etl/hgnc.py b/src/gene/etl/hgnc.py index 2fee6117..5e4f7c2a 100644 --- a/src/gene/etl/hgnc.py +++ b/src/gene/etl/hgnc.py @@ -18,7 +18,7 @@ SymbolStatus, ) -logger = logging.getLogger("gene") +logger = logging.getLogger('gene') logger.setLevel(logging.DEBUG) @@ -27,38 +27,38 @@ class HGNC(Base): def _transform_data(self) -> None: """Transform the HGNC source.""" - logger.info("Transforming HGNC...") - with open(self._data_file, "r") as f: # type: ignore + logger.info('Transforming HGNC...') + with open(self._data_file, 'r') as f: # type: ignore data = json.load(f) - records = data["response"]["docs"] + records = data['response']['docs'] for r in records: gene = dict() - gene["concept_id"] = r["hgnc_id"].lower() - gene["label_and_type"] = f"{gene['concept_id']}##identity" - gene["item_type"] = "identity" - gene["symbol"] = r["symbol"] - gene["label"] = r["name"] - gene["src_name"] = SourceName.HGNC.value - if r["status"]: - if r["status"] == "Approved": - gene["symbol_status"] = SymbolStatus.APPROVED.value - elif r["status"] == "Entry Withdrawn": - gene["symbol_status"] = SymbolStatus.WITHDRAWN.value - gene["src_name"] = SourceName.HGNC.value + gene['concept_id'] = r['hgnc_id'].lower() + gene['label_and_type'] = f"{gene['concept_id']}##identity" + gene['item_type'] = 'identity' + gene['symbol'] = r['symbol'] + gene['label'] = r['name'] + gene['src_name'] = SourceName.HGNC.value + if r['status']: + if r['status'] == 'Approved': + gene['symbol_status'] = SymbolStatus.APPROVED.value + elif r['status'] == 'Entry Withdrawn': + gene['symbol_status'] = SymbolStatus.WITHDRAWN.value + gene['src_name'] = SourceName.HGNC.value # store alias, xref, associated_with, prev_symbols, location self._get_aliases(r, gene) self._get_xrefs_associated_with(r, gene) - if "prev_symbol" in r: + if 'prev_symbol' in r: self._get_previous_symbols(r, gene) - if "location" in r: + if 'location' in r: self._get_location(r, gene) - if "locus_type" in r: - gene["gene_type"] = r["locus_type"] + if 'locus_type' in r: + gene['gene_type'] = r['locus_type'] self._load_gene(gene) - logger.info("Successfully transformed HGNC.") + logger.info('Successfully transformed HGNC.') def _get_aliases(self, r: Dict, gene: Dict) -> None: """Store aliases in a gene record. @@ -68,14 +68,14 @@ def _get_aliases(self, r: Dict, gene: Dict) -> None: """ alias_symbol = list() enzyme_id = list() - if "alias_symbol" in r: - alias_symbol = r["alias_symbol"] + if 'alias_symbol' in r: + alias_symbol = r['alias_symbol'] - if "enzyme_id" in r: - enzyme_id = r["enzyme_id"] + if 'enzyme_id' in r: + enzyme_id = r['enzyme_id'] if alias_symbol or enzyme_id: - gene["aliases"] = list(set(alias_symbol + enzyme_id)) + gene['aliases'] = list(set(alias_symbol + enzyme_id)) def _get_previous_symbols(self, r: Dict, gene: Dict) -> None: """Store previous symbols in a gene record. @@ -83,9 +83,9 @@ def _get_previous_symbols(self, r: Dict, gene: Dict) -> None: :param r: A gene record in the HGNC data file :param gene: A transformed gene record """ - prev_symbols = r["prev_symbol"] + prev_symbols = r['prev_symbol'] if prev_symbols: - gene["previous_symbols"] = list(set(prev_symbols)) + gene['previous_symbols'] = list(set(prev_symbols)) def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None: """Store xrefs and/or associated_with refs in a gene record. @@ -96,40 +96,40 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None: xrefs = list() associated_with = list() sources = [ - "entrez_id", - "ensembl_gene_id", - "vega_id", - "ucsc_id", - "ccds_id", - "uniprot_ids", - "pubmed_id", - "cosmic", - "omim_id", - "mirbase", - "homeodb", - "snornabase", - "orphanet", - "horde_id", - "merops", - "imgt", - "iuphar", - "kznf_gene_catalog", - "mamit-trnadb", - "cd", - "lncrnadb", - "ena", - "pseudogene.org", - "refseq_accession", + 'entrez_id', + 'ensembl_gene_id', + 'vega_id', + 'ucsc_id', + 'ccds_id', + 'uniprot_ids', + 'pubmed_id', + 'cosmic', + 'omim_id', + 'mirbase', + 'homeodb', + 'snornabase', + 'orphanet', + 'horde_id', + 'merops', + 'imgt', + 'iuphar', + 'kznf_gene_catalog', + 'mamit-trnadb', + 'cd', + 'lncrnadb', + 'ena', + 'pseudogene.org', + 'refseq_accession', ] for src in sources: if src in r: - if "-" in src: - key = src.split("-")[0] - elif "." in src: - key = src.split(".")[0] - elif "_" in src: - key = src.split("_")[0] + if '-' in src: + key = src.split('-')[0] + elif '.' in src: + key = src.split('.')[0] + elif '_' in src: + key = src.split('_')[0] else: key = src @@ -139,12 +139,12 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None: else: self._get_xref_associated_with(key, src, r, associated_with) else: - logger.warning(f"{key} not in schemas.py") + logger.warning(f'{key} not in schemas.py') if xrefs: - gene["xrefs"] = xrefs + gene['xrefs'] = xrefs if associated_with: - gene["associated_with"] = associated_with + gene['associated_with'] = associated_with def _get_xref_associated_with( self, key: str, src: str, r: Dict, src_type: Dict @@ -158,11 +158,11 @@ def _get_xref_associated_with( """ if isinstance(r[src], list): for xref in r[src]: - src_type.append(f"{NamespacePrefix[key.upper()].value}:{xref}") + src_type.append(f'{NamespacePrefix[key.upper()].value}:{xref}') else: - if isinstance(r[src], str) and ":" in r[src]: - r[src] = r[src].split(":")[-1].strip() - src_type.append(f"{NamespacePrefix[key.upper()].value}" f":{r[src]}") + if isinstance(r[src], str) and ':' in r[src]: + r[src] = r[src].split(':')[-1].strip() + src_type.append(f'{NamespacePrefix[key.upper()].value}' f':{r[src]}') def _get_location(self, r: Dict, gene: Dict) -> None: """Store GA4GH VRS ChromosomeLocation in a gene record. @@ -172,20 +172,20 @@ def _get_location(self, r: Dict, gene: Dict) -> None: :param gene: A transformed gene record """ # Get list of a gene's map locations - if "and" in r["location"]: - locations = r["location"].split("and") + if 'and' in r['location']: + locations = r['location'].split('and') else: - locations = [r["location"]] + locations = [r['location']] location_list = list() - gene["location_annotations"] = list() + gene['location_annotations'] = list() for loc in locations: loc = loc.strip() loc = self._set_annotation(loc, gene) if loc: - if loc == "mitochondria": - gene["location_annotations"].append(Chromosome.MITOCHONDRIA.value) + if loc == 'mitochondria': + gene['location_annotations'].append(Chromosome.MITOCHONDRIA.value) else: location = dict() self._set_location(loc, location, gene) @@ -194,9 +194,9 @@ def _get_location(self, r: Dict, gene: Dict) -> None: # location_list.append(chr_location) if location_list: - gene["locations"] = location_list - if not gene["location_annotations"]: - del gene["location_annotations"] + gene['locations'] = location_list + if not gene['location_annotations']: + del gene['location_annotations'] def _set_annotation(self, loc: str, gene: Dict) -> None: """Set the annotations attribute if one is provided. @@ -210,7 +210,7 @@ def _set_annotation(self, loc: str, gene: Dict) -> None: for annotation in annotations: if annotation in loc: - gene["location_annotations"].append(annotation) + gene['location_annotations'].append(annotation) # Check if location is also included loc = loc.split(annotation)[0].strip() if not loc: @@ -224,24 +224,24 @@ def _set_location(self, loc: str, location: Dict, gene: Dict) -> None: :param location: GA4GH location :param gene: A transformed gene record """ - arm_match = re.search("[pq]", loc) + arm_match = re.search('[pq]', loc) if arm_match: # Location gives arm and sub / sub band arm_ix = arm_match.start() - location["chr"] = loc[:arm_ix] + location['chr'] = loc[:arm_ix] - if "-" in loc: + if '-' in loc: # Location gives both start and end self._set_cl_interval_range(loc, arm_ix, location) else: # Location only gives start start = loc[arm_ix:] - location["start"] = start - location["end"] = start + location['start'] = start + location['end'] = start else: # Only gives chromosome - gene["location_annotations"].append(loc) + gene['location_annotations'].append(loc) def _add_meta(self) -> None: """Add HGNC metadata. @@ -250,20 +250,20 @@ def _add_meta(self) -> None: """ if not self._version: raise GeneNormalizerEtlError( - "Source metadata unavailable -- was data properly acquired before attempting to load DB?" + 'Source metadata unavailable -- was data properly acquired before attempting to load DB?' ) metadata = SourceMeta( - data_license="CC0", - data_license_url="https://www.genenames.org/about/license/", + data_license='CC0', + data_license_url='https://www.genenames.org/about/license/', version=self._version, data_url={ - "complete_set_archive": "ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json" + 'complete_set_archive': 'ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json' }, rdp_url=None, data_license_attributes={ - "non_commercial": False, - "share_alike": False, - "attribution": False, + 'non_commercial': False, + 'share_alike': False, + 'attribution': False, }, genome_assemblies=[], ) diff --git a/src/gene/etl/merge.py b/src/gene/etl/merge.py index 8124d294..9121e498 100644 --- a/src/gene/etl/merge.py +++ b/src/gene/etl/merge.py @@ -7,7 +7,7 @@ from gene.database.database import DatabaseWriteException from gene.schemas import GeneTypeFieldName, RecordType, SourcePriority -logger = logging.getLogger("gene") +logger = logging.getLogger('gene') logger.setLevel(logging.DEBUG) @@ -28,7 +28,7 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None: :param record_ids: concept identifiers from which groups should be generated. Should *not* include any records from excluded sources. """ - logger.info("Generating record ID sets...") + logger.info('Generating record ID sets...') start = timer() for record_id in record_ids: new_group = self._create_record_id_set(record_id) @@ -36,11 +36,11 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None: for concept_id in new_group: self._groups[concept_id] = new_group end = timer() - logger.debug(f"Built record ID sets in {end - start} seconds") + logger.debug(f'Built record ID sets in {end - start} seconds') self._groups = {k: v for k, v in self._groups.items() if len(v) > 1} - logger.info("Creating merged records and updating database...") + logger.info('Creating merged records and updating database...') uploaded_ids = set() start = timer() for record_id, group in self._groups.items(): @@ -53,22 +53,22 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None: # add updated references for concept_id in group: - merge_ref = merged_record["concept_id"] + merge_ref = merged_record['concept_id'] try: self._database.update_merge_ref(concept_id, merge_ref) except DatabaseWriteException as dw: - if str(dw).startswith("No such record exists"): + if str(dw).startswith('No such record exists'): logger.error( - f"Updating nonexistent record: {concept_id} " - f"for merge ref to {merge_ref}" + f'Updating nonexistent record: {concept_id} ' + f'for merge ref to {merge_ref}' ) else: logger.error(str(dw)) uploaded_ids |= group self._database.complete_write_transaction() - logger.info("Merged concept generation successful.") + logger.info('Merged concept generation successful.') end = timer() - logger.debug(f"Generated and added concepts in {end - start} seconds") + logger.debug(f'Generated and added concepts in {end - start} seconds') def _create_record_id_set( self, record_id: str, observed_id_set: Optional[Set] = None @@ -89,15 +89,15 @@ def _create_record_id_set( db_record = self._database.get_record_by_id(record_id) if not db_record: logger.warning( - f"Record ID set creator could not resolve " - f"lookup for {record_id} in ID set: " - f"{observed_id_set}" + f'Record ID set creator could not resolve ' + f'lookup for {record_id} in ID set: ' + f'{observed_id_set}' ) return observed_id_set - {record_id} - record_xrefs = db_record.get("xrefs") + record_xrefs = db_record.get('xrefs') if not record_xrefs: - return observed_id_set | {db_record["concept_id"]} + return observed_id_set | {db_record['concept_id']} else: local_id_set = set(record_xrefs) merged_id_set = {record_id} | observed_id_set @@ -125,40 +125,40 @@ def _generate_merged_record(self, record_id_set: Set[str]) -> Dict: records.append(record) else: logger.error( - f"Merge record generator could not retrieve " - f"record for {record_id} in {record_id_set}" + f'Merge record generator could not retrieve ' + f'record for {record_id} in {record_id_set}' ) def record_order(record: Dict) -> Tuple: """Provide priority values of concepts for sort function.""" - src = record["src_name"].upper() + src = record['src_name'].upper() if src in SourcePriority.__members__: source_rank = SourcePriority[src].value else: raise Exception( f"Prohibited source: {src} in concept_id " f"{record['concept_id']}" ) - return source_rank, record["concept_id"] + return source_rank, record['concept_id'] records.sort(key=record_order) # initialize merged record merged_attrs = { - "concept_id": records[0]["concept_id"], - "aliases": set(), - "associated_with": set(), - "previous_symbols": set(), - "hgnc_locus_type": set(), - "ncbi_gene_type": set(), - "ensembl_biotype": set(), - "strand": set(), + 'concept_id': records[0]['concept_id'], + 'aliases': set(), + 'associated_with': set(), + 'previous_symbols': set(), + 'hgnc_locus_type': set(), + 'ncbi_gene_type': set(), + 'ensembl_biotype': set(), + 'strand': set(), } if len(records) > 1: - merged_attrs["xrefs"] = list({r["concept_id"] for r in records[1:]}) + merged_attrs['xrefs'] = list({r['concept_id'] for r in records[1:]}) # merge from constituent records - set_fields = ["aliases", "associated_with", "previous_symbols", "strand"] - scalar_fields = ["symbol", "symbol_status", "label", "location_annotations"] + set_fields = ['aliases', 'associated_with', 'previous_symbols', 'strand'] + scalar_fields = ['symbol', 'symbol_status', 'label', 'location_annotations'] for record in records: for field in set_fields: merged_attrs[field] |= set(record.get(field, set())) @@ -167,19 +167,19 @@ def record_order(record: Dict) -> Tuple: if field not in merged_attrs and field in record: merged_attrs[field] = record[field] - locations = record.get("locations") + locations = record.get('locations') if locations: merged_attrs[f"{record['src_name'].lower()}_locations"] = locations - gene_type = record.get("gene_type") + gene_type = record.get('gene_type') if gene_type: - merged_field = GeneTypeFieldName[record["src_name"].upper()] + merged_field = GeneTypeFieldName[record['src_name'].upper()] merged_attrs[merged_field] |= {gene_type} for field in set_fields + [ - "hgnc_locus_type", - "ncbi_gene_type", - "ensembl_biotype", + 'hgnc_locus_type', + 'ncbi_gene_type', + 'ensembl_biotype', ]: field_value = merged_attrs[field] if field_value: @@ -188,12 +188,12 @@ def record_order(record: Dict) -> Tuple: del merged_attrs[field] # ensure no conflicting strands - unique_strand_values = set(merged_attrs.get("strand", [])) + unique_strand_values = set(merged_attrs.get('strand', [])) num_unique_strand_values = len(unique_strand_values) if num_unique_strand_values > 1: - del merged_attrs["strand"] + del merged_attrs['strand'] elif num_unique_strand_values == 1: - merged_attrs["strand"] = list(unique_strand_values)[0] + merged_attrs['strand'] = list(unique_strand_values)[0] - merged_attrs["item_type"] = RecordType.MERGER.value + merged_attrs['item_type'] = RecordType.MERGER.value return merged_attrs diff --git a/src/gene/etl/ncbi.py b/src/gene/etl/ncbi.py index a3b2e706..ba675dc3 100644 --- a/src/gene/etl/ncbi.py +++ b/src/gene/etl/ncbi.py @@ -24,7 +24,7 @@ SymbolStatus, ) -logger = logging.getLogger("gene") +logger = logging.getLogger('gene') logger.setLevel(logging.DEBUG) @@ -63,10 +63,10 @@ def _extract_data(self, use_existing: bool) -> None: self._info_src = gene_paths.gene_info self._history_src = gene_paths.gene_history self._gene_url = ( - "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz" + 'ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz' ) - self._history_url = "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz" - self._assembly_url = "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/" + self._history_url = 'ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz' + self._assembly_url = 'ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/' def _get_prev_symbols(self) -> Dict[str, str]: """Store a gene's symbol history. @@ -74,14 +74,14 @@ def _get_prev_symbols(self) -> Dict[str, str]: :return: A dictionary of a gene's previous symbols """ # get symbol history - history_file = open(self._history_src, "r") - history = csv.reader(history_file, delimiter="\t") + history_file = open(self._history_src, 'r') + history = csv.reader(history_file, delimiter='\t') next(history) prev_symbols = {} for row in history: # Only interested in rows that have homo sapiens tax id - if row[0] == "9606": - if row[1] != "-": + if row[0] == '9606': + if row[1] != '-': gene_id = row[1] if gene_id in prev_symbols.keys(): prev_symbols[gene_id].append(row[3]) @@ -90,9 +90,9 @@ def _get_prev_symbols(self) -> Dict[str, str]: else: # Load discontinued genes params = { - "concept_id": f"{NamespacePrefix.NCBI.value}:{row[2]}", - "symbol": row[3], - "symbol_status": SymbolStatus.DISCONTINUED.value, + 'concept_id': f'{NamespacePrefix.NCBI.value}:{row[2]}', + 'symbol': row[3], + 'symbol_status': SymbolStatus.DISCONTINUED.value, } self._load_gene(params) history_file.close() @@ -104,37 +104,37 @@ def _add_xrefs_associated_with(self, val: List[str], params: Dict) -> None: :param val: A list of source ids for a given gene :param params: A transformed gene record """ - params["xrefs"] = [] - params["associated_with"] = [] + params['xrefs'] = [] + params['associated_with'] = [] for src in val: - src_name = src.split(":")[0].upper() - src_id = src.split(":")[-1] - if src_name == "GENEID": - params["concept_id"] = f"{NamespacePrefix.NCBI.value}:{src_id}" + src_name = src.split(':')[0].upper() + src_id = src.split(':')[-1] + if src_name == 'GENEID': + params['concept_id'] = f'{NamespacePrefix.NCBI.value}:{src_id}' elif ( src_name in NamespacePrefix.__members__ and NamespacePrefix[src_name].value in PREFIX_LOOKUP ): - params["xrefs"].append( - f"{NamespacePrefix[src_name].value}" f":{src_id}" + params['xrefs'].append( + f'{NamespacePrefix[src_name].value}' f':{src_id}' ) else: - if src_name.startswith("MIM"): + if src_name.startswith('MIM'): prefix = NamespacePrefix.OMIM.value - elif src_name.startswith("IMGT/GENE-DB"): + elif src_name.startswith('IMGT/GENE-DB'): prefix = NamespacePrefix.IMGT_GENE_DB.value - elif src_name.startswith("MIRBASE"): + elif src_name.startswith('MIRBASE'): prefix = NamespacePrefix.MIRBASE.value else: prefix = None if prefix: - params["associated_with"].append(f"{prefix}:{src_id}") + params['associated_with'].append(f'{prefix}:{src_id}') else: - logger.info(f"{src_name} is not in NameSpacePrefix.") - if not params["xrefs"]: - del params["xrefs"] - if not params["associated_with"]: - del params["associated_with"] + logger.info(f'{src_name} is not in NameSpacePrefix.') + if not params['xrefs']: + del params['xrefs'] + if not params['associated_with']: + del params['associated_with'] def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, str]: """Store genes from NCBI info file. @@ -143,42 +143,42 @@ def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, str]: :return: A dictionary of gene's from the NCBI info file. """ # open info file, skip headers - info_file = open(self._info_src, "r") - info = csv.reader(info_file, delimiter="\t") + info_file = open(self._info_src, 'r') + info = csv.reader(info_file, delimiter='\t') next(info) info_genes = dict() for row in info: params = dict() - params["concept_id"] = f"{NamespacePrefix.NCBI.value}:{row[1]}" + params['concept_id'] = f'{NamespacePrefix.NCBI.value}:{row[1]}' # get symbol - params["symbol"] = row[2] + params['symbol'] = row[2] # get aliases - if row[4] != "-": - params["aliases"] = row[4].split("|") + if row[4] != '-': + params['aliases'] = row[4].split('|') else: - params["aliases"] = [] + params['aliases'] = [] # get associated_with - if row[5] != "-": - associated_with = row[5].split("|") + if row[5] != '-': + associated_with = row[5].split('|') self._add_xrefs_associated_with(associated_with, params) # get chromosome location vrs_chr_location = self._get_vrs_chr_location(row, params) - if "exclude" in vrs_chr_location: + if 'exclude' in vrs_chr_location: # Exclude genes with multiple distinct locations (e.g. OMS) continue if not vrs_chr_location: vrs_chr_location = [] - params["locations"] = vrs_chr_location + params['locations'] = vrs_chr_location # get label - if row[8] != "-": - params["label"] = row[8] + if row[8] != '-': + params['label'] = row[8] # add prev symbols if row[1] in prev_symbols.keys(): - params["previous_symbols"] = prev_symbols[row[1]] - info_genes[params["symbol"]] = params + params['previous_symbols'] = prev_symbols[row[1]] + info_genes[params['symbol']] = params # get type - params["gene_type"] = row[9] + params['gene_type'] = row[9] return info_genes def _get_gene_gff(self, db: gffutils.FeatureDB, info_genes: Dict) -> None: @@ -188,20 +188,20 @@ def _get_gene_gff(self, db: gffutils.FeatureDB, info_genes: Dict) -> None: :param info_genes: A dictionary of gene's from the NCBI info file. """ for f in db.all_features(): - if f.attributes.get("ID"): - f_id = f.attributes.get("ID")[0] - if f_id.startswith("gene"): - symbol = f.attributes["Name"][0] + if f.attributes.get('ID'): + f_id = f.attributes.get('ID')[0] + if f_id.startswith('gene'): + symbol = f.attributes['Name'][0] if symbol in info_genes: # Just need to add SequenceLocation params = info_genes.get(symbol) vrs_sq_location = self._get_vrs_sq_location(db, params, f_id) if vrs_sq_location: - params["locations"].append(vrs_sq_location) # type: ignore + params['locations'].append(vrs_sq_location) # type: ignore else: # Need to add entire gene gene = self._add_gff_gene(db, f, f_id) - info_genes[gene["symbol"]] = gene + info_genes[gene['symbol']] = gene def _add_gff_gene( self, db: gffutils.FeatureDB, f: gffutils.Feature, f_id: str @@ -214,14 +214,14 @@ def _add_gff_gene( :return: A gene dictionary if the ID attribute exists. Else return None. """ params = dict() - params["src_name"] = SourceName.NCBI.value + params['src_name'] = SourceName.NCBI.value self._add_attributes(f, params) sq_loc = self._get_vrs_sq_location(db, params, f_id) if sq_loc: - params["locations"] = [sq_loc] + params['locations'] = [sq_loc] else: - params["locations"] = list() - params["label_and_type"] = f"{params['concept_id'].lower()}##identity" + params['locations'] = list() + params['label_and_type'] = f"{params['concept_id'].lower()}##identity" return params def _add_attributes(self, f: gffutils.feature.Feature, gene: Dict) -> None: @@ -230,20 +230,20 @@ def _add_attributes(self, f: gffutils.feature.Feature, gene: Dict) -> None: :param gffutils.feature.Feature f: A gene from the data :param gene: A transformed gene record """ - attributes = ["ID", "Name", "description", "Dbxref"] + attributes = ['ID', 'Name', 'description', 'Dbxref'] for attribute in f.attributes.items(): key = attribute[0] if key in attributes: val = attribute[1] - if len(val) == 1 and key != "Dbxref": + if len(val) == 1 and key != 'Dbxref': val = val[0] - if key == "Dbxref": + if key == 'Dbxref': self._add_xrefs_associated_with(val, gene) - elif key == "Name": - gene["symbol"] = val + elif key == 'Name': + gene['symbol'] = val def _get_vrs_sq_location( self, db: gffutils.FeatureDB, params: Dict, f_id: str @@ -257,7 +257,7 @@ def _get_vrs_sq_location( :return: A GA4GH VRS SequenceLocation """ gene = db[f_id] - params["strand"] = gene.strand + params['strand'] = gene.strand return self._get_sequence_location(gene.seqid, gene, params) def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict: @@ -268,16 +268,16 @@ def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict: :return: A dict containing an xref or associated_with ref """ source = dict() - if src_name.startswith("HGNC"): - source["xrefs"] = [f"{NamespacePrefix.HGNC.value}:{src_id}"] - elif src_name.startswith("NCBI"): - source["xrefs"] = [f"{NamespacePrefix.NCBI.value}:{src_id}"] - elif src_name.startswith("UniProt"): - source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"] - elif src_name.startswith("miRBase"): - source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"] - elif src_name.startswith("RFAM"): - source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"] + if src_name.startswith('HGNC'): + source['xrefs'] = [f'{NamespacePrefix.HGNC.value}:{src_id}'] + elif src_name.startswith('NCBI'): + source['xrefs'] = [f'{NamespacePrefix.NCBI.value}:{src_id}'] + elif src_name.startswith('UniProt'): + source['associated_with'] = [f'{NamespacePrefix.UNIPROT.value}:{src_id}'] + elif src_name.startswith('miRBase'): + source['associated_with'] = [f'{NamespacePrefix.MIRBASE.value}:{src_id}'] + elif src_name.startswith('RFAM'): + source['associated_with'] = [f'{NamespacePrefix.RFAM.value}:{src_id}'] return source def _get_vrs_chr_location(self, row: List[str], params: Dict) -> List: @@ -288,24 +288,24 @@ def _get_vrs_chr_location(self, row: List[str], params: Dict) -> List: :param params: A transformed gene record :return: A list of GA4GH VRS ChromosomeLocations """ - params["location_annotations"] = list() + params['location_annotations'] = list() chromosomes_locations = self._set_chromsomes_locations(row, params) - locations = chromosomes_locations["locations"] - chromosomes = chromosomes_locations["chromosomes"] - if chromosomes_locations["exclude"]: - return ["exclude"] + locations = chromosomes_locations['locations'] + chromosomes = chromosomes_locations['chromosomes'] + if chromosomes_locations['exclude']: + return ['exclude'] location_list = list() if chromosomes and not locations: for chromosome in chromosomes: - if chromosome == "MT": - params["location_annotations"].append(Chromosome.MITOCHONDRIA.value) + if chromosome == 'MT': + params['location_annotations'].append(Chromosome.MITOCHONDRIA.value) else: - params["location_annotations"].append(chromosome.strip()) + params['location_annotations'].append(chromosome.strip()) elif locations: self._add_chromosome_location(locations, location_list, params) - if not params["location_annotations"]: - del params["location_annotations"] + if not params['location_annotations']: + del params['location_annotations'] return location_list def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict: @@ -316,29 +316,29 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict: :return: A dictionary containing a gene's chromosomes and locations """ chromosomes = None - if row[6] != "-": - if "|" in row[6]: - chromosomes = row[6].split("|") + if row[6] != '-': + if '|' in row[6]: + chromosomes = row[6].split('|') else: chromosomes = [row[6]] if len(chromosomes) >= 2: - if chromosomes and "X" not in chromosomes and "Y" not in chromosomes: + if chromosomes and 'X' not in chromosomes and 'Y' not in chromosomes: logger.info( - f"{row[2]} contains multiple distinct " - f"chromosomes: {chromosomes}." + f'{row[2]} contains multiple distinct ' + f'chromosomes: {chromosomes}.' ) chromosomes = None locations = None exclude = False - if row[7] != "-": - if "|" in row[7]: - locations = row[7].split("|") - elif ";" in row[7]: - locations = row[7].split(";") - elif "and" in row[7]: - locations = row[7].split("and") + if row[7] != '-': + if '|' in row[7]: + locations = row[7].split('|') + elif ';' in row[7]: + locations = row[7].split(';') + elif 'and' in row[7]: + locations = row[7].split('and') else: locations = [row[7]] @@ -351,7 +351,7 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict: # i.e. OMS: '10q26.3', '19q13.42-q13.43', '3p25.3' if len(locations) > 2: logger.info( - f"{row[2]} contains multiple distinct " f"locations: {locations}." + f'{row[2]} contains multiple distinct ' f'locations: {locations}.' ) locations = None exclude = True @@ -360,13 +360,13 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict: if locations: for i in range(len(locations)): loc = locations[i].strip() - if not re.match("^([1-9][0-9]?|X[pq]?|Y[pq]?)", loc): + if not re.match('^([1-9][0-9]?|X[pq]?|Y[pq]?)', loc): logger.info( - f"{row[2]} contains invalid map location:" f"{loc}." + f'{row[2]} contains invalid map location:' f'{loc}.' ) - params["location_annotations"].append(loc) + params['location_annotations'].append(loc) del locations[i] - return {"locations": locations, "chromosomes": chromosomes, "exclude": exclude} + return {'locations': locations, 'chromosomes': chromosomes, 'exclude': exclude} def _add_chromosome_location( self, locations: List, location_list: List, params: Dict @@ -382,42 +382,42 @@ def _add_chromosome_location( location = dict() if Annotation.ALT_LOC.value in loc: - loc = loc.split(f"{Annotation.ALT_LOC.value}")[0].strip() - params["location_annotations"].append(Annotation.ALT_LOC.value) + loc = loc.split(f'{Annotation.ALT_LOC.value}')[0].strip() + params['location_annotations'].append(Annotation.ALT_LOC.value) contains_centromere = False - if "cen" in loc: + if 'cen' in loc: contains_centromere = True - arm_match = re.search("[pq]", loc) + arm_match = re.search('[pq]', loc) if arm_match and not contains_centromere: arm_ix = arm_match.start() chromosome = loc[:arm_ix].strip() # NCBI sometimes stores invalid map locations # i.e. 7637 stores 'map from Rosati ref via FISH [AFS]' - if not re.match("^([1-9][0-9]?|X|Y|MT)$", chromosome): + if not re.match('^([1-9][0-9]?|X|Y|MT)$', chromosome): continue - location["chr"] = chromosome + location['chr'] = chromosome # Check to see if there is a band / sub band included if arm_ix != len(loc) - 1: - if "-" in loc: + if '-' in loc: self._set_cl_interval_range(loc, arm_ix, location) else: # Location only gives start start = loc[arm_ix:] - location["start"] = start - location["end"] = start + location['start'] = start + location['end'] = start else: # Only arm is included - location["start"] = loc[arm_ix] - location["end"] = loc[arm_ix] + location['start'] = loc[arm_ix] + location['end'] = loc[arm_ix] elif contains_centromere: self._set_centromere_location(loc, location) else: # Location only gives chr - params["location_annotations"].append(loc) + params['location_annotations'].append(loc) # chr_location = self._get_chromosome_location(location, params) # if chr_location: @@ -429,36 +429,36 @@ def _set_centromere_location(self, loc: str, location: Dict) -> None: :param loc: A gene location :param location: GA4GH location """ - centromere_ix = re.search("cen", loc).start() # type: ignore - if "-" in loc: + centromere_ix = re.search('cen', loc).start() # type: ignore + if '-' in loc: # Location gives both start and end - range_ix = re.search("-", loc).start() # type: ignore - if "q" in loc: - location["chr"] = loc[:centromere_ix].strip() - location["start"] = "cen" - location["end"] = loc[range_ix + 1 :] - elif "p" in loc: - p_ix = re.search("p", loc).start() # type: ignore - location["chr"] = loc[:p_ix].strip() - location["end"] = "cen" - location["start"] = loc[:range_ix] + range_ix = re.search('-', loc).start() # type: ignore + if 'q' in loc: + location['chr'] = loc[:centromere_ix].strip() + location['start'] = 'cen' + location['end'] = loc[range_ix + 1 :] + elif 'p' in loc: + p_ix = re.search('p', loc).start() # type: ignore + location['chr'] = loc[:p_ix].strip() + location['end'] = 'cen' + location['start'] = loc[:range_ix] else: - location["chr"] = loc[:centromere_ix].strip() - location["start"] = "cen" - location["end"] = "cen" + location['chr'] = loc[:centromere_ix].strip() + location['start'] = 'cen' + location['end'] = 'cen' def _transform_data(self) -> None: """Modify data and pass to loading functions.""" - logger.info("Transforming NCBI...") + logger.info('Transforming NCBI...') prev_symbols = self._get_prev_symbols() info_genes = self._get_gene_info(prev_symbols) # create db for gff file db = gffutils.create_db( str(self._gff_src), - dbfn=":memory:", + dbfn=':memory:', force=True, - merge_strategy="create_unique", + merge_strategy='create_unique', keep_order=True, ) @@ -466,7 +466,7 @@ def _transform_data(self) -> None: for gene in info_genes.keys(): self._load_gene(info_genes[gene]) - logger.info("Successfully transformed NCBI.") + logger.info('Successfully transformed NCBI.') def _add_meta(self) -> None: """Add Ensembl metadata. @@ -483,22 +483,22 @@ def _add_meta(self) -> None: ] ): raise GeneNormalizerEtlError( - "Source metadata unavailable -- was data properly acquired before attempting to load DB?" + 'Source metadata unavailable -- was data properly acquired before attempting to load DB?' ) metadata = SourceMeta( - data_license="custom", - data_license_url="https://www.ncbi.nlm.nih.gov/home/about/policies/", + data_license='custom', + data_license_url='https://www.ncbi.nlm.nih.gov/home/about/policies/', version=self._version, data_url={ - "info_file": self._gene_url, - "history_file": self._history_url, - "assembly_file": self._assembly_url, + 'info_file': self._gene_url, + 'history_file': self._history_url, + 'assembly_file': self._assembly_url, }, - rdp_url="https://reusabledata.org/ncbi-gene.html", + rdp_url='https://reusabledata.org/ncbi-gene.html', data_license_attributes={ - "non_commercial": False, - "share_alike": False, - "attribution": False, + 'non_commercial': False, + 'share_alike': False, + 'attribution': False, }, genome_assemblies=[self._assembly], ) diff --git a/src/gene/main.py b/src/gene/main.py index e6c87223..31db7076 100644 --- a/src/gene/main.py +++ b/src/gene/main.py @@ -21,27 +21,27 @@ """ app = FastAPI( - title="Gene Normalizer", + title='Gene Normalizer', description=description, version=__version__, contact={ - "name": "Alex H. Wagner", - "email": "Alex.Wagner@nationwidechildrens.org", - "url": "https://www.nationwidechildrens.org/specialties/institute-for-genomic-medicine/research-labs/wagner-lab", # noqa: E501 + 'name': 'Alex H. Wagner', + 'email': 'Alex.Wagner@nationwidechildrens.org', + 'url': 'https://www.nationwidechildrens.org/specialties/institute-for-genomic-medicine/research-labs/wagner-lab', }, license={ - "name": "MIT", - "url": "https://github.com/cancervariants/gene-normalization/blob/main/LICENSE", + 'name': 'MIT', + 'url': 'https://github.com/cancervariants/gene-normalization/blob/main/LICENSE', }, - docs_url="/gene", - openapi_url="/gene/openapi.json", - swagger_ui_parameters={"tryItOutEnabled": True}, + docs_url='/gene', + openapi_url='/gene/openapi.json', + swagger_ui_parameters={'tryItOutEnabled': True}, ) -read_query_summary = "Given query, provide best-matching source records." -response_description = "A response to a validly-formed query" -q_descr = "Gene to normalize." +read_query_summary = 'Given query, provide best-matching source records.' +response_description = 'A response to a validly-formed query' +q_descr = 'Gene to normalize.' incl_descr = """Optional. Comma-separated list of source names to include in response. Will exclude all other sources. Returns HTTP status code 422: Unprocessable Entity if both 'incl' and 'excl' parameters @@ -51,21 +51,21 @@ code 422: Unprocessable Entity if both 'incl' and 'excl' parameters are given.""" search_description = ( - "For each source, return strongest-match concepts " - "for query string provided by user" + 'For each source, return strongest-match concepts ' + 'for query string provided by user' ) @app.get( - "/gene/search", + '/gene/search', summary=read_query_summary, response_description=response_description, response_model=SearchService, description=search_description, - tags=["Query"], + tags=['Query'], ) def search( - q: str = Query(..., description=q_descr), # noqa: D103 + q: str = Query(..., description=q_descr), incl: Optional[str] = Query(None, description=incl_descr), excl: Optional[str] = Query(None, description=excl_descr), ) -> SearchService: @@ -87,20 +87,20 @@ def search( return resp -normalize_summary = "Given query, provide merged normalized record." -normalize_response_descr = "A response to a validly-formed query." -normalize_descr = "Return merged highest-match concept for query." -normalize_q_descr = "Gene to normalize." +normalize_summary = 'Given query, provide merged normalized record.' +normalize_response_descr = 'A response to a validly-formed query.' +normalize_descr = 'Return merged highest-match concept for query.' +normalize_q_descr = 'Gene to normalize.' @app.get( - "/gene/normalize", + '/gene/normalize', summary=normalize_summary, response_description=normalize_response_descr, response_model=NormalizeService, response_model_exclude_none=True, description=normalize_descr, - tags=["Query"], + tags=['Query'], ) def normalize(q: str = Query(..., description=normalize_q_descr)) -> NormalizeService: """Return strongest match concepts to query string provided by user. @@ -113,29 +113,29 @@ def normalize(q: str = Query(..., description=normalize_q_descr)) -> NormalizeSe unmerged_matches_summary = ( - "Given query, provide source records corresponding to " "normalized concept." + 'Given query, provide source records corresponding to ' 'normalized concept.' ) unmerged_response_descr = ( - "Response containing source records contained within " "normalized concept." + 'Response containing source records contained within ' 'normalized concept.' ) unmerged_normalize_description = ( - "Return unmerged records associated with the " - "normalized result of the user-provided query " - "string." + 'Return unmerged records associated with the ' + 'normalized result of the user-provided query ' + 'string.' ) @app.get( - "/gene/normalize_unmerged", + '/gene/normalize_unmerged', summary=unmerged_matches_summary, - operation_id="getUnmergedRecords", + operation_id='getUnmergedRecords', response_description=unmerged_response_descr, response_model=UnmergedNormalizationService, description=unmerged_normalize_description, - tags=["Query"], + tags=['Query'], ) def normalize_unmerged( - q: str = Query(..., description=normalize_q_descr) + q: str = Query(..., description=normalize_q_descr), ) -> UnmergedNormalizationService: """Return all individual records associated with a normalized concept. diff --git a/src/gene/query.py b/src/gene/query.py index e30a79d8..59a5bd5b 100644 --- a/src/gene/query.py +++ b/src/gene/query.py @@ -28,7 +28,7 @@ ) from gene.version import __version__ -NormService = TypeVar("NormService", bound=BaseNormalizationService) +NormService = TypeVar('NormService', bound=BaseNormalizationService) class InvalidParameterException(Exception): # noqa: N818 @@ -65,15 +65,15 @@ def _emit_warnings(query_str: str) -> List: :return: List of warnings """ warnings = [] - nbsp = re.search("\xa0| ", query_str) + nbsp = re.search('\xa0| ', query_str) if nbsp: warnings = [ { - "non_breaking_space_characters": "Query contains non-breaking space characters" + 'non_breaking_space_characters': 'Query contains non-breaking space characters' } ] logger.warning( - f"Query ({query_str}) contains non-breaking space characters." + f'Query ({query_str}) contains non-breaking space characters.' ) return warnings @@ -84,12 +84,12 @@ def _transform_sequence_location(loc: Dict) -> models.SequenceLocation: :param loc: GeneSequenceLocation represented as a dict :return: VRS sequence location """ - refget_ac = loc["sequence_id"].split("ga4gh:")[-1] + refget_ac = loc['sequence_id'].split('ga4gh:')[-1] return models.SequenceLocation( sequenceReference=models.SequenceReference(refgetAccession=refget_ac), - start=int(loc["start"]), - end=int(loc["end"]), + start=int(loc['start']), + end=int(loc['end']), ) # @staticmethod @@ -128,11 +128,11 @@ def _transform_locations(self, record: Dict) -> Dict: :return: record with transformed locations attributes, if applicable """ record_locations = list() - if "locations" in record: - for loc in record["locations"]: - if loc["type"] == "SequenceLocation": + if 'locations' in record: + for loc in record['locations']: + if loc['type'] == 'SequenceLocation': record_locations.append(self._transform_location(loc)) - record["locations"] = record_locations + record['locations'] = record_locations return record def _get_src_name(self, concept_id: str) -> SourceName: @@ -149,7 +149,7 @@ def _get_src_name(self, concept_id: str) -> SourceName: elif concept_id.startswith(NamespacePrefix.HGNC.value): return SourceName.HGNC else: - raise ValueError("Invalid or unrecognized concept ID provided") + raise ValueError('Invalid or unrecognized concept ID provided') def _add_record( self, response: Dict[str, Dict], item: Dict, match_type: MatchType @@ -161,20 +161,20 @@ def _add_record( :param match_type: match type for query """ item = self._transform_locations(item) - item["match_type"] = match_type + item['match_type'] = match_type gene = Gene(**item) - src_name = item["src_name"] + src_name = item['src_name'] - matches = response["source_matches"] + matches = response['source_matches'] if src_name not in matches.keys(): pass elif matches[src_name] is None: matches[src_name] = { - "records": [gene], - "source_meta_": self.db.get_source_metadata(src_name), + 'records': [gene], + 'source_meta_': self.db.get_source_metadata(src_name), } else: - matches[src_name]["records"].append(gene) + matches[src_name]['records'].append(gene) def _fetch_record( self, response: Dict[str, Dict], concept_id: str, match_type: MatchType @@ -189,15 +189,15 @@ def _fetch_record( match = self.db.get_record_by_id(concept_id, case_sensitive=False) except DatabaseReadException as e: logger.error( - f"Encountered DatabaseReadException looking up {concept_id}: {e}" + f'Encountered DatabaseReadException looking up {concept_id}: {e}' ) else: if match: self._add_record(response, match, match_type) else: logger.error( - f"Unable to find expected record for {concept_id} matching as {match_type}" - ) # noqa: E501 + f'Unable to find expected record for {concept_id} matching as {match_type}' + ) def _post_process_resp(self, resp: Dict) -> Dict: """Fill all empty source_matches slots with NO_MATCH results and @@ -207,15 +207,15 @@ def _post_process_resp(self, resp: Dict) -> Dict: :return: response object with empty source slots filled with NO_MATCH results and corresponding source metadata """ - for src_name in resp["source_matches"].keys(): - if resp["source_matches"][src_name] is None: - resp["source_matches"][src_name] = { - "match_type": MatchType.NO_MATCH, - "records": [], - "source_meta_": self.db.get_source_metadata(src_name), + for src_name in resp['source_matches'].keys(): + if resp['source_matches'][src_name] is None: + resp['source_matches'][src_name] = { + 'match_type': MatchType.NO_MATCH, + 'records': [], + 'source_meta_': self.db.get_source_metadata(src_name), } else: - records = resp["source_matches"][src_name]["records"] + records = resp['source_matches'][src_name]['records'] if len(records) > 1: records = sorted(records, key=lambda k: k.match_type, reverse=True) return resp @@ -229,11 +229,11 @@ def _get_search_response(self, query: str, sources: Set[str]) -> Dict: :return: completed response object to return to client """ resp = { - "query": query, - "warnings": self._emit_warnings(query), - "source_matches": {source: None for source in sources}, + 'query': query, + 'warnings': self._emit_warnings(query), + 'source_matches': {source: None for source in sources}, } - if query == "": + if query == '': return self._post_process_resp(resp) query_l = query.lower() @@ -242,7 +242,7 @@ def _get_search_response(self, query: str, sources: Set[str]) -> Dict: queries.append((query_l, RecordType.IDENTITY.value)) for prefix in [p for p in NAMESPACE_LOOKUP.keys() if query_l.startswith(p)]: - term = f"{NAMESPACE_LOOKUP[prefix].lower()}:{query_l}" + term = f'{NAMESPACE_LOOKUP[prefix].lower()}:{query_l}' queries.append((term, RecordType.IDENTITY.value)) for match in ITEM_TYPES.values(): @@ -253,7 +253,7 @@ def _get_search_response(self, query: str, sources: Set[str]) -> Dict: try: if item_type == RecordType.IDENTITY.value: record = self.db.get_record_by_id(term, False) - if record and record["concept_id"] not in matched_concept_ids: + if record and record['concept_id'] not in matched_concept_ids: self._add_record(resp, record, MatchType.CONCEPT_ID) else: refs = self.db.get_refs_by_type(term, RefType(item_type)) @@ -264,8 +264,8 @@ def _get_search_response(self, query: str, sources: Set[str]) -> Dict: except DatabaseReadException as e: logger.error( - f"Encountered DatabaseReadException looking up {item_type}" - f" {term}: {e}" + f'Encountered DatabaseReadException looking up {item_type}' + f' {term}: {e}' ) continue @@ -283,8 +283,8 @@ def _get_service_meta() -> ServiceMeta: def search( self, query_str: str, - incl: str = "", - excl: str = "", + incl: str = '', + excl: str = '', **params, ) -> SearchService: """Return highest match for each source. @@ -316,10 +316,10 @@ def search( if not incl and not excl: query_sources = set(sources.values()) elif incl and excl: - detail = "Cannot request both source inclusions and exclusions." + detail = 'Cannot request both source inclusions and exclusions.' raise InvalidParameterException(detail) elif incl: - req_sources = [n.strip() for n in incl.split(",")] + req_sources = [n.strip() for n in incl.split(',')] invalid_sources = [] query_sources = set() for source in req_sources: @@ -328,10 +328,10 @@ def search( else: invalid_sources.append(source) if invalid_sources: - detail = f"Invalid source name(s): {invalid_sources}" + detail = f'Invalid source name(s): {invalid_sources}' raise InvalidParameterException(detail) else: - req_exclusions = [n.strip() for n in excl.lower().split(",")] + req_exclusions = [n.strip() for n in excl.lower().split(',')] req_excl_dict = {r.lower(): r for r in req_exclusions} invalid_sources = [] query_sources = set() @@ -342,14 +342,14 @@ def search( if src_l not in req_excl_dict.keys(): query_sources.add(src) if invalid_sources: - detail = f"Invalid source name(s): {invalid_sources}" + detail = f'Invalid source name(s): {invalid_sources}' raise InvalidParameterException(detail) query_str = query_str.strip() resp = self._get_search_response(query_str, query_sources) - resp["service_meta_"] = self._get_service_meta() + resp['service_meta_'] = self._get_service_meta() return SearchService(**resp) def _add_merged_meta(self, response: NormalizeService) -> NormalizeService: @@ -360,7 +360,7 @@ def _add_merged_meta(self, response: NormalizeService) -> NormalizeService: """ sources_meta = {} gene = response.gene - sources = [response.normalized_id.split(":")[0]] + sources = [response.normalized_id.split(':')[0]] if gene.mappings: sources += [m.coding.system for m in gene.mappings] @@ -391,13 +391,13 @@ def _add_alt_matches( for concept_id in possible_concepts: r = self.db.get_record_by_id(concept_id, True) if r: - merge_ref = r.get("merge_ref") + merge_ref = r.get('merge_ref') if merge_ref: norm_concepts.add(merge_ref) - norm_concepts = norm_concepts - {record["concept_id"]} + norm_concepts = norm_concepts - {record['concept_id']} if norm_concepts: response.warnings.append( - {"multiple_normalized_concepts_found": list(norm_concepts)} + {'multiple_normalized_concepts_found': list(norm_concepts)} ) return response @@ -418,14 +418,14 @@ def _add_gene( """ gene_obj = core_models.Gene( id=f"normalize.gene.{record['concept_id']}", - label=record["symbol"], + label=record['symbol'], ) # mappings - source_ids = record.get("xrefs", []) + record.get("associated_with", []) + source_ids = record.get('xrefs', []) + record.get('associated_with', []) mappings = [] for source_id in source_ids: - system, code = source_id.split(":") + system, code = source_id.split(':') mappings.append( core_models.Mapping( coding=core_models.Coding( @@ -439,7 +439,7 @@ def _add_gene( # aliases aliases = set() - for key in ["previous_symbols", "aliases"]: + for key in ['previous_symbols', 'aliases']: if key in record and record[key]: val = record[key] if isinstance(val, str): @@ -451,11 +451,11 @@ def _add_gene( # extensions extensions = [] extension_and_record_labels = [ - ("symbol_status", "symbol_status"), - ("approved_name", "label"), - ("previous_symbols", "previous_symbols"), - ("location_annotations", "location_annotations"), - ("strand", "strand"), + ('symbol_status', 'symbol_status'), + ('approved_name', 'label'), + ('previous_symbols', 'previous_symbols'), + ('location_annotations', 'location_annotations'), + ('strand', 'strand'), ] for ext_label, record_label in extension_and_record_labels: if record_label in record and record[record_label]: @@ -464,19 +464,19 @@ def _add_gene( ) record_locations = {} - if record["item_type"] == RecordType.IDENTITY: - locs = record.get("locations") + if record['item_type'] == RecordType.IDENTITY: + locs = record.get('locations') if locs: record_locations[f"{record['src_name'].lower()}_locations"] = locs - elif record["item_type"] == RecordType.MERGER: + elif record['item_type'] == RecordType.MERGER: for k, v in record.items(): - if k.endswith("locations") and v: + if k.endswith('locations') and v: record_locations[k] = v for loc_name, locations in record_locations.items(): transformed_locs = [] for loc in locations: - if loc["type"] == "SequenceLocation": + if loc['type'] == 'SequenceLocation': transformed_locs.append(self._transform_location(loc)) if transformed_locs: @@ -485,12 +485,12 @@ def _add_gene( ) # handle gene types separately because they're wonky - if record["item_type"] == RecordType.IDENTITY: - gene_type = record.get("gene_type") + if record['item_type'] == RecordType.IDENTITY: + gene_type = record.get('gene_type') if gene_type: extensions.append( core_models.Extension( - name=GeneTypeFieldName[record["src_name"].upper()].value, + name=GeneTypeFieldName[record['src_name'].upper()].value, value=gene_type, ) ) @@ -509,7 +509,7 @@ def _add_gene( if possible_concepts: response = self._add_alt_matches(response, record, possible_concepts) - response.normalized_id = record["concept_id"] + response.normalized_id = record['concept_id'] response.gene = gene_obj response = self._add_merged_meta(response) response.match_type = match_type @@ -522,9 +522,9 @@ def _record_order(record: Dict) -> Tuple[int, str]: :param record: individual record item in iterable to sort :return: tuple with rank value and concept ID """ - src = record["src_name"].upper() + src = record['src_name'].upper() source_rank = SourcePriority[src] - return source_rank, record["concept_id"] + return source_rank, record['concept_id'] @staticmethod def _handle_failed_merge_ref(record: Dict, response: Dict, query: str) -> Dict: @@ -539,7 +539,7 @@ def _handle_failed_merge_ref(record: Dict, response: Dict, query: str) -> Dict: f"Merge ref lookup failed for ref {record['merge_ref']} " f"in record {record['concept_id']} from query {query}" ) - response["match_type"] = MatchType.NO_MATCH + response['match_type'] = MatchType.NO_MATCH return response def _prepare_normalized_response(self, query: str) -> Dict[str, Any]: @@ -549,10 +549,10 @@ def _prepare_normalized_response(self, query: str) -> Dict[str, Any]: :return: basic normalization response boilerplate """ return { - "query": query, - "match_type": MatchType.NO_MATCH, - "warnings": self._emit_warnings(query), - "service_meta_": ServiceMeta( + 'query': query, + 'match_type': MatchType.NO_MATCH, + 'warnings': self._emit_warnings(query), + 'service_meta_': ServiceMeta( version=__version__, response_datetime=str(datetime.now()) ), } @@ -594,7 +594,7 @@ def _resolve_merge( :param possible_concepts: alternate possible matches :return: Normalized response object """ - merge_ref = record.get("merge_ref") + merge_ref = record.get('merge_ref') if merge_ref: # follow merge_ref merge = self.db.get_record_by_id(merge_ref, False, True) @@ -621,7 +621,7 @@ def _perform_normalized_lookup( :param response_builder: response constructor callback method :return: completed service response object """ - if query == "": + if query == '': return response query_str = query.lower().strip() @@ -653,7 +653,7 @@ def _perform_normalized_lookup( # attempt merge ref resolution until successful for match in matching_records: assert match is not None - record = self.db.get_record_by_id(match["concept_id"], False) + record = self.db.get_record_by_id(match['concept_id'], False) if record: match_type_value = MatchType[match_type.value.upper()] return self._resolve_merge( @@ -682,23 +682,23 @@ def _add_normalized_records( :return: Completed response object """ response.match_type = match_type - response.normalized_concept_id = normalized_record["concept_id"] - if normalized_record["item_type"] == RecordType.IDENTITY: - record_source = SourceName[normalized_record["src_name"].upper()] + response.normalized_concept_id = normalized_record['concept_id'] + if normalized_record['item_type'] == RecordType.IDENTITY: + record_source = SourceName[normalized_record['src_name'].upper()] meta = self.db.get_source_metadata(record_source.value) response.source_matches[record_source] = MatchesNormalized( records=[BaseGene(**self._transform_locations(normalized_record))], source_meta_=meta, # type: ignore ) else: - concept_ids = [normalized_record["concept_id"]] + normalized_record.get( - "xrefs", [] + concept_ids = [normalized_record['concept_id']] + normalized_record.get( + 'xrefs', [] ) for concept_id in concept_ids: record = self.db.get_record_by_id(concept_id, case_sensitive=False) if not record: continue - record_source = SourceName[record["src_name"].upper()] + record_source = SourceName[record['src_name'].upper()] gene = BaseGene(**self._transform_locations(record)) if record_source in response.source_matches: response.source_matches[record_source].records.append(gene) diff --git a/src/gene/schemas.py b/src/gene/schemas.py index 6f85b1bc..602c9abb 100644 --- a/src/gene/schemas.py +++ b/src/gene/schemas.py @@ -15,22 +15,22 @@ from gene.version import __version__ -CURIE = constr(pattern=r"^\w[^:]*:.+$") +CURIE = constr(pattern=r'^\w[^:]*:.+$') class SymbolStatus(str, Enum): """Define string constraints for symbol status attribute.""" - WITHDRAWN = "withdrawn" - APPROVED = "approved" - DISCONTINUED = "discontinued" + WITHDRAWN = 'withdrawn' + APPROVED = 'approved' + DISCONTINUED = 'discontinued' class Strand(str, Enum): """Define string constraints for strand attribute.""" - FORWARD = "+" - REVERSE = "-" + FORWARD = '+' + REVERSE = '-' class Annotation(str, Enum): @@ -38,16 +38,16 @@ class Annotation(str, Enum): is absent. """ - NOT_FOUND_ON_REFERENCE = "not on reference assembly" - UNPLACED = "unplaced" - RESERVED = "reserved" - ALT_LOC = "alternate reference locus" + NOT_FOUND_ON_REFERENCE = 'not on reference assembly' + UNPLACED = 'unplaced' + RESERVED = 'reserved' + ALT_LOC = 'alternate reference locus' class Chromosome(str, Enum): """Define string constraints for chromosomes.""" - MITOCHONDRIA = "MT" + MITOCHONDRIA = 'MT' class MatchType(IntEnum): @@ -66,10 +66,10 @@ class MatchType(IntEnum): class GeneSequenceLocation(BaseModel): """Sequence Location model when storing in DynamoDB.""" - type: Literal["SequenceLocation"] = "SequenceLocation" + type: Literal['SequenceLocation'] = 'SequenceLocation' start: StrictInt end: StrictInt - sequence_id: constr(pattern=r"^ga4gh:SQ.[0-9A-Za-z_\-]{32}$") # noqa: F722 + sequence_id: constr(pattern=r'^ga4gh:SQ.[0-9A-Za-z_\-]{32}$') # noqa: F722 # class GeneChromosomeLocation(BaseModel): @@ -112,20 +112,20 @@ class Gene(BaseGene): model_config = ConfigDict( json_schema_extra={ - "example": { - "label": None, - "concept_id": "ensembl:ENSG00000157764", - "symbol": "BRAF", - "previous_symbols": [], - "aliases": [], - "xrefs": [], - "symbol_status": None, - "strand": "-", - "locations": [], - "location_annotations": [], - "associated_with": [], - "gene_type": None, - "match_type": 100, + 'example': { + 'label': None, + 'concept_id': 'ensembl:ENSG00000157764', + 'symbol': 'BRAF', + 'previous_symbols': [], + 'aliases': [], + 'xrefs': [], + 'symbol_status': None, + 'strand': '-', + 'locations': [], + 'location_annotations': [], + 'associated_with': [], + 'gene_type': None, + 'match_type': 100, } } ) @@ -142,9 +142,9 @@ class GeneGroup(Gene): class SourceName(Enum): """Define string constraints to ensure consistent capitalization.""" - HGNC = "HGNC" - ENSEMBL = "Ensembl" - NCBI = "NCBI" + HGNC = 'HGNC' + ENSEMBL = 'Ensembl' + NCBI = 'NCBI' class SourcePriority(IntEnum): @@ -158,42 +158,42 @@ class SourcePriority(IntEnum): class SourceIDAfterNamespace(Enum): """Define string constraints after namespace.""" - HGNC = "" - ENSEMBL = "ENSG" - NCBI = "" + HGNC = '' + ENSEMBL = 'ENSG' + NCBI = '' class NamespacePrefix(Enum): """Define string constraints for namespace prefixes on concept IDs.""" - HGNC = "hgnc" - ENSEMBL = "ensembl" - NCBI = "ncbigene" + HGNC = 'hgnc' + ENSEMBL = 'ensembl' + NCBI = 'ncbigene' ENTREZ = NCBI - VEGA = "vega" - UCSC = "ucsc" - ENA = "ena.embl" - REFSEQ = "refseq" - CCDS = "ccds" - UNIPROT = "uniprot" - PUBMED = "pubmed" - COSMIC = "cosmic" - OMIM = "omim" - MIRBASE = "mirbase" - HOMEODB = "homeodb" - SNORNABASE = "snornabase" - ORPHANET = "orphanet" - PSEUDOGENE = "pseudogene.org" - HORDE = "hordedb" - MEROPS = "merops" - IUPHAR = "iuphar" - KZNF = "knzfgc" - MAMIT = "mamittrnadb" - CD = "hcdmdb" - LNCRNADB = "lncrnadb" - IMGT = "imgt" # .hla? .ligm? leave as is? - IMGT_GENE_DB = "imgt/gene-db" # redundant w/ above? - RFAM = "rfam" + VEGA = 'vega' + UCSC = 'ucsc' + ENA = 'ena.embl' + REFSEQ = 'refseq' + CCDS = 'ccds' + UNIPROT = 'uniprot' + PUBMED = 'pubmed' + COSMIC = 'cosmic' + OMIM = 'omim' + MIRBASE = 'mirbase' + HOMEODB = 'homeodb' + SNORNABASE = 'snornabase' + ORPHANET = 'orphanet' + PSEUDOGENE = 'pseudogene.org' + HORDE = 'hordedb' + MEROPS = 'merops' + IUPHAR = 'iuphar' + KZNF = 'knzfgc' + MAMIT = 'mamittrnadb' + CD = 'hcdmdb' + LNCRNADB = 'lncrnadb' + IMGT = 'imgt' # .hla? .ligm? leave as is? + IMGT_GENE_DB = 'imgt/gene-db' # redundant w/ above? + RFAM = 'rfam' class DataLicenseAttributes(BaseModel): @@ -207,19 +207,19 @@ class DataLicenseAttributes(BaseModel): class RecordType(str, Enum): """Record item types.""" - IDENTITY = "identity" - MERGER = "merger" + IDENTITY = 'identity' + MERGER = 'merger' class RefType(str, Enum): """Reference item types.""" # Must be in descending MatchType order. - SYMBOL = "symbol" - PREVIOUS_SYMBOLS = "prev_symbol" - ALIASES = "alias" - XREFS = "xref" - ASSOCIATED_WITH = "associated_with" + SYMBOL = 'symbol' + PREVIOUS_SYMBOLS = 'prev_symbol' + ALIASES = 'alias' + XREFS = 'xref' + ASSOCIATED_WITH = 'associated_with' class SourceMeta(BaseModel): @@ -235,22 +235,22 @@ class SourceMeta(BaseModel): model_config = ConfigDict( json_schema_extra={ - "example": { - "data_license": "custom", - "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", - "version": "20201215", - "data_url": { - "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz", - "history_file": "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz", - "assembly_file": "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/", + 'example': { + 'data_license': 'custom', + 'data_license_url': 'https://www.ncbi.nlm.nih.gov/home/about/policies/', + 'version': '20201215', + 'data_url': { + 'info_file': 'ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz', + 'history_file': 'ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz', + 'assembly_file': 'ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/', }, - "rdp_url": "https://reusabledata.org/ncbi-gene.html", - "data_license_attributes": { - "non_commercial": False, - "share_alike": False, - "attribution": False, + 'rdp_url': 'https://reusabledata.org/ncbi-gene.html', + 'data_license_attributes': { + 'non_commercial': False, + 'share_alike': False, + 'attribution': False, }, - "genome_assemblies": [], + 'genome_assemblies': [], } } ) @@ -262,26 +262,26 @@ class SourceSearchMatches(BaseModel): records: List[Gene] = [] source_meta_: SourceMeta - model_config = ConfigDict(json_schema_extra={"example": {}}) # TODO + model_config = ConfigDict(json_schema_extra={'example': {}}) # TODO class ServiceMeta(BaseModel): """Metadata regarding the gene-normalization service.""" - name: Literal["gene-normalizer"] = "gene-normalizer" + name: Literal['gene-normalizer'] = 'gene-normalizer' version: StrictStr response_datetime: StrictStr url: Literal[ - "https://github.com/cancervariants/gene-normalization" - ] = "https://github.com/cancervariants/gene-normalization" # noqa: E501 + 'https://github.com/cancervariants/gene-normalization' + ] = 'https://github.com/cancervariants/gene-normalization' model_config = ConfigDict( json_schema_extra={ - "example": { - "name": "gene-normalizer", - "version": __version__, - "response_datetime": "2022-03-23 15:57:14.180908", - "url": "https://github.com/cancervariants/gene-normalization", + 'example': { + 'name': 'gene-normalizer', + 'version': __version__, + 'response_datetime': '2022-03-23 15:57:14.180908', + 'url': 'https://github.com/cancervariants/gene-normalization', } } ) @@ -303,9 +303,9 @@ class GeneTypeFieldName(str, Enum): internal records. """ - HGNC = "hgnc_locus_type" - NCBI = "ncbi_gene_type" - ENSEMBL = "ensembl_biotype" + HGNC = 'hgnc_locus_type' + NCBI = 'ncbi_gene_type' + ENSEMBL = 'ensembl_biotype' class BaseNormalizationService(BaseModel): @@ -326,93 +326,93 @@ class NormalizeService(BaseNormalizationService): model_config = ConfigDict( json_schema_extra={ - "example": { - "query": "BRAF", - "warnings": [], - "match_type": 100, - "normalized_id": "hgnc:1037", - "gene": { - "type": "Gene", - "id": "normalize.gene.hgnc:1097", - "label": "BRAF", - "mappings": [ + 'example': { + 'query': 'BRAF', + 'warnings': [], + 'match_type': 100, + 'normalized_id': 'hgnc:1037', + 'gene': { + 'type': 'Gene', + 'id': 'normalize.gene.hgnc:1097', + 'label': 'BRAF', + 'mappings': [ { - "coding": {"code": "673", "system": "ncbigene"}, - "relation": "relatedMatch", + 'coding': {'code': '673', 'system': 'ncbigene'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "ENSG00000157764", "system": "ensembl"}, - "relation": "relatedMatch", + 'coding': {'code': 'ENSG00000157764', 'system': 'ensembl'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "CCDS5863", "system": "ccds"}, - "relation": "relatedMatch", + 'coding': {'code': 'CCDS5863', 'system': 'ccds'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "1943", "system": "iuphar"}, - "relation": "relatedMatch", + 'coding': {'code': '1943', 'system': 'iuphar'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "119066", "system": "orphanet"}, - "relation": "relatedMatch", + 'coding': {'code': '119066', 'system': 'orphanet'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "BRAF", "system": "cosmic"}, - "relation": "relatedMatch", + 'coding': {'code': 'BRAF', 'system': 'cosmic'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "2284096", "system": "pubmed"}, - "relation": "relatedMatch", + 'coding': {'code': '2284096', 'system': 'pubmed'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "uc003vwc.5", "system": "ucsc"}, - "relation": "relatedMatch", + 'coding': {'code': 'uc003vwc.5', 'system': 'ucsc'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "164757", "system": "omim"}, - "relation": "relatedMatch", + 'coding': {'code': '164757', 'system': 'omim'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "NM_004333", "system": "refseq"}, - "relation": "relatedMatch", + 'coding': {'code': 'NM_004333', 'system': 'refseq'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "CCDS87555", "system": "ccds"}, - "relation": "relatedMatch", + 'coding': {'code': 'CCDS87555', 'system': 'ccds'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "P15056", "system": "uniprot"}, - "relation": "relatedMatch", + 'coding': {'code': 'P15056', 'system': 'uniprot'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "M95712", "system": "ena.embl"}, - "relation": "relatedMatch", + 'coding': {'code': 'M95712', 'system': 'ena.embl'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "OTTHUMG00000157457", "system": "vega"}, - "relation": "relatedMatch", + 'coding': {'code': 'OTTHUMG00000157457', 'system': 'vega'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "1565476", "system": "pubmed"}, - "relation": "relatedMatch", + 'coding': {'code': '1565476', 'system': 'pubmed'}, + 'relation': 'relatedMatch', }, ], - "aliases": ["BRAF1", "RAFB1", "B-raf", "NS7", "B-RAF1"], - "extensions": [ + 'aliases': ['BRAF1', 'RAFB1', 'B-raf', 'NS7', 'B-RAF1'], + 'extensions': [ { - "name": "approved_name", - "value": "B-Raf proto-oncogene, serine/threonine kinase", - "type": "Extension", + 'name': 'approved_name', + 'value': 'B-Raf proto-oncogene, serine/threonine kinase', + 'type': 'Extension', }, { - "name": "symbol_status", - "value": "approved", - "type": "Extension", + 'name': 'symbol_status', + 'value': 'approved', + 'type': 'Extension', }, # { # "name": "chromosome_location", # "value": { - # "id": "ga4gh:CL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw", # noqa: E501 + # "id": "ga4gh:CL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw", # "type": "ChromosomeLocation", # "species_id": "taxonomy:9606", # "chr": "7", @@ -423,60 +423,60 @@ class NormalizeService(BaseNormalizationService): # } ], }, - "source_meta_": { - "HGNC": { - "data_license": "custom", - "data_license_url": "https://www.genenames.org/about/", - "version": "20210810", - "data_url": { - "complete_set_archive": "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json" + 'source_meta_': { + 'HGNC': { + 'data_license': 'custom', + 'data_license_url': 'https://www.genenames.org/about/', + 'version': '20210810', + 'data_url': { + 'complete_set_archive': 'ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json' }, - "rdp_url": None, - "data_license_attributes": { - "non_commercial": False, - "attribution": False, - "share_alike": False, + 'rdp_url': None, + 'data_license_attributes': { + 'non_commercial': False, + 'attribution': False, + 'share_alike': False, }, - "genome_assemblies": [], + 'genome_assemblies': [], }, - "Ensembl": { - "data_license": "custom", - "data_license_url": "https://useast.ensembl.org/info/about/legal/disclaimer.html", # noqa: E501 - "version": "104", - "data_url": { - "genome_annotations": "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz" + 'Ensembl': { + 'data_license': 'custom', + 'data_license_url': 'https://useast.ensembl.org/info/about/legal/disclaimer.html', + 'version': '104', + 'data_url': { + 'genome_annotations': 'ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz' }, - "rdp_url": None, - "data_license_attributes": { - "non_commercial": False, - "attribution": False, - "share_alike": False, + 'rdp_url': None, + 'data_license_attributes': { + 'non_commercial': False, + 'attribution': False, + 'share_alike': False, }, - "genome_assemblies": ["GRCh38"], + 'genome_assemblies': ['GRCh38'], }, - "NCBI": { - "data_license": "custom", - "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", # noqa: E501 - "version": "20210813", - "data_url": { - "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz", - "history_file": "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz", - "assembly_file": "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/", + 'NCBI': { + 'data_license': 'custom', + 'data_license_url': 'https://www.ncbi.nlm.nih.gov/home/about/policies/', + 'version': '20210813', + 'data_url': { + 'info_file': 'ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz', + 'history_file': 'ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz', + 'assembly_file': 'ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/', }, - "rdp_url": "https://reusabledata.org/ncbi-gene.html", - "data_license_attributes": { - "non_commercial": False, - "attribution": False, - "share_alike": False, + 'rdp_url': 'https://reusabledata.org/ncbi-gene.html', + 'data_license_attributes': { + 'non_commercial': False, + 'attribution': False, + 'share_alike': False, }, - "genome_assemblies": ["GRCh38.p13"], + 'genome_assemblies': ['GRCh38.p13'], }, }, - "service_meta_": { - "name": "gene-normalizer", - "version": __version__, - "response_datetime": "2022-03-23 15:57:14.180908", - "url": "https://github.com/cancervariants/gene-normalization", + 'service_meta_': { + 'name': 'gene-normalizer', + 'version': __version__, + 'response_datetime': '2022-03-23 15:57:14.180908', + 'url': 'https://github.com/cancervariants/gene-normalization', }, } } @@ -501,168 +501,168 @@ class UnmergedNormalizationService(BaseNormalizationService): model_config = ConfigDict( json_schema_extra={ - "example": { - "query": "hgnc:108", - "warnings": [], - "match_type": 100, - "service_meta_": { - "version": __version__, - "response_datetime": "2022-04-26 14:20:54.180240", - "name": "gene-normalizer", - "url": "https://github.com/cancervariants/gene-normalization", + 'example': { + 'query': 'hgnc:108', + 'warnings': [], + 'match_type': 100, + 'service_meta_': { + 'version': __version__, + 'response_datetime': '2022-04-26 14:20:54.180240', + 'name': 'gene-normalizer', + 'url': 'https://github.com/cancervariants/gene-normalization', }, - "normalized_concept_id": "hgnc:108", - "source_matches": { - "HGNC": { - "records": [ + 'normalized_concept_id': 'hgnc:108', + 'source_matches': { + 'HGNC': { + 'records': [ { - "concept_id": "hgnc:108", - "symbol": "ACHE", - "symbol_status": "approved", - "label": "acetylcholinesterase (Cartwright blood group)", # noqa: E501 - "strand": None, - "location_annotations": [], - "locations": [ + 'concept_id': 'hgnc:108', + 'symbol': 'ACHE', + 'symbol_status': 'approved', + 'label': 'acetylcholinesterase (Cartwright blood group)', + 'strand': None, + 'location_annotations': [], + 'locations': [ # { # "type": "ChromosomeLocation", - # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501 + # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # "species_id": "taxonomy:9606", # "chr": "7", # "start": "q22.1", # "end": "q22.1" # } ], - "aliases": ["3.1.1.7"], - "previous_symbols": ["YT"], - "xrefs": ["ncbigene:43", "ensembl:ENSG00000087085"], - "associated_with": [ - "ucsc:uc003uxi.4", - "vega:OTTHUMG00000157033", - "merops:S09.979", - "ccds:CCDS5710", - "omim:100740", - "iuphar:2465", - "ccds:CCDS5709", - "refseq:NM_015831", - "pubmed:1380483", - "uniprot:P22303", - "ccds:CCDS64736", + 'aliases': ['3.1.1.7'], + 'previous_symbols': ['YT'], + 'xrefs': ['ncbigene:43', 'ensembl:ENSG00000087085'], + 'associated_with': [ + 'ucsc:uc003uxi.4', + 'vega:OTTHUMG00000157033', + 'merops:S09.979', + 'ccds:CCDS5710', + 'omim:100740', + 'iuphar:2465', + 'ccds:CCDS5709', + 'refseq:NM_015831', + 'pubmed:1380483', + 'uniprot:P22303', + 'ccds:CCDS64736', ], - "gene_type": "gene with protein product", + 'gene_type': 'gene with protein product', } ], - "source_meta_": { - "data_license": "custom", - "data_license_url": "https://www.genenames.org/about/", - "version": "20220407", - "data_url": { - "complete_set_archive": "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json" + 'source_meta_': { + 'data_license': 'custom', + 'data_license_url': 'https://www.genenames.org/about/', + 'version': '20220407', + 'data_url': { + 'complete_set_archive': 'ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json' }, - "rdp_url": None, - "data_license_attributes": { - "non_commercial": False, - "share_alike": False, - "attribution": False, + 'rdp_url': None, + 'data_license_attributes': { + 'non_commercial': False, + 'share_alike': False, + 'attribution': False, }, - "genome_assemblies": [], + 'genome_assemblies': [], }, }, - "Ensembl": { - "records": [ + 'Ensembl': { + 'records': [ { - "concept_id": "ensembl:ENSG00000087085", - "symbol": "ACHE", - "symbol_status": None, - "label": "acetylcholinesterase (Cartwright blood group)", # noqa: E501 - "strand": "-", - "location_annotations": [], - "locations": [ + 'concept_id': 'ensembl:ENSG00000087085', + 'symbol': 'ACHE', + 'symbol_status': None, + 'label': 'acetylcholinesterase (Cartwright blood group)', + 'strand': '-', + 'location_annotations': [], + 'locations': [ { - "id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1", # noqa: E501 - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 + 'id': 'ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', }, - "start": 100889993, - "end": 100896974, + 'start': 100889993, + 'end': 100896974, } ], - "aliases": [], - "previous_symbols": [], - "xrefs": ["hgnc:108"], - "associated_with": [], - "gene_type": "protein_coding", + 'aliases': [], + 'previous_symbols': [], + 'xrefs': ['hgnc:108'], + 'associated_with': [], + 'gene_type': 'protein_coding', } ], - "source_meta_": { - "data_license": "custom", - "data_license_url": "https://useast.ensembl.org/info/about/legal/disclaimer.html", # noqa: E501 - "version": "104", - "data_url": { - "genome_annotations": "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz" + 'source_meta_': { + 'data_license': 'custom', + 'data_license_url': 'https://useast.ensembl.org/info/about/legal/disclaimer.html', + 'version': '104', + 'data_url': { + 'genome_annotations': 'ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz' }, - "rdp_url": None, - "data_license_attributes": { - "non_commercial": False, - "share_alike": False, - "attribution": False, + 'rdp_url': None, + 'data_license_attributes': { + 'non_commercial': False, + 'share_alike': False, + 'attribution': False, }, - "genome_assemblies": ["GRCh38"], + 'genome_assemblies': ['GRCh38'], }, }, - "NCBI": { - "records": [ + 'NCBI': { + 'records': [ { - "concept_id": "ncbigene:43", - "symbol": "ACHE", - "symbol_status": None, - "label": "acetylcholinesterase (Cartwright blood group)", # noqa: E501 - "strand": "-", - "location_annotations": [], - "locations": [ + 'concept_id': 'ncbigene:43', + 'symbol': 'ACHE', + 'symbol_status': None, + 'label': 'acetylcholinesterase (Cartwright blood group)', + 'strand': '-', + 'location_annotations': [], + 'locations': [ { # "type": "ChromosomeLocation", - # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501 + # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # "species_id": "taxonomy:9606", # "chr": "7", # "start": "q22.1", # "end": "q22.1" }, { - "id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g", # noqa: E501 - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 + 'id': 'ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', }, - "start": 100889993, - "end": 100896994, + 'start': 100889993, + 'end': 100896994, }, ], - "aliases": ["YT", "ARACHE", "ACEE", "N-ACHE"], - "previous_symbols": ["ACEE"], - "xrefs": ["hgnc:108", "ensembl:ENSG00000087085"], - "associated_with": ["omim:100740"], - "gene_type": "protein-coding", + 'aliases': ['YT', 'ARACHE', 'ACEE', 'N-ACHE'], + 'previous_symbols': ['ACEE'], + 'xrefs': ['hgnc:108', 'ensembl:ENSG00000087085'], + 'associated_with': ['omim:100740'], + 'gene_type': 'protein-coding', } ], - "source_meta_": { - "data_license": "custom", - "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", # noqa: E501 - "version": "20220407", - "data_url": { - "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz", - "history_file": "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz", - "assembly_file": "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/", + 'source_meta_': { + 'data_license': 'custom', + 'data_license_url': 'https://www.ncbi.nlm.nih.gov/home/about/policies/', + 'version': '20220407', + 'data_url': { + 'info_file': 'ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz', + 'history_file': 'ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz', + 'assembly_file': 'ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/', }, - "rdp_url": "https://reusabledata.org/ncbi-gene.html", - "data_license_attributes": { - "non_commercial": False, - "share_alike": False, - "attribution": False, + 'rdp_url': 'https://reusabledata.org/ncbi-gene.html', + 'data_license_attributes': { + 'non_commercial': False, + 'share_alike': False, + 'attribution': False, }, - "genome_assemblies": ["GRCh38.p13"], + 'genome_assemblies': ['GRCh38.p13'], }, }, }, diff --git a/src/gene/version.py b/src/gene/version.py index 75c5d6c1..b4913868 100644 --- a/src/gene/version.py +++ b/src/gene/version.py @@ -1,2 +1,2 @@ """Gene normalizer version""" -__version__ = "0.3.0-dev1" +__version__ = '0.3.0-dev1' diff --git a/tests/conftest.py b/tests/conftest.py index ad1a14a2..ba941b0a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,7 @@ from gene.database import AbstractDatabase, create_db -@pytest.fixture(scope="session") +@pytest.fixture(scope='session') def database() -> AbstractDatabase: """Create database instance.""" return create_db() @@ -17,19 +17,19 @@ def pytest_addoption(parser): See https://docs.pytest.org/en/7.1.x/reference/reference.html#parser """ parser.addoption( - "--verbose-logs", - action="store_true", + '--verbose-logs', + action='store_true', default=False, - help="show noisy module logs", + help='show noisy module logs', ) def pytest_configure(config): """Configure pytest setup.""" - if not config.getoption("--verbose-logs"): - logging.getLogger("botocore").setLevel(logging.ERROR) - logging.getLogger("boto3").setLevel(logging.ERROR) - logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR) + if not config.getoption('--verbose-logs'): + logging.getLogger('botocore').setLevel(logging.ERROR) + logging.getLogger('boto3').setLevel(logging.ERROR) + logging.getLogger('urllib3.connectionpool').setLevel(logging.ERROR) def _compare_records(normalized_gene, test_gene, match_type): @@ -53,7 +53,7 @@ def _compare_records(normalized_gene, test_gene, match_type): assert normalized_gene.gene_type == test_gene.gene_type -@pytest.fixture(scope="session") +@pytest.fixture(scope='session') def compare_records(): """Provide record(s) comparison function""" return _compare_records @@ -65,7 +65,7 @@ def _check_resp_single_record(resp, test_gene, match_type): _compare_records(resp.records[0], test_gene, match_type) -@pytest.fixture(scope="session") +@pytest.fixture(scope='session') def check_resp_single_record(): """Provide record comparison function for single record""" return _check_resp_single_record diff --git a/tests/unit/test_database_and_etl.py b/tests/unit/test_database_and_etl.py index 092cc6c3..58ef6461 100644 --- a/tests/unit/test_database_and_etl.py +++ b/tests/unit/test_database_and_etl.py @@ -12,28 +12,28 @@ from gene.schemas import RecordType ALIASES = { - "NC_000001.11": ["ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO"], - "NC_000002.12": ["ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g"], - "NC_000003.12": ["ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX"], - "NC_000007.14": ["ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul"], - "NC_000009.12": ["ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI"], - "NC_000011.10": ["ga4gh:SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1"], - "NC_000015.10": ["ga4gh:SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6"], - "NC_000017.11": ["ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7"], - "NC_000019.10": ["ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl"], - "NC_000023.11": ["ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP"], - "NC_000008.11": ["ga4gh:SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs"], - "NC_000012.12": ["ga4gh:SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl"], - "NC_000024.10": ["ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5"], - "NT_167246.2": ["ga4gh:SQ.MjujHSAsgNWRTX4w3ysM7b5OVhZpdXu1"], - "NT_167249.2": ["ga4gh:SQ.Q8IworEhpLeXwpz1CHM7C3luysh-ltx-"], + 'NC_000001.11': ['ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO'], + 'NC_000002.12': ['ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g'], + 'NC_000003.12': ['ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX'], + 'NC_000007.14': ['ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul'], + 'NC_000009.12': ['ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI'], + 'NC_000011.10': ['ga4gh:SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1'], + 'NC_000015.10': ['ga4gh:SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6'], + 'NC_000017.11': ['ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7'], + 'NC_000019.10': ['ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl'], + 'NC_000023.11': ['ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP'], + 'NC_000008.11': ['ga4gh:SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs'], + 'NC_000012.12': ['ga4gh:SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl'], + 'NC_000024.10': ['ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5'], + 'NT_167246.2': ['ga4gh:SQ.MjujHSAsgNWRTX4w3ysM7b5OVhZpdXu1'], + 'NT_167249.2': ['ga4gh:SQ.Q8IworEhpLeXwpz1CHM7C3luysh-ltx-'], } -IS_TEST_ENV = environ.get("GENE_TEST", "").lower() == "true" -IS_DDB_TEST = not environ.get("GENE_NORM_DB_URL", "").lower().startswith("postgres") +IS_TEST_ENV = environ.get('GENE_TEST', '').lower() == 'true' +IS_DDB_TEST = not environ.get('GENE_NORM_DB_URL', '').lower().startswith('postgres') -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def db_fixture(database): """Create a database test fixture.""" @@ -49,7 +49,7 @@ def __init__(self): return DB() -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def processed_ids(): """Create a test fixture to store processed ids for merged concepts.""" return list() @@ -64,33 +64,33 @@ def _get_aliases(seqid): return ALIASES[seqid] -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def etl_data_path(): """Create a test fixture to return etl data path.""" test_root = Path(__file__).resolve().parents[2] - return test_root / "tests" / "unit" / "data" / "etl_data" + return test_root / 'tests' / 'unit' / 'data' / 'etl_data' def test_tables_created(db_fixture): """Check that requisite tables are created.""" existing_tables = db_fixture.db.list_tables() - if db_fixture.db_name == "PostgresDatabase": + if db_fixture.db_name == 'PostgresDatabase': assert set(existing_tables) == { - "gene_associations", - "gene_symbols", - "gene_previous_symbols", - "gene_aliases", - "gene_xrefs", - "gene_concepts", - "gene_merged", - "gene_sources", + 'gene_associations', + 'gene_symbols', + 'gene_previous_symbols', + 'gene_aliases', + 'gene_xrefs', + 'gene_concepts', + 'gene_merged', + 'gene_sources', } else: assert db_fixture.db.gene_table in existing_tables -@pytest.mark.skipif(not IS_TEST_ENV, reason="not in test environment") -@patch.object(Ensembl, "get_seqrepo") +@pytest.mark.skipif(not IS_TEST_ENV, reason='not in test environment') +@patch.object(Ensembl, 'get_seqrepo') def test_ensembl_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path): """Test that ensembl etl methods work correctly.""" test_get_seqrepo.return_value = None @@ -100,8 +100,8 @@ def test_ensembl_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path) processed_ids += ensembl_ids -@pytest.mark.skipif(not IS_TEST_ENV, reason="not in test environment") -@patch.object(HGNC, "get_seqrepo") +@pytest.mark.skipif(not IS_TEST_ENV, reason='not in test environment') +@patch.object(HGNC, 'get_seqrepo') def test_hgnc_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path): """Test that hgnc etl methods work correctly.""" test_get_seqrepo.return_value = None @@ -110,8 +110,8 @@ def test_hgnc_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path): processed_ids += hgnc_ids -@pytest.mark.skipif(not IS_TEST_ENV, reason="not in test environment") -@patch.object(NCBI, "get_seqrepo") +@pytest.mark.skipif(not IS_TEST_ENV, reason='not in test environment') +@patch.object(NCBI, 'get_seqrepo') def test_ncbi_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path): """Test that ncbi etl methods work correctly.""" test_get_seqrepo.return_value = None @@ -121,47 +121,47 @@ def test_ncbi_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path): processed_ids += ncbi_ids -@pytest.mark.skipif(not IS_TEST_ENV, reason="not in test environment") +@pytest.mark.skipif(not IS_TEST_ENV, reason='not in test environment') def test_merged_concepts(processed_ids, db_fixture): """Create merged concepts and load to db.""" db_fixture.merge.create_merged_concepts(processed_ids) -@pytest.mark.skipif(not IS_DDB_TEST, reason="only applies to DynamoDB in test env") +@pytest.mark.skipif(not IS_DDB_TEST, reason='only applies to DynamoDB in test env') def test_item_type(db_fixture): """Check that items are tagged with item_type attribute.""" - filter_exp = Key("label_and_type").eq("ncbigene:8193##identity") - item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0] - assert "item_type" in item - assert item["item_type"] == "identity" - - filter_exp = Key("label_and_type").eq("prkrap1##symbol") - item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0] - assert "item_type" in item - assert item["item_type"] == "symbol" - - filter_exp = Key("label_and_type").eq("loc157663##prev_symbol") - item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0] - assert "item_type" in item - assert item["item_type"] == "prev_symbol" - - filter_exp = Key("label_and_type").eq("flj23569##alias") - item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0] - assert "item_type" in item - assert item["item_type"] == "alias" - - filter_exp = Key("label_and_type").eq("omim:606689##associated_with") - item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0] - assert "item_type" in item - assert item["item_type"] == "associated_with" - - filter_exp = Key("label_and_type").eq("ensembl:ensg00000268895##xref") - item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0] - assert "item_type" in item - assert item["item_type"] == "xref" - - -@pytest.mark.skipif(not IS_TEST_ENV, reason="not in test environment") + filter_exp = Key('label_and_type').eq('ncbigene:8193##identity') + item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'identity' + + filter_exp = Key('label_and_type').eq('prkrap1##symbol') + item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'symbol' + + filter_exp = Key('label_and_type').eq('loc157663##prev_symbol') + item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'prev_symbol' + + filter_exp = Key('label_and_type').eq('flj23569##alias') + item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'alias' + + filter_exp = Key('label_and_type').eq('omim:606689##associated_with') + item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'associated_with' + + filter_exp = Key('label_and_type').eq('ensembl:ensg00000268895##xref') + item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + assert 'item_type' in item + assert item['item_type'] == 'xref' + + +@pytest.mark.skipif(not IS_TEST_ENV, reason='not in test environment') def test_get_all_records(db_fixture): """Basic test of get_all_records method. @@ -171,10 +171,10 @@ def test_get_all_records(db_fixture): """ source_records = list(db_fixture.db.get_all_records(RecordType.IDENTITY)) assert len(source_records) == 63 - source_ids = {r["concept_id"] for r in source_records} + source_ids = {r['concept_id'] for r in source_records} assert len(source_ids) == 63 normalized_records = list(db_fixture.db.get_all_records(RecordType.MERGER)) assert len(normalized_records) == 46 - normalized_ids = {r["concept_id"] for r in normalized_records} + normalized_ids = {r['concept_id'] for r in normalized_records} assert len(normalized_ids) == 46 diff --git a/tests/unit/test_emit_warnings.py b/tests/unit/test_emit_warnings.py index c8309aac..c28e7ae5 100644 --- a/tests/unit/test_emit_warnings.py +++ b/tests/unit/test_emit_warnings.py @@ -7,25 +7,25 @@ def test_emit_warnings(): """Test that emit_warnings works correctly.""" expected_warnings = [ { - "non_breaking_space_characters": "Query contains non-breaking space characters" + 'non_breaking_space_characters': 'Query contains non-breaking space characters' } ] db = create_db() query_handler = QueryHandler(db) # Test emit no warnings - actual_warnings = query_handler._emit_warnings("spry3") + actual_warnings = query_handler._emit_warnings('spry3') assert actual_warnings == [] # Test emit warnings - actual_warnings = query_handler._emit_warnings("sp ry3") + actual_warnings = query_handler._emit_warnings('sp ry3') assert actual_warnings == actual_warnings - actual_warnings = query_handler._emit_warnings("sp\u00A0ry3") + actual_warnings = query_handler._emit_warnings('sp\u00A0ry3') assert expected_warnings == actual_warnings - actual_warnings = query_handler._emit_warnings("sp ry3") + actual_warnings = query_handler._emit_warnings('sp ry3') assert expected_warnings == actual_warnings - actual_warnings = query_handler._emit_warnings("sp\xa0ry3") + actual_warnings = query_handler._emit_warnings('sp\xa0ry3') assert expected_warnings == actual_warnings diff --git a/tests/unit/test_endpoints.py b/tests/unit/test_endpoints.py index 0639e6a0..25e3aa05 100644 --- a/tests/unit/test_endpoints.py +++ b/tests/unit/test_endpoints.py @@ -10,7 +10,7 @@ from gene.main import app -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def api_client(): """Provide test client fixture.""" return TestClient(app) @@ -18,26 +18,26 @@ def api_client(): def test_search(api_client): """Test /search endpoint.""" - response = api_client.get("/gene/search?q=braf") + response = api_client.get('/gene/search?q=braf') assert response.status_code == 200 assert ( - response.json()["source_matches"]["HGNC"]["records"][0]["concept_id"] - == "hgnc:1097" + response.json()['source_matches']['HGNC']['records'][0]['concept_id'] + == 'hgnc:1097' ) - response = api_client.get("/gene/search?q=braf&incl=sdkl") + response = api_client.get('/gene/search?q=braf&incl=sdkl') assert response.status_code == 422 def test_normalize(api_client): """Test /normalize endpoint.""" - response = api_client.get("/gene/normalize?q=braf") + response = api_client.get('/gene/normalize?q=braf') assert response.status_code == 200 - assert response.json()["normalized_id"] == "hgnc:1097" + assert response.json()['normalized_id'] == 'hgnc:1097' def test_normalize_unmerged(api_client): """Test /normalize_unmerged endpoint.""" - response = api_client.get("/gene/normalize_unmerged?q=braf") + response = api_client.get('/gene/normalize_unmerged?q=braf') assert response.status_code == 200 - assert response.json()["normalized_concept_id"] == "hgnc:1097" + assert response.json()['normalized_concept_id'] == 'hgnc:1097' diff --git a/tests/unit/test_ensembl_source.py b/tests/unit/test_ensembl_source.py index 7660be3e..0e012a78 100644 --- a/tests/unit/test_ensembl_source.py +++ b/tests/unit/test_ensembl_source.py @@ -5,7 +5,7 @@ from gene.schemas import Gene, MatchType, SourceName -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def ensembl(database): """Build ensembl test fixture.""" @@ -13,7 +13,7 @@ class QueryGetter: def __init__(self): self.query_handler = QueryHandler(database) - def search(self, query_str, incl="ensembl"): + def search(self, query_str, incl='ensembl'): resp = self.query_handler.search(query_str, incl=incl) return resp.source_matches[SourceName.ENSEMBL] @@ -21,162 +21,162 @@ def search(self, query_str, incl="ensembl"): return e -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def ddx11l1(): """Create a DDX11L1 fixutre.""" params = { - "match_type": MatchType.NO_MATCH, - "concept_id": "ensembl:ENSG00000223972", - "symbol": "DDX11L1", - "label": "DEAD/H-box helicase 11 like 1 (pseudogene)", - "previous_symbols": [], - "aliases": [], - "xrefs": ["hgnc:37102"], - "symbol_status": None, - "location_annotations": [], - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'concept_id': 'ensembl:ENSG00000223972', + 'symbol': 'DDX11L1', + 'label': 'DEAD/H-box helicase 11 like 1 (pseudogene)', + 'previous_symbols': [], + 'aliases': [], + 'xrefs': ['hgnc:37102'], + 'symbol_status': None, + 'location_annotations': [], + 'locations': [ { - "id": "ga4gh:SL.Ihi0T86UoFIEbH0DHttX2nIw_BdOkI5L", - "end": 14409, - "start": 11868, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + 'id': 'ga4gh:SL.Ihi0T86UoFIEbH0DHttX2nIw_BdOkI5L', + 'end': 14409, + 'start': 11868, + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO', }, - "type": "SequenceLocation", + 'type': 'SequenceLocation', } ], - "strand": "+", - "associated_with": [], - "gene_type": "transcribed_unprocessed_pseudogene", + 'strand': '+', + 'associated_with': [], + 'gene_type': 'transcribed_unprocessed_pseudogene', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def tp53(): """Create a TP53 fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "concept_id": "ensembl:ENSG00000141510", - "symbol": "TP53", - "label": "tumor protein p53", - "previous_symbols": [], - "aliases": [], - "xrefs": ["hgnc:11998"], - "symbol_status": None, - "location_annotations": [], - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'concept_id': 'ensembl:ENSG00000141510', + 'symbol': 'TP53', + 'label': 'tumor protein p53', + 'previous_symbols': [], + 'aliases': [], + 'xrefs': ['hgnc:11998'], + 'symbol_status': None, + 'location_annotations': [], + 'locations': [ { - "id": "ga4gh:SL.TlGoA-JmP3Xky3RhJ6_UU3eJKq8EpEp9", - "end": 7687538, - "start": 7661778, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7", + 'id': 'ga4gh:SL.TlGoA-JmP3Xky3RhJ6_UU3eJKq8EpEp9', + 'end': 7687538, + 'start': 7661778, + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', }, - "type": "SequenceLocation", + 'type': 'SequenceLocation', } ], - "strand": "-", - "associated_with": [], - "gene_type": "protein_coding", + 'strand': '-', + 'associated_with': [], + 'gene_type': 'protein_coding', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def ATP6AP1_DT(): # noqa: N802 """Create a ATP6AP1-DT test fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "concept_id": "ensembl:ENSG00000197180", - "symbol": "ATP6AP1-DT", - "label": "ATP6AP1 divergent transcript", - "previous_symbols": [], - "aliases": [], - "xrefs": ["hgnc:25138"], - "symbol_status": None, - "location_annotations": [], - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'concept_id': 'ensembl:ENSG00000197180', + 'symbol': 'ATP6AP1-DT', + 'label': 'ATP6AP1 divergent transcript', + 'previous_symbols': [], + 'aliases': [], + 'xrefs': ['hgnc:25138'], + 'symbol_status': None, + 'location_annotations': [], + 'locations': [ { - "id": "ga4gh:SL.bPbeeEGSqjlZJ1Ddmg5T9ptJz9tKxYi3", - "end": 154428526, - "start": 154424377, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + 'id': 'ga4gh:SL.bPbeeEGSqjlZJ1Ddmg5T9ptJz9tKxYi3', + 'end': 154428526, + 'start': 154424377, + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP', }, - "type": "SequenceLocation", + 'type': 'SequenceLocation', } ], - "strand": "-", - "associated_with": [], - "gene_type": "lncRNA", + 'strand': '-', + 'associated_with': [], + 'gene_type': 'lncRNA', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def hsa_mir_1253(): """Create a hsa-miR-1253 test fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "concept_id": "ensembl:ENSG00000272920", - "symbol": "hsa-mir-1253", - "label": "hsa-mir-1253", - "previous_symbols": [], - "aliases": [], - "xrefs": [], - "symbol_status": None, - "location_annotations": [], - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'concept_id': 'ensembl:ENSG00000272920', + 'symbol': 'hsa-mir-1253', + 'label': 'hsa-mir-1253', + 'previous_symbols': [], + 'aliases': [], + 'xrefs': [], + 'symbol_status': None, + 'location_annotations': [], + 'locations': [ { - "id": "ga4gh:SL.x4kOE6ZXG-xY7nm6bu2W7lvm6ljaJXzR", - "end": 2748182, - "start": 2748077, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7", + 'id': 'ga4gh:SL.x4kOE6ZXG-xY7nm6bu2W7lvm6ljaJXzR', + 'end': 2748182, + 'start': 2748077, + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', }, - "type": "SequenceLocation", + 'type': 'SequenceLocation', } ], - "strand": "+", - "associated_with": ["mirbase:MI0006387"], - "gene_type": "lncRNA", + 'strand': '+', + 'associated_with': ['mirbase:MI0006387'], + 'gene_type': 'lncRNA', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def spry3(): """Create a SPRY3 test fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "concept_id": "ensembl:ENSG00000168939", - "symbol": "SPRY3", - "label": "sprouty RTK signaling antagonist 3", - "previous_symbols": [], - "aliases": [], - "xrefs": ["hgnc:11271"], - "symbol_status": None, - "location_annotations": [], - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'concept_id': 'ensembl:ENSG00000168939', + 'symbol': 'SPRY3', + 'label': 'sprouty RTK signaling antagonist 3', + 'previous_symbols': [], + 'aliases': [], + 'xrefs': ['hgnc:11271'], + 'symbol_status': None, + 'location_annotations': [], + 'locations': [ { - "id": "ga4gh:SL.fxU7Axal2_GbyOfW8NQf0plM-SUWFCB0", - "end": 155782459, - "start": 155612571, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + 'id': 'ga4gh:SL.fxU7Axal2_GbyOfW8NQf0plM-SUWFCB0', + 'end': 155782459, + 'start': 155612571, + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP', }, - "type": "SequenceLocation", + 'type': 'SequenceLocation', } ], - "strand": "+", - "associated_with": [], - "gene_type": "protein_coding", + 'strand': '+', + 'associated_with': [], + 'gene_type': 'protein_coding', } return Gene(**params) @@ -184,137 +184,137 @@ def spry3(): def test_ddx11l1(check_resp_single_record, ensembl, ddx11l1): """Test that DDX11L1 normalizes to correct gene concept.""" # Concept ID - resp = ensembl.search("ensembl:ENSG00000223972") + resp = ensembl.search('ensembl:ENSG00000223972') check_resp_single_record(resp, ddx11l1, MatchType.CONCEPT_ID) - resp = ensembl.search("ENSEMBL:ENSG00000223972") + resp = ensembl.search('ENSEMBL:ENSG00000223972') check_resp_single_record(resp, ddx11l1, MatchType.CONCEPT_ID) - resp = ensembl.search("ENSG00000223972") + resp = ensembl.search('ENSG00000223972') check_resp_single_record(resp, ddx11l1, MatchType.CONCEPT_ID) # Symbol - resp = ensembl.search("ddx11l1") + resp = ensembl.search('ddx11l1') check_resp_single_record(resp, ddx11l1, MatchType.SYMBOL) - resp = ensembl.search("DDX11L1") + resp = ensembl.search('DDX11L1') check_resp_single_record(resp, ddx11l1, MatchType.SYMBOL) def test_tp53(check_resp_single_record, ensembl, tp53): """Test that tp53 normalizes to correct gene concept.""" # Concept ID - resp = ensembl.search("ensembl:ENSG00000141510") + resp = ensembl.search('ensembl:ENSG00000141510') check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID) - resp = ensembl.search("ENSEMBL:ENSG00000141510") + resp = ensembl.search('ENSEMBL:ENSG00000141510') check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID) - resp = ensembl.search("ENSG00000141510") + resp = ensembl.search('ENSG00000141510') check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID) # Symbol - resp = ensembl.search("tp53") + resp = ensembl.search('tp53') check_resp_single_record(resp, tp53, MatchType.SYMBOL) - resp = ensembl.search("TP53") + resp = ensembl.search('TP53') check_resp_single_record(resp, tp53, MatchType.SYMBOL) def test_ATP6AP1_DT(check_resp_single_record, ensembl, ATP6AP1_DT): # noqa: N802 N803 """Test that ATP6AP1-DT normalizes to correct gene concept.""" # Concept ID - resp = ensembl.search("ensembl:ENSG00000197180") + resp = ensembl.search('ensembl:ENSG00000197180') check_resp_single_record(resp, ATP6AP1_DT, MatchType.CONCEPT_ID) - resp = ensembl.search("ENSEMBL:ENSG00000197180") + resp = ensembl.search('ENSEMBL:ENSG00000197180') check_resp_single_record(resp, ATP6AP1_DT, MatchType.CONCEPT_ID) - resp = ensembl.search("ENSG00000197180") + resp = ensembl.search('ENSG00000197180') check_resp_single_record(resp, ATP6AP1_DT, MatchType.CONCEPT_ID) # Symbol - resp = ensembl.search("ATP6AP1-DT") + resp = ensembl.search('ATP6AP1-DT') check_resp_single_record(resp, ATP6AP1_DT, MatchType.SYMBOL) def test_hsa_mir_1253(check_resp_single_record, ensembl, hsa_mir_1253): """Test that hsa-mir-1253 normalizes to correct gene concept.""" # Concept ID - resp = ensembl.search("ensembl:ENSG00000272920") + resp = ensembl.search('ensembl:ENSG00000272920') check_resp_single_record(resp, hsa_mir_1253, MatchType.CONCEPT_ID) - resp = ensembl.search("ENSEMBL:ENSG00000272920") + resp = ensembl.search('ENSEMBL:ENSG00000272920') check_resp_single_record(resp, hsa_mir_1253, MatchType.CONCEPT_ID) - resp = ensembl.search("ENSG00000272920") + resp = ensembl.search('ENSG00000272920') check_resp_single_record(resp, hsa_mir_1253, MatchType.CONCEPT_ID) # Symbol - resp = ensembl.search("hsa-mir-1253") + resp = ensembl.search('hsa-mir-1253') check_resp_single_record(resp, hsa_mir_1253, MatchType.SYMBOL) # associated_with - resp = ensembl.search("mirbase:MI0006387") + resp = ensembl.search('mirbase:MI0006387') check_resp_single_record(resp, hsa_mir_1253, MatchType.ASSOCIATED_WITH) def test_spry3(check_resp_single_record, ensembl, spry3): """Test that spry3 normalizes to correct gene concept.""" # Concept ID - resp = ensembl.search("ensembl:EnSG00000168939") + resp = ensembl.search('ensembl:EnSG00000168939') check_resp_single_record(resp, spry3, MatchType.CONCEPT_ID) - resp = ensembl.search("ENSEMBL:EnSG00000168939") + resp = ensembl.search('ENSEMBL:EnSG00000168939') check_resp_single_record(resp, spry3, MatchType.CONCEPT_ID) - resp = ensembl.search("EnSG00000168939") + resp = ensembl.search('EnSG00000168939') check_resp_single_record(resp, spry3, MatchType.CONCEPT_ID) # Symbol - resp = ensembl.search("spry3") + resp = ensembl.search('spry3') check_resp_single_record(resp, spry3, MatchType.SYMBOL) def test_no_match(ensembl): """Test that a term normalizes to correct gene concept as a NO match.""" - resp = ensembl.search("A1BG - AS1") + resp = ensembl.search('A1BG - AS1') assert len(resp.records) == 0 - resp = ensembl.search("hnc:5") + resp = ensembl.search('hnc:5') assert len(resp.records) == 0 # Test empty query - resp = ensembl.search("") + resp = ensembl.search('') assert len(resp.records) == 0 # Do not search on label - resp = ensembl.search("A1BG antisense RNA 1") + resp = ensembl.search('A1BG antisense RNA 1') assert len(resp.records) == 0 - resp = ensembl.search("ensembl:ENSG00000278704") + resp = ensembl.search('ensembl:ENSG00000278704') assert len(resp.records) == 0 - resp = ensembl.search("ensembl:ENSG00000284906") + resp = ensembl.search('ensembl:ENSG00000284906') assert len(resp.records) == 0 def test_meta_info(ensembl): """Test that the meta field is correct.""" - resp = ensembl.search("chromosome:1") - assert resp.source_meta_.data_license == "custom" + resp = ensembl.search('chromosome:1') + assert resp.source_meta_.data_license == 'custom' assert ( resp.source_meta_.data_license_url - == "https://useast.ensembl.org/info/about/legal/disclaimer.html" + == 'https://useast.ensembl.org/info/about/legal/disclaimer.html' ) - assert resp.source_meta_.version == "110" + assert resp.source_meta_.version == '110' assert resp.source_meta_.data_url == { - "genome_annotations": "ftp://ftp.ensembl.org/pub/release-110/gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz" + 'genome_annotations': 'ftp://ftp.ensembl.org/pub/release-110/gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz' } assert resp.source_meta_.rdp_url is None - assert resp.source_meta_.genome_assemblies == ["GRCh38"] + assert resp.source_meta_.genome_assemblies == ['GRCh38'] assert resp.source_meta_.data_license_attributes == { - "non_commercial": False, - "share_alike": False, - "attribution": False, + 'non_commercial': False, + 'share_alike': False, + 'attribution': False, } diff --git a/tests/unit/test_hgnc_source.py b/tests/unit/test_hgnc_source.py index 54d0aff0..1673c2ba 100644 --- a/tests/unit/test_hgnc_source.py +++ b/tests/unit/test_hgnc_source.py @@ -7,7 +7,7 @@ from gene.schemas import Gene, MatchType, SourceName -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def hgnc(database): """Build hgnc test fixture.""" @@ -15,7 +15,7 @@ class QueryGetter: def __init__(self): self.query_handler = QueryHandler(database) - def search(self, query_str, incl="hgnc"): + def search(self, query_str, incl='hgnc'): resp = self.query_handler.search(query_str, incl=incl) return resp.source_matches[SourceName.HGNC] @@ -26,17 +26,17 @@ def search(self, query_str, incl="hgnc"): # Test Non Alt Loci Set -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def a1bg_as1(): """Create an A1BG-AS1 gene fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "A1BG antisense RNA 1", - "concept_id": "hgnc:37133", - "symbol": "A1BG-AS1", - "location_annotations": [], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'A1BG antisense RNA 1', + 'concept_id': 'hgnc:37133', + 'symbol': 'A1BG-AS1', + 'location_annotations': [], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.Rz-M5wA0_bIhQYLKi2ZPqlqW3nBPfAx5", # "chr": "19", @@ -46,34 +46,34 @@ def a1bg_as1(): # "type": "ChromosomeLocation" # } ], - "previous_symbols": ["NCRNA00181", "A1BGAS", "A1BG-AS"], - "aliases": ["FLJ23569"], - "symbol_status": "approved", - "associated_with": [ - "vega:OTTHUMG00000183508", - "ucsc:uc002qse.3", - "refseq:NR_015380", - "ena.embl:BC040926", - "refseq:NR_015380", - "ena.embl:BC040926", + 'previous_symbols': ['NCRNA00181', 'A1BGAS', 'A1BG-AS'], + 'aliases': ['FLJ23569'], + 'symbol_status': 'approved', + 'associated_with': [ + 'vega:OTTHUMG00000183508', + 'ucsc:uc002qse.3', + 'refseq:NR_015380', + 'ena.embl:BC040926', + 'refseq:NR_015380', + 'ena.embl:BC040926', ], - "xrefs": ["ensembl:ENSG00000268895", "ncbigene:503538"], - "gene_type": "RNA, long non-coding", + 'xrefs': ['ensembl:ENSG00000268895', 'ncbigene:503538'], + 'gene_type': 'RNA, long non-coding', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def tp53(): """Create a TP53 gene fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "tumor protein p53", - "concept_id": "hgnc:11998", - "symbol": "TP53", - "location_annotations": [], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'tumor protein p53', + 'concept_id': 'hgnc:11998', + 'symbol': 'TP53', + 'location_annotations': [], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.BPk3okUhv4BBatjkyC7eQQsyXL6YwmeF", # "chr": "17", @@ -83,51 +83,51 @@ def tp53(): # "type": "ChromosomeLocation" # } ], - "previous_symbols": [], - "aliases": ["p53", "LFS1"], - "symbol_status": "approved", - "associated_with": [ - "vega:OTTHUMG00000162125", - "refseq:NM_000546", - "cosmic:TP53", - "omim:191170", - "ucsc:uc060aur.1", - "uniprot:P04637", - "orphanet:120204", - "ccds:CCDS73968", - "ccds:CCDS73971", - "ccds:CCDS73970", - "ccds:CCDS73969", - "ccds:CCDS73967", - "ccds:CCDS73966", - "ccds:CCDS73965", - "ccds:CCDS73964", - "ccds:CCDS73963", - "ccds:CCDS11118", - "ccds:CCDS45605", - "ccds:CCDS45606", - "ena.embl:AF307851", - "pubmed:6396087", - "pubmed:3456488", - "pubmed:2047879", + 'previous_symbols': [], + 'aliases': ['p53', 'LFS1'], + 'symbol_status': 'approved', + 'associated_with': [ + 'vega:OTTHUMG00000162125', + 'refseq:NM_000546', + 'cosmic:TP53', + 'omim:191170', + 'ucsc:uc060aur.1', + 'uniprot:P04637', + 'orphanet:120204', + 'ccds:CCDS73968', + 'ccds:CCDS73971', + 'ccds:CCDS73970', + 'ccds:CCDS73969', + 'ccds:CCDS73967', + 'ccds:CCDS73966', + 'ccds:CCDS73965', + 'ccds:CCDS73964', + 'ccds:CCDS73963', + 'ccds:CCDS11118', + 'ccds:CCDS45605', + 'ccds:CCDS45606', + 'ena.embl:AF307851', + 'pubmed:6396087', + 'pubmed:3456488', + 'pubmed:2047879', ], - "xrefs": ["ensembl:ENSG00000141510", "ncbigene:7157"], - "gene_type": "gene with protein product", + 'xrefs': ['ensembl:ENSG00000141510', 'ncbigene:7157'], + 'gene_type': 'gene with protein product', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def a3galt2(): """Create an A3GALT2 gene fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "alpha 1,3-galactosyltransferase 2", - "concept_id": "hgnc:30005", - "symbol": "A3GALT2", - "location_annotations": [], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'alpha 1,3-galactosyltransferase 2', + 'concept_id': 'hgnc:30005', + 'symbol': 'A3GALT2', + 'location_annotations': [], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.iiwv6oaDfVVkjMZ_OH6XEQmM0daVft4u", # "chr": "1", @@ -137,37 +137,37 @@ def a3galt2(): # "type": "ChromosomeLocation" # } ], - "previous_symbols": ["A3GALT2P"], - "aliases": ["IGBS3S", "IGB3S"], - "symbol_status": "approved", - "xrefs": ["ensembl:ENSG00000184389", "ncbigene:127550"], - "associated_with": [ - "vega:OTTHUMG00000004125", - "vega:OTTHUMG00000004125", - "ucsc:uc031plq.1", - "uniprot:U3KPV4", - "ccds:CCDS60080", - "pubmed:10854427", - "pubmed:18630988", - "refseq:NM_001080438", - "omim:619850", + 'previous_symbols': ['A3GALT2P'], + 'aliases': ['IGBS3S', 'IGB3S'], + 'symbol_status': 'approved', + 'xrefs': ['ensembl:ENSG00000184389', 'ncbigene:127550'], + 'associated_with': [ + 'vega:OTTHUMG00000004125', + 'vega:OTTHUMG00000004125', + 'ucsc:uc031plq.1', + 'uniprot:U3KPV4', + 'ccds:CCDS60080', + 'pubmed:10854427', + 'pubmed:18630988', + 'refseq:NM_001080438', + 'omim:619850', ], - "gene_type": "gene with protein product", + 'gene_type': 'gene with protein product', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def wdhd1(): """Create a WDHD1 gene fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "WD repeat and HMG-box DNA binding protein 1", - "concept_id": "hgnc:23170", - "symbol": "WDHD1", - "location_annotations": [], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'WD repeat and HMG-box DNA binding protein 1', + 'concept_id': 'hgnc:23170', + 'symbol': 'WDHD1', + 'location_annotations': [], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.sNe5mpPbxivH2KE6HdaDA3U29BkCQXc3", # "chr": "14", @@ -177,80 +177,80 @@ def wdhd1(): # "type": "ChromosomeLocation" # } ], - "previous_symbols": [], - "aliases": ["AND-1", "CTF4", "CHTF4"], - "symbol_status": "approved", - "xrefs": ["ensembl:ENSG00000198554", "ncbigene:11169"], - "associated_with": [ - "vega:OTTHUMG00000140304", - "refseq:NM_007086", - "omim:608126", - "ucsc:uc001xbm.3", - "uniprot:O75717", - "ccds:CCDS41955", - "ccds:CCDS9721", - "ena.embl:AJ006266", - "pubmed:9175701", - "pubmed:20028748", + 'previous_symbols': [], + 'aliases': ['AND-1', 'CTF4', 'CHTF4'], + 'symbol_status': 'approved', + 'xrefs': ['ensembl:ENSG00000198554', 'ncbigene:11169'], + 'associated_with': [ + 'vega:OTTHUMG00000140304', + 'refseq:NM_007086', + 'omim:608126', + 'ucsc:uc001xbm.3', + 'uniprot:O75717', + 'ccds:CCDS41955', + 'ccds:CCDS9721', + 'ena.embl:AJ006266', + 'pubmed:9175701', + 'pubmed:20028748', ], - "gene_type": "gene with protein product", + 'gene_type': 'gene with protein product', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def g6pr(): """Create a G6PR gene fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "glucose-6-phosphatase regulator", - "concept_id": "hgnc:4059", - "symbol": "G6PR", - "location_annotations": ["reserved"], - "locations": [], - "strand": None, - "previous_symbols": [], - "aliases": ["GSD1aSP"], - "symbol_status": "approved", - "xrefs": ["ncbigene:2541"], - "associated_with": ["pubmed:2172641", "pubmed:7814621", "pubmed:2996501"], - "gene_type": "unknown", + 'match_type': MatchType.NO_MATCH, + 'label': 'glucose-6-phosphatase regulator', + 'concept_id': 'hgnc:4059', + 'symbol': 'G6PR', + 'location_annotations': ['reserved'], + 'locations': [], + 'strand': None, + 'previous_symbols': [], + 'aliases': ['GSD1aSP'], + 'symbol_status': 'approved', + 'xrefs': ['ncbigene:2541'], + 'associated_with': ['pubmed:2172641', 'pubmed:7814621', 'pubmed:2996501'], + 'gene_type': 'unknown', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def pirc24(): """Create a PIRC24 gene fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "piwi-interacting RNA cluster 24", - "concept_id": "hgnc:37528", - "symbol": "PIRC24", - "location_annotations": ["6"], - "locations": [], - "strand": None, - "previous_symbols": [], - "aliases": [], - "symbol_status": "approved", - "xrefs": ["ncbigene:100313810"], - "associated_with": ["pubmed:17881367"], - "gene_type": "RNA, cluster", + 'match_type': MatchType.NO_MATCH, + 'label': 'piwi-interacting RNA cluster 24', + 'concept_id': 'hgnc:37528', + 'symbol': 'PIRC24', + 'location_annotations': ['6'], + 'locations': [], + 'strand': None, + 'previous_symbols': [], + 'aliases': [], + 'symbol_status': 'approved', + 'xrefs': ['ncbigene:100313810'], + 'associated_with': ['pubmed:17881367'], + 'gene_type': 'RNA, cluster', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def gage4(): """Create a GAGE4 gene fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "G antigen 4", - "concept_id": "hgnc:4101", - "symbol": "GAGE4", - "location_annotations": ["not on reference assembly"], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'G antigen 4', + 'concept_id': 'hgnc:4101', + 'symbol': 'GAGE4', + 'location_annotations': ['not on reference assembly'], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.6KzwrFm2WeSXqwIIiNbAu-pKQQHt2q5Q", # "chr": "X", @@ -260,83 +260,83 @@ def gage4(): # "type": "ChromosomeLocation" # } ], - "previous_symbols": [], - "aliases": ["CT4.4"], - "symbol_status": "approved", - "xrefs": ["ncbigene:2576"], - "associated_with": [ - "refseq:NM_001474", - "omim:300597", - "uniprot:P0DSO3", - "ena.embl:U19145", - "pubmed:7544395", + 'previous_symbols': [], + 'aliases': ['CT4.4'], + 'symbol_status': 'approved', + 'xrefs': ['ncbigene:2576'], + 'associated_with': [ + 'refseq:NM_001474', + 'omim:300597', + 'uniprot:P0DSO3', + 'ena.embl:U19145', + 'pubmed:7544395', ], - "gene_type": "gene with protein product", + 'gene_type': 'gene with protein product', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def mafip(): """Create a MAFIP gene fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "MAFF interacting protein", - "concept_id": "hgnc:31102", - "symbol": "MAFIP", - "location_annotations": ["unplaced", "14"], - "locations": [], - "strand": None, - "previous_symbols": [], - "aliases": ["FLJ35473", "FLJ00219", "FLJ39633", "MIP", "pp5644", "TEKT4P4"], - "symbol_status": "approved", - "xrefs": ["ensembl:ENSG00000274847", "ncbigene:727764"], - "associated_with": [ - "vega:OTTHUMG00000188065", - "refseq:NR_046439", - "uniprot:Q8WZ33", - "ena.embl:AK074146", - "ena.embl:AF289559", - "pubmed:16549056", - "pubmed:15881666", + 'match_type': MatchType.NO_MATCH, + 'label': 'MAFF interacting protein', + 'concept_id': 'hgnc:31102', + 'symbol': 'MAFIP', + 'location_annotations': ['unplaced', '14'], + 'locations': [], + 'strand': None, + 'previous_symbols': [], + 'aliases': ['FLJ35473', 'FLJ00219', 'FLJ39633', 'MIP', 'pp5644', 'TEKT4P4'], + 'symbol_status': 'approved', + 'xrefs': ['ensembl:ENSG00000274847', 'ncbigene:727764'], + 'associated_with': [ + 'vega:OTTHUMG00000188065', + 'refseq:NR_046439', + 'uniprot:Q8WZ33', + 'ena.embl:AK074146', + 'ena.embl:AF289559', + 'pubmed:16549056', + 'pubmed:15881666', ], - "gene_type": "unknown", + 'gene_type': 'unknown', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def mt_7sdna(): """Create a MT-7SDNA gene fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "mitochondrially encoded 7S DNA", - "concept_id": "hgnc:7409", - "symbol": "MT-7SDNA", - "location_annotations": ["MT"], - "locations": [], - "strand": None, - "previous_symbols": ["MT7SDNA"], - "aliases": [], - "symbol_status": "approved", - "xrefs": [], - "associated_with": ["pubmed:24709344", "pubmed:273237"], - "gene_type": "region", + 'match_type': MatchType.NO_MATCH, + 'label': 'mitochondrially encoded 7S DNA', + 'concept_id': 'hgnc:7409', + 'symbol': 'MT-7SDNA', + 'location_annotations': ['MT'], + 'locations': [], + 'strand': None, + 'previous_symbols': ['MT7SDNA'], + 'aliases': [], + 'symbol_status': 'approved', + 'xrefs': [], + 'associated_with': ['pubmed:24709344', 'pubmed:273237'], + 'gene_type': 'region', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def cecr(): """Create a CECR gene fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "cat eye syndrome chromosome region", - "concept_id": "hgnc:1838", - "symbol": "CECR", - "location_annotations": [], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'cat eye syndrome chromosome region', + 'concept_id': 'hgnc:1838', + 'symbol': 'CECR', + 'location_annotations': [], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.AgASk5sB6LCeaB6rcqOwmrm16ise3pof", # "chr": "22", @@ -346,27 +346,27 @@ def cecr(): # "type": "ChromosomeLocation" # } ], - "previous_symbols": [], - "aliases": [], - "symbol_status": "approved", - "xrefs": ["ncbigene:1055"], - "associated_with": [], - "gene_type": "region", + 'previous_symbols': [], + 'aliases': [], + 'symbol_status': 'approved', + 'xrefs': ['ncbigene:1055'], + 'associated_with': [], + 'gene_type': 'region', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def csf2ra(): """Create a CSF2RA gene fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "colony stimulating factor 2 receptor subunit alpha", - "concept_id": "hgnc:2435", - "symbol": "CSF2RA", - "location_annotations": [], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'colony stimulating factor 2 receptor subunit alpha', + 'concept_id': 'hgnc:2435', + 'symbol': 'CSF2RA', + 'location_annotations': [], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.cITg67iNn_QNZTKpJd0I-1JMMhW_yHGU", # "chr": "X", @@ -384,45 +384,45 @@ def csf2ra(): # "type": "ChromosomeLocation" # } ], - "previous_symbols": ["CSF2R"], - "aliases": ["CD116", "alphaGMR"], - "symbol_status": "approved", - "xrefs": ["ensembl:ENSG00000198223", "ncbigene:1438"], - "associated_with": [ - "vega:OTTHUMG00000012533", - "refseq:NM_001161529", - "orphanet:209477", - "iuphar:1707", - "hcdmdb:CD116", - "omim:306250", - "omim:425000", - "ucsc:uc010nvv.3", - "uniprot:P15509", - "ena.embl:M64445", - "ccds:CCDS35190", - "ccds:CCDS55360", - "ccds:CCDS35191", - "ccds:CCDS55359", - "ccds:CCDS35192", - "ccds:CCDS35193", - "pubmed:1702217", + 'previous_symbols': ['CSF2R'], + 'aliases': ['CD116', 'alphaGMR'], + 'symbol_status': 'approved', + 'xrefs': ['ensembl:ENSG00000198223', 'ncbigene:1438'], + 'associated_with': [ + 'vega:OTTHUMG00000012533', + 'refseq:NM_001161529', + 'orphanet:209477', + 'iuphar:1707', + 'hcdmdb:CD116', + 'omim:306250', + 'omim:425000', + 'ucsc:uc010nvv.3', + 'uniprot:P15509', + 'ena.embl:M64445', + 'ccds:CCDS35190', + 'ccds:CCDS55360', + 'ccds:CCDS35191', + 'ccds:CCDS55359', + 'ccds:CCDS35192', + 'ccds:CCDS35193', + 'pubmed:1702217', ], - "gene_type": "gene with protein product", + 'gene_type': 'gene with protein product', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def rps24p5(): """Create a RPS24P5 gene fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "ribosomal protein S24 pseudogene 5", - "concept_id": "hgnc:36026", - "symbol": "RPS24P5", - "location_annotations": [], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'ribosomal protein S24 pseudogene 5', + 'concept_id': 'hgnc:36026', + 'symbol': 'RPS24P5', + 'location_annotations': [], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.Ri0ddtMpe6DGzrC9_QGbL35gYAtU2bh_", # "chr": "1", @@ -432,27 +432,27 @@ def rps24p5(): # "type": "ChromosomeLocation" # } ], - "previous_symbols": [], - "aliases": [], - "symbol_status": "approved", - "xrefs": ["ncbigene:100271094"], - "associated_with": ["refseq:NG_011274", "pubmed:19123937"], - "gene_type": "pseudogene", + 'previous_symbols': [], + 'aliases': [], + 'symbol_status': 'approved', + 'xrefs': ['ncbigene:100271094'], + 'associated_with': ['refseq:NG_011274', 'pubmed:19123937'], + 'gene_type': 'pseudogene', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def trl_cag2_1(): """Create a TRL-CAG2-1 gene fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "tRNA-Leu (anticodon CAG) 2-1", - "concept_id": "hgnc:34692", - "symbol": "TRL-CAG2-1", - "location_annotations": [], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'tRNA-Leu (anticodon CAG) 2-1', + 'concept_id': 'hgnc:34692', + 'symbol': 'TRL-CAG2-1', + 'location_annotations': [], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.aZ5aYHaC3GhDWgwhKkAcd9GBvkEo034v", # "chr": "16", @@ -462,27 +462,27 @@ def trl_cag2_1(): # "type": "ChromosomeLocation" # } ], - "previous_symbols": ["TRNAL13"], - "aliases": ["tRNA-Leu-CAG-2-1"], - "symbol_status": "approved", - "xrefs": ["ncbigene:100189130"], - "associated_with": ["ena.embl:HG983896"], - "gene_type": "RNA, transfer", + 'previous_symbols': ['TRNAL13'], + 'aliases': ['tRNA-Leu-CAG-2-1'], + 'symbol_status': 'approved', + 'xrefs': ['ncbigene:100189130'], + 'associated_with': ['ena.embl:HG983896'], + 'gene_type': 'RNA, transfer', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def myo5b(): """Create a MYO5B gene fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "myosin VB", - "concept_id": "hgnc:7603", - "symbol": "MYO5B", - "location_annotations": [], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'myosin VB', + 'concept_id': 'hgnc:7603', + 'symbol': 'MYO5B', + 'location_annotations': [], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.hFukVqPVLD70cshAz1Gtmd6EC1imobpO", # "chr": "18", @@ -492,23 +492,23 @@ def myo5b(): # "type": "ChromosomeLocation" # } ], - "previous_symbols": [], - "aliases": ["KIAA1119"], - "symbol_status": "approved", - "xrefs": ["ensembl:ENSG00000167306", "ncbigene:4645"], - "associated_with": [ - "vega:OTTHUMG00000179843", - "refseq:NM_001080467", - "omim:606540", - "ucsc:uc002leb.3", - "uniprot:Q9ULV0", - "orphanet:171089", - "ccds:CCDS42436", - "ena.embl:AB032945", - "pubmed:8884266", - "pubmed:17462998", + 'previous_symbols': [], + 'aliases': ['KIAA1119'], + 'symbol_status': 'approved', + 'xrefs': ['ensembl:ENSG00000167306', 'ncbigene:4645'], + 'associated_with': [ + 'vega:OTTHUMG00000179843', + 'refseq:NM_001080467', + 'omim:606540', + 'ucsc:uc002leb.3', + 'uniprot:Q9ULV0', + 'orphanet:171089', + 'ccds:CCDS42436', + 'ena.embl:AB032945', + 'pubmed:8884266', + 'pubmed:17462998', ], - "gene_type": "gene with protein product", + 'gene_type': 'gene with protein product', } return Gene(**params) @@ -516,17 +516,17 @@ def myo5b(): # Test Alt Loci Set -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def gstt1(): """Create an GSTT1 gene fixture.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "glutathione S-transferase theta 1", - "concept_id": "hgnc:4641", - "symbol": "GSTT1", - "location_annotations": ["alternate reference locus"], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'glutathione S-transferase theta 1', + 'concept_id': 'hgnc:4641', + 'symbol': 'GSTT1', + 'location_annotations': ['alternate reference locus'], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.g74mxFvAzPoenOlyMjY32j-UFMvjjas_", # "chr": "22", @@ -536,20 +536,20 @@ def gstt1(): # "type": "ChromosomeLocation" # } ], - "previous_symbols": [], - "aliases": ["2.5.1.18"], - "symbol_status": "approved", - "associated_with": [ - "refseq:NM_000853", - "omim:600436", - "ucsc:uc002zze.4", - "uniprot:P30711", - "orphanet:470418", - "ena.embl:KI270879", - "pubmed:8617495", + 'previous_symbols': [], + 'aliases': ['2.5.1.18'], + 'symbol_status': 'approved', + 'associated_with': [ + 'refseq:NM_000853', + 'omim:600436', + 'ucsc:uc002zze.4', + 'uniprot:P30711', + 'orphanet:470418', + 'ena.embl:KI270879', + 'pubmed:8617495', ], - "xrefs": ["ensembl:ENSG00000277656", "ncbigene:2952"], - "gene_type": "gene with protein product", + 'xrefs': ['ensembl:ENSG00000277656', 'ncbigene:2952'], + 'gene_type': 'gene with protein product', } return Gene(**params) @@ -557,273 +557,273 @@ def gstt1(): def test_a1bg_as1(check_resp_single_record, a1bg_as1, hgnc): """Test that a1bg_as1 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search("hgnc:37133") + resp = hgnc.search('hgnc:37133') check_resp_single_record(resp, a1bg_as1, MatchType.CONCEPT_ID) - resp = hgnc.search("HGNC:37133") + resp = hgnc.search('HGNC:37133') check_resp_single_record(resp, a1bg_as1, MatchType.CONCEPT_ID) - resp = hgnc.search("Hgnc:37133") + resp = hgnc.search('Hgnc:37133') check_resp_single_record(resp, a1bg_as1, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search("A1BG-AS1") + resp = hgnc.search('A1BG-AS1') check_resp_single_record(resp, a1bg_as1, MatchType.SYMBOL) - resp = hgnc.search("A1BG-as1") + resp = hgnc.search('A1BG-as1') check_resp_single_record(resp, a1bg_as1, MatchType.SYMBOL) # Previous Symbol - resp = hgnc.search("NCRNA00181") + resp = hgnc.search('NCRNA00181') check_resp_single_record(resp, a1bg_as1, MatchType.PREV_SYMBOL) - resp = hgnc.search("A1BGAS") + resp = hgnc.search('A1BGAS') check_resp_single_record(resp, a1bg_as1, MatchType.PREV_SYMBOL) - resp = hgnc.search("A1BG-AS") + resp = hgnc.search('A1BG-AS') check_resp_single_record(resp, a1bg_as1, MatchType.PREV_SYMBOL) # Alias - resp = hgnc.search("FLJ23569") + resp = hgnc.search('FLJ23569') check_resp_single_record(resp, a1bg_as1, MatchType.ALIAS) - resp = hgnc.search("flj23569") + resp = hgnc.search('flj23569') check_resp_single_record(resp, a1bg_as1, MatchType.ALIAS) def test_a3galt2(check_resp_single_record, a3galt2, hgnc): """Test that a3galt2 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search("hgnc:30005") + resp = hgnc.search('hgnc:30005') check_resp_single_record(resp, a3galt2, MatchType.CONCEPT_ID) - resp = hgnc.search("HGNC:30005") + resp = hgnc.search('HGNC:30005') check_resp_single_record(resp, a3galt2, MatchType.CONCEPT_ID) - resp = hgnc.search("Hgnc:30005") + resp = hgnc.search('Hgnc:30005') check_resp_single_record(resp, a3galt2, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search("A3GALT2") + resp = hgnc.search('A3GALT2') check_resp_single_record(resp, a3galt2, MatchType.SYMBOL) - resp = hgnc.search("a3galt2") + resp = hgnc.search('a3galt2') check_resp_single_record(resp, a3galt2, MatchType.SYMBOL) # Previous Symbol - resp = hgnc.search("A3GALT2P") + resp = hgnc.search('A3GALT2P') check_resp_single_record(resp, a3galt2, MatchType.PREV_SYMBOL) - resp = hgnc.search("A3GALT2p") + resp = hgnc.search('A3GALT2p') check_resp_single_record(resp, a3galt2, MatchType.PREV_SYMBOL) # Alias - resp = hgnc.search("IGBS3S") + resp = hgnc.search('IGBS3S') check_resp_single_record(resp, a3galt2, MatchType.ALIAS) - resp = hgnc.search("igB3s") + resp = hgnc.search('igB3s') check_resp_single_record(resp, a3galt2, MatchType.ALIAS) def test_tp53(check_resp_single_record, tp53, hgnc): """Test that tp53 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search("hgnc:11998") + resp = hgnc.search('hgnc:11998') check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID) - resp = hgnc.search("HGNC:11998") + resp = hgnc.search('HGNC:11998') check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID) - resp = hgnc.search("Hgnc:11998") + resp = hgnc.search('Hgnc:11998') check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search("tp53") + resp = hgnc.search('tp53') check_resp_single_record(resp, tp53, MatchType.SYMBOL) - resp = hgnc.search("TP53") + resp = hgnc.search('TP53') check_resp_single_record(resp, tp53, MatchType.SYMBOL) # Alias - resp = hgnc.search("LFS1") + resp = hgnc.search('LFS1') check_resp_single_record(resp, tp53, MatchType.ALIAS) - resp = hgnc.search("p53") + resp = hgnc.search('p53') check_resp_single_record(resp, tp53, MatchType.ALIAS) def test_wdhd1(check_resp_single_record, wdhd1, hgnc): """Test that a1bg_as1 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search("hgnc:23170") + resp = hgnc.search('hgnc:23170') check_resp_single_record(resp, wdhd1, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search("WDHD1") + resp = hgnc.search('WDHD1') check_resp_single_record(resp, wdhd1, MatchType.SYMBOL) def test_g6pr(check_resp_single_record, g6pr, hgnc): """Test that g6pr normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search("hgnc:4059") + resp = hgnc.search('hgnc:4059') check_resp_single_record(resp, g6pr, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search("G6PR") + resp = hgnc.search('G6PR') check_resp_single_record(resp, g6pr, MatchType.SYMBOL) def test_pirc24(check_resp_single_record, pirc24, hgnc): """Test that pirc24 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search("hgnc:37528") + resp = hgnc.search('hgnc:37528') check_resp_single_record(resp, pirc24, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search("PIRC24") + resp = hgnc.search('PIRC24') check_resp_single_record(resp, pirc24, MatchType.SYMBOL) def test_gage4(check_resp_single_record, gage4, hgnc): """Test that gage4 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search("hgnc:4101") + resp = hgnc.search('hgnc:4101') check_resp_single_record(resp, gage4, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search("GAGE4") + resp = hgnc.search('GAGE4') check_resp_single_record(resp, gage4, MatchType.SYMBOL) def test_mafip(check_resp_single_record, mafip, hgnc): """Test that mafip normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search("hgnc:31102") + resp = hgnc.search('hgnc:31102') check_resp_single_record(resp, mafip, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search("MAFIP") + resp = hgnc.search('MAFIP') check_resp_single_record(resp, mafip, MatchType.SYMBOL) def test_mt_7sdna(check_resp_single_record, mt_7sdna, hgnc): """Test that mt_7sdna normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search("hgnc:7409") + resp = hgnc.search('hgnc:7409') check_resp_single_record(resp, mt_7sdna, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search("MT-7SDNA") + resp = hgnc.search('MT-7SDNA') check_resp_single_record(resp, mt_7sdna, MatchType.SYMBOL) def test_cecr(check_resp_single_record, cecr, hgnc): """Test that cecr normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search("hgnc:1838") + resp = hgnc.search('hgnc:1838') check_resp_single_record(resp, cecr, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search("CECR") + resp = hgnc.search('CECR') check_resp_single_record(resp, cecr, MatchType.SYMBOL) def test_csf2ra(check_resp_single_record, csf2ra, hgnc): """Test that csf2ra normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search("hgnc:2435") + resp = hgnc.search('hgnc:2435') check_resp_single_record(resp, csf2ra, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search("CSF2RA") + resp = hgnc.search('CSF2RA') check_resp_single_record(resp, csf2ra, MatchType.SYMBOL) def test_rps24p5(check_resp_single_record, rps24p5, hgnc): """Test that rps24p5 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search("hgnc:36026") + resp = hgnc.search('hgnc:36026') check_resp_single_record(resp, rps24p5, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search("rpS24P5") + resp = hgnc.search('rpS24P5') check_resp_single_record(resp, rps24p5, MatchType.SYMBOL) def test_trl_cag2_1(check_resp_single_record, trl_cag2_1, hgnc): """Test that trl_cag2_1 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search("hgnc:34692") + resp = hgnc.search('hgnc:34692') check_resp_single_record(resp, trl_cag2_1, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search("TRL-CAG2-1") + resp = hgnc.search('TRL-CAG2-1') check_resp_single_record(resp, trl_cag2_1, MatchType.SYMBOL) def test_myo5b(check_resp_single_record, myo5b, hgnc): """Test that myo5b normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search("hgnc:7603") + resp = hgnc.search('hgnc:7603') check_resp_single_record(resp, myo5b, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search("MYO5B") + resp = hgnc.search('MYO5B') check_resp_single_record(resp, myo5b, MatchType.SYMBOL) # associated_with - resp = hgnc.search("refseq:NM_001080467") + resp = hgnc.search('refseq:NM_001080467') check_resp_single_record(resp, myo5b, MatchType.ASSOCIATED_WITH) def test_gstt1(check_resp_single_record, gstt1, hgnc): """Test that gstt1 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search("hgnc:4641") + resp = hgnc.search('hgnc:4641') check_resp_single_record(resp, gstt1, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search("GSTT1") + resp = hgnc.search('GSTT1') check_resp_single_record(resp, gstt1, MatchType.SYMBOL) # associated_with - resp = hgnc.search("omim:600436") + resp = hgnc.search('omim:600436') check_resp_single_record(resp, gstt1, MatchType.ASSOCIATED_WITH) def test_no_match(hgnc): """Test that a term normalizes to correct gene concept as a NO match.""" - resp = hgnc.search("A1BG - AS1") + resp = hgnc.search('A1BG - AS1') assert len(resp.records) == 0 - resp = hgnc.search("hnc:5") + resp = hgnc.search('hnc:5') assert len(resp.records) == 0 # Test empty query - resp = hgnc.search("") + resp = hgnc.search('') assert len(resp.records) == 0 # Do not search on label - resp = hgnc.search("A1BG antisense RNA 1") + resp = hgnc.search('A1BG antisense RNA 1') assert len(resp.records) == 0 def test_meta_info(hgnc): """Test that the meta field is correct.""" - resp = hgnc.search("HGNC:37133") - assert resp.source_meta_.data_license == "CC0" + resp = hgnc.search('HGNC:37133') + assert resp.source_meta_.data_license == 'CC0' assert ( - resp.source_meta_.data_license_url == "https://www.genenames.org/about/license/" + resp.source_meta_.data_license_url == 'https://www.genenames.org/about/license/' ) - assert datetime.strptime(resp.source_meta_.version, "%Y%m%d") + assert datetime.strptime(resp.source_meta_.version, '%Y%m%d') assert resp.source_meta_.data_url == { - "complete_set_archive": "ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json" + 'complete_set_archive': 'ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json' } assert resp.source_meta_.rdp_url is None assert resp.source_meta_.genome_assemblies == [] assert resp.source_meta_.data_license_attributes == { - "non_commercial": False, - "share_alike": False, - "attribution": False, + 'non_commercial': False, + 'share_alike': False, + 'attribution': False, } diff --git a/tests/unit/test_ncbi_source.py b/tests/unit/test_ncbi_source.py index d0083a43..2476a725 100644 --- a/tests/unit/test_ncbi_source.py +++ b/tests/unit/test_ncbi_source.py @@ -25,7 +25,7 @@ def check_ncbi_discontinued_gene(normalizer_response, concept_id, symbol, match_ assert resp.associated_with == [] -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def ncbi(database): """Build ncbi test fixture.""" @@ -33,7 +33,7 @@ class QueryGetter: def __init__(self): self.query_handler = QueryHandler(database) - def search(self, query_str, incl="ncbi"): + def search(self, query_str, incl='ncbi'): resp = self.query_handler.search(query_str, incl=incl) return resp.source_matches[SourceName.NCBI] @@ -41,22 +41,22 @@ def search(self, query_str, incl="ncbi"): return n -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def dpf1(): """Create gene fixture for DPF1.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "double PHD fingers 1", - "concept_id": "ncbigene:8193", - "symbol": "DPF1", - "aliases": ["BAF45b", "NEUD4", "neuro-d4", "SMARCG1"], - "xrefs": ["hgnc:20225", "ensembl:ENSG00000011332"], - "previous_symbols": [], - "associated_with": ["omim:601670"], - "symbol_status": None, - "location_annotations": [], - "strand": "-", - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'double PHD fingers 1', + 'concept_id': 'ncbigene:8193', + 'symbol': 'DPF1', + 'aliases': ['BAF45b', 'NEUD4', 'neuro-d4', 'SMARCG1'], + 'xrefs': ['hgnc:20225', 'ensembl:ENSG00000011332'], + 'previous_symbols': [], + 'associated_with': ['omim:601670'], + 'symbol_status': None, + 'location_annotations': [], + 'strand': '-', + 'locations': [ # { # "id": "ga4gh:CL.bzgLv8gt3KHK00OWTAEUNZcdgUjbHU8i", # "chr": "19", @@ -66,37 +66,37 @@ def dpf1(): # "type": "ChromosomeLocation" # }, { - "id": "ga4gh:SL.0bmpLh_dlBRrzfviiQY9Vg4iEH0XeR20", - "end": 38229695, - "start": 38211005, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", + 'id': 'ga4gh:SL.0bmpLh_dlBRrzfviiQY9Vg4iEH0XeR20', + 'end': 38229695, + 'start': 38211005, + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl', }, - "type": "SequenceLocation", + 'type': 'SequenceLocation', } ], - "gene_type": "protein-coding", + 'gene_type': 'protein-coding', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def pdp1_symbol(): """Create gene fixture for PDP1 (ncbigene:54704).""" params = { - "match_type": MatchType.NO_MATCH, - "label": "pyruvate dehydrogenase phosphatase catalytic subunit 1", - "concept_id": "ncbigene:54704", - "symbol": "PDP1", - "aliases": ["PDH", "PDP", "PDPC", "PPM2A", "PPM2C"], - "xrefs": ["hgnc:9279", "ensembl:ENSG00000164951"], - "previous_symbols": ["LOC157663", "PPM2C"], - "associated_with": ["omim:605993"], - "symbol_status": None, - "location_annotations": [], - "strand": "+", - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'pyruvate dehydrogenase phosphatase catalytic subunit 1', + 'concept_id': 'ncbigene:54704', + 'symbol': 'PDP1', + 'aliases': ['PDH', 'PDP', 'PDPC', 'PPM2A', 'PPM2C'], + 'xrefs': ['hgnc:9279', 'ensembl:ENSG00000164951'], + 'previous_symbols': ['LOC157663', 'PPM2C'], + 'associated_with': ['omim:605993'], + 'symbol_status': None, + 'location_annotations': [], + 'strand': '+', + 'locations': [ # { # "id": "ga4gh:CL.cJsZWKrEtzpFn5psdCtgofb6NaEDVPfB", # "chr": "8", @@ -106,37 +106,37 @@ def pdp1_symbol(): # "type": "ChromosomeLocation" # }, { - "id": "ga4gh:SL.-455M-S51D8nXPFoGH0dYNFVFAJxm5dG", - "end": 93926068, - "start": 93916922, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs", + 'id': 'ga4gh:SL.-455M-S51D8nXPFoGH0dYNFVFAJxm5dG', + 'end': 93926068, + 'start': 93916922, + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs', }, - "type": "SequenceLocation", + 'type': 'SequenceLocation', } ], - "gene_type": "protein-coding", + 'gene_type': 'protein-coding', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def pdp1_alias(): """Create gene fixture for PDP1 (ncbigene:403313).""" params = { - "match_type": MatchType.NO_MATCH, - "label": "phospholipid phosphatase 6", - "concept_id": "ncbigene:403313", - "symbol": "PLPP6", - "aliases": ["PDP1", "PSDP", "PPAPDC2", "bA6J24.6", "LPRP-B", "PA-PSP"], - "xrefs": ["hgnc:23682", "ensembl:ENSG00000205808"], - "previous_symbols": [], - "associated_with": ["omim:611666"], - "symbol_status": None, - "location_annotations": [], - "strand": "+", - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'phospholipid phosphatase 6', + 'concept_id': 'ncbigene:403313', + 'symbol': 'PLPP6', + 'aliases': ['PDP1', 'PSDP', 'PPAPDC2', 'bA6J24.6', 'LPRP-B', 'PA-PSP'], + 'xrefs': ['hgnc:23682', 'ensembl:ENSG00000205808'], + 'previous_symbols': [], + 'associated_with': ['omim:611666'], + 'symbol_status': None, + 'location_annotations': [], + 'strand': '+', + 'locations': [ # { # "id": "ga4gh:CL.7ivmMgKAqiFiRh1qsbA909w2kUcPabr_", # "chr": "9", @@ -146,38 +146,38 @@ def pdp1_alias(): # "type": "ChromosomeLocation" # }, { - "id": "ga4gh:SL.VI_0P0-ei90MDsLjAeUrDfeXBlZVJtJY", - "end": 4665258, - "start": 4662293, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", + 'id': 'ga4gh:SL.VI_0P0-ei90MDsLjAeUrDfeXBlZVJtJY', + 'end': 4665258, + 'start': 4662293, + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI', }, - "type": "SequenceLocation", + 'type': 'SequenceLocation', } ], - "gene_type": "protein-coding", + 'gene_type': 'protein-coding', } return Gene(**params) # X and Y chromosomes -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def spry3(): """Create gene fixture for SPRY3.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "sprouty RTK signaling antagonist 3", - "concept_id": "ncbigene:10251", - "symbol": "SPRY3", - "aliases": ["spry-3"], - "xrefs": ["hgnc:11271", "ensembl:ENSG00000168939"], - "previous_symbols": ["LOC170187", "LOC253479"], - "associated_with": ["omim:300531"], - "symbol_status": None, - "location_annotations": [], - "strand": "+", - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'sprouty RTK signaling antagonist 3', + 'concept_id': 'ncbigene:10251', + 'symbol': 'SPRY3', + 'aliases': ['spry-3'], + 'xrefs': ['hgnc:11271', 'ensembl:ENSG00000168939'], + 'previous_symbols': ['LOC170187', 'LOC253479'], + 'associated_with': ['omim:300531'], + 'symbol_status': None, + 'location_annotations': [], + 'strand': '+', + 'locations': [ # { # "id": "ga4gh:CL.r8Qv_b-B3SoguReqdunL3GCkt1RH-es1", # "chr": "Y", @@ -195,92 +195,92 @@ def spry3(): # "type": "ChromosomeLocation" # }, { - "id": "ga4gh:SL.2N5aguRIvBdGemRgABZFutmLTV925dsV", - "end": 155782459, - "start": 155612585, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + 'id': 'ga4gh:SL.2N5aguRIvBdGemRgABZFutmLTV925dsV', + 'end': 155782459, + 'start': 155612585, + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP', }, - "type": "SequenceLocation", + 'type': 'SequenceLocation', }, { - "id": "ga4gh:SL.U9E9WtQdzFc4elR3t1qw48nueHgfWFWL", - "end": 56968979, - "start": 56954315, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", + 'id': 'ga4gh:SL.U9E9WtQdzFc4elR3t1qw48nueHgfWFWL', + 'end': 56968979, + 'start': 56954315, + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5', }, - "type": "SequenceLocation", + 'type': 'SequenceLocation', }, ], - "gene_type": "protein-coding", + 'gene_type': 'protein-coding', } return Gene(**params) # chromosome but no map locations -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def adcp1(): """Create gene fixture for ADCP1.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "adenosine deaminase complexing protein 1", - "concept_id": "ncbigene:106", - "symbol": "ADCP1", - "aliases": [], - "xrefs": ["hgnc:229"], - "previous_symbols": [], - "associated_with": [], - "symbol_status": None, - "strand": None, - "location_annotations": ["6"], - "locations": [], - "gene_type": "unknown", + 'match_type': MatchType.NO_MATCH, + 'label': 'adenosine deaminase complexing protein 1', + 'concept_id': 'ncbigene:106', + 'symbol': 'ADCP1', + 'aliases': [], + 'xrefs': ['hgnc:229'], + 'previous_symbols': [], + 'associated_with': [], + 'symbol_status': None, + 'strand': None, + 'location_annotations': ['6'], + 'locations': [], + 'gene_type': 'unknown', } return Gene(**params) # no chromosome or map locations -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def afa(): """Create gene fixture for AFA.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "ankyloblepharon filiforme adnatum", - "concept_id": "ncbigene:170", - "symbol": "AFA", - "aliases": [], - "xrefs": [], - "previous_symbols": [], - "associated_with": ["omim:106250"], - "symbol_status": None, - "strand": None, - "location_annotations": [], - "locations": [], - "gene_type": "unknown", + 'match_type': MatchType.NO_MATCH, + 'label': 'ankyloblepharon filiforme adnatum', + 'concept_id': 'ncbigene:170', + 'symbol': 'AFA', + 'aliases': [], + 'xrefs': [], + 'previous_symbols': [], + 'associated_with': ['omim:106250'], + 'symbol_status': None, + 'strand': None, + 'location_annotations': [], + 'locations': [], + 'gene_type': 'unknown', } return Gene(**params) # Contains non cytogenic locations (i.e. "map from Rosati....") -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def znf84(): """Create gene fixture for ZNF84.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "zinc finger protein 84", - "concept_id": "ncbigene:7637", - "symbol": "ZNF84", - "aliases": ["HPF2"], - "xrefs": ["hgnc:13159", "ensembl:ENSG00000198040"], - "previous_symbols": ["LOC100287429"], - "associated_with": ["omim:618554"], - "symbol_status": None, - "location_annotations": ["map from Rosati ref via FISH [AFS]"], - "strand": "+", - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'zinc finger protein 84', + 'concept_id': 'ncbigene:7637', + 'symbol': 'ZNF84', + 'aliases': ['HPF2'], + 'xrefs': ['hgnc:13159', 'ensembl:ENSG00000198040'], + 'previous_symbols': ['LOC100287429'], + 'associated_with': ['omim:618554'], + 'symbol_status': None, + 'location_annotations': ['map from Rosati ref via FISH [AFS]'], + 'strand': '+', + 'locations': [ # { # "id": "ga4gh:CL.6YvQEs6MuHuNvt0Vlv8r4hMKIOK5Ktq4", # "chr": "12", @@ -290,38 +290,38 @@ def znf84(): # "type": "ChromosomeLocation" # }, { - "id": "ga4gh:SL.IRsls9vud2-CiA7Jq4L3ry2VVK7LoNud", - "end": 133063299, - "start": 133037508, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl", + 'id': 'ga4gh:SL.IRsls9vud2-CiA7Jq4L3ry2VVK7LoNud', + 'end': 133063299, + 'start': 133037508, + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl', }, - "type": "SequenceLocation", + 'type': 'SequenceLocation', } ], - "gene_type": "protein-coding", + 'gene_type': 'protein-coding', } return Gene(**params) # No arm or sub band -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def slc25a6(): """Create gene fixture for SLC25A6.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "solute carrier family 25 member 6", - "concept_id": "ncbigene:293", - "symbol": "SLC25A6", - "aliases": ["AAC3", "ANT", "ANT 2", "ANT 3", "ANT3", "ANT3Y"], - "xrefs": ["hgnc:10992", "ensembl:ENSG00000169100", "ensembl:ENSG00000292334"], - "previous_symbols": ["ANT3Y"], - "associated_with": ["omim:300151", "omim:403000"], - "symbol_status": None, - "location_annotations": [], - "strand": "-", - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'solute carrier family 25 member 6', + 'concept_id': 'ncbigene:293', + 'symbol': 'SLC25A6', + 'aliases': ['AAC3', 'ANT', 'ANT 2', 'ANT 3', 'ANT3', 'ANT3Y'], + 'xrefs': ['hgnc:10992', 'ensembl:ENSG00000169100', 'ensembl:ENSG00000292334'], + 'previous_symbols': ['ANT3Y'], + 'associated_with': ['omim:300151', 'omim:403000'], + 'symbol_status': None, + 'location_annotations': [], + 'strand': '-', + 'locations': [ # { # "id": "ga4gh:CL.Z5pOXNI2Bt8L2NpypNYsbbtgC9L1uyl4", # "type": "ChromosomeLocation", @@ -339,48 +339,48 @@ def slc25a6(): # "end": "p11.2" # }, { - "id": "ga4gh:SL.dvD-ZopQGZkVWx4Z-vFpP9ateicPHgQ6", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + 'id': 'ga4gh:SL.dvD-ZopQGZkVWx4Z-vFpP9ateicPHgQ6', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP', }, - "start": 1386151, - "end": 1392113, + 'start': 1386151, + 'end': 1392113, }, { - "id": "ga4gh:SL.bv3LobZZ-sERq5cIthyS4w_tmSwV2QSg", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", + 'id': 'ga4gh:SL.bv3LobZZ-sERq5cIthyS4w_tmSwV2QSg', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5', }, - "start": 1386151, - "end": 1392113, + 'start': 1386151, + 'end': 1392113, }, ], - "gene_type": "protein-coding", + 'gene_type': 'protein-coding', } return Gene(**params) # Contains arm but no sub band -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def loc106783576(): """Create gene fixture for .""" params = { - "match_type": MatchType.NO_MATCH, - "label": "nonconserved acetylation island sequence 68 enhancer", - "concept_id": "ncbigene:106783576", - "symbol": "LOC106783576", - "aliases": [], - "xrefs": [], - "previous_symbols": [], - "associated_with": [], - "symbol_status": None, - "location_annotations": [], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'nonconserved acetylation island sequence 68 enhancer', + 'concept_id': 'ncbigene:106783576', + 'symbol': 'LOC106783576', + 'aliases': [], + 'xrefs': [], + 'previous_symbols': [], + 'associated_with': [], + 'symbol_status': None, + 'location_annotations': [], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.YYGQrLtmKwKgp38asAkHT8AydAidnui8", # "chr": "10", @@ -390,28 +390,28 @@ def loc106783576(): # "type": "ChromosomeLocation" # } ], - "gene_type": "biological-region", + 'gene_type': 'biological-region', } return Gene(**params) # Testing for cen -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def glc1b(): """Create gene fixture for GLC1B.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "glaucoma 1, open angle, B (adult-onset)", - "concept_id": "ncbigene:2722", - "symbol": "GLC1B", - "aliases": [], - "xrefs": [], - "previous_symbols": [], - "associated_with": ["omim:606689"], - "symbol_status": None, - "location_annotations": [], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'glaucoma 1, open angle, B (adult-onset)', + 'concept_id': 'ncbigene:2722', + 'symbol': 'GLC1B', + 'aliases': [], + 'xrefs': [], + 'previous_symbols': [], + 'associated_with': ['omim:606689'], + 'symbol_status': None, + 'location_annotations': [], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.8D0hLCktRxyPrx4Etgabq10vEq6TtU43", # "chr": "2", @@ -421,28 +421,28 @@ def glc1b(): # "type": "ChromosomeLocation" # } ], - "gene_type": "unknown", + 'gene_type': 'unknown', } return Gene(**params) # Testing for ter ranges -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def hdpa(): """Create gene fixture for HDPA.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "Hodgkin disease, susceptibility, pseudoautosomal", - "concept_id": "ncbigene:50829", - "symbol": "HDPA", - "aliases": [], - "xrefs": [], - "previous_symbols": [], - "associated_with": ["omim:300221"], - "symbol_status": None, - "location_annotations": [], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'Hodgkin disease, susceptibility, pseudoautosomal', + 'concept_id': 'ncbigene:50829', + 'symbol': 'HDPA', + 'aliases': [], + 'xrefs': [], + 'previous_symbols': [], + 'associated_with': ['omim:300221'], + 'symbol_status': None, + 'location_annotations': [], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.kl9HXvnUCE6Z1ktXibt83NBdXvxnT2RA", # "chr": "X", @@ -452,29 +452,29 @@ def hdpa(): # "type": "ChromosomeLocation" # } ], - "gene_type": "unknown", + 'gene_type': 'unknown', } return Gene(**params) # Testing for annotation -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def prkrap1(): """Create gene fixture for PRKRAP1.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "protein activator of interferon induced protein kinase " - "EIF2AK2 pseudogene 1", - "concept_id": "ncbigene:731716", - "symbol": "PRKRAP1", - "aliases": [], - "xrefs": ["hgnc:33447"], - "previous_symbols": ["LOC100289695"], - "associated_with": [], - "symbol_status": None, - "location_annotations": ["alternate reference locus"], - "strand": "+", - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'protein activator of interferon induced protein kinase ' + 'EIF2AK2 pseudogene 1', + 'concept_id': 'ncbigene:731716', + 'symbol': 'PRKRAP1', + 'aliases': [], + 'xrefs': ['hgnc:33447'], + 'previous_symbols': ['LOC100289695'], + 'associated_with': [], + 'symbol_status': None, + 'location_annotations': ['alternate reference locus'], + 'strand': '+', + 'locations': [ # { # "id": "ga4gh:CL.FYt7UkCHZVLpkYe7zhNdMk1K6lxl_k7I", # "chr": "6", @@ -484,48 +484,48 @@ def prkrap1(): # "type": "ChromosomeLocation" # }, { - "id": "ga4gh:SL.LwWy5JYncZVnOM9hWiLWW_z0n2eY-peb", - "end": 3941874, - "start": 3940269, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.MjujHSAsgNWRTX4w3ysM7b5OVhZpdXu1", + 'id': 'ga4gh:SL.LwWy5JYncZVnOM9hWiLWW_z0n2eY-peb', + 'end': 3941874, + 'start': 3940269, + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.MjujHSAsgNWRTX4w3ysM7b5OVhZpdXu1', }, - "type": "SequenceLocation", + 'type': 'SequenceLocation', }, { - "id": "ga4gh:SL.q36ql_fX4HrZy_G2EXX_SGWl-7X5Bq6c", - "end": 3932085, - "start": 3930480, - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.Q8IworEhpLeXwpz1CHM7C3luysh-ltx-", + 'id': 'ga4gh:SL.q36ql_fX4HrZy_G2EXX_SGWl-7X5Bq6c', + 'end': 3932085, + 'start': 3930480, + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.Q8IworEhpLeXwpz1CHM7C3luysh-ltx-', }, - "type": "SequenceLocation", + 'type': 'SequenceLocation', }, ], - "gene_type": "pseudo", + 'gene_type': 'pseudo', } return Gene(**params) # start > end -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def mhb(): """Create gene fixture for MHB.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "myopathy, hyaline body, autosomal recessive", - "concept_id": "ncbigene:619511", - "symbol": "MHB", - "aliases": [], - "xrefs": [], - "previous_symbols": [], - "associated_with": ["omim:255160"], - "symbol_status": None, - "location_annotations": [], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'myopathy, hyaline body, autosomal recessive', + 'concept_id': 'ncbigene:619511', + 'symbol': 'MHB', + 'aliases': [], + 'xrefs': [], + 'previous_symbols': [], + 'associated_with': ['omim:255160'], + 'symbol_status': None, + 'location_annotations': [], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.6vlmdqdXYxSAGsJI9no7kLN5iLKpvr5X", # "chr": "3", @@ -535,28 +535,28 @@ def mhb(): # "type": "ChromosomeLocation" # } ], - "gene_type": "unknown", + 'gene_type': 'unknown', } return Gene(**params) # Different arms -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def spg37(): """Create gene fixture for SPG37.""" params = { - "match_type": MatchType.NO_MATCH, - "label": "spastic paraplegia 37 (autosomal dominant)", - "concept_id": "ncbigene:100049159", - "symbol": "SPG37", - "aliases": [], - "xrefs": [], - "previous_symbols": [], - "associated_with": ["omim:611945"], - "symbol_status": None, - "location_annotations": [], - "strand": None, - "locations": [ + 'match_type': MatchType.NO_MATCH, + 'label': 'spastic paraplegia 37 (autosomal dominant)', + 'concept_id': 'ncbigene:100049159', + 'symbol': 'SPG37', + 'aliases': [], + 'xrefs': [], + 'previous_symbols': [], + 'associated_with': ['omim:611945'], + 'symbol_status': None, + 'location_annotations': [], + 'strand': None, + 'locations': [ # { # "id": "ga4gh:CL.XWbwTwmJ95KD-aCuXfJcD8cNIvXbiXRh", # "chr": "8", @@ -566,349 +566,349 @@ def spg37(): # "type": "ChromosomeLocation" # } ], - "gene_type": "unknown", + 'gene_type': 'unknown', } return Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def source_urls(): """Provide source data URLs fixture.""" return { - "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz", - "history_file": "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz", - "assembly_file": "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/", + 'info_file': 'ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz', + 'history_file': 'ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz', + 'assembly_file': 'ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/', } def test_dpf1(check_resp_single_record, ncbi, dpf1): """Test that DPF1 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search("ncbigene:8193") + resp = ncbi.search('ncbigene:8193') check_resp_single_record(resp, dpf1, MatchType.CONCEPT_ID) - resp = ncbi.search("ncbIgene:8193") + resp = ncbi.search('ncbIgene:8193') check_resp_single_record(resp, dpf1, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search("DPF1") + resp = ncbi.search('DPF1') check_resp_single_record(resp, dpf1, MatchType.SYMBOL) - resp = ncbi.search("DpF1") + resp = ncbi.search('DpF1') check_resp_single_record(resp, dpf1, MatchType.SYMBOL) # Alias - resp = ncbi.search("BAF45b") + resp = ncbi.search('BAF45b') check_resp_single_record(resp, dpf1, MatchType.ALIAS) - resp = ncbi.search("NEUD4") + resp = ncbi.search('NEUD4') check_resp_single_record(resp, dpf1, MatchType.ALIAS) - resp = ncbi.search("neuro-d4") + resp = ncbi.search('neuro-d4') check_resp_single_record(resp, dpf1, MatchType.ALIAS) # associated_with - resp = ncbi.search("omim:601670") + resp = ncbi.search('omim:601670') check_resp_single_record(resp, dpf1, MatchType.ASSOCIATED_WITH) # No Match - resp = ncbi.search("DPF 1") + resp = ncbi.search('DPF 1') assert len(resp.records) == 0 - resp = ncbi.search("DPG1") + resp = ncbi.search('DPG1') assert len(resp.records) == 0 def test_pdp1(compare_records, check_resp_single_record, ncbi, pdp1_symbol, pdp1_alias): """Test that PDP1 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search("ncbigene:54704") + resp = ncbi.search('ncbigene:54704') check_resp_single_record(resp, pdp1_symbol, MatchType.CONCEPT_ID) - resp = ncbi.search("NCBIGENE:54704") + resp = ncbi.search('NCBIGENE:54704') check_resp_single_record(resp, pdp1_symbol, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search("PDP1") + resp = ncbi.search('PDP1') assert len(resp.records) == 2 # first record check (should always be symbol) compare_records(resp.records[0], pdp1_symbol, MatchType.SYMBOL) compare_records(resp.records[1], pdp1_alias, MatchType.ALIAS) - resp = ncbi.search("pdp1") + resp = ncbi.search('pdp1') assert len(resp.records) == 2 # first record check (should always be symbol) compare_records(resp.records[0], pdp1_symbol, MatchType.SYMBOL) compare_records(resp.records[1], pdp1_alias, MatchType.ALIAS) # Previous Symbol - resp = ncbi.search("LOC157663") + resp = ncbi.search('LOC157663') check_resp_single_record(resp, pdp1_symbol, MatchType.PREV_SYMBOL) - resp = ncbi.search("PPM2C") + resp = ncbi.search('PPM2C') check_resp_single_record(resp, pdp1_symbol, MatchType.PREV_SYMBOL) - resp = ncbi.search("loc157663") + resp = ncbi.search('loc157663') check_resp_single_record(resp, pdp1_symbol, MatchType.PREV_SYMBOL) # Alias - resp = ncbi.search("pdh") + resp = ncbi.search('pdh') check_resp_single_record(resp, pdp1_symbol, MatchType.ALIAS) - resp = ncbi.search("PDP") + resp = ncbi.search('PDP') check_resp_single_record(resp, pdp1_symbol, MatchType.ALIAS) - resp = ncbi.search("PDPC") + resp = ncbi.search('PDPC') check_resp_single_record(resp, pdp1_symbol, MatchType.ALIAS) - resp = ncbi.search("PPM2A") + resp = ncbi.search('PPM2A') check_resp_single_record(resp, pdp1_symbol, MatchType.ALIAS) def test_spry3(check_resp_single_record, ncbi, spry3): """Test that SPRY3 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search("NCBIgene:10251") + resp = ncbi.search('NCBIgene:10251') check_resp_single_record(resp, spry3, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search("sprY3") + resp = ncbi.search('sprY3') check_resp_single_record(resp, spry3, MatchType.SYMBOL) # Alias - resp = ncbi.search("SPRY-3") + resp = ncbi.search('SPRY-3') check_resp_single_record(resp, spry3, MatchType.ALIAS) def test_adcp1(check_resp_single_record, ncbi, adcp1): """Test that ADCP1 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search("NCBIgene:106") + resp = ncbi.search('NCBIgene:106') check_resp_single_record(resp, adcp1, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search("ADCP1") + resp = ncbi.search('ADCP1') check_resp_single_record(resp, adcp1, MatchType.SYMBOL) def test_afa(check_resp_single_record, ncbi, afa): """Test that AFA normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search("NCBIgene:170") + resp = ncbi.search('NCBIgene:170') check_resp_single_record(resp, afa, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search("AFA") + resp = ncbi.search('AFA') check_resp_single_record(resp, afa, MatchType.SYMBOL) def test_znf84(check_resp_single_record, ncbi, znf84): """Test that ZNF84 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search("NCBIgene:7637") + resp = ncbi.search('NCBIgene:7637') check_resp_single_record(resp, znf84, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search("ZNF84") + resp = ncbi.search('ZNF84') check_resp_single_record(resp, znf84, MatchType.SYMBOL) def test_slc25a6(check_resp_single_record, ncbi, slc25a6): """Test that SLC25A6 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search("NCBIgene:293") + resp = ncbi.search('NCBIgene:293') check_resp_single_record(resp, slc25a6, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search("SLC25A6") + resp = ncbi.search('SLC25A6') check_resp_single_record(resp, slc25a6, MatchType.SYMBOL) def test_loc106783576(check_resp_single_record, ncbi, loc106783576): """Test that LOC106783576 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search("NCBIgene:106783576") + resp = ncbi.search('NCBIgene:106783576') check_resp_single_record(resp, loc106783576, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search("LOC106783576") + resp = ncbi.search('LOC106783576') check_resp_single_record(resp, loc106783576, MatchType.SYMBOL) def test_oms(ncbi): """Test that OMS matches to correct gene concept.""" - resp = ncbi.search("NCBIgene:619538") + resp = ncbi.search('NCBIgene:619538') assert len(resp.records) == 0 def test_glc1b(check_resp_single_record, ncbi, glc1b): """Test that GLC1B normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search("NCBIgene:2722") + resp = ncbi.search('NCBIgene:2722') check_resp_single_record(resp, glc1b, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search("GLC1B") + resp = ncbi.search('GLC1B') check_resp_single_record(resp, glc1b, MatchType.SYMBOL) # associated_with - resp = ncbi.search("omim:606689") + resp = ncbi.search('omim:606689') check_resp_single_record(resp, glc1b, MatchType.ASSOCIATED_WITH) def test_hdpa(check_resp_single_record, ncbi, hdpa): """Test that HDPA normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search("NCBIgene:50829") + resp = ncbi.search('NCBIgene:50829') check_resp_single_record(resp, hdpa, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search("HDPA") + resp = ncbi.search('HDPA') check_resp_single_record(resp, hdpa, MatchType.SYMBOL) def test_prkrap1(check_resp_single_record, ncbi, prkrap1): """Test that PRKRAP1 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search("NCBIgene:731716") + resp = ncbi.search('NCBIgene:731716') check_resp_single_record(resp, prkrap1, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search("PRKRAP1") + resp = ncbi.search('PRKRAP1') check_resp_single_record(resp, prkrap1, MatchType.SYMBOL) # xref - resp = ncbi.search("hgnc:33447") + resp = ncbi.search('hgnc:33447') check_resp_single_record(resp, prkrap1, MatchType.XREF) def test_mhb(check_resp_single_record, ncbi, mhb): """Test that MHB normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search("NCBIgene:619511") + resp = ncbi.search('NCBIgene:619511') check_resp_single_record(resp, mhb, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search("MHB") + resp = ncbi.search('MHB') check_resp_single_record(resp, mhb, MatchType.SYMBOL) # associated_with - resp = ncbi.search("OMIM:255160") + resp = ncbi.search('OMIM:255160') check_resp_single_record(resp, mhb, MatchType.ASSOCIATED_WITH) def test_spg37(check_resp_single_record, ncbi, spg37): """Test that SPG37 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search("NCBIgene:100049159") + resp = ncbi.search('NCBIgene:100049159') check_resp_single_record(resp, spg37, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search("SPG37") + resp = ncbi.search('SPG37') check_resp_single_record(resp, spg37, MatchType.SYMBOL) # associated_with - resp = ncbi.search("omim:611945") + resp = ncbi.search('omim:611945') check_resp_single_record(resp, spg37, MatchType.ASSOCIATED_WITH) def test_discontinued_genes(ncbi): """Test searches for discontinued genes.""" # HOTS - resp = ncbi.search("ncbigene:103344718") + resp = ncbi.search('ncbigene:103344718') check_ncbi_discontinued_gene( - resp, "ncbigene:103344718", "HOTS", MatchType.CONCEPT_ID + resp, 'ncbigene:103344718', 'HOTS', MatchType.CONCEPT_ID ) - resp = ncbi.search("HOTS") + resp = ncbi.search('HOTS') check_ncbi_discontinued_gene( - resp, "ncbigene:103344718", "HOTS", MatchType.CONCEPT_ID + resp, 'ncbigene:103344718', 'HOTS', MatchType.CONCEPT_ID ) - resp = ncbi.search("hots") + resp = ncbi.search('hots') check_ncbi_discontinued_gene( - resp, "ncbigene:103344718", "HOTS", MatchType.CONCEPT_ID + resp, 'ncbigene:103344718', 'HOTS', MatchType.CONCEPT_ID ) # AASTH23 - resp = ncbi.search("ncbigene:544580") + resp = ncbi.search('ncbigene:544580') check_ncbi_discontinued_gene( - resp, "ncbigene:544580", "AASTH23", MatchType.CONCEPT_ID + resp, 'ncbigene:544580', 'AASTH23', MatchType.CONCEPT_ID ) - resp = ncbi.search("AASTH23") + resp = ncbi.search('AASTH23') check_ncbi_discontinued_gene( - resp, "ncbigene:544580", "AASTH23", MatchType.CONCEPT_ID + resp, 'ncbigene:544580', 'AASTH23', MatchType.CONCEPT_ID ) - resp = ncbi.search("aastH23") + resp = ncbi.search('aastH23') check_ncbi_discontinued_gene( - resp, "ncbigene:544580", "AASTH23", MatchType.CONCEPT_ID + resp, 'ncbigene:544580', 'AASTH23', MatchType.CONCEPT_ID ) def test_no_match(ncbi, source_urls): """Test that nonexistent query doesn"t normalize to a match.""" - response = ncbi.search("cisplatin") + response = ncbi.search('cisplatin') assert len(response.records) == 0 # double-check that meta still populates - assert response.source_meta_.data_license == "custom" + assert response.source_meta_.data_license == 'custom' assert ( response.source_meta_.data_license_url - == "https://www.ncbi.nlm.nih.gov/home/about/policies/" + == 'https://www.ncbi.nlm.nih.gov/home/about/policies/' ) - assert datetime.strptime(response.source_meta_.version, "%Y%m%d") + assert datetime.strptime(response.source_meta_.version, '%Y%m%d') assert response.source_meta_.data_url == source_urls - assert response.source_meta_.rdp_url == "https://reusabledata.org/ncbi-gene.html" - assert not response.source_meta_.data_license_attributes["non_commercial"] - assert not response.source_meta_.data_license_attributes["share_alike"] - assert not response.source_meta_.data_license_attributes["attribution"] + assert response.source_meta_.rdp_url == 'https://reusabledata.org/ncbi-gene.html' + assert not response.source_meta_.data_license_attributes['non_commercial'] + assert not response.source_meta_.data_license_attributes['share_alike'] + assert not response.source_meta_.data_license_attributes['attribution'] # check blank - response = ncbi.search("") + response = ncbi.search('') assert len(response.records) == 0 # check some strange characters - response = ncbi.search("----") + response = ncbi.search('----') assert len(response.records) == 0 response = ncbi.search("''") assert len(response.records) == 0 - response = ncbi.search("~~~") + response = ncbi.search('~~~') assert len(response.records) == 0 - response = ncbi.search(" ") + response = ncbi.search(' ') assert len(response.records) == 0 # Incorrect Concept IDs - response = ncbi.search("ncblgene:8193") + response = ncbi.search('ncblgene:8193') assert len(response.records) == 0 - response = ncbi.search("NCBIGENE54704") + response = ncbi.search('NCBIGENE54704') assert len(response.records) == 0 - response = ncbi.search("54704") + response = ncbi.search('54704') assert len(response.records) == 0 - response = ncbi.search("ncbigene;54704") + response = ncbi.search('ncbigene;54704') assert len(response.records) == 0 def test_meta(ncbi, source_urls): """Test NCBI source metadata.""" - response = ncbi.search("PDP1") - assert response.source_meta_.data_license == "custom" + response = ncbi.search('PDP1') + assert response.source_meta_.data_license == 'custom' assert ( response.source_meta_.data_license_url - == "https://www.ncbi.nlm.nih.gov/home/about/policies/" + == 'https://www.ncbi.nlm.nih.gov/home/about/policies/' ) - assert datetime.strptime(response.source_meta_.version, "%Y%m%d") + assert datetime.strptime(response.source_meta_.version, '%Y%m%d') assert response.source_meta_.data_url == source_urls - assert response.source_meta_.rdp_url == "https://reusabledata.org/ncbi-gene.html" - assert response.source_meta_.genome_assemblies == ["GRCh38.p14"] + assert response.source_meta_.rdp_url == 'https://reusabledata.org/ncbi-gene.html' + assert response.source_meta_.genome_assemblies == ['GRCh38.p14'] assert response.source_meta_.data_license_attributes == { - "non_commercial": False, - "share_alike": False, - "attribution": False, + 'non_commercial': False, + 'share_alike': False, + 'attribution': False, } diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index f767ced1..31da2878 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -6,7 +6,7 @@ from gene.schemas import BaseGene, MatchType, SourceName -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def query_handler(database): """Build query_handler test fixture.""" @@ -14,7 +14,7 @@ class QueryGetter: def __init__(self): self.query_handler = QueryHandler(database) - def search(self, query_str, incl="", excl=""): + def search(self, query_str, incl='', excl=''): return self.query_handler.search(query_str=query_str, incl=incl, excl=excl) def normalize(self, query_str): @@ -26,79 +26,79 @@ def normalize_unmerged(self, query_str): return QueryGetter() -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def normalized_ache(): """Return normalized core Gene object for ACHE.""" params = { - "type": "Gene", - "id": "normalize.gene.hgnc:108", - "label": "ACHE", - "mappings": [ + 'type': 'Gene', + 'id': 'normalize.gene.hgnc:108', + 'label': 'ACHE', + 'mappings': [ { - "coding": {"code": "ENSG00000087085", "system": "ensembl"}, - "relation": "relatedMatch", + 'coding': {'code': 'ENSG00000087085', 'system': 'ensembl'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "43", "system": "ncbigene"}, - "relation": "relatedMatch", + 'coding': {'code': '43', 'system': 'ncbigene'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "OTTHUMG00000157033", "system": "vega"}, - "relation": "relatedMatch", + 'coding': {'code': 'OTTHUMG00000157033', 'system': 'vega'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "uc003uxi.4", "system": "ucsc"}, - "relation": "relatedMatch", + 'coding': {'code': 'uc003uxi.4', 'system': 'ucsc'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "CCDS5710", "system": "ccds"}, - "relation": "relatedMatch", + 'coding': {'code': 'CCDS5710', 'system': 'ccds'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "CCDS64736", "system": "ccds"}, - "relation": "relatedMatch", + 'coding': {'code': 'CCDS64736', 'system': 'ccds'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "CCDS5709", "system": "ccds"}, - "relation": "relatedMatch", + 'coding': {'code': 'CCDS5709', 'system': 'ccds'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "P22303", "system": "uniprot"}, - "relation": "relatedMatch", + 'coding': {'code': 'P22303', 'system': 'uniprot'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "1380483", "system": "pubmed"}, - "relation": "relatedMatch", + 'coding': {'code': '1380483', 'system': 'pubmed'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "100740", "system": "omim"}, - "relation": "relatedMatch", + 'coding': {'code': '100740', 'system': 'omim'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "S09.979", "system": "merops"}, - "relation": "relatedMatch", + 'coding': {'code': 'S09.979', 'system': 'merops'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "2465", "system": "iuphar"}, - "relation": "relatedMatch", + 'coding': {'code': '2465', 'system': 'iuphar'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "NM_015831", "system": "refseq"}, - "relation": "relatedMatch", + 'coding': {'code': 'NM_015831', 'system': 'refseq'}, + 'relation': 'relatedMatch', }, ], - "aliases": ["3.1.1.7", "YT", "N-ACHE", "ARACHE", "ACEE"], - "extensions": [ - {"name": "previous_symbols", "value": ["ACEE", "YT"], "type": "Extension"}, + 'aliases': ['3.1.1.7', 'YT', 'N-ACHE', 'ARACHE', 'ACEE'], + 'extensions': [ + {'name': 'previous_symbols', 'value': ['ACEE', 'YT'], 'type': 'Extension'}, { - "name": "approved_name", - "value": "acetylcholinesterase (Cartwright blood group)", - "type": "Extension", + 'name': 'approved_name', + 'value': 'acetylcholinesterase (Cartwright blood group)', + 'type': 'Extension', }, - {"name": "symbol_status", "value": "approved", "type": "Extension"}, + {'name': 'symbol_status', 'value': 'approved', 'type': 'Extension'}, { - "name": "ncbi_locations", - "value": [ + 'name': 'ncbi_locations', + 'value': [ # { # "id": "ga4gh:CL.JSw-08GkF-7M-OQR-33MLLKQHSi7QJb5", # "type": "ChromosomeLocation", @@ -108,17 +108,17 @@ def normalized_ache(): # "start": "q22.1" # }, { - "id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + 'id': 'ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', }, - "start": 100889993, - "end": 100896994, + 'start': 100889993, + 'end': 100896994, } ], - "type": "Extension", + 'type': 'Extension', }, # { # "name": "hgnc_locations", @@ -135,117 +135,117 @@ def normalized_ache(): # "type": "Extension" # }, { - "name": "ensembl_locations", - "value": [ + 'name': 'ensembl_locations', + 'value': [ { - "id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + 'id': 'ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', }, - "start": 100889993, - "end": 100896974, + 'start': 100889993, + 'end': 100896974, } ], - "type": "Extension", + 'type': 'Extension', }, - {"name": "ncbi_gene_type", "type": "Extension", "value": "protein-coding"}, + {'name': 'ncbi_gene_type', 'type': 'Extension', 'value': 'protein-coding'}, { - "name": "hgnc_locus_type", - "type": "Extension", - "value": "gene with protein product", + 'name': 'hgnc_locus_type', + 'type': 'Extension', + 'value': 'gene with protein product', }, - {"name": "ensembl_biotype", "type": "Extension", "value": "protein_coding"}, - {"name": "strand", "type": "Extension", "value": "-"}, + {'name': 'ensembl_biotype', 'type': 'Extension', 'value': 'protein_coding'}, + {'name': 'strand', 'type': 'Extension', 'value': '-'}, ], } return core_models.Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def normalized_braf(): """Return normalized core Gene object for BRAF.""" params = { - "type": "Gene", - "id": "normalize.gene.hgnc:1097", - "label": "BRAF", - "mappings": [ + 'type': 'Gene', + 'id': 'normalize.gene.hgnc:1097', + 'label': 'BRAF', + 'mappings': [ { - "coding": {"code": "673", "system": "ncbigene"}, - "relation": "relatedMatch", + 'coding': {'code': '673', 'system': 'ncbigene'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "ENSG00000157764", "system": "ensembl"}, - "relation": "relatedMatch", + 'coding': {'code': 'ENSG00000157764', 'system': 'ensembl'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "CCDS5863", "system": "ccds"}, - "relation": "relatedMatch", + 'coding': {'code': 'CCDS5863', 'system': 'ccds'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "1943", "system": "iuphar"}, - "relation": "relatedMatch", + 'coding': {'code': '1943', 'system': 'iuphar'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "119066", "system": "orphanet"}, - "relation": "relatedMatch", + 'coding': {'code': '119066', 'system': 'orphanet'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "BRAF", "system": "cosmic"}, - "relation": "relatedMatch", + 'coding': {'code': 'BRAF', 'system': 'cosmic'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "2284096", "system": "pubmed"}, - "relation": "relatedMatch", + 'coding': {'code': '2284096', 'system': 'pubmed'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "uc003vwc.5", "system": "ucsc"}, - "relation": "relatedMatch", + 'coding': {'code': 'uc003vwc.5', 'system': 'ucsc'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "164757", "system": "omim"}, - "relation": "relatedMatch", + 'coding': {'code': '164757', 'system': 'omim'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "NM_004333", "system": "refseq"}, - "relation": "relatedMatch", + 'coding': {'code': 'NM_004333', 'system': 'refseq'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "CCDS87555", "system": "ccds"}, - "relation": "relatedMatch", + 'coding': {'code': 'CCDS87555', 'system': 'ccds'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "P15056", "system": "uniprot"}, - "relation": "relatedMatch", + 'coding': {'code': 'P15056', 'system': 'uniprot'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "M95712", "system": "ena.embl"}, - "relation": "relatedMatch", + 'coding': {'code': 'M95712', 'system': 'ena.embl'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "OTTHUMG00000157457", "system": "vega"}, - "relation": "relatedMatch", + 'coding': {'code': 'OTTHUMG00000157457', 'system': 'vega'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "1565476", "system": "pubmed"}, - "relation": "relatedMatch", + 'coding': {'code': '1565476', 'system': 'pubmed'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "CCDS94219", "system": "ccds"}, - "relation": "relatedMatch", + 'coding': {'code': 'CCDS94219', 'system': 'ccds'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "CCDS94218", "system": "ccds"}, - "relation": "relatedMatch", + 'coding': {'code': 'CCDS94218', 'system': 'ccds'}, + 'relation': 'relatedMatch', }, ], - "aliases": ["BRAF1", "BRAF-1", "RAFB1", "NS7", "B-RAF1", "B-raf"], - "extensions": [ + 'aliases': ['BRAF1', 'BRAF-1', 'RAFB1', 'NS7', 'B-RAF1', 'B-raf'], + 'extensions': [ { - "name": "approved_name", - "value": "B-Raf proto-oncogene, serine/threonine kinase", - "type": "Extension", + 'name': 'approved_name', + 'value': 'B-Raf proto-oncogene, serine/threonine kinase', + 'type': 'Extension', }, # { # "name": "hgnc_locations", @@ -262,24 +262,24 @@ def normalized_braf(): # "type": "Extension" # }, { - "name": "ensembl_locations", - "value": [ + 'name': 'ensembl_locations', + 'value': [ { - "id": "ga4gh:SL.WJ0hsPzXuK54mQyVysTqUNV5jaCATnRf", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + 'id': 'ga4gh:SL.WJ0hsPzXuK54mQyVysTqUNV5jaCATnRf', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', }, - "start": 140719326, - "end": 140924929, + 'start': 140719326, + 'end': 140924929, } ], - "type": "Extension", + 'type': 'Extension', }, { - "name": "ncbi_locations", - "value": [ + 'name': 'ncbi_locations', + 'value': [ # { # "id": "ga4gh:CL.ZZZYpOwuW1BLLJXc_Dm4eVZ5E0smVYCc", # "type": "ChromosomeLocation", @@ -289,124 +289,124 @@ def normalized_braf(): # "end": "q34" # }, { - "id": "ga4gh:SL.uNBZoxhjhohl24VlIut-JxPJAGfJ7EQE", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + 'id': 'ga4gh:SL.uNBZoxhjhohl24VlIut-JxPJAGfJ7EQE', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', }, - "start": 140713327, - "end": 140924929, + 'start': 140713327, + 'end': 140924929, } ], - "type": "Extension", + 'type': 'Extension', }, - {"name": "ncbi_gene_type", "type": "Extension", "value": "protein-coding"}, + {'name': 'ncbi_gene_type', 'type': 'Extension', 'value': 'protein-coding'}, { - "name": "hgnc_locus_type", - "type": "Extension", - "value": "gene with protein product", + 'name': 'hgnc_locus_type', + 'type': 'Extension', + 'value': 'gene with protein product', }, - {"name": "ensembl_biotype", "type": "Extension", "value": "protein_coding"}, - {"name": "strand", "type": "Extension", "value": "-"}, - {"name": "symbol_status", "type": "Extension", "value": "approved"}, + {'name': 'ensembl_biotype', 'type': 'Extension', 'value': 'protein_coding'}, + {'name': 'strand', 'type': 'Extension', 'value': '-'}, + {'name': 'symbol_status', 'type': 'Extension', 'value': 'approved'}, ], } return core_models.Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def normalized_abl1(): """Return normalized core Gene object for ABL1.""" params = { - "type": "Gene", - "id": "normalize.gene.hgnc:76", - "label": "ABL1", - "mappings": [ + 'type': 'Gene', + 'id': 'normalize.gene.hgnc:76', + 'label': 'ABL1', + 'mappings': [ { - "coding": {"code": "ENSG00000097007", "system": "ensembl"}, - "relation": "relatedMatch", + 'coding': {'code': 'ENSG00000097007', 'system': 'ensembl'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "25", "system": "ncbigene"}, - "relation": "relatedMatch", + 'coding': {'code': '25', 'system': 'ncbigene'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "OTTHUMG00000020813", "system": "vega"}, - "relation": "relatedMatch", + 'coding': {'code': 'OTTHUMG00000020813', 'system': 'vega'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "uc004bzv.4", "system": "ucsc"}, - "relation": "relatedMatch", + 'coding': {'code': 'uc004bzv.4', 'system': 'ucsc'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "CCDS35166", "system": "ccds"}, - "relation": "relatedMatch", + 'coding': {'code': 'CCDS35166', 'system': 'ccds'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "CCDS35165", "system": "ccds"}, - "relation": "relatedMatch", + 'coding': {'code': 'CCDS35165', 'system': 'ccds'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "P00519", "system": "uniprot"}, - "relation": "relatedMatch", + 'coding': {'code': 'P00519', 'system': 'uniprot'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "1857987", "system": "pubmed"}, - "relation": "relatedMatch", + 'coding': {'code': '1857987', 'system': 'pubmed'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "12626632", "system": "pubmed"}, - "relation": "relatedMatch", + 'coding': {'code': '12626632', 'system': 'pubmed'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "ABL1", "system": "cosmic"}, - "relation": "relatedMatch", + 'coding': {'code': 'ABL1', 'system': 'cosmic'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "189980", "system": "omim"}, - "relation": "relatedMatch", + 'coding': {'code': '189980', 'system': 'omim'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "117691", "system": "orphanet"}, - "relation": "relatedMatch", + 'coding': {'code': '117691', 'system': 'orphanet'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "1923", "system": "iuphar"}, - "relation": "relatedMatch", + 'coding': {'code': '1923', 'system': 'iuphar'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "M14752", "system": "ena.embl"}, - "relation": "relatedMatch", + 'coding': {'code': 'M14752', 'system': 'ena.embl'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "NM_007313", "system": "refseq"}, - "relation": "relatedMatch", + 'coding': {'code': 'NM_007313', 'system': 'refseq'}, + 'relation': 'relatedMatch', }, ], - "aliases": [ - "c-ABL", - "JTK7", - "p150", - "CHDSKM", - "BCR-ABL", - "v-abl", - "c-ABL1", - "bcr/abl", - "LOC116063", - "LOC112779", - "ABL", + 'aliases': [ + 'c-ABL', + 'JTK7', + 'p150', + 'CHDSKM', + 'BCR-ABL', + 'v-abl', + 'c-ABL1', + 'bcr/abl', + 'LOC116063', + 'LOC112779', + 'ABL', ], - "extensions": [ + 'extensions': [ { - "name": "previous_symbols", - "value": ["LOC116063", "LOC112779", "ABL"], - "type": "Extension", + 'name': 'previous_symbols', + 'value': ['LOC116063', 'LOC112779', 'ABL'], + 'type': 'Extension', }, { - "name": "approved_name", - "value": "ABL proto-oncogene 1, non-receptor tyrosine kinase", - "type": "Extension", + 'name': 'approved_name', + 'value': 'ABL proto-oncogene 1, non-receptor tyrosine kinase', + 'type': 'Extension', }, # { # "name": "hgnc_locations", @@ -423,8 +423,8 @@ def normalized_abl1(): # "type": "Extension" # }, { - "name": "ncbi_locations", - "value": [ + 'name': 'ncbi_locations', + 'value': [ # { # "id": "ga4gh:CL.1vsxettosueUHyFIOoTPzwIFD1DodLuT", # "type": "ChromosomeLocation", @@ -434,111 +434,111 @@ def normalized_abl1(): # "end": "q34.12" # }, { - "id": "ga4gh:SL.F1QUtInXQaBEjAJNR1sYHXdp0XC000Qi", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", + 'id': 'ga4gh:SL.F1QUtInXQaBEjAJNR1sYHXdp0XC000Qi', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI', }, - "start": 130713042, - "end": 130887675, + 'start': 130713042, + 'end': 130887675, } ], - "type": "Extension", + 'type': 'Extension', }, { - "name": "ensembl_locations", - "value": [ + 'name': 'ensembl_locations', + 'value': [ { - "id": "ga4gh:SL.P9Qu87GYxoWPYh1BdAQC5bTLorjvvye7", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", + 'id': 'ga4gh:SL.P9Qu87GYxoWPYh1BdAQC5bTLorjvvye7', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI', }, - "start": 130713015, - "end": 130887675, + 'start': 130713015, + 'end': 130887675, } ], - "type": "Extension", + 'type': 'Extension', }, - {"name": "ncbi_gene_type", "type": "Extension", "value": "protein-coding"}, + {'name': 'ncbi_gene_type', 'type': 'Extension', 'value': 'protein-coding'}, { - "name": "hgnc_locus_type", - "type": "Extension", - "value": "gene with protein product", + 'name': 'hgnc_locus_type', + 'type': 'Extension', + 'value': 'gene with protein product', }, - {"name": "ensembl_biotype", "type": "Extension", "value": "protein_coding"}, - {"name": "strand", "type": "Extension", "value": "+"}, - {"name": "symbol_status", "type": "Extension", "value": "approved"}, + {'name': 'ensembl_biotype', 'type': 'Extension', 'value': 'protein_coding'}, + {'name': 'strand', 'type': 'Extension', 'value': '+'}, + {'name': 'symbol_status', 'type': 'Extension', 'value': 'approved'}, ], } return core_models.Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def normalized_p150(): """Return normalized core Gene object for p150.""" params = { - "type": "Gene", - "id": "normalize.gene.hgnc:1910", - "label": "CHAF1A", - "mappings": [ + 'type': 'Gene', + 'id': 'normalize.gene.hgnc:1910', + 'label': 'CHAF1A', + 'mappings': [ { - "coding": {"code": "ENSG00000167670", "system": "ensembl"}, - "relation": "relatedMatch", + 'coding': {'code': 'ENSG00000167670', 'system': 'ensembl'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "10036", "system": "ncbigene"}, - "relation": "relatedMatch", + 'coding': {'code': '10036', 'system': 'ncbigene'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "601246", "system": "omim"}, - "relation": "relatedMatch", + 'coding': {'code': '601246', 'system': 'omim'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "CCDS32875", "system": "ccds"}, - "relation": "relatedMatch", + 'coding': {'code': 'CCDS32875', 'system': 'ccds'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "7600578", "system": "pubmed"}, - "relation": "relatedMatch", + 'coding': {'code': '7600578', 'system': 'pubmed'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "OTTHUMG00000181922", "system": "vega"}, - "relation": "relatedMatch", + 'coding': {'code': 'OTTHUMG00000181922', 'system': 'vega'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "Q13111", "system": "uniprot"}, - "relation": "relatedMatch", + 'coding': {'code': 'Q13111', 'system': 'uniprot'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "NM_005483", "system": "refseq"}, - "relation": "relatedMatch", + 'coding': {'code': 'NM_005483', 'system': 'refseq'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "U20979", "system": "ena.embl"}, - "relation": "relatedMatch", + 'coding': {'code': 'U20979', 'system': 'ena.embl'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "uc002mal.4", "system": "ucsc"}, - "relation": "relatedMatch", + 'coding': {'code': 'uc002mal.4', 'system': 'ucsc'}, + 'relation': 'relatedMatch', }, ], - "aliases": [ - "CAF1P150", - "MGC71229", - "CAF-1", - "P150", - "CAF1B", - "CAF1", - "LOC107985297", + 'aliases': [ + 'CAF1P150', + 'MGC71229', + 'CAF-1', + 'P150', + 'CAF1B', + 'CAF1', + 'LOC107985297', ], - "extensions": [ + 'extensions': [ { - "name": "approved_name", - "value": "chromatin assembly factor 1 subunit A", - "type": "Extension", + 'name': 'approved_name', + 'value': 'chromatin assembly factor 1 subunit A', + 'type': 'Extension', }, # { # "name": "hgnc_locations", @@ -555,23 +555,23 @@ def normalized_p150(): # "type": "Extension" # }, { - "name": "ensembl_locations", - "value": [ + 'name': 'ensembl_locations', + 'value': [ { - "id": "ga4gh:SL.tLUFh2LAYq-bsMi0Vob_TIWrz-sE4HgE", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", + 'id': 'ga4gh:SL.tLUFh2LAYq-bsMi0Vob_TIWrz-sE4HgE', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl', }, - "start": 4402639, - "end": 4445018, + 'start': 4402639, + 'end': 4445018, } ], }, { - "name": "ncbi_locations", - "value": [ + 'name': 'ncbi_locations', + 'value': [ # { # "id": "ga4gh:CL.kPEG2TGUPOAsAYK6HY0ukprQ-DR_IuMZ", # "type": "ChromosomeLocation", @@ -581,54 +581,54 @@ def normalized_p150(): # "end": "p13.3" # }, { - "id": "ga4gh:SL.-3T7UXNk6nIkMKB9YGEb0RTYxbVY2TUy", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", + 'id': 'ga4gh:SL.-3T7UXNk6nIkMKB9YGEb0RTYxbVY2TUy', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl', }, - "start": 4402639, - "end": 4450830, + 'start': 4402639, + 'end': 4450830, } ], }, - {"name": "ncbi_gene_type", "type": "Extension", "value": "protein-coding"}, + {'name': 'ncbi_gene_type', 'type': 'Extension', 'value': 'protein-coding'}, { - "name": "hgnc_locus_type", - "type": "Extension", - "value": "gene with protein product", + 'name': 'hgnc_locus_type', + 'type': 'Extension', + 'value': 'gene with protein product', }, - {"name": "ensembl_biotype", "type": "Extension", "value": "protein_coding"}, + {'name': 'ensembl_biotype', 'type': 'Extension', 'value': 'protein_coding'}, { - "name": "previous_symbols", - "type": "Extension", - "value": ["LOC107985297"], + 'name': 'previous_symbols', + 'type': 'Extension', + 'value': ['LOC107985297'], }, - {"name": "strand", "type": "Extension", "value": "+"}, - {"name": "symbol_status", "type": "Extension", "value": "approved"}, + {'name': 'strand', 'type': 'Extension', 'value': '+'}, + {'name': 'symbol_status', 'type': 'Extension', 'value': 'approved'}, ], } return core_models.Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def normalized_loc_653303(): """Provide test fixture for NCBI gene LOC653303. Used to validate normalized results that don't merge records. """ params = { - "type": "Gene", - "label": "LOC653303", - "aliases": ["LOC196266", "LOC654080", "LOC731196"], - "extensions": [ + 'type': 'Gene', + 'label': 'LOC653303', + 'aliases': ['LOC196266', 'LOC654080', 'LOC731196'], + 'extensions': [ { - "type": "Extension", - "name": "approved_name", - "value": "proprotein convertase subtilisin/kexin type 7 pseudogene", + 'type': 'Extension', + 'name': 'approved_name', + 'value': 'proprotein convertase subtilisin/kexin type 7 pseudogene', }, { - "name": "ncbi_locations", - "value": [ + 'name': 'ncbi_locations', + 'value': [ # { # "id": "ga4gh:CL.82tL1yxucvwp5U2Yo4jNYX06pru8zZYl", # "type": "ChromosomeLocation", @@ -638,48 +638,48 @@ def normalized_loc_653303(): # "end": "q23.3" # }, { - "id": "ga4gh:SL.hgpw5EH5q6_PFX1CTcOx5od0LKUQRuDH", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1", + 'id': 'ga4gh:SL.hgpw5EH5q6_PFX1CTcOx5od0LKUQRuDH', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1', }, - "start": 117135528, - "end": 117138867, + 'start': 117135528, + 'end': 117138867, } ], }, { - "type": "Extension", - "name": "previous_symbols", - "value": ["LOC196266", "LOC731196", "LOC654080"], + 'type': 'Extension', + 'name': 'previous_symbols', + 'value': ['LOC196266', 'LOC731196', 'LOC654080'], }, - {"type": "Extension", "name": "ncbi_gene_type", "value": "pseudo"}, - {"name": "strand", "type": "Extension", "value": "+"}, + {'type': 'Extension', 'name': 'ncbi_gene_type', 'value': 'pseudo'}, + {'name': 'strand', 'type': 'Extension', 'value': '+'}, ], - "id": "normalize.gene.ncbigene:653303", + 'id': 'normalize.gene.ncbigene:653303', } return core_models.Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def normalize_unmerged_loc_653303(): """Provide fixture for NCBI gene LOC655303. Used to validate normalized results that don't merge records. """ return { - "normalized_concept_id": "ncbigene:653303", - "source_matches": { - "NCBI": { - "records": [ + 'normalized_concept_id': 'ncbigene:653303', + 'source_matches': { + 'NCBI': { + 'records': [ { - "concept_id": "ncbigene:653303", - "symbol": "LOC653303", - "symbol_status": None, - "label": "proprotein convertase subtilisin/kexin type 7 pseudogene", # noqa: E501 - "strand": "+", - "location_annotations": [], - "locations": [ + 'concept_id': 'ncbigene:653303', + 'symbol': 'LOC653303', + 'symbol_status': None, + 'label': 'proprotein convertase subtilisin/kexin type 7 pseudogene', + 'strand': '+', + 'location_annotations': [], + 'locations': [ # { # "type": "ChromosomeLocation", # "id": "ga4gh:CL.82tL1yxucvwp5U2Yo4jNYX06pru8zZYl", @@ -689,21 +689,21 @@ def normalize_unmerged_loc_653303(): # "end": "q23.3" # }, { - "id": "ga4gh:SL.hgpw5EH5q6_PFX1CTcOx5od0LKUQRuDH", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1", # noqa: E501 + 'id': 'ga4gh:SL.hgpw5EH5q6_PFX1CTcOx5od0LKUQRuDH', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1', }, - "start": 117135528, - "end": 117138867, + 'start': 117135528, + 'end': 117138867, } ], - "aliases": [], - "previous_symbols": ["LOC196266", "LOC731196", "LOC654080"], - "xrefs": [], - "associated_with": [], - "gene_type": "pseudo", + 'aliases': [], + 'previous_symbols': ['LOC196266', 'LOC731196', 'LOC654080'], + 'xrefs': [], + 'associated_with': [], + 'gene_type': 'pseudo', } ] } @@ -711,22 +711,22 @@ def normalize_unmerged_loc_653303(): } -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def normalize_unmerged_chaf1a(): """Return expected results from /normalize_unmerged for CHAF1A.""" return { - "normalized_concept_id": "hgnc:1910", - "source_matches": { - "HGNC": { - "records": [ + 'normalized_concept_id': 'hgnc:1910', + 'source_matches': { + 'HGNC': { + 'records': [ { - "concept_id": "hgnc:1910", - "symbol": "CHAF1A", - "symbol_status": "approved", - "label": "chromatin assembly factor 1 subunit A", - "strand": None, - "location_annotations": [], - "locations": [ + 'concept_id': 'hgnc:1910', + 'symbol': 'CHAF1A', + 'symbol_status': 'approved', + 'label': 'chromatin assembly factor 1 subunit A', + 'strand': None, + 'location_annotations': [], + 'locations': [ # { # "type": "ChromosomeLocation", # "id": "ga4gh:CL.kPEG2TGUPOAsAYK6HY0ukprQ-DR_IuMZ", @@ -736,69 +736,69 @@ def normalize_unmerged_chaf1a(): # "end": "p13.3" # } ], - "aliases": [ - "CAF1P150", - "P150", - "CAF1", - "CAF1B", - "MGC71229", - "CAF-1", + 'aliases': [ + 'CAF1P150', + 'P150', + 'CAF1', + 'CAF1B', + 'MGC71229', + 'CAF-1', ], - "previous_symbols": [], - "xrefs": ["ensembl:ENSG00000167670", "ncbigene:10036"], - "associated_with": [ - "vega:OTTHUMG00000181922", - "ccds:CCDS32875", - "ucsc:uc002mal.4", - "pubmed:7600578", - "uniprot:Q13111", - "omim:601246", - "ena.embl:U20979", - "refseq:NM_005483", + 'previous_symbols': [], + 'xrefs': ['ensembl:ENSG00000167670', 'ncbigene:10036'], + 'associated_with': [ + 'vega:OTTHUMG00000181922', + 'ccds:CCDS32875', + 'ucsc:uc002mal.4', + 'pubmed:7600578', + 'uniprot:Q13111', + 'omim:601246', + 'ena.embl:U20979', + 'refseq:NM_005483', ], - "gene_type": "gene with protein product", + 'gene_type': 'gene with protein product', } ], }, - "Ensembl": { - "records": [ + 'Ensembl': { + 'records': [ { - "concept_id": "ensembl:ENSG00000167670", - "symbol": "CHAF1A", - "symbol_status": None, - "label": "chromatin assembly factor 1 subunit A", - "strand": "+", - "location_annotations": [], - "locations": [ + 'concept_id': 'ensembl:ENSG00000167670', + 'symbol': 'CHAF1A', + 'symbol_status': None, + 'label': 'chromatin assembly factor 1 subunit A', + 'strand': '+', + 'location_annotations': [], + 'locations': [ { - "id": "ga4gh:SL.tLUFh2LAYq-bsMi0Vob_TIWrz-sE4HgE", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", # noqa: E501 + 'id': 'ga4gh:SL.tLUFh2LAYq-bsMi0Vob_TIWrz-sE4HgE', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl', }, - "start": 4402639, - "end": 4445018, + 'start': 4402639, + 'end': 4445018, } ], - "aliases": [], - "previous_symbols": [], - "xrefs": ["hgnc:1910"], - "associated_with": [], - "gene_type": "protein_coding", + 'aliases': [], + 'previous_symbols': [], + 'xrefs': ['hgnc:1910'], + 'associated_with': [], + 'gene_type': 'protein_coding', } ], }, - "NCBI": { - "records": [ + 'NCBI': { + 'records': [ { - "concept_id": "ncbigene:10036", - "symbol": "CHAF1A", - "symbol_status": None, - "label": "chromatin assembly factor 1 subunit A", - "strand": "+", - "location_annotations": [], - "locations": [ + 'concept_id': 'ncbigene:10036', + 'symbol': 'CHAF1A', + 'symbol_status': None, + 'label': 'chromatin assembly factor 1 subunit A', + 'strand': '+', + 'location_annotations': [], + 'locations': [ # { # "type": "ChromosomeLocation", # "id": "ga4gh:CL.kPEG2TGUPOAsAYK6HY0ukprQ-DR_IuMZ", @@ -808,21 +808,21 @@ def normalize_unmerged_chaf1a(): # "end": "p13.3" # }, { - "id": "ga4gh:SL.-3T7UXNk6nIkMKB9YGEb0RTYxbVY2TUy", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", # noqa: E501 + 'id': 'ga4gh:SL.-3T7UXNk6nIkMKB9YGEb0RTYxbVY2TUy', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl', }, - "start": 4402639, - "end": 4450830, + 'start': 4402639, + 'end': 4450830, } ], - "aliases": ["CAF1P150", "P150", "CAF1", "CAF1B", "CAF-1"], - "previous_symbols": ["LOC107985297"], - "xrefs": ["ensembl:ENSG00000167670", "hgnc:1910"], - "associated_with": ["omim:601246"], - "gene_type": "protein-coding", + 'aliases': ['CAF1P150', 'P150', 'CAF1', 'CAF1B', 'CAF-1'], + 'previous_symbols': ['LOC107985297'], + 'xrefs': ['ensembl:ENSG00000167670', 'hgnc:1910'], + 'associated_with': ['omim:601246'], + 'gene_type': 'protein-coding', } ] }, @@ -830,22 +830,22 @@ def normalize_unmerged_chaf1a(): } -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def normalize_unmerged_ache(): """Provide ACHE fixture for unmerged normalize endpoint.""" return { - "normalized_concept_id": "hgnc:108", - "source_matches": { - "NCBI": { - "records": [ + 'normalized_concept_id': 'hgnc:108', + 'source_matches': { + 'NCBI': { + 'records': [ { - "concept_id": "ncbigene:43", - "symbol": "ACHE", - "symbol_status": None, - "label": "acetylcholinesterase (Cartwright blood group)", - "strand": "-", - "location_annotations": [], - "locations": [ + 'concept_id': 'ncbigene:43', + 'symbol': 'ACHE', + 'symbol_status': None, + 'label': 'acetylcholinesterase (Cartwright blood group)', + 'strand': '-', + 'location_annotations': [], + 'locations': [ # { # "type": "ChromosomeLocation", # "id": "ga4gh:CL.JSw-08GkF-7M-OQR-33MLLKQHSi7QJb5", @@ -855,63 +855,63 @@ def normalize_unmerged_ache(): # "end": "q22.1" # }, { - "id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 + 'id': 'ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', }, - "start": 100889993, - "end": 100896994, + 'start': 100889993, + 'end': 100896994, } ], - "aliases": ["YT", "ARACHE", "ACEE", "N-ACHE"], - "previous_symbols": ["ACEE"], - "xrefs": ["hgnc:108", "ensembl:ENSG00000087085"], - "associated_with": ["omim:100740"], - "gene_type": "protein-coding", + 'aliases': ['YT', 'ARACHE', 'ACEE', 'N-ACHE'], + 'previous_symbols': ['ACEE'], + 'xrefs': ['hgnc:108', 'ensembl:ENSG00000087085'], + 'associated_with': ['omim:100740'], + 'gene_type': 'protein-coding', } ], }, - "Ensembl": { - "records": [ + 'Ensembl': { + 'records': [ { - "concept_id": "ensembl:ENSG00000087085", - "symbol": "ACHE", - "symbol_status": None, - "label": "acetylcholinesterase (Cartwright blood group)", - "strand": "-", - "location_annotations": [], - "locations": [ + 'concept_id': 'ensembl:ENSG00000087085', + 'symbol': 'ACHE', + 'symbol_status': None, + 'label': 'acetylcholinesterase (Cartwright blood group)', + 'strand': '-', + 'location_annotations': [], + 'locations': [ { - "id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 + 'id': 'ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1', + 'type': 'SequenceLocation', + 'sequenceReference': { + 'type': 'SequenceReference', + 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', }, - "start": 100889993, - "end": 100896974, + 'start': 100889993, + 'end': 100896974, } ], - "aliases": [], - "previous_symbols": [], - "xrefs": ["hgnc:108"], - "associated_with": [], - "gene_type": "protein_coding", + 'aliases': [], + 'previous_symbols': [], + 'xrefs': ['hgnc:108'], + 'associated_with': [], + 'gene_type': 'protein_coding', } ] }, - "HGNC": { - "records": [ + 'HGNC': { + 'records': [ { - "concept_id": "hgnc:108", - "symbol": "ACHE", - "symbol_status": "approved", - "label": "acetylcholinesterase (Cartwright blood group)", - "strand": None, - "location_annotations": [], - "locations": [ + 'concept_id': 'hgnc:108', + 'symbol': 'ACHE', + 'symbol_status': 'approved', + 'label': 'acetylcholinesterase (Cartwright blood group)', + 'strand': None, + 'location_annotations': [], + 'locations': [ # { # "type": "ChromosomeLocation", # "id": "ga4gh:CL.JSw-08GkF-7M-OQR-33MLLKQHSi7QJb5", @@ -921,23 +921,23 @@ def normalize_unmerged_ache(): # "end": "q22.1" # } ], - "aliases": ["3.1.1.7"], - "previous_symbols": ["YT"], - "xrefs": ["ncbigene:43", "ensembl:ENSG00000087085"], - "associated_with": [ - "ucsc:uc003uxi.4", - "vega:OTTHUMG00000157033", - "merops:S09.979", - "ccds:CCDS5710", - "omim:100740", - "iuphar:2465", - "ccds:CCDS5709", - "refseq:NM_015831", - "pubmed:1380483", - "uniprot:P22303", - "ccds:CCDS64736", + 'aliases': ['3.1.1.7'], + 'previous_symbols': ['YT'], + 'xrefs': ['ncbigene:43', 'ensembl:ENSG00000087085'], + 'associated_with': [ + 'ucsc:uc003uxi.4', + 'vega:OTTHUMG00000157033', + 'merops:S09.979', + 'ccds:CCDS5710', + 'omim:100740', + 'iuphar:2465', + 'ccds:CCDS5709', + 'refseq:NM_015831', + 'pubmed:1380483', + 'uniprot:P22303', + 'ccds:CCDS64736', ], - "gene_type": "gene with protein product", + 'gene_type': 'gene with protein product', } ] }, @@ -945,55 +945,55 @@ def normalize_unmerged_ache(): } -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def normalized_ifnr(): """Return normalized core Gene object for IFNR.""" params = { - "type": "Gene", - "id": "normalize.gene.hgnc:5447", - "label": "IFNR", - "mappings": [ + 'type': 'Gene', + 'id': 'normalize.gene.hgnc:5447', + 'label': 'IFNR', + 'mappings': [ { - "coding": {"code": "3466", "system": "ncbigene"}, - "relation": "relatedMatch", + 'coding': {'code': '3466', 'system': 'ncbigene'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "1906174", "system": "pubmed"}, - "relation": "relatedMatch", + 'coding': {'code': '1906174', 'system': 'pubmed'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "147573", "system": "omim"}, - "relation": "relatedMatch", + 'coding': {'code': '147573', 'system': 'omim'}, + 'relation': 'relatedMatch', }, { - "coding": {"code": "1193239", "system": "pubmed"}, - "relation": "relatedMatch", + 'coding': {'code': '1193239', 'system': 'pubmed'}, + 'relation': 'relatedMatch', }, ], - "aliases": ["IFNGM", "IFNGM2"], - "extensions": [ - { - "name": "approved_name", - "value": "interferon production regulator", - "type": "Extension", - }, - {"name": "symbol_status", "value": "approved", "type": "Extension"}, - {"name": "symbol_status", "value": "approved", "type": "Extension"}, - {"name": "ncbi_gene_type", "type": "Extension", "value": "unknown"}, - {"name": "hgnc_locus_type", "type": "Extension", "value": "unknown"}, - {"name": "location_annotations", "type": "Extension", "value": ["16"]}, + 'aliases': ['IFNGM', 'IFNGM2'], + 'extensions': [ + { + 'name': 'approved_name', + 'value': 'interferon production regulator', + 'type': 'Extension', + }, + {'name': 'symbol_status', 'value': 'approved', 'type': 'Extension'}, + {'name': 'symbol_status', 'value': 'approved', 'type': 'Extension'}, + {'name': 'ncbi_gene_type', 'type': 'Extension', 'value': 'unknown'}, + {'name': 'hgnc_locus_type', 'type': 'Extension', 'value': 'unknown'}, + {'name': 'location_annotations', 'type': 'Extension', 'value': ['16']}, ], } return core_models.Gene(**params) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def num_sources(): """Get the number of sources.""" return len({s for s in SourceName}) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def source_meta(): """Create test fixture for source meta""" return [SourceName.HGNC, SourceName.ENSEMBL, SourceName.NCBI] @@ -1002,18 +1002,18 @@ def source_meta(): def compare_warnings(actual_warnings, expected_warnings): """Compare response warnings against expected results.""" if expected_warnings: - assert len(actual_warnings) == len(expected_warnings), "warnings len" + assert len(actual_warnings) == len(expected_warnings), 'warnings len' for e_warnings in expected_warnings: for r_warnings in actual_warnings: for e_key, e_val in e_warnings.items(): for r_val in r_warnings.values(): if e_key == r_val: if isinstance(e_val, list): - assert set(r_val) == set(e_val), "warnings val" + assert set(r_val) == set(e_val), 'warnings val' else: - assert r_val == e_val, "warnings val" + assert r_val == e_val, 'warnings val' else: - assert actual_warnings == [], "warnings != []" + assert actual_warnings == [], 'warnings != []' def compare_normalize_resp( @@ -1028,7 +1028,7 @@ def compare_normalize_resp( assert resp.query == expected_query compare_warnings(resp.warnings, expected_warnings) assert resp.match_type == expected_match_type - assert resp.normalized_id == expected_gene.id.split("normalize.gene.")[-1] + assert resp.normalized_id == expected_gene.id.split('normalize.gene.')[-1] compare_gene(expected_gene, resp.gene) if not expected_source_meta: assert resp.source_meta_ == {} @@ -1036,7 +1036,7 @@ def compare_normalize_resp( resp_source_meta_keys = resp.source_meta_.keys() assert len(resp_source_meta_keys) == len( expected_source_meta - ), "source_meta_keys" # noqa: E501 + ), 'source_meta_keys' for src in expected_source_meta: assert src in resp_source_meta_keys compare_service_meta(resp.service_meta_) @@ -1065,7 +1065,7 @@ def compare_unmerged_response(actual, query, warnings, match_type, fixture): assert actual.query == query compare_warnings(actual.warnings, warnings) assert actual.match_type == match_type - assert actual.normalized_concept_id == fixture["normalized_concept_id"] + assert actual.normalized_concept_id == fixture['normalized_concept_id'] for source, match in actual.source_matches.items(): assert match.source_meta_ # check that it's there @@ -1073,20 +1073,20 @@ def compare_unmerged_response(actual, query, warnings, match_type, fixture): concept_id = record.concept_id fixture_gene = None # get corresponding fixture record - for gene in fixture["source_matches"][source.value]["records"]: - if gene["concept_id"] == concept_id: + for gene in fixture['source_matches'][source.value]['records']: + if gene['concept_id'] == concept_id: fixture_gene = BaseGene(**gene) break - assert fixture_gene, f"Unable to find fixture for {concept_id}" + assert fixture_gene, f'Unable to find fixture for {concept_id}' compare_unmerged_record(record, fixture_gene) def compare_service_meta(service_meta): """Check that service metadata is correct.""" - assert service_meta.name == "gene-normalizer" - assert service_meta.version >= "0.1.0" + assert service_meta.name == 'gene-normalizer' + assert service_meta.version >= '0.1.0' assert isinstance(service_meta.response_datetime, str) - assert service_meta.url == "https://github.com/cancervariants/gene-normalization" + assert service_meta.url == 'https://github.com/cancervariants/gene-normalization' def compare_gene(test, actual): @@ -1109,15 +1109,15 @@ def compare_gene(test, actual): assert no_matches == [], no_matches assert len(actual.mappings) == len(test.mappings) - assert set(actual.aliases) == set(test.aliases), "aliases" - extensions_present = "extensions" in test.model_fields.keys() - assert ("extensions" in actual.model_fields.keys()) == extensions_present + assert set(actual.aliases) == set(test.aliases), 'aliases' + extensions_present = 'extensions' in test.model_fields.keys() + assert ('extensions' in actual.model_fields.keys()) == extensions_present if extensions_present: actual_ext_names = sorted([ext.name for ext in actual.extensions]) unique_actual_ext_names = sorted(set(actual_ext_names)) - assert actual_ext_names == unique_actual_ext_names, "duplicate extension names" + assert actual_ext_names == unique_actual_ext_names, 'duplicate extension names' test_ext_names = {ext.name for ext in test.extensions} - assert set(actual_ext_names) == test_ext_names, "extension names dont match" + assert set(actual_ext_names) == test_ext_names, 'extension names dont match' n_ext_correct = 0 for test_ext in test.extensions: for actual_ext in actual.extensions: @@ -1130,20 +1130,20 @@ def compare_gene(test, actual): else: assert set(actual_ext.value) == set( test_ext.value - ), f"{test_ext.value} value" + ), f'{test_ext.value} value' else: assert actual_ext.value == test_ext.value else: assert actual_ext.value == test_ext.value assert actual_ext.type == test_ext.type n_ext_correct += 1 - assert n_ext_correct == len(test.extensions), "number of correct extensions" + assert n_ext_correct == len(test.extensions), 'number of correct extensions' def test_search_query(query_handler, num_sources): """Test that query returns properly-structured response.""" - resp = query_handler.search(" BRAF ") - assert resp.query == "BRAF" + resp = query_handler.search(' BRAF ') + assert resp.query == 'BRAF' matches = resp.source_matches assert isinstance(matches, dict) assert len(matches) == num_sources @@ -1151,20 +1151,20 @@ def test_search_query(query_handler, num_sources): def test_search_query_inc_exc(query_handler, num_sources): """Test that query incl and excl work correctly.""" - sources = "hgnc, ensembl, ncbi" - resp = query_handler.search("BRAF", excl=sources) + sources = 'hgnc, ensembl, ncbi' + resp = query_handler.search('BRAF', excl=sources) matches = resp.source_matches assert len(matches) == num_sources - len(sources.split()) - sources = "Hgnc, NCBi" - resp = query_handler.search("BRAF", incl=sources) + sources = 'Hgnc, NCBi' + resp = query_handler.search('BRAF', incl=sources) matches = resp.source_matches assert len(matches) == len(sources.split()) assert SourceName.HGNC in matches assert SourceName.NCBI in matches - sources = "HGnC" - resp = query_handler.search("BRAF", excl=sources) + sources = 'HGnC' + resp = query_handler.search('BRAF', excl=sources) matches = resp.source_matches assert len(matches) == num_sources - len(sources.split()) assert SourceName.ENSEMBL in matches @@ -1174,30 +1174,30 @@ def test_search_query_inc_exc(query_handler, num_sources): def test_search_invalid_parameter_exception(query_handler): """Test that Invalid parameter exception works correctly.""" with pytest.raises(InvalidParameterException): - _ = query_handler.search("BRAF", incl="hgn") # noqa: F841, E501 + _ = query_handler.search('BRAF', incl='hgn') # noqa: F841 with pytest.raises(InvalidParameterException): - resp = query_handler.search("BRAF", incl="hgnc", excl="hgnc") # noqa: F841 + resp = query_handler.search('BRAF', incl='hgnc', excl='hgnc') # noqa: F841 def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): """Test that ACHE concept_id shows xref matches.""" # Search - resp = query_handler.search("ncbigene:43") + resp = query_handler.search('ncbigene:43') matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF assert len(matches[SourceName.ENSEMBL].records) == 0 assert matches[SourceName.NCBI].records[0].match_type == MatchType.CONCEPT_ID - resp = query_handler.search("hgnc:108") + resp = query_handler.search('hgnc:108') matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.CONCEPT_ID assert matches[SourceName.ENSEMBL].records[0].match_type == MatchType.XREF assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF - resp = query_handler.search("ensembl:ENSG00000087085") + resp = query_handler.search('ensembl:ENSG00000087085') matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF @@ -1205,49 +1205,49 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF # Normalize - q = "ACHE" + q = 'ACHE' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.SYMBOL, normalized_ache, expected_source_meta=source_meta ) - q = "ache" + q = 'ache' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.SYMBOL, normalized_ache, expected_source_meta=source_meta ) - q = "hgnc:108" + q = 'hgnc:108' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_ache, expected_source_meta=source_meta ) - q = "ensembl:ENSG00000087085" + q = 'ensembl:ENSG00000087085' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_ache, expected_source_meta=source_meta ) - q = "ncbigene:43" + q = 'ncbigene:43' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_ache, expected_source_meta=source_meta ) - q = "3.1.1.7" + q = '3.1.1.7' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.ALIAS, normalized_ache, expected_source_meta=source_meta ) - q = "ARACHE" + q = 'ARACHE' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.ALIAS, normalized_ache, expected_source_meta=source_meta ) - q = "YT" + q = 'YT' resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1257,7 +1257,7 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): expected_source_meta=source_meta, ) - q = "ACEE" + q = 'ACEE' resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1267,7 +1267,7 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): expected_source_meta=source_meta, ) - q = "omim:100740" + q = 'omim:100740' resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1281,21 +1281,21 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): def test_braf_query(query_handler, num_sources, normalized_braf, source_meta): """Test that BRAF concept_id shows xref matches.""" # Search - resp = query_handler.search("ncbigene:673") + resp = query_handler.search('ncbigene:673') matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF assert len(matches[SourceName.ENSEMBL].records) == 0 assert matches[SourceName.NCBI].records[0].match_type == MatchType.CONCEPT_ID - resp = query_handler.search("hgnc:1097") + resp = query_handler.search('hgnc:1097') matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.CONCEPT_ID assert matches[SourceName.ENSEMBL].records[0].match_type == MatchType.XREF assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF - resp = query_handler.search("ensembl:ENSG00000157764") + resp = query_handler.search('ensembl:ENSG00000157764') matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF @@ -1303,49 +1303,49 @@ def test_braf_query(query_handler, num_sources, normalized_braf, source_meta): assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF # Normalize - q = "BRAF" + q = 'BRAF' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.SYMBOL, normalized_braf, expected_source_meta=source_meta ) - q = "braf" + q = 'braf' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.SYMBOL, normalized_braf, expected_source_meta=source_meta ) - q = "hgnc:1097" + q = 'hgnc:1097' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_braf, expected_source_meta=source_meta ) - q = "ensembl:ENSG00000157764" + q = 'ensembl:ENSG00000157764' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_braf, expected_source_meta=source_meta ) - q = "ncbigene:673" + q = 'ncbigene:673' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_braf, expected_source_meta=source_meta ) - q = "NS7" + q = 'NS7' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.ALIAS, normalized_braf, expected_source_meta=source_meta ) - q = "b-raf" + q = 'b-raf' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.ALIAS, normalized_braf, expected_source_meta=source_meta ) - q = "omim:164757" + q = 'omim:164757' resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1359,21 +1359,21 @@ def test_braf_query(query_handler, num_sources, normalized_braf, source_meta): def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): """Test that ABL1 concept_id shows xref matches.""" # Search - resp = query_handler.search("ncbigene:25") + resp = query_handler.search('ncbigene:25') matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF assert len(matches[SourceName.ENSEMBL].records) == 0 assert matches[SourceName.NCBI].records[0].match_type == MatchType.CONCEPT_ID - resp = query_handler.search("hgnc:76") + resp = query_handler.search('hgnc:76') matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.CONCEPT_ID assert matches[SourceName.ENSEMBL].records[0].match_type == MatchType.XREF assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF - resp = query_handler.search("ensembl:ENSG00000097007") + resp = query_handler.search('ensembl:ENSG00000097007') matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF @@ -1381,43 +1381,43 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF # Normalize - q = "ABL1" + q = 'ABL1' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.SYMBOL, normalized_abl1, expected_source_meta=source_meta ) - q = "abl1" + q = 'abl1' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.SYMBOL, normalized_abl1, expected_source_meta=source_meta ) - q = "hgnc:76" + q = 'hgnc:76' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_abl1, expected_source_meta=source_meta ) - q = "ensembl:ENSG00000097007" + q = 'ensembl:ENSG00000097007' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_abl1, expected_source_meta=source_meta ) - q = "ncbigene:25" + q = 'ncbigene:25' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_abl1, expected_source_meta=source_meta ) - q = "v-abl" + q = 'v-abl' resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.ALIAS, normalized_abl1, expected_source_meta=source_meta ) - q = "LOC116063" + q = 'LOC116063' resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1427,7 +1427,7 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): expected_source_meta=source_meta, ) - q = "LOC112779" + q = 'LOC112779' resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1437,7 +1437,7 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): expected_source_meta=source_meta, ) - q = "ABL" + q = 'ABL' resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1447,7 +1447,7 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): expected_source_meta=source_meta, ) - q = "refseq:NM_007313" + q = 'refseq:NM_007313' resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1460,16 +1460,16 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): def test_multiple_norm_concepts(query_handler, normalized_p150, source_meta): """Tests where more than one normalized concept is found.""" - q = "P150" + q = 'P150' resp = query_handler.normalize(q) expected_warnings = [ { - "multiple_normalized_concepts_found": [ - "hgnc:16850", - "hgnc:76", - "hgnc:17168", - "hgnc:500", - "hgnc:8982", + 'multiple_normalized_concepts_found': [ + 'hgnc:16850', + 'hgnc:76', + 'hgnc:17168', + 'hgnc:500', + 'hgnc:8982', ] } ] @@ -1487,7 +1487,7 @@ def test_normalize_single_entry(query_handler, normalized_loc_653303): """Test that the normalized endpoint correctly shapes unmerged identity records into core gene objects. """ - q = "LOC653303" + q = 'LOC653303' resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1502,7 +1502,7 @@ def test_normalize_no_locations(query_handler, normalized_ifnr): """Test that the normalized endpoint correcly shapes merged entity with no locations """ - q = "IFNR" + q = 'IFNR' resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1521,55 +1521,55 @@ def test_normalize_unmerged( ): """Test that unmerged normalization produces correct results.""" # concept ID - q = "ncbigene:653303" + q = 'ncbigene:653303' resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.CONCEPT_ID, normalize_unmerged_loc_653303 ) - q = "hgnc:1910" + q = 'hgnc:1910' resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.CONCEPT_ID, normalize_unmerged_chaf1a ) - q = "HGNC:108" + q = 'HGNC:108' resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.CONCEPT_ID, normalize_unmerged_ache ) # symbol - q = "LOC653303" + q = 'LOC653303' resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.SYMBOL, normalize_unmerged_loc_653303 ) # prev symbol - q = "ACEE" + q = 'ACEE' resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.PREV_SYMBOL, normalize_unmerged_ache ) - q = "LOC196266" + q = 'LOC196266' resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.PREV_SYMBOL, normalize_unmerged_loc_653303 ) # alias - q = "P150" + q = 'P150' resp = query_handler.normalize_unmerged(q) expected_warnings = [ { - "multiple_normalized_concepts_found": [ - "hgnc:500", - "hgnc:8982", - "hgnc:17168", - "hgnc:16850", - "hgnc:76", + 'multiple_normalized_concepts_found': [ + 'hgnc:500', + 'hgnc:8982', + 'hgnc:17168', + 'hgnc:16850', + 'hgnc:76', ] } ] @@ -1577,22 +1577,22 @@ def test_normalize_unmerged( resp, q, expected_warnings, MatchType.ALIAS, normalize_unmerged_chaf1a ) - q = "ARACHE" + q = 'ARACHE' resp = query_handler.normalize_unmerged(q) compare_unmerged_response(resp, q, [], MatchType.ALIAS, normalize_unmerged_ache) - q = "MGC71229" + q = 'MGC71229' resp = query_handler.normalize_unmerged(q) compare_unmerged_response(resp, q, [], MatchType.ALIAS, normalize_unmerged_chaf1a) # assoc with - q = "omim:100740" + q = 'omim:100740' resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.ASSOCIATED_WITH, normalize_unmerged_ache ) - q = "uniprot:Q13111" + q = 'uniprot:Q13111' resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.ASSOCIATED_WITH, normalize_unmerged_chaf1a @@ -1601,18 +1601,18 @@ def test_normalize_unmerged( def test_invalid_queries(query_handler): """Test invalid queries""" - resp = query_handler.normalize("B R A F") + resp = query_handler.normalize('B R A F') assert resp.match_type is MatchType.NO_MATCH with pytest.raises(TypeError): - resp["match_type"] + resp['match_type'] - resp = query_handler.search("B R A F") + resp = query_handler.search('B R A F') records = [r for matches in resp.source_matches.values() for r in matches.records] assert len(records) == 0 def test_service_meta(query_handler): """Test service meta info in response.""" - resp = query_handler.search("pheno") + resp = query_handler.search('pheno') compare_service_meta(resp.service_meta_) diff --git a/tests/unit/test_schemas.py b/tests/unit/test_schemas.py index 3d5fceed..fbf6f339 100644 --- a/tests/unit/test_schemas.py +++ b/tests/unit/test_schemas.py @@ -16,22 +16,22 @@ # ) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def sequence_location(): """Create a valid sequence location test fixture.""" return models.SequenceLocation( sequence=models.SequenceReference( - refgetAccession="SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul" + refgetAccession='SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul' ), start=140719327, end=140924929, ) -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def gene(): """Create a valid gene test fixture.""" - return Gene(match_type=100, concept_id="hgnc:1097", symbol="BRAF") + return Gene(match_type=100, concept_id='hgnc:1097', symbol='BRAF') def test_gene(gene, sequence_location): @@ -39,77 +39,77 @@ def test_gene(gene, sequence_location): assert gene assert Gene( match_type=100, - concept_id="ensembl:1", - symbol="GENE", + concept_id='ensembl:1', + symbol='GENE', # locations=[chromosome_location, sequence_location] locations=[sequence_location], ) assert Gene( match_type=100, - concept_id="ensembl:1", - symbol="GENE", + concept_id='ensembl:1', + symbol='GENE', locations=[sequence_location], ) assert Gene( match_type=100, - concept_id="ensembl:1", - symbol="GENE", + concept_id='ensembl:1', + symbol='GENE', locations=[sequence_location], ) # id not a valid curie with pytest.raises(pydantic.ValidationError): - Gene(match_type=100, concept_id="hgnc1096", symbol="BRAF") + Gene(match_type=100, concept_id='hgnc1096', symbol='BRAF') # symbol not a str with pytest.raises(pydantic.ValidationError): - Gene(match_type=100, concept_id="hgnc:1096", symbol=1) + Gene(match_type=100, concept_id='hgnc:1096', symbol=1) # strand not -/+ with pytest.raises(pydantic.ValidationError): - Gene(match_type=100, concept_id="hgnc:1096", symbol="BRAF", strand="positive") + Gene(match_type=100, concept_id='hgnc:1096', symbol='BRAF', strand='positive') # xrefs not a valid curie with pytest.raises(pydantic.ValidationError): Gene( match_type=100, - concept_id="hgnc:1096", - symbol="BRAF", - xrefs=["hgnc", "hgnc:1"], + concept_id='hgnc:1096', + symbol='BRAF', + xrefs=['hgnc', 'hgnc:1'], ) # associated_with not a valid curie with pytest.raises(pydantic.ValidationError): Gene( match_type=100, - concept_id="hgnc:1096", - symbol="BRAF", - associated_with=["hgnc", "hgnc:1"], + concept_id='hgnc:1096', + symbol='BRAF', + associated_with=['hgnc', 'hgnc:1'], ) # symbol status invalid with pytest.raises(pydantic.ValidationError): Gene( match_type=100, - concept_id="hgnc:1096", - symbol="BRAF", - symbol_status="nothing", + concept_id='hgnc:1096', + symbol='BRAF', + symbol_status='nothing', ) # locations not a sequence or chromosome location with pytest.raises(pydantic.ValidationError): Gene( match_type=100, - concept_id="hgnc:1096", - symbol="BRAF", - locations=["GRCh38:chr1"], + concept_id='hgnc:1096', + symbol='BRAF', + locations=['GRCh38:chr1'], ) # location not a list with pytest.raises(pydantic.ValidationError): Gene( match_type=100, - concept_id="hgnc:1096", - symbol="BRAF", + concept_id='hgnc:1096', + symbol='BRAF', locations=sequence_location, )