diff --git a/Pipfile b/Pipfile index 9ea28096..10887e0e 100644 --- a/Pipfile +++ b/Pipfile @@ -19,7 +19,7 @@ wags-tails = ">=0.1.1" psycopg = {version = "*", extras=["binary"]} pytest = "*" pre-commit = "*" -ruff = ">=0.1.2" +ruff = "==0.2.0" pytest-cov = "*" httpx = "*" mock = "*" diff --git a/docs/scripts/generate_normalize_figure.py b/docs/scripts/generate_normalize_figure.py index 1a39a085..aac3dcae 100644 --- a/docs/scripts/generate_normalize_figure.py +++ b/docs/scripts/generate_normalize_figure.py @@ -43,13 +43,13 @@ def create_gjgf(result: UnmergedNormalizationService) -> Dict: } } - for i, (source, matches) in enumerate(result.source_matches.items()): + for i, (_, matches) in enumerate(result.source_matches.items()): for match in matches.records: graph["graph"]["nodes"][match.concept_id] = { "metadata": { "color": COLORS[i], - "hover": f"{match.concept_id}\n{match.symbol}\n{match.label}", # noqa: E501 - "click": f"
<pre>{json.dumps(match.model_dump(), indent=2)}</pre>", # noqa: E501 + "hover": f"{match.concept_id}\n{match.symbol}\n{match.label}", + "click": f"<pre>{json.dumps(match.model_dump(), indent=2)}</pre>
", } } for xref in match.xrefs: diff --git a/docs/source/conf.py b/docs/source/conf.py index f22f3d54..959356b5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -79,7 +79,7 @@ def linkcode_resolve(domain, info): if not info["module"]: return None filename = info["module"].replace(".", "/") - return f"https://github.com/cancervariants/gene-normalization/blob/main/{filename}.py" # noqa: E501 + return f"https://github.com/cancervariants/gene-normalization/blob/main/{filename}.py" # -- code block style -------------------------------------------------------- diff --git a/src/gene/__init__.py b/src/gene/__init__.py index 2c569554..8121c6ee 100644 --- a/src/gene/__init__.py +++ b/src/gene/__init__.py @@ -3,7 +3,7 @@ from os import environ from pathlib import Path -from .version import __version__ # noqa: F401 +from .version import __version__ APP_ROOT = Path(__file__).resolve().parent @@ -53,7 +53,7 @@ class DownloadException(Exception): # noqa: N818 PREFIX_LOOKUP = { v.value: SourceName[k].value for k, v in NamespacePrefix.__members__.items() - if k in SourceName.__members__.keys() + if k in SourceName.__members__ } # use to generate namespace prefix from source ID value diff --git a/src/gene/cli.py b/src/gene/cli.py index a14ce952..aac15baf 100644 --- a/src/gene/cli.py +++ b/src/gene/cli.py @@ -1,6 +1,7 @@ """Provides a CLI util to make updates to normalizer database.""" import logging import os +from ast import literal_eval from pathlib import Path from timeit import default_timer as timer from typing import Collection, List, Optional, Set @@ -69,10 +70,10 @@ def update_from_remote(data_url: Optional[str], db_url: str) -> None: except NotImplementedError: click.echo( f"Error: Fetching remote data dump not supported for {db.__class__.__name__}" - ) # noqa: E501 + ) click.get_current_context().exit(1) except DatabaseException as e: - click.echo(f"Encountered exception during update: {str(e)}") + click.echo(f"Encountered exception during update: {e!s}") click.get_current_context().exit(1) @@ -92,7 +93,7 @@ def dump_database(output_directory: Path, db_url: str) -> None: :param db_url: URL to normalizer database """ # noqa: D301 if not output_directory: - output_directory = Path(".") + output_directory = Path() db = create_db(db_url, False) try: @@ -100,10 +101,10 @@ def dump_database(output_directory: Path, db_url: str) -> None: except NotImplementedError: click.echo( f"Error: Dumping data to file not supported for {db.__class__.__name__}" - ) # noqa: E501 + ) click.get_current_context().exit(1) except DatabaseException as e: - click.echo(f"Encountered exception during update: {str(e)}") + click.echo(f"Encountered exception during update: {e!s}") click.get_current_context().exit(1) @@ -122,7 +123,7 @@ def _update_normalizer( :param use_existing: if True, use most recent local version of source data instead of fetching from remote """ - processed_ids = list() + processed_ids = [] for n in sources: delete_time = _delete_source(n, db) _load_source(n, db, delete_time, processed_ids, use_existing) @@ -184,7 +185,7 @@ def _load_source( f"Encountered ModuleNotFoundError attempting to import {e.name}. 
{_etl_dependency_help}" ) click.get_current_context().exit() - SourceClass = eval(n.value) # noqa: N806 + SourceClass = literal_eval(n.value) # noqa: N806 source = SourceClass(database=db, silent=False) try: @@ -297,19 +298,21 @@ def update_normalizer_db( ctx = click.get_current_context() click.echo( "Must either enter 1 or more sources, or use `--update_all` parameter" - ) # noqa: E501 + ) click.echo(ctx.get_help()) ctx.exit() else: sources_split = sources.lower().split() if len(sources_split) == 0: - raise Exception("Must enter 1 or more source names to update") + err_msg = "Must enter 1 or more source names to update" + raise Exception(err_msg) non_sources = set(sources_split) - set(SOURCES) if len(non_sources) != 0: - raise Exception(f"Not valid source(s): {non_sources}") + err_msg = f"Not valid source(s): {non_sources}" + raise Exception(err_msg) parsed_source_names = {SourceName(SOURCES[s]) for s in sources_split} _update_normalizer(parsed_source_names, db, update_merged, use_existing) diff --git a/src/gene/database/database.py b/src/gene/database/database.py index 67bcafd6..90ad5108 100644 --- a/src/gene/database/database.py +++ b/src/gene/database/database.py @@ -63,12 +63,11 @@ def _check_delete_okay() -> bool: """ if environ.get(AWS_ENV_VAR_NAME, "") == AwsEnvName.PRODUCTION: if environ.get(SKIP_AWS_DB_ENV_NAME, "") == "true": - raise DatabaseWriteException( - f"Must unset {SKIP_AWS_DB_ENV_NAME} env variable to enable drop_db()" # noqa: E501 - ) + err_msg = f"Must unset {SKIP_AWS_DB_ENV_NAME} env variable to enable drop_db()" + raise DatabaseWriteException(err_msg) return click.confirm("Are you sure you want to delete existing data?") - else: - return True + + return True @abc.abstractmethod def drop_db(self) -> None: @@ -324,7 +323,7 @@ def create_db( else: if db_url: endpoint_url = db_url - elif "GENE_NORM_DB_URL" in environ.keys(): + elif "GENE_NORM_DB_URL" in environ: endpoint_url = environ["GENE_NORM_DB_URL"] else: endpoint_url = "http://localhost:8000" diff --git a/src/gene/database/dynamodb.py b/src/gene/database/dynamodb.py index 5df9e0d0..218c525e 100644 --- a/src/gene/database/dynamodb.py +++ b/src/gene/database/dynamodb.py @@ -44,20 +44,18 @@ def __init__(self, db_url: Optional[str] = None, **db_args) -> None: if AWS_ENV_VAR_NAME in environ: if "GENE_TEST" in environ: - raise DatabaseInitializationException( - f"Cannot have both GENE_TEST and {AWS_ENV_VAR_NAME} set." - ) # noqa: E501 + err_msg = f"Cannot have both GENE_TEST and {AWS_ENV_VAR_NAME} set." 
+ raise DatabaseInitializationException(err_msg) aws_env = environ[AWS_ENV_VAR_NAME] if aws_env not in VALID_AWS_ENV_NAMES: - raise DatabaseInitializationException( - f"{AWS_ENV_VAR_NAME} must be one of {VALID_AWS_ENV_NAMES}" - ) # noqa: E501 + err_msg = f"{AWS_ENV_VAR_NAME} must be one of {VALID_AWS_ENV_NAMES}" + raise DatabaseInitializationException(err_msg) skip_confirmation = environ.get(SKIP_AWS_DB_ENV_NAME) if (not skip_confirmation) or ( skip_confirmation and skip_confirmation != "true" - ): # noqa: E501 + ): confirm_aws_db_use(environ[AWS_ENV_VAR_NAME]) boto_params = {"region_name": region_name} @@ -156,7 +154,7 @@ def check_schema_initialized(self) -> bool: existing_tables = self.list_tables() exists = self.gene_table in existing_tables if not exists: - logger.info(f"{self.gene_table} table is missing or unavailable.") + logger.info("%s table is missing or unavailable.", self.gene_table) return exists def check_tables_populated(self) -> bool: @@ -210,18 +208,17 @@ def get_source_metadata(self, src_name: Union[str, SourceName]) -> Dict: src_name = src_name.value if src_name in self._cached_sources: return self._cached_sources[src_name] - else: - pk = f"{src_name.lower()}##source" - concept_id = f"source:{src_name.lower()}" - metadata = self.genes.get_item( - Key={"label_and_type": pk, "concept_id": concept_id} - ).get("Item") - if not metadata: - raise DatabaseReadException( - f"Unable to retrieve data for source {src_name}" - ) - self._cached_sources[src_name] = metadata - return metadata + + pk = f"{src_name.lower()}##source" + concept_id = f"source:{src_name.lower()}" + metadata = self.genes.get_item( + Key={"label_and_type": pk, "concept_id": concept_id} + ).get("Item") + if not metadata: + err_msg = f"Unable to retrieve data for source {src_name}" + raise DatabaseReadException(err_msg) + self._cached_sources[src_name] = metadata + return metadata def get_record_by_id( self, concept_id: str, case_sensitive: bool = True, merge: bool = False @@ -246,17 +243,17 @@ def get_record_by_id( Key={"label_and_type": pk, "concept_id": concept_id} ) return match["Item"] - else: - exp = Key("label_and_type").eq(pk) - response = self.genes.query(KeyConditionExpression=exp) - record = response["Items"][0] - del record["label_and_type"] - return record + + exp = Key("label_and_type").eq(pk) + response = self.genes.query(KeyConditionExpression=exp) + record = response["Items"][0] + del record["label_and_type"] + return record except ClientError as e: logger.error( - f"boto3 client error on get_records_by_id for " - f"search term {concept_id}: " - f"{e.response['Error']['Message']}" + "boto3 client error on get_records_by_id for search term %s: %s", + concept_id, + e.response["Error"]["Message"], ) return None except (KeyError, IndexError): # record doesn't exist @@ -277,9 +274,9 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: return [m["concept_id"] for m in matches.get("Items", None)] except ClientError as e: logger.error( - f"boto3 client error on get_refs_by_type for " - f"search term {search_term}: " - f"{e.response['Error']['Message']}" + "boto3 client error on get_refs_by_type for search term %s: %s", + search_term, + e.response["Error"]["Message"], ) return [] @@ -341,7 +338,7 @@ def get_all_records(self, record_type: RecordType) -> Generator[Dict, None, None else: if ( incoming_record_type == RecordType.IDENTITY - and not record.get("merge_ref") # noqa: E501 + and not record.get("merge_ref") ) or incoming_record_type == RecordType.MERGER: yield record 
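A note on the cli.py hunk above: swapping `eval(n.value)` for `ast.literal_eval(n.value)` is not a drop-in fix. `literal_eval` only parses Python literals (strings, numbers, tuples, lists, dicts, sets, booleans, `None`), so handing it a bare class name such as `"HGNC"` raises `ValueError` at runtime. A minimal sketch of the safer pattern — an explicit dispatch table — with stub classes standing in for the real `gene.etl` sources:

```python
from ast import literal_eval

print(literal_eval("{'a': 1}"))  # OK: a dict literal

try:
    literal_eval("HGNC")  # a bare name is not a literal
except ValueError as exc:
    print(f"literal_eval rejects names: {exc}")


# Stubs standing in for gene.etl.HGNC / Ensembl / NCBI (illustrative only).
class HGNC: ...
class Ensembl: ...
class NCBI: ...


# An explicit dispatch table avoids dynamic evaluation entirely.
SOURCE_CLASSES = {cls.__name__: cls for cls in (HGNC, Ensembl, NCBI)}
source_class = SOURCE_CLASSES["HGNC"]
print(source_class)
```

If a dynamic lookup is genuinely needed, `getattr` on the imported module is still preferable to `eval`.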
last_evaluated_key = response.get("LastEvaluatedKey") @@ -364,7 +361,7 @@ def add_source_metadata(self, src_name: SourceName, metadata: SourceMeta) -> Non try: self.genes.put_item(Item=metadata_item) except ClientError as e: - raise DatabaseWriteException(e) + raise DatabaseWriteException(e) from e def add_record(self, record: Dict, src_name: SourceName) -> None: """Add new record to database. @@ -381,8 +378,9 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: self.batch.put_item(Item=record) except ClientError as e: logger.error( - "boto3 client error on add_record for " - f"{concept_id}: {e.response['Error']['Message']}" + "boto3 client error on add_record for %s: %s", + concept_id, + e.response["Error"]["Message"], ) for attr_type, item_type in ITEM_TYPES.items(): if attr_type in record: @@ -413,8 +411,9 @@ def add_merged_record(self, record: Dict) -> None: self.batch.put_item(Item=record) except ClientError as e: logger.error( - "boto3 client error on add_record for " - f"{concept_id}: {e.response['Error']['Message']}" + "boto3 client error on add_record for " "%s: %s", + concept_id, + e.response["Error"]["Message"], ) def _add_ref_record( @@ -439,9 +438,11 @@ def _add_ref_record( self.batch.put_item(Item=record) except ClientError as e: logger.error( - f"boto3 client error adding reference {term} for " - f"{concept_id} with match type {ref_type}: " - f"{e.response['Error']['Message']}" + "boto3 client error adding reference %s for %s with match type %s: %s", + term, + concept_id, + ref_type, + e.response["Error"]["Message"], ) def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: ANN401 @@ -466,14 +467,15 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN except ClientError as e: code = e.response.get("Error", {}).get("Code") if code == "ConditionalCheckFailedException": - raise DatabaseWriteException( + err_msg = ( f"No such record exists for keys {label_and_type}, {concept_id}" ) - else: - logger.error( - f"boto3 client error in `database.update_record()`: " - f"{e.response['Error']['Message']}" - ) + raise DatabaseWriteException(err_msg) from e + + logger.error( + "boto3 client error in `database.update_record()`: %s", + e.response["Error"]["Message"], + ) def delete_normalized_concepts(self) -> None: """Remove merged records from the database. 
Use when performing a new update @@ -495,7 +497,7 @@ def delete_normalized_concepts(self) -> None: ), ) except ClientError as e: - raise DatabaseReadException(e) + raise DatabaseReadException(e) from e records = response["Items"] if not records: break @@ -522,7 +524,7 @@ def delete_source(self, src_name: SourceName) -> None: KeyConditionExpression=Key("src_name").eq(src_name.value), ) except ClientError as e: - raise DatabaseReadException(e) + raise DatabaseReadException(e) from e records = response["Items"] if not records: break @@ -538,7 +540,7 @@ def delete_source(self, src_name: SourceName) -> None: } ) except ClientError as e: - raise DatabaseWriteException(e) + raise DatabaseWriteException(e) from e def complete_write_transaction(self) -> None: """Conclude transaction or batch writing if relevant.""" diff --git a/src/gene/database/postgresql.py b/src/gene/database/postgresql.py index f62a1819..2f6fcd73 100644 --- a/src/gene/database/postgresql.py +++ b/src/gene/database/postgresql.py @@ -7,7 +7,7 @@ import tempfile from datetime import datetime from pathlib import Path -from typing import Any, Dict, Generator, List, Optional, Set, Tuple +from typing import Any, ClassVar, Dict, Generator, List, Optional, Set, Tuple import psycopg import requests @@ -283,7 +283,8 @@ def get_source_metadata(self, src_name: SourceName) -> Dict: cur.execute(metadata_query, [src_name]) metadata_result = cur.fetchone() if not metadata_result: - raise DatabaseReadException(f"{src_name} metadata lookup failed") + err_msg = f"{src_name} metadata lookup failed" + raise DatabaseReadException(err_msg) metadata = { "data_license": metadata_result[1], "data_license_url": metadata_result[2], @@ -301,7 +302,7 @@ def get_source_metadata(self, src_name: SourceName) -> Dict: return metadata _get_record_query = ( - b"SELECT * FROM record_lookup_view WHERE lower(concept_id) = %s;" # noqa: E501 + b"SELECT * FROM record_lookup_view WHERE lower(concept_id) = %s;" ) def _format_source_record(self, source_row: Tuple) -> Dict: @@ -375,7 +376,7 @@ def _format_merged_record(self, merged_row: Tuple) -> Dict: return {k: v for k, v in merged_record.items() if v} _get_merged_record_query = ( - b"SELECT * FROM gene_merged WHERE lower(concept_id) = %s;" # noqa: E501 + b"SELECT * FROM gene_merged WHERE lower(concept_id) = %s;" ) def _get_merged_record( @@ -408,15 +409,15 @@ def get_record_by_id( """ if merge: return self._get_merged_record(concept_id, case_sensitive) - else: - return self._get_record(concept_id, case_sensitive) - _ref_types_query = { - RefType.SYMBOL: b"SELECT concept_id FROM gene_symbols WHERE lower(symbol) = %s;", # noqa: E501 - RefType.PREVIOUS_SYMBOLS: b"SELECT concept_id FROM gene_previous_symbols WHERE lower(prev_symbol) = %s;", # noqa: E501 - RefType.ALIASES: b"SELECT concept_id FROM gene_aliases WHERE lower(alias) = %s;", # noqa: E501 + return self._get_record(concept_id, case_sensitive) + + _ref_types_query: ClassVar[dict[str, bytes]] = { + RefType.SYMBOL: b"SELECT concept_id FROM gene_symbols WHERE lower(symbol) = %s;", + RefType.PREVIOUS_SYMBOLS: b"SELECT concept_id FROM gene_previous_symbols WHERE lower(prev_symbol) = %s;", + RefType.ALIASES: b"SELECT concept_id FROM gene_aliases WHERE lower(alias) = %s;", RefType.XREFS: b"SELECT concept_id FROM gene_xrefs WHERE lower(xref) = %s;", - RefType.ASSOCIATED_WITH: b"SELECT concept_id FROM gene_associations WHERE lower(associated_with) = %s;", # noqa: E501 + RefType.ASSOCIATED_WITH: b"SELECT concept_id FROM gene_associations WHERE lower(associated_with) = %s;", } 
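The `_ref_types_query` table directly above gains a `ClassVar` annotation, which is what ruff's RUF012 rule asks for on mutable class-level defaults. Two small notes: the keys are `RefType` members, so `Dict[RefType, bytes]` would be more precise than `dict[str, bytes]`, and the lowercase `dict[...]` generic is only subscriptable at runtime on Python 3.9+, whereas this module otherwise imports `Dict` from `typing`. A self-contained sketch of the idiom (the enum and query below are illustrative, not copied from the repo):

```python
from enum import Enum
from typing import ClassVar, Dict


class RefType(str, Enum):
    SYMBOL = "symbol"


class Lookup:
    # ClassVar marks this mutable dict as shared class state, so type
    # checkers (and ruff's RUF012 rule) won't treat it as an implicit
    # per-instance field.
    queries: ClassVar[Dict[RefType, bytes]] = {
        RefType.SYMBOL: b"SELECT concept_id FROM gene_symbols WHERE lower(symbol) = %s;",
    }


print(Lookup.queries[RefType.SYMBOL])
```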
def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: @@ -429,15 +430,16 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: """ query = self._ref_types_query.get(ref_type) if not query: - raise ValueError("invalid reference type") + err_msg = "invalid reference type" + raise ValueError(err_msg) with self.conn.cursor() as cur: cur.execute(query, (search_term.lower(),)) concept_ids = cur.fetchall() if concept_ids: return [i[0] for i in concept_ids] - else: - return [] + + return [] _ids_query = b"SELECT concept_id FROM gene_concepts;" @@ -453,7 +455,7 @@ def get_all_concept_ids(self) -> Set[str]: _get_all_normalized_records_query = b"SELECT * FROM gene_merged;" _get_all_unmerged_source_records_query = ( - b"SELECT * FROM record_lookup_view WHERE merge_ref IS NULL;" # noqa: E501 + b"SELECT * FROM record_lookup_view WHERE merge_ref IS NULL;" ) _get_all_source_records_query = b"SELECT * FROM record_lookup_view;" @@ -594,7 +596,7 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: cur.execute(self._ins_symbol_query, [record["symbol"], concept_id]) self.conn.commit() except UniqueViolation: - logger.error(f"Record with ID {concept_id} already exists") + logger.error("Record with ID %s already exists", concept_id) self.conn.rollback() _add_merged_record_query = b""" @@ -668,9 +670,8 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN # UPDATE will fail silently unless we check the # of affected rows if row_count < 1: - raise DatabaseWriteException( - f"No such record exists for primary key {concept_id}" - ) + err_msg = f"No such record exists for primary key {concept_id}" + raise DatabaseWriteException(err_msg) def delete_normalized_concepts(self) -> None: """Remove merged records from the database. 
Use when performing a new update @@ -784,26 +785,25 @@ def load_from_remote(self, url: Optional[str]) -> None: command fails """ if not url: - url = "https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_latest.sql.tar.gz" # noqa: E501 + url = "https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_latest.sql.tar.gz" with tempfile.TemporaryDirectory() as tempdir: tempdir_path = Path(tempdir) temp_tarfile = tempdir_path / "gene_norm_latest.tar.gz" - with requests.get(url, stream=True) as r: + with requests.get(url, stream=True, timeout=10) as r: try: r.raise_for_status() - except requests.HTTPError: - raise DatabaseException( - f"Unable to retrieve PostgreSQL dump file from {url}" - ) - with open(temp_tarfile, "wb") as h: + except requests.HTTPError as e: + err_msg = f"Unable to retrieve PostgreSQL dump file from {url}" + raise DatabaseException(err_msg) from e + with temp_tarfile.open("wb") as h: for chunk in r.iter_content(chunk_size=8192): if chunk: h.write(chunk) tar = tarfile.open(temp_tarfile, "r:gz") - tar_dump_file = [ + tar_dump_file = next( f for f in tar.getmembers() if f.name.startswith("gene_norm_") - ][0] - tar.extractall(path=tempdir_path, members=[tar_dump_file]) + ) + tar.extractall(path=tempdir_path, members=[tar_dump_file]) # noqa: S202 dump_file = tempdir_path / tar_dump_file.name if self.conn.info.password: @@ -812,12 +812,11 @@ def load_from_remote(self, url: Optional[str]) -> None: pw_param = "-w" self.drop_db() - system_call = f"psql -d {self.conn.info.dbname} -U {self.conn.info.user} {pw_param} -f {dump_file.absolute()}" # noqa: E501 - result = os.system(system_call) + system_call = f"psql -d {self.conn.info.dbname} -U {self.conn.info.user} {pw_param} -f {dump_file.absolute()}" + result = os.system(system_call) # noqa: S605 if result != 0: - raise DatabaseException( - f"System call '{result}' returned failing exit code." - ) + err_msg = f"System call '{result}' returned failing exit code." + raise DatabaseException(err_msg) def export_db(self, output_directory: Path) -> None: """Dump DB to specified location. @@ -829,23 +828,20 @@ def export_db(self, output_directory: Path) -> None: :raise DatabaseException: if psql call fails """ if not output_directory.is_dir() or not output_directory.exists(): - raise ValueError( + err_msg = ( f"Output location {output_directory} isn't a directory or doesn't exist" - ) # noqa: E501 - now = datetime.now().strftime("%Y%m%d%H%M%S") + ) + raise ValueError(err_msg) + now = datetime.now(tz=datetime.timezone.utc).strftime("%Y%m%d%H%M%S") output_location = output_directory / f"gene_norm_{now}.sql" user = self.conn.info.user host = self.conn.info.host port = self.conn.info.port database_name = self.conn.info.dbname - if self.conn.info.password: - pw_param = f"-W {self.conn.info.password}" - else: - pw_param = "-w" + pw_param = f"-W {self.conn.info.password}" if self.conn.info.password else "-w" - system_call = f"pg_dump -E UTF8 -f {output_location} -U {user} {pw_param} -h {host} -p {port} {database_name}" # noqa: E501 - result = os.system(system_call) + system_call = f"pg_dump -E UTF8 -f {output_location} -U {user} {pw_param} -h {host} -p {port} {database_name}" + result = os.system(system_call) # noqa: S605 if result != 0: - raise DatabaseException( - f"System call '{system_call}' returned failing exit code." - ) + err_msg = f"System call '{system_call}' returned failing exit code." 
+ raise DatabaseException(err_msg) diff --git a/src/gene/etl/base.py b/src/gene/etl/base.py index 77e9eee1..edd730ea 100644 --- a/src/gene/etl/base.py +++ b/src/gene/etl/base.py @@ -48,7 +48,7 @@ def __init__( self._data_source = self._get_data_handler(data_path) self._database = database self.seqrepo = self.get_seqrepo(seqrepo_dir) - self._processed_ids = list() + self._processed_ids = [] def _get_data_handler( self, data_path: Optional[Path] = None @@ -108,9 +108,9 @@ def _load_gene(self, gene: Dict) -> None: :param gene: Gene record """ try: - assert Gene(match_type=MatchType.NO_MATCH, **gene) + Gene(match_type=MatchType.NO_MATCH, **gene) except pydantic.ValidationError as e: - logger.warning(f"Unable to load {gene} due to validation error: " f"{e}") + logger.warning("Unable to load %s due to validation error: %s", gene, e) else: concept_id = gene["concept_id"] gene["label_and_type"] = f"{concept_id.lower()}##identity" @@ -136,7 +136,8 @@ def get_seqrepo(self, seqrepo_dir: Path) -> SeqRepo: :return: SeqRepo instance """ if not Path(seqrepo_dir).exists(): - raise NotADirectoryError(f"Could not find {seqrepo_dir}") + err_msg = f"Could not find {seqrepo_dir}" + raise NotADirectoryError(err_msg) return SeqRepo(seqrepo_dir) def _set_cl_interval_range(self, loc: str, arm_ix: int, location: Dict) -> None: @@ -146,10 +147,10 @@ def _set_cl_interval_range(self, loc: str, arm_ix: int, location: Dict) -> None: :param arm_ix: The index of the q or p arm for a given location :param location: VRS chromosome location. This will be mutated. """ - range_ix = re.search("-", loc).start() # type: ignore + range_ix = re.search("-", loc).start() start = loc[arm_ix:range_ix] - start_arm_ix = re.search("[pq]", start).start() # type: ignore + start_arm_ix = re.search("[pq]", start).start() start_arm = start[start_arm_ix] end = loc[range_ix + 1 :] @@ -160,7 +161,7 @@ def _set_cl_interval_range(self, loc: str, arm_ix: int, location: Dict) -> None: end = f"{start[0]}{end}" end_arm_match = re.search("[pq]", end) - end_arm_ix = end_arm_match.start() # type: ignore + end_arm_ix = end_arm_match.start() end_arm = end[end_arm_ix] if (start_arm == end_arm and start > end) or ( @@ -211,7 +212,7 @@ def _get_seq_id_aliases(self, seq_id: str) -> List[str]: try: aliases = self.seqrepo.translate_alias(seq_id, target_namespaces="ga4gh") except KeyError as e: - logger.warning(f"SeqRepo raised KeyError: {e}") + logger.warning("SeqRepo raised KeyError: %s", e) return aliases def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Dict: @@ -231,15 +232,17 @@ def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Di sequence = aliases[0] if gene.start != "." and gene.end != "." 
and sequence: - if 0 <= gene.start <= gene.end: # type: ignore + if 0 <= gene.start <= gene.end: location = GeneSequenceLocation( - start=gene.start - 1, # type: ignore - end=gene.end, # type: ignore + start=gene.start - 1, + end=gene.end, sequence_id=sequence, - ).model_dump() # type: ignore + ).model_dump() else: logger.warning( - f"{params['concept_id']} has invalid interval:" - f"start={gene.start - 1} end={gene.end}" - ) # type: ignore + "%s has invalid interval: start=%i end=%i", + params["concept_id"], + gene.start - 1, + gene.end, + ) return location diff --git a/src/gene/etl/ensembl.py b/src/gene/etl/ensembl.py index 4a52975a..38fa4454 100644 --- a/src/gene/etl/ensembl.py +++ b/src/gene/etl/ensembl.py @@ -46,7 +46,7 @@ def _transform_data(self) -> None: ) # Get accession numbers - accession_numbers = dict() + accession_numbers = {} for item in db.features_of_type("scaffold"): accession_numbers[item[0]] = item[8]["Alias"][-1] for item in db.features_of_type("chromosome"): @@ -68,7 +68,7 @@ def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict: :param accession_numbers: Accession numbers for each chromosome and scaffold :return: A gene dictionary containing data if the ID attribute exists. """ - gene = dict() + gene = {} if f.strand == "-": gene["strand"] = Strand.REVERSE.value elif f.strand == "+": @@ -101,17 +101,13 @@ def _add_attributes(self, f: Feature, gene: Dict) -> None: for attribute in f.attributes.items(): key = attribute[0] - if key in attributes.keys(): + if key in attributes: val = attribute[1] if len(val) == 1: val = val[0] - if key == "ID": - if val.startswith("gene"): - val = ( - f"{NamespacePrefix.ENSEMBL.value}:" - f"{val.split(':')[1]}" - ) + if key == "ID" and val.startswith("gene"): + val = f"{NamespacePrefix.ENSEMBL.value}:" f"{val.split(':')[1]}" if key == "description": gene["label"] = val.split("[")[0].strip() @@ -152,7 +148,7 @@ def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict: :param src_id: The source's accession number :return: A dict containing an other identifier or xref """ - source = dict() + source = {} if src_name.startswith("HGNC"): source["xrefs"] = [f"{NamespacePrefix.HGNC.value}:{src_id}"] elif src_name.startswith("NCBI"): @@ -171,9 +167,8 @@ def _add_meta(self) -> None: :raise GeneNormalizerEtlError: if requisite metadata is unset """ if not self._version or not self._assembly: - raise GeneNormalizerEtlError( - "Source metadata unavailable -- was data properly acquired before attempting to load DB?" - ) + err_msg = "Source metadata unavailable -- was data properly acquired before attempting to load DB?" 
+ raise GeneNormalizerEtlError(err_msg) metadata = SourceMeta( data_license="custom", data_license_url="https://useast.ensembl.org/info/about" diff --git a/src/gene/etl/hgnc.py b/src/gene/etl/hgnc.py index 2fee6117..24aedd1e 100644 --- a/src/gene/etl/hgnc.py +++ b/src/gene/etl/hgnc.py @@ -28,13 +28,13 @@ class HGNC(Base): def _transform_data(self) -> None: """Transform the HGNC source.""" logger.info("Transforming HGNC...") - with open(self._data_file, "r") as f: # type: ignore + with self._data_file.open() as f: data = json.load(f) records = data["response"]["docs"] for r in records: - gene = dict() + gene = {} gene["concept_id"] = r["hgnc_id"].lower() gene["label_and_type"] = f"{gene['concept_id']}##identity" gene["item_type"] = "identity" @@ -66,8 +66,8 @@ def _get_aliases(self, r: Dict, gene: Dict) -> None: :param r: A gene record in the HGNC data file :param gene: A transformed gene record """ - alias_symbol = list() - enzyme_id = list() + alias_symbol = [] + enzyme_id = [] if "alias_symbol" in r: alias_symbol = r["alias_symbol"] @@ -93,8 +93,8 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None: :param r: A gene record in the HGNC data file :param gene: A transformed gene record """ - xrefs = list() - associated_with = list() + xrefs = [] + associated_with = [] sources = [ "entrez_id", "ensembl_gene_id", @@ -134,12 +134,12 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None: key = src if key.upper() in NamespacePrefix.__members__: - if NamespacePrefix[key.upper()].value in PREFIX_LOOKUP.keys(): + if NamespacePrefix[key.upper()].value in PREFIX_LOOKUP: self._get_xref_associated_with(key, src, r, xrefs) else: self._get_xref_associated_with(key, src, r, associated_with) else: - logger.warning(f"{key} not in schemas.py") + logger.warning("%s not in schemas.py", key) if xrefs: gene["xrefs"] = xrefs @@ -177,8 +177,8 @@ def _get_location(self, r: Dict, gene: Dict) -> None: else: locations = [r["location"]] - location_list = list() - gene["location_annotations"] = list() + location_list = [] + gene["location_annotations"] = [] for loc in locations: loc = loc.strip() loc = self._set_annotation(loc, gene) @@ -187,7 +187,7 @@ def _get_location(self, r: Dict, gene: Dict) -> None: if loc == "mitochondria": gene["location_annotations"].append(Chromosome.MITOCHONDRIA.value) else: - location = dict() + location = {} self._set_location(loc, location, gene) # chr_location = self._get_chromosome_location(location, gene) # if chr_location: @@ -249,9 +249,8 @@ def _add_meta(self) -> None: :raise GeneNormalizerEtlError: if requisite metadata is unset """ if not self._version: - raise GeneNormalizerEtlError( - "Source metadata unavailable -- was data properly acquired before attempting to load DB?" - ) + err_msg = "Source metadata unavailable -- was data properly acquired before attempting to load DB?" 
+ raise GeneNormalizerEtlError(err_msg) metadata = SourceMeta( data_license="CC0", data_license_url="https://www.genenames.org/about/license/", diff --git a/src/gene/etl/merge.py b/src/gene/etl/merge.py index 8124d294..a04b0706 100644 --- a/src/gene/etl/merge.py +++ b/src/gene/etl/merge.py @@ -36,7 +36,7 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None: for concept_id in new_group: self._groups[concept_id] = new_group end = timer() - logger.debug(f"Built record ID sets in {end - start} seconds") + logger.debug("Built record ID sets in %f seconds", end - start) self._groups = {k: v for k, v in self._groups.items() if len(v) > 1} @@ -59,8 +59,9 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None: except DatabaseWriteException as dw: if str(dw).startswith("No such record exists"): logger.error( - f"Updating nonexistent record: {concept_id} " - f"for merge ref to {merge_ref}" + "Updating nonexistent record: %s for merge ref to %s", + concept_id, + merge_ref, ) else: logger.error(str(dw)) @@ -68,7 +69,7 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None: self._database.complete_write_transaction() logger.info("Merged concept generation successful.") end = timer() - logger.debug(f"Generated and added concepts in {end - start} seconds") + logger.debug("Generated and added concepts in %f seconds", end - start) def _create_record_id_set( self, record_id: str, observed_id_set: Optional[Set] = None @@ -85,27 +86,25 @@ def _create_record_id_set( if record_id in self._groups: return self._groups[record_id] - else: - db_record = self._database.get_record_by_id(record_id) - if not db_record: - logger.warning( - f"Record ID set creator could not resolve " - f"lookup for {record_id} in ID set: " - f"{observed_id_set}" - ) - return observed_id_set - {record_id} - record_xrefs = db_record.get("xrefs") - if not record_xrefs: - return observed_id_set | {db_record["concept_id"]} - else: - local_id_set = set(record_xrefs) - merged_id_set = {record_id} | observed_id_set - for local_record_id in local_id_set - observed_id_set: - merged_id_set |= self._create_record_id_set( - local_record_id, merged_id_set - ) - return merged_id_set + db_record = self._database.get_record_by_id(record_id) + if not db_record: + logger.warning( + "Record ID set creator could not resolve lookup for %s in ID set: %s", + record_id, + observed_id_set, + ) + return observed_id_set - {record_id} + + record_xrefs = db_record.get("xrefs") + if not record_xrefs: + return observed_id_set | {db_record["concept_id"]} + + local_id_set = set(record_xrefs) + merged_id_set = {record_id} | observed_id_set + for local_record_id in local_id_set - observed_id_set: + merged_id_set |= self._create_record_id_set(local_record_id, merged_id_set) + return merged_id_set def _generate_merged_record(self, record_id_set: Set[str]) -> Dict: """Generate merged record from provided concept ID group. 
@@ -125,8 +124,9 @@ def _generate_merged_record(self, record_id_set: Set[str]) -> Dict: records.append(record) else: logger.error( - f"Merge record generator could not retrieve " - f"record for {record_id} in {record_id_set}" + "Merge record generator could not retrieve record for %s in %s", + record_id, + record_id_set, ) def record_order(record: Dict) -> Tuple: @@ -135,9 +135,10 @@ def record_order(record: Dict) -> Tuple: if src in SourcePriority.__members__: source_rank = SourcePriority[src].value else: - raise Exception( + err_msg = ( f"Prohibited source: {src} in concept_id " f"{record['concept_id']}" ) + raise Exception(err_msg) return source_rank, record["concept_id"] records.sort(key=record_order) @@ -176,7 +177,8 @@ def record_order(record: Dict) -> Tuple: merged_field = GeneTypeFieldName[record["src_name"].upper()] merged_attrs[merged_field] |= {gene_type} - for field in set_fields + [ + for field in [ + *set_fields, "hgnc_locus_type", "ncbi_gene_type", "ensembl_biotype", @@ -193,7 +195,7 @@ def record_order(record: Dict) -> Tuple: if num_unique_strand_values > 1: del merged_attrs["strand"] elif num_unique_strand_values == 1: - merged_attrs["strand"] = list(unique_strand_values)[0] + merged_attrs["strand"] = next(unique_strand_values) merged_attrs["item_type"] = RecordType.MERGER.value return merged_attrs diff --git a/src/gene/etl/ncbi.py b/src/gene/etl/ncbi.py index a3b2e706..ace61a65 100644 --- a/src/gene/etl/ncbi.py +++ b/src/gene/etl/ncbi.py @@ -59,7 +59,7 @@ def _extract_data(self, use_existing: bool) -> None: gene_paths: NcbiGenePaths gene_paths, self._version = self._data_source.get_latest( from_local=use_existing - ) # type: ignore + ) self._info_src = gene_paths.gene_info self._history_src = gene_paths.gene_history self._gene_url = ( @@ -74,7 +74,7 @@ def _get_prev_symbols(self) -> Dict[str, str]: :return: A dictionary of a gene's previous symbols """ # get symbol history - history_file = open(self._history_src, "r") + history_file = self._history_src.open() history = csv.reader(history_file, delimiter="\t") next(history) prev_symbols = {} @@ -83,7 +83,7 @@ def _get_prev_symbols(self) -> Dict[str, str]: if row[0] == "9606": if row[1] != "-": gene_id = row[1] - if gene_id in prev_symbols.keys(): + if gene_id in prev_symbols: prev_symbols[gene_id].append(row[3]) else: prev_symbols[gene_id] = [row[3]] @@ -130,7 +130,7 @@ def _add_xrefs_associated_with(self, val: List[str], params: Dict) -> None: if prefix: params["associated_with"].append(f"{prefix}:{src_id}") else: - logger.info(f"{src_name} is not in NameSpacePrefix.") + logger.info("%s is not in NameSpacePrefix.", src_name) if not params["xrefs"]: del params["xrefs"] if not params["associated_with"]: @@ -143,13 +143,13 @@ def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, str]: :return: A dictionary of gene's from the NCBI info file. 
""" # open info file, skip headers - info_file = open(self._info_src, "r") - info = csv.reader(info_file, delimiter="\t") - next(info) + with self._info_src.open() as info_file: + info = csv.reader(info_file, delimiter="\t") + next(info) - info_genes = dict() + info_genes = {} for row in info: - params = dict() + params = {} params["concept_id"] = f"{NamespacePrefix.NCBI.value}:{row[1]}" # get symbol params["symbol"] = row[2] @@ -174,7 +174,7 @@ def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, str]: if row[8] != "-": params["label"] = row[8] # add prev symbols - if row[1] in prev_symbols.keys(): + if row[1] in prev_symbols: params["previous_symbols"] = prev_symbols[row[1]] info_genes[params["symbol"]] = params # get type @@ -197,7 +197,7 @@ def _get_gene_gff(self, db: gffutils.FeatureDB, info_genes: Dict) -> None: params = info_genes.get(symbol) vrs_sq_location = self._get_vrs_sq_location(db, params, f_id) if vrs_sq_location: - params["locations"].append(vrs_sq_location) # type: ignore + params["locations"].append(vrs_sq_location) else: # Need to add entire gene gene = self._add_gff_gene(db, f, f_id) @@ -213,14 +213,14 @@ def _add_gff_gene( :param f_id: The feature's ID :return: A gene dictionary if the ID attribute exists. Else return None. """ - params = dict() + params = {} params["src_name"] = SourceName.NCBI.value self._add_attributes(f, params) sq_loc = self._get_vrs_sq_location(db, params, f_id) if sq_loc: params["locations"] = [sq_loc] else: - params["locations"] = list() + params["locations"] = [] params["label_and_type"] = f"{params['concept_id'].lower()}##identity" return params @@ -267,7 +267,7 @@ def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict: :param src_id: The source's accession number :return: A dict containing an xref or associated_with ref """ - source = dict() + source = {} if src_name.startswith("HGNC"): source["xrefs"] = [f"{NamespacePrefix.HGNC.value}:{src_id}"] elif src_name.startswith("NCBI"): @@ -288,14 +288,14 @@ def _get_vrs_chr_location(self, row: List[str], params: Dict) -> List: :param params: A transformed gene record :return: A list of GA4GH VRS ChromosomeLocations """ - params["location_annotations"] = list() + params["location_annotations"] = [] chromosomes_locations = self._set_chromsomes_locations(row, params) locations = chromosomes_locations["locations"] chromosomes = chromosomes_locations["chromosomes"] if chromosomes_locations["exclude"]: return ["exclude"] - location_list = list() + location_list = [] if chromosomes and not locations: for chromosome in chromosomes: if chromosome == "MT": @@ -317,18 +317,18 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict: """ chromosomes = None if row[6] != "-": - if "|" in row[6]: - chromosomes = row[6].split("|") - else: - chromosomes = [row[6]] + chromosomes = row[6].split("|") if "|" in row[6] else [row[6]] - if len(chromosomes) >= 2: - if chromosomes and "X" not in chromosomes and "Y" not in chromosomes: - logger.info( - f"{row[2]} contains multiple distinct " - f"chromosomes: {chromosomes}." 
- ) - chromosomes = None + if ( + len(chromosomes) >= 2 + and chromosomes + and "X" not in chromosomes + and "Y" not in chromosomes + ): + logger.info( + "%s contains multiple distinct chromosomes: %s", row[2], chromosomes + ) + chromosomes = None locations = None exclude = False @@ -343,15 +343,14 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict: locations = [row[7]] # Sometimes locations will store the same location twice - if len(locations) == 2: - if locations[0] == locations[1]: - locations = [locations[0]] + if len(locations) == 2 and locations[0] == locations[1]: + locations = [locations[0]] # Exclude genes where there are multiple distinct locations # i.e. OMS: '10q26.3', '19q13.42-q13.43', '3p25.3' if len(locations) > 2: logger.info( - f"{row[2]} contains multiple distinct " f"locations: {locations}." + "%s contains multiple distinct locations: %s", row[2], locations ) locations = None exclude = True @@ -361,9 +360,7 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict: for i in range(len(locations)): loc = locations[i].strip() if not re.match("^([1-9][0-9]?|X[pq]?|Y[pq]?)", loc): - logger.info( - f"{row[2]} contains invalid map location:" f"{loc}." - ) + logger.info("%s contains invalid map location: %s", row[2], loc) params["location_annotations"].append(loc) del locations[i] return {"locations": locations, "chromosomes": chromosomes, "exclude": exclude} @@ -379,7 +376,7 @@ def _add_chromosome_location( """ for i in range(len(locations)): loc = locations[i].strip() - location = dict() + location = {} if Annotation.ALT_LOC.value in loc: loc = loc.split(f"{Annotation.ALT_LOC.value}")[0].strip() @@ -429,16 +426,16 @@ def _set_centromere_location(self, loc: str, location: Dict) -> None: :param loc: A gene location :param location: GA4GH location """ - centromere_ix = re.search("cen", loc).start() # type: ignore + centromere_ix = re.search("cen", loc).start() if "-" in loc: # Location gives both start and end - range_ix = re.search("-", loc).start() # type: ignore + range_ix = re.search("-", loc).start() if "q" in loc: location["chr"] = loc[:centromere_ix].strip() location["start"] = "cen" location["end"] = loc[range_ix + 1 :] elif "p" in loc: - p_ix = re.search("p", loc).start() # type: ignore + p_ix = re.search("p", loc).start() location["chr"] = loc[:p_ix].strip() location["end"] = "cen" location["start"] = loc[:range_ix] @@ -464,7 +461,7 @@ def _transform_data(self) -> None: self._get_gene_gff(db, info_genes) - for gene in info_genes.keys(): + for gene in info_genes: self._load_gene(info_genes[gene]) logger.info("Successfully transformed NCBI.") @@ -482,9 +479,8 @@ def _add_meta(self) -> None: self._assembly, ] ): - raise GeneNormalizerEtlError( - "Source metadata unavailable -- was data properly acquired before attempting to load DB?" - ) + err_msg = "Source metadata unavailable -- was data properly acquired before attempting to load DB?" + raise GeneNormalizerEtlError(err_msg) metadata = SourceMeta( data_license="custom", data_license_url="https://www.ncbi.nlm.nih.gov/home/about/policies/", diff --git a/src/gene/main.py b/src/gene/main.py index 6d286d8e..3bc90f24 100644 --- a/src/gene/main.py +++ b/src/gene/main.py @@ -27,7 +27,7 @@ contact={ "name": "Alex H. 
Wagner", "email": "Alex.Wagner@nationwidechildrens.org", - "url": "https://www.nationwidechildrens.org/specialties/institute-for-genomic-medicine/research-labs/wagner-lab", # noqa: E501 + "url": "https://www.nationwidechildrens.org/specialties/institute-for-genomic-medicine/research-labs/wagner-lab", }, license={ "name": "MIT", @@ -65,7 +65,7 @@ tags=["Query"], ) def search( - q: str = Query(..., description=q_descr), # noqa: D103 + q: str = Query(..., description=q_descr), incl: Optional[str] = Query(None, description=incl_descr), excl: Optional[str] = Query(None, description=excl_descr), ) -> SearchService: @@ -83,7 +83,7 @@ def search( try: resp = query_handler.search(html.unescape(q), incl=incl, excl=excl) except InvalidParameterException as e: - raise HTTPException(status_code=422, detail=str(e)) + raise HTTPException(status_code=422, detail=str(e)) from e return resp @@ -108,8 +108,7 @@ def normalize(q: str = Query(..., description=normalize_q_descr)) -> NormalizeSe :param str q: gene search term :return: JSON response with normalized gene concept """ - resp = query_handler.normalize(html.unescape(q)) - return resp + return query_handler.normalize(html.unescape(q)) unmerged_matches_summary = ( @@ -142,5 +141,4 @@ def normalize_unmerged( :param q: Gene search term :returns: JSON response with matching normalized record and source metadata """ - response = query_handler.normalize_unmerged(html.unescape(q)) - return response + return query_handler.normalize_unmerged(html.unescape(q)) diff --git a/src/gene/query.py b/src/gene/query.py index e30a79d8..692f7950 100644 --- a/src/gene/query.py +++ b/src/gene/query.py @@ -127,7 +127,7 @@ def _transform_locations(self, record: Dict) -> Dict: :param record: original record :return: record with transformed locations attributes, if applicable """ - record_locations = list() + record_locations = [] if "locations" in record: for loc in record["locations"]: if loc["type"] == "SequenceLocation": @@ -144,12 +144,15 @@ def _get_src_name(self, concept_id: str) -> SourceName: """ if concept_id.startswith(NamespacePrefix.ENSEMBL.value): return SourceName.ENSEMBL - elif concept_id.startswith(NamespacePrefix.NCBI.value): + + if concept_id.startswith(NamespacePrefix.NCBI.value): return SourceName.NCBI - elif concept_id.startswith(NamespacePrefix.HGNC.value): + + if concept_id.startswith(NamespacePrefix.HGNC.value): return SourceName.HGNC - else: - raise ValueError("Invalid or unrecognized concept ID provided") + + err_msg = "Invalid or unrecognized concept ID provided" + raise ValueError(err_msg) def _add_record( self, response: Dict[str, Dict], item: Dict, match_type: MatchType @@ -166,7 +169,7 @@ def _add_record( src_name = item["src_name"] matches = response["source_matches"] - if src_name not in matches.keys(): + if src_name not in matches: pass elif matches[src_name] is None: matches[src_name] = { @@ -197,7 +200,7 @@ def _fetch_record( else: logger.error( f"Unable to find expected record for {concept_id} matching as {match_type}" - ) # noqa: E501 + ) def _post_process_resp(self, resp: Dict) -> Dict: """Fill all empty source_matches slots with NO_MATCH results and @@ -207,7 +210,7 @@ def _post_process_resp(self, resp: Dict) -> Dict: :return: response object with empty source slots filled with NO_MATCH results and corresponding source metadata """ - for src_name in resp["source_matches"].keys(): + for src_name in resp["source_matches"]: if resp["source_matches"][src_name] is None: resp["source_matches"][src_name] = { "match_type": MatchType.NO_MATCH, @@ 
-237,18 +240,18 @@ def _get_search_response(self, query: str, sources: Set[str]) -> Dict: return self._post_process_resp(resp) query_l = query.lower() - queries = list() - if [p for p in PREFIX_LOOKUP.keys() if query_l.startswith(p)]: + queries = [] + if [p for p in PREFIX_LOOKUP if query_l.startswith(p)]: queries.append((query_l, RecordType.IDENTITY.value)) - for prefix in [p for p in NAMESPACE_LOOKUP.keys() if query_l.startswith(p)]: + for prefix in [p for p in NAMESPACE_LOOKUP if query_l.startswith(p)]: term = f"{NAMESPACE_LOOKUP[prefix].lower()}:{query_l}" queries.append((term, RecordType.IDENTITY.value)) for match in ITEM_TYPES.values(): queries.append((query_l, match)) - matched_concept_ids = list() + matched_concept_ids = [] for term, item_type in queries: try: if item_type == RecordType.IDENTITY.value: @@ -278,7 +281,10 @@ def _get_service_meta() -> ServiceMeta: :return: Service Meta """ - return ServiceMeta(version=__version__, response_datetime=str(datetime.now())) + return ServiceMeta( + version=__version__, + response_datetime=str(datetime.now(tz=datetime.timezone.utc)), + ) def search( self, @@ -308,7 +314,7 @@ def search( possible_sources = { name.value.lower(): name.value for name in SourceName.__members__.values() } - sources = dict() + sources = {} for k, v in possible_sources.items(): if self.db.get_source_metadata(v): sources[k] = v @@ -323,7 +329,7 @@ def search( invalid_sources = [] query_sources = set() for source in req_sources: - if source.lower() in sources.keys(): + if source.lower() in sources: query_sources.add(sources[source.lower()]) else: invalid_sources.append(source) @@ -336,10 +342,10 @@ def search( invalid_sources = [] query_sources = set() for req_l, req in req_excl_dict.items(): - if req_l not in sources.keys(): + if req_l not in sources: invalid_sources.append(req) for src_l, src in sources.items(): - if src_l not in req_excl_dict.keys(): + if src_l not in req_excl_dict: query_sources.add(src) if invalid_sources: detail = f"Invalid source name(s): {invalid_sources}" @@ -440,7 +446,7 @@ def _add_gene( # aliases aliases = set() for key in ["previous_symbols", "aliases"]: - if key in record and record[key]: + if record.get(key): val = record[key] if isinstance(val, str): val = [val] @@ -458,7 +464,7 @@ def _add_gene( ("strand", "strand"), ] for ext_label, record_label in extension_and_record_labels: - if record_label in record and record[record_label]: + if record.get(record_label): extensions.append( core_models.Extension(name=ext_label, value=record[record_label]) ) @@ -553,7 +559,8 @@ def _prepare_normalized_response(self, query: str) -> Dict[str, Any]: "match_type": MatchType.NO_MATCH, "warnings": self._emit_warnings(query), "service_meta_": ServiceMeta( - version=__version__, response_datetime=str(datetime.now()) + version=__version__, + response_datetime=str(datetime.now(tz=datetime.timezone.utc)), ), } @@ -605,11 +612,11 @@ def _resolve_merge( f"in record {record['concept_id']} from query `{query}`" ) return response - else: - return callback(response, merge, match_type, possible_concepts) - else: - # record is sole member of concept group - return callback(response, record, match_type, possible_concepts) + + return callback(response, merge, match_type, possible_concepts) + + # record is sole member of concept group + return callback(response, record, match_type, possible_concepts) def _perform_normalized_lookup( self, response: NormService, query: str, response_builder: Callable @@ -643,16 +650,14 @@ def _perform_normalized_lookup( 
matching_records = [ self.db.get_record_by_id(ref, False) for ref in matching_refs ] - matching_records.sort(key=self._record_order) # type: ignore + matching_records.sort(key=self._record_order) - if len(matching_refs) > 1: - possible_concepts = [ref for ref in matching_refs] - else: - possible_concepts = None + possible_concepts = list(matching_refs) if len(matching_refs) > 1 else None # attempt merge ref resolution until successful for match in matching_records: - assert match is not None + if match is None: + raise ValueError record = self.db.get_record_by_id(match["concept_id"], False) if record: match_type_value = MatchType[match_type.value.upper()] @@ -688,12 +693,11 @@ def _add_normalized_records( meta = self.db.get_source_metadata(record_source.value) response.source_matches[record_source] = MatchesNormalized( records=[BaseGene(**self._transform_locations(normalized_record))], - source_meta_=meta, # type: ignore + source_meta_=meta, ) else: - concept_ids = [normalized_record["concept_id"]] + normalized_record.get( - "xrefs", [] - ) + xrefs = normalized_record.get("xrefs") or [] + concept_ids = [normalized_record["concept_id"], *xrefs] for concept_id in concept_ids: record = self.db.get_record_by_id(concept_id, case_sensitive=False) if not record: @@ -705,8 +709,7 @@ def _add_normalized_records( else: meta = self.db.get_source_metadata(record_source.value) response.source_matches[record_source] = MatchesNormalized( - records=[gene], - source_meta_=meta, # type: ignore + records=[gene], source_meta_=meta ) if possible_concepts: response = self._add_alt_matches( diff --git a/src/gene/schemas.py b/src/gene/schemas.py index 6f85b1bc..e308a08a 100644 --- a/src/gene/schemas.py +++ b/src/gene/schemas.py @@ -69,7 +69,7 @@ class GeneSequenceLocation(BaseModel): type: Literal["SequenceLocation"] = "SequenceLocation" start: StrictInt end: StrictInt - sequence_id: constr(pattern=r"^ga4gh:SQ.[0-9A-Za-z_\-]{32}$") # noqa: F722 + sequence_id: constr(pattern=r"^ga4gh:SQ.[0-9A-Za-z_\-]{32}$") # class GeneChromosomeLocation(BaseModel): @@ -160,7 +160,7 @@ class SourceIDAfterNamespace(Enum): HGNC = "" ENSEMBL = "ENSG" - NCBI = "" + NCBI = "" # noqa: PIE796 class NamespacePrefix(Enum): @@ -273,7 +273,7 @@ class ServiceMeta(BaseModel): response_datetime: StrictStr url: Literal[ "https://github.com/cancervariants/gene-normalization" - ] = "https://github.com/cancervariants/gene-normalization" # noqa: E501 + ] = "https://github.com/cancervariants/gene-normalization" model_config = ConfigDict( json_schema_extra={ @@ -412,7 +412,7 @@ class NormalizeService(BaseNormalizationService): # { # "name": "chromosome_location", # "value": { - # "id": "ga4gh:CL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw", # noqa: E501 + # "id": "ga4gh:CL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw", # "type": "ChromosomeLocation", # "species_id": "taxonomy:9606", # "chr": "7", @@ -441,7 +441,7 @@ class NormalizeService(BaseNormalizationService): }, "Ensembl": { "data_license": "custom", - "data_license_url": "https://useast.ensembl.org/info/about/legal/disclaimer.html", # noqa: E501 + "data_license_url": "https://useast.ensembl.org/info/about/legal/disclaimer.html", "version": "104", "data_url": { "genome_annotations": "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz" @@ -456,7 +456,7 @@ class NormalizeService(BaseNormalizationService): }, "NCBI": { "data_license": "custom", - "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", # noqa: E501 + "data_license_url": 
"https://www.ncbi.nlm.nih.gov/home/about/policies/", "version": "20210813", "data_url": { "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz", @@ -519,13 +519,13 @@ class UnmergedNormalizationService(BaseNormalizationService): "concept_id": "hgnc:108", "symbol": "ACHE", "symbol_status": "approved", - "label": "acetylcholinesterase (Cartwright blood group)", # noqa: E501 + "label": "acetylcholinesterase (Cartwright blood group)", "strand": None, "location_annotations": [], "locations": [ # { # "type": "ChromosomeLocation", - # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501 + # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # "species_id": "taxonomy:9606", # "chr": "7", # "start": "q22.1", @@ -573,16 +573,16 @@ class UnmergedNormalizationService(BaseNormalizationService): "concept_id": "ensembl:ENSG00000087085", "symbol": "ACHE", "symbol_status": None, - "label": "acetylcholinesterase (Cartwright blood group)", # noqa: E501 + "label": "acetylcholinesterase (Cartwright blood group)", "strand": "-", "location_annotations": [], "locations": [ { - "id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1", # noqa: E501 + "id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, "start": 100889993, "end": 100896974, @@ -597,7 +597,7 @@ class UnmergedNormalizationService(BaseNormalizationService): ], "source_meta_": { "data_license": "custom", - "data_license_url": "https://useast.ensembl.org/info/about/legal/disclaimer.html", # noqa: E501 + "data_license_url": "https://useast.ensembl.org/info/about/legal/disclaimer.html", "version": "104", "data_url": { "genome_annotations": "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz" @@ -617,24 +617,24 @@ class UnmergedNormalizationService(BaseNormalizationService): "concept_id": "ncbigene:43", "symbol": "ACHE", "symbol_status": None, - "label": "acetylcholinesterase (Cartwright blood group)", # noqa: E501 + "label": "acetylcholinesterase (Cartwright blood group)", "strand": "-", "location_annotations": [], "locations": [ { # "type": "ChromosomeLocation", - # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501 + # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # "species_id": "taxonomy:9606", # "chr": "7", # "start": "q22.1", # "end": "q22.1" }, { - "id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g", # noqa: E501 + "id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, "start": 100889993, "end": 100896994, @@ -649,7 +649,7 @@ class UnmergedNormalizationService(BaseNormalizationService): ], "source_meta_": { "data_license": "custom", - "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", # noqa: E501 + "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", "version": "20220407", "data_url": { "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz", diff --git a/tests/unit/test_database_and_etl.py b/tests/unit/test_database_and_etl.py index 092cc6c3..5367d794 100644 --- a/tests/unit/test_database_and_etl.py +++ b/tests/unit/test_database_and_etl.py @@ -1,10 +1,10 @@ """Test 
DynamoDB and ETL methods.""" from os import environ from pathlib import Path +from unittest.mock import patch import pytest from boto3.dynamodb.conditions import Key -from mock import patch from gene.database import AWS_ENV_VAR_NAME from gene.etl import HGNC, NCBI, Ensembl @@ -52,7 +52,7 @@ def __init__(self): @pytest.fixture(scope="module") def processed_ids(): """Create a test fixture to store processed ids for merged concepts.""" - return list() + return [] def _get_aliases(seqid): @@ -95,7 +95,7 @@ def test_ensembl_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path) """Test that ensembl etl methods work correctly.""" test_get_seqrepo.return_value = None e = Ensembl(db_fixture.db, data_path=etl_data_path) - e._get_seq_id_aliases = _get_aliases # type: ignore + e._get_seq_id_aliases = _get_aliases ensembl_ids = e.perform_etl(use_existing=True) processed_ids += ensembl_ids @@ -116,7 +116,7 @@ def test_ncbi_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path): """Test that ncbi etl methods work correctly.""" test_get_seqrepo.return_value = None n = NCBI(db_fixture.db, data_path=etl_data_path) - n._get_seq_id_aliases = _get_aliases # type: ignore + n._get_seq_id_aliases = _get_aliases ncbi_ids = n.perform_etl(use_existing=True) processed_ids += ncbi_ids diff --git a/tests/unit/test_emit_warnings.py b/tests/unit/test_emit_warnings.py index c8309aac..73acbcaa 100644 --- a/tests/unit/test_emit_warnings.py +++ b/tests/unit/test_emit_warnings.py @@ -18,7 +18,7 @@ def test_emit_warnings(): assert actual_warnings == [] # Test emit warnings - actual_warnings = query_handler._emit_warnings("sp ry3") + actual_warnings = query_handler._emit_warnings("sp ry3") assert actual_warnings == actual_warnings actual_warnings = query_handler._emit_warnings("sp\u00A0ry3") diff --git a/tests/unit/test_ensembl_source.py b/tests/unit/test_ensembl_source.py index 7660be3e..081b3e6a 100644 --- a/tests/unit/test_ensembl_source.py +++ b/tests/unit/test_ensembl_source.py @@ -17,8 +17,7 @@ def search(self, query_str, incl="ensembl"): resp = self.query_handler.search(query_str, incl=incl) return resp.source_matches[SourceName.ENSEMBL] - e = QueryGetter() - return e + return QueryGetter() @pytest.fixture(scope="module") diff --git a/tests/unit/test_hgnc_source.py b/tests/unit/test_hgnc_source.py index 54d0aff0..e6ee38bc 100644 --- a/tests/unit/test_hgnc_source.py +++ b/tests/unit/test_hgnc_source.py @@ -1,5 +1,5 @@ """Test that the gene normalizer works as intended for the HGNC source.""" -from datetime import datetime +import datetime import pytest @@ -19,8 +19,7 @@ def search(self, query_str, incl="hgnc"): resp = self.query_handler.search(query_str, incl=incl) return resp.source_matches[SourceName.HGNC] - h = QueryGetter() - return h + return QueryGetter() # Test Non Alt Loci Set @@ -816,7 +815,9 @@ def test_meta_info(hgnc): assert ( resp.source_meta_.data_license_url == "https://www.genenames.org/about/license/" ) - assert datetime.strptime(resp.source_meta_.version, "%Y%m%d") + assert datetime.datetime.now(tz=datetime.timezone.utc).strptime( + resp.source_meta_.version, "%Y%m%d" + ) assert resp.source_meta_.data_url == { "complete_set_archive": "ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json" } diff --git a/tests/unit/test_ncbi_source.py b/tests/unit/test_ncbi_source.py index d0083a43..58bf7a1e 100644 --- a/tests/unit/test_ncbi_source.py +++ b/tests/unit/test_ncbi_source.py @@ -1,5 +1,5 @@ """Test import of NCBI source data""" -from datetime import datetime +import 
diff --git a/tests/unit/test_ncbi_source.py b/tests/unit/test_ncbi_source.py
index d0083a43..58bf7a1e 100644
--- a/tests/unit/test_ncbi_source.py
+++ b/tests/unit/test_ncbi_source.py
@@ -1,5 +1,5 @@
 """Test import of NCBI source data"""
-from datetime import datetime
+import datetime

 import pytest

@@ -37,8 +37,7 @@ def search(self, query_str, incl="ncbi"):
             resp = self.query_handler.search(query_str, incl=incl)
             return resp.source_matches[SourceName.NCBI]

-    n = QueryGetter()
-    return n
+    return QueryGetter()


 @pytest.fixture(scope="module")
@@ -857,7 +856,9 @@ def test_no_match(ncbi, source_urls):
         response.source_meta_.data_license_url
         == "https://www.ncbi.nlm.nih.gov/home/about/policies/"
     )
-    assert datetime.strptime(response.source_meta_.version, "%Y%m%d")
+    assert datetime.datetime.now(tz=datetime.timezone.utc).strptime(
+        response.source_meta_.version, "%Y%m%d"
+    )
     assert response.source_meta_.data_url == source_urls
     assert response.source_meta_.rdp_url == "https://reusabledata.org/ncbi-gene.html"
     assert not response.source_meta_.data_license_attributes["non_commercial"]
@@ -903,7 +904,9 @@ def test_meta(ncbi, source_urls):
         response.source_meta_.data_license_url
         == "https://www.ncbi.nlm.nih.gov/home/about/policies/"
     )
-    assert datetime.strptime(response.source_meta_.version, "%Y%m%d")
+    assert datetime.datetime.now(tz=datetime.timezone.utc).strptime(
+        response.source_meta_.version, "%Y%m%d"
+    )
     assert response.source_meta_.data_url == source_urls
     assert response.source_meta_.rdp_url == "https://reusabledata.org/ncbi-gene.html"
     assert response.source_meta_.genome_assemblies == ["GRCh38.p14"]
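The fixture edits in the three source test modules share one simplification: a name assigned only to be returned on the next line adds nothing (the pattern flagged by ruff's RET504 rule). A self-contained sketch of the before and after, with a stand-in class:

    class QueryGetter:  # stand-in for the helper defined in each test module
        pass

    def query_getter_before():
        q = QueryGetter()  # flagged: q exists only to be returned
        return q

    def query_getter_after():
        return QueryGetter()  # direct return, identical behavior
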
diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py
index f767ced1..e9226a2d 100644
--- a/tests/unit/test_query.py
+++ b/tests/unit/test_query.py
@@ -676,7 +676,7 @@ def normalize_unmerged_loc_653303():
                     "concept_id": "ncbigene:653303",
                     "symbol": "LOC653303",
                     "symbol_status": None,
-                    "label": "proprotein convertase subtilisin/kexin type 7 pseudogene",  # noqa: E501
+                    "label": "proprotein convertase subtilisin/kexin type 7 pseudogene",
                     "strand": "+",
                     "location_annotations": [],
                     "locations": [
@@ -693,7 +693,7 @@ def normalize_unmerged_loc_653303():
                             "type": "SequenceLocation",
                             "sequenceReference": {
                                 "type": "SequenceReference",
-                                "refgetAccession": "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1",  # noqa: E501
+                                "refgetAccession": "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1",
                             },
                             "start": 117135528,
                             "end": 117138867,
@@ -775,7 +775,7 @@ def normalize_unmerged_chaf1a():
                             "type": "SequenceLocation",
                             "sequenceReference": {
                                 "type": "SequenceReference",
-                                "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",  # noqa: E501
+                                "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
                             },
                             "start": 4402639,
                             "end": 4445018,
@@ -812,7 +812,7 @@ def normalize_unmerged_chaf1a():
                             "type": "SequenceLocation",
                             "sequenceReference": {
                                 "type": "SequenceReference",
-                                "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",  # noqa: E501
+                                "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
                             },
                             "start": 4402639,
                             "end": 4450830,
@@ -859,7 +859,7 @@ def normalize_unmerged_ache():
                             "type": "SequenceLocation",
                             "sequenceReference": {
                                 "type": "SequenceReference",
-                                "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",  # noqa: E501
+                                "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
                             },
                             "start": 100889993,
                             "end": 100896994,
@@ -888,7 +888,7 @@ def normalize_unmerged_ache():
                             "type": "SequenceLocation",
                             "sequenceReference": {
                                 "type": "SequenceReference",
-                                "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",  # noqa: E501
+                                "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
                             },
                             "start": 100889993,
                             "end": 100896974,
@@ -990,7 +990,7 @@ def normalized_ifnr():
 @pytest.fixture(scope="module")
 def num_sources():
     """Get the number of sources."""
-    return len({s for s in SourceName})
+    return len(set(SourceName))


 @pytest.fixture(scope="module")
@@ -1036,7 +1036,7 @@ def compare_normalize_resp(
     resp_source_meta_keys = resp.source_meta_.keys()
     assert len(resp_source_meta_keys) == len(
         expected_source_meta
-    ), "source_meta_keys"  # noqa: E501
+    ), "source_meta_keys"
     for src in expected_source_meta:
         assert src in resp_source_meta_keys
     compare_service_meta(resp.service_meta_)
@@ -1110,8 +1110,8 @@ def compare_gene(test, actual):
     assert len(actual.mappings) == len(test.mappings)
     assert set(actual.aliases) == set(test.aliases), "aliases"

-    extensions_present = "extensions" in test.model_fields.keys()
-    assert ("extensions" in actual.model_fields.keys()) == extensions_present
+    extensions_present = "extensions" in test.model_fields
+    assert ("extensions" in actual.model_fields) == extensions_present
     if extensions_present:
         actual_ext_names = sorted([ext.name for ext in actual.extensions])
         unique_actual_ext_names = sorted(set(actual_ext_names))
@@ -1174,7 +1174,7 @@ def test_search_query_inc_exc(query_handler, num_sources):
 def test_search_invalid_parameter_exception(query_handler):
     """Test that Invalid parameter exception works correctly."""
     with pytest.raises(InvalidParameterException):
-        _ = query_handler.search("BRAF", incl="hgn")  # noqa: F841, E501
+        _ = query_handler.search("BRAF", incl="hgn")

     with pytest.raises(InvalidParameterException):
         resp = query_handler.search("BRAF", incl="hgnc", excl="hgnc")  # noqa: F841
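Two container idioms recur in the hunks above: constructing a set straight from an iterable rather than through a comprehension, and membership tests without .keys(). A runnable sketch using a toy enum in place of the project's SourceName:

    from enum import Enum

    class SourceName(str, Enum):  # toy stand-in, not the project's enum
        HGNC = "HGNC"
        ENSEMBL = "Ensembl"
        NCBI = "NCBI"

    # set(iterable) is equivalent to the comprehension it replaces
    assert set(SourceName) == {s for s in SourceName}
    assert len(set(SourceName)) == 3

    # "key in mapping" reads the same as "key in mapping.keys()"
    model_fields = {"extensions": None}
    assert ("extensions" in model_fields) == ("extensions" in model_fields.keys())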