From 1701fc2e6e65d8bcd9d327eef45d948bf06f75b7 Mon Sep 17 00:00:00 2001
From: James Stevenson
Date: Tue, 12 Dec 2023 14:24:34 -0500
Subject: [PATCH] chore: clean up logging

---
 src/gene/cli.py                 | 15 +++++++--------
 src/gene/database/dynamodb.py   | 22 +++++++++++-----------
 src/gene/database/postgresql.py | 22 +++++++++++-----------
 src/gene/etl/base.py            |  9 ++++-----
 src/gene/etl/ensembl.py         | 11 +++++------
 src/gene/etl/hgnc.py            | 13 ++++++-------
 src/gene/etl/merge.py           | 21 ++++++++++-----------
 src/gene/etl/ncbi.py            | 27 +++++++++++++--------------
 8 files changed, 67 insertions(+), 73 deletions(-)

diff --git a/src/gene/cli.py b/src/gene/cli.py
index 30beb762..f78a0d57 100644
--- a/src/gene/cli.py
+++ b/src/gene/cli.py
@@ -16,8 +16,7 @@
 from gene.database.database import DatabaseException
 from gene.schemas import SOURCES, SourceName
 
-logger = logging.getLogger("gene")
-logger.setLevel(logging.DEBUG)
+_logger = logging.getLogger(__name__)
 
 
 @click.command()
@@ -139,14 +138,14 @@ def _delete_source(n: SourceName, db: AbstractDatabase) -> float:
     """
     msg = f"Deleting {n.value}..."
     click.echo(f"\n{msg}")
-    logger.info(msg)
+    _logger.info(msg)
 
     start_delete = timer()
     db.delete_source(n)
     end_delete = timer()
     delete_time = end_delete - start_delete
 
     msg = f"Deleted {n.value} in {delete_time:.5f} seconds."
     click.echo(f"{msg}\n")
-    logger.info(msg)
+    _logger.info(msg)
     return delete_time
@@ -171,7 +170,7 @@ def _load_source(
     """
     msg = f"Loading {n.value}..."
     click.echo(msg)
-    logger.info(msg)
+    _logger.info(msg)
     start_load = timer()
 
     # used to get source class name from string
@@ -189,17 +188,17 @@ def _load_source(
     try:
         processed_ids += source.perform_etl(use_existing)
     except GeneNormalizerEtlError as e:
-        logger.error(e)
+        _logger.error(e)
         click.echo(f"Encountered error while loading {n}: {e}.")
         click.get_current_context().exit()
     end_load = timer()
     load_time = end_load - start_load
     msg = f"Loaded {n.value} in {load_time:.5f} seconds."
     click.echo(msg)
-    logger.info(msg)
+    _logger.info(msg)
     msg = f"Total time for {n.value}: {(delete_time + load_time):.5f} seconds."
     click.echo(msg)
-    logger.info(msg)
+    _logger.info(msg)
 
 
 def _delete_normalized_data(database: AbstractDatabase) -> None:
diff --git a/src/gene/database/dynamodb.py b/src/gene/database/dynamodb.py
index 161972e1..4710f156 100644
--- a/src/gene/database/dynamodb.py
+++ b/src/gene/database/dynamodb.py
@@ -31,7 +31,7 @@
     SourceName,
 )
 
-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
 
 
 class DynamoDbDatabase(AbstractDatabase):
@@ -162,7 +162,7 @@ def check_schema_initialized(self) -> bool:
         existing_tables = self.list_tables()
         exists = self.gene_table in existing_tables
         if not exists:
-            logger.info(f"{self.gene_table} table is missing or unavailable.")
+            _logger.info(f"{self.gene_table} table is missing or unavailable.")
         return exists
 
     def check_tables_populated(self) -> bool:
@@ -179,7 +179,7 @@ def check_tables_populated(self) -> bool:
             KeyConditionExpression=Key("item_type").eq("source"),
         ).get("Items", [])
         if len(sources) < len(SourceName):
-            logger.info("Gene sources table is missing expected sources.")
+            _logger.info("Gene sources table is missing expected sources.")
             return False
 
         records = self.genes.query(
@@ -188,7 +188,7 @@ def check_tables_populated(self) -> bool:
             Limit=1,
         )
         if len(records.get("Items", [])) < 1:
-            logger.info("Gene records index is empty.")
+            _logger.info("Gene records index is empty.")
             return False
 
         normalized_records = self.genes.query(
@@ -197,7 +197,7 @@ def check_tables_populated(self) -> bool:
             Limit=1,
        )
         if len(normalized_records.get("Items", [])) < 1:
-            logger.info("Normalized gene records index is empty.")
+            _logger.info("Normalized gene records index is empty.")
             return False
 
         return True
@@ -259,7 +259,7 @@ def get_record_by_id(
             del record["label_and_type"]
             return record
         except ClientError as e:
-            logger.error(
+            _logger.error(
                 f"boto3 client error on get_records_by_id for "
                 f"search term {concept_id}: "
                 f"{e.response['Error']['Message']}"
@@ -282,7 +282,7 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]:
             matches = self.genes.query(KeyConditionExpression=filter_exp)
             return [m["concept_id"] for m in matches.get("Items", None)]
         except ClientError as e:
-            logger.error(
+            _logger.error(
                 f"boto3 client error on get_refs_by_type for "
                 f"search term {search_term}: "
                 f"{e.response['Error']['Message']}"
@@ -386,7 +386,7 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
         try:
             self.batch.put_item(Item=record)
         except ClientError as e:
-            logger.error(
+            _logger.error(
                 "boto3 client error on add_record for "
                 f"{concept_id}: {e.response['Error']['Message']}"
             )
@@ -418,7 +418,7 @@ def add_merged_record(self, record: Dict) -> None:
         try:
             self.batch.put_item(Item=record)
         except ClientError as e:
-            logger.error(
+            _logger.error(
                 "boto3 client error on add_record for "
                 f"{concept_id}: {e.response['Error']['Message']}"
             )
@@ -444,7 +444,7 @@ def _add_ref_record(
         try:
             self.batch.put_item(Item=record)
         except ClientError as e:
-            logger.error(
+            _logger.error(
                 f"boto3 client error adding reference {term} for "
                 f"{concept_id} with match type {ref_type}: "
                 f"{e.response['Error']['Message']}"
@@ -476,7 +476,7 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None:  # noqa: AN
                     f"No such record exists for keys {label_and_type}, {concept_id}"
                 )
             else:
-                logger.error(
+                _logger.error(
                     f"boto3 client error in `database.update_record()`: "
                     f"{e.response['Error']['Message']}"
                 )
diff --git a/src/gene/database/postgresql.py b/src/gene/database/postgresql.py
index f62a1819..47371bc4 100644
--- a/src/gene/database/postgresql.py
+++ b/src/gene/database/postgresql.py
@@ -26,7 +26,7 @@
 )
 from gene.schemas import RecordType, RefType, SourceMeta, SourceName
 
-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
 
 SCRIPTS_DIR = Path(__file__).parent / "postgresql"
 
@@ -119,7 +119,7 @@ def drop_db(self) -> None:
         with self.conn.cursor() as cur:
             cur.execute(self._drop_db_query)
             self.conn.commit()
-        logger.info("Dropped all existing gene normalizer tables.")
+        _logger.info("Dropped all existing gene normalizer tables.")
 
     def check_schema_initialized(self) -> bool:
         """Check if database schema is properly initialized.
@@ -132,7 +132,7 @@ def check_schema_initialized(self) -> bool:
         except DuplicateTable:
             self.conn.rollback()
         else:
-            logger.info("Gene table existence check failed.")
+            _logger.info("Gene table existence check failed.")
             self.conn.rollback()
             return False
         try:
@@ -141,7 +141,7 @@ def check_schema_initialized(self) -> bool:
         except DuplicateObject:
             self.conn.rollback()
         else:
-            logger.info("Gene foreign key existence check failed.")
+            _logger.info("Gene foreign key existence check failed.")
             self.conn.rollback()
             return False
         try:
@@ -152,7 +152,7 @@ def check_schema_initialized(self) -> bool:
         except DuplicateTable:
             self.conn.rollback()
         else:
-            logger.info("Gene normalized view lookup failed.")
+            _logger.info("Gene normalized view lookup failed.")
             self.conn.rollback()
             return False
         try:
@@ -161,7 +161,7 @@ def check_schema_initialized(self) -> bool:
         except DuplicateTable:
             self.conn.rollback()
         else:
-            logger.info("Gene indexes check failed.")
+            _logger.info("Gene indexes check failed.")
             self.conn.rollback()
             return False
 
@@ -184,21 +184,21 @@ def check_tables_populated(self) -> bool:
             cur.execute(self._check_sources_query)
             results = cur.fetchall()
         if len(results) < len(SourceName):
-            logger.info("Gene sources table is missing expected sources.")
+            _logger.info("Gene sources table is missing expected sources.")
             return False
 
         with self.conn.cursor() as cur:
             cur.execute(self._check_concepts_query)
             result = cur.fetchone()
         if not result or result[0] < 1:
-            logger.info("Gene records table is empty.")
+            _logger.info("Gene records table is empty.")
             return False
 
         with self.conn.cursor() as cur:
             cur.execute(self._check_merged_query)
             result = cur.fetchone()
         if not result or result[0] < 1:
-            logger.info("Normalized gene records table is empty.")
+            _logger.info("Normalized gene records table is empty.")
             return False
 
         return True
@@ -260,7 +260,7 @@ def _drop_indexes(self) -> None:
 
     def _create_tables(self) -> None:
         """Create all tables, indexes, and views."""
-        logger.debug("Creating new gene normalizer tables.")
+        _logger.debug("Creating new gene normalizer tables.")
 
         tables_query = (SCRIPTS_DIR / "create_tables.sql").read_bytes()
         with self.conn.cursor() as cur:
@@ -594,7 +594,7 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
                 cur.execute(self._ins_symbol_query, [record["symbol"], concept_id])
             self.conn.commit()
         except UniqueViolation:
-            logger.error(f"Record with ID {concept_id} already exists")
+            _logger.error(f"Record with ID {concept_id} already exists")
             self.conn.rollback()
 
     _add_merged_record_query = b"""
diff --git a/src/gene/etl/base.py b/src/gene/etl/base.py
index f46f0044..77937bcb 100644
--- a/src/gene/etl/base.py
+++ b/src/gene/etl/base.py
@@ -18,8 +18,7 @@
 from gene.database import AbstractDatabase
 from gene.schemas import ITEM_TYPES, Gene, GeneSequenceLocation, MatchType, SourceName
 
-logger = logging.getLogger("gene")
-logger.setLevel(logging.DEBUG)
+_logger = logging.getLogger(__name__)
 
 APP_ROOT = Path(__file__).resolve().parent
 
@@ -128,7 +127,7 @@ def _load_gene(self, gene: Dict) -> None:
         try:
             assert Gene(match_type=MatchType.NO_MATCH, **gene)
         except pydantic.ValidationError as e:
-            logger.warning(f"Unable to load {gene} due to validation error: " f"{e}")
+            _logger.warning(f"Unable to load {gene} due to validation error: " f"{e}")
         else:
             concept_id = gene["concept_id"]
             gene["label_and_type"] = f"{concept_id.lower()}##identity"
@@ -272,7 +271,7 @@ def _get_seq_id_aliases(self, seq_id: str) -> List[str]:
         try:
             aliases = self.seqrepo.translate_alias(seq_id, target_namespaces="ga4gh")
         except KeyError as e:
-            logger.warning(f"SeqRepo raised KeyError: {e}")
+            _logger.warning(f"SeqRepo raised KeyError: {e}")
         return aliases
 
     def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Dict:
@@ -299,7 +298,7 @@ def _get_sequence_location(self, seq_id: str, gene: Feature, params: Di
                 sequence_id=sequence,
             ).model_dump()  # type: ignore
         else:
-            logger.warning(
+            _logger.warning(
                 f"{params['concept_id']} has invalid interval:"
                 f"start={gene.start - 1} end={gene.end}"
             )  # type: ignore
diff --git a/src/gene/etl/ensembl.py b/src/gene/etl/ensembl.py
index f75ed034..6c330e0d 100644
--- a/src/gene/etl/ensembl.py
+++ b/src/gene/etl/ensembl.py
@@ -19,8 +19,7 @@
 )
 from gene.schemas import NamespacePrefix, SourceMeta, SourceName, Strand
 
-logger = logging.getLogger("gene")
-logger.setLevel(logging.DEBUG)
+_logger = logging.getLogger(__name__)
 
 
 class Ensembl(Base):
@@ -84,7 +83,7 @@ def _download_data(self) -> Path:
         :return: path to acquired file
         :raise GeneSourceFetchError: if unable to find file matching expected pattern
         """
-        logger.info("Downloading latest Ensembl data file...")
+        _logger.info("Downloading latest Ensembl data file...")
         pattern = r"Homo_sapiens\.(?P<assembly>GRCh\d+)\.(?P<version>\d+)\.gff3\.gz"
         with FTP(self._host) as ftp:
             ftp.login()
@@ -98,7 +97,7 @@ def _download_data(self) -> Path:
                     version = resp["version"]
                     new_fn = f"ensembl_{assembly}_{version}.gff3"
                     self._ftp_download_file(ftp, f, self.src_data_dir, new_fn)
-                    logger.info(
+                    _logger.info(
                         f"Successfully downloaded Ensembl {version} data to {self.src_data_dir / new_fn}."
                     )
                     return self.src_data_dir / new_fn
@@ -120,7 +119,7 @@ def _extract_data(self, use_existing: bool) -> None:
 
     def _transform_data(self) -> None:
         """Transform the Ensembl source."""
-        logger.info("Transforming Ensembl...")
+        _logger.info("Transforming Ensembl...")
         db = gffutils.create_db(
             str(self._data_src),
             dbfn=":memory:",
@@ -143,7 +142,7 @@ def _transform_data(self) -> None:
             gene = self._add_gene(f, accession_numbers)
             if gene:
                 self._load_gene(gene)
-        logger.info("Successfully transformed Ensembl.")
+        _logger.info("Successfully transformed Ensembl.")
 
     def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict:
         """Create a transformed gene record.
diff --git a/src/gene/etl/hgnc.py b/src/gene/etl/hgnc.py
index bfd07a7e..e725f7d4 100644
--- a/src/gene/etl/hgnc.py
+++ b/src/gene/etl/hgnc.py
@@ -27,8 +27,7 @@
     SymbolStatus,
 )
 
-logger = logging.getLogger("gene")
-logger.setLevel(logging.DEBUG)
+_logger = logging.getLogger(__name__)
 
 
 class HGNC(Base):
@@ -87,7 +86,7 @@ def _download_data(self) -> Path:
 
         :return: path to newly-downloaded file
         """
-        logger.info("Downloading HGNC data file...")
+        _logger.info("Downloading HGNC data file...")
 
         tmp_fn = "hgnc_version.json"
         version = self._ftp_download(
@@ -95,7 +94,7 @@ def _download_data(self) -> Path:
         )
         final_location = f"{self.src_data_dir}/hgnc_{version}.json"
         shutil.move(f"{self.src_data_dir}/{tmp_fn}", final_location)
-        logger.info(f"Successfully downloaded HGNC data file to {final_location}.")
+        _logger.info(f"Successfully downloaded HGNC data file to {final_location}.")
         return Path(final_location)
 
     def _extract_data(self, use_existing: bool) -> None:
@@ -111,7 +110,7 @@ def _extract_data(self, use_existing: bool) -> None:
 
     def _transform_data(self) -> None:
         """Transform the HGNC source."""
-        logger.info("Transforming HGNC...")
+        _logger.info("Transforming HGNC...")
         with open(self._data_file, "r") as f:
             data = json.load(f)
 
@@ -142,7 +141,7 @@ def _transform_data(self) -> None:
             if "locus_type" in r:
                 gene["gene_type"] = r["locus_type"]
             self._load_gene(gene)
-        logger.info("Successfully transformed HGNC.")
+        _logger.info("Successfully transformed HGNC.")
 
     def _get_aliases(self, r: Dict, gene: Dict) -> None:
         """Store aliases in a gene record.
@@ -223,7 +222,7 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None:
                 else:
                     self._get_xref_associated_with(key, src, r, associated_with)
             else:
-                logger.warning(f"{key} not in schemas.py")
+                _logger.warning(f"{key} not in schemas.py")
 
         if xrefs:
             gene["xrefs"] = xrefs
diff --git a/src/gene/etl/merge.py b/src/gene/etl/merge.py
index 8124d294..651ce8ae 100644
--- a/src/gene/etl/merge.py
+++ b/src/gene/etl/merge.py
@@ -7,8 +7,7 @@
 from gene.database.database import DatabaseWriteException
 from gene.schemas import GeneTypeFieldName, RecordType, SourcePriority
 
-logger = logging.getLogger("gene")
-logger.setLevel(logging.DEBUG)
+_logger = logging.getLogger(__name__)
 
 
 class Merge:
@@ -28,7 +27,7 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None:
         :param record_ids: concept identifiers from which groups should be generated.
             Should *not* include any records from excluded sources.
""" - logger.info("Generating record ID sets...") + _logger.info("Generating record ID sets...") start = timer() for record_id in record_ids: new_group = self._create_record_id_set(record_id) @@ -36,11 +35,11 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None: for concept_id in new_group: self._groups[concept_id] = new_group end = timer() - logger.debug(f"Built record ID sets in {end - start} seconds") + _logger.debug(f"Built record ID sets in {end - start} seconds") self._groups = {k: v for k, v in self._groups.items() if len(v) > 1} - logger.info("Creating merged records and updating database...") + _logger.info("Creating merged records and updating database...") uploaded_ids = set() start = timer() for record_id, group in self._groups.items(): @@ -58,17 +57,17 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None: self._database.update_merge_ref(concept_id, merge_ref) except DatabaseWriteException as dw: if str(dw).startswith("No such record exists"): - logger.error( + _logger.error( f"Updating nonexistent record: {concept_id} " f"for merge ref to {merge_ref}" ) else: - logger.error(str(dw)) + _logger.error(str(dw)) uploaded_ids |= group self._database.complete_write_transaction() - logger.info("Merged concept generation successful.") + _logger.info("Merged concept generation successful.") end = timer() - logger.debug(f"Generated and added concepts in {end - start} seconds") + _logger.debug(f"Generated and added concepts in {end - start} seconds") def _create_record_id_set( self, record_id: str, observed_id_set: Optional[Set] = None @@ -88,7 +87,7 @@ def _create_record_id_set( else: db_record = self._database.get_record_by_id(record_id) if not db_record: - logger.warning( + _logger.warning( f"Record ID set creator could not resolve " f"lookup for {record_id} in ID set: " f"{observed_id_set}" @@ -124,7 +123,7 @@ def _generate_merged_record(self, record_id_set: Set[str]) -> Dict: if record: records.append(record) else: - logger.error( + _logger.error( f"Merge record generator could not retrieve " f"record for {record_id} in {record_id_set}" ) diff --git a/src/gene/etl/ncbi.py b/src/gene/etl/ncbi.py index 45d3c308..3f0cdfbb 100644 --- a/src/gene/etl/ncbi.py +++ b/src/gene/etl/ncbi.py @@ -26,8 +26,7 @@ SymbolStatus, ) -logger = logging.getLogger("gene") -logger.setLevel(logging.DEBUG) +_logger = logging.getLogger(__name__) class NCBI(Base): @@ -113,7 +112,7 @@ def _download_gff(self) -> Path: :return: Path to downloaded file :raise SourceFetchError: if unable to identify latest available file """ - logger.info("Downloading NCBI genome annotation file...") + _logger.info("Downloading NCBI genome annotation file...") genomic_gff_pattern = r"GCF_\d+\.\d+_(GRCh\d+\.\w\d+)_genomic.gff.gz" with FTP(self._host) as ftp: ftp.login() @@ -133,7 +132,7 @@ def _download_gff(self) -> Path: self._ftp_download_file( ftp, genomic_filename, self.src_data_dir, new_filename ) - logger.info( + _logger.info( f"Downloaded NCBI genome annotation file to {self.src_data_dir / new_filename}" ) return self.src_data_dir / new_filename @@ -162,7 +161,7 @@ def _download_history_file(self) -> Path: :return: Path to downloaded file """ - logger.info("Downloading NCBI gene_history...") + _logger.info("Downloading NCBI gene_history...") tmp_fn = "ncbi_history_tmp.tsv" data_fn = "gene_history.gz" version = self._ftp_download( @@ -170,7 +169,7 @@ def _download_history_file(self) -> Path: ) final_location = f"{self.src_data_dir}/ncbi_history_{version}.tsv" shutil.move(f"{self.src_data_dir}/{tmp_fn}", 
final_location) - logger.info(f"Successfully downloaded NCBI gene_history to {final_location}.") + _logger.info(f"Successfully downloaded NCBI gene_history to {final_location}.") return Path(final_location) def _gene_file_is_up_to_date(self, gene_file: Path) -> bool: @@ -200,13 +199,13 @@ def _download_gene_file(self) -> Path: data_dir = f"{self._data_dir}GENE_INFO/Mammalia/" tmp_fn = "ncbi_info_tmp.tsv" data_fn = "Homo_sapiens.gene_info.gz" - logger.info("Downloading NCBI gene_info....") + _logger.info("Downloading NCBI gene_info....") version = self._ftp_download( self._host, data_dir, tmp_fn, self.src_data_dir, data_fn ) final_location = f"{self.src_data_dir}/ncbi_info_{version}.tsv" shutil.move(f"{self.src_data_dir}/{tmp_fn}", final_location) - logger.info(f"Successfully downloaded NCBI gene_info to {final_location}.") + _logger.info(f"Successfully downloaded NCBI gene_info to {final_location}.") return Path(final_location) def _extract_data(self, use_existing: bool) -> None: @@ -300,7 +299,7 @@ def _add_xrefs_associated_with(self, val: List[str], params: Dict) -> None: if prefix: params["associated_with"].append(f"{prefix}:{src_id}") else: - logger.info(f"{src_name} is not in NameSpacePrefix.") + _logger.info(f"{src_name} is not in NameSpacePrefix.") if not params["xrefs"]: del params["xrefs"] if not params["associated_with"]: @@ -494,7 +493,7 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict: if len(chromosomes) >= 2: if chromosomes and "X" not in chromosomes and "Y" not in chromosomes: - logger.info( + _logger.info( f"{row[2]} contains multiple distinct " f"chromosomes: {chromosomes}." ) @@ -520,7 +519,7 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict: # Exclude genes where there are multiple distinct locations # i.e. OMS: '10q26.3', '19q13.42-q13.43', '3p25.3' if len(locations) > 2: - logger.info( + _logger.info( f"{row[2]} contains multiple distinct " f"locations: {locations}." ) locations = None @@ -531,7 +530,7 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict: for i in range(len(locations)): loc = locations[i].strip() if not re.match("^([1-9][0-9]?|X[pq]?|Y[pq]?)", loc): - logger.info( + _logger.warning( f"{row[2]} contains invalid map location:" f"{loc}." ) params["location_annotations"].append(loc) @@ -619,7 +618,7 @@ def _set_centromere_location(self, loc: str, location: Dict) -> None: def _transform_data(self) -> None: """Modify data and pass to loading functions.""" - logger.info("Transforming NCBI...") + _logger.info("Transforming NCBI...") prev_symbols = self._get_prev_symbols() info_genes = self._get_gene_info(prev_symbols) @@ -636,7 +635,7 @@ def _transform_data(self) -> None: for gene in info_genes.keys(): self._load_gene(info_genes[gene]) - logger.info("Successfully transformed NCBI.") + _logger.info("Successfully transformed NCBI.") def _add_meta(self) -> None: """Add Ensembl metadata.
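
---

Note for reviewers (this sits after the diff, so it is not applied by git am):
with the hard-coded logger.setLevel(logging.DEBUG) calls removed and every
module logging via logging.getLogger(__name__), verbosity is now entirely up
to the consuming application. A minimal sketch of what that downstream
configuration could look like -- the handler and level choices here are
illustrative assumptions, not something this patch mandates:

    import logging

    # Quiet default for everything, including third-party libraries.
    logging.basicConfig(level=logging.WARNING)

    # Opt in to verbose gene-normalizer output. Because module loggers are
    # named via __name__ (gene.cli, gene.etl.ncbi, ...), setting the level
    # on the "gene" parent logger covers all of them through the standard
    # logging hierarchy.
    logging.getLogger("gene").setLevel(logging.DEBUG)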