diff --git a/src/gene/cli.py b/src/gene/cli.py
index a14ce952..50e458b8 100644
--- a/src/gene/cli.py
+++ b/src/gene/cli.py
@@ -17,8 +17,16 @@
 from gene.database.database import DatabaseException
 from gene.schemas import SourceName
 
-logger = logging.getLogger("gene")
-logger.setLevel(logging.DEBUG)
+_logger = logging.getLogger(__name__)
+
+
+def _configure_logging() -> None:
+    """Configure logging."""
+    logging.basicConfig(
+        filename=f"{__package__}.log",
+        format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s",
+    )
+    logging.getLogger(__package__).setLevel(logging.DEBUG)
 
 
 @click.command()
@@ -32,6 +40,7 @@ def check_db(db_url: str, verbose: bool = False) -> None:
     :param db_url: URL to normalizer database
     :param verbose: if true, print result to console
     """  # noqa: D301
+    _configure_logging()
     db = create_db(db_url, False)
     if not db.check_schema_initialized():
         if verbose:
@@ -43,8 +52,10 @@ def check_db(db_url: str, verbose: bool = False) -> None:
             click.echo("Health check failed: DB is incompletely populated.")
             click.get_current_context().exit(1)
 
+    msg = "DB health check successful: tables appear complete."
     if verbose:
-        click.echo("DB health check successful: tables appear complete.")
+        click.echo(msg)
+    _logger.info(msg)
 
 
 @click.command()
@@ -59,6 +70,7 @@ def update_from_remote(data_url: Optional[str], db_url: str) -> None:
    :param data_url: user-specified location to pull DB dump from
    :param db_url: URL to normalizer database
    """  # noqa: D301
+    _configure_logging()
     if not click.confirm("Are you sure you want to overwrite existing data?"):
         click.get_current_context().exit()
     if not data_url:
@@ -74,6 +86,7 @@ def update_from_remote(data_url: Optional[str], db_url: str) -> None:
     except DatabaseException as e:
         click.echo(f"Encountered exception during update: {str(e)}")
         click.get_current_context().exit(1)
+    _logger.info("Successfully loaded data from remote snapshot.")
 
 
 @click.command()
@@ -91,6 +104,7 @@ def dump_database(output_directory: Path, db_url: str) -> None:
     :param output_directory: path to existing directory
     :param db_url: URL to normalizer database
     """  # noqa: D301
+    _configure_logging()
     if not output_directory:
         output_directory = Path(".")
 
@@ -105,6 +119,7 @@ def dump_database(output_directory: Path, db_url: str) -> None:
     except DatabaseException as e:
         click.echo(f"Encountered exception during update: {str(e)}")
         click.get_current_context().exit(1)
+    _logger.info("Database dump successful.")
 
 
 def _update_normalizer(
@@ -140,14 +155,14 @@ def _delete_source(n: SourceName, db: AbstractDatabase) -> float:
     """
     msg = f"Deleting {n.value}..."
     click.echo(f"\n{msg}")
-    logger.info(msg)
+    _logger.info(msg)
     start_delete = timer()
     db.delete_source(n)
     end_delete = timer()
     delete_time = end_delete - start_delete
     msg = f"Deleted {n.value} in {delete_time:.5f} seconds."
     click.echo(f"{msg}\n")
-    logger.info(msg)
+    _logger.info(msg)
     return delete_time
 
 
@@ -172,7 +187,7 @@ def _load_source(
     """
     msg = f"Loading {n.value}..."
     click.echo(msg)
-    logger.info(msg)
+    _logger.info(msg)
     start_load = timer()
 
     # used to get source class name from string
@@ -190,17 +205,17 @@ def _load_source(
     try:
         processed_ids += source.perform_etl(use_existing)
     except GeneNormalizerEtlError as e:
-        logger.error(e)
+        _logger.error(e)
         click.echo(f"Encountered error while loading {n}: {e}.")
         click.get_current_context().exit()
     end_load = timer()
     load_time = end_load - start_load
     msg = f"Loaded {n.value} in {load_time:.5f} seconds."
     click.echo(msg)
-    logger.info(msg)
+    _logger.info(msg)
     msg = f"Total time for {n.value}: {(delete_time + load_time):.5f} seconds."
     click.echo(msg)
-    logger.info(msg)
+    _logger.info(msg)
 
 
 def _delete_normalized_data(database: AbstractDatabase) -> None:
@@ -286,6 +301,7 @@ def update_normalizer_db(
     :param update_merged: if true, update normalized records
     :param use_existing: if True, use most recent local data instead of fetching latest version
     """  # noqa: D301
+    _configure_logging()
     db = create_db(db_url, aws_instance)
 
     if update_all:
diff --git a/src/gene/database/dynamodb.py b/src/gene/database/dynamodb.py
index 5df9e0d0..5baee7b7 100644
--- a/src/gene/database/dynamodb.py
+++ b/src/gene/database/dynamodb.py
@@ -25,7 +25,7 @@
 )
 from gene.schemas import RecordType, RefType, SourceMeta, SourceName
 
-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
 
 
 class DynamoDbDatabase(AbstractDatabase):
@@ -156,7 +156,7 @@ def check_schema_initialized(self) -> bool:
         existing_tables = self.list_tables()
         exists = self.gene_table in existing_tables
         if not exists:
-            logger.info(f"{self.gene_table} table is missing or unavailable.")
+            _logger.info(f"{self.gene_table} table is missing or unavailable.")
         return exists
 
     def check_tables_populated(self) -> bool:
@@ -173,7 +173,7 @@ def check_tables_populated(self) -> bool:
             KeyConditionExpression=Key("item_type").eq("source"),
         ).get("Items", [])
         if len(sources) < len(SourceName):
-            logger.info("Gene sources table is missing expected sources.")
+            _logger.info("Gene sources table is missing expected sources.")
             return False
 
         records = self.genes.query(
@@ -182,7 +182,7 @@ def check_tables_populated(self) -> bool:
             Limit=1,
         )
         if len(records.get("Items", [])) < 1:
-            logger.info("Gene records index is empty.")
+            _logger.info("Gene records index is empty.")
             return False
 
         normalized_records = self.genes.query(
@@ -191,7 +191,7 @@ def check_tables_populated(self) -> bool:
             Limit=1,
         )
         if len(normalized_records.get("Items", [])) < 1:
-            logger.info("Normalized gene records index is empty.")
+            _logger.info("Normalized gene records index is empty.")
             return False
 
         return True
@@ -253,7 +253,7 @@ def get_record_by_id(
                 del record["label_and_type"]
             return record
         except ClientError as e:
-            logger.error(
+            _logger.error(
                 f"boto3 client error on get_records_by_id for "
                 f"search term {concept_id}: "
                 f"{e.response['Error']['Message']}"
@@ -276,7 +276,7 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]:
             matches = self.genes.query(KeyConditionExpression=filter_exp)
             return [m["concept_id"] for m in matches.get("Items", None)]
         except ClientError as e:
-            logger.error(
+            _logger.error(
                 f"boto3 client error on get_refs_by_type for "
                 f"search term {search_term}: "
                 f"{e.response['Error']['Message']}"
@@ -380,7 +380,7 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
         try:
             self.batch.put_item(Item=record)
         except ClientError as e:
-            logger.error(
+            _logger.error(
                 "boto3 client error on add_record for "
                 f"{concept_id}: {e.response['Error']['Message']}"
             )
@@ -412,7 +412,7 @@ def add_merged_record(self, record: Dict) -> None:
         try:
             self.batch.put_item(Item=record)
         except ClientError as e:
-            logger.error(
+            _logger.error(
                 "boto3 client error on add_record for "
                 f"{concept_id}: {e.response['Error']['Message']}"
             )
@@ -438,7 +438,7 @@ def _add_ref_record(
         try:
             self.batch.put_item(Item=record)
         except ClientError as e:
-            logger.error(
+            _logger.error(
                 f"boto3 client error adding reference {term} for "
                 f"{concept_id} with match type {ref_type}: "
                 f"{e.response['Error']['Message']}"
@@ -470,7 +470,7 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None:  # noqa: AN
                     f"No such record exists for keys {label_and_type}, {concept_id}"
                 )
             else:
-                logger.error(
+                _logger.error(
                     f"boto3 client error in `database.update_record()`: "
                     f"{e.response['Error']['Message']}"
                 )
diff --git a/src/gene/database/postgresql.py b/src/gene/database/postgresql.py
index 16209ed0..e181a2a1 100644
--- a/src/gene/database/postgresql.py
+++ b/src/gene/database/postgresql.py
@@ -26,7 +26,7 @@
 )
 from gene.schemas import RecordType, RefType, SourceMeta, SourceName
 
-logger = logging.getLogger(__name__)
+_logger = logging.getLogger(__name__)
 
 SCRIPTS_DIR = Path(__file__).parent / "postgresql"
 
@@ -119,7 +119,7 @@ def drop_db(self) -> None:
         with self.conn.cursor() as cur:
             cur.execute(self._drop_db_query)
             self.conn.commit()
-        logger.info("Dropped all existing gene normalizer tables.")
+        _logger.info("Dropped all existing gene normalizer tables.")
 
     def check_schema_initialized(self) -> bool:
         """Check if database schema is properly initialized.
@@ -132,7 +132,7 @@ def check_schema_initialized(self) -> bool:
         except DuplicateTable:
             self.conn.rollback()
         else:
-            logger.info("Gene table existence check failed.")
+            _logger.info("Gene table existence check failed.")
             self.conn.rollback()
             return False
         try:
@@ -141,7 +141,7 @@ def check_schema_initialized(self) -> bool:
         except DuplicateObject:
             self.conn.rollback()
         else:
-            logger.info("Gene foreign key existence check failed.")
+            _logger.info("Gene foreign key existence check failed.")
             self.conn.rollback()
             return False
         try:
@@ -152,7 +152,7 @@ def check_schema_initialized(self) -> bool:
         except DuplicateTable:
             self.conn.rollback()
         else:
-            logger.info("Gene normalized view lookup failed.")
+            _logger.info("Gene normalized view lookup failed.")
             self.conn.rollback()
             return False
         try:
@@ -161,7 +161,7 @@ def check_schema_initialized(self) -> bool:
         except DuplicateTable:
             self.conn.rollback()
         else:
-            logger.info("Gene indexes check failed.")
+            _logger.info("Gene indexes check failed.")
             self.conn.rollback()
             return False
 
@@ -184,21 +184,21 @@ def check_tables_populated(self) -> bool:
             cur.execute(self._check_sources_query)
             results = cur.fetchall()
         if len(results) < len(SourceName):
-            logger.info("Gene sources table is missing expected sources.")
+            _logger.info("Gene sources table is missing expected sources.")
             return False
 
         with self.conn.cursor() as cur:
             cur.execute(self._check_concepts_query)
             result = cur.fetchone()
         if not result or result[0] < 1:
-            logger.info("Gene records table is empty.")
+            _logger.info("Gene records table is empty.")
             return False
 
         with self.conn.cursor() as cur:
             cur.execute(self._check_merged_query)
             result = cur.fetchone()
         if not result or result[0] < 1:
-            logger.info("Normalized gene records table is empty.")
+            _logger.info("Normalized gene records table is empty.")
             return False
 
         return True
@@ -260,7 +260,7 @@ def _drop_indexes(self) -> None:
 
     def _create_tables(self) -> None:
         """Create all tables, indexes, and views."""
-        logger.debug("Creating new gene normalizer tables.")
+        _logger.debug("Creating new gene normalizer tables.")
         tables_query = (SCRIPTS_DIR / "create_tables.sql").read_bytes()
 
         with self.conn.cursor() as cur:
@@ -594,7 +594,7 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
                 cur.execute(self._ins_symbol_query, [record["symbol"], concept_id])
                 self.conn.commit()
         except UniqueViolation:
-            logger.error(f"Record with ID {concept_id} already exists")
+            _logger.error(f"Record with ID {concept_id} already exists")
             self.conn.rollback()
 
     _add_merged_record_query = b"""
diff --git a/src/gene/etl/base.py b/src/gene/etl/base.py
index 77e9eee1..d49b3b11 100644
--- a/src/gene/etl/base.py
+++ b/src/gene/etl/base.py
@@ -15,8 +15,7 @@
 from gene.database import AbstractDatabase
 from gene.schemas import Gene, GeneSequenceLocation, MatchType, SourceName
 
-logger = logging.getLogger("gene")
-logger.setLevel(logging.DEBUG)
+_logger = logging.getLogger(__name__)
 
 
 DATA_DISPATCH = {
@@ -70,6 +69,7 @@ def perform_etl(self, use_existing: bool = False) -> List[str]:
             uploaded.
         """
         self._extract_data(use_existing)
+        _logger.info("Transforming and loading %s data to DB...", self._src_name.value)
         if not self._silent:
             click.echo("Transforming and loading data to DB...")
         self._add_meta()
@@ -88,6 +88,7 @@ def _extract_data(self, use_existing: bool) -> None:
         self._data_file, self._version = self._data_source.get_latest(
             from_local=use_existing
         )
+        _logger.info("Acquired data for %s: %s", self._src_name.value, self._data_file)
 
     @abstractmethod
     def _transform_data(self) -> None:
@@ -110,7 +111,7 @@ def _load_gene(self, gene: Dict) -> None:
         try:
             assert Gene(match_type=MatchType.NO_MATCH, **gene)
         except pydantic.ValidationError as e:
-            logger.warning(f"Unable to load {gene} due to validation error: " f"{e}")
+            _logger.warning(f"Unable to load {gene} due to validation error: " f"{e}")
         else:
             concept_id = gene["concept_id"]
             gene["label_and_type"] = f"{concept_id.lower()}##identity"
@@ -211,7 +212,7 @@ def _get_seq_id_aliases(self, seq_id: str) -> List[str]:
         try:
             aliases = self.seqrepo.translate_alias(seq_id, target_namespaces="ga4gh")
         except KeyError as e:
-            logger.warning(f"SeqRepo raised KeyError: {e}")
+            _logger.warning(f"SeqRepo raised KeyError: {e}")
         return aliases
 
     def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Dict:
@@ -238,7 +239,7 @@ def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Di
                 sequence_id=sequence,
             ).model_dump()  # type: ignore
         else:
-            logger.warning(
+            _logger.warning(
                 f"{params['concept_id']} has invalid interval:"
                 f"start={gene.start - 1} end={gene.end}"
             )  # type: ignore
diff --git a/src/gene/etl/ensembl.py b/src/gene/etl/ensembl.py
index 4a52975a..e062a545 100644
--- a/src/gene/etl/ensembl.py
+++ b/src/gene/etl/ensembl.py
@@ -12,8 +12,7 @@
 )
 from gene.schemas import NamespacePrefix, SourceMeta, SourceName, Strand
 
-logger = logging.getLogger("gene")
-logger.setLevel(logging.DEBUG)
+_logger = logging.getLogger(__name__)
 
 
 class Ensembl(Base):
@@ -36,7 +35,7 @@ def _extract_data(self, use_existing: bool) -> None:
 
     def _transform_data(self) -> None:
         """Transform the Ensembl source."""
-        logger.info("Transforming Ensembl...")
+        _logger.info("Transforming Ensembl...")
         db = gffutils.create_db(
             str(self._data_file),
             dbfn=":memory:",
@@ -59,7 +58,7 @@ def _transform_data(self) -> None:
                 gene = self._add_gene(f, accession_numbers)
                 if gene:
                     self._load_gene(gene)
-        logger.info("Successfully transformed Ensembl.")
+        _logger.info("Successfully transformed Ensembl.")
 
     def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict:
         """Create a transformed gene record.
diff --git a/src/gene/etl/hgnc.py b/src/gene/etl/hgnc.py
index 2fee6117..242923d7 100644
--- a/src/gene/etl/hgnc.py
+++ b/src/gene/etl/hgnc.py
@@ -18,8 +18,7 @@
     SymbolStatus,
 )
 
-logger = logging.getLogger("gene")
-logger.setLevel(logging.DEBUG)
+_logger = logging.getLogger(__name__)
 
 
 class HGNC(Base):
@@ -27,7 +26,7 @@ class HGNC(Base):
 
     def _transform_data(self) -> None:
         """Transform the HGNC source."""
-        logger.info("Transforming HGNC...")
+        _logger.info("Transforming HGNC...")
         with open(self._data_file, "r") as f:  # type: ignore
             data = json.load(f)
 
@@ -58,7 +57,7 @@ def _transform_data(self) -> None:
             if "locus_type" in r:
                 gene["gene_type"] = r["locus_type"]
             self._load_gene(gene)
-        logger.info("Successfully transformed HGNC.")
+        _logger.info("Successfully transformed HGNC.")
 
     def _get_aliases(self, r: Dict, gene: Dict) -> None:
         """Store aliases in a gene record.
@@ -139,7 +138,7 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None:
                 else:
                     self._get_xref_associated_with(key, src, r, associated_with)
             else:
-                logger.warning(f"{key} not in schemas.py")
+                _logger.warning(f"{key} not in schemas.py")
 
         if xrefs:
             gene["xrefs"] = xrefs
diff --git a/src/gene/etl/merge.py b/src/gene/etl/merge.py
index 8124d294..651ce8ae 100644
--- a/src/gene/etl/merge.py
+++ b/src/gene/etl/merge.py
@@ -7,8 +7,7 @@
 from gene.database.database import DatabaseWriteException
 from gene.schemas import GeneTypeFieldName, RecordType, SourcePriority
 
-logger = logging.getLogger("gene")
-logger.setLevel(logging.DEBUG)
+_logger = logging.getLogger(__name__)
 
 
 class Merge:
@@ -28,7 +27,7 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None:
         :param record_ids: concept identifiers from which groups should be generated.
             Should *not* include any records from excluded sources.
         """
-        logger.info("Generating record ID sets...")
+        _logger.info("Generating record ID sets...")
         start = timer()
         for record_id in record_ids:
             new_group = self._create_record_id_set(record_id)
@@ -36,11 +35,11 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None:
             for concept_id in new_group:
                 self._groups[concept_id] = new_group
         end = timer()
-        logger.debug(f"Built record ID sets in {end - start} seconds")
+        _logger.debug(f"Built record ID sets in {end - start} seconds")
 
         self._groups = {k: v for k, v in self._groups.items() if len(v) > 1}
 
-        logger.info("Creating merged records and updating database...")
+        _logger.info("Creating merged records and updating database...")
         uploaded_ids = set()
         start = timer()
         for record_id, group in self._groups.items():
@@ -58,17 +57,17 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None:
                     self._database.update_merge_ref(concept_id, merge_ref)
                 except DatabaseWriteException as dw:
                     if str(dw).startswith("No such record exists"):
-                        logger.error(
+                        _logger.error(
                             f"Updating nonexistent record: {concept_id} "
                             f"for merge ref to {merge_ref}"
                         )
                     else:
-                        logger.error(str(dw))
+                        _logger.error(str(dw))
             uploaded_ids |= group
         self._database.complete_write_transaction()
-        logger.info("Merged concept generation successful.")
+        _logger.info("Merged concept generation successful.")
         end = timer()
-        logger.debug(f"Generated and added concepts in {end - start} seconds")
+        _logger.debug(f"Generated and added concepts in {end - start} seconds")
 
     def _create_record_id_set(
         self, record_id: str, observed_id_set: Optional[Set] = None
@@ -88,7 +87,7 @@ def _create_record_id_set(
         else:
             db_record = self._database.get_record_by_id(record_id)
             if not db_record:
-                logger.warning(
+                _logger.warning(
                     f"Record ID set creator could not resolve "
                     f"lookup for {record_id} in ID set: "
                     f"{observed_id_set}"
@@ -124,7 +123,7 @@ def _generate_merged_record(self, record_id_set: Set[str]) -> Dict:
             if record:
                 records.append(record)
             else:
-                logger.error(
+                _logger.error(
                     f"Merge record generator could not retrieve "
                     f"record for {record_id} in {record_id_set}"
                 )
diff --git a/src/gene/etl/ncbi.py b/src/gene/etl/ncbi.py
index a3b2e706..01681bb4 100644
--- a/src/gene/etl/ncbi.py
+++ b/src/gene/etl/ncbi.py
@@ -24,8 +24,7 @@
     SymbolStatus,
 )
 
-logger = logging.getLogger("gene")
-logger.setLevel(logging.DEBUG)
+_logger = logging.getLogger(__name__)
 
 
 class NCBI(Base):
@@ -53,6 +52,7 @@ def _extract_data(self, use_existing: bool) -> None:
 
         :param use_existing: if True, use latest available local file
         """
+        _logger.info("Gathering %s data...", self._src_name.value)
         self._gff_src, self._assembly = self._genome_data_handler.get_latest(
             from_local=use_existing
         )
@@ -130,7 +130,7 @@ def _add_xrefs_associated_with(self, val: List[str], params: Dict) -> None:
                 if prefix:
                     params["associated_with"].append(f"{prefix}:{src_id}")
                 else:
-                    logger.info(f"{src_name} is not in NameSpacePrefix.")
+                    _logger.info(f"{src_name} is not in NameSpacePrefix.")
         if not params["xrefs"]:
             del params["xrefs"]
         if not params["associated_with"]:
@@ -324,7 +324,7 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict:
 
         if len(chromosomes) >= 2:
             if chromosomes and "X" not in chromosomes and "Y" not in chromosomes:
-                logger.info(
+                _logger.info(
                     f"{row[2]} contains multiple distinct "
                     f"chromosomes: {chromosomes}."
                 )
@@ -350,7 +350,7 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict:
             # Exclude genes where there are multiple distinct locations
             # i.e. OMS: '10q26.3', '19q13.42-q13.43', '3p25.3'
             if len(locations) > 2:
-                logger.info(
+                _logger.info(
                     f"{row[2]} contains multiple distinct " f"locations: {locations}."
                 )
                 locations = None
@@ -361,7 +361,7 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict:
             for i in range(len(locations)):
                 loc = locations[i].strip()
                 if not re.match("^([1-9][0-9]?|X[pq]?|Y[pq]?)", loc):
-                    logger.info(
+                    _logger.info(
                         f"{row[2]} contains invalid map location:" f"{loc}."
                     )
                     params["location_annotations"].append(loc)
@@ -449,7 +449,7 @@ def _set_centromere_location(self, loc: str, location: Dict) -> None:
 
     def _transform_data(self) -> None:
         """Modify data and pass to loading functions."""
-        logger.info("Transforming NCBI...")
+        _logger.info("Transforming NCBI...")
        prev_symbols = self._get_prev_symbols()
        info_genes = self._get_gene_info(prev_symbols)
 
@@ -466,7 +466,7 @@ def _transform_data(self) -> None:
         for gene in info_genes.keys():
             self._load_gene(info_genes[gene])
 
-        logger.info("Successfully transformed NCBI.")
+        _logger.info("Successfully transformed NCBI.")
 
     def _add_meta(self) -> None:
         """Add Ensembl metadata.
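
Note on the logging changes above: configuration now happens once per CLI invocation via _configure_logging(), rather than at import time, and every module logs through its own logging.getLogger(__name__) logger. The sketch below illustrates how records flow under standard library semantics; it is an illustration, not part of the patch. The logger name "gene.etl.ncbi" and the sample message are stand-ins, while the basicConfig call mirrors _configure_logging() in src/gene/cli.py, where __package__ resolves to "gene".

    import logging

    # Equivalent of _configure_logging(): attach a file handler to the root
    # logger and set the package-level logger to DEBUG.
    logging.basicConfig(
        filename="gene.log",
        format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s",
    )
    logging.getLogger("gene").setLevel(logging.DEBUG)

    # Stand-in for a module-level logger, i.e. logging.getLogger(__name__)
    # evaluated inside src/gene/etl/ncbi.py.
    _logger = logging.getLogger("gene.etl.ncbi")

    # The record passes the level check via its nearest configured ancestor
    # ("gene", set to DEBUG) and propagates to the root logger's file handler,
    # producing a line in gene.log like:
    # [2024-01-01 12:00:00,000] - gene.etl.ncbi - DEBUG : Transforming NCBI...
    _logger.debug("Transforming NCBI...")

One consequence of this design worth noting: because only the package logger's level is set and handlers live on the root logger, library consumers who import gene get quiet, unconfigured loggers by default, while the CLI commands opt in to file logging explicitly.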