Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 11 additions & 8 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.env
*.pyc
vendor/*
Assignee_Lawyer_Disambiguation/lib/alchemy/config.ini
Expand All @@ -8,16 +9,14 @@ venv
.vscode
*.yml
*.yaml
mydumper/
mydumper/*
!mydumper/mydumper.cnf.template
Development/config.ini
airflow/airflow-webserver.pid
airflow/airflow.db
airflow/lawyer.pickle
airflow/logs/scheduler/2019-01-15/update-db.py.log
airflow/logs/scheduler/2019-01-16/update-db.py.log
airflow/logs/scheduler/latest
airflow/*
!airflow/dags/*
!airflow/airflow.cfg.template
airflow-metadata-db-disk
airflow/unittests.cfg
airflow_pipeline_env.sh
Development/dev_config.ini
Assignee_Lawyer_Disambiguation/lib/alchemy/alchemy_config.ini
airflow-metadata-db-disk/*
Expand All @@ -28,6 +27,8 @@ node_modules/
app-db-exploration
QA_*
output/
go_live/*.json
go_live/*.csv
upload_*/*
*_qa_loc*
**/*.log
Expand All @@ -44,7 +45,9 @@ scratch
**/*.err
pgpubs_*
resources/sql.conf
resources/us-patent-application-*.dtd
config.ini
airflow-metadata-*
patent_db_disk/
TableToggle.json
Z_Frame_job-*.csv
657 changes: 419 additions & 238 deletions QA/DatabaseTester.py

Large diffs are not rendered by default.

54 changes: 37 additions & 17 deletions QA/production/ProdDBTester.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from lib.download_check_delete_databases import query_for_all_tables_in_db, get_count_for_all_tables
from lib.download_check_delete_databases import (
query_for_all_tables_in_db,
get_count_for_all_tables,
)
from lib.configuration import get_current_config, get_unique_connection_string
import datetime
from QA.DatabaseTester import DatabaseTester
Expand All @@ -11,40 +14,57 @@

class ProdDBTester(DatabaseTester):
def __init__(self, config):
end_date = datetime.datetime.strptime(config['DATES']['END_DATE'], '%Y%m%d')
database_name = config['PATENTSVIEW_DATABASES']["REPORTING_DATABASE"]
super().__init__(config, database_name, datetime.date(year=1976, month=1, day=1),end_date)
self.connection = pymysql.connect(host=config['PROD_DATABASE_SETUP']['HOST'],
user=config['PROD_DATABASE_SETUP']['USERNAME'],
password=config['PROD_DATABASE_SETUP']['PASSWORD'],
db=database_name,
charset='utf8mb4',
cursorclass=pymysql.cursors.SSCursor, defer_connect=True)
end_date = datetime.datetime.strptime(config["DATES"]["END_DATE"], "%Y%m%d")
database_name = config["PATENTSVIEW_DATABASES"]["REPORTING_DATABASE"]
super().__init__(
config, database_name, datetime.date(year=1976, month=1, day=1), end_date
)

self.connection = pymysql.connect(
host=config["PROD_DATABASE_SETUP"]["HOST"],
user=config["PROD_DATABASE_SETUP"]["USERNAME"],
password=config["PROD_DATABASE_SETUP"]["PASSWORD"],
db=database_name,
charset="utf8mb4",
cursorclass=pymysql.cursors.SSCursor,
defer_connect=True,
)
self.database_type = "PROD_" + "PatentsView"

def run_prod_db_tests(self):
counter = 0
total_tables = len(self.table_config.keys())
self.init_qa_dict()
for table in self.table_config:
for table in ["cpc_current_group_application_year"]: # self.table_config:
self.load_table_row_count(table, where_vi=False)
self.check_for_indexes(table)
self.save_qa_data()
self.init_qa_dict()
logger.info(f"FINISHED WITH TABLE: {table}")
counter += 1
logger.info(f"==============================================================================")
logger.info(f"Currently Done With {counter} of {total_tables} | {counter/total_tables:.2%}")
logger.info(f"==============================================================================")
logger.info(
f"=============================================================================="
)
logger.info(
f"Currently Done With {counter} of {total_tables} | {counter/total_tables:.2%}"
)
logger.info(
f"=============================================================================="
)


def run_prod_db_qa(**kwargs):
config = get_current_config(type='granted_patent', schedule="quarterly", **kwargs)
config = get_current_config(type="granted_patent", schedule="quarterly", **kwargs)
qc = ProdDBTester(config)
qc.run_prod_db_tests()


if __name__ == '__main__':
if __name__ == "__main__":
# check_reporting_db_row_count()
config = get_current_config('granted_patent', schedule='quarterly', **{"execution_date": datetime.date(2023, 4, 1)})
config = get_current_config(
"granted_patent",
schedule="quarterly",
**{"execution_date": datetime.date(2025, 9, 1)},
)
qc = ProdDBTester(config)
qc.run_prod_db_tests()
6 changes: 6 additions & 0 deletions commands
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
cp 20211230/patent/download/g_inventor_disambiguated.tsv /pv_export_volume/output/20211230_g_inventor_disambiguated.tsv


awk -F'\t' 'NR==1 || $6=="" || $6=="\"\""' 20220929_g_inventor_disambiguated.tsv > 20220929_output.tsv

wc -l 20220929_output.tsv
2 changes: 1 addition & 1 deletion gender_it
Loading