From e1bd687afab3ea7a118fcdb6467e88d0f832ce92 Mon Sep 17 00:00:00 2001 From: Dylan Hall Date: Wed, 1 Mar 2023 07:59:23 -0500 Subject: [PATCH 1/7] add new options for v2 address selection query --- utils/data_reader.py | 124 ++++++++++++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 32 deletions(-) diff --git a/utils/data_reader.py b/utils/data_reader.py index 029a490..5739f1d 100644 --- a/utils/data_reader.py +++ b/utils/data_reader.py @@ -52,7 +52,7 @@ def add_parser_db_args(parser): default=V2, choices=[V1, V2], help="Version of the CODI Data Model schema to use. " - f'Valid options are "{V1}" or "{V2}"', + f'Valid options are "{V1}" or "{V2}". Default is "{V2}', ) parser.add_argument( @@ -79,8 +79,29 @@ def add_parser_db_args(parser): "--schema_name", dest="v2_schema", default="cdm", - help="Name of the database schema containing the CODI DEMOGRAPHIC" - " and PRIVATE_DEMOGRAPHIC tables", + help="Name of the database schema containing the PRIVATE_DEMOGRAPHIC" + " and PRIVATE_ADDRESS_HISTORY tables in a v2 database. " + "Default is 'cdm'", + ) + parser.add_argument( + "--address_selection", + dest="v2_address_selection", + choices=["full", "preferred", "single"], + default="full", + help="Determines the approach for selecting a single address per PATID" + " from PRIVATE_ADDRESS_HISTORY. Options: Use \"single\" if " + "the data is already guaranteed to only contain one address per PATID." + " Use \"preferred\" if the database is guaranteed to only contain one " + "address with address_preferred='Y' per PATID. " + "Use \"full\" if the database may contain multiple preferred addresses" + " for different dates/types/use. This option will select " + "the most recent preferred address by start date." + "Default if not specified is 'full'", + ) + parser.add_argument( + "--debug_query", + action="store_true", + help="Aids in debugging by printing out the actual DB query being run", ) @@ -171,6 +192,10 @@ def get_query(engine, version, args): ) query = select([identifier]) + + if args.debug_query: + print(query) + return query else: # note there is also the `demographic` table, but @@ -193,36 +218,71 @@ def get_query(engine, version, args): schema=args.v2_schema, ) - addr_period_order = prv_address.columns.address_period_start.desc() - - # Different SQL engines have different semantics for sorting DESC: - # Postgres and Oracle put nulls first, so we want NULLS LAST - # MSSQL puts nulls last, but doesn't support NULLS LAST - # so we use this hack to get NULLS LAST for all main dialects. - # For safety, in case other engines also don't support NULLS LAST, - # only apply it to the ones that we know it works on - # (vs not applying it to the ones we know it doesn't) - - # TODO: test on MySQL - deferring since none of our partners use it now - - # known engine dialect names are "mssql", "postgresql", and "oracle" - if engine.dialect.name in ["postgresql", "oracle"]: - addr_period_order = addr_period_order.nulls_last() - - subquery = ( - select(prv_address.columns.addressid) - .filter(prv_address.columns.patid == prv_demo.columns.patid) - .order_by(prv_address.columns.address_preferred.desc()) - .order_by(addr_period_order) - .limit(1) - .correlate(prv_demo) - .scalar_subquery() - ) + if args.v2_address_selection == "single": + # The user said their data is guaranteed to only have a single + # address per PATID. This simplifies the query to just + # join the tables together with no additional filters + query = select([prv_demo, prv_address]).filter( + prv_demo.columns.patid == prv_address.columns.patid + ) + elif args.v2_address_selection == "preferred": + # The user said their data may have multiple addresses, + # but is guaranteed that only one per PATID will be preferred. + # This simplifies the query to just select ADDRESS_PREFERRED=Y + query = select([prv_demo, prv_address]).filter( + prv_demo.columns.patid == prv_address.columns.patid, + prv_address.columns.address_preferred == 'Y', + ) + else: + # The user indicated the data may have multiple preferreds, + # (or at least did not select one of the above options) + # so we select the most recent by date. + # The PCOR schema includes "type" (physical/postal/both/unknown) + # and "use" (home/word/temp/old/unknown) fields, and the hierarchy + # of the possible combination of those options is not well-defined. + # (eg, should we pick physical/work over a both-type/unknown-use?) + # For simplicity and performance we will just pick + # the first preferred address we find, sorting by date. + # Going forward, a better solution is likely to include all of + # an individuals' addresses in PPRL, rather than more complex ways + # of picking a single one. + + addr_period_order = prv_address.columns.address_period_start.desc() + + # Different SQL engines have different semantics for sorting DESC: + # Postgres and Oracle put nulls first, so we want NULLS LAST + # MSSQL puts nulls last, but doesn't support NULLS LAST + # so we use this hack to get NULLS LAST for all main dialects. + # For safety, in case other engines also don't support NULLS LAST, + # only apply it to the ones that we know it works on + # (vs not applying it to the ones we know it doesn't) + + # TODO: test on MySQL - deferring since none of our partners use it now + + # known engine dialect names are "mssql", "postgresql", and "oracle" + if engine.dialect.name in ["postgresql", "oracle"]: + addr_period_order = addr_period_order.nulls_last() + + subquery = ( + select(prv_address.columns.addressid) + .filter( + prv_address.columns.patid == prv_demo.columns.patid, + prv_address.columns.address_preferred == 'Y' + ) + .order_by(prv_address.columns.address_preferred.desc()) + .order_by(addr_period_order) + .limit(1) + .correlate(prv_demo) + .scalar_subquery() + ) - query = select([prv_demo, prv_address]).filter( - prv_demo.columns.patid == prv_address.columns.patid, - prv_address.columns.addressid == subquery, - ) + query = select([prv_demo, prv_address]).filter( + prv_demo.columns.patid == prv_address.columns.patid, + prv_address.columns.addressid == subquery, + ) + + if args.debug_query: + print(query) return query From 4c99cca3047895b6072116376a0a02ce2c6627fd Mon Sep 17 00:00:00 2001 From: Dylan Hall Date: Wed, 1 Mar 2023 07:59:49 -0500 Subject: [PATCH 2/7] don't crash on null dob/sex --- extract.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/extract.py b/extract.py index 5c2b480..77af057 100644 --- a/extract.py +++ b/extract.py @@ -230,11 +230,17 @@ def handle_row(row, report, version): output_row.append(clean_string(family_name)) dob = case_insensitive_lookup(row, "DOB", version) - output_row.append(dob.strftime("%Y-%m-%d")) + if dob: + output_row.append(dob.strftime("%Y-%m-%d")) + else: + output_row.append("") sex = case_insensitive_lookup(row, "sex", version) validate(report, "sex", sex) - output_row.append(sex.strip()) + if sex: + output_row.append(sex.strip()) + else: + output_row.append("") phone_number = case_insensitive_lookup(row, "phone", version) validate(report, "phone_number", phone_number) From b6c648508229eacd16c0f53e7877062b54d67bdd Mon Sep 17 00:00:00 2001 From: Dylan Hall Date: Wed, 1 Mar 2023 08:00:44 -0500 Subject: [PATCH 3/7] remove 'earlier dob than expected' from data characterization --- data_analysis.py | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/data_analysis.py b/data_analysis.py index a2a72ee..549b9f4 100644 --- a/data_analysis.py +++ b/data_analysis.py @@ -56,16 +56,8 @@ def analyze(data, source): "missing": int(dob_col.isna().sum()), } - expected_min_dob = "1997-01-01" - # roughly, 19 years old at the start of CODI observation period (2016) - # expected_max_dob is trickier because we don't know when the data was populated - # TODO: add another command line arg? - if source == "csv": - if "-" in notnull_dobs[0]: - out_of_range = notnull_dobs[notnull_dobs < expected_min_dob] - stats["dob"]["count_earlier_dob_than_expected"] = len(out_of_range) - else: + if "-" not in notnull_dobs.iloc[0]: # the date format will either be YYYY-MM-DD or YYMMDD # we'll assume it's consistent across a single file @@ -81,21 +73,6 @@ def analyze(data, source): ) # str-ify the Timestamps again stats["dob"]["max_parsed"] = str(parsed_dobs.max()) - expected_min_dob = pd.to_datetime(expected_min_dob, format="%Y-%m-%d") - - out_of_range = parsed_dobs[parsed_dobs < expected_min_dob] - stats["dob"]["count_earlier_dob_than_expected"] = len(out_of_range) - - else: - # different DBs may return different data types for the DOB column - if type(notnull_dobs[0]) is date: - expected_min_dob = date.fromisoformat(expected_min_dob) - # else - # assume it's a string in ISO format, no change should be needed - - out_of_range = notnull_dobs[notnull_dobs < expected_min_dob] - stats["dob"]["count_earlier_dob_than_expected"] = len(out_of_range) - sex_col = case_insensitive_lookup(data, "sex", source) stats["sex"] = top_N(sex_col) From 4d504e4b2f381bf4a93f66a09d796bf2dc57f03c Mon Sep 17 00:00:00 2001 From: Dylan Hall Date: Wed, 1 Mar 2023 08:25:35 -0500 Subject: [PATCH 4/7] don't crash on blank DOB in csv extract --- extract.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/extract.py b/extract.py index 77af057..0d1947e 100644 --- a/extract.py +++ b/extract.py @@ -97,6 +97,8 @@ def clean_zip(household_zip): def clean_dob_fromstr(dob_str, date_format): + if not dob_str: + return "" return strftime("%Y-%m-%d", strptime(dob_str, date_format)) From c06bfdec87078ae3b39b7415712a56320a23388c Mon Sep 17 00:00:00 2001 From: Dylan Hall Date: Wed, 1 Mar 2023 08:26:42 -0500 Subject: [PATCH 5/7] style fixes --- utils/data_reader.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/utils/data_reader.py b/utils/data_reader.py index 5739f1d..3bec987 100644 --- a/utils/data_reader.py +++ b/utils/data_reader.py @@ -89,11 +89,11 @@ def add_parser_db_args(parser): choices=["full", "preferred", "single"], default="full", help="Determines the approach for selecting a single address per PATID" - " from PRIVATE_ADDRESS_HISTORY. Options: Use \"single\" if " + ' from PRIVATE_ADDRESS_HISTORY. Options: Use "single" if ' "the data is already guaranteed to only contain one address per PATID." - " Use \"preferred\" if the database is guaranteed to only contain one " + ' Use "preferred" if the database is guaranteed to only contain one ' "address with address_preferred='Y' per PATID. " - "Use \"full\" if the database may contain multiple preferred addresses" + 'Use "full" if the database may contain multiple preferred addresses' " for different dates/types/use. This option will select " "the most recent preferred address by start date." "Default if not specified is 'full'", @@ -231,7 +231,7 @@ def get_query(engine, version, args): # This simplifies the query to just select ADDRESS_PREFERRED=Y query = select([prv_demo, prv_address]).filter( prv_demo.columns.patid == prv_address.columns.patid, - prv_address.columns.address_preferred == 'Y', + prv_address.columns.address_preferred == "Y", ) else: # The user indicated the data may have multiple preferreds, @@ -267,7 +267,7 @@ def get_query(engine, version, args): select(prv_address.columns.addressid) .filter( prv_address.columns.patid == prv_demo.columns.patid, - prv_address.columns.address_preferred == 'Y' + prv_address.columns.address_preferred == "Y", ) .order_by(prv_address.columns.address_preferred.desc()) .order_by(addr_period_order) From 66ca835bc08464277bbe459af0c74d170727fa94 Mon Sep 17 00:00:00 2001 From: Dylan Hall Date: Wed, 1 Mar 2023 08:29:12 -0500 Subject: [PATCH 6/7] remove unused import --- data_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_analysis.py b/data_analysis.py index 549b9f4..115fe6d 100644 --- a/data_analysis.py +++ b/data_analysis.py @@ -2,7 +2,7 @@ import json import re import time -from datetime import date, datetime +from datetime import datetime import pandas as pd From 331356071e4953176efabbc673d1a8bfaa856ca4 Mon Sep 17 00:00:00 2001 From: Dylan Hall Date: Wed, 1 Mar 2023 09:00:05 -0500 Subject: [PATCH 7/7] missing quote in doc --- utils/data_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/data_reader.py b/utils/data_reader.py index 3bec987..c523983 100644 --- a/utils/data_reader.py +++ b/utils/data_reader.py @@ -52,7 +52,7 @@ def add_parser_db_args(parser): default=V2, choices=[V1, V2], help="Version of the CODI Data Model schema to use. " - f'Valid options are "{V1}" or "{V2}". Default is "{V2}', + f'Valid options are "{V1}" or "{V2}". Default is "{V2}"', ) parser.add_argument(