From e1bd687afab3ea7a118fcdb6467e88d0f832ce92 Mon Sep 17 00:00:00 2001
From: Dylan Hall <dehall@mitre.org>
Date: Wed, 1 Mar 2023 07:59:23 -0500
Subject: [PATCH 1/7] add new options for v2 address selection query

---
 utils/data_reader.py | 124 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 92 insertions(+), 32 deletions(-)

diff --git a/utils/data_reader.py b/utils/data_reader.py
index 029a490..5739f1d 100644
--- a/utils/data_reader.py
+++ b/utils/data_reader.py
@@ -52,7 +52,7 @@ def add_parser_db_args(parser):
         default=V2,
         choices=[V1, V2],
         help="Version of the CODI Data Model schema to use. "
-        f'Valid options are "{V1}" or "{V2}"',
+        f'Valid options are "{V1}" or "{V2}". Default is "{V2}',
     )
 
     parser.add_argument(
@@ -79,8 +79,29 @@ def add_parser_db_args(parser):
         "--schema_name",
         dest="v2_schema",
         default="cdm",
-        help="Name of the database schema containing the CODI DEMOGRAPHIC"
-        " and PRIVATE_DEMOGRAPHIC tables",
+        help="Name of the database schema containing the PRIVATE_DEMOGRAPHIC"
+        " and PRIVATE_ADDRESS_HISTORY tables in a v2 database. "
+        "Default is 'cdm'",
+    )
+    parser.add_argument(
+        "--address_selection",
+        dest="v2_address_selection",
+        choices=["full", "preferred", "single"],
+        default="full",
+        help="Determines the approach for selecting a single address per PATID"
+        " from PRIVATE_ADDRESS_HISTORY. Options: Use \"single\" if "
+        "the data is already guaranteed to only contain one address per PATID."
+        " Use \"preferred\" if the database is guaranteed to only contain one "
+        "address with address_preferred='Y' per PATID. "
+        "Use \"full\" if the database may contain multiple preferred addresses"
+        " for different dates/types/use. This option will select "
+        "the most recent preferred address by start date."
+        "Default if not specified is 'full'",
+    )
+    parser.add_argument(
+        "--debug_query",
+        action="store_true",
+        help="Aids in debugging by printing out the actual DB query being run",
     )
 
 
@@ -171,6 +192,10 @@ def get_query(engine, version, args):
         )
 
         query = select([identifier])
+
+        if args.debug_query:
+            print(query)
+
         return query
     else:
         # note there is also the `demographic` table, but
@@ -193,36 +218,71 @@ def get_query(engine, version, args):
             schema=args.v2_schema,
         )
 
-        addr_period_order = prv_address.columns.address_period_start.desc()
-
-        # Different SQL engines have different semantics for sorting DESC:
-        # Postgres and Oracle put nulls first, so we want NULLS LAST
-        # MSSQL puts nulls last, but doesn't support NULLS LAST
-        # so we use this hack to get NULLS LAST for all main dialects.
-        # For safety, in case other engines also don't support NULLS LAST,
-        #  only apply it to the ones that we know it works on
-        #  (vs not applying it to the ones we know it doesn't)
-
-        # TODO: test on MySQL - deferring since none of our partners use it now
-
-        # known engine dialect names are "mssql", "postgresql", and "oracle"
-        if engine.dialect.name in ["postgresql", "oracle"]:
-            addr_period_order = addr_period_order.nulls_last()
-
-        subquery = (
-            select(prv_address.columns.addressid)
-            .filter(prv_address.columns.patid == prv_demo.columns.patid)
-            .order_by(prv_address.columns.address_preferred.desc())
-            .order_by(addr_period_order)
-            .limit(1)
-            .correlate(prv_demo)
-            .scalar_subquery()
-        )
+        if args.v2_address_selection == "single":
+            # The user said their data is guaranteed to only have a single
+            # address per PATID. This simplifies the query to just
+            # join the tables together with no additional filters
+            query = select([prv_demo, prv_address]).filter(
+                prv_demo.columns.patid == prv_address.columns.patid
+            )
+        elif args.v2_address_selection == "preferred":
+            # The user said their data may have multiple addresses,
+            # but is guaranteed that only one per PATID will be preferred.
+            # This simplifies the query to just select ADDRESS_PREFERRED=Y
+            query = select([prv_demo, prv_address]).filter(
+                prv_demo.columns.patid == prv_address.columns.patid,
+                prv_address.columns.address_preferred == 'Y',
+            )
+        else:
+            # The user indicated the data may have multiple preferreds,
+            # (or at least did not select one of the above options)
+            # so we select the most recent by date.
+            # The PCOR schema includes "type" (physical/postal/both/unknown)
+            # and "use" (home/word/temp/old/unknown) fields, and the hierarchy
+            # of the possible combination of those options is not well-defined.
+            # (eg, should we pick physical/work over a both-type/unknown-use?)
+            # For simplicity and performance we will just pick
+            # the first preferred address we find, sorting by date.
+            # Going forward, a better solution is likely to include all of
+            # an individuals' addresses in PPRL, rather than more complex ways
+            # of picking a single one.
+
+            addr_period_order = prv_address.columns.address_period_start.desc()
+
+            # Different SQL engines have different semantics for sorting DESC:
+            # Postgres and Oracle put nulls first, so we want NULLS LAST
+            # MSSQL puts nulls last, but doesn't support NULLS LAST
+            # so we use this hack to get NULLS LAST for all main dialects.
+            # For safety, in case other engines also don't support NULLS LAST,
+            #  only apply it to the ones that we know it works on
+            #  (vs not applying it to the ones we know it doesn't)
+
+            # TODO: test on MySQL - deferring since none of our partners use it now
+
+            # known engine dialect names are "mssql", "postgresql", and "oracle"
+            if engine.dialect.name in ["postgresql", "oracle"]:
+                addr_period_order = addr_period_order.nulls_last()
+
+            subquery = (
+                select(prv_address.columns.addressid)
+                .filter(
+                    prv_address.columns.patid == prv_demo.columns.patid,
+                    prv_address.columns.address_preferred == 'Y'
+                )
+                .order_by(prv_address.columns.address_preferred.desc())
+                .order_by(addr_period_order)
+                .limit(1)
+                .correlate(prv_demo)
+                .scalar_subquery()
+            )
 
-        query = select([prv_demo, prv_address]).filter(
-            prv_demo.columns.patid == prv_address.columns.patid,
-            prv_address.columns.addressid == subquery,
-        )
+            query = select([prv_demo, prv_address]).filter(
+                prv_demo.columns.patid == prv_address.columns.patid,
+                prv_address.columns.addressid == subquery,
+            )
+
+        if args.debug_query:
+            print(query)
 
         return query
 

From 4c99cca3047895b6072116376a0a02ce2c6627fd Mon Sep 17 00:00:00 2001
From: Dylan Hall <dehall@mitre.org>
Date: Wed, 1 Mar 2023 07:59:49 -0500
Subject: [PATCH 2/7] don't crash on null dob/sex

---
 extract.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/extract.py b/extract.py
index 5c2b480..77af057 100644
--- a/extract.py
+++ b/extract.py
@@ -230,11 +230,17 @@ def handle_row(row, report, version):
     output_row.append(clean_string(family_name))
 
     dob = case_insensitive_lookup(row, "DOB", version)
-    output_row.append(dob.strftime("%Y-%m-%d"))
+    if dob:
+        output_row.append(dob.strftime("%Y-%m-%d"))
+    else:
+        output_row.append("")
 
     sex = case_insensitive_lookup(row, "sex", version)
     validate(report, "sex", sex)
-    output_row.append(sex.strip())
+    if sex:
+        output_row.append(sex.strip())
+    else:
+        output_row.append("")
 
     phone_number = case_insensitive_lookup(row, "phone", version)
     validate(report, "phone_number", phone_number)

From b6c648508229eacd16c0f53e7877062b54d67bdd Mon Sep 17 00:00:00 2001
From: Dylan Hall <dehall@mitre.org>
Date: Wed, 1 Mar 2023 08:00:44 -0500
Subject: [PATCH 3/7] remove 'earlier dob than expected' from data
 characterization

---
 data_analysis.py | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/data_analysis.py b/data_analysis.py
index a2a72ee..549b9f4 100644
--- a/data_analysis.py
+++ b/data_analysis.py
@@ -56,16 +56,8 @@ def analyze(data, source):
         "missing": int(dob_col.isna().sum()),
     }
 
-    expected_min_dob = "1997-01-01"
-    # roughly, 19 years old at the start of CODI observation period (2016)
-    # expected_max_dob is trickier because we don't know when the data was populated
-    # TODO: add another command line arg?
-
     if source == "csv":
-        if "-" in notnull_dobs[0]:
-            out_of_range = notnull_dobs[notnull_dobs < expected_min_dob]
-            stats["dob"]["count_earlier_dob_than_expected"] = len(out_of_range)
-        else:
+        if "-" not in notnull_dobs.iloc[0]:
             # the date format will either be YYYY-MM-DD or YYMMDD
             # we'll assume it's consistent across a single file
 
@@ -81,21 +73,6 @@ def analyze(data, source):
             )  # str-ify the Timestamps again
             stats["dob"]["max_parsed"] = str(parsed_dobs.max())
 
-            expected_min_dob = pd.to_datetime(expected_min_dob, format="%Y-%m-%d")
-
-            out_of_range = parsed_dobs[parsed_dobs < expected_min_dob]
-            stats["dob"]["count_earlier_dob_than_expected"] = len(out_of_range)
-
-    else:
-        # different DBs may return different data types for the DOB column
-        if type(notnull_dobs[0]) is date:
-            expected_min_dob = date.fromisoformat(expected_min_dob)
-        # else
-        #   assume it's a string in ISO format, no change should be needed
-
-        out_of_range = notnull_dobs[notnull_dobs < expected_min_dob]
-        stats["dob"]["count_earlier_dob_than_expected"] = len(out_of_range)
-
     sex_col = case_insensitive_lookup(data, "sex", source)
     stats["sex"] = top_N(sex_col)
 

From 4d504e4b2f381bf4a93f66a09d796bf2dc57f03c Mon Sep 17 00:00:00 2001
From: Dylan Hall <dehall@mitre.org>
Date: Wed, 1 Mar 2023 08:25:35 -0500
Subject: [PATCH 4/7] don't crash on blank DOB in csv extract

---
 extract.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/extract.py b/extract.py
index 77af057..0d1947e 100644
--- a/extract.py
+++ b/extract.py
@@ -97,6 +97,8 @@ def clean_zip(household_zip):
 
 
 def clean_dob_fromstr(dob_str, date_format):
+    if not dob_str:
+        return ""
     return strftime("%Y-%m-%d", strptime(dob_str, date_format))
 
 

From c06bfdec87078ae3b39b7415712a56320a23388c Mon Sep 17 00:00:00 2001
From: Dylan Hall <dehall@mitre.org>
Date: Wed, 1 Mar 2023 08:26:42 -0500
Subject: [PATCH 5/7] style fixes

---
 utils/data_reader.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/utils/data_reader.py b/utils/data_reader.py
index 5739f1d..3bec987 100644
--- a/utils/data_reader.py
+++ b/utils/data_reader.py
@@ -89,11 +89,11 @@ def add_parser_db_args(parser):
         choices=["full", "preferred", "single"],
         default="full",
         help="Determines the approach for selecting a single address per PATID"
-        " from PRIVATE_ADDRESS_HISTORY. Options: Use \"single\" if "
+        ' from PRIVATE_ADDRESS_HISTORY. Options: Use "single" if '
         "the data is already guaranteed to only contain one address per PATID."
-        " Use \"preferred\" if the database is guaranteed to only contain one "
+        ' Use "preferred" if the database is guaranteed to only contain one '
         "address with address_preferred='Y' per PATID. "
-        "Use \"full\" if the database may contain multiple preferred addresses"
+        'Use "full" if the database may contain multiple preferred addresses'
         " for different dates/types/use. This option will select "
         "the most recent preferred address by start date."
         "Default if not specified is 'full'",
@@ -231,7 +231,7 @@ def get_query(engine, version, args):
             # This simplifies the query to just select ADDRESS_PREFERRED=Y
             query = select([prv_demo, prv_address]).filter(
                 prv_demo.columns.patid == prv_address.columns.patid,
-                prv_address.columns.address_preferred == 'Y',
+                prv_address.columns.address_preferred == "Y",
             )
         else:
             # The user indicated the data may have multiple preferreds,
@@ -267,7 +267,7 @@ def get_query(engine, version, args):
                 select(prv_address.columns.addressid)
                 .filter(
                     prv_address.columns.patid == prv_demo.columns.patid,
-                    prv_address.columns.address_preferred == 'Y'
+                    prv_address.columns.address_preferred == "Y",
                 )
                 .order_by(prv_address.columns.address_preferred.desc())
                 .order_by(addr_period_order)

From 66ca835bc08464277bbe459af0c74d170727fa94 Mon Sep 17 00:00:00 2001
From: Dylan Hall <dehall@mitre.org>
Date: Wed, 1 Mar 2023 08:29:12 -0500
Subject: [PATCH 6/7] remove unused import

---
 data_analysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_analysis.py b/data_analysis.py
index 549b9f4..115fe6d 100644
--- a/data_analysis.py
+++ b/data_analysis.py
@@ -2,7 +2,7 @@
 import json
 import re
 import time
-from datetime import date, datetime
+from datetime import datetime
 
 import pandas as pd
 

From 331356071e4953176efabbc673d1a8bfaa856ca4 Mon Sep 17 00:00:00 2001
From: Dylan Hall <dehall@mitre.org>
Date: Wed, 1 Mar 2023 09:00:05 -0500
Subject: [PATCH 7/7] missing quote in doc

---
 utils/data_reader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/data_reader.py b/utils/data_reader.py
index 3bec987..c523983 100644
--- a/utils/data_reader.py
+++ b/utils/data_reader.py
@@ -52,7 +52,7 @@ def add_parser_db_args(parser):
         default=V2,
         choices=[V1, V2],
         help="Version of the CODI Data Model schema to use. "
-        f'Valid options are "{V1}" or "{V2}". Default is "{V2}',
+        f'Valid options are "{V1}" or "{V2}". Default is "{V2}"',
     )
 
     parser.add_argument(