-
Notifications
You must be signed in to change notification settings - Fork 7
APP-356 Convert POTNTL_DUP_INV_SUM (Potential Duplicate Investigations) #3131
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
60e01c0
4970f48
a8399b9
a62cbfe
c73d401
e7ad140
fdce598
fae8225
fad7088
ba5b29b
b01e674
a7a9b99
1aabe0d
44e6f1c
0679b42
09ffc64
3fab55c
ecd93e6
501788f
f6610c0
f2fd4bb
3349cd9
22764c7
f64f9a9
e375a87
e6359e9
ca04489
7f3f99d
d239bc3
9ba225d
bf6e1a5
688dc51
287be73
3ecaf99
a4a9dce
db9e350
7ad22fe
aee7399
5201f3e
9ea33ba
6c5fa6b
95d38b6
da5bf06
d814b1d
1293fe3
86f46a3
6bb51ec
3a0e74d
955a6b8
756f107
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,44 @@ | ||
-- Migrate the POTNTL_DUP_INV_SUM.SAS library entry to the python runner.
--
-- If the legacy SAS row exists it is retargeted in place; otherwise a fresh
-- row is inserted. The insert is guarded with NOT EXISTS so re-running the
-- script (after the SAS row has already been converted or removed) does not
-- create duplicate 'potntl_dup_inv_sum' rows.

USE [NBS_ODSE]

DECLARE @pyLib VARCHAR(50) = 'potntl_dup_inv_sum'
DECLARE @sasLib VARCHAR(50) = 'POTNTL_DUP_INV_SUM.SAS'
DECLARE @desc VARCHAR(300) = 'Potential Duplicate Investigations - Identifies potential duplicate investigations for the same patient with the
same disease within a user-specified number of days.'

IF EXISTS (SELECT * FROM [dbo].[Report_Library] WHERE UPPER(library_name) = @sasLib)
BEGIN
    -- Legacy SAS entry present: convert it to the python library in place.
    UPDATE [dbo].[Report_Library]
    SET
        library_name = @pyLib,
        runner = 'python',
        desc_txt = @desc,
        last_chg_time = CURRENT_TIMESTAMP,
        last_chg_user_id = 99999999
    WHERE
        UPPER(library_name) = @sasLib;
END
ELSE IF NOT EXISTS (SELECT * FROM [dbo].[Report_Library] WHERE library_name = @pyLib)
BEGIN
    -- No SAS row and no python row yet: create the library entry.
    -- (The NOT EXISTS guard makes the script idempotent; the original
    -- unconditionally inserted here and would duplicate the row on re-run.)
    INSERT INTO [dbo].[Report_Library] (
        library_name,
        desc_txt,
        runner,
        is_builtin_ind,
        add_time,
        add_user_id,
        last_chg_time,
        last_chg_user_id
    ) VALUES (
        @pyLib,
        @desc,
        'python',
        'Y',
        CURRENT_TIMESTAMP,
        99999999,
        CURRENT_TIMESTAMP,
        99999999
    );
END
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,121 @@ | ||
| from src.db_transaction import Transaction | ||
| from src.models import ReportResult | ||
|
|
||
|
|
||
def execute(
    trx: Transaction,
    subset_query: str,
    data_source_name: str,
    days_value: None | int,
    **kwargs,
):
    """Potential Duplicate Investigations report.

    Identifies potential duplicate investigations for the same patient,
    with the same disease, within a user-specified number of days.

    Args:
        trx: Open database transaction used to run the report query.
        subset_query: SQL producing the candidate investigation rows.
        data_source_name: Name of the data source (not used by this report;
            kept for the uniform report-runner signature).
        days_value: Maximum day gap between two events for them to count as
            potential duplicates. ``None`` falls back to 3650 days (~10
            years); ``0`` is a legitimate value (same-day only) and is
            honored as-is.
        **kwargs: Absorbs extra runner-supplied options; ignored here.

    Returns:
        ReportResult with a table of potential duplicate investigations.
    """
    # Only substitute the default when the caller did not supply a value;
    # 0 must be treated as a real threshold, not as "missing".
    if days_value is None:
        days_value = 3650
    # Defensive: the threshold is interpolated into the SQL text below, so
    # force it to an int to rule out injection via a non-integer value.
    days_value = int(days_value)

    full_query = f"""
    WITH subset AS ({subset_query})
    -- Capture SQL Server's physical row order
    , source_order AS (
        SELECT
            *,
            ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS sas_row_num
        FROM subset
    )
    , clean_data AS (
        SELECT
            PATIENT_LOCAL_ID,
            PATIENT_FIRST_NAME,
            PATIENT_LAST_NAME,
            PATIENT_DOB,
            INVESTIGATION_LOCAL_ID,
            DISEASE,
            CASE_STATUS,
            EVENT_DATE,
            EVENT_DATE_TYPE,
            MMWR_YEAR,
            NOTIFICATION_STATUS,
            DISEASE_CD,
            sas_row_num
        FROM source_order
        WHERE EVENT_DATE IS NOT NULL
          AND PATIENT_LOCAL_ID IS NOT NULL
          AND DISEASE_CD IS NOT NULL
    )
    -- Calculate days since previous and until next event
    , datediff_calc AS (
        SELECT
            *,
            DATEDIFF(day,
                LAG(EVENT_DATE) OVER (
                    PARTITION BY
                        PATIENT_LOCAL_ID,
                        DISEASE_CD
                    ORDER BY EVENT_DATE, sas_row_num
                ),
                EVENT_DATE
            ) AS days_since_prev,
            DATEDIFF(day,
                EVENT_DATE,
                LEAD(EVENT_DATE) OVER (
                    PARTITION BY PATIENT_LOCAL_ID,
                        DISEASE_CD
                    ORDER BY EVENT_DATE, sas_row_num
                )
            ) AS days_until_next
        FROM clean_data
    )
    -- Count events for each patient and disease to identify potential duplicates
    , event_counts AS (
        SELECT
            PATIENT_LOCAL_ID,
            DISEASE_CD,
            COUNT(*) AS event_count
        FROM clean_data
        GROUP BY PATIENT_LOCAL_ID, DISEASE_CD
    )
    -- Final selection of potential duplicates based on days thresholds
    SELECT
        d.PATIENT_LOCAL_ID AS [Patient Local ID],
        d.PATIENT_FIRST_NAME AS [Patient First Name],
        d.PATIENT_LAST_NAME AS [Patient Last Name],
        d.PATIENT_DOB AS DOB,
        d.INVESTIGATION_LOCAL_ID AS [Investigation Local ID],
        d.DISEASE AS Disease,
        d.CASE_STATUS AS [Case Status],
        d.EVENT_DATE AS [Event Date],
        d.EVENT_DATE_TYPE AS [Event Date Type],
        d.MMWR_YEAR AS [MMWR Year],
        d.NOTIFICATION_STATUS AS [Notification Record Status],
        d.DISEASE_CD AS [Disease Code]
    FROM datediff_calc d
    JOIN event_counts c
        ON d.PATIENT_LOCAL_ID = c.PATIENT_LOCAL_ID
        AND d.DISEASE_CD = c.DISEASE_CD
    WHERE c.event_count > 1
      AND (
            (d.days_since_prev IS NOT NULL AND d.days_since_prev <= {days_value})
         OR (d.days_until_next IS NOT NULL AND d.days_until_next <= {days_value})
      )
    ORDER BY
        d.PATIENT_LOCAL_ID COLLATE Latin1_General_BIN,
        d.DISEASE_CD COLLATE Latin1_General_BIN,
        d.EVENT_DATE,
        d.sas_row_num
    """

    content = trx.query(full_query)

    header = 'Potential Duplicate Investigations'
    subheader = f'Duplicate Investigations Time Frame: {days_value} Days'

    return ReportResult(
        content_type='table', content=content, header=header, subheader=subheader
    )
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ class ReportSpec(BaseModel): | |
| library_name: str = Field(min_length=1) | ||
| data_source_name: str = Field(min_length=1) | ||
| subset_query: str = Field(min_length=1) | ||
| days_value: int | None = None # Specific to potntl_dup_inv_sum | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. (thought, nb): Probably not worth addressing at this moment, but if there end up being multiple reports that require unique properties like this, maybe we end up sketching out a
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, definitely a good idea to revisit as we work through more translations! |
||
|
|
||
|
|
||
| # column names and values | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -153,6 +153,7 @@ def get_faker_sql(schema_name: str) -> str: | |||
|
|
||||
| # KLUDGE: NULL writing is not always correct | ||||
| result = result.replace(' nan,', ' NULL,') | ||||
| result = result.replace('nan', ' NULL') | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. (q, nb): why did we need to add this one? there's a risk that a valid part of a string with |
||||
| result = result.replace(' nan)', ' NULL)') | ||||
| result = result.replace(' <NA>,', ' NULL,') | ||||
| result = result.replace(' <NA>)', ' NULL)') | ||||
|
|
@@ -167,7 +168,7 @@ def get_tables_from_faker(schema_name: str) -> tuple[list[str], list[str]]: | |||
| schema = yaml.safe_load(f.read()) | ||||
|
|
||||
| db_tables = [t['table_name'] for t in schema['tables']] | ||||
| fk_tables = schema['config']['nbs']['fk_tables'] | ||||
| fk_tables = schema['config'].get('nbs', {}).get('fk_tables', []) | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (q, nb): What's this change for?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so if there are no fk tables specified (not always relevant), then we default to empty list |
||||
|
|
||||
| return (db_tables, fk_tables) | ||||
|
|
||||
|
|
@@ -234,6 +235,7 @@ def insert_fake_data( | |||
| with db_transaction(conn_string) as trx: | ||||
| # Tables with foreign keys pointing to the table we want to replace need to | ||||
| # be backed up and cleared out to avoid FK constraint violations | ||||
|
|
||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||
| for fk_table in fk_tables: | ||||
| temp_fk_table = temp_name(fk_table) | ||||
| trx.execute( | ||||
|
|
||||
Uh oh!
There was an error while loading. Please reload this page.