From 8e98121f899f551dc65fcc585f84e2668b6e9776 Mon Sep 17 00:00:00 2001
From: billmills <you@example.com>
Date: Fri, 6 Jan 2023 20:19:04 -0500
Subject: [PATCH] first pass at iquod flagging script

---
 iquod_flags.py | 119 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100644 iquod_flags.py

diff --git a/iquod_flags.py b/iquod_flags.py
new file mode 100644
index 0000000..3579609
--- /dev/null
+++ b/iquod_flags.py
@@ -0,0 +1,119 @@
+import numpy, pandas, sqlite3, math
+import util.main as main
+import util.dbutils as dbutils
+
+def parse(results):
+    'lifted from dbutils without the summary'
+
+    return results.apply(dbutils.unpack_qc)
+
+def db_to_df_simplified(table,
+             n_to_extract=numpy.iinfo(numpy.int32).max,
+             targetdb='data/demo.db',
+             batchsize=1000):
+
+    '''
+    simplified version of dbutils db_to_df
+    Reads the table from targetdb into a pandas dataframe.
+    Set n_to_extract to limit the number of rows extracted to the specified number.
+    '''
+
+    # what tests are available
+    testNames = main.importQC('qctests')
+    testNames.sort()
+
+    # connect to database
+    conn = sqlite3.connect(targetdb, isolation_level=None)
+    cur = conn.cursor()
+
+    # extract matrix of test results and true flags into a dataframe
+    query = 'SELECT uid, truth'
+    for test in testNames:
+        query += ', ' + test.lower()
+    query += ' , probe FROM ' + table   
+    query += ' WHERE uid IN (SELECT uid FROM ' + table + ' ORDER BY RANDOM() LIMIT ' + str(n_to_extract) + ')' 
+
+    cur.execute(query)
+    rawresults = cur.fetchall()
+    
+    # Loop over the profiles, 1000 profiles at a time.
+    sub  = batchsize # Number of profiles to process at a time.
+    nsub = math.ceil(len(rawresults)/sub) # Number of batches of 1000 profiles there will be.
+    df_final = None
+    testNamesSave = testNames.copy()
+    for i in range(nsub):
+        # Define the start and end points of this batch of profiles and create a dataframe from them.
+        istart = i * sub
+        iend   = min((i + 1) * sub, len(rawresults))
+        df = pandas.DataFrame(rawresults[istart:iend]).astype('bytes')
+        df.columns = ['uid', 'Truth'] + testNamesSave + ['probe']
+        df = df.astype({'uid': 'int'})
+        df = df.astype({'probe': 'int'})
+         
+        testNames = df.columns[2:-1].values.tolist()
+        df[['Truth']] = df[['Truth']].apply(parse)
+        df[testNames] = df[testNames].apply(parse)
+
+        # Keep the results.
+        if i == 0:
+            df_final = df
+        else:
+            df_final = pandas.concat([df_final, df])
+
+    return df_final.reset_index(drop=True)
+
+def combotests(row, qctests):
+    '''
+    given a row from the dataframe returned by db_to_df_simplified
+    and a list qctests of strings matching df qc column names to be ORed to generate a flag,
+    return a list indicating if any of the provided qc tests flagged the corresponding level
+    '''
+
+    nLevels = len(row['Truth'])
+    levelFlags = [False]*nLevels
+
+    for i in range(nLevels):
+        testresults = [row[test][i] for test in qctests]
+        levelFlags[i] = any(testresults)
+
+    return levelFlags
+
+def genflag(HTPRresults, Compresults, LFPRresults, isXBT):
+    '''
+    given per-level lists of results for each of HTPR, Comp and LFPR cases,
+    assess the appropriate IQuOD flag per our paper's prescription
+    if isXBT, flags should be monotonically increasing with depth, meaning higher flags propagate to all deeper levels
+    '''
+
+    flag = [1]*len(HTPRresults)
+    minFlag = 1
+    for i in range(len(HTPRresults)):
+        flag[i] = minFlag
+        if HTPRresults[i]:
+            flag[i] = max(2, minFlag)
+            if isXBT:
+                minFlag = max(2, minFlag)
+       	if Compresults[i]:
+       	    flag[i] = max(3, minFlag)
+       	    if isXBT:
+       	       	minFlag	= max(3, minFlag)
+       	if LFPRresults[i]:
+       	    flag[i] = max(4, minFlag)
+       	    if isXBT:
+       	       	minFlag	= max(4, minFlag)
+
+    return flag
+
+    
+df = db_to_df_simplified('iquod')
+
+HTPR = ['Argo_impossible_date_test', 'Argo_impossible_location_test', 'IQUOD_bottom', 'ICDC_aqc_01_level_order', 'CSIRO_wire_break', 'Argo_global_range_check', 'ICDC_aqc_09_local_climatology_check', 'CoTeDe_GTSPP_WOA_normbias', 'EN_std_lev_bkg_and_buddy_check', 'CSIRO_constant_bottom', 'ICDC_aqc_06_n_temperature_extrema', 'CoTeDe_tukey53H', 'AOML_spike', 'CSIRO_long_gradient', 'ICDC_aqc_08_gradient_check', 'CoTeDe_anomaly_detection', 'EN_background_available_check', 'CSIRO_depth', 'IQuOD_gross_range_check', 'EN_range_check', 'ICDC_aqc_10_local_climatology_check', 'AOML_climatology_test', 'EN_constant_value_check', 'AOML_constant', 'Argo_spike_test', 'ICDC_aqc_07_spike_check', 'EN_spike_and_step_suspect', 'AOML_gradient', 'CSIRO_short_gradient']
+Comp = ['Argo_impossible_date_test', 'Argo_impossible_location_test', 'EN_background_available_check', 'ICDC_aqc_01_level_order', 'CSIRO_depth', 'IQuOD_gross_range_check', 'WOD_range_check', 'AOML_climatology_test',  'CoTeDe_GTSPP_WOA_normbias', 'EN_increasing_depth_check', 'EN_constant_value_check', 'EN_spike_and_step_check',  'CSIRO_long_gradient', 'ICDC_aqc_08_gradient_check', 'EN_stability_check']
+LFPR = ['Argo_impossible_date_test', 'Argo_impossible_location_test', 'loose_location_at_sea', 'ICDC_aqc_01_level_order', 'IQuOD_gross_range_check', 'WOD_range_check', 'ICDC_aqc_02_crude_range', 'EN_background_check', 'EN_std_lev_bkg_and_buddy_check', 'EN_increasing_depth_check', 'ICDC_aqc_05_stuck_value', 'EN_spike_and_step_check', 'CSIRO_long_gradient', 'EN_stability_check']
+
+for i in range(len(df)):
+    HTPRflags = combotests(df.iloc[i], HTPR)
+    Compflags = combotests(df.iloc[i], Comp)
+    LFPRflags = combotests(df.iloc[i], LFPR)
+    iquodFlag = genflag(HTPRflags, Compflags, LFPRflags, df['probe'][i]==2)
+    print(iquodFlag)