diff --git a/harmonize_wq/clean.py b/harmonize_wq/clean.py index 380e55a..c590c71 100644 --- a/harmonize_wq/clean.py +++ b/harmonize_wq/clean.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- """Functions to clean/correct additional columns in subset/entire dataset.""" -#from warnings import warn +# from warnings import warn import dataretrieval.utils from numpy import nan from harmonize_wq.convert import convert_unit_series from harmonize_wq.domains import accepted_methods -#from harmonize_wq.wrangle import add_activities_to_df +# from harmonize_wq.wrangle import add_activities_to_df def datetime(df_in): @@ -46,18 +46,20 @@ def datetime(df_in): [2 rows x 4 columns] """ # Expected columns - date, time, tz = ('ActivityStartDate', - 'ActivityStartTime/Time', - 'ActivityStartTime/TimeZoneCode') + date, time, tz = ( + "ActivityStartDate", + "ActivityStartTime/Time", + "ActivityStartTime/TimeZoneCode", + ) df_out = df_in.copy() # NOTE: even if date, if time is NA datetime is NaT df_out = dataretrieval.utils.format_datetime(df_out, date, time, tz) - df_out = df_out.rename(columns={'datetime': 'Activity_datetime'}) + df_out = df_out.rename(columns={"datetime": "Activity_datetime"}) return df_out -def harmonize_depth(df_in, units='meter'): +def harmonize_depth(df_in, units="meter"): """Create 'Depth' column with result depth values in consistent units. New column combines values from the 'ResultDepthHeightMeasure/MeasureValue' column @@ -107,16 +109,18 @@ def harmonize_depth(df_in, units='meter'): """ df_out = df_in.copy() # Default columns - meas_col = 'ResultDepthHeightMeasure/MeasureValue' - unit_col = 'ResultDepthHeightMeasure/MeasureUnitCode' + meas_col = "ResultDepthHeightMeasure/MeasureValue" + unit_col = "ResultDepthHeightMeasure/MeasureUnitCode" # Note: there are also 'Activity' cols for both of these & top/bottom depth df_checks(df_out, [meas_col, unit_col]) # Confirm columns in df na_mask = df_out[meas_col].notna() # Mask NA to speed up processing # TODO: if units missing? - params = {'quantity_series': df_out.loc[na_mask, meas_col], - 'unit_series': df_out.loc[na_mask, unit_col], - 'units': units, } + params = { + "quantity_series": df_out.loc[na_mask, meas_col], + "unit_series": df_out.loc[na_mask, unit_col], + "units": units, + } df_out.loc[na_mask, "Depth"] = convert_unit_series(**params) # TODO: where result depth is missing use activity depth? @@ -160,11 +164,13 @@ def df_checks(df_in, columns=None): """ if columns is None: # Assign defaults - columns = ('ResultMeasure/MeasureUnitCode', - 'ResultMeasureValue', - 'CharacteristicName') + columns = ( + "ResultMeasure/MeasureUnitCode", + "ResultMeasureValue", + "CharacteristicName", + ) for col in columns: - assert col in df_in.columns, f'{col} not in DataFrame' + assert col in df_in.columns, f"{col} not in DataFrame" def check_precision(df_in, col, limit=3): @@ -191,8 +197,8 @@ def check_precision(df_in, col, limit=3): """ df_out = df_in.copy() # Create T/F mask based on len of everything after the decimal - c_mask = [len(str(x).split('.')[1]) < limit for x in df_out[col]] - flag = f'{col}: Imprecise: lessthan{limit}decimaldigits' + c_mask = [len(str(x).split(".")[1]) < limit for x in df_out[col]] + flag = f"{col}: Imprecise: lessthan{limit}decimaldigits" df_out = add_qa_flag(df_out, c_mask, flag) # Assign flags return df_out @@ -224,11 +230,11 @@ def methods_check(df_in, char_val, methods=None): """ if methods is None: methods = accepted_methods - method_col = 'ResultAnalyticalMethod/MethodIdentifier' + method_col = "ResultAnalyticalMethod/MethodIdentifier" df2 = df_in.copy() # TODO: check df for method_col - char_mask = df2['CharacteristicName'] == char_val - methods = [item['Method'] for item in methods[char_val]] + char_mask = df2["CharacteristicName"] == char_val + methods = [item["Method"] for item in methods[char_val]] methods_used = list(set(df2.loc[char_mask, method_col].dropna())) accept = [method for method in methods_used if method in methods] # reject = [method for method in methods_used if method not in methods] @@ -261,24 +267,24 @@ def wet_dry_checks(df_in, mask=None): """ df_out = df_in.copy() - media_col = 'ActivityMediaName' + media_col = "ActivityMediaName" # Check columns are in df - df_checks(df_out, [media_col, - 'ResultSampleFractionText', - 'ResultWeightBasisText']) + df_checks(df_out, [media_col, "ResultSampleFractionText", "ResultWeightBasisText"]) # QA - Sample Media, fix assigned 'Water' that are actually 'Sediment' - qa_flag = f'{media_col}: Water changed to Sediment' + qa_flag = f"{media_col}: Water changed to Sediment" # Create mask for bad data - media_mask = ((df_out['ResultSampleFractionText'] == 'Bed Sediment') & - (df_out['ResultWeightBasisText'] == 'Dry') & - (df_out['ActivityMediaName'] == 'Water')) + media_mask = ( + (df_out["ResultSampleFractionText"] == "Bed Sediment") + & (df_out["ResultWeightBasisText"] == "Dry") + & (df_out["ActivityMediaName"] == "Water") + ) # Use mask if user specified, else run on all rows if mask: media_mask = mask & (media_mask) # Assign QA flag where data was bad df_out = add_qa_flag(df_out, media_mask, qa_flag) # Fix the data - df_out.loc[media_mask, 'ActivityMediaName'] = 'Sediment' + df_out.loc[media_mask, "ActivityMediaName"] = "Sediment" return df_out @@ -326,21 +332,20 @@ def add_qa_flag(df_in, mask, flag): 2 Carbon 2.1 words """ df_out = df_in.copy() - if 'QA_flag' not in list(df_out.columns): - df_out['QA_flag'] = nan + if "QA_flag" not in list(df_out.columns): + df_out["QA_flag"] = nan # Append flag where QA_flag is not nan - cond_notna = mask & (df_out['QA_flag'].notna()) # Mask cond and not NA - existing_flags = df_out.loc[cond_notna, 'QA_flag'] # Current QA flags - df_out.loc[cond_notna, 'QA_flag'] = [f'{txt}; {flag}' for - txt in existing_flags] + cond_notna = mask & (df_out["QA_flag"].notna()) # Mask cond and not NA + existing_flags = df_out.loc[cond_notna, "QA_flag"] # Current QA flags + df_out.loc[cond_notna, "QA_flag"] = [f"{txt}; {flag}" for txt in existing_flags] # Equals flag where QA_flag is nan - df_out.loc[mask & (df_out['QA_flag'].isna()), 'QA_flag'] = flag + df_out.loc[mask & (df_out["QA_flag"].isna()), "QA_flag"] = flag return df_out -def wet_dry_drop(df_in, wet_dry='wet', char_val=None): +def wet_dry_drop(df_in, wet_dry="wet", char_val=None): """Restrict to only water or only sediment samples. Parameters @@ -360,34 +365,34 @@ def wet_dry_drop(df_in, wet_dry='wet', char_val=None): df2 = df_in.copy() if char_val: # Set characteristic mask - c_mask = df2['CharacteristicName'] == char_val + c_mask = df2["CharacteristicName"] == char_val # Adding activities fails on len(df)==0, a do-nothing, end it early if len(df2[c_mask]) == 0: return df2 # Set variables for columns and check they're in df - media_col = 'ActivityMediaName' -# try: + media_col = "ActivityMediaName" + # try: df_checks(df2, media_col) -# except AssertionError: -# warn(f'Warning: {media_col} missing, querying from activities...') - # Try query/join -# if char_val: -# df2 = add_activities_to_df(df2, c_mask) -# else: -# df2 = add_activities_to_df(df2) # no mask, runs on all -# df_checks(df2, [media_col]) # Check it's been added - # if ERROR? - # print('Query and join activities first') + # except AssertionError: + # warn(f'Warning: {media_col} missing, querying from activities...') + # Try query/join + # if char_val: + # df2 = add_activities_to_df(df2, c_mask) + # else: + # df2 = add_activities_to_df(df2) # no mask, runs on all + # df_checks(df2, [media_col]) # Check it's been added + # if ERROR? + # print('Query and join activities first') # Fix wet/dry columns df2 = wet_dry_checks(df2) # Changed from df_in? # Filter wet/dry rows - if wet_dry == 'wet': - media_mask = df2[media_col] == 'Water' - elif wet_dry == 'dry': - media_mask = df2[media_col] == 'Sediment' + if wet_dry == "wet": + media_mask = df2[media_col] == "Water" + elif wet_dry == "dry": + media_mask = df2[media_col] == "Sediment" # Filter characteristic rows if char_val: diff --git a/harmonize_wq/convert.py b/harmonize_wq/convert.py index 35d69a3..7a37031 100644 --- a/harmonize_wq/convert.py +++ b/harmonize_wq/convert.py @@ -13,31 +13,32 @@ from harmonize_wq.domains import registry_adds_list # TODO: does this constant belong here or in domains? -PERIODIC_MW = {'Organic carbon': 180.16, - 'C6H12O6': 180.16, - 'Phosphorus': 30.97, - 'P': 30.97, - 'PO4': 94.97, - 'Nitrogen': 14.01, - 'N': 14.01, - 'NO3': 62.01, - 'NO2': 46.01, - 'NH4': 18.04, - 'NH3': 17.03, - 'SiO3': 76.08, - } +PERIODIC_MW = { + "Organic carbon": 180.16, + "C6H12O6": 180.16, + "Phosphorus": 30.97, + "P": 30.97, + "PO4": 94.97, + "Nitrogen": 14.01, + "N": 14.01, + "NO3": 62.01, + "NO2": 46.01, + "NH4": 18.04, + "NH3": 17.03, + "SiO3": 76.08, +} # Molecular weight assumptions: Organic carbon = C6H12O6 # NOTE: for a more complete handling of MW: CalebBell/chemicals u_reg = pint.UnitRegistry() # For use in wrappers # TODO: find more elegant way to do this with all definitions -for definition in registry_adds_list('Turbidity'): +for definition in registry_adds_list("Turbidity"): u_reg.define(definition) -for definition in registry_adds_list('Salinity'): +for definition in registry_adds_list("Salinity"): u_reg.define(definition) -#timeit: 159.17 +# timeit: 159.17 # def convert_unit_series(quantity_series, unit_series, units, ureg=None): # # Convert quantities to float if they aren't already (should be) # if quantity_series.dtype=='O': @@ -53,8 +54,8 @@ # out_list = [val.to(ureg(units)) for val in val_list] # # Re-index to return series # return pandas.Series(out_list, index=quantity_series.index) -#timeit: 27.08 -def convert_unit_series(quantity_series, unit_series, units, ureg=None, errors='raise'): +# timeit: 27.08 +def convert_unit_series(quantity_series, unit_series, units, ureg=None, errors="raise"): """Convert quantities to consistent units. Convert list of quantities (quantity_list), each with a specified old unit, @@ -98,18 +99,18 @@ def convert_unit_series(quantity_series, unit_series, units, ureg=None, errors=' 1 10000.000000000002 milligram / liter dtype: object """ - if quantity_series.dtype=='O': + if quantity_series.dtype == "O": quantity_series = pandas.to_numeric(quantity_series) # Initialize classes from pint if ureg is None: ureg = pint.UnitRegistry() Q_ = ureg.Quantity - lst_series = [pandas.Series(dtype='object')] + lst_series = [pandas.Series(dtype="object")] # Note: set of series does not preservce order and must be sorted at end for unit in list(set(unit_series)): # Filter quantity_series by unit_series where == unit - f_quant_series = quantity_series.where(unit_series==unit).dropna() + f_quant_series = quantity_series.where(unit_series == unit).dropna() unit_ = ureg(unit) # Set unit once per unit result_list = [Q_(q, unit_) for q in f_quant_series] if unit != units: @@ -117,10 +118,10 @@ def convert_unit_series(quantity_series, unit_series, units, ureg=None, errors=' try: result_list = [val.to(ureg(units)) for val in result_list] except pint.DimensionalityError as exception: - if errors=='skip': + if errors == "skip": # do nothing, leave result_list unconverted warn(f"WARNING: '{unit}' not converted") - elif errors=='ignore': + elif errors == "ignore": # convert to NaN result_list = [nan for val in result_list] warn(f"WARNING: '{unit}' converted to NaN") @@ -166,7 +167,7 @@ def mass_to_moles(ureg, char_val, Q_): """ # TODO: Not used yet m_w = PERIODIC_MW[char_val] - return Q_.to('moles', 'chemistry', mw=m_w * ureg('g/mol')) + return Q_.to("moles", "chemistry", mw=m_w * ureg("g/mol")) def moles_to_mass(ureg, Q_, basis=None, char_val=None): @@ -210,14 +211,14 @@ def moles_to_mass(ureg, Q_, basis=None, char_val=None): if basis: # Clean-up basis # print(basis) - if basis.startswith('as '): + if basis.startswith("as "): basis = basis[3:] m_w = PERIODIC_MW[basis] elif char_val: m_w = PERIODIC_MW[char_val] else: raise ValueError("Characteristic Name or basis (Speciation) required") - return Q_.to('g', 'chemistry', mw=m_w / ureg('mol/g')) + return Q_.to("g", "chemistry", mw=m_w / ureg("mol/g")) @u_reg.wraps(u_reg.NTU, u_reg.centimeter) @@ -363,7 +364,7 @@ def JTU_to_NTU(val): # from Maceina, M. J., & Soballe, D. M. (1990). # Wind-related limnological variation in Lake Okeechobee, Florida. # Lake and Reservoir Management, 6(1), 93-100. - return 19.025*val - 0.0477 + return 19.025 * val - 0.0477 @u_reg.wraps(u_reg.NTU, u_reg.dimensionless) @@ -439,12 +440,13 @@ def FNU_to_NTU(val): return val * 1.267 -@u_reg.wraps(u_reg.gram/u_reg.kilogram, (u_reg.gram/u_reg.liter, - u_reg.standard_atmosphere, - u_reg.degree_Celsius)) -def density_to_PSU(val, - pressure=1*u_reg("atm"), - temperature=u_reg.Quantity(25, u_reg("degC"))): +@u_reg.wraps( + u_reg.gram / u_reg.kilogram, + (u_reg.gram / u_reg.liter, u_reg.standard_atmosphere, u_reg.degree_Celsius), +) +def density_to_PSU( + val, pressure=1 * u_reg("atm"), temperature=u_reg.Quantity(25, u_reg("degC")) +): """Convert salinity as density (mass/volume) to Practical Salinity Units. Parameters @@ -483,24 +485,25 @@ def density_to_PSU(val, """ # Standard Reference Value - ref = 35.16504/35.0 + ref = 35.16504 / 35.0 # density of pure water is ~1000 mg/mL if val > 1000: - PSU = (float(val)*ref)-1000 + PSU = (float(val) * ref) - 1000 else: - PSU = ((float(val)+1000)*ref)-1000 + PSU = ((float(val) + 1000) * ref) - 1000 # print('{} mg/ml == {} ppth'.format(val, PSU)) # multiply by 33.45 @26C, 33.44 @25C return PSU -@u_reg.wraps(u_reg.milligram/u_reg.milliliter, (u_reg.dimensionless, - u_reg.standard_atmosphere, - u_reg.degree_Celsius)) -def PSU_to_density(val, - pressure=1*u_reg("atm"), - temperature=u_reg.Quantity(25, u_reg("degC"))): +@u_reg.wraps( + u_reg.milligram / u_reg.milliliter, + (u_reg.dimensionless, u_reg.standard_atmosphere, u_reg.degree_Celsius), +) +def PSU_to_density( + val, pressure=1 * u_reg("atm"), temperature=u_reg.Quantity(25, u_reg("degC")) +): """Convert salinity as Practical Salinity Units (PSU) to density. Dimensionality changes from dimensionless Practical Salinity Units (PSU) to @@ -554,22 +557,29 @@ def PSU_to_density(val, _p, t = pressure, temperature # Pure water density (see SMOW, Craig 1961) - x = [999.842594, - 6.793952e-2 * t, - -9.095290e-3 * t**2, - 1.001685e-4 * t**3, - -1.120083e-6 * t**4, - 6.536336e-9 * t**5] + x = [ + 999.842594, + 6.793952e-2 * t, + -9.095290e-3 * t**2, + 1.001685e-4 * t**3, + -1.120083e-6 * t**4, + 6.536336e-9 * t**5, + ] pure_water = sum(x) # Constants - a0 = [-4.0899e-3*t, 7.6438e-5*(t**2), -8.2467e-7*(t**3), 5.3875e-9*(t**4)] + a0 = [ + -4.0899e-3 * t, + 7.6438e-5 * (t**2), + -8.2467e-7 * (t**3), + 5.3875e-9 * (t**4), + ] a = 8.24493e-1 + sum(a0) - b0 = [-5.72466e-3, 1.0227e-4*t, -1.6546e-6*(t**2)] + b0 = [-5.72466e-3, 1.0227e-4 * t, -1.6546e-6 * (t**2)] b = sum(b0) - density = pure_water + a*val + b*(val**(3/2)) + 4.8314e-4*(val**2) + density = pure_water + a * val + b * (val ** (3 / 2)) + 4.8314e-4 * (val**2) # # UNESCO 1983 Eqn.(13) p17. @@ -586,12 +596,13 @@ def PSU_to_density(val, return density -@u_reg.wraps(u_reg.milligram/u_reg.liter, (None, - u_reg.standard_atmosphere, - u_reg.degree_Celsius)) -def DO_saturation(val, - pressure=1*u_reg("atm"), - temperature=u_reg.Quantity(25, u_reg("degC"))): +@u_reg.wraps( + u_reg.milligram / u_reg.liter, + (None, u_reg.standard_atmosphere, u_reg.degree_Celsius), +) +def DO_saturation( + val, pressure=1 * u_reg("atm"), temperature=u_reg.Quantity(25, u_reg("degC")) +): """Convert Dissolved Oxygen (DO) from saturation (%) to concentration (mg/l). Defaults assume STP where pressure is 1 atmosphere and temperature 25C. @@ -625,15 +636,16 @@ def DO_saturation(val, cP = 8.262332418 else: cP = _DO_concentration_eq(p, t) - return float(val)/100 * cP # Divide by 100? + return float(val) / 100 * cP # Divide by 100? -@u_reg.wraps(None, (u_reg.milligram/u_reg.liter, - u_reg.standard_atmosphere, - u_reg.degree_Celsius)) -def DO_concentration(val, - pressure=1*u_reg("atm"), - temperature=u_reg.Quantity(25, u_reg("degC"))): +@u_reg.wraps( + None, + (u_reg.milligram / u_reg.liter, u_reg.standard_atmosphere, u_reg.degree_Celsius), +) +def DO_concentration( + val, pressure=1 * u_reg("atm"), temperature=u_reg.Quantity(25, u_reg("degC")) +): """Convert Dissolved Oxygen (DO) from concentration (mg/l) to saturation (%). Parameters @@ -665,32 +677,37 @@ def DO_concentration(val, cP = 8.262332418 else: cP = _DO_concentration_eq(p, t) - return 100*val /cP + return 100 * val / cP def _DO_concentration_eq(p, t): - """ Equilibrium oxygen concentration at non-standard""" + """Equilibrium oxygen concentration at non-standard""" # https://www.waterontheweb.org/under/waterquality/oxygen.html#:~: # text=Oxygen%20saturation%20is%20calculated%20as, # concentration%20at%20100%25%20saturation%20decreases. tk = t + 273.15 # t in kelvin (t is in C) - standard = 0.000975 - (1.426e-05*t) + (6.436e-08*(t**2)) # Theta + standard = 0.000975 - (1.426e-05 * t) + (6.436e-08 * (t**2)) # Theta # partial pressure of water vapor, atm - Pwv = math.exp(11.8571 - (3840.7/tk) - (216961/(tk**2))) + Pwv = math.exp(11.8571 - (3840.7 / tk) - (216961 / (tk**2))) # equilibrium oxygen concentration at std pres of 1 atm cStar = math.exp(7.7117 - 1.31403 * math.log(t + 45.93)) - numerator = (1-Pwv/p)*(1-(standard*p)) - denominator = (1-Pwv)*(1-standard) - - return cStar*p*(numerator/denominator) - - -@u_reg.wraps(u_reg.dimensionless, (u_reg.microsiemens / u_reg.centimeter, - u_reg.standard_atmosphere, - u_reg.degree_Celsius)) -def conductivity_to_PSU(val, - pressure=0*u_reg("atm"), - temperature=u_reg.Quantity(25, u_reg("degC"))): + numerator = (1 - Pwv / p) * (1 - (standard * p)) + denominator = (1 - Pwv) * (1 - standard) + + return cStar * p * (numerator / denominator) + + +@u_reg.wraps( + u_reg.dimensionless, + ( + u_reg.microsiemens / u_reg.centimeter, + u_reg.standard_atmosphere, + u_reg.degree_Celsius, + ), +) +def conductivity_to_PSU( + val, pressure=0 * u_reg("atm"), temperature=u_reg.Quantity(25, u_reg("degC")) +): """Estimate salinity (PSU) from conductivity. Parameters @@ -761,22 +778,35 @@ def conductivity_to_PSU(val, # Csw = 42.914 K = 0.0162 Ct = round(val * (1 + 0.0191 * (t - 25)), 0) - R = (Ct/1000)/42.914 + R = (Ct / 1000) / 42.914 # Was rt c = c[0] + (c[1] * t) + (c[2] * t**2) + (c[3] * t**3) + (c[4] * t**4) - Rp = (1 + (p * e[0] + e[1] * p**2 + e[2] * p**3) / - (1 + D[0] * t + D[1] * t**2 + (D[2] + D[3] * t) * R)) - Rt1 = R/(Rp * c) - dS = ((b[0] + b[1] * Rt1**(1/2) + - b[2] * Rt1**(2/2) + - b[3] * Rt1**(3/2) + - b[4] * Rt1**(4/2) + - b[5] * Rt1**(5/2)) * - (t - 15)/(1 + K * (t - 15))) - S = (a[0] + a[1] * Rt1**(1/2) + - a[2] * Rt1**(2/2) + a[3] * Rt1**(3/2) + - a[4] * Rt1**(4/2) + a[5] * Rt1**(5/2) + dS) + Rp = 1 + (p * e[0] + e[1] * p**2 + e[2] * p**3) / ( + 1 + D[0] * t + D[1] * t**2 + (D[2] + D[3] * t) * R + ) + Rt1 = R / (Rp * c) + dS = ( + ( + b[0] + + b[1] * Rt1 ** (1 / 2) + + b[2] * Rt1 ** (2 / 2) + + b[3] * Rt1 ** (3 / 2) + + b[4] * Rt1 ** (4 / 2) + + b[5] * Rt1 ** (5 / 2) + ) + * (t - 15) + / (1 + K * (t - 15)) + ) + S = ( + a[0] + + a[1] * Rt1 ** (1 / 2) + + a[2] * Rt1 ** (2 / 2) + + a[3] * Rt1 ** (3 / 2) + + a[4] * Rt1 ** (4 / 2) + + a[5] * Rt1 ** (5 / 2) + + dS + ) # TODO: implement these two lines? Shouldn't encounter NaN. # S[is.na(S<0)]<-NA # if <0 or NA set as nan diff --git a/harmonize_wq/location.py b/harmonize_wq/location.py index 2cd4f4e..719e58e 100644 --- a/harmonize_wq/location.py +++ b/harmonize_wq/location.py @@ -11,11 +11,13 @@ from harmonize_wq.wrangle import clip_stations -def infer_CRS(df_in, - out_EPSG, - out_col='EPSG', - bad_crs_val=None, - crs_col='HorizontalCoordinateReferenceSystemDatumName'): +def infer_CRS( + df_in, + out_EPSG, + out_col="EPSG", + bad_crs_val=None, + crs_col="HorizontalCoordinateReferenceSystemDatumName", +): """Replace missing or unrecognized Coordinate Reference System (CRS). Replaces with desired CRS and notes it was missing in 'QA_flag' column. @@ -71,11 +73,11 @@ def infer_CRS(df_in, df_out = df_in.copy() if bad_crs_val: # QA flag for bad CRS based on bad_crs_val - flag = f'{crs_col}: Bad datum {bad_crs_val}, EPSG:{out_EPSG} assumed' + flag = f"{crs_col}: Bad datum {bad_crs_val}, EPSG:{out_EPSG} assumed" c_mask = df_out[crs_col] == bad_crs_val # Mask for bad CRS value else: # QA flag for missing CRS - flag = f'{crs_col}: MISSING datum, EPSG:{out_EPSG} assumed' + flag = f"{crs_col}: MISSING datum, EPSG:{out_EPSG} assumed" c_mask = df_out[crs_col].isna() # Mask for missing units df_out = add_qa_flag(df_out, c_mask, flag) # Assign flag df_out.loc[c_mask, out_col] = out_EPSG # Update with infered unit @@ -83,8 +85,7 @@ def infer_CRS(df_in, return df_out -def harmonize_locations(df_in, out_EPSG=4326, - intermediate_columns=False, **kwargs): +def harmonize_locations(df_in, out_EPSG=4326, intermediate_columns=False, **kwargs): """Create harmonized geopandas GeoDataframe from pandas DataFrame. Takes a :class:`~pandas.DataFrame` with lat/lon in multiple Coordinate @@ -124,16 +125,14 @@ def harmonize_locations(df_in, out_EPSG=4326, -------- Build pandas DataFrame to use in example: - >>> df_in = pandas.DataFrame({'LatitudeMeasure': [27.5950355, - ... 27.52183, - ... 28.0661111], - ... 'LongitudeMeasure': [-82.0300865, - ... -82.64476, - ... -82.3775], - ... 'HorizontalCoordinateReferenceSystemDatumName': ['NAD83', - ... 'WGS84', - ... 'NAD27'], - ... }) + >>> df_in = pandas.DataFrame( + ... { + ... "LatitudeMeasure": [27.5950355, 27.52183, 28.0661111], + ... "LongitudeMeasure": [-82.0300865, -82.64476, -82.3775], + ... "HorizontalCoordinateReferenceSystemDatumName": + ... ["NAD83", "WGS84", "NAD27"], + ... } + ... ) >>> df_in LatitudeMeasure ... HorizontalCoordinateReferenceSystemDatumName 0 27.595036 ... NAD83 @@ -154,10 +153,9 @@ def harmonize_locations(df_in, out_EPSG=4326, df2 = df_in.copy() # Default columns - crs_col = kwargs.get('crs_col', - "HorizontalCoordinateReferenceSystemDatumName") - lat_col = kwargs.get('lat_col', 'LatitudeMeasure') - lon_col = kwargs.get('lon_col', 'LongitudeMeasure') + crs_col = kwargs.get("crs_col", "HorizontalCoordinateReferenceSystemDatumName") + lat_col = kwargs.get("lat_col", "LatitudeMeasure") + lon_col = kwargs.get("lon_col", "LongitudeMeasure") # Check columns are in df df_checks(df2, [crs_col, lat_col, lon_col]) @@ -167,12 +165,13 @@ def harmonize_locations(df_in, out_EPSG=4326, df2 = check_precision(df2, lon_col) # Create tuple column - df2['geom_orig'] = list(zip(df2[lon_col], df2[lat_col])) + df2["geom_orig"] = list(zip(df2[lon_col], df2[lat_col])) # Create/populate EPSG column crs_mask = df2[crs_col].isin(xy_datum.keys()) # w/ known datum - df2.loc[crs_mask, 'EPSG'] = [xy_datum[crs]['EPSG'] for crs - in df2.loc[crs_mask, crs_col]] + df2.loc[crs_mask, "EPSG"] = [ + xy_datum[crs]["EPSG"] for crs in df2.loc[crs_mask, crs_col] + ] # Fix/flag missing df2 = infer_CRS(df2, out_EPSG, crs_col=crs_col) @@ -182,16 +181,17 @@ def harmonize_locations(df_in, out_EPSG=4326, df2 = infer_CRS(df2, out_EPSG, bad_crs_val=crs, crs_col=crs_col) # Transform points by vector (sub-set by datum) - for datum in set(df2['EPSG'].astype(int)): + for datum in set(df2["EPSG"].astype(int)): df2 = transform_vector_of_points(df2, datum, out_EPSG) # Convert geom to shape object to use with geopandas - df2['geom'] = [shape({'type': 'Point', 'coordinates': pnt}) - for pnt in list(df2['geom'])] - gdf = geopandas.GeoDataFrame(df2, geometry=df2['geom'], crs=out_EPSG) + df2["geom"] = [ + shape({"type": "Point", "coordinates": pnt}) for pnt in list(df2["geom"]) + ] + gdf = geopandas.GeoDataFrame(df2, geometry=df2["geom"], crs=out_EPSG) if not intermediate_columns: # Drop intermediate columns - gdf = gdf.drop(['geom', 'geom_orig', 'EPSG'], axis=1) + gdf = gdf.drop(["geom", "geom_orig", "EPSG"], axis=1) return gdf @@ -215,13 +215,12 @@ def transform_vector_of_points(df_in, datum, out_EPSG): """ # Create transform object for input datum (EPSG colum) and out_EPSG transformer = Transformer.from_crs(datum, out_EPSG) - d_mask = df_in['EPSG'] == datum # Mask for datum in subset - points = df_in.loc[d_mask, 'geom_orig'] # Points series + d_mask = df_in["EPSG"] == datum # Mask for datum in subset + points = df_in.loc[d_mask, "geom_orig"] # Points series # List of transformed point geometries new_geoms = [transformer.transform(pnt[0], pnt[1]) for pnt in points] # Assign list to df.geom using Index from mask to re-index list - df_in.loc[d_mask, 'geom'] = pandas.Series(new_geoms, - index=df_in.loc[d_mask].index) + df_in.loc[d_mask, "geom"] = pandas.Series(new_geoms, index=df_in.loc[d_mask].index) return df_in @@ -262,8 +261,8 @@ def get_harmonized_stations(query, aoi=None): # TODO: **kwargs instead of query dict? # Query stations (can be slow) - if 'dataProfile' in query.keys(): - query.pop('dataProfile') # TODO: this changes query arg (mutable) + if "dataProfile" in query.keys(): + query.pop("dataProfile") # TODO: this changes query arg (mutable) stations, site_md = wqp.what_sites(**query) # Harmonize stations