refactor: map generators to substations using voltages

danielolsen · danielolsen · commit 39058fb70e20 · 2022-02-22T09:42:07.000-08:00
diff --git a/prereise/gather/griddata/hifld/data_process/generators.py b/prereise/gather/griddata/hifld/data_process/generators.py
@@ -1,3 +1,5 @@
+from math import asin
+
 import numpy as np
 import pandas as pd
 from scipy.optimize import curve_fit
@@ -24,7 +26,9 @@ def floatify(value, default=float("nan")):
         return default
 
 
-def map_generators_to_sub_by_location(generators, substations, inplace=True):
+def map_generators_to_sub_by_location(
+    generators, substations, inplace=True, report_worst=None
+):
     """Determine the closest substation to each generator. For generators without
     latitude and longitude, an attempt will be made to match via ZIP code, and failing
     that a pandas.NA value will be returned.
@@ -37,17 +41,38 @@ def map_generators_to_sub_by_location(generators, substations, inplace=True):
         'sub_id' column or to return a new one. If ``inplace`` is `True`, entries in
         `generators` which have non-sensical combinations of 'state' and 'interconnect'
         columns will have their 'interconnect' entries modified.
+    :param int report_worst: if not None, display the distances of the worst N mappings.
     :return: (*pandas.DataFrame/None*) -- if ``inplace`` is `False`, return the modified
         DataFrame; otherwise return nothing.
     """
 
-    def get_sub_id_of_closest_substation(generator, state_trees, subs_state_lookup):
+    def get_closest_substation(generator, state_trees, subs_state_lookup):
         if not isinstance(generator["xyz"], list):
             return pd.NA
-        grouper_key = (generator["interconnect"], generator["state"])
-        _, array_index = state_trees[grouper_key].query(generator["xyz"])
-        sub_index = subs_state_lookup[grouper_key][array_index]
-        return sub_index
+        if pd.isnull(generator["voltage_class"]) or generator["Pmax"] < 100:
+            grouper_key = generator["interconnect"]
+        else:
+            grouper_key = (generator["interconnect"], generator["voltage_class"])
+        chord_dist, array_index = voltage_trees[grouper_key].query(generator["xyz"])
+        sub_id = subs_voltage_lookup[grouper_key][array_index]
+        # Translate chord distance (unit sphere) to great circle distance (miles)
+        dist_in_miles = 3963 * 2 * asin(chord_dist / 2)  # use 3963 mi as earth radius
+        return pd.Series({"dist": dist_in_miles, "sub_id": sub_id})
+
+    def classify_voltages(voltage, voltage_ranges):
+        for v_range, bounds in voltage_ranges.items():
+            if bounds["min"] <= voltage <= bounds["max"]:
+                return v_range
+        return float("nan")
+
+    voltage_ranges = {
+        "under 100": {"min": 0, "max": 99},
+        "100-161": {"min": 100, "max": 161},
+        "220-287": {"min": 220, "max": 287},
+        "345": {"min": 345, "max": 345},
+        "500": {"min": 500, "max": 500},
+        "735 and above": {"min": 735, "max": float("inf")},
+    }
 
     # Translate lat/lon to 3D positions (assume spherical earth, origin at center)
     substations_with_xyz = substations.assign(
@@ -64,68 +89,67 @@ def get_sub_id_of_closest_substation(generator, state_trees, subs_state_lookup):
         )
     )
 
-    # Group substations by state to build KDTrees
-    subs_state_lookup = substations_with_xyz.groupby(["interconnect", "STATE"]).groups
+    # Bin voltages into broad classes
+    substations_with_xyz["voltage_class"] = substations["MAX_VOLT"].map(
+        lambda x: classify_voltages(x, voltage_ranges)
+    )
+    generators_with_xyz["voltage_class"] = generators["Grid Voltage (kV)"].map(
+        lambda x: classify_voltages(x, voltage_ranges)
+    )
+
+    # Group substations by voltage to build KDTrees
+    subs_voltage_lookup = {
+        (interconnect, voltage_level): substations_with_xyz.query(
+            "interconnect == @interconnect and MAX_VOLT >= @voltage_range['min']"
+        ).index
+        for interconnect in generators["interconnect"].unique()
+        for voltage_level, voltage_range in voltage_ranges.items()
+    }
     # Group substations by ZIP code for a fallback for generators without coordinates
     subs_zip_groupby = substations_with_xyz.groupby(["interconnect", "ZIP"])
 
     # Create a KDTree for each combination of state and interconnect
-    state_trees = {
+    voltage_trees = {
         key: KDTree(np.array(substations_with_xyz.loc[sub_ids, "xyz"].tolist()))
-        for key, sub_ids in subs_state_lookup.items()
+        for key, sub_ids in subs_voltage_lookup.items()
+        if len(sub_ids) > 0
     }
-    # Ensure that we have a tree for every generator
-    gens_state_groupby = generators_with_xyz.groupby(["interconnect", "state"])
-    missing_groups = set(gens_state_groupby.groups) - set(state_trees)
-    if len(missing_groups) > 0:
-        # There are some combinations of generator (interconnect, state) without subs
-        allowable_border_states = {"KS", "NE", "OK"}
-        for interconnect, state in missing_groups:
-            if state in allowable_border_states:
-                # Assume that the interconnect and state are correct
-                print(
-                    f"no substations within ({interconnect}, {state}), "
-                    f"will map generators to substations within {interconnect} instead"
-                )
-                # Find all substations for the interconnection
-                new_subs = substations_with_xyz.query("interconnect == @interconnect")
-                # Extend the 'true' combinations of (interconnect, state) with fakes
-                state_trees[(interconnect, state)] = KDTree(
-                    np.array(new_subs["xyz"].tolist())
-                )
-                subs_state_lookup[(interconnect, state)] = new_subs.index
-            else:
-                # Assume that the state is correct, the interconnect is wrong
-                print(
-                    f"no substations within ({interconnect}, {state}), "
-                    f"will map generators to substations within {state} instead"
-                )
-                (assumed,) = {
-                    interconnect
-                    for interconnect, state_list in const.interconnect2state.items()
-                    if interconnect not in {"ignore", "split"} and state in state_list
-                }
-                gens_to_fix = gens_state_groupby.get_group((interconnect, state)).index
-                generators_with_xyz.loc[gens_to_fix, "interconnect"] = assumed
+    # Create a KDTree for each interconnect (all voltages)
+    subs_interconnect_groupby = substations_with_xyz.groupby("interconnect")
+    for interconnect in generators["interconnect"].unique():
+        tree_subs = subs_interconnect_groupby.get_group(interconnect)
+        voltage_trees[interconnect] = KDTree(np.array(tree_subs["xyz"].tolist()))
+        subs_voltage_lookup[interconnect] = tree_subs.index
 
     # Query the appropriate tree for each generator to get the closest substation ID
-    sub_ids = generators_with_xyz.apply(
-        lambda x: get_sub_id_of_closest_substation(x, state_trees, subs_state_lookup),
+    mapping_results = generators_with_xyz.apply(
+        lambda x: get_closest_substation(x, voltage_trees, subs_voltage_lookup),
         axis=1,
     )
     # For generators without coordinates, try to pick a substation with a matching ZIP
-    for g in generators.loc[sub_ids.isnull()].index:
+    for g in generators.loc[mapping_results["sub_id"].isnull()].index:
         try:
             candidates = subs_zip_groupby.get_group(generators.loc[g, "ZIP"])
-            sub_ids.loc[g] = candidates.index[0]  # arbitrary choose the first one
+            # arbitrarily choose the first one
+            mapping_results.loc[g, "sub_id"] = candidates.index[0]
         except KeyError:
             continue  # No coordinates, no matching ZIP, we're out of luck
 
+    if report_worst is not None:  # worst = largest distance to substation
+        print(
+            mapping_results.sort_values("dist", ascending=False)
+            .join(generators[["Plant Code", "Grid Voltage (kV)", "Pmax"]])
+            .head(report_worst)
+        )
+
     if inplace:
-        generators["sub_id"] = sub_ids
+        generators["sub_id"] = mapping_results["sub_id"]
+        generators["sub_dist"] = mapping_results["dist"]
         generators["interconnect"] = generators_with_xyz["interconnect"]
     else:
-        return generators_with_xyz.drop("xyz", axis=1).assign(sub_id=sub_ids)
+        return generators_with_xyz.drop(["xyz", "voltage_class"], axis=1).join(
+            mapping_results
+        )
 
 
 def map_generator_to_bus_by_sub(generator, bus_groupby):
@@ -375,7 +399,16 @@ def build_plant(bus, substations, kwargs={}):
     epa_ampd_groupby = epa_ampd.groupby(["ORISPL_CODE", "UNITID"])
 
     # Add information to generators based on Form 860 Plant table
-    generators = generators.merge(plants, on="Plant Code", suffixes=(None, "_860Plant"))
+    # Merging this way allows column-on-column merge while preserving original index
+    generators = (
+        generators.reset_index()
+        .merge(
+            plants,
+            on="Plant Code",
+            suffixes=(None, "_860Plant"),
+        )
+        .set_index("index")
+    )
     generators.rename(
         {"Latitude": "lat", "Longitude": "lon", "Zip": "ZIP"}, axis=1, inplace=True
     )
@@ -385,6 +418,7 @@ def build_plant(bus, substations, kwargs={}):
         .map(const.balancingauthority2interconnect)
         .combine_first(generators["NERC Region"].map(const.nercregion2interconnect))
     )
+    generators["Grid Voltage (kV)"] = generators["Grid Voltage (kV)"].map(floatify)
 
     # Ensure we have Pmax and Pmin for each generator
     generators["Pmax"] = generators[