From 7fefd316c3ed1483362fb4f04869b0aa45932701 Mon Sep 17 00:00:00 2001 From: niclaswue <17875208+niclaswue@users.noreply.github.com> Date: Fri, 24 May 2024 09:29:16 +0200 Subject: [PATCH] Improve support for SCAT dataset (#437) --- src/traffic/data/datasets/scat.py | 151 ++++++++++++++++++++++-------- tests/test_datasets.py | 52 ++++++++++ 2 files changed, 163 insertions(+), 40 deletions(-) diff --git a/src/traffic/data/datasets/scat.py b/src/traffic/data/datasets/scat.py index 603b4ae4..57633158 100644 --- a/src/traffic/data/datasets/scat.py +++ b/src/traffic/data/datasets/scat.py @@ -8,6 +8,7 @@ import pandas as pd from ...core import Flight, Traffic, tqdm +from ...data.basic.navaid import Navaids from .mendeley import Mendeley @@ -51,55 +52,113 @@ class SCAT: traffic: Traffic flight_plans: pd.DataFrame clearances: pd.DataFrame + waypoints: Navaids + weather: pd.DataFrame def parse_zipinfo(self, zf: ZipFile, file_info: ZipInfo) -> Entry: with zf.open(file_info.filename, "r") as fh: content_bytes = fh.read() - decoded = json.loads(content_bytes.decode()) - flight_id = str(decoded["id"]) # noqa: F841 - - flight_plan = ( - pd.json_normalize(decoded["fpl"]["fpl_plan_update"]) - .rename(columns=rename_columns) - .eval( - """ - timestamp = @pd.to_datetime(timestamp, utc=True, format="mixed") - flight_id = @flight_id + decoded = json.loads(content_bytes.decode()) + flight_id = str(decoded["id"]) # noqa: F841 + + flight_plan = ( + pd.json_normalize(decoded["fpl"]["fpl_plan_update"]) + .rename(columns=rename_columns) + .eval( """ - ) + timestamp = @pd.to_datetime(timestamp, utc=True, format="mixed") + flight_id = @flight_id + """, + engine="python", ) + ) - clearance = ( - pd.json_normalize(decoded["fpl"]["fpl_clearance"]) - .rename(columns=rename_columns) - .eval( - """ - timestamp = @pd.to_datetime(timestamp, utc=True, format="mixed") - flight_id = @flight_id + clearance = ( + pd.json_normalize(decoded["fpl"]["fpl_clearance"]) + .rename(columns=rename_columns) + .eval( """ - ) + timestamp = @pd.to_datetime(timestamp, utc=True, format="mixed") + flight_id = @flight_id + """, + engine="python", ) + ) - fpl_base, *_ = decoded["fpl"]["fpl_base"] - df = ( - pd.json_normalize(decoded["plots"]) - .rename(columns=rename_columns) - .eval( - """ - timestamp = @pd.to_datetime(time_of_track, utc=True, format="mixed") - altitude = 100 * flight_level - origin = @fpl_base['adep'] - destination = @fpl_base['ades'] - typecode = @fpl_base['aircraft_type'] - callsign = @fpl_base['callsign'] - flight_id = @flight_id - icao24 = "000000" - """ - ) + fpl_base, *_ = decoded["fpl"]["fpl_base"] + df = ( + pd.json_normalize(decoded["plots"]) + .rename(columns=rename_columns) + .eval( + """ + timestamp = @pd.to_datetime(time_of_track, utc=True, format="mixed") + altitude = 100 * flight_level + origin = @fpl_base['adep'] + destination = @fpl_base['ades'] + typecode = @fpl_base['aircraft_type'] + callsign = @fpl_base['callsign'] + flight_id = @flight_id + icao24 = "000000" + """, + engine="python", + ) + ) + return Entry(Flight(df), flight_plan, clearance) + + def parse_waypoints(self, zf: ZipFile, file_info: ZipInfo) -> Navaids: + rename_columns = { + "lat": "latitude", + "lon": "longitude", + } + with zf.open(file_info.filename, "r") as fh: + content_bytes = fh.read() + centers = json.loads(content_bytes.decode()) + + fixes = [] + for center in centers: + points = pd.json_normalize(center["points"]) + points["type"] = "FIX" + points["altitude"] = None + points["frequency"] = None + points["magnetic_variation"] = None + points["description"] = f"Center: {center['name']}" + fixes.append(points.rename(columns=rename_columns)) + df = pd.concat(fixes).drop_duplicates(ignore_index=True) + waypoints = Navaids(data=df) + waypoints.priority = -1 # prefer over default navaids + return waypoints + + def parse_weather(self, zf: ZipFile, file_info: ZipInfo) -> pd.DataFrame: + rename_columns = { + "alt": "altitude", + "lat": "latitude", + "lon": "longitude", + "temp": "temperature", + "time": "timestamp", + "wind_dir": "wind_direction", + "wind_spd": "wind_speed", + } + with zf.open(file_info.filename, "r") as fh: + content_bytes = fh.read() + decoded = json.loads(content_bytes.decode()) + return ( + pd.json_normalize(decoded) + .rename(columns=rename_columns) + .eval( + """ + timestamp = @pd.to_datetime(timestamp, utc=True, format="mixed") + """, + engine="python", ) - return Entry(Flight(df), flight_plan, clearance) + ) - def __init__(self, ident: str, nflights: None | int = None) -> None: + def __init__( + self, + ident: str, + nflights: None | int = None, + include_waypoints: bool = False, + include_weather: bool = False, + ) -> None: mendeley = Mendeley("8yn985bwz5") filename = mendeley.get_data(ident) @@ -108,14 +167,26 @@ def __init__(self, ident: str, nflights: None | int = None) -> None: flight_plans = [] with ZipFile(filename, "r") as zf: - info_list = zf.infolist() - if nflights is not None: - info_list = info_list[:nflights] + all_files = zf.infolist() + total_flights = len(all_files) - 2 + nflights = ( + min(nflights, total_flights) + if nflights is not None + else total_flights + ) + info_list = all_files[:nflights] + if include_waypoints: + info_list.append(all_files[-2]) + if include_weather: + info_list.append(all_files[-1]) + for file_info in tqdm(info_list): if "airspace" in file_info.filename: + self.waypoints = self.parse_waypoints(zf, file_info) continue if "grib_meteo" in file_info.filename: + self.weather = self.parse_weather(zf, file_info) continue entry = self.parse_zipinfo(zf, file_info) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 4aa2a860..2989a8c3 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,8 +1,60 @@ from traffic.data.datasets.scat import SCAT +import pandas as pd + def test_scat() -> None: s = SCAT("scat20161015_20161021.zip", nflights=10) assert len(s.traffic) == 10 assert s.flight_plans.flight_id.nunique() == 10 assert s.clearances.flight_id.nunique() == 10 + + +def test_scat_waypoints() -> None: + s = SCAT("scat20161015_20161021.zip", nflights=10, include_waypoints=True) + assert isinstance(s.waypoints, pd.DataFrame) + assert len(s.waypoints) == 15871 + assert set(s.waypoints.columns) == { + "latitude", + "longitude", + "name", + "center", + } + aa212 = s.waypoints[s.waypoints["name"] == "AA212"] + assert len(aa212) == 1 + assert aa212["latitude"].item() == 58.4902778 + assert aa212["longitude"].item() == 14.4866667 + assert aa212["center"].item() == "ESMM" + + # KERAX is present for both centers + kerax = s.waypoints[s.waypoints["name"] == "KERAX"] + assert set(kerax["center"].values) == {"ESMM", "ESOS"} + assert kerax.iloc[0]["latitude"] == kerax.iloc[1]["latitude"] == 50.475 + assert kerax.iloc[0]["longitude"] == kerax.iloc[1]["longitude"] == 9.5819444 + + +def test_scat_weather() -> None: + s = SCAT("scat20161015_20161021.zip", nflights=10, include_weather=True) + assert isinstance(s.weather, pd.DataFrame) + assert not s.weather.isna().any().max() + assert len(s.weather) == 1519310 + assert set(s.weather.columns) == { + "altitude", + "latitude", + "longitude", + "temperature", + "timestamp", + "wind_direction", + "wind_speed", + } + assert isinstance(s.weather["timestamp"].dtype, pd.DatetimeTZDtype) + + # compare measurement for a specific timestamp + ts = pd.to_datetime("2016-10-14 10:30:00+00:00") # noqa: F841 + measurement = s.weather.query( + "timestamp == @ts & altitude == 50 & latitude == 42.5 & longitude == 60" + ) + assert len(measurement) == 1 + assert measurement["temperature"].item() == 4 + assert measurement["wind_direction"].item() == 166 + assert measurement["wind_speed"].item() == 16