From 6cd3dbea5b3810e58be49bd76c9c861eaaa8891c Mon Sep 17 00:00:00 2001
From: Joseph Xu
Date: Wed, 16 Oct 2024 09:19:52 -0700
Subject: [PATCH] Prioritize using longitude and latitude columns in building files when present.

This saves time over parsing building polygon WKTs and deriving centroids from them. Also, drop duplicate buildings during example generation.

PiperOrigin-RevId: 686532902
---
 src/skai/buildings.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/skai/buildings.py b/src/skai/buildings.py
index b55ab5bb..465c1dc0 100644
--- a/src/skai/buildings.py
+++ b/src/skai/buildings.py
@@ -41,7 +41,10 @@ def _read_buildings_csv(path: str) -> gpd.GeoDataFrame:
   """
   with tf.io.gfile.GFile(path, 'r') as csv_file:
     df = pd.read_csv(csv_file)
-  if 'geometry' in df.columns:
+  if 'longitude' in df.columns and 'latitude' in df.columns:
+    geometries = gpd.points_from_xy(df['longitude'], df['latitude'])
+    df.drop(columns=['longitude', 'latitude'], inplace=True)
+  elif 'geometry' in df.columns:
     logging.info('Parsing %d WKT strings. This could take a while.', len(df))
     geometries = gpd.GeoSeries.from_wkt(df['geometry'])
     df.drop(columns=['geometry'], inplace=True)
@@ -49,13 +52,12 @@ def _read_buildings_csv(path: str) -> gpd.GeoDataFrame:
     logging.info('Parsing %d WKT strings. This could take a while.', len(df))
     geometries = gpd.GeoSeries.from_wkt(df['wkt'])
     df.drop(columns=['wkt'], inplace=True)
-  elif 'longitude' in df.columns and 'latitude' in df.columns:
-    geometries = gpd.points_from_xy(df['longitude'], df['latitude'])
-    df.drop(columns=['longitude', 'latitude'], inplace=True)
   else:
     raise ValueError(f'No geometry information found in file "{path}"')
 
-  return gpd.GeoDataFrame(df, geometry=geometries, crs=4326)
+  geometries = geometries.normalize()
+  gdf = gpd.GeoDataFrame(df, geometry=geometries, crs=4326)
+  return gdf.drop_duplicates()
 
 
 def convert_buildings_file(