From 3a8c821064914a12757796c4c4d58535fd498c08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20Manuel=20Burgue=C3=B1o=20Romero?= Date: Mon, 15 Jul 2024 08:00:32 +0000 Subject: [PATCH] Now tile dataframe is created using a dict not pd.concat, reducing RAM usage --- src/landcoverpy/utilities/raster.py | 9 +++------ src/landcoverpy/workflow_predict.py | 24 +++++++++++------------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/src/landcoverpy/utilities/raster.py b/src/landcoverpy/utilities/raster.py index d9c2dd7..3072ad1 100644 --- a/src/landcoverpy/utilities/raster.py +++ b/src/landcoverpy/utilities/raster.py @@ -1,5 +1,4 @@ import json -import math from itertools import compress from pathlib import Path @@ -53,12 +52,13 @@ def _read_raster( """ if window is not None and (mask_geometry is not None or path_to_disk is not None): - print("If a window is provided, the raster can't be saved to disk or cropped") + raise ValueError("If a window is provided, mask_geometry and path_to_disk must be None") band_name = _get_raster_name_from_path(str(band_path)) - print(f"Reading raster {band_name}") + with rasterio.open(band_path) as band_file: + spatial_resolution = _get_spatial_resolution_raster(band_path) if window is not None and spatial_resolution != 10: # Transform the window to the actual resolution of the raster @@ -100,7 +100,6 @@ def _read_raster( path_to_disk = path_to_disk[:-3] + "tif" if normalize_range is not None: - print(f"Normalizing band {band_name}") value1, value2 = normalize_range band = _normalize(band, value1, value2) @@ -111,7 +110,6 @@ def _read_raster( # Create a temporal memory file to mask the band # This is necessary because the band is previously read to scale its resolution if mask_geometry: - print(f"Cropping raster {band_name}") projected_geometry = _project_shape(mask_geometry, dcs=destination_crs) with rasterio.io.MemoryFile() as memfile: with memfile.open(**kwargs) as memfile_band: @@ -430,7 +428,6 @@ def _rescale_band( rescaled_raster = np.ndarray( shape=(kwargs["count"], new_kwargs["height"], new_kwargs["width"]), dtype=np.float32) - print(f"Rescaling raster {band_name}, from: {img_resolution}m to {str(spatial_resol)}.0m") reproject( source=band, destination=rescaled_raster, diff --git a/src/landcoverpy/workflow_predict.py b/src/landcoverpy/workflow_predict.py index e377ed9..b181e83 100644 --- a/src/landcoverpy/workflow_predict.py +++ b/src/landcoverpy/workflow_predict.py @@ -173,7 +173,6 @@ def _process_tile_predict(tile, execution_mode, used_columns=None, use_block_win rasters_by_season = defaultdict(dict) for season, products_metadata in product_per_season.items(): - print(season) bucket_products = settings.MINIO_BUCKET_NAME_PRODUCTS bucket_composites = settings.MINIO_BUCKET_NAME_COMPOSITES current_bucket = None @@ -218,7 +217,6 @@ def _process_tile_predict(tile, execution_mode, used_columns=None, use_block_win temp_product_folder = Path(settings.TMP_DIR, product_name + ".SAFE") if not temp_product_folder.exists(): Path.mkdir(temp_product_folder) - print(f"Processing product {product_name}") rasters_by_season[season]["raster_paths"] = rasters_paths rasters_by_season[season]["is_band"] = is_band @@ -249,8 +247,8 @@ def _process_tile_predict(tile, execution_mode, used_columns=None, use_block_win window_kwargs["height"] = window.height window_kwargs["transform"] = rasterio.windows.transform(window, kwargs_s2["transform"]) - # Dataframe for storing data of a window - window_tile_df = None + # Dict for storing data of a window (all rasters and indexes) + window_tile_dict = {} crop_mask = np.zeros(shape=(int(window_kwargs["height"]), int(window_kwargs["width"])), dtype=np.uint8) @@ -306,9 +304,7 @@ def _process_tile_predict(tile, execution_mode, used_columns=None, use_block_win raster_masked = np.ma.masked_array(raster[0], mask=crop_mask) raster_masked = np.ma.compressed(raster_masked) - window_raster_df = pd.DataFrame({f"{season}_{raster_name}": raster_masked}) - - window_tile_df = pd.concat([window_tile_df, window_raster_df], axis=1) + window_tile_dict.update({f"{season}_{raster_name}": raster_masked}) for dem_name in dems_raster_names: # Add dem and aspect data @@ -333,13 +329,9 @@ def _process_tile_predict(tile, execution_mode, used_columns=None, use_block_win ) raster_masked = np.ma.masked_array(raster, mask=crop_mask) raster_masked = np.ma.compressed(raster_masked).flatten() - window_raster_df = pd.DataFrame({dem_name: raster_masked}) - window_tile_df = pd.concat([window_tile_df, window_raster_df], axis=1) - - + window_tile_dict.update({dem_name: raster_masked}) - print("Dataframe information:") - print(window_tile_df.info()) + window_tile_df = pd.DataFrame(window_tile_dict) if execution_mode == ExecutionMode.LAND_COVER_PREDICTION: @@ -404,6 +396,8 @@ def _process_tile_predict(tile, execution_mode, used_columns=None, use_block_win ) as classification_file: classification_file.write(encoded_sl_predictions, window=window) + print(f"Window {window} processed, output raster {classification_path} updated") + minio_client.fput_object( bucket_name=settings.MINIO_BUCKET_CLASSIFICATIONS, object_name=f"{settings.MINIO_DATA_FOLDER_NAME}/{classification_name}", @@ -411,6 +405,10 @@ def _process_tile_predict(tile, execution_mode, used_columns=None, use_block_win content_type="image/tif", ) + print(f"Classification raster {classification_name} uploaded to Minio") + print(f"Tile {tile} processed") + print("Cleaning up temporary files") + for path in Path(settings.TMP_DIR).glob("**/*"): if path.is_file(): path.unlink()