Clear cache on file close (#90)

S3Path downloads every file it reads to a temporary directory by default. This meant we were effectively downloading the entire bucket and, unsurprisingly, ran out of disk space. There was no configurable option for this in v0.7 of cloudpathlib, but later versions support cache clearing. Going with the `close_file` strategy ensures that we clean up cached files after they are read into memory. For more details, see: https://cloudpathlib.drivendata.org/v0.17/caching/#setting-the-cache-clearing-method
climatepolicyradar · Feb 1, 2024 · 50c9040 · 50c9040
1 parent fbb08e7
commit 50c9040
Show file tree

Hide file tree

Showing 3 changed files with 651 additions and 649 deletions.
diff --git a/cli/index_data.py b/cli/index_data.py
@@ -38,6 +38,8 @@
 _LOGGER = logging.getLogger(__name__)
 logging.config.dictConfig(DEFAULT_LOGGING)
 
+os.environ["CLOUDPATHLIB_FILE_CACHE_MODE"] = "close_file"
+
 
 def _get_index_tasks(
     text2embedding_output_dir: str,
@@ -47,10 +49,11 @@ def _get_index_tasks(
 ) -> Tuple[Sequence[ParserOutput], Union[Path, S3Path]]:
     if s3:
         embedding_dir_as_path = cast(S3Path, S3Path(text2embedding_output_dir))
+        _LOGGER.info(f"Getting tasks from s3, cache dir: {embedding_dir_as_path._local}")
     else:
         embedding_dir_as_path = Path(text2embedding_output_dir)
+        _LOGGER.info(f"Getting tasks from local")
 
-    _LOGGER.info(f"Getting tasks from {'s3' if s3 else 'local'}")
     tasks = [
         ParserOutput.model_validate_json(path.read_text())
         for path in tqdm(list(embedding_dir_as_path.glob("*.json")))