Skip to content

Commit

Permalink
Clear cache on file close (#90)
Browse files Browse the repository at this point in the history
S3Path downloads every file it reads to a temp directory by default.
This meant we were downloading the entire bucket! Unsurprisingly, we ran
out of disk space!

There was no configurable option for this in v0.7 of cloudpathlib, but
later versions support cache clearing. Going with the `close_file`
strategy ensures that we clean up files after they are read into memory.
For more, see:
https://cloudpathlib.drivendata.org/v0.17/caching/#setting-the-cache-clearing-method
  • Loading branch information
olaughter authored Feb 1, 2024
1 parent fbb08e7 commit 50c9040
Show file tree
Hide file tree
Showing 3 changed files with 651 additions and 649 deletions.
5 changes: 4 additions & 1 deletion cli/index_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
_LOGGER = logging.getLogger(__name__)
logging.config.dictConfig(DEFAULT_LOGGING)

# Ask cloudpathlib to remove each locally cached copy as soon as the file is
# closed, so reading many S3 objects does not fill the disk with temp files.
# The variable name must be spelled exactly CLOUDPATHLIB_FILE_CACHE_MODE;
# the previous spelling ("CLOUPATHLIB_...") is not a name cloudpathlib reads,
# so the setting had no effect.
os.environ["CLOUDPATHLIB_FILE_CACHE_MODE"] = "close_file"


def _get_index_tasks(
text2embedding_output_dir: str,
Expand All @@ -47,10 +49,11 @@ def _get_index_tasks(
) -> Tuple[Sequence[ParserOutput], Union[Path, S3Path]]:
if s3:
embedding_dir_as_path = cast(S3Path, S3Path(text2embedding_output_dir))
_LOGGER.info(f"Getting tasks from s3, cache dir: {embedding_dir_as_path._local}")
else:
embedding_dir_as_path = Path(text2embedding_output_dir)
_LOGGER.info(f"Getting tasks from local")

_LOGGER.info(f"Getting tasks from {'s3' if s3 else 'local'}")
tasks = [
ParserOutput.model_validate_json(path.read_text())
for path in tqdm(list(embedding_dir_as_path.glob("*.json")))
Expand Down
Loading

0 comments on commit 50c9040

Please sign in to comment.