From 42ccc0012ba8864e6db1392430100f350236183a Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 13 Jan 2021 11:31:39 +0100 Subject: [PATCH] fix cache_file_name docstring to make it explicit that it is a path --- src/datasets/arrow_dataset.py | 20 ++++++++++---------- src/datasets/dataset_dict.py | 8 ++++---- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 5fb3a70e191..f9a1f9e96f5 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -1163,7 +1163,7 @@ def map( keep_in_memory (`bool`, defaults to `False`): Keep the dataset in memory instead of writing it to a cache file. load_from_cache_file (`bool`, defaults to `True`): If a cache file storing the current computation from `function` can be identified, use it instead of recomputing. - cache_file_name (`Optional[str]`, defaults to `None`): Provide the name of a cache file to use to store the + cache_file_name (`Optional[str]`, defaults to `None`): Provide the path of the cache file. It is used to store the results of the computation instead of the automatically generated cache file name. writer_batch_size (`int`, defaults to `1000`): Number of rows per write operation for the cache file writer. Higher value gives smaller cache files, lower value consume less temporary memory while running `.map()`. @@ -1369,7 +1369,7 @@ def _map_single( keep_in_memory (`bool`, defaults to `False`): Keep the dataset in memory instead of writing it to a cache file. load_from_cache_file (`bool`, defaults to `True`): If a cache file storing the current computation from `function` can be identified, use it instead of recomputing. - cache_file_name (`Optional[str]`, defaults to `None`): Provide the name of a cache file to use to store the + cache_file_name (`Optional[str]`, defaults to `None`): Provide the path of the cache file. 
It is used to store the results of the computation instead of the automatically generated cache file name. writer_batch_size (`int`, defaults to `1000`): Number of rows per write operation for the cache file writer. Higher value gives smaller cache files, lower value consume less temporary memory while running `.map()`. @@ -1590,7 +1590,7 @@ def filter( keep_in_memory (`bool`, defaults to `False`): Keep the dataset in memory instead of writing it to a cache file. load_from_cache_file (`bool`, defaults to `True`): If a cache file storing the current computation from `function` can be identified, use it instead of recomputing. - cache_file_name (`Optional[str]`, defaults to `None`): Provide the name of a cache file to use to store the + cache_file_name (`Optional[str]`, defaults to `None`): Provide the path of the cache file. It is used to store the results of the computation instead of the automatically generated cache file name. writer_batch_size (`int`, defaults to `1000`): Number of rows per write operation for the cache file writer. Higher value gives smaller cache files, lower value consume less temporary memory while running `.map()`. @@ -1660,7 +1660,7 @@ def flatten_indices( Args: keep_in_memory (`bool`, default: `False`): Keep the dataset in memory instead of writing it to a cache file. - cache_file_name (`Optional[str]`, defaults to `None`): Provide the name of a cache file to use to store the + cache_file_name (`Optional[str]`, defaults to `None`): Provide the path of the cache file. It is used to store the results of the computation instead of the automatically generated cache file name. writer_batch_size (`int`, defaults to `1000`): Number of rows per write operation for the cache file writer. Higher value gives smaller cache files, lower value consume less temporary memory while running `.map()`. 
@@ -1736,7 +1736,7 @@ def select( Args: `indices` (sequence, iterable, ndarray or Series): List or 1D-array of integer indices for indexing. `keep_in_memory` (`bool`, default: `False`): Keep the indices mapping in memory instead of writing it to a cache file. - `indices_cache_file_name` (`Optional[str]`, default: `None`): Provide the name of a cache file to use to store the + `indices_cache_file_name` (`Optional[str]`, default: `None`): Provide the path of the cache file. It is used to store the indices mapping instead of the automatically generated cache file name. `writer_batch_size` (`int`, default: `1000`): Number of rows per write operation for the cache file writer. Higher value gives smaller cache files, lower value consume less temporary memory while running `.map()`. @@ -1830,7 +1830,7 @@ def sort( keep_in_memory (`bool`, defaults to `False`): Keep the sorted indices in memory instead of writing it to a cache file. load_from_cache_file (`bool`, defaults to `True`): If a cache file storing the sorted indices can be identified, use it instead of recomputing. - indices_cache_file_name (`Optional[str]`, defaults to `None`): Provide the name of a cache file to use to store the + indices_cache_file_name (`Optional[str]`, defaults to `None`): Provide the path of the cache file. It is used to store the sorted indices instead of the automatically generated cache file name. writer_batch_size (`int`, defaults to `1000`): Number of rows per write operation for the cache file writer. Higher value gives smaller cache files, lower value consume less temporary memory. @@ -1906,7 +1906,7 @@ def shuffle( keep_in_memory (`bool`, defaults to `False`): Keep the shuffled indices in memory instead of writing it to a cache file. load_from_cache_file (`bool`, defaults to `True`): If a cache file storing the shuffled indices can be identified, use it instead of recomputing. 
- indices_cache_file_name (`Optional[str]`, defaults to `None`): Provide the name of a cache file to use to store the + indices_cache_file_name (`Optional[str]`, defaults to `None`): Provide the path of the cache file. It is used to store the shuffled indices instead of the automatically generated cache file name. writer_batch_size (`int`, defaults to `1000`): Number of rows per write operation for the cache file writer. Higher value gives smaller cache files, lower value consume less temporary memory while running `.map()`. @@ -1998,9 +1998,9 @@ def train_test_split( keep_in_memory (`bool`, defaults to `False`): Keep the splits indices in memory instead of writing it to a cache file. load_from_cache_file (`bool`, defaults to `True`): If a cache file storing the splits indices can be identified, use it instead of recomputing. - train_cache_file_name (`Optional[str]`, defaults to `None`): Provide the name of a cache file to use to store the + train_cache_file_name (`Optional[str]`, defaults to `None`): Provide the path of the cache file. It is used to store the train split indices instead of the automatically generated cache file name. - test_cache_file_name (`Optional[str]`, defaults to `None`): Provide the name of a cache file to use to store the + test_cache_file_name (`Optional[str]`, defaults to `None`): Provide the path of the cache file. It is used to store the test split indices instead of the automatically generated cache file name. writer_batch_size (`int`, defaults to `1000`): Number of rows per write operation for the cache file writer. Higher value gives smaller cache files, lower value consume less temporary memory while running `.map()`. @@ -2183,7 +2183,7 @@ def shard( keep_in_memory (`bool`, defaults to `False`): Keep the dataset in memory instead of writing it to a cache file. 
load_from_cache_file (`bool`, defaults to `True`): If a cache file storing the current computation from `function` can be identified, use it instead of recomputing. - indices_cache_file_name (`Optional[str]`, defaults to `None`): Provide the name of a cache file to use to store the + indices_cache_file_name (`Optional[str]`, defaults to `None`): Provide the path of the cache file. It is used to store the indices of each shard instead of the automatically generated cache file name. writer_batch_size (`int`, defaults to `1000`): Number of rows per write operation for the cache file writer. Higher value gives smaller cache files, lower value consume less temporary memory while running `.map()`. diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index b7117a2b53f..14c39648185 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -267,7 +267,7 @@ def map( keep_in_memory (`bool`, defaults to `False`): Keep the dataset in memory instead of writing it to a cache file. load_from_cache_file (`bool`, defaults to `True`): If a cache file storing the current computation from `function` can be identified, use it instead of recomputing. - cache_file_names (`Optional[Dict[str, str]]`, defaults to `None`): Provide the name of a cache file to use to store the + cache_file_names (`Optional[Dict[str, str]]`, defaults to `None`): Provide the path of the cache file. It is used to store the results of the computation instead of the automatically generated cache file name. You have to provide one :obj:`cache_file_name` per dataset in the dataset dictionary. writer_batch_size (`int`, defaults to `1000`): Number of rows per write operation for the cache file writer. @@ -337,7 +337,7 @@ def filter( keep_in_memory (`bool`, defaults to `False`): Keep the dataset in memory instead of writing it to a cache file. 
load_from_cache_file (`bool`, defaults to `True`): If a cache file storing the current computation from `function` can be identified, use it instead of recomputing. - cache_file_names (`Optional[Dict[str, str]]`, defaults to `None`): Provide the name of a cache file to use to store the + cache_file_names (`Optional[Dict[str, str]]`, defaults to `None`): Provide the path of the cache file. It is used to store the results of the computation instead of the automatically generated cache file name. You have to provide one :obj:`cache_file_name` per dataset in the dataset dictionary. writer_batch_size (`int`, defaults to `1000`): Number of rows per write operation for the cache file writer. @@ -394,7 +394,7 @@ def sort( keep_in_memory (`bool`, defaults to `False`): Keep the dataset in memory instead of writing it to a cache file. load_from_cache_file (`bool`, defaults to `True`): If a cache file storing the current computation from `function` can be identified, use it instead of recomputing. - indices_cache_file_names (`Optional[Dict[str, str]]`, defaults to `None`): Provide the name of a cache file to use to store the + indices_cache_file_names (`Optional[Dict[str, str]]`, defaults to `None`): Provide the path of the cache file. It is used to store the indices mapping instead of the automatically generated cache file name. You have to provide one :obj:`cache_file_name` per dataset in the dataset dictionary. writer_batch_size (`int`, defaults to `1000`): Number of rows per write operation for the cache file writer. @@ -446,7 +446,7 @@ def shuffle( keep_in_memory (`bool`, defaults to `False`): Keep the dataset in memory instead of writing it to a cache file. load_from_cache_file (`bool`, defaults to `True`): If a cache file storing the current computation from `function` can be identified, use it instead of recomputing. 
- indices_cache_file_names (`Optional[Dict[str, str]]`, default: `None`): Provide the name of a cache file to use to store the + indices_cache_file_names (`Optional[Dict[str, str]]`, default: `None`): Provide the path of the cache file. It is used to store the indices mappings instead of the automatically generated cache file name. You have to provide one :obj:`cache_file_name` per dataset in the dataset dictionary. writer_batch_size (`int`, defaults to `1000`): Number of rows per write operation for the cache file writer.