1818
1919def _arrow_to_datasets_dtype (arrow_type : pa .DataType ) -> str :
2020 """
21- _arrow_to_datasets_dtype takes a pyarrow.DataType and converts it to a datasets string dtype.
22- In effect, `dt == string_to_arrow(_arrow_to_datasets_dtype(dt))`
21+ _arrow_to_datasets_dtype takes a pyarrow.DataType and converts it to a datasets
22+ string dtype. In effect, `dt == string_to_arrow(_arrow_to_datasets_dtype(dt))`
2323 """
2424
2525 if pa .types .is_null (arrow_type ):
@@ -74,11 +74,11 @@ def string_to_arrow(datasets_dtype: str) -> pa.DataType:
7474 """
7575 string_to_arrow takes a datasets string dtype and converts it to a pyarrow.DataType.
7676 In effect, `dt == string_to_arrow(_arrow_to_datasets_dtype(dt))`
77- This is necessary because the datasets.Value() primitive type is constructed using a string dtype
78- Value(dtype=str)
79- But Features.type (via `get_nested_type()` expects to resolve Features into a pyarrow Schema,
80- which means that each Value() must be able to resolve into a corresponding pyarrow.DataType, which is the
81- purpose of this function.
77+ This is necessary because the datasets.Value() primitive type is constructed using a
78+ string dtype Value(dtype=str)
79+ But Features.type (via `get_nested_type()`) expects to resolve Features into a
80+ pyarrow Schema, which means that each Value() must be able to resolve into a
81+ corresponding pyarrow.DataType, which is the purpose of this function.
8282 """
8383 timestamp_regex = re .compile (r"^timestamp\[(.*)\]$" )
8484 timestamp_matches = timestamp_regex .search (datasets_dtype )
@@ -97,16 +97,21 @@ def string_to_arrow(datasets_dtype: str) -> pa.DataType:
9797 return pa .timestamp (internals_matches .group (1 ), internals_matches .group (2 ))
9898 else :
9999 raise ValueError (
100- f"{ datasets_dtype } is not a validly formatted string representation of a pyarrow timestamp."
101- f"Examples include timestamp[us] or timestamp[us, tz=America/New_York]"
102- f"See: https://arrow.apache.org/docs/python/generated/pyarrow.timestamp.html#pyarrow.timestamp"
100+ f"""
101+ { datasets_dtype } is not a validly formatted string representation of a pyarrow
102+ timestamp. Examples include timestamp[us] or timestamp[us, tz=America/New_York]
103+ See:
104+ https://arrow.apache.org/docs/python/generated/pyarrow.timestamp.html#pyarrow.timestamp
105+ """
103106 )
104107 elif datasets_dtype not in pa .__dict__ :
105108 if str (datasets_dtype + "_" ) not in pa .__dict__ :
106109 raise ValueError (
107- f"Neither { datasets_dtype } nor { datasets_dtype + '_' } seems to be a pyarrow data type. "
108- f"Please make sure to use a correct data type, see: "
109- f"https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions"
110+ f"""
111+ Neither { datasets_dtype } nor { datasets_dtype + '_' } seems to be a pyarrow data type.
112+ Please make sure to use a correct data type, see:
113+ https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions
114+ """
110115 )
111116 arrow_data_factory_function_name = str (datasets_dtype + "_" )
112117 else :
@@ -119,17 +124,23 @@ def _cast_to_python_objects(obj: Any, only_1d_for_numpy: bool) -> tuple[Any, boo
119124 """
120125 Cast pytorch/tensorflow/pandas objects to python numpy array/lists.
121126 It works recursively.
122- To avoid iterating over possibly long lists, it first checks if the first element that is not None has to be casted.
123- If the first element needs to be casted, then all the elements of the list will be casted, otherwise they'll stay the same.
124- This trick allows to cast objects that contain tokenizers outputs without iterating over every single token for example.
127+ To avoid iterating over possibly long lists, it first checks if the first element
128+ that is not None has to be casted.
129+ If the first element needs to be casted, then all the elements of the list will be
130+ casted, otherwise they'll stay the same.
131+ This trick allows casting objects that contain tokenizer outputs without iterating
132+ over every single token for example.
125133 Args:
126134 obj: the object (nested struct) to cast
127- only_1d_for_numpy (bool): whether to keep the full multi-dim tensors as multi-dim numpy arrays, or convert them to
128- nested lists of 1-dimensional numpy arrays. This can be useful to keep only 1-d arrays to instantiate Arrow arrays.
129- Indeed Arrow only support converting 1-dimensional array values.
135+ only_1d_for_numpy (bool): whether to keep the full multi-dim tensors as
136+ multi-dim numpy arrays, or convert them to nested lists of 1-dimensional
137+ numpy arrays. This can be useful to keep only 1-d arrays to instantiate
138+ Arrow arrays. Indeed Arrow only supports converting 1-dimensional array
139+ values.
130140 Returns:
131141 casted_obj: the casted object
132- has_changed (bool): True if the object has been changed, False if it is identical
142+ has_changed (bool): True if the object has been changed, False if it is
143+ identical
133144 """
134145
135146 if config .TF_AVAILABLE and "tensorflow" in sys .modules :
@@ -240,9 +251,12 @@ def cast_to_python_objects(obj: Any, only_1d_for_numpy=False) -> Any:
240251 """
241252 Cast numpy/pytorch/tensorflow/pandas objects to python lists.
242253 It works recursively.
243- To avoid iterating over possibly long lists, it first checks if the first element that is not None has to be casted.
244- If the first element needs to be casted, then all the elements of the list will be casted, otherwise they'll stay the same.
245- This trick allows to cast objects that contain tokenizers outputs without iterating over every single token for example.
254+ To avoid iterating over possibly long lists, it first checks if the first element
255+ that is not None has to be casted.
256+ If the first element needs to be casted, then all the elements of the list will be
257+ casted, otherwise they'll stay the same.
258+ This trick allows casting objects that contain tokenizer outputs without iterating
259+ over every single token for example.
246260 Args:
247261 obj: the object (nested struct) to cast
248262 Returns:
@@ -552,7 +566,8 @@ def encode_example(self, value):
552566
553567def encode_nested_example (schema , obj ):
554568 """Encode a nested example.
555- This is used since some features (in particular ClassLabel) have some logic during encoding.
569+ This is used since some features (in particular ClassLabel) have some logic during
570+ encoding.
556571 """
557572 # Nested structures: we allow dict, list/tuples, sequences
558573 if isinstance (schema , dict ):
@@ -598,10 +613,12 @@ def encode_nested_example(schema, obj):
598613 else None
599614 )
600615 # Object with special encoding:
601- # ClassLabel will convert from string to int, TranslationVariableLanguages does some checks
616+ # ClassLabel will convert from string to int,
617+ # TranslationVariableLanguages does some checks
602618 elif isinstance (schema , (ClassLabel , Value )):
603619 return schema .encode_example (obj )
604- # Other object should be directly convertible to a native Arrow type (like Translation and Translation)
620+ # Other object should be directly convertible to a native Arrow type
621+ # (like Translation)
605622 return obj
606623
607624
0 commit comments