Skip to content

Commit 872a1d3

Browse files
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent fcd20c0 commit 872a1d3

File tree

10 files changed

+40
-42
lines changed

10 files changed

+40
-42
lines changed

src/distilabel/distiset.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -509,9 +509,9 @@ def load_from_disk(
509509
)
510510
dest_distiset_path = distiset_path
511511

512-
assert fs.isdir(
513-
original_distiset_path
514-
), "`distiset_path` must be a `PathLike` object pointing to a folder or a URI of a remote filesystem."
512+
assert fs.isdir(original_distiset_path), (
513+
"`distiset_path` must be a `PathLike` object pointing to a folder or a URI of a remote filesystem."
514+
)
515515

516516
has_config = False
517517
has_artifacts = False

src/distilabel/pipeline/batch_manager.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -231,10 +231,10 @@ def from_step(
231231
input_batch_size=getattr(step, "input_batch_size", None),
232232
data={predecessor: [] for predecessor in predecessors},
233233
convergence_step=convergence_step,
234-
next_expected_seq_no={predecessor: (0, 0) for predecessor in predecessors},
234+
next_expected_seq_no=dict.fromkeys(predecessors, (0, 0)),
235235
step_signature=step.signature,
236236
use_cache=step.use_cache,
237-
step_offset={predecessor: (0, 0) for predecessor in predecessors},
237+
step_offset=dict.fromkeys(predecessors, (0, 0)),
238238
)
239239

240240
def _get_seq_no(self) -> int:

src/distilabel/pipeline/write_buffer.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,9 @@ def __init__(
6565
step: [] for step in leaf_steps
6666
}
6767
# TODO: make this configurable
68-
self._buffers_dump_batch_size: Dict[str, int] = {
69-
step: 50 for step in leaf_steps
70-
}
68+
self._buffers_dump_batch_size: Dict[str, int] = dict.fromkeys(leaf_steps, 50)
7169
self._buffer_last_schema = {}
72-
self._buffers_last_file: Dict[str, int] = {step: 1 for step in leaf_steps}
70+
self._buffers_last_file: Dict[str, int] = dict.fromkeys(leaf_steps, 1)
7371
self._steps_cached = steps_cached or {}
7472
self._logger = logging.getLogger("distilabel.write_buffer")
7573

src/distilabel/steps/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def _infer_step_name(
101101
base_name = "_".join(parts[:-1])
102102
while name in step_names:
103103
idx = int(name.split("_")[-1])
104-
name = f"{base_name}_{idx+1}"
104+
name = f"{base_name}_{idx + 1}"
105105
return name
106106

107107

src/distilabel/steps/tasks/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ def _output_on_failure(
211211
a new field `distilabel_meta` with the raw output of the LLM.
212212
"""
213213
# Create a dictionary with the outputs of the task (every output set to None)
214-
outputs = {output: None for output in self.outputs}
214+
outputs = dict.fromkeys(self.outputs)
215215
outputs["model_name"] = self.llm.model_name # type: ignore
216216
outputs = self._create_metadata(
217217
outputs,

src/distilabel/steps/tasks/improving_text_embeddings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def format_output(
6666
A Python dictionary with the parsed output based on the `keys` property.
6767
"""
6868
if output is None:
69-
return {key: None for key in self.keys}
69+
return dict.fromkeys(self.keys)
7070

7171
def escape_backslashes_in_values(s):
7272
# Regular expression to match the key-value pairs in the dictionary
@@ -100,7 +100,7 @@ def replace_backslashes(match):
100100
pass
101101

102102
if not isinstance(output, dict):
103-
return {key: None for key in self.keys}
103+
return dict.fromkeys(self.keys)
104104

105105
return {key: output.get(key, None) for key in self.keys}
106106

src/distilabel/steps/tasks/math_shepherd/completer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,7 @@ def _auto_label(
485485
self._logger.info("Completer failed due to empty completion")
486486
continue
487487
if completion[-1] == golden_answers[instruction_i]:
488-
label = f" { self.tags[0]}"
488+
label = f" {self.tags[0]}"
489489
# If we found one, it's enough as we are doing Hard Estimation
490490
continue
491491
# In case we had no solutions from the previous step, otherwise we would have

src/distilabel/steps/tasks/text_generation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ def unload(self) -> None:
229229
@property
230230
def inputs(self) -> "StepColumns":
231231
"""The input for the task is the `instruction` by default, or the `columns` given as input."""
232-
columns = {column: True for column in self.columns}
232+
columns = dict.fromkeys(self.columns, True)
233233
columns["system_prompt"] = False
234234
return columns
235235

src/distilabel/utils/mkdocs/components_gallery.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -296,9 +296,9 @@ def _generate_steps_pages(self, src_dir: Path, steps: list) -> List[str]:
296296
docstring["icon"] = _STEPS_CATEGORY_TO_ICON.get(first_category, "")
297297

298298
if docstring["icon"]:
299-
assert (
300-
docstring["icon"] in _STEPS_CATEGORY_TO_ICON.values()
301-
), f"Icon {docstring['icon']} not found in _STEPS_CATEGORY_TO_ICON"
299+
assert docstring["icon"] in _STEPS_CATEGORY_TO_ICON.values(), (
300+
f"Icon {docstring['icon']} not found in _STEPS_CATEGORY_TO_ICON"
301+
)
302302

303303
name = step["name"]
304304

@@ -364,9 +364,9 @@ def _generate_tasks_pages(self, src_dir: Path, tasks: list) -> List[str]:
364364
first_category = docstring["categories"][0]
365365
docstring["icon"] = _STEPS_CATEGORY_TO_ICON.get(first_category, "")
366366
if docstring["icon"]:
367-
assert (
368-
docstring["icon"] in _STEPS_CATEGORY_TO_ICON.values()
369-
), f"Icon {docstring['icon']} not found in _STEPS_CATEGORY_TO_ICON"
367+
assert docstring["icon"] in _STEPS_CATEGORY_TO_ICON.values(), (
368+
f"Icon {docstring['icon']} not found in _STEPS_CATEGORY_TO_ICON"
369+
)
370370

371371
name = task["name"]
372372

tests/unit/models/embeddings/test_llamacpp.py

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -115,9 +115,9 @@ def test_normalize_embeddings(self, test_inputs):
115115
for result in results:
116116
# Check if the embedding is normalized (L2 norm should be close to 1)
117117
norm = np.linalg.norm(result)
118-
assert np.isclose(
119-
norm, 1.0, atol=1e-6
120-
), f"Norm is {norm}, expected close to 1.0"
118+
assert np.isclose(norm, 1.0, atol=1e-6), (
119+
f"Norm is {norm}, expected close to 1.0"
120+
)
121121

122122
def test_normalize_embeddings_false(self, test_inputs):
123123
"""
@@ -129,15 +129,15 @@ def test_normalize_embeddings_false(self, test_inputs):
129129
for result in results:
130130
# Check if the embedding is not normalized (L2 norm should not be close to 1)
131131
norm = np.linalg.norm(result)
132-
assert not np.isclose(
133-
norm, 1.0, atol=1e-6
134-
), f"Norm is {norm}, expected not close to 1.0"
132+
assert not np.isclose(norm, 1.0, atol=1e-6), (
133+
f"Norm is {norm}, expected not close to 1.0"
134+
)
135135

136136
# Additional check: ensure that at least one embedding has a norm significantly different from 1
137137
norms = [np.linalg.norm(result) for result in results]
138-
assert any(
139-
not np.isclose(norm, 1.0, atol=0.1) for norm in norms
140-
), "Expected at least one embedding with norm not close to 1.0"
138+
assert any(not np.isclose(norm, 1.0, atol=0.1) for norm in norms), (
139+
"Expected at least one embedding with norm not close to 1.0"
140+
)
141141

142142
def test_encode_batch(self) -> None:
143143
"""
@@ -149,20 +149,20 @@ def test_encode_batch(self) -> None:
149149
inputs = [f"This is test sentence {i}" for i in range(batch_size)]
150150
results = self.embeddings.encode(inputs=inputs)
151151

152-
assert (
153-
len(results) == batch_size
154-
), f"Expected {batch_size} results, got {len(results)}"
152+
assert len(results) == batch_size, (
153+
f"Expected {batch_size} results, got {len(results)}"
154+
)
155155
for result in results:
156-
assert (
157-
len(result) == 384
158-
), f"Expected embedding dimension 384, got {len(result)}"
156+
assert len(result) == 384, (
157+
f"Expected embedding dimension 384, got {len(result)}"
158+
)
159159

160160
# Test with a large batch to ensure it doesn't cause issues
161161
large_batch = ["Large batch test" for _ in range(100)]
162162
large_results = self.embeddings.encode(inputs=large_batch)
163-
assert (
164-
len(large_results) == 100
165-
), f"Expected 100 results for large batch, got {len(large_results)}"
163+
assert len(large_results) == 100, (
164+
f"Expected 100 results for large batch, got {len(large_results)}"
165+
)
166166

167167
def test_encode_batch_consistency(self) -> None:
168168
"""
@@ -180,6 +180,6 @@ def test_encode_batch_consistency(self) -> None:
180180
batch_result = self.embeddings.encode([input_text, "Another sentence"])[0]
181181

182182
# Compare the embeddings
183-
assert np.allclose(
184-
single_result, batch_result, atol=1e-5
185-
), "Embeddings are not consistent between single and batch processing"
183+
assert np.allclose(single_result, batch_result, atol=1e-5), (
184+
"Embeddings are not consistent between single and batch processing"
185+
)

Comments (0)

There are no commit comments on this commit.