Add error logging and improve the formatting of the example

polinabinder1 · polinabinder1 · commit cb77ff1c2845 · 2025-09-17T21:18:04.000-07:00
diff --git a/examples/example_perturbation_expression_prediction.py b/examples/example_perturbation_expression_prediction.py
@@ -130,7 +130,7 @@
         "If not provided, random data will be generated for testing. "
         "The file should have: cell representations in .X, gene names in .var.index, "
         "and cell identifiers in .obs.index. "
-        "Example: adata.write_h5ad('my_model_data.h5ad')",
+        "The gene names and cell identifiers should match the task input, although the ordering does not need to be the same.",
     )
 
     args = parser.parse_args()
@@ -178,20 +178,15 @@
     # Load model data or generate random data
     if args.model_data_file:
         model_adata = ad.read_h5ad(args.model_data_file)
-        # Validate dimensions
-        assert model_adata.shape == dataset.adata.shape, (
-            f"Model data shape {model_adata.shape} does not match dataset shape {dataset.adata.shape}"
-        )
 
         # Use the cell representation data from the file
-        model_output: CellRepresentation = model_adata.X
-
-        # Apply the gene and cell ordering from the model data to the task input
-        task_input.adata.var.index = model_adata.var.index
-        task_input.adata.uns["cell_barcode_index"] = model_adata.obs.index.astype(
-            str
-        ).values
-
+        # Handle both dense and sparse model_adata.X
+        if hasattr(model_adata.X, "toarray"):
+            model_output: CellRepresentation = model_adata.X.toarray()
+        else:
+            model_output: CellRepresentation = model_adata.X
+        # Apply the gene and cell ordering from the model data to the task input and validate dimensions
+        task_input.apply_model_ordering(model_adata)
     else:
         print("No model data file provided - generating random data for testing")
 
diff --git a/examples/test_equivalency_perturbation_dataset.py b/examples/test_equivalency_perturbation_dataset.py
@@ -329,7 +329,6 @@ def run_new_code(
         filter &= df_csv["standardized_mean_diff"].abs() >= args.min_smd
 
     df_csv = df_csv[filter]
-
     assert_de_results_equivalent(df_csv, new_dataset.de_results, col_map)
     logger.info("DE results matched")
 
diff --git a/examples/test_equivalency_perturbation_task.py b/examples/test_equivalency_perturbation_task.py
@@ -255,9 +255,30 @@ def generate_model_predictions(masked_notebook_adata, args):
     ) as f:
         target_genes_to_save = json.load(f)
     print("Generating model predictions matrix...")
+    # Generate random model data
+    print("Generating random model data for testing")
     model_output: CellRepresentation = np.random.rand(
         masked_notebook_adata.shape[0], masked_notebook_adata.shape[1]
     )
+
+    # Create and save h5ad file if save_model_data is specified
+    if args.save_model_data:
+        # Auto-generate filename based on test parameters
+        filename = (
+            f"generated_model_data_{args.metric_type}_{args.percent_genes_to_mask}.h5ad"
+        )
+        print(f"Creating and saving model data to {filename}")
+
+        # Create AnnData object with the generated model data
+        model_adata = ad.AnnData(
+            X=model_output,
+            obs=masked_notebook_adata.obs.copy(),
+            var=masked_notebook_adata.var.copy(),
+        )
+
+        # Save to h5ad file
+        model_adata.write_h5ad(filename)
+        print(f"Saved model data with shape {model_output.shape} to {filename}")
     obs_index = masked_notebook_adata.obs.index
 
     # Speed up by using numpy and pandas vectorized lookups instead of repeated .index() calls
@@ -355,6 +376,14 @@ def generate_model_predictions(masked_notebook_adata, args):
         type=str,
         default="notebook_task_inputs_{metric_type}_{percent_genes_to_mask}",
     )
+    parser.add_argument(
+        "--save_model_data",
+        action="store_true",
+        help="[OPTIONAL] Save the generated random model data as an AnnData file (.h5ad). "
+        "The file will be automatically named based on the test parameters and saved in the current directory. "
+        "The saved file will contain: cell representations in .X, gene names in .var.index, "
+        "and cell identifiers in .obs.index.",
+    )
 
     args = parser.parse_args()
     mask_portion = (
diff --git a/src/czbenchmarks/datasets/single_cell_perturbation.py b/src/czbenchmarks/datasets/single_cell_perturbation.py
@@ -218,7 +218,7 @@ def _create_adata(self) -> Tuple[ad.AnnData, dict]:
             percent_genes_to_mask=self.percent_genes_to_mask,
             min_de_genes_to_mask=self.min_de_genes_to_mask,
             condition_col=self.condition_key,
-            gene_col=self.de_gene_col,
+            gene_col="gene_id",  # Column was renamed to gene_id during optimization
         )
 
         target_conditions = list(target_condition_dict.keys())
@@ -270,6 +270,9 @@ def _create_adata(self) -> Tuple[ad.AnnData, dict]:
             adata_final.obs[self.condition_key]
         )
 
+        # Optimize: Keep only necessary columns in obs (only condition_key is used in task)
+        adata_final.obs = adata_final.obs[[self.condition_key]]
+
         # Add task-related data to uns for easy access
         adata_final.uns["target_conditions_dict"] = target_condition_dict
         adata_final.uns["de_results"] = {
@@ -341,6 +344,24 @@ def load_data(
         self.de_results = self.load_and_filter_deg_results()
         logger.info(f"Using {len(self.de_results)} differential expression values")
 
+        # Optimize: Keep only necessary columns in de_results
+        # Task only uses: condition_key, "gene_id", and metric_column (logfoldchange or standardized_mean_diff)
+        metric_column = (
+            "logfoldchange"
+            if self.deg_test_name == "wilcoxon"
+            else "standardized_mean_diff"
+        )
+        necessary_columns = [self.condition_key, self.de_gene_col, metric_column]
+
+        # Ensure we have gene_id column for compatibility with task
+        if self.de_gene_col != "gene_id":
+            self.de_results = self.de_results.rename(
+                columns={self.de_gene_col: "gene_id"}
+            )
+            necessary_columns = [self.condition_key, "gene_id", metric_column]
+
+        self.de_results = self.de_results[necessary_columns]
+
         # Compare conditions and throw warning or error for unmatched conditions
         unique_conditions_adata = set(self.adata.obs[self.condition_key])
         unique_conditions_control_cells_ids = set(self.control_cells_ids.keys())
@@ -392,11 +413,10 @@ def store_task_inputs(self) -> Path:
         Store all task inputs as separate files.
 
         This method saves all task-related data as separate files:
-        - control_matched_adata.h5ad: The main AnnData object
+        - control_matched_adata.h5ad: The main AnnData object (includes cell_barcode_index in uns)
         - control_cells_ids.json: Control cell IDs mapping
         - target_conditions_dict.json: Target conditions dictionary
         - de_results.csv: Differential expression results
-        - cell_barcode_index.npy: Original cell barcode indices
 
         Returns:
             Path: Path to the task inputs directory.
diff --git a/src/czbenchmarks/tasks/single_cell/perturbation_expression_prediction.py b/src/czbenchmarks/tasks/single_cell/perturbation_expression_prediction.py
@@ -25,8 +25,24 @@ class PerturbationExpressionPredictionTaskInput(TaskInput):
     target_conditions_dict: dict
     de_results: pd.DataFrame
 
-    class Config:
-        arbitrary_types_allowed = True
+    def apply_model_ordering(self, model_adata: ad.AnnData) -> None:
+        """
+        Relabel the task input's genes and cells using the model data's ordering.
+
+        Args:
+            model_adata: AnnData object containing the desired gene and cell ordering
+
+        Raises:
+            ValueError: If the gene or cell labels differ between the two objects.
+        """
+        # Validate both axes before mutating anything; comparing sorted labels (not
+        # sets) also rejects duplicated or missing entries, i.e. shape mismatches.
+        if sorted(self.adata.var.index) != sorted(model_adata.var.index):
+            raise ValueError("Gene indices in task input and model data do not match.")
+        if sorted(self.adata.obs.index) != sorted(model_adata.obs.index):
+            raise ValueError("Cell indices in task input and model data do not match.")
+        self.adata.var.index = model_adata.var.index
+        self.adata.uns["cell_barcode_index"] = model_adata.obs.index.astype(str).values
 
 
 def load_perturbation_task_input_from_saved_files(
@@ -42,7 +58,7 @@ def load_perturbation_task_input_from_saved_files(
         PerturbationExpressionPredictionTaskInput: The loaded task input.
     """
 
-    # Load the main AnnData object
+    # Load the main AnnData object (contains cell_barcode_index in uns)
     adata_file = task_inputs_dir / "control_matched_adata.h5ad"
     task_adata = ad.read_h5ad(adata_file)
 
@@ -133,6 +149,7 @@ def _run_task(
 
         for condition in perturbation_conditions:
             # Get target genes for this condition
+
             target_genes = target_conditions_dict.get(condition, [])
             valid_genes = [g for g in target_genes if g in adata.var.index]
 
@@ -170,7 +187,6 @@ def _run_task(
                 .index.str.split("_")
                 .str[0]
             )
-
             condition_idx = np.where(base_cell_ids.isin(condition_cells))[0]
             control_idx = np.where(base_cell_ids.isin(control_cells))[0]
 
diff --git a/tests/datasets/test_single_cell_perturbation_dataset.py b/tests/datasets/test_single_cell_perturbation_dataset.py
@@ -259,6 +259,7 @@ def test_perturbation_dataset_store_task_inputs(
         # Check that all required files exist
         expected_files = [
             "control_matched_adata.h5ad",
+            "control_cells_ids.json",
             "target_conditions_dict.json",
             "de_results.csv",
         ]
@@ -284,15 +285,16 @@ def test_perturbation_dataset_store_task_inputs(
             target_conditions_dict = json.load(f)
         assert isinstance(target_conditions_dict, dict)
 
-        # Load and validate DE results CSV
+        # Load and validate DE results CSV (should only have optimized columns)
         de_df = pd.read_csv(task_inputs_dir / "de_results.csv")
         assert not de_df.empty
-        base_cols = {"condition", "gene", "pval_adj"}
-        assert base_cols.issubset(set(de_df.columns))
+        # Only the necessary columns should be present
+        expected_cols = {"condition", "gene_id"}
         if deg_test_name == "wilcoxon":
-            assert "logfoldchange" in de_df.columns
+            expected_cols.add("logfoldchange")
         else:
-            assert "standardized_mean_diff" in de_df.columns
+            expected_cols.add("standardized_mean_diff")
+        assert set(de_df.columns) == expected_cols
 
         # Load and validate cell barcode index
         cell_barcode_index = task_adata.uns["cell_barcode_index"]
@@ -415,16 +417,17 @@ def test_control_matched_adata_contains_task_data(self, deg_test_name, tmp_path)
         assert len(uns["control_cells_ids"]) > 0
         assert uns["control_cells_ids"] == dataset.control_cells_ids
 
-        # Check de_results can be reconstructed as DataFrame
+        # Check de_results can be reconstructed as DataFrame (should only have optimized columns)
         assert isinstance(uns["de_results"], dict)
         de_df = pd.DataFrame(uns["de_results"])
         assert not de_df.empty
-        base_cols = {"condition", "gene", "pval_adj"}
-        assert base_cols.issubset(set(de_df.columns))
+        # Only the necessary columns should be present
+        expected_cols = {"condition", "gene_id"}
         if deg_test_name == "wilcoxon":
-            assert "logfoldchange" in de_df.columns
+            expected_cols.add("logfoldchange")
         else:
-            assert "standardized_mean_diff" in de_df.columns
+            expected_cols.add("standardized_mean_diff")
+        assert set(de_df.columns) == expected_cols
 
         # Check cell_barcode_index
         assert isinstance(uns["cell_barcode_index"], np.ndarray)
diff --git a/tests/tasks/test_tasks.py b/tests/tasks/test_tasks.py