@@ -218,7 +218,7 @@ def _create_adata(self) -> Tuple[ad.AnnData, dict]:
218
218
percent_genes_to_mask = self .percent_genes_to_mask ,
219
219
min_de_genes_to_mask = self .min_de_genes_to_mask ,
220
220
condition_col = self .condition_key ,
221
- gene_col = self . de_gene_col ,
221
+ gene_col = "gene_id" , # Column was renamed to gene_id during optimization
222
222
)
223
223
224
224
target_conditions = list (target_condition_dict .keys ())
@@ -270,6 +270,9 @@ def _create_adata(self) -> Tuple[ad.AnnData, dict]:
270
270
adata_final .obs [self .condition_key ]
271
271
)
272
272
273
+ # Drop every obs column except condition_key (the only obs column the task uses) to keep the stored AnnData small
274
+ adata_final .obs = adata_final .obs [[self .condition_key ]]
275
+
273
276
# Add task-related data to uns for easy access
274
277
adata_final .uns ["target_conditions_dict" ] = target_condition_dict
275
278
adata_final .uns ["de_results" ] = {
@@ -341,6 +344,24 @@ def load_data(
341
344
self .de_results = self .load_and_filter_deg_results ()
342
345
logger .info (f"Using { len (self .de_results )} differential expression values" )
343
346
347
+ # Optimize: Keep only necessary columns in de_results
348
+ # The task reads only three columns: condition_key, "gene_id", and the metric column ("logfoldchange" when deg_test_name is "wilcoxon", otherwise "standardized_mean_diff")
349
+ metric_column = (
350
+ "logfoldchange"
351
+ if self .deg_test_name == "wilcoxon"
352
+ else "standardized_mean_diff"
353
+ )
354
+ necessary_columns = [self .condition_key , self .de_gene_col , metric_column ]
355
+
356
+ # Ensure we have gene_id column for compatibility with task
357
+ if self .de_gene_col != "gene_id" :
358
+ self .de_results = self .de_results .rename (
359
+ columns = {self .de_gene_col : "gene_id" }
360
+ )
361
+ necessary_columns = [self .condition_key , "gene_id" , metric_column ]
362
+
363
+ self .de_results = self .de_results [necessary_columns ]
364
+
344
365
# Compare conditions and throw warning or error for unmatched conditions
345
366
unique_conditions_adata = set (self .adata .obs [self .condition_key ])
346
367
unique_conditions_control_cells_ids = set (self .control_cells_ids .keys ())
@@ -392,11 +413,10 @@ def store_task_inputs(self) -> Path:
392
413
Store all task inputs as separate files.
393
414
394
415
This method saves all task-related data as separate files:
395
- - control_matched_adata.h5ad: The main AnnData object
416
+ - control_matched_adata.h5ad: The main AnnData object (includes cell_barcode_index in uns)
396
417
- control_cells_ids.json: Control cell IDs mapping
397
418
- target_conditions_dict.json: Target conditions dictionary
398
419
- de_results.csv: Differential expression results
399
- - cell_barcode_index.npy: Original cell barcode indices
400
420
401
421
Returns:
402
422
Path: Path to the task inputs directory.
0 commit comments