255 changes: 255 additions & 0 deletions biomni/tool/genomics.py
@@ -2162,3 +2162,258 @@ def analyze_genomic_region_overlap(region_sets, output_prefix="overlap_analysis"
log += f"- Summary statistics saved to: {summary_file}\n"

return log


def geneformer_embed(
adata_or_path,
base_dir,
adata_filename,
embeddings_prefix="embeddings_geneformer_zero_shot",
model_name="Geneformer-V2-104M",
model_input_size=4096,
chunk_size=10000,
nproc=8,
forward_batch_size=64,
):
"""
Run the Geneformer embedding extraction pipeline on an AnnData object or .h5ad file.

Parameters
----------
adata_or_path : str or AnnData
Path to .h5ad file or AnnData instance in memory.
base_dir : str
Base directory for all data/models/outputs.
adata_filename : str
Filename for AnnData file (used if adata_or_path is AnnData).
embeddings_prefix : str
Prefix for the output embeddings CSV file.
model_name : str
Name of the Geneformer model to use (e.g., "Geneformer-V2-104M").
model_input_size : int
Model input size for tokenizer.
chunk_size : int
Chunk size for tokenizer.
nproc : int
Number of processes for parallelization.
forward_batch_size : int
Batch size for embedding extraction.

Returns
-------
str
Newline-joined log of the steps performed during the embedding extraction process.
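
Examples
--------
A minimal sketch of a call, assuming a local pbmc.h5ad file whose var_names
are Ensembl gene IDs (paths and filenames here are illustrative only):

>>> log = geneformer_embed(
...     "pbmc.h5ad",
...     base_dir="./geneformer_run",
...     adata_filename="pbmc.h5ad",
... )
>>> print(log.splitlines()[-1])

The embeddings CSV is written to
<base_dir>/zero-shot-performance/embeddings/<embeddings_prefix>.csv and can be
loaded with pandas for downstream clustering or visualization.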
"""

import pickle
import subprocess
import sys
from pathlib import Path

import anndata as ad
import numpy as np

steps = []
steps.append("Starting Geneformer embedding extraction pipeline")
steps.append(f"Base directory: {base_dir}")
steps.append(f"Model name: {model_name}")
steps.append(f"Model input size: {model_input_size}")
steps.append(f"Chunk size: {chunk_size}")
steps.append(f"Number of processes: {nproc}")
steps.append(f"Forward batch size: {forward_batch_size}")

# Model weights in the Geneformer Hugging Face repo are stored with Git LFS, so set it up before cloning
proc = subprocess.Popen(["git", "lfs", "install"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
for line in proc.stdout:
print(line, end="", file=sys.stdout)
proc.wait()
if proc.returncode != 0:
raise RuntimeError("git lfs install failed")

geneformer_dir = Path(base_dir) / "Geneformer"
if not geneformer_dir.exists():
proc = subprocess.Popen(
["git", "clone", "https://huggingface.co/ctheodoris/Geneformer", str(geneformer_dir)],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
)
for line in proc.stdout:
print(line, end="", file=sys.stdout)
proc.wait()
if proc.returncode != 0:
raise RuntimeError("git clone of Geneformer failed")
else:
print(f"Geneformer directory already exists: {geneformer_dir}")

proc = subprocess.Popen(
["pip", "install", "."], cwd=str(geneformer_dir), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
)
# Drain stdout so the pip process cannot block on a full pipe buffer
for line in proc.stdout:
print(line, end="", file=sys.stdout)
proc.wait()
if proc.returncode != 0:
raise RuntimeError("pip install of Geneformer failed")
# Pin extra dependencies required to make Geneformer work
extra_packages = ["peft==0.11.1", "transformers==4.40"]
proc = subprocess.Popen(
["pip", "install"] + extra_packages, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
)
for line in proc.stdout:
print(line, end="", file=sys.stdout)
proc.wait()
if proc.returncode != 0:
raise RuntimeError("pip install of Geneformer failed")

try:
from geneformer import EmbExtractor
from geneformer.tokenizer import TranscriptomeTokenizer

steps.append(f"Loading Geneformer model: {model_name}")
BASE_DIR = Path(base_dir)
MODELS_DIR = geneformer_dir
DATA_DIR = MODELS_DIR / "data"
HUMANIZED_DATA_DIR = DATA_DIR / "humanized"
MODEL_DIR = MODELS_DIR / model_name
EMBEDDINGS_DIR = BASE_DIR / "zero-shot-performance" / "embeddings"

steps.append("Setting up directory structure")
HUMANIZED_DATA_DIR.mkdir(parents=True, exist_ok=True)
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)
steps.append(f"✓ Created directories: {HUMANIZED_DATA_DIR}, {EMBEDDINGS_DIR}")

TOKEN_DICT_PATH = MODELS_DIR / "geneformer" / "token_dictionary_gc104M.pkl"
GENE_MEDIAN_PATH = MODELS_DIR / "geneformer" / "gene_median_dictionary_gc104M.pkl"
GENE_NAME_ID_PATH = MODELS_DIR / "geneformer" / "gene_name_id_dict_gc104M.pkl"

adata_path = HUMANIZED_DATA_DIR / adata_filename

steps.append("Loading and preparing AnnData object")
if isinstance(adata_or_path, str):
steps.append(f"Loading AnnData from file: {adata_or_path}")
adata = ad.read_h5ad(adata_or_path)
if Path(adata_or_path).resolve() != adata_path.resolve():
adata.write_h5ad(adata_path)
steps.append(f"✓ Saved AnnData to: {adata_path}")
else:
steps.append("Using provided AnnData object")
adata = adata_or_path
adata.write_h5ad(adata_path)
steps.append(f"✓ Saved AnnData to: {adata_path}")

steps.append(f"AnnData shape: {adata.n_obs} cells, {adata.n_vars} genes")

steps.append("Preparing AnnData for Geneformer")
adata.var["ensembl_id"] = adata.var_names
adata.obs["n_counts"] = adata.X.sum(axis=1)
cell_indices = adata.obs.index.values
adata.obs["cell_index"] = cell_indices
adata.write_h5ad(adata_path)
steps.append("✓ Added required columns: ensembl_id, n_counts, cell_index")

steps.append("Loading token dictionary and checking gene overlap")

# Check if model files exist
if not TOKEN_DICT_PATH.exists():
raise FileNotFoundError(
f"Token dictionary file not found: {TOKEN_DICT_PATH}\n"
f"Please download the Geneformer model files to: {MODEL_DIR}\n"
f"Required files:\n"
f" - {TOKEN_DICT_PATH.name}\n"
f" - {GENE_MEDIAN_PATH.name}\n"
f" - {GENE_NAME_ID_PATH.name}\n"
f"You can download them from the Geneformer repository or Hugging Face."
)

if not GENE_MEDIAN_PATH.exists():
raise FileNotFoundError(f"Gene median dictionary file not found: {GENE_MEDIAN_PATH}")

if not GENE_NAME_ID_PATH.exists():
raise FileNotFoundError(f"Gene name ID dictionary file not found: {GENE_NAME_ID_PATH}")

with open(TOKEN_DICT_PATH, "rb") as f:
token_dict = pickle.load(f)

token_keys = set(token_dict.keys())
var_names = set(adata.var_names)
matching_tokens = token_keys & var_names
if len(matching_tokens) == 0:
raise ValueError(
"No matching genes found between the dataset and the Geneformer tokenizer.\n"
f"Number of genes in dataset: {len(var_names)}\n"
f"Number of tokens in tokenizer: {len(token_keys)}\n"
"Please ensure your gene names (var_names) are Ensembl IDs matching the Geneformer model."
)

steps.append(f"✓ Loaded token dictionary with {len(token_dict)} tokens")
steps.append(f"✓ Found {len(matching_tokens)} matching genes between dataset and tokenizer")
steps.append(f"Sample matching tokens: {list(matching_tokens)[:10]}")

steps.append("Initializing TranscriptomeTokenizer")
tokenizer = TranscriptomeTokenizer(
custom_attr_name_dict={"cell_index": "index"},
model_input_size=model_input_size,
special_token=False,
collapse_gene_ids=True,
gene_median_file=GENE_MEDIAN_PATH,
token_dictionary_file=TOKEN_DICT_PATH,
chunk_size=chunk_size,
nproc=nproc,
)
steps.append("✓ TranscriptomeTokenizer initialized successfully")

steps.append("Tokenizing data for Geneformer")
tokenizer.tokenize_data(
data_directory=HUMANIZED_DATA_DIR,
output_directory=HUMANIZED_DATA_DIR,
output_prefix=f"tokenized_{model_input_size}_geneformer",
file_format="h5ad",
use_generator=False,
)
steps.append("✓ Data tokenization completed")

steps.append("Initializing Embedding Extractor")
embex = EmbExtractor(
model_type="Pretrained",
num_classes=0,
emb_mode="cell",
cell_emb_style="mean_pool",
gene_emb_style="mean_pool",
emb_layer=-1,
forward_batch_size=forward_batch_size,
nproc=nproc,
token_dictionary_file=str(TOKEN_DICT_PATH),
max_ncells=None,
emb_label=["index"],
)
steps.append("✓ Embedding Extractor initialized successfully")

steps.append("Extracting cell embeddings using Geneformer")
tokenized_dataset_path = HUMANIZED_DATA_DIR / f"tokenized_{model_input_size}_geneformer.dataset"
emb_prefix = f"embeddings_geneformer_{model_input_size}"
embs = embex.extract_embs(
model_directory=str(MODEL_DIR),
input_data_file=str(tokenized_dataset_path),
output_directory=str(EMBEDDINGS_DIR),
output_prefix=emb_prefix,
output_torch_embs=True,
)
steps.append("✓ Cell embeddings extracted successfully")

steps.append("Processing and saving embeddings")
df = embs[0]
df = df.set_index("index", drop=True)
df.index.name = None
df = df.loc[adata.obs.index]

output_csv_path = EMBEDDINGS_DIR / f"{embeddings_prefix}.csv"
df.to_csv(output_csv_path)
steps.append(f"✓ Embeddings saved to: {output_csv_path}")
steps.append(f"✓ Embeddings shape: {df.shape}")
steps.append(f"✓ File size: {output_csv_path.stat().st_size / (1024 * 1024):.2f} MB")

steps.append("Geneformer embedding extraction completed successfully")
return "\n".join(steps)

except Exception as e:
steps.append(f"✗ Error during Geneformer embedding extraction: {str(e)}")
steps.append("Geneformer embedding extraction failed")
return "\n".join(steps)
61 changes: 61 additions & 0 deletions biomni/tool/tool_description/genomics.py
@@ -654,4 +654,65 @@
}
],
},
{
"description": "Generate Geneformer embeddings for single-cell RNA-seq data using the Geneformer transformer model. "
"This function automatically downloads and installs the Geneformer model, tokenizes the transcriptome data, "
"and extracts cell-level embeddings using the pretrained Geneformer-V2-104M model. The function includes "
"comprehensive validation to ensure gene names match the tokenizer vocabulary and provides detailed error "
"messages if no genes match. The embeddings are saved as CSV files and can be used for downstream analysis "
"such as clustering, visualization, and cell type annotation. Requires Ensembl gene IDs as input.",
"name": "geneformer_embed",
"optional_parameters": [
{
"default": "embeddings_geneformer_zero_shot",
"description": "Prefix for output embeddings CSV file",
"name": "embeddings_prefix",
"type": "str",
},
{
"default": "Geneformer-V2-104M",
"description": "Name of the pretrained Geneformer model to use",
"name": "model_name",
"type": "str",
},
{
"default": 4096,
"description": "Model input size for tokenizer (maximum number of genes per cell)",
"name": "model_input_size",
"type": "int",
},
{
"default": 10000,
"description": "Chunk size for tokenizer processing",
"name": "chunk_size",
"type": "int",
},
{
"default": 8,
"description": "Number of processes for parallelization",
"name": "nproc",
"type": "int",
},
{
"default": 64,
"description": "Batch size for embedding extraction",
"name": "forward_batch_size",
"type": "int",
},
],
"required_parameters": [
{
"default": None,
"description": "Path to .h5ad file or AnnData instance in memory",
"name": "adata_or_path",
"type": "str",
},
{
"default": None,
"description": "Base directory for all data/models/outputs",
"name": "base_dir",
"type": "str",
},
{
"default": None,
"description": "Filename for AnnData file (used if adata_or_path is AnnData)",
"name": "adata_filename",
"type": "str",
},
],
},
]
27 changes: 27 additions & 0 deletions biomni_env/new_software_v007.sh
100644 → 100755
@@ -5,6 +5,33 @@ pip install lazyslide
pip install "git+https://github.com/YosefLab/popV.git@refs/pull/100/head"
pip install pybiomart
pip install fair-esm
sudo apt update && sudo apt install -y git-lfs

# pinned/minimum package versions required for Geneformer to work
pip install "anndata>=0.11"
pip install "datasets>=2.12"
pip install "loompy>=3.0"
pip install "matplotlib>=3.7"
pip install "numpy>=1.26.4"
pip install "optuna>=3.6"
pip install "optuna-integration>=3.6"
pip install "packaging>=23.0"
pip install "pandas>=2.2.2"
pip install "peft==0.11.1"
pip install "pyarrow>=12.0"
pip install "pytz>=2023.0"
pip install "ray>=2.6"
pip install "scanpy>=1.9"
pip install "scipy>=1.13.1"
pip install "torch>=2.0.1"
pip install "tqdm>=4.66.3"
pip install "transformers==4.40"
pip install "h5py>=3.11.0"
pip install "numcodecs>=0.13.0"
pip install "zarr>=2.18.2"
pip install "requests>=2.32.1"
pip install "numba>=0.6.1"
pip install pynvml
pip install nnunet nibabel nilearn
pip install mi-googlesearch-python
conda install weasyprint