255 changes: 255 additions & 0 deletions biomni/tool/genomics.py
@@ -2162,3 +2162,258 @@ def analyze_genomic_region_overlap(region_sets, output_prefix="overlap_analysis"
log += f"- Summary statistics saved to: {summary_file}\n"

return log


def geneformer_embed(
adata_or_path,
base_dir,
adata_filename,
embeddings_prefix="embeddings_geneformer_zero_shot",
model_name="Geneformer-V2-104M",
model_input_size=4096,
chunk_size=10000,
nproc=8,
forward_batch_size=64,
):
"""
Run the Geneformer embedding extraction pipeline on an AnnData object or .h5ad file.

Parameters
----------
adata_or_path : str or AnnData
Path to .h5ad file or AnnData instance in memory.
base_dir : str
Base directory for all data/models/outputs.
adata_filename : str
Filename for AnnData file (used if adata_or_path is AnnData).
embeddings_prefix : str
Prefix for the output embeddings CSV file.
model_name : str
Name of the Geneformer model to use (e.g., "Geneformer-V2-104M").
model_input_size : int
Model input size for tokenizer.
chunk_size : int
Chunk size for tokenizer.
nproc : int
Number of processes for parallelization.
forward_batch_size : int
Batch size for embedding extraction.

Returns
-------
str
Newline-joined log of the steps performed during the embedding extraction process.
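
Examples
--------
A minimal sketch of a call, assuming a local pbmc.h5ad file whose var_names
are Ensembl gene IDs (paths and filenames here are illustrative only):

>>> log = geneformer_embed(
...     "pbmc.h5ad",
...     base_dir="./geneformer_run",
...     adata_filename="pbmc.h5ad",
... )
>>> print(log.splitlines()[-1])

The embeddings CSV is written to
<base_dir>/zero-shot-performance/embeddings/<embeddings_prefix>.csv and can be
loaded with pandas for downstream clustering or visualization.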
"""

import pickle
import subprocess
import sys
from pathlib import Path

import anndata as ad
import numpy as np

steps = []
steps.append("Starting Geneformer embedding extraction pipeline")
steps.append(f"Base directory: {base_dir}")
steps.append(f"Model name: {model_name}")
steps.append(f"Model input size: {model_input_size}")
steps.append(f"Chunk size: {chunk_size}")
steps.append(f"Number of processes: {nproc}")
steps.append(f"Forward batch size: {forward_batch_size}")

# Model weights in the Geneformer Hugging Face repo are stored with Git LFS, so set it up before cloning
proc = subprocess.Popen(["git", "lfs", "install"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
for line in proc.stdout:
print(line, end="", file=sys.stdout)
proc.wait()
if proc.returncode != 0:
raise RuntimeError("git lfs install failed")

geneformer_dir = Path(base_dir) / "Geneformer"
if not geneformer_dir.exists():
proc = subprocess.Popen(
["git", "clone", "https://huggingface.co/ctheodoris/Geneformer", str(geneformer_dir)],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
)
for line in proc.stdout:
print(line, end="", file=sys.stdout)
proc.wait()
if proc.returncode != 0:
raise RuntimeError("git clone of Geneformer failed")
else:
print(f"Geneformer directory already exists: {geneformer_dir}")

proc = subprocess.Popen(
["pip", "install", "."], cwd=str(geneformer_dir), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
)
# Drain stdout so the pip process cannot block on a full pipe buffer
for line in proc.stdout:
print(line, end="", file=sys.stdout)
proc.wait()
if proc.returncode != 0:
raise RuntimeError("pip install of Geneformer failed")
# Pin extra dependencies required to make Geneformer work
extra_packages = ["peft==0.11.1", "transformers==4.40"]
proc = subprocess.Popen(
["pip", "install"] + extra_packages, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
)
for line in proc.stdout:
print(line, end="", file=sys.stdout)
proc.wait()
if proc.returncode != 0:
raise RuntimeError("pip install of Geneformer failed")

try:
from geneformer import EmbExtractor
from geneformer.tokenizer import TranscriptomeTokenizer

steps.append(f"Loading Geneformer model: {model_name}")
BASE_DIR = Path(base_dir)
MODELS_DIR = geneformer_dir
DATA_DIR = MODELS_DIR / "data"
HUMANIZED_DATA_DIR = DATA_DIR / "humanized"
MODEL_DIR = MODELS_DIR / model_name
EMBEDDINGS_DIR = BASE_DIR / "zero-shot-performance" / "embeddings"

steps.append("Setting up directory structure")
HUMANIZED_DATA_DIR.mkdir(parents=True, exist_ok=True)
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)
steps.append(f"✓ Created directories: {HUMANIZED_DATA_DIR}, {EMBEDDINGS_DIR}")

TOKEN_DICT_PATH = MODELS_DIR / "geneformer" / "token_dictionary_gc104M.pkl"
GENE_MEDIAN_PATH = MODELS_DIR / "geneformer" / "gene_median_dictionary_gc104M.pkl"
GENE_NAME_ID_PATH = MODELS_DIR / "geneformer" / "gene_name_id_dict_gc104M.pkl"

adata_path = HUMANIZED_DATA_DIR / adata_filename

steps.append("Loading and preparing AnnData object")
if isinstance(adata_or_path, str):
steps.append(f"Loading AnnData from file: {adata_or_path}")
adata = ad.read_h5ad(adata_or_path)
if Path(adata_or_path).resolve() != adata_path.resolve():
adata.write_h5ad(adata_path)
steps.append(f"✓ Saved AnnData to: {adata_path}")
else:
steps.append("Using provided AnnData object")
adata = adata_or_path
adata.write_h5ad(adata_path)
steps.append(f"✓ Saved AnnData to: {adata_path}")

steps.append(f"AnnData shape: {adata.n_obs} cells, {adata.n_vars} genes")

steps.append("Preparing AnnData for Geneformer")
adata.var["ensembl_id"] = adata.var_names
adata.obs["n_counts"] = adata.X.sum(axis=1)
cell_indices = adata.obs.index.values
adata.obs["cell_index"] = cell_indices
adata.write_h5ad(adata_path)
steps.append("✓ Added required columns: ensembl_id, n_counts, cell_index")

steps.append("Loading token dictionary and checking gene overlap")

# Check if model files exist
if not TOKEN_DICT_PATH.exists():
raise FileNotFoundError(
f"Token dictionary file not found: {TOKEN_DICT_PATH}\n"
f"Please download the Geneformer model files to: {MODEL_DIR}\n"
f"Required files:\n"
f" - {TOKEN_DICT_PATH.name}\n"
f" - {GENE_MEDIAN_PATH.name}\n"
f" - {GENE_NAME_ID_PATH.name}\n"
f"You can download them from the Geneformer repository or Hugging Face."
)

if not GENE_MEDIAN_PATH.exists():
raise FileNotFoundError(f"Gene median dictionary file not found: {GENE_MEDIAN_PATH}")

if not GENE_NAME_ID_PATH.exists():
raise FileNotFoundError(f"Gene name ID dictionary file not found: {GENE_NAME_ID_PATH}")

with open(TOKEN_DICT_PATH, "rb") as f:
token_dict = pickle.load(f)

token_keys = set(token_dict.keys())
var_names = set(adata.var_names)
matching_tokens = token_keys & var_names
if len(matching_tokens) == 0:
raise ValueError(
"No matching genes found between the dataset and the Geneformer tokenizer.\n"
f"Number of genes in dataset: {len(var_names)}\n"
f"Number of tokens in tokenizer: {len(token_keys)}\n"
"Please ensure your gene names (var_names) are Ensembl IDs matching the Geneformer model."
)

steps.append(f"✓ Loaded token dictionary with {len(token_dict)} tokens")
steps.append(f"✓ Found {len(matching_tokens)} matching genes between dataset and tokenizer")
steps.append(f"Sample matching tokens: {list(matching_tokens)[:10]}")

steps.append("Initializing TranscriptomeTokenizer")
tokenizer = TranscriptomeTokenizer(
custom_attr_name_dict={"cell_index": "index"},
model_input_size=model_input_size,
special_token=False,
collapse_gene_ids=True,
gene_median_file=GENE_MEDIAN_PATH,
token_dictionary_file=TOKEN_DICT_PATH,
chunk_size=chunk_size,
nproc=nproc,
)
steps.append("✓ TranscriptomeTokenizer initialized successfully")

steps.append("Tokenizing data for Geneformer")
tokenizer.tokenize_data(
data_directory=HUMANIZED_DATA_DIR,
output_directory=HUMANIZED_DATA_DIR,
output_prefix=f"tokenized_{model_input_size}_geneformer",
file_format="h5ad",
use_generator=False,
)
steps.append("✓ Data tokenization completed")

steps.append("Initializing Embedding Extractor")
embex = EmbExtractor(
model_type="Pretrained",
num_classes=0,
emb_mode="cell",
cell_emb_style="mean_pool",
gene_emb_style="mean_pool",
emb_layer=-1,
forward_batch_size=forward_batch_size,
nproc=nproc,
token_dictionary_file=str(TOKEN_DICT_PATH),
max_ncells=None,
emb_label=["index"],
)
steps.append("✓ Embedding Extractor initialized successfully")

steps.append("Extracting cell embeddings using Geneformer")
tokenized_dataset_path = HUMANIZED_DATA_DIR / f"tokenized_{model_input_size}_geneformer.dataset"
emb_prefix = f"embeddings_geneformer_{model_input_size}"
embs = embex.extract_embs(
model_directory=str(MODEL_DIR),
input_data_file=str(tokenized_dataset_path),
output_directory=str(EMBEDDINGS_DIR),
output_prefix=emb_prefix,
output_torch_embs=True,
)
steps.append("✓ Cell embeddings extracted successfully")

steps.append("Processing and saving embeddings")
df = embs[0]
df = df.set_index("index", drop=True)
df.index.name = None
df = df.loc[adata.obs.index]

output_csv_path = EMBEDDINGS_DIR / f"{embeddings_prefix}.csv"
df.to_csv(output_csv_path)
steps.append(f"✓ Embeddings saved to: {output_csv_path}")
steps.append(f"✓ Embeddings shape: {df.shape}")
steps.append(f"✓ File size: {output_csv_path.stat().st_size / (1024 * 1024):.2f} MB")

steps.append("Geneformer embedding extraction completed successfully")
return "\n".join(steps)

except Exception as e:
steps.append(f"✗ Error during Geneformer embedding extraction: {str(e)}")
steps.append("Geneformer embedding extraction failed")
return "\n".join(steps)
61 changes: 61 additions & 0 deletions biomni/tool/tool_description/genomics.py
@@ -654,4 +654,65 @@
}
],
},
{
"description": "Generate Geneformer embeddings for single-cell RNA-seq data using the Geneformer transformer model. "
"This function automatically downloads and installs the Geneformer model, tokenizes the transcriptome data, "
"and extracts cell-level embeddings using the pretrained Geneformer-V2-104M model. The function includes "
"comprehensive validation to ensure gene names match the tokenizer vocabulary and provides detailed error "
"messages if no genes match. The embeddings are saved as CSV files and can be used for downstream analysis "
"such as clustering, visualization, and cell type annotation. Requires Ensembl gene IDs as input.",
"name": "geneformer_embed",
"optional_parameters": [
{
"default": "embeddings_geneformer_zero_shot",
"description": "Prefix for output embeddings CSV file",
"name": "embeddings_prefix",
"type": "str",
},
{
"default": "Geneformer-V2-104M",
"description": "Name of the pretrained Geneformer model to use",
"name": "model_name",
"type": "str",
},
{
"default": 4096,
"description": "Model input size for tokenizer (maximum number of genes per cell)",
"name": "model_input_size",
"type": "int",
},
{
"default": 10000,
"description": "Chunk size for tokenizer processing",
"name": "chunk_size",
"type": "int",
},
{
"default": 8,
"description": "Number of processes for parallelization",
"name": "nproc",
"type": "int",
},
{
"default": 64,
"description": "Batch size for embedding extraction",
"name": "forward_batch_size",
"type": "int",
},
],
"required_parameters": [
{
"default": None,
"description": "Path to .h5ad file or AnnData instance in memory",
"name": "adata_or_path",
"type": "str",
},
{
"default": None,
"description": "Base directory for all data/models/outputs",
"name": "base_dir",
"type": "str",
},
{
"default": None,
"description": "Filename for AnnData file (used if adata_or_path is AnnData)",
"name": "adata_filename",
"type": "str",
},
],
},
]
27 changes: 27 additions & 0 deletions biomni_env/new_software_v007.sh
100644 → 100755
@@ -5,6 +5,33 @@ pip install lazyslide
pip install "git+https://github.com/YosefLab/popV.git@refs/pull/100/head"
pip install pybiomart
pip install fair-esm
sudo apt update && sudo apt install -y git-lfs

# pinned/minimum package versions required for Geneformer to work
pip install "anndata>=0.11"
pip install "datasets>=2.12"
pip install "loompy>=3.0"
pip install "matplotlib>=3.7"
pip install "numpy>=1.26.4"
pip install "optuna>=3.6"
pip install "optuna-integration>=3.6"
pip install "packaging>=23.0"
pip install "pandas>=2.2.2"
pip install "peft==0.11.1"
pip install "pyarrow>=12.0"
pip install "pytz>=2023.0"
pip install "ray>=2.6"
pip install "scanpy>=1.9"
pip install "scipy>=1.13.1"
pip install "torch>=2.0.1"
pip install "tqdm>=4.66.3"
pip install "transformers==4.40"
pip install "h5py>=3.11.0"
pip install "numcodecs>=0.13.0"
pip install "zarr>=2.18.2"
pip install "requests>=2.32.1"
pip install "numba>=0.6.1"
pip install pynvml
pip install nnunet nibabel nilearn
pip install mi-googlesearch-python
conda install weasyprint