diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml new file mode 100644 index 00000000..b2316674 --- /dev/null +++ b/.github/workflows/codespell.yml @@ -0,0 +1,25 @@ +# Codespell configuration is within pyproject.toml +--- +name: Codespell + +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + codespell: + name: Check for spelling errors + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Annotate locations with typos + uses: codespell-project/codespell-problem-matcher@v1 + - name: Codespell + uses: codespell-project/actions-codespell@v2 diff --git a/.gitignore b/.gitignore index df1576d7..33d27947 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ .env debug* .* +!.github # Byte-compiled / optimized / DLL files biomni_release/biomni_env/biomni_tools/ open_source_process.ipynb diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7b5c4676..c434dbe4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,13 @@ repos: - id: check-merge-conflict - id: no-commit-to-branch args: ["--branch=main"] + - repo: https://github.com/codespell-project/codespell + # Configuration for codespell is in pyproject.toml + rev: v2.4.1 + hooks: + - id: codespell + additional_dependencies: + - tomli; python_version<'3.11' #- repo: https://github.com/pre-commit/mirrors-mypy # rev: v1.16.1 # hooks: diff --git a/biomni/agent/a1.py b/biomni/agent/a1.py index 7a62e59f..3f5c06fe 100644 --- a/biomni/agent/a1.py +++ b/biomni/agent/a1.py @@ -1097,7 +1097,7 @@ def format_item_with_description(name, description): # Base prompt prompt_modifier = """ You are a helpful biomedical assistant assigned with the task of problem-solving. -To achieve this, you will be using an interactive coding environment equipped with a variety of tool functions, data, and softwares to assist you throughout the process. 
+To achieve this, you will be using an interactive coding environment equipped with a variety of tool functions, data, and software to assist you throughout the process. Given a task, make a plan first. The plan should be a numbered list of steps that you will take to solve the task. Be specific and detailed. Format your plan as a checklist with empty checkboxes like this: @@ -1125,7 +1125,7 @@ def format_item_with_description(name, description): - For Python code (default): print("Hello World!") - For R code: #!R\nlibrary(ggplot2)\nprint("Hello from R") - For Bash scripts and commands: #!BASH\necho "Hello from Bash"\nls -la - - For CLI softwares, use Bash scripts. + - For CLI software, use Bash scripts. 2) When you think it is ready, directly provide a solution that adheres to the required format for the given task to the user. Your solution should be enclosed using "" tag, for example: The answer is A . IMPORTANT: You must end the solution block with tag. diff --git a/biomni/tool/genomics.py b/biomni/tool/genomics.py index d84d26bc..27293031 100644 --- a/biomni/tool/genomics.py +++ b/biomni/tool/genomics.py @@ -74,7 +74,7 @@ def unsupervised_celltype_transfer_between_scRNA_datasets( prediction_mode="retrain", ).adata - # passing arugments this way decreases chance of LLM generation and parsing errors + # passing arguments this way decreases chance of LLM generation and parsing errors flags = { "CELLTYPIST": CELLTYPIST, "KNN_BBKNN": KNN_BBKNN, @@ -864,7 +864,7 @@ def get_uce_embeddings_scRNA( DATA_ROOT="/dfs/project/bioagentos/data/singlecell/", custom_args=None, ): - """The UCE embeddings are usually our default tools to get cell embeddings, we map UCE embeddings to IMA referece dataset and get the cell types for a better understanding. + """The UCE embeddings are usually our default tools to get cell embeddings, we map UCE embeddings to IMA reference dataset and get the cell types for a better understanding. 
The custom_args is a list of strings that will be passed as command line arguments to the UCE script, like ["--adata_path", adata_file, "--dir", output_dir]. The default value is None. """ diff --git a/biomni/tool/literature.py b/biomni/tool/literature.py index e4b31b14..d36eaf93 100644 --- a/biomni/tool/literature.py +++ b/biomni/tool/literature.py @@ -187,7 +187,7 @@ def search_google(query: str, num_results: int = 3, language: str = "en") -> lis """Search using Google search. Args: - query (str): The search query (e.g., "protocol text or seach question") + query (str): The search query (e.g., "protocol text or search question") num_results (int): Number of results to return (default: 10) language (str): Language code for search results (default: 'en') pause (float): Pause between searches to avoid rate limiting (default: 2.0 seconds) diff --git a/biomni/tool/protocols/addgene/Addgene_ Affinity Purification of Recombinant Antibodies with Protein A or Protein G.txt b/biomni/tool/protocols/addgene/Addgene_ Affinity Purification of Recombinant Antibodies with Protein A or Protein G.txt index 9a87b985..bfc98fc1 100644 --- a/biomni/tool/protocols/addgene/Addgene_ Affinity Purification of Recombinant Antibodies with Protein A or Protein G.txt +++ b/biomni/tool/protocols/addgene/Addgene_ Affinity Purification of Recombinant Antibodies with Protein A or Protein G.txt @@ -209,7 +209,7 @@ Section 1: Affinity chromatography 23. (Optional) Regenerate the column by washing with 25 mL of Protein A/G binding buffer and store in 20% ethanol at 4 °C. - 💡 PRO TIP: Columns may be re-used up to 5x when purifying the same + 💡 PRO TIP: Columns may be reused up to 5x when purifying the same recombinant antibody. 
Section 2: Buffer exchange diff --git a/biomni/tool/protocols/addgene/Addgene_ Isolating a Monoclonal Cell Population by Limiting Dilution.txt b/biomni/tool/protocols/addgene/Addgene_ Isolating a Monoclonal Cell Population by Limiting Dilution.txt index 0c04f2c9..2f415afa 100644 --- a/biomni/tool/protocols/addgene/Addgene_ Isolating a Monoclonal Cell Population by Limiting Dilution.txt +++ b/biomni/tool/protocols/addgene/Addgene_ Isolating a Monoclonal Cell Population by Limiting Dilution.txt @@ -8,7 +8,7 @@ INTRODUCTION This protocol describes how to generate a monoclonal cell line from a polyclonal pool of stable cells. -Transducing cells with lentivirus results in a heterogenous polyclonal population +Transducing cells with lentivirus results in a heterogeneous polyclonal population that varies in the number of integration events and the site(s) of proviral integration across cells. Selective pressure on this heterogeneous cell pool could lead to reduced transgene expression over time, as the lower expressing diff --git a/biomni/tool/protocols/addgene/Addgene_ Protocol - How to Perform Sequence Analysis.txt b/biomni/tool/protocols/addgene/Addgene_ Protocol - How to Perform Sequence Analysis.txt index 418ea2d1..3ce952f8 100644 --- a/biomni/tool/protocols/addgene/Addgene_ Protocol - How to Perform Sequence Analysis.txt +++ b/biomni/tool/protocols/addgene/Addgene_ Protocol - How to Perform Sequence Analysis.txt @@ -53,7 +53,7 @@ recommends using Addgene's sequencing results as a reference for primer design. SEQUENCING RESULTS ================================================================================ -A good sequencing reaction will produce between 300-900 base pairs of useable +A good sequencing reaction will produce between 300-900 base pairs of usable sequence. You should receive your sequencing results as a trace file (.ab1) which graphically depicts the sequence as a series of colored peaks corresponding to one of the four nucleotide bases. 
This is an example of a trace file from a diff --git a/biomni/tool/protocols/addgene/Addgene_ Protocol - How to Run an Agarose Gel.txt b/biomni/tool/protocols/addgene/Addgene_ Protocol - How to Run an Agarose Gel.txt index 54ab07e5..d689d1db 100644 --- a/biomni/tool/protocols/addgene/Addgene_ Protocol - How to Run an Agarose Gel.txt +++ b/biomni/tool/protocols/addgene/Addgene_ Protocol - How to Run an Agarose Gel.txt @@ -46,7 +46,7 @@ Microwave for 1-3 min until the agarose is completely dissolved (but do not over Caution HOT! Be careful stirring, eruptive boiling can occur. Pro-Tip -It is a good idea to microwave for 30-45 sec, stop and swirl, and then continue towards a boil. Keep an eye on it the solution has a tendancy to boil over. Placing saran wrap over the top of the flask can help with this, but is not necessary if you pay close attention. +It is a good idea to microwave for 30-45 sec, stop and swirl, and then continue towards a boil. Keep an eye on it; the solution has a tendency to boil over. Placing saran wrap over the top of the flask can help with this, but is not necessary if you pay close attention. Let agarose solution cool down to about 50 °C (about when you can comfortably keep your hand on the flask), about 5 mins. Optional : Add ethidium bromide (EtBr) to a final concentration of approximately 0.2-0.5 μg/mL (usually about 2-3 μl of lab stock solution per 100 mL gel). EtBr binds to the DNA and allows you to visualize the DNA under ultraviolet (UV) light. 
diff --git a/biomni/tool/protocols/addgene/Addgene_ Virus Protocol - Generating Stable Cell Lines.txt b/biomni/tool/protocols/addgene/Addgene_ Virus Protocol - Generating Stable Cell Lines.txt index 9bc463a2..df539a27 100644 --- a/biomni/tool/protocols/addgene/Addgene_ Virus Protocol - Generating Stable Cell Lines.txt +++ b/biomni/tool/protocols/addgene/Addgene_ Virus Protocol - Generating Stable Cell Lines.txt @@ -12,7 +12,7 @@ Mol Bio Protocols Viral Service Introduction This protocol can be used to generate stable cell lines expressing a gene of interest from an integrated lentiviral vector. Unlike the short-term protein expression observed using transient transfection approaches, generating cell lines using lentiviral vectors enables long-term protein expression studies. Moreover, repeating experiments in a stable cell line, as opposed to transiently-transfected cells, increases reproducibility, as it eliminates the variation associated with repeated transient transfection. -Some lentiviral vectors deliver mammalian antibiotic resistance (e.g., puromycin, blasticidin), which enables selection of a stable cell culture after transduction. Performing antibiotic selection on transduced cells enables elimination of untransduced cells, resulting in a more homogenous (but still polyclonal) cell population. Depending on the transducibility of the cell line used, this antibiotic selection may be a vital step for obtaining a population of cells that have taken up the lentiviral transgene. Note that not all lentiviral vectors deliver antibiotic resistance. +Some lentiviral vectors deliver mammalian antibiotic resistance (e.g., puromycin, blasticidin), which enables selection of a stable cell culture after transduction. Performing antibiotic selection on transduced cells enables elimination of untransduced cells, resulting in a more homogeneous (but still polyclonal) cell population. 
Depending on the transducibility of the cell line used, this antibiotic selection may be a vital step for obtaining a population of cells that have taken up the lentiviral transgene. Note that not all lentiviral vectors deliver antibiotic resistance. This protocol was established using 293T cells but can be adapted to alternative cell lines. Workflow Timeline Day 0: diff --git a/biomni/utils.py b/biomni/utils.py index 09e1ef91..0e5bb2c5 100644 --- a/biomni/utils.py +++ b/biomni/utils.py @@ -270,7 +270,7 @@ def function_to_api_schema(function_string, llm): For variable without default values, set them as None, not null. For variable with boolean values, use capitalized True or False, not true or false. Do not add any return type in the docstring. - Be as clear and succint as possible for the descriptions. Please do not make it overly verbose. + Be as clear and succinct as possible for the descriptions. Please do not make it overly verbose. Here is the code snippet: {code} """ @@ -684,7 +684,7 @@ def on_chat_model_start(self, serialized, messages, **kwargs): class NodeLogger(BaseCallbackHandler): def on_llm_end(self, response, **kwargs): # response of type LLMResult - for generations in response.generations: # response.generations of type List[List[Generations]] becuase "each input could have multiple candidate generations" + for generations in response.generations: # response.generations of type List[List[Generations]] because "each input could have multiple candidate generations" for generation in generations: generated_text = generation.message.content # token_usage = generation.message.response_metadata["token_usage"] diff --git a/biomni_env/README.md b/biomni_env/README.md index a561401c..cb69f281 100644 --- a/biomni_env/README.md +++ b/biomni_env/README.md @@ -9,7 +9,7 @@ This directory contains scripts and configuration files to set up a comprehensiv ``` 2. 
Setting up the environment: -- (a) If you want to use or try out the basic agent without the full E1 or install your own softwares, run the following script: +- (a) If you want to use or try out the basic agent without the full E1 or install your own software, run the following script: ```bash conda env create -f environment.yml diff --git a/biomni_env/bio_env_py310.yml b/biomni_env/bio_env_py310.yml index db0b40f4..823da6b4 100644 --- a/biomni_env/bio_env_py310.yml +++ b/biomni_env/bio_env_py310.yml @@ -11,7 +11,7 @@ dependencies: - pip # Purpose: Python 3.10 compatibility environment for tools not yet supporting newer Python versions. -# Automaticlly setup on setup.sh execution. +# Automatically set up on setup.sh execution. # Contains tools that require Python 3.10 or earlier versions. # Can be create with: # micromamba create -f bio_env_py310.yml OR conda env create -f bio_env_py310.yml diff --git a/biomni_env/new_software_v008.sh b/biomni_env/new_software_v008.sh index 306d6885..1005f579 100644 --- a/biomni_env/new_software_v008.sh +++ b/biomni_env/new_software_v008.sh @@ -10,7 +10,7 @@ pip install pybiomart pip install fair-esm pip install uv uv pip install transcriptformer -pip install "zarr>=2.0,<3.0" #this resolved transcripformer download isses +pip install "zarr>=2.0,<3.0" #this resolved transcriptformer download issues uv tool install arc-state pip install nnunet nibabel nilearn pip install mi-googlesearch-python diff --git a/pyproject.toml b/pyproject.toml index 0fad3a91..e796c63c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,7 @@ ignore = [ # First line should be in imperative mood; try rephrasing "D401", ## Disable one in each pair of mutually incompatible rules - # We don’t want a blank line before a class docstring + # We don't want a blank line before a class docstring "D203", # We want docstrings to start immediately after the opening triple quote "D213", @@ -103,3 +103,9 @@ ignore = [ # strip with multi characters "B005" ] + 
+[tool.codespell] +skip = '.git*,*.pdf,*.svg,*.css,*.min.*,*/i18n/*,*/build/*,biomni_env/*,.cache,.npm' +check-hidden = true +ignore-regex = '^\s*"image/\S+": ".*' +ignore-words-list = 'scarches,hsa,ser,abl,basf,optmizer,transferrin,te,tbe,tre,ther,commun,theis,inactivate' diff --git a/tutorials/biomni_101.ipynb b/tutorials/biomni_101.ipynb index d7366c4e..7f2a406c 100644 --- a/tutorials/biomni_101.ipynb +++ b/tutorials/biomni_101.ipynb @@ -768,14 +768,14 @@ "Rank Gene Category Expected Effect\n", "--------------------------------------------------------------------------------\n", "1 TOX Transcription Factors KO should reduce exhaustion (master exhaustion TF)\n", - "2 PDCD1 Immune Checkpoints KO should reduce exhaustion (remove PD-1 checkpoin\n", + "2 PDCD1 Immune Checkpoints KO should reduce exhaustion (remove PD-1 checkpoint\n", "3 EOMES Transcription Factors KO should reduce exhaustion (exhaustion-promoting \n", "4 HAVCR2 Immune Checkpoints KO should reduce exhaustion (remove TIM-3 checkpoi\n", "5 LAG3 Immune Checkpoints KO should reduce exhaustion (remove LAG-3 checkpoi\n", "6 TIGIT Immune Checkpoints KO should reduce exhaustion (remove TIGIT checkpoi\n", "7 CTLA4 Immune Checkpoints KO should enhance early activation\n", "8 BATF Transcription Factors KO should reduce exhaustion (exhaustion-promoting \n", - "9 HIF1A Metabolic Regulators KO may reduce exhaustion (metabolic stress respons\n", + "9 HIF1A Metabolic Regulators KO may reduce exhaustion (metabolic stress response\n", "10 MYC Metabolic Regulators KO may increase exhaustion (metabolic reprogrammin\n", "11 LCK TCR Signaling KO should reduce TCR signaling strength\n", "12 ZAP70 TCR Signaling KO should reduce TCR signaling strength\n",