From 79244d9a0d20d5200bb61745b1cb892747946853 Mon Sep 17 00:00:00 2001
From: Martin Steinegger <martin.steinegger@mpibpc.mpg.de>
Date: Mon, 18 Nov 2024 00:59:17 +0100
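Subject: [PATCH] Add DNA support

Add a `dna_input` form field to the notebook. Multiple DNA sequences are
separated with `:`; whitespace is removed from the input, and every
non-empty sequence gets the next chain label and is appended to
`fasta_entries` under a `>{chain_label}|DNA` header. No MSA is generated
for DNA chains.

Also guard the `:` splitting of the protein, ligand (SMILES), ligand (CCD)
and DNA inputs so that an empty input yields an empty list instead of
`['']`.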

---
 boltz1.ipynb | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/boltz1.ipynb b/boltz1.ipynb
index 88f21c9a..255a32e6 100644
--- a/boltz1.ipynb
+++ b/boltz1.ipynb
@@ -6,7 +6,7 @@
       "provenance": [],
       "machine_shape": "hm",
       "gpuType": "A100",
-      "authorship_tag": "ABX9TyNTASXa0TPusw2Bq/ltniMh",
+      "authorship_tag": "ABX9TyP7Wu7UlNfSKmSCprzBFVVC",
       "include_colab_link": true
     },
     "kernelspec": {
@@ -64,12 +64,15 @@
         "#@markdown  - Use `:` to specify multiple ligands as smile strings\n",
         "ligand_input_ccd = 'SAH'  #@param {type:\"string\"}\n",
         "#@markdown - Use `:` to specify multiple ligands as CCD codes (three-letter codes)\n",
+        "dna_input = ''  #@param {type:\"string\"}\n",
+        "#@markdown - Use `:` to specify multiple DNA sequences\n",
         "jobname = 'test'  #@param {type:\"string\"}\n",
         "\n",
         "# Clean up the query sequence and jobname\n",
         "query_sequence = \"\".join(query_sequence.split())\n",
         "ligand_input = \"\".join(ligand_input.split())\n",
         "ligand_input_ccd = \"\".join(ligand_input_ccd.split())\n",
+        "dna_input = \"\".join(dna_input.split())\n",
         "basejobname = \"\".join(jobname.split())\n",
         "basejobname = re.sub(r'\\W+', '', basejobname)\n",
         "jobname = add_hash(basejobname, query_sequence)\n",
@@ -90,9 +93,11 @@
         "from string import ascii_uppercase\n",
         "\n",
         "# Split sequences on chain breaks\n",
-        "protein_sequences = query_sequence.strip().split(':')\n",
-        "ligand_sequences = ligand_input.strip().split(':')\n",
-        "ligand_sequences_ccd = ligand_input_ccd.strip().split(':')\n",
+        "protein_sequences = query_sequence.strip().split(':') if query_sequence.strip() else []\n",
+        "ligand_sequences = ligand_input.strip().split(':') if ligand_input.strip() else []\n",
+        "ligand_sequences_ccd = ligand_input_ccd.strip().split(':') if ligand_input_ccd.strip() else []\n",
+        "dna_sequences = dna_input.strip().split(':') if dna_input.strip() else []\n",
+        "\n",
         "# Initialize chain labels starting from 'A'\n",
         "chain_labels = iter(ascii_uppercase)\n",
         "\n",
@@ -127,6 +132,17 @@
         "    sequence = lig\n",
         "    fasta_entries.append((header, sequence))\n",
         "\n",
+        "# Process DNA sequences (NO MSA is generated)\n",
+        "for seq in dna_sequences:\n",
+        "    seq = seq.strip()\n",
+        "    if not seq:\n",
+        "        continue  # Skip empty sequences\n",
+        "    chain_label = next(chain_labels)\n",
+        "    lig_type = 'DNA'\n",
+        "    header = f\">{chain_label}|{lig_type}\"\n",
+        "    sequence = seq\n",
+        "    fasta_entries.append((header, sequence))\n",
+        "\n",
         "# Process ligand sequences (CCD codes)\n",
         "for lig in ligand_sequences_ccd:\n",
         "    lig = lig.strip()\n",