docs: Add jupyter notebook containing code snippets for working with …

…ProFASTA
hollenstein · Apr 19, 2024 · e846e10 · e846e10
1 parent 53727bd
commit e846e10
Show file tree

Hide file tree

Showing 6 changed files with 443 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -24,14 +24,16 @@ The following code snippet shows how to import a FASTA file containing UniProt p
 ```python
 >>> import profasta
 >>> 
->>> fasta_path = "./example_data/uniprot_hsapiens_10entries.fasta"
+>>> fasta_path = "./examples/uniprot_hsapiens_10entries.fasta"
 >>> db = profasta.db.ProteinDatabase()
 >>> db.add_fasta(fasta_path, header_parser="uniprot")
 >>> protein_record = db["O75385"]
 >>> print(protein_record.header_fields["gene_name"])
 ULK1
 ```
 
+For more examples how to use the ProFASTA library please refer to the [code snippets](examples/code_snippets.ipynb) Jupyter notebook.
+
 ## Requirements
 Python >= 3.9
 

diff --git a/examples/code_snippets.ipynb b/examples/code_snippets.ipynb
@@ -0,0 +1,172 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "273cf753",
+   "metadata": {},
+   "source": [
+    "# Code snippets for working with the proFASTA library"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8d8a7af6",
+   "metadata": {},
+   "source": [
+    "## Removing invalid characters from imported protein sequences"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "830d37b9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MEPG\n"
+     ]
+    }
+   ],
+   "source": [
+    "import profasta\n",
+    "\n",
+    "\n",
+    "def cleanup_protein_sequences(\n",
+    "        db: profasta.ProteinDatabase, alphabet=\"ABCDEFGHIJKLMNOPQRSTUVWXYZ\"\n",
+    "    ) -> None:\n",
+    "    \"\"\"Remove non-alphabet characters from protein sequences in the ProteinDatabase.\n",
+    "    \n",
+    "    Args:\n",
+    "        db: A profasta.ProteinDatabase instance.\n",
+    "        alphabet: List of characters that are allowed in the protein entry sequences.\n",
+    "    \"\"\"\n",
+    "    for entry in db.values():        \n",
+    "        entry.sequence = \"\".join([aa for aa in entry.sequence if aa in alphabet])\n",
+    "\n",
+    "\n",
+    "fasta_path = \"./uniprot_hsapiens_10entries.fasta\"\n",
+    "db = profasta.db.ProteinDatabase()\n",
+    "db.add_fasta(fasta_path, header_parser=\"uniprot\")\n",
+    "db[\"O75385\"].sequence = \"MEPG_-+123\"\n",
+    "cleanup_protein_sequences(db)\n",
+    "\n",
+    "print(db[\"O75385\"].sequence)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0c5bea99",
+   "metadata": {},
+   "source": [
+    "## Converting FASTA headers into a UniProt like format"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "1996bde3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import profasta\n",
+    "import profasta.parser\n",
+    "\n",
+    "\n",
+    "class CustomHeaderParser:\n",
+    "    \"\"\"Custom header parser.\"\"\"\n",
+    "\n",
+    "    @classmethod\n",
+    "    def parse(cls, header: str) -> profasta.parser.ParsedHeader:\n",
+    "        \"\"\"Parse a FASTA header string into a ParsedHeader object.\n",
+    "        \n",
+    "        Header format example:\n",
+    "        >ProteinID hypothetical protein name\n",
+    "        \"\"\"\n",
+    "        split_header = header.split(maxsplit=1)\n",
+    "        _id = split_header[0]\n",
+    "\n",
+    "        fields = {\n",
+    "            \"db\": \"xx\",\n",
+    "            \"identifier\": _id,\n",
+    "            \"entry_name\": f\"{_id}_CUSTOM\",\n",
+    "            \"gene_name\": _id,\n",
+    "        }\n",
+    "        if len(split_header) > 1:\n",
+    "            fields[\"protein_name\"] = split_header[1]\n",
+    "        return profasta.parser.ParsedHeader(_id, header, fields)\n",
+    "\n",
+    "# Register the custom header parser so that it can be used by the ProteinDatabase.\n",
+    "profasta.parser.register_parser(\"custom_parser\", CustomHeaderParser)\n",
+    "\n",
+    "fasta_path = \"./custom_header_format.fasta\"\n",
+    "converted_fasta_path =  \"./custom_header_format.uniprot-like.fasta\"\n",
+    "protein_db = profasta.ProteinDatabase()\n",
+    "\n",
+    "# Specify the custom header parser to use for adding the FASTA file.\n",
+    "protein_db.add_fasta(fasta_path, header_parser=\"custom_parser\")\n",
+    "\n",
+    "# Write the ProteinDatabase to a new FASTA file using the uniprot-like header writer.\n",
+    "protein_db.write_fasta(converted_fasta_path, header_writer=\"uniprot_like\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "697f8065",
+   "metadata": {},
+   "source": [
+    "## Create a combined FASTA file with added decoy entries\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "bc15636a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import profasta\n",
+    "\n",
+    "fasta_path = \"./uniprot_hsapiens_10entries.fasta\"\n",
+    "decoy_fasta_path = \"./uniprot_hsapiens_10entries_DECOY.fasta\"\n",
+    "\n",
+    "# Import the FASTA file\n",
+    "db = profasta.db.ProteinDatabase()\n",
+    "db.add_fasta(fasta_path, header_parser=\"uniprot\")\n",
+    "\n",
+    "# Create the new FASTA file and write the original entries to it.\n",
+    "db.write_fasta(decoy_fasta_path, header_writer=\"uniprot\")\n",
+    "\n",
+    "# Create a decoy database from the original database, containing reversed sequences.\n",
+    "decoy_db = profasta.create_decoy_db(db, keep_nterm_methionine=True)\n",
+    "\n",
+    "# Append the decoy entries to the new FASTA file.\n",
+    "decoy_db.write_fasta(decoy_fasta_path, header_writer=\"decoy\", append=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/custom_header_format.fasta b/examples/custom_header_format.fasta
@@ -0,0 +1,3 @@
+>ProteinID hypothetical protein name
+MAWTPLFLFLLTCCPGSNSQAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQK
+PGQAPRTLIYDTSNKHSWTPARFSGSLLGGKAALTLLGAQPEDEAEYYCLLSYSGAR
diff --git a/examples/custom_header_format.uniprot-like.fasta b/examples/custom_header_format.uniprot-like.fasta
@@ -0,0 +1,3 @@
+>xx|ProteinID|ProteinID_CUSTOM hypothetical protein name GN=ProteinID
+MAWTPLFLFLLTCCPGSNSQAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQK
+PGQAPRTLIYDTSNKHSWTPARFSGSLLGGKAALTLLGAQPEDEAEYYCLLSYSGAR
diff --git a/...ple_data/uniprot_hsapiens_10entries.fasta → examples/uniprot_hsapiens_10entries.fasta b/...ple_data/uniprot_hsapiens_10entries.fasta → examples/uniprot_hsapiens_10entries.fasta