-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
docs: Add jupyter notebook containing code snippets for working with …
…ProFASTA
- Loading branch information
1 parent
53727bd
commit e846e10
Showing
6 changed files
with
443 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "273cf753", | ||
"metadata": {}, | ||
"source": [ | ||
"# Code snippets for working with the proFASTA library" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "8d8a7af6", | ||
"metadata": {}, | ||
"source": [ | ||
"## Removing invalid characters from imported protein sequences" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "830d37b9", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"MEPG\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import profasta\n", | ||
"\n", | ||
"\n", | ||
"def cleanup_protein_sequences(\n", | ||
" db: profasta.ProteinDatabase, alphabet=\"ABCDEFGHIJKLMNOPQRSTUVWXYZ\"\n", | ||
" ) -> None:\n", | ||
" \"\"\"Remove non-alphabet characters from protein sequences in the ProteinDatabase.\n", | ||
" \n", | ||
" Args:\n", | ||
" db: A profasta.ProteinDatabase instance.\n", | ||
" alphabet: List of characters that are allowed in the protein entry sequences.\n", | ||
" \"\"\"\n", | ||
" for entry in db.values(): \n", | ||
" entry.sequence = \"\".join([aa for aa in entry.sequence if aa in alphabet])\n", | ||
"\n", | ||
"\n", | ||
"fasta_path = \"./uniprot_hsapiens_10entries.fasta\"\n", | ||
"db = profasta.db.ProteinDatabase()\n", | ||
"db.add_fasta(fasta_path, header_parser=\"uniprot\")\n", | ||
"db[\"O75385\"].sequence = \"MEPG_-+123\"\n", | ||
"cleanup_protein_sequences(db)\n", | ||
"\n", | ||
"print(db[\"O75385\"].sequence)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "0c5bea99", | ||
"metadata": {}, | ||
"source": [ | ||
"## Converting FASTA headers into a UniProt like format" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "1996bde3", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import profasta\n", | ||
"import profasta.parser\n", | ||
"\n", | ||
"\n", | ||
"class CustomHeaderParser:\n", | ||
" \"\"\"Custom header parser.\"\"\"\n", | ||
"\n", | ||
" @classmethod\n", | ||
" def parse(cls, header: str) -> profasta.parser.ParsedHeader:\n", | ||
" \"\"\"Parse a FASTA header string into a ParsedHeader object.\n", | ||
" \n", | ||
" Header format example:\n", | ||
" >ProteinID hypothetical protein name\n", | ||
" \"\"\"\n", | ||
" split_header = header.split(maxsplit=1)\n", | ||
" _id = split_header[0]\n", | ||
"\n", | ||
" fields = {\n", | ||
" \"db\": \"xx\",\n", | ||
" \"identifier\": _id,\n", | ||
" \"entry_name\": f\"{_id}_CUSTOM\",\n", | ||
" \"gene_name\": _id,\n", | ||
" }\n", | ||
" if len(split_header) > 1:\n", | ||
" fields[\"protein_name\"] = split_header[1]\n", | ||
" return profasta.parser.ParsedHeader(_id, header, fields)\n", | ||
"\n", | ||
"# Register the custom header parser so that it can be used by the ProteinDatabase.\n", | ||
"profasta.parser.register_parser(\"custom_parser\", CustomHeaderParser)\n", | ||
"\n", | ||
"fasta_path = \"./custom_header_format.fasta\"\n", | ||
"converted_fasta_path = \"./custom_header_format.uniprot-like.fasta\"\n", | ||
"protein_db = profasta.ProteinDatabase()\n", | ||
"\n", | ||
"# Specify the custom header parser to use for adding the FASTA file.\n", | ||
"protein_db.add_fasta(fasta_path, header_parser=\"custom_parser\")\n", | ||
"\n", | ||
"# Write the ProteinDatabase to a new FASTA file using the uniprot-like header writer.\n", | ||
"protein_db.write_fasta(converted_fasta_path, header_writer=\"uniprot_like\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "697f8065", | ||
"metadata": {}, | ||
"source": [ | ||
"## Create a combined FASTA file with added decoy entries\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "bc15636a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import profasta\n", | ||
"\n", | ||
"fasta_path = \"./uniprot_hsapiens_10entries.fasta\"\n", | ||
"decoy_fasta_path = \"./uniprot_hsapiens_10entries_DECOY.fasta\"\n", | ||
"\n", | ||
"# Import the FASTA file\n", | ||
"db = profasta.db.ProteinDatabase()\n", | ||
"db.add_fasta(fasta_path, header_parser=\"uniprot\")\n", | ||
"\n", | ||
"# Create the new FASTA file and write the original entries to it.\n", | ||
"db.write_fasta(decoy_fasta_path, header_writer=\"uniprot\")\n", | ||
"\n", | ||
"# Create a decoy database from the original database, containing reversed sequences.\n", | ||
"decoy_db = profasta.create_decoy_db(db, keep_nterm_methionine=True)\n", | ||
"\n", | ||
"# Append the decoy entries to the new FASTA file.\n", | ||
"decoy_db.write_fasta(decoy_fasta_path, header_writer=\"decoy\", append=True)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.9" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
>ProteinID hypothetical protein name | ||
MAWTPLFLFLLTCCPGSNSQAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQK | ||
PGQAPRTLIYDTSNKHSWTPARFSGSLLGGKAALTLLGAQPEDEAEYYCLLSYSGAR |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
>xx|ProteinID|ProteinID_CUSTOM hypothetical protein name GN=ProteinID | ||
MAWTPLFLFLLTCCPGSNSQAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQK | ||
PGQAPRTLIYDTSNKHSWTPARFSGSLLGGKAALTLLGAQPEDEAEYYCLLSYSGAR |
File renamed without changes.
Oops, something went wrong.