diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..1e373e3 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,32 @@ +# This workflow will install the profasta package and its dependencies and run tests with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python package + +on: + push: + branches: ["main", "develop"] + pull_request: + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4.1.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5.1.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install .[tests] + - name: Test with pytest + run: | + python -m pytest diff --git a/.gitignore b/.gitignore index a4b528a..d7da363 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ __pycache__/ *.py[cod] *$py.class + # C extensions *.so @@ -104,6 +105,9 @@ venv.bak/ ### VisualStudioCode ### .vscode/ +### visual studio ### +.vs/ + # Local History for Visual Studio Code .history/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..27502e2 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,22 @@ +# Changelog + +---------------------------------------------------------------------------------------- + +## Version [0.0.5] +Released: 2024-04-19 + +### Added +- Added an option to `db.ProteinDatabase.add_fasta` that allows skipping entries whose headers could not be parsed, instead of raising a `ValueError`. (Suggested by @xeniorn) +- Added `keys`, `values`, and `items` methods to `db.ProteinDatabase` to allow more convenient iteration over the database's entries. 
+ +### Changed +- Made `decoy.reverse_sequence` a private function. +- Renamed the protocol classes `HeaderParser` and `HeaderWriter` to `AbstractHeaderParser` and `AbstractHeaderWriter` to be consistent with the naming of the other abstract classes. (Suggested by @xeniorn) + +### Fixed +- Parsing a FASTA file returned invalid protein sequences when the sequence contained a terminal `*` character or lowercase letters. Terminal `*` characters are now removed from the sequence and the sequence is capitalized. (Contributed by @xeniorn) + +### Chores +- Added a GitHub Actions CI workflow for automated testing. (Contributed by @xeniorn) +- Minor corrections and additions to some docstrings. +- Added a Jupyter notebook containing usage examples for the ProFASTA library. diff --git a/README.md b/README.md index fc7bb3c..764a5c0 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # ProFASTA [![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip) +![Python Version from PEP 621 TOML](https://img.shields.io/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2Fhollenstein%2Fprofasta%2Fmain%2Fpyproject.toml) +[![pypi](https://img.shields.io/pypi/v/profasta)](https://pypi.org/project/profasta) +[![unit-tests](https://github.com/hollenstein/profasta/actions/workflows/python-package.yml/badge.svg?branch=main)](https://github.com/hollenstein/profasta/actions/workflows/python-package.yml) ## Introduction ProFASTA is a Python library for working with FASTA files containing protein records. Unlike other packages, ProFASTA prioritizes simplicity, while aiming to provide a set of useful features required in the field of proteomics based mass spectrometry. 
@@ -21,7 +24,7 @@ The following code snippet shows how to import a FASTA file containing UniProt p ```python >>> import profasta >>> ->>> fasta_path = "./example_data/uniprot_hsapiens_10entries.fasta" +>>> fasta_path = "./examples/uniprot_hsapiens_10entries.fasta" >>> db = profasta.db.ProteinDatabase() >>> db.add_fasta(fasta_path, header_parser="uniprot") >>> protein_record = db["O75385"] @@ -29,6 +32,8 @@ The following code snippet shows how to import a FASTA file containing UniProt p ULK1 ``` +For more examples of how to use the ProFASTA library, please refer to the [code snippets](examples/code_snippets.ipynb) Jupyter notebook. + ## Requirements Python >= 3.9 @@ -53,7 +58,7 @@ pip uninstall profasta - [x] built-in parser for uniprot format - [x] allow user defined parser - [x] write FASTA file - -[x] allow custom FASTA header generation + - [x] allow custom FASTA header generation **Additional features** - [x] read multiple FASTA files and write a combined file @@ -62,3 +67,6 @@ pip uninstall profasta - [x] add decoy protein records to an existing FASTA file - [ ] validate FASTA file / FASTA records +## Contributors + +- Juraj Ahel - [@xeniorn](https://github.com/xeniorn) \ No newline at end of file diff --git a/examples/code_snippets.ipynb b/examples/code_snippets.ipynb new file mode 100644 index 0000000..b32ebb6 --- /dev/null +++ b/examples/code_snippets.ipynb @@ -0,0 +1,172 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "273cf753", + "metadata": {}, + "source": [ + "# Code snippets for working with the proFASTA library" + ] + }, + { + "cell_type": "markdown", + "id": "8d8a7af6", + "metadata": {}, + "source": [ + "## Removing invalid characters from imported protein sequences" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "830d37b9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MEPG\n" + ] + } + ], + "source": [ + "import profasta\n", + "\n", + "\n", + "def 
cleanup_protein_sequences(\n", + " db: profasta.ProteinDatabase, alphabet=\"ABCDEFGHIJKLMNOPQRSTUVWXYZ\"\n", + " ) -> None:\n", + " \"\"\"Remove non-alphabet characters from protein sequences in the ProteinDatabase.\n", + " \n", + " Args:\n", + " db: A profasta.ProteinDatabase instance.\n", + " alphabet: List of characters that are allowed in the protein entry sequences.\n", + " \"\"\"\n", + " for entry in db.values(): \n", + " entry.sequence = \"\".join([aa for aa in entry.sequence if aa in alphabet])\n", + "\n", + "\n", + "fasta_path = \"./uniprot_hsapiens_10entries.fasta\"\n", + "db = profasta.db.ProteinDatabase()\n", + "db.add_fasta(fasta_path, header_parser=\"uniprot\")\n", + "db[\"O75385\"].sequence = \"MEPG_-+123\"\n", + "cleanup_protein_sequences(db)\n", + "\n", + "print(db[\"O75385\"].sequence)" + ] + }, + { + "cell_type": "markdown", + "id": "0c5bea99", + "metadata": {}, + "source": [ + "## Converting FASTA headers into a UniProt like format" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1996bde3", + "metadata": {}, + "outputs": [], + "source": [ + "import profasta\n", + "import profasta.parser\n", + "\n", + "\n", + "class CustomHeaderParser:\n", + " \"\"\"Custom header parser.\"\"\"\n", + "\n", + " @classmethod\n", + " def parse(cls, header: str) -> profasta.parser.ParsedHeader:\n", + " \"\"\"Parse a FASTA header string into a ParsedHeader object.\n", + " \n", + " Header format example:\n", + " >ProteinID hypothetical protein name\n", + " \"\"\"\n", + " split_header = header.split(maxsplit=1)\n", + " _id = split_header[0]\n", + "\n", + " fields = {\n", + " \"db\": \"xx\",\n", + " \"identifier\": _id,\n", + " \"entry_name\": f\"{_id}_CUSTOM\",\n", + " \"gene_name\": _id,\n", + " }\n", + " if len(split_header) > 1:\n", + " fields[\"protein_name\"] = split_header[1]\n", + " return profasta.parser.ParsedHeader(_id, header, fields)\n", + "\n", + "# Register the custom header parser so that it can be used by the ProteinDatabase.\n", + 
"profasta.parser.register_parser(\"custom_parser\", CustomHeaderParser)\n", + "\n", + "fasta_path = \"./custom_header_format.fasta\"\n", + "converted_fasta_path = \"./custom_header_format.uniprot-like.fasta\"\n", + "protein_db = profasta.ProteinDatabase()\n", + "\n", + "# Specify the custom header parser to use for adding the FASTA file.\n", + "protein_db.add_fasta(fasta_path, header_parser=\"custom_parser\")\n", + "\n", + "# Write the ProteinDatabase to a new FASTA file using the uniprot-like header writer.\n", + "protein_db.write_fasta(converted_fasta_path, header_writer=\"uniprot_like\")" + ] + }, + { + "cell_type": "markdown", + "id": "697f8065", + "metadata": {}, + "source": [ + "## Create a combined FASTA file with added decoy entries\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bc15636a", + "metadata": {}, + "outputs": [], + "source": [ + "import profasta\n", + "\n", + "fasta_path = \"./uniprot_hsapiens_10entries.fasta\"\n", + "decoy_fasta_path = \"./uniprot_hsapiens_10entries_DECOY.fasta\"\n", + "\n", + "# Import the FASTA file\n", + "db = profasta.db.ProteinDatabase()\n", + "db.add_fasta(fasta_path, header_parser=\"uniprot\")\n", + "\n", + "# Create the new FASTA file and write the original entries to it.\n", + "db.write_fasta(decoy_fasta_path, header_writer=\"uniprot\")\n", + "\n", + "# Create a decoy database from the original database, containing reversed sequences.\n", + "decoy_db = profasta.create_decoy_db(db, keep_nterm_methionine=True)\n", + "\n", + "# Append the decoy entries to the new FASTA file.\n", + "decoy_db.write_fasta(decoy_fasta_path, header_writer=\"decoy\", append=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/custom_header_format.fasta b/examples/custom_header_format.fasta new file mode 100644 index 0000000..12af611 --- /dev/null +++ b/examples/custom_header_format.fasta @@ -0,0 +1,3 @@ +>ProteinID hypothetical protein name +MAWTPLFLFLLTCCPGSNSQAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQK +PGQAPRTLIYDTSNKHSWTPARFSGSLLGGKAALTLLGAQPEDEAEYYCLLSYSGAR \ No newline at end of file diff --git a/examples/custom_header_format.uniprot-like.fasta b/examples/custom_header_format.uniprot-like.fasta new file mode 100644 index 0000000..ccdde3b --- /dev/null +++ b/examples/custom_header_format.uniprot-like.fasta @@ -0,0 +1,3 @@ +>xx|ProteinID|ProteinID_CUSTOM hypothetical protein name GN=ProteinID +MAWTPLFLFLLTCCPGSNSQAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQK +PGQAPRTLIYDTSNKHSWTPARFSGSLLGGKAALTLLGAQPEDEAEYYCLLSYSGAR diff --git a/example_data/uniprot_hsapiens_10entries.fasta b/examples/uniprot_hsapiens_10entries.fasta similarity index 100% rename from example_data/uniprot_hsapiens_10entries.fasta rename to examples/uniprot_hsapiens_10entries.fasta diff --git a/examples/uniprot_hsapiens_10entries_DECOY.fasta b/examples/uniprot_hsapiens_10entries_DECOY.fasta new file mode 100644 index 0000000..3c462f8 --- /dev/null +++ b/examples/uniprot_hsapiens_10entries_DECOY.fasta @@ -0,0 +1,262 @@ +>sp|A0A075B6I9|LV746_HUMAN Immunoglobulin lambda variable 7-46 OS=Homo sapiens OX=9606 GN=IGLV7-46 PE=3 SV=4 +MAWTPLFLFLLTCCPGSNSQAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQK +PGQAPRTLIYDTSNKHSWTPARFSGSLLGGKAALTLLGAQPEDEAEYYCLLSYSGAR +>sp|A0A286YF46|SCGR5_HUMAN Small cysteine and glycine repeat-containing protein 5 OS=Homo sapiens OX=9606 GN=SCYGR5 PE=1 SV=1 +MGCCGCGGCGGCGGGCGGGCGSCTTCRCYRVGCCSSCCPCCRGCCGGCCSTPVICCCRRT +CCSCGCGCGKGCCQQKGCCQKQCCC +>sp|A6NKQ9|CGB1_HUMAN Choriogonadotropin subunit beta variant 1 OS=Homo sapiens OX=9606 GN=CGB1 PE=2 SV=3 
+MSTFPVLAEDIPLRERHVKGRVDPHFRAPKMEMFQRLLLLLLLSMGGTWASKEPLRPRCR +PINATLAVEKEGCPVCITVNTTICAGYCPTMTRVLQGVLPALPQVVCNYRDVRFESIRLP +GCPRGVNPVVSYAVALSCQCALCRRSTTDCGGPKDHPLTCDDPRFQDSSSSKAPPPSLPS +PSRLPGP +>sp|O14522|PTPRT_HUMAN Receptor-type tyrosine-protein phosphatase T OS=Homo sapiens OX=9606 GN=PTPRT PE=1 SV=6 +MASLAALALSLLLRLQLPPLPGARAQSAAGGCSFDEHYSNCGYSVALGTNGFTWEQINTW +EKPMLDQAVPTGSFMMVNSSGRASGQKAHLLLPTLKENDTHCIDFHYYFSSRDRSSPGAL +NVYVKVNGGPQGNPVWNVSGVVTEGWVKAELAISTFWPHFYQVIFESVSLKGHPGYIAVD +EVRVLAHPCRKAPHFLRLQNVEVNVGQNATFQCIAGGKWSQHDKLWLQQWNGRDTALMVT +RVVNHRRFSATVSVADTAQRSVSKYRCVIRSDGGSGVSNYAELIVKEPPTPIAPPELLAV +GATYLWIKPNANSIIGDGPIILKEVEYRTTTGTWAETHIVDSPNYKLWHLDPDVEYEIRV +LLTRPGEGGTGPPGPPLTTRTKCADPVHGPQNVEIVDIRARQLTLQWEPFGYAVTRCHSY +NLTVQYQYVFNQQQYEAEEVIQTSSHYTLRGLRPFMTIRLRLLLSNPEGRMESEELVVQT +EEDVPGAVPLESIQGGPFEEKIYIQWKPPNETNGVITLYEINYKAVGSLDPSADLSSQRG +KVFKLRNETHHLFVGLYPGTTYSFTIKASTAKGFGPPVTTRIATKISAPSMPEYDTDTPL +NETDTTITVMLKPAQSRGAPVSVYQLVVKEERLQKSRRAADIIECFSVPVSYRNASSLDS +LHYFAAELKPANLPVTQPFTVGDNKTYNGYWNPPLSPLKSYSIYFQALSKANGETKINCV +RLATKGASTQNSNTVEPEKQVDNTVKMAGVIAGLLMFIIILLGVMLTIKRRRNAYSYSYY +LKLAKKQKETQSGAQREMGPVASADKPTTKLSASRNDEGFSSSSQDVNGFTDGSRGELSQ +PTLTIQTHPYRTCDPVEMSYPRDQFQPAIRVADLLQHITQMKRGQGYGFKEEYEALPEGQ +TASWDTAKEDENRNKNRYGNIISYDHSRVRLLVLDGDPHSDYINANYIDGYHRPRHYIAT +QGPMQETVKDFWRMIWQENSASIVMVTNLVEVGRVKCVRYWPDDTEVYGDIKVTLIETEP +LAEYVIRTFTVQKKGYHEIRELRLFHFTSWPDHGVPCYATGLLGFVRQVKFLNPPEAGPI +VVHCSAGAGRTGCFIAIDTMLDMAENEGVVDIFNCVRELRAQRVNLVQTEEQYVFVHDAI +LEACLCGNTAIPVCEFRSLYYNISRLDPQTNSSQIKDEFQTLNIVTPRVRPEDCSIGLLP +RNHDKNRSMDVLPLDRCLPFLISVDGESSNYINAALMDSHKQPAAFVVTQHPLPNTVADF +WRLVFDYNCSSVVMLNEMDTAQFCMQYWPEKTSGCYGPIQVEFVSADIDEDIIHRIFRIC +NMARPQDGYRIVQHLQYIGWPAYRDTPPSKRSLLKVVRRLEKWQEQYDGREGRTVVHCLN +GGGRSGTFCAICSVCEMIQQQNIIDVFHIVKTLRNNKSNMVETLEQYKFVYEVALEYLSS +F +>sp|O75179|ANR17_HUMAN Ankyrin repeat domain-containing protein 17 OS=Homo sapiens OX=9606 GN=ANKRD17 PE=1 SV=3 +MEKATVPVAAATAAEGEGSPPAVAAVAGPPAAAEVGGGVGGSSRARSASSPRGMVRVCDL 
+LLKKKPPQQQHHKAKRNRTCRPPSSSESSSDSDNSGGGGGGGGGGGGGGGTSSNNSEEEE +DDDDEEEEVSEVESFILDQDDLENPMLETASKLLLSGTADGADLRTVDPETQARLEALLE +AAGIGKLSTADGKAFADPEVLRRLTSSVSCALDEAAAALTRMRAESTANAGQSDNRSLAE +ACSEGDVNAVRKLLIEGRSVNEHTEEGESLLCLACSAGYYELAQVLLAMHANVEDRGIKG +DITPLMAAANGGHVKIVKLLLAHKADVNAQSSTGNTALTYACAGGYVDVVKVLLESGASI +EDHNENGHTPLMEAGSAGHVEVARLLLENGAGINTHSNEFKESALTLACYKGHLEMVRFL +LEAGADQEHKTDEMHTALMEACMDGHVEVARLLLDSGAQVNMPADSFESPLTLAACGGHV +ELAALLIERGASLEEVNDEGYTPLMEAAREGHEEMVALLLGQGANINAQTEETQETALTL +ACCGGFLEVADFLIKAGADIELGCSTPLMEAAQEGHLELVKYLLAAGANVHATTATGDTA +LTYACENGHTDVADVLLQAGADLEHESEGGRTPLMKAARAGHVCTVQFLISKGANVNRTT +ANNDHTVLSLACAGGHLAVVELLLAHGADPTHRLKDGSTMLIEAAKGGHTSVVCYLLDYP +NNLLSAPPPDVTQLTPPSHDLNRAPRVPVQALPMVVPPQEPDKPPANVATTLPIRNKAAS +KQKSSSHLPANSQDVQGYITNQSPESIVEEAQGKLTELEQRIKEAIEKNAQLQSLELAHA +DQLTKEKIEELNKTREEQIQKKQKILEELQKVERELQLKTQQQLKKQYLEVKAQRIQLQQ +QQQQSCQHLGLLTPVGVGEQLSEGDYARLQQVDPVLLKDEPQQTAAQMGFAPIQPLAMPQ +ALPLAAGPLPPGSIANLTELQGVIVGQPVLGQAQLAGLGQGILTETQQGLMVASPAQTLN +DTLDDIMAAVSGRASAMSNTPTHSIAASISQPQTPTPSPIISPSAMLPIYPAIDIDAQTE +SNHDTALTLACAGGHEELVQTLLERGASIEHRDKKGFTPLILAATAGHVGVVEILLDNGA +DIEAQSERTKDTPLSLACSGGRQEVVELLLARGANKEHRNVSDYTPLSLAASGGYVNIIK +ILLNAGAEINSRTGSKLGISPLMLAAMNGHTAAVKLLLDMGSDINAQIETNRNTALTLAC +FQGRTEVVSLLLDRKANVEHRAKTGLTPLMEAASGGYAEVGRVLLDKGADVNAPPVPSSR +DTALTIAADKGHYKFCELLIGRGAHIDVRNKKGNTPLWLAANGGHLDVVQLLVQAGADVD +AADNRKITPLMAAFRKGHVKVVRYLVKEVNQFPSDSECMRYIATITDKEMLKKCHLCMES +IVQAKDRQAAEANKNASILLEELDLEKLREESRRLALAAKREKRKEKRRKKKEEQRRKLE +EIEAKNKENFELQAAQEKEKLKVEDEPEVLTEPPSATTTTTIGISATWTTLAGSHGKRNN +TITTTSSKRKNRKNKITPENVQIIFDDPLPISYSQPEKVNGESKSSSTSESGDSDNMRIS +SCSDESSNSNSSRKSDNHSPAVVTTTVSSKKQPSVLVTFPKEERKSVSGKASIKLSETIS +EGTSNSLSTCTKSGPSPLSSPNGKLTVASPKRGQKREEGWKEVVRRSKKVSVPSTVISRV +IGRGGCNINAIREFTGAHIDIDKQKDKTGDRIITIRGGTESTRQATQLINALIKDPDKEI +DELIPKNRLKSSSANSKIGSSAPTTTAANTSLMGIKMTTVALSSTSQTATALTVPAISSA +STHKTIKNPVNNVRPGFPVSLPLAYPPPQFAHALLAAQTFQQIRPPRLPMTHFGGTFPPA +QSTWGPFPVRPLSPARATNSPKPHMVPRHSNQNSSGSQVNSAGSLTSSPTTTTSSSASTV 
+PGTSTNGSPSSPSVRRQLFVTVVKTSNATTTTVTTTASNNNTAPTNATYPMPTAKEHYPV +SSPSSPSPPAQPGGVSRNSPLDCGSASPNKVASSSEQEAGSPPVVETTNTRPPNSSSSSG +SSSAHSNQQQPPGSVSQEPRPPLQQSQVPPPEVRMTVPPLATSSAPVAVPSTAPVTYPMP +QTPMGCPQPTPKMETPAIRPPPHGTTAPHKNSASVQNSSVAVLSVNHIKRPHSVPSSVQL +PSTLSTQSACQNSVHPANKPIAPNFSAPLPFGPFSTLFENSPTSAHAFWGGSVVSSQSTP +ESMLSGKSSYLPNSDPLHQSDTSKAPGFRPPLQRPAPSPSGIVNMDSPYGSVTPSSTHLG +NFASNISGGQMYGPGAPLGGAPAAANFNRQHFSPLSLLTPCSSASNDSSAQSVSSGVRAP +SPAPSSVPLGSEKPSNVSQDRKVPVPIGTERSARIRQTGTSAPSVIGSNLSTSVGHSGIW +SFEGIGGNQDKVDWCNPGMGNPMIHRPMSDPGVFSQHQAMERDSTGIVTPSGTFHQHVPA +GYMDFPKVGGMPFSVYGNAMIPPVAPIPDGAGGPIFNGPHAADPSWNSLIKMVSSSTENN +GPQTVWTGPWAPHMNSVHMNQLG +>sp|O76036|NCTR1_HUMAN Natural cytotoxicity triggering receptor 1 OS=Homo sapiens OX=9606 GN=NCR1 PE=1 SV=1 +MSSTLPALLCVGLCLSQRISAQQQTLPKPFIWAEPHFMVPKEKQVTICCQGNYGAVEYQL +HFEGSLFAVDRPKPPERINKVKFYIPDMNSRMAGQYSCIYRVGELWSEPSNLLDLVVTEM +YDTPTLSVHPGPEVISGEKVTFYCRLDTATSMFLLLKEGRSSHVQRGYGKVQAEFPLGPV +TTAHRGTYRCFGSYNNHAWSFPSEPVKLLVTGDIENTSLAPEDPTFPADTWGTYLLTTET +GLQKDHALWDHTAQNLLRMGLAFLVLVALVWFLVEDWLSRKRTRERASRASTWEGRRRLN +TQTL +>sp|O94864|ST65G_HUMAN STAGA complex 65 subunit gamma OS=Homo sapiens OX=9606 GN=SUPT7L PE=1 SV=1 +MNLQRYWGEIPISSSQTNRSSFDLLPREFRLVEVHDPPLHQPSANKPKPPTMLDIPSEPC +SLTIHTIQLIQHNRRLRNLIATAQAQNQQQTEGVKTEESEPLPSCPGSPPLPDDLLPLDC +KNPNAPFQIRHSDPESDFYRGKGEPVTELSWHSCRQLLYQAVATILAHAGFDCANESVLE +TLTDVAHEYCLKFTKLLRFAVDREARLGQTPFPDVMEQVFHEVGIGSVLSLQKFWQHRIK +DYHSYMLQISKQLSEEYERIVNPEKATEDAKPVKIKEEPVSDITFPVSEELEADLASGDQ +SLPMGVLGAQSERFPSNLEVEASPQASSAEVNASPLWNLAHVKMEPQESEEGNVSGHGVL +GSDVFEEPMSGMSEAGIPQSPDDSDSSYGSHSTDSLMGSSPVFNQRCKKRMRKI +>sp|O95069|KCNK2_HUMAN Potassium channel subfamily K member 2 OS=Homo sapiens OX=9606 GN=KCNK2 PE=1 SV=2 +MLPSASRERPGYRAGVAAPDLLDPKSAAQNSKPRLSFSTKPTVLASRVESDTTINVMKWK +TVSTIFLVVVLYLIIGATVFKALEQPHEISQRTTIVIQKQTFISQHSCVNSTELDELIQQ +IVAAINAGIIPLGNTSNQISHWDLGSSFFFAGTVITTIGFGNISPRTEGGKIFCIIYALL +GIPLFGFLLAGVGDQLGTIFGKGIAKVEDTFIKWNVSQTKIRIISTIIFILFGCVLFVAL 
+PAIIFKHIEGWSALDAIYFVVITLTTIGFGDYVAGGSDIEYLDFYKPVVWFWILVGLAYF +AAVLSMIGDWLRVISKKTKEEVGEFRAHAAEWTANVTAEFKETRRRLSVEIYDKFQRATS +IKRKLSAELAGNHNQELTPCRRTLSVNHLTSERDVLPPLLKTESIYLNGLTPHCAGEEIA +VIENIK +>sp|O95992|CH25H_HUMAN Cholesterol 25-hydroxylase OS=Homo sapiens OX=9606 GN=CH25H PE=1 SV=1 +MSCHNCSDPQVLCSSGQLFLQPLWDHLRSWEALLQSPFFPVIFSITTYVGFCLPFVVLDI +LCSWVPALRRYKIHPDFSPSAQQLLPCLGQTLYQHVMFVFPVTLLHWARSPALLPHEAPE +LLLLLHHILFCLLLFDMEFFVWHLLHHKVPWLYRTFHKVHHQNSSSFALATQYMSVWELF +SLGFFDMMNVTLLGCHPLTTLTFHVVNIWLSVEDHSGYNFPWSTHRLVPFGWYGGVVHHD +LHHSHFNCNFAPYFTHWDKILGTLRTASVPAR +>sp|O75385|ULK1_HUMAN Serine/threonine-protein kinase ULK1 OS=Homo sapiens OX=9606 GN=ULK1 PE=1 SV=2 +MEPGRGGTETVGKFEFSRKDLIGHGAFAVVFKGRHREKHDLEVAVKCINKKNLAKSQTLL +GKEIKILKELKHENIVALYDFQEMANSVYLVMEYCNGGDLADYLHAMRTLSEDTIRLFLQ +QIAGAMRLLHSKGIIHRDLKPQNILLSNPAGRRANPNSIRVKIADFGFARYLQSNMMAAT +LCGSPMYMAPEVIMSQHYDGKADLWSIGTIVYQCLTGKAPFQASSPQDLRLFYEKNKTLV +PTIPRETSAPLRQLLLALLQRNHKDRMDFDEFFHHPFLDASPSVRKSPPVPVPSYPSSGS +GSSSSSSSTSHLASPPSLGEMQQLQKTLASPADTAGFLHSSRDSGGSKDSSCDTDDFVMV +PAQFPGDLVAEAPSAKPPPDSLMCSGSSLVASAGLESHGRTPSPSPPCSSSPSPSGRAGP +FSSSRCGASVPIPVPTQVQNYQRIERNLQSPTQFQTPRSSAIRRSGSTSPLGFARASPSP +PAHAEHGGVLARKMSLGGGRPYTPSPQVGTIPERPGWSGTPSPQGAEMRGGRSPRPGSSA +PEHSPRTSGLGCRLHSAPNLSDLHVVRPKLPKPPTDPLGAVFSPPQASPPQPSHGLQSCR +NLRGSPKLPDFLQRNPLPPILGSPTKAVPSFDFPKTPSSQNLLALLARQGVVMTPPRNRT +LPDLSEVGPFHGQPLGPGLRPGEDPKGPFGRSFSTSRLTDLLLKAAFGTQAPDPGSTESL +QEKPMEIAPSAGFGGSLHPGARAGGTSSPSPVVFTVGSPPSGSTPPQGPRTRMFSAGPTG +SASSSARHLVPGPCSEAPAPELPAPGHGCSFADPITANLEGAVTFEAPDLPEETLMEQEH +TEILRGLRFTLLFVQHVLEIAALKGSASEAAGGPEYQLQESVVADQISLLSREWGFAEQL +VLYLKVAELLSSGLQSAIDQIRAGKLCLSSTVKQVVRRLNELYKASVVSCQGLSLRLQRF +FLDKQRLLDRIHSITAERLIFSHAVQMVQSAALDEMFQHREGCVPRYHKALLLLEGLQHM +LSDQADIENVTKCKLCIERRLSALLTGICA +>rev_sp|A0A075B6I9|LV746_HUMAN Immunoglobulin lambda variable 7-46 OS=Homo sapiens OX=9606 GN=IGLV7-46 PE=3 SV=4 +MRAGSYSLLCYYEAEDEPQAGLLTLAAKGGLLSGSFRAPTWSHKNSTDYILTRPAQGPKQ +QFWYPYHGSTVAGTSSGCTLTVTGGPSVTLSPEQTVVAQSNSGPCCTLLFLFLPTWA 
+>rev_sp|A0A286YF46|SCGR5_HUMAN Small cysteine and glycine repeat-containing protein 5 OS=Homo sapiens OX=9606 GN=SCYGR5 PE=1 SV=1 +MCCCQKQCCGKQQCCGKGCGCGCSCCTRRCCCIVPTSCCGGCCGRCCPCCSSCCGVRYCR +CTTCSGCGGGCGGGCGGCGGCGCCG +>rev_sp|A6NKQ9|CGB1_HUMAN Choriogonadotropin subunit beta variant 1 OS=Homo sapiens OX=9606 GN=CGB1 PE=2 SV=3 +MPGPLRSPSPLSPPPAKSSSSDQFRPDDCTLPHDKPGGCDTTSRRCLACQCSLAVAYSVV +PNVGRPCGPLRISEFRVDRYNCVVQPLAPLVGQLVRTMTPCYGACITTNVTICVPCGEKE +VALTANIPRCRPRLPEKSAWTGGMSLLLLLLLRQFMEMKPARFHPDVRGKVHRERLPIDE +ALVPFTS +>rev_sp|O14522|PTPRT_HUMAN Receptor-type tyrosine-protein phosphatase T OS=Homo sapiens OX=9606 GN=PTPRT PE=1 SV=6 +MFSSLYELAVEYVFKYQELTEVMNSKNNRLTKVIHFVDIINQQQIMECVSCIACFTGSRG +GGNLCHVVTRGERGDYQEQWKELRRVVKLLSRKSPPTDRYAPWGIYQLHQVIRYGDQPRA +MNCIRFIRHIIDEDIDASVFEVQIPGYCGSTKEPWYQMCFQATDMENLMVVSSCNYDFVL +RWFDAVTNPLPHQTVVFAAPQKHSDMLAANIYNSSEGDVSILFPLCRDLPLVDMSRNKDH +NRPLLGISCDEPRVRPTVINLTQFEDKIQSSNTQPDLRSINYYLSRFECVPIATNGCLCA +ELIADHVFVYQEETQVLNVRQARLERVCNFIDVVGENEAMDLMTDIAIFCGTRGAGASCH +VVIPGAEPPNLFKVQRVFGLLGTAYCPVGHDPWSTFHFLRLERIEHYGKKQVTFTRIVYE +ALPETEILTVKIDGYVETDDPWYRVCKVRGVEVLNTVMVISASNEQWIMRWFDKVTEQMP +GQTAIYHRPRHYGDIYNANIYDSHPDGDLVLLRVRSHDYSIINGYRNKNRNEDEKATDWS +ATQGEPLAEYEEKFGYGQGRKMQTIHQLLDAVRIAPQFQDRPYSMEVPDCTRYPHTQITL +TPQSLEGRSGDTFGNVDQSSSSFGEDNRSASLKTTPKDASAVPGMERQAGSQTEKQKKAL +KLYYSYSYANRRRKITLMVGLLIIIFMLLGAIVGAMKVTNDVQKEPEVTNSNQTSAGKTA +LRVCNIKTEGNAKSLAQFYISYSKLPSLPPNWYGNYTKNDGVTFPQTVPLNAPKLEAAFY +HLSDLSSANRYSVPVSFCEIIDAARRSKQLREEKVVLQYVSVPAGRSQAPKLMVTITTDT +ENLPTDTDYEPMSPASIKTAIRTTVPPGFGKATSAKITFSYTTGPYLGVFLHHTENRLKF +VKGRQSSLDASPDLSGVAKYNIEYLTIVGNTENPPKWQIYIKEEFPGGQISELPVAGPVD +EETQVVLEESEMRGEPNSLLLRLRITMFPRLGRLTYHSSTQIVEEAEYQQQNFVYQYQVT +LNYSHCRTVAYGFPEWQLTLQRARIDVIEVNQPGHVPDACKTRTTLPPGPPGTGGEGPRT +LLVRIEYEVDPDLHWLKYNPSDVIHTEAWTGTTTRYEVEKLIIPGDGIISNANPKIWLYT +AGVALLEPPAIPTPPEKVILEAYNSVGSGGDSRIVCRYKSVSRQATDAVSVTASFRRHNV +VRTVMLATDRGNWQQLWLKDHQSWKGGAICQFTANQGVNVEVNQLRLFHPAKRCPHALVR 
+VEDVAIYGPHGKLSVSEFIVQYFHPWFTSIALEAKVWGETVVGSVNWVPNGQPGGNVKVY +VNLAGPSSRDRSSFYYHFDICHTDNEKLTPLLLHAKQGSARGSSNVMMFSGTPVAQDLMP +KEWTNIQEWTFGNTGLAVSYGCNSYHEDFSCGGAASQARAGPLPPLQLRLLLSLALAALS +A +>rev_sp|O75179|ANR17_HUMAN Ankyrin repeat domain-containing protein 17 OS=Homo sapiens OX=9606 GN=ANKRD17 PE=1 SV=3 +MGLQNMHVSNMHPAWPGTWVTQPGNNETSSSVMKILSNWSPDAAHPGNFIPGGAGDPIPA +VPPIMANGYVSFPMGGVKPFDMYGAPVHQHFTGSPTVIGTSDREMAQHQSFVGPDSMPRH +IMPNGMGPNCWDVKDQNGGIGEFSWIGSHGVSTSLNSGIVSPASTGTQRIRASRETGIPV +PVKRDQSVNSPKESGLPVSSPAPSPARVGSSVSQASSDNSASSCPTLLSLPSFHQRNFNA +AAPAGGLPAGPGYMQGGSINSAFNGLHTSSPTVSGYPSDMNVIGSPSPAPRQLPPRFGPA +KSTDSQHLPDSNPLYSSKGSLMSEPTSQSSVVSGGWFAHASTPSNEFLTSFPGFPLPASF +NPAIPKNAPHVSNQCASQTSLTSPLQVSSPVSHPRKIHNVSLVAVSSNQVSASNKHPATT +GHPPPRIAPTEMKPTPQPCGMPTQPMPYTVPATSPVAVPASSTALPPVTMRVEPPPVQSQ +QLPPRPEQSVSGPPQQQNSHASSSGSSSSSNPPRTNTTEVVPPSGAEQESSSAVKNPSAS +GCDLPSNRSVGGPQAPPSPSSPSSVPYHEKATPMPYTANTPATNNNSATTTVTTTTANST +KVVTVFLQRRVSPSSPSGNTSTGPVTSASSSTTTTPSSTLSGASNVQSGSSNQNSHRPVM +HPKPSNTARAPSLPRVPFPGWTSQAPPFTGGFHTMPLRPPRIQQFTQAALLAHAFQPPPY +ALPLSVPFGPRVNNVPNKITKHTSASSIAPVTLATATQSTSSLAVTTMKIGMLSTNAATT +TPASSGIKSNASSSKLRNKPILEDIEKDPDKILANILQTAQRTSETGGRITIIRDGTKDK +QKDIDIHAGTFERIANINCGGRGIVRSIVTSPVSVKKSRRVVEKWGEERKQGRKPSAVTL +KGNPSSLPSPGSKTCTSLSNSTGESITESLKISAKGSVSKREEKPFTVLVSPQKKSSVTT +TVVAPSHNDSKRSSNSNSSEDSCSSIRMNDSDGSESTSSSKSEGNVKEPQSYSIPLPDDF +IIQVNEPTIKNKRNKRKSSTTTITNNRKGHSGALTTWTASIGITTTTTASPPETLVEPED +EVKLKEKEQAAQLEFNEKNKAEIEELKRRQEEKKKRRKEKRKERKAALALRRSEERLKEL +DLEELLISANKNAEAAQRDKAQVISEMCLHCKKLMEKDTITAIYRMCESDSPFQNVEKVL +YRVVKVHGKRFAAMLPTIKRNDAADVDAGAQVLLQVVDLHGGNAALWLPTNGKKNRVDIH +AGRGILLECFKYHGKDAAITLATDRSSPVPPANVDAGKDLLVRGVEAYGGSAAEMLPTLG +TKARHEVNAKRDLLLSVVETRGQFCALTLATNRNTEIQANIDSGMDLLLKVAATHGNMAA +LMLPSIGLKSGTRSNIEAGANLLIKIINVYGGSAALSLPTYDSVNRHEKNAGRALLLEVV +EQRGGSCALSLPTDKTRESQAEIDAGNDLLIEVVGVHGATAALILPTFGKKDRHEISAGR +ELLTQVLEEHGGACALTLATDHNSETQADIDIAPYIPLMASPSIIPSPTPTQPQSISAAI +SHTPTNSMASARGSVAAMIDDLTDNLTQAPSAVMLGQQTETLIGQGLGALQAQGLVPQGV 
+IVGQLETLNAISGPPLPGAALPLAQPMALPQIPAFGMQAATQQPEDKLLVPDVQQLRAYD +GESLQEGVGVPTLLGLHQCSQQQQQQLQIRQAKVELYQKKLQQQTKLQLEREVKQLEELI +KQKKQIQEERTKNLEEIKEKTLQDAHALELSQLQANKEIAEKIRQELETLKGQAEEVISE +PSQNTIYGQVDQSNAPLHSSSKQKSAAKNRIPLTTAVNAPPKDPEQPPVVMPLAQVPVRP +ARNLDHSPPTLQTVDPPPASLLNNPYDLLYCVVSTHGGKAAEILMTSGDKLRHTPDAGHA +LLLEVVALHGGACALSLVTHDNNATTRNVNAGKSILFQVTCVHGARAAKMLPTRGGESEH +ELDAGAQLLVDAVDTHGNECAYTLATDGTATTAHVNAGAALLYKVLELHGEQAAEMLPTS +CGLEIDAGAKILFDAVELFGGCCALTLATEQTEETQANINAGQGLLLAVMEEHGERAAEM +LPTYGEDNVEELSAGREILLAALEVHGGCAALTLPSEFSDAPMNVQAGSDLLLRAVEVHG +DMCAEMLATHMEDTKHEQDAGAELLFRVMELHGKYCALTLASEKFENSHTNIGAGNELLL +RAVEVHGASGAEMLPTHGNENHDEISAGSELLVKVVDVYGGACAYTLATNGTSSQANVDA +KHALLLKVIKVHGGNAAAMLPTIDGKIGRDEVNAHMALLVQALEYYGASCALCLLSEGEE +THENVSRGEILLKRVANVDGESCAEALSRNDSQGANATSEARMRTLAAAAEDLACSVSST +LRRLVEPDAFAKGDATSLKGIGAAELLAELRAQTEPDVTRLDAGDATGSLLLKSATELMP +NELDDQDLIFSEVESVEEEEDDDDEEEESNNSSTGGGGGGGGGGGGGGGSNDSDSSSESS +SPPRCTRNRKAKHHQQQPPKKKLLLDCVRVMGRPSSASRARSSGGVGGGVEAAAPPGAVA +AVAPPSGEGEAATAAAVPVTAKE +>rev_sp|O76036|NCTR1_HUMAN Natural cytotoxicity triggering receptor 1 OS=Homo sapiens OX=9606 GN=NCR1 PE=1 SV=1 +MLTQTNLRRRGEWTSARSARERTRKRSLWDEVLFWVLAVLVLFALGMRLLNQATHDWLAH +DKQLGTETTLLYTGWTDAPFTPDEPALSTNEIDGTVLLKVPESPFSWAHNNYSGFCRYTG +RHATTVPGLPFEAQVKGYGRQVHSSRGEKLLLFMSTATDLRCYFTVKEGSIVEPGPHVSL +TPTDYMETVVLDLLNSPESWLEGVRYICSYQGAMRSNMDPIYFKVKNIREPPKPRDVAFL +SGEFHLQYEVAGYNGQCCITVQKEKPVMFHPEAWIFPKPLTQQQASIRQSLCLGVCLLAP +LTSS +>rev_sp|O94864|ST65G_HUMAN STAGA complex 65 subunit gamma OS=Homo sapiens OX=9606 GN=SUPT7L PE=1 SV=1 +MIKRMRKKCRQNFVPSSGMLSDTSHSGYSSDSDDPSQPIGAESMGSMPEEFVDSGLVGHG +SVNGEESEQPEMKVHALNWLPSANVEASSAQPSAEVELNSPFRESQAGLVGMPLSQDGSA +LDAELEESVPFTIDSVPEEKIKVPKADETAKEPNVIREYEESLQKSIQLMYSHYDKIRHQ +WFKQLSLVSGIGVEHFVQEMVDPFPTQGLRAERDVAFRLLKTFKLCYEHAVDTLTELVSE +NACDFGAHALITAVAQYLLQRCSHWSLETVPEGKGRYFDSEPDSHRIQFPANPNKCDLPL +LDDPLPPSGPCSPLPESEETKVGETQQQNQAQATAILNRLRRNHQILQITHITLSCPESP +IDLMTPPKPKNASPQHLPPDHVEVLRFERPLLDFSSRNTQSSSIPIEGWYRQLN 
+>rev_sp|O95069|KCNK2_HUMAN Potassium channel subfamily K member 2 OS=Homo sapiens OX=9606 GN=KCNK2 PE=1 SV=2 +MKINEIVAIEEGACHPTLGNLYISETKLLPPLVDRESTLHNVSLTRRCPTLEQNHNGALE +ASLKRKISTARQFKDYIEVSLRRRTEKFEATVNATWEAAHARFEGVEEKTKKSIVRLWDG +IMSLVAAFYALGVLIWFWVVPKYFDLYEIDSGGAVYDGFGITTLTIVVFYIADLASWGEI +HKFIIAPLAVFLVCGFLIFIITSIIRIKTQSVNWKIFTDEVKAIGKGFITGLQDGVGALL +FGFLPIGLLAYIICFIKGGETRPSINGFGITTIVTGAFFFSSGLDWHSIQNSTNGLPIIG +ANIAAVIQQILEDLETSNVCSHQSIFTQKQIVITTRQSIEHPQELAKFVTAGIILYLVVV +LFITSVTKWKMVNITTDSEVRSALVTPKTSFSLRPKSNQAASKPDLLDPAAVGARYGPRE +RSASPL +>rev_sp|O95992|CH25H_HUMAN Cholesterol 25-hydroxylase OS=Homo sapiens OX=9606 GN=CH25H PE=1 SV=1 +MRAPVSATRLTGLIKDWHTFYPAFNCNFHSHHLDHHVVGGYWGFPVLRHTSWPFNYGSHD +EVSLWINVVHFTLTTLPHCGLLTVNMMDFFGLSFLEWVSMYQTALAFSSSNQHHVKHFTR +YLWPVKHHLLHWVFFEMDFLLLCFLIHHLLLLLEPAEHPLLAPSRAWHLLTVPFVFMVHQ +YLTQGLCPLLQQASPSFDPHIKYRRLAPVWSCLIDLVVFPLCFGVYTTISFIVPFFPSQL +LAEWSRLHDWLPQLFLQGSSCLVQPDSCNHCS +>rev_sp|O75385|ULK1_HUMAN Serine/threonine-protein kinase ULK1 OS=Homo sapiens OX=9606 GN=ULK1 PE=1 SV=2 +MACIGTLLASLRREICLKCKTVNEIDAQDSLMHQLGELLLLAKHYRPVCGERHQFMEDLA +ASQVMQVAHSFILREATISHIRDLLRQKDLFFRQLRLSLGQCSVVSAKYLENLRRVVQKV +TSSLCLKGARIQDIASQLGSSLLEAVKLYLVLQEAFGWERSLLSIQDAVVSEQLQYEPGG +AAESASGKLAAIELVHQVFLLTFRLGRLIETHEQEMLTEEPLDPAEFTVAGELNATIPDA +FSCGHGPAPLEPAPAESCPGPVLHRASSSASGTPGASFMRTRPGQPPTSGSPPSGVTFVV +PSPSSTGGARAGPHLSGGFGASPAIEMPKEQLSETSGPDPAQTGFAAKLLLDTLRSTSFS +RGFPGKPDEGPRLGPGLPQGHFPGVESLDPLTRNRPPTMVVGQRALLALLNQSSPTKPFD +FSPVAKTPSGLIPPLPNRQLFDPLKPSGRLNRCSQLGHSPQPPSAQPPSFVAGLPDTPPK +PLKPRVVHLDSLNPASHLRCGLGSTRPSHEPASSGPRPSRGGRMEAGQPSPTGSWGPREP +ITGVQPSPTYPRGGGLSMKRALVGGHEAHAPPSPSARAFGLPSTSGSRRIASSRPTQFQT +PSQLNREIRQYNQVQTPVPIPVSAGCRSSSFPGARGSPSPSSSCPPSPSPTRGHSELGAS +AVLSSGSCMLSDPPPKASPAEAVLDGPFQAPVMVFDDTDCSSDKSGGSDRSSHLFGATDA +PSALTKQLQQMEGLSPPSALHSTSSSSSSSGSGSSPYSPVPVPPSKRVSPSADLFPHHFF +EDFDMRDKHNRQLLALLLQRLPASTERPITPVLTKNKEYFLRLDQPSSAQFPAKGTLCQY +VITGISWLDAKGDYHQSMIVEPAMYMPSGCLTAAMMNSQLYRAFGFDAIKVRISNPNARR 
+GAPNSLLINQPKLDRHIIGKSHLLRMAGAIQQLFLRITDESLTRMAHLYDALDGGNCYEM +VLYVSNAMEQFDYLAVINEHKLEKLIKIEKGLLTQSKALNKKNICKVAVELDHKERHRGK +FVVAFAGHGILDKRSFEFKGVTETGGRGPE diff --git a/noxfile.py b/noxfile.py new file mode 100644 index 0000000..51b66c7 --- /dev/null +++ b/noxfile.py @@ -0,0 +1,15 @@ +import nox + + +@nox.session(python=["3.9", "3.10", "3.11", "3.12"], tags=["release"]) +def installation(session): + """Test installation of the package.""" + session.install(".[tests]") + session.run("pytest") + + +@nox.session(python=["3.9"], tags=["dev"], reuse_venv=True) +def test(session): + """Run the test suite.""" + session.install(".[tests]") + session.run("pytest") diff --git a/profasta/__init__.py b/profasta/__init__.py index 3d26221..9f710d9 100644 --- a/profasta/__init__.py +++ b/profasta/__init__.py @@ -8,4 +8,4 @@ __author__ = "David M. Hollenstein" __license__ = "MIT" -__version__ = "0.0.4" +__version__ = "0.0.5" diff --git a/profasta/db.py b/profasta/db.py index 82da902..8295b0c 100644 --- a/profasta/db.py +++ b/profasta/db.py @@ -12,12 +12,15 @@ from __future__ import annotations from dataclasses import dataclass +import logging from pathlib import Path from typing import Any, Iterator, Optional, Protocol from profasta.parser import get_parser, get_writer import profasta.io +logger = logging.getLogger(__name__) + class AbstractDatabaseEntry(Protocol): """A protein entry derived from a protein record in a FASTA file.""" @@ -51,16 +54,21 @@ class ProteinDatabase: Attributes: db: Dictionary mapping protein identifiers to protein entries. - imported_fasta_files: List of FASTA files that have been imported into the + added_fasta_files: List of FASTA files that have been imported into the database. + skipped_fasta_entries: Dictionary mapping added FASTA names to lists of FASTA + entry headers that could not be parsed by the header parser, and thus were + not added to the database. 
""" db: dict[str, AbstractDatabaseEntry] - imported_fasta_files: list[str] + added_fasta_files: list[str] + skipped_fasta_entries: dict[str, list] def __init__(self): self.db = {} - self.imported_fasta_files = [] + self.added_fasta_files = [] + self.skipped_fasta_entries = {} def add_fasta( self, @@ -68,6 +76,7 @@ def add_fasta( header_parser: str, fasta_name: Optional[str] = None, overwrite: bool = False, + skip_invalid: bool = False, ): """Add protein entries from a FASTA file to the database. @@ -80,22 +89,50 @@ def add_fasta( overwrite: If True, overwrite an existing entry with the same identifier. If False and an entry with the same identifier already exists, a KeyError will be raised. + skip_invalid: If True, entries with a non-parsable header are skipped. If + False, a ValueError is raised when an entry is encountered which header + could not be parsed by the header_parser. Headers of skipped entries are + stored in the skipped_fasta_entries attribute. """ - if fasta_name is None: - fasta_name = Path(path).name - self.imported_fasta_files.append(fasta_name) - + fasta_name = fasta_name if fasta_name is not None else Path(path).name parser = get_parser(header_parser) + parsed_protein_entries: list[DatabaseEntry] = [] + skipped_entry_headers: list[str] = [] + with open(path, "r") as file: for fasta_record in profasta.io.parse_fasta(file): - parsed_header = parser.parse(fasta_record.header) + try: + parsed_header = parser.parse(fasta_record.header) + except ValueError as error: + if skip_invalid: + skipped_entry_headers.append(fasta_record.header) + continue + else: + raise ValueError( + f"FASTA header could not be parsed with the " + f"'{header_parser}' parser: '{fasta_record.header}'" + ) from error protein_entry = DatabaseEntry( parsed_header.identifier, parsed_header.header, fasta_record.sequence, parsed_header.header_fields, ) - self.add_entry(protein_entry, overwrite) + parsed_protein_entries.append(protein_entry) + + 
self.added_fasta_files.append(fasta_name) + self.skipped_fasta_entries[fasta_name] = skipped_entry_headers + for protein_entry in parsed_protein_entries: + self.add_entry(protein_entry, overwrite) + + if skipped_entry_headers: + num_skipped = len(skipped_entry_headers) + num_total = num_skipped + len(parsed_protein_entries) + logger.warning( + f"Skipped {num_skipped}/{num_total} entries while adding " + f"'{fasta_name}' to a ProteinDatabase because their headers could not " + f"be parsed:" + ) def add_entry(self, protein_entry: AbstractDatabaseEntry, overwrite: bool = False): """Add a protein entry to the database. @@ -151,6 +188,15 @@ def get(self, identifier: str, default: Any = None) -> DatabaseEntry | Any: """Get a protein entry by its identifier or return a default value.""" return self.db.get(identifier, default) + def keys(self): + return self.db.keys() + + def values(self): + return self.db.values() + + def items(self): + return self.db.items() + def __getitem__(self, identifier) -> AbstractDatabaseEntry: return self.db[identifier] diff --git a/profasta/decoy.py b/profasta/decoy.py index 8d8f479..bb7d9ef 100644 --- a/profasta/decoy.py +++ b/profasta/decoy.py @@ -1,3 +1,10 @@ +"""Functions for creating decoy databases. + +Functions: + create_decoy_db: Create a decoy database by reversing the sequences of the input + database records. +""" + from copy import deepcopy from profasta.db import ProteinDatabase @@ -20,7 +27,7 @@ def create_decoy_db( decoy_db = ProteinDatabase() for protein in db: decoy_entry = deepcopy(db[protein]) - decoy_entry.sequence = reverse_sequence( + decoy_entry.sequence = _reverse_sequence( decoy_entry.sequence, keep_nterm=keep_nterm, keep_nterm_methionine=keep_nterm_methionine, @@ -29,7 +36,7 @@ def create_decoy_db( return decoy_db -def reverse_sequence( +def _reverse_sequence( sequence: str, keep_nterm: bool = False, keep_nterm_methionine: bool = True ) -> str: """Create a decoy sequence by reversing the input sequence. 
diff --git a/profasta/io.py b/profasta/io.py index 9063f30..77f3ed5 100644 --- a/profasta/io.py +++ b/profasta/io.py @@ -46,6 +46,8 @@ def parse_fasta(file_object: IO[str]) -> Generator[FastaRecord, None, None]: Lines starting with ">" are header lines, all others are sequence lines. Each header line is followed by one or multiple sequence lines. Multiple sequence lines are joined by removing new line characters into one single sequence string. + Sequence strings are converted to uppercase, and spaces and trailing "*" characters + are removed. Args: file_object: A file object to parse. @@ -60,7 +62,7 @@ def parse_fasta(file_object: IO[str]) -> Generator[FastaRecord, None, None]: for block in fasta_text.split("\n>")[1:]: lines = block.split("\n") header = lines[0].strip() - sequence = "".join(lines[1:]).replace(" ", "") + sequence = "".join(lines[1:]).replace(" ", "").rstrip("*").upper() yield FastaRecord(header, sequence) diff --git a/profasta/parser.py b/profasta/parser.py index ae5ae8c..3b75eb8 100644 --- a/profasta/parser.py +++ b/profasta/parser.py @@ -9,9 +9,9 @@ Classes: AbstractParsedHeader (Protocol): Interface for representing a parsed FASTA header. + AbstractHeaderParser (Protocol): Interface for a FASTA header parser. + AbstractHeaderWriter (Protocol): Interface for a FASTA header writer. ParsedHeader: Representation of a parsed FASTA header. - HeaderParser (Protocol): Interface for a FASTA header parser. - HeaderWriter (Protocol): Interface for a FASTA header writer. DefaultParser: Default FASTA header parser. UniprotParser: Parser for Uniprot FASTA headers. UniprotLikeParser: Parser for less strict Uniprot like FASTA headers. @@ -53,31 +53,20 @@ class AbstractParsedHeader(Protocol): header_fields: dict[str, str] -@dataclass -class ParsedHeader: - """Parsed FASTA header. - - Attributes: - identifier: The unique identifier of the FASTA entry. - header: The FASTA header, not containing the starting ">" character. 
- header_fields: The parsed header fields as a dictionary. - """ - - identifier: str - header: str - header_fields: dict[str, str] = field(default_factory=dict) - - -class HeaderParser(Protocol): +class AbstractHeaderParser(Protocol): """Abstract header parser.""" @classmethod def parse(self, header: str) -> AbstractParsedHeader: - """Parse a FASTA header string into a ParsedHeader object.""" + """Parse a FASTA header string into a ParsedHeader object. + + Raises: + ValueError: If the header could not be parsed. + """ ... -class HeaderWriter(Protocol): +class AbstractHeaderWriter(Protocol): """Abstract header writer.""" @classmethod @@ -86,15 +75,29 @@ def write(self, parsed_header: AbstractParsedHeader) -> str: ... +@dataclass +class ParsedHeader: + """Parsed FASTA header. + + Attributes: + identifier: The unique identifier of the FASTA entry. + header: The FASTA header, not containing the starting ">" character. + header_fields: The parsed header fields as a dictionary. + """ + + identifier: str + header: str + header_fields: dict[str, str] = field(default_factory=dict) + + class DefaultParser: """Default FASTA header parser. The `parse` method returns a ParsedHeader object with the identifier being the first whitespace-separated word of the header. The rest of the header is stored in the "description" field of the `header_fields` dictionary, which might be an - empty string. - - The `write` method returns the original `header` string from the parsed_header. + empty string. This parser is guaranteed to work for any FASTA header string and + never fail. """ @classmethod @@ -159,7 +162,11 @@ class UniprotParser: @classmethod def parse(cls, header: str) -> ParsedHeader: - """Parse a FASTA header string into a ParsedHeader object.""" + """Parse a FASTA header string into a ParsedHeader object. + + Raises: + ValueError: If the header could not be parsed. 
+ """ match = cls.header_pattern.match(header) if match is None: raise ValueError(f"Header does not match the UniProt pattern: {header}") @@ -225,7 +232,11 @@ class UniprotLikeParser: @classmethod def parse(cls, header: str) -> ParsedHeader: - """Parse a FASTA header string into a ParsedHeader object.""" + """Parse a FASTA header string into a ParsedHeader object. + + Raises: + ValueError: If the header could not be parsed. + """ split_header = header.split(maxsplit=1) try: db, _id, entry = split_header[0].split("|") @@ -294,34 +305,34 @@ def write(cls, parsed_header: AbstractParsedHeader) -> str: return " ".join(header_entries) -def register_parser(name: str, parser: HeaderParser): +def register_parser(name: str, parser: AbstractHeaderParser): """Register a custom parser by name.""" PARSER_REGISTRY[name] = parser -def get_parser(parser_name: str) -> HeaderParser: +def get_parser(parser_name: str) -> AbstractHeaderParser: """Get a registered parser by name.""" return PARSER_REGISTRY[parser_name] -def register_writer(name: str, parser: HeaderWriter): +def register_writer(name: str, parser: AbstractHeaderWriter): """Register a custom writer by name.""" WRITER_REGISTRY[name] = parser -def get_writer(parser_name: str) -> HeaderWriter: +def get_writer(parser_name: str) -> AbstractHeaderWriter: """Get a registered writer by name.""" return WRITER_REGISTRY[parser_name] -PARSER_REGISTRY: dict[str, HeaderParser] = { +PARSER_REGISTRY: dict[str, AbstractHeaderParser] = { "default": DefaultParser, "uniprot": UniprotParser, "uniprot_like": UniprotLikeParser, } -WRITER_REGISTRY: dict[str, HeaderWriter] = { +WRITER_REGISTRY: dict[str, AbstractHeaderWriter] = { "default": DefaultWriter, "decoy": DecoyWriter, "uniprot": UniprotWriter, diff --git a/pyproject.toml b/pyproject.toml index d8a5447..e961c6e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ dynamic = ["version"] [project.optional-dependencies] tests = ["pytest"] +dev = ["nox", "pytest"] [project.urls] 
repository = "https://github.com/hollenstein/profasta" diff --git a/tests/unit/test_db.py b/tests/unit/test_db.py new file mode 100644 index 0000000..347b166 --- /dev/null +++ b/tests/unit/test_db.py @@ -0,0 +1,66 @@ +import io +import pytest + +import profasta.db + + +class TestProteinDatabase: + def test_add_fasta_raises_value_error_when_a_header_cannot_be_parsed(self, tmp_path): # fmt: skip + fasta_entries = [ + ">xx|uniprot_like_01|entry\nMKKK", + ">not_uniprot_like_entry\nMRRR", + ] + fasta_path = tmp_path / "test.fasta" + with open(fasta_path, "w") as file: + file.write("\n".join(fasta_entries)) + + protein_db = profasta.db.ProteinDatabase() + with pytest.raises(ValueError): + protein_db.add_fasta(fasta_path, header_parser="uniprot_like") + + def test_add_fasta_adds_valid_entries_when_skip_invalid_is_true(self, tmp_path): + fasta_entries = [ + ">xx|uniprot_like_01|entry\nMKKK", + ">not_uniprot_like_entry\nMRRR", + ] + fasta_path = tmp_path / "test.fasta" + with open(fasta_path, "w") as file: + file.write("\n".join(fasta_entries)) + + protein_db = profasta.db.ProteinDatabase() + protein_db.add_fasta( + fasta_path, header_parser="uniprot_like", skip_invalid=True + ) + assert len(protein_db.db) == 1 and "uniprot_like_01" in protein_db.db + + def test_add_fasta_adds_records_invalid_entry_headers_when_skip_invalid_is_true(self, tmp_path): # fmt: skip + fasta_entries = [ + ">xx|uniprot_like_01|entry\nMKKK", + ">not_uniprot_like_entry\nMRRR", + ] + fasta_path = tmp_path / "test.fasta" + with open(fasta_path, "w") as file: + file.write("\n".join(fasta_entries)) + + protein_db = profasta.db.ProteinDatabase() + protein_db.add_fasta(fasta_path, header_parser="uniprot_like", skip_invalid=True) # fmt: skip + skipped_headers = protein_db.skipped_fasta_entries["test.fasta"] + assert len(skipped_headers) == 1 and "not_uniprot_like_entry" in skipped_headers + + def test_add_fasta_does_not_add_any_entries_when_failing_to_parse_a_header(self, tmp_path): # fmt: skip + 
fasta_entries = [ + ">xx|uniprot_like_01|entry\nMKKK", + ">not_uniprot_like_entry\nMRRR", + ] + fasta_path = tmp_path / "test.fasta" + with open(fasta_path, "w") as file: + file.write("\n".join(fasta_entries)) + + protein_db = profasta.db.ProteinDatabase() + try: + protein_db.add_fasta(fasta_path, header_parser="uniprot_like") + except ValueError: + pass + + assert len(protein_db.db) == 0 + assert len(protein_db.added_fasta_files) == 0 diff --git a/tests/unit/test_decoy.py b/tests/unit/test_decoy.py index 8e4cb4e..a31e201 100644 --- a/tests/unit/test_decoy.py +++ b/tests/unit/test_decoy.py @@ -14,7 +14,7 @@ class TestReverseSequence: ], ) def test_without_keeping_nterminal_resiudes(self, sequence, expected_result): - assert decoy.reverse_sequence(sequence, keep_nterm=False, keep_nterm_methionine=False) == expected_result # fmt: skip + assert decoy._reverse_sequence(sequence, keep_nterm=False, keep_nterm_methionine=False) == expected_result # fmt: skip @pytest.mark.parametrize( "sequence, expected_result", @@ -26,7 +26,7 @@ def test_without_keeping_nterminal_resiudes(self, sequence, expected_result): ], ) def test_with_keeping_nterminal_resiude(self, sequence, expected_result): - assert decoy.reverse_sequence(sequence, keep_nterm=True, keep_nterm_methionine=False) == expected_result # fmt: skip + assert decoy._reverse_sequence(sequence, keep_nterm=True, keep_nterm_methionine=False) == expected_result # fmt: skip @pytest.mark.parametrize( "sequence, expected_result", @@ -38,9 +38,9 @@ def test_with_keeping_nterminal_resiude(self, sequence, expected_result): ], ) def test_with_keeping_nterminal_methionione_resiude(self, sequence, expected_result): # fmt: skip - assert decoy.reverse_sequence(sequence, keep_nterm=False, keep_nterm_methionine=True) == expected_result # fmt: skip + assert decoy._reverse_sequence(sequence, keep_nterm=False, keep_nterm_methionine=True) == expected_result # fmt: skip def 
test_keep_nterminal_residue_overrules_keep_nterminal_methionine_residue(self): sequence = "ABCDEF" expected_result = "AFEDCB" - assert decoy.reverse_sequence(sequence, keep_nterm=True, keep_nterm_methionine=True) == expected_result # fmt: skip + assert decoy._reverse_sequence(sequence, keep_nterm=True, keep_nterm_methionine=True) == expected_result # fmt: skip diff --git a/tests/unit/test_io.py b/tests/unit/test_io.py index 9fda389..bf5e980 100644 --- a/tests/unit/test_io.py +++ b/tests/unit/test_io.py @@ -4,13 +4,22 @@ import profasta.io -def test_parse_fasta_file(): - file_buffer = io.StringIO(f">H1\nMKKK\n>H2\nMAAA") +@pytest.mark.parametrize( + "fasta_content, expected_header, expected_sequence", + [ + (f">H1\nMKKK\n>H2\nMAAA", "H1", "MKKK"), + (f">H1\nMKKK\nRRR", "H1", "MKKKRRR"), + (f">H1\nMKK K\nRR R", "H1", "MKKKRRR"), + (f">H1\nMKKK\nRRR*", "H1", "MKKKRRR"), + ], +) +def test_parse_fasta_file(fasta_content, expected_header, expected_sequence): + file_buffer = io.StringIO(fasta_content) fasta_parser = profasta.io.parse_fasta(file_buffer) record = next(fasta_parser) - assert record.header == "H1" - assert record.sequence == "MKKK" + assert record.header == expected_header + assert record.sequence == expected_sequence def test_write_fasta_file():