diff --git a/README.md b/README.md index 2cddb86..764a5c0 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ The following code snippet shows how to import a FASTA file containing UniProt p ```python >>> import profasta >>> ->>> fasta_path = "./example_data/uniprot_hsapiens_10entries.fasta" +>>> fasta_path = "./examples/uniprot_hsapiens_10entries.fasta" >>> db = profasta.db.ProteinDatabase() >>> db.add_fasta(fasta_path, header_parser="uniprot") >>> protein_record = db["O75385"] @@ -32,6 +32,8 @@ The following code snippet shows how to import a FASTA file containing UniProt p ULK1 ``` +For more examples how to use the ProFASTA library please refer to the [code snippets](examples/code_snippets.ipynb) Jupyter notebook. + ## Requirements Python >= 3.9 diff --git a/examples/code_snippets.ipynb b/examples/code_snippets.ipynb new file mode 100644 index 0000000..b32ebb6 --- /dev/null +++ b/examples/code_snippets.ipynb @@ -0,0 +1,172 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "273cf753", + "metadata": {}, + "source": [ + "# Code snippets for working with the proFASTA library" + ] + }, + { + "cell_type": "markdown", + "id": "8d8a7af6", + "metadata": {}, + "source": [ + "## Removing invalid characters from imported protein sequences" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "830d37b9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MEPG\n" + ] + } + ], + "source": [ + "import profasta\n", + "\n", + "\n", + "def cleanup_protein_sequences(\n", + " db: profasta.ProteinDatabase, alphabet=\"ABCDEFGHIJKLMNOPQRSTUVWXYZ\"\n", + " ) -> None:\n", + " \"\"\"Remove non-alphabet characters from protein sequences in the ProteinDatabase.\n", + " \n", + " Args:\n", + " db: A profasta.ProteinDatabase instance.\n", + " alphabet: List of characters that are allowed in the protein entry sequences.\n", + " \"\"\"\n", + " for entry in db.values(): \n", + " entry.sequence = \"\".join([aa for aa in entry.sequence if aa in alphabet])\n", + "\n", + "\n", + "fasta_path = \"./uniprot_hsapiens_10entries.fasta\"\n", + "db = profasta.db.ProteinDatabase()\n", + "db.add_fasta(fasta_path, header_parser=\"uniprot\")\n", + "db[\"O75385\"].sequence = \"MEPG_-+123\"\n", + "cleanup_protein_sequences(db)\n", + "\n", + "print(db[\"O75385\"].sequence)" + ] + }, + { + "cell_type": "markdown", + "id": "0c5bea99", + "metadata": {}, + "source": [ + "## Converting FASTA headers into a UniProt like format" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1996bde3", + "metadata": {}, + "outputs": [], + "source": [ + "import profasta\n", + "import profasta.parser\n", + "\n", + "\n", + "class CustomHeaderParser:\n", + " \"\"\"Custom header parser.\"\"\"\n", + "\n", + " @classmethod\n", + " def parse(cls, header: str) -> profasta.parser.ParsedHeader:\n", + " \"\"\"Parse a FASTA header string into a ParsedHeader object.\n", + " \n", + " Header format example:\n", + " >ProteinID hypothetical protein name\n", + " \"\"\"\n", + " split_header = header.split(maxsplit=1)\n", + " _id = split_header[0]\n", + "\n", + " fields = {\n", + " \"db\": \"xx\",\n", + " \"identifier\": _id,\n", + " \"entry_name\": f\"{_id}_CUSTOM\",\n", + " \"gene_name\": _id,\n", + " }\n", + " if len(split_header) > 1:\n", + " fields[\"protein_name\"] = split_header[1]\n", + " return profasta.parser.ParsedHeader(_id, header, fields)\n", + "\n", + "# Register the custom header parser so that it can be used by the ProteinDatabase.\n", + "profasta.parser.register_parser(\"custom_parser\", CustomHeaderParser)\n", + "\n", + "fasta_path = \"./custom_header_format.fasta\"\n", + "converted_fasta_path = \"./custom_header_format.uniprot-like.fasta\"\n", + "protein_db = profasta.ProteinDatabase()\n", + "\n", + "# Specify the custom header parser to use for adding the FASTA file.\n", + "protein_db.add_fasta(fasta_path, header_parser=\"custom_parser\")\n", + "\n", + "# Write the ProteinDatabase to a new FASTA file using the uniprot-like header writer.\n", + "protein_db.write_fasta(converted_fasta_path, header_writer=\"uniprot_like\")" + ] + }, + { + "cell_type": "markdown", + "id": "697f8065", + "metadata": {}, + "source": [ + "## Create a combined FASTA file with added decoy entries\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bc15636a", + "metadata": {}, + "outputs": [], + "source": [ + "import profasta\n", + "\n", + "fasta_path = \"./uniprot_hsapiens_10entries.fasta\"\n", + "decoy_fasta_path = \"./uniprot_hsapiens_10entries_DECOY.fasta\"\n", + "\n", + "# Import the FASTA file\n", + "db = profasta.db.ProteinDatabase()\n", + "db.add_fasta(fasta_path, header_parser=\"uniprot\")\n", + "\n", + "# Create the new FASTA file and write the original entries to it.\n", + "db.write_fasta(decoy_fasta_path, header_writer=\"uniprot\")\n", + "\n", + "# Create a decoy database from the original database, containing reversed sequences.\n", + "decoy_db = profasta.create_decoy_db(db, keep_nterm_methionine=True)\n", + "\n", + "# Append the decoy entries to the new FASTA file.\n", + "decoy_db.write_fasta(decoy_fasta_path, header_writer=\"decoy\", append=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/custom_header_format.fasta b/examples/custom_header_format.fasta new file mode 100644 index 0000000..12af611 --- /dev/null +++ b/examples/custom_header_format.fasta @@ -0,0 +1,3 @@ +>ProteinID hypothetical protein name +MAWTPLFLFLLTCCPGSNSQAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQK +PGQAPRTLIYDTSNKHSWTPARFSGSLLGGKAALTLLGAQPEDEAEYYCLLSYSGAR \ No newline at end of file diff --git a/examples/custom_header_format.uniprot-like.fasta b/examples/custom_header_format.uniprot-like.fasta new file mode 100644 index 0000000..ccdde3b --- /dev/null +++ b/examples/custom_header_format.uniprot-like.fasta @@ -0,0 +1,3 @@ +>xx|ProteinID|ProteinID_CUSTOM hypothetical protein name GN=ProteinID +MAWTPLFLFLLTCCPGSNSQAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQK +PGQAPRTLIYDTSNKHSWTPARFSGSLLGGKAALTLLGAQPEDEAEYYCLLSYSGAR diff --git a/example_data/uniprot_hsapiens_10entries.fasta b/examples/uniprot_hsapiens_10entries.fasta similarity index 100% rename from example_data/uniprot_hsapiens_10entries.fasta rename to examples/uniprot_hsapiens_10entries.fasta diff --git a/examples/uniprot_hsapiens_10entries_DECOY.fasta b/examples/uniprot_hsapiens_10entries_DECOY.fasta new file mode 100644 index 0000000..3c462f8 --- /dev/null +++ b/examples/uniprot_hsapiens_10entries_DECOY.fasta @@ -0,0 +1,262 @@ +>sp|A0A075B6I9|LV746_HUMAN Immunoglobulin lambda variable 7-46 OS=Homo sapiens OX=9606 GN=IGLV7-46 PE=3 SV=4 +MAWTPLFLFLLTCCPGSNSQAVVTQEPSLTVSPGGTVTLTCGSSTGAVTSGHYPYWFQQK +PGQAPRTLIYDTSNKHSWTPARFSGSLLGGKAALTLLGAQPEDEAEYYCLLSYSGAR +>sp|A0A286YF46|SCGR5_HUMAN Small cysteine and glycine repeat-containing protein 5 OS=Homo sapiens OX=9606 GN=SCYGR5 PE=1 SV=1 +MGCCGCGGCGGCGGGCGGGCGSCTTCRCYRVGCCSSCCPCCRGCCGGCCSTPVICCCRRT +CCSCGCGCGKGCCQQKGCCQKQCCC +>sp|A6NKQ9|CGB1_HUMAN Choriogonadotropin subunit beta variant 1 OS=Homo sapiens OX=9606 GN=CGB1 PE=2 SV=3 +MSTFPVLAEDIPLRERHVKGRVDPHFRAPKMEMFQRLLLLLLLSMGGTWASKEPLRPRCR +PINATLAVEKEGCPVCITVNTTICAGYCPTMTRVLQGVLPALPQVVCNYRDVRFESIRLP +GCPRGVNPVVSYAVALSCQCALCRRSTTDCGGPKDHPLTCDDPRFQDSSSSKAPPPSLPS +PSRLPGP +>sp|O14522|PTPRT_HUMAN Receptor-type tyrosine-protein phosphatase T OS=Homo sapiens OX=9606 GN=PTPRT PE=1 SV=6 +MASLAALALSLLLRLQLPPLPGARAQSAAGGCSFDEHYSNCGYSVALGTNGFTWEQINTW +EKPMLDQAVPTGSFMMVNSSGRASGQKAHLLLPTLKENDTHCIDFHYYFSSRDRSSPGAL +NVYVKVNGGPQGNPVWNVSGVVTEGWVKAELAISTFWPHFYQVIFESVSLKGHPGYIAVD +EVRVLAHPCRKAPHFLRLQNVEVNVGQNATFQCIAGGKWSQHDKLWLQQWNGRDTALMVT +RVVNHRRFSATVSVADTAQRSVSKYRCVIRSDGGSGVSNYAELIVKEPPTPIAPPELLAV +GATYLWIKPNANSIIGDGPIILKEVEYRTTTGTWAETHIVDSPNYKLWHLDPDVEYEIRV +LLTRPGEGGTGPPGPPLTTRTKCADPVHGPQNVEIVDIRARQLTLQWEPFGYAVTRCHSY +NLTVQYQYVFNQQQYEAEEVIQTSSHYTLRGLRPFMTIRLRLLLSNPEGRMESEELVVQT +EEDVPGAVPLESIQGGPFEEKIYIQWKPPNETNGVITLYEINYKAVGSLDPSADLSSQRG +KVFKLRNETHHLFVGLYPGTTYSFTIKASTAKGFGPPVTTRIATKISAPSMPEYDTDTPL +NETDTTITVMLKPAQSRGAPVSVYQLVVKEERLQKSRRAADIIECFSVPVSYRNASSLDS +LHYFAAELKPANLPVTQPFTVGDNKTYNGYWNPPLSPLKSYSIYFQALSKANGETKINCV +RLATKGASTQNSNTVEPEKQVDNTVKMAGVIAGLLMFIIILLGVMLTIKRRRNAYSYSYY +LKLAKKQKETQSGAQREMGPVASADKPTTKLSASRNDEGFSSSSQDVNGFTDGSRGELSQ +PTLTIQTHPYRTCDPVEMSYPRDQFQPAIRVADLLQHITQMKRGQGYGFKEEYEALPEGQ +TASWDTAKEDENRNKNRYGNIISYDHSRVRLLVLDGDPHSDYINANYIDGYHRPRHYIAT +QGPMQETVKDFWRMIWQENSASIVMVTNLVEVGRVKCVRYWPDDTEVYGDIKVTLIETEP +LAEYVIRTFTVQKKGYHEIRELRLFHFTSWPDHGVPCYATGLLGFVRQVKFLNPPEAGPI +VVHCSAGAGRTGCFIAIDTMLDMAENEGVVDIFNCVRELRAQRVNLVQTEEQYVFVHDAI +LEACLCGNTAIPVCEFRSLYYNISRLDPQTNSSQIKDEFQTLNIVTPRVRPEDCSIGLLP +RNHDKNRSMDVLPLDRCLPFLISVDGESSNYINAALMDSHKQPAAFVVTQHPLPNTVADF +WRLVFDYNCSSVVMLNEMDTAQFCMQYWPEKTSGCYGPIQVEFVSADIDEDIIHRIFRIC +NMARPQDGYRIVQHLQYIGWPAYRDTPPSKRSLLKVVRRLEKWQEQYDGREGRTVVHCLN +GGGRSGTFCAICSVCEMIQQQNIIDVFHIVKTLRNNKSNMVETLEQYKFVYEVALEYLSS +F +>sp|O75179|ANR17_HUMAN Ankyrin repeat domain-containing protein 17 OS=Homo sapiens OX=9606 GN=ANKRD17 PE=1 SV=3 +MEKATVPVAAATAAEGEGSPPAVAAVAGPPAAAEVGGGVGGSSRARSASSPRGMVRVCDL +LLKKKPPQQQHHKAKRNRTCRPPSSSESSSDSDNSGGGGGGGGGGGGGGGTSSNNSEEEE +DDDDEEEEVSEVESFILDQDDLENPMLETASKLLLSGTADGADLRTVDPETQARLEALLE +AAGIGKLSTADGKAFADPEVLRRLTSSVSCALDEAAAALTRMRAESTANAGQSDNRSLAE +ACSEGDVNAVRKLLIEGRSVNEHTEEGESLLCLACSAGYYELAQVLLAMHANVEDRGIKG +DITPLMAAANGGHVKIVKLLLAHKADVNAQSSTGNTALTYACAGGYVDVVKVLLESGASI +EDHNENGHTPLMEAGSAGHVEVARLLLENGAGINTHSNEFKESALTLACYKGHLEMVRFL +LEAGADQEHKTDEMHTALMEACMDGHVEVARLLLDSGAQVNMPADSFESPLTLAACGGHV +ELAALLIERGASLEEVNDEGYTPLMEAAREGHEEMVALLLGQGANINAQTEETQETALTL +ACCGGFLEVADFLIKAGADIELGCSTPLMEAAQEGHLELVKYLLAAGANVHATTATGDTA +LTYACENGHTDVADVLLQAGADLEHESEGGRTPLMKAARAGHVCTVQFLISKGANVNRTT +ANNDHTVLSLACAGGHLAVVELLLAHGADPTHRLKDGSTMLIEAAKGGHTSVVCYLLDYP +NNLLSAPPPDVTQLTPPSHDLNRAPRVPVQALPMVVPPQEPDKPPANVATTLPIRNKAAS +KQKSSSHLPANSQDVQGYITNQSPESIVEEAQGKLTELEQRIKEAIEKNAQLQSLELAHA +DQLTKEKIEELNKTREEQIQKKQKILEELQKVERELQLKTQQQLKKQYLEVKAQRIQLQQ +QQQQSCQHLGLLTPVGVGEQLSEGDYARLQQVDPVLLKDEPQQTAAQMGFAPIQPLAMPQ +ALPLAAGPLPPGSIANLTELQGVIVGQPVLGQAQLAGLGQGILTETQQGLMVASPAQTLN +DTLDDIMAAVSGRASAMSNTPTHSIAASISQPQTPTPSPIISPSAMLPIYPAIDIDAQTE +SNHDTALTLACAGGHEELVQTLLERGASIEHRDKKGFTPLILAATAGHVGVVEILLDNGA +DIEAQSERTKDTPLSLACSGGRQEVVELLLARGANKEHRNVSDYTPLSLAASGGYVNIIK +ILLNAGAEINSRTGSKLGISPLMLAAMNGHTAAVKLLLDMGSDINAQIETNRNTALTLAC +FQGRTEVVSLLLDRKANVEHRAKTGLTPLMEAASGGYAEVGRVLLDKGADVNAPPVPSSR +DTALTIAADKGHYKFCELLIGRGAHIDVRNKKGNTPLWLAANGGHLDVVQLLVQAGADVD +AADNRKITPLMAAFRKGHVKVVRYLVKEVNQFPSDSECMRYIATITDKEMLKKCHLCMES +IVQAKDRQAAEANKNASILLEELDLEKLREESRRLALAAKREKRKEKRRKKKEEQRRKLE +EIEAKNKENFELQAAQEKEKLKVEDEPEVLTEPPSATTTTTIGISATWTTLAGSHGKRNN +TITTTSSKRKNRKNKITPENVQIIFDDPLPISYSQPEKVNGESKSSSTSESGDSDNMRIS +SCSDESSNSNSSRKSDNHSPAVVTTTVSSKKQPSVLVTFPKEERKSVSGKASIKLSETIS +EGTSNSLSTCTKSGPSPLSSPNGKLTVASPKRGQKREEGWKEVVRRSKKVSVPSTVISRV +IGRGGCNINAIREFTGAHIDIDKQKDKTGDRIITIRGGTESTRQATQLINALIKDPDKEI +DELIPKNRLKSSSANSKIGSSAPTTTAANTSLMGIKMTTVALSSTSQTATALTVPAISSA +STHKTIKNPVNNVRPGFPVSLPLAYPPPQFAHALLAAQTFQQIRPPRLPMTHFGGTFPPA +QSTWGPFPVRPLSPARATNSPKPHMVPRHSNQNSSGSQVNSAGSLTSSPTTTTSSSASTV +PGTSTNGSPSSPSVRRQLFVTVVKTSNATTTTVTTTASNNNTAPTNATYPMPTAKEHYPV +SSPSSPSPPAQPGGVSRNSPLDCGSASPNKVASSSEQEAGSPPVVETTNTRPPNSSSSSG +SSSAHSNQQQPPGSVSQEPRPPLQQSQVPPPEVRMTVPPLATSSAPVAVPSTAPVTYPMP +QTPMGCPQPTPKMETPAIRPPPHGTTAPHKNSASVQNSSVAVLSVNHIKRPHSVPSSVQL +PSTLSTQSACQNSVHPANKPIAPNFSAPLPFGPFSTLFENSPTSAHAFWGGSVVSSQSTP +ESMLSGKSSYLPNSDPLHQSDTSKAPGFRPPLQRPAPSPSGIVNMDSPYGSVTPSSTHLG +NFASNISGGQMYGPGAPLGGAPAAANFNRQHFSPLSLLTPCSSASNDSSAQSVSSGVRAP +SPAPSSVPLGSEKPSNVSQDRKVPVPIGTERSARIRQTGTSAPSVIGSNLSTSVGHSGIW +SFEGIGGNQDKVDWCNPGMGNPMIHRPMSDPGVFSQHQAMERDSTGIVTPSGTFHQHVPA +GYMDFPKVGGMPFSVYGNAMIPPVAPIPDGAGGPIFNGPHAADPSWNSLIKMVSSSTENN +GPQTVWTGPWAPHMNSVHMNQLG +>sp|O76036|NCTR1_HUMAN Natural cytotoxicity triggering receptor 1 OS=Homo sapiens OX=9606 GN=NCR1 PE=1 SV=1 +MSSTLPALLCVGLCLSQRISAQQQTLPKPFIWAEPHFMVPKEKQVTICCQGNYGAVEYQL +HFEGSLFAVDRPKPPERINKVKFYIPDMNSRMAGQYSCIYRVGELWSEPSNLLDLVVTEM +YDTPTLSVHPGPEVISGEKVTFYCRLDTATSMFLLLKEGRSSHVQRGYGKVQAEFPLGPV +TTAHRGTYRCFGSYNNHAWSFPSEPVKLLVTGDIENTSLAPEDPTFPADTWGTYLLTTET +GLQKDHALWDHTAQNLLRMGLAFLVLVALVWFLVEDWLSRKRTRERASRASTWEGRRRLN +TQTL +>sp|O94864|ST65G_HUMAN STAGA complex 65 subunit gamma OS=Homo sapiens OX=9606 GN=SUPT7L PE=1 SV=1 +MNLQRYWGEIPISSSQTNRSSFDLLPREFRLVEVHDPPLHQPSANKPKPPTMLDIPSEPC +SLTIHTIQLIQHNRRLRNLIATAQAQNQQQTEGVKTEESEPLPSCPGSPPLPDDLLPLDC +KNPNAPFQIRHSDPESDFYRGKGEPVTELSWHSCRQLLYQAVATILAHAGFDCANESVLE +TLTDVAHEYCLKFTKLLRFAVDREARLGQTPFPDVMEQVFHEVGIGSVLSLQKFWQHRIK +DYHSYMLQISKQLSEEYERIVNPEKATEDAKPVKIKEEPVSDITFPVSEELEADLASGDQ +SLPMGVLGAQSERFPSNLEVEASPQASSAEVNASPLWNLAHVKMEPQESEEGNVSGHGVL +GSDVFEEPMSGMSEAGIPQSPDDSDSSYGSHSTDSLMGSSPVFNQRCKKRMRKI +>sp|O95069|KCNK2_HUMAN Potassium channel subfamily K member 2 OS=Homo sapiens OX=9606 GN=KCNK2 PE=1 SV=2 +MLPSASRERPGYRAGVAAPDLLDPKSAAQNSKPRLSFSTKPTVLASRVESDTTINVMKWK +TVSTIFLVVVLYLIIGATVFKALEQPHEISQRTTIVIQKQTFISQHSCVNSTELDELIQQ +IVAAINAGIIPLGNTSNQISHWDLGSSFFFAGTVITTIGFGNISPRTEGGKIFCIIYALL +GIPLFGFLLAGVGDQLGTIFGKGIAKVEDTFIKWNVSQTKIRIISTIIFILFGCVLFVAL +PAIIFKHIEGWSALDAIYFVVITLTTIGFGDYVAGGSDIEYLDFYKPVVWFWILVGLAYF +AAVLSMIGDWLRVISKKTKEEVGEFRAHAAEWTANVTAEFKETRRRLSVEIYDKFQRATS +IKRKLSAELAGNHNQELTPCRRTLSVNHLTSERDVLPPLLKTESIYLNGLTPHCAGEEIA +VIENIK +>sp|O95992|CH25H_HUMAN Cholesterol 25-hydroxylase OS=Homo sapiens OX=9606 GN=CH25H PE=1 SV=1 +MSCHNCSDPQVLCSSGQLFLQPLWDHLRSWEALLQSPFFPVIFSITTYVGFCLPFVVLDI +LCSWVPALRRYKIHPDFSPSAQQLLPCLGQTLYQHVMFVFPVTLLHWARSPALLPHEAPE +LLLLLHHILFCLLLFDMEFFVWHLLHHKVPWLYRTFHKVHHQNSSSFALATQYMSVWELF +SLGFFDMMNVTLLGCHPLTTLTFHVVNIWLSVEDHSGYNFPWSTHRLVPFGWYGGVVHHD +LHHSHFNCNFAPYFTHWDKILGTLRTASVPAR +>sp|O75385|ULK1_HUMAN Serine/threonine-protein kinase ULK1 OS=Homo sapiens OX=9606 GN=ULK1 PE=1 SV=2 +MEPGRGGTETVGKFEFSRKDLIGHGAFAVVFKGRHREKHDLEVAVKCINKKNLAKSQTLL +GKEIKILKELKHENIVALYDFQEMANSVYLVMEYCNGGDLADYLHAMRTLSEDTIRLFLQ +QIAGAMRLLHSKGIIHRDLKPQNILLSNPAGRRANPNSIRVKIADFGFARYLQSNMMAAT +LCGSPMYMAPEVIMSQHYDGKADLWSIGTIVYQCLTGKAPFQASSPQDLRLFYEKNKTLV +PTIPRETSAPLRQLLLALLQRNHKDRMDFDEFFHHPFLDASPSVRKSPPVPVPSYPSSGS +GSSSSSSSTSHLASPPSLGEMQQLQKTLASPADTAGFLHSSRDSGGSKDSSCDTDDFVMV +PAQFPGDLVAEAPSAKPPPDSLMCSGSSLVASAGLESHGRTPSPSPPCSSSPSPSGRAGP +FSSSRCGASVPIPVPTQVQNYQRIERNLQSPTQFQTPRSSAIRRSGSTSPLGFARASPSP +PAHAEHGGVLARKMSLGGGRPYTPSPQVGTIPERPGWSGTPSPQGAEMRGGRSPRPGSSA +PEHSPRTSGLGCRLHSAPNLSDLHVVRPKLPKPPTDPLGAVFSPPQASPPQPSHGLQSCR +NLRGSPKLPDFLQRNPLPPILGSPTKAVPSFDFPKTPSSQNLLALLARQGVVMTPPRNRT +LPDLSEVGPFHGQPLGPGLRPGEDPKGPFGRSFSTSRLTDLLLKAAFGTQAPDPGSTESL +QEKPMEIAPSAGFGGSLHPGARAGGTSSPSPVVFTVGSPPSGSTPPQGPRTRMFSAGPTG +SASSSARHLVPGPCSEAPAPELPAPGHGCSFADPITANLEGAVTFEAPDLPEETLMEQEH +TEILRGLRFTLLFVQHVLEIAALKGSASEAAGGPEYQLQESVVADQISLLSREWGFAEQL +VLYLKVAELLSSGLQSAIDQIRAGKLCLSSTVKQVVRRLNELYKASVVSCQGLSLRLQRF +FLDKQRLLDRIHSITAERLIFSHAVQMVQSAALDEMFQHREGCVPRYHKALLLLEGLQHM +LSDQADIENVTKCKLCIERRLSALLTGICA +>rev_sp|A0A075B6I9|LV746_HUMAN Immunoglobulin lambda variable 7-46 OS=Homo sapiens OX=9606 GN=IGLV7-46 PE=3 SV=4 +MRAGSYSLLCYYEAEDEPQAGLLTLAAKGGLLSGSFRAPTWSHKNSTDYILTRPAQGPKQ +QFWYPYHGSTVAGTSSGCTLTVTGGPSVTLSPEQTVVAQSNSGPCCTLLFLFLPTWA +>rev_sp|A0A286YF46|SCGR5_HUMAN Small cysteine and glycine repeat-containing protein 5 OS=Homo sapiens OX=9606 GN=SCYGR5 PE=1 SV=1 +MCCCQKQCCGKQQCCGKGCGCGCSCCTRRCCCIVPTSCCGGCCGRCCPCCSSCCGVRYCR +CTTCSGCGGGCGGGCGGCGGCGCCG +>rev_sp|A6NKQ9|CGB1_HUMAN Choriogonadotropin subunit beta variant 1 OS=Homo sapiens OX=9606 GN=CGB1 PE=2 SV=3 +MPGPLRSPSPLSPPPAKSSSSDQFRPDDCTLPHDKPGGCDTTSRRCLACQCSLAVAYSVV +PNVGRPCGPLRISEFRVDRYNCVVQPLAPLVGQLVRTMTPCYGACITTNVTICVPCGEKE +VALTANIPRCRPRLPEKSAWTGGMSLLLLLLLRQFMEMKPARFHPDVRGKVHRERLPIDE +ALVPFTS +>rev_sp|O14522|PTPRT_HUMAN Receptor-type tyrosine-protein phosphatase T OS=Homo sapiens OX=9606 GN=PTPRT PE=1 SV=6 +MFSSLYELAVEYVFKYQELTEVMNSKNNRLTKVIHFVDIINQQQIMECVSCIACFTGSRG +GGNLCHVVTRGERGDYQEQWKELRRVVKLLSRKSPPTDRYAPWGIYQLHQVIRYGDQPRA +MNCIRFIRHIIDEDIDASVFEVQIPGYCGSTKEPWYQMCFQATDMENLMVVSSCNYDFVL +RWFDAVTNPLPHQTVVFAAPQKHSDMLAANIYNSSEGDVSILFPLCRDLPLVDMSRNKDH +NRPLLGISCDEPRVRPTVINLTQFEDKIQSSNTQPDLRSINYYLSRFECVPIATNGCLCA +ELIADHVFVYQEETQVLNVRQARLERVCNFIDVVGENEAMDLMTDIAIFCGTRGAGASCH +VVIPGAEPPNLFKVQRVFGLLGTAYCPVGHDPWSTFHFLRLERIEHYGKKQVTFTRIVYE +ALPETEILTVKIDGYVETDDPWYRVCKVRGVEVLNTVMVISASNEQWIMRWFDKVTEQMP +GQTAIYHRPRHYGDIYNANIYDSHPDGDLVLLRVRSHDYSIINGYRNKNRNEDEKATDWS +ATQGEPLAEYEEKFGYGQGRKMQTIHQLLDAVRIAPQFQDRPYSMEVPDCTRYPHTQITL +TPQSLEGRSGDTFGNVDQSSSSFGEDNRSASLKTTPKDASAVPGMERQAGSQTEKQKKAL +KLYYSYSYANRRRKITLMVGLLIIIFMLLGAIVGAMKVTNDVQKEPEVTNSNQTSAGKTA +LRVCNIKTEGNAKSLAQFYISYSKLPSLPPNWYGNYTKNDGVTFPQTVPLNAPKLEAAFY +HLSDLSSANRYSVPVSFCEIIDAARRSKQLREEKVVLQYVSVPAGRSQAPKLMVTITTDT +ENLPTDTDYEPMSPASIKTAIRTTVPPGFGKATSAKITFSYTTGPYLGVFLHHTENRLKF +VKGRQSSLDASPDLSGVAKYNIEYLTIVGNTENPPKWQIYIKEEFPGGQISELPVAGPVD +EETQVVLEESEMRGEPNSLLLRLRITMFPRLGRLTYHSSTQIVEEAEYQQQNFVYQYQVT +LNYSHCRTVAYGFPEWQLTLQRARIDVIEVNQPGHVPDACKTRTTLPPGPPGTGGEGPRT +LLVRIEYEVDPDLHWLKYNPSDVIHTEAWTGTTTRYEVEKLIIPGDGIISNANPKIWLYT +AGVALLEPPAIPTPPEKVILEAYNSVGSGGDSRIVCRYKSVSRQATDAVSVTASFRRHNV +VRTVMLATDRGNWQQLWLKDHQSWKGGAICQFTANQGVNVEVNQLRLFHPAKRCPHALVR +VEDVAIYGPHGKLSVSEFIVQYFHPWFTSIALEAKVWGETVVGSVNWVPNGQPGGNVKVY +VNLAGPSSRDRSSFYYHFDICHTDNEKLTPLLLHAKQGSARGSSNVMMFSGTPVAQDLMP +KEWTNIQEWTFGNTGLAVSYGCNSYHEDFSCGGAASQARAGPLPPLQLRLLLSLALAALS +A +>rev_sp|O75179|ANR17_HUMAN Ankyrin repeat domain-containing protein 17 OS=Homo sapiens OX=9606 GN=ANKRD17 PE=1 SV=3 +MGLQNMHVSNMHPAWPGTWVTQPGNNETSSSVMKILSNWSPDAAHPGNFIPGGAGDPIPA +VPPIMANGYVSFPMGGVKPFDMYGAPVHQHFTGSPTVIGTSDREMAQHQSFVGPDSMPRH +IMPNGMGPNCWDVKDQNGGIGEFSWIGSHGVSTSLNSGIVSPASTGTQRIRASRETGIPV +PVKRDQSVNSPKESGLPVSSPAPSPARVGSSVSQASSDNSASSCPTLLSLPSFHQRNFNA +AAPAGGLPAGPGYMQGGSINSAFNGLHTSSPTVSGYPSDMNVIGSPSPAPRQLPPRFGPA +KSTDSQHLPDSNPLYSSKGSLMSEPTSQSSVVSGGWFAHASTPSNEFLTSFPGFPLPASF +NPAIPKNAPHVSNQCASQTSLTSPLQVSSPVSHPRKIHNVSLVAVSSNQVSASNKHPATT +GHPPPRIAPTEMKPTPQPCGMPTQPMPYTVPATSPVAVPASSTALPPVTMRVEPPPVQSQ +QLPPRPEQSVSGPPQQQNSHASSSGSSSSSNPPRTNTTEVVPPSGAEQESSSAVKNPSAS +GCDLPSNRSVGGPQAPPSPSSPSSVPYHEKATPMPYTANTPATNNNSATTTVTTTTANST +KVVTVFLQRRVSPSSPSGNTSTGPVTSASSSTTTTPSSTLSGASNVQSGSSNQNSHRPVM +HPKPSNTARAPSLPRVPFPGWTSQAPPFTGGFHTMPLRPPRIQQFTQAALLAHAFQPPPY +ALPLSVPFGPRVNNVPNKITKHTSASSIAPVTLATATQSTSSLAVTTMKIGMLSTNAATT +TPASSGIKSNASSSKLRNKPILEDIEKDPDKILANILQTAQRTSETGGRITIIRDGTKDK +QKDIDIHAGTFERIANINCGGRGIVRSIVTSPVSVKKSRRVVEKWGEERKQGRKPSAVTL +KGNPSSLPSPGSKTCTSLSNSTGESITESLKISAKGSVSKREEKPFTVLVSPQKKSSVTT +TVVAPSHNDSKRSSNSNSSEDSCSSIRMNDSDGSESTSSSKSEGNVKEPQSYSIPLPDDF +IIQVNEPTIKNKRNKRKSSTTTITNNRKGHSGALTTWTASIGITTTTTASPPETLVEPED +EVKLKEKEQAAQLEFNEKNKAEIEELKRRQEEKKKRRKEKRKERKAALALRRSEERLKEL +DLEELLISANKNAEAAQRDKAQVISEMCLHCKKLMEKDTITAIYRMCESDSPFQNVEKVL +YRVVKVHGKRFAAMLPTIKRNDAADVDAGAQVLLQVVDLHGGNAALWLPTNGKKNRVDIH +AGRGILLECFKYHGKDAAITLATDRSSPVPPANVDAGKDLLVRGVEAYGGSAAEMLPTLG +TKARHEVNAKRDLLLSVVETRGQFCALTLATNRNTEIQANIDSGMDLLLKVAATHGNMAA +LMLPSIGLKSGTRSNIEAGANLLIKIINVYGGSAALSLPTYDSVNRHEKNAGRALLLEVV +EQRGGSCALSLPTDKTRESQAEIDAGNDLLIEVVGVHGATAALILPTFGKKDRHEISAGR +ELLTQVLEEHGGACALTLATDHNSETQADIDIAPYIPLMASPSIIPSPTPTQPQSISAAI +SHTPTNSMASARGSVAAMIDDLTDNLTQAPSAVMLGQQTETLIGQGLGALQAQGLVPQGV +IVGQLETLNAISGPPLPGAALPLAQPMALPQIPAFGMQAATQQPEDKLLVPDVQQLRAYD +GESLQEGVGVPTLLGLHQCSQQQQQQLQIRQAKVELYQKKLQQQTKLQLEREVKQLEELI +KQKKQIQEERTKNLEEIKEKTLQDAHALELSQLQANKEIAEKIRQELETLKGQAEEVISE +PSQNTIYGQVDQSNAPLHSSSKQKSAAKNRIPLTTAVNAPPKDPEQPPVVMPLAQVPVRP +ARNLDHSPPTLQTVDPPPASLLNNPYDLLYCVVSTHGGKAAEILMTSGDKLRHTPDAGHA +LLLEVVALHGGACALSLVTHDNNATTRNVNAGKSILFQVTCVHGARAAKMLPTRGGESEH +ELDAGAQLLVDAVDTHGNECAYTLATDGTATTAHVNAGAALLYKVLELHGEQAAEMLPTS +CGLEIDAGAKILFDAVELFGGCCALTLATEQTEETQANINAGQGLLLAVMEEHGERAAEM +LPTYGEDNVEELSAGREILLAALEVHGGCAALTLPSEFSDAPMNVQAGSDLLLRAVEVHG +DMCAEMLATHMEDTKHEQDAGAELLFRVMELHGKYCALTLASEKFENSHTNIGAGNELLL +RAVEVHGASGAEMLPTHGNENHDEISAGSELLVKVVDVYGGACAYTLATNGTSSQANVDA +KHALLLKVIKVHGGNAAAMLPTIDGKIGRDEVNAHMALLVQALEYYGASCALCLLSEGEE +THENVSRGEILLKRVANVDGESCAEALSRNDSQGANATSEARMRTLAAAAEDLACSVSST +LRRLVEPDAFAKGDATSLKGIGAAELLAELRAQTEPDVTRLDAGDATGSLLLKSATELMP +NELDDQDLIFSEVESVEEEEDDDDEEEESNNSSTGGGGGGGGGGGGGGGSNDSDSSSESS +SPPRCTRNRKAKHHQQQPPKKKLLLDCVRVMGRPSSASRARSSGGVGGGVEAAAPPGAVA +AVAPPSGEGEAATAAAVPVTAKE +>rev_sp|O76036|NCTR1_HUMAN Natural cytotoxicity triggering receptor 1 OS=Homo sapiens OX=9606 GN=NCR1 PE=1 SV=1 +MLTQTNLRRRGEWTSARSARERTRKRSLWDEVLFWVLAVLVLFALGMRLLNQATHDWLAH +DKQLGTETTLLYTGWTDAPFTPDEPALSTNEIDGTVLLKVPESPFSWAHNNYSGFCRYTG +RHATTVPGLPFEAQVKGYGRQVHSSRGEKLLLFMSTATDLRCYFTVKEGSIVEPGPHVSL +TPTDYMETVVLDLLNSPESWLEGVRYICSYQGAMRSNMDPIYFKVKNIREPPKPRDVAFL +SGEFHLQYEVAGYNGQCCITVQKEKPVMFHPEAWIFPKPLTQQQASIRQSLCLGVCLLAP +LTSS +>rev_sp|O94864|ST65G_HUMAN STAGA complex 65 subunit gamma OS=Homo sapiens OX=9606 GN=SUPT7L PE=1 SV=1 +MIKRMRKKCRQNFVPSSGMLSDTSHSGYSSDSDDPSQPIGAESMGSMPEEFVDSGLVGHG +SVNGEESEQPEMKVHALNWLPSANVEASSAQPSAEVELNSPFRESQAGLVGMPLSQDGSA +LDAELEESVPFTIDSVPEEKIKVPKADETAKEPNVIREYEESLQKSIQLMYSHYDKIRHQ +WFKQLSLVSGIGVEHFVQEMVDPFPTQGLRAERDVAFRLLKTFKLCYEHAVDTLTELVSE +NACDFGAHALITAVAQYLLQRCSHWSLETVPEGKGRYFDSEPDSHRIQFPANPNKCDLPL +LDDPLPPSGPCSPLPESEETKVGETQQQNQAQATAILNRLRRNHQILQITHITLSCPESP +IDLMTPPKPKNASPQHLPPDHVEVLRFERPLLDFSSRNTQSSSIPIEGWYRQLN +>rev_sp|O95069|KCNK2_HUMAN Potassium channel subfamily K member 2 OS=Homo sapiens OX=9606 GN=KCNK2 PE=1 SV=2 +MKINEIVAIEEGACHPTLGNLYISETKLLPPLVDRESTLHNVSLTRRCPTLEQNHNGALE +ASLKRKISTARQFKDYIEVSLRRRTEKFEATVNATWEAAHARFEGVEEKTKKSIVRLWDG +IMSLVAAFYALGVLIWFWVVPKYFDLYEIDSGGAVYDGFGITTLTIVVFYIADLASWGEI +HKFIIAPLAVFLVCGFLIFIITSIIRIKTQSVNWKIFTDEVKAIGKGFITGLQDGVGALL +FGFLPIGLLAYIICFIKGGETRPSINGFGITTIVTGAFFFSSGLDWHSIQNSTNGLPIIG +ANIAAVIQQILEDLETSNVCSHQSIFTQKQIVITTRQSIEHPQELAKFVTAGIILYLVVV +LFITSVTKWKMVNITTDSEVRSALVTPKTSFSLRPKSNQAASKPDLLDPAAVGARYGPRE +RSASPL +>rev_sp|O95992|CH25H_HUMAN Cholesterol 25-hydroxylase OS=Homo sapiens OX=9606 GN=CH25H PE=1 SV=1 +MRAPVSATRLTGLIKDWHTFYPAFNCNFHSHHLDHHVVGGYWGFPVLRHTSWPFNYGSHD +EVSLWINVVHFTLTTLPHCGLLTVNMMDFFGLSFLEWVSMYQTALAFSSSNQHHVKHFTR +YLWPVKHHLLHWVFFEMDFLLLCFLIHHLLLLLEPAEHPLLAPSRAWHLLTVPFVFMVHQ +YLTQGLCPLLQQASPSFDPHIKYRRLAPVWSCLIDLVVFPLCFGVYTTISFIVPFFPSQL +LAEWSRLHDWLPQLFLQGSSCLVQPDSCNHCS +>rev_sp|O75385|ULK1_HUMAN Serine/threonine-protein kinase ULK1 OS=Homo sapiens OX=9606 GN=ULK1 PE=1 SV=2 +MACIGTLLASLRREICLKCKTVNEIDAQDSLMHQLGELLLLAKHYRPVCGERHQFMEDLA +ASQVMQVAHSFILREATISHIRDLLRQKDLFFRQLRLSLGQCSVVSAKYLENLRRVVQKV +TSSLCLKGARIQDIASQLGSSLLEAVKLYLVLQEAFGWERSLLSIQDAVVSEQLQYEPGG +AAESASGKLAAIELVHQVFLLTFRLGRLIETHEQEMLTEEPLDPAEFTVAGELNATIPDA +FSCGHGPAPLEPAPAESCPGPVLHRASSSASGTPGASFMRTRPGQPPTSGSPPSGVTFVV +PSPSSTGGARAGPHLSGGFGASPAIEMPKEQLSETSGPDPAQTGFAAKLLLDTLRSTSFS +RGFPGKPDEGPRLGPGLPQGHFPGVESLDPLTRNRPPTMVVGQRALLALLNQSSPTKPFD +FSPVAKTPSGLIPPLPNRQLFDPLKPSGRLNRCSQLGHSPQPPSAQPPSFVAGLPDTPPK +PLKPRVVHLDSLNPASHLRCGLGSTRPSHEPASSGPRPSRGGRMEAGQPSPTGSWGPREP +ITGVQPSPTYPRGGGLSMKRALVGGHEAHAPPSPSARAFGLPSTSGSRRIASSRPTQFQT +PSQLNREIRQYNQVQTPVPIPVSAGCRSSSFPGARGSPSPSSSCPPSPSPTRGHSELGAS +AVLSSGSCMLSDPPPKASPAEAVLDGPFQAPVMVFDDTDCSSDKSGGSDRSSHLFGATDA +PSALTKQLQQMEGLSPPSALHSTSSSSSSSGSGSSPYSPVPVPPSKRVSPSADLFPHHFF +EDFDMRDKHNRQLLALLLQRLPASTERPITPVLTKNKEYFLRLDQPSSAQFPAKGTLCQY +VITGISWLDAKGDYHQSMIVEPAMYMPSGCLTAAMMNSQLYRAFGFDAIKVRISNPNARR +GAPNSLLINQPKLDRHIIGKSHLLRMAGAIQQLFLRITDESLTRMAHLYDALDGGNCYEM +VLYVSNAMEQFDYLAVINEHKLEKLIKIEKGLLTQSKALNKKNICKVAVELDHKERHRGK +FVVAFAGHGILDKRSFEFKGVTETGGRGPE