diff --git a/proteingym/baselines/escott/compute_fitness.py b/proteingym/baselines/escott/compute_fitness.py new file mode 100644 index 0000000..8bff43f --- /dev/null +++ b/proteingym/baselines/escott/compute_fitness.py @@ -0,0 +1,225 @@ +""" +Run ESCOTT on selected DMS index in ProteinGym and save fitness scores. + +Note that ESCOTT has the following dependencies that have to be installed: + - JET2 (muscle, java, and optionally psiblast, clustalw) + - R + - DSSP +Due to the difficulty in ESCOTT installation, we recommend using the Docker image +provided by the authors. The current version can be dowloaded with: +docker pull tekpinar/prescott-docker:v1.6.0. +This script assumes that the Docker image is running and accessible, and +folders containing inputs and output files are mounted to the Docker container. +This is done automatically when using the scoring_ESCOTT_substitutions.sh script in +scripts/scoring_DMS_zero_shot folder. +ESCOTT documentation with installation instructions and examples can be found at: +http://gitlab.lcqb.upmc.fr/tekpinar/PRESCOTT + +For the correct ESCOTT execution, the first sequence in MSA file has to be the +query sequence, and MSA and PDB file must span the exact same region of the protein. +In this script, default input files are manipulated to match these requirements. +""" + +import os +import argparse +import subprocess +import numpy as np +import pandas as pd +import sys + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +import pdb_utils + + +AA_VOCAB = "ACDEFGHIKLMNPQRSTVWY" +aa2idx = {aa: i for i, aa in enumerate(AA_VOCAB)} + + +def get_args(): + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawTextHelpFormatter + ) + parser.add_argument( + "--DMS_reference_file_path", + default=None, + type=str, + help="Path to reference file with list of DMS to score", + ) + parser.add_argument( + "--DMS_index", default=0, type=int, help="Index of DMS assay in reference file" + ) + parser.add_argument( + "--DMS_data_folder", type=str, help="Path to folder that contains all DMS assay datasets" + ) + parser.add_argument( + "--output_scores_folder", + default="./", + type=str, + help="Name of folder to write model scores to", + ) + parser.add_argument("--MSA_folder", default=".", type=str, help="Path to MSA file") + parser.add_argument("--structure_data_folder", default=".", type=str, help="Path to PDB file") + parser.add_argument( + "--temp_folder", + default="./escott_tmp", + type=str, + help="Path to temporary folder to store intermediate files", + ) + parser.add_argument("--nseqs", type=int, default=40000) + args = parser.parse_args() + return args + + +def parse_alignment(ali_file): + """ + Parse alignment file and return dictionary of sequences. + Headers are modified to avoid errors in ESCOTT. + """ + with open(ali_file, "r") as f: + lines = f.readlines() + seqs = {} + for line in lines: + if line[0] == ">": + seq_id = line[1:].strip().replace("_", "").replace(".", "") + seqs[seq_id] = "" + else: + seqs[seq_id] += line.strip().upper().replace(".", "-") + return seqs + + +def extract_scores(predictions, mutants, offset): + """Extract scores for a given assay from the full mutational landscape.""" + scores = [] + for mut in mutants: + score = 0 + for m in mut.split(":"): # multiple mutations case + pos, mut_aa = int(m[1:-1]) - offset, m[-1] + score += predictions[pos, aa2idx[mut_aa]] + scores.append(score) + return scores + + +def main(args): + + if not os.path.isdir(args.temp_folder): + os.mkdir(args.temp_folder) + + mapping_protein_seq_DMS = pd.read_csv(args.DMS_reference_file_path) + list_DMS = mapping_protein_seq_DMS["DMS_id"] + DMS_id = list_DMS[args.DMS_index] + print("Computing scores for DMS: " + str(DMS_id)) + DMS_file_name = mapping_protein_seq_DMS["DMS_filename"][ + mapping_protein_seq_DMS["DMS_id"] == DMS_id + ].values[0] + MSA_data_file = os.path.join( + args.MSA_folder, mapping_protein_seq_DMS["MSA_filename"][args.DMS_index] + ) + MSA_start = mapping_protein_seq_DMS["MSA_start"][args.DMS_index] + MSA_end = mapping_protein_seq_DMS["MSA_end"][args.DMS_index] + + DMS_data = pd.read_csv(os.path.join(args.DMS_data_folder, DMS_file_name)) + if "mutant" in DMS_data.columns: + mutants = DMS_data["mutant"].tolist() + else: + raise ValueError( + "DMS data file must contain a column named 'mutant' with the mutant sequences to score" + ) + + # Create subfolder for single assays in temporary folder, removing some annoying characters + # Use absolute path to avoid issues with subprocess call + assay_folder = os.path.abspath( + os.path.join(args.temp_folder, DMS_id.replace("_", "").replace(".", "")) + ) + if not os.path.isdir(assay_folder): + os.mkdir(assay_folder) + + # since ESCOTT also uses pdb files, and they have to match MSA files, we iterate over + # pdb chunks for a single assay, cutting the MSA file accordingly + # Only after processing each chunk we extract scores from the output file for + # the corresponding mutants + pdb_filenames = ( + mapping_protein_seq_DMS["pdb_file"][mapping_protein_seq_DMS["DMS_id"] == DMS_id] + .values[0] + .split("|") + ) # if sequence is large (eg., BRCA2_HUMAN) the structure is split in several chunks + pdb_ranges = ( + mapping_protein_seq_DMS["pdb_range"][mapping_protein_seq_DMS["DMS_id"] == DMS_id] + .values[0] + .split("|") + ) + all_scores = [] + + try: + for pdb_index, pdb_filename in enumerate(pdb_filenames): + pdb_file = os.path.join(args.structure_data_folder, pdb_filename) + pdb_start, pdb_end = [int(x) for x in pdb_ranges[pdb_index].split("-")] + + # check if pdb range is contained in MSA range, otherwise manipulate pdb file + if not (pdb_start >= MSA_start and pdb_end <= MSA_end): + print("PDB range not contained in MSA range, PDB file manipulation needed") + pdb_start_range = max(pdb_start, MSA_start) - pdb_start + 1 # 1-based index + pdb_end_range = min(pdb_end, MSA_end) - pdb_start + 2 + residue_numbers = list(range(pdb_start_range, pdb_end_range)) + new_pdb_file = os.path.join(assay_folder, "cut_" + pdb_filename) + pdb_utils.filter_residues(pdb_file, new_pdb_file, residue_numbers) + + pdb_file = new_pdb_file + pdb_start = max(pdb_start, MSA_start) + pdb_end = min(pdb_end, MSA_end) + + # Both pdb index and msa index in reference file are 1-based + starting_msa_idx = pdb_start - MSA_start + endings_msa_idx = pdb_end - MSA_start + 1 + + # set offset for scores extraction at the end + if pdb_index == 0: + offset = pdb_start + + # Create temporary MSA file for this pdb chunk, with correct format for ESCOTT + MSA_tmp_file = os.path.join(assay_folder, "MSA_" + str(pdb_index) + ".fasta") + if MSA_data_file is not None: + sequence_dict = parse_alignment(MSA_data_file) + with open(MSA_tmp_file, "w") as f: + for id, seq in sequence_dict.items(): + f.write(">" + id + "\n") + f.write(seq[starting_msa_idx:endings_msa_idx] + "\n") + else: + raise ValueError("MSA data file must be provided to run ESCOTT") + + # run ESCOTT on this pdb chunk + command = f"escott {MSA_tmp_file} -p {pdb_file} -N {args.nseqs}" + output = subprocess.run(command, shell=True, cwd=assay_folder, capture_output=True) + if output.returncode != 0: + raise ValueError(f"Error occurred while running ESCOTT: {output.stderr.decode()}") + + # parse output files and find the file with suffix _evolCombi.txt + for file in os.listdir(assay_folder): + if file.endswith("_evolCombi.txt"): + evol_combi_file = file + break + else: + raise ValueError("ESCOTT output file not found, an error occurred") + + scores = pd.read_csv( + os.path.join(assay_folder, evol_combi_file), sep="\s+", index_col=0 + ).values.transpose() + all_scores.append(scores) + + # concatenate scores from all pdb chunks + all_scores = np.concatenate(all_scores, axis=0) + + # extract scores for selected mutants + mutant_scores = extract_scores(all_scores, mutants, offset) + DMS_data["ESCOTT_score"] = mutant_scores + DMS_data.to_csv(os.path.join(args.output_scores_folder, f"{DMS_id}.csv"), index=False) + print(f"Scores for DMS {DMS_id} computed successfully") + + except Exception as e: + print(f"Error occurred: {e}") + finally: + os.system(f"rm -rf {assay_folder}") + + +if __name__ == "__main__": + args = get_args() + main(args) diff --git a/proteingym/baselines/escott/pdb_utils.py b/proteingym/baselines/escott/pdb_utils.py new file mode 100644 index 0000000..fb99a66 --- /dev/null +++ b/proteingym/baselines/escott/pdb_utils.py @@ -0,0 +1,17 @@ +from Bio.PDB import PDBParser, PDBIO, Select + + +class ResidueSelect(Select): + def __init__(self, residue_numbers): + self.residue_numbers = residue_numbers + + def accept_residue(self, residue): + return residue.id[1] in self.residue_numbers + + +def filter_residues(input_pdb_file, output_pdb_file, residue_numbers): + parser = PDBParser() + structure = parser.get_structure("structure", input_pdb_file) + io = PDBIO() + io.set_structure(structure) + io.save(output_pdb_file, ResidueSelect(residue_numbers)) diff --git a/reference_files/DMS_substitutions.csv b/reference_files/DMS_substitutions.csv index cc89480..d4b02b5 100644 --- a/reference_files/DMS_substitutions.csv +++ b/reference_files/DMS_substitutions.csv @@ -30,7 +30,7 @@ CALM1_HUMAN_Weile_2017,CALM1_HUMAN_Weile_2017.csv,CALM1_HUMAN,Human,Homo sapiens CAPSD_AAV2S_Sinai_2021,CAPSD_AAV2S_Sinai_2021.csv,CAPSD_AAV2S,Virus,Adeno-associated virus 2 (isolate Srivastava/1982) (AAV-2),MAADGYLPDWLEDTLSEGIRQWWKLKPGPPPPKPAERHKDDSRGLVLPGYKYLGPFNGLDKGEPVNEADAAALEHDKAYDRQLDSGDNPYLKYNHADAEFQERLKEDTSFGGNLGRAVFQAKKRVLEPLGLVEEPVKTAPGKKRPVEHSPVEPDSSSGTGKAGQQPARKRLNFGQTGDADSVPDPQPLGQPPAAPSGLGTNTMATGSGAPMADNNEGADGVGNSSGNWHCDSTWMGDRVITTSTRTWALPTYNNHLYKQISSQSGASNDNHYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTQNDGTTTIANNLTSTVQVFTDSEYQLPYVLGSAHQGCLPPFPADVFMVPQYGYLTLNNGSQAVGRSSFYCLEYFPSQMLRTGNNFTFSYTFEDVPFHSSYAHSQSLDRLMNPLIDQYLYYLSRTNTPSGTTTQSRLQFSQAGASDIRDQSRNWLPGPCYRQQRVSKTSADNNNSEYSWTGATKYHLNGRDSLVNPGPAMASHKDDEEKFFPQSGVLIFGKQGSEKTNVDIEKVMITDEEEIRTTNPVATEQYGSVSTNLQRGNRQAATADVNTQGVLPGMVWQDRDVYLQGPIWAKIPHTDGHFHPSPLMGGFGLKHPPPQILIKNTPVPANPSTTFSAAKFASFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYNKSVNVDFTVDTNGVYSEPRPIGTRYLTRNL,735,TRUE,42328,532,41796,-1.2,manual,Sinai,Generative AAV capsid diversification by latent interpolation,2021,10.1101/2021.04.16.440236,561-588,AAV,viability for AAV capsid production,,CAPSD_AAV2S_uniprot_t099_msc70_mcc70_b0.8.a2m,1,735,735,0.8,0.01,604,0.782,575,213.8,0.371826087,low,1943,3.379130435,CAPSD_AAV2S_Sinai_substitutions_2021.csv,viral_selection,1,mutant,CAPSD_AAV2S_theta_0.01.npy,CAPSD_AAV2S.pdb,1-735,0.1,,OrganismalFitness CAR11_HUMAN_Meitlis_2020_gof,CAR11_HUMAN_Meitlis_2020_gof.csv,CAR11_HUMAN,Human,Homo sapiens,MPGGGPEMDDYMETLKDEEDALWENVECNRHMLSRYINPAKLTPYLRQCKVIDEQDEDEVLNAPMLPSKINRAGRLLDILHTKGQRGYVVFLESLEFYYPELYKLVTGKEPTRRFSTIVVEEGHEGLTHFLMNEVIKLQQQMKAKDLQRCELLARLRQLEDEKKQMTLTRVELLTFQERYYKMKEERDSYNDELVKVKDDNYNLAMRYAQLSEEKNMAVMRSRDLQLEIDQLKHRLNKMEEECKLERNQSLKLKNDIENRPKKEQVLELERENEMLKTKNQELQSIIQAGKRSLPDSDKAILDILEHDRKEALEDRQELVNRIYNLQEEARQAEELRDKYLEEKEDLELKCSTLGKDCEMYKHRMNTVMLQLEEVERERDQAFHSRDEAQTQYSQCLIEKDKYRKQIRELEEKNDEMRIEMVRREACIVNLESKLRRLSKDSNNLDQSLPRNLPVTIISQDFGDASPRTNGQEADDSSTSEESPEDSKYFLPYHPPQRRMNLKGIQLQRAKSPISLKRTSDFQAKGHEEEGTDASPSSCGSLPITNSFTKMQPPRSRSSIMSITAEPPGNDSIVRRYKEDAPHRSTVEEDNDSGGFDALDLDDDSHERYSFGPSSIHSSSSSHQSEGLDAYDLEQVNLMFRKFSLERPFRPSVTSVGHVRGPGPSVQHTTLNGDSLTSQLTLLGGNARGSFVHSVKPGSLAEKAGLREGHQLLLLEGCIRGERQSVPLDTCTKEEAHWTIQRCSGPVTLHYKVNHEGYRKLVKDMEDGLITSGDSFYIRLNLNISSQLDACTMSLKCDDVVHVRDTMYQDRHEWLCARVDPFTDHDLDMGTIPSYSRAQQLLLVKLQRLMHRGSREEVDGTHHTLRALRNTLQPEEALSTSDPRVSPRLSRASFLFGQLLQFVSRSENKYKRMNSNERVRIISGSPLGSLARSSLDATKLLTEKQEELDPESELGKNLSLIPYSLVRAFYCERRRPVLFTPTVLAKTLVQRLLNSGGAMEFTICKSDIVTRDEFLRRQKTETIIYSREKNPNAFECIAPANIEAVAAKNKHCLLEAGIGCTRDLIKSNIYPIVLFIRVCEKNIKRFRKLLPRPETEEEFLRVCRLKEKELEALPCLYATVEPDMWGSVEELLRVVKDKIGEEQRKTIWVDEDQL,1154,FALSE,2374,2374,0,0.14475,manual,Meitlis,Multiplexed Functional Assessment of Genetic Variants in CARD11,2020,10.1016/j.ajhg.2020.10.015.,4-146,CARD11,Signaling (in presence of ibrutinib),survival assessment assay,CAR11_HUMAN_2023-10-12_b02.a2m,1,1154,1154,0.2,0.2,1352,0.998,1152,53.7,0.04661458333,Low,0,0,mmc2.xlsx,log2_score,1,mutant,CAR11_HUMAN_theta0.2_2023-10-12_b02.npy,CAR11_HUMAN.pdb,1-1154,1,,OrganismalFitness CAR11_HUMAN_Meitlis_2020_lof,CAR11_HUMAN_Meitlis_2020_lof.csv,CAR11_HUMAN,Human,Homo sapiens,MPGGGPEMDDYMETLKDEEDALWENVECNRHMLSRYINPAKLTPYLRQCKVIDEQDEDEVLNAPMLPSKINRAGRLLDILHTKGQRGYVVFLESLEFYYPELYKLVTGKEPTRRFSTIVVEEGHEGLTHFLMNEVIKLQQQMKAKDLQRCELLARLRQLEDEKKQMTLTRVELLTFQERYYKMKEERDSYNDELVKVKDDNYNLAMRYAQLSEEKNMAVMRSRDLQLEIDQLKHRLNKMEEECKLERNQSLKLKNDIENRPKKEQVLELERENEMLKTKNQELQSIIQAGKRSLPDSDKAILDILEHDRKEALEDRQELVNRIYNLQEEARQAEELRDKYLEEKEDLELKCSTLGKDCEMYKHRMNTVMLQLEEVERERDQAFHSRDEAQTQYSQCLIEKDKYRKQIRELEEKNDEMRIEMVRREACIVNLESKLRRLSKDSNNLDQSLPRNLPVTIISQDFGDASPRTNGQEADDSSTSEESPEDSKYFLPYHPPQRRMNLKGIQLQRAKSPISLKRTSDFQAKGHEEEGTDASPSSCGSLPITNSFTKMQPPRSRSSIMSITAEPPGNDSIVRRYKEDAPHRSTVEEDNDSGGFDALDLDDDSHERYSFGPSSIHSSSSSHQSEGLDAYDLEQVNLMFRKFSLERPFRPSVTSVGHVRGPGPSVQHTTLNGDSLTSQLTLLGGNARGSFVHSVKPGSLAEKAGLREGHQLLLLEGCIRGERQSVPLDTCTKEEAHWTIQRCSGPVTLHYKVNHEGYRKLVKDMEDGLITSGDSFYIRLNLNISSQLDACTMSLKCDDVVHVRDTMYQDRHEWLCARVDPFTDHDLDMGTIPSYSRAQQLLLVKLQRLMHRGSREEVDGTHHTLRALRNTLQPEEALSTSDPRVSPRLSRASFLFGQLLQFVSRSENKYKRMNSNERVRIISGSPLGSLARSSLDATKLLTEKQEELDPESELGKNLSLIPYSLVRAFYCERRRPVLFTPTVLAKTLVQRLLNSGGAMEFTICKSDIVTRDEFLRRQKTETIIYSREKNPNAFECIAPANIEAVAAKNKHCLLEAGIGCTRDLIKSNIYPIVLFIRVCEKNIKRFRKLLPRPETEEEFLRVCRLKEKELEALPCLYATVEPDMWGSVEELLRVVKDKIGEEQRKTIWVDEDQL,1154,FALSE,2395,2395,0,-0.4635,manual,Meitlis,Multiplexed Functional Assessment of Genetic Variants in CARD11,2020,10.1016/j.ajhg.2020.10.015.,4-146,CARD11,Signaling,survival assessment assay,CAR11_HUMAN_2023-10-12_b02.a2m,1,1154,1154,0.2,0.2,1352,0.998,1152,53.7,0.04661458333,Low,0,0,mmc3.xlsx,log2_score,1,mutant,CAR11_HUMAN_theta0.2_2023-10-12_b02.npy,CAR11_HUMAN.pdb,1-1154,1,,OrganismalFitness -CAS9_STRP1_Spencer_2017_positive,CAS9_STRP1_Spencer_2017_positive.csv,CAS9_STRP1,Prokaryote,Streptococcus pyogenes,MDKKYSIGLDIGTNSVGWAVITDEYKVPSKKFKVLGNTDRHSIKKNLIGALLFDSGETAEATRLKRTARRRYTRRKNRICYLQEIFSNEMAKVDDSFFHRLEESFLVEEDKKHERHPIFGNIVDEVAYHEKYPTIYHLRKKLVDSTDKADLRLIYLALAHMIKFRGHFLIEGDLNPDNSDVDKLFIQLVQTYNQLFEENPINASGVDAKAILSARLSKSRRLENLIAQLPGEKKNGLFGNLIALSLGLTPNFKSNFDLAEDAKLQLSKDTYDDDLDNLLAQIGDQYADLFLAAKNLSDAILLSDILRVNTEITKAPLSASMIKRYDEHHQDLTLLKALVRQQLPEKYKEIFFDQSKNGYAGYIDGGASQEEFYKFIKPILEKMDGTEELLVKLNREDLLRKQRTFDNGSIPHQIHLGELHAILRRQEDFYPFLKDNREKIEKILTFRIPYYVGPLARGNSRFAWMTRKSEETITPWNFEEVVDKGASAQSFIERMTNFDKNLPNEKVLPKHSLLYEYFTVYNELTKVKYVTEGMRKPAFLSGEQKKAIVDLLFKTNRKVTVKQLKEDYFKKIECFDSVEISGVEDRFNASLGTYHDLLKIIKDKDFLDNEENEDILEDIVLTLTLFEDREMIEERLKTYAHLFDDKVMKQLKRRRYTGWGRLSRKLINGIRDKQSGKTILDFLKSDGFANRNFMQLIHDDSLTFKEDIQKAQVSGQGDSLHEHIANLAGSPAIKKGILQTVKVVDELVKVMGRHKPENIVIEMARENQTTQKGQKNSRERMKRIEEGIKELGSQILKEHPVENTQLQNEKLYLYYLQNGRDMYVDQELDINRLSDYDVDHIVPQSFLKDDSIDNKVLTRSDKNRGKSDNVPSEEVVKKMKNYWRQLLNAKLITQRKFDNLTKAERGGLSELDKAGFIKRQLVETRQITKHVAQILDSRMNTKYDENDKLIREVKVITLKSKLVSDFRKDFQFYKVREINNYHHAHDAYLNAVVGTALIKKYPKLESEFVYGDYKVYDVRKMIAKSEQEIGKATAKYFFYSNIMNFFKTEITLANGEIRKRPLIETNGETGEIVWDKGRDFATVRKVLSMPQVNIVKKTEVQTGGFSKESILPKRNSDKLIARKKDWDPKKYGGFDSPTVAYSVLVVAKVEKGKSKKLKSVKELLGITIMERSSFEKNPIDFLEAKGYKEVKKDLIIKLPKYSLFELENGRKRMLASAGELQKGNELALPSKYVNFLYLASHYEKLKGSPEDNEQKQLFVEQHKHYLDEIIEQISEFSKRVILADANLDKVLSAYNKHRDKPIREQAENIIHLFTLTNLGAPAAFKYFDTTIDRKRYTSTKEVLDATLIHQSITGLYETRIDLSQLGGD,1368,FALSE,8117,8117,0,-0.2654328586,median,Spencer,Deep mutational scanning of S. pyogenes Cas9 reveals important functional domains,2017,10.1038/s41598-017-17081-y,1-1368,Streptococcus pyogenes Cas9,count of mutation where survival depends on expression of Cas9 and correct cleavage,Flow cytometry,CAS9_STRP1_2023-08-07_b01.a2m,1,1368,1368,0.1,0.2,5349,0.992,1357,1532.3,1.12918,Medium,241,0.17759764,SPCAS9_Spencer_positive_2022.csv,Log2 Fold Change after Positive Selection,1,mutant,CAS9_STRP1_theta0.2_2023-08-07_b01.npy,CAS9_STRP1.pdb,1-1368,1,,Activity +CAS9_STRP1_Spencer_2017_positive,CAS9_STRP1_Spencer_2017_positive.csv,CAS9_STRP1,Prokaryote,Streptococcus pyogenes,MDKKYSIGLDIGTNSVGWAVITDEYKVPSKKFKVLGNTDRHSIKKNLIGALLFDSGETAEATRLKRTARRRYTRRKNRICYLQEIFSNEMAKVDDSFFHRLEESFLVEEDKKHERHPIFGNIVDEVAYHEKYPTIYHLRKKLVDSTDKADLRLIYLALAHMIKFRGHFLIEGDLNPDNSDVDKLFIQLVQTYNQLFEENPINASGVDAKAILSARLSKSRRLENLIAQLPGEKKNGLFGNLIALSLGLTPNFKSNFDLAEDAKLQLSKDTYDDDLDNLLAQIGDQYADLFLAAKNLSDAILLSDILRVNTEITKAPLSASMIKRYDEHHQDLTLLKALVRQQLPEKYKEIFFDQSKNGYAGYIDGGASQEEFYKFIKPILEKMDGTEELLVKLNREDLLRKQRTFDNGSIPHQIHLGELHAILRRQEDFYPFLKDNREKIEKILTFRIPYYVGPLARGNSRFAWMTRKSEETITPWNFEEVVDKGASAQSFIERMTNFDKNLPNEKVLPKHSLLYEYFTVYNELTKVKYVTEGMRKPAFLSGEQKKAIVDLLFKTNRKVTVKQLKEDYFKKIECFDSVEISGVEDRFNASLGTYHDLLKIIKDKDFLDNEENEDILEDIVLTLTLFEDREMIEERLKTYAHLFDDKVMKQLKRRRYTGWGRLSRKLINGIRDKQSGKTILDFLKSDGFANRNFMQLIHDDSLTFKEDIQKAQVSGQGDSLHEHIANLAGSPAIKKGILQTVKVVDELVKVMGRHKPENIVIEMARENQTTQKGQKNSRERMKRIEEGIKELGSQILKEHPVENTQLQNEKLYLYYLQNGRDMYVDQELDINRLSDYDVDHIVPQSFLKDDSIDNKVLTRSDKNRGKSDNVPSEEVVKKMKNYWRQLLNAKLITQRKFDNLTKAERGGLSELDKAGFIKRQLVETRQITKHVAQILDSRMNTKYDENDKLIREVKVITLKSKLVSDFRKDFQFYKVREINNYHHAHDAYLNAVVGTALIKKYPKLESEFVYGDYKVYDVRKMIAKSEQEIGKATAKYFFYSNIMNFFKTEITLANGEIRKRPLIETNGETGEIVWDKGRDFATVRKVLSMPQVNIVKKTEVQTGGFSKESILPKRNSDKLIARKKDWDPKKYGGFDSPTVAYSVLVVAKVEKGKSKKLKSVKELLGITIMERSSFEKNPIDFLEAKGYKEVKKDLIIKLPKYSLFELENGRKRMLASAGELQKGNELALPSKYVNFLYLASHYEKLKGSPEDNEQKQLFVEQHKHYLDEIIEQISEFSKRVILADANLDKVLSAYNKHRDKPIREQAENIIHLFTLTNLGAPAAFKYFDTTIDRKRYTSTKEVLDATLIHQSITGLYETRIDLSQLGGD,1368,FALSE,8117,8117,0,-0.2654328586,median,Spencer,Deep mutational scanning of S. pyogenes Cas9 reveals important functional domains,2017,10.1038/s41598-017-17081-y,1-1368,Streptococcus pyogenes Cas9,count of mutation where survival depends on expression of Cas9 and correct cleavage,Flow cytometry,CAS9_STRP1_2023-08-07_b01.a2m,1,1368,1368,0.1,0.2,5349,0.992,1357,1532.3,1.12918,Medium,241,0.17759764,SPCAS9_Spencer_positive_2022.csv,Log2 Fold Change after Positive Selection,1,mutant,CAS9_STRP1_theta0.2_2023-08-07_b01.npy,CAS9_STRP1.pdb,1-1390,1,,Activity CASP3_HUMAN_Roychowdhury_2020,CASP3_HUMAN_Roychowdhury_2020.csv,CASP3_HUMAN,Human,Homo sapiens,MSGISLDNSYKMDYPEMGLCIIINNKNFHKSTGMTSRSGTDVDAANLRETFRNLKYEVRNKNDLTREEIVELMRDVSKEDHSKRSSFVCVLLSHGEEGIIFGTNGPVDLKKITNFFRGDRCRSLTGKPKLFIIQACRGTELDCGIETDSGVDDDMACHKIPVEADFLYAYSTAPGYYSWRNSKDGSWFIQSLCAMLKQYADKLEFMHILTRVNRKVATEFESFSFDATFHAKKQIPCIVSMLTKELYFYHLEHHHHHH,258,FALSE,1567,1567,0,0.03725973017,median,Roychowdhury,Microfluidic deep mutational scanning of the human executioner caspases reveals differences in structure and regulation,2022,10.1038/s41420-021-00799-0,2-258,CASP3,Fluorescence measurement,,CASP3_HUMAN_2023-08-07_b01.a2m,1,258,258,0.1,0.2,86012,0.884,228,28096.2,123.2289474,High,307,1.346491228,CASP3_HUMAN_Roychowdhury_2020.csv,coef,1,mutant,CASP3_HUMAN_theta0.2_2023-08-07_b01.npy,CASP3_HUMAN.pdb,1-258,1,,Activity CASP7_HUMAN_Roychowdhury_2020,CASP7_HUMAN_Roychowdhury_2020.csv,CASP7_HUMAN,Human,Homo sapiens,MAKPDRSSFVPSLFSKKKKNVTMRSIKTTRDRVPTYQYNMNFEKLGKCIIINNKNFDKVTGMGVRNGTDKDAEALFKCFRSLGFDVIVYNDCSCAKMQDLLKKASEEDHTNAACFACILLSHGEENVIYGKDGVTPIKDLTAHFRGDRCKTLLEKPKLFFIQACRGTELDDGIQADSGPINDTDANPRYKIPVEADFLFAYSTVPGYYSWRSPGRGSWFVQALCSILEEHGKDLEIMQILTRVNDRVARHFESQSDDPHFHEKKQIPCVVSMLTKELYFSQ,281,FALSE,1680,1680,0,-0.3340768074,median,Roychowdhury,Microfluidic deep mutational scanning of the human executioner caspases reveals differences in structure and regulation,2022,10.1038/s41420-021-00799-0,2-281,CASP7,Fluorescence measurement,,CASP7_HUMAN_2023-08-07_b01.a2m,1,281,281,0.1,0.2,71075,0.854,240,21588.4,89.95166667,Medium,298,1.241666667,CASP7_HUMAN_Roychowdhury_2020.csv,coef,1,mutant,CASP7_HUMAN_theta0.2_2023-08-07_b01.npy,CASP7_HUMAN.pdb,1-281,1,,Activity CATR_CHLRE_Tsuboyama_2023_2AMI,CATR_CHLRE_Tsuboyama_2023_2AMI.csv,CATR_CHLRE,Eukaryote,Chlamydomonas reinhardtii,GLTEEQKQEIREAFDLFDTDGSGTIDAKELKVAMRALGFEPKKEEIKKMISEIDKDGSGTIDFEEFLTMMTA,72,TRUE,1903,1340,563,-0.5681612987,median,Tsuboyama,Mega-scale experimental analysis of protein folding stability in biology and design,2023,10.1038/s41586-023-06328-6,1-72,Caltractin,Stability,cDNA display proteolysis,CATR_CHLRE_2023-08-07_b03.a2m,1,72,72,0.3,0.2,551057,0.903,65,75596.9,1163.029231,High,57,0.8769230769,Tsuboyama2023_Dataset2_Dataset7,ddG_ML_float,1,mut_type,CATR_CHLRE_theta0.2_2023-08-07_b03.npy,CATR_CHLRE.pdb,1-72,1,,Stability diff --git a/scripts/scoring_DMS_zero_shot/scoring_ESCOTT_substitutions.sh b/scripts/scoring_DMS_zero_shot/scoring_ESCOTT_substitutions.sh new file mode 100644 index 0000000..625acc6 --- /dev/null +++ b/scripts/scoring_DMS_zero_shot/scoring_ESCOTT_substitutions.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# scoring_ESCOTT_substitutions.sh +# +# Description: +# This script runs the ESCOTT prediction in a Docker container. It ensures that all necessary +# dependencies are available and sets up the environment for running the ESCOTT prediction. +# The script mounts the required directories into the Docker container, executes the prediction, +# and saves the output files. +# +# Prerequisites: +# - Docker must be installed on the host machine. +# - The user must have permission to run Docker commands. +# - The Docker image for ESCOTT with all dependencies must be pulled from Docker Hub using the following command: +# docker pull tekpinar/prescott-docker:v1.6.0 +# +# Note: +# The generated prediction files in the Docker container will have root ownership. After the execution, +# change the ownership of the files to the current user with the following command: +# sudo chown -R $USER:$USER +# + +# Source the configuration file +source ../zero_shot_config.sh + +export DMS_index="Experiment index to run (e.g. 0, 1,...216)" +export DMS_output_score_folder="${DMS_output_score_folder_subs}/ESCOTT/" + +# temporary folder for intermediate files generated by ESCOTT +# This folder will be removed after the execution +export TEMP_FOLDER="./escott_tmp/" + +# Docker image name +export DOCKER_IMAGE="tekpinar/prescott-docker:v1.6.0" + +# using default NSEQS=40000 for DMS_index=147 returns all zeros due to very low +# sequence variation in the first portion of the MSA +if [ $DMS_index -eq 147 ]; then + export NSEQS=182169 +else + export NSEQS=40000 +fi + +# Remove temporary folder on exit +trap 'rm -rf $TEMP_FOLDER' EXIT + +# Run the compute_fitness.py script in the Docker container +docker run --rm \ +-v $(dirname $(dirname $(dirname $(realpath $0)))):/root/ProteinGym \ +-v $(realpath $DMS_reference_file_path_subs):/root/DMS_substitutions.csv \ +-v $(realpath $DMS_data_folder_subs):/root/DMS_data \ +-v $(realpath $DMS_MSA_data_folder):/root/MSA_data \ +-v $(realpath $DMS_structure_folder):/root/structure_data \ +-v $(realpath $DMS_output_score_folder):/root/output_scores \ +-w /root/ProteinGym/scripts/scoring_DMS_zero_shot \ +$DOCKER_IMAGE \ +bash -c " +python3 -u ../../proteingym/baselines/escott/compute_fitness.py \ +--DMS_index=$DMS_index \ +--DMS_reference_file_path=/root/DMS_substitutions.csv \ +--DMS_data_folder=/root/DMS_data \ +--MSA_folder=/root/MSA_data \ +--structure_data_folder=/root/structure_data \ +--output_scores_folder=/root/output_scores \ +--temp_folder=$TEMP_FOLDER \ +--nseqs=$NSEQS"