From 79773021b8b43841b46f46d250b212e48fde6c0d Mon Sep 17 00:00:00 2001
From: gwirn <71886945+gwirn@users.noreply.github.com>
Date: Wed, 5 Jun 2024 15:08:24 +0200
Subject: [PATCH] updated examples for new PDBData class

---
 examples/README.md                         |   1 +
 examples/analysis_example.py               |   9 +-
 examples/bb_example_subclassing_trainer.py | 168 ++++++++++++---------
 examples/bb_foldingnet_basic.py            |  48 +++---
 examples/bb_many_subclassing_examples.py   |  20 +--
 examples/prepare_example.py                |   2 +-
 src/molearn/data/prepare.py                |   6 +-
 7 files changed, 148 insertions(+), 106 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 68ea172..a6ade67 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -10,6 +10,7 @@ https://journals.aps.org/prx/abstract/10.1103/PhysRevX.11.011052).
 #### Training data
 
 The files `MurD_closed.tar.gz` and `MurD_open.tar.gz` contain 900 conformations each of MurD, generated with MD simulations of its closed and open state. Extracting these files will yield `MurD_closed_selection.pdb` and `MurD_open_selection.pdb`.
+To use them as training data, run `prepare_example.py` to obtain a joined and prepared trajectory.
 
 #### Test data
 
diff --git a/examples/analysis_example.py b/examples/analysis_example.py
index cb3114a..fa2568c 100644
--- a/examples/analysis_example.py
+++ b/examples/analysis_example.py
@@ -1,5 +1,8 @@
 import torch
 import os
+# import sys
+
+# sys.path.insert(0, os.path.join(os.path.abspath(os.pardir), "src"))
 from molearn.models.foldingnet import AutoEncoder
 from molearn.analysis import MolearnAnalysis
 from molearn.data import PDBData
@@ -40,8 +43,10 @@ def main():
     # by defining the manual seed and loading the dataset in the same order as when
     # the neural network was trained, the same train-test split will be obtained
     data = PDBData()
-    data.import_pdb(f"data{os.sep}MurD_closed_selection.pdb")
-    data.import_pdb(f"data{os.sep}MurD_open_selection.pdb")
+    data.import_pdb(
+        "./clustered/MurD_open_selection_CLUSTER_aggl_train.dcd",
+        "./clustered/MurD_open_selection_NEW_TOPO.pdb",
+    )
     data.fix_terminal()
     data.atomselect(atoms=["CA", "C", "N", "CB", "O"])
     data.prepare_dataset()
diff --git a/examples/bb_example_subclassing_trainer.py b/examples/bb_example_subclassing_trainer.py
index 0555359..56b3887 100644
--- a/examples/bb_example_subclassing_trainer.py
+++ b/examples/bb_example_subclassing_trainer.py
@@ -1,5 +1,7 @@
-import sys, os
-sys.path.insert(0, os.path.join(os.path.abspath(os.pardir),'src'))
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.abspath(os.pardir), "src"))
 from molearn.data import PDBData
 from molearn.trainers import OpenMM_Physics_Trainer
 from molearn.models.foldingnet import AutoEncoder
@@ -10,122 +12,144 @@ class CustomTrainer(OpenMM_Physics_Trainer):
-#### All commented out sections are not needed, they are there for demonstration purposes
+    #### All commented-out sections are not needed; they are there for demonstration purposes
 ### This is what common_step looks like in Trainer ###
-# def common_step(self, batch):
-#     self._internal = {}
-#     encoded = self.autoencoder.encode(batch)
-#     self._internal['encoded'] = encoded
-#     decoded = self.autoencoder.decode(encoded)[:,:,:batch.size(2)]
-#     self._internal['decoded'] = decoded
-#     return dict(mse_loss = ((batch-decoded)**2).mean())
+    # def common_step(self, batch):
+    #     self._internal = {}
+    #     encoded = self.autoencoder.encode(batch)
+    #     self._internal['encoded'] = encoded
+    #     decoded = self.autoencoder.decode(encoded)[:,:,:batch.size(2)]
+    #     self._internal['decoded'] = decoded
+    #     return dict(mse_loss = ((batch-decoded)**2).mean())
 
 ### This is what common_physics_step looks like in OpenMM_Physics_Trainer ###
-# def common_physics_step(self, batch, latent):
-#     alpha = torch.rand(int(len(batch)//2), 1, 1).type_as(latent)
-#     latent_interpolated = (1-alpha)*latent[:-1:2] + alpha*latent[1::2]
-#     generated = self.autoencoder.decode(latent_interpolated)[:,:,:batch.size(2)]
-#     self._internal['generated'] = generated
-#     energy = self.physics_loss(generated)
-#     energy[energy.isinf()]=1e35
-#     energy = torch.clamp(energy, max=1e34)
-#     energy = energy.nanmean()
-#     return {'physics_loss':energy}#a if not energy.isinf() else torch.tensor(0.0)}
+    # def common_physics_step(self, batch, latent):
+    #     alpha = torch.rand(int(len(batch)//2), 1, 1).type_as(latent)
+    #     latent_interpolated = (1-alpha)*latent[:-1:2] + alpha*latent[1::2]
+    #     generated = self.autoencoder.decode(latent_interpolated)[:,:,:batch.size(2)]
+    #     self._internal['generated'] = generated
+    #     energy = self.physics_loss(generated)
+    #     energy[energy.isinf()]=1e35
+    #     energy = torch.clamp(energy, max=1e34)
+    #     energy = energy.nanmean()
+    #     return {'physics_loss':energy}#a if not energy.isinf() else torch.tensor(0.0)}
 
 ### This is what valid_step looks like in OpenMM_Physics_Trainer ###
-# def valid_step(self, batch):
-#     results = self.common_step(batch)
-#     results.update(self.common_physics_step(batch, self._internal['encoded']))
-#     scale = (self.psf*results['mse_loss'])/(results['physics_loss'] +1e-5)
-#     final_loss = torch.log(results['mse_loss'])+scale*torch.log(results['physics_loss']
-#     results['loss'] = final_loss
-#     return results
+    # def valid_step(self, batch):
+    #     results = self.common_step(batch)
+    #     results.update(self.common_physics_step(batch, self._internal['encoded']))
+    #     scale = (self.psf*results['mse_loss'])/(results['physics_loss'] +1e-5)
+    #     final_loss = torch.log(results['mse_loss'])+scale*torch.log(results['physics_loss'])
+    #     results['loss'] = final_loss
+    #     return results
 
     def valid_step(self, batch):
-        results = super().valid_step(batch)
-        #rmsd
-        rmsd = (((batch-self._internal['decoded'])*self.std)**2).sum(dim=1).mean().sqrt()
-        results['RMSD'] = rmsd # 'valid_' will automatically be prepended onto this in valid_epoch, to distinguish it from train_step
+        results = super().valid_step(batch)
+        # rmsd
+        rmsd = (
+            (((batch - self._internal["decoded"]) * self.std) ** 2)
+            .sum(dim=1)
+            .mean()
+            .sqrt()
+        )
+        results["RMSD"] = (
+            rmsd  # 'valid_' will automatically be prepended onto this in valid_epoch, to distinguish it from train_step
+        )
 
-        #calculate some dope
+        # calculate some dope
         if self.first_valid_step:
             self.first_valid_step = False
-            if not hasattr(self, 'dope_score_class'):
-                self.dope_score_class = Parallel_DOPE_Score(self.mol,processes=torch.get_num_threads())
-            #Calculated dope of decoded structures
+            if not hasattr(self, "dope_score_class"):
+                self.dope_score_class = Parallel_DOPE_Score(
+                    self.mol, processes=torch.get_num_threads()
+                )
+            # Calculate dope of decoded structures
             self.dope_scores = []
-            decoded_batch = (self._internal['decoded'].permute(0,2,1)*self.std).data.cpu().numpy()
+            decoded_batch = (
+                (self._internal["decoded"].permute(0, 2, 1) * self.std)
+                .data.cpu()
+                .numpy()
+            )
             for f in decoded_batch:
                 if np.isfinite(f).all():
-                    self.dope_scores.append(self.dope_score_class.get_score(f,refine='both'))
+                    self.dope_scores.append(
+                        self.dope_score_class.get_score(f, refine="both")
+                    )
 
-            #Calcutate dope of interpolated/generated structures
+            # Calculate dope of interpolated/generated structures
             self.interp_dope_scores = []
-            interpolated_batch = (self._internal['generated'].permute(0,2,1)*self.std).data.cpu().numpy()
+            interpolated_batch = (
+                (self._internal["generated"].permute(0, 2, 1) * self.std)
+                .data.cpu()
+                .numpy()
+            )
             for f in interpolated_batch:
                 if np.isfinite(f).all():
-                    self.interp_dope_scores.append(self.dope_score_class.get_score(f,refine='both'))
+                    self.interp_dope_scores.append(
+                        self.dope_score_class.get_score(f, refine="both")
+                    )
         # These will calculate in the background, synchronize at the end of the epoch.
         return results
 
     def valid_epoch(self, *args, **kwargs):
-        self.first_valid_step = self.epoch%5==0
+        self.first_valid_step = self.epoch % 5 == 0
         results = super().valid_epoch(*args, **kwargs)
         # Might as well keep track of cuda memory once an epoch
-        memory = torch.cuda.max_memory_allocated()/1000000.0
-        results['Memory'] = memory
+        memory = torch.cuda.max_memory_allocated() / 1000000.0
+        results["Memory"] = memory
 
-        if self.epoch%5==0:
+        if self.epoch % 5 == 0:
             t1 = time()
-            #self.dope_scores contains multiprocessing result objects, get the results
-            #This will synchronize the code
+            # self.dope_scores contains multiprocessing result objects, get the results
+            # This will synchronize the code
             dope = np.array([r.get() for r in self.dope_scores])
             idope = np.array([r.get() for r in self.interp_dope_scores])
-            #Dope score returns (DOPE_score, refined_DOPE_score), might as well log both
-            results['valid_DOPE'] = dope[:,0].mean()
-            results['valid_DOPE_refined'] = dope[:,1].mean()
-            results['valid_DOPE_interp'] = idope[:,0].mean()
-            results['valid_DOPE_interp_refined'] = idope[:,1].mean()
-            results['valid_DOPE_time'] = time()-t1 # extra time taken to calculate DOPE
+            # Dope score returns (DOPE_score, refined_DOPE_score), might as well log both
+            results["valid_DOPE"] = dope[:, 0].mean()
+            results["valid_DOPE_refined"] = dope[:, 1].mean()
+            results["valid_DOPE_interp"] = idope[:, 0].mean()
+            results["valid_DOPE_interp_refined"] = idope[:, 1].mean()
+            results["valid_DOPE_time"] = (
+                time() - t1
+            )  # extra time taken to calculate DOPE
         return results
 
-
-if __name__ == '__main__':
-
+if __name__ == "__main__":
     ##### Load Data #####
     data = PDBData()
-    data.import_pdb('data/MurD_closed_selection.pdb')
-    data.import_pdb('data/MurD_open_selection.pdb')
+    data.import_pdb(
+        "./clustered/MurD_open_selection_CLUSTER_aggl_train.dcd",
+        "./clustered/MurD_open_selection_NEW_TOPO.pdb",
+    )
     data.fix_terminal()
-    data.atomselect(atoms = ['CA', 'C', 'N', 'CB', 'O'])
+    data.atomselect(atoms=["CA", "C", "N", "CB", "O"])
 
     ##### Prepare Trainer #####
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     trainer = CustomTrainer(device=device)
-    trainer.set_data(data, batch_size=8, validation_split=0.1, manual_seed = 25)
-    trainer.prepare_physics(remove_NB = True)
-
-    trainer.set_autoencoder(AutoEncoder, out_points = data.dataset.shape[-1])
-    trainer.prepare_optimiser()
+    trainer.set_data(data, batch_size=8, validation_split=0.1, manual_seed=25)
+    trainer.prepare_physics(remove_NB=True)
+    trainer.set_autoencoder(AutoEncoder, out_points=data.dataset.shape[-1])
+    trainer.prepare_optimiser()
 
     ##### Training Loop #####
-    #Keep training until loss does not improve for 32 consecutive epochs
+    # Keep training until loss does not improve for 32 consecutive epochs
     runkwargs = dict(
-        log_filename='log_file.dat',
-        log_folder='xbb_foldingnet_checkpoints',
-        checkpoint_folder='xbb_foldingnet_checkpoints',
-        )
+        log_filename="log_file.dat",
+        log_folder="xbb_foldingnet_checkpoints",
+        checkpoint_folder="xbb_foldingnet_checkpoints",
+    )
     best = 1e24
     while True:
-        trainer.run(max_epochs = 32+trainer.epoch,**runkwargs)
-        if not best>trainer.best:
+        trainer.run(max_epochs=32 + trainer.epoch, **runkwargs)
+        if not best > trainer.best:
             break
         best = trainer.best
-    print(f'best {trainer.best}, best_filename {trainer.best_name}')
+    print(f"best {trainer.best}, best_filename {trainer.best_name}")
diff --git a/examples/bb_foldingnet_basic.py b/examples/bb_foldingnet_basic.py
index 8f620fe..5dea4f5 100644
--- a/examples/bb_foldingnet_basic.py
+++ b/examples/bb_foldingnet_basic.py
@@ -1,44 +1,50 @@
-import sys, os
-sys.path.insert(0, os.path.join(os.path.abspath(os.pardir),'src'))
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.abspath(os.pardir), "src"))
 from molearn.data import PDBData
 from molearn.trainers import OpenMM_Physics_Trainer
 from molearn.models.foldingnet import AutoEncoder
 import torch
 
-if __name__ == '__main__':
-
+def main():
     ##### Load Data #####
     data = PDBData()
-    data.import_pdb('data/MurD_closed_selection.pdb')
-    data.import_pdb('data/MurD_open_selection.pdb')
+    data.import_pdb(
+        "./clustered/MurD_open_selection_CLUSTER_aggl_train.dcd",
+        "./clustered/MurD_open_selection_NEW_TOPO.pdb",
+    )
     data.fix_terminal()
-    data.atomselect(atoms = ['CA', 'C', 'N', 'CB', 'O'])
+    data.atomselect(atoms=["CA", "C", "N", "CB", "O"])
 
     ##### Prepare Trainer #####
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     trainer = OpenMM_Physics_Trainer(device=device)
-    trainer.set_data(data, batch_size=8, validation_split=0.1, manual_seed = 25)
-    trainer.prepare_physics(remove_NB = True)
-
-    trainer.set_autoencoder(AutoEncoder, out_points = data.dataset.shape[-1])
-    trainer.prepare_optimiser()
+    trainer.set_data(data, batch_size=8, validation_split=0.1, manual_seed=25)
+    trainer.prepare_physics(remove_NB=True)
+    trainer.set_autoencoder(AutoEncoder, out_points=data.dataset.shape[-1])
+    trainer.prepare_optimiser()
 
     ##### Training Loop #####
-    #Keep training until loss does not improve for 32 consecutive epochs
+    # Keep training until loss does not improve for 32 consecutive epochs
     runkwargs = dict(
-        log_filename='log_file.dat',
-        log_folder='xbb_foldingnet_checkpoints',
-        checkpoint_folder='xbb_foldingnet_checkpoints',
-        )
+        log_filename="log_file.dat",
+        log_folder="xbb_foldingnet_checkpoints",
+        checkpoint_folder="xbb_foldingnet_checkpoints",
+    )
     best = 1e24
     while True:
-        trainer.run(max_epochs = 32+trainer.epoch,**runkwargs)
-        if not best>trainer.best:
+        trainer.run(max_epochs=32 + trainer.epoch, **runkwargs)
+        if not best > trainer.best:
             break
         best = trainer.best
-    print(f'best {trainer.best}, best_filename {trainer.best_name}')
+    print(f"best {trainer.best}, best_filename {trainer.best_name}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/bb_many_subclassing_examples.py b/examples/bb_many_subclassing_examples.py
index 74cc2a0..05b1897 100644
--- a/examples/bb_many_subclassing_examples.py
+++ b/examples/bb_many_subclassing_examples.py
@@ -15,14 +15,14 @@ class ValidRMSDTrainer(OpenMM_Physics_Trainer):
     '''
     def valid_step(self, batch):
         results = super().valid_step(batch)
-        #rmsd 
+        #rmsd
         rmsd = (((batch-self._internal['decoded'])*self.std)**2).sum(dim=1).mean().sqrt()
         results['RMSD'] = rmsd # 'valid_' will automatically be prepended onto this in trainer.valid_epoch, to distinguish it from train_step
         return results
 
 class ValidDOPETrainer(OpenMM_Physics_Trainer):
     '''
-    calculate DOPE, this might slow your code down significantly so keep an eye on the 'valid_DOPE_extra_time' 
+    calculate DOPE; this might slow your code down significantly, so keep an eye on the 'valid_DOPE_extra_time'
     '''
     def __init__(self,calculate_dope_every_n = 4, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -30,7 +30,7 @@ def __init__(self,calculate_dope_every_n = 4, *args, **kwargs):
 
     def valid_epoch(self, *args, **kwargs):
         '''
-        Override valid_epoch, (make sure to call super().valid_epoch(*args, **kwargs) to keep default behavior too). 
+        Override valid_epoch (make sure to call super().valid_epoch(*args, **kwargs) to keep default behavior too).
         '''
         # DOPE Calculations are slow, so you probably won't want to calculate them every epoch. Instead every 4 to 16 epochs.
         self.calculate_dope = self.epoch%self.calculate_dope_every==0
@@ -50,7 +50,7 @@ def valid_epoch(self, *args, **kwargs):
 
     def valid_step(self, batch):
         '''
-        Keep default behavior by calling super().valid_step(batch), submit jobs to a multiprocessing pool with Parallel_DOPE_Score, then get results back at the end of the epoch in trainer.valid_epoch 
+        Keep default behavior by calling super().valid_step(batch), submit jobs to a multiprocessing pool with Parallel_DOPE_Score, then get results back at the end of the epoch in trainer.valid_epoch
         '''
         results = super().valid_step(batch)
         if not hasattr(self, 'dope_score_class'):
@@ -79,7 +79,7 @@ def valid_epoch(self, *args, **kwargs):
         torch.cuda.reset_max_memory_allocated()
         results['Memory'] = memory
         return results
-    
+
 class DisablePhysicsTrainer(OpenMM_Physics_Trainer):
     '''
     Disable Physics in train_step. This essentially resets the train_step to that of Molearn.trainers.Trainer.
@@ -100,7 +100,7 @@ def train_step(self, batch):
 
 class CalculateDecodedEnergyTrainer(OpenMM_Physics_Trainer):
     '''
-    OpenMM_Physics_Trainer calculates the energy off interpolated structures. It might be helpful to track the energy of decoded structures too. 
+    OpenMM_Physics_Trainer calculates the energy of interpolated structures. It might be helpful to track the energy of decoded structures too.
     '''
     def common_physics_step(self, batch, latent):
         # calculate interpolated structures energy as normal
@@ -124,8 +124,10 @@ def common_physics_step(self, batch, latent):
 
     ##### Load Data #####
     data = PDBData()
-    data.import_pdb('data/MurD_closed_selection.pdb')
-    data.import_pdb('data/MurD_open_selection.pdb')
+    data.import_pdb(
+        "./clustered/MurD_open_selection_CLUSTER_aggl_train.dcd",
+        "./clustered/MurD_open_selection_NEW_TOPO.pdb",
+    )
     data.fix_terminal()
     data.atomselect(atoms = ['CA', 'C', 'N', 'CB', 'O'])
 
@@ -135,7 +137,7 @@ def common_physics_step(self, batch, latent):
     trainer.set_data(data, batch_size=8, validation_split=0.1, manual_seed = 25)
     trainer.prepare_physics(remove_NB = True)
-    
+
     trainer.set_autoencoder(AutoEncoder, out_points = data.dataset.shape[-1])
     trainer.prepare_optimiser()
 
diff --git a/examples/prepare_example.py b/examples/prepare_example.py
index 266afd7..6e4dfca 100644
--- a/examples/prepare_example.py
+++ b/examples/prepare_example.py
@@ -21,7 +21,7 @@ def main():
             "./data/preparation/topo_MurDclosed1F.pdb",
         ],
         test_size=0.0,
-        n_cluster=5,
+        n_cluster=1500,
         outpath=storage_path,
         verbose=True,
     )
diff --git a/src/molearn/data/prepare.py b/src/molearn/data/prepare.py
index b5ea9b9..8a43903 100644
--- a/src/molearn/data/prepare.py
+++ b/src/molearn/data/prepare.py
@@ -142,7 +142,9 @@ def read_traj(self) -> None:
             ), "there must be as many topologies supplied as trajectories"
         multi_traj = []
         top0 = None
+        ucell0 = None
         for ct, (trp, top) in enumerate(zip(self.traj_path, self.topo_path)):
+            print(f"\tLoading Trajectory {ct}")
             loaded = None
             try:
                 # do not enforce topology file on these formats
@@ -155,15 +157,17 @@ def read_traj(self) -> None:
                 loaded = self._loading_fallback(trp, top)
             # select only protein atoms from the trajectory
             loaded = loaded.atom_slice(loaded.top.select("protein"))
-            # to be able to join them
+            # use topology and unit cell from first trajectory to be able to join them
             if ct == 0:
                 top0 = loaded.topology
+                ucell0 = loaded.unitcell_vectors
             else:
                 if top0.n_atoms != loaded.n_atoms:
                     raise ValueError(
                         f"topologies do not match - topology [{ct}] has {loaded.n_atoms} instead of {top0.n_atoms}"
                     )
                 loaded.topology = top0
+                loaded.unitcell_vectors = ucell0
             multi_traj.append(loaded)
         self.traj = md.join(multi_traj)
         # converts ELEMENT names from e.g. "Cd" -> "C" to avoid later complications
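
Below is a minimal sketch of the two-step workflow these updated examples assume: first join, cluster, and prepare the raw MurD trajectories with `examples/prepare_example.py`, then load the clustered trajectory plus its topology through the new two-argument `PDBData.import_pdb`. It only reuses calls that appear in the diff above; the `./clustered/` file names are assumptions carried over from the examples and depend on how `DataPreparation` is configured.

```python
# Minimal sketch, not part of the patch. Step 1: run examples/prepare_example.py,
# which is assumed to write a clustered training trajectory and a matching
# topology into ./clustered/ (file names below are assumed outputs).
# Step 2: load them with the new PDBData API (trajectory file + topology file).
from molearn.data import PDBData

data = PDBData()
data.import_pdb(
    "./clustered/MurD_open_selection_CLUSTER_aggl_train.dcd",  # clustered trajectory
    "./clustered/MurD_open_selection_NEW_TOPO.pdb",            # matching topology
)
data.fix_terminal()  # fix terminal residue names before atom selection
data.atomselect(atoms=["CA", "C", "N", "CB", "O"])  # backbone plus CB
data.prepare_dataset()  # called explicitly in analysis_example.py
```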