cannot use cpu? #46

Open
SarahAsbury opened this issue Jun 14, 2024 · 0 comments
SarahAsbury commented Jun 14, 2024

st.train_and_fit(
    callbacks=[cb_early_stopping],
    accelerator = "cpu",
    logger=[log_tb],
)

Generates error:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

Full error:


GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

| Name | Type | Params


0 Trainable params
0 Non-trainable params
0 Total params
0.000 Total estimated model params size (MB)
Epoch 0: 0%| | 0/27 [00:00<?, ?it/s]

RuntimeError Traceback (most recent call last)
Cell In[33], line 3
1 ## train ST
----> 3 st.train_and_fit(
4 callbacks=[cb_early_stopping],
5 accelerator = "cpu",
6 logger=[log_tb],
7 )

File ~/.virtualenvs/starling/lib/python3.10/site-packages/starling/starling.py:353, in ST.train_and_fit(self, accelerator, strategy, devices, num_nodes, precision, logger, callbacks, fast_dev_run, max_epochs, min_epochs, max_steps, min_steps, max_time, limit_train_batches, limit_val_batches, limit_test_batches, limit_predict_batches, overfit_batches, val_check_interval, check_val_every_n_epoch, num_sanity_val_steps, log_every_n_steps, enable_checkpointing, enable_progress_bar, enable_model_summary, accumulate_grad_batches, gradient_clip_val, gradient_clip_algorithm, deterministic, benchmark, inference_mode, use_distributed_sampler, profiler, detect_anomaly, barebones, plugins, sync_batchnorm, reload_dataloaders_every_n_epochs, default_root_dir)
349 _locals.pop("self")
351 trainer = pl.Trainer(**_locals)
--> 353 trainer.fit(self)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py:545, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
543 self.state.status = TrainerStatus.RUNNING
544 self.training = True
--> 545 call._call_and_handle_interrupt(
546 self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
547 )

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py:44, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
42 if trainer.strategy.launcher is not None:
43 return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
---> 44 return trainer_fn(*args, **kwargs)
46 except _TunerExitException:
47 _call_teardown_hook(trainer)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py:581, in Trainer._fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
574 assert self.state.fn is not None
575 ckpt_path = self._checkpoint_connector._select_ckpt_path(
576 self.state.fn,
577 ckpt_path,
578 model_provided=True,
579 model_connected=self.lightning_module is not None,
580 )
--> 581 self._run(model, ckpt_path=ckpt_path)
583 assert self.state.stopped
584 self.training = False

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py:990, in Trainer._run(self, model, ckpt_path)
985 self._signal_connector.register_signal_handlers()
987 # ----------------------------
988 # RUN THE TRAINER
989 # ----------------------------
--> 990 results = self._run_stage()
992 # ----------------------------
993 # POST-Training CLEAN UP
994 # ----------------------------
995 log.debug(f"{self.__class__.__name__}: trainer tearing down")

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py:1036, in Trainer._run_stage(self)
1034 self._run_sanity_check()
1035 with torch.autograd.set_detect_anomaly(self._detect_anomaly):
-> 1036 self.fit_loop.run()
1037 return None
1038 raise RuntimeError(f"Unexpected state {self.state}")

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:202, in _FitLoop.run(self)
200 try:
201 self.on_advance_start()
--> 202 self.advance()
203 self.on_advance_end()
204 self._restarting = False

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:359, in _FitLoop.advance(self)
357 with self.trainer.profiler.profile("run_training_epoch"):
358 assert self._data_fetcher is not None
--> 359 self.epoch_loop.run(self._data_fetcher)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py:136, in _TrainingEpochLoop.run(self, data_fetcher)
134 while not self.done:
135 try:
--> 136 self.advance(data_fetcher)
137 self.on_advance_end(data_fetcher)
138 self._restarting = False

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py:240, in _TrainingEpochLoop.advance(self, data_fetcher)
237 with trainer.profiler.profile("run_training_batch"):
238 if trainer.lightning_module.automatic_optimization:
239 # in automatic optimization, there can only be one optimizer
--> 240 batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
241 else:
242 batch_output = self.manual_optimization.run(kwargs)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py:187, in _AutomaticOptimization.run(self, optimizer, batch_idx, kwargs)
180 closure()
182 # ------------------------------
183 # BACKWARD PASS
184 # ------------------------------
185 # gradient update with accumulated gradients
186 else:
--> 187 self._optimizer_step(batch_idx, closure)
189 result = closure.consume_result()
190 if result.loss is None:

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py:265, in _AutomaticOptimization._optimizer_step(self, batch_idx, train_step_and_backward_closure)
262 self.optim_progress.optimizer.step.increment_ready()
264 # model hook
--> 265 call._call_lightning_module_hook(
266 trainer,
267 "optimizer_step",
268 trainer.current_epoch,
269 batch_idx,
270 optimizer,
271 train_step_and_backward_closure,
272 )
274 if not should_accumulate:
275 self.optim_progress.optimizer.step.increment_completed()

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py:157, in _call_lightning_module_hook(trainer, hook_name, pl_module, *args, **kwargs)
154 pl_module._current_fx_name = hook_name
156 with trainer.profiler.profile(f"[LightningModule]{pl_module.__class__.__name__}.{hook_name}"):
--> 157 output = fn(*args, **kwargs)
159 # restore current_fx when nested context
160 pl_module._current_fx_name = prev_fx_name

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/core/module.py:1282, in LightningModule.optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure)
1243 def optimizer_step(
1244 self,
1245 epoch: int,
(...)
1248 optimizer_closure: Optional[Callable[[], Any]] = None,
1249 ) -> None:
1250 r"""Override this method to adjust the default way the :class:`~pytorch_lightning.trainer.trainer.Trainer` calls
1251 the optimizer.
1252
(...)
1280
1281 """
-> 1282 optimizer.step(closure=optimizer_closure)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py:151, in LightningOptimizer.step(self, closure, **kwargs)
148 raise MisconfigurationException("When optimizer.step(closure) is called, the closure should be callable")
150 assert self._strategy is not None
--> 151 step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
153 self._on_after_step()
155 return step_output

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py:230, in Strategy.optimizer_step(self, optimizer, closure, model, **kwargs)
228 # TODO(fabric): remove assertion once strategy's optimizer_step typing is fixed
229 assert isinstance(model, pl.LightningModule)
--> 230 return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py:117, in PrecisionPlugin.optimizer_step(self, optimizer, model, closure, **kwargs)
115 """Hook to run the optimizer step."""
116 closure = partial(self._wrap_closure, model, optimizer, closure)
--> 117 return optimizer.step(closure=closure, **kwargs)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/torch/optim/optimizer.py:113, in Optimizer._hook_for_profile.<locals>.profile_hook_step.<locals>.wrapper(*args, **kwargs)
111 profile_name = "Optimizer.step#{}.step".format(obj.__class__.__name__)
112 with torch.autograd.profiler.record_function(profile_name):
--> 113 return func(*args, **kwargs)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/torch/autograd/grad_mode.py:27, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs)
24 @functools.wraps(func)
25 def decorate_context(*args, **kwargs):
26 with self.clone():
---> 27 return func(*args, **kwargs)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/torch/optim/adam.py:118, in Adam.step(self, closure)
116 if closure is not None:
117 with torch.enable_grad():
--> 118 loss = closure()
120 for group in self.param_groups:
121 params_with_grad = []

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py:104, in PrecisionPlugin._wrap_closure(self, model, optimizer, closure)
91 def _wrap_closure(
92 self,
93 model: "pl.LightningModule",
94 optimizer: Optimizer,
95 closure: Callable[[], Any],
96 ) -> Any:
97 """This double-closure allows makes sure the closure is executed before the on_before_optimizer_step
98 hook is called.
99
(...)
102
103 """
--> 104 closure_result = closure()
105 self._after_closure(model, optimizer)
106 return closure_result

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py:140, in Closure.__call__(self, *args, **kwargs)
139 def __call__(self, *args: Any, **kwargs: Any) -> Optional[Tensor]:
--> 140 self._result = self.closure(*args, **kwargs)
141 return self._result.loss

File ~/.virtualenvs/starling/lib/python3.10/site-packages/torch/autograd/grad_mode.py:27, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs)
24 @functools.wraps(func)
25 def decorate_context(*args, **kwargs):
26 with self.clone():
---> 27 return func(*args, **kwargs)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py:126, in Closure.closure(self, *args, **kwargs)
124 @torch.enable_grad()
125 def closure(self, *args: Any, **kwargs: Any) -> ClosureResult:
--> 126 step_output = self._step_fn()
128 if step_output.closure_loss is None:
129 self.warning_cache.warn("training_step returned None. If this was on purpose, ignore this warning...")

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py:315, in _AutomaticOptimization._training_step(self, kwargs)
312 trainer = self.trainer
314 # manually capture logged metrics
--> 315 training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
316 self.trainer.strategy.post_training_step() # unused hook - call anyway for backward compatibility
318 return self.output_result_cls.from_training_step_output(training_step_output, trainer.accumulate_grad_batches)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py:309, in _call_strategy_hook(trainer, hook_name, *args, **kwargs)
306 return None
308 with trainer.profiler.profile(f"[Strategy]{trainer.strategy.__class__.__name__}.{hook_name}"):
--> 309 output = fn(*args, **kwargs)
311 # restore current_fx when nested context
312 pl_module._current_fx_name = prev_fx_name

File ~/.virtualenvs/starling/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py:382, in Strategy.training_step(self, *args, **kwargs)
380 if self.model != self.lightning_module:
381 return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs)
--> 382 return self.lightning_module.training_step(*args, **kwargs)

File ~/.virtualenvs/starling/lib/python3.10/site-packages/starling/starling.py:121, in ST.training_step(self, batch)
114 """Compute and return the training loss
115
116 :param batch: A list of tensors of size m x n
117
118 :returns: Total loss
119 """
120 # y, s, fy, fs, fl = batch
--> 121 model_nll, fake_loss, p_fake_singlet = self(batch)
123 # total loss
124 loss = model_nll + self.model_regularizer * fake_loss

File ~/.virtualenvs/starling/lib/python3.10/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []

File ~/.virtualenvs/starling/lib/python3.10/site-packages/starling/starling.py:94, in ST.forward(self, batch)
92 if self.model_cell_size:
93 y, s, fy, fs, fl = batch
---> 94 _, _, model_nll, _ = utility.compute_posteriors(
95 y, s, self.model_params, self.dist_option, self.model_zplane_overlap
96 )
97 _, _, _, p_fake_singlet = utility.compute_posteriors(
98 fy, fs, self.model_params, self.dist_option, self.model_zplane_overlap
99 )
100 else:

File ~/.virtualenvs/starling/lib/python3.10/site-packages/starling/utility.py:435, in compute_posteriors(Y, S, Theta, dist_option, model_overlap)
428 log_tau = torch.nn.functional.log_softmax(
429 Theta["is_tau"].reshape(-1), dim=0
430 ).reshape(
431 log_pi.shape[0], log_pi.shape[0]
432 ) ## CxC
433 log_delta = torch.nn.functional.log_softmax(Theta["is_delta"], dim=0) ## 2
--> 435 prob_y_given_z = compute_p_y_given_z(
436 Y, Theta, dist_option
437 ) ## log p(y_n|z=c) -> NxC
438 prob_data_given_z_d0 = (
439 prob_y_given_z + log_pi
440 ) ## log p(y_n|z=c) + log p(z=c) -> NxC + C -> NxC
442 if S is not None:

File ~/.virtualenvs/starling/lib/python3.10/site-packages/starling/utility.py:325, in compute_p_y_given_z(Y, Theta, dist_option)
322 else:
323 dist_Y = torch.distributions.StudentT(df=2, loc=mu, scale=sigma)
--> 325 return dist_Y.log_prob(Y.reshape(-1, 1, Y.shape[1])).sum(
326 2
327 )

File ~/.virtualenvs/starling/lib/python3.10/site-packages/torch/distributions/studentT.py:82, in StudentT.log_prob(self, value)
80 if self._validate_args:
81 self._validate_sample(value)
---> 82 y = (value - self.loc) / self.scale
83 Z = (self.scale.log() +
84 0.5 * self.df.log() +
85 0.5 * math.log(math.pi) +
86 torch.lgamma(0.5 * self.df) -
87 torch.lgamma(0.5 * (self.df + 1.)))
88 return -0.5 * (self.df + 1.) * torch.log1p(y**2. / self.df) - Z

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
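
Note on the failure point: the traceback ends in StudentT.log_prob, where the model parameters (mu/sigma built from Theta) are on cuda:0 while the batch tensor Y stays on the CPU because accelerator="cpu" was requested. A minimal workaround sketch, assuming the parameters only land on the GPU because CUDA is visible when the ST object is built (not confirmed against STARLING's internals), is to hide the GPU before anything touches torch:

import os

# Hypothetical workaround: hide all CUDA devices so torch.cuda.is_available()
# returns False and every tensor the model creates defaults to the CPU.
# This must run before torch initializes CUDA (ideally at the top of the script
# or notebook, before importing starling/torch).
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import torch
assert not torch.cuda.is_available()

# ...construct the ST object and call st.train_and_fit(accelerator="cpu", ...) as
# before; with no visible GPU, the parameters and the batches should share the
# same (CPU) device.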
