Problem with the max_training_graph_size setting
@htorres noticed that if max_training_graph_size is set too low, the first graphs are skipped by Lightning and the loss is set to NaN, which acorn does not handle well once it finally gets a graph of size < max_training_graph_size that it can actually run on.
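For illustration, here is a minimal sketch of the pattern that seems to trigger the error. This is not acorn's actual code: the toy model and the None-return skip below are assumptions, standing in for whatever acorn/Lightning does when it drops an oversized graph. When training_step skips a batch, Lightning runs no backward/optimizer step for it; under DDP with static_graph=True, the autograd graph recorded on the first iteration then differs from the one used once a small enough graph arrives, which is exactly what the RuntimeError below complains about.

```python
# Hypothetical, simplified sketch -- not acorn's implementation.
# It only reproduces the batch-skipping pattern that appears to clash with
# DDP(static_graph=True): oversized graphs are skipped, so the autograd graph
# recorded on the first DDP iteration differs from later iterations.
import torch
import pytorch_lightning as pl


class ToyGraphModel(pl.LightningModule):
    def __init__(self, max_training_graph_size: int):
        super().__init__()
        self.max_training_graph_size = max_training_graph_size
        self.net = torch.nn.Linear(8, 1)  # stand-in for the real GNN

    def training_step(self, batch, batch_idx):
        x = batch  # pretend each batch is one "graph" of shape [n_nodes, 8]
        if x.shape[0] > self.max_training_graph_size:
            # Graph too large: returning None makes Lightning skip the
            # backward and optimizer step for this batch entirely.
            return None
        return self.net(x).mean()

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=1e-3)


# Sketch of how it would be run (multi-GPU DDP with a static graph), e.g.:
#   trainer = pl.Trainer(
#       accelerator="gpu", devices=2,
#       strategy=pl.strategies.DDPStrategy(static_graph=True),
#   )
#   trainer.fit(ToyGraphModel(max_training_graph_size=100), train_dataloader)
# If the first graph(s) exceed the threshold and are skipped, the backward
# pass on the first graph that is *not* skipped raises the RuntimeError below.
```

Presumably the mismatch would not occur if no graph were skipped (i.e. max_training_graph_size large enough), which matches what @htorres observed.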
Here is the last part of the error:

```
self._run(model, ckpt_path=self.ckpt_path)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1112, in _run
results = self._run_stage()
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1191, in _run_stage
self._run_train()
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1214, in _run_train
self.fit_loop.run()
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 267, in advance
self._outputs = self.epoch_loop.run(self._data_fetcher)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 213, in advance
batch_output = self.batch_loop.run(kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(optimizers, kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py", line 199, in run
self.advance(*args, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 202, in advance
result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 249, in _run_optimization
self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 370, in _optimizer_step
self.trainer._call_lightning_module_hook(
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1356, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/core/module.py", line 1742, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py", line 169, in step
step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp.py", line 280, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 234, in optimizer_step
return self.precision_plugin.optimizer_step(
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 119, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/torch/optim/lr_scheduler.py", line 65, in wrapper
return wrapped(*args, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/torch/optim/optimizer.py", line 113, in wrapper
return func(*args, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/torch/optim/adamw.py", line 119, in step
loss = closure()
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 105, in _wrap_closure
closure_result = closure()
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 149, in __call__
self._result = self.closure(*args, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 144, in closure
self._backward_fn(step_output.closure_loss)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 305, in backward_fn
self.trainer._call_strategy_hook("backward", loss, optimizer, opt_idx)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1494, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 207, in backward
self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, optimizer_idx, *args, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 67, in backward
model.backward(tensor, optimizer, optimizer_idx, *args, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/pytorch_lightning/core/module.py", line 1486, in backward
loss.backward(*args, **kwargs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/torch/_tensor.py", line 396, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/torch/autograd/function.py", line 253, in apply
return user_fn(self, *args)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/torch/utils/checkpoint.py", line 146, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/sps/l2it/CommonSoftware/conda/envs/acorn/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: Your training graph has changed in this iteration, e.g., one parameter is unused in first iteration, but then got used in the second iteration. this is not compatible with static_graph set to True.
```