Running hyperparameter tuning with syne-tune (scheduler: HyperTune, trial backend: PythonBackend) on a GPT-based PyTorch model leads to an error: "ValueError: array must not contain infs or NaNs". The error occurs irregularly after several successful trials (sometimes after just a few, sometimes after hundreds of trials).
The model is a custom spatiotemporal Transformer, and I ran other architectures with the same code that did not fail. It also can't be a CPU/GPU problem, as I tried both with the same result. I suppose it has something to do with the output of my model that gets reported to the tuner (MSELoss on the validation data), but I can't see a difference from the other models. Maybe somebody has encountered a similar problem and solved it!
Full error message:
...
val_loss: best 0.024526135064661503 for trial-id 5
--------------------
Traceback (most recent call last):
File "/gpfs/gpfs1/scratch/c7161037/traceve-models/opt_model/run_ept_syne.py", line 172, in <module>
result = main(max_wallclock_time=8*60*60, max_steps=120, n_workers=8, exp_tag="ept-tser-vpo-all-2")
File "/gpfs/gpfs1/scratch/c7161037/traceve-models/opt_model/run_ept_syne.py", line 50, in main
tuner.run()
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/tuner.py", line 342, in run
raise e
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/tuner.py", line 321, in run
self._schedule_new_tasks(running_trials_ids=running_trials_ids)
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/tuner.py", line 495, in _schedule_new_tasks
trial = self._schedule_new_task()
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/tuner.py", line 511, in _schedule_new_task
suggestion = self.scheduler.suggest(trial_id=self.trial_backend.new_trial_id())
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/scheduler_searcher.py", line 64, in suggest
return super().suggest(trial_id)
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/scheduler.py", line 157, in suggest
ret_val = self._suggest(trial_id)
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/fifo.py", line 289, in _suggest
config = self.searcher.get_config(**extra_kwargs, trial_id=trial_id)
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/searchers/model_based_searcher.py", line 405, in get_config
config = self._get_config_modelbased(
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/searchers/model_based_searcher.py", line 622, in _get_config_modelbased
predictor = self.state_transformer.fit()
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/searchers/bayesopt/models/model_transformer.py", line 209, in fit
self._compute_predictor(skip_optimization=skip_optimization)
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/searchers/bayesopt/models/model_transformer.py", line 385, in _compute_predictor
output_predictors[output_name] = self._estimator[
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/searchers/bayesopt/models/gp_model.py", line 248, in fit_from_state
self._posterior_for_state(no_pending_state, update_params=update_params)
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/searchers/bayesopt/models/gp_model.py", line 309, in _posterior_for_state
self._gpmodel.fit(data)
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/searchers/bayesopt/gpautograd/hypertune/gp_model.py", line 228, in fit
super().fit(data)
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/searchers/bayesopt/gpautograd/gp_model.py", line 275, in fit
self._recompute_states(data)
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/searchers/bayesopt/gpautograd/gp_model.py", line 288, in _recompute_states
self._states = [self.likelihood.get_posterior_state(data)]
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/searchers/bayesopt/gpautograd/hypertune/likelihood.py", line 89, in get_posterior_state
return HyperTuneIndependentGPPosteriorState(
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/searchers/bayesopt/gpautograd/hypertune/posterior_state.py", line 125, in __init__
super().__init__(
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/searchers/bayesopt/gpautograd/independent/posterior_state.py", line 86, in __init__
self._compute_states(
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/searchers/bayesopt/gpautograd/independent/posterior_state.py", line 121, in _compute_states
self._states[resource] = GaussProcPosteriorState(
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/searchers/bayesopt/gpautograd/posterior_state.py", line 165, in __init__
self.chol_fact, self.pred_mat = cholesky_computations(
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/syne_tune/optimizer/schedulers/searchers/bayesopt/gpautograd/posterior_utils.py", line 88, in cholesky_computations
pred_mat = aspl.solve_triangular(chol_fact, centered_y, lower=True)
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/autograd/tracer.py", line 48, in f_wrapped
return f_raw(*args, **kwargs)
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/scipy/linalg/_basic.py", line 337, in solve_triangular
b1 = _asarray_validated(b, check_finite=check_finite)
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/scipy/_lib/_util.py", line 306, in _asarray_validated
a = toarray(a)
File "/home/c716/c7161037/.conda/envs/tune_v3/lib/python3.10/site-packages/numpy/lib/function_base.py", line 630, in asarray_chkfinite
raise ValueError(
ValueError: array must not contain infs or NaNs
Edit: The error occurs only when max_steps, which is passed to the scheduler via max_resource_attr, is set to more than 100. If I use only, e.g., 80 steps, the tuner runs for the desired max_wallclock_time.