I'm training a deep neural network model in PyTorch and using Ray Tune for hyperparameter tuning. After training completes, I get FileNotFoundError: Could not fetch metrics for lambda_17db8_00000: both result.json and progress.csv were not found.
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import ray
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler
from ray.tune import CLIReporter
import shutil
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Set environment variables
os.environ["AIR_VERBOSITY"] = "2"
os.environ["RAY_AIR_NEW_OUTPUT"] = "1"
os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Split dataset into train, validation, and test sets
X_train, X_val, y_train, y_val = train_test_split(
    data, target,
    test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val, y_val, test_size=0.5, random_state=42
)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
# Convert to tensor
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)
# Define the DNN model
class DNNModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_layers, hidden_units, activation):
        super(DNNModel, self).__init__()
        self.input_size = input_size
        self.hidden_layers = hidden_layers
        self.hidden_units = hidden_units
        self.output_size = output_size
        self.activation = activation
        self.fc_layers = nn.ModuleList()
        in_features = input_size
        for _ in range(hidden_layers):
            self.fc_layers.append(nn.Linear(in_features, hidden_units))
            self.fc_layers.append(activation)
            in_features = hidden_units
        self.fc_layers.append(nn.Linear(in_features, output_size))

    def forward(self, x):
        x = x.view(-1, self.input_size)
        for layer in self.fc_layers:
            x = layer(x)
        return x
def model_training(config, data, target, test_loader, checkpoint_dir=None):
    # Instantiate the model with the given hyperparameters
    model = DNNModel(input_size=data.shape[1], hidden_layers=config["hidden_layers"],
                     hidden_units=config["hidden_units"], output_size=len(torch.unique(target)),
                     activation=config["activation"])
    # Move the model to the device
    model.to(device)
    print("Device Type:{0}".format(device))
    # Print the GPU name
    if torch.cuda.is_available():
        print("Model training on GPU: {}".format(next(model.parameters()).is_cuda))
        print("GPU Name:", torch.cuda.get_device_name(device))
    else:
        print("No GPU available.")
    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = getattr(optim, config["optimizer"])(model.parameters(), lr=config["learning_rate"])
    # Create data loaders
    train_dataset = TensorDataset(data, target)
    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    train_loss_history = []
    train_acc_history = []
    # Training loop
    for epoch in range(config["epochs"]):
        running_loss = 0.0
        correct = 0
        total = 0
        for batch in train_loader:
            inputs, labels = batch
            # Move data to the device
            inputs = inputs.to(device)
            labels = labels.to(device)
            # Zero the gradients
            optimizer.zero_grad()
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            # Compute training accuracy
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            # Accumulate loss statistics
            running_loss += loss.item()
        # Calculate average training loss and accuracy for the epoch
        epoch_train_loss = running_loss / len(train_loader)
        epoch_train_acc = correct / total
        # Save the history
        train_loss_history.append(epoch_train_loss)
        train_acc_history.append(epoch_train_acc)
        # Validation
        with torch.no_grad():
            val_outputs = model(test_loader.dataset.tensors[0].to(device))
            val_loss = criterion(val_outputs, test_loader.dataset.tensors[1].to(device))
            val_accuracy = (
                val_outputs.argmax(dim=1) == test_loader.dataset.tensors[1].to(device)).float().mean().item()
        # Print epoch progress
        print(f"Epoch [{epoch + 1}/{config['epochs']}]")
        print(f" Training Loss: {epoch_train_loss:.4f}, Training Accuracy: {epoch_train_acc:.4f}")
        print(f" Validation Loss: {val_loss.item():.4f}, Validation Accuracy: {val_accuracy:.4f}")
        # Report train loss, train accuracy, val loss, and val accuracy for tuning
        train.report({"train_loss": epoch_train_loss, "train_accuracy": epoch_train_acc,
                      "loss": val_loss.item(), "accuracy": val_accuracy})
        # Save checkpoint
        if checkpoint_dir is not None:
            checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save(model.state_dict(), checkpoint_path)
def evaluate_model(model, data, target):
    # Move data to the device
    data = data.to(device)
    target = target.to(device)
    # Forward pass
    outputs = model(data)
    _, predicted = torch.max(outputs.data, 1)
    # Compute accuracy
    accuracy = (predicted == target).float().mean().item()
    print(f"Accuracy: {accuracy}")
    return accuracy
config_space = {
    "hidden_layers": tune.randint(1, 6),
    "hidden_units": tune.randint(4, 129),
    "learning_rate": tune.loguniform(1e-5, 1e-1),
    "optimizer": tune.choice(["Adam", "SGD", "RMSprop", "Adadelta", "Adagrad", "Adamax", "NAdam", "RAdam"]),
    "activation": tune.choice(
        [nn.ReLU(), nn.Tanh(), nn.SELU(), nn.ELU(), nn.Softmax(), nn.LogSigmoid(), nn.LogSoftmax()]),
    "batch_size": tune.choice([16, 32, 64, 128, 256, 512, 1024]),
    "epochs": tune.choice(list(range(10, 351, 10)))
}
scheduler = ASHAScheduler(
    metric="loss",
    mode="min",
    max_t=100,  # Maximum number of epochs
    grace_period=1,
    reduction_factor=2
)
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)
reporter = CLIReporter(metric_columns=["loss", "accuracy", "training_iteration"])
checkpoint_dir = os.path.expanduser('~/Documents/reports')
parent_dir = os.path.dirname(checkpoint_dir)
os.makedirs(parent_dir, exist_ok=True)
shutil.rmtree(checkpoint_dir, ignore_errors=True)
# %%
if ray.is_initialized():
    ray.shutdown()
ray.init()
tuning = tune.run(
    lambda config: model_training(config, X_train, y_train, test_loader, checkpoint_dir=checkpoint_dir),
    config=config_space,
    num_samples=10,
    scheduler=scheduler,
    progress_reporter=reporter,
    local_dir=checkpoint_dir,
    keep_checkpoints_num=1,
    resources_per_trial={"cpu": 10}
)
best_config = tuning.get_best_config(metric="loss", mode="min")
best_loss = tuning.get_best_trial(metric="loss", mode="min").last_result["loss"]
best_accuracy = tuning.get_best_trial(metric="accuracy", mode="max").last_result["accuracy"]
Here is the output of my code.

Despite the error, I am able to get the best config out of the tuning run after training finishes:
best_config = tuning.get_best_config(metric="loss", mode="min")
best_config
Out[10]:
{'hidden_layers': 5,
'hidden_units': 9,
'learning_rate': 0.0013342350777003893,
'optimizer': 'NAdam',
'activation': Tanh(),
'batch_size': 512,
'epochs': 210}
I've checked the local_dir path passed to tune.run and it seems to be correct, and I don't see any errors in my code that could be stopping the trials prematurely. I'm not sure why the result.json and progress.csv files are not being created.
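For reference, a minimal check along these lines (just a sketch; the lambda_* trial-directory prefix is assumed from the trial name in the error message) lists which trial folders under my local_dir actually contain result.json or progress.csv:

import os

# Sanity-check sketch: walk the Tune results directory and report, for each
# trial folder (assumed to start with "lambda_"), whether result.json and
# progress.csv are present.
results_root = os.path.expanduser('~/Documents/reports')
for root, dirs, files in os.walk(results_root):
    if os.path.basename(root).startswith("lambda_"):
        print(root,
              "result.json" in files,
              "progress.csv" in files)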