Ray Tune Warning: Failed to Fetch Metrics for Trials - FileNotFoundError

I'm training a deep neural network model in PyTorch and using Ray Tune for hyperparameter tuning. After training completes, I get the following error: FileNotFoundError: Could not fetch metrics for lambda_17db8_00000: both result.json and progress.csv were not found.

import os
import shutil

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import ray
from ray import tune, train
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler


# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set environment variables
os.environ["AIR_VERBOSITY"] = "2"
os.environ["RAY_AIR_NEW_OUTPUT"] = "1"
os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


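# data, target: feature matrix and label vector prepared earlier (loading code not shown)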
# Split dataset into train and test sets
X_train, X_val, y_train, y_val = train_test_split(
    data, target,
    test_size=0.2, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_val, y_val, test_size=0.5, random_state=42
)

# Fit the scaler on the training split only and reuse it for val/test to avoid leakage
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Convert to tensor
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)


# Define the DNN model
class DNNModel(nn.Module):
    def __init__(self, input_size, output_size, hidden_layers, hidden_units, activation):
        super(DNNModel, self).__init__()
        self.input_size = input_size
        self.hidden_layers = hidden_layers
        self.hidden_units = hidden_units
        self.output_size = output_size
        self.activation = activation

        self.fc_layers = nn.ModuleList()
        in_features = input_size

        for _ in range(hidden_layers):
            self.fc_layers.append(nn.Linear(in_features, hidden_units))
            self.fc_layers.append(activation)
            in_features = hidden_units

        self.fc_layers.append(nn.Linear(in_features, output_size))

    def forward(self, x):
        x = x.view(-1, self.input_size)
        for layer in self.fc_layers:
            x = layer(x)
        return x


def model_training(config, data, target, test_loader, checkpoint_dir=None):
    # Instantiate the model with the given hyperparameters
    model = DNNModel(input_size=data.shape[1], hidden_layers=config["hidden_layers"],
                     hidden_units=config["hidden_units"], output_size=len(torch.unique(target)),
                     activation=config["activation"])

    # Move the model to the device
    model.to(device)
    print("Device Type:{0}".format(device))
    # Print the GPU name
    if torch.cuda.is_available():
        print("Model training on GPU: {}".format(next(model.parameters()).is_cuda))
        print("GPU Name:", torch.cuda.get_device_name(device))
    else:
        print("No GPU available.")

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = getattr(optim, config["optimizer"])(model.parameters(), lr=config["learning_rate"])

    # Create data loaders
    train_dataset = TensorDataset(data, target)
    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)

    train_loss_history = []
    train_acc_history = []

    # Training loop
    for epoch in range(config["epochs"]):
        running_loss = 0.0
        correct = 0
        total = 0

        for batch in train_loader:
            inputs, labels = batch

            # Move data to the device
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Compute training accuracy
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Print statistics
            running_loss += loss.item()

        # Calculate average training loss and accuracy for the epoch
        epoch_train_loss = running_loss / len(train_loader)
        epoch_train_acc = correct / total

        # Save the history
        train_loss_history.append(epoch_train_loss)
        train_acc_history.append(epoch_train_acc)

        # Validation
        with torch.no_grad():
            val_outputs = model(test_loader.dataset.tensors[0].to(device))
            val_loss = criterion(val_outputs, test_loader.dataset.tensors[1].to(device))
            val_accuracy = (
                    val_outputs.argmax(dim=1) == test_loader.dataset.tensors[1].to(device)).float().mean().item()

        # Print epoch progress
        print(f"Epoch [{epoch + 1}/{config['epochs']}]")
        print(f"  Training Loss: {epoch_train_loss:.4f}, Training Accuracy: {epoch_train_acc:.4f}")
        print(f"  Validation Loss: {val_loss.item():.4f}, Validation Accuracy: {val_accuracy:.4f}")

        # Report train loss, train accuracy, val loss, and val accuracy for tuning
        train.report({"train_loss": epoch_train_loss, "train_accuracy": epoch_train_acc,
                      "loss": val_loss.item(), "accuracy": val_accuracy})

        # Save checkpoint
        if checkpoint_dir is not None:
            checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save(model.state_dict(), checkpoint_path)


def evaluate_model(model, data, target):
    # Move data to the device
    data = data.to(device)
    target = target.to(device)

    # Forward pass
    outputs = model(data)
    _, predicted = torch.max(outputs.data, 1)

    # Compute accuracy
    accuracy = (predicted == target).float().mean().item()
    print(f"Accuracy: {accuracy}")
    return accuracy


config_space = {
    "hidden_layers": tune.randint(1, 6),
    "hidden_units": tune.randint(4, 129),
    "learning_rate": tune.loguniform(1e-5, 1e-1),
    "optimizer": tune.choice(["Adam", "SGD", "RMSprop", "Adadelta", "Adagrad", "Adamax", "NAdam", "RAdam"]),
    "activation": tune.choice(
        [nn.ReLU(), nn.Tanh(), nn.SELU(), nn.ELU(), nn.Softmax(), nn.LogSigmoid(), nn.LogSoftmax()]),
    "batch_size": tune.choice([16, 32, 64, 128, 256, 512, 1024]),
    "epochs": tune.choice(list(range(10, 351, 10)))
}

scheduler = ASHAScheduler(
    metric="loss",
    mode="min",
    max_t=100,  # Maximum number of epochs
    grace_period=1,
    reduction_factor=2
)

test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)

reporter = CLIReporter(metric_columns=["loss", "accuracy", "training_iteration"])

checkpoint_dir = os.path.expanduser('~/Documents/reports')
parent_dir = os.path.dirname(checkpoint_dir)
os.makedirs(parent_dir, exist_ok=True)

shutil.rmtree(checkpoint_dir, ignore_errors=True)

# %%

if ray.is_initialized():
    ray.shutdown()
ray.init()

tuning = tune.run(
    lambda config: model_training(config, X_train, y_train, test_loader, checkpoint_dir=checkpoint_dir),
    config=config_space,
    num_samples=10,
    scheduler=scheduler,
    progress_reporter=reporter,
    local_dir=checkpoint_dir,
    keep_checkpoints_num=1,
    resources_per_trial={"cpu": 10}
)
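
# Note: because the trainable passed to tune.run is an anonymous lambda, Ray names the
# trials "lambda_<experiment>_<index>", which matches the trial in the error message
# (lambda_17db8_00000). Wrapping a named function with tune.with_parameters, e.g.
#   tune.with_parameters(model_training, data=X_train, target=y_train,
#                        test_loader=test_loader, checkpoint_dir=checkpoint_dir),
# is the more common way to pass large objects into a trainable; I have not confirmed
# whether that is related to the missing result.json/progress.csv.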

best_config = tuning.get_best_config(metric="loss", mode="min")
best_loss = tuning.get_best_trial(metric="loss", mode="min").last_result["loss"]
best_accuracy = tuning.get_best_trial(metric="accuracy", mode="max").last_result["accuracy"]

Here is the output of my code: [output screenshot]

I am also able to get the best config out of the tuning results after training finishes:

best_config = tuning.get_best_config(metric="loss", mode="min")
best_config
Out[10]: 
{'hidden_layers': 5,
 'hidden_units': 9,
 'learning_rate': 0.0013342350777003893,
 'optimizer': 'NAdam',
 'activation': Tanh(),
 'batch_size': 512,
 'epochs': 210}

I've checked the local_dir path passed to tune.run and it appears to be correct, and I don't see anything in my code that would stop the trials prematurely. I'm not sure why result.json and progress.csv are not being created.
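
For reference, this is roughly how I checked what actually gets written under that directory after the run (a minimal sketch; it just walks checkpoint_dir and prints the files in each trial folder, where result.json and progress.csv would normally live):

import os

checkpoint_dir = os.path.expanduser('~/Documents/reports')
for root, dirs, files in os.walk(checkpoint_dir):
    # each trial directory should normally contain result.json and progress.csv
    print(root, files)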
