I am trying to work with the ISIC 2017 dataset. I have the code from a paper and I am trying to make it run locally or on Colab (limited resources). On a proper server it works fine, even with fewer training images (originally there are 2000; on a server I tested with 1000 and it worked). Locally I tried with as few as ~20 images, but the RAM (8 GB) still collapses. Also, my local set-up has no CUDA device (yeah, it sucks), and on Colab, sadly, the RAM just explodes too.
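To see where the memory actually jumps, this is a minimal sketch of the kind of check one can drop between steps (assuming psutil is installed; the call sites are illustrative):

import os
import psutil

def log_rss(tag):
    """Print this process's resident memory in MiB."""
    rss = psutil.Process(os.getpid()).memory_info().rss
    print(f"[{tag}] RSS = {rss / 2**20:.0f} MiB")

# illustrative call sites:
# log_rss("after dataset init")
# log_rss("after first batch")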
I modified something from the original paper, so now I have the following dataset_isic.py, where the loading of images should be progressive (one image per __getitem__ call):
dataset_isic.py
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
import random
import torch
import os
import cv2
import albumentations as A
from albumentations.pytorch import ToTensorV2
import pandas as pd
class ISIC2017(Dataset):
    # mask = label; the CSV only lists the image file names
    def __init__(self, csv, imgs_path, labels_path, transform, training=True):
        self.transform = transform
        self.df = pd.read_csv(csv)
        if training:
            print("getting train image/mask paths")
        else:
            print("getting val image/mask paths")
        # only the paths are stored here; pixels are read lazily in __getitem__
        self.images = [os.path.join(imgs_path, i) for i in self.df['image_name']]
        self.masks = [os.path.join(labels_path, i.replace('.jpg', '_segmentation.png'))
                      for i in self.df['image_name']]

    def __getitem__(self, index):
        # exactly one image/mask pair is decoded per call
        img = cv2.cvtColor(cv2.imread(self.images[index]), cv2.COLOR_BGR2RGB)
        mask = cv2.imread(self.masks[index], cv2.IMREAD_GRAYSCALE)
        if img.shape[:2] != mask.shape[:2]:
            # nearest-neighbour keeps the mask values binary
            mask = cv2.resize(mask, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_NEAREST)
        if self.transform:
            augmented = self.transform(image=img, mask=mask)
            img = augmented['image']
            mask = augmented['mask']
        return img, mask

    def __len__(self):
        return len(self.images)
def for_train_transform():
    desired_size = 512
    train_transform = A.Compose([
        A.Resize(width=desired_size, height=desired_size),
        A.RandomRotate90(),
        A.Flip(p=0.5),
        A.ShiftScaleRotate(shift_limit=0, scale_limit=(-0.2, 0.1), rotate_limit=40, p=0.5),
        A.RandomBrightnessContrast(
            brightness_limit=0.5,
            contrast_limit=0.1,
            p=0.5
        ),
        A.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=100, val_shift_limit=80),
        A.GaussNoise(),
        A.OneOf([
            A.ElasticTransform(),
            A.GridDistortion(),
            A.OpticalDistortion(distort_limit=0.5, shift_limit=0)
        ]),
        A.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0
        ),
        ToTensorV2()], p=1.)
    return train_transform

test_transform = A.Compose([
    A.Resize(width=512, height=512),  # needed: stacking val images of different sizes into a batch fails
    A.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        max_pixel_value=255.0,
        p=1.0
    ),
    ToTensorV2()], p=1.)
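Just to double-check that the loading really is progressive, a quick sanity check (a minimal sketch; train_csv, train_imgs, and train_masks stand for the actual paths) builds the dataset and decodes a single pair:

# constructing the dataset should only build the path lists;
# each indexing operation decodes exactly one image/mask pair
ds = ISIC2017(train_csv, train_imgs, train_masks, for_train_transform())
print(len(ds))        # number of path pairs, no pixels in RAM yet
img, mask = ds[0]     # first pair decoded on demand
print(img.shape, mask.shape)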
And this, instead, is the training file. I will post only the train method, because everything works just fine before entering the actual training:
train.py
def train(model, save_name):
    model_savedir = args.checkpoint + save_name + '/'
    save_name = model_savedir + 'ckpt'
    print(model_savedir)
    if not os.path.exists(model_savedir):
        os.makedirs(model_savedir)
    train_ds = ISIC2017(train_csv, train_imgs, train_masks, train_transform)
    val_ds = ISIC2017(df_val, val_imgs, val_masks, test_transform, training=False)
    # train_ds = Mydataset(imgs_train, masks_train, train_transform)
    # val_ds = Mydataset(imgs_val, masks_val, test_transform)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)
    CosineLR = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-8)
    # num_workers=0 keeps everything in one process; each worker > 0 would
    # hold its own copy of the dataset object
    # train_dl = DataLoader(train_ds, shuffle=True, batch_size=args.batch_size, pin_memory=False,
    #                       num_workers=0, drop_last=True)
    # val_dl = DataLoader(val_ds, batch_size=args.batch_size, pin_memory=False, num_workers=0)
    train_dl = DataLoader(train_ds, shuffle=True, batch_size=16, pin_memory=False, num_workers=0, drop_last=True)
    val_dl = DataLoader(val_ds, batch_size=16, pin_memory=False, num_workers=0)
    best_acc = 0
    print("Start inside train function")
    with tqdm(total=epochs, ncols=60) as t:
        for epoch in range(epochs):
            epoch_loss, epoch_iou, epoch_val_loss, epoch_val_iou = \
                fit(epoch, epochs, model, train_dl, val_dl, device, criterion, optimizer, CosineLR)
            with open(model_savedir + 'log.txt', 'a') as f:
                f.write('epoch' + str(epoch) +
                        ' _train_loss' + str(epoch_loss) + ' _val_loss' + str(epoch_val_loss) +
                        ' _epoch_iou' + str(epoch_iou) + ' _val_iou' + str(epoch_val_iou) + '\n')
                if epoch_val_iou > best_acc:
                    f.write('\nhere\n')
                    best_model_wts = copy.deepcopy(model.state_dict())
                    best_acc = epoch_val_iou
                    torch.save(best_model_wts, save_name + '.pth')
            t.update(1)
    write_options(model_savedir, args, best_acc)
    print('Done!')
Now, what I want to achieve is to run locally even with just 50 or 70 training images. The validation set originally has 150 images, but I reduced it to 10; the reason, of course, is that I figured that with fewer images my device would not have any problem. Sadly, it does.
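If the loading really is per batch, the number of images in the CSV should barely matter; rough arithmetic for a single batch (my own back-of-the-envelope numbers, not from the paper):

# memory for ONE input batch, independent of dataset size
batch, channels, h, w = 16, 3, 512, 512
batch_mib = batch * channels * h * w * 4 / 2**20   # float32 = 4 bytes
print(f"input batch: {batch_mib:.0f} MiB")         # ~48 MiB

So the inputs themselves are cheap; what grows much faster are the intermediate activations of the network, which scale with batch size and resolution.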
Initially, this:
if training:
    self.df = pd.read_csv(csv)
    self.images, self.masks = imgs_path, labels_path
    print("getting images")
    self.images = [''.join([self.images, '/', i.replace('.jpg', '.jpg')]) for i in self.df['image_name']]
    self.masks = [''.join([self.masks, '/', i.replace('.jpg', '_segmentation.png')]) for i in self.df['image_name']]
else:
    print("taking val imgs and masks path")
    self.df = pd.read_csv(csv)
    self.images, self.masks = imgs_path, labels_path
    print("getting val")
    self.images = [''.join([self.images, '/', i]) for i in self.df['image_name']]
    self.masks = [''.join([self.masks, '/', i.replace('.jpg', '_segmentation.png')]) for i in self.df['image_name']]
and this:
img = cv2.imread(self.images[index])[:, :, ::-1]
mask = cv2.imread(self.masks[index], cv2.IMREAD_GRAYSCALE)
img = cv2.resize(img, (512, 512))
if img.shape[:2] != mask.shape[:2]:
    mask = cv2.resize(mask, (img.shape[1], img.shape[0]))
if self.transform:
    augmented = self.transform(image=img, mask=mask)
    img = augmented['image']
    mask = augmented['mask']
were both in the training file, and that took a huge amount of memory, since all the images were loaded at once. I am kinda stuck right now, so I would like to understand where my mistake is, and whether it is actually possible to run this code locally or only on a server.
The dataset has a different size for each image, which is why I resize. But I don't think that could be the problem.
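One thing that might matter here: the raw ISIC images are large, so each cv2.imread decodes the full-resolution file before any resize happens. A one-off preprocessing pass that writes 512x512 copies to disk (a sketch with made-up folder names) would keep the per-sample decode cost small:

import os
import cv2

src_dir, dst_dir = 'ISIC2017/train_images', 'ISIC2017/train_images_512'  # hypothetical paths
os.makedirs(dst_dir, exist_ok=True)
for name in os.listdir(src_dir):
    img = cv2.imread(os.path.join(src_dir, name))
    if img is None:   # skip non-image files
        continue
    cv2.imwrite(os.path.join(dst_dir, name),
                cv2.resize(img, (512, 512), interpolation=cv2.INTER_AREA))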
EDIT: Now I am starting to think it is a problem of the model. It has 2M parameters, and maybe they are too many for my local set-up. With a model that has 0.15M parameters, though, even if the RAM gets saturated, this is what I obtain (error screenshot attached).
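For scale, 2M float32 parameters are only about 8 MB of weights, so the parameters alone should not explain 8 GB; a quick check (a sketch, assuming model is the instantiated network) makes the point:

def param_mib(model):
    """Rough weight memory in MiB (float32 only, no gradients or optimizer state)."""
    return sum(p.numel() for p in model.parameters()) * 4 / 2**20

# ~7.6 MiB for 2M params; gradients add one more copy and AdamW's moment
# buffers two more, still nowhere near 8 GB. The usual culprit is the
# activations, which scale with batch size and input resolution.
print(param_mib(model))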