I'm facing an issue when using the albumentations library in Python to do image augmentation on the fly, i.e. while training the model. I am training an object detection (OD) model and I use albumentations for the augmentations because it makes dealing with bounding boxes easy. My approach is always the same: whenever I find a source that trains an OD model, I look for the part where the data goes into the model and add a single step there that passes the data through my augmentation function, making sure the output of that function has exactly the same format as the data would have without it, and then I proceed to train the model. I tried this resource and added this function:
```python
def get_dataset(filenames, batch_size, architecture, data_type):
    AUTOTUNE = tf.data.AUTOTUNE
    if data_type == "train":
        dataset = (
            tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
            .map(parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
            .map(
                lambda image, bboxes, labels: tf.py_function(
                    augment_image,
                    inp=[image, bboxes, labels],
                    Tout=(tf.float32, tf.float32, tf.float32),
                ),
                num_parallel_calls=AUTOTUNE,
            )
            .batch(batch_size=batch_size)
            .shuffle(batch_size * 10)
            .map(LabelEncoder(architecture=architecture).encode_batch, num_parallel_calls=AUTOTUNE)
            .prefetch(AUTOTUNE)
        )
    else:
        dataset = (
            tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
            .map(parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
            .map(
                lambda image, bboxes, labels: tf.py_function(
                    augment_image,
                    inp=[image, bboxes, labels],
                    Tout=(tf.float32, tf.float32, tf.float32),
                ),
                num_parallel_calls=AUTOTUNE,
            )
            .batch(batch_size=batch_size)
            .shuffle(batch_size * 10)
            .map(LabelEncoder(architecture=architecture).encode_batch, num_parallel_calls=AUTOTUNE)
            .prefetch(AUTOTUNE)
        )
    return dataset
```
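For completeness, this is roughly how I build and consume the datasets. The file patterns and the architecture string below are placeholders, and `parse_tfrecord_fn` / `LabelEncoder` come from the linked resource:

```python
import tensorflow as tf

# Placeholder TFRecord patterns; the real ones point at my dataset.
train_filenames = tf.io.gfile.glob("tfrecords/train-*.tfrec")
val_filenames = tf.io.gfile.glob("tfrecords/val-*.tfrec")

train_ds = get_dataset(train_filenames, batch_size=8, architecture="retinanet", data_type="train")
val_ds = get_dataset(val_filenames, batch_size=8, architecture="retinanet", data_type="val")

# then: model.fit(train_ds, validation_data=val_ds)
```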
The `augment_image` function used in the pipeline above is:
```python
import albumentations as A
import cv2
import numpy as np


# augment image
def augment_image(image, bboxes, labels):
    # getting image metadata
    class_labels = labels
    # defining transformation object
    transform = A.Compose(
        [
            A.HorizontalFlip(p=0.5),
            A.HorizontalFlip(p=0.5),
            A.Resize(height=224, width=224, interpolation=cv2.INTER_AREA, p=1),
            A.RandomContrast(limit=0.2, p=0.2),
            A.RandomBrightness(limit=0.2, p=0.2),
        ],
        bbox_params=A.BboxParams(format='coco', label_fields=['class_labels']),
    )
    # applying transformation to image
    transformed = transform(image=image.numpy(), bboxes=bboxes.numpy().reshape(-1, 4), class_labels=class_labels.numpy())
    transformed_bboxes = np.array(transformed['bboxes'])
    transformed_class_labels = transformed['class_labels']
    # from x_min, y_min, width, height to x_center, y_center, width, height
    transformed_bboxes = np.column_stack((transformed_bboxes[..., :2] + transformed_bboxes[..., 2:] / 2, transformed_bboxes[..., 2:]))
    # image scaled to [0, 1]
    transformed_image = np.float32(transformed['image']) / 255
    return transformed_image, transformed_bboxes, transformed_class_labels
```
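To check that the function behaves sensibly outside the tf.data pipeline, I run it eagerly on a dummy sample, roughly like this (the shapes and values here are made up just for the check):

```python
import numpy as np
import tensorflow as tf

# Dummy 480x640 RGB image with one COCO-format box [x_min, y_min, width, height].
dummy_image = tf.constant(np.random.randint(0, 255, (480, 640, 3)), dtype=tf.uint8)
dummy_bboxes = tf.constant([[50.0, 60.0, 100.0, 120.0]])
dummy_labels = tf.constant([1.0])

img, boxes, labels = augment_image(dummy_image, dummy_bboxes, dummy_labels)
print(img.shape, img.dtype)  # (224, 224, 3) float32, scaled to [0, 1]
print(boxes)                 # [x_center, y_center, width, height] after the conversion above
print(labels)
```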
With these augmentations enabled, the model gets 0 mAP, but it performs well without them.
I also tried DETR from this source, where I changed the CocoDetection class like this:
```python
import copy

import albumentations as A
import cv2
import numpy as np
import torchvision
from PIL import Image


class CocoDetection(torchvision.datasets.CocoDetection):
    def __init__(self, img_folder, train_json_path, test_json_path, feature_extractor, train=True):
        self.train = train
        if self.train:
            ann_file = train_json_path
        else:
            ann_file = test_json_path
        super(CocoDetection, self).__init__(img_folder, ann_file)
        self.feature_extractor = feature_extractor
        # Define an augmentation pipeline
        self.transform = A.Compose(
            [
                A.Resize(height=800, width=800, interpolation=cv2.INTER_AREA, p=1),
                A.VerticalFlip(p=0.2),
                A.HorizontalFlip(p=0.2),
                A.RandomBrightnessContrast(p=0.2),
            ],
            bbox_params=A.BboxParams(format='coco', label_fields=['class_labels']),
        )

    def __getitem__(self, idx):
        img, target = super(CocoDetection, self).__getitem__(idx)
        # Apply transformations
        img, target = self.augment_image(img, target)
        # preprocess image and target (converting target to DETR format,
        # resizing + normalization of both image and target)
        image_id = self.ids[idx]
        target = {'image_id': image_id, 'annotations': target}
        encoding = self.feature_extractor(images=img, annotations=target, return_tensors="pt")
        pixel_values = encoding["pixel_values"].squeeze()
        target = encoding["labels"][0]
        return pixel_values, target

    def augment_image(self, image, target):
        # copy to avoid overwriting the original annotations
        transformed_target = copy.deepcopy(target)
        new_image = np.array(copy.copy(image))
        # processing
        assert len(transformed_target) == 1, "One annotation per image"
        bboxes = [obj['bbox'] for obj in transformed_target]
        class_labels = [obj['category_id'] for obj in transformed_target]
        # applying transformation to image
        transformed = self.transform(image=new_image, bboxes=bboxes, class_labels=class_labels)
        transformed_image = Image.fromarray(transformed['image'])
        transformed_bboxes = np.array(transformed['bboxes'])
        transformed_target[0]['area'] = np.round(transformed_bboxes[0][2] * transformed_bboxes[0][3]).astype(int)
        transformed_target[0]['bbox'] = np.round(transformed_bboxes[0]).astype(int).tolist()
        return transformed_image, transformed_target
```
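For reference, this is roughly how I plug the dataset into a DataLoader. The paths are placeholders, and I simply stack `pixel_values` in the collate function instead of using the notebook's padding collate, assuming the feature extractor keeps the 800x800 size produced by the Albumentations resize:

```python
import torch
from torch.utils.data import DataLoader
from transformers import DetrFeatureExtractor

feature_extractor = DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50")

# Placeholder paths for the image folder and the COCO-style JSON annotation files.
train_dataset = CocoDetection(
    img_folder="images/",
    train_json_path="annotations/train.json",
    test_json_path="annotations/test.json",
    feature_extractor=feature_extractor,
    train=True,
)

def collate_fn(batch):
    # All images should be the same size after the 800x800 resize, so stacking works here;
    # the linked notebook pads variable-size images instead.
    pixel_values = torch.stack([item[0] for item in batch])
    labels = [item[1] for item in batch]
    return {"pixel_values": pixel_values, "labels": labels}

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
```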
With this setup I faced the same issue.
I really don't understand what's happening. In both cases I tried simplifying the augmentation as much as possible, all the way down to a simple resize, and still the same thing. Please note that in both cases I am also augmenting the validation set, which to my understanding is not wrong practice, and I am making sure that my bounding boxes are correct after the transformations.
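The bounding-box check itself is nothing fancy, roughly the sketch below (the helper name and output path are mine, just for debugging):

```python
import cv2
import numpy as np

def draw_coco_boxes(image, bboxes, out_path="check.png"):
    # bboxes are COCO-format [x_min, y_min, width, height]
    canvas = np.array(image).copy()
    for x, y, w, h in bboxes:
        cv2.rectangle(canvas, (int(x), int(y)), (int(x + w), int(y + h)), (0, 255, 0), 2)
    cv2.imwrite(out_path, cv2.cvtColor(canvas, cv2.COLOR_RGB2BGR))

# e.g. for one DETR sample, before the feature extractor runs:
# img, target = dataset.augment_image(*torchvision.datasets.CocoDetection.__getitem__(dataset, 0))
# draw_coco_boxes(img, [obj["bbox"] for obj in target], "after_augmentation.png")
```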