Optimal autoencoder model for picture anomaly detection

18 Views Asked by At

I'm training an autoencoder to detect anomalies among pictures based on the decoder's reconstruction error. I tried out different kinds of image preprocessing, NN architectures, losses, activation functions, image normalisations, augmentations, etc. The optimal model seems a bit unintuitive to me because it uses ReLU and MSE. I'd expect a sigmoid on the last (decoder) layer and binary cross-entropy to win the competition. Can you advise whether my solution is alright, or how to adjust it to follow the standard for this kind of task? (I'm worried that I'm making some basic errors.)

# Loading the dataset
def load_and_preprocess_image(img_path, target_size=(256, 256)):
    """Load one image file and return it as a scaled, single-image batch.

    Args:
        img_path: Path of the image file, passed to ``image.load_img``.
        target_size: Size forwarded to ``image.load_img`` as ``target_size``.

    Returns:
        The array produced by ``image.img_to_array`` with a new leading axis
        of length 1, divided by 255.0 (for 8-bit pixel data this presumably
        maps values into [0, 1]).
    """
    pil_img = image.load_img(img_path, target_size=target_size)
    pixels = image.img_to_array(pil_img)
    # A leading axis lets the caller stack many results with np.vstack.
    batch = pixels[np.newaxis, ...]
    return batch / 255.0

image_directory = 'images'
# Build the allow-list once. The original re-created the list (and scanned it
# linearly) for every directory entry, which is quadratic in the file count.
# Assumes the StoragePath values are hashable (e.g. file-name strings).
allowed_names = set(task_images0.StoragePath.to_list())
image_paths = [
    os.path.join(image_directory, name)
    for name in os.listdir(image_directory)
    if name in allowed_names
]
img_size = 256
# Each loader call returns a batch of one image; vstack joins them along the
# leading axis into a single array.
images = np.vstack([
    load_and_preprocess_image(img_path, target_size=(img_size, img_size))
    for img_path in image_paths
])
# Model architecture
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

# L2 weight penalty shared by every regularised convolution below.
l2_reg = l2(0.01)
# Take height, width AND channel count from the data so that the network's
# input and output shapes always agree with the images being reconstructed.
img_height, img_width, channels = images[0].shape

input_img = Input(shape=(img_height, img_width, channels))

# Encoder: two conv + 2x2 max-pool stages (spatial size is divided by 4).
x = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_regularizer=l2_reg)(input_img)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Dropout(0.1)(x)
x = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_regularizer=l2_reg)(x)
encoded = MaxPooling2D((2, 2), padding='same')(x)

# Decoder: mirrors the encoder with two conv + 2x2 up-sampling stages.
# NOTE: the output only regains the input's spatial size when height and
# width are divisible by 4 (true for the 256x256 images used here).
x = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_regularizer=l2_reg)(encoded)
x = UpSampling2D((2, 2))(x)
x = Dropout(0.1)(x)
x = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_regularizer=l2_reg)(x)
x = UpSampling2D((2, 2))(x)
# The reconstruction must have as many channels as the input. The previous
# single-filter output was silently broadcast against the multi-channel
# targets in the loss and in the reconstruction-error computation.
decoded = Conv2D(channels, (3, 3), activation='relu', padding='same')(x)

# Model training.

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.backend import clear_session, set_value
# NOTE(review): this resets Keras' global state after the layers above have
# already been created; consider moving it before the architecture is
# defined -- confirm the intended ordering.
clear_session()

# Wrap the encoder/decoder graph into a trainable model.
autoencoder = Model(input_img, decoded)
autoencoder.compile(optimizer=Adam(0.005), loss='mse')

# Stop when val_loss has not improved for 10 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='min', restore_best_weights=True)

# Halve the learning rate when val_loss has not improved for 5 epochs.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=0, mode='min')

# Persist only the weights with the lowest validation loss seen so far.
model_checkpoint = ModelCheckpoint('best_outlier_model_waug_RGB256_b64.h5', monitor='val_loss', mode='min', save_best_only=True, verbose=0)

# Down-cast once to reduce the memory footprint of the training data.
X = images.astype(np.float16)

# Hold out 10% of the images for validation (fixed seed for reproducibility).
X_train, X_test = train_test_split(X, test_size=0.1, random_state=42)

# The splits are already float16; casting them again would only create
# additional full-size temporary copies, so just release the unsplit array.
del X

# Data Augmentation
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from PIL import Image
import random

def random_stretch(image, target_size=(256, 256), stretch_factor=1.1):
    """Stretch an image along one randomly chosen axis and resize it.

    The image is enlarged by ``stretch_factor`` along either its width or its
    height, centre-cropped back to its original size, and finally resized to
    ``target_size``. Without the crop, resizing the whole stretched image back
    to the target size would simply undo the stretch.

    Args:
        image: Array with values scaled to [0, 1]; it must have a layout that
            ``PIL.Image.fromarray`` accepts once converted to uint8.
        target_size: Size handed to ``PIL.Image.resize`` for the final
            resize. Pillow reads this as (width, height); the order only
            matters for non-square targets.
        stretch_factor: Multiplicative enlargement of the chosen axis.
            Values <= 1 leave nothing to crop, so only the resizes apply.

    Returns:
        The transformed image as an array scaled back to [0, 1].
    """
    # Clip before the uint8 cast so slightly out-of-range values (e.g. from
    # interpolation) saturate instead of wrapping around.
    pixels = np.clip(image * 255, 0, 255).astype('uint8')
    img = Image.fromarray(pixels)
    width, height = img.size

    if random.choice(['width', 'height']) == 'width':
        stretched_size = (int(width * stretch_factor), height)
    else:
        stretched_size = (width, int(height * stretch_factor))
    stretched_img = img.resize(stretched_size, Image.Resampling.LANCZOS)

    # Centre-crop to the original extent so the stretch survives the final
    # resize; the box never exceeds the stretched image.
    left = max((stretched_size[0] - width) // 2, 0)
    top = max((stretched_size[1] - height) // 2, 0)
    right = left + min(width, stretched_size[0])
    bottom = top + min(height, stretched_size[1])
    cropped_img = stretched_img.crop((left, top, right, bottom))

    resized_img = cropped_img.resize(target_size, Image.Resampling.LANCZOS)
    return np.array(resized_img) / 255.0  # back to an array scaled to [0, 1]

# Keyword arguments shared by the training and validation generators
# (for example, zca_whitening=True could be added here).
datagen_params = dict()

# Training generator with geometric augmentation.
# No `rescale` here: the images are already divided by 255 when loaded and
# random_stretch returns values in [0, 1]. Rescaling again would squash the
# training inputs into [0, 1/255] while validation inputs stay in [0, 1].
train_datagen = ImageDataGenerator(
    **datagen_params,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest',
    # Random one-axis stretch, then resize back to the network's input size.
    preprocessing_function=lambda x: random_stretch(x, target_size=(img_size, img_size)),
)

# Validation data is passed through unaugmented.
test_datagen = ImageDataGenerator(**datagen_params)


def _paired_batches(batch_iterator):
    """Yield ``(batch, batch)`` so each augmented input is its own target."""
    for batch in batch_iterator:
        yield batch, batch


# flow(X_train, X_train) would augment only the inputs and keep the original
# images as targets, asking the model to undo rotations/flips instead of
# reconstructing what it sees. Pair every augmented batch with itself.
train_batches = train_datagen.flow(X_train, batch_size=64)

autoencoder.fit(_paired_batches(train_batches),
                # A plain Python generator has no length, so tell Keras how
                # many batches make up one pass over the training set.
                steps_per_epoch=len(train_batches),
                epochs=100,
                validation_data=test_datagen.flow(X_test, X_test, batch_size=64),
                callbacks=[early_stopping, reduce_lr, model_checkpoint])

Then I run predictions on an external sample.

# Reconstruct every image of the external sample with the trained model.
reconstructed_images = autoencoder.predict(images_to_predict)

# Per-image score: mean absolute difference over axes 1, 2 and 3.
abs_residual = np.abs(images_to_predict - reconstructed_images)
errors = abs_residual.mean(axis=(1, 2, 3))

# Flag images whose error exceeds the 90th percentile of this sample's errors.
threshold = np.percentile(errors, 90)
anomalies = errors > threshold

The model works pretty well. When I switch to binary cross-entropy and sigmoid activation on the last layer, it performs poorly. I'm sorry that I'm asking such a broad question, but do you have any hints why my intuition is wrong?

I was expecting to achieve better performance with binary cross-entropy loss and a sigmoid activation on the last layer.

0

There are 0 best solutions below