The expected input shape for the discriminator model is (None, 299, 299, 3), but the shape actually found is (None, 152, 152, 3). I have tried resizing the dataset and changing the generator model's parameters, but nothing seems to work.

Resizing the Flickr8k dataset made no difference, and I couldn't work out which generator parameters to tweak to get the right output size.
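
As far as I can tell, the three stride-2 Conv2DTranspose layers upsample 19 -> 38 -> 76 -> 152, which matches the (None, 152, 152, 3) in the error. Here is a minimal sketch (same layers as in the full code below) that prints the generator's actual output shape:

from keras.models import Sequential
from keras.layers import Dense, Reshape, Conv2DTranspose, Conv2D

g = Sequential([
    Dense(128 * 19 * 19, input_dim=100),
    Reshape((19, 19, 128)),
    Conv2DTranspose(256, (5, 5), strides=(2, 2), padding='same'),  # 19 -> 38
    Conv2DTranspose(128, (5, 5), strides=(2, 2), padding='same'),  # 38 -> 76
    Conv2DTranspose(64, (5, 5), strides=(2, 2), padding='same'),   # 76 -> 152
    Conv2D(3, (5, 5), activation='tanh', padding='same'),
])
print(g.output_shape)  # (None, 152, 152, 3)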

Here's my full project (run from PyCharm):

import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Input, Dense, Reshape, Embedding, Concatenate, Flatten, Conv2DTranspose, Conv2D, LeakyReLU, Dropout
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras_preprocessing.image import load_img, img_to_array
from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
import os

# Load caption data
def load_caption_data(file_path):
    captions = {}
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split(',')
            if len(parts) >= 2:
                image_id, caption = parts[0], ','.join(parts[1:])
                captions.setdefault(image_id, []).append(caption)
            else:
                print("Skipping line:", line)
    return captions
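
# For reference, lines in captions.txt are assumed to follow the Flickr8k format:
#   1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs .
# i.e. an image filename, a comma, then the caption (which may itself contain
# commas, hence the ','.join above).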

# Load image data
def load_image_data(image_dir):
    images = {}
    for filename in os.listdir(image_dir):
        image_id = filename.split('.')[0]
        img = load_img(os.path.join(image_dir, filename), target_size=(299, 299))
        img = img_to_array(img)
        images[image_id] = img
    return images

# Preprocess text data
def preprocess_text_data(captions):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(captions)
    vocab_size = len(tokenizer.word_index) + 1
    max_length = max(len(description.split()) for description in captions)
    sequences = tokenizer.texts_to_sequences(captions)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
    return padded_sequences, vocab_size, max_length
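
# Toy sanity check for this step: preprocess_text_data(["a dog runs", "a dog jumps high"])
# gives vocab_size == 6 (5 unique words + 1), max_length == 4, and post-padded
# sequences of shape (2, 4), e.g. [[1, 2, 3, 0], [1, 2, 4, 5]].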

# Preprocess image data
def preprocess_image_data(image_data):
    processed_images = {}
    for image_id, img in image_data.items():
        # Images already come out of load_img at (299, 299), so this resize is a no-op safeguard
        img_resized = cv2.resize(img, (299, 299))
        # InceptionV3's preprocess_input scales pixels to [-1, 1], which matches the generator's tanh output range
        img_processed = preprocess_input(img_resized)
        processed_images[image_id] = img_processed
    return processed_images

# Define generator model (note: embedding_dim and max_length are accepted but not
# used yet; the captions are never wired into the generator)
def build_generator(latent_dim, embedding_dim, max_length):
    generator = Sequential([
        Dense(128 * 19 * 19, input_dim=latent_dim),  # project the noise to a (19, 19, 128) feature map
        Reshape((19, 19, 128)),
        Conv2DTranspose(256, (5, 5), strides=(2, 2), padding='same'),
        Conv2DTranspose(128, (5, 5), strides=(2, 2), padding='same'),
        Conv2DTranspose(64, (5, 5), strides=(2, 2), padding='same'),
        Conv2D(3, (5, 5), activation='tanh', padding='same')  # actual output is (152, 152, 3), not (299, 299, 3)
    ])
    ])
    return generator
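
# One variant that would make the generator end at (299, 299, 3): append a resize
# layer after the final Conv2D, e.g. keras.layers.Resizing(299, 299) (sketch only;
# assumes a Keras version that ships layers.Resizing). I'm not sure whether that is
# the right fix or whether the Conv2DTranspose parameters should change instead.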

# Resize generated images to (299, 299) (note: defined but never called below)
def resize_generated_images(images):
    resized_images = {}
    for image_id, img in images.items():
        resized_img = cv2.resize(img, (299, 299))
        resized_images[image_id] = resized_img.astype(np.float32)  # Ensure data type consistency
    return resized_images

# Define discriminator model
def build_discriminator(img_shape):
    discriminator = Sequential([
        Conv2D(64, (3,3), strides=(2,2), padding='same', input_shape=img_shape),
        LeakyReLU(alpha=0.2),
        Dropout(0.4),
        Conv2D(128, (3,3), strides=(2,2), padding='same'),
        LeakyReLU(alpha=0.2),
        Dropout(0.4),
        Flatten(),
        Dense(1, activation='sigmoid')
    ])
    return discriminator
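
# Isolated shape check that reproduces the reported error (sketch):
#   d = build_discriminator((299, 299, 3))
#   d(np.zeros((1, 152, 152, 3)))  # ValueError: incompatible input shape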

# Load caption data
caption_data = load_caption_data('Flickr8k_text/captions.txt')

# Preprocess caption data
captions = [caption for img_captions in caption_data.values() for caption in img_captions]
padded_sequences, vocab_size, max_length = preprocess_text_data(captions)

# Load image data
image_data = load_image_data('Flickr8k_Dataset')
image_data = preprocess_image_data(image_data)

# Define hyperparameters
latent_dim = 100
embedding_dim = 50
img_shape = (299, 299, 3)
batch_size = 64
epochs = 1000

# Build and compile models
generator = build_generator(latent_dim, embedding_dim, max_length)
discriminator = build_discriminator(img_shape)

optimizer = Adam(learning_rate=0.0002, beta_1=0.5)
discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Build and compile the combined model (generator + discriminator)
discriminator.trainable = False
z = Input(shape=(latent_dim,))
img = generator(z)
validity = discriminator(img)
combined = Model(z, validity)
combined.compile(loss='binary_crossentropy', optimizer=optimizer)
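
# Note: `validity = discriminator(img)` above is the line that raises the reported
# error: `img` leaves the generator as (None, 152, 152, 3) while the discriminator
# was built with input_shape=(299, 299, 3).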

# Define callback for saving the model (it monitors 'val_loss', which this manual
# training loop never produces, so with save_best_only=True it will warn and skip saving)
model_checkpoint = ModelCheckpoint('generator_model.keras', monitor='val_loss', verbose=1, save_best_only=True)
model_checkpoint.set_model(generator)

# Define callback for early stopping (currently unused by the manual loop)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

# Define callback for logging training progress (CSVLogger opens its file in
# on_train_begin, so call that once before driving it by hand in the loop)
csv_logger = CSVLogger('training.log')
csv_logger.set_model(generator)
csv_logger.on_train_begin()

# Training loop
image_ids = list(image_data.keys())  # image_data is keyed by image id strings, so sample via this list
for epoch in range(epochs):
    # Train discriminator on a random batch of real images
    idx = np.random.randint(0, len(image_ids), batch_size)
    imgs = np.array([image_data[image_ids[i]] for i in idx])
    noise = np.random.normal(0, 1, (batch_size, latent_dim))
    gen_imgs = generator.predict(noise)

    d_loss_real = discriminator.train_on_batch(imgs, np.ones((batch_size, 1)))
    d_loss_fake = discriminator.train_on_batch(gen_imgs, np.zeros((batch_size, 1)))
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

    # Train generator
    noise = np.random.normal(0, 1, (batch_size, latent_dim))
    valid_y = np.array([1] * batch_size)
    g_loss = combined.train_on_batch(noise, valid_y)

    # Print progress
    print(f"Epoch {epoch+1}/{epochs}, D Loss: {d_loss[0]}, G Loss: {g_loss}")

    # Save model and log progress
    model_checkpoint.on_epoch_end(epoch, logs={'D_loss': d_loss[0], 'G_loss': g_loss})
    csv_logger.on_epoch_end(epoch, logs={'D_loss': d_loss[0], 'G_loss': g_loss})

generator.save('final_generator_model.keras')