DeblurGAN can't load its own weights anymore

210 Views Asked by At

Hey, I really need some help =)

Firstly, sorry that it's so long^^ but I hope that you won't need the full code at the end.

I coded a GAN for deblurring and am now training it. The first 71 epochs were trained without any problems: I trained for some epochs until the Colab GPU time limit was reached, and the next day I loaded my weights into the GAN and continued training. Two or three weeks ago I wanted to load the weights of epoch 71 into my GAN, but I received the following error (I'm quite sure that I didn't change anything in the code). Since then I can only load the weights of the first 65 epochs, and I get the same error for every epoch higher than 65:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-16-a35c9a2bbf3a> in <module>()
      1 # Load weights
----> 2 gan.load_weights(F"/content/gdrive/My Drive/Colab Notebooks/data/deblurGAN_weights66_batchsize_1.h5")

5 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in load_weights(self, filepath, by_name, skip_mismatch, options)
   2209             f, self.layers, skip_mismatch=skip_mismatch)
   2210       else:
-> 2211         hdf5_format.load_weights_from_hdf5_group(f, self.layers)
   2212 
   2213   def _updated_config(self):

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/saving/hdf5_format.py in load_weights_from_hdf5_group(f, layers)
    706                        str(len(weight_values)) + ' elements.')
    707     weight_value_tuples += zip(symbolic_weights, weight_values)
--> 708   K.batch_set_value(weight_value_tuples)
    709 
    710 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/dispatch.py in wrapper(*args, **kwargs)
    199     """Call target, and fall back on dispatchers if there is a TypeError."""
    200     try:
--> 201       return target(*args, **kwargs)
    202     except (TypeError, ValueError):
    203       # Note: convert_to_eager_tensor currently raises a ValueError, not a

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py in batch_set_value(tuples)
   3574   if ops.executing_eagerly_outside_functions():
   3575     for x, value in tuples:
-> 3576       x.assign(np.asarray(value, dtype=dtype(x)))
   3577   else:
   3578     with get_graph().as_default():

/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/resource_variable_ops.py in assign(self, value, use_locking, name, read_value)
    856     with _handle_graph(self.handle):
    857       value_tensor = ops.convert_to_tensor(value, dtype=self.dtype)
--> 858       self._shape.assert_is_compatible_with(value_tensor.shape)
    859       assign_op = gen_resource_variable_ops.assign_variable_op(
    860           self.handle, value_tensor, name=name)

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/tensor_shape.py in assert_is_compatible_with(self, other)
   1132     """
   1133     if not self.is_compatible_with(other):
-> 1134       raise ValueError("Shapes %s and %s are incompatible" % (self, other))
   1135 
   1136   def most_specific_compatible_shape(self, other):

ValueError: Shapes (4, 4, 64, 128) and (64,) are incompatible

I looked for a solution for a long time and didn't find a real one. But I found out that if I train one epoch starting from one of the old weights (1-65), I can afterwards load one of the new weights. So I thought that I could use this "workaround", but yesterday I plotted the metric scores on the test dataset for every epoch and received this picture: psnrscore/epoch. As you can see, it looks like I have been producing trash since epoch 65 (on the picture since 60, because I lost the first 5 epochs, so it starts at 6).

I'm really frustrated and hope that someone can help me =D

Here's the full code of the GAN:

# Libraries to build the model

from tensorflow import pad
from tensorflow.keras.layers import Layer
from keras.layers import Input, Activation, Add, UpSampling2D
from keras.layers.merge import Add
from keras.layers.core import Dropout, Dense, Flatten
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.convolutional import Conv2D, Conv2DTranspose
from keras.layers.core import Lambda
from keras.layers.normalization import BatchNormalization
from keras.models import Model
import keras.backend as K
from keras.applications.vgg16 import VGG16
from keras.optimizers import Adam
import keras

# Reflection padding

from keras.engine import InputSpec
import tensorflow as tf
from keras.engine.topology import Layer

class ReflectionPadding2D(Layer):
    """2D reflection padding for "channels_last" 4D tensors.

    Attributes:
      - padding: (padding_width, padding_height) tuple. `call` pads axis 1
        with padding_height and axis 2 with padding_width on both sides.
    """

    def __init__(self, padding=(1, 1), **kwargs):
        self.padding = tuple(padding)
        self.input_spec = [InputSpec(ndim=4)]
        super(ReflectionPadding2D, self).__init__(**kwargs)

    def compute_output_shape(self, s):
        """Return the padded shape for the "channels_last" configuration.

        Uses the same axis/padding pairing as `call`: axis 1 grows by
        2 * padding_height and axis 2 by 2 * padding_width. Unknown (None)
        dimensions stay unknown.
        """
        w_pad, h_pad = self.padding

        def _grow(dim, pad):
            # An unknown dimension cannot be added to; keep it unknown.
            return None if dim is None else dim + 2 * pad

        return (s[0], _grow(s[1], h_pad), _grow(s[2], w_pad), s[3])

    def call(self, x, mask=None):
        """Reflect-pad axes 1 and 2 of `x`; axes 0 and 3 are left untouched."""
        w_pad, h_pad = self.padding
        return tf.pad(x, [[0, 0], [h_pad, h_pad], [w_pad, w_pad], [0, 0]], 'REFLECT')

    def get_config(self):
        """Include `padding` in the layer config so the layer can be re-created."""
        config = super(ReflectionPadding2D, self).get_config()
        config['padding'] = self.padding
        return config

# Res Block
def res_block(input, filters, kernel_size = (3,3), strides = (1,1), use_dropout = False):
    """
    Append a residual block to `input` using the Keras functional API.

    The residual branch is: reflection padding -> Conv2D -> BatchNormalization
    -> ReLU -> (optional Dropout(0.5)) -> reflection padding -> Conv2D ->
    BatchNormalization. Its result is added to `input`.

    The reflection padding is derived from `kernel_size` so that, for odd
    kernel sizes and unit strides, each unpadded convolution keeps the spatial
    size and the final addition is shape-compatible. `filters` must equal the
    channel count of `input` for the addition to work.

    :param input: Input tensor
    :param filters: Number of filters to use
    :param kernel_size: Shape of the kernel for the convolution (int or 2-tuple)
    :param strides: Shape of the strides for the convolution
    :param use_dropout: Boolean value to determine the use of dropout
    :return: Output tensor (input plus the residual branch)
    """
    if isinstance(kernel_size, int):
        kernel_size = (kernel_size, kernel_size)

    # Conv2D kernel_size is (height, width) while ReflectionPadding2D expects
    # (padding_width, padding_height); the default 3x3 kernel gives (1, 1).
    padding = (kernel_size[1] // 2, kernel_size[0] // 2)

    x = ReflectionPadding2D(padding)(input)
    x = Conv2D(filters = filters,
               kernel_size = kernel_size,
               strides = strides,)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)

    if use_dropout:
        x = Dropout(0.5)(x)

    x = ReflectionPadding2D(padding)(x)
    x = Conv2D(filters = filters,
               kernel_size = kernel_size,
               strides = strides,)(x)
    x = BatchNormalization()(x)

    # Two convolution layers followed by a direct connection between input and output (skip connection)
    out = Add()([input, x])
    return out

# Generator

n_res_blocks = 9


def generator_model():
    """Build the generator as a Keras functional model named 'Generator'.

    Layout: a 7x7 convolution stem, two stride-2 convolutions (128 then 256
    filters), `n_res_blocks` residual blocks with dropout, two
    upsample+convolution stages (128 then 64 filters), a 7x7 convolution to
    3 channels with tanh, and finally a skip connection that adds the network
    input and halves the sum.

    Reads the module-level `img_shape` for the input shape.
    """
    inputs = Input(shape=img_shape)

    # Encoder stem: reflection padding lets the 'valid' 7x7 conv keep the size.
    x = ReflectionPadding2D((3, 3))(inputs)
    x = Conv2D(filters=64, kernel_size=(7, 7), padding='valid')(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)

    # Encoder: two stride-2 convolutions.
    for n_filters in (128, 256):
        x = Conv2D(n_filters, (3, 3), strides=2, padding='same')(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)

    # Residual blocks.
    for _ in range(n_res_blocks):
        x = res_block(x, 256, use_dropout=True)

    # Decoder: upsampling followed by a convolution, twice.
    for n_filters in (128, 64):
        x = UpSampling2D()(x)
        x = Conv2D(filters=n_filters, kernel_size=(3, 3), padding='same')(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)

    # Output head: back to 3 channels with tanh.
    x = ReflectionPadding2D((3, 3))(x)
    x = Conv2D(filters=3, kernel_size=(7, 7), padding='valid')(x)
    x = Activation('tanh')(x)

    # Skip connection from input to output; the sum is halved
    # (original note: "to keep normalized outputs").
    summed = Add()([x, inputs])
    outputs = Lambda(lambda z: z/2)(summed)

    return Model(inputs=inputs, outputs=outputs, name='Generator')

# Discriminator

def discriminator_model():
    """Build the discriminator as a Keras functional model named 'discriminator'.

    Layout: a stride-2 4x4 convolution with 64 filters, three stride-2 4x4
    convolutions with 64, 128 and 256 filters, a stride-1 4x4 convolution with
    512 filters, a stride-1 4x4 convolution with 1 filter, then Flatten,
    Dense(1024, tanh) and Dense(1, sigmoid). All but the first and last
    convolutions are followed by BatchNormalization; LeakyReLU(0.2) follows
    every convolution except the last.

    Reads the module-level `img_shape` for the input shape.
    """
    base_filters = 64
    image = Input(shape=img_shape)

    x = Conv2D(filters=base_filters, kernel_size=(4, 4), strides=2, padding='same')(image)
    x = LeakyReLU(0.2)(x)

    # Three strided stages with filter multipliers 1, 2 and 4.
    for multiplier in (1, 2, 4):
        x = Conv2D(filters=base_filters * multiplier, kernel_size=(4, 4),
                   strides=2, padding='same')(x)
        x = BatchNormalization()(x)
        x = LeakyReLU(0.2)(x)

    # One unstrided stage with multiplier 8.
    x = Conv2D(filters=base_filters * 8, kernel_size=(4, 4), strides=1, padding='same')(x)
    x = BatchNormalization()(x)
    x = LeakyReLU(0.2)(x)

    x = Conv2D(filters=1, kernel_size=(4, 4), strides=1, padding='same')(x)

    # Dense head producing a single sigmoid score per image.
    x = Flatten()(x)
    x = Dense(1024, activation='tanh')(x)
    score = Dense(1, activation='sigmoid')(x)

    return Model(inputs=Input_img_alias(image), outputs=score, name='discriminator') if False else Model(inputs=image, outputs=score, name='discriminator')

def gan_model(generator, discriminator):
    """Chain generator and discriminator into one two-output Keras model.

    The returned model maps an input of shape `img_shape` to
    [generated image, discriminator output for that image].
    """
    blurred = Input(shape=img_shape)
    restored = generator(blurred)
    verdict = discriminator(restored)
    return Model(inputs=blurred, outputs=[restored, verdict])

#Losses

#Wassersteinloss:
def wasserstein_loss(y_true, y_pred):
    """Return the mean of the element-wise product of `y_true` and `y_pred`."""
    signed_scores = y_true * y_pred
    return K.mean(signed_scores)


# vgg16 model for perceptual loss
# ImageNet-pretrained VGG16 without its top layers, sized for `img_shape`.
vgg = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=img_shape,
)
# Feature extractor that stops at the 'block3_conv3' layer.
loss_model = Model(
    inputs=vgg.input,
    outputs=vgg.get_layer('block3_conv3').output,
)
# Marked as non-trainable.
loss_model.trainable = False

#perceptual loss:
def perceptual_loss(y_true, y_pred):
  """Return the mean squared difference between `loss_model` features of both inputs."""
  feature_gap = loss_model(y_true) - loss_model(y_pred)
  return K.mean(K.square(feature_gap))

#Metrics:

#SSIM:
def ssim_metric(y_true, y_pred):
  """Return the mean `tf.image.ssim` of the two inputs, computed with max_val=1.0."""
  reference = tf.convert_to_tensor(y_true)
  candidate = tf.convert_to_tensor(y_pred)
  per_image = tf.image.ssim(reference, candidate, max_val=1.0)
  return tf.reduce_mean(per_image)

#PSNR:
def psnr_metric(y_true, y_pred):
  """Return the mean `tf.image.psnr` of the two inputs, computed with max_val=1.0."""
  per_image = tf.image.psnr(y_true, y_pred, max_val=1.0)
  return tf.reduce_mean(per_image)

def training(epochs, batch_size, start_epoch=66):
    """Train the discriminator and the combined GAN, saving weights every epoch.

    For every batch the discriminator is trained 5 times on the generated and
    original images (labels -1 and +1), then the combined model is trained
    once with the discriminator frozen. Roughly once per percent of an epoch
    PSNR/SSIM of the current batch are appended to `psnrs`/`ssims` and both
    lists are pickled to Google Drive.

    Uses the module-level `X_train`, `Y_train`, `generator`, `discriminator`
    and `gan`.

    :param epochs: Number of epochs to run
    :param batch_size: Number of images per batch
    :param start_epoch: Epoch number printed for the first epoch (only affects
        the progress message)
    :return: [GAN_losses, psnrs, ssims]
    """
    path_psnr = F"/content/gdrive/My Drive/Colab Notebooks/data/psnr"
    path_ssim = F"/content/gdrive/My Drive/Colab Notebooks/data/ssim"

    GAN_losses = []
    # NOTE(review): `psnrs` and `ssims` are not created here (their local
    # initialisation was commented out in the original), so they must already
    # exist at module level -- presumably to keep the history of earlier
    # sessions; confirm, otherwise the first append raises NameError.

    random_idx = np.arange(0, X_train.shape[0])
    n_batches = int(len(random_idx) / batch_size)  # divide training set into batches of batch_size

    # Discriminator targets: -1 for generated images, +1 for original images.
    valid0 = -np.ones(batch_size)
    valid1 = np.ones(batch_size)
    valid = np.concatenate((valid0, valid1))

    for e in range(epochs):
        weights_name = "deblurGAN_weights_test.h5"

        print("epoch: %s " %(e + start_epoch))

        # Randomize the order of the training samples for this epoch.
        np.random.shuffle(random_idx)

        for i in range(n_batches):
            # Pick the batch through the shuffled indices; the original
            # shuffled them but then sliced the data sequentially.
            batch_idx = random_idx[i*batch_size:(i+1)*batch_size]
            img_batch_blured = X_train[batch_idx]
            img_batch_generated = generator.predict(img_batch_blured)
            img_batch_original = Y_train[batch_idx]
            img_batch = np.concatenate((img_batch_generated, img_batch_original), 0)

            discriminator.trainable = True
            for _ in range(5):
                discriminator.train_on_batch(img_batch, valid)
            discriminator.trainable = False

            GAN_loss = gan.train_on_batch(img_batch_blured, [img_batch_original, valid1])
            GAN_losses.append(GAN_loss)

            if (100*i/n_batches).is_integer():
                psnr = psnr_metric(img_batch_original, img_batch_generated)
                ssim = ssim_metric(img_batch_original, img_batch_generated)
                psnrs.append(psnr)
                ssims.append(ssim)

                # Persist the metric history to Google Drive; `with` makes sure
                # the files are flushed and closed.
                with open(path_psnr, "wb") as psnr_file:
                    pickle.dump(psnrs, psnr_file)
                with open(path_ssim, "wb") as ssim_file:
                    pickle.dump(ssims, ssim_file)
                print((100*i/n_batches) + 1, "% psnr: ", psnr," ssim: ", ssim)

        # The order in which the nested discriminator's weights are written
        # depends on its `trainable` flag. The script loads weights while the
        # flag is True, so restore that state before saving; saving with the
        # flag still False produces a file that fails to load with
        # "Shapes ... are incompatible".
        discriminator.trainable = True

        # Save weights: change the path to your directory
        gan.save_weights(F"/content/gdrive/My Drive/Colab Notebooks/data/{weights_name}")

    return [GAN_losses, psnrs, ssims]

# Initialize models
# Build the generator, the discriminator and the combined model that chains them.
generator = generator_model()
discriminator = discriminator_model()
gan = gan_model(generator, discriminator)

# Initialize optimizers (one Adam instance per compiled model, same settings)
d_opt = Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
gan_opt = Adam(lr=1E-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

# Compile models
# The discriminator is compiled while flagged trainable ...
discriminator.trainable = True
discriminator.compile(optimizer = d_opt, loss = wasserstein_loss)
# ... and flagged non-trainable while the combined model is compiled.
discriminator.trainable = False
# One loss per output of `gan`: perceptual loss for the generated image
# (weight 100) and Wasserstein loss for the discriminator output (weight 1).
loss = [perceptual_loss, wasserstein_loss]
loss_weights = [100, 1]
gan.compile(optimizer = gan_opt, loss = loss, loss_weights = loss_weights)
# The flag is switched back to True before the summary and the weight loading below.
discriminator.trainable = True
gan.summary()

# Load weights
# NOTE(review): the weights are loaded while discriminator.trainable is True.
# Presumably the checkpoint has to be written with the same flag value, since
# the flag can change the order of the stored weights -- confirm against the
# state in which save_weights is called.
gan.load_weights(F"/content/gdrive/My Drive/Colab Notebooks/data/deblurGAN_weights66_batchsize_1.h5")

#connect to GPU
# Abort unless TensorFlow reports a GPU as '/device:GPU:0'.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

# Note: this rebinds `loss` (previously the list of loss functions) to the
# list returned by training().
loss = training(1, 1) #epochs, batchsize
1

There are 1 best solutions below

0
daysi duck On

It is solved and can be closed. I didn't know that the "discriminator.trainable = True/False" was changed. It seems to be the reason for a different ordering of the weights.