How do I make my data generator use less RAM when training my AI?


I am currently using Kaggle's notebook environment, so I am limited to 13 GB of RAM. This was working with a small dataset, but my dataset is now over a gigabyte. I do have a data generator in place, but it no longer helps: RAM maxes out whenever I do anything. I am using Python with Keras and a plain-text corpus as the dataset.

I have tried setting batch_size, steps, and hidden_size to their lowest values (except steps, which I made higher, since that lowers RAM usage). I tried finding solutions on Google and even resorted to ChatGPT for help, but none of it worked. I would greatly appreciate help with this. Code:

from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import tensorflow as tf
from keras.utils import Sequence

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.callbacks import LambdaCallback, ModelCheckpoint, ReduceLROnPlateau
import random
import sys

class TextDataGenerator(Sequence):
    def __init__(self, text, vocabulary, char_to_indices, indices_to_char, max_length, batch_size):
        # `text` here is the list of (max_length + 1)-character windows built
        # further down, not the raw corpus string.
        self.text = text
        self.vocabulary = vocabulary
        self.char_to_indices = char_to_indices
        self.indices_to_char = indices_to_char
        self.max_length = max_length
        self.batch_size = batch_size
        # One step per full batch of windows
        self.steps = len(text) // batch_size
        
    def __len__(self):
        return self.steps
    
    def __getitem__(self, idx):
        batch_start = idx * self.batch_size
        batch_end = (idx + 1) * self.batch_size
        windows = self.text[batch_start:batch_end]
        # One-hot encode on the fly; bool keeps the arrays at one byte per cell
        X = np.zeros((self.batch_size, self.max_length, len(self.vocabulary)), dtype=bool)
        y = np.zeros((self.batch_size, len(self.vocabulary)), dtype=bool)
        for i, window in enumerate(windows):
            # The first max_length characters of the window are the input...
            for t, char in enumerate(window[:-1]):
                X[i, t, self.char_to_indices[char]] = 1
            # ...and the final character is the prediction target
            y[i, self.char_to_indices[window[-1]]] = 1
        return X, y
    
    def on_epoch_end(self):
        # Shuffle the list of windows in place between epochs
        random.shuffle(self.text)

with open('/kaggle/input/crptic-python/python.txt', 'r') as file:
    text = file.read()

# Build the character-level vocabulary
vocabulary = sorted(list(set(text)))

char_to_indices = dict((c, i) for i, c in enumerate(vocabulary))
indices_to_char = dict((i, c) for i, c in enumerate(vocabulary))
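# For example, with text = "abca", vocabulary is ['a', 'b', 'c'], so
# char_to_indices == {'a': 0, 'b': 1, 'c': 2} and indices_to_char inverts it.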

# Dividing the text into overlapping windows of max_length + 1 characters:
# the first max_length characters are the input and the final character is
# the prediction target.
max_length = 100
batch_size = 32
steps = 10  # stride between the start positions of consecutive windows
sentences = []
# Each window holds the max_length input characters plus the target character,
# so the generator reads the target from the window's last position.
for i in range(0, len(text) - max_length, steps):
    sentences.append(text[i: i + max_length + 1])
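# NOTE: this loop still materializes one window for every `steps`-th position
# of the corpus. For a corpus of over a gigabyte that is tens of millions of
# small Python strings, which is the most likely source of the RAM blow-up;
# see the streaming sketch after the listing.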

# Building the LSTM network for the task
model = Sequential()
model.add(LSTM(128, input_shape=(max_length, len(vocabulary))))
model.add(Dense(len(vocabulary)))
model.add(Activation('softmax'))
optimizer = RMSprop(learning_rate=0.01)  # 'lr' is deprecated in newer Keras versions
model.compile(loss='categorical_crossentropy', optimizer=optimizer)


# Helper function to sample an index from a probability array
def sample_index(preds, temperature=1.0):
    # Temperature sampling: scale the log-probabilities by 1 / temperature,
    # renormalize, then draw; temperatures below 1.0 sharpen the distribution.
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
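# For example, sample_index([0.7, 0.2, 0.1], temperature=0.5) renormalizes the
# distribution to roughly [0.91, 0.07, 0.02] before drawing, while a
# temperature above 1.0 flattens it toward uniform.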


# Helper function to generate text after the end of each epoch
def on_epoch_end(epoch, logs):
    if epoch % 1 == 0:  # always true as written; raise the modulus to sample less often
        print()
        print('----- Generating text after Epoch: %d' % epoch)

        start_index = random.randint(0, len(text) - max_length - 1)
        for diversity in [0.1, 0.3, 0.5]:
            print('----- diversity:', diversity)

            generated = ''
            sentence = text[start_index: start_index + max_length]
            generated += sentence
            print('----- Generating with seed: "' + sentence + '"')
            sys.stdout.write(generated)

            for i in range(400):
                x_pred = np.zeros((1, max_length, len(vocabulary)))
                for t, char in enumerate(sentence):
                    x_pred[0, t, char_to_indices[char]] = 1.

                preds = model.predict(x_pred, verbose=0)[0]
                next_index = sample_index(preds, diversity)
                next_char = indices_to_char[next_index]

                generated += next_char
                sentence = sentence[1:] + next_char

                sys.stdout.write(next_char)
                sys.stdout.flush()
            print()


print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Checkpoint callback: save the model after each epoch
# in which the loss decreases
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss',
                             verbose=1, save_best_only=True,
                             mode='min')

# Callback to reduce the learning rate each time
# the loss plateaus
reduce_alpha = ReduceLROnPlateau(monitor='loss', factor=0.2,
                                 patience=1, min_lr=0.001)
callbacks = [print_callback, checkpoint, reduce_alpha]

# Training the LSTM model
data_generator = TextDataGenerator(sentences, vocabulary, char_to_indices, indices_to_char, max_length, batch_size)
model.fit(data_generator, epochs=2, callbacks=callbacks)
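# Assumption (depends on the installed Keras/TF version): fit() prefetches
# batches from a Sequence into an internal queue, so several batches can sit
# in RAM at once. If memory is tight, the queue can be capped, e.g.:
#   model.fit(data_generator, epochs=2, callbacks=callbacks,
#             max_queue_size=2, workers=1, use_multiprocessing=False)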

def generate_text(length, diversity):
    # Get random starting text
    start_index = random.randint(0, len(text) - max_length - 1)
    generated = ''
    sentence = text[start_index: start_index + max_length]
    generated += sentence
    for i in range(length):
        x_pred = np.zeros((1, max_length, len(vocabulary)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_to_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample_index(preds, diversity)
        next_char = indices_to_char[next_index]

        generated += next_char
        sentence = sentence[1:] + next_char
    return generated


print(generate_text(500, 0.5))
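
For scale: the sentences list above materializes one (max_length + 1)-character window for every steps-th position of the corpus. With a corpus over a gigabyte and steps = 10, that is on the order of a hundred million small Python strings, each carrying roughly 50 bytes of object overhead on top of its characters, so the list alone can exceed the 13 GB limit. Below is a minimal sketch of a lower-memory variant that keeps only the raw corpus string plus an array of window start offsets and builds each window inside __getitem__. The class name StreamingTextGenerator is hypothetical, and the sketch assumes the same vocabulary and char_to_indices mapping as above.

import numpy as np
from keras.utils import Sequence

class StreamingTextGenerator(Sequence):
    """Sketch: batches are built from the raw corpus string on demand, so
    only the string, the start offsets, and one batch of one-hot arrays
    need to be resident in RAM at any time."""

    def __init__(self, text, vocabulary, char_to_indices, max_length, batch_size, step=10):
        self.text = text
        self.vocabulary = vocabulary
        self.char_to_indices = char_to_indices
        self.max_length = max_length
        self.batch_size = batch_size
        # Start offsets are 8-byte integers rather than copies of the text
        self.starts = np.arange(0, len(text) - max_length - 1, step)

    def __len__(self):
        return len(self.starts) // self.batch_size

    def __getitem__(self, idx):
        starts = self.starts[idx * self.batch_size:(idx + 1) * self.batch_size]
        X = np.zeros((len(starts), self.max_length, len(self.vocabulary)), dtype=bool)
        y = np.zeros((len(starts), len(self.vocabulary)), dtype=bool)
        for i, s in enumerate(starts):
            # Slice one window out of the corpus only when it is needed
            window = self.text[s:s + self.max_length + 1]
            for t, char in enumerate(window[:-1]):
                X[i, t, self.char_to_indices[char]] = 1
            y[i, self.char_to_indices[window[-1]]] = 1
        return X, y

    def on_epoch_end(self):
        # Shuffle offsets between epochs, not text copies
        np.random.shuffle(self.starts)

Used as a drop-in for the original generator, model.fit(StreamingTextGenerator(text, vocabulary, char_to_indices, max_length, batch_size), epochs=2, callbacks=callbacks) should keep per-epoch memory roughly flat regardless of corpus size.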
