The dataset is a list of about 250,000 words of varying lengths. Here is the code I used to train the RNN model:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load dataset
with open("/content/drive/MyDrive/Colab Notebooks/HangmanTest/words_250000_train.txt", "r") as file:
    dataset = file.read().splitlines()

# Function to prepare data
def prepare_data(dataset):
    # Preprocess dataset: collect the character vocabulary and the longest word length
    vocab = set()
    max_sequence_length = 0
    for word in dataset:
        word = word.lower()  # Convert to lowercase
        vocab.update(set(word))
        max_sequence_length = max(max_sequence_length, len(word))
    vocab_size = len(vocab)
    char_to_index = {char: index for index, char in enumerate(sorted(vocab))}
    index_to_char = {index: char for char, index in char_to_index.items()}

    # Generate sequences (each character index is both input and target)
    X, y = [], []
    for word in dataset:
        word = word.lower()  # Convert to lowercase
        X_word, y_word = [], []
        for char in word:
            X_word.append(char_to_index[char])
            y_word.append(char_to_index[char])
        X.append(X_word)
        y.append(y_word)

    # Pad sequences to the maximum sequence length
    X_padded = pad_sequences(X, maxlen=max_sequence_length, padding='post')
    y_padded = pad_sequences(y, maxlen=max_sequence_length, padding='post')
    return X_padded, y_padded, vocab_size, max_sequence_length, char_to_index, index_to_char

# Prepare data
X_train, y_train, vocab_size, max_sequence_length, char_to_index, index_to_char = prepare_data(dataset)

# Define model architecture
embedding_dim = 50
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length),
    LSTM(units=64, return_sequences=True),
    Dense(units=vocab_size, activation='softmax')
])

# Compile model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train model
batch_size = 64
num_epochs = 10
model.fit(X_train, y_train, batch_size=batch_size, epochs=num_epochs, validation_split=0.2)

# Save the model
model.save("/content/drive/MyDrive/Colab Notebooks/HangmanTest/Trained Model/Model 2")
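For reference, the mapping this produces only covers characters actually seen in the dataset; a quick check (assuming the words contain only lowercase a-z, so valid embedding indices are [0, 26), which matches the range in the error below):

# Sanity check: the Embedding layer only accepts indices in [0, vocab_size)
print(vocab_size)                         # 26 for a lowercase a-z vocabulary
print(sorted(char_to_index.items())[:3])  # e.g. [('a', 0), ('b', 1), ('c', 2)]
assert all(0 <= i < vocab_size for i in char_to_index.values())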
Here is the code of a Hangman-style game where I am trying to use the model:
import collections
import random
import re
import time

import numpy as np
import tensorflow as tf

class HangmanOffline(object):
    def __init__(self):
        self.full_dictionary = self.build_dictionary("/content/drive/MyDrive/Colab Notebooks/HangmanTest/words_250000_train.txt")
        self.guessed_letters = []
        self.full_dictionary_common_letter_sorted = collections.Counter("".join(self.full_dictionary)).most_common()
        self.current_dictionary = []
        # Load the trained model once and reuse it for every prediction
        self.model = tf.keras.models.load_model("/content/drive/MyDrive/Colab Notebooks/HangmanTest/Trained Model/Model 2")
        self.char_to_index = {}
        self.index_to_char = {}
        self.build_index_maps()

    def build_index_maps(self):
        vocab = set("".join(self.full_dictionary))
        self.char_to_index = {char: index for index, char in enumerate(sorted(vocab))}
        self.index_to_char = {index: char for char, index in self.char_to_index.items()}
    def preprocess_word(self, word):
        # Characters not in the training vocabulary (e.g. the blanks) map to -1
        return [self.char_to_index[char] if char in self.char_to_index else -1 for char in word]

    def predict_next_letter(self, word):
        encoded_word = self.preprocess_word(word)
        # Pad to the model's fixed input length (29 here)
        padded_word = np.pad(encoded_word, (0, 29 - len(encoded_word)), mode='constant')
        padded_word = np.array([padded_word])  # Add batch dimension
        next_letter_index = np.argmax(self.model.predict(padded_word), axis=-1)
        return self.index_to_char[next_letter_index[0]]
    def guess(self, word):
        # Turn the masked word into a regex pattern: blanks become wildcards
        clean_word = word.replace("_", ".")
        len_word = len(clean_word)
        current_dictionary = self.current_dictionary
        new_dictionary = []
        for dict_word in current_dictionary:
            if len(dict_word) != len_word:
                continue
            if re.match(clean_word, dict_word):
                new_dictionary.append(dict_word)
        self.current_dictionary = new_dictionary
        guess_letter = self.predict_next_letter(word)
        return guess_letter

    # The rest of the class remains unchanged...
    def build_dictionary(self, dictionary_file_location):
        with open(dictionary_file_location, "r") as text_file:
            full_dictionary = text_file.read().splitlines()
        return full_dictionary
    def start_game(self, verbose=True):
        self.guessed_letters = []
        self.current_dictionary = self.full_dictionary
        word = random.choice(self.full_dictionary)
        hidden_word = '_' * len(word)  # Blanks are underscores, matching guess()
        tries_remains = 6  # Adjusted to match the online version
        game_id = ''.join(random.choices('0123456789abcdef', k=12))  # Generate a random game ID
        if verbose:
            print(f"Successfully started a new game! Game ID: {game_id}. # of tries remaining: {tries_remains}. Word: {hidden_word}.")
        while tries_remains > 0:
            guess_letter = self.guess(hidden_word)
            if guess_letter is None:
                # No possible guess. Fall back to original logic.
                guess_letter = self.original_guess()
            self.guessed_letters.append(guess_letter)
            if verbose:
                print(f"Guessing letter: {guess_letter}")
            if guess_letter in word:
                new_hidden_word = ''
                for i, letter in enumerate(word):
                    if letter == guess_letter:
                        new_hidden_word += letter
                    else:
                        new_hidden_word += hidden_word[i]
                if new_hidden_word == word:
                    if verbose:
                        print(f"Word: {new_hidden_word}")
                        print("Congratulations! You won!")
                    return True
                hidden_word = new_hidden_word
                if verbose:
                    print(f"Word: {hidden_word}")
            else:
                tries_remains -= 1
                if verbose:
                    print(f"Wrong attempt! Word: {hidden_word} Tries remaining: {tries_remains}")
        if tries_remains == 0:
            if verbose:
                print(f"Failed game: {game_id}. Because of: # of tries exceeded!")
            return False
# Play the game a certain number of times
total_recorded_runs = 10
total_recorded_successes = 0
hangman_game = HangmanOffline()
for i in range(total_recorded_runs):
    print(f'Playing game {i + 1}')
    if hangman_game.start_game(verbose=True):
        total_recorded_successes += 1
    time.sleep(0.5)

# Calculate success rate
success_rate = (total_recorded_successes / total_recorded_runs) * 100
print(f'Total recorded successes = {total_recorded_successes}')
print(f'Overall success rate: {success_rate} %')
The problem I am facing is that whatever character I assign to the blanks that appear in the first step of every game, the model tries to embed it and fails with an error like this:
indices[0,0] = -1 is not in [0, 26)
[[{{node sequential/embedding/embedding_lookup}}]] [Op:__inference_predict_function_93797]
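If I read the error correctly, preprocess_word maps the blank character to -1 (because it is not in char_to_index), while the Embedding layer only accepts indices in [0, 26). One direction I have considered is reserving an extra, in-range index for the blank. This is only a rough sketch: BLANK_INDEX is a placeholder name of mine, and the model would have to be retrained with the embedding widened to input_dim=vocab_size + 1 for that index to mean anything.

# Rough sketch: map blanks to a reserved in-range index instead of -1.
# BLANK_INDEX is a placeholder; the embedding must be retrained with
# input_dim=vocab_size + 1 so that this index actually exists.
BLANK_INDEX = 26  # first index past 'a'..'z' (0-25)

def preprocess_word_safe(word, char_to_index, blank_index=BLANK_INDEX):
    # Known letters keep their trained indices; anything else becomes blank_index
    return [char_to_index.get(char, blank_index) for char in word]

# Corresponding training-side change:
# Embedding(input_dim=vocab_size + 1, output_dim=embedding_dim, input_length=max_sequence_length)

I am not sure whether this is the right approach, though.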
How can I fix this issue?