LSTM model accuracy at 10%


I'm making a model that classifies .txt documents into categories, and I have a problem: the accuracy is stuck at about 10%. Please suggest how to make it better. For preprocessing I used bag-of-words (BoW):

from sklearn.feature_extraction.text import CountVectorizer
import joblib
import os

dataset_directories = ['dataset/sport', 'dataset/business', 'dataset/entertainment', 'dataset/food',
                       'dataset/technology', 'dataset/space', 'dataset/politics', 'dataset/medical',
                       'dataset/historical', 'dataset/graphics']

all_document_texts = []
custom_stopwords_file = 'stopword'

with open(custom_stopwords_file, 'r') as stopword_file:
    custom_stopwords = stopword_file.read().splitlines()  # one stopword per line

bow = CountVectorizer(lowercase=True, stop_words=custom_stopwords)

def remove_custom_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in custom_stopwords]
    return ' '.join(filtered_words)

for data_directory in dataset_directories:
    document_texts = []
    for filename in os.listdir(data_directory):
        if filename.endswith('.txt'):
            file_path = os.path.join(data_directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                document_texts.append(text)

    document_texts = [remove_custom_stopwords(text) for text in document_texts]

    all_document_texts.extend(document_texts)

bow_result = bow.fit_transform(all_document_texts)

joblib.dump(bow, 'bow_vectorizer.pkl')
joblib.dump(bow.vocabulary_, 'bow_vocabulary.pkl')

print('\nBag of Words (BoW) values:')
print(bow_result)
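
For comparison, this is roughly how I understand the input for an Embedding/LSTM is usually prepared, as integer word-index sequences rather than counts (a minimal, untested sketch using Keras's Tokenizer; MAX_WORDS is just a placeholder value I made up):

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_WORDS = 20000  # hypothetical vocabulary cap, not tuned for my dataset

tokenizer = Tokenizer(num_words=MAX_WORDS, lower=True)
tokenizer.fit_on_texts(all_document_texts)

# each document becomes a sequence of word indices, padded to equal length
sequences = tokenizer.texts_to_sequences(all_document_texts)
X_seq = pad_sequences(sequences, padding='post')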

This is what I use for normalizing the vectors:

from sklearn.preprocessing import StandardScaler
import joblib
import numpy as np

bow_vectorizer = joblib.load('bow_vectorizer.pkl')
bow_vocabulary = joblib.load('bow_vocabulary.pkl')

# bow_vocabulary maps word -> column index; this takes only the index values
bow_vocabulary_array = np.array(list(bow_vocabulary.values()))

bow_vocabulary_array = bow_vocabulary_array.reshape(-1, 1)

scaler = StandardScaler()
bow_vocabulary_scaled = scaler.fit_transform(bow_vocabulary_array)
joblib.dump(bow_vocabulary_scaled, 'preprocessed_bow_vocabulary.pkl')
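
If the intent is to normalize the document vectors themselves rather than the vocabulary index values, I believe it would look more like this (untested sketch; bow_result is the document-term matrix from the first script):

from sklearn.preprocessing import StandardScaler

# scale each vocabulary column of the document-term count matrix;
# with_mean=False keeps the sparse matrix sparse
scaler = StandardScaler(with_mean=False)
bow_scaled = scaler.fit_transform(bow_result)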

And finally, this is the model:

import os
import numpy as np
import joblib
from collections import defaultdict
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam


learning_rate = 0.001

# note: this loads the scaled vocabulary array saved above, not the fitted CountVectorizer
bow_vectorizer = joblib.load('preprocessed_bow_vocabulary.pkl')

dataset_directories = ['dataset/sport', 'dataset/business', 'dataset/entertainment', 'dataset/food',
                       'dataset/technology', 'dataset/space', 'dataset/politics', 'dataset/medical',
                       'dataset/historical', 'dataset/graphics']

texts = []
labels = []
label_counts = defaultdict(int)
for i, data_directory in enumerate(dataset_directories):
    for filename in os.listdir(data_directory):
        if filename.endswith('.txt'):
            file_path = os.path.join(data_directory, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
                    texts.append(text)
                    labels.append(i)
                    label_counts[i] += 1
            except Exception as e:
                print(f"Error reading file '{file_path}': {e}")

max_len = max(len(text.split()) for text in texts)

X_pad = []
for text in texts:
    bow_vector = []
    for word in text.split():
        if word in bow_vectorizer:
            bow_vector.append(bow_vectorizer[word])
        else:
            bow_vector.append(0)
    padded_vector = pad_sequences([bow_vector], padding='post', maxlen=max_len)
    X_pad.append(padded_vector)

X_pad = np.array(X_pad).squeeze()
y = np.array(labels)

print("Shape of X_pad:", X_pad.shape)
print("Number of labeled files per category:")
for label, count in label_counts.items():
    print(f"Category {label}: {count} files")

print("Total labeled files:", len(texts))

if len(X_pad) != len(y):
    print("Error: Number of samples in X_pad and y do not match.")
else:
    print("Number of samples in X_pad and y match.")

X_train, X_val, y_train, y_val = train_test_split(X_pad, y, test_size=0.2, random_state=42)

input_dim = X_pad.shape[1]
num_words = X_pad.shape[1]
embedding_dim = 100

model = Sequential([
    Embedding(input_dim=num_words, output_dim=embedding_dim),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(dataset_directories), activation='softmax')
])

optimizer = Adam(learning_rate=learning_rate)

model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=40, batch_size=64)
accuracy = history.history['accuracy'][-1]
print("Accuracy: %.2f%%" % (accuracy * 100))
model.save('lstm_model.keras')
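
I also noticed that I create X_val and y_val but never use them, so the accuracy printed above is training accuracy only. I assume measuring held-out accuracy would look like this (untested sketch):

# train with the held-out split and report accuracy on unseen data
history = model.fit(X_train, y_train, epochs=40, batch_size=64,
                    validation_data=(X_val, y_val))
val_loss, val_accuracy = model.evaluate(X_val, y_val)
print("Validation accuracy: %.2f%%" % (val_accuracy * 100))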


I want the accuracy to be at least about 60%.
