I am rather new to deep learning, so I'd appreciate simple explanations. I'm using a dataset that contains news article titles and an accompanying human-labelled category. I want to demonstrate the usefulness of SBERT sentence embeddings and build a model that can correctly label the titles. This is my attempt below, but it gives a very low accuracy of 0.45. I am open to any critiques and comments.
Thanks
import pandas as pd
from nltk.tokenize import word_tokenize  # requires nltk.download('punkt')
from sentence_transformers import SentenceTransformer

df = pd.read_csv('Medium_Titles.csv')
# keep only the rows whose label is one of the four categories of interest
selected_categories = ['health', 'family', 'education', 'politics']
final_df = df[df["category"].isin(selected_categories)]
final_df.head()
# titles are the inputs, categories are the labels
corpus = final_df['title']
category_labels = final_df['category']
# vocabulary statistics (informational only; SBERT does its own tokenization)
data = corpus.map(word_tokenize).values
total_vocabulary = set(word.lower() for title in data for word in title)
print('There are {} unique words in the dataset.'.format(len(total_vocabulary)))
print('There are {} titles in the dataset.'.format(len(data)))
# set the article category as our target
target = final_df['category']
# use one-hot encoding since our target is categorical
y = pd.get_dummies(target).values
num_categories = y.shape[1]  # 4 categories
corpus = corpus.tolist()
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Encode the corpus. This might take a while")
corpus_embeddings = embedding_model.encode(corpus, batch_size=64, show_progress_bar=True, convert_to_tensor=True)
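Each title should come back as one 384-dimensional vector (that is the output size of all-MiniLM-L6-v2), so as a quick sanity check:

print(corpus_embeddings.shape)  # expect (len(corpus), 384)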
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

# the SBERT embeddings are the features; move the tensor to a NumPy array for sklearn
X = corpus_embeddings.cpu().numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
EMBEDDING_DIM = 384  # output dimension of all-MiniLM-L6-v2
input_layer = Input(shape=(EMBEDDING_DIM,))
embedding_layer = Embedding(len(corpus), EMBEDDING_DIM,
                            weights=[X], trainable=False)(input_layer)
lstm_layer = LSTM(128)(embedding_layer)
output_layer = Dense(num_categories, activation='softmax')(lstm_layer)
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary() # check the shape
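For completeness, the training and evaluation step that gives me the 0.45 figure looks roughly like this (a sketch; the exact epoch and batch-size values here are illustrative, not what I necessarily used):

history = model.fit(X_train, y_train,
                    epochs=10, batch_size=64,
                    validation_split=0.1)
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy: {:.2f}'.format(accuracy))  # ~0.45 for me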