I am having trouble with the Transformers library's FlauBERT module. I have been trying to summarize texts in French before adding them to a database, but I keep getting the error "'numpy.int32' object is not callable". Since the texts are long, I am summarizing them in batches.
I'm not sure which part of the summarization function is causing the problem. I imported both NumPy and TensorFlow in my code. Here are the snippets for the function definition and the main loop in question:
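(For reference, these are the imports the snippets rely on, reconstructed from the calls in my code. The remaining names, such as validate_input, extract_text_from_pdf, clean_text, preprocess_text, load_flaubert_model, connect_to_database and insert_data, are my own helper functions.)

import numpy as np
import tensorflow as tf
from pathlib import Path
from tqdm import tqdm
from transformers import FlaubertTokenizer, pipeline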
Function definition:
...
def summarize_text_batch(batch, language):
    """
    Summarizes cleaned text. Processing is done in batches, as mentioned above.
    Parameters:
    - batch (list): List of cleaned text segments.
    - language (str): Language identifier ('en' for English, 'fr' for French).
    Returns:
    - summaries (list): List of summarized text segments.
    """
    try:
        summaries = []
        if language == 'fr':
            tokenizer = FlaubertTokenizer.from_pretrained("jplu/tf-flaubert-base-cased")
        for i, text_segment in enumerate(tqdm(batch, desc="Summarizing")):
            if language == 'en':
                summarizer = pipeline("summarization", device=0, batch_size=1600)
                current_summary = summarizer(text_segment, max_length=13, min_length=2, length_penalty=2.0, num_beams=4, early_stopping=True)
            elif language == 'fr':
                inputs = tokenizer(text_segment, return_tensors='tf', max_length=512, truncation=True)
                outputs = flaubert_model(**inputs)
                summary_logits = outputs.logits
                summary_ids = tf.argmax(summary_logits, axis=-1)
                current_summary = tokenizer.decode(tf.convert_to_tensor(summary_ids[0].numpy().tolist()), skip_special_tokens=True)
            else:
                raise ValueError(f"Unsupported language: {language}")
            print(f"Batch {i + 1} summary: {current_summary}")
            summaries.append(current_summary)
    except Exception as e:
        print(f"Error summarizing batch {i + 1}: {e}.")
        summaries.append(None)
...
Main loop:
...
flaubert_model = load_flaubert_model("jplu/tf-flaubert-base-cased")
while True:
    pdf_path_str = validate_input("PDF filepath")
    organization = validate_input("Organization")
    document_type = validate_input("Document Type")
    category = validate_input("Category")
    clientele = validate_input("Clientele")
    language = validate_input("Language")
    try:
        pdf_path = Path(pdf_path_str)
        content = extract_text_from_pdf(pdf_path)
        # Cleaning text
        cleaned_content = clean_text(content)
        # print(cleaned_content)  # Debugging
        # Preprocessing text
        cleaned_content = preprocess_text(cleaned_content)
        # Split the cleaned content into smaller batches
        batch_size = 1600
        batches = [cleaned_content[i:i+batch_size] for i in range(0, len(cleaned_content), batch_size)]
        # Summarize each batch
        summaries = []
        for i, batch in enumerate(batches):
            print(f"Processing batch {i + 1}/{len(batches)}")
            current_summary = summarize_text_batch([batch], language)
            if current_summary and None not in current_summary:
                summaries.extend(current_summary)
        # Flatten the nested list and then join
        summary = ' '.join(item for sublist in summaries for item in sublist if item is not None)
        with connect_to_database() as connection:
            insert_data(connection, organization, document_type, category, clientele, language, str(pdf_path), content, summary)
        print("Operation successful. Please proceed with the next document.")
    except OSError as e:
        print(f"Error processing PDF: {e}. Please enter a valid PDF path.")
I tried changing the current_summary assignment inside the function definition so that tokenizer.decode would accept NumPy values.
Below are the modifications I tried:
current_summary = tokenizer.decode(np.array(summary_ids[0]), skip_special_tokens=True)
current_summary = tokenizer.decode(summary_ids[0].numpy(), skip_special_tokens=True)
current_summary = tokenizer.decode(list(summary_ids[0]), skip_special_tokens=True)
current_summary = tokenizer.decode(summary_ids[0].numpy().tolist(), skip_special_tokens=True)
current_summary = tokenizer.decode(tf.convert_to_tensor(summary_ids[0].numpy().tolist()), skip_special_tokens=True)
All of them gave me the same error.
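In case it helps to reproduce this outside the full pipeline, here is a stripped-down version of the French branch (same checkpoint and the same calls as in summarize_text_batch; the sample sentence is just a placeholder):

import tensorflow as tf
from transformers import FlaubertTokenizer, TFFlaubertWithLMHeadModel

tokenizer = FlaubertTokenizer.from_pretrained("jplu/tf-flaubert-base-cased")
flaubert_model = TFFlaubertWithLMHeadModel.from_pretrained("jplu/tf-flaubert-base-cased")

# Same steps as the 'fr' branch above
inputs = tokenizer("Ceci est un exemple de texte en français.", return_tensors='tf',
                   max_length=512, truncation=True)
outputs = flaubert_model(**inputs)
summary_ids = tf.argmax(outputs.logits, axis=-1)
print(tokenizer.decode(summary_ids[0].numpy().tolist(), skip_special_tokens=True))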