I am using spacy to enter some customized entities to run NER. Here is what I tried,
import spacy
from spacy.tokens import Span
from spacy.util import filter_spans
from spacy.matcher import PhraseMatcher
nlp = spacy.load("en_core_web_sm")
phrase_matcher = PhraseMatcher(nlp.vocab)
#countries = [nlp.make_doc(text) for text in ['Canada', 'United States', 'Mexico']]
#days = [nlp.make_doc(text) for text in ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
#'Friday', 'Saturday', 'Sunday']]
food_list = [nlp.make_doc(text) for text in [
'breast milk',
'Diet: Christoval COURSE',
'VITAMIN D',
'liquid predominant diet',
'formula',
'vitamin D supplementation',
'tyrosine supplementation',
'meals',
'water'
]]
protein=[nlp.make_doc(text)for text in ['protein']] # Lookbehind for "protein"
phrase_matcher.add("DIET",None, *food_list)
#phrase_matcher.add("protein",None, *protein)
text = 'She intake of 3 meals per day of low protein foods.Encouraged intake of water instead of calorie drinks. Aim for <10 gm per day of protein from the diet. Tyrosine supplementation, Vitamin D'
doc1 = nlp(text)
matches = phrase_matcher(doc1)
# Assign labels and update Doc object with entities
for match_id, start, end in matches:
rule_id = nlp.vocab.strings[match_id] # get the unicode ID, i.e. 'COLOR'
span = doc1[start : end] # get the matched slice of the doc
print(rule_id, span.text)
# Initialize a list to store entities
# Filter spans to keep only the longest or first matching entity
filtered_entities = []
for match_id, start, end in matches:
span = Span(doc1, start, end, label="DIET")
filtered_span = filter_spans([span])[0]
print(f"Filtered Span: {filtered_span.text}") # Check filtered span text
filtered_entities.append(filtered_span)
# Add filtered entities to doc
for ent in filtered_entities:
if not any(ent.start <= token.idx < ent.end for token in doc1):
doc1.ents = list(doc1.ents) + [ent]
# Render the entity's
displacy.render(doc1, style="ent")
However, the About code snippet is throwing an error.
I have used the suggested filter_span
DIET meals
DIET water
DIET vitamin D supplementation
DIET formula
DIET tyrosine supplementation
Filtered Span: meals
Filtered Span: water
Filtered Span: vitamin D supplementation
Filtered Span: formula
Filtered Span: tyrosine supplementation
ValueError: [E1010] Unable to set entity information for token 30 which is included in more than one span in entities, blocked, missing or outside.
Any help or support is appreciated