I want to add custom components to spacy pipeline, e.g. add additional metadata to tokens or the document or to add entities.
So I build the following ExtractorComponent component that will be added to the spaCy pipeline. However, we create a new custom attribute at the Doc level called doids.
import progressbar
from pronto import Ontology
from spacy.tokens import Doc, Span, Token
from spacy.matcher import PhraseMatcher
from spacy.util import filter_spans
from spacy.language import Language
@Language.component("doid_extractor")
class DOIDExtractorComponent(object):
# name of the component
name = "doid_extractor"
def __init__(self, nlp, label="DOID"):
# label that is applied to the matches
self.label = label
# load ontology
print("Loading DOID ontology")
doid = Ontology("http://purl.obolibrary.org/obo/doid.obo")
# init terms and patterns
self.terms = {}
patterns = []
i = 0
nr_terms = len(doid.terms())
# init progress bar as loading terms takes long
print("Importing terms")
bar = progressbar.ProgressBar(maxval=nr_terms,
widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
bar.start()
# iterate over terms in ontology
for term in doid.terms():
# if term has a name
if term.name is not None:
self.terms[term.name.lower()] = {'id': term.id}
patterns.append(nlp(term.name))
i += 1
bar.update(i)
bar.finish()
# initialize matcher and add patterns
self.matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
self.matcher.add(label, None, *patterns)
# set extensions to tokens, spans and docs
Token.set_extension("is_doid_term", default=False, force=True)
Token.set_extension("doid_id", default=False, force=True)
Token.set_extension("merged_concept", default=False, force=True)
Doc.set_extension("has_doids", getter=self.has_doids, force=True)
Doc.set_extension("doids", default=[], force=True)
Span.set_extension("has_doids", getter=self.has_doids, force=True)
def __call__(self, doc):
matches = self.matcher(doc)
spans = [Span(doc, match[1], match[2], label=self.label) for match in matches]
for i, span in enumerate(spans):
span._.set("has_doids", True)
for token in span:
token._.set("is_doid_term", True)
token._.set("doid_id", self.terms[span.text.lower()]["id"])
with doc.retokenize() as retokenizer:
for span in filter_spans(spans):
retokenizer.merge(span, attrs={"_": {"merged_concept": True}})
doc._.doids = list(doc._.doids) + [span]
return doc
# setter function for doc level
def has_doids(self, tokens):
return any([t._.get("is_doid_term") for t in tokens])
and I use the component like that:
from pronto import Ontology
import progressbar
import spacy
from spacy import displacy
from spacy.tokens import Doc, Span, Token
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.util import filter_spans
from src.iccoExtractor import DOIDExtractorComponent
from spacy.language import Language
# # python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("doid_extractor", after="ner")
text = """Primary education, Intermediate education, and secondary education combined are sometimes referred to as K-12 (Kindergarten through Grade 12). Secondary schooling, known as high school, collegiate institute, école secondaire or secondary school, consists of different grades depending on the province in which one resides"""
doc = nlp(text)
So, when I execute the above code I get the following exception
Traceback (most recent call last): ] 0%
File "C:\Users\Admin\Documents\onto.py", line 21, in <module>
doc = nlp(text)
File "C:\Users\Admin\miniconda3\envs\projet1\lib\site-packages\spacy\language.py", line 1025, in __call__
error_handler(name, proc, [doc], e)
File "C:\Users\Admin\miniconda3\envs\projet1\lib\site-packages\spacy\util.py", line 1630, in raise_error
raise e
File "C:\Users\Admin\miniconda3\envs\projet1\lib\site-
packages\spacy\language.py", line 1020, in __call__
doc = proc(doc, **component_cfg.get(name, {})) # type: ignore[call-arg]
File "C:\Users\Admin\Documents\src\iccoExtractor.py", line 38, in __init__
patterns.append(nlp(term.name))
TypeError: 'spacy.tokens.doc.Doc' object is not callable
I dont know exactly what that means? any suggestion to solve this issue?