How to add a Custom components to spaCy?

Question

I want to add custom components to spacy pipeline, e.g. add additional metadata to tokens or the document or to add entities.

So I build the following ExtractorComponent component that will be added to the spaCy pipeline. However, we create a new custom attribute at the Doc level called doids.

import progressbar
from pronto import Ontology
from spacy.tokens import Doc, Span, Token
from spacy.matcher import PhraseMatcher
from spacy.util import filter_spans
from spacy.language import Language

@Language.component("doid_extractor")    
class DOIDExtractorComponent(object):
    # name of the component
    name = "doid_extractor"

    def __init__(self, nlp, label="DOID"):
        # label that is applied to the matches
        self.label = label

        # load ontology
        print("Loading DOID ontology")
        doid = Ontology("http://purl.obolibrary.org/obo/doid.obo")
        
        # init terms and patterns
        self.terms = {}
        patterns = []

        i = 0
        nr_terms = len(doid.terms())
        # init progress bar as loading terms takes long
        print("Importing terms")
        bar = progressbar.ProgressBar(maxval=nr_terms, 
                                      widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
        bar.start()

        # iterate over terms in ontology
        for term in doid.terms():
          # if term has a name
          if term.name is not None:
            self.terms[term.name.lower()] = {'id': term.id}
            patterns.append(nlp(term.name))
          i += 1
          bar.update(i)
            
        bar.finish()
        
        # initialize matcher and add patterns
        self.matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        self.matcher.add(label, None, *patterns)
        
        # set extensions to tokens, spans and docs
        Token.set_extension("is_doid_term", default=False, force=True)
        Token.set_extension("doid_id", default=False, force=True)
        Token.set_extension("merged_concept", default=False, force=True)
        Doc.set_extension("has_doids", getter=self.has_doids, force=True)
        Doc.set_extension("doids", default=[], force=True)
        Span.set_extension("has_doids", getter=self.has_doids, force=True)
        
    def __call__(self, doc):
        matches = self.matcher(doc)
        spans = [Span(doc, match[1], match[2], label=self.label) for match in matches]
        for i, span in enumerate(spans):
          span._.set("has_doids", True)
          for token in span:
               token._.set("is_doid_term", True)
               token._.set("doid_id", self.terms[span.text.lower()]["id"])

        with doc.retokenize() as retokenizer:
            for span in filter_spans(spans):
                retokenizer.merge(span, attrs={"_": {"merged_concept": True}})
                doc._.doids = list(doc._.doids) + [span]

        return doc
    # setter function for doc level
    def has_doids(self, tokens):
        return any([t._.get("is_doid_term") for t in tokens])

and I use the component like that:

from pronto import Ontology
import progressbar
import spacy
from spacy import displacy
from spacy.tokens import Doc, Span, Token
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.util import filter_spans
from src.iccoExtractor import DOIDExtractorComponent
from spacy.language import Language
# # python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
 
nlp.add_pipe("doid_extractor", after="ner")
   
text = """Primary education, Intermediate education, and secondary education combined are sometimes referred to as K-12 (Kindergarten through Grade 12). Secondary schooling, known as high school, collegiate institute, école secondaire or secondary school, consists of different grades depending on the province in which one resides"""

doc = nlp(text)

So, when I execute the above code I get the following exception

Traceback (most recent call last):                                       ]   0%
  File "C:\Users\Admin\Documents\onto.py", line 21, in <module>
    doc = nlp(text)
  File "C:\Users\Admin\miniconda3\envs\projet1\lib\site-packages\spacy\language.py", line 1025, in __call__
    error_handler(name, proc, [doc], e)
  File "C:\Users\Admin\miniconda3\envs\projet1\lib\site-packages\spacy\util.py", line 1630, in raise_error
    raise e
  File "C:\Users\Admin\miniconda3\envs\projet1\lib\site-

packages\spacy\language.py", line 1020, in __call__
    doc = proc(doc, **component_cfg.get(name, {}))  # type: ignore[call-arg]
  File "C:\Users\Admin\Documents\src\iccoExtractor.py", line 38, in __init__
    patterns.append(nlp(term.name))
TypeError: 'spacy.tokens.doc.Doc' object is not callable

I dont know exactly what that means? any suggestion to solve this issue?

You've probably already found this, but here's what it means: https://stackoverflow.com/questions/21324940/what-does-typeerror-xxx-object-is-not-callable-means So something in "patterns.append(nlp(term.name))" is not callable. "patterns" looks alright to me, but I have problems to understand where "nlp" comes from. I see it in the constructor, but where do you hand over the variable? I think your variable "nlp" in the constructor is misplaced. Look at the example in the spacy documentation: https://spacy.io/usage/processing-pipelines#custom-components Hope that helps. — EustassX, Jul 22 '22 at 07:03
Same question on spaCy forums, with an answer. https://github.com/explosion/spaCy/discussions/11181 — polm23, Jul 24 '22 at 05:05

How to add a Custom components to spaCy?

0 Answers0