1

I trained a custom spaCy NER model, following https://towardsdatascience.com/train-ner-with-custom-training-data-using-spacy-525ce748fab7 and https://spacy.io/usage/processing-pipelines, on a small sample dataset so that it finds exactly the currency codes in a given text.

Example dataset:

# Each item is (text, annotations). Every entity is a (start, end, label)
# triple of character offsets into the text with an exclusive end, e.g.
# 'This is AFN currency'[8:11] == 'AFN'. 'CUR' is the only label used.
TRAIN_DATA = [('This is AFN currency', {'entities': [(8, 11, 'CUR')]}),
              ('I have EUR european currency', {'entities': [(7, 10, 'CUR')]}),
              ('let as have ALL money', {'entities': [(12, 15, 'CUR')]}),
              ('DZD is a dollar', {'entities': [(0, 3, 'CUR')]}),
              ('money USD united states', {'entities': [(6, 9, 'CUR')]})
              ]

I trained the model successfully and named it 'currency'. It predicts the correct label for text from the training dataset, but for text it was not trained on it mostly predicts the wrong label.

Input test line: 'I have AZWSQTS lot LOT of Indian MZW currency USD INR'

output:

AZWSQTS - CUR , LOT  - CUR, MZW  - CUR, USD  - CUR, INR  - CUR

Here, 'AZWSQTS' and 'LOT' are not currencies, but the model still labels them as currencies; this is the problem I am facing.

Complete code:

from __future__ import unicode_literals, print_function
import random
from pathlib import Path
import spacy
from tqdm import tqdm
from spacy.training import Example

def spacy_train_model(model=None, output_dir=Path(r"D:\currency"), n_iter=100):
    """Train a spaCy NER component on the sample currency data and save it.

    Args:
        model: Name or path of an existing spaCy pipeline to start from
            (for example "en_core_web_lg"). When None, a blank English
            pipeline is created.
        output_dir: Directory the trained pipeline is written to. It is
            created if missing. Pass None to skip saving.
        n_iter: Number of passes over the training data.

    Returns:
        None. Per-iteration losses and the entities predicted for the
        training texts are printed.
    """
    # Currency codes (the list contains duplicates, empty strings and a
    # 'none' placeholder). It is not referenced by the training code below.
    currency_list = ['AFN', 'EUR', 'EUR', 'ALL', 'DZD', 'USD', 'EUR', 'AOA', 'XCD', 'XCD', 'ARS',
    'AMD', 'AWG', 'SHP', 'AUD', 'EUR', 'AZN', '', 'BSD', 'BHD', 'BDT', 'BBD', 'BYN', 'EUR', 'BZD',
    'XOF', 'BMD', 'BTN', 'BOB', 'USD', 'BAM', 'BWP', 'BRL', 'USD', 'USD', 'BND', 'BGN', 'XOF', 'BIF',
    'CVE', 'KHR', 'XAF', 'CAD', 'USD', 'KYD', 'XAF', 'XAF', 'NZD', 'CLP', 'CNY', 'AUD', 'AUD', 'COP',
    'KMF', 'CDF', 'XAF', 'none', 'CRC', 'XOF', 'HRK', 'CUP', 'ANG', 'EUR', 'CZK', '', 'DKK', 'DJF',
    'XCD', 'DOP', '', 'USD', 'EGP', 'USD', 'XAF', 'ERN', 'EUR', 'SZL', 'ETB', '', 'FKP', 'FJD',
    'EUR', 'EUR', 'EUR', 'XPF', '', 'XAF', 'GMD', 'GEL', 'EUR', 'GHS', 'GIP', 'EUR', 'DKK', 'XCD',
    'EUR', 'USD', 'GTQ', 'GGP', 'GNF', 'XOF', 'GYD', '', 'HTG', 'HNL', 'HKD', 'HUF', 'ISK', 'INR',
    'IDR', 'XDR', 'IRR', 'IQD', 'EUR', 'IMP', 'ILS', 'EUR', '', 'JMD', 'JPY', 'JEP', 'JOD',
    'KZT', 'KES', 'AUD', 'EUR', 'KWD', 'KGS', '', 'LAK', 'EUR', 'LBP', 'LSL', 'LRD', 'LYD', 'CHF',
    'EUR', 'EUR', '', 'MOP', 'MGA', 'MWK', 'MYR', 'MVR', 'XOF', 'EUR', 'USD', 'EUR', 'MRU', 'MUR',
    'EUR', 'MXN', 'USD', 'MDL', 'EUR', 'MNT', 'EUR', 'XCD', 'MAD', 'MZN', 'MMK', '', 'NAD', 'AUD',
    'NPR', 'EUR', 'XPF', 'NZD', 'NIO', 'XOF', 'NGN', 'NZD', 'AUD', 'USD', 'KPW', 'MKD', 'NOK',
    'OMR','PKR', 'USD', 'ILS', 'USD', 'PGK', 'PYG', 'PEN', 'PHP', 'NZD', 'PLN', 'EUR', 'USD','QAR',
    'EUR', 'RON', 'RUB', 'RWF', '', 'USD', 'EUR', 'SHP', 'XCD', 'XCD', 'EUR', 'EUR', 'XCD', 'WST',
    'EUR', 'STN', 'SAR', 'XOF', 'RSD', 'SCR', 'SLL', 'SGD', 'USD', 'ANG', 'EUR', 'EUR', 'SBD', 'SOS',
    'ZAR', 'GBP', 'KRW', 'SSP', 'EUR', 'LKR', 'SDG', 'SRD', 'NOK', 'SEK', 'CHF', 'SYP', '', 'TWD',
    'TJS', 'TZS', 'THB', 'USD', 'XOF', 'NZD', 'TOP', 'TTD', 'GBP', 'TND', 'TRY', 'TMT', 'USD', 'AUD',
    'UGX', 'UAH', 'AED', 'GBP', 'USD', 'UYU', 'USD', 'UZS', '', 'VUV', 'EUR', 'VES', 'VND', '',
    'USD', 'XPF', 'YER', 'ZMW', 'USD']

    # Sample training data: (text, {'entities': [(start, end, label)]}) with
    # character offsets and an exclusive end.
    TRAIN_DATA = [('This is AFN currency', {'entities': [(8, 11, 'CUR')]}),
              ('I have EUR europen currency', {'entities': [(7, 10, 'CUR')]}),
              ('let as have ALL money', {'entities': [(12, 15, 'CUR')]}),
              ('DZD is a dollar', {'entities': [(0, 3, 'CUR')]}),
              ('money USD united states', {'entities': [(6, 9, 'CUR')]})
              ]

    # Load an existing pipeline or start from a blank English one.
    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')
        print("Created blank 'en' model")

    # Set up the pipeline. add_pipe() builds the component AND returns the
    # instance that lives in the pipeline; create_pipe() would return a
    # separate, detached instance, so labels added to it would never reach
    # the component that is actually trained.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner', last=True)
    else:
        ner = nlp.get_pipe('ner')

    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # Examples handed to initialize() so the component is set up from the
    # real training data instead of placeholder input.
    init_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in TRAIN_DATA
    ]

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.select_pipes(disable=other_pipes):  # only train NER
        # NOTE(review): initialize() is also used when starting from a
        # loaded pipeline, as in the original code; confirm whether
        # resetting an existing 'ner' component is intended in that case.
        optimizer = nlp.initialize(lambda: init_examples)
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}

            for text, annotations in tqdm(TRAIN_DATA):
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update(
                    [example],
                    drop=0.5,
                    sgd=optimizer,
                    losses=losses)
            print(losses)

    # Sanity check on the training texts themselves.
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents/exist_ok: do not fail on nested or pre-existing directories.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
    
    

def test_model(text):
    """Load the saved pipeline and print the entities found in each line of *text*.

    The input is split on newline characters; every piece is processed
    separately and each recognised entity is printed as "<text> <label>".
    """
    pipeline = spacy.load(r'D:\currency')
    lines = text.split('\n')
    for line in lines:
        entities = pipeline(line).ents
        for entity in entities:
            print(entity.text, entity.label_)
        
        
# Script entry: these calls run whenever the file is executed or imported.
# The model is trained and written to disk first, then loaded back and
# applied to the placeholder string 'text'.
spacy_train_model()     # Train the model and save it to disk
test_model('text')      # Load the saved model and print entities for 'text'
desertnaut
  • 57,590
  • 26
  • 140
  • 166
sasikumar
  • 11
  • 2

1 Answers1

0

Couple of thoughts here...

You can't train a model with only five examples. Maybe this is just example code and you have more, but you generally need hundreds of examples.

If you only need to recognize currency names like USD or GBP, use spaCy's rule-based matchers. You would only need an NER model if these are ambiguous somehow. Like if ALL is a currency, but you don't want to recognize it in "I ate ALL the donuts", an NER model can help, but that's a pretty hard distinction to learn, so you'll need hundreds of examples.

What is probably happening in your example problem is that the NER model has learned that any all-capital token is a currency. If you want to fix that with an NER model, you'll need to give it examples to learn from where an all-capital token isn't a currency.

polm23
  • 14,456
  • 7
  • 35
  • 59