1

Hi, I am new to Python and I need some help. I am trying to run a file on Windows 10 with Python 2.7.

import os
import re
import codecs
import numpy as np
import theano


# Base directory for models, relative to the current working directory.
models_path = "./models"
# Base directory holding the evaluation script and its temporary files.
eval_path = "./evaluation"
# Directory where evaluate() writes its "eval.<id>.output" / ".scores" files.
eval_temp = os.path.join(eval_path, "temp")
# Command that evaluate() runs through the shell to score predictions.
eval_script = os.path.join(eval_path, "conlleval")


def get_name(parameters):
    """
    Generate a model name from its parameters.

    Every (key, value) pair is rendered as "key=value" and the pairs are
    joined with commas, following the iteration order of `parameters`.
    String values containing a "/" are reduced to the part after the last
    slash, commas inside values are dropped, and finally the characters
    \\ / : * ? < > | are stripped from the whole name.

    Returns the resulting name as a string.
    """
    pairs = []
    for key, value in parameters.items():
        if isinstance(value, str) and "/" in value:
            # Keep only what follows the last slash (empty if it ends with one).
            value = value.rsplit("/", 1)[-1]
        # Commas separate the pairs, so they cannot appear inside a value.
        pairs.append("%s=%s" % (key, str(value).replace(",", "")))
    name = ",".join(pairs)
    # Backslash is escaped explicitly: "\/" is not a valid escape sequence.
    forbidden = "\\/:*?<>|"
    return "".join(char for char in name if char not in forbidden)


def set_values(name, param, pretrained):
    """
    Load pretrained values into a network parameter.

    `param` must expose get_value() / set_value(); `pretrained` is an array
    with the same number of elements as the current value. The pretrained
    array is reshaped to the parameter's shape and cast to float32.
    `name` is only used in the error message.

    Raises an Exception when the element counts differ.
    """
    current = param.get_value()
    found, expected = pretrained.size, current.size
    if found != expected:
        message = "Size mismatch for parameter %s. Expected %i, found %i." % (
            name, expected, found
        )
        raise Exception(message)
    reshaped = np.reshape(pretrained, current.shape)
    param.set_value(reshaped.astype(np.float32))


def shared(shape, name):
    """
    Build a theano shared variable holding a freshly initialized array.

    One-dimensional shapes (biases) start at zero. Any other shape is
    filled with uniform samples from [-1, 1) scaled by
    sqrt(6 / sum(shape)). The array is cast to theano.config.floatX.
    """
    is_bias = len(shape) == 1
    if is_bias:
        initial = np.zeros(shape)
    else:
        scale = np.sqrt(6. / (np.sum(shape)))
        initial = scale * np.random.uniform(low=-1.0, high=1.0, size=shape)
    typed = initial.astype(theano.config.floatX)
    return theano.shared(value=typed, name=name)


def create_dico(item_list):
    """
    Count item occurrences over a list of lists of items.

    Returns a dict mapping each item to the number of times it appears.
    `item_list` must be a list (checked with an assertion).
    """
    assert type(item_list) is list
    counts = {}
    for group in item_list:
        for entry in group:
            counts[entry] = counts.get(entry, 0) + 1
    return counts


def create_mapping(dico):
    """
    Build the two lookup tables (item -> ID and ID -> item) for a
    frequency dictionary.
    IDs start at 0 and follow decreasing frequency; items with the same
    frequency are ordered by the items themselves.
    """
    ranked = sorted(dico.items(), key=lambda pair: (-pair[1], pair[0]))
    ordered_items = [pair[0] for pair in ranked]
    id_to_item = dict(enumerate(ordered_items))
    item_to_id = {item: index for index, item in enumerate(ordered_items)}
    return item_to_id, id_to_item


def zero_digits(s):
    """
    Replace every digit in a string by a zero.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    return re.sub(r'\d', '0', s)


def iob2(tags):
    """
    Validate IOB tags and convert IOB1 to IOB2.
    The list is modified in place: an 'I-' tag that opens an entity (first
    position, after an 'O', or after a tag of another type) becomes 'B-'.
    Returns False as soon as a malformed tag is met (tags before it may
    already have been rewritten), True otherwise.
    """
    for position, tag in enumerate(tags):
        if tag == 'O':
            continue
        parts = tag.split('-')
        well_formed = len(parts) == 2 and parts[0] in ('I', 'B')
        if not well_formed:
            return False
        if parts[0] == 'B':
            continue
        # From here on the tag is an 'I-' tag.
        previous = tags[position - 1] if position > 0 else 'O'
        continues_entity = previous != 'O' and previous[1:] == tag[1:]
        if not continues_entity:
            # conversion IOB1 to IOB2
            tags[position] = 'B' + tag[1:]
    return True


def iob_iobes(tags):
    """
    IOB -> IOBES

    Returns a new list; `tags` is left untouched. A 'B-' tag not followed
    by an 'I-' tag becomes 'S-', and an 'I-' tag not followed by another
    'I-' tag becomes 'E-'. Only the leading prefix is rewritten, so entity
    types that themselves contain "B-" or "I-" are preserved.

    Raises an Exception for a tag that is neither 'O' nor prefixed by
    'B' or 'I'.
    """
    new_tags = []
    n_tags = len(tags)
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
            continue
        prefix = tag.split('-')[0]
        if prefix not in ('B', 'I'):
            raise Exception('Invalid IOB format!')
        # The entity continues only if the next tag is an 'I-' tag.
        continues = i + 1 < n_tags and tags[i + 1].split('-')[0] == 'I'
        if continues:
            new_tags.append(tag)
        elif prefix == 'B':
            # count=1: rewrite the prefix only, never the entity type.
            new_tags.append(tag.replace('B-', 'S-', 1))
        else:
            new_tags.append(tag.replace('I-', 'E-', 1))
    return new_tags


def iobes_iob(tags):
    """
    IOBES -> IOB

    Returns a new list in which 'S-' prefixes become 'B-' and 'E-'
    prefixes become 'I-'; tags prefixed by 'B', 'I' or 'O' are copied
    unchanged. Only the leading prefix is rewritten, so entity types that
    themselves contain "S-" or "E-" are preserved.

    Raises an Exception for any other prefix.
    """
    new_tags = []
    for tag in tags:
        prefix = tag.split('-')[0]
        if prefix in ('B', 'I', 'O'):
            new_tags.append(tag)
        elif prefix == 'S':
            # count=1: rewrite the prefix only, never the entity type.
            new_tags.append(tag.replace('S-', 'B-', 1))
        elif prefix == 'E':
            new_tags.append(tag.replace('E-', 'I-', 1))
        else:
            raise Exception('Invalid format!')
    return new_tags


def insert_singletons(words, singletons, p=0.5):
    """
    Randomly swap singleton words for the unknown word (ID 0).
    Each word found in `singletons` is replaced with probability p; a
    random number is drawn only for those words. Returns a new list.
    """
    def _maybe_unknown(word):
        if word not in singletons:
            return word
        return 0 if np.random.uniform() < p else word

    return [_maybe_unknown(word) for word in words]


def pad_word_chars(words):
    """
    Right-pad the character lists of a sentence's words with zeros so that
    they all share the length of the longest word.
    Input:
        - list of lists of ints (list of words, a word being a list of char indexes)
    Output:
        - padded list of lists of ints
        - padded list of lists of ints (chars reversed, padding still at the end)
        - list of ints: index of the last character of each word (length - 1)
    """
    lengths = [len(word) for word in words]
    max_length = max(lengths)
    pads = [[0] * (max_length - length) for length in lengths]
    char_for = [word + pad for word, pad in zip(words, pads)]
    char_rev = [word[::-1] + pad for word, pad in zip(words, pads)]
    char_pos = [length - 1 for length in lengths]
    return char_for, char_rev, char_pos


def create_input(data, parameters, add_label, singletons=None):
    """
    Assemble the list of inputs for the training or evaluation function
    from one sentence.

    Depending on `parameters`, the list holds in this order: the word IDs
    ('word_dim'), the padded characters, optionally the reversed padded
    characters ('char_bidirect') and the last-character positions
    ('char_dim'), the capitalization features ('cap_dim') and, when
    `add_label` is true, the tags. When `singletons` is given, words are
    first passed through insert_singletons.
    """
    words, chars = data['words'], data['chars']
    if singletons is not None:
        words = insert_singletons(words, singletons)
    use_caps = parameters['cap_dim']
    caps = data['caps'] if use_caps else None
    char_for, char_rev, char_pos = pad_word_chars(chars)

    features = []
    if parameters['word_dim']:
        features.append(words)
    if parameters['char_dim']:
        char_features = [char_for]
        if parameters['char_bidirect']:
            char_features.append(char_rev)
        char_features.append(char_pos)
        features.extend(char_features)
    if use_caps:
        features.append(caps)
    if add_label:
        features.append(data['tags'])
    return features


def evaluate(parameters, f_eval, raw_sentences, parsed_sentences,
             id_to_tag, dictionary_tags, eval_id):
    """
    Evaluate current model using CoNLL script.

    For every sentence the predicted and gold tags are appended to the
    columns of the raw sentence (minus its last column) and written to
    "eval.<eval_id>.output" in `eval_temp`. `eval_script` is then run
    through the shell with that file on stdin and its output redirected
    to "eval.<eval_id>.scores". The script output, a confusion matrix and
    the global tag accuracy are printed.

    Arguments:
        - parameters: mapping read for 'crf' and 'tag_scheme' (and by
          create_input)
        - f_eval: callable returning the predictions for one input
        - raw_sentences / parsed_sentences: aligned sequences of sentences
        - id_to_tag: mapping from tag ID to tag string
        - dictionary_tags: unused, kept for interface compatibility
        - eval_id: integer used to name the temporary files

    Returns the last field of the second line of the script output as a
    float (F1 on all entities).

    Raises IndexError when the script produced fewer than two lines of
    output, which usually means it could not be run. The temporary files
    are left on disk.
    """
    n_tags = len(id_to_tag)
    predictions = []
    count = np.zeros((n_tags, n_tags), dtype=np.int32)

    for raw_sentence, data in zip(raw_sentences, parsed_sentences):
        inputs = create_input(data, parameters, False)
        if parameters['crf']:
            # First and last predictions are dropped; presumably the CRF's
            # begin/end-of-sentence positions -- confirm against the model.
            y_preds = np.array(f_eval(*inputs))[1:-1]
        else:
            y_preds = f_eval(*inputs).argmax(axis=1)
        y_reals = np.array(data['tags']).astype(np.int32)
        assert len(y_preds) == len(y_reals)
        p_tags = [id_to_tag[y_pred] for y_pred in y_preds]
        r_tags = [id_to_tag[y_real] for y_real in y_reals]
        if parameters['tag_scheme'] == 'iobes':
            p_tags = iobes_iob(p_tags)
            r_tags = iobes_iob(r_tags)
        for i, (y_pred, y_real) in enumerate(zip(y_preds, y_reals)):
            new_line = " ".join(raw_sentence[i][:-1] + [r_tags[i], p_tags[i]])
            predictions.append(new_line)
            count[y_real, y_pred] += 1
        # Blank line between sentences.
        predictions.append("")

    # Write predictions to disk and run CoNLL script externally
    output_path = os.path.join(eval_temp, "eval.%i.output" % eval_id)
    scores_path = os.path.join(eval_temp, "eval.%i.scores" % eval_id)
    with codecs.open(output_path, 'w', 'utf8') as f:
        f.write("\n".join(predictions))
    # NOTE(review): paths are interpolated unquoted into a shell command, so
    # paths containing spaces or shell metacharacters will break it.
    command = "%s < %s > %s" % (eval_script, output_path, scores_path)
    status = os.system(command)

    # CoNLL evaluation results
    with codecs.open(scores_path, 'r', 'utf8') as f:
        eval_lines = [l.rstrip() for l in f]
    for line in eval_lines:
        print(line)

    # Confusion matrix with accuracy for each tag
    row_format = "{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)
    print(row_format.format(
        "ID", "NE", "Total",
        *([id_to_tag[i] for i in range(n_tags)] + ["Percent"])
    ))
    for i in range(n_tags):
        print(row_format.format(
            str(i), id_to_tag[i], str(count[i].sum()),
            *([count[i][j] for j in range(n_tags)] +
              ["%.3f" % (count[i][i] * 100. / max(1, count[i].sum()))])
        ))

    # Global accuracy
    print("%i/%i (%.5f%%)" % (
        count.trace(), count.sum(), 100. * count.trace() / max(1, count.sum())
    ))

    # The redirection creates the scores file even when the script cannot
    # be run, so a failed run only shows up as missing output lines.
    if len(eval_lines) < 2:
        raise IndexError(
            "CoNLL evaluation produced no scores: command %r exited with "
            "status %s, see %s" % (command, status, scores_path)
        )

    # F1 on all entities
    return float(eval_lines[1].strip().split()[-1])

When I run the code as it is, I always get the error below. I think it is either because of the restriction on path length in Windows or because the path needs different slashes. I don't know what to add or remove in order to resolve the problem.

run train.py --train lstm/fold1/train --dev lstm/fold1/dev --test lstm/fold1/test
WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10). Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GT 620M (CNMeM is enabled with initial size: 85.0% of memory, cuDNN not available)
Traceback (most recent call last):

File "E:\New-Code\tagger-master\tagger-master\train.py", line 135, in <module>
model = Model(parameters=parameters, models_path=models_path)

File "model.py", line 36, in __init__
os.makedirs(self.model_path)

File "C:\Users\Acer\Anaconda2\envs\env_name27\lib\os.py", line 157, in makedirs
mkdir(name, mode)

WindowsError: [Error 3] The system cannot find the path specified: './models\tag_scheme=iob,lower=False,zeros=False,char_dim=25,char_lstm_dim=25,char_bidirect=True,word_dim=100,word_lstm_dim=100,word_bidirect=True,pre_emb=,all_emb=False,cap_dim=0,crf=True,dropout=0.3,lr_method=sgd-lr_.005'
talonmies
  • 70,661
  • 34
  • 192
  • 269
  • This "tag_scheme..." directory name is unusually long at 212 characters. The maximum DOS path length is 260 characters, and you're probably exceeding that relative to ".\models". – Eryk Sun Oct 07 '17 at 15:23
  • You can try using `models_path = u"\\\\?\\" + os.path.abspath(u".\\models")`. The "\\?\" prefix supports paths up to about 32,760 characters, but the path must be Unicode, fully-qualified, and use backslash instead of forward slash. If the library doesn't handle Unicode paths properly, this might just lead to another error. Code written for Python 2 tends to be naive and fail in this regard. Python 3 is generally better. In this case Python 3.6 in Windows 10 would be *a lot* better since it's manifested to allow long paths without requiring the "\\?\" prefix. – Eryk Sun Oct 07 '17 at 15:29
  • Thanks that solved an other error in the same file by setting eval_path = u"\\\\?\\" + os.path.abspath(u".\\evaluation") – Rabia Noureen Oct 08 '17 at 16:22
  • @eryksun According to the comments in https://stackoverflow.com/questions/46688911/python-named-entity-recognition-error-indexerror-list-index-out-of-range?noredirect=1#comment80343841_46688911 there are still some path related issues with the variable. Can you please look into it again? – Rabia Noureen Oct 11 '17 at 23:29

1 Answers1

0

In Windows, a path is written with a backslash \ instead of the forward slash / that is used in Linux/Unix.

Try it like below if the files are one folder back:

# Raw strings keep the backslashes literal instead of relying on "\m" and
# "\e" not being recognised escape sequences.
models_path = r"..\models"
eval_path = r"..\evaluation"
Astik Anand
  • 12,757
  • 9
  • 41
  • 51
  • Thanks that solved the error but i got an other error in an other file of the same project.Exception: CoNLL evaluation script not found at "..\evaluation\conlleval" Should i create a different issue for that? – Rabia Noureen Oct 08 '17 at 15:31
  • @RabiaNoureen, Yea you can create a new issue for that. If this solution helped, you can select the answer. Thanks for all the cooperation. – Astik Anand Oct 08 '17 at 15:38
  • I replaced the 2 lines models_path = u"\\\\?\\" + os.path.abspath(u".\\models") eval_path = u"\\\\?\\" + os.path.abspath(u".\\evaluation") and run the script now i got File "E:\New-Code\tagger-master\tagger-master\train.py", line 221, in dev_data, id_to_tag, dico_tags, epoch) File "utils.py", line 284, in evaluate return float(eval_lines[1].strip().split()[-1]) IndexError: list index out of range – Rabia Noureen Oct 08 '17 at 16:29
  • i shall be thankful if you can help me solve the issue as i am stuck for the past 1.5 months with this script. – Rabia Noureen Oct 08 '17 at 16:33
  • @RabiaNoureen, I will surely help you out, but then I will need an access to your code. Can we connect through Teamviewer, so that I have access to your code? – Astik Anand Oct 08 '17 at 16:35
  • I am actually trying to train a model using this script. The code and dataset are available at https://github.com/detuvoldo/tagger. Thanks – Rabia Noureen Oct 08 '17 at 16:38
  • The Windows file API does accept "/" as a path separator, except not for `u"\\\\?\\"` paths. Using "/" is not the problem for regular file paths. The problem is the path is too long for a regular DOS path. Moving back a directory solves the immediate error, but it's not a good long-term solution considering you're working with a library that creates directories with 200+ character names. You're right on the edge of the 260 character limit for DOS paths. – Eryk Sun Oct 08 '17 at 16:39
  • @eryksun So what do you suggest? Actually i am using someone else project and i am new to python so dont know the inner logics. – Rabia Noureen Oct 08 '17 at 16:45
  • @eryksun, so what are the alternatives available? – Astik Anand Oct 08 '17 at 16:46
  • Adding the `u"\\\\?\\"` prefix to `models_path` and `eval_path` looks right to me, assuming it's on a DOS drive letter and not a UNC path. But it's kind of a Windows oddity that may cause problems with programs do something unusual with the path, and the Unicode requirement can cause problems in Python 2. But the `IndexError` doesn't strongly indicate either and could actually be an unrelated problem. – Eryk Sun Oct 08 '17 at 16:53
  • The script runs well and prints the accuracy except for the last line return float(eval_lines[1].strip().split()[-1]) where it produces the index error. – Rabia Noureen Oct 08 '17 at 16:57
  • index error is not the problem here that's understood coz list is going out of bound. – Astik Anand Oct 08 '17 at 16:58
  • @ Astik Anand so what should be done to avoid the error? Have you checked the code on github? – Rabia Noureen Oct 08 '17 at 17:06
  • @RabiaNoureen, I am a bit busy now, but I will surely go through it and let you know. – Astik Anand Oct 08 '17 at 17:08
  • @ Astik Anand thanks no issues i will wait for your response. – Rabia Noureen Oct 08 '17 at 17:09
  • @Astik Anand waiting for your response, please have a look at that code when ever you are free.Thanks – Rabia Noureen Oct 10 '17 at 19:42
  • @Astik Anand waiting for your response, please have a look at that code when ever you are free.Thanks – Rabia Noureen Oct 10 '17 at 19:42