3

I am fine-tuning 'microsoft/trocr-base-printed' image2text model to let it recognize the captcha text on it. I was able to find this link to try to avoid the error: ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds, but it still won't work for me. Below is my python code.

from transformers import TrOCRProcessor, VisionEncoderDecoderModel, BertTokenizer
from transformers import pipeline, default_data_collator
from datasets import load_dataset, Image as image
from datasets import Dataset, Features, Array3D
from PIL import Image
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

from glob import glob

import time
import os
import pandas as pd
import numpy as np
import pyarrow as pa
import pickle

picture_path = './captcha_100'
LIMIT = 100
directory = os.listdir(picture_path)[:LIMIT]
target = pd.read_csv('./captcha.csv').to_numpy().tolist()
data = []
picture = []

#read captcha picture from local directory
for d in directory:
    picture.append(Image.open(picture_path+'/'+d))

processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed')

#What I believe this code snippet can help me, but it's not work
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

#making my own dataset
for i in range(LIMIT):
    temp = {}
    temp['pixel_values'] = processor(picture[i]).pixel_values[0].tolist()
    temp['answer'] = target[i][0]
    data.append(temp)
    
data = pa.Table.from_pylist(data)
data = Dataset(data)   
data = data.train_test_split(train_size=0.8)

#giving training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./captcha100",
    per_device_train_batch_size=16,
    evaluation_strategy="steps",
    num_train_epochs=4,
    fp16=True,
    save_steps=100,
    eval_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    load_best_model_at_end=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["test"],
    tokenizer=processor.feature_extractor,
    data_collator=default_data_collator
)

#train
trainer.train()
model = model.to('cpu')

#save the model
with open('captchaModel100.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('processor.pkl', 'wb') as f:
    pickle.dump(processor, f)

And full traceback

Traceback (most recent call last):
  File "/home/aclab/Joywang/captcha/train_captcha.py", line 68, in <module>
    trainer.train()
  File "/home/aclab/.virtualenvs/cudatest/lib/python3.9/site-packages/transformers/trainer.py", line 1501, in train
    return inner_training_loop(
  File "/home/aclab/.virtualenvs/cudatest/lib/python3.9/site-packages/transformers/trainer.py", line 1749, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/home/aclab/.virtualenvs/cudatest/lib/python3.9/site-packages/transformers/trainer.py", line 2508, in training_step
    loss = self.compute_loss(model, inputs)
  File "/home/aclab/.virtualenvs/cudatest/lib/python3.9/site-packages/transformers/trainer.py", line 2540, in compute_loss
    outputs = model(**inputs)
  File "/home/aclab/.virtualenvs/cudatest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/aclab/.virtualenvs/cudatest/lib/python3.9/site-packages/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py", line 609, in forward
    decoder_outputs = self.decoder(
  File "/home/aclab/.virtualenvs/cudatest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/aclab/.virtualenvs/cudatest/lib/python3.9/site-packages/transformers/models/trocr/modeling_trocr.py", line 958, in forward
    outputs = self.model.decoder(
  File "/home/aclab/.virtualenvs/cudatest/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/aclab/.virtualenvs/cudatest/lib/python3.9/site-packages/transformers/models/trocr/modeling_trocr.py", line 637, in forward
    raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

Big thanks for help.

wJoyW
  • 51
  • 1
  • 5

0 Answers0