
I'm using this code to build a new model from a pretrained ViT model, and I want to understand the preprocessing of the images.

Here is the code:

!pip install datasets transformers

from datasets import load_dataset
from transformers import ViTFeatureExtractor

ds = load_dataset('beans', verification_mode="no_checks")
labels = ds['train'].features['labels'].names

model_name_or_path = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name_or_path)

def process_example(example):
    # Resize + normalize a single image and attach its label
    inputs = feature_extractor(example['image'], return_tensors='pt')
    inputs['labels'] = example['labels']
    return inputs

process_example(ds['train'][0])
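For training, I assume the same preprocessing can be applied lazily to the whole dataset with `with_transform` instead of calling `process_example` one example at a time. A sketch of a batch-wise version of `process_example` (the `transform` name is mine):

def transform(example_batch):
    # The feature extractor also accepts a list of PIL images and
    # returns batched pixel_values of shape (batch_size, 3, 224, 224)
    inputs = feature_extractor([x for x in example_batch['image']], return_tensors='pt')
    inputs['labels'] = example_batch['labels']
    return inputs

prepared_ds = ds.with_transform(transform)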

And here is the output:

{'pixel_values': tensor([[[[-0.5686, -0.5686, -0.5608,  ..., -0.0275,  0.1922, -0.2549],
          [-0.6078, -0.6000, -0.5843,  ..., -0.0353, -0.0196, -0.2706],
          [-0.6314, -0.6314, -0.6157,  ..., -0.2392, -0.3647, -0.2314],
          ...,
          [-0.5373, -0.5529, -0.5765,  ..., -0.0745, -0.0431, -0.0980],
          [-0.5608, -0.5765, -0.5843,  ...,  0.3176,  0.1608,  0.1294],
          [-0.5843, -0.5922, -0.6078,  ...,  0.2784,  0.1451,  0.2000]],

         [[-0.7098, -0.7098, -0.7490,  ..., -0.3725, -0.1608, -0.6000],
          [-0.7333, -0.7333, -0.7569,  ..., -0.3569, -0.3176, -0.5608],
          [-0.7490, -0.7490, -0.7647,  ..., -0.5373, -0.6627, -0.5373],
          ...,
          [-0.7725, -0.7882, -0.8196,  ..., -0.2314, -0.0353,  0.0824],
          [-0.7961, -0.8118, -0.8118,  ...,  0.1843,  0.3176,  0.3725],
          [-0.8196, -0.8196, -0.8275,  ...,  0.0745,  0.2863,  0.3961]],

         [[-0.9922, -0.9922, -1.0000,  ..., -0.5451, -0.3647, -0.7333],
          [-0.9922, -0.9922, -1.0000,  ..., -0.5686, -0.5451, -0.7176],
          [-0.9843, -0.9922, -1.0000,  ..., -0.6549, -0.7490, -0.6314],
          ...,
          [-0.8431, -0.8588, -0.8980,  ..., -0.5765, -0.5608, -0.5529],
          [-0.8588, -0.8902, -0.9137,  ..., -0.2078, -0.2549, -0.2706],
          [-0.8824, -0.9059, -0.9294,  ..., -0.2627, -0.1922, -0.1216]]]]), 'labels': 0}
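From what I can tell, these pixel_values come from three steps: resize to 224x224, rescale from [0, 255] to [0, 1] (divide by 255), then normalize with image_mean = image_std = [0.5, 0.5, 0.5], which is why the values fall roughly in [-1, 1]. A minimal sketch that reproduces this by hand, assuming the default config of google/vit-base-patch16-224-in21k (values may differ slightly depending on the resampling filter):

import numpy as np
from PIL import Image

img = ds['train'][0]['image'].resize((224, 224), resample=Image.BILINEAR)
arr = np.asarray(img).astype(np.float32) / 255.0   # rescale to [0, 1]
arr = (arr - 0.5) / 0.5                            # normalize -> roughly [-1, 1]
arr = arr.transpose(2, 0, 1)                       # HWC -> CHW, like pixel_values

print(arr.shape)      # (3, 224, 224)
print(arr[0, 0, :3])  # should be close to the first row of pixel_values above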

And this is the code I use after training and saving the model:

from PIL import Image
from transformers import TFAutoModelForImageClassification, ViTFeatureExtractor

# Load your trained ViT model (converting the PyTorch weights to TF)
model = TFAutoModelForImageClassification.from_pretrained(vit_model_path, from_pt=True)

# Load the feature extractor used during training
# (from_pt is not needed here: the preprocessing config is framework-agnostic)
feature_extractor = ViTFeatureExtractor.from_pretrained(vit_model_path)

img = Image.open('./angular_leaf_spot.jpeg').resize((224, 224))
inputs = feature_extractor(images=img, return_tensors='tf')
inputs = {k: v.numpy() for k, v in inputs.items()}
predictions = model.predict(inputs)
print(predictions)
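Right now the only way I see to avoid repeating these steps is to wrap them in a small helper. A sketch of what I mean (predict_image is just a name I made up; it assumes the label names were saved in the model config):

import tensorflow as tf
from PIL import Image

def predict_image(path):
    # The feature extractor already resizes to 224x224 and normalizes,
    # so the manual .resize((224, 224)) above should not be needed
    img = Image.open(path)
    inputs = feature_extractor(images=img, return_tensors='tf')
    outputs = model(inputs)  # forward pass, outputs.logits has shape (1, num_labels)
    pred_id = int(tf.argmax(outputs.logits, axis=-1)[0])
    return model.config.id2label.get(pred_id, pred_id)

print(predict_image('./angular_leaf_spot.jpeg'))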

So the question is: do I need to prepare the input like in this code every time I want to predict?
