I'm using this code to build a new model from a pretrained ViT model, and I want to understand the preprocessing of the images.
Here is the code:
!pip install datasets transformers
from datasets import load_dataset
ds = load_dataset('beans', verification_mode= "no_checks")
labels = ds['train'].features['labels'].names
from transformers import ViTFeatureExtractor
model_name_or_path = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name_or_path)
def process_example(example):
inputs = feature_extractor(example['image'], return_tensors='pt')
inputs['labels'] = example['labels']
return inputs
process_example(ds['train'][0])
And here is the output:
{'pixel_values': tensor([[[[-0.5686, -0.5686, -0.5608, ..., -0.0275, 0.1922, -0.2549],
[-0.6078, -0.6000, -0.5843, ..., -0.0353, -0.0196, -0.2706],
[-0.6314, -0.6314, -0.6157, ..., -0.2392, -0.3647, -0.2314],
...,
[-0.5373, -0.5529, -0.5765, ..., -0.0745, -0.0431, -0.0980],
[-0.5608, -0.5765, -0.5843, ..., 0.3176, 0.1608, 0.1294],
[-0.5843, -0.5922, -0.6078, ..., 0.2784, 0.1451, 0.2000]],
[[-0.7098, -0.7098, -0.7490, ..., -0.3725, -0.1608, -0.6000],
[-0.7333, -0.7333, -0.7569, ..., -0.3569, -0.3176, -0.5608],
[-0.7490, -0.7490, -0.7647, ..., -0.5373, -0.6627, -0.5373],
...,
[-0.7725, -0.7882, -0.8196, ..., -0.2314, -0.0353, 0.0824],
[-0.7961, -0.8118, -0.8118, ..., 0.1843, 0.3176, 0.3725],
[-0.8196, -0.8196, -0.8275, ..., 0.0745, 0.2863, 0.3961]],
[[-0.9922, -0.9922, -1.0000, ..., -0.5451, -0.3647, -0.7333],
[-0.9922, -0.9922, -1.0000, ..., -0.5686, -0.5451, -0.7176],
[-0.9843, -0.9922, -1.0000, ..., -0.6549, -0.7490, -0.6314],
...,
[-0.8431, -0.8588, -0.8980, ..., -0.5765, -0.5608, -0.5529],
[-0.8588, -0.8902, -0.9137, ..., -0.2078, -0.2549, -0.2706],
[-0.8824, -0.9059, -0.9294, ..., -0.2627, -0.1922, -0.1216]]]]), 'labels': 0}
This is the code after training the model and saving it:
# Load the fine-tuned ViT classifier, converting the saved PyTorch weights
# to TensorFlow (from_pt=True is a model-weights argument).
model = TFAutoModelForImageClassification.from_pretrained(vit_model_path, from_pt=True)

# Load the feature extractor saved alongside the model so inference applies
# the exact same preprocessing (resize + normalization) as training.
# NOTE(review): from_pt is not a feature-extractor argument — it applies only
# to model weight conversion — so it was dropped here.
feature_extractor = ViTFeatureExtractor.from_pretrained(vit_model_path)

# No manual .resize((224, 224)) is needed: the feature extractor resizes to
# the model's input resolution itself, guaranteeing preprocessing identical
# to what was used at training time.
img = Image.open('./angular_leaf_spot.jpeg')

# return_tensors='tf' already yields TensorFlow tensors that Keras
# Model.predict accepts in a dict, so the per-tensor .numpy() round-trip
# was unnecessary and has been removed.
inputs = feature_extractor(images=img, return_tensors='tf')
predictions = model.predict(dict(inputs))
print(predictions)
So the question is: do I need to prepare the input like in this code every time I want to predict?