I have an image dataset of 2432 images, each labeled with one of 3 categories. The labels are stored in a CSV file with the image id and the label (T1). The class distribution is:
negative: 1695
positive: 648
neutral: 89
I'm trying to oversample the positive and neutral images, but so far I haven't had any real success.
Using imblearn's SMOTE and RandomOverSampler, I get exit code 137 (interrupted by signal 9: SIGKILL), which I believe is caused by loading all the images into memory at the same time.
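A rough size check seems to back that up: after image / 255.0 each image is float64, so the full array alone is about 5 GiB before SMOTE even copies the flattened version:

# Back-of-envelope memory estimate for the fully loaded array
# (float64, since dividing a uint8 image by 255.0 upcasts it)
n_images = 2432
bytes_per_image = 300 * 300 * 3 * 8                       # H * W * C * sizeof(float64)
print(f"{n_images * bytes_per_image / 1024**3:.1f} GiB")  # ~4.9 GiB, before any SMOTE copies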
The code for the SMOTE attempt:
import os

import cv2
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

(desired_width, desired_height) = (300, 300)

# Step 1: Load CSV file containing filenames and labels
data = train_dataset

# Step 2: Load images from a folder and create corresponding labels
image_folder = train_images_location
image_filenames = data['id'].values
labels = data['T1'].values

# Step 3: Load and preprocess images (replace this with your own image loading/preprocessing logic)
X = []
for filename in image_filenames:
    image_path = os.path.join(image_folder, filename)
    image = cv2.imread(image_path)                              # Load the image using OpenCV
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)              # Convert BGR to RGB
    image = cv2.resize(image, (desired_width, desired_height))  # Resize to the desired dimensions
    image = image / 255.0                                       # Normalize pixel values to [0, 1]
    X.append(image)

# Convert the image list to a NumPy array
X = np.array(X)

# Step 4: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Step 5: Apply a resampling technique to balance the training set
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train.reshape(len(X_train), -1), y_train)
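One idea I'm considering instead of SMOTE on raw pixels (a sketch, untested, assuming the same train_dataset dataframe as above): apply RandomOverSampler to the filename column rather than the decoded images. It only duplicates rows, so nothing heavy has to sit in memory, and the balanced dataframe can then be fed to a generator:

# Sketch (untested): oversample the *filenames*, not the pixels.
# RandomOverSampler just duplicates rows, so ids are enough; the images
# are only decoded later, batch by batch, by the generator.
import pandas as pd
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
ids_resampled, labels_resampled = ros.fit_resample(
    train_dataset[['id']],   # 2-D, as fit_resample expects
    train_dataset['T1'],
)

balanced_df = pd.DataFrame({'id': ids_resampled['id'], 'T1': labels_resampled})
# balanced_df could then be passed as dataframe= to flow_from_dataframe below.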
I have also tried ImageDataGenerator.flow_from_dataframe, and with this method I'm able to get results, but they are always overfit (1.0 training accuracy vs. 0.7 in testing):
import pandas as pd
from tensorflow.keras import losses
from tensorflow.keras.applications import VGG19, InceptionV3, ResNet50, EfficientNetV2S
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator

train_data_gen = ImageDataGenerator(rescale=1./255)
test_data_gen = ImageDataGenerator(rescale=1./255)

# Specify the image size that the pre-trained models will use.
target_image_size = (300, 300)

train_generator = train_data_gen.flow_from_dataframe(
    dataframe=train_dataset,
    directory=train_images_location,
    x_col="id",
    y_col="T1",
    target_size=target_image_size,
    batch_size=32,
    class_mode="categorical",
)
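Since the runs overfit, one thing I plan to test is augmenting the training generator; the only transform I use so far is the rescale. A sketch of what that could look like (the values below are illustrative placeholders, untuned):

# Sketch: light augmentation on the training generator only
# (this would replace the plain rescale-only train_data_gen above).
train_data_gen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
)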
models_input_shape = (300, 300, 3)

models_to_try = {
    'vgg19': VGG19(weights='imagenet', include_top=False, input_shape=models_input_shape),
    'inceptionv3': InceptionV3(weights='imagenet', include_top=False, input_shape=models_input_shape),
    'resnet50': ResNet50(weights='imagenet', include_top=False, input_shape=models_input_shape),
    'efficientnetv2': EfficientNetV2S(weights='imagenet', include_top=False, input_shape=models_input_shape),
}
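One thing I'm not sure about: as far as I know, each of these backbones expects its own input preprocessing (and Keras' EfficientNetV2 rescales internally from raw [0, 255]), so the flat rescale=1./255 above may not match all of them. A sketch of per-model preprocessing, in case that matters:

# Sketch: per-backbone preprocessing instead of a single rescale=1./255.
from tensorflow.keras.applications import vgg19, inception_v3, resnet50, efficientnet_v2

preprocessors = {
    'vgg19': vgg19.preprocess_input,                      # caffe-style: RGB->BGR, mean subtraction
    'inceptionv3': inception_v3.preprocess_input,         # scales pixels to [-1, 1]
    'resnet50': resnet50.preprocess_input,                # caffe-style: RGB->BGR, mean subtraction
    'efficientnetv2': efficientnet_v2.preprocess_input,   # pass-through; the model rescales internally
}

# e.g. ImageDataGenerator(preprocessing_function=preprocessors[model_name])
# (dropping rescale=1./255 so inputs stay in [0, 255] for preprocess_input)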
num_classes = len(pd.unique(train_dataset['T1']))

for model_name in models_to_try.keys():
    print(model_name, '\n')

    base_model = models_to_try[model_name]
    for layer in base_model.layers:
        layer.trainable = False

    model = Sequential()
    model.add(base_model)
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(
        loss=losses.CategoricalCrossentropy(),
        optimizer=Adam(learning_rate=0.0001),
        metrics=['accuracy'],
    )

    epochs = 20
    steps_per_epoch = train_generator.n // train_generator.batch_size
    history = model.fit(
        train_generator,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
    )
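One alternative I've read about is passing class_weight to model.fit instead of oversampling at all; a sketch (untested) of how that would look with the generator above:

# Sketch (untested): class weights instead of oversampling.
# flow_from_dataframe exposes the integer-encoded labels as .classes.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_generator.classes),
    y=train_generator.classes,
)
class_weight_dict = dict(enumerate(weights))

history = model.fit(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    class_weight=class_weight_dict,
)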
If there's any alternative or idea you can throw my way, I would appreciate it greatly. Thanks.