
I'm trying to come up with a way to use Keras Tuner to automatically identify the best hyperparameters for my CNN. I am using the CelebA dataset.

I tried a similar project using fashion_mnist, and that worked perfectly, but my Python experience isn't enough to do what I want to achieve here. With fashion_mnist I managed to create this table of results:

My code is here.

Table of results

I am hoping to produce a similar table using the CelebA dataset. This is for a college report; in the report, my college used AWS Rekognition to produce the table below.

AWS Rekognition Results

I am hoping to train a model, save it (e.g. to a pickle), and generate a similar table of results so I can compare the two.

Any recommendations on how to approach this? My queries at the moment are:

  1. How to load the dataset correctly?
  2. How can I train the model to report accuracy on "Moustache", "Beard" and "Emotion" (like in the table of results above)?

I tried loading the data using:

(x_train, y_train), (x_test, y_test) = tfds.load('celeb_a')

but this gives me the following error:

AttributeError: Failed to construct dataset celeb_a: module 'tensorflow_datasets.core.utils' has no attribute 'version'

I am using:

Conda: TensorFlow (Python 3.8.5)
Windows 10 Pro
Intel(R) Core(TM) i3-4170 CPU @ 3.7GHz
64-bit

This is the script I am starting with (the same as the one in my Bitbucket). Any help would be appreciated. Thank you in advance.

# -*- coding: utf-8 -*-
import tensorflow_datasets as tfds
#from tensorflow.keras.datasets import fashion_mnist
#import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Activation

from kerastuner.tuners import RandomSearch
#from kerastuner.engine.hyperparameters import HyperParameter
import time
import os

LOG_DIR = f"{int(time.time())}"


(x_train, y_train), (x_test, y_test) = tfds.load('celeb_a')

x_train = x_train.reshape(-1,28,28,1)
x_test = x_test.reshape(-1,28,28,1)

def build_model(hp):   # RandomSearch passes a HyperParameters object as hp
    model = keras.models.Sequential()
    
    
    #model.add(Conv2D(32, (3, 3), input_shape=x_train.shape[1:]))
    model.add(Conv2D(hp.Int("input_units", min_value=32, max_value=256, step=32), (3,3), input_shape = x_train.shape[1:]))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    
    
    for i in range(hp.Int("n_layers",min_value = 1, max_value = 4, step=1)):
        #model.add(Conv2D(32, (3, 3)))                
        model.add(Conv2D(hp.Int(f"conv_{i}_units", min_value=32, max_value=256, step=32), (3,3)))
        model.add(Activation('relu'))
        #model.add(MaxPooling2D(pool_size=(2, 2)))
    
    model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
    
    model.add(Dense(10))
    model.add(Activation("softmax"))
    
    model.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])
    return model

tuner = RandomSearch(build_model,
                     objective = "val_accuracy",
                     max_trials = 1,
                     executions_per_trial=1, #BEST PERFORMANCE SET TO 3+
                     directory= os.path.normpath('C:/'),# there is a limit of characters keep path short
                     overwrite=True #need this to override model when testing
                     )

tuner.search(x=x_train,
             y=y_train, 
             epochs=1,
             batch_size=64,
             validation_data=(x_test,y_test),)

1 Answer


I managed to do this by creating a function to collect all annotations like this:

import os

import numpy as np
import pandas as pd


def get_annotation(fnmtxt, verbose=True):
    if verbose:
        print("_" * 70)
        print(fnmtxt)

    rfile = open(fnmtxt, 'r')
    texts = rfile.read().split("\n")
    rfile.close()

    # the second line of list_attr_celeba.txt holds the attribute names
    columns = np.array(texts[1].split(" "))
    columns = columns[columns != ""]

    df = []
    for txt in texts[2:]:
        txt = np.array(txt.split(" "))
        txt = txt[txt != ""]
        df.append(txt)

    df = pd.DataFrame(df)

    # the annotation rows carry an extra leading column: the image file name
    if df.shape[1] == len(columns) + 1:
        columns = ["image_id"] + list(columns)
    df.columns = columns
    df = df.dropna()

    if verbose:
        print(" Total number of annotations {}\n".format(df.shape))
        print(df.head())

    # cast the attribute columns (-1/1 strings) to numeric
    for nm in df.columns:
        if nm != "image_id":
            df[nm] = pd.to_numeric(df[nm], downcast="float")
    return df
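
For reference, a minimal usage sketch of the function above. The data/ path is an assumption about where the CelebA annotation file was extracted, and the attribute names are the standard CelebA ones (note that the dataset itself spells it "Mustache"):

# hypothetical path; adjust to wherever list_attr_celeba.txt lives
attributes = get_annotation('data/list_attr_celeba.txt', verbose=False)
print(attributes[['image_id', 'Mustache', 'No_Beard', 'Smiling']].head())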

I also created a class to do the rest like this:

class CelebA():
    '''Wraps the CelebA dataset, allowing an easy way to:
         - select the features of interest,
         - split the dataset into 'training', 'validation' or 'test' partitions.
    '''

    def __init__(self, main_folder='data/', selected_features=None, drop_features=[]):
        self.main_folder = main_folder
        self.images_folder = os.path.join(main_folder, 'img_align_celeba/')
        self.attributes_path = os.path.join(main_folder, 'list_attr_celeba.txt')
        self.partition_path = os.path.join(main_folder, 'list_eval_partition.txt')
        self.selected_features = selected_features
        self.features_name = []
        self.__prepare(drop_features)

    def __prepare(self, drop_features):
        '''Do some preprocessing before using the data, e.g. feature selection.'''
        # attributes:
        if self.selected_features is None:
            self.attributes = get_annotation(self.attributes_path)
            self.num_features = 40
        else:
            self.num_features = len(self.selected_features)
            self.selected_features = self.selected_features.copy()
            self.selected_features.append('image_id')
            self.attributes = get_annotation(self.attributes_path)[self.selected_features]

        # remove unwanted features:
        for feature in drop_features:
            if feature in self.attributes:
                self.attributes = self.attributes.drop(feature, axis=1)
                self.num_features -= 1

        self.attributes.set_index('image_id', inplace=True)
        self.attributes.replace(to_replace=-1, value=0, inplace=True)
        self.attributes['image_id'] = list(self.attributes.index)
        # self.attributes.drop(self.attributes.columns[-1], axis=1, inplace=True)

        self.features_name = list(self.attributes.columns)[:-1]

        # load the official partitioning (0 = training, 1 = validation, 2 = test):
        self.partition = pd.read_csv(self.partition_path, sep=" ")
        self.partition.set_index('image_id', inplace=True)

    def split(self, name='0', drop_zero=False):
        '''Returns the [0 'training', 1 'validation', 2 'test'] split of the dataset.'''
        # select partition split:
        if name == '0':      # training
            to_drop = self.partition.where(lambda x: x != 0).dropna()
        elif name == '1':    # validation
            to_drop = self.partition.where(lambda x: x != 1).dropna()
        elif name == '2':    # test
            to_drop = self.partition.where(lambda x: x != 2).dropna()
        else:
            raise ValueError('CelebA.split() => `name` must be one of [0-training, 1-validation, 2-test]')

        partition = self.partition.drop(index=to_drop.index)

        # join attributes with the selected partition:
        joint = partition.join(self.attributes, how='inner').drop('partition', axis=1)

        if drop_zero is True:
            # keep only the rows with at least one positive attribute
            return joint.loc[(joint[self.features_name] == 1).any(axis=1)]
        elif 0 <= drop_zero <= 1:
            # drop a fraction `drop_zero` of the rows whose attributes are all zero
            zero = joint.loc[(joint[self.features_name] == 0).all(axis=1)]
            zero = zero.sample(frac=drop_zero)
            return joint.drop(index=zero.index)

        return joint
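
Finally, a rough sketch of how the class could feed a Keras model, assuming the images sit in data/img_align_celeba/ and using "Mustache", "No_Beard" and "Smiling" as stand-ins for the "Moustache", "Beard" and "Emotion" columns of the report (the paths, attribute choice and image size here are illustrative, not part of the class above):

from tensorflow.keras.preprocessing.image import ImageDataGenerator

celeba = CelebA(main_folder='data/',
                selected_features=['Mustache', 'No_Beard', 'Smiling'])

train_df = celeba.split('0')   # training partition
val_df = celeba.split('1')     # validation partition

datagen = ImageDataGenerator(rescale=1. / 255)
train_gen = datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=celeba.images_folder,
    x_col='image_id',              # filename column produced by the class
    y_col=celeba.features_name,    # the selected 0/1 attribute columns
    class_mode='raw',              # yield the raw attribute vector as the label
    target_size=(128, 128),
    batch_size=64)
val_gen = datagen.flow_from_dataframe(
    dataframe=val_df,
    directory=celeba.images_folder,
    x_col='image_id',
    y_col=celeba.features_name,
    class_mode='raw',
    target_size=(128, 128),
    batch_size=64)

With multi-label 0/1 targets like these, the last Dense layer in build_model needs one sigmoid unit per attribute and binary_crossentropy instead of the 10-way softmax and sparse_categorical_crossentropy above; the per-attribute accuracy for the comparison table can then be computed by comparing model.predict() output against each attribute column.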