
I'm developing a machine learning program (using Python 2.7.13) and I'm using Hyperopt to process data and get the percentage of correctly processed data. I want to make the cross-validation multicore, but doing so takes more time than running it on a single core. I'm using joblib through the n_jobs attribute to make it multicore. The code is this one:

import numpy as np
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from joblib import Parallel, delayed
import time
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import arff
import os
from weka.core.converters import Loader
from sklearn import preprocessing
from weka.classifiers import Classifier, Evaluation
import weka.core.jvm as jvm
timetot=time.time()


#TAKES THE ARFF DATA FILE FROM THE DIRECTORY AND CONVERTS ITS VALUES TO NUMBERS, IN ORDER TO BE ABLE TO PROCESS THEM
########################################################
script_dir = os.getcwd()
rel_path = "data\\iris.arff"
iris_file = os.path.join(script_dir, rel_path)
dataset = arff.load(open(iris_file, 'rb'))
#dataset = arff.load(open('C:\data\iris.arff', 'rb'))
mi_y = np.array(dataset['data'])
data = np.array(mi_y[:, 0:mi_y.shape[1]-1])


datos = data
datosunicos = np.unique(datos)
datosunicos = datosunicos.tolist()
unicos_datos = list(range(len(datosunicos)))

# Replace every attribute value by the index of that value among the unique values
for j in range(len(datos[0])):
    for i in range(len(datos)):
#        print(datosunicos)
#        print(datos[i,j])
        posa = datosunicos.index(datos[i, j])
        datos[i, j] = unicos_datos[posa]

data = datos.astype(np.float64)

#datosBinarios=MultiLabelBinarizer().fit_transform(data)

#y = mi_y
#y = mi_y[:,mi_y.shape[1]-1:mi_y.shape[1]]
y = mi_y[:,mi_y.shape[1]-1]
unicos = np.unique(y)
unicos = unicos.tolist()
unicos_numericos = list(range(len(unicos)))

bar = y
for i in range(len(bar)):
    pos = unicos.index(bar[i])
    bar[i] = unicos_numericos[pos]

y = bar.astype(np.int32)

X = data
#Xbuena = X.astype(np.float)
counter = 0
###########################################################






def hyperopt_train_test(params):
    from sklearn.preprocessing import normalize
    from sklearn.preprocessing import scale

    X_ = X[:]

    if 'normalize' in params:
        if params['normalize'] == 1:
            X_ = normalize(X_)
        del params['normalize']

    if 'scale' in params:
        if params['scale'] == 1:
            X_ = scale(X_)
        del params['scale']

# CHOOSE THE ALGORITHM TO BE USED TO PROCESS THE DATA AND CROSS-VALIDATE. **HERE IS WHERE I ASSIGN THE CORES WITH n_jobs=-1**
##########################################################
#    clf = SVC(**params)
#    clk = KNeighborsClassifier(**params)
#    clnb = GaussianNB(**params)

    clrf = RandomForestClassifier(**params)
#    clmlp = MLPClassifier(**params)
#    clf = SVR(**params)

    return cross_val_score(clrf, X_, y, cv=10, n_jobs=-1).mean()
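    # NOTE: with n_jobs=-1, cross_val_score runs the 10 folds in separate joblib
    # worker processes, so X_ and y are pickled and copied to every worker; on a
    # small dataset that start-up/copy overhead can outweigh the parallel speed-up.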
##########################################################

#DEFINE THE SEARCH SPACES FOR EACH ALGORITHM
space4svm = {
    'C': hp.uniform('C', 0, 20),
    'kernel': hp.choice('kernel', ['linear', 'sigmoid', 'poly', 'rbf']),
#    'kernel': hp.choice('kernel', ['linear']),
    #'epsilon': hp.choice('epsilon', [0.1]),
    'gamma': hp.uniform('gamma', 0, 20),
    'scale': hp.choice('scale', [0, 1]),
    'normalize': hp.choice('normalize', [0, 1])
}
space4KNN = {
    'n_neighbors': hp.choice('n_neighbors', [1, 2, 3, 4, 5]),
    'scale': hp.choice('scale', [0, 1]),
    'normalize': hp.choice('normalize', [0, 1])
}
space4NB = {
    'scale': hp.choice('scale', [0, 1]),
    'normalize': hp.choice('normalize', [0, 1])
}
space4RF = {
    'n_estimators': hp.choice('n_estimators', np.arange(10, 30, dtype=int)),
    'max_features': hp.uniform('max_features', 0.25, 1),
    'scale': hp.choice('scale', [0, 1]),
    'normalize': hp.choice('normalize', [0, 1])
}
space4MLP = {
    'momentum': hp.uniform('momentum', 0, 0.05),
    'scale': hp.choice('scale', [0, 1]),
    'normalize': hp.choice('normalize', [0, 1])
}



def f(params):
    acc = hyperopt_train_test(params)
    global counter
    counter = counter + 1
    print counter, acc
    return {'loss': -acc, 'status': STATUS_OK}

#HERE IS WHERE I WANT TO MAKE IT MULTICORE, WHEN IT CALLS THE fmin FUNCTION
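# (As far as I know, fmin with a plain Trials object evaluates one candidate at a
# time; parallelizing the search itself would need something like hyperopt's
# MongoTrials, which is why I try to parallelize the cross-validation inside each
# evaluation instead.)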
if __name__ == '__main__':
    trials = Trials()
    best = fmin(f, space4RF, algo=tpe.suggest, max_evals=100, trials=trials)
    print 'best:'
    print best



    #CHOOSE THE PARAMETERS DEPENDING ON THE ALGORITHM TO USE
    #############################################################
    #parameters = ['C', 'kernel', 'gamma', 'scale', 'normalize']
    #parameters = ['n_neighbors', 'scale', 'normalize']
    #parameters = [ 'scale', 'normalize']
    parameters = ['n_estimators','max_features', 'scale', 'normalize']
    #parameters = ['momentum','scale', 'normalize']
    #############################################################


    cols = len(parameters)
    fig, axes = plt.subplots(nrows=1, ncols=cols, figsize=(20, 5))
    cmap = plt.cm.jet
    for i, val in enumerate(parameters):
        xs = np.array([t['misc']['vals'][val] for t in trials.trials]).ravel()
        ys = [-t['result']['loss'] for t in trials.trials]
        #xs, ys = zip(*sorted(zip(xs, ys)))
        #xs, ys = zipped.sort(*sorted(zip(xs, ys)))
        axes[i].scatter(xs, ys, s=20, linewidth=0.01, alpha=0.25, c=cmap(float(i)/len(parameters)))
        axes[i].set_title(val)
        axes[i].set_ylim([0.9, 1.0])


#PRINTS TOTAL TIME
print("TIEMPO TOTAL:")
print(time.time()-timetot)

I get approximately 96 seconds processing it with one core and 296 seconds with 4 cores.
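To check whether cross_val_score itself is slower with n_jobs=-1 on this machine, this is the kind of standalone timing I can run outside the hyperopt loop (a minimal sketch using scikit-learn's built-in iris data instead of my ARFF file):

import time
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score

iris = load_iris()
X_iris, y_iris = iris.data, iris.target
clf = RandomForestClassifier(n_estimators=20)

# Time a 10-fold cross-validation once in a single process and once on all cores
for n_jobs in (1, -1):
    start = time.time()
    score = cross_val_score(clf, X_iris, y_iris, cv=10, n_jobs=n_jobs).mean()
    print("n_jobs=%d  score=%.3f  time=%.2f s" % (n_jobs, score, time.time() - start))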

Thank you very much for your help.

alc11
  • Hard to help here with such huge (badly structured) code. I did not even find your usage of joblib, where is `delayed` used? Apart from that: the general idea is not that promising: joblib in general copies all the data and using mmaps to not copy is maybe not that simple. As most of your classifiers support multiprocessing, you should not parallelize the outer task (hard), but make sure all the inner ones are nicely parallelized (easy, just make sure to use the right params). Maybe the defaults are already doing that and your trial shows just the copy-overhead. – sascha Jun 01 '17 at 12:57
  • The code is a total mess indeed. Where is hyperopt_best_test called? – Lukasz Tracewski Jun 01 '17 at 15:27
  • My apologies, I have edited it to be clearer. Thank you very much for your help. – alc11 Jun 01 '17 at 16:08
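Following up on sascha's comment, this is what I understand by parallelizing the inner task instead of the outer one: give the RandomForestClassifier itself n_jobs=-1 and keep the outer cross_val_score in a single process, so the data is not copied to worker processes for every fold. A minimal sketch (not benchmarked yet; it reuses the X and y arrays built above, drops the normalize/scale handling for brevity, and the function name is just illustrative):

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score

def hyperopt_train_test_inner_parallel(params):
    # Work on a copy so hyperopt's params dict is left untouched, and drop the
    # preprocessing flags (normalize/scale handling omitted here for brevity)
    params = dict(params)
    params.pop('normalize', None)
    params.pop('scale', None)
    # The forest builds its trees on all cores ...
    clrf = RandomForestClassifier(n_jobs=-1, **params)
    # ... while the cross-validation stays in a single process
    return cross_val_score(clrf, X, y, cv=10, n_jobs=1).mean()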
