I am trying to gap-fill a half-hourly time series of carbon fluxes. I want to use train-test-validate cross-validation to identify the most parsimonious LSTM model: train a model with all available inputs, then prune inputs until the score stops improving. For each candidate model, I'm using k-fold CV to split the data 90% train / 10% validate, and then, in model.fit(), splitting the training portion further into train and test sets. I'm using early stopping to help minimize run time and ModelCheckpoint to save the best weights (the epoch with the lowest "val_loss"). I then want to load those weights and calculate the validation score (MSE) on the 10% of the data set aside for validation outside the model, using the weights that performed best on the test set.
Here is a working example of my code, training an LSTM on 10 input variables and 13 time steps (each observation plus the 6 hours leading up to it):
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations
from functools import partial
from multiprocessing import Pool
from sklearn.neural_network import MLPRegressor as MPR
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import RepeatedKFold
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import EarlyStopping,ModelCheckpoint
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.9
session = tf.Session(config=config)
def TimeShape(rolls,X1):
X = np.zeros(shape = (X1.shape[0],rolls+1,X1.shape[1]))
X[:,0,:] = X1
if rolls > 0:
for roll in range(0,rolls):
X2 = np.roll(X1,(roll+1),axis=0)
X[:,roll+1,:] = X2
return(X)
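# Illustrative note (added for clarity, not part of the original run): TimeShape
# turns a 2-D (samples, features) array into a 3-D (samples, rolls+1, features)
# array in which slice [:, k, :] holds the values from k steps earlier. np.roll
# wraps around, so the first `rolls` samples contain data from the end of the
# series; Run() trims those rows off below.
# Example: TimeShape(2, np.arange(10.).reshape(5, 2)).shape == (5, 3, 2)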
def LSTM_Model(time_steps,inputs,load=None):
model = Sequential()
model.add(LSTM(12, input_shape=(time_steps+1,inputs),return_sequences=True,init='normal', activation='tanh'))
model.add(LSTM(6,init='normal', activation='tanh'))
model.add(Dense(1, init='normal',activation='linear'))
NUM_GPU = 1 # or the number of GPUs available on your machine
gpu_list = []
for i in range(NUM_GPU): gpu_list.append('gpu(%d)' % i)
    model.compile(loss='mean_squared_error', optimizer='adam', context=gpu_list)  # the context kwarg is keras-mxnet only; remove it when using the TensorFlow backend
return(model)
class LossHistory(keras.callbacks.Callback):
def on_train_begin(self, logs={}):
self.train_losses = []
self.test_losses = []
def on_epoch_end(self, batch, logs={}):
self.train_losses.append(logs.get('loss'))
self.test_losses.append(logs.get('val_loss'))
class LSTM_Optimize:
def __init__(self,Path,y_var):
        # **Read and prep data**
self.Master = pd.read_csv(Path,delimiter = ',',header = 0,na_values = -9999)
self.Master = self.Master.set_index(pd.DatetimeIndex(pd.to_datetime(self.Master['datetime'])))
self.Master['DOY'] = self.Master.index.dayofyear*1.0
self.Master['HR'] = self.Master.index.hour*1.0
self.Data = self.Master[np.isfinite(self.Master[y_var])]
self.Data = self.Data.interpolate().bfill()
self.Data = self.Data.interpolate().ffill()
        # **Normalize Y variable**
        # **The Pipeline takes care of X, but not Y; I've found the models work better when normalizing Y**
self.y = self.Data[y_var].values
self.YStandard = StandardScaler()
self.YScaled = self.YStandard.fit(self.y.reshape(-1, 1))
Yscale = self.YScaled.transform(self.y.reshape(-1, 1))
self.y = np.ndarray.flatten(Yscale)
self.Ytru = self.YScaled.inverse_transform(self.y.reshape(-1,1))
def Run(self,Inputs):
# Preparing the input data
time_steps = 12
X = self.Data[Inputs]
input_shape = len(Inputs)
self.XStandard = StandardScaler()
self.XScaled= self.XStandard.fit(X)
Xscale = self.XScaled.transform(X)
Xscale = TimeShape(time_steps,Xscale)
Xscale = Xscale[time_steps+1:,:,:]
self.y = self.y[time_steps+1:]
ES = EarlyStopping(monitor='val_loss', min_delta=0.0, patience=25, verbose=1, mode='auto')
CH = ModelCheckpoint(filepath='weights.hdf5',monitor='val_loss', verbose=0, save_best_only=True)
HS=LossHistory()
MSE = []
kf = RepeatedKFold(n_splits=10,n_repeats=2)
batch_size=25
Mod = LSTM_Model(time_steps,input_shape)
plt.figure(figsize = (7,7))
        for train, test in kf.split(Xscale, self.y):
            # validation_split=0.1 carves the inner "test" set out of the training fold;
            # ModelCheckpoint saves the weights from the epoch with the lowest val_loss
            Mod.fit(Xscale[train], self.y[train], batch_size=batch_size, nb_epoch=1000, validation_split=0.1,
                    shuffle=True, callbacks=[ES, CH, HS], verbose=0)
            Y = Mod.predict(Xscale[test], batch_size=batch_size)  # overwritten below once the best weights are loaded
            # reload the checkpointed best weights before scoring the held-out validation fold
            Mod.load_weights('weights.hdf5')
            Y = Mod.predict(Xscale[test], batch_size=batch_size)
            MSE.append(metrics.mean_squared_error(self.y[test], Y))
plt.plot(HS.test_losses,linestyle='--')
plt.plot(HS.train_losses)
        Mod.summary()  # summary() prints directly; wrapping it in print() just adds a trailing "None"
print(np.asanyarray(MSE).mean())
Path = 'FluxData.csv'
%matplotlib inline
start_time = time.time()
if __name__ == '__main__':
CH4_Model = ['Sedge','Shrubby','Temp','VWC','ustar','wind_speed','air_pressure',
'PPFD_Avg','NR_Wm2_Avg','AirTC_Avg']
y_var = 'ch4_flux'
Model = CH4_Model
Best = LSTM_Optimize(Path,y_var)
Best.Run(Model)
print()
print("--- %s seconds ---" % (time.time() - start_time))
And here are a few rows of my dataset; the actual series has thousands of observations:
datetime,co2_flux,ch4_flux,ustar,wind_speed,AirTC_Avg,air_pressure,AirTC_Min,RH,PPFD_Avg,NR_Wm2_Avg,VWC,Temp,Sedge,Shrubby
7/11/2016 8:00,-0.337747167,0.011732699,0.404379747,3.887986435,15.07,101118.6513,15.03,92.7,414.2,225.1,0.5895,7.950660426,0.001292044,0.823794007
7/11/2016 8:30,-1.021087283,0.010256442,0.424094541,3.94983083,14.89,101144.0926,14.84,92.8,339.7,177.1,0.5895,8.24119905,0.001058732,0.826866339
7/11/2016 9:00,-0.146511388,0.008503355,0.456274817,4.687202214,14.71,101177.3176,14.63,93.4,354.4,183.7,0.5895,8.146344257,0.000474955,0.84272365
7/11/2016 9:30,0.144368521,0.009458078,0.462915317,4.810986576,14.27,101203.9191,14.2,93.3,370.2,188.4,0.5895,7.995179025,0.00147768,0.854715683
7/11/2016 10:00,1.471425801,0.014895985,0.47095652,5.098075355,13.7,101235.9171,13.62,94.3,462.9,233.9,0.5895,7.521166721,4.64E-05,0.871581919
7/11/2016 10:30,0.889911286,0.01564225,0.487227522,4.969666239,13.13,101277.0195,13.04,96,309.9,155.2,0.5895,7.923818563,8.14E-06,0.880709962
When I run this with a TensorFlow backend, everything goes smoothly. However, if I try to run it with an MXNet backend, it fails to load the saved model weights and I get this traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-1-14c6597a2feb> in <module>()
114 Model = CH4_Model
115 Best = LSTM_Optimize(Path,y_var)
--> 116 Best.Run(Model)
117 print()
118 print("--- %s seconds ---" % (time.time() - start_time))
<ipython-input-1-14c6597a2feb> in Run(self, Inputs)
96 shuffle=True,callbacks=[ES,CH,HS],verbose=0)
97 Y = Mod.predict(Xscale[test],batch_size = batch_size)
---> 98 Mod.load_weights('weights.hdf5')
99 Y = Mod.predict(Xscale[test],batch_size = batch_size)
100 MSE.append(metrics.mean_squared_error(self.y[test],Y))
/usr/local/lib/python3.5/dist-packages/Keras-1.2.2-py3.5.egg/keras/engine/topology.py in load_weights(self, filepath, by_name)
2718 self.load_weights_from_hdf5_group_by_name(f)
2719 else:
-> 2720 self.load_weights_from_hdf5_group(f)
2721
2722 if hasattr(f, 'close'):
/usr/local/lib/python3.5/dist-packages/Keras-1.2.2-py3.5.egg/keras/engine/topology.py in load_weights_from_hdf5_group(self, f)
2804 weight_values[0] = w
2805 weight_value_tuples += zip(symbolic_weights, weight_values)
-> 2806 K.batch_set_value(weight_value_tuples)
2807
2808 def load_weights_from_hdf5_group_by_name(self, f):
/usr/local/lib/python3.5/dist-packages/Keras-1.2.2-py3.5.egg/keras/backend/mxnet_backend.py in batch_set_value(tuples)
2205 """
2206 for p, w in tuples:
-> 2207 set_value(p, w)
2208
2209
/usr/local/lib/python3.5/dist-packages/Keras-1.2.2-py3.5.egg/keras/backend/mxnet_backend.py in set_value(x, value)
2193 if isinstance(value, Number):
2194 value = [value]
-> 2195 x.bind(mx.nd.array(value))
2196
2197
/usr/local/lib/python3.5/dist-packages/mxnet-0.11.0-py3.5.egg/mxnet/ndarray.py in array(source_array, ctx, dtype)
1295 raise TypeError('source_array must be array like object')
1296 arr = empty(source_array.shape, ctx, dtype)
-> 1297 arr[:] = source_array
1298 return arr
1299
/usr/local/lib/python3.5/dist-packages/mxnet-0.11.0-py3.5.egg/mxnet/ndarray.py in __setitem__(self, key, value)
384 _internal._set_value(float(value), out=self)
385 elif isinstance(value, (np.ndarray, np.generic)):
--> 386 self._sync_copyfrom(value)
387 else:
388 raise TypeError(
/usr/local/lib/python3.5/dist-packages/mxnet-0.11.0-py3.5.egg/mxnet/ndarray.py in _sync_copyfrom(self, source_array)
556 print(self.shape)
557 raise ValueError('Shape inconsistent: expected %s vs got %s'%(
--> 558 str(self.shape), str(source_array.shape)))
559 check_call(_LIB.MXNDArraySyncCopyFromCPU(
560 self.handle,
ValueError: Shape inconsistent: expected () vs got (1,)
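One workaround I'm considering is to skip the HDF5 round trip entirely and keep the best weights in memory with a custom callback; a minimal sketch (my own idea, untested under MXNet) is below. It sidesteps load_weights(), but I'd still like to understand why loading the checkpoint fails:
class BestWeights(keras.callbacks.Callback):
    # keeps the best weights in memory instead of writing weights.hdf5
    def on_train_begin(self, logs={}):
        self.best_loss = np.inf
        self.best_weights = None
    def on_epoch_end(self, epoch, logs={}):
        val_loss = logs.get('val_loss')
        if val_loss is not None and val_loss < self.best_loss:
            self.best_loss = val_loss
            self.best_weights = self.model.get_weights()

# usage inside Run(), replacing the ModelCheckpoint/load_weights pair:
# BW = BestWeights()
# Mod.fit(..., callbacks=[ES, BW, HS], ...)
# Mod.set_weights(BW.best_weights)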
Why do I want to use MXNet? It seems to be faster than TensorFlow, and I will have to perform train-test-validation on many models with varying inputs and different numbers of nodes and hyperparameters. I've been able to significantly boost the speed of Keras models with an MXNet backend by using multiprocessing to train multiple different models in parallel, whereas with a TensorFlow backend I get a thread-lock error when trying to do multiprocessing.
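The pattern I use for that looks roughly like this (a sketch; fit_one is a hypothetical wrapper and, as above, it assumes Run() returns the mean MSE):
from multiprocessing import Pool
from functools import partial

def fit_one(inputs, path, y_var):
    # each worker process builds and trains its own model on one input set
    return LSTM_Optimize(path, y_var).Run(inputs)

if __name__ == '__main__':
    # e.g. score every leave-one-out subset of the full input list in parallel
    candidates = [[i for i in CH4_Model if i != drop] for drop in CH4_Model]
    pool = Pool(processes=4)
    scores = pool.map(partial(fit_one, path=Path, y_var=y_var), candidates)
    pool.close()
    pool.join()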
For context, I'm using the Deep Learning AMI Ubuntu Linux - 2.3_Sep2017 (ami-d6ee1dae) environment on a p2.xlarge instance.
Any ideas would be greatly appreciated!