
This is my first attempt at deep learning; the purpose of this code is to predict the direction of the FOREX market.

Here is the code:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential

column_names = ['Date', 'Time', 'Open', 'High', 'Low','Close', 'Volume']

data = pd.read_csv(r"E:\Tutorial\EURUSD60.csv", header=None, names=column_names)

data['DateTime'] = pd.to_datetime(data.Date + ' ' + data.Time)
del data['Date']
del data['Time']

sequence_length = 21
n_features = len(data.columns)
val_ratio = 0.1
n_epochs = 300
batch_size = 512

data = data.as_matrix()
data_processed = []
for index in range(len(data) - sequence_length):
    data_processed.append(data[index: index + sequence_length])
data_processed = np.array(data_processed)

val_split = round((1 - val_ratio) * data_processed.shape[0])
train = data_processed[: int(val_split), :]
val = data_processed[int(val_split):, :]

print('Training data: {}'.format(train.shape))
print('Validation data: {}'.format(val.shape))

train_samples, train_nx, train_ny = train.shape
val_samples, val_nx, val_ny = val.shape

train = train.reshape((train_samples, train_nx * train_ny))
val = val.reshape((val_samples, val_nx * val_ny))

preprocessor = MinMaxScaler().fit(train)
train = preprocessor.transform(train)
val = preprocessor.transform(val)

train = train.reshape((train_samples, train_nx, train_ny))
val = val.reshape((val_samples, val_nx, val_ny))

X_train = train[:, : -1]
y_train = train[:, -1][:, -1]
X_val = val[:, : -1]
y_val = val[:, -1][:, -1]

X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], n_features))
X_val = np.reshape(X_val, (X_val.shape[0], X_val.shape[1], n_features))

model = Sequential()
model.add(LSTM(input_shape=(X_train.shape[1:]), units=128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.25))
model.add(Dense(units=1))
model.add(Activation("linear"))

model.compile(loss="mse", optimizer="adam")

history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=n_epochs,
    verbose=2)

preds_val = model.predict(X_val)
diff = []
for i in range(len(y_val)):
    pred = preds_val[i][0]
    diff.append(y_val[i] - pred)

real_min = preprocessor.data_min_[104]
real_max = preprocessor.data_max_[104]
print(preprocessor.data_min_[104])
print(preprocessor.data_max_[104])

preds_real = preds_val * (real_max - real_min) + real_min
y_val_real = y_val * (real_max - real_min) + real_min

plt.plot(preds_real, label='Predictions')
plt.plot(y_val_real, label='Actual values')
plt.xlabel('test')
plt.legend(loc=0)
plt.show()

Here is the error:

Using TensorFlow backend.
2017-12-03 13:26:44.494199: W C:\tf_jenkins\home\workspace\rel-win\M\windows\PY\36\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2017-12-03 13:26:44.494660: W C:\tf_jenkins\home\workspace\rel-win\M\windows\PY\36\tensorflow\core\platform\cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
Training data: (1824, 21, 6)
Validation data: (203, 21, 6)
Traceback (most recent call last):
  File "E:/Tutorial/Deep Learning.py", line 42, in <module>
    preprocessor = MinMaxScaler().fit(train)
  File "C:\Users\sydgo\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py", line 308, in fit
    return self.partial_fit(X, y)
  File "C:\Users\sydgo\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py", line 334, in partial_fit
    estimator=self, dtype=FLOAT_DTYPES)
  File "C:\Users\sydgo\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 433, in check_array
    array = np.array(array, dtype=dtype, order=order, copy=copy)
TypeError: float() argument must be a string or a number, not 'Timestamp'


2 Answers


Here is the code after fixing the error. The key changes are setting the parsed timestamp as the index, casting the remaining columns to float, and deriving numeric hour, weekday and week features before scaling:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential

column_names = ['Date', 'Time', 'Open', 'High', 'Low','Close', 'Volume']

df = pd.read_csv(r"E:\Tutorial\EURUSD60.csv", header=None, names=column_names)

df['DateTime'] = pd.to_datetime(df.Date + ' ' + df.Time)
del df['Date']
del df['Time']

df.rename(columns={'DateTime': 'timestamp', 'Open': 'open',
                   'High': 'high', 'Low': 'low', 'Close': 'close', 'Volume': 'volume'}, inplace=True)
df['timestamp'] = pd.to_datetime(df['timestamp'], infer_datetime_format=True)
df.set_index('timestamp', inplace=True)
df = df.astype(float)
df['hour'] = df.index.hour
df['day'] = df.index.weekday
df['week'] = df.index.week


sequence_length = 21
n_features = len(df.columns)
val_ratio = 0.1
n_epochs = 300
batch_size = 512

data = df.as_matrix()
data_processed = []
for index in range(len(data) - sequence_length):
    data_processed.append(data[index: index + sequence_length])
data_processed = np.array(data_processed)

val_split = round((1 - val_ratio) * data_processed.shape[0])
train = data_processed[: int(val_split), :]
val = data_processed[int(val_split):, :]

print('Training data: {}'.format(train.shape))
print('Validation data: {}'.format(val.shape))

train_samples, train_nx, train_ny = train.shape
val_samples, val_nx, val_ny = val.shape

train = train.reshape((train_samples, train_nx * train_ny))
val = val.reshape((val_samples, val_nx * val_ny))

preprocessor = MinMaxScaler().fit(train)
train = preprocessor.transform(train)
val = preprocessor.transform(val)

train = train.reshape((train_samples, train_nx, train_ny))
val = val.reshape((val_samples, val_nx, val_ny))

X_train = train[:, : -1]
y_train = train[:, -1][:, -1]
X_val = val[:, : -1]
y_val = val[:, -1][:, -1]

X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], n_features))
X_val = np.reshape(X_val, (X_val.shape[0], X_val.shape[1], n_features))

model = Sequential()
model.add(LSTM(input_shape=(X_train.shape[1:]), units=128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.25))
model.add(Dense(units=1))
model.add(Activation("linear"))

model.compile(loss="mse", optimizer="adam")

history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=n_epochs,
    verbose=2)

preds_val = model.predict(X_val)
diff = []
for i in range(len(y_val)):
    pred = preds_val[i][0]
    diff.append(y_val[i] - pred)

real_min = preprocessor.data_min_[104]
real_max = preprocessor.data_max_[104]
print(preprocessor.data_min_[:120])
print(preprocessor.data_max_[:120])

preds_real = preds_val * (real_max - real_min) + real_min
y_val_real = y_val * (real_max - real_min) + real_min

plt.plot(preds_real, label='Predictions')
plt.plot(y_val_real, label='Actual values')
plt.xlabel('test')
plt.legend(loc=0)
plt.show()
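
A side note on the rescaling at the end: the scaler was fitted on the flattened (sequence_length * n_features)-column array, so the data_min_/data_max_ entry that matches the value used as y (the last feature of the final bar in each window) can be computed rather than hard-coded as 104. A minimal sketch; target_feature and target_col are names introduced here purely for illustration:

# After flattening, the value at timestep t and feature f sits in
# column t * n_features + f of the array the scaler was fitted on.
# y above is the last feature of the last timestep of each window.
target_feature = n_features - 1
target_col = (sequence_length - 1) * n_features + target_feature
real_min = preprocessor.data_min_[target_col]
real_max = preprocessor.data_max_[target_col]
# To rescale a different target, e.g. the close price, use
# target_feature = list(df.columns).index('close') instead.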

There is a collision between the expected dtype and the actually delivered data type:

TypeError: float() argument must be a string or a number, not 'Timestamp'

The most probable suspect to revise is the conversion in:

data['DateTime'] = pd.to_datetime(data.Date + ' ' + data.Time)

You have to go back to the concept and decide what (if anything) from the timestamp ought to be fed into the LSTM model as part of the intended FOREX quantitative modeling features.
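
For illustration only, a minimal sketch (using the question's variable names) of one way to resolve the collision, assuming the raw Timestamp itself is not meant to be a feature: derive numeric time features and drop the DateTime column before anything reaches MinMaxScaler.

# Derive scaler-friendly numeric features from the parsed DateTime,
# then remove the Timestamp column so only floats reach MinMaxScaler.
data['hour'] = data['DateTime'].dt.hour        # 0..23
data['weekday'] = data['DateTime'].dt.weekday  # 0 = Monday
del data['DateTime']
data = data.astype(float)                      # no Timestamp objects remain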
