
I'm at a loss as to what's happening here.

I'm downloading historical stock data with pandas-datareader, and after some small manipulations (i.e., rearranging the dataframe, adding moving averages, etc.), I pass the dataframe to Featuretools for quick automated feature engineering, which works fine and adds new columns to the dataframe...

BUT when I then pass it to FeatureSelector (to remove all columns that are highly correlated, have no importance, etc.), I hit an error: FeatureSelector can no longer find the "label" column (Adj Close) that I'm pointing it to. I'm new to FeatureSelector, so I'm not entirely sure how to use it yet. From there, the data is supposed to be passed on to TPOT for automated regression.

I've included my full code here. I know you're not supposed to, but it's working code that anyone can run to reproduce my issue on their side. The error I get is:

KeyError: "labels ['Adj Close'] not contained in axis"

It would appear that FeatureSelector is removing the "Adj Close" label/column during the removal step, but I thought the whole point of assigning it to the labels= parameter was to protect it? Any suggestions would be great. I'd love to get this working. Just type in a ticker symbol to get started (e.g. CLVS). Thanks!
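For what it's worth, the KeyError itself looks like plain pandas behaviour: once a column is gone from the frame, any later drop() naming it raises exactly this error. A minimal sketch (hypothetical values, just to illustrate the mechanism):

import pandas as pd

# A frame that no longer contains the label column, like the one fs.remove() hands back
kept = pd.DataFrame({'Open': [1.0, 2.0], 'PrevOpen': [0.9, 1.1]})
kept.drop(['Adj Close'], axis=1)  # raises a KeyError like the one above

Anyway, here's the full script: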

ticker_input = input('Which stock ticker would you like to predict?') # Start with CLVS for testing

print('Getting the historical data for: ',ticker_input)

# Downloading historical data as dataframe
from datetime import datetime
from pandas_datareader import data as web
import pandas as pd
ex = 'yahoo'
start = datetime(2010, 1, 1)
end = datetime.now()
df = web.DataReader(ticker_input, ex, start, end).reset_index()

# Create the prediction dataset
df = df.drop(['Close'],axis=1)
df['PrevHi'] = df['High'].shift(1)
df['PrevLo'] = df['Low'].shift(1)
df['PrevClose'] = df['Adj Close'].shift(1)
df['PrevVol'] = df['Volume'].shift(1)
df['PrevOpen'] = df['Open'].shift(1)
df = df.drop(['High','Low','Volume'],axis=1)
# Get the 9 and 20 MA values
df['9MA'] = df['Open'].rolling(window=9).mean()
df['20MA'] = df['Open'].rolling(window=20).mean()


import time
# Reshape the df
df2 = df[['Date','Open','PrevOpen','PrevHi','PrevLo','PrevClose','PrevVol','9MA','20MA','Adj Close']]
df2 = df2.dropna()  # dropna() returns a new frame, so assign it back; the default how='any' drops the rows the shift/rolling steps left with NaNs (how='all' only drops rows that are entirely NaN)



# Auto Feature Engineering using Feature Tools
import featuretools as ft
#print(ft.list_primitives().to_string()) # To get full list of primitives that could be used
print('Adding the engineered features to the dataframe. This may take a while...')
es = ft.EntitySet(id = 'stockdata')
es.entity_from_dataframe(entity_id='data', dataframe=df2,
                         make_index=False, index='Date')
# Run deep feature synthesis with transformation primitives
feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='data', max_depth=2, verbose=True,
                                      agg_primitives=['skew', 'mean', 'median',
                                                      'all', 'count', 'num_unique', 'trend', 'max', 'mode',
                                                      'std', 'sum', 'min'],
                                      trans_primitives=['divide_numeric'])
                                      # Other primitives that could be added:
                                      # 'diff', 'greater_than', 'less_than_equal_to',
                                      # 'cum_mean', 'time_since', 'cum_sum',
                                      # 'add_numeric', 'multiply_numeric',
                                      # 'greater_than_equal_to', 'negate', 'cum_min',
                                      # 'subtract_numeric', 'not', 'cum_count',
                                      # 'modulo_numeric', 'less_than'
print(feature_matrix.head())
df2 = feature_matrix
df2.to_csv('FeatureMatrix.csv')

# Trying to now name all the feature columns and label for FeatureSelector...
features = df2.drop(['Adj Close'],axis=1)
label = df2['Adj Close'].values
# Now, drop all columns of low importance
from feature_selector import FeatureSelector
fs = FeatureSelector(data = features, labels = label)
fs.identify_all(selection_params = {'missing_threshold': 0.6,    
                                    'correlation_threshold': 0.98, 
                                    'task': 'regression',    
                                    'eval_metric': 'mse', 
                                    'cumulative_importance': 0.99})
df2 = fs.remove(methods = 'all')
# Somewhere above it's not recognizing my Adj Close label anymore?
# Training dataset
df = df2.iloc[:-90] # subtracting 90 rows/days to use as the predictions dataset later
print('Printing training dataframe...')
print(df)
# Prediction dataset for later use
prediction_df = df2.iloc[-90:]
print('Printing prediction dataframe for later use...')
print(prediction_df)
# Can keep adding to the dataset with things like PrevIndustryHi,Lo,Close,Open and other metrics
print('Pausing for 20 seconds to review before training...')
time.sleep(20)




# Now, train a TPOT Regressor
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
import os
features = df.drop(['Adj Close'],axis=1)
label = df['Adj Close']
X_train, X_test, y_train, y_test = train_test_split(features, label,
                                                    train_size=0.75, test_size=0.25)
# Create a folder to cache the pipeline work (use if not using auto)
# if os.path.exists('./PipelineCache'):
#     pass
# else:
#     os.mkdir('./PipelineCache')
tpot = TPOTRegressor(generations=10, population_size=40, verbosity=2)  # add memory='./PipelineCache' (or memory='auto') to cache pipeline evaluations
tpot.fit(X_train, y_train)
predictions = tpot.predict(X_test)
actuals = y_test
last_row = df.tail(1)
print('The last closing price was:')
print(last_row['Adj Close'])
print("TPOT's final score on the test data is:")
print(tpot.score(X_test, y_test))
if not os.path.exists('./Exported Pipelines'):
    os.mkdir('./Exported Pipelines')
tpot.export('./Exported Pipelines/1day-prediction-pipeline.py')


# Now, use the TPOT model to predict on the held out predictions dataset
from sklearn.metrics import mean_squared_error
features = prediction_df.drop(['Adj Close'], axis=1)
labels = prediction_df['Adj Close']
# Fit the model to the prediction_df and predict the labels
#tpot.fit(features, labels)
results = tpot.predict(features)
prediction_df['Predictions'] = results  # tpot.predict() already returns an array, so it can be assigned directly
prediction_df.to_csv('PredictionsPerformance.csv', index=True)
print('The Mean Square Error of the predictions is :')
print(mean_squared_error(labels,results))
print('DONE!')

# Clear the cache directory when you don't need it anymore.
# If you're testing the same dataset over and over, use the
# same cache file
#from shutil import rmtree
#rmtree('./PipelineCache')

1 Answer


As a workaround, I just re-added the Adj Close column to the dataframe after the removal step, like so:

# Trying to now name all the feature columns and label for FeatureSelector...
features = df.drop("Adj Close", axis=1)
label = df["Adj Close"]
# Now, drop all columns of low importance
from feature_selector import FeatureSelector
fs = FeatureSelector(data = features, labels = label)
fs.identify_all(selection_params = {'missing_threshold': 0.6,    
                                    'correlation_threshold': 0.98, 
                                    'task': 'regression',    
                                    'eval_metric': 'mse', 
                                    'cumulative_importance': 0.99})
all_to_remove = fs.check_removal()
print(all_to_remove)
df = fs.remove(methods = 'all')

# Re-add Adj Close to the df, because FeatureSelector's removal step returns only the feature data and leaves out the column assigned as the label
df['Adj Close'] = label
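
This works because FeatureSelector keeps the labels separate from fs.data, and fs.remove() only returns the (reduced) feature data, so the column passed in as labels is never part of the frame it hands back. Re-attaching it before slicing off the training and prediction sets is what lets the later df.drop("Adj Close", axis=1) calls succeed again. (That's my reading of the library's behaviour; the docstrings don't spell it out.)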