1

When I run my models individually, each with a different training and test set, they work fine. I then wanted to run them all in a for loop, and now I am getting an error and I am not sure why.

I have created several time splits to check how the model is performing with different data breakdowns.


# dataframe operations - pandas
import pandas as pd
# plotting data - matplotlib
from matplotlib import pyplot as plt
# time series - statsmodels 
# Seasonality decomposition
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.seasonal import seasonal_decompose 
# holt winters 
# single exponential smoothing
from statsmodels.tsa.holtwinters import SimpleExpSmoothing   
# double and triple exponential smoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing

from numpy import sqrt 
from sklearn.metrics import mean_squared_error

# Load the visits data; the 'date' column is parsed as dates and used as the index.
df = pd.read_csv('/content/hw-cv-imputed.csv',index_col='date', parse_dates=True)
# Declare the index frequency as weekly, anchored on Fridays.
df.index.freq = 'W-FRI'
# finding shape of the dataframe
print(df.shape)
# having a look at the data
print(df.head())
# plotting the original data
df[['visits']].plot(title='visit Data')

# Splitting according to the above description.
# Every split takes the first column of df by row position: the training
# series always starts at row 0 and ends where its test series begins.
# Each test series is the following 10 rows, except the last split, whose
# slice 82:90 can hold at most 8 rows.
train1, test1 = df.iloc[:52, 0], df.iloc[52:62, 0]
train2, test2 = df.iloc[:56, 0], df.iloc[56:66, 0]
train3, test3 = df.iloc[:60, 0], df.iloc[60:70, 0]
train4, test4 = df.iloc[:65, 0], df.iloc[65:75, 0]
train5, test5 = df.iloc[:69, 0], df.iloc[69:79, 0]
train6, test6 = df.iloc[:73, 0], df.iloc[73:83, 0]
train7, test7 = df.iloc[:78, 0], df.iloc[78:88, 0]
train8, test8 = df.iloc[:82, 0], df.iloc[82:90, 0]
# Empty results table; the ranked parameters of every split are appended to it.
total_model_parameters = pd.DataFrame(columns = ['Total','Parameters'])
# Split into train and test set
#train_df = train1
#test_df = test1

from sklearn.model_selection import ParameterGrid
# Grid-search Holt-Winters settings for each train/test split and collect the
# ranked results of every split in total_model_parameters.
# NOTE: the tuples below contain the *strings* 'train1', 'test1', ... and not
# the Series objects train1, test1, ...; train_df is therefore a str when it
# reaches ExponentialSmoothing, which is what the reported
# "unrecognized data structures: <class 'str'>" ValueError complains about.
# Only splits 1-7 are listed here; train8/test8 are not iterated.
for train_df ,test_df in [('train1','test1'),('train2','test2'),('train3','test3'),('train4','test4'),('train5','test5'),('train6','test6'),('train7','test7')]:
  # Candidate settings: every combination of trend type, seasonal type and
  # seasonal period length.
  params_grid = {'trend':('mul','add'),
                'seasonal':('mul','add'),
                'seasonal_periods': [10,12]}
  grid = ParameterGrid(params_grid)
  # Count the combinations by iterating over the grid once.
  cnt = 0
  for p in grid:
      cnt = cnt+1

  print('Total Possible Models',cnt)

  # Per-split results: one row per parameter combination.
  model_parameters = pd.DataFrame(columns = ['Total','Parameters'])
  for p in grid:
      # NOTE(review): 'test' is created but never used below.
      test = pd.DataFrame()
      print(p)
      # The surrounding ** markers are forum emphasis flagging the line that
      # raises the error; they are not part of the Python statement.
      **fitted_model = ExponentialSmoothing(train_df,trend=p['trend'],seasonal=p['seasonal'],seasonal_periods=p['seasonal_periods']).fit()**
      # Forecast the next 10 periods after the end of the training data.
      test_predictions = fitted_model.forecast(10)
      # Side-by-side frame: actual values, forecasts, and the percentage error
      # of each forecast relative to the actual value.
      df_new = pd.concat((test_df,test_predictions.rename('predicted_visits'),(((test_df-test_predictions)/test_df)*100).rename('error')),axis=1)
      # A forecast scores 1 when its absolute percentage error is below 20, else 0.
      def accuracy(row):  
          if  abs(row['error']) < 20:
              return 1
          return 0
      df_new['accuracy'] = df_new.apply(lambda row: accuracy(row), axis=1)
      # Total = number of forecasts in this window that scored 1.
      Total = df_new['accuracy'].sum()
      print('Accuracy------------------------------------',Total)
      # NOTE(review): DataFrame.append was removed in pandas 2.0; on newer
      # pandas this call needs pd.concat instead.
      model_parameters = model_parameters.append({'Total':Total,'Parameters':p},ignore_index=True)

  # Rank this split's parameter combinations, best score first.
  parameters = model_parameters.sort_values(by=['Total'],ascending=False)
  parameters = parameters.reset_index(drop=True)
  # Bare expression: its value is discarded when evaluated inside a loop.
  parameters.head(9)

  Parameters_1 = pd.DataFrame(parameters)
  # Bare expressions again; neither line has any effect here.
  Parameters_1
  parameters['Parameters'][0]
  # Add this split's ranked results to the overall table.
  total_model_parameters = total_model_parameters.append(parameters)
# Bare expression at top level; presumably meant to display the table as the
# last line of a notebook cell.
total_model_parameters

The error is

for the line - *fitted_model = ExponentialSmoothing(train_df,trend=p['trend'],seasonal=p['seasonal'],seasonal_periods=p['seasonal_periods']).fit()*
ValueError: unrecognized data structures: <class 'str'> / <class 'NoneType'>

Can someone help, please? :)

p.s. The data is as follows

date    visits
1/22/2021   7352070
1/29/2021   7063725
2/5/2021    9385950
2/12/2021   7851435
2/19/2021   9509640
2/26/2021   9919170
3/5/2021    9682125
3/12/2021   9597075
3/19/2021   8189835
3/26/2021   7487385
4/2/2021    8863965
4/9/2021    8856165
4/16/2021   8619345
4/23/2021   4499670
4/30/2021   3642705
5/7/2021    3105690
5/14/2021   3096330
5/21/2021   3240360
5/28/2021   5152410
6/4/2021    6471915
6/11/2021   4401030
6/18/2021   3197775
6/25/2021   2606340
7/2/2021    3248460
7/9/2021    4996425
7/16/2021   7775085
7/23/2021   9690795
7/30/2021   10041555
8/6/2021    11849055
8/13/2021   14598750
8/20/2021   15339390
8/27/2021   20118720
9/3/2021    12731115
9/10/2021   17456475
9/17/2021   20393850
9/24/2021   20537895
10/1/2021   20800935
10/8/2021   25035450
10/15/2021  22872450
10/22/2021  22790130
10/29/2021  22036965
11/5/2021   26988975
11/12/2021  29194530
11/19/2021  26106000
11/26/2021  29928660
12/3/2021   29254335
12/10/2021  32165430
12/17/2021  27303570
12/24/2021  21453585
12/31/2021  21568815
1/7/2022    21286680
1/14/2022   25589715
1/21/2022   21890130
1/28/2022   20881515
2/4/2022    24185835
2/11/2022   24160590
2/18/2022   20253360
2/25/2022   20450910
3/4/2022    26542320
3/11/2022   25540335
3/18/2022   29602380
3/25/2022   32258340
4/1/2022    24953640
4/8/2022    22872165
4/15/2022   25784490
4/22/2022   25168356
4/29/2022   25405687
5/6/2022    24693295
5/13/2022   26374944
5/20/2022   26192271
5/27/2022   26868125
6/3/2022    27948287
6/10/2022   28320595
6/17/2022   28153788
6/24/2022   27470327
7/1/2022    30520950
7/8/2022    28635750
7/15/2022   26269140
7/22/2022   24236250
7/29/2022   20541675
8/5/2022    21190020
8/12/2022   22389675
8/19/2022   24496455
8/26/2022   27555645
9/2/2022    26324760
9/9/2022    32937450
9/16/2022   36577425
9/23/2022   33522000
9/30/2022   30759780
10/7/2022   30615870

sdave
  • 531
  • 4
  • 18
  • It would be useful to have the full stack trace so we can see what line of code the problem occurs on. Short of that, I guess that it's because your data is tab delimited when it should be comma separated or that you read in an empty line at the end or that pandas is confused by the american date format. – FiddleStix Oct 17 '22 at 20:44
  • @FiddleStix Thank you for your answer. I have added on which line the error occurs. when I run the same code without for loop for individual train and test set then it works so I believe I have made some mistakes around that. Thanks – sdave Oct 17 '22 at 20:48

1 Answers1

2

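The problem is that you have put your variable names in quotes ('), so the loop receives the strings 'train1', 'test1', … instead of the Series those names refer to. The list in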

for train_df ,test_df in [('train1','test1'),...]

shouldn't have the ' quote characters around the names.

You can do away with that line if you're happy to put your pairs of training and test data into a list of tuples like this

import pandas as pd
from sklearn.model_selection import ParameterGrid
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Load the visits data with the parsed "date" column as the index, and declare
# the index frequency as weekly, anchored on Fridays.
df = pd.read_csv("hw-cv-imputed.csv", index_col="date", parse_dates=True)
df.index.freq = "W-FRI"

# Quick inspection: dimensions, first rows, and a plot of the raw series.
print(df.shape)
print(df.head())
df[["visits"]].plot(title="visit Data")

# Row positions of each split as (end of training rows, end of test rows).
# Training always starts at row 0; the test rows follow it directly.
# NOTE(review): the last test slice (82:90) can hold at most 8 rows while the
# others span 10, and the question's loop only iterated over the first seven
# splits -- confirm the loop body copes with the shorter window.
split_bounds = [
    (52, 62),
    (56, 66),
    (60, 70),
    (65, 75),
    (69, 79),
    (73, 83),
    (78, 88),
    (82, 90),
]

# Holding the actual Series objects (not their names) in a list of tuples lets
# the loop below unpack real data into train_df / test_df.
train_and_test = [
    (df.iloc[:train_end, 0], df.iloc[train_end:test_end, 0])
    for train_end, test_end in split_bounds
]
total_model_parameters = pd.DataFrame(columns=["Total", "Parameters"])

for train_df, test_df in train_and_test:
    # Every combination of trend type, seasonal type and seasonal period.
    params_grid = dict(
        trend=("mul", "add"),
        seasonal=("mul", "add"),
        seasonal_periods=[10, 12],
    )
    grid = ParameterGrid(params_grid)

    # The grid knows its own size, so no counting loop is needed.
    cnt = len(grid)
    print("Total Possible Models", cnt)

    # Per-split results table, one row per parameter combination.
    model_parameters = pd.DataFrame(columns=["Total", "Parameters"])
    for p in grid:
        ...
FiddleStix
  • 3,016
  • 20
  • 21