
I am working with a weather dataset for Istanbul, available on Kaggle (https://www.kaggle.com/datasets/vonline9/weather-istanbul-data-20092019?resource=download). I am trying to predict the rainfall amount with both a regression approach and an LSTM, and to compare the results. For the regression I used a Kalman filter and got an MSE of 7.72, and its predicted vs. actual plot fits the data well. However, even though the LSTM's MSE is lower than the Kalman filter's, its predicted vs. actual plot looks distorted and the predictions do not match the actual rainfall amounts well. How can I plot the graph correctly and see where the model is making mistakes?

[LSTM graphic: predicted vs. actual rainfall]

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

from keras.models import Sequential
from keras.layers import Dense, LSTM

# Load the data
data = pd.read_csv('Istanbul Weather Data.csv')
data['AvgPressure'] = data['AvgPressure'] / 10   # convert AvgPressure to hectopascals
# Parse the moon rise/set times and forward-fill any values that fail to parse
data['MoonRise'] = pd.to_datetime(data['MoonRise'], format='%H:%M:%S', errors='coerce').dt.strftime('%H:%M:%S')
data['MoonSet'] = pd.to_datetime(data['MoonSet'], format='%H:%M:%S', errors='coerce').dt.strftime('%H:%M:%S')
data['MoonRise'] = data['MoonRise'].ffill()
data['MoonSet'] = data['MoonSet'].ffill()
# Add a new column that splits the rainfall amount into 4 classes
bins = [-float('inf'), 0, 2, 8, float('inf')]
labels = [1, 2, 3, 4]
data['RainClass'] = pd.cut(data['Rain'], bins=bins, labels=labels)
data['RainToday'] = np.where(data['RainClass'].isin([3, 4]), 1, 0)
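# With right-closed bins, Rain == 0 maps to class 1, (0, 2] to class 2, (2, 8] to class 3
# and anything above 8 to class 4, so RainToday flags days with more than 2 mm of rain.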
data['DateTime'] = pd.to_datetime(data['DateTime'], dayfirst=True)
data['Month'] = data['DateTime'].dt.month
data['Year'] = data['DateTime'].dt.year
data.set_index('DateTime', inplace=True)
data.sort_index(inplace=True)
def get_season(month):
    if month in [1, 2, 3]:
        return 'Winter'
    elif month in [4, 5, 6]:
        return 'Spring'
    elif month in [7, 8, 9]:
        return 'Summer'
    else:
        return 'Autumn'

data['Season'] = data['Month'].apply(get_season)
grouped_data = data.groupby(['Season'])
numeric_data = data.select_dtypes(include=[np.number])
Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1
outliers = (numeric_data < (Q1 - 1.5 * IQR)) | (numeric_data > (Q3 + 1.5 * IQR))
data = data[~outliers.any(axis=1)]
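# The IQR rule above drops every row in which any numeric column falls outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; since Rain is one of those columns, rows with
# unusually heavy rainfall are removed from the dataset as well.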
def apparent_temperature(temp, humidity):
    return temp - ((0.55 - 0.0055 * humidity) * (temp - 14.5))
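# Quick arithmetic check of the formula (purely illustrative values):
# apparent_temperature(25, 60) = 25 - (0.55 - 0.0055*60) * (25 - 14.5)
#                              = 25 - 0.22 * 10.5 = 22.69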
data.loc[:, 'TemperatureRange'] = data['MaxTemp'] - data['MinTemp']
# Weighted blend of rainfall, daily temperature range and humidity
data.loc[:, 'rain_effect'] = (data['Rain'] * 0.5) + ((data['MaxTemp'] - data['MinTemp']) * 0.3) + (data['AvgHumidity'] * 0.2)
# Apparent temperature computed from the daily temperature range and the average humidity
data.loc[:, 'ApparentTemp'] = data.apply(lambda row: apparent_temperature(row['TemperatureRange'], row['AvgHumidity']), axis=1)

# Set the target variable
target = 'Rain'


# Split the data into training and test sets
train_size = int(len(data) * 0.7)
test_size = len(data) - train_size
train_data, test_data = data[:train_size], data[train_size:]

# Turn the series into supervised (window, next value) samples
def create_dataset(dataset, look_back=1):
    X, Y = [], []
    for i in range(len(dataset) - look_back - 1):
        a = dataset[i:(i + look_back), 0]
        X.append(a)
        Y.append(dataset[i + look_back, 0])
    return np.array(X), np.array(Y)
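# Illustration of what create_dataset produces (toy values, not part of the pipeline):
#   toy = np.array([[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]])
#   X_toy, y_toy = create_dataset(toy, look_back=3)
#   X_toy -> [[1. 2. 3.], [2. 3. 4.]]    y_toy -> [4. 5.]
# Note that the trailing "- 1" in the loop skips the last usable window ([3. 4. 5.] -> 6.).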

look_back = 3
X_train, y_train = create_dataset(train_data[[target]].values, look_back)
X_test, y_test = create_dataset(test_data[[target]].values, look_back)

# Reshape the data into the (samples, timesteps, features) form the LSTM expects
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
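# After this reshape each sample has shape (1, look_back): a single timestep whose
# features are the last look_back rainfall values. A more conventional layout for a
# single-feature sequence would be (samples, look_back, 1), but the model below
# accepts this shape because input_shape is taken directly from X_train.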

# Build and train the model
model = Sequential()
model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
history = model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_test, y_test), verbose=2, shuffle=False)
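# Note: validation_data is the held-out test split, so the 'test' curve in the loss
# plot further down is the loss on the same data used for the predictions below.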

# Get the model's predictions
y_pred = model.predict(X_test)

# Reshape targets and predictions to column vectors (no scaling was applied, so there is nothing to invert)
y_test = y_test.reshape(-1, 1)
y_pred = y_pred.reshape(-1, 1)
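# For a like-for-like comparison with the Kalman filter's MSE of 7.72 quoted above,
# the same metric can be computed directly on the unscaled test values:
print('LSTM test MSE:', mean_squared_error(y_test, y_pred))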

# Compare the predicted and actual values
plt.plot(y_test, label='Actual')
plt.plot(y_pred, label='Predicted')
plt.xlabel('Time')
plt.ylabel('Rainfall')
plt.legend()
plt.show()
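# One simple way to see where the predictions go wrong is the residual series
# (added here only as a sketch):
residuals = y_test - y_pred
plt.plot(residuals, label='Residual (actual - predicted)')
plt.axhline(0, color='gray', linewidth=0.8)
plt.xlabel('Time')
plt.ylabel('Error')
plt.legend()
plt.show()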

# Plot the model's training and validation loss
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

I expected the LSTM model to outperform the Kalman filter and to show a good fit between the predicted and actual values, but that is not what I am seeing, and I am unsure why.
