getting this error while doing pca analysis

Question

import keras
import tensorflow as tf
data = tf.keras.datasets.mnist.load_data(path="mnist.npz")
import numpy as np

def pca(X):
    # Normalize the data
    X_mean = np.mean(X, axis=0)
    X_std = np.std(X, axis=0)
    X_std[X_std == 0] = 1e-6 # replace zero std with a small positive value
    X_norm = (X - X_mean) / X_std
    
    # Calculate the singular value decomposition of the centered data matrix
    U, s, Vt = np.linalg.svd(X_norm)
    
    # Calculate the weights (eigenvalues) from the singular values
    weights = s**2 / (X_norm.shape[0]-1)
    
    # The columns of Vt are the principal components (eigenvectors)
    vectors = Vt.T
    
    return X_mean, weights, vectors
from tensorflow.keras.datasets import mnist

(X_train, y_train), (X_test, y_test) = mnist.load_data()
X = np.vstack((X_train, X_test)).reshape(-1, 28*28)

X_mean, weights, vectors = pca(X)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Load the MNIST data
mnist = fetch_openml('mnist_784')
X, y = mnist.data, mnist.target
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

# Randomly select 1000 images from the training set
np.random.seed(123)
n_samples = 1000
idx = np.random.choice(X_train.shape[0], n_samples, replace=False)
X_train = X_train[idx]
y_train = y_train[idx]

# Define the number of components to test
n_components_list = [50, 100, 200, 500]

# Loop over the number of components and fit a Random Forest classifier
for n_components in n_components_list:
    # Perform PCA with the specified number of components
    pca = PCA(n_components=n_components)
    pca.fit(X_train)
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    
    # Plot the class locations on a 2D map with the first and second principal components
    plt.figure()
    for label in np.unique(y_train):
        idx = np.where(y_train == label)[0]
        plt.scatter(X_train_pca[idx, 0], X_train_pca[idx, 1], label=label)
    plt.title("PCA with {} components".format(n_components))
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.legend()
    plt.show()
    
    # Plot the class locations on a 2D map with the first and third principal components
    plt.figure()
    for label in np.unique(y_train):
        idx = np.where(y_train == label)[0]
        plt.scatter(X_train_pca[idx, 0], X_train_pca[idx, 2], label=label)
    plt.title("PCA with {} components".format(n_components))
    plt.xlabel("PC1")
    plt.ylabel("PC3")
    plt.legend()
    plt.show()
    
    # Train a Random Forest classifier on the reduced features and evaluate its performance
    clf = RandomForestClassifier(n_estimators=100)
    scores = cross_val_score(clf, X_train_pca, y_train, cv=5)
    print("Accuracy with {} components: {:.2f}".format(n_components, np.mean(scores)))

i wrote this line for hw but after running it. i got following error;

KeyError                                  Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_8072\3984917867.py in <module>
     17 n_samples = 1000
     18 idx = np.random.choice(X_train.shape[0], n_samples, replace=False)
---> 19 X_train = X_train[idx]
     20 y_train = y_train[idx]
     21 

~\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   3509             if is_iterator(key):
   3510                 key = list(key)
-> 3511             indexer = self.columns._get_indexer_strict(key, "columns")[1]
   3512 
   3513         # take() does not accept boolean indexers

~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in _get_indexer_strict(self, key, axis_name)
   5794             keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
   5795 
-> 5796         self._raise_if_missing(keyarr, indexer, axis_name)
   5797 
   5798         keyarr = self.take(indexer)

~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in _raise_if_missing(self, key, indexer, axis_name)
   5854                 if use_interval_msg:
   5855                     key = list(key)
-> 5856                 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
   5857 
   5858             not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())

KeyError: "None of [Int64Index([29561, 26640, 24498, 24594, 24249, 22744, 35431, 37308, 19137,\n            12014,\n            ...\n            30525, 34217,  7084, 49920, 40229, 41658, 37384, 28900,  5335,\n            42681],\n           dtype='int64', length=1000)] are in the [columns]"

Using only a portion of the data (e.g., about 1000 images randomly chosen from the training set) perform PCA and train a classifier.

Using the MNIST data, do a series of PCA-based reductions on the data. This should test at least four different values for the number of components chosen. Plot the class locations on the test data on a 2D map with horizontal axis as the first principal and with vertical axis as the second principal component (like the one discussed in class). Do the same for the first and third principal components. This should show you some clustering of the labels (better than if you just chose any two pixels). Feed the reduced features to a Random Forest Decision tree and show classification results using cross-validation. You should use all the data in training. This should be repeated for a few numbers of components extracted by PCA.

getting this error while doing pca analysis

0 Answers0