import keras
import tensorflow as tf
# Load MNIST through the Keras dataset helper, passing "mnist.npz" as `path`.
# NOTE(review): `data` is not referenced again in the visible code, and the
# same dataset is loaded a second time further down via `mnist.load_data()` —
# confirm whether this call is still needed.
data = tf.keras.datasets.mnist.load_data(path="mnist.npz")
import numpy as np
def pca(X):
    """Run PCA on the rows of ``X`` via SVD of the standardized data.

    Each column (feature) is centered by its mean and divided by its
    standard deviation before the decomposition, so the result is a PCA of
    the z-scored data.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).

    Returns:
        A tuple ``(X_mean, weights, vectors)`` where

        * ``X_mean`` is the per-feature mean of the raw input,
        * ``weights`` holds the variance explained by each component
          (eigenvalues of the sample covariance of the standardized data),
          in descending order,
        * ``vectors`` has one principal axis per *column*, so
          ``vectors[:, k]`` belongs to ``weights[k]``.
    """
    # Normalize the data
    X_mean = np.mean(X, axis=0)
    X_std = np.std(X, axis=0)
    # Constant features have zero std; substitute a small positive value so
    # the division is defined (their centered values are all zero anyway).
    X_std = np.where(X_std == 0, 1e-6, X_std)
    X_norm = (X - X_mean) / X_std

    # Thin SVD: with the default full_matrices=True, U would be
    # n_samples x n_samples (tens of GB for the full MNIST matrix) and
    # `vectors` could get more columns than there are weights.
    _, s, Vt = np.linalg.svd(X_norm, full_matrices=False)

    # Eigenvalues of the sample covariance from the singular values.
    # max(..., 1) avoids a division by zero for a single-sample input.
    weights = s**2 / max(X_norm.shape[0] - 1, 1)

    # The rows of Vt are the principal axes; transpose so they are columns.
    vectors = Vt.T
    return X_mean, weights, vectors
from tensorflow.keras.datasets import mnist
# Fetch the train/test split from the Keras MNIST loader.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# Join both splits along the sample axis, then flatten every image into a
# single row of 28*28 values.
X = np.concatenate([X_train, X_test], axis=0)
X = X.reshape((-1, 28 * 28))
# Run the hand-written PCA on the flattened images.
X_mean, weights, vectors = pca(X)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# Load the MNIST data.
# fetch_openml can return pandas objects (DataFrame / Series). On a DataFrame,
# `X[idx]` with an integer array selects *columns by label*, which raises
# KeyError: "None of [...] are in the [columns]". Converting to NumPy arrays
# makes every slice and index below positional (row-wise).
mnist = fetch_openml('mnist_784')
X, y = np.asarray(mnist.data), np.asarray(mnist.target)
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]
# Randomly select 1000 images from the training set (fixed seed so the same
# subset is drawn on every run).
np.random.seed(123)
n_samples = 1000
idx = np.random.choice(X_train.shape[0], n_samples, replace=False)
X_train = X_train[idx]
y_train = y_train[idx]
# Define the number of components to test
n_components_list = [50, 100, 200, 500]


def _scatter_by_class(projected, labels, x_pc, y_pc, title):
    """Scatter two principal-component columns, one series per class label.

    ``x_pc`` and ``y_pc`` are zero-based column indices into ``projected``;
    the axis labels show them one-based ("PC1", "PC2", ...).
    """
    labels = np.asarray(labels)
    plt.figure()
    for label in np.unique(labels):
        members = labels == label
        plt.scatter(projected[members, x_pc], projected[members, y_pc], label=label)
    plt.title(title)
    plt.xlabel(f"PC{x_pc + 1}")
    plt.ylabel(f"PC{y_pc + 1}")
    plt.legend()
    plt.show()


# Loop over the number of components and fit a Random Forest classifier
for n_components in n_components_list:
    # Perform PCA with the specified number of components.
    # Named `pca_model` so it does not overwrite the `pca` function defined
    # earlier in this file.
    pca_model = PCA(n_components=n_components)
    pca_model.fit(X_train)
    X_train_pca = pca_model.transform(X_train)
    # NOTE(review): X_test_pca is computed but not used below; the plots and
    # the cross-validation both run on the training subsample — confirm
    # whether the plots were meant to show the test data instead.
    X_test_pca = pca_model.transform(X_test)

    title = f"PCA with {n_components} components"
    # Class locations on a 2D map: first vs. second principal component
    _scatter_by_class(X_train_pca, y_train, 0, 1, title)
    # Class locations on a 2D map: first vs. third principal component
    _scatter_by_class(X_train_pca, y_train, 0, 2, title)

    # Train a Random Forest classifier on the reduced features and evaluate
    # it with 5-fold cross-validation.
    clf = RandomForestClassifier(n_estimators=100)
    scores = cross_val_score(clf, X_train_pca, y_train, cv=5)
    print(f"Accuracy with {n_components} components: {np.mean(scores):.2f}")
I wrote this code for homework, but after running it I got the following error:
KeyError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_8072\3984917867.py in <module>
17 n_samples = 1000
18 idx = np.random.choice(X_train.shape[0], n_samples, replace=False)
---> 19 X_train = X_train[idx]
20 y_train = y_train[idx]
21
~\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
3509 if is_iterator(key):
3510 key = list(key)
-> 3511 indexer = self.columns._get_indexer_strict(key, "columns")[1]
3512
3513 # take() does not accept boolean indexers
~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in _get_indexer_strict(self, key, axis_name)
5794 keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
5795
-> 5796 self._raise_if_missing(keyarr, indexer, axis_name)
5797
5798 keyarr = self.take(indexer)
~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in _raise_if_missing(self, key, indexer, axis_name)
5854 if use_interval_msg:
5855 key = list(key)
-> 5856 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
5857
5858 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
KeyError: "None of [Int64Index([29561, 26640, 24498, 24594, 24249, 22744, 35431, 37308, 19137,\n 12014,\n ...\n 30525, 34217, 7084, 49920, 40229, 41658, 37384, 28900, 5335,\n 42681],\n dtype='int64', length=1000)] are in the [columns]"
Using only a portion of the data (e.g., about 1000 images randomly chosen from the training set) perform PCA and train a classifier.
Using the MNIST data, do a series of PCA-based reductions on the data. This should test at least four different values for the number of components chosen. Plot the class locations of the test data on a 2D map with the horizontal axis as the first principal component and the vertical axis as the second principal component (like the one discussed in class). Do the same for the first and third principal components. This should show you some clustering of the labels (better than if you just chose any two pixels). Feed the reduced features to a Random Forest decision-tree classifier and show classification results using cross-validation. You should use all the data in training. This should be repeated for a few numbers of components extracted by PCA.