1
```
features, output = make_classification(n_samples = 1000000, n_features = 10, n_informative = 6, n_redundant = 4, n_classes = 2, random_state = 2022)
X = pd.DataFrame(features, columns=["feature_1", "feature_2", "feature_3", "feature_4", "feature_5", "feature_6", "feature_7", "feature_8", "feature_9", "feature_10"])
y = output
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

kernel = 1.0 * RBF(1.0)
gpc = GaussianProcessClassifier(warm_start = True, copy_X_train = False)

gpc.fit(X_train, y_train)
```

I am using Scikit learn to fit a Gaussian process classifier on a made-up data set but got an error saying there is a memory error:

MemoryError                               Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_28100\1590160301.py in <module>
      2 gpc = GaussianProcessClassifier(warm_start = True, copy_X_train = False)
      3 
----> 4 gpc.fit(X_train, y_train)
      5 

~\anaconda3\lib\site-packages\sklearn\gaussian_process\_gpc.py in fit(self, X, y)
    713                 raise ValueError("Unknown multi-class mode %s" % self.multi_class)
    714 
--> 715         self.base_estimator_.fit(X, y)
    716 
    717         if self.n_classes_ > 2:

~\anaconda3\lib\site-packages\sklearn\gaussian_process\_gpc.py in fit(self, X, y)
    249             self.log_marginal_likelihood_value_ = -np.min(lml_values)
    250         else:
--> 251             self.log_marginal_likelihood_value_ = self.log_marginal_likelihood(
    252                 self.kernel_.theta
    253             )

~\anaconda3\lib\site-packages\sklearn\gaussian_process\_gpc.py in log_marginal_likelihood(self, theta, eval_gradient, clone_kernel)
    374             K, K_gradient = kernel(self.X_train_, eval_gradient=True)
    375         else:
--> 376             K = kernel(self.X_train_)
    377 
    378         # Compute log-marginal-likelihood Z and also store some temporaries

~\anaconda3\lib\site-packages\sklearn\gaussian_process\kernels.py in __call__(self, X, Y, eval_gradient)
    944             )
    945         else:
--> 946             return self.k1(X, Y) * self.k2(X, Y)
    947 
    948     def diag(self, X):

~\anaconda3\lib\site-packages\sklearn\gaussian_process\kernels.py in __call__(self, X, Y, eval_gradient)
   1251             raise ValueError("Gradient can only be evaluated when Y is None.")
   1252 
-> 1253         K = np.full(
   1254             (_num_samples(X), _num_samples(Y)),
   1255             self.constant_value,

~\anaconda3\lib\site-packages\numpy\core\numeric.py in full(shape, fill_value, dtype, order, like)
    341         fill_value = asarray(fill_value)
    342         dtype = fill_value.dtype
--> 343     a = empty(shape, dtype, order)
    344     multiarray.copyto(a, fill_value, casting='unsafe')
    345     return a

MemoryError: Unable to allocate 4.66 TiB for an array with shape (800000, 800000) and data type float64

According to this post which is a similar situation, I changed the parameter copy_X_train = False but still get the error.

Is there any way to avoid this error? Or the Gaussian process classifier could not handle this large data set?

Sihang
  • 11
  • 2
  • Your RAM or GPU is not 4.66 TiB large to contain all data. Please Fit with small data. – Awal Mar 27 '23 at 18:36

0 Answers0