I've gone through Scikit-SVM tutorial, and written the code to train and test. But I'm facing an issue with prediction, where it says, 'shape should be equal to training shape'. Here is the code below.
EDIT1: Sample Data
ERROR_DESC CLASSIFICATION_LABEL
ERROR manager.SqlManager: Error executing statement: java.sql.SQLException: ORA-01017: invalid username/password; logon denied at oracle.jdbc.driver.T4CTTIoer.processError(T4CTTIoer.java:447) at oracle.jdbc.driver.T4CTTIoer.processError(T4CTTIoer.java:389) at oracle.jdbc.driver.T4CTTIoer.processError(T4CTTIoer.java:382) at oracle.jdbc.driver.T4CTTIfun.processError(T4CTTIfun.java:675) at oracle.jdbc.driver.T4CTTIoauthenticate.processError(T4CTTIoauthenticate.java:448) at oracle.jdbc.driver.T4CTTIfun.receive(T4CTTIfun.java:513) -- ERROR tool.ImportTool: Encountered IOException running import job: java.io.IOException: No columns to generate for ClassWriter at org.apache.sqoop.orm.ClassWriter.generate(ClassWriter.java:1095),INCORRECT_CREDENTIALS-Database-RAISE_SERVICENOW_DB_CREDENTIALS
A client error (ThrottlingException) occurred when calling the DescribeCluster operation: Rate exceeded fetching DNS name -- ERROR manager.SqlManager: Error executing statement: java.sql.SQLRecoverableException: IO Error: The Network Adapter could not establish the connection at oracle.jdbc.driver.T4CConnection.logon(T4CConnection.java:489) -- ERROR tool.ImportTool: Encountered IOException running import job: java.io.IOException: No columns to generate for ClassWriter at org.apache.sqoop.orm.ClassWriter.generate(ClassWriter.java:1095), NETWORK_ERROR-Database-RAISE_SERVICENOW_DB_CONNECTION
I also found a similar question on SO: Link I tried to use transform, but it throws a different error.
import pandas as pd
# data paths
data_in = '../data/input/file.csv'
df_data = pd.read_csv(data_in)
# lower case all columns for uniformity
df_data.columns = map(str.lower, df_data.columns)
# lower case all data for uniformity
df_data = df_data.apply(lambda x: x.astype(str).str.lower())
labels = df_data['classification_label'].unique()
label_map = {}
i = 1
for label in labels:
label_map[label] = i
i += 1
# apply map to classification_label column
# df_data['classification_label'] = df_data['classification_label'].map(lambda s: label_map.get(s) if s in label_map else s)
# select features and labels
df_final = df_data[['error_desc', 'classification_label']]
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer()
X = v.fit_transform(df_final['error_desc'])
y = df_final['classification_label']
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
from sklearn.svm import SVC
def train_svm(X, y):
"""
Create and train the Support Vector Machine.
"""
svm = SVC(C=1000000.0, gamma='auto', kernel='rbf')
svm.fit(X, y)
return svm
svm = train_svm(X_train, y_train)
from sklearn.metrics import confusion_matrix
# Make an array of predictions on the test set
pred = svm.predict(X_test)
# Output the hit-rate and the confusion matrix for each model
print(svm.score(X_test, y_test))
print(confusion_matrix(pred, y_test))
0.777777777778
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 2 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 2 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 1 0 0 0]
[0 0 0 0 3 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 0 0 0 0]
[1 0 0 0 0 1 0 0 1 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 1 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 3 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 1]]
pred_x = """ERROR manager.SqlManager: Error executing statement: java.sql.SQLException: ORA-01017: invalid username/password; logon denied at oracle.jdbc.driver.T4CTTIoer.processError(T4CTTIoer.java:447) at oracle.jdbc.driver.T4CTTIoer.processError(T4CTTIoer.java:389) at oracle.jdbc.driver.T4CTTIoer.processError(T4CTTIoer.java:382) at oracle.jdbc.driver.T4CTTIfun.processError(T4CTTIfun.java:675) at oracle.jdbc.driver.T4CTTIoauthenticate.processError(T4CTTIoauthenticate.java:448) at oracle.jdbc.driver.T4CTTIfun.receive(T4CTTIfun.java:513) -- ERROR tool.ImportTool: Encountered IOException running import job: java.io.IOException: No columns to generate for ClassWriter at org.apache.sqoop.orm.ClassWriter.generate(ClassWriter.java:1095)"""
pred_x_vector = TfidfVectorizer().fit_transform([pred_x])
svm.predict(pred_x_vector)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-86-130bf7f79131> in <module>()
----> 1 svm.predict(pred_x_vector)
/Users/userOne/anaconda/lib/python2.7/site-packages/sklearn/svm/base.pyc in predict(self, X)
571 Class labels for samples in X.
572 """
--> 573 y = super(BaseSVC, self).predict(X)
574 return self.classes_.take(np.asarray(y, dtype=np.intp))
575
/Users/userOne/anaconda/lib/python2.7/site-packages/sklearn/svm/base.pyc in predict(self, X)
308 y_pred : array, shape (n_samples,)
309 """
--> 310 X = self._validate_for_predict(X)
311 predict = self._sparse_predict if self._sparse else self._dense_predict
312 return predict(X)
/Users/userOne/anaconda/lib/python2.7/site-packages/sklearn/svm/base.pyc in _validate_for_predict(self, X)
477 raise ValueError("X.shape[1] = %d should be equal to %d, "
478 "the number of features at training time" %
--> 479 (n_features, self.shape_fit_[1]))
480 return X
481
ValueError: X.shape[1] = 49 should be equal to 554, the number of features at training time