0
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt 
import seaborn as sn
from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

#################### IMPORT THE DATABASE


columns = ['Age','Workclass','fnlgwt','Education','Education num','Marital Status',
       'Occupation','Relationship','Race','Sex','Capital Gain','Capital Loss',
       'Hours/Week','Native country','Income']


train = pd.read_csv('adult-training.csv', names=columns)
test = pd.read_csv('adult-test.csv', names=columns, skiprows=1)

train.info()
############## Clean the Data

df = pd.concat([train, test], axis=0)
dff=df
k=df





df['Income'] = df['Income'].apply(lambda x: 1 if x==' >50K' else 0)

for col in df.columns:
    if type(df[col][0]) == str:
        print("Working on " + col)
        df[col] = df[col].apply(lambda val: val.replace(" ",""))

####################   REMOVE UNKNOWNS

df.replace(' ?', np.nan, inplace=True)###making copy for visulization

#################### Converting to int


df = pd.concat([df,         pd.get_dummies(df['Workclass'],prefix='Workclass',prefix_sep=':')], axis=1)
df.drop('Workclass',axis=1,inplace=True)

df = pd.concat([df, pd.get_dummies(df['Marital Status'],prefix='Marital Status',prefix_sep=':')], axis=1)
df.drop('Marital Status',axis=1,inplace=True)

df = pd.concat([df,     pd.get_dummies(df['Occupation'],prefix='Occupation',prefix_sep=':')], axis=1)
df.drop('Occupation',axis=1,inplace=True)

df = pd.concat([df,     pd.get_dummies(df['Relationship'],prefix='Relationship',prefix_sep=':')], axis=1)
df.drop('Relationship',axis=1,inplace=True)

df = pd.concat([df, pd.get_dummies(df['Race'],prefix='Race',prefix_sep=':')], axis=1)
df.drop('Race',axis=1,inplace=True)

df = pd.concat([df,     pd.get_dummies(df['Sex'],prefix='Sex',prefix_sep=':')], axis=1)
df.drop('Sex',axis=1,inplace=True)

df = pd.concat([df, pd.get_dummies(df['Native country'],prefix='Native country',prefix_sep=':')], axis=1)
df.drop('Native country',axis=1,inplace=True)

df.drop('Education', axis=1,inplace=True)

df.head()

########### Preparing data for Training and testing 

X = np.array(df.drop(['Income'], 1))
y = np.array(df['Income'])
X = preprocessing.scale(X)
y = np.array(df['Income'])


X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

    #################################CLASSIFICATION##############################    #######
#########LOGISTIC     REGRESSION######################################################
from sklearn.metrics import accuracy_score
lrn = LogisticRegression(penalty = 'l1', C = .001, class_weight='balanced')

lrn.fit(X_train, y_train) # This line is giving me error. 

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

Please help me solve this error, I need help.

desertnaut
  • 57,590
  • 26
  • 140
  • 166
Osro_db40
  • 23
  • 9

1 Answers1

0

I suppose your classes are unbalanced and when you use:

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

y_train get only one class type. Try to split train-test with respect to class balances. StratifiedKFold may help for example or you need to do it manually.

avchauzov
  • 1,007
  • 1
  • 8
  • 13