import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
#################### IMPORT THE DATABASE
columns = ['Age','Workclass','fnlgwt','Education','Education num','Marital Status',
'Occupation','Relationship','Race','Sex','Capital Gain','Capital Loss',
'Hours/Week','Native country','Income']
train = pd.read_csv('adult-training.csv', names=columns)
test = pd.read_csv('adult-test.csv', names=columns, skiprows=1)
train.info()
############## Clean the Data
df = pd.concat([train, test], axis=0)
dff=df
k=df
df['Income'] = df['Income'].apply(lambda x: 1 if x==' >50K' else 0)
for col in df.columns:
if type(df[col][0]) == str:
print("Working on " + col)
df[col] = df[col].apply(lambda val: val.replace(" ",""))
#################### REMOVE UNKNOWNS
df.replace(' ?', np.nan, inplace=True)###making copy for visulization
#################### Converting to int
df = pd.concat([df, pd.get_dummies(df['Workclass'],prefix='Workclass',prefix_sep=':')], axis=1)
df.drop('Workclass',axis=1,inplace=True)
df = pd.concat([df, pd.get_dummies(df['Marital Status'],prefix='Marital Status',prefix_sep=':')], axis=1)
df.drop('Marital Status',axis=1,inplace=True)
df = pd.concat([df, pd.get_dummies(df['Occupation'],prefix='Occupation',prefix_sep=':')], axis=1)
df.drop('Occupation',axis=1,inplace=True)
df = pd.concat([df, pd.get_dummies(df['Relationship'],prefix='Relationship',prefix_sep=':')], axis=1)
df.drop('Relationship',axis=1,inplace=True)
df = pd.concat([df, pd.get_dummies(df['Race'],prefix='Race',prefix_sep=':')], axis=1)
df.drop('Race',axis=1,inplace=True)
df = pd.concat([df, pd.get_dummies(df['Sex'],prefix='Sex',prefix_sep=':')], axis=1)
df.drop('Sex',axis=1,inplace=True)
df = pd.concat([df, pd.get_dummies(df['Native country'],prefix='Native country',prefix_sep=':')], axis=1)
df.drop('Native country',axis=1,inplace=True)
df.drop('Education', axis=1,inplace=True)
df.head()
########### Preparing data for Training and testing
X = np.array(df.drop(['Income'], 1))
y = np.array(df['Income'])
X = preprocessing.scale(X)
y = np.array(df['Income'])
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
#################################CLASSIFICATION############################## #######
#########LOGISTIC REGRESSION######################################################
from sklearn.metrics import accuracy_score
lrn = LogisticRegression(penalty = 'l1', C = .001, class_weight='balanced')
lrn.fit(X_train, y_train) # This line is giving me error.
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0
Please help me solve this error, I need help.