I am trying to build a model that predicts whether the salary in a job description is above or below the 75th percentile (1 if above, 0 if below). My data has about 250,000 rows, and it is very hard to tokenize all the text from the job descriptions. The code below seems to work fine, but it takes an unreasonable amount of time on anything beyond about 100 rows. I need to find a way to make it more efficient so that I can include more rows in the prediction. (A sketch of one idea I have been considering is at the end of the post.)
import random
import nltk
import pandas
import csv
import numpy as np
io = pandas.read_csv('Train_rev1.csv', sep=',', usecols=(2, 10), nrows=501)  # description text and salary columns
data = [np.array(x) for x in io.values]
random.shuffle(data)
# 60/40 train/test split
size = int(len(data) * 0.6)
test_set, train_set = data[size:], data[:size]
train_set = np.array(train_set)
test_set = np.array(test_set)
# 75th percentile of salary, computed separately for the train and test sets
x = train_set[:, 1]
Sal75 = np.percentile(x, 75)
y = test_set[:, 1]
Test75 = np.percentile(y, 75)
# Binarize the salary column: 1 if at or above the 75th percentile, else 0
for i in range(len(train_set[:, 1])):
    if train_set[i, 1] >= Sal75:
        train_set[i, 1] = 1
    else:
        train_set[i, 1] = 0
for i in range(len(test_set[:, 1])):
    if test_set[i, 1] >= Test75:
        test_set[i, 1] = 1
    else:
        test_set[i, 1] = 0
# Convert rows to (description, label) tuples
train_setT = [tuple(x) for x in train_set]
test_setT = [tuple(x) for x in test_set]
from nltk.tokenize import word_tokenize
# Vocabulary: every lower-cased token that appears in a training description
all_words = set(word.lower() for passage in train_setT for word in word_tokenize(passage[0]))
# One boolean feature per vocabulary word; the description is re-tokenized for
# every vocabulary word, which is where most of the time goes
t = [({word: (word in [w.lower() for w in word_tokenize(x[0])]) for word in all_words}, x[1]) for x in train_setT]
classifier = nltk.NaiveBayesClassifier.train(t)
all_words2 = set(word.lower() for passage in test_setT for word in word_tokenize(passage[0]))  # currently unused
# Test features reuse the training vocabulary so the classifier sees the same keys
tt = [({word: (word in [w.lower() for w in word_tokenize(x[0])]) for word in all_words}, x[1]) for x in test_setT]
print(nltk.classify.accuracy(classifier, tt))
classifier.show_most_informative_features(20)
# Collect the true labels and the classifier's predictions
testres = []
predres = []
for i in range(len(tt)):
    testres.append(tt[i][1])
for i in range(len(tt)):
    z = classifier.classify(tt[i][0])
    predres.append(z)
from nltk.metrics import ConfusionMatrix
cm = ConfusionMatrix(testres, predres)
print(cm)
The csv file was extracted from Kaggle (use Train_rev1.csv).
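
For reference, one direction I have been considering (I am not sure it is the right one) is to tokenize each description a single time and check the vocabulary against a set of its tokens, instead of calling word_tokenize once per vocabulary word inside the feature dictionary. A rough sketch of that idea is below; extract_features is just a name I made up for the sketch:

from nltk.tokenize import word_tokenize

def extract_features(description, vocabulary):
    # Tokenize and lower-case the description once, then answer
    # "does this word occur?" with set-membership lookups
    tokens = set(w.lower() for w in word_tokenize(description))
    return {word: (word in tokens) for word in vocabulary}

# e.g. t  = [(extract_features(text, all_words), label) for (text, label) in train_setT]
#      tt = [(extract_features(text, all_words), label) for (text, label) in test_setT]

Even with this, building a feature dict over the full vocabulary for 250,000 rows will probably still be heavy, so any pointers to a better overall approach would be appreciated.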