The folks at TensorFlow have created an excellent tutorial that does just this. It covers how to read the census data from csv, convert it into tensors, and fit and evaluate a machine learning model using the high-level estimator API.
However, I did get an error when I tried using the urllib
function, and I modified the code slightly so that the data is read directly using pandas
.
Original Code
import tempfile
import urllib
train_file = tempfile.NamedTemporaryFile()
test_file = tempfile.NamedTemporaryFile()
urllib.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", train_file.name)
urllib.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", test_file.name)
import pandas as pd
CSV_COLUMNS = [
"age", "workclass", "fnlwgt", "education", "education_num",
"marital_status", "occupation", "relationship", "race", "gender",
"capital_gain", "capital_loss", "hours_per_week", "native_country",
"income_bracket"]
df_train = pd.read_csv(train_file.name, names=CSV_COLUMNS, skipinitialspace=True)
df_test = pd.read_csv(test_file.name, names=CSV_COLUMNS, skipinitialspace=True, skiprows=1)
Modified code
import pandas as pd
COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
"marital_status", "occupation", "relationship", "race", "gender",
"capital_gain", "capital_loss", "hours_per_week", "native_country",
"income_bracket"]
df_train = pd.read_csv('http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data'
, names=COLUMNS
, skipinitialspace=True)
df_test = pd.read_csv('http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test'
, names=COLUMNS
, skipinitialspace=True
, skiprows=1)