I'd like to convert CSV files into HDF5 format for use in Caffe training. The CSV file is 80 GB, and my code raises a MemoryError even though the machine has 128 GB of RAM. Is it possible to improve my code so that it processes the data chunk by chunk instead of all at once? Below is my code; the MemoryError occurs at the np.array call.
if '__main__' == __name__:
print 'Loading...'
day = sys.argv[1]
file = day+".xls"
data = pd.read_csv(file, header=None)
print data.iloc[0,1:5]
y = np.array(data.iloc[:,0], np.float32)
x = np.array(data.iloc[:,1:], np.float32)
patch = 100000
dirname = "hdf5_" + day
os.mkdir(dirname)
filename = dirname+"/hdf5.txt"
modelname = dirname+"/data"
file_w = open(filename, 'w')
for idx in range(int(math.ceil(y.shape[0]*1.0/patch))):
with h5py.File(modelname + str(idx) + '.h5', 'w') as f:
d_begin = idx*patch
d_end = min(y.shape[0], (idx+1)*patch)
f['data'] = x[d_begin:d_end,:]
f['label'] = y[d_begin:d_end]
file_w.write(modelname + str(idx) + '.h5\n')
file_w.close()