For a given path, i process many GigaBytes of files inside, and yield dataframes for every processed one. For every dataframe that is yield, which includes two string columns of varying size, I want to dump them to disk using the very efficient HDF5 format. The error is raised when the HDFStore.append procedure is called, for the 4th or 5th iteration.
I use the following routine(simplified) to build the dataframes:
def build_data_frames(path):
data = df({'headline': [],
'content': [],
'publication': [],
'file_ref': []},
columns=['publication','file_ref','headline','content'])
for curdir, subdirs, filenames in os.walk(path):
for file in filenames:
if (zipfile.is_zipfile(os.path.join(curdir, file))):
with zf(os.path.join(curdir, file), 'r') as arch:
for arch_file_name in arch.namelist():
if re.search('A[r|d]\d+.xml', arch_file_name) is not None:
xml_file_ref = arch.open(arch_file_name, 'r')
xml_file = xml_file_ref.read()
metadata = XML2MetaData(xml_file)
headlineTokens, contentTokens = XML2TokensParser(xml_file)
rows= [{'headline': " ".join(headlineTokens),
'content': " ".join(contentTokens)}]
rows[0].update(metadata)
data = data.append(df(rows,
columns=['publication',
'file_ref',
'headline',
'content']),
ignore_index=True)
arch.close()
yield data
Then I use the following method to write these dataframes to disk:
def extract_data(path):
hdf_fname = extract_name(path)
hdf_fname += ".h5"
data_store = HDFStore(hdf_fname)
for dataframe in build_data_frames(path):
data_store.append('df', dataframe, data_columns=True)
## passing min_itemsize doesn't work either
## data_store.append('df', dataframe, min_itemsize=8000)
## trying the "alternative" command didn't help
## dataframe.to_hdf(hdf_fname, 'df', format='table', append=True,
## min_itemsize=80000)
data_store.close()
->
%time load_data(publications_path)
And the ValueError I get is:
...
ValueError: Trying to store a string with len [5761] in [values_block_0]
column but this column has a limit of [4430]!
Consider using min_itemsize to preset the sizes on these columns
I tried all the options, went through all the documentation necessary for this task, and tried all the tricks I saw on the Internet. Yet, no idea why it happens.
I use pandas ver: 0.17.0
Appreciate your help very much!