I am trying to read an hdf5 file containing variable-length vectors of doubles in C++. I used the following code to create the hdf5 file. It contains one dataset called "test" containing 100 rows of varying lengths. I had to make a couple of changes to the code in the link, so for convenience here is the exact code I used to write the data to hdf5:
#include <iostream>
#include <string>
#include <H5Cpp.h>
#include <vector>
#include <random>
const hsize_t n_dims = 1;
const hsize_t n_rows = 100;
const std::string dataset_name = "test";
int main () {
H5::H5File file("vlen_cpp.hdf5", H5F_ACC_TRUNC);
H5::DataSpace dataspace(n_dims, &n_rows);
// target dtype for the file
auto item_type = H5::PredType::NATIVE_DOUBLE;
auto file_type = H5::VarLenType(&item_type);
// dtype of the generated data
auto mem_type = H5::VarLenType(&item_type);
H5::DataSet dataset = file.createDataSet(dataset_name, file_type, dataspace);
std::vector<std::vector<double>> data;
data.reserve(n_rows);
// this structure stores length of each varlen row and a pointer to
// the actual data
std::vector<hvl_t> varlen_spec(n_rows);
std::mt19937 gen;
std::normal_distribution<double> normal(0.0, 1.0);
std::poisson_distribution<hsize_t> poisson(20);
for (hsize_t idx=0; idx < n_rows; idx++) {
data.emplace_back();
hsize_t size = poisson(gen);
data.at(idx).reserve(size);
varlen_spec.at(idx).len = size;
varlen_spec.at(idx).p = (void*) &data.at(idx).front();
for (hsize_t i = 0; i < size; i++) {
data.at(idx).push_back(normal(gen));
}
}
dataset.write(&varlen_spec.front(), mem_type);
return 0;
}
I am very new to C++ and my issue is trying to read the data back out of this file in C++. I tried to mimic what I would do in Python, but didn't have any luck. In Python, I would do this:
import h5py
import numpy as np
data = h5py.File("vlen_cpp.hdf5", "r")
i = 0 # This is the row I would want to read
arr = data["test"][i] # <-- This is the simplest way.
# Now trying to mimic something closer to C++
did = data["test"].id
dataspace = did.get_space()
dataspace.select_hyperslab(start=(i, ), count=(1, ))
memspace = h5py.h5s.create_simple(dims_tpl=(1, ))
memspace.select_hyperslab(start=(0, ), count=(1, ))
arr = np.zeros((1, ), dtype=object)
did.read(memspace, dataspace, arr)
print(arr) # This gives back the correct data
The python code seems to works fine, so I tried to mimic those steps in C++:
#include <H5Cpp.h>
#include <string>
#include <vector>
#include <stdio.h>
int main(int argc, char **argv) {
std::string filename = argv[1];
// memtype of the file
auto itemType = H5::PredType::NATIVE_DOUBLE;
auto memType = H5::VarLenType(&itemType);
// get dataspace
H5::H5File file(filename, H5F_ACC_RDONLY);
H5::DataSet dataset = file.openDataSet("test");
H5::DataSpace dataspace = dataset.getSpace();
// get the size of the dataset
hsize_t rank;
hsize_t dims[1];
rank = dataspace.getSimpleExtentDims(dims); // rank = 1
std::cout << "Data size: "<< dims[0] << std::endl; // this is the correct number of values
// create memspace
hsize_t memDims[1] = {1};
H5::DataSpace memspace(rank, memDims);
// container to store read data
std::vector<std::vector<double>> data;
// Select hyperslabs
hsize_t dataCount[1] = {1};
hsize_t dataOffset[1] = {0}; // this should be i
hsize_t memCount[1] = {1};
hsize_t memOffset[1] = {0};
dataspace.selectHyperslab(H5S_SELECT_SET, dataCount, dataOffset);
memspace.selectHyperslab(H5S_SELECT_SET, memCount, memOffset);
// vector to store read data
std::vector<double> temp;
temp.reserve(20);
dataset.read(temp.data(), memType, memspace, dataspace);
for (int i = 0; i < temp.size(); i++) {
std::cout << temp[i] << ", ";
}
std::cout << "\n";
return 0;
}
Nothing crashes when I run the C++ program, and the correct number of rows in the "test" dataset is printed (100), but the dataset.read() step isn't working: the first row isn't being read into the vector I want it to be read into (temp). I would greatly appreciate if someone could let me know what I'm doing wrong. Thanks so much.
My goal is to eventually read all 100 rows in the dataset in a loop (placing each row of data into the std:vector temp) and store each one in the std::vectorstd::vector<double> called data. But for now I'm just trying to make sure I can even read the first row.
EDIT: link to hdf5 file
"test" dataset looks like this:
[ 0.16371168 -0.21425339 0.29859526 -0.82794418 0.01021543 1.05546644
-0.546841 1.17456768 0.66068215 -1.04944273 1.48596426 -0.62527598
-2.55912244 -0.82908105 -0.53978052 -0.88870719]
[ 0.33958656 -0.48258915 2.10885699 -0.12130623 -0.2873894 -0.37100313
-1.05934898 -2.3014427 1.45502412 -0.06152739 0.92532768 1.35432642
1.51560926 -0.24327452 1.00886476 0.19749707 0.43894484 0.4394992
-0.12814881]
[ 0.64574273 0.14938582 -0.10369248 1.53727461 0.62404949 1.07824824
1.17066933 1.17196281 -2.05005927 0.13639514 -1.45473056 -1.71462623
-1.11552074 -1.73985207 1.12422121 -1.58694009]
...
EDIT 2:
I've additionally tried without any luck to read the data into (array, armadillo vector, eigen vectorXd). The program does not crash, but what is read into the containers is garbage:
#include <H5Cpp.h>
#include <string>
#include <vector>
#include <stdio.h>
#include <Eigen/Dense>
#include <Eigen/Core>
#include <armadillo>
int main(int argc, char **argv) {
std::string filename = argv[1];
// memtype of the file
auto itemType = H5::PredType::NATIVE_DOUBLE;
auto memType = H5::VarLenType(&itemType);
// get dataspace
H5::H5File file(filename, H5F_ACC_RDONLY);
H5::DataSet dataset = file.openDataSet("test");
H5::DataSpace dataspace = dataset.getSpace();
// get the size of the dataset
hsize_t rank;
hsize_t dims[1];
rank = dataspace.getSimpleExtentDims(dims); // rank = 1
std::cout << "Data size: "<< dims[0] << std::endl; // this is the correct number of values
std::cout << "Data rank: "<< rank << std::endl; // this is the correct rank
// create memspace
hsize_t memDims[1] = {1};
H5::DataSpace memspace(rank, memDims);
// Select hyperslabs
hsize_t dataCount[1] = {1};
hsize_t dataOffset[1] = {0}; // this would be i if reading in a loop
hsize_t memCount[1] = {1};
hsize_t memOffset[1] = {0};
dataspace.selectHyperslab(H5S_SELECT_SET, dataCount, dataOffset);
memspace.selectHyperslab(H5S_SELECT_SET, memCount, memOffset);
// Create storage to hold read data
int i;
int NX = 20;
double data_out[NX];
for (i = 0; i < NX; i++)
data_out[i] = 0;
arma::vec temp(20);
Eigen::VectorXd temp2(20);
// Read data into data_out (array)
dataset.read(data_out, memType, memspace, dataspace);
std::cout << "data_out: " << "\n";
for (i = 0; i < NX; i++)
std::cout << data_out[i] << " ";
std::cout << std::endl;
// Read data into temp (arma vec)
dataset.read(temp.memptr(), memType, memspace, dataspace);
std::cout << "arma vec: " << "\n";
std::cout << temp << std::endl;
// Read data into temp (eigen vec)
dataset.read(temp2.data(), memType, memspace, dataspace);
std::cout << "eigen vec: " << "\n";
std::cout << temp2 << std::endl;
return 0;
}
(ONE) SOLUTION: After struggling with this a lot, I was able to get a solution working, though admittedly I'm too new to C++ to really understand why this works why and the previous attempts didn't:
#include <H5Cpp.h>
#include <string>
#include <vector>
#include <stdio.h>
int main(int argc, char **argv) {
std::string filename = argv[1];
// Set memtype of the file
auto itemType = H5::PredType::NATIVE_DOUBLE;
auto memType = H5::VarLenType(&itemType);
// Get dataspace
H5::H5File file(filename, H5F_ACC_RDONLY);
H5::DataSet dataset = file.openDataSet("test");
H5::DataSpace dataspace = dataset.getSpace();
// Get the size of the dataset
hsize_t rank;
hsize_t dims[1];
rank = dataspace.getSimpleExtentDims(dims); // rank = 1
std::cout << "Data size: "<< dims[0] << std::endl; // this is the correct number of values
std::cout << "Data rank: "<< rank << std::endl; // this is the correct rank
// Create memspace
hsize_t memDims[1] = {1};
H5::DataSpace memspace(rank, memDims);
// Initialize hyperslabs
hsize_t dataCount[1];
hsize_t dataOffset[1];
hsize_t memCount[1];
hsize_t memOffset[1];
// Create storage to hold read data
hvl_t *rdata = new hvl_t[1];
std::vector<std::vector<double>> dataOut;
for (hsize_t i = 0; i < dims[0]; i++) {
// Select hyperslabs
dataCount[0] = 1;
dataOffset[0] = i;
memCount[0] = 1;
memOffset[0] = 0;
dataspace.selectHyperslab(H5S_SELECT_SET, dataCount, dataOffset);
memspace.selectHyperslab(H5S_SELECT_SET, memCount, memOffset);
// Read out the data
dataset.read(rdata, memType, memspace, dataspace);
double* ptr = (double*)rdata[0].p;
std::vector<double> thisRow;
for (int j = 0; j < rdata[0].len; j++) {
double* val = (double*)&ptr[j];
thisRow.push_back(*val);
}
dataOut.push_back(thisRow);
}
// Confirm data read out properly
for (int i = 0; i < dataOut.size(); i++) {
std::cout << "Row " << i << ":\n";
for (int j = 0; j < dataOut[i].size(); j++) {
std::cout << dataOut[i][j] << " ";
}
std::cout << "\n";
}
return 0;
}
If anyone knows a more efficient way that doesn't involve looping over the elements of each row (i.e. pull out an entire row in one go) that would be really helpful, but for now this works fine for me.