How do I write a uint64_t value with a logical type of DECIMAL(30, 0) and a physical type of FIXED_LEN_BYTE_ARRAY to a Parquet file? I describe my attempt below.
Because parquet::StreamWriter requires any FIXED_LEN_BYTE_ARRAY column to have a logical type of LogicalType::None (see the check in the StreamWriter source), we cannot use the << operator defined on StreamWriter. Therefore, we create our own class, MyStreamWriter, which accepts a uint64_t value, converts it to a FixedLenByteArray, and writes it via FixedLenByteArrayWriter.
When I try writing uint64_t val = 2 using this method, the value is written to the Parquet file as 2658455991569831745807614120560689152, which is clearly incorrect.
The binary representation of 2658455991569831745807614120560689152 is a 1 followed by 121 zeros (i.e. 2 * 2^120), so my hypothesis is that the write logic puts the correct bits in the most significant positions but somehow zero-pads the rest, as if the bytes of my Decimal128 end up in the wrong order. What am I doing wrong? Is there a more straightforward way of writing uint64_t values as decimals?
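A minimal standalone check of that hypothesis (a sketch, assuming a little-endian host; it is separate from my writer code below) is to print the 16 bytes that Decimal128::ToBytes() returns for the value 2:

#include <array>
#include <cstdint>
#include <cstdio>

#include "arrow/util/decimal.h"

int main() {
  // Same conversion my writer does: wrap 2 in a Decimal128 and grab its bytes.
  arrow::Decimal128 decimal(arrow::BasicDecimal128(0, 2));
  std::array<uint8_t, 16> bytes = decimal.ToBytes();
  for (uint8_t b : bytes) {
    std::printf("%02x ", b);
  }
  std::printf("\n");
  // If this prints 02 followed by fifteen 00 bytes, then a reader that treats
  // the buffer as most-significant-byte-first would see 2 * 2^120 = 2^121,
  // which matches the bogus value in my file.
  return 0;
}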
#include "arrow/util/decimal.h"
#include "arrow/io/file.h"
#include "arrow/type.h"
#include "parquet/schema.h"
#include "parquet/stream_writer.h"

// StreamWriter subclass that exposes a method for writing a uint64_t as a
// 16-byte decimal via the underlying FixedLenByteArrayWriter.
class MyStreamWriter : public parquet::StreamWriter {
 public:
  using parquet::StreamWriter::StreamWriter;

  MyStreamWriter& WriteDecimal(uint64_t v) {
    // Wrap the unsigned value in a Decimal128 (high bits = 0, low bits = v).
    arrow::Decimal128 decimal(arrow::BasicDecimal128(0, v));
    std::array<uint8_t, 16> decimalBytes = decimal.ToBytes();
    // FixedLenByteArray stores only a pointer to the 16 bytes, which are read
    // during Write().
    parquet::FixedLenByteArray flba(decimalBytes.data());
    Write<parquet::FixedLenByteArrayWriter>(flba);
    return *this;
  }
};

// Define our schema: one required DECIMAL(30, 0) column stored as a
// 16-byte FIXED_LEN_BYTE_ARRAY.
parquet::schema::NodeVector fields;
int32_t precision = 30;
int32_t scale = 0;
fields.push_back(parquet::schema::PrimitiveNode::Make(
    "my_decimal_col", parquet::Repetition::REQUIRED,
    parquet::LogicalType::Decimal(precision, scale),
    parquet::Type::FIXED_LEN_BYTE_ARRAY,
    arrow::Decimal128Type(precision, scale).byte_width()));
auto schema = std::static_pointer_cast<parquet::schema::GroupNode>(
    parquet::schema::GroupNode::Make("schema", parquet::Repetition::REPEATED,
                                     fields));

// Open the writer.
const std::shared_ptr<arrow::io::OutputStream> os =
    arrow::io::FileOutputStream::Open(_filepath).ValueOrDie();
auto writer = std::make_unique<MyStreamWriter>(
    parquet::ParquetFileWriter::Open(os, schema));

// Try writing an arbitrary value.
uint64_t val = 100;
writer->WriteDecimal(val);
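
If the byte-order hypothesis is right, I suspect the fix is something like the untested sketch below: the same WriteDecimal body, but with the 16 bytes reversed before they are handed to the FLBA writer. This assumes a little-endian host and that Parquet expects the unscaled decimal value as big-endian two's complement (my reading of the format spec); std::reverse needs <algorithm>. I would still like to know whether there is a more idiomatic way.

MyStreamWriter& WriteDecimal(uint64_t v) {
  arrow::Decimal128 decimal(arrow::BasicDecimal128(0, v));
  // ToBytes() appears to give the value in the host's native byte order
  // (little-endian here); reverse it so the most significant byte comes first.
  std::array<uint8_t, 16> decimalBytes = decimal.ToBytes();
  std::reverse(decimalBytes.begin(), decimalBytes.end());
  parquet::FixedLenByteArray flba(decimalBytes.data());
  Write<parquet::FixedLenByteArrayWriter>(flba);
  return *this;
}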