I have been reading the source code of leveldb. I found that when the size of a data block reaches 4KB, leveldb flushes the data block and calls `FilterBlockBuilder::StartBlock()` to generate a filter.
// Excerpt of TableBuilder::Add: only the size check at the end is shown;
// the earlier part of the body (including where `r` comes from) is elided.
void TableBuilder::Add(const Slice& key, const Slice& value) {
// ...
// Ask the data block builder for its current size estimate.
const size_t estimated_block_size = r->data_block.CurrentSizeEstimate();
// Once the estimate reaches the configured options.block_size, flush.
if (estimated_block_size >= r->options.block_size) {
Flush();
}
}
// Excerpt of TableBuilder::Flush: only the filter-block notification is
// shown; the earlier part of the body (including where `r` comes from) is
// elided.
void TableBuilder::Flush() {
// ...
// A filter block builder is optional; when one is present, pass it
// r->offset via StartBlock().
// NOTE(review): presumably r->offset is the file offset at which the next
// data block will start -- confirm against the elided code.
if (r->filter_block != nullptr) {
r->filter_block->StartBlock(r->offset);
}
}
`FilterBlockBuilder::StartBlock()` calls `GenerateFilter()` `block_offset / 2KB` times. For example, when `block_offset` is 4KB, it calls `GenerateFilter()` 2 times.
However, after the first call to `GenerateFilter()`, `keys_` and `start_` are both empty. As a result, the second call only generates an empty filter and adds the same filter offset to `filter_offsets_`.
Does leveldb generate 2 Bloom filters, each covering a 2KB part of the data block, or 1 Bloom filter for the whole 4KB data block plus another, empty filter?
// Maps block_offset to a filter index (block_offset / kFilterBase) and keeps
// calling GenerateFilter() until filter_offsets_ holds that many entries.
// The index must never be smaller than the number of entries already
// recorded; this is checked with an assert.
void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
  const uint64_t target_index = block_offset / kFilterBase;
  assert(target_index >= filter_offsets_.size());
  for (;;) {
    if (filter_offsets_.size() >= target_index) {
      break;  // Already have one offset per filter slot up to the target.
    }
    GenerateFilter();
  }
}
// Appends one entry to filter_offsets_ and, when keys have been buffered,
// builds a filter over them into result_ and resets the key buffers.
void FilterBlockBuilder::GenerateFilter() {
  const size_t key_count = start_.size();

  // Both the empty and the non-empty case record the current end of result_
  // as this filter's offset; nothing before CreateFilter() modifies result_.
  filter_offsets_.push_back(result_.size());

  // ****** My Comment: will the second call in StartBlock always trigger this?
  if (key_count == 0) {
    // No buffered keys: only the offset is recorded, no filter data is added.
    return;
  }

  // Append a sentinel start so every key's length is start_[k + 1] - start_[k].
  start_.push_back(keys_.size());

  // Rebuild the individual key slices from the flattened keys_ buffer.
  tmp_keys_.resize(key_count);
  for (size_t k = 0; k < key_count; ++k) {
    const size_t key_len = start_[k + 1] - start_[k];
    const char* key_begin = keys_.data() + start_[k];
    tmp_keys_[k] = Slice(key_begin, key_len);
  }

  // Let the policy append the filter for this key set to result_.
  policy_->CreateFilter(&tmp_keys_[0], static_cast<int>(key_count), &result_);

  // Reset the buffers so the next filter starts with no keys.
  tmp_keys_.clear();
  keys_.clear();
  start_.clear();
}