in db/compaction/compaction_job.cc [1751:2087]
Status CompactionJob::FinishCompactionOutputFile(
const Status& input_status, SubcompactionState* sub_compact,
CompactionRangeDelAggregator* range_del_agg,
CompactionIterationStats* range_del_out_stats,
const Slice* next_table_min_key /* = nullptr */) {
AutoThreadOperationStageUpdater stage_updater(
ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
assert(sub_compact != nullptr);
assert(sub_compact->outfile);
assert(sub_compact->builder != nullptr);
assert(sub_compact->current_output() != nullptr);
uint64_t output_number = sub_compact->current_output()->meta.fd.GetNumber();
assert(output_number != 0);
ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
const Comparator* ucmp = cfd->user_comparator();
std::string file_checksum = kUnknownFileChecksum;
std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
// Check for iterator errors
Status s = input_status;
auto meta = &sub_compact->current_output()->meta;
assert(meta != nullptr);
if (s.ok()) {
Slice lower_bound_guard, upper_bound_guard;
std::string smallest_user_key;
const Slice *lower_bound, *upper_bound;
bool lower_bound_from_sub_compact = false;
if (sub_compact->outputs.size() == 1) {
// For the first output table, include range tombstones before the min key
// but after the subcompaction boundary.
lower_bound = sub_compact->start;
lower_bound_from_sub_compact = true;
} else if (meta->smallest.size() > 0) {
// For subsequent output tables, only include range tombstones from min
// key onwards since the previous file was extended to contain range
// tombstones falling before min key.
smallest_user_key = meta->smallest.user_key().ToString(false /*hex*/);
lower_bound_guard = Slice(smallest_user_key);
lower_bound = &lower_bound_guard;
} else {
lower_bound = nullptr;
}
if (next_table_min_key != nullptr) {
// This may be the last file in the subcompaction in some cases, so we
// need to compare the end key of subcompaction with the next file start
// key. When the end key is chosen by the subcompaction, we know that
// it must be the biggest key in output file. Therefore, it is safe to
// use the smaller key as the upper bound of the output file, to ensure
// that there is no overlapping between different output files.
upper_bound_guard = ExtractUserKey(*next_table_min_key);
if (sub_compact->end != nullptr &&
ucmp->Compare(upper_bound_guard, *sub_compact->end) >= 0) {
upper_bound = sub_compact->end;
} else {
upper_bound = &upper_bound_guard;
}
} else {
// This is the last file in the subcompaction, so extend until the
// subcompaction ends.
upper_bound = sub_compact->end;
}
auto earliest_snapshot = kMaxSequenceNumber;
if (existing_snapshots_.size() > 0) {
earliest_snapshot = existing_snapshots_[0];
}
bool has_overlapping_endpoints;
if (upper_bound != nullptr && meta->largest.size() > 0) {
has_overlapping_endpoints =
ucmp->Compare(meta->largest.user_key(), *upper_bound) == 0;
} else {
has_overlapping_endpoints = false;
}
// The end key of the subcompaction must be bigger or equal to the upper
// bound. If the end of subcompaction is null or the upper bound is null,
// it means that this file is the last file in the compaction. So there
// will be no overlapping between this file and others.
assert(sub_compact->end == nullptr ||
upper_bound == nullptr ||
ucmp->Compare(*upper_bound , *sub_compact->end) <= 0);
auto it = range_del_agg->NewIterator(lower_bound, upper_bound,
has_overlapping_endpoints);
// Position the range tombstone output iterator. There may be tombstone
// fragments that are entirely out of range, so make sure that we do not
// include those.
if (lower_bound != nullptr) {
it->Seek(*lower_bound);
} else {
it->SeekToFirst();
}
TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1");
for (; it->Valid(); it->Next()) {
auto tombstone = it->Tombstone();
if (upper_bound != nullptr) {
int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_);
if ((has_overlapping_endpoints && cmp < 0) ||
(!has_overlapping_endpoints && cmp <= 0)) {
// Tombstones starting after upper_bound only need to be included in
// the next table. If the current SST ends before upper_bound, i.e.,
// `has_overlapping_endpoints == false`, we can also skip over range
// tombstones that start exactly at upper_bound. Such range tombstones
// will be included in the next file and are not relevant to the point
// keys or endpoints of the current file.
break;
}
}
if (bottommost_level_ && tombstone.seq_ <= earliest_snapshot) {
// TODO(andrewkr): tombstones that span multiple output files are
// counted for each compaction output file, so lots of double counting.
range_del_out_stats->num_range_del_drop_obsolete++;
range_del_out_stats->num_record_drop_obsolete++;
continue;
}
auto kv = tombstone.Serialize();
assert(lower_bound == nullptr ||
ucmp->Compare(*lower_bound, kv.second) < 0);
// Range tombstone is not supported by output validator yet.
sub_compact->builder->Add(kv.first.Encode(), kv.second);
InternalKey smallest_candidate = std::move(kv.first);
if (lower_bound != nullptr &&
ucmp->Compare(smallest_candidate.user_key(), *lower_bound) <= 0) {
// Pretend the smallest key has the same user key as lower_bound
// (the max key in the previous table or subcompaction) in order for
// files to appear key-space partitioned.
//
// When lower_bound is chosen by a subcompaction, we know that
// subcompactions over smaller keys cannot contain any keys at
// lower_bound. We also know that smaller subcompactions exist, because
// otherwise the subcompaction woud be unbounded on the left. As a
// result, we know that no other files on the output level will contain
// actual keys at lower_bound (an output file may have a largest key of
// lower_bound@kMaxSequenceNumber, but this only indicates a large range
// tombstone was truncated). Therefore, it is safe to use the
// tombstone's sequence number, to ensure that keys at lower_bound at
// lower levels are covered by truncated tombstones.
//
// If lower_bound was chosen by the smallest data key in the file,
// choose lowest seqnum so this file's smallest internal key comes after
// the previous file's largest. The fake seqnum is OK because the read
// path's file-picking code only considers user key.
smallest_candidate = InternalKey(
*lower_bound, lower_bound_from_sub_compact ? tombstone.seq_ : 0,
kTypeRangeDeletion);
}
InternalKey largest_candidate = tombstone.SerializeEndKey();
if (upper_bound != nullptr &&
ucmp->Compare(*upper_bound, largest_candidate.user_key()) <= 0) {
// Pretend the largest key has the same user key as upper_bound (the
// min key in the following table or subcompaction) in order for files
// to appear key-space partitioned.
//
// Choose highest seqnum so this file's largest internal key comes
// before the next file's/subcompaction's smallest. The fake seqnum is
// OK because the read path's file-picking code only considers the user
// key portion.
//
// Note Seek() also creates InternalKey with (user_key,
// kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
// kTypeRangeDeletion (0xF), so the range tombstone comes before the
// Seek() key in InternalKey's ordering. So Seek() will look in the
// next file for the user key.
largest_candidate =
InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
}
#ifndef NDEBUG
SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
if (meta->smallest.size() > 0) {
smallest_ikey_seqnum = GetInternalKeySeqno(meta->smallest.Encode());
}
#endif
meta->UpdateBoundariesForRange(smallest_candidate, largest_candidate,
tombstone.seq_,
cfd->internal_comparator());
// The smallest key in a file is used for range tombstone truncation, so
// it cannot have a seqnum of 0 (unless the smallest data key in a file
// has a seqnum of 0). Otherwise, the truncated tombstone may expose
// deleted keys at lower levels.
assert(smallest_ikey_seqnum == 0 ||
ExtractInternalKeyFooter(meta->smallest.Encode()) !=
PackSequenceAndType(0, kTypeRangeDeletion));
}
}
const uint64_t current_entries = sub_compact->builder->NumEntries();
if (s.ok()) {
s = sub_compact->builder->Finish();
} else {
sub_compact->builder->Abandon();
}
IOStatus io_s = sub_compact->builder->io_status();
if (s.ok()) {
s = io_s;
}
const uint64_t current_bytes = sub_compact->builder->FileSize();
if (s.ok()) {
meta->fd.file_size = current_bytes;
meta->marked_for_compaction = sub_compact->builder->NeedCompact();
// With accurate smallest and largest key, we can get a slightly more
// accurate oldest ancester time.
// This makes oldest ancester time in manifest more accurate than in
// table properties. Not sure how to resolve it.
if (meta->smallest.size() > 0 && meta->largest.size() > 0) {
uint64_t refined_oldest_ancester_time;
Slice new_smallest = meta->smallest.user_key();
Slice new_largest = meta->largest.user_key();
if (!new_largest.empty() && !new_smallest.empty()) {
refined_oldest_ancester_time =
sub_compact->compaction->MinInputFileOldestAncesterTime(
&(meta->smallest), &(meta->largest));
if (refined_oldest_ancester_time != port::kMaxUint64) {
meta->oldest_ancester_time = refined_oldest_ancester_time;
}
}
}
}
sub_compact->current_output()->finished = true;
sub_compact->total_bytes += current_bytes;
// Finish and check for file errors
if (s.ok()) {
StopWatch sw(db_options_.clock, stats_, COMPACTION_OUTFILE_SYNC_MICROS);
io_s = sub_compact->outfile->Sync(db_options_.use_fsync);
}
if (s.ok() && io_s.ok()) {
io_s = sub_compact->outfile->Close();
}
if (s.ok() && io_s.ok()) {
// Add the checksum information to file metadata.
meta->file_checksum = sub_compact->outfile->GetFileChecksum();
meta->file_checksum_func_name =
sub_compact->outfile->GetFileChecksumFuncName();
file_checksum = meta->file_checksum;
file_checksum_func_name = meta->file_checksum_func_name;
}
if (s.ok()) {
s = io_s;
}
if (sub_compact->io_status.ok()) {
sub_compact->io_status = io_s;
// Since this error is really a copy of the
// "normal" status, it does not also need to be checked
sub_compact->io_status.PermitUncheckedError();
}
sub_compact->outfile.reset();
TableProperties tp;
if (s.ok()) {
tp = sub_compact->builder->GetTableProperties();
}
if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
// If there is nothing to output, no necessary to generate a sst file.
// This happens when the output level is bottom level, at the same time
// the sub_compact output nothing.
std::string fname =
TableFileName(sub_compact->compaction->immutable_options()->cf_paths,
meta->fd.GetNumber(), meta->fd.GetPathId());
// TODO(AR) it is not clear if there are any larger implications if
// DeleteFile fails here
Status ds = env_->DeleteFile(fname);
if (!ds.ok()) {
ROCKS_LOG_WARN(
db_options_.info_log,
"[%s] [JOB %d] Unable to remove SST file for table #%" PRIu64
" at bottom level%s",
cfd->GetName().c_str(), job_id_, output_number,
meta->marked_for_compaction ? " (need compaction)" : "");
}
// Also need to remove the file from outputs, or it will be added to the
// VersionEdit.
assert(!sub_compact->outputs.empty());
sub_compact->outputs.pop_back();
meta = nullptr;
}
if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) {
// Output to event logger and fire events.
sub_compact->current_output()->table_properties =
std::make_shared<TableProperties>(tp);
ROCKS_LOG_INFO(db_options_.info_log,
"[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64
" keys, %" PRIu64 " bytes%s",
cfd->GetName().c_str(), job_id_, output_number,
current_entries, current_bytes,
meta->marked_for_compaction ? " (need compaction)" : "");
}
std::string fname;
FileDescriptor output_fd;
uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
Status status_for_listener = s;
if (meta != nullptr) {
fname = GetTableFileName(meta->fd.GetNumber());
output_fd = meta->fd;
oldest_blob_file_number = meta->oldest_blob_file_number;
} else {
fname = "(nil)";
if (s.ok()) {
status_for_listener = Status::Aborted("Empty SST file not kept");
}
}
EventHelpers::LogAndNotifyTableFileCreationFinished(
event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname,
job_id_, output_fd, oldest_blob_file_number, tp,
TableFileCreationReason::kCompaction, status_for_listener, file_checksum,
file_checksum_func_name);
#ifndef ROCKSDB_LITE
// Report new file to SstFileManagerImpl
auto sfm =
static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) {
Status add_s = sfm->OnAddFile(fname);
if (!add_s.ok() && s.ok()) {
s = add_s;
}
if (sfm->IsMaxAllowedSpaceReached()) {
// TODO(ajkr): should we return OK() if max space was reached by the final
// compaction output file (similarly to how flush works when full)?
s = Status::SpaceLimit("Max allowed space was reached");
TEST_SYNC_POINT(
"CompactionJob::FinishCompactionOutputFile:"
"MaxAllowedSpaceReached");
InstrumentedMutexLock l(db_mutex_);
db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction);
}
}
#endif
sub_compact->builder.reset();
sub_compact->current_output_file_size = 0;
return s;
}