static int stack_compact_range()

in reftable/stack.c [1193:1557]
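
Compacts the tables in the inclusive range [first, last] of the stack into a
single new table, optionally expiring reflog entries according to `expiry`. The
affected tables are locked individually so that the "tables.list" lock can be
released while the new table is written, which lets concurrent writers keep
appending to the stack. With STACK_COMPACT_RANGE_BEST_EFFORT set in `flags`, a
table locked by another process shrinks the range instead of failing the whole
compaction. Returns 0 on success and a non-zero error code otherwise.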


static int stack_compact_range(struct reftable_stack *st,
			       size_t first, size_t last,
			       struct reftable_log_expiry_config *expiry,
			       unsigned int flags)
{
	struct reftable_buf tables_list_buf = REFTABLE_BUF_INIT;
	struct reftable_buf new_table_name = REFTABLE_BUF_INIT;
	struct reftable_buf new_table_path = REFTABLE_BUF_INIT;
	struct reftable_buf table_name = REFTABLE_BUF_INIT;
	struct reftable_flock tables_list_lock = REFTABLE_FLOCK_INIT;
	struct reftable_flock *table_locks = NULL;
	struct reftable_tmpfile new_table = REFTABLE_TMPFILE_INIT;
	int is_empty_table = 0, err = 0;
	size_t first_to_replace, last_to_replace;
	size_t i, nlocks = 0;
	char **names = NULL;

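	/*
	 * An empty range is nothing to compact, and a single-table range is a
	 * no-op unless we were asked to expire reflog entries, in which case
	 * even that one table has to be rewritten.
	 */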
	if (first > last || (!expiry && first == last)) {
		err = 0;
		goto done;
	}

	st->stats.attempts++;

	/*
	 * Hold the lock so that we can read "tables.list" and lock all tables
	 * which are part of the user-specified range.
	 */
	err = flock_acquire(&tables_list_lock, st->list_file, st->opts.lock_timeout_ms);
	if (err < 0) {
		if (errno == EEXIST)
			err = REFTABLE_LOCK_ERROR;
		else
			err = REFTABLE_IO_ERROR;
		goto done;
	}

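	/*
	 * Bail out if our in-memory view of the stack no longer matches the
	 * "tables.list" file on disk.
	 */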
	err = stack_uptodate(st);
	if (err)
		goto done;

	/*
	 * Lock all tables in the user-provided range. This is the slice of our
	 * stack which we'll compact.
	 *
	 * Note that we lock tables in reverse order from last to first. The
	 * intent behind this is to allow a newer process to perform a
	 * best-effort compaction of tables that it has itself added while an
	 * older process is still busy compacting tables which preexist from
	 * the newer process's point of view.
	 */
	REFTABLE_ALLOC_ARRAY(table_locks, last - first + 1);
	if (!table_locks) {
		err = REFTABLE_OUT_OF_MEMORY_ERROR;
		goto done;
	}
	for (i = 0; i < last - first + 1; i++)
		table_locks[i] = REFTABLE_FLOCK_INIT;

	for (i = last + 1; i > first; i--) {
		err = stack_filename(&table_name, st, reftable_table_name(st->tables[i - 1]));
		if (err < 0)
			goto done;

		err = flock_acquire(&table_locks[nlocks], table_name.buf, 0);
		if (err < 0) {
			/*
			 * When the table is already locked we may do a
			 * best-effort compaction and compact only the tables
			 * that we have managed to lock so far. This of course
			 * requires that we have been able to lock at least
			 * two tables, as otherwise there would be nothing to
			 * compact. In that case, we return a lock error to
			 * our caller.
			 */
			if (errno == EEXIST && last - (i - 1) >= 2 &&
			    flags & STACK_COMPACT_RANGE_BEST_EFFORT) {
				err = 0;
				/*
				 * The subtraction offsets the index, the
				 * addition restricts compaction to the tables
				 * locked in preceding iterations. They
				 * obviously cancel each other out, but
				 * spelling out both keeps the intent visible.
				 * E.g. when locking table 3 fails (i - 1 == 3)
				 * we set first = 4 and compact the
				 * already-locked tables [4, last].
				 */
				first = (i - 1) + 1;
				break;
			} else if (errno == EEXIST) {
				err = REFTABLE_LOCK_ERROR;
				goto done;
			} else {
				err = REFTABLE_IO_ERROR;
				goto done;
			}
		}

		/*
		 * We need to close the lockfiles as we might otherwise easily
		 * run into file descriptor exhaustion when we compact a lot
		 * of tables.
		 */
		err = flock_close(&table_locks[nlocks++]);
		if (err < 0) {
			err = REFTABLE_IO_ERROR;
			goto done;
		}
	}

	/*
	 * We have locked all tables in our range and can thus release the
	 * "tables.list" lock while compacting the locked tables. This allows
	 * concurrent updates to the stack to proceed.
	 */
	err = flock_release(&tables_list_lock);
	if (err < 0) {
		err = REFTABLE_IO_ERROR;
		goto done;
	}

	/*
	 * Compact the now-locked tables into a new table. Note that compacting
	 * these tables may end up with an empty new table in case tombstones
	 * end up cancelling out all refs in that range.
	 */
	err = stack_compact_locked(st, first, last, expiry, &new_table);
	if (err < 0) {
		if (err != REFTABLE_EMPTY_TABLE_ERROR)
			goto done;
		is_empty_table = 1;
	}

	/*
	 * Now that we have written the new, compacted table we need to re-lock
	 * "tables.list". We'll then replace the compacted range of tables with
	 * the new table.
	 */
	err = flock_acquire(&tables_list_lock, st->list_file, st->opts.lock_timeout_ms);
	if (err < 0) {
		if (errno == EEXIST)
			err = REFTABLE_LOCK_ERROR;
		else
			err = REFTABLE_IO_ERROR;
		goto done;
	}

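	/*
	 * Apply the configured default permissions to the lockfile so that
	 * the final "tables.list" ends up with them once the lock is
	 * committed.
	 */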
	if (st->opts.default_permissions) {
		if (chmod(tables_list_lock.path,
			  st->opts.default_permissions) < 0) {
			err = REFTABLE_IO_ERROR;
			goto done;
		}
	}

	/*
	 * As we have unlocked the stack while compacting our slice of tables
	 * it may have happened that a concurrently running process has updated
	 * the stack while we were compacting. In that case, we need to check
	 * whether the tables that we have just compacted still exist in the
	 * stack in the exact same order as we have compacted them.
	 *
	 * If they do exist, then it is fine to continue and replace those
	 * tables with our compacted version. If they don't, then we need to
	 * abort.
	 */
	err = stack_uptodate(st);
	if (err < 0)
		goto done;
	if (err > 0) {
		ssize_t new_offset = -1;
		int fd;

		fd = open(st->list_file, O_RDONLY);
		if (fd < 0) {
			err = REFTABLE_IO_ERROR;
			goto done;
		}

		err = fd_read_lines(fd, &names);
		close(fd);
		if (err < 0)
			goto done;

		/*
		 * Search for the offset of the first table that we have
		 * compacted in the updated "tables.list" file.
		 */
		for (size_t i = 0; names[i]; i++) {
			if (strcmp(names[i], st->tables[first]->name))
				continue;

			/*
			 * We have found the first entry. Verify that all the
			 * subsequent tables we have compacted still exist in
			 * the modified stack in the exact same order as we
			 * have compacted them.
			 */
			for (size_t j = 1; j < last - first + 1; j++) {
				const char *old = first + j < st->merged->tables_len ?
					st->tables[first + j]->name : NULL;
				const char *new = names[i + j];

				/*
				 * If some entries are missing or the tables
				 * have changed then we need to bail out. This
				 * should never happen, because we have locked
				 * the tables we are compacting.
				 */
				if (!old || !new || strcmp(old, new)) {
					err = REFTABLE_OUTDATED_ERROR;
					goto done;
				}
			}

			new_offset = i;
			break;
		}

		/*
		 * In case we didn't find our compacted tables in the stack we
		 * need to bail out. In theory, this should never happen
		 * because we have locked the tables we are compacting.
		 */
		if (new_offset < 0) {
			err = REFTABLE_OUTDATED_ERROR;
			goto done;
		}

		/*
		 * We have found the new range that we want to replace, so
		 * let's update the range of tables that we want to replace.
		 */
		first_to_replace = new_offset;
		last_to_replace = last + (new_offset - first);
	} else {
		/*
		 * `fd_read_lines()` uses a `NULL` sentinel to indicate that
		 * the array is at its end. As we use `free_names()` to free
		 * the array, we need to include this sentinel value here and
		 * thus have to allocate `tables_len + 1` entries.
		 */
		REFTABLE_CALLOC_ARRAY(names, st->merged->tables_len + 1);
		if (!names) {
			err = REFTABLE_OUT_OF_MEMORY_ERROR;
			goto done;
		}

		for (size_t i = 0; i < st->merged->tables_len; i++) {
			names[i] = reftable_strdup(st->tables[i]->name);
			if (!names[i]) {
				err = REFTABLE_OUT_OF_MEMORY_ERROR;
				goto done;
			}
		}
		first_to_replace = first;
		last_to_replace = last;
	}

	/*
	 * If the resulting compacted table is not empty, then we need to move
	 * it into place now.
	 */
	if (!is_empty_table) {
		err = format_name(&new_table_name, st->tables[first]->min_update_index,
				  st->tables[last]->max_update_index);
		if (err < 0)
			goto done;

		err = reftable_buf_addstr(&new_table_name, ".ref");
		if (err < 0)
			goto done;

		err = stack_filename(&new_table_path, st, new_table_name.buf);
		if (err < 0)
			goto done;

		err = tmpfile_rename(&new_table, new_table_path.buf);
		if (err < 0)
			goto done;
	}

	/*
	 * Write the new "tables.list" contents with the compacted table we
	 * have just written. In case the compacted table ended up empty we
	 * simply omit it from the list.
	 */
	for (i = 0; i < first_to_replace; i++) {
		if ((err = reftable_buf_addstr(&tables_list_buf, names[i])) < 0 ||
		    (err = reftable_buf_addstr(&tables_list_buf, "\n")) < 0)
		      goto done;
	}
	if (!is_empty_table) {
		if ((err = reftable_buf_addstr(&tables_list_buf, new_table_name.buf)) < 0 ||
		    (err = reftable_buf_addstr(&tables_list_buf, "\n")) < 0)
			goto done;
	}
	for (i = last_to_replace + 1; names[i]; i++) {
		if ((err = reftable_buf_addstr(&tables_list_buf, names[i])) < 0 ||
		    (err = reftable_buf_addstr(&tables_list_buf, "\n")) < 0)
			goto done;
	}

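	/*
	 * Write the assembled contents into the lockfile, flush them to disk
	 * if so configured, and commit the lock to atomically replace
	 * "tables.list". If any of these steps fails the new table will never
	 * be referenced, so we unlink it again.
	 */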
	err = reftable_write_data(tables_list_lock.fd,
				  tables_list_buf.buf, tables_list_buf.len);
	if (err < 0) {
		err = REFTABLE_IO_ERROR;
		unlink(new_table_path.buf);
		goto done;
	}

	err = stack_fsync(&st->opts, tables_list_lock.fd);
	if (err < 0) {
		err = REFTABLE_IO_ERROR;
		unlink(new_table_path.buf);
		goto done;
	}

	err = flock_commit(&tables_list_lock);
	if (err < 0) {
		err = REFTABLE_IO_ERROR;
		unlink(new_table_path.buf);
		goto done;
	}

	/*
	 * Reload the stack before deleting the compacted tables. On Windows,
	 * files can only be deleted once all handles to them have been
	 * closed, so the reload, which closes the now-unreferenced tables,
	 * needs to happen first.
	 */
	err = reftable_stack_reload_maybe_reuse(st, first < last);
	if (err < 0)
		goto done;

	/*
	 * Delete the old tables. They may still be in use by concurrent
	 * readers, so it is expected that unlinking tables may fail.
	 */
	for (i = 0; i < nlocks; i++) {
		struct reftable_flock *table_lock = &table_locks[i];

		reftable_buf_reset(&table_name);
		err = reftable_buf_add(&table_name, table_lock->path,
				       strlen(table_lock->path) - strlen(".lock"));
		if (err)
			continue;

		unlink(table_name.buf);
	}

done:
	flock_release(&tables_list_lock);
	for (i = 0; table_locks && i < nlocks; i++)
		flock_release(&table_locks[i]);
	reftable_free(table_locks);

	tmpfile_delete(&new_table);
	reftable_buf_release(&new_table_name);
	reftable_buf_release(&new_table_path);
	reftable_buf_release(&tables_list_buf);
	reftable_buf_release(&table_name);
	free_names(names);

	if (err == REFTABLE_LOCK_ERROR)
		st->stats.failures++;

	return err;
}
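
To make the offset remapping concrete, here is a hypothetical, self-contained
sketch. The table names and the concurrent-compaction scenario are invented for
illustration; only the search loop and the index arithmetic mirror the code
above.

#include <stdio.h>
#include <string.h>

int main(void)
{
	/*
	 * The stack as we saw it when compaction started; we compacted the
	 * slice [2, 3], i.e. "c.ref" and "d.ref".
	 */
	const char *old_names[] = { "a.ref", "b.ref", "c.ref", "d.ref", NULL };
	size_t first = 2, last = 3;

	/*
	 * "tables.list" as re-read after re-acquiring the lock: a concurrent
	 * process has meanwhile compacted [0, 1] into "ab.ref", shifting our
	 * slice one position to the left.
	 */
	const char *new_names[] = { "ab.ref", "c.ref", "d.ref", NULL };

	for (size_t i = 0; new_names[i]; i++) {
		if (strcmp(new_names[i], old_names[first]))
			continue;

		/*
		 * Same arithmetic as in stack_compact_range() above, written
		 * as `last - first + i` to avoid the intermediate unsigned
		 * wraparound of `last + (i - first)`.
		 */
		printf("replace [%zu, %zu]\n", i, last - first + i);
		break;
	}
	return 0;
}

Running this prints "replace [1, 2]": the compacted slice is found at offset 1
of the updated list, so those are the entries that get replaced with the new
table's name.

As a usage sketch, a caller that compacts the whole stack might look like the
following. It is modelled on the library's reftable_stack_compact_all(), but
the wrapper itself is hypothetical:

static int compact_everything(struct reftable_stack *st,
			      struct reftable_log_expiry_config *expiry)
{
	size_t len = st->merged->tables_len;

	/*
	 * Compact every table; passing 0 for flags disables the best-effort
	 * fallback, so any table locked by another process fails the whole
	 * compaction with REFTABLE_LOCK_ERROR.
	 */
	return stack_compact_range(st, 0, len ? len - 1 : 0, expiry, 0);
}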