// be/src/pipeline/pipeline_fragment_context.cpp

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "pipeline_fragment_context.h"

#include <gen_cpp/DataSinks_types.h>
#include <gen_cpp/PaloInternalService_types.h>
#include <gen_cpp/PlanNodes_types.h>
#include <pthread.h>

#include <algorithm>
#include <cstdlib>
// IWYU pragma: no_include <bits/chrono.h>
#include <fmt/format.h>

#include <chrono> // IWYU pragma: keep
#include <map>
#include <memory>
#include <ostream>
#include <utility>

#include "cloud/config.h"
#include "common/cast_set.h"
#include "common/config.h"
#include "common/exception.h"
#include "common/logging.h"
#include "common/status.h"
#include "io/fs/stream_load_pipe.h"
#include "pipeline/dependency.h"
#include "pipeline/exec/aggregation_sink_operator.h"
#include "pipeline/exec/aggregation_source_operator.h"
#include "pipeline/exec/analytic_sink_operator.h"
#include "pipeline/exec/analytic_source_operator.h"
#include "pipeline/exec/assert_num_rows_operator.h"
#include "pipeline/exec/cache_sink_operator.h"
#include "pipeline/exec/cache_source_operator.h"
#include "pipeline/exec/datagen_operator.h"
#include "pipeline/exec/dict_sink_operator.h"
#include "pipeline/exec/distinct_streaming_aggregation_operator.h"
#include "pipeline/exec/empty_set_operator.h"
#include "pipeline/exec/es_scan_operator.h"
#include "pipeline/exec/exchange_sink_operator.h"
#include "pipeline/exec/exchange_source_operator.h"
#include "pipeline/exec/file_scan_operator.h"
#include "pipeline/exec/group_commit_block_sink_operator.h"
#include "pipeline/exec/group_commit_scan_operator.h"
#include "pipeline/exec/hashjoin_build_sink.h"
#include "pipeline/exec/hashjoin_probe_operator.h"
#include "pipeline/exec/hive_table_sink_operator.h"
#include "pipeline/exec/iceberg_table_sink_operator.h"
#include "pipeline/exec/jdbc_scan_operator.h"
#include "pipeline/exec/jdbc_table_sink_operator.h"
#include "pipeline/exec/local_merge_sort_source_operator.h"
#include "pipeline/exec/memory_scratch_sink_operator.h"
#include "pipeline/exec/meta_scan_operator.h"
#include "pipeline/exec/multi_cast_data_stream_sink.h"
#include "pipeline/exec/multi_cast_data_stream_source.h"
#include "pipeline/exec/nested_loop_join_build_operator.h"
#include "pipeline/exec/nested_loop_join_probe_operator.h"
#include "pipeline/exec/olap_scan_operator.h"
#include "pipeline/exec/olap_table_sink_operator.h"
#include "pipeline/exec/olap_table_sink_v2_operator.h"
#include "pipeline/exec/partition_sort_sink_operator.h"
#include "pipeline/exec/partition_sort_source_operator.h"
#include "pipeline/exec/partitioned_aggregation_sink_operator.h"
#include "pipeline/exec/partitioned_aggregation_source_operator.h"
#include "pipeline/exec/partitioned_hash_join_probe_operator.h"
#include "pipeline/exec/partitioned_hash_join_sink_operator.h"
"pipeline/exec/repeat_operator.h" #include "pipeline/exec/result_file_sink_operator.h" #include "pipeline/exec/result_sink_operator.h" #include "pipeline/exec/schema_scan_operator.h" #include "pipeline/exec/select_operator.h" #include "pipeline/exec/set_probe_sink_operator.h" #include "pipeline/exec/set_sink_operator.h" #include "pipeline/exec/set_source_operator.h" #include "pipeline/exec/sort_sink_operator.h" #include "pipeline/exec/sort_source_operator.h" #include "pipeline/exec/spill_sort_sink_operator.h" #include "pipeline/exec/spill_sort_source_operator.h" #include "pipeline/exec/streaming_aggregation_operator.h" #include "pipeline/exec/table_function_operator.h" #include "pipeline/exec/union_sink_operator.h" #include "pipeline/exec/union_source_operator.h" #include "pipeline/local_exchange/local_exchange_sink_operator.h" #include "pipeline/local_exchange/local_exchange_source_operator.h" #include "pipeline/local_exchange/local_exchanger.h" #include "pipeline/task_scheduler.h" #include "pipeline_task.h" #include "runtime/exec_env.h" #include "runtime/fragment_mgr.h" #include "runtime/runtime_state.h" #include "runtime/stream_load/new_load_stream_mgr.h" #include "runtime/stream_load/stream_load_context.h" #include "runtime/thread_context.h" #include "runtime_filter/runtime_filter_mgr.h" #include "service/backend_options.h" #include "util/container_util.hpp" #include "util/debug_util.h" #include "util/uid_util.h" #include "vec/common/sort/heap_sorter.h" #include "vec/common/sort/topn_sorter.h" #include "vec/runtime/vdata_stream_mgr.h" #include "vec/spill/spill_stream.h" namespace doris::pipeline { #include "common/compile_check_begin.h" PipelineFragmentContext::PipelineFragmentContext( const TUniqueId& query_id, const int fragment_id, std::shared_ptr<QueryContext> query_ctx, ExecEnv* exec_env, const std::function<void(RuntimeState*, Status*)>& call_back, const report_status_callback& report_status_cb) : _query_id(query_id), _fragment_id(fragment_id), _exec_env(exec_env), _query_ctx(std::move(query_ctx)), _call_back(call_back), _is_report_on_cancel(true), _report_status_cb(report_status_cb) { _fragment_watcher.start(); } PipelineFragmentContext::~PipelineFragmentContext() { LOG_INFO("PipelineFragmentContext::~PipelineFragmentContext") .tag("query_id", print_id(_query_id)) .tag("fragment_id", _fragment_id); // The memory released by the query end is recorded in the query mem tracker. SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(_query_ctx->query_mem_tracker()); auto st = _query_ctx->exec_status(); for (size_t i = 0; i < _tasks.size(); i++) { if (!_tasks[i].empty()) { _call_back(_tasks[i].front()->runtime_state(), &st); } } for (auto& runtime_states : _task_runtime_states) { for (auto& runtime_state : runtime_states) { runtime_state.reset(); } } _tasks.clear(); _dag.clear(); _pip_id_to_pipeline.clear(); _pipelines.clear(); _sink.reset(); _root_op.reset(); _runtime_state.reset(); _runtime_filter_mgr_map.clear(); _op_id_to_shared_state.clear(); _query_ctx.reset(); } bool PipelineFragmentContext::is_timeout(timespec now) const { if (_timeout <= 0) { return false; } return _fragment_watcher.elapsed_time_seconds(now) > _timeout; } // Must not add lock in this method. Because it will call query ctx cancel. And // QueryCtx cancel will call fragment ctx cancel. And Also Fragment ctx's running // Method like exchange sink buffer will call query ctx cancel. If we add lock here // There maybe dead lock. 
void PipelineFragmentContext::cancel(const Status reason) {
    LOG_INFO("PipelineFragmentContext::cancel")
            .tag("query_id", print_id(_query_id))
            .tag("fragment_id", _fragment_id)
            .tag("reason", reason.to_string());
    {
        std::lock_guard<std::mutex> l(_task_mutex);
        if (_closed_tasks == _total_tasks) {
            // All tasks in this PipelineXFragmentContext are already closed.
            return;
        }
    }
    // Timeout is a special error code; print the current debug string to help diagnose
    // timeout issues.
    if (reason.is<ErrorCode::TIMEOUT>()) {
        auto dbg_str = fmt::format("PipelineFragmentContext is cancelled due to timeout:\n{}",
                                   debug_string());
        LOG_LONG_STRING(WARNING, dbg_str);
    }
    // `ILLEGAL_STATE` means the query this fragment belongs to was not found in FE
    // (it may already have finished).
    if (reason.is<ErrorCode::ILLEGAL_STATE>()) {
        LOG_WARNING("PipelineFragmentContext is cancelled due to illegal state : {}",
                    debug_string());
    }
    if (reason.is<ErrorCode::MEM_LIMIT_EXCEEDED>() || reason.is<ErrorCode::MEM_ALLOC_FAILED>()) {
        print_profile("cancel pipeline, reason: " + reason.to_string());
    }
    if (auto error_url = get_load_error_url(); !error_url.empty()) {
        _query_ctx->set_load_error_url(error_url);
    }
    _query_ctx->cancel(reason, _fragment_id);
    if (reason.is<ErrorCode::LIMIT_REACH>()) {
        _is_report_on_cancel = false;
    } else {
        for (auto& id : _fragment_instance_ids) {
            LOG(WARNING) << "PipelineFragmentContext cancel instance: " << print_id(id);
        }
    }
    // Get the pipe from the new load stream manager and cancel it, otherwise the fragment
    // may hang waiting to read from the pipe.
    // For stream load, the fragment's query_id equals the load id; it is set in FE.
    auto stream_load_ctx = _exec_env->new_load_stream_mgr()->get(_query_id);
    if (stream_load_ctx != nullptr) {
        stream_load_ctx->pipe->cancel(reason.to_string());
    }

    for (auto& tasks : _tasks) {
        for (auto& task : tasks) {
            task->terminate();
        }
    }
}

PipelinePtr PipelineFragmentContext::add_pipeline(PipelinePtr parent, int idx) {
    PipelineId id = _next_pipeline_id++;
    auto pipeline = std::make_shared<Pipeline>(
            id, parent ? std::min(parent->num_tasks(), _num_instances) : _num_instances,
            parent ? parent->num_tasks() : _num_instances);
    if (idx >= 0) {
        _pipelines.insert(_pipelines.begin() + idx, pipeline);
    } else {
        _pipelines.emplace_back(pipeline);
    }
    if (parent) {
        parent->set_children(pipeline);
    }
    return pipeline;
}
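// prepare() wires up the whole fragment before any task runs. The numbered steps in the
// body: (1) create the fragment-level RuntimeState, (2) build the operator pipelines from
// the thrift plan, (3) create the output sink, (4) plan local exchanges for local shuffle,
// (5) initialize the global state of every pipeline, and (6) build the pipeline tasks.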
Status PipelineFragmentContext::prepare(const doris::TPipelineFragmentParams& request,
                                        ThreadPool* thread_pool) {
    if (_prepared) {
        return Status::InternalError("Already prepared");
    }
    if (request.__isset.query_options && request.query_options.__isset.execution_timeout) {
        _timeout = request.query_options.execution_timeout;
    }
    _fragment_level_profile = std::make_unique<RuntimeProfile>("PipelineContext");
    _prepare_timer = ADD_TIMER(_fragment_level_profile, "PrepareTime");
    SCOPED_TIMER(_prepare_timer);
    _build_pipelines_timer = ADD_TIMER(_fragment_level_profile, "BuildPipelinesTime");
    _init_context_timer = ADD_TIMER(_fragment_level_profile, "InitContextTime");
    _plan_local_exchanger_timer = ADD_TIMER(_fragment_level_profile, "PlanLocalLocalExchangerTime");
    _build_tasks_timer = ADD_TIMER(_fragment_level_profile, "BuildTasksTime");
    _prepare_all_pipelines_timer = ADD_TIMER(_fragment_level_profile, "PrepareAllPipelinesTime");
    {
        SCOPED_TIMER(_init_context_timer);
        cast_set(_num_instances, request.local_params.size());
        _total_instances =
                request.__isset.total_instances ? request.total_instances : _num_instances;
        auto* fragment_context = this;

        if (request.query_options.__isset.is_report_success) {
            fragment_context->set_is_report_success(request.query_options.is_report_success);
        }

        // 1. Set up the global runtime state.
        _runtime_state = RuntimeState::create_unique(request.query_id, request.fragment_id,
                                                     request.query_options,
                                                     _query_ctx->query_globals, _exec_env,
                                                     _query_ctx.get());
        _runtime_state->set_task_execution_context(shared_from_this());
        SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(_runtime_state->query_mem_tracker());
        if (request.__isset.backend_id) {
            _runtime_state->set_backend_id(request.backend_id);
        }
        if (request.__isset.import_label) {
            _runtime_state->set_import_label(request.import_label);
        }
        if (request.__isset.db_name) {
            _runtime_state->set_db_name(request.db_name);
        }
        if (request.__isset.load_job_id) {
            _runtime_state->set_load_job_id(request.load_job_id);
        }

        if (request.is_simplified_param) {
            _desc_tbl = _query_ctx->desc_tbl;
        } else {
            DCHECK(request.__isset.desc_tbl);
            RETURN_IF_ERROR(DescriptorTbl::create(_runtime_state->obj_pool(), request.desc_tbl,
                                                  &_desc_tbl));
        }
        _runtime_state->set_desc_tbl(_desc_tbl);
        _runtime_state->set_num_per_fragment_instances(request.num_senders);
        _runtime_state->set_load_stream_per_node(request.load_stream_per_node);
        _runtime_state->set_total_load_streams(request.total_load_streams);
        _runtime_state->set_num_local_sink(request.num_local_sink);

        // Init fragment_instance_ids.
        const auto target_size = request.local_params.size();
        _fragment_instance_ids.resize(target_size);
        for (size_t i = 0; i < request.local_params.size(); i++) {
            auto fragment_instance_id = request.local_params[i].fragment_instance_id;
            _fragment_instance_ids[i] = fragment_instance_id;
        }
    }

    {
        SCOPED_TIMER(_build_pipelines_timer);
        // 2. Build pipelines with operators in this fragment.
        auto root_pipeline = add_pipeline();
        RETURN_IF_ERROR(_build_pipelines(_runtime_state->obj_pool(), request,
                                         *_query_ctx->desc_tbl, &_root_op, root_pipeline));

        // 3. Create the sink operator.
        if (!request.fragment.__isset.output_sink) {
            return Status::InternalError("No output sink in this fragment!");
        }
        RETURN_IF_ERROR(_create_data_sink(_runtime_state->obj_pool(), request.fragment.output_sink,
                                          request.fragment.output_exprs, request,
                                          root_pipeline->output_row_desc(), _runtime_state.get(),
                                          *_desc_tbl, root_pipeline->id()));
        RETURN_IF_ERROR(_sink->init(request.fragment.output_sink));
        RETURN_IF_ERROR(root_pipeline->set_sink(_sink));

        for (PipelinePtr& pipeline : _pipelines) {
            DCHECK(pipeline->sink() != nullptr) << pipeline->operators().size();
            RETURN_IF_ERROR(pipeline->sink()->set_child(pipeline->operators().back()));
        }
    }

    // 4. Build the local exchanger.
    if (_runtime_state->enable_local_shuffle()) {
        SCOPED_TIMER(_plan_local_exchanger_timer);
        RETURN_IF_ERROR(_plan_local_exchange(request.num_buckets,
                                             request.bucket_seq_to_instance_idx,
                                             request.shuffle_idx_to_instance_idx));
    }

    // 5. Initialize global states in pipelines.
    for (PipelinePtr& pipeline : _pipelines) {
        SCOPED_TIMER(_prepare_all_pipelines_timer);
        pipeline->children().clear();
        RETURN_IF_ERROR(pipeline->prepare(_runtime_state.get()));
    }

    {
        SCOPED_TIMER(_build_tasks_timer);
        // 6. Build pipeline tasks and initialize local state.
        RETURN_IF_ERROR(_build_pipeline_tasks(request, thread_pool));
    }

    _init_next_report_time();

    _prepared = true;
    return Status::OK();
}
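// Builds one PipelineTask per (pipeline, instance) pair, wiring shared state between
// dependent pipelines (see the DAG comment inside). When instance parallelism exceeds
// query_options.parallel_prepare_threshold, per-instance preparation is offloaded to the
// given thread pool.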
Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFragmentParams& request,
                                                      ThreadPool* thread_pool) {
    _total_tasks = 0;
    const auto target_size = request.local_params.size();
    _tasks.resize(target_size);
    _runtime_filter_mgr_map.resize(target_size);
    _task_runtime_states.resize(_pipelines.size());
    for (size_t pip_idx = 0; pip_idx < _pipelines.size(); pip_idx++) {
        _task_runtime_states[pip_idx].resize(_pipelines[pip_idx]->num_tasks());
        _pip_id_to_pipeline[_pipelines[pip_idx]->id()] = _pipelines[pip_idx].get();
    }
    auto pipeline_id_to_profile = _runtime_state->build_pipeline_profile(_pipelines.size());

    auto pre_and_submit = [&](int i, PipelineFragmentContext* ctx) {
        const auto& local_params = request.local_params[i];
        auto fragment_instance_id = local_params.fragment_instance_id;
        auto runtime_filter_mgr = std::make_unique<RuntimeFilterMgr>(false);

        std::map<PipelineId, PipelineTask*> pipeline_id_to_task;
        auto get_shared_state = [&](PipelinePtr pipeline)
                -> std::map<int, std::pair<std::shared_ptr<BasicSharedState>,
                                           std::vector<std::shared_ptr<Dependency>>>> {
            std::map<int, std::pair<std::shared_ptr<BasicSharedState>,
                                    std::vector<std::shared_ptr<Dependency>>>>
                    shared_state_map;
            for (auto& op : pipeline->operators()) {
                auto source_id = op->operator_id();
                if (auto iter = _op_id_to_shared_state.find(source_id);
                    iter != _op_id_to_shared_state.end()) {
                    shared_state_map.insert({source_id, iter->second});
                }
            }
            for (auto sink_to_source_id : pipeline->sink()->dests_id()) {
                if (auto iter = _op_id_to_shared_state.find(sink_to_source_id);
                    iter != _op_id_to_shared_state.end()) {
                    shared_state_map.insert({sink_to_source_id, iter->second});
                }
            }
            return shared_state_map;
        };

        for (size_t pip_idx = 0; pip_idx < _pipelines.size(); pip_idx++) {
            auto& pipeline = _pipelines[pip_idx];
            if (pipeline->num_tasks() > 1 || i == 0) {
                DCHECK(_task_runtime_states[pip_idx][i] == nullptr)
                        << print_id(_task_runtime_states[pip_idx][i]->fragment_instance_id()) << " "
                        << pipeline->debug_string();
                _task_runtime_states[pip_idx][i] = RuntimeState::create_unique(
                        local_params.fragment_instance_id, request.query_id, request.fragment_id,
                        request.query_options, _query_ctx->query_globals, _exec_env,
                        _query_ctx.get());
                auto& task_runtime_state = _task_runtime_states[pip_idx][i];
                {
                    // Initialize the runtime state for this task.
                    task_runtime_state->set_query_mem_tracker(_query_ctx->query_mem_tracker());
                    task_runtime_state->set_task_execution_context(shared_from_this());
                    task_runtime_state->set_be_number(local_params.backend_num);
                    if (request.__isset.backend_id) {
                        task_runtime_state->set_backend_id(request.backend_id);
                    }
                    if (request.__isset.import_label) {
                        task_runtime_state->set_import_label(request.import_label);
                    }
                    if (request.__isset.db_name) {
                        task_runtime_state->set_db_name(request.db_name);
                    }
                    if (request.__isset.load_job_id) {
                        task_runtime_state->set_load_job_id(request.load_job_id);
                    }
                    if (request.__isset.wal_id) {
                        task_runtime_state->set_wal_id(request.wal_id);
                    }

                    task_runtime_state->set_desc_tbl(_desc_tbl);
                    task_runtime_state->set_per_fragment_instance_idx(local_params.sender_id);
                    task_runtime_state->set_num_per_fragment_instances(request.num_senders);
                    task_runtime_state->resize_op_id_to_local_state(max_operator_id());
                    task_runtime_state->set_max_operator_id(max_operator_id());
                    task_runtime_state->set_load_stream_per_node(request.load_stream_per_node);
                    task_runtime_state->set_total_load_streams(request.total_load_streams);
                    task_runtime_state->set_num_local_sink(request.num_local_sink);

                    task_runtime_state->set_runtime_filter_mgr(runtime_filter_mgr.get());
                }
                auto cur_task_id = _total_tasks++;
                task_runtime_state->set_task_id(cur_task_id);
                task_runtime_state->set_task_num(pipeline->num_tasks());
                auto task = std::make_shared<PipelineTask>(
                        pipeline, cur_task_id, task_runtime_state.get(),
                        std::dynamic_pointer_cast<PipelineFragmentContext>(shared_from_this()),
                        pipeline_id_to_profile[pip_idx].get(), get_shared_state(pipeline), i);
                pipeline->incr_created_tasks(i, task.get());
                pipeline_id_to_task.insert({pipeline->id(), task.get()});
                _tasks[i].emplace_back(std::move(task));
            }
        }

        /**
         * Build DAG for pipeline tasks.
         * For example, we have
         *
         *   ExchangeSink (Pipeline1)     JoinBuildSink (Pipeline2)
         *            \                      /
         *          JoinProbeOperator1 (Pipeline1)    JoinBuildSink (Pipeline3)
         *                 \                            /
         *               JoinProbeOperator2 (Pipeline1)
         *
         * In this fragment, we have three pipelines, and pipeline 1 depends on pipeline 2 and
         * pipeline 3. To build this DAG, `_dag` manages dependencies between pipelines by
         * pipeline ID, and `pipeline_id_to_task` is used to find the task by a unique
         * pipeline ID.
         *
         * Finally, we have two upstream dependencies in Pipeline1, corresponding to
         * JoinProbeOperator1 and JoinProbeOperator2.
         */
        for (auto& _pipeline : _pipelines) {
            if (pipeline_id_to_task.contains(_pipeline->id())) {
                auto* task = pipeline_id_to_task[_pipeline->id()];
                DCHECK(task != nullptr);

                // If this task has upstream dependencies, inject them into this task.
                if (_dag.find(_pipeline->id()) != _dag.end()) {
                    auto& deps = _dag[_pipeline->id()];
                    for (auto& dep : deps) {
                        if (pipeline_id_to_task.contains(dep)) {
                            auto ss = pipeline_id_to_task[dep]->get_sink_shared_state();
                            if (ss) {
                                task->inject_shared_state(ss);
                            } else {
                                pipeline_id_to_task[dep]->inject_shared_state(
                                        task->get_source_shared_state());
                            }
                        }
                    }
                }
            }
        }
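        // For the DAG in the example above, _dag[Pipeline1] = {Pipeline2, Pipeline3}:
        // each build sink's shared state (e.g. the hash table produced by JoinBuildSink)
        // is handed to the probe task that consumes it.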
        for (size_t pip_idx = 0; pip_idx < _pipelines.size(); pip_idx++) {
            if (pipeline_id_to_task.contains(_pipelines[pip_idx]->id())) {
                auto* task = pipeline_id_to_task[_pipelines[pip_idx]->id()];
                DCHECK(pipeline_id_to_profile[pip_idx]);
                std::vector<TScanRangeParams> scan_ranges;
                scan_ranges = find_with_default(local_params.per_node_scan_ranges,
                                                _pipelines[pip_idx]->operators().front()->node_id(),
                                                scan_ranges);
                RETURN_IF_ERROR_OR_CATCH_EXCEPTION(task->prepare(
                        scan_ranges, local_params.sender_id, request.fragment.output_sink));
            }
        }
        {
            std::lock_guard<std::mutex> l(_state_map_lock);
            _runtime_filter_mgr_map[i] = std::move(runtime_filter_mgr);
        }
        return Status::OK();
    };

    if (target_size > 1 &&
        (_runtime_state->query_options().__isset.parallel_prepare_threshold &&
         target_size > _runtime_state->query_options().parallel_prepare_threshold)) {
        // If instance parallelism is large enough (> parallel_prepare_threshold), prepare all
        // tasks with multiple threads.
        std::vector<Status> prepare_status(target_size);
        std::mutex m;
        std::condition_variable cv;
        int prepare_done = 0;
        for (int i = 0; i < target_size; i++) {
            RETURN_IF_ERROR(thread_pool->submit_func([&, i]() {
                SCOPED_ATTACH_TASK(_query_ctx.get());
                prepare_status[i] = pre_and_submit(i, this);
                std::unique_lock<std::mutex> lock(m);
                prepare_done++;
                if (prepare_done == target_size) {
                    cv.notify_one();
                }
            }));
        }
        std::unique_lock<std::mutex> lock(m);
        // Wait with a predicate to guard against spurious wakeups, then check the statuses
        // even if every instance finished before this point.
        cv.wait(lock, [&]() { return prepare_done == target_size; });
        for (int i = 0; i < target_size; i++) {
            if (!prepare_status[i].ok()) {
                return prepare_status[i];
            }
        }
    } else {
        for (int i = 0; i < target_size; i++) {
            RETURN_IF_ERROR(pre_and_submit(i, this));
        }
    }
    _pipeline_parent_map.clear();
    _op_id_to_shared_state.clear();
    return Status::OK();
}
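// Periodic status reporting. The first report time is offset by a random fraction of the
// report interval, presumably to stagger reports from concurrently running fragments.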
void PipelineFragmentContext::_init_next_report_time() {
    auto interval_s = config::pipeline_status_report_interval;
    if (_is_report_success && interval_s > 0 && _timeout > interval_s) {
        VLOG_FILE << "enable period report: fragment id=" << _fragment_id;
        uint64_t report_fragment_offset = (uint64_t)(rand() % interval_s) * NANOS_PER_SEC;
        // We don't want to wait longer than it takes to run the entire fragment.
        _previous_report_time =
                MonotonicNanos() + report_fragment_offset - (uint64_t)(interval_s)*NANOS_PER_SEC;
        _disable_period_report = false;
    }
}

void PipelineFragmentContext::refresh_next_report_time() {
    auto disable = _disable_period_report.load(std::memory_order_acquire);
    DCHECK(disable == true);
    _previous_report_time.store(MonotonicNanos(), std::memory_order_release);
    _disable_period_report.compare_exchange_strong(disable, false);
}

void PipelineFragmentContext::trigger_report_if_necessary() {
    if (!_is_report_success) {
        return;
    }
    auto disable = _disable_period_report.load(std::memory_order_acquire);
    if (disable) {
        return;
    }
    int32_t interval_s = config::pipeline_status_report_interval;
    if (interval_s <= 0) {
        LOG(WARNING) << "config::pipeline_status_report_interval is equal to or less than zero, "
                        "do not trigger report.";
    }
    uint64_t next_report_time = _previous_report_time.load(std::memory_order_acquire) +
                                (uint64_t)(interval_s)*NANOS_PER_SEC;
    if (MonotonicNanos() > next_report_time) {
        if (!_disable_period_report.compare_exchange_strong(disable, true,
                                                            std::memory_order_acq_rel)) {
            return;
        }
        if (VLOG_FILE_IS_ON) {
            VLOG_FILE << "Reporting profile for query_id " << print_id(_query_id)
                      << ", fragment id: " << _fragment_id;
            std::stringstream ss;
            _runtime_state->runtime_profile()->compute_time_in_profile();
            _runtime_state->runtime_profile()->pretty_print(&ss);
            if (_runtime_state->load_channel_profile()) {
                _runtime_state->load_channel_profile()->pretty_print(&ss);
            }
            VLOG_FILE << "Query " << print_id(get_query_id()) << " fragment " << get_fragment_id()
                      << " profile:\n"
                      << ss.str();
        }
        auto st = send_report(false);
        if (!st.ok()) {
            disable = true;
            _disable_period_report.compare_exchange_strong(disable, false,
                                                           std::memory_order_acq_rel);
        }
    }
}

Status PipelineFragmentContext::_build_pipelines(ObjectPool* pool,
                                                 const doris::TPipelineFragmentParams& request,
                                                 const DescriptorTbl& descs, OperatorPtr* root,
                                                 PipelinePtr cur_pipe) {
    if (request.fragment.plan.nodes.empty()) {
        throw Exception(ErrorCode::INTERNAL_ERROR, "Invalid plan which has no plan node!");
    }

    int node_idx = 0;
    RETURN_IF_ERROR(_create_tree_helper(pool, request.fragment.plan.nodes, request, descs, nullptr,
                                        &node_idx, root, cur_pipe, 0, false));

    if (node_idx + 1 != request.fragment.plan.nodes.size()) {
        return Status::InternalError(
                "Plan tree only partially reconstructed. Not all thrift nodes were used.");
    }
    return Status::OK();
}
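// Recursively rebuilds the operator tree from the flat thrift node list (which stores a
// preorder traversal of the plan) and tracks whether each subtree is followed by a shuffled
// operator, which later decides the local exchange type.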
Node id: {}, number of nodes: {}", *node_idx, tnodes.size()); } const TPlanNode& tnode = tnodes[*node_idx]; int num_children = tnodes[*node_idx].num_children; bool current_followed_by_shuffled_operator = followed_by_shuffled_operator; OperatorPtr op = nullptr; RETURN_IF_ERROR(_create_operator(pool, tnodes[*node_idx], request, descs, op, cur_pipe, parent == nullptr ? -1 : parent->node_id(), child_idx, followed_by_shuffled_operator)); // Initialization must be done here. For example, group by expressions in agg will be used to // decide if a local shuffle should be planed, so it must be initialized here. RETURN_IF_ERROR(op->init(tnode, _runtime_state.get())); // assert(parent != nullptr || (node_idx == 0 && root_expr != nullptr)); if (parent != nullptr) { // add to parent's child(s) RETURN_IF_ERROR(parent->set_child(op)); } else { *root = op; } /** * `ExchangeType::HASH_SHUFFLE` should be used if an operator is followed by a shuffled operator (shuffled hash join, union operator followed by co-located operators). * * For plan: * LocalExchange(id=0) -> Aggregation(id=1) -> ShuffledHashJoin(id=2) * Exchange(id=3) -> ShuffledHashJoinBuild(id=2) * We must ensure data distribution of `LocalExchange(id=0)` is same as Exchange(id=3). * * If an operator's is followed by a local exchange without shuffle (e.g. passthrough), a * shuffled local exchanger will be used before join so it is not followed by shuffle join. */ auto require_shuffled_data_distribution = cur_pipe->operators().empty() ? cur_pipe->sink()->require_shuffled_data_distribution() : op->require_shuffled_data_distribution(); current_followed_by_shuffled_operator = (followed_by_shuffled_operator || op->is_shuffled_operator()) && require_shuffled_data_distribution; if (num_children == 0) { _use_serial_source = op->is_serial_operator(); } // rely on that tnodes is preorder of the plan for (int i = 0; i < num_children; i++) { ++*node_idx; RETURN_IF_ERROR(_create_tree_helper(pool, tnodes, request, descs, op, node_idx, nullptr, cur_pipe, i, current_followed_by_shuffled_operator)); // we are expecting a child, but have used all nodes // this means we have been given a bad tree and must fail if (*node_idx >= tnodes.size()) { return Status::InternalError( "Failed to reconstruct plan tree from thrift. Node id: {}, number of nodes: {}", *node_idx, tnodes.size()); } } return Status::OK(); } void PipelineFragmentContext::_inherit_pipeline_properties( const DataDistribution& data_distribution, PipelinePtr pipe_with_source, PipelinePtr pipe_with_sink) { pipe_with_sink->set_num_tasks(pipe_with_source->num_tasks()); pipe_with_source->set_num_tasks(_num_instances); pipe_with_source->set_data_distribution(data_distribution); } Status PipelineFragmentContext::_add_local_exchange_impl( int idx, ObjectPool* pool, PipelinePtr cur_pipe, PipelinePtr new_pip, DataDistribution data_distribution, bool* do_local_exchange, int num_buckets, const std::map<int, int>& bucket_seq_to_instance_idx, const std::map<int, int>& shuffle_idx_to_instance_idx) { auto& operators = cur_pipe->operators(); const auto downstream_pipeline_id = cur_pipe->id(); auto local_exchange_id = next_operator_id(); // 1. Create a new pipeline with local exchange sink. DataSinkOperatorPtr sink; auto sink_id = next_sink_operator_id(); /** * `bucket_seq_to_instance_idx` is empty if no scan operator is contained in this fragment. * So co-located operators(e.g. Agg, Analytic) should use `HASH_SHUFFLE` instead of `BUCKET_HASH_SHUFFLE`. */ const bool followed_by_shuffled_operator = operators.size() > idx ? 
Status PipelineFragmentContext::_add_local_exchange_impl(
        int idx, ObjectPool* pool, PipelinePtr cur_pipe, PipelinePtr new_pip,
        DataDistribution data_distribution, bool* do_local_exchange, int num_buckets,
        const std::map<int, int>& bucket_seq_to_instance_idx,
        const std::map<int, int>& shuffle_idx_to_instance_idx) {
    auto& operators = cur_pipe->operators();
    const auto downstream_pipeline_id = cur_pipe->id();
    auto local_exchange_id = next_operator_id();
    // 1. Create a new pipeline with a local exchange sink.
    DataSinkOperatorPtr sink;
    auto sink_id = next_sink_operator_id();

    /**
     * `bucket_seq_to_instance_idx` is empty if this fragment contains no scan operator, so
     * co-located operators (e.g. Agg, Analytic) should use `HASH_SHUFFLE` instead of
     * `BUCKET_HASH_SHUFFLE`.
     */
    const bool followed_by_shuffled_operator =
            operators.size() > idx ? operators[idx]->followed_by_shuffled_operator()
                                   : cur_pipe->sink()->followed_by_shuffled_operator();
    const bool use_global_hash_shuffle =
            bucket_seq_to_instance_idx.empty() &&
            shuffle_idx_to_instance_idx.find(-1) == shuffle_idx_to_instance_idx.end() &&
            followed_by_shuffled_operator && !_use_serial_source;
    sink.reset(new LocalExchangeSinkOperatorX(
            sink_id, local_exchange_id,
            use_global_hash_shuffle ? _total_instances : _num_instances,
            data_distribution.partition_exprs, bucket_seq_to_instance_idx));
    if (bucket_seq_to_instance_idx.empty() &&
        data_distribution.distribution_type == ExchangeType::BUCKET_HASH_SHUFFLE) {
        data_distribution.distribution_type = ExchangeType::HASH_SHUFFLE;
    }
    RETURN_IF_ERROR(new_pip->set_sink(sink));
    RETURN_IF_ERROR(new_pip->sink()->init(data_distribution.distribution_type, num_buckets,
                                          use_global_hash_shuffle, shuffle_idx_to_instance_idx));

    // 2. Create and initialize LocalExchangeSharedState.
    std::shared_ptr<LocalExchangeSharedState> shared_state =
            LocalExchangeSharedState::create_shared(_num_instances);
    switch (data_distribution.distribution_type) {
    case ExchangeType::HASH_SHUFFLE:
        shared_state->exchanger = ShuffleExchanger::create_unique(
                std::max(cur_pipe->num_tasks(), _num_instances), _num_instances,
                use_global_hash_shuffle ? _total_instances : _num_instances,
                _runtime_state->query_options().__isset.local_exchange_free_blocks_limit
                        ? cast_set<int>(
                                  _runtime_state->query_options().local_exchange_free_blocks_limit)
                        : 0);
        break;
    case ExchangeType::BUCKET_HASH_SHUFFLE:
        shared_state->exchanger = BucketShuffleExchanger::create_unique(
                std::max(cur_pipe->num_tasks(), _num_instances), _num_instances, num_buckets,
                _runtime_state->query_options().__isset.local_exchange_free_blocks_limit
                        ? cast_set<int>(
                                  _runtime_state->query_options().local_exchange_free_blocks_limit)
                        : 0);
        break;
    case ExchangeType::PASSTHROUGH:
        shared_state->exchanger = PassthroughExchanger::create_unique(
                cur_pipe->num_tasks(), _num_instances,
                _runtime_state->query_options().__isset.local_exchange_free_blocks_limit
                        ? cast_set<int>(
                                  _runtime_state->query_options().local_exchange_free_blocks_limit)
                        : 0);
        break;
    case ExchangeType::BROADCAST:
        shared_state->exchanger = BroadcastExchanger::create_unique(
                cur_pipe->num_tasks(), _num_instances,
                _runtime_state->query_options().__isset.local_exchange_free_blocks_limit
                        ? cast_set<int>(
                                  _runtime_state->query_options().local_exchange_free_blocks_limit)
                        : 0);
        break;
    case ExchangeType::PASS_TO_ONE:
        if (_runtime_state->enable_share_hash_table_for_broadcast_join()) {
            // If the shared hash table is enabled for broadcast join, the hash table is built
            // by only one task.
            shared_state->exchanger = PassToOneExchanger::create_unique(
                    cur_pipe->num_tasks(), _num_instances,
                    _runtime_state->query_options().__isset.local_exchange_free_blocks_limit
                            ? cast_set<int>(_runtime_state->query_options()
                                                    .local_exchange_free_blocks_limit)
                            : 0);
        } else {
            shared_state->exchanger = BroadcastExchanger::create_unique(
                    cur_pipe->num_tasks(), _num_instances,
                    _runtime_state->query_options().__isset.local_exchange_free_blocks_limit
                            ? cast_set<int>(_runtime_state->query_options()
                                                    .local_exchange_free_blocks_limit)
                            : 0);
        }
        break;
    case ExchangeType::ADAPTIVE_PASSTHROUGH:
        shared_state->exchanger = AdaptivePassthroughExchanger::create_unique(
                std::max(cur_pipe->num_tasks(), _num_instances), _num_instances,
                _runtime_state->query_options().__isset.local_exchange_free_blocks_limit
                        ? cast_set<int>(
                                  _runtime_state->query_options().local_exchange_free_blocks_limit)
                        : 0);
        break;
    default:
        return Status::InternalError("Unsupported local exchange type : " +
                                     std::to_string((int)data_distribution.distribution_type));
    }
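    // Mapping recap: HASH_SHUFFLE -> ShuffleExchanger, BUCKET_HASH_SHUFFLE ->
    // BucketShuffleExchanger, PASSTHROUGH -> PassthroughExchanger, BROADCAST ->
    // BroadcastExchanger, PASS_TO_ONE -> PassToOneExchanger (BroadcastExchanger when the
    // shared broadcast hash table is disabled), ADAPTIVE_PASSTHROUGH ->
    // AdaptivePassthroughExchanger.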
    shared_state->create_source_dependencies(_num_instances, local_exchange_id, local_exchange_id,
                                             "LOCAL_EXCHANGE_OPERATOR");
    shared_state->create_sink_dependency(sink_id, local_exchange_id, "LOCAL_EXCHANGE_SINK");
    _op_id_to_shared_state.insert({local_exchange_id, {shared_state, shared_state->sink_deps}});

    // 3. Set the two pipelines' operator lists. For example, split pipeline [Scan - AggSink]
    // into pipeline1 [Scan - LocalExchangeSink] and pipeline2 [LocalExchangeSource - AggSink].

    // 3.1 Initialize the new pipeline's operator list.
    std::copy(operators.begin(), operators.begin() + idx,
              std::inserter(new_pip->operators(), new_pip->operators().end()));

    // 3.2 Erase unused operators in the previous pipeline.
    operators.erase(operators.begin(), operators.begin() + idx);

    // 4. Initialize LocalExchangeSource and insert it into this pipeline.
    OperatorPtr source_op;
    source_op.reset(new LocalExchangeSourceOperatorX(pool, local_exchange_id));
    RETURN_IF_ERROR(source_op->set_child(new_pip->operators().back()));
    RETURN_IF_ERROR(source_op->init(data_distribution.distribution_type));
    if (!operators.empty()) {
        RETURN_IF_ERROR(operators.front()->set_child(nullptr));
        RETURN_IF_ERROR(operators.front()->set_child(source_op));
    }
    operators.insert(operators.begin(), source_op);

    // 5. Set children for the two pipelines separately.
    std::vector<std::shared_ptr<Pipeline>> new_children;
    std::vector<PipelineId> edges_with_source;
    for (auto child : cur_pipe->children()) {
        bool found = false;
        for (auto op : new_pip->operators()) {
            if (child->sink()->node_id() == op->node_id()) {
                new_pip->set_children(child);
                found = true;
            };
        }
        if (!found) {
            new_children.push_back(child);
            edges_with_source.push_back(child->id());
        }
    }
    new_children.push_back(new_pip);
    edges_with_source.push_back(new_pip->id());

    // 6. Set the DAG for the new pipelines.
    if (!new_pip->children().empty()) {
        std::vector<PipelineId> edges_with_sink;
        for (auto child : new_pip->children()) {
            edges_with_sink.push_back(child->id());
        }
        _dag.insert({new_pip->id(), edges_with_sink});
    }
    cur_pipe->set_children(new_children);
    _dag[downstream_pipeline_id] = edges_with_source;
    RETURN_IF_ERROR(new_pip->sink()->set_child(new_pip->operators().back()));
    RETURN_IF_ERROR(cur_pipe->sink()->set_child(nullptr));
    RETURN_IF_ERROR(cur_pipe->sink()->set_child(cur_pipe->operators().back()));

    // 7. Inherit properties from the current pipeline.
    _inherit_pipeline_properties(data_distribution, cur_pipe, new_pip);
    return Status::OK();
}
Status PipelineFragmentContext::_add_local_exchange(
        int pip_idx, int idx, int node_id, ObjectPool* pool, PipelinePtr cur_pipe,
        DataDistribution data_distribution, bool* do_local_exchange, int num_buckets,
        const std::map<int, int>& bucket_seq_to_instance_idx,
        const std::map<int, int>& shuffle_idx_to_instance_idx) {
    if (_num_instances <= 1 || cur_pipe->num_tasks_of_parent() <= 1) {
        return Status::OK();
    }

    if (!cur_pipe->need_to_local_exchange(data_distribution, idx)) {
        return Status::OK();
    }
    *do_local_exchange = true;

    auto& operators = cur_pipe->operators();
    auto total_op_num = operators.size();
    auto new_pip = add_pipeline(cur_pipe, pip_idx + 1);
    RETURN_IF_ERROR(_add_local_exchange_impl(idx, pool, cur_pipe, new_pip, data_distribution,
                                             do_local_exchange, num_buckets,
                                             bucket_seq_to_instance_idx,
                                             shuffle_idx_to_instance_idx));

    CHECK(total_op_num + 1 == cur_pipe->operators().size() + new_pip->operators().size())
            << "total_op_num: " << total_op_num
            << " cur_pipe->operators().size(): " << cur_pipe->operators().size()
            << " new_pip->operators().size(): " << new_pip->operators().size();

    // Some local shuffles have relatively heavy operations on the sink side. If the local
    // sink concurrency is 1 and the local source concurrency is n, the sink becomes a
    // bottleneck, so a local passthrough is used to increase the concurrency of the sink:
    //   op -> local sink(1) -> local source (n)
    //   op -> local passthrough(1) -> local passthrough(n) -> local sink(n) -> local source (n)
    if (cur_pipe->num_tasks() > 1 && new_pip->num_tasks() == 1 &&
        Pipeline::heavy_operations_on_the_sink(data_distribution.distribution_type)) {
        RETURN_IF_ERROR(_add_local_exchange_impl(
                cast_set<int>(new_pip->operators().size()), pool, new_pip,
                add_pipeline(new_pip, pip_idx + 2), DataDistribution(ExchangeType::PASSTHROUGH),
                do_local_exchange, num_buckets, bucket_seq_to_instance_idx,
                shuffle_idx_to_instance_idx));
    }
    return Status::OK();
}

Status PipelineFragmentContext::_plan_local_exchange(
        int num_buckets, const std::map<int, int>& bucket_seq_to_instance_idx,
        const std::map<int, int>& shuffle_idx_to_instance_idx) {
    for (int pip_idx = cast_set<int>(_pipelines.size()) - 1; pip_idx >= 0; pip_idx--) {
        _pipelines[pip_idx]->init_data_distribution();
        // Set the property if the child pipeline is not a join operator's child.
        if (!_pipelines[pip_idx]->children().empty()) {
            for (auto& child : _pipelines[pip_idx]->children()) {
                if (child->sink()->node_id() ==
                    _pipelines[pip_idx]->operators().front()->node_id()) {
                    _pipelines[pip_idx]->set_data_distribution(child->data_distribution());
                }
            }
        }

        // `num_buckets == 0` means the fragment is co-located by the exchange node rather
        // than the scan node, so use `_num_instances` in place of `num_buckets` to prevent
        // dividing by 0 while still keeping a co-located plan after the local shuffle.
        RETURN_IF_ERROR(_plan_local_exchange(num_buckets == 0 ? _num_instances : num_buckets,
                                             pip_idx, _pipelines[pip_idx],
                                             bucket_seq_to_instance_idx,
                                             shuffle_idx_to_instance_idx));
    }
    return Status::OK();
}
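// The per-pipeline overload below loops because each successful split shortens the current
// pipeline and may expose further operators that still need their own local exchange.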
Status PipelineFragmentContext::_plan_local_exchange(
        int num_buckets, int pip_idx, PipelinePtr pip,
        const std::map<int, int>& bucket_seq_to_instance_idx,
        const std::map<int, int>& shuffle_idx_to_instance_idx) {
    int idx = 1;
    bool do_local_exchange = false;
    do {
        auto& ops = pip->operators();
        do_local_exchange = false;
        // Plan local exchange for each operator.
        for (; idx < ops.size();) {
            if (ops[idx]->required_data_distribution().need_local_exchange()) {
                RETURN_IF_ERROR(_add_local_exchange(
                        pip_idx, idx, ops[idx]->node_id(), _runtime_state->obj_pool(), pip,
                        ops[idx]->required_data_distribution(), &do_local_exchange, num_buckets,
                        bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx));
            }
            if (do_local_exchange) {
                // If a local exchange is needed for the current operator, this pipeline is
                // split in two by a local exchange sink/source pair. The remaining operators
                // must then be processed, so idx is reset to 2 (index 0 is the new local
                // exchange source and index 1 is the current operator, which was already
                // processed) and planning continues.
                idx = 2;
                break;
            }
            idx++;
        }
    } while (do_local_exchange);
    if (pip->sink()->required_data_distribution().need_local_exchange()) {
        RETURN_IF_ERROR(_add_local_exchange(
                pip_idx, idx, pip->sink()->node_id(), _runtime_state->obj_pool(), pip,
                pip->sink()->required_data_distribution(), &do_local_exchange, num_buckets,
                bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx));
    }
    return Status::OK();
}
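// Creates the fragment's output sink from the thrift sink descriptor. Each case checks the
// matching __isset flag before constructing the corresponding sink operator.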
Status PipelineFragmentContext::_create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink,
                                                  const std::vector<TExpr>& output_exprs,
                                                  const TPipelineFragmentParams& params,
                                                  const RowDescriptor& row_desc,
                                                  RuntimeState* state, DescriptorTbl& desc_tbl,
                                                  PipelineId cur_pipeline_id) {
    switch (thrift_sink.type) {
    case TDataSinkType::DATA_STREAM_SINK: {
        if (!thrift_sink.__isset.stream_sink) {
            return Status::InternalError("Missing data stream sink.");
        }
        _sink.reset(new ExchangeSinkOperatorX(state, row_desc, next_sink_operator_id(),
                                              thrift_sink.stream_sink, params.destinations,
                                              _fragment_instance_ids));
        break;
    }
    case TDataSinkType::RESULT_SINK: {
        if (!thrift_sink.__isset.result_sink) {
            return Status::InternalError("Missing data buffer sink.");
        }
        _sink.reset(new ResultSinkOperatorX(next_sink_operator_id(), row_desc, output_exprs,
                                            thrift_sink.result_sink));
        break;
    }
    case TDataSinkType::DICTIONARY_SINK: {
        if (!thrift_sink.__isset.dictionary_sink) {
            return Status::InternalError("Missing dict sink.");
        }
        _sink.reset(new DictSinkOperatorX(next_sink_operator_id(), row_desc, output_exprs,
                                          thrift_sink.dictionary_sink));
        break;
    }
    case TDataSinkType::GROUP_COMMIT_OLAP_TABLE_SINK:
    case TDataSinkType::OLAP_TABLE_SINK: {
        if (state->query_options().enable_memtable_on_sink_node &&
            !_has_inverted_index_v1_or_partial_update(thrift_sink.olap_table_sink) &&
            !config::is_cloud_mode()) {
            _sink.reset(new OlapTableSinkV2OperatorX(pool, next_sink_operator_id(), row_desc,
                                                     output_exprs));
        } else {
            _sink.reset(new OlapTableSinkOperatorX(pool, next_sink_operator_id(), row_desc,
                                                   output_exprs));
        }
        break;
    }
    case TDataSinkType::GROUP_COMMIT_BLOCK_SINK: {
        DCHECK(thrift_sink.__isset.olap_table_sink);
#ifndef NDEBUG
        DCHECK(state->get_query_ctx() != nullptr);
        state->get_query_ctx()->query_mem_tracker()->is_group_commit_load = true;
#endif
        _sink.reset(new GroupCommitBlockSinkOperatorX(next_sink_operator_id(), row_desc,
                                                      output_exprs));
        break;
    }
    case TDataSinkType::HIVE_TABLE_SINK: {
        if (!thrift_sink.__isset.hive_table_sink) {
            return Status::InternalError("Missing hive table sink.");
        }
        _sink.reset(
                new HiveTableSinkOperatorX(pool, next_sink_operator_id(), row_desc, output_exprs));
        break;
    }
    case TDataSinkType::ICEBERG_TABLE_SINK: {
        if (!thrift_sink.__isset.iceberg_table_sink) {
            return Status::InternalError("Missing iceberg table sink.");
        }
        _sink.reset(new IcebergTableSinkOperatorX(pool, next_sink_operator_id(), row_desc,
                                                  output_exprs));
        break;
    }
    case TDataSinkType::JDBC_TABLE_SINK: {
        if (!thrift_sink.__isset.jdbc_table_sink) {
            return Status::InternalError("Missing data jdbc sink.");
        }
        if (config::enable_java_support) {
            _sink.reset(
                    new JdbcTableSinkOperatorX(row_desc, next_sink_operator_id(), output_exprs));
        } else {
            return Status::InternalError(
                    "Jdbc table sink is not enabled, you can change be config "
                    "enable_java_support to true and restart be.");
        }
        break;
    }
    case TDataSinkType::MEMORY_SCRATCH_SINK: {
        if (!thrift_sink.__isset.memory_scratch_sink) {
            return Status::InternalError("Missing data buffer sink.");
        }
        _sink.reset(
                new MemoryScratchSinkOperatorX(row_desc, next_sink_operator_id(), output_exprs));
        break;
    }
    case TDataSinkType::RESULT_FILE_SINK: {
        if (!thrift_sink.__isset.result_file_sink) {
            return Status::InternalError("Missing result file sink.");
        }

        // The result file sink is not the top sink.
        if (params.__isset.destinations && !params.destinations.empty()) {
            _sink.reset(new ResultFileSinkOperatorX(next_sink_operator_id(), row_desc,
                                                    thrift_sink.result_file_sink,
                                                    params.destinations, output_exprs, desc_tbl));
        } else {
            _sink.reset(
                    new ResultFileSinkOperatorX(next_sink_operator_id(), row_desc, output_exprs));
        }
        break;
    }
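    // MULTI_CAST_DATA_STREAM_SINK: one multi-cast sink fans out to several exchange-sink
    // pipelines; each destination gets its own source/sink pair plus an edge in the DAG back
    // to the current pipeline.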
    case TDataSinkType::MULTI_CAST_DATA_STREAM_SINK: {
        DCHECK(thrift_sink.__isset.multi_cast_stream_sink);
        DCHECK_GT(thrift_sink.multi_cast_stream_sink.sinks.size(), 0);
        auto sink_id = next_sink_operator_id();
        auto sender_size = thrift_sink.multi_cast_stream_sink.sinks.size();
        // One sink has multiple sources.
        std::vector<int> sources;
        for (int i = 0; i < sender_size; ++i) {
            auto source_id = next_operator_id();
            sources.push_back(source_id);
        }

        _sink.reset(new MultiCastDataStreamSinkOperatorX(sink_id, sources, pool,
                                                         thrift_sink.multi_cast_stream_sink));
        for (int i = 0; i < sender_size; ++i) {
            auto new_pipeline = add_pipeline();
            // Used by the exchange sink.
            RowDescriptor* exchange_row_desc = nullptr;
            {
                const auto& tmp_row_desc =
                        !thrift_sink.multi_cast_stream_sink.sinks[i].output_exprs.empty()
                                ? RowDescriptor(state->desc_tbl(),
                                                {thrift_sink.multi_cast_stream_sink.sinks[i]
                                                         .output_tuple_id},
                                                {false})
                                : row_desc;
                exchange_row_desc = pool->add(new RowDescriptor(tmp_row_desc));
            }
            auto source_id = sources[i];
            OperatorPtr source_op;
            // 1. Create and set the multi_cast_data_stream_source operator for the new pipeline.
            source_op.reset(new MultiCastDataStreamerSourceOperatorX(
                    i, pool, thrift_sink.multi_cast_stream_sink.sinks[i], row_desc, source_id));
            RETURN_IF_ERROR(new_pipeline->add_operator(
                    source_op, params.__isset.parallel_instances ? params.parallel_instances : 0));

            // 2. Create and set the data stream sender sink operator for the new pipeline.
            DataSinkOperatorPtr sink_op;
            sink_op.reset(new ExchangeSinkOperatorX(
                    state, *exchange_row_desc, next_sink_operator_id(),
                    thrift_sink.multi_cast_stream_sink.sinks[i],
                    thrift_sink.multi_cast_stream_sink.destinations[i], _fragment_instance_ids));

            RETURN_IF_ERROR(new_pipeline->set_sink(sink_op));
            {
                TDataSink* t = pool->add(new TDataSink());
                t->stream_sink = thrift_sink.multi_cast_stream_sink.sinks[i];
                RETURN_IF_ERROR(sink_op->init(*t));
            }

            // 3. Set the dependency DAG.
            _dag[new_pipeline->id()].push_back(cur_pipeline_id);
        }
        if (sources.empty()) {
            return Status::InternalError("size of sources must be greater than 0");
        }
        break;
    }
    default:
        return Status::InternalError("Unsupported sink type in pipeline: {}", thrift_sink.type);
    }
    return Status::OK();
}

// NOLINTBEGIN(readability-function-size)
// NOLINTBEGIN(readability-function-cognitive-complexity)
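// Constructs one operator from its thrift node and attaches it to the current pipeline,
// creating extra pipelines (e.g. build sides of joins, union children) as needed. Because
// the thrift array is a preorder traversal, _pipeline_parent_map acts as a stack to find
// the pipeline an operator belongs to.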
Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNode& tnode,
                                                 const doris::TPipelineFragmentParams& request,
                                                 const DescriptorTbl& descs, OperatorPtr& op,
                                                 PipelinePtr& cur_pipe, int parent_idx,
                                                 int child_idx,
                                                 const bool followed_by_shuffled_operator) {
    // We construct the operator directly from thrift because the given array is in preorder
    // traversal order, so a stack-like structure is needed here.
    _pipeline_parent_map.pop(cur_pipe, parent_idx, child_idx);
    std::stringstream error_msg;
    bool enable_query_cache = request.fragment.__isset.query_cache_param;

    bool fe_with_old_version = false;
    switch (tnode.node_type) {
    case TPlanNodeType::OLAP_SCAN_NODE: {
        op.reset(new OlapScanOperatorX(
                pool, tnode, next_operator_id(), descs, _num_instances,
                enable_query_cache ? request.fragment.query_cache_param : TQueryCacheParam {}));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        fe_with_old_version = !tnode.__isset.is_serial_operator;
        break;
    }
    case TPlanNodeType::GROUP_COMMIT_SCAN_NODE: {
#ifndef NDEBUG
        DCHECK(_query_ctx != nullptr);
        _query_ctx->query_mem_tracker()->is_group_commit_load = true;
#endif
        op.reset(new GroupCommitOperatorX(pool, tnode, next_operator_id(), descs, _num_instances));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        fe_with_old_version = !tnode.__isset.is_serial_operator;
        break;
    }
    case doris::TPlanNodeType::JDBC_SCAN_NODE: {
        if (config::enable_java_support) {
            op.reset(new JDBCScanOperatorX(pool, tnode, next_operator_id(), descs, _num_instances));
            RETURN_IF_ERROR(cur_pipe->add_operator(
                    op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        } else {
            return Status::InternalError(
                    "Jdbc scan node is disabled, you can change be config enable_java_support "
                    "to true and restart be.");
        }
        fe_with_old_version = !tnode.__isset.is_serial_operator;
        break;
    }
    case doris::TPlanNodeType::FILE_SCAN_NODE: {
        op.reset(new FileScanOperatorX(pool, tnode, next_operator_id(), descs, _num_instances));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        fe_with_old_version = !tnode.__isset.is_serial_operator;
        break;
    }
    case TPlanNodeType::ES_SCAN_NODE:
    case TPlanNodeType::ES_HTTP_SCAN_NODE: {
        op.reset(new EsScanOperatorX(pool, tnode, next_operator_id(), descs, _num_instances));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        fe_with_old_version = !tnode.__isset.is_serial_operator;
        break;
    }
    case TPlanNodeType::EXCHANGE_NODE: {
        int num_senders = find_with_default(request.per_exch_num_senders, tnode.node_id, 0);
        DCHECK_GT(num_senders, 0);
        op.reset(new ExchangeSourceOperatorX(pool, tnode, next_operator_id(), descs, num_senders));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        fe_with_old_version = !tnode.__isset.is_serial_operator;
        break;
    }
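    // AGGREGATION_NODE dispatches on three strategies: distinct streaming aggregation (no
    // aggregate functions and streaming pre-aggregation enabled), streaming aggregation, and
    // full aggregation via a sink/source pipeline break, optionally spillable. A query-cache
    // source/sink pair may additionally be stitched in front.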
    case TPlanNodeType::AGGREGATION_NODE: {
        if (tnode.agg_node.grouping_exprs.empty() &&
            descs.get_tuple_descriptor(tnode.agg_node.output_tuple_id)->slots().empty()) {
            return Status::InternalError("Illegal aggregate node " + std::to_string(tnode.node_id) +
                                         ": group by and output is empty");
        }

        bool need_create_cache_op =
                enable_query_cache && tnode.node_id == request.fragment.query_cache_param.node_id;
        auto create_query_cache_operator = [&](PipelinePtr& new_pipe) {
            auto cache_node_id = request.local_params[0].per_node_scan_ranges.begin()->first;
            auto cache_source_id = next_operator_id();
            op.reset(new CacheSourceOperatorX(pool, cache_node_id, cache_source_id,
                                              request.fragment.query_cache_param));
            RETURN_IF_ERROR(cur_pipe->add_operator(
                    op, request.__isset.parallel_instances ? request.parallel_instances : 0));

            const auto downstream_pipeline_id = cur_pipe->id();
            if (_dag.find(downstream_pipeline_id) == _dag.end()) {
                _dag.insert({downstream_pipeline_id, {}});
            }
            new_pipe = add_pipeline(cur_pipe);
            _dag[downstream_pipeline_id].push_back(new_pipe->id());

            DataSinkOperatorPtr cache_sink(new CacheSinkOperatorX(
                    next_sink_operator_id(), cache_source_id, op->operator_id()));
            RETURN_IF_ERROR(new_pipe->set_sink(cache_sink));
            return Status::OK();
        };

        const bool group_by_limit_opt =
                tnode.agg_node.__isset.agg_sort_info_by_group_key && tnode.limit > 0;

        /// PartitionedAggSourceOperatorX does not support "group by limit opt (#29641)" yet.
        /// If `group_by_limit_opt` is true, it might not need to spill at all.
        const bool enable_spill = _runtime_state->enable_spill() &&
                                  !tnode.agg_node.grouping_exprs.empty() && !group_by_limit_opt;
        const bool is_streaming_agg = tnode.agg_node.__isset.use_streaming_preaggregation &&
                                      tnode.agg_node.use_streaming_preaggregation &&
                                      !tnode.agg_node.grouping_exprs.empty();
        const bool can_use_distinct_streaming_agg =
                is_streaming_agg && tnode.agg_node.aggregate_functions.empty() &&
                request.query_options.__isset.enable_distinct_streaming_aggregation &&
                request.query_options.enable_distinct_streaming_aggregation;

        if (can_use_distinct_streaming_agg) {
            if (need_create_cache_op) {
                PipelinePtr new_pipe;
                RETURN_IF_ERROR(create_query_cache_operator(new_pipe));

                op.reset(new DistinctStreamingAggOperatorX(pool, next_operator_id(), tnode, descs,
                                                           _require_bucket_distribution));
                op->set_followed_by_shuffled_operator(false);
                _require_bucket_distribution = true;
                RETURN_IF_ERROR(new_pipe->add_operator(
                        op, request.__isset.parallel_instances ? request.parallel_instances : 0));
                RETURN_IF_ERROR(cur_pipe->operators().front()->set_child(op));
                cur_pipe = new_pipe;
            } else {
                op.reset(new DistinctStreamingAggOperatorX(pool, next_operator_id(), tnode, descs,
                                                           _require_bucket_distribution));
                op->set_followed_by_shuffled_operator(followed_by_shuffled_operator);
                _require_bucket_distribution =
                        _require_bucket_distribution || op->require_data_distribution();
                RETURN_IF_ERROR(cur_pipe->add_operator(
                        op, request.__isset.parallel_instances ? request.parallel_instances : 0));
            }
        } else if (is_streaming_agg) {
            if (need_create_cache_op) {
                PipelinePtr new_pipe;
                RETURN_IF_ERROR(create_query_cache_operator(new_pipe));

                op.reset(new StreamingAggOperatorX(pool, next_operator_id(), tnode, descs));
                RETURN_IF_ERROR(cur_pipe->operators().front()->set_child(op));
                RETURN_IF_ERROR(new_pipe->add_operator(
                        op, request.__isset.parallel_instances ? request.parallel_instances : 0));
                cur_pipe = new_pipe;
            } else {
                op.reset(new StreamingAggOperatorX(pool, next_operator_id(), tnode, descs));
                RETURN_IF_ERROR(cur_pipe->add_operator(
                        op, request.__isset.parallel_instances ? request.parallel_instances : 0));
            }
        } else {
            // Create a new pipeline to add the query cache operator.
            PipelinePtr new_pipe;
            if (need_create_cache_op) {
                RETURN_IF_ERROR(create_query_cache_operator(new_pipe));
            }

            if (enable_spill) {
                op.reset(new PartitionedAggSourceOperatorX(pool, tnode, next_operator_id(), descs));
            } else {
                op.reset(new AggSourceOperatorX(pool, tnode, next_operator_id(), descs));
            }
            if (need_create_cache_op) {
                RETURN_IF_ERROR(cur_pipe->operators().front()->set_child(op));
                RETURN_IF_ERROR(new_pipe->add_operator(
                        op, request.__isset.parallel_instances ? request.parallel_instances : 0));
                cur_pipe = new_pipe;
            } else {
                RETURN_IF_ERROR(cur_pipe->add_operator(
                        op, request.__isset.parallel_instances ? request.parallel_instances : 0));
            }

            const auto downstream_pipeline_id = cur_pipe->id();
            if (_dag.find(downstream_pipeline_id) == _dag.end()) {
                _dag.insert({downstream_pipeline_id, {}});
            }
            cur_pipe = add_pipeline(cur_pipe);
            _dag[downstream_pipeline_id].push_back(cur_pipe->id());

            DataSinkOperatorPtr sink;
            if (enable_spill) {
                sink.reset(new PartitionedAggSinkOperatorX(pool, next_sink_operator_id(),
                                                           op->operator_id(), tnode, descs,
                                                           _require_bucket_distribution));
            } else {
                sink.reset(new AggSinkOperatorX(pool, next_sink_operator_id(), op->operator_id(),
                                                tnode, descs, _require_bucket_distribution));
            }
            sink->set_followed_by_shuffled_operator(followed_by_shuffled_operator);
            _require_bucket_distribution =
                    _require_bucket_distribution || sink->require_data_distribution();
            RETURN_IF_ERROR(cur_pipe->set_sink(sink));
            RETURN_IF_ERROR(cur_pipe->sink()->init(tnode, _runtime_state.get()));
        }
        break;
    }
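    // HASH_JOIN_NODE: when spill is enabled (and the join is not broadcast), the plain
    // build/probe operators are wrapped by partitioned variants; the inner probe-side build
    // operator uses a copy of the plan node without runtime filters, since it only rebuilds
    // hash tables from spilled data.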
    case TPlanNodeType::HASH_JOIN_NODE: {
        const auto is_broadcast_join = tnode.hash_join_node.__isset.is_broadcast_join &&
                                       tnode.hash_join_node.is_broadcast_join;
        const auto enable_spill = _runtime_state->enable_spill();
        if (enable_spill && !is_broadcast_join) {
            auto tnode_ = tnode;
            tnode_.runtime_filters.clear();
            uint32_t partition_count = _runtime_state->spill_hash_join_partition_count();
            auto inner_probe_operator =
                    std::make_shared<HashJoinProbeOperatorX>(pool, tnode_, 0, descs);
            // The probe-side inner sink operator is used to build a hash table on the probe
            // side when data is spilled, so `tnode_`, which has no runtime filters, is used
            // here.
            auto probe_side_inner_sink_operator =
                    std::make_shared<HashJoinBuildSinkOperatorX>(pool, 0, 0, tnode_, descs);
            RETURN_IF_ERROR(inner_probe_operator->init(tnode_, _runtime_state.get()));
            RETURN_IF_ERROR(probe_side_inner_sink_operator->init(tnode_, _runtime_state.get()));

            auto probe_operator = std::make_shared<PartitionedHashJoinProbeOperatorX>(
                    pool, tnode_, next_operator_id(), descs, partition_count);
            probe_operator->set_inner_operators(probe_side_inner_sink_operator,
                                                inner_probe_operator);
            op = std::move(probe_operator);
            RETURN_IF_ERROR(cur_pipe->add_operator(
                    op, request.__isset.parallel_instances ? request.parallel_instances : 0));

            const auto downstream_pipeline_id = cur_pipe->id();
            if (_dag.find(downstream_pipeline_id) == _dag.end()) {
                _dag.insert({downstream_pipeline_id, {}});
            }
            PipelinePtr build_side_pipe = add_pipeline(cur_pipe);
            _dag[downstream_pipeline_id].push_back(build_side_pipe->id());

            auto inner_sink_operator =
                    std::make_shared<HashJoinBuildSinkOperatorX>(pool, 0, 0, tnode, descs);
            auto sink_operator = std::make_shared<PartitionedHashJoinSinkOperatorX>(
                    pool, next_sink_operator_id(), op->operator_id(), tnode_, descs,
                    partition_count);
            RETURN_IF_ERROR(inner_sink_operator->init(tnode, _runtime_state.get()));
            sink_operator->set_inner_operators(inner_sink_operator, inner_probe_operator);
            DataSinkOperatorPtr sink = std::move(sink_operator);
            RETURN_IF_ERROR(build_side_pipe->set_sink(sink));
            RETURN_IF_ERROR(build_side_pipe->sink()->init(tnode_, _runtime_state.get()));

            _pipeline_parent_map.push(op->node_id(), cur_pipe);
            _pipeline_parent_map.push(op->node_id(), build_side_pipe);
            sink->set_followed_by_shuffled_operator(sink->is_shuffled_operator());
            op->set_followed_by_shuffled_operator(op->is_shuffled_operator());
        } else {
            op.reset(new HashJoinProbeOperatorX(pool, tnode, next_operator_id(), descs));
            RETURN_IF_ERROR(cur_pipe->add_operator(
                    op, request.__isset.parallel_instances ? request.parallel_instances : 0));

            const auto downstream_pipeline_id = cur_pipe->id();
            if (_dag.find(downstream_pipeline_id) == _dag.end()) {
                _dag.insert({downstream_pipeline_id, {}});
            }
            PipelinePtr build_side_pipe = add_pipeline(cur_pipe);
            _dag[downstream_pipeline_id].push_back(build_side_pipe->id());

            DataSinkOperatorPtr sink;
            sink.reset(new HashJoinBuildSinkOperatorX(pool, next_sink_operator_id(),
                                                      op->operator_id(), tnode, descs));
            RETURN_IF_ERROR(build_side_pipe->set_sink(sink));
            RETURN_IF_ERROR(build_side_pipe->sink()->init(tnode, _runtime_state.get()));

            _pipeline_parent_map.push(op->node_id(), cur_pipe);
            _pipeline_parent_map.push(op->node_id(), build_side_pipe);
            sink->set_followed_by_shuffled_operator(sink->is_shuffled_operator());
            op->set_followed_by_shuffled_operator(op->is_shuffled_operator());
        }
        if (is_broadcast_join && _runtime_state->enable_share_hash_table_for_broadcast_join()) {
            std::shared_ptr<HashJoinSharedState> shared_state =
                    HashJoinSharedState::create_shared(_num_instances);
            for (int i = 0; i < _num_instances; i++) {
                auto sink_dep = std::make_shared<Dependency>(op->operator_id(), op->node_id(),
                                                             "HASH_JOIN_BUILD_DEPENDENCY");
                sink_dep->set_shared_state(shared_state.get());
                shared_state->sink_deps.push_back(sink_dep);
            }
            shared_state->create_source_dependencies(_num_instances, op->operator_id(),
                                                     op->node_id(), "HASH_JOIN_PROBE");
            _op_id_to_shared_state.insert(
                    {op->operator_id(), {shared_state, shared_state->sink_deps}});
        }
        _require_bucket_distribution =
                _require_bucket_distribution || op->require_data_distribution();
        break;
    }
    case TPlanNodeType::CROSS_JOIN_NODE: {
        op.reset(new NestedLoopJoinProbeOperatorX(pool, tnode, next_operator_id(), descs));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));

        const auto downstream_pipeline_id = cur_pipe->id();
        if (_dag.find(downstream_pipeline_id) == _dag.end()) {
            _dag.insert({downstream_pipeline_id, {}});
        }
        PipelinePtr build_side_pipe = add_pipeline(cur_pipe);
        _dag[downstream_pipeline_id].push_back(build_side_pipe->id());

        DataSinkOperatorPtr sink;
        sink.reset(new NestedLoopJoinBuildSinkOperatorX(pool, next_sink_operator_id(),
                                                        op->operator_id(), tnode, descs));
        RETURN_IF_ERROR(build_side_pipe->set_sink(sink));
        RETURN_IF_ERROR(build_side_pipe->sink()->init(tnode, _runtime_state.get()));
        _pipeline_parent_map.push(op->node_id(), cur_pipe);
        _pipeline_parent_map.push(op->node_id(), build_side_pipe);
        break;
    }
    case TPlanNodeType::UNION_NODE: {
        int child_count = tnode.num_children;
        op.reset(new UnionSourceOperatorX(pool, tnode, next_operator_id(), descs));
        op->set_followed_by_shuffled_operator(_require_bucket_distribution);
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));

        const auto downstream_pipeline_id = cur_pipe->id();
        if (_dag.find(downstream_pipeline_id) == _dag.end()) {
            _dag.insert({downstream_pipeline_id, {}});
        }
        for (int i = 0; i < child_count; i++) {
            PipelinePtr build_side_pipe = add_pipeline(cur_pipe);
            _dag[downstream_pipeline_id].push_back(build_side_pipe->id());
            DataSinkOperatorPtr sink;
            sink.reset(new UnionSinkOperatorX(i, next_sink_operator_id(), op->operator_id(), pool,
                                              tnode, descs));
            sink->set_followed_by_shuffled_operator(_require_bucket_distribution);
            RETURN_IF_ERROR(build_side_pipe->set_sink(sink));
            RETURN_IF_ERROR(build_side_pipe->sink()->init(tnode, _runtime_state.get()));
            // Preset child pipelines: any pipeline that finds this node as its parent will
            // use the prepared pipeline to build.
            _pipeline_parent_map.push(op->node_id(), build_side_pipe);
        }
        break;
    }
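    // SORT_NODE picks one of three source/sink pairs: spill sort (full sort with spill
    // enabled), local merge sort, or the plain sort operator.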
    case TPlanNodeType::SORT_NODE: {
        const auto should_spill = _runtime_state->enable_spill() &&
                                  tnode.sort_node.algorithm == TSortAlgorithm::FULL_SORT;
        const bool use_local_merge =
                tnode.sort_node.__isset.use_local_merge && tnode.sort_node.use_local_merge;
        if (should_spill) {
            op.reset(new SpillSortSourceOperatorX(pool, tnode, next_operator_id(), descs));
        } else if (use_local_merge) {
            op.reset(new LocalMergeSortSourceOperatorX(pool, tnode, next_operator_id(), descs));
        } else {
            op.reset(new SortSourceOperatorX(pool, tnode, next_operator_id(), descs));
        }
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        const auto downstream_pipeline_id = cur_pipe->id();
        if (_dag.find(downstream_pipeline_id) == _dag.end()) {
            _dag.insert({downstream_pipeline_id, {}});
        }
        cur_pipe = add_pipeline(cur_pipe);
        _dag[downstream_pipeline_id].push_back(cur_pipe->id());

        DataSinkOperatorPtr sink;
        if (should_spill) {
            sink.reset(new SpillSortSinkOperatorX(pool, next_sink_operator_id(),
                                                  op->operator_id(), tnode, descs,
                                                  _require_bucket_distribution));
        } else {
            sink.reset(new SortSinkOperatorX(pool, next_sink_operator_id(), op->operator_id(),
                                             tnode, descs, _require_bucket_distribution));
        }
        sink->set_followed_by_shuffled_operator(followed_by_shuffled_operator);
        _require_bucket_distribution =
                _require_bucket_distribution || sink->require_data_distribution();
        RETURN_IF_ERROR(cur_pipe->set_sink(sink));
        RETURN_IF_ERROR(cur_pipe->sink()->init(tnode, _runtime_state.get()));
        break;
    }
    case doris::TPlanNodeType::PARTITION_SORT_NODE: {
        op.reset(new PartitionSortSourceOperatorX(pool, tnode, next_operator_id(), descs));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        const auto downstream_pipeline_id = cur_pipe->id();
        if (_dag.find(downstream_pipeline_id) == _dag.end()) {
            _dag.insert({downstream_pipeline_id, {}});
        }
        cur_pipe = add_pipeline(cur_pipe);
        _dag[downstream_pipeline_id].push_back(cur_pipe->id());

        DataSinkOperatorPtr sink;
        sink.reset(new PartitionSortSinkOperatorX(pool, next_sink_operator_id(),
                                                  op->operator_id(), tnode, descs));
        RETURN_IF_ERROR(cur_pipe->set_sink(sink));
        RETURN_IF_ERROR(cur_pipe->sink()->init(tnode, _runtime_state.get()));
        break;
    }
    case TPlanNodeType::ANALYTIC_EVAL_NODE: {
        op.reset(new AnalyticSourceOperatorX(pool, tnode, next_operator_id(), descs));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        const auto downstream_pipeline_id = cur_pipe->id();
        if (_dag.find(downstream_pipeline_id) == _dag.end()) {
            _dag.insert({downstream_pipeline_id, {}});
        }
        cur_pipe = add_pipeline(cur_pipe);
        _dag[downstream_pipeline_id].push_back(cur_pipe->id());

        DataSinkOperatorPtr sink;
        sink.reset(new AnalyticSinkOperatorX(pool, next_sink_operator_id(), op->operator_id(),
                                             tnode, descs, _require_bucket_distribution));
        sink->set_followed_by_shuffled_operator(followed_by_shuffled_operator);
        _require_bucket_distribution =
                _require_bucket_distribution || sink->require_data_distribution();
        RETURN_IF_ERROR(cur_pipe->set_sink(sink));
        RETURN_IF_ERROR(cur_pipe->sink()->init(tnode, _runtime_state.get()));
        break;
    }
    case TPlanNodeType::INTERSECT_NODE: {
        RETURN_IF_ERROR(_build_operators_for_set_operation_node<true>(
                pool, tnode, descs, op, cur_pipe, parent_idx, child_idx, request));
        op->set_followed_by_shuffled_operator(_require_bucket_distribution);
        break;
    }
    case TPlanNodeType::EXCEPT_NODE: {
        RETURN_IF_ERROR(_build_operators_for_set_operation_node<false>(
                pool, tnode, descs, op, cur_pipe, parent_idx, child_idx, request));
        op->set_followed_by_shuffled_operator(_require_bucket_distribution);
        break;
    }
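    // INTERSECT_NODE and EXCEPT_NODE above share one templated builder (defined
    // after this function); the <is_intersect> flag is the only difference.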
    case TPlanNodeType::REPEAT_NODE: {
        op.reset(new RepeatOperatorX(pool, tnode, next_operator_id(), descs));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        break;
    }
    case TPlanNodeType::TABLE_FUNCTION_NODE: {
        op.reset(new TableFunctionOperatorX(pool, tnode, next_operator_id(), descs));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        break;
    }
    case TPlanNodeType::ASSERT_NUM_ROWS_NODE: {
        op.reset(new AssertNumRowsOperatorX(pool, tnode, next_operator_id(), descs));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        break;
    }
    case TPlanNodeType::EMPTY_SET_NODE: {
        op.reset(new EmptySetSourceOperatorX(pool, tnode, next_operator_id(), descs));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        break;
    }
    case TPlanNodeType::DATA_GEN_SCAN_NODE: {
        op.reset(new DataGenSourceOperatorX(pool, tnode, next_operator_id(), descs));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        fe_with_old_version = !tnode.__isset.is_serial_operator;
        break;
    }
    case TPlanNodeType::SCHEMA_SCAN_NODE: {
        op.reset(new SchemaScanOperatorX(pool, tnode, next_operator_id(), descs));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        break;
    }
    case TPlanNodeType::META_SCAN_NODE: {
        op.reset(new MetaScanOperatorX(pool, tnode, next_operator_id(), descs));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        break;
    }
    case TPlanNodeType::SELECT_NODE: {
        op.reset(new SelectOperatorX(pool, tnode, next_operator_id(), descs));
        RETURN_IF_ERROR(cur_pipe->add_operator(
                op, request.__isset.parallel_instances ? request.parallel_instances : 0));
        break;
    }
    default:
        return Status::InternalError("Unsupported exec type in pipeline: {}",
                                     print_plan_node_type(tnode.node_type));
    }
    if (request.__isset.parallel_instances && fe_with_old_version) {
        cur_pipe->set_num_tasks(request.parallel_instances);
        op->set_serial_operator();
    }

    return Status::OK();
}
// NOLINTEND(readability-function-cognitive-complexity)
// NOLINTEND(readability-function-size)

template <bool is_intersect>
Status PipelineFragmentContext::_build_operators_for_set_operation_node(
        ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs, OperatorPtr& op,
        PipelinePtr& cur_pipe, int parent_idx, int child_idx,
        const doris::TPipelineFragmentParams& request) {
    op.reset(new SetSourceOperatorX<is_intersect>(pool, tnode, next_operator_id(), descs));
    RETURN_IF_ERROR(cur_pipe->add_operator(
            op, request.__isset.parallel_instances ? request.parallel_instances : 0));

    const auto downstream_pipeline_id = cur_pipe->id();
    if (_dag.find(downstream_pipeline_id) == _dag.end()) {
        _dag.insert({downstream_pipeline_id, {}});
    }

    for (int child_id = 0; child_id < tnode.num_children; child_id++) {
        PipelinePtr probe_side_pipe = add_pipeline(cur_pipe);
        _dag[downstream_pipeline_id].push_back(probe_side_pipe->id());

        DataSinkOperatorPtr sink;
        if (child_id == 0) {
            sink.reset(new SetSinkOperatorX<is_intersect>(child_id, next_sink_operator_id(),
                                                          op->operator_id(), pool, tnode, descs));
        } else {
            sink.reset(new SetProbeSinkOperatorX<is_intersect>(
                    child_id, next_sink_operator_id(), op->operator_id(), pool, tnode, descs));
        }
        RETURN_IF_ERROR(probe_side_pipe->set_sink(sink));
        RETURN_IF_ERROR(probe_side_pipe->sink()->init(tnode, _runtime_state.get()));
        // Prepare children pipelines. If any pipeline finds this node as its
        // parent, it will use the prepared pipeline to build.
        _pipeline_parent_map.push(op->node_id(), probe_side_pipe);
    }

    return Status::OK();
}
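// Note: no explicit template instantiations are needed here; both the <true>
// and <false> variants of the builder above are odr-used by the INTERSECT_NODE
// and EXCEPT_NODE cases in the same translation unit.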
Status PipelineFragmentContext::submit() {
    if (_submitted) {
        return Status::InternalError("submitted");
    }
    _submitted = true;

    int submit_tasks = 0;
    Status st;
    auto* scheduler = _query_ctx->get_pipe_exec_scheduler();
    for (auto& task : _tasks) {
        for (auto& t : task) {
            st = scheduler->schedule_task(t);
            if (!st) {
                cancel(Status::InternalError("submit context to executor failed"));
                std::lock_guard<std::mutex> l(_task_mutex);
                _total_tasks = submit_tasks;
                break;
            }
            submit_tasks++;
        }
    }
    if (!st.ok()) {
        std::lock_guard<std::mutex> l(_task_mutex);
        if (_closed_tasks == _total_tasks) {
            _close_fragment_instance();
        }
        return Status::InternalError("Submit pipeline failed. err = {}, BE: {}", st.to_string(),
                                     BackendOptions::get_localhost());
    } else {
        return st;
    }
}

void PipelineFragmentContext::print_profile(const std::string& extra_info) {
    if (_runtime_state->enable_profile()) {
        std::stringstream ss;
        for (auto runtime_profile_ptr : _runtime_state->pipeline_id_to_profile()) {
            runtime_profile_ptr->pretty_print(&ss);
        }

        if (_runtime_state->load_channel_profile()) {
            _runtime_state->load_channel_profile()->pretty_print(&ss);
        }

        auto profile_str =
                fmt::format("Query {} fragment {} {}, profile, {}", print_id(this->_query_id),
                            this->_fragment_id, extra_info, ss.str());
        LOG_LONG_STRING(INFO, profile_str);
    }
}
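// Rough task lifecycle, for orientation (counts illustrative): submit()
// schedules every task; each finished task ends up in decrement_running_task();
// when _closed_tasks == _total_tasks the last one calls
// _close_fragment_instance() below.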
// If all pipeline tasks bound to the fragment instance are finished, then we
// can close the fragment instance.
void PipelineFragmentContext::_close_fragment_instance() {
    if (_is_fragment_instance_closed) {
        return;
    }
    Defer defer_op {[&]() { _is_fragment_instance_closed = true; }};
    _fragment_level_profile->total_time_counter()->update(_fragment_watcher.elapsed_time());
    static_cast<void>(send_report(true));
    // Printing the profile content to the info log is a temporary solution for
    // stream load and external_connector. Since stream load does not have
    // something like a coordinator on the FE, the backend cannot report the
    // profile to the FE, and its profile cannot be shown in the same way as for
    // other queries. So we print the profile content to the info log.
    if (_runtime_state->enable_profile() &&
        (_query_ctx->get_query_source() == QuerySource::STREAM_LOAD ||
         _query_ctx->get_query_source() == QuerySource::EXTERNAL_CONNECTOR ||
         _query_ctx->get_query_source() == QuerySource::GROUP_COMMIT_LOAD)) {
        std::stringstream ss;
        // Compute _local_time_percent before pretty-printing the runtime profile.
        // Before this step the output looked like:
        //     UNION_NODE (id=0):(Active: 56.720us, non-child: 00.00%)
        // After it, the output looks like:
        //     UNION_NODE (id=0):(Active: 56.720us, non-child: 82.53%)
        // so each exec node's execution time, excluding time consumed by its
        // children, is easy to read.
        for (auto runtime_profile_ptr : _runtime_state->pipeline_id_to_profile()) {
            runtime_profile_ptr->pretty_print(&ss);
        }

        if (_runtime_state->load_channel_profile()) {
            _runtime_state->load_channel_profile()->pretty_print(&ss);
        }

        LOG_INFO("Query {} fragment {} profile:\n {}", print_id(_query_id), _fragment_id,
                 ss.str());
    }

    if (_query_ctx->enable_profile()) {
        _query_ctx->add_fragment_profile(_fragment_id, collect_realtime_profile(),
                                         collect_realtime_load_channel_profile());
    }

    // all submitted tasks done
    _exec_env->fragment_mgr()->remove_pipeline_context({_query_id, _fragment_id});
}

void PipelineFragmentContext::decrement_running_task(PipelineId pipeline_id) {
    // If all tasks of this pipeline have been closed, its upstream tasks are no
    // longer needed, so we just make them all runnable here.
    DCHECK(_pip_id_to_pipeline.contains(pipeline_id));
    if (_pip_id_to_pipeline[pipeline_id]->close_task()) {
        if (_dag.contains(pipeline_id)) {
            for (auto dep : _dag[pipeline_id]) {
                _pip_id_to_pipeline[dep]->make_all_runnable();
            }
        }
    }
    std::lock_guard<std::mutex> l(_task_mutex);
    ++_closed_tasks;
    if (_closed_tasks == _total_tasks) {
        _close_fragment_instance();
    }
}

std::string PipelineFragmentContext::get_load_error_url() {
    if (const auto& str = _runtime_state->get_error_log_file_path(); !str.empty()) {
        return to_load_error_http_path(str);
    }
    for (auto& task_states : _task_runtime_states) {
        for (auto& task_state : task_states) {
            if (!task_state) {
                continue;
            }
            if (const auto& str = task_state->get_error_log_file_path(); !str.empty()) {
                return to_load_error_http_path(str);
            }
        }
    }
    return "";
}

Status PipelineFragmentContext::send_report(bool done) {
    Status exec_status = _query_ctx->exec_status();
    // If the plan finished successfully but _is_report_success is false,
    // there is no need to send a report.
    // Load sets _is_report_success to true because load wants to know the progress.
    if (!_is_report_success && done && exec_status.ok()) {
        return Status::NeedSendAgain("");
    }

    // If both _is_report_success and _is_report_on_cancel are false, no report
    // is needed regardless of whether the query succeeded or failed. This may
    // happen when the query limit is reached and an internal cancellation is
    // being processed. When the limit is reached the fragment is also
    // cancelled, but _is_report_on_cancel will be set to false to avoid
    // sending a faulty report to the FE.
    if (!_is_report_success && !_is_report_on_cancel) {
        return Status::NeedSendAgain("");
    }

    std::vector<RuntimeState*> runtime_states;

    for (auto& task_states : _task_runtime_states) {
        for (auto& task_state : task_states) {
            if (task_state) {
                runtime_states.push_back(task_state.get());
            }
        }
    }

    std::string load_error_url = _query_ctx->get_load_error_url().empty()
                                         ? get_load_error_url()
                                         : _query_ctx->get_load_error_url();

    ReportStatusRequest req {exec_status,
                             runtime_states,
                             done || !exec_status.ok(),
                             _query_ctx->coord_addr,
                             _query_id,
                             _fragment_id,
                             TUniqueId(),
                             -1,
                             _runtime_state.get(),
                             load_error_url,
                             [this](const Status& reason) { cancel(reason); }};

    return _report_status_cb(
            req, std::dynamic_pointer_cast<PipelineFragmentContext>(shared_from_this()));
}
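// The two accessors below drive memory revocation (spilling): the scheduler
// first asks how much memory this fragment could release, then collects the
// concrete tasks whose revocable size clears the minimum spill batch.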
size_t PipelineFragmentContext::get_revocable_size(bool* has_running_task) const {
    size_t res = 0;
    // _tasks is only cleared in ~PipelineFragmentContext, so it is safe to
    // traverse the vector here.
    for (const auto& task_instances : _tasks) {
        for (const auto& task : task_instances) {
            if (task->is_running() || task->is_revoking()) {
                LOG_EVERY_N(INFO, 50) << "Query: " << print_id(_query_id)
                                      << " is running, task: " << (void*)task.get()
                                      << ", is_revoking: " << task->is_revoking()
                                      << ", is_running: " << task->is_running();
                *has_running_task = true;
                return 0;
            }

            size_t revocable_size = task->get_revocable_size();
            if (revocable_size >= vectorized::SpillStream::MIN_SPILL_WRITE_BATCH_MEM) {
                res += revocable_size;
            }
        }
    }
    return res;
}

std::vector<PipelineTask*> PipelineFragmentContext::get_revocable_tasks() const {
    std::vector<PipelineTask*> revocable_tasks;
    for (const auto& task_instances : _tasks) {
        for (const auto& task : task_instances) {
            size_t revocable_size_ = task->get_revocable_size();
            if (revocable_size_ >= vectorized::SpillStream::MIN_SPILL_WRITE_BATCH_MEM) {
                revocable_tasks.emplace_back(task.get());
            }
        }
    }
    return revocable_tasks;
}

std::string PipelineFragmentContext::debug_string() {
    fmt::memory_buffer debug_string_buffer;
    fmt::format_to(debug_string_buffer, "PipelineFragmentContext Info:\n");
    for (size_t j = 0; j < _tasks.size(); j++) {
        fmt::format_to(debug_string_buffer, "Tasks in instance {}:\n", j);
        for (size_t i = 0; i < _tasks[j].size(); i++) {
            fmt::format_to(debug_string_buffer, "Task {}: {}\n", i, _tasks[j][i]->debug_string());
        }
    }

    return fmt::to_string(debug_string_buffer);
}

std::vector<std::shared_ptr<TRuntimeProfileTree>>
PipelineFragmentContext::collect_realtime_profile() const {
    std::vector<std::shared_ptr<TRuntimeProfileTree>> res;
    // We do not have a mutex protecting pipeline_id_to_profile, so we need to
    // make sure this function is invoked after the fragment context has
    // already been prepared.
    if (!_prepared) {
        std::string msg =
                "Query " + print_id(_query_id) + " collecting profile, but it is not prepared";
        DCHECK(false) << msg;
        LOG_ERROR(msg);
        return res;
    }

    // Make sure the first profile is the fragment-level profile.
    auto fragment_profile = std::make_shared<TRuntimeProfileTree>();
    _fragment_level_profile->to_thrift(fragment_profile.get(), _runtime_state->profile_level());
    res.push_back(fragment_profile);

    // pipeline_id_to_profile is initialized in the prepare stage
    for (auto pipeline_profile : _runtime_state->pipeline_id_to_profile()) {
        auto profile_ptr = std::make_shared<TRuntimeProfileTree>();
        pipeline_profile->to_thrift(profile_ptr.get(), _runtime_state->profile_level());
        res.push_back(profile_ptr);
    }

    return res;
}
std::shared_ptr<TRuntimeProfileTree>
PipelineFragmentContext::collect_realtime_load_channel_profile() const {
    // We do not have a mutex protecting pipeline_id_to_profile, so we need to
    // make sure this function is invoked after the fragment context has
    // already been prepared.
    if (!_prepared) {
        std::string msg =
                "Query " + print_id(_query_id) + " collecting profile, but it is not prepared";
        DCHECK(false) << msg;
        LOG_ERROR(msg);
        return nullptr;
    }

    for (auto& runtime_states : _task_runtime_states) {
        for (auto& runtime_state : runtime_states) {
            if (runtime_state == nullptr || runtime_state->runtime_profile() == nullptr) {
                continue;
            }

            auto tmp_load_channel_profile = std::make_shared<TRuntimeProfileTree>();

            runtime_state->runtime_profile()->to_thrift(tmp_load_channel_profile.get(),
                                                        _runtime_state->profile_level());
            _runtime_state->load_channel_profile()->update(*tmp_load_channel_profile);
        }
    }

    auto load_channel_profile = std::make_shared<TRuntimeProfileTree>();
    _runtime_state->load_channel_profile()->to_thrift(load_channel_profile.get(),
                                                      _runtime_state->profile_level());
    return load_channel_profile;
}

#include "common/compile_check_end.h"
} // namespace doris::pipeline