sql/window.h

/* Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */

#ifndef WINDOWS_INCLUDED
#define WINDOWS_INCLUDED

/*
  Some Window-related symbols must be known to sql_lex.h which is a frequently
  included header. To avoid that any change to window.h causes a recompilation
  of the whole Server, those symbols go into this header:
*/
#include "window_lex.h"

#ifdef WF_DEBUG
#include <unordered_map>
#endif

#include "sql_error.h"
#include "sql_list.h"

#include <cstring> // std::memcpy

class PT_border;
class PT_order_list;
class PT_frame;
class PT_window;
class Item_string;
class Item_func;
class Temp_table_param;

/**
  Represents the (explicit) window of a SQL 2003 section 7.11 \<window clause\>,
  or the implicit (inlined) window of a window function call, or a reference to
  a named window in a window function call (instead of the inlined definition)
  before resolution. After resolution, referencing instances become unused,
  having been replaced with the window they resolve to in the window function
  call.

  @verbatim
  Cf. 7.11 <window definition> and <existing window name>
      6.10 <window name or specification> and <in-line window specification>
      5.4 <window name>
  @endverbatim

  See also PT_window (which wraps Window as a parse_tree_node), and the related
  classes PT_frame, PT_border and PT_exclusion in parse_tree_nodes.

  Currently includes both prepared query and execution state information.
  The latter is marked as such for ease of separation later.
*/
class Window
{
  /*------------------------------------------------------------------------
   *
   * Variables stable during execution
   *
   *------------------------------------------------------------------------*/
protected:
  SELECT_LEX *m_select;                  ///< The SELECT the window is on
  PT_order_list * const m_partition_by;  ///< \<window partition clause\>
  PT_order_list * const m_order_by;      ///< \<window order clause\>
  ORDER *m_sorting_order;                ///< merged partition/order by
  bool m_sort_redundant;                 ///< Can use sort from previous window
  PT_frame * const m_frame;              ///< \<window frame clause\>
  Item_string *m_name;                   ///< \<window name\>
  /**
    Position of definition in query's text, 1 for leftmost. References don't
    count. Thus, anonymous windows in the SELECT list come first, then the
    windows of the WINDOW clause, then anonymous windows in ORDER BY.
  */
  uint m_def_pos;
  Item_string * const m_inherit_from;    ///< \<existing window name\>
  /**
    If true, m_name is an unbound window reference; the other fields are
    unused.
  */
  const bool m_is_reference;

  /**
    (At least) one window function needs to buffer frame rows for evaluation,
    i.e. it cannot be evaluated on the fly just from previous rows seen.
  */
  bool m_needs_frame_buffering;

  /**
    (At least) one window function needs the peer set of the current row to
    evaluate the window function for the current row.
  */
  bool m_needs_peerset;
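
  /*
    Illustrative examples of the two flags above (informal, not exhaustive):
    a frame that reaches past the current row, e.g.

      SUM(x) OVER (ORDER BY d ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING)

    cannot be computed from previously seen rows only, so it needs frame
    buffering (m_needs_frame_buffering). A function whose value depends on
    the current row's peers, e.g. CUME_DIST() OVER (ORDER BY d), needs the
    peer set (m_needs_peerset).
  */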
  /**
    The functions are optimizable with ROW unit. For example SUM is, MAX is
    not always optimizable. Optimized means we can use the optimized
    evaluation path in process_buffered_windowing_record which uses inversion
    to avoid revisiting all frame rows for every row being evaluated.
  */
  bool m_row_optimizable;

  /**
    The functions are optimizable with RANGE unit. For example SUM is, MAX is
    not always optimizable. Optimized means we can use the optimized
    evaluation path in process_buffered_windowing_record which uses inversion
    to avoid revisiting all frame rows for every row being evaluated.
  */
  bool m_range_optimizable;

  /**
    The aggregates can be evaluated once for a partition, since the frame is
    static, i.e. all rows will have the same value for the aggregates, e.g.
    ROWS/RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING.
  */
  bool m_static_aggregates;

  /**
    Window requires re-evaluation of the first row in optimized moving frame
    mode, e.g. FIRST_VALUE. See also m_dont_aggregate.
  */
  bool m_opt_first_row;

  /**
    Window requires re-evaluation of the last row in optimized moving frame
    mode, e.g. LAST_VALUE. See also m_dont_aggregate.
  */
  bool m_opt_last_row;

  /**
    The last window to be evaluated at execution time.
  */
  bool m_last;
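
  /*
    A worked illustration of the evaluation-strategy flags above (an informal
    sketch, not tied to any specific query): with

      SUM(x) OVER (PARTITION BY p)

    there is no ORDER BY and no frame, so the sum is the same for every row
    of a partition (m_static_aggregates). With

      SUM(x) OVER (ORDER BY d ROWS BETWEEN 2 PRECEDING AND CURRENT ROW)

    the frame slides one row at a time, so the optimized path can compute the
    sum for row N from the sum for row N-1 by adding the value entering the
    frame and subtracting (inverting) the value leaving it, instead of
    re-scanning all frame rows (m_row_optimizable). MAX cannot always be
    handled this way: when the current maximum leaves the frame, the
    remaining frame rows must be re-examined.
  */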
public:
  struct st_offset
  {
    int64 m_rowno;
    bool m_from_last;
    st_offset() : m_rowno(0), m_from_last(false) {}
    /*
      Used for sorting offsets in ascending order for faster traversal of the
      frame buffer tmp file
    */
    bool operator <(const st_offset &a) const
    {
      return m_rowno < a.m_rowno;
    }
  };

  struct st_ll_offset
  {
    int64 m_rowno; ///< negative values mean LEAD
    st_ll_offset() : m_rowno(INT_MIN64 /* uninitialized marker*/) {}
    /*
      Used for sorting offsets in ascending order for faster traversal of the
      frame buffer tmp file
    */
    bool operator <(const st_ll_offset &a) const
    {
      return m_rowno < a.m_rowno;
    }
  };

  struct st_nth
  {
    Mem_root_array_YY<st_offset> m_offsets; ///< sorted set of NTH_VALUE offsets
  };

  struct st_lead_lag
  {
    Mem_root_array_YY<st_ll_offset> m_offsets; ///< sorted set of LEAD/LAG offsets
  };

protected:
  /**
    Window requires re-evaluation of the Nth row in optimized moving frame
    mode, e.g. NTH_VALUE. See also m_dont_aggregate.
  */
  st_nth m_opt_nth_row;
  st_lead_lag m_opt_lead_lag;

protected:
  const Window *m_ancestor;            ///< resolved from existing window name
  List<Item_sum> m_functions;          ///< window functions based on 'this'
  List<Cached_item> m_partition_items; ///< items for the PARTITION BY columns
  List<Cached_item> m_order_by_items;  ///< items for the ORDER BY exprs.

  /*------------------------------------------------------------------------
   *
   * Execution state variables
   *
   *------------------------------------------------------------------------*/
public:
  /**
    Position hints for the frame buffer are saved for these kinds of row
    accesses, cf. #m_frame_buffer_positions.
  */
  enum retrieve_cached_row_reason
  {
    REA_FIRST_IN_PARTITION= 0,
    REA_CURRENT= 1,
    REA_FIRST_IN_FRAME= 2,
    REA_LAST_IN_FRAME= 3,
    REA_LAST_IN_PEERSET= 4,
    REA_MISC_POSITIONS= 5 // NTH_VALUE, LEAD/LAG have dynamic indexes 5..N
  };

  /**
    Cardinality of m_frame_buffer_positions if no NTH_VALUE, LEAD/LAG
  */
  static constexpr int FRAME_BUFFER_POSITIONS_CARD= REA_MISC_POSITIONS;

  /**
    Holds information about a position in the buffer frame as stored in a
    temporary file (cf. m_frame_buffer). We save a number of such positions
    to speed up lookup when we move the window frame, cf.
    m_frame_buffer_positions.
  */
  struct Frame_buffer_position
  {
    ///< The size of the file position is determined by handler::ref_length
    uchar *m_position;
    ///< Row number in partition, 0-based
    int64 m_rowno;
    Frame_buffer_position(uchar *position, int64 rowno)
      : m_position(position), m_rowno(rowno) {}
  };

  /**
    Execution state: used iff m_needs_frame_buffering. Holds a set of
    pointers to positions in the file in m_frame_buffer. We use these saved
    positions to avoid having to position to the first row in the partition
    and then making many read calls to find the desired row. By repositioning
    to a suitably(*) saved position we normally(**) need to do only one
    positioned read and one ha_rnd_next call to get at a desired row.

    @verbatim
      [0] the first row in the partition: at the beginning of a new partition
          this is the first row in the partition. Its rowno == 1 by
          definition.
      [1] position of the current row N (for jump-back to current row or next
          current row in combo with ha_rnd_next)
      [2] position of the current first row M in frame (for aggregation
          looping jump-back)
      [3] position of the current last row in a frame
      [4] position and line number of the row last read
      and optionally:
      [5..X]   positions of Nth row of X-5+1 NTH_VALUE functions invoked on
               window
      [X+1..Y] position of last row of lead/lag functions invoked on window
    @endverbatim

    Pointers are lazily initialized if needed.

    (*) We use the position closest below the desired position, cf. logic in
    read_frame_buffer_row.

    (**) Unless we have a frame beyond the current row, in which case we need
    to do some scanning for the first row in the partition. Also NTH_VALUE
    with RANGE might sometimes need to read several rows, since the frame
    start can jump several rows ahead when the current row moves forward.
  */
  Mem_root_array_YY<Frame_buffer_position> m_frame_buffer_positions;

  /**
    Sometimes we read one row too many, so that the saved position will be
    too far out because we subsequently need to read an earlier (previous)
    row of the same kind (reason). For such cases, we first save the current
    position, read, and if we see we read too far, restore the old position.
    See #save_pos and #restore_pos.
  */
  Frame_buffer_position m_tmp_pos;

  /** See #m_tmp_pos */
  void save_pos(retrieve_cached_row_reason reason)
  {
    m_tmp_pos.m_rowno= m_frame_buffer_positions[reason].m_rowno;
    std::memcpy(m_tmp_pos.m_position,
                m_frame_buffer_positions[reason].m_position,
                frame_buffer()->file->ref_length);
  }

  /** See #m_tmp_pos */
  void restore_pos(retrieve_cached_row_reason reason)
  {
    m_frame_buffer_positions[reason].m_rowno= m_tmp_pos.m_rowno;
    std::memcpy(m_frame_buffer_positions[reason].m_position,
                m_tmp_pos.m_position,
                frame_buffer()->file->ref_length);
  }

  /**
    Keys for m_frame_buffer_cache and m_special_rows_cache, for special rows.
  */
  enum Special_keys
  {
    /**
      We read an incoming row. We notice it is the start of a new partition.
      We must thus process the just-finished partition, but that processing
      uses this row's buffer; so, we save this row first, process the
      partition, and restore it later.
    */
    FBC_FIRST_IN_NEXT_PARTITION= -1,
    /// An already prepared output row whose result we want to reuse.
    FBC_FIRST_IN_STATIC_RANGE= -2,
    /// An already prepared output row whose result we want to reuse.
    FBC_LAST_RESULT_OPTIMIZED_RANGE= -3,
    // Insert new values here.
    // And keep the ones below up to date.
    FBC_FIRST_KEY= FBC_FIRST_IN_NEXT_PARTITION,
    FBC_LAST_KEY= FBC_LAST_RESULT_OPTIMIZED_RANGE,
  };
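
  /*
    A sketch of how FBC_FIRST_IN_NEXT_PARTITION is typically used (informal;
    cf. save_special_record/restore_special_record below):

      // read a new input row into the table's record buffer
      if (at the start of a new partition)
      {
        w.save_special_record(FBC_FIRST_IN_NEXT_PARTITION, table);
        // ... process the buffered rows of the partition just finished,
        //     which reuses the record buffer ...
        w.restore_special_record(FBC_FIRST_IN_NEXT_PARTITION,
                                 table->record[0]);
        // continue with the restored row as row 1 of the new partition
      }
  */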
protected:
  /**
    Execution state: used iff m_needs_frame_buffering. Holds the temporary
    file (used for the frame buffering) parameters.
  */
  Temp_table_param *m_frame_buffer_param;

  /**
    Execution state: Holds the temporary output table (for next step)
    parameters.
  */
  Temp_table_param *m_outtable_param;

  /**
    Execution state: used iff m_needs_frame_buffering. Holds the TABLE object
    for the temporary file used for the frame buffering.
  */
  TABLE *m_frame_buffer;

  /**
    Execution state: The frame buffer tmp file is not truncated for each new
    partition. We need to keep track of where a partition starts in case we
    need to switch from heap to InnoDB tmp file on overflow, in order to
    re-initialize m_frame_buffer_positions with the current partition's row 1
    (which is the minimum hint required) as we cross over. This number is
    incremented for each write.
  */
  int64 m_frame_buffer_total_rows;

  /**
    Execution state: Snapshot of m_frame_buffer_total_rows when we start a
    new partition, i.e. for the first row in the first partition we will have
    a value of 1.
  */
  int64 m_frame_buffer_partition_offset;

  // Enable for more checks. Not for Valgrind as not freed.
  // @todo: Remove entirely.
#ifdef WF_DEBUG
  /**
    Execution state: used iff m_needs_frame_buffering. Holds the in-memory
    alternative frame buffer used in the prototype prior to using a tmp file.
    After we started using a real tmp file for the frame buffer, we retained
    this cache in debug mode for asserting that the tmp file returns the
    correct values, and in any case for two other records we never write to
    the tmp file (cf. the Special_keys enumeration):

    - the first row in the next partition: we had to read that row in order
      to know when a full partition has been read when we buffer partition
      rows, so we need somewhere to store it, and we chose not to store it in
      the frame buffer tmp file since it doesn't logically belong there and
      its layout may be different depending on how we choose to store the
      frame buffer records.

    - the first record as fully evaluated in a partition with no frames and
      no ORDER BY: in such cases the aggregate window functions (SUM, AVG..)
      will have the same value for all rows, so we skip evaluating them for
      rows 2..N where N is the last row in the partition.
  */
  std::unordered_map<uint64, std::vector<uchar>> m_frame_buffer_cache;
#endif // WF_DEBUG

  /**
    Holds a fixed number of copies of special rows; each copy can use up to
    #m_special_rows_cache_max_length bytes. Cf. the Special_keys enumeration.
  */
  uchar *m_special_rows_cache;
  /// Length of each copy in #m_special_rows_cache, in bytes
  size_t m_special_rows_cache_length[FBC_FIRST_KEY - FBC_LAST_KEY + 1];
  /// Maximum allocated size in #m_special_rows_cache
  size_t m_special_rows_cache_max_length;

  /**
    Execution state: used iff m_needs_frame_buffering. Holds the row number
    of the last row (hitherto) saved in the frame buffer.
  */
  int64 m_last_rowno_in_cache;

  /**
    Execution state: used iff m_needs_peerset. Holds the rowno for the last
    row in this peer set.
  */
  int64 m_last_rowno_in_peerset;

  /**
    Execution state: the current row number in the current partition.
    Set in check_partition_boundary. Used while reading input rows, in
    contrast to m_rowno_in_partition, which is used when processing buffered
    rows. Cf. check_partition_boundary.
  */
  int64 m_part_row_number;

  /**
    Execution state: the current row starts a new partition.
    Set in check_partition_boundary.
  */
  bool m_partition_border;
  /**
    Execution state: The number of the last output row, i.e. the row number
    of the last row hitherto evaluated and output to the next phase for the
    current partition.
  */
  int64 m_last_row_output;

  /**
    Execution state: The number of the row being visited for its contribution
    to a window function, relative to the start of the partition. Note that
    this will often be different from the current row for which we are
    processing the window function, reading it for output. That is given by
    m_rowno_in_partition, q.v.
  */
  int64 m_rowno_being_visited;

  /**
    Execution state: the row number of the current row within a frame, cf.
    m_is_last_row_in_frame, relative to the start of the frame. 1-based.
  */
  int64 m_rowno_in_frame;

  /**
    Execution state: The row number of the current row being readied for
    output within the partition. 1-based.
  */
  int64 m_rowno_in_partition;

  /**
    Execution state: number of rows in frame. Only calculated if we need it;
    if not, 0. For example, NTH_VALUE(e, n) FROM LAST needs it.
    @note FROM LAST is not yet supported.
  */
  // int64 m_frame_cardinality;

  /**
    Execution state: for optimizable aggregates, cf. m_row_optimizable and
    m_range_optimizable, we need to keep track of when we have computed the
    first aggregate, since aggregates for rows 2..N are computed in an
    optimized way by inverse aggregation of the row moving out of the frame.
  */
  bool m_aggregates_primed;

  /*------------------------------------------------------------------------
   *
   * RANGE boundary frame state variables.
   *
   *------------------------------------------------------------------------*/
public:
  /**
    RANGE bound determination computation; the first index is a value of the
    enum_window_border_type enum; the second index is 0 for the start bound,
    1 for the end bound. Each item is an Item_func_lt/gt.
  */
  Item_func *m_comparators[WBT_VALUE_FOLLOWING+1][2];

protected:
  /**
    Execution state: the row number of the first row in a frame when
    evaluating RANGE based frame bounds.

    When using RANGE bounds, we don't know a priori when moving the frame
    which row number will be the next lower bound, but we know it will have
    to be a row number higher than the lower bound of the previous frame,
    since the bounds increase monotonically as long as the frame bounds are
    static within the query (current limitation). So, it makes sense to
    remember the first row number in a frame until we have determined the
    start of the next frame.

    If the frame for the previous current row in the partition was empty (cf.
    "current_row" in process_buffered_windowing_record), this should point to
    the next possible frame start. Relative to partition start, 1-based.
  */
  int64 m_first_rowno_in_range_frame;

  /**
    Execution state: used for RANGE frame bounds evaluation, for the
    continued evaluation when the current row > 2 in a partition. If the
    frame for the current row visited (cf. "current_row" in
    process_buffered_windowing_record) was empty, the invariant

      m_last_rowno_in_range_frame < m_first_rowno_in_range_frame

    should hold after the visit. Relative to partition start. 1-based.
  */
  int64 m_last_rowno_in_range_frame;
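
  /*
    For illustration (an informal example of what the RANGE comparators
    decide, not a description of their exact item tree shape): with

      SUM(x) OVER (ORDER BY d RANGE BETWEEN 2 PRECEDING AND 2 FOLLOWING)

    a buffered row belongs to the current row's frame when its d value lies
    in [current.d - 2, current.d + 2]. The start-bound comparator answers
    "is this row before the frame?" and the end-bound comparator "is it after
    the frame?", cf. before_frame()/after_frame() and
    setup_range_expressions().
  */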
  /*------------------------------------------------------------------------
   *
   * Window function special behaviour toggles. These boolean flags influence
   * the action taken when a window function is evaluated, i.e.
   *
   *     Item_xxx::val_xxx
   *
   * They are set locally just before and after a call to evaluate functions,
   * i.e.
   *
   *     w.set_<toggle>(true)
   *     copy_<kind>_window_functions()  [see process_buffered_windowing_record]
   *     w.set_<toggle>(false)
   *
   * to achieve a special semantic, since we cannot pass down extra
   * parameters.
   *
   *------------------------------------------------------------------------*/

  /**
    Execution state: the current row is the last row in a window frame. For
    some aggregate functions, e.g. AVG, we can save computation by not
    evaluating the entire function value before the last row has been read.
    For AVG, do the summation for each row in the frame, but the division
    only for the last row, at which time the result is needed for the window
    function. Probably only useful for ROW based or static frames.
    For frames with peer set computation, determining the last row in the
    peer set ahead of processing is not possible, so we use a pessimistic
    assumption.
  */
  bool m_is_last_row_in_frame;

  /**
    Execution state: make framing window functions produce a NULL (or 0
    depending, e.g. if COUNT) value because no rows are available for
    aggregation: e.g. for the first row in a partition, a frame of
    ROWS BETWEEN 2 PRECEDING AND 1 PRECEDING has no rows for which
    aggregation can happen.
  */
  bool m_do_copy_null;

  /**
    Execution state: do inverse, e.g. subtract rather than add in aggregates.
    Used for optimizing computation of sliding frames for eligible
    aggregates, cf. Item_sum::check_wf_semantics.
  */
  bool m_inverse_aggregation;

  /**
    Execution state: process a selected row conditionally, if the visited row
    is either first, last, or nth, by consulting m_rowno_in_frame and
    m_is_last_row_in_frame. For other framing window functions, skip
    evaluation, just reuse the current result. E.g. after inversion of row
    N-1, we do not want to aggregate again at N, since that value is already
    in the "sum"; but we do want to visit it for setting FIRST_VALUE (or
    MIN/MAX optimized). Nor do we want to aggregate again if we visit the nth
    row in a frame; again, it is already included.
  */
  bool m_dont_aggregate;

  /*------------------------------------------------------------------------
   *
   * Constructors
   *
   *------------------------------------------------------------------------*/
private:
  /**
    Generic window constructor, shared
  */
  Window(Item_string *name,
         PT_order_list *part,
         PT_order_list *ord,
         PT_frame *frame,
         bool is_reference,
         Item_string *inherit)
    : m_select(nullptr),
      m_partition_by(part),
      m_order_by(ord),
      m_sorting_order(nullptr),
      m_sort_redundant(false),
      m_frame(frame),
      m_name(name),
      m_def_pos(0),
      m_inherit_from(inherit),
      m_is_reference(is_reference),
      m_needs_frame_buffering(false),
      m_needs_peerset(false),
      m_row_optimizable(true),
      m_range_optimizable(true),
      m_static_aggregates(false),
      m_opt_first_row(false),
      m_opt_last_row(false),
      m_last(false),
      m_ancestor(nullptr),
      m_tmp_pos(nullptr, -1),
      m_frame_buffer_param(nullptr),
      m_outtable_param(nullptr),
      m_frame_buffer(nullptr),
      m_frame_buffer_total_rows(0),
      m_frame_buffer_partition_offset(0),
      m_special_rows_cache_max_length(0),
      m_last_rowno_in_cache(0),
      m_last_rowno_in_peerset(0),
      m_part_row_number(0),
      m_partition_border(true),
      m_last_row_output(0),
      m_rowno_being_visited(0),
      m_rowno_in_frame(0),
      m_rowno_in_partition(0),
      // m_frame_cardinality(0),
      m_aggregates_primed(false),
      m_first_rowno_in_range_frame(1),
      m_last_rowno_in_range_frame(0),
      m_is_last_row_in_frame(false),
      m_do_copy_null(false),
      m_inverse_aggregation(false),
      m_dont_aggregate(false)
  {
    m_opt_nth_row.m_offsets.init_empty_const();
    m_opt_lead_lag.m_offsets.init_empty_const();
    m_frame_buffer_positions.init_empty_const();
  }

public:
  /**
    Reference to a named window. This kind is only used before resolution,
    references to it being replaced by the referenced window object
    thereafter.
  */
  Window(Item_string *name)
    : Window(name, nullptr, nullptr, nullptr, true, nullptr)
  {}
  /**
    Unnamed window. If the window turns out to be named, the name will be set
    later, cf. #set_name().
  */
  Window(PT_order_list *partition_by, PT_order_list *order_by,
         PT_frame *frame)
    : Window(nullptr, partition_by, order_by, frame, false, nullptr)
  {}

  /**
    Unnamed window based on a named window. If the window turns out to be
    named, the name will be set later, cf. #set_name().
  */
  Window(PT_order_list *partition_by, PT_order_list *order_by,
         PT_frame *frame, Item_string *inherit)
    : Window(nullptr, partition_by, order_by, frame, false, inherit)
  {}

  /*------------------------------------------------------------------------
   *
   * Methods
   *
   *------------------------------------------------------------------------*/

  /**
    We have a named window. Now set its name. Used once, if at all, for a
    window as part of parsing.
  */
  void set_name(Item_string *name) { m_name= name; }

  /**
    After resolving an existing window name reference in a window definition,
    we set the ancestor pointer for easy access later.
  */
  void set_ancestor(Window *a) { m_ancestor= a; }

  /**
    Get the name of a window. Can be empty, cf. #printable_name which is not.
  */
  Item_string *name() const { return m_name; }

  uint def_pos() const { return m_def_pos; }     ///< @see #m_def_pos
  void set_def_pos(uint pos) { m_def_pos= pos; } ///< @see #m_def_pos

  /**
    Get the frame, if any. SQL 2011 7.11 GR 1.b.i.6
  */
  const PT_frame *frame() const { return m_frame; }

  /**
    Get the ORDER BY, if any. That is, the first we find along the ancestor
    chain. Uniqueness checked in #setup_windows.
    SQL 2011 7.11 GR 1.b.i.5.A-C
  */
  const PT_order_list *order() const
  {
    const PT_order_list *o= m_order_by;
    const Window *w= m_ancestor;

    while (o == nullptr && w != nullptr)
    {
      o= w->m_order_by;
      w= w->m_ancestor;
    }
    return o;
  }

  /**
    Get the partition, if any. That is, the partition, if any, of the root
    window. SQL 2011 7.11 GR 1.b.i.4.A-C
  */
  const PT_order_list *partition() const
  {
    const PT_order_list *p= m_partition_by;
    const Window *w= m_ancestor;

    while (w != nullptr)
    {
      if (w->m_ancestor == nullptr)
      {
        /* root */
        p= w->m_partition_by;
      }
      else
      {
        /* See #setup_windows for checking */
        DBUG_ASSERT(w->m_partition_by == nullptr);
      }
      w= w->m_ancestor;
    }
    return p;
  }

  /**
    Get the list of functions invoked on this window.
  */
  List<Item_sum> &functions() { return m_functions; }

  /**
    Concatenation of columns in PARTITION BY and ORDER BY. Columns present in
    both lists (redundancies) are eliminated, while making sure the order of
    columns in the ORDER BY is maintained in the merged list.
  */
  ORDER *sorting_order(THD *thd);

  /**
    Return true if ordering is redundant for this window.
  */
  bool sort_redundant() const { return m_sort_redundant; }

  /**
    Check that the semantic requirements for window functions over this
    window are fulfilled, and accumulate evaluation requirements.
  */
  bool check_window_functions(THD *thd, SELECT_LEX *select);

  /**
    For RANGE frames we need to do computations involving add/subtract and
    less than / greater than. To make this work across types, we construct
    item trees to do the computations, so we can reuse all the special case
    handling, e.g. for signed/unsigned int wrap-around, overflow etc.
  */
  bool setup_range_expressions(THD *thd);

  /**
    Return true if this window represents an unresolved window reference seen
    in a window function OVER clause.
  */
  bool is_reference() const { return m_is_reference; }
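
  /*
    Example of how the ancestor chain is consulted (illustrative SQL only):

      SELECT SUM(x) OVER w2 FROM t
      WINDOW w1 AS (PARTITION BY p ORDER BY a),
             w2 AS (w1);

    Here w2 inherits from w1, so after resolution w2's m_ancestor points to
    w1; order() and partition() for w2 walk that chain and return w1's
    ORDER BY and PARTITION BY lists.
  */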
  /**
    Check if the just read input row marks the start of a new partition.
    Sets the member variables m_partition_border and m_part_row_number.
  */
  void check_partition_boundary();

  /**
    Reset the current row's ORDER BY expressions when starting a new peer
    set.
  */
  void reset_order_by_peer_set();

  /**
    Determine if the current row is not in the same peer set as the previous
    row. Used for RANGE frames and the implicit RANGE frame (the latter is
    used by aggregates in the presence of ORDER BY).

    The current row is in the same peer set if all ORDER BY columns have the
    same value as in the previous row.

    @return true if current row is in a new peer set
  */
  bool in_new_order_by_peer_set();

  /**
    While processing buffered rows in RANGE frame mode we determine if the
    present row revisited from the buffer is before the row being processed,
    i.e. the current row.

    @return true if the present row is before the RANGE, i.e. not to be
            included
  */
  bool before_frame() { return before_or_after_frame(true); }
  ///< See before_frame()
  bool after_frame() { return before_or_after_frame(false); }

  /**
    Check if we have read all the rows in a partition, possibly having
    buffered them for further processing.

    @returns true if this is the case
  */
  bool at_partition_border() const { return m_partition_border; }

  void save_special_record(uint64 special_rowno, TABLE *t);
  void restore_special_record(uint64 special_rowno, uchar *record);

  /**
    Resolve any named window to its definition and update m_window to point
    to the definition instead.
  */
  static bool resolve_reference(THD *thd, Item_sum *wf, PT_window **m_window);

  /**
    Semantic checking of windows.

    * Process any window inheritance, that is, a window which in its
      specification refers to another named window.

      Rules:
      1) There should be no loops.
      2) The inheriting window can not specify partitioning.
      3) The inheriting window can not specify an ORDER BY if one is already
         specified by an ancestor.
      4) An ancestor can not specify a window framing clause.

      Cf. SQL 2011 7.11 window clause SR 10-11.

    * Check the requirements the window functions using the window make of
      it, and make a note of those so we know at execution time, for example,
      whether we need to buffer rows to process the window functions, whether
      inversion optimization will be used for moving frames etc.

    * Prepare the physical ordering lists used by sorting at execution time.

    * Set up cached items for partition determination and for range/peer
      determination based on ORDER BY columns.

    * Check any frame semantics and, for RANGE frames, set up bounds
      computation item trees.

    @param thd              The session's execution thread
    @param select           The select for which we are doing windowing
    @param ref_item_array   The base ref items
    @param tables           The list of tables involved
    @param fields           The list of selected fields
    @param all_fields       The list of all fields, including hidden ones
    @param windows          The list of windows defined for this select

    @return false if success, true if error
  */
  static bool setup_windows(THD* thd, SELECT_LEX *select,
                            Ref_item_array ref_item_array,
                            TABLE_LIST *tables,
                            List<Item> &fields, List<Item> &all_fields,
                            List<Window> &windows);
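
  /*
    Illustration of the inheritance rules above (informal example window
    clauses; the exact diagnostics are issued by setup_windows):

      WINDOW w1 AS (PARTITION BY p), w2 AS (w1 ORDER BY a)     -- OK
      WINDOW w1 AS (PARTITION BY p), w2 AS (w1 PARTITION BY q) -- rule 2: error
      WINDOW w1 AS (ORDER BY a),     w2 AS (w1 ORDER BY b)     -- rule 3: error
      WINDOW w1 AS (w2), w2 AS (w1)                            -- rule 1: error
  */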
  /**
    Resolve and set up the PARTITION BY or an ORDER BY list of a window.

    @param thd              The session's execution thread
    @param select           The select for which we are doing windowing
    @param ref_item_array
    @param tables           The list of tables involved
    @param fields           The list of selected fields
    @param all_fields       The list of all fields, including hidden ones
    @param o                A list of order by expressions
    @param partition_order  If true, o represents a windowing PARTITION BY,
                            else it represents a windowing ORDER BY

    @returns false if success, true if error
  */
  bool resolve_window_ordering(THD *thd, SELECT_LEX *select,
                               Ref_item_array ref_item_array,
                               TABLE_LIST *tables,
                               List<Item> &fields, List<Item> &all_fields,
                               ORDER *o, bool partition_order);

  /**
    Return true if this window's name is not unique in windows.
  */
  bool check_unique_name(List<Window> &windows);

  /**
    Set up cached items for a partition or an ORDER BY list, updating
    m_partition_items or m_order_by_items respectively.

    @param thd              The session's execution thread
    @param select           The select for which we are doing windowing
    @param o                The list of ordering expressions
    @param partition_order  If true, o represents a partition order list,
                            else an ORDER BY list.

    @returns false if success, true if error
  */
  bool setup_ordering_cached_items(THD *thd, SELECT_LEX *select,
                                   const PT_order_list *o,
                                   bool partition_order);

  /**
    Determine if the window has a partitioning clause or an ORDER BY clause
    (or both).

    @return true if we have such a clause, which means we need to sort the
            input table before evaluating the window functions
  */
  bool needs_sorting();

  /**
    If we cannot compute the window function without looking at succeeding
    rows, return true, else false.
  */
  bool needs_buffering() const { return m_needs_frame_buffering; }

  /**
    If we cannot compute the window function without looking at all the rows
    in the peer set of the current row, return true, else false.
  */
  bool needs_peerset() const { return m_needs_peerset; }

  /**
    Return true if the set of window functions are all ROW unit optimizable.
    Only relevant if m_needs_frame_buffering and m_row_optimizable are true.
  */
  bool optimizable_row_aggregates() const { return m_row_optimizable; }

  /**
    Return true if the set of window functions are all RANGE unit
    optimizable. Only relevant if m_needs_frame_buffering and
    m_range_optimizable are true.
  */
  bool optimizable_range_aggregates() const { return m_range_optimizable; }

  /**
    Return true if the aggregates are static, i.e. the same aggregate values
    for all rows in the partition. Only relevant if m_needs_frame_buffering
    is true.
  */
  bool static_aggregates() const { return m_static_aggregates; }

  /** See #m_opt_first_row */
  bool opt_first_row() const { return m_opt_first_row; }

  /** See #m_opt_last_row */
  bool opt_last_row() const { return m_opt_last_row; }

  /** See #m_last */
  bool is_last() const { return m_last; }

  /** See #m_opt_nth_row */
  const st_nth &opt_nth_row() const { return m_opt_nth_row; }

  /** See #m_opt_lead_lag */
  const st_lead_lag &opt_lead_lag() const { return m_opt_lead_lag; }

  /** Getter for m_frame_buffer_param, q.v. */
  Temp_table_param *frame_buffer_param() { return m_frame_buffer_param; }

  /** Setter for m_frame_buffer_param, q.v. */
  void set_frame_buffer_param(Temp_table_param *p) { m_frame_buffer_param= p; }

  /** Getter for m_frame_buffer, q.v. */
  TABLE *frame_buffer() { return m_frame_buffer; }

  /** Setter for m_frame_buffer, q.v. */
  void set_frame_buffer(TABLE *tab) { m_frame_buffer= tab; }

  /** Getter for m_outtable_param, q.v. */
  Temp_table_param *outtable_param() { return m_outtable_param; }
  /** Setter for m_outtable_param, q.v. */
  void set_outtable_param(Temp_table_param *p) { m_outtable_param= p; }

  /**
    Getter for m_part_row_number, q.v., the current row number within the
    partition.
  */
  int64 partition_rowno() const { return m_part_row_number; }

  /**
    If there is no explicit frame and an ORDER BY exists, aggregate window
    functions have an upper frame bound determined by the last peer of the
    current row, i.e. a dynamic distance above the current row. This is
    similar to the case of a RANGE upper bound. In general, we don't
    currently support dynamic (relative to current row) bounds based on the
    value of the current row.
  */
  bool has_dynamic_frame_upper_bound() const;

#ifdef WF_DEBUG
  /**
    If a window needs buffering for its rows before we can evaluate some
    window function referencing that window, this map contains the buffered
    rows.
    @returns reference to the map containing the buffered rows
  */
  std::unordered_map<uint64, std::vector<uchar>> &frame_buffer_cache()
  { return m_frame_buffer_cache; }
#endif // WF_DEBUG

  /**
    Allocate the cache for special rows.
    @param thd      thread handle
    @param out_tbl  The table where this window function's value is written to
    @returns true if error.
  */
  bool make_special_rows_cache(THD *thd, TABLE *out_tbl);

  /** See #m_last_row_output */
  int64 last_row_output() { return m_last_row_output; }

  /** See #m_last_row_output */
  void set_last_row_output(int64 rno) { m_last_row_output= rno; }

  /** See #m_rowno_being_visited */
  int64 rowno_being_visited() { return m_rowno_being_visited; }

  /** See #m_rowno_being_visited */
  void set_rowno_being_visited(int64 rno) { m_rowno_being_visited= rno; }

  /** See #m_last_rowno_in_cache */
  int64 last_rowno_in_cache() { return m_last_rowno_in_cache; }

  /** See #m_last_rowno_in_cache */
  void set_last_rowno_in_cache(uint64 rno) { m_last_rowno_in_cache= rno; }

  /** See #m_last_rowno_in_range_frame */
  int64 last_rowno_in_range_frame() { return m_last_rowno_in_range_frame; }

  /** See #m_last_rowno_in_range_frame */
  void set_last_rowno_in_range_frame(uint64 rno)
  { m_last_rowno_in_range_frame= rno; }

  /** See #m_last_rowno_in_peerset */
  int64 last_rowno_in_peerset() { return m_last_rowno_in_peerset; }

  /** See #m_last_rowno_in_peerset */
  void set_last_rowno_in_peerset(uint64 rno) { m_last_rowno_in_peerset= rno; }

  /** See #m_do_copy_null */
  bool do_copy_null() { return m_do_copy_null; }

  /** See #m_do_copy_null */
  void set_do_copy_null(bool b) { m_do_copy_null= b; }

  /** See #m_inverse_aggregation */
  bool do_inverse() { return m_inverse_aggregation; }

  /** See #m_inverse_aggregation */
  Window &set_inverse(bool b) { m_inverse_aggregation= b; return *this; }

  /** See #m_aggregates_primed */
  bool aggregates_primed() { return m_aggregates_primed; }

  /** See #m_aggregates_primed */
  void set_aggregates_primed(bool b) { m_aggregates_primed= b; }

  /** See #m_dont_aggregate */
  bool dont_aggregate() { return m_dont_aggregate; }

  /** See #m_dont_aggregate */
  Window &set_dont_aggregate(bool b) { m_dont_aggregate= b; return *this; }

  /** See #m_is_last_row_in_frame */
  bool is_last_row_in_frame();

  /** See #m_is_last_row_in_frame */
  void set_is_last_row_in_frame(bool b) { m_is_last_row_in_frame= b; }

  /**
    Return the size of the frame in number of rows.
    @returns frame size
  */
  // int64 frame_cardinality();

  /** See #m_rowno_in_frame */
  int64 rowno_in_frame() { return m_rowno_in_frame; }

  /** See #m_rowno_in_frame */
  Window &set_rowno_in_frame(int64 rowno)
  { m_rowno_in_frame= rowno; return *this; }

  /** See #m_rowno_in_partition */
  int64 rowno_in_partition() { return m_rowno_in_partition; }
  /** See #m_rowno_in_partition */
  void set_rowno_in_partition(int64 rowno) { m_rowno_in_partition= rowno; }

  /** See #m_first_rowno_in_range_frame */
  void set_first_rowno_in_range_frame(int64 rowno)
  { m_first_rowno_in_range_frame= rowno; }

  /** See #m_first_rowno_in_range_frame */
  int64 first_rowno_in_range_frame() { return m_first_rowno_in_range_frame; }

  /** See #m_frame_buffer_total_rows */
  void set_frame_buffer_total_rows(int64 rows)
  { m_frame_buffer_total_rows= rows; }

  /** See #m_frame_buffer_total_rows */
  int64 frame_buffer_total_rows() { return m_frame_buffer_total_rows; }

  /** See #m_frame_buffer_partition_offset */
  void set_frame_buffer_partition_offset(int64 offset)
  { m_frame_buffer_partition_offset= offset; }

  /** See #m_frame_buffer_partition_offset */
  int64 frame_buffer_partition_offset()
  { return m_frame_buffer_partition_offset; }

  /**
    Set the current row number for diagnostics. This should be the absolute
    row number across all partitions in the windowing step, i.e. after
    ordering the rows if applicable.

    @param da                  Diagnostics_area to store row number
    @param rowno_in_partition  The logical row number within the partition
  */
  void set_diagnostics_rowno(Diagnostics_area *da, int64 rowno_in_partition)
  {
    da->set_current_row_for_condition(rowno_in_partition +
                                      m_frame_buffer_partition_offset - 1);
  }

  /**
    Determine if we need to read the entire partition before we can evaluate
    some window function(s) on this window.

    @returns true if that is the case, else false
  */
  bool has_two_pass_wf();

  /**
    Free up any resource used to process the window functions of this window,
    e.g. temporary files and in-memory data structures. Called when done with
    all window processing steps from SELECT_LEX::cleanup.
    FIXME: release earlier?
  */
  void cleanup(THD *thd);

  /**
    Reset window state for a new partition.

    Reset the temporary storage used for window frames, typically when we
    find a new partition. The rows in the buffer are then no longer needed.
  */
  void reset_partition_state() { reset_execution_state(RL_PARTITION); }

  /**
    Reset execution state for the next call to JOIN::exec, cf. JOIN::reset.
  */
  void reset_round() { reset_execution_state(RL_ROUND); }

  /**
    Reset resolution and execution state to prepare for the next execution of
    a prepared statement.
  */
  void reinit_before_use() { reset_execution_state(RL_FULL); }

  /**
    Reset execution state for LEAD/LAG for the current row in partition.
  */
  void reset_lead_lag();

  enum Reset_level
  {
    RL_FULL,
    RL_ROUND,
    RL_PARTITION
  };

private:
  /// Common function for all types of resetting
  void reset_execution_state(Reset_level level);

public:
  /**
    Reset the execution state for all window functions defined on this
    window.
  */
  void reset_all_wf_state();

  enum Wf_type
  {
    WT_FRAMING=1,
    WT_TWO_PASS=2,
    WT_STREAMING=4
  };

  /**
    Collects evaluation requirements from a window function, used by
    Item_sum::check_wf_semantics and its overrides.
  */
  struct Evaluation_requirements
  {
    /**
      Set to true if the window function requires row buffering
    */
    bool needs_buffer;
    /**
      Set to true if we need the peer set for evaluation (CUME_DIST)
    */
    bool needs_peerset;
    /**
      Set to true if we need FIRST_VALUE or optimized MIN/MAX
    */
    bool opt_first_row;
    /**
      Set to true if we need LAST_VALUE or optimized MIN/MAX
    */
    bool opt_last_row;
    st_offset opt_nth_row;   ///< Used if we have NTH_VALUE
    st_ll_offset opt_ll_row; ///< Used if we have LEAD or LAG
    /**
      Set to true if we can compute a sliding window by a combination of
      undoing the contribution of rows going out of the frame and adding the
      contribution of rows coming into the frame. For example, SUM and AVG
      allow this, but MAX/MIN do not.
      Only applicable if the frame has ROW bounds unit.
    */
    bool row_optimizable;
    /**
      Similar to row_optimizable but for RANGE frame bounds unit
    */
    bool range_optimizable;

    Evaluation_requirements()
      : needs_buffer(false), needs_peerset(false),
        opt_first_row(false), opt_last_row(false),
        row_optimizable(true), range_optimizable(true)
    {}
  };

  const char *printable_name() const
  {
    return (m_name != nullptr ? m_name->str_value.ptr() :
            "<unnamed window>");
  }

  void print(THD *thd, SELECT_LEX *lex, String *str, enum_query_type qt,
             bool expand_definition) const;

  bool has_windowing_steps() const;

private:
  /**
    Common implementation of before_frame() and after_frame().
    @param  before  True if 'before' is wanted; false if 'after' is.
  */
  bool before_or_after_frame(bool before);

  void print_frame(String *str, enum_query_type qt) const;
  void print_border(String *str, PT_border *b, enum_query_type qt) const;

  /**
    Reorder windows and eliminate redundant ordering. If a window has the
    same ordering requirements as another, we will move them next to each
    other in the evaluation sequence, so we can sort only once, i.e. before
    the first window step. This allows us to fulfill the guarantee given by
    the SQL standard when it comes to repeatability of non-deterministic
    (partially ordered) result sets for windowing inside a query, cf.
    #equal_sort. If more than two windows have the same ordering, the same
    applies: we only sort before the first (sort equivalent) window.

    If the result set is implicitly grouped, we also skip any sorting for
    windows.
  */
  static void reorder_and_eliminate_sorts(THD *thd,
                                          List<Window> &windows,
                                          SELECT_LEX &select);

  /**
    Return true if the physical[1] sort orderings for the two windows are the
    same and all ordering expressions are fields (columns), cf. the guarantee
    of SQL 2014 4.15.15 Windowed tables, bullet two: The windowing functions
    are computed using the same row ordering if they specify the same
    ordering.

    Collation and null handling is not supported, so moot. The two other
    bullet points are also covered by this test.

    [1] After concatenating effective PARTITION BY and ORDER BY (including
    inheritance) expressions.
  */
  static bool equal_sort(THD *thd, Window *w1, Window *w2);

  /**
    Check that a frame border is constant during execution and that it does
    not contain subqueries (relevant for INTERVAL only): implementation
    limitation.

    @param thd     Session thread
    @param border  The border to check

    @return false if OK, true if error
  */
  bool check_constant_bound(THD *thd, PT_border *border);

  /**
    Check that frame borders are sane, e.g. that they are not negative.

    @param thd      Session thread
    @param w        The window whose frame we are checking
    @param f        The frame to check, if any
    @param prepare  false at EXECUTE ps prepare time, else true

    @returns true if error
  */
  static bool check_border_sanity(THD *thd, Window *w, const PT_frame *f,
                                  bool prepare);
};

#endif /* WINDOWS_INCLUDED */