sql_utils/public/parse_location.cc (189 lines of code) (raw):

/*
 * Copyright 2023 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "sql_utils/public/parse_location.h"

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <optional>
#include <string>
#include <utility>

#include "sql_utils/base/logging.h"
#include "sql_utils/proto/internal_error_location.pb.h"
#include <cstdint>
#include "absl/status/statusor.h"
#include "unicode/umachine.h"
#include "unicode/utf8.h"
#include "sql_utils/base/mathutil.h"
#include "sql_utils/base/ret_check.h"
#include "sql_utils/base/status_builder.h"

namespace bigquery_ml_utils {

// Number of columns between consecutive tab stops.
const int kTabWidth = 8;

// Converts this point into an InternalErrorLocation proto. The filename is
// only set when it is non-empty; the byte offset is always set.
InternalErrorLocation ParseLocationPoint::ToInternalErrorLocation() const {
  InternalErrorLocation error_location;
  if (!filename_.empty()) {
    error_location.set_filename(std::string(filename_));
  }
  error_location.set_byte_offset(byte_offset_);
  return error_location;
}

// Builds a ParseLocationPoint from the filename and byte offset stored in
// <info>.
ParseLocationPoint ParseLocationPoint::FromInternalErrorLocation(
    const InternalErrorLocation& info) {
  return ParseLocationPoint::FromByteOffset(info.filename(),
                                            info.byte_offset());
}

ParseLocationTranslator::ParseLocationTranslator(absl::string_view input)
    : input_(input) {}

// Lazily fills <line_offsets_> with the byte offset at which each line of
// <input_> starts. "\n", "\r" and "\r\n" each count as a single line
// terminator. Does nothing if the offsets have already been computed.
void ParseLocationTranslator::CalculateLineOffsets() const {
  if (!line_offsets_.empty()) {
    return;  // Already computed.
  }
  line_offsets_.push_back(0);  // Line 1 starts at offset 0.

  const int input_size = static_cast<int>(input_.size());
  int offset = 0;
  while (offset < input_size) {
    switch (input_[offset]) {
      case '\n':
        line_offsets_.push_back(++offset);
        break;
      case '\r':
        // Treat "\r\n" as one terminator: skip the '\r' so that the next line
        // starts after the '\n'.
        if (offset + 1 < input_size && input_[offset + 1] == '\n') {
          ++offset;
        }
        line_offsets_.push_back(++offset);
        break;
      default:
        ++offset;
        break;
    }
  }
}

namespace {

// Helper function used when iterating through a line of text to advance one
// character.
//
// On input, <byte_offset> points to a byte offset within <current_line>
// and <column> points to the one-based column number of <byte_offset>.
//
// On output, advances <*byte_offset> and <*column> by the number of bytes
// and columns represented by the current character, respectively.
//
// In most cases, one byte <=> one char <=> one column, but not always.
// <current_line> is processed as UTF-8, and multi-byte characters map to just
// one column. In addition, the tab character ('\t') advances the column number
// forward to the next tab stop (tab stops are kTabWidth columns apart), so
// multiple columns contained within a single byte are also possible.
//
// Since <current_line> is assumed to represent a single line of text (not
// including the terminating newline character), we are assumed to never
// encounter newline characters within <current_line>.
//
// <stop_byte_offset> and <stop_column>, if present, specify a byte offset and
// column number which we will never advance past, even if <stop_byte_offset>
// is in the middle of a character, or <stop_column> is in the middle of a
// tab expansion. If either of these constraints is hit, we return Ok and
// partially advance either the byte offset or the column, up to the limit
// provided. Note that <stop_column> is only consulted for tab characters and
// <stop_byte_offset> only for non-tab characters.
//
// If the current byte offset points at an invalid UTF-8 sequence, we advance
// one byte and one column.
//
// Returns an error (via SQL_RET_CHECK) if <*byte_offset> is out of bounds with
// respect to <current_line>.
absl::Status AdvanceOneChar(absl::string_view current_line,
                            std::optional<int> stop_byte_offset,
                            std::optional<int> stop_column, int* column,
                            int* byte_offset) {
  SQL_RET_CHECK_GE(*byte_offset, 0) << "Negative byte offset";
  SQL_RET_CHECK_LT(*byte_offset, current_line.length())
      << "Byte offset beyond the last column of line";

  if (current_line[*byte_offset] == '\t') {
    // Columns are one-based, so the column following a tab is one past the
    // next multiple of kTabWidth.
    const int new_column =
        bigquery_ml_utils_base::MathUtil::RoundUpTo(*column, kTabWidth) + 1;
    if (stop_column.has_value() && new_column > stop_column.value()) {
      // <stop_column> points to whitespace in the middle of tab expansion.
      // Only the column moves; the byte offset stays on the tab.
      *column = stop_column.value();
    } else {
      *column = new_column;
      ++*byte_offset;
    }
    return absl::OkStatus();
  }

  // Figure out the length of the current UTF-8 character. Note that
  // <new_byte_offset> and <current_code_point> are passed by reference and
  // modified by the U8_NEXT() macro. On output, <new_byte_offset> is the
  // byte offset of the end of the current character and <current_code_point>
  // is the code point of the current character, or a negative value in case of
  // error.
  int new_byte_offset = *byte_offset;
  UChar32 current_code_point;
  U8_NEXT(current_line.data(), new_byte_offset, current_line.length(),
          current_code_point);
  if (current_code_point < 0) {
    // The line contains invalid utf-8, so just fall back to advancing a
    // single byte.
    new_byte_offset = *byte_offset + 1;
  }

  if (stop_byte_offset.has_value() &&
      new_byte_offset > stop_byte_offset.value()) {
    // <*stop_byte_offset> represents a byte in the middle of the UTF-8
    // character. Only the byte offset moves; the column is left unchanged.
    *byte_offset = stop_byte_offset.value();
  } else {
    ++*column;
    *byte_offset = new_byte_offset;
  }
  return absl::OkStatus();
}

// Returns the one-based column number of <desired_byte_offset> within
// <current_line>, where the offset is relative to the start of the line.
// Propagates the error from AdvanceOneChar() if the offset lies beyond the end
// of the line.
absl::StatusOr<int> ColumnNumberFromLineLocalByteOffset(
    absl::string_view current_line, int desired_byte_offset) {
  int column = 1;  // Column numbers are one-based.
  int byte_offset = 0;
  while (byte_offset < desired_byte_offset) {
    SQL_RETURN_IF_ERROR(AdvanceOneChar(
        current_line, /*stop_byte_offset=*/desired_byte_offset,
        /*stop_column=*/std::optional<int>(), &column, &byte_offset));
  }
  return column;
}

}  // namespace

// Translates <byte_offset> (relative to the start of the input) into a
// one-based (line, column) pair. Tabs expand to the next tab stop and
// multi-byte UTF-8 characters count as one column.
//
// Returns an error if <byte_offset> is outside [0, input size], or if it lies
// beyond the text of its line as returned by GetLineText().
absl::StatusOr<std::pair<int, int>>
ParseLocationTranslator::GetLineAndColumnFromByteOffset(int byte_offset) const {
  // The debug checks precede the SQL_RET_CHECK that reports the same condition
  // as a status.
  SQL_DCHECK_GE(byte_offset, 0);
  SQL_DCHECK_LE(byte_offset, input_.size());
  SQL_RET_CHECK(byte_offset >= 0 &&
                byte_offset <= static_cast<int64_t>(input_.size()))
      << "Byte offset " << byte_offset << " out of bounds of input (size "
      << input_.size() << ")";

  CalculateLineOffsets();
  // Verify the vector is non-empty before looking at its first element.
  SQL_DCHECK(!line_offsets_.empty());
  SQL_DCHECK_EQ(line_offsets_.front(), 0);

  auto ub_iter = std::upper_bound(line_offsets_.begin(), line_offsets_.end(),
                                  byte_offset);
  // ub_iter points at the beginning of the *next* line. Stepping back is safe
  // because line_offsets_[0] == 0 <= byte_offset.
  --ub_iter;
  const int line_number =
      static_cast<int>(std::distance(line_offsets_.begin(), ub_iter) + 1);

  SQL_ASSIGN_OR_RETURN(absl::string_view current_line,
                       GetLineText(line_number));
  SQL_ASSIGN_OR_RETURN(
      int column_number,
      ColumnNumberFromLineLocalByteOffset(
          current_line, byte_offset - line_offsets_[line_number - 1]),
      _ << "\nByte offset: " << byte_offset << "\nError in line "
        << line_number << ", which starts at byte offset "
        << line_offsets_[line_number - 1]);
  return std::make_pair(line_number, column_number);
}

// Translates a one-based (line, column) pair into a byte offset relative to
// the start of the input. A column that falls inside a tab expansion maps to
// the byte offset of the tab itself.
//
// Returns an error if <line> or <column> is less than 1, if <line> exceeds the
// number of lines, or if <column> lies past the end of the line.
absl::StatusOr<int> ParseLocationTranslator::GetByteOffsetFromLineAndColumn(
    int line, int column) const {
  SQL_RET_CHECK_GE(line, 1);
  SQL_RET_CHECK_GE(column, 1);

  CalculateLineOffsets();

  // Find the offset corresponding to the line number.
  SQL_RET_CHECK_LE(line, line_offsets_.size())
      << "Query had " << line_offsets_.size() << " lines but line " << line
      << " was requested";

  SQL_ASSIGN_OR_RETURN(absl::string_view current_line, GetLineText(line));
  SQL_DCHECK_EQ(current_line.find('\r'), current_line.npos)
      << "GetLineText() returned string with newline characters";
  SQL_DCHECK_EQ(current_line.find('\n'), current_line.npos)
      << "GetLineText() returned string with newline characters";

  int byte_offset = 0;
  int curr_column = 1;
  while (curr_column < column) {
    SQL_RETURN_IF_ERROR(AdvanceOneChar(current_line,
                                       /*stop_byte_offset=*/std::optional<int>(),
                                       /*stop_column=*/column, &curr_column,
                                       &byte_offset));
  }
  return line_offsets_[line - 1] + byte_offset;
}

// Returns the one-based (line, column) of <point>, using only its byte offset.
absl::StatusOr<std::pair<int, int>>
ParseLocationTranslator::GetLineAndColumnAfterTabExpansion(
    ParseLocationPoint point) const {
  return GetLineAndColumnFromByteOffset(point.GetByteOffset());
}

// Return <input> with tabs expanded to spaces, assuming kTabWidth-char tabs.
// Tab stops are computed from the number of bytes emitted so far. <input> must
// be a single line: newline characters trip a debug check.
std::string ParseLocationTranslator::ExpandTabs(absl::string_view input) {
  std::string out;
  out.reserve(input.size());
  for (const char c : input) {
    SQL_DCHECK(c != '\n' && c != '\r');
    if (c == '\t') {
      // Pad with spaces up to the next multiple of kTabWidth.
      out.append(kTabWidth - (out.size() % kTabWidth), ' ');
    } else {
      out += c;
    }
  }
  return out;
}

// Returns the text of the one-based line <line>, excluding its line terminator
// ("\n", "\r" or "\r\n"). The returned view points into the input passed to
// the constructor.
//
// Returns an error if <line> is not in [1, number of lines].
absl::StatusOr<absl::string_view> ParseLocationTranslator::GetLineText(
    int line) const {
  CalculateLineOffsets();
  SQL_RET_CHECK_GT(line, 0) << "Line number <= 0";
  SQL_RET_CHECK_LE(line, line_offsets_.size())
      << "Query had " << line_offsets_.size() << " lines but line " << line
      << " was requested";

  const int line_index = line - 1;
  const int line_start_offset = line_offsets_[line_index];
  int line_end_offset;
  if (line_index == line_offsets_.size() - 1) {
    // The last line runs to the end of the input and has no terminator.
    line_end_offset = static_cast<int>(input_.size());
  } else {
    // Points at the final terminator byte of this line, which is excluded
    // from the returned text.
    line_end_offset = line_offsets_[line_index + 1] - 1;
  }

  // If the line ends with "\r\n", don't include the "\r" as part of the line.
  if (line_end_offset > 0 && line_end_offset < input_.size() &&
      input_[line_end_offset] == '\n' && input_[line_end_offset - 1] == '\r') {
    --line_end_offset;
  }

  return input_.substr(line_start_offset, line_end_offset - line_start_offset);
}

}  // namespace bigquery_ml_utils