sql_utils/public/parse_location.h (133 lines of code) (raw):

/* * Copyright 2023 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef THIRD_PARTY_PY_BIGQUERY_ML_UTILS_SQL_UTILS_PUBLIC_PARSE_LOCATION_H_ #define THIRD_PARTY_PY_BIGQUERY_ML_UTILS_SQL_UTILS_PUBLIC_PARSE_LOCATION_H_ #include <ostream> #include <string> #include <utility> #include <vector> #include "sql_utils/public/parse_location_range.pb.h" #include "absl/base/attributes.h" #include "absl/status/statusor.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "sql_utils/base/ret_check.h" #include "sql_utils/base/status.h" namespace bigquery_ml_utils { class InternalErrorLocation; // Point location in the input string. // Use ParseLocationTranslator to translate a ParseLocationPoint into something // that can be used in relation to the input query. // The <filename> is informational, and only used for error messaging. class ParseLocationPoint { public: ParseLocationPoint() : byte_offset_(-1) {} // Creates a ParseLocationPoint from a filename and byte offset. // <filename> must remain valid for the lifetime of this ParseLocationPoint. static ParseLocationPoint FromByteOffset(absl::string_view filename, int byte_offset) { ParseLocationPoint point; point.filename_ = filename; point.byte_offset_ = byte_offset; return point; } // Creates a ParseLocationPoint from a byte offset (with empty filename). static ParseLocationPoint FromByteOffset(int byte_offset) { return FromByteOffset(absl::string_view(), byte_offset); } absl::string_view filename() const { return filename_; } // Returns the byte offset corresponding to this parse location point. Returns // a negative value for invalid ParseLocationPoints. int GetByteOffset() const { return byte_offset_; } // Creates a ParseLocationPoint from the contents of <info>. Not intended for // public use. static ParseLocationPoint FromInternalErrorLocation( const InternalErrorLocation& info); // Returns the contents of this point as an internal error location. Not // intended for public use. InternalErrorLocation ToInternalErrorLocation() const; // Returns the string representation of this ParseLocationPoint, in the // form of [filename:]byte_offset. std::string GetString() const { if (byte_offset_ >= 0) { return absl::StrCat( (!filename_.empty() ? absl::StrCat(filename_, ":") : ""), byte_offset_); } else { return "INVALID"; } } friend bool operator==(const ParseLocationPoint& lhs, const ParseLocationPoint& rhs) { return lhs.filename_ == rhs.filename_ && lhs.byte_offset_ == rhs.byte_offset_; } friend bool operator!=(const ParseLocationPoint& lhs, const ParseLocationPoint& rhs) { return !(lhs == rhs); } friend bool operator<(const ParseLocationPoint& lhs, const ParseLocationPoint& rhs) { if (lhs.filename_ == rhs.filename_) { return lhs.byte_offset_ < rhs.byte_offset_; } return lhs.filename_ < rhs.filename_; } friend std::ostream& operator<<(std::ostream& os, const ParseLocationPoint& point) { return os << "ParseLocationPoint at offset " << point.GetByteOffset(); } private: absl::string_view filename_; int byte_offset_; // Intentionally copyable. }; // A half-open range of ParseLocationPoints [start(), end()). class ParseLocationRange { public: ParseLocationRange() {} void set_start(ParseLocationPoint start) { start_ = start; } void set_end(ParseLocationPoint end) { end_ = end; } ParseLocationPoint start() const { return start_; } ParseLocationPoint end() const { return end_; } absl::StatusOr<ParseLocationRangeProto> ToProto() const { // The ParseLocationProto only has a single field for the filename, so it // cannot represent a ParseLocationRange where the start and end locations // have different filenames. We SQL_CHECK that condition here. SQL_RET_CHECK_EQ(start().filename(), end().filename()); ParseLocationRangeProto proto; proto.set_filename(std::string(start().filename())); proto.set_start(start().GetByteOffset()); proto.set_end(end().GetByteOffset()); return proto; } // The 'filename' in start and end fields of ParseLocationRange is a // string_view. This filename will point to the filename string in the // ParseLocationRangeProto. Therefore 'proto' must outlive the returned // ParseLocationRange. static absl::StatusOr<ParseLocationRange> Create( const ParseLocationRangeProto& proto) { SQL_RET_CHECK(proto.has_start() && proto.has_end()) << "Provided ParseLocationRangeProto does not have start and/or end " "byte offsets"; ParseLocationRange parse_location_range; // ParseLocationRangeProto has a single filename that is used for both the // start and end location in the output ParseLocationRange. parse_location_range.set_start( ParseLocationPoint::FromByteOffset(proto.filename(), proto.start())); parse_location_range.set_end( ParseLocationPoint::FromByteOffset(proto.filename(), proto.end())); return parse_location_range; } // Returns the string representation of this parse location. std::string GetString() const { if (!start_.filename().empty() && start_.filename() == end_.filename()) { return absl::StrCat(start_.filename(), ":", start_.GetByteOffset(), "-", end_.GetByteOffset()); } return absl::StrCat(start_.GetString(), "-", end_.GetString()); } friend bool operator==(const ParseLocationRange& lhs, const ParseLocationRange& rhs) { return lhs.start() == rhs.start() && lhs.end() == rhs.end(); } friend std::ostream& operator<<(std::ostream& os, const ParseLocationRange& range) { return os << "ParseLocationRange from " << range.start().GetByteOffset() << " to " << range.end().GetByteOffset(); } private: ParseLocationPoint start_; ParseLocationPoint end_; // Intentionally copyable. }; // Translates ParseLocationPoints to offsets and line/column numbers. For // translation of offsets to line/column numbers, does a pass over the input // string to record the byte offsets of every line. Accepted end of line // characters are \n, \r\n, or \r. // // The input absl::string_view must outlive this class. // // NOT thread compatible, because the line offset table is calculated on demand // without any locking. class ParseLocationTranslator { public: explicit ParseLocationTranslator(absl::string_view input); ParseLocationTranslator(const ParseLocationTranslator&) = delete; ParseLocationTranslator& operator=(const ParseLocationTranslator&) = delete; // Calculates the line and column number corresponding to <point>. The // returned column number is a 1-based UTF-8 character index in // ExpandTabs(GetLineText(*line)). The character index can currently be // incorrect for strings containing multi-byte characters because the column // number calculation algorithm and ExpandTabs both assume single-byte // characters. // // Returns a generic::INTERNAL status for invalid positions (byte offset < 0 // or > length of input). absl::StatusOr<std::pair<int, int>> GetLineAndColumnAfterTabExpansion( ParseLocationPoint point) const; // Gets the text for line number <line>. If the line is invalid, returns // a failed absl::Status. absl::StatusOr<absl::string_view> GetLineText(int line) const; // Return <input> with tabs expanded to spaces, assuming 8-char tabs. // <input> must not contain any new line characters. static std::string ExpandTabs(absl::string_view input); // Calculates the byte offset from the start of the input that corresponds to // 'line' and 'column' and returns it in 'byte_offset'. Returns a failed // status if the line and/or column are invalid. Line and column have the // same semantics as described in GetLineAndColumnAfterTabExpansion, i.e., // they are post-tab-expansion. absl::StatusOr<int> GetByteOffsetFromLineAndColumn(int line, int column) const; private: // Calculates and returns the line and column number for byte offset // 'byte_offset', using the same line and column semantics as described in // GetLineAndColumnAfterTabExpansion(). absl::StatusOr<std::pair<int, int>> GetLineAndColumnFromByteOffset( int byte_offset) const; // Calculates line_offsets_ if it has not been calculated yet. void CalculateLineOffsets() const; absl::string_view input_; // line_offset_[i] is start offset of line (i-1) in input_. Calculated on // demand using CalculateLineOffsets(). mutable std::vector<int> line_offsets_; }; } // namespace bigquery_ml_utils #endif // THIRD_PARTY_PY_BIGQUERY_ML_UTILS_SQL_UTILS_PUBLIC_PARSE_LOCATION_H_