include/S_Expression.h (528 lines of code) (raw):
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <algorithm>
#include <cctype>
#include <cstdint>
#include <functional>
#include <initializer_list>
#include <iomanip>
#include <istream>
#include <iterator>
#include <memory>
#include <ostream>
#include <sstream>
#include <stack>
#include <string>
#include <vector>
#include <boost/functional/hash.hpp>
#include <boost/functional/hash_fwd.hpp>
#include <boost/optional.hpp>
#include "Exceptions.h"
namespace sparta {
// Forward declarations.
namespace s_expr_impl {
class Component;
class Pattern;
} // namespace s_expr_impl
/*
* S-expressions are the syntactic structure of the Lisp family of languages.
* They're also a compact, efficient and human-readable format for serializing
* complex data structures (see this document for a more detailed rationale:
* http://people.csail.mit.edu/rivest/Sexp.txt). In particular, the artifacts
* created when performing abstract interpretation (semantic equations, program
* invariants) can be conveniently represented using S-expressions.
*
* This library implements a simple form of S-expressions that is sufficient for
* most serialization purposes. There are two types of atoms: strings and 32-bit
* signed integers. In the serialized form, integers are prefixed with the `#`
* sign and strings are quoted (escape sequences are permitted inside strings).
* By analogy with the symbols of Lisp, if a string contains only alphanumeric
* characters and underscores, the double quotes are optional. Lists of elements
* are represented using parentheses. Note that in our representation and unlike
* Lisp, the empty list `()`, also commonly called nil, is not an atom.
*
* Examples of S-expressions:
* #12
* "a string\n"
* (a (b c) d) ; a comment
* ((#-1 "a, b, c") (#0 d) (#1 ()))
*
* Note that a number that is not prefixed with the `#` character is interpreted
* as a string.
*
* S-expressions are immutable, functional data structures that can share
* subcomponents. In the following code for example, the S-expression e1 is not
* duplicated in e2 and e3, but shared instead:
*
* auto e1 = s_expr({s_expr("a"), s_expr("b")});
* auto e2 = s_expr({s_expr("l1"), e1});
* auto e3 = s_expr({s_expr("l2"), e1});
*
* The disposal of S-expressions is managed by shared pointers under the hood.
*/
class s_expr final {
public:
/*
* The default constructor returns the empty list `()`.
*/
s_expr();
/*
* Constructs an 32-bit signed integer atom.
*/
explicit s_expr(int32_t n);
/*
* Constructs a string atom.
*/
explicit s_expr(const std::string& s);
/*
* Various constructors for a list. The empty list (nil) can be constructed
* with `s_expr({})`.
*/
explicit s_expr(std::initializer_list<s_expr> l);
explicit s_expr(const std::vector<s_expr>& l);
template <typename InputIterator>
s_expr(InputIterator first, InputIterator last);
/*
* Predicate checking whether the S-expression is the empty list `()` or not.
*/
bool is_nil() const;
/*
* Note that nil (the empty list) is not an atom.
*/
bool is_atom() const;
bool is_int32() const;
bool is_string() const;
bool is_list() const;
/*
* Returns the value of an integer atom.
*/
int32_t get_int32() const;
/*
* Returns the contents of a string atom.
*/
const std::string& get_string() const;
/*
* Returns the size of a list.
*/
size_t size() const;
/*
* Returns the element of a list at the given position. The first element of a
* list has index 0.
*/
s_expr operator[](size_t index) const;
/*
* This operation returns the given list minus its first n elements. For
* example, if e is the S-expression `(a (b) (c d) e)`, then:
* e.tail(2) = ((c d) e)
* e.tail(4) = ()
*/
s_expr tail(size_t n) const;
/*
* Structural equality test of S-expressions. If both expressions share some
* subcomponents, the complexity of this operation can be sublinear.
*/
bool equals(const s_expr& other) const;
/*
* The complexity of computing the hash value is linear in the size of the
* S-expression.
*/
size_t hash_value() const;
/*
* Outputs a standard representation of the S-expression on the given stream.
*/
void print(std::ostream& output) const;
/*
* Returns a standard representation of the S-expression as a string.
*/
std::string str() const;
friend bool operator==(const s_expr& e1, const s_expr& e2) {
return e1.equals(e2);
}
friend bool operator!=(const s_expr& e1, const s_expr& e2) {
return !e1.equals(e2);
}
/*
* We need this function in order to use boost::hash<s_expr>.
*/
friend size_t hash_value(const s_expr& e) { return e.hash_value(); }
private:
// Appends an element to a list. This operation is used during parsing.
void add_element(const s_expr& element);
// By construction, m_component can never be null.
std::shared_ptr<s_expr_impl::Component> m_component;
friend class s_expr_istream;
};
} // namespace sparta
inline std::ostream& operator<<(std::ostream& output, const sparta::s_expr& e) {
e.print(output);
return output;
}
namespace sparta {
/*
* This is a stream-like structure for parsing S-expressions from a character
* stream input.
*
* Example usage:
* std::string str = "(a b) (c);";
* std::istringstream input(str);
* s_expr_istream si(input);
* s_expr e1, e2, e3;
* si >> e1 >> e2;
* // e1 = (a b) and e2 = (c)
* si >> e3;
* // parse error: si.fail() returns true
* // si.what() contains the error message
*/
class s_expr_istream final {
public:
s_expr_istream() = delete;
s_expr_istream(const s_expr_istream&) = delete;
s_expr_istream& operator=(const s_expr_istream&) = delete;
s_expr_istream(std::istream& input)
: m_input(input),
m_line_number(1),
m_status(Status::Good),
m_what("OK") {}
s_expr_istream& operator>>(s_expr& expr);
/*
* These status functions are inspired from the standard stream API.
*/
bool good() const { return m_status == Status::Good; }
bool fail() const { return m_status != Status::Good; }
/*
* When failing to read an S-expression from the input stream, we want to
* distinguish between reaching the end of the input and an actual parse
* error.
*/
bool eoi() const { return m_status == Status::EOI; }
/*
* When failing to read an S-expression from the input stream, we can use this
* function to obtain a description of the error.
*/
const std::string& what() const { return m_what; }
private:
enum class Status { EOI, Good, Fail };
void skip_white_spaces();
void set_status(Status status, const std::string& what_arg);
std::stack<s_expr> m_stack;
std::istream& m_input;
size_t m_line_number;
Status m_status;
std::string m_what;
};
/*
* S-expressions are primarily intended to be used as a serialization format for
* complex data structures. When deserializing an S-expression, it would be very
* cumbersome to inspect its structure using the basic accessor functions only.
* Pattern matching is a more compact and readable way of filtering
* S-expressions according to structural criteria. For example, imagine we have
* an encoding of functions as S-expressions:
*
* (function (name "my function") (package "my package")
* (args (#1 "arg1") (#2 "arg2") (#3 "arg3")))
*
* We want to extract the name and arguments of a function while ignoring the
* package name. Using the pattern-matching mechanism for S-expressions, this
* can be coded as follows:
*
* std::string name;
* s_expr args;
* if(s_patn({s_patn("function"),
* s_patn({s_patn("name"), s_patn(&name)}),
* s_patn({s_patn("package"), s_patn()}),
* s_patn({s_patn("args")}, args)}).match_with(e)) {
* // name = "my function"
* // args = ((#1 "arg1") (#2 "arg2") (#3 "arg3"))
* ...
* }
*/
class s_patn {
public:
class pattern_matching_error
: public virtual abstract_interpretation_exception {};
/*
* The wildcard pattern matches any S-expression.
*/
s_patn();
/*
* Matches any S-expression and stores it into the given placeholder.
*/
explicit s_patn(s_expr& placeholder);
/*
* Matches an integer atom with the given value.
*/
explicit s_patn(int32_t n);
/*
* Matches any integer atom and stores its value into the given placeholder.
*/
explicit s_patn(int32_t* placeholder);
/*
* Matches a string atom with the given value.
*/
explicit s_patn(const std::string& s);
/*
* Matches any string atom and stores its value into the given placeholder.
*/
explicit s_patn(std::string* placeholder);
/*
* Matches each element in a list with the given sequence of patterns.
*/
explicit s_patn(std::initializer_list<s_patn> heads);
/*
* Matches the first elements of a list with the given sequence of patterns
* and stores the remaining elements into the given placeholder. If there are
* no remaining elements, tail contains the empty list upon successful
* matching.
*
* Example:
* Assume e = ((1 2) 3 4 (5 6))
*
* s_expr x, y, t;
* if (s_patn({s_patn(x), s_patn(y)}, t).match_with(e)) {
* // x = (1 2)
* // y = 3
* // t = (4 (5 6))
* ...
* }
*/
s_patn(std::initializer_list<s_patn> heads, s_expr& tail);
/*
* Returns true if the pattern matches the given S-expression. Upon successful
* matching, all placeholders in the pattern are be set to their corresponding
* values. If the matching fails, the values of the placeholders are
* undefined.
*/
bool match_with(const s_expr& expr);
void must_match(const s_expr& expr, const std::string& msg);
private:
// By construction, m_pattern can never be null.
std::shared_ptr<s_expr_impl::Pattern> m_pattern;
};
namespace s_expr_impl {
enum class ComponentKind { Int32Atom, StringAtom, List };
// Checks whether a character belongs to a Lisp-like symbol, i.e., a string atom
// that can be represented without quotes.
inline bool is_symbol_char(char c) {
return std::isalnum(c) || c == '_' || c == '-' || c == '/' || c == ':' ||
c == '.';
}
class Component {
public:
virtual ~Component() {}
explicit Component(ComponentKind kind) : m_kind(kind) {}
ComponentKind kind() const { return m_kind; }
virtual bool equals(const std::shared_ptr<Component>& other) const = 0;
virtual size_t hash_value() const = 0;
virtual void print(std::ostream& o) const = 0;
protected:
ComponentKind m_kind;
};
class Int32Atom final : public Component {
public:
explicit Int32Atom(int32_t n)
: Component(ComponentKind::Int32Atom), m_value(n) {}
int32_t get_value() const { return m_value; }
bool equals(const std::shared_ptr<Component>& other) const {
if (this == other.get()) {
// Since S-expressions can share structure, checking for pointer equality
// prevents unnecessary computations.
return true;
}
if (other->kind() != ComponentKind::Int32Atom) {
return false;
}
auto n = std::static_pointer_cast<Int32Atom>(other);
return m_value == n->m_value;
}
size_t hash_value() const {
boost::hash<int32_t> hasher;
return hasher(m_value);
}
void print(std::ostream& output) const { output << "#" << m_value; }
private:
int32_t m_value;
};
class StringAtom final : public Component {
public:
explicit StringAtom(const std::string& s)
: Component(ComponentKind::StringAtom), m_string(s) {}
const std::string& get_string() const { return m_string; }
bool equals(const std::shared_ptr<Component>& other) const {
if (this == other.get()) {
// Since S-expressions can share structure, checking for pointer equality
// prevents unnecessary computations.
return true;
}
if (other->kind() != ComponentKind::StringAtom) {
return false;
}
auto s = std::static_pointer_cast<StringAtom>(other);
return m_string == s->m_string;
}
size_t hash_value() const {
boost::hash<std::string> hasher;
return hasher(m_string);
}
void print(std::ostream& output) const {
if (m_string.empty()) {
// The empty string needs to be explicitly represented.
output << "\"\"";
return;
}
if (std::find_if(m_string.begin(), m_string.end(), [](char c) {
return !is_symbol_char(c);
}) == m_string.end()) {
// If the string only contains alphanumeric characters and underscores, we
// display it without quotes.
output << m_string;
} else {
// The string is quoted and special characters are displayed using escape
// sequences.
output << std::quoted(m_string);
}
}
private:
std::string m_string;
};
class List final : public Component {
public:
List() : Component(ComponentKind::List), m_list(0) { m_list.shrink_to_fit(); }
template <typename InputIterator>
List(InputIterator first, InputIterator last)
: Component(ComponentKind::List), m_list(first, last) {
m_list.shrink_to_fit();
}
size_t size() const { return m_list.size(); }
s_expr get_element(size_t index) const {
try {
return m_list.at(index);
} catch (const std::out_of_range& e) {
// The `at` function throws an exception if the index doesn't lie within
// the bounds of the vector.
BOOST_THROW_EXCEPTION(invalid_argument() << argument_name("index"));
}
}
s_expr tail(size_t index) const {
RUNTIME_CHECK(index <= m_list.size(),
invalid_argument() << argument_name("index"));
// If index == m_list.size(), the function returns the empty list.
return s_expr(std::next(m_list.begin(), index), m_list.end());
}
void add_element(const s_expr& element) { m_list.push_back(element); }
bool equals(const std::shared_ptr<Component>& other) const {
if (this == other.get()) {
// Since S-expressions can share structure, checking for pointer equality
// prevents unnecessary computations.
return true;
}
if (other->kind() != ComponentKind::List) {
return false;
}
auto l = std::static_pointer_cast<List>(other);
if (l->m_list.size() != m_list.size()) {
return false;
}
return std::equal(
m_list.begin(),
m_list.end(),
l->m_list.begin(),
[](const s_expr& e1, const s_expr& e2) { return e1.equals(e2); });
}
size_t hash_value() const {
return boost::hash_range(m_list.begin(), m_list.end());
}
void print(std::ostream& output) const {
output << "(";
for (auto it = m_list.begin(); it != m_list.end(); ++it) {
it->print(output);
if (std::next(it) != m_list.end()) {
output << " ";
}
}
output << ")";
}
private:
std::vector<s_expr> m_list;
};
class Pattern {
public:
virtual ~Pattern() {}
virtual bool match_with(const s_expr& expr) = 0;
};
class WildcardPattern final : public Pattern {
public:
WildcardPattern() = default;
bool match_with(const s_expr& expr) { return true; }
};
class PlaceholderPattern final : public Pattern {
public:
explicit PlaceholderPattern(s_expr& placeholder)
: m_placeholder(placeholder) {}
bool match_with(const s_expr& expr) {
m_placeholder = expr;
return true;
}
private:
s_expr& m_placeholder;
};
class Int32Pattern final : public Pattern {
public:
explicit Int32Pattern(int32_t n) : m_value(n) {}
explicit Int32Pattern(int32_t* placeholder) : m_placeholder(placeholder) {}
bool match_with(const s_expr& expr) {
if (!expr.is_int32()) {
return false;
}
if (m_placeholder) {
**m_placeholder = expr.get_int32();
return true;
}
return m_value == expr.get_int32();
}
private:
int32_t m_value;
boost::optional<int32_t*> m_placeholder;
};
class StringPattern final : public Pattern {
public:
explicit StringPattern(const std::string& s) : m_string(s) {}
explicit StringPattern(std::string* placeholder)
: m_placeholder(placeholder) {}
bool match_with(const s_expr& expr) {
if (!expr.is_string()) {
return false;
}
if (m_placeholder) {
**m_placeholder = expr.get_string();
return true;
}
return m_string == expr.get_string();
}
private:
std::string m_string;
boost::optional<std::string*> m_placeholder;
};
class ListPattern final : public Pattern {
public:
explicit ListPattern(std::initializer_list<s_patn> heads) : m_heads(heads) {}
ListPattern(std::initializer_list<s_patn> heads, s_expr& tail)
: m_heads(heads), m_tail(tail) {}
bool match_with(const s_expr& expr) {
if (!expr.is_list()) {
return false;
}
if (m_heads.size() > expr.size()) {
return false;
}
for (size_t i = 0; i < m_heads.size(); ++i) {
if (!m_heads[i].match_with(expr[i])) {
return false;
}
}
if (m_tail) {
m_tail->get() = expr.tail(m_heads.size());
}
return true;
}
private:
std::vector<s_patn> m_heads;
boost::optional<std::reference_wrapper<s_expr>> m_tail;
};
} // namespace s_expr_impl
inline s_expr::s_expr() : m_component(std::make_shared<s_expr_impl::List>()) {}
inline s_expr::s_expr(int32_t n)
: m_component(std::make_shared<s_expr_impl::Int32Atom>(n)) {}
inline s_expr::s_expr(const std::string& s)
: m_component(std::make_shared<s_expr_impl::StringAtom>(s)) {}
inline s_expr::s_expr(std::initializer_list<s_expr> l)
: m_component(std::make_shared<s_expr_impl::List>(l.begin(), l.end())) {}
inline s_expr::s_expr(const std::vector<s_expr>& l)
: m_component(std::make_shared<s_expr_impl::List>(l.begin(), l.end())) {}
template <typename InputIterator>
inline s_expr::s_expr(InputIterator first, InputIterator last)
: m_component(std::make_shared<s_expr_impl::List>(first, last)) {}
inline bool s_expr::is_nil() const { return is_list() && (size() == 0); }
inline bool s_expr::is_atom() const { return !is_list(); }
inline bool s_expr::is_int32() const {
return m_component->kind() == s_expr_impl::ComponentKind::Int32Atom;
}
inline bool s_expr::is_string() const {
return m_component->kind() == s_expr_impl::ComponentKind::StringAtom;
}
inline bool s_expr::is_list() const {
return m_component->kind() == s_expr_impl::ComponentKind::List;
}
inline int32_t s_expr::get_int32() const {
RUNTIME_CHECK(is_int32(), undefined_operation());
return std::static_pointer_cast<s_expr_impl::Int32Atom>(m_component)
->get_value();
}
inline const std::string& s_expr::get_string() const {
RUNTIME_CHECK(is_string(), undefined_operation());
return std::static_pointer_cast<s_expr_impl::StringAtom>(m_component)
->get_string();
}
inline size_t s_expr::size() const {
RUNTIME_CHECK(is_list(), undefined_operation());
return std::static_pointer_cast<s_expr_impl::List>(m_component)->size();
}
inline s_expr s_expr::operator[](size_t index) const {
RUNTIME_CHECK(is_list(), undefined_operation());
return std::static_pointer_cast<s_expr_impl::List>(m_component)
->get_element(index);
}
inline s_expr s_expr::tail(size_t index) const {
RUNTIME_CHECK(is_list(), undefined_operation());
return std::static_pointer_cast<s_expr_impl::List>(m_component)->tail(index);
}
inline bool s_expr::equals(const s_expr& other) const {
if (m_component == other.m_component) {
// Since S-expressions can share structure, checking for pointer equality
// prevents unnecessary computations.
return true;
}
return m_component->equals(other.m_component);
}
inline size_t s_expr::hash_value() const { return m_component->hash_value(); }
inline void s_expr::print(std::ostream& output) const {
m_component->print(output);
}
inline std::string s_expr::str() const {
std::ostringstream out;
print(out);
return out.str();
}
inline void s_expr::add_element(const s_expr& element) {
RUNTIME_CHECK(m_component->kind() == s_expr_impl::ComponentKind::List,
invalid_argument() << argument_name("element")
<< operation_name("s_expr::add_element()"));
auto list = std::static_pointer_cast<s_expr_impl::List>(m_component);
list->add_element(element);
}
inline s_expr_istream& s_expr_istream::operator>>(s_expr& expr) {
for (;;) {
skip_white_spaces();
if (!m_input.good()) {
if (!m_stack.empty()) {
set_status(Status::Fail, "Incomplete S-expression");
} else {
set_status(Status::EOI, "End of input");
}
return *this;
}
char next_char = m_input.peek();
switch (next_char) {
case '(': {
m_stack.emplace();
m_input.get();
break;
}
case ')': {
if (m_stack.empty()) {
set_status(Status::Fail, "Extra ')' encountered");
return *this;
}
m_input.get();
s_expr list = m_stack.top();
m_stack.pop();
if (m_stack.empty()) {
expr = list;
return *this;
}
m_stack.top().add_element(list);
break;
}
case '#': {
m_input.get();
int32_t n;
m_input >> n;
if (m_input.fail()) {
set_status(Status::Fail, "Error parsing int32_t literal");
return *this;
}
s_expr atom(n);
if (m_stack.empty()) {
expr = atom;
return *this;
}
m_stack.top().add_element(atom);
break;
}
case '"': {
std::string s;
m_input >> std::quoted(s);
if (m_input.fail()) {
set_status(Status::Fail, "Error parsing string literal");
return *this;
}
s_expr atom(s);
if (m_stack.empty()) {
expr = atom;
return *this;
}
m_stack.top().add_element(atom);
break;
}
case ';': {
m_input.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
++m_line_number;
break;
}
default: {
// The next S-expression is necessary a symbol, i.e., an unquoted string.
if (!s_expr_impl::is_symbol_char(next_char)) {
std::ostringstream out;
out << "Unexpected character encountered: '" << next_char << "'";
set_status(Status::Fail, out.str());
return *this;
}
std::ostringstream symbol;
while (m_input.good() && s_expr_impl::is_symbol_char(next_char)) {
symbol << next_char;
m_input.get();
next_char = m_input.peek();
}
s_expr atom(symbol.str());
if (m_stack.empty()) {
expr = atom;
return *this;
}
m_stack.top().add_element(atom);
}
}
}
}
inline void s_expr_istream::skip_white_spaces() {
for (;;) {
char c = m_input.peek();
if (m_input.good() && std::isspace(c)) {
if (c == '\n') {
++m_line_number;
}
m_input.get();
} else {
return;
}
}
}
inline void s_expr_istream::set_status(Status status,
const std::string& what_arg) {
m_status = status;
std::ostringstream ss;
ss << "On line " << m_line_number << ": " << what_arg;
m_what = ss.str();
}
inline s_patn::s_patn()
: m_pattern(std::make_shared<s_expr_impl::WildcardPattern>()) {}
inline s_patn::s_patn(s_expr& placeholder)
: m_pattern(
std::make_shared<s_expr_impl::PlaceholderPattern>(placeholder)) {}
inline s_patn::s_patn(int32_t n)
: m_pattern(std::make_shared<s_expr_impl::Int32Pattern>(n)) {}
inline s_patn::s_patn(int32_t* placeholder)
: m_pattern(std::make_shared<s_expr_impl::Int32Pattern>(placeholder)) {}
inline s_patn::s_patn(const std::string& s)
: m_pattern(std::make_shared<s_expr_impl::StringPattern>(s)) {}
inline s_patn::s_patn(std::string* placeholder)
: m_pattern(std::make_shared<s_expr_impl::StringPattern>(placeholder)) {}
inline s_patn::s_patn(std::initializer_list<s_patn> heads)
: m_pattern(std::make_shared<s_expr_impl::ListPattern>(heads)) {}
inline s_patn::s_patn(std::initializer_list<s_patn> heads, s_expr& tail)
: m_pattern(std::make_shared<s_expr_impl::ListPattern>(heads, tail)) {}
inline bool s_patn::match_with(const s_expr& expr) {
return m_pattern->match_with(expr);
}
inline void s_patn::must_match(const s_expr& expr, const std::string& msg) {
RUNTIME_CHECK(m_pattern->match_with(expr),
pattern_matching_error() << error_msg(
"Could not find match against " + expr.str() + ": " + msg));
}
} // namespace sparta