ecosystem/simple_dom.cpp (313 lines of code) (raw):

#define protected public #include "simple_dom.h" #undef protected #include <fcntl.h> #include <sys/stat.h> #include <stdlib.h> #include <vector> #include <algorithm> #include <photon/common/alog.h> #include <photon/common/alog-stdstring.h> #include <photon/common/utility.h> #include <photon/common/stream.h> #include <photon/common/retval.h> #include <photon/fs/localfs.h> #include <photon/fs/filesystem.h> #include <rapidxml.hpp> #include <rapidjson/reader.h> #define RYML_SINGLE_HDR_DEFINE_NOW #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-but-set-variable" #include <rapidyaml-0.5.0.hpp> #pragma GCC diagnostic pop using namespace std; namespace photon { namespace SimpleDOM { inline int NodeImpl::init_non_root(str key, str value, const NodeImpl* root, uint32_t flags) { _root = root; assert(root); _flags = flags & ~FLAG_IS_ROOT; assert(key.length() <= MAX_KEY_LENGTH); assert(value.length() <= MAX_VALUE_LENGTH); _k_len = key.length(); _v_len = value.length(); auto text_begin = root->_text_begin; assert(key.empty() || key.data() > text_begin); assert(value.empty() || value.data() > key.end()); uint64_t koff, voff; switch ((key.empty() << 1) | value.empty()) { case 0: // key && value koff = key.data() - text_begin; voff = value.data() - key.end(); break; case 1: // key && !value koff = key.data() - text_begin; voff = 0; break; case 2: // !key && value koff = value.data() - text_begin; voff = 0; break; case 3: // !key && !value _k_off = 0; _k_len = 0; _v_off = 0; _v_len = 0; return 0; default: assert(false); return -1; } assert(koff <= MAX_KEY_OFFSET); assert(voff <= MAX_VALUE_OFFSET); _k_off = koff; _v_off = voff; return 0; } inline int NodeImpl::init_root(const char* text_begin, uint32_t node_size, bool text_ownership) { _flags = FLAG_IS_ROOT | FLAG_IS_LAST; if (text_ownership) _flags |= FLAG_TEXT_OWNERSHIP; _text_begin = text_begin; assert(node_size <= MAX_NODE_SIZE); _node_size = node_size; _refcnt = 0; return 0; } template<typename Derived> class DocNode : public NodeImpl { public: vector<Derived> _children; DocNode() = default; DocNode(DocNode&&) = default; DocNode& operator=(DocNode&&) = default; DocNode(const char* text_begin, bool text_ownership) { init_root(text_begin, sizeof(Derived), text_ownership); } DocNode(str key, str value, const NodeImpl* root) { init_non_root(key, value, root, 0); } DocNode(const NodeImpl* root) : DocNode({}, {}, root) { } void print_children(int depth) { for (auto& x: _children) { auto k = x.get_key(), v = x.get_value(); LOG_DEBUG(VALUE(depth), k, ':', v); } } void set_children(vector<Derived>&& nodes, bool _sort = true) { if (nodes.empty()) return; assert(nodes.size() <= MAX_NCHILDREN); if (nodes.size() > MAX_NCHILDREN) nodes.resize(MAX_NCHILDREN); if (_sort) sort(nodes.begin(), nodes.end()); nodes.back()._flags |= FLAG_IS_LAST; // must be after sort!!! _nchildren = nodes.size(); _children = std::move(nodes); } ~DocNode() override { if (is_root()) { assert(_refcnt == 0); if (_flags & FLAG_TEXT_OWNERSHIP) free((void*)_text_begin); } } const NodeImpl* get(size_t i) const override { return (i < _children.size()) ? &_children[i] : nullptr; } const NodeImpl* get(str key) const override { if (_children.empty()) return nullptr; for (size_t i = 0; i < _children.size() - 1; ++i) { assert((_children[i]._flags & FLAG_IS_LAST) == 0); } assert(_children.back()._flags & FLAG_IS_LAST); auto it = std::lower_bound(_children.begin(), _children.end(), key); return (it == _children.end() || it->get_key() != key) ? nullptr : &*it; } }; using namespace rapidjson; class JNode : public DocNode<JNode> { public: using DocNode::DocNode; }; struct JHandler : public BaseReaderHandler<UTF8<>, JHandler> { vector<vector<JNode>> _nodes{1}; str _key; JNode* _root; JHandler(const char* text, bool text_ownership) { assert(_nodes.size() == 1); _root = new JNode(text, text_ownership); } ~JHandler() { assert(_nodes.size() == 1); assert(_nodes.front().size() == 1); _root->set_children(std::move(_nodes.front().front()._children)); } JNode* get_root() { return _root; } void emplace_back(const char* s, size_t length) { str val{s, length}; // _key may be empty() _nodes.back().emplace_back(_key, val, _root); // LOG_DEBUG(_key, ": ", val); _key = {}; } bool Null() { emplace_back(0, 0); return true; } bool Key(const char* s, SizeType len, bool copy) { assert(!copy); _key = {s, len}; return true; } bool String(const char* s, SizeType len, bool copy) { assert(!copy); emplace_back(s, len); return true; } bool RawNumber(const Ch* s, SizeType len, bool copy) { assert(!copy); // LOG_DEBUG(ALogString(s, len)); emplace_back(s, len); return true; } bool RawBool(const Ch* s, SizeType len, bool copy) { assert(!copy); emplace_back(s, len); return true; } bool StartObject() { emplace_back(0, 0); _nodes.emplace_back(); return true; } bool EndObject(SizeType memberCount) { commit(true); return true; } void commit(bool sort) { assert(_nodes.size() > 1); auto temp = std::move(_nodes.back()); _nodes.pop_back(); assert(_nodes.back().size() > 0); // LOG_DEBUG(temp.size(), " elements to ", _nodes.back().back().get_key(), " sort=", sort); _nodes.back().back().set_children(std::move(temp), sort); } bool StartArray() { emplace_back(0, 0); _nodes.emplace_back(); return true; } bool EndArray(SizeType elementCount) { commit(false); return true; } }; static NodeImpl* parse_json(char* text, size_t size, int flags) { const auto kFlags = kParseNumbersAsStringsFlag | kParseBoolsAsStringFlag | kParseInsituFlag | kParseCommentsFlag | kParseTrailingCommasFlag; JHandler h(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION); using Encoding = UTF8<>; GenericInsituStringStream<Encoding> s(text); GenericReader<Encoding, Encoding> reader; reader.Parse<kFlags>(s, h); return h.get_root(); } using namespace rapidxml; class XMLNode : public DocNode<XMLNode> { public: using DocNode::DocNode; unique_ptr<XMLNode> __attributes__{nullptr}; retval<XMLNode*> emplace_back(vector<XMLNode>& nodes, xml_base<char>* x) { if (x->name_size() == 0) return {nullptr, ECANCELED}; str k{x->name(), x->name_size()}; str v{x->value(), x->value_size()}; nodes.emplace_back(k, v, get_root()); // LOG_DEBUG(k, ':', v); return &nodes.back(); } void build(xml_node<char>* xml_node, int depth = 0) { vector<XMLNode> nodes; for (auto x = xml_node->first_node(); x; x = x->next_sibling()) { auto ret = emplace_back(nodes, x); if (ret.succeeded()) ret->build(x, depth + 1); } set_children(std::move(nodes)); assert(nodes.empty()); if (auto x = xml_node->first_attribute()) { do { emplace_back(nodes, x); } while((x = x->next_attribute())); auto a = new XMLNode(get_root()); a->set_children(std::move(nodes)); __attributes__.reset(a); } } const NodeImpl* get(str key) const override { return (key != "__attributes__") ? DocNode::get(key) : __attributes__.get(); } }; static NodeImpl* parse_xml(char* text, size_t size, int flags) { xml_document<char> doc; doc.parse<0>(text); auto root = new XMLNode(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION); assert(root); root->build(&doc); return root; } class YAMLNode : public DocNode<YAMLNode> { public: using DocNode::DocNode; str _to_str(ryml::csubstr s) { return {s.str, s.len}; } void build(ryml::ConstNodeRef yaml_node, int depth = 0) { vector<YAMLNode> nodes; for (const auto& x: yaml_node.children()) { assert(x.has_key() != yaml_node.is_seq()); str k, v; if (x.has_key()) k = _to_str(x.key()); if (x.has_val()) v = _to_str(x.val()); // LOG_DEBUG(k, ':', v); nodes.emplace_back(k, v, get_root()); nodes.back().build(x, depth + 1); } set_children(std::move(nodes), !yaml_node.is_seq()); } }; static NodeImpl* parse_yaml(char* text, size_t size, int flags) { auto yaml = ryml::parse_in_place({text, size}); auto root = new YAMLNode(text, flags & DOC_FREE_TEXT_ON_DESTRUCTION); assert(root); root->build(yaml.rootref()); return root; } static NodeImpl* parse_ini(char* text, size_t size, int flags) { return nullptr; } Node parse(char* text, size_t size, int flags) { if (!text || !size) LOG_ERROR_RETURN(EINVAL, nullptr, "invalid argument:", VALUE(text), VALUE(size)); using Parser = NodeImpl* (*) (char* text, size_t size, int flags); constexpr static Parser parsers[] = {&parse_json, &parse_xml, &parse_yaml, &parse_ini}; auto i = flags & DOC_TYPE_MASK; if ((size_t) i > LEN(parsers)) { if (flags & DOC_FREE_TEXT_IF_PARSING_FAILED) free(text); LOG_ERROR_RETURN(EINVAL, nullptr, "invalid document type ", HEX(i)); } return parsers[i](text, size, flags); } Node parse_file(fs::IFile* file, int flags) { return parse(file->readall(), flags | DOC_OWN_TEXT); } Node parse_file(const char* filename, int flags, fs::IFileSystem* fs) { using namespace fs; auto file = fs ? fs->open(filename, O_RDONLY) : open_localfile_adaptor(filename, O_RDONLY) ; if (!file) LOG_ERRNO_RETURN(0, nullptr, "failed to open file ", filename); DEFER(delete file); return parse_file(file, flags); } } }