example/ex_netdb.cc (284 lines of code) (raw):

// SPDX-License-Identifier: Apache-2.0 // Copyright 2014 Network Geographics /** @file Example tool to process network DB files. This generates an executable that processes a list of network files in a specific format. Tokens are (variable) space separated. Example lines look like 4441:34F8:1E40:1EF:0:0:A0:0/108 partner:ngeo raz ?_ prod,dmz "shared space" 4441:34F8:1E40:1EF:0:0:B0:0/108 yahoo raz ?_ prod,dmz "local routing" There is - an IP address network - type/owner field. If "yahoo", it's type "yahoo" and owner "yahoo". Otherwise it's a partner network, with "partner:" followed by the partner name. - pod / colo code - Useless column marker - Network property flags - Comment in double quotes The goal is to transform this in a standard CSV file, dropping the useless column, and tweaking the flags to use ';' instead of ',' as a separator (to avoid confusing the CSV). Adjacent networks with identical properties should also be merged to reduce the data set size. The comments may or not be kept - currently the implement discards them to get better network coalescence. One of the goals of the over all work is to make it easy to build variants of this code to output CSV files tuned to specific uses. */ #include <unordered_set> #include <iostream> #include <fstream> #include "swoc/TextView.h" #include "swoc/swoc_ip.h" #include "swoc/bwf_ip.h" #include "swoc/bwf_std.h" #include "swoc/bwf_ex.h" #include "swoc/swoc_file.h" #include "swoc/Lexicon.h" using namespace std::literals; using namespace swoc::literals; using swoc::TextView; using swoc::IPEndpoint; using swoc::IP4Addr; using swoc::IP4Range; using swoc::IP6Addr; using swoc::IP6Range; using swoc::IPAddr; using swoc::IPRange; using swoc::IPMask; /// Type for temporary buffer writer output. using W = swoc::LocalBufferWriter<512>; /// Network properties. enum class Flag { INVALID = -1, ///< Value for initialization, invalid. INTERNAL = 0, ///< Internal network. PROD, ///< Production network. DMZ, ///< DMZ SECURE, ///< Secure network NONE ///< No flags - useful because the input uses "-" sometimes to mark no properties. }; namespace std { /// Make the enum size look like a tuple size. template <> struct tuple_size<Flag> : public std::integral_constant<size_t, static_cast<size_t>(Flag::NONE)> {}; } // namespace std /// Bit set for network property flags. using FlagSet = std::bitset<std::tuple_size<Flag>::value>; /// Pod type. enum class PodType { INVALID, ///< Initialization value. YAHOO, ///< Yahoo! pod. PARTNER ///< Partner pod. }; /// Mapping of names and property flags. swoc::Lexicon<Flag> FlagNames{ decltype(FlagNames)::with_multi{{Flag::NONE, {"-", "NONE"}}, {Flag::INTERNAL, {"internal"}}, {Flag::PROD, {"prod"}}, {Flag::DMZ, {"dmz"}}, {Flag::SECURE, {"secure"}}}, Flag::INVALID }; /// Mapping of names and pod types. swoc::Lexicon<PodType> PodTypeNames{ decltype(PodTypeNames)::with{{PodType::YAHOO, "yahoo"}, {PodType::PARTNER, "partner"}}, PodType::INVALID }; // Create BW formatters for the types so they can be used for output. namespace swoc { BufferWriter & bwformat(BufferWriter &w, bwf::Spec const &spec, PodType pt) { return bwformat(w, spec, (PodTypeNames[pt])); } BufferWriter & bwformat(BufferWriter &w, bwf::Spec const &spec, FlagSet const &flags) { bool first_p = true; // Track to get separators correct. // Loop through the indices and write the flag name if it's set. for (unsigned idx = 0; idx < std::tuple_size<Flag>::value; ++idx) { if (flags[idx]) { if (!first_p) { w.write(';'); } bwformat(w, spec, FlagNames[static_cast<Flag>(idx)]); first_p = false; } } return w; } } // namespace swoc // These are used to keep pointers for the same string identical so the payloads // can be directly compared. std::unordered_set<TextView, std::hash<std::string_view>> PodNames; std::unordered_set<TextView, std::hash<std::string_view>> OwnerNames; std::unordered_set<TextView, std::hash<std::string_view>> Descriptions; /// The "color" for the IPSpace. struct Payload { PodType _type = PodType::INVALID; ///< Type of ownership. TextView _owner; ///< Corporate owner. TextView _pod; ///< Pod / colocation. TextView _descr; ///< Description / comment FlagSet _flags; ///< Flags. /// @return @c true if @a this is equal to @a that. bool operator==(Payload const &that) { return _type == that._type && _owner == that._owner && _pod == that._pod && _flags == that._flags && _descr == that._descr; } /// @return @c true if @a this is not equal to @a that. bool operator!=(Payload const &that) { return !(*this == that); } }; /// IPSpace for mapping address to @c Payload using Space = swoc::IPSpace<Payload>; /// Place to store strings parsed from the input files. swoc::MemArena Storage; /// Convert a parsed string into a stored string to make the pointer persistent. TextView store(TextView const &text) { auto span = Storage.alloc(text.size()).rebind<char>(); memcpy(span, text); return span; } /// Process the @a content of a file in to @a space. void process(Space &space, TextView content) { int line_no = 0; /// Track for error reporting. // For each line in @a content for (TextView line; !(line = content.take_prefix_at('\n')).empty();) { ++line_no; line.trim_if(&isspace); // Allow empty lines and '#' comments without error. if (line.empty() || '#' == *line) { continue; } // Get the range, make sure it's a valid range. auto range_token = line.take_prefix_if(&isspace); IPRange range{range_token}; if (range.empty()) { std::cerr << W().print("Invalid range '{}' on line {}\n", range_token, line_no); continue; } // Get the owner / type. auto owner_token = line.ltrim_if(&isspace).take_prefix_if(&isspace); PodType pod_type = PodType::YAHOO; // default to this if no owner specifier found. if (auto type_token = owner_token.split_prefix_at(':'); !type_token.empty()) { pod_type = PodTypeNames[type_token]; if (PodType::INVALID == pod_type) { std::cerr << W().print("Invalid type '{}' on line {}\n", type_token, line_no); continue; } } // normalize owner if (auto spot = OwnerNames.find(owner_token); spot == OwnerNames.end()) { owner_token = store(owner_token); OwnerNames.insert(owner_token); } else { owner_token = *spot; } // Get the pod code. auto pod_token = line.ltrim_if(&isspace).take_prefix_if(&isspace); // normalize if (auto spot = PodNames.find(pod_token); spot == PodNames.end()) { pod_token = store(pod_token); PodNames.insert(pod_token); } else { pod_token = *spot; } line.ltrim_if(&isspace).take_prefix_if(&isspace); // skip bogus column. // Work on the flags. auto flag_token = line.ltrim_if(&isspace).take_prefix_if(&isspace); FlagSet flags; // Loop over the token, picking out comma separated keys. for (TextView key; !(key = flag_token.take_prefix_at(',')).empty();) { auto idx = FlagNames[key]; // look up the key. if (Flag::INVALID == idx) { std::cerr << W().print("Invalid flag '{}' on line {}\n", key, line_no); continue; } if (idx != Flag::NONE) { // "NONE" means the input was marked explicitly as no flags. flags[int(idx)] = true; } } #if 0 // The description is what's left, trim the spaces and then the quotes. auto descr_token = line.trim_if(&isspace).trim('"'); // normalize if ( auto spot = Descriptions.find(descr_token) ; spot == Descriptions.end()) { descr_token = store(descr_token); Descriptions.insert(descr_token); } else { descr_token = *spot; } #endif // Everything went OK, create the payload and put it in the space. Payload payload{pod_type, owner_token, pod_token, {}, flags}; space.blend(range, payload, [&](Payload &lhs, Payload const &rhs) { // It's an error if there's overlap that's not consistent. if (lhs._type != PodType::INVALID && lhs != rhs) { std::cerr << W().print("Range collision while blending {} on line {}\n", range, line_no); } lhs = rhs; return true; }); } } void post_processing_performance_test(Space &old_space) { Space space; swoc::file::path vz_db_path{"vz_netdb.csv"}; std::error_code ec; auto t0 = std::chrono::system_clock::now(); std::string content = swoc::file::load(vz_db_path, ec); TextView text{content}; unsigned line_no = 0; for (TextView line; !(line = text.take_prefix_at('\n')).empty();) { ++line_no; // Get the range, make sure it's a valid range. auto range_token = line.take_prefix_at(','); IPRange range{range_token}; // Get the owner / type. auto pod_type = PodTypeNames[line.take_prefix_at(',')]; auto owner = line.take_prefix_at(','); auto pod_token = line.take_prefix_at(','); auto flag_token = line.take_prefix_at(','); FlagSet flags; // Loop over the token, picking out keys for (TextView key; !(key = flag_token.take_prefix_at(';')).empty();) { auto idx = FlagNames[key]; // look up the key. if (Flag::INVALID == idx) { std::cerr << W().print("Invalid flag '{}'\n", key); continue; } flags[int(idx)] = true; } // Everything went OK, create the payload and put it in the space. Payload payload{pod_type, owner, pod_token, {}, flags}; space.mark(range, payload); } auto vz_delta = std::chrono::system_clock::now() - t0; std::cout << W().print("Reload time - {} ms\n", std::chrono::duration_cast<std::chrono::milliseconds>(vz_delta).count()); if (line_no != space.count()) { std::cerr << W().print("Space count {} does not match line count {}\n", space.count(), line_no); } std::vector<IP4Addr> a4; std::vector<IP6Addr> a6; for (auto &&[r, p] : space) { if (r.is_ip4()) { IP4Addr a = r.min().ip4(); a4.push_back(a); a4.push_back(--IP4Addr(a)); a4.push_back(++IP4Addr(a)); a = r.max().ip4(); a4.push_back(a); a4.push_back(--IP4Addr(a)); a4.push_back(++IP4Addr(a)); } else if (r.is_ip6()) { IP6Addr a = r.min().ip6(); a6.push_back(a); a6.push_back(--IP6Addr(a)); a6.push_back(++IP6Addr(a)); a = r.max().ip6(); a6.push_back(a); a6.push_back(--IP6Addr(a)); a6.push_back(++IP6Addr(a)); } } t0 = std::chrono::system_clock::now(); for (auto const &addr : a4) { [[maybe_unused]] auto spot = space.find(addr); } vz_delta = std::chrono::system_clock::now() - t0; std::cout << W().print("IPv4 time - {} addresses, {} ns total, {} ns per lookup\n", a4.size(), vz_delta.count(), vz_delta.count() / a4.size()); t0 = std::chrono::system_clock::now(); for (auto const &addr : a6) { [[maybe_unused]] auto spot = space.find(addr); } vz_delta = std::chrono::system_clock::now() - t0; std::cout << W().print("IPv6 time - {} addresses, {} ns total, {} ns per lookup\n", a6.size(), vz_delta.count(), vz_delta.count() / a6.size()); t0 = std::chrono::system_clock::now(); for (auto const &addr : a4) { [[maybe_unused]] auto spot = old_space.find(addr); } vz_delta = std::chrono::system_clock::now() - t0; std::cout << W().print("IPv4 time (pre-cleaning) - {} addresses, {} ns total, {} ns per lookup\n", a4.size(), vz_delta.count(), vz_delta.count() / a4.size()); t0 = std::chrono::system_clock::now(); for (auto const &addr : a6) { [[maybe_unused]] auto spot = old_space.find(addr); } vz_delta = std::chrono::system_clock::now() - t0; std::cout << W().print("IPv6 time (pre-cleaning) - {} addresses, {} ns total, {} ns per lookup\n", a6.size(), vz_delta.count(), vz_delta.count() / a6.size()); } int main(int argc, char *argv[]) { Space space; // Set the defaults so bogus input doesn't throw. PodTypeNames.set_default(PodType::INVALID); FlagNames.set_default(Flag::INVALID); // Open the output file. std::ofstream output; output.open("vz_netdb.csv", std::ofstream::out | std::ofstream::trunc); if (!output.is_open()) { std::cerr << W().print("Unable to open output file: {}", swoc::bwf::Errno{errno}); } auto t0 = std::chrono::system_clock::now(); // Process the files in the command line. for (int idx = 1; idx < argc; ++idx) { swoc::file::path path(argv[idx]); std::error_code ec; std::string content = swoc::file::load(path, ec); if (!ec) { std::cout << W().print("Processing {}, {} bytes\n", path, content.size()); process(space, content); } } auto read_delta = std::chrono::system_clock::now() - t0; // Dump the resulting space. std::cout << W().print("{} ranges\n", space.count()); for (auto &&[r, p] : space) { // Note - if description is to be used, it needs to be added here. output << W().print("{},{},{},{},{}\n", r, p._type, p._owner, p._pod, p._flags); } auto write_delta = std::chrono::system_clock::now() - t0; std::cout << W().print("Read & process time - {} ms, write time {} ms\n", std::chrono::duration_cast<std::chrono::milliseconds>(read_delta).count(), std::chrono::duration_cast<std::chrono::milliseconds>(write_delta - read_delta).count()); post_processing_performance_test(space); return 0; }