horovod/common/timeline.h (78 lines of code) (raw):

// Copyright 2019 Uber Technologies, Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ============================================================================= #ifndef HOROVOD_TIMELINE_H #define HOROVOD_TIMELINE_H #include <atomic> #include <chrono> #include <fstream> #include <iostream> #include <mutex> #include <unordered_map> #include <vector> #include <boost/lockfree/spsc_queue.hpp> #include "common.h" #include "message.h" namespace horovod { namespace common { enum TimelineRecordType { EVENT, MARKER }; struct TimelineRecord { TimelineRecordType type; std::string tensor_name; char phase; std::string op_name; std::string args; std::string marker_name; long ts_micros; }; class TimelineWriter { public: void Initialize(std::string file_name); inline bool IsHealthy() const { return healthy_; } void EnqueueWriteEvent(const std::string& tensor_name, char phase, const std::string& op_name, const std::string& args, long ts_micros); void EnqueueWriteMarker(const std::string& name, long ts_micros); private: void DoWriteEvent(const TimelineRecord& r); void DoWriteMarker(const TimelineRecord& r); void WriterLoop(); // Are we healthy? std::atomic_bool healthy_{false}; // Timeline file. std::ofstream file_; // Timeline record queue. boost::lockfree::spsc_queue<TimelineRecord, boost::lockfree::capacity<1048576>> record_queue_; // Mapping of tensor names to indexes. It is used to reduce size of the // timeline file. std::unordered_map<std::string, int> tensor_table_; }; enum TimelineState { UNKNOWN, NEGOTIATING, TOP_LEVEL, ACTIVITY }; // Writes timeline in Chrome Tracing format. Timeline spec is from: // https://github.com/catapult-project/catapult/tree/master/tracing class Timeline { public: void Initialize(std::string file_name, unsigned int horovod_size); inline bool Initialized() const { return initialized_; } void NegotiateStart(const std::string& tensor_name, Request::RequestType request_type); void NegotiateRankReady(const std::string& tensor_name, int rank); void NegotiateEnd(const std::string& tensor_name); void Start(const std::string& tensor_name, const Response::ResponseType response_type); void ActivityStartAll(const std::vector<TensorTableEntry>& entries, const std::string& activity); void ActivityStart(const std::string& tensor_name, const std::string& activity); void ActivityEndAll(const std::vector<TensorTableEntry>& entries); void ActivityEnd(const std::string& tensor_name); void End(const std::string& tensor_name, std::shared_ptr<Tensor> tensor); void MarkCycleStart(); private: long TimeSinceStartMicros() const; void WriteEvent(const std::string& tensor_name, char phase, const std::string& op_name = "", const std::string& args = ""); void WriteMarker(const std::string& name); // Boolean flag indicating whether Timeline was initialized (and thus should // be recorded). bool initialized_ = false; // Timeline writer. TimelineWriter writer_; // Time point when Horovod was started. std::chrono::steady_clock::time_point start_time_; // A mutex that guards timeline state from concurrent access. std::recursive_mutex mutex_; // Current state of each tensor in the timeline. std::unordered_map<std::string, TimelineState> tensor_states_; // Map of ranks to their string representations. // std::to_string() is very slow. std::vector<std::string> rank_strings_; }; } // namespace common } // namespace horovod #endif // HOROVOD_TIMELINE_H