tools/cpp/getPerformance.cpp

// // getPerformance.cpp // MNN // // Created by MNN on 2019/03/12. // Copyright © 2018, Alibaba Group Holding Limited // #include <string.h> #include <chrono> #include <cstdint> #include <vector> #include <thread> #include <MNN/AutoTime.hpp> #include <stdlib.h> #include <MNN/MNNDefine.h> #include "core/Macro.h" #ifdef MNN_USE_NEON #include <arm_neon.h> #endif class Timer { private: std::chrono::high_resolution_clock::time_point inTime, outTime; public: void startTimer() { inTime = std::chrono::high_resolution_clock::now(); } // unit ms float getCostTimer() { outTime = std::chrono::high_resolution_clock::now(); return (float)(std::chrono::duration_cast<std::chrono::microseconds>(outTime - inTime).count()); } }; int getCpuCounts() { FILE* fp = fopen("/proc/cpuinfo", "rb"); if (fp == nullptr) { MNN_PRINT("fopen error ! \n"); return 0; } int cpuCounts = 0; char data[1024]; while (!feof(fp)) { char* a = fgets(data, 1024, fp); if (a == nullptr) { break; } if (memcmp(data, "processor", 9) == 0) { cpuCounts++; } } fclose(fp); fp = nullptr; return cpuCounts; } // 0 max 1 min 2 cur void getFreqKhz(int cpuid, std::vector<int>& freqVector) { char path[256]; int freqKhz = -1; // max sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid); FILE* fp = fopen(path, "rb"); if (nullptr == fp) { MNN_PRINT("cpuinfo_max_freq fopen error ! \n"); freqVector.emplace_back(0); } else { fscanf(fp, "%d", &freqKhz); fclose(fp); freqVector.push_back(freqKhz); } // min sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq", cpuid); fp = fopen(path, "rb"); if (nullptr == fp) { MNN_PRINT("cpuinfo_min_freq fopen error ! \n"); freqVector.emplace_back(0); } else { freqKhz = -1; fscanf(fp, "%d", &freqKhz); fclose(fp); freqVector.push_back(freqKhz); } // cur // sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_cur_freq", cpuid); // fp = fopen(path, "rb"); // if(nullptr == fp){ // MNN_PRINT("cpuinfo_cur_freq fopen error ! \n"); // }else{ // freqKhz = -1; // fscanf(fp, "%d", &freqKhz); // fclose(fp); // freqVector.push_back(freqKhz); // } } void cpuFloatMlaTest(int32_t loopCounts) { #ifdef MNN_USE_NEON #ifndef __aarch64__ __asm__ __volatile__( "mov r12, %0\n" "0: \n" "vmla.f32 q15, q15, d0[0] \n" "vmla.f32 q14, q14, d0[1] \n" "vmla.f32 q13, q13, d1[0] \n" "vmla.f32 q12, q12, d1[1] \n" "vmla.f32 q11, q11, d2[0] \n" "vmla.f32 q10, q10, d2[1] \n" "vmla.f32 q9, q9, d3[0] \n" "vmla.f32 q8, q8, d3[1] \n" "vmla.f32 q7, q7, d4[0] \n" "vmla.f32 q6, q6, d4[1] \n" "vmla.f32 q5, q5, d5[0] \n" "vmla.f32 q4, q4, d5[1] \n" "vmla.f32 q3, q3, d6[0] \n" "subs r12, r12, #1 \n" "bne 0b \n" : : "r"(loopCounts) : "cc", "memory", "r12", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q14", "q15" ); #else __asm__ __volatile__( "mov w9, %w0\n" "0: \n" "fmla v31.4s, v31.4s, v0.s[0]\n" "fmla v30.4s, v30.4s, v0.s[1]\n" "fmla v29.4s, v29.4s, v0.s[2]\n" "fmla v28.4s, v28.4s, v0.s[3]\n" "fmla v27.4s, v27.4s, v1.s[0]\n" "fmla v26.4s, v26.4s, v1.s[1]\n" "fmla v25.4s, v25.4s, v1.s[2]\n" "fmla v24.4s, v24.4s, v1.s[3]\n" "fmla v23.4s, v23.4s, v3.s[0]\n" "fmla v22.4s, v22.4s, v3.s[1]\n" "fmla v21.4s, v21.4s, v3.s[2]\n" "fmla v20.4s, v20.4s, v3.s[3]\n" "fmla v19.4s, v19.4s, v4.s[0]\n" "fmla v18.4s, v18.4s, v4.s[1]\n" "fmla v17.4s, v17.4s, v4.s[2]\n" "fmla v16.4s, v16.4s, v4.s[3]\n" "fmla v15.4s, v15.4s, v5.s[0]\n" "fmla v14.4s, v14.4s, v5.s[1]\n" "fmla v13.4s, v13.4s, v5.s[2]\n" "fmla v12.4s, v12.4s, v5.s[3]\n" "fmla v11.4s, v11.4s, v6.s[0]\n" "fmla v10.4s, v10.4s, v6.s[1]\n" "fmla v9.4s, v9.4s, v6.s[2]\n" "fmla v8.4s, v8.4s, v6.s[3]\n" "fmla v7.4s, v7.4s, v2.s[0]\n" "subs w9, w9, #1 \n" "bne 0b \n" : : "r"(loopCounts) : "cc", "memory", "w9", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" ); #endif #endif } void cpuFLOPSPerformance() { int32_t loopCounts = 100000000; MNN_PRINT("CPU PERFORMANCE -> loopCounts : %d \n", loopCounts); std::vector<int> freqVector; for (int i = 0; i < getCpuCounts(); i++) { freqVector.clear(); getFreqKhz(i, freqVector); MNN_PRINT("core %d : max : %d, min : %d \n",i, freqVector.at(0), freqVector.at(1)); } // warm up cpuFloatMlaTest(loopCounts); Timer timeInstance; timeInstance.startTimer(); cpuFloatMlaTest(loopCounts); #ifdef MNN_USE_NEON #ifndef __aarch64__ auto number = (double)loopCounts * 13; #else auto number = (double)loopCounts * 25; #endif #else auto number = 0.0; #endif //FUNC_PRINT(number); float costTime_ms = timeInstance.getCostTimer(); double costTime_s = (double)(costTime_ms) / 1000000.0f; // MNN_PRINT("cost time : %f \n", costTime_s); double mlaCounts_g = number * 4 / 1000000000.0f; float gflops = mlaCounts_g / costTime_s; MNN_PRINT(" ======================== float ===============================\n"); MNN_PRINT("CPU float gflops : %f\n", gflops); } static void _testMemcpy() { int size = 1024 * 1024; int loop = 10000; std::vector<std::thread> threads; int threadNumber = 2; std::vector<std::vector<int8_t>> tmp(threadNumber); for (int i=0; i<threadNumber; ++i) { tmp[i].resize(size); } MNN::Timer _t; for (int i=0; i<threadNumber; ++i) { threads.emplace_back(std::thread([size, loop, i, &tmp]() { auto t0 = tmp[i].data(); for (int i=0; i<loop; ++i) { ::memset(t0, 0, size); } })); } for (auto& t : threads) { t.join(); } float timeInS = (float)_t.durationInUs() / 1000.0f / 1000.0f; float speed = (float)size * (float)threads.size() / 1024.0f / 1024.0f / 1024.0f * (float)loop / timeInS; MNN_PRINT("Memcpy speed: %f GB / s\n", speed); } int main(int argc, const char* argv[]) { MNN_PRINT("Start PERFORMANCE !!! \n"); cpuFLOPSPerformance(); _testMemcpy(); return 0; }

tools/cpp/getPerformance.cpp (193 lines of code) (raw):