|
GTPin
|
The Funlatency tool measures the number of cycles required to execute functions called by the kernel.
To run Funlatency with the default settings, run:
Profilers/Bin/gtpin -t funlatency -- app
When Funlatency runs with the default configuration, it generates a directory named GTPIN_PROFILE_FUNLATENCY Profiling results are saved by GTPin in the file GTPIN_PROFILE_FUNLATENCY\Session_Final\funlatency.txt. The results are presented in the following format:
### Kernel/Shader execution-time profile generated by GTPin ###
Legend:
NA - kernel was not instrumented.
Kernel Name HashID Function Name Inst Freq. Total-latency Total-latency(%) Avg-Cycles Type SIMD Platform Execution descriptor
pi 3650b0add9d1a432 pi(main) [0-116] 6250 3408987510 26.35 545438 CS 8 OpenCL 2 0 0 0
pi 3650b0add9d1a432 func [123-294] 62500 3145307667 24.31 50324 CS 8 OpenCL 2 0 0 0
pi 3650b0add9d1a432 funcX2 [295-466] 125000 6381884664 49.33 51055 CS 8 OpenCL 2 0 0 0
Each line represents a single run (dispatch to HW device) of specific kernel, where the fields have the following meaning:
If the function name is not recognized by GTPin, it generates an artificial name in the format: EmbeddedFunc_***
(Back to the list of all GTPin Sample Tools)
00001 /*========================== begin_copyright_notice ============================ 00002 Copyright (C) 2024-2025 Intel Corporation 00003 00004 SPDX-License-Identifier: MIT 00005 ============================= end_copyright_notice ===========================*/ 00006 00007 /*! 00008 * @file FunLatency Tool definitions 00009 */ 00010 00011 #ifndef FUNLATENCY_H_ 00012 #define FUNLATENCY_H_ 00013 00014 #include <map> 00015 #include <set> 00016 #include <string> 00017 00018 #include "gtpin_api.h" 00019 #include "gtpin_tool_utils.h" 00020 00021 using namespace gtpin; 00022 00023 /// Identifier of the instrumentation site : function ID 00024 using InstrumentSiteId = FunctionId; 00025 00026 /// Array of instrumentation site IDs, indexed by ordinal numbers of corresponding records in the profile buffer 00027 using InstrumentSites = std::vector<InstrumentSiteId>; 00028 00029 /* ============================================================================================= */ 00030 // Struct FunLatencyRecord 00031 /* ============================================================================================= */ 00032 /*! 00033 * Layout of records collected in profile buffer by the Latency tool 00034 */ 00035 struct alignas(uint64_t) FunLatencyRecord 00036 { 00037 uint64_t cycles; ///< Total number of cycles 00038 uint32_t freq; ///< Total number of executions 00039 }; 00040 00041 /* ============================================================================================= */ 00042 // Class FunLatencyDispatchProfile 00043 /* ============================================================================================= */ 00044 /*! 00045 * Profiling data collected during a single kernel dispatch 00046 */ 00047 struct FunLatencyDispatchProfile 00048 { 00049 FunLatencyDispatchProfile(const IGtKernelDispatch& kernelDispatch, InstrumentSiteId siteId); 00050 void Accumulate(const FunLatencyRecord& record); 00051 00052 InstrumentSiteId siteId; ///< Identifier of the instrumentation site: FuntionID 00053 GtKernelExecDesc kernelExecDesc; ///< Kernel execution descriptor 00054 uint64_t cycles; ///< Total number of cycles 00055 uint64_t freq; ///< Total number of executions 00056 }; 00057 00058 /* ============================================================================================= */ 00059 // struct FunctionDescriptor 00060 /* ============================================================================================= */ 00061 /*! 00062 * Static information about the function 00063 */ 00064 struct FunctionDescriptor 00065 { 00066 FunctionDescriptor(const FunctionId fId, const std::string fName, 00067 const bool fIsMain, const InsId firstIns, const InsId lastIns, const uint32_t recNum); 00068 00069 FunctionId id; ///< Identifier of the function 00070 std::string name; ///< function name 00071 bool isMain; ///< true for main kernel function 00072 InsId firstInsId; ///< first Instruction in the function 00073 InsId lastInsId; ///< last Instruction in the function 00074 uint32_t recordNum; ///< record number in profile array 00075 GtReg freqReg; ///< Allocated VReg for freq count 00076 GtReg cycleReg; ///< Allocated VReg for cycles count 00077 }; 00078 00079 /* ============================================================================================= */ 00080 // Class FunLatencyKernelProfile 00081 /* ============================================================================================= */ 00082 /*! 00083 * Aggregated profile of all instrumented kernel dispatches 00084 */ 00085 class FunLatencyKernelProfile 00086 { 00087 public: 00088 typedef std::map<FunctionId, FunctionDescriptor> FunctionMap; 00089 FunLatencyKernelProfile(const IGtKernel& kernel, const IGtCfg& cfg, 00090 const GtProfileArray& profileArray, InstrumentSites&& instrumentSites, FunctionMap&& functions); 00091 00092 /// Add the specified profile of a kernel dispatch 00093 FunLatencyDispatchProfile& AddDispatchProfile(const FunLatencyDispatchProfile& dispatchProfile); 00094 00095 std::string ToString() const; ///< @return Text representation of the profile data 00096 void DumpAsm() const; ///< Dump kernel's assembly text to file 00097 std::string GetName() const { return _name; } ///< @return Kernel's name 00098 const GtProfileArray& GetProfileArray() const { return _profileArray; } ///< @return Profile buffer accessor 00099 00100 /// Given a record number in the profile buffer, return corresponding ID of the instrumentation site 00101 InstrumentSiteId GetSiteId(uint32_t recordNum) const; 00102 00103 /// @return Collection of profiles per kernel dispatch 00104 typedef std::list<FunLatencyDispatchProfile> DispatchProfiles; 00105 const DispatchProfiles& GetDispatchProfiles() const { return _dispatchProfiles; } 00106 00107 private: 00108 std::string _name; ///< Kernel's name 00109 std::string _uniqueName; ///< Kernel's unique name (IGC style) 00110 GtKernelType _type; ///< Kernel's type 00111 GtGpuPlatform _platform; ///< Kernel's platform 00112 uint64_t _hashId; ///< Kernel's hash identifier 00113 GtSimdWidth _simd; ///< Kernel's SIMD width 00114 std::string _asmText; ///< Kernel's assembly text 00115 GtProfileArray _profileArray; ///< Profile buffer accessor 00116 uint64_t _totalKernelCycles; ///< Total cycles of the kernel 00117 InstrumentSites _instrumentSites; ///< Array of instrumentation site IDs 00118 FunctionMap _functionMap; ///< Map each function ID to it's Function descriptor 00119 DispatchProfiles _dispatchProfiles; ///< Profiles per kernel dispatch 00120 }; 00121 00122 /* ============================================================================================= */ 00123 // Class FunLatency 00124 /* ============================================================================================= */ 00125 /*! 00126 * Implementation of the IGtTool interface for the FunLatency tool 00127 */ 00128 class FunLatency : public GtTool 00129 { 00130 public: 00131 /// Implementation of the IGtTool interface 00132 const char* Name() const { return "FunLatency"; } 00133 00134 void OnKernelBuild(IGtKernelInstrument& instrumentor); 00135 void OnKernelRun(IGtKernelDispatch& dispatcher); 00136 void OnKernelComplete(IGtKernelDispatch& dispatcher); 00137 00138 public: 00139 std::string ToString() const; ///< @return Text representation of the profile data 00140 void DumpAsm() const; ///< Dump assembly text of profiled kernels to files 00141 00142 static FunLatency* Instance(); ///< @return Single instance of this class 00143 static void OnFini() { Instance()->Fini(); } ///< Callback function registered with atexit() 00144 protected: 00145 FunLatency() = default; 00146 FunLatency(const FunLatency&) = delete; 00147 FunLatency& operator = (const FunLatency&) = delete; 00148 ~FunLatency() = default; 00149 00150 /// @return Collection of kernel profiles 00151 typedef std::map<GtKernelId, FunLatencyKernelProfile> KernelProfiles; 00152 const KernelProfiles& GetKernelProfiles() const { return _kernels; } 00153 00154 /// Post process and dump profiling data 00155 void Fini(); 00156 private: 00157 /// Generate instrumentation for functions 00158 void InstrumentFunctions(IGtKernelInstrument& instrumentor); 00159 00160 /// Generate instrumentation to be inserted before the code fragment whose latency is measured 00161 void GeneratePreCode(GtGenProcedure& proc, const IGtGenCoder& coder); 00162 00163 /*! 00164 * Generate instrumentation that computes latency and stores the result in the profile buffer. 00165 * - GeneratePostCodeMemBound - Generate memory-bound instrumentation 00166 * - GeneratePostCodeRegBound - Generate register-bound instrumentation. Return success/failure status 00167 * 00168 * @param[in, out] postProc Procedure to be inserted after the code fragment whose latency is measured 00169 * @param[in, out] finiProc Procedure to be inserted before each EOT 00170 * @param[in] coder GEN code generator 00171 * @param[in] profileArray Array of 'FunLatencyRecord's in the profile buffer 00172 * @param[in] funcDesc pointer to the decriptor of the function to be instrumented 00173 * @param[in] isFuncEntry true if the BBL is the first BBL in the Function 00174 */ 00175 bool GeneratePostCodeRegBound(GtGenProcedure& postProc, GtGenProcedure& finiProc, const IGtGenCoder& coder, 00176 const GtProfileArray& profileArray, FunctionDescriptor* funcDesc, bool isFuncEntry, bool updateFrequency, uint32_t simdMask); 00177 void GeneratePostCodeMemBound(GtGenProcedure& proc, const IGtGenCoder& coder, 00178 const GtProfileArray& profileArray, FunctionDescriptor* funcDesc, bool updateFrequency, uint32_t simdMask); 00179 00180 /// @return true/false - use 64-bit/32-bit integer for the cycle counter 00181 static bool Use64BitCounters(const IGtGenCoder& coder); 00182 00183 private: 00184 KernelProfiles _kernels; ///< Collection of kernel profiles 00185 00186 GtReg _addrReg; ///< Virtual register that holds address within profile buffer 00187 GtReg _dataReg; ///< Virtual register that holds data to be read from/written to profile buffer 00188 GtReg _timeReg; ///< Virtual timer register 00189 }; 00190 00191 #endif
00001 /*========================== begin_copyright_notice ============================ 00002 Copyright (C) 2024-2026 Intel Corporation 00003 00004 SPDX-License-Identifier: MIT 00005 ============================= end_copyright_notice ===========================*/ 00006 00007 /*! 00008 * @file Implementation of the FunLatency Tool 00009 */ 00010 #include <fstream> 00011 #include <sstream> 00012 #include <iomanip> 00013 #include <algorithm> 00014 00015 #include "funlatency.h" 00016 00017 using namespace gtpin; 00018 using namespace std; 00019 00020 /* ============================================================================================= */ 00021 // Configuration 00022 /* ============================================================================================= */ 00023 Knob<int> knobNumThreadBuckets("num_thread_buckets", 0, "Number of thread buckets. 0 (default) - maximum thread buckets"); 00024 Knob<bool> knobUseRegInstrument("no_reg_instrument", true, "Disable register-bound instrumentation"); 00025 Knob<bool> knobSkipZeroResults("skip_zero_results", false, "Skip zero results in the Funlatency output"); 00026 Knob<bool> knobIncludeZeroExecMask("include_zero_exec_mask", false, "Include measurements of functions called with zero exec mask"); 00027 00028 /* ============================================================================================= */ 00029 // FunLatency implementation 00030 /* ============================================================================================= */ 00031 FunLatency* FunLatency::Instance() 00032 { 00033 static FunLatency instance; 00034 return &instance; 00035 } 00036 00037 void FunLatency::OnKernelBuild(IGtKernelInstrument& instrumentor) 00038 { 00039 /* 00040 * Register allocation policy 00041 * ====================================================================================================== 00042 * To ensure accuracy of latency measurments, the instrumentation overhead should be as low as possible. 00043 * It is believed that register-bound procedures incur lower overhead than memory-bound procedures. 00044 * Assuming that, we will try to use register-bound instrumentation for as many sites as possible, and 00045 * apply memory-bound instrumentation to the rest of the sites. 00046 * 00047 * To check and guarantee availability of free physical registers we pre-allocate registers 00048 * in the register-bound procedures created by the GeneratePostCodeRegBound() function. 00049 * For this purpose we use the IGtRegAllocator::ReserveVregOperands() interface. 00050 * 00051 * For memory-bound procedures, pre-allocation of registers is not needed, and even can be harmful, as 00052 * explained in the IGtRegAllocator description. On the other hand, excessive pre-allocation of registers 00053 * in the register-bound procedures may cause multiple spills/fills in memory-bound procedures. To avoid 00054 * this situation, we do the following: 00055 * A) RESERVE registers in memory-bound procedures BEFORE generating register-bound procedures using 00056 * IGtRegAllocator::Reserve() 00057 * B) RELEASE registers in memory-bound procedures AFTER generating register-bound procedures using 00058 * IGtRegAllocator::ReleaseReserved() 00059 * 00060 * Function Latency measurment algorithm 00061 * ====================================================================================================== 00062 * Count the cycles it takes for each function to execute from the first instruction to the end of the function 00063 * by starting the timer before the first instruction in each function, and stopping it before the last 00064 * instruction in the function. 00065 * If a function calls other functions, in order to exclude the callee's total cycles from the caller's 00066 * cycles count, the tool will stop the timer before "call" instruction,count the cycels,and restart it 00067 * after the call instruction. 00068 * for example : 00069 * // fun1: fun1: // 00070 * // [... \\ start the timer (preCode) // 00071 * // ... =====\\ [... // 00072 * // ... \\ ...] // 00073 * // ...] \\ stop the timer (postCode) // 00074 * // call fun2 // call fun2 // 00075 * // [... // start the timer // 00076 * // ... =====// [... // 00077 * // ... // ...] // 00078 * // ...] stop the timer // 00079 * // ret ret // 00080 * 00081 * fun1 total cycles measurment will be done in two parts . 00082 * (A) count the total cycles from the function entry to the call ins 00083 * (B) count the total cycles after call ins to the end of the function 00084 * fun1 total cycles = A + B 00085 * 00086 */ 00087 00088 const IGtGenCoder& coder = instrumentor.Coder(); 00089 IGtVregFactory& vregs = coder.VregFactory(); 00090 IGtRegAllocator& ra = coder.RegAllocator(); 00091 bool is64BitCounter = Use64BitCounters(coder); 00092 00093 // Initialize virtual registers 00094 _timeReg = vregs.Make(VREG_TYPE_DWORD); 00095 _addrReg = vregs.MakeMsgAddrScratch(); 00096 _dataReg = vregs.MakeMsgDataScratch(is64BitCounter ? VREG_TYPE_QWORD : VREG_TYPE_DWORD); 00097 00098 // A) Reserve registers used in the memory-bound procedures 00099 ra.Reserve(_dataReg.VregNumber()); 00100 ra.Reserve(_addrReg.VregNumber()); 00101 ra.Reserve(vregs.GetProfileBufferAddrVreg().Num()); 00102 00103 InstrumentFunctions(instrumentor); 00104 00105 // B) Release registers used in the memory-bound procedures 00106 ra.ReleaseReserved(_dataReg.VregNumber()); 00107 ra.ReleaseReserved(_addrReg.VregNumber()); 00108 ra.ReleaseReserved(vregs.GetProfileBufferAddrVreg().Num()); 00109 } 00110 00111 void FunLatency::InstrumentFunctions(IGtKernelInstrument& instrumentor) 00112 { 00113 const IGtKernel& kernel = instrumentor.Kernel(); 00114 const IGtCfg& cfg = instrumentor.Cfg(); 00115 const IGtGenCoder& coder = instrumentor.Coder(); 00116 const IGtGenModel& genModel = kernel.GenModel(); 00117 IGtProfileBufferAllocator& allocator = instrumentor.ProfileBufferAllocator(); 00118 00119 uint32_t simdMask; 00120 switch (kernel.SimdWidth()) 00121 { 00122 case 8: simdMask = 0xFF; break; 00123 case 16: simdMask = 0xFFFF; break; 00124 case 32: simdMask = 0xFFFFFFFF; break; 00125 default: simdMask = 0; break; 00126 } 00127 00128 uint32_t numRecords = cfg.NumFunctions(); // numRecords = number of functions 00129 00130 // Insert code that stops timer at BBL's exit and stores result in the profile buffer 00131 GtGenProcedure postCode; 00132 GtProfileArray profileArray(sizeof(FunLatencyRecord), numRecords, genModel.MaxThreadBuckets()); 00133 profileArray.Allocate(allocator); 00134 00135 GtGenProcedure finiCode; // Procedure that stores results of register-bound measurments in the profile buffer 00136 bool tryRegInstrument = knobUseRegInstrument; // true - try to apply register-bound instrumentation 00137 00138 std::map<FunctionId, FunctionDescriptor> functions; 00139 00140 // Iterate over the function instrument the first and the last BBl in each function. 00141 uint32_t idx = 0; 00142 for (auto func : cfg.Functions()) 00143 { 00144 // Start the timer in each function entry 00145 const IGtBbl& firstBbl = *func->Bbls().first(); 00146 const IGtIns& firstIns = firstBbl.FirstIns(); 00147 const IGtBbl& lastBbl = *func->Bbls().last(); 00148 const IGtIns& lastIns = lastBbl.LastIns(); 00149 00150 // Insert code that starts timer at BBL's entry 00151 GtGenProcedure preCode; 00152 GeneratePreCode(preCode, coder); 00153 InstrumentBbl(instrumentor, firstBbl, GtIpoint::Before(), preCode); 00154 00155 bool isMain = func->IsKernelMain(); 00156 bool isEntry = true; 00157 00158 // Insert FunctionDescriptor for the function into the kernel functions map 00159 functions.emplace(std::piecewise_construct, 00160 std::forward_as_tuple(func->Id()), 00161 std::forward_as_tuple(func->Id(), func->Name(), isMain, firstIns.Id(), lastIns.Id(), idx)); 00162 00163 bool instrumentAfterCall = false; // Insert precode Instrumentation after call instruction 00164 for (auto bbl : func->Bbls()) 00165 { 00166 if (instrumentAfterCall) 00167 { 00168 GtGenProcedure precode; 00169 GeneratePreCode(precode, coder); 00170 InstrumentBbl(instrumentor, *bbl, GtIpoint::Before(), precode); 00171 instrumentAfterCall = false; 00172 } 00173 00174 const IGtIns& lastBblIns = bbl->LastIns(); 00175 00176 if (bbl->IsEot() || (!isMain && (lastBblIns.Opcode() == GED_OPCODE_ret))) 00177 { 00178 00179 // Insert code that stops timer at the last BBL's exit and stores result in the profile buffer 00180 GtGenProcedure postCode; 00181 00182 // Insert fake consumers of source registers to expose hidden latency of EOT instructions 00183 if (bbl->IsEot()) 00184 { 00185 coder.GenerateFakeSrcConsumers(postCode, lastBblIns); 00186 } 00187 00188 if (tryRegInstrument) 00189 { 00190 tryRegInstrument = GeneratePostCodeRegBound(postCode, finiCode, coder, profileArray, &functions.at(func->Id()), isEntry, true, simdMask); 00191 } 00192 if (!tryRegInstrument) 00193 { 00194 GeneratePostCodeMemBound(postCode, coder, profileArray, &functions.at(func->Id()), true, simdMask); 00195 } 00196 InstrumentInstruction(instrumentor, lastBblIns, GtIpoint::Before(), postCode); 00197 idx++; 00198 isEntry = false; 00199 break; 00200 } 00201 else if (lastBblIns.Opcode() == GED_OPCODE_call || lastBblIns.Opcode() == GED_OPCODE_calla) 00202 { 00203 // Insert code that stops timer at the last BBL's exit and stores result in the profile buffer 00204 GtGenProcedure postCode; 00205 00206 if (tryRegInstrument) 00207 { 00208 tryRegInstrument = GeneratePostCodeRegBound(postCode, finiCode, coder, profileArray, &functions.at(func->Id()), isEntry, false, simdMask); 00209 } 00210 if (!tryRegInstrument) 00211 { 00212 GeneratePostCodeMemBound(postCode, coder, profileArray, &functions.at(func->Id()), false, simdMask); 00213 } 00214 InstrumentInstruction(instrumentor, lastBblIns, GtIpoint::Before(), postCode); 00215 00216 instrumentAfterCall = true; 00217 isEntry = false; 00218 } 00219 } 00220 } 00221 // Insert 'finiCode' at all exits of the kernel - before each EOT instruction 00222 instrumentor.InstrumentExits(finiCode); 00223 00224 // Transform functions into array of instrumentation sites 00225 InstrumentSites sites(numRecords); 00226 std::transform(cfg.Functions().begin(), cfg.Functions().end(), sites.begin(), [](const IGtFunction* f) -> FunctionId { return f->Id(); }); 00227 00228 // Create FunLatencyKernelProfile object that represents profile of this kernel 00229 _kernels.emplace(kernel.Id(), FunLatencyKernelProfile(kernel, cfg, profileArray, std::move(sites), std::move(functions))); 00230 } 00231 00232 void FunLatency::OnKernelRun(IGtKernelDispatch& dispatcher) 00233 { 00234 bool isProfileEnabled = false; 00235 const IGtKernel& kernel = dispatcher.Kernel(); 00236 GtKernelExecDesc execDesc; dispatcher.GetExecDescriptor(execDesc); 00237 if (kernel.IsInstrumented() && IsKernelExecProfileEnabled(execDesc, kernel.GpuPlatform(), kernel.Name().Get())) 00238 { 00239 auto it = _kernels.find(kernel.Id()); 00240 00241 if (it != _kernels.end()) 00242 { 00243 IGtProfileBuffer* buffer = dispatcher.CreateProfileBuffer(); GTPIN_ASSERT(buffer); 00244 FunLatencyKernelProfile& kernelProfile = it->second; 00245 const GtProfileArray& profileArray = kernelProfile.GetProfileArray(); 00246 if (profileArray.Initialize(*buffer)) 00247 { 00248 isProfileEnabled = true; 00249 } 00250 else 00251 { 00252 GTPIN_ERROR_MSG(string("FunLatency : ") + string(kernel.Name()) + " : Failed to write into memory buffer"); 00253 } 00254 } 00255 } 00256 dispatcher.SetProfilingMode(isProfileEnabled); 00257 } 00258 00259 void FunLatency::OnKernelComplete(IGtKernelDispatch& dispatcher) 00260 { 00261 const IGtKernel& kernel = dispatcher.Kernel(); 00262 GtKernelExecDesc execDesc; dispatcher.GetExecDescriptor(execDesc); 00263 bool isProfilingEnabled = dispatcher.IsProfilingEnabled(); 00264 if (!isProfilingEnabled || !IsKernelExecProfileEnabled(execDesc, kernel.GpuPlatform(), kernel.Name().Get())) 00265 { 00266 return; // Do nothing with unprofiled kernel dispatches 00267 } 00268 00269 auto it = _kernels.find(kernel.Id()); 00270 00271 if (it != _kernels.end()) 00272 { 00273 const IGtProfileBuffer* buffer = dispatcher.GetProfileBuffer(); GTPIN_ASSERT(buffer); 00274 FunLatencyKernelProfile& kernelProfile = it->second; 00275 const GtProfileArray& profileArray = kernelProfile.GetProfileArray(); 00276 00277 for (uint32_t recordNum = 0; recordNum != profileArray.NumRecords(); ++recordNum) 00278 { 00279 FunLatencyDispatchProfile dispatchProfile(dispatcher, kernelProfile.GetSiteId(recordNum)); 00280 00281 for (uint32_t threadBucket = 0; threadBucket < profileArray.NumThreadBuckets(); ++threadBucket) 00282 { 00283 FunLatencyRecord record; 00284 if (!profileArray.Read(*buffer, &record, recordNum, 1, threadBucket)) 00285 { 00286 GTPIN_ERROR_MSG(string("FunLatency : ") + string(kernel.Name()) + " : Failed to read from memory buffer"); 00287 } 00288 else 00289 { 00290 dispatchProfile.Accumulate(record); 00291 } 00292 } 00293 kernelProfile.AddDispatchProfile(dispatchProfile); 00294 } 00295 } 00296 } 00297 00298 bool FunLatency::Use64BitCounters(const IGtGenCoder& coder) 00299 { 00300 return coder.InstructionFactory().CanAccessAtomically(GED_DATA_TYPE_uq); 00301 } 00302 00303 void FunLatency::GeneratePreCode(GtGenProcedure& proc, const IGtGenCoder& coder) 00304 { 00305 coder.StartTimer(proc, _timeReg); 00306 if (!proc.empty()) { proc.front()->AppendAnnotation(__func__); } 00307 } 00308 00309 bool FunLatency::GeneratePostCodeRegBound(GtGenProcedure& postProc, GtGenProcedure& finiProc, const IGtGenCoder& coder, 00310 const GtProfileArray& profileArray, FunctionDescriptor* funcDesc, bool isFuncEntry, bool updateFrequency, uint32_t simdMask) 00311 { 00312 IGtInsFactory& insF = coder.InstructionFactory(); 00313 IGtVregFactory& vregs = coder.VregFactory(); 00314 IGtRegAllocator& ra = coder.RegAllocator(); 00315 bool is64BitCounter = Use64BitCounters(coder); 00316 00317 GtReg freqReg; 00318 GtReg cycleReg; 00319 if (isFuncEntry) 00320 { 00321 funcDesc->freqReg = vregs.MakeCounter(VREG_TYPE_DWORD); // Frequency counter 00322 funcDesc->cycleReg = vregs.MakeCounter(is64BitCounter ? VREG_TYPE_QWORD : VREG_TYPE_DWORD); // Cycle counter 00323 } 00324 00325 freqReg = funcDesc->freqReg; 00326 cycleReg = funcDesc->cycleReg; 00327 00328 GtReg flagReg = FlagReg(0); 00329 GtPredicate pred(flagReg); 00330 00331 GtReg cycleRegL = { cycleReg, sizeof(uint32_t), 0 }; // Low 32-bits of cycle counter 00332 GtReg cycleRegH = { cycleReg, sizeof(uint32_t), 1 }; // High 32-bits of cycle counter 00333 GtReg dataRegL = { _dataReg, sizeof(uint32_t), 0 }; // Low 32-bits of '_dataReg' 00334 00335 // Generate procedure that computes and aggregates cycles and frequency 00336 GtGenProcedure proc; 00337 coder.StopTimerExt(proc, _timeReg); 00338 00339 proc += insF.MakeAdd(cycleRegL, cycleRegL, _timeReg); // Add elapsed time to lower 32-bits of of 'cycleReg' 00340 if (is64BitCounter) 00341 { 00342 // If cycleRegL overflowed, increment the high 32-bits of 'cycleReg' 00343 proc += insF.MakeCmp(GED_COND_MODIFIER_l, flagReg, cycleRegL, _timeReg); 00344 proc += insF.MakeAdd(cycleRegH, cycleRegH, 1).SetPredicate(pred); 00345 } 00346 00347 GtReg tmpReg = vregs.MakeScratch(); 00348 GtReg tmpReg1 = _timeReg; 00349 if (!knobIncludeZeroExecMask) 00350 { 00351 proc += insF.MakeMov(tmpReg, ChannelEnableReg()); 00352 proc += insF.MakeMov(tmpReg1, DispatchMaskReg()); 00353 proc += insF.MakeAnd(tmpReg, tmpReg1, tmpReg); 00354 proc += insF.MakeAnd(tmpReg, tmpReg, simdMask); 00355 proc += insF.MakeCmp(GED_COND_MODIFIER_z, flagReg, tmpReg, 0); 00356 } 00357 if (updateFrequency) 00358 { 00359 proc += insF.MakeAdd(freqReg, freqReg, 1); // freq++ 00360 if (!knobIncludeZeroExecMask) 00361 { 00362 proc.back()->SetPredicate(!pred); 00363 } 00364 } 00365 00366 if (!knobIncludeZeroExecMask) 00367 { 00368 proc += insF.MakeMov(cycleRegL, 0).SetPredicate(pred); // Clear the low 32-bits of 'cycleReg' 00369 if (is64BitCounter) 00370 { 00371 proc += insF.MakeMov(cycleRegH, 0).SetPredicate(pred); // Clear the high 32-bits of 'cycleReg' 00372 } 00373 } 00374 00375 if (isFuncEntry && !ra.ReserveVregOperands(proc)) 00376 { 00377 return false; // No more free registers 00378 } 00379 proc.front()->AppendAnnotation(__func__); 00380 postProc.MoveAfter(std::move(proc)); 00381 00382 // Generate 'finiProc' procedure that stores aggregated cycles and frequency counters in the profile buffer 00383 profileArray.ComputeAddress(coder, proc, _addrReg, funcDesc->recordNum); 00384 proc += insF.MakeRegMov(_dataReg, cycleReg); 00385 proc += insF.MakeAtomicAdd(NullReg(), _addrReg, _dataReg, (is64BitCounter ? GED_DATA_TYPE_uq : GED_DATA_TYPE_ud)); 00386 00387 if (updateFrequency) 00388 { 00389 profileArray.ComputeRelAddress(coder, proc, _addrReg, _addrReg, offsetof(FunLatencyRecord, freq)); 00390 proc += insF.MakeRegMov(dataRegL, freqReg); 00391 proc += insF.MakeAtomicAdd(NullReg(), _addrReg, _dataReg, GED_DATA_TYPE_ud); 00392 00393 proc.front()->AppendAnnotation("GenerateFiniCode"); 00394 finiProc.MoveAfter(std::move(proc)); 00395 } 00396 00397 return true; 00398 } 00399 00400 void FunLatency::GeneratePostCodeMemBound(GtGenProcedure& proc, const IGtGenCoder& coder, 00401 const GtProfileArray& profileArray, FunctionDescriptor* funcDesc, bool updateFrequency, uint32_t simdMask) 00402 { 00403 IGtInsFactory& insF = coder.InstructionFactory(); 00404 IGtVregFactory& vregs = coder.VregFactory(); 00405 bool is64BitCounter = Use64BitCounters(coder); 00406 GtReg dataRegL = { _dataReg, sizeof(uint32_t), 0 }; // Low 32-bits of the data payload register 00407 00408 // Generate code that computes elapsed time 00409 coder.StopTimerExt(proc, _timeReg); 00410 00411 // cycles += _timeReg 00412 proc += insF.MakeMov(dataRegL, _timeReg); // Move timer value to the low 32-bits of the data register 00413 if (is64BitCounter) 00414 { 00415 // Clear the high 32-bits of the data payload register 00416 GtReg dataRegH = { _dataReg, sizeof(uint32_t), 1 }; 00417 proc += insF.MakeMov(dataRegH, 0); 00418 } 00419 00420 GtReg tmpReg = vregs.MakeScratch(); 00421 GtReg tmpReg1 = _timeReg; 00422 GtReg flagReg = FlagReg(0); 00423 GtPredicate pred(flagReg); 00424 00425 if (!knobIncludeZeroExecMask) 00426 { 00427 proc += insF.MakeMov(tmpReg, ChannelEnableReg()); 00428 proc += insF.MakeMov(tmpReg1, DispatchMaskReg()); 00429 proc += insF.MakeAnd(tmpReg, tmpReg1, tmpReg); 00430 proc += insF.MakeAnd(tmpReg, tmpReg, simdMask); 00431 proc += insF.MakeCmp(GED_COND_MODIFIER_z, flagReg, tmpReg, 0); 00432 } 00433 00434 // _addrReg = address of the current thread's FunLatencyRecord in the profile buffer 00435 profileArray.ComputeAddress(coder, proc, _addrReg, funcDesc->recordNum); 00436 00437 proc += insF.MakeAtomicAdd(NullReg(), _addrReg, _dataReg, (is64BitCounter ? GED_DATA_TYPE_uq : GED_DATA_TYPE_ud)); 00438 if (!knobIncludeZeroExecMask) 00439 { 00440 proc.back()->SetPredicate(!pred); 00441 } 00442 00443 // freq++ 00444 if (updateFrequency) 00445 { 00446 profileArray.ComputeRelAddress(coder, proc, _addrReg, _addrReg, offsetof(FunLatencyRecord, freq)); 00447 proc += insF.MakeAtomicInc(NullReg(), _addrReg, GED_DATA_TYPE_ud); 00448 if (!knobIncludeZeroExecMask) 00449 { 00450 proc.back()->SetPredicate(!pred); 00451 } 00452 } 00453 00454 if (!proc.empty()) { proc.front()->AppendAnnotation(__func__); } 00455 } 00456 00457 std::string FunLatency::ToString() const 00458 { 00459 ostringstream ostr; 00460 ostr << "### Kernel/Shader execution-time profile generated by GTPin ###" << endl << endl; 00461 ostr << "Legend:" << endl; 00462 ostr << "NA - kernel was not instrumented." << endl << endl; 00463 if (!knobIncludeZeroExecMask) 00464 { 00465 ostr << "NOTE: Tool reports cycles and frequencies only for functions that were called with at least one enabled channel in execution mask." << endl << endl; 00466 } 00467 ostr << setw(20) << "Kernel Name" << setw(20) << "HashID" << setw(30) << "Function Name" << setw(15) << "Inst"; 00468 ostr << setw(15) << "Freq." << setw(20) << "Total-latency" << setw(20) << "Total-latency(%)" << setw(20) << "Avg-Cycles"<< setw(10) << "Type"; 00469 ostr << setw(10) << "SIMD" << setw(15) << "Platform" << setw(35) << "Execution descriptor"; 00470 ostr << endl; 00471 for (const auto& kernelEntry : _kernels) 00472 { 00473 ostr << kernelEntry.second.ToString(); 00474 } 00475 return ostr.str(); 00476 } 00477 00478 void FunLatency::DumpAsm() const 00479 { 00480 for (const auto& kernelEntry : _kernels) 00481 { 00482 kernelEntry.second.DumpAsm(); 00483 } 00484 } 00485 00486 void FunLatency::Fini() 00487 { 00488 string profileDir = GTPin_GetCore()->ProfileDir(); 00489 string filePath = JoinPath(profileDir, "FunLatency.txt"); 00490 00491 ofstream fs(filePath); 00492 if (fs.is_open()) 00493 { 00494 fs << ToString(); 00495 fs.close(); 00496 } 00497 else 00498 { 00499 GTPIN_WARNING("FunLatency : could not create file: " + filePath); 00500 } 00501 DumpAsm(); 00502 } 00503 00504 /* ============================================================================================= */ 00505 // FunLatencyDispatchProfile implementation 00506 /* ============================================================================================= */ 00507 FunLatencyDispatchProfile::FunLatencyDispatchProfile(const IGtKernelDispatch& kernelDispatch, InstrumentSiteId id) : 00508 siteId(id), cycles(0), freq(0) 00509 { 00510 kernelDispatch.GetExecDescriptor(kernelExecDesc); 00511 } 00512 00513 void FunLatencyDispatchProfile::Accumulate(const FunLatencyRecord& record) 00514 { 00515 cycles += record.cycles; 00516 freq += record.freq; 00517 } 00518 00519 /* ============================================================================================= */ 00520 // FunctionDescriptor implementation 00521 /* ============================================================================================= */ 00522 FunctionDescriptor::FunctionDescriptor(const FunctionId fId, const std::string fName, 00523 const bool fIsMain, const InsId firstIns, const InsId lastIns, const uint32_t recNum) : 00524 id(fId), name(fName), isMain(fIsMain), firstInsId(firstIns), lastInsId(lastIns), recordNum(recNum) {}; 00525 00526 /* ============================================================================================= */ 00527 // FunLatencyKernelProfile implementation 00528 /* ============================================================================================= */ 00529 FunLatencyKernelProfile::FunLatencyKernelProfile(const IGtKernel& kernel, 00530 const IGtCfg& cfg, 00531 const GtProfileArray& profileArray, 00532 InstrumentSites&& instrumentSites, 00533 FunctionMap&& functions) : 00534 _name(GlueString(kernel.Name())), _uniqueName(kernel.UniqueName()), _type(kernel.Type()), _platform(kernel.GpuPlatform()), _hashId(kernel.HashId()), 00535 _simd(kernel.SimdWidth()), _asmText(CfgAsmText(cfg)), _profileArray(profileArray), _totalKernelCycles(0), 00536 _instrumentSites(std::move(instrumentSites)),_functionMap(std::move(functions)) {}; 00537 00538 FunLatencyDispatchProfile& FunLatencyKernelProfile::AddDispatchProfile(const FunLatencyDispatchProfile& dispatchProfile) 00539 { 00540 _totalKernelCycles += dispatchProfile.cycles; 00541 _dispatchProfiles.push_back(dispatchProfile); 00542 return _dispatchProfiles.back(); 00543 } 00544 00545 InstrumentSiteId FunLatencyKernelProfile::GetSiteId(uint32_t recordNum) const 00546 { 00547 GTPIN_ASSERT(recordNum < _instrumentSites.size()); 00548 return _instrumentSites[recordNum]; 00549 } 00550 00551 void FunLatencyKernelProfile::DumpAsm() const 00552 { 00553 DumpKernelAsmText(_name, _uniqueName, _asmText); 00554 } 00555 00556 string FunLatencyKernelProfile::ToString() const 00557 { 00558 ostringstream ostr; 00559 if (!_dispatchProfiles.empty()) 00560 { 00561 for (const auto& dp : _dispatchProfiles) 00562 { 00563 if (knobSkipZeroResults && dp.freq == 0) 00564 { 00565 continue; // Skip zero results 00566 } 00567 00568 auto func = _functionMap.find(dp.siteId)->second; 00569 double dispatchToExecTotal = (_totalKernelCycles ? ((100.0 * (double)dp.cycles) / _totalKernelCycles) : 0.0); 00570 uint64_t avgCycles = (dp.freq ? (dp.cycles / dp.freq) : 0); 00571 string funcName = func.name + (func.isMain ? "(main)" : ""); 00572 if (funcName.length() > 29) 00573 { 00574 funcName = funcName.substr(0, 29); 00575 } 00576 00577 if (func.isMain) 00578 { 00579 ostr << endl; 00580 } 00581 00582 if (func.isMain) 00583 { 00584 ostr << endl; 00585 } 00586 ostr << setw(20) << _name << setw(20) << hex << _hashId; 00587 ostr << setw(30) << funcName << setw(15) << std::string("[" + std::to_string((uint32_t)func.firstInsId) + "-" + std::to_string((uint32_t)func.lastInsId) + "]"); 00588 ostr << setw(15) << dec << dp.freq << setw(20) << dec << dp.cycles; 00589 ostr << fixed << setprecision(2) << setw(20) << dispatchToExecTotal << setw(20) << avgCycles << setw(10) << _type.ToString(); 00590 ostr << setw(10) << _simd << setw(15) << _platform.ToString() << setw(35) << dp.kernelExecDesc.ToString(_platform, ExecDescAlignedFormat()); 00591 ostr << endl; 00592 } 00593 } 00594 else 00595 { 00596 ostr << setw(20) << _name << setw(20) << hex << _hashId; 00597 ostr << setw(30) << "NA" << setw(15) << +"NA"; 00598 ostr << setw(15) << dec << "NA" << setw(20) << dec << "NA"; 00599 ostr << fixed << setprecision(2) << setw(20) << "NA" << setw(20) << "NA" << setw(10) << "NA"; 00600 ostr << setw(10) << "NA" << setw(15) << "NA" << setw(35) << "NA"; 00601 ostr << endl; 00602 } 00603 00604 ostr << endl; 00605 return ostr.str(); 00606 } 00607 00608 /* ============================================================================================= */ 00609 // GTPin_Entry 00610 /* ============================================================================================= */ 00611 EXPORT_C_FUNC void GTPin_Entry(int argc, const char* argv[]) 00612 { 00613 ConfigureGTPin(argc, argv); 00614 FunLatency::Instance()->Register(); 00615 atexit(FunLatency::OnFini); 00616 }
(Back to the list of all GTPin Sample Tools)
Copyright (C) 2013-2025 Intel Corporation
SPDX-License-Identifier: MIT
1.7.4