|
GTPin
|
Simdprof_hlif is a GTPin tool for profiling active channels in GPU kernels at per-channel granularity. The tool applies the same algorithm as the GTPin: Simdprof Sample Tool, and it leverages the High-Level Instrumentation Interface (HLIF).
To run the Simdprof_hlif tool, use the following command:
Profilers/Bin/gtpin -t simdprof_hlif [GTPin args] -- app [application args]
The following output example shows the results of profiling a kernel that multiplies two matrices of 256x256 single-precision floating-point values.
Channels (SIMD operations) executed by kernels/BBLs
===================================================
----------------------------------------------------------------------------------------------------
GEMM___CS_asm7a9710a97ba42282_simd32_7a9710a97ba42282_0
BBL Head Ins ID Tail Ins ID Channels
0 16 39 487424
chan[ 0]: 49152
chan[ 1]: 36864
chan[ 2]: 36864
chan[ 3]: 36864
chan[ 4]: 36864
chan[ 5]: 36864
chan[ 6]: 36864
chan[ 7]: 36864
chan[ 8]: 18432
chan[ 9]: 18432
chan[10]: 18432
chan[11]: 18432
chan[12]: 18432
chan[13]: 18432
chan[14]: 18432
chan[15]: 18432
chan[16]: 2048
chan[17]: 2048
chan[18]: 2048
chan[19]: 2048
chan[20]: 2048
chan[21]: 2048
chan[22]: 2048
chan[23]: 2048
chan[24]: 2048
chan[25]: 2048
chan[26]: 2048
chan[27]: 2048
chan[28]: 2048
chan[29]: 2048
chan[30]: 2048
chan[31]: 2048
1 40 42 0
2 43 43 65536
chan[ 0]: 2048
chan[ 1]: 2048
chan[ 2]: 2048
chan[ 3]: 2048
chan[ 4]: 2048
chan[ 5]: 2048
chan[ 6]: 2048
chan[ 7]: 2048
chan[ 8]: 2048
chan[ 9]: 2048
chan[10]: 2048
chan[11]: 2048
chan[12]: 2048
chan[13]: 2048
chan[14]: 2048
chan[15]: 2048
chan[16]: 2048
chan[17]: 2048
chan[18]: 2048
chan[19]: 2048
chan[20]: 2048
chan[21]: 2048
chan[22]: 2048
chan[23]: 2048
chan[24]: 2048
chan[25]: 2048
chan[26]: 2048
chan[27]: 2048
chan[28]: 2048
chan[29]: 2048
chan[30]: 2048
chan[31]: 2048
3 44 46 67584
chan[ 0]: 6144
chan[ 1]: 4096
chan[ 2]: 4096
chan[ 3]: 4096
chan[ 4]: 4096
chan[ 5]: 4096
chan[ 6]: 4096
chan[ 7]: 4096
chan[ 8]: 4096
chan[ 9]: 4096
chan[10]: 4096
chan[11]: 4096
chan[12]: 4096
chan[13]: 4096
chan[14]: 4096
chan[15]: 4096
4 47 74 156696576
chan[ 0]: 14678016
chan[ 1]: 8910848
chan[ 2]: 8910848
chan[ 3]: 8910848
chan[ 4]: 8910848
chan[ 5]: 8910848
chan[ 6]: 8910848
chan[ 7]: 8910848
chan[ 8]: 8910848
chan[ 9]: 8910848
chan[10]: 8910848
chan[11]: 8910848
chan[12]: 8910848
chan[13]: 8910848
chan[14]: 8910848
chan[15]: 8910848
chan[16]: 522240
chan[17]: 522240
chan[18]: 522240
chan[19]: 522240
chan[20]: 522240
chan[21]: 522240
chan[22]: 522240
chan[23]: 522240
chan[24]: 522240
chan[25]: 522240
chan[26]: 522240
chan[27]: 522240
chan[28]: 522240
chan[29]: 522240
chan[30]: 522240
chan[31]: 522240
5 75 75 65536
chan[ 0]: 2048
chan[ 1]: 2048
chan[ 2]: 2048
chan[ 3]: 2048
chan[ 4]: 2048
chan[ 5]: 2048
chan[ 6]: 2048
chan[ 7]: 2048
chan[ 8]: 2048
chan[ 9]: 2048
chan[10]: 2048
chan[11]: 2048
chan[12]: 2048
chan[13]: 2048
chan[14]: 2048
chan[15]: 2048
chan[16]: 2048
chan[17]: 2048
chan[18]: 2048
chan[19]: 2048
chan[20]: 2048
chan[21]: 2048
chan[22]: 2048
chan[23]: 2048
chan[24]: 2048
chan[25]: 2048
chan[26]: 2048
chan[27]: 2048
chan[28]: 2048
chan[29]: 2048
chan[30]: 2048
chan[31]: 2048
6 76 85 219136
chan[ 0]: 20480
chan[ 1]: 14336
chan[ 2]: 14336
chan[ 3]: 14336
chan[ 4]: 14336
chan[ 5]: 14336
chan[ 6]: 14336
chan[ 7]: 14336
chan[ 8]: 12288
chan[ 9]: 12288
chan[10]: 12288
chan[11]: 12288
chan[12]: 12288
chan[13]: 12288
chan[14]: 12288
chan[15]: 12288
7 86 86 2048
chan[ 0]: 2048
8 87 87 0
Total 157603840
Total number of kernels: 1
Total number of channels (SIMD operations): 157603840
(Back to the list of all GTPin Sample Tools)
00001 /*========================== begin_copyright_notice ============================ 00002 Copyright (C) 2024-2026 Intel Corporation 00003 00004 SPDX-License-Identifier: MIT 00005 ============================= end_copyright_notice ===========================*/ 00006 00007 /*! 00008 * @file SIMD operation counting tool definitions 00009 */ 00010 #ifndef SIMDPROF_HLIF_H_ 00011 #define SIMDPROF_HLIF_H_ 00012 00013 #include "hlif_basic_defs.h" 00014 00015 #if defined(__cplusplus) 00016 #include "gtpin_api.h" 00017 using namespace gtpin; 00018 #endif 00019 00020 #pragma pack(push, 8) 00021 00022 /* ============================================================================================= */ 00023 // Struct SimdProfCountArgs 00024 /* ============================================================================================= */ 00025 /// Common arguments of HLI function that counts active channels 00026 typedef struct SimdProfCountArgs 00027 { 00028 struct 00029 { 00030 uint32_t channelCounts[32]; ///< Counter of active channels 00031 } out; 00032 00033 #if defined(__cplusplus) 00034 /// Constructor 00035 SimdProfCountArgs() { for (uint32_t i = 0; i < 32; out.channelCounts[i++] = 0); } 00036 #endif 00037 } SimdProfCountArgs; 00038 00039 /* ============================================================================================= */ 00040 // Function CountEnableChannels 00041 /* ============================================================================================= */ 00042 /*! 00043 * @brief HLI function that counts enabled channels 00044 * @param[in] accessMask Per-channel mask of memory accesses 00045 * @param[out] counter Per channel counters 00046 */ 00047 IGC_STACK_CALL void CountEnableChannels(uint32_t accessMask, 00048 __global SimdProfCountArgs* counter); 00049 00050 #if defined(__cplusplus) 00051 using CountEnableChannelsFunc = GtHliFunction<void, uint32_t, SimdProfCountArgs*>; 00052 #endif 00053 00054 /*! 
00055 * @brief HLI function that counts enabled channels for SEND instruction 00056 * @param[in] accessMask Per-channel mask of memory accesses 00057 * @param[out] counter Per channel counters 00058 */ 00059 IGC_STACK_CALL void CountEnableChannelsForSend(uint32_t accessMask, 00060 __global SimdProfCountArgs* counter); 00061 00062 #if defined(__cplusplus) 00063 using CountEnableChannelsForSendFunc = GtHliFunction<void, uint32_t, SimdProfCountArgs*>; 00064 #endif 00065 00066 #pragma pack(pop) 00067 00068 #endif
00001 /*========================== begin_copyright_notice ============================ 00002 Copyright (C) 2024-2025 Intel Corporation 00003 00004 SPDX-License-Identifier: MIT 00005 ============================= end_copyright_notice ===========================*/ 00006 00007 /*! 00008 * @file Implementation of the SIMD operation counting tool 00009 */ 00010 00011 #include <algorithm> 00012 #include <array> 00013 #include <vector> 00014 #include <map> 00015 #include <string> 00016 #include <fstream> 00017 #include <sstream> 00018 #include <iomanip> 00019 #include <assert.h> 00020 00021 #include "gtpin_api.h" 00022 #include "gtpin_tool_utils.h" 00023 00024 #include "simdprof_hlif.h" 00025 00026 using namespace gtpin; 00027 using namespace std; 00028 00029 /* ============================================================================================= */ 00030 // Struct SimdProfArgs 00031 /* ============================================================================================= */ 00032 /*! 00033 * SimdProf instrumentation arguments (instruction properties). 
00034 * Each unique combination of these arguments requires a separate instrumentation procedure 00035 * to be generated for each group of instructions with these properties 00036 */ 00037 struct SimdProfArgs 00038 { 00039 SimdProfArgs(bool ctrl, uint32_t mask, GtPredicate pred, bool isSend = false) : 00040 maskCtrl(ctrl), execMask(mask), predicate(pred), isSendIns(isSend) {} 00041 00042 inline bool operator< (const SimdProfArgs& other) const; 00043 00044 bool maskCtrl; ///< 'MaskCtrl' flag of instrumented instructions 00045 uint32_t execMask; ///< Execution mask of instrumented instructions 00046 GtPredicate predicate; ///< Predicate of instrumented instructions 00047 bool isSendIns; ///< true if instrumented instructions are SEND instructions 00048 }; 00049 00050 /* ============================================================================================= */ 00051 // Struct SimdProfGroup 00052 /* ============================================================================================= */ 00053 /*! 00054 * Structure that holds information and profiling results for a group of instructions being 00055 * instrumented by a single instrumentation routine. 00056 * @note All instructions within a group have exactly the same SimdProfArgs. 00057 * @note In order to provide separate channel counters per instruction category (e.g. integer, FP, etc.), 00058 * replace the {insCount, opCount} pair with an array of counter pairs per category. 
00059 */ 00060 struct SimdProfGroup 00061 { 00062 SimdProfGroup(uint32_t bbl, uint32_t count) : bblId(bbl), insCount(count), totalOpCount(0), simdProfCountArgs() { memset(opCounts.data(), 0, 32 * sizeof(uint64_t)); } 00063 00064 BblId bblId; ///< Identifier of a BBL that contains this group of instructions 00065 uint32_t insCount; ///< Number of instructions in the group 00066 uint64_t totalOpCount; 00067 std::array<uint64_t, 32> opCounts; ///< Number of SIMD operations (effective channels) executed by each instruction in the group 00068 SimdProfCountArgs simdProfCountArgs; 00069 }; 00070 00071 /* ============================================================================================= */ 00072 // Struct SimdProfSection 00073 /* ============================================================================================= */ 00074 /*! 00075 * Structure that holds information on a SimdProf section - sequence of instructions for which 00076 * instrumentation routines can be inserted at the same point. 00077 * @note All instructions within a section are executed with the same value of the flag register - 00078 * single dynamic parameter of the SIMD operation calulator 00079 */ 00080 struct SimdProfSection 00081 { 00082 SimdProfSection(BblId bid, const IGtIns& headIns) : bblId(bid), headInsId(headIns.Id()) {} 00083 00084 /// Add a new instruction to the section. 
Update the corresponding SimdProf group within this section 00085 void AddInstruction(const IGtIns& ins); 00086 00087 BblId bblId; ///< Id of the corresponding basic block 00088 InsId headInsId; ///< First intruction of the section - common 00089 ///< instrumentation point for all groups in the section 00090 std::map<SimdProfArgs, uint32_t> groups; ///< SimdProf groups along with the number of instructions 00091 std::map<SimdProfArgs, InsId> groupHeadIds; 00092 }; 00093 00094 /* ============================================================================================= */ 00095 // Class SimdProfKernelProfile 00096 /* ============================================================================================= */ 00097 /*! 00098 * Class that represents a kernel profiled by the SimdProf instrumentation 00099 */ 00100 class SimdProfKernelProfile 00101 { 00102 public: 00103 SimdProfKernelProfile(const IGtKernel& kernel); 00104 00105 /*! 00106 * Instrument the kernel. 00107 * The function is called by the OnKernelBuild handler 00108 * @return success/failure status 00109 */ 00110 void Instrument(IGtKernelInstrument& instrumentor); 00111 00112 void WriteArgumentsToDevice(IGtMemoryMapper& memMapper); 00113 00114 /*! 00115 * Read profiling results which are assumed to be collected and stored in the buffer 00116 * associated with the kernel. 00117 * The function is called by the OnKernelComplete handler 00118 */ 00119 void ReadProfileData(); 00120 00121 uint64_t GetTotalOpCounter() const { return _totalOpCount; } 00122 00123 std::string ToString() const; ///< @return Text representation of the profile data 00124 00125 private: 00126 /*! 00127 * Generate instrumentation procedures for all SimdProf groups of the specified SimdProf section. 00128 * Insert instrumentation at the beginning of the section. 
00129 * Initialize the _profileData array 00130 * @param[in] instrumentor Instrumentor of the GEN kernel 00131 * @param[in] section SimdProf section to be instrumented 00132 */ 00133 void InstrumentSection(IGtKernelInstrument& instrumentor, const SimdProfSection& section); 00134 00135 /// Increment counter of SIMD operations for the specified BBL by 'incValue' 00136 void UpdateBblOpCounter(BblId bblId, uint64_t* perChannelValues); 00137 00138 /// @return Extended kernel name 00139 std::string ExtendedName() const { return _extName; } 00140 00141 private: 00142 /// Kernel descriptor 00143 std::string _name; ///< Kernel's name 00144 std::string _extName; ///< Kernel's extended name 00145 GtKernelType _type; ///< Kernel's type 00146 GtGpuPlatform _platform; ///< Kernel's platform 00147 uint64_t _hashId; ///< Kernel's hash identifier 00148 GtSimdWidth _simd; ///< Kernel's SIMD width 00149 uint64_t _binarySignature; ///< Kernel's binary signature 00150 uint64_t _totalOpCount = 0; 00151 std::list<SimdProfGroup> _profileData; ///< Profiling data for instrumented SimdProf groups 00152 00153 std::map<BblId, std::pair<InsId, InsId> > _bblInsInfo; ///< Head and tail instructions per BBL 00154 00155 typedef struct OpCounts { 00156 uint64_t counts[32] = {}; 00157 uint64_t total = 0; 00158 } OpCounts; 00159 00160 std::map<BblId, OpCounts> _bblOpCounts; ///< Number of executed SIMD operations per BBL 00161 00162 CountEnableChannelsFunc _countEnableChannelsFunc; 00163 CountEnableChannelsForSendFunc _countEnableChannelsForSendFunc; 00164 }; 00165 00166 /* ============================================================================================= */ 00167 // Class SimdProfHlif 00168 /* ============================================================================================= */ 00169 /*! 
00170 * Implementation of the IGtTool interface for the SimdProfHlif tool 00171 */ 00172 class SimdProfHlif : public GtTool 00173 { 00174 public: 00175 /// Implementation of the IGtTool interface 00176 const char* Name() const { return "simdprof_hlif"; } 00177 00178 void OnKernelBuild(IGtKernelInstrument& instrumentor); 00179 void OnKernelRun(IGtKernelDispatch& dispatcher); 00180 void OnKernelComplete(IGtKernelDispatch& dispatcher); 00181 00182 public: 00183 std::string ToString() const; ///< @return Text representation of the profile data 00184 static SimdProfHlif* Instance(); ///< @return Single instance of this class 00185 static void OnFini() { Instance()->Fini(); } ///< Callback function registered with atexit() 00186 void LoadHliLibrary(); ///< Compile and load library of HLI functions 00187 00188 protected: 00189 SimdProfHlif() = default; 00190 SimdProfHlif(const SimdProfHlif&) = delete; 00191 SimdProfHlif& operator = (const SimdProfHlif&) = delete; 00192 ~SimdProfHlif() = default; 00193 00194 void Fini(); ///< Post process and dump profiling data 00195 00196 private: 00197 00198 /// Collection of kernel profiles 00199 using KernelProfilers = std::map<GtKernelId, SimdProfKernelProfile>; 00200 00201 private: 00202 00203 KernelProfilers _kernels; 00204 IGtHliModuleHandle _hliModule = nullptr; ///< Module of HLI functions 00205 }; 00206 00207 00208 /* ============================================================================================= */ 00209 // SimdProfArgs implementation 00210 /* ============================================================================================= */ 00211 00212 bool SimdProfArgs::operator < (const SimdProfArgs& other) const 00213 { 00214 return std::make_tuple(maskCtrl, execMask, predicate, isSendIns) < 00215 std::make_tuple(other.maskCtrl, other.execMask, other.predicate, other.isSendIns); 00216 } 00217 00218 /* ============================================================================================= */ 00219 // 
SimdProfSection implementation 00220 /* ============================================================================================= */ 00221 00222 void SimdProfSection::AddInstruction(const IGtIns& ins) 00223 { 00224 uint32_t execMask = ins.ExecMask().Bits(); 00225 GtPredicate predicate = ins.Predicate(); 00226 bool maskCtrl = !ins.IsWriteMaskEnabled(); 00227 bool isSendIns = ins.IsSendMessage(); 00228 00229 SimdProfArgs simdProfArgs(maskCtrl, execMask, predicate, isSendIns); 00230 00231 auto it = groups.emplace(simdProfArgs, 0).first; 00232 00233 if (it->second == 0) 00234 { 00235 groupHeadIds.emplace(simdProfArgs, ins.Id()); 00236 } 00237 00238 ++(it->second); 00239 } 00240 00241 /* ============================================================================================= */ 00242 // SimdprofKernelProfile implementation 00243 /* ============================================================================================= */ 00244 00245 SimdProfKernelProfile::SimdProfKernelProfile(const IGtKernel& kernel) : 00246 _name(GlueString(kernel.Name())), _extName(ExtendedKernelName(kernel)), _type(kernel.Type()), _platform(kernel.GpuPlatform()), 00247 _hashId(kernel.HashId()), _simd(kernel.SimdWidth()), _binarySignature(kernel.BinarySignature()), 00248 _countEnableChannelsFunc("CountEnableChannels"), _countEnableChannelsForSendFunc("CountEnableChannelsForSend") {} 00249 00250 void SimdProfKernelProfile::Instrument(IGtKernelInstrument& instrumentor) 00251 { 00252 const IGtCfg& cfg = instrumentor.Cfg(); 00253 IGtMemoryMapper& memMapper = instrumentor.MemoryMapper(); 00254 00255 std::vector<SimdProfSection> sections; // All SimdProf sections in the kernel 00256 00257 // Identify SimdProf sections and #groups in the kernel 00258 for (auto bblPtr : cfg.Bbls()) 00259 { 00260 bool isSectionBegin = true; 00261 00262 // Iterate through sections within the current BBL 00263 for (auto insPtr : bblPtr->Instructions()) 00264 { 00265 const IGtIns& ins = *insPtr; 00266 00267 if 
(insPtr->Id() < uint32_t(knobMinInstrumentIns)) 00268 { 00269 continue; 00270 } 00271 00272 if (insPtr->Id() > uint32_t(knobMaxInstrumentIns)) 00273 { 00274 continue; 00275 } 00276 00277 if (isSectionBegin) 00278 { 00279 sections.emplace_back(bblPtr->Id(), ins); 00280 isSectionBegin = false; 00281 } 00282 00283 SimdProfSection& section = sections.back(); 00284 section.AddInstruction(ins); 00285 00286 00287 if (ins.IsFlagModifier() || (ins.Id() == bblPtr->LastIns().Id()) || (insPtr->Id() == uint32_t(knobMaxInstrumentIns))) //section end 00288 { 00289 isSectionBegin = true; 00290 } 00291 } 00292 } 00293 00294 // Instrument SimdProf sections and initialize the _profileData array 00295 for (auto& section : sections) { InstrumentSection(instrumentor, section); } 00296 00297 std::map<uint32_t, uint32_t> bblGroupNumber; 00298 00299 for (auto& group : _profileData) 00300 { 00301 bblGroupNumber[group.bblId]++; 00302 memMapper.Map(group.simdProfCountArgs, GT_MMAP_SHARE); 00303 } 00304 00305 // Save BBL information for the post processing phase 00306 for (auto bblPtr : cfg.Bbls()) 00307 { 00308 _bblInsInfo.emplace(bblPtr->Id(), std::make_pair(bblPtr->FirstIns().Id(), bblPtr->LastIns().Id())); 00309 } 00310 } 00311 00312 void SimdProfKernelProfile::WriteArgumentsToDevice(IGtMemoryMapper& memMapper) 00313 { 00314 for (auto& group : _profileData) 00315 { 00316 void* ptr = (void*)&(group.simdProfCountArgs); 00317 memset(ptr, 0, sizeof(SimdProfCountArgs)); 00318 memMapper.Write(ptr, sizeof(SimdProfCountArgs)); 00319 } 00320 } 00321 00322 void SimdProfKernelProfile::InstrumentSection(IGtKernelInstrument& instrumentor, const SimdProfSection& section) 00323 { 00324 const IGtCfg& cfg = instrumentor.Cfg(); 00325 00326 // Instrument each SimdProf group: 00327 // - If a group is associated with a non-SEND instructions, compute the SIMD count by applying CBIT to the SIMD mask. 
00328 // - Otherwise, if a group is created for SEND instructions, increment the SIMD count for each SEND whose SIMD mask 00329 // is nonzero. From the EU perspective, SEND instruction is 1 operation, unless the SIMD mask is zero 00330 // Insert each per-group instrumentation procedure at the beginning of the corresponding section 00331 00332 //Insert SimdProf instrumentaion at the beginning of the current section 00333 const IGtIns& ins = cfg.GetInstruction(section.headInsId); 00334 const IGtBbl& bbl = cfg.GetBbl(ins); 00335 00336 for (auto& group : section.groups) 00337 { 00338 00339 auto it = section.groupHeadIds.find(group.first); GTPIN_ASSERT(it != section.groupHeadIds.end()); 00340 const IGtIns& groupHeadIns = cfg.GetInstruction(it->second); 00341 00342 _profileData.emplace_back(bbl.Id(), group.second); 00343 00344 SimdProfGroup& newProfGroup = _profileData.back(); 00345 SimdProfCountArgs* countArgs = const_cast<SimdProfCountArgs*>(&newProfGroup.simdProfCountArgs); 00346 IargInsOpMask accessMask(groupHeadIns); 00347 00348 if (group.first.isSendIns) 00349 { 00350 _countEnableChannelsForSendFunc.InsertCallAtInstruction(instrumentor, groupHeadIns, GtIpoint::Before(), 00351 NullReg(), // Unused return value 00352 accessMask, // arg[1]: Per-channel mask of memory accesses 00353 countArgs // arg[2]: Counter arguments 00354 ); 00355 } 00356 else 00357 { 00358 _countEnableChannelsFunc.InsertCallAtInstruction(instrumentor, groupHeadIns, GtIpoint::Before(), 00359 NullReg(), // Unused return value 00360 accessMask, // arg[1]: Per-channel mask of memory accesses 00361 countArgs // arg[2]: Counter arguments 00362 ); 00363 } 00364 } 00365 } 00366 00367 void SimdProfKernelProfile::ReadProfileData() 00368 { 00369 // Iterate through all SimdProf groups and read counters of executed operations (channels). 
00370 for (auto& group : _profileData) 00371 { 00372 uint64_t groupCounts[32] = {}; 00373 00374 SimdProfCountArgs& args = group.simdProfCountArgs; 00375 uint32_t numOfInsts = group.insCount; 00376 00377 for (uint32_t i = 0; i < 32; i++) 00378 { 00379 groupCounts[i] = args.out.channelCounts[i] * numOfInsts; 00380 _totalOpCount += groupCounts[i]; 00381 } 00382 // Update counters of executed operations 00383 UpdateBblOpCounter(group.bblId._value, groupCounts); 00384 } 00385 } 00386 00387 void SimdProfKernelProfile::UpdateBblOpCounter(BblId bblId, uint64_t* perChannelValues) 00388 { 00389 auto it = _bblOpCounts.emplace(bblId, OpCounts()).first; 00390 00391 auto& opCounts = it->second; 00392 00393 for (uint32_t i = 0; i < 32; ++i) 00394 { 00395 opCounts.counts[i] += perChannelValues[i]; 00396 opCounts.total += perChannelValues[i]; 00397 } 00398 } 00399 00400 std::string SimdProfKernelProfile::ToString() const 00401 { 00402 ostringstream ostr; 00403 ostr << ExtendedName() << endl; 00404 00405 if (!_bblOpCounts.empty()) 00406 { 00407 ostr << setw(10) << "BBL" << setw(15) << "Head Ins ID" << setw(15) << "Tail Ins ID" << setw(20) << "Channels" << endl; 00408 for (const auto& bc : _bblOpCounts) 00409 { 00410 auto bblId = bc.first; 00411 uint32_t firstIns = _bblInsInfo.at(bc.first).first; 00412 uint32_t lastIns = _bblInsInfo.at(bc.first).second; 00413 ostr << setw(10) << bblId << setw(15) << firstIns << setw(15) << lastIns << setw(20) << bc.second.total << endl; 00414 if (bc.second.total) 00415 { 00416 for (uint32_t i = 0; i < 32; ++i) 00417 { 00418 if (bc.second.counts[i]) 00419 { 00420 ostr << setw(80) << "chan[" << setw(2) << i << "]: " << bc.second.counts[i] << endl; 00421 } 00422 } 00423 } 00424 } 00425 ostr << setw(10) << "Total" << setw(15) << _totalOpCount << endl; 00426 } 00427 else 00428 { 00429 ostr << "No channels executed" << endl; 00430 } 00431 00432 return ostr.str(); 00433 } 00434 00435 /* 
============================================================================================= */ 00436 // SimdProfHlif implementation 00437 /* ============================================================================================= */ 00438 SimdProfHlif* SimdProfHlif::Instance() 00439 { 00440 static SimdProfHlif instance; 00441 return &instance; 00442 } 00443 00444 void SimdProfHlif::OnKernelBuild(IGtKernelInstrument& instrumentor) 00445 { 00446 const IGtKernel& kernel = instrumentor.Kernel(); 00447 auto it = _kernels.emplace(kernel.Id(), kernel).first; 00448 it->second.Instrument(instrumentor); 00449 00450 // Link the kernel with the library of HLI functions 00451 instrumentor.LinkHliModule(_hliModule); 00452 } 00453 00454 void SimdProfHlif::OnKernelRun(IGtKernelDispatch& dispatcher) 00455 { 00456 const IGtKernel& kernel = dispatcher.Kernel(); 00457 GtKernelExecDesc execDesc; dispatcher.GetExecDescriptor(execDesc); 00458 if (kernel.IsInstrumented() && IsKernelExecProfileEnabled(execDesc, kernel.GpuPlatform(), kernel.Name().Get())) 00459 { 00460 dispatcher.SetProfilingMode(true); // Enable instrumentation 00461 00462 // This tool needs an accurate information about memory allocations, which is available on the the final dispatch stage. 00463 // So, on the initial dispatch stage, we only enable instrumentation, and request GTPin to invoke OnKernelRun 00464 // one more time, on the final dispatch stage. If this request is accepted, the initialization of the profile buffer will 00465 // be done on the final dispatch stage, otherwise - on the intial dispatch stage. 
00466 if (dispatcher.ReportFinalDispatchStage()) 00467 { 00468 return; 00469 } 00470 00471 auto it = _kernels.find(kernel.Id()); 00472 00473 if (it != _kernels.end()) 00474 { 00475 SimdProfKernelProfile& kernelProfile = it->second; 00476 kernelProfile.WriteArgumentsToDevice(dispatcher.MemoryMapper()); 00477 } 00478 } 00479 else 00480 { 00481 dispatcher.SetProfilingMode(false); 00482 } 00483 } 00484 00485 void SimdProfHlif::OnKernelComplete(IGtKernelDispatch& dispatcher) 00486 { 00487 if (!dispatcher.IsProfilingEnabled()) 00488 { 00489 return; // Do nothing with unprofiled kernel dispatches 00490 } 00491 00492 const IGtKernel& kernel = dispatcher.Kernel(); 00493 auto it = _kernels.find(kernel.Id()); 00494 00495 if (it != _kernels.end()) 00496 { 00497 SimdProfKernelProfile& kernelProfile = it->second; 00498 kernelProfile.ReadProfileData(); 00499 } 00500 } 00501 00502 void SimdProfHlif::LoadHliLibrary() 00503 { 00504 std::string modulePath = JoinPath(GetKnobValue<std::string>("installDir"), "Examples", "simdprof_hlif.cl"); 00505 _hliModule = GTPin_GetCore()->HliLibrary().CompileModuleFromFile(modulePath.c_str()); 00506 GTPIN_ASSERT_MSG(_hliModule != nullptr, "Could not load HLI module " + modulePath); 00507 } 00508 00509 void SimdProfHlif::Fini() 00510 { 00511 string profileDir = GTPin_GetCore()->ProfileDir(); 00512 string filePath = JoinPath(profileDir, "simdprof_hlif.txt"); 00513 00514 ofstream fs(filePath); 00515 if (fs.is_open()) 00516 { 00517 fs << ToString(); 00518 fs.close(); 00519 } 00520 else 00521 { 00522 GTPIN_WARNING("SIMDPROF_HLIF : could not create file: " + filePath); 00523 } 00524 } 00525 00526 string SimdProfHlif::ToString() const 00527 { 00528 ostringstream ostr; 00529 ostr << "Channels (SIMD operations) executed by kernels/BBLs" << endl; 00530 ostr << "===================================================" << endl; 00531 00532 uint64_t totalOpCount = 0; 00533 for (const auto& k : _kernels) 00534 { 00535 ostr << string(100, '-') << endl; 00536 ostr << 
k.second.ToString() << endl; 00537 totalOpCount += k.second.GetTotalOpCounter(); 00538 } 00539 ostr << "Total number of kernels: " << _kernels.size() << std::endl; 00540 ostr << "Total number of channels (SIMD operations): " << totalOpCount << std::endl; 00541 00542 return ostr.str(); 00543 } 00544 00545 // Define DETACHED_SIMDPROF to use SimdProf functionality in a different tool 00546 #if !defined (DETACHED_SIMDPROF) 00547 /* ============================================================================================= */ 00548 // GTPin_Entry 00549 /* ============================================================================================= */ 00550 EXPORT_C_FUNC void GTPin_Entry(int argc, const char* argv[]) 00551 { 00552 ConfigureGTPin(argc, argv); 00553 SimdProfHlif::Instance()->Register(); 00554 00555 // Compile and load library of HLI functions 00556 SimdProfHlif::Instance()->LoadHliLibrary(); 00557 00558 atexit(SimdProfHlif::OnFini); 00559 } 00560 #endif
00001 /*========================== begin_copyright_notice ============================ 00002 Copyright (C) 2024-2026 Intel Corporation 00003 00004 SPDX-License-Identifier: MIT 00005 ============================= end_copyright_notice ===========================*/ 00006 00007 /*! 00008 * @file Library of High-Level Instrumentation (HLI) functions used by the simdprof_hlif tool 00009 */ 00010 00011 #include "hlif_basic_defs.h" 00012 #include "simdprof_hlif.h" 00013 00014 00015 #define PER_CHANNEL 00016 00017 /*! 00018 * @brief HLI function that counts enabled channels 00019 * @see simdprof_hlif.h for details 00020 */ 00021 IGC_STACK_CALL void CountEnableChannels(uint32_t accessMask, __global SimdProfCountArgs* counter) 00022 { 00023 #ifndef PER_CHANNEL 00024 uint64_t channelCount = (uint64_t)popcount(accessMask); 00025 atom_add(&(counter->out.channelCounts[0]), channelCount); 00026 #else 00027 if (accessMask != 0) 00028 { 00029 for (uint32_t cIndx = 0; cIndx < 32; ++cIndx) 00030 { 00031 if ((accessMask & (0x1 << cIndx)) != 0) 00032 { 00033 atomic_inc(&(counter->out.channelCounts[cIndx])); 00034 } 00035 } 00036 } 00037 #endif 00038 } 00039 00040 /*! 00041 * @brief HLI function that counts enabled channels 00042 * @see simdprof_hlif.h for details 00043 */ 00044 IGC_STACK_CALL void CountEnableChannelsForSend(uint32_t accessMask, __global SimdProfCountArgs* counter) 00045 { 00046 if (accessMask != 0) 00047 { 00048 atomic_inc(&(counter->out.channelCounts[0])); 00049 } 00050 }
(Back to the list of all GTPin Sample Tools)
Copyright (C) 2013-2025 Intel Corporation
SPDX-License-Identifier: MIT
1.7.4