GTPin
GTPin: Simdprof_hlif Sample Tool

Simdprof_hlif is a GTPin tool that profiles active channels in GPU kernels at per-channel granularity. It applies the same algorithm as the GTPin Simdprof sample tool, but is implemented with the High-Level Instrumentation Interface (HLIF).

Running the Simdprof_hlif tool

To run the Simdprof_hlif tool, use the following command:

Profilers/Bin/gtpin -t simdprof_hlif [GTPin args]  -- app [application args]

Example Output

The following output example shows the results of profiling a kernel that multiplies two matrices of 256x256 single-precision floating-point values.

Channels (SIMD operations) executed by kernels/BBLs
===================================================
----------------------------------------------------------------------------------------------------
GEMM___CS_asm7a9710a97ba42282_simd32_7a9710a97ba42282_0
       BBL    Head Ins ID    Tail Ins ID            Channels
         0             16             39              487424
                                                                           chan[ 0]: 49152
                                                                           chan[ 1]: 36864
                                                                           chan[ 2]: 36864
                                                                           chan[ 3]: 36864
                                                                           chan[ 4]: 36864
                                                                           chan[ 5]: 36864
                                                                           chan[ 6]: 36864
                                                                           chan[ 7]: 36864
                                                                           chan[ 8]: 18432
                                                                           chan[ 9]: 18432
                                                                           chan[10]: 18432
                                                                           chan[11]: 18432
                                                                           chan[12]: 18432
                                                                           chan[13]: 18432
                                                                           chan[14]: 18432
                                                                           chan[15]: 18432
                                                                           chan[16]: 2048
                                                                           chan[17]: 2048
                                                                           chan[18]: 2048
                                                                           chan[19]: 2048
                                                                           chan[20]: 2048
                                                                           chan[21]: 2048
                                                                           chan[22]: 2048
                                                                           chan[23]: 2048
                                                                           chan[24]: 2048
                                                                           chan[25]: 2048
                                                                           chan[26]: 2048
                                                                           chan[27]: 2048
                                                                           chan[28]: 2048
                                                                           chan[29]: 2048
                                                                           chan[30]: 2048
                                                                           chan[31]: 2048
         1             40             42                   0
         2             43             43               65536
                                                                           chan[ 0]: 2048
                                                                           chan[ 1]: 2048
                                                                           chan[ 2]: 2048
                                                                           chan[ 3]: 2048
                                                                           chan[ 4]: 2048
                                                                           chan[ 5]: 2048
                                                                           chan[ 6]: 2048
                                                                           chan[ 7]: 2048
                                                                           chan[ 8]: 2048
                                                                           chan[ 9]: 2048
                                                                           chan[10]: 2048
                                                                           chan[11]: 2048
                                                                           chan[12]: 2048
                                                                           chan[13]: 2048
                                                                           chan[14]: 2048
                                                                           chan[15]: 2048
                                                                           chan[16]: 2048
                                                                           chan[17]: 2048
                                                                           chan[18]: 2048
                                                                           chan[19]: 2048
                                                                           chan[20]: 2048
                                                                           chan[21]: 2048
                                                                           chan[22]: 2048
                                                                           chan[23]: 2048
                                                                           chan[24]: 2048
                                                                           chan[25]: 2048
                                                                           chan[26]: 2048
                                                                           chan[27]: 2048
                                                                           chan[28]: 2048
                                                                           chan[29]: 2048
                                                                           chan[30]: 2048
                                                                           chan[31]: 2048
         3             44             46               67584
                                                                           chan[ 0]: 6144
                                                                           chan[ 1]: 4096
                                                                           chan[ 2]: 4096
                                                                           chan[ 3]: 4096
                                                                           chan[ 4]: 4096
                                                                           chan[ 5]: 4096
                                                                           chan[ 6]: 4096
                                                                           chan[ 7]: 4096
                                                                           chan[ 8]: 4096
                                                                           chan[ 9]: 4096
                                                                           chan[10]: 4096
                                                                           chan[11]: 4096
                                                                           chan[12]: 4096
                                                                           chan[13]: 4096
                                                                           chan[14]: 4096
                                                                           chan[15]: 4096
         4             47             74           156696576
                                                                           chan[ 0]: 14678016
                                                                           chan[ 1]: 8910848
                                                                           chan[ 2]: 8910848
                                                                           chan[ 3]: 8910848
                                                                           chan[ 4]: 8910848
                                                                           chan[ 5]: 8910848
                                                                           chan[ 6]: 8910848
                                                                           chan[ 7]: 8910848
                                                                           chan[ 8]: 8910848
                                                                           chan[ 9]: 8910848
                                                                           chan[10]: 8910848
                                                                           chan[11]: 8910848
                                                                           chan[12]: 8910848
                                                                           chan[13]: 8910848
                                                                           chan[14]: 8910848
                                                                           chan[15]: 8910848
                                                                           chan[16]: 522240
                                                                           chan[17]: 522240
                                                                           chan[18]: 522240
                                                                           chan[19]: 522240
                                                                           chan[20]: 522240
                                                                           chan[21]: 522240
                                                                           chan[22]: 522240
                                                                           chan[23]: 522240
                                                                           chan[24]: 522240
                                                                           chan[25]: 522240
                                                                           chan[26]: 522240
                                                                           chan[27]: 522240
                                                                           chan[28]: 522240
                                                                           chan[29]: 522240
                                                                           chan[30]: 522240
                                                                           chan[31]: 522240
         5             75             75               65536
                                                                           chan[ 0]: 2048
                                                                           chan[ 1]: 2048
                                                                           chan[ 2]: 2048
                                                                           chan[ 3]: 2048
                                                                           chan[ 4]: 2048
                                                                           chan[ 5]: 2048
                                                                           chan[ 6]: 2048
                                                                           chan[ 7]: 2048
                                                                           chan[ 8]: 2048
                                                                           chan[ 9]: 2048
                                                                           chan[10]: 2048
                                                                           chan[11]: 2048
                                                                           chan[12]: 2048
                                                                           chan[13]: 2048
                                                                           chan[14]: 2048
                                                                           chan[15]: 2048
                                                                           chan[16]: 2048
                                                                           chan[17]: 2048
                                                                           chan[18]: 2048
                                                                           chan[19]: 2048
                                                                           chan[20]: 2048
                                                                           chan[21]: 2048
                                                                           chan[22]: 2048
                                                                           chan[23]: 2048
                                                                           chan[24]: 2048
                                                                           chan[25]: 2048
                                                                           chan[26]: 2048
                                                                           chan[27]: 2048
                                                                           chan[28]: 2048
                                                                           chan[29]: 2048
                                                                           chan[30]: 2048
                                                                           chan[31]: 2048
         6             76             85              219136
                                                                           chan[ 0]: 20480
                                                                           chan[ 1]: 14336
                                                                           chan[ 2]: 14336
                                                                           chan[ 3]: 14336
                                                                           chan[ 4]: 14336
                                                                           chan[ 5]: 14336
                                                                           chan[ 6]: 14336
                                                                           chan[ 7]: 14336
                                                                           chan[ 8]: 12288
                                                                           chan[ 9]: 12288
                                                                           chan[10]: 12288
                                                                           chan[11]: 12288
                                                                           chan[12]: 12288
                                                                           chan[13]: 12288
                                                                           chan[14]: 12288
                                                                           chan[15]: 12288
         7             86             86                2048
                                                                           chan[ 0]: 2048
         8             87             87                   0
     Total      157603840

Total number of kernels:                    1
Total number of channels (SIMD operations): 157603840

(Back to the list of all GTPin Sample Tools)

simdprof_hlif.h - Data structures and HLI function declarations.

00001 /*========================== begin_copyright_notice ============================
00002 Copyright (C) 2024-2026 Intel Corporation
00003 
00004 SPDX-License-Identifier: MIT
00005 ============================= end_copyright_notice ===========================*/
00006 
00007 /*!
00008  * @file SIMD operation counting tool definitions
00009  */
00010 #ifndef SIMDPROF_HLIF_H_
00011 #define SIMDPROF_HLIF_H_
00012 
00013 #include "hlif_basic_defs.h"
00014 
00015 #if defined(__cplusplus)
00016 #include "gtpin_api.h"
00017 using namespace gtpin;
00018 #endif
00019 
00020 #pragma pack(push, 8)
00021 
00022 /* ============================================================================================= */
00023 // Struct SimdProfCountArgs
00024 /* ============================================================================================= */
00025 /// Common arguments of HLI function that counts active channels
00026 typedef struct SimdProfCountArgs
00027 {
00028     struct
00029     {
00030         uint32_t channelCounts[32]; ///< Per-channel counters of active channels (32 entries)
00031     } out;
00032 
00033     #if defined(__cplusplus)
00034     /// Constructor (C++ only): sets all 32 entries of out.channelCounts to zero
00035     SimdProfCountArgs() { for (uint32_t i = 0; i < 32; out.channelCounts[i++] = 0); }
00036     #endif
00037 } SimdProfCountArgs;
00038 
00039 /* ============================================================================================= */
00040 // Function CountEnableChannels
00041 /* ============================================================================================= */
00042 /*!
00043  * @brief HLI function that counts enabled channels
00044  * @param[in]       accessMask     Per-channel mask (NOTE(review): documented as a mask of memory accesses; confirm it is the enabled-channel mask)
00045  * @param[out]      counter        Per-channel counters (see SimdProfCountArgs::out.channelCounts)
00046  */
00047 IGC_STACK_CALL void CountEnableChannels(uint32_t accessMask,
00048                                         __global SimdProfCountArgs* counter);
00049 
00050 #if defined(__cplusplus)
00051 using CountEnableChannelsFunc = GtHliFunction<void, uint32_t, SimdProfCountArgs*>; ///< C++-only GtHliFunction type mirroring the signature of CountEnableChannels
00052 #endif
00053 
00054 /*!
00055  * @brief HLI function that counts enabled channels for SEND instruction
00056  * @param[in]       accessMask     Per-channel mask (NOTE(review): documented as a mask of memory accesses; confirm it is the enabled-channel mask)
00057  * @param[out]      counter        Per-channel counters (see SimdProfCountArgs::out.channelCounts)
00058  */
00059 IGC_STACK_CALL void CountEnableChannelsForSend(uint32_t accessMask,
00060                                                __global SimdProfCountArgs* counter);
00061 
00062 #if defined(__cplusplus)
00063 using CountEnableChannelsForSendFunc = GtHliFunction<void, uint32_t, SimdProfCountArgs*>; ///< C++-only GtHliFunction type mirroring the signature of CountEnableChannelsForSend
00064 #endif
00065 
00066 #pragma pack(pop)
00067 
00068 #endif

simdprof_hlif.cpp - Tool implementation, instrumentation logic, and result aggregation.

00001 /*========================== begin_copyright_notice ============================
00002 Copyright (C) 2024-2025 Intel Corporation
00003 
00004 SPDX-License-Identifier: MIT
00005 ============================= end_copyright_notice ===========================*/
00006 
00007 /*!
00008  * @file Implementation of the SIMD operation counting tool
00009  */
00010 
00011 #include <algorithm>
00012 #include <array>
00013 #include <vector>
00014 #include <map>
00015 #include <string>
00016 #include <fstream>
00017 #include <sstream>
00018 #include <iomanip>
00019 #include <assert.h>
00020 
00021 #include "gtpin_api.h"
00022 #include "gtpin_tool_utils.h"
00023 
00024 #include "simdprof_hlif.h"
00025 
00026 using namespace gtpin;
00027 using namespace std;
00028 
00029 /* ============================================================================================= */
00030 // Struct SimdProfArgs
00031 /* ============================================================================================= */
00032 /*!
00033  * SimdProf instrumentation arguments (instruction properties).
00034  * Each unique combination of these arguments requires a separate instrumentation procedure
00035  * to be generated for each group of instructions with these properties.
00036  */
00037 struct SimdProfArgs
00038 {
00039     SimdProfArgs(bool ctrl, uint32_t mask, GtPredicate pred, bool isSend = false) : // Member-wise initialization; isSend defaults to false
00040                  maskCtrl(ctrl), execMask(mask), predicate(pred), isSendIns(isSend) {}
00041 
00042     inline bool operator< (const SimdProfArgs& other) const; ///< Ordering that allows SimdProfArgs to be used as a std::map key
00043 
00044     bool               maskCtrl;  ///< 'MaskCtrl' flag of instrumented instructions (SimdProfSection::AddInstruction passes !IsWriteMaskEnabled())
00045     uint32_t           execMask;  ///< Execution mask of instrumented instructions
00046     GtPredicate        predicate; ///< Predicate of instrumented instructions
00047     bool               isSendIns; ///< true if instrumented instructions are SEND instructions
00048 };
00049 
00050 /* ============================================================================================= */
00051 // Struct SimdProfGroup
00052 /* ============================================================================================= */
00053 /*!
00054  * Structure that holds information and profiling results for a group of instructions being
00055  * instrumented by a single instrumentation routine.
00056  * @note All instructions within a group have exactly the same SimdProfArgs.
00057  * @note In order to provide separate channel counters per instruction category (e.g. integer, FP, etc.),
00058  *       replace the {insCount, opCounts} pair with an array of counter pairs per category.
00059  */
00060 struct SimdProfGroup
00061 {
00062     SimdProfGroup(uint32_t bbl, uint32_t count) : bblId(bbl), insCount(count), totalOpCount(0), simdProfCountArgs() { memset(opCounts.data(), 0, 32 * sizeof(uint64_t)); } ///< Stores the BBL id and instruction count; zeroes totalOpCount and all 32 opCounts entries
00063 
00064     BblId                    bblId;            ///< Identifier of a BBL that contains this group of instructions
00065     uint32_t                 insCount;         ///< Number of instructions in the group
00066     uint64_t                 totalOpCount;     ///< Initialized to 0; presumably the sum of opCounts over all channels - TODO confirm
00067     std::array<uint64_t, 32> opCounts;         ///< Numbers of SIMD operations (effective channels) executed by the group; 32 entries, presumably one per channel - TODO confirm
00068     SimdProfCountArgs        simdProfCountArgs; ///< Counter arguments of the group; mapped in Instrument() and zeroed/written in WriteArgumentsToDevice()
00069 };
00070 
00071 /* ============================================================================================= */
00072 // Struct SimdProfSection
00073 /* ============================================================================================= */
00074 /*!
00075  * Structure that holds information on a SimdProf section - sequence of instructions for which
00076  * instrumentation routines can be inserted at the same point.
00077  * @note All instructions within a section are executed with the same value of the flag register -
00078  * single dynamic parameter of the SIMD operation calculator
00079  */
00080 struct SimdProfSection
00081 {
00082     SimdProfSection(BblId bid, const IGtIns& headIns) :  bblId(bid), headInsId(headIns.Id()) {} ///< Create an empty section that starts at 'headIns' in BBL 'bid'
00083 
00084     /// Add a new instruction to the section. Update the corresponding SimdProf group within this section
00085     void AddInstruction(const IGtIns& ins);
00086 
00087     BblId                            bblId;     ///< Id of the corresponding basic block
00088     InsId                            headInsId; ///< First instruction of the section - common
00089                                                 ///< instrumentation point for all groups in the section
00090     std::map<SimdProfArgs, uint32_t> groups;    ///< SimdProf groups along with the number of instructions
00091     std::map<SimdProfArgs, InsId>    groupHeadIds; ///< Id of the first instruction added to each SimdProf group
00092 };
00093 
00094 /* ============================================================================================= */
00095 // Class SimdProfKernelProfile
00096 /* ============================================================================================= */
00097 /*!
00098  * Class that represents a kernel profiled by the SimdProf instrumentation
00099  */
00100 class SimdProfKernelProfile
00101 {
00102 public:
00103     SimdProfKernelProfile(const IGtKernel& kernel);
00104 
00105     /*!
00106      * Instrument the kernel.
00107      * The function is called by the OnKernelBuild handler
00108      * @note Declared void: no success/failure status is returned
00109      */
00110     void Instrument(IGtKernelInstrument& instrumentor);
00111 
00112     void WriteArgumentsToDevice(IGtMemoryMapper& memMapper); ///< Zero the counter arguments of every group in _profileData and pass them to memMapper.Write()
00113 
00114     /*!
00115      * Read profiling results which are assumed to be collected and stored in the buffer
00116      * associated with the kernel.
00117      * The function is called by the OnKernelComplete handler
00118      */
00119      void ReadProfileData();
00120 
00121      uint64_t GetTotalOpCounter() const { return _totalOpCount; } ///< @return Current value of _totalOpCount
00122 
00123     std::string           ToString()        const; ///< @return Text representation of the profile data
00124 
00125 private:
00126     /*!
00127      * Generate instrumentation procedures for all SimdProf groups of the specified SimdProf section.
00128      * Insert instrumentation at the beginning of the section.
00129      * Initialize the _profileData list
00130      * @param[in]      instrumentor     Instrumentor of the GEN kernel
00131      * @param[in]      section          SimdProf section to be instrumented
00132      */
00133     void InstrumentSection(IGtKernelInstrument& instrumentor, const SimdProfSection& section);
00134 
00135     /// Increment counters of SIMD operations for the specified BBL using the values in 'perChannelValues'
00136     void UpdateBblOpCounter(BblId bblId, uint64_t* perChannelValues);
00137 
00138     /// @return Extended kernel name
00139     std::string ExtendedName() const { return _extName; }
00140 
00141 private:
00142     /// Kernel descriptor
00143     std::string                         _name;              ///< Kernel's name
00144     std::string                         _extName;           ///< Kernel's extended name
00145     GtKernelType                        _type;              ///< Kernel's type
00146     GtGpuPlatform                       _platform;          ///< Kernel's platform
00147     uint64_t                            _hashId;            ///< Kernel's hash identifier
00148     GtSimdWidth                         _simd;              ///< Kernel's SIMD width
00149     uint64_t                            _binarySignature;   ///< Kernel's binary signature
00150     uint64_t                            _totalOpCount = 0;  ///< Value returned by GetTotalOpCounter(); starts at 0
00151     std::list<SimdProfGroup>            _profileData;       ///< Profiling data for instrumented SimdProf groups
00152 
00153     std::map<BblId, std::pair<InsId, InsId> >         _bblInsInfo;   ///< Head and tail instructions per BBL
00154 
00155     typedef struct OpCounts { // Value type of _bblOpCounts
00156         uint64_t counts[32] = {}; // Zero-initialized; presumably one counter per channel - TODO confirm
00157         uint64_t total = 0;       // Zero-initialized; presumably the sum over 'counts' - TODO confirm
00158     } OpCounts;
00159 
00160     std::map<BblId, OpCounts>                         _bblOpCounts;  ///< Number of executed SIMD operations per BBL
00161 
00162     CountEnableChannelsFunc                           _countEnableChannelsFunc;        ///< Constructed with the name "CountEnableChannels"
00163     CountEnableChannelsForSendFunc                    _countEnableChannelsForSendFunc; ///< Constructed with the name "CountEnableChannelsForSend"
00164 };
00165 
00166 /* ============================================================================================= */
00167 // Class SimdProfHlif
00168 /* ============================================================================================= */
00169 /*!
00170  * Implementation of the IGtTool interface for the SimdProfHlif tool
00171  */
00172 class SimdProfHlif : public GtTool
00173 {
00174 public:
00175     /// Implementation of the IGtTool interface
00176     const char* Name() const { return "simdprof_hlif"; } ///< @return Name of the tool
00177 
00178     void OnKernelBuild(IGtKernelInstrument& instrumentor);
00179     void OnKernelRun(IGtKernelDispatch& dispatcher);
00180     void OnKernelComplete(IGtKernelDispatch& dispatcher);
00181 
00182 public:
00183     std::string ToString() const;                ///< @return Text representation of the profile data
00184     static SimdProfHlif* Instance();             ///< @return Single instance of this class
00185     static void OnFini() { Instance()->Fini(); } ///< Callback function registered with atexit(); forwards to Fini() of the single instance
00186     void   LoadHliLibrary();                     ///< Compile and load library of HLI functions
00187 
00188 protected:
00189     SimdProfHlif() = default;                                 // Protected: not constructible from outside the class hierarchy
00190     SimdProfHlif(const SimdProfHlif&) = delete;               // Copy construction is disabled
00191     SimdProfHlif& operator = (const SimdProfHlif&) = delete;  // Copy assignment is disabled
00192     ~SimdProfHlif() = default;
00193 
00194     void Fini();                                ///< Post process and dump profiling data
00195 
00196 private:
00197 
00198     /// Collection of kernel profiles, keyed by GtKernelId
00199     using KernelProfilers = std::map<GtKernelId, SimdProfKernelProfile>;
00200 
00201 private:
00202 
00203     KernelProfilers             _kernels;                    ///< Kernel profiles keyed by kernel identifier
00204     IGtHliModuleHandle          _hliModule = nullptr;        ///< Module of HLI functions
00205 };
00206 
00207 
00208 /* ============================================================================================= */
00209 // SimdProfArgs implementation
00210 /* ============================================================================================= */
00211 
00212 bool SimdProfArgs::operator < (const SimdProfArgs& other) const // Ordering used by the std::map containers keyed by SimdProfArgs
00213 {
00214     return std::make_tuple(maskCtrl, execMask, predicate, isSendIns) < // Lexicographic: maskCtrl first, then execMask, predicate, isSendIns
00215            std::make_tuple(other.maskCtrl, other.execMask, other.predicate, other.isSendIns);
00216 }
00217 
00218 /* ============================================================================================= */
00219 // SimdProfSection implementation
00220 /* ============================================================================================= */
00221 
00222 void SimdProfSection::AddInstruction(const IGtIns& ins) // Count 'ins' in the group that matches its SimdProfArgs
00223 {
00224     uint32_t    execMask    = ins.ExecMask().Bits();
00225     GtPredicate predicate   = ins.Predicate();
00226     bool        maskCtrl    = !ins.IsWriteMaskEnabled(); // Note the negation: maskCtrl is set when the write mask is NOT enabled
00227     bool        isSendIns   = ins.IsSendMessage();
00228 
00229     SimdProfArgs simdProfArgs(maskCtrl, execMask, predicate, isSendIns);
00230     // Find the group for these arguments; a new group is inserted with an instruction count of 0
00231     auto it = groups.emplace(simdProfArgs, 0).first;
00232     // A count of 0 means the group was just created: remember this instruction as the group's head
00233     if (it->second == 0)
00234     {
00235         groupHeadIds.emplace(simdProfArgs, ins.Id());
00236     }
00237 
00238     ++(it->second);
00239 }
00240 
00241 /* ============================================================================================= */
00242 // SimdprofKernelProfile implementation
00243 /* ============================================================================================= */
00244 
00245 SimdProfKernelProfile::SimdProfKernelProfile(const IGtKernel& kernel) : // Capture the kernel's descriptor fields from 'kernel'
00246     _name(GlueString(kernel.Name())), _extName(ExtendedKernelName(kernel)), _type(kernel.Type()), _platform(kernel.GpuPlatform()),
00247     _hashId(kernel.HashId()), _simd(kernel.SimdWidth()), _binarySignature(kernel.BinarySignature()),
00248     _countEnableChannelsFunc("CountEnableChannels"), _countEnableChannelsForSendFunc("CountEnableChannelsForSend") {} // Names match the functions declared in simdprof_hlif.h
00249 
00250 void SimdProfKernelProfile::Instrument(IGtKernelInstrument& instrumentor) // Split the kernel into SimdProf sections, instrument them, map counters, record BBL bounds
00251 {
00252     const IGtCfg&    cfg       = instrumentor.Cfg();
00253     IGtMemoryMapper& memMapper = instrumentor.MemoryMapper();
00254 
00255     std::vector<SimdProfSection> sections;      // All SimdProf sections in the kernel
00256 
00257     // Identify SimdProf sections and #groups in the kernel
00258     for (auto bblPtr : cfg.Bbls())
00259     {
00260         bool isSectionBegin = true; // Reset per BBL, so a section never spans two BBLs
00261 
00262         // Iterate through sections within the current BBL
00263         for (auto insPtr : bblPtr->Instructions())
00264         {
00265             const IGtIns& ins = *insPtr;
00266             // Skip instructions whose Id lies outside [knobMinInstrumentIns, knobMaxInstrumentIns]
00267             if (insPtr->Id() < uint32_t(knobMinInstrumentIns))
00268             {
00269                 continue;
00270             }
00271 
00272             if (insPtr->Id() > uint32_t(knobMaxInstrumentIns))
00273             {
00274                 continue;
00275             }
00276             // Open a new section at the first accepted instruction of the BBL or after a section end
00277             if (isSectionBegin)
00278             {
00279                 sections.emplace_back(bblPtr->Id(), ins);
00280                 isSectionBegin = false;
00281             }
00282 
00283             SimdProfSection& section = sections.back();
00284             section.AddInstruction(ins);
00285 
00286             // A section ends at a flag-modifying instruction, at the BBL's last instruction, or at the knobMaxInstrumentIns limit
00287             if (ins.IsFlagModifier() || (ins.Id() == bblPtr->LastIns().Id()) || (insPtr->Id() == uint32_t(knobMaxInstrumentIns))) //section end
00288             {
00289                 isSectionBegin = true;
00290             }
00291         }
00292     }
00293 
00294     // Instrument SimdProf sections and initialize the _profileData list
00295     for (auto& section : sections) { InstrumentSection(instrumentor, section); }
00296 
00297     std::map<uint32_t, uint32_t> bblGroupNumber; // NOTE(review): incremented below but never read in this function
00298     // Map the counter arguments of every group with GT_MMAP_SHARE
00299     for (auto& group : _profileData)
00300     {
00301         bblGroupNumber[group.bblId]++;
00302         memMapper.Map(group.simdProfCountArgs, GT_MMAP_SHARE);
00303     }
00304 
00305     // Save BBL information for the post processing phase
00306     for (auto bblPtr : cfg.Bbls())
00307     {
00308         _bblInsInfo.emplace(bblPtr->Id(), std::make_pair(bblPtr->FirstIns().Id(), bblPtr->LastIns().Id()));
00309     }
00310 }
00311 
00312 void SimdProfKernelProfile::WriteArgumentsToDevice(IGtMemoryMapper& memMapper)
00313 {
00314     for (auto& group : _profileData)
00315     {
00316         void* ptr = (void*)&(group.simdProfCountArgs);
00317         memset(ptr, 0, sizeof(SimdProfCountArgs));
00318         memMapper.Write(ptr, sizeof(SimdProfCountArgs));
00319     }
00320 }
00321 
00322 void SimdProfKernelProfile::InstrumentSection(IGtKernelInstrument& instrumentor, const SimdProfSection& section)
00323 {
00324     const IGtCfg&       cfg       = instrumentor.Cfg();
00325 
00326     // Instrument each SimdProf group:
00327     //  - If a group is associated with a non-SEND instructions, compute the SIMD count by applying CBIT to the SIMD mask.
00328     //  - Otherwise, if a group is created for SEND instructions, increment the SIMD count for each SEND whose SIMD mask
00329     //    is nonzero. From the EU perspective, SEND instruction is 1 operation, unless the SIMD mask is zero
00330     // Insert each per-group instrumentation procedure at the beginning of the corresponding section
00331 
00332     //Insert SimdProf instrumentaion at the beginning of the current section
00333     const IGtIns& ins = cfg.GetInstruction(section.headInsId);
00334     const IGtBbl& bbl = cfg.GetBbl(ins);
00335 
00336     for (auto& group : section.groups)
00337     {
00338     
00339         auto it = section.groupHeadIds.find(group.first); GTPIN_ASSERT(it != section.groupHeadIds.end());
00340         const IGtIns& groupHeadIns = cfg.GetInstruction(it->second);
00341         
00342         _profileData.emplace_back(bbl.Id(), group.second);
00343 
00344         SimdProfGroup&      newProfGroup = _profileData.back();
00345         SimdProfCountArgs*  countArgs = const_cast<SimdProfCountArgs*>(&newProfGroup.simdProfCountArgs);
00346         IargInsOpMask       accessMask(groupHeadIns);
00347 
00348         if (group.first.isSendIns)
00349         {
00350             _countEnableChannelsForSendFunc.InsertCallAtInstruction(instrumentor, groupHeadIns, GtIpoint::Before(),
00351                 NullReg(),          // Unused return value
00352                 accessMask,         // arg[1]: Per-channel mask of memory accesses
00353                 countArgs           // arg[2]: Counter arguments
00354             );
00355         }
00356         else
00357         {
00358             _countEnableChannelsFunc.InsertCallAtInstruction(instrumentor, groupHeadIns, GtIpoint::Before(),
00359                 NullReg(),          // Unused return value
00360                 accessMask,         // arg[1]: Per-channel mask of memory accesses
00361                 countArgs           // arg[2]: Counter arguments
00362             );
00363         }
00364     }
00365 }
00366 
00367 void SimdProfKernelProfile::ReadProfileData()
00368 {
00369     // Iterate through all SimdProf groups and read counters of executed operations (channels).
00370     for (auto& group : _profileData)
00371     {
00372         uint64_t groupCounts[32] = {};
00373 
00374         SimdProfCountArgs& args = group.simdProfCountArgs;
00375         uint32_t numOfInsts = group.insCount;
00376 
00377         for (uint32_t i = 0; i < 32; i++)
00378         {
00379             groupCounts[i] = args.out.channelCounts[i] * numOfInsts;
00380             _totalOpCount += groupCounts[i];
00381         }
00382         // Update counters of executed operations
00383         UpdateBblOpCounter(group.bblId._value, groupCounts);
00384     }
00385 }
00386 
00387 void SimdProfKernelProfile::UpdateBblOpCounter(BblId bblId, uint64_t* perChannelValues)
00388 {
00389     auto it = _bblOpCounts.emplace(bblId, OpCounts()).first;
00390 
00391     auto& opCounts = it->second;
00392 
00393     for (uint32_t i = 0; i < 32; ++i)
00394     {
00395         opCounts.counts[i] += perChannelValues[i];
00396         opCounts.total += perChannelValues[i];
00397     }
00398 }
00399 
00400 std::string SimdProfKernelProfile::ToString() const
00401 {
00402     ostringstream ostr;
00403     ostr << ExtendedName() << endl;
00404 
00405     if (!_bblOpCounts.empty())
00406     {
00407         ostr << setw(10) << "BBL" << setw(15) << "Head Ins ID" << setw(15) << "Tail Ins ID" << setw(20) << "Channels" << endl;
00408         for (const auto& bc : _bblOpCounts)
00409         {
00410             auto bblId = bc.first;
00411             uint32_t firstIns = _bblInsInfo.at(bc.first).first;
00412             uint32_t lastIns = _bblInsInfo.at(bc.first).second;
00413             ostr << setw(10) << bblId << setw(15) << firstIns << setw(15) << lastIns << setw(20) << bc.second.total << endl;
00414             if (bc.second.total)
00415             {
00416                 for (uint32_t i = 0; i < 32; ++i)
00417                 {
00418                     if (bc.second.counts[i])
00419                     {
00420                         ostr << setw(80) << "chan[" << setw(2) << i << "]: " << bc.second.counts[i] << endl;
00421                     }
00422                 }
00423             }
00424         }
00425         ostr << setw(10) << "Total" << setw(15) << _totalOpCount << endl;
00426     }
00427     else
00428     {
00429         ostr << "No channels executed" << endl;
00430     }
00431 
00432     return ostr.str();
00433 }
00434 
00435 /* ============================================================================================= */
00436 // SimdProfHlif implementation
00437 /* ============================================================================================= */
00438 SimdProfHlif* SimdProfHlif::Instance()
00439 {
00440     static SimdProfHlif instance;
00441     return &instance;
00442 }
00443 
00444 void SimdProfHlif::OnKernelBuild(IGtKernelInstrument& instrumentor)
00445 {
00446     const IGtKernel& kernel = instrumentor.Kernel();
00447     auto it = _kernels.emplace(kernel.Id(), kernel).first;
00448     it->second.Instrument(instrumentor);
00449     
00450     // Link the kernel with the library of HLI functions
00451     instrumentor.LinkHliModule(_hliModule);
00452 }
00453 
00454 void SimdProfHlif::OnKernelRun(IGtKernelDispatch& dispatcher)
00455 {
00456     const IGtKernel& kernel = dispatcher.Kernel();
00457     GtKernelExecDesc execDesc; dispatcher.GetExecDescriptor(execDesc);
00458     if (kernel.IsInstrumented() && IsKernelExecProfileEnabled(execDesc, kernel.GpuPlatform(), kernel.Name().Get()))
00459     {
00460         dispatcher.SetProfilingMode(true);  // Enable instrumentation
00461 
00462         // This tool needs an accurate information about memory allocations, which is available on the the final dispatch stage.
00463         // So, on the initial dispatch stage, we only enable instrumentation, and request GTPin to invoke OnKernelRun
00464         // one more time, on the final dispatch stage. If this request is accepted, the initialization of the profile buffer will
00465         // be done on the final dispatch stage, otherwise - on the intial dispatch stage.
00466         if (dispatcher.ReportFinalDispatchStage())
00467         {
00468             return;
00469         }
00470 
00471         auto it = _kernels.find(kernel.Id());
00472 
00473         if (it != _kernels.end())
00474         {
00475             SimdProfKernelProfile& kernelProfile = it->second;
00476             kernelProfile.WriteArgumentsToDevice(dispatcher.MemoryMapper());
00477         }
00478     }
00479     else
00480     {
00481         dispatcher.SetProfilingMode(false);
00482     }
00483 }
00484 
00485 void SimdProfHlif::OnKernelComplete(IGtKernelDispatch& dispatcher)
00486 {
00487     if (!dispatcher.IsProfilingEnabled())
00488     {
00489         return; // Do nothing with unprofiled kernel dispatches
00490     }
00491 
00492     const IGtKernel& kernel = dispatcher.Kernel();
00493     auto it = _kernels.find(kernel.Id());
00494 
00495     if (it != _kernels.end())
00496     {
00497         SimdProfKernelProfile& kernelProfile = it->second;
00498         kernelProfile.ReadProfileData();
00499     }
00500 }
00501 
00502 void SimdProfHlif::LoadHliLibrary()
00503 {
00504     std::string modulePath = JoinPath(GetKnobValue<std::string>("installDir"), "Examples", "simdprof_hlif.cl");
00505     _hliModule = GTPin_GetCore()->HliLibrary().CompileModuleFromFile(modulePath.c_str());
00506     GTPIN_ASSERT_MSG(_hliModule != nullptr, "Could not load HLI module " + modulePath);
00507 }
00508 
00509 void SimdProfHlif::Fini()
00510 {
00511     string profileDir = GTPin_GetCore()->ProfileDir();
00512     string filePath = JoinPath(profileDir, "simdprof_hlif.txt");
00513 
00514     ofstream fs(filePath);
00515     if (fs.is_open())
00516     {
00517         fs << ToString();
00518         fs.close();
00519     }
00520     else
00521     {
00522         GTPIN_WARNING("SIMDPROF_HLIF : could not create file: " + filePath);
00523     }
00524 }
00525 
00526 string SimdProfHlif::ToString() const
00527 {
00528     ostringstream ostr;
00529     ostr << "Channels (SIMD operations) executed by kernels/BBLs" << endl;
00530     ostr << "===================================================" << endl;
00531     
00532     uint64_t totalOpCount = 0;
00533     for (const auto& k : _kernels)
00534     {
00535         ostr << string(100, '-') << endl;
00536         ostr << k.second.ToString() << endl;
00537         totalOpCount += k.second.GetTotalOpCounter();
00538     }
00539     ostr << "Total number of kernels:                    " << _kernels.size() << std::endl;
00540     ostr << "Total number of channels (SIMD operations): " << totalOpCount << std::endl;
00541 
00542     return ostr.str();
00543 }
00544 
00545 // Define DETACHED_SIMDPROF to use SimdProf functionality in a different tool
00546 #if !defined (DETACHED_SIMDPROF)
00547 /* ============================================================================================= */
00548 // GTPin_Entry
00549 /* ============================================================================================= */
00550 EXPORT_C_FUNC void GTPin_Entry(int argc, const char* argv[])
00551 {
00552     ConfigureGTPin(argc, argv);
00553     SimdProfHlif::Instance()->Register();
00554     
00555     // Compile and load library of HLI functions
00556     SimdProfHlif::Instance()->LoadHliLibrary();
00557 
00558     atexit(SimdProfHlif::OnFini);
00559 }
00560 #endif

simdprof_hlif.cl - HLI function implementations in OpenCL.

00001 /*========================== begin_copyright_notice ============================
00002 Copyright (C) 2024-2026 Intel Corporation
00003 
00004 SPDX-License-Identifier: MIT
00005 ============================= end_copyright_notice ===========================*/
00006 
00007 /*!
00008  * @file Library of High-Level Instrumentation (HLI) functions used by the simdprof_hlif tool
00009  */
00010 
00011 #include "hlif_basic_defs.h"
00012 #include "simdprof_hlif.h"
00013 
00014 
00015 #define PER_CHANNEL
00016 
00017 /*!
00018  * @brief HLI function that counts enabled channels
00019  * @see simdprof_hlif.h for details
00020  */
00021 IGC_STACK_CALL void CountEnableChannels(uint32_t accessMask, __global SimdProfCountArgs* counter)
00022 {
00023 #ifndef  PER_CHANNEL
00024     uint64_t channelCount = (uint64_t)popcount(accessMask);
00025     atom_add(&(counter->out.channelCounts[0]), channelCount);
00026 #else
00027     if (accessMask != 0)
00028     {
00029         for (uint32_t cIndx = 0; cIndx < 32; ++cIndx)
00030         {
00031             if ((accessMask & (0x1 << cIndx)) != 0)
00032             {
00033                 atomic_inc(&(counter->out.channelCounts[cIndx]));
00034             }
00035         }
00036     }
00037 #endif
00038 }
00039 
00040 /*!
00041  * @brief HLI function that counts enabled channels
00042  * @see simdprof_hlif.h for details
00043  */
00044 IGC_STACK_CALL void CountEnableChannelsForSend(uint32_t accessMask, __global SimdProfCountArgs* counter)
00045 {
00046     if (accessMask != 0)
00047     {
00048         atomic_inc(&(counter->out.channelCounts[0]));
00049     }
00050 }

(Back to the list of all GTPin Sample Tools)


 All Data Structures Functions Variables Typedefs Enumerations Enumerator


  Copyright (C) 2013-2025 Intel Corporation
SPDX-License-Identifier: MIT