GTPin
GTPin: Funlatency Sample Tool

The Funlatency tool measures the number of cycles required to execute functions called by the kernel.

Running Funlatency tool

To run Funlatency with the default settings, run:

Profilers/Bin/gtpin -t funlatency -- app

How to understand Funlatency results

When Funlatency runs with the default configuration, it generates a directory named GTPIN_PROFILE_FUNLATENCY Profiling results are saved by GTPin in the file GTPIN_PROFILE_FUNLATENCY\Session_Final\funlatency.txt. The results are presented in the following format:

### Kernel/Shader execution-time profile generated by GTPin ###

Legend:
NA - kernel was not instrumented.
         Kernel Name              HashID                 Function Name           Inst          Freq.       Total-latency    Total-latency(%)          Avg-Cycles      Type      SIMD       Platform               Execution descriptor
                  pi    3650b0add9d1a432                      pi(main)        [0-116]           6250          3408987510               26.35              545438        CS         8         OpenCL    2         0         0         0
                  pi    3650b0add9d1a432                          func      [123-294]          62500          3145307667               24.31               50324        CS         8         OpenCL    2         0         0         0
                  pi    3650b0add9d1a432                        funcX2      [295-466]         125000          6381884664               49.33               51055        CS         8         OpenCL    2         0         0         0

Each line represents a single run (dispatch to HW device) of specific kernel, where the fields have the following meaning:

If the function name is not recognized by GTPin, it generates an artificial name in the format: EmbeddedFunc_***

(Back to the list of all GTPin Sample Tools)

funlatency.h

00001 /*========================== begin_copyright_notice ============================
00002 Copyright (C) 2024-2025 Intel Corporation
00003 
00004 SPDX-License-Identifier: MIT
00005 ============================= end_copyright_notice ===========================*/
00006 
00007 /*!
00008  * @file FunLatency Tool definitions
00009  */
00010 
00011 #ifndef FUNLATENCY_H_
00012 #define FUNLATENCY_H_
00013 
00014 #include <map>
00015 #include <set>
00016 #include <string>
00017 
00018 #include "gtpin_api.h"
00019 #include "gtpin_tool_utils.h"
00020 
00021 using namespace gtpin;
00022 
00023 /// Identifier of the instrumentation site : function ID
00024 using InstrumentSiteId = FunctionId;
00025 
00026 /// Array of instrumentation site IDs, indexed by ordinal numbers of corresponding records in the profile buffer
00027 using InstrumentSites = std::vector<InstrumentSiteId>;
00028 
00029 /* ============================================================================================= */
00030 // Struct FunLatencyRecord
00031 /* ============================================================================================= */
00032 /*!
00033  * Layout of records collected in profile buffer by the Latency tool
00034  */
00035 struct alignas(uint64_t) FunLatencyRecord
00036 {
00037     uint64_t cycles;    ///< Total number of cycles
00038     uint32_t freq;      ///< Total number of executions
00039 };
00040 
00041 /* ============================================================================================= */
00042 // Class FunLatencyDispatchProfile
00043 /* ============================================================================================= */
00044 /*!
00045  * Profiling data collected during a single kernel dispatch
00046  */
00047 struct  FunLatencyDispatchProfile
00048 {
00049     FunLatencyDispatchProfile(const IGtKernelDispatch& kernelDispatch, InstrumentSiteId siteId);
00050     void Accumulate(const FunLatencyRecord& record);
00051 
00052     InstrumentSiteId  siteId;         ///< Identifier of the instrumentation site: FuntionID
00053     GtKernelExecDesc  kernelExecDesc; ///< Kernel execution descriptor
00054     uint64_t          cycles;         ///< Total number of cycles
00055     uint64_t          freq;           ///< Total number of executions
00056 };
00057 
00058 /* ============================================================================================= */
00059 // struct FunctionDescriptor
00060 /* ============================================================================================= */
00061 /*!
00062  * Static information about the function
00063  */
00064 struct FunctionDescriptor
00065 {
00066     FunctionDescriptor(const FunctionId fId, const std::string fName,
00067         const bool fIsMain, const InsId firstIns, const InsId lastIns, const uint32_t recNum);
00068 
00069     FunctionId      id;         ///< Identifier of the function
00070     std::string     name;       ///< function name
00071     bool            isMain;     ///< true for main kernel function
00072     InsId           firstInsId; ///< first Instruction in the function 
00073     InsId           lastInsId;  ///< last Instruction in the function 
00074     uint32_t        recordNum;  ///< record number in profile array
00075     GtReg           freqReg;    ///< Allocated VReg for freq count
00076     GtReg           cycleReg;    ///< Allocated VReg for cycles count
00077 };
00078 
00079 /* ============================================================================================= */
00080 // Class FunLatencyKernelProfile
00081 /* ============================================================================================= */
00082 /*!
00083  * Aggregated profile of all instrumented kernel dispatches
00084  */
00085 class FunLatencyKernelProfile
00086 {
00087 public: 
00088     typedef std::map<FunctionId, FunctionDescriptor>  FunctionMap;
00089     FunLatencyKernelProfile(const IGtKernel& kernel, const IGtCfg& cfg,
00090         const GtProfileArray& profileArray, InstrumentSites&& instrumentSites, FunctionMap&& functions);
00091     
00092     /// Add the specified profile of a kernel dispatch
00093     FunLatencyDispatchProfile& AddDispatchProfile(const FunLatencyDispatchProfile& dispatchProfile);
00094 
00095     std::string             ToString()        const;                          ///< @return Text representation of the profile data
00096     void                    DumpAsm()         const;                          ///< Dump kernel's assembly text to file
00097     std::string             GetName()         const { return _name; }         ///< @return  Kernel's name
00098     const GtProfileArray&   GetProfileArray() const { return _profileArray; } ///< @return Profile buffer accessor
00099     
00100     /// Given a record number in the profile buffer, return corresponding ID of the instrumentation site   
00101     InstrumentSiteId        GetSiteId(uint32_t recordNum) const;
00102 
00103     /// @return Collection of profiles per kernel dispatch
00104     typedef std::list<FunLatencyDispatchProfile>  DispatchProfiles;
00105     const DispatchProfiles& GetDispatchProfiles() const { return _dispatchProfiles; }
00106 
00107 private:
00108     std::string        _name;              ///< Kernel's name
00109     std::string        _uniqueName;        ///< Kernel's unique name (IGC style)
00110     GtKernelType       _type;              ///< Kernel's type
00111     GtGpuPlatform      _platform;          ///< Kernel's platform
00112     uint64_t           _hashId;            ///< Kernel's hash identifier
00113     GtSimdWidth        _simd;              ///< Kernel's SIMD width
00114     std::string        _asmText;           ///< Kernel's assembly text
00115     GtProfileArray     _profileArray;      ///< Profile buffer accessor
00116     uint64_t           _totalKernelCycles; ///< Total cycles of the kernel
00117     InstrumentSites    _instrumentSites;   ///< Array of instrumentation site IDs
00118     FunctionMap        _functionMap;       ///< Map each function ID to it's Function descriptor
00119     DispatchProfiles   _dispatchProfiles;  ///< Profiles per kernel dispatch
00120 };
00121 
00122 /* ============================================================================================= */
00123 // Class FunLatency
00124 /* ============================================================================================= */
00125 /*!
00126  * Implementation of the IGtTool interface for the FunLatency tool
00127  */
00128 class FunLatency : public GtTool
00129 {
00130 public:
00131     /// Implementation of the IGtTool interface
00132     const char* Name() const { return "FunLatency"; }
00133 
00134     void OnKernelBuild(IGtKernelInstrument& instrumentor);
00135     void OnKernelRun(IGtKernelDispatch& dispatcher);
00136     void OnKernelComplete(IGtKernelDispatch& dispatcher);
00137 
00138 public:
00139     std::string ToString() const;                ///< @return Text representation of the profile data
00140     void        DumpAsm()  const;                ///< Dump assembly text of profiled kernels to files
00141 
00142     static FunLatency* Instance();               ///< @return Single instance of this class
00143     static void OnFini() { Instance()->Fini(); } ///< Callback function registered with atexit()
00144 protected:
00145     FunLatency() = default;
00146     FunLatency(const FunLatency&) = delete;
00147     FunLatency& operator = (const FunLatency&) = delete;
00148     ~FunLatency() = default;
00149 
00150     /// @return Collection of kernel profiles
00151     typedef std::map<GtKernelId, FunLatencyKernelProfile>  KernelProfiles;
00152     const KernelProfiles& GetKernelProfiles() const { return _kernels; }
00153 
00154     /// Post process and dump profiling data
00155     void Fini();
00156 private:
00157     /// Generate instrumentation for functions
00158     void InstrumentFunctions(IGtKernelInstrument& instrumentor);
00159     
00160     /// Generate instrumentation to be inserted before the code fragment whose latency is measured
00161     void GeneratePreCode(GtGenProcedure& proc, const IGtGenCoder& coder);
00162 
00163     /*!
00164      * Generate instrumentation that computes latency and stores the result in the profile buffer.
00165      *  - GeneratePostCodeMemBound  - Generate memory-bound instrumentation
00166      *  - GeneratePostCodeRegBound  - Generate register-bound instrumentation. Return success/failure status
00167      *
00168      * @param[in, out] postProc         Procedure to be inserted after the code fragment whose latency is measured
00169      * @param[in, out] finiProc         Procedure to be inserted before each EOT
00170      * @param[in]      coder            GEN code generator
00171      * @param[in]      profileArray     Array of 'FunLatencyRecord's in the profile buffer
00172      * @param[in]      funcDesc         pointer to the decriptor of the function to be instrumented
00173      * @param[in]      isFuncEntry      true if the BBL is the first BBL in the Function
00174      */
00175     bool GeneratePostCodeRegBound(GtGenProcedure& postProc, GtGenProcedure& finiProc, const IGtGenCoder& coder,
00176         const GtProfileArray& profileArray, FunctionDescriptor* funcDesc, bool isFuncEntry, bool updateFrequency, uint32_t simdMask);
00177     void GeneratePostCodeMemBound(GtGenProcedure& proc, const IGtGenCoder& coder,
00178         const GtProfileArray& profileArray, FunctionDescriptor* funcDesc, bool updateFrequency, uint32_t simdMask);
00179 
00180     /// @return true/false - use 64-bit/32-bit integer for the cycle counter
00181     static bool Use64BitCounters(const IGtGenCoder& coder);
00182 
00183 private:
00184     KernelProfiles  _kernels;   ///< Collection of kernel profiles
00185 
00186     GtReg _addrReg;             ///< Virtual register that holds address within profile buffer
00187     GtReg _dataReg;             ///< Virtual register that holds data to be read from/written to profile buffer 
00188     GtReg _timeReg;             ///< Virtual timer register
00189 };
00190 
00191 #endif

funlatency.cpp

00001 /*========================== begin_copyright_notice ============================
00002 Copyright (C) 2024-2026 Intel Corporation
00003 
00004 SPDX-License-Identifier: MIT
00005 ============================= end_copyright_notice ===========================*/
00006 
00007 /*!
00008  * @file Implementation of the FunLatency Tool
00009  */
00010 #include <fstream>
00011 #include <sstream>
00012 #include <iomanip>
00013 #include <algorithm>
00014 
00015 #include "funlatency.h"
00016 
00017 using namespace gtpin;
00018 using namespace std;
00019 
00020 /* ============================================================================================= */
00021 // Configuration
00022 /* ============================================================================================= */
00023 Knob<int>  knobNumThreadBuckets("num_thread_buckets", 0, "Number of thread buckets. 0 (default) - maximum thread buckets");
00024 Knob<bool> knobUseRegInstrument("no_reg_instrument", true, "Disable register-bound instrumentation");
00025 Knob<bool> knobSkipZeroResults("skip_zero_results", false, "Skip zero results in the Funlatency output");
00026 Knob<bool> knobIncludeZeroExecMask("include_zero_exec_mask", false, "Include measurements of functions called with zero exec mask");
00027 
00028 /* ============================================================================================= */
00029 // FunLatency implementation
00030 /* ============================================================================================= */
00031 FunLatency* FunLatency::Instance()
00032 {
00033     static FunLatency instance;
00034     return &instance;
00035 }
00036 
00037 void FunLatency::OnKernelBuild(IGtKernelInstrument& instrumentor)
00038 {
00039     /*
00040      * Register allocation policy
00041      * ======================================================================================================
00042      * To ensure accuracy of latency measurments, the instrumentation overhead should be as low as possible.
00043      * It is believed that register-bound procedures incur lower overhead than memory-bound procedures.
00044      * Assuming that, we will try to use register-bound instrumentation for as many sites as possible, and
00045      * apply memory-bound instrumentation to the rest of the sites.
00046      *
00047      * To check and guarantee availability of free physical registers we pre-allocate registers
00048      * in the register-bound procedures created by the GeneratePostCodeRegBound() function.
00049      * For this purpose we use the IGtRegAllocator::ReserveVregOperands() interface.
00050      *
00051      * For memory-bound procedures, pre-allocation of registers is not needed, and even can be harmful, as
00052      * explained in the IGtRegAllocator description. On the other hand, excessive pre-allocation of registers
00053      * in the register-bound procedures may cause multiple spills/fills in memory-bound procedures. To avoid
00054      * this situation, we do the following:
00055      *      A) RESERVE registers in memory-bound procedures BEFORE generating register-bound procedures using
00056      *         IGtRegAllocator::Reserve()
00057      *      B) RELEASE registers in memory-bound procedures AFTER generating register-bound procedures using
00058      *         IGtRegAllocator::ReleaseReserved()
00059      * 
00060      * Function Latency measurment algorithm 
00061      * ======================================================================================================
00062      * Count the cycles it takes for each function to execute from the first instruction to the end of the function
00063      * by starting the timer before the first instruction in each function, and stopping it before the last
00064      * instruction in the function.
00065      * If a function calls other functions, in order to exclude the callee's total cycles from the caller's
00066      * cycles count, the tool will stop the timer before "call" instruction,count the cycels,and restart it 
00067      * after the call instruction. 
00068      * for example :
00069      *   //   fun1:                     fun1:                        //
00070      *   //    [...             \\       start the timer (preCode)   //
00071      *   //    ...          =====\\      [...                        //
00072      *   //    ...                \\     ...]                        //
00073      *   //    ...]                \\    stop the timer (postCode)   //
00074      *   //    call fun2           //    call fun2                   //
00075      *   //    [...               //     start the timer             //
00076      *   //    ...          =====//      [...                        //
00077      *   //    ...              //       ...]                        //
00078      *   //    ...]                      stop the timer              //
00079      *   //    ret                       ret                         //
00080      *
00081      *   fun1 total cycles measurment will be done in two parts .
00082      *   (A) count the total cycles from the function entry to the call ins
00083      *   (B) count the total cycles after call ins to the end of the function
00084      *   fun1 total cycles = A + B 
00085      *  
00086      */
00087 
00088     const IGtGenCoder& coder          = instrumentor.Coder();
00089     IGtVregFactory&    vregs          = coder.VregFactory();
00090     IGtRegAllocator&   ra             = coder.RegAllocator();
00091     bool               is64BitCounter = Use64BitCounters(coder);
00092 
00093     // Initialize virtual registers
00094     _timeReg = vregs.Make(VREG_TYPE_DWORD);
00095     _addrReg = vregs.MakeMsgAddrScratch();
00096     _dataReg = vregs.MakeMsgDataScratch(is64BitCounter ? VREG_TYPE_QWORD : VREG_TYPE_DWORD);
00097 
00098     // A) Reserve registers used in the memory-bound procedures
00099     ra.Reserve(_dataReg.VregNumber());
00100     ra.Reserve(_addrReg.VregNumber());
00101     ra.Reserve(vregs.GetProfileBufferAddrVreg().Num());
00102 
00103     InstrumentFunctions(instrumentor);
00104 
00105     // B) Release registers used in the memory-bound procedures
00106     ra.ReleaseReserved(_dataReg.VregNumber());
00107     ra.ReleaseReserved(_addrReg.VregNumber());
00108     ra.ReleaseReserved(vregs.GetProfileBufferAddrVreg().Num());
00109 }
00110 
00111 void FunLatency::InstrumentFunctions(IGtKernelInstrument& instrumentor)
00112 {
00113     const IGtKernel&           kernel    = instrumentor.Kernel();
00114     const IGtCfg&              cfg       = instrumentor.Cfg();
00115     const IGtGenCoder&         coder     = instrumentor.Coder();
00116     const IGtGenModel&         genModel  = kernel.GenModel();
00117     IGtProfileBufferAllocator& allocator = instrumentor.ProfileBufferAllocator();
00118     
00119     uint32_t simdMask;
00120     switch (kernel.SimdWidth())
00121     {
00122     case 8:  simdMask = 0xFF; break;
00123     case 16: simdMask = 0xFFFF; break;
00124     case 32: simdMask = 0xFFFFFFFF; break;
00125     default: simdMask = 0; break;
00126     }
00127 
00128     uint32_t numRecords = cfg.NumFunctions();   // numRecords = number of functions
00129 
00130     // Insert code that stops timer at BBL's exit and stores result in the profile buffer
00131     GtGenProcedure postCode;
00132     GtProfileArray profileArray(sizeof(FunLatencyRecord), numRecords, genModel.MaxThreadBuckets());
00133     profileArray.Allocate(allocator);
00134 
00135     GtGenProcedure finiCode;                                // Procedure that stores results of register-bound measurments in the profile buffer
00136     bool           tryRegInstrument = knobUseRegInstrument; // true - try to apply register-bound instrumentation
00137 
00138     std::map<FunctionId, FunctionDescriptor> functions;
00139 
00140     // Iterate over the function instrument the first and the last BBl in each function.
00141     uint32_t idx = 0;
00142     for (auto func : cfg.Functions())
00143     {
00144         // Start the timer in each function entry 
00145         const IGtBbl& firstBbl = *func->Bbls().first();
00146         const IGtIns& firstIns = firstBbl.FirstIns();
00147         const IGtBbl& lastBbl  = *func->Bbls().last();
00148         const IGtIns& lastIns  = lastBbl.LastIns();
00149 
00150         // Insert code that starts timer at BBL's entry
00151         GtGenProcedure preCode;
00152         GeneratePreCode(preCode, coder);
00153         InstrumentBbl(instrumentor, firstBbl, GtIpoint::Before(), preCode);
00154 
00155         bool isMain  = func->IsKernelMain();
00156         bool isEntry = true;
00157 
00158         // Insert FunctionDescriptor for the function into the kernel functions map
00159         functions.emplace(std::piecewise_construct,
00160             std::forward_as_tuple(func->Id()),
00161             std::forward_as_tuple(func->Id(), func->Name(), isMain, firstIns.Id(), lastIns.Id(), idx));
00162 
00163         bool instrumentAfterCall = false; // Insert precode Instrumentation after call instruction
00164         for (auto bbl : func->Bbls())
00165         {
00166             if (instrumentAfterCall)
00167             {
00168                 GtGenProcedure precode;
00169                 GeneratePreCode(precode, coder);
00170                 InstrumentBbl(instrumentor, *bbl, GtIpoint::Before(), precode);
00171                 instrumentAfterCall = false;
00172             }
00173 
00174             const IGtIns& lastBblIns = bbl->LastIns();
00175 
00176             if (bbl->IsEot() || (!isMain && (lastBblIns.Opcode() == GED_OPCODE_ret)))
00177             {
00178 
00179                 // Insert code that stops timer at the last BBL's exit and stores result in the profile buffer
00180                 GtGenProcedure postCode;
00181 
00182                 // Insert fake consumers of source registers to expose hidden latency of EOT instructions
00183                 if (bbl->IsEot())
00184                 {
00185                     coder.GenerateFakeSrcConsumers(postCode, lastBblIns);
00186                 }
00187 
00188                 if (tryRegInstrument)
00189                 {
00190                     tryRegInstrument = GeneratePostCodeRegBound(postCode, finiCode, coder, profileArray, &functions.at(func->Id()), isEntry, true, simdMask);
00191                 }
00192                 if (!tryRegInstrument)
00193                 {
00194                     GeneratePostCodeMemBound(postCode, coder, profileArray, &functions.at(func->Id()), true, simdMask);
00195                 }
00196                 InstrumentInstruction(instrumentor, lastBblIns, GtIpoint::Before(), postCode);
00197                 idx++;
00198                 isEntry = false;
00199                 break;
00200             }
00201             else if (lastBblIns.Opcode() == GED_OPCODE_call || lastBblIns.Opcode() == GED_OPCODE_calla)
00202             {
00203                 // Insert code that stops timer at the last BBL's exit and stores result in the profile buffer
00204                 GtGenProcedure postCode;
00205 
00206                 if (tryRegInstrument)
00207                 {
00208                     tryRegInstrument = GeneratePostCodeRegBound(postCode, finiCode, coder, profileArray, &functions.at(func->Id()), isEntry, false, simdMask);
00209                 }
00210                 if (!tryRegInstrument)
00211                 {
00212                     GeneratePostCodeMemBound(postCode, coder, profileArray, &functions.at(func->Id()), false, simdMask);
00213                 }
00214                 InstrumentInstruction(instrumentor, lastBblIns, GtIpoint::Before(), postCode);
00215 
00216                 instrumentAfterCall = true;
00217                 isEntry = false;
00218             }
00219         }
00220     }
00221     // Insert 'finiCode' at all exits of the kernel - before each EOT instruction
00222     instrumentor.InstrumentExits(finiCode);
00223 
00224     // Transform functions into array of instrumentation sites
00225     InstrumentSites sites(numRecords);
00226     std::transform(cfg.Functions().begin(), cfg.Functions().end(), sites.begin(), [](const IGtFunction* f) -> FunctionId { return f->Id(); });
00227 
00228     // Create FunLatencyKernelProfile object that represents profile of this kernel
00229     _kernels.emplace(kernel.Id(), FunLatencyKernelProfile(kernel, cfg, profileArray, std::move(sites), std::move(functions)));
00230 }
00231 
00232 void FunLatency::OnKernelRun(IGtKernelDispatch& dispatcher)
00233 {
00234     bool             isProfileEnabled = false;
00235     const IGtKernel& kernel           = dispatcher.Kernel();
00236     GtKernelExecDesc execDesc; dispatcher.GetExecDescriptor(execDesc);
00237     if (kernel.IsInstrumented() && IsKernelExecProfileEnabled(execDesc, kernel.GpuPlatform(), kernel.Name().Get()))
00238     {
00239         auto it = _kernels.find(kernel.Id());
00240 
00241         if (it != _kernels.end())
00242         {
00243             IGtProfileBuffer* buffer = dispatcher.CreateProfileBuffer(); GTPIN_ASSERT(buffer);
00244             FunLatencyKernelProfile& kernelProfile = it->second;
00245             const GtProfileArray& profileArray = kernelProfile.GetProfileArray();
00246             if (profileArray.Initialize(*buffer))
00247             {
00248                 isProfileEnabled = true;
00249             }
00250             else
00251             {
00252                 GTPIN_ERROR_MSG(string("FunLatency : ") + string(kernel.Name()) + " : Failed to write into memory buffer");
00253             }
00254         }
00255     }
00256     dispatcher.SetProfilingMode(isProfileEnabled);
00257 }
00258 
00259 void FunLatency::OnKernelComplete(IGtKernelDispatch& dispatcher)
00260 {
00261     const IGtKernel& kernel = dispatcher.Kernel();
00262     GtKernelExecDesc execDesc; dispatcher.GetExecDescriptor(execDesc);
00263     bool isProfilingEnabled = dispatcher.IsProfilingEnabled();
00264     if (!isProfilingEnabled || !IsKernelExecProfileEnabled(execDesc, kernel.GpuPlatform(), kernel.Name().Get()))
00265     {
00266         return; // Do nothing with unprofiled kernel dispatches
00267     }
00268 
00269     auto it = _kernels.find(kernel.Id());
00270 
00271     if (it != _kernels.end())
00272     {
00273         const IGtProfileBuffer* buffer = dispatcher.GetProfileBuffer(); GTPIN_ASSERT(buffer);
00274         FunLatencyKernelProfile& kernelProfile = it->second;
00275         const GtProfileArray& profileArray = kernelProfile.GetProfileArray();
00276 
00277         for (uint32_t recordNum = 0; recordNum != profileArray.NumRecords(); ++recordNum)
00278         {
00279             FunLatencyDispatchProfile dispatchProfile(dispatcher, kernelProfile.GetSiteId(recordNum));
00280 
00281             for (uint32_t threadBucket = 0; threadBucket < profileArray.NumThreadBuckets(); ++threadBucket)
00282             {
00283                 FunLatencyRecord record;
00284                 if (!profileArray.Read(*buffer, &record, recordNum, 1, threadBucket))
00285                 {
00286                     GTPIN_ERROR_MSG(string("FunLatency : ") + string(kernel.Name()) + " : Failed to read from memory buffer");
00287                 }
00288                 else
00289                 {
00290                     dispatchProfile.Accumulate(record);
00291                 }
00292             }
00293             kernelProfile.AddDispatchProfile(dispatchProfile);
00294         }
00295     }
00296 }
00297 
00298 bool FunLatency::Use64BitCounters(const IGtGenCoder& coder)
00299 {
00300     return coder.InstructionFactory().CanAccessAtomically(GED_DATA_TYPE_uq);
00301 }
00302 
00303 void FunLatency::GeneratePreCode(GtGenProcedure& proc, const IGtGenCoder& coder)
00304 {
00305     coder.StartTimer(proc, _timeReg);
00306     if (!proc.empty()) { proc.front()->AppendAnnotation(__func__); }
00307 }
00308 
00309 bool FunLatency::GeneratePostCodeRegBound(GtGenProcedure& postProc, GtGenProcedure& finiProc, const IGtGenCoder& coder,
00310     const GtProfileArray& profileArray, FunctionDescriptor* funcDesc, bool isFuncEntry, bool updateFrequency, uint32_t simdMask)
00311 {
00312     IGtInsFactory&   insF  = coder.InstructionFactory();
00313     IGtVregFactory&  vregs = coder.VregFactory();
00314     IGtRegAllocator& ra    = coder.RegAllocator();
00315     bool            is64BitCounter = Use64BitCounters(coder);
00316 
00317     GtReg freqReg;
00318     GtReg cycleReg;
00319     if (isFuncEntry)
00320     {
00321         funcDesc->freqReg = vregs.MakeCounter(VREG_TYPE_DWORD);                                         // Frequency counter
00322         funcDesc->cycleReg = vregs.MakeCounter(is64BitCounter ? VREG_TYPE_QWORD : VREG_TYPE_DWORD);      // Cycle counter
00323     }
00324 
00325     freqReg  = funcDesc->freqReg;
00326     cycleReg = funcDesc->cycleReg;
00327 
00328     GtReg       flagReg = FlagReg(0);
00329     GtPredicate pred(flagReg);
00330 
00331     GtReg cycleRegL = { cycleReg, sizeof(uint32_t), 0 };                                        // Low 32-bits of cycle counter
00332     GtReg cycleRegH = { cycleReg, sizeof(uint32_t), 1 };                                        // High 32-bits of cycle counter
00333     GtReg dataRegL  = { _dataReg, sizeof(uint32_t), 0 };                                        // Low 32-bits of '_dataReg'
00334 
00335     // Generate procedure that computes and aggregates cycles and frequency
00336     GtGenProcedure proc;
00337     coder.StopTimerExt(proc, _timeReg);
00338 
00339     proc += insF.MakeAdd(cycleRegL, cycleRegL, _timeReg); // Add elapsed time to lower 32-bits of of 'cycleReg'
00340     if (is64BitCounter)
00341     {
00342         // If cycleRegL overflowed, increment the high 32-bits of 'cycleReg'
00343         proc += insF.MakeCmp(GED_COND_MODIFIER_l, flagReg, cycleRegL, _timeReg);
00344         proc += insF.MakeAdd(cycleRegH, cycleRegH, 1).SetPredicate(pred);
00345     }
00346 
00347     GtReg tmpReg  = vregs.MakeScratch();
00348     GtReg tmpReg1 = _timeReg;
00349     if (!knobIncludeZeroExecMask)
00350     {
00351         proc += insF.MakeMov(tmpReg, ChannelEnableReg());
00352         proc += insF.MakeMov(tmpReg1, DispatchMaskReg());
00353         proc += insF.MakeAnd(tmpReg, tmpReg1, tmpReg);
00354         proc += insF.MakeAnd(tmpReg, tmpReg, simdMask);
00355         proc += insF.MakeCmp(GED_COND_MODIFIER_z, flagReg, tmpReg, 0);
00356     }
00357     if (updateFrequency)
00358     {
00359         proc += insF.MakeAdd(freqReg, freqReg, 1);  // freq++
00360         if (!knobIncludeZeroExecMask)
00361         {
00362             proc.back()->SetPredicate(!pred);
00363         }
00364     }
00365 
00366     if (!knobIncludeZeroExecMask)
00367     {
00368         proc += insF.MakeMov(cycleRegL, 0).SetPredicate(pred); // Clear the low 32-bits of 'cycleReg'
00369         if (is64BitCounter)
00370         {
00371             proc += insF.MakeMov(cycleRegH, 0).SetPredicate(pred); // Clear the high 32-bits of 'cycleReg'
00372         }
00373     }
00374 
00375     if (isFuncEntry && !ra.ReserveVregOperands(proc))
00376     {
00377         return false; // No more free registers
00378     }
00379     proc.front()->AppendAnnotation(__func__);
00380     postProc.MoveAfter(std::move(proc));
00381 
00382     // Generate 'finiProc' procedure that stores aggregated cycles and frequency counters in the profile buffer
00383     profileArray.ComputeAddress(coder, proc, _addrReg, funcDesc->recordNum);
00384     proc += insF.MakeRegMov(_dataReg, cycleReg);
00385     proc += insF.MakeAtomicAdd(NullReg(), _addrReg, _dataReg, (is64BitCounter ? GED_DATA_TYPE_uq : GED_DATA_TYPE_ud));
00386 
00387     if (updateFrequency)
00388     {
00389         profileArray.ComputeRelAddress(coder, proc, _addrReg, _addrReg, offsetof(FunLatencyRecord, freq));
00390         proc += insF.MakeRegMov(dataRegL, freqReg);
00391         proc += insF.MakeAtomicAdd(NullReg(), _addrReg, _dataReg, GED_DATA_TYPE_ud);
00392 
00393         proc.front()->AppendAnnotation("GenerateFiniCode");
00394         finiProc.MoveAfter(std::move(proc));
00395     }
00396 
00397     return true;
00398 }
00399 
00400 void FunLatency::GeneratePostCodeMemBound(GtGenProcedure& proc, const IGtGenCoder& coder,
00401     const GtProfileArray& profileArray, FunctionDescriptor* funcDesc, bool updateFrequency, uint32_t simdMask)
00402 {
00403     IGtInsFactory&  insF           = coder.InstructionFactory();
00404     IGtVregFactory& vregs          = coder.VregFactory();
00405     bool            is64BitCounter = Use64BitCounters(coder);
00406     GtReg           dataRegL       = { _dataReg, sizeof(uint32_t), 0 };  // Low 32-bits of the data payload register
00407 
00408     // Generate code that computes elapsed time
00409     coder.StopTimerExt(proc, _timeReg);
00410 
00411     // cycles += _timeReg
00412     proc += insF.MakeMov(dataRegL, _timeReg);   // Move timer value to the low 32-bits of the data register
00413     if (is64BitCounter)
00414     {
00415         // Clear the high 32-bits of the data payload register
00416         GtReg dataRegH = { _dataReg, sizeof(uint32_t), 1 };
00417         proc += insF.MakeMov(dataRegH, 0);
00418     }
00419 
00420     GtReg       tmpReg  = vregs.MakeScratch();
00421     GtReg       tmpReg1 = _timeReg;
00422     GtReg       flagReg = FlagReg(0);
00423     GtPredicate pred(flagReg);
00424 
00425     if (!knobIncludeZeroExecMask)
00426     {
00427         proc += insF.MakeMov(tmpReg, ChannelEnableReg());
00428         proc += insF.MakeMov(tmpReg1, DispatchMaskReg());
00429         proc += insF.MakeAnd(tmpReg, tmpReg1, tmpReg);
00430         proc += insF.MakeAnd(tmpReg, tmpReg, simdMask);
00431         proc += insF.MakeCmp(GED_COND_MODIFIER_z, flagReg, tmpReg, 0);
00432     }
00433 
00434     // _addrReg =  address of the current thread's FunLatencyRecord in the profile buffer
00435     profileArray.ComputeAddress(coder, proc, _addrReg, funcDesc->recordNum);
00436 
00437     proc += insF.MakeAtomicAdd(NullReg(), _addrReg, _dataReg, (is64BitCounter ? GED_DATA_TYPE_uq : GED_DATA_TYPE_ud));
00438     if (!knobIncludeZeroExecMask)
00439     {
00440         proc.back()->SetPredicate(!pred);
00441     }
00442 
00443     // freq++
00444     if (updateFrequency)
00445     {
00446         profileArray.ComputeRelAddress(coder, proc, _addrReg, _addrReg, offsetof(FunLatencyRecord, freq));
00447         proc += insF.MakeAtomicInc(NullReg(), _addrReg, GED_DATA_TYPE_ud);
00448         if (!knobIncludeZeroExecMask)
00449         {
00450             proc.back()->SetPredicate(!pred);
00451         }
00452     }
00453 
00454     if (!proc.empty()) { proc.front()->AppendAnnotation(__func__); }
00455 }
00456 
00457 std::string FunLatency::ToString() const
00458 {
00459     ostringstream ostr;
00460     ostr << "### Kernel/Shader execution-time profile generated by GTPin ###" << endl << endl;
00461     ostr << "Legend:" << endl;
00462     ostr << "NA - kernel was not instrumented." << endl << endl;
00463     if (!knobIncludeZeroExecMask)
00464     {
00465         ostr << "NOTE: Tool reports cycles and frequencies only for functions that were called with at least one enabled channel in execution mask." << endl << endl;
00466     }
00467     ostr << setw(20) << "Kernel Name" << setw(20) << "HashID" << setw(30) << "Function Name" << setw(15) << "Inst";
00468     ostr << setw(15) << "Freq." << setw(20) << "Total-latency" << setw(20) << "Total-latency(%)" << setw(20) << "Avg-Cycles"<< setw(10) << "Type";
00469     ostr << setw(10) << "SIMD" << setw(15) << "Platform" << setw(35) << "Execution descriptor";
00470     ostr << endl;
00471     for (const auto& kernelEntry : _kernels)
00472     {
00473         ostr << kernelEntry.second.ToString();
00474     }
00475     return ostr.str();
00476 }
00477 
00478 void FunLatency::DumpAsm() const
00479 {
00480     for (const auto& kernelEntry : _kernels)
00481     {
00482         kernelEntry.second.DumpAsm();
00483     }
00484 }
00485 
00486 void FunLatency::Fini()
00487 {
00488     string profileDir = GTPin_GetCore()->ProfileDir();
00489     string filePath = JoinPath(profileDir, "FunLatency.txt");
00490 
00491     ofstream fs(filePath);
00492     if (fs.is_open())
00493     {
00494         fs << ToString();
00495         fs.close();
00496     }
00497     else
00498     {
00499         GTPIN_WARNING("FunLatency : could not create file: " + filePath);
00500     }
00501     DumpAsm();
00502 }
00503 
00504 /* ============================================================================================= */
00505 // FunLatencyDispatchProfile implementation
00506 /* ============================================================================================= */
00507 FunLatencyDispatchProfile::FunLatencyDispatchProfile(const IGtKernelDispatch& kernelDispatch, InstrumentSiteId id) :
00508     siteId(id), cycles(0), freq(0)
00509 {
00510     kernelDispatch.GetExecDescriptor(kernelExecDesc);
00511 }
00512 
00513 void FunLatencyDispatchProfile::Accumulate(const FunLatencyRecord& record)
00514 {
00515     cycles += record.cycles;
00516     freq += record.freq;
00517 }
00518 
00519 /* ============================================================================================= */
00520 // FunctionDescriptor implementation
00521 /* ============================================================================================= */
00522 FunctionDescriptor::FunctionDescriptor(const FunctionId fId, const std::string fName,
00523     const bool fIsMain, const InsId firstIns, const InsId lastIns, const uint32_t recNum) :
00524     id(fId), name(fName), isMain(fIsMain), firstInsId(firstIns), lastInsId(lastIns), recordNum(recNum) {};
00525 
00526 /* ============================================================================================= */
00527 // FunLatencyKernelProfile implementation
00528 /* ============================================================================================= */
00529 FunLatencyKernelProfile::FunLatencyKernelProfile(const IGtKernel& kernel,
00530     const IGtCfg& cfg,
00531     const GtProfileArray& profileArray,
00532     InstrumentSites&& instrumentSites,
00533     FunctionMap&& functions) :
00534     _name(GlueString(kernel.Name())), _uniqueName(kernel.UniqueName()), _type(kernel.Type()), _platform(kernel.GpuPlatform()), _hashId(kernel.HashId()),
00535     _simd(kernel.SimdWidth()), _asmText(CfgAsmText(cfg)), _profileArray(profileArray), _totalKernelCycles(0),
00536     _instrumentSites(std::move(instrumentSites)),_functionMap(std::move(functions)) {};
00537 
00538 FunLatencyDispatchProfile& FunLatencyKernelProfile::AddDispatchProfile(const FunLatencyDispatchProfile& dispatchProfile)
00539 {
00540     _totalKernelCycles += dispatchProfile.cycles;
00541     _dispatchProfiles.push_back(dispatchProfile);
00542     return _dispatchProfiles.back();
00543 }
00544 
00545 InstrumentSiteId FunLatencyKernelProfile::GetSiteId(uint32_t recordNum) const
00546 {
00547     GTPIN_ASSERT(recordNum < _instrumentSites.size());
00548     return _instrumentSites[recordNum];
00549 }
00550 
00551 void FunLatencyKernelProfile::DumpAsm() const
00552 {
00553     DumpKernelAsmText(_name, _uniqueName, _asmText);
00554 }
00555 
00556 string FunLatencyKernelProfile::ToString() const
00557 {
00558     ostringstream ostr;
00559     if (!_dispatchProfiles.empty())
00560     {
00561         for (const auto& dp : _dispatchProfiles)
00562         {
00563             if (knobSkipZeroResults && dp.freq == 0)
00564             {
00565                 continue; // Skip zero results
00566             }
00567 
00568             auto func = _functionMap.find(dp.siteId)->second;
00569             double dispatchToExecTotal = (_totalKernelCycles ? ((100.0 * (double)dp.cycles) / _totalKernelCycles) : 0.0);
00570             uint64_t    avgCycles = (dp.freq ? (dp.cycles / dp.freq) : 0);
00571             string funcName = func.name + (func.isMain ? "(main)" : "");
00572             if (funcName.length() > 29)
00573             {
00574                 funcName = funcName.substr(0, 29);
00575             }
00576 
00577             if (func.isMain)
00578             {
00579                 ostr << endl;
00580             }
00581 
00582             if (func.isMain)
00583             {
00584                 ostr << endl;
00585             }
00586             ostr << setw(20) << _name << setw(20) << hex << _hashId;
00587             ostr << setw(30) << funcName << setw(15) << std::string("[" + std::to_string((uint32_t)func.firstInsId) + "-" + std::to_string((uint32_t)func.lastInsId) + "]");
00588             ostr << setw(15) << dec << dp.freq << setw(20) << dec << dp.cycles;
00589             ostr << fixed << setprecision(2) << setw(20) << dispatchToExecTotal << setw(20) << avgCycles << setw(10) << _type.ToString();
00590             ostr << setw(10) << _simd << setw(15) << _platform.ToString() << setw(35) << dp.kernelExecDesc.ToString(_platform, ExecDescAlignedFormat());
00591             ostr << endl;
00592         }
00593     }
00594     else
00595     {
00596         ostr << setw(20) << _name << setw(20) << hex << _hashId;
00597         ostr << setw(30) << "NA" << setw(15) << +"NA";
00598         ostr << setw(15) << dec << "NA" << setw(20) << dec << "NA";
00599         ostr << fixed << setprecision(2) << setw(20) << "NA" << setw(20) << "NA" << setw(10) << "NA";
00600         ostr << setw(10) << "NA" << setw(15) << "NA" << setw(35) << "NA";
00601         ostr << endl;
00602     }
00603 
00604     ostr << endl;
00605     return ostr.str();
00606 }
00607 
00608 /* ============================================================================================= */
00609 // GTPin_Entry
00610 /* ============================================================================================= */
00611 EXPORT_C_FUNC void GTPin_Entry(int argc, const char* argv[])
00612 {
00613     ConfigureGTPin(argc, argv);
00614     FunLatency::Instance()->Register();
00615     atexit(FunLatency::OnFini);
00616 }

(Back to the list of all GTPin Sample Tools)


 All Data Structures Functions Variables Typedefs Enumerations Enumerator


  Copyright (C) 2013-2025 Intel Corporation
SPDX-License-Identifier: MIT