GTPin
GTReplay: Ops Sample Tool

The Ops tool counts the number of dynamic operations for each instruction.

Running Ops tool

To run the Ops tool in its default configuration, use this command:

Profilers\GTReplay\intel64\gtreplay.exe -t ops -- path-to-the-directory-containing-the-trace

(Back to the list of all GTReplay Sample Tools)

ops.cpp

00001 /*========================== begin_copyright_notice ============================
00002 Copyright (C) 2021-2022 Intel Corporation
00003 
00004 SPDX-License-Identifier: MIT
00005 ============================= end_copyright_notice ===========================*/
00006 
00007 /*******************************************************************************************************
00008  * OPS tool
00009  *
00010  * Count dynamic operations according to active channels
00011  *
00012  * NOTE: the tool callbacks might be called from different threads.
00013  */
00014 #include <cstring>
00015 #include <stdio.h>
00016 #include <vector>
00017 #ifdef TARGET_WINDOWS
00018 #include "intrin.h"
00019 #endif
00020 #ifdef TARGET_LINUX
00021 #include "x86intrin.h"
00022 #endif
00023 
00024 #include "gtreplay_assert.h"
00025 #include "gtreplay_client.h"
00026 #include "knob_parser.h"
00027 
00028 
// Global variables: dimensions of the device and of the kernel being replayed.
// They all start at zero and are assigned in OnKernelBuild.
uint32_t gMaxNumOfHwThreads{0};   // result of GTReplay_MaxNumOfHWThreads
uint32_t gMaxNumOfTiles{0};       // result of GTReplay_MaxNumOfTiles
uint32_t gNumOfInstructions{0};   // result of GTReplay_NumOfInstructions
uint32_t gNumOfDataTypes{0};      // result of GTReplay_NumDataTypes
00034 
00035 struct PerTileInstCounter {
00036     PerTileInstCounter()
00037     {
00038         count.resize(gMaxNumOfHwThreads);
00039         for (uint32_t i = 0; i < gMaxNumOfHwThreads; i++)
00040         {
00041             count[i].resize(gNumOfInstructions, 0);
00042         }
00043     }
00044     std::vector<std::vector<uint64_t>> count;
00045 };
00046 
00047 std::vector<PerTileInstCounter>    icount;
00048 std::vector<PerTileInstCounter>    channels;
00049 std::vector<uint64_t>              totalIcount;
00050 std::vector<uint64_t>              totalChannels;
00051 std::vector<uint64_t>              totalOps;
00052 std::vector<std::vector<uint64_t>> opcodes;
00053 uint64_t   total_icount = 0;
00054 uint64_t   total_channels = 0;
00055 std::string kernelName;
00056 
00057 /*
00058  * BeforeInsCallback - callback called before instruction execution
00059  *
00060  * @params[in] tid - the ID of the GPU HW thread for which the callback is called
00061  * @params[in] ins - a handle to the current instruction
00062  * @params[in] state - a handle to the HW Thread state corresponding to tid
00063  */
00064 void BeforeInsCallback(uint32_t tileId, uint32_t tid, GTReplayIns ins, GTReplayState state, void*)
00065 {
00066     GTREPLAY_ASSERT(tileId < gMaxNumOfTiles&& tid < gMaxNumOfHwThreads);
00067 
00068     // Obtain the instruction ID within the kernel
00069     uint32_t id = GTReplay_InsId(ins);
00070     // Update the instruction counter corresponding to the current HW thread and current instruction
00071     icount[tileId].count[tid][id]++;
00072     
00073     // If the instruction is operational
00074     if (GTReplay_IsOperational(ins))
00075     {
00076         // Update the dynamic number of active channels
00077         channels[tileId].count[tid][id] += _mm_popcnt_u32(GTReplay_DynamicExecMask(ins, state));
00078     }
00079 }
00080 
00081 /*
00082  * AfterInsCallback - callback called after instruction execution
00083  *                    An illustration - this tool doesn't need to register after instruction callbacks
00084  *
00085  * @params[in] tid - the ID of the GPU HW thread for which the callback is called
00086  * @params[in] ins - a handle to the current instruction
00087  * @params[in] state - a handle to the HW Thread state corresponding to tid
00088  */
00089 void AfterInsCallback(uint32_t tileId, uint32_t tid, GTReplayIns ins, GTReplayState state, void*)
00090 {
00091 }
00092 
00093 /*
00094  * OnKernelComplete - callback called upon kernel completion
00095  *
00096  * @params[in] kernel - a handle to the kernel
00097  */
00098 void OnKernelComplete(GTReplayKernel kernel)
00099 {
00100     // accumulated icounts and the number of active channels over HW threads
00101     for (uint32_t i = 0; i < gNumOfInstructions; i++)
00102     {
00103         totalIcount[i]   = 0;
00104         totalChannels[i] = 0;
00105 
00106         for (uint32_t tileId = 0; tileId < gMaxNumOfTiles; tileId++)
00107         {
00108             for (uint32_t t = 0; t < gMaxNumOfHwThreads; t++)
00109             {
00110                 totalIcount[i]   += icount[tileId].count[t][i];
00111                 totalChannels[i] += channels[tileId].count[t][i];
00112             }
00113         }
00114 
00115         total_icount   += totalIcount[i];
00116         total_channels += totalChannels[i];
00117     }
00118 
00119     // Update opcode histogram
00120     for (uint32_t i = 0; i < gNumOfInstructions; i++)
00121     {
00122         GTReplayIns ins = GTReplay_Ins(kernel, i);
00123         uint32_t opcode = GTReplay_Opcode(ins);
00124         uint32_t dataType = GTReplay_DataType(ins);
00125 
00126         totalOps[dataType] += totalChannels[i];
00127     }
00128 
00129     std::cout << "\n\n=================\n";
00130     std::cout << "OPS TOOL\n";
00131     std::cout << "=================\n\n";
00132 
00133     // Print the results
00134     std::cout << "Kernel: " << kernelName << "\n\n";
00135     std::cout << "TOTAL ICOUNT   = " << total_icount << "\n\n";
00136     std::cout << "TOTAL CHANNELS = " << total_channels << "\n\n";
00137     
00138     for (uint32_t i = 0; i < gNumOfInstructions; i++)
00139     {
00140         GTReplayIns ins = GTReplay_Ins(kernel, i);
00141 
00142         std::cout << "[" << std::setw(15) << totalChannels[i] << "] " << std::string(GTReplay_Disasm(ins)) << std::endl;
00143     }
00144     
00145     for (uint32_t t = 0; t < gNumOfDataTypes + 1; t++)
00146     {
00147        
00148         if (totalOps[t] == 0)
00149         {
00150             continue;
00151         }
00152         
00153         std::cout << "DATA TYPE: " << std::setw(5) << std::string(GTReplay_DataTypeName(t)) << "   total ops = " << totalOps[t] << "\n\n";
00154     }
00155 }
00156 
00157 /*
00158  * OnKernelBuild - callback called before kernel execution
00159  *                 The purpose of this callback is to traverse the kernel binary and instrument callbacks
00160  *
00161  * @params[in] kernel - a handle to the kernel
00162  */
00163 void OnKernelBuild(GTReplayKernel kernel)
00164 {
00165     uint32_t gModelId = GTReplay_GetModel(kernel);
00166 
00167     gMaxNumOfHwThreads = GTReplay_MaxNumOfHWThreads(gModelId);
00168 
00169     gMaxNumOfTiles = GTReplay_MaxNumOfTiles(kernel);
00170     GTREPLAY_ASSERT(gMaxNumOfTiles);
00171 
00172     gNumOfInstructions = GTReplay_NumOfInstructions(kernel);
00173 
00174     // Traverse all the basic blocks 
00175     for (GTReplayBbl bbl = GTReplay_BblHead(kernel); GTReplay_BblValid(bbl); bbl = GTReplay_BblNext(bbl))
00176     {
00177         // Traverse all the instruction within the basic blocks 
00178         for (GTReplayIns ins = GTReplay_InsHead(bbl); GTReplay_InsValid(ins); ins = GTReplay_InsNext(ins))
00179         {
00180             // Register callback to be called before instruction execution
00181             GTReplay_RegisterCallbackBeforeIns(kernel, ins, BeforeInsCallback, NULL);
00182         }
00183     }
00184 
00185     // Allocate and initialize buffers
00186     icount.resize(gMaxNumOfTiles);
00187     channels.resize(gMaxNumOfTiles);
00188 
00189     totalIcount.resize(gNumOfInstructions, 0);
00190     totalChannels.resize(gNumOfInstructions, 0);
00191     totalOps.resize(gNumOfInstructions, 0);
00192 
00193     gNumOfDataTypes = GTReplay_NumDataTypes();
00194     uint32_t kernelNameSize = 0;
00195     GTReplay_GetKernelName(kernel, &kernelNameSize, nullptr);
00196 
00197     char* buf = new char[kernelNameSize + 1]();
00198     GTReplay_GetKernelName(kernel, &kernelNameSize, buf);
00199 
00200     kernelName = std::string(buf);
00201     
00202     delete[] buf;
00203 }
00204 
00205 /*
00206  * GTReplay_Entry - tool entry point
00207  */
00208 extern "C"
00209 DLLEXP void FASTCALL GTReplay_Entry(int argc, const char *argv[])
00210 {
00211     // configure GTReplay
00212     ConfigureGTReplay(argc, argv);
00213     
00214     // register OnKernelBuild and OnKernelComplete callbacks
00215     GTReplay_RegisterOnKernelBuildCallback(OnKernelBuild);
00216     GTReplay_RegisterOnKernelCompleteCallback(OnKernelComplete);
00217 
00218     // Start GTReplay
00219     GTReplay_Start();
00220 }

(Back to the list of all GTReplay Sample Tools)


 All Data Structures Functions Variables Typedefs Enumerations Enumerator


  Copyright (C) 2013-2025 Intel Corporation
SPDX-License-Identifier: MIT