GTPin
GTReplay: Toggle Sample Tool

The Toggle tool counts the number of bits toggled by the kernel, i.e. the bits whose value changed from 0 to 1 or from 1 to 0.

Running the Toggle tool

To run the Toggle tool in its default configuration, use this command:

Profilers\GTReplay\intel64\gtreplay.exe -t toggle -- path-to-the-directory-containing-the-trace

(Back to the list of all GTReplay Sample Tools)

toggle.cpp

00001 /*========================== begin_copyright_notice ============================
00002 Copyright (C) 2021-2022 Intel Corporation
00003 
00004 SPDX-License-Identifier: MIT
00005 ============================= end_copyright_notice ===========================*/
00006 
00007 /*******************************************************************************************************
00008  * TOGGLE tool
00009  *
00010  * Counts the dynamic number of toggled bits - the ones that changed their values 0->1 or 1->0
00011  *
00012  * NOTE: the tool callbacks might be called from different threads.
00013  */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include <iostream>
#include <locale>
#include <string>
#include <vector>

#ifdef TARGET_WINDOWS
#include "intrin.h"
#endif
#ifdef TARGET_LINUX
#include "x86intrin.h"
#endif

#include "gtreplay_assert.h"
#include "gtreplay_client.h"
#include "knob_parser.h"
00027 
00028 // Structure definitions
// Scratch storage for the destination operand of a non-SEND instruction.
// GTReplay_GetDestination() fills it through a raw byte pointer; the members
// expose that same storage as arrays of 32 elements of each element type.
// Member order is significant: `Operand x = {}` initializes the first member.
union Operand {
    // unsigned integer views
    uint8_t  byte[32];
    uint16_t word[32];
    uint32_t dword[32];
    uint64_t qword[32];
    // signed integer views
    int8_t   sbyte[32];
    int16_t  sword[32];
    int32_t  sdword[32];
    int64_t  sqword[32];
    // floating-point views
    float    spfloat[32];
    double   dpfloat[32];
};
00041 
// One full register viewed as 8 dwords (8 * 4 = 32 bytes); used when gRegWidth == 32
struct FullReg32 {
    uint32_t dword[8];
};
00045 
// One full register viewed as 16 dwords (16 * 4 = 64 bytes); used when gRegWidth != 32
struct FullReg64 {
    uint32_t dword[16];
};
00049 
00050 typedef union {
00051     FullReg32 reg32[16];
00052     FullReg64 reg64[16];
00053 } SendDest;
00054 
00055 // Global variables 
00056 uint32_t   gMaxNumOfHwThreads = 0;
00057 uint32_t   gMaxNumOfTiles = 0;
00058 uint32_t   gRegWidth = 32;
00059 
00060 uint64_t  total_icount = 0;
00061 uint64_t  total_toggle_bits = 0;
00062 std::vector<std::vector<uint64_t>> icount;
00063 std::vector<std::vector<SendDest>> sendRegsBefore;
00064 std::vector<std::vector<Operand>>  dstBefore;
00065 std::vector<std::vector<uint32_t>> execMask;
00066 std::vector<std::vector<uint64_t>> toggledBits;
00067 std::string kernelName;
00068 
00069 void HandleSendBefore(uint32_t tileId, uint32_t tid, GTReplayIns ins, GTReplayState state)
00070 {
00071     uint32_t numOfElements = 0;
00072     uint32_t elementWidth = 0;
00073 
00074     // Obtain and save the registers
00075     GTReplay_GetSendDestination(ins, state, (uint8_t*)&sendRegsBefore[tileId][tid], &numOfElements, &elementWidth);
00076 }
00077 
00078 void HandleSendAfter(uint32_t tileId, uint32_t tid, GTReplayIns ins, GTReplayState state)
00079 {
00080     SendDest dst;
00081     uint32_t numOfElements = 0;
00082     uint32_t elementWidth = 0;
00083 
00084     // Obtain the registers
00085     GTReplay_GetSendDestination(ins, state, (uint8_t*)&dst, &numOfElements, &elementWidth);
00086 
00087     if (numOfElements == 0)
00088     {
00089         return;
00090     }
00091 
00092     uint32_t count = 0;
00093 
00094     if (gRegWidth == 32)
00095     {
00096         // go over all registers
00097         for (uint32_t i = 0; i < numOfElements; i++)
00098         {
00099             FullReg32 afreg = dst.reg32[i], bfreg = sendRegsBefore[tileId][tid].reg32[i];
00100 
00101             // go over all elements
00102             for (uint32_t j = 0; j < 8; j++)
00103             {
00104                 // compute the amount of toggled bits
00105                 uint32_t tmp = bfreg.dword[j] ^ afreg.dword[j];
00106 
00107                 count += _mm_popcnt_u32(tmp);
00108             }
00109         }
00110     }
00111     else
00112     {
00113         // go over all registers
00114         for (uint32_t i = 0; i < numOfElements; i++)
00115         {
00116             FullReg64 afreg = dst.reg64[i], bfreg = sendRegsBefore[tileId][tid].reg64[i];
00117 
00118             // go over all elements
00119             for (uint32_t j = 0; j < 16; j++)
00120             {
00121                 // compute the amount of toggled bits
00122                 uint32_t tmp = bfreg.dword[j] ^ afreg.dword[j];
00123 
00124                 count += _mm_popcnt_u32(tmp);
00125             }
00126         }
00127     }
00128 
00129     toggledBits[tileId][tid] += count;
00130 }
00131 
00132 /*
00133  * BeforeInsCallback - callback called before instruction execution
00134  *
00135  * @params[in] tid - the ID of the GPU HW thread for which the callback is called
00136  * @params[in] ins - a handle to the current instruction
00137  * @params[in] state - a handle to the HW Thread state corresponding to tid
00138  */
00139 void BeforeInsCallback(uint32_t tileId, uint32_t tid, GTReplayIns ins, GTReplayState state, void*)
00140 {
00141     GTREPLAY_ASSERT(tileId < gMaxNumOfTiles && tid < gMaxNumOfHwThreads);
00142     // Update the instruction counter corresponding to the current HW thread
00143     icount[tileId][tid]++;
00144 
00145     // Check whether the instruction has destination
00146     if (!GTReplay_HasDestination(ins))
00147     {
00148         // If not, there is nothing to do
00149         return;
00150     }
00151 
00152     // Check whether the instruction is a SEND instruction
00153     if (GTReplay_IsSend(ins))
00154     {
00155         // Is yes, handle SEND instruction separately
00156         HandleSendBefore(tileId, tid, ins, state);
00157         return;
00158     }
00159 
00160     // Obtain exec mask
00161     execMask[tileId][tid] = GTReplay_DynamicExecMask(ins, state);
00162     
00163     uint32_t numOfElements = 0;
00164     uint32_t elementWidth = 0;
00165 
00166     // Obtain and save destination before
00167     GTReplay_GetDestination(ins, state, execMask[tileId][tid], (uint8_t*)&dstBefore[tileId][tid], &numOfElements, &elementWidth);
00168 }
00169 
00170 /*
00171  * AfterInsCallback - callback called after instruction execution
00172  *
00173  * @params[in] tid - the ID of the GPU HW thread for which the callback is called
00174  * @params[in] ins - a handle to the current instruction
00175  * @params[in] state - a handle to the HW Thread state corresponding to tid
00176  */
00177 void AfterInsCallback(uint32_t tileId, uint32_t tid, GTReplayIns ins, GTReplayState state, void*)
00178 {
00179     GTREPLAY_ASSERT(tileId < gMaxNumOfTiles&& tid < gMaxNumOfHwThreads);
00180 
00181     // Check whether the instruction has destination
00182     if (!GTReplay_HasDestination(ins))
00183     {
00184         // If not, there is nothing to do
00185         return;
00186     }
00187 
00188     // Check whether the instruction is a SEND instruction
00189     if (GTReplay_IsSend(ins))
00190     {
00191         // Is yes, handle SEND instruction separately
00192         HandleSendAfter(tileId, tid, ins, state);
00193         return;
00194     }
00195 
00196     Operand  dstAfter = {};
00197     uint32_t numOfElements = 0;
00198     uint32_t elementWidth = 0;
00199 
00200     // Obtain destination after
00201     GTReplay_GetDestination(ins, state, execMask[tileId][tid], (uint8_t*)&dstAfter, &numOfElements, &elementWidth);
00202 
00203     uint32_t count = 0;
00204 
00205     // Iterate over all elements
00206     for (uint32_t i = 0; i < numOfElements; i++)
00207     {
00208         // Compute the amount of toggled bits
00209         switch (elementWidth) {
00210         case 1: count += _mm_popcnt_u32(dstBefore[tileId][tid].byte[i] ^ dstAfter.byte[i]); break;
00211         case 2: count += _mm_popcnt_u32(dstBefore[tileId][tid].word[i] ^ dstAfter.word[i]); break;
00212         case 4: count += _mm_popcnt_u32(dstBefore[tileId][tid].dword[i] ^ dstAfter.dword[i]); break;
00213         case 8:
00214         {
00215             uint64_t tmp = dstBefore[tileId][tid].qword[i] ^ dstAfter.qword[i];
00216 
00217             count += _mm_popcnt_u32((uint32_t)(tmp & 0xFFFFFFFF));
00218             count += _mm_popcnt_u32((uint32_t)(tmp >> 32));
00219 
00220             break;
00221         }
00222         default: break;
00223         }
00224     }
00225 
00226     toggledBits[tileId][tid] += count;
00227 }
00228 
00229 /*
00230  * OnKernelComplete - callback called upon kernel completion
00231  *
00232  * @params[in] kernel - a handle to the kernel
00233  */
00234  void OnKernelComplete(GTReplayKernel kernel)
00235 {
00236     total_icount = 0;
00237     total_toggle_bits = 0;
00238 
00239     // Accumulate counters from all HW threads
00240     for (uint32_t tileId = 0; tileId < gMaxNumOfTiles; tileId++)
00241     {
00242         for (uint32_t t = 0; t < gMaxNumOfHwThreads; t++)
00243         {
00244             total_icount += icount[tileId][t];
00245             total_toggle_bits += toggledBits[tileId][t];
00246         }
00247     }
00248 
00249     // Print the results
00250     std::cout << "\n\n=================\n";
00251     std::cout << "BIT TOGGLING TOOL\n";
00252     std::cout << "=================\n\n";
00253     std::cout.imbue(std::locale(""));
00254     std::cout << "Kernel: " << kernelName << "\n\n";
00255     std::cout << "TOTAL ICOUNT       = " << total_icount << "\n\n";
00256     std::cout << "TOTAL TOGGLED BITS = " << total_toggle_bits << "\n\n";
00257 }
00258 
00259 /*
00260  * OnKernelBuild - callback called before kernel execution
00261  *                 The purpose of this callback is to traverse the kernel binary and instrument callbacks
00262  *
00263  * @params[in] kernel - a handle to the kernel
00264  */
00265 void OnKernelBuild(GTReplayKernel kernel)
00266 {
00267     uint32_t gModelId = GTReplay_GetModel(kernel);
00268 
00269     gMaxNumOfHwThreads = GTReplay_MaxNumOfHWThreads(gModelId);
00270 
00271     gMaxNumOfTiles = GTReplay_MaxNumOfTiles(kernel);
00272     GTREPLAY_ASSERT(gMaxNumOfTiles);
00273 
00274     gRegWidth = GTReplay_RegisterWidth(gModelId);
00275 
00276     // Traverse all the basic blocks 
00277     for (GTReplayBbl bbl = GTReplay_BblHead(kernel); GTReplay_BblValid(bbl); bbl = GTReplay_BblNext(bbl))
00278     {
00279         // Traverse all the instruction within the basic blocks 
00280         for (GTReplayIns ins = GTReplay_InsHead(bbl); GTReplay_InsValid(ins); ins = GTReplay_InsNext(ins))
00281         {
00282             // Register callback to be called before instruction execution
00283             GTReplay_RegisterCallbackBeforeIns(kernel, ins, BeforeInsCallback, NULL);
00284             // Register callback to be called after instruction execution
00285             GTReplay_RegisterCallbackAfterIns(kernel, ins, AfterInsCallback, NULL);
00286         }
00287     }
00288 
00289     // Allocate and initialize buffers
00290     icount.resize(gMaxNumOfTiles);
00291     toggledBits.resize(gMaxNumOfTiles);
00292     execMask.resize(gMaxNumOfTiles);
00293     sendRegsBefore.resize(gMaxNumOfTiles);
00294     dstBefore.resize(gMaxNumOfTiles);
00295     for (uint32_t i = 0; i < gMaxNumOfTiles; i++)
00296     {
00297         icount[i].resize(gMaxNumOfHwThreads, 0);
00298         toggledBits[i].resize(gMaxNumOfHwThreads, 0);
00299         execMask[i].resize(gMaxNumOfHwThreads, 0);
00300         sendRegsBefore[i].resize(gMaxNumOfHwThreads);
00301         dstBefore[i].resize(gMaxNumOfHwThreads);
00302     }
00303 
00304     uint32_t kernelNameSize = 0;
00305     GTReplay_GetKernelName(kernel, &kernelNameSize, nullptr);
00306 
00307     char* buf = new char[kernelNameSize + 1]();
00308     GTReplay_GetKernelName(kernel, &kernelNameSize, buf);
00309 
00310     kernelName = std::string(buf);
00311     
00312     delete[] buf;
00313 }
00314 
00315 /*
00316  * GTReplay_Entry - tool entry point
00317  */
00318 extern "C"
00319 DLLEXP void FASTCALL GTReplay_Entry(int argc, const char *argv[])
00320 {
00321     // configure GTReplay
00322     ConfigureGTReplay(argc, argv);
00323     
00324     // register OnKernelBuild and OnKernelComplete callbacks
00325     GTReplay_RegisterOnKernelBuildCallback(OnKernelBuild);
00326     GTReplay_RegisterOnKernelCompleteCallback(OnKernelComplete);
00327 
00328     // Start GTReplay
00329     GTReplay_Start();
00330 }

(Back to the list of all GTReplay Sample Tools)


 All Data Structures Functions Variables Typedefs Enumerations Enumerator


  Copyright (C) 2013-2025 Intel Corporation
SPDX-License-Identifier: MIT