|
GTPin
|
The Ops tool counts the number of dynamic operations for each instruction.
To run the Ops tool in its default configuration, use this command:
Profilers\GTReplay\intel64\gtreplay.exe -t ops -- path-to-the-directory-containing-the-trace
(Back to the list of all GTReplay Sample Tools)
00001 /*========================== begin_copyright_notice ============================ 00002 Copyright (C) 2021-2022 Intel Corporation 00003 00004 SPDX-License-Identifier: MIT 00005 ============================= end_copyright_notice ===========================*/ 00006 00007 /******************************************************************************************************* 00008 * OPS tool 00009 * 00010 * Count dynamic operations according to active channels 00011 * 00012 * NOTE: the tool callbacks might be called from different threads. 00013 */ 00014 #include <cstring> 00015 #include <stdio.h> 00016 #include <vector> 00017 #ifdef TARGET_WINDOWS 00018 #include "intrin.h" 00019 #endif 00020 #ifdef TARGET_LINUX 00021 #include "x86intrin.h" 00022 #endif 00023 00024 #include "gtreplay_assert.h" 00025 #include "gtreplay_client.h" 00026 #include "knob_parser.h" 00027 00028 00029 // Global variables 00030 uint32_t gMaxNumOfHwThreads = 0; 00031 uint32_t gMaxNumOfTiles = 0; 00032 uint32_t gNumOfInstructions = 0; 00033 uint32_t gNumOfDataTypes = 0; 00034 00035 struct PerTileInstCounter { 00036 PerTileInstCounter() 00037 { 00038 count.resize(gMaxNumOfHwThreads); 00039 for (uint32_t i = 0; i < gMaxNumOfHwThreads; i++) 00040 { 00041 count[i].resize(gNumOfInstructions, 0); 00042 } 00043 } 00044 std::vector<std::vector<uint64_t>> count; 00045 }; 00046 00047 std::vector<PerTileInstCounter> icount; 00048 std::vector<PerTileInstCounter> channels; 00049 std::vector<uint64_t> totalIcount; 00050 std::vector<uint64_t> totalChannels; 00051 std::vector<uint64_t> totalOps; 00052 std::vector<std::vector<uint64_t>> opcodes; 00053 uint64_t total_icount = 0; 00054 uint64_t total_channels = 0; 00055 std::string kernelName; 00056 00057 /* 00058 * BeforeInsCallback - callback called before instruction execution 00059 * 00060 * @params[in] tid - the ID of the GPU HW thread for which the callback is called 00061 * @params[in] ins - a handle to the current instruction 00062 * @params[in] state 
- a handle to the HW Thread state corresponding to tid 00063 */ 00064 void BeforeInsCallback(uint32_t tileId, uint32_t tid, GTReplayIns ins, GTReplayState state, void*) 00065 { 00066 GTREPLAY_ASSERT(tileId < gMaxNumOfTiles&& tid < gMaxNumOfHwThreads); 00067 00068 // Obtain the instruction ID within the kernel 00069 uint32_t id = GTReplay_InsId(ins); 00070 // Update the instruction counter corresponding to the current HW thread and current instruction 00071 icount[tileId].count[tid][id]++; 00072 00073 // If the instruction is operational 00074 if (GTReplay_IsOperational(ins)) 00075 { 00076 // Update the dynamic number of active channels 00077 channels[tileId].count[tid][id] += _mm_popcnt_u32(GTReplay_DynamicExecMask(ins, state)); 00078 } 00079 } 00080 00081 /* 00082 * AfterInsCallback - callback called after instruction execution 00083 * An illustration - this tool doesn't need to register after instruction callbacks 00084 * 00085 * @params[in] tid - the ID of the GPU HW thread for which the callback is called 00086 * @params[in] ins - a handle to the current instruction 00087 * @params[in] state - a handle to the HW Thread state corresponding to tid 00088 */ 00089 void AfterInsCallback(uint32_t tileId, uint32_t tid, GTReplayIns ins, GTReplayState state, void*) 00090 { 00091 } 00092 00093 /* 00094 * OnKernelComplete - callback called upon kernel completion 00095 * 00096 * @params[in] kernel - a handle to the kernel 00097 */ 00098 void OnKernelComplete(GTReplayKernel kernel) 00099 { 00100 // accumulated icounts and the number of active channels over HW threads 00101 for (uint32_t i = 0; i < gNumOfInstructions; i++) 00102 { 00103 totalIcount[i] = 0; 00104 totalChannels[i] = 0; 00105 00106 for (uint32_t tileId = 0; tileId < gMaxNumOfTiles; tileId++) 00107 { 00108 for (uint32_t t = 0; t < gMaxNumOfHwThreads; t++) 00109 { 00110 totalIcount[i] += icount[tileId].count[t][i]; 00111 totalChannels[i] += channels[tileId].count[t][i]; 00112 } 00113 } 00114 00115 total_icount += 
totalIcount[i]; 00116 total_channels += totalChannels[i]; 00117 } 00118 00119 // Update opcode histogram 00120 for (uint32_t i = 0; i < gNumOfInstructions; i++) 00121 { 00122 GTReplayIns ins = GTReplay_Ins(kernel, i); 00123 uint32_t opcode = GTReplay_Opcode(ins); 00124 uint32_t dataType = GTReplay_DataType(ins); 00125 00126 totalOps[dataType] += totalChannels[i]; 00127 } 00128 00129 std::cout << "\n\n=================\n"; 00130 std::cout << "OPS TOOL\n"; 00131 std::cout << "=================\n\n"; 00132 00133 // Print the results 00134 std::cout << "Kernel: " << kernelName << "\n\n"; 00135 std::cout << "TOTAL ICOUNT = " << total_icount << "\n\n"; 00136 std::cout << "TOTAL CHANNELS = " << total_channels << "\n\n"; 00137 00138 for (uint32_t i = 0; i < gNumOfInstructions; i++) 00139 { 00140 GTReplayIns ins = GTReplay_Ins(kernel, i); 00141 00142 std::cout << "[" << std::setw(15) << totalChannels[i] << "] " << std::string(GTReplay_Disasm(ins)) << std::endl; 00143 } 00144 00145 for (uint32_t t = 0; t < gNumOfDataTypes + 1; t++) 00146 { 00147 00148 if (totalOps[t] == 0) 00149 { 00150 continue; 00151 } 00152 00153 std::cout << "DATA TYPE: " << std::setw(5) << std::string(GTReplay_DataTypeName(t)) << " total ops = " << totalOps[t] << "\n\n"; 00154 } 00155 } 00156 00157 /* 00158 * OnKernelBuild - callback called before kernel execution 00159 * The purpose of this callback is to traverse the kernel binary and instrument callbacks 00160 * 00161 * @params[in] kernel - a handle to the kernel 00162 */ 00163 void OnKernelBuild(GTReplayKernel kernel) 00164 { 00165 uint32_t gModelId = GTReplay_GetModel(kernel); 00166 00167 gMaxNumOfHwThreads = GTReplay_MaxNumOfHWThreads(gModelId); 00168 00169 gMaxNumOfTiles = GTReplay_MaxNumOfTiles(kernel); 00170 GTREPLAY_ASSERT(gMaxNumOfTiles); 00171 00172 gNumOfInstructions = GTReplay_NumOfInstructions(kernel); 00173 00174 // Traverse all the basic blocks 00175 for (GTReplayBbl bbl = GTReplay_BblHead(kernel); GTReplay_BblValid(bbl); bbl = 
GTReplay_BblNext(bbl)) 00176 { 00177 // Traverse all the instruction within the basic blocks 00178 for (GTReplayIns ins = GTReplay_InsHead(bbl); GTReplay_InsValid(ins); ins = GTReplay_InsNext(ins)) 00179 { 00180 // Register callback to be called before instruction execution 00181 GTReplay_RegisterCallbackBeforeIns(kernel, ins, BeforeInsCallback, NULL); 00182 } 00183 } 00184 00185 // Allocate and initialize buffers 00186 icount.resize(gMaxNumOfTiles); 00187 channels.resize(gMaxNumOfTiles); 00188 00189 totalIcount.resize(gNumOfInstructions, 0); 00190 totalChannels.resize(gNumOfInstructions, 0); 00191 totalOps.resize(gNumOfInstructions, 0); 00192 00193 gNumOfDataTypes = GTReplay_NumDataTypes(); 00194 uint32_t kernelNameSize = 0; 00195 GTReplay_GetKernelName(kernel, &kernelNameSize, nullptr); 00196 00197 char* buf = new char[kernelNameSize + 1](); 00198 GTReplay_GetKernelName(kernel, &kernelNameSize, buf); 00199 00200 kernelName = std::string(buf); 00201 00202 delete[] buf; 00203 } 00204 00205 /* 00206 * GTReplay_Entry - tool entry point 00207 */ 00208 extern "C" 00209 DLLEXP void FASTCALL GTReplay_Entry(int argc, const char *argv[]) 00210 { 00211 // configure GTReplay 00212 ConfigureGTReplay(argc, argv); 00213 00214 // register OnKernelBuild and OnKernelComplete callbacks 00215 GTReplay_RegisterOnKernelBuildCallback(OnKernelBuild); 00216 GTReplay_RegisterOnKernelCompleteCallback(OnKernelComplete); 00217 00218 // Start GTReplay 00219 GTReplay_Start(); 00220 }
(Back to the list of all GTReplay Sample Tools)
Copyright (C) 2013-2025 Intel Corporation
SPDX-License-Identifier: MIT
1.7.4