|
GTPin
|
Cachelineprof is a GTPin tool for profiling memory accesses in GPU kernels at cache-line granularity. The tool leverages the High Level Instrumentation interface (HLIF).
To run the Cachelineprof tool, use the following command:
Profilers/Bin/gtpin -t cachelineprof [cachelineprof args] [GTPin args] -- app [application args]
The tool supports the following profiling modes, selectable via the `--mode` knob:
1. Cacheline Histogram (Default)
Counts the number of unique cachelines accessed by each memory instruction.
2. Memory Footprint
Tracks all cachelines accessed (reads and writes) by the kernel and produces a bitvector representing the memory footprint.
3. Memory Cache Model
Simulates a simple direct-mapped cache model to count cache hits and misses for memory accesses.
4. Memory Access Analysis
Analyzes memory access dependencies, counting Read-After-Read (RaR), Read-After-Write (RaW), Write-After-Read (WaR), and Write-After-Write (WaW) events.
5. Memory Race Detection
Detects memory race conditions, counting Write-After-Write (WaW) memory accesses from different hardware threads.
The following output examples show the results of profiling a kernel that multiplies two matrices of 256x256 single-precision floating-point values. The examples appear in the same order as the modes listed above.
------------------------------------------------------------------------------------------------------------------------
0: GEMM___CS_asmcc69d00bb0a665a9_simd32_cc69d00bb0a665a9_0
------------------------------------------------------------------------------------------------------------------------
Dispatch Id = 0 Instruction Id = 111
1 cachelines were accessed 524288 times
Dispatch Id = 0 Instruction Id = 112
1 cachelines were accessed 524288 times
Dispatch Id = 0 Instruction Id = 113
1 cachelines were accessed 524288 times
Dispatch Id = 0 Instruction Id = 114
1 cachelines were accessed 524288 times
Dispatch Id = 0 Instruction Id = 153
1 cachelines were accessed 2048 times
Dispatch Id = 0 Instruction Id = 154
1 cachelines were accessed 2048 times
------------------------------------------------------------------------------------------------------------------------
0: GEMM___CS_asmcc69d00bb0a665a9_simd32_cc69d00bb0a665a9_0
------------------------------------------------------------------------------------------------------------------------
Dispatch Id = 0
Reads cachelines:
ffffb802001b0000 - ffffb802001f0000
ffffb80200880000 - ffffb802008c0000
Total accessed size = 0x80000
Writes cachelines:
ffffb802009d0000 - ffffb80200a10000
Total accessed size = 0x40000
------------------------------------------------------------------------------------------------------------------------
0: GEMM___CS_asmcc69d00bb0a665a9_simd32_cc69d00bb0a665a9_0
------------------------------------------------------------------------------------------------------------------------
Dispatch Id = 0
Cache hits = 33607680
Cache misses = 12288
=====================
Total cache hits = 33607680
Total cache misses = 12288
------------------------------------------------------------------------------------------------------------------------
0: GEMM___CS_asmcc69d00bb0a665a9_simd32_cc69d00bb0a665a9_0
------------------------------------------------------------------------------------------------------------------------
Dispatch Id = 0
Read-After-Read accesses = 33554432
Read-After-Write accesses = 0
Write-After-Read accesses = 4096
Write-After-Write accesses = 61440
=====================
Total Read-After-Read accesses = 33554432
Total Read-After-Write accesses = 0
Total Write-After-Read accesses = 4096
Total Write-After-Write accesses = 61440
------------------------------------------------------------------------------------------------------------------------
0: race_condition_example___CS_asm3f070298b846b3d8_simd32_3f070298b846b3d8_0
------------------------------------------------------------------------------------------------------------------------
Dispatch Id = 0
Race conditions detected = 1020
Races detected at these addresses:
ffffb802003f0000
ffffb802003f0004
ffffb802003f0008
ffffb802003f000c
=====================
Total Race conditions detected = 1020
or
------------------------------------------------------------------------------------------------------------------------
0: GEMM___CS_asmb6f5a13c00055b39_simd32_b6f5a13c00055b39_0
------------------------------------------------------------------------------------------------------------------------
Dispatch Id = 0
No race conditions detected
=====================
Total Race conditions detected = 0
if no races were detected.
(Back to the list of all GTPin Sample Tools)
/*========================== begin_copyright_notice ============================

Copyright (C) 2025-2026 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

/*!
 * @file A tool that profiles accessed cachelines
 *
 * Declares the argument structures and the HLI (High Level Instrumentation) function
 * prototypes of the Cachelineprof tool. C++-only parts (constructors, GtHliFunction
 * aliases) are guarded by __cplusplus; everything else is plain C.
 */

#ifndef CACHELINEPROF_H_
#define CACHELINEPROF_H_

#include "hlif_basic_defs.h"
#include "block_2d.h"

#if defined(__cplusplus)
#include "gtpin_api.h"
using namespace gtpin;
#endif

/// Number of entries in CachelineProfArgs::cachelineHistogram
#define MAX_SUPPORTED_CACHE_LINE_HISTOGRAM_SIZE 256

// All structures up to the matching "#pragma pack(pop)" are declared with 8-byte packing.
#pragma pack(push, 8)

/* ============================================================================================= */
// Struct CachelineProfArgs
/* ============================================================================================= */
/// Common arguments of HLI functions for computing cacheline accesses histogram
typedef struct CachelineProfArgs
{
    struct
    {
        uint32_t numAccesses;       ///< Number of elements accessed by the instruction
        uint32_t dataSize;          ///< Size, in bytes, of the memory element
        uint32_t log2CachelineSize; ///< Log2(Cache-line-size)
    } in;
    struct
    {
        uint32_t cachelineHistogram[MAX_SUPPORTED_CACHE_LINE_HISTOGRAM_SIZE]; ///< Histogram of the number of cachelines accessed by instruction
        uint32_t outOfHistogramValues; ///< Presumably the number of values that did not fit into cachelineHistogram - TODO confirm against the HLI implementation
    } out;

#if defined(__cplusplus)
    /// Constructor: zero-initializes both the input and the output parts
    CachelineProfArgs() : in({0, 0, 0}), out({}) {}
#endif
} CachelineProfArgs;

/// Common arguments of HLI functions for computing memory footprint
typedef struct MemoryFootprintArgs
{
    struct
    {
        uint32_t log2CachelineSize;    ///< Log2(Cache-line-size)
        uint32_t supportedAddressBits; ///< Number of supported least significant bits of the addresses
        uint32_t bitVectorSizeInBytes; ///< The size of the bitvector in bytes
    } in;
    struct
    {
        uint64_t upperAddressBits; ///< Presumably the address bits above supportedAddressBits - TODO confirm against the HLI implementation
    } out;
} MemoryFootprintArgs;

/// Packed argument of the memory footprint HLI functions (single pointer passed to the function).
/// The paddingN members keep every 32-bit value in its own 8-byte slot.
typedef struct MemoryFootprintArgsCombo
{
    __global uint64_t* addresses;                      ///< Array of 64-bit addresses
    uint32_t accessMask;                               ///< Per-channel mask of memory accesses
    uint32_t padding1; // padding to 8 bytes
    uint32_t dataSize;                                 ///< Data size per single access
    uint32_t padding2; // padding to 8 bytes
    uint32_t accessType;                               ///< Type of access (presumably a MemoryAccessType value - TODO confirm)
    uint32_t padding3; // padding to 8 bytes
    __global MemoryFootprintArgs* memoryFootprintArgs; ///< Information about memory access
    __global uint8_t* bitvector;                       ///< Resulting per-cacheline bitvector
} MemoryFootprintArgsCombo;

/// Common arguments of HLI functions for cache modeling mode
typedef struct MemoryCacheModelArgs
{
    struct
    {
        uint32_t log2CachelineSize; ///< Log2(Cache-line-size)
        uint32_t cacheSize;         ///< Cache size (assumed a power of 2)
        uint32_t hashMaskShift;     ///< Number of bits to shift-left the hashMask before applying
    } in;
    struct
    {
        uint64_t hits;   ///< Number of cache hits reported by the model
        uint64_t misses; ///< Number of cache misses reported by the model
    } out;
} MemoryCacheModelArgs;

/// Kind of a memory access.
/// NOTE(review): presumably the value carried by the accessType fields of the *Combo structures - confirm.
enum MemoryAccessType
{
    Load,
    Store,
    ReadModifyWrite
};

/// Packed argument of the cache model HLI functions (single pointer passed to the function).
/// The paddingN members keep every 32-bit value in its own 8-byte slot.
typedef struct MemoryCacheModelArgsCombo
{
    __global uint64_t* addresses;                        ///< Array of 64-bit addresses
    uint32_t accessMask;                                 ///< Per-channel mask of memory accesses
    uint32_t padding1; // padding to 8 bytes
    uint32_t dataSize;                                   ///< Data size per single access
    uint32_t padding2; // padding to 8 bytes
    __global MemoryCacheModelArgs* memoryCacheModelArgs; ///< Information about cache model
    __global uint64_t* cacheTagsVector;                  ///< Vector of cache buckets tags
} MemoryCacheModelArgsCombo;

/// Common arguments of HLI functions for memory access analysis mode
typedef struct MemoryAccessAnalysisArgs
{
    struct
    {
        uint32_t log2CachelineSize;    ///< Log2(Cache-line-size)
        uint32_t supportedAddressBits; ///< Number of supported least significant bits of the addresses
        uint32_t bitVectorSizeInBytes; ///< The size of the bitvector in bytes
    } in;
    struct
    {
        uint64_t rarCount; ///< Number of Read-After-Read (RaR) accesses
        uint64_t rawCount; ///< Number of Read-After-Write (RaW) accesses
        uint64_t warCount; ///< Number of Write-After-Read (WaR) accesses
        uint64_t wawCount; ///< Number of Write-After-Write (WaW) accesses
    } out;
} MemoryAccessAnalysisArgs;

/// Packed argument of the memory access analysis HLI functions (single pointer passed to the function).
/// The paddingN members keep every 32-bit value in its own 8-byte slot.
typedef struct MemoryAccessAnalysisArgsCombo
{
    __global uint64_t* addresses;                                ///< Array of 64-bit addresses
    uint32_t accessMask;                                         ///< Per-channel mask of memory accesses
    uint32_t padding1; // padding to 8 bytes
    uint32_t dataSize;                                           ///< Data size per single access
    uint32_t padding2; // padding to 8 bytes
    uint32_t accessType;                                         ///< Type of access (presumably a MemoryAccessType value - TODO confirm)
    uint32_t padding3; // padding to 8 bytes
    __global MemoryAccessAnalysisArgs* memoryAccessAnalysisArgs; ///< Information about memory access
    __global uint32_t* bitvector;                                ///< Resulting per-cacheline bitvector
} MemoryAccessAnalysisArgsCombo;

/// Common arguments of HLI functions for memory race condition detection
typedef struct MemoryRaceDetectionArgs
{
    struct
    {
        uint32_t log2CachelineSize;    ///< Log2(Cache-line-size)
        uint32_t supportedAddressBits; ///< Number of supported least significant bits of the addresses
        uint32_t bitVectorSizeInBytes; ///< The size of the bitvector in bytes
    } in;
    struct
    {
        uint64_t upperAddressBits; ///< Presumably the address bits above supportedAddressBits - TODO confirm against the HLI implementation
        uint64_t raceCount;        ///< Number of detected race conditions
        uint64_t rarCount;         ///< Number of Read-After-Read (RaR) accesses
        uint64_t rawCount;         ///< Number of Read-After-Write (RaW) accesses
        uint64_t warCount;         ///< Number of Write-After-Read (WaR) accesses
        uint64_t wawCount;         ///< Number of Write-After-Write (WaW) accesses
    } out;
} MemoryRaceDetectionArgs;

/// Packed argument of the race detection HLI functions (single pointer passed to the function).
/// The paddingN members keep every 32-bit value in its own 8-byte slot.
typedef struct MemoryRaceDetectionArgsCombo
{
    __global uint64_t* addresses;                              ///< Array of 64-bit addresses
    uint32_t accessMask;                                       ///< Per-channel mask of memory accesses
    uint32_t padding1; // padding to 8 bytes
    uint32_t dataSize;                                         ///< Data size per single access
    uint32_t padding2; // padding to 8 bytes
    uint32_t accessType;                                       ///< Type of access (presumably a MemoryAccessType value - TODO confirm)
    uint32_t padding3; // padding to 8 bytes
    uint32_t hwThreadId;                                       ///< Presumably the ID of the hardware thread issuing the access - TODO confirm
    uint32_t padding4; // padding to 8 bytes
    __global MemoryRaceDetectionArgs* memoryRaceDetectionArgs; ///< Information about memory access
    __global uint32_t* raceDataVector;                         ///< Race detection data structure
    __global uint32_t* bitvector;                              ///< Resulting per-cacheline bitvector
} MemoryRaceDetectionArgsCombo;

/*!
 * @brief HLI function that detects amount of cachelines accessed by a scatter SEND instruction to global or local memory done in A32 or BTS modes
 * @param[in]     addresses          Array of 32-bit addresses/offsets
 * @param[in]     accessMask         Per-channel mask of memory accesses
 * @param[in,out] CachelineProfArgs  Information about memory access instruction and the resulting cachelines histogram
 */
IGC_STACK_CALL void CheckScatterA32Access(__global const uint32_t* addresses,
                                          uint32_t accessMask,
                                          __global CachelineProfArgs* CachelineProfArgs);

#if defined(__cplusplus)
using CheckScatterA32AccessFunc = GtHliFunction<void, const uint32_t*, uint32_t, CachelineProfArgs*>;
#endif

/*!
 * @brief HLI function that detects amount of cachelines accessed by a scatter SEND instruction to global memory done in A64 mode
 * @param[in]     addresses          Array of 64-bit addresses
 * @param[in]     accessMask         Per-channel mask of memory accesses
 * @param[in,out] CachelineProfArgs  Information about memory access instruction and the resulting cachelines histogram
 */
IGC_STACK_CALL void CheckScatterA64Access(__global const uint64_t* addresses,
                                          uint32_t accessMask,
                                          __global CachelineProfArgs* CachelineProfArgs);

#if defined(__cplusplus)
using CheckScatterA64AccessFunc = GtHliFunction<void, const uint64_t*, uint32_t, CachelineProfArgs*>;
#endif

/*!
 * @brief HLI function sets bits within a bitvector that correspond to accessed cachelines
 * @param[in,out] memoryFootprintArgsCombo Packed arguments of the call (see MemoryFootprintArgsCombo):
 *    - addresses            [in]      Array of 64-bit addresses
 *    - accessMask           [in]      Per-channel mask of memory accesses
 *    - dataSize             [in]      Data size per single access
 *    - accessType           [in]      Type of access
 *    - memoryFootprintArgs  [in,out]  Information about memory access
 *    - bitvector            [out]     Resulting per-cacheline bitvector
 */
IGC_STACK_CALL void GlobalMemoryFootprintA64Access(__global MemoryFootprintArgsCombo* memoryFootprintArgsCombo);

#if defined(__cplusplus)
// Template arguments follow the non-padding fields of MemoryFootprintArgsCombo, in declaration order
using GlobalMemoryFootprintA64AccessFunc = GtHliFunction<void, const uint64_t*, uint32_t, uint32_t, uint32_t, MemoryFootprintArgs*, uint8_t*>;
#endif

/*!
 * @brief HLI function sets bits within a bitvector that correspond to accessed cachelines for Block2D accesses
 * @param[in,out] memoryFootprintArgsCombo Packed arguments of the call (see MemoryFootprintArgsCombo):
 *    - addresses            [in]      Array of 64-bit addresses
 *    - accessMask           [in]      Per-channel mask of memory accesses
 *    - dataSize             [in]      Data size per single access
 *    - accessType           [in]      Type of access
 *    - memoryFootprintArgs  [in,out]  Information about memory access
 *    - bitvector            [out]     Resulting per-cacheline bitvector
 */
IGC_STACK_CALL void GlobalMemoryFootprintBlock2DAccess(__global MemoryFootprintArgsCombo* memoryFootprintArgsCombo);

#if defined(__cplusplus)
// Template arguments follow the non-padding fields of MemoryFootprintArgsCombo, in declaration order
using GlobalMemoryFootprintBlock2DAccessFunc = GtHliFunction<void, const uint64_t*, uint32_t, uint32_t, uint32_t, MemoryFootprintArgs*, uint8_t*>;
#endif


/*!
 * @brief HLI function that models cache and reports the number of hits and misses
 * @param[in,out] memoryCacheModelArgsCombo Packed arguments of the call (see MemoryCacheModelArgsCombo):
 *    - addresses             [in]      Array of 64-bit addresses
 *    - accessMask            [in]      Per-channel mask of memory accesses
 *    - dataSize              [in]      Data size per single access
 *    - memoryCacheModelArgs  [in,out]  Information about cache model
 *    - cacheTagsVector       [in]      Vector of cache buckets tags
 */
IGC_STACK_CALL void GlobalMemoryCacheModelA64Access(__global MemoryCacheModelArgsCombo* memoryCacheModelArgsCombo);

#if defined(__cplusplus)
// Template arguments follow the non-padding fields of MemoryCacheModelArgsCombo, in declaration order
using GlobalMemoryCacheModelA64AccessFunc = GtHliFunction<void, const uint64_t*, uint32_t, uint32_t, MemoryCacheModelArgs*, uint64_t*>;
#endif


/*!
 * @brief HLI function that models cache and reports the number of hits and misses
 *        (Block2D counterpart of GlobalMemoryCacheModelA64Access, as the function name indicates)
 * @param[in,out] memoryCacheModelArgsCombo Packed arguments of the call (see MemoryCacheModelArgsCombo):
 *    - addresses             [in]      Array of 64-bit addresses
 *    - accessMask            [in]      Per-channel mask of memory accesses
 *    - dataSize              [in]      Data size per single access
 *    - memoryCacheModelArgs  [in,out]  Information about cache model
 *    - cacheTagsVector       [in]      Vector of cache buckets tags
 */
IGC_STACK_CALL void GlobalMemoryCacheModelBlock2DAccess(__global MemoryCacheModelArgsCombo* memoryCacheModelArgsCombo);

#if defined(__cplusplus)
// Template arguments follow the non-padding fields of MemoryCacheModelArgsCombo, in declaration order
using GlobalMemoryCacheModelBlock2DAccessFunc = GtHliFunction<void, const uint64_t*, uint32_t, uint32_t, MemoryCacheModelArgs*, uint64_t*>;
#endif

/*!
 * @brief HLI function checks type of access (RaR, WaR, WaW, RaW) for A64 access
 * @param[in,out] memoryAccessAnalysisArgsCombo Packed arguments of the call (see MemoryAccessAnalysisArgsCombo):
 *    - addresses                 [in]      Array of 64-bit addresses
 *    - accessMask                [in]      Per-channel mask of memory accesses
 *    - dataSize                  [in]      Data size per single access
 *    - accessType                [in]      Type of access
 *    - memoryAccessAnalysisArgs  [in,out]  Information about memory access
 *    - bitvector                 [out]     Resulting per-cacheline bitvector
 */
IGC_STACK_CALL void GlobalMemoryAccessAnalysisA64Access(__global MemoryAccessAnalysisArgsCombo* memoryAccessAnalysisArgsCombo);

#if defined(__cplusplus)
// Template arguments follow the non-padding fields of MemoryAccessAnalysisArgsCombo, in declaration order
using GlobalMemoryAccessAnalysisA64AccessFunc = GtHliFunction<void, const uint64_t*, uint32_t, uint32_t, uint32_t, MemoryAccessAnalysisArgs*, uint32_t*>;
#endif

/*!
 * @brief HLI function checks type of access (RaR, WaR, WaW, RaW) for load_block2d/store_block2d messages
 * @param[in,out] memoryAccessAnalysisArgsCombo Packed arguments of the call (see MemoryAccessAnalysisArgsCombo):
 *    - addresses                 [in]      Array of 64-bit addresses
 *    - accessMask                [in]      Per-channel mask of memory accesses
 *    - dataSize                  [in]      Data size
 *    - accessType                [in]      Type of access
 *    - memoryAccessAnalysisArgs  [in,out]  Information about memory access
 *    - bitvector                 [out]     Resulting per-cacheline bitvector
 */
IGC_STACK_CALL void GlobalMemoryAccessAnalysisBlock2DAccess(__global MemoryAccessAnalysisArgsCombo* memoryAccessAnalysisArgsCombo);

#if defined(__cplusplus)
// NOTE(review): the last template argument is uint8_t* here, while MemoryAccessAnalysisArgsCombo::bitvector
// and GlobalMemoryAccessAnalysisA64AccessFunc use uint32_t* - confirm which element type is intended.
using GlobalMemoryAccessAnalysisBlock2DAccessFunc = GtHliFunction<void, const uint64_t*, uint32_t, uint32_t, uint32_t, MemoryAccessAnalysisArgs*, uint8_t*>;
#endif

/*!
 * @brief HLI function checks for WaW accesses from different HW threads for A64 access
 * @param[in,out] memoryRaceDetectionArgsCombo Packed arguments of the call (see MemoryRaceDetectionArgsCombo):
 *    - addresses                [in]      Array of 64-bit addresses
 *    - accessMask               [in]      Per-channel mask of memory accesses
 *    - dataSize                 [in]      Data size per single access
 *    - accessType               [in]      Type of access
 *    - hwThreadId               [in]      Presumably the ID of the issuing hardware thread - TODO confirm
 *    - memoryRaceDetectionArgs  [in,out]  Information about memory access
 *    - raceDataVector           [in]      Resulting data structure
 *    - bitvector                [out]     Resulting per-cacheline bitvector
 */
IGC_STACK_CALL void GlobalMemoryRaceDetectionA64Access(__global MemoryRaceDetectionArgsCombo* memoryRaceDetectionArgsCombo);

#if defined(__cplusplus)
// Template arguments follow the non-padding fields of MemoryRaceDetectionArgsCombo, in declaration order
using GlobalMemoryRaceDetectionA64AccessFunc = GtHliFunction<void, const uint64_t*, uint32_t, uint32_t, uint32_t, uint32_t, MemoryRaceDetectionArgs*, uint32_t*, uint32_t*>;
#endif

/*!
 * @brief HLI function for the combined memory access analysis and race detection of A64 accesses
 * @note NOTE(review): the description is inferred from the function name and from the rar/raw/war/waw and
 *       raceCount outputs of MemoryRaceDetectionArgs - confirm against the implementation.
 * @param[in,out] memoryRaceDetectionArgsCombo Packed arguments of the call (see MemoryRaceDetectionArgsCombo)
 */
IGC_STACK_CALL void GlobalMemoryAccessAnalysisAndRaceDetectionA64Access(__global MemoryRaceDetectionArgsCombo* memoryRaceDetectionArgsCombo);

#if defined(__cplusplus)
// Template arguments follow the non-padding fields of MemoryRaceDetectionArgsCombo, in declaration order
using GlobalMemoryAccessAnalysisAndRaceDetectionA64AccessFunc = GtHliFunction<void, const uint64_t*, uint32_t, uint32_t, uint32_t, uint32_t, MemoryRaceDetectionArgs*, uint32_t*, uint32_t*>;
#endif

#pragma pack(pop)

#endif // CACHELINEPROF_H_
/*========================== begin_copyright_notice ============================

Copyright (C) 2024-2026 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

/*!
 * @file A tool that profiles accessed cachelines
 * The following modes are supported (see enum MODE):
 * 1. Cacheline histogram - amount of cache lines accessed by the kernel by each SEND instruction. This mode supports all addressing modes.
 * 2. Memory footprint - detects cache lines accessed by the kernel per read and per write separately. This mode supports A64 addressing mode only.
 * 3. Memory cache model - models a direct-mapped cache and detects amount of cache hits and misses. This mode supports A64 addressing mode only.
 * 4. Memory access analysis - detects read-after-read, read-after-write, write-after-read, and write-after-write dependencies between memory accesses.
 *    This mode supports A64 addressing mode only.
 * 5. Memory race detection - detects write-after-write memory accesses from different hardware threads.
 * 6. Memory access analysis and race detection - NOTE(review): presumably modes 4 and 5 combined, judging by the
 *    enumerator name MODE_MEMORY_ACCESS_ANALYSIS_AND_RACE_DETECTION - confirm.
 */

#include <cstring>
#include <map>
#include <vector>
#include <list>
#include <set>
#include <algorithm>

#include "cachelineprof.h"
#include "gen_send_decoder.h"

#include "gtpin_api.h"
#include "gtpin_tool_utils.h"
#include "gt_basic_utils.h"
#include "ged.h"

/// Profiling modes; MODE_CACHE_HISTOGRAM is the default value of KNOB_MODE
enum MODE {
    MODE_CACHE_HISTOGRAM = 1,
    MODE_MEMORY_FOOTPRINT = 2,
    MODE_MEMORY_CACHE_MODEL = 3,
    MODE_MEMORY_ACCESS_ANALYSIS = 4,
    MODE_MEMORY_RACE_DETECTION = 5,
    MODE_MEMORY_ACCESS_ANALYSIS_AND_RACE_DETECTION = 6,
};

constexpr uint32_t ONE_KB = (1024ull);                          ///< Number of bytes in one kilobyte
constexpr uint64_t FOUR_GB = (4 * 1024ull * 1024ull * 1024ull); ///< Number of bytes in four gigabytes
constexpr bool hasComboParam = true; ///< NOTE(review): presumably selects passing a single packed *Combo argument to HLI functions - confirm at the use sites

/* ============================================================================================= */
// Configuration
/* ============================================================================================= */
Knob<bool> KNOB_NO_COUT("no_cout", false, "Do not send profiling results to standard output device");
Knob<bool> KNOB_IGNORE_SLM("ignore_slm", false, "Do not profile SLM accesses");
Knob<bool> KNOB_NO_CACHE_INVALIDATE("no_cache_invalidate", false, "No cache invalidation between enqueues");
Knob<bool> KNOB_ANALYSIS_INIT_TO_WRITE("init_to_write", false, "Initialize bitvector for analysis as write");
Knob<bool> KNOB_NO_INVALIDATE("no_invalidate", false, "No data invalidation between enqueues");
Knob<int> KNOB_MODE("mode", MODE_CACHE_HISTOGRAM, "Profiling mode");
Knob<int> KNOB_CACHE_SIZE("cache_size", 2048, "Size in KB of the cache (should be power of 2)");
Knob<int> KNOB_CACHELINE_SIZE("cacheline_size", 64, "Size in bytes of the cacheline (should be power of 2)");
Knob<int> KNOB_SUPPORTED_ADDRESS_BITS("supported_address_bits", 38, "Number of supported least significant bits within 64-bit address");
Knob<int> KNOB_CACHE_MODEL_HASH_SHIFT("cache_model_hash_shift", 0, "Amount of bits to shift left the cache hash");

// Tool-wide settings. All start at zero; the code that assigns them is outside this part of the file.
uint32_t gSupportedAddressBits = 0;
uint32_t gLog2CacheLineSize = 0;
size_t gBitVectorSizeInBytes = 0;
size_t gRaceDataVectorSize = 0;
size_t gNumOfCacheBuckets = 0;
uint32_t gCacheSize = 0;

using BitVector = std::vector<uint8_t>;                 ///< Byte storage of a bitvector
using RaceDetectionDataVector = std::vector<uint32_t>;  ///< Storage of race detection data
using CacheBucketTagsVector = std::vector<uint64_t>;    ///< Storage of cache bucket tags

/* ============================================================================================= */
// Class MemAccess
/* ============================================================================================= */
/// Information about memory access instruction
struct MemAccess
{
    /// Constructor. If memory access is unsupported, the reason can be queried by Error()
    explicit MemAccess(const IGtIns &ins);

    bool IsValid() const { return _isValid; }                        ///< @return true for a supported memory access
    InsId Id() const { return _insId; }                              ///< @return Instruction ID
    GtRegNum FirstAddrReg() const { return _firstAddrReg; }          ///< @return First register in the address payload
    uint32_t NumAccesses() const { return _numAccesses; }            ///< @return Number of elements in the address payload
    uint32_t DataSize() const { return _dataSize; }                  ///< @return Data size, in bytes, per each address
    bool IsSlm() const { return _isSlm; }                            ///< @return true for SLM access
    bool IsBlock2DAccess() const { return _isBlock2D; }              ///< @return true for block2d access
    const GtMemoryAddrModel& AddrModel() const { return _addrModel; } ///< @return Address model of the memory access
    const std::string& Error() const { return _errMsg; }             ///< @return Error message on unsupported memory access

    /*!
     * @return Arguments of the HLI function that compute cache line access histograms
     * @note This object owns the CachelineProfArgs structure, but its content is controlled externally
     */
    const CachelineProfArgs& GetCachelineProfArgs() const { return _clArgs; }
    CachelineProfArgs& GetCachelineProfArgs() { return _clArgs; }

private:
    bool _isValid = false;          ///< True, if this structure represents supported memory access
    InsId _insId;                   ///< ID of the memory access instruction
    GtMemoryAddrModel _addrModel;   ///< Address model of the memory access
    GtRegNum _firstAddrReg;         ///< First register in the address payload of the instruction
    uint32_t _numAccesses = 0;      ///< Number of elements in the address payload of the instruction
    uint32_t _dataSize = 0;         ///< Size, in bytes, of the memory range referenced by a single address
    CachelineProfArgs _clArgs;      ///< Common argument of cacheline profiling functions

    std::string _errMsg;            ///< Error message on unsupported memory access
    bool _isSlm = false;            ///< True if the memory access is to SLM
    bool _isBlock2D = false;        ///< True if the memory access is load_block2d or store_block2d
};

/* ============================================================================================= */
// Struct ProfileResults
/* ============================================================================================= */
/*!
 * Profile results per kernel dispatch / per instruction
 */
struct ProfileResults
{
    ProfileResults(const IGtKernelDispatch& dispatcher, const MemAccess& memAccess);

    uint64_t dispatchId;                                       ///< Unique ID of the kernel dispatch assigned by GTPin
    GtKernelExecDesc kernelExecDesc;                           ///< Kernel execution descriptor
    InsId insId;                                               ///< ID of the memory access instruction
    uint32_t clHist[MAX_SUPPORTED_CACHE_LINE_HISTOGRAM_SIZE];  ///< Cacheline histogram
    uint32_t outOfHistogramValues;                             ///< Counter how many times the value was out of histogram size
};

/// Profile results per kernel dispatch for memory footprint mode
struct ProfileResultsMemFootprint
{
    ProfileResultsMemFootprint(const IGtKernelDispatch& dispatcher, const MemoryFootprintArgs& memfootprintArg, BitVector& bitvector);

    uint64_t dispatchId;             ///< Unique ID of the kernel dispatch assigned by GTPin
    GtKernelExecDesc kernelExecDesc; ///< Kernel execution descriptor
    BitVector clBitvector;           ///< Cacheline bitvector
    uint64_t upperAddressBits;       ///< Upper bits of the addresses (assumed to be invariant for all kernel accesses per dispatch)
};

/// Profile results per kernel dispatch for memory cache model mode
struct ProfileResultsMemCacheModel
{
    ProfileResultsMemCacheModel(const IGtKernelDispatch& dispatcher, const MemoryCacheModelArgs& memCacheModelArg);

    uint64_t dispatchId;             ///< Unique ID of the kernel dispatch assigned by GTPin
    GtKernelExecDesc kernelExecDesc; ///< Kernel execution descriptor
    uint64_t hits;                   ///< Amount of counted cache hits
    uint64_t misses;                 ///< Amount of counted cache misses
};

/// Profile results per kernel dispatch for memory access analysis mode
struct ProfileResultsMemAccessAnalysis
{
    ProfileResultsMemAccessAnalysis(const IGtKernelDispatch& dispatcher, const MemoryAccessAnalysisArgs& memAccessAnalysisArg);

    uint64_t dispatchId;             ///< Unique ID of the kernel dispatch assigned by GTPin
    GtKernelExecDesc kernelExecDesc; ///< Kernel execution descriptor
    uint64_t rarCount;               ///< Number of Read-After-Read accesses
    uint64_t rawCount;               ///< Number of Read-After-Write accesses
    uint64_t warCount;               ///< Number of Write-After-Read accesses
    uint64_t wawCount;               ///< Number of Write-After-Write accesses
};

/// Profile results per kernel dispatch for memory race condition detection mode
struct ProfileResultsMemRaceDetection
{
    ProfileResultsMemRaceDetection(const IGtKernelDispatch& dispatcher, const MemoryRaceDetectionArgs& memRaceDetectionArg, BitVector& bitvector);

    uint64_t dispatchId;             ///< Unique ID of the kernel dispatch assigned by GTPin
    GtKernelExecDesc kernelExecDesc; ///< Kernel execution descriptor
    BitVector clBitvector;           ///< Cacheline bitvector
    uint64_t raceCount;              ///< Number of detected race conditions
    uint64_t rarCount;               ///< Number of Read-After-Read accesses
    uint64_t rawCount;               ///< Number of Read-After-Write accesses
    uint64_t warCount;               ///< Number of Write-After-Read accesses
    uint64_t wawCount;               ///< Number of Write-After-Write accesses
    uint64_t upperAddressBits;       ///< Upper bits of the addresses (assumed to be invariant for all kernel accesses per dispatch)
};

/* ============================================================================================= */
// Class KernelProfile
/* ============================================================================================= */
/// Static properties of the kernel, and its profile data updated on each kernel run
class KernelProfile
{
public:
    using MemAccessMap = std::map<InsId, MemAccess>; ///< Information about memory accesses by kernel instructions

public:
    KernelProfile(const IGtKernel& kernel, const IGtCfg& cfg); ///< Constructor
    ~KernelProfile() = default;

    inline GtKernelId Id() const;                                   ///< @return Unique identifier of the kernel
    inline GtGpuPlatform Platform() const;                          ///< @return Kernel's platform
    inline const std::string& Name() const;                         ///< @return Name of the kernel
    inline const std::string& UniqueName() const;                   ///< @return Unique name of the kernel
    inline std::string CachelineProfResults() const;                ///< @return Profiling results of kernel runs, in text format
    inline void DumpAsm() const;                                    ///< Store kernel's assembly text in the file
    inline MemAccessMap& GetMemAccessMap();                         ///< @return Information about memory accesses in the kernel
    inline BitVector& GetMemoryFootprintBitvector();                ///< @return memory footprint bitvector
    inline RaceDetectionDataVector& GetRaceDetectionDataVector();   ///< @return race detection data vector
    inline CacheBucketTagsVector& GetCacheBucketTagsVector();       ///< @return memory cache bucket tags vector
    inline MemoryFootprintArgs& GetMemoryFootprintArg();            ///< @return argument for memory footprint
    inline MemoryCacheModelArgs& GetMemoryCacheModelArg();          ///< @return argument for memory cache model
    inline MemoryAccessAnalysisArgs& GetMemoryAccessAnalysisArg();  ///< @return argument for memory access analysis
    inline MemoryRaceDetectionArgs& GetMemoryRaceDetectionArg();    ///< @return argument for memory race detection


    void RecordCachelineProfResults(IGtKernelDispatch& dispatcher);                  ///< Update profile data with the latest memory check results
    void RecordUnsupportedInstruction(const IGtIns& ins, const std::string& errMsg); ///< Record unsupported instruction

private:
    GtKernelId _id;                 ///< Unique identifier of the kernel
    GtGpuPlatform _platform;        ///< Kernel's platform
    std::string _name;              ///< Name of the kernel
    std::string _uniqueName;        ///< Unique name of the kernel
    std::string _asmText;           ///< Assembly text of the kernel
    std::string _unhandledAccesses; ///< List of unhandled memory accesses, in text format

    MemAccessMap _memAccessMap;                                                  ///< Map: Instruction ID to memory access information
    MemoryFootprintArgs _memFootprintArgs;                                       ///< Argument for memory footprint mode
    MemoryCacheModelArgs _memCacheModelArgs;                                     ///< Argument for memory cache model mode
    MemoryAccessAnalysisArgs _memAccessAnalysisArgs;                             ///< Argument for memory access analysis mode
    MemoryRaceDetectionArgs _memRaceDetectionArgs;                               ///< Argument for memory race condition detection mode
    BitVector _memFootprintBitvector;                                            ///< Bitvector for memory footprint
    RaceDetectionDataVector _memRaceDetectionDataVector;                         ///< Vector of race detection data
    CacheBucketTagsVector _memCacheBucketTagsVector;                             ///< Vector of cache buckets tags
    std::list<ProfileResults> _profileResultsHist;                               ///< Profile results per kernel dispatch / per instruction
    std::list<ProfileResultsMemFootprint> _profileResultsMemFootprint;           ///< Profile results per kernel dispatch for memory footprint mode
    std::list<ProfileResultsMemCacheModel> _profileResultsMemCacheModel;         ///< Profile results per kernel dispatch for memory cache model mode
    std::list<ProfileResultsMemAccessAnalysis> _profileResultsMemAccessAnalysis; ///< Profile results per kernel dispatch for memory access analysis mode
    std::list<ProfileResultsMemRaceDetection>
        _profileResultsMemRaceDetection; ///< Profile results per kernel dispatch for memory race condition detection mode
};

/* ============================================================================================= */
// Class CachelineProf
/* ============================================================================================= */
/*!
00234 * A tool that detects amount of cache lines accessed by kernels 00235 */ 00236 class CachelineProf : public GtTool 00237 { 00238 public: 00239 // Implementation of the IGtTool interface 00240 const char* Name() const override { return "Cachelineprof"; } 00241 void OnKernelBuild(IGtKernelInstrument&) override; 00242 void OnKernelRun(IGtKernelDispatch&) override; 00243 void OnKernelComplete(IGtKernelDispatch&) override; 00244 00245 void LoadHliLibrary(); ///< Compile and load library of HLI functions 00246 static CachelineProf* Instance(); ///< Return single instance of this class 00247 static void OnFini() { Instance()->Fini(); } ///< Termination handler registered with atexit() 00248 00249 private: 00250 00251 CachelineProf(); ///< Default constructor 00252 CachelineProf(const CachelineProf&) = delete; ///< Disabled copy constructor 00253 CachelineProf& operator = (const CachelineProf&) = delete; ///< Disabled assignment operator 00254 ~CachelineProf(); ///< Destructor 00255 void Fini(); ///< Post process and dump profiling data 00256 00257 /*! 
00258 * Insert a call to HLI function that detects amount of cache lines accessed by the specified instruction 00259 * @param[in] ins The memory access instruction 00260 * @param[in] memAccess Information about memory access 00261 * @param[in] instrumentor Instrumentation interface 00262 * @return true - success, false - the instruction or memory operation is not supported 00263 */ 00264 bool InsertCachelineProf(const IGtIns &ins, const MemAccess& memAccess, IGtKernelInstrument& instrumentor); 00265 bool InsertMemoryFootprintProf(const IGtIns& ins, 00266 const MemAccess& memAccess, 00267 const MemoryFootprintArgs& memfootprintArg, 00268 const BitVector& bitvector, 00269 IGtKernelInstrument& instrumentor); 00270 bool InsertMemoryCacheModelProf(const IGtIns& ins, 00271 const MemAccess& memAccess, 00272 const MemoryCacheModelArgs& memCacheModelArgs, 00273 const CacheBucketTagsVector& cacheBucketTagsVector, 00274 IGtKernelInstrument& instrumentor); 00275 bool InsertMemoryAccessAnalysisProf(const IGtIns& ins, 00276 const MemAccess& memAccess, 00277 const MemoryAccessAnalysisArgs& memAccessAnalysisArg, 00278 const BitVector& bitvector, 00279 IGtKernelInstrument& instrumentor); 00280 bool InsertRaceDetectionProf(const IGtIns& ins, 00281 const MemAccess& memAccess, 00282 const MemoryRaceDetectionArgs& memRaceDetectionArg, 00283 const RaceDetectionDataVector& raceDetectionDataVector, 00284 const BitVector& bitvector, 00285 IGtKernelInstrument& instrumentor); 00286 00287 private: 00288 // Cacheline profiling functions 00289 CheckScatterA32AccessFunc _checkScatterA32AccessFunc; 00290 CheckScatterA64AccessFunc _checkScatterA64AccessFunc; 00291 00292 GlobalMemoryFootprintA64AccessFunc _globalMemoryFootprintA64AccessFunc; 00293 GlobalMemoryFootprintBlock2DAccessFunc _globalMemoryFootprintBlock2DAccessFunc; 00294 00295 GlobalMemoryCacheModelA64AccessFunc _globalMemoryCacheModelA64AccessFunc; 00296 GlobalMemoryCacheModelBlock2DAccessFunc _globalMemoryCacheModelBlock2DAccessFunc; 
00297 00298 GlobalMemoryAccessAnalysisA64AccessFunc _globalMemoryAccessAnalysisA64AccessFunc; 00299 GlobalMemoryAccessAnalysisBlock2DAccessFunc _globalMemoryAccessAnalysisBlock2DAccessFunc; 00300 00301 GlobalMemoryRaceDetectionA64AccessFunc _globalMemoryRaceDetectionA64AccessFunc; 00302 GlobalMemoryAccessAnalysisAndRaceDetectionA64AccessFunc _globalMemoryAccessAnalysisAndRaceDetectionA64AccessFunc; 00303 00304 IGtHliModuleHandle _hliModule = nullptr; ///< Module of HLI functions 00305 std::map<GtKernelId, KernelProfile> _kernels; ///< Collection of kernel profiles 00306 }; 00307 00308 /* ============================================================================================= */ 00309 // MemAccess implementation 00310 /* ============================================================================================= */ 00311 MemAccess::MemAccess(const IGtIns &ins) : _insId(ins.Id()) 00312 { 00313 // Get and check data port (SFID) 00314 GtSfid sfid = ins.Sfid(); 00315 if ((sfid != GED_SFID_UGM) && (sfid != GED_SFID_SLM) && (sfid != GED_SFID_DP_DC0) && (sfid != GED_SFID_DP_DC1)) 00316 { 00317 _errMsg = "Unsupported data port " + std::string(sfid.ToString()); 00318 return; 00319 } 00320 00321 _isSlm = (sfid == GED_SFID_SLM); 00322 if (_isSlm && (KNOB_IGNORE_SLM || (KNOB_MODE == MODE_MEMORY_FOOTPRINT) || (KNOB_MODE == MODE_MEMORY_CACHE_MODEL) || (KNOB_MODE == MODE_MEMORY_ACCESS_ANALYSIS))) 00323 { 00324 _errMsg = "SLM access - ignored"; 00325 return; 00326 } 00327 00328 // Retrieve message descriptor 00329 if (!ins.MsgDescRegFile().IsImm()) 00330 { 00331 _errMsg = "SEND message descriptor is not immediate"; 00332 return; 00333 } 00334 00335 // Initialize address model 00336 _addrModel = ins.MemAddrModel(); 00337 if (!_addrModel.IsValid()) 00338 { 00339 _errMsg = "Unsupported/unknown address model"; 00340 return; 00341 } 00342 00343 if (((KNOB_MODE == MODE_MEMORY_FOOTPRINT) || (KNOB_MODE == MODE_MEMORY_CACHE_MODEL) || (KNOB_MODE == MODE_MEMORY_ACCESS_ANALYSIS)) && 
!_addrModel.IsA64()) 00344 { 00345 _errMsg = "Memory footprint, cache model and memory access analysis modes are supported for A64 access mode only"; 00346 return; 00347 } 00348 00349 // Finally, initialize the rest of data members... 00350 GTPIN_ASSERT(ins.SrcRegFile(0).IsGrf()); 00351 _firstAddrReg = ins.SrcRegOperand(0).Reg().RegNum(); 00352 _numAccesses = ins.NumAccesses(); GTPIN_ASSERT(_numAccesses != 0); 00353 _isBlock2D = ins.IsBlock2DAccess(); 00354 00355 DcSendMsg msg = DcSendMsg::Decode(ins.GetGedIns()); 00356 _dataSize = msg.ElementSize() * msg.NumElements(); 00357 00358 if (_dataSize == 0) 00359 { 00360 _errMsg = "Unsupported SEND operation"; 00361 return; 00362 } 00363 00364 _isValid = true; 00365 } 00366 00367 /* ============================================================================================= */ 00368 // ProfileResults implementation 00369 /* ============================================================================================= */ 00370 ProfileResults::ProfileResults(const IGtKernelDispatch& dispatcher, const MemAccess& memAccess) : 00371 dispatchId(dispatcher.DispatchId()), insId(memAccess.Id()) 00372 { 00373 auto& clArg = memAccess.GetCachelineProfArgs(); 00374 memcpy(&clHist, clArg.out.cachelineHistogram, sizeof(clArg.out.cachelineHistogram)); 00375 outOfHistogramValues = clArg.out.outOfHistogramValues; 00376 dispatcher.GetExecDescriptor(kernelExecDesc); 00377 } 00378 00379 ProfileResultsMemFootprint::ProfileResultsMemFootprint(const IGtKernelDispatch& dispatcher, const MemoryFootprintArgs& arg, BitVector& bitvector) : 00380 dispatchId(dispatcher.DispatchId()) 00381 { 00382 clBitvector.assign(bitvector.begin(), bitvector.end()); 00383 upperAddressBits = arg.out.upperAddressBits; 00384 dispatcher.GetExecDescriptor(kernelExecDesc); 00385 } 00386 00387 ProfileResultsMemCacheModel::ProfileResultsMemCacheModel(const IGtKernelDispatch& dispatcher, const MemoryCacheModelArgs& memCacheModelArg) : 00388 
dispatchId(dispatcher.DispatchId()) 00389 { 00390 hits = memCacheModelArg.out.hits; 00391 misses = memCacheModelArg.out.misses; 00392 dispatcher.GetExecDescriptor(kernelExecDesc); 00393 } 00394 00395 ProfileResultsMemAccessAnalysis::ProfileResultsMemAccessAnalysis(const IGtKernelDispatch& dispatcher, const MemoryAccessAnalysisArgs& memAccessAnalysisArg) : 00396 dispatchId(dispatcher.DispatchId()) 00397 { 00398 rarCount = memAccessAnalysisArg.out.rarCount; 00399 rawCount = memAccessAnalysisArg.out.rawCount; 00400 warCount = memAccessAnalysisArg.out.warCount; 00401 wawCount = memAccessAnalysisArg.out.wawCount; 00402 dispatcher.GetExecDescriptor(kernelExecDesc); 00403 } 00404 00405 ProfileResultsMemRaceDetection::ProfileResultsMemRaceDetection(const IGtKernelDispatch& dispatcher, const MemoryRaceDetectionArgs& memRaceDetectionArg, BitVector& bitvector) : 00406 dispatchId(dispatcher.DispatchId()) 00407 { 00408 clBitvector.assign(bitvector.begin(), bitvector.end()); 00409 raceCount = memRaceDetectionArg.out.raceCount; 00410 rarCount = memRaceDetectionArg.out.rarCount; 00411 rawCount = memRaceDetectionArg.out.rawCount; 00412 warCount = memRaceDetectionArg.out.warCount; 00413 wawCount = memRaceDetectionArg.out.wawCount; 00414 upperAddressBits = memRaceDetectionArg.out.upperAddressBits; 00415 dispatcher.GetExecDescriptor(kernelExecDesc); 00416 } 00417

/* ============================================================================================= */
// KernelProfile implementation
/* ============================================================================================= */
/*!
 * Create the profile of a kernel: remember its identity and assembly text, collect its
 * memory access instructions and size the buffers used by the selected profiling mode.
 * @param[in] kernel   The kernel being profiled
 * @param[in] cfg      Control flow graph of the kernel
 */
KernelProfile::KernelProfile(const IGtKernel& kernel, const IGtCfg& cfg) :
    _id(kernel.Id()), _platform(kernel.GpuPlatform()), _name(GlueString(kernel.Name())), _uniqueName(kernel.UniqueName()),
    _asmText(CfgAsmText(cfg))
{
    // Walk all instructions; every memory access other than EOT is either registered
    // in the access map or recorded as unsupported together with the reason
    for (auto bblPtr : cfg.Bbls())
    {
        for (auto insPtr : bblPtr->Instructions())
        {
            const IGtIns& ins = *insPtr;
            if (!ins.IsMemAccess() || ins.IsEot())
            {
                continue;
            }

            MemAccess memAccess(ins);
            if (!memAccess.IsValid())
            {
                RecordUnsupportedInstruction(ins, memAccess.Error());
                continue;
            }
            _memAccessMap.emplace(ins.Id(), memAccess);
        }
    }

    // Size the mode-specific buffers (the histogram mode needs none)
    switch (KNOB_MODE)
    {
        case MODE_MEMORY_FOOTPRINT:
            _memFootprintBitvector.resize(gBitVectorSizeInBytes, 0);
            break;

        case MODE_MEMORY_CACHE_MODEL:
            _memCacheBucketTagsVector.resize(gNumOfCacheBuckets, 0);
            break;

        case MODE_MEMORY_ACCESS_ANALYSIS:
            _memFootprintBitvector.resize(gBitVectorSizeInBytes, KNOB_ANALYSIS_INIT_TO_WRITE ? 0xFF : 0);
            break;

        case MODE_MEMORY_RACE_DETECTION:
        case MODE_MEMORY_ACCESS_ANALYSIS_AND_RACE_DETECTION:
            _memFootprintBitvector.resize(gBitVectorSizeInBytes, 0);
            _memRaceDetectionDataVector.resize(gRaceDataVectorSize, 0xFFFFFFFF);
            break;

        default:
            break;
    }
}

00468 00469 GtKernelId KernelProfile::Id() const { return _id; } 00470 GtGpuPlatform KernelProfile::Platform() const { return _platform; } 00471 const std::string& KernelProfile::Name() const { return _name; } 00472 const std::string& KernelProfile::UniqueName() const { return _uniqueName; } 00473 void KernelProfile::DumpAsm() const { DumpKernelAsmText(_name, _uniqueName, _asmText); } 00474 KernelProfile::MemAccessMap& KernelProfile::GetMemAccessMap() { return _memAccessMap; } 00475 BitVector& KernelProfile::GetMemoryFootprintBitvector() { return _memFootprintBitvector; } 00476 RaceDetectionDataVector& KernelProfile::GetRaceDetectionDataVector() { return _memRaceDetectionDataVector; } 00477 CacheBucketTagsVector& KernelProfile::GetCacheBucketTagsVector() { return _memCacheBucketTagsVector; } 00478 MemoryFootprintArgs& KernelProfile::GetMemoryFootprintArg() { return _memFootprintArgs; } 00479 
MemoryCacheModelArgs& KernelProfile::GetMemoryCacheModelArg() { return _memCacheModelArgs; } 00480 MemoryAccessAnalysisArgs& KernelProfile::GetMemoryAccessAnalysisArg() { return _memAccessAnalysisArgs; } 00481 MemoryRaceDetectionArgs& KernelProfile::GetMemoryRaceDetectionArg() { return _memRaceDetectionArgs; } 00482 00483 00484 std::string KernelProfile::CachelineProfResults() const 00485 { 00486 std::ostringstream os; 00487 00488 bool emptyResults = true; 00489 00490 switch (KNOB_MODE) { 00491 case MODE_CACHE_HISTOGRAM: emptyResults = _profileResultsHist.empty(); break; 00492 case MODE_MEMORY_FOOTPRINT: emptyResults = _profileResultsMemFootprint.empty(); break; 00493 case MODE_MEMORY_CACHE_MODEL: emptyResults = _profileResultsMemCacheModel.empty(); break; 00494 case MODE_MEMORY_ACCESS_ANALYSIS: emptyResults = _profileResultsMemAccessAnalysis.empty(); break; 00495 case MODE_MEMORY_RACE_DETECTION: 00496 case MODE_MEMORY_ACCESS_ANALYSIS_AND_RACE_DETECTION: 00497 emptyResults = _profileResultsMemRaceDetection.empty(); break; 00498 default: break; 00499 } 00500 00501 if (emptyResults) 00502 { 00503 return os.str(); 00504 } 00505 00506 os << std::string(120, '-') << std::endl; 00507 os << std::setw(4) << _id << ": " << _name << "___" << _uniqueName << std::endl; 00508 os << std::string(120, '-') << std::endl; 00509 00510 if (KNOB_MODE == MODE_CACHE_HISTOGRAM) 00511 { 00512 for (const auto& res : _profileResultsHist) 00513 { 00514 bool clFound = false; 00515 for (uint32_t i = 0; i < MAX_SUPPORTED_CACHE_LINE_HISTOGRAM_SIZE; i++) 00516 { 00517 if (res.clHist[i]) 00518 { 00519 clFound = true; 00520 break; 00521 } 00522 } 00523 if (clFound || res.outOfHistogramValues) { 00524 os << std::setw(10) << "Dispatch Id = " << std::dec << res.dispatchId << std::setw(20) << " Instruction Id = " << res.insId << std::endl; 00525 00526 if (clFound) 00527 { 00528 for (uint32_t i = 0; i < MAX_SUPPORTED_CACHE_LINE_HISTOGRAM_SIZE; i++) 00529 { 00530 if (res.clHist[i]) 00531 { 00532 os << 
std::setw(20) << std::setw(4) << i << " cachelines were accessed " << std::setw(10) << res.clHist[i] << " times" << std::endl; 00533 } 00534 } 00535 } 00536 if (res.outOfHistogramValues) 00537 { 00538 os << std::setw(20) << "A value bigger than the histogram size was detected " << std::setw(10) << res.outOfHistogramValues << " times" << std::endl; 00539 } 00540 } 00541 } 00542 } 00543 00544 struct CacheLine { 00545 uint64_t start; 00546 uint64_t end; 00547 CacheLine(uint64_t s, uint64_t e) : start(s), end(e) {} 00548 }; 00549 00550 if (KNOB_MODE == MODE_MEMORY_FOOTPRINT) 00551 { 00552 for (const auto& res : _profileResultsMemFootprint) 00553 { 00554 uint32_t cacheLineSize = KNOB_CACHELINE_SIZE; 00555 00556 auto GetCacheLines = [&](size_t size, uint8_t* reads, uint8_t* writes, std::list<CacheLine>& readOnly, std::list<CacheLine>& writesOnly, std::list<CacheLine>& readWrites) 00557 { 00558 for (uint32_t i = 0; i < size; i++) 00559 { 00560 uint8_t readByte = reads[i]; 00561 uint8_t writeByte = writes ? 
writes[i] : 0; 00562 00563 if (readByte == 0 && writeByte == 0) 00564 { 00565 continue; 00566 } 00567 for (uint32_t j = 0; j < 8; j++) 00568 { 00569 // we assume the upper bits of the addresses are the same for all kernel accesses in the specific dispatch 00570 uint64_t start = ((uint64_t)i * cacheLineSize * 8 + j * cacheLineSize) | res.upperAddressBits; 00571 uint64_t end = start + cacheLineSize; 00572 00573 uint8_t readBit = readByte & (1 << j); 00574 uint8_t writeBit = writeByte & (1 << j); 00575 if (readBit && writeBit) 00576 { 00577 readWrites.push_back({ start, end }); 00578 } 00579 else if (readBit) 00580 { 00581 readOnly.push_back({ start, end }); 00582 } 00583 else if (writeBit) 00584 { 00585 writesOnly.push_back({ start, end }); 00586 } 00587 } 00588 } 00589 }; 00590 00591 auto PrintAccessedLines = [&](std::string title, std::list<CacheLine>& accessedLines) 00592 { 00593 if (accessedLines.empty()) 00594 { 00595 return; 00596 } 00597 CacheLine first = accessedLines.front(); 00598 accessedLines.pop_front(); 00599 00600 uint64_t totalAccessedSize = 0; 00601 00602 os << title << std::endl; 00603 00604 while (!accessedLines.empty()) 00605 { 00606 CacheLine next = accessedLines.front(); 00607 accessedLines.pop_front(); 00608 00609 if (first.end == next.start) 00610 { 00611 first.end = next.end; 00612 } 00613 else 00614 { 00615 totalAccessedSize += first.end - first.start; 00616 os << " " << std::hex << first.start << " - " << first.end << std::endl; 00617 first = next; 00618 } 00619 } 00620 totalAccessedSize += first.end - first.start; 00621 os << " " << std::hex << first.start << " - " << first.end << std::endl; 00622 os << " " << "Total accessed size = 0x" << totalAccessedSize << std::endl; 00623 }; 00624 00625 os << std::setw(10) << "\nDispatch Id = " << std::dec << res.dispatchId << std::endl; 00626 00627 size_t size = gBitVectorSizeInBytes >> 1; 00628 00629 std::list<CacheLine> accessedLinesReadsOnly; 00630 std::list<CacheLine> accessedLinesWritesOnly; 
00631 std::list<CacheLine> accessedLinesReadWrites; 00632 00633 uint8_t* reads = (uint8_t*)res.clBitvector.data(); 00634 uint8_t* writes = reads + size; 00635 00636 GetCacheLines(size, reads, writes, accessedLinesReadsOnly, accessedLinesWritesOnly, accessedLinesReadWrites); 00637 00638 if (accessedLinesReadsOnly.empty() && accessedLinesWritesOnly.empty() && accessedLinesReadWrites.empty()) 00639 { 00640 os << "Total accessed size = 0" << std::endl; 00641 continue; 00642 } 00643 00644 PrintAccessedLines("Reads cachelines:", accessedLinesReadsOnly); 00645 PrintAccessedLines("Writes cachelines:", accessedLinesWritesOnly); 00646 PrintAccessedLines("Read-Write cachelines:", accessedLinesReadWrites); 00647 } 00648 } 00649 00650 if (KNOB_MODE == MODE_MEMORY_CACHE_MODEL) 00651 { 00652 uint64_t totalHits = 0; 00653 uint64_t totalMisses = 0; 00654 00655 for (const auto& res : _profileResultsMemCacheModel) 00656 { 00657 os << std::setw(10) << "Dispatch Id = " << std::dec << res.dispatchId << std::endl; 00658 os << " Cache hits = " << std::dec << res.hits << std::endl; 00659 os << " Cache misses = " << std::dec << res.misses << std::endl; 00660 00661 totalHits += res.hits; 00662 totalMisses += res.misses; 00663 } 00664 00665 os << "=====================" << std::endl; 00666 os << "Total cache hits = " << std::dec << totalHits << std::endl; 00667 os << "Total cache misses = " << std::dec << totalMisses << std::endl; 00668 } 00669 00670 if (KNOB_MODE == MODE_MEMORY_ACCESS_ANALYSIS) 00671 { 00672 uint64_t totalRAR = 0; 00673 uint64_t totalRAW = 0; 00674 uint64_t totalWAR = 0; 00675 uint64_t totalWAW = 0; 00676 for (const auto& res : _profileResultsMemAccessAnalysis) 00677 { 00678 os << std::setw(10) << "Dispatch Id = " << std::dec << res.dispatchId << std::endl; 00679 os << " Read-After-Read accesses = " << std::dec << res.rarCount << std::endl; 00680 os << " Read-After-Write accesses = " << std::dec << res.rawCount << std::endl; 00681 os << " Write-After-Read accesses = " << 
std::dec << res.warCount << std::endl; 00682 os << " Write-After-Write accesses = " << std::dec << res.wawCount << std::endl; 00683 00684 totalRAR += res.rarCount; 00685 totalRAW += res.rawCount; 00686 totalWAR += res.warCount; 00687 totalWAW += res.wawCount; 00688 } 00689 00690 os << "=====================" << std::endl; 00691 os << "Total Read-After-Read accesses = " << std::dec << totalRAR << std::endl; 00692 os << "Total Read-After-Write accesses = " << std::dec << totalRAW << std::endl; 00693 os << "Total Write-After-Read accesses = " << std::dec << totalWAR << std::endl; 00694 os << "Total Write-After-Write accesses = " << std::dec << totalWAW << std::endl; 00695 } 00696 00697 if (KNOB_MODE == MODE_MEMORY_RACE_DETECTION || KNOB_MODE == MODE_MEMORY_ACCESS_ANALYSIS_AND_RACE_DETECTION) 00698 { 00699 uint64_t totalDetectedRaces = 0; 00700 for (const auto& res : _profileResultsMemRaceDetection) 00701 { 00702 auto PrintAccessedLines = [&](std::string title, std::list<CacheLine>& accessedLines) 00703 { 00704 if (accessedLines.empty()) 00705 { 00706 return; 00707 } 00708 00709 os << title << std::endl; 00710 00711 while (!accessedLines.empty()) 00712 { 00713 CacheLine cl = accessedLines.front(); 00714 accessedLines.pop_front(); 00715 os << " " << std::hex << cl.start << std::endl; 00716 } 00717 }; 00718 00719 os << std::setw(10) << "Dispatch Id = " << std::dec << res.dispatchId << std::endl; 00720 00721 if (KNOB_MODE == MODE_MEMORY_ACCESS_ANALYSIS_AND_RACE_DETECTION) 00722 { 00723 os << " Read-After-Read accesses = " << std::dec << res.rarCount << std::endl; 00724 os << " Read-After-Write accesses = " << std::dec << res.rawCount << std::endl; 00725 os << " Write-After-Read accesses = " << std::dec << res.warCount << std::endl; 00726 os << " Write-After-Write accesses = " << std::dec << res.wawCount << std::endl; 00727 } 00728 00729 if (res.raceCount == 0) 00730 { 00731 os << " No race conditions detected" << std::endl; 00732 continue; 00733 } 00734 00735 os << " Race 
conditions detected = " << std::dec << res.raceCount << std::endl << std::endl; 00736 00737 std::list<CacheLine> cacheLines; 00738 const uint8_t* bitvector = res.clBitvector.data(); 00739 uint32_t cacheLineSize = KNOB_CACHELINE_SIZE; 00740 00741 for (uint32_t i = 0; i < res.clBitvector.size(); i++) 00742 { 00743 uint8_t byte = bitvector[i]; 00744 00745 if (byte == 0) 00746 { 00747 continue; 00748 } 00749 for (uint32_t j = 0; j < 8; j++) 00750 { 00751 if (!(byte & (1 << j))) 00752 { 00753 continue; 00754 } 00755 // we assume the upper bits of the addresses are the same for all kernel accesses in the specific dispatch 00756 uint64_t start = ((uint64_t)i * cacheLineSize * 8 + j * cacheLineSize) | res.upperAddressBits; 00757 uint64_t end = start + cacheLineSize; 00758 00759 cacheLines.push_back({ start, end }); 00760 } 00761 } 00762 00763 PrintAccessedLines("Races detected at these addresses:", cacheLines); 00764 00765 totalDetectedRaces += res.raceCount; 00766 } 00767 00768 os << "=====================" << std::endl; 00769 os << "Total Race conditions detected = " << std::dec << totalDetectedRaces << std::endl; 00770 } 00771 00772 return os.str(); 00773 } 00774 00775 void KernelProfile::RecordCachelineProfResults(IGtKernelDispatch& dispatcher) 00776 { 00777 if (KNOB_MODE == MODE_CACHE_HISTOGRAM) 00778 { 00779 for (auto& entry : _memAccessMap) 00780 { 00781 MemAccess& memAccess = entry.second; 00782 _profileResultsHist.emplace_back(dispatcher, memAccess); 00783 } 00784 } 00785 if (KNOB_MODE == MODE_MEMORY_FOOTPRINT) 00786 { 00787 _profileResultsMemFootprint.emplace_back(dispatcher, _memFootprintArgs, _memFootprintBitvector); 00788 } 00789 if (KNOB_MODE == MODE_MEMORY_CACHE_MODEL) 00790 { 00791 _profileResultsMemCacheModel.emplace_back(dispatcher, _memCacheModelArgs); 00792 } 00793 if (KNOB_MODE == MODE_MEMORY_ACCESS_ANALYSIS) 00794 { 00795 _profileResultsMemAccessAnalysis.emplace_back(dispatcher, _memAccessAnalysisArgs); 00796 } 00797 if (KNOB_MODE == 
MODE_MEMORY_RACE_DETECTION || KNOB_MODE == MODE_MEMORY_ACCESS_ANALYSIS_AND_RACE_DETECTION) 00798 { 00799 _profileResultsMemRaceDetection.emplace_back(dispatcher, _memRaceDetectionArgs, _memFootprintBitvector); 00800 } 00801 } 00802 00803 void KernelProfile::RecordUnsupportedInstruction(const IGtIns& ins, const std::string& errMsg) 00804 { 00805 if (!errMsg.empty()) 00806 { 00807 std::ostringstream os; 00808 os << errMsg << ": [" << std::setw(3) << ins.Id() << "] " << ins.ToString() << std::endl; 00809 _unhandledAccesses.append(os.str()); 00810 } 00811 } 00812 00813 /* ============================================================================================= */ 00814 // CachelineProf implementation 00815 /* ============================================================================================= */ 00816 CachelineProf::CachelineProf() : 00817 _checkScatterA32AccessFunc("CheckScatterA32Access"), 00818 _checkScatterA64AccessFunc("CheckScatterA64Access"), 00819 _globalMemoryFootprintA64AccessFunc("GlobalMemoryFootprintA64Access", GtHliCallStd::IGC_STACK, hasComboParam), 00820 _globalMemoryFootprintBlock2DAccessFunc("GlobalMemoryFootprintBlock2DAccess", GtHliCallStd::IGC_STACK, hasComboParam), 00821 _globalMemoryCacheModelA64AccessFunc("GlobalMemoryCacheModelA64Access", GtHliCallStd::IGC_STACK, hasComboParam), 00822 _globalMemoryCacheModelBlock2DAccessFunc("GlobalMemoryCacheModelBlock2DAccess", GtHliCallStd::IGC_STACK, hasComboParam), 00823 _globalMemoryAccessAnalysisA64AccessFunc("GlobalMemoryAccessAnalysisA64Access", GtHliCallStd::IGC_STACK, hasComboParam), 00824 _globalMemoryAccessAnalysisBlock2DAccessFunc("GlobalMemoryAccessAnalysisBlock2DAccess", GtHliCallStd::IGC_STACK, hasComboParam), 00825 _globalMemoryRaceDetectionA64AccessFunc("GlobalMemoryRaceDetectionA64Access", GtHliCallStd::IGC_STACK, hasComboParam), 00826 _globalMemoryAccessAnalysisAndRaceDetectionA64AccessFunc("GlobalMemoryAccessAnalysisAndRaceDetectionA64Access", GtHliCallStd::IGC_STACK, 
hasComboParam) 00827 {} 00828 00829 CachelineProf::~CachelineProf() {} 00830 00831 CachelineProf* CachelineProf::Instance() 00832 { 00833 static CachelineProf instance; 00834 return &instance; 00835 } 00836 00837 void CachelineProf::OnKernelBuild(IGtKernelInstrument& instrumentor) 00838 { 00839 const IGtKernel& kernel = instrumentor.Kernel(); 00840 const IGtCfg& cfg = instrumentor.Cfg(); 00841 IGtMemoryMapper& memMapper = instrumentor.MemoryMapper(); 00842 00843 // Create profile for this kernel 00844 auto result = _kernels.emplace(std::piecewise_construct, 00845 std::forward_as_tuple(instrumentor.Kernel().Id()), 00846 std::forward_as_tuple(kernel, cfg)); 00847 GTPIN_ASSERT(result.second); 00848 00849 KernelProfile& kernelProfile = result.first->second; 00850 00851 // Instrument memory accesses and share per-access arguments with HLI functions 00852 for (const auto& entry : kernelProfile.GetMemAccessMap()) 00853 { 00854 const auto& memAccess = entry.second; 00855 auto insId = entry.first; 00856 00857 if (int32_t(insId) < knobMinInstrumentIns || knobMaxInstrumentIns < int32_t(insId)) 00858 { 00859 continue; 00860 } 00861 const IGtIns& ins = cfg.GetInstruction(insId); 00862 00863 if (memAccess.IsValid()) 00864 { 00865 switch (KNOB_MODE) { 00866 case MODE_CACHE_HISTOGRAM: 00867 { 00868 InsertCachelineProf(ins, memAccess, instrumentor); 00869 // Share per-access HLI arguments. 
00870 // They will be initialized at the start of the kernel, and copied back to the host memory at completion of the kernel 00871 memMapper.Map(memAccess.GetCachelineProfArgs(), GT_MMAP_SHARE); 00872 break; 00873 } 00874 case MODE_MEMORY_FOOTPRINT: 00875 { 00876 InsertMemoryFootprintProf(ins, 00877 memAccess, 00878 kernelProfile.GetMemoryFootprintArg(), 00879 kernelProfile.GetMemoryFootprintBitvector(), 00880 instrumentor); 00881 break; 00882 } 00883 case MODE_MEMORY_CACHE_MODEL: 00884 { 00885 InsertMemoryCacheModelProf(ins, 00886 memAccess, 00887 kernelProfile.GetMemoryCacheModelArg(), 00888 kernelProfile.GetCacheBucketTagsVector(), 00889 instrumentor); 00890 break; 00891 } 00892 case MODE_MEMORY_ACCESS_ANALYSIS: 00893 { 00894 InsertMemoryAccessAnalysisProf(ins, 00895 memAccess, 00896 kernelProfile.GetMemoryAccessAnalysisArg(), 00897 kernelProfile.GetMemoryFootprintBitvector(), 00898 instrumentor); 00899 break; 00900 } 00901 case MODE_MEMORY_RACE_DETECTION: 00902 case MODE_MEMORY_ACCESS_ANALYSIS_AND_RACE_DETECTION: 00903 { 00904 InsertRaceDetectionProf(ins, 00905 memAccess, 00906 kernelProfile.GetMemoryRaceDetectionArg(), 00907 kernelProfile.GetRaceDetectionDataVector(), 00908 kernelProfile.GetMemoryFootprintBitvector(), 00909 instrumentor); 00910 break; 00911 } 00912 default: GTPIN_ERROR(); 00913 } 00914 } 00915 else 00916 { 00917 GTPIN_ERROR(); 00918 } 00919 } 00920 00921 if (KNOB_MODE == MODE_MEMORY_FOOTPRINT) 00922 { 00923 // Share bitvector argument. 00924 // It will be initialized at the start of the kernel, and copied back to the host memory at completion of the kernel 00925 memMapper.Map(kernelProfile.GetMemoryFootprintArg(), GT_MMAP_SHARE); 00926 memMapper.Map(kernelProfile.GetMemoryFootprintBitvector(), GT_MMAP_SHARE); 00927 } 00928 00929 if (KNOB_MODE == MODE_MEMORY_CACHE_MODEL) 00930 { 00931 // Share cache model tags vector argument. 
00932 // It will be initialized at the start of the kernel, and copied back to the host memory at completion of the kernel 00933 memMapper.Map(kernelProfile.GetMemoryCacheModelArg(), GT_MMAP_SHARE); 00934 memMapper.Map(kernelProfile.GetCacheBucketTagsVector(), GT_MMAP_SHARE); 00935 } 00936 00937 if (KNOB_MODE == MODE_MEMORY_ACCESS_ANALYSIS) 00938 { 00939 // Share memory access analysis argument. 00940 // It will be initialized at the start of the kernel, and copied back to the host memory at completion of the kernel 00941 memMapper.Map(kernelProfile.GetMemoryAccessAnalysisArg(), GT_MMAP_SHARE); 00942 memMapper.Map(kernelProfile.GetMemoryFootprintBitvector(), GT_MMAP_SHARE); 00943 } 00944 00945 if (KNOB_MODE == MODE_MEMORY_RACE_DETECTION || KNOB_MODE == MODE_MEMORY_ACCESS_ANALYSIS_AND_RACE_DETECTION) 00946 { 00947 // Share memory access analysis argument. 00948 // It will be initialized at the start of the kernel, and copied back to the host memory at completion of the kernel 00949 memMapper.Map(kernelProfile.GetMemoryFootprintBitvector(), GT_MMAP_SHARE); 00950 memMapper.Map(kernelProfile.GetRaceDetectionDataVector(), GT_MMAP_SHARE); 00951 memMapper.Map(kernelProfile.GetMemoryRaceDetectionArg(), GT_MMAP_SHARE); 00952 } 00953 00954 // Link the kernel with the library of HLI functions 00955 instrumentor.LinkHliModule(_hliModule); 00956 } 00957 00958 void CachelineProf::OnKernelRun(IGtKernelDispatch& dispatcher) 00959 { 00960 const IGtKernel& kernel = dispatcher.Kernel(); 00961 00962 auto it = _kernels.find(kernel.Id()); 00963 if (it == _kernels.end()) 00964 { 00965 return; 00966 } 00967 00968 KernelProfile& kernelProfile = it->second; 00969 00970 if (dispatcher.ExecStage().IsDispatch()) 00971 { 00972 GtKernelExecDesc execDesc; dispatcher.GetExecDescriptor(execDesc); 00973 if (kernel.IsInstrumented() && IsKernelExecProfileEnabled(execDesc, kernel.GpuPlatform())) 00974 { 00975 dispatcher.SetProfilingMode(true); // Enable instrumentation 00976 } 00977 else 00978 { 00979 
dispatcher.SetProfilingMode(false); // Disable instrumentation 00980 return; 00981 } 00982 } 00983 00984 IGtMemoryMapper& memMapper = dispatcher.MemoryMapper(); 00985 00986 if (KNOB_MODE == MODE_CACHE_HISTOGRAM) 00987 { 00988 // Initialize per-access arguments of HLI functions 00989 for (auto& entry : kernelProfile.GetMemAccessMap()) 00990 { 00991 auto insId = entry.first; 00992 00993 if (int32_t(insId) < knobMinInstrumentIns || knobMaxInstrumentIns < int32_t(insId)) 00994 { 00995 continue; 00996 } 00997 00998 MemAccess& memAccess = entry.second; 00999 01000 CachelineProfArgs& clArgs = memAccess.GetCachelineProfArgs(); 01001 01002 memset(&clArgs.out, 0, sizeof(clArgs.out)); 01003 clArgs.in.dataSize = memAccess.DataSize(); 01004 clArgs.in.numAccesses = memAccess.NumAccesses(); 01005 clArgs.in.log2CachelineSize = gLog2CacheLineSize; 01006 01007 memMapper.Write(&clArgs, sizeof(clArgs)); 01008 } 01009 } 01010 01011 if (KNOB_MODE == MODE_MEMORY_FOOTPRINT) 01012 { 01013 MemoryFootprintArgs& mfprntArgs = kernelProfile.GetMemoryFootprintArg(); 01014 01015 mfprntArgs.in.log2CachelineSize = gLog2CacheLineSize; 01016 mfprntArgs.in.supportedAddressBits = gSupportedAddressBits; 01017 mfprntArgs.in.bitVectorSizeInBytes = (uint32_t)gBitVectorSizeInBytes; 01018 memMapper.Write(&mfprntArgs, sizeof(mfprntArgs)); 01019 01020 auto& bitvector = kernelProfile.GetMemoryFootprintBitvector(); 01021 memset(bitvector.data(), 0, bitvector.size()); 01022 memMapper.Write(bitvector.data(), (uint32_t)bitvector.size()); 01023 } 01024 01025 if (KNOB_MODE == MODE_MEMORY_CACHE_MODEL) 01026 { 01027 MemoryCacheModelArgs& mCacheModelArgs = kernelProfile.GetMemoryCacheModelArg(); 01028 01029 mCacheModelArgs.in.log2CachelineSize = gLog2CacheLineSize; 01030 mCacheModelArgs.in.cacheSize = gCacheSize; 01031 mCacheModelArgs.in.hashMaskShift = KNOB_CACHE_MODEL_HASH_SHIFT; 01032 mCacheModelArgs.out.hits = 0; 01033 mCacheModelArgs.out.misses = 0; 01034 memMapper.Write(&mCacheModelArgs, sizeof(mCacheModelArgs)); 
01035 01036 if (!KNOB_NO_CACHE_INVALIDATE) 01037 { 01038 // Initialize cache model per each enqueue 01039 auto& tagsVector = kernelProfile.GetCacheBucketTagsVector(); 01040 memset(tagsVector.data(), 0, tagsVector.size() * sizeof(uint64_t)); 01041 memMapper.Write(tagsVector.data(), (uint32_t)tagsVector.size() * sizeof(uint64_t)); 01042 } 01043 } 01044 01045 if (KNOB_MODE == MODE_MEMORY_ACCESS_ANALYSIS) 01046 { 01047 MemoryAccessAnalysisArgs& mAccessAnalysisArgs = kernelProfile.GetMemoryAccessAnalysisArg(); 01048 01049 mAccessAnalysisArgs.in.log2CachelineSize = gLog2CacheLineSize; 01050 mAccessAnalysisArgs.in.supportedAddressBits = gSupportedAddressBits; 01051 mAccessAnalysisArgs.in.bitVectorSizeInBytes = (uint32_t)gBitVectorSizeInBytes; 01052 01053 mAccessAnalysisArgs.out.rarCount = 0; 01054 mAccessAnalysisArgs.out.rawCount = 0; 01055 mAccessAnalysisArgs.out.warCount = 0; 01056 mAccessAnalysisArgs.out.wawCount = 0; 01057 memMapper.Write(&mAccessAnalysisArgs, sizeof(mAccessAnalysisArgs)); 01058 01059 if (!KNOB_NO_INVALIDATE) 01060 { 01061 auto& bitvector = kernelProfile.GetMemoryFootprintBitvector(); 01062 memset(bitvector.data(), KNOB_ANALYSIS_INIT_TO_WRITE ? 
0xFF : 0, bitvector.size()); 01063 memMapper.Write(bitvector.data(), (uint32_t)bitvector.size()); 01064 } 01065 } 01066 01067 if (KNOB_MODE == MODE_MEMORY_RACE_DETECTION || KNOB_MODE == MODE_MEMORY_ACCESS_ANALYSIS_AND_RACE_DETECTION) 01068 { 01069 MemoryRaceDetectionArgs& mRaceDetectionArgs = kernelProfile.GetMemoryRaceDetectionArg(); 01070 01071 mRaceDetectionArgs.in.log2CachelineSize = gLog2CacheLineSize; 01072 mRaceDetectionArgs.in.supportedAddressBits = gSupportedAddressBits; 01073 mRaceDetectionArgs.in.bitVectorSizeInBytes = (uint32_t)gBitVectorSizeInBytes; 01074 01075 mRaceDetectionArgs.out.raceCount = 0; 01076 mRaceDetectionArgs.out.upperAddressBits = 0; 01077 01078 if (KNOB_MODE == MODE_MEMORY_ACCESS_ANALYSIS_AND_RACE_DETECTION) 01079 { 01080 mRaceDetectionArgs.out.rarCount = 0; 01081 mRaceDetectionArgs.out.rawCount = 0; 01082 mRaceDetectionArgs.out.warCount = 0; 01083 mRaceDetectionArgs.out.wawCount = 0; 01084 } 01085 memMapper.Write(&mRaceDetectionArgs, sizeof(mRaceDetectionArgs)); 01086 01087 if (!KNOB_NO_INVALIDATE) 01088 { 01089 auto& bitvector = kernelProfile.GetMemoryFootprintBitvector(); 01090 memset(bitvector.data(), 0, bitvector.size()); 01091 memMapper.Write(bitvector.data(), (uint32_t)bitvector.size()); 01092 01093 auto& raceDataVector = kernelProfile.GetRaceDetectionDataVector(); 01094 memset(raceDataVector.data(), 0xFF, raceDataVector.size() * sizeof(uint32_t)); 01095 memMapper.Write(raceDataVector.data(), (uint32_t)raceDataVector.size()); 01096 } 01097 } 01098 } 01099 01100 void CachelineProf::OnKernelComplete(IGtKernelDispatch& dispatcher) 01101 { 01102 if (dispatcher.IsProfilingEnabled()) 01103 { 01104 KernelProfile& kernelProfile = _kernels.at(dispatcher.Kernel().Id()); 01105 kernelProfile.RecordCachelineProfResults(dispatcher); 01106 } 01107 } 01108 01109 bool CachelineProf::InsertCachelineProf(const IGtIns &ins, const MemAccess& memAccess, IGtKernelInstrument& instrumentor) 01110 { 01111 GTPIN_ASSERT(memAccess.IsValid() && 
(memAccess.Id() == ins.Id())); 01112 01113 uint32_t numAccesses = memAccess.NumAccesses(); 01114 if (numAccesses == 0) 01115 { 01116 return false; // Nothing to check 01117 } 01118 01119 uint32_t regSize = instrumentor.Kernel().GenModel().GrfRegSize(); 01120 const GtMemoryAddrModel& addrModel = memAccess.AddrModel(); 01121 uint32_t addrSize = addrModel.PtrSize(); 01122 GtReg firstAddrReg = GrfReg(memAccess.FirstAddrReg(), 0, regSize); 01123 uint32_t numAddrRegs = RoundUp(numAccesses * addrSize, regSize) / regSize; 01124 CachelineProfArgs* checkArgs = const_cast<CachelineProfArgs*>(&memAccess.GetCachelineProfArgs()); 01125 IargConstGrfRange addrPayload(firstAddrReg.RegNum(), numAddrRegs); 01126 IargInsOpMask accessMask(ins); 01127 01128 if (addrModel.IsA64()) 01129 { 01130 _checkScatterA64AccessFunc.InsertCallAtInstruction(instrumentor, ins, GtIpoint::Before(), 01131 NullReg(), // Unused return value 01132 addrPayload, // arg[1]: Base address of the accessed memory range 01133 accessMask, // arg[2]: Per-channel mask of memory accesses 01134 checkArgs // arg[3]: Cacheline check arguments 01135 ); 01136 } 01137 else 01138 { 01139 _checkScatterA32AccessFunc.InsertCallAtInstruction(instrumentor, ins, GtIpoint::Before(), 01140 NullReg(), // Unused return value 01141 addrPayload, // arg[1]: Base address of the accessed memory range 01142 accessMask, // arg[2]: Per-channel mask of memory accesses 01143 checkArgs // arg[3]: Cacheline check arguments 01144 ); 01145 } 01146 return true; 01147 } 01148 01149 bool CachelineProf::InsertMemoryFootprintProf(const IGtIns& ins, const MemAccess& memAccess, const MemoryFootprintArgs& memFootprintArgs, const BitVector& bitvector, IGtKernelInstrument& instrumentor) 01150 { 01151 GTPIN_ASSERT(memAccess.IsValid() && (memAccess.Id() == ins.Id()) && memAccess.AddrModel().IsA64()); 01152 01153 uint32_t numAccesses = memAccess.NumAccesses(); 01154 if (numAccesses == 0) 01155 { 01156 return false; // Nothing to check 01157 } 01158 01159 
uint32_t regSize = instrumentor.Kernel().GenModel().GrfRegSize(); 01160 GtReg firstAddrReg = GrfReg(memAccess.FirstAddrReg(), 0, regSize); 01161 uint32_t numAddrRegs = RoundUp(numAccesses * 8, regSize) / regSize; 01162 uint32_t dataSize = memAccess.DataSize(); 01163 MemoryAccessType accessType = ins.IsAtomic() ? MemoryAccessType::ReadModifyWrite : (ins.IsMemRead() ? MemoryAccessType::Load : MemoryAccessType::Store); 01164 MemoryFootprintArgs* args = const_cast<MemoryFootprintArgs*>(&memFootprintArgs); 01165 IargHostPtr footprint = bitvector.data(); 01166 IargConstGrfRange addrPayload(firstAddrReg.RegNum(), numAddrRegs); 01167 IargInsOpMask accessMask(ins); 01168 01169 if (memAccess.IsBlock2DAccess()) 01170 { 01171 _globalMemoryFootprintBlock2DAccessFunc.InsertCallAtInstruction(instrumentor, ins, GtIpoint::Before(), 01172 NullReg(), // Unused return value 01173 addrPayload, // arg[1]: Base address of the accessed memory range 01174 accessMask, // arg[2]: Per-channel mask of memory accesses 01175 dataSize, // arg[3]: Access data size 01176 accessType, // arg[4]: Access type 01177 args, // arg[5]: Memory footprint arguments 01178 footprint // arg[6]: bitvector 01179 ); 01180 } 01181 else 01182 { 01183 _globalMemoryFootprintA64AccessFunc.InsertCallAtInstruction(instrumentor, ins, GtIpoint::Before(), 01184 NullReg(), // Unused return value 01185 addrPayload, // arg[1]: Base address of the accessed memory range 01186 accessMask, // arg[2]: Per-channel mask of memory accesses 01187 dataSize, // arg[3]: Access data size 01188 accessType, // arg[4]: Access type 01189 args, // arg[5]: Memory footprint arguments 01190 footprint // arg[6]: bitvector 01191 ); 01192 } 01193 01194 return true; 01195 } 01196 01197 bool CachelineProf::InsertMemoryAccessAnalysisProf(const IGtIns& ins, const MemAccess& memAccess, const MemoryAccessAnalysisArgs& memAccessAnalysisArgs, const BitVector& bitvector, IGtKernelInstrument& instrumentor) 01198 { 01199 GTPIN_ASSERT(memAccess.IsValid() && 
(memAccess.Id() == ins.Id()) && memAccess.AddrModel().IsA64()); 01200 01201 uint32_t numAccesses = memAccess.NumAccesses(); 01202 if (numAccesses == 0) 01203 { 01204 return false; // Nothing to check 01205 } 01206 01207 uint32_t regSize = instrumentor.Kernel().GenModel().GrfRegSize(); 01208 GtReg firstAddrReg = GrfReg(memAccess.FirstAddrReg(), 0, regSize); 01209 uint32_t numAddrRegs = RoundUp(numAccesses * 8, regSize) / regSize; 01210 uint32_t dataSize = memAccess.DataSize(); 01211 MemoryAccessAnalysisArgs* args = const_cast<MemoryAccessAnalysisArgs*>(&memAccessAnalysisArgs); 01212 IargHostPtr footprint = bitvector.data(); 01213 MemoryAccessType accessType = ins.IsAtomic() ? MemoryAccessType::ReadModifyWrite : (ins.IsMemRead() ? MemoryAccessType::Load : MemoryAccessType::Store); 01214 IargConstGrfRange addrPayload(firstAddrReg.RegNum(), numAddrRegs); 01215 IargInsOpMask accessMask(ins); 01216 01217 if (memAccess.IsBlock2DAccess()) 01218 { 01219 _globalMemoryAccessAnalysisBlock2DAccessFunc.InsertCallAtInstruction(instrumentor, ins, GtIpoint::Before(), 01220 NullReg(), // Unused return value 01221 addrPayload, // arg[1]: Base address of the accessed memory range 01222 accessMask, // arg[2]: Per-channel mask of memory accesses 01223 dataSize, // arg[3]: Access data size 01224 accessType, // arg[4]: Access type 01225 args, // arg[5]: Memory access analysis arguments 01226 footprint // arg[6]: bitvector 01227 ); 01228 } 01229 else 01230 { 01231 _globalMemoryAccessAnalysisA64AccessFunc.InsertCallAtInstruction(instrumentor, ins, GtIpoint::Before(), 01232 NullReg(), // Unused return value 01233 addrPayload, // arg[1]: Base address of the accessed memory range 01234 accessMask, // arg[2]: Per-channel mask of memory accesses 01235 dataSize, // arg[3]: Access data size 01236 accessType, // arg[4]: Access type 01237 args, // arg[5]: Memory access analysis arguments 01238 footprint // arg[6]: bitvector 01239 ); 01240 } 01241 01242 return true; 01243 } 01244 01245 bool 
CachelineProf::InsertRaceDetectionProf(const IGtIns& ins, 01246 const MemAccess& memAccess, 01247 const MemoryRaceDetectionArgs& memRaceDetectionArg, 01248 const RaceDetectionDataVector& raceDetectionDataVector, 01249 const BitVector& bitvector, 01250 IGtKernelInstrument& instrumentor) 01251 { 01252 GTPIN_ASSERT(memAccess.IsValid() && (memAccess.Id() == ins.Id()) && memAccess.AddrModel().IsA64()); 01253 01254 uint32_t numAccesses = memAccess.NumAccesses(); 01255 if (numAccesses == 0) 01256 { 01257 return false; // Nothing to check 01258 } 01259 01260 uint32_t regSize = instrumentor.Kernel().GenModel().GrfRegSize(); 01261 GtReg firstAddrReg = GrfReg(memAccess.FirstAddrReg(), 0, regSize); 01262 uint32_t numAddrRegs = RoundUp(memAccess.NumAccesses() * 8, regSize) / regSize; 01263 uint32_t dataSize = memAccess.DataSize(); 01264 MemoryAccessType accessType = ins.IsAtomic() ? MemoryAccessType::ReadModifyWrite : (ins.IsMemRead() ? MemoryAccessType::Load : MemoryAccessType::Store); 01265 MemoryRaceDetectionArgs* args = const_cast<MemoryRaceDetectionArgs*>(&memRaceDetectionArg); 01266 IargHostPtr detectedCacheLines = bitvector.data(); 01267 IargHostPtr raceData = raceDetectionDataVector.data(); 01268 IargConstGrfRange addrPayload(firstAddrReg.RegNum(), numAddrRegs); 01269 IargInsOpMask accessMask(ins); 01270 IargTid threadId; 01271 01272 if (memAccess.IsBlock2DAccess()) 01273 { 01274 } 01275 else 01276 { 01277 if (KNOB_MODE == MODE_MEMORY_RACE_DETECTION) 01278 { 01279 _globalMemoryRaceDetectionA64AccessFunc.InsertCallAtInstruction(instrumentor, ins, GtIpoint::Before(), 01280 NullReg(), // Unused return value 01281 addrPayload, // arg[1]: Base address of the accessed memory range 01282 accessMask, // arg[2]: Per-channel mask of memory accesses 01283 dataSize, // arg[3]: Access data size 01284 accessType, // arg[4]: Access type 01285 threadId, // arg[5]: HW Thread ID 01286 args, // arg[6]: Memory race condition detection arguments 01287 raceData, // arg[7]: race data vector 
01288 detectedCacheLines // arg[8]: bitvector - detected cache lines 01289 ); 01290 } 01291 if (KNOB_MODE == MODE_MEMORY_ACCESS_ANALYSIS_AND_RACE_DETECTION) 01292 { 01293 _globalMemoryAccessAnalysisAndRaceDetectionA64AccessFunc.InsertCallAtInstruction(instrumentor, ins, GtIpoint::Before(), 01294 NullReg(), // Unused return value 01295 addrPayload, // arg[1]: Base address of the accessed memory range 01296 accessMask, // arg[2]: Per-channel mask of memory accesses 01297 dataSize, // arg[3]: Access data size 01298 accessType, // arg[4]: Access type 01299 threadId, // arg[5]: HW Thread ID 01300 args, // arg[6]: Memory race condition detection arguments 01301 raceData, // arg[7]: race data vector 01302 detectedCacheLines // arg[8]: bitvector - detected cache lines 01303 ); 01304 } 01305 } 01306 01307 return true; 01308 } 01309 01310 bool CachelineProf::InsertMemoryCacheModelProf(const IGtIns& ins, const MemAccess& memAccess, const MemoryCacheModelArgs& memCacheModelArgs, const CacheBucketTagsVector& cacheBucketTagsVector, IGtKernelInstrument& instrumentor) 01311 { 01312 GTPIN_ASSERT(memAccess.IsValid() && (memAccess.Id() == ins.Id()) && memAccess.AddrModel().IsA64()); 01313 01314 uint32_t numAccesses = memAccess.NumAccesses(); 01315 if (numAccesses == 0) 01316 { 01317 return false; // Nothing to check 01318 } 01319 01320 uint32_t regSize = instrumentor.Kernel().GenModel().GrfRegSize(); 01321 GtReg firstAddrReg = GrfReg(memAccess.FirstAddrReg(), 0, regSize); 01322 uint32_t numAddrRegs = RoundUp(numAccesses * 8, regSize) / regSize; 01323 uint32_t dataSize = memAccess.DataSize(); 01324 MemoryCacheModelArgs* args = const_cast<MemoryCacheModelArgs*>(&memCacheModelArgs); 01325 IargHostPtr tags = cacheBucketTagsVector.data(); 01326 IargConstGrfRange addrPayload(firstAddrReg.RegNum(), numAddrRegs); 01327 IargInsOpMask accessMask(ins); 01328 01329 if (memAccess.IsBlock2DAccess()) 01330 { 01331 _globalMemoryCacheModelBlock2DAccessFunc.InsertCallAtInstruction(instrumentor, ins, 
GtIpoint::Before(), 01332 NullReg(), // Unused return value 01333 addrPayload, // arg[1]: Base address of the accessed memory range 01334 accessMask, // arg[2]: Per-channel mask of memory accesses 01335 dataSize, // arg[3]: Access data size 01336 args, // arg[4]: Memory footprint arguments 01337 tags // arg[5]: tags vector 01338 ); 01339 } 01340 else 01341 { 01342 _globalMemoryCacheModelA64AccessFunc.InsertCallAtInstruction(instrumentor, ins, GtIpoint::Before(), 01343 NullReg(), // Unused return value 01344 addrPayload, // arg[1]: Base address of the accessed memory range 01345 accessMask, // arg[2]: Per-channel mask of memory accesses 01346 dataSize, // arg[3]: Access data size 01347 args, // arg[4]: Memory footprint arguments 01348 tags // arg[5]: tags vector 01349 ); 01350 } 01351 01352 return true; 01353 } 01354 01355 void CachelineProf::LoadHliLibrary() 01356 { 01357 std::string modulePath = JoinPath(GetKnobValue<std::string>("installDir"), "Examples", "cachelineprof.cl"); 01358 _hliModule = GTPin_GetCore()->HliLibrary().CompileModuleFromFile(modulePath.c_str()); 01359 GTPIN_ASSERT_MSG(_hliModule != nullptr, "Could not load HLI module " + modulePath); 01360 } 01361 01362 void CachelineProf::Fini() 01363 { 01364 std::string str; 01365 01366 // Dump profiling results and assembly code of all kernels 01367 for (const auto& entry : _kernels) 01368 { 01369 const auto& kernelProfile = entry.second; 01370 str += kernelProfile.CachelineProfResults(); 01371 kernelProfile.DumpAsm(); 01372 } 01373 01374 std::ofstream fs(JoinPath(GTPin_GetCore()->ProfileDir(), "cachelineprof.txt")); 01375 GTPIN_ASSERT(fs.is_open()); 01376 fs << str; 01377 01378 if (!KNOB_NO_COUT) 01379 { 01380 std::cout << str; 01381 } 01382 } 01383 01384 /* ============================================================================================= */ 01385 // GTPin_Entry 01386 /* ============================================================================================= */ 01387 EXPORT_C_FUNC void 
GTPin_Entry(int argc, const char *argv[]) 01388 { 01389 SetKnobValue<int>(0, "scalarize_hlif"); 01390 ConfigureGTPin(argc, argv); 01391 01392 // Register the tool (callbacks) with the GTPin core 01393 CachelineProf::Instance()->Register(); 01394 01395 // Compile and load library of HLI functions 01396 CachelineProf::Instance()->LoadHliLibrary(); 01397 01398 // Register the termination function 01399 atexit(CachelineProf::OnFini); 01400 01401 if (KNOB_MODE != MODE_CACHE_HISTOGRAM) 01402 { 01403 // All the modes except cache histogram support A64 address mode only. 01404 // To make sure the compiler generates A64 accesses, we force IGC to do that. 01405 SetKnobValue<bool>(true, "igc_force_a64"); 01406 } 01407 01408 gLog2CacheLineSize = Bsr32(KNOB_CACHELINE_SIZE); 01409 01410 gSupportedAddressBits = KNOB_SUPPORTED_ADDRESS_BITS; 01411 01412 if (KNOB_MODE == MODE_MEMORY_FOOTPRINT) 01413 { 01414 // Memory footprint mode distinguishes between read and write accesses. For that we allocate a bitvector of the double size. 01415 // The first half of the bitvector is used for read accesses and the second half for write accesses. Since the bitvector is big, 01416 // to save buffer size we reduce one bit from the supported address bits, and thus doubling the buffer size we get the same number of bytes. 
01417 gSupportedAddressBits -= 1; 01418 01419 uint64_t bitVectorSizeInBytes = 2 * (((uint64_t)1 << gSupportedAddressBits) >> gLog2CacheLineSize) / 8; 01420 GTPIN_ASSERT_MSG(bitVectorSizeInBytes < FOUR_GB, "Too big buffer - please reduce supported address bits"); // we don't want to support buffers bigger than 4GB 01421 01422 gBitVectorSizeInBytes = (size_t)bitVectorSizeInBytes; 01423 } 01424 01425 if (KNOB_MODE == MODE_MEMORY_CACHE_MODEL) 01426 { 01427 gCacheSize = KNOB_CACHE_SIZE * ONE_KB; 01428 gNumOfCacheBuckets = gCacheSize >> gLog2CacheLineSize; 01429 } 01430 01431 if (KNOB_MODE == MODE_MEMORY_ACCESS_ANALYSIS) 01432 { 01433 uint64_t bitVectorSizeInBytes = (((uint64_t)1 << gSupportedAddressBits) >> gLog2CacheLineSize) / 8; 01434 GTPIN_ASSERT_MSG(bitVectorSizeInBytes < FOUR_GB, "Too big buffer - please reduce supported address bits"); // we don't want to support buffers bigger than 4GB 01435 01436 gBitVectorSizeInBytes = (size_t)bitVectorSizeInBytes; 01437 } 01438 01439 if (KNOB_MODE == MODE_MEMORY_RACE_DETECTION || KNOB_MODE == MODE_MEMORY_ACCESS_ANALYSIS_AND_RACE_DETECTION) 01440 { 01441 // This mode assumes dword accesses and sets maximum of supported address bits to 31 to not exceed 4GB of profiling buffer. 01442 SetKnobValue<int>(4, "cacheline_size"); 01443 SetKnobValue<int>(31, "supported_address_bits"); 01444 01445 gSupportedAddressBits = KNOB_SUPPORTED_ADDRESS_BITS; 01446 gLog2CacheLineSize = Bsr32(KNOB_CACHELINE_SIZE); 01447 01448 uint64_t raceDataVectorSize = (((uint64_t)1 << gSupportedAddressBits) >> gLog2CacheLineSize); 01449 uint64_t bitVectorSizeInBytes = raceDataVectorSize / 8; 01450 GTPIN_ASSERT_MSG(bitVectorSizeInBytes + raceDataVectorSize * sizeof(uint32_t) < FOUR_GB, "Too big buffer - please reduce supported address bits"); 01451 gBitVectorSizeInBytes = (size_t)bitVectorSizeInBytes; 01452 gRaceDataVectorSize = (size_t)raceDataVectorSize; 01453 } 01454 }
00001 /*========================== begin_copyright_notice ============================ 00002 Copyright (C) 2025-2026 Intel Corporation 00003 00004 SPDX-License-Identifier: MIT 00005 ============================= end_copyright_notice ===========================*/ 00006 00007 /*! 00008 * @file Library of High-Level Instrumentation (HLI) functions used by the cachelineprof tool 00009 */ 00010 00011 #include "hlif_basic_defs.h" 00012 #include "cachelineprof.h" 00013 00014 00015 #define OPTIMIZED 00016 00017 static inline uint32_t InsertCacheline32(uint32_t* ids, uint32_t cachelineId, uint32_t nextFreeIndex) 00018 { 00019 if (nextFreeIndex >= 63) 00020 { 00021 return nextFreeIndex; 00022 } 00023 00024 for (uint32_t i = 0; i < nextFreeIndex; i++) 00025 { 00026 if (ids[i] == cachelineId) 00027 { 00028 return nextFreeIndex; 00029 } 00030 } 00031 ids[nextFreeIndex] = cachelineId; 00032 00033 return nextFreeIndex + 1; 00034 } 00035 00036 static inline uint32_t InsertCacheline64(uint64_t* ids, uint64_t cachelineId, uint32_t nextFreeIndex) 00037 { 00038 if (nextFreeIndex > 63) 00039 { 00040 return nextFreeIndex; 00041 } 00042 00043 for (uint32_t i = 0; i < nextFreeIndex; i++) 00044 { 00045 if (ids[i] == cachelineId) 00046 { 00047 return nextFreeIndex; 00048 } 00049 } 00050 ids[nextFreeIndex] = cachelineId; 00051 00052 return nextFreeIndex + 1; 00053 } 00054 00055 00056 /*! 
00057 * @brief HLI function that detects amount of cachelines accessed by a scatter SEND instruction to global or local memory done in A32 or BTS modes 00058 * @param[in] addresses Array of 32-bit addresses/offsets 00059 * @param[in] accessMask Per-channel mask of memory accesses 00060 * @param[in][out] CachelineProfArgs Information about memory access instruction and the resulting cachelines histogram 00061 */ 00062 IGC_STACK_CALL void CheckScatterA32Access(__global const uint32_t* addresses, 00063 uint32_t accessMask, 00064 __global CachelineProfArgs* cachelineProfArgs) 00065 { 00066 uint32_t cacheLineIds[MAX_SUPPORTED_CACHE_LINE_HISTOGRAM_SIZE]; 00067 uint32_t detectedCachelines = 0; 00068 00069 if (accessMask != 0) 00070 { 00071 uint32_t numOfElements = cachelineProfArgs->in.numAccesses; 00072 uint32_t dataSize = cachelineProfArgs->in.dataSize; 00073 uint32_t shiftBits = cachelineProfArgs->in.log2CachelineSize; 00074 uint32_t CACHELINE_SIZE_IN_BYTES = 1 << shiftBits; 00075 00076 for (uint32_t eIndx = 0; eIndx != numOfElements; ++eIndx) 00077 { 00078 if ((accessMask & (0x1 << eIndx)) != 0) 00079 { 00080 uint32_t firstByteAddr = addresses[eIndx]; 00081 uint32_t lastByteAddr = firstByteAddr + dataSize - 1; 00082 00083 for (; firstByteAddr < lastByteAddr; firstByteAddr += CACHELINE_SIZE_IN_BYTES) 00084 { 00085 detectedCachelines = InsertCacheline32(cacheLineIds, firstByteAddr >> shiftBits, detectedCachelines); 00086 } 00087 detectedCachelines = InsertCacheline32(cacheLineIds, lastByteAddr >> shiftBits, detectedCachelines); 00088 } 00089 if (detectedCachelines == MAX_SUPPORTED_CACHE_LINE_HISTOGRAM_SIZE) 00090 { 00091 atomic_inc(&(cachelineProfArgs->out.outOfHistogramValues)); 00092 return; 00093 } 00094 } 00095 atomic_inc(&(cachelineProfArgs->out.cachelineHistogram[detectedCachelines])); 00096 } 00097 } 00098 00099 /*! 
00100 * @brief HLI function that detects amount of cachelines accessed by a scatter SEND instruction to global memory done in A64 mode 00101 * @param[in] addresses Array of 64-bit addresses 00102 * @param[in] accessMask Per-channel mask of memory accesses 00103 * @param[in][out] CachelineProfArgs Information about memory access instruction and the resulting cachelines histogram 00104 */ 00105 IGC_STACK_CALL void CheckScatterA64Access(__global const uint64_t* addresses, 00106 uint32_t accessMask, 00107 __global CachelineProfArgs* cachelineProfArgs) 00108 { 00109 uint64_t cacheLineIds[MAX_SUPPORTED_CACHE_LINE_HISTOGRAM_SIZE]; 00110 uint32_t detectedCachelines = 0; 00111 00112 if (accessMask != 0) 00113 { 00114 uint32_t numOfElements = cachelineProfArgs->in.numAccesses; 00115 uint32_t dataSize = cachelineProfArgs->in.dataSize; 00116 uint32_t shiftBits = cachelineProfArgs->in.log2CachelineSize; 00117 uint32_t CACHELINE_SIZE_IN_BYTES = 1 << shiftBits; 00118 00119 for (uint32_t eIndx = 0; eIndx != numOfElements; ++eIndx) 00120 { 00121 if ((accessMask & (0x1 << eIndx)) != 0) 00122 { 00123 uint64_t firstByteAddr = addresses[eIndx]; 00124 uint64_t lastByteAddr = firstByteAddr + dataSize - 1; 00125 00126 for (; firstByteAddr < lastByteAddr; firstByteAddr += CACHELINE_SIZE_IN_BYTES) 00127 { 00128 detectedCachelines = InsertCacheline32(cacheLineIds, firstByteAddr >> shiftBits, detectedCachelines); 00129 } 00130 detectedCachelines = InsertCacheline32(cacheLineIds, lastByteAddr >> shiftBits, detectedCachelines); 00131 } 00132 if (detectedCachelines == MAX_SUPPORTED_CACHE_LINE_HISTOGRAM_SIZE) 00133 { 00134 atomic_inc(&(cachelineProfArgs->out.outOfHistogramValues)); 00135 return; 00136 } 00137 } 00138 atomic_inc(&(cachelineProfArgs->out.cachelineHistogram[detectedCachelines])); 00139 } 00140 } 00141 00142 00143 #ifdef OPTIMIZED 00144 static inline void SetBits(__global uint32_t* dwords, uint64_t fromAddr, uint64_t toAddr) 00145 { 00146 uint64_t fromDword = fromAddr >> 5; 00147 
uint64_t toDword = toAddr >> 5; 00148 00149 if (fromDword == toDword) 00150 { 00151 uint32_t from = fromAddr & 0x1F; 00152 uint32_t to = toAddr & 0x1F; 00153 uint32_t bits = (((uint32_t)1 << (to - from + 1)) - 1) << to; 00154 atomic_or(&dwords[fromDword], bits); 00155 return; 00156 } 00157 00158 // Set the first dword 00159 { 00160 uint32_t bit = fromAddr & 0x1F; 00161 atomic_or(&dwords[fromDword], (uint32_t)0xFFFFFFFF << bit); 00162 } 00163 00164 // Set the middle dwords 00165 for (uint64_t dword = fromDword + 1; dword < toDword; dword++) 00166 { 00167 atomic_or(&dwords[dword], (uint32_t)0xFFFFFFFF); 00168 } 00169 00170 // Set the last dword 00171 { 00172 uint32_t bit = toAddr & 0x1F; 00173 atomic_or(&dwords[toDword], (uint32_t)0xFFFFFFFF >> (31 - bit)); 00174 } 00175 } 00176 #else 00177 static inline void SetBits(__global uint32_t* dwords, uint64_t fromAddr, uint64_t toAddr) 00178 { 00179 for (uint64_t line = fromAddr; line <= toAddr; line++) 00180 { 00181 uint64_t dword = line >> 5; 00182 uint32_t bit = line & 0x1F; 00183 atomic_or(&dwords[dword], (uint32_t)1 << bit); 00184 } 00185 } 00186 #endif 00187 00188 /*! 
00189 * @brief HLI function sets bits within a bitvector that correspond to accessed cachelines 00190 * @param[in] addresses Array of 64-bit addresses 00191 * @param[in] accessMask Per-channel mask of memory accesses 00192 * @param[in,out] memoryFootprintArgs Information about memory access 00193 * @param[out] bitvector Resulting per-cacheline bitvector 00194 */ 00195 IGC_STACK_CALL void GlobalMemoryFootprintA64Access(__global MemoryFootprintArgsCombo* memoryFootprintArgsCombo) 00196 { 00197 __global uint64_t* addresses = memoryFootprintArgsCombo->addresses; 00198 uint32_t accessMask = memoryFootprintArgsCombo->accessMask; 00199 uint32_t dataSize = memoryFootprintArgsCombo->dataSize; 00200 __global MemoryFootprintArgs* memoryFootprintArgs = memoryFootprintArgsCombo->memoryFootprintArgs; 00201 __global uint32_t* bitvector = (__global uint32_t*)memoryFootprintArgsCombo->bitvector; 00202 __global uint32_t* bitvector4Reads = bitvector; 00203 __global uint32_t* bitvector4Writes = (__global uint32_t*)((__global uint8_t*)bitvector + (memoryFootprintArgs->in.bitVectorSizeInBytes >> 1)); 00204 00205 if (accessMask != 0) 00206 { 00207 uint32_t shiftBits = memoryFootprintArgs->in.log2CachelineSize; 00208 uint64_t supportedAddressBitsMask = (((uint64_t)1 << memoryFootprintArgs->in.supportedAddressBits) - 1); 00209 uint64_t unsupportedAddressBitsMask = ~supportedAddressBitsMask; 00210 uint64_t upperAddressBits = 0; 00211 00212 for (uint32_t eIndx = 0; eIndx != 32; ++eIndx) 00213 { 00214 if ((accessMask & (0x1 << eIndx)) != 0) 00215 { 00216 uint64_t firstByteAddr = addresses[eIndx]; 00217 uint64_t lastByteAddr = firstByteAddr + dataSize - 1; 00218 00219 upperAddressBits = (firstByteAddr & unsupportedAddressBitsMask); 00220 00221 firstByteAddr &= supportedAddressBitsMask; 00222 lastByteAddr &= supportedAddressBitsMask; 00223 00224 uint64_t firstByteCacheLineId = firstByteAddr >> shiftBits; 00225 uint64_t lastByteCacheLineId = lastByteAddr >> shiftBits; 00226 00227 if 
(memoryFootprintArgsCombo->accessType == Store) 00228 { 00229 SetBits(bitvector4Writes, firstByteCacheLineId, lastByteCacheLineId); 00230 } 00231 else 00232 { 00233 SetBits(bitvector4Reads, firstByteCacheLineId, lastByteCacheLineId); 00234 } 00235 00236 if (memoryFootprintArgsCombo->accessType == ReadModifyWrite) 00237 { 00238 SetBits(bitvector4Writes, firstByteCacheLineId, lastByteCacheLineId); 00239 } 00240 } 00241 } 00242 00243 memoryFootprintArgs->out.upperAddressBits = upperAddressBits; 00244 } 00245 } 00246 00247 /*! 00248 * @brief HLI function sets bits within a bitvector that correspond to accessed cachelines 00249 * @param[in] addresses Array of 64-bit addresses 00250 * @param[in] accessMask Per-channel mask of memory accesses 00251 * @param[in,out] memoryFootprintArgs Information about memory access 00252 * @param[out] bitvector Resulting per-cacheline bitvector 00253 */ 00254 IGC_STACK_CALL void GlobalMemoryFootprintBlock2DAccess(__global MemoryFootprintArgsCombo* memoryFootprintArgsCombo) 00255 { 00256 if ((memoryFootprintArgsCombo->accessMask & 0x1) == 0) 00257 { 00258 return; 00259 } 00260 00261 // exec_size is ignored for this message 00262 00263 __global Block2D* block2d = memoryFootprintArgsCombo->addresses; 00264 uint32_t dataSize = memoryFootprintArgsCombo->dataSize; 00265 __global MemoryFootprintArgs* memoryFootprintArgs = memoryFootprintArgsCombo->memoryFootprintArgs; 00266 __global uint32_t* bitvector = (__global uint32_t*)memoryFootprintArgsCombo->bitvector; 00267 __global uint32_t* bitvector4Reads = bitvector; 00268 __global uint32_t* bitvector4Writes = (__global uint32_t*)((__global uint8_t*)bitvector + (memoryFootprintArgs->in.bitVectorSizeInBytes >> 1)); 00269 00270 uint32_t shiftBits = memoryFootprintArgs->in.log2CachelineSize; 00271 uint64_t supportedAddressBitsMask = (((uint64_t)1 << memoryFootprintArgs->in.supportedAddressBits) - 1); 00272 uint64_t unsupportedAddressBitsMask = ~supportedAddressBitsMask; 00273 uint64_t upperAddressBits 
= 0; 00274 00275 uint64_t surfaceBaseAddress = block2d->surface_base_address; 00276 uint32_t surfaceHeight = (block2d->surface_height & 0x00FFFFFF) + 1; // value -1-encoded 00277 uint32_t surfaceWidth = (block2d->surface_width & 0x00FFFFFF) + 1; // value -1-encoded 00278 uint32_t surfacePitch = (block2d->surface_pitch & 0x00FFFFFF) + 1; // value -1-encoded 00279 00280 uint32_t numBlocks = (block2d->array_length + 1); // value -1-encoded 00281 uint32_t blockHeight = (block2d->block_height + 1); // value -1-encoded 00282 uint32_t blockWidth = (block2d->block_width + 1); // value -1-encoded 00283 int32_t blockStartX = block2d->block_start_x; 00284 int32_t blockStartY = block2d->block_start_y; 00285 00286 uint32_t lastY = blockStartY + blockHeight - 1; 00287 if (lastY >= surfaceHeight) 00288 { 00289 lastY = surfaceHeight - 1; 00290 } 00291 00292 uint32_t lastX = blockStartX + numBlocks * blockWidth - 1; 00293 if (lastX * dataSize >= surfaceWidth) 00294 { 00295 lastX = (surfaceWidth / dataSize) - 1; 00296 } 00297 00298 for (uint32_t y = blockStartY; y <= lastY; y++) 00299 { 00300 uint64_t firstByteAddr = surfaceBaseAddress + y * surfacePitch + blockStartX * dataSize; 00301 uint64_t lastByteAddr = firstByteAddr + dataSize * lastX - 1; 00302 00303 upperAddressBits = (firstByteAddr & unsupportedAddressBitsMask); 00304 00305 firstByteAddr &= supportedAddressBitsMask; 00306 lastByteAddr &= supportedAddressBitsMask; 00307 00308 uint64_t firstByteCacheLineId = firstByteAddr >> shiftBits; 00309 uint64_t lastByteCacheLineId = lastByteAddr >> shiftBits; 00310 00311 if (memoryFootprintArgsCombo->accessType == Store) 00312 { 00313 SetBits(bitvector4Writes, firstByteCacheLineId, lastByteCacheLineId); 00314 } 00315 else 00316 { 00317 SetBits(bitvector4Reads, firstByteCacheLineId, lastByteCacheLineId); 00318 } 00319 00320 if (memoryFootprintArgsCombo->accessType == ReadModifyWrite) 00321 { 00322 SetBits(bitvector4Writes, firstByteCacheLineId, lastByteCacheLineId); 00323 } 00324 } 
00325 00326 memoryFootprintArgs->out.upperAddressBits = upperAddressBits; 00327 } 00328 00329 00330 static inline uint32_t CacheIndexHash(uint64_t lineId, uint64_t hashMask, uint32_t shift) 00331 { 00332 return (lineId & (hashMask << shift)) >> shift; 00333 } 00334 00335 static inline void SingleLineAccess(uint64_t lineId, uint64_t hashMask, uint32_t shift, __global uint64_t* tags, __global uint64_t* misses, __global uint64_t* hits) 00336 { 00337 uint32_t bucket = CacheIndexHash(lineId, hashMask, shift); 00338 uint64_t prevLineId = atom_xchg(&tags[bucket], lineId); 00339 if (prevLineId == lineId) 00340 { 00341 atom_inc(hits); 00342 } 00343 else 00344 { 00345 atom_inc(misses); 00346 } 00347 } 00348 00349 /*! 00350 * @brief HLI function that models cache and reports the number of hits and misses 00351 * @param[in] addresses Array of 64-bit addresses 00352 * @param[in] accessMask Per-channel mask of memory accesses 00353 * @param[in] dataSize Data size per single access 00354 * @param[in,out] MemoryCacheModelArgs Information about cache model 00355 * @param[in] cacheTagsVector Vector of cache buckets tags 00356 */ 00357 IGC_STACK_CALL void GlobalMemoryCacheModelA64Access(__global MemoryCacheModelArgsCombo* memoryCacheModelArgsCombo) 00358 { 00359 __global uint64_t* addresses = memoryCacheModelArgsCombo->addresses; 00360 uint32_t accessMask = memoryCacheModelArgsCombo->accessMask; 00361 uint32_t dataSize = memoryCacheModelArgsCombo->dataSize; 00362 __global MemoryCacheModelArgs* memoryCacheModelArgs = memoryCacheModelArgsCombo->memoryCacheModelArgs; 00363 __global uint64_t* tags = (__global uint64_t*)memoryCacheModelArgsCombo->cacheTagsVector; 00364 uint32_t shiftBits = memoryCacheModelArgs->in.log2CachelineSize; 00365 uint64_t numOfBuckets = memoryCacheModelArgs->in.cacheSize >> shiftBits; 00366 uint64_t hashMask = (numOfBuckets - 1); 00367 uint32_t shiftHashMaskBits = memoryCacheModelArgs->in.hashMaskShift; 00368 __global uint64_t* misses = (__global 
uint64_t*)&memoryCacheModelArgs->out.misses; 00369 __global uint64_t* hits = (__global uint64_t*)&memoryCacheModelArgs->out.hits; 00370 00371 if (accessMask != 0) 00372 { 00373 for (uint32_t eIndx = 0; eIndx != 32; ++eIndx) 00374 { 00375 if ((accessMask & (0x1 << eIndx)) != 0) 00376 { 00377 uint64_t firstByteAddr = addresses[eIndx]; 00378 uint64_t lastByteAddr = firstByteAddr + dataSize - 1; 00379 00380 uint64_t firstByteCacheLineId = firstByteAddr >> shiftBits; 00381 uint64_t lastByteCacheLineId = lastByteAddr >> shiftBits; 00382 00383 for (uint64_t lineId = firstByteCacheLineId; lineId <= lastByteCacheLineId; lineId++) 00384 { 00385 SingleLineAccess(lineId, hashMask, shiftHashMaskBits, tags, misses, hits); 00386 } 00387 } 00388 } 00389 } 00390 } 00391 00392 /*! 00393 * @brief HLI function that models cache and reports the number of hits and misses 00394 * @param[in] addresses Array of 64-bit addresses 00395 * @param[in] accessMask Per-channel mask of memory accesses 00396 * @param[in] dataSize Data size per single access 00397 * @param[in,out] MemoryCacheModelArgs Information about cache model 00398 * @param[in] cacheTagsVector Vector of cache buckets tags 00399 */ 00400 IGC_STACK_CALL void GlobalMemoryCacheModelBlock2DAccess(__global MemoryCacheModelArgsCombo* memoryCacheModelArgsCombo) 00401 { 00402 if ((memoryCacheModelArgsCombo->accessMask & 0x1) == 0) 00403 { 00404 return; 00405 } 00406 00407 // exec_size is ignored for this message 00408 00409 __global Block2D* block2d = memoryCacheModelArgsCombo->addresses; 00410 uint32_t dataSize = memoryCacheModelArgsCombo->dataSize; 00411 __global MemoryCacheModelArgs* memoryCacheModelArgs = memoryCacheModelArgsCombo->memoryCacheModelArgs; 00412 __global uint64_t* tags = (__global uint64_t*)memoryCacheModelArgsCombo->cacheTagsVector; 00413 uint32_t shiftBits = memoryCacheModelArgs->in.log2CachelineSize; 00414 uint64_t numOfBuckets = memoryCacheModelArgs->in.cacheSize >> shiftBits; 00415 uint64_t hashMask = (numOfBuckets 
- 1); 00416 uint32_t shiftHashMaskBits = memoryCacheModelArgs->in.hashMaskShift; 00417 __global uint64_t* misses = (__global uint64_t*)&memoryCacheModelArgs->out.misses; 00418 __global uint64_t* hits = (__global uint64_t*)&memoryCacheModelArgs->out.hits; 00419 00420 uint64_t surfaceBaseAddress = block2d->surface_base_address; 00421 uint32_t surfaceHeight = (block2d->surface_height & 0x00FFFFFF) + 1; // value -1-encoded 00422 uint32_t surfaceWidth = (block2d->surface_width & 0x00FFFFFF) + 1; // value -1-encoded 00423 uint32_t surfacePitch = (block2d->surface_pitch & 0x00FFFFFF) + 1; // value -1-encoded 00424 00425 uint32_t numBlocks = (block2d->array_length + 1); // value -1-encoded 00426 uint32_t blockHeight = (block2d->block_height + 1); // value -1-encoded 00427 uint32_t blockWidth = (block2d->block_width + 1); // value -1-encoded 00428 int32_t blockStartX = block2d->block_start_x; 00429 int32_t blockStartY = block2d->block_start_y; 00430 00431 uint32_t lastY = blockStartY + blockHeight - 1; 00432 if (lastY >= surfaceHeight) 00433 { 00434 lastY = surfaceHeight - 1; 00435 } 00436 00437 uint32_t lastX = blockStartX + numBlocks * blockWidth - 1; 00438 if (lastX * dataSize >= surfaceWidth) 00439 { 00440 lastX = (surfaceWidth / dataSize) - 1; 00441 } 00442 00443 for (uint32_t y = blockStartY; y <= lastY; y++) 00444 { 00445 uint64_t firstByteAddr = surfaceBaseAddress + y * surfacePitch + blockStartX * dataSize; 00446 uint64_t lastByteAddr = firstByteAddr + dataSize * lastX - 1; 00447 00448 uint64_t firstByteCacheLineId = firstByteAddr >> shiftBits; 00449 uint64_t lastByteCacheLineId = lastByteAddr >> shiftBits; 00450 00451 for (uint64_t lineId = firstByteCacheLineId; lineId <= lastByteCacheLineId; lineId++) 00452 { 00453 SingleLineAccess(lineId, hashMask, shiftHashMaskBits, tags, misses, hits); 00454 } 00455 } 00456 } 00457 00458 /*! 
00459 * @brief HLI function checks type of accesses (RaR, WaR, WaW, RaW) 00460 * @param[in] addresses Array of 64-bit addresses 00461 * @param[in] accessMask Per-channel mask of memory accesses 00462 * @param[in] dataSize Data size per single access 00463 * @param[in] accessType Type of access (load, store, atimic) 00464 * @param[in,out] memoryAccessAnalysisArgsCombo Information about memory access 00465 * @param[out] bitvector Resulting per-cacheline bitvector 00466 */ 00467 IGC_STACK_CALL void GlobalMemoryAccessAnalysisA64Access(__global MemoryAccessAnalysisArgsCombo* memoryAccessAnalysisArgsCombo) 00468 { 00469 __global uint64_t* addresses = memoryAccessAnalysisArgsCombo->addresses; 00470 uint32_t accessMask = memoryAccessAnalysisArgsCombo->accessMask; 00471 uint32_t dataSize = memoryAccessAnalysisArgsCombo->dataSize; 00472 uint32_t accessType = memoryAccessAnalysisArgsCombo->accessType; 00473 __global MemoryAccessAnalysisArgs* memoryAccessAnalysisArgs = memoryAccessAnalysisArgsCombo->memoryAccessAnalysisArgs; 00474 __global uint32_t* bitvector = (__global uint32_t*)memoryAccessAnalysisArgsCombo->bitvector; 00475 __global uint64_t* rarCount = (__global uint64_t*)&memoryAccessAnalysisArgs->out.rarCount; 00476 __global uint64_t* rawCount = (__global uint64_t*)&memoryAccessAnalysisArgs->out.rawCount; 00477 __global uint64_t* warCount = (__global uint64_t*)&memoryAccessAnalysisArgs->out.warCount; 00478 __global uint64_t* wawCount = (__global uint64_t*)&memoryAccessAnalysisArgs->out.wawCount; 00479 00480 if (accessMask != 0) 00481 { 00482 uint32_t shiftBits = memoryAccessAnalysisArgs->in.log2CachelineSize; 00483 uint64_t supportedAddressBitsMask = (((uint64_t)1 << memoryAccessAnalysisArgs->in.supportedAddressBits) - 1); 00484 00485 for (uint32_t eIndx = 0; eIndx != 32; ++eIndx) 00486 { 00487 if ((accessMask & (0x1 << eIndx)) != 0) 00488 { 00489 uint64_t firstByteAddr = addresses[eIndx]; 00490 uint64_t lastByteAddr = firstByteAddr + dataSize - 1; 00491 00492 
firstByteAddr &= supportedAddressBitsMask; 00493 lastByteAddr &= supportedAddressBitsMask; 00494 00495 uint64_t firstByteCacheLineId = firstByteAddr >> shiftBits; 00496 uint64_t lastByteCacheLineId = lastByteAddr >> shiftBits; 00497 00498 for (uint64_t lineId = firstByteCacheLineId; lineId <= lastByteCacheLineId; lineId++) 00499 { 00500 uint64_t dword = lineId >> 5; 00501 uint32_t bit = lineId & 0x1F; 00502 00503 uint32_t bitMask = (uint32_t)1 << bit; 00504 uint32_t prev = (accessType == Store || accessType == ReadModifyWrite) ? atomic_or(&bitvector[dword], bitMask) : atomic_and(&bitvector[dword], ~bitMask); 00505 00506 uint32_t prevBit = prev & bitMask; 00507 00508 if (accessType == Store) 00509 { 00510 if (prevBit == 0) 00511 { 00512 atom_inc(warCount); 00513 } 00514 else 00515 { 00516 atom_inc(wawCount); 00517 } 00518 } 00519 else 00520 { 00521 if (prevBit == 0) 00522 { 00523 atom_inc(rarCount); 00524 } 00525 else 00526 { 00527 atom_inc(rawCount); 00528 } 00529 } 00530 } 00531 00532 if (accessType == ReadModifyWrite) 00533 { 00534 atom_add(warCount, (lastByteCacheLineId - firstByteCacheLineId + 1)); 00535 } 00536 } 00537 } 00538 } 00539 } 00540 00541 /*! 
00542 * @brief HLI function checks type of access (RaR, WaR, WaW, RaW) for load_block2d/store_block2d messages 00543 * @param[in] addresses Array of 64-bit addresses 00544 * @param[in] accessMask Per-channel mask of memory accesses 00545 * @param[in] dataSize Data size 00546 * @param[in] accessType Type of access 00547 * @param[in,out] memoryFootprintArgs Information about memory access 00548 * @param[out] bitvector Resulting per-cacheline bitvector 00549 */ 00550 IGC_STACK_CALL void GlobalMemoryAccessAnalysisBlock2DAccess(__global MemoryAccessAnalysisArgsCombo* memoryAccessAnalysisArgsCombo) 00551 { 00552 if ((memoryAccessAnalysisArgsCombo->accessMask & 0x1) == 0) 00553 { 00554 return; 00555 } 00556 00557 // exec_size is ignored for this message 00558 00559 __global Block2D* block2d = memoryAccessAnalysisArgsCombo->addresses; 00560 uint32_t dataSize = memoryAccessAnalysisArgsCombo->dataSize; 00561 uint32_t accessType = memoryAccessAnalysisArgsCombo->accessType; 00562 __global MemoryAccessAnalysisArgs* memoryAccessAnalysisArgs = memoryAccessAnalysisArgsCombo->memoryAccessAnalysisArgs; 00563 __global uint32_t* bitvector = (__global uint32_t*)memoryAccessAnalysisArgsCombo->bitvector; 00564 __global uint64_t* rarCount = (__global uint64_t*)&memoryAccessAnalysisArgs->out.rarCount; 00565 __global uint64_t* rawCount = (__global uint64_t*)&memoryAccessAnalysisArgs->out.rawCount; 00566 __global uint64_t* warCount = (__global uint64_t*)&memoryAccessAnalysisArgs->out.warCount; 00567 __global uint64_t* wawCount = (__global uint64_t*)&memoryAccessAnalysisArgs->out.wawCount; 00568 00569 uint32_t shiftBits = memoryAccessAnalysisArgs->in.log2CachelineSize; 00570 uint64_t supportedAddressBitsMask = (((uint64_t)1 << memoryAccessAnalysisArgs->in.supportedAddressBits) - 1); 00571 00572 uint64_t surfaceBaseAddress = block2d->surface_base_address; 00573 uint32_t surfaceHeight = (block2d->surface_height & 0x00FFFFFF) + 1; // value -1-encoded 00574 uint32_t surfaceWidth = 
(block2d->surface_width & 0x00FFFFFF) + 1; // value -1-encoded 00575 uint32_t surfacePitch = (block2d->surface_pitch & 0x00FFFFFF) + 1; // value -1-encoded 00576 00577 uint32_t numBlocks = (block2d->array_length + 1); // value -1-encoded 00578 uint32_t blockHeight = (block2d->block_height + 1); // value -1-encoded 00579 uint32_t blockWidth = (block2d->block_width + 1); // value -1-encoded 00580 int32_t blockStartX = block2d->block_start_x; 00581 int32_t blockStartY = block2d->block_start_y; 00582 00583 uint32_t lastY = blockStartY + blockHeight - 1; 00584 if (lastY >= surfaceHeight) 00585 { 00586 lastY = surfaceHeight - 1; 00587 } 00588 00589 uint32_t lastX = blockStartX + numBlocks * blockWidth - 1; 00590 if (lastX * dataSize >= surfaceWidth) 00591 { 00592 lastX = (surfaceWidth / dataSize) - 1; 00593 } 00594 00595 for (uint32_t y = blockStartY; y <= lastY; y++) 00596 { 00597 uint64_t firstByteAddr = surfaceBaseAddress + y * surfacePitch + blockStartX * dataSize; 00598 uint64_t lastByteAddr = firstByteAddr + dataSize * lastX - 1; 00599 00600 firstByteAddr &= supportedAddressBitsMask; 00601 lastByteAddr &= supportedAddressBitsMask; 00602 00603 uint64_t firstByteCacheLineId = firstByteAddr >> shiftBits; 00604 uint64_t lastByteCacheLineId = lastByteAddr >> shiftBits; 00605 00606 for (uint64_t lineId = firstByteCacheLineId; lineId <= lastByteCacheLineId; lineId++) 00607 { 00608 uint64_t dword = lineId >> 5; 00609 uint32_t bit = lineId & 0x1F; 00610 00611 uint32_t bitMask = (uint32_t)1 << bit; 00612 uint32_t prev = (accessType == Store || accessType == ReadModifyWrite) ? 
atomic_or(&bitvector[dword], bitMask) : atomic_and(&bitvector[dword], ~bitMask); 00613 00614 uint32_t prevBit = prev & bitMask; 00615 00616 if (accessType == Store) 00617 { 00618 if (prevBit == 0) 00619 { 00620 atom_inc(warCount); 00621 } 00622 else 00623 { 00624 atom_inc(wawCount); 00625 } 00626 } 00627 else 00628 { 00629 if (prevBit == 0) 00630 { 00631 atom_inc(rarCount); 00632 } 00633 else 00634 { 00635 atom_inc(rawCount); 00636 } 00637 } 00638 } 00639 00640 if (accessType == ReadModifyWrite) 00641 { 00642 atom_add(warCount, (lastByteCacheLineId - firstByteCacheLineId + 1)); 00643 } 00644 } 00645 } 00646 00647 /*! 00648 * @brief HLI function checks for WaW accesses from different HW threads for A64 access 00649 * @param[in] addresses Array of 64-bit addresses 00650 * @param[in] accessMask Per-channel mask of memory accesses 00651 * @param[in] dataSize Data size per single access 00652 * @param[in] accessType Type of access 00653 * @param[in,out] memoryAccessAnalysisArgsCombo Information about memory access 00654 * @param[out] bitvector Resulting per-cacheline bitvector 00655 */ 00656 IGC_STACK_CALL void GlobalMemoryRaceDetectionA64Access(__global MemoryRaceDetectionArgsCombo* memoryRaceDetectionArgsCombo) 00657 { 00658 __global uint64_t* addresses = memoryRaceDetectionArgsCombo->addresses; 00659 uint32_t accessMask = memoryRaceDetectionArgsCombo->accessMask; 00660 uint32_t dataSize = memoryRaceDetectionArgsCombo->dataSize; 00661 uint32_t accessType = memoryRaceDetectionArgsCombo->accessType; 00662 uint32_t hwThreadId = memoryRaceDetectionArgsCombo->hwThreadId; 00663 __global MemoryRaceDetectionArgs* memoryRaceDetectionArgs = memoryRaceDetectionArgsCombo->memoryRaceDetectionArgs; 00664 __global uint32_t* raceDataVector = (__global uint32_t*)memoryRaceDetectionArgsCombo->raceDataVector; 00665 __global uint32_t* bitvector = (__global uint32_t*)memoryRaceDetectionArgsCombo->bitvector; 00666 __global uint64_t* raceCount = (__global 
uint64_t*)&memoryRaceDetectionArgs->out.raceCount; 00667 00668 if (accessMask != 0) 00669 { 00670 uint32_t shiftBits = memoryRaceDetectionArgs->in.log2CachelineSize; 00671 uint64_t supportedAddressBitsMask = (((uint64_t)1 << memoryRaceDetectionArgs->in.supportedAddressBits) - 1); 00672 uint64_t unsupportedAddressBitsMask = ~supportedAddressBitsMask; 00673 uint64_t upperAddressBits = 0; 00674 00675 for (uint32_t eIndx = 0; eIndx != 32; ++eIndx) 00676 { 00677 if ((accessMask & (0x1 << eIndx)) != 0) 00678 { 00679 uint64_t firstByteAddr = addresses[eIndx]; 00680 uint64_t lastByteAddr = firstByteAddr + dataSize - 1; 00681 00682 upperAddressBits = (firstByteAddr & unsupportedAddressBitsMask); 00683 00684 firstByteAddr &= supportedAddressBitsMask; 00685 lastByteAddr &= supportedAddressBitsMask; 00686 00687 uint64_t firstByteCacheLineId = firstByteAddr >> shiftBits; 00688 uint64_t lastByteCacheLineId = lastByteAddr >> shiftBits; 00689 00690 for (uint64_t lineId = firstByteCacheLineId; lineId <= lastByteCacheLineId; lineId++) 00691 { 00692 uint32_t currHwThreadId = hwThreadId & 0xFFFF; 00693 uint32_t mask = currHwThreadId | ((accessType != Load) ? 0x80000000 : 0); 00694 uint32_t prevMask = atomic_xchg(&raceDataVector[lineId], mask); 00695 uint32_t prevBit = (prevMask == 0xFFFFFFFF) ? 0 : (prevMask & 0x80000000); 00696 uint32_t prevHwThreadId = prevMask & 0xFFFF; 00697 00698 if (accessType == Store) 00699 { 00700 if ((prevBit != 0) && prevHwThreadId != currHwThreadId) 00701 { 00702 // if got here a race condition was detected 00703 atom_inc(raceCount); 00704 atomic_or(&bitvector[lineId >> 5], (1 << (lineId & 0x1F))); 00705 } 00706 } 00707 00708 } 00709 } 00710 } 00711 00712 memoryRaceDetectionArgs->out.upperAddressBits = upperAddressBits; 00713 00714 } 00715 } 00716 00717 /*! 
00718 * @brief HLI function that counts memory access patterns (RaR, RaW, WaR, WaW) and checks whether WaW accesses from different HW threads for A64 access 00719 * @param[in] addresses Array of 64-bit addresses 00720 * @param[in] accessMask Per-channel mask of memory accesses 00721 * @param[in] dataSize Data size per single access 00722 * @param[in] accessType Type of access 00723 * @param[in,out] memoryAccessAnalysisArgsCombo Information about memory access 00724 * @param[out] bitvector Resulting per-cacheline bitvector 00725 */ 00726 IGC_STACK_CALL void GlobalMemoryAccessAnalysisAndRaceDetectionA64Access(__global MemoryRaceDetectionArgsCombo* memoryRaceDetectionArgsCombo) 00727 { 00728 __global uint64_t* addresses = memoryRaceDetectionArgsCombo->addresses; 00729 uint32_t accessMask = memoryRaceDetectionArgsCombo->accessMask; 00730 uint32_t dataSize = memoryRaceDetectionArgsCombo->dataSize; 00731 uint32_t accessType = memoryRaceDetectionArgsCombo->accessType; 00732 uint32_t hwThreadId = memoryRaceDetectionArgsCombo->hwThreadId; 00733 __global MemoryRaceDetectionArgs* memoryRaceDetectionArgs = memoryRaceDetectionArgsCombo->memoryRaceDetectionArgs; 00734 __global uint32_t* raceDataVector = (__global uint32_t*)memoryRaceDetectionArgsCombo->raceDataVector; 00735 __global uint32_t* bitvector = (__global uint32_t*)memoryRaceDetectionArgsCombo->bitvector; 00736 __global uint64_t* raceCount = (__global uint64_t*)&memoryRaceDetectionArgs->out.raceCount; 00737 __global uint64_t* rarCount = (__global uint64_t*)&memoryRaceDetectionArgs->out.rarCount; 00738 __global uint64_t* rawCount = (__global uint64_t*)&memoryRaceDetectionArgs->out.rawCount; 00739 __global uint64_t* warCount = (__global uint64_t*)&memoryRaceDetectionArgs->out.warCount; 00740 __global uint64_t* wawCount = (__global uint64_t*)&memoryRaceDetectionArgs->out.wawCount; 00741 00742 if (accessMask != 0) 00743 { 00744 uint32_t shiftBits = memoryRaceDetectionArgs->in.log2CachelineSize; 00745 uint64_t 
supportedAddressBitsMask = (((uint64_t)1 << memoryRaceDetectionArgs->in.supportedAddressBits) - 1); 00746 uint64_t unsupportedAddressBitsMask = ~supportedAddressBitsMask; 00747 uint64_t upperAddressBits = 0; 00748 00749 for (uint32_t eIndx = 0; eIndx != 32; ++eIndx) 00750 { 00751 if ((accessMask & (0x1 << eIndx)) != 0) 00752 { 00753 uint64_t firstByteAddr = addresses[eIndx]; 00754 uint64_t lastByteAddr = firstByteAddr + dataSize - 1; 00755 00756 upperAddressBits = (firstByteAddr & unsupportedAddressBitsMask); 00757 00758 firstByteAddr &= supportedAddressBitsMask; 00759 lastByteAddr &= supportedAddressBitsMask; 00760 00761 uint64_t firstByteCacheLineId = firstByteAddr >> shiftBits; 00762 uint64_t lastByteCacheLineId = lastByteAddr >> shiftBits; 00763 00764 for (uint64_t lineId = firstByteCacheLineId; lineId <= lastByteCacheLineId; lineId++) 00765 { 00766 uint32_t currHwThreadId = hwThreadId & 0xFFFF; 00767 uint32_t mask = currHwThreadId | ((accessType != Load) ? 0x80000000 : 0); 00768 uint32_t prevMask = atomic_xchg(&raceDataVector[lineId], mask); 00769 uint32_t prevBit = (prevMask == 0xFFFFFFFF) ? 0 : (prevMask & 0x80000000); 00770 uint32_t prevHwThreadId = prevMask & 0xFFFF; 00771 00772 if (accessType == Store) 00773 { 00774 if ((prevBit != 0)) 00775 { 00776 if (prevHwThreadId != currHwThreadId) 00777 { 00778 // if got here a race condition was detected 00779 atom_inc(raceCount); 00780 atomic_or(&bitvector[lineId >> 5], (1 << (lineId & 0x1F))); 00781 } 00782 atom_inc(wawCount); 00783 } 00784 else 00785 { 00786 atom_inc(warCount); 00787 } 00788 } 00789 else 00790 { 00791 if (prevBit != 0) 00792 { 00793 atom_inc(rawCount); 00794 } 00795 else 00796 { 00797 atom_inc(rarCount); 00798 } 00799 } 00800 } 00801 00802 if (accessType == ReadModifyWrite) 00803 { 00804 atom_add(warCount, (lastByteCacheLineId - firstByteCacheLineId + 1)); 00805 } 00806 } 00807 } 00808 00809 memoryRaceDetectionArgs->out.upperAddressBits = upperAddressBits; 00810 00811 } 00812 }
(Back to the list of all GTPin Sample Tools)
Copyright (C) 2013-2025 Intel Corporation
SPDX-License-Identifier: MIT
1.7.4