24 #include "hip/hip_runtime.h" 25 #include "hip/hip_runtime_api.h" 29 #include <cuda_runtime.h> 32 #define protoGetCurrentStream DisjointBoxLayout::getCurrentStream() 35 #ifdef PROTO_HIP // HIP 36 #define protoStream_t hipStream_t 37 #define protoMemcpyDeviceToDevice hipMemcpyDeviceToDevice 38 #define protoMemcpyHostToDevice hipMemcpyHostToDevice 39 #define protoMemcpyDeviceToHost hipMemcpyDeviceToHost 40 #define protoError hipError_t //legacy 41 #define protoError_t hipError_t 42 #define protoSuccess hipSuccess 43 #define protoDeviceProp hipDeviceProp_t 44 #define protoPointerAttributes hipPointerAttribute_t 45 #define protoPitchedPtr hipPitchedPtr 46 #define protoArray hipArray 47 #define protoExtent hipExtent 48 #define protoChannelFormatDesc hipChannelFormatDesc 49 #define protoReadModeElementType hipReadModeElementType 50 #define protoEvent_t hipEvent_t 51 #define protoGetLastError() hipGetLastError() 52 #define protoPeekAtLastError() hipPeekAtLastError() 53 #define protoGetErrorString(X) hipGetErrorString(X) 54 #define protoThreadSynchronize() hipDeviceSynchronize() 58 #define protoStream_t cudaStream_t 59 #define protoMemcpyDeviceToDevice cudaMemcpyDeviceToDevice 60 #define protoMemcpyHostToDevice cudaMemcpyHostToDevice 61 #define protoMemcpyDeviceToHost cudaMemcpyDeviceToHost 62 #define protoError cudaError // legacy 63 #define protoError_t cudaError 64 #define protoSuccess cudaSuccess 65 #define protoDeviceProp cudaDeviceProp 66 #define protoPointerAttributes cudaPointerAttributes 67 #define protoPitchedPtr cudaPitchedPtr 68 #define protoArray cudaArray 69 #define protoExtent cudaExtent 70 #define protoChannelFormatDesc cudaChannelFormatDesc 71 #define protoReadModeElementType cudaReadModeElementType 72 #define protoEvent_t cudaEvent_t 73 #define protoGetLastError() cudaGetLastError() 74 #define protoPeekAtLastError() cudaPeekAtLastError() 75 #define protoGetErrorString(X) cudaGetErrorString(X) 76 #define protoThreadSynchronize() cudaThreadSynchronize() 83 #define GPU_CHECK(in) \ 86 protoError_t error = in; \ 87 if(error != protoSuccess) \ 89 std::cout << protoGetErrorString(error); \ 94 #define GPU_CHECK(in) \ 97 std::cout << "Try " << #in << " file: "<< __FILE__ << " line: " << __LINE__ << std::endl; \ 98 protoError_t error = in; \ 99 protoDeviceSynchronizeGPU();\ 100 if(error != protoSuccess) \ 102 std::cout << protoGetErrorString(error); \ 105 else std::cout << "Success " << #in << std::endl; \ 110 #define GPU_CHECK(condition) condition 116 #ifdef PROTO_HIP // HIP 118 #define HC(X) GPU_CHECK(X) 121 #define protoMallocGPU(PTR,NBYTES) storeMemInfo(DEVICE,NBYTES); countMallocDevice(HC(hipMalloc(&PTR,NBYTES))) 122 #define protoFreeGPU(PTR) HC(hipFree(PTR)) 123 #define protoMallocHost(a,b) countMallocDevice(HC(hipHostMalloc(&a,b))) 127 #define protoMallocManaged(a,b) HC(hipMallocManaged(&a,b)) 128 #define protoMemset(a,b,c) HC(hipMemset(a,b,c)) 131 #define protoMemcpyGPU(to,from,size,copyType) HC(hipMemcpy(to,from,size,copyType)) 132 #define protoMemcpyAsyncGPU(to,from,size,copyType,stream) HC(hipMemcpyAsync(to,from,size,copyType, stream)) 133 #define protoMemcpyFromSymbolGPU(a,b,c,d,e) hipMemcpyFromSymbol(a,b,c,d,e) // not used anymore 134 #define protoMemcpyToSymbolGPU(a,b,c,d,e) hipMemcpyToSymbol(a,b,c,d,e) 137 #define protoDeviceSynchronizeGPU() hipDeviceSynchronize() 138 #define protoStreamCreate(X) HC(hipStreamCreate(X)) 139 #define protoStreamDestroy(X) HC(hipStreamDestroy(X)) 140 #define protoStreamSynchronize(X) HC(hipStreamSynchronize(X)) 143 #define protoSetDevice(X) HC(hipSetDevice(X)) 144 #define protoGetDeviceProperties(X,Y) HC(hipGetDeviceProperties(X,Y)) 145 #define protoDeviceReset() HC(hipDeviceReset()) 146 #define protoPointerGetAttributes(X,Y) HC(hipPointerGetAttributes(X,Y)) 147 #define protoGetDeviceCount(X) HC(hipGetDeviceCount(X)) 148 #define protoGetDevice(X) HC(hipGetDevice(X)) 149 #define protoMemGetInfo(X,Y) HC(hipMemGetInfo(X,Y)) 152 #define protoEventCreate(X) HC(hipEventCreate(X)) 153 #define protoEventRecord(X) HC(hipEventRecord(X)) 154 #define protoEventSynchronize(X) hipEventSynchronize(X) 155 #define protoEventElapsedTime(a,b,c) HC(hipEventElapsedTime(a,b,c)) 158 #define protoBindTexture(a,b,c,d,e) HC(hipBindTexture(a,b,c,d,e)) 159 #define protoMalloc3D(a,b) HC(hipMalloc3D(&a,b)) 160 #define make_protoExtent hip_cudaExtent 164 #define CC(X) GPU_CHECK(X) // CudaCheck 168 #define protoMallocGPU(a,b) storeMemInfo(DEVICE,b); countMallocDevice(CC(cudaMalloc(&a,b))) 169 #define protoFreeGPU(a) CC(cudaFree(a)) 174 #define protoMallocHost(a,b) countMallocDevice(CC(cudaMallocHost(&a,b))) 178 #define protoMallocManaged(a,b) CC(cudaMallocManaged(&a,b)) 179 #define protoMemset(a,b,c) CC(cudaMemset(a,b,c)) 182 #define protoMemcpyGPU(to,from,size,copyType) CC(cudaMemcpy(to,from,size,copyType)) 183 #define protoMemcpyAsyncGPU(to,from,size,copyType,stream) CC(cudaMemcpyAsync(to,from,size,copyType, stream)) 184 #define protoMemcpyFromSymbolGPU(a,b,c,d,e) cudaMemcpyFromSymbol(a,b,c,d,e) 185 #define protoMemcpyToSymbolGPU(a,b,c,d,e) cudaMemcpyToSymbol(a,b,c,d,e) 188 #define protoDeviceSynchronizeGPU() cudaDeviceSynchronize() 189 #define protoStreamCreate(X) CC(cudaStreamCreate(X)) 190 #define protoStreamDestroy(X) CC(cudaStreamDestroy(X)) 191 #define protoStreamSynchronize(X) CC(cudaStreamSynchronize(X)) 194 #define protoSetDevice(X) CC(cudaSetDevice(X)) 195 #define protoGetDeviceProperties(X,Y) CC(cudaGetDeviceProperties(X,Y)) 196 #define protoDeviceReset() CC(cudaDeviceReset()) 197 #define protoPointerGetAttributes(X,Y) CC(cudaPointerGetAttributes(X,Y)) 198 #define protoGetDevice(X) CC(cudaGetDevice(X)) 199 #define protoGetDeviceCount(X) CC(cudaGetDeviceCount(X)) 200 #define protoMemGetInfo(X,Y) CC(cudaMemGetInfo(X,Y)) 203 #define protoEventCreate(X) CC(cudaEventCreate(X)) 204 #define protoEventRecord(X) CC(cudaEventRecord(X)) 205 #define protoEventSynchronize(X) cudaEventSynchronize(X) 206 #define protoEventElapsedTime(a,b,c) CC(cudaEventElapsedTime(a,b,c)) 209 #define protoBindTexture(a,b,c,d,e) CC(cudaBindTexture(a,b,c,d,e)) 210 #define protoMalloc3D(a,b) CC(cudaMalloc3D(&a,b)) 211 #define make_protoExtent make_cudaExtent 220 static void printDim(
unsigned int a_in) {std::cout << a_in ;}
221 static void printDim(
dim3 a_in){std::cout <<
"("<<a_in.x<<
","<<a_in.y<<
","<<a_in.z<<
")";}
222 #define PRINT_KERNEL_NAME_ARGS(IN,BLOCKS,THREADS) Ker tmp_name; std::cout << " kernel name: " << typeid(tmp_name).name() << " blocks "; printDim(BLOCKS); std::cout << " number of threads " << THREADS << std::endl; 224 #define PRINT_KERNEL_NAME_ARGS(IN,BLOCKS,THREADS) 227 #define PRINT_KERNEL_NAME_ARGS(IN,BLOCKS,THREADS) 231 #define PRINT_KER(X) std::cout << "Kernel: "<< #X << " file " << __FILE__ << " line " <<__LINE__<< std::endl; \ 233 protoDeviceSynchronizeGPU();\ 234 {protoError_t error = protoPeekAtLastError(); \ 235 protoDeviceSynchronizeGPU();\ 236 if(error != protoSuccess) \ 238 std::cout << protoGetErrorString(error); \ 241 else std::cout << "Success Kernel: "<< #X << std::endl;} 242 #define PRINT_KER_CUDA(X) std::cout << "Kernel: "<< #X << " file " << __FILE__ << " line " <<__LINE__<< std::endl; \ 244 protoDeviceSynchronizeGPU();\ 245 {protoError_t error = protoPeekAtLastError(); \ 246 protoDeviceSynchronizeGPU();\ 247 if(error != protoSuccess) \ 249 std::cout << protoGetErrorString(error); \ 252 else std::cout << "Success Kernel: "<< #X << std::endl;} 254 #define PRINT_KER(X) X 255 #define PRINT_KER_CUDA(X) X 259 #define protoLaunchKernelGPU(Ker, nbBlocks, nbThreads, args...) PRINT_KER(hipLaunchKernelGGL(Ker, nbBlocks, nbThreads, 0, 0, args);) 261 #define protoLaunchKernelGPU(Ker, nbBlocks, nbThreads, args...) PRINT_KER_CUDA( ( Ker<<<nbBlocks,nbThreads>>>(args)) ) 265 #define protoLaunchKernelMemGPU(Ker, nbBlocks, nbThreads, smem, args...) PRINT_KER(hipLaunchKernelGGL( Ker, nbBlocks, nbThreads, smem, 0, args);) 268 #define protoLaunchKernelMem(Ker, nbBlocks, nbThreads, smem, args...) PRINT_KER_CUDA(( Ker<<<nbBlocks, nbThreads,smem>>>(args))) 275 #define protoLaunchKernelMemAsyncGPU(Ker, nbBlocks, nbThreads, smem, stream, args...) PRINT_KER( hipLaunchKernelGGL( Ker, nbBlocks, nbThreads, smem, stream, args);) 277 #define protoLaunchKernelMemAsyncGPU(Ker, nbBlocks, nbThreads, smem, stream, args...) PRINT_KER_CUDA((Ker<<<nbBlocks, nbThreads,smem,stream>>>(args))) 282 inline bool isDeviceMemory(T* ptr)
284 protoPointerAttributes att;
285 protoPointerGetAttributes(&att, ptr);
287 if(att.memoryType == hipMemoryTypeDevice)
return true;
289 if(att.type == 2)
return true;
299 inline void v100tuning(
int nbElems,
int & nbBlocks,
int &blockSize)
303 nbBlocks = ( nbElems + blockSize - 1)/ blockSize;
310 blockSize = ( nbElems + nbBlocks - 1) / nbBlocks;
312 int coeff = blockSize / 32;
315 blockSize = coeff * 32;
317 nbBlocks = ( nbElems + blockSize - 1) / nbBlocks;
Definition: Proto_macros.H:13