20 #if defined PROTO_CUDA 22 #include <cuda_runtime.h> 23 #elif defined PROTO_HIP 24 #include "hip/hip_runtime.h" 25 #include "hip/hip_runtime_api.h" 30 #define protoGetCurrentStream DisjointBoxLayout::getCurrentStream() 34 #define protoStream_t hipStream_t 35 #define protoMemcpyDeviceToDevice hipMemcpyDeviceToDevice 36 #define protoMemcpyHostToDevice hipMemcpyHostToDevice 37 #define protoMemcpyDeviceToHost hipMemcpyDeviceToHost 38 #define protoError hipError_t //legacy 39 #define protoError_t hipError_t 40 #define protoSuccess hipSuccess 41 #define protoDeviceProp hipDeviceProp_t 42 #define protoPointerAttributes hipPointerAttribute_t 43 #define protoPitchedPtr hipPitchedPtr 44 #define protoArray hipArray 45 #define protoExtent hipExtent 46 #define protoChannelFormatDesc hipChannelFormatDesc 47 #define protoReadModeElementType hipReadModeElementType 48 #define protoEvent_t hipEvent_t 49 #define protoGetLastError() hipGetLastError() 50 #define protoPeekAtLastError() hipPeekAtLastError() 51 #define protoGetErrorString(X) hipGetErrorString(X) 52 #define protoThreadSynchronize() hipDeviceSynchronize() 55 #elif defined PROTO_CUDA 56 #define protoStream_t cudaStream_t 57 #define protoMemcpyDeviceToDevice cudaMemcpyDeviceToDevice 58 #define protoMemcpyHostToDevice cudaMemcpyHostToDevice 59 #define protoMemcpyDeviceToHost cudaMemcpyDeviceToHost 60 #define protoError cudaError // legacy 61 #define protoError_t cudaError 62 #define protoSuccess cudaSuccess 63 #define protoDeviceProp cudaDeviceProp 64 #define protoPointerAttributes cudaPointerAttributes 65 #define protoPitchedPtr cudaPitchedPtr 66 #define protoArray cudaArray 67 #define protoExtent cudaExtent 68 #define protoChannelFormatDesc cudaChannelFormatDesc 69 #define protoReadModeElementType cudaReadModeElementType 70 #define protoEvent_t cudaEvent_t 71 #define protoGetLastError() cudaGetLastError() 72 #define protoPeekAtLastError() cudaPeekAtLastError() 73 #define protoGetErrorString(X) cudaGetErrorString(X) 74 #define protoThreadSynchronize() cudaThreadSynchronize() 81 #define GPU_CHECK(in) \ 84 protoError_t error = in; \ 85 if(error != protoSuccess) \ 87 std::cout << protoGetErrorString(error); \ 92 #define GPU_CHECK(in) \ 95 std::cout << "Try " << #in << " file: "<< __FILE__ << " line: " << __LINE__ << std::endl; \ 96 protoError_t error = in; \ 97 protoDeviceSynchronizeGPU();\ 98 if(error != protoSuccess) \ 100 std::cout << protoGetErrorString(error); \ 103 else std::cout << "Success " << #in << std::endl; \ 108 #define GPU_CHECK(condition) condition 114 #if defined PROTO_HIP // HIP 116 #define HC(X) GPU_CHECK(X) 119 #define protoMallocGPU(PTR,NBYTES) storeMemInfo(DEVICE,NBYTES); countMallocDevice(HC(hipMalloc(&PTR,NBYTES))) 120 #define protoFreeGPU(PTR) HC(hipFree(PTR)) 121 #define protoMallocHost(a,b) countMallocDevice(HC(hipHostMalloc(&a,b))) 125 #define protoMallocManaged(a,b) HC(hipMallocManaged(&a,b)) 126 #define protoMemset(a,b,c) HC(hipMemset(a,b,c)) 129 #define protoMemcpyGPU(to,from,size,copyType) HC(hipMemcpy(to,from,size,copyType)) 130 #define protoMemcpyAsyncGPU(to,from,size,copyType,stream) HC(hipMemcpyAsync(to,from,size,copyType, stream)) 131 #define protoMemcpyFromSymbolGPU(a,b,c,d,e) hipMemcpyFromSymbol(a,b,c,d,e) // not used anymore 132 #define protoMemcpyToSymbolGPU(a,b,c,d,e) hipMemcpyToSymbol(a,b,c,d,e) 135 #define protoDeviceSynchronizeGPU() hipDeviceSynchronize() 136 #define protoStreamCreate(X) HC(hipStreamCreate(X)) 137 #define protoStreamDestroy(X) HC(hipStreamDestroy(X)) 138 #define protoStreamSynchronize(X) HC(hipStreamSynchronize(X)) 141 #define protoSetDevice(X) HC(hipSetDevice(X)) 142 #define protoGetDeviceProperties(X,Y) HC(hipGetDeviceProperties(X,Y)) 143 #define protoDeviceReset() HC(hipDeviceReset()) 144 #define protoPointerGetAttributes(X,Y) HC(hipPointerGetAttributes(X,Y)) 145 #define protoGetDeviceCount(X) HC(hipGetDeviceCount(X)) 146 #define protoGetDevice(X) HC(hipGetDevice(X)) 147 #define protoMemGetInfo(X,Y) HC(hipMemGetInfo(X,Y)) 150 #define protoEventCreate(X) HC(hipEventCreate(X)) 151 #define protoEventRecord(X) HC(hipEventRecord(X)) 152 #define protoEventSynchronize(X) hipEventSynchronize(X) 153 #define protoEventElapsedTime(a,b,c) HC(hipEventElapsedTime(a,b,c)) 156 #define protoBindTexture(a,b,c,d,e) HC(hipBindTexture(a,b,c,d,e)) 157 #define protoMalloc3D(a,b) HC(hipMalloc3D(&a,b)) 158 #define make_protoExtent hip_cudaExtent 160 #elif defined PROTO_CUDA 162 #define CC(X) GPU_CHECK(X) // CudaCheck 166 #define protoMallocGPU(a,b) storeMemInfo(DEVICE,b); countMallocDevice(CC(cudaMalloc(&a,b))) 167 #define protoFreeGPU(a) CC(cudaFree(a)) 172 #define protoMallocHost(a,b) countMallocDevice(CC(cudaMallocHost(&a,b))) 176 #define protoMallocManaged(a,b) CC(cudaMallocManaged(&a,b)) 177 #define protoMemset(a,b,c) CC(cudaMemset(a,b,c)) 180 #define protoMemcpyGPU(to,from,size,copyType) CC(cudaMemcpy(to,from,size,copyType)) 181 #define protoMemcpyAsyncGPU(to,from,size,copyType,stream) CC(cudaMemcpyAsync(to,from,size,copyType, stream)) 182 #define protoMemcpyFromSymbolGPU(a,b,c,d,e) cudaMemcpyFromSymbol(a,b,c,d,e) 183 #define protoMemcpyToSymbolGPU(a,b,c,d,e) cudaMemcpyToSymbol(a,b,c,d,e) 186 #define protoDeviceSynchronizeGPU() cudaDeviceSynchronize() 187 #define protoStreamCreate(X) CC(cudaStreamCreate(X)) 188 #define protoStreamDestroy(X) CC(cudaStreamDestroy(X)) 189 #define protoStreamSynchronize(X) CC(cudaStreamSynchronize(X)) 192 #define protoSetDevice(X) CC(cudaSetDevice(X)) 193 #define protoGetDeviceProperties(X,Y) CC(cudaGetDeviceProperties(X,Y)) 194 #define protoDeviceReset() CC(cudaDeviceReset()) 195 #define protoPointerGetAttributes(X,Y) CC(cudaPointerGetAttributes(X,Y)) 196 #define protoGetDevice(X) CC(cudaGetDevice(X)) 197 #define protoGetDeviceCount(X) CC(cudaGetDeviceCount(X)) 198 #define protoMemGetInfo(X,Y) CC(cudaMemGetInfo(X,Y)) 201 #define protoEventCreate(X) CC(cudaEventCreate(X)) 202 #define protoEventRecord(X) CC(cudaEventRecord(X)) 203 #define protoEventSynchronize(X) cudaEventSynchronize(X) 204 #define protoEventElapsedTime(a,b,c) CC(cudaEventElapsedTime(a,b,c)) 207 #define protoBindTexture(a,b,c,d,e) CC(cudaBindTexture(a,b,c,d,e)) 208 #define protoMalloc3D(a,b) CC(cudaMalloc3D(&a,b)) 209 #define make_protoExtent make_cudaExtent 216 static void printDim(
unsigned int a_in) {std::cout << a_in ;}
217 static void printDim(
dim3 a_in){std::cout <<
"("<<a_in.
x<<
","<<a_in.
y<<
","<<a_in.
z<<
")";}
218 #define PRINT_KERNEL_NAME_ARGS(IN,BLOCKS,THREADS) Ker tmp_name; std::cout << " kernel name: " << typeid(tmp_name).name() << " blocks "; printDim(BLOCKS); std::cout << " number of threads " << THREADS << std::endl; 220 #define PRINT_KERNEL_NAME_ARGS(IN,BLOCKS,THREADS) 223 #define PRINT_KERNEL_NAME_ARGS(IN,BLOCKS,THREADS) 227 #define PRINT_KER(X) std::cout << "Kernel: "<< #X << " file " << __FILE__ << " line " <<__LINE__<< std::endl; \ 229 protoDeviceSynchronizeGPU();\ 230 {protoError_t error = protoPeekAtLastError(); \ 231 protoDeviceSynchronizeGPU();\ 232 if(error != protoSuccess) \ 234 std::cout << protoGetErrorString(error); \ 237 else std::cout << "Success Kernel: "<< #X << std::endl;} 239 #define PRINT_KER(X) X 242 #if defined PROTO_HIP 243 #define protoLaunchKernelGPU(Ker, nbBlocks, nbThreads, args...) \ 244 PRINT_KER(hipLaunchKernelGGL(Ker, nbBlocks, nbThreads, 0, 0, args);) 245 #define protoLaunchKernelMemGPU(Ker, nbBlocks, nbThreads, smem, args...) \ 246 PRINT_KER(hipLaunchKernelGGL( Ker, nbBlocks, nbThreads, smem, 0, args);) 247 #define protoLaunchKernelAsyncGPU(Ker, nbBlocks, nbThreads, stream, args...) \ 248 PRINT_KER( hipLaunchKernelGGL( Ker, nbBlocks, nbThreads, 0, stream, args);) 249 #define protoLaunchKernelMemAsyncGPU(Ker, nbBlocks, nbThreads, smem, stream, args...) \ 250 PRINT_KER( hipLaunchKernelGGL( Ker, nbBlocks, nbThreads, smem, stream, args);) 251 #elif defined PROTO_CUDA 252 #define protoLaunchKernelGPU(Ker, nbBlocks, nbThreads, args...) \ 253 PRINT_KER( ( Ker<<<nbBlocks,nbThreads>>>(args)) ) 254 #define protoLaunchKernelMemGPU(Ker, nbBlocks, nbThreads, smem, args...) \ 255 PRINT_KER(( Ker<<<nbBlocks, nbThreads,smem>>>(args))) 256 #define protoLaunchKernelAsyncGPU(Ker, nbBlocks, nbThreads, stream, args...) \ 257 PRINT_KER((Ker<<<nbBlocks, nbThreads,0,stream>>>(args))) 258 #define protoLaunchKernelMemAsyncGPU(Ker, nbBlocks, nbThreads, smem, stream, args...) \ 259 PRINT_KER((Ker<<<nbBlocks, nbThreads,smem,stream>>>(args))) 267 #if defined PROTO_HIP 268 protoPointerAttributes att;
269 protoPointerGetAttributes(&att, ptr);
270 if(att.memoryType == hipMemoryTypeDevice)
return true;
271 #elif defined PROTO_CUDA 272 protoPointerAttributes att;
273 protoPointerGetAttributes(&att, ptr);
274 if(att.type == 2)
return true;
282 inline void v100tuning(
int nbElems,
int & nbBlocks,
int &blockSize)
286 nbBlocks = ( nbElems + blockSize - 1)/ blockSize;
293 blockSize = ( nbElems + nbBlocks - 1) / nbBlocks;
295 int coeff = blockSize / 32;
298 blockSize = coeff * 32;
300 nbBlocks = ( nbElems + blockSize - 1) / nbBlocks;
size_t y
Definition: Proto_Macros.H:12
size_t z
Definition: Proto_Macros.H:13
size_t x
Definition: Proto_Macros.H:11
Definition: Proto_Macros.H:9
void v100tuning(int nbElems, int &nbBlocks, int &blockSize)
Definition: Proto_GPU.H:282
bool isDeviceMemory(T *ptr)
Definition: Proto_GPU.H:264