Proto
Proto_gpu.H
1 
17 
18 #pragma once
19 
20 #ifdef PROTO_CUDA
21 
22 
23 #ifdef PROTO_HIP
24 #include "hip/hip_runtime.h"
25 #include "hip/hip_runtime_api.h"
26 #include "iostream" // Use for CHECK(X)
27 #else
28 #include "cuda.h"
29 #include <cuda_runtime.h>
30 #endif
31 
32 #define protoGetCurrentStream DisjointBoxLayout::getCurrentStream()
33 
35 #ifdef PROTO_HIP // HIP
36  #define protoStream_t hipStream_t
37  #define protoMemcpyDeviceToDevice hipMemcpyDeviceToDevice
38  #define protoMemcpyHostToDevice hipMemcpyHostToDevice
39  #define protoMemcpyDeviceToHost hipMemcpyDeviceToHost
40  #define protoError hipError_t //legacy
41  #define protoError_t hipError_t
42  #define protoSuccess hipSuccess
43  #define protoDeviceProp hipDeviceProp_t
44  #define protoPointerAttributes hipPointerAttribute_t
45  #define protoPitchedPtr hipPitchedPtr
46  #define protoArray hipArray
47  #define protoExtent hipExtent
48  #define protoChannelFormatDesc hipChannelFormatDesc
49  #define protoReadModeElementType hipReadModeElementType
50  #define protoEvent_t hipEvent_t
51  #define protoGetLastError() hipGetLastError()
52  #define protoPeekAtLastError() hipPeekAtLastError()
53  #define protoGetErrorString(X) hipGetErrorString(X)
54  #define protoThreadSynchronize() hipDeviceSynchronize()
55 
56 
57 #else // CUDA
58  #define protoStream_t cudaStream_t
59  #define protoMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
60  #define protoMemcpyHostToDevice cudaMemcpyHostToDevice
61  #define protoMemcpyDeviceToHost cudaMemcpyDeviceToHost
62  #define protoError cudaError // legacy
63  #define protoError_t cudaError
64  #define protoSuccess cudaSuccess
65  #define protoDeviceProp cudaDeviceProp
66  #define protoPointerAttributes cudaPointerAttributes
67  #define protoPitchedPtr cudaPitchedPtr
68  #define protoArray cudaArray
69  #define protoExtent cudaExtent
70  #define protoChannelFormatDesc cudaChannelFormatDesc
71  #define protoReadModeElementType cudaReadModeElementType
72  #define protoEvent_t cudaEvent_t
73  #define protoGetLastError() cudaGetLastError()
74  #define protoPeekAtLastError() cudaPeekAtLastError()
75  #define protoGetErrorString(X) cudaGetErrorString(X)
76  #define protoThreadSynchronize() cudaThreadSynchronize()
77 
78 #endif
79 
80 #ifndef NDEBUG
81 
82  #ifndef superDebug
83  #define GPU_CHECK(in) \
84  do \
85  { \
86  protoError_t error = in; \
87  if(error != protoSuccess) \
88  { \
89  std::cout << protoGetErrorString(error); \
90  exit(0); \
91  }\
92  } while(0)
93  #else
94  #define GPU_CHECK(in) \
95  do \
96  { \
97  std::cout << "Try " << #in << " file: "<< __FILE__ << " line: " << __LINE__ << std::endl; \
98  protoError_t error = in; \
99  protoDeviceSynchronizeGPU();\
100  if(error != protoSuccess) \
101  { \
102  std::cout << protoGetErrorString(error); \
103  exit(0); \
104  }\
105  else std::cout << "Success " << #in << std::endl; \
106  } while(0)
107  #endif
108 
109 #else
110  #define GPU_CHECK(condition) condition
111 #endif
112 
113 
114 
116 #ifdef PROTO_HIP // HIP
117 
118  #define HC(X) GPU_CHECK(X)
119 
120  // MEMORY
121  #define protoMallocGPU(PTR,NBYTES) storeMemInfo(DEVICE,NBYTES); countMallocDevice(HC(hipMalloc(&PTR,NBYTES)))
122  #define protoFreeGPU(PTR) HC(hipFree(PTR))
123  #define protoMallocHost(a,b) countMallocDevice(HC(hipHostMalloc(&a,b)))
124  //#define protoHostAlloc(PTR,NBYTES) HC(hipHostMalloc(&PTR,NBYTES))
125  //#define protoFreeHost(PTR) HC(hipHostFree(PTR))
126  //#define protoHostFree(PTR) HC(hipFreeHost(PTR))
127  #define protoMallocManaged(a,b) HC(hipMallocManaged(&a,b))
128  #define protoMemset(a,b,c) HC(hipMemset(a,b,c))
129 
130  // COPY
131  #define protoMemcpyGPU(to,from,size,copyType) HC(hipMemcpy(to,from,size,copyType))
132  #define protoMemcpyAsyncGPU(to,from,size,copyType,stream) HC(hipMemcpyAsync(to,from,size,copyType, stream))
133  #define protoMemcpyFromSymbolGPU(a,b,c,d,e) hipMemcpyFromSymbol(a,b,c,d,e) // not used anymore
134  #define protoMemcpyToSymbolGPU(a,b,c,d,e) hipMemcpyToSymbol(a,b,c,d,e)
135 
136  // STREAM
137 #define protoDeviceSynchronizeGPU() hipDeviceSynchronize()
138 #define protoStreamCreate(X) HC(hipStreamCreate(X))
139  #define protoStreamDestroy(X) HC(hipStreamDestroy(X))
140  #define protoStreamSynchronize(X) HC(hipStreamSynchronize(X))
141 
142  // DEVICE
143  #define protoSetDevice(X) HC(hipSetDevice(X))
144  #define protoGetDeviceProperties(X,Y) HC(hipGetDeviceProperties(X,Y))
145  #define protoDeviceReset() HC(hipDeviceReset())
146  #define protoPointerGetAttributes(X,Y) HC(hipPointerGetAttributes(X,Y))
147  #define protoGetDeviceCount(X) HC(hipGetDeviceCount(X))
148  #define protoGetDevice(X) HC(hipGetDevice(X))
149  #define protoMemGetInfo(X,Y) HC(hipMemGetInfo(X,Y))
150 
151  // EVENT
152  #define protoEventCreate(X) HC(hipEventCreate(X))
153  #define protoEventRecord(X) HC(hipEventRecord(X))
154  #define protoEventSynchronize(X) hipEventSynchronize(X)
155  #define protoEventElapsedTime(a,b,c) HC(hipEventElapsedTime(a,b,c))
156 
157  // OTHER
158  #define protoBindTexture(a,b,c,d,e) HC(hipBindTexture(a,b,c,d,e))
159  #define protoMalloc3D(a,b) HC(hipMalloc3D(&a,b))
160  #define make_protoExtent hip_cudaExtent
161 
162 #else //CUDA
163 
164  #define CC(X) GPU_CHECK(X) // CudaCheck
165 
166  // MEMORY
167 
168  #define protoMallocGPU(a,b) storeMemInfo(DEVICE,b); countMallocDevice(CC(cudaMalloc(&a,b)))
169  #define protoFreeGPU(a) CC(cudaFree(a))
170  //#define protoHostAlloc(a,b) CC(cudaMallocHost(&a,b))
171 
172 // #define protoMalloc(a,b) CC(cudaMalloc(&a,b))
173 // #define protoFree(a) CC(cudaFree(a))
174  #define protoMallocHost(a,b) countMallocDevice(CC(cudaMallocHost(&a,b)))
175 // #define protoHostAlloc(a,b) CC(cudaHostAlloc(&a,b))
176 
177 // #define protoFreeHost(PTR) CC(cudaFreeHost(PTR))
178  #define protoMallocManaged(a,b) CC(cudaMallocManaged(&a,b))
179  #define protoMemset(a,b,c) CC(cudaMemset(a,b,c))
180 
181  // COPY
182  #define protoMemcpyGPU(to,from,size,copyType) CC(cudaMemcpy(to,from,size,copyType))
183  #define protoMemcpyAsyncGPU(to,from,size,copyType,stream) CC(cudaMemcpyAsync(to,from,size,copyType, stream))
184  #define protoMemcpyFromSymbolGPU(a,b,c,d,e) cudaMemcpyFromSymbol(a,b,c,d,e)
185  #define protoMemcpyToSymbolGPU(a,b,c,d,e) cudaMemcpyToSymbol(a,b,c,d,e)
186 
187  // STREAM
188  #define protoDeviceSynchronizeGPU() cudaDeviceSynchronize()
189  #define protoStreamCreate(X) CC(cudaStreamCreate(X))
190  #define protoStreamDestroy(X) CC(cudaStreamDestroy(X))
191  #define protoStreamSynchronize(X) CC(cudaStreamSynchronize(X))
192 
193  // DEVICE
194  #define protoSetDevice(X) CC(cudaSetDevice(X))
195  #define protoGetDeviceProperties(X,Y) CC(cudaGetDeviceProperties(X,Y))
196  #define protoDeviceReset() CC(cudaDeviceReset())
197  #define protoPointerGetAttributes(X,Y) CC(cudaPointerGetAttributes(X,Y))
198  #define protoGetDevice(X) CC(cudaGetDevice(X))
199  #define protoGetDeviceCount(X) CC(cudaGetDeviceCount(X))
200  #define protoMemGetInfo(X,Y) CC(cudaMemGetInfo(X,Y))
201 
202  // EVENT
203  #define protoEventCreate(X) CC(cudaEventCreate(X))
204  #define protoEventRecord(X) CC(cudaEventRecord(X))
205  #define protoEventSynchronize(X) cudaEventSynchronize(X)
206  #define protoEventElapsedTime(a,b,c) CC(cudaEventElapsedTime(a,b,c))
207 
208  // OTHER
209  #define protoBindTexture(a,b,c,d,e) CC(cudaBindTexture(a,b,c,d,e))
210  #define protoMalloc3D(a,b) CC(cudaMalloc3D(&a,b))
211  #define make_protoExtent make_cudaExtent
212 #endif
213 
214 #include <iostream>
215 
216 // GPU_CHECK(protoGetLastError); is only used in debug mode
217 #include <typeinfo>
218 #ifndef NDEBUG
219 #ifdef superDebug
220 static void printDim(unsigned int a_in) {std::cout << a_in ;}
221 static void printDim(dim3 a_in){std::cout << "("<<a_in.x<<","<<a_in.y<<","<<a_in.z<<")";}
222 #define PRINT_KERNEL_NAME_ARGS(IN,BLOCKS,THREADS) Ker tmp_name; std::cout << " kernel name: " << typeid(tmp_name).name() << " blocks "; printDim(BLOCKS); std::cout << " number of threads " << THREADS << std::endl;
223 #else
224 #define PRINT_KERNEL_NAME_ARGS(IN,BLOCKS,THREADS)
225 #endif
226 #else
227 #define PRINT_KERNEL_NAME_ARGS(IN,BLOCKS,THREADS)
228 #endif
229 
230 #ifdef superDebug
231 #define PRINT_KER(X) std::cout << "Kernel: "<< #X << " file " << __FILE__ << " line " <<__LINE__<< std::endl; \
232  X \
233  protoDeviceSynchronizeGPU();\
234  {protoError_t error = protoPeekAtLastError(); \
235  protoDeviceSynchronizeGPU();\
236  if(error != protoSuccess) \
237  { \
238  std::cout << protoGetErrorString(error); \
239  exit(0); \
240  }\
241  else std::cout << "Success Kernel: "<< #X << std::endl;}
242 #define PRINT_KER_CUDA(X) std::cout << "Kernel: "<< #X << " file " << __FILE__ << " line " <<__LINE__<< std::endl; \
243  X; \
244  protoDeviceSynchronizeGPU();\
245  {protoError_t error = protoPeekAtLastError(); \
246  protoDeviceSynchronizeGPU();\
247  if(error != protoSuccess) \
248  { \
249  std::cout << protoGetErrorString(error); \
250  exit(0); \
251  }\
252  else std::cout << "Success Kernel: "<< #X << std::endl;}
253 #else
254 #define PRINT_KER(X) X
255 #define PRINT_KER_CUDA(X) X
256 #endif
257 
258 #ifdef PROTO_HIP
259 #define protoLaunchKernelGPU(Ker, nbBlocks, nbThreads, args...) PRINT_KER(hipLaunchKernelGGL(Ker, nbBlocks, nbThreads, 0, 0, args);)
260 #else
261 #define protoLaunchKernelGPU(Ker, nbBlocks, nbThreads, args...) PRINT_KER_CUDA( ( Ker<<<nbBlocks,nbThreads>>>(args)) )
262 #endif
263 
264 #ifdef PROTO_HIP
265 #define protoLaunchKernelMemGPU(Ker, nbBlocks, nbThreads, smem, args...) PRINT_KER(hipLaunchKernelGGL( Ker, nbBlocks, nbThreads, smem, 0, args);)
266 #else
267 
268 #define protoLaunchKernelMem(Ker, nbBlocks, nbThreads, smem, args...) PRINT_KER_CUDA(( Ker<<<nbBlocks, nbThreads,smem>>>(args)))
269 
270 #endif
271 
272 
273 
274 #ifdef PROTO_HIP
275 #define protoLaunchKernelMemAsyncGPU(Ker, nbBlocks, nbThreads, smem, stream, args...) PRINT_KER( hipLaunchKernelGGL( Ker, nbBlocks, nbThreads, smem, stream, args);)
276 #else
277 #define protoLaunchKernelMemAsyncGPU(Ker, nbBlocks, nbThreads, smem, stream, args...) PRINT_KER_CUDA((Ker<<<nbBlocks, nbThreads,smem,stream>>>(args)))
278 #endif
279 
280 #ifdef PROTO_CUDA
281 template<typename T>
282 inline bool isDeviceMemory(T* ptr)
283 {
284  protoPointerAttributes att;
285  protoPointerGetAttributes(&att, ptr);
286 #ifdef PROTO_HIP
287  if(att.memoryType == hipMemoryTypeDevice) return true; // = 2-> device allocation
288 #else
289  if(att.type == 2) return true; // = 2-> device allocation
290 #endif
291  return false;
292 }
293 
294 #endif
295 
297 //
298 
299 inline void v100tuning(int nbElems, int & nbBlocks, int &blockSize)
300 {
301  // determine the best block size and block dim
302  blockSize = 256;
303  nbBlocks = ( nbElems + blockSize - 1)/ blockSize;
304  if(nbBlocks < 80)
305  {
306  // On V100 we want at least 80 blocks;
307  nbBlocks = 80;
308  // figure out what is the blockSize for 80 blocks
309  //
310  blockSize = ( nbElems + nbBlocks - 1) / nbBlocks;
311  // as we use __syncthreads(), we want that stride modulo 32 is equal to 0
312  int coeff = blockSize / 32;
313  if(coeff > 0)
314  {
315  blockSize = coeff * 32;
316  // recompute the number of blocks > 80
317  nbBlocks = ( nbElems + blockSize - 1) / nbBlocks;
318  }
319  }
320 }
321 
322 #endif
Definition: Proto_macros.H:13