Proto  3.2
Proto_GPU.H
Go to the documentation of this file.
1 /// In general, all calls to cuda have been redefined as macros in order to easily make a switch with hip at compile time.
2 /// These macros are defined into \texttt{Proto\_gpu.H}.
3 /// To use the HIP, you need to include the following flags: \texttt{-DPROTO\_CUDA} and \texttt{-DPROTO\_HIP}.
4 /// Most of cudaNAME functions are renamed protoNAME such as:
5 ///
6 /// \begin{lstlisting}[language=C++,caption={Macro Define}]
7 /// #if defined PROTO_HIP
8 /// #define protoMalloc(...) hipMalloc(...) // HIP
9 /// #else protoMalloc(...) cudaMalloc(...) // CUDA
10 /// #endif
11 /// \end{lstlisting}
12 ///
13 /// In the following sections, you will find the renamed cuda functions and data types.
14 /// Functions that aren't real cuda Function such as cudaApply or structures such as cudaUglyStruct keep their names.
15 /// More than 95$\%$ of the changes have been made in the Proto.
16 
17 
18 #pragma once
19 
20 #if defined PROTO_CUDA
21 #include "cuda.h"
22 #include <cuda_runtime.h>
23 #elif defined PROTO_HIP
24 #include "hip/hip_runtime.h"
25 #include "hip/hip_runtime_api.h"
26 #endif
27 #include "iostream" // Use for CHECK(X)
28 // #include "Proto_MemInfo.H"
29 
30 #define protoGetCurrentStream DisjointBoxLayout::getCurrentStream()
31 
32 /// Data Types / classes
33 #if defined PROTO_HIP
34  #define protoStream_t hipStream_t
35  #define protoMemcpyDeviceToDevice hipMemcpyDeviceToDevice
36  #define protoMemcpyHostToDevice hipMemcpyHostToDevice
37  #define protoMemcpyDeviceToHost hipMemcpyDeviceToHost
38  #define protoError hipError_t //legacy
39  #define protoError_t hipError_t
40  #define protoSuccess hipSuccess
41  #define protoDeviceProp hipDeviceProp_t
42  #define protoPointerAttributes hipPointerAttribute_t
43  #define protoPitchedPtr hipPitchedPtr
44  #define protoArray hipArray
45  #define protoExtent hipExtent
46  #define protoChannelFormatDesc hipChannelFormatDesc
47  #define protoReadModeElementType hipReadModeElementType
48  #define protoEvent_t hipEvent_t
49  #define protoGetLastError() hipGetLastError()
50  #define protoPeekAtLastError() hipPeekAtLastError()
51  #define protoGetErrorString(X) hipGetErrorString(X)
52  #define protoThreadSynchronize() hipDeviceSynchronize()
53 
54 
55 #elif defined PROTO_CUDA
56  #define protoStream_t cudaStream_t
57  #define protoMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
58  #define protoMemcpyHostToDevice cudaMemcpyHostToDevice
59  #define protoMemcpyDeviceToHost cudaMemcpyDeviceToHost
60  #define protoError cudaError // legacy
61  #define protoError_t cudaError
62  #define protoSuccess cudaSuccess
63  #define protoDeviceProp cudaDeviceProp
64  #define protoPointerAttributes cudaPointerAttributes
65  #define protoPitchedPtr cudaPitchedPtr
66  #define protoArray cudaArray
67  #define protoExtent cudaExtent
68  #define protoChannelFormatDesc cudaChannelFormatDesc
69  #define protoReadModeElementType cudaReadModeElementType
70  #define protoEvent_t cudaEvent_t
71  #define protoGetLastError() cudaGetLastError()
72  #define protoPeekAtLastError() cudaPeekAtLastError()
73  #define protoGetErrorString(X) cudaGetErrorString(X)
74  #define protoThreadSynchronize() cudaThreadSynchronize()
75 
76 #endif
77 
78 #ifndef NDEBUG
79 
80  #ifndef superDebug
81  #define GPU_CHECK(in) \
82  do \
83  { \
84  protoError_t error = in; \
85  if(error != protoSuccess) \
86  { \
87  std::cout << protoGetErrorString(error); \
88  exit(0); \
89  }\
90  } while(0)
91  #else
92  #define GPU_CHECK(in) \
93  do \
94  { \
95  std::cout << "Try " << #in << " file: "<< __FILE__ << " line: " << __LINE__ << std::endl; \
96  protoError_t error = in; \
97  protoDeviceSynchronizeGPU();\
98  if(error != protoSuccess) \
99  { \
100  std::cout << protoGetErrorString(error); \
101  exit(0); \
102  }\
103  else std::cout << "Success " << #in << std::endl; \
104  } while(0)
105  #endif
106 
107 #else
108  #define GPU_CHECK(condition) condition
109 #endif
110 
111 
112 
113 /// Functions
114 #if defined PROTO_HIP // HIP
115 
116  #define HC(X) GPU_CHECK(X)
117 
118  // MEMORY
119  #define protoMallocGPU(PTR,NBYTES) storeMemInfo(DEVICE,NBYTES); countMallocDevice(HC(hipMalloc(&PTR,NBYTES)))
120  #define protoFreeGPU(PTR) HC(hipFree(PTR))
121  #define protoMallocHost(a,b) countMallocDevice(HC(hipHostMalloc(&a,b)))
122  //#define protoHostAlloc(PTR,NBYTES) HC(hipHostMalloc(&PTR,NBYTES))
123  //#define protoFreeHost(PTR) HC(hipHostFree(PTR))
124  //#define protoHostFree(PTR) HC(hipFreeHost(PTR))
125  #define protoMallocManaged(a,b) HC(hipMallocManaged(&a,b))
126  #define protoMemset(a,b,c) HC(hipMemset(a,b,c))
127 
128  // COPY
129  #define protoMemcpyGPU(to,from,size,copyType) HC(hipMemcpy(to,from,size,copyType))
130  #define protoMemcpyAsyncGPU(to,from,size,copyType,stream) HC(hipMemcpyAsync(to,from,size,copyType, stream))
131  #define protoMemcpyFromSymbolGPU(a,b,c,d,e) hipMemcpyFromSymbol(a,b,c,d,e) // not used anymore
132  #define protoMemcpyToSymbolGPU(a,b,c,d,e) hipMemcpyToSymbol(a,b,c,d,e)
133 
134  // STREAM
135  #define protoDeviceSynchronizeGPU() hipDeviceSynchronize()
136  #define protoStreamCreate(X) HC(hipStreamCreate(X))
137  #define protoStreamDestroy(X) HC(hipStreamDestroy(X))
138  #define protoStreamSynchronize(X) HC(hipStreamSynchronize(X))
139 
140  // DEVICE
141  #define protoSetDevice(X) HC(hipSetDevice(X))
142  #define protoGetDeviceProperties(X,Y) HC(hipGetDeviceProperties(X,Y))
143  #define protoDeviceReset() HC(hipDeviceReset())
144  #define protoPointerGetAttributes(X,Y) HC(hipPointerGetAttributes(X,Y))
145  #define protoGetDeviceCount(X) HC(hipGetDeviceCount(X))
146  #define protoGetDevice(X) HC(hipGetDevice(X))
147  #define protoMemGetInfo(X,Y) HC(hipMemGetInfo(X,Y))
148 
149  // EVENT
150  #define protoEventCreate(X) HC(hipEventCreate(X))
151  #define protoEventRecord(X) HC(hipEventRecord(X))
152  #define protoEventSynchronize(X) hipEventSynchronize(X)
153  #define protoEventElapsedTime(a,b,c) HC(hipEventElapsedTime(a,b,c))
154 
155  // OTHER
156  #define protoBindTexture(a,b,c,d,e) HC(hipBindTexture(a,b,c,d,e))
157  #define protoMalloc3D(a,b) HC(hipMalloc3D(&a,b))
158  #define make_protoExtent hip_cudaExtent
159 
160 #elif defined PROTO_CUDA
161 
162  #define CC(X) GPU_CHECK(X) // CudaCheck
163 
164  // MEMORY
165 
166  #define protoMallocGPU(a,b) storeMemInfo(DEVICE,b); countMallocDevice(CC(cudaMalloc(&a,b)))
167  #define protoFreeGPU(a) CC(cudaFree(a))
168  //#define protoHostAlloc(a,b) CC(cudaMallocHost(&a,b))
169 
170 // #define protoMalloc(a,b) CC(cudaMalloc(&a,b))
171 // #define protoFree(a) CC(cudaFree(a))
172  #define protoMallocHost(a,b) countMallocDevice(CC(cudaMallocHost(&a,b)))
173 // #define protoHostAlloc(a,b) CC(cudaHostAlloc(&a,b))
174 
175 // #define protoFreeHost(PTR) CC(cudaFreeHost(PTR))
176  #define protoMallocManaged(a,b) CC(cudaMallocManaged(&a,b))
177  #define protoMemset(a,b,c) CC(cudaMemset(a,b,c))
178 
179  // COPY
180  #define protoMemcpyGPU(to,from,size,copyType) CC(cudaMemcpy(to,from,size,copyType))
181  #define protoMemcpyAsyncGPU(to,from,size,copyType,stream) CC(cudaMemcpyAsync(to,from,size,copyType, stream))
182  #define protoMemcpyFromSymbolGPU(a,b,c,d,e) cudaMemcpyFromSymbol(a,b,c,d,e)
183  #define protoMemcpyToSymbolGPU(a,b,c,d,e) cudaMemcpyToSymbol(a,b,c,d,e)
184 
185  // STREAM
186  #define protoDeviceSynchronizeGPU() cudaDeviceSynchronize()
187  #define protoStreamCreate(X) CC(cudaStreamCreate(X))
188  #define protoStreamDestroy(X) CC(cudaStreamDestroy(X))
189  #define protoStreamSynchronize(X) CC(cudaStreamSynchronize(X))
190 
191  // DEVICE
192  #define protoSetDevice(X) CC(cudaSetDevice(X))
193  #define protoGetDeviceProperties(X,Y) CC(cudaGetDeviceProperties(X,Y))
194  #define protoDeviceReset() CC(cudaDeviceReset())
195  #define protoPointerGetAttributes(X,Y) CC(cudaPointerGetAttributes(X,Y))
196  #define protoGetDevice(X) CC(cudaGetDevice(X))
197  #define protoGetDeviceCount(X) CC(cudaGetDeviceCount(X))
198  #define protoMemGetInfo(X,Y) CC(cudaMemGetInfo(X,Y))
199 
200  // EVENT
201  #define protoEventCreate(X) CC(cudaEventCreate(X))
202  #define protoEventRecord(X) CC(cudaEventRecord(X))
203  #define protoEventSynchronize(X) cudaEventSynchronize(X)
204  #define protoEventElapsedTime(a,b,c) CC(cudaEventElapsedTime(a,b,c))
205 
206  // OTHER
207  #define protoBindTexture(a,b,c,d,e) CC(cudaBindTexture(a,b,c,d,e))
208  #define protoMalloc3D(a,b) CC(cudaMalloc3D(&a,b))
209  #define make_protoExtent make_cudaExtent
210 #endif
211 
212 // GPU_CHECK(protoGetLastError); is only used in debug mode
213 #include <typeinfo>
214 #ifndef NDEBUG
215 #ifdef superDebug
216 static void printDim(unsigned int a_in) {std::cout << a_in ;}
217 static void printDim(dim3 a_in){std::cout << "("<<a_in.x<<","<<a_in.y<<","<<a_in.z<<")";}
218 #define PRINT_KERNEL_NAME_ARGS(IN,BLOCKS,THREADS) Ker tmp_name; std::cout << " kernel name: " << typeid(tmp_name).name() << " blocks "; printDim(BLOCKS); std::cout << " number of threads " << THREADS << std::endl;
219 #else
220 #define PRINT_KERNEL_NAME_ARGS(IN,BLOCKS,THREADS)
221 #endif
222 #else
223 #define PRINT_KERNEL_NAME_ARGS(IN,BLOCKS,THREADS)
224 #endif
225 
226 #ifdef superDebug
227 #define PRINT_KER(X) std::cout << "Kernel: "<< #X << " file " << __FILE__ << " line " <<__LINE__<< std::endl; \
228  X \
229  protoDeviceSynchronizeGPU();\
230  {protoError_t error = protoPeekAtLastError(); \
231  protoDeviceSynchronizeGPU();\
232  if(error != protoSuccess) \
233  { \
234  std::cout << protoGetErrorString(error); \
235  exit(0); \
236  }\
237  else std::cout << "Success Kernel: "<< #X << std::endl;}
238 #else
239 #define PRINT_KER(X) X
240 #endif
241 
242 #if defined PROTO_HIP
243 #define protoLaunchKernelGPU(Ker, nbBlocks, nbThreads, args...) \
244  PRINT_KER(hipLaunchKernelGGL(Ker, nbBlocks, nbThreads, 0, 0, args);)
245 #define protoLaunchKernelMemGPU(Ker, nbBlocks, nbThreads, smem, args...) \
246  PRINT_KER(hipLaunchKernelGGL( Ker, nbBlocks, nbThreads, smem, 0, args);)
247 #define protoLaunchKernelAsyncGPU(Ker, nbBlocks, nbThreads, stream, args...) \
248  PRINT_KER( hipLaunchKernelGGL( Ker, nbBlocks, nbThreads, 0, stream, args);)
249 #define protoLaunchKernelMemAsyncGPU(Ker, nbBlocks, nbThreads, smem, stream, args...) \
250  PRINT_KER( hipLaunchKernelGGL( Ker, nbBlocks, nbThreads, smem, stream, args);)
251 #elif defined PROTO_CUDA
252 #define protoLaunchKernelGPU(Ker, nbBlocks, nbThreads, args...) \
253  PRINT_KER( ( Ker<<<nbBlocks,nbThreads>>>(args)) )
254 #define protoLaunchKernelMemGPU(Ker, nbBlocks, nbThreads, smem, args...) \
255  PRINT_KER(( Ker<<<nbBlocks, nbThreads,smem>>>(args)))
256 #define protoLaunchKernelAsyncGPU(Ker, nbBlocks, nbThreads, stream, args...) \
257  PRINT_KER((Ker<<<nbBlocks, nbThreads,0,stream>>>(args)))
258 #define protoLaunchKernelMemAsyncGPU(Ker, nbBlocks, nbThreads, smem, stream, args...) \
259  PRINT_KER((Ker<<<nbBlocks, nbThreads,smem,stream>>>(args)))
260 #endif
261 
262 
263 template<typename T>
264 inline bool isDeviceMemory(T* ptr)
265 {
266 
267 #if defined PROTO_HIP
268  protoPointerAttributes att;
269  protoPointerGetAttributes(&att, ptr);
270  if(att.memoryType == hipMemoryTypeDevice) return true; // = 2-> device allocation
271 #elif defined PROTO_CUDA
272  protoPointerAttributes att;
273  protoPointerGetAttributes(&att, ptr);
274  if(att.type == 2) return true; // = 2-> device allocation
275 #endif
276  return false;
277 }
278 
279 //// tuning
280 //
281 
282 inline void v100tuning(int nbElems, int & nbBlocks, int &blockSize)
283 {
284  // determine the best block size and block dim
285  blockSize = 256;
286  nbBlocks = ( nbElems + blockSize - 1)/ blockSize;
287  if(nbBlocks < 80)
288  {
289  // On V100 we want at least 80 blocks;
290  nbBlocks = 80;
291  // figure out what is the blockSize for 80 blocks
292  //
293  blockSize = ( nbElems + nbBlocks - 1) / nbBlocks;
294  // as we use __syncthreads(), we want that stride modulo 32 is equal to 0
295  int coeff = blockSize / 32;
296  if(coeff > 0)
297  {
298  blockSize = coeff * 32;
299  // recompute the number of blocks > 80
300  nbBlocks = ( nbElems + blockSize - 1) / nbBlocks;
301  }
302  }
303 }
size_t y
Definition: Proto_Macros.H:12
size_t z
Definition: Proto_Macros.H:13
size_t x
Definition: Proto_Macros.H:11
Definition: Proto_Macros.H:9
void v100tuning(int nbElems, int &nbBlocks, int &blockSize)
Definition: Proto_GPU.H:282
bool isDeviceMemory(T *ptr)
Definition: Proto_GPU.H:264