HIP: Heterogeneous-computing Interface for Portability
hip_runtime.h
1 /*
2 Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
3 
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22 
28 //#pragma once
29 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H
30 #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H
31 
32 //---
34 // The top part of this file can be compiled with any compiler
34 
35 //#include <cstring>
36 #if __cplusplus
37 #include <cmath>
38 #else
39 #include <math.h>
40 #include <string.h>
41 #include <stddef.h>
42 #endif//__cplusplus
43 
44 #if __HCC__
45 
46 // Define NVCC_COMPAT for CUDA compatibility
47 #define NVCC_COMPAT
48 #define CUDA_SUCCESS hipSuccess
49 
50 #include <hip/hip_runtime_api.h>
51 
52 
53 #if USE_PROMOTE_FREE_HCC == 1
54 #define ADDRESS_SPACE_1
55 #define ADDRESS_SPACE_3
56 #else
57 #define ADDRESS_SPACE_1 __attribute__((address_space(1)))
58 #define ADDRESS_SPACE_3 __attribute__((address_space(3)))
59 #endif
60 
61 //---
62 // Remainder of this file only compiles with HCC
63 #if defined __HCC__
64 #include <grid_launch.h>
65 //TODO-HCC-GL - change this to typedef.
66 //typedef grid_launch_parm hipLaunchParm ;
67 
68 #if GENERIC_GRID_LAUNCH == 0
69  #define hipLaunchParm grid_launch_parm
70 #else
71 namespace hip_impl
72 {
73  struct Empty_launch_parm {};
74 }
75 #define hipLaunchParm hip_impl::Empty_launch_parm
76 #endif //GENERIC_GRID_LAUNCH
77 
78 #if defined (GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20) || GENERIC_GRID_LAUNCH == 1
79 #else // HCC must support the grid_launch 2.0 structure (GRID_LAUNCH_VERSION >= 20).
80 #error (HCC must support GRID_LAUNCH_20)
81 #endif //GRID_LAUNCH_VERSION
82 
83 #endif //HCC
84 
85 #if GENERIC_GRID_LAUNCH==1 && defined __HCC__
86 #include "grid_launch_GGL.hpp"
87 #endif//GENERIC_GRID_LAUNCH
88 
89 extern int HIP_TRACE_API;
90 
91 #ifdef __cplusplus
92 //#include <hip/hcc_detail/hip_texture.h>
93 #include <hip/hcc_detail/hip_ldg.h>
94 #endif
96 #include <hip/hcc_detail/math_functions.h>
97 #include <hip/hcc_detail/device_functions.h>
98 
99 // TODO-HCC remove old definitions; hcc ~1602 supports the __HCC_ACCELERATOR__ define.
100 #if defined (__KALMAR_ACCELERATOR__) && !defined (__HCC_ACCELERATOR__)
101 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__
102 #endif
103 
104 
105 
106 
107 // TODO-HCC: dummy implementation of assert below; replace with a proper kernel exit call.
108 #if __HIP_DEVICE_COMPILE__ == 1
109  #undef assert
110  #define assert(COND) { if (COND) {} }
111 #endif
112 
113 
114 
115 // Feature tests:
116 #if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)
117 // Device compile and not host compile:
118 
119 //TODO-HCC enable __HIP_ARCH_HAS_ATOMICS__ when HCC supports these.
120  // 32-bit Atomics:
121 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
122 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
123 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
124 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
125 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
126 
127 // 64-bit Atomics:
128 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
129 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
130 
131 // Doubles
132 #define __HIP_ARCH_HAS_DOUBLES__ (1)
133 
134 //warp cross-lane operations:
135 #define __HIP_ARCH_HAS_WARP_VOTE__ (1)
136 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1)
137 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1)
138 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
139 
140 //sync
141 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
142 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
143 
144 // misc
145 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
146 #define __HIP_ARCH_HAS_3DGRID__ (1)
147 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
148 
149 #endif /* Device feature flags */
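// Illustrative sketch (not part of the original header): the __HIP_ARCH_HAS_*
// flags are plain 0/1 macros, so device code can select a path with the
// preprocessor; the function below is hypothetical.
//
//   __device__ double accumulate(double a, double b) {
//   #if __HIP_ARCH_HAS_DOUBLES__
//       return a + b;                           // native double-precision path
//   #else
//       return (double)((float)a + (float)b);   // fall back to single precision
//   #endif
//   }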
150 
151 
152 #define launch_bounds_impl0(requiredMaxThreadsPerBlock)\
153  __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
154 #define launch_bounds_impl1(\
155  requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)\
156  __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),\
157  amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
158 #define select_impl_(_1, _2, impl_, ...) impl_
159 #define __launch_bounds__(...) select_impl_(\
160  __VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
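// Illustrative sketch: __launch_bounds__ accepts one or two arguments and
// select_impl_ picks the matching implementation; on HCC both forms lower to
// the amdgpu_flat_work_group_size / amdgpu_waves_per_eu attributes above.
// The kernel names are hypothetical.
//
//   __global__ void __launch_bounds__(256)    k_max256(hipLaunchParm lp, float* x);
//   __global__ void __launch_bounds__(256, 4) k_max256_min4(hipLaunchParm lp, float* x);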
161 
162 // Detect whether we are compiling in C++ mode or in C mode
163 #if defined(__cplusplus)
164 #define __HCC_CPP__
165 #elif defined(__STDC_VERSION__)
166 #define __HCC_C__
167 #endif
168 
169 // TODO - hipify-clang - change to use the function call.
170 //#define warpSize hc::__wavesize()
171 static constexpr int warpSize = 64;
172 
173 #define clock_t long long int
174 __device__ long long int clock64();
175 __device__ clock_t clock();
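// Illustrative sketch: clock64() can bracket a device-side region to measure
// elapsed clock ticks; the variable names are hypothetical.
//
//   long long int t0 = clock64();
//   /* ... device-side work ... */
//   long long int ticks = clock64() - t0;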
176 
177 //abort
178 __device__ void abort();
179 
180 //atomicAdd()
181 __device__ int atomicAdd(int* address, int val);
182 __device__ unsigned int atomicAdd(unsigned int* address,
183  unsigned int val);
184 
185 __device__ unsigned long long int atomicAdd(unsigned long long int* address,
186  unsigned long long int val);
187 
188 __device__ float atomicAdd(float* address, float val);
189 
190 
191 //atomicSub()
192 __device__ int atomicSub(int* address, int val);
193 
194 __device__ unsigned int atomicSub(unsigned int* address,
195  unsigned int val);
196 
197 
198 //atomicExch()
199 __device__ int atomicExch(int* address, int val);
200 
201 __device__ unsigned int atomicExch(unsigned int* address,
202  unsigned int val);
203 
204 __device__ unsigned long long int atomicExch(unsigned long long int* address,
205  unsigned long long int val);
206 
207 __device__ float atomicExch(float* address, float val);
208 
209 
210 //atomicMin()
211 __device__ int atomicMin(int* address, int val);
212 __device__ unsigned int atomicMin(unsigned int* address,
213  unsigned int val);
214 __device__ unsigned long long int atomicMin(unsigned long long int* address,
215  unsigned long long int val);
216 
217 
218 //atomicMax()
219 __device__ int atomicMax(int* address, int val);
220 __device__ unsigned int atomicMax(unsigned int* address,
221  unsigned int val);
222 __device__ unsigned long long int atomicMax(unsigned long long int* address,
223  unsigned long long int val);
224 
225 
226 //atomicCAS()
227 __device__ int atomicCAS(int* address, int compare, int val);
228 __device__ unsigned int atomicCAS(unsigned int* address,
229  unsigned int compare,
230  unsigned int val);
231 __device__ unsigned long long int atomicCAS(unsigned long long int* address,
232  unsigned long long int compare,
233  unsigned long long int val);
234 
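// Illustrative sketch: atomicCAS is the usual building block for atomics the
// hardware does not provide directly, e.g. a double-precision add, assuming
// the __double_as_longlong/__longlong_as_double conversions from
// device_functions.h are available.  The helper name is hypothetical.
//
//   __device__ double atomicAddDouble(double* addr, double val) {
//       unsigned long long int* p = (unsigned long long int*)addr;
//       unsigned long long int old = *p, assumed;
//       do {
//           assumed = old;
//           old = atomicCAS(p, assumed,
//                 __double_as_longlong(val + __longlong_as_double(assumed)));
//       } while (assumed != old);
//       return __longlong_as_double(old);
//   }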
235 
236 //atomicAnd()
237 __device__ int atomicAnd(int* address, int val);
238 __device__ unsigned int atomicAnd(unsigned int* address,
239  unsigned int val);
240 __device__ unsigned long long int atomicAnd(unsigned long long int* address,
241  unsigned long long int val);
242 
243 
244 //atomicOr()
245 __device__ int atomicOr(int* address, int val);
246 __device__ unsigned int atomicOr(unsigned int* address,
247  unsigned int val);
248 __device__ unsigned long long int atomicOr(unsigned long long int* address,
249  unsigned long long int val);
250 
251 
252 //atomicXor()
253 __device__ int atomicXor(int* address, int val);
254 __device__ unsigned int atomicXor(unsigned int* address,
255  unsigned int val);
256 __device__ unsigned long long int atomicXor(unsigned long long int* address,
257  unsigned long long int val);
258 
259 //atomicInc()
260 __device__ unsigned int atomicInc(unsigned int* address,
261  unsigned int val);
262 
263 
264 //atomicDec()
265 __device__ unsigned int atomicDec(unsigned int* address,
266  unsigned int val);
267 
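// Illustrative sketch: a shared histogram bucket updated with atomicAdd; the
// helper name and the 256-bin layout are hypothetical.
//
//   __device__ void histogram_insert(unsigned int* bins, unsigned int value) {
//       atomicAdd(&bins[value % 256u], 1u);
//   }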
268 // Warp vote functions: __all, __any, __ballot
269 __device__ int __all( int input);
270 __device__ int __any( int input);
271 __device__ unsigned long long int __ballot( int input);
272 
273 #if __HIP_ARCH_GFX701__ == 0
274 
275 // warp shuffle functions
276 #ifdef __cplusplus
277 __device__ int __shfl(int input, int lane, int width=warpSize);
278 __device__ int __shfl_up(int input, unsigned int lane_delta, int width=warpSize);
279 __device__ int __shfl_down(int input, unsigned int lane_delta, int width=warpSize);
280 __device__ int __shfl_xor(int input, int lane_mask, int width=warpSize);
281 __device__ float __shfl(float input, int lane, int width=warpSize);
282 __device__ float __shfl_up(float input, unsigned int lane_delta, int width=warpSize);
283 __device__ float __shfl_down(float input, unsigned int lane_delta, int width=warpSize);
284 __device__ float __shfl_xor(float input, int lane_mask, int width=warpSize);
285 #else
286 __device__ int __shfl(int input, int lane, int width);
287 __device__ int __shfl_up(int input, unsigned int lane_delta, int width);
288 __device__ int __shfl_down(int input, unsigned int lane_delta, int width);
289 __device__ int __shfl_xor(int input, int lane_mask, int width);
290 __device__ float __shfl(float input, int lane, int width);
291 __device__ float __shfl_up(float input, unsigned int lane_delta, int width);
292 __device__ float __shfl_down(float input, unsigned int lane_delta, int width);
293 __device__ float __shfl_xor(float input, int lane_mask, int width);
294 #endif //__cplusplus
295 
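// Illustrative sketch: a full-warp sum reduction built on __shfl_down; with
// warpSize == 64 the loop performs six halving steps.  The helper name is
// hypothetical.
//
//   __device__ float warp_reduce_sum(float v) {
//       for (int offset = warpSize / 2; offset > 0; offset /= 2) {
//           v += __shfl_down(v, offset);
//       }
//       return v;
//   }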
296 __device__ unsigned __hip_ds_bpermute(int index, unsigned src);
297 __device__ float __hip_ds_bpermutef(int index, float src);
298 __device__ unsigned __hip_ds_permute(int index, unsigned src);
299 __device__ float __hip_ds_permutef(int index, float src);
300 
301 __device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern);
302 __device__ float __hip_ds_swizzlef(float src, int pattern);
303 
304 __device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl);
305 
306 #endif //__HIP_ARCH_GFX701__ == 0
307 
308 __host__ __device__ int min(int arg1, int arg2);
309 __host__ __device__ int max(int arg1, int arg2);
310 
311 __device__ void* __get_dynamicgroupbaseptr();
312 
313 
338 extern __attribute__((const)) __device__ void __hip_hc_threadfence() __asm("__llvm_fence_sc_dev");
339 extern __attribute__((const)) __device__ void __hip_hc_threadfence_block() __asm("__llvm_fence_sc_wg");
340 
341 
351 // __device__ void __threadfence_block(void);
352 __device__ static inline void __threadfence_block(void) {
353  return __hip_hc_threadfence_block();
354 }
355 
365 // __device__ void __threadfence(void) __attribute__((deprecated("Provided for compile-time compatibility, not yet functional")));
366 __device__ static inline void __threadfence(void) {
367  return __hip_hc_threadfence();
368 }
369 
379 //__device__ void __threadfence_system(void) __attribute__((deprecated("Provided with workaround configuration, see hip_kernel_language.md for details")));
380 __device__ void __threadfence_system(void) ;
381 
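// Illustrative sketch: publish data to other blocks by writing the payload,
// fencing, and only then raising a flag; the names are hypothetical.
//
//   __device__ void publish(int* payload, int* flag, int value) {
//       *payload = value;
//       __threadfence();        // make the payload visible device-wide
//       atomicExch(flag, 1);    // then set the ready flag
//   }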
382 // doxygen end Fence
388 #define hipThreadIdx_x (hc_get_workitem_id(0))
389 #define hipThreadIdx_y (hc_get_workitem_id(1))
390 #define hipThreadIdx_z (hc_get_workitem_id(2))
391 
392 #define hipBlockIdx_x (hc_get_group_id(0))
393 #define hipBlockIdx_y (hc_get_group_id(1))
394 #define hipBlockIdx_z (hc_get_group_id(2))
395 
396 #define hipBlockDim_x (hc_get_group_size(0))
397 #define hipBlockDim_y (hc_get_group_size(1))
398 #define hipBlockDim_z (hc_get_group_size(2))
399 
400 #define hipGridDim_x (hc_get_num_groups(0))
401 #define hipGridDim_y (hc_get_num_groups(1))
402 #define hipGridDim_z (hc_get_num_groups(2))
403 
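// Illustrative sketch: a 2D global index built from the coordinate macros
// above, used here in a naive transpose; kernel and argument names are
// hypothetical.
//
//   __global__ void transpose(hipLaunchParm lp, float* out, const float* in, int w, int h) {
//       int x = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
//       int y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
//       if (x < w && y < h) out[x * h + y] = in[y * w + x];
//   }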
404 extern "C" __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size);
405 extern "C" __device__ void* __hip_hc_memset(void* ptr, uint8_t val, size_t size);
406 extern "C" __device__ void* __hip_hc_malloc(size_t);
407 extern "C" __device__ void* __hip_hc_free(void *ptr);
408 
409 static inline __device__ void* malloc(size_t size)
410 {
411  return __hip_hc_malloc(size);
412 }
413 
414 static inline __device__ void* free(void *ptr)
415 {
416  return __hip_hc_free(ptr);
417 }
418 
419 static inline __device__ void* memcpy(void* dst, const void* src, size_t size)
420 {
421  return __hip_hc_memcpy(dst, src, size);
422 }
423 
424 static inline __device__ void* memset(void* ptr, int val, size_t size)
425 {
426  uint8_t val8 = static_cast <uint8_t> (val);
427  return __hip_hc_memset(ptr, val8, size);
428 }
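// Illustrative sketch: the overloads above let device code call the familiar
// libc-style helpers directly; the function and buffer names are hypothetical.
//
//   __device__ void clear_then_copy(char* dst, const char* src, size_t n) {
//       memset(dst, 0, n);
//       memcpy(dst, src, n);
//   }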
429 
430 
431 
432 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
433 
434 #define HIP_KERNEL_NAME(...) (__VA_ARGS__)
435 #define HIP_SYMBOL(X) #X
436 
437 #if defined __HCC_CPP__
438 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp, const char *kernelNameStr);
439 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm *lp, const char *kernelNameStr);
440 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm *lp, const char *kernelNameStr);
441 extern hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm *lp, const char *kernelNameStr);
442 extern void ihipPostLaunchKernel(const char *kernelName, hipStream_t stream, grid_launch_parm &lp);
443 
444 #if GENERIC_GRID_LAUNCH == 0
445 //#warning "Original hipLaunchKernel defined"
446 // Because ihipPreLaunchKernel has multiple overloads, _numBlocks3D and _blockDim3D may each be either size_t or dim3.
447 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \
448 do {\
449  grid_launch_parm lp;\
450  lp.dynamic_group_mem_bytes = _groupMemBytes; \
451  hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName)); \
452  _kernelName (lp, ##__VA_ARGS__);\
453  ihipPostLaunchKernel(#_kernelName, trueStream, lp);\
454 } while(0)
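// Illustrative sketch: with GENERIC_GRID_LAUNCH == 0 a kernel taking
// hipLaunchParm as its first parameter is launched through the macro above;
// the kernel, stream, and argument names are hypothetical.
//
//   __global__ void scale(hipLaunchParm lp, float* x, float a, size_t n);
//
//   hipLaunchKernel(scale, dim3(256), dim3(128), 0 /*groupMemBytes*/,
//                   0 /*stream*/, d_x, 2.0f, n);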
455 #endif //GENERIC_GRID_LAUNCH
456 
457 #elif defined (__HCC_C__)
458 
459 //TODO - develop C interface.
460 
461 #endif //__HCC_CPP__
462 
467 // Macro to replace extern __shared__ declarations
468 // with local variable definitions
469 #define HIP_DYNAMIC_SHARED(type, var) \
470  type* var = \
471  (type*)__get_dynamicgroupbaseptr(); \
472 
473 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
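// Illustrative sketch: inside a kernel HIP_DYNAMIC_SHARED stands in for the
// CUDA-style "extern __shared__" declaration; its size is whatever was passed
// as the dynamic group-memory argument at launch.  Names are hypothetical.
//
//   __global__ void reverse(hipLaunchParm lp, float* x, int n) {
//       HIP_DYNAMIC_SHARED(float, tile)
//       int i = hipThreadIdx_x;
//       if (i < n) tile[i] = x[i];
//       __syncthreads();
//       if (i < n) x[i] = tile[n - 1 - i];
//   }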
474 
475 
476 
481 //extern int HIP_PRINT_ENV ; ///< Print all HIP-related environment variables.
482 //extern int HIP_TRACE_API; ///< Trace HIP APIs.
483 //extern int HIP_LAUNCH_BLOCKING ; ///< Make all HIP APIs host-synchronous
484 
490 // End doxygen API:
496 #endif
497 
498 #endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H