29 #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H 30 #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_H 48 #define CUDA_SUCCESS hipSuccess 50 #include <hip/hip_runtime_api.h> 53 #if USE_PROMOTE_FREE_HCC == 1 54 #define ADDRESS_SPACE_1 55 #define ADDRESS_SPACE_3 57 #define ADDRESS_SPACE_1 __attribute__((address_space(1))) 58 #define ADDRESS_SPACE_3 __attribute__((address_space(3))) 64 #include <grid_launch.h> 68 #if GENERIC_GRID_LAUNCH == 0 69 #define hipLaunchParm grid_launch_parm 73 struct Empty_launch_parm {};
75 #define hipLaunchParm hip_impl::Empty_launch_parm 76 #endif //GENERIC_GRID_LAUNCH 78 #if defined (GRID_LAUNCH_VERSION) and (GRID_LAUNCH_VERSION >= 20) || GENERIC_GRID_LAUNCH == 1 79 #else // Use field names for grid_launch 2.0 structure, if HCC supports GL 2.0. 80 #error (HCC must support GRID_LAUNCH_20) 81 #endif //GRID_LAUNCH_VERSION 85 #if GENERIC_GRID_LAUNCH==1 && defined __HCC__ 86 #include "grid_launch_GGL.hpp" 87 #endif//GENERIC_GRID_LAUNCH 89 extern int HIP_TRACE_API;
93 #include <hip/hcc_detail/hip_ldg.h> 96 #include <hip/hcc_detail/math_functions.h> 97 #include <hip/hcc_detail/device_functions.h> 100 #if defined (__KALMAR_ACCELERATOR__) && !defined (__HCC_ACCELERATOR__) 101 #define __HCC_ACCELERATOR__ __KALMAR_ACCELERATOR__ 108 #if __HIP_DEVICE_COMPILE__ == 1 110 #define assert(COND) { if (COND) {} } 116 #if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0) 121 #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1) 122 #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1) 123 #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1) 124 #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1) 125 #define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0) 128 #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1) 129 #define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0) 132 #define __HIP_ARCH_HAS_DOUBLES__ (1) 135 #define __HIP_ARCH_HAS_WARP_VOTE__ (1) 136 #define __HIP_ARCH_HAS_WARP_BALLOT__ (1) 137 #define __HIP_ARCH_HAS_WARP_SHUFFLE__ (1) 138 #define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0) 141 #define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0) 142 #define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0) 145 #define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0) 146 #define __HIP_ARCH_HAS_3DGRID__ (1) 147 #define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0) 152 #define launch_bounds_impl0(requiredMaxThreadsPerBlock)\ 153 __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock))) 154 #define launch_bounds_impl1(\ 155 requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)\ 156 __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),\ 157 amdgpu_waves_per_eu(minBlocksPerMultiprocessor))) 158 #define select_impl_(_1, _2, impl_, ...) impl_ 159 #define __launch_bounds__(...) select_impl_(\ 160 __VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__) 163 #if defined(__cplusplus) 165 #elif defined(__STDC_VERSION__) 171 static constexpr
int warpSize = 64;
173 #define clock_t long long int 174 __device__
long long int clock64();
175 __device__ clock_t clock();
178 __device__
void abort();
181 __device__
int atomicAdd(
int* address,
int val);
182 __device__
unsigned int atomicAdd(
unsigned int* address,
185 __device__
unsigned long long int atomicAdd(
unsigned long long int* address,
186 unsigned long long int val);
188 __device__
float atomicAdd(
float* address,
float val);
192 __device__
int atomicSub(
int* address,
int val);
194 __device__
unsigned int atomicSub(
unsigned int* address,
199 __device__
int atomicExch(
int* address,
int val);
201 __device__
unsigned int atomicExch(
unsigned int* address,
204 __device__
unsigned long long int atomicExch(
unsigned long long int* address,
205 unsigned long long int val);
207 __device__
float atomicExch(
float* address,
float val);
211 __device__
int atomicMin(
int* address,
int val);
212 __device__
unsigned int atomicMin(
unsigned int* address,
214 __device__
unsigned long long int atomicMin(
unsigned long long int* address,
215 unsigned long long int val);
219 __device__
int atomicMax(
int* address,
int val);
220 __device__
unsigned int atomicMax(
unsigned int* address,
222 __device__
unsigned long long int atomicMax(
unsigned long long int* address,
223 unsigned long long int val);
227 __device__
int atomicCAS(
int* address,
int compare,
int val);
228 __device__
unsigned int atomicCAS(
unsigned int* address,
229 unsigned int compare,
231 __device__
unsigned long long int atomicCAS(
unsigned long long int* address,
232 unsigned long long int compare,
233 unsigned long long int val);
237 __device__
int atomicAnd(
int* address,
int val);
238 __device__
unsigned int atomicAnd(
unsigned int* address,
240 __device__
unsigned long long int atomicAnd(
unsigned long long int* address,
241 unsigned long long int val);
245 __device__
int atomicOr(
int* address,
int val);
246 __device__
unsigned int atomicOr(
unsigned int* address,
248 __device__
unsigned long long int atomicOr(
unsigned long long int* address,
249 unsigned long long int val);
253 __device__
int atomicXor(
int* address,
int val);
254 __device__
unsigned int atomicXor(
unsigned int* address,
256 __device__
unsigned long long int atomicXor(
unsigned long long int* address,
257 unsigned long long int val);
260 __device__
unsigned int atomicInc(
unsigned int* address,
265 __device__
unsigned int atomicDec(
unsigned int* address,
269 __device__
int __all(
int input);
270 __device__
int __any(
int input);
271 __device__
unsigned long long int __ballot(
int input);
273 #if __HIP_ARCH_GFX701__ == 0 277 __device__
int __shfl(
int input,
int lane,
int width=warpSize);
278 __device__
int __shfl_up(
int input,
unsigned int lane_delta,
int width=warpSize);
279 __device__
int __shfl_down(
int input,
unsigned int lane_delta,
int width=warpSize);
280 __device__
int __shfl_xor(
int input,
int lane_mask,
int width=warpSize);
281 __device__
float __shfl(
float input,
int lane,
int width=warpSize);
282 __device__
float __shfl_up(
float input,
unsigned int lane_delta,
int width=warpSize);
283 __device__
float __shfl_down(
float input,
unsigned int lane_delta,
int width=warpSize);
284 __device__
float __shfl_xor(
float input,
int lane_mask,
int width=warpSize);
286 __device__
int __shfl(
int input,
int lane,
int width);
287 __device__
int __shfl_up(
int input,
unsigned int lane_delta,
int width);
288 __device__
int __shfl_down(
int input,
unsigned int lane_delta,
int width);
289 __device__
int __shfl_xor(
int input,
int lane_mask,
int width);
290 __device__
float __shfl(
float input,
int lane,
int width);
291 __device__
float __shfl_up(
float input,
unsigned int lane_delta,
int width);
292 __device__
float __shfl_down(
float input,
unsigned int lane_delta,
int width);
293 __device__
float __shfl_xor(
float input,
int lane_mask,
int width);
296 __device__
unsigned __hip_ds_bpermute(
int index,
unsigned src);
297 __device__
float __hip_ds_bpermutef(
int index,
float src);
298 __device__
unsigned __hip_ds_permute(
int index,
unsigned src);
299 __device__
float __hip_ds_permutef(
int index,
float src);
301 __device__
unsigned __hip_ds_swizzle(
unsigned int src,
int pattern);
302 __device__
float __hip_ds_swizzlef(
float src,
int pattern);
304 __device__
int __hip_move_dpp(
int src,
int dpp_ctrl,
int row_mask,
int bank_mask,
bool bound_ctrl);
306 #endif //__HIP_ARCH_GFX803__ == 1 308 __host__ __device__
int min(
int arg1,
int arg2);
309 __host__ __device__
int max(
int arg1,
int arg2);
311 __device__
void* __get_dynamicgroupbaseptr();
338 extern __attribute__((
const)) __device__
void __hip_hc_threadfence() __asm("__llvm_fence_sc_dev");
339 extern __attribute__((const)) __device__
void __hip_hc_threadfence_block() __asm("__llvm_fence_sc_wg");
352 __device__ static inline
void __threadfence_block(
void) {
353 return __hip_hc_threadfence_block();
366 __device__
static inline void __threadfence(
void) {
367 return __hip_hc_threadfence();
388 #define hipThreadIdx_x (hc_get_workitem_id(0)) 389 #define hipThreadIdx_y (hc_get_workitem_id(1)) 390 #define hipThreadIdx_z (hc_get_workitem_id(2)) 392 #define hipBlockIdx_x (hc_get_group_id(0)) 393 #define hipBlockIdx_y (hc_get_group_id(1)) 394 #define hipBlockIdx_z (hc_get_group_id(2)) 396 #define hipBlockDim_x (hc_get_group_size(0)) 397 #define hipBlockDim_y (hc_get_group_size(1)) 398 #define hipBlockDim_z (hc_get_group_size(2)) 400 #define hipGridDim_x (hc_get_num_groups(0)) 401 #define hipGridDim_y (hc_get_num_groups(1)) 402 #define hipGridDim_z (hc_get_num_groups(2)) 404 extern "C" __device__
void* __hip_hc_memcpy(
void* dst,
const void* src,
size_t size);
405 extern "C" __device__
void* __hip_hc_memset(
void* ptr, uint8_t val,
size_t size);
406 extern "C" __device__
void* __hip_hc_malloc(
size_t);
407 extern "C" __device__
void* __hip_hc_free(
void *ptr);
409 static inline __device__
void* malloc(
size_t size)
411 return __hip_hc_malloc(size);
414 static inline __device__
void* free(
void *ptr)
416 return __hip_hc_free(ptr);
419 static inline __device__
void* memcpy(
void* dst,
const void* src,
size_t size)
421 return __hip_hc_memcpy(dst, src, size);
424 static inline __device__
void* memset(
void* ptr,
int val,
size_t size)
426 uint8_t val8 = static_cast <uint8_t> (val);
427 return __hip_hc_memset(ptr, val8, size);
432 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE) 434 #define HIP_KERNEL_NAME(...) (__VA_ARGS__) 435 #define HIP_SYMBOL(X) #X 437 #if defined __HCC_CPP__ 439 extern hipStream_t ihipPreLaunchKernel(
hipStream_t stream,
dim3 grid,
size_t block, grid_launch_parm *lp,
const char *kernelNameStr);
440 extern hipStream_t ihipPreLaunchKernel(
hipStream_t stream,
size_t grid,
dim3 block, grid_launch_parm *lp,
const char *kernelNameStr);
441 extern hipStream_t ihipPreLaunchKernel(
hipStream_t stream,
size_t grid,
size_t block, grid_launch_parm *lp,
const char *kernelNameStr);
442 extern void ihipPostLaunchKernel(
const char *kernelName,
hipStream_t stream, grid_launch_parm &lp);
444 #if GENERIC_GRID_LAUNCH == 0 447 #define hipLaunchKernel(_kernelName, _numBlocks3D, _blockDim3D, _groupMemBytes, _stream, ...) \ 449 grid_launch_parm lp;\ 450 lp.dynamic_group_mem_bytes = _groupMemBytes; \ 451 hipStream_t trueStream = (ihipPreLaunchKernel(_stream, _numBlocks3D, _blockDim3D, &lp, #_kernelName)); \ 452 _kernelName (lp, ##__VA_ARGS__);\ 453 ihipPostLaunchKernel(#_kernelName, trueStream, lp);\ 455 #endif //GENERIC_GRID_LAUNCH 457 #elif defined (__HCC_C__) 469 #define HIP_DYNAMIC_SHARED(type, var) \ 471 (type*)__get_dynamicgroupbaseptr(); \ 473 #define HIP_DYNAMIC_SHARED_ATTRIBUTE 498 #endif//HIP_HCC_DETAIL_RUNTIME_H
// --- Extraction residue below: doxygen cross-reference tooltips that were
// --- captured with the page; retained as comments for provenance only.
// Definition: hip_runtime_api.h:193
// #define __host__ — Definition: host_defines.h:41
// __device__ void __threadfence_system(void)
//   threadfence_system makes writes to pinned system memory visible on host CPU.
//   Definition: device_util.cpp:1266
// Definition: grid_launch.cpp:33
// Definition: hip_hcc_internal.h:491