| /* Check whether hipBlas' daxpy works with an interop object. |
| daxpy(N, DA, DX, INCX, DY, INCY) |
| calculates (for DX = DY = 1): |
| DY(1:N) = DY(1:N) + DA * DX(1:N) |
| and otherwise N array elements, taking every INCX-th or INCY-th one, repectively. |
| |
| Based on the interop example in OpenMP's example document */ |
| |
| /* Minimal check whether HIP works - by checking whether the API routines |
| seem to work. This includes a fallback if the header is not |
| available. */ |
| |
| #if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) |
| #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined" |
| #endif |
| |
| #if defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__) |
| #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined" |
| #endif |
| |
| |
| #include <assert.h> |
| #include <omp.h> |
| #include "../libgomp.c-c++-common/on_device_arch.h" |
| |
| |
| #if __has_include(<hipblas/hipblas.h>) && (__has_include(<library_types.h>) || !defined(__HIP_PLATFORM_NVIDIA__)) && !defined(USE_HIP_FALLBACK_HEADER) |
| #ifdef __HIP_PLATFORM_NVIDIA__ |
| /* There seems to be an issue with hip/library_types.h including |
| CUDA's "library_types.h". Include CUDA's one explicitly here. |
| Could possibly worked around by using -isystem vs. -I. */ |
| #include <library_types.h> |
| |
| /* For some reasons, the following symbols do not seem to get |
| mapped from HIP to CUDA, causing link errors. */ |
| #define hipblasSetStream cublasSetStream_v2 |
| #define hipblasDaxpy cublasDaxpy_v2 |
| #define hipblasCreate cublasCreate_v2 |
| #endif |
| #include <hipblas/hipblas.h> |
| |
| #elif defined(__HIP_PLATFORM_AMD__) |
| /* Add a poor man's fallback declaration. */ |
| #if !defined(USE_HIP_FALLBACK_HEADER) |
| #warning "Using fallback declaration for <hipblas/hipblas.h> for __HIP_PLATFORM_AMD__" |
| #endif |
| |
| typedef enum |
| { |
| HIPBLAS_STATUS_SUCCESS = 0 |
| |
| } hipblasStatus_t; |
| |
| typedef struct ihipStream_t* hipStream_t; |
| typedef void* hipblasHandle_t; |
| |
| hipblasStatus_t hipblasCreate (hipblasHandle_t*); |
| hipblasStatus_t hipblasSetStream (hipblasHandle_t, hipStream_t); |
| hipblasStatus_t hipblasDaxpy (hipblasHandle_t, int, const double*, const double*, int, double*, int); |
| |
| #else |
| /* Add a poor man's fallback declaration. */ |
| #if !defined(USE_HIP_FALLBACK_HEADER) |
| #warning "Using fallback declaration for <hipblas/hipblas.h> for __HIP_PLATFORM_NVIDA__" |
| #endif |
| |
| #if __has_include(<cuda.h>) && __has_include(<cudaTypedefs.h>) && __has_include(<cuda_runtime.h>) && __has_include(<cublas_v2.h>) && !defined(USE_CUDA_FALLBACK_HEADER) |
| #include <cuda.h> |
| #include <cudaTypedefs.h> |
| #include <cuda_runtime.h> |
| #include <cublas_v2.h> |
| |
| #else |
| /* Add a poor man's fallback declaration. */ |
| #if defined(USE_CUDA_FALLBACK_HEADER) |
| // no warning |
| #elif !__has_include(<cuda.h>) |
| #warning "Using GCC's cuda.h as fallback for cuda.h" |
| #elif !__has_include(<cudaTypedefs.h>) |
| #warning "Using GCC's cuda.h as fallback for cudaTypedefs.h" |
| #elif !__has_include(<cuda_runtime.h>) |
| #warning "Using GCC's cuda.h as fallback for cuda_runtime.h" |
| #else |
| #warning "Using GCC's cuda.h as fallback for cublas_v2.h" |
| #endif |
| #include "../../../include/cuda/cuda.h" |
| |
| typedef enum { |
| CUBLAS_STATUS_SUCCESS = 0, |
| } cublasStatus_t; |
| |
| typedef CUstream cudaStream_t; |
| typedef struct cublasContext* cublasHandle_t; |
| |
| #define cublasCreate cublasCreate_v2 |
| cublasStatus_t cublasCreate_v2 (cublasHandle_t *); |
| |
| #define cublasSetStream cublasSetStream_v2 |
| cublasStatus_t cublasSetStream_v2 (cublasHandle_t, cudaStream_t); |
| |
| #define cublasDaxpy cublasDaxpy_v2 |
| cublasStatus_t cublasDaxpy_v2(cublasHandle_t, int, const double*, const double*, int, double*, int); |
| #endif |
| |
| #define HIPBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS |
| #define hipblasStatus_t cublasStatus_t |
| #define hipStream_t cudaStream_t |
| #define hipblasHandle_t cublasHandle_t |
| #define hipblasCreate cublasCreate |
| #define hipblasSetStream cublasSetStream |
| #define hipblasDaxpy cublasDaxpy |
| #endif |
| |
| static int used_variant = 0; |
| |
| void |
| run_hipBlasdaxpy (int n, double da, const double *dx, int incx, double *dy, int incy, omp_interop_t obj) |
| { |
| used_variant = 1; |
| |
| omp_interop_rc_t res; |
| hipblasStatus_t stat; |
| |
| omp_intptr_t fr = omp_get_interop_int(obj, omp_ipr_fr_id, &res); |
| assert (res == omp_irc_success && fr == omp_ifr_hip); |
| |
| hipStream_t stream = (hipStream_t) omp_get_interop_ptr (obj, omp_ipr_targetsync, &res); |
| assert (res == omp_irc_success); |
| |
| hipblasHandle_t handle; |
| stat = hipblasCreate (&handle); |
| assert (stat == HIPBLAS_STATUS_SUCCESS); |
| |
| stat = hipblasSetStream (handle, stream); |
| assert (stat == HIPBLAS_STATUS_SUCCESS); |
| |
| /* 'da' can be in host or device space, 'dx' and 'dy' must be in device space. */ |
| stat = hipblasDaxpy (handle, n, &da, dx, 1, dy, 1) ; |
| assert (stat == HIPBLAS_STATUS_SUCCESS); |
| } |
| |
| #if defined(__HIP_PLATFORM_AMD__) |
| #pragma omp declare variant(run_hipBlasdaxpy) \ |
| match(construct={dispatch}, target_device={kind(nohost), arch("amdgcn")}) \ |
| adjust_args(need_device_ptr : dx, dy) \ |
| append_args(interop(targetsync, prefer_type("hip"))) |
| #elif defined(__HIP_PLATFORM_NVIDIA__) |
| #pragma omp declare variant(run_hipBlasdaxpy) \ |
| match(construct={dispatch}, target_device={kind(nohost), arch("nvptx")}) \ |
| adjust_args(need_device_ptr : dx, dy) \ |
| append_args(interop(targetsync, prefer_type("hip"))) |
| #else |
| #error "wrong platform" |
| #endif |
| |
| void |
| run_daxpy (int n, double da, const double *dx, int incx, double *dy, int incy) |
| { |
| used_variant = 2; |
| |
| if (incx == 1 && incy == 1) |
| #pragma omp simd |
| for (int i = 0; i < n; i++) |
| dy[i] += da * dx[i]; |
| else |
| { |
| int ix = 0; |
| int iy = 0; |
| for (int i = 0; i < n; i++) |
| { |
| dy[iy] += da * dx[ix]; |
| ix += incx; |
| iy += incy; |
| } |
| } |
| } |
| |
| |
| void |
| run_test (int dev) |
| { |
| constexpr int N = 1024; |
| |
| // A = {1,2,...,N} |
| // B = {-1, -2, ..., N} |
| // B' = daxpy (N, 3, A, incx=1, B, incy=1) |
| // = B + 3*A |
| // -> B' = {0, 2, 4, 6, ... } |
| |
| double A[N], B[N]; |
| double factor = 3.0; |
| for (int i = 0; i < N; i++) |
| { |
| A[i] = i; |
| B[i] = -i; |
| } |
| |
| if (dev != omp_initial_device && dev != omp_get_num_devices ()) |
| { |
| #pragma omp target enter data device(dev) map(A, B) |
| } |
| |
| used_variant = 99; |
| #pragma omp dispatch device(dev) |
| run_daxpy (N, factor, A, 1, B, 1); |
| |
| if (dev != omp_initial_device && dev != omp_get_num_devices ()) |
| { |
| #pragma omp target exit data device(dev) map(release: A) map(from: B) |
| |
| int tmp = omp_get_default_device (); |
| omp_set_default_device (dev); |
| #if defined(__HIP_PLATFORM_AMD__) |
| if (on_device_arch_gcn ()) |
| #else |
| if (on_device_arch_nvptx ()) |
| #endif |
| assert (used_variant == 1); |
| else |
| assert (used_variant == 2); |
| omp_set_default_device (tmp); |
| } |
| else |
| assert (used_variant == 2); |
| |
| for (int i = 0; i < N; i++) |
| assert (B[i] == 2*i); |
| } |
| |
| int |
| main () |
| { |
| int ndev = omp_get_num_devices (); |
| |
| for (int dev = 0; dev <= ndev; dev++) |
| run_test (dev); |
| run_test (omp_initial_device); |
| |
| return 0; |
| } |