libgomp/testsuite/libgomp.c/interop-hipblas.h - gcc - Git at Google

 /* Check whether hipBlas' daxpy works with an interop object.
      daxpy(N, DA, DX, INCX, DY, INCY)
    calculates (for DX = DY = 1):
      DY(1:N) =  DY(1:N) + DA * DX(1:N)
    and otherwise N array elements, taking every INCX-th or INCY-th one, repectively.

 Based on the interop example in OpenMP's example document  */

 /* Minimal check whether HIP works - by checking whether the API routines
    seem to work.  This includes a fallback if the header is not
    available.  */

 #if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
   #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined"
 #endif

 #if defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
   #error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined"
 #endif


 #include <assert.h>
 #include <omp.h>
 #include "../libgomp.c-c++-common/on_device_arch.h"


 #if __has_include(<hipblas/hipblas.h>) && (__has_include(<library_types.h>) || !defined(__HIP_PLATFORM_NVIDIA__)) && !defined(USE_HIP_FALLBACK_HEADER)
   #ifdef __HIP_PLATFORM_NVIDIA__
     /* There seems to be an issue with hip/library_types.h including
        CUDA's "library_types.h". Include CUDA's one explicitly here.
        Could possibly worked around by using -isystem vs. -I.  */
     #include <library_types.h>

     /* For some reasons, the following symbols do not seem to get
        mapped from HIP to CUDA, causing link errors.  */
     #define hipblasSetStream cublasSetStream_v2
     #define hipblasDaxpy cublasDaxpy_v2
     #define hipblasCreate cublasCreate_v2
   #endif
   #include <hipblas/hipblas.h>

 #elif defined(__HIP_PLATFORM_AMD__)
   /* Add a poor man's fallback declaration.  */
   #if !defined(USE_HIP_FALLBACK_HEADER)
     #warning "Using fallback declaration for <hipblas/hipblas.h> for __HIP_PLATFORM_AMD__"
   #endif

   typedef enum
   {
     HIPBLAS_STATUS_SUCCESS = 0

   } hipblasStatus_t;

   typedef struct ihipStream_t* hipStream_t;
   typedef void* hipblasHandle_t;

   hipblasStatus_t hipblasCreate (hipblasHandle_t*);
   hipblasStatus_t hipblasSetStream (hipblasHandle_t, hipStream_t);
   hipblasStatus_t hipblasDaxpy (hipblasHandle_t, int, const double*, const double*, int, double*, int);

 #else
   /* Add a poor man's fallback declaration.  */
   #if !defined(USE_HIP_FALLBACK_HEADER)
     #warning "Using fallback declaration for <hipblas/hipblas.h> for __HIP_PLATFORM_NVIDA__"
   #endif

   #if __has_include(<cuda.h>) && __has_include(<cudaTypedefs.h>) && __has_include(<cuda_runtime.h>) && __has_include(<cublas_v2.h>) && !defined(USE_CUDA_FALLBACK_HEADER)
     #include <cuda.h>
     #include <cudaTypedefs.h>
     #include <cuda_runtime.h>
     #include <cublas_v2.h>

   #else
     /* Add a poor man's fallback declaration.  */
     #if defined(USE_CUDA_FALLBACK_HEADER)
       // no warning
     #elif !__has_include(<cuda.h>)
       #warning "Using GCC's cuda.h as fallback for cuda.h"
     #elif !__has_include(<cudaTypedefs.h>)
       #warning "Using GCC's cuda.h as fallback for cudaTypedefs.h"
     #elif !__has_include(<cuda_runtime.h>)
       #warning "Using GCC's cuda.h as fallback for cuda_runtime.h"
     #else
       #warning "Using GCC's cuda.h as fallback for cublas_v2.h"
     #endif
     #include "../../../include/cuda/cuda.h"

     typedef enum {
       CUBLAS_STATUS_SUCCESS = 0,
     } cublasStatus_t;

     typedef CUstream cudaStream_t;
     typedef struct cublasContext* cublasHandle_t;

     #define cublasCreate cublasCreate_v2
     cublasStatus_t cublasCreate_v2 (cublasHandle_t *);

     #define cublasSetStream cublasSetStream_v2
     cublasStatus_t cublasSetStream_v2 (cublasHandle_t, cudaStream_t);

     #define cublasDaxpy cublasDaxpy_v2
     cublasStatus_t cublasDaxpy_v2(cublasHandle_t, int, const double*, const double*, int, double*, int);
   #endif

   #define HIPBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
   #define hipblasStatus_t cublasStatus_t
   #define hipStream_t cudaStream_t
   #define hipblasHandle_t cublasHandle_t
   #define hipblasCreate cublasCreate
   #define hipblasSetStream cublasSetStream
   #define hipblasDaxpy cublasDaxpy
 #endif

 static int used_variant = 0;

 void
 run_hipBlasdaxpy (int n, double da, const double *dx, int incx, double *dy, int incy, omp_interop_t obj)
 {
   used_variant = 1;

   omp_interop_rc_t res;
   hipblasStatus_t stat;

   omp_intptr_t fr = omp_get_interop_int(obj, omp_ipr_fr_id, &res);
   assert (res == omp_irc_success && fr == omp_ifr_hip);

   hipStream_t stream = (hipStream_t) omp_get_interop_ptr (obj, omp_ipr_targetsync, &res);
   assert (res == omp_irc_success);

   hipblasHandle_t handle;
   stat = hipblasCreate (&handle);
   assert (stat == HIPBLAS_STATUS_SUCCESS);

   stat = hipblasSetStream (handle, stream);
   assert (stat == HIPBLAS_STATUS_SUCCESS);

   /* 'da' can be in host or device space, 'dx' and 'dy' must be in device space.  */
   stat = hipblasDaxpy (handle, n, &da, dx, 1, dy, 1) ;
   assert (stat == HIPBLAS_STATUS_SUCCESS);
 }

 #if defined(__HIP_PLATFORM_AMD__)
 #pragma omp declare variant(run_hipBlasdaxpy) \
                        match(construct={dispatch}, target_device={kind(nohost), arch("amdgcn")}) \
                        adjust_args(need_device_ptr : dx, dy) \
                        append_args(interop(targetsync, prefer_type("hip")))
 #elif defined(__HIP_PLATFORM_NVIDIA__)
 #pragma omp declare variant(run_hipBlasdaxpy) \
                        match(construct={dispatch}, target_device={kind(nohost), arch("nvptx")}) \
                        adjust_args(need_device_ptr : dx, dy) \
                        append_args(interop(targetsync, prefer_type("hip")))
 #else
  #error "wrong platform"
 #endif

 void
 run_daxpy (int n, double da, const double *dx, int incx, double *dy, int incy)
 {
   used_variant = 2;

   if (incx == 1 && incy == 1)
     #pragma omp simd
     for (int i = 0; i < n; i++)
       dy[i] += da * dx[i];
   else
     {
       int ix = 0;
       int iy = 0;
       for (int i = 0; i < n; i++)
 	{
 	  dy[iy] += da * dx[ix];
 	  ix += incx;
 	  iy += incy;
 	}
     }
 }


 void
 run_test (int dev)
 {
   constexpr int N = 1024;

   // A = {1,2,...,N}
   // B = {-1, -2, ..., N}
   // B' = daxpy (N, 3, A, incx=1, B, incy=1)
   //    = B + 3*A
   // -> B' = {0, 2, 4, 6, ... }

   double A[N], B[N];
   double factor = 3.0;
   for (int i = 0; i < N; i++)
     {
       A[i] = i;
       B[i] = -i;
     }

   if (dev != omp_initial_device && dev != omp_get_num_devices ())
     {
       #pragma omp target enter data device(dev) map(A, B)
     }

   used_variant = 99;
   #pragma omp dispatch device(dev)
     run_daxpy (N, factor, A, 1, B, 1);

   if (dev != omp_initial_device && dev != omp_get_num_devices ())
     {
       #pragma omp target exit data device(dev) map(release: A) map(from: B)

       int tmp = omp_get_default_device ();
       omp_set_default_device (dev);
 #if defined(__HIP_PLATFORM_AMD__)
       if (on_device_arch_gcn ())
 #else
       if (on_device_arch_nvptx ())
 #endif
 	assert (used_variant == 1);
       else
 	assert (used_variant == 2);
       omp_set_default_device (tmp);
     }
   else
     assert (used_variant == 2);

   for (int i = 0; i < N; i++)
     assert (B[i] == 2*i);
 }

 int
 main ()
 {
   int ndev = omp_get_num_devices ();

   for (int dev = 0; dev <= ndev; dev++)
     run_test (dev);
   run_test (omp_initial_device);

   return 0;
 }
	/* Check whether hipBlas' daxpy works with an interop object.
	daxpy(N, DA, DX, INCX, DY, INCY)
	calculates (for DX = DY = 1):
	DY(1:N) = DY(1:N) + DA * DX(1:N)
	and otherwise N array elements, taking every INCX-th or INCY-th one, repectively.

	Based on the interop example in OpenMP's example document */

	/* Minimal check whether HIP works - by checking whether the API routines
	seem to work. This includes a fallback if the header is not
	available. */

	#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
	#error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined"
	#endif

	#if defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
	#error "Either __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__ must be defined"
	#endif


	#include <assert.h>
	#include <omp.h>
	#include "../libgomp.c-c++-common/on_device_arch.h"


	#if __has_include(<hipblas/hipblas.h>) && (__has_include(<library_types.h>) \|\| !defined(__HIP_PLATFORM_NVIDIA__)) && !defined(USE_HIP_FALLBACK_HEADER)
	#ifdef __HIP_PLATFORM_NVIDIA__
	/* There seems to be an issue with hip/library_types.h including
	CUDA's "library_types.h". Include CUDA's one explicitly here.
	Could possibly worked around by using -isystem vs. -I. */
	#include <library_types.h>

	/* For some reasons, the following symbols do not seem to get
	mapped from HIP to CUDA, causing link errors. */
	#define hipblasSetStream cublasSetStream_v2
	#define hipblasDaxpy cublasDaxpy_v2
	#define hipblasCreate cublasCreate_v2
	#endif
	#include <hipblas/hipblas.h>

	#elif defined(__HIP_PLATFORM_AMD__)
	/* Add a poor man's fallback declaration. */
	#if !defined(USE_HIP_FALLBACK_HEADER)
	#warning "Using fallback declaration for <hipblas/hipblas.h> for __HIP_PLATFORM_AMD__"
	#endif

	typedef enum
	{
	HIPBLAS_STATUS_SUCCESS = 0

	} hipblasStatus_t;

	typedef struct ihipStream_t* hipStream_t;
	typedef void* hipblasHandle_t;

	hipblasStatus_t hipblasCreate (hipblasHandle_t*);
	hipblasStatus_t hipblasSetStream (hipblasHandle_t, hipStream_t);
	hipblasStatus_t hipblasDaxpy (hipblasHandle_t, int, const double, const double, int, double*, int);

	#else
	/* Add a poor man's fallback declaration. */
	#if !defined(USE_HIP_FALLBACK_HEADER)
	#warning "Using fallback declaration for <hipblas/hipblas.h> for __HIP_PLATFORM_NVIDA__"
	#endif

	#if __has_include(<cuda.h>) && __has_include(<cudaTypedefs.h>) && __has_include(<cuda_runtime.h>) && __has_include(<cublas_v2.h>) && !defined(USE_CUDA_FALLBACK_HEADER)
	#include <cuda.h>
	#include <cudaTypedefs.h>
	#include <cuda_runtime.h>
	#include <cublas_v2.h>

	#else
	/* Add a poor man's fallback declaration. */
	#if defined(USE_CUDA_FALLBACK_HEADER)
	// no warning
	#elif !__has_include(<cuda.h>)
	#warning "Using GCC's cuda.h as fallback for cuda.h"
	#elif !__has_include(<cudaTypedefs.h>)
	#warning "Using GCC's cuda.h as fallback for cudaTypedefs.h"
	#elif !__has_include(<cuda_runtime.h>)
	#warning "Using GCC's cuda.h as fallback for cuda_runtime.h"
	#else
	#warning "Using GCC's cuda.h as fallback for cublas_v2.h"
	#endif
	#include "../../../include/cuda/cuda.h"

	typedef enum {
	CUBLAS_STATUS_SUCCESS = 0,
	} cublasStatus_t;

	typedef CUstream cudaStream_t;
	typedef struct cublasContext* cublasHandle_t;

	#define cublasCreate cublasCreate_v2
	cublasStatus_t cublasCreate_v2 (cublasHandle_t *);

	#define cublasSetStream cublasSetStream_v2
	cublasStatus_t cublasSetStream_v2 (cublasHandle_t, cudaStream_t);

	#define cublasDaxpy cublasDaxpy_v2
	cublasStatus_t cublasDaxpy_v2(cublasHandle_t, int, const double, const double, int, double*, int);
	#endif

	#define HIPBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
	#define hipblasStatus_t cublasStatus_t
	#define hipStream_t cudaStream_t
	#define hipblasHandle_t cublasHandle_t
	#define hipblasCreate cublasCreate
	#define hipblasSetStream cublasSetStream
	#define hipblasDaxpy cublasDaxpy
	#endif

	static int used_variant = 0;

	void
	run_hipBlasdaxpy (int n, double da, const double dx, int incx, double dy, int incy, omp_interop_t obj)
	{
	used_variant = 1;

	omp_interop_rc_t res;
	hipblasStatus_t stat;

	omp_intptr_t fr = omp_get_interop_int(obj, omp_ipr_fr_id, &res);
	assert (res == omp_irc_success && fr == omp_ifr_hip);

	hipStream_t stream = (hipStream_t) omp_get_interop_ptr (obj, omp_ipr_targetsync, &res);
	assert (res == omp_irc_success);

	hipblasHandle_t handle;
	stat = hipblasCreate (&handle);
	assert (stat == HIPBLAS_STATUS_SUCCESS);

	stat = hipblasSetStream (handle, stream);
	assert (stat == HIPBLAS_STATUS_SUCCESS);

	/* 'da' can be in host or device space, 'dx' and 'dy' must be in device space. */
	stat = hipblasDaxpy (handle, n, &da, dx, 1, dy, 1) ;
	assert (stat == HIPBLAS_STATUS_SUCCESS);
	}

	#if defined(__HIP_PLATFORM_AMD__)
	#pragma omp declare variant(run_hipBlasdaxpy) \
	match(construct={dispatch}, target_device={kind(nohost), arch("amdgcn")}) \
	adjust_args(need_device_ptr : dx, dy) \
	append_args(interop(targetsync, prefer_type("hip")))
	#elif defined(__HIP_PLATFORM_NVIDIA__)
	#pragma omp declare variant(run_hipBlasdaxpy) \
	match(construct={dispatch}, target_device={kind(nohost), arch("nvptx")}) \
	adjust_args(need_device_ptr : dx, dy) \
	append_args(interop(targetsync, prefer_type("hip")))
	#else
	#error "wrong platform"
	#endif

	void
	run_daxpy (int n, double da, const double dx, int incx, double dy, int incy)
	{
	used_variant = 2;

	if (incx == 1 && incy == 1)
	#pragma omp simd
	for (int i = 0; i < n; i++)
	dy[i] += da * dx[i];
	else
	{
	int ix = 0;
	int iy = 0;
	for (int i = 0; i < n; i++)
	{
	dy[iy] += da * dx[ix];
	ix += incx;
	iy += incy;
	}
	}
	}


	void
	run_test (int dev)
	{
	constexpr int N = 1024;

	// A = {1,2,...,N}
	// B = {-1, -2, ..., N}
	// B' = daxpy (N, 3, A, incx=1, B, incy=1)
	// = B + 3*A
	// -> B' = {0, 2, 4, 6, ... }

	double A[N], B[N];
	double factor = 3.0;
	for (int i = 0; i < N; i++)
	{
	A[i] = i;
	B[i] = -i;
	}

	if (dev != omp_initial_device && dev != omp_get_num_devices ())
	{
	#pragma omp target enter data device(dev) map(A, B)
	}

	used_variant = 99;
	#pragma omp dispatch device(dev)
	run_daxpy (N, factor, A, 1, B, 1);

	if (dev != omp_initial_device && dev != omp_get_num_devices ())
	{
	#pragma omp target exit data device(dev) map(release: A) map(from: B)

	int tmp = omp_get_default_device ();
	omp_set_default_device (dev);
	#if defined(__HIP_PLATFORM_AMD__)
	if (on_device_arch_gcn ())
	#else
	if (on_device_arch_nvptx ())
	#endif
	assert (used_variant == 1);
	else
	assert (used_variant == 2);
	omp_set_default_device (tmp);
	}
	else
	assert (used_variant == 2);

	for (int i = 0; i < N; i++)
	assert (B[i] == 2*i);
	}

	int
	main ()
	{
	int ndev = omp_get_num_devices ();

	for (int dev = 0; dev <= ndev; dev++)
	run_test (dev);
	run_test (omp_initial_device);

	return 0;
	}