libgomp/testsuite/libgomp.hsa.c/tiling-2.c - gcc - Git at Google

 /*

    matmul.c : Matrix Multiplication with tiling for openmp4 example

 */

 #include <stdlib.h>
 #include <math.h>

 #define BLOCK_SIZE 16
 /*
   #define BLOCK_SIZE 32
 */
 #define NSECPERSEC 1000000000L

 typedef struct {
    int width;
    int height;
    int stride;
    int hpad;
    float* elements;
 } Matrix;

 /* Correctly extract the number of nanoseconds from the two time structures */
 long int get_nanosecs( struct timespec start_time, struct timespec end_time) {
    long int nanosecs;
    if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs =
       ((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) +
       ( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ;
    else nanosecs =
       (((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) +
       ( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec );
    return nanosecs;
 }

 void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
      const float* B,const int LDB, const float beta,float* C, const int LDC) ;
 void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
      const float* B,const int LDB, const float beta,float* C, const int LDC) ;
 void  tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA,
      const float* B,const int LDB, const float beta,float* C, const int LDC) ;

 int verify(float* v_res, float* v_ref, int len) {
     int passed = 1;
     int i;
     for (i = 0; i < len; ++i) {
         if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) {
 	  __builtin_abort ();
         }
     }
     return passed;
 }


 int main(int argc, char* argv[]){

    Matrix A,B,Bt,C,Cref;
    int a1,a2,a3,i,j;
    struct timespec start_time1, end_time1;
    struct timespec start_time2, end_time2;
    long int nanosecs,total_ops;
    float gflopsTiled,gflopsCPU;

    a1 = 35;
    a2 = 28;
    a3 = 47;

    A.height = a1;
    A.width = a2;
    A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
    A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
    A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float));

    B.height = a2;
    B.width = a3;
    B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
    B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
    B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float));

    /* Bt is same as B but stored in column-major order */
    Bt.height = B.height;
    Bt.width = B.width;
    Bt.stride = B.stride;
    Bt.hpad = B.hpad;
    Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float));

    C.height = a1;
    C.width = a3;
    C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
    C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
    C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float));

    Cref.height = a1;
    Cref.width = a3;
    Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
    Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
    Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float));

    for(i = 0; i < A.hpad ; i++)
       for(j = 0; j < A.stride; j++) {
          if (( j<A.width ) && (i<A.height)) {
             A.elements[i*A.stride + j] = (i % 3);
          } else {
             A.elements[i*A.stride + j] = 0.0;
          }
       }

    /*  Initialize B and Bt */
    for(i = 0; i < B.hpad ; i++)
       for(j = 0; j < B.stride; j++) {
          if (( j<B.width ) && (i<B.height)) {
             B.elements[i*B.stride+j] = (j % 2);
             Bt.elements[j*Bt.stride+i] = B.elements[i*B.stride+j] ;
          } else {
             B.elements[i*B.stride+j] = 0.0;
             Bt.elements[j*Bt.stride+i] = 0.0;
          }
       }

    /* zero C, and Cref */
    for(i = 0; i < C.hpad; i++)
       for(j = 0; j < C.stride; j++) {
          C.elements[i*C.stride+j] = 0.0;
          Cref.elements[i*Cref.stride+j] = 0.0;
       }

    simple_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,Cref.elements,Cref.stride);
    tiled_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,C.elements,C.stride);

    verify(C.elements, Cref.elements, C.height * C.stride);
    return 0;
 }

 void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
 const float* B,const int LDB, const float beta,float* C, const int LDC) {
    /*  A,B, and C  are in row-major order */
    int c_row,c_col,inner;
    float sum;
    for (c_col  = 0 ;  c_col<N; c_col++ ) {
       for (c_row = 0 ; c_row<M; c_row++ ) {
          sum = 0.0 ;
          for (inner = 0 ; inner<K; inner++ ) {
             sum += A[c_row*LDA + inner] * B[inner*LDB + c_col] ;
          }
          C[c_row*LDC + c_col] = alpha*sum + beta*C[ c_row*LDC + c_col] ;
       }
    }
 }

 /***************************

    tiled_sgemm_tt:  Tiled matrix multiplication:

 ***************************/

 void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
    const float*B, const int LDB, const float beta, float*C, const int LDC){

 #pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N])
 #pragma omp distribute collapse(2)
    for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE) {
       for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE) {

 // We now have M/BLOCK_SIZE * N/BLOCK_SIZE teams = (M*N)/(BLOCK_SIZE*BLOCK_SIZE)
 // The grid global dimensions are M,N,1
 // The grid local dimensions are BLOCK_SIZE,BLOCK_SIZE,1

 // -------------------------------------------------------------------
 //      The rest of this code forms the HSAIL kernel with the
 //      pairs of "paralell for collapse(2)" loops repalced with a barrier.
 //      The kernel initializes these values
 //      C_row_start = get_group_id(0) * BLOCK_SIZE
 //      C_col_start = get_group_id(1) * BLOCK_SIZE
 //      row=get_local_id(0)
 //      col=get_local_id(1)
 // -------------------------------------------------------------------

 //       Each team has a local copy of these mini matrices
          float As[BLOCK_SIZE][BLOCK_SIZE];
          float Bs[BLOCK_SIZE][BLOCK_SIZE];
          float Cs[BLOCK_SIZE][BLOCK_SIZE];
          int C_row, C_col;

          /* Zero Cs for this BLOCK */
 // - - - - - - - - - - - - - - - - - - - -
 // REPLACE NEXT THREE LINES WITH A BARRIER
 #pragma omp parallel for collapse(2)
          for (int row=0 ; row < BLOCK_SIZE ; row++) {
             for (int col=0 ; col < BLOCK_SIZE ; col++) {
 // END BARRIER
 // - - - - - - - - - - - - - - - - - - - -
                Cs[row][col] = 0.0;
             }
          }

          // This kblock loop is run on the master thread of each team
          for (int kblock = 0; kblock  < K ; kblock += BLOCK_SIZE ) {

             // Copy global memory values to local memory
 // - - - - - - - - - - - - - - - - - - - -
 // REPLACE NEXT THREE LINES WITH A BARRIER
 #pragma omp parallel for collapse(2)
             for (int row=0 ; row < BLOCK_SIZE ; row++) {
                for (int col=0 ; col < BLOCK_SIZE ; col++) {
 // END BARRIER
 // - - - - - - - - - - - - - - - - - - - -
                   C_row = C_row_start + row;
                   C_col = C_col_start + col;
 		  if ((C_row < M) && (kblock + col < K))
 		    As[row][col] = A[(C_row*LDA)+ kblock + col];
 		  else
 		    As[row][col] = 0;
 		  if ((kblock + row < K) && C_col < N)
 		    Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
 		  else
 		    Bs[row][col] = 0;
                }
             }

             // Calculate Cs <- Sum(As X Bs) across all kblocks
 // - - - - - - - - - - - - - - - - - - - -
 // REPLACE NEXT THREE LINES WITH A BARRIER
 #pragma omp parallel for collapse(2)
             for (int row=0 ; row < BLOCK_SIZE ; row++) {
                for (int col=0 ; col < BLOCK_SIZE ; col++) {
 // END BARRIER
 // - - - - - - - - - - - - - - - - - - - -
                   for (int e = 0; e < BLOCK_SIZE; ++e)
                      Cs[row][col] += As[row][e] * Bs[e][col];
                 }
             }

          }  /* End for kblock .. */


          // Scale Update actual C from Cs
 // - - - - - - - - - - - - - - - - - - - -
 // REPLACE NEXT THREE LINES WITH A BARRIER
 #pragma omp parallel for collapse(2)
          for (int row=0 ; row < BLOCK_SIZE ; row++) {
             for (int col=0 ; col < BLOCK_SIZE ; col++) {
 // END BARRIER
 // - - - - - - - - - - - - - - - - - - - -
                C_row = C_row_start + row;
                C_col = C_col_start + col;
 	       if ((C_row < M) && (C_col < N)) {
 		 C[(C_row*LDC)+C_col] = alpha*Cs[row][col] + beta*C[(C_row*LDC)+C_col];
 	       }
             }
          }

 // -------------------------------------------------------------------
 // This is the end of the kernel

       }
    }

 }
	/*

	matmul.c : Matrix Multiplication with tiling for openmp4 example

	*/

	#include <stdlib.h>
	#include <math.h>

	#define BLOCK_SIZE 16
	/*
	#define BLOCK_SIZE 32
	*/
	#define NSECPERSEC 1000000000L

	typedef struct {
	int width;
	int height;
	int stride;
	int hpad;
	float* elements;
	} Matrix;

	/* Correctly extract the number of nanoseconds from the two time structures */
	long int get_nanosecs( struct timespec start_time, struct timespec end_time) {
	long int nanosecs;
	if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs =
	((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) +
	( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ;
	else nanosecs =
	(((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) +
	( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec );
	return nanosecs;
	}

	void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
	const float* B,const int LDB, const float beta,float* C, const int LDC) ;
	void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
	const float* B,const int LDB, const float beta,float* C, const int LDC) ;
	void tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA,
	const float* B,const int LDB, const float beta,float* C, const int LDC) ;

	int verify(float* v_res, float* v_ref, int len) {
	int passed = 1;
	int i;
	for (i = 0; i < len; ++i) {
	if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) {
	__builtin_abort ();
	}
	}
	return passed;
	}


	int main(int argc, char* argv[]){

	Matrix A,B,Bt,C,Cref;
	int a1,a2,a3,i,j;
	struct timespec start_time1, end_time1;
	struct timespec start_time2, end_time2;
	long int nanosecs,total_ops;
	float gflopsTiled,gflopsCPU;

	a1 = 35;
	a2 = 28;
	a3 = 47;

	A.height = a1;
	A.width = a2;
	A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
	A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
	A.elements = (float)malloc(A.stride A.hpad* sizeof(float));

	B.height = a2;
	B.width = a3;
	B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
	B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
	B.elements = (float)malloc(B.stride B.hpad * sizeof(float));

	/* Bt is same as B but stored in column-major order */
	Bt.height = B.height;
	Bt.width = B.width;
	Bt.stride = B.stride;
	Bt.hpad = B.hpad;
	Bt.elements = (float)malloc(Bt.stride Bt.hpad * sizeof(float));

	C.height = a1;
	C.width = a3;
	C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
	C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
	C.elements = (float)malloc(C.stride C.hpad * sizeof(float));

	Cref.height = a1;
	Cref.width = a3;
	Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
	Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE;
	Cref.elements = (float)malloc(Cref.stride Cref.hpad * sizeof(float));

	for(i = 0; i < A.hpad ; i++)
	for(j = 0; j < A.stride; j++) {
	if (( j<A.width ) && (i<A.height)) {
	A.elements[i*A.stride + j] = (i % 3);
	} else {
	A.elements[i*A.stride + j] = 0.0;
	}
	}

	/* Initialize B and Bt */
	for(i = 0; i < B.hpad ; i++)
	for(j = 0; j < B.stride; j++) {
	if (( j<B.width ) && (i<B.height)) {
	B.elements[i*B.stride+j] = (j % 2);
	Bt.elements[jBt.stride+i] = B.elements[iB.stride+j] ;
	} else {
	B.elements[i*B.stride+j] = 0.0;
	Bt.elements[j*Bt.stride+i] = 0.0;
	}
	}

	/* zero C, and Cref */
	for(i = 0; i < C.hpad; i++)
	for(j = 0; j < C.stride; j++) {
	C.elements[i*C.stride+j] = 0.0;
	Cref.elements[i*Cref.stride+j] = 0.0;
	}

	simple_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,Cref.elements,Cref.stride);
	tiled_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,C.elements,C.stride);

	verify(C.elements, Cref.elements, C.height * C.stride);
	return 0;
	}

	void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA,
	const float* B,const int LDB, const float beta,float* C, const int LDC) {
	/* A,B, and C are in row-major order */
	int c_row,c_col,inner;
	float sum;
	for (c_col = 0 ; c_col<N; c_col++ ) {
	for (c_row = 0 ; c_row<M; c_row++ ) {
	sum = 0.0 ;
	for (inner = 0 ; inner<K; inner++ ) {
	sum += A[c_rowLDA + inner] B[inner*LDB + c_col] ;
	}
	C[c_rowLDC + c_col] = alphasum + betaC[ c_rowLDC + c_col] ;
	}
	}
	}

	/***************************

	tiled_sgemm_tt: Tiled matrix multiplication:

	***************************/

	void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA,
	const floatB, const int LDB, const float beta, floatC, const int LDC){

	#pragma omp target teams map(to:A[MK],B[KN]) map(from:C[M*N])
	#pragma omp distribute collapse(2)
	for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE) {
	for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE) {

	// We now have M/BLOCK_SIZE * N/BLOCK_SIZE teams = (MN)/(BLOCK_SIZEBLOCK_SIZE)
	// The grid global dimensions are M,N,1
	// The grid local dimensions are BLOCK_SIZE,BLOCK_SIZE,1

	// -------------------------------------------------------------------
	// The rest of this code forms the HSAIL kernel with the
	// pairs of "paralell for collapse(2)" loops repalced with a barrier.
	// The kernel initializes these values
	// C_row_start = get_group_id(0) * BLOCK_SIZE
	// C_col_start = get_group_id(1) * BLOCK_SIZE
	// row=get_local_id(0)
	// col=get_local_id(1)
	// -------------------------------------------------------------------

	// Each team has a local copy of these mini matrices
	float As[BLOCK_SIZE][BLOCK_SIZE];
	float Bs[BLOCK_SIZE][BLOCK_SIZE];
	float Cs[BLOCK_SIZE][BLOCK_SIZE];
	int C_row, C_col;

	/* Zero Cs for this BLOCK */
	// - - - - - - - - - - - - - - - - - - - -
	// REPLACE NEXT THREE LINES WITH A BARRIER
	#pragma omp parallel for collapse(2)
	for (int row=0 ; row < BLOCK_SIZE ; row++) {
	for (int col=0 ; col < BLOCK_SIZE ; col++) {
	// END BARRIER
	// - - - - - - - - - - - - - - - - - - - -
	Cs[row][col] = 0.0;
	}
	}

	// This kblock loop is run on the master thread of each team
	for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE ) {

	// Copy global memory values to local memory
	// - - - - - - - - - - - - - - - - - - - -
	// REPLACE NEXT THREE LINES WITH A BARRIER
	#pragma omp parallel for collapse(2)
	for (int row=0 ; row < BLOCK_SIZE ; row++) {
	for (int col=0 ; col < BLOCK_SIZE ; col++) {
	// END BARRIER
	// - - - - - - - - - - - - - - - - - - - -
	C_row = C_row_start + row;
	C_col = C_col_start + col;
	if ((C_row < M) && (kblock + col < K))
	As[row][col] = A[(C_row*LDA)+ kblock + col];
	else
	As[row][col] = 0;
	if ((kblock + row < K) && C_col < N)
	Bs[row][col] = B[((kblock+row)*LDB)+ C_col];
	else
	Bs[row][col] = 0;
	}
	}

	// Calculate Cs <- Sum(As X Bs) across all kblocks
	// - - - - - - - - - - - - - - - - - - - -
	// REPLACE NEXT THREE LINES WITH A BARRIER
	#pragma omp parallel for collapse(2)
	for (int row=0 ; row < BLOCK_SIZE ; row++) {
	for (int col=0 ; col < BLOCK_SIZE ; col++) {
	// END BARRIER
	// - - - - - - - - - - - - - - - - - - - -
	for (int e = 0; e < BLOCK_SIZE; ++e)
	Cs[row][col] += As[row][e] * Bs[e][col];
	}
	}

	} /* End for kblock .. */


	// Scale Update actual C from Cs
	// - - - - - - - - - - - - - - - - - - - -
	// REPLACE NEXT THREE LINES WITH A BARRIER
	#pragma omp parallel for collapse(2)
	for (int row=0 ; row < BLOCK_SIZE ; row++) {
	for (int col=0 ; col < BLOCK_SIZE ; col++) {
	// END BARRIER
	// - - - - - - - - - - - - - - - - - - - -
	C_row = C_row_start + row;
	C_col = C_col_start + col;
	if ((C_row < M) && (C_col < N)) {
	C[(C_rowLDC)+C_col] = alphaCs[row][col] + betaC[(C_rowLDC)+C_col];
	}
	}
	}

	// -------------------------------------------------------------------
	// This is the end of the kernel

	}
	}

	}