| /* |
| |
| matmul.c : Matrix Multiplication with tiling for openmp4 example |
| |
| */ |
| |
| #include <stdlib.h> |
| #include <math.h> |
| |
| #define BLOCK_SIZE 16 |
| /* |
| #define BLOCK_SIZE 32 |
| */ |
| #define NSECPERSEC 1000000000L |
| |
| typedef struct { |
| int width; |
| int height; |
| int stride; |
| int hpad; |
| float* elements; |
| } Matrix; |
| |
| /* Correctly extract the number of nanoseconds from the two time structures */ |
| long int get_nanosecs( struct timespec start_time, struct timespec end_time) { |
| long int nanosecs; |
| if ((end_time.tv_nsec-start_time.tv_nsec)<0) nanosecs = |
| ((((long int) end_time.tv_sec- (long int) start_time.tv_sec )-1)*NSECPERSEC ) + |
| ( NSECPERSEC + (long int) end_time.tv_nsec - (long int) start_time.tv_nsec) ; |
| else nanosecs = |
| (((long int) end_time.tv_sec- (long int) start_time.tv_sec )*NSECPERSEC ) + |
| ( (long int) end_time.tv_nsec - (long int) start_time.tv_nsec ); |
| return nanosecs; |
| } |
| |
| void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA, |
| const float* B,const int LDB, const float beta,float* C, const int LDC) ; |
| void simple_sgemm_tn(const int M,const int N,const int K,const float alpha, const float* A,const int LDA, |
| const float* B,const int LDB, const float beta,float* C, const int LDC) ; |
| void tiled_sgemm_tt(const int M,const int N,const int K,const float alpha, const float*A, const int LDA, |
| const float* B,const int LDB, const float beta,float* C, const int LDC) ; |
| |
| int verify(float* v_res, float* v_ref, int len) { |
| int passed = 1; |
| int i; |
| for (i = 0; i < len; ++i) { |
| if (fabs(v_res[i] - v_ref[i]) > 0.001*v_ref[i]) { |
| __builtin_abort (); |
| } |
| } |
| return passed; |
| } |
| |
| |
| int main(int argc, char* argv[]){ |
| |
| Matrix A,B,Bt,C,Cref; |
| int a1,a2,a3,i,j; |
| struct timespec start_time1, end_time1; |
| struct timespec start_time2, end_time2; |
| long int nanosecs,total_ops; |
| float gflopsTiled,gflopsCPU; |
| |
| a1 = 35; |
| a2 = 28; |
| a3 = 47; |
| |
| A.height = a1; |
| A.width = a2; |
| A.stride = (((A.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; |
| A.hpad = (((A.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; |
| A.elements = (float*)malloc(A.stride * A.hpad* sizeof(float)); |
| |
| B.height = a2; |
| B.width = a3; |
| B.stride = (((B.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; |
| B.hpad = (((B.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; |
| B.elements = (float*)malloc(B.stride * B.hpad * sizeof(float)); |
| |
| /* Bt is same as B but stored in column-major order */ |
| Bt.height = B.height; |
| Bt.width = B.width; |
| Bt.stride = B.stride; |
| Bt.hpad = B.hpad; |
| Bt.elements = (float*)malloc(Bt.stride * Bt.hpad * sizeof(float)); |
| |
| C.height = a1; |
| C.width = a3; |
| C.stride = (((C.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; |
| C.hpad = (((C.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; |
| C.elements = (float*)malloc(C.stride * C.hpad * sizeof(float)); |
| |
| Cref.height = a1; |
| Cref.width = a3; |
| Cref.stride = (((Cref.width-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; |
| Cref.hpad = (((Cref.height-1)/BLOCK_SIZE)+1) * BLOCK_SIZE; |
| Cref.elements = (float*)malloc(Cref.stride * Cref.hpad * sizeof(float)); |
| |
| for(i = 0; i < A.hpad ; i++) |
| for(j = 0; j < A.stride; j++) { |
| if (( j<A.width ) && (i<A.height)) { |
| A.elements[i*A.stride + j] = (i % 3); |
| } else { |
| A.elements[i*A.stride + j] = 0.0; |
| } |
| } |
| |
| /* Initialize B and Bt */ |
| for(i = 0; i < B.hpad ; i++) |
| for(j = 0; j < B.stride; j++) { |
| if (( j<B.width ) && (i<B.height)) { |
| B.elements[i*B.stride+j] = (j % 2); |
| Bt.elements[j*Bt.stride+i] = B.elements[i*B.stride+j] ; |
| } else { |
| B.elements[i*B.stride+j] = 0.0; |
| Bt.elements[j*Bt.stride+i] = 0.0; |
| } |
| } |
| |
| /* zero C, and Cref */ |
| for(i = 0; i < C.hpad; i++) |
| for(j = 0; j < C.stride; j++) { |
| C.elements[i*C.stride+j] = 0.0; |
| Cref.elements[i*Cref.stride+j] = 0.0; |
| } |
| |
| simple_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,Cref.elements,Cref.stride); |
| tiled_sgemm_tt(A.height,B.width,B.height,1.0,A.elements,A.stride,B.elements,B.stride,1.0,C.elements,C.stride); |
| |
| verify(C.elements, Cref.elements, C.height * C.stride); |
| return 0; |
| } |
| |
| void simple_sgemm_tt(const int M,const int N,const int K,const float alpha, const float* A,const int LDA, |
| const float* B,const int LDB, const float beta,float* C, const int LDC) { |
| /* A,B, and C are in row-major order */ |
| int c_row,c_col,inner; |
| float sum; |
| for (c_col = 0 ; c_col<N; c_col++ ) { |
| for (c_row = 0 ; c_row<M; c_row++ ) { |
| sum = 0.0 ; |
| for (inner = 0 ; inner<K; inner++ ) { |
| sum += A[c_row*LDA + inner] * B[inner*LDB + c_col] ; |
| } |
| C[c_row*LDC + c_col] = alpha*sum + beta*C[ c_row*LDC + c_col] ; |
| } |
| } |
| } |
| |
| /*************************** |
| |
| tiled_sgemm_tt: Tiled matrix multiplication: |
| |
| ***************************/ |
| |
| void tiled_sgemm_tt(const int M, const int N, const int K, const float alpha, const float*A, const int LDA, |
| const float*B, const int LDB, const float beta, float*C, const int LDC){ |
| |
| #pragma omp target teams map(to:A[M*K],B[K*N]) map(from:C[M*N]) |
| #pragma omp distribute collapse(2) |
| for (int C_row_start=0 ; C_row_start < M ; C_row_start+=BLOCK_SIZE) |
| for (int C_col_start=0 ; C_col_start < N ; C_col_start+=BLOCK_SIZE) |
| { |
| // Each team has a local copy of these mini matrices |
| float As[BLOCK_SIZE][BLOCK_SIZE]; |
| float Bs[BLOCK_SIZE][BLOCK_SIZE]; |
| #pragma omp parallel |
| { |
| int C_row, C_col; |
| float Cval = 0.0; |
| |
| for (int kblock = 0; kblock < K ; kblock += BLOCK_SIZE ) |
| { |
| #pragma omp for collapse(2) |
| for (int row=0 ; row < BLOCK_SIZE ; row++) |
| for (int col=0 ; col < BLOCK_SIZE ; col++) |
| { |
| C_row = C_row_start + row; |
| C_col = C_col_start + col; |
| if ((C_row < M) && (kblock + col < K)) |
| As[row][col] = A[(C_row*LDA)+ kblock + col]; |
| else |
| As[row][col] = 0; |
| if ((kblock + row < K) && C_col < N) |
| Bs[row][col] = B[((kblock+row)*LDB)+ C_col]; |
| else |
| Bs[row][col] = 0; |
| } |
| |
| #pragma omp for collapse(2) |
| for (int row=0 ; row < BLOCK_SIZE ; row++) |
| for (int col=0 ; col < BLOCK_SIZE ; col++) |
| { |
| for (int e = 0; e < BLOCK_SIZE; ++e) |
| Cval += As[row][e] * Bs[e][col]; |
| } |
| } /* End for kblock .. */ |
| |
| |
| #pragma omp for collapse(2) |
| for (int row=0 ; row < BLOCK_SIZE ; row++) |
| for (int col=0 ; col < BLOCK_SIZE ; col++) |
| { |
| C_row = C_row_start + row; |
| C_col = C_col_start + col; |
| if ((C_row < M) && (C_col < N)) |
| C[(C_row*LDC)+C_col] = alpha*Cval + beta*C[(C_row*LDC)+C_col]; |
| |
| } |
| } /* end parallel */ |
| } /* end target teams distribute */ |
| } |