libgfortran/m4/matmul.m4 - gcc - Git at Google

 `/* Implementation of the MATMUL intrinsic
    Copyright (C) 2002-2021 Free Software Foundation, Inc.
    Contributed by Paul Brook <paul@nowt.org>

 This file is part of the GNU Fortran runtime library (libgfortran).

 Libgfortran is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public
 License as published by the Free Software Foundation; either
 version 3 of the License, or (at your option) any later version.

 Libgfortran is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 Under Section 7 of GPL version 3, you are granted additional
 permissions described in the GCC Runtime Library Exception, version
 3.1, as published by the Free Software Foundation.

 You should have received a copy of the GNU General Public License and
 a copy of the GCC Runtime Library Exception along with this program;
 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 <http://www.gnu.org/licenses/>.  */

 #include "libgfortran.h"
 #include <string.h>
 #include <assert.h>'

 include(iparm.m4)dnl

 `#if defined (HAVE_'rtype_name`)

 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
    passed to us by the front-end, in which case we call it for large
    matrices.

    The signature is: two character-pointer arguments, three integer
    pointers, then pointers to values of the element type interleaved
    with integer pointers, and finally two ints passed by value.
    NOTE(review): the two trailing ints are presumably the hidden lengths
    of the two character arguments, as in a Fortran calling convention --
    confirm against the front-end.  */

 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                           const int *, const 'rtype_name` *, const 'rtype_name` *,
                           const int *, const 'rtype_name` *, const int *,
                           const 'rtype_name` *, 'rtype_name` *, const int *,
                           int, int);

 /* The order of loops is different in the case of plain matrix
    multiplication C=MATMUL(A,B), and in the frequent special case where
    the argument A is the temporary result of a TRANSPOSE intrinsic:
    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
    looking at their strides.

    The equivalent Fortran pseudo-code is:

    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
    IF (.NOT.IS_TRANSPOSED(A)) THEN
      C = 0
      DO J=1,N
        DO K=1,COUNT
          DO I=1,M
            C(I,J) = C(I,J)+A(I,K)*B(K,J)
    ELSE
      DO J=1,N
        DO I=1,M
          S = 0
          DO K=1,COUNT
            S = S+A(I,K)*B(K,J)
          C(I,J) = S
    ENDIF
 */

 /* If try_blas is set to a nonzero value, then the matmul function will
    see if there is a way to perform the matrix multiplication by a call
    to the BLAS gemm function.

    This is the exported entry point for one element type.  It takes
    three pointers of the template type for the result and the two
    operands, the try_blas flag, blas_limit, and the gemm function
    pointer (see blas_call).
    NOTE(review): blas_limit is presumably the size threshold used
    together with try_blas; its handling lives in matmul_internal.m4,
    which is not visible here -- confirm.  */

 extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm);
 export_proto(matmul_'rtype_code`);

 /* Put an exhaustive list of possible architectures here, ORed together.
    When none of them is defined, the plain single-function path in the
    matching else branch is used instead.  */

 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)

 #ifdef HAVE_AVX
 /* AVX variant.  matmul_name is set to the _avx suffixed name, the static
    prototype attaches the "avx" target attribute, and the shared template
    is then included right after the static keyword.
    NOTE(review): the template presumably expands to the definition of
    matmul_name; matmul_internal.m4 is not visible here -- confirm.  */
 'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
 `static void
 'matmul_name` ('rtype` * const restrict retarray,
 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
 static' include(matmul_internal.m4)dnl
 `#endif /* HAVE_AVX */

 #ifdef HAVE_AVX2
 /* AVX2 variant.  Same pattern as the other static variants, but the
    prototype requests both "avx2" and "fma" in its target attribute, so
    the dispatcher has to check both CPU features before choosing it.
    NOTE(review): the included template presumably expands to the
    definition of matmul_name -- confirm in matmul_internal.m4.  */
 'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
 `static void
 'matmul_name` ('rtype` * const restrict retarray,
 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 static' include(matmul_internal.m4)dnl
 `#endif /* HAVE_AVX2 */

 #ifdef HAVE_AVX512F
 /* AVX512F variant.  matmul_name is set to the _avx512f suffixed name and
    the static prototype attaches the "avx512f" target attribute before
    the shared template is included.
    NOTE(review): the included template presumably expands to the
    definition of matmul_name -- confirm in matmul_internal.m4.  */
 'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
 `static void
 'matmul_name` ('rtype` * const restrict retarray,
 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
 static' include(matmul_internal.m4)dnl
 `#endif  /* HAVE_AVX512F */

 /* AMD-specific functions with AVX128 and FMA3/FMA4.  Unlike the static
    variants above, these are only declared here (non-static, with
    internal_proto); no template is included for them in this file, so
    their definitions must come from elsewhere in the library.  */

 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
 `void
 'matmul_name` ('rtype` * const restrict retarray,
 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
 internal_proto('matmul_name`);
 #endif

 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
 /* FMA4 flavour of the AMD variant: declaration only, with the
    "avx,fma4" target attribute and library-internal visibility.  No
    definition is generated in this file.  */
 'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
 `void
 'matmul_name` ('rtype` * const restrict retarray,
 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
 internal_proto('matmul_name`);
 #endif

 /* Function to fall back to if there is no special processor-specific version.
    It is generated from the same template as the other static variants
    but without any target attribute, and the dispatcher starts from it
    as the default choice.  */
 'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
 `static' include(matmul_internal.m4)dnl

 `/* Compiling main function, with selection code for the processor.  */

 /* Currently, this is i386 only.  Adjust for other architectures.  */

 /* Exported MATMUL entry point for this element type.  When no
    implementation has been cached yet, it selects one for the running
    CPU, stores the choice in matmul_p, and then forwards every argument
    unchanged to the selected function.  Later calls find the cached
    pointer and skip the selection.  */
 void matmul_'rtype_code` ('rtype` * const restrict retarray,
 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
   /* Cached selection.  Being static, it starts out as NULL, which is
      what the check below treats as not chosen yet.  */
   static void (*matmul_p) ('rtype` * const restrict retarray,
 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm);

   /* Local copy of the selection used for the actual call.  */
   void (*matmul_fn) ('rtype` * const restrict retarray,
 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm);

   /* NOTE(review): relaxed ordering and no lock mean several threads may
      run the selection at the same time; each would presumably store the
      same pointer -- confirm this is intended.  */
   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   if (matmul_fn == NULL)
     {
       /* Default; kept if the vendor is neither intel nor amd, or if no
          feature check below matches.  */
       matmul_fn = matmul_'rtype_code`_vanilla;
       if (__builtin_cpu_is ("intel"))
 	{
           /* Run down the available processors in order of preference.  */
 #ifdef HAVE_AVX512F
 	  if (__builtin_cpu_supports ("avx512f"))
 	    {
 	      matmul_fn = matmul_'rtype_code`_avx512f;
 	      goto store;
 	    }

 #endif  /* HAVE_AVX512F */

 #ifdef HAVE_AVX2
 	  /* Both features are required, matching the "avx2,fma" target of
 	     the variant.  */
 	  if (__builtin_cpu_supports ("avx2")
 	      && __builtin_cpu_supports ("fma"))
 	    {
 	      matmul_fn = matmul_'rtype_code`_avx2;
 	      goto store;
 	    }

 #endif

 #ifdef HAVE_AVX
 	  if (__builtin_cpu_supports ("avx"))
  	    {
               matmul_fn = matmul_'rtype_code`_avx;
 	      goto store;
 	    }
 #endif  /* HAVE_AVX */
         }
     else if (__builtin_cpu_is ("amd"))
       {
         /* On AMD the FMA3 variant is tried before the FMA4 one.  */
 #if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
 	if (__builtin_cpu_supports ("avx")
 	    && __builtin_cpu_supports ("fma"))
 	  {
             matmul_fn = matmul_'rtype_code`_avx128_fma3;
 	    goto store;
 	  }
 #endif
 #if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
 	if (__builtin_cpu_supports ("avx")
 	    && __builtin_cpu_supports ("fma4"))
 	  {
             matmul_fn = matmul_'rtype_code`_avx128_fma4;
 	    goto store;
 	  }
 #endif

       }
    /* Every path, matched or not, ends here and publishes its choice.  */
    store:
       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
    }

    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
 }

 #else  /* Just the vanilla function.  */

 /* No CPU-specific variants are configured: the template is included
    once, directly under the exported name, with target_attribute defined
    as empty.
    NOTE(review): target_attribute is presumably consumed by
    matmul_internal.m4; it is not referenced in the visible part of this
    file -- confirm.  */
 'define(`matmul_name',`matmul_'rtype_code)dnl
 define(`target_attribute',`')dnl
 include(matmul_internal.m4)dnl
 `#endif
 #endif
 '
	`/* Implementation of the MATMUL intrinsic
	Copyright (C) 2002-2021 Free Software Foundation, Inc.
	Contributed by Paul Brook <paul@nowt.org>

	This file is part of the GNU Fortran runtime library (libgfortran).

	Libgfortran is free software; you can redistribute it and/or
	modify it under the terms of the GNU General Public
	License as published by the Free Software Foundation; either
	version 3 of the License, or (at your option) any later version.

	Libgfortran is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	Under Section 7 of GPL version 3, you are granted additional
	permissions described in the GCC Runtime Library Exception, version
	3.1, as published by the Free Software Foundation.

	You should have received a copy of the GNU General Public License and
	a copy of the GCC Runtime Library Exception along with this program;
	see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	<http://www.gnu.org/licenses/>. */

	#include "libgfortran.h"
	#include <string.h>
	#include <assert.h>'

	include(iparm.m4)dnl

	`#if defined (HAVE_'rtype_name`)

	/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
	   passed to us by the front-end, in which case we call it for large
	   matrices.

	   blas_call is a pointer-to-function type and every argument except
	   the last two is passed by reference.  The pointer declarators had
	   been stripped from this copy, which turned the typedef into a
	   function type taking characters, integers and element values by
	   value; they are restored here.
	   NOTE(review): the two trailing ints are presumably the hidden
	   lengths of the two character arguments -- confirm against the
	   front-end.  */

	typedef void (*blas_call)(const char *, const char *, const int *, const int *,
				  const int *, const 'rtype_name` *, const 'rtype_name` *,
				  const int *, const 'rtype_name` *, const int *,
				  const 'rtype_name` *, 'rtype_name` *, const int *,
				  int, int);

	/* The order of loops is different in the case of plain matrix
	multiplication C=MATMUL(A,B), and in the frequent special case where
	the argument A is the temporary result of a TRANSPOSE intrinsic:
	C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
	looking at their strides.

	The equivalent Fortran pseudo-code is:

	DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
	IF (.NOT.IS_TRANSPOSED(A)) THEN
	  C = 0
	  DO J=1,N
	    DO K=1,COUNT
	      DO I=1,M
	        C(I,J) = C(I,J)+A(I,K)*B(K,J)
	ELSE
	  DO J=1,N
	    DO I=1,M
	      S = 0
	      DO K=1,COUNT
	        S = S+A(I,K)*B(K,J)
	      C(I,J) = S
	ENDIF
	*/

	/* If try_blas is set to a nonzero value, then the matmul function will
	see if there is a way to perform the matrix multiplication by a call
	to the BLAS gemm function.

	This is the exported entry point for one element type.  It takes
	three pointers of the template type for the result and the two
	operands, the try_blas flag, blas_limit, and the gemm function
	pointer (see blas_call).
	NOTE(review): blas_limit is presumably the size threshold used
	together with try_blas; its handling lives in matmul_internal.m4,
	which is not visible here -- confirm. */

	extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm);
	export_proto(matmul_'rtype_code`);

	/* Put an exhaustive list of possible architectures here, ORed together.
	   When none of them is defined, the plain single-function path in the
	   matching else branch is used instead.  The logical-or operators had
	   been backslash-escaped in this copy, which is not valid in a
	   preprocessor condition; they are written plainly here.  */

	#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)

	#ifdef HAVE_AVX
	/* AVX variant.  matmul_name is set to the _avx suffixed name, the static
	   prototype attaches the "avx" target attribute, and the shared template
	   is then included right after the static keyword.
	   NOTE(review): the template presumably expands to the definition of
	   matmul_name; matmul_internal.m4 is not visible here -- confirm.  */
	'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
	`static void
	'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
	static' include(matmul_internal.m4)dnl
	`#endif /* HAVE_AVX */

	#ifdef HAVE_AVX2
	/* AVX2 variant.  Same pattern as the other static variants, but the
	   prototype requests both "avx2" and "fma" in its target attribute, so
	   the dispatcher has to check both CPU features before choosing it.
	   NOTE(review): the included template presumably expands to the
	   definition of matmul_name -- confirm in matmul_internal.m4.  */
	'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
	`static void
	'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
	static' include(matmul_internal.m4)dnl
	`#endif /* HAVE_AVX2 */

	#ifdef HAVE_AVX512F
	/* AVX512F variant.  matmul_name is set to the _avx512f suffixed name and
	   the static prototype attaches the "avx512f" target attribute before
	   the shared template is included.
	   NOTE(review): the included template presumably expands to the
	   definition of matmul_name -- confirm in matmul_internal.m4.  */
	'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
	`static void
	'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
	static' include(matmul_internal.m4)dnl
	`#endif /* HAVE_AVX512F */

	/* AMD-specific functions with AVX128 and FMA3/FMA4.  Unlike the static
	   variants above, these are only declared here (non-static, with
	   internal_proto); no template is included for them in this file, so
	   their definitions must come from elsewhere in the library.  */

	#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
	'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
	`void
	'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
	internal_proto('matmul_name`);
	#endif

	#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
	/* FMA4 flavour of the AMD variant: declaration only, with the
	   "avx,fma4" target attribute and library-internal visibility.  No
	   definition is generated in this file.  */
	'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
	`void
	'matmul_name` ('rtype` * const restrict retarray,
	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
	int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
	internal_proto('matmul_name`);
	#endif

	/* Function to fall back to if there is no special processor-specific version.
	   It is generated from the same template as the other static variants
	   but without any target attribute, and the dispatcher starts from it
	   as the default choice. */
	'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
	`static' include(matmul_internal.m4)dnl

	`/* Compiling main function, with selection code for the processor. */

	/* Currently, this is i386 only. Adjust for other architectures. */

	/* Exported MATMUL entry point for this element type.  When no
	   implementation has been cached yet, it selects one for the running
	   CPU, stores the choice in matmul_p, and then forwards every argument
	   unchanged to the selected function.  Later calls find the cached
	   pointer and skip the selection.

	   This copy had lost the pointer declarators on matmul_p, matmul_fn
	   and their first parameter, so they declared functions instead of
	   function pointers and applied restrict to a non-pointer.  The
	   declarators and the nesting indentation are restored here; the
	   selection logic itself is unchanged.  */
	void matmul_'rtype_code` ('rtype` * const restrict retarray,
		'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
		int blas_limit, blas_call gemm)
	{
	  /* Cached selection.  Being static, it starts out as NULL, which is
	     what the check below treats as not chosen yet.  */
	  static void (*matmul_p) ('rtype` * const restrict retarray,
		'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
		int blas_limit, blas_call gemm);

	  /* Local copy of the selection used for the actual call.  */
	  void (*matmul_fn) ('rtype` * const restrict retarray,
		'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
		int blas_limit, blas_call gemm);

	  /* NOTE(review): relaxed ordering and no lock mean several threads may
	     run the selection at the same time; each would presumably store the
	     same pointer -- confirm this is intended.  */
	  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
	  if (matmul_fn == NULL)
	    {
	      /* Default; kept if the vendor is neither intel nor amd, or if no
	         feature check below matches.  */
	      matmul_fn = matmul_'rtype_code`_vanilla;
	      if (__builtin_cpu_is ("intel"))
	        {
	          /* Run down the available processors in order of preference. */
	#ifdef HAVE_AVX512F
	          if (__builtin_cpu_supports ("avx512f"))
	            {
	              matmul_fn = matmul_'rtype_code`_avx512f;
	              goto store;
	            }

	#endif /* HAVE_AVX512F */

	#ifdef HAVE_AVX2
	          /* Both features are required, matching the "avx2,fma" target
	             of the variant.  */
	          if (__builtin_cpu_supports ("avx2")
	              && __builtin_cpu_supports ("fma"))
	            {
	              matmul_fn = matmul_'rtype_code`_avx2;
	              goto store;
	            }

	#endif

	#ifdef HAVE_AVX
	          if (__builtin_cpu_supports ("avx"))
	            {
	              matmul_fn = matmul_'rtype_code`_avx;
	              goto store;
	            }
	#endif /* HAVE_AVX */
	        }
	      else if (__builtin_cpu_is ("amd"))
	        {
	          /* On AMD the FMA3 variant is tried before the FMA4 one.  */
	#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
	          if (__builtin_cpu_supports ("avx")
	              && __builtin_cpu_supports ("fma"))
	            {
	              matmul_fn = matmul_'rtype_code`_avx128_fma3;
	              goto store;
	            }
	#endif
	#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
	          if (__builtin_cpu_supports ("avx")
	              && __builtin_cpu_supports ("fma4"))
	            {
	              matmul_fn = matmul_'rtype_code`_avx128_fma4;
	              goto store;
	            }
	#endif

	        }
	      /* Every path, matched or not, ends here and publishes its choice.  */
	    store:
	      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
	    }

	  (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
	}

	#else /* Just the vanilla function. */

	/* No CPU-specific variants are configured: the template is included
	   once, directly under the exported name, with target_attribute defined
	   as empty.
	   NOTE(review): target_attribute is presumably consumed by
	   matmul_internal.m4; it is not referenced in the visible part of this
	   file -- confirm.  */
	'define(`matmul_name',`matmul_'rtype_code)dnl
	define(`target_attribute',`')dnl
	include(matmul_internal.m4)dnl
	`#endif
	#endif
	'