diff --git a/step3/dgemm/bl_dgemm_ref.c b/step3/dgemm/bl_dgemm_ref.c index 86bc009..8e76fe6 100644 --- a/step3/dgemm/bl_dgemm_ref.c +++ b/step3/dgemm/bl_dgemm_ref.c @@ -1,94 +1,57 @@ -/* - * -------------------------------------------------------------------------- - * BLISLAB - * -------------------------------------------------------------------------- - * Copyright (C) 2016, The University of Texas at Austin - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - Neither the name of The University of Texas nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * bl_dgemm_ref.c - * - * - * Purpose: - * implement reference mkl using GEMM (optional) in C. - * - * Todo: - * - * - * Modification: - * - * - * */ - #include #ifdef USE_BLAS -/* +/* * dgemm prototype * - */ -//void dgemm(char*, char*, int*, int*, int*, double*, double*, + */ +// void dgemm(char*, char*, int*, int*, int*, double*, double*, // int*, double*, int*, double*, double*, int*); -extern void dgemm_(char*, char*, int*, int*, int*, double*, double*, - int*, double*, int*, double*, double*, int*); +extern void dgemm_( + char*, + char*, + int*, + int*, + int*, + double*, + double*, + int*, + double*, + int*, + double*, + double*, + int*); #endif void bl_dgemm_ref( - int m, - int n, - int k, - double *XA, - int lda, - double *XB, - int ldb, - double *XC, - int ldc - ) -{ - // Local variables. - int i, j, p; - double alpha = 1.0, beta = 1.0; - - // Sanity check for early return. - if ( m == 0 || n == 0 || k == 0 ) return; + int m, + int n, + int k, + double* XA, + int lda, + double* XB, + int ldb, + double* XC, + int ldc) { + // Local variables. + int i, j, p; + double alpha = 1.0, beta = 1.0; + + // Sanity check for early return. + if (m == 0 || n == 0 || k == 0) + return; // Reference GEMM implementation. 
#ifdef USE_BLAS - dgemm_( "N", "N", &m, &n, &k, &alpha, - XA, &lda, XB, &ldb, &beta, XC, &ldc ); + dgemm_("N", "N", &m, &n, &k, &alpha, XA, &lda, XB, &ldb, &beta, XC, &ldc); #else - for ( i = 0; i < m; i ++ ) { - for ( j = 0; j < n; j ++ ) { - for ( p = 0; p < k; p ++ ) { - XC[ j * ldc + i ] += XA[ p * lda + i ] * XB[ j * ldb + p ]; - } - } + for (i = 0; i < m; i++) { + for (j = 0; j < n; j++) { + for (p = 0; p < k; p++) { + XC[j * ldc + i] += XA[p * lda + i] * XB[j * ldb + p]; + } } + } #endif - } - diff --git a/step3/dgemm/bl_dgemm_util.c b/step3/dgemm/bl_dgemm_util.c index 4acd123..8e989d2 100644 --- a/step3/dgemm/bl_dgemm_util.c +++ b/step3/dgemm/bl_dgemm_util.c @@ -1,93 +1,36 @@ -/* - * -------------------------------------------------------------------------- - * BLISLAB - * -------------------------------------------------------------------------- - * Copyright (C) 2016, The University of Texas at Austin - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - Neither the name of The University of Texas nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * bl_dgemm_util.c - * - * - * Purpose: - * Utility routines (Mem allocation, Print, etc.) that will come in handy later. 
- * - * Todo: - * - * - * Modification: - * - * - * */ - #include "bl_dgemm.h" /* * * - */ -double *bl_malloc_aligned( - int m, - int n, - int size - ) -{ - double *ptr; - int err; - - err = posix_memalign( (void**)&ptr, (size_t)GEMM_SIMD_ALIGN_SIZE, size * m * n ); - - if ( err ) { - printf( "bl_malloc_aligned(): posix_memalign() failures" ); - exit( 1 ); - } + */ +double* bl_malloc_aligned(int m, int n, int size) { + double* ptr; + int err; - return ptr; -} + err = + posix_memalign((void**)&ptr, (size_t)GEMM_SIMD_ALIGN_SIZE, size * m * n); + if (err) { + printf("bl_malloc_aligned(): posix_memalign() failures"); + exit(1); + } + return ptr; +} /* * * */ -void bl_dgemm_printmatrix( - double *A, - int lda, - int m, - int n - ) -{ - int i, j; - for ( i = 0; i < m; i ++ ) { - for ( j = 0; j < n; j ++ ) { - printf("%lf\t", A[j * lda + i]); - } - printf("\n"); +void bl_dgemm_printmatrix(double* A, int lda, int m, int n) { + int i, j; + for (i = 0; i < m; i++) { + for (j = 0; j < n; j++) { + printf("%lf\t", A[j * lda + i]); } + printf("\n"); + } } /* @@ -96,83 +39,84 @@ void bl_dgemm_printmatrix( */ static double gtod_ref_time_sec = 0.0; -double bl_clock( void ) -{ - return bl_clock_helper(); +double bl_clock(void) { + return bl_clock_helper(); } #if BL_OS_WINDOWS // --- Begin Windows build definitions ----------------------------------------- -double bl_clock_helper() -{ - LARGE_INTEGER clock_freq = {0}; - LARGE_INTEGER clock_val; - BOOL r_val; - - r_val = QueryPerformanceFrequency( &clock_freq ); - - if ( r_val == 0 ) - { - fprintf( stderr, "\nblislab: %s (line %lu):\nblislab: %s \n", __FILE__, __LINE__, "QueryPerformanceFrequency() failed" ); - fflush( stderr ); - abort(); - } - - r_val = QueryPerformanceCounter( &clock_val ); - - if ( r_val == 0 ) - { - fprintf( stderr, "\nblislab: %s (line %lu):\nblislab: %s \n", __FILE__, __LINE__, "QueryPerformanceFrequency() failed" ); - fflush( stderr ); - abort(); - } - - return ( ( double) clock_val.QuadPart / ( double) clock_freq.QuadPart ); +double bl_clock_helper() { + LARGE_INTEGER clock_freq = {0}; + LARGE_INTEGER clock_val; + BOOL r_val; + + r_val = QueryPerformanceFrequency(&clock_freq); + + if (r_val == 0) { + fprintf( + stderr, + "\nblislab: %s (line %lu):\nblislab: %s \n", + __FILE__, + __LINE__, + "QueryPerformanceFrequency() failed"); + fflush(stderr); + abort(); + } + + r_val = QueryPerformanceCounter(&clock_val); + + if (r_val == 0) { + fprintf( + stderr, + "\nblislab: %s (line %lu):\nblislab: %s \n", + __FILE__, + __LINE__, + "QueryPerformanceFrequency() failed"); + fflush(stderr); + abort(); + } + + return ((double)clock_val.QuadPart / (double)clock_freq.QuadPart); } // --- End Windows build definitions ------------------------------------------- #elif BL_OS_OSX // --- Begin OSX build definitions ------------------------------------------- -double bl_clock_helper() -{ - mach_timebase_info_data_t timebase; - mach_timebase_info( &timebase ); +double bl_clock_helper() { + mach_timebase_info_data_t timebase; + mach_timebase_info(&timebase); - uint64_t nsec = mach_absolute_time(); + uint64_t nsec = mach_absolute_time(); - double the_time = (double) nsec * 1.0e-9 * timebase.numer / timebase.denom; + double the_time = (double)nsec * 1.0e-9 * timebase.numer / timebase.denom; - if ( gtod_ref_time_sec == 0.0 ) - gtod_ref_time_sec = the_time; + if (gtod_ref_time_sec == 0.0) + gtod_ref_time_sec = the_time; - return the_time - gtod_ref_time_sec; + return the_time - gtod_ref_time_sec; } // --- End OSX build definitions 
--------------------------------------------- #else // --- Begin Linux build definitions ------------------------------------------- -double bl_clock_helper() -{ - double the_time, norm_sec; - struct timespec ts; +double bl_clock_helper() { + double the_time, norm_sec; + struct timespec ts; - clock_gettime( CLOCK_MONOTONIC, &ts ); + clock_gettime(CLOCK_MONOTONIC, &ts); - if ( gtod_ref_time_sec == 0.0 ) - gtod_ref_time_sec = ( double ) ts.tv_sec; + if (gtod_ref_time_sec == 0.0) + gtod_ref_time_sec = (double)ts.tv_sec; - norm_sec = ( double ) ts.tv_sec - gtod_ref_time_sec; + norm_sec = (double)ts.tv_sec - gtod_ref_time_sec; - the_time = norm_sec + ts.tv_nsec * 1.0e-9; + the_time = norm_sec + ts.tv_nsec * 1.0e-9; - return the_time; + return the_time; } // --- End Linux build definitions --------------------------------------------- #endif - - - diff --git a/step3/dgemm/my_dgemm.c b/step3/dgemm/my_dgemm.c index e528e0e..d3aa293 100644 --- a/step3/dgemm/my_dgemm.c +++ b/step3/dgemm/my_dgemm.c @@ -1,228 +1,164 @@ -/* - * -------------------------------------------------------------------------- - * BLISLAB - * -------------------------------------------------------------------------- - * Copyright (C) 2016, The University of Texas at Austin - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - Neither the name of The University of Texas nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * bl_dgemm.c - * - * - * Purpose: - * this is the main file of blislab dgemm. 
- * - * Todo: - * - * - * Modification: - * - * - * */ - #include -#include "bl_dgemm_kernel.h" #include "bl_dgemm.h" +#include "bl_dgemm_kernel.h" -inline void packA_mcxkc_d( - int m, - int k, - double *XA, - int ldXA, - int offseta, - double *packA - ) -{ - int i, p; - double *a_pntr[ DGEMM_MR ]; - - for ( i = 0; i < m; i ++ ) { - a_pntr[ i ] = XA + ( offseta + i ); - } +inline void +packA_mcxkc_d(int m, int k, double* XA, int ldXA, int offseta, double* packA) { + int i, p; + double* a_pntr[DGEMM_MR]; - for ( i = m; i < DGEMM_MR; i ++ ) { - a_pntr[ i ] = XA + ( offseta + 0 ); - } + for (i = 0; i < m; i++) { + a_pntr[i] = XA + (offseta + i); + } - for ( p = 0; p < k; p ++ ) { - for ( i = 0; i < DGEMM_MR; i ++ ) { - *packA = *a_pntr[ i ]; - packA ++; - a_pntr[ i ] = a_pntr[ i ] + ldXA; - } + for (i = m; i < DGEMM_MR; i++) { + a_pntr[i] = XA + (offseta + 0); + } + + for (p = 0; p < k; p++) { + for (i = 0; i < DGEMM_MR; i++) { + *packA = *a_pntr[i]; + packA++; + a_pntr[i] = a_pntr[i] + ldXA; } + } } - /* * -------------------------------------------------------------------------- */ inline void packB_kcxnc_d( - int n, - int k, - double *XB, - int ldXB, // ldXB is the original k - int offsetb, - double *packB - ) -{ - int j, p; - double *b_pntr[ DGEMM_NR ]; - - for ( j = 0; j < n; j ++ ) { - b_pntr[ j ] = XB + ldXB * ( offsetb + j ); - } - - for ( j = n; j < DGEMM_NR; j ++ ) { - b_pntr[ j ] = XB + ldXB * ( offsetb + 0 ); - } - - for ( p = 0; p < k; p ++ ) { - for ( j = 0; j < DGEMM_NR; j ++ ) { - *packB ++ = *b_pntr[ j ] ++; - } + int n, + int k, + double* XB, + int ldXB, // ldXB is the original k + int offsetb, + double* packB) { + int j, p; + double* b_pntr[DGEMM_NR]; + + for (j = 0; j < n; j++) { + b_pntr[j] = XB + ldXB * (offsetb + j); + } + + for (j = n; j < DGEMM_NR; j++) { + b_pntr[j] = XB + ldXB * (offsetb + 0); + } + + for (p = 0; p < k; p++) { + for (j = 0; j < DGEMM_NR; j++) { + *packB++ = *b_pntr[j]++; } + } } /* * -------------------------------------------------------------------------- */ void bl_macro_kernel( - int m, - int n, - int k, - double *packA, - double *packB, - double *C, - int ldc - ) -{ - int i, ii, j; - aux_t aux; - char *str; - - aux.b_next = packB; - - for ( j = 0; j < n; j += DGEMM_NR ) { // 2-th loop around micro-kernel - aux.n = min( n - j, DGEMM_NR ); - for ( i = 0; i < m; i += DGEMM_MR ) { // 1-th loop around micro-kernel - aux.m = min( m - i, DGEMM_MR ); - if ( i + DGEMM_MR >= m ) { - aux.b_next += DGEMM_NR * k; - } - - ( *bl_micro_kernel ) ( - k, - &packA[ i * k ], - &packB[ j * k ], - &C[ j * ldc + i ], - (unsigned long long) ldc, - &aux - ); - } // 1-th loop around micro-kernel - } // 2-th loop around micro-kernel + int m, + int n, + int k, + double* packA, + double* packB, + double* C, + int ldc) { + int i, ii, j; + aux_t aux; + char* str; + + aux.b_next = packB; + + for (j = 0; j < n; j += DGEMM_NR) { // 2-th loop around micro-kernel + aux.n = min(n - j, DGEMM_NR); + for (i = 0; i < m; i += DGEMM_MR) { // 1-th loop around micro-kernel + aux.m = min(m - i, DGEMM_MR); + if (i + DGEMM_MR >= m) { + aux.b_next += DGEMM_NR * k; + } + + (*bl_micro_kernel)( + k, + &packA[i * k], + &packB[j * k], + &C[j * ldc + i], + (unsigned long long)ldc, + &aux); + } // 1-th loop around micro-kernel + } // 2-th loop around micro-kernel } // C must be aligned void bl_dgemm( - int m, - int n, - int k, - double *XA, - int lda, - double *XB, - int ldb, - double *C, // must be aligned - int ldc // ldc must also be aligned - ) -{ - int i, j, p; - int ic, ib, jc, jb, pc, pb; - 
int ir, jr; - double *packA, *packB; - char *str; - - // Early return if possible - if ( m == 0 || n == 0 || k == 0 ) { - printf( "bl_dgemm(): early return\n" ); - return; - } + int m, + int n, + int k, + double* XA, + int lda, + double* XB, + int ldb, + double* C, // must be aligned + int ldc // ldc must also be aligned +) { + int i, j, p; + int ic, ib, jc, jb, pc, pb; + int ir, jr; + double *packA, *packB; + char* str; + + // Early return if possible + if (m == 0 || n == 0 || k == 0) { + printf("bl_dgemm(): early return\n"); + return; + } + + // Allocate packing buffers + packA = bl_malloc_aligned(DGEMM_KC, (DGEMM_MC + 1), sizeof(double)); + packB = bl_malloc_aligned(DGEMM_KC, (DGEMM_NC + 1), sizeof(double)); + + for (jc = 0; jc < n; jc += DGEMM_NC) { // 5-th loop around micro-kernel + jb = min(n - jc, DGEMM_NC); + for (pc = 0; pc < k; pc += DGEMM_KC) { // 4-th loop around micro-kernel + pb = min(k - pc, DGEMM_KC); + + for (j = 0; j < jb; j += DGEMM_NR) { + packB_kcxnc_d( + min(jb - j, DGEMM_NR), + pb, + &XB[pc], + k, // should be ldXB instead + jc + j, + &packB[j * pb]); + } + + for (ic = 0; ic < m; ic += DGEMM_MC) { // 3-rd loop around micro-kernel + + ib = min(m - ic, DGEMM_MC); + + for (i = 0; i < ib; i += DGEMM_MR) { + packA_mcxkc_d( + min(ib - i, DGEMM_MR), + pb, + &XA[pc * lda], + m, + ic + i, + &packA[0 * DGEMM_MC * pb + i * pb]); + } - // Allocate packing buffers - packA = bl_malloc_aligned( DGEMM_KC, ( DGEMM_MC + 1 ) , sizeof(double) ); - packB = bl_malloc_aligned( DGEMM_KC, ( DGEMM_NC + 1 ) , sizeof(double) ); - - for ( jc = 0; jc < n; jc += DGEMM_NC ) { // 5-th loop around micro-kernel - jb = min( n - jc, DGEMM_NC ); - for ( pc = 0; pc < k; pc += DGEMM_KC ) { // 4-th loop around micro-kernel - pb = min( k - pc, DGEMM_KC ); - - for ( j = 0; j < jb; j += DGEMM_NR ) { - packB_kcxnc_d( - min( jb - j, DGEMM_NR ), - pb, - &XB[ pc ], - k, // should be ldXB instead - jc + j, - &packB[ j * pb ] - ); - } - - - for ( ic = 0; ic < m; ic += DGEMM_MC ) { // 3-rd loop around micro-kernel - - ib = min( m - ic, DGEMM_MC ); - - for ( i = 0; i < ib; i += DGEMM_MR ) { - packA_mcxkc_d( - min( ib - i, DGEMM_MR ), - pb, - &XA[ pc * lda ], - m, - ic + i, - &packA[ 0 * DGEMM_MC * pb + i * pb ] - ); - } - - bl_macro_kernel( - ib, - jb, - pb, - packA + 0 * DGEMM_MC * pb, - packB, - &C[ jc * ldc + ic ], - ldc - ); - } // End 3.rd loop around micro-kernel - } // End 4.th loop around micro-kernel - } // End 5.th loop around micro-kernel - - free( packA ); - free( packB ); + bl_macro_kernel( + ib, + jb, + pb, + packA + 0 * DGEMM_MC * pb, + packB, + &C[jc * ldc + ic], + ldc); + } // End 3.rd loop around micro-kernel + } // End 4.th loop around micro-kernel + } // End 5.th loop around micro-kernel + + free(packA); + free(packB); } - diff --git a/step3/include/bl_config.h b/step3/include/bl_config.h index 2cce3bf..4c37cee 100644 --- a/step3/include/bl_config.h +++ b/step3/include/bl_config.h @@ -1,48 +1,3 @@ -/* - * -------------------------------------------------------------------------- - * BLISLAB - * -------------------------------------------------------------------------- - * Copyright (C) 2016, The University of Texas at Austin - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - Neither the name of The University of Texas nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * bl_config.h - * - * - * Purpose: - * this header file contains configuration parameters. - * - * Todo: - * - * - * Modification: - * - * - * */ - #ifndef BLISLAB_CONFIG_H #define BLISLAB_CONFIG_H @@ -61,7 +16,6 @@ extern "C" { #define DGEMM_MR 8 #define DGEMM_NR 4 - //#define DGEMM_MC 72 //#define DGEMM_NC 4080 //#define DGEMM_KC 256 @@ -72,8 +26,6 @@ extern "C" { ////#define DGEMM_MR 6 ////#define DGEMM_NR 8 - - //#define BL_MICRO_KERNEL bl_dgemm_int_8x4 //#define BL_MICRO_KERNEL bl_dgemm_asm_8x4 //#define BL_MICRO_KERNEL bl_dgemm_asm_8x6 @@ -87,4 +39,3 @@ extern "C" { #endif #endif - diff --git a/step3/include/bl_dgemm.h b/step3/include/bl_dgemm.h index d1f6d46..d946c8d 100644 --- a/step3/include/bl_dgemm.h +++ b/step3/include/bl_dgemm.h @@ -1,49 +1,3 @@ -/* - * -------------------------------------------------------------------------- - * BLISLAB - * -------------------------------------------------------------------------- - * Copyright (C) 2016, The University of Texas at Austin - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - Neither the name of The University of Texas nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * bl_dgemm.h - * - * - * Purpose: - * this header file contains all function prototypes. - * - * Todo: - * - * - * Modification: - * - * - * */ - - #ifndef BLISLAB_DGEMM_H #define BLISLAB_DGEMM_H @@ -54,14 +8,12 @@ extern "C" { #endif -#include #include - +#include #include #include - // Determine the target operating system #if defined(_WIN32) || defined(__CYGWIN__) #define BL_OS_WINDOWS 1 @@ -76,7 +28,7 @@ extern "C" { #elif defined(__bg__) #define BL_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ - defined(__bsdi__) || defined(__DragonFly__) + defined(__bsdi__) || defined(__DragonFly__) #define BL_OS_BSD 1 #else #error "Cannot determine operating system" @@ -84,64 +36,53 @@ extern "C" { // gettimeofday() needs this. #if BL_OS_WINDOWS - #include +#include #elif BL_OS_OSX - #include +#include #else - #include - #include +#include +#include #endif #include "bl_config.h" -#define min( i, j ) ( (i)<(j) ? (i): (j) ) +#define min(i, j) ((i) < (j) ? (i) : (j)) -#define A( i, j ) A[ (j)*lda + (i) ] -#define B( i, j ) B[ (j)*ldb + (i) ] -#define C( i, j ) C[ (j)*ldc + (i) ] -#define C_ref( i, j ) C_ref[ (j)*ldc_ref + (i) ] +#define A(i, j) A[(j)*lda + (i)] +#define B(i, j) B[(j)*ldb + (i)] +#define C(i, j) C[(j)*ldc + (i)] +#define C_ref(i, j) C_ref[(j)*ldc_ref + (i)] void bl_dgemm( - int m, - int n, - int k, - double *A, - int lda, - double *B, - int ldb, - double *C, - int ldc - ); - -double *bl_malloc_aligned( - int m, - int n, - int size - ); - -void bl_printmatrix( - double *A, - int lda, - int m, - int n - ); - -double bl_clock( void ); + int m, + int n, + int k, + double* A, + int lda, + double* B, + int ldb, + double* C, + int ldc); + +double* bl_malloc_aligned(int m, int n, int size); + +void bl_printmatrix(double* A, int lda, int m, int n); + +double bl_clock(void); double bl_clock_helper(); void bl_dgemm_ref( - int m, - int n, - int k, - double *XA, - int lda, - double *XB, - int ldb, - double *XC, - int ldc - ); - -void bl_get_range( int n, int bf, int* start, int* end ); + int m, + int n, + int k, + double* XA, + int lda, + double* XB, + int ldb, + double* XC, + int ldc); + +void bl_get_range(int n, int bf, int* start, int* end); // End extern "C" construct block. 
#ifdef __cplusplus @@ -149,4 +90,3 @@ void bl_get_range( int n, int bf, int* start, int* end ); #endif #endif - diff --git a/step3/include/bl_dgemm_kernel.h b/step3/include/bl_dgemm_kernel.h index 764fdac..9f26f7c 100644 --- a/step3/include/bl_dgemm_kernel.h +++ b/step3/include/bl_dgemm_kernel.h @@ -1,57 +1,10 @@ -/* - * -------------------------------------------------------------------------- - * BLISLAB - * -------------------------------------------------------------------------- - * Copyright (C) 2016, The University of Texas at Austin - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - Neither the name of The University of Texas nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * bl_dgemm_kernel.h - * - * - * Purpose: - * this header file contains all function prototypes. - * - * Todo: - * - * - * Modification: - * - * - * */ - - #ifndef BLISLAB_DGEMM_KERNEL_H #define BLISLAB_DGEMM_KERNEL_H #include "bl_config.h" -#include #include // AVX - +#include // Allow C++ users to include this header file in their source code. 
However, // we make the extern "C" conditional on whether we're using a C++ compiler, @@ -63,98 +16,100 @@ extern "C" { typedef unsigned long long dim_t; typedef union { - __m256d v; - __m256i u; - double d[ 4 ]; + __m256d v; + __m256i u; + double d[4]; } v4df_t; - typedef union { - __m128i v; - int d[ 4 ]; + __m128i v; + int d[4]; } v4li_t; struct aux_s { - double *b_next; - float *b_next_s; - char *flag; - int pc; - int m; - int n; + double* b_next; + float* b_next_s; + char* flag; + int pc; + int m; + int n; }; typedef struct aux_s aux_t; -void bl_dgemm_ukr( int k, - double *a, - double *b, - double *c, - unsigned long long ldc, - aux_t* data ); - -void bl_dgemm_int_8x4( int k, - double *a, - double *b, - double *c, - unsigned long long ldc, - aux_t* data ); - -void bl_dgemm_asm_8x4( int k, - double *a, - double *b, - double *c, - unsigned long long ldc, - aux_t* data ); - -void bl_dgemm_asm_12x4( int k, - double *a, - double *b, - double *c, - unsigned long long ldc, - aux_t* data ); - -void bl_dgemm_asm_8x6( int k, - double *a, - double *b, - double *c, - unsigned long long ldc, - aux_t* data ); - -void bl_dgemm_asm_6x8( int k, - double *a, - double *b, - double *c, - unsigned long long ldc, - aux_t* data ); - -void bl_dgemm_asm_4x12( int k, - double *a, - double *b, - double *c, - unsigned long long ldc, - aux_t* data ); - -static void (*bl_micro_kernel) ( - int k, - double *a, - double *b, - double *c, - unsigned long long ldc, - aux_t *aux - ) = { - BL_MICRO_KERNEL - //bl_dgemm_ukr - //bl_dgemm_int_8x4 - //bl_dgemm_asm_8x4 - //bl_dgemm_asm_8x6 - //bl_dgemm_asm_12x4 +void bl_dgemm_ukr( + int k, + double* a, + double* b, + double* c, + unsigned long long ldc, + aux_t* data); + +void bl_dgemm_int_8x4( + int k, + double* a, + double* b, + double* c, + unsigned long long ldc, + aux_t* data); + +void bl_dgemm_asm_8x4( + int k, + double* a, + double* b, + double* c, + unsigned long long ldc, + aux_t* data); + +void bl_dgemm_asm_12x4( + int k, + double* a, + double* b, + double* c, + unsigned long long ldc, + aux_t* data); + +void bl_dgemm_asm_8x6( + int k, + double* a, + double* b, + double* c, + unsigned long long ldc, + aux_t* data); + +void bl_dgemm_asm_6x8( + int k, + double* a, + double* b, + double* c, + unsigned long long ldc, + aux_t* data); + +void bl_dgemm_asm_4x12( + int k, + double* a, + double* b, + double* c, + unsigned long long ldc, + aux_t* data); + +static void (*bl_micro_kernel)( + int k, + double* a, + double* b, + double* c, + unsigned long long ldc, + aux_t* aux) = { + BL_MICRO_KERNEL + // bl_dgemm_ukr + // bl_dgemm_int_8x4 + // bl_dgemm_asm_8x4 + // bl_dgemm_asm_8x6 + // bl_dgemm_asm_12x4 }; - - // End extern "C" construct block. #ifdef __cplusplus } #endif #endif - diff --git a/step3/kernels/bl_dgemm_asm_12x4.c b/step3/kernels/bl_dgemm_asm_12x4.c index 6cb970c..2812bb1 100644 --- a/step3/kernels/bl_dgemm_asm_12x4.c +++ b/step3/kernels/bl_dgemm_asm_12x4.c @@ -1,663 +1,625 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - #include "bl_dgemm_kernel.h" -#define inc_t unsigned long long +#define inc_t unsigned long long -#define DGEMM_INPUT_GS_BETA_NZ \ - "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ - "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ - "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ - "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ - "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /*\ - "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ - "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ - "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ - "vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ - "vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/ +#define DGEMM_INPUT_GS_BETA_NZ \ + "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ + "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ + "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ + "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ + "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /* \ + "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ + "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ + "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ + "vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ + "vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/ -#define DGEMM_OUTPUT_GS_BETA_NZ \ - "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \ - "vmovlpd %%xmm0, (%%rcx ) \n\t" \ - "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /*\ - "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ - "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ - "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ +#define DGEMM_OUTPUT_GS_BETA_NZ \ + "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \ + "vmovlpd %%xmm0, (%%rcx ) \n\t" \ + "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ + "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ + "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /* \ + "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ + "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ + "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ + "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ + "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ void bl_dgemm_asm_12x4( - int k, - double* a, - double* b, - double* c, - inc_t ldc, - aux_t* data - ) -{ - //void* a_next = bli_auxinfo_next_a( data ); - //void* b_next = bli_auxinfo_next_b( data ); + int k, + double* a, + double* b, + double* c, + 
inc_t ldc, + aux_t* data) { + // void* a_next = bli_auxinfo_next_a( data ); + // void* b_next = bli_auxinfo_next_b( data ); - const inc_t cs_c = ldc; - const inc_t rs_c = 1; - double alpha_val = 1.0, beta_val = 1.0; - double *alpha, *beta; + const inc_t cs_c = ldc; + const inc_t rs_c = 1; + double alpha_val = 1.0, beta_val = 1.0; + double *alpha, *beta; - alpha = &alpha_val; - beta = &beta_val; + alpha = &alpha_val; + beta = &beta_val; - dim_t k_iter = (unsigned long long)k / 4; - dim_t k_left = (unsigned long long)k % 4; + dim_t k_iter = (unsigned long long)k / 4; + dim_t k_left = (unsigned long long)k % 4; - __asm__ volatile - ( - " \n\t" - "vzeroall \n\t" // zero all xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - //"movq %9, %%r15 \n\t" // load address of b_next. - " \n\t" - " \n\t" // initialize loop by pre-loading - "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" - "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" - "vmovapd 2 * 32(%%rax), %%ymm2 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double) - " \n\t" - "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*cs_c; - "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c - "prefetcht0 7 * 8(%%rcx,%%r13) \n\t" // prefetch c + 3*cs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".DLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 16 * 32(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" - " \n\t" - "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" - " \n\t" - "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" - "vmovapd 3 * 32(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" - "vmovapd 4 * 32(%%rax), %%ymm1 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" - "vmovapd 5 * 32(%%rax), %%ymm2 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastsd 4 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" - " \n\t" - "vbroadcastsd 5 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" - " \n\t" - "vbroadcastsd 6 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" - " \n\t" - "vbroadcastsd 7 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" - "vmovapd 6 * 32(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" - "vmovapd 7 * 32(%%rax), %%ymm1 \n\t" - 
"vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" - "vmovapd 8 * 32(%%rax), %%ymm2 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 22 * 32(%%rax) \n\t" - " \n\t" - "vbroadcastsd 8 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" - " \n\t" - "vbroadcastsd 9 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" - " \n\t" - "vbroadcastsd 10 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" - " \n\t" - "vbroadcastsd 11 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" - "vmovapd 9 * 32(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" - "vmovapd 10 * 32(%%rax), %%ymm1 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" - "vmovapd 11 * 32(%%rax), %%ymm2 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastsd 12 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" - " \n\t" - "vbroadcastsd 13 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" - " \n\t" - "vbroadcastsd 14 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" - " \n\t" - "vbroadcastsd 15 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" - "vmovapd 12 * 32(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" - "vmovapd 13 * 32(%%rax), %%ymm1 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" - "vmovapd 14 * 32(%%rax), %%ymm2 \n\t" - " \n\t" - " \n\t" - " \n\t" - "addq $4 * 12 * 8, %%rax \n\t" // a += 4*12 (unroll x mr) - "addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. 
- " \n\t" - " \n\t" - ".DLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 16 * 32(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" - " \n\t" - "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" - " \n\t" - "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" - "vmovapd 3 * 32(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" - "vmovapd 4 * 32(%%rax), %%ymm1 \n\t" - "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" - "vmovapd 5 * 32(%%rax), %%ymm2 \n\t" - " \n\t" - " \n\t" - " \n\t" - "addq $1 * 12 * 8, %%rax \n\t" // a += 1*12 (unroll x mr) - "addq $1 * 4 * 8, %%rbx \n\t" // b += 1*4 (unroll x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - ".DPOSTACCUM: \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate - "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha - "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" - "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" - "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" - "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" - "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" - "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" - "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" - "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" - "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" - "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(double) - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // rdx = c + 4*rs_c; - "leaq (%%rcx,%%rsi,8), %%r12 \n\t" // r12 = c + 8*rs_c; - " \n\t" - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*rs_c; - //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*rs_c; - //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*rs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 8*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (8*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. - "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
- "jne .DCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORED: \n\t" - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "movq %%r12, %%rcx \n\t" // rcx = c + 8*rs_c - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORED: \n\t" - " \n\t" - " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" - "vmovapd %%ymm0, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovapd (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm5, %%ymm3, %%ymm1 \n\t" - "vmovapd %%ymm1, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - "vmovapd (%%r12), %%ymm2 \n\t" - "vfmadd213pd %%ymm6, %%ymm3, %%ymm2 \n\t" - "vmovapd %%ymm2, (%%r12) \n\t" - "addq %%rdi, %%r12 \n\t" - " \n\t" - " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" - "vmovapd %%ymm0, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovapd (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm8, %%ymm3, %%ymm1 \n\t" - "vmovapd %%ymm1, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - "vmovapd (%%r12), %%ymm2 \n\t" - "vfmadd213pd %%ymm9, %%ymm3, %%ymm2 \n\t" - "vmovapd %%ymm2, (%%r12) \n\t" - "addq %%rdi, %%r12 \n\t" - " \n\t" - " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" - "vmovapd %%ymm0, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovapd (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm11, %%ymm3, %%ymm1 \n\t" - "vmovapd %%ymm1, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - "vmovapd (%%r12), %%ymm2 \n\t" - "vfmadd213pd %%ymm12, %%ymm3, %%ymm2 \n\t" - "vmovapd %%ymm2, (%%r12) \n\t" - "addq %%rdi, %%r12 \n\t" - " \n\t" - " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" - "vmovapd %%ymm0, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovapd (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm14, %%ymm3, %%ymm1 \n\t" - "vmovapd %%ymm1, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - "vmovapd (%%r12), %%ymm2 \n\t" - "vfmadd213pd %%ymm15, %%ymm3, %%ymm2 \n\t" - "vmovapd %%ymm2, (%%r12) \n\t" - "addq %%rdi, %%r12 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
- "jne .DCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovapd %%ymm4, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm7, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm10, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm13, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c - " \n\t" - " \n\t" - "vmovapd %%ymm5, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm8, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm11, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm14, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "movq %%r12, %%rcx \n\t" // rcx = c + 8*rs_c - " \n\t" - " \n\t" - "vmovapd %%ymm6, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm9, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm12, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm15, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovapd %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovapd %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - "vmovapd %%ymm6, (%%r12) \n\t" - "addq %%rdi, %%r12 \n\t" - " \n\t" - "vmovapd %%ymm7, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovapd %%ymm8, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - "vmovapd %%ymm9, (%%r12) \n\t" - "addq %%rdi, %%r12 \n\t" - " \n\t" - "vmovapd %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovapd %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - "vmovapd %%ymm12, (%%r12) \n\t" - "addq %%rdi, %%r12 \n\t" - " \n\t" - "vmovapd %%ymm13, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vmovapd %%ymm14, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - "vmovapd %%ymm15, (%%r12) \n\t" - //"addq %%rdi, %%r12 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DDONE: \n\t" - " \n\t" + __asm__ volatile + ( + " \n\t" + "vzeroall \n\t" // zero all xmm/ymm registers. + " \n\t" + " \n\t" + "movq %2, %%rax \n\t" // load address of a. + "movq %3, %%rbx \n\t" // load address of b. + //"movq %9, %%r15 \n\t" // load address of b_next. 
+ " \n\t" + " \n\t" // initialize loop by pre-loading + "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" + "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" + "vmovapd 2 * 32(%%rax), %%ymm2 \n\t" + " \n\t" + "movq %6, %%rcx \n\t" // load address of c + "movq %8, %%rdi \n\t" // load cs_c + "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double) + " \n\t" + "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*cs_c; + "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c + "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c + "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c + "prefetcht0 7 * 8(%%rcx,%%r13) \n\t" // prefetch c + 3*cs_c + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %0, %%rsi \n\t" // i = k_iter; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that + " \n\t" // contains the k_left loop. + " \n\t" + " \n\t" + ".DLOOPKITER: \n\t" // MAIN LOOP + " \n\t" + " \n\t" + " \n\t" // iteration 0 + "prefetcht0 16 * 32(%%rax) \n\t" + " \n\t" + "vbroadcastsd 0 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" + " \n\t" + "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" + " \n\t" + "vbroadcastsd 2 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" + " \n\t" + "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" + "vmovapd 3 * 32(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" + "vmovapd 4 * 32(%%rax), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" + "vmovapd 5 * 32(%%rax), %%ymm2 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" // iteration 1 + "vbroadcastsd 4 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" + " \n\t" + "vbroadcastsd 5 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" + " \n\t" + "vbroadcastsd 6 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" + " \n\t" + "vbroadcastsd 7 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" + "vmovapd 6 * 32(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" + "vmovapd 7 * 32(%%rax), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" + "vmovapd 8 * 32(%%rax), %%ymm2 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" // iteration 2 + "prefetcht0 22 * 32(%%rax) \n\t" + " \n\t" + "vbroadcastsd 8 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" + " \n\t" + "vbroadcastsd 9 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" + " \n\t" + "vbroadcastsd 10 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" + " \n\t" + "vbroadcastsd 11 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" + "vmovapd 9 * 32(%%rax), %%ymm0 \n\t" 
+ "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" + "vmovapd 10 * 32(%%rax), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" + "vmovapd 11 * 32(%%rax), %%ymm2 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" // iteration 3 + "vbroadcastsd 12 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" + " \n\t" + "vbroadcastsd 13 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" + " \n\t" + "vbroadcastsd 14 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" + " \n\t" + "vbroadcastsd 15 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" + "vmovapd 12 * 32(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" + "vmovapd 13 * 32(%%rax), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" + "vmovapd 14 * 32(%%rax), %%ymm2 \n\t" + " \n\t" + " \n\t" + " \n\t" + "addq $4 * 12 * 8, %%rax \n\t" // a += 4*12 (unroll x mr) + "addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr) + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .DLOOPKITER \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".DCONSIDKLEFT: \n\t" + " \n\t" + "movq %1, %%rsi \n\t" // i = k_left; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. + " \n\t" // else, we prepare to enter k_left loop. + " \n\t" + " \n\t" + ".DLOOPKLEFT: \n\t" // EDGE LOOP + " \n\t" + "prefetcht0 16 * 32(%%rax) \n\t" + " \n\t" + "vbroadcastsd 0 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" + " \n\t" + "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" + " \n\t" + "vbroadcastsd 2 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" + " \n\t" + "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" + "vmovapd 3 * 32(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" + "vmovapd 4 * 32(%%rax), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" + "vmovapd 5 * 32(%%rax), %%ymm2 \n\t" + " \n\t" + " \n\t" + " \n\t" + "addq $1 * 12 * 8, %%rax \n\t" // a += 1*12 (unroll x mr) + "addq $1 * 4 * 8, %%rbx \n\t" // b += 1*4 (unroll x nr) + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. 
+ " \n\t" + " \n\t" + " \n\t" + ".DPOSTACCUM: \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %4, %%rax \n\t" // load address of alpha + "movq %5, %%rbx \n\t" // load address of beta + "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate + "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate + " \n\t" + "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha + "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" + "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" + "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" + "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" + "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" + "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" + "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %7, %%rsi \n\t" // load rs_c + "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(double) + " \n\t" + "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // rdx = c + 4*rs_c; + "leaq (%%rcx,%%rsi,8), %%r12 \n\t" // r12 = c + 8*rs_c; + " \n\t" + "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*rs_c; + //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*rs_c; + //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*rs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" // determine if + " \n\t" // c % 32 == 0, AND + " \n\t" // 8*cs_c % 32 == 0, AND + " \n\t" // rs_c == 1 + " \n\t" // ie: aligned, ldim aligned, and + " \n\t" // column-stored + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. + "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); + "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. + "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); + "testq $31, %%rdi \n\t" // set ZF if (8*cs_c) & 32 is zero. + "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); + " \n\t" // and(bl,bh) followed by + " \n\t" // and(bh,al) will reveal result + " \n\t" + " \n\t" // now avoid loading C if beta == 0 + " \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. + "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. + "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case + " \n\t" + " \n\t" + " \n\t" // check if aligned/column-stored + "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. + "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
+ "jne .DCOLSTORED \n\t" // jump to column storage case + " \n\t" + " \n\t" + " \n\t" + ".DGENSTORED: \n\t" + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "movq %%r12, %%rcx \n\t" // rcx = c + 8*rs_c + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .DDONE \n\t" // jump to end. 
+ " \n\t" + " \n\t" + " \n\t" + ".DCOLSTORED: \n\t" + " \n\t" + " \n\t" + "vmovapd (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" + "vmovapd %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm5, %%ymm3, %%ymm1 \n\t" + "vmovapd %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd (%%r12), %%ymm2 \n\t" + "vfmadd213pd %%ymm6, %%ymm3, %%ymm2 \n\t" + "vmovapd %%ymm2, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + "vmovapd (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" + "vmovapd %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm8, %%ymm3, %%ymm1 \n\t" + "vmovapd %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd (%%r12), %%ymm2 \n\t" + "vfmadd213pd %%ymm9, %%ymm3, %%ymm2 \n\t" + "vmovapd %%ymm2, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + "vmovapd (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" + "vmovapd %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm11, %%ymm3, %%ymm1 \n\t" + "vmovapd %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd (%%r12), %%ymm2 \n\t" + "vfmadd213pd %%ymm12, %%ymm3, %%ymm2 \n\t" + "vmovapd %%ymm2, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + "vmovapd (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" + "vmovapd %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm14, %%ymm3, %%ymm1 \n\t" + "vmovapd %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd (%%r12), %%ymm2 \n\t" + "vfmadd213pd %%ymm15, %%ymm3, %%ymm2 \n\t" + "vmovapd %%ymm2, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .DDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".DBETAZERO: \n\t" + " \n\t" // check if aligned/column-stored + "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. + "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
+ "jne .DCOLSTORBZ \n\t" // jump to column storage case + " \n\t" + " \n\t" + " \n\t" + ".DGENSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovapd %%ymm4, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm7, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm10, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm13, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c + " \n\t" + " \n\t" + "vmovapd %%ymm5, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm8, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm11, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm14, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "movq %%r12, %%rcx \n\t" // rcx = c + 8*rs_c + " \n\t" + " \n\t" + "vmovapd %%ymm6, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm9, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm12, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm15, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .DDONE \n\t" // jump to end. 
+ " \n\t" + " \n\t" + " \n\t" + ".DCOLSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovapd %%ymm4, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd %%ymm5, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd %%ymm6, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + "vmovapd %%ymm7, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd %%ymm8, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd %%ymm9, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + "vmovapd %%ymm10, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd %%ymm11, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd %%ymm12, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + "vmovapd %%ymm13, (%%rcx) \n\t" + //"addq %%rdi, %%rcx \n\t" + "vmovapd %%ymm14, (%%rdx) \n\t" + //"addq %%rdi, %%rdx \n\t" + "vmovapd %%ymm15, (%%r12) \n\t" + //"addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".DDONE: \n\t" + " \n\t" - : // output operands (none) - : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 - : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "memory" - ); + : // output operands (none) + : // input operands + "m" (k_iter), // 0 + "m" (k_left), // 1 + "m" (a), // 2 + "m" (b), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (c), // 6 + "m" (rs_c), // 7 + "m" (cs_c)/*, // 8 + "m" (b_next), // 9 + "m" (a_next)*/ // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); } - diff --git a/step3/kernels/bl_dgemm_asm_4x12.c b/step3/kernels/bl_dgemm_asm_4x12.c index 6b7e9ac..0305d2e 100644 --- a/step3/kernels/bl_dgemm_asm_4x12.c +++ b/step3/kernels/bl_dgemm_asm_4x12.c @@ -1,699 +1,660 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - #include "bl_dgemm_kernel.h" -#define inc_t unsigned long long +#define inc_t unsigned long long -#define DGEMM_INPUT_GS_BETA_NZ \ - "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ - "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ - "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ - "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ - "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /*\ - "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ - "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ - "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ - "vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ - "vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/ +#define DGEMM_INPUT_GS_BETA_NZ \ + "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ + "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ + "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ + "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ + "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /* \ + "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ + "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ + "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ + "vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ + "vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/ -#define DGEMM_OUTPUT_GS_BETA_NZ \ - "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \ - "vmovlpd %%xmm0, (%%rcx ) \n\t" \ - "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /*\ - "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ - "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ - "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ +#define DGEMM_OUTPUT_GS_BETA_NZ \ + "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \ + "vmovlpd %%xmm0, (%%rcx ) \n\t" \ + "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ + "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ + "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /* \ + "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ + "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ + "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ + "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ + "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ -void bl_dgemm_asm_4x12 - ( - int k, - double* a, - double* b, - double* c, - inc_t ldc, - aux_t* data - ) -{ - //void* a_next = bli_auxinfo_next_a( data ); - //void* b_next = bli_auxinfo_next_b( data ); +void bl_dgemm_asm_4x12( + int k, + double* a, + double* b, + double* c, + inc_t ldc, + aux_t* data) { + // void* a_next = bli_auxinfo_next_a( data ); + // void* b_next = bli_auxinfo_next_b( data ); - //dim_t k_iter = k / 4; - //dim_t k_left = k % 4; + // dim_t k_iter = k / 4; + // dim_t k_left = k % 4; - const inc_t cs_c = ldc; - const inc_t rs_c = 1; - double alpha_val = 1.0, beta_val = 1.0; - double *alpha, *beta; + const inc_t cs_c = ldc; + const inc_t rs_c = 1; + double alpha_val = 1.0, beta_val = 1.0; + double *alpha, *beta; - alpha = &alpha_val; - beta = &beta_val; + alpha = &alpha_val; + beta = &beta_val; - dim_t k_iter = (unsigned long long)k / 4; - dim_t k_left = (unsigned long long)k % 4; + dim_t 
k_iter = (unsigned long long)k / 4; + dim_t k_left = (unsigned long long)k % 4; + __asm__ volatile + ( + " \n\t" + "vzeroall \n\t" // zero all xmm/ymm registers. + " \n\t" + " \n\t" + "movq %2, %%rax \n\t" // load address of a. + "movq %3, %%rbx \n\t" // load address of b. + //"movq %9, %%r15 \n\t" // load address of b_next. + " \n\t" + "addq $32 * 4, %%rbx \n\t" + " \n\t" // initialize loop by pre-loading + "vmovapd -4 * 32(%%rbx), %%ymm1 \n\t" + "vmovapd -3 * 32(%%rbx), %%ymm2 \n\t" + "vmovapd -2 * 32(%%rbx), %%ymm3 \n\t" + " \n\t" + "movq %6, %%rcx \n\t" // load address of c + "movq %7, %%rdi \n\t" // load rs_c + "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(double) + " \n\t" + "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*rs_c; + "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c + "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*rs_c + "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*rs_c + "prefetcht0 7 * 8(%%rcx,%%r13) \n\t" // prefetch c + 3*rs_c + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %0, %%rsi \n\t" // i = k_iter; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that + " \n\t" // contains the k_left loop. + " \n\t" + " \n\t" + ".DLOOPKITER: \n\t" // MAIN LOOP + " \n\t" + " \n\t" + " \n\t" // iteration 0 + "prefetcht0 24 * 8(%%rax) \n\t" + " \n\t" + "vbroadcastf128 0 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" + "vbroadcastf128 2 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" + "vmovapd -1 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" + "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" + "vmovapd 1 * 32(%%rbx), %%ymm3 \n\t" + " \n\t" + " \n\t" // iteration 1 + "vbroadcastf128 4 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" + "vbroadcastf128 6 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" + "vmovapd 2 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" + "vmovapd 3 * 32(%%rbx), %%ymm2 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" + "vmovapd 4 * 32(%%rbx), %%ymm3 \n\t" + " \n\t" + "prefetcht0 32 * 8(%%rax) \n\t" + " \n\t" // iteration 2 + "vbroadcastf128 8 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" + "vbroadcastf128 10 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" + 
"vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" + "vmovapd 5 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" + "vmovapd 6 * 32(%%rbx), %%ymm2 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" + "vmovapd 7 * 32(%%rbx), %%ymm3 \n\t" + " \n\t" + " \n\t" // iteration 3 + "vbroadcastf128 12 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" + "vbroadcastf128 14 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" + "vmovapd 8 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" + "vmovapd 9 * 32(%%rbx), %%ymm2 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" + "vmovapd 10 * 32(%%rbx), %%ymm3 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "addq $4 * 4 * 8, %%rax \n\t" // a += 4*4 (unroll x mr) + "addq $4 * 12 * 8, %%rbx \n\t" // b += 4*12 (unroll x nr) + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .DLOOPKITER \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".DCONSIDKLEFT: \n\t" + " \n\t" + "movq %1, %%rsi \n\t" // i = k_left; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. + " \n\t" // else, we prepare to enter k_left loop. + " \n\t" + " \n\t" + ".DLOOPKLEFT: \n\t" // EDGE LOOP + " \n\t" + "prefetcht0 24 * 8(%%rax) \n\t" + " \n\t" + "vbroadcastf128 0 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" + "vbroadcastf128 2 * 8(%%rax), %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" + "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" + "vmovapd -1 * 32(%%rbx), %%ymm1 \n\t" + "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" + "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" + "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" + "vmovapd 1 * 32(%%rbx), %%ymm3 \n\t" + " \n\t" + " \n\t" + " \n\t" + "addq $1 * 4 * 8, %%rax \n\t" // a += 1*4 (unroll x mr) + "addq $1 * 12 * 8, %%rbx \n\t" // b += 1*12 (unroll x nr) + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. 
+ " \n\t" + " \n\t" + " \n\t" + ".DPOSTACCUM: \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %4, %%rax \n\t" // load address of alpha + "movq %5, %%rbx \n\t" // load address of beta + "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate + "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate + " \n\t" + "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha + "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" + "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" + "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" + "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" + "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" + "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" + "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" + " \n\t" + " \n\t" // ymm4 : ( ab00 ab11 ab02 ab13 ) + " \n\t" // ymm7 : ( ab10 ab01 ab12 ab03 ) + " \n\t" // ymm10: ( ab20 ab31 ab22 ab33 ) + " \n\t" // ymm13: ( ab30 ab21 ab32 ab23 ) + " \n\t" + " \n\t" // ymm5 : ( ab04 ab15 ab06 ab17 ) + " \n\t" // ymm8 : ( ab14 ab05 ab16 ab07 ) + " \n\t" // ymm11: ( ab24 ab35 ab26 ab37 ) + " \n\t" // ymm14: ( ab34 ab25 ab36 ab27 ) + " \n\t" + " \n\t" // ymm6 : ( ab08 ab19 ab0A ab1B ) + " \n\t" // ymm9 : ( ab18 ab09 ab1A ab0B ) + " \n\t" // ymm12: ( ab28 ab39 ab2A ab3B ) + " \n\t" // ymm15: ( ab38 ab29 ab3A ab2B ) + "vmovapd %%ymm4, %%ymm0 \n\t" + "vshufpd $0xa, %%ymm7, %%ymm4, %%ymm4 \n\t" + "vshufpd $0xa, %%ymm0, %%ymm7, %%ymm7 \n\t" + " \n\t" + "vmovapd %%ymm5, %%ymm0 \n\t" + "vshufpd $0xa, %%ymm8, %%ymm5, %%ymm5 \n\t" + "vshufpd $0xa, %%ymm0, %%ymm8, %%ymm8 \n\t" + " \n\t" + "vmovapd %%ymm6, %%ymm0 \n\t" + "vshufpd $0xa, %%ymm9, %%ymm6, %%ymm6 \n\t" + "vshufpd $0xa, %%ymm0, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vmovapd %%ymm10, %%ymm0 \n\t" + "vshufpd $0xa, %%ymm13, %%ymm10, %%ymm10 \n\t" + "vshufpd $0xa, %%ymm0, %%ymm13, %%ymm13 \n\t" + " \n\t" + "vmovapd %%ymm11, %%ymm0 \n\t" + "vshufpd $0xa, %%ymm14, %%ymm11, %%ymm11 \n\t" + "vshufpd $0xa, %%ymm0, %%ymm14, %%ymm14 \n\t" + " \n\t" + "vmovapd %%ymm12, %%ymm0 \n\t" + "vshufpd $0xa, %%ymm15, %%ymm12, %%ymm12 \n\t" + "vshufpd $0xa, %%ymm0, %%ymm15, %%ymm15 \n\t" + " \n\t" // ymm4 : ( ab00 ab01 ab02 ab03 ) + " \n\t" // ymm7 : ( ab10 ab11 ab12 ab13 ) + " \n\t" // ymm10: ( ab20 ab21 ab22 ab23 ) + " \n\t" // ymm13: ( ab30 ab31 ab32 ab33 ) + " \n\t" + " \n\t" // ymm5 : ( ab04 ab05 ab06 ab07 ) + " \n\t" // ymm8 : ( ab14 ab15 ab16 ab17 ) + " \n\t" // ymm11: ( ab24 ab25 ab26 ab27 ) + " \n\t" // ymm14: ( ab34 ab35 ab36 ab37 ) + " \n\t" + " \n\t" // ymm6 : ( ab08 ab09 ab0A ab0B ) + " \n\t" // ymm9 : ( ab18 ab19 ab1A ab1B ) + " \n\t" // ymm12: ( ab28 ab29 ab2A ab2B ) + " \n\t" // ymm15: ( ab38 ab39 ab3A ab3B ) + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + //"m" (rs_c), // 7 rdi + //"m" (cs_c), // 8 rsi + "movq %8, %%rsi \n\t" // load cs_c + "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(double) + " \n\t" + "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // rdx = c + 4*cs_c; + "leaq (%%rcx,%%rsi,8), %%r12 \n\t" // r12 = c + 8*cs_c; + " \n\t" + "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" // determine if + " \n\t" // c % 32 == 0, AND + " \n\t" // 8*rs_c % 32 == 0, AND + " \n\t" // cs_c == 1 + " \n\t" // ie: aligned, ldim aligned, and + " \n\t" // row-stored + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); + "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. 
+ "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); + "testq $31, %%rdi \n\t" // set ZF if (8*rs_c) & 32 is zero. + "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); + " \n\t" // and(bl,bh) followed by + " \n\t" // and(bh,al) will reveal result + " \n\t" + " \n\t" // now avoid loading C if beta == 0 + " \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. + "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. + "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case + " \n\t" + " \n\t" + " \n\t" // check if aligned/row-stored + "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. + "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. + "jne .DROWSTORED \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".DGENSTORED: \n\t" + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "movq %%r12, %%rcx \n\t" // rcx = c + 8*cs_c + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .DDONE \n\t" // jump to end. 
+ " \n\t" + " \n\t" + " \n\t" + ".DROWSTORED: \n\t" + " \n\t" + " \n\t" + "vmovapd (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" + "vmovapd %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm5, %%ymm3, %%ymm1 \n\t" + "vmovapd %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd (%%r12), %%ymm2 \n\t" + "vfmadd213pd %%ymm6, %%ymm3, %%ymm2 \n\t" + "vmovapd %%ymm2, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + "vmovapd (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" + "vmovapd %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm8, %%ymm3, %%ymm1 \n\t" + "vmovapd %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd (%%r12), %%ymm2 \n\t" + "vfmadd213pd %%ymm9, %%ymm3, %%ymm2 \n\t" + "vmovapd %%ymm2, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + "vmovapd (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" + "vmovapd %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm11, %%ymm3, %%ymm1 \n\t" + "vmovapd %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd (%%r12), %%ymm2 \n\t" + "vfmadd213pd %%ymm12, %%ymm3, %%ymm2 \n\t" + "vmovapd %%ymm2, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + "vmovapd (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" + "vmovapd %%ymm0, (%%rcx) \n\t" + //"addq %%rdi, %%rcx \n\t" + "vmovapd (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm14, %%ymm3, %%ymm1 \n\t" + "vmovapd %%ymm1, (%%rdx) \n\t" + //"addq %%rdi, %%rdx \n\t" + "vmovapd (%%r12), %%ymm2 \n\t" + "vfmadd213pd %%ymm15, %%ymm3, %%ymm2 \n\t" + "vmovapd %%ymm2, (%%r12) \n\t" + //"addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .DDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".DBETAZERO: \n\t" + " \n\t" // check if aligned/row-stored + "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. + "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
+ "jne .DROWSTORBZ \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".DGENSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovapd %%ymm4, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm7, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm10, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm13, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c + " \n\t" + " \n\t" + "vmovapd %%ymm5, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm8, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm11, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm14, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "movq %%r12, %%rcx \n\t" // rcx = c + 8*cs_c + " \n\t" + " \n\t" + "vmovapd %%ymm6, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm9, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm12, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovapd %%ymm15, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + //"addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .DDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".DROWSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovapd %%ymm4, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd %%ymm5, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd %%ymm6, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + "vmovapd %%ymm7, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd %%ymm8, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd %%ymm9, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + "vmovapd %%ymm10, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovapd %%ymm11, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + "vmovapd %%ymm12, (%%r12) \n\t" + "addq %%rdi, %%r12 \n\t" + " \n\t" + "vmovapd %%ymm13, (%%rcx) \n\t" + //"addq %%rdi, %%rcx \n\t" + "vmovapd %%ymm14, (%%rdx) \n\t" + //"addq %%rdi, %%rdx \n\t" + "vmovapd %%ymm15, (%%r12) \n\t" + //"addq %%rdi, %%r12 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".DDONE: \n\t" + " \n\t" - __asm__ volatile - ( - " \n\t" - "vzeroall \n\t" // zero all xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - //"movq %9, %%r15 \n\t" // load address of b_next. 
- " \n\t" - "addq $32 * 4, %%rbx \n\t" - " \n\t" // initialize loop by pre-loading - "vmovapd -4 * 32(%%rbx), %%ymm1 \n\t" - "vmovapd -3 * 32(%%rbx), %%ymm2 \n\t" - "vmovapd -2 * 32(%%rbx), %%ymm3 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %7, %%rdi \n\t" // load rs_c - "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(double) - " \n\t" - "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*rs_c; - "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c - "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*rs_c - "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*rs_c - "prefetcht0 7 * 8(%%rcx,%%r13) \n\t" // prefetch c + 3*rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".DLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 24 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastf128 0 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" - "vbroadcastf128 2 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" - "vmovapd -1 * 32(%%rbx), %%ymm1 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" - "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" - "vmovapd 1 * 32(%%rbx), %%ymm3 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastf128 4 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" - "vbroadcastf128 6 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" - "vmovapd 2 * 32(%%rbx), %%ymm1 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" - "vmovapd 3 * 32(%%rbx), %%ymm2 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" - "vmovapd 4 * 32(%%rbx), %%ymm3 \n\t" - " \n\t" - "prefetcht0 32 * 8(%%rax) \n\t" - " \n\t" // iteration 2 - "vbroadcastf128 8 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" - "vbroadcastf128 10 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" - "vmovapd 5 * 32(%%rbx), %%ymm1 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" - "vmovapd 6 * 32(%%rbx), %%ymm2 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, 
%%ymm15 \n\t" - "vmovapd 7 * 32(%%rbx), %%ymm3 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastf128 12 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" - "vbroadcastf128 14 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" - "vmovapd 8 * 32(%%rbx), %%ymm1 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" - "vmovapd 9 * 32(%%rbx), %%ymm2 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" - "vmovapd 10 * 32(%%rbx), %%ymm3 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "addq $4 * 4 * 8, %%rax \n\t" // a += 4*4 (unroll x mr) - "addq $4 * 12 * 8, %%rbx \n\t" // b += 4*12 (unroll x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".DLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 24 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastf128 0 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm4 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm5 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm6 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm7 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm8 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm9 \n\t" - "vbroadcastf128 2 * 8(%%rax), %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm10 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm11 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm12 \n\t" - "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" - "vfmadd231pd %%ymm1, %%ymm0, %%ymm13 \n\t" - "vmovapd -1 * 32(%%rbx), %%ymm1 \n\t" - "vfmadd231pd %%ymm2, %%ymm0, %%ymm14 \n\t" - "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" - "vfmadd231pd %%ymm3, %%ymm0, %%ymm15 \n\t" - "vmovapd 1 * 32(%%rbx), %%ymm3 \n\t" - " \n\t" - " \n\t" - " \n\t" - "addq $1 * 4 * 8, %%rax \n\t" // a += 1*4 (unroll x mr) - "addq $1 * 12 * 8, %%rbx \n\t" // b += 1*12 (unroll x nr) - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".DPOSTACCUM: \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate - "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha - "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" - "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" - "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" - "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" - "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" - "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" - "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" - "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" - "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" - "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" // ymm4 : ( ab00 ab11 ab02 ab13 ) - " \n\t" // ymm7 : ( ab10 ab01 ab12 ab03 ) - " \n\t" // ymm10: ( ab20 ab31 ab22 ab33 ) - " \n\t" // ymm13: ( ab30 ab21 ab32 ab23 ) - " \n\t" - " \n\t" // ymm5 : ( ab04 ab15 ab06 ab17 ) - " \n\t" // ymm8 : ( ab14 ab05 ab16 ab07 ) - " \n\t" // ymm11: ( ab24 ab35 ab26 ab37 ) - " \n\t" // ymm14: ( ab34 ab25 ab36 ab27 ) - " \n\t" - " \n\t" // ymm6 : ( ab08 ab19 ab0A ab1B ) - " \n\t" // ymm9 : ( ab18 ab09 ab1A ab0B ) - " \n\t" // ymm12: ( ab28 ab39 ab2A ab3B ) - " \n\t" // ymm15: ( ab38 ab29 ab3A ab2B ) - "vmovapd %%ymm4, %%ymm0 \n\t" - "vshufpd $0xa, %%ymm7, %%ymm4, %%ymm4 \n\t" - "vshufpd $0xa, %%ymm0, %%ymm7, %%ymm7 \n\t" - " \n\t" - "vmovapd %%ymm5, %%ymm0 \n\t" - "vshufpd $0xa, %%ymm8, %%ymm5, %%ymm5 \n\t" - "vshufpd $0xa, %%ymm0, %%ymm8, %%ymm8 \n\t" - " \n\t" - "vmovapd %%ymm6, %%ymm0 \n\t" - "vshufpd $0xa, %%ymm9, %%ymm6, %%ymm6 \n\t" - "vshufpd $0xa, %%ymm0, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmovapd %%ymm10, %%ymm0 \n\t" - "vshufpd $0xa, %%ymm13, %%ymm10, %%ymm10 \n\t" - "vshufpd $0xa, %%ymm0, %%ymm13, %%ymm13 \n\t" - " \n\t" - "vmovapd %%ymm11, %%ymm0 \n\t" - "vshufpd $0xa, %%ymm14, %%ymm11, %%ymm11 \n\t" - "vshufpd $0xa, %%ymm0, %%ymm14, %%ymm14 \n\t" - " \n\t" - "vmovapd %%ymm12, %%ymm0 \n\t" - "vshufpd $0xa, %%ymm15, %%ymm12, %%ymm12 \n\t" - "vshufpd $0xa, %%ymm0, %%ymm15, %%ymm15 \n\t" - " \n\t" // ymm4 : ( ab00 ab01 ab02 ab03 ) - " \n\t" // ymm7 : ( ab10 ab11 ab12 ab13 ) - " \n\t" // ymm10: ( ab20 ab21 ab22 ab23 ) - " \n\t" // ymm13: ( ab30 ab31 ab32 ab33 ) - " \n\t" - " \n\t" // ymm5 : ( ab04 ab05 ab06 ab07 ) - " \n\t" // ymm8 : ( ab14 ab15 ab16 ab17 ) - " \n\t" // ymm11: ( ab24 ab25 ab26 ab27 ) - " \n\t" // ymm14: ( ab34 ab35 ab36 ab37 ) - " \n\t" - " \n\t" // ymm6 : ( ab08 ab09 ab0A ab0B ) - " \n\t" // ymm9 : ( ab18 ab19 ab1A ab1B ) - " \n\t" // ymm12: ( ab28 ab29 ab2A ab2B ) - " \n\t" // ymm15: ( ab38 ab39 ab3A ab3B ) - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - //"m" (rs_c), // 7 rdi - //"m" (cs_c), // 8 rsi - "movq %8, %%rsi \n\t" // load cs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(double) - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // rdx = c + 4*cs_c; - "leaq (%%rcx,%%rsi,8), %%r12 \n\t" // r12 = c + 8*cs_c; - " \n\t" - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 8*rs_c % 32 == 0, AND - " \n\t" // cs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // row-stored - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. 
- "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (8*rs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. - "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - " \n\t" // check if aligned/row-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. - "jne .DROWSTORED \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORED: \n\t" - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "movq %%r12, %%rcx \n\t" // rcx = c + 8*cs_c - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - " \n\t" - ".DROWSTORED: \n\t" - " \n\t" - " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" - "vmovapd %%ymm0, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovapd (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm5, %%ymm3, %%ymm1 \n\t" - "vmovapd %%ymm1, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - "vmovapd (%%r12), %%ymm2 \n\t" - "vfmadd213pd %%ymm6, %%ymm3, %%ymm2 \n\t" - "vmovapd %%ymm2, (%%r12) \n\t" - "addq %%rdi, %%r12 \n\t" - " \n\t" - " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" - "vmovapd %%ymm0, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovapd (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm8, %%ymm3, %%ymm1 \n\t" - "vmovapd %%ymm1, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - "vmovapd (%%r12), %%ymm2 \n\t" - "vfmadd213pd %%ymm9, %%ymm3, %%ymm2 \n\t" - "vmovapd %%ymm2, (%%r12) \n\t" - "addq %%rdi, %%r12 \n\t" - " \n\t" - " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" - "vmovapd %%ymm0, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovapd (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm11, %%ymm3, %%ymm1 \n\t" - "vmovapd %%ymm1, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - "vmovapd (%%r12), %%ymm2 \n\t" - "vfmadd213pd %%ymm12, %%ymm3, %%ymm2 \n\t" - "vmovapd %%ymm2, (%%r12) \n\t" - "addq %%rdi, %%r12 \n\t" - " \n\t" - " \n\t" - "vmovapd (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" - "vmovapd %%ymm0, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vmovapd (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm14, %%ymm3, %%ymm1 \n\t" - "vmovapd %%ymm1, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - "vmovapd (%%r12), %%ymm2 \n\t" - "vfmadd213pd %%ymm15, %%ymm3, %%ymm2 \n\t" - "vmovapd %%ymm2, (%%r12) \n\t" - //"addq %%rdi, %%r12 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DBETAZERO: \n\t" - " \n\t" // check if aligned/row-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
- "jne .DROWSTORBZ \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovapd %%ymm4, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm7, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm10, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm13, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c - " \n\t" - " \n\t" - "vmovapd %%ymm5, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm8, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm11, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm14, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "movq %%r12, %%rcx \n\t" // rcx = c + 8*cs_c - " \n\t" - " \n\t" - "vmovapd %%ymm6, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm9, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm12, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovapd %%ymm15, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - //"addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - " \n\t" - ".DROWSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovapd %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovapd %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - "vmovapd %%ymm6, (%%r12) \n\t" - "addq %%rdi, %%r12 \n\t" - " \n\t" - "vmovapd %%ymm7, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovapd %%ymm8, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - "vmovapd %%ymm9, (%%r12) \n\t" - "addq %%rdi, %%r12 \n\t" - " \n\t" - "vmovapd %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovapd %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - "vmovapd %%ymm12, (%%r12) \n\t" - "addq %%rdi, %%r12 \n\t" - " \n\t" - "vmovapd %%ymm13, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vmovapd %%ymm14, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - "vmovapd %%ymm15, (%%r12) \n\t" - //"addq %%rdi, %%r12 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DDONE: \n\t" - " \n\t" - - : // output operands (none) - : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 - : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "memory" - ); + : // output operands (none) + : // input operands + "m" (k_iter), // 0 + "m" (k_left), // 1 + "m" (a), // 2 + "m" (b), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (c), // 6 + "m" (rs_c), // 7 + "m" (cs_c)/*, // 8 + "m" (b_next), // 9 + "m" (a_next)*/ // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); } - diff --git a/step3/kernels/bl_dgemm_asm_6x8.c b/step3/kernels/bl_dgemm_asm_6x8.c index cdf8b33..8839bb4 100644 --- a/step3/kernels/bl_dgemm_asm_6x8.c +++ b/step3/kernels/bl_dgemm_asm_6x8.c @@ -1,624 +1,584 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - #include "bl_dgemm_kernel.h" -#define inc_t unsigned long long - +#define inc_t unsigned long long -#define DGEMM_INPUT_GS_BETA_NZ \ - "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ - "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ - "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ - "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ - "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /*\ - "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ - "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ - "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ - "vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ - "vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/ +#define DGEMM_INPUT_GS_BETA_NZ \ + "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ + "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ + "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ + "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ + "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /* \ + "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ + "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ + "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ + "vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ + "vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/ -#define DGEMM_OUTPUT_GS_BETA_NZ \ - "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \ - "vmovlpd %%xmm0, (%%rcx ) \n\t" \ - "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /*\ - "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ - "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ - "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ +#define DGEMM_OUTPUT_GS_BETA_NZ \ + "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \ + "vmovlpd %%xmm0, (%%rcx ) \n\t" \ + "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ + "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ + "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /* \ + "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ + "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ + "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ + "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ + "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ -void bl_dgemm_asm_6x8 - ( - int k, - double* a, - double* b, - double* c, - inc_t ldc, - aux_t* data - ) -{ - //void* a_next = bli_auxinfo_next_a( data ); - //void* b_next = bli_auxinfo_next_b( data ); +void bl_dgemm_asm_6x8( + int k, + double* a, + double* b, + double* c, + inc_t ldc, + aux_t* data) { + // void* a_next = bli_auxinfo_next_a( data ); + // void* b_next = bli_auxinfo_next_b( data ); - //uint64_t k_iter = k / 4; - //uint64_t k_left = k % 4; + // uint64_t k_iter = k / 4; + // uint64_t k_left = k % 4; - const inc_t cs_c = ldc; - const inc_t rs_c = 1; - double alpha_val = 1.0, beta_val = 1.0; - double *alpha, *beta; + const inc_t cs_c = ldc; + const inc_t rs_c = 1; + double alpha_val = 1.0, beta_val = 1.0; + double *alpha, *beta; - alpha = &alpha_val; - beta = &beta_val; + alpha = &alpha_val; + beta = &beta_val; - dim_t k_iter = (unsigned long long)k / 4; - dim_t k_left = (unsigned long long)k % 
4; + dim_t k_iter = (unsigned long long)k / 4; + dim_t k_left = (unsigned long long)k % 4; + __asm__ volatile + ( + " \n\t" + "vzeroall \n\t" // zero all xmm/ymm registers. + " \n\t" + " \n\t" + "movq %2, %%rax \n\t" // load address of a. + "movq %3, %%rbx \n\t" // load address of b. + //"movq %9, %%r15 \n\t" // load address of b_next. + " \n\t" + "addq $32 * 4, %%rbx \n\t" + " \n\t" // initialize loop by pre-loading + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + "movq %6, %%rcx \n\t" // load address of c + "movq %7, %%rdi \n\t" // load rs_c + "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(double) + " \n\t" + "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*rs_c; + "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c + 3*rs_c; + "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c + "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*rs_c + "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*rs_c + "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c + 3*rs_c + "prefetcht0 7 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 4*rs_c + "prefetcht0 7 * 8(%%rdx,%%rdi,2) \n\t" // prefetch c + 5*rs_c + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %0, %%rsi \n\t" // i = k_iter; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that + " \n\t" // contains the k_left loop. + " \n\t" + " \n\t" + ".DLOOPKITER: \n\t" // MAIN LOOP + " \n\t" + " \n\t" + " \n\t" // iteration 0 + "prefetcht0 64 * 8(%%rax) \n\t" + " \n\t" + "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps -2 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 1 + "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 2 + "prefetcht0 76 * 8(%%rax) \n\t" + " \n\t" + "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, 
%%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 2 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" // iteration 3 + "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $4 * 6 * 8, %%rax \n\t" // a += 4*6 (unroll x mr) + "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .DLOOPKITER \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".DCONSIDKLEFT: \n\t" + " \n\t" + "movq %1, %%rsi \n\t" // i = k_left; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. + " \n\t" // else, we prepare to enter k_left loop. + " \n\t" + " \n\t" + ".DLOOPKLEFT: \n\t" // EDGE LOOP + " \n\t" + "prefetcht0 64 * 8(%%rax) \n\t" + " \n\t" + "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" + "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $1 * 6 * 8, %%rax \n\t" // a += 1*6 (unroll x mr) + "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. 
+ " \n\t" + " \n\t" + " \n\t" + ".DPOSTACCUM: \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %4, %%rax \n\t" // load address of alpha + "movq %5, %%rbx \n\t" // load address of beta + "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate + "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate + " \n\t" + "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha + "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" + "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" + "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" + "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" + "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" + "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" + "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %8, %%rsi \n\t" // load cs_c + "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(double) + " \n\t" + "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*cs_c; + " \n\t" + "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; + //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c; + //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; + " \n\t" + " \n\t" + " \n\t" // now avoid loading C if beta == 0 + " \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. + "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. + "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case + " \n\t" + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + "jz .DROWSTORED \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".DGENSTORED: \n\t" + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + " \n\t" + " \n\t" + "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + " \n\t" + " \n\t" 
+ " \n\t" + "jmp .DDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".DROWSTORED: \n\t" + " \n\t" + " \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm4 \n\t" + "vmovups %%ymm4, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm5 \n\t" + "vmovups %%ymm5, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm6 \n\t" + "vmovups %%ymm6, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm7 \n\t" + "vmovups %%ymm7, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm8 \n\t" + "vmovups %%ymm8, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm9 \n\t" + "vmovups %%ymm9, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm10 \n\t" + "vmovups %%ymm10, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm11 \n\t" + "vmovups %%ymm11, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm12 \n\t" + "vmovups %%ymm12, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm13 \n\t" + "vmovups %%ymm13, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vfmadd231pd (%%rcx), %%ymm3, %%ymm14 \n\t" + "vmovups %%ymm14, (%%rcx) \n\t" + //"addq %%rdi, %%rcx \n\t" + "vfmadd231pd (%%rdx), %%ymm3, %%ymm15 \n\t" + "vmovups %%ymm15, (%%rdx) \n\t" + //"addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .DDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".DBETAZERO: \n\t" + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. + "jz .DROWSTORBZ \n\t" // jump to row storage case + " \n\t" + " \n\t" + " \n\t" + ".DGENSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm4, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm6, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm8, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm10, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm12, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm14, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + " \n\t" + " \n\t" + "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c + " \n\t" + " \n\t" + "vmovaps %%ymm5, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm7, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm9, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm11, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm13, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += rs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm15, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + " \n\t" + " \n\t" + " \n\t" + "jmp .DDONE \n\t" // jump to end. 
+ " \n\t" + " \n\t" + " \n\t" + ".DROWSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovups %%ymm4, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovups %%ymm5, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + "vmovups %%ymm6, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovups %%ymm7, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vmovups %%ymm8, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovups %%ymm9, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vmovups %%ymm10, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovups %%ymm11, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vmovups %%ymm12, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovups %%ymm13, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vmovups %%ymm14, (%%rcx) \n\t" + //"addq %%rdi, %%rcx \n\t" + "vmovups %%ymm15, (%%rdx) \n\t" + //"addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".DDONE: \n\t" + " \n\t" - __asm__ volatile - ( - " \n\t" - "vzeroall \n\t" // zero all xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - //"movq %9, %%r15 \n\t" // load address of b_next. - " \n\t" - "addq $32 * 4, %%rbx \n\t" - " \n\t" // initialize loop by pre-loading - "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %7, %%rdi \n\t" // load rs_c - "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(double) - " \n\t" - "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*rs_c; - "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c + 3*rs_c; - "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c - "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*rs_c - "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*rs_c - "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c + 3*rs_c - "prefetcht0 7 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 4*rs_c - "prefetcht0 7 * 8(%%rdx,%%rdi,2) \n\t" // prefetch c + 5*rs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. 
- " \n\t" - " \n\t" - ".DLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 64 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps -2 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 76 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 2 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - 
"vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $4 * 6 * 8, %%rax \n\t" // a += 4*6 (unroll x mr) - "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".DLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 64 * 8(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $1 * 6 * 8, %%rax \n\t" // a += 1*6 (unroll x mr) - "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - ".DPOSTACCUM: \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate - "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha - "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" - "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" - "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" - "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" - "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" - "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" - "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" - "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" - "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" - "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %8, %%rsi \n\t" // load cs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(double) - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*cs_c; - " \n\t" - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; - //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c; - //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; - " \n\t" - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. 
- "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. - "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. - "jz .DROWSTORED \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORED: \n\t" - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - " \n\t" - ".DROWSTORED: \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm4 \n\t" - "vmovups %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm5 \n\t" - "vmovups %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm6 \n\t" - "vmovups %%ymm6, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm7 \n\t" - "vmovups %%ymm7, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm8 \n\t" - "vmovups %%ymm8, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm9 \n\t" - "vmovups %%ymm9, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm10 \n\t" - "vmovups %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm11 \n\t" - "vmovups %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm12 \n\t" - "vmovups %%ymm12, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm13 \n\t" - "vmovups %%ymm13, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vfmadd231pd (%%rcx), %%ymm3, %%ymm14 \n\t" - "vmovups %%ymm14, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vfmadd231pd (%%rdx), %%ymm3, %%ymm15 \n\t" - "vmovups %%ymm15, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DBETAZERO: \n\t" - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. - "jz .DROWSTORBZ \n\t" // jump to row storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm4, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm6, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm8, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm10, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm12, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm14, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c - " \n\t" - " \n\t" - "vmovaps %%ymm5, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm7, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm9, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm11, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm13, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += rs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm15, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - " \n\t" - ".DROWSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovups %%ymm6, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm7, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm8, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm9, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm12, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovups %%ymm13, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovups %%ymm14, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vmovups %%ymm15, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DDONE: \n\t" - " \n\t" - - : // output operands (none) - : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 - : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "memory" - ); + : // output operands (none) + : // input operands + "m" (k_iter), // 0 + "m" (k_left), // 1 + "m" (a), // 2 + "m" (b), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (c), // 6 + "m" (rs_c), // 7 + "m" (cs_c)/*, // 8 + "m" (b_next), // 9 + "m" (a_next)*/ // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); } - diff --git a/step3/kernels/bl_dgemm_asm_8x4.c b/step3/kernels/bl_dgemm_asm_8x4.c index 376b778..eeba22d 100644 --- a/step3/kernels/bl_dgemm_asm_8x4.c +++ b/step3/kernels/bl_dgemm_asm_8x4.c @@ -1,396 +1,495 @@ #include "bl_dgemm_kernel.h" - void bl_dgemm_asm_8x4( - int k, - double *a, - double *b, - double *c, + int k, + double* a, + double* b, + double* c, unsigned long long ldc, - aux_t *aux - ) -{ - unsigned long long k_iter = (unsigned long long)k / 4; - unsigned long long k_left = (unsigned long long)k % 4; + aux_t* aux) { + unsigned long long k_iter = (unsigned long long)k / 4; + unsigned long long k_left = (unsigned long long)k % 4; - __asm__ volatile - ( - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. ( v ) - "movq %3, %%rbx \n\t" // load address of b. ( v ) - "movq %5, %%r15 \n\t" // load address of b_next. ( v ) - "addq $-4 * 64, %%r15 \n\t" // ( ? ) - " \n\t" - "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading - "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" // elements of a and b. 
- "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" - " \n\t" - " \n\t" - "movq %4, %%rcx \n\t" // load address of c - "movq %6, %%rdi \n\t" // load ldc - "leaq (,%%rdi,8), %%rdi \n\t" // ldc * sizeof(double) - "leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2 * ldc; - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "prefetcht0 3 * 8(%%rcx) \n\t" // prefetch c + 0 * ldc - "prefetcht0 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1 * ldc - "prefetcht0 3 * 8(%%r10) \n\t" // prefetch c + 2 * ldc - "prefetcht0 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3 * ldc - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "vxorpd %%ymm8, %%ymm8, %%ymm8 \n\t" // set ymm8 to 0 ( v ) - "vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" - "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" - "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" - "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" - "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; ( v ) - "testq %%rsi, %%rsi \n\t" // check i via logical AND. ( v ) - "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that ( v ) - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".DLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - "addq $4 * 4 * 8, %%r15 \n\t" // b_next += 4*4 (unroll x nr) ( v ) - " \n\t" - " \n\t" // iteration 0 - "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter 0 - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" // ymm6 ( c_tmp0 ) = ymm0 ( a03 ) * ymm2( b0 ) - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" // ymm4 ( b0x3_0 ) - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" // ymm7 ( c_tmp1 ) = ymm0 ( a03 ) * ymm3( b0x5 ) - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" // ymm5 ( b0x3_1 ) - "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" // ymm15 ( c_03_0 ) += ymm6( c_tmp0 ) - "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" // ymm13 ( c_03_1 ) += ymm7( c_tmp1 ) - " \n\t" - "prefetcht0 16 * 32(%%rax) \n\t" // prefetch a03 for iter 1 - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 1 - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" - "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" - "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 2 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter 1 - "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" - "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" - "prefetcht0 0 * 32(%%r15) \n\t" // prefetch b_next[0*4] - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" - "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 1 - "vmovapd 3 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter 1 - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" - "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - "prefetcht0 18 * 32(%%rax) \n\t" // prefetch a for iter 9 ( ? 
) - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovapd 2 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 2 - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" - "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" - "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 4 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter 2 - "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" - "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" - "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 2 - "vmovapd 5 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter 2 - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" - "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - "prefetcht0 20 * 32(%%rax) \n\t" // prefetch a for iter 10 ( ? ) - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovapd 3 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 3 - "addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr) - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" - "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" - "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 6 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter 3 - "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" - "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" - "prefetcht0 2 * 32(%%r15) \n\t" // prefetch b_next[2*4] - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" - "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 3 - "vmovapd 7 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter 3 - "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr) - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" - "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - "prefetcht0 14 * 32(%%rax) \n\t" // prefetch a for iter 11 ( ? ) - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 4 - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" - "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" - "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter 4 - "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" - "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" - "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. 
- " \n\t" - " \n\t" - ".DLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" // preload a47 - "addq $8 * 1 * 8, %%rax \n\t" // a += 8 (1 x mr) - "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" - "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" - "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" - "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" - "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" - "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - "prefetcht0 14 * 32(%%rax) \n\t" // prefetch a03 for iter 7 later ( ? ) - "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" - "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t" - "addq $4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr) - "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" - "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" - "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" - "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" - "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" - "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" - "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" - " \n\t" - "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" - "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" - "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" - "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - ".DPOSTACCUM: \n\t" - " \n\t" - " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab11 ab10 ab13 ab12 - " \n\t" // ab22 ab23 ab20 ab21 - " \n\t" // ab33 ) ab32 ) ab31 ) ab30 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43 - " \n\t" // ab51 ab50 ab53 ab52 - " \n\t" // ab62 ab63 ab60 ab61 - " \n\t" // ab73 ) ab72 ) ab71 ) ab70 ) - " \n\t" - "vmovapd %%ymm15, %%ymm7 \n\t" - "vshufpd $0xa, %%ymm15, %%ymm13, %%ymm15 \n\t" - "vshufpd $0xa, %%ymm13, %%ymm7, %%ymm13 \n\t" - " \n\t" - "vmovapd %%ymm11, %%ymm7 \n\t" - "vshufpd $0xa, %%ymm11, %%ymm9, %%ymm11 \n\t" - "vshufpd $0xa, %%ymm9, %%ymm7, %%ymm9 \n\t" - " \n\t" - "vmovapd %%ymm14, %%ymm7 \n\t" - "vshufpd $0xa, %%ymm14, %%ymm12, %%ymm14 \n\t" - "vshufpd $0xa, %%ymm12, %%ymm7, %%ymm12 \n\t" - " \n\t" - "vmovapd %%ymm10, %%ymm7 \n\t" - "vshufpd $0xa, %%ymm10, %%ymm8, %%ymm10 \n\t" - "vshufpd $0xa, %%ymm8, %%ymm7, %%ymm8 \n\t" - " \n\t" - " \n\t" // ymm15: ymm13: ymm11: ymm9: - " \n\t" // ( ab01 ( ab00 ( ab03 ( ab02 - " \n\t" // ab11 ab10 ab13 ab12 - " \n\t" // ab23 ab22 ab21 ab20 - " \n\t" // ab33 ) ab32 ) ab31 ) ab30 ) - " \n\t" - " \n\t" // ymm14: ymm12: ymm10: ymm8: - " \n\t" // ( ab41 ( ab40 ( ab43 ( ab42 - " \n\t" // ab51 ab50 ab53 ab52 - " \n\t" // ab63 ab62 ab61 ab60 - " \n\t" // ab73 ) ab72 ) ab71 ) ab70 ) - " \n\t" - "vmovapd %%ymm15, %%ymm7 \n\t" - "vperm2f128 $0x30, %%ymm15, %%ymm11, %%ymm15 \n\t" - "vperm2f128 $0x12, %%ymm7, %%ymm11, %%ymm11 \n\t" - " \n\t" - "vmovapd %%ymm13, %%ymm7 \n\t" - "vperm2f128 $0x30, %%ymm13, %%ymm9, %%ymm13 \n\t" - "vperm2f128 $0x12, %%ymm7, %%ymm9, %%ymm9 \n\t" - " \n\t" - "vmovapd %%ymm14, %%ymm7 \n\t" - "vperm2f128 $0x30, %%ymm14, %%ymm10, %%ymm14 \n\t" - "vperm2f128 $0x12, %%ymm7, %%ymm10, %%ymm10 \n\t" - " \n\t" - "vmovapd %%ymm12, %%ymm7 \n\t" - "vperm2f128 $0x30, %%ymm12, %%ymm8, %%ymm12 \n\t" - "vperm2f128 $0x12, %%ymm7, %%ymm8, %%ymm8 \n\t" - " \n\t" - " \n\t" // ymm9: ymm11: ymm13: ymm15: - " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03 - " \n\t" // ab10 ab11 ab12 ab13 - " \n\t" // ab20 ab21 ab22 ab23 - " \n\t" // ab30 ) ab31 ) ab32 ) ab33 ) - " \n\t" - " \n\t" // ymm8: ymm10: ymm12: ymm14: - " \n\t" // ( ab40 ( ab41 ( 
ab42 ( ab43 - " \n\t" // ab50 ab51 ab52 ab53 - " \n\t" // ab60 ab61 ab62 ab63 - " \n\t" // ab70 ) ab71 ) ab72 ) ab73 ) - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rcx \n\t" // load address of c - "movq %6, %%rdi \n\t" // load ldc - "leaq (,%%rdi,8), %%rdi \n\t" // rsi = ldc * sizeof(double) - " \n\t" - " \n\t" - "vmovapd 0 * 32(%%rcx), %%ymm0 \n\t" // ymm0 = C_c( 0:3, 0 ) - "vaddpd %%ymm9, %%ymm0, %%ymm9 \n\t" // ymm0 += ymm9 - "vmovapd 1 * 32(%%rcx), %%ymm1 \n\t" // ymm0 = C_c( 4:7, 0 ) - "vaddpd %%ymm8, %%ymm1, %%ymm8 \n\t" // ymm0 += ymm8 - " \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" - "vmovapd 0 * 32(%%rcx), %%ymm2 \n\t" // ymm0 = C_c( 0:3, 1 ) - "vaddpd %%ymm11, %%ymm2, %%ymm11 \n\t" // ymm0 += ymm11 - "vmovapd 1 * 32(%%rcx), %%ymm3 \n\t" // ymm0 = C_c( 4:7, 1 ) - "vaddpd %%ymm10, %%ymm3, %%ymm10 \n\t" // ymm0 += ymm10 - " \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" - "vmovapd 0 * 32(%%rcx), %%ymm4 \n\t" // ymm0 = C_c( 0:3, 2 ) - "vaddpd %%ymm13, %%ymm4, %%ymm13 \n\t" // ymm0 += ymm13 - "vmovapd 1 * 32(%%rcx), %%ymm5 \n\t" // ymm0 = C_c( 4:7, 2 ) - "vaddpd %%ymm12, %%ymm5, %%ymm12 \n\t" // ymm0 += ymm12 - " \n\t" - "addq %%rdi, %%rcx \n\t" - " \n\t" - "vmovapd 0 * 32(%%rcx), %%ymm6 \n\t" // ymm0 = C_c( 0:3, 3 ) - "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" // ymm0 += ymm15 - "vmovapd 1 * 32(%%rcx), %%ymm7 \n\t" // ymm0 = C_c( 4:7, 3 ) - "vaddpd %%ymm14, %%ymm7, %%ymm14 \n\t" // ymm0 += ymm14 - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".STOREBACK: \n\t" - " \n\t" - "movq %4, %%rcx \n\t" // load address of c - "movq %6, %%rdi \n\t" // load address of ldc - "leaq (,%%rdi,8), %%rdi \n\t" // rsi = ldc * sizeof(double) - " \n\t" - "vmovapd %%ymm9, 0(%%rcx) \n\t" // C_c( 0, 0:3 ) = ymm9 - "vmovapd %%ymm8, 32(%%rcx) \n\t" // C_c( 1, 0:3 ) = ymm8 - "addq %%rdi, %%rcx \n\t" - "vmovapd %%ymm11, 0(%%rcx) \n\t" // C_c( 2, 0:3 ) = ymm11 - "vmovapd %%ymm10, 32(%%rcx) \n\t" // C_c( 3, 0:3 ) = ymm10 - "addq %%rdi, %%rcx \n\t" - "vmovapd %%ymm13, 0(%%rcx) \n\t" // C_c( 4, 0:3 ) = ymm13 - "vmovapd %%ymm12, 32(%%rcx) \n\t" // C_c( 5, 0:3 ) = ymm12 - "addq %%rdi, %%rcx \n\t" - "vmovapd %%ymm15, 0(%%rcx) \n\t" // C_c( 6, 0:3 ) = ymm15 - "vmovapd %%ymm14, 32(%%rcx) \n\t" // C_c( 7, 0:3 ) = ymm14 - " \n\t" - ".DDONE: \n\t" - " \n\t" - : // output operands (none) - : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (c), // 4 - "m" (aux->b_next), // 5 - "m" (ldc) // 6 - : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "memory" - ); + __asm__ volatile( + " \n\t" + " \n\t" + "movq %2, %%rax \n\t" // load address of a. + // ( v ) + "movq %3, %%rbx \n\t" // load address of b. + // ( v ) + "movq %5, %%r15 \n\t" // load address of + // b_next. ( v + // ) + "addq $-4 * 64, %%r15 \n\t" // ( ? ) + " \n\t" + "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by + // pre-loading + "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" // elements of a and b. 
+ "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" + " \n\t" + " \n\t" + "movq %4, %%rcx \n\t" // load address of c + "movq %6, %%rdi \n\t" // load ldc + "leaq (,%%rdi,8), %%rdi \n\t" // ldc * sizeof(double) + "leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + + // 2 * ldc; + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "prefetcht0 3 * 8(%%rcx) \n\t" // prefetch c + 0 * ldc + "prefetcht0 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1 * ldc + "prefetcht0 3 * 8(%%r10) \n\t" // prefetch c + 2 * ldc + "prefetcht0 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3 * ldc + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "vxorpd %%ymm8, %%ymm8, %%ymm8 \n\t" // set ymm8 to 0 ( v ) + "vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" + "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" + "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" + "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" + "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %0, %%rsi \n\t" // i = k_iter; ( v ) + "testq %%rsi, %%rsi \n\t" // check i via logical + // AND. ( v ) + "je .DCONSIDKLEFT \n\t" // if i == 0, jump to + // code that ( v ) + " \n\t" // contains the k_left + // loop. + " \n\t" + " \n\t" + ".DLOOPKITER: \n\t" // MAIN LOOP + " \n\t" + "addq $4 * 4 * 8, %%r15 \n\t" // b_next += 4*4 + // (unroll x nr) ( + // v ) + " \n\t" + " \n\t" // iteration 0 + "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter + // 0 + "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" // ymm6 ( c_tmp0 ) = + // ymm0 ( a03 ) * ymm2( + // b0 ) + "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" // ymm4 ( b0x3_0 ) + "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" // ymm7 ( c_tmp1 ) = + // ymm0 ( a03 ) * ymm3( + // b0x5 ) + "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" // ymm5 ( b0x3_1 ) + "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" // ymm15 ( c_03_0 ) += + // ymm6( c_tmp0 ) + "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" // ymm13 ( c_03_1 ) += + // ymm7( c_tmp1 ) + " \n\t" + "prefetcht0 16 * 32(%%rax) \n\t" // prefetch a03 for + // iter 1 + "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" + "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 1 + "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" + "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" + "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" + "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" + " \n\t" + "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" + "vmovapd 2 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter + // 1 + "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" + "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" + "prefetcht0 0 * 32(%%r15) \n\t" // prefetch b_next[0*4] + " \n\t" + "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" + "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" + "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" + "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" + " \n\t" + " \n\t" + " \n\t" // iteration 1 + "vmovapd 3 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter + // 1 + "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" + "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" + "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" + "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" + "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" + "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" + " \n\t" + "prefetcht0 18 * 32(%%rax) \n\t" // prefetch a for iter + // 9 ( ? 
) + "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" + "vmovapd 2 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 2 + "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" + "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" + "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" + "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" + " \n\t" + "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" + "vmovapd 4 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter + // 2 + "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" + "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" + " \n\t" + "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" + "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" + "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" + "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" + " \n\t" + " \n\t" + " \n\t" // iteration 2 + "vmovapd 5 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter + // 2 + "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" + "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" + "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" + "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" + "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" + "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" + " \n\t" + "prefetcht0 20 * 32(%%rax) \n\t" // prefetch a for iter + // 10 ( ? ) + "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" + "vmovapd 3 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 3 + "addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x + // nr) + "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" + "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" + "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" + "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" + " \n\t" + "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" + "vmovapd 6 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter + // 3 + "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" + "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" + "prefetcht0 2 * 32(%%r15) \n\t" // prefetch b_next[2*4] + " \n\t" + "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" + "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" + "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" + "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" + " \n\t" + " \n\t" + " \n\t" // iteration 3 + "vmovapd 7 * 32(%%rax), %%ymm1 \n\t" // preload a47 for iter + // 3 + "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x + // mr) + "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" + "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" + "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" + "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" + "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" + "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" + " \n\t" + "prefetcht0 14 * 32(%%rax) \n\t" // prefetch a for iter + // 11 ( ? ) + "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" + "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" // preload b for iter 4 + "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" + "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" + "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" + "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" + " \n\t" + "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" + "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" // preload a03 for iter + // 4 + "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" + "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" + " \n\t" + "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" + "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" + "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" + "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .DLOOPKITER \n\t" // iterate again if i + // != 0. + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".DCONSIDKLEFT: \n\t" + " \n\t" + "movq %1, %%rsi \n\t" // i = k_left; + "testq %%rsi, %%rsi \n\t" // check i via logical + // AND. + "je .DPOSTACCUM \n\t" // if i == 0, we're + // done; jump to end. + " \n\t" // else, we prepare to + // enter k_left loop. 
+ " \n\t" + " \n\t" + ".DLOOPKLEFT: \n\t" // EDGE LOOP + " \n\t" + "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" // preload a47 + "addq $8 * 1 * 8, %%rax \n\t" // a += 8 (1 x mr) + "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t" + "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t" + "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t" + "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t" + "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" + "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t" + " \n\t" + "prefetcht0 14 * 32(%%rax) \n\t" // prefetch a03 for + // iter 7 later ( ? ) + "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t" + "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t" + "addq $4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr) + "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t" + "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t" + "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t" + "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t" + " \n\t" + "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t" + "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" + "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t" + "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t" + " \n\t" + "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t" + "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t" + "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t" + "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .DLOOPKLEFT \n\t" // iterate again if i + // != 0. + " \n\t" + " \n\t" + " \n\t" + ".DPOSTACCUM: \n\t" + " \n\t" + " \n\t" + " \n\t" // ymm15: ymm13: + // ymm11: ymm9: + " \n\t" // ( ab00 ( ab01 ( + // ab02 ( ab03 + " \n\t" // ab11 ab10 ab13 + // ab12 + " \n\t" // ab22 ab23 ab20 + // ab21 + " \n\t" // ab33 ) ab32 ) + // ab31 ) ab30 ) + " \n\t" + " \n\t" // ymm14: ymm12: + // ymm10: ymm8: + " \n\t" // ( ab40 ( ab41 ( + // ab42 ( ab43 + " \n\t" // ab51 ab50 ab53 + // ab52 + " \n\t" // ab62 ab63 ab60 + // ab61 + " \n\t" // ab73 ) ab72 ) + // ab71 ) ab70 ) + " \n\t" + "vmovapd %%ymm15, %%ymm7 \n\t" + "vshufpd $0xa, %%ymm15, %%ymm13, %%ymm15 \n\t" + "vshufpd $0xa, %%ymm13, %%ymm7, %%ymm13 \n\t" + " \n\t" + "vmovapd %%ymm11, %%ymm7 \n\t" + "vshufpd $0xa, %%ymm11, %%ymm9, %%ymm11 \n\t" + "vshufpd $0xa, %%ymm9, %%ymm7, %%ymm9 \n\t" + " \n\t" + "vmovapd %%ymm14, %%ymm7 \n\t" + "vshufpd $0xa, %%ymm14, %%ymm12, %%ymm14 \n\t" + "vshufpd $0xa, %%ymm12, %%ymm7, %%ymm12 \n\t" + " \n\t" + "vmovapd %%ymm10, %%ymm7 \n\t" + "vshufpd $0xa, %%ymm10, %%ymm8, %%ymm10 \n\t" + "vshufpd $0xa, %%ymm8, %%ymm7, %%ymm8 \n\t" + " \n\t" + " \n\t" // ymm15: ymm13: + // ymm11: ymm9: + " \n\t" // ( ab01 ( ab00 ( + // ab03 ( ab02 + " \n\t" // ab11 ab10 ab13 + // ab12 + " \n\t" // ab23 ab22 ab21 + // ab20 + " \n\t" // ab33 ) ab32 ) + // ab31 ) ab30 ) + " \n\t" + " \n\t" // ymm14: ymm12: + // ymm10: ymm8: + " \n\t" // ( ab41 ( ab40 ( + // ab43 ( ab42 + " \n\t" // ab51 ab50 ab53 + // ab52 + " \n\t" // ab63 ab62 ab61 + // ab60 + " \n\t" // ab73 ) ab72 ) + // ab71 ) ab70 ) + " \n\t" + "vmovapd %%ymm15, %%ymm7 \n\t" + "vperm2f128 $0x30, %%ymm15, %%ymm11, %%ymm15 \n\t" + "vperm2f128 $0x12, %%ymm7, %%ymm11, %%ymm11 \n\t" + " \n\t" + "vmovapd %%ymm13, %%ymm7 \n\t" + "vperm2f128 $0x30, %%ymm13, %%ymm9, %%ymm13 \n\t" + "vperm2f128 $0x12, %%ymm7, %%ymm9, %%ymm9 \n\t" + " \n\t" + "vmovapd %%ymm14, %%ymm7 \n\t" + "vperm2f128 $0x30, %%ymm14, %%ymm10, %%ymm14 \n\t" + "vperm2f128 $0x12, %%ymm7, %%ymm10, %%ymm10 \n\t" + " \n\t" + "vmovapd %%ymm12, %%ymm7 \n\t" + "vperm2f128 $0x30, %%ymm12, %%ymm8, %%ymm12 \n\t" + "vperm2f128 $0x12, %%ymm7, %%ymm8, %%ymm8 \n\t" + " \n\t" + " \n\t" // ymm9: ymm11: + // ymm13: ymm15: + " \n\t" // ( ab00 ( ab01 ( + // ab02 ( ab03 + " \n\t" // ab10 ab11 ab12 + // ab13 + " \n\t" // ab20 ab21 ab22 + 
// ab23 + " \n\t" // ab30 ) ab31 ) + // ab32 ) ab33 ) + " \n\t" + " \n\t" // ymm8: ymm10: + // ymm12: ymm14: + " \n\t" // ( ab40 ( ab41 ( + // ab42 ( ab43 + " \n\t" // ab50 ab51 ab52 + // ab53 + " \n\t" // ab60 ab61 ab62 + // ab63 + " \n\t" // ab70 ) ab71 ) + // ab72 ) ab73 ) + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %4, %%rcx \n\t" // load address of c + "movq %6, %%rdi \n\t" // load ldc + "leaq (,%%rdi,8), %%rdi \n\t" // rsi = ldc * + // sizeof(double) + " \n\t" + " \n\t" + "vmovapd 0 * 32(%%rcx), %%ymm0 \n\t" // ymm0 = C_c( 0:3, 0 ) + "vaddpd %%ymm9, %%ymm0, %%ymm9 \n\t" // ymm0 += ymm9 + "vmovapd 1 * 32(%%rcx), %%ymm1 \n\t" // ymm0 = C_c( 4:7, 0 ) + "vaddpd %%ymm8, %%ymm1, %%ymm8 \n\t" // ymm0 += ymm8 + " \n\t" + "addq %%rdi, %%rcx \n\t" + " \n\t" + "vmovapd 0 * 32(%%rcx), %%ymm2 \n\t" // ymm0 = C_c( 0:3, 1 ) + "vaddpd %%ymm11, %%ymm2, %%ymm11 \n\t" // ymm0 += ymm11 + "vmovapd 1 * 32(%%rcx), %%ymm3 \n\t" // ymm0 = C_c( 4:7, 1 ) + "vaddpd %%ymm10, %%ymm3, %%ymm10 \n\t" // ymm0 += ymm10 + " \n\t" + "addq %%rdi, %%rcx \n\t" + " \n\t" + "vmovapd 0 * 32(%%rcx), %%ymm4 \n\t" // ymm0 = C_c( 0:3, 2 ) + "vaddpd %%ymm13, %%ymm4, %%ymm13 \n\t" // ymm0 += ymm13 + "vmovapd 1 * 32(%%rcx), %%ymm5 \n\t" // ymm0 = C_c( 4:7, 2 ) + "vaddpd %%ymm12, %%ymm5, %%ymm12 \n\t" // ymm0 += ymm12 + " \n\t" + "addq %%rdi, %%rcx \n\t" + " \n\t" + "vmovapd 0 * 32(%%rcx), %%ymm6 \n\t" // ymm0 = C_c( 0:3, 3 ) + "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t" // ymm0 += ymm15 + "vmovapd 1 * 32(%%rcx), %%ymm7 \n\t" // ymm0 = C_c( 4:7, 3 ) + "vaddpd %%ymm14, %%ymm7, %%ymm14 \n\t" // ymm0 += ymm14 + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".STOREBACK: \n\t" + " \n\t" + "movq %4, %%rcx \n\t" // load address of c + "movq %6, %%rdi \n\t" // load address of ldc + "leaq (,%%rdi,8), %%rdi \n\t" // rsi = ldc * + // sizeof(double) + " \n\t" + "vmovapd %%ymm9, 0(%%rcx) \n\t" // C_c( 0, 0:3 ) = + // ymm9 + "vmovapd %%ymm8, 32(%%rcx) \n\t" // C_c( 1, 0:3 ) = + // ymm8 + "addq %%rdi, %%rcx \n\t" + "vmovapd %%ymm11, 0(%%rcx) \n\t" // C_c( 2, 0:3 ) = + // ymm11 + "vmovapd %%ymm10, 32(%%rcx) \n\t" // C_c( 3, 0:3 ) = + // ymm10 + "addq %%rdi, %%rcx \n\t" + "vmovapd %%ymm13, 0(%%rcx) \n\t" // C_c( 4, 0:3 ) = + // ymm13 + "vmovapd %%ymm12, 32(%%rcx) \n\t" // C_c( 5, 0:3 ) = + // ymm12 + "addq %%rdi, %%rcx \n\t" + "vmovapd %%ymm15, 0(%%rcx) \n\t" // C_c( 6, 0:3 ) = + // ymm15 + "vmovapd %%ymm14, 32(%%rcx) \n\t" // C_c( 7, 0:3 ) = + // ymm14 + " \n\t" + ".DDONE: \n\t" + " \n\t" + : // output operands (none) + : // input operands + "m"(k_iter), // 0 + "m"(k_left), // 1 + "m"(a), // 2 + "m"(b), // 3 + "m"(c), // 4 + "m"(aux->b_next), // 5 + "m"(ldc) // 6 + : // register clobber list + "rax", + "rbx", + "rcx", + "rdx", + "rsi", + "rdi", + "r8", + "r9", + "r10", + "r11", + "r12", + "r13", + "r14", + "r15", + "xmm0", + "xmm1", + "xmm2", + "xmm3", + "xmm4", + "xmm5", + "xmm6", + "xmm7", + "xmm8", + "xmm9", + "xmm10", + "xmm11", + "xmm12", + "xmm13", + "xmm14", + "xmm15", + "memory"); - //printf( "ldc = %d\n", ldc ); - //printf( "%lf, %lf, %lf, %lf\n", c[0], c[ ldc + 0], c[ ldc * 2 + 0], c[ ldc * 3 + 0] ); - //printf( "%lf, %lf, %lf, %lf\n", c[1], c[ ldc + 1], c[ ldc * 2 + 1], c[ ldc * 3 + 1] ); - //printf( "%lf, %lf, %lf, %lf\n", c[2], c[ ldc + 2], c[ ldc * 2 + 2], c[ ldc * 3 + 2] ); - //printf( "%lf, %lf, %lf, %lf\n", c[3], c[ ldc + 3], c[ ldc * 2 + 3], c[ ldc * 3 + 3] ); - //printf( "%lf, %lf, %lf, %lf\n", c[4], c[ ldc + 4], c[ ldc * 2 + 4], c[ ldc * 3 + 4] ); - //printf( "%lf, %lf, %lf, %lf\n", c[5], c[ ldc + 5], c[ ldc * 2 + 5], c[ ldc * 3 + 5] ); 
- //printf( "%lf, %lf, %lf, %lf\n", c[6], c[ ldc + 6], c[ ldc * 2 + 6], c[ ldc * 3 + 6] ); - //printf( "%lf, %lf, %lf, %lf\n", c[7], c[ ldc + 7], c[ ldc * 2 + 7], c[ ldc * 3 + 7] ); + // printf( "ldc = %d\n", ldc ); + // printf( "%lf, %lf, %lf, %lf\n", c[0], c[ ldc + 0], c[ ldc * 2 + 0], c[ ldc + // * 3 + 0] ); printf( "%lf, %lf, %lf, %lf\n", c[1], c[ ldc + 1], c[ ldc * 2 + + // 1], c[ ldc * 3 + 1] ); printf( "%lf, %lf, %lf, %lf\n", c[2], c[ ldc + 2], + // c[ ldc * 2 + 2], c[ ldc * 3 + 2] ); printf( "%lf, %lf, %lf, %lf\n", c[3], + // c[ ldc + 3], c[ ldc * 2 + 3], c[ ldc * 3 + 3] ); printf( "%lf, %lf, %lf, + // %lf\n", c[4], c[ ldc + 4], c[ ldc * 2 + 4], c[ ldc * 3 + 4] ); printf( + // "%lf, %lf, %lf, %lf\n", c[5], c[ ldc + 5], c[ ldc * 2 + 5], c[ ldc * 3 + 5] + // ); printf( "%lf, %lf, %lf, %lf\n", c[6], c[ ldc + 6], c[ ldc * 2 + 6], c[ + // ldc + // * 3 + 6] ); printf( "%lf, %lf, %lf, %lf\n", c[7], c[ ldc + 7], c[ ldc * 2 + + // 7], c[ ldc * 3 + 7] ); } diff --git a/step3/kernels/bl_dgemm_asm_8x6.c b/step3/kernels/bl_dgemm_asm_8x6.c index 4db84bf..7f5ed39 100644 --- a/step3/kernels/bl_dgemm_asm_8x6.c +++ b/step3/kernels/bl_dgemm_asm_8x6.c @@ -1,657 +1,619 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - #include "bl_dgemm_kernel.h" -#define inc_t unsigned long long +#define inc_t unsigned long long +#define DGEMM_INPUT_GS_BETA_NZ \ + "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ + "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ + "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ + "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ + "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /* \ + "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ + "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ + "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ + "vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ + "vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/ -#define DGEMM_INPUT_GS_BETA_NZ \ - "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ - "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ - "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ - "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ - "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /*\ - "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ - "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ - "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ - "vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ - "vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/ - -#define DGEMM_OUTPUT_GS_BETA_NZ \ - "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \ - "vmovlpd %%xmm0, (%%rcx ) \n\t" \ - "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /*\ - "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ - "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ - "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ - "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ - "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ +#define DGEMM_OUTPUT_GS_BETA_NZ \ + "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \ + "vmovlpd %%xmm0, (%%rcx ) \n\t" \ + "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ + "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ + "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /* \ + "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ + "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ + "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ + "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ + "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ void bl_dgemm_asm_8x6( - int k, - double* a, - double* b, - double* c, - inc_t ldc, - aux_t* data - ) -{ - //void* a_next = bli_auxinfo_next_a( data ); - //void* b_next = bli_auxinfo_next_b( data ); + int k, + double* a, + double* b, + double* c, + inc_t ldc, + aux_t* data) { + // void* a_next = bli_auxinfo_next_a( data ); + // void* b_next = bli_auxinfo_next_b( data ); - const inc_t cs_c = ldc; - const inc_t rs_c = 1; - double alpha_val = 1.0, beta_val = 1.0; - double *alpha, *beta; + const inc_t cs_c = ldc; + const inc_t rs_c = 1; + double alpha_val = 1.0, beta_val = 1.0; + double *alpha, *beta; - alpha = &alpha_val; - beta = &beta_val; + alpha = &alpha_val; + beta = &beta_val; - dim_t k_iter = (unsigned long long)k / 4; - dim_t k_left = (unsigned long long)k % 4; + dim_t k_iter = (unsigned long long)k / 4; + dim_t k_left = (unsigned long long)k % 4; - __asm__ volatile - ( - " \n\t" - "vzeroall \n\t" // zero all xmm/ymm registers. - " \n\t" - " \n\t" - "movq %2, %%rax \n\t" // load address of a. - "movq %3, %%rbx \n\t" // load address of b. - //"movq %9, %%r15 \n\t" // load address of b_next. 
- " \n\t" - "addq $32 * 4, %%rax \n\t" - " \n\t" // initialize loop by pre-loading - "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - "movq %6, %%rcx \n\t" // load address of c - "movq %8, %%rdi \n\t" // load cs_c - "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double) - " \n\t" - "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*cs_c; - "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c + 3*cs_c; - "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c - "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c - "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c - "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c + 3*cs_c - "prefetcht0 7 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 4*cs_c - "prefetcht0 7 * 8(%%rdx,%%rdi,2) \n\t" // prefetch c + 5*cs_c - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %0, %%rsi \n\t" // i = k_iter; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that - " \n\t" // contains the k_left loop. - " \n\t" - " \n\t" - ".DLOOPKITER: \n\t" // MAIN LOOP - " \n\t" - " \n\t" - " \n\t" // iteration 0 - "prefetcht0 16 * 32(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps -2 * 32(%%rax), %%ymm0 \n\t" - "vmovaps -1 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 1 - "vbroadcastsd 6 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 7 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 8 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 9 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 10 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 11 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" - "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 2 - "prefetcht0 20 * 32(%%rax) \n\t" - " \n\t" - "vbroadcastsd 12 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 13 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 14 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 15 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - 
"vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 16 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 17 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "vmovaps 2 * 32(%%rax), %%ymm0 \n\t" - "vmovaps 3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" // iteration 3 - "vbroadcastsd 18 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 19 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 20 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 21 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 22 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 23 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr) - "addq $4 * 6 * 8, %%rbx \n\t" // b += 4*6 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKITER \n\t" // iterate again if i != 0. - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DCONSIDKLEFT: \n\t" - " \n\t" - "movq %1, %%rsi \n\t" // i = k_left; - "testq %%rsi, %%rsi \n\t" // check i via logical AND. - "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. - " \n\t" // else, we prepare to enter k_left loop. - " \n\t" - " \n\t" - ".DLOOPKLEFT: \n\t" // EDGE LOOP - " \n\t" - "prefetcht0 16 * 32(%%rax) \n\t" - " \n\t" - "vbroadcastsd 0 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" - " \n\t" - "vbroadcastsd 2 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" - " \n\t" - "vbroadcastsd 4 * 8(%%rbx), %%ymm2 \n\t" - "vbroadcastsd 5 * 8(%%rbx), %%ymm3 \n\t" - "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" - "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" - "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" - "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" - " \n\t" - "addq $1 * 8 * 8, %%rax \n\t" // a += 1*8 (unroll x mr) - "addq $1 * 6 * 8, %%rbx \n\t" // b += 1*6 (unroll x nr) - " \n\t" - "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" - "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" - " \n\t" - " \n\t" - "decq %%rsi \n\t" // i -= 1; - "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. 
- " \n\t" - " \n\t" - " \n\t" - ".DPOSTACCUM: \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %4, %%rax \n\t" // load address of alpha - "movq %5, %%rbx \n\t" // load address of beta - "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate - "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate - " \n\t" - "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha - "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" - "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" - "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" - "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" - "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" - "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" - "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" - "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" - "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" - "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" - "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - "movq %7, %%rsi \n\t" // load rs_c - "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(double) - " \n\t" - "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c; - " \n\t" - "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*rs_c; - //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*rs_c; - //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*rs_c; - " \n\t" - " \n\t" - " \n\t" - " \n\t" // determine if - " \n\t" // c % 32 == 0, AND - " \n\t" // 8*cs_c % 32 == 0, AND - " \n\t" // rs_c == 1 - " \n\t" // ie: aligned, ldim aligned, and - " \n\t" // column-stored - " \n\t" - "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. - "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); - "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. - "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); - "testq $31, %%rdi \n\t" // set ZF if (8*cs_c) & 32 is zero. - "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); - " \n\t" // and(bl,bh) followed by - " \n\t" // and(bh,al) will reveal result - " \n\t" - " \n\t" // now avoid loading C if beta == 0 - " \n\t" - "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. - "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. - "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case - " \n\t" - " \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
- "jne .DCOLSTORED \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORED: \n\t" - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - DGEMM_INPUT_GS_BETA_NZ - "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. 
- " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORED: \n\t" - " \n\t" - " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm5, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm7, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm9, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm11, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm13, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovaps (%%rcx), %%ymm0 \n\t" - "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" - "vmovaps %%ymm0, (%%rcx) \n\t" - //"addq %%rdi, %%rcx \n\t" - "vmovaps (%%rdx), %%ymm1 \n\t" - "vfmadd213pd %%ymm15, %%ymm3, %%ymm1 \n\t" - "vmovaps %%ymm1, (%%rdx) \n\t" - //"addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DBETAZERO: \n\t" - " \n\t" // check if aligned/column-stored - "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. - "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
- "jne .DCOLSTORBZ \n\t" // jump to column storage case - " \n\t" - " \n\t" - " \n\t" - ".DGENSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm4, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm6, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm8, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm10, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm12, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm14, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - "movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c - " \n\t" - " \n\t" - "vmovaps %%ymm5, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm7, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm9, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm11, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm13, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - "addq %%rdi, %%rcx \n\t" // c += cs_c; - " \n\t" - " \n\t" - "vmovaps %%ymm15, %%ymm0 \n\t" - DGEMM_OUTPUT_GS_BETA_NZ - " \n\t" - " \n\t" - " \n\t" - "jmp .DDONE \n\t" // jump to end. - " \n\t" - " \n\t" - " \n\t" - ".DCOLSTORBZ: \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm4, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm5, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - "vmovaps %%ymm6, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm7, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm8, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm9, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm10, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm11, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm12, (%%rcx) \n\t" - "addq %%rdi, %%rcx \n\t" - "vmovaps %%ymm13, (%%rdx) \n\t" - "addq %%rdi, %%rdx \n\t" - " \n\t" - " \n\t" - "vmovaps %%ymm14, (%%rcx) \n\t" - " \n\t" - "vmovaps %%ymm15, (%%rdx) \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - " \n\t" - ".DDONE: \n\t" - " \n\t" + __asm__ volatile + ( + " \n\t" + "vzeroall \n\t" // zero all xmm/ymm registers. + " \n\t" + " \n\t" + "movq %2, %%rax \n\t" // load address of a. + "movq %3, %%rbx \n\t" // load address of b. + //"movq %9, %%r15 \n\t" // load address of b_next. 
+ " \n\t" + "addq $32 * 4, %%rax \n\t" + " \n\t" // initialize loop by pre-loading + "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" + " \n\t" + "movq %6, %%rcx \n\t" // load address of c + "movq %8, %%rdi \n\t" // load cs_c + "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double) + " \n\t" + "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*cs_c; + "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c + 3*cs_c; + "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c + "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c + "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c + "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c + 3*cs_c + "prefetcht0 7 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 4*cs_c + "prefetcht0 7 * 8(%%rdx,%%rdi,2) \n\t" // prefetch c + 5*cs_c + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %0, %%rsi \n\t" // i = k_iter; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that + " \n\t" // contains the k_left loop. + " \n\t" + " \n\t" + ".DLOOPKITER: \n\t" // MAIN LOOP + " \n\t" + " \n\t" + " \n\t" // iteration 0 + "prefetcht0 16 * 32(%%rax) \n\t" + " \n\t" + "vbroadcastsd 0 * 8(%%rbx), %%ymm2 \n\t" + "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 2 * 8(%%rbx), %%ymm2 \n\t" + "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 4 * 8(%%rbx), %%ymm2 \n\t" + "vbroadcastsd 5 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps -2 * 32(%%rax), %%ymm0 \n\t" + "vmovaps -1 * 32(%%rax), %%ymm1 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" // iteration 1 + "vbroadcastsd 6 * 8(%%rbx), %%ymm2 \n\t" + "vbroadcastsd 7 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 8 * 8(%%rbx), %%ymm2 \n\t" + "vbroadcastsd 9 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 10 * 8(%%rbx), %%ymm2 \n\t" + "vbroadcastsd 11 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" + "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" // iteration 2 + "prefetcht0 20 * 32(%%rax) \n\t" + " \n\t" + "vbroadcastsd 12 * 8(%%rbx), %%ymm2 \n\t" + "vbroadcastsd 13 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 14 * 8(%%rbx), %%ymm2 \n\t" + "vbroadcastsd 15 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + 
"vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 16 * 8(%%rbx), %%ymm2 \n\t" + "vbroadcastsd 17 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "vmovaps 2 * 32(%%rax), %%ymm0 \n\t" + "vmovaps 3 * 32(%%rax), %%ymm1 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" // iteration 3 + "vbroadcastsd 18 * 8(%%rbx), %%ymm2 \n\t" + "vbroadcastsd 19 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 20 * 8(%%rbx), %%ymm2 \n\t" + "vbroadcastsd 21 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 22 * 8(%%rbx), %%ymm2 \n\t" + "vbroadcastsd 23 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr) + "addq $4 * 6 * 8, %%rbx \n\t" // b += 4*6 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .DLOOPKITER \n\t" // iterate again if i != 0. + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".DCONSIDKLEFT: \n\t" + " \n\t" + "movq %1, %%rsi \n\t" // i = k_left; + "testq %%rsi, %%rsi \n\t" // check i via logical AND. + "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. + " \n\t" // else, we prepare to enter k_left loop. + " \n\t" + " \n\t" + ".DLOOPKLEFT: \n\t" // EDGE LOOP + " \n\t" + "prefetcht0 16 * 32(%%rax) \n\t" + " \n\t" + "vbroadcastsd 0 * 8(%%rbx), %%ymm2 \n\t" + "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" + " \n\t" + "vbroadcastsd 2 * 8(%%rbx), %%ymm2 \n\t" + "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" + " \n\t" + "vbroadcastsd 4 * 8(%%rbx), %%ymm2 \n\t" + "vbroadcastsd 5 * 8(%%rbx), %%ymm3 \n\t" + "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" + "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" + "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" + "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" + " \n\t" + "addq $1 * 8 * 8, %%rax \n\t" // a += 1*8 (unroll x mr) + "addq $1 * 6 * 8, %%rbx \n\t" // b += 1*6 (unroll x nr) + " \n\t" + "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" + "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" + " \n\t" + " \n\t" + "decq %%rsi \n\t" // i -= 1; + "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. 
+ " \n\t" + " \n\t" + " \n\t" + ".DPOSTACCUM: \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %4, %%rax \n\t" // load address of alpha + "movq %5, %%rbx \n\t" // load address of beta + "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate + "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate + " \n\t" + "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha + "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" + "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" + "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" + "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" + "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" + "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" + "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + "movq %7, %%rsi \n\t" // load rs_c + "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(double) + " \n\t" + "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c; + " \n\t" + "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*rs_c; + //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*rs_c; + //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*rs_c; + " \n\t" + " \n\t" + " \n\t" + " \n\t" // determine if + " \n\t" // c % 32 == 0, AND + " \n\t" // 8*cs_c % 32 == 0, AND + " \n\t" // rs_c == 1 + " \n\t" // ie: aligned, ldim aligned, and + " \n\t" // column-stored + " \n\t" + "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. + "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); + "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero. + "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 ); + "testq $31, %%rdi \n\t" // set ZF if (8*cs_c) & 32 is zero. + "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 ); + " \n\t" // and(bl,bh) followed by + " \n\t" // and(bh,al) will reveal result + " \n\t" + " \n\t" // now avoid loading C if beta == 0 + " \n\t" + "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. + "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. + "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case + " \n\t" + " \n\t" + " \n\t" // check if aligned/column-stored + "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. + "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
+ "jne .DCOLSTORED \n\t" // jump to column storage case + " \n\t" + " \n\t" + " \n\t" + ".DGENSTORED: \n\t" + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + " \n\t" + " \n\t" + "movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + DGEMM_INPUT_GS_BETA_NZ + "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + " \n\t" + " \n\t" + " \n\t" + "jmp .DDONE \n\t" // jump to end. 
+ " \n\t" + " \n\t" + " \n\t" + ".DCOLSTORED: \n\t" + " \n\t" + " \n\t" + "vmovaps (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" + "vmovaps %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm5, %%ymm3, %%ymm1 \n\t" + "vmovaps %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vmovaps (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" + "vmovaps %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm7, %%ymm3, %%ymm1 \n\t" + "vmovaps %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vmovaps (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" + "vmovaps %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm9, %%ymm3, %%ymm1 \n\t" + "vmovaps %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vmovaps (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" + "vmovaps %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm11, %%ymm3, %%ymm1 \n\t" + "vmovaps %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vmovaps (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" + "vmovaps %%ymm0, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm13, %%ymm3, %%ymm1 \n\t" + "vmovaps %%ymm1, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vmovaps (%%rcx), %%ymm0 \n\t" + "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" + "vmovaps %%ymm0, (%%rcx) \n\t" + //"addq %%rdi, %%rcx \n\t" + "vmovaps (%%rdx), %%ymm1 \n\t" + "vfmadd213pd %%ymm15, %%ymm3, %%ymm1 \n\t" + "vmovaps %%ymm1, (%%rdx) \n\t" + //"addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + " \n\t" + "jmp .DDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".DBETAZERO: \n\t" + " \n\t" // check if aligned/column-stored + "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. + "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. 
+ "jne .DCOLSTORBZ \n\t" // jump to column storage case + " \n\t" + " \n\t" + " \n\t" + ".DGENSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm4, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm6, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm8, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm10, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm12, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm14, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + " \n\t" + " \n\t" + "movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c + " \n\t" + " \n\t" + "vmovaps %%ymm5, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm7, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm9, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm11, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm13, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + "addq %%rdi, %%rcx \n\t" // c += cs_c; + " \n\t" + " \n\t" + "vmovaps %%ymm15, %%ymm0 \n\t" + DGEMM_OUTPUT_GS_BETA_NZ + " \n\t" + " \n\t" + " \n\t" + "jmp .DDONE \n\t" // jump to end. + " \n\t" + " \n\t" + " \n\t" + ".DCOLSTORBZ: \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm4, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps %%ymm5, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + "vmovaps %%ymm6, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps %%ymm7, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm8, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps %%ymm9, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm10, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps %%ymm11, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm12, (%%rcx) \n\t" + "addq %%rdi, %%rcx \n\t" + "vmovaps %%ymm13, (%%rdx) \n\t" + "addq %%rdi, %%rdx \n\t" + " \n\t" + " \n\t" + "vmovaps %%ymm14, (%%rcx) \n\t" + " \n\t" + "vmovaps %%ymm15, (%%rdx) \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + " \n\t" + ".DDONE: \n\t" + " \n\t" - : // output operands (none) - : // input operands - "m" (k_iter), // 0 - "m" (k_left), // 1 - "m" (a), // 2 - "m" (b), // 3 - "m" (alpha), // 4 - "m" (beta), // 5 - "m" (c), // 6 - "m" (rs_c), // 7 - "m" (cs_c)/*, // 8 - "m" (b_next), // 9 - "m" (a_next)*/ // 10 - : // register clobber list - "rax", "rbx", "rcx", "rdx", "rsi", "rdi", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", - "xmm12", "xmm13", "xmm14", "xmm15", - "memory" - ); + : // output operands (none) + : // input operands + "m" (k_iter), // 0 + "m" (k_left), // 1 + "m" (a), // 2 + "m" (b), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (c), // 6 + "m" (rs_c), // 7 + "m" (cs_c)/*, // 8 + "m" (b_next), // 9 + "m" (a_next)*/ // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + 
"xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15", + "memory" + ); } - diff --git a/step3/kernels/bl_dgemm_int_8x4.c b/step3/kernels/bl_dgemm_int_8x4.c index 0f3fe12..c4faf72 100644 --- a/step3/kernels/bl_dgemm_int_8x4.c +++ b/step3/kernels/bl_dgemm_int_8x4.c @@ -1,634 +1,580 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name of The University of Texas at Austin nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - #include "bl_dgemm_kernel.h" -#define inc_t unsigned long long +#define inc_t unsigned long long void bl_dgemm_int_8x4( - int k, - double* a, - double* b, - double* c, - inc_t ldc, - aux_t* data - ) -{ - const inc_t cs_c = ldc; - const inc_t rs_c = 1; - double alpha_val = 1.0, beta_val = 1.0; - double *alpha, *beta; - - alpha = &alpha_val; - beta = &beta_val; - - ////void* a_next = bli_auxinfo_next_a( data ); - //void* b_next = bli_auxinfo_next_b( data ); - double *b_next = data->b_next; - - dim_t k_iter = (unsigned long long)k / 2; - dim_t k_left = (unsigned long long)k % 2; - - dim_t i; - - double *c00, *c01, *c02, *c03; - double *c40, *c41, *c42, *c43; - - // Quad registers. 
- __m256d va0_3, va4_7; - __m256d vA0_3, vA4_7; - __m256d vb0, vb1, vb2, vb3; - __m256d vb; - __m256d vB0; - - __m256d va0_3b_0, va4_7b_0; - __m256d va0_3b_1, va4_7b_1; - __m256d va0_3b_2, va4_7b_2; - __m256d va0_3b_3, va4_7b_3; - - __m256d va0_3b0, va4_7b0; - __m256d va0_3b1, va4_7b1; - __m256d va0_3b2, va4_7b2; - __m256d va0_3b3, va4_7b3; + int k, + double* a, + double* b, + double* c, + inc_t ldc, + aux_t* data) { + const inc_t cs_c = ldc; + const inc_t rs_c = 1; + double alpha_val = 1.0, beta_val = 1.0; + double *alpha, *beta; + alpha = &alpha_val; + beta = &beta_val; - __m256d valpha, vbeta, vtmp; - __m256d vc0_3_0, vc0_3_1, vc0_3_2, vc0_3_3; - __m256d vc4_7_0, vc4_7_1, vc4_7_2, vc4_7_3; + ////void* a_next = bli_auxinfo_next_a( data ); + // void* b_next = bli_auxinfo_next_b( data ); + double* b_next = data->b_next; - __m128d aa, bb; - + dim_t k_iter = (unsigned long long)k / 2; + dim_t k_left = (unsigned long long)k % 2; - __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"(a) ); - __asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"(b_next) ); - __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"(c) ); + dim_t i; + double *c00, *c01, *c02, *c03; + double *c40, *c41, *c42, *c43; + // Quad registers. + __m256d va0_3, va4_7; + __m256d vA0_3, vA4_7; + __m256d vb0, vb1, vb2, vb3; + __m256d vb; + __m256d vB0; - va0_3b0 = _mm256_setzero_pd(); - va0_3b1 = _mm256_setzero_pd(); - va0_3b2 = _mm256_setzero_pd(); - va0_3b3 = _mm256_setzero_pd(); + __m256d va0_3b_0, va4_7b_0; + __m256d va0_3b_1, va4_7b_1; + __m256d va0_3b_2, va4_7b_2; + __m256d va0_3b_3, va4_7b_3; - va4_7b0 = _mm256_setzero_pd(); - va4_7b1 = _mm256_setzero_pd(); - va4_7b2 = _mm256_setzero_pd(); - va4_7b3 = _mm256_setzero_pd(); + __m256d va0_3b0, va4_7b0; + __m256d va0_3b1, va4_7b1; + __m256d va0_3b2, va4_7b2; + __m256d va0_3b3, va4_7b3; - va0_3b_0 = _mm256_setzero_pd(); - va0_3b_1 = _mm256_setzero_pd(); - va0_3b_2 = _mm256_setzero_pd(); - va0_3b_3 = _mm256_setzero_pd(); + __m256d valpha, vbeta, vtmp; + __m256d vc0_3_0, vc0_3_1, vc0_3_2, vc0_3_3; + __m256d vc4_7_0, vc4_7_1, vc4_7_2, vc4_7_3; - va4_7b_0 = _mm256_setzero_pd(); - va4_7b_1 = _mm256_setzero_pd(); - va4_7b_2 = _mm256_setzero_pd(); - va4_7b_3 = _mm256_setzero_pd(); + __m128d aa, bb; - // Load va0_3 - va0_3 = _mm256_load_pd( a ); - // Load va4_7 - va4_7 = _mm256_load_pd( a + 4 ); + __asm__ volatile("prefetcht0 0(%0) \n\t" : : "r"(a)); + __asm__ volatile("prefetcht2 0(%0) \n\t" : : "r"(b_next)); + __asm__ volatile("prefetcht0 0(%0) \n\t" : : "r"(c)); - // Load vb (b0,b1,b2,b3) - vb0 = _mm256_load_pd( b ); + va0_3b0 = _mm256_setzero_pd(); + va0_3b1 = _mm256_setzero_pd(); + va0_3b2 = _mm256_setzero_pd(); + va0_3b3 = _mm256_setzero_pd(); + va4_7b0 = _mm256_setzero_pd(); + va4_7b1 = _mm256_setzero_pd(); + va4_7b2 = _mm256_setzero_pd(); + va4_7b3 = _mm256_setzero_pd(); - for( i = 0; i < k_iter; ++i ) - { - __asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"(a) ); + va0_3b_0 = _mm256_setzero_pd(); + va0_3b_1 = _mm256_setzero_pd(); + va0_3b_2 = _mm256_setzero_pd(); + va0_3b_3 = _mm256_setzero_pd(); - // Load va0_3 (Prefetch) - // Prefetch A03 - vA0_3 = _mm256_load_pd( a + 8 ); + va4_7b_0 = _mm256_setzero_pd(); + va4_7b_1 = _mm256_setzero_pd(); + va4_7b_2 = _mm256_setzero_pd(); + va4_7b_3 = _mm256_setzero_pd(); - // Iteration 0. 
- vtmp = _mm256_mul_pd( va0_3, vb0 ); - va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp ); + // Load va0_3 + va0_3 = _mm256_load_pd(a); + // Load va4_7 + va4_7 = _mm256_load_pd(a + 4); - vtmp = _mm256_mul_pd( va4_7, vb0 ); - va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp ); + // Load vb (b0,b1,b2,b3) + vb0 = _mm256_load_pd(b); - // Load va4_7 (Prefetch) - // Prefetch A47 - vA4_7 = _mm256_load_pd( a + 12 ); + for (i = 0; i < k_iter; ++i) { + __asm__ volatile("prefetcht0 192(%0) \n\t" : : "r"(a)); - // Shuffle vb (b1,b0,b3,b2) - vb1 = _mm256_shuffle_pd( vb0, vb0, 0x5 ); + // Load va0_3 (Prefetch) + // Prefetch A03 + vA0_3 = _mm256_load_pd(a + 8); - vtmp = _mm256_mul_pd( va0_3, vb1 ); - va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp ); + // Iteration 0. + vtmp = _mm256_mul_pd(va0_3, vb0); + va0_3b_0 = _mm256_add_pd(va0_3b_0, vtmp); - vtmp = _mm256_mul_pd( va4_7, vb1 ); - va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp ); + vtmp = _mm256_mul_pd(va4_7, vb0); + va4_7b_0 = _mm256_add_pd(va4_7b_0, vtmp); - // Permute vb (b3,b2,b1,b0) - vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 ); + // Load va4_7 (Prefetch) + // Prefetch A47 + vA4_7 = _mm256_load_pd(a + 12); - // Load vb (b0,b1,b2,b3) (Prefetch) - vB0 = _mm256_load_pd( b + 4 ); + // Shuffle vb (b1,b0,b3,b2) + vb1 = _mm256_shuffle_pd(vb0, vb0, 0x5); - vtmp = _mm256_mul_pd( va0_3, vb2 ); - va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp ); + vtmp = _mm256_mul_pd(va0_3, vb1); + va0_3b_1 = _mm256_add_pd(va0_3b_1, vtmp); - vtmp = _mm256_mul_pd( va4_7, vb2 ); - va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp ); + vtmp = _mm256_mul_pd(va4_7, vb1); + va4_7b_1 = _mm256_add_pd(va4_7b_1, vtmp); - // Shuffle vb (b3,b2,b1,b0) - vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 ); + // Permute vb (b3,b2,b1,b0) + vb2 = _mm256_permute2f128_pd(vb1, vb1, 0x1); - vtmp = _mm256_mul_pd( va0_3, vb3 ); - va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp ); + // Load vb (b0,b1,b2,b3) (Prefetch) + vB0 = _mm256_load_pd(b + 4); - vtmp = _mm256_mul_pd( va4_7, vb3 ); - va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp ); + vtmp = _mm256_mul_pd(va0_3, vb2); + va0_3b_2 = _mm256_add_pd(va0_3b_2, vtmp); - // Iteration 1. + vtmp = _mm256_mul_pd(va4_7, vb2); + va4_7b_2 = _mm256_add_pd(va4_7b_2, vtmp); - __asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"(a) ); - - // Load va0_3 (Next iteration) - va0_3 = _mm256_load_pd( a + 16 ); + // Shuffle vb (b3,b2,b1,b0) + vb3 = _mm256_shuffle_pd(vb2, vb2, 0x5); - vtmp = _mm256_mul_pd( vA0_3, vB0 ); - va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp ); + vtmp = _mm256_mul_pd(va0_3, vb3); + va0_3b_3 = _mm256_add_pd(va0_3b_3, vtmp); - vb1 = _mm256_shuffle_pd( vB0, vB0, 0x5 ); + vtmp = _mm256_mul_pd(va4_7, vb3); + va4_7b_3 = _mm256_add_pd(va4_7b_3, vtmp); - vtmp = _mm256_mul_pd( vA4_7, vB0 ); - va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp ); + // Iteration 1. 
- vtmp = _mm256_mul_pd( vA0_3, vb1 ); - va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp ); + __asm__ volatile("prefetcht0 512(%0) \n\t" : : "r"(a)); - // Load va4_7 (Next iteration) - va4_7 = _mm256_load_pd( a + 20 ); + // Load va0_3 (Next iteration) + va0_3 = _mm256_load_pd(a + 16); - vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 ); + vtmp = _mm256_mul_pd(vA0_3, vB0); + va0_3b_0 = _mm256_add_pd(va0_3b_0, vtmp); - vtmp = _mm256_mul_pd( vA4_7, vb1 ); - va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp ); + vb1 = _mm256_shuffle_pd(vB0, vB0, 0x5); - vtmp = _mm256_mul_pd( vA0_3, vb2 ); - va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp ); + vtmp = _mm256_mul_pd(vA4_7, vB0); + va4_7b_0 = _mm256_add_pd(va4_7b_0, vtmp); - vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 ); + vtmp = _mm256_mul_pd(vA0_3, vb1); + va0_3b_1 = _mm256_add_pd(va0_3b_1, vtmp); - vtmp = _mm256_mul_pd( vA4_7, vb2 ); - va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp ); + // Load va4_7 (Next iteration) + va4_7 = _mm256_load_pd(a + 20); - // Load vb0(Next iteration) - vb0 = _mm256_load_pd( b + 8 ); + vb2 = _mm256_permute2f128_pd(vb1, vb1, 0x1); - vtmp = _mm256_mul_pd( vA0_3, vb3 ); - va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp ); + vtmp = _mm256_mul_pd(vA4_7, vb1); + va4_7b_1 = _mm256_add_pd(va4_7b_1, vtmp); - vtmp = _mm256_mul_pd( vA4_7, vb3 ); - va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp ); + vtmp = _mm256_mul_pd(vA0_3, vb2); + va0_3b_2 = _mm256_add_pd(va0_3b_2, vtmp); - a += 16; - b += 8; + vb3 = _mm256_shuffle_pd(vb2, vb2, 0x5); - } + vtmp = _mm256_mul_pd(vA4_7, vb2); + va4_7b_2 = _mm256_add_pd(va4_7b_2, vtmp); + // Load vb0(Next iteration) + vb0 = _mm256_load_pd(b + 8); - for( i = 0; i < k_left; ++i ) - { - // Iteration 0. + vtmp = _mm256_mul_pd(vA0_3, vb3); + va0_3b_3 = _mm256_add_pd(va0_3b_3, vtmp); - // Load va0_3 - va0_3 = _mm256_load_pd( a ); - // Load va4_7 - va4_7 = _mm256_load_pd( a + 4 ); + vtmp = _mm256_mul_pd(vA4_7, vb3); + va4_7b_3 = _mm256_add_pd(va4_7b_3, vtmp); - // Load vb (b0,b1,b2,b3) - vb = _mm256_load_pd( b ); + a += 16; + b += 8; + } - vtmp = _mm256_mul_pd( va0_3, vb ); - va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp ); + for (i = 0; i < k_left; ++i) { + // Iteration 0. 
- vtmp = _mm256_mul_pd( va4_7, vb ); - va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp ); + // Load va0_3 + va0_3 = _mm256_load_pd(a); + // Load va4_7 + va4_7 = _mm256_load_pd(a + 4); - // Shuffle vb (b1,b0,b3,b2) - vb = _mm256_shuffle_pd( vb, vb, 0x5 ); + // Load vb (b0,b1,b2,b3) + vb = _mm256_load_pd(b); - vtmp = _mm256_mul_pd( va0_3, vb ); - va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp ); + vtmp = _mm256_mul_pd(va0_3, vb); + va0_3b_0 = _mm256_add_pd(va0_3b_0, vtmp); - vtmp = _mm256_mul_pd( va4_7, vb ); - va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp ); + vtmp = _mm256_mul_pd(va4_7, vb); + va4_7b_0 = _mm256_add_pd(va4_7b_0, vtmp); - // Permute vb (b3,b2,b1,b0) - vb = _mm256_permute2f128_pd( vb, vb, 0x1 ); + // Shuffle vb (b1,b0,b3,b2) + vb = _mm256_shuffle_pd(vb, vb, 0x5); - vtmp = _mm256_mul_pd( va0_3, vb ); - va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp ); + vtmp = _mm256_mul_pd(va0_3, vb); + va0_3b_1 = _mm256_add_pd(va0_3b_1, vtmp); - vtmp = _mm256_mul_pd( va4_7, vb ); - va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp ); + vtmp = _mm256_mul_pd(va4_7, vb); + va4_7b_1 = _mm256_add_pd(va4_7b_1, vtmp); - // Shuffle vb (b3,b2,b1,b0) - vb = _mm256_shuffle_pd( vb, vb, 0x5 ); + // Permute vb (b3,b2,b1,b0) + vb = _mm256_permute2f128_pd(vb, vb, 0x1); - vtmp = _mm256_mul_pd( va0_3, vb ); - va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp ); + vtmp = _mm256_mul_pd(va0_3, vb); + va0_3b_2 = _mm256_add_pd(va0_3b_2, vtmp); - vtmp = _mm256_mul_pd( va4_7, vb ); - va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp ); + vtmp = _mm256_mul_pd(va4_7, vb); + va4_7b_2 = _mm256_add_pd(va4_7b_2, vtmp); - a += 8; - b += 4; + // Shuffle vb (b3,b2,b1,b0) + vb = _mm256_shuffle_pd(vb, vb, 0x5); - } + vtmp = _mm256_mul_pd(va0_3, vb); + va0_3b_3 = _mm256_add_pd(va0_3b_3, vtmp); + vtmp = _mm256_mul_pd(va4_7, vb); + va4_7b_3 = _mm256_add_pd(va4_7b_3, vtmp); - - vbeta = _mm256_broadcast_sd( beta ); - - - __m256d vtmpa_0_3b_0 = _mm256_blend_pd( va0_3b_0, va0_3b_1, 0x6 ); - __m256d vtmpa_0_3b_1 = _mm256_blend_pd( va0_3b_1, va0_3b_0, 0x6 ); - - __m256d vtmpa_0_3b_2 = _mm256_blend_pd( va0_3b_2, va0_3b_3, 0x6 ); - __m256d vtmpa_0_3b_3 = _mm256_blend_pd( va0_3b_3, va0_3b_2, 0x6 ); - - __m256d vtmpa_4_7b_0 = _mm256_blend_pd( va4_7b_0, va4_7b_1, 0x6 ); - __m256d vtmpa_4_7b_1 = _mm256_blend_pd( va4_7b_1, va4_7b_0, 0x6 ); - - __m256d vtmpa_4_7b_2 = _mm256_blend_pd( va4_7b_2, va4_7b_3, 0x6 ); - __m256d vtmpa_4_7b_3 = _mm256_blend_pd( va4_7b_3, va4_7b_2, 0x6 ); - - - valpha = _mm256_broadcast_sd( alpha ); - - - va0_3b0 = _mm256_permute2f128_pd( vtmpa_0_3b_0, vtmpa_0_3b_2, 0x30 ); - va0_3b3 = _mm256_permute2f128_pd( vtmpa_0_3b_2, vtmpa_0_3b_0, 0x30 ); - - va0_3b1 = _mm256_permute2f128_pd( vtmpa_0_3b_1, vtmpa_0_3b_3, 0x30 ); - va0_3b2 = _mm256_permute2f128_pd( vtmpa_0_3b_3, vtmpa_0_3b_1, 0x30 ); - - va4_7b0 = _mm256_permute2f128_pd( vtmpa_4_7b_0, vtmpa_4_7b_2, 0x30 ); - va4_7b3 = _mm256_permute2f128_pd( vtmpa_4_7b_2, vtmpa_4_7b_0, 0x30 ); - - va4_7b1 = _mm256_permute2f128_pd( vtmpa_4_7b_1, vtmpa_4_7b_3, 0x30 ); - va4_7b2 = _mm256_permute2f128_pd( vtmpa_4_7b_3, vtmpa_4_7b_1, 0x30 ); - - - - if( rs_c == 1 ) - { - // Calculate address - c00 = ( c + 0*rs_c + 0*cs_c ); - // Load - //vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c ); - vc0_3_0 = _mm256_load_pd( c00 ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va0_3b0); - // Scale by beta - vc0_3_0 = _mm256_mul_pd( vbeta, vc0_3_0 ); - // Add gemm result - vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp ); - // Store back to memory - _mm256_store_pd( c00, vc0_3_0 ); - - // Calculate address - c40 = ( c + 4*rs_c + 0*cs_c ); - // Load - 
//vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c ); - vc4_7_0 = _mm256_load_pd( c40 ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va4_7b0); - // Scale by beta - vc4_7_0 = _mm256_mul_pd( vbeta, vc4_7_0 ); - // Add gemm result - vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp ); - // Store back to memory - _mm256_store_pd( c40, vc4_7_0 ); - - // Calculate address - c01 = ( c + 0*rs_c + 1*cs_c ); - // Load - //vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c ); - vc0_3_1 = _mm256_load_pd( c01 ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va0_3b1); - // Scale by beta - vc0_3_1 = _mm256_mul_pd( vbeta, vc0_3_1 ); - // Add gemm result - vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp ); - // Store back to memory - _mm256_store_pd( c01, vc0_3_1 ); - - - // Calculate address - c41 = ( c + 4*rs_c + 1*cs_c ); - // Load - //vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c ); - vc4_7_1 = _mm256_load_pd( c41 ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va4_7b1); - // Scale by beta - vc4_7_1 = _mm256_mul_pd( vbeta, vc4_7_1 ); - // Add gemm result - vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp ); - // Store back to memory - _mm256_store_pd( c41, vc4_7_1 ); - - // Calculate address - c02 = ( c + 0*rs_c + 2*cs_c ); - // Load - //vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c ); - vc0_3_2 = _mm256_load_pd( c02 ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va0_3b2); - // Scale by beta - vc0_3_2 = _mm256_mul_pd( vbeta, vc0_3_2 ); - // Add gemm result - vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp ); - // Store back to memory - _mm256_store_pd( c02, vc0_3_2 ); - - // Calculate address - c42 = ( c + 4*rs_c + 2*cs_c ); - // Load - //vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c ); - vc4_7_2 = _mm256_load_pd( c42 ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va4_7b2); - // Scale by beta - vc4_7_2 = _mm256_mul_pd( vbeta, vc4_7_2 ); - // Add gemm result - vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp ); - // Store back to memory - _mm256_store_pd( c42, vc4_7_2 ); - - // Calculate address - c03 = ( c + 0*rs_c + 3*cs_c ); - // Load - //vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c ); - vc0_3_3 = _mm256_load_pd( c03 ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va0_3b3); - // Scale by beta - vc0_3_3 = _mm256_mul_pd( vbeta, vc0_3_3 ); - // Add gemm result - vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp ); - // Store back to memory - _mm256_store_pd( c03, vc0_3_3 ); - - // Calculate address - c43 = ( c + 4*rs_c + 3*cs_c ); - // Load - //vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c ); - vc4_7_3 = _mm256_load_pd( c43 ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va4_7b3); - // Scale by beta - vc4_7_3 = _mm256_mul_pd( vbeta, vc4_7_3 ); - // Add gemm result - vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp ); - // Store back to memory - _mm256_store_pd( c43, vc4_7_3 ); - - } - else - { - // Calculate address - c00 = ( c + 0*rs_c + 0*cs_c ); - // Load - //vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c ); - vc0_3_0 = _mm256_set_pd( *(c + 3*rs_c + 0*cs_c ), - *(c + 2*rs_c + 0*cs_c ), - *(c + 1*rs_c + 0*cs_c ), - *(c + 0*rs_c + 0*cs_c ) ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va0_3b0); - // Scale by beta - vc0_3_0 = _mm256_mul_pd( vbeta, vc0_3_0 ); - // Add gemm result - vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp ); - // Store back to memory - //_mm256_store_pd( c00, vc0_3_0 ); - - aa = _mm256_extractf128_pd( vc0_3_0, 0 ) ; - bb = _mm256_extractf128_pd( vc0_3_0, 1 ) ; - - _mm_storel_pd( c + 0*rs_c + 0*cs_c, aa ); - _mm_storeh_pd( c + 1*rs_c + 0*cs_c, aa ); - _mm_storel_pd( c + 2*rs_c + 0*cs_c, bb ); - 
_mm_storeh_pd( c + 3*rs_c + 0*cs_c, bb ); - - // Calculate address - c40 = ( c + 4*rs_c + 0*cs_c ); - // Load - //vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c ); - vc4_7_0 = _mm256_set_pd( *(c + 7*rs_c + 0*cs_c ), - *(c + 6*rs_c + 0*cs_c ), - *(c + 5*rs_c + 0*cs_c ), - *(c + 4*rs_c + 0*cs_c ) ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va4_7b0); - // Scale by beta - vc4_7_0 = _mm256_mul_pd( vbeta, vc4_7_0 ); - // Add gemm result - vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp ); - // Store back to memory - //_mm256_store_pd( c40, vc4_7_0 ); - - aa = _mm256_extractf128_pd( vc4_7_0, 0 ) ; - bb = _mm256_extractf128_pd( vc4_7_0, 1 ) ; - - _mm_storel_pd( c + 4*rs_c + 0*cs_c, aa ); - _mm_storeh_pd( c + 5*rs_c + 0*cs_c, aa ); - _mm_storel_pd( c + 6*rs_c + 0*cs_c, bb ); - _mm_storeh_pd( c + 7*rs_c + 0*cs_c, bb ); - - // Calculate address - c01 = ( c + 0*rs_c + 1*cs_c ); - // Load - //vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c ); - vc0_3_1 = _mm256_set_pd( *(c + 3*rs_c + 1*cs_c ), - *(c + 2*rs_c + 1*cs_c ), - *(c + 1*rs_c + 1*cs_c ), - *(c + 0*rs_c + 1*cs_c ) ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va0_3b1); - // Scale by beta - vc0_3_1 = _mm256_mul_pd( vbeta, vc0_3_1 ); - // Add gemm result - vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp ); - // Store back to memory - //_mm256_store_pd( c01, vc0_3_1 ); - - aa = _mm256_extractf128_pd( vc0_3_1, 0 ) ; - bb = _mm256_extractf128_pd( vc0_3_1, 1 ) ; - - _mm_storel_pd( c + 0*rs_c + 1*cs_c, aa ); - _mm_storeh_pd( c + 1*rs_c + 1*cs_c, aa ); - _mm_storel_pd( c + 2*rs_c + 1*cs_c, bb ); - _mm_storeh_pd( c + 3*rs_c + 1*cs_c, bb ); - - // Calculate address - c41 = ( c + 4*rs_c + 1*cs_c ); - // Load - //vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c ); - vc4_7_1 = _mm256_set_pd( *(c + 7*rs_c + 1*cs_c ), - *(c + 6*rs_c + 1*cs_c ), - *(c + 5*rs_c + 1*cs_c ), - *(c + 4*rs_c + 1*cs_c ) ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va4_7b1); - // Scale by beta - vc4_7_1 = _mm256_mul_pd( vbeta, vc4_7_1 ); - // Add gemm result - vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp ); - // Store back to memory - //_mm256_store_pd( c41, vc4_7_1 ); - - aa = _mm256_extractf128_pd( vc4_7_1, 0 ) ; - bb = _mm256_extractf128_pd( vc4_7_1, 1 ) ; - - _mm_storel_pd( c + 4*rs_c + 1*cs_c, aa ); - _mm_storeh_pd( c + 5*rs_c + 1*cs_c, aa ); - _mm_storel_pd( c + 6*rs_c + 1*cs_c, bb ); - _mm_storeh_pd( c + 7*rs_c + 1*cs_c, bb ); - - // Calculate address - c02 = ( c + 0*rs_c + 2*cs_c ); - // Load - //vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c ); - vc0_3_2 = _mm256_set_pd( *(c + 3*rs_c + 2*cs_c ), - *(c + 2*rs_c + 2*cs_c ), - *(c + 1*rs_c + 2*cs_c ), - *(c + 0*rs_c + 2*cs_c ) ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va0_3b2); - // Scale by beta - vc0_3_2 = _mm256_mul_pd( vbeta, vc0_3_2 ); - // Add gemm result - vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp ); - // Store back to memory - //_mm256_store_pd( c02, vc0_3_2 ); - - aa = _mm256_extractf128_pd( vc0_3_2, 0 ) ; - bb = _mm256_extractf128_pd( vc0_3_2, 1 ) ; - - _mm_storel_pd( c + 0*rs_c + 2*cs_c, aa ); - _mm_storeh_pd( c + 1*rs_c + 2*cs_c, aa ); - _mm_storel_pd( c + 2*rs_c + 2*cs_c, bb ); - _mm_storeh_pd( c + 3*rs_c + 2*cs_c, bb ); - - // Calculate address - c42 = ( c + 4*rs_c + 2*cs_c ); - // Load - //vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c ); - vc4_7_2 = _mm256_set_pd( *(c + 7*rs_c + 2*cs_c ), - *(c + 6*rs_c + 2*cs_c ), - *(c + 5*rs_c + 2*cs_c ), - *(c + 4*rs_c + 2*cs_c ) ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va4_7b2); - // Scale by beta - vc4_7_2 = _mm256_mul_pd( vbeta, vc4_7_2 ); - // Add 
gemm result - vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp ); - // Store back to memory - //_mm256_store_pd( c42, vc4_7_2 ); - - aa = _mm256_extractf128_pd( vc4_7_2, 0 ) ; - bb = _mm256_extractf128_pd( vc4_7_2, 1 ) ; - - _mm_storel_pd( c + 4*rs_c + 2*cs_c, aa ); - _mm_storeh_pd( c + 5*rs_c + 2*cs_c, aa ); - _mm_storel_pd( c + 6*rs_c + 2*cs_c, bb ); - _mm_storeh_pd( c + 7*rs_c + 2*cs_c, bb ); - - // Calculate address - c03 = ( c + 0*rs_c + 3*cs_c ); - // Load - //vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c ); - vc0_3_3 = _mm256_set_pd( *(c + 3*rs_c + 3*cs_c ), - *(c + 2*rs_c + 3*cs_c ), - *(c + 1*rs_c + 3*cs_c ), - *(c + 0*rs_c + 3*cs_c ) ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va0_3b3); - // Scale by beta - vc0_3_3 = _mm256_mul_pd( vbeta, vc0_3_3 ); - // Add gemm result - vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp ); - // Store back to memory - //_mm256_store_pd( c03, vc0_3_3 ); - - aa = _mm256_extractf128_pd( vc0_3_3, 0 ) ; - bb = _mm256_extractf128_pd( vc0_3_3, 1 ) ; - - _mm_storel_pd( c + 0*rs_c + 3*cs_c, aa ); - _mm_storeh_pd( c + 1*rs_c + 3*cs_c, aa ); - _mm_storel_pd( c + 2*rs_c + 3*cs_c, bb ); - _mm_storeh_pd( c + 3*rs_c + 3*cs_c, bb ); - - - - // Calculate address - c43 = ( c + 4*rs_c + 3*cs_c ); - // Load - //vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c ); - vc4_7_3 = _mm256_set_pd( *(c + 7*rs_c + 3*cs_c ), - *(c + 6*rs_c + 3*cs_c ), - *(c + 5*rs_c + 3*cs_c ), - *(c + 4*rs_c + 3*cs_c ) ); - // Scale by alpha - vtmp = _mm256_mul_pd( valpha, va4_7b3); - // Scale by beta - vc4_7_3 = _mm256_mul_pd( vbeta, vc4_7_3 ); - // Add gemm result - vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp ); - // Store back to memory - //_mm256_store_pd( c43, vc4_7_3 ); - - aa = _mm256_extractf128_pd( vc4_7_3, 0 ) ; - bb = _mm256_extractf128_pd( vc4_7_3, 1 ) ; - - _mm_storel_pd( c + 4*rs_c + 3*cs_c, aa ); - _mm_storeh_pd( c + 5*rs_c + 3*cs_c, aa ); - _mm_storel_pd( c + 6*rs_c + 3*cs_c, bb ); - _mm_storeh_pd( c + 7*rs_c + 3*cs_c, bb ); - } - + a += 8; + b += 4; + } + + vbeta = _mm256_broadcast_sd(beta); + + __m256d vtmpa_0_3b_0 = _mm256_blend_pd(va0_3b_0, va0_3b_1, 0x6); + __m256d vtmpa_0_3b_1 = _mm256_blend_pd(va0_3b_1, va0_3b_0, 0x6); + + __m256d vtmpa_0_3b_2 = _mm256_blend_pd(va0_3b_2, va0_3b_3, 0x6); + __m256d vtmpa_0_3b_3 = _mm256_blend_pd(va0_3b_3, va0_3b_2, 0x6); + + __m256d vtmpa_4_7b_0 = _mm256_blend_pd(va4_7b_0, va4_7b_1, 0x6); + __m256d vtmpa_4_7b_1 = _mm256_blend_pd(va4_7b_1, va4_7b_0, 0x6); + + __m256d vtmpa_4_7b_2 = _mm256_blend_pd(va4_7b_2, va4_7b_3, 0x6); + __m256d vtmpa_4_7b_3 = _mm256_blend_pd(va4_7b_3, va4_7b_2, 0x6); + + valpha = _mm256_broadcast_sd(alpha); + + va0_3b0 = _mm256_permute2f128_pd(vtmpa_0_3b_0, vtmpa_0_3b_2, 0x30); + va0_3b3 = _mm256_permute2f128_pd(vtmpa_0_3b_2, vtmpa_0_3b_0, 0x30); + + va0_3b1 = _mm256_permute2f128_pd(vtmpa_0_3b_1, vtmpa_0_3b_3, 0x30); + va0_3b2 = _mm256_permute2f128_pd(vtmpa_0_3b_3, vtmpa_0_3b_1, 0x30); + + va4_7b0 = _mm256_permute2f128_pd(vtmpa_4_7b_0, vtmpa_4_7b_2, 0x30); + va4_7b3 = _mm256_permute2f128_pd(vtmpa_4_7b_2, vtmpa_4_7b_0, 0x30); + + va4_7b1 = _mm256_permute2f128_pd(vtmpa_4_7b_1, vtmpa_4_7b_3, 0x30); + va4_7b2 = _mm256_permute2f128_pd(vtmpa_4_7b_3, vtmpa_4_7b_1, 0x30); + + if (rs_c == 1) { + // Calculate address + c00 = (c + 0 * rs_c + 0 * cs_c); + // Load + // vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c ); + vc0_3_0 = _mm256_load_pd(c00); + // Scale by alpha + vtmp = _mm256_mul_pd(valpha, va0_3b0); + // Scale by beta + vc0_3_0 = _mm256_mul_pd(vbeta, vc0_3_0); + // Add gemm result + vc0_3_0 = _mm256_add_pd(vc0_3_0, vtmp); + // Store back 
to memory + _mm256_store_pd(c00, vc0_3_0); + + // Calculate address + c40 = (c + 4 * rs_c + 0 * cs_c); + // Load + // vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c ); + vc4_7_0 = _mm256_load_pd(c40); + // Scale by alpha + vtmp = _mm256_mul_pd(valpha, va4_7b0); + // Scale by beta + vc4_7_0 = _mm256_mul_pd(vbeta, vc4_7_0); + // Add gemm result + vc4_7_0 = _mm256_add_pd(vc4_7_0, vtmp); + // Store back to memory + _mm256_store_pd(c40, vc4_7_0); + + // Calculate address + c01 = (c + 0 * rs_c + 1 * cs_c); + // Load + // vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c ); + vc0_3_1 = _mm256_load_pd(c01); + // Scale by alpha + vtmp = _mm256_mul_pd(valpha, va0_3b1); + // Scale by beta + vc0_3_1 = _mm256_mul_pd(vbeta, vc0_3_1); + // Add gemm result + vc0_3_1 = _mm256_add_pd(vc0_3_1, vtmp); + // Store back to memory + _mm256_store_pd(c01, vc0_3_1); + + // Calculate address + c41 = (c + 4 * rs_c + 1 * cs_c); + // Load + // vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c ); + vc4_7_1 = _mm256_load_pd(c41); + // Scale by alpha + vtmp = _mm256_mul_pd(valpha, va4_7b1); + // Scale by beta + vc4_7_1 = _mm256_mul_pd(vbeta, vc4_7_1); + // Add gemm result + vc4_7_1 = _mm256_add_pd(vc4_7_1, vtmp); + // Store back to memory + _mm256_store_pd(c41, vc4_7_1); + + // Calculate address + c02 = (c + 0 * rs_c + 2 * cs_c); + // Load + // vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c ); + vc0_3_2 = _mm256_load_pd(c02); + // Scale by alpha + vtmp = _mm256_mul_pd(valpha, va0_3b2); + // Scale by beta + vc0_3_2 = _mm256_mul_pd(vbeta, vc0_3_2); + // Add gemm result + vc0_3_2 = _mm256_add_pd(vc0_3_2, vtmp); + // Store back to memory + _mm256_store_pd(c02, vc0_3_2); + + // Calculate address + c42 = (c + 4 * rs_c + 2 * cs_c); + // Load + // vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c ); + vc4_7_2 = _mm256_load_pd(c42); + // Scale by alpha + vtmp = _mm256_mul_pd(valpha, va4_7b2); + // Scale by beta + vc4_7_2 = _mm256_mul_pd(vbeta, vc4_7_2); + // Add gemm result + vc4_7_2 = _mm256_add_pd(vc4_7_2, vtmp); + // Store back to memory + _mm256_store_pd(c42, vc4_7_2); + + // Calculate address + c03 = (c + 0 * rs_c + 3 * cs_c); + // Load + // vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c ); + vc0_3_3 = _mm256_load_pd(c03); + // Scale by alpha + vtmp = _mm256_mul_pd(valpha, va0_3b3); + // Scale by beta + vc0_3_3 = _mm256_mul_pd(vbeta, vc0_3_3); + // Add gemm result + vc0_3_3 = _mm256_add_pd(vc0_3_3, vtmp); + // Store back to memory + _mm256_store_pd(c03, vc0_3_3); + + // Calculate address + c43 = (c + 4 * rs_c + 3 * cs_c); + // Load + // vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c ); + vc4_7_3 = _mm256_load_pd(c43); + // Scale by alpha + vtmp = _mm256_mul_pd(valpha, va4_7b3); + // Scale by beta + vc4_7_3 = _mm256_mul_pd(vbeta, vc4_7_3); + // Add gemm result + vc4_7_3 = _mm256_add_pd(vc4_7_3, vtmp); + // Store back to memory + _mm256_store_pd(c43, vc4_7_3); + + } else { + // Calculate address + c00 = (c + 0 * rs_c + 0 * cs_c); + // Load + // vc0_3_0 = _mm256_load_pd( c + 0*rs_c + 0*cs_c ); + vc0_3_0 = _mm256_set_pd( + *(c + 3 * rs_c + 0 * cs_c), + *(c + 2 * rs_c + 0 * cs_c), + *(c + 1 * rs_c + 0 * cs_c), + *(c + 0 * rs_c + 0 * cs_c)); + // Scale by alpha + vtmp = _mm256_mul_pd(valpha, va0_3b0); + // Scale by beta + vc0_3_0 = _mm256_mul_pd(vbeta, vc0_3_0); + // Add gemm result + vc0_3_0 = _mm256_add_pd(vc0_3_0, vtmp); + // Store back to memory + //_mm256_store_pd( c00, vc0_3_0 ); + + aa = _mm256_extractf128_pd(vc0_3_0, 0); + bb = _mm256_extractf128_pd(vc0_3_0, 1); + + _mm_storel_pd(c + 0 * rs_c + 0 * cs_c, aa); + _mm_storeh_pd(c + 1 * rs_c + 
0 * cs_c, aa); + _mm_storel_pd(c + 2 * rs_c + 0 * cs_c, bb); + _mm_storeh_pd(c + 3 * rs_c + 0 * cs_c, bb); + + // Calculate address + c40 = (c + 4 * rs_c + 0 * cs_c); + // Load + // vc4_7_0 = _mm256_load_pd( c + 4*rs_c + 0*cs_c ); + vc4_7_0 = _mm256_set_pd( + *(c + 7 * rs_c + 0 * cs_c), + *(c + 6 * rs_c + 0 * cs_c), + *(c + 5 * rs_c + 0 * cs_c), + *(c + 4 * rs_c + 0 * cs_c)); + // Scale by alpha + vtmp = _mm256_mul_pd(valpha, va4_7b0); + // Scale by beta + vc4_7_0 = _mm256_mul_pd(vbeta, vc4_7_0); + // Add gemm result + vc4_7_0 = _mm256_add_pd(vc4_7_0, vtmp); + // Store back to memory + //_mm256_store_pd( c40, vc4_7_0 ); + + aa = _mm256_extractf128_pd(vc4_7_0, 0); + bb = _mm256_extractf128_pd(vc4_7_0, 1); + + _mm_storel_pd(c + 4 * rs_c + 0 * cs_c, aa); + _mm_storeh_pd(c + 5 * rs_c + 0 * cs_c, aa); + _mm_storel_pd(c + 6 * rs_c + 0 * cs_c, bb); + _mm_storeh_pd(c + 7 * rs_c + 0 * cs_c, bb); + + // Calculate address + c01 = (c + 0 * rs_c + 1 * cs_c); + // Load + // vc0_3_1 = _mm256_load_pd( c + 0*rs_c + 1*cs_c ); + vc0_3_1 = _mm256_set_pd( + *(c + 3 * rs_c + 1 * cs_c), + *(c + 2 * rs_c + 1 * cs_c), + *(c + 1 * rs_c + 1 * cs_c), + *(c + 0 * rs_c + 1 * cs_c)); + // Scale by alpha + vtmp = _mm256_mul_pd(valpha, va0_3b1); + // Scale by beta + vc0_3_1 = _mm256_mul_pd(vbeta, vc0_3_1); + // Add gemm result + vc0_3_1 = _mm256_add_pd(vc0_3_1, vtmp); + // Store back to memory + //_mm256_store_pd( c01, vc0_3_1 ); + + aa = _mm256_extractf128_pd(vc0_3_1, 0); + bb = _mm256_extractf128_pd(vc0_3_1, 1); + + _mm_storel_pd(c + 0 * rs_c + 1 * cs_c, aa); + _mm_storeh_pd(c + 1 * rs_c + 1 * cs_c, aa); + _mm_storel_pd(c + 2 * rs_c + 1 * cs_c, bb); + _mm_storeh_pd(c + 3 * rs_c + 1 * cs_c, bb); + + // Calculate address + c41 = (c + 4 * rs_c + 1 * cs_c); + // Load + // vc4_7_1 = _mm256_load_pd( c + 4*rs_c + 1*cs_c ); + vc4_7_1 = _mm256_set_pd( + *(c + 7 * rs_c + 1 * cs_c), + *(c + 6 * rs_c + 1 * cs_c), + *(c + 5 * rs_c + 1 * cs_c), + *(c + 4 * rs_c + 1 * cs_c)); + // Scale by alpha + vtmp = _mm256_mul_pd(valpha, va4_7b1); + // Scale by beta + vc4_7_1 = _mm256_mul_pd(vbeta, vc4_7_1); + // Add gemm result + vc4_7_1 = _mm256_add_pd(vc4_7_1, vtmp); + // Store back to memory + //_mm256_store_pd( c41, vc4_7_1 ); + + aa = _mm256_extractf128_pd(vc4_7_1, 0); + bb = _mm256_extractf128_pd(vc4_7_1, 1); + + _mm_storel_pd(c + 4 * rs_c + 1 * cs_c, aa); + _mm_storeh_pd(c + 5 * rs_c + 1 * cs_c, aa); + _mm_storel_pd(c + 6 * rs_c + 1 * cs_c, bb); + _mm_storeh_pd(c + 7 * rs_c + 1 * cs_c, bb); + + // Calculate address + c02 = (c + 0 * rs_c + 2 * cs_c); + // Load + // vc0_3_2 = _mm256_load_pd( c + 0*rs_c + 2*cs_c ); + vc0_3_2 = _mm256_set_pd( + *(c + 3 * rs_c + 2 * cs_c), + *(c + 2 * rs_c + 2 * cs_c), + *(c + 1 * rs_c + 2 * cs_c), + *(c + 0 * rs_c + 2 * cs_c)); + // Scale by alpha + vtmp = _mm256_mul_pd(valpha, va0_3b2); + // Scale by beta + vc0_3_2 = _mm256_mul_pd(vbeta, vc0_3_2); + // Add gemm result + vc0_3_2 = _mm256_add_pd(vc0_3_2, vtmp); + // Store back to memory + //_mm256_store_pd( c02, vc0_3_2 ); + + aa = _mm256_extractf128_pd(vc0_3_2, 0); + bb = _mm256_extractf128_pd(vc0_3_2, 1); + + _mm_storel_pd(c + 0 * rs_c + 2 * cs_c, aa); + _mm_storeh_pd(c + 1 * rs_c + 2 * cs_c, aa); + _mm_storel_pd(c + 2 * rs_c + 2 * cs_c, bb); + _mm_storeh_pd(c + 3 * rs_c + 2 * cs_c, bb); + + // Calculate address + c42 = (c + 4 * rs_c + 2 * cs_c); + // Load + // vc4_7_2 = _mm256_load_pd( c + 4*rs_c + 2*cs_c ); + vc4_7_2 = _mm256_set_pd( + *(c + 7 * rs_c + 2 * cs_c), + *(c + 6 * rs_c + 2 * cs_c), + *(c + 5 * rs_c + 2 * cs_c), + *(c + 4 * rs_c + 2 * cs_c)); + 
// Scale by alpha + vtmp = _mm256_mul_pd(valpha, va4_7b2); + // Scale by beta + vc4_7_2 = _mm256_mul_pd(vbeta, vc4_7_2); + // Add gemm result + vc4_7_2 = _mm256_add_pd(vc4_7_2, vtmp); + // Store back to memory + //_mm256_store_pd( c42, vc4_7_2 ); + + aa = _mm256_extractf128_pd(vc4_7_2, 0); + bb = _mm256_extractf128_pd(vc4_7_2, 1); + + _mm_storel_pd(c + 4 * rs_c + 2 * cs_c, aa); + _mm_storeh_pd(c + 5 * rs_c + 2 * cs_c, aa); + _mm_storel_pd(c + 6 * rs_c + 2 * cs_c, bb); + _mm_storeh_pd(c + 7 * rs_c + 2 * cs_c, bb); + + // Calculate address + c03 = (c + 0 * rs_c + 3 * cs_c); + // Load + // vc0_3_3 = _mm256_load_pd( c + 0*rs_c + 3*cs_c ); + vc0_3_3 = _mm256_set_pd( + *(c + 3 * rs_c + 3 * cs_c), + *(c + 2 * rs_c + 3 * cs_c), + *(c + 1 * rs_c + 3 * cs_c), + *(c + 0 * rs_c + 3 * cs_c)); + // Scale by alpha + vtmp = _mm256_mul_pd(valpha, va0_3b3); + // Scale by beta + vc0_3_3 = _mm256_mul_pd(vbeta, vc0_3_3); + // Add gemm result + vc0_3_3 = _mm256_add_pd(vc0_3_3, vtmp); + // Store back to memory + //_mm256_store_pd( c03, vc0_3_3 ); + + aa = _mm256_extractf128_pd(vc0_3_3, 0); + bb = _mm256_extractf128_pd(vc0_3_3, 1); + + _mm_storel_pd(c + 0 * rs_c + 3 * cs_c, aa); + _mm_storeh_pd(c + 1 * rs_c + 3 * cs_c, aa); + _mm_storel_pd(c + 2 * rs_c + 3 * cs_c, bb); + _mm_storeh_pd(c + 3 * rs_c + 3 * cs_c, bb); + + // Calculate address + c43 = (c + 4 * rs_c + 3 * cs_c); + // Load + // vc4_7_3 = _mm256_load_pd( c + 4*rs_c + 3*cs_c ); + vc4_7_3 = _mm256_set_pd( + *(c + 7 * rs_c + 3 * cs_c), + *(c + 6 * rs_c + 3 * cs_c), + *(c + 5 * rs_c + 3 * cs_c), + *(c + 4 * rs_c + 3 * cs_c)); + // Scale by alpha + vtmp = _mm256_mul_pd(valpha, va4_7b3); + // Scale by beta + vc4_7_3 = _mm256_mul_pd(vbeta, vc4_7_3); + // Add gemm result + vc4_7_3 = _mm256_add_pd(vc4_7_3, vtmp); + // Store back to memory + //_mm256_store_pd( c43, vc4_7_3 ); + + aa = _mm256_extractf128_pd(vc4_7_3, 0); + bb = _mm256_extractf128_pd(vc4_7_3, 1); + + _mm_storel_pd(c + 4 * rs_c + 3 * cs_c, aa); + _mm_storeh_pd(c + 5 * rs_c + 3 * cs_c, aa); + _mm_storel_pd(c + 6 * rs_c + 3 * cs_c, bb); + _mm_storeh_pd(c + 7 * rs_c + 3 * cs_c, bb); + } } - - diff --git a/step3/kernels/bl_dgemm_ukr.c b/step3/kernels/bl_dgemm_ukr.c index 8969e98..704e94e 100644 --- a/step3/kernels/bl_dgemm_ukr.c +++ b/step3/kernels/bl_dgemm_ukr.c @@ -1,33 +1,27 @@ #include #include "bl_dgemm_kernel.h" -//micro-panel a is stored in column major, lda=DGEMM_MR. -#define a(i,j) a[ (j)*DGEMM_MR + (i) ] -//micro-panel b is stored in row major, ldb=DGEMM_NR. -#define b(i,j) b[ (i)*DGEMM_NR + (j) ] -//result c is stored in column major. -#define c(i,j) c[ (j)*ldc + (i) ] +// micro-panel a is stored in column major, lda=DGEMM_MR. +#define a(i, j) a[(j)*DGEMM_MR + (i)] +// micro-panel b is stored in row major, ldb=DGEMM_NR. +#define b(i, j) b[(i)*DGEMM_NR + (j)] +// result c is stored in column major. 
+#define c(i, j) c[(j)*ldc + (i)] +void bl_dgemm_ukr( + int k, + double* a, + double* b, + double* c, + unsigned long long ldc, + aux_t* data) { + int l, j, i; -void bl_dgemm_ukr( int k, - double *a, - double *b, - double *c, - unsigned long long ldc, - aux_t* data ) -{ - int l, j, i; - - for ( l = 0; l < k; ++l ) - { - for ( j = 0; j < DGEMM_NR; ++j ) - { - for ( i = 0; i < DGEMM_MR; ++i ) - { - c( i, j ) += a( i, l ) * b( l, j ); - } - } + for (l = 0; l < k; ++l) { + for (j = 0; j < DGEMM_NR; ++j) { + for (i = 0; i < DGEMM_MR; ++i) { + c(i, j) += a(i, l) * b(l, j); + } } - + } } - diff --git a/step3/makefile b/step3/makefile index e3a4d0f..e586f53 100644 --- a/step3/makefile +++ b/step3/makefile @@ -26,7 +26,7 @@ BLISLAB_OBJ=$(FRAME_CC_SRC:.c=.o) $(FRAME_CPP_SRC:.cpp=.o) $(KERNEL_SRC:.c=.o) $ all: $(LIBBLISLAB) $(SHAREDLIBBLISLAB) TESTBLISLAB TESTBLISLAB: $(LIBBLISLAB) - cd $(BLISLAB_DIR)/test && $(MAKE) && cd $(BLISLAB_DIR) $(LDFLAGS) + cd $(BLISLAB_DIR)/test && $(MAKE) && cd $(BLISLAB_DIR) $(LIBBLISLAB): $(BLISLAB_OBJ) $(ARCH) $(ARCHFLAGS) $@ $(BLISLAB_OBJ) diff --git a/step3/sourceme.sh b/step3/sourceme.sh index d82d8b0..2a7e70e 100755 --- a/step3/sourceme.sh +++ b/step3/sourceme.sh @@ -7,7 +7,8 @@ export BLISLAB_USE_INTEL=false echo "BLISLAB_USE_INTEL = $BLISLAB_USE_INTEL" # Whether reference implementation uses BLAS or not? -export BLISLAB_USE_BLAS=true +# export BLISLAB_USE_BLAS=true +export BLISLAB_USE_BLAS=false echo "BLISLAB_USE_BLAS = $BLISLAB_USE_BLAS" # Optimization Level (O0, O1, O2, O3) @@ -16,7 +17,7 @@ echo "COMPILER_OPT_LEVEL = $COMPILER_OPT_LEVEL" # Manually set the BLAS path if BLIS_USE_BLAS=true and using GNU compiler. #export BLAS_DIR=/u/jianyu/lib/blis -export BLAS_DIR=/u/jianyu/lib/openblas +#export BLAS_DIR=/u/jianyu/lib/openblas echo "BLAS_DIR = $BLAS_DIR" # Parallel Options diff --git a/step3/test/collect_result_step3.sh b/step3/test/collect_result_step3.sh index 597f6d6..5483310 100755 --- a/step3/test/collect_result_step3.sh +++ b/step3/test/collect_result_step3.sh @@ -1,2 +1,2 @@ +#!/bin/sh ./run_bl_dgemm.sh | tee step3_result.m - diff --git a/step3/test/test_bl_dgemm.c b/step3/test/test_bl_dgemm.c index 7a743f3..fb022f4 100644 --- a/step3/test/test_bl_dgemm.c +++ b/step3/test/test_bl_dgemm.c @@ -1,209 +1,128 @@ -/* - * -------------------------------------------------------------------------- - * BLISLAB - * -------------------------------------------------------------------------- - * Copyright (C) 2016, The University of Texas at Austin - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - Neither the name of The University of Texas nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * - * test_bl_dgemm.c - * - * - * Purpose: - * test driver for BLISLAB dgemm routine and reference dgemm routine. - * - * Todo: - * - * - * Modification: - * - * - * */ - - #include "bl_dgemm.h" #define USE_SET_DIFF 1 #define TOLERANCE 1E-10 void computeError( - int ldc, - int ldc_ref, - int m, - int n, - double *C, - double *C_ref - ) -{ - int i, j; - for ( i = 0; i < m; i ++ ) { - for ( j = 0; j < n; j ++ ) { - if ( fabs( C( i, j ) - C_ref( i, j ) ) > TOLERANCE ) { - printf( "C[ %d ][ %d ] != C_ref, %E, %E\n", i, j, C( i, j ), C_ref( i, j ) ); - break; - } - } + int ldc, + int ldc_ref, + int m, + int n, + double* C, + double* C_ref) { + int i, j; + for (i = 0; i < m; i++) { + for (j = 0; j < n; j++) { + if (fabs(C(i, j) - C_ref(i, j)) > TOLERANCE) { + printf("C[ %d ][ %d ] != C_ref, %E, %E\n", i, j, C(i, j), C_ref(i, j)); + break; + } } - + } } -void test_bl_dgemm( - int m, - int n, - int k - ) -{ - int i, j, p, nx; - double *A, *B, *C, *C_ref; - double tmp, error, flops; - double ref_beg, ref_time, bl_dgemm_beg, bl_dgemm_time; - int nrepeats; - int lda, ldb, ldc, ldc_ref; - double ref_rectime, bl_dgemm_rectime; - - A = (double*)malloc( sizeof(double) * m * k ); - B = (double*)malloc( sizeof(double) * k * n ); - - lda = m; - ldb = k; +void test_bl_dgemm(int m, int n, int k) { + int i, j, p, nx; + double *A, *B, *C, *C_ref; + double tmp, error, flops; + double ref_beg, ref_time, bl_dgemm_beg, bl_dgemm_time; + int nrepeats; + int lda, ldb, ldc, ldc_ref; + double ref_rectime, bl_dgemm_rectime; + + A = (double*)malloc(sizeof(double) * m * k); + B = (double*)malloc(sizeof(double) * k * n); + + lda = m; + ldb = k; #ifdef DGEMM_MR - ldc = ( ( m - 1 ) / DGEMM_MR + 1 ) * DGEMM_MR; + ldc = ((m - 1) / DGEMM_MR + 1) * DGEMM_MR; #else - ldc = m; + ldc = m; #endif - ldc_ref = m; - C = bl_malloc_aligned( ldc, n + 4, sizeof(double) ); - C_ref = (double*)malloc( sizeof(double) * m * n ); + ldc_ref = m; + C = bl_malloc_aligned(ldc, n + 4, sizeof(double)); + C_ref = (double*)malloc(sizeof(double) * m * n); - nrepeats = 3; + nrepeats = 3; - srand48 (time(NULL)); + srand48(time(NULL)); - // Randonly generate points in [ 0, 1 ]. - for ( p = 0; p < k; p ++ ) { - for ( i = 0; i < m; i ++ ) { - A( i, p ) = (double)( drand48() ); - } + // Randonly generate points in [ 0, 1 ]. 
+ for (p = 0; p < k; p++) { + for (i = 0; i < m; i++) { + A(i, p) = (double)(drand48()); } - for ( j = 0; j < n; j ++ ) { - for ( p = 0; p < k; p ++ ) { - B( p, j ) = (double)( drand48() ); - } + } + for (j = 0; j < n; j++) { + for (p = 0; p < k; p++) { + B(p, j) = (double)(drand48()); } + } - for ( j = 0; j < n; j ++ ) { - for ( i = 0; i < m; i ++ ) { - C_ref( i, j ) = (double)( 0.0 ); - C( i, j ) = (double)( 0.0 ); - } + for (j = 0; j < n; j++) { + for (i = 0; i < m; i++) { + C_ref(i, j) = (double)(0.0); + C(i, j) = (double)(0.0); } - - for ( i = 0; i < nrepeats; i ++ ) { - bl_dgemm_beg = bl_clock(); - { - bl_dgemm( - m, - n, - k, - A, - lda, - B, - ldb, - C, - ldc - ); - } - bl_dgemm_time = bl_clock() - bl_dgemm_beg; - - if ( i == 0 ) { - bl_dgemm_rectime = bl_dgemm_time; - } else { - bl_dgemm_rectime = bl_dgemm_time < bl_dgemm_rectime ? bl_dgemm_time : bl_dgemm_rectime; - } + } + + for (i = 0; i < nrepeats; i++) { + bl_dgemm_beg = bl_clock(); + { bl_dgemm(m, n, k, A, lda, B, ldb, C, ldc); } + bl_dgemm_time = bl_clock() - bl_dgemm_beg; + + if (i == 0) { + bl_dgemm_rectime = bl_dgemm_time; + } else { + bl_dgemm_rectime = + bl_dgemm_time < bl_dgemm_rectime ? bl_dgemm_time : bl_dgemm_rectime; } + } - for ( i = 0; i < nrepeats; i ++ ) { - ref_beg = bl_clock(); - { - bl_dgemm_ref( - m, - n, - k, - A, - lda, - B, - ldb, - C_ref, - ldc_ref - ); - } - ref_time = bl_clock() - ref_beg; - - if ( i == 0 ) { - ref_rectime = ref_time; - } else { - ref_rectime = ref_time < ref_rectime ? ref_time : ref_rectime; - } + for (i = 0; i < nrepeats; i++) { + ref_beg = bl_clock(); + { bl_dgemm_ref(m, n, k, A, lda, B, ldb, C_ref, ldc_ref); } + ref_time = bl_clock() - ref_beg; + + if (i == 0) { + ref_rectime = ref_time; + } else { + ref_rectime = ref_time < ref_rectime ? ref_time : ref_rectime; } + } + + computeError(ldc, ldc_ref, m, n, C, C_ref); + + // Compute overall floating point operations. + flops = (m * n / (1000.0 * 1000.0 * 1000.0)) * (2 * k); - computeError( - ldc, - ldc_ref, - m, - n, - C, - C_ref - ); - - // Compute overall floating point operations. - flops = ( m * n / ( 1000.0 * 1000.0 * 1000.0 ) ) * ( 2 * k ); - - printf( "%5d\t %5d\t %5d\t %5.2lf\t %5.2lf\n", - m, n, k, flops / bl_dgemm_rectime, flops / ref_rectime ); - - free( A ); - free( B ); - free( C ); - free( C_ref ); + printf( + "%5d\t %5d\t %5d\t %5.2lf\t %5.2lf\n", + m, + n, + k, + flops / bl_dgemm_rectime, + flops / ref_rectime); + + free(A); + free(B); + free(C); + free(C_ref); } -int main( int argc, char *argv[] ) -{ - int m, n, k; +int main(int argc, char* argv[]) { + int m, n, k; - if ( argc != 4 ) { - printf( "Error: require 3 arguments, but only %d provided.\n", argc - 1 ); - exit( 0 ); - } + if (argc != 4) { + printf("Error: require 3 arguments, but only %d provided.\n", argc - 1); + exit(0); + } - sscanf( argv[ 1 ], "%d", &m ); - sscanf( argv[ 2 ], "%d", &n ); - sscanf( argv[ 3 ], "%d", &k ); + sscanf(argv[1], "%d", &m); + sscanf(argv[2], "%d", &n); + sscanf(argv[3], "%d", &k); - test_bl_dgemm( m, n, k ); + test_bl_dgemm(m, n, k); - return 0; + return 0; } -
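Editor's note (not part of the patch): the largest hunk above is the store-back phase of the 8x4 AVX micro-kernel. After the rank-k accumulators are unshuffled with _mm256_blend_pd / _mm256_permute2f128_pd, each 4-element column of the C micro-tile is updated as C := beta*C + alpha*(A*B), using one 256-bit load/store when rs_c == 1 (contiguous column) and scalar gather/scatter otherwise. The stand-alone sketch below illustrates only the contiguous case; the helper name update_column_contiguous is hypothetical, it takes alpha/beta by value rather than by pointer as the kernel does, and it uses unaligned loads/stores so it does not depend on the 32-byte alignment that the kernel's _mm256_load_pd/_mm256_store_pd calls assume. Compile with -mavx.

#include <immintrin.h>
#include <stdio.h>

/* Hypothetical helper: c_col points at one contiguous 4-element column of C
 * (the rs_c == 1 case); acc holds the four accumulated A*B products for that
 * column. Performs c_col[0..3] = beta*c_col[0..3] + alpha*acc[0..3]. */
static void update_column_contiguous(double* c_col, __m256d acc,
                                     double alpha, double beta) {
  __m256d valpha = _mm256_set1_pd(alpha);       /* broadcast alpha to 4 lanes */
  __m256d vbeta  = _mm256_set1_pd(beta);        /* broadcast beta to 4 lanes  */
  __m256d vc     = _mm256_loadu_pd(c_col);      /* load the existing C column */
  __m256d vab    = _mm256_mul_pd(valpha, acc);  /* alpha * (A*B)              */
  vc = _mm256_mul_pd(vbeta, vc);                /* beta * C                   */
  vc = _mm256_add_pd(vc, vab);                  /* beta*C + alpha*(A*B)       */
  _mm256_storeu_pd(c_col, vc);                  /* write the column back      */
}

int main(void) {
  double c[4] = {1.0, 1.0, 1.0, 1.0};
  /* _mm256_set_pd lists elements from lane 3 down to lane 0. */
  __m256d acc = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
  update_column_contiguous(c, acc, 1.0, 1.0);        /* C := 1*C + 1*acc */
  printf("%g %g %g %g\n", c[0], c[1], c[2], c[3]);   /* prints 2 3 4 5   */
  return 0;
}

In the strided (rs_c != 1) branch of the hunk the arithmetic is identical, but the load becomes an _mm256_set_pd gather of four strided elements of C and the store becomes two _mm256_extractf128_pd extractions followed by _mm_storel_pd/_mm_storeh_pd scatters, which is why that branch leaves the commented-out _mm256_store_pd calls in place.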