矩阵乘法优化:4x4矩阵块优化方法

news/2025/2/13 19:06:09/

MMult_4x4_3.h

一次计算C中的4x4小块

0.24gflops

2.1%

1

MMult_4x4_4.h

一次计算C中的4x4小块

0.24gflops

2.1%

1

MMult_4x4_5.h

一次计算C中的4x4小块,将16个循环合并一个

0.25gflops

2.2%

1

MMult_4x4_6.h

一次计算C中的4x4小块(我们在寄存器中累加C的元素,并对a的元素使用寄存器)

1.75gflops

16.0%

1

MMult_4x4_7.h

在MMult_4x4_6的基础上用指针来寻址B中的元素

1.75gflops

16.0%

1

MMult_4x4_8.h

使用更多的寄存器

1.75gflops

16.0%

1

MMult_4x4_10.h

NEON指令集优化

2.6gflops

23.8%

1

MMult_4x4_11.h

NEON指令集优化, 并且为了保持较小问题规模所获得的性能,我们分块矩阵C(以及相应的A和B)

2.6gflops

23.8%

1

MMult_4x4_13.h

NEON指令集优化, 对矩阵A和B进行Pack,这样就可以连续访问内存

2.6gflops

23.8%

1

MMult_4x4_3:

一次计算C矩阵的16个元素:

void AddDot( int k, float *x, int incx,  float *y, float *gamma )
{int p;for ( p=0; p<k; p++ ){*gamma += x[p] * Y(p);     }
}void AddDot4x4( int k, float *a, int lda,  float *b, int ldb, float *c, int ldc )
{/* First row */AddDot( k, &A( 0, 0 ), lda, &B( 0, 0 ), &C( 0, 0 ) );AddDot( k, &A( 0, 0 ), lda, &B( 0, 1 ), &C( 0, 1 ) );AddDot( k, &A( 0, 0 ), lda, &B( 0, 2 ), &C( 0, 2 ) );AddDot( k, &A( 0, 0 ), lda, &B( 0, 3 ), &C( 0, 3 ) );/* Second row */AddDot( k, &A( 1, 0 ), lda, &B( 0, 0 ), &C( 1, 0 ) );AddDot( k, &A( 1, 0 ), lda, &B( 0, 1 ), &C( 1, 1 ) );AddDot( k, &A( 1, 0 ), lda, &B( 0, 2 ), &C( 1, 2 ) );AddDot( k, &A( 1, 0 ), lda, &B( 0, 3 ), &C( 1, 3 ) );/* Third row */AddDot( k, &A( 2, 0 ), lda, &B( 0, 0 ), &C( 2, 0 ) );AddDot( k, &A( 2, 0 ), lda, &B( 0, 1 ), &C( 2, 1 ) );AddDot( k, &A( 2, 0 ), lda, &B( 0, 2 ), &C( 2, 2 ) );AddDot( k, &A( 2, 0 ), lda, &B( 0, 3 ), &C( 2, 3 ) );/* Four row */AddDot( k, &A( 3, 0 ), lda, &B( 0, 0 ), &C( 3, 0 ) );AddDot( k, &A( 3, 0 ), lda, &B( 0, 1 ), &C( 3, 1 ) );AddDot( k, &A( 3, 0 ), lda, &B( 0, 2 ), &C( 3, 2 ) );AddDot( k, &A( 3, 0 ), lda, &B( 0, 3 ), &C( 3, 3 ) );
}void MY_MMult_4x4_3( int m, int n, int k, float *a, int lda, float *b, int ldb,float *c, int ldc )
{int i, j;for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );}}
}

MMult_4x4_4:

将上一步的矩阵元素直接乘加计算,合并成循环操作。将两个计算函数合并成一个。并显式地将C矩阵中各个索引的元素作为计算结果:

void AddDot4x4( int k, float *a, int lda,  float *b, int ldb, float *c, int ldc )
{int p;/* First row *///  AddDot( k, &A( 0, 0 ), lda, &B( 0, 0 ), &C( 0, 0 ) );for ( p=0; p<k; p++ ){C( 0, 0 ) += A( 0, p ) * B( p, 0 );     }//  AddDot( k, &A( 0, 0 ), lda, &B( 0, 1 ), &C( 0, 1 ) );for ( p=0; p<k; p++ ){C( 0, 1 ) += A( 0, p ) * B( p, 1 );     }//  AddDot( k, &A( 0, 0 ), lda, &B( 0, 2 ), &C( 0, 2 ) );for ( p=0; p<k; p++ ){C( 0, 2 ) += A( 0, p ) * B( p, 2 );     }//  AddDot( k, &A( 0, 0 ), lda, &B( 0, 3 ), &C( 0, 3 ) );for ( p=0; p<k; p++ ){C( 0, 3 ) += A( 0, p ) * B( p, 3 );     }/* Second row *///  AddDot( k, &A( 1, 0 ), lda, &B( 0, 0 ), &C( 1, 0 ) );for ( p=0; p<k; p++ ){C( 1, 0 ) += A( 1, p ) * B( p, 0 );     }//  AddDot( k, &A( 1, 0 ), lda, &B( 0, 1 ), &C( 1, 1 ) );for ( p=0; p<k; p++ ){C( 1, 1 ) += A( 1, p ) * B( p, 1 );     }//  AddDot( k, &A( 1, 0 ), lda, &B( 0, 2 ), &C( 1, 2 ) );for ( p=0; p<k; p++ ){C( 1, 2 ) += A( 1, p ) * B( p, 2 );     }//  AddDot( k, &A( 1, 0 ), lda, &B( 0, 3 ), &C( 1, 3 ) );for ( p=0; p<k; p++ ){C( 1, 3 ) += A( 1, p ) * B( p, 3 );     }/* Third row *///  AddDot( k, &A( 2, 0 ), lda, &B( 0, 0 ), &C( 2, 0 ) );for ( p=0; p<k; p++ ){C( 2, 0 ) += A( 2, p ) * B( p, 0 );     }//  AddDot( k, &A( 2, 0 ), lda, &B( 0, 1 ), &C( 2, 1 ) );for ( p=0; p<k; p++ ){C( 2, 1 ) += A( 2, p ) * B( p, 1 );     }//  AddDot( k, &A( 2, 0 ), lda, &B( 0, 2 ), &C( 2, 2 ) );for ( p=0; p<k; p++ ){C( 2, 2 ) += A( 2, p ) * B( p, 2 );     }//  AddDot( k, &A( 2, 0 ), lda, &B( 0, 3 ), &C( 2, 3 ) );for ( p=0; p<k; p++ ){C( 2, 3 ) += A( 2, p ) * B( p, 3 );     }/* Four row *///  AddDot( k, &A( 3, 0 ), lda, &B( 0, 0 ), &C( 3, 0 ) );for ( p=0; p<k; p++ ){C( 3, 0 ) += A( 3, p ) * B( p, 0 );     }//  AddDot( k, &A( 3, 0 ), lda, &B( 0, 1 ), &C( 3, 1 ) );for ( p=0; p<k; p++ ){C( 3, 1 ) += A( 3, p ) * B( p, 1 );     }//  AddDot( k, &A( 3, 0 ), lda, &B( 0, 2 ), &C( 3, 2 ) );for ( p=0; p<k; p++ ){C( 3, 2 ) += A( 3, p ) * B( p, 2 );     }//  AddDot( k, &A( 3, 0 ), lda, &B( 0, 3 ), &C( 3, 3 ) );for ( p=0; p<k; p++ ){C( 3, 3 ) += A( 3, p ) * B( p, 3 );     }
}void MY_MMult_4x4_4( int m, int n, int k, float *a, int lda, float *b, int ldb,float *c, int ldc )
{int i, j;for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );}}
}

MMult_4x4_5:

上一步一次for循环计算1个C中元素,这一步一次for循环计算4个C中元素:

void AddDot4x4( int k, float *a, int lda,  float *b, int ldb, float *c, int ldc )
{int p;for ( p=0; p<k; p++ ){/* First row */C( 0, 0 ) += A( 0, p ) * B( p, 0 );     C( 0, 1 ) += A( 0, p ) * B( p, 1 );     C( 0, 2 ) += A( 0, p ) * B( p, 2 );     C( 0, 3 ) += A( 0, p ) * B( p, 3 );     /* Second row */C( 1, 0 ) += A( 1, p ) * B( p, 0 );     C( 1, 1 ) += A( 1, p ) * B( p, 1 );     C( 1, 2 ) += A( 1, p ) * B( p, 2 );     C( 1, 3 ) += A( 1, p ) * B( p, 3 );     /* Third row */C( 2, 0 ) += A( 2, p ) * B( p, 0 );     C( 2, 1 ) += A( 2, p ) * B( p, 1 );     C( 2, 2 ) += A( 2, p ) * B( p, 2 );     C( 2, 3 ) += A( 2, p ) * B( p, 3 );     /* Fourth row */C( 3, 0 ) += A( 3, p ) * B( p, 0 );     C( 3, 1 ) += A( 3, p ) * B( p, 1 );     C( 3, 2 ) += A( 3, p ) * B( p, 2 );     C( 3, 3 ) += A( 3, p ) * B( p, 3 );}     
}void MY_MMult_4x4_5( int m, int n, int k, float *a, int lda, float *b, int ldb,float *c, int ldc )
{int i, j;for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );}}
}

MMult_4x4_6:

将A和C矩阵中元素送入寄存器来优化计算。每次只计算1个C中元素:

void AddDot4x4( int k, float *a, int lda,  float *b, int ldb, float *c, int ldc )
{int p;register float c_00_reg,   c_01_reg,   c_02_reg,   c_03_reg,  c_10_reg,   c_11_reg,   c_12_reg,   c_13_reg,  c_20_reg,   c_21_reg,   c_22_reg,   c_23_reg,  c_30_reg,   c_31_reg,   c_32_reg,   c_33_reg,a_0p_reg,a_1p_reg,a_2p_reg,a_3p_reg;c_00_reg = 0.0;   c_01_reg = 0.0;   c_02_reg = 0.0;   c_03_reg = 0.0;c_10_reg = 0.0;   c_11_reg = 0.0;   c_12_reg = 0.0;   c_13_reg = 0.0;c_20_reg = 0.0;   c_21_reg = 0.0;   c_22_reg = 0.0;   c_23_reg = 0.0;c_30_reg = 0.0;   c_31_reg = 0.0;   c_32_reg = 0.0;   c_33_reg = 0.0;for ( p=0; p<k; p++ ){a_0p_reg = A( 0, p );a_1p_reg = A( 1, p );a_2p_reg = A( 2, p );a_3p_reg = A( 3, p );/* First row */c_00_reg += a_0p_reg * B( p, 0 );     c_01_reg += a_0p_reg * B( p, 1 );     c_02_reg += a_0p_reg * B( p, 2 );     c_03_reg += a_0p_reg * B( p, 3 );     /* Second row */c_10_reg += a_1p_reg * B( p, 0 );     c_11_reg += a_1p_reg * B( p, 1 );     c_12_reg += a_1p_reg * B( p, 2 );     c_13_reg += a_1p_reg * B( p, 3 );     /* Third row */c_20_reg += a_2p_reg * B( p, 0 );     c_21_reg += a_2p_reg * B( p, 1 );     c_22_reg += a_2p_reg * B( p, 2 );     c_23_reg += a_2p_reg * B( p, 3 );     /* Four row */c_30_reg += a_3p_reg * B( p, 0 );     c_31_reg += a_3p_reg * B( p, 1 );     c_32_reg += a_3p_reg * B( p, 2 );     c_33_reg += a_3p_reg * B( p, 3 );     }C( 0, 0 ) += c_00_reg;   C( 0, 1 ) += c_01_reg;   C( 0, 2 ) += c_02_reg;   C( 0, 3 ) += c_03_reg;C( 1, 0 ) += c_10_reg;   C( 1, 1 ) += c_11_reg;   C( 1, 2 ) += c_12_reg;   C( 1, 3 ) += c_13_reg;C( 2, 0 ) += c_20_reg;   C( 2, 1 ) += c_21_reg;   C( 2, 2 ) += c_22_reg;   C( 2, 3 ) += c_23_reg;C( 3, 0 ) += c_30_reg;   C( 3, 1 ) += c_31_reg;   C( 3, 2 ) += c_32_reg;   C( 3, 3 ) += c_33_reg;
}void MY_MMult_4x4_6( int m, int n, int k, float *a, int lda, float *b, int ldb,float *c, int ldc )
{int i, j;for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );}}
}

MMult_4x4_7:

将A和C矩阵中元素送入寄存器,并用指针来索引B矩阵中元素,使用索引++来获取B矩阵中下一个元素:

void AddDot4x4( int k, float *a, int lda,  float *b, int ldb, float *c, int ldc )
{int p;register float c_00_reg,   c_01_reg,   c_02_reg,   c_03_reg,  c_10_reg,   c_11_reg,   c_12_reg,   c_13_reg,  c_20_reg,   c_21_reg,   c_22_reg,   c_23_reg,  c_30_reg,   c_31_reg,   c_32_reg,   c_33_reg,a_0p_reg,a_1p_reg,a_2p_reg,a_3p_reg;float /* Point to the current elements in the four columns of B */*b_p0_pntr, *b_p1_pntr, *b_p2_pntr, *b_p3_pntr; c_00_reg = 0.0;   c_01_reg = 0.0;   c_02_reg = 0.0;   c_03_reg = 0.0;c_10_reg = 0.0;   c_11_reg = 0.0;   c_12_reg = 0.0;   c_13_reg = 0.0;c_20_reg = 0.0;   c_21_reg = 0.0;   c_22_reg = 0.0;   c_23_reg = 0.0;c_30_reg = 0.0;   c_31_reg = 0.0;   c_32_reg = 0.0;   c_33_reg = 0.0;for ( p=0; p<k; p++ ){a_0p_reg = A( 0, p );a_1p_reg = A( 1, p );a_2p_reg = A( 2, p );a_3p_reg = A( 3, p );b_p0_pntr = &B( p, 0 );b_p1_pntr = &B( p, 1 );b_p2_pntr = &B( p, 2 );b_p3_pntr = &B( p, 3 );	  /* First row */c_00_reg += a_0p_reg * *b_p0_pntr;     c_01_reg += a_0p_reg * *b_p1_pntr;     c_02_reg += a_0p_reg * *b_p2_pntr;     c_03_reg += a_0p_reg * *b_p3_pntr;     /* Second row */c_10_reg += a_1p_reg * *b_p0_pntr;     c_11_reg += a_1p_reg * *b_p1_pntr;     c_12_reg += a_1p_reg * *b_p2_pntr;     c_13_reg += a_1p_reg * *b_p3_pntr;     /* Third row */c_20_reg += a_2p_reg * *b_p0_pntr;     c_21_reg += a_2p_reg * *b_p1_pntr;     c_22_reg += a_2p_reg * *b_p2_pntr;     c_23_reg += a_2p_reg * *b_p3_pntr;     /* Four row */c_30_reg += a_3p_reg * *b_p0_pntr++;     c_31_reg += a_3p_reg * *b_p1_pntr++;     c_32_reg += a_3p_reg * *b_p2_pntr++;     c_33_reg += a_3p_reg * *b_p3_pntr++;     }C( 0, 0 ) += c_00_reg;   C( 0, 1 ) += c_01_reg;   C( 0, 2 ) += c_02_reg;   C( 0, 3 ) += c_03_reg;C( 1, 0 ) += c_10_reg;   C( 1, 1 ) += c_11_reg;   C( 1, 2 ) += c_12_reg;   C( 1, 3 ) += c_13_reg;C( 2, 0 ) += c_20_reg;   C( 2, 1 ) += c_21_reg;   C( 2, 2 ) += c_22_reg;   C( 2, 3 ) += c_23_reg;C( 3, 0 ) += c_30_reg;   C( 3, 1 ) += c_31_reg;   C( 3, 2 ) += c_32_reg;   C( 3, 3 ) += c_33_reg;
}void MY_MMult_4x4_7( int m, int n, int k, float *a, int lda, float *b, int ldb,float *c, int ldc )
{int i, j;for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );}}
}

MMult_4x4_8:

将A,B,C矩阵中元素都送入寄存器中来加速计算:

void AddDot4x4( int k, float *a, int lda,  float *b, int ldb, float *c, int ldc )
{int p;c_00_reg,   c_01_reg,   c_02_reg,   c_03_reg,  c_10_reg,   c_11_reg,   c_12_reg,   c_13_reg,  c_20_reg,   c_21_reg,   c_22_reg,   c_23_reg,  c_30_reg,   c_31_reg,   c_32_reg,   c_33_reg,a_0p_reg,a_1p_reg,a_2p_reg,a_3p_reg,b_p0_reg,b_p1_reg,b_p2_reg,b_p3_reg;float /* Point to the current elements in the four rows of A */*a_0p_pntr, *a_1p_pntr, *a_2p_pntr, *a_3p_pntr;a_0p_pntr = &A( 0, 0);a_1p_pntr = &A( 1, 0);a_2p_pntr = &A( 2, 0);a_3p_pntr = &A( 3, 0);c_00_reg = 0.0;   c_01_reg = 0.0;   c_02_reg = 0.0;   c_03_reg = 0.0;c_10_reg = 0.0;   c_11_reg = 0.0;   c_12_reg = 0.0;   c_13_reg = 0.0;c_20_reg = 0.0;   c_21_reg = 0.0;   c_22_reg = 0.0;   c_23_reg = 0.0;c_30_reg = 0.0;   c_31_reg = 0.0;   c_32_reg = 0.0;   c_33_reg = 0.0;for ( p=0; p<k; p++ ){a_0p_reg = *a_0p_pntr++;a_1p_reg = *a_1p_pntr++;a_2p_reg = *a_2p_pntr++;a_3p_reg = *a_3p_pntr++;b_p0_reg = B( p, 0);b_p1_reg = B( p, 1);b_p2_reg = B( p, 2);b_p3_reg = B( p, 3);/* First row */c_00_reg += a_0p_reg * b_p0_reg;c_01_reg += a_0p_reg * b_p1_reg;c_02_reg += a_0p_reg * b_p2_reg;c_03_reg += a_0p_reg * b_p3_reg;/* Second row */c_10_reg += a_1p_reg * b_p0_reg;c_11_reg += a_1p_reg * b_p1_reg;c_12_reg += a_1p_reg * b_p2_reg;c_13_reg += a_1p_reg * b_p3_reg;/* Third row */c_20_reg += a_2p_reg * b_p0_reg;c_21_reg += a_2p_reg * b_p1_reg;c_22_reg += a_2p_reg * b_p2_reg;c_23_reg += a_2p_reg * b_p3_reg;/* Four row */c_30_reg += a_3p_reg * b_p0_reg;c_31_reg += a_3p_reg * b_p1_reg;c_32_reg += a_3p_reg * b_p2_reg;c_33_reg += a_3p_reg * b_p3_reg;}C( 0, 0 ) += c_00_reg;   C( 0, 1 ) += c_01_reg;   C( 0, 2 ) += c_02_reg;   C( 0, 3 ) += c_03_reg;C( 1, 0 ) += c_10_reg;   C( 1, 1 ) += c_11_reg;   C( 1, 2 ) += c_12_reg;   C( 1, 3 ) += c_13_reg;C( 2, 0 ) += c_20_reg;   C( 2, 1 ) += c_21_reg;   C( 2, 2 ) += c_22_reg;   C( 2, 3 ) += c_23_reg;C( 3, 0 ) += c_30_reg;   C( 3, 1 ) += c_31_reg;   C( 3, 2 ) += c_32_reg;   C( 3, 3 ) += c_33_reg;
}void MY_MMult_4x4_8( int m, int n, int k, float *a, int lda, float *b, int ldb,float *c, int ldc )
{int i, j;for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */for ( i=0; i<m; i+=4 ){        /* Loop over the rows of C */AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );}}
}


http://www.ppmy.cn/news/837863.html

相关文章

js逆向补环境-调试工具vscode与nodejs使用之无环境联调

目录 一、啊哈一、Nodejs安装1、nodejs最新版本的安装&#xff08;windows&#xff09;2、旧版nodejs更新成最新版本&#xff08;windows&#xff09;3、nodejs安装&#xff08;linux&#xff09; 二、vscode安装使用(windows)1、下载安装vscode2、vscode运行插件Code Runner安…

qq授权一

至此&#xff0c;创建应用成功

qq授权

OAuth2.0&#xff1a; 基本授权的开发网络协议&#xff0c;允许用户在第三方调用自己在其他网站上的信息如图片个人列表等。 基本原理&#xff1a; 在第三方应用和服务提供商之间有一个中间应用层&#xff0c;由中间应用层来通过token来进行协调。 授权qq登录流程&#xff1a; …

腾讯qq的授权管理查看页面

链接&#xff1a;https://connect.qq.com/manage.html#/appauth/user?fromauth 用手机QQ扫描弹出的二维码后&#xff0c;即可进入下列页面&#xff0c;查看所有实用腾讯QQ app授权过的网站和应用&#xff1a; 要获取更多Jerry的原创文章&#xff0c;请关注公众号"汪子熙…

qq授权二

1、腾讯QQ提供授权码---给应用网站 a、需要生成授权码链接&#xff0c;授权成功之后&#xff0c;使用授权码获取accessToken 参考网址 eg&#xff1a; https://graph.qq.com/oauth2.0/authorize?response_typecode&client_id101420900&redirect_urihttps://accoun…

实现app第三方QQ授权登录

关于QQ授权登录的实现&#xff0c;官网有详细描述&#xff0c;点击链接 。 这里根据自己的实现过程简单介绍一下整个过程&#xff1a; 1. 注册成为开发者&#xff0c;创建应用&#xff0c;获取appid&#xff0c;这个appid后面要用到。详情步骤请查看官网链接 2. 在你的Android工…

QQ授权登录和微信授权登录

官方文档&#xff1a; http://wiki.open.qq.com/wiki/%E3%80%90QQ%E7%99%BB%E5%BD%95%E3%80%91%E6%94%BE%E7%BD%AE%E2%80%9CQQ%E7%99%BB%E5%BD%95%E2%80%9D%E6%8C%89%E9%92%AE https://open.weixin.qq.com/cgi-bin/showdocument?actiondir_list&tresource/res_list&ve…

网站授权QQ登录

网站授权QQ登录 功能流程 用户通过QQ扫码就登录了咱们的网站&#xff0c;B格拉高了&#xff0c;减少了输入密码的烦恼 申请流程 首先需要去QQ互联申请应用 填写网站的相关信息&#xff0c;以及回调地址&#xff0c;需要进行审核。 申请流程暂时不说了&#xff0c;百度一下…