/* * trans.c - Matrix transpose B = A^T * * Each transpose function must have a prototype of the form: * void trans(int M, int N, int A[N][M], int B[M][N]); * * A transpose function is evaluated by counting the number of misses * on a 1KB direct mapped cache with a block size of 32 bytes. */ #include #include "cachelab.h" #define BLOCK_SIZE 8 #define SUB_BLOCK_SIZE 4 int is_transpose(int M, int N, int A[N][M], int B[M][N]); /* * transpose_submit - This is the solution transpose function that you * will be graded on for Part B of the assignment. Do not change * the description string "Transpose submission", as the driver * searches for that string to identify the transpose function to * be graded. */ char transpose_submit_desc[] = "Transpose submission"; void transpose_submit(int M, int N, int A[N][M], int B[M][N]) { //BLOCK_SIZE是8,代表每个块的int数目:8,也代表我处理的基本单元:8*8矩阵 //用于循环的变量x,y,i int i, x, y; //用于读取数据的8个变量 int a0,a1,a2,a3,a4,a5,a6,a7; //情况1:32*32或者48*48 if(M == 32 || M == 48) { for(x = 0; x < (N/BLOCK_SIZE); x ++) { for(y = 0; y < (M/BLOCK_SIZE); y ++) { //遍历每个8*8的子块 for(i = 0; i < BLOCK_SIZE; i ++) { //读取A中对应行的数据 a0 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 0]; a1 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 1]; a2 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 2]; a3 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 3]; a4 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 4]; a5 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 5]; a6 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 6]; a7 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 7]; //写入到B中对应列 B[BLOCK_SIZE*y + 0][BLOCK_SIZE*x + i]= a0; B[BLOCK_SIZE*y + 1][BLOCK_SIZE*x + i]= a1; B[BLOCK_SIZE*y + 2][BLOCK_SIZE*x + i]= a2; B[BLOCK_SIZE*y + 3][BLOCK_SIZE*x + i]= a3; B[BLOCK_SIZE*y + 4][BLOCK_SIZE*x + i]= a4; B[BLOCK_SIZE*y + 5][BLOCK_SIZE*x + i]= a5; B[BLOCK_SIZE*y + 6][BLOCK_SIZE*x + i]= a6; B[BLOCK_SIZE*y + 7][BLOCK_SIZE*x + i]= a7; } } } } //情况2:64*64 else if(M == 64) { //遍历每个8*8的子块 for(x = 0; x < 8; x ++) { for(y = 0; y < 8; y ++) { //处理A矩阵的前四行 for(i = 0; i < SUB_BLOCK_SIZE; i ++) { //读入一行8个数据 a0 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 0]; a1 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 1]; a2 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 2]; a3 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 3]; a4 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 4]; a5 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 5]; a6 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 6]; a7 = A[BLOCK_SIZE*x + i][BLOCK_SIZE*y + 7]; //前四个数据写到B11的正确位置(左上角) B[BLOCK_SIZE*y + 0][BLOCK_SIZE*x + i]= a0; B[BLOCK_SIZE*y + 1][BLOCK_SIZE*x + i]= a1; B[BLOCK_SIZE*y + 2][BLOCK_SIZE*x + i]= a2; B[BLOCK_SIZE*y + 3][BLOCK_SIZE*x + i]= a3; //后四个数据本应该写到B21(左下角),为了减少miss,我把它们写到B12(右上角)的对应位置 B[BLOCK_SIZE*y + 0][BLOCK_SIZE*x + i + 4]= a4; B[BLOCK_SIZE*y + 1][BLOCK_SIZE*x + i + 4]= a5; B[BLOCK_SIZE*y + 2][BLOCK_SIZE*x + i + 4]= a6; B[BLOCK_SIZE*y + 3][BLOCK_SIZE*x + i + 4]= a7; } //处理A矩阵的后四行,这次是按列读取数据,每次先读第i列,再读i+4列 for(i = 0; i