Cache Simulator
Cache Simulator
N == 64) { transpose_6464(M,N,A,B); } else { transpose_6167(M,N,A,B); } ENSURES(is_transpose(M, N, A, B)); } char transpose_6464_desc[] = "64x64"; void transpose_6464(int M, int N, int A[N][M], int B[M][N]) { int i,j,ai,aj,k; //loop variables int t0,t1,t2,t3; //temp variables int blockSize = 8; for(i = 0; i < N; i+=blockSize) { for(j = 0; j < M; j+=blockSize) { //split each block into 4 segments: /*a|b c|d*/ //and we want to transpose to: /*a-1|c-1 b-1|d-1*/ //keep in mind we still need to transpose //inside each segment (notice the -1 notation) //segment a aj = j; for(ai = i; ai < i+(blockSize/2); ai++) { //store A 1 x blockSize/2 into variables t0 = A[ai][aj]; t1 = A[ai][aj+1]; t2 = A[ai][aj+2]; t3 = A[ai][aj+3]; /*move into B, this way you keep A in the cache and don't loose it after the first few mov es from A to B*/ B[aj][ai] = B[aj+1][ai] B[aj+2][ai] B[aj+3][ai] } //segment b aj = j+(blockSize/2); for(ai = i; ai < i+(blockSize/2); ai++) { //store A 1 x blockSize/2 into variables t0 = A[ai][aj]; t1 = A[ai][aj+1]; t2 = A[ai][aj+2]; t3 = A[ai][aj+3]; /*move into B, this way you keep A in the t0; = t1; = t2; = t3;
cache and don't loose it after the first few mov es from A to B*/ B[aj-(blockSize/2)][ai+(blockSize/2)] = B[aj-(blockSize/2)+1][ai+(blockSize/2)] B[aj-(blockSize/2)+2][ai+(blockSize/2)] B[aj-(blockSize/2)+3][ai+(blockSize/2)] } /*Notice that above we moved segment b into the upper ri ght corner of the block, This is because later when we move segment b into the up per right corner, we will also move segment b into the bottom left corner, keep in mind b is already transposed just not in the cor rect place, but since segment b is already transposed we move it int o the bottom right corner row by row and segment c column by column from matrix A, so this wi ll transpose c while we are moving it into the top right corner of the block in matrix B*/ /*This is better because we won't be jumping around if w e were to move segment b into the bottom right corner of the block in matrix B and then mo ving c into the top right corner of the block in matrix B*/ //segment b & c for(k = 0; k < (blockSize/2); k++) { //store 1 x blockSize/2 into temp variables t0 = B[j+k][i+(blockSize/2)]; t1 = B[j+k][i+(blockSize/2)+1]; t2 = B[j+k][i+(blockSize/2)+2]; t3 = B[j+k][i+(blockSize/2)+3]; //move C into the upper right corner of block in B while transposing it B[j+k][i+(blockSize/2)] = A[i+(blockSize/2)][j+k ]; B[j+k][i+(blockSize/2)+1] = A[i+(blockSize/2)+1] [j+k]; B[j+k][i+(blockSize/2)+2] = A[i+(blockSize/2)+2] [j+k]; B[j+k][i+(blockSize/2)+3] = A[i+(blockSize/2)+3] [j+k]; //move the temp variables into the bottom left c orner, without transposing it B[j+(blockSize/2)+k][i] = B[j+(blockSize/2)+k][i+1] B[j+(blockSize/2)+k][i+2] B[j+(blockSize/2)+k][i+3] } //segment d aj = j+(blockSize/2); for(ai = i+(blockSize/2); ai < i+blockSize; ai++) { //store A 1 x blockSize/2 into variables t0 = A[ai][aj]; t1 = A[ai][aj+1]; t0; = t1; = t2; = t3; t0; = t1; = t2; = t3;
t2 = A[ai][aj+2]; t3 = A[ai][aj+3]; /*move into B, this way you keep A in the cache and don't loose it after the first few mov es from A to B*/ B[aj][ai] = B[aj+1][ai] B[aj+2][ai] B[aj+3][ai] } } } } char transpose_3232_desc[] = "32x32"; void transpose_3232(int M, int N, int A[N][M], int B[M][N]) { int dn, m, n; //loop variables int t0, t1, t2, t3, t4, t5, t6, t7; //temp variables int blockSize = 8; for(n = 0; n < N; n += blockSize) { for(m = 0; m < M; m += blockSize) { for(dn = n; dn < n+blockSize; dn++) { //store A 1 x blockSize into variables t0 = A[dn][m]; t1 = A[dn][m+1]; t2 = A[dn][m+2]; t3 = A[dn][m+3]; t4 = A[dn][m+4]; t5 = A[dn][m+5]; t6 = A[dn][m+6]; t7 = A[dn][m+7]; /*move into B, this way you keep A in the cache and don't loose it after the first few mov es from A to B*/ B[m][dn] = B[m+1][dn] B[m+2][dn] B[m+3][dn] B[m+4][dn] B[m+5][dn] B[m+6][dn] B[m+7][dn] } } } } char transpose_6167_desc[] = "61x67"; void transpose_6167(int M, int N, int A[N][M], int B[M][N]) { int dn, m, n; //loop variables int t0, t1, t2, t3, t4, t5, t6, t7; //temp variables int blockSize = 8; //oversize N and M to the nearest multiple of the blockSize t0; = t1; = t2; = t3; = t4; = t5; = t6; = t7; t0; = t1; = t2; = t3;
int overN = N+(blockSize-(N%blockSize));
int overM = M+(blockSize-(M%blockSize));
/*
 * Let the blocking "overflow" past the matrix edges and keep stepping by
 * blockSize.  overN/overM are the next multiple of blockSize strictly above
 * N/M (a whole extra block when the size is already a multiple); every
 * access below is individually bounds-checked, so the overflow region is
 * simply skipped.
 */
for(n = 0; n < overN; n += blockSize) {
    for(m = 0; m < overM; m += blockSize) {
        for(dn = n; dn < n+blockSize; dn++) {
            if(dn < N) {
                //load up to 1 x blockSize elements of row dn of A,
                //making sure nothing out of range is accessed
                if(m < M)   {t0 = A[dn][m];}
                if(m+1 < M) {t1 = A[dn][m+1];}
                if(m+2 < M) {t2 = A[dn][m+2];}
                if(m+3 < M) {t3 = A[dn][m+3];}
                if(m+4 < M) {t4 = A[dn][m+4];}
                if(m+5 < M) {t5 = A[dn][m+5];}
                if(m+6 < M) {t6 = A[dn][m+6];}
                if(m+7 < M) {t7 = A[dn][m+7];}

                //store them as column dn of B.  Each store uses exactly the
                //same guard as its load, so a temporary is never read
                //without having been set and every in-range column is
                //written (the t4 store must test m+4, not m+5, or the last
                //valid column of a partial block would be dropped).
                if(m < M)   {B[m][dn] = t0;}
                if(m+1 < M) {B[m+1][dn] = t1;}
                if(m+2 < M) {B[m+2][dn] = t2;}
                if(m+3 < M) {B[m+3][dn] = t3;}
                if(m+4 < M) {B[m+4][dn] = t4;}
                if(m+5 < M) {B[m+5][dn] = t5;}
                if(m+6 < M) {B[m+6][dn] = t6;}
                if(m+7 < M) {B[m+7][dn] = t7;}
            }
        }
    }