理论上还有个perflab....
Part A
之前寒假的时候beginend说过cachelab很难,但是感觉做下来还行?也可能是他把malloclab记错成了cachelab也说不定(
不管了。Part A就是要按照书上cache memory的组织结构写一个简单的判断器,即给定若干读和写,来判断每次对内存的操作是/否命中缓存。事实上开一个三维数组就好了(如果不实现Blocks甚至只需要二维)
不过这一次还是学到很多东西的,这里罗列一下
getopt()
函数,这个函数来自unistd.h
或直接是getopt.h
,两者选哪一个取决于C的标准。这个函数实现了从argc, argv
中一个一个取出参数的功能,并且提供了[必选参数/可选参数/单独参数]三类参数的提取,很好用。如果有类似--debug
这样的参数可能要用上getopt_long()
之类的函数
strtok()
函数,这个最早见到是在PA lab里面。可以理解为split()
函数
sscanf()
和atoi()
,这个可以把字符串转数字。根据数字的进制选择用不用sscanf
。事实上还有类似的atof()
、itoa()
、sprintf()
这样的函数。多看官方文档~
calloc()
类似于malloc()
,区别在于calloc()
会初始化分配的内存为0
,常用于数组(回想一下数组的默认初始化)
个人觉得库函数还是很好用的,至少比自己写要精炼得多了。看来还是要多研究研究别人造过的轮子啊
难点大概就在于getopt()
和如何优雅地取出对应的位,还有就是对M
操作的处理。这些都不算太难,写就完了。注意LRU的策略指的是最后一次访问最早的先被删。
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <getopt.h>
#include <stdbool.h>
#include "cachelab.h"
/*
 * Option string handed to getopt(): -h and -v are plain flags, while
 * -s, -E, -b and -t each require an argument (single trailing colon).
 * A double colon would declare a GNU "optional argument", which makes
 * bundled flags such as "-vs 4" treat the "s" as the argument of -v.
 */
#define CMD_ARGS "hvs:E:b:t:"
/* Largest positive int; starting value when searching for the smallest timestamp. */
#define INF 0x7FFFFFFF
/* Shorthand for a 64-bit signed integer. */
typedef long long LL;
/* Bookkeeping for a single cache line (despite the type name, one entry is one line). */
typedef struct {
    int *block;  /* per-line buffer allocated in init(); never consulted for hit/miss decisions */
    int sign;    /* tag of the block currently held by this line */
    int last;    /* timestamp of the most recent access; smallest value is evicted first */
    bool used;   /* true once the line holds a block */
} Set;
/* Cache storage, indexed as Line[set][line-within-set]. */
Set **Line;
/* Number of set-index bits (-s). */
int s;
/* Number of lines per set (-E). */
int E;
/* Number of block-offset bits (-b). */
int b;
/* Number of sets, 2^s. */
int S;
/* Block size in bytes, 2^b. */
int B;
/* Running totals reported by printSummary(). */
int hit_cnt;
int miss_cnt;
int evict_cnt;
/* Trace file path. NOTE(review): main() declares its own local `filename`, so this one is not written there. */
char *filename;
/* Verbose flag (-v): when true, output() echoes per-access results. */
bool debug = false;
/* Write s to stdout, but only when verbose mode (debug) is switched on. */
void output(char *s) {
    if (!debug) {
        return;
    }
    printf("%s", s);
}
/* Record one cache hit: echo it in verbose mode and bump hit_cnt. */
void hit() {
    output("hit ");
    hit_cnt += 1;
}
/* Record one cache miss: echo it in verbose mode and bump miss_cnt. */
void miss() {
    output("miss ");
    miss_cnt += 1;
}
/* Record one eviction: echo it in verbose mode and bump evict_cnt. */
void evict() {
    output("eviction ");
    evict_cnt += 1;
}
/*
 * Open `filename` for reading and return the stream.
 * If the file cannot be opened, print an error line and terminate the
 * program with exit status -1; the function never returns NULL.
 */
FILE *openFile(char *filename) {
    FILE *stream = fopen(filename, "r");
    if (stream != NULL) {
        return stream;
    }
    puts("Error: file not found");
    exit(-1);
}
void update(int cur_time, int set_no, int block_offset, int sign, int wsiz) {
int rec = -1, rec_last = INF;
for (int i = 0; i < E; ++ i) {
Set *tmp_line = &Line[set_no][i];
if (!tmp_line->used) {
rec = i;
break;
}
if (tmp_line->last < rec_last) {
rec_last = tmp_line->last;
rec = i;
}
}
Line[set_no][rec].last = cur_time;
Line[set_no][rec].sign = sign;
if (Line[set_no][rec].used) {
evict();
} else {
Line[set_no][rec].used = true;
}
}
bool load(int cur_time, int set_no, int block_offset, int sign, int wsiz) {
for (int i = 0; i < E; ++ i) {
Set *tmp_line = &Line[set_no][i];
if (tmp_line->used && tmp_line->sign == sign) {
tmp_line->last = cur_time;
hit();
return true;
}
}
miss();
update(cur_time, set_no, block_offset, sign, wsiz);
return false;
}
/*
 * Replay a memory trace from `fin` against the simulated cache, then report
 * the totals through printSummary().
 *
 * Each record is expected to look like "<op> <hex-address>,<size>", with
 * optional leading blanks. Only data accesses (L, S, M) are simulated;
 * instruction fetches (I) and lines that do not parse are skipped.
 * The stream is read to EOF but not closed; it remains owned by the caller.
 */
void process(FILE *fin) {
    char line[64];
    int cur_time = 0;

    /* Loop on fgets() itself so a final line without '\n' is still handled. */
    while (fgets(line, sizeof line, fin) != NULL) {
        char op;
        unsigned long long addr;
        int size;

        /* One sscanf replaces the strtok/atoi chain and rejects short or blank lines. */
        if (sscanf(line, " %c %llx,%d", &op, &addr, &size) != 3) {
            continue;
        }
        if (op != 'L' && op != 'S' && op != 'M') {
            continue;
        }

        ++cur_time;
        if (debug) printf("%c %llx,%d ", op, addr, size);

        /* Split the address into block offset, set index and tag. */
        int block_offset = (int)(addr & (B - 1));
        int set_no = (int)((addr >> b) & (S - 1));
        /* NOTE(review): the tag is narrowed to int because load() and Set store it as int. */
        int sign = (int)(addr >> (s + b));

        load(cur_time, set_no, block_offset, sign, size);
        if (op == 'M') {
            /* A modify is a load followed by a store, so it touches the cache twice. */
            load(cur_time, set_no, block_offset, sign, size);
        }
        output("\n");
    }

    printSummary(hit_cnt, miss_cnt, evict_cnt);
}
/* Zero-initialised allocation that terminates the program instead of returning NULL. */
static void *alloc_zeroed(size_t count, size_t size) {
    void *mem = calloc(count, size);
    if (mem == NULL) {
        fprintf(stderr, "Error: out of memory\n");
        exit(-1);
    }
    return mem;
}

/*
 * Build the cache: S sets of E lines each, stored in the global Line.
 * Every line starts out unused with zeroed tag and timestamp.
 * Exits with status -1 if S or E is not positive or if memory runs out.
 */
void init() {
    if (S <= 0 || E <= 0) {
        fprintf(stderr, "Error: invalid cache geometry\n");
        exit(-1);
    }

    Line = alloc_zeroed((size_t)S, sizeof *Line);
    for (int i = 0; i < S; ++i) {
        /* calloc-style zeroing leaves used == false, sign == 0 and last == 0. */
        Line[i] = alloc_zeroed((size_t)E, sizeof *Line[i]);
        for (int j = 0; j < E; ++j) {
            Line[i][j].used = false;
            /* Only allocate the per-line buffer when the block size is positive. */
            Line[i][j].block = (B > 0)
                ? alloc_zeroed((size_t)B, sizeof *Line[i][j].block)
                : NULL;
        }
    }
}
int main(int argc, char *const *argv) {
char *filename;
for (int opt; ~(opt = getopt(argc, argv, CMD_ARGS)); ) {
switch (opt) {
case 's': {
s = atoi(optarg);
S = 1 << s;
break;
}
case 'E': {
E = atoi(optarg);
break;
}
case 'b': {
b = atoi(optarg);
B = 1 << b;
break;
}
case 't': {
filename = optarg;
break;
}
case 'v': {
debug = true;
}
}
}
init();
process(openFile(filename));
return 0;
}
Part B
写死我了...
一个最naive的优化就是视频中说的blocking,通过恰当分块就可以实现高效利用cache
对于32x32的问题,答案很简单就是分成8x8的块,61x67的也类似,难的在于64x64
难点在于:一次访存会加载连续8个元素,而访问行数大于4时就会出现thrashing,因此用8x8的block会thrashing,用4x4的block则利用不充分
最后是看了别人的解析才会做的,具体可以看这篇https://www.cnblogs.com/liqiuhao/p/8026100.html
大概意思就是把8x8分成四个4x4,每次用4x8的方式移动,这样是坠吼的
/*
 * transpose_submit - This is the solution transpose function that you
 *     will be graded on for Part B of the assignment. Do not change
 *     the description string "Transpose submission", as the driver
 *     searches for that string to identify the transpose function to
 *     be graded.
 *
 * Writes the transpose of the N x M matrix A into the M x N matrix B
 * (B[j][i] = A[i][j]). Three strategies are used:
 *   - 64 columns, rows a multiple of 8: 8x8 tiles handled as four 4x4
 *     quadrants, moved four rows at a time.
 *   - 32 columns, rows a multiple of 8: plain 8x8 tiles, one row of A
 *     buffered in locals before being scattered into B.
 *   - any other shape: bounds-checked 13x8 tiles (the strategy used for 61x67).
 */
char transpose_submit_desc[] = "Transpose submission";
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    int t0, t1, t2, t3, t4, t5, t6, t7;

    if (M == 64 && N % 8 == 0) {
        for (int si = 0; si < N; si += 8) {
            for (int sj = 0; sj < M; sj += 8) {
                /*
                 * Phase 1: top four rows of the A tile. The left half lands
                 * transposed in B's top-left quadrant; the right half is
                 * parked in B's top-right quadrant with its rows reversed.
                 */
                for (int i = si; i < si + 4; ++i) {
                    t0 = A[i][sj];
                    t1 = A[i][sj + 1];
                    t2 = A[i][sj + 2];
                    t3 = A[i][sj + 3];
                    t4 = A[i][sj + 4];
                    t5 = A[i][sj + 5];
                    t6 = A[i][sj + 6];
                    t7 = A[i][sj + 7];
                    B[sj][i] = t0;
                    B[sj + 1][i] = t1;
                    B[sj + 2][i] = t2;
                    B[sj + 3][i] = t3;
                    B[sj][i + 4] = t7;
                    B[sj + 1][i + 4] = t6;
                    B[sj + 2][i + 4] = t5;
                    B[sj + 3][i + 4] = t4;
                }
                /*
                 * Phase 2: for each j, read one column from each bottom
                 * quadrant of A, move the parked row of B down to its final
                 * place in the bottom-left quadrant, then fill the freed
                 * top-right row and the bottom-right row.
                 */
                for (int j = 0; j < 4; ++j) {
                    t0 = A[si + 4][sj + j + 4];
                    t1 = A[si + 5][sj + j + 4];
                    t2 = A[si + 6][sj + j + 4];
                    t3 = A[si + 7][sj + j + 4];
                    t4 = A[si + 4][sj + 3 - j];
                    t5 = A[si + 5][sj + 3 - j];
                    t6 = A[si + 6][sj + 3 - j];
                    t7 = A[si + 7][sj + 3 - j];
                    B[sj + j + 4][si] = B[sj + 3 - j][si + 4];
                    B[sj + j + 4][si + 1] = B[sj + 3 - j][si + 5];
                    B[sj + j + 4][si + 2] = B[sj + 3 - j][si + 6];
                    B[sj + j + 4][si + 3] = B[sj + 3 - j][si + 7];
                    B[sj + 3 - j][si + 4] = t4;
                    B[sj + 3 - j][si + 5] = t5;
                    B[sj + 3 - j][si + 6] = t6;
                    B[sj + 3 - j][si + 7] = t7;
                    B[sj + j + 4][si + 4] = t0;
                    B[sj + j + 4][si + 5] = t1;
                    B[sj + j + 4][si + 6] = t2;
                    B[sj + j + 4][si + 7] = t3;
                }
            }
        }
    } else if (M == 32 && N % 8 == 0) {
        for (int si = 0; si < N; si += 8) {
            for (int sj = 0; sj < M; sj += 8) {
                for (int i = si; i < si + 8; ++i) {
                    /* Read the whole row segment first, then write it out as a column. */
                    t0 = A[i][sj];
                    t1 = A[i][sj + 1];
                    t2 = A[i][sj + 2];
                    t3 = A[i][sj + 3];
                    t4 = A[i][sj + 4];
                    t5 = A[i][sj + 5];
                    t6 = A[i][sj + 6];
                    t7 = A[i][sj + 7];
                    B[sj][i] = t0;
                    B[sj + 1][i] = t1;
                    B[sj + 2][i] = t2;
                    B[sj + 3][i] = t3;
                    B[sj + 4][i] = t4;
                    B[sj + 5][i] = t5;
                    B[sj + 6][i] = t6;
                    B[sj + 7][i] = t7;
                }
            }
        }
    } else {
        /* General case (includes 61x67): 13-row by 8-column tiles, clipped at the edges. */
        for (int si = 0; si < N; si += 13) {
            for (int sj = 0; sj < M; sj += 8) {
                for (int i = si; i < si + 13 && i < N; ++i) {
                    for (int j = sj; j < sj + 8 && j < M; ++j) {
                        t0 = A[i][j];
                        B[j][i] = t0;
                    }
                }
            }
        }
    }
}