strlen 的实现是每次读取 4 个字节,然后通过位运算判断这 4 个字节中是否存在值为 0 的字节;这样循环次数降为逐字节扫描的 1/4,效率理论上就提高到了 4 倍。
这个效率提升成立的前提是:a&b&c&d 与 a&b 的执行效率差不多。
那么改成每次偏移 8 个字节的话,是不是更快呢?在 32 位机上不会,在 64 位机上会再提高一倍。因为 32 位机的寄存器大小是 32 位的,对于一个 64 位的数,需要对高 32 位与低 32 位分别 MOV 两次;而在 64 位机上,一次 a&b 处理的数据量提高了一倍。
本来是想实验 a&b&c&d 与 a&b 的速度差异的。经实验验证,这两者的效率确实差不多;但是去看汇编、比较指令条数时发现,在没有使用 -O 优化的情况下,两者指令条数的比例与运算符个数的比例相同,这就让我感到疑惑了。
下面附上实验的代码:
#include <iostream> #include <time.h> #include <cstdio> #include <string> using namespace std; int _strlen(const char *str) { const unsigned int *p = (const unsigned int *) str; unsigned int low = 0x01010101; unsigned int high = 0x80808080; while (true) { unsigned int d = *p++; if (((d - low) & ~d & high) != 0) { // handle [0...256) //if (((d - low) & high) != 0) { // handle [0...128) break; } } const char *q = (const char *)(p - 1); for (int i = 0; i < (int)sizeof(unsigned int); i++) { if (q[i] == 0) { return q - str + i; } } return -1; } int _strlen2(const char *str) { const char *p = str; while (*p != 0) { p++; } return p - str; } int _strlen3(const char *str) { const unsigned long long *p = (const unsigned long long *) str; unsigned long long low = 0x0101010101010101; unsigned long long high = 0x8080808080808080; while (true) { unsigned long long d = *p++; if (((d - low) & ~d & high) != 0) { // handle [0...256) //if (((d - low) & high) != 0) { // handle [0...128) break; } } const char *q = (const char *)(p - 1); for (int i = 0; i < (int)sizeof(unsigned long long); i++) { if (q[i] == 0) { return q - str + i; } } return -1; } size_t _strlen4(const char *str) { const char *char_ptr; const unsigned long int *longword_ptr; unsigned long int longword, himagic, lomagic; /* Handle the first few characters by reading one character at a time. Do this until CHAR_PTR is aligned on a longword boundary. */ for (char_ptr = str; ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0; ++char_ptr) if (*char_ptr == '