memcpy?memmove?
//#pragma GCC optimize(2)
// Benchmark: five ways of copying n ints from a[1..n] into a fresh array.
//   1) memcpy   2) memmove   3) plain element loop
//   4) loop unrolled by 4    5) loop unrolled by 8
// Each result line is "timeK:<clock ticks> <memcmp result>"; the memcmp result
// is 0 when the copy matches the source.
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>

const int n = 50000000;
// Index 0 is unused; the data lives in [1, n].
int a[n + 10], b[n + 10], c[n + 10], d[n + 10], e[n + 10], f[n + 10];

// Prints one result line: the label, the elapsed clock() ticks, and the result
// of comparing dst[1..n] against a[1..n].
static void report(const char* label, std::clock_t ticks, const int* dst)
{
    std::cout << label << ticks << ' '
              << std::memcmp(a + 1, dst + 1, sizeof(int) * n) << '\n';
}

int main()
{
    int i;

    // Fixed seed so every run copies the same data.
    std::srand(2395);
    for (i = 1; i <= n; i++)
        a[i] = std::rand();

    // 1) memcpy
    const std::clock_t st1 = std::clock();
    std::memcpy(b + 1, a + 1, sizeof(int) * n);
    const std::clock_t ed1 = std::clock();

    // 2) memmove
    const std::clock_t st2 = std::clock();
    std::memmove(c + 1, a + 1, sizeof(int) * n);
    const std::clock_t ed2 = std::clock();

    // 3) one assignment per iteration
    const std::clock_t st3 = std::clock();
    for (i = 1; i <= n; i++)
        d[i] = a[i];
    const std::clock_t ed3 = std::clock();

    // 4) four assignments per iteration, then the (at most 3) leftover elements
    const std::clock_t st4 = std::clock();
    for (i = 1; i <= n - 3; i += 4)
    {
        e[i] = a[i];
        e[i + 1] = a[i + 1];
        e[i + 2] = a[i + 2];
        e[i + 3] = a[i + 3];
    }
    for (; i <= n; i++)
        e[i] = a[i];
    const std::clock_t ed4 = std::clock();

    // 5) eight assignments per iteration, then the (at most 7) leftover elements
    const std::clock_t st5 = std::clock();
    for (i = 1; i <= n - 7; i += 8)
    {
        f[i] = a[i];
        f[i + 1] = a[i + 1];
        f[i + 2] = a[i + 2];
        f[i + 3] = a[i + 3];
        f[i + 4] = a[i + 4];
        f[i + 5] = a[i + 5];
        f[i + 6] = a[i + 6];
        f[i + 7] = a[i + 7];
    }
    for (; i <= n; i++)
        f[i] = a[i];
    const std::clock_t ed5 = std::clock();

    // Reporting happens after all timed sections so I/O never lands inside one.
    report("time1:", ed1 - st1, b);
    report("time2:", ed2 - st2, c);
    report("time3:", ed3 - st3, d);
    report("time4:", ed4 - st4, e);
    report("time5:", ed5 - st5, f);
    return 0;
}
不开优化:
time1:139254 0
time2:198093 0
time3:601853 0
time4:588247 0
time5:598584 0
O2:
time1:138256 0
time2:139235 0
time3:426570 0
time4:322532 0
time5:301933 0
Ofast:
time1:137893 0
time2:140585 0
time3:422154 0
time4:309306 0
time5:298620 0
很显然在大数据(n=50000000)下memcpy最快
另外,在小数据(比如n=26)下,测试得到明显直接赋值(time3)最快
在较小数据(比如n=1000)下,测试得到memmove最快?
快速乘
测试对比程序:

// Stress test for "fast modular multiplication" via a long double quotient
// estimate. Each mulK(x, y) is meant to return x*y mod md for non-negative
// 64-bit operands; mul0 is the exact __int128 reference they are checked against.
#include <cstdio>
#include <cstdlib>
#include <iostream>

typedef long long ll;
typedef unsigned long long ull;

// Builds a 64-bit value from two rand() draws: the first fills the low bits,
// the second is shifted up by 32. The draws are made in separate statements so
// the generated sequence does not depend on the compiler's evaluation order.
// NOTE(review): how many bits are actually random depends on RAND_MAX, which is
// implementation-defined.
ll rd()
{
    const ll lo = std::rand();
    const ll hi = std::rand();
    return lo | (hi << 32);
}

// Current modulus; set by main before every comparison.
ll md;

// Computes x*y - q*md modulo 2^64 and folds a negative result back by adding
// md once. This yields the true remainder when q is the exact quotient
// floor(x*y/md) or one above it. The arithmetic is done in unsigned long long
// because the intermediate values wrap around on purpose, and signed overflow
// would be undefined behaviour. Converting the wrapped value back to ll is
// modular on GCC/Clang (and by the standard since C++20).
static ll reduce_with_quotient(ll x, ll y, ll q)
{
    const ll t = ll(ull(x) * ull(y) - ull(q) * ull(md));
    return t < 0 ? t + md : t;
}

// Quotient estimated as (x/md)*y, rounded to nearest.
ll mul1(ll x, ll y)
{
    x %= md;
    y %= md;
    return reduce_with_quotient(x, y, ll((long double)x / md * y + 0.5));
}

// Quotient estimated as (x*y)/md, rounded to nearest.
ll mul2(ll x, ll y)
{
    x %= md;
    y %= md;
    return reduce_with_quotient(x, y, ll((long double)x * y / md + 0.5));
}

// Quotient estimated as (x/md)*y, nudged up by 1e-8 and truncated.
ll mul3(ll x, ll y)
{
    x %= md;
    y %= md;
    return reduce_with_quotient(x, y, ll((long double)x / md * y + 1e-8));
}

// Reference implementation: exact product in 128 bits, then reduce.
ll mul0(ll x, ll y)
{
    return __int128(x) * y % md;
}

int main()
{
    long long T = 0; // test-case counter; 64-bit so long runs cannot overflow it
    std::srand(3254244);
    // Runs until interrupted; on the first mismatch it prints the case number
    // and waits for input.
    for (;;)
    {
        T++;
        const ll a = rd(), b = rd();
        // A zero modulus would make every % md a division by zero; redraw.
        // To cap the modulus, reduce the draw, e.g. md = rd() % ll(1e18).
        do
        {
            md = rd();
        } while (md == 0);
        //std::cout<<a<<' '<<b<<' '<<md<<' ';
        const ll t1 = mul1(a, b), t2 = mul0(a, b); // mul1 can be swapped for mul2/mul3
        //std::cout<<t1<<' '<<t2<<' ';
        if (t1 != t2)
        {
            std::printf("%lld ", T);
            std::puts("test");
            // Block on input so the failing case number stays on screen.
            int t;
            std::cin >> t;
        }
        //int t;std::cin>>t;
    }
}
经过一些测试,可以发现,mul3效果最差(在模数>=1e17时,100000组以内对拍就会出错);原因应该是1e-8这个修正量不够:当商达到1e17量级时,long double(x86上尾数为64位)相邻两个可表示值的间隔约为1e-2,加上去的1e-8会被直接舍入掉
mul2效果没有mul1好(模数不设额外上限时,100000组以内出锅;上限1e18时,20秒不出锅)
mul1效果最好(模数不设额外上限时,20秒不出锅)
原因就不知道了。。。