整除分块十分naive,但是卡常之后就不清真了。
常数优化也是一门技术啊!
如果需要计算$$sum_{i=1}^nlfloor frac{n}{i}
floor$$
有一个naive的做法就是
for (long long i=1,la; i<=n; i=la+1){
la=n/(n/i);
ans+=(n/i)*(la-i+1);
}
但是,这样不仅根号有2的常数,瓶颈上还有3次除法(可优化至2次),如果n是一个较大的数,跑起来很man。
今天突然看到了松1自己的提交,于是兴冲冲地又复习了一下优越的算法。
首先推式子
要求$$sum_{i=1}^n sum_{j=1}^n [ij <= n] $$
可拆为$$ sum_{i=1}^{ lfloor sqrt {n}
floor} sum_{j=1}^n [ij leq n] +sum_{i= lfloor sqrt{n}
floor +1}^n sum_{j=1}^n [ij leq n] $$
变换边界条件$$ sum_{i=1}^{ lfloor sqrt {n}
floor} sum_{j=1}^n [ij leq n] +sum_{i= lfloor sqrt{n}
floor +1}^n sum_{j=1}^{lfloor sqrt{n}
floor} [i*j leq n] $$
在把前后两项变得一样
[sum_{i=1}^{ lfloor sqrt {n}
floor} sum_{j=1}^n [i*j leq n] +sum_{i=1}^n sum_{j=1}^{lfloor sqrt{n}
floor} [i*j leq n]-sum_{i=1}^{ lfloor sqrt {n}
floor} sum_{j=1}^{lfloor sqrt {n}
floor}[i*j leq n]
]
合并一下
[2* sum_{i=1}^{lfloor sqrt {n}
floor} sum_{j=1}^n [i*j leq n] - ( lfloor sqrt{n}
floor)^2
]
换一种表示
[2* sum_{i=1}^{lfloor sqrt{n}
floor} lfloor frac{n}{i}
floor -(lfloor sqrt{n}
floor) ^2
]
就可以快速计算啦!
%:pragma GCC target("avx")
%:pragma GCC optimize(3)
%:pragma GCC optimize("Ofast")
%:pragma GCC optimize("inline")
%:pragma GCC optimize("-fgcse")
%:pragma GCC optimize("-fgcse-lm")
%:pragma GCC optimize("-fipa-sra")
%:pragma GCC optimize("-ftree-pre")
%:pragma GCC optimize("-ftree-vrp")
%:pragma GCC optimize("-fpeephole2")
%:pragma GCC optimize("-ffast-math")
%:pragma GCC optimize("-fsched-spec")
%:pragma GCC optimize("unroll-loops")
%:pragma GCC optimize("-falign-jumps")
%:pragma GCC optimize("-falign-loops")
%:pragma GCC optimize("-falign-labels")
%:pragma GCC optimize("-fdevirtualize")
%:pragma GCC optimize("-fcaller-saves")
%:pragma GCC optimize("-fcrossjumping")
%:pragma GCC optimize("-fthread-jumps")
%:pragma GCC optimize("-funroll-loops")
%:pragma GCC optimize("-fwhole-program")
%:pragma GCC optimize("-freorder-blocks")
%:pragma GCC optimize("-fschedule-insns")
%:pragma GCC optimize("inline-functions")
%:pragma GCC optimize("-ftree-tail-merge")
%:pragma GCC optimize("-fschedule-insns2")
%:pragma GCC optimize("-fstrict-aliasing")
%:pragma GCC optimize("-fstrict-overflow")
%:pragma GCC optimize("-falign-functions")
%:pragma GCC optimize("-fcse-skip-blocks")
%:pragma GCC optimize("-fcse-follow-jumps")
%:pragma GCC optimize("-fsched-interblock")
%:pragma GCC optimize("-fpartial-inlining")
%:pragma GCC optimize("no-stack-protector")
%:pragma GCC optimize("-freorder-functions")
%:pragma GCC optimize("-findirect-inlining")
%:pragma GCC optimize("-frerun-cse-after-loop")
%:pragma GCC optimize("inline-small-functions")
%:pragma GCC optimize("-finline-small-functions")
%:pragma GCC optimize("-ftree-switch-conversion")
%:pragma GCC optimize("-foptimize-sibling-calls")
%:pragma GCC optimize("-fexpensive-optimizations")
%:pragma GCC optimize("-funsafe-loop-optimizations")
%:pragma GCC optimize("inline-functions-called-once")
%:pragma GCC optimize("-fdelete-null-pointer-checks")
#include <iostream>
#include <cmath>
using namespace std;
typedef unsigned long long ll;
int main(){
ll n; cin>>n;
ll ans=0;
ll p=sqrt(n);
for (ll i=p; i; --i) ans+=n/i;
ans=ans*2-p*p;
cout<<ans<<endl;
}
还不够快?
利用$$ lfloor frac{n}{2i}
floor =lfloor frac{lfloor frac{n}{i}
floor}{2}
floor $$
可以优化
%:pragma GCC target("avx")
%:pragma GCC optimize(3)
%:pragma GCC optimize("Ofast")
%:pragma GCC optimize("inline")
%:pragma GCC optimize("-fgcse")
%:pragma GCC optimize("-fgcse-lm")
%:pragma GCC optimize("-fipa-sra")
%:pragma GCC optimize("-ftree-pre")
%:pragma GCC optimize("-ftree-vrp")
%:pragma GCC optimize("-fpeephole2")
%:pragma GCC optimize("-ffast-math")
%:pragma GCC optimize("-fsched-spec")
%:pragma GCC optimize("unroll-loops")
%:pragma GCC optimize("-falign-jumps")
%:pragma GCC optimize("-falign-loops")
%:pragma GCC optimize("-falign-labels")
%:pragma GCC optimize("-fdevirtualize")
%:pragma GCC optimize("-fcaller-saves")
%:pragma GCC optimize("-fcrossjumping")
%:pragma GCC optimize("-fthread-jumps")
%:pragma GCC optimize("-funroll-loops")
%:pragma GCC optimize("-fwhole-program")
%:pragma GCC optimize("-freorder-blocks")
%:pragma GCC optimize("-fschedule-insns")
%:pragma GCC optimize("inline-functions")
%:pragma GCC optimize("-ftree-tail-merge")
%:pragma GCC optimize("-fschedule-insns2")
%:pragma GCC optimize("-fstrict-aliasing")
%:pragma GCC optimize("-fstrict-overflow")
%:pragma GCC optimize("-falign-functions")
%:pragma GCC optimize("-fcse-skip-blocks")
%:pragma GCC optimize("-fcse-follow-jumps")
%:pragma GCC optimize("-fsched-interblock")
%:pragma GCC optimize("-fpartial-inlining")
%:pragma GCC optimize("no-stack-protector")
%:pragma GCC optimize("-freorder-functions")
%:pragma GCC optimize("-findirect-inlining")
%:pragma GCC optimize("-frerun-cse-after-loop")
%:pragma GCC optimize("inline-small-functions")
%:pragma GCC optimize("-finline-small-functions")
%:pragma GCC optimize("-ftree-switch-conversion")
%:pragma GCC optimize("-foptimize-sibling-calls")
%:pragma GCC optimize("-fexpensive-optimizations")
%:pragma GCC optimize("-funsafe-loop-optimizations")
%:pragma GCC optimize("inline-functions-called-once")
%:pragma GCC optimize("-fdelete-null-pointer-checks")
%:pragma GCC target("sse2,sse3,ssse3,sse4")
#include <iostream>
#include <cmath>
using namespace std;
typedef long long ll;
int main(){
ll n; cin>>n;
ll ans=0;
ll p=sqrt(n),z=n/p;
for (ll i=1; i<=p; i+=2){
ll t=n/i;
while (t>=z){
ans+=t;
t>>=1;
}
}
ans=ans*2-p*p;
cout<<ans;
}
还不够快,减少一次判断?
#pragma GCC target("avx")
#pragma GCC optimize(3)
#pragma GCC optimize("Ofast")
#pragma GCC optimize("inline")
#pragma GCC optimize("-fgcse")
#pragma GCC optimize("-fgcse-lm")
#pragma GCC optimize("-fipa-sra")
#pragma GCC optimize("-ftree-pre")
#pragma GCC optimize("-ftree-vrp")
#pragma GCC optimize("-fpeephole2")
#pragma GCC optimize("-ffast-math")
#pragma GCC optimize("-fsched-spec")
#pragma GCC optimize("unroll-loops")
#pragma GCC optimize("-falign-jumps")
#pragma GCC optimize("-falign-loops")
#pragma GCC optimize("-falign-labels")
#pragma GCC optimize("-fdevirtualize")
#pragma GCC optimize("-fcaller-saves")
#pragma GCC optimize("-fcrossjumping")
#pragma GCC optimize("-fthread-jumps")
#pragma GCC optimize("-funroll-loops")
#pragma GCC optimize("-fwhole-program")
#pragma GCC optimize("-freorder-blocks")
#pragma GCC optimize("-fschedule-insns")
#pragma GCC optimize("inline-functions")
#pragma GCC optimize("-ftree-tail-merge")
#pragma GCC optimize("-fschedule-insns2")
#pragma GCC optimize("-fstrict-aliasing")
#pragma GCC optimize("-fstrict-overflow")
#pragma GCC optimize("-falign-functions")
#pragma GCC optimize("-fcse-skip-blocks")
#pragma GCC optimize("-fcse-follow-jumps")
#pragma GCC optimize("-fsched-interblock")
#pragma GCC optimize("-fpartial-inlining")
#pragma GCC optimize("no-stack-protector")
#pragma GCC optimize("-freorder-functions")
#pragma GCC optimize("-findirect-inlining")
#pragma GCC optimize("-frerun-cse-after-loop")
#pragma GCC optimize("inline-small-functions")
#pragma GCC optimize("-finline-small-functions")
#pragma GCC optimize("-ftree-switch-conversion")
#pragma GCC optimize("-foptimize-sibling-calls")
#pragma GCC optimize("-fexpensive-optimizations")
#pragma GCC optimize("-funsafe-loop-optimizations")
#pragma GCC optimize("inline-functions-called-once")
#pragma GCC optimize("-fdelete-null-pointer-checks")
#pragma GCC target("sse2,sse3,ssse3,sse4")
#include <iostream>
#include <cmath>
using namespace std;
typedef long long ll;
int main(){
ll n; cin>>n;
ll ans=0;
ll p=sqrt(n),z=n/p;
for (ll i=1,t=n; i<=p; t=n/(i+=2))
do{
ans+=t;
}while ((t>>=1)>=z);
ans=ans*2-p*p;
cout<<ans;
}
显然这还是没有到极致,不过我觉得已经挺快了。
1e16在机房的普通台式机上只需0.5s
还不够快?优化除法次数吧!
#pragma GCC target("avx")
#pragma GCC optimize(3)
#pragma GCC optimize("Ofast")
#pragma GCC optimize("inline")
#pragma GCC optimize("-fgcse")
#pragma GCC optimize("-fgcse-lm")
#pragma GCC optimize("-fipa-sra")
#pragma GCC optimize("-ftree-pre")
#pragma GCC optimize("-ftree-vrp")
#pragma GCC optimize("-fpeephole2")
#pragma GCC optimize("-ffast-math")
#pragma GCC optimize("-fsched-spec")
#pragma GCC optimize("unroll-loops")
#pragma GCC optimize("-falign-jumps")
#pragma GCC optimize("-falign-loops")
#pragma GCC optimize("-falign-labels")
#pragma GCC optimize("-fdevirtualize")
#pragma GCC optimize("-fcaller-saves")
#pragma GCC optimize("-fcrossjumping")
#pragma GCC optimize("-fthread-jumps")
#pragma GCC optimize("-funroll-loops")
#pragma GCC optimize("-fwhole-program")
#pragma GCC optimize("-freorder-blocks")
#pragma GCC optimize("-fschedule-insns")
#pragma GCC optimize("inline-functions")
#pragma GCC optimize("-ftree-tail-merge")
#pragma GCC optimize("-fschedule-insns2")
#pragma GCC optimize("-fstrict-aliasing")
#pragma GCC optimize("-fstrict-overflow")
#pragma GCC optimize("-falign-functions")
#pragma GCC optimize("-fcse-skip-blocks")
#pragma GCC optimize("-fcse-follow-jumps")
#pragma GCC optimize("-fsched-interblock")
#pragma GCC optimize("-fpartial-inlining")
#pragma GCC optimize("no-stack-protector")
#pragma GCC optimize("-freorder-functions")
#pragma GCC optimize("-findirect-inlining")
#pragma GCC optimize("-frerun-cse-after-loop")
#pragma GCC optimize("inline-small-functions")
#pragma GCC optimize("-finline-small-functions")
#pragma GCC optimize("-ftree-switch-conversion")
#pragma GCC optimize("-foptimize-sibling-calls")
#pragma GCC optimize("-fexpensive-optimizations")
#pragma GCC optimize("-funsafe-loop-optimizations")
#pragma GCC optimize("inline-functions-called-once")
#pragma GCC optimize("-fdelete-null-pointer-checks")
#pragma GCC target("sse2,sse3,ssse3,sse4")
#include <iostream>
#include <cmath>
using namespace std;
typedef long long ll;
#define C 13
int main(){
ll n; cin>>n;
ll ans=0;
ll p=sqrt(n),z=n/p;
for (ll i=1,t=n,la=n+C; i<=p;){
ll tmp=t;
do{
ans+=tmp;
}while ((tmp>>=1)>=z);
if (la-t<C){
la=t;
i+=2;
ll g=i*(--t);
while (g>n){
--t;
g-=i;
}
}
else{
la=t;
t=n/(i+=2);
}
}
ans=ans*2-p*p;
cout<<ans;
}
现在1e16只需0.38s左右
这东西貌似有一个(log)做法,先咕着。