真好用...
把系数相乘看成 $(aM+b)(cM+d)=acM^2+(ad+bc)M+bd$ 的形式(其中 $M=\lfloor\sqrt{tt}\rfloor$,$tt$ 为模数,每个系数 $x$ 拆成 $x=aM+b$),这样FFT的精度就够用了...
#include<cmath>
#include<cstdio>
#include<algorithm>
// Shorthand for the 64-bit integer type used in the modular arithmetic.
#define LL long long
using namespace std;
// From here on every `double` in this file is really `long double`; the FFT
// relies on the wider mantissa to round convolution results correctly.
#define double long double
// Pi at full long double precision. The L suffix selects the long double
// overload of acos; `acos(-1)` would take an int, be evaluated in plain
// double, and leave pi accurate to only about 1e-16.
const double pi=acos(-1.0L);
// Capacity of every global array (must exceed the padded FFT length).
const int maxn=300004;
// n, m : polynomial degrees read in main; n is later reused as the FFT length
//        and m as the degree of the product.
// tt   : modulus.            M : split base, sqrt(tt) truncated to int.
// re[] : bit-reversal permutation used by FFT.
// ans[]: coefficients of the product, accumulated modulo tt.
int n,m,tt,M,re[maxn],ans[maxn];
// Minimal complex number: x is the real part, y the imaginary part.
// Only the operations the FFT needs (+, -, *) are provided.
struct jz{
    double x,y;
    jz(double real=0,double imag=0):x(real),y(imag){}
    // Component-wise sum.
    jz operator+(const jz &rhs)const{
        return jz(x+rhs.x,y+rhs.y);
    }
    // Component-wise difference.
    jz operator-(const jz &rhs)const{
        return jz(x-rhs.x,y-rhs.y);
    }
    // Complex product (x + y*i) * (rhs.x + rhs.y*i).
    jz operator*(const jz &rhs)const{
        return jz(x*rhs.x-y*rhs.y,x*rhs.y+y*rhs.x);
    }
}
// a/b: high (x/M) and low (x%M) parts of the first polynomial's coefficients,
// c/d: the same for the second polynomial, A: scratch buffer used by work().
a[maxn],b[maxn],c[maxn],d[maxn],A[maxn];
// In-place iterative radix-2 FFT: a bit-reversal permutation followed by
// butterfly passes over blocks of doubling length.
//   a : array to transform; entries 0..n-1 are used. The globals n (transform
//       length, a power of two) and re[] (bit-reversal table for 0..n-1) must
//       already be set up.
//   f : +1 for the forward transform, -1 for the inverse. The inverse is not
//       scaled here; the caller divides by n.
void FFT(jz a[],int f){
    // Reorder into bit-reversed order. Only indices below n belong to the
    // transform (index 0 always maps to itself); i<re[i] swaps each pair once.
    for (int i=1;i<n;i++)
        if (i<re[i]) swap(a[i],a[re[i]]);
    for (int half=1;half<n;half<<=1){
        // Root of unity for blocks of length 2*half: angle f*pi/half.
        const jz wn(cos(pi/half),sin(pi*f/half));
        for (int start=0;start<n;start+=(half<<1)){
            // w walks through the powers of wn, restarting at 1 for each block.
            jz w(1,0);
            for (int k=0;k<half;k++,w=w*wn){
                const jz x=a[start+k];
                const jz y=a[start+k+half]*w;
                a[start+k]=x+y;
                a[start+k+half]=x-y;
            }
        }
    }
}
// Accumulates one partial product into ans[]:
//   ans[i] = (ans[i] + w * round(IFFT(a .* b)[i])) mod tt   for 0 <= i < n.
//   a, b : two sequences already in the frequency domain (length n).
//   w    : weight applied to this partial product.
void work(jz a[],jz b[],int w){
    // Pointwise product goes into the scratch buffer so a and b stay reusable.
    for (int i=0;i<n;i++) A[i]=a[i]*b[i];
    FFT(A,-1);
    for (int i=0;i<n;i++){
        // The inverse FFT leaves a factor of n; adding 0.5 before truncating
        // rounds the real part to the nearest integer.
        const LL coef=static_cast<LL>(A[i].x/n+0.5);
        const LL term=coef%tt*w%tt;
        ans[i]=(ans[i]+term)%tt;
    }
}
int main(){
freopen("exam.in","r",stdin);
freopen("exam.out","w",stdout);
scanf("%d%d%d",&n,&m,&tt);M=sqrt(tt);
for (int i=0,x;i<=n;i++) scanf("%d",&x),x%=tt,b[i].x=x%M,a[i].x=x/M;
for (int i=0,x;i<=m;i++) scanf("%d",&x),x%=tt,d[i].x=x%M,c[i].x=x/M;
int l=0;for (m+=n,n=1;n<=m;n<<=1,l++);
for (int i=0;i<n;i++) re[i]=((re[i>>1]>>1)|((i&1)<<(l-1)));
FFT(a,1);FFT(b,1);FFT(c,1);FFT(d,1);
work(a,c,(LL)M*M%tt);work(b,d,1);
work(a,d,M%tt);work(b,c,M%tt);
for (int i=0;i<=m;i++) printf("%d ",ans[i]);
return 0;
}