zoukankan      html  css  js  c++  java
  • NanoVG 优化笔记:性能提高5倍的秘密

    NanoVG 优化笔记

    nanovg正如其名称所示的那样,是一个非常小巧的矢量绘图函数库。相比cairo和skia的数十万行代码,nanovg不足5000行的C语言代码,称为nano也是名副其实了。nanovg的设计、接口和代码质量都堪称典范,唯一美中不足的就是性能不太理想。特别是在Android的低端机型和大屏幕的机型上,一个简单的界面每秒只能画十几帧。最近我把AWTK移植到Android上时,就碰到了这个尴尬的问题。

    经过优化之后,AWTK在低端机型上,整体渲染性能有了3到5倍的提升。这里做个笔记,供有需要的朋友参考。

    nanovg的性能瓶颈在于片段着色器(fragment shader)。片段着色器可以认为是为GPU提供的一个回调函数,该回调函数在处理每个像素时被调用,在每一帧绘制时都会执行数百万次,可见该函数对性能的影响是很大的。

    我们先看看nanovg的片段着色器(fragment shader)代码:

    	/*
    	 * GLSL source of NanoVG's fill fragment shader, stored as one C string
    	 * built from adjacent string literals.
    	 *
    	 * Every literal ends with an explicit "\n" so that each GLSL preprocessor
    	 * directive (#ifdef / #else / #endif / #define) sits on its own line of
    	 * the shader source. The only exceptions are the two `texType` lines in
    	 * the "Image" and "Textured tris" branches, which are intentionally
    	 * joined to the statement that follows them.
    	 *
    	 * The `type` value selects the branch taken in main():
    	 *   0 - gradient, 1 - image, 2 - stencil fill, 3 - textured triangles.
    	 */
    	static const char* fillFragShader =
    		"#ifdef GL_ES\n"
    		"#if defined(GL_FRAGMENT_PRECISION_HIGH) || defined(NANOVG_GL3)\n"
    		" precision highp float;\n"
    		"#else\n"
    		" precision mediump float;\n"
    		"#endif\n"
    		"#endif\n"
    		"#ifdef NANOVG_GL3\n"
    		"#ifdef USE_UNIFORMBUFFER\n"
    		"	layout(std140) uniform frag {\n"
    		"		mat3 scissorMat;\n"
    		"		mat3 paintMat;\n"
    		"		vec4 innerCol;\n"
    		"		vec4 outerCol;\n"
    		"		vec2 scissorExt;\n"
    		"		vec2 scissorScale;\n"
    		"		vec2 extent;\n"
    		"		float radius;\n"
    		"		float feather;\n"
    		"		float strokeMult;\n"
    		"		float strokeThr;\n"
    		"		int texType;\n"
    		"		int type;\n"
    		"	};\n"
    		"#else\n" // NANOVG_GL3 && !USE_UNIFORMBUFFER
    		"	uniform vec4 frag[UNIFORMARRAY_SIZE];\n"
    		"#endif\n"
    		"	uniform sampler2D tex;\n"
    		"	in vec2 ftcoord;\n"
    		"	in vec2 fpos;\n"
    		"	out vec4 outColor;\n"
    		"#else\n" // !NANOVG_GL3
    		"	uniform vec4 frag[UNIFORMARRAY_SIZE];\n"
    		"	uniform sampler2D tex;\n"
    		"	varying vec2 ftcoord;\n"
    		"	varying vec2 fpos;\n"
    		"#endif\n"
    		"#ifndef USE_UNIFORMBUFFER\n"
    		"	#define scissorMat mat3(frag[0].xyz, frag[1].xyz, frag[2].xyz)\n"
    		"	#define paintMat mat3(frag[3].xyz, frag[4].xyz, frag[5].xyz)\n"
    		"	#define innerCol frag[6]\n"
    		"	#define outerCol frag[7]\n"
    		"	#define scissorExt frag[8].xy\n"
    		"	#define scissorScale frag[8].zw\n"
    		"	#define extent frag[9].xy\n"
    		"	#define radius frag[9].z\n"
    		"	#define feather frag[9].w\n"
    		"	#define strokeMult frag[10].x\n"
    		"	#define strokeThr frag[10].y\n"
    		"	#define texType int(frag[10].z)\n"
    		"	#define type int(frag[10].w)\n"
    		"#endif\n"
    		"\n"
    		"float sdroundrect(vec2 pt, vec2 ext, float rad) {\n"
    		"	vec2 ext2 = ext - vec2(rad,rad);\n"
    		"	vec2 d = abs(pt) - ext2;\n"
    		"	return min(max(d.x,d.y),0.0) + length(max(d,0.0)) - rad;\n"
    		"}\n"
    		"\n"
    		"// Scissoring\n"
    		"float scissorMask(vec2 p) {\n"
    		"	vec2 sc = (abs((scissorMat * vec3(p,1.0)).xy) - scissorExt);\n"
    		"	sc = vec2(0.5,0.5) - sc * scissorScale;\n"
    		"	return clamp(sc.x,0.0,1.0) * clamp(sc.y,0.0,1.0);\n"
    		"}\n"
    		"#ifdef EDGE_AA\n"
    		"// Stroke - from [0..1] to clipped pyramid, where the slope is 1px.\n"
    		"float strokeMask() {\n"
    		"	return min(1.0, (1.0-abs(ftcoord.x*2.0-1.0))*strokeMult) * min(1.0, ftcoord.y);\n"
    		"}\n"
    		"#endif\n"
    		"\n"
    		"void main(void) {\n"
    		"   vec4 result;\n"
    		"	float scissor = scissorMask(fpos);\n"
    		"#ifdef EDGE_AA\n"
    		"	float strokeAlpha = strokeMask();\n"
    		"	if (strokeAlpha < strokeThr) discard;\n"
    		"#else\n"
    		"	float strokeAlpha = 1.0;\n"
    		"#endif\n"
    		"	if (type == 0) {			// Gradient\n"
    		"		// Calculate gradient color using box gradient\n"
    		"		vec2 pt = (paintMat * vec3(fpos,1.0)).xy;\n"
    		"		float d = clamp((sdroundrect(pt, extent, radius) + feather*0.5) / feather, 0.0, 1.0);\n"
    		"		vec4 color = mix(innerCol,outerCol,d);\n"
    		"		// Combine alpha\n"
    		"		color *= strokeAlpha * scissor;\n"
    		"		result = color;\n"
    		"	} else if (type == 1) {		// Image\n"
    		"		// Calculate color fron texture\n"
    		"		vec2 pt = (paintMat * vec3(fpos,1.0)).xy / extent;\n"
    		"#ifdef NANOVG_GL3\n"
    		"		vec4 color = texture(tex, pt);\n"
    		"#else\n"
    		"		vec4 color = texture2D(tex, pt);\n"
    		"#endif\n"
    		"		if (texType == 1) color = vec4(color.xyz*color.w,color.w);"
    		"		if (texType == 2) color = vec4(color.x);"
    		"		// Apply color tint and alpha.\n"
    		"		color *= innerCol;\n"
    		"		// Combine alpha\n"
    		"		color *= strokeAlpha * scissor;\n"
    		"		result = color;\n"
    		"	} else if (type == 2) {		// Stencil fill\n"
    		"		result = vec4(1,1,1,1);\n"
    		"	} else if (type == 3) {		// Textured tris\n"
    		"#ifdef NANOVG_GL3\n"
    		"		vec4 color = texture(tex, ftcoord);\n"
    		"#else\n"
    		"		vec4 color = texture2D(tex, ftcoord);\n"
    		"#endif\n"
    		"		if (texType == 1) color = vec4(color.xyz*color.w,color.w);"
    		"		if (texType == 2) color = vec4(color.x);"
    		"		color *= scissor;\n"
    		"		result = color * innerCol;\n"
    		"	}\n"
    		"#ifdef NANOVG_GL3\n"
    		"	outColor = result;\n"
    		"#else\n"
    		"	gl_FragColor = result;\n"
    		"#endif\n"
    		"}\n";
    

    它的功能很完整也很复杂,裁剪和反走样都做了处理。仔细分析之后,我发现了几个性能问题:

    一、颜色填充的问题

    简单颜色填充和渐变颜色填充使用了相同的代码:

    		"	if (type == 0) {			// Gradient
    "
    		"		// Calculate gradient color using box gradient
    "
    		"		vec2 pt = (paintMat * vec3(fpos,1.0)).xy;
    "
    		"		float d = clamp((sdroundrect(pt, extent, radius) + feather*0.5) / feather, 0.0, 1.0);
    "
    		"		vec4 color = mix(innerCol,outerCol,d);
    "
    		"		// Combine alpha
    "
    		"		color *= strokeAlpha * scissor;
    "
    		"		result = color;
    "
    

    问题

    简单颜色填充只需一条指令,而渐变颜色填充则需要数十条指令。这两种情况重用一段代码,会让简单颜色填充慢10倍以上。

    方案

    把颜色填充分成以下几种情况,分别进行优化:

    • 矩形简单颜色填充。

    对于无需裁剪的矩形(这是最常见的情况),直接赋值即可,性能提高20倍以上。

          " if (type == 5) {    //fast fill color
    "
          "   result = innerCol;
    "
    
    • 通用多边形简单颜色填充。

    去掉渐变的采样函数,性能会提高一倍以上:

        " } else if(type == 7) {      // fill color
    "
          "   strokeAlpha = strokeMask();
    "
          "   if (strokeAlpha < strokeThr) discard;
    "
          "   float scissor = scissorMask(fpos);
    "
          "   vec4 color = innerCol;
    "
          "   color *= strokeAlpha * scissor;
    "
          "   result = color;
    "
    
    
    • 渐变颜色填充(只占极小的部分)。

    这种情况非常少见,还是使用之前的代码。

    效果:

    平均情况,填充性能提高10倍以上!

    二、字体的问题

    对于文字而言,需要显示的像素和不显示的像素,平均算下来在1:1左右。

    		"	} else if (type == 3) {		// Textured tris
    "
    		"#ifdef NANOVG_GL3
    "
    		"		vec4 color = texture(tex, ftcoord);
    "
    		"#else
    "
    		"		vec4 color = texture2D(tex, ftcoord);
    "
    		"#endif
    "
    		"		if (texType == 1) color = vec4(color.xyz*color.w,color.w);"
    		"		if (texType == 2) color = vec4(color.x);"
    		"		color *= scissor;
    "
    		"		result = color * innerCol;
    "
    		"	}
    "
    

    问题:

    如果显示的像素和不显示的像素都走完整的流程,会浪费掉一半的时间。

    方案:

    • 当color.x < 0.02时直接跳过。
    • 裁剪和反走样放到判断语句之后。
          " } else if (type == 3) {   // Textured tris
    "
          "#ifdef NANOVG_GL3
    "
          "   vec4 color = texture(tex, ftcoord);
    "
          "#else
    "
          "   vec4 color = texture2D(tex, ftcoord);
    "
          "#endif
    "
          "   if(color.x < 0.02) discard;
    "
          "   strokeAlpha = strokeMask();
    "
          "   if (strokeAlpha < strokeThr) discard;
    "
          "   float scissor = scissorMask(fpos);
    "
          "   color = vec4(color.x);"
          "   color *= scissor;
    "
          "   result = color * innerCol;
    "
          " }
    "
    

    效果:

    字体渲染性能提高一倍!

    三、反走样的问题

    反走样的实现函数如下(其实我也不懂):

    		"float strokeMask() {
    "
    		"	return min(1.0, (1.0-abs(ftcoord.x*2.0-1.0))*strokeMult) * min(1.0, ftcoord.y);
    "
    		"}
    "
    

    问题:

    与简单的赋值操作相比,加上反走样功能,性能会下降5-10倍。但是不加反走样功能,绘制多边形时边缘效果比较差。不加不好看,加了又太慢,看起来是个两难的选择。

    方案:

    矩形填充是可以不用反走样功能的。而90%以上的情况都是矩形填充。矩形填充单独处理,一条指令搞定,性能提高20倍以上:

          " if (type == 5) {    //fast fill color
    "
          "   result = innerCol;
    "
    

    效果:

    配合裁剪和矩形的优化,性能提高10倍以上。

    四、裁剪的问题

    裁剪放到Shader中虽然合理,但是性能就要大打折扣了。

    		"// Scissoring
    "
    		"float scissorMask(vec2 p) {
    "
    		"	vec2 sc = (abs((scissorMat * vec3(p,1.0)).xy) - scissorExt);
    "
    		"	sc = vec2(0.5,0.5) - sc * scissorScale;
    "
    		"	return clamp(sc.x,0.0,1.0) * clamp(sc.y,0.0,1.0);
    "
    		"}
    "
    

    问题:

    与简单的赋值操作相比,加上裁剪功能,性能会下降10倍以上。但是不加裁剪功能,像滚动视图这样的控件就没法实现,这看起来也是个两难的选择。

    方案:

    而90%以上的填充都是在裁剪区域的内部的,没有必要每个像素都去判断,放在Shader之外进行判断即可。

    /*
     * Tell whether every fill vertex of `path` lies inside the scissor box, so
     * that the caller can avoid evaluating the scissor mask per pixel.
     *
     * path    - path whose `fill` array (`nfill` vertices) is tested.
     * scissor - scissor description: `xform` (elements 4 and 5 are used as the
     *           box centre) and `extent` (half width / half height).
     *
     * Returns 1 when all vertices are inside (also when there are no fill
     * vertices), 0 otherwise. Returning 0 is the conservative answer.
     *
     * NOTE(review): this assumes `xform` follows NanoVG's 2x3 affine layout
     * [a b c d e f] (x' = a*x + c*y + e, y' = b*x + d*y + f) - confirm against
     * the NVGscissor definition. Under that layout:
     *   - a rotated or skewed scissor (b != 0 or c != 0) is not an axis-aligned
     *     box on screen, so the test bails out with 0;
     *   - an axis-aligned scale stretches the half extents by |a| and |d|.
     * With an unscaled, unrotated xform the result is the same as before.
     */
    static int glnvg__pathInScissor(const NVGpath* path, NVGscissor* scissor) {
      const float* xf = scissor->xform;
      const NVGvertex* verts = path->fill;
      float sx = xf[0] < 0.0f ? -xf[0] : xf[0];
      float sy = xf[3] < 0.0f ? -xf[3] : xf[3];
      float hw = scissor->extent[0] * sx;
      float hh = scissor->extent[1] * sy;

      float l = xf[4] - hw;
      float t = xf[5] - hh;
      /* The -1 keeps the right/bottom bound one unit inside the box edge. */
      float r = l + 2 * hw - 1;
      float b = t + 2 * hh - 1;
      int i = 0;

      if (xf[1] != 0.0f || xf[2] != 0.0f) {
        return 0;
      }

      for (i = 0; i < path->nfill; i++) {
        /* Coordinates are deliberately truncated toward zero before comparing. */
        int x = (int)verts[i].x;
        int y = (int)verts[i].y;
        if (x < l || x > r || y < t || y > b) {
          return 0;
        }
      }

      return 1;
    }
    

    效果:

    配合裁剪和矩形的优化,性能提高10倍以上。

    五、综合

    综合裁剪、反走样和矩形,新增3个类型,进行特殊处理:

    • 快速填充无需裁剪的矩形:NSVG_SHADER_FAST_FILLCOLOR
    • 快速填充无需裁剪的图片:NSVG_SHADER_FAST_FILLIMG
    • 快速用简单颜色填充多边形:NSVG_SHADER_FILLCOLOR

    裁剪、反走样和矩形可以组合更多类型,进行更精细的优化。但即使只作这三种情况处理,AWTK在Android平台的整体性能已经有了3-5倍的提高,demoui在我们测试的机型上,都稳稳的保持在60FPS,没有必要为了性能增加它的复杂度了。

    详细情况和完整代码请参考AWTK

  • 相关阅读:
    Python的异常处理
    flex stage.width 与stage.stageWidth的区别
    Flex timer使用 keydown事件注册到stage
    flex 事件注册和鼠标拖动
    window.open a.href打开窗口referer的问题
    java UUID的创建
    flex chrome浏览器调试flex程序
    EL表达式 requestScope initParam用法
    Java8 Lumbda表达式 初步
    jQuery 动画的执行
  • 原文地址:https://www.cnblogs.com/hzcya1995/p/13332987.html
Copyright © 2011-2022 走看看