Optimizing the performance of a heavy fragment shader

I need help optimizing the following set of shaders:

Vertex:

    precision mediump float;

    uniform vec2 rubyTextureSize;

    attribute vec4 vPosition;
    attribute vec2 a_TexCoordinate;

    varying vec2 tc;

    void main() {
        gl_Position = vPosition;
        tc = a_TexCoordinate;
    }

Fragment:

    precision mediump float;

    /*
        Uniforms
        - rubyTexture: texture sampler
        - rubyTextureSize: size of the texture before rendering
    */
    uniform sampler2D rubyTexture;
    uniform vec2 rubyTextureSize;
    uniform vec2 rubyTextureFract;

    /*
        Varying attributes
        - tc: coordinate of the texel being processed
        - xyp_[]_[]_[]: a packed coordinate for 3 areas within the texture
    */
    varying vec2 tc;

    /*
        Constants
    */
    /*
        Inequation coefficients for interpolation
        Equations are in the form: Ay + Bx = C
        45, 30, and 60 denote the angle from x each line the coefficient variable set builds
    */
    const vec4 Ai  = vec4(1.0, -1.0, -1.0, 1.0);
    const vec4 B45 = vec4(1.0, 1.0, -1.0, -1.0);
    const vec4 C45 = vec4(1.5, 0.5, -0.5, 0.5);
    const vec4 B30 = vec4(0.5, 2.0, -0.5, -2.0);
    const vec4 C30 = vec4(1.0, 1.0, -0.5, 0.0);
    const vec4 B60 = vec4(2.0, 0.5, -2.0, -0.5);
    const vec4 C60 = vec4(2.0, 0.0, -1.0, 0.5);

    const vec4 M45 = vec4(0.4, 0.4, 0.4, 0.4);
    const vec4 M30 = vec4(0.2, 0.4, 0.2, 0.4);
    const vec4 M60 = M30.yxwz;
    const vec4 Mshift = vec4(0.2);

    // Coefficient for weighted edge detection
    const float coef = 2.0;

    // Threshold for if luminance values are "equal"
    const vec4 threshold = vec4(0.32);

    // Conversion from RGB to Luminance (from GIMP)
    const vec3 lum = vec3(0.21, 0.72, 0.07);

    // Performs same logic operation as && for vectors
    bvec4 _and_(bvec4 A, bvec4 B) {
        return bvec4(A.x && B.x, A.y && B.y, A.z && B.z, A.w && B.w);
    }

    // Performs same logic operation as || for vectors
    bvec4 _or_(bvec4 A, bvec4 B) {
        return bvec4(A.x || B.x, A.y || B.y, A.z || B.z, A.w || B.w);
    }

    // Converts 4 3-color vectors into 1 4-value luminance vector
    vec4 lum_to(vec3 v0, vec3 v1, vec3 v2, vec3 v3) {
        // return vec4(dot(lum, v0), dot(lum, v1), dot(lum, v2), dot(lum, v3));
        return mat4(v0.x, v1.x, v2.x, v3.x,
                    v0.y, v1.y, v2.y, v3.y,
                    v0.z, v1.z, v2.z, v3.z,
                    0.0,  0.0,  0.0,  0.0) * vec4(lum, 0.0);
    }

    // Gets the difference between 2 4-value luminance vectors
    vec4 lum_df(vec4 A, vec4 B) {
        return abs(A - B);
    }

    // Determines if 2 4-value luminance vectors are "equal" based on threshold
    bvec4 lum_eq(vec4 A, vec4 B) {
        return lessThan(lum_df(A, B), threshold);
    }

    vec4 lum_wd(vec4 a, vec4 b, vec4 c, vec4 d, vec4 e, vec4 f, vec4 g, vec4 h) {
        return lum_df(a, b) + lum_df(a, c) + lum_df(d, e) + lum_df(d, f) + 4.0 * lum_df(g, h);
    }

    // Gets the difference between 2 3-value rgb colors
    float c_df(vec3 c1, vec3 c2) {
        vec3 df = abs(c1 - c2);
        return df.r + df.g + df.b;
    }

    void main() {
        /*
            Mask for algorithm
            +-----+-----+-----+-----+-----+
            |     |  1  |  2  |  3  |     |
            +-----+-----+-----+-----+-----+
            |  5  |  6  |  7  |  8  |  9  |
            +-----+-----+-----+-----+-----+
            | 10  | 11  | 12  | 13  | 14  |
            +-----+-----+-----+-----+-----+
            | 15  | 16  | 17  | 18  | 19  |
            +-----+-----+-----+-----+-----+
            |     | 21  | 22  | 23  |     |
            +-----+-----+-----+-----+-----+
        */
        float x = rubyTextureFract.x;
        float y = rubyTextureFract.y;

        vec4 xyp_1_2_3    = tc.xxxy + vec4(-x, 0.0, x, -2.0 * y);
        vec4 xyp_6_7_8    = tc.xxxy + vec4(-x, 0.0, x, -y);
        vec4 xyp_11_12_13 = tc.xxxy + vec4(-x, 0.0, x, 0.0);
        vec4 xyp_16_17_18 = tc.xxxy + vec4(-x, 0.0, x, y);
        vec4 xyp_21_22_23 = tc.xxxy + vec4(-x, 0.0, x, 2.0 * y);
        vec4 xyp_5_10_15  = tc.xyyy + vec4(-2.0 * x, -y, 0.0, y);
        vec4 xyp_9_14_19  = tc.xyyy + vec4(2.0 * x, -y, 0.0, y);

        // Get mask values by performing texture lookup with the uniform sampler
        vec3 P1  = texture2D(rubyTexture, xyp_1_2_3.xw).rgb;
        vec3 P2  = texture2D(rubyTexture, xyp_1_2_3.yw).rgb;
        vec3 P3  = texture2D(rubyTexture, xyp_1_2_3.zw).rgb;

        vec3 P6  = texture2D(rubyTexture, xyp_6_7_8.xw).rgb;
        vec3 P7  = texture2D(rubyTexture, xyp_6_7_8.yw).rgb;
        vec3 P8  = texture2D(rubyTexture, xyp_6_7_8.zw).rgb;

        vec3 P11 = texture2D(rubyTexture, xyp_11_12_13.xw).rgb;
        vec3 P12 = texture2D(rubyTexture, xyp_11_12_13.yw).rgb;
        vec3 P13 = texture2D(rubyTexture, xyp_11_12_13.zw).rgb;

        vec3 P16 = texture2D(rubyTexture, xyp_16_17_18.xw).rgb;
        vec3 P17 = texture2D(rubyTexture, xyp_16_17_18.yw).rgb;
        vec3 P18 = texture2D(rubyTexture, xyp_16_17_18.zw).rgb;

        vec3 P21 = texture2D(rubyTexture, xyp_21_22_23.xw).rgb;
        vec3 P22 = texture2D(rubyTexture, xyp_21_22_23.yw).rgb;
        vec3 P23 = texture2D(rubyTexture, xyp_21_22_23.zw).rgb;

        vec3 P5  = texture2D(rubyTexture, xyp_5_10_15.xy).rgb;
        vec3 P10 = texture2D(rubyTexture, xyp_5_10_15.xz).rgb;
        vec3 P15 = texture2D(rubyTexture, xyp_5_10_15.xw).rgb;

        vec3 P9  = texture2D(rubyTexture, xyp_9_14_19.xy).rgb;
        vec3 P14 = texture2D(rubyTexture, xyp_9_14_19.xz).rgb;
        vec3 P19 = texture2D(rubyTexture, xyp_9_14_19.xw).rgb;

        // Store luminance values of each point in groups of 4
        // so that we may operate on all four corners at once
        vec4 p7  = lum_to(P7, P11, P17, P13);
        vec4 p8  = lum_to(P8, P6, P16, P18);
        vec4 p11 = p7.yzwx;                     // P11, P17, P13, P7
        vec4 p12 = lum_to(P12, P12, P12, P12);
        vec4 p13 = p7.wxyz;                     // P13, P7, P11, P17
        vec4 p14 = lum_to(P14, P2, P10, P22);
        vec4 p16 = p8.zwxy;                     // P16, P18, P8, P6
        vec4 p17 = p7.zwxy;                     // P17, P13, P7, P11
        vec4 p18 = p8.wxyz;                     // P18, P8, P6, P16
        vec4 p19 = lum_to(P19, P3, P5, P21);
        vec4 p22 = p14.wxyz;                    // P22, P14, P2, P10
        vec4 p23 = lum_to(P23, P9, P1, P15);

        // Scale current texel coordinate to [0..1]
        vec2 fp = fract(tc * rubyTextureSize);

        // Determine amount of "smoothing" or mixing that could be done on texel corners
        vec4 AiMulFpy  = Ai * fp.y;
        vec4 B45MulFpx = B45 * fp.x;
        vec4 ma45 = smoothstep(C45 - M45, C45 + M45, AiMulFpy + B45MulFpx);
        vec4 ma30 = smoothstep(C30 - M30, C30 + M30, AiMulFpy + B30 * fp.x);
        vec4 ma60 = smoothstep(C60 - M60, C60 + M60, AiMulFpy + B60 * fp.x);
        vec4 marn = smoothstep(C45 - M45 + Mshift, C45 + M45 + Mshift, AiMulFpy + B45MulFpx);

        // Perform edge weight calculations
        vec4 e45   = lum_wd(p12, p8, p16, p18, p22, p14, p17, p13);
        vec4 econt = lum_wd(p17, p11, p23, p13, p7, p19, p12, p18);
        vec4 e30   = lum_df(p13, p16);
        vec4 e60   = lum_df(p8, p17);

        // Calculate rule results for interpolation
        bvec4 r45_1   = _and_(notEqual(p12, p13), notEqual(p12, p17));
        bvec4 r45_2   = _and_(not(lum_eq(p13, p7)), not(lum_eq(p13, p8)));
        bvec4 r45_3   = _and_(not(lum_eq(p17, p11)), not(lum_eq(p17, p16)));
        bvec4 r45_4_1 = _and_(not(lum_eq(p13, p14)), not(lum_eq(p13, p19)));
        bvec4 r45_4_2 = _and_(not(lum_eq(p17, p22)), not(lum_eq(p17, p23)));
        bvec4 r45_4   = _and_(lum_eq(p12, p18), _or_(r45_4_1, r45_4_2));
        bvec4 r45_5   = _or_(lum_eq(p12, p16), lum_eq(p12, p8));
        bvec4 r45     = _and_(r45_1, _or_(_or_(_or_(r45_2, r45_3), r45_4), r45_5));
        bvec4 r30     = _and_(notEqual(p12, p16), notEqual(p11, p16));
        bvec4 r60     = _and_(notEqual(p12, p8), notEqual(p7, p8));

        // Combine rules with edge weights
        bvec4 edr45 = _and_(lessThan(e45, econt), r45);
        bvec4 edrrn = lessThanEqual(e45, econt);
        bvec4 edr30 = _and_(lessThanEqual(coef * e30, e60), r30);
        bvec4 edr60 = _and_(lessThanEqual(coef * e60, e30), r60);

        // Finalize interpolation rules and cast to float (0.0 for false, 1.0 for true)
        vec4 final45 = vec4(_and_(_and_(not(edr30), not(edr60)), edr45));
        vec4 final30 = vec4(_and_(_and_(edr45, not(edr60)), edr30));
        vec4 final60 = vec4(_and_(_and_(edr45, not(edr30)), edr60));
        vec4 final36 = vec4(_and_(_and_(edr60, edr30), edr45));
        vec4 finalrn = vec4(_and_(not(edr45), edrrn));

        // Determine the color to mix with for each corner
        vec4 px = step(lum_df(p12, p17), lum_df(p12, p13));

        // Determine the mix amounts by combining the final rule result and corresponding
        // mix amount for the rule in each corner
        vec4 mac = final36 * max(ma30, ma60) + final30 * ma30 + final60 * ma60 + final45 * ma45 + finalrn * marn;

        /*
            Calculate the resulting color by traversing clockwise and counter-clockwise
            around the corners of the texel.
            Finally choose the result that has the largest difference from the texel's
            original color.
        */
        vec3 res1 = P12;
        res1 = mix(res1, mix(P13, P17, px.x), mac.x);
        res1 = mix(res1, mix(P7, P13, px.y), mac.y);
        res1 = mix(res1, mix(P11, P7, px.z), mac.z);
        res1 = mix(res1, mix(P17, P11, px.w), mac.w);

        vec3 res2 = P12;
        res2 = mix(res2, mix(P17, P11, px.w), mac.w);
        res2 = mix(res2, mix(P11, P7, px.z), mac.z);
        res2 = mix(res2, mix(P7, P13, px.y), mac.y);
        res2 = mix(res2, mix(P13, P17, px.x), mac.x);

        gl_FragColor = vec4(mix(res1, res2, step(c_df(P12, res1), c_df(P12, res2))), 1.0);
    }

The shaders take a two-dimensional texture and scale it onto a high-resolution 2D surface (the device screen). It is based on the SABR scaling algorithm, in case that matters.

It already works and performs OK on very high-end devices (for example, the LG Nexus 4), but on weaker devices it runs very slowly.

The devices that really matter to me are the Samsung Galaxy S2/S3 with the Mali-400 MP GPU, which handle this shader terribly.

So far I have tried:

  • Eliminating branches (per the ARM Mali developer guide tips): a minor improvement.
  • Overriding mix() with my own implementation: no benefit.
  • Decreasing float precision to lowp: no change.

I measure performance by timing the rendering (before and after eglSwapBuffers); this gives me a very linear and consistent measurement.

Other than that, I really don't know where to look or what could be optimized here.

I know this is a heavy algorithm, and I am not asking for advice on which alternative scaling methods to use; I have tried many, and this algorithm gives the best visual result. I want to use the same algorithm, optimized.

UPDATE

  • I found that if I make all the texture sampling use constant coordinates instead of dependent (computed) vectors, I get a significant performance improvement, so this is clearly a big bottleneck, possibly due to the texture cache. However, I still need those lookups. I tried moving at least some of the coordinates into plain vec2 variables (without any swizzling), but nothing improved, as illustrated by the probe below. I wonder whether there is a way to fetch 21 texels efficiently.
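
For reference, the experiment described above amounts to something like this probe (measurement only; the image is of course wrong). Replacing each computed coordinate with a constant turns all 21 fetches into non-dependent reads of a single cached texel:

    // Probe only: a dependent, swizzled read such as
    //     vec3 P7 = texture2D(rubyTexture, xyp_6_7_8.yw).rgb;
    // becomes a constant-coordinate read. Doing this for all 21 fetches
    // isolates the cost of the dependent reads on this GPU.
    vec3 P7 = texture2D(rubyTexture, vec2(0.5, 0.5)).rgb;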

  • I found that most of the calculations are performed several times on the same set of texels, because the result is scaled by at least 2x and I sample using GL_NEAREST. At least 4 fragments land on exactly the same texels; scaling 4x on a high-resolution device, 16 fragments land on the same texels, which is a lot of waste. Is there a way to add an extra shader pass that computes all the values that do not change across these fragments? I thought of rendering to an off-screen texture, but I would need to store several values per texel, not just one. One possible direction is sketched below.
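
As a hedged sketch of that direction under the single-RGBA-per-texel constraint: a hypothetical pre-pass could render the source into a same-sized off-screen FBO, packing each texel's luminance into the alpha channel (the main shader only ever reads .rgb, so alpha is free). The scaling pass could then fetch luminances directly instead of recomputing the lum_to() dot products for every fragment that lands on these texels. This lifts out only the luminance math, not the fetches or the rule logic:

    // Hypothetical pre-pass, rendered into an RGBA off-screen FBO at the
    // source texture's resolution (320x200 in the example below).
    precision mediump float;

    uniform sampler2D rubyTexture;
    varying vec2 tc;

    // Same RGB-to-luminance weights as the main shader (from GIMP)
    const vec3 lum = vec3(0.21, 0.72, 0.07);

    void main() {
        vec3 c = texture2D(rubyTexture, tc).rgb;
        // Pack the color and its luminance into one texel so the scaling
        // pass can read both with a single fetch.
        gl_FragColor = vec4(c, dot(lum, c));
    }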

UPDATE

  • I tried simplifying the logical expressions using well-known boolean identities; it saved a few operations but had no impact on performance.

UPDATE

  • I thought of a way to move calculations into the vertex shader: instead of a simple full-screen quad, use a "geometry" with one vertex per original pixel before scaling. For example, if my original texture is 320x200 and my target screen is 1280x800, the 320x200 vertices would be spread evenly across the screen, and most of the calculations would be done at those vertices. The problem is that my target devices (S2/S3) do not support vertex texture fetch.

UPDATE

  • Measuring performance on the LG Nexus 4 versus the Samsung Galaxy S3 shows the Nexus 4 to be more than 10 times faster. How can that be? These are two devices of the same generation, with the same resolution, and so on. Can the Mali-400 MP really be that bad in certain situations? I am sure there is something very specific that makes it run this slowly compared to the Nexus 4, but I have not found it yet.
3 answers

In my experience, the performance of a mobile GPU is roughly proportional to the number of texture2D calls. You have 21, which is really a lot. Memory lookups are typically hundreds of times slower than arithmetic, so you can do a lot of math and still be bottlenecked on texture fetches. (It also means that optimizing the rest of your code is likely to achieve little: instead of doing useful work while it waits for a texture fetch, the GPU will simply sit idle while it waits for a texture fetch.) So you need to reduce the number of texture fetches.

It’s hard to say how to do this, since I don’t really understand your shader, but here are some ideas:

  • Split it into a horizontal pass and then a vertical pass. This only works for separable shaders, e.g. blurs, but it can seriously reduce the number of texture lookups. For example, a 5x5 Gaussian blur naively performs 25 lookups; done as a horizontal pass then a vertical pass, it needs only 10.
  • Use linear filtering to "cheat". With linear filtering enabled, sampling exactly at the corner shared by 4 pixels instead of at a pixel centre gives you the average of all 4 pixels for free. I don't know how well this maps to your shader, though. In the blur example, by sampling between the two pixels on each side of the centre pixel, you can cover 5 pixels with 3 texture2D calls, reducing the 5x5 blur to 6 lookups in total (see the sketch after this list).
  • Just use a smaller kernel, so you take fewer samples. This costs quality, so you probably need some way to detect slow devices and switch them to a cheaper shader.
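
This doesn't apply to SABR directly (its kernel is not separable), but as an illustration of the two blur points above, here is a hedged sketch of the horizontal pass, reusing the uniform names from the shader above and assuming rubyTextureFract holds the one-texel step:

    // Horizontal pass of a separable 5-tap binomial (Gaussian-like) blur.
    // GL_LINEAR filtering fetches a weighted pair of texels per lookup,
    // so the 5 taps cost only 3 texture2D calls. A second pass with the
    // offset on the y axis completes the 5x5 blur in 6 lookups total.
    precision mediump float;

    uniform sampler2D rubyTexture;
    uniform vec2 rubyTextureFract;   // one-texel step, as in the shader above

    varying vec2 tc;

    // Binomial weights [1 4 6 4 1]/16 collapsed to 3 linear taps: the
    // centre keeps 6/16; each side pair (4/16 and 1/16) merges into one
    // fetch of weight 5/16 at offset (4*1 + 1*2)/5 = 1.2 texels.
    const float wCentre = 0.375;
    const float wSide   = 0.3125;
    const float oSide   = 1.2;

    void main() {
        vec2 off = vec2(oSide * rubyTextureFract.x, 0.0);
        vec3 c = texture2D(rubyTexture, tc).rgb * wCentre;
        c += texture2D(rubyTexture, tc + off).rgb * wSide;
        c += texture2D(rubyTexture, tc - off).rgb * wSide;
        gl_FragColor = vec4(c, 1.0);
    }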

There are a few Mali-400 oddities that you might need to know about:

  • You really should use varyings without any swizzling for the texture lookups (i.e. compute "xyp_1_2_3.xw" and friends in the vertex shader and pass each one as its own varying, used unswizzled for the lookup); a sketch follows this list.
  • Past a certain number of instructions (unfortunately, an NDA forbids me from quoting the number), performance drops off rather badly. You can get the instruction count from the offline shader compiler. To work around it, you can split the shader into several smaller ones and use the undocumented GL_ARM_framebuffer_read extension to read the result of the previous pass. (Google can tell you how to use it, it seems; grepping the offline shader compiler binaries a bit may also help.)
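
A hedged sketch of the first tip applied to the shaders above: compute the lookup coordinates once per vertex and pass each as its own unswizzled vec2 varying, making the fragment-side fetches non-dependent. GLES 2.0 only guarantees 8 varying vec4s (16 packed vec2s), so all 21 coordinates will not fit as separate varyings; the centre row is shown as an illustration:

    // Vertex shader sketch: precomputed, swizzle-free lookup coordinates.
    precision mediump float;

    uniform vec2 rubyTextureFract;   // one-texel step, as in the shaders above

    attribute vec4 vPosition;
    attribute vec2 a_TexCoordinate;

    varying vec2 tc;
    varying vec2 tc11;   // left, centre and right texels of the middle row
    varying vec2 tc12;
    varying vec2 tc13;

    void main() {
        gl_Position = vPosition;
        vec2 t = a_TexCoordinate;
        tc   = t;
        tc11 = t + vec2(-rubyTextureFract.x, 0.0);
        tc12 = t;
        tc13 = t + vec2(rubyTextureFract.x, 0.0);
    }

The fragment shader then reads, for example, vec3 P11 = texture2D(rubyTexture, tc11).rgb; with no arithmetic or swizzle on the coordinate.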

The upper performance limit of your fragment shader (the lower limit on execution time) is set by the 21 texture loads and the one framebuffer write (gl_FragColor = ...). It would be worthwhile to build a fragment shader that simply performs the 21 loads, accumulates the result of each load into a single vec4, and writes that out. Running this shader on your target hardware tells you the delta between where your more complex shader sits and its maximum potential performance for that specific GPU/driver/platform combination. Your real shader can only be slower, so if this simple test shader is already too slow, you will need to look elsewhere for a solution.
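
A minimal sketch of that baseline probe, assuming the same uniforms as the shader above (note its coordinates are still computed per fragment, like the real shader's):

    // Baseline probe: the same 21 loads as the real shader with all the
    // arithmetic stripped, each fetch accumulated into one vec4. Its
    // frame time approximates the texture-fetch floor for this GPU/driver.
    precision mediump float;

    uniform sampler2D rubyTexture;
    uniform vec2 rubyTextureFract;

    varying vec2 tc;

    void main() {
        float x = rubyTextureFract.x;
        float y = rubyTextureFract.y;
        vec4 acc = vec4(0.0);

        // Middle three rows of the mask: five texels each (15 fetches)
        for (int j = -1; j <= 1; j++) {
            for (int i = -2; i <= 2; i++) {
                acc += texture2D(rubyTexture, tc + vec2(float(i) * x, float(j) * y));
            }
        }
        // Top and bottom rows: three texels each (6 fetches)
        for (int i = -1; i <= 1; i++) {
            acc += texture2D(rubyTexture, tc + vec2(float(i) * x, -2.0 * y));
            acc += texture2D(rubyTexture, tc + vec2(float(i) * x,  2.0 * y));
        }

        gl_FragColor = acc / 21.0;
    }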

Once this baseline is established, I have only vague tips for improving the shader you actually care about, but perhaps the reasoning is of interest. I see that your code has all the texture loads grouped together at the top. At the hardware level, texture loads have extremely long latency, but GPUs can do other work while loads are in flight, including running other threads of the same work unit. This means that a final shader binary with arithmetic spread out between the loads gets that arithmetic done for free in the shadow of the loads, and also that a shader program using few registers allows many threads to be resident at once, each potentially doing its arithmetic while other threads are blocked on texel loads. One would hope that any shader compiler will reorder your code to achieve this scheduling. However, it does not hurt to give it a hand, and thus:

  • Try moving each arithmetic operation as far up the file (lexically) as it can go without breaking its data dependencies. This can help interleave the arithmetic with the loads if the compiler misses the trick.
  • Try to consume every intermediate result as soon as possible, so the compiler can see that its variables are dead and free their registers. This reduces register usage and therefore increases the occupancy of your program. One way to get this effect, if you have a bunch of partial results that are summed at the end, is to convert the final sum over many partial-result variables into accumulation into a single variable as each partial result is produced (see the sketch after this list).
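
Applied to the shader above, the second point might look like this sketch (assuming the compiler does not already do it): a drop-in replacement for the final45 ... finalrn and mac section of main(), folding each partial result into the accumulator as soon as it exists so its registers can die immediately:

    // Before: five vec4 temporaries (final36, final30, final60, final45,
    // finalrn) all stay live until the single expression that sums them.
    // After: accumulate each term as soon as it is produced.
    vec4 mac = vec4(_and_(_and_(edr60, edr30), edr45)) * max(ma30, ma60);
    mac += vec4(_and_(_and_(edr45, not(edr60)), edr30)) * ma30;
    mac += vec4(_and_(_and_(edr45, not(edr30)), edr60)) * ma60;
    mac += vec4(_and_(_and_(not(edr30), not(edr60)), edr45)) * ma45;
    mac += vec4(_and_(not(edr45), edrrn)) * marn;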

As always with performance, YMMV


Source: https://habr.com/ru/post/1495846/

