glsl - Bug in OpenGL? Identical float computation leads to different results -
the "minimial example" quite long unfortunately bug disappears if drop seamingly irrelevant code.
here c++ code:
#define _use_math_defines #include <iostream> #include <iomanip> #include <vector> #include <math.h> #include <time.h> #include <fstream> #include <sstream> //opengl includes #include <gl/glew.h> //opengl extension wrangler #include <gl/freeglut.h> //free opengl utility toolkit (includes gl.h , glu.h) void fpressenter(){ std::cout << "press enter exit."; std::cin.get(); } void fcompileshaderfromstring(const std::string& source, const gluint shaderid){ const char* code = source.c_str(); glshadersource(shaderid, 1, &code, null); glcompileshader(shaderid); int status; glgetshaderiv(shaderid, gl_compile_status, &status); if(status == gl_false){ int length = 0; glgetshaderiv(shaderid, gl_info_log_length, &length); if(length > 0){ char* log = new char[length]; int written = 0; glgetshaderinfolog(shaderid, length, &written, log); std::cout << log << std::endl; delete[] log; } } } void fcompileshaderfromfile(const char* filename, const gluint shaderid){ std::ifstream infile(filename, std::ios::in); std::ostringstream code; while(infile.good()){ int ti = infile.get(); if(!infile.eof()) code << (char) ti; } infile.close(); fcompileshaderfromstring(code.str(), shaderid); } void flinkprogram(const gluint programid){ gllinkprogram(programid); int status; glgetprogramiv(programid, gl_link_status, &status); if(status == gl_false){ int length = 0; glgetprogramiv(programid, gl_info_log_length, &length); if(length > 0){ char* log = new char[length]; int written = 0; glgetprograminfolog(programid, length, &written, log); std::cout << log << std::endl; delete[] log; } } } int main(){ //opengl setup //glut int argc = 1; char* argv[1] = {(char*) ""}; glutinit(&argc, argv); glutinitwindowposition((glutget(glut_screen_width) - 256) / 2, (glutget(glut_screen_height) - 256) / 2); glutinitwindowsize(256, 256); glutinitdisplaymode(glut_rgb); glutcreatewindow("bug"); gluthidewindow(); //glew glenum glew_ok = glewinit(); if(glew_ok != glew_ok){fprintf(stderr, "glew error: '%s'\n", glewgeterrorstring(glew_ok)); fpressenter(); return(1);} //main program //auxiliary variables gluint shaderid, programid, bufferid; float* bufferpointer; float* data = new float[32 * 32 * 2]; float dsup, esup; //compile shader program , create buffer in graphics card memory shaderid = glcreateshader(gl_compute_shader); fcompileshaderfromfile("shader.txt", shaderid); programid = glcreateprogram(); glattachshader(programid, shaderid); flinkprogram(programid); gldetachshader(programid, shaderid); gldeleteshader(shaderid); gluseprogram(programid); glgenbuffers(1, &bufferid); glbindbufferbase(gl_shader_storage_buffer, 0, bufferid); glbufferdata(gl_shader_storage_buffer, 32 * 32 * 2 * sizeof(float), null, gl_stream_read); glbindbuffer(gl_shader_storage_buffer, 0); //actual computation std::cout << "starting computation" << std::endl << std::endl << std::flush; dsup = 0.f; esup = 0.f; gluniform1i(0, 0); gluniform1i(1, 0); gldispatchcompute(4, 4, 1); glmemorybarrier(gl_all_barrier_bits); glfinish(); glbindbufferbase(gl_shader_storage_buffer, 0, bufferid); bufferpointer = (float*) glmapbuffer(gl_shader_storage_buffer, gl_read_only); memcpy(data, bufferpointer, 32 * 32 * 2 * sizeof(float)); glunmapbuffer(gl_shader_storage_buffer); glbindbuffer(gl_shader_storage_buffer, 0); for(int z2 = 0; z2 < 32; z2++) for(int z3 = 0; z3 < 32; z3++){ if(data[z2 * 32 * 2 + z3 * 2 + 0] > dsup) dsup = data[z2 * 32 * 2 + z3 * 2 + 0]; if(data[z2 * 32 * 2 + z3 * 2 + 1] > esup) esup = data[z2 * 32 * 2 + z3 * 2 + 1]; } std::cout << std::setprecision(8) << dsup << ", " << std::setprecision(8) << esup << std::endl; //cleanup delete[] data; gldeletebuffers(1, &bufferid); gldeleteprogram(programid); fpressenter(); return(0); }
and here shader:
#version 430 core const uint mg = 2048; const uint mb = 512; const uint = 128; layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in; layout(binding = 0) buffer outputblock{ float data[32][32][2]; } output; layout(location = 0) uniform int goffset; layout(location = 1) uniform int boffset; vec2 fe(float x){return(vec2(cos(6.2831853e0f * x), sin(6.2831853e0f * x)));} vec2 fcomplextimes(const vec2 a, const vec2 b){return(vec2(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x));} vec2 fcomplextimesi(const vec2 a){return(vec2(-a.y, a.x));} vec2 fcomplextimesminusi(const vec2 a){return(vec2(a.y, -a.x));} float dsup, esup; void fdesuptaylor_a10_w3_d2(){ const vec2 1 = vec2(1.f, 0.f); uint q, zg, zb, zy, n; float fq, fb, fg, fys, fymb, fmg, fmb, fatmy; float dsum, esum, dsummandsup, esummandsup, dabs, eabs; float t; vec2 td, te; vec2 di_0_, di_0_y, di_0_yy, di_1_, di_1_g, di_1_b, di_1_y, di_1_gg, di_1_gb, di_1_gy, di_1_bg, di_1_bb, di_1_by, di_1_yg, di_1_yb, di_1_yy, di_2_, di_2_g, di_2_b, di_2_y, di_2_gg, di_2_gb, di_2_gy, di_2_bg, di_2_bb, di_2_by, di_2_yg, di_2_yb, di_2_yy; vec2 ei_0_, ei_0_y, ei_0_yy, ei_1_, ei_1_g, ei_1_b, ei_1_y, ei_1_gg, ei_1_gb, ei_1_gy, ei_1_bg, ei_1_bb, ei_1_by, ei_1_yg, ei_1_yb, ei_1_yy, ei_2_, ei_2_g, ei_2_b, ei_2_y, ei_2_gg, ei_2_gb, ei_2_gy, ei_2_bg, ei_2_bb, ei_2_by, ei_2_yg, ei_2_yb, ei_2_yy; vec2 d_2_, d_2_g, d_2_b, d_2_y, d_2_gg, d_2_gb, d_2_gy, d_2_bg, d_2_bb, d_2_by, d_2_yg, d_2_yb, d_2_yy, d_3_, d_3_g, d_3_b, d_3_y, d_3_gg, d_3_gb, d_3_gy, d_3_bg, d_3_bb, d_3_by, d_3_yg, d_3_yb, d_3_yy; vec2 e_2_, e_2_g, e_2_b, e_2_y, e_2_gg, e_2_gb, e_2_gy, e_2_bg, e_2_bb, e_2_by, e_2_yg, e_2_yb, e_2_yy, e_3_, e_3_g, e_3_b, e_3_y, e_3_gg, e_3_gb, e_3_gy, e_3_bg, e_3_bb, e_3_by, e_3_yg, e_3_yb, e_3_yy; float r_g, r_b, r_y, r_gg, r_gb, r_gy, r_bg, r_bb, r_by, r_yg, r_yb, r_yy; fmg = float(mg); fmb = float(mb); fatmy = 10.f * float(my); r_g = 2.f * fmg; r_b = 2.f * fmb; r_y = 2.f * fatmy; r_gg = 8.f * pow(fmg, 2.f); r_gb = 8.f * fmg * fmb; r_gy = 8.f * fmg * fatmy; r_bg = 8.f * fmg * fmb; r_bb = 8.f * pow(fmb, 2.f); r_by = 8.f * fmb * fatmy; r_yg = 8.f * fmg * fatmy; r_yb = 8.f * fmb * fatmy; r_yy = 8.f * pow(fatmy, 2.f); dsup = 0.f; esup = 0.f; zg = goffset + gl_globalinvocationid.x; fg = float(zg) / fmg; zb = boffset + gl_globalinvocationid.y; fb = float(zb) / fmb; for(q = 0; q < 10; q++){ fq = float(q); dsum = 0.f; esum = 0.f; for(n = 0; n <= 1031; n++){ dsummandsup = 0.f; esummandsup = 0.f; fys = float(n) / 10.f; for(zy = 0; zy <= my; zy++){ fymb = fys + float(zy) / fatmy - fb; t = fb + fymb; di_0_ = 1 + fe(1.e-1f * fq + t) + fe(2.e-1f * fq + 2.f * t) + fe(3.e-1f * fq + 3.f * t) + fe(4.e-1f * fq + 4.f * t) + fe(5.e-1f * fq + 5.f * t) + fe(6.e-1f * fq + 6.f * t) + fe(7.e-1f * fq + 7.f * t) + fe(8.e-1f * fq + 8.f * t) + fe(9.e-1f * fq + 9.f * t); ei_0_ = fe(10.f * t); td = fe(1.e-1f * fq + t) + 2.f * fe(2.e-1f * fq + 2.f * t) + 3.f * fe(3.e-1f * fq + 3.f * t) + 4.f * fe(4.e-1f * fq + 4.f * t) + 5.f * fe(5.e-1f * fq + 5.f * t) + 6.f * fe(6.e-1f * fq + 6.f * t) + 7.f * fe(7.e-1f * fq + 7.f * t) + 8.f * fe(8.e-1f * fq + 8.f * t) + 9.f * fe(9.e-1f * fq + 9.f * t); td = fcomplextimesi(td); te = fe(10.f * t); te = fcomplextimesi(te); di_0_y = 6.2831853e0f * td; ei_0_y = 6.2831853e1f * te; td = fe(1.e-1f * fq + t) + 4.f * fe(2.e-1f * fq + 2.f * t) + 9.f * fe(3.e-1f * fq + 3.f * t) + 16.f * fe(4.e-1f * fq + 4.f * t) + 25.f * fe(5.e-1f * fq + 5.f * t) + 36.f * fe(6.e-1f * fq + 6.f * t) + 49.f * fe(7.e-1f * fq + 7.f * t) + 64.f * fe(8.e-1f * fq + 8.f * t) + 81.f * fe(9.e-1f * fq + 9.f * t); td = -td; te = fe(10.f * t); te = -te; di_0_yy = 3.9478418e1f * td; ei_0_yy = 3.9478418e3f * te; t = fg + fb + 9.9019514e-2f * fymb; di_1_ = 1 + fe(-fq + t) + fe(-2.f * fq + 2.f * t) + fe(-3.f * fq + 3.f * t) + fe(-4.f * fq + 4.f * t) + fe(-5.f * fq + 5.f * t) + fe(-6.f * fq + 6.f * t) + fe(-7.f * fq + 7.f * t) + fe(-8.f * fq + 8.f * t) + fe(-9.f * fq + 9.f * t); ei_1_ = fe(10.f * t); td = fe(-fq + t) + 2.f * fe(-2.f * fq + 2.f * t) + 3.f * fe(-3.f * fq + 3.f * t) + 4.f * fe(-4.f * fq + 4.f * t) + 5.f * fe(-5.f * fq + 5.f * t) + 6.f * fe(-6.f * fq + 6.f * t) + 7.f * fe(-7.f * fq + 7.f * t) + 8.f * fe(-8.f * fq + 8.f * t) + 9.f * fe(-9.f * fq + 9.f * t); td = fcomplextimesi(td); te = fe(10.f * t); te = fcomplextimesi(te); di_1_g = 6.2831853e0f * td; ei_1_g = 6.2831853e1f * te; di_1_b = 5.6610274e0f * td; ei_1_b = 5.6610274e1f * te; di_1_y = 6.2215795e-1f * td; ei_1_y = 6.2215795e0f * te; td = fe(-fq + t) + 4.f * fe(-2.f * fq + 2.f * t) + 9.f * fe(-3.f * fq + 3.f * t) + 16.f * fe(-4.f * fq + 4.f * t) + 25.f * fe(-5.f * fq + 5.f * t) + 36.f * fe(-6.f * fq + 6.f * t) + 49.f * fe(-7.f * fq + 7.f * t) + 64.f * fe(-8.f * fq + 8.f * t) + 81.f * fe(-9.f * fq + 9.f * t); td = -td; te = fe(10.f * t); te = -te; di_1_gg = 3.9478418e1f * td; ei_1_gg = 3.9478418e3f * te; di_1_gb = 3.5569284e1f * td; ei_1_gb = 3.5569284e3f * te; di_1_gy = 3.9091337e0f * td; ei_1_gy = 3.9091337e2f * te; di_1_bg = 3.5569284e1f * td; ei_1_bg = 3.5569284e3f * te; di_1_bb = 3.2047231e1f * td; ei_1_bb = 3.2047231e3f * te; di_1_by = 3.5220532e0f * td; ei_1_by = 3.5220532e2f * te; di_1_yg = 3.9091337e0f * td; ei_1_yg = 3.9091337e2f * te; di_1_yb = 3.5220532e0f * td; ei_1_yb = 3.5220532e2f * te; di_1_yy = 3.8708052e-1f * td; ei_1_yy = 3.8708052e1f * te; t = -10.f * fg + fb + 9.8048641e-3f * fymb; di_2_ = 1 + fe(1.01e1f * fq + t) + fe(2.02e1f * fq + 2.f * t) + fe(3.03e1f * fq + 3.f * t) + fe(4.04e1f * fq + 4.f * t) + fe(5.05e1f * fq + 5.f * t) + fe(6.06e1f * fq + 6.f * t) + fe(7.07e1f * fq + 7.f * t) + fe(8.08e1f * fq + 8.f * t) + fe(9.09e1f * fq + 9.f * t); ei_2_ = fe(10.f * t); td = fe(1.01e1f * fq + t) + 2.f * fe(2.02e1f * fq + 2.f * t) + 3.f * fe(3.03e1f * fq + 3.f * t) + 4.f * fe(4.04e1f * fq + 4.f * t) + 5.f * fe(5.05e1f * fq + 5.f * t) + 6.f * fe(6.06e1f * fq + 6.f * t) + 7.f * fe(7.07e1f * fq + 7.f * t) + 8.f * fe(8.08e1f * fq + 8.f * t) + 9.f * fe(9.09e1f * fq + 9.f * t); td = fcomplextimesi(td); te = fe(10.f * t); te = fcomplextimesi(te); di_2_g = -6.2831853e1f * td; ei_2_g = -6.2831853e2f * te; di_2_b = 6.2215795e0f * td; ei_2_b = 6.2215795e1f * te; di_2_y = 6.1605778e-2f * td; ei_2_y = 6.1605778e-1f * te; td = fe(1.01e1f * fq + t) + 4.f * fe(2.02e1f * fq + 2.f * t) + 9.f * fe(3.03e1f * fq + 3.f * t) + 16.f * fe(4.04e1f * fq + 4.f * t) + 25.f * fe(5.05e1f * fq + 5.f * t) + 36.f * fe(6.06e1f * fq + 6.f * t) + 49.f * fe(7.07e1f * fq + 7.f * t) + 64.f * fe(8.08e1f * fq + 8.f * t) + 81.f * fe(9.09e1f * fq + 9.f * t); td = -td; te = fe(10.f * t); te = -te; di_2_gg = 3.9478418e3f * td; ei_2_gg = 3.9478418e5f * te; di_2_gb = -3.9091337e2f * td; ei_2_gb = -3.9091337e4f * te; di_2_gy = -3.8708052e0f * td; ei_2_gy = -3.8708052e2f * te; di_2_bg = -3.9091337e2f * td; ei_2_bg = -3.9091337e4f * te; di_2_bb = 3.8708052e1f * td; ei_2_bb = 3.8708052e3f * te; di_2_by = 3.8328525e-1f * td; ei_2_by = 3.8328525e1f * te; di_2_yg = -3.8708052e0f * td; ei_2_yg = -3.8708052e2f * te; di_2_yb = 3.8328525e-1f * td; ei_2_yb = 3.8328525e1f * te; di_2_yy = 3.7952719e-3f * td; ei_2_yy = 3.7952719e-1f * te; d_2_ = fcomplextimes(di_0_, di_1_) + ei_0_; d_2_g = fcomplextimes(di_0_, di_1_g); d_2_b = fcomplextimes(di_0_, di_1_b); d_2_y = fcomplextimes(di_0_y, di_1_) + fcomplextimes(di_0_, di_1_y) + ei_0_y; d_2_gg = fcomplextimes(di_0_, di_1_gg); d_2_gb = fcomplextimes(di_0_, di_1_gb); d_2_gy = fcomplextimes(di_0_y, di_1_g) + fcomplextimes(di_0_, di_1_gy); d_2_bg = fcomplextimes(di_0_, di_1_bg); d_2_bb = fcomplextimes(di_0_, di_1_bb); d_2_by = fcomplextimes(di_0_y, di_1_b) + fcomplextimes(di_0_, di_1_by); d_2_yg = fcomplextimes(di_0_y, di_1_g) + fcomplextimes(di_0_, di_1_yg); d_2_yb = fcomplextimes(di_0_y, di_1_b) + fcomplextimes(di_0_, di_1_yb); d_2_yy = fcomplextimes(di_0_yy, di_1_) + fcomplextimes(di_0_y, di_1_y) + fcomplextimes(di_0_y, di_1_y) + fcomplextimes(di_0_, di_1_yy) + ei_0_yy; e_2_ = fcomplextimes(di_0_, ei_1_); e_2_g = fcomplextimes(di_0_, ei_1_g); e_2_b = fcomplextimes(di_0_, ei_1_b); e_2_y = fcomplextimes(di_0_y, ei_1_) + fcomplextimes(di_0_, ei_1_y); e_2_gg = fcomplextimes(di_0_, ei_1_gg); e_2_gb = fcomplextimes(di_0_, ei_1_gb); e_2_gy = fcomplextimes(di_0_y, ei_1_g) + fcomplextimes(di_0_, ei_1_gy); e_2_bg = fcomplextimes(di_0_, ei_1_bg); e_2_bb = fcomplextimes(di_0_, ei_1_bb); e_2_by = fcomplextimes(di_0_y, ei_1_b) + fcomplextimes(di_0_, ei_1_by); e_2_yg = fcomplextimes(di_0_y, ei_1_g) + fcomplextimes(di_0_, ei_1_yg); e_2_yb = fcomplextimes(di_0_y, ei_1_b) + fcomplextimes(di_0_, ei_1_yb); e_2_yy = fcomplextimes(di_0_yy, ei_1_) + fcomplextimes(di_0_y, ei_1_y) + fcomplextimes(di_0_y, ei_1_y) + fcomplextimes(di_0_, ei_1_yy); d_3_ = fcomplextimes(d_2_, di_2_) + e_2_; d_3_g = fcomplextimes(d_2_g, di_2_) + fcomplextimes(d_2_, di_2_g) + e_2_g; d_3_b = fcomplextimes(d_2_b, di_2_) + fcomplextimes(d_2_, di_2_b) + e_2_b; d_3_y = fcomplextimes(d_2_y, di_2_) + fcomplextimes(d_2_, di_2_y) + e_2_y; d_3_gg = fcomplextimes(d_2_gg, di_2_) + fcomplextimes(d_2_g, di_2_g) + fcomplextimes(d_2_g, di_2_g) + fcomplextimes(d_2_, di_2_gg) + e_2_gg; d_3_gb = fcomplextimes(d_2_gb, di_2_) + fcomplextimes(d_2_g, di_2_b) + fcomplextimes(d_2_b, di_2_g) + fcomplextimes(d_2_, di_2_gb) + e_2_gb; d_3_gy = fcomplextimes(d_2_gy, di_2_) + fcomplextimes(d_2_g, di_2_y) + fcomplextimes(d_2_y, di_2_g) + fcomplextimes(d_2_, di_2_gy) + e_2_gy; d_3_bg = fcomplextimes(d_2_bg, di_2_) + fcomplextimes(d_2_b, di_2_g) + fcomplextimes(d_2_g, di_2_b) + fcomplextimes(d_2_, di_2_bg) + e_2_bg; d_3_bb = fcomplextimes(d_2_bb, di_2_) + fcomplextimes(d_2_b, di_2_b) + fcomplextimes(d_2_b, di_2_b) + fcomplextimes(d_2_, di_2_bb) + e_2_bb; d_3_by = fcomplextimes(d_2_by, di_2_) + fcomplextimes(d_2_b, di_2_y) + fcomplextimes(d_2_y, di_2_b) + fcomplextimes(d_2_, di_2_by) + e_2_by; //the bug occurs here: use 1 of 2 following semantically identical lines different results d_3_yg = fcomplextimes(d_2_gy, di_2_) + fcomplextimes(d_2_g, di_2_y) + fcomplextimes(d_2_y, di_2_g) + fcomplextimes(d_2_, di_2_gy) + e_2_gy; //d_3_yg = d_3_gy; d_3_yb = fcomplextimes(d_2_yb, di_2_) + fcomplextimes(d_2_y, di_2_b) + fcomplextimes(d_2_b, di_2_y) + fcomplextimes(d_2_, di_2_yb) + e_2_yb; d_3_yy = fcomplextimes(d_2_yy, di_2_) + fcomplextimes(d_2_y, di_2_y) + fcomplextimes(d_2_y, di_2_y) + fcomplextimes(d_2_, di_2_yy) + e_2_yy; dabs = length(d_3_) + length(d_3_g) / r_g + length(d_3_b) / r_b + length(d_3_y) / r_y + length(d_3_gg) / r_gg + length(d_3_gb) / r_gb + length(d_3_gy) / r_gy + length(d_3_bg) / r_bg + length(d_3_bb) / r_bb + length(d_3_by) / r_by + length(d_3_yg) / r_yg + length(d_3_yb) / r_yb + length(d_3_yy) / r_yy; dsummandsup = max(dsummandsup, dabs); e_3_ = fcomplextimes(d_2_, ei_2_); e_3_g = fcomplextimes(d_2_g, ei_2_) + fcomplextimes(d_2_, ei_2_g); e_3_b = fcomplextimes(d_2_b, ei_2_) + fcomplextimes(d_2_, ei_2_b); e_3_y = fcomplextimes(d_2_y, ei_2_) + fcomplextimes(d_2_, ei_2_y); e_3_gg = fcomplextimes(d_2_gg, ei_2_) + fcomplextimes(d_2_g, ei_2_g) + fcomplextimes(d_2_g, ei_2_g) + fcomplextimes(d_2_, ei_2_gg); e_3_gb = fcomplextimes(d_2_gb, ei_2_) + fcomplextimes(d_2_g, ei_2_b) + fcomplextimes(d_2_b, ei_2_g) + fcomplextimes(d_2_, ei_2_gb); e_3_gy = fcomplextimes(d_2_gy, ei_2_) + fcomplextimes(d_2_g, ei_2_y) + fcomplextimes(d_2_y, ei_2_g) + fcomplextimes(d_2_, ei_2_gy); e_3_bg = fcomplextimes(d_2_bg, ei_2_) + fcomplextimes(d_2_b, ei_2_g) + fcomplextimes(d_2_g, ei_2_b) + fcomplextimes(d_2_, ei_2_bg); e_3_bb = fcomplextimes(d_2_bb, ei_2_) + fcomplextimes(d_2_b, ei_2_b) + fcomplextimes(d_2_b, ei_2_b) + fcomplextimes(d_2_, ei_2_bb); e_3_by = fcomplextimes(d_2_by, ei_2_) + fcomplextimes(d_2_b, ei_2_y) + fcomplextimes(d_2_y, ei_2_b) + fcomplextimes(d_2_, ei_2_by); e_3_yg = fcomplextimes(d_2_yg, ei_2_) + fcomplextimes(d_2_y, ei_2_g) + fcomplextimes(d_2_g, ei_2_y) + fcomplextimes(d_2_, ei_2_yg); e_3_yb = fcomplextimes(d_2_yb, ei_2_) + fcomplextimes(d_2_y, ei_2_b) + fcomplextimes(d_2_b, ei_2_y) + fcomplextimes(d_2_, ei_2_yb); e_3_yy = fcomplextimes(d_2_yy, ei_2_) + fcomplextimes(d_2_y, ei_2_y) + fcomplextimes(d_2_y, ei_2_y) + fcomplextimes(d_2_, ei_2_yy); eabs = length(e_3_) + length(e_3_g) / r_g + length(e_3_b) / r_b + length(e_3_y) / r_y + length(e_3_gg) / r_gg + length(e_3_gb) / r_gb + length(e_3_gy) / r_gy + length(e_3_bg) / r_bg + length(e_3_bb) / r_bb + length(e_3_by) / r_by + length(e_3_yg) / r_yg + length(e_3_yb) / r_yb + length(e_3_yy) / r_yy; esummandsup = max(esummandsup, eabs); } dsum += dsummandsup; esum += esummandsup; } for(n = 1032; n <= 10402; n++){ dsummandsup = 0.f; esummandsup = 0.f; fys = float(n) / 10.f; for(zy = 0; zy <= my; zy++){ fymb = fys + float(zy) / fatmy - fb; t = fb + fymb; di_0_ = 1 + fe(1.e-1f * fq + t) + fe(2.e-1f * fq + 2.f * t) + fe(3.e-1f * fq + 3.f * t) + fe(4.e-1f * fq + 4.f * t) + fe(5.e-1f * fq + 5.f * t) + fe(6.e-1f * fq + 6.f * t) + fe(7.e-1f * fq + 7.f * t) + fe(8.e-1f * fq + 8.f * t) + fe(9.e-1f * fq + 9.f * t); ei_0_ = fe(10.f * t); td = fe(1.e-1f * fq + t) + 2.f * fe(2.e-1f * fq + 2.f * t) + 3.f * fe(3.e-1f * fq + 3.f * t) + 4.f * fe(4.e-1f * fq + 4.f * t) + 5.f * fe(5.e-1f * fq + 5.f * t) + 6.f * fe(6.e-1f * fq + 6.f * t) + 7.f * fe(7.e-1f * fq + 7.f * t) + 8.f * fe(8.e-1f * fq + 8.f * t) + 9.f * fe(9.e-1f * fq + 9.f * t); td = fcomplextimesi(td); te = fe(10.f * t); te = fcomplextimesi(te); di_0_y = 6.2831853e0f * td; ei_0_y = 6.2831853e1f * te; td = fe(1.e-1f * fq + t) + 4.f * fe(2.e-1f * fq + 2.f * t) + 9.f * fe(3.e-1f * fq + 3.f * t) + 16.f * fe(4.e-1f * fq + 4.f * t) + 25.f * fe(5.e-1f * fq + 5.f * t) + 36.f * fe(6.e-1f * fq + 6.f * t) + 49.f * fe(7.e-1f * fq + 7.f * t) + 64.f * fe(8.e-1f * fq + 8.f * t) + 81.f * fe(9.e-1f * fq + 9.f * t); td = -td; te = fe(10.f * t); te = -te; di_0_yy = 3.9478418e1f * td; ei_0_yy = 3.9478418e3f * te; t = fg + fb + 9.9019514e-2f * fymb; di_1_ = 1 + fe(-fq + t) + fe(-2.f * fq + 2.f * t) + fe(-3.f * fq + 3.f * t) + fe(-4.f * fq + 4.f * t) + fe(-5.f * fq + 5.f * t) + fe(-6.f * fq + 6.f * t) + fe(-7.f * fq + 7.f * t) + fe(-8.f * fq + 8.f * t) + fe(-9.f * fq + 9.f * t); ei_1_ = fe(10.f * t); td = fe(-fq + t) + 2.f * fe(-2.f * fq + 2.f * t) + 3.f * fe(-3.f * fq + 3.f * t) + 4.f * fe(-4.f * fq + 4.f * t) + 5.f * fe(-5.f * fq + 5.f * t) + 6.f * fe(-6.f * fq + 6.f * t) + 7.f * fe(-7.f * fq + 7.f * t) + 8.f * fe(-8.f * fq + 8.f * t) + 9.f * fe(-9.f * fq + 9.f * t); td = fcomplextimesi(td); te = fe(10.f * t); te = fcomplextimesi(te); di_1_g = 6.2831853e0f * td; ei_1_g = 6.2831853e1f * te; di_1_b = 5.6610274e0f * td; ei_1_b = 5.6610274e1f * te; di_1_y = 6.2215795e-1f * td; ei_1_y = 6.2215795e0f * te; td = fe(-fq + t) + 4.f * fe(-2.f * fq + 2.f * t) + 9.f * fe(-3.f * fq + 3.f * t) + 16.f * fe(-4.f * fq + 4.f * t) + 25.f * fe(-5.f * fq + 5.f * t) + 36.f * fe(-6.f * fq + 6.f * t) + 49.f * fe(-7.f * fq + 7.f * t) + 64.f * fe(-8.f * fq + 8.f * t) + 81.f * fe(-9.f * fq + 9.f * t); td = -td; te = fe(10.f * t); te = -te; di_1_gg = 3.9478418e1f * td; ei_1_gg = 3.9478418e3f * te; di_1_gb = 3.5569284e1f * td; ei_1_gb = 3.5569284e3f * te; di_1_gy = 3.9091337e0f * td; ei_1_gy = 3.9091337e2f * te; di_1_bg = 3.5569284e1f * td; ei_1_bg = 3.5569284e3f * te; di_1_bb = 3.2047231e1f * td; ei_1_bb = 3.2047231e3f * te; di_1_by = 3.5220532e0f * td; ei_1_by = 3.5220532e2f * te; di_1_yg = 3.9091337e0f * td; ei_1_yg = 3.9091337e2f * te; di_1_yb = 3.5220532e0f * td; ei_1_yb = 3.5220532e2f * te; di_1_yy = 3.8708052e-1f * td; ei_1_yy = 3.8708052e1f * te; t = -10.f * fg + fb + 9.8048641e-3f * fymb; di_2_ = 1 + fe(1.01e1f * fq + t) + fe(2.02e1f * fq + 2.f * t) + fe(3.03e1f * fq + 3.f * t) + fe(4.04e1f * fq + 4.f * t) + fe(5.05e1f * fq + 5.f * t) + fe(6.06e1f * fq + 6.f * t) + fe(7.07e1f * fq + 7.f * t) + fe(8.08e1f * fq + 8.f * t) + fe(9.09e1f * fq + 9.f * t); ei_2_ = fe(10.f * t); td = fe(1.01e1f * fq + t) + 2.f * fe(2.02e1f * fq + 2.f * t) + 3.f * fe(3.03e1f * fq + 3.f * t) + 4.f * fe(4.04e1f * fq + 4.f * t) + 5.f * fe(5.05e1f * fq + 5.f * t) + 6.f * fe(6.06e1f * fq + 6.f * t) + 7.f * fe(7.07e1f * fq + 7.f * t) + 8.f * fe(8.08e1f * fq + 8.f * t) + 9.f * fe(9.09e1f * fq + 9.f * t); td = fcomplextimesi(td); te = fe(10.f * t); te = fcomplextimesi(te); di_2_g = -6.2831853e1f * td; ei_2_g = -6.2831853e2f * te; di_2_b = 6.2215795e0f * td; ei_2_b = 6.2215795e1f * te; di_2_y = 6.1605778e-2f * td; ei_2_y = 6.1605778e-1f * te; td = fe(1.01e1f * fq + t) + 4.f * fe(2.02e1f * fq + 2.f * t) + 9.f * fe(3.03e1f * fq + 3.f * t) + 16.f * fe(4.04e1f * fq + 4.f * t) + 25.f * fe(5.05e1f * fq + 5.f * t) + 36.f * fe(6.06e1f * fq + 6.f * t) + 49.f * fe(7.07e1f * fq + 7.f * t) + 64.f * fe(8.08e1f * fq + 8.f * t) + 81.f * fe(9.09e1f * fq + 9.f * t); td = -td; te = fe(10.f * t); te = -te; di_2_gg = 3.9478418e3f * td; ei_2_gg = 3.9478418e5f * te; di_2_gb = -3.9091337e2f * td; ei_2_gb = -3.9091337e4f * te; di_2_gy = -3.8708052e0f * td; ei_2_gy = -3.8708052e2f * te; di_2_bg = -3.9091337e2f * td; ei_2_bg = -3.9091337e4f * te; di_2_bb = 3.8708052e1f * td; ei_2_bb = 3.8708052e3f * te; di_2_by = 3.8328525e-1f * td; ei_2_by = 3.8328525e1f * te; di_2_yg = -3.8708052e0f * td; ei_2_yg = -3.8708052e2f * te; di_2_yb = 3.8328525e-1f * td; ei_2_yb = 3.8328525e1f * te; di_2_yy = 3.7952719e-3f * td; ei_2_yy = 3.7952719e-1f * te; d_2_ = fcomplextimes(di_0_, di_1_) + ei_0_; d_2_g = fcomplextimes(di_0_, di_1_g); d_2_b = fcomplextimes(di_0_, di_1_b); d_2_y = fcomplextimes(di_0_y, di_1_) + fcomplextimes(di_0_, di_1_y) + ei_0_y; d_2_gg = fcomplextimes(di_0_, di_1_gg); d_2_gb = fcomplextimes(di_0_, di_1_gb); d_2_gy = fcomplextimes(di_0_y, di_1_g) + fcomplextimes(di_0_, di_1_gy); d_2_bg = fcomplextimes(di_0_, di_1_bg); d_2_bb = fcomplextimes(di_0_, di_1_bb); d_2_by = fcomplextimes(di_0_y, di_1_b) + fcomplextimes(di_0_, di_1_by); d_2_yg = fcomplextimes(di_0_y, di_1_g) + fcomplextimes(di_0_, di_1_yg); d_2_yb = fcomplextimes(di_0_y, di_1_b) + fcomplextimes(di_0_, di_1_yb); d_2_yy = fcomplextimes(di_0_yy, di_1_) + fcomplextimes(di_0_y, di_1_y) + fcomplextimes(di_0_y, di_1_y) + fcomplextimes(di_0_, di_1_yy) + ei_0_yy; e_2_ = fcomplextimes(di_0_, ei_1_); e_2_g = fcomplextimes(di_0_, ei_1_g); e_2_b = fcomplextimes(di_0_, ei_1_b); e_2_y = fcomplextimes(di_0_y, ei_1_) + fcomplextimes(di_0_, ei_1_y); e_2_gg = fcomplextimes(di_0_, ei_1_gg); e_2_gb = fcomplextimes(di_0_, ei_1_gb); e_2_gy = fcomplextimes(di_0_y, ei_1_g) + fcomplextimes(di_0_, ei_1_gy); e_2_bg = fcomplextimes(di_0_, ei_1_bg); e_2_bb = fcomplextimes(di_0_, ei_1_bb); e_2_by = fcomplextimes(di_0_y, ei_1_b) + fcomplextimes(di_0_, ei_1_by); e_2_yg = fcomplextimes(di_0_y, ei_1_g) + fcomplextimes(di_0_, ei_1_yg); e_2_yb = fcomplextimes(di_0_y, ei_1_b) + fcomplextimes(di_0_, ei_1_yb); e_2_yy = fcomplextimes(di_0_yy, ei_1_) + fcomplextimes(di_0_y, ei_1_y) + fcomplextimes(di_0_y, ei_1_y) + fcomplextimes(di_0_, ei_1_yy); e_3_ = fcomplextimes(d_2_, ei_2_); e_3_g = fcomplextimes(d_2_g, ei_2_) + fcomplextimes(d_2_, ei_2_g); e_3_b = fcomplextimes(d_2_b, ei_2_) + fcomplextimes(d_2_, ei_2_b); e_3_y = fcomplextimes(d_2_y, ei_2_) + fcomplextimes(d_2_, ei_2_y); e_3_gg = fcomplextimes(d_2_gg, ei_2_) + fcomplextimes(d_2_g, ei_2_g) + fcomplextimes(d_2_g, ei_2_g) + fcomplextimes(d_2_, ei_2_gg); e_3_gb = fcomplextimes(d_2_gb, ei_2_) + fcomplextimes(d_2_g, ei_2_b) + fcomplextimes(d_2_b, ei_2_g) + fcomplextimes(d_2_, ei_2_gb); e_3_gy = fcomplextimes(d_2_gy, ei_2_) + fcomplextimes(d_2_g, ei_2_y) + fcomplextimes(d_2_y, ei_2_g) + fcomplextimes(d_2_, ei_2_gy); e_3_bg = fcomplextimes(d_2_bg, ei_2_) + fcomplextimes(d_2_b, ei_2_g) + fcomplextimes(d_2_g, ei_2_b) + fcomplextimes(d_2_, ei_2_bg); e_3_bb = fcomplextimes(d_2_bb, ei_2_) + fcomplextimes(d_2_b, ei_2_b) + fcomplextimes(d_2_b, ei_2_b) + fcomplextimes(d_2_, ei_2_bb); e_3_by = fcomplextimes(d_2_by, ei_2_) + fcomplextimes(d_2_b, ei_2_y) + fcomplextimes(d_2_y, ei_2_b) + fcomplextimes(d_2_, ei_2_by); e_3_yg = fcomplextimes(d_2_yg, ei_2_) + fcomplextimes(d_2_y, ei_2_g) + fcomplextimes(d_2_g, ei_2_y) + fcomplextimes(d_2_, ei_2_yg); e_3_yb = fcomplextimes(d_2_yb, ei_2_) + fcomplextimes(d_2_y, ei_2_b) + fcomplextimes(d_2_b, ei_2_y) + fcomplextimes(d_2_, ei_2_yb); e_3_yy = fcomplextimes(d_2_yy, ei_2_) + fcomplextimes(d_2_y, ei_2_y) + fcomplextimes(d_2_y, ei_2_y) + fcomplextimes(d_2_, ei_2_yy); eabs = length(e_3_) + length(e_3_g) / r_g + length(e_3_b) / r_b + length(e_3_y) / r_y + length(e_3_gg) / r_gg + length(e_3_gb) / r_gb + length(e_3_gy) / r_gy + length(e_3_bg) / r_bg + length(e_3_bb) / r_bb + length(e_3_by) / r_by + length(e_3_yg) / r_yg + length(e_3_yb) / r_yb + length(e_3_yy) / r_yy; esummandsup = max(esummandsup, eabs); } dsum += dsummandsup; esum += esummandsup; } dsup = max(dsup, dsum); esup = max(esup, esum); } } void main(void){ fdesuptaylor_a10_w3_d2(); output.data[gl_globalinvocationid.x][gl_globalinvocationid.y][0] = dsup; output.data[gl_globalinvocationid.x][gl_globalinvocationid.y][1] = esup; }
if run above program , use 1 of 2 identical lines in shader marked comment different results value esup (second of printed values). if use first line
d_3_yg = fcomplextimes(d_2_gy, di_2_) + fcomplextimes(d_2_g, di_2_y) + fcomplextimes(d_2_y, di_2_g) + fcomplextimes(d_2_, di_2_gy) + e_2_gy;
i esup = 85930.453 , if use
d_3_yg = d_3_gy;
(note should give same result) esup = 85928.164.
also, if change lines
zg = goffset + gl_globalinvocationid.x; fg = float(zg) / fmg; zb = boffset + gl_globalinvocationid.y; fb = float(zb) / fmb;
to
zg = gl_globalinvocationid.x; fg = float(zg) / fmg; zb = gl_globalinvocationid.y; fb = float(zb) / fmb;
(note goffset , boffset set 0 in c++) yet result (esup = 85929.844 , esup = 85929.742 when use first or second lines respectively).
another strange aspect "full bug" seems occur if constant set 128. other values of such 64, 127, 129, or 256 changing 2 lines not change value of esup removing goffset , boffset still does.
also, value of d_3_yg (which set 2 different lines) should play no role computation of esup dsup. dsup stays constant , esup changes.
i'm using nvidia quadro m2000m , visual studio 2012 compile. ideas problem be?
note program needs 20 seconds run , screen freezes during time. in windows need increase tdrdelay to, say, 60 (seconds)
https://docs.microsoft.com/en-us/windows-hardware/drivers/display/tdr-registry-keys
as windows kills gpu computations take longer 2 seconds default.
your shader lot of float calculus. line operates previous results. precision error accumulates.
also, gpu may calculations in higher precision; each time store result gets truncated. times compiler can optimize it, times can't.
you should read
what every programmer should know floating-point arithmetic
or
what every computer scientist should know floating-point arithmetic
problems increase special, bad conditioned cases. simple geometrical analogous issue determinig point of intersection between 2 almost parallel lines. result can vary lot.
you should try simplify long calculations. using double-precision types (available since opengl 4.0) may help, perhaps not enough.
Comments
Post a Comment