glsl - Bug in OpenGL? Identical float computation leads to different results -


the "minimial example" quite long unfortunately bug disappears if drop seamingly irrelevant code.

here c++ code:

#define _use_math_defines  #include <iostream> #include <iomanip> #include <vector> #include <math.h> #include <time.h> #include <fstream> #include <sstream>  //opengl includes #include <gl/glew.h>                    //opengl extension wrangler #include <gl/freeglut.h>                //free opengl utility toolkit (includes gl.h , glu.h)  void fpressenter(){     std::cout << "press enter exit.";     std::cin.get(); }  void fcompileshaderfromstring(const std::string& source, const gluint shaderid){     const char* code = source.c_str();      glshadersource(shaderid, 1, &code, null);      glcompileshader(shaderid);      int status;     glgetshaderiv(shaderid, gl_compile_status, &status);     if(status == gl_false){         int length = 0;         glgetshaderiv(shaderid, gl_info_log_length, &length);         if(length > 0){             char* log = new char[length];             int written = 0;             glgetshaderinfolog(shaderid, length, &written, log);             std::cout << log << std::endl;             delete[] log;         }     } }  void fcompileshaderfromfile(const char* filename, const gluint shaderid){     std::ifstream infile(filename, std::ios::in);     std::ostringstream code;      while(infile.good()){         int ti = infile.get();         if(!infile.eof()) code << (char) ti;     }     infile.close();      fcompileshaderfromstring(code.str(), shaderid); }  void flinkprogram(const gluint programid){     gllinkprogram(programid);      int status;     glgetprogramiv(programid, gl_link_status, &status);     if(status == gl_false){         int length = 0;         glgetprogramiv(programid, gl_info_log_length, &length);         if(length > 0){             char* log = new char[length];             int written = 0;             glgetprograminfolog(programid, length, &written, log);             std::cout << log << std::endl;             delete[] log;         }     } }  int main(){     //opengl setup      //glut     int argc = 1;     char* argv[1] = {(char*) ""};     glutinit(&argc, argv);      glutinitwindowposition((glutget(glut_screen_width) - 256) / 2, (glutget(glut_screen_height) - 256) / 2);     glutinitwindowsize(256, 256);     glutinitdisplaymode(glut_rgb);      glutcreatewindow("bug");     gluthidewindow();      //glew     glenum glew_ok = glewinit();     if(glew_ok != glew_ok){fprintf(stderr, "glew error: '%s'\n", glewgeterrorstring(glew_ok)); fpressenter(); return(1);}      //main program      //auxiliary variables     gluint shaderid, programid, bufferid;     float* bufferpointer;     float* data = new float[32 * 32 * 2];     float dsup, esup;      //compile shader program , create buffer in graphics card memory     shaderid = glcreateshader(gl_compute_shader);     fcompileshaderfromfile("shader.txt", shaderid);      programid = glcreateprogram();     glattachshader(programid, shaderid);     flinkprogram(programid);     gldetachshader(programid, shaderid);      gldeleteshader(shaderid);      gluseprogram(programid);      glgenbuffers(1, &bufferid);      glbindbufferbase(gl_shader_storage_buffer, 0, bufferid);     glbufferdata(gl_shader_storage_buffer, 32 * 32 * 2 * sizeof(float), null, gl_stream_read);     glbindbuffer(gl_shader_storage_buffer, 0);      //actual computation     std::cout << "starting computation" << std::endl << std::endl << std::flush;      dsup = 0.f;     esup = 0.f;      gluniform1i(0, 0);     gluniform1i(1, 0);      gldispatchcompute(4, 4, 1);      glmemorybarrier(gl_all_barrier_bits);     glfinish();      glbindbufferbase(gl_shader_storage_buffer, 0, bufferid);     bufferpointer = (float*) glmapbuffer(gl_shader_storage_buffer, gl_read_only);     memcpy(data, bufferpointer, 32 * 32 * 2 * sizeof(float));     glunmapbuffer(gl_shader_storage_buffer);     glbindbuffer(gl_shader_storage_buffer, 0);      for(int z2 = 0; z2 < 32; z2++) for(int z3 = 0; z3 < 32; z3++){         if(data[z2 * 32 * 2 + z3 * 2 + 0] > dsup) dsup = data[z2 * 32 * 2 + z3 * 2 + 0];         if(data[z2 * 32 * 2 + z3 * 2 + 1] > esup) esup = data[z2 * 32 * 2 + z3 * 2 + 1];     }      std::cout << std::setprecision(8) << dsup << ", " << std::setprecision(8) << esup << std::endl;      //cleanup     delete[] data;      gldeletebuffers(1, &bufferid);      gldeleteprogram(programid);      fpressenter();     return(0); } 

and here shader:

#version 430 core  const uint mg = 2048; const uint mb = 512; const uint = 128;  layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;  layout(binding = 0) buffer outputblock{     float data[32][32][2]; } output;  layout(location = 0) uniform int goffset; layout(location = 1) uniform int boffset;  vec2 fe(float x){return(vec2(cos(6.2831853e0f * x), sin(6.2831853e0f * x)));} vec2 fcomplextimes(const vec2 a, const vec2 b){return(vec2(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x));} vec2 fcomplextimesi(const vec2 a){return(vec2(-a.y, a.x));} vec2 fcomplextimesminusi(const vec2 a){return(vec2(a.y, -a.x));}  float dsup, esup;  void fdesuptaylor_a10_w3_d2(){     const vec2 1 = vec2(1.f, 0.f);     uint q, zg, zb, zy, n;     float fq, fb, fg, fys, fymb, fmg, fmb, fatmy;     float dsum, esum, dsummandsup, esummandsup, dabs, eabs;     float t;     vec2 td, te;     vec2 di_0_, di_0_y, di_0_yy, di_1_, di_1_g, di_1_b, di_1_y, di_1_gg, di_1_gb, di_1_gy, di_1_bg, di_1_bb, di_1_by, di_1_yg, di_1_yb, di_1_yy, di_2_, di_2_g, di_2_b, di_2_y, di_2_gg, di_2_gb, di_2_gy, di_2_bg, di_2_bb, di_2_by, di_2_yg, di_2_yb, di_2_yy;     vec2 ei_0_, ei_0_y, ei_0_yy, ei_1_, ei_1_g, ei_1_b, ei_1_y, ei_1_gg, ei_1_gb, ei_1_gy, ei_1_bg, ei_1_bb, ei_1_by, ei_1_yg, ei_1_yb, ei_1_yy, ei_2_, ei_2_g, ei_2_b, ei_2_y, ei_2_gg, ei_2_gb, ei_2_gy, ei_2_bg, ei_2_bb, ei_2_by, ei_2_yg, ei_2_yb, ei_2_yy;     vec2 d_2_, d_2_g, d_2_b, d_2_y, d_2_gg, d_2_gb, d_2_gy, d_2_bg, d_2_bb, d_2_by, d_2_yg, d_2_yb, d_2_yy, d_3_, d_3_g, d_3_b, d_3_y, d_3_gg, d_3_gb, d_3_gy, d_3_bg, d_3_bb, d_3_by, d_3_yg, d_3_yb, d_3_yy;     vec2 e_2_, e_2_g, e_2_b, e_2_y, e_2_gg, e_2_gb, e_2_gy, e_2_bg, e_2_bb, e_2_by, e_2_yg, e_2_yb, e_2_yy, e_3_, e_3_g, e_3_b, e_3_y, e_3_gg, e_3_gb, e_3_gy, e_3_bg, e_3_bb, e_3_by, e_3_yg, e_3_yb, e_3_yy;     float r_g, r_b, r_y, r_gg, r_gb, r_gy, r_bg, r_bb, r_by, r_yg, r_yb, r_yy;      fmg = float(mg);     fmb = float(mb);     fatmy = 10.f * float(my);      r_g = 2.f * fmg;     r_b = 2.f * fmb;     r_y = 2.f * fatmy;     r_gg = 8.f * pow(fmg, 2.f);     r_gb = 8.f * fmg * fmb;     r_gy = 8.f * fmg * fatmy;     r_bg = 8.f * fmg * fmb;     r_bb = 8.f * pow(fmb, 2.f);     r_by = 8.f * fmb * fatmy;     r_yg = 8.f * fmg * fatmy;     r_yb = 8.f * fmb * fatmy;     r_yy = 8.f * pow(fatmy, 2.f);      dsup = 0.f;     esup = 0.f;      zg = goffset + gl_globalinvocationid.x;     fg = float(zg) / fmg;      zb = boffset + gl_globalinvocationid.y;     fb = float(zb) / fmb;      for(q = 0; q < 10; q++){         fq = float(q);          dsum = 0.f;         esum = 0.f;          for(n = 0; n <= 1031; n++){             dsummandsup = 0.f;             esummandsup = 0.f;              fys = float(n) / 10.f;              for(zy = 0; zy <= my; zy++){                 fymb = fys + float(zy) / fatmy - fb;                  t = fb + fymb;                  di_0_ = 1 + fe(1.e-1f * fq + t) + fe(2.e-1f * fq + 2.f * t) + fe(3.e-1f * fq + 3.f * t) + fe(4.e-1f * fq + 4.f * t) + fe(5.e-1f * fq + 5.f * t) + fe(6.e-1f * fq + 6.f * t) + fe(7.e-1f * fq + 7.f * t) + fe(8.e-1f * fq + 8.f * t) + fe(9.e-1f * fq + 9.f * t);                 ei_0_ = fe(10.f * t);                  td = fe(1.e-1f * fq + t) + 2.f * fe(2.e-1f * fq + 2.f * t) + 3.f * fe(3.e-1f * fq + 3.f * t) + 4.f * fe(4.e-1f * fq + 4.f * t) + 5.f * fe(5.e-1f * fq + 5.f * t) + 6.f * fe(6.e-1f * fq + 6.f * t) + 7.f * fe(7.e-1f * fq + 7.f * t) + 8.f * fe(8.e-1f * fq + 8.f * t) + 9.f * fe(9.e-1f * fq + 9.f * t); td = fcomplextimesi(td);                 te = fe(10.f * t); te = fcomplextimesi(te);                 di_0_y = 6.2831853e0f * td; ei_0_y = 6.2831853e1f * te;                  td = fe(1.e-1f * fq + t) + 4.f * fe(2.e-1f * fq + 2.f * t) + 9.f * fe(3.e-1f * fq + 3.f * t) + 16.f * fe(4.e-1f * fq + 4.f * t) + 25.f * fe(5.e-1f * fq + 5.f * t) + 36.f * fe(6.e-1f * fq + 6.f * t) + 49.f * fe(7.e-1f * fq + 7.f * t) + 64.f * fe(8.e-1f * fq + 8.f * t) + 81.f * fe(9.e-1f * fq + 9.f * t); td = -td;                 te = fe(10.f * t); te = -te;                 di_0_yy = 3.9478418e1f * td; ei_0_yy = 3.9478418e3f * te;                  t = fg + fb + 9.9019514e-2f * fymb;                  di_1_ = 1 + fe(-fq + t) + fe(-2.f * fq + 2.f * t) + fe(-3.f * fq + 3.f * t) + fe(-4.f * fq + 4.f * t) + fe(-5.f * fq + 5.f * t) + fe(-6.f * fq + 6.f * t) + fe(-7.f * fq + 7.f * t) + fe(-8.f * fq + 8.f * t) + fe(-9.f * fq + 9.f * t);                 ei_1_ = fe(10.f * t);                  td = fe(-fq + t) + 2.f * fe(-2.f * fq + 2.f * t) + 3.f * fe(-3.f * fq + 3.f * t) + 4.f * fe(-4.f * fq + 4.f * t) + 5.f * fe(-5.f * fq + 5.f * t) + 6.f * fe(-6.f * fq + 6.f * t) + 7.f * fe(-7.f * fq + 7.f * t) + 8.f * fe(-8.f * fq + 8.f * t) + 9.f * fe(-9.f * fq + 9.f * t); td = fcomplextimesi(td);                 te = fe(10.f * t); te = fcomplextimesi(te);                 di_1_g = 6.2831853e0f * td; ei_1_g = 6.2831853e1f * te;                 di_1_b = 5.6610274e0f * td; ei_1_b = 5.6610274e1f * te;                 di_1_y = 6.2215795e-1f * td; ei_1_y = 6.2215795e0f * te;                  td = fe(-fq + t) + 4.f * fe(-2.f * fq + 2.f * t) + 9.f * fe(-3.f * fq + 3.f * t) + 16.f * fe(-4.f * fq + 4.f * t) + 25.f * fe(-5.f * fq + 5.f * t) + 36.f * fe(-6.f * fq + 6.f * t) + 49.f * fe(-7.f * fq + 7.f * t) + 64.f * fe(-8.f * fq + 8.f * t) + 81.f * fe(-9.f * fq + 9.f * t); td = -td;                 te = fe(10.f * t); te = -te;                 di_1_gg = 3.9478418e1f * td; ei_1_gg = 3.9478418e3f * te;                 di_1_gb = 3.5569284e1f * td; ei_1_gb = 3.5569284e3f * te;                 di_1_gy = 3.9091337e0f * td; ei_1_gy = 3.9091337e2f * te;                 di_1_bg = 3.5569284e1f * td; ei_1_bg = 3.5569284e3f * te;                 di_1_bb = 3.2047231e1f * td; ei_1_bb = 3.2047231e3f * te;                 di_1_by = 3.5220532e0f * td; ei_1_by = 3.5220532e2f * te;                 di_1_yg = 3.9091337e0f * td; ei_1_yg = 3.9091337e2f * te;                 di_1_yb = 3.5220532e0f * td; ei_1_yb = 3.5220532e2f * te;                 di_1_yy = 3.8708052e-1f * td; ei_1_yy = 3.8708052e1f * te;                  t = -10.f * fg + fb + 9.8048641e-3f * fymb;                  di_2_ = 1 + fe(1.01e1f * fq + t) + fe(2.02e1f * fq + 2.f * t) + fe(3.03e1f * fq + 3.f * t) + fe(4.04e1f * fq + 4.f * t) + fe(5.05e1f * fq + 5.f * t) + fe(6.06e1f * fq + 6.f * t) + fe(7.07e1f * fq + 7.f * t) + fe(8.08e1f * fq + 8.f * t) + fe(9.09e1f * fq + 9.f * t);                 ei_2_ = fe(10.f * t);                  td = fe(1.01e1f * fq + t) + 2.f * fe(2.02e1f * fq + 2.f * t) + 3.f * fe(3.03e1f * fq + 3.f * t) + 4.f * fe(4.04e1f * fq + 4.f * t) + 5.f * fe(5.05e1f * fq + 5.f * t) + 6.f * fe(6.06e1f * fq + 6.f * t) + 7.f * fe(7.07e1f * fq + 7.f * t) + 8.f * fe(8.08e1f * fq + 8.f * t) + 9.f * fe(9.09e1f * fq + 9.f * t); td = fcomplextimesi(td);                 te = fe(10.f * t); te = fcomplextimesi(te);                 di_2_g = -6.2831853e1f * td; ei_2_g = -6.2831853e2f * te;                 di_2_b = 6.2215795e0f * td; ei_2_b = 6.2215795e1f * te;                 di_2_y = 6.1605778e-2f * td; ei_2_y = 6.1605778e-1f * te;                  td = fe(1.01e1f * fq + t) + 4.f * fe(2.02e1f * fq + 2.f * t) + 9.f * fe(3.03e1f * fq + 3.f * t) + 16.f * fe(4.04e1f * fq + 4.f * t) + 25.f * fe(5.05e1f * fq + 5.f * t) + 36.f * fe(6.06e1f * fq + 6.f * t) + 49.f * fe(7.07e1f * fq + 7.f * t) + 64.f * fe(8.08e1f * fq + 8.f * t) + 81.f * fe(9.09e1f * fq + 9.f * t); td = -td;                 te = fe(10.f * t); te = -te;                 di_2_gg = 3.9478418e3f * td; ei_2_gg = 3.9478418e5f * te;                 di_2_gb = -3.9091337e2f * td; ei_2_gb = -3.9091337e4f * te;                 di_2_gy = -3.8708052e0f * td; ei_2_gy = -3.8708052e2f * te;                 di_2_bg = -3.9091337e2f * td; ei_2_bg = -3.9091337e4f * te;                 di_2_bb = 3.8708052e1f * td; ei_2_bb = 3.8708052e3f * te;                 di_2_by = 3.8328525e-1f * td; ei_2_by = 3.8328525e1f * te;                 di_2_yg = -3.8708052e0f * td; ei_2_yg = -3.8708052e2f * te;                 di_2_yb = 3.8328525e-1f * td; ei_2_yb = 3.8328525e1f * te;                 di_2_yy = 3.7952719e-3f * td; ei_2_yy = 3.7952719e-1f * te;                  d_2_ = fcomplextimes(di_0_, di_1_) + ei_0_;                 d_2_g = fcomplextimes(di_0_, di_1_g);                 d_2_b = fcomplextimes(di_0_, di_1_b);                 d_2_y = fcomplextimes(di_0_y, di_1_) + fcomplextimes(di_0_, di_1_y) + ei_0_y;                 d_2_gg = fcomplextimes(di_0_, di_1_gg);                 d_2_gb = fcomplextimes(di_0_, di_1_gb);                 d_2_gy = fcomplextimes(di_0_y, di_1_g) + fcomplextimes(di_0_, di_1_gy);                 d_2_bg = fcomplextimes(di_0_, di_1_bg);                 d_2_bb = fcomplextimes(di_0_, di_1_bb);                 d_2_by = fcomplextimes(di_0_y, di_1_b) + fcomplextimes(di_0_, di_1_by);                 d_2_yg = fcomplextimes(di_0_y, di_1_g) + fcomplextimes(di_0_, di_1_yg);                 d_2_yb = fcomplextimes(di_0_y, di_1_b) + fcomplextimes(di_0_, di_1_yb);                 d_2_yy = fcomplextimes(di_0_yy, di_1_) + fcomplextimes(di_0_y, di_1_y) + fcomplextimes(di_0_y, di_1_y) + fcomplextimes(di_0_, di_1_yy) + ei_0_yy;                  e_2_ = fcomplextimes(di_0_, ei_1_);                 e_2_g = fcomplextimes(di_0_, ei_1_g);                 e_2_b = fcomplextimes(di_0_, ei_1_b);                 e_2_y = fcomplextimes(di_0_y, ei_1_) + fcomplextimes(di_0_, ei_1_y);                 e_2_gg = fcomplextimes(di_0_, ei_1_gg);                 e_2_gb = fcomplextimes(di_0_, ei_1_gb);                 e_2_gy = fcomplextimes(di_0_y, ei_1_g) + fcomplextimes(di_0_, ei_1_gy);                 e_2_bg = fcomplextimes(di_0_, ei_1_bg);                 e_2_bb = fcomplextimes(di_0_, ei_1_bb);                 e_2_by = fcomplextimes(di_0_y, ei_1_b) + fcomplextimes(di_0_, ei_1_by);                 e_2_yg = fcomplextimes(di_0_y, ei_1_g) + fcomplextimes(di_0_, ei_1_yg);                 e_2_yb = fcomplextimes(di_0_y, ei_1_b) + fcomplextimes(di_0_, ei_1_yb);                 e_2_yy = fcomplextimes(di_0_yy, ei_1_) + fcomplextimes(di_0_y, ei_1_y) + fcomplextimes(di_0_y, ei_1_y) + fcomplextimes(di_0_, ei_1_yy);                  d_3_ = fcomplextimes(d_2_, di_2_) + e_2_;                 d_3_g = fcomplextimes(d_2_g, di_2_) + fcomplextimes(d_2_, di_2_g) + e_2_g;                 d_3_b = fcomplextimes(d_2_b, di_2_) + fcomplextimes(d_2_, di_2_b) + e_2_b;                 d_3_y = fcomplextimes(d_2_y, di_2_) + fcomplextimes(d_2_, di_2_y) + e_2_y;                 d_3_gg = fcomplextimes(d_2_gg, di_2_) + fcomplextimes(d_2_g, di_2_g) + fcomplextimes(d_2_g, di_2_g) + fcomplextimes(d_2_, di_2_gg) + e_2_gg;                 d_3_gb = fcomplextimes(d_2_gb, di_2_) + fcomplextimes(d_2_g, di_2_b) + fcomplextimes(d_2_b, di_2_g) + fcomplextimes(d_2_, di_2_gb) + e_2_gb;                 d_3_gy = fcomplextimes(d_2_gy, di_2_) + fcomplextimes(d_2_g, di_2_y) + fcomplextimes(d_2_y, di_2_g) + fcomplextimes(d_2_, di_2_gy) + e_2_gy;                 d_3_bg = fcomplextimes(d_2_bg, di_2_) + fcomplextimes(d_2_b, di_2_g) + fcomplextimes(d_2_g, di_2_b) + fcomplextimes(d_2_, di_2_bg) + e_2_bg;                 d_3_bb = fcomplextimes(d_2_bb, di_2_) + fcomplextimes(d_2_b, di_2_b) + fcomplextimes(d_2_b, di_2_b) + fcomplextimes(d_2_, di_2_bb) + e_2_bb;                 d_3_by = fcomplextimes(d_2_by, di_2_) + fcomplextimes(d_2_b, di_2_y) + fcomplextimes(d_2_y, di_2_b) + fcomplextimes(d_2_, di_2_by) + e_2_by;                 //the bug occurs here: use 1 of 2 following semantically identical lines different results                 d_3_yg = fcomplextimes(d_2_gy, di_2_) + fcomplextimes(d_2_g, di_2_y) + fcomplextimes(d_2_y, di_2_g) + fcomplextimes(d_2_, di_2_gy) + e_2_gy;                 //d_3_yg = d_3_gy;                 d_3_yb = fcomplextimes(d_2_yb, di_2_) + fcomplextimes(d_2_y, di_2_b) + fcomplextimes(d_2_b, di_2_y) + fcomplextimes(d_2_, di_2_yb) + e_2_yb;                 d_3_yy = fcomplextimes(d_2_yy, di_2_) + fcomplextimes(d_2_y, di_2_y) + fcomplextimes(d_2_y, di_2_y) + fcomplextimes(d_2_, di_2_yy) + e_2_yy;                  dabs = length(d_3_) + length(d_3_g) / r_g + length(d_3_b) / r_b + length(d_3_y) / r_y + length(d_3_gg) / r_gg + length(d_3_gb) / r_gb + length(d_3_gy) / r_gy + length(d_3_bg) / r_bg + length(d_3_bb) / r_bb + length(d_3_by) / r_by + length(d_3_yg) / r_yg + length(d_3_yb) / r_yb + length(d_3_yy) / r_yy;                 dsummandsup = max(dsummandsup, dabs);                  e_3_ = fcomplextimes(d_2_, ei_2_);                 e_3_g = fcomplextimes(d_2_g, ei_2_) + fcomplextimes(d_2_, ei_2_g);                 e_3_b = fcomplextimes(d_2_b, ei_2_) + fcomplextimes(d_2_, ei_2_b);                 e_3_y = fcomplextimes(d_2_y, ei_2_) + fcomplextimes(d_2_, ei_2_y);                 e_3_gg = fcomplextimes(d_2_gg, ei_2_) + fcomplextimes(d_2_g, ei_2_g) + fcomplextimes(d_2_g, ei_2_g) + fcomplextimes(d_2_, ei_2_gg);                 e_3_gb = fcomplextimes(d_2_gb, ei_2_) + fcomplextimes(d_2_g, ei_2_b) + fcomplextimes(d_2_b, ei_2_g) + fcomplextimes(d_2_, ei_2_gb);                 e_3_gy = fcomplextimes(d_2_gy, ei_2_) + fcomplextimes(d_2_g, ei_2_y) + fcomplextimes(d_2_y, ei_2_g) + fcomplextimes(d_2_, ei_2_gy);                 e_3_bg = fcomplextimes(d_2_bg, ei_2_) + fcomplextimes(d_2_b, ei_2_g) + fcomplextimes(d_2_g, ei_2_b) + fcomplextimes(d_2_, ei_2_bg);                 e_3_bb = fcomplextimes(d_2_bb, ei_2_) + fcomplextimes(d_2_b, ei_2_b) + fcomplextimes(d_2_b, ei_2_b) + fcomplextimes(d_2_, ei_2_bb);                 e_3_by = fcomplextimes(d_2_by, ei_2_) + fcomplextimes(d_2_b, ei_2_y) + fcomplextimes(d_2_y, ei_2_b) + fcomplextimes(d_2_, ei_2_by);                 e_3_yg = fcomplextimes(d_2_yg, ei_2_) + fcomplextimes(d_2_y, ei_2_g) + fcomplextimes(d_2_g, ei_2_y) + fcomplextimes(d_2_, ei_2_yg);                 e_3_yb = fcomplextimes(d_2_yb, ei_2_) + fcomplextimes(d_2_y, ei_2_b) + fcomplextimes(d_2_b, ei_2_y) + fcomplextimes(d_2_, ei_2_yb);                 e_3_yy = fcomplextimes(d_2_yy, ei_2_) + fcomplextimes(d_2_y, ei_2_y) + fcomplextimes(d_2_y, ei_2_y) + fcomplextimes(d_2_, ei_2_yy);                  eabs = length(e_3_) + length(e_3_g) / r_g + length(e_3_b) / r_b + length(e_3_y) / r_y + length(e_3_gg) / r_gg + length(e_3_gb) / r_gb + length(e_3_gy) / r_gy + length(e_3_bg) / r_bg + length(e_3_bb) / r_bb + length(e_3_by) / r_by + length(e_3_yg) / r_yg + length(e_3_yb) / r_yb + length(e_3_yy) / r_yy;                 esummandsup = max(esummandsup, eabs);             }              dsum += dsummandsup;             esum += esummandsup;         }         for(n = 1032; n <= 10402; n++){             dsummandsup = 0.f;             esummandsup = 0.f;              fys = float(n) / 10.f;              for(zy = 0; zy <= my; zy++){                 fymb = fys + float(zy) / fatmy - fb;                  t = fb + fymb;                  di_0_ = 1 + fe(1.e-1f * fq + t) + fe(2.e-1f * fq + 2.f * t) + fe(3.e-1f * fq + 3.f * t) + fe(4.e-1f * fq + 4.f * t) + fe(5.e-1f * fq + 5.f * t) + fe(6.e-1f * fq + 6.f * t) + fe(7.e-1f * fq + 7.f * t) + fe(8.e-1f * fq + 8.f * t) + fe(9.e-1f * fq + 9.f * t);                 ei_0_ = fe(10.f * t);                  td = fe(1.e-1f * fq + t) + 2.f * fe(2.e-1f * fq + 2.f * t) + 3.f * fe(3.e-1f * fq + 3.f * t) + 4.f * fe(4.e-1f * fq + 4.f * t) + 5.f * fe(5.e-1f * fq + 5.f * t) + 6.f * fe(6.e-1f * fq + 6.f * t) + 7.f * fe(7.e-1f * fq + 7.f * t) + 8.f * fe(8.e-1f * fq + 8.f * t) + 9.f * fe(9.e-1f * fq + 9.f * t); td = fcomplextimesi(td);                 te = fe(10.f * t); te = fcomplextimesi(te);                 di_0_y = 6.2831853e0f * td; ei_0_y = 6.2831853e1f * te;                  td = fe(1.e-1f * fq + t) + 4.f * fe(2.e-1f * fq + 2.f * t) + 9.f * fe(3.e-1f * fq + 3.f * t) + 16.f * fe(4.e-1f * fq + 4.f * t) + 25.f * fe(5.e-1f * fq + 5.f * t) + 36.f * fe(6.e-1f * fq + 6.f * t) + 49.f * fe(7.e-1f * fq + 7.f * t) + 64.f * fe(8.e-1f * fq + 8.f * t) + 81.f * fe(9.e-1f * fq + 9.f * t); td = -td;                 te = fe(10.f * t); te = -te;                 di_0_yy = 3.9478418e1f * td; ei_0_yy = 3.9478418e3f * te;                  t = fg + fb + 9.9019514e-2f * fymb;                  di_1_ = 1 + fe(-fq + t) + fe(-2.f * fq + 2.f * t) + fe(-3.f * fq + 3.f * t) + fe(-4.f * fq + 4.f * t) + fe(-5.f * fq + 5.f * t) + fe(-6.f * fq + 6.f * t) + fe(-7.f * fq + 7.f * t) + fe(-8.f * fq + 8.f * t) + fe(-9.f * fq + 9.f * t);                 ei_1_ = fe(10.f * t);                  td = fe(-fq + t) + 2.f * fe(-2.f * fq + 2.f * t) + 3.f * fe(-3.f * fq + 3.f * t) + 4.f * fe(-4.f * fq + 4.f * t) + 5.f * fe(-5.f * fq + 5.f * t) + 6.f * fe(-6.f * fq + 6.f * t) + 7.f * fe(-7.f * fq + 7.f * t) + 8.f * fe(-8.f * fq + 8.f * t) + 9.f * fe(-9.f * fq + 9.f * t); td = fcomplextimesi(td);                 te = fe(10.f * t); te = fcomplextimesi(te);                 di_1_g = 6.2831853e0f * td; ei_1_g = 6.2831853e1f * te;                 di_1_b = 5.6610274e0f * td; ei_1_b = 5.6610274e1f * te;                 di_1_y = 6.2215795e-1f * td; ei_1_y = 6.2215795e0f * te;                  td = fe(-fq + t) + 4.f * fe(-2.f * fq + 2.f * t) + 9.f * fe(-3.f * fq + 3.f * t) + 16.f * fe(-4.f * fq + 4.f * t) + 25.f * fe(-5.f * fq + 5.f * t) + 36.f * fe(-6.f * fq + 6.f * t) + 49.f * fe(-7.f * fq + 7.f * t) + 64.f * fe(-8.f * fq + 8.f * t) + 81.f * fe(-9.f * fq + 9.f * t); td = -td;                 te = fe(10.f * t); te = -te;                 di_1_gg = 3.9478418e1f * td; ei_1_gg = 3.9478418e3f * te;                 di_1_gb = 3.5569284e1f * td; ei_1_gb = 3.5569284e3f * te;                 di_1_gy = 3.9091337e0f * td; ei_1_gy = 3.9091337e2f * te;                 di_1_bg = 3.5569284e1f * td; ei_1_bg = 3.5569284e3f * te;                 di_1_bb = 3.2047231e1f * td; ei_1_bb = 3.2047231e3f * te;                 di_1_by = 3.5220532e0f * td; ei_1_by = 3.5220532e2f * te;                 di_1_yg = 3.9091337e0f * td; ei_1_yg = 3.9091337e2f * te;                 di_1_yb = 3.5220532e0f * td; ei_1_yb = 3.5220532e2f * te;                 di_1_yy = 3.8708052e-1f * td; ei_1_yy = 3.8708052e1f * te;                  t = -10.f * fg + fb + 9.8048641e-3f * fymb;                  di_2_ = 1 + fe(1.01e1f * fq + t) + fe(2.02e1f * fq + 2.f * t) + fe(3.03e1f * fq + 3.f * t) + fe(4.04e1f * fq + 4.f * t) + fe(5.05e1f * fq + 5.f * t) + fe(6.06e1f * fq + 6.f * t) + fe(7.07e1f * fq + 7.f * t) + fe(8.08e1f * fq + 8.f * t) + fe(9.09e1f * fq + 9.f * t);                 ei_2_ = fe(10.f * t);                  td = fe(1.01e1f * fq + t) + 2.f * fe(2.02e1f * fq + 2.f * t) + 3.f * fe(3.03e1f * fq + 3.f * t) + 4.f * fe(4.04e1f * fq + 4.f * t) + 5.f * fe(5.05e1f * fq + 5.f * t) + 6.f * fe(6.06e1f * fq + 6.f * t) + 7.f * fe(7.07e1f * fq + 7.f * t) + 8.f * fe(8.08e1f * fq + 8.f * t) + 9.f * fe(9.09e1f * fq + 9.f * t); td = fcomplextimesi(td);                 te = fe(10.f * t); te = fcomplextimesi(te);                 di_2_g = -6.2831853e1f * td; ei_2_g = -6.2831853e2f * te;                 di_2_b = 6.2215795e0f * td; ei_2_b = 6.2215795e1f * te;                 di_2_y = 6.1605778e-2f * td; ei_2_y = 6.1605778e-1f * te;                  td = fe(1.01e1f * fq + t) + 4.f * fe(2.02e1f * fq + 2.f * t) + 9.f * fe(3.03e1f * fq + 3.f * t) + 16.f * fe(4.04e1f * fq + 4.f * t) + 25.f * fe(5.05e1f * fq + 5.f * t) + 36.f * fe(6.06e1f * fq + 6.f * t) + 49.f * fe(7.07e1f * fq + 7.f * t) + 64.f * fe(8.08e1f * fq + 8.f * t) + 81.f * fe(9.09e1f * fq + 9.f * t); td = -td;                 te = fe(10.f * t); te = -te;                 di_2_gg = 3.9478418e3f * td; ei_2_gg = 3.9478418e5f * te;                 di_2_gb = -3.9091337e2f * td; ei_2_gb = -3.9091337e4f * te;                 di_2_gy = -3.8708052e0f * td; ei_2_gy = -3.8708052e2f * te;                 di_2_bg = -3.9091337e2f * td; ei_2_bg = -3.9091337e4f * te;                 di_2_bb = 3.8708052e1f * td; ei_2_bb = 3.8708052e3f * te;                 di_2_by = 3.8328525e-1f * td; ei_2_by = 3.8328525e1f * te;                 di_2_yg = -3.8708052e0f * td; ei_2_yg = -3.8708052e2f * te;                 di_2_yb = 3.8328525e-1f * td; ei_2_yb = 3.8328525e1f * te;                 di_2_yy = 3.7952719e-3f * td; ei_2_yy = 3.7952719e-1f * te;                  d_2_ = fcomplextimes(di_0_, di_1_) + ei_0_;                 d_2_g = fcomplextimes(di_0_, di_1_g);                 d_2_b = fcomplextimes(di_0_, di_1_b);                 d_2_y = fcomplextimes(di_0_y, di_1_) + fcomplextimes(di_0_, di_1_y) + ei_0_y;                 d_2_gg = fcomplextimes(di_0_, di_1_gg);                 d_2_gb = fcomplextimes(di_0_, di_1_gb);                 d_2_gy = fcomplextimes(di_0_y, di_1_g) + fcomplextimes(di_0_, di_1_gy);                 d_2_bg = fcomplextimes(di_0_, di_1_bg);                 d_2_bb = fcomplextimes(di_0_, di_1_bb);                 d_2_by = fcomplextimes(di_0_y, di_1_b) + fcomplextimes(di_0_, di_1_by);                 d_2_yg = fcomplextimes(di_0_y, di_1_g) + fcomplextimes(di_0_, di_1_yg);                 d_2_yb = fcomplextimes(di_0_y, di_1_b) + fcomplextimes(di_0_, di_1_yb);                 d_2_yy = fcomplextimes(di_0_yy, di_1_) + fcomplextimes(di_0_y, di_1_y) + fcomplextimes(di_0_y, di_1_y) + fcomplextimes(di_0_, di_1_yy) + ei_0_yy;                  e_2_ = fcomplextimes(di_0_, ei_1_);                 e_2_g = fcomplextimes(di_0_, ei_1_g);                 e_2_b = fcomplextimes(di_0_, ei_1_b);                 e_2_y = fcomplextimes(di_0_y, ei_1_) + fcomplextimes(di_0_, ei_1_y);                 e_2_gg = fcomplextimes(di_0_, ei_1_gg);                 e_2_gb = fcomplextimes(di_0_, ei_1_gb);                 e_2_gy = fcomplextimes(di_0_y, ei_1_g) + fcomplextimes(di_0_, ei_1_gy);                 e_2_bg = fcomplextimes(di_0_, ei_1_bg);                 e_2_bb = fcomplextimes(di_0_, ei_1_bb);                 e_2_by = fcomplextimes(di_0_y, ei_1_b) + fcomplextimes(di_0_, ei_1_by);                 e_2_yg = fcomplextimes(di_0_y, ei_1_g) + fcomplextimes(di_0_, ei_1_yg);                 e_2_yb = fcomplextimes(di_0_y, ei_1_b) + fcomplextimes(di_0_, ei_1_yb);                 e_2_yy = fcomplextimes(di_0_yy, ei_1_) + fcomplextimes(di_0_y, ei_1_y) + fcomplextimes(di_0_y, ei_1_y) + fcomplextimes(di_0_, ei_1_yy);                  e_3_ = fcomplextimes(d_2_, ei_2_);                 e_3_g = fcomplextimes(d_2_g, ei_2_) + fcomplextimes(d_2_, ei_2_g);                 e_3_b = fcomplextimes(d_2_b, ei_2_) + fcomplextimes(d_2_, ei_2_b);                 e_3_y = fcomplextimes(d_2_y, ei_2_) + fcomplextimes(d_2_, ei_2_y);                 e_3_gg = fcomplextimes(d_2_gg, ei_2_) + fcomplextimes(d_2_g, ei_2_g) + fcomplextimes(d_2_g, ei_2_g) + fcomplextimes(d_2_, ei_2_gg);                 e_3_gb = fcomplextimes(d_2_gb, ei_2_) + fcomplextimes(d_2_g, ei_2_b) + fcomplextimes(d_2_b, ei_2_g) + fcomplextimes(d_2_, ei_2_gb);                 e_3_gy = fcomplextimes(d_2_gy, ei_2_) + fcomplextimes(d_2_g, ei_2_y) + fcomplextimes(d_2_y, ei_2_g) + fcomplextimes(d_2_, ei_2_gy);                 e_3_bg = fcomplextimes(d_2_bg, ei_2_) + fcomplextimes(d_2_b, ei_2_g) + fcomplextimes(d_2_g, ei_2_b) + fcomplextimes(d_2_, ei_2_bg);                 e_3_bb = fcomplextimes(d_2_bb, ei_2_) + fcomplextimes(d_2_b, ei_2_b) + fcomplextimes(d_2_b, ei_2_b) + fcomplextimes(d_2_, ei_2_bb);                 e_3_by = fcomplextimes(d_2_by, ei_2_) + fcomplextimes(d_2_b, ei_2_y) + fcomplextimes(d_2_y, ei_2_b) + fcomplextimes(d_2_, ei_2_by);                 e_3_yg = fcomplextimes(d_2_yg, ei_2_) + fcomplextimes(d_2_y, ei_2_g) + fcomplextimes(d_2_g, ei_2_y) + fcomplextimes(d_2_, ei_2_yg);                 e_3_yb = fcomplextimes(d_2_yb, ei_2_) + fcomplextimes(d_2_y, ei_2_b) + fcomplextimes(d_2_b, ei_2_y) + fcomplextimes(d_2_, ei_2_yb);                 e_3_yy = fcomplextimes(d_2_yy, ei_2_) + fcomplextimes(d_2_y, ei_2_y) + fcomplextimes(d_2_y, ei_2_y) + fcomplextimes(d_2_, ei_2_yy);                  eabs = length(e_3_) + length(e_3_g) / r_g + length(e_3_b) / r_b + length(e_3_y) / r_y + length(e_3_gg) / r_gg + length(e_3_gb) / r_gb + length(e_3_gy) / r_gy + length(e_3_bg) / r_bg + length(e_3_bb) / r_bb + length(e_3_by) / r_by + length(e_3_yg) / r_yg + length(e_3_yb) / r_yb + length(e_3_yy) / r_yy;                 esummandsup = max(esummandsup, eabs);             }              dsum += dsummandsup;             esum += esummandsup;         }          dsup = max(dsup, dsum);         esup = max(esup, esum);     } }  void main(void){     fdesuptaylor_a10_w3_d2();      output.data[gl_globalinvocationid.x][gl_globalinvocationid.y][0] = dsup;     output.data[gl_globalinvocationid.x][gl_globalinvocationid.y][1] = esup; } 

if run above program , use 1 of 2 identical lines in shader marked comment different results value esup (second of printed values). if use first line

d_3_yg = fcomplextimes(d_2_gy, di_2_) + fcomplextimes(d_2_g, di_2_y) + fcomplextimes(d_2_y, di_2_g) + fcomplextimes(d_2_, di_2_gy) + e_2_gy; 

i esup = 85930.453 , if use

d_3_yg = d_3_gy; 

(note should give same result) esup = 85928.164.

also, if change lines

zg = goffset + gl_globalinvocationid.x; fg = float(zg) / fmg;  zb = boffset + gl_globalinvocationid.y; fb = float(zb) / fmb; 

to

zg = gl_globalinvocationid.x; fg = float(zg) / fmg;  zb = gl_globalinvocationid.y; fb = float(zb) / fmb; 

(note goffset , boffset set 0 in c++) yet result (esup = 85929.844 , esup = 85929.742 when use first or second lines respectively).

another strange aspect "full bug" seems occur if constant set 128. other values of such 64, 127, 129, or 256 changing 2 lines not change value of esup removing goffset , boffset still does.

also, value of d_3_yg (which set 2 different lines) should play no role computation of esup dsup. dsup stays constant , esup changes.

i'm using nvidia quadro m2000m , visual studio 2012 compile. ideas problem be?

note program needs 20 seconds run , screen freezes during time. in windows need increase tdrdelay to, say, 60 (seconds)

https://docs.microsoft.com/en-us/windows-hardware/drivers/display/tdr-registry-keys

as windows kills gpu computations take longer 2 seconds default.

your shader lot of float calculus. line operates previous results. precision error accumulates.

also, gpu may calculations in higher precision; each time store result gets truncated. times compiler can optimize it, times can't.

you should read
what every programmer should know floating-point arithmetic
or
what every computer scientist should know floating-point arithmetic

problems increase special, bad conditioned cases. simple geometrical analogous issue determinig point of intersection between 2 almost parallel lines. result can vary lot.

you should try simplify long calculations. using double-precision types (available since opengl 4.0) may help, perhaps not enough.


Comments

Popular posts from this blog

ubuntu - PHP script to find files of certain extensions in a directory, returns populated array when run in browser, but empty array when run from terminal -

php - How can i create a user dashboard -

javascript - How to detect toggling of the fullscreen-toolbar in jQuery Mobile? -