void GLReplay::InitPostVSBuffers()

in renderdoc/driver/gl/gl_postvs.cpp [216:1954]


void GLReplay::InitPostVSBuffers(uint32_t eventId)
{
  if(m_PostVSData.find(eventId) != m_PostVSData.end())
    return;

  GLPostVSData &ret = m_PostVSData[eventId];

  if(m_pDriver->IsUnsafeDraw(eventId))
  {
    ret.gsout.status = ret.vsout.status = "Errors detected with drawcall";
    return;
  }

  MakeCurrentReplayContext(&m_ReplayCtx);

  GLMarkerRegion postvs(StringFormat::Fmt("PostVS for %u", eventId));

  WrappedOpenGL &drv = *m_pDriver;
  if(drv.m_ActiveFeedback)
  {
    drv.glEndTransformFeedback();
    drv.m_WasActiveFeedback = true;
  }

  GLResourceManager *rm = m_pDriver->GetResourceManager();

  GLRenderState rs;
  rs.FetchState(&drv);
  GLuint elArrayBuffer = 0;
  if(rs.VAO.name)
    drv.glGetIntegerv(eGL_ELEMENT_ARRAY_BUFFER_BINDING, (GLint *)&elArrayBuffer);

  if(HasExt[ARB_query_buffer_object])
    drv.glBindBuffer(eGL_QUERY_BUFFER, 0);

  // reflection structures
  ShaderReflection *vsRefl = NULL;
  ShaderReflection *tesRefl = NULL;
  ShaderReflection *gsRefl = NULL;
  SPIRVPatchData vsPatch, tesPatch, gsPatch;

  // the program we'll be binding, that we attach shaders to
  GLuint feedbackProg = drv.glCreateProgram();

  // one shader per stage (vs = 0, etc)
  GLuint stageShaders[4] = {};

  // temporary programs created as needed if the original program was created with
  // glCreateShaderProgramv and we don't have a shader to attach
  GLuint tmpShaders[4] = {};

  // the ID if we need to recompile into tmpShaders. This can also be required if gl_DrawID is used
  // since we can't get that faithfully.
  ResourceId recompile[4] = {};

  // these are the 'real' programs with uniform values that we need to copy over to our separable
  // programs. They may be duplicated if there's one program bound to multiple stages.
  // one program per stage (vs = 0, etc)
  GLuint stageSrcPrograms[4] = {};

  const ActionDescription *action = m_pDriver->GetAction(eventId);
  const GLDrawParams &drawParams = m_pDriver->GetDrawParameters(eventId);

  if(action->numIndices == 0)
  {
    ret.gsout.status = ret.vsout.status = "Empty drawcall (0 indices/vertices)";
    return;
  }

  if((action->flags & ActionFlags::Instanced) && action->numInstances == 0)
  {
    ret.gsout.status = ret.vsout.status = "Empty drawcall (0 instances)";
    return;
  }

  uint32_t glslVer = 0;
  FixedFunctionVertexOutputs outputUsage = {};

  if(rs.Program.name == 0)
  {
    if(rs.Pipeline.name == 0)
    {
      ret.gsout.status = ret.vsout.status = "No program or pipeline bound at draw";
      RDCERR("%s", ret.vsout.status.c_str());
      return;
    }
    else
    {
      ResourceId id = rm->GetResID(rs.Pipeline);
      auto &pipeDetails = m_pDriver->m_Pipelines[id];

      for(int i = 0; i < 4; i++)
      {
        if(pipeDetails.stageShaders[i] != ResourceId())
        {
          ShaderReflection *refl = NULL;
          if(i == 0)
          {
            refl = vsRefl = GetShader(ResourceId(), pipeDetails.stageShaders[i], ShaderEntryPoint());
            glslVer = m_pDriver->m_Shaders[pipeDetails.stageShaders[0]].version;
            vsPatch = m_pDriver->m_Shaders[pipeDetails.stageShaders[0]].patchData;

            CheckVertexOutputUses(m_pDriver->m_Shaders[pipeDetails.stageShaders[0]].sources,
                                  outputUsage);
          }
          else if(i == 2)
          {
            refl = tesRefl = GetShader(ResourceId(), pipeDetails.stageShaders[2], ShaderEntryPoint());
            tesPatch = m_pDriver->m_Shaders[pipeDetails.stageShaders[2]].patchData;
          }
          else if(i == 3)
          {
            refl = gsRefl = GetShader(ResourceId(), pipeDetails.stageShaders[3], ShaderEntryPoint());
            gsPatch = m_pDriver->m_Shaders[pipeDetails.stageShaders[3]].patchData;
          }

          stageShaders[i] = rm->GetCurrentResource(pipeDetails.stageShaders[i]).name;
          stageSrcPrograms[i] = rm->GetCurrentResource(pipeDetails.stagePrograms[i]).name;

          if(stageShaders[i] == stageSrcPrograms[i])
          {
            const WrappedOpenGL::ProgramData &progDetails =
                m_pDriver->m_Programs[pipeDetails.stagePrograms[i]];

            if(progDetails.shaderProgramUnlinkable)
            {
              recompile[i] = pipeDetails.stageShaders[i];
            }
          }

          if(refl)
          {
            for(const SigParameter &sig : refl->inputSignature)
            {
              if(sig.systemValue == ShaderBuiltin::DrawIndex)
              {
                recompile[i] = pipeDetails.stageShaders[i];
                break;
              }
            }
          }
        }
      }
    }
  }
  else
  {
    auto &progDetails = m_pDriver->m_Programs[rm->GetResID(rs.Program)];

    for(int i = 0; i < 4; i++)
    {
      if(progDetails.stageShaders[0] != ResourceId())
      {
        ShaderReflection *refl = NULL;
        if(i == 0)
        {
          refl = vsRefl = GetShader(ResourceId(), progDetails.stageShaders[0], ShaderEntryPoint());
          glslVer = m_pDriver->m_Shaders[progDetails.stageShaders[0]].version;
          vsPatch = m_pDriver->m_Shaders[progDetails.stageShaders[0]].patchData;

          CheckVertexOutputUses(m_pDriver->m_Shaders[progDetails.stageShaders[0]].sources,
                                outputUsage);
        }
        else if(i == 2 && progDetails.stageShaders[2] != ResourceId())
        {
          refl = tesRefl = GetShader(ResourceId(), progDetails.stageShaders[2], ShaderEntryPoint());
          tesPatch = m_pDriver->m_Shaders[progDetails.stageShaders[2]].patchData;
        }
        else if(i == 3 && progDetails.stageShaders[3] != ResourceId())
        {
          refl = gsRefl = GetShader(ResourceId(), progDetails.stageShaders[3], ShaderEntryPoint());
          gsPatch = m_pDriver->m_Shaders[progDetails.stageShaders[3]].patchData;
        }

        stageShaders[i] = rm->GetCurrentResource(progDetails.stageShaders[i]).name;

        if(refl)
        {
          for(const SigParameter &sig : refl->inputSignature)
          {
            if(sig.systemValue == ShaderBuiltin::DrawIndex)
            {
              recompile[i] = progDetails.stageShaders[i];
              break;
            }
          }
        }
      }

      stageSrcPrograms[i] = rs.Program.name;
    }
  }

  for(int i = 0; i < 4; i++)
  {
    if(recompile[i] != ResourceId())
    {
      const WrappedOpenGL::ShaderData &shadDetails = m_pDriver->m_Shaders[recompile[i]];

      stageShaders[i] = tmpShaders[i] = RecompileShader(drv, shadDetails, action->drawIndex);
    }
  }

  if(vsRefl == NULL || stageShaders[0] == 0)
  {
    ret.gsout.status = ret.vsout.status = "No vertex shader bound";

    // delete any temporaries
    for(size_t i = 0; i < 4; i++)
      if(tmpShaders[i])
        drv.glDeleteShader(tmpShaders[i]);

    return;
  }

  if(tesRefl || gsRefl)
  {
    // put a general error in here in case anything goes wrong fetching VS outputs
    ret.gsout.status =
        "No geometry/tessellation output fetched due to error processing vertex stage.";
  }
  else
  {
    ret.gsout.status = "No geometry and no tessellation shader bound.";
  }

  // GLES requires a fragment shader even with rasterizer discard, so we'll attach this
  GLuint dummyFrag = 0;

  if(IsGLES)
  {
    dummyFrag = drv.glCreateShader(eGL_FRAGMENT_SHADER);

    if(glslVer == 0)
      glslVer = 100;

    rdcstr src =
        StringFormat::Fmt("#version %d %s\nvoid main() {}\n", glslVer, glslVer == 100 ? "" : "es");

    const char *csrc = src.c_str();

    drv.glShaderSource(dummyFrag, 1, &csrc, NULL);
    drv.glCompileShader(dummyFrag);

    GLint status = 0;
    drv.glGetShaderiv(dummyFrag, eGL_COMPILE_STATUS, &status);

    if(status == 0)
    {
      drv.glDeleteShader(dummyFrag);
      dummyFrag = 0;

      if(HasExt[ARB_separate_shader_objects])
      {
        RDCERR(
            "Couldn't create dummy fragment shader for GLES, trying to set program to be "
            "separable");
        drv.glProgramParameteri(feedbackProg, eGL_PROGRAM_SEPARABLE, GL_TRUE);
      }
      else
      {
        RDCERR(
            "Couldn't create dummy fragment shader for GLES, separable programs not available. "
            "Vertex output data will likely be broken");
      }
    }
  }

  uint32_t stride = 0;
  GLuint vsOrigShader = 0;

  bool hasPosition = false;

  for(const SigParameter &sig : vsRefl->outputSignature)
  {
    if(sig.systemValue == ShaderBuiltin::Position)
    {
      hasPosition = true;
      break;
    }
  }

  if(vsRefl->encoding == ShaderEncoding::OpenGLSPIRV)
  {
    // SPIR-V path
    vsOrigShader = stageShaders[0];

    stageShaders[0] = tmpShaders[0] = drv.glCreateShader(eGL_VERTEX_SHADER);

    rdcarray<uint32_t> spirv;
    spirv.resize(vsRefl->rawBytes.size() / sizeof(uint32_t));
    memcpy(spirv.data(), vsRefl->rawBytes.data(), vsRefl->rawBytes.size());

    AddXFBAnnotations(*vsRefl, vsPatch, 0, vsRefl->entryPoint.c_str(), spirv, stride);

    drv.glShaderBinary(1, &stageShaders[0], eGL_SHADER_BINARY_FORMAT_SPIR_V, spirv.data(),
                       (GLsizei)spirv.size() * 4);

    drv.glSpecializeShader(stageShaders[0], vsRefl->entryPoint.c_str(), 0, NULL, NULL);

    char buffer[1024] = {};
    GLint status = 0;
    GL.glGetShaderiv(stageShaders[0], eGL_COMPILE_STATUS, &status);
    if(status == 0)
    {
      GL.glGetShaderInfoLog(stageShaders[0], 1024, NULL, buffer);
      RDCERR("SPIR-V post-vs patched shader compile error: %s", buffer);
      ret.vsout.status = "Failed to patch SPIR-V vertex shader to use transform feedback.";
      return;
    }
    // attach the vertex shader
    drv.glAttachShader(feedbackProg, stageShaders[0]);

    // attach the dummy fragment shader, if it exists
    if(dummyFrag)
      drv.glAttachShader(feedbackProg, dummyFrag);

    drv.glLinkProgram(feedbackProg);

    drv.glGetProgramiv(feedbackProg, eGL_LINK_STATUS, &status);

    if(status == 0)
    {
      drv.glGetProgramInfoLog(feedbackProg, 1024, NULL, buffer);
      RDCERR("SPIR-V post-vs patched program link error: %s", buffer);
      ret.vsout.status = "Failed to patch SPIR-V vertex shader to use transform feedback.";
      return;
    }
  }
  else
  {
    // non-SPIRV path

    // attach the vertex shader
    drv.glAttachShader(feedbackProg, stageShaders[0]);

    // attach the dummy fragment shader, if it exists
    if(dummyFrag)
      drv.glAttachShader(feedbackProg, dummyFrag);

    CopyProgramAttribBindings(stageSrcPrograms[0], feedbackProg, vsRefl);

    rdcarray<const char *> varyings;
    MakeVaryingsFromShaderReflection(*vsRefl, varyings, stride);

    // this is REALLY ugly, but I've seen problems with varying specification, so we try and
    // do some fixup by removing prefixes from the results we got from PROGRAM_OUTPUT.
    //
    // the problem I've seen is:
    //
    // struct vertex
    // {
    //   vec4 Color;
    // };
    //
    // layout(location = 0) out vertex Out;
    //
    // (from g_truc gl-410-primitive-tessellation-2). On AMD the varyings are what you might expect
    // (from the PROGRAM_OUTPUT interface names reflected out): "Out.Color", "gl_Position"
    // however nvidia complains unless you use "Color", "gl_Position". This holds even if you add
    // other variables to the vertex struct.
    //
    // strangely another sample that in-lines the output block like so:
    //
    // out block
    // {
    //   vec2 Texcoord;
    // } Out;
    //
    // uses "block.Texcoord" (reflected name from PROGRAM_OUTPUT and accepted by varyings string on
    // both vendors). This is inconsistent as it's type.member not structname.member as above.
    //
    // The spec is very vague on exactly what these names should be, so I can't say which is correct
    // out of these three possibilities.
    //
    // So our 'fix' is to loop while we have problems linking with the varyings (since we know
    // otherwise linking should succeed, as we only get here with a successfully linked separable
    // program - if it fails to link, it's assigned 0 earlier) and remove any prefixes from
    // variables seen in the link error string.
    // The error string is something like:
    // "error: Varying (named Out.Color) specified but not present in the program object."
    //
    // Yeh. Ugly. Not guaranteed to work at all, but hopefully the common case will just be a single
    // block without any nesting so this might work.
    // At least we don't have to reallocate strings all over, since the memory is
    // already owned elsewhere, we just need to modify pointers to trim prefixes. Bright side?

    GLint status = 0;
    bool finished = false;
    for(;;)
    {
      // don't print debug messages from these links - we know some might fail but as long as we
      // eventually get one to work that's fine.
      drv.SuppressDebugMessages(true);

      // specify current varyings & relink
      drv.glTransformFeedbackVaryings(feedbackProg, (GLsizei)varyings.size(), &varyings[0],
                                      eGL_INTERLEAVED_ATTRIBS);
      drv.glLinkProgram(feedbackProg);

      drv.SuppressDebugMessages(false);

      drv.glGetProgramiv(feedbackProg, eGL_LINK_STATUS, &status);

      // all good! Hopefully we'll mostly hit this
      if(status == 1)
        break;

      RDCWARN("Failed to link postvs program with varyings");

      // if finished is true, this was our last attempt - there are no
      // more fixups possible
      if(finished)
      {
        RDCWARN("No fixups possible");
        break;
      }

      RDCLOG("Attempting fixup...");

      char buffer[1025] = {0};
      drv.glGetProgramInfoLog(feedbackProg, 1024, NULL, buffer);

      // assume we're finished and can't retry any more after this.
      // if we find a potential 'fixup' we'll set this back to false
      finished = true;

      // see if any of our current varyings are present in the buffer string
      for(size_t i = 0; i < varyings.size(); i++)
      {
        if(strstr(buffer, varyings[i]))
        {
          const char *prefix_removed = strchr(varyings[i], '.');

          // does it contain a prefix?
          if(prefix_removed)
          {
            prefix_removed++;    // now this is our string without the prefix

            // first check this won't cause a duplicate - if it does, we have to try something else
            bool duplicate = false;
            for(size_t j = 0; j < varyings.size(); j++)
            {
              if(!strcmp(varyings[j], prefix_removed))
              {
                duplicate = true;
                break;
              }
            }

            if(!duplicate)
            {
              // we'll attempt this fixup
              RDCWARN("Attempting XFB varying fixup, subst '%s' for '%s'", varyings[i],
                      prefix_removed);
              varyings[i] = prefix_removed;
              finished = false;

              // don't try more than one at once (just in case)
              break;
            }
          }
        }
      }
    }

    if(status == 0)
    {
      // if we STILL can't link then something is really messy. Some drivers like AMD reflect out
      // unused variables when reflecting a separable program, then complain when they are passed in
      // as varyings. We remove all the varyings, link the program, then reflect it as-is and try to
      // use the output signature from that as the varyings.
      RDCWARN("Failed to generate XFB varyings from normal reflection - making one final attempt.");
      RDCWARN(
          "This is often caused by sensitive drivers and output variables declared but never "
          "written to.");

      drv.SuppressDebugMessages(true);

      drv.glTransformFeedbackVaryings(feedbackProg, 0, NULL, eGL_INTERLEAVED_ATTRIBS);
      drv.glLinkProgram(feedbackProg);

      drv.glGetProgramiv(feedbackProg, eGL_LINK_STATUS, &status);

      if(status == 1)
      {
        ShaderReflection tempRefl;
        MakeShaderReflection(eGL_VERTEX_SHADER, feedbackProg, tempRefl, outputUsage);

        // remake the varyings with tempRefl to 'trim' the output signature
        MakeVaryingsFromShaderReflection(*vsRefl, varyings, stride, &tempRefl);

        drv.glTransformFeedbackVaryings(feedbackProg, (GLsizei)varyings.size(), &varyings[0],
                                        eGL_INTERLEAVED_ATTRIBS);
        drv.glLinkProgram(feedbackProg);

        drv.glGetProgramiv(feedbackProg, eGL_LINK_STATUS, &status);
      }
      else
      {
        RDCWARN("Can't link program with no varyings!");
      }

      drv.SuppressDebugMessages(false);
    }

    if(status == 0)
    {
      char buffer[1025] = {0};
      drv.glGetProgramInfoLog(feedbackProg, 1024, NULL, buffer);
      RDCERR("Failed to fix-up. Link error making xfb vs program: %s", buffer);

      // delete any temporaries
      for(size_t i = 0; i < 4; i++)
        if(tmpShaders[i])
          drv.glDeleteShader(tmpShaders[i]);

      drv.glDeleteShader(dummyFrag);

      drv.glDeleteProgram(feedbackProg);

      ret.vsout.status = "Failed to relink program to use transform feedback.";
      return;
    }
  }

  // here the SPIR-V and GLSL paths recombine.

  // copy across any uniform values, bindings etc from the real program containing
  // the vertex stage
  {
    PerStageReflections stages;
    m_pDriver->FillReflectionArray(ProgramRes(drv.GetCtx(), stageSrcPrograms[0]), stages);

    PerStageReflections dstStages;
    m_pDriver->FillReflectionArray(ProgramRes(drv.GetCtx(), feedbackProg), dstStages);

    CopyProgramUniforms(stages, stageSrcPrograms[0], dstStages, feedbackProg);
  }

  // we don't want to do any work, so just discard before rasterizing
  drv.glEnable(eGL_RASTERIZER_DISCARD);

  // bind our program and do the feedback draw
  drv.glUseProgram(feedbackProg);
  drv.glBindProgramPipeline(0);

  if(HasExt[ARB_transform_feedback2])
    drv.glBindTransformFeedback(eGL_TRANSFORM_FEEDBACK, DebugData.feedbackObj);

  bool flipY = false;

  if(HasExt[ARB_clip_control])
  {
    GLenum clipOrigin = eGL_LOWER_LEFT;
    GL.glGetIntegerv(eGL_CLIP_ORIGIN, (GLint *)&clipOrigin);

    if(clipOrigin == eGL_UPPER_LEFT)
      flipY = true;
  }

  GLuint idxBuf = 0;

  if(vsRefl->outputSignature.empty())
  {
    // nothing to do, store an empty cache
  }
  else
  {
    if(!(action->flags & ActionFlags::Indexed))
    {
      uint64_t outputSize = uint64_t(action->numIndices) * stride;

      if(action->flags & ActionFlags::Instanced)
        outputSize *= action->numInstances;

      // resize up the buffer if needed for the vertex output data
      if(DebugData.feedbackBufferSize < outputSize)
      {
        uint64_t oldSize = DebugData.feedbackBufferSize;
        DebugData.feedbackBufferSize = CalcMeshOutputSize(DebugData.feedbackBufferSize, outputSize);
        RDCWARN("Resizing xfb buffer from %llu to %llu for output", oldSize,
                DebugData.feedbackBufferSize);
        if(DebugData.feedbackBufferSize > INTPTR_MAX)
        {
          RDCERR("Too much data generated");
          DebugData.feedbackBufferSize = INTPTR_MAX;
        }
        drv.glNamedBufferDataEXT(DebugData.feedbackBuffer, (GLsizeiptr)DebugData.feedbackBufferSize,
                                 NULL, eGL_DYNAMIC_READ);
      }

      // need to rebind this here because of an AMD bug that seems to ignore the buffer
      // bindings in the feedback object - or at least it errors if the default feedback
      // object has no buffers bound. Fortunately the state is still object-local so
      // we don't have to restore the buffer binding on the default feedback object.
      drv.glBindBufferBase(eGL_TRANSFORM_FEEDBACK_BUFFER, 0, DebugData.feedbackBuffer);

      drv.glBeginQuery(eGL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN, DebugData.feedbackQueries[0]);
      drv.glBeginTransformFeedback(eGL_POINTS);

      if(action->flags & ActionFlags::Instanced)
      {
        if(HasExt[ARB_base_instance])
        {
          drv.glDrawArraysInstancedBaseInstance(eGL_POINTS, action->vertexOffset, action->numIndices,
                                                action->numInstances, action->instanceOffset);
        }
        else
        {
          drv.glDrawArraysInstanced(eGL_POINTS, action->vertexOffset, action->numIndices,
                                    action->numInstances);
        }
      }
      else
      {
        drv.glDrawArrays(eGL_POINTS, action->vertexOffset, action->numIndices);
      }
    }
    else    // action is indexed
    {
      ResourceId idxId = rm->GetResID(BufferRes(drv.GetCtx(), elArrayBuffer));

      bytebuf idxdata;
      GetBufferData(idxId, action->indexOffset * drawParams.indexWidth,
                    action->numIndices * drawParams.indexWidth, idxdata);

      rdcarray<uint32_t> indices;

      uint8_t *idx8 = (uint8_t *)&idxdata[0];
      uint16_t *idx16 = (uint16_t *)&idxdata[0];
      uint32_t *idx32 = (uint32_t *)&idxdata[0];

      // only read as many indices as were available in the buffer
      uint32_t numIndices =
          RDCMIN(uint32_t(idxdata.size() / drawParams.indexWidth), action->numIndices);

      // grab all unique vertex indices referenced
      for(uint32_t i = 0; i < numIndices; i++)
      {
        uint32_t i32 = 0;
        if(drawParams.indexWidth == 1)
          i32 = uint32_t(idx8[i]);
        else if(drawParams.indexWidth == 2)
          i32 = uint32_t(idx16[i]);
        else if(drawParams.indexWidth == 4)
          i32 = idx32[i];

        auto it = std::lower_bound(indices.begin(), indices.end(), i32);

        if(it != indices.end() && *it == i32)
          continue;

        indices.insert(it - indices.begin(), i32);
      }

      // if we read out of bounds, we'll also have a 0 index being referenced
      // (as 0 is read). Don't insert 0 if we already have 0 though
      if(numIndices < action->numIndices && (indices.empty() || indices[0] != 0))
        indices.insert(0, 0);

      // An index buffer could be something like: 500, 501, 502, 501, 503, 502
      // in which case we can't use the existing index buffer without filling 499 slots of vertex
      // data with padding. Instead we rebase the indices based on the smallest vertex so it becomes
      // 0, 1, 2, 1, 3, 2 and then that matches our stream-out'd buffer.
      //
      // Note that there could also be gaps, like: 500, 501, 502, 510, 511, 512
      // which would become 0, 1, 2, 3, 4, 5 and so the old index buffer would no longer be valid.
      // We just stream-out a tightly packed list of unique indices, and then remap the index buffer
      // so that what did point to 500 points to 0 (accounting for rebasing), and what did point
      // to 510 now points to 3 (accounting for the unique sort).

      // we use a map here since the indices may be sparse. Especially considering if an index
      // is 'invalid' like 0xcccccccc then we don't want an array of 3.4 billion entries.
      std::map<uint32_t, size_t> indexRemap;
      for(size_t i = 0; i < indices.size(); i++)
      {
        // by definition, this index will only appear once in indices[]
        indexRemap[indices[i]] = i;
      }

      // generate a temporary index buffer with our 'unique index set' indices,
      // so we can transform feedback each referenced vertex once
      GLuint indexSetBuffer = 0;
      drv.glGenBuffers(1, &indexSetBuffer);
      drv.glBindBuffer(eGL_ELEMENT_ARRAY_BUFFER, indexSetBuffer);
      drv.glNamedBufferDataEXT(indexSetBuffer, sizeof(uint32_t) * indices.size(), &indices[0],
                               eGL_STATIC_DRAW);

      uint32_t outputSize = (uint32_t)indices.size() * stride;

      if(action->flags & ActionFlags::Instanced)
        outputSize *= action->numInstances;

      // resize up the buffer if needed for the vertex output data
      if(DebugData.feedbackBufferSize < outputSize)
      {
        uint64_t oldSize = DebugData.feedbackBufferSize;
        DebugData.feedbackBufferSize = CalcMeshOutputSize(DebugData.feedbackBufferSize, outputSize);
        RDCWARN("Resizing xfb buffer from %llu to %llu for output", oldSize,
                DebugData.feedbackBufferSize);
        if(DebugData.feedbackBufferSize > INTPTR_MAX)
        {
          RDCERR("Too much data generated");
          DebugData.feedbackBufferSize = INTPTR_MAX;
        }
        drv.glNamedBufferDataEXT(DebugData.feedbackBuffer, (GLsizeiptr)DebugData.feedbackBufferSize,
                                 NULL, eGL_DYNAMIC_READ);
      }

      // need to rebind this here because of an AMD bug that seems to ignore the buffer
      // bindings in the feedback object - or at least it errors if the default feedback
      // object has no buffers bound. Fortunately the state is still object-local so
      // we don't have to restore the buffer binding on the default feedback object.
      drv.glBindBufferBase(eGL_TRANSFORM_FEEDBACK_BUFFER, 0, DebugData.feedbackBuffer);

      drv.glBeginQuery(eGL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN, DebugData.feedbackQueries[0]);
      drv.glBeginTransformFeedback(eGL_POINTS);

      if(action->flags & ActionFlags::Instanced)
      {
        if(HasExt[ARB_base_instance])
        {
          drv.glDrawElementsInstancedBaseVertexBaseInstance(
              eGL_POINTS, (GLsizei)indices.size(), eGL_UNSIGNED_INT, NULL, action->numInstances,
              action->baseVertex, action->instanceOffset);
        }
        else
        {
          drv.glDrawElementsInstancedBaseVertex(eGL_POINTS, (GLsizei)indices.size(), eGL_UNSIGNED_INT,
                                                NULL, action->numInstances, action->baseVertex);
        }
      }
      else
      {
        drv.glDrawElementsBaseVertex(eGL_POINTS, (GLsizei)indices.size(), eGL_UNSIGNED_INT, NULL,
                                     action->baseVertex);
      }

      // delete the buffer, we don't need it anymore
      drv.glBindBuffer(eGL_ELEMENT_ARRAY_BUFFER, elArrayBuffer);
      drv.glDeleteBuffers(1, &indexSetBuffer);

      uint32_t stripRestartValue32 = 0;

      if(rs.Enabled[GLRenderState::eEnabled_PrimitiveRestart] ||
         rs.Enabled[GLRenderState::eEnabled_PrimitiveRestartFixedIndex])
      {
        stripRestartValue32 = rs.Enabled[GLRenderState::eEnabled_PrimitiveRestartFixedIndex]
                                  ? ~0U
                                  : rs.PrimitiveRestartIndex;
      }

      // rebase existing index buffer to point from 0 onwards (which will index into our
      // stream-out'd vertex buffer)
      if(drawParams.indexWidth == 1)
      {
        uint8_t stripRestartValue = stripRestartValue32 & 0xff;

        for(uint32_t i = 0; i < numIndices; i++)
        {
          // preserve primitive restart indices
          if(stripRestartValue && idx8[i] == stripRestartValue)
            continue;

          idx8[i] = uint8_t(indexRemap[idx8[i]]);
        }
      }
      else if(drawParams.indexWidth == 2)
      {
        uint16_t stripRestartValue = stripRestartValue32 & 0xffff;

        for(uint32_t i = 0; i < numIndices; i++)
        {
          // preserve primitive restart indices
          if(stripRestartValue && idx16[i] == stripRestartValue)
            continue;

          idx16[i] = uint16_t(indexRemap[idx16[i]]);
        }
      }
      else
      {
        uint32_t stripRestartValue = stripRestartValue32;

        for(uint32_t i = 0; i < numIndices; i++)
        {
          // preserve primitive restart indices
          if(stripRestartValue && idx32[i] == stripRestartValue)
            continue;

          idx32[i] = uint32_t(indexRemap[idx32[i]]);
        }
      }

      // make the index buffer that can be used to render this postvs data - the original
      // indices, repointed (since we transform feedback to the start of our feedback
      // buffer and only tightly packed unique indices).
      if(!idxdata.empty())
      {
        drv.glGenBuffers(1, &idxBuf);
        drv.glBindBuffer(eGL_ELEMENT_ARRAY_BUFFER, idxBuf);
        drv.glNamedBufferDataEXT(idxBuf, (GLsizeiptr)idxdata.size(), &idxdata[0], eGL_STATIC_DRAW);
      }

      // restore previous element array buffer binding
      drv.glBindBuffer(eGL_ELEMENT_ARRAY_BUFFER, elArrayBuffer);
    }

    drv.glEndTransformFeedback();
    drv.glEndQuery(eGL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN);

    bool error = false;

    // this should be the same as the draw size
    GLuint primsWritten = 0;
    drv.glGetQueryObjectuiv(DebugData.feedbackQueries[0], eGL_QUERY_RESULT, &primsWritten);

    if(primsWritten == 0)
    {
      // we bailed out much earlier if this was a draw of 0 verts
      RDCERR("No primitives written - but we must have had some number of vertices in the draw");
      error = true;
      ret.vsout.status = "Error obtaining vertex data via transform feedback";
    }

    // get buffer data from buffer attached to feedback object
    float *data = (float *)drv.glMapNamedBufferEXT(DebugData.feedbackBuffer, eGL_READ_ONLY);

    if(data == NULL)
    {
      drv.glUnmapNamedBufferEXT(DebugData.feedbackBuffer);
      RDCERR("Couldn't map feedback buffer!");
      error = true;
      ret.vsout.status = "Error reading back vertex data from GPU";
    }

    if(error)
    {
      // restore replay state we trashed
      drv.glUseProgram(rs.Program.name);
      drv.glBindProgramPipeline(rs.Pipeline.name);

      drv.glBindBuffer(eGL_ARRAY_BUFFER, rs.BufferBindings[GLRenderState::eBufIdx_Array].name);
      drv.glBindBuffer(eGL_ELEMENT_ARRAY_BUFFER, elArrayBuffer);

      if(HasExt[ARB_transform_feedback2])
        drv.glBindTransformFeedback(eGL_TRANSFORM_FEEDBACK, rs.FeedbackObj.name);

      if(!rs.Enabled[GLRenderState::eEnabled_RasterizerDiscard])
        drv.glDisable(eGL_RASTERIZER_DISCARD);
      else
        drv.glEnable(eGL_RASTERIZER_DISCARD);

      // delete any temporaries
      for(size_t i = 0; i < 4; i++)
        if(tmpShaders[i])
          drv.glDeleteShader(tmpShaders[i]);

      drv.glDeleteShader(dummyFrag);

      drv.glDeleteProgram(feedbackProg);

      return;
    }

    // create a buffer with this data, for future use (typed to ARRAY_BUFFER so we
    // can render from it to display previews).
    GLuint vsoutBuffer = 0;
    drv.glGenBuffers(1, &vsoutBuffer);
    drv.glBindBuffer(eGL_ARRAY_BUFFER, vsoutBuffer);
    drv.glNamedBufferDataEXT(vsoutBuffer, stride * primsWritten, data, eGL_STATIC_DRAW);

    byte *byteData = (byte *)data;

    float nearp = 0.1f;
    float farp = 100.0f;

    Vec4f *pos0 = (Vec4f *)byteData;

    bool found = false;

    for(GLuint i = 1; hasPosition && i < primsWritten; i++)
    {
      //////////////////////////////////////////////////////////////////////////////////
      // derive near/far, assuming a standard perspective matrix
      //
      // the transformation from pre-projection {Z,W} to post-projection {Z,W}
      // is linear. So we can say Zpost = Zpre*m + c . Here we assume Wpre = 1
      // and we know Wpost = Zpre from the perspective matrix.
      // we can then see from the perspective matrix that
      // m = F/(F-N)
      // c = -(F*N)/(F-N)
      //
      // with re-arranging and substitution, we then get:
      // N = -c/m
      // F = c/(1-m)
      //
      // so if we can derive m and c then we can determine N and F. We can do this with
      // two points, and we pick them reasonably distinct on z to reduce floating-point
      // error

      Vec4f *pos = (Vec4f *)(byteData + i * stride);

      if(fabs(pos->w - pos0->w) > 0.01f && fabs(pos->z - pos0->z) > 0.01f)
      {
        Vec2f A(pos0->w, pos0->z);
        Vec2f B(pos->w, pos->z);

        float m = (B.y - A.y) / (B.x - A.x);
        float c = B.y - B.x * m;

        if(m == 1.0f || c == 0.0f)
          continue;

        if(-c / m <= 0.000001f)
          continue;

        nearp = -c / m;
        farp = c / (1 - m);

        found = true;

        break;
      }
    }

    // if we didn't find anything, all z's and w's were identical.
    // If the z is positive and w greater for the first element then
    // we detect this projection as reversed z with infinite far plane
    if(!found && pos0->z > 0.0f && pos0->w > pos0->z)
    {
      nearp = pos0->z;
      farp = FLT_MAX;
    }

    drv.glUnmapNamedBufferEXT(DebugData.feedbackBuffer);

    // store everything out to the PostVS data cache
    ret.vsin.topo = drawParams.topo;
    ret.vsout.buf = vsoutBuffer;
    ret.vsout.vertStride = stride;
    ret.vsout.nearPlane = nearp;
    ret.vsout.farPlane = farp;

    ret.vsout.useIndices = bool(action->flags & ActionFlags::Indexed);
    ret.vsout.numVerts = action->numIndices;

    ret.vsout.instStride = 0;
    if(action->flags & ActionFlags::Instanced)
      ret.vsout.instStride = (stride * primsWritten) / RDCMAX(1U, action->numInstances);

    ret.vsout.idxBuf = 0;
    ret.vsout.idxByteWidth = drawParams.indexWidth;
    if(ret.vsout.useIndices && idxBuf)
    {
      ret.vsout.idxBuf = idxBuf;
    }

    ret.vsout.hasPosOut = hasPosition;

    ret.vsout.topo = drawParams.topo;
  }

  if(tesRefl || gsRefl)
  {
    ret.gsout.status.clear();

    ShaderReflection *lastRefl = gsRefl;
    SPIRVPatchData lastPatch = gsPatch;
    int lastIndex = 3;

    if(!lastRefl)
    {
      lastRefl = tesRefl;
      lastPatch = tesPatch;
      lastIndex = 2;
    }

    bool lastSPIRV = (lastRefl->encoding == ShaderEncoding::OpenGLSPIRV);

    RDCASSERT(lastRefl);

    // if the vertex shader was SPIR-V we didn't attach it and instead attached a tmp one with
    // patched SPIR-V. Detach it and attach the original one without any XFB annotations
    if(vsOrigShader)
    {
      drv.glDetachShader(feedbackProg, stageShaders[0]);
      stageShaders[0] = vsOrigShader;
      drv.glAttachShader(feedbackProg, stageShaders[0]);
    }

    // attach the other non-vertex shaders
    for(int i = 1; i < 4; i++)
    {
      if(stageShaders[i])
      {
        // if the last shader is SPIR-V, don't attach it - we'll build our own
        if(lastSPIRV && i == lastIndex)
          continue;

        drv.glAttachShader(feedbackProg, stageShaders[i]);
      }
    }

    GLint status = 0;

    hasPosition = false;

    for(const SigParameter &sig : lastRefl->outputSignature)
    {
      if(sig.systemValue == ShaderBuiltin::Position)
      {
        hasPosition = true;
        break;
      }
    }

    if(lastSPIRV)
    {
      // SPIR-V path
      stageShaders[lastIndex] = tmpShaders[lastIndex] = drv.glCreateShader(ShaderEnum(lastIndex));

      rdcarray<uint32_t> spirv;
      spirv.resize(lastRefl->rawBytes.size() / sizeof(uint32_t));
      memcpy(spirv.data(), lastRefl->rawBytes.data(), lastRefl->rawBytes.size());

      AddXFBAnnotations(*lastRefl, lastPatch, 0, lastRefl->entryPoint.c_str(), spirv, stride);

      drv.glShaderBinary(1, &stageShaders[lastIndex], eGL_SHADER_BINARY_FORMAT_SPIR_V, spirv.data(),
                         (GLsizei)spirv.size() * 4);

      drv.glSpecializeShader(stageShaders[lastIndex], lastRefl->entryPoint.c_str(), 0, NULL, NULL);

      char buffer[1024] = {};
      GL.glGetShaderiv(stageShaders[lastIndex], eGL_COMPILE_STATUS, &status);
      if(status == 0)
      {
        GL.glGetShaderInfoLog(stageShaders[lastIndex], 1024, NULL, buffer);
        RDCERR("SPIR-V post-gs patched shader compile error: %s", buffer);
        ret.gsout.status =
            "Failed to patch SPIR-V geometry/tessellation shader to use transform feedback.";
        return;
      }

      // attach the last shader
      drv.glAttachShader(feedbackProg, stageShaders[lastIndex]);

      drv.glLinkProgram(feedbackProg);

      drv.glGetProgramiv(feedbackProg, eGL_LINK_STATUS, &status);

      if(status == 0)
      {
        drv.glGetProgramInfoLog(feedbackProg, 1024, NULL, buffer);
        RDCERR("SPIR-V post-gs patched program link error: %s", buffer);
        ret.gsout.status =
            "Failed to patch SPIR-V geometry/tessellation shader to use transform feedback.";
        return;
      }
    }
    else
    {
      rdcarray<const char *> varyings;

      MakeVaryingsFromShaderReflection(*lastRefl, varyings, stride);

      // see above for the justification/explanation of this monstrosity.

      bool finished = false;
      for(;;)
      {
        drv.SuppressDebugMessages(true);

        // specify current varyings & relink
        drv.glTransformFeedbackVaryings(feedbackProg, (GLsizei)varyings.size(), &varyings[0],
                                        eGL_INTERLEAVED_ATTRIBS);
        drv.glLinkProgram(feedbackProg);

        drv.glGetProgramiv(feedbackProg, eGL_LINK_STATUS, &status);

        drv.SuppressDebugMessages(false);

        // all good! Hopefully we'll mostly hit this
        if(status == 1)
          break;

        // if finished is true, this was our last attempt - there are no
        // more fixups possible
        if(finished)
          break;

        char buffer[1025] = {0};
        drv.glGetProgramInfoLog(feedbackProg, 1024, NULL, buffer);

        // assume we're finished and can't retry any more after this.
        // if we find a potential 'fixup' we'll set this back to false
        finished = true;

        // see if any of our current varyings are present in the buffer string
        for(size_t i = 0; i < varyings.size(); i++)
        {
          if(strstr(buffer, varyings[i]))
          {
            const char *prefix_removed = strchr(varyings[i], '.');

            // does it contain a prefix?
            if(prefix_removed)
            {
              prefix_removed++;    // now this is our string without the prefix

              // first check this won't cause a duplicate - if it does, we have to try something
              // else
              bool duplicate = false;
              for(size_t j = 0; j < varyings.size(); j++)
              {
                if(!strcmp(varyings[j], prefix_removed))
                {
                  duplicate = true;
                  break;
                }
              }

              if(!duplicate)
              {
                // we'll attempt this fixup
                RDCWARN("Attempting XFB varying fixup, subst '%s' for '%s'", varyings[i],
                        prefix_removed);
                varyings[i] = prefix_removed;
                finished = false;

                // don't try more than one at once (just in case)
                break;
              }
            }
          }
        }
      }

      if(status == 0)
      {
        // if we STILL can't link then something is really messy. Some drivers like AMD reflect out
        // unused variables when reflecting a separable program, then complain when they are passed
        // in as varyings. We remove all the varyings, link the program, then reflect it as-is and
        // try to use the output signature from that as the varyings.
        RDCWARN(
            "Failed to generate XFB varyings from normal reflection - making one final attempt.");
        RDCWARN(
            "This is often caused by sensitive drivers and output variables declared but never "
            "written to.");

        drv.SuppressDebugMessages(true);

        drv.glTransformFeedbackVaryings(feedbackProg, 0, NULL, eGL_INTERLEAVED_ATTRIBS);
        drv.glLinkProgram(feedbackProg);

        drv.glGetProgramiv(feedbackProg, eGL_LINK_STATUS, &status);

        if(status == 1)
        {
          ShaderReflection tempRefl;
          MakeShaderReflection(ShaderEnum((size_t)lastRefl->stage), feedbackProg, tempRefl,
                               outputUsage);

          // remake the varyings with tempRefl to 'trim' the output signature
          MakeVaryingsFromShaderReflection(*lastRefl, varyings, stride, &tempRefl);

          drv.glTransformFeedbackVaryings(feedbackProg, (GLsizei)varyings.size(), &varyings[0],
                                          eGL_INTERLEAVED_ATTRIBS);
          drv.glLinkProgram(feedbackProg);

          drv.glGetProgramiv(feedbackProg, eGL_LINK_STATUS, &status);
        }
        else
        {
          RDCWARN("Can't link program with no varyings!");
        }

        drv.SuppressDebugMessages(false);
      }
    }

    // detach the shaders now that linking is complete
    for(int i = 0; i < 4; i++)
      if(stageShaders[i])
        drv.glDetachShader(feedbackProg, stageShaders[i]);

    if(status == 0)
    {
      char buffer[1025] = {0};
      drv.glGetProgramInfoLog(feedbackProg, 1024, NULL, buffer);
      RDCERR("Failed to fix-up. Link error making xfb last program: %s", buffer);
    }
    else
    {
      PerStageReflections dstStages;
      m_pDriver->FillReflectionArray(ProgramRes(drv.GetCtx(), feedbackProg), dstStages);

      // copy across any uniform values, bindings etc from the real program containing
      // the vertex stage
      {
        PerStageReflections stages;
        m_pDriver->FillReflectionArray(ProgramRes(drv.GetCtx(), stageSrcPrograms[0]), stages);

        CopyProgramUniforms(stages, stageSrcPrograms[0], dstStages, feedbackProg);
      }

      // if tessellation is enabled, bind & copy uniforms. Note, control shader is optional
      // independent of eval shader (default values are used for the tessellation levels).
      if(stageSrcPrograms[1])
      {
        PerStageReflections stages;
        m_pDriver->FillReflectionArray(ProgramRes(drv.GetCtx(), stageSrcPrograms[1]), stages);

        CopyProgramUniforms(stages, stageSrcPrograms[1], dstStages, feedbackProg);
      }

      if(stageSrcPrograms[2])
      {
        PerStageReflections stages;
        m_pDriver->FillReflectionArray(ProgramRes(drv.GetCtx(), stageSrcPrograms[2]), stages);

        CopyProgramUniforms(stages, stageSrcPrograms[2], dstStages, feedbackProg);
      }

      // if we have a geometry shader, bind & copy uniforms
      if(stageSrcPrograms[3])
      {
        PerStageReflections stages;
        m_pDriver->FillReflectionArray(ProgramRes(drv.GetCtx(), stageSrcPrograms[3]), stages);

        CopyProgramUniforms(stages, stageSrcPrograms[3], dstStages, feedbackProg);
      }

      // bind our program and do the feedback draw
      drv.glUseProgram(feedbackProg);
      drv.glBindProgramPipeline(0);

      if(HasExt[ARB_transform_feedback2])
        drv.glBindTransformFeedback(eGL_TRANSFORM_FEEDBACK, DebugData.feedbackObj);

      // need to rebind this here because of an AMD bug that seems to ignore the buffer
      // bindings in the feedback object - or at least it errors if the default feedback
      // object has no buffers bound. Fortunately the state is still object-local so
      // we don't have to restore the buffer binding on the default feedback object.
      drv.glBindBufferBase(eGL_TRANSFORM_FEEDBACK_BUFFER, 0, DebugData.feedbackBuffer);

      idxBuf = 0;

      GLenum shaderOutMode = eGL_TRIANGLES;
      GLenum lastOutTopo = eGL_TRIANGLES;

      uint32_t maxOutputSize = stride;

      if(action->flags & ActionFlags::Instanced)
        maxOutputSize *= action->numInstances;

      uint32_t numInputPrimitives = action->numIndices;
      GLenum drawtopo = MakeGLPrimitiveTopology(drawParams.topo);

      switch(drawParams.topo)
      {
        case Topology::Unknown:
        case Topology::PointList: break;
        case Topology::LineList: numInputPrimitives /= 2; break;
        case Topology::LineStrip: numInputPrimitives -= 1; break;
        case Topology::LineLoop: break;
        case Topology::TriangleList: numInputPrimitives /= 3; break;
        case Topology::TriangleStrip:
        case Topology::TriangleFan: numInputPrimitives -= 2; break;
        case Topology::LineList_Adj: numInputPrimitives /= 4; break;
        case Topology::LineStrip_Adj: numInputPrimitives -= 3; break;
        case Topology::TriangleList_Adj: numInputPrimitives /= 6; break;
        case Topology::TriangleStrip_Adj: numInputPrimitives -= 5; break;
        case Topology::PatchList_1CPs:
        case Topology::PatchList_2CPs:
        case Topology::PatchList_3CPs:
        case Topology::PatchList_4CPs:
        case Topology::PatchList_5CPs:
        case Topology::PatchList_6CPs:
        case Topology::PatchList_7CPs:
        case Topology::PatchList_8CPs:
        case Topology::PatchList_9CPs:
        case Topology::PatchList_10CPs:
        case Topology::PatchList_11CPs:
        case Topology::PatchList_12CPs:
        case Topology::PatchList_13CPs:
        case Topology::PatchList_14CPs:
        case Topology::PatchList_15CPs:
        case Topology::PatchList_16CPs:
        case Topology::PatchList_17CPs:
        case Topology::PatchList_18CPs:
        case Topology::PatchList_19CPs:
        case Topology::PatchList_20CPs:
        case Topology::PatchList_21CPs:
        case Topology::PatchList_22CPs:
        case Topology::PatchList_23CPs:
        case Topology::PatchList_24CPs:
        case Topology::PatchList_25CPs:
        case Topology::PatchList_26CPs:
        case Topology::PatchList_27CPs:
        case Topology::PatchList_28CPs:
        case Topology::PatchList_29CPs:
        case Topology::PatchList_30CPs:
        case Topology::PatchList_31CPs:
        case Topology::PatchList_32CPs:
          numInputPrimitives /= PatchList_Count(drawParams.topo);
          break;
      }

      if(lastRefl == gsRefl)
      {
        drv.glGetProgramiv(feedbackProg, eGL_GEOMETRY_OUTPUT_TYPE, (GLint *)&shaderOutMode);

        GLint maxVerts = 1;

        drv.glGetProgramiv(feedbackProg, eGL_GEOMETRY_VERTICES_OUT, (GLint *)&maxVerts);

        if(shaderOutMode == eGL_TRIANGLE_STRIP)
        {
          lastOutTopo = eGL_TRIANGLES;
          maxVerts = RDCMAX(3, maxVerts);
        }
        else if(shaderOutMode == eGL_LINE_STRIP)
        {
          lastOutTopo = eGL_LINES;
          maxVerts = RDCMAX(2, maxVerts);
        }
        else if(shaderOutMode == eGL_POINTS)
        {
          lastOutTopo = eGL_POINTS;
          maxVerts = RDCMAX(1, maxVerts);
        }

        maxOutputSize *= maxVerts * numInputPrimitives;
      }
      else if(lastRefl == tesRefl)
      {
        drv.glGetProgramiv(feedbackProg, eGL_TESS_GEN_MODE, (GLint *)&shaderOutMode);

        uint32_t outputPrimitiveVerts = 1;

        if(shaderOutMode == eGL_QUADS)
        {
          lastOutTopo = eGL_TRIANGLES;
          outputPrimitiveVerts = 3;
        }
        else if(shaderOutMode == eGL_ISOLINES)
        {
          lastOutTopo = eGL_LINES;
          outputPrimitiveVerts = 2;
        }
        else if(shaderOutMode == eGL_TRIANGLES)
        {
          lastOutTopo = eGL_TRIANGLES;
          outputPrimitiveVerts = 3;
        }

        // assume an average maximum tessellation level of 32
        maxOutputSize *= 32 * outputPrimitiveVerts * numInputPrimitives;
      }

      // resize up the buffer if needed for the vertex output data
      if(DebugData.feedbackBufferSize < maxOutputSize)
      {
        uint64_t oldSize = DebugData.feedbackBufferSize;
        DebugData.feedbackBufferSize =
            CalcMeshOutputSize(DebugData.feedbackBufferSize, maxOutputSize);
        RDCWARN("Conservatively resizing xfb buffer from %llu to %llu for output", oldSize,
                DebugData.feedbackBufferSize);
        if(DebugData.feedbackBufferSize > INTPTR_MAX)
        {
          RDCERR("Too much data generated");
          DebugData.feedbackBufferSize = INTPTR_MAX;
        }
        drv.glNamedBufferDataEXT(DebugData.feedbackBuffer, (GLsizeiptr)DebugData.feedbackBufferSize,
                                 NULL, eGL_DYNAMIC_READ);
      }

      GLenum idxType = eGL_UNSIGNED_BYTE;
      if(drawParams.indexWidth == 2)
        idxType = eGL_UNSIGNED_SHORT;
      else if(drawParams.indexWidth == 4)
        idxType = eGL_UNSIGNED_INT;

      // instanced draws must be replayed one at a time so we can record the number of primitives
      // from each drawcall, as due to expansion this can vary per-instance.
      if(action->flags & ActionFlags::Instanced)
      {
        // if there is only one instance it's a trivial case and we don't need to bother with the
        // expensive path
        if(action->numInstances > 1)
        {
          // ensure we have enough queries
          uint32_t curSize = (uint32_t)DebugData.feedbackQueries.size();
          if(curSize < action->numInstances)
          {
            DebugData.feedbackQueries.resize(action->numInstances);
            drv.glGenQueries(action->numInstances - curSize,
                             DebugData.feedbackQueries.data() + curSize);
          }

          // do incremental draws to get the output size. We have to do this O(N^2) style because
          // there's no way to replay only a single instance. We have to replay 1, 2, 3, ... N
          // instances and count the total number of verts each time, then we can see from the
          // difference how much each instance wrote.
          for(uint32_t inst = 1; inst <= action->numInstances; inst++)
          {
            drv.glBindBufferBase(eGL_TRANSFORM_FEEDBACK_BUFFER, 0, DebugData.feedbackBuffer);
            drv.glBeginQuery(eGL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN,
                             DebugData.feedbackQueries[inst - 1]);
            drv.glBeginTransformFeedback(lastOutTopo);

            if(!(action->flags & ActionFlags::Indexed))
            {
              if(HasExt[ARB_base_instance])
              {
                drv.glDrawArraysInstancedBaseInstance(
                    drawtopo, action->vertexOffset, action->numIndices, inst, action->instanceOffset);
              }
              else
              {
                drv.glDrawArraysInstanced(drawtopo, action->vertexOffset, action->numIndices, inst);
              }
            }
            else
            {
              if(HasExt[ARB_base_instance])
              {
                drv.glDrawElementsInstancedBaseVertexBaseInstance(
                    drawtopo, action->numIndices, idxType,
                    (const void *)(uintptr_t(action->indexOffset) * uintptr_t(drawParams.indexWidth)),
                    inst, action->baseVertex, action->instanceOffset);
              }
              else
              {
                drv.glDrawElementsInstancedBaseVertex(
                    drawtopo, action->numIndices, idxType,
                    (const void *)(uintptr_t(action->indexOffset) * uintptr_t(drawParams.indexWidth)),
                    inst, action->baseVertex);
              }
            }

            drv.glEndTransformFeedback();
            drv.glEndQuery(eGL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN);
          }
        }
        else
        {
          drv.glBeginQuery(eGL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN, DebugData.feedbackQueries[0]);
          drv.glBeginTransformFeedback(lastOutTopo);

          if(!(action->flags & ActionFlags::Indexed))
          {
            if(HasExt[ARB_base_instance])
            {
              drv.glDrawArraysInstancedBaseInstance(drawtopo, action->vertexOffset,
                                                    action->numIndices, action->numInstances,
                                                    action->instanceOffset);
            }
            else
            {
              drv.glDrawArraysInstanced(drawtopo, action->vertexOffset, action->numIndices,
                                        action->numInstances);
            }
          }
          else
          {
            if(HasExt[ARB_base_instance])
            {
              drv.glDrawElementsInstancedBaseVertexBaseInstance(
                  drawtopo, action->numIndices, idxType,
                  (const void *)(uintptr_t(action->indexOffset) * uintptr_t(drawParams.indexWidth)),
                  action->numInstances, action->baseVertex, action->instanceOffset);
            }
            else
            {
              drv.glDrawElementsInstancedBaseVertex(
                  drawtopo, action->numIndices, idxType,
                  (const void *)(uintptr_t(action->indexOffset) * uintptr_t(drawParams.indexWidth)),
                  action->numInstances, action->baseVertex);
            }
          }

          drv.glEndTransformFeedback();
          drv.glEndQuery(eGL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN);
        }
      }
      else
      {
        drv.glBeginQuery(eGL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN, DebugData.feedbackQueries[0]);
        drv.glBeginTransformFeedback(lastOutTopo);

        if(!(action->flags & ActionFlags::Indexed))
        {
          drv.glDrawArrays(drawtopo, action->vertexOffset, action->numIndices);
        }
        else
        {
          drv.glDrawElementsBaseVertex(
              drawtopo, action->numIndices, idxType,
              (const void *)(uintptr_t(action->indexOffset) * uintptr_t(drawParams.indexWidth)),
              action->baseVertex);
        }

        drv.glEndTransformFeedback();
        drv.glEndQuery(eGL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN);
      }

      rdcarray<GLPostVSData::InstData> instData;

      GLuint primsWritten = 0;

      if((action->flags & ActionFlags::Instanced) && action->numInstances > 1)
      {
        uint64_t prevVertCount = 0;

        for(uint32_t inst = 0; inst < action->numInstances; inst++)
        {
          drv.glGetQueryObjectuiv(DebugData.feedbackQueries[inst], eGL_QUERY_RESULT, &primsWritten);

          uint32_t vertCount = 3 * primsWritten;

          GLPostVSData::InstData d;
          d.numVerts = uint32_t(vertCount - prevVertCount);
          d.bufOffset = uint32_t(stride * prevVertCount);
          prevVertCount = vertCount;

          instData.push_back(d);
        }
      }
      else
      {
        primsWritten = 0;
        drv.glGetQueryObjectuiv(DebugData.feedbackQueries[0], eGL_QUERY_RESULT, &primsWritten);
      }

      bool error = false;

      if(primsWritten == 0)
      {
        RDCWARN("No primitives written by last vertex processing stage");
        error = true;
        ret.gsout.status = "No detectable output generated by geometry/tessellation shaders";
      }

      // get buffer data from buffer attached to feedback object
      float *data = (float *)drv.glMapNamedBufferEXT(DebugData.feedbackBuffer, eGL_READ_ONLY);

      if(data == NULL)
      {
        drv.glUnmapNamedBufferEXT(DebugData.feedbackBuffer);
        RDCERR("Couldn't map feedback buffer!");
        ret.gsout.status = "Couldn't read back geometry/tessellation output data from GPU";
        error = true;
      }

      if(error)
      {
        // delete temporary program we made
        drv.glDeleteProgram(feedbackProg);

        // restore replay state we trashed
        drv.glUseProgram(rs.Program.name);
        drv.glBindProgramPipeline(rs.Pipeline.name);

        drv.glBindBuffer(eGL_ARRAY_BUFFER, rs.BufferBindings[GLRenderState::eBufIdx_Array].name);
        drv.glBindBuffer(eGL_ELEMENT_ARRAY_BUFFER, elArrayBuffer);

        if(HasExt[ARB_transform_feedback2])
          drv.glBindTransformFeedback(eGL_TRANSFORM_FEEDBACK, rs.FeedbackObj.name);

        if(!rs.Enabled[GLRenderState::eEnabled_RasterizerDiscard])
          drv.glDisable(eGL_RASTERIZER_DISCARD);
        else
          drv.glEnable(eGL_RASTERIZER_DISCARD);

        // delete any temporaries
        for(size_t i = 0; i < 4; i++)
          if(tmpShaders[i])
            drv.glDeleteShader(tmpShaders[i]);

        drv.glDeleteShader(dummyFrag);

        return;
      }

      if(lastRefl == tesRefl)
      {
        // primitive counter is the number of primitives, not vertices
        if(shaderOutMode == eGL_TRIANGLES ||
           shaderOutMode == eGL_QUADS)    // query for quads returns # triangles
          ret.gsout.numVerts = primsWritten * 3;
        else if(shaderOutMode == eGL_ISOLINES)
          ret.gsout.numVerts = primsWritten * 2;
      }
      else if(lastRefl == gsRefl)
      {
        // primitive counter is the number of primitives, not vertices
        if(shaderOutMode == eGL_POINTS)
          ret.gsout.numVerts = primsWritten;
        else if(shaderOutMode == eGL_LINE_STRIP)
          ret.gsout.numVerts = primsWritten * 2;
        else if(shaderOutMode == eGL_TRIANGLE_STRIP)
          ret.gsout.numVerts = primsWritten * 3;
      }

      // create a buffer with this data, for future use (typed to ARRAY_BUFFER so we
      // can render from it to display previews).
      GLuint lastoutBuffer = 0;
      drv.glGenBuffers(1, &lastoutBuffer);
      drv.glBindBuffer(eGL_ARRAY_BUFFER, lastoutBuffer);
      drv.glNamedBufferDataEXT(lastoutBuffer, stride * ret.gsout.numVerts, data, eGL_STATIC_DRAW);

      byte *byteData = (byte *)data;

      float nearp = 0.1f;
      float farp = 100.0f;

      Vec4f *pos0 = (Vec4f *)byteData;

      bool found = false;

      for(uint32_t i = 1; hasPosition && i < ret.gsout.numVerts; i++)
      {
        //////////////////////////////////////////////////////////////////////////////////
        // derive near/far, assuming a standard perspective matrix
        //
        // the transformation from pre-projection {Z,W} to post-projection {Z,W}
        // is linear. So we can say Zpost = Zpre*m + c . Here we assume Wpre = 1
        // and we know Wpost = Zpre from the perspective matrix.
        // we can then see from the perspective matrix that
        // m = F/(F-N)
        // c = -(F*N)/(F-N)
        //
        // with re-arranging and substitution, we then get:
        // N = -c/m
        // F = c/(1-m)
        //
        // so if we can derive m and c then we can determine N and F. We can do this with
        // two points, and we pick them reasonably distinct on z to reduce floating-point
        // error

        Vec4f *pos = (Vec4f *)(byteData + i * stride);

        if(fabs(pos->w - pos0->w) > 0.01f && fabs(pos->z - pos0->z) > 0.01f)
        {
          Vec2f A(pos0->w, pos0->z);
          Vec2f B(pos->w, pos->z);

          float m = (B.y - A.y) / (B.x - A.x);
          float c = B.y - B.x * m;

          if(m == 1.0f || c == 0.0f)
            continue;

          if(-c / m <= 0.000001f)
            continue;

          nearp = -c / m;
          farp = c / (1 - m);

          found = true;

          break;
        }
      }

      // if we didn't find anything, all z's and w's were identical.
      // If the z is positive and w greater for the first element then
      // we detect this projection as reversed z with infinite far plane
      if(!found && pos0->z > 0.0f && pos0->w > pos0->z)
      {
        nearp = pos0->z;
        farp = FLT_MAX;
      }

      drv.glUnmapNamedBufferEXT(DebugData.feedbackBuffer);

      // store everything out to the PostVS data cache
      ret.gsout.buf = lastoutBuffer;
      ret.gsout.instStride = 0;
      if(action->flags & ActionFlags::Instanced)
      {
        ret.gsout.numVerts /= RDCMAX(1U, action->numInstances);
        ret.gsout.instStride = stride * ret.gsout.numVerts;
      }
      ret.gsout.vertStride = stride;
      ret.gsout.nearPlane = nearp;
      ret.gsout.farPlane = farp;

      ret.gsout.useIndices = false;

      ret.gsout.flipY = flipY;

      ret.gsout.hasPosOut = hasPosition;

      ret.gsout.idxBuf = 0;
      ret.gsout.idxByteWidth = 0;

      ret.gsout.topo = MakePrimitiveTopology(lastOutTopo);

      ret.gsout.instData = instData;
    }
  }
  else
  {
    ret.vsout.flipY = flipY;
  }

  // delete temporary program we made
  drv.glDeleteProgram(feedbackProg);

  // restore replay state we trashed
  drv.glUseProgram(rs.Program.name);
  drv.glBindProgramPipeline(rs.Pipeline.name);

  drv.glBindBuffer(eGL_ARRAY_BUFFER, rs.BufferBindings[GLRenderState::eBufIdx_Array].name);
  drv.glBindBuffer(eGL_ELEMENT_ARRAY_BUFFER, elArrayBuffer);

  if(HasExt[ARB_query_buffer_object])
    drv.glBindBuffer(eGL_QUERY_BUFFER, rs.BufferBindings[GLRenderState::eBufIdx_Query].name);

  if(HasExt[ARB_transform_feedback2])
    drv.glBindTransformFeedback(eGL_TRANSFORM_FEEDBACK, rs.FeedbackObj.name);

  if(!rs.Enabled[GLRenderState::eEnabled_RasterizerDiscard])
    drv.glDisable(eGL_RASTERIZER_DISCARD);
  else
    drv.glEnable(eGL_RASTERIZER_DISCARD);

  // delete any temporaries
  for(size_t i = 0; i < 4; i++)
    if(tmpShaders[i])
      drv.glDeleteShader(tmpShaders[i]);

  drv.glDeleteShader(dummyFrag);
}