renderer_opengl: Add assembly program code paths

Add code required to use OpenGL assembly programs based on
NV_gpu_program5. Decompilation for ARB programs is intended to be added
in a follow-up commit. This does **not** include ARB decompilation and
it's not in a usable state.

The intention behind assembly programs is to reduce shader stutter
significantly on drivers supporting NV_gpu_program5 (and other required
extensions). Currently only Nvidia's proprietary driver supports these
extensions.

Add a UI option, hidden for now, to avoid people enabling this option
accidentally.

This code path has some limitations that OpenGL compatibility doesn't
have:
- NV_shader_storage_buffer_object is limited to 16 entries for a single
OpenGL context state (I don't know if this is an intended limitation, a
specification issue or I am missing something). This currently causes
issues on The Legend of Zelda: Link's Awakening.
- NV_parameter_buffer_object can't bind buffers using an offset
other than zero. The workaround used is to copy to a temporary buffer
(this doesn't happen often so it's not an issue).

On the other hand, it has the following advantages:
- Shaders build a lot faster.
- We have control over how floating point rounding is done over
individual instructions (SPIR-V on Vulkan can't do this).
- Operations on shared memory can be unsigned and signed.
- Transform feedbacks are dynamic state (not yet implemented).
- Parameter buffers (uniform buffers) are per stage, matching NVN and
hardware's behavior.
- The API to bind and create assembly programs makes sense, unlike
ARB_separate_shader_objects.
This commit is contained in:
ReinUsesLisp 2020-05-17 22:32:49 -03:00
parent 47a7c4f4fe
commit 420cc13248
12 changed files with 339 additions and 109 deletions

View File

@ -13,6 +13,7 @@
#include "common/logging/log.h" #include "common/logging/log.h"
#include "common/scope_exit.h" #include "common/scope_exit.h"
#include "core/settings.h"
#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_resource_manager.h"
@ -183,10 +184,16 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
has_precise_bug = TestPreciseBug(); has_precise_bug = TestPreciseBug();
has_broken_compute = is_intel_proprietary; has_broken_compute = is_intel_proprietary;
has_fast_buffer_sub_data = is_nvidia; has_fast_buffer_sub_data = is_nvidia;
use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
GLAD_GL_NV_compute_program5;
LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug); LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
if (Settings::values.use_assembly_shaders && !use_assembly_shaders) {
LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported");
}
} }
Device::Device(std::nullptr_t) { Device::Device(std::nullptr_t) {

View File

@ -88,6 +88,10 @@ public:
return has_fast_buffer_sub_data; return has_fast_buffer_sub_data;
} }
/// Returns the cached use_assembly_shaders flag. Device::Device() sets it to true only when
/// Settings::values.use_assembly_shaders is enabled and both GLAD_GL_NV_gpu_program5 and
/// GLAD_GL_NV_compute_program5 are reported; otherwise it keeps its default of false.
bool UseAssemblyShaders() const {
return use_assembly_shaders;
}
private: private:
static bool TestVariableAoffi(); static bool TestVariableAoffi();
static bool TestPreciseBug(); static bool TestPreciseBug();
@ -107,6 +111,7 @@ private:
bool has_precise_bug{}; bool has_precise_bug{};
bool has_broken_compute{}; bool has_broken_compute{};
bool has_fast_buffer_sub_data{}; bool has_fast_buffer_sub_data{};
bool use_assembly_shaders{};
}; };
} // namespace OpenGL } // namespace OpenGL

View File

@ -94,17 +94,30 @@ void oglEnable(GLenum cap, bool state) {
} // Anonymous namespace } // Anonymous namespace
RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
ScreenInfo& info, GLShader::ProgramManager& program_manager, const Device& device, ScreenInfo& info,
StateTracker& state_tracker) ProgramManager& program_manager, StateTracker& state_tracker)
: RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker}, : RasterizerAccelerated{system.Memory()}, device{device}, texture_cache{system, *this, device,
state_tracker},
shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, shader_cache{*this, system, emu_window, device}, query_cache{system, *this},
buffer_cache{*this, system, device, STREAM_BUFFER_SIZE}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE},
fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system}, fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system},
screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} { screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {
CheckExtensions(); CheckExtensions();
if (device.UseAssemblyShaders()) {
glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
for (const GLuint cbuf : staging_cbufs) {
glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize),
nullptr, 0);
}
}
} }
RasterizerOpenGL::~RasterizerOpenGL() {} RasterizerOpenGL::~RasterizerOpenGL() {
if (device.UseAssemblyShaders()) {
glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
}
}
void RasterizerOpenGL::CheckExtensions() { void RasterizerOpenGL::CheckExtensions() {
if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) { if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) {
@ -230,6 +243,7 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
MICROPROFILE_SCOPE(OpenGL_Shader); MICROPROFILE_SCOPE(OpenGL_Shader);
auto& gpu = system.GPU().Maxwell3D(); auto& gpu = system.GPU().Maxwell3D();
std::size_t num_ssbos = 0;
u32 clip_distances = 0; u32 clip_distances = 0;
for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
@ -261,6 +275,14 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
Shader shader{shader_cache.GetStageProgram(program)}; Shader shader{shader_cache.GetStageProgram(program)};
if (device.UseAssemblyShaders()) {
// Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this
// all stages share the same bindings.
const std::size_t num_stage_ssbos = shader->GetEntries().global_memory_entries.size();
ASSERT_MSG(num_stage_ssbos == 0 || num_ssbos == 0, "SSBOs on more than one stage");
num_ssbos += num_stage_ssbos;
}
// Stage indices are 0 - 5 // Stage indices are 0 - 5
const std::size_t stage = index == 0 ? 0 : index - 1; const std::size_t stage = index == 0 ? 0 : index - 1;
SetupDrawConstBuffers(stage, shader); SetupDrawConstBuffers(stage, shader);
@ -526,6 +548,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
SyncFramebufferSRGB(); SyncFramebufferSRGB();
buffer_cache.Acquire(); buffer_cache.Acquire();
current_cbuf = 0;
std::size_t buffer_size = CalculateVertexArraysSize(); std::size_t buffer_size = CalculateVertexArraysSize();
@ -535,9 +558,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
} }
// Uniform space for the 5 shader stages // Uniform space for the 5 shader stages
buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + buffer_size =
(sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * Common::AlignUp<std::size_t>(buffer_size, 4) +
Maxwell::MaxShaderStage; (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage;
// Add space for at least 18 constant buffers // Add space for at least 18 constant buffers
buffer_size += Maxwell::MaxConstBuffers * buffer_size += Maxwell::MaxConstBuffers *
@ -558,12 +581,14 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
} }
// Setup emulation uniform buffer. // Setup emulation uniform buffer.
GLShader::MaxwellUniformData ubo; if (!device.UseAssemblyShaders()) {
MaxwellUniformData ubo;
ubo.SetFromRegs(gpu); ubo.SetFromRegs(gpu);
const auto [buffer, offset] = const auto [buffer, offset] =
buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset, glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
static_cast<GLsizeiptr>(sizeof(ubo))); static_cast<GLsizeiptr>(sizeof(ubo)));
}
// Setup shaders and their used resources. // Setup shaders and their used resources.
texture_cache.GuardSamplers(true); texture_cache.GuardSamplers(true);
@ -635,11 +660,11 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
} }
buffer_cache.Acquire(); buffer_cache.Acquire();
current_cbuf = 0;
auto kernel = shader_cache.GetComputeKernel(code_addr); auto kernel = shader_cache.GetComputeKernel(code_addr);
SetupComputeTextures(kernel); SetupComputeTextures(kernel);
SetupComputeImages(kernel); SetupComputeImages(kernel);
program_manager.BindComputeShader(kernel->GetHandle());
const std::size_t buffer_size = const std::size_t buffer_size =
Tegra::Engines::KeplerCompute::NumConstBuffers * Tegra::Engines::KeplerCompute::NumConstBuffers *
@ -652,6 +677,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
buffer_cache.Unmap(); buffer_cache.Unmap();
const auto& launch_desc = system.GPU().KeplerCompute().launch_description; const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
program_manager.BindCompute(kernel->GetHandle());
glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
++num_queued_commands; ++num_queued_commands;
} }
@ -812,14 +838,20 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
} }
void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) { void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) {
static constexpr std::array PARAMETER_LUT = {
GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV};
MICROPROFILE_SCOPE(OpenGL_UBO); MICROPROFILE_SCOPE(OpenGL_UBO);
const auto& stages = system.GPU().Maxwell3D().state.shader_stages; const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
const auto& shader_stage = stages[stage_index]; const auto& shader_stage = stages[stage_index];
u32 binding = device.GetBaseBindings(stage_index).uniform_buffer; u32 binding =
device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).uniform_buffer;
for (const auto& entry : shader->GetEntries().const_buffers) { for (const auto& entry : shader->GetEntries().const_buffers) {
const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
SetupConstBuffer(binding++, buffer, entry); SetupConstBuffer(PARAMETER_LUT[stage_index], binding++, buffer, entry);
} }
} }
@ -835,16 +867,21 @@ void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
buffer.address = config.Address(); buffer.address = config.Address();
buffer.size = config.size; buffer.size = config.size;
buffer.enabled = mask[entry.GetIndex()]; buffer.enabled = mask[entry.GetIndex()];
SetupConstBuffer(binding++, buffer, entry); SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding++, buffer, entry);
} }
} }
void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
const Tegra::Engines::ConstBufferInfo& buffer,
const ConstBufferEntry& entry) { const ConstBufferEntry& entry) {
if (!buffer.enabled) { if (!buffer.enabled) {
// Set values to zero to unbind buffers // Set values to zero to unbind buffers
glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0, if (device.UseAssemblyShaders()) {
sizeof(float)); glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
} else {
glBindBufferRange(GL_UNIFORM_BUFFER, binding,
buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
}
return; return;
} }
@ -853,9 +890,19 @@ void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::Const
const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
const auto alignment = device.GetUniformBufferAlignment(); const auto alignment = device.GetUniformBufferAlignment();
const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
device.HasFastBufferSubData()); device.HasFastBufferSubData());
if (!device.UseAssemblyShaders()) {
glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size); glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
return;
}
if (offset != 0) {
const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
cbuf = staging_cbuf;
offset = 0;
}
glBindBufferRangeNV(stage, binding, cbuf, offset, size);
} }
void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) { void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
@ -863,7 +910,8 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad
auto& memory_manager{gpu.MemoryManager()}; auto& memory_manager{gpu.MemoryManager()};
const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer; u32 binding =
device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
for (const auto& entry : shader->GetEntries().global_memory_entries) { for (const auto& entry : shader->GetEntries().global_memory_entries) {
const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset}; const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)}; const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)};

View File

@ -56,8 +56,8 @@ struct DrawParameters;
class RasterizerOpenGL : public VideoCore::RasterizerAccelerated { class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
public: public:
explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
ScreenInfo& info, GLShader::ProgramManager& program_manager, const Device& device, ScreenInfo& info,
StateTracker& state_tracker); ProgramManager& program_manager, StateTracker& state_tracker);
~RasterizerOpenGL() override; ~RasterizerOpenGL() override;
void Draw(bool is_indexed, bool is_instanced) override; void Draw(bool is_indexed, bool is_instanced) override;
@ -106,7 +106,7 @@ private:
void SetupComputeConstBuffers(const Shader& kernel); void SetupComputeConstBuffers(const Shader& kernel);
/// Configures a constant buffer. /// Configures a constant buffer.
void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
const ConstBufferEntry& entry); const ConstBufferEntry& entry);
/// Configures the current global memory entries to use for the draw command. /// Configures the current global memory entries to use for the draw command.
@ -224,7 +224,7 @@ private:
void SetupShaders(GLenum primitive_mode); void SetupShaders(GLenum primitive_mode);
const Device device; const Device& device;
TextureCacheOpenGL texture_cache; TextureCacheOpenGL texture_cache;
ShaderCacheOpenGL shader_cache; ShaderCacheOpenGL shader_cache;
@ -236,7 +236,7 @@ private:
Core::System& system; Core::System& system;
ScreenInfo& screen_info; ScreenInfo& screen_info;
GLShader::ProgramManager& program_manager; ProgramManager& program_manager;
StateTracker& state_tracker; StateTracker& state_tracker;
static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
@ -248,6 +248,12 @@ private:
std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
enabled_transform_feedback_buffers; enabled_transform_feedback_buffers;
static constexpr std::size_t NUM_CONSTANT_BUFFERS =
Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
std::size_t current_cbuf = 0;
/// Number of commands queued to the OpenGL driver. Reseted on flush. /// Number of commands queued to the OpenGL driver. Reseted on flush.
std::size_t num_queued_commands = 0; std::size_t num_queued_commands = 0;

View File

@ -125,6 +125,15 @@ void OGLProgram::Release() {
handle = 0; handle = 0;
} }
/// Deletes the held assembly program, if there is one, and resets the handle to zero.
/// Calling this on an empty object (handle == 0) does nothing.
void OGLAssemblyProgram::Release() {
    if (handle != 0) {
        MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
        glDeleteProgramsARB(1, &handle);
        // Mark the object as empty so a later Release() or the destructor is a no-op.
        handle = 0;
    }
}
void OGLPipeline::Create() { void OGLPipeline::Create() {
if (handle != 0) if (handle != 0)
return; return;

View File

@ -167,6 +167,22 @@ public:
GLuint handle = 0; GLuint handle = 0;
}; };
class OGLAssemblyProgram : private NonCopyable {
public:
OGLAssemblyProgram() = default;
OGLAssemblyProgram(OGLAssemblyProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
~OGLAssemblyProgram() {
Release();
}
/// Deletes the internal OpenGL resource
void Release();
GLuint handle = 0;
};
class OGLPipeline : private NonCopyable { class OGLPipeline : private NonCopyable {
public: public:
OGLPipeline() = default; OGLPipeline() = default;

View File

@ -97,6 +97,24 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) {
return {}; return {};
} }
/// Maps a shader stage to the matching GL_*_PROGRAM_NV assembly program target.
/// @param shader_type Stage to translate.
/// @return The program target enum, or a value-initialised GLenum when the stage is not one of
///         the handled enumerators.
constexpr GLenum AssemblyEnum(ShaderType shader_type) {
    GLenum target{};
    switch (shader_type) {
    case ShaderType::Vertex:
        target = GL_VERTEX_PROGRAM_NV;
        break;
    case ShaderType::TesselationControl:
        target = GL_TESS_CONTROL_PROGRAM_NV;
        break;
    case ShaderType::TesselationEval:
        target = GL_TESS_EVALUATION_PROGRAM_NV;
        break;
    case ShaderType::Geometry:
        target = GL_GEOMETRY_PROGRAM_NV;
        break;
    case ShaderType::Fragment:
        target = GL_FRAGMENT_PROGRAM_NV;
        break;
    case ShaderType::Compute:
        target = GL_COMPUTE_PROGRAM_NV;
        break;
    }
    return target;
}
std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) { std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) {
return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier); return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier);
} }
@ -120,18 +138,43 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) {
return registry; return registry;
} }
std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type, ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier,
u64 unique_identifier, const ShaderIR& ir, const ShaderIR& ir, const Registry& registry,
const Registry& registry, bool hint_retrievable = false) { bool hint_retrievable = false) {
const std::string shader_id = MakeShaderID(unique_identifier, shader_type); const std::string shader_id = MakeShaderID(unique_identifier, shader_type);
LOG_INFO(Render_OpenGL, "{}", shader_id); LOG_INFO(Render_OpenGL, "{}", shader_id);
auto program = std::make_shared<ProgramHandle>();
if (device.UseAssemblyShaders()) {
const std::string arb = "Not implemented";
GLuint& arb_prog = program->assembly_program.handle;
// Commented out functions signal OpenGL errors but are compatible with apitrace.
// Use them only to capture and replay on apitrace.
#if 0
glGenProgramsNV(1, &arb_prog);
glLoadProgramNV(AssemblyEnum(shader_type), arb_prog, static_cast<GLsizei>(arb.size()),
reinterpret_cast<const GLubyte*>(arb.data()));
#else
glGenProgramsARB(1, &arb_prog);
glNamedProgramStringEXT(arb_prog, AssemblyEnum(shader_type), GL_PROGRAM_FORMAT_ASCII_ARB,
static_cast<GLsizei>(arb.size()), arb.data());
#endif
const auto err = reinterpret_cast<const char*>(glGetString(GL_PROGRAM_ERROR_STRING_NV));
if (err && *err) {
LOG_CRITICAL(Render_OpenGL, "{}", err);
LOG_INFO(Render_OpenGL, "\n{}", arb);
}
} else {
const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id); const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
OGLShader shader; OGLShader shader;
shader.Create(glsl.c_str(), GetGLShaderType(shader_type)); shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
auto program = std::make_shared<OGLProgram>(); program->source_program.Create(true, hint_retrievable, shader.handle);
program->Create(true, hint_retrievable, shader.handle); }
return program; return program;
} }
@ -153,15 +196,22 @@ std::unordered_set<GLenum> GetSupportedFormats() {
CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
std::shared_ptr<VideoCommon::Shader::Registry> registry, std::shared_ptr<VideoCommon::Shader::Registry> registry,
ShaderEntries entries, std::shared_ptr<OGLProgram> program) ShaderEntries entries, ProgramSharedPtr program_)
: RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)}, : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)},
size_in_bytes{size_in_bytes}, program{std::move(program)} {} size_in_bytes{size_in_bytes}, program{std::move(program_)} {
// Assign either the assembly program or source program. We can't have both.
handle = program->assembly_program.handle;
if (handle == 0) {
handle = program->source_program.handle;
}
ASSERT(handle != 0);
}
CachedShader::~CachedShader() = default; CachedShader::~CachedShader() = default;
GLuint CachedShader::GetHandle() const { GLuint CachedShader::GetHandle() const {
DEBUG_ASSERT(registry->IsConsistent()); DEBUG_ASSERT(registry->IsConsistent());
return program->handle; return handle;
} }
Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
@ -239,7 +289,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
return; return;
} }
const std::vector gl_cache = disk_cache.LoadPrecompiled(); std::vector<ShaderDiskCachePrecompiled> gl_cache;
if (!device.UseAssemblyShaders()) {
// Only load precompiled cache when we are not using assembly shaders
gl_cache = disk_cache.LoadPrecompiled();
}
const auto supported_formats = GetSupportedFormats(); const auto supported_formats = GetSupportedFormats();
// Track if precompiled cache was altered during loading to know if we have to // Track if precompiled cache was altered during loading to know if we have to
@ -278,7 +332,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
auto registry = MakeRegistry(entry); auto registry = MakeRegistry(entry);
const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry); const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry);
std::shared_ptr<OGLProgram> program; ProgramSharedPtr program;
if (precompiled_entry) { if (precompiled_entry) {
// If the shader is precompiled, attempt to load it with // If the shader is precompiled, attempt to load it with
program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats); program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats);
@ -332,6 +386,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
return; return;
} }
if (device.UseAssemblyShaders()) {
// Don't store precompiled binaries for assembly shaders.
return;
}
// TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw
// before precompiling them // before precompiling them
@ -339,7 +398,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
const u64 id = (*transferable)[i].unique_identifier; const u64 id = (*transferable)[i].unique_identifier;
const auto it = find_precompiled(id); const auto it = find_precompiled(id);
if (it == gl_cache.end()) { if (it == gl_cache.end()) {
const GLuint program = runtime_cache.at(id).program->handle; const GLuint program = runtime_cache.at(id).program->source_program.handle;
disk_cache.SavePrecompiled(id, program); disk_cache.SavePrecompiled(id, program);
precompiled_cache_altered = true; precompiled_cache_altered = true;
} }
@ -350,7 +409,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
} }
} }
std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram( ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram(
const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
const std::unordered_set<GLenum>& supported_formats) { const std::unordered_set<GLenum>& supported_formats) {
if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) { if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) {
@ -358,15 +417,15 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
return {}; return {};
} }
auto program = std::make_shared<OGLProgram>(); auto program = std::make_shared<ProgramHandle>();
program->handle = glCreateProgram(); GLuint& handle = program->source_program.handle;
glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE); handle = glCreateProgram();
glProgramBinary(program->handle, precompiled_entry.binary_format, glProgramParameteri(handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
precompiled_entry.binary.data(), glProgramBinary(handle, precompiled_entry.binary_format, precompiled_entry.binary.data(),
static_cast<GLsizei>(precompiled_entry.binary.size())); static_cast<GLsizei>(precompiled_entry.binary.size()));
GLint link_status; GLint link_status;
glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status); glGetProgramiv(handle, GL_LINK_STATUS, &link_status);
if (link_status == GL_FALSE) { if (link_status == GL_FALSE) {
LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing"); LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing");
return {}; return {};

View File

@ -43,8 +43,14 @@ struct UnspecializedShader;
using Shader = std::shared_ptr<CachedShader>; using Shader = std::shared_ptr<CachedShader>;
using Maxwell = Tegra::Engines::Maxwell3D::Regs; using Maxwell = Tegra::Engines::Maxwell3D::Regs;
/// Bundles the two kinds of host program a cached shader can be backed by. BuildShader() fills
/// in assembly_program when Device::UseAssemblyShaders() is true and source_program otherwise;
/// CachedShader reads assembly_program.handle first and falls back to source_program.handle
/// when that is zero.
struct ProgramHandle {
// Program created from decompiled GLSL or loaded from a precompiled binary.
OGLProgram source_program;
// Assembly program used on the assembly shader code path.
OGLAssemblyProgram assembly_program;
};
/// Shared-ownership pointer to a ProgramHandle, as stored by CachedShader and PrecompiledShader.
using ProgramSharedPtr = std::shared_ptr<ProgramHandle>;
struct PrecompiledShader { struct PrecompiledShader {
std::shared_ptr<OGLProgram> program; ProgramSharedPtr program;
std::shared_ptr<VideoCommon::Shader::Registry> registry; std::shared_ptr<VideoCommon::Shader::Registry> registry;
ShaderEntries entries; ShaderEntries entries;
}; };
@ -87,12 +93,13 @@ public:
private: private:
explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
std::shared_ptr<VideoCommon::Shader::Registry> registry, std::shared_ptr<VideoCommon::Shader::Registry> registry,
ShaderEntries entries, std::shared_ptr<OGLProgram> program); ShaderEntries entries, ProgramSharedPtr program);
std::shared_ptr<VideoCommon::Shader::Registry> registry; std::shared_ptr<VideoCommon::Shader::Registry> registry;
ShaderEntries entries; ShaderEntries entries;
std::size_t size_in_bytes = 0; std::size_t size_in_bytes = 0;
std::shared_ptr<OGLProgram> program; ProgramSharedPtr program;
GLuint handle = 0;
}; };
class ShaderCacheOpenGL final : public RasterizerCache<Shader> { class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
@ -115,7 +122,7 @@ protected:
void FlushObjectInner(const Shader& object) override {} void FlushObjectInner(const Shader& object) override {}
private: private:
std::shared_ptr<OGLProgram> GeneratePrecompiledProgram( ProgramSharedPtr GeneratePrecompiledProgram(
const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
const std::unordered_set<GLenum>& supported_formats); const std::unordered_set<GLenum>& supported_formats);

View File

@ -6,47 +6,107 @@
#include "common/common_types.h" #include "common/common_types.h"
#include "video_core/engines/maxwell_3d.h" #include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_shader_manager.h"
namespace OpenGL::GLShader { namespace OpenGL {
ProgramManager::ProgramManager() = default; ProgramManager::ProgramManager(const Device& device) {
use_assembly_programs = device.UseAssemblyShaders();
ProgramManager::~ProgramManager() = default; if (use_assembly_programs) {
glEnable(GL_COMPUTE_PROGRAM_NV);
void ProgramManager::Create() { } else {
graphics_pipeline.Create(); graphics_pipeline.Create();
glBindProgramPipeline(graphics_pipeline.handle); glBindProgramPipeline(graphics_pipeline.handle);
} }
}
ProgramManager::~ProgramManager() = default;
/// Binds a compute program for the next dispatch.
/// @param program Handle of the compute program: an assembly program name on the assembly
///                path, otherwise a program object passed to glUseProgram.
void ProgramManager::BindCompute(GLuint program) {
    if (!use_assembly_programs) {
        // glUseProgram overrides the bound pipeline, so remember that graphics has to be
        // re-established by UpdateSourcePrograms() before the next draw.
        is_graphics_bound = false;
        glUseProgram(program);
        return;
    }
    glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program);
}
void ProgramManager::BindGraphicsPipeline() { void ProgramManager::BindGraphicsPipeline() {
if (use_assembly_programs) {
UpdateAssemblyPrograms();
} else {
UpdateSourcePrograms();
}
}
/// Binds a host-owned program pipeline.
/// @param pipeline Program pipeline object to bind.
void ProgramManager::BindHostPipeline(GLuint pipeline) {
    // On the assembly path, switch off an enabled geometry program first and forget the
    // tracked geometry handle, so UpdateAssemblyPrograms() sees a change and rebinds it later.
    // NOTE(review): presumably this keeps a guest geometry program from affecting host draws;
    // confirm.
    if (use_assembly_programs && geometry_enabled) {
        glDisable(GL_GEOMETRY_PROGRAM_NV);
        old_state.geometry = 0;
        geometry_enabled = false;
    }
    glBindProgramPipeline(pipeline);
}
void ProgramManager::RestoreGuestPipeline() {
if (use_assembly_programs) {
glBindProgramPipeline(0);
} else {
glBindProgramPipeline(graphics_pipeline.handle);
}
}
/// Applies the vertex, geometry and fragment assembly programs in current_state to OpenGL.
/// Only stages whose handle differs from old_state are touched; afterwards old_state is
/// replaced with current_state.
void ProgramManager::UpdateAssemblyPrograms() {
    // Synchronises one stage: enables and binds a non-zero program, or disables the stage when
    // the wanted handle is zero. The enable flag tracks the state we last set for the target.
    const auto sync_stage = [](GLenum target, bool& is_enabled, GLuint wanted, GLuint previous) {
        if (wanted == previous) {
            return;
        }
        if (wanted != 0) {
            if (!is_enabled) {
                is_enabled = true;
                glEnable(target);
            }
            glBindProgramARB(target, wanted);
            return;
        }
        if (is_enabled) {
            is_enabled = false;
            glDisable(target);
        }
    };

    sync_stage(GL_VERTEX_PROGRAM_NV, vertex_enabled, current_state.vertex, old_state.vertex);
    sync_stage(GL_GEOMETRY_PROGRAM_NV, geometry_enabled, current_state.geometry,
               old_state.geometry);
    sync_stage(GL_FRAGMENT_PROGRAM_NV, fragment_enabled, current_state.fragment,
               old_state.fragment);

    old_state = current_state;
}
void ProgramManager::UpdateSourcePrograms() {
if (!is_graphics_bound) { if (!is_graphics_bound) {
is_graphics_bound = true; is_graphics_bound = true;
glUseProgram(0); glUseProgram(0);
} }
// Avoid updating the pipeline when values have not changed const GLuint handle = graphics_pipeline.handle;
if (old_state == current_state) { const auto update_state = [handle](GLenum stage, GLuint current, GLuint old) {
if (current == old) {
return; return;
} }
glUseProgramStages(handle, stage, current);
// Workaround for AMD bug };
static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT | update_state(GL_VERTEX_SHADER_BIT, current_state.vertex, old_state.vertex);
GL_FRAGMENT_SHADER_BIT}; update_state(GL_GEOMETRY_SHADER_BIT, current_state.geometry, old_state.geometry);
const GLuint handle = graphics_pipeline.handle; update_state(GL_FRAGMENT_SHADER_BIT, current_state.fragment, old_state.fragment);
glUseProgramStages(handle, all_used_stages, 0);
glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader);
glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader);
glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader);
old_state = current_state; old_state = current_state;
} }
/// Binds a compute shader: clears is_graphics_bound and makes `program` the current program
/// object with glUseProgram.
/// @param program OpenGL handle of the compute program to bind.
void ProgramManager::BindComputeShader(GLuint program) {
is_graphics_bound = false;
glUseProgram(program);
}
void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) { void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
const auto& regs = maxwell.regs; const auto& regs = maxwell.regs;
@ -54,4 +114,4 @@ void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f; y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f;
} }
} // namespace OpenGL::GLShader } // namespace OpenGL

View File

@ -11,7 +11,9 @@
#include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/maxwell_to_gl.h" #include "video_core/renderer_opengl/maxwell_to_gl.h"
namespace OpenGL::GLShader { namespace OpenGL {
class Device;
/// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned /// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned
/// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at /// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
@ -28,50 +30,58 @@ static_assert(sizeof(MaxwellUniformData) < 16384,
class ProgramManager { class ProgramManager {
public: public:
explicit ProgramManager(); explicit ProgramManager(const Device& device);
~ProgramManager(); ~ProgramManager();
void Create(); /// Binds a compute program
void BindCompute(GLuint program);
/// Updates the graphics pipeline and binds it. /// Updates bound programs.
void BindGraphicsPipeline(); void BindGraphicsPipeline();
/// Binds a compute shader. /// Binds an OpenGL pipeline object unsynchronized with the guest state.
void BindComputeShader(GLuint program); void BindHostPipeline(GLuint pipeline);
/// Rewinds BindHostPipeline state changes.
void RestoreGuestPipeline();
void UseVertexShader(GLuint program) { void UseVertexShader(GLuint program) {
current_state.vertex_shader = program; current_state.vertex = program;
} }
void UseGeometryShader(GLuint program) { void UseGeometryShader(GLuint program) {
current_state.geometry_shader = program; current_state.geometry = program;
} }
void UseFragmentShader(GLuint program) { void UseFragmentShader(GLuint program) {
current_state.fragment_shader = program; current_state.fragment = program;
} }
private: private:
struct PipelineState { struct PipelineState {
bool operator==(const PipelineState& rhs) const noexcept { GLuint vertex = 0;
return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader && GLuint geometry = 0;
geometry_shader == rhs.geometry_shader; GLuint fragment = 0;
}
bool operator!=(const PipelineState& rhs) const noexcept {
return !operator==(rhs);
}
GLuint vertex_shader = 0;
GLuint fragment_shader = 0;
GLuint geometry_shader = 0;
}; };
/// Update NV_gpu_program5 programs.
void UpdateAssemblyPrograms();
/// Update GLSL programs.
void UpdateSourcePrograms();
OGLPipeline graphics_pipeline; OGLPipeline graphics_pipeline;
OGLPipeline compute_pipeline;
PipelineState current_state; PipelineState current_state;
PipelineState old_state; PipelineState old_state;
bool use_assembly_programs = false;
bool is_graphics_bound = true; bool is_graphics_bound = true;
bool vertex_enabled = false;
bool geometry_enabled = false;
bool fragment_enabled = false;
}; };
} // namespace OpenGL::GLShader } // namespace OpenGL

View File

@ -316,7 +316,7 @@ public:
RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system, RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system,
Core::Frontend::GraphicsContext& context) Core::Frontend::GraphicsContext& context)
: RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context}, : RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context},
has_debug_tool{HasDebugTool()} {} program_manager{device}, has_debug_tool{HasDebugTool()} {}
RendererOpenGL::~RendererOpenGL() = default; RendererOpenGL::~RendererOpenGL() = default;
@ -468,8 +468,9 @@ void RendererOpenGL::InitOpenGLObjects() {
vertex_program.Create(true, false, vertex_shader.handle); vertex_program.Create(true, false, vertex_shader.handle);
fragment_program.Create(true, false, fragment_shader.handle); fragment_program.Create(true, false, fragment_shader.handle);
// Create program pipeline pipeline.Create();
program_manager.Create(); glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex_program.handle);
glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment_program.handle);
// Generate VBO handle for drawing // Generate VBO handle for drawing
vertex_buffer.Create(); vertex_buffer.Create();
@ -508,7 +509,7 @@ void RendererOpenGL::CreateRasterizer() {
if (rasterizer) { if (rasterizer) {
return; return;
} }
rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info, rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, device, screen_info,
program_manager, state_tracker); program_manager, state_tracker);
} }
@ -620,10 +621,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
state_tracker.NotifyClipControl(); state_tracker.NotifyClipControl();
state_tracker.NotifyAlphaTest(); state_tracker.NotifyAlphaTest();
program_manager.UseVertexShader(vertex_program.handle); program_manager.BindHostPipeline(pipeline.handle);
program_manager.UseGeometryShader(0);
program_manager.UseFragmentShader(fragment_program.handle);
program_manager.BindGraphicsPipeline();
glEnable(GL_CULL_FACE); glEnable(GL_CULL_FACE);
if (screen_info.display_srgb) { if (screen_info.display_srgb) {
@ -665,6 +663,8 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
glClear(GL_COLOR_BUFFER_BIT); glClear(GL_COLOR_BUFFER_BIT);
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
program_manager.RestoreGuestPipeline();
} }
bool RendererOpenGL::TryPresent(int timeout_ms) { bool RendererOpenGL::TryPresent(int timeout_ms) {

View File

@ -9,6 +9,7 @@
#include "common/common_types.h" #include "common/common_types.h"
#include "common/math_util.h" #include "common/math_util.h"
#include "video_core/renderer_base.h" #include "video_core/renderer_base.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_state_tracker.h"
@ -95,6 +96,7 @@ private:
Core::Frontend::EmuWindow& emu_window; Core::Frontend::EmuWindow& emu_window;
Core::System& system; Core::System& system;
Core::Frontend::GraphicsContext& context; Core::Frontend::GraphicsContext& context;
const Device device;
StateTracker state_tracker{system}; StateTracker state_tracker{system};
@ -102,13 +104,14 @@ private:
OGLBuffer vertex_buffer; OGLBuffer vertex_buffer;
OGLProgram vertex_program; OGLProgram vertex_program;
OGLProgram fragment_program; OGLProgram fragment_program;
OGLPipeline pipeline;
OGLFramebuffer screenshot_framebuffer; OGLFramebuffer screenshot_framebuffer;
/// Display information for Switch screen /// Display information for Switch screen
ScreenInfo screen_info; ScreenInfo screen_info;
/// Global dummy shader pipeline /// Global dummy shader pipeline
GLShader::ProgramManager program_manager; ProgramManager program_manager;
/// OpenGL framebuffer data /// OpenGL framebuffer data
std::vector<u8> gl_framebuffer_data; std::vector<u8> gl_framebuffer_data;