OpenGLコンピュートシェーダーでバッファーを正しく使用する

Aug 19 2020

パフォーマンスを最大化するために、行列/ベクトル演算を使用して最初に作成したアルゴリズムをOpenGLカーネルに書き直しています。

OpenGLの基本的な知識はあるので動作させることはできましたが、OpenGLが提供するさまざまな選択肢、特にバッファーのパラメーターの選び方でかなり苦労しています。大量のデータを読み書きする私のケースでは、これらのパラメーターが大きな影響を与えると思われます。

私は3つのカーネルを順番に呼び出します：

最初：

/* Generated constants (for all three shaders): 
 *   #version 430
 *   const vec3 orig
 *   const float vx
 *   const ivec2 size
 *   const uint projections
 *   const uint subIterations
 */

// First kernel.
// Each invocation reads one entry of `layer` (addressed by the x/y components
// of gl_GlobalInvocationID), transforms the corresponding point with every
// projection matrix of the current sub-iteration, and atomically accumulates
// the layer value into the texel of `bufferProjection` the point lands on.
// Presumably a forward-projection step of an iterative reconstruction -- confirm.
layout(local_size_x = 1, local_size_y = 1) in;

layout(std430, binding = 0) buffer bufferA { //GL_SHADER_STORAGE_BUFFER, GL_DYNAMIC_READ
    uint bufferProjection[]; //Written and read (AtomicAdd) by this shader, read by the second kernel
};
layout(std430, binding = 1) readonly buffer bufferB { //GL_SHADER_STORAGE_BUFFER, GL_DYNAMIC_READ
    uint layer[]; //Written and read by the third kernel, read by this shader and by glGetNamedBufferSubData
};
layout(std140) uniform bufferMat { //GL_UNIFORM_BUFFER, GL_STATIC_DRAW
    mat4 proj_mat[projections*subIterations]; //Read only by this shader and the third
};
layout(location = 0) uniform int z;     // Second coordinate of the processed point (see layer_coords)
layout(location = 1) uniform int subit; // Offset into proj_mat selecting the current sub-iteration

void main() {
    // Point position: origin plus the integer grid position scaled by vx.
    vec4 layer_coords = vec4(orig,1.0) + vec4(gl_GlobalInvocationID.x, z, gl_GlobalInvocationID.y, 0.0)*vx;
    uint val = layer[gl_GlobalInvocationID.y*size.x + gl_GlobalInvocationID.x];
    for(int i = 0; i < projections; ++i) {
        // Matrices are stored interleaved: projection i of sub-iteration subit.
        vec4 proj_coords = proj_mat[subit+i*subIterations]*layer_coords;
        // Perspective divide, then map to integer texel coordinates centred on size/2.
        ivec2 tex_coords = ivec2(floor((proj_coords.xy*size)/(2.0*proj_coords.w)) + size/2);
        bool valid = all(greaterThanEqual(tex_coords, ivec2(0,0))) && all(lessThan(tex_coords, size));
        // Only touch the buffer for in-range texels. Previously an atomicAdd of 0
        // was still issued for out-of-range coordinates, which indexes outside
        // projection i's slice (or outside the buffer entirely, since a negative
        // index wraps to a huge uint) -- undefined without robust buffer access.
        if (valid) {
            atomicAdd(bufferProjection[tex_coords.y*size.x+tex_coords.x+i*(size.x*size.y)], val);
        }
    }
}

2番目：

// Second kernel.
// For every projection, divides the entry of `originalProjection` by the
// matching entry of `bufferProjection` and stores the quotient in
// `updateProjection`. One invocation handles one texel index
// (gl_GlobalInvocationID.x) across all projections.
layout(local_size_x = 1, local_size_y = 1) in;

layout(std430, binding = 0) buffer bufferA { // GL_SHADER_STORAGE_BUFFER, GL_DYNAMIC_READ
    float updateProjection[]; // Output of this shader; consumed by the third kernel
};
layout(std430, binding = 1) readonly buffer bufferB { // GL_SHADER_STORAGE_BUFFER, GL_DYNAMIC_READ
    uint bufferProjection[]; // Produced by the first kernel; input here
};
layout(std430, binding = 2) readonly buffer bufferC { // GL_SHADER_STORAGE_BUFFER, GL_DYNAMIC_READ
    uint originalProjection[]; // Uploaded only via glBufferSubData; input here
};

void main() {
    for(int p = 0; p < projections; ++p) {
        // Same flat index into all three buffers: texel offset within projection p.
        uint texel = gl_GlobalInvocationID.x+p*(size.x*size.y);
        float numerator = float(originalProjection[texel]);
        float denominator = float(bufferProjection[texel]);
        // NOTE(review): a zero entry in bufferProjection is not guarded against here.
        updateProjection[texel] = numerator/denominator;
    }
}

3番目：

// Third kernel.
// Each invocation projects its point with every projection matrix of the
// current sub-iteration, sums the `updateProjection` values found at the
// in-range texels, and rewrites its `layer` entry as
// pow(layer * (sum / projections), weight), truncated to uint.
layout(local_size_x = 1, local_size_y = 1) in;

layout(std430, binding = 0) readonly buffer bufferA { // GL_SHADER_STORAGE_BUFFER, GL_DYNAMIC_READ
    float updateProjection[]; // Produced by the second kernel; input here
};
layout(std430, binding = 1) buffer bufferB { // GL_SHADER_STORAGE_BUFFER, GL_DYNAMIC_READ
    uint layer[]; // Read and rewritten here; also read by the first kernel and by glGetNamedBufferSubData
};
layout(std140) uniform bufferMat { // GL_UNIFORM_BUFFER, GL_STATIC_DRAW
    mat4 proj_mat[projections*subIterations]; // Read only, by this shader and the first
};
layout(location = 0) uniform int z;        // Second coordinate of the processed point
layout(location = 1) uniform int subit;    // Offset into proj_mat selecting the current sub-iteration
layout(location = 2) uniform float weight; // Exponent applied to the updated value

void main() {
    // Flat index of this invocation's entry in `layer`.
    uint cell = gl_GlobalInvocationID.y*size.x + gl_GlobalInvocationID.x;

    // Point position: origin plus the integer grid position scaled by vx.
    vec4 point = vec4(orig,1.0) + vec4(gl_GlobalInvocationID.x, z, gl_GlobalInvocationID.y, 0.0)*vx;

    float sum = 0;
    for(int p = 0; p < projections; ++p) {
        vec4 projected = proj_mat[subit+p*subIterations]*point;
        // Perspective divide, then map to integer texel coordinates centred on size/2.
        ivec2 texel = ivec2(floor((projected.xy*size)/(2.0*projected.w)) + size/2);
        bool inside = all(greaterThanEqual(texel, ivec2(0,0))) && all(lessThan(texel, size));
        // The ternary evaluates only the selected operand, so the buffer is
        // not read for out-of-range texels; they contribute 0.
        float contribution = inside?updateProjection[texel.y*size.x+texel.x+p*(size.x*size.y)]:0;
        sum += contribution;
    }

    // Scale the current value by the mean correction, apply the weight exponent, store back.
    float updated = pow(float(layer[cell])*(sum/projections), weight);
    layer[cell] = uint(updated);
}

OpenGLドキュメントを読んで思いついたもの：

アルゴリズムのすべての期間で同じ値は、シェーダーをコンパイルする前にconstとして生成されます。forループ境界に特に役立ちます
他のバッファーと比較して非常に小さいbufferMatは、SSBOよりも優れたパフォーマンスを持つはずのUBOに配置されます。コンパイル時定数にすることで、さらにパフォーマンスを向上させることができますか？小さいとはいえ、それでも数百個のmat4です
他のバッファは、読み取りと書き込みの両方が何度か行われるため、SSBOにするのがよいはずです。
バッファの「usage」パラメータに最適な値を理解するのに苦労しています。すべてのバッファが何度か書き込まれ、読み取られているので、ここに何を置くべきかわかりません。
私が正しく理解していれば、local_sizeは呼び出し間でデータを共有する場合にのみ役立つので、1のままにしておくべきでしょうか？

それらのカーネルを最適化するためにどこを見ればよいかについての推奨事項やヒントを喜んで受け取ります！

回答

1 NicolBolas Aug 19 2020 at 21:51

コンパイル時定数にすることで、さらにパフォーマンスを向上させることができますか？

プロファイルする必要があります。そうは言っても、「数百mat4」は「小さい」ではありません。

バッファの「usage」パラメータに最適な値を理解するのに苦労しています。すべてのバッファが何度か書き込まれ、読み取られているので、ここに何を置くべきかわかりません。

まず、usageパラメータは、バッファオブジェクトをあなたがどう使用するかに関するものであり、その背後にあるメモリをOpenGLがどう使用するかに関するものではありません。つまり、glBufferSubDataやglMapBufferRangeなどの関数について述べているのです。READは、CPUがバッファから読み取るが、書き込みは行わないことを意味します。DRAWは、CPUがバッファに書き込むが、バッファからは読み取らないことを意味します。

第二に...あなたは本当に気にするべきではありません。使用上のヒントはひどく、指定が不十分であり、誤用されているため、多くの実装では完全に無視されています。NVIDIAのGL実装は、おそらくそれらを最も真剣に受け止めているものです。

代わりに、イミュータブルストレージのバッファを使用してください。そこでの「使用上のヒント」はヒントではなく、API要件です。GL_DYNAMIC_STORAGE_BITを指定しない場合、glBufferSubDataを介してバッファに書き込むことはできません。他のフラグについても同様です。

私が正しく理解していれば、local_sizeは呼び出し間でデータを共有する場合にのみ役立つので、1のままにしておくべきでしょうか？

いいえ。実際には、1は決して使わないでください。すべての呼び出しが実行バリアなどを使わずにそれぞれ独立して処理を行っている場合は、対象ハードウェアのウェーブフロントサイズと同等のローカルサイズを選択すべきです。もちろんそれは実装に依存しますが、32は間違いなく1よりも良いデフォルトです。