The code below is supposed to calculate total brightness of tsrc texture.
The good thing is HD Graphics 4600 calculates it just fine. The bad thing is GTX 980 does not.
The values I read from read-back buffer fluctuate wildly, but they seem to stay below correct value.
I took the code for atomic addition of float values from this thread http://www.gamedev.net/topic/613648-dx11-interlockedadd-on-floats-in-pixel-shader-workaround/
I have no idea what's going on. Thanks in advance.
EDIT: 'globallycoherent' doesn't work. Using 'InterlockedAdd' and summing uint's doesn't work.
#define TotalGroups 32
#define RSDT "RootFlags(0), UAV(u0), DescriptorTable(SRV(t0))" // Descriptor table is required for texture
Texture2D<float4> tsrc: register(t0);
RWByteAddressBuffer total : register(u0);
groupshared float4 bpacked[TotalGroups*TotalGroups];
float brightness(float4 cl) {
// TODO: replace with correct implementation
return cl.r + cl.g + cl.b;
}
[RootSignature(RSDT)]
[numthreads(TotalGroups,TotalGroups,1)]
void CSTotal(uint3 gtid: SV_GroupThreadId, uint3 gid : SV_GroupId, uint gindex : SV_GroupIndex, uint3 dtid : SV_DispatchThreadID) {
uint2 crd = (gid.xy * TotalGroups + gtid.xy)*2;
float br[4];
[unroll]
for (uint x = 0; x < 2; ++x) {
[unroll]
for (uint y = 0; y < 2; ++y) {
// Color outside of tsrc is guarantied to be 0.
br[y * 2 + x] = brightness(tsrc[crd+uint2(x,y)]);
}
}
bpacked[gindex] = float4(br[0],br[1],br[2],br[3]);
if (all(dtid == uint3(0, 0, 0))) {
? // set initial value of total brightness accumulator
total.Store(0, asuint(0.0));
};
AllMemoryBarrierWithGroupSync();
? // bpacked array now contains brightness in each component of each value
// reduce bpacked to single value
[unroll]
for (uint thres = TotalGroups*TotalGroups / 2; thres > 0; thres /= 2) {
if (gindex < thres) {
bpacked[gindex] += bpacked[gindex + thres];
}
AllMemoryBarrierWithGroupSync();
}
if (gindex == 0) {
float4 cl = bpacked[0];
float value = cl.r + cl.g + cl.b + cl.a;
// First thread in thread group atomically adds calculated brightness to the accumulator
uint comp, orig = total.Load(0);
[allow_uav_condition]do
{
comp = orig;
total.InterlockedCompareExchange(0, comp, asuint(asfloat(orig) + value), orig);
} while (orig != comp);
}
}
Invocation of compute shader is written in Rust. But it should be sufficiently readable.
I'm sure that Rust bindings to D3D12 are not the cause for the problem. I work with them for months without problems.
let src_desc = srv_tex2d_default_slice_mip(srcdesc.Format, 0, 1);
core.dev.create_shader_resource_view(Some(&src), Some(&src_desc), res.total_dheap.cpu_handle(0));
clist.set_pipeline_state(&self.total_cpso);
clist.set_compute_root_signature(&self.total_rs);
clist.set_descriptor_heaps(&[res.total_dheap.get()]);
clist.set_compute_root_descriptor_table(1, res.total_dheap.gpu_handle(0));
clist.set_compute_root_unordered_access_view(0, res.rw_total.get_gpu_virtual_address());
clist.resource_barrier(&[
*ResourceBarrier::transition(&src,
D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE),
*ResourceBarrier::transition(&res.rw_total,
D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS),
]);
clist.dispatch(cw / TOTAL_CHUNK_SIZE, ch / TOTAL_CHUNK_SIZE, 1);
clist.resource_barrier(&[
*ResourceBarrier::transition(&res.rw_total,
D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE),
]);
clist.copy_resource(&res.rb_total, &res.rw_total);
clist.resource_barrier(&[
*ResourceBarrier::transition(&res.rw_total,
D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_RESOURCE_STATE_COMMON),
]);
try!(clist.close());
core.compute_queue.execute_command_lists(&[clist]);
wait_for_compute_queue(core, &res.fence, &create_event());
let total_brightness = res.total_brightness();
let avg_brightness = total_brightness / cw as f32 / ch as f32;