astc_decoder: Combine FastReplicate functions to work around new NV driver bug

The new Nvidia drivers have a bug where the FastReplicateTo6 function ends up indexing the REPLICATE_*_BIT_TO_8_TABLE lookup tables rather than the REPLICATE_*_BIT_TO_6_TABLE lookup tables.

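The wrong lookup appears to be a driver-side optimization gone wrong. Combining the logic of FastReplicateTo6 and FastReplicateTo8 into a single FastReplicate(value, num_bits, to_bit) function seems to work around the bug.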
This commit is contained in:
ameerj 2022-01-16 15:52:34 -05:00
parent 480b03b645
commit a5bff8e9b3
1 changed file with 46 additions and 34 deletions

View File

@@ -155,9 +155,6 @@ uint SwizzleOffset(uvec2 pos) {
// Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)]
// is the same as [(num_bits - 1):0] and repeats all the way down. // is the same as [(num_bits - 1):0] and repeats all the way down.
uint Replicate(uint val, uint num_bits, uint to_bit) { uint Replicate(uint val, uint num_bits, uint to_bit) {
if (num_bits == 0 || to_bit == 0) {
return 0;
}
const uint v = val & uint((1 << num_bits) - 1); const uint v = val & uint((1 << num_bits) - 1);
uint res = v; uint res = v;
uint reslen = num_bits; uint reslen = num_bits;
@@ -187,42 +184,57 @@ uint ReplicateBitTo9(uint value) {
return REPLICATE_1_BIT_TO_9_TABLE[value]; return REPLICATE_1_BIT_TO_9_TABLE[value];
} }
uint FastReplicateTo8(uint value, uint num_bits) { uint FastReplicate(uint value, uint num_bits, uint to_bit) {
switch (num_bits) { if (num_bits == 0) {
case 1: return 0;
return REPLICATE_1_BIT_TO_8_TABLE[value]; }
case 2: if (num_bits == to_bit) {
return REPLICATE_2_BIT_TO_8_TABLE[value];
case 3:
return REPLICATE_3_BIT_TO_8_TABLE[value];
case 4:
return REPLICATE_4_BIT_TO_8_TABLE[value];
case 5:
return REPLICATE_5_BIT_TO_8_TABLE[value];
case 6:
return REPLICATE_6_BIT_TO_8_TABLE[value];
case 7:
return REPLICATE_7_BIT_TO_8_TABLE[value];
case 8:
return value; return value;
} }
return Replicate(value, num_bits, 8); if (to_bit == 6) {
switch (num_bits) {
case 1:
return REPLICATE_1_BIT_TO_6_TABLE[value];
case 2:
return REPLICATE_2_BIT_TO_6_TABLE[value];
case 3:
return REPLICATE_3_BIT_TO_6_TABLE[value];
case 4:
return REPLICATE_4_BIT_TO_6_TABLE[value];
case 5:
return REPLICATE_5_BIT_TO_6_TABLE[value];
default:
break;
}
} else { /* if (to_bit == 8) */
switch (num_bits) {
case 1:
return REPLICATE_1_BIT_TO_8_TABLE[value];
case 2:
return REPLICATE_2_BIT_TO_8_TABLE[value];
case 3:
return REPLICATE_3_BIT_TO_8_TABLE[value];
case 4:
return REPLICATE_4_BIT_TO_8_TABLE[value];
case 5:
return REPLICATE_5_BIT_TO_8_TABLE[value];
case 6:
return REPLICATE_6_BIT_TO_8_TABLE[value];
case 7:
return REPLICATE_7_BIT_TO_8_TABLE[value];
default:
break;
}
}
return Replicate(value, num_bits, to_bit);
}
uint FastReplicateTo8(uint value, uint num_bits) {
return FastReplicate(value, num_bits, 8);
} }
uint FastReplicateTo6(uint value, uint num_bits) { uint FastReplicateTo6(uint value, uint num_bits) {
switch (num_bits) { return FastReplicate(value, num_bits, 6);
case 1:
return REPLICATE_1_BIT_TO_6_TABLE[value];
case 2:
return REPLICATE_2_BIT_TO_6_TABLE[value];
case 3:
return REPLICATE_3_BIT_TO_6_TABLE[value];
case 4:
return REPLICATE_4_BIT_TO_6_TABLE[value];
case 5:
return REPLICATE_5_BIT_TO_6_TABLE[value];
}
return Replicate(value, num_bits, 6);
} }
uint Div3Floor(uint v) { uint Div3Floor(uint v) {