@@ -130,7 +130,7 @@ __device__ __forceinline__ Tile *load_linear_tile(Tile *tile,
130
130
aligned_tile[idx * 4 + 3 ] = in.w ;
131
131
}
132
132
133
- uint32_t processed_in_main = left_after_prologue & - 4 ; // equivalent to (x / 4) * 4
133
+ uint32_t processed_in_main = left_after_prologue & ~ 0x3 ; // clears the two low bits; equivalent to (x / 4) * 4
134
134
uint32_t left_after_main = left_after_prologue - processed_in_main;
135
135
136
136
// epilogue
@@ -238,7 +238,7 @@ __device__ __forceinline__ Tile *slice_load_linear_tile(
238
238
aligned_tile[idx * 4 + 3 ] = in.w ;
239
239
}
240
240
241
- uint32_t processed_in_main = left_after_prologue & - 4 ; // equivalent to (x / 4) * 4
241
+ uint32_t processed_in_main = left_after_prologue & ~ 0x3 ; // equivalent to (x / 4) * 4
242
242
uint32_t left_after_main = left_after_prologue - processed_in_main;
243
243
244
244
// epilogue
@@ -339,7 +339,7 @@ __device__ __forceinline__ void load_planar_tile(Tile tile[][kBlockSize / kStati
339
339
tile[c][xy] = in.w ;
340
340
}
341
341
342
- uint32_t processed_in_main = left_after_prologue & - 4 ; // equivalent to (x / 4) * 4
342
+ uint32_t processed_in_main = left_after_prologue & ~ 0x3 ; // equivalent to (x / 4) * 4
343
343
uint32_t left_after_main = left_after_prologue - processed_in_main;
344
344
345
345
// epilogue
@@ -521,8 +521,6 @@ __device__ __forceinline__ void store_planar_hwc_pad(
521
521
int64_t start_x = static_cast <int64_t >(blockIdx .x - sample.first_block ) * kBlockSize ;
522
522
int64_t end_x = ::min (start_x + kBlockSize , sample.sample_size );
523
523
524
- const auto *__restrict__ fill_values = static_cast <const float16 *>(sample.fill_values );
525
-
526
524
// Preload the norm values so they are accessed via registers and not from gmem via pointer.
527
525
Compute norm_mul[kOutChannels ], norm_add[kOutChannels ];
528
526
@@ -545,7 +543,6 @@ __device__ __forceinline__ void store_planar_hwc_pad(
545
543
546
544
// TODO(klecki) in the version without mirror, we can keep one offset, as we can start the
547
545
// output pointer at the output tile.
548
- auto *out_aligned = sample.out ;
549
546
auto *out_h2 = reinterpret_cast <__half2 *>(sample.out );
550
547
uint32_t to_write = end_x_padded - start_x_padded;
551
548
@@ -726,8 +723,6 @@ __global__ void Hwc2HwcNormalizePadFp16(const Hwc2HwcChwSampleDesc<Out, In> *sam
726
723
uint32_t *first_blocks, uint32_t num_samples) {
727
724
static_assert (std::is_same<In, uint8_t >::value, " Only uint8_t supported as input" );
728
725
729
- constexpr int kOutChannels = kStaticChannels + 1 ;
730
-
731
726
int sample_idx = FindSampleIdx (first_blocks, num_samples);
732
727
const auto sample = samples[sample_idx];
733
728
0 commit comments