Skip to content

Commit 2a74a22

Browse files
committed
Speeding up AVX, SSE
1 parent 765e065 commit 2a74a22

26 files changed

+697
-378
lines changed

src/avx/image_to_oklab.rs

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -64,20 +64,25 @@ macro_rules! triple_to_oklab {
6464
}
6565

6666
#[inline(always)]
67-
pub unsafe fn avx_image_to_oklab<const CHANNELS_CONFIGURATION: u8, const TARGET: u8>(
67+
pub unsafe fn avx_image_to_oklab<
68+
const CHANNELS_CONFIGURATION: u8,
69+
const TARGET: u8,
70+
const TRANSFER_FUNCTION: u8,
71+
>(
6872
start_cx: usize,
6973
src: *const u8,
7074
src_offset: usize,
7175
width: u32,
7276
dst: *mut f32,
7377
dst_offset: usize,
74-
transfer_function: TransferFunction,
78+
_: TransferFunction,
7579
) -> usize {
7680
let target: OklabTarget = TARGET.into();
7781
let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
7882
let channels = image_configuration.get_channels_count();
7983
let mut cx = start_cx;
8084

85+
let transfer_function: TransferFunction = TRANSFER_FUNCTION.into();
8186
let transfer = get_avx2_linear_transfer(transfer_function);
8287

8388
let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32;

src/avx/linear_to_image.rs

Lines changed: 29 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ use std::arch::x86_64::*;
1212

1313
use crate::avx::gamma_curves::get_avx_gamma_transfer;
1414
use crate::avx::routines::avx_vld_f32_and_deinterleave;
15-
use crate::avx::{avx2_interleave_rgb, avx2_interleave_rgba_epi8, avx2_pack_s32, avx2_pack_u16};
15+
use crate::avx::{avx2_interleave_rgb, avx2_interleave_rgba_epi8, avx2_pack_u16, avx2_pack_u32};
1616
use crate::image::ImageConfiguration;
1717
use crate::{
1818
avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_u8,
@@ -22,9 +22,8 @@ use crate::{
2222
#[inline(always)]
2323
unsafe fn gamma_vld<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
2424
src: *const f32,
25-
transfer_function: TransferFunction,
25+
transfer: &unsafe fn(__m256) -> __m256,
2626
) -> (__m256i, __m256i, __m256i, __m256i) {
27-
let transfer = get_avx_gamma_transfer(transfer_function);
2827
let v_scale_alpha = _mm256_set1_ps(255f32);
2928
let (mut r_f32, mut g_f32, mut b_f32, mut a_f32) =
3029
avx_vld_f32_and_deinterleave::<CHANNELS_CONFIGURATION>(src);
@@ -46,50 +45,57 @@ unsafe fn gamma_vld<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
4645
}
4746

4847
#[inline(always)]
49-
pub unsafe fn avx_linear_to_gamma<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
48+
pub unsafe fn avx_linear_to_gamma<
49+
const CHANNELS_CONFIGURATION: u8,
50+
const USE_ALPHA: bool,
51+
const TRANSFER_FUNCTION: u8,
52+
>(
5053
start_cx: usize,
5154
src: *const f32,
5255
src_offset: u32,
5356
dst: *mut u8,
5457
dst_offset: u32,
5558
width: u32,
56-
transfer_function: TransferFunction,
59+
_: TransferFunction,
5760
) -> usize {
5861
let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
5962
let channels = image_configuration.get_channels_count();
6063
let mut cx = start_cx;
6164

65+
let transfer_function: TransferFunction = TRANSFER_FUNCTION.into();
66+
let transfer = get_avx_gamma_transfer(transfer_function);
67+
6268
while cx + 32 < width as usize {
6369
let offset_src_ptr =
6470
((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels);
6571

6672
let src_ptr_0 = offset_src_ptr;
6773

6874
let (r_row0_, g_row0_, b_row0_, a_row0_) =
69-
gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_0, transfer_function);
75+
gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_0, &transfer);
7076

7177
let src_ptr_1 = offset_src_ptr.add(8 * channels);
7278

7379
let (r_row1_, g_row1_, b_row1_, a_row1_) =
74-
gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_1, transfer_function);
80+
gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_1, &transfer);
7581

7682
let src_ptr_2 = offset_src_ptr.add(8 * 2 * channels);
7783

7884
let (r_row2_, g_row2_, b_row2_, a_row2_) =
79-
gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_2, transfer_function);
85+
gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_2, &transfer);
8086

8187
let src_ptr_3 = offset_src_ptr.add(8 * 3 * channels);
8288

8389
let (r_row3_, g_row3_, b_row3_, a_row3_) =
84-
gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_3, transfer_function);
90+
gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_3, &transfer);
8591

86-
let r_row01 = avx2_pack_s32(r_row0_, r_row1_);
87-
let g_row01 = avx2_pack_s32(g_row0_, g_row1_);
88-
let b_row01 = avx2_pack_s32(b_row0_, b_row1_);
92+
let r_row01 = avx2_pack_u32(r_row0_, r_row1_);
93+
let g_row01 = avx2_pack_u32(g_row0_, g_row1_);
94+
let b_row01 = avx2_pack_u32(b_row0_, b_row1_);
8995

90-
let r_row23 = avx2_pack_s32(r_row2_, r_row3_);
91-
let g_row23 = avx2_pack_s32(g_row2_, g_row3_);
92-
let b_row23 = avx2_pack_s32(b_row2_, b_row3_);
96+
let r_row23 = avx2_pack_u32(r_row2_, r_row3_);
97+
let g_row23 = avx2_pack_u32(g_row2_, g_row3_);
98+
let b_row23 = avx2_pack_u32(b_row2_, b_row3_);
9399

94100
let r_row = avx2_pack_u16(r_row01, r_row23);
95101
let g_row = avx2_pack_u16(g_row01, g_row23);
@@ -98,8 +104,8 @@ pub unsafe fn avx_linear_to_gamma<const CHANNELS_CONFIGURATION: u8, const USE_AL
98104
let dst_ptr = dst.add(dst_offset as usize + cx * channels);
99105

100106
if USE_ALPHA {
101-
let a_row01 = avx2_pack_s32(a_row0_, a_row1_);
102-
let a_row23 = avx2_pack_s32(a_row2_, a_row3_);
107+
let a_row01 = avx2_pack_u32(a_row0_, a_row1_);
108+
let a_row23 = avx2_pack_u32(a_row2_, a_row3_);
103109
let a_row = avx2_pack_u16(a_row01, a_row23);
104110
avx_store_and_interleave_v4_u8!(
105111
dst_ptr,
@@ -125,16 +131,16 @@ pub unsafe fn avx_linear_to_gamma<const CHANNELS_CONFIGURATION: u8, const USE_AL
125131
let src_ptr_0 = offset_src_ptr;
126132

127133
let (r_row0_, g_row0_, b_row0_, a_row0_) =
128-
gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_0, transfer_function);
134+
gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_0, &transfer);
129135

130136
let src_ptr_1 = offset_src_ptr.add(8 * channels);
131137

132138
let (r_row1_, g_row1_, b_row1_, a_row1_) =
133-
gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_1, transfer_function);
139+
gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_1, &transfer);
134140

135-
let r_row01 = avx2_pack_s32(r_row0_, r_row1_);
136-
let g_row01 = avx2_pack_s32(g_row0_, g_row1_);
137-
let b_row01 = avx2_pack_s32(b_row0_, b_row1_);
141+
let r_row01 = avx2_pack_u32(r_row0_, r_row1_);
142+
let g_row01 = avx2_pack_u32(g_row0_, g_row1_);
143+
let b_row01 = avx2_pack_u32(b_row0_, b_row1_);
138144

139145
let r_row = avx2_pack_u16(r_row01, zeros);
140146
let g_row = avx2_pack_u16(g_row01, zeros);
@@ -143,7 +149,7 @@ pub unsafe fn avx_linear_to_gamma<const CHANNELS_CONFIGURATION: u8, const USE_AL
143149
let dst_ptr = dst.add(dst_offset as usize + cx * channels);
144150

145151
if USE_ALPHA {
146-
let a_row01 = avx2_pack_s32(a_row0_, a_row1_);
152+
let a_row01 = avx2_pack_u32(a_row0_, a_row1_);
147153
let a_row = avx2_pack_u16(a_row01, zeros);
148154
avx_store_and_interleave_v4_half_u8!(
149155
dst_ptr,

src/avx/oklab_to_image.rs

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -112,15 +112,20 @@ unsafe fn avx_oklab_vld<const CHANNELS_CONFIGURATION: u8>(
112112
}
113113

114114
#[inline(always)]
115-
pub unsafe fn avx_oklab_to_image<const CHANNELS_CONFIGURATION: u8, const TARGET: u8>(
115+
pub unsafe fn avx_oklab_to_image<
116+
const CHANNELS_CONFIGURATION: u8,
117+
const TARGET: u8,
118+
const TRANSFER_FUNCTION: u8,
119+
>(
116120
start_cx: usize,
117121
src: *const f32,
118122
src_offset: u32,
119123
dst: *mut u8,
120124
dst_offset: u32,
121125
width: u32,
122-
transfer_function: TransferFunction,
126+
_: TransferFunction,
123127
) -> usize {
128+
let transfer_function: TransferFunction = TRANSFER_FUNCTION.into();
124129
let transfer = get_avx_gamma_transfer(transfer_function);
125130
let target: OklabTarget = TARGET.into();
126131
let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();

src/avx/to_linear.rs

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -38,19 +38,24 @@ unsafe fn triple_to_linear(
3838
}
3939

4040
#[inline(always)]
41-
pub unsafe fn avx_channels_to_linear<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
41+
pub unsafe fn avx_channels_to_linear<
42+
const CHANNELS_CONFIGURATION: u8,
43+
const USE_ALPHA: bool,
44+
const TRANSFER_FUNCTION: u8,
45+
>(
4246
start_cx: usize,
4347
src: *const u8,
4448
src_offset: usize,
4549
width: u32,
4650
dst: *mut f32,
4751
dst_offset: usize,
48-
transfer_function: TransferFunction,
52+
_: TransferFunction,
4953
) -> usize {
5054
let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
5155
let channels = image_configuration.get_channels_count();
5256
let mut cx = start_cx;
5357

58+
let transfer_function: TransferFunction = TRANSFER_FUNCTION.into();
5459
let transfer = get_avx2_linear_transfer(transfer_function);
5560

5661
let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32;

src/avx/to_xyz_lab.rs

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@ pub unsafe fn avx2_image_to_xyz_lab<
2626
const CHANNELS_CONFIGURATION: u8,
2727
const USE_ALPHA: bool,
2828
const TARGET: u8,
29+
const TRANSFER_FUNCTION: u8,
2930
>(
3031
start_cx: usize,
3132
src: *const u8,
@@ -36,7 +37,7 @@ pub unsafe fn avx2_image_to_xyz_lab<
3637
a_linearized: *mut f32,
3738
a_offset: usize,
3839
matrix: &[[f32; 3]; 3],
39-
transfer_function: TransferFunction,
40+
_: TransferFunction,
4041
) -> usize {
4142
if USE_ALPHA {
4243
if a_linearized.is_null() {
@@ -48,6 +49,7 @@ pub unsafe fn avx2_image_to_xyz_lab<
4849
let channels = image_configuration.get_channels_count();
4950
let mut cx = start_cx;
5051

52+
let transfer_function: TransferFunction = TRANSFER_FUNCTION.into();
5153
let transfer = get_avx2_linear_transfer(transfer_function);
5254

5355
let cq1 = _mm256_set1_ps(*matrix.get_unchecked(0).get_unchecked(0));

0 commit comments

Comments
 (0)