Skip to content

Commit 014d371

Browse files
committed
Optimized SRTP stream lookup with SSE2.
Stream lookup by SSRC is now performed using SSE2 intrinsics, which is considerably faster when there are many streams in the list. Although the lookup still has linear complexity, its absolute times are reduced and, with tens to hundreds of elements, are lower than or comparable to those of a typical rb-tree equivalent.

Expected stream lookup performance of the scalar array-based implementation and of its SSE2 version, compared to the list-based implementation that was used previously:

| SSRCs | speedup (scalar) | speedup (SSE2) |
|------:|-----------------:|---------------:|
|     1 |            0.39x |          0.22x |
|     3 |            0.57x |          0.23x |
|     5 |            0.69x |          0.62x |
|    10 |            0.77x |          1.43x |
|    20 |            0.86x |          2.38x |
|    30 |            0.87x |          3.44x |
|    50 |            1.13x |          6.21x |
|   100 |            1.25x |          8.51x |
|   200 |            1.30x |          9.83x |

Performance was tested on an Intel Core i7 2600K CPU.
1 parent 372491b commit 014d371

File tree

1 file changed

+63
-0
lines changed

1 file changed

+63
-0
lines changed

srtp/stream_list.c

+63
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,13 @@
4848
#include <stdint.h>
4949
#include <string.h>
5050

51+
#if defined(__SSE2__)
52+
#include <emmintrin.h>
53+
#if defined(_MSC_VER)
54+
#include <intrin.h>
55+
#endif
56+
#endif
57+
5158
#include "srtp_priv.h"
5259
#include "err.h"
5360
#include "alloc.h"
@@ -69,6 +76,61 @@ void srtp_stream_list_init(srtp_stream_list_t *streams)
6976
*/
7077
uint32_t srtp_stream_list_find(const srtp_stream_list_t *streams, uint32_t ssrc)
7178
{
79+
#if defined(__SSE2__)
80+
const uint32_t *const ssrcs = streams->ssrcs;
81+
const __m128i mm_ssrc = _mm_set1_epi32(ssrc);
82+
uint32_t pos = 0u, n = (streams->size + 7u) & ~(uint32_t)(7u);
83+
for (uint32_t m = n & ~(uint32_t)(15u); pos < m; pos += 16u) {
84+
__m128i mm1 = _mm_loadu_si128((const __m128i *)(ssrcs + pos));
85+
__m128i mm2 = _mm_loadu_si128((const __m128i *)(ssrcs + pos + 4u));
86+
__m128i mm3 = _mm_loadu_si128((const __m128i *)(ssrcs + pos + 8u));
87+
__m128i mm4 = _mm_loadu_si128((const __m128i *)(ssrcs + pos + 12u));
88+
mm1 = _mm_cmpeq_epi32(mm1, mm_ssrc);
89+
mm2 = _mm_cmpeq_epi32(mm2, mm_ssrc);
90+
mm3 = _mm_cmpeq_epi32(mm3, mm_ssrc);
91+
mm4 = _mm_cmpeq_epi32(mm4, mm_ssrc);
92+
mm1 = _mm_packs_epi32(mm1, mm2);
93+
mm3 = _mm_packs_epi32(mm3, mm4);
94+
mm1 = _mm_packs_epi16(mm1, mm3);
95+
uint32_t mask = _mm_movemask_epi8(mm1);
96+
if (mask) {
97+
#if defined(_MSC_VER)
98+
unsigned long bit_pos;
99+
_BitScanForward(&bit_pos, mask);
100+
pos += bit_pos;
101+
#else
102+
pos += __builtin_ctz(mask);
103+
#endif
104+
105+
goto done;
106+
}
107+
}
108+
109+
if (pos < n) {
110+
__m128i mm1 = _mm_loadu_si128((const __m128i *)(ssrcs + pos));
111+
__m128i mm2 = _mm_loadu_si128((const __m128i *)(ssrcs + pos + 4u));
112+
mm1 = _mm_cmpeq_epi32(mm1, mm_ssrc);
113+
mm2 = _mm_cmpeq_epi32(mm2, mm_ssrc);
114+
mm1 = _mm_packs_epi32(mm1, mm2);
115+
116+
uint32_t mask = _mm_movemask_epi8(mm1);
117+
if (mask) {
118+
#if defined(_MSC_VER)
119+
unsigned long bit_pos;
120+
_BitScanForward(&bit_pos, mask);
121+
pos += bit_pos / 2u;
122+
#else
123+
pos += __builtin_ctz(mask) / 2u;
124+
#endif
125+
goto done;
126+
}
127+
128+
pos += 8u;
129+
}
130+
131+
done:
132+
return pos;
133+
#else /* defined(__SSE2__) */
72134
/* walk down list until ssrc is found */
73135
uint32_t pos = 0u, n = streams->size;
74136
for (; pos < n; ++pos) {
@@ -77,6 +139,7 @@ uint32_t srtp_stream_list_find(const srtp_stream_list_t *streams, uint32_t ssrc)
77139
}
78140

79141
return pos;
142+
#endif /* defined(__SSE2__) */
80143
}
81144

82145
/*

0 commit comments

Comments
 (0)