OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_transform_sse2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_transform_sse2.cpp
34// Author: Aous Naman
35// Date: 28 August 2019
36//***************************************************************************/
37
38#include "ojph_arch.h"
39#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
40
41#include <climits>
42#include <cstdio>
43
44#include "ojph_defs.h"
45#include "ojph_mem.h"
46#include "ojph_params.h"
48
49#include "ojph_transform.h"
51
52#include <emmintrin.h>
53
54namespace ojph {
55 namespace local {
56
58 // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
59 static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
60 {
61 // note than m must be obtained using
62 // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
63 __m128i x = _mm_srli_epi64(a, amt);
64 x = _mm_xor_si128(x, m);
65 __m128i result = _mm_sub_epi64(x, m);
66 return result;
67 }
68
70 static inline
71 void sse2_deinterleave32(float* dpl, float* dph, float* sp, int width)
72 {
73 for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
74 {
75 __m128 a = _mm_load_ps(sp);
76 __m128 b = _mm_load_ps(sp + 4);
77 __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
78 __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
79 _mm_store_ps(dpl, c);
80 _mm_store_ps(dph, d);
81 }
82 }
83
85 static inline
86 void sse2_interleave32(float* dp, float* spl, float* sph, int width) \
87 {
88 for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
89 {
90 __m128 a = _mm_load_ps(spl);
91 __m128 b = _mm_load_ps(sph);
92 __m128 c = _mm_unpacklo_ps(a, b);
93 __m128 d = _mm_unpackhi_ps(a, b);
94 _mm_store_ps(dp, c);
95 _mm_store_ps(dp + 4, d);
96 }
97 }
98
100 static inline
101 void sse2_deinterleave64(void* dpl, void* dph, const void* sp, int width)
102 {
103 for (; width > 0; width -= 4,
104 sp = (const char*)sp + 32,
105 dpl = (char*)dpl + 16,
106 dph = (char*)dph + 16)
107 {
108 __m128i a = _mm_load_si128((const __m128i*)sp);
109 __m128i b = _mm_load_si128((const __m128i*)((const char*)sp + 16));
110 __m128i c = _mm_unpacklo_epi64(a, b);
111 __m128i d = _mm_unpackhi_epi64(a, b);
112 _mm_store_si128((__m128i*)dpl, c);
113 _mm_store_si128((__m128i*)dph, d);
114 }
115 }
116
118 static inline
119 void sse2_interleave64(void* dp, const void* spl, const void* sph,
120 int width)
121 {
122 for (; width > 0; width -= 4,
123 dp = (char*)dp + 32,
124 spl = (const char*)spl + 16,
125 sph = (const char*)sph + 16)
126 {
127 __m128i a = _mm_load_si128((const __m128i*)spl);
128 __m128i b = _mm_load_si128((const __m128i*)sph);
129 __m128i c = _mm_unpacklo_epi64(a, b);
130 __m128i d = _mm_unpackhi_epi64(a, b);
131 _mm_store_si128((__m128i*)dp, c);
132 _mm_store_si128((__m128i*)((char*)dp + 16), d);
133 }
134 }
135
137 static
138 void sse2_rev_vert_step32(const lifting_step* s, const line_buf* sig,
139 const line_buf* other, const line_buf* aug,
140 ui32 repeat, bool synthesis)
141 {
142 const si32 a = s->rev.Aatk;
143 const si32 b = s->rev.Batk;
144 const ui8 e = s->rev.Eatk;
145 __m128i vb = _mm_set1_epi32(b);
146
147 si32* dst = aug->i32;
148 const si32* src1 = sig->i32, * src2 = other->i32;
149 // The general definition of the wavelet in Part 2 is slightly
150 // different to part 2, although they are mathematically equivalent
151 // here, we identify the simpler form from Part 1 and employ them
152 if (a == 1)
153 { // 5/3 update and any case with a == 1
154 int i = (int)repeat;
155 if (synthesis)
156 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
157 {
158 __m128i s1 = _mm_load_si128((__m128i*)src1);
159 __m128i s2 = _mm_load_si128((__m128i*)src2);
160 __m128i d = _mm_load_si128((__m128i*)dst);
161 __m128i t = _mm_add_epi32(s1, s2);
162 __m128i v = _mm_add_epi32(vb, t);
163 __m128i w = _mm_srai_epi32(v, e);
164 d = _mm_sub_epi32(d, w);
165 _mm_store_si128((__m128i*)dst, d);
166 }
167 else
168 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
169 {
170 __m128i s1 = _mm_load_si128((__m128i*)src1);
171 __m128i s2 = _mm_load_si128((__m128i*)src2);
172 __m128i d = _mm_load_si128((__m128i*)dst);
173 __m128i t = _mm_add_epi32(s1, s2);
174 __m128i v = _mm_add_epi32(vb, t);
175 __m128i w = _mm_srai_epi32(v, e);
176 d = _mm_add_epi32(d, w);
177 _mm_store_si128((__m128i*)dst, d);
178 }
179 }
180 else if (a == -1 && b == 1 && e == 1)
181 { // 5/3 predict
182 int i = (int)repeat;
183 if (synthesis)
184 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
185 {
186 __m128i s1 = _mm_load_si128((__m128i*)src1);
187 __m128i s2 = _mm_load_si128((__m128i*)src2);
188 __m128i d = _mm_load_si128((__m128i*)dst);
189 __m128i t = _mm_add_epi32(s1, s2);
190 __m128i w = _mm_srai_epi32(t, e);
191 d = _mm_add_epi32(d, w);
192 _mm_store_si128((__m128i*)dst, d);
193 }
194 else
195 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
196 {
197 __m128i s1 = _mm_load_si128((__m128i*)src1);
198 __m128i s2 = _mm_load_si128((__m128i*)src2);
199 __m128i d = _mm_load_si128((__m128i*)dst);
200 __m128i t = _mm_add_epi32(s1, s2);
201 __m128i w = _mm_srai_epi32(t, e);
202 d = _mm_sub_epi32(d, w);
203 _mm_store_si128((__m128i*)dst, d);
204 }
205 }
206 else if (a == -1)
207 { // any case with a == -1, which is not 5/3 predict
208 int i = (int)repeat;
209 if (synthesis)
210 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
211 {
212 __m128i s1 = _mm_load_si128((__m128i*)src1);
213 __m128i s2 = _mm_load_si128((__m128i*)src2);
214 __m128i d = _mm_load_si128((__m128i*)dst);
215 __m128i t = _mm_add_epi32(s1, s2);
216 __m128i v = _mm_sub_epi32(vb, t);
217 __m128i w = _mm_srai_epi32(v, e);
218 d = _mm_sub_epi32(d, w);
219 _mm_store_si128((__m128i*)dst, d);
220 }
221 else
222 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
223 {
224 __m128i s1 = _mm_load_si128((__m128i*)src1);
225 __m128i s2 = _mm_load_si128((__m128i*)src2);
226 __m128i d = _mm_load_si128((__m128i*)dst);
227 __m128i t = _mm_add_epi32(s1, s2);
228 __m128i v = _mm_sub_epi32(vb, t);
229 __m128i w = _mm_srai_epi32(v, e);
230 d = _mm_add_epi32(d, w);
231 _mm_store_si128((__m128i*)dst, d);
232 }
233 }
234 else { // general case
235 // 32bit multiplication is not supported in sse2; we need sse4.1,
236 // where we can use _mm_mullo_epi32, which multiplies 32bit x 32bit,
237 // keeping the LSBs
238 if (synthesis)
239 for (ui32 i = repeat; i > 0; --i)
240 *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
241 else
242 for (ui32 i = repeat; i > 0; --i)
243 *dst++ += (b + a * (*src1++ + *src2++)) >> e;
244 }
245 }
246
248 static
249 void sse2_rev_vert_step64(const lifting_step* s, const line_buf* sig,
250 const line_buf* other, const line_buf* aug,
251 ui32 repeat, bool synthesis)
252 {
253 const si64 a = s->rev.Aatk;
254 const si64 b = s->rev.Batk;
255 const ui8 e = s->rev.Eatk;
256 __m128i vb = _mm_set1_epi64x(b);
257 __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
258
259 si64* dst = aug->i64;
260 const si64* src1 = sig->i64, * src2 = other->i64;
261 // The general definition of the wavelet in Part 2 is slightly
262 // different to part 2, although they are mathematically equivalent
263 // here, we identify the simpler form from Part 1 and employ them
264 if (a == 1)
265 { // 5/3 update and any case with a == 1
266 int i = (int)repeat;
267 if (synthesis)
268 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
269 {
270 __m128i s1 = _mm_load_si128((__m128i*)src1);
271 __m128i s2 = _mm_load_si128((__m128i*)src2);
272 __m128i d = _mm_load_si128((__m128i*)dst);
273 __m128i t = _mm_add_epi64(s1, s2);
274 __m128i v = _mm_add_epi64(vb, t);
275 __m128i w = sse2_mm_srai_epi64(v, e, ve);
276 d = _mm_sub_epi64(d, w);
277 _mm_store_si128((__m128i*)dst, d);
278 }
279 else
280 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
281 {
282 __m128i s1 = _mm_load_si128((__m128i*)src1);
283 __m128i s2 = _mm_load_si128((__m128i*)src2);
284 __m128i d = _mm_load_si128((__m128i*)dst);
285 __m128i t = _mm_add_epi64(s1, s2);
286 __m128i v = _mm_add_epi64(vb, t);
287 __m128i w = sse2_mm_srai_epi64(v, e, ve);
288 d = _mm_add_epi64(d, w);
289 _mm_store_si128((__m128i*)dst, d);
290 }
291 }
292 else if (a == -1 && b == 1 && e == 1)
293 { // 5/3 predict
294 int i = (int)repeat;
295 if (synthesis)
296 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
297 {
298 __m128i s1 = _mm_load_si128((__m128i*)src1);
299 __m128i s2 = _mm_load_si128((__m128i*)src2);
300 __m128i d = _mm_load_si128((__m128i*)dst);
301 __m128i t = _mm_add_epi64(s1, s2);
302 __m128i w = sse2_mm_srai_epi64(t, e, ve);
303 d = _mm_add_epi64(d, w);
304 _mm_store_si128((__m128i*)dst, d);
305 }
306 else
307 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
308 {
309 __m128i s1 = _mm_load_si128((__m128i*)src1);
310 __m128i s2 = _mm_load_si128((__m128i*)src2);
311 __m128i d = _mm_load_si128((__m128i*)dst);
312 __m128i t = _mm_add_epi64(s1, s2);
313 __m128i w = sse2_mm_srai_epi64(t, e, ve);
314 d = _mm_sub_epi64(d, w);
315 _mm_store_si128((__m128i*)dst, d);
316 }
317 }
318 else if (a == -1)
319 { // any case with a == -1, which is not 5/3 predict
320 int i = (int)repeat;
321 if (synthesis)
322 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
323 {
324 __m128i s1 = _mm_load_si128((__m128i*)src1);
325 __m128i s2 = _mm_load_si128((__m128i*)src2);
326 __m128i d = _mm_load_si128((__m128i*)dst);
327 __m128i t = _mm_add_epi64(s1, s2);
328 __m128i v = _mm_sub_epi64(vb, t);
329 __m128i w = sse2_mm_srai_epi64(v, e, ve);
330 d = _mm_sub_epi64(d, w);
331 _mm_store_si128((__m128i*)dst, d);
332 }
333 else
334 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
335 {
336 __m128i s1 = _mm_load_si128((__m128i*)src1);
337 __m128i s2 = _mm_load_si128((__m128i*)src2);
338 __m128i d = _mm_load_si128((__m128i*)dst);
339 __m128i t = _mm_add_epi64(s1, s2);
340 __m128i v = _mm_sub_epi64(vb, t);
341 __m128i w = sse2_mm_srai_epi64(v, e, ve);
342 d = _mm_add_epi64(d, w);
343 _mm_store_si128((__m128i*)dst, d);
344 }
345 }
346 else { // general case
347 // 64bit multiplication is not supported in sse2
348 if (synthesis)
349 for (ui32 i = repeat; i > 0; --i)
350 *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
351 else
352 for (ui32 i = repeat; i > 0; --i)
353 *dst++ += (b + a * (*src1++ + *src2++)) >> e;
354 }
355 }
356
358 void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig,
359 const line_buf* other, const line_buf* aug,
360 ui32 repeat, bool synthesis)
361 {
362 if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) ||
363 ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
364 ((other != NULL) && (other->flags & line_buf::LFT_32BIT)))
365 {
366 assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
367 (other == NULL || other->flags & line_buf::LFT_32BIT) &&
368 (aug == NULL || aug->flags & line_buf::LFT_32BIT));
369 sse2_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
370 }
371 else
372 {
373 assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
374 (other == NULL || other->flags & line_buf::LFT_64BIT) &&
375 (aug == NULL || aug->flags & line_buf::LFT_64BIT));
376 sse2_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
377 }
378 }
379
381 static
382 void sse2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
383 const line_buf* hdst, const line_buf* src,
384 ui32 width, bool even)
385 {
386 if (width > 1)
387 {
388 // split src into ldst and hdst
389 {
390 float* dpl = even ? ldst->f32 : hdst->f32;
391 float* dph = even ? hdst->f32 : ldst->f32;
392 float* sp = src->f32;
393 int w = (int)width;
394 sse2_deinterleave32(dpl, dph, sp, w);
395 }
396
397 si32* hp = hdst->i32, * lp = ldst->i32;
398 ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
399 ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
400 ui32 num_steps = atk->get_num_steps();
401 for (ui32 j = num_steps; j > 0; --j)
402 {
403 // first lifting step
404 const lifting_step* s = atk->get_step(j - 1);
405 const si32 a = s->rev.Aatk;
406 const si32 b = s->rev.Batk;
407 const ui8 e = s->rev.Eatk;
408 __m128i vb = _mm_set1_epi32(b);
409
410 // extension
411 lp[-1] = lp[0];
412 lp[l_width] = lp[l_width - 1];
413 // lifting step
414 const si32* sp = lp;
415 si32* dp = hp;
416 if (a == 1)
417 { // 5/3 update and any case with a == 1
418 int i = (int)h_width;
419 if (even)
420 {
421 for (; i > 0; i -= 4, sp += 4, dp += 4)
422 {
423 __m128i s1 = _mm_load_si128((__m128i*)sp);
424 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
425 __m128i d = _mm_load_si128((__m128i*)dp);
426 __m128i t = _mm_add_epi32(s1, s2);
427 __m128i v = _mm_add_epi32(vb, t);
428 __m128i w = _mm_srai_epi32(v, e);
429 d = _mm_add_epi32(d, w);
430 _mm_store_si128((__m128i*)dp, d);
431 }
432 }
433 else
434 {
435 for (; i > 0; i -= 4, sp += 4, dp += 4)
436 {
437 __m128i s1 = _mm_load_si128((__m128i*)sp);
438 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
439 __m128i d = _mm_load_si128((__m128i*)dp);
440 __m128i t = _mm_add_epi32(s1, s2);
441 __m128i v = _mm_add_epi32(vb, t);
442 __m128i w = _mm_srai_epi32(v, e);
443 d = _mm_add_epi32(d, w);
444 _mm_store_si128((__m128i*)dp, d);
445 }
446 }
447 }
448 else if (a == -1 && b == 1 && e == 1)
449 { // 5/3 predict
450 int i = (int)h_width;
451 if (even)
452 for (; i > 0; i -= 4, sp += 4, dp += 4)
453 {
454 __m128i s1 = _mm_load_si128((__m128i*)sp);
455 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
456 __m128i d = _mm_load_si128((__m128i*)dp);
457 __m128i t = _mm_add_epi32(s1, s2);
458 __m128i w = _mm_srai_epi32(t, e);
459 d = _mm_sub_epi32(d, w);
460 _mm_store_si128((__m128i*)dp, d);
461 }
462 else
463 for (; i > 0; i -= 4, sp += 4, dp += 4)
464 {
465 __m128i s1 = _mm_load_si128((__m128i*)sp);
466 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
467 __m128i d = _mm_load_si128((__m128i*)dp);
468 __m128i t = _mm_add_epi32(s1, s2);
469 __m128i w = _mm_srai_epi32(t, e);
470 d = _mm_sub_epi32(d, w);
471 _mm_store_si128((__m128i*)dp, d);
472 }
473 }
474 else if (a == -1)
475 { // any case with a == -1, which is not 5/3 predict
476 int i = (int)h_width;
477 if (even)
478 for (; i > 0; i -= 4, sp += 4, dp += 4)
479 {
480 __m128i s1 = _mm_load_si128((__m128i*)sp);
481 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
482 __m128i d = _mm_load_si128((__m128i*)dp);
483 __m128i t = _mm_add_epi32(s1, s2);
484 __m128i v = _mm_sub_epi32(vb, t);
485 __m128i w = _mm_srai_epi32(v, e);
486 d = _mm_add_epi32(d, w);
487 _mm_store_si128((__m128i*)dp, d);
488 }
489 else
490 for (; i > 0; i -= 4, sp += 4, dp += 4)
491 {
492 __m128i s1 = _mm_load_si128((__m128i*)sp);
493 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
494 __m128i d = _mm_load_si128((__m128i*)dp);
495 __m128i t = _mm_add_epi32(s1, s2);
496 __m128i v = _mm_sub_epi32(vb, t);
497 __m128i w = _mm_srai_epi32(v, e);
498 d = _mm_add_epi32(d, w);
499 _mm_store_si128((__m128i*)dp, d);
500 }
501 }
502 else {
503 // general case
504 // 64bit multiplication is not supported in sse2
505 if (even)
506 for (ui32 i = h_width; i > 0; --i, sp++, dp++)
507 *dp += (b + a * (sp[0] + sp[1])) >> e;
508 else
509 for (ui32 i = h_width; i > 0; --i, sp++, dp++)
510 *dp += (b + a * (sp[-1] + sp[0])) >> e;
511 }
512
513 // swap buffers
514 si32* t = lp; lp = hp; hp = t;
515 even = !even;
516 ui32 w = l_width; l_width = h_width; h_width = w;
517 }
518 }
519 else {
520 if (even)
521 ldst->i32[0] = src->i32[0];
522 else
523 hdst->i32[0] = src->i32[0] << 1;
524 }
525 }
526
528 static
529 void sse2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
530 const line_buf* hdst, const line_buf* src,
531 ui32 width, bool even)
532 {
533 if (width > 1)
534 {
535 // split src into ldst and hdst
536 {
537 void* dpl = even ? ldst->p : hdst->p;
538 void* dph = even ? hdst->p : ldst->p;
539 const void* sp = src->p;
540 int w = (int)width;
541 sse2_deinterleave64(dpl, dph, sp, w);
542 }
543
544 si64* hp = hdst->i64, * lp = ldst->i64;
545 ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
546 ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
547 ui32 num_steps = atk->get_num_steps();
548 for (ui32 j = num_steps; j > 0; --j)
549 {
550 // first lifting step
551 const lifting_step* s = atk->get_step(j - 1);
552 const si32 a = s->rev.Aatk;
553 const si32 b = s->rev.Batk;
554 const ui8 e = s->rev.Eatk;
555 __m128i vb = _mm_set1_epi64x(b);
556 __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
557
558 // extension
559 lp[-1] = lp[0];
560 lp[l_width] = lp[l_width - 1];
561 // lifting step
562 const si64* sp = lp;
563 si64* dp = hp;
564 if (a == 1)
565 { // 5/3 update and any case with a == 1
566 int i = (int)h_width;
567 if (even)
568 {
569 for (; i > 0; i -= 2, sp += 2, dp += 2)
570 {
571 __m128i s1 = _mm_load_si128((__m128i*)sp);
572 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
573 __m128i d = _mm_load_si128((__m128i*)dp);
574 __m128i t = _mm_add_epi64(s1, s2);
575 __m128i v = _mm_add_epi64(vb, t);
576 __m128i w = sse2_mm_srai_epi64(v, e, ve);
577 d = _mm_add_epi64(d, w);
578 _mm_store_si128((__m128i*)dp, d);
579 }
580 }
581 else
582 {
583 for (; i > 0; i -= 2, sp += 2, dp += 2)
584 {
585 __m128i s1 = _mm_load_si128((__m128i*)sp);
586 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
587 __m128i d = _mm_load_si128((__m128i*)dp);
588 __m128i t = _mm_add_epi64(s1, s2);
589 __m128i v = _mm_add_epi64(vb, t);
590 __m128i w = sse2_mm_srai_epi64(v, e, ve);
591 d = _mm_add_epi64(d, w);
592 _mm_store_si128((__m128i*)dp, d);
593 }
594 }
595 }
596 else if (a == -1 && b == 1 && e == 1)
597 { // 5/3 predict
598 int i = (int)h_width;
599 if (even)
600 for (; i > 0; i -= 2, sp += 2, dp += 2)
601 {
602 __m128i s1 = _mm_load_si128((__m128i*)sp);
603 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
604 __m128i d = _mm_load_si128((__m128i*)dp);
605 __m128i t = _mm_add_epi64(s1, s2);
606 __m128i w = sse2_mm_srai_epi64(t, e, ve);
607 d = _mm_sub_epi64(d, w);
608 _mm_store_si128((__m128i*)dp, d);
609 }
610 else
611 for (; i > 0; i -= 2, sp += 2, dp += 2)
612 {
613 __m128i s1 = _mm_load_si128((__m128i*)sp);
614 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
615 __m128i d = _mm_load_si128((__m128i*)dp);
616 __m128i t = _mm_add_epi64(s1, s2);
617 __m128i w = sse2_mm_srai_epi64(t, e, ve);
618 d = _mm_sub_epi64(d, w);
619 _mm_store_si128((__m128i*)dp, d);
620 }
621 }
622 else if (a == -1)
623 { // any case with a == -1, which is not 5/3 predict
624 int i = (int)h_width;
625 if (even)
626 for (; i > 0; i -= 2, sp += 2, dp += 2)
627 {
628 __m128i s1 = _mm_load_si128((__m128i*)sp);
629 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
630 __m128i d = _mm_load_si128((__m128i*)dp);
631 __m128i t = _mm_add_epi64(s1, s2);
632 __m128i v = _mm_sub_epi64(vb, t);
633 __m128i w = sse2_mm_srai_epi64(v, e, ve);
634 d = _mm_add_epi64(d, w);
635 _mm_store_si128((__m128i*)dp, d);
636 }
637 else
638 for (; i > 0; i -= 2, sp += 2, dp += 2)
639 {
640 __m128i s1 = _mm_load_si128((__m128i*)sp);
641 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
642 __m128i d = _mm_load_si128((__m128i*)dp);
643 __m128i t = _mm_add_epi64(s1, s2);
644 __m128i v = _mm_sub_epi64(vb, t);
645 __m128i w = sse2_mm_srai_epi64(v, e, ve);
646 d = _mm_add_epi64(d, w);
647 _mm_store_si128((__m128i*)dp, d);
648 }
649 }
650 else {
651 // general case
652 // 64bit multiplication is not supported in sse2
653 if (even)
654 for (ui32 i = h_width; i > 0; --i, sp++, dp++)
655 *dp += (b + a * (sp[0] + sp[1])) >> e;
656 else
657 for (ui32 i = h_width; i > 0; --i, sp++, dp++)
658 *dp += (b + a * (sp[-1] + sp[0])) >> e;
659 }
660
661 // swap buffers
662 si64* t = lp; lp = hp; hp = t;
663 even = !even;
664 ui32 w = l_width; l_width = h_width; h_width = w;
665 }
666 }
667 else {
668 if (even)
669 ldst->i64[0] = src->i64[0];
670 else
671 hdst->i64[0] = src->i64[0] << 1;
672 }
673 }
674
676 void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
677 const line_buf* hdst, const line_buf* src,
678 ui32 width, bool even)
679 {
680 if (src->flags & line_buf::LFT_32BIT)
681 {
682 assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) &&
683 (hdst == NULL || hdst->flags & line_buf::LFT_32BIT));
684 sse2_rev_horz_ana32(atk, ldst, hdst, src, width, even);
685 }
686 else
687 {
688 assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) &&
689 (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) &&
690 (src == NULL || src->flags & line_buf::LFT_64BIT));
691 sse2_rev_horz_ana64(atk, ldst, hdst, src, width, even);
692 }
693 }
694
696 void sse2_rev_horz_syn32(const param_atk* atk, const line_buf* dst,
697 const line_buf* lsrc, const line_buf* hsrc,
698 ui32 width, bool even)
699 {
700 if (width > 1)
701 {
702 bool ev = even;
703 si32* oth = hsrc->i32, * aug = lsrc->i32;
704 ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
705 ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
706 ui32 num_steps = atk->get_num_steps();
707 for (ui32 j = 0; j < num_steps; ++j)
708 {
709 const lifting_step* s = atk->get_step(j);
710 const si32 a = s->rev.Aatk;
711 const si32 b = s->rev.Batk;
712 const ui8 e = s->rev.Eatk;
713 __m128i vb = _mm_set1_epi32(b);
714
715 // extension
716 oth[-1] = oth[0];
717 oth[oth_width] = oth[oth_width - 1];
718 // lifting step
719 const si32* sp = oth;
720 si32* dp = aug;
721 if (a == 1)
722 { // 5/3 update and any case with a == 1
723 int i = (int)aug_width;
724 if (ev)
725 {
726 for (; i > 0; i -= 4, sp += 4, dp += 4)
727 {
728 __m128i s1 = _mm_load_si128((__m128i*)sp);
729 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
730 __m128i d = _mm_load_si128((__m128i*)dp);
731 __m128i t = _mm_add_epi32(s1, s2);
732 __m128i v = _mm_add_epi32(vb, t);
733 __m128i w = _mm_srai_epi32(v, e);
734 d = _mm_sub_epi32(d, w);
735 _mm_store_si128((__m128i*)dp, d);
736 }
737 }
738 else
739 {
740 for (; i > 0; i -= 4, sp += 4, dp += 4)
741 {
742 __m128i s1 = _mm_load_si128((__m128i*)sp);
743 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
744 __m128i d = _mm_load_si128((__m128i*)dp);
745 __m128i t = _mm_add_epi32(s1, s2);
746 __m128i v = _mm_add_epi32(vb, t);
747 __m128i w = _mm_srai_epi32(v, e);
748 d = _mm_sub_epi32(d, w);
749 _mm_store_si128((__m128i*)dp, d);
750 }
751 }
752 }
753 else if (a == -1 && b == 1 && e == 1)
754 { // 5/3 predict
755 int i = (int)aug_width;
756 if (ev)
757 for (; i > 0; i -= 4, sp += 4, dp += 4)
758 {
759 __m128i s1 = _mm_load_si128((__m128i*)sp);
760 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
761 __m128i d = _mm_load_si128((__m128i*)dp);
762 __m128i t = _mm_add_epi32(s1, s2);
763 __m128i w = _mm_srai_epi32(t, e);
764 d = _mm_add_epi32(d, w);
765 _mm_store_si128((__m128i*)dp, d);
766 }
767 else
768 for (; i > 0; i -= 4, sp += 4, dp += 4)
769 {
770 __m128i s1 = _mm_load_si128((__m128i*)sp);
771 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
772 __m128i d = _mm_load_si128((__m128i*)dp);
773 __m128i t = _mm_add_epi32(s1, s2);
774 __m128i w = _mm_srai_epi32(t, e);
775 d = _mm_add_epi32(d, w);
776 _mm_store_si128((__m128i*)dp, d);
777 }
778 }
779 else if (a == -1)
780 { // any case with a == -1, which is not 5/3 predict
781 int i = (int)aug_width;
782 if (ev)
783 for (; i > 0; i -= 4, sp += 4, dp += 4)
784 {
785 __m128i s1 = _mm_load_si128((__m128i*)sp);
786 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
787 __m128i d = _mm_load_si128((__m128i*)dp);
788 __m128i t = _mm_add_epi32(s1, s2);
789 __m128i v = _mm_sub_epi32(vb, t);
790 __m128i w = _mm_srai_epi32(v, e);
791 d = _mm_sub_epi32(d, w);
792 _mm_store_si128((__m128i*)dp, d);
793 }
794 else
795 for (; i > 0; i -= 4, sp += 4, dp += 4)
796 {
797 __m128i s1 = _mm_load_si128((__m128i*)sp);
798 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
799 __m128i d = _mm_load_si128((__m128i*)dp);
800 __m128i t = _mm_add_epi32(s1, s2);
801 __m128i v = _mm_sub_epi32(vb, t);
802 __m128i w = _mm_srai_epi32(v, e);
803 d = _mm_sub_epi32(d, w);
804 _mm_store_si128((__m128i*)dp, d);
805 }
806 }
807 else {
808 // general case
809 // 32bit multiplication is not supported in sse2; we need sse4.1,
810 // where we can use _mm_mullo_epi32, which multiplies
811 // 32bit x 32bit, keeping the LSBs
812 if (ev)
813 for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
814 *dp -= (b + a * (sp[-1] + sp[0])) >> e;
815 else
816 for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
817 *dp -= (b + a * (sp[0] + sp[1])) >> e;
818 }
819
820 // swap buffers
821 si32* t = aug; aug = oth; oth = t;
822 ev = !ev;
823 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
824 }
825
826 // combine both lsrc and hsrc into dst
827 {
828 float* dp = dst->f32;
829 float* spl = even ? lsrc->f32 : hsrc->f32;
830 float* sph = even ? hsrc->f32 : lsrc->f32;
831 int w = (int)width;
832 sse2_interleave32(dp, spl, sph, w);
833 }
834 }
835 else {
836 if (even)
837 dst->i32[0] = lsrc->i32[0];
838 else
839 dst->i32[0] = hsrc->i32[0] >> 1;
840 }
841 }
842
844 void sse2_rev_horz_syn64(const param_atk* atk, const line_buf* dst,
845 const line_buf* lsrc, const line_buf* hsrc,
846 ui32 width, bool even)
847 {
848 if (width > 1)
849 {
850 bool ev = even;
851 si64* oth = hsrc->i64, * aug = lsrc->i64;
852 ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
853 ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
854 ui32 num_steps = atk->get_num_steps();
855 for (ui32 j = 0; j < num_steps; ++j)
856 {
857 const lifting_step* s = atk->get_step(j);
858 const si32 a = s->rev.Aatk;
859 const si32 b = s->rev.Batk;
860 const ui8 e = s->rev.Eatk;
861 __m128i vb = _mm_set1_epi64x(b);
862 __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
863
864 // extension
865 oth[-1] = oth[0];
866 oth[oth_width] = oth[oth_width - 1];
867 // lifting step
868 const si64* sp = oth;
869 si64* dp = aug;
870 if (a == 1)
871 { // 5/3 update and any case with a == 1
872 int i = (int)aug_width;
873 if (ev)
874 {
875 for (; i > 0; i -= 2, sp += 2, dp += 2)
876 {
877 __m128i s1 = _mm_load_si128((__m128i*)sp);
878 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
879 __m128i d = _mm_load_si128((__m128i*)dp);
880 __m128i t = _mm_add_epi64(s1, s2);
881 __m128i v = _mm_add_epi64(vb, t);
882 __m128i w = sse2_mm_srai_epi64(v, e, ve);
883 d = _mm_sub_epi64(d, w);
884 _mm_store_si128((__m128i*)dp, d);
885 }
886 }
887 else
888 {
889 for (; i > 0; i -= 2, sp += 2, dp += 2)
890 {
891 __m128i s1 = _mm_load_si128((__m128i*)sp);
892 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
893 __m128i d = _mm_load_si128((__m128i*)dp);
894 __m128i t = _mm_add_epi64(s1, s2);
895 __m128i v = _mm_add_epi64(vb, t);
896 __m128i w = sse2_mm_srai_epi64(v, e, ve);
897 d = _mm_sub_epi64(d, w);
898 _mm_store_si128((__m128i*)dp, d);
899 }
900 }
901 }
902 else if (a == -1 && b == 1 && e == 1)
903 { // 5/3 predict
904 int i = (int)aug_width;
905 if (ev)
906 for (; i > 0; i -= 2, sp += 2, dp += 2)
907 {
908 __m128i s1 = _mm_load_si128((__m128i*)sp);
909 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
910 __m128i d = _mm_load_si128((__m128i*)dp);
911 __m128i t = _mm_add_epi64(s1, s2);
912 __m128i w = sse2_mm_srai_epi64(t, e, ve);
913 d = _mm_add_epi64(d, w);
914 _mm_store_si128((__m128i*)dp, d);
915 }
916 else
917 for (; i > 0; i -= 2, sp += 2, dp += 2)
918 {
919 __m128i s1 = _mm_load_si128((__m128i*)sp);
920 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
921 __m128i d = _mm_load_si128((__m128i*)dp);
922 __m128i t = _mm_add_epi64(s1, s2);
923 __m128i w = sse2_mm_srai_epi64(t, e, ve);
924 d = _mm_add_epi64(d, w);
925 _mm_store_si128((__m128i*)dp, d);
926 }
927 }
928 else if (a == -1)
929 { // any case with a == -1, which is not 5/3 predict
930 int i = (int)aug_width;
931 if (ev)
932 for (; i > 0; i -= 2, sp += 2, dp += 2)
933 {
934 __m128i s1 = _mm_load_si128((__m128i*)sp);
935 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
936 __m128i d = _mm_load_si128((__m128i*)dp);
937 __m128i t = _mm_add_epi64(s1, s2);
938 __m128i v = _mm_sub_epi64(vb, t);
939 __m128i w = sse2_mm_srai_epi64(v, e, ve);
940 d = _mm_sub_epi64(d, w);
941 _mm_store_si128((__m128i*)dp, d);
942 }
943 else
944 for (; i > 0; i -= 2, sp += 2, dp += 2)
945 {
946 __m128i s1 = _mm_load_si128((__m128i*)sp);
947 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
948 __m128i d = _mm_load_si128((__m128i*)dp);
949 __m128i t = _mm_add_epi64(s1, s2);
950 __m128i v = _mm_sub_epi64(vb, t);
951 __m128i w = sse2_mm_srai_epi64(v, e, ve);
952 d = _mm_sub_epi64(d, w);
953 _mm_store_si128((__m128i*)dp, d);
954 }
955 }
956 else {
957 // general case
958 // 64bit multiplication is not supported in sse2
959 if (ev)
960 for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
961 *dp -= (b + a * (sp[-1] + sp[0])) >> e;
962 else
963 for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
964 *dp -= (b + a * (sp[0] + sp[1])) >> e;
965 }
966
967 // swap buffers
968 si64* t = aug; aug = oth; oth = t;
969 ev = !ev;
970 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
971 }
972
973 // combine both lsrc and hsrc into dst
974 {
975 void* dp = dst->p;
976 const void* spl = even ? lsrc->p : hsrc->p;
977 const void* sph = even ? hsrc->p : lsrc->p;
978 int w = (int)width;
979 sse2_interleave64(dp, spl, sph, w);
980 }
981 }
982 else {
983 if (even)
984 dst->i64[0] = lsrc->i64[0];
985 else
986 dst->i64[0] = hsrc->i64[0] >> 1;
987 }
988 }
989
991 void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst,
992 const line_buf* lsrc, const line_buf* hsrc,
993 ui32 width, bool even)
994 {
995 if (dst->flags & line_buf::LFT_32BIT)
996 {
997 assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) &&
998 (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
999 sse2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
1000 }
1001 else
1002 {
1003 assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
1004 (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) &&
1005 (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
1006 sse2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
1007 }
1008 }
1009
1010 } // !local
1011} // !ojph
1012
1013#endif
void sse2_rev_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void sse2_rev_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void sse2_rev_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
int64_t si64
Definition ojph_defs.h:57
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50