OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_transform_vsx.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2021, Aous Naman
6// Copyright (c) 2021, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2021, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_transform_vsx.cpp
34// Author: Aous Naman
35// Date: 09 February 2021
36//***************************************************************************/
37
38#include <cstdio>
39#include "ojph_simd_vsx.h"
40
41#include "ojph_defs.h"
42#include "ojph_arch.h"
43#include "ojph_mem.h"
44#include "ojph_params.h"
46
47#include "ojph_transform.h"
49
50namespace ojph {
51 namespace local {
52
54 static inline
55 void vsx_deinterleave32(float* dpl, float* dph, float* sp, int width)
56 {
57 for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
58 {
59 v128_t a = vsx_v128_load(sp);
60 v128_t b = vsx_v128_load(sp + 4);
61 v128_t c = vsx_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2);
62 v128_t d = vsx_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3);
63 // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
64 // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
65 vsx_v128_store(dpl, c);
66 vsx_v128_store(dph, d);
67 }
68 }
69
71 static inline
72 void vsx_interleave32(float* dp, float* spl, float* sph, int width)
73 {
74 for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
75 {
76 v128_t a = vsx_v128_load(spl);
77 v128_t b = vsx_v128_load(sph);
78 v128_t c = vsx_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1);
79 v128_t d = vsx_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3);
80 // v128_t c = _mm_unpacklo_ps(a, b);
81 // v128_t d = _mm_unpackhi_ps(a, b);
82 vsx_v128_store(dp, c);
83 vsx_v128_store(dp + 4, d);
84 }
85 }
86
88 static inline
89 void vsx_deinterleave64(void* dpl, void* dph, const void* sp, int width)
90 {
91 for (; width > 0; width -= 4,
92 sp = (const char*)sp + 32,
93 dpl = (char*)dpl + 16,
94 dph = (char*)dph + 16)
95 {
96 v128_t a = vsx_v128_load(sp);
97 v128_t b = vsx_v128_load((const char*)sp + 16);
98 v128_t c = vsx_i64x2_shuffle(a, b, 0, 2 + 0);
99 v128_t d = vsx_i64x2_shuffle(a, b, 1, 2 + 1);
100 vsx_v128_store(dpl, c);
101 vsx_v128_store(dph, d);
102 }
103 }
104
106 static inline
107 void vsx_interleave64(void* dp, const void* spl, const void* sph,
108 int width)
109 {
110 for (; width > 0; width -= 4,
111 dp = (char*)dp + 32,
112 spl = (const char*)spl + 16,
113 sph = (const char*)sph + 16)
114 {
115 v128_t a = vsx_v128_load(spl);
116 v128_t b = vsx_v128_load(sph);
117 v128_t c = vsx_i64x2_shuffle(a, b, 0, 2 + 0);
118 v128_t d = vsx_i64x2_shuffle(a, b, 1, 2 + 1);
119 vsx_v128_store(dp, c);
120 vsx_v128_store((char*)dp + 16, d);
121 }
122 }
123
125 static inline void vsx_multiply_const(float* p, float f, int width)
126 {
127 v128_t factor = vsx_f32x4_splat(f);
128 for (; width > 0; width -= 4, p += 4)
129 {
130 v128_t s = vsx_v128_load(p);
131 vsx_v128_store(p, vsx_f32x4_mul(factor, s));
132 }
133 }
134
136 void vsx_irv_vert_step(const lifting_step* s, const line_buf* sig,
137 const line_buf* other, const line_buf* aug,
138 ui32 repeat, bool synthesis)
139 {
140 float a = s->irv.Aatk;
141 if (synthesis)
142 a = -a;
143
144 v128_t factor = vsx_f32x4_splat(a);
145
146 float* dst = aug->f32;
147 const float* src1 = sig->f32, * src2 = other->f32;
148 int i = (int)repeat;
149 for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
150 {
151 v128_t s1 = vsx_v128_load(src1);
152 v128_t s2 = vsx_v128_load(src2);
153 v128_t d = vsx_v128_load(dst);
154 d = vsx_f32x4_add(d, vsx_f32x4_mul(factor, vsx_f32x4_add(s1, s2)));
155 vsx_v128_store(dst, d);
156 }
157 }
158
160 void vsx_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
161 {
162 vsx_multiply_const(aug->f32, K, (int)repeat);
163 }
164
166 void vsx_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
167 const line_buf* hdst, const line_buf* src,
168 ui32 width, bool even)
169 {
170 if (width > 1)
171 {
172 // split src into ldst and hdst
173 {
174 float* dpl = even ? ldst->f32 : hdst->f32;
175 float* dph = even ? hdst->f32 : ldst->f32;
176 float* sp = src->f32;
177 int w = (int)width;
178 vsx_deinterleave32(dpl, dph, sp, w);
179 }
180
181 // the actual horizontal transform
182 float* hp = hdst->f32, * lp = ldst->f32;
183 ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
184 ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
185 ui32 num_steps = atk->get_num_steps();
186 for (ui32 j = num_steps; j > 0; --j)
187 {
188 const lifting_step* s = atk->get_step(j - 1);
189 const float a = s->irv.Aatk;
190
191 // extension
192 lp[-1] = lp[0];
193 lp[l_width] = lp[l_width - 1];
194 // lifting step
195 const float* sp = lp;
196 float* dp = hp;
197 int i = (int)h_width;
198 v128_t f = vsx_f32x4_splat(a);
199 if (even)
200 {
201 for (; i > 0; i -= 4, sp += 4, dp += 4)
202 {
203 v128_t m = vsx_v128_load(sp);
204 v128_t n = vsx_v128_load(sp + 1);
205 v128_t p = vsx_v128_load(dp);
206 p = vsx_f32x4_add(p, vsx_f32x4_mul(f, vsx_f32x4_add(m, n)));
207 vsx_v128_store(dp, p);
208 }
209 }
210 else
211 {
212 for (; i > 0; i -= 4, sp += 4, dp += 4)
213 {
214 v128_t m = vsx_v128_load(sp);
215 v128_t n = vsx_v128_load(sp - 1);
216 v128_t p = vsx_v128_load(dp);
217 p = vsx_f32x4_add(p, vsx_f32x4_mul(f, vsx_f32x4_add(m, n)));
218 vsx_v128_store(dp, p);
219 }
220 }
221
222 // swap buffers
223 float* t = lp; lp = hp; hp = t;
224 even = !even;
225 ui32 w = l_width; l_width = h_width; h_width = w;
226 }
227
228 { // multiply by K or 1/K
229 float K = atk->get_K();
230 float K_inv = 1.0f / K;
231 vsx_multiply_const(lp, K_inv, (int)l_width);
232 vsx_multiply_const(hp, K, (int)h_width);
233 }
234 }
235 else {
236 if (even)
237 ldst->f32[0] = src->f32[0];
238 else
239 hdst->f32[0] = src->f32[0] * 2.0f;
240 }
241 }
242
244 void vsx_irv_horz_syn(const param_atk* atk, const line_buf* dst,
245 const line_buf* lsrc, const line_buf* hsrc,
246 ui32 width, bool even)
247 {
248 if (width > 1)
249 {
250 bool ev = even;
251 float* oth = hsrc->f32, * aug = lsrc->f32;
252 ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
253 ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
254
255 { // multiply by K or 1/K
256 float K = atk->get_K();
257 float K_inv = 1.0f / K;
258 vsx_multiply_const(aug, K, (int)aug_width);
259 vsx_multiply_const(oth, K_inv, (int)oth_width);
260 }
261
262 // the actual horizontal transform
263 ui32 num_steps = atk->get_num_steps();
264 for (ui32 j = 0; j < num_steps; ++j)
265 {
266 const lifting_step* s = atk->get_step(j);
267 const float a = s->irv.Aatk;
268
269 // extension
270 oth[-1] = oth[0];
271 oth[oth_width] = oth[oth_width - 1];
272 // lifting step
273 const float* sp = oth;
274 float* dp = aug;
275 int i = (int)aug_width;
276 v128_t f = vsx_f32x4_splat(a);
277 if (ev)
278 {
279 for ( ; i > 0; i -= 4, sp += 4, dp += 4)
280 {
281 v128_t m = vsx_v128_load(sp);
282 v128_t n = vsx_v128_load(sp - 1);
283 v128_t p = vsx_v128_load(dp);
284 p = vsx_f32x4_sub(p, vsx_f32x4_mul(f, vsx_f32x4_add(m, n)));
285 vsx_v128_store(dp, p);
286 }
287 }
288 else
289 {
290 for ( ; i > 0; i -= 4, sp += 4, dp += 4)
291 {
292 v128_t m = vsx_v128_load(sp);
293 v128_t n = vsx_v128_load(sp + 1);
294 v128_t p = vsx_v128_load(dp);
295 p = vsx_f32x4_sub(p, vsx_f32x4_mul(f, vsx_f32x4_add(m, n)));
296 vsx_v128_store(dp, p);
297 }
298 }
299
300 // swap buffers
301 float* t = aug; aug = oth; oth = t;
302 ev = !ev;
303 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
304 }
305
306 // combine both lsrc and hsrc into dst
307 {
308 float* dp = dst->f32;
309 float* spl = even ? lsrc->f32 : hsrc->f32;
310 float* sph = even ? hsrc->f32 : lsrc->f32;
311 int w = (int)width;
312 vsx_interleave32(dp, spl, sph, w);
313 }
314 }
315 else {
316 if (even)
317 dst->f32[0] = lsrc->f32[0];
318 else
319 dst->f32[0] = hsrc->f32[0] * 0.5f;
320 }
321 }
322
324 void vsx_rev_vert_step32(const lifting_step* s, const line_buf* sig,
325 const line_buf* other, const line_buf* aug,
326 ui32 repeat, bool synthesis)
327 {
328 const si32 a = s->rev.Aatk;
329 const si32 b = s->rev.Batk;
330 const ui8 e = s->rev.Eatk;
331 v128_t va = vsx_i32x4_splat(a);
332 v128_t vb = vsx_i32x4_splat(b);
333
334 si32* dst = aug->i32;
335 const si32* src1 = sig->i32, * src2 = other->i32;
336 // The general definition of the wavelet in Part 2 is slightly
337 // different to part 2, although they are mathematically equivalent
338 // here, we identify the simpler form from Part 1 and employ them
339 if (a == 1)
340 { // 5/3 update and any case with a == 1
341 int i = (int)repeat;
342 if (synthesis)
343 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
344 {
345 v128_t s1 = vsx_v128_load((v128_t*)src1);
346 v128_t s2 = vsx_v128_load((v128_t*)src2);
347 v128_t d = vsx_v128_load((v128_t*)dst);
348 v128_t t = vsx_i32x4_add(s1, s2);
349 v128_t v = vsx_i32x4_add(vb, t);
350 v128_t w = vsx_i32x4_shr(v, e);
351 d = vsx_i32x4_sub(d, w);
352 vsx_v128_store((v128_t*)dst, d);
353 }
354 else
355 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
356 {
357 v128_t s1 = vsx_v128_load((v128_t*)src1);
358 v128_t s2 = vsx_v128_load((v128_t*)src2);
359 v128_t d = vsx_v128_load((v128_t*)dst);
360 v128_t t = vsx_i32x4_add(s1, s2);
361 v128_t v = vsx_i32x4_add(vb, t);
362 v128_t w = vsx_i32x4_shr(v, e);
363 d = vsx_i32x4_add(d, w);
364 vsx_v128_store((v128_t*)dst, d);
365 }
366 }
367 else if (a == -1 && b == 1 && e == 1)
368 { // 5/3 predict
369 int i = (int)repeat;
370 if (synthesis)
371 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
372 {
373 v128_t s1 = vsx_v128_load((v128_t*)src1);
374 v128_t s2 = vsx_v128_load((v128_t*)src2);
375 v128_t d = vsx_v128_load((v128_t*)dst);
376 v128_t t = vsx_i32x4_add(s1, s2);
377 v128_t w = vsx_i32x4_shr(t, e);
378 d = vsx_i32x4_add(d, w);
379 vsx_v128_store((v128_t*)dst, d);
380 }
381 else
382 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
383 {
384 v128_t s1 = vsx_v128_load((v128_t*)src1);
385 v128_t s2 = vsx_v128_load((v128_t*)src2);
386 v128_t d = vsx_v128_load((v128_t*)dst);
387 v128_t t = vsx_i32x4_add(s1, s2);
388 v128_t w = vsx_i32x4_shr(t, e);
389 d = vsx_i32x4_sub(d, w);
390 vsx_v128_store((v128_t*)dst, d);
391 }
392 }
393 else if (a == -1)
394 { // any case with a == -1, which is not 5/3 predict
395 int i = (int)repeat;
396 if (synthesis)
397 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
398 {
399 v128_t s1 = vsx_v128_load((v128_t*)src1);
400 v128_t s2 = vsx_v128_load((v128_t*)src2);
401 v128_t d = vsx_v128_load((v128_t*)dst);
402 v128_t t = vsx_i32x4_add(s1, s2);
403 v128_t v = vsx_i32x4_sub(vb, t);
404 v128_t w = vsx_i32x4_shr(v, e);
405 d = vsx_i32x4_sub(d, w);
406 vsx_v128_store((v128_t*)dst, d);
407 }
408 else
409 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
410 {
411 v128_t s1 = vsx_v128_load((v128_t*)src1);
412 v128_t s2 = vsx_v128_load((v128_t*)src2);
413 v128_t d = vsx_v128_load((v128_t*)dst);
414 v128_t t = vsx_i32x4_add(s1, s2);
415 v128_t v = vsx_i32x4_sub(vb, t);
416 v128_t w = vsx_i32x4_shr(v, e);
417 d = vsx_i32x4_add(d, w);
418 vsx_v128_store((v128_t*)dst, d);
419 }
420 }
421 else
422 { // general case
423 int i = (int)repeat;
424 if (synthesis)
425 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
426 {
427 v128_t s1 = vsx_v128_load((v128_t*)src1);
428 v128_t s2 = vsx_v128_load((v128_t*)src2);
429 v128_t d = vsx_v128_load((v128_t*)dst);
430 v128_t t = vsx_i32x4_add(s1, s2);
431 v128_t u = vsx_i32x4_mul(va, t);
432 v128_t v = vsx_i32x4_add(vb, u);
433 v128_t w = vsx_i32x4_shr(v, e);
434 d = vsx_i32x4_sub(d, w);
435 vsx_v128_store((v128_t*)dst, d);
436 }
437 else
438 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
439 {
440 v128_t s1 = vsx_v128_load((v128_t*)src1);
441 v128_t s2 = vsx_v128_load((v128_t*)src2);
442 v128_t d = vsx_v128_load((v128_t*)dst);
443 v128_t t = vsx_i32x4_add(s1, s2);
444 v128_t u = vsx_i32x4_mul(va, t);
445 v128_t v = vsx_i32x4_add(vb, u);
446 v128_t w = vsx_i32x4_shr(v, e);
447 d = vsx_i32x4_add(d, w);
448 vsx_v128_store((v128_t*)dst, d);
449 }
450 }
451 }
452
454 void vsx_rev_vert_step64(const lifting_step* s, const line_buf* sig,
455 const line_buf* other, const line_buf* aug,
456 ui32 repeat, bool synthesis)
457 {
458 const si32 a = s->rev.Aatk;
459 const si32 b = s->rev.Batk;
460 const ui8 e = s->rev.Eatk;
461 v128_t va = vsx_i64x2_splat(a);
462 v128_t vb = vsx_i64x2_splat(b);
463
464 si64* dst = aug->i64;
465 const si64* src1 = sig->i64, * src2 = other->i64;
466 // The general definition of the wavelet in Part 2 is slightly
467 // different to part 2, although they are mathematically equivalent
468 // here, we identify the simpler form from Part 1 and employ them
469 if (a == 1)
470 { // 5/3 update and any case with a == 1
471 int i = (int)repeat;
472 if (synthesis)
473 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
474 {
475 v128_t s1 = vsx_v128_load((v128_t*)src1);
476 v128_t s2 = vsx_v128_load((v128_t*)src2);
477 v128_t d = vsx_v128_load((v128_t*)dst);
478 v128_t t = vsx_i64x2_add(s1, s2);
479 v128_t v = vsx_i64x2_add(vb, t);
480 v128_t w = vsx_i64x2_shr(v, e);
481 d = vsx_i64x2_sub(d, w);
482 vsx_v128_store((v128_t*)dst, d);
483 }
484 else
485 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
486 {
487 v128_t s1 = vsx_v128_load((v128_t*)src1);
488 v128_t s2 = vsx_v128_load((v128_t*)src2);
489 v128_t d = vsx_v128_load((v128_t*)dst);
490 v128_t t = vsx_i64x2_add(s1, s2);
491 v128_t v = vsx_i64x2_add(vb, t);
492 v128_t w = vsx_i64x2_shr(v, e);
493 d = vsx_i64x2_add(d, w);
494 vsx_v128_store((v128_t*)dst, d);
495 }
496 }
497 else if (a == -1 && b == 1 && e == 1)
498 { // 5/3 predict
499 int i = (int)repeat;
500 if (synthesis)
501 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
502 {
503 v128_t s1 = vsx_v128_load((v128_t*)src1);
504 v128_t s2 = vsx_v128_load((v128_t*)src2);
505 v128_t d = vsx_v128_load((v128_t*)dst);
506 v128_t t = vsx_i64x2_add(s1, s2);
507 v128_t w = vsx_i64x2_shr(t, e);
508 d = vsx_i64x2_add(d, w);
509 vsx_v128_store((v128_t*)dst, d);
510 }
511 else
512 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
513 {
514 v128_t s1 = vsx_v128_load((v128_t*)src1);
515 v128_t s2 = vsx_v128_load((v128_t*)src2);
516 v128_t d = vsx_v128_load((v128_t*)dst);
517 v128_t t = vsx_i64x2_add(s1, s2);
518 v128_t w = vsx_i64x2_shr(t, e);
519 d = vsx_i64x2_sub(d, w);
520 vsx_v128_store((v128_t*)dst, d);
521 }
522 }
523 else if (a == -1)
524 { // any case with a == -1, which is not 5/3 predict
525 int i = (int)repeat;
526 if (synthesis)
527 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
528 {
529 v128_t s1 = vsx_v128_load((v128_t*)src1);
530 v128_t s2 = vsx_v128_load((v128_t*)src2);
531 v128_t d = vsx_v128_load((v128_t*)dst);
532 v128_t t = vsx_i64x2_add(s1, s2);
533 v128_t v = vsx_i64x2_sub(vb, t);
534 v128_t w = vsx_i64x2_shr(v, e);
535 d = vsx_i64x2_sub(d, w);
536 vsx_v128_store((v128_t*)dst, d);
537 }
538 else
539 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
540 {
541 v128_t s1 = vsx_v128_load((v128_t*)src1);
542 v128_t s2 = vsx_v128_load((v128_t*)src2);
543 v128_t d = vsx_v128_load((v128_t*)dst);
544 v128_t t = vsx_i64x2_add(s1, s2);
545 v128_t v = vsx_i64x2_sub(vb, t);
546 v128_t w = vsx_i64x2_shr(v, e);
547 d = vsx_i64x2_add(d, w);
548 vsx_v128_store((v128_t*)dst, d);
549 }
550 }
551 else
552 { // general case
553 int i = (int)repeat;
554 if (synthesis)
555 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
556 {
557 v128_t s1 = vsx_v128_load((v128_t*)src1);
558 v128_t s2 = vsx_v128_load((v128_t*)src2);
559 v128_t d = vsx_v128_load((v128_t*)dst);
560 v128_t t = vsx_i64x2_add(s1, s2);
561 v128_t u = vsx_i64x2_mul(va, t);
562 v128_t v = vsx_i64x2_add(vb, u);
563 v128_t w = vsx_i64x2_shr(v, e);
564 d = vsx_i64x2_sub(d, w);
565 vsx_v128_store((v128_t*)dst, d);
566 }
567 else
568 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
569 {
570 v128_t s1 = vsx_v128_load((v128_t*)src1);
571 v128_t s2 = vsx_v128_load((v128_t*)src2);
572 v128_t d = vsx_v128_load((v128_t*)dst);
573 v128_t t = vsx_i64x2_add(s1, s2);
574 v128_t u = vsx_i64x2_mul(va, t);
575 v128_t v = vsx_i64x2_add(vb, u);
576 v128_t w = vsx_i64x2_shr(v, e);
577 d = vsx_i64x2_add(d, w);
578 vsx_v128_store((v128_t*)dst, d);
579 }
580 }
581 }
582
584 void vsx_rev_vert_step(const lifting_step* s, const line_buf* sig,
585 const line_buf* other, const line_buf* aug,
586 ui32 repeat, bool synthesis)
587 {
588 if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) ||
589 ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
590 ((other != NULL) && (other->flags & line_buf::LFT_32BIT)))
591 {
592 assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
593 (other == NULL || other->flags & line_buf::LFT_32BIT) &&
594 (aug == NULL || aug->flags & line_buf::LFT_32BIT));
595 vsx_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
596 }
597 else
598 {
599 assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
600 (other == NULL || other->flags & line_buf::LFT_64BIT) &&
601 (aug == NULL || aug->flags & line_buf::LFT_64BIT));
602 vsx_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
603 }
604 }
605
607 static
608 void vsx_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
609 const line_buf* hdst, const line_buf* src,
610 ui32 width, bool even)
611 {
612 if (width > 1)
613 {
614 // combine both lsrc and hsrc into dst
615 {
616 float* dpl = even ? ldst->f32 : hdst->f32;
617 float* dph = even ? hdst->f32 : ldst->f32;
618 float* sp = src->f32;
619 int w = (int)width;
620 vsx_deinterleave32(dpl, dph, sp, w);
621 }
622
623 si32* hp = hdst->i32, * lp = ldst->i32;
624 ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
625 ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
626 ui32 num_steps = atk->get_num_steps();
627 for (ui32 j = num_steps; j > 0; --j)
628 {
629 // first lifting step
630 const lifting_step* s = atk->get_step(j - 1);
631 const si32 a = s->rev.Aatk;
632 const si32 b = s->rev.Batk;
633 const ui8 e = s->rev.Eatk;
634 v128_t va = vsx_i32x4_splat(a);
635 v128_t vb = vsx_i32x4_splat(b);
636
637 // extension
638 lp[-1] = lp[0];
639 lp[l_width] = lp[l_width - 1];
640 // lifting step
641 const si32* sp = lp;
642 si32* dp = hp;
643 if (a == 1)
644 { // 5/3 update and any case with a == 1
645 int i = (int)h_width;
646 if (even)
647 {
648 for (; i > 0; i -= 4, sp += 4, dp += 4)
649 {
650 v128_t s1 = vsx_v128_load((v128_t*)sp);
651 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
652 v128_t d = vsx_v128_load((v128_t*)dp);
653 v128_t t = vsx_i32x4_add(s1, s2);
654 v128_t v = vsx_i32x4_add(vb, t);
655 v128_t w = vsx_i32x4_shr(v, e);
656 d = vsx_i32x4_add(d, w);
657 vsx_v128_store((v128_t*)dp, d);
658 }
659 }
660 else
661 {
662 for (; i > 0; i -= 4, sp += 4, dp += 4)
663 {
664 v128_t s1 = vsx_v128_load((v128_t*)sp);
665 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
666 v128_t d = vsx_v128_load((v128_t*)dp);
667 v128_t t = vsx_i32x4_add(s1, s2);
668 v128_t v = vsx_i32x4_add(vb, t);
669 v128_t w = vsx_i32x4_shr(v, e);
670 d = vsx_i32x4_add(d, w);
671 vsx_v128_store((v128_t*)dp, d);
672 }
673 }
674 }
675 else if (a == -1 && b == 1 && e == 1)
676 { // 5/3 predict
677 int i = (int)h_width;
678 if (even)
679 for (; i > 0; i -= 4, sp += 4, dp += 4)
680 {
681 v128_t s1 = vsx_v128_load((v128_t*)sp);
682 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
683 v128_t d = vsx_v128_load((v128_t*)dp);
684 v128_t t = vsx_i32x4_add(s1, s2);
685 v128_t w = vsx_i32x4_shr(t, e);
686 d = vsx_i32x4_sub(d, w);
687 vsx_v128_store((v128_t*)dp, d);
688 }
689 else
690 for (; i > 0; i -= 4, sp += 4, dp += 4)
691 {
692 v128_t s1 = vsx_v128_load((v128_t*)sp);
693 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
694 v128_t d = vsx_v128_load((v128_t*)dp);
695 v128_t t = vsx_i32x4_add(s1, s2);
696 v128_t w = vsx_i32x4_shr(t, e);
697 d = vsx_i32x4_sub(d, w);
698 vsx_v128_store((v128_t*)dp, d);
699 }
700 }
701 else if (a == -1)
702 { // any case with a == -1, which is not 5/3 predict
703 int i = (int)h_width;
704 if (even)
705 for (; i > 0; i -= 4, sp += 4, dp += 4)
706 {
707 v128_t s1 = vsx_v128_load((v128_t*)sp);
708 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
709 v128_t d = vsx_v128_load((v128_t*)dp);
710 v128_t t = vsx_i32x4_add(s1, s2);
711 v128_t v = vsx_i32x4_sub(vb, t);
712 v128_t w = vsx_i32x4_shr(v, e);
713 d = vsx_i32x4_add(d, w);
714 vsx_v128_store((v128_t*)dp, d);
715 }
716 else
717 for (; i > 0; i -= 4, sp += 4, dp += 4)
718 {
719 v128_t s1 = vsx_v128_load((v128_t*)sp);
720 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
721 v128_t d = vsx_v128_load((v128_t*)dp);
722 v128_t t = vsx_i32x4_add(s1, s2);
723 v128_t v = vsx_i32x4_sub(vb, t);
724 v128_t w = vsx_i32x4_shr(v, e);
725 d = vsx_i32x4_add(d, w);
726 vsx_v128_store((v128_t*)dp, d);
727 }
728 }
729 else
730 { // general case
731 int i = (int)h_width;
732 if (even)
733 for (; i > 0; i -= 4, sp += 4, dp += 4)
734 {
735 v128_t s1 = vsx_v128_load((v128_t*)sp);
736 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
737 v128_t d = vsx_v128_load((v128_t*)dp);
738 v128_t t = vsx_i32x4_add(s1, s2);
739 v128_t u = vsx_i32x4_mul(va, t);
740 v128_t v = vsx_i32x4_add(vb, u);
741 v128_t w = vsx_i32x4_shr(v, e);
742 d = vsx_i32x4_add(d, w);
743 vsx_v128_store((v128_t*)dp, d);
744 }
745 else
746 for (; i > 0; i -= 4, sp += 4, dp += 4)
747 {
748 v128_t s1 = vsx_v128_load((v128_t*)sp);
749 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
750 v128_t d = vsx_v128_load((v128_t*)dp);
751 v128_t t = vsx_i32x4_add(s1, s2);
752 v128_t u = vsx_i32x4_mul(va, t);
753 v128_t v = vsx_i32x4_add(vb, u);
754 v128_t w = vsx_i32x4_shr(v, e);
755 d = vsx_i32x4_add(d, w);
756 vsx_v128_store((v128_t*)dp, d);
757 }
758 }
759
760 // swap buffers
761 si32* t = lp; lp = hp; hp = t;
762 even = !even;
763 ui32 w = l_width; l_width = h_width; h_width = w;
764 }
765 }
766 else {
767 if (even)
768 ldst->i32[0] = src->i32[0];
769 else
770 hdst->i32[0] = src->i32[0] << 1;
771 }
772 }
773
775 static
776 void vsx_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
777 const line_buf* hdst, const line_buf* src,
778 ui32 width, bool even)
779 {
780 if (width > 1)
781 {
782 // combine both lsrc and hsrc into dst
783 {
784 void* dpl = even ? ldst->p : hdst->p;
785 void* dph = even ? hdst->p : ldst->p;
786 const void* sp = src->p;
787 int w = (int)width;
788 vsx_deinterleave64(dpl, dph, sp, w);
789 }
790
791 si64* hp = hdst->i64, * lp = ldst->i64;
792 ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
793 ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
794 ui32 num_steps = atk->get_num_steps();
795 for (ui32 j = num_steps; j > 0; --j)
796 {
797 // first lifting step
798 const lifting_step* s = atk->get_step(j - 1);
799 const si32 a = s->rev.Aatk;
800 const si32 b = s->rev.Batk;
801 const ui8 e = s->rev.Eatk;
802 v128_t va = vsx_i64x2_splat(a);
803 v128_t vb = vsx_i64x2_splat(b);
804
805 // extension
806 lp[-1] = lp[0];
807 lp[l_width] = lp[l_width - 1];
808 // lifting step
809 const si64* sp = lp;
810 si64* dp = hp;
811 if (a == 1)
812 { // 5/3 update and any case with a == 1
813 int i = (int)h_width;
814 if (even)
815 {
816 for (; i > 0; i -= 2, sp += 2, dp += 2)
817 {
818 v128_t s1 = vsx_v128_load((v128_t*)sp);
819 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
820 v128_t d = vsx_v128_load((v128_t*)dp);
821 v128_t t = vsx_i64x2_add(s1, s2);
822 v128_t v = vsx_i64x2_add(vb, t);
823 v128_t w = vsx_i64x2_shr(v, e);
824 d = vsx_i64x2_add(d, w);
825 vsx_v128_store((v128_t*)dp, d);
826 }
827 }
828 else
829 {
830 for (; i > 0; i -= 2, sp += 2, dp += 2)
831 {
832 v128_t s1 = vsx_v128_load((v128_t*)sp);
833 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
834 v128_t d = vsx_v128_load((v128_t*)dp);
835 v128_t t = vsx_i64x2_add(s1, s2);
836 v128_t v = vsx_i64x2_add(vb, t);
837 v128_t w = vsx_i64x2_shr(v, e);
838 d = vsx_i64x2_add(d, w);
839 vsx_v128_store((v128_t*)dp, d);
840 }
841 }
842 }
843 else if (a == -1 && b == 1 && e == 1)
844 { // 5/3 predict
845 int i = (int)h_width;
846 if (even)
847 for (; i > 0; i -= 2, sp += 2, dp += 2)
848 {
849 v128_t s1 = vsx_v128_load((v128_t*)sp);
850 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
851 v128_t d = vsx_v128_load((v128_t*)dp);
852 v128_t t = vsx_i64x2_add(s1, s2);
853 v128_t w = vsx_i64x2_shr(t, e);
854 d = vsx_i64x2_sub(d, w);
855 vsx_v128_store((v128_t*)dp, d);
856 }
857 else
858 for (; i > 0; i -= 2, sp += 2, dp += 2)
859 {
860 v128_t s1 = vsx_v128_load((v128_t*)sp);
861 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
862 v128_t d = vsx_v128_load((v128_t*)dp);
863 v128_t t = vsx_i64x2_add(s1, s2);
864 v128_t w = vsx_i64x2_shr(t, e);
865 d = vsx_i64x2_sub(d, w);
866 vsx_v128_store((v128_t*)dp, d);
867 }
868 }
869 else if (a == -1)
870 { // any case with a == -1, which is not 5/3 predict
871 int i = (int)h_width;
872 if (even)
873 for (; i > 0; i -= 2, sp += 2, dp += 2)
874 {
875 v128_t s1 = vsx_v128_load((v128_t*)sp);
876 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
877 v128_t d = vsx_v128_load((v128_t*)dp);
878 v128_t t = vsx_i64x2_add(s1, s2);
879 v128_t v = vsx_i64x2_sub(vb, t);
880 v128_t w = vsx_i64x2_shr(v, e);
881 d = vsx_i64x2_add(d, w);
882 vsx_v128_store((v128_t*)dp, d);
883 }
884 else
885 for (; i > 0; i -= 2, sp += 2, dp += 2)
886 {
887 v128_t s1 = vsx_v128_load((v128_t*)sp);
888 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
889 v128_t d = vsx_v128_load((v128_t*)dp);
890 v128_t t = vsx_i64x2_add(s1, s2);
891 v128_t v = vsx_i64x2_sub(vb, t);
892 v128_t w = vsx_i64x2_shr(v, e);
893 d = vsx_i64x2_add(d, w);
894 vsx_v128_store((v128_t*)dp, d);
895 }
896 }
897 else
898 { // general case
899 int i = (int)h_width;
900 if (even)
901 for (; i > 0; i -= 2, sp += 2, dp += 2)
902 {
903 v128_t s1 = vsx_v128_load((v128_t*)sp);
904 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
905 v128_t d = vsx_v128_load((v128_t*)dp);
906 v128_t t = vsx_i64x2_add(s1, s2);
907 v128_t u = vsx_i64x2_mul(va, t);
908 v128_t v = vsx_i64x2_add(vb, u);
909 v128_t w = vsx_i64x2_shr(v, e);
910 d = vsx_i64x2_add(d, w);
911 vsx_v128_store((v128_t*)dp, d);
912 }
913 else
914 for (; i > 0; i -= 2, sp += 2, dp += 2)
915 {
916 v128_t s1 = vsx_v128_load((v128_t*)sp);
917 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
918 v128_t d = vsx_v128_load((v128_t*)dp);
919 v128_t t = vsx_i64x2_add(s1, s2);
920 v128_t u = vsx_i64x2_mul(va, t);
921 v128_t v = vsx_i64x2_add(vb, u);
922 v128_t w = vsx_i64x2_shr(v, e);
923 d = vsx_i64x2_add(d, w);
924 vsx_v128_store((v128_t*)dp, d);
925 }
926 }
927
928 // swap buffers
929 si64* t = lp; lp = hp; hp = t;
930 even = !even;
931 ui32 w = l_width; l_width = h_width; h_width = w;
932 }
933 }
934 else {
935 if (even)
936 ldst->i64[0] = src->i64[0];
937 else
938 hdst->i64[0] = src->i64[0] << 1;
939 }
940 }
941
943 void vsx_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
944 const line_buf* hdst, const line_buf* src,
945 ui32 width, bool even)
946 {
947 if (src->flags & line_buf::LFT_32BIT)
948 {
949 assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) &&
950 (hdst == NULL || hdst->flags & line_buf::LFT_32BIT));
951 vsx_rev_horz_ana32(atk, ldst, hdst, src, width, even);
952 }
953 else
954 {
955 assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) &&
956 (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) &&
957 (src == NULL || src->flags & line_buf::LFT_64BIT));
958 vsx_rev_horz_ana64(atk, ldst, hdst, src, width, even);
959 }
960 }
961
963 void vsx_rev_horz_syn32(const param_atk* atk, const line_buf* dst,
964 const line_buf* lsrc, const line_buf* hsrc,
965 ui32 width, bool even)
966 {
967 if (width > 1)
968 {
969 bool ev = even;
970 si32* oth = hsrc->i32, * aug = lsrc->i32;
971 ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
972 ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
973 ui32 num_steps = atk->get_num_steps();
974 for (ui32 j = 0; j < num_steps; ++j)
975 {
976 const lifting_step* s = atk->get_step(j);
977 const si32 a = s->rev.Aatk;
978 const si32 b = s->rev.Batk;
979 const ui8 e = s->rev.Eatk;
980 v128_t va = vsx_i32x4_splat(a);
981 v128_t vb = vsx_i32x4_splat(b);
982
983 // extension
984 oth[-1] = oth[0];
985 oth[oth_width] = oth[oth_width - 1];
986 // lifting step
987 const si32* sp = oth;
988 si32* dp = aug;
989 if (a == 1)
990 { // 5/3 update and any case with a == 1
991 int i = (int)aug_width;
992 if (ev)
993 {
994 for (; i > 0; i -= 4, sp += 4, dp += 4)
995 {
996 v128_t s1 = vsx_v128_load((v128_t*)sp);
997 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
998 v128_t d = vsx_v128_load((v128_t*)dp);
999 v128_t t = vsx_i32x4_add(s1, s2);
1000 v128_t v = vsx_i32x4_add(vb, t);
1001 v128_t w = vsx_i32x4_shr(v, e);
1002 d = vsx_i32x4_sub(d, w);
1003 vsx_v128_store((v128_t*)dp, d);
1004 }
1005 }
1006 else
1007 {
1008 for (; i > 0; i -= 4, sp += 4, dp += 4)
1009 {
1010 v128_t s1 = vsx_v128_load((v128_t*)sp);
1011 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
1012 v128_t d = vsx_v128_load((v128_t*)dp);
1013 v128_t t = vsx_i32x4_add(s1, s2);
1014 v128_t v = vsx_i32x4_add(vb, t);
1015 v128_t w = vsx_i32x4_shr(v, e);
1016 d = vsx_i32x4_sub(d, w);
1017 vsx_v128_store((v128_t*)dp, d);
1018 }
1019 }
1020 }
1021 else if (a == -1 && b == 1 && e == 1)
1022 { // 5/3 predict
1023 int i = (int)aug_width;
1024 if (ev)
1025 for (; i > 0; i -= 4, sp += 4, dp += 4)
1026 {
1027 v128_t s1 = vsx_v128_load((v128_t*)sp);
1028 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
1029 v128_t d = vsx_v128_load((v128_t*)dp);
1030 v128_t t = vsx_i32x4_add(s1, s2);
1031 v128_t w = vsx_i32x4_shr(t, e);
1032 d = vsx_i32x4_add(d, w);
1033 vsx_v128_store((v128_t*)dp, d);
1034 }
1035 else
1036 for (; i > 0; i -= 4, sp += 4, dp += 4)
1037 {
1038 v128_t s1 = vsx_v128_load((v128_t*)sp);
1039 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
1040 v128_t d = vsx_v128_load((v128_t*)dp);
1041 v128_t t = vsx_i32x4_add(s1, s2);
1042 v128_t w = vsx_i32x4_shr(t, e);
1043 d = vsx_i32x4_add(d, w);
1044 vsx_v128_store((v128_t*)dp, d);
1045 }
1046 }
1047 else if (a == -1)
1048 { // any case with a == -1, which is not 5/3 predict
1049 int i = (int)aug_width;
1050 if (ev)
1051 for (; i > 0; i -= 4, sp += 4, dp += 4)
1052 {
1053 v128_t s1 = vsx_v128_load((v128_t*)sp);
1054 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
1055 v128_t d = vsx_v128_load((v128_t*)dp);
1056 v128_t t = vsx_i32x4_add(s1, s2);
1057 v128_t v = vsx_i32x4_sub(vb, t);
1058 v128_t w = vsx_i32x4_shr(v, e);
1059 d = vsx_i32x4_sub(d, w);
1060 vsx_v128_store((v128_t*)dp, d);
1061 }
1062 else
1063 for (; i > 0; i -= 4, sp += 4, dp += 4)
1064 {
1065 v128_t s1 = vsx_v128_load((v128_t*)sp);
1066 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
1067 v128_t d = vsx_v128_load((v128_t*)dp);
1068 v128_t t = vsx_i32x4_add(s1, s2);
1069 v128_t v = vsx_i32x4_sub(vb, t);
1070 v128_t w = vsx_i32x4_shr(v, e);
1071 d = vsx_i32x4_sub(d, w);
1072 vsx_v128_store((v128_t*)dp, d);
1073 }
1074 }
1075 else
1076 { // general case
1077 int i = (int)aug_width;
1078 if (ev)
1079 for (; i > 0; i -= 4, sp += 4, dp += 4)
1080 {
1081 v128_t s1 = vsx_v128_load((v128_t*)sp);
1082 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
1083 v128_t d = vsx_v128_load((v128_t*)dp);
1084 v128_t t = vsx_i32x4_add(s1, s2);
1085 v128_t u = vsx_i32x4_mul(va, t);
1086 v128_t v = vsx_i32x4_add(vb, u);
1087 v128_t w = vsx_i32x4_shr(v, e);
1088 d = vsx_i32x4_sub(d, w);
1089 vsx_v128_store((v128_t*)dp, d);
1090 }
1091 else
1092 for (; i > 0; i -= 4, sp += 4, dp += 4)
1093 {
1094 v128_t s1 = vsx_v128_load((v128_t*)sp);
1095 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
1096 v128_t d = vsx_v128_load((v128_t*)dp);
1097 v128_t t = vsx_i32x4_add(s1, s2);
1098 v128_t u = vsx_i32x4_mul(va, t);
1099 v128_t v = vsx_i32x4_add(vb, u);
1100 v128_t w = vsx_i32x4_shr(v, e);
1101 d = vsx_i32x4_sub(d, w);
1102 vsx_v128_store((v128_t*)dp, d);
1103 }
1104 }
1105
1106 // swap buffers
1107 si32* t = aug; aug = oth; oth = t;
1108 ev = !ev;
1109 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
1110 }
1111
1112 // combine both lsrc and hsrc into dst
1113 {
1114 float* dp = dst->f32;
1115 float* spl = even ? lsrc->f32 : hsrc->f32;
1116 float* sph = even ? hsrc->f32 : lsrc->f32;
1117 int w = (int)width;
1118 vsx_interleave32(dp, spl, sph, w);
1119 }
1120 }
1121 else {
1122 if (even)
1123 dst->i32[0] = lsrc->i32[0];
1124 else
1125 dst->i32[0] = hsrc->i32[0] >> 1;
1126 }
1127 }
1128
1130 void vsx_rev_horz_syn64(const param_atk* atk, const line_buf* dst,
1131 const line_buf* lsrc, const line_buf* hsrc,
1132 ui32 width, bool even)
1133 {
1134 if (width > 1)
1135 {
1136 bool ev = even;
1137 si64* oth = hsrc->i64, * aug = lsrc->i64;
1138 ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
1139 ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
1140 ui32 num_steps = atk->get_num_steps();
1141 for (ui32 j = 0; j < num_steps; ++j)
1142 {
1143 const lifting_step* s = atk->get_step(j);
1144 const si32 a = s->rev.Aatk;
1145 const si32 b = s->rev.Batk;
1146 const ui8 e = s->rev.Eatk;
1147 v128_t va = vsx_i64x2_splat(a);
1148 v128_t vb = vsx_i64x2_splat(b);
1149
1150 // extension
1151 oth[-1] = oth[0];
1152 oth[oth_width] = oth[oth_width - 1];
1153 // lifting step
1154 const si64* sp = oth;
1155 si64* dp = aug;
1156 if (a == 1)
1157 { // 5/3 update and any case with a == 1
1158 int i = (int)aug_width;
1159 if (ev)
1160 {
1161 for (; i > 0; i -= 2, sp += 2, dp += 2)
1162 {
1163 v128_t s1 = vsx_v128_load((v128_t*)sp);
1164 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
1165 v128_t d = vsx_v128_load((v128_t*)dp);
1166 v128_t t = vsx_i64x2_add(s1, s2);
1167 v128_t v = vsx_i64x2_add(vb, t);
1168 v128_t w = vsx_i64x2_shr(v, e);
1169 d = vsx_i64x2_sub(d, w);
1170 vsx_v128_store((v128_t*)dp, d);
1171 }
1172 }
1173 else
1174 {
1175 for (; i > 0; i -= 2, sp += 2, dp += 2)
1176 {
1177 v128_t s1 = vsx_v128_load((v128_t*)sp);
1178 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
1179 v128_t d = vsx_v128_load((v128_t*)dp);
1180 v128_t t = vsx_i64x2_add(s1, s2);
1181 v128_t v = vsx_i64x2_add(vb, t);
1182 v128_t w = vsx_i64x2_shr(v, e);
1183 d = vsx_i64x2_sub(d, w);
1184 vsx_v128_store((v128_t*)dp, d);
1185 }
1186 }
1187 }
1188 else if (a == -1 && b == 1 && e == 1)
1189 { // 5/3 predict
1190 int i = (int)aug_width;
1191 if (ev)
1192 for (; i > 0; i -= 2, sp += 2, dp += 2)
1193 {
1194 v128_t s1 = vsx_v128_load((v128_t*)sp);
1195 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
1196 v128_t d = vsx_v128_load((v128_t*)dp);
1197 v128_t t = vsx_i64x2_add(s1, s2);
1198 v128_t w = vsx_i64x2_shr(t, e);
1199 d = vsx_i64x2_add(d, w);
1200 vsx_v128_store((v128_t*)dp, d);
1201 }
1202 else
1203 for (; i > 0; i -= 2, sp += 2, dp += 2)
1204 {
1205 v128_t s1 = vsx_v128_load((v128_t*)sp);
1206 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
1207 v128_t d = vsx_v128_load((v128_t*)dp);
1208 v128_t t = vsx_i64x2_add(s1, s2);
1209 v128_t w = vsx_i64x2_shr(t, e);
1210 d = vsx_i64x2_add(d, w);
1211 vsx_v128_store((v128_t*)dp, d);
1212 }
1213 }
1214 else if (a == -1)
1215 { // any case with a == -1, which is not 5/3 predict
1216 int i = (int)aug_width;
1217 if (ev)
1218 for (; i > 0; i -= 2, sp += 2, dp += 2)
1219 {
1220 v128_t s1 = vsx_v128_load((v128_t*)sp);
1221 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
1222 v128_t d = vsx_v128_load((v128_t*)dp);
1223 v128_t t = vsx_i64x2_add(s1, s2);
1224 v128_t v = vsx_i64x2_sub(vb, t);
1225 v128_t w = vsx_i64x2_shr(v, e);
1226 d = vsx_i64x2_sub(d, w);
1227 vsx_v128_store((v128_t*)dp, d);
1228 }
1229 else
1230 for (; i > 0; i -= 2, sp += 2, dp += 2)
1231 {
1232 v128_t s1 = vsx_v128_load((v128_t*)sp);
1233 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
1234 v128_t d = vsx_v128_load((v128_t*)dp);
1235 v128_t t = vsx_i64x2_add(s1, s2);
1236 v128_t v = vsx_i64x2_sub(vb, t);
1237 v128_t w = vsx_i64x2_shr(v, e);
1238 d = vsx_i64x2_sub(d, w);
1239 vsx_v128_store((v128_t*)dp, d);
1240 }
1241 }
1242 else
1243 { // general case
1244 int i = (int)aug_width;
1245 if (ev)
1246 for (; i > 0; i -= 2, sp += 2, dp += 2)
1247 {
1248 v128_t s1 = vsx_v128_load((v128_t*)sp);
1249 v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
1250 v128_t d = vsx_v128_load((v128_t*)dp);
1251 v128_t t = vsx_i64x2_add(s1, s2);
1252 v128_t u = vsx_i64x2_mul(va, t);
1253 v128_t v = vsx_i64x2_add(vb, u);
1254 v128_t w = vsx_i64x2_shr(v, e);
1255 d = vsx_i64x2_sub(d, w);
1256 vsx_v128_store((v128_t*)dp, d);
1257 }
1258 else
1259 for (; i > 0; i -= 2, sp += 2, dp += 2)
1260 {
1261 v128_t s1 = vsx_v128_load((v128_t*)sp);
1262 v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
1263 v128_t d = vsx_v128_load((v128_t*)dp);
1264 v128_t t = vsx_i64x2_add(s1, s2);
1265 v128_t u = vsx_i64x2_mul(va, t);
1266 v128_t v = vsx_i64x2_add(vb, u);
1267 v128_t w = vsx_i64x2_shr(v, e);
1268 d = vsx_i64x2_sub(d, w);
1269 vsx_v128_store((v128_t*)dp, d);
1270 }
1271 }
1272
1273 // swap buffers
1274 si64* t = aug; aug = oth; oth = t;
1275 ev = !ev;
1276 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
1277 }
1278
1279 // combine both lsrc and hsrc into dst
1280 {
1281 void* dp = dst->p;
1282 const void* spl = even ? lsrc->p : hsrc->p;
1283 const void* sph = even ? hsrc->p : lsrc->p;
1284 int w = (int)width;
1285 vsx_interleave64(dp, spl, sph, w);
1286 }
1287 }
1288 else {
1289 if (even)
1290 dst->i64[0] = lsrc->i64[0];
1291 else
1292 dst->i64[0] = hsrc->i64[0] >> 1;
1293 }
1294 }
1295
1297 void vsx_rev_horz_syn(const param_atk* atk, const line_buf* dst,
1298 const line_buf* lsrc, const line_buf* hsrc,
1299 ui32 width, bool even)
1300 {
1301 if (dst->flags & line_buf::LFT_32BIT)
1302 {
1303 assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) &&
1304 (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
1305 vsx_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
1306 }
1307 else
1308 {
1309 assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
1310 (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) &&
1311 (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
1312 vsx_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
1313 }
1314 }
1315
1316 } // !local
1317} // !ojph
float * f32
Definition ojph_mem.h:187
void vsx_rev_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
static void vsx_multiply_const(float *p, float f, int width)
static void vsx_rev_horz_ana64(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void vsx_rev_horz_syn64(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
static void vsx_deinterleave32(float *dpl, float *dph, float *sp, int width)
void vsx_rev_vert_step32(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void vsx_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
static void vsx_rev_horz_ana32(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
static void vsx_interleave32(float *dp, float *spl, float *sph, int width)
void vsx_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void vsx_rev_horz_syn32(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void vsx_rev_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void vsx_rev_vert_step64(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void vsx_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void vsx_rev_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
static void vsx_deinterleave64(void *dpl, void *dph, const void *sp, int width)
void vsx_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
static void vsx_interleave64(void *dp, const void *spl, const void *sph, int width)
int64_t si64
Definition ojph_defs.h:57
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50
static v128_t vsx_i32x4_sub(v128_t a, v128_t b)
static v128_t vsx_f32x4_mul(v128_t a, v128_t b)
static v128_t vsx_f32x4_sub(v128_t a, v128_t b)
static v128_t vsx_f32x4_add(v128_t a, v128_t b)
static v128_t vsx_f32x4_splat(float x)
static v128_t vsx_i32x4_add(v128_t a, v128_t b)
__vector unsigned char v128_t
static void vsx_v128_store(void *p, v128_t a)
static v128_t vsx_i32x4_mul(v128_t a, v128_t b)
#define vsx_i64x2_shuffle(a, b, c0, c1)
static v128_t vsx_i64x2_mul(v128_t a, v128_t b)
#define vsx_i32x4_shuffle(a, b, c0, c1, c2, c3)
static v128_t vsx_i64x2_splat(long long x)
static v128_t vsx_i64x2_shr(v128_t a, int n)
static v128_t vsx_i64x2_add(v128_t a, v128_t b)
static v128_t vsx_i64x2_sub(v128_t a, v128_t b)
static v128_t vsx_i32x4_splat(int x)
static v128_t vsx_v128_load(const void *p)
static v128_t vsx_i32x4_shr(v128_t a, int n)
const lifting_step * get_step(ui32 s) const