40#include "ojph_simd_vsx.h"
59 __asm__(
"xvrspi %x0,%x1" :
"=wa"(w) :
"wa"((
vsx_v_f32)a));
60 return (
v128_t)vec_cts(w, 0);
65 const ui32 src_line_offset,
67 const ui32 dst_line_offset,
74 const si32 *sp = src_line->
i32 + src_line_offset;
75 si32 *dp = dst_line->
i32 + dst_line_offset;
77 for (
int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
86 const si32 *sp = src_line->
i32 + src_line_offset;
87 si64 *dp = dst_line->
i64 + dst_line_offset;
89 for (
int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
108 const si64 *sp = src_line->
i64 + src_line_offset;
109 si32 *dp = dst_line->
i32 + dst_line_offset;
111 for (
int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
126 const ui32 src_line_offset,
128 const ui32 dst_line_offset,
135 const si32 *sp = src_line->
i32 + src_line_offset;
136 si32 *dp = dst_line->
i32 + dst_line_offset;
139 for (
int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
152 const si32 *sp = src_line->
i32 + src_line_offset;
153 si64 *dp = dst_line->
i64 + dst_line_offset;
156 for (
int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
185 const si64 *sp = src_line->
i64 + src_line_offset;
186 si32 *dp = dst_line->
i32 + dst_line_offset;
189 for (
int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
193 v128_t s, t0, t1, p, n, m, tm;
220 for (
ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
235 for (
ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
250 for (
int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
266 for (
int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
292 template <
bool NLT_TYPE3>
296 ui32 bit_depth,
bool is_signed,
ui32 width)
303 assert(bit_depth <= 32);
304 const float* sp = src_line->
f32;
305 si32* dp = dst_line->
i32 + dst_line_offset;
312 si32 neg_limit = (
si32)INT_MIN >> (32 - bit_depth);
323 for (
int i = (
int)width; i > 0; i -= 4, sp += 4, dp += 4) {
343 for (
int i = (
int)width; i > 0; i -= 4, sp += 4, dp += 4) {
358 ui32 bit_depth,
bool is_signed,
ui32 width)
361 dst_line_offset, bit_depth, is_signed, width);
367 ui32 bit_depth,
bool is_signed,
ui32 width)
370 dst_line_offset, bit_depth, is_signed, width);
374 template <
bool NLT_TYPE3>
378 ui32 bit_depth,
bool is_signed,
ui32 width)
385 assert(bit_depth <= 32);
388 const si32* sp = src_line->
i32 + src_line_offset;
389 float* dp = dst_line->
f32;
394 for (
int i = (
int)width; i > 0; i -= 4, sp += 4, dp += 4) {
412 for (
int i = (
int)width; i > 0; i -= 4, sp += 4, dp += 4) {
425 ui32 bit_depth,
bool is_signed,
ui32 width)
428 dst_line, bit_depth, is_signed, width);
434 ui32 bit_depth,
bool is_signed,
ui32 width)
437 dst_line, bit_depth, is_signed, width);
465 for (
int i = (repeat + 3) >> 2; i > 0; --i)
478 rp += 4; gp += 4; bp += 4;
479 yp += 4; cbp += 4; crp += 4;
492 for (
int i = (repeat + 3) >> 2; i > 0; --i)
510 yp += 2; cbp += 2; crp += 2;
524 rp += 4; gp += 4; bp += 4;
525 yp += 2; cbp += 2; crp += 2;
554 for (
int i = (repeat + 3) >> 2; i > 0; --i)
568 yp += 4; cbp += 4; crp += 4;
569 rp += 4; gp += 4; bp += 4;
582 for (
int i = (repeat + 3) >> 2; i > 0; --i)
584 v128_t my, mcb, mcr, tr0, tg0, tb0, tr1, tg1, tb1;
594 yp += 2; cbp += 2; crp += 2;
613 yp += 2; cbp += 2; crp += 2;
614 rp += 4; gp += 4; bp += 4;
621 float *y,
float *cb,
float *cr,
ui32 repeat)
628 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i)
639 r += 4; g += 4; b += 4;
640 y += 4; cb += 4; cr += 4;
646 float *r,
float *g,
float *b,
ui32 repeat)
652 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i)
662 y += 4; cb += 4; cr += 4;
663 r += 4; g += 4; b += 4;
static v128_t ojph_vsx_i32x4_min_lt(v128_t a, v128_t b, v128_t x, v128_t y)
void vsx_irv_convert_to_float_nlt_type3(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width)
static void local_vsx_irv_convert_to_integer(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width)
void vsx_ict_backward(const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat)
void vsx_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width)
void vsx_rev_convert_nlt_type3(const line_buf *src_line, const ui32 src_line_offset, line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width)
void vsx_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat)
void vsx_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width)
void vsx_rct_backward(const line_buf *y, const line_buf *cb, const line_buf *cr, line_buf *r, line_buf *g, line_buf *b, ui32 repeat)
static v128_t ojph_convert_float_to_i32(v128_t a)
void vsx_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width)
void vsx_rev_convert(const line_buf *src_line, const ui32 src_line_offset, line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width)
static void local_vsx_irv_convert_to_float(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width)
void vsx_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, ui32 width)
void vsx_rct_forward(const line_buf *r, const line_buf *g, const line_buf *b, line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat)
void vsx_irv_convert_to_float(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width)
void vsx_irv_convert_to_integer(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width)
void vsx_irv_convert_to_integer_nlt_type3(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width)
static v128_t ojph_vsx_i32x4_max_ge(v128_t a, v128_t b, v128_t x, v128_t y)
__vector unsigned int vsx_v_u32
static v128_t vsx_i32x4_sub(v128_t a, v128_t b)
static v128_t vsx_f32x4_mul(v128_t a, v128_t b)
static v128_t vsx_f32x4_sub(v128_t a, v128_t b)
static v128_t vsx_i64x2_lt(v128_t a, v128_t b)
static v128_t vsx_f32x4_convert_i32x4(v128_t a)
static v128_t vsx_f32x4_add(v128_t a, v128_t b)
static v128_t vsx_f32x4_ge(v128_t a, v128_t b)
static v128_t vsx_i64x2_extend_high_i32x4(v128_t a)
static v128_t vsx_f32x4_splat(float x)
static v128_t vsx_i32x4_shl(v128_t a, int n)
static v128_t vsx_i32x4_add(v128_t a, v128_t b)
static v128_t vsx_f32x4_lt(v128_t a, v128_t b)
static v128_t vsx_i64x2_extend_low_i32x4(v128_t a)
__vector unsigned char v128_t
static v128_t vsx_v128_andnot(v128_t a, v128_t b)
static void vsx_v128_store(void *p, v128_t a)
static v128_t vsx_v128_and(v128_t a, v128_t b)
static v128_t vsx_i64x2_shl(v128_t a, int n)
static v128_t vsx_v128_or(v128_t a, v128_t b)
static v128_t vsx_i32x4_lt(v128_t a, v128_t b)
#define vsx_i32x4_shuffle(a, b, c0, c1, c2, c3)
static v128_t vsx_i64x2_splat(long long x)
static v128_t vsx_i64x2_shr(v128_t a, int n)
static v128_t vsx_i64x2_add(v128_t a, v128_t b)
static v128_t vsx_i64x2_sub(v128_t a, v128_t b)
static v128_t vsx_i32x4_splat(int x)
static v128_t vsx_v128_load(const void *p)
static v128_t vsx_i32x4_shr(v128_t a, int n)
static v128_t vsx_i32x4_gt(v128_t a, v128_t b)
static const float GAMMA_CR2R
static const float BETA_CbF
static const float GAMMA_CB2B
static const float ALPHA_RF
static const float GAMMA_CB2G
static const float GAMMA_CR2G
static const float ALPHA_BF
static const float BETA_CrF
static const float ALPHA_GF