OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_simd_vsx.h
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2026, Aous Naman
6// Copyright (c) 2026, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2026, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_simd_vsx.h
34//
35// 128-bit SIMD helpers for POWER VSX, used by the ojph_*_vsx.cpp
36// kernels. Lane numbering and operation semantics follow the same
37// conventions as the other 128-bit kernels in this codebase (lane 0
38// is the lowest memory address). Supported targets are POWER9
39// (ISA 3.0) and newer, little-endian only (ppc64le).
40//***************************************************************************/
41
42#ifndef OJPH_SIMD_VSX_H
43#define OJPH_SIMD_VSX_H
44
45#if !defined(__powerpc64__) && !defined(__PPC64__)
46 #error "this header is for 64-bit POWER targets only"
47#endif
48#if !defined(__LITTLE_ENDIAN__) && \
49 !(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
50 #error "this header assumes a little-endian target (ppc64le)"
51#endif
52
53#include <altivec.h>
54#include <cstring>
55
56#include "ojph_defs.h"
57
58// altivec.h leaks these context-sensitive keywords as macros under GNU C;
59// they break standard headers and the codebase (e.g. std::vector)
60#undef vector
61#undef pixel
62#undef bool
63
64typedef __vector unsigned char v128_t;
65
66typedef __vector signed char vsx_v_i8;
67typedef __vector unsigned char vsx_v_u8;
68typedef __vector signed short vsx_v_i16;
69typedef __vector unsigned short vsx_v_u16;
70typedef __vector signed int vsx_v_i32;
71typedef __vector unsigned int vsx_v_u32;
72typedef __vector signed long long vsx_v_i64;
73typedef __vector unsigned long long vsx_v_u64;
74typedef __vector float vsx_v_f32;
75
76//---------------------------------------------------------------------------
77// load/store (alignment-agnostic; lxv/stxv handle unaligned addresses)
78//---------------------------------------------------------------------------
79static inline v128_t vsx_v128_load(const void *p)
80{ return vec_xl(0, (const unsigned char *)p); }
81
82static inline void vsx_v128_store(void *p, v128_t a)
83{ vec_xst(a, 0, (unsigned char *)p); }
84
// Copies the 32-bit lane i of vector a to address p (4 bytes, via memcpy).
#define vsx_v128_store32_lane(p, a, i)                                  \
  do {                                                                  \
    vsx_v_i32 lanes_ = (vsx_v_i32)(a);                                  \
    int word_ = lanes_[(i)];                                            \
    std::memcpy((p), &word_, 4);                                        \
  } while (0)
88
89//---------------------------------------------------------------------------
90// constants, splats, makes
91//---------------------------------------------------------------------------
92// functions, not macros, so that an argument that is itself a macro
93// expanding to an argument list (e.g. OJPH_REPEAT4) works
94static inline v128_t vsx_i8x16_const(
95 signed char c0, signed char c1, signed char c2, signed char c3,
96 signed char c4, signed char c5, signed char c6, signed char c7,
97 signed char c8, signed char c9, signed char c10, signed char c11,
98 signed char c12, signed char c13, signed char c14, signed char c15)
99{ vsx_v_i8 v = {c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15};
100 return (v128_t)v; }
101static inline v128_t vsx_i16x8_const(short c0, short c1, short c2,
102 short c3, short c4, short c5,
103 short c6, short c7)
104{ vsx_v_i16 v = {c0,c1,c2,c3,c4,c5,c6,c7}; return (v128_t)v; }
105static inline v128_t vsx_u16x8_const(unsigned short c0, unsigned short c1,
106 unsigned short c2, unsigned short c3,
107 unsigned short c4, unsigned short c5,
108 unsigned short c6, unsigned short c7)
109{ vsx_v_u16 v = {c0,c1,c2,c3,c4,c5,c6,c7}; return (v128_t)v; }
110static inline v128_t vsx_i32x4_const(int c0, int c1, int c2, int c3)
111{ vsx_v_i32 v = {c0,c1,c2,c3}; return (v128_t)v; }
112static inline v128_t vsx_u32x4_const(unsigned int c0, unsigned int c1,
113 unsigned int c2, unsigned int c3)
114{ vsx_v_u32 v = {c0,c1,c2,c3}; return (v128_t)v; }
115static inline v128_t vsx_i64x2_const(long long c0, long long c1)
116{ vsx_v_i64 v = {c0,c1}; return (v128_t)v; }
117static inline v128_t vsx_u64x2_const(unsigned long long c0,
118 unsigned long long c1)
119{ vsx_v_u64 v = {c0,c1}; return (v128_t)v; }
120
121static inline v128_t vsx_i8x16_splat(signed char x)
122{ ojph_unused(x); return (v128_t)vec_splats(x); }
123static inline v128_t vsx_i16x8_splat(short x)
124{ ojph_unused(x); return (v128_t)vec_splats(x); }
125static inline v128_t vsx_i32x4_splat(int x)
126{ ojph_unused(x); return (v128_t)vec_splats(x); }
127static inline v128_t vsx_u32x4_splat(unsigned int x)
128{ ojph_unused(x); return (v128_t)vec_splats(x); }
129static inline v128_t vsx_i64x2_splat(long long x)
130{ ojph_unused(x); return (v128_t)vec_splats((signed long long)x); }
131static inline v128_t vsx_f32x4_splat(float x)
132{ ojph_unused(x); return (v128_t)vec_splats(x); }
133
134static inline v128_t vsx_i32x4_make(int a, int b, int c, int d)
135{ return (v128_t)(vsx_v_i32){a, b, c, d}; }
136
137//---------------------------------------------------------------------------
138// lane extraction (subscript is little-endian lane order)
139//---------------------------------------------------------------------------
// Each macro reinterprets vector v with the named lane type and reads
// lane `lane` through the vector subscript operator.
#define vsx_u8x16_extract_lane(v, lane) (((vsx_v_u8)(v))[(lane)])
#define vsx_u16x8_extract_lane(v, lane) (((vsx_v_u16)(v))[(lane)])
#define vsx_i32x4_extract_lane(v, lane) (((vsx_v_i32)(v))[(lane)])
#define vsx_u32x4_extract_lane(v, lane) (((vsx_v_u32)(v))[(lane)])
#define vsx_i64x2_extract_lane(v, lane) (((vsx_v_i64)(v))[(lane)])
145
146//---------------------------------------------------------------------------
147// bitwise
148//---------------------------------------------------------------------------
149static inline v128_t vsx_v128_and(v128_t a, v128_t b)
150{ return vec_and(a, b); }
151static inline v128_t vsx_v128_or(v128_t a, v128_t b)
152{ return vec_or(a, b); }
153static inline v128_t vsx_v128_xor(v128_t a, v128_t b)
154{ return vec_xor(a, b); }
155// a & ~b (same operand order as vec_andc)
157{ return vec_andc(a, b); }
158
159//---------------------------------------------------------------------------
160// integer arithmetic
161//---------------------------------------------------------------------------
163{ return (v128_t)vec_add((vsx_v_i8)a, (vsx_v_i8)b); }
165{ return (v128_t)vec_add((vsx_v_i16)a, (vsx_v_i16)b); }
167{ return (v128_t)vec_add((vsx_v_i32)a, (vsx_v_i32)b); }
169{ return (v128_t)vec_add((vsx_v_i64)a, (vsx_v_i64)b); }
170
172{ return (v128_t)vec_sub((vsx_v_i16)a, (vsx_v_i16)b); }
174{ return (v128_t)vec_sub((vsx_v_i32)a, (vsx_v_i32)b); }
176{ return (v128_t)vec_sub((vsx_v_i64)a, (vsx_v_i64)b); }
177
178// low half of products; vmladduhm / vmuluwm; i64x2 is lowered by the
179// compiler (mulld on ISA 3.0, vmulld on ISA 3.1)
181{ return (v128_t)((vsx_v_i16)a * (vsx_v_i16)b); }
183{ return (v128_t)((vsx_v_i32)a * (vsx_v_i32)b); }
185{ return (v128_t)((vsx_v_i64)a * (vsx_v_i64)b); }
186
188{ return (v128_t)vec_abs((vsx_v_i8)a); }
190{ return (v128_t)vec_min((vsx_v_u8)a, (vsx_v_u8)b); }
192{ return (v128_t)vec_max((vsx_v_i16)a, (vsx_v_i16)b); }
193
194//---------------------------------------------------------------------------
195// shifts (scalar count, modulo lane width)
196//---------------------------------------------------------------------------
197static inline v128_t vsx_i16x8_shl(v128_t a, int n)
198{ return (v128_t)vec_sl((vsx_v_i16)a, vec_splats((unsigned short)n)); }
199static inline v128_t vsx_i32x4_shl(v128_t a, int n)
200{ return (v128_t)vec_sl((vsx_v_i32)a, vec_splats((unsigned int)n)); }
201static inline v128_t vsx_i64x2_shl(v128_t a, int n)
202{ return (v128_t)vec_sl((vsx_v_i64)a,
203 vec_splats((unsigned long long)n)); }
204
205static inline v128_t vsx_i32x4_shr(v128_t a, int n) // arithmetic
206{ return (v128_t)vec_sra((vsx_v_i32)a, vec_splats((unsigned int)n)); }
207static inline v128_t vsx_i64x2_shr(v128_t a, int n) // arithmetic
208{ return (v128_t)vec_sra((vsx_v_i64)a,
209 vec_splats((unsigned long long)n)); }
210
211static inline v128_t vsx_u16x8_shr(v128_t a, int n) // logical
212{ return (v128_t)vec_sr((vsx_v_u16)a, vec_splats((unsigned short)n)); }
213static inline v128_t vsx_u32x4_shr(v128_t a, int n) // logical
214{ return (v128_t)vec_sr((vsx_v_u32)a, vec_splats((unsigned int)n)); }
215static inline v128_t vsx_u64x2_shr(v128_t a, int n) // logical
216{ return (v128_t)vec_sr((vsx_v_u64)a,
217 vec_splats((unsigned long long)n)); }
218
219//---------------------------------------------------------------------------
220// comparisons (true lanes -> all-ones, false lanes -> all-zeros)
221//---------------------------------------------------------------------------
222static inline v128_t vsx_i8x16_eq(v128_t a, v128_t b)
223{ return (v128_t)vec_cmpeq((vsx_v_i8)a, (vsx_v_i8)b); }
224static inline v128_t vsx_i16x8_eq(v128_t a, v128_t b)
225{ return (v128_t)vec_cmpeq((vsx_v_i16)a, (vsx_v_i16)b); }
226static inline v128_t vsx_i32x4_eq(v128_t a, v128_t b)
227{ return (v128_t)vec_cmpeq((vsx_v_i32)a, (vsx_v_i32)b); }
228
229static inline v128_t vsx_i8x16_gt(v128_t a, v128_t b)
230{ return (v128_t)vec_cmpgt((vsx_v_i8)a, (vsx_v_i8)b); }
231static inline v128_t vsx_i32x4_gt(v128_t a, v128_t b)
232{ return (v128_t)vec_cmpgt((vsx_v_i32)a, (vsx_v_i32)b); }
233static inline v128_t vsx_i32x4_lt(v128_t a, v128_t b)
234{ return (v128_t)vec_cmplt((vsx_v_i32)a, (vsx_v_i32)b); }
235static inline v128_t vsx_i64x2_lt(v128_t a, v128_t b)
236{ return (v128_t)vec_cmplt((vsx_v_i64)a, (vsx_v_i64)b); }
237
238static inline v128_t vsx_f32x4_ge(v128_t a, v128_t b)
239{ return (v128_t)vec_cmpge((vsx_v_f32)a, (vsx_v_f32)b); }
240static inline v128_t vsx_f32x4_lt(v128_t a, v128_t b)
241{ return (v128_t)vec_cmplt((vsx_v_f32)a, (vsx_v_f32)b); }
242
243//---------------------------------------------------------------------------
244// float arithmetic and conversions
245//---------------------------------------------------------------------------
247{ return (v128_t)vec_add((vsx_v_f32)a, (vsx_v_f32)b); }
249{ return (v128_t)vec_sub((vsx_v_f32)a, (vsx_v_f32)b); }
251{ return (v128_t)vec_mul((vsx_v_f32)a, (vsx_v_f32)b); }
252
253// xvcvspsxws: truncating, saturating (NaN gives 0x80000000; the
254// callers never pass NaN)
256{ return (v128_t)vec_cts((vsx_v_f32)a, 0); }
258{ return (v128_t)vec_ctf((vsx_v_i32)a, 0); }
259
260//---------------------------------------------------------------------------
261// widening
262//---------------------------------------------------------------------------
264{
265 // vsx_v_i32 v = (vsx_v_i32)a;
266 // return (v128_t)__builtin_convertvector(
267 // __builtin_shufflevector(v, v, 0, 1), vsx_v_i64);
268
269 // Unpacks and sign-extends elements 0 and 1 on Little Endian
270 return (v128_t)vec_unpackl((vsx_v_i32)a);
271}
273{
274 // vsx_v_i32 v = (vsx_v_i32)a;
275 // return (v128_t)__builtin_convertvector(
276 // __builtin_shufflevector(v, v, 2, 3), vsx_v_i64);
277
278 // Unpacks and sign-extends elements 2 and 3 on Little Endian
279 return (v128_t)vec_unpackh((vsx_v_i32)a);
280}
281
282//---------------------------------------------------------------------------
283// shuffles (immediate lane indices; 0..N-1 from a, N..2N-1 from b)
284//---------------------------------------------------------------------------
285// #define vsx_i8x16_shuffle(a, b, c0,c1,c2,c3,c4,c5,c6,c7,
286// c8,c9,c10,c11,c12,c13,c14,c15)
287// ((v128_t)__builtin_shufflevector((vsx_v_u8)(a), (vsx_v_u8)(b),
288// c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15))
289// #define vsx_i16x8_shuffle(a, b, c0,c1,c2,c3,c4,c5,c6,c7)
290// ((v128_t)__builtin_shufflevector((vsx_v_i16)(a), (vsx_v_i16)(b),
291// c0,c1,c2,c3,c4,c5,c6,c7))
292// #define vsx_i32x4_shuffle(a, b, c0,c1,c2,c3)
293// ((v128_t)__builtin_shufflevector((vsx_v_i32)(a), (vsx_v_i32)(b),
294// c0,c1,c2,c3))
295// #define vsx_i64x2_shuffle(a, b, c0,c1)
296// ((v128_t)__builtin_shufflevector((vsx_v_i64)(a), (vsx_v_i64)(b), c0,c1))
297
// Byte shuffle: each immediate index is passed straight through as a
// vec_perm byte selector over the pair (x, y).
#define vsx_i8x16_shuffle(x, y, i0,i1,i2,i3,i4,i5,i6,i7,               \
                          i8,i9,i10,i11,i12,i13,i14,i15)                \
  ((v128_t)vec_perm((vsx_v_u8)(x), (vsx_v_u8)(y), (vsx_v_u8){           \
    (i0),  (i1),  (i2),  (i3),                                          \
    (i4),  (i5),  (i6),  (i7),                                          \
    (i8),  (i9),  (i10), (i11),                                         \
    (i12), (i13), (i14), (i15)                                          \
  }))
305
// 16-bit lane shuffle: lane index k expands to the byte selectors
// 2k and 2k+1 of a vec_perm over the pair (x, y).
#define vsx_i16x8_shuffle(x, y, i0,i1,i2,i3,i4,i5,i6,i7)               \
  ((v128_t)vec_perm((vsx_v_u8)(x), (vsx_v_u8)(y), (vsx_v_u8){           \
    (i0)*2, (i0)*2+1,                                                   \
    (i1)*2, (i1)*2+1,                                                   \
    (i2)*2, (i2)*2+1,                                                   \
    (i3)*2, (i3)*2+1,                                                   \
    (i4)*2, (i4)*2+1,                                                   \
    (i5)*2, (i5)*2+1,                                                   \
    (i6)*2, (i6)*2+1,                                                   \
    (i7)*2, (i7)*2+1                                                    \
  }))
314
// 32-bit lane shuffle: lane index k expands to the byte selectors
// 4k .. 4k+3 of a vec_perm over the pair (x, y).
#define vsx_i32x4_shuffle(x, y, i0,i1,i2,i3)                            \
  ((v128_t)vec_perm((vsx_v_u8)(x), (vsx_v_u8)(y), (vsx_v_u8){           \
    (i0)*4, (i0)*4+1, (i0)*4+2, (i0)*4+3,                               \
    (i1)*4, (i1)*4+1, (i1)*4+2, (i1)*4+3,                               \
    (i2)*4, (i2)*4+1, (i2)*4+2, (i2)*4+3,                               \
    (i3)*4, (i3)*4+1, (i3)*4+2, (i3)*4+3                                \
  }))
323
// 64-bit lane shuffle: lane index k expands to the byte selectors
// 8k .. 8k+7 of a vec_perm over the pair (x, y).
#define vsx_i64x2_shuffle(x, y, i0,i1)                                  \
  ((v128_t)vec_perm((vsx_v_u8)(x), (vsx_v_u8)(y), (vsx_v_u8){           \
    (i0)*8,   (i0)*8+1, (i0)*8+2, (i0)*8+3,                             \
    (i0)*8+4, (i0)*8+5, (i0)*8+6, (i0)*8+7,                             \
    (i1)*8,   (i1)*8+1, (i1)*8+2, (i1)*8+3,                             \
    (i1)*8+4, (i1)*8+5, (i1)*8+6, (i1)*8+7                              \
  }))
332
333//---------------------------------------------------------------------------
334// swizzle: runtime byte-table lookup; lanes with index > 15 give 0
335//---------------------------------------------------------------------------
337{
338 v128_t r = vec_perm(a, a, idx);
339 v128_t oob = (v128_t)vec_cmpgt((vsx_v_u8)idx,
340 vec_splats((unsigned char)15));
341 return vec_andc(r, oob);
342}
343
344//---------------------------------------------------------------------------
345// bitmask: MSB of each byte lane -> bit of result, lane 0 -> bit 0
346// (vbpermq gathers the 16 selected bits into bits 48..63 of the
347// big-endian first doubleword, which is doubleword 1 on ppc64le)
348//---------------------------------------------------------------------------
349static inline int vsx_i8x16_bitmask(v128_t a)
350{
351#if defined(__POWER10_VECTOR__)
352 return (int)vec_extractm(a); // ISA 3.1 native movemask
353#else
354 const vsx_v_u8 perm = { 120, 112, 104, 96, 88, 80, 72, 64,
355 56, 48, 40, 32, 24, 16, 8, 0 };
356 vsx_v_u64 r = (vsx_v_u64)vec_bperm(a, perm);
357 return (int)r[1];
358#endif
359}
360
361#endif // OJPH_SIMD_VSX_H
#define ojph_unused(x)
Definition ojph_defs.h:78
static v128_t vsx_i8x16_splat(signed char x)
__vector unsigned int vsx_v_u32
static v128_t vsx_i32x4_sub(v128_t a, v128_t b)
static v128_t vsx_i32x4_make(int a, int b, int c, int d)
static v128_t vsx_i16x8_sub(v128_t a, v128_t b)
static v128_t vsx_u16x8_shr(v128_t a, int n)
static v128_t vsx_f32x4_mul(v128_t a, v128_t b)
static int vsx_i8x16_bitmask(v128_t a)
__vector signed int vsx_v_i32
static v128_t vsx_i8x16_swizzle(v128_t a, v128_t idx)
static v128_t vsx_f32x4_sub(v128_t a, v128_t b)
static v128_t vsx_i8x16_const(signed char c0, signed char c1, signed char c2, signed char c3, signed char c4, signed char c5, signed char c6, signed char c7, signed char c8, signed char c9, signed char c10, signed char c11, signed char c12, signed char c13, signed char c14, signed char c15)
static v128_t vsx_i64x2_lt(v128_t a, v128_t b)
static v128_t vsx_f32x4_convert_i32x4(v128_t a)
static v128_t vsx_f32x4_add(v128_t a, v128_t b)
__vector float vsx_v_f32
static v128_t vsx_v128_xor(v128_t a, v128_t b)
__vector signed char vsx_v_i8
static v128_t vsx_i16x8_mul(v128_t a, v128_t b)
static v128_t vsx_u32x4_shr(v128_t a, int n)
__vector signed long long vsx_v_i64
static v128_t vsx_i16x8_splat(short x)
static v128_t vsx_u32x4_const(unsigned int c0, unsigned int c1, unsigned int c2, unsigned int c3)
static v128_t vsx_i32x4_const(int c0, int c1, int c2, int c3)
static v128_t vsx_u8x16_min(v128_t a, v128_t b)
static v128_t vsx_f32x4_ge(v128_t a, v128_t b)
static v128_t vsx_i64x2_extend_high_i32x4(v128_t a)
static v128_t vsx_f32x4_splat(float x)
__vector signed short vsx_v_i16
static v128_t vsx_u64x2_const(unsigned long long c0, unsigned long long c1)
static v128_t vsx_i32x4_shl(v128_t a, int n)
static v128_t vsx_i16x8_max(v128_t a, v128_t b)
static v128_t vsx_i64x2_const(long long c0, long long c1)
static v128_t vsx_i8x16_abs(v128_t a)
static v128_t vsx_i32x4_add(v128_t a, v128_t b)
static v128_t vsx_f32x4_lt(v128_t a, v128_t b)
static v128_t vsx_i64x2_extend_low_i32x4(v128_t a)
__vector unsigned char v128_t
__vector unsigned char vsx_v_u8
static v128_t vsx_i8x16_add(v128_t a, v128_t b)
static v128_t vsx_u16x8_const(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7)
static v128_t vsx_i16x8_add(v128_t a, v128_t b)
static v128_t vsx_v128_andnot(v128_t a, v128_t b)
static void vsx_v128_store(void *p, v128_t a)
static v128_t vsx_u64x2_shr(v128_t a, int n)
static v128_t vsx_v128_and(v128_t a, v128_t b)
static v128_t vsx_i64x2_shl(v128_t a, int n)
static v128_t vsx_i32x4_mul(v128_t a, v128_t b)
static v128_t vsx_i16x8_eq(v128_t a, v128_t b)
static v128_t vsx_v128_or(v128_t a, v128_t b)
static v128_t vsx_i32x4_lt(v128_t a, v128_t b)
__vector unsigned short vsx_v_u16
static v128_t vsx_i8x16_eq(v128_t a, v128_t b)
static v128_t vsx_i64x2_mul(v128_t a, v128_t b)
static v128_t vsx_i64x2_splat(long long x)
__vector unsigned long long vsx_v_u64
static v128_t vsx_i64x2_shr(v128_t a, int n)
static v128_t vsx_i64x2_add(v128_t a, v128_t b)
static v128_t vsx_i64x2_sub(v128_t a, v128_t b)
static v128_t vsx_i32x4_splat(int x)
static v128_t vsx_v128_load(const void *p)
static v128_t vsx_i16x8_const(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7)
static v128_t vsx_i32x4_shr(v128_t a, int n)
static v128_t vsx_i16x8_shl(v128_t a, int n)
static v128_t vsx_u32x4_splat(unsigned int x)
static v128_t vsx_i32x4_trunc_sat_f32x4(v128_t a)
static v128_t vsx_i8x16_gt(v128_t a, v128_t b)
static v128_t vsx_i32x4_eq(v128_t a, v128_t b)
static v128_t vsx_i32x4_gt(v128_t a, v128_t b)