OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_simd_vsx.h
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2026, Aous Naman
6// Copyright (c) 2026, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2026, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_simd_vsx.h
34//
35// 128-bit SIMD helpers for POWER VSX, used by the ojph_*_vsx.cpp
36// kernels. Lane numbering and operation semantics follow the same
37// conventions as the other 128-bit kernels in this codebase (lane 0
38// is the lowest memory address). Supported targets are POWER9
39// (ISA 3.0) and newer, little-endian only (ppc64le).
40//***************************************************************************/
41
42#ifndef OJPH_SIMD_VSX_H
43#define OJPH_SIMD_VSX_H
44
45#if !defined(__powerpc64__) && !defined(__PPC64__)
46 #error "this header is for 64-bit POWER targets only"
47#endif
48#if !defined(__LITTLE_ENDIAN__) && \
49 !(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
50 #error "this header assumes a little-endian target (ppc64le)"
51#endif
52
53#include <altivec.h>
54#include <cstring>
55
56// altivec.h leaks these context-sensitive keywords as macros under GNU C;
57// they break standard headers and the codebase (e.g. std::vector)
58#undef vector
59#undef pixel
60#undef bool
61
62typedef __vector unsigned char v128_t;
63
64typedef __vector signed char vsx_v_i8;
65typedef __vector unsigned char vsx_v_u8;
66typedef __vector signed short vsx_v_i16;
67typedef __vector unsigned short vsx_v_u16;
68typedef __vector signed int vsx_v_i32;
69typedef __vector unsigned int vsx_v_u32;
70typedef __vector signed long long vsx_v_i64;
71typedef __vector unsigned long long vsx_v_u64;
72typedef __vector float vsx_v_f32;
73
74//---------------------------------------------------------------------------
75// load/store (alignment-agnostic; lxv/stxv handle unaligned addresses)
76//---------------------------------------------------------------------------
77static inline v128_t vsx_v128_load(const void *p)
78{ return vec_xl(0, (const unsigned char *)p); }
79
80static inline void vsx_v128_store(void *p, v128_t a)
81{ vec_xst(a, 0, (unsigned char *)p); }
82
// Copies 32-bit lane i of a to the address p. The lane is extracted into
// a scalar first and then written with memcpy, so p is treated as a plain
// byte destination.
#define vsx_v128_store32_lane(p, a, i)          \
  do {                                          \
    vsx_v_i32 lanes_ = (vsx_v_i32)(a);          \
    int word_ = lanes_[(i)];                    \
    std::memcpy((p), &word_, 4);                \
  } while (0)
86
87//---------------------------------------------------------------------------
88// constants, splats, makes
89//---------------------------------------------------------------------------
90// functions, not macros, so that an argument that is itself a macro
91// expanding to an argument list (e.g. OJPH_REPEAT4) works
92static inline v128_t vsx_i8x16_const(
93 signed char c0, signed char c1, signed char c2, signed char c3,
94 signed char c4, signed char c5, signed char c6, signed char c7,
95 signed char c8, signed char c9, signed char c10, signed char c11,
96 signed char c12, signed char c13, signed char c14, signed char c15)
97{ vsx_v_i8 v = {c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15};
98 return (v128_t)v; }
99static inline v128_t vsx_i16x8_const(short c0, short c1, short c2,
100 short c3, short c4, short c5,
101 short c6, short c7)
102{ vsx_v_i16 v = {c0,c1,c2,c3,c4,c5,c6,c7}; return (v128_t)v; }
103static inline v128_t vsx_u16x8_const(unsigned short c0, unsigned short c1,
104 unsigned short c2, unsigned short c3,
105 unsigned short c4, unsigned short c5,
106 unsigned short c6, unsigned short c7)
107{ vsx_v_u16 v = {c0,c1,c2,c3,c4,c5,c6,c7}; return (v128_t)v; }
108static inline v128_t vsx_i32x4_const(int c0, int c1, int c2, int c3)
109{ vsx_v_i32 v = {c0,c1,c2,c3}; return (v128_t)v; }
110static inline v128_t vsx_u32x4_const(unsigned int c0, unsigned int c1,
111 unsigned int c2, unsigned int c3)
112{ vsx_v_u32 v = {c0,c1,c2,c3}; return (v128_t)v; }
113static inline v128_t vsx_i64x2_const(long long c0, long long c1)
114{ vsx_v_i64 v = {c0,c1}; return (v128_t)v; }
115static inline v128_t vsx_u64x2_const(unsigned long long c0,
116 unsigned long long c1)
117{ vsx_v_u64 v = {c0,c1}; return (v128_t)v; }
118
119static inline v128_t vsx_i8x16_splat(signed char x)
120{ return (v128_t)vec_splats(x); }
121static inline v128_t vsx_i16x8_splat(short x)
122{ return (v128_t)vec_splats(x); }
123static inline v128_t vsx_i32x4_splat(int x)
124{ return (v128_t)vec_splats(x); }
125static inline v128_t vsx_u32x4_splat(unsigned int x)
126{ return (v128_t)vec_splats(x); }
127static inline v128_t vsx_i64x2_splat(long long x)
128{ return (v128_t)vec_splats((signed long long)x); }
129static inline v128_t vsx_f32x4_splat(float x)
130{ return (v128_t)vec_splats(x); }
131
132static inline v128_t vsx_i32x4_make(int a, int b, int c, int d)
133{ return (v128_t)(vsx_v_i32){a, b, c, d}; }
134
135//---------------------------------------------------------------------------
136// lane extraction (subscript is little-endian lane order)
137//---------------------------------------------------------------------------
// Each macro reinterprets a as the named lane type and subscripts lane i;
// the result has the scalar type of that lane.
#define vsx_u8x16_extract_lane(a, i) \
  (((vsx_v_u8)(a))[(i)])
#define vsx_u16x8_extract_lane(a, i) \
  (((vsx_v_u16)(a))[(i)])
#define vsx_i32x4_extract_lane(a, i) \
  (((vsx_v_i32)(a))[(i)])
#define vsx_u32x4_extract_lane(a, i) \
  (((vsx_v_u32)(a))[(i)])
#define vsx_i64x2_extract_lane(a, i) \
  (((vsx_v_i64)(a))[(i)])
143
144//---------------------------------------------------------------------------
145// bitwise
146//---------------------------------------------------------------------------
147static inline v128_t vsx_v128_and(v128_t a, v128_t b)
148{ return vec_and(a, b); }
149static inline v128_t vsx_v128_or(v128_t a, v128_t b)
150{ return vec_or(a, b); }
151static inline v128_t vsx_v128_xor(v128_t a, v128_t b)
152{ return vec_xor(a, b); }
153// a & ~b (same operand order as vec_andc)
155{ return vec_andc(a, b); }
156
157//---------------------------------------------------------------------------
158// integer arithmetic
159//---------------------------------------------------------------------------
161{ return (v128_t)vec_add((vsx_v_i8)a, (vsx_v_i8)b); }
163{ return (v128_t)vec_add((vsx_v_i16)a, (vsx_v_i16)b); }
165{ return (v128_t)vec_add((vsx_v_i32)a, (vsx_v_i32)b); }
167{ return (v128_t)vec_add((vsx_v_i64)a, (vsx_v_i64)b); }
168
170{ return (v128_t)vec_sub((vsx_v_i16)a, (vsx_v_i16)b); }
172{ return (v128_t)vec_sub((vsx_v_i32)a, (vsx_v_i32)b); }
174{ return (v128_t)vec_sub((vsx_v_i64)a, (vsx_v_i64)b); }
175
176// low half of products; vmladduhm / vmuluwm; i64x2 is lowered by the
177// compiler (mulld on ISA 3.0, vmulld on ISA 3.1)
179{ return (v128_t)((vsx_v_i16)a * (vsx_v_i16)b); }
181{ return (v128_t)((vsx_v_i32)a * (vsx_v_i32)b); }
183{ return (v128_t)((vsx_v_i64)a * (vsx_v_i64)b); }
184
186{ return (v128_t)vec_abs((vsx_v_i8)a); }
188{ return (v128_t)vec_min((vsx_v_u8)a, (vsx_v_u8)b); }
190{ return (v128_t)vec_max((vsx_v_i16)a, (vsx_v_i16)b); }
191
192//---------------------------------------------------------------------------
193// shifts (scalar count, modulo lane width)
194//---------------------------------------------------------------------------
195static inline v128_t vsx_i16x8_shl(v128_t a, int n)
196{ return (v128_t)vec_sl((vsx_v_i16)a, vec_splats((unsigned short)n)); }
197static inline v128_t vsx_i32x4_shl(v128_t a, int n)
198{ return (v128_t)vec_sl((vsx_v_i32)a, vec_splats((unsigned int)n)); }
199static inline v128_t vsx_i64x2_shl(v128_t a, int n)
200{ return (v128_t)vec_sl((vsx_v_i64)a,
201 vec_splats((unsigned long long)n)); }
202
203static inline v128_t vsx_i32x4_shr(v128_t a, int n) // arithmetic
204{ return (v128_t)vec_sra((vsx_v_i32)a, vec_splats((unsigned int)n)); }
205static inline v128_t vsx_i64x2_shr(v128_t a, int n) // arithmetic
206{ return (v128_t)vec_sra((vsx_v_i64)a,
207 vec_splats((unsigned long long)n)); }
208
209static inline v128_t vsx_u16x8_shr(v128_t a, int n) // logical
210{ return (v128_t)vec_sr((vsx_v_u16)a, vec_splats((unsigned short)n)); }
211static inline v128_t vsx_u32x4_shr(v128_t a, int n) // logical
212{ return (v128_t)vec_sr((vsx_v_u32)a, vec_splats((unsigned int)n)); }
213static inline v128_t vsx_u64x2_shr(v128_t a, int n) // logical
214{ return (v128_t)vec_sr((vsx_v_u64)a,
215 vec_splats((unsigned long long)n)); }
216
217//---------------------------------------------------------------------------
218// comparisons (true lanes -> all-ones, false lanes -> all-zeros)
219//---------------------------------------------------------------------------
220static inline v128_t vsx_i8x16_eq(v128_t a, v128_t b)
221{ return (v128_t)vec_cmpeq((vsx_v_i8)a, (vsx_v_i8)b); }
222static inline v128_t vsx_i16x8_eq(v128_t a, v128_t b)
223{ return (v128_t)vec_cmpeq((vsx_v_i16)a, (vsx_v_i16)b); }
224static inline v128_t vsx_i32x4_eq(v128_t a, v128_t b)
225{ return (v128_t)vec_cmpeq((vsx_v_i32)a, (vsx_v_i32)b); }
226
227static inline v128_t vsx_i8x16_gt(v128_t a, v128_t b)
228{ return (v128_t)vec_cmpgt((vsx_v_i8)a, (vsx_v_i8)b); }
229static inline v128_t vsx_i32x4_gt(v128_t a, v128_t b)
230{ return (v128_t)vec_cmpgt((vsx_v_i32)a, (vsx_v_i32)b); }
231static inline v128_t vsx_i32x4_lt(v128_t a, v128_t b)
232{ return (v128_t)vec_cmplt((vsx_v_i32)a, (vsx_v_i32)b); }
233static inline v128_t vsx_i64x2_lt(v128_t a, v128_t b)
234{ return (v128_t)vec_cmplt((vsx_v_i64)a, (vsx_v_i64)b); }
235
236static inline v128_t vsx_f32x4_ge(v128_t a, v128_t b)
237{ return (v128_t)vec_cmpge((vsx_v_f32)a, (vsx_v_f32)b); }
238static inline v128_t vsx_f32x4_lt(v128_t a, v128_t b)
239{ return (v128_t)vec_cmplt((vsx_v_f32)a, (vsx_v_f32)b); }
240
241//---------------------------------------------------------------------------
242// float arithmetic and conversions
243//---------------------------------------------------------------------------
245{ return (v128_t)vec_add((vsx_v_f32)a, (vsx_v_f32)b); }
247{ return (v128_t)vec_sub((vsx_v_f32)a, (vsx_v_f32)b); }
249{ return (v128_t)vec_mul((vsx_v_f32)a, (vsx_v_f32)b); }
250
251// xvcvspsxws: truncating, saturating (NaN gives 0x80000000; the
252// callers never pass NaN)
254{ return (v128_t)vec_cts((vsx_v_f32)a, 0); }
256{ return (v128_t)vec_ctf((vsx_v_i32)a, 0); }
257
258//---------------------------------------------------------------------------
259// widening
260//---------------------------------------------------------------------------
262{
263 vsx_v_i32 v = (vsx_v_i32)a;
264 return (v128_t)__builtin_convertvector(
265 __builtin_shufflevector(v, v, 0, 1), vsx_v_i64);
266}
268{
269 vsx_v_i32 v = (vsx_v_i32)a;
270 return (v128_t)__builtin_convertvector(
271 __builtin_shufflevector(v, v, 2, 3), vsx_v_i64);
272}
273
274//---------------------------------------------------------------------------
275// shuffles (immediate lane indices; 0..N-1 from a, N..2N-1 from b)
276//---------------------------------------------------------------------------
// Compile-time shuffles built on __builtin_shufflevector. a and b are
// reinterpreted with the lane type named by the macro, and the c*
// arguments are the constant lane indices handed to the builtin.
#define vsx_i8x16_shuffle(a, b, c0,c1,c2,c3,c4,c5,c6,c7,               \
                          c8,c9,c10,c11,c12,c13,c14,c15)               \
  ((v128_t)__builtin_shufflevector(                                    \
    (vsx_v_u8)(a), (vsx_v_u8)(b),                                      \
    c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15))
#define vsx_i16x8_shuffle(a, b, c0,c1,c2,c3,c4,c5,c6,c7)               \
  ((v128_t)__builtin_shufflevector(                                    \
    (vsx_v_i16)(a), (vsx_v_i16)(b),                                    \
    c0,c1,c2,c3,c4,c5,c6,c7))
#define vsx_i32x4_shuffle(a, b, c0,c1,c2,c3)                           \
  ((v128_t)__builtin_shufflevector(                                    \
    (vsx_v_i32)(a), (vsx_v_i32)(b),                                    \
    c0,c1,c2,c3))
#define vsx_i64x2_shuffle(a, b, c0,c1)                                 \
  ((v128_t)__builtin_shufflevector(                                    \
    (vsx_v_i64)(a), (vsx_v_i64)(b),                                    \
    c0,c1))
289
290//---------------------------------------------------------------------------
291// swizzle: runtime byte-table lookup; lanes with index > 15 give 0
292//---------------------------------------------------------------------------
294{
295 v128_t r = vec_perm(a, a, idx);
296 v128_t oob = (v128_t)vec_cmpgt((vsx_v_u8)idx,
297 vec_splats((unsigned char)15));
298 return vec_andc(r, oob);
299}
300
301//---------------------------------------------------------------------------
302// bitmask: MSB of each byte lane -> bit of result, lane 0 -> bit 0
303// (vbpermq gathers the 16 selected bits into bits 48..63 of the
304// big-endian first doubleword, which is doubleword 1 on ppc64le)
305//---------------------------------------------------------------------------
306static inline int vsx_i8x16_bitmask(v128_t a)
307{
308#if defined(__POWER10_VECTOR__)
309 return (int)vec_extractm(a); // ISA 3.1 native movemask
310#else
311 const vsx_v_u8 perm = { 120, 112, 104, 96, 88, 80, 72, 64,
312 56, 48, 40, 32, 24, 16, 8, 0 };
313 vsx_v_u64 r = (vsx_v_u64)vec_bperm(a, perm);
314 return (int)r[1];
315#endif
316}
317
318#endif // OJPH_SIMD_VSX_H
static v128_t vsx_i8x16_splat(signed char x)
__vector unsigned int vsx_v_u32
static v128_t vsx_i32x4_sub(v128_t a, v128_t b)
static v128_t vsx_i32x4_make(int a, int b, int c, int d)
static v128_t vsx_i16x8_sub(v128_t a, v128_t b)
static v128_t vsx_u16x8_shr(v128_t a, int n)
static v128_t vsx_f32x4_mul(v128_t a, v128_t b)
static int vsx_i8x16_bitmask(v128_t a)
__vector signed int vsx_v_i32
static v128_t vsx_i8x16_swizzle(v128_t a, v128_t idx)
static v128_t vsx_f32x4_sub(v128_t a, v128_t b)
static v128_t vsx_i8x16_const(signed char c0, signed char c1, signed char c2, signed char c3, signed char c4, signed char c5, signed char c6, signed char c7, signed char c8, signed char c9, signed char c10, signed char c11, signed char c12, signed char c13, signed char c14, signed char c15)
static v128_t vsx_i64x2_lt(v128_t a, v128_t b)
static v128_t vsx_f32x4_convert_i32x4(v128_t a)
static v128_t vsx_f32x4_add(v128_t a, v128_t b)
__vector float vsx_v_f32
static v128_t vsx_v128_xor(v128_t a, v128_t b)
__vector signed char vsx_v_i8
static v128_t vsx_i16x8_mul(v128_t a, v128_t b)
static v128_t vsx_u32x4_shr(v128_t a, int n)
__vector signed long long vsx_v_i64
static v128_t vsx_i16x8_splat(short x)
static v128_t vsx_u32x4_const(unsigned int c0, unsigned int c1, unsigned int c2, unsigned int c3)
static v128_t vsx_i32x4_const(int c0, int c1, int c2, int c3)
static v128_t vsx_u8x16_min(v128_t a, v128_t b)
static v128_t vsx_f32x4_ge(v128_t a, v128_t b)
static v128_t vsx_i64x2_extend_high_i32x4(v128_t a)
static v128_t vsx_f32x4_splat(float x)
__vector signed short vsx_v_i16
static v128_t vsx_u64x2_const(unsigned long long c0, unsigned long long c1)
static v128_t vsx_i32x4_shl(v128_t a, int n)
static v128_t vsx_i16x8_max(v128_t a, v128_t b)
static v128_t vsx_i64x2_const(long long c0, long long c1)
static v128_t vsx_i8x16_abs(v128_t a)
static v128_t vsx_i32x4_add(v128_t a, v128_t b)
static v128_t vsx_f32x4_lt(v128_t a, v128_t b)
static v128_t vsx_i64x2_extend_low_i32x4(v128_t a)
__vector unsigned char v128_t
__vector unsigned char vsx_v_u8
static v128_t vsx_i8x16_add(v128_t a, v128_t b)
static v128_t vsx_u16x8_const(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7)
static v128_t vsx_i16x8_add(v128_t a, v128_t b)
static v128_t vsx_v128_andnot(v128_t a, v128_t b)
static void vsx_v128_store(void *p, v128_t a)
static v128_t vsx_u64x2_shr(v128_t a, int n)
static v128_t vsx_v128_and(v128_t a, v128_t b)
static v128_t vsx_i64x2_shl(v128_t a, int n)
static v128_t vsx_i32x4_mul(v128_t a, v128_t b)
static v128_t vsx_i16x8_eq(v128_t a, v128_t b)
static v128_t vsx_v128_or(v128_t a, v128_t b)
static v128_t vsx_i32x4_lt(v128_t a, v128_t b)
__vector unsigned short vsx_v_u16
static v128_t vsx_i8x16_eq(v128_t a, v128_t b)
static v128_t vsx_i64x2_mul(v128_t a, v128_t b)
static v128_t vsx_i64x2_splat(long long x)
__vector unsigned long long vsx_v_u64
static v128_t vsx_i64x2_shr(v128_t a, int n)
static v128_t vsx_i64x2_add(v128_t a, v128_t b)
static v128_t vsx_i64x2_sub(v128_t a, v128_t b)
static v128_t vsx_i32x4_splat(int x)
static v128_t vsx_v128_load(const void *p)
static v128_t vsx_i16x8_const(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7)
static v128_t vsx_i32x4_shr(v128_t a, int n)
static v128_t vsx_i16x8_shl(v128_t a, int n)
static v128_t vsx_u32x4_splat(unsigned int x)
static v128_t vsx_i32x4_trunc_sat_f32x4(v128_t a)
static v128_t vsx_i8x16_gt(v128_t a, v128_t b)
static v128_t vsx_i32x4_eq(v128_t a, v128_t b)
static v128_t vsx_i32x4_gt(v128_t a, v128_t b)