// NOTE(review): the #else/#endif lines that belong to the conditionals below
// are not visible in this listing.
// Guard on Apple's clang build-version macro.
42#if defined(__apple_build_version__)
// Guard on the 32-bit / 64-bit x86 architecture macros.
47#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
// Branch hints: with OJPH_COMPILER_MSVC defined, unlikely(x) is a plain
// pass-through of x.
61#ifdef OJPH_COMPILER_MSVC
63 #define unlikely(x) (x)
// Presumably the non-MSVC branch: likely/unlikely wrap __builtin_expect with
// expected values 1 and 0 respectively.
65 #define likely(x) __builtin_expect((x), 1)
66 #define unlikely(x) __builtin_expect((x), 0)
// Pair lookup tables with 33 * 33 entries. uvlc_init_pair_tables() fills them,
// indexing as uq0 * 33 + uq1 and packing each entry as (codeword << 5) | length.
84 static ui32 uvlc_tbl_pair1[33 * 33];
85 static ui32 uvlc_tbl_pair2[33 * 33];
// Per-value building blocks (33 entries each) used to assemble the pair
// tables: a "pre" part and a "suf" part (presumably prefix and suffix), each
// stored as bits plus a bit length.
86 static ui32 ulvc_cwd_pre[33];
87 static int ulvc_cwd_pre_len[33];
88 static ui32 ulvc_cwd_suf[33];
89 static int ulvc_cwd_suf_len[33];
// Builds two 2048-entry lookup tables from the source tables tbl0 and tbl1.
// NOTE(review): the enclosing function header, the table initialisers and
// several statements are not visible in this listing.
//
// One row of a source table: context c_q, pattern rho, the u_off flag, the
// e_k / e_1 patterns matched against emb below, and the codeword with its
// length.
94 struct vlc_src_table {
int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };
// First source table and its number of entries.
95 vlc_src_table tbl0[] = {
98 size_t tbl0_size =
sizeof(tbl0) /
sizeof(vlc_src_table);
// Indexed by a 4-bit pattern and read below as pattern_popcnt[e_k] to obtain a
// "ones count"; presumably holds the number of set bits of each index (the
// loop body that fills it is not visible).
100 si32 pattern_popcnt[16];
101 for (
ui32 i = 0; i < 16; ++i)
104 vlc_src_table* src_tbl = tbl0;
106 size_t tbl_size = tbl0_size;
// Visit every 11-bit index i = (c_q << 8) | (rho << 4) | emb.
107 for (
int i = 0; i < 2048; ++i)
109 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
// Special-cases indices where emb has a bit that rho lacks, or where both rho
// and c_q are zero (the handling itself is not visible).
110 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
114 vlc_src_table *best_entry = NULL;
// Search among u_off == 1 rows with matching c_q and rho whose e_k-masked emb
// equals e_1; keep the row whose e_k has the most set bits. The >= comparison
// means a later row wins a tie.
118 for (
size_t j = 0; j < tbl_size; ++j)
120 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
121 if (src_tbl[j].u_off == 1)
122 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
126 int ones_count = pattern_popcnt[src_tbl[j].e_k];
127 if (ones_count >= best_e_k)
129 best_entry = src_tbl + j;
130 best_e_k = ones_count;
// Alternative search among u_off == 0 rows: match on c_q and rho only.
// NOTE(review): the condition choosing between the two searches is not visible.
137 for (
size_t j = 0; j < tbl_size; ++j)
139 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
140 if (src_tbl[j].u_off == 0)
142 best_entry = src_tbl + j;
// Pack the chosen row: codeword from bit 8 up, codeword length from bit 4 up
// (the rest of the expression is not visible).
148 tgt_tbl[i] = (
ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
// Second source table; the same procedure is repeated with it.
153 vlc_src_table tbl1[] = {
156 size_t tbl1_size =
sizeof(tbl1) /
sizeof(vlc_src_table);
160 tbl_size = tbl1_size;
161 for (
int i = 0; i < 2048; ++i)
163 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
164 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
168 vlc_src_table *best_entry = NULL;
// u_off == 1 search, same selection rule as for the first table.
172 for (
size_t j = 0; j < tbl_size; ++j)
174 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
175 if (src_tbl[j].u_off == 1)
176 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
180 int ones_count = pattern_popcnt[src_tbl[j].e_k];
181 if (ones_count >= best_e_k)
183 best_entry = src_tbl + j;
184 best_e_k = ones_count;
// u_off == 0 search.
191 for (
size_t j = 0; j < tbl_size; ++j)
193 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
194 if (src_tbl[j].u_off == 0)
196 best_entry = src_tbl + j;
202 tgt_tbl[i] = (
ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
// Fills the per-value "pre"/"suf" codeword tables.
// NOTE(review): the enclosing function header is not visible in this listing.
// Entries 0..4 are written explicitly.
215 ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2;
216 ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4;
217 ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1;
218 ulvc_cwd_pre_len[2] = 2;
219 ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3;
220 ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0;
221 ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1;
222 ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0;
223 ulvc_cwd_suf_len[2] = 0;
224 ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1;
// Entries 5..32 share a 3-bit "pre" length and carry i - 5 in a 5-bit "suf"
// (the assignment of ulvc_cwd_pre[i] is not visible).
225 for (
int i = 5; i < 33; ++i)
228 ulvc_cwd_pre_len[i] = 3;
229 ulvc_cwd_suf[i] = (
ui32)(i-5);
230 ulvc_cwd_suf_len[i] = 5;
// Precomputes the combined codeword for every (uq0, uq1) pair in [0, 33) and
// stores it in uvlc_tbl_pair1 / uvlc_tbl_pair2 as (cwd << 5) | len.
// Pieces are OR-ed into cwd at bit offset len, which grows as each is appended.
// NOTE(review): the declarations/resets of cwd and len and some braces/else
// lines are not visible in this listing.
236 static void uvlc_init_pair_tables()
238 for (
int uq0 = 0; uq0 < 33; ++uq0) {
239 for (
int uq1 = 0; uq1 < 33; ++uq1) {
// Both values above 2: each is looked up at index (value - 2); order is
// pre(uq0), pre(uq1), suf(uq0), suf(uq1).
243 if (uq0 > 2 && uq1 > 2) {
244 cwd |= ulvc_cwd_pre[uq0 - 2];
245 len += ulvc_cwd_pre_len[uq0 - 2];
246 cwd |= ulvc_cwd_pre[uq1 - 2] << len;
247 len += ulvc_cwd_pre_len[uq1 - 2];
248 cwd |= ulvc_cwd_suf[uq0 - 2] << len;
249 len += ulvc_cwd_suf_len[uq0 - 2];
250 cwd |= ulvc_cwd_suf[uq1 - 2] << len;
251 len += ulvc_cwd_suf_len[uq1 - 2];
252 }
else if (uq0 > 2 && uq1 > 0) {
// Only uq0 above 2 and uq1 non-zero: pre(uq0), then the value uq1 - 1 placed
// directly at the current offset (the matching len update is not visible),
// then suf(uq0).
253 cwd |= ulvc_cwd_pre[uq0];
254 len += ulvc_cwd_pre_len[uq0];
255 cwd |= (
ui32)(uq1 - 1) << len;
257 cwd |= ulvc_cwd_suf[uq0] << len;
258 len += ulvc_cwd_suf_len[uq0];
// Remaining case (presumably the final else): plain pre/pre/suf/suf with
// unmodified indices.
260 cwd |= ulvc_cwd_pre[uq0];
261 len += ulvc_cwd_pre_len[uq0];
262 cwd |= ulvc_cwd_pre[uq1] << len;
263 len += ulvc_cwd_pre_len[uq1];
264 cwd |= ulvc_cwd_suf[uq0] << len;
265 len += ulvc_cwd_suf_len[uq0];
266 cwd |= ulvc_cwd_suf[uq1] << len;
267 len += ulvc_cwd_suf_len[uq1];
269 uvlc_tbl_pair1[uq0 * 33 + uq1] = (cwd << 5) | (
ui32)len;
// Second table: in the visible lines it uses the plain pre/pre/suf/suf layout
// with unmodified indices.
272 cwd |= ulvc_cwd_pre[uq0];
273 len += ulvc_cwd_pre_len[uq0];
274 cwd |= ulvc_cwd_pre[uq1] << len;
275 len += ulvc_cwd_pre_len[uq1];
276 cwd |= ulvc_cwd_suf[uq0] << len;
277 len += ulvc_cwd_suf_len[uq0];
278 cwd |= ulvc_cwd_suf[uq1] << len;
279 len += ulvc_cwd_suf_len[uq1];
280 uvlc_tbl_pair2[uq0 * 33 + uq1] = (cwd << 5) | (
ui32)len;
// One-time table initialisation guarded by std::call_once.
// NOTE(review): the enclosing function header and part of the lambda body
// (including where tables_initialized is assigned) are not visible.
287 static bool tables_initialized =
false;
// Flag that makes the lambda below run at most once.
288 static std::once_flag tables_initialized_flag;
289 std::call_once(tables_initialized_flag, []() {
294 uvlc_init_pair_tables();
// Returns the value of the static flag.
296 return tables_initialized;
// MEL state set-up (fragment; presumably the encoder's mel_init, the header
// and other field assignments are not visible).
// Records the buffer capacity and starts with a full byte (8 bits) available.
322 melp->buf_size = buffer_size;
323 melp->remaining_bits = 8;
// Exponent table indexed by the MEL state k (0..12); the encoder derives its
// run threshold as 1 << mel_exp[k].
330 static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5};
// Appends num_bits bits to the MEL bit accumulator and writes out a byte once
// one is complete (fragment; presumably mel_emit_bits, header not visible).
// New bits enter at the low end of tmp.
336 melp->tmp = (melp->tmp << num_bits) | (
int)bits;
337 melp->remaining_bits -= num_bits;
// remaining_bits <= 0 means at least one whole byte has been gathered.
338 if (melp->remaining_bits <= 0) {
// excess = number of bits that belong to the next byte.
339 int excess = -melp->remaining_bits;
340 ui8 byte = (
ui8)(melp->tmp >> excess);
341 melp->buf[melp->pos++] = byte;
// Keep only the excess bits in the accumulator.
342 melp->tmp &= (1 << excess) - 1;
// The next byte has room for 8 bits, or only 7 when the byte just written
// was 0xFF.
343 melp->remaining_bits += 8 - (
byte == 0xFF);
// MEL run-length step (fragment; presumably part of mel_encode, header and
// some statements not visible).
// Run reached the threshold: emit a single 1 bit, raise k (capped at 12) and
// recompute the threshold.
353 if (melp->run >= melp->threshold) {
354 mel_emit_bits(melp, 1, 1);
356 melp->k =
ojph_min(12, melp->k + 1);
357 melp->threshold = 1 << mel_exp[melp->k];
// Other path: emit t + 1 bits holding the low t bits of the run; the value is
// below 1 << t, so the leading bit of that field is 0.
360 int t = mel_exp[melp->k];
361 mel_emit_bits(melp, (
ui32)melp->run & ((1u << t) - 1), t + 1);
// Threshold recomputed from k (the update of k on this path is not visible).
364 melp->threshold = 1 << mel_exp[melp->k];
// VLC state set-up (fragment; header and other assignments not visible).
// buf points at the LAST byte of the supplied area; writers elsewhere store
// at *(buf - pos), so the VLC stream grows towards lower addresses.
418 vlcp->buf = data + buffer_size - 1;
420 vlcp->buf_size = buffer_size;
// Initial value of the "previous byte > 0x8F" state used by the stuffing test.
425 vlcp->last_greater_than_8F =
true;
// VLC byte flushing followed by the codeword-append logic (fragments;
// presumably a flush helper and vlc_encode, headers not visible).
// Drain whole bytes from the 64-bit accumulator.
432 while (vlcp->used_bits >= 8) {
// Only 7 bits are written when the previous byte was > 0x8F and the low 7
// pending bits are all ones.
433 int escape = (int)vlcp->last_greater_than_8F;
434 int is_7f = (int)((vlcp->tmp & 0x7F) == 0x7F);
435 int need_stuff = (escape & is_7f) != 0 ? 1 : 0;
436 int bits = 8 - need_stuff;
// Take the low `bits` bits and store them backwards from buf.
438 ui8 byte = (
ui8)(vlcp->tmp & ((1u << bits) - 1));
439 *(vlcp->buf - vlcp->pos) =
byte;
// (the advance of pos and the shift of tmp are not visible)
442 vlcp->used_bits -= bits;
443 vlcp->last_greater_than_8F =
byte > 0x8F;
// Append a codeword: avail is the free space in the 64-bit accumulator.
452 int avail = 64 - vlcp->used_bits;
// Common case: the whole codeword fits.
453 if (likely(avail > 0 && cwd_len <= avail)) {
454 vlcp->tmp |= cwd << vlcp->used_bits;
455 vlcp->used_bits += cwd_len;
// Overflow case: OR in what fits and mark the accumulator full (handling of
// the leftover bits is not visible).
458 if (likely(avail > 0))
459 vlcp->tmp |= cwd << vlcp->used_bits;
460 vlcp->used_bits = 64;
// Terminates the MEL and VLC streams, sharing their last partial bytes when
// possible (fragment; header and some statements not visible).
// Emit a final 1 bit into the MEL stream.
474 mel_emit_bits(melp, 1, 1);
// If the stuffing condition holds for the pending VLC bits, write 0x7f and
// consume 7 bits.
476 if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) {
477 *(vlcp->buf - vlcp->pos) = 0x7f;
480 vlcp->used_bits -= 7;
// Move the pending MEL bits to the top of the byte.
483 melp->tmp = melp->tmp << melp->remaining_bits;
// mel_mask: bit positions occupied by MEL data (high side);
// vlc_mask: bit positions occupied by VLC data (low side).
484 int mel_mask = (0xFF << melp->remaining_bits) & 0xFF;
485 int vlc_mask = 0xFF >> (8 - vlcp->used_bits);
// Neither stream has pending bits (the action taken is not visible).
486 if ((mel_mask | vlc_mask) == 0)
489 if (melp->pos >= melp->buf_size)
490 OJPH_ERROR(0x00020003,
"mel encoder's buffer is full");
491 ui8 vlcp_tmp = (
ui8)vlcp->tmp;
// Candidate shared byte: the OR of both partial bytes.
492 int fuse = melp->tmp | vlcp_tmp;
// Share one byte only if it reproduces each stream's bits under its own mask,
// is not 0xFF, and the VLC stream already has more than one byte.
493 if ( ( ((fuse ^ melp->tmp) & mel_mask)
494 | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0
495 && (fuse != 0xFF) && vlcp->pos > 1)
497 melp->buf[melp->pos++] = (
ui8)fuse;
// Otherwise each stream writes its own final byte.
501 if (vlcp->pos >= vlcp->buf_size)
502 OJPH_ERROR(0x00020004,
"vlc encoder's buffer is full");
503 melp->buf[melp->pos++] = (
ui8)melp->tmp;
504 *(vlcp->buf - vlcp->pos) = (
ui8)vlcp_tmp;
// MagSgn state set-up (fragment; header and other assignments not visible).
// Records the buffer capacity and clears the "previous byte was 0xFF" flag.
530 msp->buf_size = buffer_size;
533 msp->last_was_ff =
false;
// MagSgn byte flushing followed by the codeword-append logic (fragments;
// presumably a flush helper and the ms encode routine, headers not visible).
// After an 0xFF byte the next byte carries only 7 bits: wait until 7 bits are
// pending (the early-exit statement is not visible), then write them.
540 if (msp->last_was_ff) {
541 if (msp->used_bits < 7)
543 msp->buf[msp->pos++] = (
ui8)(msp->tmp & 0x7F);
546 msp->last_was_ff =
false;
// Bulk path: handle up to 8 whole bytes per iteration.
549 while (msp->used_bits >= 8) {
550 int n_bytes = msp->used_bits >> 3;
551 if (n_bytes > 8) n_bytes = 8;
553 ui64 word = msp->tmp;
// Mask selecting the n_bytes low-order bytes of the word.
554 ui64 valid_mask = (n_bytes < 8)
555 ? (1ULL << (n_bytes * 8)) - 1 : ~(
ui64)0;
// Sets bit 7 of byte positions where w has a zero byte; w is defined on a
// line not visible here (presumably derived from word so that 0xFF bytes of
// word map to zero bytes of w).
558 ui64 ff_detect = (w - 0x0101010101010101ULL) & ~w
559 & 0x8080808080808080ULL;
560 ff_detect &= valid_mask;
// No 0xFF among the bytes: copy them in one go. Copying from &word in memory
// order relies on little-endian layout.
562 if (likely(ff_detect == 0)) {
563 memcpy(msp->buf + msp->pos, &word, (
size_t)n_bytes);
564 msp->pos += (
ui32)n_bytes;
566 msp->tmp >>= (n_bytes * 8);
569 msp->used_bits -= n_bytes * 8;
// 0xFF present: copy up to and including byte ff_pos (computed on a line not
// visible).
572 int safe = ff_pos + 1;
573 memcpy(msp->buf + msp->pos, &word, (
size_t)safe);
574 msp->pos += (
ui32)safe;
580 msp->used_bits -= bits;
// Write the following 7-bit byte right away if enough bits are pending,
// otherwise remember that the last byte written was 0xFF.
582 if (msp->used_bits >= 7) {
583 msp->buf[msp->pos++] = (
ui8)(msp->tmp & 0x7F);
586 msp->last_was_ff =
false;
588 msp->last_was_ff =
true;
// Append a codeword: avail is the free space in the 64-bit accumulator.
600 int avail = 64 - msp->used_bits;
// Common case: the whole codeword fits.
601 if (likely(avail > 0 && cwd_len <= avail)) {
602 msp->tmp |= cwd << msp->used_bits;
603 msp->used_bits += cwd_len;
// Overflow case: OR in what fits (handling of the leftover bits is not visible).
606 if (likely(avail > 0))
607 msp->tmp |= cwd << msp->used_bits;
// Final byte of the MagSgn stream (fragment; header and surrounding
// conditions not visible).
// The last byte may hold 7 bits after an 0xFF byte, 8 otherwise.
642 int max_bits = msp->last_was_ff ? 7 : 8;
// t = number of padding bits needed to fill the byte.
643 int t = max_bits - msp->used_bits;
// Low used_bits bits come from the accumulator ...
644 ui32 byte = (
ui32)(msp->tmp & ((1ULL << msp->used_bits) - 1));
// ... and the t bits above them are set to 1.
645 byte |= (0xFFu & ((1u << t) - 1)) << msp->used_bits;
648 if (msp->pos >= msp->buf_size)
649 OJPH_ERROR(0x00020006,
"magnitude sign encoder's buffer is full");
650 msp->buf[msp->pos++] = (
ui8)
byte;
// Separate handling when nothing was written but the previous byte was 0xFF
// (the body is not visible).
653 else if (msp->last_was_ff)
// Shorthand vector constants: all-zero register, and 1 in every 32-bit lane.
657#define ZERO _mm256_setzero_si256()
658#define ONE _mm256_set1_epi32(1)
// Per-lane leading-zero count built from the float exponent, with no lzcnt
// instruction (some lines, including the return, are not visible).
661inline __m256i avx2_lzcnt_epi32(__m256i v) {
// v & ~(v >> 8): clears lower bits so the int->float conversion cannot round
// up into the next power of two.
663 v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v);
// Convert to float and move the sign/exponent field down to the low bits.
665 v = _mm256_castps_si256(_mm256_cvtepi32_ps(v));
666 v = _mm256_srli_epi32(v, 23);
// 158 = 127 (exponent bias) + 31; the saturating subtract gives 0 when the
// sign bit makes the field larger than 158.
667 v = _mm256_subs_epu16(_mm256_set1_epi32(158), v);
// Clamp to 32, the result for a zero input (exponent field 0).
668 v = _mm256_min_epi16(v, _mm256_set1_epi32(32));
// Per-lane "not equal": inverts the all-ones/all-zeros result of
// _mm256_cmpeq_epi32, so lanes that differ become 0xFFFFFFFF.
673inline __m256i avx2_cmpneq_epi32(__m256i v, __m256i v2) {
674 return _mm256_xor_si256(_mm256_cmpeq_epi32(v, v2), _mm256_set1_epi32((int32_t)0xffffffff));
// Derives per-sample exponent (eq_vec), magnitude/sign word (s_vec), a 4-bit
// pattern (rho_vec) and the maximum exponent (e_qmax_vec) from four source
// vectors. Some declarations and braces are not visible in this listing.
677static void proc_pixel(__m256i *src_vec,
ui32 p,
678 __m256i *eq_vec, __m256i *s_vec,
679 __m256i &rho_vec, __m256i &e_qmax_vec)
686 for (
ui32 i = 0; i < 4; ++i) {
// Double the sample (shifts out bit 31), shift right by p, clear the lowest bit.
688 val_vec[i] = _mm256_add_epi32(src_vec[i], src_vec[i]);
691 val_vec[i] = _mm256_srli_epi32(val_vec[i], (
int)p);
694 val_vec[i] = _mm256_and_si256(val_vec[i], _mm256_set1_epi32((
int)~1u));
// All-ones in lanes whose value is non-zero.
697 const __m256i val_notmask = avx2_cmpneq_epi32(val_vec[i], ZERO);
// Exponent = 32 - lzcnt(val - 1), i.e. the bit length of val - 1.
704 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
705 _eq_vec[i] = avx2_lzcnt_epi32(val_vec[i]);
706 _eq_vec[i] = _mm256_sub_epi32(_mm256_set1_epi32(32), _eq_vec[i]);
// s = (val - 2) + bit 31 of the source sample.
713 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
714 _s_vec[i] = _mm256_srli_epi32(src_vec[i], 31);
715 _s_vec[i] = _mm256_add_epi32(_s_vec[i], val_vec[i]);
// Zero the results for zero-valued lanes; val_vec becomes a 0/1 flag.
717 _eq_vec[i] = _mm256_and_si256(_eq_vec[i], val_notmask);
718 _s_vec[i] = _mm256_and_si256(_s_vec[i], val_notmask);
719 val_vec[i] = _mm256_srli_epi32(val_notmask, 31);
// Permutation that gathers even-indexed elements into the low 128 bits and
// odd-indexed elements into the high 128 bits.
723 const __m256i idx = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
// For each input pair (i, 2 + i): output [0 + i] takes the even-indexed
// elements of both inputs (selector 0x20), output [2 + i] the odd-indexed
// ones (selector 0x31). Applied to exponents, s words and the 0/1 flags.
737 for (
ui32 i = 0; i < 2; ++i) {
738 tmp1 = _mm256_permutevar8x32_epi32(_eq_vec[0 + i], idx);
739 tmp2 = _mm256_permutevar8x32_epi32(_eq_vec[2 + i], idx);
740 eq_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
741 eq_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
743 tmp1 = _mm256_permutevar8x32_epi32(_s_vec[0 + i], idx);
744 tmp2 = _mm256_permutevar8x32_epi32(_s_vec[2 + i], idx);
745 s_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
746 s_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
748 tmp1 = _mm256_permutevar8x32_epi32(val_vec[0 + i], idx);
749 tmp2 = _mm256_permutevar8x32_epi32(val_vec[2 + i], idx);
750 _rho_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
751 _rho_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
// Lane-wise maximum exponent over the four rearranged vectors.
754 e_qmax_vec = _mm256_max_epi32(eq_vec[0], eq_vec[1]);
755 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[2]);
756 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[3]);
// rho: flag n of each lane goes to bit n.
757 _rho_vec[1] = _mm256_slli_epi32(_rho_vec[1], 1);
758 _rho_vec[2] = _mm256_slli_epi32(_rho_vec[2], 2);
759 _rho_vec[3] = _mm256_slli_epi32(_rho_vec[3], 3);
760 rho_vec = _mm256_or_si256(_rho_vec[0], _rho_vec[1]);
761 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[2]);
762 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[3]);
// Rearranges four 8-lane vectors in place using unpack and 128-bit permutes.
// NOTE(review): the statements that store tmp1 back (after each 0x20 permute)
// are not visible in this listing.
778static void rotate_matrix(__m256i *matrix)
// Interleave 32-bit elements of rows 0/1 and rows 2/3 within each 128-bit half.
780 __m256i tmp1 = _mm256_unpacklo_epi32(matrix[0], matrix[1]);
781 __m256i tmp2 = _mm256_unpacklo_epi32(matrix[2], matrix[3]);
782 __m256i tmp3 = _mm256_unpackhi_epi32(matrix[0], matrix[1]);
783 __m256i tmp4 = _mm256_unpackhi_epi32(matrix[2], matrix[3]);
// Interleave 64-bit pairs: within each 128-bit half, matrix[0], matrix[2],
// matrix[1], matrix[3] now hold columns 0, 1, 2, 3 of the 4x4 block.
785 matrix[0] = _mm256_unpacklo_epi64(tmp1, tmp2);
786 matrix[1] = _mm256_unpacklo_epi64(tmp3, tmp4);
787 matrix[2] = _mm256_unpackhi_epi64(tmp1, tmp2);
788 matrix[3] = _mm256_unpackhi_epi64(tmp3, tmp4);
// Regroup 128-bit halves: 0x20 joins the two low halves, 0x31 the two high ones.
790 tmp1 = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x20);
791 matrix[2] = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x31);
794 tmp1 = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x20);
795 matrix[3] = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x31);
// Computes the number of magnitude/sign bits per sample, masks the s words to
// that length, and feeds merged codewords to ms_encode_nodefer.
// NOTE(review): the rest of the parameter list, local declarations and some
// statements are not visible in this listing.
799static void proc_ms_encode(
ms_struct *msp,
// For bit n = 0: length = uq - (bit n of tuple) where bit n of rho is set,
// otherwise 0.
809 auto tmp = _mm256_and_si256(tuple_vec, ONE);
810 tmp = _mm256_sub_epi32(uq_vec, tmp);
811 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
812 auto mask = avx2_cmpneq_epi32(tmp1, ZERO);
813 m_vec[0] = _mm256_and_si256(mask, tmp);
// Same for bit 1.
816 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(2));
817 tmp = _mm256_srli_epi32(tmp, 1);
818 tmp = _mm256_sub_epi32(uq_vec, tmp);
819 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
820 mask = avx2_cmpneq_epi32(tmp1, ZERO);
821 m_vec[1] = _mm256_and_si256(mask, tmp);
// Same for bit 2.
824 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(4));
825 tmp = _mm256_srli_epi32(tmp, 2);
826 tmp = _mm256_sub_epi32(uq_vec, tmp);
827 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
828 mask = avx2_cmpneq_epi32(tmp1, ZERO);
829 m_vec[2] = _mm256_and_si256(mask, tmp);
// Same for bit 3.
832 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(8));
833 tmp = _mm256_srli_epi32(tmp, 3);
834 tmp = _mm256_sub_epi32(uq_vec, tmp);
835 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
836 mask = avx2_cmpneq_epi32(tmp1, ZERO);
837 m_vec[3] = _mm256_and_si256(mask, tmp);
// Bring lengths and s words into the same element order.
839 rotate_matrix(m_vec);
840 rotate_matrix(s_vec);
846 for (
ui32 i = 0; i < 4; ++i) {
// Spill the lengths, and the s words masked to their low m bits, to arrays.
850 _mm256_storeu_si256((__m256i *)cwd_len, m_vec[i]);
851 tmp = _mm256_sllv_epi32(ONE, m_vec[i]);
852 tmp = _mm256_sub_epi32(tmp, ONE);
853 tmp = _mm256_and_si256(tmp, s_vec[i]);
854 _mm256_storeu_si256((__m256i*)cwd, tmp);
// j takes the values 0 and 2; each iteration handles two element pairs,
// at idx0 (defined on a line not visible) and idx1.
856 for (
ui32 j = 0; j < 4; j += 2) {
// Merge the two codewords at idx0 and idx0 + 1 into one 64-bit word.
858 ui64 _cwd = cwd[idx0];
859 int _cwd_len = cwd_len[idx0];
860 _cwd |= ((
ui64)cwd[idx0 + 1]) << _cwd_len;
861 _cwd_len += cwd_len[idx0 + 1];
863 ui32 idx1 = (j + 1) * 2;
864 int len1 = cwd_len[idx1] + cwd_len[idx1 + 1];
// If the next pair also fits in 64 bits, append it and emit once.
865 if (likely(_cwd_len + len1 <= 64)) {
866 _cwd |= ((
ui64)cwd[idx1]) << _cwd_len;
867 _cwd_len += cwd_len[idx1];
868 _cwd |= ((
ui64)cwd[idx1 + 1]) << _cwd_len;
869 _cwd_len += cwd_len[idx1 + 1];
870 ms_encode_nodefer(msp, _cwd, _cwd_len);
// Otherwise emit the first pair, then build and emit the second pair
// separately (the re-initialisation of _cwd is not visible).
872 ms_encode_nodefer(msp, _cwd, _cwd_len);
874 _cwd_len = cwd_len[idx1];
875 _cwd |= ((
ui64)cwd[idx1 + 1]) << _cwd_len;
876 _cwd_len += cwd_len[idx1 + 1];
877 ms_encode_nodefer(msp, _cwd, _cwd_len);
// Builds a 4-bit mask per lane: bit n is set when eq_vec[n] equals e_qmax_vec.
// The result is zero in lanes where u_q_vec is not greater than zero.
// (The remainder of the parameter list is not visible in this listing.)
884static __m256i cal_eps_vec(__m256i *eq_vec, __m256i &u_q_vec,
// All-ones in lanes with u_q > 0.
894 auto u_q_mask = _mm256_cmpgt_epi32(u_q_vec, ZERO);
// Bit 0: shifting the all-ones compare result right by 31 yields 0 or 1.
896 auto mask = _mm256_cmpeq_epi32(eq_vec[0], e_qmax_vec);
897 auto eps_vec = _mm256_srli_epi32(mask, 31);
// Bit 1.
899 mask = _mm256_cmpeq_epi32(eq_vec[1], e_qmax_vec);
900 auto tmp = _mm256_srli_epi32(mask, 31);
901 tmp = _mm256_slli_epi32(tmp, 1);
902 eps_vec = _mm256_or_si256(eps_vec, tmp);
// Bit 2.
904 mask = _mm256_cmpeq_epi32(eq_vec[2], e_qmax_vec);
905 tmp = _mm256_srli_epi32(mask, 31);
906 tmp = _mm256_slli_epi32(tmp, 2);
907 eps_vec = _mm256_or_si256(eps_vec, tmp);
// Bit 3.
909 mask = _mm256_cmpeq_epi32(eq_vec[3], e_qmax_vec);
910 tmp = _mm256_srli_epi32(mask, 31);
911 tmp = _mm256_slli_epi32(tmp, 3);
912 eps_vec = _mm256_or_si256(eps_vec, tmp);
914 return _mm256_and_si256(u_q_mask, eps_vec);
// Stores into e_val_vec[x] the lane-wise maximum of eq_vec[1] and eq_vec[3]
// moved one element towards higher indices, carrying one value across calls
// in prev_e_val_vec.
917static void update_lep(
ui32 x, __m256i &prev_e_val_vec,
918 __m256i *eq_vec, __m256i *e_val_vec,
919 const __m256i left_shift)
// Permute eq_vec[3] with the caller's left_shift pattern, then overwrite
// element 0 with the value carried in from the previous call.
925 auto tmp = _mm256_permutevar8x32_epi32(eq_vec[3], left_shift);
926 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_e_val_vec)), 0);
// Carry element 7 of eq_vec[3] to the next call (kept in element 0).
927 prev_e_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(eq_vec[3], 7), 0);
928 e_val_vec[x] = _mm256_max_epi32(eq_vec[1], tmp);
// Stores into cx_val_vec[x] the OR of bit 3 of the permuted (neighbouring)
// rho and bit 1 of the current rho, carrying one rho value across calls in
// prev_cx_val_vec.
932static void update_lcxp(
ui32 x, __m256i &prev_cx_val_vec,
933 __m256i &rho_vec, __m256i *cx_val_vec,
934 const __m256i left_shift)
// Permute rho with the caller's left_shift pattern, then overwrite element 0
// with the value carried in from the previous call.
940 auto tmp = _mm256_permutevar8x32_epi32(rho_vec, left_shift);
941 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_cx_val_vec)), 0);
// Carry element 7 of rho to the next call (kept in element 0).
942 prev_cx_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(rho_vec, 7), 0);
// Bit 3 of the neighbouring rho, moved to bit 0.
944 tmp = _mm256_and_si256(tmp, _mm256_set1_epi32(8));
945 tmp = _mm256_srli_epi32(tmp, 3);
// Bit 1 of the current rho, moved to bit 0.
947 auto tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
948 tmp1 = _mm256_srli_epi32(tmp1, 1);
949 cx_val_vec[x] = _mm256_or_si256(tmp, tmp1);
// Looks up one 32-bit table entry per lane. The index is
// (cq << 8) + (rho << 4) + eps, and the gather uses a 4-byte scale, i.e.
// vlc_tbl is addressed as an array of 32-bit entries.
952static __m256i cal_tuple(__m256i &cq_vec, __m256i &rho_vec,
953 __m256i &eps_vec,
ui32 *vlc_tbl)
956 auto tmp = _mm256_slli_epi32(cq_vec, 8);
957 auto tmp1 = _mm256_slli_epi32(rho_vec, 4);
958 tmp = _mm256_add_epi32(tmp, tmp1);
959 tmp = _mm256_add_epi32(tmp, eps_vec);
960 return _mm256_i32gather_epi32((
const int *)vlc_tbl, tmp, 4);
// Returns (rho >> 1) | (rho & 1) per lane. Only rho_vec is used in the
// visible lines; x, cx_val_vec and right_shift presumably exist to match the
// signature of proc_cq2 (lines between the header and the body are not visible).
963static __m256i proc_cq1(
ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
964 const __m256i right_shift)
971 auto tmp = _mm256_srli_epi32(rho_vec, 1);
972 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
973 return _mm256_or_si256(tmp, tmp1);
// Combines cx_val_vec[x] (permuted by one and by two applications of
// right_shift, topped up from cx_val_vec[x + 1]) with bits 2 and 3 of rho.
// NOTE(review): the #else/#endif lines of the conditional are not visible.
976static __m256i proc_cq2(
ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
977 const __m256i right_shift)
// lcxp1_vec: one application of the right_shift permutation; tmp: two.
981 auto lcxp1_vec = _mm256_permutevar8x32_epi32(cx_val_vec[x], right_shift);
982 auto tmp = _mm256_permutevar8x32_epi32(lcxp1_vec, right_shift);
// Replace the top two 32-bit elements of tmp with the first two elements of
// the next vector: one 64-bit insert on x86-64, two 32-bit inserts on i386.
984#ifdef OJPH_ARCH_X86_64
985 tmp = _mm256_insert_epi64(tmp,
986 _mm_cvtsi128_si64(_mm256_castsi256_si128(cx_val_vec[x + 1])), 3);
987#elif (defined OJPH_ARCH_I386)
988 int lsb = _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1]));
989 tmp = _mm256_insert_epi32(tmp, lsb, 6);
990 int msb = _mm_extract_epi32(_mm256_castsi256_si128(cx_val_vec[x + 1]), 1);
991 tmp = _mm256_insert_epi32(tmp, msb, 7);
993 #error Error unsupport compiler
// The twice-permuted values are weighted by 4 and added to the once-permuted
// values (whose element 7 comes from element 0 of the next vector).
995 tmp = _mm256_slli_epi32(tmp, 2);
996 auto tmp1 = _mm256_insert_epi32(lcxp1_vec,
997 _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1])), 7);
998 tmp = _mm256_add_epi32(tmp1, tmp);
// OR in bit 2 of rho (moved to bit 1) ...
1000 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
1001 tmp1 = _mm256_srli_epi32(tmp1, 1);
1002 tmp = _mm256_or_si256(tmp, tmp1);
// ... and bit 3 of rho (also moved to bit 1).
1004 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
1005 tmp1 = _mm256_srli_epi32(tmp1, 2);
1007 return _mm256_or_si256(tmp, tmp1);
// Spills per-lane decisions to arrays and walks the lanes in pairs.
// NOTE(review): the statements executed inside the conditions (presumably
// mel_encode calls) and the declaration of mel_bit are not visible.
1010static void proc_mel_encode1(
mel_struct *melp, __m256i &cq_vec,
1011 __m256i &rho_vec, __m256i u_q_vec,
ui32 ignore,
1012 const __m256i right_shift)
1014 int32_t mel_need_encode[8];
1015 int32_t mel_need_encode2[8];
1017 int32_t mel_bit2[8];
// Non-zero where cq == 0.
1020 _mm256_storeu_si256((__m256i *)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO));
// 1 where rho != 0, else 0.
1022 _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
// tmp: u_q permuted with right_shift (the neighbouring lane's u_q).
1026 auto tmp = _mm256_permutevar8x32_epi32(u_q_vec, right_shift);
// 1 where both u_q values of the pair exceed 2.
1027 auto tmp1 = _mm256_min_epi32(u_q_vec, tmp);
1028 _mm256_storeu_si256((__m256i*)mel_bit2, _mm256_srli_epi32(_mm256_cmpgt_epi32(tmp1, _mm256_set1_epi32(2)), 31));
// Non-zero where both u_q values of the pair are greater than zero.
1031 auto need_encode2 = _mm256_cmpgt_epi32(u_q_vec, ZERO);
1032 _mm256_storeu_si256((__m256i*)mel_need_encode2, _mm256_and_si256(need_encode2, _mm256_cmpgt_epi32(tmp, ZERO)));
// Number of lanes to process: two ignored columns drop one lane.
1034 ui32 i_max = 8 - (ignore / 2);
// Lanes are handled two at a time; the second lane of a pair is only
// considered when it is below i_max.
1036 for (
ui32 i = 0; i < i_max; i += 2) {
1037 if (mel_need_encode[i]) {
1041 if (i + 1 < i_max) {
1042 if (mel_need_encode[i + 1]) {
1047 if (mel_need_encode2[i]) {
// Variant that derives a bit mask of lanes with cq == 0 and iterates over its
// set bits. NOTE(review): the iteration itself and the statements it executes
// are not visible in this listing.
1053static void proc_mel_encode2(
mel_struct *melp, __m256i &cq_vec,
1054 __m256i &rho_vec, __m256i u_q_vec,
ui32 ignore,
1055 const __m256i right_shift)
// movemask_epi8 yields one bit per byte, i.e. 4 bits per 32-bit lane.
1060 __m256i need = _mm256_cmpeq_epi32(cq_vec, ZERO);
1061 ui32 mask = (
ui32)_mm256_movemask_epi8(need);
1064 ui32 i_max = 8 - (ignore / 2);
// Keep only the bits of the first i_max lanes.
// NOTE(review): when i_max == 8 this is a 32-bit shift of a 32-bit value,
// which is undefined; confirm that a guard exists on the line not visible
// between these two statements.
1066 mask &= (1u << (i_max * 4)) - 1;
// 1 where rho != 0, else 0.
1072 _mm256_storeu_si256((__m256i*)mel_bit,
1073 _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
// Convert a bit position in the byte mask to a lane index.
1077 ui32 i = bit_pos / 4;
// Function-pointer type matching the signatures of proc_mel_encode1 and
// proc_mel_encode2.
1083using fn_proc_mel_encode = void (*)(
mel_struct *, __m256i &, __m256i &,
1084 __m256i,
ui32,
const __m256i);
// Assembles the bits for one pair of tuples plus their combined table entry
// into val/size (fragment; presumably build_vlc_uvlc_pair, the header and the
// computation of `entry` are not visible).
// First tuple: codeword in the bits above bit 3, length in the low 3 bits.
1090 val = tuple[i + 0] >> 4;
1091 size = tuple[i + 0] & 7;
// Second tuple appended above the first.
1093 val |= (
ui64)(tuple[i + 1] >> 4) << size;
1094 size += tuple[i + 1] & 7;
// Pair-table entry: codeword above bit 4, length in the low 5 bits.
1097 val |= (
ui64)(entry >> 5) << size;
1098 size += entry & 0x1F;
// Emits VLC bits for the lanes of one vector (fragment; presumably
// proc_vlc_encode, header and some statements not visible).
// Number of lanes to process: two ignored columns drop one lane.
1104 ui32 i_max = 8 - (ignore / 2);
// Main loop: build two lane pairs per iteration and emit them with a single
// vlc_encode call, the second placed above the first.
1107 for (; i + 2 < i_max; i += 4) {
1108 ui64 val1;
int size1;
1109 build_vlc_uvlc_pair(tuple, u_q, i,
uvlc_tbl, val1, size1);
1110 ui64 val2;
int size2;
1111 build_vlc_uvlc_pair(tuple, u_q, i + 2,
uvlc_tbl, val2, size2);
1112 vlc_encode(vlcp, val1 | (val2 << size1), size1 + size2);
// Leftover pair handled on its own (the guarding condition and the emit call
// are not visible).
1116 build_vlc_uvlc_pair(tuple, u_q, i,
uvlc_tbl, val, size);
// Horizontal loop over one pair of rows: loads 16 columns x 2 rows per
// iteration and drives the MEL, MagSgn and VLC encoders.
// NOTE(review): this fragment starts mid-signature (presumably encode_x_loop,
// a template on PASS); several declarations, conditions and braces are not
// visible.
1126 __m256i *e_val_vec, __m256i &prev_e_val_vec,
1127 __m256i *cx_val_vec, __m256i &prev_cx_val_vec,
1129 const __m256i &right_shift,
const __m256i &left_shift)
1139 for (
ui32 x = 0; x < n_loop; ++x) {
// Last iteration with a partial group of columns: copy the valid samples into
// a zeroed 16-entry buffer so the vector loads never read past the row.
1142 if ((x == (n_loop - 1)) && (_width % 16)) {
1143 ui32 tmp_buf[16] = { 0 };
1144 memcpy(tmp_buf, sp, (_width % 16) *
sizeof(
ui32));
1145 src_vec[0] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1146 src_vec[2] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
// Second row only if it exists.
1147 if (y + 1 < height) {
1148 memcpy(tmp_buf, sp + stride, (_width % 16) *
sizeof(
ui32));
1149 src_vec[1] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1150 src_vec[3] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
// Full group: load directly from the two rows.
1158 src_vec[0] = _mm256_loadu_si256((__m256i*)(sp));
1159 src_vec[2] = _mm256_loadu_si256((__m256i*)(sp + 8));
1161 if (y + 1 < height) {
1162 src_vec[1] = _mm256_loadu_si256((__m256i*)(sp + stride));
1163 src_vec[3] = _mm256_loadu_si256((__m256i*)(sp + 8 + stride));
1172 __m256i rho_vec, e_qmax_vec;
1173 proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);
// max_e = max(e_val[x], e_val permuted by right_shift with element 7 taken
// from the next vector) - 1.
1176 tmp = _mm256_permutevar8x32_epi32(e_val_vec[x], right_shift);
1177 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(e_val_vec[x + 1])), 7);
1179 auto max_e_vec = _mm256_max_epi32(tmp, e_val_vec[x]);
1180 max_e_vec = _mm256_sub_epi32(max_e_vec, ONE);
// kappa: 1 where rho has at most one bit set (rho & (rho - 1) == 0),
// otherwise max(max_e, 1).
1183 tmp = _mm256_max_epi32(max_e_vec, ONE);
1184 tmp1 = _mm256_sub_epi32(rho_vec, ONE);
1185 tmp1 = _mm256_and_si256(rho_vec, tmp1);
1187 auto cmp = _mm256_cmpeq_epi32(tmp1, ZERO);
1188 auto kappa_vec1_ = _mm256_and_si256(cmp, ONE);
1189 auto kappa_vec2_ = _mm256_and_si256(_mm256_xor_si256(cmp, _mm256_set1_epi32((int32_t)0xffffffff)), tmp);
1190 const __m256i kappa_vec = _mm256_max_epi32(kappa_vec1_, kappa_vec2_);
// Context computation; the condition choosing between the two helpers is not
// visible (presumably PASS).
1193 tmp = proc_cq1(x, cx_val_vec, rho_vec, right_shift);
1195 tmp = proc_cq2(x, cx_val_vec, rho_vec, right_shift);
// cq: the helper result permuted by left_shift, with element 0 carried in
// from the previous iteration; element 7 is carried out.
1197 auto cq_vec = _mm256_permutevar8x32_epi32(tmp, left_shift);
1198 cq_vec = _mm256_insert_epi32(cq_vec, prev_cq, 0);
1199 prev_cq = (
ui32)_mm256_extract_epi32(tmp, 7);
// Refresh the per-column state arrays at index x.
1201 update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
1202 update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);
// uq = max(kappa, e_qmax); u_q = uq - kappa.
1206 auto uq_vec = _mm256_max_epi32(kappa_vec, e_qmax_vec);
1207 auto u_q_vec = _mm256_sub_epi32(uq_vec, kappa_vec);
1209 auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
1210 __m256i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
// Padding columns are ignored only in the last iteration.
1211 ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
// MEL coding; the condition choosing between the two variants is not visible.
1214 proc_mel_encode1(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1217 proc_mel_encode2(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1220 proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
// Drop the low 4 bits of each tuple, then spill tuples and u_q to arrays.
1224 tuple_vec = _mm256_srli_epi32(tuple_vec, 4);
1225 _mm256_storeu_si256((__m256i*)tuple, tuple_vec);
1226 _mm256_storeu_si256((__m256i*)u_q, u_q_vec);
// Zero the entry just past the last valid lane when the lane count is odd,
// and always zero index 8, so pair-wise processing can read one element past
// the valid data (the arrays therefore need at least 9 entries).
1228 ui32 i_max = 8 - (_ignore / 2);
1229 if (i_max & 1) { tuple[i_max] = 0; u_q[i_max] = 0; }
1230 tuple[8] = 0; u_q[8] = 0;
// VLC coding with the pair table that matches PASS.
1232 proc_vlc_encode(&vlc, tuple, u_q, _ignore,
1233 (PASS == 1) ? uvlc_tbl_pair1 : uvlc_tbl_pair2);
// Top-level code-block encoder: sets up the three bit-stream writers, runs the
// row-pair loop and concatenates MagSgn, MEL and VLC output into one buffer.
// NOTE(review): this fragment starts mid-signature; several declarations,
// initialisation calls, conditions and braces are not visible.
1240 ojph::mem_elastic_allocator *elastic,
1241 ojph::coded_lists *& coded)
// Round the width up to a multiple of 16; ignore = number of padding columns.
1245 ui32 width = (_width + 15) & ~15u;
1246 ui32 ignore = width - _width;
// Stack buffer sizes: MagSgn gets its own buffer; MEL and VLC share one, with
// the first mel_size bytes for MEL and the rest for VLC.
1247 const int ms_size = (16384 * 16 + 14) / 15;
1248 const int mel_vlc_size = 3072;
1249 const int mel_size = 192;
1250 const int vlc_size = mel_vlc_size - mel_size;
1252 ui8 ms_buf[ms_size];
1253 ui8 mel_vlc_buf[mel_vlc_size];
1254 ui8 *mel_buf = mel_vlc_buf;
1255 ui8 *vlc_buf = mel_vlc_buf + mel_size;
1262 ms_init(&ms, ms_size, ms_buf);
// Right-shift amount handed to the sample pre-processing.
1264 const ui32 p = 30 - missing_msbs;
// Permutation patterns: right_shift yields result[k] = src[k + 1] (element 7
// wraps to src[0]); left_shift yields result[k] = src[k - 1] (element 0 wraps
// to src[7]).
1275 const __m256i right_shift = _mm256_set_epi32(
1276 0, 7, 6, 5, 4, 3, 2, 1
1279 const __m256i left_shift = _mm256_set_epi32(
1280 6, 5, 4, 3, 2, 1, 0, 7
// Number of 16-column groups per row.
1283 ui32 n_loop = (width + 15) / 16;
// Per-group state arrays. Index n_loop is written below, so n_loop must not
// exceed 64. NOTE(review): no check is visible here — confirm callers bound
// the width.
1285 __m256i e_val_vec[65];
1287 e_val_vec[i] = ZERO;
1289 __m256i prev_e_val_vec = ZERO;
1291 __m256i cx_val_vec[65];
1292 __m256i prev_cx_val_vec = ZERO;
// Two rows are processed per iteration.
1299 for (
ui32 y = 0; y < height; y += 2)
// Store the carried-out state of the previous row pair at index n_loop, one
// past the last group, then reset the carries.
1301 e_val_vec[n_loop] = prev_e_val_vec;
1303 tmp = _mm256_and_si256(prev_cx_val_vec, _mm256_set1_epi32(8));
1304 cx_val_vec[n_loop] = _mm256_srli_epi32(tmp, 3);
1306 prev_e_val_vec = ZERO;
1307 prev_cx_val_vec = ZERO;
1309 ui32 *sp = buf + y * stride;
// The condition choosing between the two instantiations is not visible
// (presumably the first row pair uses <1>).
1312 encode_x_loop<1>(sp, stride, height, y, n_loop, _width,
1313 ignore, p, mel, vlc, ms,
1314 e_val_vec, prev_e_val_vec,
1315 cx_val_vec, prev_cx_val_vec, prev_cq,
1316 right_shift, left_shift);
1318 encode_x_loop<2>(sp, stride, height, y, n_loop, _width,
1319 ignore, p, mel, vlc, ms,
1320 e_val_vec, prev_e_val_vec,
1321 cx_val_vec, prev_cx_val_vec, prev_cq,
1322 right_shift, left_shift);
// Seed prev_cq from the first group's state: element 0 of
// cx_val_vec[0] + 4 * (cx_val_vec[0] permuted by right_shift).
1324 tmp = _mm256_permutevar8x32_epi32(cx_val_vec[0], right_shift);
1325 tmp = _mm256_slli_epi32(tmp, 2);
1326 tmp = _mm256_add_epi32(tmp, cx_val_vec[0]);
1327 prev_cq = (
ui32)_mm_cvtsi128_si32(_mm256_castsi256_si128(tmp));
// Output layout: MagSgn bytes, then MEL bytes, then VLC bytes. The VLC stream
// was written backwards, so its first byte is at vlc.buf - vlc.pos + 1.
1335 lengths[0] = mel.pos + vlc.pos + ms.pos;
1336 elastic->
get_buffer(mel.pos + vlc.pos + ms.pos, coded);
1337 memcpy(coded->
buf, ms.buf, ms.pos);
1338 memcpy(coded->
buf + ms.pos, mel.buf, mel.pos);
1339 memcpy(coded->
buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
// Store the combined MEL + VLC length in the last two bytes: bits 4 and up in
// the final byte, the low 4 bits in the low nibble of the byte before it.
1342 ui32 num_bytes = mel.pos + vlc.pos;
1343 coded->
buf[lengths[0]-1] = (
ui8)(num_bytes >> 4);
1344 coded->
buf[lengths[0]-2] = coded->
buf[lengths[0]-2] & 0xF0;
1345 coded->
buf[lengths[0]-2] =
1346 (
ui8)(coded->
buf[lengths[0]-2] | (num_bytes & 0xF));
void get_buffer(ui32 needed_bytes, coded_lists *&p)
static bool uvlc_init_tables()
Initializes uvlc_tbl0 and uvlc_tbl1 tables.
static bool vlc_init_tables()
Initializes vlc_tbl0 and vlc_tbl1 tables, from table0.h and table1.h.
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static void ms_terminate(ms_struct *msp)
static void vlc_encode(vlc_struct *vlcp, int cwd, int cwd_len)
static void terminate_mel_vlc(mel_struct *melp, vlc_struct *vlcp)
void ojph_encode_codeblock_avx2(ui32 *buf, ui32 missing_msbs, ui32 num_passes, ui32 width, ui32 height, ui32 stride, ui32 *lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *&coded)
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static void ms_init(ms_struct *msp, ui32 buffer_size, ui8 *data)
static void mel_encode(mel_struct *melp, bool bit)
bool initialize_block_encoder_tables_avx2()
static void vlc_init(vlc_struct *vlcp, ui32 buffer_size, ui8 *data)
static uvlc_tbl_struct uvlc_tbl[num_uvlc_entries]
static ui32 population_count(ui32 val)
static ui32 count_trailing_zeros(ui32 val)
#define OJPH_FORCE_INLINE
#define OJPH_ERROR(t,...)
bool last_greater_than_8F