vp2intersectd
void _mm_2intersect_epi32 (__m128i a, __m128i b, __mmask8* k1, __mmask8* k2)
Synopsis
void _mm_2intersect_epi32 (__m128i a, __m128i b, __mmask8* k1, __mmask8* k2)
#include <immintrin.h>
Instruction: vp2intersectd k, xmm, xmm
CPUID Flags: AVX512_VP2INTERSECT + AVX512VL
Description
Compute the intersection of packed 32-bit integer vectors a and b, and store the match indications in the two masks pointed to by k1 and k2. Bit i of the mask at k1 is set if element i of a is equal to any element of b, and bit j of the mask at k2 is set if element j of b is equal to any element of a.
Operation
MEM[k1+7:k1] := 0
MEM[k2+7:k2] := 0
FOR i := 0 TO 3
FOR j := 0 TO 3
match := (a.dword[i] == b.dword[j] ? 1 : 0)
MEM[k1+7:k1].bit[i] |= match
MEM[k2+7:k2].bit[j] |= match
ENDFOR
ENDFOR
vp2intersectd
void _mm256_2intersect_epi32 (__m256i a, __m256i b, __mmask8* k1, __mmask8* k2)
Synopsis
void _mm256_2intersect_epi32 (__m256i a, __m256i b, __mmask8* k1, __mmask8* k2)
#include <immintrin.h>
Instruction: vp2intersectd k, ymm, ymm
CPUID Flags: AVX512_VP2INTERSECT + AVX512VL
Description
Compute the intersection of packed 32-bit integer vectors a and b, and store the match indications in the two masks pointed to by k1 and k2. Bit i of the mask at k1 is set if element i of a is equal to any element of b, and bit j of the mask at k2 is set if element j of b is equal to any element of a.
Operation
MEM[k1+7:k1] := 0
MEM[k2+7:k2] := 0
FOR i := 0 TO 7
FOR j := 0 TO 7
match := (a.dword[i] == b.dword[j] ? 1 : 0)
MEM[k1+7:k1].bit[i] |= match
MEM[k2+7:k2].bit[j] |= match
ENDFOR
ENDFOR
vp2intersectd
void _mm512_2intersect_epi32 (__m512i a, __m512i b, __mmask16* k1, __mmask16* k2)
Synopsis
void _mm512_2intersect_epi32 (__m512i a, __m512i b, __mmask16* k1, __mmask16* k2)
#include <immintrin.h>
Instruction: vp2intersectd k, zmm, zmm
CPUID Flags: AVX512_VP2INTERSECT + AVX512F
Description
Compute the intersection of packed 32-bit integer vectors a and b, and store the match indications in the two masks pointed to by k1 and k2. Bit i of the mask at k1 is set if element i of a is equal to any element of b, and bit j of the mask at k2 is set if element j of b is equal to any element of a.
Operation
MEM[k1+15:k1] := 0
MEM[k2+15:k2] := 0
FOR i := 0 TO 15
FOR j := 0 TO 15
match := (a.dword[i] == b.dword[j] ? 1 : 0)
MEM[k1+15:k1].bit[i] |= match
MEM[k2+15:k2].bit[j] |= match
ENDFOR
ENDFOR
vp2intersectq
void _mm_2intersect_epi64 (__m128i a, __m128i b, __mmask8* k1, __mmask8* k2)
Synopsis
void _mm_2intersect_epi64 (__m128i a, __m128i b, __mmask8* k1, __mmask8* k2)
#include <immintrin.h>
Instruction: vp2intersectq k, xmm, xmm
CPUID Flags: AVX512_VP2INTERSECT + AVX512VL
Description
Compute the intersection of packed 64-bit integer vectors a and b, and store the match indications in the two masks pointed to by k1 and k2. Bit i of the mask at k1 is set if element i of a is equal to any element of b, and bit j of the mask at k2 is set if element j of b is equal to any element of a.
Operation
MEM[k1+7:k1] := 0
MEM[k2+7:k2] := 0
FOR i := 0 TO 1
FOR j := 0 TO 1
match := (a.qword[i] == b.qword[j] ? 1 : 0)
MEM[k1+7:k1].bit[i] |= match
MEM[k2+7:k2].bit[j] |= match
ENDFOR
ENDFOR
vp2intersectq
void _mm256_2intersect_epi64 (__m256i a, __m256i b, __mmask8* k1, __mmask8* k2)
Synopsis
void _mm256_2intersect_epi64 (__m256i a, __m256i b, __mmask8* k1, __mmask8* k2)
#include <immintrin.h>
Instruction: vp2intersectq k, ymm, ymm
CPUID Flags: AVX512_VP2INTERSECT + AVX512VL
Description
Compute the intersection of packed 64-bit integer vectors a and b, and store the match indications in the two masks pointed to by k1 and k2. Bit i of the mask at k1 is set if element i of a is equal to any element of b, and bit j of the mask at k2 is set if element j of b is equal to any element of a.
Operation
MEM[k1+7:k1] := 0
MEM[k2+7:k2] := 0
FOR i := 0 TO 3
FOR j := 0 TO 3
match := (a.qword[i] == b.qword[j] ? 1 : 0)
MEM[k1+7:k1].bit[i] |= match
MEM[k2+7:k2].bit[j] |= match
ENDFOR
ENDFOR
vp2intersectq
void _mm512_2intersect_epi64 (__m512i a, __m512i b, __mmask8* k1, __mmask8* k2)
Synopsis
void _mm512_2intersect_epi64 (__m512i a, __m512i b, __mmask8* k1, __mmask8* k2)
#include <immintrin.h>
Instruction: vp2intersectq k, zmm, zmm
CPUID Flags: AVX512_VP2INTERSECT + AVX512F
Description
Compute the intersection of packed 64-bit integer vectors a and b, and store the match indications in the two masks pointed to by k1 and k2. Bit i of the mask at k1 is set if element i of a is equal to any element of b, and bit j of the mask at k2 is set if element j of b is equal to any element of a.
Operation
MEM[k1+7:k1] := 0
MEM[k2+7:k2] := 0
FOR i := 0 TO 7
FOR j := 0 TO 7
match := (a.qword[i] == b.qword[j] ? 1 : 0)
MEM[k1+7:k1].bit[i] |= match
MEM[k2+7:k2].bit[j] |= match
ENDFOR
ENDFOR
aadd
void _aadd_i32 (int* __A, int __B)
Synopsis
void _aadd_i32 (int* __A, int __B)
#include <x86gprintrin.h>
Instruction: aadd m32, r32
CPUID Flags: RAO_INT
Description
Atomically add the 32-bit value __B to the 32-bit value at memory operand __A, and store the result to the same memory location.
Operation
MEM[__A+31:__A] := MEM[__A+31:__A] + __B[31:0]
aadd
void _aadd_i64 (__int64* __A, __int64 __B)
Synopsis
void _aadd_i64 (__int64* __A, __int64 __B)
#include <x86gprintrin.h>
Instruction: aadd m64, r64
CPUID Flags: RAO_INT
Description
Atomically add the 64-bit value __B to the 64-bit value at memory operand __A, and store the result to the same memory location.
Operation
MEM[__A+63:__A] := MEM[__A+63:__A] + __B[63:0]
aand
void _aand_i32 (int* __A, int __B)
Synopsis
void _aand_i32 (int* __A, int __B)
#include <x86gprintrin.h>
Instruction: aand m32, r32
CPUID Flags: RAO_INT
Description
Atomically compute the bitwise AND of the 32-bit value at memory operand __A and the 32-bit value __B, and store the result to the same memory location.
Operation
MEM[__A+31:__A] := MEM[__A+31:__A] AND __B[31:0]
aand
void _aand_i64 (__int64* __A, __int64 __B)
Synopsis
void _aand_i64 (__int64* __A, __int64 __B)
#include <x86gprintrin.h>
Instruction: aand m64, r64
CPUID Flags: RAO_INT
Description
Atomically compute the bitwise AND of the 64-bit value at memory operand __A and the 64-bit value __B, and store the result to the same memory location.
Operation
MEM[__A+63:__A] := MEM[__A+63:__A] AND __B[63:0]
pabsw
__m128i _mm_abs_epi16 (__m128i a)
Synopsis
__m128i _mm_abs_epi16 (__m128i a)
#include <tmmintrin.h>
Instruction: pabsw xmm, xmm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
Latency and Throughput
vpabsw
__m128i _mm_mask_abs_epi16 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_abs_epi16 (__m128i src, __mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsw xmm {k}, xmm
CPUID Flags: AVX512BW + AVX512VL
Description
Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
Latency and Throughput
vpabsw
__m128i _mm_maskz_abs_epi16 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_abs_epi16 (__mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsw xmm {z}, xmm
CPUID Flags: AVX512BW + AVX512VL
Description
Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
Latency and Throughput
vpabsw
__m256i _mm256_abs_epi16 (__m256i a)
Synopsis
__m256i _mm256_abs_epi16 (__m256i a)
#include <immintrin.h>
Instruction: vpabsw ymm, ymm
CPUID Flags: AVX2
Description
Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
dst[MAX:256] := 0
Latency and Throughput
vpabsw
__m256i _mm256_mask_abs_epi16 (__m256i src, __mmask16 k, __m256i a)
Synopsis
__m256i _mm256_mask_abs_epi16 (__m256i src, __mmask16 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsw ymm {k}, ymm
CPUID Flags: AVX512BW + AVX512VL
Description
Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
Latency and Throughput
vpabsw
__m256i _mm256_maskz_abs_epi16 (__mmask16 k, __m256i a)
Synopsis
__m256i _mm256_maskz_abs_epi16 (__mmask16 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsw ymm {z}, ymm
CPUID Flags: AVX512BW + AVX512VL
Description
Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
Latency and Throughput
vpabsw
__m512i _mm512_abs_epi16 (__m512i a)
Synopsis
__m512i _mm512_abs_epi16 (__m512i a)
#include <immintrin.h>
Instruction: vpabsw zmm, zmm
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
dst[MAX:512] := 0
Latency and Throughput
vpabsw
__m512i _mm512_mask_abs_epi16 (__m512i src, __mmask32 k, __m512i a)
Synopsis
__m512i _mm512_mask_abs_epi16 (__m512i src, __mmask32 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsw zmm {k}, zmm
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
Latency and Throughput
vpabsw
__m512i _mm512_maskz_abs_epi16 (__mmask32 k, __m512i a)
Synopsis
__m512i _mm512_maskz_abs_epi16 (__mmask32 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsw zmm {z}, zmm
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
Latency and Throughput
pabsd
__m128i _mm_abs_epi32 (__m128i a)
Synopsis
__m128i _mm_abs_epi32 (__m128i a)
#include <tmmintrin.h>
Instruction: pabsd xmm, xmm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
Latency and Throughput
vpabsd
__m128i _mm_mask_abs_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_abs_epi32 (__m128i src, __mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsd xmm {k}, xmm
CPUID Flags: AVX512F + AVX512VL
Description
Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
Latency and Throughput
vpabsd
__m128i _mm_maskz_abs_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_abs_epi32 (__mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsd xmm {z}, xmm
CPUID Flags: AVX512F + AVX512VL
Description
Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
Latency and Throughput
vpabsd
__m256i _mm256_abs_epi32 (__m256i a)
Synopsis
__m256i _mm256_abs_epi32 (__m256i a)
#include <immintrin.h>
Instruction: vpabsd ymm, ymm
CPUID Flags: AVX2
Description
Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
Latency and Throughput
vpabsd
__m256i _mm256_mask_abs_epi32 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_abs_epi32 (__m256i src, __mmask8 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsd ymm {k}, ymm
CPUID Flags: AVX512F + AVX512VL
Description
Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
Latency and Throughput
vpabsd
__m256i _mm256_maskz_abs_epi32 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_abs_epi32 (__mmask8 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsd ymm {z}, ymm
CPUID Flags: AVX512F + AVX512VL
Description
Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
Latency and Throughput
vpabsd
__m512i _mm512_abs_epi32 (__m512i a)
Synopsis
__m512i _mm512_abs_epi32 (__m512i a)
#include <immintrin.h>
Instruction: vpabsd zmm, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
Latency and Throughput
vpabsd
__m512i _mm512_mask_abs_epi32 (__m512i src, __mmask16 k, __m512i a)
Synopsis
__m512i _mm512_mask_abs_epi32 (__m512i src, __mmask16 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
Latency and Throughput
vpabsd
__m512i _mm512_maskz_abs_epi32 (__mmask16 k, __m512i a)
Synopsis
__m512i _mm512_maskz_abs_epi32 (__mmask16 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsd zmm {z}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
Latency and Throughput
vpabsq
__m128i _mm_abs_epi64 (__m128i a)
Synopsis
__m128i _mm_abs_epi64 (__m128i a)
#include <immintrin.h>
Instruction: vpabsq xmm, xmm
CPUID Flags: AVX512F + AVX512VL
Description
Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
Latency and Throughput
vpabsq
__m128i _mm_mask_abs_epi64 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_abs_epi64 (__m128i src, __mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsq xmm {k}, xmm
CPUID Flags: AVX512F + AVX512VL
Description
Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
Latency and Throughput
vpabsq
__m128i _mm_maskz_abs_epi64 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_abs_epi64 (__mmask8 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsq xmm {z}, xmm
CPUID Flags: AVX512F + AVX512VL
Description
Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
Latency and Throughput
vpabsq
__m256i _mm256_abs_epi64 (__m256i a)
Synopsis
__m256i _mm256_abs_epi64 (__m256i a)
#include <immintrin.h>
Instruction: vpabsq ymm, ymm
CPUID Flags: AVX512F + AVX512VL
Description
Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
Latency and Throughput
vpabsq
__m256i _mm256_mask_abs_epi64 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_abs_epi64 (__m256i src, __mmask8 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsq ymm {k}, ymm
CPUID Flags: AVX512F + AVX512VL
Description
Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
Latency and Throughput
vpabsq
__m256i _mm256_maskz_abs_epi64 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_abs_epi64 (__mmask8 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsq ymm {z}, ymm
CPUID Flags: AVX512F + AVX512VL
Description
Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
Latency and Throughput
vpabsq
__m512i _mm512_abs_epi64 (__m512i a)
Synopsis
__m512i _mm512_abs_epi64 (__m512i a)
#include <immintrin.h>
Instruction: vpabsq zmm, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
Latency and Throughput
vpabsq
__m512i _mm512_mask_abs_epi64 (__m512i src, __mmask8 k, __m512i a)
Synopsis
__m512i _mm512_mask_abs_epi64 (__m512i src, __mmask8 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
Latency and Throughput
vpabsq
__m512i _mm512_maskz_abs_epi64 (__mmask8 k, __m512i a)
Synopsis
__m512i _mm512_maskz_abs_epi64 (__mmask8 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsq zmm {z}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
Latency and Throughput
pabsb
__m128i _mm_abs_epi8 (__m128i a)
Synopsis
__m128i _mm_abs_epi8 (__m128i a)
#include <tmmintrin.h>
Instruction: pabsb xmm, xmm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
Latency and Throughput
vpabsb
__m128i _mm_mask_abs_epi8 (__m128i src, __mmask16 k, __m128i a)
Synopsis
__m128i _mm_mask_abs_epi8 (__m128i src, __mmask16 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsb xmm {k}, xmm
CPUID Flags: AVX512BW + AVX512VL
Description
Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
Latency and Throughput
vpabsb
__m128i _mm_maskz_abs_epi8 (__mmask16 k, __m128i a)
Synopsis
__m128i _mm_maskz_abs_epi8 (__mmask16 k, __m128i a)
#include <immintrin.h>
Instruction: vpabsb xmm {z}, xmm
CPUID Flags: AVX512BW + AVX512VL
Description
Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
Latency and Throughput
vpabsb
__m256i _mm256_abs_epi8 (__m256i a)
Synopsis
__m256i _mm256_abs_epi8 (__m256i a)
#include <immintrin.h>
Instruction: vpabsb ymm, ymm
CPUID Flags: AVX2
Description
Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
dst[MAX:256] := 0
Latency and Throughput
vpabsb
__m256i _mm256_mask_abs_epi8 (__m256i src, __mmask32 k, __m256i a)
Synopsis
__m256i _mm256_mask_abs_epi8 (__m256i src, __mmask32 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsb ymm {k}, ymm
CPUID Flags: AVX512BW + AVX512VL
Description
Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
Latency and Throughput
vpabsb
__m256i _mm256_maskz_abs_epi8 (__mmask32 k, __m256i a)
Synopsis
__m256i _mm256_maskz_abs_epi8 (__mmask32 k, __m256i a)
#include <immintrin.h>
Instruction: vpabsb ymm {z}, ymm
CPUID Flags: AVX512BW + AVX512VL
Description
Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
Latency and Throughput
vpabsb
__m512i _mm512_abs_epi8 (__m512i a)
Synopsis
__m512i _mm512_abs_epi8 (__m512i a)
#include <immintrin.h>
Instruction: vpabsb zmm, zmm
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 63
i := j*8
dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
dst[MAX:512] := 0
Latency and Throughput
vpabsb
__m512i _mm512_mask_abs_epi8 (__m512i src, __mmask64 k, __m512i a)
Synopsis
__m512i _mm512_mask_abs_epi8 (__m512i src, __mmask64 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsb zmm {k}, zmm
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
Latency and Throughput
vpabsb
__m512i _mm512_maskz_abs_epi8 (__mmask64 k, __m512i a)
Synopsis
__m512i _mm512_maskz_abs_epi8 (__mmask64 k, __m512i a)
#include <immintrin.h>
Instruction: vpabsb zmm {z}, zmm
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
Latency and Throughput
vpandq
__m512d _mm512_abs_pd (__m512d v2)
Synopsis
__m512d _mm512_abs_pd (__m512d v2)
#include <immintrin.h>
Instruction: vpandq zmm, zmm, m512
CPUID Flags: AVX512F
Description
Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ABS(v2[i+63:i])
ENDFOR
dst[MAX:512] := 0
Latency and Throughput
vpandq
__m512d _mm512_mask_abs_pd (__m512d src, __mmask8 k, __m512d v2)
Synopsis
__m512d _mm512_mask_abs_pd (__m512d src, __mmask8 k, __m512d v2)
#include <immintrin.h>
Instruction: vpandq zmm {k}, zmm, m512
CPUID Flags: AVX512F
Description
Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ABS(v2[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
Latency and Throughput
...
__m128h _mm_abs_ph (__m128h v2)
Synopsis
__m128h _mm_abs_ph (__m128h v2)
#include <immintrin.h>
Instruction: Sequence
CPUID Flags: AVX512_FP16 + AVX512VL
Description
Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing the results in dst.
Operation
FOR j := 0 to 7
dst.fp16[j] := ABS(v2.fp16[j])
ENDFOR
dst[MAX:128] := 0
...
__m256h _mm256_abs_ph (__m256h v2)
Synopsis
__m256h _mm256_abs_ph (__m256h v2)
#include <immintrin.h>
Instruction: Sequence
CPUID Flags: AVX512_FP16 + AVX512VL
Description
Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing the results in dst.
Operation
FOR j := 0 to 15
dst.fp16[j] := ABS(v2.fp16[j])
ENDFOR
dst[MAX:256] := 0
...
__m512h _mm512_abs_ph (__m512h v2)
Synopsis
__m512h _mm512_abs_ph (__m512h v2)
#include <immintrin.h>
Instruction: Sequence
CPUID Flags: AVX512_FP16
Description
Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing the results in dst.
Operation
FOR j := 0 to 31
dst.fp16[j] := ABS(v2.fp16[j])
ENDFOR
dst[MAX:512] := 0
pabsw
__m64 _mm_abs_pi16 (__m64 a)
Synopsis
__m64 _mm_abs_pi16 (__m64 a)
#include <tmmintrin.h>
Instruction: pabsw mm, mm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 3
i := j*16
dst[i+15:i] := ABS(Int(a[i+15:i]))
ENDFOR
Latency and Throughput
pabsd
__m64 _mm_abs_pi32 (__m64 a)
Synopsis
__m64 _mm_abs_pi32 (__m64 a)
#include <tmmintrin.h>
Instruction: pabsd mm, mm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 1
i := j*32
dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
Latency and Throughput
pabsb
__m64 _mm_abs_pi8 (__m64 a)
Synopsis
__m64 _mm_abs_pi8 (__m64 a)
#include <tmmintrin.h>
Instruction: pabsb mm, mm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 7
i := j*8
dst[i+7:i] := ABS(Int(a[i+7:i]))
ENDFOR
Latency and Throughput
vpandd
__m512 _mm512_abs_ps (__m512 v2)
Synopsis
__m512 _mm512_abs_ps (__m512 v2)
#include <immintrin.h>
Instruction: vpandd zmm, zmm, m512
CPUID Flags: AVX512F
Description
Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ABS(v2[i+31:i])
ENDFOR
dst[MAX:512] := 0
Latency and Throughput