#define GGML_COMMON_IMPL_C #include "ggml-common.h" #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" #include "simd-mappings.h" #include "../../quants.h" #include "../../ggml-cpu-impl.h" #include #include #include #include // for qsort #include // for GGML_ASSERT #define GROUP_MAX_EPS 1e-15f #define GROUP_MAX_EPS_IQ3_XXS 1e-8f #define GROUP_MAX_EPS_IQ2_S 1e-8f #define GROUP_MAX_EPS_IQ1_M 1e-7f #define GROUP_MAX_EPS_IQ1_S 1e-12f #define UNUSED GGML_UNUSED // some compilers don't provide _mm256_set_m128i, e.g. gcc 7 #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) // multiply int8_t, add results pairwise twice static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { // Get absolute values of x vectors const __m128i ax = _mm_sign_epi8(x, x); // Sign the values of the y vectors const __m128i sy = _mm_sign_epi8(y, x); // Perform multiplication and create 16-bit values const __m128i dot = _mm_maddubs_epi16(ax, sy); const __m128i ones = _mm_set1_epi16(1); return _mm_madd_epi16(ones, dot); } #if __AVX__ || __AVX2__ || __AVX512F__ // horizontally add 8 floats static inline float hsum_float_8(const __m256 x) { __m128 res = _mm256_extractf128_ps(x, 1); res = _mm_add_ps(res, _mm256_castps256_ps128(x)); res = _mm_add_ps(res, _mm_movehl_ps(res, res)); res = _mm_add_ss(res, _mm_movehdup_ps(res)); return _mm_cvtss_f32(res); } // horizontally add 8 int32_t static inline int hsum_i32_8(const __m256i a) { const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); const __m128i sum64 = _mm_add_epi32(hi64, sum128); const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); } // horizontally add 4 int32_t static inline int hsum_i32_4(const __m128i a) { const __m128i hi64 = _mm_unpackhi_epi64(a, a); const __m128i sum64 = _mm_add_epi32(hi64, a); const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); } #if defined(__AVX2__) || defined(__AVX512F__) static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { const __m256i ax = _mm256_sign_epi8(x, x); const __m256i sy = _mm256_sign_epi8(y, x); return _mm256_maddubs_epi16(ax, sy); } // spread 32 bits to 32 bytes { 0x00, 0xFF } static inline __m256i bytes_from_bits_32(const uint8_t * x) { uint32_t x32; memcpy(&x32, x, sizeof(uint32_t)); const __m256i shuf_mask = _mm256_set_epi64x( 0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000); __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); bytes = _mm256_or_si256(bytes, bit_mask); return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); } // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); const __m256i lowMask = _mm256_set1_epi8( 0xF ); return _mm256_and_si256(lowMask, bytes); } // add int16_t pairwise and return as float vector static inline __m256 sum_i16_pairs_float(const __m256i x) { const __m256i ones = _mm256_set1_epi16(1); const __m256i summed_pairs = _mm256_madd_epi16(ones, x); return _mm256_cvtepi32_ps(summed_pairs); } static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { #if defined(__AVX512VNNI__) && defined(__AVX512VL__) const __m256i zero = _mm256_setzero_si256(); const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); return _mm256_cvtepi32_ps(summed_pairs); #elif defined(__AVXVNNI__) const __m256i zero = _mm256_setzero_si256(); const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy); return _mm256_cvtepi32_ps(summed_pairs); #else // Perform multiplication and create 16-bit values const __m256i dot = _mm256_maddubs_epi16(ax, sy); return sum_i16_pairs_float(dot); #endif } // multiply int8_t, add results pairwise twice and return as float vector static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { #if __AVXVNNIINT8__ const __m256i zero = _mm256_setzero_si256(); const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); return _mm256_cvtepi32_ps(summed_pairs); #else // Get absolute values of x vectors const __m256i ax = _mm256_sign_epi8(x, x); // Sign the values of the y vectors const __m256i sy = _mm256_sign_epi8(y, x); return mul_sum_us8_pairs_float(ax, sy); #endif } static inline __m128i packNibbles( __m256i bytes ) { // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh #if __AVX512F__ const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh return _mm256_cvtepi16_epi8(bytes); // abcd_efgh #else const __m256i lowByte = _mm256_set1_epi16( 0xFF ); __m256i high = _mm256_andnot_si256( lowByte, bytes ); __m256i low = _mm256_and_si256( lowByte, bytes ); high = _mm256_srli_epi16( high, 4 ); bytes = _mm256_or_si256( low, high ); // Compress uint16_t lanes into bytes __m128i r0 = _mm256_castsi256_si128( bytes ); __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); return _mm_packus_epi16( r0, r1 ); #endif } #elif defined(__AVX__) static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) { // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh const __m128i lowByte = _mm_set1_epi16( 0xFF ); __m128i high = _mm_andnot_si128( lowByte, bytes1 ); __m128i low = _mm_and_si128( lowByte, bytes1 ); high = _mm_srli_epi16( high, 4 ); bytes1 = _mm_or_si128( low, high ); high = _mm_andnot_si128( lowByte, bytes2 ); low = _mm_and_si128( lowByte, bytes2 ); high = _mm_srli_epi16( high, 4 ); bytes2 = _mm_or_si128( low, high ); return _mm_packus_epi16( bytes1, bytes2); } static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { const __m128i ax = _mm_sign_epi8(x, x); const __m128i sy = _mm_sign_epi8(y, x); return _mm_maddubs_epi16(ax, sy); } // spread 32 bits to 32 bytes { 0x00, 0xFF } static inline __m256i bytes_from_bits_32(const uint8_t * x) { uint32_t x32; memcpy(&x32, x, sizeof(uint32_t)); const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl); __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh); const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); bytesl = _mm_or_si128(bytesl, bit_mask); bytesh = _mm_or_si128(bytesh, bit_mask); bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1)); return MM256_SET_M128I(bytesh, bytesl); } // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { // Load 16 bytes from memory __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi); __m128i tmph = _mm_srli_epi16(tmpl, 4); const __m128i lowMask = _mm_set1_epi8(0xF); tmpl = _mm_and_si128(lowMask, tmpl); tmph = _mm_and_si128(lowMask, tmph); return MM256_SET_M128I(tmph, tmpl); } // add int16_t pairwise and return as float vector static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { const __m128i ones = _mm_set1_epi16(1); const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); const __m128i summed_pairsh = _mm_madd_epi16(ones, xh); const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl); return _mm256_cvtepi32_ps(summed_pairs); } static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { const __m128i axl = _mm256_castsi256_si128(ax); const __m128i axh = _mm256_extractf128_si256(ax, 1); const __m128i syl = _mm256_castsi256_si128(sy); const __m128i syh = _mm256_extractf128_si256(sy, 1); // Perform multiplication and create 16-bit values const __m128i dotl = _mm_maddubs_epi16(axl, syl); const __m128i doth = _mm_maddubs_epi16(axh, syh); return sum_i16_pairs_float(doth, dotl); } // multiply int8_t, add results pairwise twice and return as float vector static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { const __m128i xl = _mm256_castsi256_si128(x); const __m128i xh = _mm256_extractf128_si256(x, 1); const __m128i yl = _mm256_castsi256_si128(y); const __m128i yh = _mm256_extractf128_si256(y, 1); // Get absolute values of x vectors const __m128i axl = _mm_sign_epi8(xl, xl); const __m128i axh = _mm_sign_epi8(xh, xh); // Sign the values of the y vectors const __m128i syl = _mm_sign_epi8(yl, xl); const __m128i syh = _mm_sign_epi8(yh, xh); // Perform multiplication and create 16-bit values const __m128i dotl = _mm_maddubs_epi16(axl, syl); const __m128i doth = _mm_maddubs_epi16(axh, syh); return sum_i16_pairs_float(doth, dotl); } // larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1, const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) { const __m128i mone = _mm_set1_epi16(1); const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0); const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1); const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0); const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1); const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone); const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone); const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone); const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone); const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1); const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1); return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1)); } // quad fp16 delta calculation static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) { // GGML_CPU_FP16_TO_FP32 is faster than Intel F16C return _mm256_set_m128(_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1)), _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0))); } static inline __m256 quad_mx_delta_float(const int8_t x0, const float y0, const int8_t x1, const float y1) { return _mm256_set_m128(_mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x1) * GGML_CPU_FP16_TO_FP32(y1)), _mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x0) * GGML_CPU_FP16_TO_FP32(y0))); } #endif #elif defined(__SSSE3__) // horizontally add 4x4 floats static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { __m128 res_0 =_mm_hadd_ps(a, b); __m128 res_1 =_mm_hadd_ps(c, d); __m128 res =_mm_hadd_ps(res_0, res_1); res =_mm_hadd_ps(res, res); res =_mm_hadd_ps(res, res); return _mm_cvtss_f32(res); } #endif // __AVX__ || __AVX2__ || __AVX512F__ #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; block_q8_0 * GGML_RESTRICT y = vy; #if defined(__AVX2__) || defined(__AVX__) for (int i = 0; i < nb; i++) { // Load elements into 4 AVX vectors __m256 v0 = _mm256_loadu_ps( x ); __m256 v1 = _mm256_loadu_ps( x + 8 ); __m256 v2 = _mm256_loadu_ps( x + 16 ); __m256 v3 = _mm256_loadu_ps( x + 24 ); x += 32; // Compute max(abs(e)) for the block const __m256 signBit = _mm256_set1_ps( -0.0f ); __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); const float maxScalar = _mm_cvtss_f32( max4 ); // Quantize these floats const float d = maxScalar / 127.f; y[i].d = GGML_CPU_FP32_TO_FP16(d); const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; const __m256 mul = _mm256_set1_ps( id ); // Apply the multiplier v0 = _mm256_mul_ps( v0, mul ); v1 = _mm256_mul_ps( v1, mul ); v2 = _mm256_mul_ps( v2, mul ); v3 = _mm256_mul_ps( v3, mul ); // Round to nearest integer v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); // Convert floats to integers __m256i i0 = _mm256_cvtps_epi32( v0 ); __m256i i1 = _mm256_cvtps_epi32( v1 ); __m256i i2 = _mm256_cvtps_epi32( v2 ); __m256i i3 = _mm256_cvtps_epi32( v3 ); #if defined(__AVX2__) // Convert int32 to int16 i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 // Convert int16 to int8 i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 // We got our precious signed bytes, but the order is now wrong // These AVX2 pack instructions process 16-byte pieces independently // The following instruction is fixing the order const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); i0 = _mm256_permutevar8x32_epi32( i0, perm ); _mm256_storeu_si256((__m256i *)y[i].qs, i0); #else // Since we don't have in AVX some necessary functions, // we split the registers in half and call AVX2 analogs from SSE __m128i ni0 = _mm256_castsi256_si128( i0 ); __m128i ni1 = _mm256_extractf128_si256( i0, 1); __m128i ni2 = _mm256_castsi256_si128( i1 ); __m128i ni3 = _mm256_extractf128_si256( i1, 1); __m128i ni4 = _mm256_castsi256_si128( i2 ); __m128i ni5 = _mm256_extractf128_si256( i2, 1); __m128i ni6 = _mm256_castsi256_si128( i3 ); __m128i ni7 = _mm256_extractf128_si256( i3, 1); // Convert int32 to int16 ni0 = _mm_packs_epi32( ni0, ni1 ); ni2 = _mm_packs_epi32( ni2, ni3 ); ni4 = _mm_packs_epi32( ni4, ni5 ); ni6 = _mm_packs_epi32( ni6, ni7 ); // Convert int16 to int8 ni0 = _mm_packs_epi16( ni0, ni2 ); ni4 = _mm_packs_epi16( ni4, ni6 ); _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); #endif } #else GGML_UNUSED(nb); // scalar quantize_row_q8_0_ref(x, y, k); #endif } void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(k % QK8_1 == 0); const int nb = k / QK8_1; block_q8_1 * GGML_RESTRICT y = vy; #if defined(__AVX2__) || defined(__AVX__) for (int i = 0; i < nb; i++) { // Load elements into 4 AVX vectors __m256 v0 = _mm256_loadu_ps( x ); __m256 v1 = _mm256_loadu_ps( x + 8 ); __m256 v2 = _mm256_loadu_ps( x + 16 ); __m256 v3 = _mm256_loadu_ps( x + 24 ); x += 32; // Compute max(abs(e)) for the block const __m256 signBit = _mm256_set1_ps( -0.0f ); __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); const float max_scalar = _mm_cvtss_f32( max4 ); // Quantize these floats const float d = max_scalar / 127.f; y[i].d = GGML_CPU_FP32_TO_FP16(d); const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f; const __m256 mul = _mm256_set1_ps( id ); // Apply the multiplier v0 = _mm256_mul_ps( v0, mul ); v1 = _mm256_mul_ps( v1, mul ); v2 = _mm256_mul_ps( v2, mul ); v3 = _mm256_mul_ps( v3, mul ); // Round to nearest integer v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); // Convert floats to integers __m256i i0 = _mm256_cvtps_epi32( v0 ); __m256i i1 = _mm256_cvtps_epi32( v1 ); __m256i i2 = _mm256_cvtps_epi32( v2 ); __m256i i3 = _mm256_cvtps_epi32( v3 ); #if defined(__AVX2__) // Compute the sum of the quants and set y[i].s y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); // Convert int32 to int16 i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 // Convert int16 to int8 i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 // We got our precious signed bytes, but the order is now wrong // These AVX2 pack instructions process 16-byte pieces independently // The following instruction is fixing the order const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); i0 = _mm256_permutevar8x32_epi32( i0, perm ); _mm256_storeu_si256((__m256i *)y[i].qs, i0); #else // Since we don't have in AVX some necessary functions, // we split the registers in half and call AVX2 analogs from SSE __m128i ni0 = _mm256_castsi256_si128( i0 ); __m128i ni1 = _mm256_extractf128_si256( i0, 1); __m128i ni2 = _mm256_castsi256_si128( i1 ); __m128i ni3 = _mm256_extractf128_si256( i1, 1); __m128i ni4 = _mm256_castsi256_si128( i2 ); __m128i ni5 = _mm256_extractf128_si256( i2, 1); __m128i ni6 = _mm256_castsi256_si128( i3 ); __m128i ni7 = _mm256_extractf128_si256( i3, 1); // Compute the sum of the quants and set y[i].s const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1))); // Convert int32 to int16 ni0 = _mm_packs_epi32( ni0, ni1 ); ni2 = _mm_packs_epi32( ni2, ni3 ); ni4 = _mm_packs_epi32( ni4, ni5 ); ni6 = _mm_packs_epi32( ni6, ni7 ); // Convert int16 to int8 ni0 = _mm_packs_epi16( ni0, ni2 ); ni4 = _mm_packs_epi16( ni4, ni6 ); _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); #endif } #else GGML_UNUSED(nb); // scalar quantize_row_q8_1_ref(x, y, k); #endif } // placeholder implementation for Apple targets void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { quantize_row_q8_K_ref(x, y, k); } //===================================== Dot products ================================= // // Helper functions // #if __AVX__ || __AVX2__ || __AVX512F__ // shuffles to pick the required scales in dot products static inline __m256i get_scale_shuffle_q3k(int i) { static const uint8_t k_shuffle[128] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, }; return _mm256_loadu_si256((const __m256i*)k_shuffle + i); } static inline __m256i get_scale_shuffle_k4(int i) { static const uint8_t k_shuffle[256] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 }; return _mm256_loadu_si256((const __m256i*)k_shuffle + i); } static inline __m128i get_scale_shuffle(int i) { static const uint8_t k_shuffle[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11, 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13, 14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15 }; return _mm_loadu_si128((const __m128i*)k_shuffle + i); } #endif void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_q4_0 * GGML_RESTRICT x = vx; const block_q8_0 * GGML_RESTRICT y = vy; int ib = 0; float sumf = 0; #if defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); // Main loop for (; ib < nb; ++ib) { /* Compute combined scale for the block */ const __m256 d = _mm256_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); __m256i qx = bytes_from_nibbles_32(x[ib].qs); // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. const __m256i off = _mm256_set1_epi8( 8 ); qx = _mm256_sub_epi8( qx, off ); __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); const __m256 q = mul_sum_i8_pairs_float(qx, qy); /* Multiply q with scale and accumulate */ acc = _mm256_fmadd_ps( d, q, acc ); } sumf = hsum_float_8(acc); #elif defined(__AVX__) __m256 accum = _mm256_setzero_ps(); for (; ib + 1 < nb; ib += 2) { const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8)); const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8)); const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8)); const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8)); const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1); const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1); const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1); const __m256 p = sum_i16_pairs_float(p_2, p_1); const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); } sumf = hsum_float_8(accum); #elif defined(__SSSE3__) // set constants const __m128i lowMask = _mm_set1_epi8(0xF); const __m128i off = _mm_set1_epi8(8); // Initialize accumulator with zeros __m128 acc_0 = _mm_setzero_ps(); __m128 acc_1 = _mm_setzero_ps(); __m128 acc_2 = _mm_setzero_ps(); __m128 acc_3 = _mm_setzero_ps(); for (; ib + 1 < nb; ib += 2) { _mm_prefetch(&x[ib] + sizeof(block_q4_0), _MM_HINT_T0); _mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 0 and 1 const __m128 d_0_1 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs); __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs); bx_0 = _mm_sub_epi8(bx_0, off); const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16)); bx_1 = _mm_sub_epi8(bx_1, off); const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); _mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0); _mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 2 and 3 const __m128 d_2_3 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) ); const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); __m128i by_2 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); bx_2 = _mm_sub_epi8(bx_2, off); const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[ib + 1].qs + 16)); bx_3 = _mm_sub_epi8(bx_3, off); const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); // Convert int32_t to float __m128 p0 = _mm_cvtepi32_ps(i32_0); __m128 p1 = _mm_cvtepi32_ps(i32_1); __m128 p2 = _mm_cvtepi32_ps(i32_2); __m128 p3 = _mm_cvtepi32_ps(i32_3); // Apply the scale __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); // Acummulate acc_0 = _mm_add_ps(p0_d, acc_0); acc_1 = _mm_add_ps(p1_d, acc_1); acc_2 = _mm_add_ps(p2_d, acc_2); acc_3 = _mm_add_ps(p3_d, acc_3); } sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); #endif for (; ib < nb; ++ib) { int sumi0 = 0; int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const int v0 = (x[ib].qs[j] & 0x0F) - 8; const int v1 = (x[ib].qs[j] >> 4) - 8; sumi0 += (v0 * y[ib].qs[j]); sumi1 += (v1 * y[ib].qs[j + qk/2]); } int sumi = sumi0 + sumi1; sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; } void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_q4_1 * GGML_RESTRICT x = vx; const block_q8_1 * GGML_RESTRICT y = vy; int ib = 0; #if defined(__AVX2__) || defined(__AVX__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); float summs = 0; // Main loop for (; ib < nb; ++ib) { const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d); summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); const __m256 d0v = _mm256_set1_ps( d0 ); const __m256 d1v = _mm256_set1_ps( d1 ); // Compute combined scales const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes const __m256i qx = bytes_from_nibbles_32(x[ib].qs); const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[ib].qs ); const __m256 xy = mul_sum_us8_pairs_float(qx, qy); // Accumulate d0*d1*x*y #if defined(__AVX2__) acc = _mm256_fmadd_ps( d0d1, xy, acc ); #else acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); #endif } *s = hsum_float_8(acc) + summs; #else UNUSED(nb); UNUSED(x); UNUSED(y); UNUSED(ib); ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); assert(n % QK_MXFP4 == 0); static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same"); const block_mxfp4 * GGML_RESTRICT x = vx; const block_q8_0 * GGML_RESTRICT y = vy; const int nb = n / QK_MXFP4; int ib = 0; float sumf = 0; #if defined __AVX2__ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4); const __m128i m4b = _mm_set1_epi8(0x0f); const __m256i mone = _mm256_set1_epi16(1); __m256 accum1 = _mm256_setzero_ps(); __m256 accum2 = _mm256_setzero_ps(); for (; ib + 1 < nb; ib += 2) { const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs); const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs); const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs); const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs); const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); const __m256i p_1 = _mm256_madd_epi16(p16_1, mone); const __m256i p_2 = _mm256_madd_epi16(p16_2, mone); accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 0].e)), _mm256_cvtepi32_ps(p_1), accum1); accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 1].e)), _mm256_cvtepi32_ps(p_2), accum2); } sumf = hsum_float_8(_mm256_add_ps(accum1, accum2)); #elif defined __AVX__ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4); const __m128i m4b = _mm_set1_epi8(0x0f); __m256 accum = _mm256_setzero_ps(); for (; ib + 1 < nb; ib += 2) { const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)); const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)); const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)); const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)); const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1); const __m256 deltas = quad_mx_delta_float(x[ib].e, y[ib].d, x[ib + 1].e, y[ib + 1].d); accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); } sumf = hsum_float_8(accum); #endif for (; ib < nb; ++ib) { const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e); int sumi1 = 0; int sumi2 = 0; for (int j = 0; j < QK_MXFP4/2; ++j) { sumi1 += y[ib].qs[j + 0] * kvalues_mxfp4[x[ib].qs[j] & 0xf]; sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4]; } sumf += d * (sumi1 + sumi2); } *s = sumf; } void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; int ib = 0; assert(n % qk == 0); assert(qk == QK5_0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_q5_0 * GGML_RESTRICT x = vx; const block_q8_0 * GGML_RESTRICT y = vy; #if defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); // Main loop for (; ib < nb; ++ib) { /* Compute combined scale for the block */ const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); __m256i qx = bytes_from_nibbles_32(x[ib].qs); __m256i bxhi = bytes_from_bits_32(x[ib].qh); bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); qx = _mm256_or_si256(qx, bxhi); __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); const __m256 q = mul_sum_i8_pairs_float(qx, qy); /* Multiply q with scale and accumulate */ acc = _mm256_fmadd_ps(d, q, acc); } *s = hsum_float_8(acc); #elif defined(__AVX__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); __m128i mask = _mm_set1_epi8((char)0xF0); // Main loop for (; ib < nb; ++ib) { /* Compute combined scale for the block */ const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs); const __m256i bxhi = bytes_from_bits_32(x[ib].qh); __m128i bxhil = _mm256_castsi256_si128(bxhi); __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); bxhil = _mm_andnot_si128(bxhil, mask); bxhih = _mm_andnot_si128(bxhih, mask); __m128i bxl = _mm256_castsi256_si128(bx_0); __m128i bxh = _mm256_extractf128_si256(bx_0, 1); bxl = _mm_or_si128(bxl, bxhil); bxh = _mm_or_si128(bxh, bxhih); bx_0 = MM256_SET_M128I(bxh, bxl); const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs); const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0); /* Multiply q with scale and accumulate */ acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); } *s = hsum_float_8(acc); #else UNUSED(nb); UNUSED(ib); UNUSED(x); UNUSED(y); ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_1; const int nb = n / qk; int ib = 0; assert(n % qk == 0); assert(qk == QK5_1); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_q5_1 * GGML_RESTRICT x = vx; const block_q8_1 * GGML_RESTRICT y = vy; #if defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); float summs = 0.0f; // Main loop for (; ib < nb; ++ib) { const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d)); summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); __m256i qx = bytes_from_nibbles_32(x[ib].qs); __m256i bxhi = bytes_from_bits_32(x[ib].qh); bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); qx = _mm256_or_si256(qx, bxhi); const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d)); const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); const __m256 q = mul_sum_us8_pairs_float(qx, qy); acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); } *s = hsum_float_8(acc) + summs; #elif defined(__AVX__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); __m128i mask = _mm_set1_epi8(0x10); float summs = 0.0f; // Main loop for (; ib < nb; ++ib) { const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d)); summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs); const __m256i bxhi = bytes_from_bits_32(x[ib].qh); __m128i bxhil = _mm256_castsi256_si128(bxhi); __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); bxhil = _mm_and_si128(bxhil, mask); bxhih = _mm_and_si128(bxhih, mask); __m128i bxl = _mm256_castsi256_si128(bx_0); __m128i bxh = _mm256_extractf128_si256(bx_0, 1); bxl = _mm_or_si128(bxl, bxhil); bxh = _mm_or_si128(bxh, bxhih); bx_0 = MM256_SET_M128I(bxh, bxl); const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d)); const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs); const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0); acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); } *s = hsum_float_8(acc) + summs; #else UNUSED(nb); UNUSED(ib); UNUSED(x); UNUSED(y); ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_q8_0 * GGML_RESTRICT x = vx; const block_q8_0 * GGML_RESTRICT y = vy; int ib = 0; float sumf = 0; #if defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); // Main loop for (; ib < nb; ++ib) { // Compute combined scale for the block const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); __m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs); __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); const __m256 q = mul_sum_i8_pairs_float(qx, qy); // Multiply q with scale and accumulate acc = _mm256_fmadd_ps( d, q, acc ); } sumf = hsum_float_8(acc); #elif defined(__AVX__) __m256 accum = _mm256_setzero_ps(); for (; ib + 1 < nb; ib += 2) { const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs); const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1); const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1); const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs); const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1); const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1); const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); } sumf = hsum_float_8(accum); #endif for (; ib < nb; ++ib) { int sumi = 0; for (int j = 0; j < qk; j++) { sumi += x[ib].qs[j]*y[ib].qs[j]; } sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; } void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_tq1_0 * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; #if defined(__AVX2__) __m256 sumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { // 16-bit sums __m256i sumi0 = _mm256_setzero_si256(); __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); // first 32 bytes of 5 elements { __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs)); // 8-bit multiplies with shifts, masks and adds __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3 __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9 __m256i qx3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx1, 3), _mm256_set1_epi8(-8)), qx1); // 3 * 9 __m256i qx4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx2, 3), _mm256_set1_epi8(-8)), qx2); // 9 * 9 // TODO: can _mm256_mulhi_epu16 be faster even if 16-bits? // Cancel the +1 from avg so that it behaves like a halving add qx0 = _mm256_subs_epu8(qx0, _mm256_set1_epi8(1)); qx1 = _mm256_subs_epu8(qx1, _mm256_set1_epi8(1)); qx2 = _mm256_subs_epu8(qx2, _mm256_set1_epi8(1)); qx3 = _mm256_subs_epu8(qx3, _mm256_set1_epi8(1)); qx4 = _mm256_subs_epu8(qx4, _mm256_set1_epi8(1)); // Multiply by 3 and get the top 2 bits qx0 = _mm256_avg_epu8(qx0, _mm256_avg_epu8(qx0, _mm256_setzero_si256())); qx1 = _mm256_avg_epu8(qx1, _mm256_avg_epu8(qx1, _mm256_setzero_si256())); qx2 = _mm256_avg_epu8(qx2, _mm256_avg_epu8(qx2, _mm256_setzero_si256())); qx3 = _mm256_avg_epu8(qx3, _mm256_avg_epu8(qx3, _mm256_setzero_si256())); qx4 = _mm256_avg_epu8(qx4, _mm256_avg_epu8(qx4, _mm256_setzero_si256())); qx0 = _mm256_and_si256(_mm256_srli_epi16(qx0, 6), _mm256_set1_epi8(3)); qx1 = _mm256_and_si256(_mm256_srli_epi16(qx1, 6), _mm256_set1_epi8(3)); qx2 = _mm256_and_si256(_mm256_srli_epi16(qx2, 6), _mm256_set1_epi8(3)); qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3)); qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3)); const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 0)); const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 32)); const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 64)); const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 96)); const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128)); qx0 = _mm256_maddubs_epi16(qx0, qy0); qx1 = _mm256_maddubs_epi16(qx1, qy1); qx2 = _mm256_maddubs_epi16(qx2, qy2); qx3 = _mm256_maddubs_epi16(qx3, qy3); qx4 = _mm256_maddubs_epi16(qx4, qy4); sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); sumi2 = _mm256_add_epi16(sumi2, qx4); } // last 16 bytes of 5-element, along with the 4 bytes of 4 elements { __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].qs + 32)); uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh)); __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3 __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9 __m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9 __m128i qx4 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx2, 3), _mm_set1_epi8(-8)), qx2); // 9 * 9 __m256i qx01 = MM256_SET_M128I(qx1, qx0); __m256i qx23 = MM256_SET_M128I(qx3, qx2); // avx2 does not have 8-bit multiplies, so 16-bit it is. qx5_l = _mm256_mullo_epi16(qx5_l, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1)); qx5_l = _mm256_and_si256(qx5_l, _mm256_set1_epi16(0xFF)); __m128i qx5 = _mm_packus_epi16(_mm256_castsi256_si128(qx5_l), _mm256_extracti128_si256(qx5_l, 1)); __m256i qx45 = MM256_SET_M128I(qx5, qx4); // Cancel the +1 from avg so that it behaves like a halving add qx01 = _mm256_subs_epu8(qx01, _mm256_set1_epi8(1)); qx23 = _mm256_subs_epu8(qx23, _mm256_set1_epi8(1)); qx45 = _mm256_subs_epu8(qx45, _mm256_set1_epi8(1)); // Multiply by 3 and get the top 2 bits qx01 = _mm256_avg_epu8(qx01, _mm256_avg_epu8(qx01, _mm256_setzero_si256())); qx23 = _mm256_avg_epu8(qx23, _mm256_avg_epu8(qx23, _mm256_setzero_si256())); qx45 = _mm256_avg_epu8(qx45, _mm256_avg_epu8(qx45, _mm256_setzero_si256())); qx01 = _mm256_and_si256(_mm256_srli_epi16(qx01, 6), _mm256_set1_epi8(3)); qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3)); qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3)); const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160)); const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192)); const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224)); qx01 = _mm256_maddubs_epi16(qx01, qy01); qx23 = _mm256_maddubs_epi16(qx23, qy23); qx45 = _mm256_maddubs_epi16(qx45, qy45); sumi0 = _mm256_add_epi16(sumi0, qx01); sumi1 = _mm256_add_epi16(sumi1, qx23); sumi2 = _mm256_add_epi16(sumi2, qx45); } const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d)); sumi0 = _mm256_sub_epi16(sumi0, ysum); sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2)); sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf); } *s = hsum_float_8(sumf); #else UNUSED(x); UNUSED(y); UNUSED(nb); ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_tq2_0 * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; #if defined(__AVX2__) __m256 sumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { // 16-bit sums, because 256*127 still fits __m256i sumi0 = _mm256_setzero_si256(); __m256i sumi1 = _mm256_setzero_si256(); for (size_t j = 0; j < sizeof(x->qs); j += 32) { __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs + j)); __m256i qx1 = _mm256_srli_epi16(qx0, 2); __m256i qx2 = _mm256_srli_epi16(qx0, 4); __m256i qx3 = _mm256_srli_epi16(qx0, 6); // 0, 1, 2 (should not be 3) qx0 = _mm256_and_si256(qx0, _mm256_set1_epi8(3)); qx1 = _mm256_and_si256(qx1, _mm256_set1_epi8(3)); qx2 = _mm256_and_si256(qx2, _mm256_set1_epi8(3)); qx3 = _mm256_and_si256(qx3, _mm256_set1_epi8(3)); const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 0)); const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32)); const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64)); const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96)); qx0 = _mm256_maddubs_epi16(qx0, qy0); qx1 = _mm256_maddubs_epi16(qx1, qy1); qx2 = _mm256_maddubs_epi16(qx2, qy2); qx3 = _mm256_maddubs_epi16(qx3, qy3); sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1)); sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3)); } const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d)); sumi0 = _mm256_add_epi16(sumi0, sumi1); sumi0 = _mm256_sub_epi16(sumi0, ysum); sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1)); sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf); } *s = hsum_float_8(sumf); #else UNUSED(x); UNUSED(y); UNUSED(nb); ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_q2_K * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; #if defined __AVX2__ const __m256i m3 = _mm256_set1_epi8(3); const __m128i m4 = _mm_set1_epi8(0xF); __m256 acc = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); const __m256i mins = _mm256_cvtepi8_epi16(mins8); const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums)); acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc); const __m256i all_scales = _mm256_cvtepi8_epi16(scales8); const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; __m256i sumi = _mm256_setzero_si256(); for (int j = 0; j < QK_K/128; ++j) { const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32; const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i q2_0 = _mm256_and_si256(q2bits, m3); const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3); const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3); const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3); __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0); __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1); __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2); __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3); p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0); p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1); p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2); p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3); p0 = _mm256_add_epi32(p0, p1); p2 = _mm256_add_epi32(p2, p3); sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2)); } acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); } *s = hsum_float_8(acc); #elif defined __AVX__ const __m128i m3 = _mm_set1_epi8(0x3); const __m128i m4 = _mm_set1_epi8(0xF); const __m128i m2 = _mm_set1_epi8(0x2); __m256 acc = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; // load mins and scales from block_q2_K.scales[QK_K/16] const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); const __m128i scales16 = _mm_and_si128(mins_and_scales, m4); const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); const __m128i mins_0 = _mm_cvtepi8_epi16(mins16); const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16)); // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2 const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0])); const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8])); // sumf += -dmin * summs in 32bits*8 acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc); const __m128i scales_0 = _mm_cvtepi8_epi16(scales16); const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16)); const __m128i scales[2] = { scales_0, scales_1 }; __m128i sumi_0 = _mm_setzero_si128(); __m128i sumi_1 = _mm_setzero_si128(); for (int j = 0; j < QK_K/128; ++j) { // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K] const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; // load 2bits*16*8 from block_q2_K.qs[QK_K/4] __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; const __m128i q2_0 = _mm_and_si128(q2bits, m3); const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; const __m128i q2_1 = _mm_and_si128(q2bits, m3); const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8 __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0); __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1); __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2); __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3); __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4); __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5); __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6); __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7); // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8 __m128i shuffle = _mm_set1_epi16(0x0100); p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0); shuffle = _mm_add_epi16(shuffle, m2); p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1); shuffle = _mm_add_epi16(shuffle, m2); p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2); shuffle = _mm_add_epi16(shuffle, m2); p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3); shuffle = _mm_add_epi16(shuffle, m2); p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4); shuffle = _mm_add_epi16(shuffle, m2); p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5); shuffle = _mm_add_epi16(shuffle, m2); p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6); shuffle = _mm_add_epi16(shuffle, m2); p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7); p0 = _mm_add_epi32(p0, p1); p2 = _mm_add_epi32(p2, p3); p4 = _mm_add_epi32(p4, p5); p6 = _mm_add_epi32(p6, p7); // isum in 32bits*4*2 sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2)); sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6)); } // sumf += dall * isum - dmin * summs in 32bits __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc); } *s = hsum_float_8(acc); #else UNUSED(x); UNUSED(y); UNUSED(nb); ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; const block_q3_K * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; #if defined __AVX2__ const __m256i m3 = _mm256_set1_epi8(3); const __m256i mone = _mm256_set1_epi8(1); const __m128i m32 = _mm_set1_epi8(32); __m256 acc = _mm256_setzero_ps(); uint32_t aux[3]; for (int i = 0; i < nb; ++i) { const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q3 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales memcpy(aux, x[i].scales, 12); __m128i scales128 = _mm_set_epi32( ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); scales128 = _mm_sub_epi8(scales128, m32); const __m256i all_scales = _mm256_cvtepi8_epi16(scales128); const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; // high bit const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask); // integer accumulator __m256i sumi = _mm256_setzero_si256(); int bit = 0; int is = 0; for (int j = 0; j < QK_K/128; ++j) { // load low 2 bits const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32; // prepare low and high bits const __m256i q3l_0 = _mm256_and_si256(q3bits, m3); const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); ++bit; const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3); const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); ++bit; const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3); const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); ++bit; const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3); const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); ++bit; // load Q8 quants const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, // and 2 if the high bit was set) __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0); __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1); __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2); __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3); __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0); __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1); __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2); __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3); p16_0 = _mm256_sub_epi16(p16_0, q8s_0); p16_1 = _mm256_sub_epi16(p16_1, q8s_1); p16_2 = _mm256_sub_epi16(p16_2, q8s_2); p16_3 = _mm256_sub_epi16(p16_3, q8s_3); // multiply with scales p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0); p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1); p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2); p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3); // accumulate p16_0 = _mm256_add_epi32(p16_0, p16_1); p16_2 = _mm256_add_epi32(p16_2, p16_3); sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2)); } // multiply with block scale and accumulate acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); } *s = hsum_float_8(acc); #elif defined __AVX__ const __m128i m3 = _mm_set1_epi8(3); const __m128i mone = _mm_set1_epi8(1); const __m128i m32 = _mm_set1_epi8(32); const __m128i m2 = _mm_set1_epi8(2); __m256 acc = _mm256_setzero_ps(); const uint32_t *aux; for (int i = 0; i < nb; ++i) { const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q3 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales aux = (const uint32_t *)x[i].scales; __m128i scales128 = _mm_set_epi32( ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); scales128 = _mm_sub_epi8(scales128, m32); const __m128i scales_0 = _mm_cvtepi8_epi16(scales128); const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128)); const __m128i scales[2] = { scales_0, scales_1 }; // high bit *128*2 from block_q3_K.hmask[QK_K/8] const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]); const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]); // integer accumulator __m128i sumi_0 = _mm_setzero_si128(); __m128i sumi_1 = _mm_setzero_si128(); for (int j = 0; j < QK_K/128; ++j) { // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4] const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; // prepare low and high bits const int bit = j << 2; const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3); const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3); const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2); const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2); const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3); const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3); const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2); const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2); const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3); const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3); const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2); const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2); const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3); const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3); const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2); const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2); // load Q8 quants from block_q8_K.qs[QK_K] const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, // and 2 if the high bit was set) __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0); __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1); __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2); __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3); __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4); __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5); __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6); __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7); __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0); __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1); __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2); __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3); __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4); __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5); __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6); __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7); p16_0 = _mm_sub_epi16(p16_0, q8s_0); p16_1 = _mm_sub_epi16(p16_1, q8s_1); p16_2 = _mm_sub_epi16(p16_2, q8s_2); p16_3 = _mm_sub_epi16(p16_3, q8s_3); p16_4 = _mm_sub_epi16(p16_4, q8s_4); p16_5 = _mm_sub_epi16(p16_5, q8s_5); p16_6 = _mm_sub_epi16(p16_6, q8s_6); p16_7 = _mm_sub_epi16(p16_7, q8s_7); // multiply with scales __m128i shuffle = _mm_set1_epi16(0x0100); p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0); shuffle = _mm_add_epi16(shuffle, m2); p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1); shuffle = _mm_add_epi16(shuffle, m2); p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2); shuffle = _mm_add_epi16(shuffle, m2); p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3); shuffle = _mm_add_epi16(shuffle, m2); p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4); shuffle = _mm_add_epi16(shuffle, m2); p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5); shuffle = _mm_add_epi16(shuffle, m2); p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6); shuffle = _mm_add_epi16(shuffle, m2); p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7); // accumulate p16_0 = _mm_add_epi32(p16_0, p16_1); p16_2 = _mm_add_epi32(p16_2, p16_3); p16_4 = _mm_add_epi32(p16_4, p16_5); p16_6 = _mm_add_epi32(p16_6, p16_7); sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6)); } // multiply with block scale and accumulate __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); } *s = hsum_float_8(acc); #else UNUSED(kmask1); UNUSED(kmask2); UNUSED(x); UNUSED(y); UNUSED(nb); ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_q4_K * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; static const uint32_t kmask1 = 0x3f3f3f3f; static const uint32_t kmask2 = 0x0f0f0f0f; static const uint32_t kmask3 = 0x03030303; uint32_t utmp[4]; #if defined __AVX2__ const __m256i m4 = _mm256_set1_epi8(0xF); __m256 acc = _mm256_setzero_ps(); __m128 acc_m = _mm_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); const uint32_t uaux = utmp[1] & kmask1; utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); utmp[2] = uaux; utmp[0] &= kmask1; const uint8_t * GGML_RESTRICT q4 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums); const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s); acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m); const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0); const __m256i scales = MM256_SET_M128I(sc128, sc128); __m256i sumi = _mm256_setzero_si256(); for (int j = 0; j < QK_K/64; ++j) { const __m256i scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0)); const __m256i scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1)); const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; const __m256i q4l = _mm256_and_si256(q4bits, m4); const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4); const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; __m256i p16l = _mm256_maddubs_epi16(q4l, q8l); p16l = _mm256_madd_epi16(scale_l, p16l); const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; __m256i p16h = _mm256_maddubs_epi16(q4h, q8h); p16h = _mm256_madd_epi16(scale_h, p16h); const __m256i sumj = _mm256_add_epi32(p16l, p16h); sumi = _mm256_add_epi32(sumi, sumj); } __m256 vd = _mm256_set1_ps(d); acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc); } acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m)); acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m)); *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); #elif defined __AVX__ const __m128i m4 = _mm_set1_epi8(0xF); const __m128i m2 = _mm_set1_epi8(0x2); __m256 acc = _mm256_setzero_ps(); __m128 acc_m = _mm_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q4 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); const uint32_t uaux = utmp[1] & kmask1; utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); utmp[2] = uaux; utmp[0] &= kmask1; const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]); const __m128i scales = _mm_cvtepu8_epi16(utmps); const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps)); const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]); const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]); const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1); const __m128i prod = _mm_madd_epi16(mins, q8s); acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m); __m128i sumi_0 = _mm_setzero_si128(); __m128i sumi_1 = _mm_setzero_si128(); __m128i shuffle = _mm_set1_epi16(0x0100); for (int j = 0; j < QK_K/64; ++j) { const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle); shuffle = _mm_add_epi16(shuffle, m2); const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle); shuffle = _mm_add_epi16(shuffle, m2); __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16; const __m128i q4l_0 = _mm_and_si128(q4bits, m4); const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4); q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16; const __m128i q4l_1 = _mm_and_si128(q4bits, m4); const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4); const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0); p16l = _mm_madd_epi16(scale_l, p16l); sumi_0 = _mm_add_epi32(sumi_0, p16l); const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; p16l = _mm_maddubs_epi16(q4l_1, q8l_1); p16l = _mm_madd_epi16(scale_l, p16l); sumi_1 = _mm_add_epi32(sumi_1, p16l); const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0); p16h = _mm_madd_epi16(scale_h, p16h); sumi_0 = _mm_add_epi32(sumi_0, p16h); const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; p16h = _mm_maddubs_epi16(q4h_1, q8h_1); p16h = _mm_madd_epi16(scale_h, p16h); sumi_1 = _mm_add_epi32(sumi_1, p16h); } __m256 vd = _mm256_set1_ps(d); __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc); } acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m)); acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m)); *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); #else UNUSED(x); UNUSED(y); UNUSED(nb); UNUSED(kmask1); UNUSED(kmask2); UNUSED(kmask3); UNUSED(utmp); ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_q5_K * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; static const uint32_t kmask1 = 0x3f3f3f3f; static const uint32_t kmask2 = 0x0f0f0f0f; static const uint32_t kmask3 = 0x03030303; uint32_t utmp[4]; #if defined __AVX2__ const __m256i m4 = _mm256_set1_epi8(0xF); const __m128i mzero = _mm_setzero_si128(); const __m256i mone = _mm256_set1_epi8(1); __m256 acc = _mm256_setzero_ps(); float summs = 0.f; for (int i = 0; i < nb; ++i) { const uint8_t * GGML_RESTRICT q5 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); const uint32_t uaux = utmp[1] & kmask1; utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); utmp[2] = uaux; utmp[0] &= kmask1; const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums); const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1)); const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s); const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero); summs += dmin * _mm_extract_epi32(hsum, 0); const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0); const __m256i scales = MM256_SET_M128I(sc128, sc128); const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh); __m256i hmask = mone; __m256i sumi = _mm256_setzero_si256(); int bit = 0; for (int j = 0; j < QK_K/64; ++j) { const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0)); const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1)); const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32; const __m256i q5l_0 = _mm256_and_si256(q5bits, m4); const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4); const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0); hmask = _mm256_slli_epi16(hmask, 1); const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4); const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4); const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1); hmask = _mm256_slli_epi16(hmask, 1); const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0); __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1); p16_0 = _mm256_madd_epi16(scale_0, p16_0); p16_1 = _mm256_madd_epi16(scale_1, p16_1); sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); } __m256 vd = _mm256_set1_ps(d); acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc); } *s = hsum_float_8(acc) + summs; #elif defined __AVX__ const __m128i m4 = _mm_set1_epi8(0xF); const __m128i mzero = _mm_setzero_si128(); const __m128i mone = _mm_set1_epi8(1); const __m128i m2 = _mm_set1_epi8(2); __m256 acc = _mm256_setzero_ps(); float summs = 0.f; for (int i = 0; i < nb; ++i) { const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q5 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); const uint32_t uaux = utmp[1] & kmask1; utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); utmp[2] = uaux; utmp[0] &= kmask1; const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]); const __m128i scales = _mm_cvtepu8_epi16(utmps); const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps)); const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]); const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]); const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1); const __m128i prod = _mm_madd_epi16(mins, q8s); const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero); summs += dmin * _mm_extract_epi32(hsum, 0); const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]); const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]); __m128i hmask = mone; __m128i sumi_0 = _mm_setzero_si128(); __m128i sumi_1 = _mm_setzero_si128(); int bit = 0; __m128i shuffle = _mm_set1_epi16(0x0100); for (int j = 0; j < QK_K/64; ++j) { const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle); shuffle = _mm_add_epi16(shuffle, m2); const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle); shuffle = _mm_add_epi16(shuffle, m2); const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16; const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16; __m128i q5l_0 = _mm_and_si128(q5bits_0, m4); __m128i q5l_1 = _mm_and_si128(q5bits_1, m4); __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4); __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4); __m128i q5_0 = _mm_add_epi8(q5l_0, q5h_0); __m128i q5_1 = _mm_add_epi8(q5l_1, q5h_1); hmask = _mm_slli_epi16(hmask, 1); __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0); __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1); p16_0 = _mm_madd_epi16(scale_0, p16_0); p16_1 = _mm_madd_epi16(scale_0, p16_1); q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4); q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4); q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4); q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4); q5_0 = _mm_add_epi8(q5l_0, q5h_0); q5_1 = _mm_add_epi8(q5l_1, q5h_1); hmask = _mm_slli_epi16(hmask, 1); q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0); __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1); p16_2 = _mm_madd_epi16(scale_1, p16_2); p16_3 = _mm_madd_epi16(scale_1, p16_3); sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); } __m256 vd = _mm256_set1_ps(d); __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc); } *s = hsum_float_8(acc) + summs; #else UNUSED(x); UNUSED(y); UNUSED(nb); UNUSED(kmask1); UNUSED(kmask2); UNUSED(kmask3); UNUSED(utmp); ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_q6_K * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; #if defined __AVX2__ const __m256i m4 = _mm256_set1_epi8(0xF); const __m256i m2 = _mm256_set1_epi8(3); const __m256i m32s = _mm256_set1_epi8(32); __m256 acc = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q4 = x[i].ql; const uint8_t * GGML_RESTRICT qh = x[i].qh; const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); __m256i sumi = _mm256_setzero_si256(); int is = 0; for (int j = 0; j < QK_K/128; ++j) { const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0)); const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1)); const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2)); const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3)); is += 4; const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32; const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4); const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4); const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4); const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4); const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0); const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1); const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2); const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3); const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0); __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1); __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2); __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3); __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0); __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1); __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2); __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3); p16_0 = _mm256_sub_epi16(p16_0, q8s_0); p16_1 = _mm256_sub_epi16(p16_1, q8s_1); p16_2 = _mm256_sub_epi16(p16_2, q8s_2); p16_3 = _mm256_sub_epi16(p16_3, q8s_3); p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0); p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1); p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2); p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3); sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3)); } acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); } *s = hsum_float_8(acc); #elif defined __AVX__ const __m128i m3 = _mm_set1_epi8(3); const __m128i m15 = _mm_set1_epi8(15); __m256 acc = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q4 = x[i].ql; const uint8_t * GGML_RESTRICT qh = x[i].qh; const int8_t * GGML_RESTRICT q8 = y[i].qs; // handle the q6_k -32 offset separately using bsums const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums); const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1); const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales); const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8)); const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5); const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5); __m128i sumi_0 = _mm_setzero_si128(); __m128i sumi_1 = _mm_setzero_si128(); int is = 0; for (int j = 0; j < QK_K/128; ++j) { const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16; const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16; const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4); const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4); const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2); const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2); const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48)); const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48)); const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2); const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2); const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0); const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1); const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2); const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3); const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4); const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5); const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6); const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7); const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0); __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1); __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2); __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3); __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4); __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5); __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6); __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7); const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0)); const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1)); const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2)); const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3)); is += 4; p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0); p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1); p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2); p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3); p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4); p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5); p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6); p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7); sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6)); sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7)); } sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0); sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1); const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc); } *s = hsum_float_8(acc); #else UNUSED(x); UNUSED(y); UNUSED(nb); ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } #if defined (__AVX__) || defined (__AVX2__) static const int8_t keven_signs_q2xs[1024] = { 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, }; #endif void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_iq2_xxs * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; #if defined(__AVX2__) const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; uint32_t aux32[4]; const uint8_t * aux8 = (const uint8_t *)aux32; __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127], signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); const uint16_t ls1 = aux32[1] >> 28; const uint16_t ls2 = aux32[3] >> 28; const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); sumi1 = _mm256_add_epi32(sumi1, p1); sumi2 = _mm256_add_epi32(sumi2, p2); } accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); } *s = 0.125f * hsum_float_8(accumf); #elif defined(__AVX__) const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; uint32_t aux32[4]; const uint8_t * aux8 = (const uint8_t *)aux32; __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; __m128i sumi1_0 = _mm_setzero_si128(); __m128i sumi1_1 = _mm_setzero_si128(); __m128i sumi2_0 = _mm_setzero_si128(); __m128i sumi2_1 = _mm_setzero_si128(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]); const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]); const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]); const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]); const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0); const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1); const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0); const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1); const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); const uint16_t ls1 = aux32[1] >> 28; const uint16_t ls2 = aux32[3] >> 28; const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); } accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); } *s = 0.125f * hsum_float_8(accumf); #else UNUSED(x); UNUSED(y); UNUSED(nb); ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_iq2_xs * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; #if defined(__AVX2__) const __m256i mone = _mm256_set1_epi8(1); static const char block_sign_shuffle_mask_1[32] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, }; static const char block_sign_shuffle_mask_2[32] = { 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, }; static const uint8_t bit_selector_mask_bytes[32] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, }; const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes); const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1); const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2); static const uint8_t k_bit_helper[32] = { 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, }; const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper); const __m256i m511 = _mm256_set1_epi16(511); const __m128i m4 = _mm_set1_epi8(0xf); const __m128i m1 = _mm_set1_epi8(1); uint64_t aux64; // somewhat hacky, but gives a significant boost in performance __m256i aux_gindex; const uint16_t * gindex = (const uint16_t *)&aux_gindex; __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(&aux64, x[i].scales, 8); __m128i stmp = _mm_set1_epi64x(aux64); stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2); q2 += 16; aux_gindex = _mm256_and_si256(q2_data, m511); const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9); const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13); const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper); const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting); const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits); const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]], iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]); const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]], iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]); const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]], iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]); const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]], iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits); const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1); const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l); const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h); __m256i signs; signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1); signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone)); signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2); signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone)); signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1); signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone)); signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2); signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone)); const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); const __m256i dot3 = _mm256_maddubs_epi16(q2_3, q8s_3); const __m256i dot4 = _mm256_maddubs_epi16(q2_4, q8s_4); const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0))); const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1))); const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2))); const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3))); sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1)); sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2)); sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3)); sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4)); } accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); } *s = 0.125f * hsum_float_8(accumf); #elif defined(__AVX__) const __m128i mone = _mm_set1_epi8(1); static const char block_sign_shuffle_mask_1[32] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, }; static const char block_sign_shuffle_mask_2[32] = { 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, }; static const uint8_t bit_selector_mask_bytes[32] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, }; const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes); const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1); const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1); const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1); const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2); const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1); static const uint8_t k_bit_helper[32] = { 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, }; const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper); const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1); const __m128i m511 = _mm_set1_epi16(511); const __m128i m4 = _mm_set1_epi8(0xf); const __m128i m1 = _mm_set1_epi8(1); uint64_t aux64; // somewhat hacky, but gives a significant boost in performance __m256i aux_gindex; const uint16_t * gindex = (const uint16_t *)&aux_gindex; __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(&aux64, x[i].scales, 8); __m128i stmp = _mm_set1_epi64x(aux64); stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); __m128i sumi1_0 = _mm_setzero_si128(); __m128i sumi1_1 = _mm_setzero_si128(); __m128i sumi2_0 = _mm_setzero_si128(); __m128i sumi2_1 = _mm_setzero_si128(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2); const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16; aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511)); const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9); const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9); const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13); const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13); const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0); const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1); const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0); const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1); const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0); const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1); const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]); const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]); const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]); const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]); const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]); const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]); const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]); // AVX2 full_signs_1 is full_sign_bits_0 here // AVX2 full_signs_2 is full_sign_bits_1 here __m128i signs_0, signs_1; signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0); signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1); signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone)); const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone)); signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0); signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1); signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone)); const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone)); signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0); signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1); signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone)); const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone)); signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0); signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1); signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0); signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1); const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone)); const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone)); const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0); const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1); const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0); const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1); __m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)); const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp); const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)); const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp); const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)); const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp); const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)); const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp); const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8)); sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0)); sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1)); sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0)); sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1)); sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0)); sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1)); sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0)); sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1)); } accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); } *s = 0.125f * hsum_float_8(accumf); #else UNUSED(x); UNUSED(y); UNUSED(nb); ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_iq2_s * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; #if defined(__AVX2__) static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 }; static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, }; const __m128i m4 = _mm_set1_epi8(0xf); const __m128i m1 = _mm_set1_epi8(1); const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); uint64_t aux64; __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(&aux64, x[i].scales, 8); const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1); const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15 __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)], iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)], iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); qs += 8; __m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16)); aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2); const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16)); aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2); const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2); signs += 4; const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1 const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3 const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0))); const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1))); sumi1 = _mm256_add_epi32(sumi1, p1); sumi2 = _mm256_add_epi32(sumi2, p2); } accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); } *s = 0.125f * hsum_float_8(accumf); #elif defined(__AVX__) static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 }; static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, }; const __m128i m4 = _mm_set1_epi8(0xf); const __m128i m1 = _mm_set1_epi8(1); const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1); const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1); const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2); const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1); uint64_t aux64; __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(&aux64, x[i].scales, 8); const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1); const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8); const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8)); __m128i sumi1_0 = _mm_setzero_si128(); __m128i sumi1_1 = _mm_setzero_si128(); __m128i sumi2_0 = _mm_setzero_si128(); __m128i sumi2_1 = _mm_setzero_si128(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)], iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]); const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)], iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]); const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)], iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]); const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)], iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]); qs += 8; __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16)); __m128i aux128_1 = aux128_0; aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0); const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1); aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16)); aux128_1 = aux128_0; aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0); const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1); signs += 4; const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0))); const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1))); const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0))); const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1))); sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); } accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); } *s = 0.125f * hsum_float_8(accumf); #else UNUSED(x); UNUSED(y); UNUSED(nb); ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_iq3_xxs * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; #if defined(__AVX2__) const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; uint32_t aux32[2]; __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); q3 += 8; const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); q3 += 8; memcpy(aux32, gas, 8); gas += 8; const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127], signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); const uint16_t ls1 = aux32[0] >> 28; const uint16_t ls2 = aux32[1] >> 28; const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); sumi1 = _mm256_add_epi32(sumi1, p1); sumi2 = _mm256_add_epi32(sumi2, p2); } accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); } *s = 0.25f * hsum_float_8(accumf); #elif defined(__AVX__) const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; uint32_t aux32[2]; __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; __m128i sumi1_0 = _mm_setzero_si128(); __m128i sumi1_1 = _mm_setzero_si128(); __m128i sumi2_0 = _mm_setzero_si128(); __m128i sumi2_1 = _mm_setzero_si128(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]); q3 += 8; const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]); q3 += 8; memcpy(aux32, gas, 8); gas += 8; const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]); const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]); const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0); const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1); const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0); const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1); const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); const uint16_t ls1 = aux32[0] >> 28; const uint16_t ls2 = aux32[1] >> 28; const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); } accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); } *s = 0.25f * hsum_float_8(accumf); #else UNUSED(x); UNUSED(y); UNUSED(nb); ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_iq3_s * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; #if defined(__AVX2__) static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 }; static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, }; const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); const __m256i idx_mask = _mm256_set1_epi32(256); typedef union { __m256i vec[2]; uint32_t index[16]; } index_t; index_t idx; __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16; idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]); idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]); idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask); idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask); idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l))); idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1))); // At leat on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange. //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4); //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4); const __m256i q2_1 = _mm256_set_epi32( iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]], iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]] ); const __m256i q2_2 = _mm256_set_epi32( iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]], iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]] ); __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16)); aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2); const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16)); aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2); const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2); signs += 4; const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; const uint16_t ls2 = x[i].scales[ib32/2] >> 4; const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); sumi1 = _mm256_add_epi32(sumi1, p1); sumi2 = _mm256_add_epi32(sumi2, p2); } accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); } *s = hsum_float_8(accumf); #elif defined(__AVX__) static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 }; static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, }; const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1); const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1); const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2); const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1); const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256); const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16); const __m128i idx_mask = _mm_set1_epi32(256); typedef union { __m128i vec[4]; uint32_t index[16]; } index_t; index_t idx; __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; const int8_t * GGML_RESTRICT q8 = y[i].qs; __m128i sumi1_0 = _mm_setzero_si128(); __m128i sumi1_1 = _mm_setzero_si128(); __m128i sumi2_0 = _mm_setzero_si128(); __m128i sumi2_1 = _mm_setzero_si128(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs); const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp); const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16; idx.vec[0] = _mm_set1_epi32(qh[ib32+0]); idx.vec[1] = idx.vec[0]; idx.vec[2] = _mm_set1_epi32(qh[ib32+1]); idx.vec[3] = idx.vec[2]; idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask); idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask); idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask); idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask); idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0)); idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8))); idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1)); idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8))); const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]); const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]); const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]); const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]); __m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16)); __m128i aux128_1 = aux128_0; aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0); const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1); aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16)); aux128_1 = aux128_0; aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0); aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1); const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0); const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1); const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0); const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1); signs += 4; const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0); const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1); const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0); const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1); const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; const uint16_t ls2 = x[i].scales[ib32/2] >> 4; const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1)); const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1)); const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1)); const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1)); sumi1_0 = _mm_add_epi32(sumi1_0, p1_0); sumi1_1 = _mm_add_epi32(sumi1_1, p1_1); sumi2_0 = _mm_add_epi32(sumi2_0, p2_0); sumi2_1 = _mm_add_epi32(sumi2_1, p2_1); } accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf); } *s = hsum_float_8(accumf); #else UNUSED(x); UNUSED(y); UNUSED(nb); ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_iq1_s * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; #if defined __AVX2__ __m256 accum = _mm256_setzero_ps(); float accum1 = 0; for (int i = 0; i < nb; ++i) { const int8_t * q8 = y[i].qs; const uint8_t * qs = x[i].qs; const uint16_t * qh = x[i].qh; __m256i sumi = _mm256_setzero_si256(); int sumi1 = 0; for (int ib = 0; ib < QK_K/32; ib += 2) { #ifdef __BMI2__ const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL); const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL); const uint16_t *idx1 = (const uint16_t *)(&packed_idx1); const uint16_t *idx2 = (const uint16_t *)(&packed_idx2); const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]); const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]); #else const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]); const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]); #endif qs += 8; const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1)); const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2)); sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2)); sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2; } const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum); accum1 += d * sumi1; } *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; #elif defined __AVX__ __m256 accum = _mm256_setzero_ps(); float accum1 = 0; for (int i = 0; i < nb; ++i) { const int8_t * q8 = y[i].qs; const uint8_t * qs = x[i].qs; const uint16_t * qh = x[i].qh; __m128i sumi1_0 = _mm_setzero_si128(); __m128i sumi1_1 = _mm_setzero_si128(); int sumi1 = 0; for (int ib = 0; ib < QK_K/32; ib += 2) { const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]); const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]); const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]); const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]); qs += 8; const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0); const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1); const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0); const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1); const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1; const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1; const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1)); const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1)); const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2)); const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2)); sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0)); sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1)); sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1 + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2; } const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum); accum1 += d * sumi1; } *s = hsum_float_8(accum) + IQ1S_DELTA * accum1; #else UNUSED(x); UNUSED(y); UNUSED(nb); ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); const block_iq1_m * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; iq1m_scale_t scale; #if defined __AVX2__ const __m256i mask = _mm256_set1_epi16(0x7); const __m256i mone = _mm256_set1_epi16(1); const __m256i mone8 = _mm256_set1_epi8(1); const __m256i mtwo8 = _mm256_set1_epi8(2); // VPSHUFB cannot cross 128-bit lanes so odd shifts go to upper half. const __m256i scales_shift = _mm256_set_epi64x(9, 3, 6, 0); __m256 accum1 = _mm256_setzero_ps(); __m256 accum2 = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const int8_t * q8 = y[i].qs; const uint8_t * qs = x[i].qs; const uint8_t * qh = x[i].qh; const uint16_t * sc = (const uint16_t *)x[i].scales; scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); // Extract 3-bit scales (16 values) __m256i scales = _mm256_set1_epi64x(*(const uint64_t*)sc); scales = _mm256_srlv_epi64(scales, scales_shift); scales = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scales, mask), 1), mone); // Indices to repeat each scale 8 times. __m256i scales_idx1 = _mm256_set1_epi16(0x0100); __m256i scales_idx2 = _mm256_add_epi8(scales_idx1, _mm256_set1_epi8(8)); __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); for (int ib = 0; ib < QK_K/32; ib += 2) { #ifdef __BMI2__ const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL); const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL); const uint16_t *idx1 = (const uint16_t *)(&packed_idx1); const uint16_t *idx2 = (const uint16_t *)(&packed_idx2); const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]); const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]); // Convert signs to bytes 0x81 (negative) or 0x01 (positive) const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL); const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign))); const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32))); #else const __m256i q1b_1 = _mm256_set_epi64x( iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)], iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)] ); const __m256i q1b_2 = _mm256_set_epi64x( iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)], iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)] ); const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); #endif const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1)); const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2)); __m256i scale1 = _mm256_shuffle_epi8(scales, scales_idx1); __m256i scale2 = _mm256_shuffle_epi8(scales, scales_idx2); scales_idx1 = _mm256_add_epi8(scales_idx1, mtwo8); scales_idx2 = _mm256_add_epi8(scales_idx2, mtwo8); const __m256i p1 = _mm256_madd_epi16(dot1, scale1); const __m256i p2 = _mm256_madd_epi16(dot2, scale2); const __m256i p3 = _mm256_madd_epi16(dot3, scale1); const __m256i p4 = _mm256_madd_epi16(dot4, scale2); sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2)); sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4)); qs += 8; qh += 4; } const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16)); accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1); accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2); } *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); #elif defined __AVX__ const __m128i mask = _mm_set1_epi16(0x7); const __m128i mone = _mm_set1_epi16(1); __m256 accum1 = _mm256_setzero_ps(); __m256 accum2 = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const int8_t * q8 = y[i].qs; const uint8_t * qs = x[i].qs; const uint8_t * qh = x[i].qh; const uint16_t * sc = (const uint16_t *)x[i].scales; scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); __m128i sumi1_0 = _mm_setzero_si128(); __m128i sumi1_1 = _mm_setzero_si128(); __m128i sumi2_0 = _mm_setzero_si128(); __m128i sumi2_1 = _mm_setzero_si128(); for (int ib = 0; ib < QK_K/32; ib += 2) { const __m128i q1b_1_0 = _mm_set_epi64x( iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]); const __m128i q1b_1_1 = _mm_set_epi64x( iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]); const __m128i q1b_2_0 = _mm_set_epi64x( iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]); const __m128i q1b_2_1 = _mm_set_epi64x( iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]); const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0); const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1); const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0); const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1); const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0); const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1); const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0); const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1); __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0); __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3); __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6); __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9); scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone); scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone); scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone); scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone); const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0); const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1); const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0); const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1); const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0); const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1); const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0); const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1); sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0)); sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1)); sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0)); sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1)); qs += 8; qh += 4; } const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16)); accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1); accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2); } *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); #else UNUSED(x); UNUSED(y); UNUSED(nb); UNUSED(scale); ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); assert(n % QK4_NL == 0); static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); const block_iq4_nl * GGML_RESTRICT x = vx; const block_q8_0 * GGML_RESTRICT y = vy; const int nb = n / QK4_NL; int ib = 0; float sumf = 0; #if defined __AVX2__ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); const __m128i m4b = _mm_set1_epi8(0x0f); const __m256i mone = _mm256_set1_epi16(1); __m256 accum1 = _mm256_setzero_ps(); __m256 accum2 = _mm256_setzero_ps(); for (; ib + 1 < nb; ib += 2) { const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs); const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs); const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[ib + 0].qs); const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[ib + 1].qs); const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); const __m256i p_1 = _mm256_madd_epi16(p16_1, mone); const __m256i p_2 = _mm256_madd_epi16(p16_2, mone); accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)), _mm256_cvtepi32_ps(p_1), accum1); accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)), _mm256_cvtepi32_ps(p_2), accum2); } sumf = hsum_float_8(_mm256_add_ps(accum1, accum2)); #elif defined __AVX__ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); const __m128i m4b = _mm_set1_epi8(0x0f); __m256 accum = _mm256_setzero_ps(); for (; ib + 1 < nb; ib += 2) { const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)); const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)); const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)); const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)); const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1); const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d); accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum); } sumf = hsum_float_8(accum); #endif for (; ib < nb; ++ib) { const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); int sumi1 = 0, sumi2 = 0; for (int j = 0; j < QK4_NL/2; ++j) { sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4]; } sumf += d * (sumi1 + sumi2); } *s = sumf; } void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); assert(n % QK_K == 0); const block_iq4_xs * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; #if defined __AVX2__ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); const __m128i m4b = _mm_set1_epi8(0x0f); __m256 accum = _mm256_setzero_ps(); for (int ibl = 0; ibl < nb; ++ibl) { const uint8_t * qs = x[ibl].qs; const int8_t * q8 = y[ibl].qs; uint16_t sh = x[ibl].scales_h; __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); for (int ib = 0; ib < QK_K/32; ib += 2) { const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs); qs += 16; const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16; const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)), _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)), _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b))); const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; sh >>= 4; const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1)); const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2)); sumi1 = _mm256_add_epi32(p_1, sumi1); sumi2 = _mm256_add_epi32(p_2, sumi2); } accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum); } *s = hsum_float_8(accum); #elif defined __AVX__ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); const __m128i m4b = _mm_set1_epi8(0x0f); __m256 accum = _mm256_setzero_ps(); for (int ibl = 0; ibl < nb; ++ibl) { const uint8_t * qs = x[ibl].qs; const int8_t * q8 = y[ibl].qs; uint16_t sh = x[ibl].scales_h; __m128i sumi1_0 = _mm_setzero_si128(); __m128i sumi1_1 = _mm_setzero_si128(); __m128i sumi2_0 = _mm_setzero_si128(); __m128i sumi2_1 = _mm_setzero_si128(); for (int ib = 0; ib < QK_K/32; ib += 2) { const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16; const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16; const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)); const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)); const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)); const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)); const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1); const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; sh >>= 4; const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1)); const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1)); const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2)); const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2)); sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0); sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1); sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0); sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1); } __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0); __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1); accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum); } *s = hsum_float_8(accum); #else UNUSED(x); UNUSED(y); UNUSED(nb); ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif }