#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#if (defined(__i386__) || defined(__x86_64__))

// The horizontal-sum macro below uses _mm256_extract_epi64, which GCC and
// Clang only declare on 64-bit targets, so the AVX2 path is x86_64-only.
#if defined(__x86_64__)
#define HAVE_AVX2 1
#include <immintrin.h>

// Loading this mask at offset (haystack_len % 32) yields a vector whose
// first 32 - (haystack_len % 32) bytes are zero; it blanks out the
// already-counted bytes of the final, overlapping 32-byte load.
static const unsigned char AVX2_MASK[] = {
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,
    255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255
};

__attribute__((target("avx2")))
size_t avx2_bytecount(const char *haystack, const char needle, size_t haystack_len)
{
    assert(haystack_len >= 32);

// Horizontally add the 32 u8 lane counters in u8s into count:
// _mm256_sad_epu8 against zero sums each 8-byte group into one of four
// 64-bit lanes.
#define SUM_ADD(count, u8s, temp) do { \
        temp = _mm256_sad_epu8(u8s, _mm256_setzero_si256()); \
        count += _mm256_extract_epi64(temp, 0) + _mm256_extract_epi64(temp, 1) + \
                 _mm256_extract_epi64(temp, 2) + _mm256_extract_epi64(temp, 3); \
    } while (0)
#define mm256_from_offset(slice, offset) _mm256_loadu_si256((const __m256i *)(slice + offset))

    size_t offset = 0;
    size_t count = 0;
    __m256i sums;
    __m256i needles = _mm256_set1_epi8((char)needle);

    // _mm256_cmpeq_epi8 sets matching bytes to 0xFF (-1), so subtracting the
    // comparison result adds 1 to a per-byte counter for every match.  A u8
    // counter can absorb at most 255 hits before overflowing, so flush into
    // count after 255 iterations, i.e. every 32 * 255 = 8160 bytes.
    while (haystack_len >= offset + 32 * 255) {
        __m256i counts = _mm256_setzero_si256();
        for (int i = 0; i < 255; ++i) {
            counts = _mm256_sub_epi8(
                counts,
                _mm256_cmpeq_epi8(mm256_from_offset(haystack, offset), needles)
            );
            offset += 32;
        }
        SUM_ADD(count, counts, sums);
    }

    // One more round of up to 32 * 128 = 4096 bytes.
    if (haystack_len >= offset + 32 * 128) {
        __m256i counts = _mm256_setzero_si256();
        for (int i = 0; i < 128; ++i) {
            counts = _mm256_sub_epi8(
                counts,
                _mm256_cmpeq_epi8(mm256_from_offset(haystack, offset), needles)
            );
            offset += 32;
        }
        SUM_ADD(count, counts, sums);
    }

    // Remaining whole 32-byte blocks.
    __m256i counts = _mm256_setzero_si256();
    for (size_t i = 0; i < (haystack_len - offset) / 32; ++i) {
        counts = _mm256_sub_epi8(
            counts,
            _mm256_cmpeq_epi8(mm256_from_offset(haystack, offset + i * 32), needles)
        );
    }

    // Tail: reload the final 32 bytes of the haystack (partly overlapping
    // bytes that were already counted) and mask the overlap off.
    if (haystack_len % 32 != 0) {
        counts = _mm256_sub_epi8(
            counts,
            _mm256_and_si256(
                _mm256_cmpeq_epi8(mm256_from_offset(haystack, haystack_len - 32), needles),
                mm256_from_offset(AVX2_MASK, haystack_len % 32)
            )
        );
    }
    SUM_ADD(count, counts, sums);

    return count;
#undef mm256_from_offset
#undef SUM_ADD
}
#endif // __x86_64__

#define HAVE_SSE4_1 1
#include <smmintrin.h>

// Same trick as AVX2_MASK, for 16-byte vectors.
static const unsigned char SSE4_1_MASK[] = {
      0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,
    255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255
};

__attribute__((target("sse4.1")))
size_t sse41_bytecount(const char *haystack, const char needle, size_t haystack_len)
{
    assert(haystack_len >= 16);

#define mm_from_offset(slice, offset) _mm_loadu_si128((const __m128i *)(slice + offset))
// _mm_sad_epu8 against zero leaves two 16-bit sums, zero-extended into the
// low and high 64-bit lanes; extracting the 32-bit words at indices 0 and 2
// picks them up.
#define SUM_ADD(count, u8s, temp) do { \
        temp = _mm_sad_epu8(u8s, _mm_setzero_si128()); \
        count += _mm_extract_epi32(temp, 0) + _mm_extract_epi32(temp, 2); \
    } while (0)

    size_t offset = 0;
    size_t count = 0;
    const __m128i needles = _mm_set1_epi8((char)needle);
    __m128i sums;

    // Flush the u8 lane counters every 16 * 255 = 4080 bytes, before they
    // can overflow (see the AVX2 version for details).
    while (haystack_len >= offset + 16 * 255) {
        __m128i counts = _mm_setzero_si128();
        for (int i = 0; i < 255; ++i) {
            counts = _mm_sub_epi8(
                counts,
                _mm_cmpeq_epi8(mm_from_offset(haystack, offset), needles)
            );
            offset += 16;
        }
        SUM_ADD(count, counts, sums);
    }

    // One more round of up to 16 * 128 = 2048 bytes.
    if (haystack_len >= offset + 16 * 128) {
        __m128i counts = _mm_setzero_si128();
        for (int i = 0; i < 128; ++i) {
            counts = _mm_sub_epi8(
                counts,
                _mm_cmpeq_epi8(mm_from_offset(haystack, offset), needles)
            );
            offset += 16;
        }
        SUM_ADD(count, counts, sums);
    }

    // Remaining whole 16-byte blocks.
    __m128i counts = _mm_setzero_si128();
    for (size_t i = 0; i < (haystack_len - offset) / 16; ++i) {
        counts = _mm_sub_epi8(
            counts,
            _mm_cmpeq_epi8(mm_from_offset(haystack, offset + i * 16), needles)
        );
    }

    // Masked, overlapping tail load, as in the AVX2 version.
    if (haystack_len % 16 != 0) {
        counts = _mm_sub_epi8(
            counts,
            _mm_and_si128(
                _mm_cmpeq_epi8(mm_from_offset(haystack, haystack_len - 16), needles),
                mm_from_offset(SSE4_1_MASK, haystack_len % 16)
            )
        );
    }
    SUM_ADD(count, counts, sums);

    return count;
#undef mm_from_offset
#undef SUM_ADD
}
#endif // __i386__ || __x86_64__

// Portable scalar fallback.
static size_t naive_bytecount(const char *haystack, const char needle, size_t haystack_len)
{
    size_t count = 0;
    const char *ptr = haystack;
    const char *end_ptr = ptr + haystack_len;
    while (ptr < end_ptr) {
        if (*ptr == needle) {
            count++;
        }
        ptr++;
    }
    return count;
}

// Same loop with a 32-bit counter, which compilers tend to auto-vectorize
// more readily; only valid while the count fits in a uint32_t.
static size_t naive_bytecount_32(const char *haystack, const char needle, size_t haystack_len)
{
    uint32_t count = 0;
    const char *ptr = haystack;
    const char *end_ptr = ptr + haystack_len;
    while (ptr < end_ptr) {
        if (*ptr == needle) {
            count++;
        }
        ptr++;
    }
    return count;
}

static size_t _fallback_bytecount(const char *haystack, const char needle, size_t haystack_len)
{
    if (haystack_len < UINT32_MAX) {
        return naive_bytecount_32(haystack, needle, haystack_len);
    }
    return naive_bytecount(haystack, needle, haystack_len);
}

#ifdef HAVE_AVX2
__attribute__((target("sse4.1,avx2")))
static size_t _avx2_bytecount(const char *haystack, const char needle, size_t haystack_len)
{
    if (haystack_len >= 32) {
        return avx2_bytecount(haystack, needle, haystack_len);
    }
#ifdef HAVE_SSE4_1
    if (haystack_len >= 16) {
        return sse41_bytecount(haystack, needle, haystack_len);
    }
#endif
    return _fallback_bytecount(haystack, needle, haystack_len);
}
#endif

#ifdef HAVE_SSE4_1
__attribute__((target("sse4.1")))
static size_t _sse41_bytecount(const char *haystack, const char needle, size_t haystack_len)
{
    if (haystack_len >= 16) {
        return sse41_bytecount(haystack, needle, haystack_len);
    }
    return _fallback_bytecount(haystack, needle, haystack_len);
}
#endif

// ifunc resolver: runs once at load time and picks the best implementation
// for the running CPU.  Marked unused because it is referenced only through
// the ifunc attribute below.
__attribute__((unused))
static size_t (*resolve_bytecount(void))(const char *, const char, size_t)
{
    __builtin_cpu_init();
#ifdef HAVE_AVX2
    if (__builtin_cpu_supports("avx2")) {
        return _avx2_bytecount;
    }
#endif
#ifdef HAVE_SSE4_1
    if (__builtin_cpu_supports("sse4.1")) {
        return _sse41_bytecount;
    }
#endif
    return _fallback_bytecount;
}

static size_t inner_bytecount(const char *haystack, const char needle, size_t haystack_len)
    __attribute__((ifunc("resolve_bytecount")));

// memchr-style entry point.
static size_t bytecount(const void *haystack, int needle, size_t haystack_len)
{
    return inner_bytecount((const char *)haystack, (const char)needle, haystack_len);
}
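
// A minimal usage sketch, not part of the dispatch code above: it
// cross-checks the ifunc-dispatched bytecount() against the scalar
// naive_bytecount().  The BYTECOUNT_DEMO guard and the build line are
// assumptions made for this example; something like
//   cc -O2 -DBYTECOUNT_DEMO bytecount.c
// should work on an x86 ELF target (the ifunc attribute requires GCC or
// Clang with a glibc-style runtime).
#ifdef BYTECOUNT_DEMO
#include <stdio.h>
#include <string.h>

int main(void)
{
    // An odd buffer size exercises the masked tail loads.
    static char buf[100001];
    memset(buf, 'a', sizeof(buf));
    for (size_t i = 0; i < sizeof(buf); i += 7) {
        buf[i] = 'b'; // plant a needle at every 7th byte
    }
    size_t fast = bytecount(buf, 'b', sizeof(buf));
    size_t slow = naive_bytecount(buf, 'b', sizeof(buf));
    printf("bytecount: %zu, naive: %zu\n", fast, slow);
    return fast == slow ? 0 : 1;
}
#endif // BYTECOUNT_DEMO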