Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support AVX2 for run_container_to_uint32_array #642

Merged
merged 2 commits into from
Jul 22, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions benchmarks/run_container_benchmark.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,31 @@ int main() {
free(testvalues);
run_container_free(Bt);
}

// Benchmark section: time run_container_to_uint32_array on containers built
// from many overlapping ranges.
printf("==dense range test \n");
// howmany takes the values 32, 256, 2048 and 16384 (the next step, 131072,
// exceeds 1 << 16 and ends the loop).
for (int howmany = 32; howmany <= (1 << 16); howmany *= 8) {
run_container_t* Bt = run_container_create();
for (int j = 0; j < howmany; ++j) {
// The cast binds tighter than %, so the random value is first truncated to
// 16 bits and then reduced: min is in [0, 4095] and max = min + 4096 is in
// [4096, 8191]. Every range therefore overlaps the others around 4095/4096.
uint16_t min = (uint16_t)pcg32_random() % 4096;
uint16_t max = min + 4096;
// The two run counts are computed first and handed to the *_nruns variant
// of add_range. NOTE(review): presumably these are the numbers of existing
// runs lying entirely above max / below min -- confirm against the
// rle16_count_* helpers.
int32_t nruns_greater =
rle16_count_greater(Bt->runs, Bt->n_runs, max);
int32_t nruns_less =
rle16_count_less(Bt->runs, Bt->n_runs - nruns_greater, min);
run_container_add_range_nruns(Bt, min, max, nruns_less,
nruns_greater);
}
printf("\n number of values in container = %d\n",
run_container_cardinality(Bt));
// The output buffer holds exactly one uint32_t per value in the container.
int card = run_container_cardinality(Bt);
uint32_t* out = malloc(sizeof(uint32_t) * (unsigned long)card);
// 1234 is the base offset passed to the conversion.
// NOTE(review): BEST_TIME is defined outside this view; its remaining
// arguments look like (expected result, repeat count, size) -- confirm.
BEST_TIME(run_container_to_uint32_array(out, Bt, 1234), card, repeat,
card);
free(out);

run_container_free(Bt);
}

printf("\n");

run_container_t* B1 = run_container_create();
Expand Down
98 changes: 80 additions & 18 deletions src/containers/run.c
Original file line number Diff line number Diff line change
Expand Up @@ -636,24 +636,6 @@ void run_container_andnot(const run_container_t *src_1,
}
}

/*
 * Expand every run of `cont` into individual 32-bit values, each offset by
 * `base`, and store them consecutively in `vout`.
 *
 * A run (value, length) yields the length + 1 values
 * value, value + 1, ..., value + length, so `vout` must have room for the sum
 * of (length + 1) over all runs. Returns the number of values written.
 */
ALLOW_UNALIGNED
int run_container_to_uint32_array(void *vout, const run_container_t *cont,
                                  uint32_t base) {
    uint32_t *dst = (uint32_t *)vout;
    int written = 0;

    for (int r = 0; r < cont->n_runs; r++) {
        uint32_t next = base + cont->runs[r].value;
        // `length` counts the values after the first one, hence the + 1.
        int remaining = (int)cont->runs[r].length + 1;

        while (remaining-- > 0) {
            // memcpy keeps the store valid even if `vout` is not aligned for
            // uint32_t.
            memcpy(dst + written, &next, sizeof(next));
            written++;
            next++;
        }
    }
    return written;
}

/*
* Print this container using printf (useful for debugging).
*/
Expand Down Expand Up @@ -1026,6 +1008,39 @@ static inline int _avx2_run_container_cardinality(const run_container_t *run) {
return sum;
}

/*
 * AVX2 implementation of the run-container-to-array conversion.
 *
 * Expands every run of `cont` into individual 32-bit values, each offset by
 * `base`, and stores them consecutively in `vout`. A run (value, length)
 * yields the length + 1 values value, value + 1, ..., value + length, so
 * `vout` must have room for the sum of (length + 1) over all runs.
 *
 * Values are produced eight at a time with 256-bit unaligned stores; whatever
 * is left of a run (fewer than eight values) is written one value at a time.
 *
 * Returns the number of values written.
 */
ALLOW_UNALIGNED
int _avx2_run_container_to_uint32_array(void *vout, const run_container_t *cont,
                                        uint32_t base) {
    int outpos = 0;
    uint32_t *out = (uint32_t *)vout;
    // Step applied to the lane offsets after each store: [8, 8, 8, ...].
    // It does not depend on the run, so it is built once.
    const __m256i inc = _mm256_set1_epi32(8);

    for (int i = 0; i < cont->n_runs; ++i) {
        uint32_t run_start = base + cont->runs[i].value;
        uint16_t le = cont->runs[i].length;
        int j = 0;
        __m256i run_start_v = _mm256_set1_epi32((int)run_start);
        // Lane offsets within the run: [0, 1, ..., 7], then [8, 9, ..., 15],
        // and so on.
        __m256i delta = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);

        // The run holds offsets 0..le inclusive; a full vector covers offsets
        // j..j+7, which fits as long as j + 7 <= le.
        for (; j + 7 <= le; j += 8) {
            __m256i val_v = _mm256_add_epi32(run_start_v, delta);
            _mm256_storeu_si256((__m256i *)(out + outpos), val_v);
            delta = _mm256_add_epi32(inc, delta);
            outpos += 8;
        }

        // Scalar tail for the remaining (at most seven) values of the run.
        for (; j <= le; ++j) {
            uint32_t val = run_start + j;
            memcpy(out + outpos, &val,
                   sizeof(uint32_t));  // should be compiled as a MOV on x64
            outpos++;
        }
    }
    return outpos;
}

CROARING_UNTARGET_AVX2

/* Get the cardinality of `run'. Requires an actual computation. */
Expand Down Expand Up @@ -1055,6 +1070,34 @@ int run_container_cardinality(const run_container_t *run) {
return _scalar_run_container_cardinality(run);
}
}

/*
 * Scalar implementation of the run-container-to-array conversion.
 *
 * Each run (value, length) of `cont` is expanded into the length + 1 values
 * base + value, ..., base + value + length, written back to back into `vout`.
 * `vout` must have room for every value produced. Returns how many values
 * were written.
 */
int _scalar_run_container_to_uint32_array(void *vout,
                                          const run_container_t *cont,
                                          uint32_t base) {
    uint32_t *const begin = (uint32_t *)vout;
    uint32_t *cursor = begin;

    for (int k = 0; k < cont->n_runs; k++) {
        const uint32_t start = base + cont->runs[k].value;
        const uint32_t span = cont->runs[k].length;

        // Offsets 0..span inclusive: a run always has at least one value.
        for (uint32_t off = 0; off <= span; off++) {
            const uint32_t v = start + off;
            // memcpy rather than a direct store, so the write does not rely
            // on `vout` being aligned for uint32_t.
            memcpy(cursor, &v, sizeof(v));
            cursor++;
        }
    }
    return (int)(cursor - begin);
}

/*
 * Write the values of `cont`, each offset by `base`, into `vout` and return
 * the number of values written.
 *
 * Chooses the implementation at run time: the AVX2 routine when
 * croaring_hardware_support() reports ROARING_SUPPORTS_AVX2, the scalar
 * routine otherwise.
 */
int run_container_to_uint32_array(void *vout, const run_container_t *cont,
                                  uint32_t base) {
    if ((croaring_hardware_support() & ROARING_SUPPORTS_AVX2) == 0) {
        return _scalar_run_container_to_uint32_array(vout, cont, base);
    }
    return _avx2_run_container_to_uint32_array(vout, cont, base);
}

#else

/* Get the cardinality of `run'. Requires an actual computation. */
Expand All @@ -1071,6 +1114,25 @@ int run_container_cardinality(const run_container_t *run) {

return sum;
}

/*
 * Portable version of the run-container-to-array conversion.
 *
 * Walks the runs of `cont` in order and, for a run (value, length), stores
 * the length + 1 values base + value, ..., base + value + length into `vout`.
 * `vout` must have room for every value produced. Returns the number of
 * values stored.
 */
ALLOW_UNALIGNED
int run_container_to_uint32_array(void *vout, const run_container_t *cont,
                                  uint32_t base) {
    uint32_t *dest = (uint32_t *)vout;
    int count = 0;

    for (int idx = 0; idx < cont->n_runs; idx++) {
        const uint32_t first = base + cont->runs[idx].value;
        const uint32_t last_offset = cont->runs[idx].length;
        uint32_t offset = 0;

        // A run is never empty, so the body runs at least once; the loop
        // stops after the value at `last_offset` has been stored.
        do {
            const uint32_t v = first + offset;
            // memcpy so the store is valid for an unaligned `vout`.
            memcpy(&dest[count], &v, sizeof(v));
            count++;
        } while (offset++ < last_offset);
    }
    return count;
}

#endif

#ifdef __cplusplus
Expand Down