84 lines
2.0 KiB
C
84 lines
2.0 KiB
C
#include <stdio.h>
|
|
#include <stdint.h>
|
|
|
|
#define TOP2(x) ((x) >> 30)
|
|
|
|
void swap(uint32_t* a, uint32_t* b) {
|
|
uint32_t tmp = *a;
|
|
*a = *b;
|
|
*b = tmp;
|
|
}
|
|
|
|
// TODO: instead of swaps, we need a single tmp var + only memcpys!!!
|
|
// TODO: we can do "ILP-memcpy": key from b2->b3, value from b2->b3, key from b1->b2, value from b1...
|
|
// Rem.: The latter is faster likely than calling memcpy function even though its simd-optimized...
|
|
void partition_top2(uint32_t* arr, int n) {
|
|
int b0 = 0, b1 = 0, b2 = 0, b3 = 0;
|
|
|
|
while (b3 < n) {
|
|
uint32_t val = arr[b3];
|
|
uint32_t top = TOP2(val);
|
|
|
|
if (top == 0) {
|
|
// Cascade: swap into b2, b1, b0
|
|
swap(&arr[b3], &arr[b2]);
|
|
swap(&arr[b2], &arr[b1]);
|
|
swap(&arr[b1], &arr[b0]);
|
|
b0++; b1++; b2++;
|
|
} else if (top == 1) {
|
|
// Cascade: swap into b2, b1
|
|
swap(&arr[b3], &arr[b2]);
|
|
swap(&arr[b2], &arr[b1]);
|
|
b1++; b2++;
|
|
} else if (top == 2) {
|
|
// Swap into b2
|
|
swap(&arr[b3], &arr[b2]);
|
|
b2++;
|
|
}
|
|
// else (top == 3), do nothing
|
|
|
|
b3++;
|
|
}
|
|
}
|
|
|
|
int main() {
|
|
uint32_t arr[] = {
|
|
0x40000001, // top 2 bits = 01
|
|
0x00000002, // 00
|
|
0xC0000003, // 11
|
|
0x80000004, // 10
|
|
0x00000005, // 00
|
|
0x40000006, // 01
|
|
0xC0000007, // 11
|
|
0x80000008, // 10
|
|
};
|
|
|
|
int n = sizeof(arr) / sizeof(arr[0]);
|
|
|
|
printf("Before:\n");
|
|
for (int i = 0; i < n; ++i)
|
|
printf("0x%08X ", arr[i]);
|
|
printf("\n");
|
|
|
|
// Optional: Show top 2 bits for verification
|
|
printf("\nTop 2 bits:\n");
|
|
for (int i = 0; i < n; ++i)
|
|
printf("%u ", TOP2(arr[i]));
|
|
printf("\n");
|
|
|
|
partition_top2(arr, n);
|
|
|
|
printf("\nAfter:\n");
|
|
for (int i = 0; i < n; ++i)
|
|
printf("0x%08X ", arr[i]);
|
|
printf("\n");
|
|
|
|
// Optional: Show top 2 bits for verification
|
|
printf("\nTop 2 bits:\n");
|
|
for (int i = 0; i < n; ++i)
|
|
printf("%u ", TOP2(arr[i]));
|
|
printf("\n");
|
|
|
|
return 0;
|
|
}
|