#include #include #define TOP2(x) ((x) >> 30) void swap(uint32_t* a, uint32_t* b) { uint32_t tmp = *a; *a = *b; *b = tmp; } // TODO: instead of swaps, we need a single tmp var + only memcpys!!! // TODO: we can do "ILP-memcpy": key from b2->b3, value from b2->b3, key from b1->b2, value from b1... // Rem.: The latter is faster likely than calling memcpy function even though its simd-optimized... void partition_top2(uint32_t* arr, int n) { int b0 = 0, b1 = 0, b2 = 0, b3 = 0; while (b3 < n) { uint32_t val = arr[b3]; uint32_t top = TOP2(val); if (top == 0) { // Cascade: swap into b2, b1, b0 swap(&arr[b3], &arr[b2]); swap(&arr[b2], &arr[b1]); swap(&arr[b1], &arr[b0]); b0++; b1++; b2++; } else if (top == 1) { // Cascade: swap into b2, b1 swap(&arr[b3], &arr[b2]); swap(&arr[b2], &arr[b1]); b1++; b2++; } else if (top == 2) { // Swap into b2 swap(&arr[b3], &arr[b2]); b2++; } // else (top == 3), do nothing b3++; } } int main() { uint32_t arr[] = { 0x40000001, // top 2 bits = 01 0x00000002, // 00 0xC0000003, // 11 0x80000004, // 10 0x00000005, // 00 0x40000006, // 01 0xC0000007, // 11 0x80000008, // 10 }; int n = sizeof(arr) / sizeof(arr[0]); printf("Before:\n"); for (int i = 0; i < n; ++i) printf("0x%08X ", arr[i]); printf("\n"); // Optional: Show top 2 bits for verification printf("\nTop 2 bits:\n"); for (int i = 0; i < n; ++i) printf("%u ", TOP2(arr[i])); printf("\n"); partition_top2(arr, n); printf("\nAfter:\n"); for (int i = 0; i < n; ++i) printf("0x%08X ", arr[i]); printf("\n"); // Optional: Show top 2 bits for verification printf("\nTop 2 bits:\n"); for (int i = 0; i < n; ++i) printf("%u ", TOP2(arr[i])); printf("\n"); return 0; }