#ifndef SWAB_SORT_H
#define SWAB_SORT_H

#include <stdint.h>

/* A fast quicksort-like new alg created in Csolnok, Hungary with:
 * - 4-way partitioning with 0..5 copies (not swaps) per elem per run
 * - ensured O(log2(n)) worst recursion depth
 *
 * LICENCE: CC-BY, 2025 May 08
 * Author: Richárd István Thier (also author of the Magyarsort)
 */

/** Random generator state used by the sort: one 32-bit word that schwab_lcg() advances in place. */
typedef uint32_t sch_rand_state;

/** Create rand state for schwab_sort using a seed - can give 0 if uninterested */
static inline sch_rand_state schwab_rand_state(uint32_t seed) {
	return seed;
}

/**
 * Advance the 32-bit linear congruential generator by one step.
 *
 * The new value (old * 1664525 + 1013904223, wrapping modulo 2^32)
 * is stored back into *state and also returned.
 *
 * @param state IN-OUT: generator state, updated in place.
 * @return The freshly generated 32-bit value.
 */
static inline uint32_t schwab_lcg(sch_rand_state *state) {
	const uint32_t next = (*state) * 1664525u + 1013904223u;
	*state = next;
	return next;
}

/**
 * Draw a pseudo-random index in [0, len - 1] without using a modulus.
 *
 * A fresh 32-bit random value is widened to 64 bits, multiplied by len,
 * and the upper 32 bits of that product are the result.
 *
 * @param state IN-OUT: generator state, advanced by one step.
 * @param len   Number of candidate indices.
 * @return An index smaller than len (0 when len is 0).
 */
static inline uint32_t schwab_pick_pivot(sch_rand_state *state, uint32_t len) {
	const uint64_t scaled = (uint64_t)schwab_lcg(state) * (uint64_t)len;
	return (uint32_t)(scaled >> 32);
}

/**
 * Split arr[low..high] into four consecutive value ranges in one pass.
 *
 * Expects: arr[*plo] <= kmid <= arr[*phi] on entry (klo = arr[*plo] is the
 * low pivot key, khi = arr[*phi] is the high pivot key).
 *
 * On return the range is laid out as:
 *   arr[low   .. *plo  - 1]  keys <  klo
 *   arr[*plo  .. *pmid - 1]  klo  <= keys < kmid
 *   arr[*pmid .. *phi  - 2]  kmid <= keys < khi
 *   arr[*phi - 1]            the high pivot itself (key == khi)
 *   arr[*phi  .. high]       keys >= khi
 *
 * Since everything left of index *phi - 1 is smaller than khi and everything
 * right of it is not smaller, that slot already holds its final sorted value
 * and a caller may leave it out of any further sorting.
 *
 * @param arr  The array to partition.
 * @param low  Inclusive smallest index of the range.
 * @param high Inclusive highest index of the range.
 * @param plo  IN: index of the low pivot. OUT: first index of the second block.
 * @param kmid IN: middle splitting key; does not have to be present in arr.
 * @param pmid OUT: first index of the third block.
 * @param phi  IN: index of the high pivot. OUT: first index of the fourth block.
 */
static inline void schwab_partition(
		uint32_t *arr,
		int low,
		int high,
		int *plo,
		uint32_t kmid,
		int *pmid,
		int *phi) {

	/* Only the pivot keys are needed from here on, not their positions */
	const uint32_t key_lo = arr[*plo];
	const uint32_t key_hi = arr[*phi];

	/* Park the high pivot in the last slot so the scan can stop before it */
	arr[*phi] = arr[high];
	arr[high] = key_hi;

	/* Exclusive end indices of blocks 0, 1 and 2; the scan index ends block 3 */
	int end0 = low;
	int end1 = low;
	int end2 = low;

	for(int scan = low; scan < high; ++scan) {
		/* TODO: should be a copy of the whole element when not just uint32s! */
		const uint32_t val = arr[scan];

		/* Checked first: hot path for constant / small-range inputs, */
		/* the element already sits at the end of the last block.     */
		if(val >= key_hi) {
			continue;
		}

		/* Every other case opens a hole at `scan` and shifts one element of  */
		/* each later block rightwards until the hole reaches the destination */
		arr[scan] = arr[end2];

		if(val < key_lo) {
			arr[end2++] = arr[end1];
			arr[end1++] = arr[end0];
			arr[end0++] = val;
		} else if(val < kmid) {
			arr[end2++] = arr[end1];
			arr[end1++] = val;
		} else {
			arr[end2++] = val;
		}
	}

	/* Bring the parked pivot back: it trades places with the first element */
	/* of the last block and thereby becomes the last slot before that block */
	const uint32_t displaced = arr[end2];
	arr[end2] = arr[high];
	arr[high] = displaced;
	++end2;

	/* Publish the block boundaries */
	*plo = end0;
	*pmid = end1;
	*phi = end2;
}

/**
 * Schwab sort: a quicksort relative that splits every range 4 ways.
 *
 * Sorts array[low..high] ascending, in place. Each round picks two random
 * pivots, partitions the range into four blocks around the low pivot, the
 * midpoint of the two pivot keys, and the high pivot, then sorts the blocks.
 *
 * Termination: after partitioning, the high pivot sits at index phi - 1 in
 * its final position, so it is left out of every sub-range. The sub-ranges
 * therefore hold one element fewer than the range they came from, and each
 * round makes progress regardless of the random picks.
 *
 * Stack depth: the largest of the four sub-ranges is handled by the loop and
 * only the other three are recursed into. None of those can hold more than
 * half of the elements, so recursion depth is bounded by about log2(n).
 *
 * @param array The array to sort.
 * @param low   Inclusive smallest index of the range to sort.
 * @param high  Inclusive highest index of the range to sort; nothing is done
 *              when high <= low.
 * @param state IN-OUT: random state used for pivot selection.
 */
static inline void schwab_sort(
		uint32_t *array,
		int low,
		int high,
		sch_rand_state *state) {

	while(low < high) {
		/* Element count, computed so that high + 1 never overflows */
		const uint32_t span = (uint32_t)(high - low) + 1u;

		/* Two independent random picks; they may land on the same slot */
		const int r0 = low + (int)schwab_pick_pivot(state, span);
		const int r1 = low + (int)schwab_pick_pivot(state, span);

		/* Order the picks so that array[plo] <= array[phi] */
		int plo = r0;
		int phi = r1;
		if(array[plo] > array[phi]) {
			plo = r1;
			phi = r0;
		}

		const uint32_t klo = array[plo];
		const uint32_t khi = array[phi];
		/* Midpoint key without overflow; need not exist in the array */
		const uint32_t kmid = klo + (khi - klo) / 2;

		int pmid;
		schwab_partition(array, low, high, &plo, kmid, &pmid, &phi);

		/* Inclusive bounds of the four sub-ranges still to be sorted.   */
		/* array[phi - 1] is the high pivot in its final slot, hence the */
		/* third sub-range stops at phi - 2 (it may be empty).           */
		const int sub_lo[4] = { low,     plo,      pmid,    phi  };
		const int sub_hi[4] = { plo - 1, pmid - 1, phi - 2, high };

		/* Find the largest sub-range: that one stays in this loop */
		int big = 0;
		for(int i = 1; i < 4; ++i) {
			if(sub_hi[i] - sub_lo[i] > sub_hi[big] - sub_lo[big]) {
				big = i;
			}
		}

		/* Recurse into the three smaller ones, in ascending memory order */
		for(int i = 0; i < 4; ++i) {
			if(i != big) {
				schwab_sort(array, sub_lo[i], sub_hi[i], state);
			}
		}

		low = sub_lo[big];
		high = sub_hi[big];
	}
}

#endif /* SWAB_SORT_H */