diff --git a/magyarsort.h b/magyarsort.h
index c2d538e..8cd9627 100644
--- a/magyarsort.h
+++ b/magyarsort.h
@@ -44,6 +44,12 @@ namespace MagyarSort {
 	static constexpr int BITS_PER_DIGIT = 4; // "bit / helyiérték"
 	static constexpr int DIGIT_RANGE = 16; // "helyiérték állapottér"
 #else
+/*
+	// Per-word digits sorting
+	static constexpr int DIGITS = 2; // "helyiérték"
+	static constexpr int BITS_PER_DIGIT = 16; // "bit / helyiérték"
+	static constexpr int DIGIT_RANGE = 65536; // "helyiérték állapottér"
+*/
 	// Per-byte digits sorting
 	static constexpr int DIGITS = 4; // "helyiérték"
 	static constexpr int BITS_PER_DIGIT = 8; // "bit / helyiérték"
@@ -240,7 +246,22 @@ namespace MagyarSort {
 		memset(prev, 0, sizeof(prev));
 
 		// This is a template-unrolled loop too
-		PMagic2<DIGIT_RANGE - 1, COUNTER_TYP>(radics, prev);
+		if constexpr (DIGIT_RANGE < 1024) {
+			// Small digit ranges (nibbles, bytes): fully template-unrolled pass
+			PMagic2<DIGIT_RANGE - 1, COUNTER_TYP>(radics, prev);
+		} else {
+			// Large digit ranges (16-bit words and up): plain loops are used
+			// instead of PMagic2; only a partial unroll is requested here
+			#pragma GCC unroll 16
+			for(int j = 0; j < DIGITS; ++j) {
+				const int DSTART = (j * DIGIT_RANGE); // first counter of digit j
+				#pragma GCC unroll 64
+				for(int i = 0; i < DIGIT_RANGE; ++i) {
+					radics[DSTART + i] += prev[j]; // inclusive running sum
+					prev[j] = radics[DSTART + i];
+				}
+			}
+		}
 	}
 
 	/** Recursive Functor: no class should be generated I think (compiler should be smart) */
@@ -334,7 +355,9 @@ namespace MagyarSort {
 #endif // !NO_MLOCK
 		// Write prefetchin'
 		//__builtin_prefetch(&radicsOut[..], 1);
-		PrefetchMagic<DIGITS * DIGIT_RANGE, (64/sizeof(COUNTER_TYP)), COUNTER_TYP, 1/*w*/> pm(radics);
+		if constexpr (DIGIT_RANGE <= 1024) { // NOTE(review): prefix-sum branch tests `< 1024`, this one `<= 1024` - confirm intended bound
+			PrefetchMagic<DIGITS * DIGIT_RANGE, (64/sizeof(COUNTER_TYP)), COUNTER_TYP, 1/*w*/> pm(radics);
+		}
 		memset(radics, 0, sizeof(radics));
 
 		// Calculate occurences of digits
diff --git a/makefile b/makefile
index 939f35a..d2c2b12 100644
--- a/makefile
+++ b/makefile
@@ -12,6 +12,7 @@ release_debug_sym: test.cpp magyarsort.h
 
 release: test.cpp magyarsort.h
 	g++ test.cpp -DNDEBUG -std=c++17 -O2 -o test.out
+# Vectorization-report variant: g++ test.cpp -DNDEBUG -std=c++17 -O2 -ftree-vectorize -fopt-info-vec-missed -o test.out
 
 release_ypsu: ypsu.cpp magyarsort.h
 	g++ ypsu.cpp -DNDEBUG -std=c++17 -O2 -o ypsu.out
diff --git a/test.cpp b/test.cpp
index bc97415..12861f0 100644
--- a/test.cpp
+++ b/test.cpp
@@ -5,15 +5,21 @@
 // Uncomment next line to follow Creel: https://www.youtube.com/watch?v=ujb2CIWE8zY
 // #define CREEL // Overwrites TEST_LEN to 16 and sets MAGYAR_SORT_NIBBLE!
 
+// Uncomment and set a value to reduce each generated input element modulo that value!
+//#define INPUT_MOD (65536*128)
+
 // Number of input elements to generate - unused when CREEL is defined!
-#define SORT_WIDTH 200000000
-//#define SORT_WIDTH 40000000
+//#define SORT_WIDTH 200000000
+#define SORT_WIDTH 40000000
 // Uncomment this to use nibbles as digits and not bytes - CREEL defines this anyways
 //#define MAGYAR_SORT_NIBBLE
 
 // Uncomment if you want to see output before / after sorts (debugging for example)
 //#define PRINT_OUTPUT
 
+// Uncomment if you want to see how many input elements are unique and how many are duplicates (debugging info)
+#define COUNT_DUPLICANTS
+
 //#define SKA_SORT
 
 // Uncomment for perf / cachegring and similar runs!
@@ -86,7 +92,11 @@ static inline std::vector<uint32_t> GenerateInput() {
 	ret.resize(SORT_WIDTH);
 
 	for(size_t ek = 0; ek < SORT_WIDTH; ++ek) {
+#ifndef INPUT_MOD
 		ret[ek] = (uint32_t)std::rand();
+#else
+		ret[ek] = (uint32_t)std::rand() % (INPUT_MOD); // parenthesized so an unparenthesized macro body keeps its meaning
+#endif
 	}
 
 	return ret;
@@ -155,9 +165,27 @@ int main() {
 
 #ifndef MEASURE_ONLY
 	bool good = true;
+#ifdef COUNT_DUPLICANTS
+	size_t dups = 0; // number of visited elements equal to the element right before them
+	uint32_t prev = (in1.size() > 0) ? in1[0] : 0; // value of the previously visited element
+#endif // COUNT_DUPLICANTS
 	for(size_t i = 0; good && (i < in1.size()); ++i) {
 		good &= (in1[i] == in2[i]);
+#ifdef COUNT_DUPLICANTS
+		if(i > 0) { // element 0 only seeds prev
+			uint32_t curr = in1[i];
+			if(curr == prev) { // NOTE(review): adjacent compare finds every duplicate only if in1 is sorted here - confirm
+				++dups; // counting ends at the first in1/in2 mismatch, because the loop stops once good is false
+			} else {
+				prev = curr;
+			}
+		}
+#endif // COUNT_DUPLICANTS
 	}
+#ifdef COUNT_DUPLICANTS
+	printf("Duplications are %zu out of %zu, which is %f percent\n", dups, in1.size(), (in1.size() > 0) ? (100.0 * dups) / in1.size() : 0.0); // %zu: both counts are size_t; empty input reports 0 percent
+#endif // COUNT_DUPLICANTS
+
 #endif // !MEASURE_ONLY
 
 	printf("Results:\n\n");