diff --git a/neargoodsort_and_merge_ideas.md b/neargoodsort_and_merge_ideas.md
index 42486f1..ce8fb21 100644
--- a/neargoodsort_and_merge_ideas.md
+++ b/neargoodsort_and_merge_ideas.md
@@ -96,6 +96,34 @@ On the worst case its clearly O(n^2) because we always just get a single element
 
 Space analysis is roughly same as the non-optimized merge sort - see below for space optimized merge steps - maybe useful for this to!
 
+## Remark: Yes there is a variant in which we only need outnsi and no outsi vectors!
+
+For split:
+
+* Have i,j indices on the input
+* i reads and j writes - except that when i == j nothing is copied (no self-overwrite happens).
+* read from [i]
+* put (pushback) either to outnsi or [j]
+* this means j <= i
+* j only advances when an element was written to [j]
+* once i has passed the end of the input, j marks the end of outsi inside the input array
+
+Recursion just done as-is above...
+
+For merge:
+
+* Note that the input now has junk data at its end - exactly as many elements as the corresponding outnsi holds
+* so we merge from right-to-left. 
+* The (j - 1) tells where to read from one merge source
+* The end of the vector (or popback / peek) tells where to read from other merge source
+* The (i - 1) is destination where to write the smaller among the two
+* The corresponding merge-source-indices step only when they are moved to destination and i always moves.
+
+This ends with the input array being sorted.
+
+Rem.: Maybe it is worth it to set the outnsi vectors to be of length |input| / 2 or maybe even |input| to avoid vector growth at cost of more memory usage...
+
+Rem.: This is based on the last merge-sort idea below - which, as I said in the git commit, I think is well known, but I just came up with it myself.
 
 # A random bad inplace-merge idea
 
diff --git a/outliersort.h b/outliersort.h
new file mode 100644
index 0000000..e4d68ba
--- /dev/null
+++ b/outliersort.h
@@ -0,0 +1,51 @@
+/**
+ * Sort algorithm for nearly-sorted data.
+ *
+ * Runtime:
+ * - Worst case O(n^2) [smallrange / descending]
+ * - random case O(nlogn)
+ * Space: same as runtime
+ */
+#include <vector> // can do with malloc only if needed but this spares more ram. Also can do with a stack data type..
+#include <cstdint> // uint32_t
+
+// TODO: outliersort(...) function with parametrizable "what is an outlier?" and "init" functions!
+//       why? Because when I renamed outnsi to outlier I realized the only rule is to keep the begin
+//       of the array sorted but otherwise we better put into the outlier data that would make us
+//       less likely to be able to fill-in more data points (think of real outliers like [1 9000 2 3 4 ...8999])
+
+// TODO: simple to make this work with custom comparator and type..
/**
 * Sorts arr[0..n) in ascending order, in place.
 *
 * Steps:
 *  1. Split: walk the input once, keeping every element that is >= the
 *     running maximum compacted at the front of arr (this prefix is sorted
 *     by construction). Everything else is appended to the `outlier` vector.
 *  2. Recurse: sort the outliers with the same algorithm.
 *  3. Merge: merge the kept prefix and the sorted outliers from right to
 *     left, writing into the tail of arr that the split left unused.
 *
 * A strictly descending input keeps only one element per recursion level, so
 * the recursion depth grows linearly with n in that case.
 *
 * @param arr  pointer to the first element; may be null only if n <= 0
 * @param n    number of elements; values < 2 (including negative) are a no-op
 */
void nsort(uint32_t *arr, int n) {
	using index_t = std::vector<uint32_t>::size_type;

	// Nothing to sort; also keeps a negative n from being read as a huge unsigned length.
	if(arr == nullptr || n < 2) {
		return;
	}
	const index_t len = static_cast<index_t>(n);

	// Temp space - worst case O(n) size
	std::vector<uint32_t> outlier;

	// Split [i: read, kept: write]. Invariant: kept <= i and arr[0..kept) is sorted.
	uint32_t max = arr[0];
	index_t kept = 1;
	for(index_t i = 1; i < len; ++i) {
		const uint32_t value = arr[i];
		if(value < max) {
			outlier.push_back(value);
		} else {
			// The running maximum must follow the kept prefix, otherwise the prefix is not sorted.
			max = value;
			if(kept != i) {
				arr[kept] = value;
			}
			++kept;
		}
	}

	if(outlier.empty()) {
		// Input was already sorted.
		return;
	}

	// Recursion (a single outlier is trivially sorted, but still has to be merged below)
	if(outlier.size() > 1) {
		nsort(&outlier[0], static_cast<int>(outlier.size()));
	}

	// Merge-from-right. All three indices are one-past positions so they can reach 0
	// without unsigned wrap-around. Invariant: write == j + k.
	index_t write = len;            // next destination is arr[write - 1]
	index_t j = kept;               // next kept-prefix source is arr[j - 1]
	index_t k = outlier.size();     // next outlier source is outlier[k - 1]
	while(k > 0) {
		if(j > 0 && arr[j - 1] > outlier[k - 1]) {
			arr[--write] = arr[--j];
		} else {
			arr[--write] = outlier[--k];
		}
	}
	// Once the outliers are used up, the remaining arr[0..j) is already in its final place.
}