diff --git a/neargoodsort_and_merge_ideas.md b/neargoodsort_and_merge_ideas.md index 42486f1..ce8fb21 100644 --- a/neargoodsort_and_merge_ideas.md +++ b/neargoodsort_and_merge_ideas.md @@ -96,6 +96,34 @@ On the worst case its clearly O(n^2) because we always just get a single element Space analysis is roughly same as the non-optimized merge sort - see below for space optimized merge steps - maybe useful for this to! +## Remark: Yes there is a variant in which we only need outnsi and no outsi vectors! + +For split: + +* Have i,j indices on the input +* i reads and j writes - except that when i == j no self-overwrite happens. +* read from [i] +* put (pushback) either to outnsi or [j] +* this means j <= i +* j only grows when an element was put there +* once i is past the length of the input, j tells the bounds of outsi in the input arr + +Recursion is just done as-is above... + +For merge: + +* See that the input has junk data at its end - exactly as many elements as the corresponding outnsi count +* so we merge from right-to-left. +* The (j - 1) tells where to read from one merge source +* The end of the vector (or popback / peek) tells where to read from other merge source +* The (i - 1) is destination where to write the larger among the two +* A merge-source index steps only when its element is moved to the destination, while i always moves. + +This ends with the input array being sorted. + +Rem.: Maybe it is worth it to set the outnsi vectors to be of length |input| / 2 or maybe even |input| to avoid vector growth at cost of more memory usage... + +Rem.: This is based on the merge-sort last idea below - which, as I said in the git commit, I think is well known, but I just came up with it myself. # A random bad inplace-merge idea diff --git a/outliersort.h b/outliersort.h new file mode 100644 index 0000000..e4d68ba --- /dev/null +++ b/outliersort.h @@ -0,0 +1,51 @@ +/** + * Sort algorithm for nearly-sorted data. 
/**
 * Outlier sort (nsort): in-place ascending sort aimed at nearly-sorted data.
 *
 * Idea:
 *  1. Split   - scan the input once, keeping a non-decreasing run compacted at
 *               the front of the array and moving every element that would
 *               break that run (an "outlier") into a temporary vector.
 *  2. Recurse - sort the outliers with the same algorithm.
 *  3. Merge   - merge the kept run and the sorted outliers from right to left
 *               into the free slots at the end of the input array.
 *
 * Runtime:
 *  - Best case O(n): already sorted input, a single pass and no outliers.
 *  - Worst case O(n^2) [e.g. strictly descending input: every level removes
 *    only one element, so the recursion is n levels deep].
 *  - NOTE(review): the original notes claimed O(n log n) for random input;
 *    that has not been verified for this implementation.
 * Space:
 *  - The outlier buffers of all active recursion levels are alive at the same
 *    time, so the worst case is O(n^2) elements and O(n) recursion depth.
 */
#ifndef OUTLIERSORT_H
#define OUTLIERSORT_H

#include <cstddef> // std::size_t
#include <cstdint> // uint32_t
#include <vector>  // can do with malloc only if needed but this spares more ram. Also can do with a stack data type..

// TODO: outliersort(...) function with parametrizable "what is an outlier?" and "init" functions!
//       why? Because when I renamed outnsi to outlier I realized the only rule is to keep the begin
//       of the array sorted but otherwise we better put into the outlier data that would make us
//       less likely to be able to fill-in more data points (think of real outliers like [1 9000 2 3 4 ...8999]

// TODO: simple to make this work with custom comparator and type..

/**
 * Sorts arr[0 .. n-1] in ascending order, in place.
 *
 * @param arr Pointer to the first element; may be null (then nothing happens).
 * @param n   Number of elements; values below 2 (including negatives) are a no-op.
 *
 * Declared inline because the definition lives in a header.
 */
inline void nsort(uint32_t *arr, int n) {
    if (arr == nullptr || n < 2) {
        return; // nothing to sort
    }
    const std::size_t len = static_cast<std::size_t>(n);

    // Temp space - worst case n - 1 elements on this recursion level.
    std::vector<uint32_t> outlier;

    // Split [read: source index, kept: write index == length of the sorted run].
    // arr[0] always starts the run; runMax is the last (largest) kept value.
    uint32_t runMax = arr[0];
    std::size_t kept = 1;
    for (std::size_t read = 1; read < len; ++read) {
        const uint32_t value = arr[read];
        if (value < runMax) {
            outlier.push_back(value); // would break the run -> set aside
        } else {
            runMax = value;           // run stays non-decreasing
            arr[kept++] = value;      // kept <= read, so nothing unread is overwritten
        }
    }

    if (outlier.empty()) {
        return; // the whole input was already sorted
    }

    // Recursion - strictly smaller problem, because arr[0] is never an outlier.
    if (outlier.size() > 1) {
        nsort(outlier.data(), static_cast<int>(outlier.size()));
    }

    // Merge-from-right. Invariant: dest == kept + pending, so the write
    // position never overtakes the unread part of the kept run.
    std::size_t pending = outlier.size();
    std::size_t dest = len;
    while (pending > 0) {
        if (kept > 0 && arr[kept - 1] > outlier[pending - 1]) {
            arr[--dest] = arr[--kept];
        } else {
            arr[--dest] = outlier[--pending];
        }
    }
    // When the outliers run out, the remaining kept elements are already in place.
}

#endif // OUTLIERSORT_H