Index: lib/libc/stdlib/WikiHelpers.h
===================================================================
--- /dev/null
+++ lib/libc/stdlib/WikiHelpers.h
@@ -0,0 +1,405 @@
+/* whether to give WikiSort a full-size cache, to see how it performs when given more memory */
+#define DYNAMIC_CACHE false
+
+// Gets the address of a given index in an array
+#define Index_addr(base, ind) ((base) + ((size) * (ind)))
+#define COMPARE(arg1, arg2) ((compare(arg1, arg2) == 1))
+
+size_t Min(const size_t a, const size_t b) {
+	if (a < b) return a;
+	return b;
+}
+
+size_t Max(const size_t a, const size_t b) {
+	if (a > b) return a;
+	return b;
+}
+
+typedef int (*Comparison)(const void *, const void *);
+
+/* structure to represent ranges within the array */
+typedef struct {
+	size_t start;
+	size_t end;
+} Range;
+
+size_t Range_length(Range range) { return range.end - range.start; }
+
+Range Range_new(const size_t start, const size_t end) {
+	Range range;
+	range.start = start;
+	range.end = end;
+	return range;
+}
+
+/* toolbox functions used by the sorter */
+
+void Swap(void *va, void *vb, size_t s)
+{
+  uint8_t t, *a = (uint8_t*)va, *b = (uint8_t*)vb;
+  while(s--)
+    t = a[s], a[s] = b[s], b[s] = t;
+}
+
+/* 63 -> 32, 64 -> 64, etc. */
+/* this comes from Hacker's Delight */
+/* TODO Generalize this */
+size_t FloorPowerOfTwo (const size_t value) {
+	size_t x = value;
+	x = x | (x >> 1);
+	x = x | (x >> 2);
+	x = x | (x >> 4);
+	x = x | (x >> 8);
+	x = x | (x >> 16);
+#if __LP64__
+	x = x | (x >> 32);
+#endif
+	return x - (x >> 1);
+}
+
+/* find the index of the first value within the range that is equal to array[index] */
+size_t BinaryFirst(const void *arr, const void *value, const Range range, size_t size, const Comparison compare) {
+	const char *array = (const char *)arr;
+	size_t start = range.start, end = range.end - 1;
+	if (range.start >= range.end) return range.start;
+	while (start < end) {
+		size_t mid = start + (end - start)/2;
+		if (COMPARE(Index_addr(array, mid), value))
+			start = mid + 1;
+		else
+			end = mid;
+	}
+	if (start == range.end - 1 && COMPARE(Index_addr(array, start), value)) start++;
+	return start;
+}
+
+/* find the index of the last value within the range that is equal to array[index], plus 1 */
+size_t BinaryLast(const void *arr, const void *value, const Range range, size_t size, const Comparison compare) {
+	const char *array = (const char *)arr;
+	size_t start = range.start, end = range.end - 1;
+	if (range.start >= range.end) return range.end;
+	while (start < end) {
+		size_t mid = start + (end - start)/2;
+		if (!COMPARE(value, Index_addr(array, mid)))
+			start = mid + 1;
+		else
+			end = mid;
+	}
+	if (start == range.end - 1 && !COMPARE(value, Index_addr(array, start))) start++;
+	return start;
+}
+
+/* combine a linear search with a binary search to reduce the number of comparisons in situations */
+/* where have some idea as to how many unique values there are and where the next value might be */
+size_t FindFirstForward(const void *arr, const void *value, const Range range, size_t size, const Comparison compare, const size_t unique) {
+	const char *array = (const char *)arr;
+	size_t skip, index;
+	if (Range_length(range) == 0) return range.start;
+	skip = Max(Range_length(range)/unique, 1);
+
+	for (index = range.start + skip; COMPARE(Index_addr(array, index - 1), value); index += skip)
+		if (index >= range.end - skip)
+			return BinaryFirst(array, value, Range_new(index, range.end), size, compare);
+
+	return BinaryFirst(array, value, Range_new(index - skip, index), size, compare);
+}
+
+size_t FindLastForward(const void *arr, const void *value, const Range range, size_t size, const Comparison compare, const size_t unique) {
+	const char *array = (const char *)arr;
+	size_t skip, index;
+	if (Range_length(range) == 0) return range.start;
+	skip = Max(Range_length(range)/unique, 1);
+
+	for (index = range.start + skip; !COMPARE(value, Index_addr(array, index - 1)); index += skip)
+		if (index >= range.end - skip)
+			return BinaryLast(array, value, Range_new(index, range.end), size, compare);
+
+	return BinaryLast(array, value, Range_new(index - skip, index), size, compare);
+}
+
+size_t FindFirstBackward(const void *arr, const void *value, const Range range, size_t size, const Comparison compare, const size_t unique) {
+	const char *array = (const char *)arr;
+	size_t skip, index;
+	if (Range_length(range) == 0) return range.start;
+	skip = Max(Range_length(range)/unique, 1);
+
+	for (index = range.end - skip; index > range.start && !COMPARE(Index_addr(array, index - 1), value); index -= skip)
+		if (index < range.start + skip)
+			return BinaryFirst(array, value, Range_new(range.start, index), size, compare);
+
+	return BinaryFirst(array, value, Range_new(index, index + skip), size, compare);
+}
+
+size_t FindLastBackward(const void *arr, const void *value, const Range range, size_t size, const Comparison compare, const size_t unique) {
+	const char *array = (const char *)arr;
+	size_t skip, index;
+	if (Range_length(range) == 0) return range.start;
+	skip = Max(Range_length(range)/unique, 1);
+
+	for (index = range.end - skip; index > range.start && COMPARE(value, Index_addr(array, index - 1)); index -= skip)
+		if (index < range.start + skip)
+			return BinaryLast(array, value, Range_new(range.start, index), size, compare);
+
+	return BinaryLast(array, value, Range_new(index, index + skip), size, compare);
+}
+
+/*
+ * Generic insertionsort borrowed from merge.c
+ */
+void InsertionSort(void *arr, const Range range, size_t size, const Comparison compare) {
+	char *a = (char *)arr;
+	char *ai, *s, *t, *u, tmp;
+	int i;
+
+	a = Index_addr(a, range.start);
+	size_t n = range.end - range.start;
+	for (ai = a + size; --n >= 1; ai += size)
+		for (t = ai; t > a; t -= size) {
+			u = t - size;
+			if (compare(t, u) != 1) break;
+			Swap(u, t, size);
+		}
+}
+
+/* reverse a range of values within the array */
+void Reverse(void *arr, const Range range, size_t size) {
+	char *array = (char *)arr;
+	size_t index;
+	for (index = Range_length(range)/2; index > 0; index--)
+		Swap(Index_addr(array, range.start + index - 1), Index_addr(array, range.end - index), size);
+}
+
+/* swap a series of values in the array */
+void BlockSwap(void *arr, const size_t start1, const size_t start2, const size_t block_size, size_t size) {
+	char *array = (char *)arr;
+	size_t index;
+	for (index = 0; index < block_size; index++)
+		Swap(Index_addr(array, start1 + index), Index_addr(array, start2 + index), size);
+}
+
+/* rotate the values in an array ([0 1 2 3] becomes [1 2 3 0] if we rotate by 1) */
+/* this assumes that 0 <= amount <= range.length() */
+void Rotate(void *arr, const size_t amount, const Range range, void *cache_base, const size_t cache_size, size_t size) {
+	char *array = (char *)arr;
+	char *cache = (char *)cache_base;
+	size_t split; Range range1, range2;
+	if (Range_length(range) == 0) return;
+
+	split = range.start + amount;
+	range1 = Range_new(range.start, split);
+	range2 = Range_new(split, range.end);
+
+	/* if the smaller of the two ranges fits into the cache, it's *slightly* faster copying it there and shifting the elements over */
+	if (Range_length(range1) <= Range_length(range2)) {
+		if (Range_length(range1) <= cache_size) {
+			memcpy(Index_addr(cache, 0), Index_addr(array, range1.start), Range_length(range1) * size);
+			memmove(Index_addr(array, range1.start), Index_addr(array, range2.start), Range_length(range2) * size);
+			memcpy(Index_addr(array, range1.start + Range_length(range2)), Index_addr(cache, 0), Range_length(range1) * size);
+			return;
+		}
+	} else {
+		if (Range_length(range2) <= cache_size) {
+			memcpy(Index_addr(cache, 0), Index_addr(array, range2.start), Range_length(range2) * size);
+			memmove(Index_addr(array, range2.end - Range_length(range1)), Index_addr(array, range1.start), Range_length(range1) * size);
+			memcpy(Index_addr(array, range1.start), Index_addr(cache, 0), Range_length(range2) * size);
+			return;
+		}
+	}
+
+	Reverse(array, range1, size);
+	Reverse(array, range2, size);
+	Reverse(array, range, size);
+}
+
+/* calculate how to scale the index value to the range within the array */
+/* the bottom-up merge sort only operates on values that are powers of two, */
+/* so scale down to that power of two, then use a fraction to scale back again */
+typedef struct {
+	size_t size, power_of_two;
+	size_t numerator, decimal;
+	size_t denominator, decimal_step, numerator_step;
+} WikiIterator;
+
+void WikiIterator_begin(WikiIterator *me) {
+	me->numerator = me->decimal = 0;
+}
+
+Range WikiIterator_nextRange(WikiIterator *me) {
+	size_t start = me->decimal;
+
+	me->decimal += me->decimal_step;
+	me->numerator += me->numerator_step;
+	if (me->numerator >= me->denominator) {
+		me->numerator -= me->denominator;
+		me->decimal++;
+	}
+
+	return Range_new(start, me->decimal);
+}
+
+bool WikiIterator_finished(WikiIterator *me) {
+	return (me->decimal >= me->size);
+}
+
+bool WikiIterator_nextLevel(WikiIterator *me) {
+	me->decimal_step += me->decimal_step;
+	me->numerator_step += me->numerator_step;
+	if (me->numerator_step >= me->denominator) {
+		me->numerator_step -= me->denominator;
+		me->decimal_step++;
+	}
+
+	return (me->decimal_step < me->size);
+}
+
+size_t WikiIterator_length(WikiIterator *me) {
+	return me->decimal_step;
+}
+
+WikiIterator WikiIterator_new(size_t size2, size_t min_level) {
+	WikiIterator me;
+	me.size = size2;
+	me.power_of_two = FloorPowerOfTwo(me.size);
+	me.denominator = me.power_of_two/min_level;
+	me.numerator_step = me.size % me.denominator;
+	me.decimal_step = me.size/me.denominator;
+	WikiIterator_begin(&me);
+	return me;
+}
+
+/* merge two ranges from one array and save the results into a different array */
+void MergeInto(void *arr_from, const Range A, const Range B, size_t size, const Comparison compare, void *arr_into) {
+	char *from = (char *)arr_from;
+	char *into = (char *)arr_into;
+	char *A_index = Index_addr(from, A.start), *B_index = Index_addr(from, B.start);
+	char *A_last = Index_addr(from, A.end), *B_last = Index_addr(from, B.end);
+	char *insert_index = Index_addr(into, 0);
+
+	while (true) {
+		if (!COMPARE(B_index, A_index)) {
+			for (size_t i = 0; i < size; i++)
+				 insert_index[i] = A_index[i];
+			A_index += size;
+			insert_index += size;
+			if (A_index == A_last) {
+				/* copy the remainder of B into the final array */
+				memcpy(insert_index, B_index, (B_last - B_index));
+				break;
+			}
+		} else {
+			for (size_t j = 0; j < size; j++)
+				 insert_index[j] = B_index[j];
+			B_index += size;
+			insert_index += size;
+			if (B_index == B_last) {
+				/* copy the remainder of A into the final array */
+				memcpy(insert_index, A_index, (A_last - A_index));
+				break;
+			}
+		}
+	}
+}
+
+/* merge operation using an external buffer, */
+void MergeExternal(void *arr, const Range A, const Range B, size_t size, const Comparison compare, void *cache_base) {
+	char *array = (char *)arr;
+	char *cache = (char *)cache_base;
+	/* A fits into the cache, so use that instead of the internal buffer */
+	char *A_index = Index_addr(cache, 0);
+	char *B_index = Index_addr(array, B.start);
+	char *insert_index = Index_addr(array, A.start);
+	char *A_last = Index_addr(cache, Range_length(A));
+	char *B_last = Index_addr(array, B.end);
+
+	if (Range_length(B) > 0 && Range_length(A) > 0) {
+		while (true) {
+			if (!COMPARE(B_index, A_index)) {
+				for (size_t i = 0; i < size; i++)
+					insert_index[i] = A_index[i];
+				A_index += size;
+				insert_index += size;
+				if (A_index == A_last) break;
+			} else {
+				for (size_t j = 0; j < size; j++)
+					insert_index[j] = B_index[j];
+				B_index += size;
+				insert_index += size;
+				if (B_index == B_last) break;
+			}
+		}
+	}
+
+	/* copy the remainder of A into the final array */
+	memcpy(insert_index, A_index, (A_last - A_index));
+}
+
+/* merge operation using an internal buffer */
+void MergeInternal(void *arr, const Range A, const Range B, size_t size, const Comparison compare, const Range buffer) {
+	char *array = (char *)arr;
+	/* whenever we find a value to add to the final array, swap it with the value that's already in that spot */
+	/* when this algorithm is finished, 'buffer' will contain its original contents, but in a different order */
+	size_t A_count = 0, B_count = 0, insert = 0;
+
+	if (Range_length(B) > 0 && Range_length(A) > 0) {
+		while (true) {
+			if (!COMPARE(Index_addr(array, B.start + B_count), Index_addr(array, buffer.start + A_count))) {
+				Swap(Index_addr(array, A.start + insert), Index_addr(array, buffer.start + A_count), size);
+				A_count++;
+				insert++;
+				if (A_count >= Range_length(A)) break;
+			} else {
+				Swap(Index_addr(array, A.start + insert), Index_addr(array, B.start + B_count), size);
+				B_count++;
+				insert++;
+				if (B_count >= Range_length(B)) break;
+			}
+		}
+	}
+
+	/* swap the remainder of A into the final array */
+	BlockSwap(array, buffer.start + A_count, A.start + insert, Range_length(A) - A_count, size);
+}
+
+/* merge operation without a buffer */
+void MergeInPlace(void *arr, Range A, Range B, size_t size, const Comparison compare, void *cache_base, const size_t cache_size) {
+	char *array = (char *)arr;
+	char *cache = (char *)cache_base;
+	if (Range_length(A) == 0 || Range_length(B) == 0) return;
+
+	/*
+	 this just repeatedly binary searches into B and rotates A into position.
+	 the paper suggests using the 'rotation-based Hwang and Lin algorithm' here,
+	 but I decided to stick with this because it had better situational performance
+
+	 (Hwang and Lin is designed for merging subarrays of very different sizes,
+	 but WikiSort almost always uses subarrays that are roughly the same size)
+
+	 normally this is incredibly suboptimal, but this function is only called
+	 when none of the A or B blocks in any subarray contained 2√A unique values,
+	 which places a hard limit on the number of times this will ACTUALLY need
+	 to binary search and rotate.
+
+	 according to my analysis the worst case is √A rotations performed on √A items
+	 once the constant factors are removed, which ends up being O(n)
+
+	 again, this is NOT a general-purpose solution – it only works well in this case!
+	 kind of like how the O(n^2) insertion sort is used in some places
+	 */
+
+	while (true) {
+		/* find the first place in B where the first item in A needs to be inserted */
+		size_t mid = BinaryFirst(array, Index_addr(array, A.start), B, size, compare);
+
+		/* rotate A into place */
+		size_t amount = mid - A.end;
+		Rotate(array, Range_length(A), Range_new(A.start, mid), cache, cache_size, size);
+		if (B.end == mid) break;
+
+		/* calculate the new A and B ranges */
+		B.start = mid;
+		A = Range_new(A.start + amount, B.start);
+		A.start = BinaryLast(array, Index_addr(array, A.start), A, size, compare);
+		if (Range_length(A) == 0) break;
+	}
+}
+
Index: lib/libc/stdlib/wiki.c
===================================================================
--- /dev/null
+++ lib/libc/stdlib/wiki.c
@@ -0,0 +1,669 @@
+/***********************************************************
+ WikiSort (public domain license)
+ https://github.com/BonzaiThePenguin/WikiSort
+
+ to run:
+ clang -o WikiSort.x WikiSort.c -O3
+ (or replace 'clang' with 'gcc')
+ ./WikiSort.x
+***********************************************************/
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "wikihelpers.h"
+
+/* bottom-up merge sort combined with an in-place merge algorithm for O(1) memory use */
+int WikiSort(void *arr, size_t nmemb, size_t size, Comparison compare) {
+	char *array = (char *)arr;
+/* use a small cache to speed up some of the operations */
+#if DYNAMIC_CACHE
+	size_t cache_size;
+	char *cache = NULL;
+#else
+/* since the cache size is fixed, it's still O(1) memory! */
+/* just keep in mind that making it too small ruins the point (nothing will fit into it), */
+/* and making it too large also ruins the point (so much for "low memory"!) */
+/* removing the cache entirely still gives 70% of the performance of a standard merge */
+#define CACHE_SIZE 512
+	const size_t cache_size = CACHE_SIZE;
+	char *cache = (char *)malloc(CACHE_SIZE * size);
+#endif
+
+	WikiIterator iterator;
+
+	/* if the array is of size 0, 1, 2, or 3, just sort them like so: */
+	if (nmemb < 4) {
+		if (nmemb == 3) {
+			/* hard-coded insertion sort */
+			if (COMPARE(Index_addr(array, 1), Index_addr(array, 0)))
+				Swap(Index_addr(array, 0), Index_addr(array, 1), size);
+			if (COMPARE(Index_addr(array, 2), Index_addr(array, 1))) {
+				Swap(Index_addr(array, 1), Index_addr(array, 2), size);
+				if (COMPARE(Index_addr(array, 1), Index_addr(array, 0)))
+					Swap(Index_addr(array, 0), Index_addr(array, 1), size);
+			}
+		} else if (nmemb == 2) {
+			/* swap the items if they're out of order */
+			if (COMPARE(Index_addr(array, 1), Index_addr(array, 0)))
+				Swap(Index_addr(array, 0), Index_addr(array, 1), size);
+		}
+
+		if (cache)
+			free(cache);
+
+		return 0;
+	}
+
+	/* sort groups of 4-8 items at a time using an unstable sorting network, */
+	/* but keep track of the original item orders to force it to be stable */
+	/* http://pages.ripco.net/~jgamble/nw.html */
+	iterator = WikiIterator_new(nmemb, 4);
+	WikiIterator_begin(&iterator);
+	while (!WikiIterator_finished(&iterator)) {
+		uint8_t order[] = {0, 1, 2, 3, 4, 5, 6, 7};
+		Range range = WikiIterator_nextRange(&iterator);
+
+#define SWAP(x, y)                                                                                                                                                                                          \
+	if (COMPARE(Index_addr(array, range.start + y), Index_addr(array, range.start + x)) || (order[x] > order[y] && !COMPARE(Index_addr(array, range.start + x), Index_addr(array, range.start + y)))) { \
+		Swap(Index_addr(array, range.start + x), Index_addr(array, range.start + y), size);                                                                                                               \
+		Swap(&(order[x]), &(order[y]), sizeof(uint8_t));                                                                                                                                                                   \
+	}
+
+		if (Range_length(range) == 8) {
+			SWAP(0, 1);
+			SWAP(2, 3);
+			SWAP(4, 5);
+			SWAP(6, 7);
+			SWAP(0, 2);
+			SWAP(1, 3);
+			SWAP(4, 6);
+			SWAP(5, 7);
+			SWAP(1, 2);
+			SWAP(5, 6);
+			SWAP(0, 4);
+			SWAP(3, 7);
+			SWAP(1, 5);
+			SWAP(2, 6);
+			SWAP(1, 4);
+			SWAP(3, 6);
+			SWAP(2, 4);
+			SWAP(3, 5);
+			SWAP(3, 4);
+
+		} else if (Range_length(range) == 7) {
+			SWAP(1, 2);
+			SWAP(3, 4);
+			SWAP(5, 6);
+			SWAP(0, 2);
+			SWAP(3, 5);
+			SWAP(4, 6);
+			SWAP(0, 1);
+			SWAP(4, 5);
+			SWAP(2, 6);
+			SWAP(0, 4);
+			SWAP(1, 5);
+			SWAP(0, 3);
+			SWAP(2, 5);
+			SWAP(1, 3);
+			SWAP(2, 4);
+			SWAP(2, 3);
+
+		} else if (Range_length(range) == 6) {
+			SWAP(1, 2);
+			SWAP(4, 5);
+			SWAP(0, 2);
+			SWAP(3, 5);
+			SWAP(0, 1);
+			SWAP(3, 4);
+			SWAP(2, 5);
+			SWAP(0, 3);
+			SWAP(1, 4);
+			SWAP(2, 4);
+			SWAP(1, 3);
+			SWAP(2, 3);
+
+		} else if (Range_length(range) == 5) {
+			SWAP(0, 1);
+			SWAP(3, 4);
+			SWAP(2, 4);
+			SWAP(2, 3);
+			SWAP(1, 4);
+			SWAP(0, 3);
+			SWAP(0, 2);
+			SWAP(1, 3);
+			SWAP(1, 2);
+
+		} else if (Range_length(range) == 4) {
+			SWAP(0, 1);
+			SWAP(2, 3);
+			SWAP(0, 2);
+			SWAP(1, 3);
+			SWAP(1, 2);
+		}
+	}
+
+	if (nmemb < 8) {
+		if (cache)
+			free(cache);
+		return 0;
+	}
+
+#if DYNAMIC_CACHE
+	/* good choices for the cache size are: */
+	/* (size + 1)/2 – turns into a full-speed standard merge sort since everything fits into the cache */
+	cache_size = (nmemb + 1) / 2;
+	cache = (char *)malloc(cache_size * size);
+
+	if (!cache) {
+		/* sqrt((size + 1)/2) + 1 – this will be the size of the A blocks at the largest level of merges, */
+		/* so a buffer of this size would allow it to skip using internal or in-place merges for anything */
+		cache_size = sqrt(cache_size) + 1;
+		cache = (char *)malloc(cache_size * size);
+
+		if (!cache) {
+			/* 512 – chosen from careful testing as a good balance between fixed-size memory use and run time */
+			if (cache_size > 512) {
+				cache_size = 512;
+				cache = (char *)malloc(cache_size * size);
+			}
+
+			/* 0 – if the system simply cannot allocate any extra memory whatsoever, no memory works just fine */
+			if (!cache)
+				cache_size = 0;
+		}
+	}
+#endif
+	/* then merge sort the higher levels, which can be 8-15, 16-31, 32-63, 64-127, etc. */
+	while (true) {
+		/* if every A and B block will fit into the cache, use a special branch specifically for merging with the cache */
+		/* (we use < rather than <= since the block size might be one more than iterator.length()) */
+		if (WikiIterator_length(&iterator) < cache_size) {
+			/* if four subarrays fit into the cache, it's faster to merge both pairs of subarrays into the cache, */
+			/* then merge the two merged subarrays from the cache back into the original array */
+			if ((WikiIterator_length(&iterator) + 1) * 4 <= cache_size && WikiIterator_length(&iterator) * 4 <= nmemb) {
+				WikiIterator_begin(&iterator);
+				while (!WikiIterator_finished(&iterator)) {
+					/* merge A1 and B1 into the cache */
+					Range A1, B1, A2, B2, A3, B3;
+					A1 = WikiIterator_nextRange(&iterator);
+					B1 = WikiIterator_nextRange(&iterator);
+					A2 = WikiIterator_nextRange(&iterator);
+					B2 = WikiIterator_nextRange(&iterator);
+
+					if (COMPARE(Index_addr(array, B1.end - 1), Index_addr(array, A1.start))) {
+						/* the two ranges are in reverse order, so copy them in reverse order into the cache */
+						memcpy(Index_addr(cache, Range_length(B1)), Index_addr(array, A1.start), Range_length(A1) * size);
+						memcpy(Index_addr(cache, 0), Index_addr(array, B1.start), Range_length(B1) * size);
+					} else if (COMPARE(Index_addr(array, B1.start), Index_addr(array, A1.end - 1))) {
+						/* these two ranges weren't already in order, so merge them into the cache */
+						MergeInto(array, A1, B1, size, compare, Index_addr(cache, 0));
+					} else {
+						/* if A1, B1, A2, and B2 are all in order, skip doing anything else */
+						if (!COMPARE(Index_addr(array, B2.start), Index_addr(array, A2.end - 1)) && !COMPARE(Index_addr(array, A2.start), Index_addr(array, B1.end - 1)))
+							continue;
+
+						/* copy A1 and B1 into the cache in the same order */
+						memcpy(Index_addr(cache, 0), Index_addr(array, A1.start), Range_length(A1) * size);
+						memcpy(Index_addr(cache, Range_length(A1)), Index_addr(array, B1.start), Range_length(B1) * size);
+					}
+					A1 = Range_new(A1.start, B1.end);
+
+					/* merge A2 and B2 into the cache */
+					if (COMPARE(Index_addr(array, B2.end - 1), Index_addr(array, A2.start))) {
+						/* the two ranges are in reverse order, so copy them in reverse order into the cache */
+						memcpy(Index_addr(cache, Range_length(A1) + Range_length(B2)), Index_addr(array, A2.start), Range_length(A2) * size);
+						memcpy(Index_addr(cache, Range_length(A1)), Index_addr(array, B2.start), Range_length(B2) * size);
+					} else if (COMPARE(Index_addr(array, B2.start), Index_addr(array, A2.end - 1))) {
+						/* these two ranges weren't already in order, so merge them into the cache */
+						MergeInto(array, A2, B2, size, compare, Index_addr(cache, Range_length(A1)));
+					} else {
+						/* copy A2 and B2 into the cache in the same order */
+						memcpy(Index_addr(cache, Range_length(A1)), Index_addr(array, A2.start), Range_length(A2) * size);
+						memcpy(Index_addr(cache, Range_length(A1) + Range_length(A2)), Index_addr(array, B2.start), Range_length(B2) * size);
+					}
+					A2 = Range_new(A2.start, B2.end);
+
+					/* merge A1 and A2 from the cache into the array */
+					A3 = Range_new(0, Range_length(A1));
+					B3 = Range_new(Range_length(A1), Range_length(A1) + Range_length(A2));
+
+					if (COMPARE(Index_addr(cache, B3.end - 1), Index_addr(cache, A3.start))) {
+						/* the two ranges are in reverse order, so copy them in reverse order into the array */
+						memcpy(Index_addr(array, A1.start + Range_length(A2)), Index_addr(cache, A3.start), Range_length(A3) * size);
+						memcpy(Index_addr(array, A1.start), Index_addr(cache, B3.start), Range_length(B3) * size);
+					} else if (COMPARE(Index_addr(cache, B3.start), Index_addr(cache, A3.end - 1))) {
+						/* these two ranges weren't already in order, so merge them back into the array */
+						MergeInto(cache, A3, B3, size, compare, Index_addr(array, A1.start));
+					} else {
+						/* copy A3 and B3 into the array in the same order */
+						memcpy(Index_addr(array, A1.start), Index_addr(cache, A3.start), Range_length(A3) * size);
+						memcpy(Index_addr(array, A1.start + Range_length(A1)), Index_addr(cache, B3.start), Range_length(B3) * size);
+					}
+				}
+
+				/* we merged two levels at the same time, so we're done with this level already */
+				/* (iterator.nextLevel() is called again at the bottom of this outer merge loop) */
+				WikiIterator_nextLevel(&iterator);
+
+			} else {
+				WikiIterator_begin(&iterator);
+				while (!WikiIterator_finished(&iterator)) {
+					Range A = WikiIterator_nextRange(&iterator);
+					Range B = WikiIterator_nextRange(&iterator);
+
+					if (COMPARE(Index_addr(array, B.end - 1), Index_addr(array, A.start))) {
+						/* the two ranges are in reverse order, so a simple rotation should fix it */
+						Rotate(array, Range_length(A), Range_new(A.start, B.end), cache, cache_size, size);
+					} else if (COMPARE(Index_addr(array, B.start), Index_addr(array, A.end - 1))) {
+						/* these two ranges weren't already in order, so we'll need to merge them! */
+						memcpy(Index_addr(cache, 0), Index_addr(array, A.start), Range_length(A) * size);
+						MergeExternal(array, A, B, size, compare, cache);
+					}
+				}
+			}
+		} else {
+			/* this is where the in-place merge logic starts!
+			 1. pull out two internal buffers each containing √A unique values
+				1a. adjust block_size and buffer_size if we couldn't find enough unique values
+			 2. loop over the A and B subarrays within this level of the merge sort
+			 3. break A and B into blocks of size 'block_size'
+			 4. "tag" each of the A blocks with values from the first internal buffer
+			 5. roll the A blocks through the B blocks and drop/rotate them where they belong
+			 6. merge each A block with any B values that follow, using the cache or the second internal buffer
+			 7. sort the second internal buffer if it exists
+			 8. redistribute the two internal buffers back into the array */
+
+			size_t block_size = sqrt(WikiIterator_length(&iterator));
+			size_t buffer_size = WikiIterator_length(&iterator) / block_size + 1;
+
+			/* as an optimization, we really only need to pull out the internal buffers once for each level of merges */
+			/* after that we can reuse the same buffers over and over, then redistribute it when we're finished with this level */
+			Range buffer1, buffer2, A, B;
+			bool find_separately;
+			size_t index, last, count, find, start, pull_index = 0;
+			struct {
+				size_t from, to, count;
+				Range range;
+			} pull[2];
+			pull[0].from = pull[0].to = pull[0].count = 0;
+			pull[0].range = Range_new(0, 0);
+			pull[1].from = pull[1].to = pull[1].count = 0;
+			pull[1].range = Range_new(0, 0);
+
+			buffer1 = Range_new(0, 0);
+			buffer2 = Range_new(0, 0);
+
+			/* find two internal buffers of size 'buffer_size' each */
+			find = buffer_size + buffer_size;
+			find_separately = false;
+
+			if (block_size <= cache_size) {
+				/* if every A block fits into the cache then we won't need the second internal buffer, */
+				/* so we really only need to find 'buffer_size' unique values */
+				find = buffer_size;
+			} else if (find > WikiIterator_length(&iterator)) {
+				/* we can't fit both buffers into the same A or B subarray, so find two buffers separately */
+				find = buffer_size;
+				find_separately = true;
+			}
+
+			/* we need to find either a single contiguous space containing 2√A unique values (which will be split up into two buffers of size √A each), */
+			/* or we need to find one buffer of < 2√A unique values, and a second buffer of √A unique values, */
+			/* OR if we couldn't find that many unique values, we need the largest possible buffer we can get */
+
+			/* in the case where it couldn't find a single buffer of at least √A unique values, */
+			/* all of the Merge steps must be replaced by a different merge algorithm (MergeInPlace) */
+			WikiIterator_begin(&iterator);
+			while (!WikiIterator_finished(&iterator)) {
+				A = WikiIterator_nextRange(&iterator);
+				B = WikiIterator_nextRange(&iterator);
+
+/* just store information about where the values will be pulled from and to, */
+/* as well as how many values there are, to create the two internal buffers */
+#define PULL(_to)                                           \
+	pull[pull_index].range = Range_new(A.start, B.end); \
+	pull[pull_index].count = count;                     \
+	pull[pull_index].from = index;                      \
+	pull[pull_index].to = _to
+
+				/* check A for the number of unique values we need to fill an internal buffer */
+				/* these values will be pulled out to the start of A */
+				for (last = A.start, count = 1; count < find; last = index, count++) {
+					index = FindLastForward(array, Index_addr(array, last), Range_new(last + 1, A.end), size, compare, find - count);
+					if (index == A.end)
+						break;
+				}
+				index = last;
+
+				if (count >= buffer_size) {
+					/* keep track of the range within the array where we'll need to "pull out" these values to create the internal buffer */
+					PULL(A.start);
+					pull_index = 1;
+
+					if (count == buffer_size + buffer_size) {
+						/* we were able to find a single contiguous section containing 2√A unique values, */
+						/* so this section can be used to contain both of the internal buffers we'll need */
+						buffer1 = Range_new(A.start, A.start + buffer_size);
+						buffer2 = Range_new(A.start + buffer_size, A.start + count);
+						break;
+					} else if (find == buffer_size + buffer_size) {
+						/* we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values, */
+						/* so we still need to find a second separate buffer of at least √A unique values */
+						buffer1 = Range_new(A.start, A.start + count);
+						find = buffer_size;
+					} else if (block_size <= cache_size) {
+						/* we found the first and only internal buffer that we need, so we're done! */
+						buffer1 = Range_new(A.start, A.start + count);
+						break;
+					} else if (find_separately) {
+						/* found one buffer, but now find the other one */
+						buffer1 = Range_new(A.start, A.start + count);
+						find_separately = false;
+					} else {
+						/* we found a second buffer in an 'A' subarray containing √A unique values, so we're done! */
+						buffer2 = Range_new(A.start, A.start + count);
+						break;
+					}
+				} else if (pull_index == 0 && count > Range_length(buffer1)) {
+					/* keep track of the largest buffer we were able to find */
+					buffer1 = Range_new(A.start, A.start + count);
+					PULL(A.start);
+				}
+
+				/* check B for the number of unique values we need to fill an internal buffer */
+				/* these values will be pulled out to the end of B */
+				for (last = B.end - 1, count = 1; count < find; last = index - 1, count++) {
+					index = FindFirstBackward(array, Index_addr(array, last), Range_new(B.start, last), size, compare, find - count);
+					if (index == B.start)
+						break;
+				}
+				index = last;
+
+				if (count >= buffer_size) {
+					/* keep track of the range within the array where we'll need to "pull out" these values to create the internal buffer */
+					PULL(B.end);
+					pull_index = 1;
+
+					if (count == buffer_size + buffer_size) {
+						/* we were able to find a single contiguous section containing 2√A unique values, */
+						/* so this section can be used to contain both of the internal buffers we'll need */
+						buffer1 = Range_new(B.end - count, B.end - buffer_size);
+						buffer2 = Range_new(B.end - buffer_size, B.end);
+						break;
+					} else if (find == buffer_size + buffer_size) {
+						/* we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values, */
+						/* so we still need to find a second separate buffer of at least √A unique values */
+						buffer1 = Range_new(B.end - count, B.end);
+						find = buffer_size;
+					} else if (block_size <= cache_size) {
+						/* we found the first and only internal buffer that we need, so we're done! */
+						buffer1 = Range_new(B.end - count, B.end);
+						break;
+					} else if (find_separately) {
+						/* found one buffer, but now find the other one */
+						buffer1 = Range_new(B.end - count, B.end);
+						find_separately = false;
+					} else {
+						/* buffer2 will be pulled out from a 'B' subarray, so if the first buffer was pulled out from the corresponding 'A' subarray, */
+						/* we need to adjust the end point for that A subarray so it knows to stop redistributing its values before reaching buffer2 */
+						if (pull[0].range.start == A.start)
+							pull[0].range.end -= pull[1].count;
+
+						/* we found a second buffer in an 'B' subarray containing √A unique values, so we're done! */
+						buffer2 = Range_new(B.end - count, B.end);
+						break;
+					}
+				} else if (pull_index == 0 && count > Range_length(buffer1)) {
+					/* keep track of the largest buffer we were able to find */
+					buffer1 = Range_new(B.end - count, B.end);
+					PULL(B.end);
+				}
+			}
+
+			/* pull out the two ranges so we can use them as internal buffers */
+			for (pull_index = 0; pull_index < 2; pull_index++) {
+				Range range;
+				size_t length = pull[pull_index].count;
+
+				if (pull[pull_index].to < pull[pull_index].from) {
+					/* we're pulling the values out to the left, which means the start of an A subarray */
+					index = pull[pull_index].from;
+					for (count = 1; count < length; count++) {
+						index = FindFirstBackward(array, Index_addr(array, index - 1), Range_new(pull[pull_index].to, pull[pull_index].from - (count - 1)), size, compare, length - count);
+						range = Range_new(index + 1, pull[pull_index].from + 1);
+						Rotate(array, Range_length(range) - count, range, cache, cache_size, size);
+						pull[pull_index].from = index + count;
+					}
+				} else if (pull[pull_index].to > pull[pull_index].from) {
+					/* we're pulling values out to the right, which means the end of a B subarray */
+					index = pull[pull_index].from + 1;
+					for (count = 1; count < length; count++) {
+						index = FindLastForward(array, Index_addr(array, index), Range_new(index, pull[pull_index].to), size, compare, length - count);
+						range = Range_new(pull[pull_index].from, index - 1);
+						Rotate(array, count, range, cache, cache_size, size);
+						pull[pull_index].from = index - 1 - count;
+					}
+				}
+			}
+
+			/* adjust block_size and buffer_size based on the values we were able to pull out */
+			buffer_size = Range_length(buffer1);
+			block_size = WikiIterator_length(&iterator) / buffer_size + 1;
+
+			/* the first buffer NEEDS to be large enough to tag each of the evenly sized A blocks, */
+			/* so this was originally here to test the math for adjusting block_size above */
+			/* assert((WikiIterator_length(&iterator) + 1)/block_size <= buffer_size); */
+
+			/* now that the two internal buffers have been created, it's time to merge each A+B combination at this level of the merge sort! */
+			WikiIterator_begin(&iterator);
+			while (!WikiIterator_finished(&iterator)) {
+				A = WikiIterator_nextRange(&iterator);
+				B = WikiIterator_nextRange(&iterator);
+
+				/* remove any parts of A or B that are being used by the internal buffers */
+				start = A.start;
+				if (start == pull[0].range.start) {
+					if (pull[0].from > pull[0].to) {
+						A.start += pull[0].count;
+
+						/* if the internal buffer takes up the entire A or B subarray, then there's nothing to merge */
+						/* this only happens for very small subarrays, like √4 = 2, 2 * (2 internal buffers) = 4, */
+						/* which also only happens when cache_size is small or 0 since it'd otherwise use MergeExternal */
+						if (Range_length(A) == 0)
+							continue;
+					} else if (pull[0].from < pull[0].to) {
+						B.end -= pull[0].count;
+						if (Range_length(B) == 0)
+							continue;
+					}
+				}
+				if (start == pull[1].range.start) {
+					if (pull[1].from > pull[1].to) {
+						A.start += pull[1].count;
+						if (Range_length(A) == 0)
+							continue;
+					} else if (pull[1].from < pull[1].to) {
+						B.end -= pull[1].count;
+						if (Range_length(B) == 0)
+							continue;
+					}
+				}
+
+				if (COMPARE(Index_addr(array, B.end - 1), Index_addr(array, A.start))) {
+					/* the two ranges are in reverse order, so a simple rotation should fix it */
+					Rotate(array, Range_length(A), Range_new(A.start, B.end), cache, cache_size, size);
+				} else if (COMPARE(Index_addr(array, A.end), Index_addr(array, A.end - 1))) {
+					/* these two ranges weren't already in order, so we'll need to merge them! */
+					Range blockA, firstA, lastA, lastB, blockB;
+					size_t indexA, findA;
+
+					/* break the remainder of A into blocks. firstA is the uneven-sized first A block */
+					blockA = Range_new(A.start, A.end);
+					firstA = Range_new(A.start, A.start + Range_length(blockA) % block_size);
+
+					/* swap the first value of each A block with the value in buffer1 */
+					for (indexA = buffer1.start, index = firstA.end; index < blockA.end; indexA++, index += block_size) Swap(Index_addr(array, indexA), Index_addr(array, index), size);
+
+					/* start rolling the A blocks through the B blocks! */
+					/* whenever we leave an A block behind, we'll need to merge the previous A block with any B blocks that follow it, so track that information as well */
+					lastA = firstA;
+					lastB = Range_new(0, 0);
+					blockB = Range_new(B.start, B.start + Min(block_size, Range_length(B)));
+					blockA.start += Range_length(firstA);
+					indexA = buffer1.start;
+
+					/* if the first unevenly sized A block fits into the cache, copy it there for when we go to Merge it */
+					/* otherwise, if the second buffer is available, block swap the contents into that */
+					if (Range_length(lastA) <= cache_size)
+						memcpy(Index_addr(cache, 0), Index_addr(array, lastA.start), Range_length(lastA) * size);
+					else if (Range_length(buffer2) > 0)
+						BlockSwap(array, lastA.start, buffer2.start, Range_length(lastA), size);
+
+					if (Range_length(blockA) > 0) {
+						while (true) {
+							/* if there's a previous B block and the first value of the minimum A block is <= the last value of the previous B block, */
+							/* then drop that minimum A block behind. or if there are no B blocks left then keep dropping the remaining A blocks. */
+							if ((Range_length(lastB) > 0 && !COMPARE(Index_addr(array, lastB.end - 1), Index_addr(array, indexA))) || Range_length(blockB) == 0) {
+								/* figure out where to split the previous B block, and rotate it at the split */
+								size_t B_split = BinaryFirst(array, Index_addr(array, indexA), lastB, size, compare);
+								size_t B_remaining = lastB.end - B_split;
+
+								/* swap the minimum A block to the beginning of the rolling A blocks */
+								size_t minA = blockA.start;
+								for (findA = minA + block_size; findA < blockA.end; findA += block_size)
+									if (COMPARE(Index_addr(array, findA), Index_addr(array, minA)))
+										minA = findA;
+								BlockSwap(array, blockA.start, minA, block_size, size);
+
+								/* swap the first item of the previous A block back with its original value, which is stored in buffer1 */
+								Swap(Index_addr(array, blockA.start), Index_addr(array, indexA), size);
+								indexA++;
+
+								/*
+								 locally merge the previous A block with the B values that follow it
+								 if lastA fits into the external cache we'll use that (with MergeExternal),
+								 or if the second internal buffer exists we'll use that (with MergeInternal),
+								 or failing that we'll use a strictly in-place merge algorithm (MergeInPlace)
+								 */
+								if (Range_length(lastA) <= cache_size)
+									MergeExternal(array, lastA, Range_new(lastA.end, B_split), size, compare, cache);
+								else if (Range_length(buffer2) > 0)
+									MergeInternal(array, lastA, Range_new(lastA.end, B_split), size, compare, buffer2);
+								else
+									MergeInPlace(array, lastA, Range_new(lastA.end, B_split), size, compare, cache, cache_size);
+
+								if (Range_length(buffer2) > 0 || block_size <= cache_size) {
+									/* copy the previous A block into the cache or buffer2, since that's where we need it to be when we go to merge it anyway */
+									if (block_size <= cache_size)
+										memcpy(Index_addr(cache, 0), Index_addr(array, blockA.start), block_size * size);
+									else
+										BlockSwap(array, blockA.start, buffer2.start, block_size, size);
+
+									/* this is equivalent to rotating, but faster */
+									/* the area normally taken up by the A block is either the contents of buffer2, or data we don't need anymore since we memcopied it */
+									/* either way, we don't need to retain the order of those items, so instead of rotating we can just block swap B to where it belongs */
+									BlockSwap(array, B_split, blockA.start + block_size - B_remaining, B_remaining, size);
+								} else {
+									/* we are unable to use the 'buffer2' trick to speed up the rotation operation since buffer2 doesn't exist, so perform a normal rotation */
+									Rotate(array, blockA.start - B_split, Range_new(B_split, blockA.start + block_size), cache, cache_size, size);
+								}
+
+								/* update the range for the remaining A blocks, and the range remaining from the B block after it was split */
+								lastA = Range_new(blockA.start - B_remaining, blockA.start - B_remaining + block_size);
+								lastB = Range_new(lastA.end, lastA.end + B_remaining);
+
+								/* if there are no more A blocks remaining, this step is finished! */
+								blockA.start += block_size;
+								if (Range_length(blockA) == 0)
+									break;
+
+							} else if (Range_length(blockB) < block_size) {
+								/* move the last B block, which is unevenly sized, to before the remaining A blocks, by using a rotation */
+								/* the cache is disabled here since it might contain the contents of the previous A block */
+								Rotate(array, blockB.start - blockA.start, Range_new(blockA.start, blockB.end), cache, 0, size);
+
+								lastB = Range_new(blockA.start, blockA.start + Range_length(blockB));
+								blockA.start += Range_length(blockB);
+								blockA.end += Range_length(blockB);
+								blockB.end = blockB.start;
+							} else {
+								/* roll the leftmost A block to the end by swapping it with the next B block */
+								BlockSwap(array, blockA.start, blockB.start, block_size, size);
+								lastB = Range_new(blockA.start, blockA.start + block_size);
+
+								blockA.start += block_size;
+								blockA.end += block_size;
+								blockB.start += block_size;
+
+								if (blockB.end > B.end - block_size)
+									blockB.end = B.end;
+								else
+									blockB.end += block_size;
+							}
+						}
+					}
+
+					/* merge the last A block with the remaining B values */
+					if (Range_length(lastA) <= cache_size)
+						MergeExternal(array, lastA, Range_new(lastA.end, B.end), size, compare, cache);
+					else if (Range_length(buffer2) > 0)
+						MergeInternal(array, lastA, Range_new(lastA.end, B.end), size, compare, buffer2);
+					else
+						MergeInPlace(array, lastA, Range_new(lastA.end, B.end), size, compare, cache, cache_size);
+				}
+			}
+
+			/* when we're finished with this merge step we should have the one or two internal buffers left over, where the second buffer is all jumbled up */
+			/* insertion sort the second buffer, then redistribute the buffers back into the array using the opposite process used for creating the buffer */
+
+			/* while an unstable sort like quicksort could be applied here, in benchmarks it was consistently slightly slower than a simple insertion sort, */
+			/* even for tens of millions of items. this may be because insertion sort is quite fast when the data is already somewhat sorted, like it is here */
+			InsertionSort(array, buffer2, size, compare);
+
+			for (pull_index = 0; pull_index < 2; pull_index++) {
+				size_t amount, unique = pull[pull_index].count * 2;
+				if (pull[pull_index].from > pull[pull_index].to) {
+					/* the values were pulled out to the left, so redistribute them back to the right */
+					Range buffer = Range_new(pull[pull_index].range.start, pull[pull_index].range.start + pull[pull_index].count);
+					while (Range_length(buffer) > 0) {
+						index = FindFirstForward(array, Index_addr(array, buffer.start), Range_new(buffer.end, pull[pull_index].range.end), size, compare, unique);
+						amount = index - buffer.end;
+						Rotate(array, Range_length(buffer), Range_new(buffer.start, index), cache, cache_size, size);
+						buffer.start += (amount + 1);
+						buffer.end += amount;
+						unique -= 2;
+					}
+				} else if (pull[pull_index].from < pull[pull_index].to) {
+					/* the values were pulled out to the right, so redistribute them back to the left */
+					Range buffer = Range_new(pull[pull_index].range.end - pull[pull_index].count, pull[pull_index].range.end);
+					while (Range_length(buffer) > 0) {
+						index = FindLastBackward(array, Index_addr(array, buffer.end - 1), Range_new(pull[pull_index].range.start, buffer.start), size, compare, unique);
+						amount = buffer.start - index;
+						Rotate(array, amount, Range_new(index, buffer.end), cache, cache_size, size);
+						buffer.start -= amount;
+						buffer.end -= (amount + 1);
+						unique -= 2;
+					}
+				}
+			}
+		}
+
+		/* double the size of each A and B subarray that will be merged in the next level */
+		if (!WikiIterator_nextLevel(&iterator))
+			break;
+	}
+
+	if (cache)
+		free(cache);
+	return 0;
+#undef CACHE_SIZE
+}