Radix sort go brrr

2024-10-10 20:54:25 -06:00 · 2024-10-10 20:54:25 -06:00 · feed5c7077
commit feed5c7077
parent a25f823f9d
2 changed files with 112 additions and 1 deletions
--- a/OutputProcessor.cpp
+++ b/OutputProcessor.cpp
@ -3,12 +3,121 @@
 #include <fstream>
 #include <iomanip>
 #include <iostream>
+#include <optional>
 #include <ostream>
 #include <string>
 #include <vector>

 #include <cstdint>

+/**
+ * @brief Recursively most significant digit radix sorts a vector of indexes,
+ * based on the alphabetical value of a vector of strings. The returned vector
+ * is the same index vector but re-arranged to show where the elements in the
+ * string vector should be placed.
+ *
+ * @param INDEXES The vector of indexes to sort
+ * @param VECTOR_TO_SORT The string vector to base the sort off of. This will
+ * not be modified, and is only used to decide where an index in the other
+ * vector gets placed during sort.
+ * @param DEPTH The current sort depth, should be 0 or not passed if called from
+ * outside of this function. This controls which character of strings is
+ * inspected during sort.
+ */
+void radixSortIndexes(std::vector<size_t> &INDEXES,
+					  const std::vector<std::string> &VECTOR_TO_SORT,
+					  const unsigned int DEPTH = 0) {
+	// Construct 26 buckets, where 0 = A, 1 = B, 2 = C, ..., 25 = Z
+	std::vector<std::vector<size_t>> buckets(26);
+	// Another "bucket" for words that have already been completely sorted, as
+	// they have no character to check at position `DEPTH`
+	std::optional<size_t> alreadySorted = std::nullopt;
+
+	// Pass over each index, bucketing based on the character corresponding to
+	// the current depth
+	for (size_t i = 0; i < INDEXES.size(); i++) {
+		const size_t INDEX_TO_SORT = INDEXES.at(i);
+		const std::string &WORD = VECTOR_TO_SORT.at(INDEX_TO_SORT);
+
+		// Check if the word has any more characters to bucket. If it doesn't,
+		// place it in the special `alreadySorted` bucket. If it does, add it to
+		// the correct bucket for the current depth.
+		if (WORD.length() == DEPTH) {
+			alreadySorted = INDEX_TO_SORT;
+		} else {
+			buckets.at(WORD.at(DEPTH) - 65).push_back(INDEX_TO_SORT);
+		}
+	}
+
+	// Recursively apply bucket sort to each bucket unless it is already
+	// completely sorted (has no elements or only has one). With this we cascade
+	// the bucketing as far as is necessary, flattening after we have reached a
+	// depth at which there is no more to bucket (each bucket has 0 or 1
+	// elements)
+	for (size_t i = 0; i < buckets.size(); i++) {
+		std::vector<size_t> &bucket = buckets.at(i);
+
+		if (bucket.size() > 1) {
+			radixSortIndexes(bucket, VECTOR_TO_SORT, DEPTH + 1);
+		}
+	}
+
+	// Flatten the buckets at the current stage. We first add the
+	// `alreadySorted` value (less characters should go before more characters),
+	// and then append each item from each bucket individually.
+	std::vector<size_t> flattenedBucket;
+	if (alreadySorted.has_value()) {
+		flattenedBucket.push_back(alreadySorted.value());
+	}
+	for (size_t i = 0; i < buckets.size(); i++) {
+		flattenedBucket.insert(flattenedBucket.end(), buckets.at(i).begin(),
+							   buckets.at(i).end());
+	}
+
+	// Finally, replace the indexes with the sorted result
+	INDEXES = flattenedBucket;
+}
+
+/**
+ * @brief Sorts the `words` vector (and `wordCounts` alongside) alphabetically
+ * using a most significant digit radix sort.
+ *
+ * @param words The list of words to sort alphabetically
+ * @param wordCounts The vector of word counts aligned to the `words` vector,
+ * which will be be adjusted based on the result of sorting `words`
+ */
+void radixSort(std::vector<std::string> &words,
+			   std::vector<unsigned int> &wordCounts) {
+	// Create a vector of indexes the size of the amount of words we have. This
+	// is the vector that will actually be returned sorted in the end, where
+	// each element of this vector `i` is set to the index of `words` or
+	// `wordCounts` that belongs in position `i` when sorted. By doing this, we
+	// avoid having to try and pass around both the words and their
+	// corresponding counts throughout the sort, and can just re-assemble the
+	// vectors at the end.
+	std::vector<size_t> indexVector(words.size());
+	for (size_t i = 0; i < words.size(); i++) {
+		indexVector.push_back(i);
+	}
+
+	// Sort the `indexVector` vector against the `words` vector, starting with
+	// depth 0 (the left-most character)
+	radixSortIndexes(indexVector, words);
+
+	// Reconstruct the `words` and `wordCounts` vectors from the list of
+	// indexes, and replace the originals with the new ones
+	std::vector<std::string> sortedWords;
+	std::vector<unsigned int> sortedWordCounts;
+
+	for (size_t i = 0; i < indexVector.size(); i++) {
+		sortedWords.push_back(words.at(indexVector.at(i)));
+		sortedWordCounts.push_back(wordCounts.at(indexVector.at(i)));
+	}
+
+	words = sortedWords;
+	wordCounts = sortedWordCounts;
+}
+
 OutputProcessor::OutputProcessor() {
 	_fileOut = std::ofstream();
 	_allWords = std::vector<std::string>();
@ -77,6 +186,8 @@ void OutputProcessor::analyzeWords(std::vector<std::string> allWords,
 		// Increment total word count
 		_totalWordCount++;
 	}
+
+	radixSort(_uniqueWords, _wordCounts);
 }

 bool OutputProcessor::openStream() {
--- a/test.zsh
+++ b/test.zsh
@ -3,6 +3,6 @@ for test in {aliceChapter1,greeneggsandham,happybirthday,romeoandjuliet}; do
 		input/$test.txt
 		output.txt
 		EOF
-    delta solutions/$test.out output.txt
+    delta solutions/${test}_xc.out output.txt
 done
 echo "All tests finished"