From feed5c707700830f6980bb7dc657ea98fd1ebb7f63c6200da3879e68a0d1b14c Mon Sep 17 00:00:00 2001 From: Tyler Beckman Date: Thu, 10 Oct 2024 20:54:25 -0600 Subject: [PATCH] Radix sort go brrr --- OutputProcessor.cpp | 111 ++++++++++++++++++++++++++++++++++++++++++++ test.zsh | 2 +- 2 files changed, 112 insertions(+), 1 deletion(-) diff --git a/OutputProcessor.cpp b/OutputProcessor.cpp index 67d7f4d..2f3e440 100644 --- a/OutputProcessor.cpp +++ b/OutputProcessor.cpp @@ -3,12 +3,121 @@ #include #include #include +#include #include #include #include #include +/** + * @brief Recursively most significant digit radix sorts a vector of indexes, + * based on the alphabetical value of a vector of strings. The returned vector + * is the same index vector but re-arranged to show where the elements in the + * string vector should be placed. + * + * @param INDEXES The vector of indexes to sort + * @param VECTOR_TO_SORT The string vector to base the sort off of. This will + * not be modified, and is only used to decide where an index in the other + * vector gets placed during sort. + * @param DEPTH The current sort depth, should be 0 or not passed if called from + * outside of this function. This controls which character of strings is + * inspected during sort. + */ +void radixSortIndexes(std::vector &INDEXES, + const std::vector &VECTOR_TO_SORT, + const unsigned int DEPTH = 0) { + // Construct 26 buckets, where 0 = A, 1 = B, 2 = C, ..., 25 = Z + std::vector> buckets(26); + // Another "bucket" for words that have already been completely sorted, as + // they have no character to check at position `DEPTH` + std::optional alreadySorted = std::nullopt; + + // Pass over each index, bucketing based on the character corresponding to + // the current depth + for (size_t i = 0; i < INDEXES.size(); i++) { + const size_t INDEX_TO_SORT = INDEXES.at(i); + const std::string &WORD = VECTOR_TO_SORT.at(INDEX_TO_SORT); + + // Check if the word has any more characters to bucket. If it doesn't, + // place it in the special `alreadySorted` bucket. If it does, add it to + // the correct bucket for the current depth. + if (WORD.length() == DEPTH) { + alreadySorted = INDEX_TO_SORT; + } else { + buckets.at(WORD.at(DEPTH) - 65).push_back(INDEX_TO_SORT); + } + } + + // Recursively apply bucket sort to each bucket unless it is already + // completely sorted (has no elements or only has one). With this we cascade + // the bucketing as far as is necessary, flattening after we have reached a + // depth at which there is no more to bucket (each bucket has 0 or 1 + // elements) + for (size_t i = 0; i < buckets.size(); i++) { + std::vector &bucket = buckets.at(i); + + if (bucket.size() > 1) { + radixSortIndexes(bucket, VECTOR_TO_SORT, DEPTH + 1); + } + } + + // Flatten the buckets at the current stage. We first add the + // `alreadySorted` value (less characters should go before more characters), + // and then append each item from each bucket individually. + std::vector flattenedBucket; + if (alreadySorted.has_value()) { + flattenedBucket.push_back(alreadySorted.value()); + } + for (size_t i = 0; i < buckets.size(); i++) { + flattenedBucket.insert(flattenedBucket.end(), buckets.at(i).begin(), + buckets.at(i).end()); + } + + // Finally, replace the indexes with the sorted result + INDEXES = flattenedBucket; +} + +/** + * @brief Sorts the `words` vector (and `wordCounts` alongside) alphabetically + * using a most significant digit radix sort. + * + * @param words The list of words to sort alphabetically + * @param wordCounts The vector of word counts aligned to the `words` vector, + * which will be be adjusted based on the result of sorting `words` + */ +void radixSort(std::vector &words, + std::vector &wordCounts) { + // Create a vector of indexes the size of the amount of words we have. This + // is the vector that will actually be returned sorted in the end, where + // each element of this vector `i` is set to the index of `words` or + // `wordCounts` that belongs in position `i` when sorted. By doing this, we + // avoid having to try and pass around both the words and their + // corresponding counts throughout the sort, and can just re-assemble the + // vectors at the end. + std::vector indexVector(words.size()); + for (size_t i = 0; i < words.size(); i++) { + indexVector.push_back(i); + } + + // Sort the `indexVector` vector against the `words` vector, starting with + // depth 0 (the left-most character) + radixSortIndexes(indexVector, words); + + // Reconstruct the `words` and `wordCounts` vectors from the list of + // indexes, and replace the originals with the new ones + std::vector sortedWords; + std::vector sortedWordCounts; + + for (size_t i = 0; i < indexVector.size(); i++) { + sortedWords.push_back(words.at(indexVector.at(i))); + sortedWordCounts.push_back(wordCounts.at(indexVector.at(i))); + } + + words = sortedWords; + wordCounts = sortedWordCounts; +} + OutputProcessor::OutputProcessor() { _fileOut = std::ofstream(); _allWords = std::vector(); @@ -77,6 +186,8 @@ void OutputProcessor::analyzeWords(std::vector allWords, // Increment total word count _totalWordCount++; } + + radixSort(_uniqueWords, _wordCounts); } bool OutputProcessor::openStream() { diff --git a/test.zsh b/test.zsh index 30ba644..ccdf66d 100755 --- a/test.zsh +++ b/test.zsh @@ -3,6 +3,6 @@ for test in {aliceChapter1,greeneggsandham,happybirthday,romeoandjuliet}; do input/$test.txt output.txt EOF - delta solutions/$test.out output.txt + delta solutions/${test}_xc.out output.txt done echo "All tests finished"