Radix sort go brrr
This commit is contained in:
parent a25f823f9d
commit feed5c7077
2 changed files with 112 additions and 1 deletion
@@ -3,12 +3,121 @@
#include <fstream>
#include <iomanip>
#include <iostream>
#include <optional>
#include <ostream>
#include <string>
#include <vector>

#include <cstdint>

/**
 * @brief Recursively sorts a vector of indexes with a most significant digit
 * radix sort, using the alphabetical order of a vector of strings as the key.
 * On return, `INDEXES` holds the same indexes, re-arranged to show where the
 * elements in the string vector should be placed.
 *
 * @param INDEXES The vector of indexes to sort
 * @param VECTOR_TO_SORT The string vector to base the sort off of. This will
 * not be modified, and is only used to decide where an index in the other
 * vector gets placed during the sort.
 * @param DEPTH The current sort depth; should be 0 (or not passed) when called
 * from outside this function. This controls which character of each string is
 * inspected during the sort.
 */
void radixSortIndexes(std::vector<size_t> &INDEXES,
                      const std::vector<std::string> &VECTOR_TO_SORT,
                      const unsigned int DEPTH = 0) {
  // Construct 26 buckets, where 0 = A, 1 = B, 2 = C, ..., 25 = Z
  std::vector<std::vector<size_t>> buckets(26);
  // Another "bucket" for a word that has already been completely sorted, as
  // it has no character left to check at position `DEPTH`. A single slot
  // suffices: every word reaching this call shares the same `DEPTH`-character
  // prefix, so at most one (unique) word can end exactly here.
  std::optional<size_t> alreadySorted = std::nullopt;

  // Pass over each index, bucketing based on the character corresponding to
  // the current depth
  for (size_t i = 0; i < INDEXES.size(); i++) {
    const size_t INDEX_TO_SORT = INDEXES.at(i);
    const std::string &WORD = VECTOR_TO_SORT.at(INDEX_TO_SORT);

    // Check if the word has any more characters to bucket. If it doesn't,
    // place it in the special `alreadySorted` bucket. If it does, add it to
    // the correct bucket for the current depth.
    if (WORD.length() == DEPTH) {
      alreadySorted = INDEX_TO_SORT;
    } else {
      // Map 'A'..'Z' to buckets 0..25 (assumes upper-case input)
      buckets.at(WORD.at(DEPTH) - 'A').push_back(INDEX_TO_SORT);
    }
  }

  // Recursively apply the bucketing to each bucket unless it is already
  // completely sorted (has no elements or only one). With this we cascade the
  // bucketing as far as necessary, flattening once we reach a depth at which
  // there is nothing left to bucket (each bucket has 0 or 1 elements)
  for (size_t i = 0; i < buckets.size(); i++) {
    std::vector<size_t> &bucket = buckets.at(i);

    if (bucket.size() > 1) {
      radixSortIndexes(bucket, VECTOR_TO_SORT, DEPTH + 1);
    }
  }

  // Flatten the buckets at the current depth. We first add the
  // `alreadySorted` value (fewer characters sort before more characters), and
  // then append each item from each bucket in order.
  std::vector<size_t> flattenedBucket;
  if (alreadySorted.has_value()) {
    flattenedBucket.push_back(alreadySorted.value());
  }
  for (size_t i = 0; i < buckets.size(); i++) {
    flattenedBucket.insert(flattenedBucket.end(), buckets.at(i).begin(),
                           buckets.at(i).end());
  }

  // Finally, replace the indexes with the sorted result
  INDEXES = flattenedBucket;
}
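
To make the recursion concrete, here is a minimal sketch with hypothetical sample words (it assumes `radixSortIndexes` above is in scope): at depth 0, "AT" and "APPLE" land together in the A bucket and recurse to depth 1, where 'P' < 'T' puts "APPLE" first, while "BAT" sits alone in the B bucket and recurses no further.

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Hypothetical sample data, not taken from the program above
  const std::vector<std::string> WORDS = {"BAT", "AT", "APPLE"};
  std::vector<size_t> indexes = {0, 1, 2};

  radixSortIndexes(indexes, WORDS);

  for (const size_t INDEX : indexes) {
    std::cout << WORDS.at(INDEX) << '\n'; // Prints APPLE, AT, BAT
  }

  return 0;
}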

/**
 * @brief Sorts the `words` vector (and `wordCounts` alongside it)
 * alphabetically using a most significant digit radix sort.
 *
 * @param words The list of words to sort alphabetically
 * @param wordCounts The vector of word counts aligned to the `words` vector,
 * which will be rearranged to match the sorted order of `words`
 */
void radixSort(std::vector<std::string> &words,
               std::vector<unsigned int> &wordCounts) {
  // Create a vector of indexes, one per word. This is the vector that will
  // actually be returned sorted in the end: after sorting, element `i` holds
  // the index of `words` (or `wordCounts`) that belongs in position `i`. By
  // doing this, we avoid having to pass around both the words and their
  // corresponding counts throughout the sort, and can just re-assemble the
  // vectors at the end.
  std::vector<size_t> indexVector(words.size());
  for (size_t i = 0; i < words.size(); i++) {
    indexVector.at(i) = i;
  }

  // Sort the `indexVector` vector against the `words` vector, starting with
  // depth 0 (the left-most character)
  radixSortIndexes(indexVector, words);

  // Reconstruct the `words` and `wordCounts` vectors from the list of
  // indexes, and replace the originals with the new ones
  std::vector<std::string> sortedWords;
  std::vector<unsigned int> sortedWordCounts;

  for (size_t i = 0; i < indexVector.size(); i++) {
    sortedWords.push_back(words.at(indexVector.at(i)));
    sortedWordCounts.push_back(wordCounts.at(indexVector.at(i)));
  }

  words = sortedWords;
  wordCounts = sortedWordCounts;
}
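
As a usage sketch for the wrapper (again with hypothetical sample data, assuming `radixSort` above is in scope), the two parallel vectors stay aligned through the sort:

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Hypothetical sample data: three words and their occurrence counts
  std::vector<std::string> words = {"CAT", "APPLE", "BEE"};
  std::vector<unsigned int> wordCounts = {3, 1, 2};

  radixSort(words, wordCounts);

  // Prints "APPLE: 1", "BEE: 2", "CAT: 3"; each count still lines up with
  // its word after sorting
  for (size_t i = 0; i < words.size(); i++) {
    std::cout << words.at(i) << ": " << wordCounts.at(i) << '\n';
  }

  return 0;
}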

OutputProcessor::OutputProcessor() {
  _fileOut = std::ofstream();
  _allWords = std::vector<std::string>();
@@ -77,6 +186,8 @@ void OutputProcessor::analyzeWords(std::vector<std::string> allWords,
     // Increment total word count
     _totalWordCount++;
   }
+
+  radixSort(_uniqueWords, _wordCounts);
 }
 
 bool OutputProcessor::openStream() {
test.zsh
@@ -3,6 +3,6 @@ for test in {aliceChapter1,greeneggsandham,happybirthday,romeoandjuliet}; do
 input/$test.txt
 output.txt
 EOF
-delta solutions/$test.out output.txt
+delta solutions/${test}_xc.out output.txt
 done
 echo "All tests finished"