Compare commits
3 commits
Author | SHA256 | Date | |
---|---|---|---|
62a4f22ca9 | |||
0bba075976 | |||
feed5c7077 |
3 changed files with 124 additions and 59 deletions
2
Makefile
2
Makefile
|
@ -8,7 +8,7 @@ SRC_FILES = main.cpp InputProcessor.cpp OutputProcessor.cpp
|
||||||
## Adds only the necessary files for build into a .tar.gz file, named appropriately
|
## Adds only the necessary files for build into a .tar.gz file, named appropriately
|
||||||
ARCHIVED_FILES = Makefile $(SRC_FILES) $(SRC_FILES:.cpp=.h) $(SRC_FILES:.cpp=.hpp)
|
ARCHIVED_FILES = Makefile $(SRC_FILES) $(SRC_FILES:.cpp=.h) $(SRC_FILES:.cpp=.hpp)
|
||||||
pack: fmtc
|
pack: fmtc
|
||||||
tar --ignore-failed-read -czvf $(TARGET).tar.gz $(shell echo $(ARCHIVED_FILES) | xargs ls -d 2>/dev/null)
|
tar --ignore-failed-read -czvf $(TARGET).tar.gz {In,Out}putProcessor.{cpp,h}
|
||||||
|
|
||||||
## Runs the pack target and then attempts to build & run the program to make sure it functions correctly
|
## Runs the pack target and then attempts to build & run the program to make sure it functions correctly
|
||||||
pack-test: pack
|
pack-test: pack
|
||||||
|
|
|
@ -1,75 +1,141 @@
|
||||||
|
/**
|
||||||
|
* @author Tyler Beckman (tyler_beckman@mines.edu)
|
||||||
|
* @brief A3 - A program to parse a text input and analyze it for statistics
|
||||||
|
* based on word and letter frequency, and then output them to a user-specified
|
||||||
|
* file. It assumes text is only alphabetical + spaces + the punctuation
|
||||||
|
* contained within main.cpp. In addition, the list of word counts is sorted
|
||||||
|
* using a recursive MSD radix sort before being outputted into the specified
|
||||||
|
* file.
|
||||||
|
* @version 1
|
||||||
|
* @date 2024-10-10
|
||||||
|
*
|
||||||
|
* Resources used:
|
||||||
|
* For the general program (not sorting), I utilized all autocomplete and
|
||||||
|
* cppreference to find the detailed reference of functions I needed to use. For
|
||||||
|
* implementing radix sort I primarily used
|
||||||
|
* https://en.wikipedia.org/wiki/Radix_sort#Most_significant_digit,_forward_recursive
|
||||||
|
* and a lot of trial and error. The sorting part is also VERY commented to make
|
||||||
|
* sure I knew exactly what I was doing at each point and why I was doing it.
|
||||||
|
*/
|
||||||
|
|
||||||
#include "OutputProcessor.h"
|
#include "OutputProcessor.h"
|
||||||
|
|
||||||
#include <atomic>
|
|
||||||
#include <chrono>
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <future>
|
|
||||||
#include <iomanip>
|
#include <iomanip>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <optional>
|
||||||
#include <ostream>
|
#include <ostream>
|
||||||
#include <random>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <thread>
|
|
||||||
#include <utility>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
|
||||||
bool checkSorted(const std::vector<std::string> &vector) {
|
/**
|
||||||
for (size_t i = 0; i < vector.size() - 1; i++) {
|
* @brief Recursively most significant digit radix sorts a vector of indexes,
|
||||||
if (vector.at(i) > vector.at(i + 1)) {
|
* based on the alphabetical value of a vector of strings. The returned vector
|
||||||
return false;
|
* is the same index vector but re-arranged to show where the elements in the
|
||||||
|
* string vector should be placed.
|
||||||
|
*
|
||||||
|
* @param INDEXES The vector of indexes to sort
|
||||||
|
* @param VECTOR_TO_SORT The string vector to base the sort off of. This will
|
||||||
|
* not be modified, and is only used to decide where an index in the other
|
||||||
|
* vector gets placed during sort.
|
||||||
|
* @param DEPTH The current sort depth, should be 0 or not passed if called from
|
||||||
|
* outside of this function. This controls which character of strings is
|
||||||
|
* inspected during sort.
|
||||||
|
*/
|
||||||
|
void radixSortIndexes(std::vector<size_t> &INDEXES,
|
||||||
|
const std::vector<std::string> &VECTOR_TO_SORT,
|
||||||
|
const unsigned int DEPTH = 0) {
|
||||||
|
// Construct 26 buckets, where 0 = A, 1 = B, 2 = C, ..., 25 = Z
|
||||||
|
std::vector<std::vector<size_t>> buckets(26);
|
||||||
|
// Another "bucket" for words that have already been completely sorted, as
|
||||||
|
// they have no character to check at position `DEPTH`
|
||||||
|
std::optional<size_t> alreadySorted = std::nullopt;
|
||||||
|
|
||||||
|
// Pass over each index, bucketing based on the character corresponding to
|
||||||
|
// the current depth
|
||||||
|
for (size_t i = 0; i < INDEXES.size(); i++) {
|
||||||
|
const size_t INDEX_TO_SORT = INDEXES.at(i);
|
||||||
|
const std::string &WORD = VECTOR_TO_SORT.at(INDEX_TO_SORT);
|
||||||
|
|
||||||
|
// Check if the word has any more characters to bucket. If it doesn't,
|
||||||
|
// place it in the special `alreadySorted` bucket. If it does, add it to
|
||||||
|
// the correct bucket for the current depth.
|
||||||
|
if (WORD.length() == DEPTH) {
|
||||||
|
alreadySorted = INDEX_TO_SORT;
|
||||||
|
} else {
|
||||||
|
buckets.at(WORD.at(DEPTH) - 65).push_back(INDEX_TO_SORT);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return true;
|
|
||||||
|
// Recursively apply bucket sort to each bucket unless it is already
|
||||||
|
// completely sorted (has no elements or only has one). With this we cascade
|
||||||
|
// the bucketing as far as is necessary, flattening after we have reached a
|
||||||
|
// depth at which there is no more to bucket (each bucket has 0 or 1
|
||||||
|
// elements)
|
||||||
|
for (size_t i = 0; i < buckets.size(); i++) {
|
||||||
|
std::vector<size_t> &bucket = buckets.at(i);
|
||||||
|
|
||||||
|
if (bucket.size() > 1) {
|
||||||
|
radixSortIndexes(bucket, VECTOR_TO_SORT, DEPTH + 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void bozosortAlignedVectors(std::vector<std::string> &vector1,
|
// Flatten the buckets at the current stage. We first add the
|
||||||
std::vector<unsigned int> &vector2) {
|
// `alreadySorted` value (fewer characters should go before more characters),
|
||||||
auto threadCount = std::thread::hardware_concurrency();
|
// and then append each item from each bucket individually.
|
||||||
if (threadCount == 0)
|
std::vector<size_t> flattenedBucket;
|
||||||
threadCount = 8;
|
if (alreadySorted.has_value()) {
|
||||||
|
flattenedBucket.push_back(alreadySorted.value());
|
||||||
std::atomic<bool> shouldAbort(false);
|
}
|
||||||
std::vector<std::thread> threads{};
|
for (size_t i = 0; i < buckets.size(); i++) {
|
||||||
for (unsigned int i = 0; i < threadCount; i++) {
|
flattenedBucket.insert(flattenedBucket.end(), buckets.at(i).begin(),
|
||||||
std::thread t(
|
buckets.at(i).end());
|
||||||
[vector1, vector2, &shouldAbort,
|
|
||||||
i](std::vector<std::string>* vector1Original, std::vector<unsigned int>* vector2Original) mutable {
|
|
||||||
std::mt19937 twister(std::chrono::steady_clock::now()
|
|
||||||
.time_since_epoch()
|
|
||||||
.count() +
|
|
||||||
i);
|
|
||||||
std::uniform_int_distribution<size_t> dist(0,
|
|
||||||
vector1.size() - 1);
|
|
||||||
do {
|
|
||||||
if (shouldAbort) return;
|
|
||||||
size_t first = dist(twister);
|
|
||||||
size_t second = dist(twister);
|
|
||||||
|
|
||||||
std::string temp = vector1.at(first);
|
|
||||||
vector1.at(first) = vector1.at(second);
|
|
||||||
vector1.at(second) = temp;
|
|
||||||
|
|
||||||
// Also swap elements in the aligned vector. If I store
|
|
||||||
// where everything moved to maybe it could be faster?
|
|
||||||
unsigned int temp2 = vector2.at(first);
|
|
||||||
vector2.at(first) = vector2.at(second);
|
|
||||||
vector2.at(second) = temp2;
|
|
||||||
} while (!checkSorted(vector1));
|
|
||||||
|
|
||||||
*vector1Original = vector1;
|
|
||||||
*vector2Original = vector2;
|
|
||||||
shouldAbort = true;
|
|
||||||
},
|
|
||||||
&vector1, &vector2);
|
|
||||||
|
|
||||||
threads.push_back(std::move(t));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (unsigned int i = 0; i < threadCount; i++) {
|
// Finally, replace the indexes with the sorted result
|
||||||
threads.at(i).join();
|
INDEXES = flattenedBucket;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Sorts the `words` vector (and `wordCounts` alongside) alphabetically
|
||||||
|
* using a most significant digit radix sort.
|
||||||
|
*
|
||||||
|
* @param words The list of words to sort alphabetically
|
||||||
|
* @param wordCounts The vector of word counts aligned to the `words` vector,
|
||||||
|
* which will be adjusted based on the result of sorting `words`
|
||||||
|
*/
|
||||||
|
void radixSort(std::vector<std::string> &words,
|
||||||
|
std::vector<unsigned int> &wordCounts) {
|
||||||
|
// Create a vector of indexes the size of the number of words we have. This
|
||||||
|
// is the vector that will actually be returned sorted in the end, where
|
||||||
|
// each element of this vector `i` is set to the index of `words` or
|
||||||
|
// `wordCounts` that belongs in position `i` when sorted. By doing this, we
|
||||||
|
// avoid having to try and pass around both the words and their
|
||||||
|
// corresponding counts throughout the sort, and can just re-assemble the
|
||||||
|
// vectors at the end.
|
||||||
|
std::vector<size_t> indexVector(words.size());
|
||||||
|
for (size_t i = 0; i < words.size(); i++) {
|
||||||
|
indexVector.push_back(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort the `indexVector` vector against the `words` vector, starting with
|
||||||
|
// depth 0 (the left-most character)
|
||||||
|
radixSortIndexes(indexVector, words);
|
||||||
|
|
||||||
|
// Reconstruct the `words` and `wordCounts` vectors from the list of
|
||||||
|
// indexes, and replace the originals with the new ones
|
||||||
|
std::vector<std::string> sortedWords;
|
||||||
|
std::vector<unsigned int> sortedWordCounts;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < indexVector.size(); i++) {
|
||||||
|
sortedWords.push_back(words.at(indexVector.at(i)));
|
||||||
|
sortedWordCounts.push_back(wordCounts.at(indexVector.at(i)));
|
||||||
|
}
|
||||||
|
|
||||||
|
words = sortedWords;
|
||||||
|
wordCounts = sortedWordCounts;
|
||||||
}
|
}
|
||||||
|
|
||||||
OutputProcessor::OutputProcessor() {
|
OutputProcessor::OutputProcessor() {
|
||||||
|
@ -141,8 +207,7 @@ void OutputProcessor::analyzeWords(std::vector<std::string> allWords,
|
||||||
_totalWordCount++;
|
_totalWordCount++;
|
||||||
}
|
}
|
||||||
|
|
||||||
// :3
|
radixSort(_uniqueWords, _wordCounts);
|
||||||
bozosortAlignedVectors(_uniqueWords, _wordCounts);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool OutputProcessor::openStream() {
|
bool OutputProcessor::openStream() {
|
||||||
|
|
2
test.zsh
2
test.zsh
|
@ -3,6 +3,6 @@ for test in {aliceChapter1,greeneggsandham,happybirthday,romeoandjuliet}; do
|
||||||
input/$test.txt
|
input/$test.txt
|
||||||
output.txt
|
output.txt
|
||||||
EOF
|
EOF
|
||||||
delta solutions/$test.out output.txt
|
delta solutions/${test}_xc.out output.txt
|
||||||
done
|
done
|
||||||
echo "All tests finished"
|
echo "All tests finished"
|
||||||
|
|
Loading…
Reference in a new issue