2024-10-10 21:12:43 -06:00
|
|
|
/**
 * @author Tyler Beckman (tyler_beckman@mines.edu)
 * @brief A3 - A program to parse a text input and analyze it for statistics
 * based on word and letter frequency, and then output them to a user-specified
 * file. It assumes text is only alphabetical + spaces + the punctuation
 * contained within main.cpp. In addition, the list of word counts is sorted
 * using a recursive MSD radix sort before being written to the specified
 * file.
 * @version 1
 * @date 2024-10-10
 *
 * Resources used:
 * For the general program (not sorting), I utilized autocomplete and
 * cppreference to find the detailed reference of functions I needed to use. For
 * implementing radix sort I primarily used
 * https://en.wikipedia.org/wiki/Radix_sort#Most_significant_digit,_forward_recursive
 * and a lot of trial and error. The sorting part is also VERY commented to make
 * sure I knew exactly what I was doing at each point and why I was doing it.
 */
|
|
|
|
|
2024-10-07 02:08:54 -06:00
|
|
|
#include "OutputProcessor.h"
|
|
|
|
|
|
|
|
#include <fstream>
|
2024-10-07 18:17:46 -06:00
|
|
|
#include <iomanip>
|
2024-10-07 02:08:54 -06:00
|
|
|
#include <iostream>
|
2024-10-10 20:54:25 -06:00
|
|
|
#include <optional>
|
2024-10-07 18:17:46 -06:00
|
|
|
#include <ostream>
|
|
|
|
#include <string>
|
2024-10-07 02:08:54 -06:00
|
|
|
#include <vector>
|
|
|
|
|
2024-10-09 17:20:26 -06:00
|
|
|
#include <cstdint>
|
|
|
|
|
2024-10-10 20:54:25 -06:00
|
|
|
/**
 * @brief Recursively sorts a vector of indexes with a most significant digit
 * (MSD) radix sort, ordering the indexes by the alphabetical value of the
 * strings they refer to. On return, `INDEXES` holds the same indexes
 * re-arranged so that position `i` names the element of `VECTOR_TO_SORT` that
 * belongs at position `i` of the sorted sequence.
 *
 * Upper- and lowercase letters share a bucket, so ordering is
 * case-insensitive. Words that compare equal (including exact duplicates) are
 * all kept and retain their relative input order.
 *
 * @param INDEXES The vector of indexes to sort. Its contents are replaced with
 * the sorted arrangement.
 * @param VECTOR_TO_SORT The string vector to base the sort off of. This will
 * not be modified, and is only used to decide where an index in the other
 * vector gets placed during sort.
 * @param DEPTH The current sort depth, should be 0 or not passed if called from
 * outside of this function. This controls which character of strings is
 * inspected during sort.
 * @throws std::out_of_range If an index does not exist in `VECTOR_TO_SORT`, or
 * if an inspected character is not a letter in A-Z / a-z.
 */
void radixSortIndexes(std::vector<size_t> &INDEXES,
                      const std::vector<std::string> &VECTOR_TO_SORT,
                      const unsigned int DEPTH = 0) {
  // Construct 26 buckets, where 0 = A, 1 = B, 2 = C, ..., 25 = Z
  std::vector<std::vector<size_t>> buckets(26);
  // Indexes of words that have no character at position `DEPTH`. Every word
  // that lands here is identical up to `DEPTH`, so they are already in their
  // final order. This must be a list rather than a single slot, otherwise
  // duplicate words would silently overwrite each other and be lost.
  std::vector<size_t> exhausted;

  // Pass over each index, bucketing based on the character corresponding to
  // the current depth
  for (const size_t INDEX_TO_SORT : INDEXES) {
    const std::string &WORD = VECTOR_TO_SORT.at(INDEX_TO_SORT);

    if (WORD.length() <= DEPTH) {
      exhausted.push_back(INDEX_TO_SORT);
      continue;
    }

    // Map the letter to its alphabetical position regardless of case. Any
    // other character keeps the deliberately invalid bucket number below, so
    // the bounds-checked `at` reports it as std::out_of_range.
    const char LETTER = WORD.at(DEPTH);
    size_t bucketNumber = buckets.size();
    if (LETTER >= 'A' && LETTER <= 'Z') {
      bucketNumber = static_cast<size_t>(LETTER - 'A');
    } else if (LETTER >= 'a' && LETTER <= 'z') {
      bucketNumber = static_cast<size_t>(LETTER - 'a');
    }
    buckets.at(bucketNumber).push_back(INDEX_TO_SORT);
  }

  // Flatten the buckets at the current stage. Exhausted words go first (less
  // characters should go before more characters), followed by each bucket in
  // alphabetical order. A bucket with more than one element is recursively
  // sorted on the next character before it is appended; buckets with zero or
  // one element are already sorted.
  std::vector<size_t> flattened;
  flattened.reserve(INDEXES.size());
  flattened.insert(flattened.end(), exhausted.begin(), exhausted.end());
  for (std::vector<size_t> &bucket : buckets) {
    if (bucket.size() > 1) {
      radixSortIndexes(bucket, VECTOR_TO_SORT, DEPTH + 1);
    }
    flattened.insert(flattened.end(), bucket.begin(), bucket.end());
  }

  // Finally, replace the indexes with the sorted result (swap avoids a copy)
  INDEXES.swap(flattened);
}

/**
 * @brief Sorts the `words` vector (and `wordCounts` alongside) alphabetically
 * using a most significant digit radix sort.
 *
 * @param words The list of words to sort alphabetically
 * @param wordCounts The vector of word counts aligned to the `words` vector,
 * which will be re-ordered to match the result of sorting `words`
 * @throws std::out_of_range If `wordCounts` has fewer elements than `words`,
 * or if a word contains a character outside A-Z / a-z. Both vectors are left
 * untouched when this happens.
 */
void radixSort(std::vector<std::string> &words,
               std::vector<unsigned int> &wordCounts) {
  // Create the identity permutation 0, 1, ..., n - 1. Sorting this vector of
  // indexes instead of the words themselves avoids having to pass both the
  // words and their corresponding counts throughout the sort; the two vectors
  // are re-assembled from the sorted indexes at the end. The vector is sized
  // up front, so each slot is assigned rather than appended (appending would
  // double its length).
  std::vector<size_t> indexVector(words.size());
  for (size_t i = 0; i < indexVector.size(); i++) {
    indexVector.at(i) = i;
  }

  // Sort the `indexVector` vector against the `words` vector, starting with
  // depth 0 (the left-most character)
  radixSortIndexes(indexVector, words);

  // Reconstruct the `words` and `wordCounts` vectors from the list of
  // indexes, and replace the originals with the new ones
  std::vector<std::string> sortedWords;
  std::vector<unsigned int> sortedWordCounts;
  sortedWords.reserve(indexVector.size());
  sortedWordCounts.reserve(indexVector.size());

  for (const size_t SOURCE_INDEX : indexVector) {
    sortedWords.push_back(words.at(SOURCE_INDEX));
    sortedWordCounts.push_back(wordCounts.at(SOURCE_INDEX));
  }

  words.swap(sortedWords);
  wordCounts.swap(sortedWordCounts);
}
|
|
|
|
|
2024-10-07 02:08:54 -06:00
|
|
|
/**
 * @brief Constructs a processor in its empty state: no words recorded, all
 * totals at zero, 26 zeroed letter tallies, and a default-constructed output
 * file stream.
 */
OutputProcessor::OutputProcessor() {
  // Running totals start from nothing
  _totalWordCount = 0;
  _totalLetterCount = 0;

  // One tally per letter of the alphabet, all starting at zero
  _letterCounts = std::vector<unsigned int>(26, 0);

  // Word bookkeeping starts out empty
  _wordCounts = std::vector<unsigned int>();
  _uniqueWords = std::vector<std::string>();
  _allWords = std::vector<std::string>();

  // Default-constructed stream, not yet associated with any file
  _fileOut = std::ofstream();
}
|
|
|
|
|
|
|
|
void OutputProcessor::analyzeWords(std::vector<std::string> allWords,
|
2024-10-09 17:20:26 -06:00
|
|
|
const std::string PUNCTUATION) {
|
2024-10-07 18:17:46 -06:00
|
|
|
// Iterate over all words, processing incrementally
|
|
|
|
for (size_t wordIdx = 0; wordIdx < allWords.size(); wordIdx++) {
|
|
|
|
std::string &word = allWords.at(wordIdx);
|
|
|
|
|
|
|
|
// Remove punctuation from word
|
|
|
|
size_t punctuationIdx = 0;
|
2024-10-09 17:20:26 -06:00
|
|
|
while ((punctuationIdx = word.find_first_of(PUNCTUATION)) !=
|
2024-10-07 18:17:46 -06:00
|
|
|
std::string::npos) {
|
|
|
|
word.erase(punctuationIdx, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Save word internally
|
|
|
|
_allWords.push_back(word);
|
|
|
|
|
|
|
|
// Check all unique words for a match, and if so increment the count
|
|
|
|
bool foundUnique = false;
|
|
|
|
size_t uniqueWordIdx;
|
|
|
|
for (uniqueWordIdx = 0; uniqueWordIdx < _uniqueWords.size();
|
|
|
|
uniqueWordIdx++) {
|
|
|
|
if (_uniqueWords.at(uniqueWordIdx) == word) {
|
|
|
|
foundUnique = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// If no unique word exists, add it to both vectors
|
|
|
|
if (!foundUnique) {
|
|
|
|
_uniqueWords.push_back(word);
|
|
|
|
_wordCounts.push_back(1);
|
|
|
|
} else {
|
|
|
|
_wordCounts.at(uniqueWordIdx)++;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add letter count for each letter in the word
|
|
|
|
for (size_t letterIdx = 0; letterIdx < word.length(); letterIdx++) {
|
|
|
|
char letter = word.at(letterIdx);
|
|
|
|
// Normalize to uppercase
|
|
|
|
if (letter >= 'a' && letter <= 'z') {
|
2024-10-09 17:20:26 -06:00
|
|
|
letter -= 97;
|
|
|
|
} else {
|
|
|
|
if (letter >= 'A' && letter <= 'Z') {
|
|
|
|
letter -= 65;
|
|
|
|
} else {
|
|
|
|
continue;
|
|
|
|
}
|
2024-10-07 18:17:46 -06:00
|
|
|
}
|
|
|
|
// Subtracting an uppercase letter by 65 creates its alphabetical
|
|
|
|
// index
|
|
|
|
_letterCounts.at(letter)++;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sum total letter count
|
|
|
|
_totalLetterCount += word.length();
|
|
|
|
|
|
|
|
// Increment total word count
|
|
|
|
_totalWordCount++;
|
|
|
|
}
|
2024-10-10 20:54:25 -06:00
|
|
|
|
|
|
|
radixSort(_uniqueWords, _wordCounts);
|
2024-10-07 02:08:54 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * @brief Prompts on standard output for a file name, reads it from standard
 * input, and opens the output file stream on that name.
 *
 * @return true if the name was read and the stream opened without failure;
 * false otherwise, after printing an error message to standard error.
 */
bool OutputProcessor::openStream() {
  std::cout << "What is the name of the file you would like to write to? ";

  // Extraction leaves the stream in a failed state when nothing could be read
  std::string outputPath;
  if (!(std::cin >> outputPath)) {
    std::cerr << "Invalid file input" << std::endl;
    return false;
  }

  _fileOut.open(outputPath);
  const bool OPENED = !_fileOut.fail();
  if (!OPENED) {
    std::cerr << "Unable to open file, does it exist?" << std::endl;
  }

  return OPENED;
}
|
|
|
|
|
|
|
|
/**
 * @brief Closes the output file stream.
 */
void OutputProcessor::closeStream() {
  _fileOut.close();
}
|
|
|
|
|
|
|
|
void OutputProcessor::write() {
|
2024-10-07 18:17:46 -06:00
|
|
|
// Calculate longest word length, longest number length, most common word,
|
|
|
|
// and least common word for later use in one pass for efficiency
|
|
|
|
size_t longestWordLength = 0;
|
|
|
|
|
2024-10-09 17:20:26 -06:00
|
|
|
size_t mostCommonWordIdx = 0;
|
|
|
|
size_t leastCommonWordIdx = 0;
|
2024-10-07 18:17:46 -06:00
|
|
|
|
|
|
|
for (size_t uniqueWordIdx = 0; uniqueWordIdx < _uniqueWords.size();
|
|
|
|
uniqueWordIdx++) {
|
|
|
|
std::string &uniqueWord = _uniqueWords.at(uniqueWordIdx);
|
|
|
|
unsigned long wordCount = _wordCounts.at(uniqueWordIdx);
|
|
|
|
|
|
|
|
if (uniqueWord.length() > longestWordLength) {
|
|
|
|
longestWordLength = uniqueWord.length();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Equality can be ignored here because we want the word that was
|
|
|
|
// encountered first, so any subsequent extremes can be ignored
|
2024-10-09 17:20:26 -06:00
|
|
|
if (wordCount < _wordCounts.at(leastCommonWordIdx)) {
|
|
|
|
leastCommonWordIdx = uniqueWordIdx;
|
2024-10-07 18:17:46 -06:00
|
|
|
} else {
|
2024-10-09 17:20:26 -06:00
|
|
|
if (wordCount > _wordCounts.at(mostCommonWordIdx)) {
|
|
|
|
mostCommonWordIdx = uniqueWordIdx;
|
2024-10-07 18:17:46 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
_fileOut << "Read in " << _totalWordCount << " words" << std::endl;
|
|
|
|
_fileOut << "Encountered " << _uniqueWords.size() << " unique words"
|
|
|
|
<< std::endl;
|
|
|
|
|
|
|
|
// Print out each unique word and how often it happened
|
2024-10-09 17:20:26 -06:00
|
|
|
const size_t MOST_COMMON_WORD_COUNT_LENGTH =
|
|
|
|
std::to_string(_wordCounts.at(mostCommonWordIdx)).length();
|
2024-10-07 18:17:46 -06:00
|
|
|
for (size_t uniqueWordIdx = 0; uniqueWordIdx < _uniqueWords.size();
|
|
|
|
uniqueWordIdx++) {
|
|
|
|
_fileOut << std::setw(longestWordLength) << std::left
|
2024-10-09 17:20:26 -06:00
|
|
|
<< _uniqueWords.at(uniqueWordIdx) << " : "
|
|
|
|
<< std::setw(MOST_COMMON_WORD_COUNT_LENGTH) << std::right
|
2024-10-07 18:17:46 -06:00
|
|
|
<< _wordCounts.at(uniqueWordIdx) << std::endl;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Print the most and least common word
|
2024-10-09 17:20:26 -06:00
|
|
|
const std::string &MOST_COMMON_WORD = _uniqueWords.at(mostCommonWordIdx);
|
|
|
|
const std::string &LEAST_COMMON_WORD = _uniqueWords.at(leastCommonWordIdx);
|
2024-10-07 18:17:46 -06:00
|
|
|
size_t longerFrequentWordLength =
|
2024-10-09 17:20:26 -06:00
|
|
|
MOST_COMMON_WORD.length() > LEAST_COMMON_WORD.length()
|
|
|
|
? MOST_COMMON_WORD.length()
|
|
|
|
: LEAST_COMMON_WORD.length();
|
|
|
|
size_t mostFrequentWordCountLength =
|
|
|
|
std::to_string(_wordCounts.at(mostCommonWordIdx)).length();
|
2024-10-07 18:17:46 -06:00
|
|
|
|
|
|
|
_fileOut << " Most Frequent Word: " << std::setw(longerFrequentWordLength)
|
2024-10-09 17:20:26 -06:00
|
|
|
<< std::left << MOST_COMMON_WORD << " " << std::right
|
|
|
|
<< std::setw(mostFrequentWordCountLength)
|
|
|
|
<< _wordCounts.at(mostCommonWordIdx) << " (" << std::setw(7)
|
|
|
|
<< std::fixed << std::setprecision(3) << std::right
|
|
|
|
<< (float)_wordCounts.at(mostCommonWordIdx) / _totalWordCount * 100
|
|
|
|
<< "%)" << std::endl;
|
|
|
|
_fileOut << "Least Frequent Word: " << std::setw(longerFrequentWordLength)
|
|
|
|
<< std::left << LEAST_COMMON_WORD << " " << std::right
|
|
|
|
<< std::setw(mostFrequentWordCountLength)
|
|
|
|
<< _wordCounts.at(leastCommonWordIdx) << " (" << std::setw(7)
|
|
|
|
<< std::fixed << std::setprecision(3) << std::right
|
|
|
|
<< (float)_wordCounts.at(leastCommonWordIdx) / _totalWordCount *
|
|
|
|
100
|
|
|
|
<< "%)" << std::endl;
|
2024-10-07 18:17:46 -06:00
|
|
|
|
2024-10-09 17:20:26 -06:00
|
|
|
// Calculate the most and least common letters to display
|
|
|
|
uint8_t mostCommonLetterIdx = 0;
|
|
|
|
uint8_t leastCommonLetterIdx = 0;
|
2024-10-07 18:17:46 -06:00
|
|
|
|
|
|
|
for (size_t letterIdx = 0; letterIdx < 26; letterIdx++) {
|
|
|
|
// Here not using "or equals" means the letters later alphabetically get
|
|
|
|
// ignored if they occur the same amount
|
|
|
|
if (_letterCounts.at(letterIdx) <
|
2024-10-09 17:20:26 -06:00
|
|
|
_letterCounts.at(leastCommonLetterIdx)) {
|
|
|
|
leastCommonLetterIdx = letterIdx;
|
2024-10-07 18:17:46 -06:00
|
|
|
} else {
|
|
|
|
if (_letterCounts.at(letterIdx) >
|
2024-10-09 17:20:26 -06:00
|
|
|
_letterCounts.at(mostCommonLetterIdx)) {
|
|
|
|
mostCommonLetterIdx = letterIdx;
|
2024-10-07 18:17:46 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Print out each letter along with the amount of times it occurs
|
2024-10-09 17:20:26 -06:00
|
|
|
const size_t MOST_COMMON_LETTER_COUNT_LENGTH =
|
|
|
|
std::to_string(_letterCounts.at(mostCommonLetterIdx)).length();
|
2024-10-07 18:17:46 -06:00
|
|
|
for (size_t letterIdx = 0; letterIdx < 26; letterIdx++) {
|
|
|
|
_fileOut << (char)(letterIdx + 65) << ": "
|
2024-10-09 17:20:26 -06:00
|
|
|
<< std::setw(MOST_COMMON_LETTER_COUNT_LENGTH) << std::right
|
2024-10-07 18:17:46 -06:00
|
|
|
<< _letterCounts.at(letterIdx) << std::endl;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Print out the most and least common letters in total
|
2024-10-09 17:20:26 -06:00
|
|
|
_fileOut << " Most Frequent Letter: " << (char)(mostCommonLetterIdx + 65)
|
|
|
|
<< " " << std::setw(MOST_COMMON_LETTER_COUNT_LENGTH) << std::right
|
|
|
|
<< _letterCounts.at(mostCommonLetterIdx) << " (" << std::setw(7)
|
2024-10-07 18:17:46 -06:00
|
|
|
<< std::fixed << std::setprecision(3)
|
2024-10-09 17:20:26 -06:00
|
|
|
<< ((float)_letterCounts.at(mostCommonLetterIdx) /
|
|
|
|
_totalLetterCount * 100)
|
2024-10-07 18:17:46 -06:00
|
|
|
<< "%)" << std::endl;
|
2024-10-09 17:20:26 -06:00
|
|
|
_fileOut << "Least Frequent Letter: " << (char)(leastCommonLetterIdx + 65)
|
|
|
|
<< " " << std::setw(MOST_COMMON_LETTER_COUNT_LENGTH) << std::right
|
|
|
|
<< _letterCounts.at(leastCommonLetterIdx) << " (" << std::setw(7)
|
2024-10-07 18:17:46 -06:00
|
|
|
<< std::fixed << std::setprecision(3)
|
2024-10-09 17:20:26 -06:00
|
|
|
<< ((float)_letterCounts.at(leastCommonLetterIdx) /
|
|
|
|
_totalLetterCount * 100)
|
2024-10-07 18:17:46 -06:00
|
|
|
<< "%)" << std::endl;
|
2024-10-07 02:08:54 -06:00
|
|
|
}
|