/**
 * @author Tyler Beckman (tyler_beckman@mines.edu)
 * @brief A3 - A program to parse a text input and analyze it for statistics
 * based on word and letter frequency, and then output them to a user-specified
 * file. It assumes text is only alphabetical + spaces + the punctuation
 * contained within main.cpp. In addition, the list of word counts is sorted
 * using a recursive MSD radix sort before being written to the specified file.
 * @version 1
 * @date 2024-10-10
 *
 * Resources used:
 * For the general program (not sorting), I utilized autocomplete and
 * cppreference to find the detailed reference of functions I needed to use.
 * For implementing radix sort I primarily used
 * https://en.wikipedia.org/wiki/Radix_sort#Most_significant_digit,_forward_recursive
 * and a lot of trial and error. The sorting part is also VERY commented to make
 * sure I knew exactly what I was doing at each point and why I was doing it.
 */
#include "OutputProcessor.h"

#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

/**
 * @brief Recursively most-significant-digit radix sorts a vector of indexes,
 * based on the alphabetical value of a vector of strings. The index vector is
 * re-arranged in place so that element `i` holds the index of the string that
 * belongs in position `i` when sorted.
 *
 * @param INDEXES The vector of indexes to sort
 * @param VECTOR_TO_SORT The string vector to base the sort off of. This will
 * not be modified, and is only used to decide where an index in the other
 * vector gets placed during the sort.
 * @param DEPTH The current sort depth, which should be 0 or not passed if
 * called from outside of this function. This controls which character of the
 * strings is inspected during the sort.
 */
void radixSortIndexes(std::vector<size_t> &INDEXES,
                      const std::vector<std::string> &VECTOR_TO_SORT,
                      const unsigned int DEPTH = 0) {
    // Construct 26 buckets, where 0 = A, 1 = B, 2 = C, ..., 25 = Z
    std::vector<std::vector<size_t>> buckets(26);
    // Another "bucket" for words that have already been completely sorted, as
    // they have no character to check at position `DEPTH`
    std::optional<size_t> alreadySorted = std::nullopt;

    // Pass over each index, bucketing based on the character corresponding to
    // the current depth
    for (size_t i = 0; i < INDEXES.size(); i++) {
        const size_t INDEX_TO_SORT = INDEXES.at(i);
        const std::string &WORD = VECTOR_TO_SORT.at(INDEX_TO_SORT);

        // Check if the word has any more characters to bucket. If it doesn't,
        // place it in the special `alreadySorted` bucket. If it does, add it
        // to the correct bucket for the current depth.
        if (WORD.length() == DEPTH) {
            alreadySorted = INDEX_TO_SORT;
        } else {
            buckets.at(WORD.at(DEPTH) - 65).push_back(INDEX_TO_SORT);
        }
    }

    // Recursively apply the bucket sort to each bucket unless it is already
    // completely sorted (has no elements or only one). With this we cascade
    // the bucketing as far as is necessary, flattening after we have reached a
    // depth at which there is nothing more to bucket (each bucket has 0 or 1
    // elements)
    for (size_t i = 0; i < buckets.size(); i++) {
        std::vector<size_t> &bucket = buckets.at(i);
        if (bucket.size() > 1) {
            radixSortIndexes(bucket, VECTOR_TO_SORT, DEPTH + 1);
        }
    }

    // Flatten the buckets at the current stage. We first add the
    // `alreadySorted` value (fewer characters should go before more
    // characters), and then append each item from each bucket individually.
    std::vector<size_t> flattenedBucket;
    if (alreadySorted.has_value()) {
        flattenedBucket.push_back(alreadySorted.value());
    }
    for (size_t i = 0; i < buckets.size(); i++) {
        flattenedBucket.insert(flattenedBucket.end(), buckets.at(i).begin(),
                               buckets.at(i).end());
    }

    // Finally, replace the indexes with the sorted result
    INDEXES = flattenedBucket;
}
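// Hypothetical walkthrough (example words not taken from any input file):
// given VECTOR_TO_SORT = {"BAT", "AT", "ANT"} and INDEXES = {0, 1, 2}, the
// call radixSortIndexes(INDEXES, VECTOR_TO_SORT) proceeds as follows:
//   DEPTH 0: index 0 ("BAT") goes to bucket 'B'; indexes 1 ("AT") and
//            2 ("ANT") both go to bucket 'A', which has two elements and so
//            recurses.
//   DEPTH 1: "AT" goes to bucket 'T' and "ANT" to bucket 'N', so bucket 'A'
//            flattens to {2, 1}.
//   Flattening at DEPTH 0 then yields INDEXES = {2, 1, 0}, i.e. the
//   alphabetical order "ANT", "AT", "BAT".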
/**
 * @brief Sorts the `words` vector (and `wordCounts` alongside it)
 * alphabetically using a most-significant-digit radix sort.
 *
 * @param words The list of words to sort alphabetically
 * @param wordCounts The vector of word counts aligned to the `words` vector,
 * which will be adjusted based on the result of sorting `words`
 */
void radixSort(std::vector<std::string> &words,
               std::vector<unsigned long> &wordCounts) {
    // Create a vector of indexes the size of the number of words we have. This
    // is the vector that will actually be sorted in the end, where each
    // element `i` of this vector is set to the index of `words` or
    // `wordCounts` that belongs in position `i` when sorted. By doing this, we
    // avoid having to pass around both the words and their corresponding
    // counts throughout the sort, and can just re-assemble the vectors at the
    // end.
    std::vector<size_t> indexVector(words.size());
    for (size_t i = 0; i < words.size(); i++) {
        indexVector.at(i) = i;
    }

    // Sort the `indexVector` vector against the `words` vector, starting with
    // depth 0 (the left-most character)
    radixSortIndexes(indexVector, words);

    // Reconstruct the `words` and `wordCounts` vectors from the list of
    // indexes, and replace the originals with the new ones
    std::vector<std::string> sortedWords;
    std::vector<unsigned long> sortedWordCounts;
    for (size_t i = 0; i < indexVector.size(); i++) {
        sortedWords.push_back(words.at(indexVector.at(i)));
        sortedWordCounts.push_back(wordCounts.at(indexVector.at(i)));
    }
    words = sortedWords;
    wordCounts = sortedWordCounts;
}
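// Hypothetical walkthrough of radixSort: with words = {"BEE", "ANT"} and
// wordCounts = {2, 7}, the index vector {0, 1} is sorted to {1, 0}, so the
// re-assembled vectors become words = {"ANT", "BEE"} and wordCounts = {7, 2};
// each count stays attached to its original word.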
OutputProcessor::OutputProcessor() {
    _fileOut = std::ofstream();
    _allWords = std::vector<std::string>();
    _uniqueWords = std::vector<std::string>();
    _letterCounts = std::vector<unsigned long>(26, 0);
    _wordCounts = std::vector<unsigned long>();
    _totalLetterCount = 0;
    _totalWordCount = 0;
}

void OutputProcessor::analyzeWords(std::vector<std::string> allWords,
                                   const std::string PUNCTUATION) {
    // Iterate over all words, processing incrementally
    for (size_t wordIdx = 0; wordIdx < allWords.size(); wordIdx++) {
        std::string &word = allWords.at(wordIdx);

        // Remove punctuation from the word
        size_t punctuationIdx = 0;
        while ((punctuationIdx = word.find_first_of(PUNCTUATION)) !=
               std::string::npos) {
            word.erase(punctuationIdx, 1);
        }

        // Save the word internally
        _allWords.push_back(word);

        // Check all unique words for a match, and if one is found increment
        // its count
        bool foundUnique = false;
        size_t uniqueWordIdx;
        for (uniqueWordIdx = 0; uniqueWordIdx < _uniqueWords.size();
             uniqueWordIdx++) {
            if (_uniqueWords.at(uniqueWordIdx) == word) {
                foundUnique = true;
                break;
            }
        }

        // If no matching unique word exists, add the word to both vectors
        if (!foundUnique) {
            _uniqueWords.push_back(word);
            _wordCounts.push_back(1);
        } else {
            _wordCounts.at(uniqueWordIdx)++;
        }

        // Add a letter count for each letter in the word
        for (size_t letterIdx = 0; letterIdx < word.length(); letterIdx++) {
            char letter = word.at(letterIdx);
            // Normalize the letter to its alphabetical index: subtracting 97
            // from a lowercase letter or 65 from an uppercase letter maps
            // 'a'/'A' to 0, 'b'/'B' to 1, and so on
            if (letter >= 'a' && letter <= 'z') {
                letter -= 97;
            } else if (letter >= 'A' && letter <= 'Z') {
                letter -= 65;
            } else {
                continue;
            }
            _letterCounts.at(letter)++;
        }

        // Sum the total letter count
        _totalLetterCount += word.length();
        // Increment the total word count
        _totalWordCount++;
    }

    radixSort(_uniqueWords, _wordCounts);
}

bool OutputProcessor::openStream() {
    std::string file;
    std::cout << "What is the name of the file you would like to write to? ";
    std::cin >> file;
    if (std::cin.fail()) {
        std::cerr << "Invalid file input" << std::endl;
        return false;
    }

    _fileOut.open(file);
    if (_fileOut.fail()) {
        std::cerr << "Unable to open file, does it exist?" << std::endl;
        return false;
    }

    return true;
}

void OutputProcessor::closeStream() { _fileOut.close(); }

void OutputProcessor::write() {
    // Calculate the longest word length, the most common word, and the least
    // common word for later use, in one pass for efficiency
    size_t longestWordLength = 0;
    size_t mostCommonWordIdx = 0;
    size_t leastCommonWordIdx = 0;
    for (size_t uniqueWordIdx = 0; uniqueWordIdx < _uniqueWords.size();
         uniqueWordIdx++) {
        std::string &uniqueWord = _uniqueWords.at(uniqueWordIdx);
        unsigned long wordCount = _wordCounts.at(uniqueWordIdx);

        if (uniqueWord.length() > longestWordLength) {
            longestWordLength = uniqueWord.length();
        }

        // Strict comparisons are used here because we want the word that was
        // encountered first, so any later ties are ignored
        if (wordCount < _wordCounts.at(leastCommonWordIdx)) {
            leastCommonWordIdx = uniqueWordIdx;
        } else if (wordCount > _wordCounts.at(mostCommonWordIdx)) {
            mostCommonWordIdx = uniqueWordIdx;
        }
    }

    _fileOut << "Read in " << _totalWordCount << " words" << std::endl;
    _fileOut << "Encountered " << _uniqueWords.size() << " unique words"
             << std::endl;

    // Print out each unique word and how often it occurred
    const size_t MOST_COMMON_WORD_COUNT_LENGTH =
        std::to_string(_wordCounts.at(mostCommonWordIdx)).length();
    for (size_t uniqueWordIdx = 0; uniqueWordIdx < _uniqueWords.size();
         uniqueWordIdx++) {
        _fileOut << std::setw(longestWordLength) << std::left
                 << _uniqueWords.at(uniqueWordIdx) << " : "
                 << std::setw(MOST_COMMON_WORD_COUNT_LENGTH) << std::right
                 << _wordCounts.at(uniqueWordIdx) << std::endl;
    }
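    // Layout sketch with hypothetical values: if longestWordLength were 5 and
    // MOST_COMMON_WORD_COUNT_LENGTH were 2, the table above would look like
    //   APPLE :  3
    //   BEE   : 12
    //   CAT   :  1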
    // Print the most and least common words
    const std::string &MOST_COMMON_WORD = _uniqueWords.at(mostCommonWordIdx);
    const std::string &LEAST_COMMON_WORD = _uniqueWords.at(leastCommonWordIdx);
    size_t longerFrequentWordLength =
        MOST_COMMON_WORD.length() > LEAST_COMMON_WORD.length()
            ? MOST_COMMON_WORD.length()
            : LEAST_COMMON_WORD.length();
    size_t mostFrequentWordCountLength =
        std::to_string(_wordCounts.at(mostCommonWordIdx)).length();
    _fileOut << " Most Frequent Word: " << std::setw(longerFrequentWordLength)
             << std::left << MOST_COMMON_WORD << " " << std::right
             << std::setw(mostFrequentWordCountLength)
             << _wordCounts.at(mostCommonWordIdx) << " (" << std::setw(7)
             << std::fixed << std::setprecision(3) << std::right
             << (float)_wordCounts.at(mostCommonWordIdx) / _totalWordCount * 100
             << "%)" << std::endl;
    _fileOut << "Least Frequent Word: " << std::setw(longerFrequentWordLength)
             << std::left << LEAST_COMMON_WORD << " " << std::right
             << std::setw(mostFrequentWordCountLength)
             << _wordCounts.at(leastCommonWordIdx) << " (" << std::setw(7)
             << std::fixed << std::setprecision(3) << std::right
             << (float)_wordCounts.at(leastCommonWordIdx) / _totalWordCount *
                    100
             << "%)" << std::endl;

    // Calculate the most and least common letters to display
    uint8_t mostCommonLetterIdx = 0;
    uint8_t leastCommonLetterIdx = 0;
    for (size_t letterIdx = 0; letterIdx < 26; letterIdx++) {
        // Strict comparisons here mean that letters later in the alphabet are
        // ignored if they occur the same number of times
        if (_letterCounts.at(letterIdx) <
            _letterCounts.at(leastCommonLetterIdx)) {
            leastCommonLetterIdx = letterIdx;
        } else if (_letterCounts.at(letterIdx) >
                   _letterCounts.at(mostCommonLetterIdx)) {
            mostCommonLetterIdx = letterIdx;
        }
    }

    // Print out each letter along with the number of times it occurs
    const size_t MOST_COMMON_LETTER_COUNT_LENGTH =
        std::to_string(_letterCounts.at(mostCommonLetterIdx)).length();
    for (size_t letterIdx = 0; letterIdx < 26; letterIdx++) {
        _fileOut << (char)(letterIdx + 65) << ": "
                 << std::setw(MOST_COMMON_LETTER_COUNT_LENGTH) << std::right
                 << _letterCounts.at(letterIdx) << std::endl;
    }

    // Print out the most and least common letters in total
    _fileOut << " Most Frequent Letter: " << (char)(mostCommonLetterIdx + 65)
             << " " << std::setw(MOST_COMMON_LETTER_COUNT_LENGTH) << std::right
             << _letterCounts.at(mostCommonLetterIdx) << " (" << std::setw(7)
             << std::fixed << std::setprecision(3)
             << ((float)_letterCounts.at(mostCommonLetterIdx) /
                 _totalLetterCount * 100)
             << "%)" << std::endl;
    _fileOut << "Least Frequent Letter: " << (char)(leastCommonLetterIdx + 65)
             << " " << std::setw(MOST_COMMON_LETTER_COUNT_LENGTH) << std::right
             << _letterCounts.at(leastCommonLetterIdx) << " (" << std::setw(7)
             << std::fixed << std::setprecision(3)
             << ((float)_letterCounts.at(leastCommonLetterIdx) /
                 _totalLetterCount * 100)
             << "%)" << std::endl;
}