A3/OutputProcessor.cpp

280 lines
8.7 KiB
C++
Raw Normal View History

2024-10-07 02:08:54 -06:00
#include "OutputProcessor.h"
2024-10-09 19:44:09 -06:00
#include <atomic>
#include <chrono>
2024-10-07 02:08:54 -06:00
#include <fstream>
2024-10-09 19:44:09 -06:00
#include <future>
#include <iomanip>
2024-10-07 02:08:54 -06:00
#include <iostream>
#include <ostream>
2024-10-09 19:44:09 -06:00
#include <random>
#include <string>
2024-10-09 19:44:09 -06:00
#include <thread>
#include <utility>
2024-10-07 02:08:54 -06:00
#include <vector>
2024-10-09 17:20:26 -06:00
#include <cstdint>
2024-10-09 19:44:09 -06:00
// Reports whether the strings are in non-descending order according to
// std::string's operator> (so equal neighbours count as sorted).
//
// vector: the strings to inspect; it is not modified.
// Returns true when no element is greater than its successor. Empty and
// single-element vectors are trivially sorted.
bool checkSorted(const std::vector<std::string> &vector) {
    // Compare every element with its predecessor. Starting at index 1 keeps
    // the loop bound free of `size() - 1`, which wraps around to a huge
    // value when the vector is empty.
    for (size_t i = 1; i < vector.size(); i++) {
        if (vector[i - 1] > vector[i]) {
            return false;
        }
    }
    return true;
}
// Sorts vector1 into the order accepted by checkSorted using a parallel
// bozosort, applying every swap to vector2 as well so that the element at
// index i of vector2 keeps travelling with the element at index i of vector1.
//
// Each worker thread shuffles its own private copy of both vectors by
// swapping two random positions until the copy is sorted. The first worker
// to finish publishes its copies back into the caller's vectors and tells the
// others to stop.
//
// vector1: strings to sort; overwritten with the sorted result.
// vector2: values aligned with vector1. It is indexed with vector1's
//          positions via at(), so it must be at least as long as vector1;
//          a shorter vector2 makes at() throw inside a worker thread.
//
// The expected running time grows factorially with the element count; this
// is a deliberately silly algorithm.
void bozosortAlignedVectors(std::vector<std::string> &vector1,
                            std::vector<unsigned int> &vector2) {
    // Zero or one element is already sorted. Returning here also avoids
    // building a distribution over [0, size() - 1], whose upper bound wraps
    // around when the vector is empty.
    if (vector1.size() < 2)
        return;

    auto threadCount = std::thread::hardware_concurrency();
    // hardware_concurrency() may return 0; the original fallback of 8 is kept
    if (threadCount == 0)
        threadCount = 8;

    std::atomic<bool> shouldAbort(false);
    std::vector<std::thread> threads{};
    threads.reserve(threadCount);
    for (unsigned int i = 0; i < threadCount; i++) {
        // vector1 and vector2 are captured BY VALUE: the copies are made here
        // on the calling thread, before any worker can publish a result, so
        // no worker ever reads the caller's vectors while another writes them.
        std::thread t(
            [vector1, vector2, &shouldAbort,
             i](std::vector<std::string> *vector1Original,
                std::vector<unsigned int> *vector2Original) mutable {
                // Offset the clock-based seed by the thread index so threads
                // started within the same tick still shuffle differently.
                std::mt19937 twister(static_cast<std::mt19937::result_type>(
                    std::chrono::steady_clock::now()
                        .time_since_epoch()
                        .count() +
                    i));
                std::uniform_int_distribution<size_t> dist(0,
                                                           vector1.size() - 1);
                // Test before shuffling so an already sorted input is left
                // untouched instead of being scrambled first.
                while (!checkSorted(vector1)) {
                    if (shouldAbort.load())
                        return;
                    const size_t first = dist(twister);
                    const size_t second = dist(twister);
                    if (first != second) {
                        std::swap(vector1.at(first), vector1.at(second));
                        // Mirror the swap in the aligned vector
                        std::swap(vector2.at(first), vector2.at(second));
                    }
                }
                // exchange() returns the previous value, so exactly one
                // worker sees `false` and is allowed to write the result.
                if (!shouldAbort.exchange(true)) {
                    *vector1Original = std::move(vector1);
                    *vector2Original = std::move(vector2);
                }
            },
            &vector1, &vector2);
        threads.push_back(std::move(t));
    }
    // Joining every worker guarantees the published result is visible to the
    // caller and that no thread outlives shouldAbort.
    for (std::thread &worker : threads) {
        worker.join();
    }
}
2024-10-07 02:08:54 -06:00
// Builds a processor with no analysed words: an unopened output stream,
// empty word lists, zeroed totals, and one zeroed counter per letter of the
// alphabet.
OutputProcessor::OutputProcessor() {
    // Running totals start at zero
    _totalWordCount = 0;
    _totalLetterCount = 0;

    // Word bookkeeping starts out empty
    _allWords.clear();
    _uniqueWords.clear();
    _wordCounts.clear();

    // 26 slots, one per letter, all initially zero
    _letterCounts.assign(26, 0U);

    // Fresh stream that is not attached to any file yet
    _fileOut = std::ofstream{};
}
void OutputProcessor::analyzeWords(std::vector<std::string> allWords,
2024-10-09 17:20:26 -06:00
const std::string PUNCTUATION) {
// Iterate over all words, processing incrementally
for (size_t wordIdx = 0; wordIdx < allWords.size(); wordIdx++) {
std::string &word = allWords.at(wordIdx);
// Remove punctuation from word
size_t punctuationIdx = 0;
2024-10-09 17:20:26 -06:00
while ((punctuationIdx = word.find_first_of(PUNCTUATION)) !=
std::string::npos) {
word.erase(punctuationIdx, 1);
}
// Save word internally
_allWords.push_back(word);
// Check all unique words for a match, and if so increment the count
bool foundUnique = false;
size_t uniqueWordIdx;
for (uniqueWordIdx = 0; uniqueWordIdx < _uniqueWords.size();
uniqueWordIdx++) {
if (_uniqueWords.at(uniqueWordIdx) == word) {
foundUnique = true;
break;
}
}
// If no unique word exists, add it to both vectors
if (!foundUnique) {
_uniqueWords.push_back(word);
_wordCounts.push_back(1);
} else {
_wordCounts.at(uniqueWordIdx)++;
}
// Add letter count for each letter in the word
for (size_t letterIdx = 0; letterIdx < word.length(); letterIdx++) {
char letter = word.at(letterIdx);
// Normalize to uppercase
if (letter >= 'a' && letter <= 'z') {
2024-10-09 17:20:26 -06:00
letter -= 97;
} else {
if (letter >= 'A' && letter <= 'Z') {
letter -= 65;
} else {
continue;
}
}
// Subtracting an uppercase letter by 65 creates its alphabetical
// index
_letterCounts.at(letter)++;
}
// Sum total letter count
_totalLetterCount += word.length();
// Increment total word count
_totalWordCount++;
}
2024-10-09 19:44:09 -06:00
// :3
bozosortAlignedVectors(_uniqueWords, _wordCounts);
2024-10-07 02:08:54 -06:00
}
// Prompts on standard output for a file name, reads it from standard input,
// and opens _fileOut on that name.
//
// Returns true when the stream was opened; false (after printing a message
// to standard error) when reading the name or opening the file failed.
bool OutputProcessor::openStream() {
    std::cout << "What is the name of the file you would like to write to? ";

    std::string outputPath;
    // Extraction and the failure check in a single step
    if (!(std::cin >> outputPath)) {
        std::cerr << "Invalid file input" << std::endl;
        return false;
    }

    _fileOut.open(outputPath);
    const bool opened = !_fileOut.fail();
    if (!opened) {
        std::cerr << "Unable to open file, does it exist?" << std::endl;
    }
    return opened;
}
// Closes the output file stream held in _fileOut.
void OutputProcessor::closeStream() {
    _fileOut.close();
}
void OutputProcessor::write() {
// Calculate longest word length, longest number length, most common word,
// and least common word for later use in one pass for efficiency
size_t longestWordLength = 0;
2024-10-09 17:20:26 -06:00
size_t mostCommonWordIdx = 0;
size_t leastCommonWordIdx = 0;
for (size_t uniqueWordIdx = 0; uniqueWordIdx < _uniqueWords.size();
uniqueWordIdx++) {
std::string &uniqueWord = _uniqueWords.at(uniqueWordIdx);
unsigned long wordCount = _wordCounts.at(uniqueWordIdx);
if (uniqueWord.length() > longestWordLength) {
longestWordLength = uniqueWord.length();
}
// Equality can be ignored here because we want the word that was
// encountered first, so any subsequent extremes can be ignored
2024-10-09 17:20:26 -06:00
if (wordCount < _wordCounts.at(leastCommonWordIdx)) {
leastCommonWordIdx = uniqueWordIdx;
} else {
2024-10-09 17:20:26 -06:00
if (wordCount > _wordCounts.at(mostCommonWordIdx)) {
mostCommonWordIdx = uniqueWordIdx;
}
}
}
_fileOut << "Read in " << _totalWordCount << " words" << std::endl;
_fileOut << "Encountered " << _uniqueWords.size() << " unique words"
<< std::endl;
// Print out each unique word and how often it happened
2024-10-09 17:20:26 -06:00
const size_t MOST_COMMON_WORD_COUNT_LENGTH =
std::to_string(_wordCounts.at(mostCommonWordIdx)).length();
for (size_t uniqueWordIdx = 0; uniqueWordIdx < _uniqueWords.size();
uniqueWordIdx++) {
_fileOut << std::setw(longestWordLength) << std::left
2024-10-09 17:20:26 -06:00
<< _uniqueWords.at(uniqueWordIdx) << " : "
<< std::setw(MOST_COMMON_WORD_COUNT_LENGTH) << std::right
<< _wordCounts.at(uniqueWordIdx) << std::endl;
}
// Print the most and least common word
2024-10-09 17:20:26 -06:00
const std::string &MOST_COMMON_WORD = _uniqueWords.at(mostCommonWordIdx);
const std::string &LEAST_COMMON_WORD = _uniqueWords.at(leastCommonWordIdx);
size_t longerFrequentWordLength =
2024-10-09 17:20:26 -06:00
MOST_COMMON_WORD.length() > LEAST_COMMON_WORD.length()
? MOST_COMMON_WORD.length()
: LEAST_COMMON_WORD.length();
size_t mostFrequentWordCountLength =
std::to_string(_wordCounts.at(mostCommonWordIdx)).length();
_fileOut << " Most Frequent Word: " << std::setw(longerFrequentWordLength)
2024-10-09 17:20:26 -06:00
<< std::left << MOST_COMMON_WORD << " " << std::right
<< std::setw(mostFrequentWordCountLength)
<< _wordCounts.at(mostCommonWordIdx) << " (" << std::setw(7)
<< std::fixed << std::setprecision(3) << std::right
<< (float)_wordCounts.at(mostCommonWordIdx) / _totalWordCount * 100
<< "%)" << std::endl;
_fileOut << "Least Frequent Word: " << std::setw(longerFrequentWordLength)
<< std::left << LEAST_COMMON_WORD << " " << std::right
<< std::setw(mostFrequentWordCountLength)
<< _wordCounts.at(leastCommonWordIdx) << " (" << std::setw(7)
<< std::fixed << std::setprecision(3) << std::right
<< (float)_wordCounts.at(leastCommonWordIdx) / _totalWordCount *
100
<< "%)" << std::endl;
2024-10-09 17:20:26 -06:00
// Calculate the most and least common letters to display
uint8_t mostCommonLetterIdx = 0;
uint8_t leastCommonLetterIdx = 0;
for (size_t letterIdx = 0; letterIdx < 26; letterIdx++) {
// Here not using "or equals" means the letters later alphabetically get
// ignored if they occur the same amount
if (_letterCounts.at(letterIdx) <
2024-10-09 17:20:26 -06:00
_letterCounts.at(leastCommonLetterIdx)) {
leastCommonLetterIdx = letterIdx;
} else {
if (_letterCounts.at(letterIdx) >
2024-10-09 17:20:26 -06:00
_letterCounts.at(mostCommonLetterIdx)) {
mostCommonLetterIdx = letterIdx;
}
}
}
// Print out each letter along with the amount of times it occurs
2024-10-09 17:20:26 -06:00
const size_t MOST_COMMON_LETTER_COUNT_LENGTH =
std::to_string(_letterCounts.at(mostCommonLetterIdx)).length();
for (size_t letterIdx = 0; letterIdx < 26; letterIdx++) {
_fileOut << (char)(letterIdx + 65) << ": "
2024-10-09 17:20:26 -06:00
<< std::setw(MOST_COMMON_LETTER_COUNT_LENGTH) << std::right
<< _letterCounts.at(letterIdx) << std::endl;
}
// Print out the most and least common letters in total
2024-10-09 17:20:26 -06:00
_fileOut << " Most Frequent Letter: " << (char)(mostCommonLetterIdx + 65)
<< " " << std::setw(MOST_COMMON_LETTER_COUNT_LENGTH) << std::right
<< _letterCounts.at(mostCommonLetterIdx) << " (" << std::setw(7)
<< std::fixed << std::setprecision(3)
2024-10-09 17:20:26 -06:00
<< ((float)_letterCounts.at(mostCommonLetterIdx) /
_totalLetterCount * 100)
<< "%)" << std::endl;
2024-10-09 17:20:26 -06:00
_fileOut << "Least Frequent Letter: " << (char)(leastCommonLetterIdx + 65)
<< " " << std::setw(MOST_COMMON_LETTER_COUNT_LENGTH) << std::right
<< _letterCounts.at(leastCommonLetterIdx) << " (" << std::setw(7)
<< std::fixed << std::setprecision(3)
2024-10-09 17:20:26 -06:00
<< ((float)_letterCounts.at(leastCommonLetterIdx) /
_totalLetterCount * 100)
<< "%)" << std::endl;
2024-10-07 02:08:54 -06:00
}