// A3/OutputProcessor.cpp

#include "OutputProcessor.h"

#include <atomic>
#include <chrono>
#include <fstream>
#include <future>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <random>
#include <string>
#include <thread>
#include <utility>
#include <vector>

#include <cstdint>

// Reports whether the strings in `vector` are in non-descending order, as
// judged by std::string's comparison operators.
//
// Returns true when no element is greater than the element that follows it.
// Vectors with zero or one element are trivially sorted and return true.
bool checkSorted(const std::vector<std::string> &vector) {
	// Compare each element with its predecessor. Starting at index 1 (rather
	// than looping up to size() - 1) keeps the bound from wrapping around when
	// the vector is empty, because size() is unsigned.
	for (size_t i = 1; i < vector.size(); i++) {
		if (vector[i - 1] > vector[i]) {
			return false;
		}
	}
	return true;
}

// Sorts `vector1` into non-descending order with bozosort (swap two random
// positions, check whether the result is sorted, repeat) and applies every
// swap to `vector2` as well, so the element at index i of `vector2` keeps
// travelling with the element at index i of `vector1`.
//
// One worker per hardware thread (8 if the count is unavailable) shuffles its
// own private copy of both vectors. The first worker to reach a sorted order
// publishes its copies and every other worker stops.
//
// Precondition: `vector2` must hold at least as many elements as `vector1`.
// Otherwise the bounds-checked access throws inside a worker thread, and an
// exception escaping a thread terminates the program.
void bozosortAlignedVectors(std::vector<std::string> &vector1,
							std::vector<unsigned int> &vector2) {
	// Fewer than two elements cannot be out of order, and `size() - 1` below
	// would wrap around for an empty vector. Input that is already sorted needs
	// no shuffling either.
	if (vector1.size() < 2 || checkSorted(vector1))
		return;

	auto threadCount = std::thread::hardware_concurrency();
	if (threadCount == 0)
		threadCount = 8;

	std::atomic<bool> shouldAbort(false);

	// The winning worker publishes into these buffers instead of writing the
	// caller's vectors directly, so `vector1` and `vector2` are only ever read
	// while workers are running.
	std::vector<std::string> sortedVector1;
	std::vector<unsigned int> sortedVector2;

	std::vector<std::thread> threads;
	threads.reserve(threadCount);
	for (unsigned int i = 0; i < threadCount; i++) {
		threads.emplace_back([&vector1, &vector2, &shouldAbort, &sortedVector1,
							  &sortedVector2, i]() {
			// Private working copies for this worker
			std::vector<std::string> keys(vector1);
			std::vector<unsigned int> values(vector2);

			// Offset the clock-based seed by the worker index so workers
			// started within the same tick still get different sequences
			std::mt19937 twister(
				static_cast<std::mt19937::result_type>(
					std::chrono::steady_clock::now()
						.time_since_epoch()
						.count()) +
				i);
			std::uniform_int_distribution<size_t> dist(0, keys.size() - 1);

			do {
				if (shouldAbort.load())
					return;
				const size_t first = dist(twister);
				const size_t second = dist(twister);

				// Swap the same two positions in both vectors to keep them
				// aligned
				std::swap(keys.at(first), keys.at(second));
				std::swap(values.at(first), values.at(second));
			} while (!checkSorted(keys));

			// exchange() returns the previous value, so exactly one worker
			// observes `false` here and is allowed to publish its result
			if (!shouldAbort.exchange(true)) {
				sortedVector1 = std::move(keys);
				sortedVector2 = std::move(values);
			}
		});
	}

	for (std::thread &thread : threads) {
		thread.join();
	}

	// Workers only return once one of them has published a result, and all of
	// them have been joined, so the buffers can be handed to the caller
	vector1 = std::move(sortedVector1);
	vector2 = std::move(sortedVector2);
}

// Builds an OutputProcessor with no words recorded: an unopened output
// stream, empty word lists and word counts, 26 zeroed letter counters, and
// zeroed totals.
OutputProcessor::OutputProcessor() {
	// Running totals start from nothing
	_totalWordCount = 0;
	_totalLetterCount = 0;

	// One counter per letter of the alphabet, all starting at zero
	_letterCounts.assign(26, 0u);

	// No words have been seen yet
	_allWords.clear();
	_uniqueWords.clear();
	_wordCounts.clear();

	// Start with a stream that is not attached to any file
	_fileOut = std::ofstream{};
}

// Strips punctuation from every word in `allWords` and folds the cleaned
// words into the running statistics:
//   - each cleaned word is appended to _allWords
//   - _uniqueWords / _wordCounts (aligned by index) track how often each
//     distinct word has been seen
//   - _letterCounts tallies the letters A-Z, ignoring case
//   - _totalLetterCount and _totalWordCount are advanced
// Afterwards the unique words are sorted, with their counts kept aligned.
//
// allWords:    the words to analyze; taken by value, so the caller's vector
//              is left untouched
// PUNCTUATION: every character in this string is removed from each word
void OutputProcessor::analyzeWords(std::vector<std::string> allWords,
								   const std::string PUNCTUATION) {
	for (std::string &word : allWords) {
		// Remove punctuation from the word. Everything before the erase
		// position has already been checked, so the search resumes there
		// instead of restarting from the front.
		size_t punctuationIdx = 0;
		while ((punctuationIdx = word.find_first_of(
					PUNCTUATION, punctuationIdx)) != std::string::npos) {
			word.erase(punctuationIdx, 1);
		}

		// Save word internally
		_allWords.push_back(word);

		// Look for the word among the unique words seen so far
		size_t uniqueWordIdx = 0;
		while (uniqueWordIdx < _uniqueWords.size() &&
			   _uniqueWords[uniqueWordIdx] != word) {
			uniqueWordIdx++;
		}

		if (uniqueWordIdx == _uniqueWords.size()) {
			// First sighting: add it to both aligned vectors
			_uniqueWords.push_back(word);
			_wordCounts.push_back(1);
		} else {
			_wordCounts.at(uniqueWordIdx)++;
		}

		// Add letter count for each letter in the word
		for (const char letter : word) {
			// Map the letter to its alphabetical index regardless of case;
			// anything that is not A-Z / a-z is not tallied
			size_t alphabetIdx;
			if (letter >= 'a' && letter <= 'z') {
				alphabetIdx = static_cast<size_t>(letter - 'a');
			} else if (letter >= 'A' && letter <= 'Z') {
				alphabetIdx = static_cast<size_t>(letter - 'A');
			} else {
				continue;
			}
			_letterCounts.at(alphabetIdx)++;
		}

		// Sum total letter count. NOTE: this adds the full cleaned length, so
		// characters that are neither letters nor in PUNCTUATION (digits, for
		// example) are included in the total even though they have no entry
		// in _letterCounts.
		_totalLetterCount += word.length();

		// Increment total word count
		_totalWordCount++;
	}

	// Zero or one unique word is already in order; skipping the sort also
	// keeps an empty word list from ever reaching the shuffle.
	if (_uniqueWords.size() > 1) {
		bozosortAlignedVectors(_uniqueWords, _wordCounts);
	}
}

// Asks on standard output for a file name, reads it from standard input and
// opens that file for output.
//
// Returns true when the file was opened; returns false (after reporting on
// standard error) when the name could not be read or the file could not be
// opened.
bool OutputProcessor::openStream() {
	std::string file;
	std::cout << "What is the name of the file you would like to write to? ";

	// A failed extraction leaves the stream in a failed state
	if (!(std::cin >> file)) {
		std::cerr << "Invalid file input" << std::endl;
		return false;
	}

	_fileOut.open(file);
	if (!_fileOut) {
		std::cerr << "Unable to open file, does it exist?" << std::endl;
		return false;
	}

	return true;
}

// Closes the output file stream.
void OutputProcessor::closeStream() {
	_fileOut.close();
}

void OutputProcessor::write() {
	// Calculate longest word length, longest number length, most common word,
	// and least common word for later use in one pass for efficiency
	size_t longestWordLength = 0;

	size_t mostCommonWordIdx = 0;
	size_t leastCommonWordIdx = 0;

	for (size_t uniqueWordIdx = 0; uniqueWordIdx < _uniqueWords.size();
		 uniqueWordIdx++) {
		std::string &uniqueWord = _uniqueWords.at(uniqueWordIdx);
		unsigned long wordCount = _wordCounts.at(uniqueWordIdx);

		if (uniqueWord.length() > longestWordLength) {
			longestWordLength = uniqueWord.length();
		}

		// Equality can be ignored here because we want the word that was
		// encountered first, so any subsequent extremes can be ignored
		if (wordCount < _wordCounts.at(leastCommonWordIdx)) {
			leastCommonWordIdx = uniqueWordIdx;
		} else {
			if (wordCount > _wordCounts.at(mostCommonWordIdx)) {
				mostCommonWordIdx = uniqueWordIdx;
			}
		}
	}

	_fileOut << "Read in " << _totalWordCount << " words" << std::endl;
	_fileOut << "Encountered " << _uniqueWords.size() << " unique words"
			 << std::endl;

	// Print out each unique word and how often it happened
	const size_t MOST_COMMON_WORD_COUNT_LENGTH =
		std::to_string(_wordCounts.at(mostCommonWordIdx)).length();
	for (size_t uniqueWordIdx = 0; uniqueWordIdx < _uniqueWords.size();
		 uniqueWordIdx++) {
		_fileOut << std::setw(longestWordLength) << std::left
				 << _uniqueWords.at(uniqueWordIdx) << " : "
				 << std::setw(MOST_COMMON_WORD_COUNT_LENGTH) << std::right
				 << _wordCounts.at(uniqueWordIdx) << std::endl;
	}

	// Print the most and least common word
	const std::string &MOST_COMMON_WORD = _uniqueWords.at(mostCommonWordIdx);
	const std::string &LEAST_COMMON_WORD = _uniqueWords.at(leastCommonWordIdx);
	size_t longerFrequentWordLength =
		MOST_COMMON_WORD.length() > LEAST_COMMON_WORD.length()
			? MOST_COMMON_WORD.length()
			: LEAST_COMMON_WORD.length();
	size_t mostFrequentWordCountLength =
		std::to_string(_wordCounts.at(mostCommonWordIdx)).length();

	_fileOut << " Most Frequent Word: " << std::setw(longerFrequentWordLength)
			 << std::left << MOST_COMMON_WORD << " " << std::right
			 << std::setw(mostFrequentWordCountLength)
			 << _wordCounts.at(mostCommonWordIdx) << " (" << std::setw(7)
			 << std::fixed << std::setprecision(3) << std::right
			 << (float)_wordCounts.at(mostCommonWordIdx) / _totalWordCount * 100
			 << "%)" << std::endl;
	_fileOut << "Least Frequent Word: " << std::setw(longerFrequentWordLength)
			 << std::left << LEAST_COMMON_WORD << " " << std::right
			 << std::setw(mostFrequentWordCountLength)
			 << _wordCounts.at(leastCommonWordIdx) << " (" << std::setw(7)
			 << std::fixed << std::setprecision(3) << std::right
			 << (float)_wordCounts.at(leastCommonWordIdx) / _totalWordCount *
					100
			 << "%)" << std::endl;

	// Calculate the most and least common letters to display
	uint8_t mostCommonLetterIdx = 0;
	uint8_t leastCommonLetterIdx = 0;

	for (size_t letterIdx = 0; letterIdx < 26; letterIdx++) {
		// Here not using "or equals" means the letters later alphabetically get
		// ignored if they occur the same amount
		if (_letterCounts.at(letterIdx) <
			_letterCounts.at(leastCommonLetterIdx)) {
			leastCommonLetterIdx = letterIdx;
		} else {
			if (_letterCounts.at(letterIdx) >
				_letterCounts.at(mostCommonLetterIdx)) {
				mostCommonLetterIdx = letterIdx;
			}
		}
	}

	// Print out each letter along with the amount of times it occurs
	const size_t MOST_COMMON_LETTER_COUNT_LENGTH =
		std::to_string(_letterCounts.at(mostCommonLetterIdx)).length();
	for (size_t letterIdx = 0; letterIdx < 26; letterIdx++) {
		_fileOut << (char)(letterIdx + 65) << ": "
				 << std::setw(MOST_COMMON_LETTER_COUNT_LENGTH) << std::right
				 << _letterCounts.at(letterIdx) << std::endl;
	}

	// Print out the most and least common letters in total
	_fileOut << " Most Frequent Letter: " << (char)(mostCommonLetterIdx + 65)
			 << " " << std::setw(MOST_COMMON_LETTER_COUNT_LENGTH) << std::right
			 << _letterCounts.at(mostCommonLetterIdx) << " (" << std::setw(7)
			 << std::fixed << std::setprecision(3)
			 << ((float)_letterCounts.at(mostCommonLetterIdx) /
				 _totalLetterCount * 100)
			 << "%)" << std::endl;
	_fileOut << "Least Frequent Letter: " << (char)(leastCommonLetterIdx + 65)
			 << " " << std::setw(MOST_COMMON_LETTER_COUNT_LENGTH) << std::right
			 << _letterCounts.at(leastCommonLetterIdx) << " (" << std::setw(7)
			 << std::fixed << std::setprecision(3)
			 << ((float)_letterCounts.at(leastCommonLetterIdx) /
				 _totalLetterCount * 100)
			 << "%)" << std::endl;
}