本文是學(xué)習(xí) 多線程服務(wù)端編程的練習(xí)
書籍作者陳碩的博客也有提到這個(gè)題目
http://blog.csdn.net/solstice/article/details/8497475
第一個(gè)層次很簡(jiǎn)單 單機(jī) 一個(gè)小文件 讀進(jìn)來進(jìn)行處理 然后對(duì)每個(gè)單詞進(jìn)行統(tǒng)計(jì)排序 記錄每個(gè)單詞出現(xiàn)頻率
#include <algorithm>#include <iostream>#include <unordered_map>#include <vector>#include <sstream>#include <fstream> using namespace std; std::unordered_map<std::string, int> mapStringFrequent; void SplitStrig(std::string& line, std::vector<std::string>& vecString) { typedef std::string::size_type strSize; strSize i = 0; while (i != line.size()) { while (i != line.size() && isspace(line[i])) ++i; strSize j = i; while (j != line.size() && !isspace(line[j])) ++j; if (i != j) { vecString.push_back(line.substr(i, j - i)); i = j; } }} bool CheckWord(std::string s) { typedef std::string::size_type strSize; strSize i = 0; while (i != s.size()) { if (isalpha(s[i])) { i++; } else { return false; } } return true;} void VecInputMap(std::vector<std::string>& vecString, std::unordered_map<std::string, int>& mapStringFrequent) { for (auto it : vecString) { if (CheckWord(it)) { transform(it.begin(), it.end(), it.begin(), ::tolower); mapStringFrequent[it]++; } }} bool ReadFile(const string& fileName) { bool ret = false; std::ifstream in(fileName); if (in.bad() || !in) { return ret; } std::string line; std::vector<std::string> vecString; try { while (getline(in, line)) { //std::cout << line << std::endl; SplitStrig(line, vecString); } VecInputMap(vecString, mapStringFrequent); } catch (std::exception& e) { std::cout << line << std::endl; in.close(); std::cerr << e.what() << std::endl; return ret; } in.close(); ret = true; return ret;} void WriteToFile(const string& fileName, std::vector<std::pair<int, std::string>>& vecWordFreq) { std::ofstream out(fileName); if (out.bad() || !out) { return; } for (auto& it : vecWordFreq) { out << it.second << '/t' << it.first << std::endl; } out.close(); }//============================================================ int main(){ bool ret = ReadFile("1.txt"); if (!ret) return -1; //ret = ReadFile("2.txt"); //if (!ret) // return -1; //ret = ReadFile("3.txt"); //if (!ret) // return -1; //ret = ReadFile("4.txt"); //if (!ret) // return -1; //ret = ReadFile("5.txt"); //if (!ret) // return -1; //ret = ReadFile("6.txt"); //if (!ret) // return -1; //ret = ReadFile("7.txt"); //if (!ret) // return -1; //ret = ReadFile("8.txt"); //if (!ret) // return -1; //ret = ReadFile("9.txt"); //if (!ret) // return -1; //ret = ReadFile("10.txt"); //if (!ret) // return -1; std::vector<std::pair<int, std::string>> freq; freq.reserve(mapStringFrequent.size()); for (auto& it : mapStringFrequent){ freq.push_back(make_pair(it.second, it.first)); } std::sort(freq.begin(), freq.end(), [](const std::pair<int, std::string>& lhs, // const auto& lhs in C++14 const std::pair<int, std::string>& rhs) { return lhs.first > rhs.first; }); //for (auto it : freq) { // std::cout << it.first << '/t' << it.second << '/n'; //} WriteToFile("freqResult.txt", freq); return 0;}
第二個(gè)層次 就是文件較大 單詞量較多 如果一次性讀入并使用vector容器存放 以及使用hash容器進(jìn)行統(tǒng)計(jì)頻率 會(huì)出現(xiàn)內(nèi)存等資源不足現(xiàn)象
那么就依次讀取進(jìn)內(nèi)存 將解析的單詞存放進(jìn)vector中,當(dāng)vector中的容量到達(dá)一個(gè)閾值 將vector中的單詞放進(jìn)map容器中統(tǒng)計(jì)單詞出現(xiàn)頻率 vector容器清空
但是map容器有多個(gè) 各個(gè)單詞存放進(jìn)那個(gè)map容器根據(jù)hash函數(shù)決定
當(dāng)map容器的容量達(dá)到閾值后 寫入文件 map容器清空
這樣我們就得到多個(gè)記錄單詞出現(xiàn)頻率的文本 供后繼步驟分析合并 但是一個(gè)單詞的出現(xiàn)頻率肯定只出現(xiàn)在一個(gè)記錄文本中(因?yàn)橄嗤膯卧~具有相同的hash值 對(duì)應(yīng)同一個(gè)記錄文本)
如圖
代碼如下(記錄合并代碼后繼完成)
#include <algorithm>#include <iostream>#include <unordered_map>#include <vector>#include <sstream>#include <fstream> using namespace std; #define BUCKET_NUM 10std::unordered_map<std::string, int> mapStringFrequent[BUCKET_NUM];std::ofstream out[BUCKET_NUM]; #define MAX_WORD_SIZE 1024*1024 void SplitStrig(std::string& line, std::vector<std::string>& vecString) { typedef std::string::size_type strSize; strSize i = 0; while (i != line.size()) { while (i != line.size() && isspace(line[i])) ++i; strSize j = i; while (j != line.size() && !isspace(line[j])) ++j; if (i != j) { vecString.push_back(line.substr(i, j - i)); i = j; } }} bool CheckWord(std::string s) { typedef std::string::size_type strSize; strSize i = 0; while (i != s.size()) { if (isalpha(s[i])) { i++; } else { return false; } } return true;} void VecInputMap(std::vector<std::string>& vecString, std::unordered_map<std::string, int> mapStringFrequent[]) { for (auto it : vecString) { if (CheckWord(it)) { transform(it.begin(), it.end(), it.begin(), ::tolower); int idx = std::hash<string>()(it) % BUCKET_NUM; mapStringFrequent[idx][it]++; if (mapStringFrequent[idx].size() >= MAX_WORD_SIZE) { for (auto& it : mapStringFrequent[idx]) { out[idx] << it.second << '/t' << it.first << std::endl; } mapStringFrequent[idx].clear(); } } }} bool ReadFile(const string& fileName) { bool ret = false; std::ifstream in(fileName); if (in.bad() || !in) { return ret; } std::string line; std::vector<std::string> vecString; try { while (getline(in, line)) { SplitStrig(line, vecString); //std::cout << line << std::endl; if (vecString.size() >= MAX_WORD_SIZE) { //std::cout << "too many words" << std::endl; VecInputMap(vecString, mapStringFrequent); vecString.clear(); } } for (int i = 0; i < BUCKET_NUM; i++) { for (auto& it : mapStringFrequent[i]) { out[i] << it.second << '/t' << it.first << std::endl; } mapStringFrequent[i].clear(); } } catch (std::exception& e) { in.close(); std::cerr << e.what() << std::endl; return ret; } std::cout << vecString.size() << std::endl; ret = true; return ret;} int main(){ std::string fileName = "0bucket.txt"; for (int i = 0; i < BUCKET_NUM; i++) { out[i].close(); out[i].open(fileName); fileName[0]++; } ReadFile("LargeFile.txt"); return 0;}運(yùn)行效果如圖
運(yùn)行前

運(yùn)行后

合并步驟如圖:


合并代碼如下// WordFrequentMerge.cpp : 定義控制臺(tái)應(yīng)用程序的入口點(diǎn)。//#include "stdafx.h"#include <algorithm>#include <iostream>#include <unordered_map>#include <deque>#include <vector>#include <sstream>#include <fstream>using namespace std;#define DEFAULT_FILE_NAME "0bucket.txt"#define BUCKET_NUM 10std::ifstream in[BUCKET_NUM];std::deque<std::pair<int, std::string>> deqFreqRecord[BUCKET_NUM];void OpenFile() { std::string fileName = "0bucket.txt"; for (int i = 0; i < BUCKET_NUM; i++) { in[i].close(); in[i].open(fileName); if (in[i].bad() || !in[i]) { std::cerr << "open file error.exit!!" << std::endl; return; } fileName[0]++; }}void ReadSortEle_(std::ifstream& in, std::deque<std::pair<int, std::string>>& d) { std::string line,word; int freq; try { while (getline(in, line)) { std::string::size_type pos = line.find('/t'); if (pos != std::string::npos) { freq = std::stoi(line.substr(0, pos)); word = line.substr(pos + 1); //std::cout << word << "/t" << freq << std::endl; d.push_back(std::make_pair(freq, word)); } } } catch (std::exception& e) { std::cout << e.what() << std::endl; return; } std::sort(d.begin(),d.end(), [](const std::pair<int, std::string>& lhs, const std::pair<int, std::string>& rhs) { return lhs.first > rhs.first; });}void ReadSortEle(std::ifstream in[], std::deque<std::pair<int, std::string>> d[]) { for (int i = 0; i < BUCKET_NUM; i++) { ReadSortEle_(in[i],d[i]); }}bool HeapCompareFunc(std::pair<int, std::string>& l, std::pair<int, std::string>& r) { return l.first < r.first;}void MergeFrequent(std::deque<std::pair<int, std::string>> d[]) { vector<std::pair<int, std::string>> vecHeap; for (int i = 0; i < BUCKET_NUM; i++) { if (!d[i].empty()) { vecHeap.push_back(d[i].front()); d[i].pop_front(); } } std::make_heap(vecHeap.begin(), vecHeap.end(), HeapCompareFunc); while (!vecHeap.empty()) { std::pop_heap(vecHeap.begin(), vecHeap.end(), HeapCompareFunc); std::pair<int, std::string> p = vecHeap.back(); vecHeap.pop_back(); std::cout << p.second << "/t" << p.first << std::endl; int idx = std::hash<string>()(p.second) % BUCKET_NUM; if (!d[idx].empty()) { vecHeap.push_back(d[idx].front()); std::push_heap(vecHeap.begin(), vecHeap.end(), HeapCompareFunc); d[idx].pop_front(); } }}int main(){ OpenFile(); ReadSortEle(in, deqFreqRecord); MergeFrequent(deqFreqRecord); return 0;}代碼還有諸多待優(yōu)化地方
比如讀入單詞后 另開線程處理而不是等待處理完畢后再讀取單詞
比如數(shù)據(jù)的傳遞 考慮傳遞指針和起始位置 而不是傳遞拷貝
技術(shù)博客 http://blog.csdn.net/stecdeng 技術(shù)交流群 群號(hào)碼:324164944 歡迎c c++ windows驅(qū)動(dòng)愛好者 服務(wù)器程序員溝通交流
新聞熱點(diǎn)
疑難解答
圖片精選
網(wǎng)友關(guān)注