国产探花免费观看_亚洲丰满少妇自慰呻吟_97日韩有码在线_资源在线日韩欧美_一区二区精品毛片,辰东完美世界有声小说,欢乐颂第一季,yy玄幻小说排行榜完本

首頁 > 學(xué)院 > 開發(fā)設(shè)計(jì) > 正文

統(tǒng)計(jì)單詞出現(xiàn)頻率及排序 從單機(jī)到多機(jī)合作

2019-11-08 02:06:48
字體:
供稿:網(wǎng)友

本文是學(xué)習(xí) 多線程服務(wù)端編程的練習(xí)

書籍作者陳碩的博客也有提到這個(gè)題目

http://blog.csdn.net/solstice/article/details/8497475

 

第一個(gè)層次很簡(jiǎn)單 單機(jī) 一個(gè)小文件 讀進(jìn)來進(jìn)行處理 然后對(duì)每個(gè)單詞進(jìn)行統(tǒng)計(jì)排序 記錄每個(gè)單詞出現(xiàn)頻率

#include <algorithm>#include <iostream>#include <unordered_map>#include <vector>#include <sstream>#include <fstream> using namespace std; std::unordered_map<std::string, int> mapStringFrequent;  void SplitStrig(std::string& line, std::vector<std::string>& vecString) {    typedef std::string::size_type strSize;    strSize i = 0;    while (i != line.size()) {        while (i != line.size() && isspace(line[i]))            ++i;        strSize j = i;        while (j != line.size() && !isspace(line[j]))            ++j;        if (i != j) {            vecString.push_back(line.substr(i, j - i));            i = j;        }    }} bool CheckWord(std::string s) {    typedef std::string::size_type strSize;    strSize i = 0;    while (i != s.size()) {        if (isalpha(s[i])) {            i++;        }        else {            return false;        }    }    return true;}   void VecInputMap(std::vector<std::string>& vecString, std::unordered_map<std::string, int>& mapStringFrequent) {    for (auto it : vecString) {        if (CheckWord(it)) {            transform(it.begin(), it.end(), it.begin(), ::tolower);            mapStringFrequent[it]++;        }    }}  bool ReadFile(const string& fileName) {    bool ret = false;    std::ifstream in(fileName);    if (in.bad() || !in) {        return ret;    }    std::string line;    std::vector<std::string> vecString;    try {        while (getline(in, line))        {            //std::cout << line <<  std::endl;            SplitStrig(line, vecString);        }         VecInputMap(vecString, mapStringFrequent);    }    catch (std::exception& e) {        std::cout << line << std::endl;        in.close();        std::cerr << e.what() << std::endl;        return ret;    }     in.close();    ret = true;    return ret;}  void WriteToFile(const string& fileName, std::vector<std::pair<int, std::string>>& vecWordFreq) {    std::ofstream out(fileName);    if (out.bad() || !out) {        return;    }     for (auto& it : vecWordFreq) {        out << it.second << '/t' << it.first << std::endl;    }    out.close(); }//============================================================ int main(){    bool ret = ReadFile("1.txt");    if (!ret)        return -1;     //ret = ReadFile("2.txt");    //if (!ret)    //  return -1;    //ret = ReadFile("3.txt");    //if (!ret)    //  return -1;    //ret = ReadFile("4.txt");    //if (!ret)    //  return -1;     //ret = ReadFile("5.txt");    //if (!ret)    //  return -1;     //ret = ReadFile("6.txt");    //if (!ret)    //  return -1;     //ret = ReadFile("7.txt");    //if (!ret)    //  return -1;     //ret = ReadFile("8.txt");    //if (!ret)    //  return -1;     //ret = ReadFile("9.txt");    //if (!ret)    //  return -1;     //ret = ReadFile("10.txt");    //if (!ret)    //  return -1;     std::vector<std::pair<int, std::string>> freq;    freq.reserve(mapStringFrequent.size());     for (auto& it : mapStringFrequent){        freq.push_back(make_pair(it.second, it.first));    }     std::sort(freq.begin(), freq.end(), [](const std::pair<int, std::string>& lhs,  // const auto& lhs in C++14            const std::pair<int, std::string>& rhs) {        return lhs.first > rhs.first;    });     //for (auto it : freq) {    //  std::cout << it.first << '/t' << it.second << '/n';    //}     WriteToFile("freqResult.txt", freq);    return 0;}

第二個(gè)層次 就是文件較大 單詞量較多 如果一次性讀入并使用vector容器存放 以及使用hash容器進(jìn)行統(tǒng)計(jì)頻率 會(huì)出現(xiàn)內(nèi)存等資源不足現(xiàn)象

那么就依次讀取進(jìn)內(nèi)存 將解析的單詞存放進(jìn)vector中,當(dāng)vector中的容量到達(dá)一個(gè)閾值 將vector中的單詞放進(jìn)map容器中統(tǒng)計(jì)單詞出現(xiàn)頻率 vector容器清空

但是map容器有多個(gè) 各個(gè)單詞存放進(jìn)那個(gè)map容器根據(jù)hash函數(shù)決定  

當(dāng)map容器的容量達(dá)到閾值后 寫入文件 map容器清空

這樣我們就得到多個(gè)記錄單詞出現(xiàn)頻率的文本 供后繼步驟分析合并 但是一個(gè)單詞的出現(xiàn)頻率肯定只出現(xiàn)在一個(gè)記錄文本中(因?yàn)橄嗤膯卧~具有相同的hash值 對(duì)應(yīng)同一個(gè)記錄文本)

如圖

 

代碼如下(記錄合并代碼后繼完成)

#include <algorithm>#include <iostream>#include <unordered_map>#include <vector>#include <sstream>#include <fstream> using namespace std; #define BUCKET_NUM  10std::unordered_map<std::string, int> mapStringFrequent[BUCKET_NUM];std::ofstream out[BUCKET_NUM];  #define MAX_WORD_SIZE   1024*1024  void SplitStrig(std::string& line, std::vector<std::string>& vecString) {    typedef std::string::size_type strSize;    strSize i = 0;    while (i != line.size()) {        while (i != line.size() && isspace(line[i]))            ++i;        strSize j = i;        while (j != line.size() && !isspace(line[j]))            ++j;        if (i != j) {            vecString.push_back(line.substr(i, j - i));            i = j;        }    }} bool CheckWord(std::string s) {    typedef std::string::size_type strSize;    strSize i = 0;    while (i != s.size()) {        if (isalpha(s[i])) {            i++;        }        else {            return false;        }    }    return true;} void VecInputMap(std::vector<std::string>& vecString, std::unordered_map<std::string, int> mapStringFrequent[]) {    for (auto it : vecString) {        if (CheckWord(it)) {            transform(it.begin(), it.end(), it.begin(), ::tolower);            int idx = std::hash<string>()(it) % BUCKET_NUM;            mapStringFrequent[idx][it]++;            if (mapStringFrequent[idx].size() >= MAX_WORD_SIZE) {                for (auto& it : mapStringFrequent[idx]) {                    out[idx] << it.second << '/t' << it.first << std::endl;                }                mapStringFrequent[idx].clear();            }        }    }} bool ReadFile(const string& fileName) {    bool ret = false;    std::ifstream in(fileName);    if (in.bad() || !in) {        return ret;    }    std::string line;    std::vector<std::string> vecString;     try {        while (getline(in, line))        {            SplitStrig(line, vecString);            //std::cout << line << std::endl;            if (vecString.size() >= MAX_WORD_SIZE) {                //std::cout << "too many words" << std::endl;                VecInputMap(vecString, mapStringFrequent);                vecString.clear();            }        }         for (int i = 0; i < BUCKET_NUM; i++) {            for (auto& it : mapStringFrequent[i]) {                out[i] << it.second << '/t' << it.first << std::endl;            }            mapStringFrequent[i].clear();        }     }    catch (std::exception& e) {        in.close();        std::cerr << e.what() << std::endl;        return ret;    }     std::cout << vecString.size() << std::endl;    ret = true;    return ret;} int main(){    std::string fileName = "0bucket.txt";    for (int i = 0; i < BUCKET_NUM; i++) {        out[i].close();        out[i].open(fileName);        fileName[0]++;    }     ReadFile("LargeFile.txt");          return 0;}

運(yùn)行效果如圖

運(yùn)行前

運(yùn)行后

合并步驟如圖:合并代碼如下
// WordFrequentMerge.cpp : 定義控制臺(tái)應(yīng)用程序的入口點(diǎn)。//#include "stdafx.h"#include <algorithm>#include <iostream>#include <unordered_map>#include <deque>#include <vector>#include <sstream>#include <fstream>using namespace std;#define DEFAULT_FILE_NAME	"0bucket.txt"#define BUCKET_NUM	10std::ifstream in[BUCKET_NUM];std::deque<std::pair<int, std::string>> deqFreqRecord[BUCKET_NUM];void OpenFile() {	std::string fileName = "0bucket.txt";	for (int i = 0; i < BUCKET_NUM; i++) {		in[i].close();		in[i].open(fileName);		if (in[i].bad() || !in[i]) {			std::cerr << "open file error.exit!!" << std::endl;			return;		}		fileName[0]++;	}}void ReadSortEle_(std::ifstream& in, std::deque<std::pair<int, std::string>>& d) {	std::string line,word;	int freq;	try {		while (getline(in, line)) {			std::string::size_type pos = line.find('/t');			if (pos != std::string::npos) {				freq = std::stoi(line.substr(0, pos));				word = line.substr(pos + 1);				//std::cout << word << "/t" << freq << std::endl;				d.push_back(std::make_pair(freq, word));			}		}	}	catch (std::exception& e) {		std::cout << e.what() << std::endl;		return;	}	std::sort(d.begin(),d.end(), [](const std::pair<int, std::string>& lhs,  		const std::pair<int, std::string>& rhs) {		return lhs.first > rhs.first;	});}void ReadSortEle(std::ifstream in[], std::deque<std::pair<int, std::string>> d[]) {	for (int i = 0; i < BUCKET_NUM; i++) {		ReadSortEle_(in[i],d[i]);	}}bool HeapCompareFunc(std::pair<int, std::string>& l,	std::pair<int, std::string>& r) {	return l.first < r.first;}void MergeFrequent(std::deque<std::pair<int, std::string>> d[]) {	vector<std::pair<int, std::string>> vecHeap;	for (int i = 0; i < BUCKET_NUM; i++) {		if (!d[i].empty()) {			vecHeap.push_back(d[i].front());			d[i].pop_front();		}	}	std::make_heap(vecHeap.begin(), vecHeap.end(), HeapCompareFunc);	while (!vecHeap.empty()) {		std::pop_heap(vecHeap.begin(), vecHeap.end(), HeapCompareFunc);		std::pair<int, std::string> p = vecHeap.back();		vecHeap.pop_back();		std::cout << p.second << "/t" << p.first << std::endl;		int idx = std::hash<string>()(p.second) % BUCKET_NUM;				if (!d[idx].empty()) {			vecHeap.push_back(d[idx].front());			std::push_heap(vecHeap.begin(), vecHeap.end(), HeapCompareFunc);			d[idx].pop_front();		}	}}int main(){	OpenFile();	ReadSortEle(in, deqFreqRecord);	MergeFrequent(deqFreqRecord);    return 0;}

代碼還有諸多待優(yōu)化地方

比如讀入單詞后 另開線程處理而不是等待處理完畢后再讀取單詞

比如數(shù)據(jù)的傳遞 考慮傳遞指針和起始位置 而不是傳遞拷貝

技術(shù)博客 http://blog.csdn.net/stecdeng 技術(shù)交流群 群號(hào)碼:324164944 歡迎c c++ windows驅(qū)動(dòng)愛好者 服務(wù)器程序員溝通交流


發(fā)表評(píng)論 共有條評(píng)論
用戶名: 密碼:
驗(yàn)證碼: 匿名發(fā)表
主站蜘蛛池模板: 静乐县| 射阳县| 林芝县| 叶城县| 左云县| 淅川县| 苏尼特左旗| 措勤县| 阿拉善左旗| 荥经县| 馆陶县| 吴川市| 牙克石市| 清丰县| 莱芜市| 绍兴县| 如皋市| 雷山县| 尉犁县| 怀安县| 汤原县| 通山县| 乌拉特后旗| 东乡族自治县| 德州市| 岳普湖县| 定远县| 泌阳县| 融水| 宜兰市| 化隆| 五峰| 平和县| 屏东市| 遵化市| 乌兰县| 尤溪县| 上饶县| 宿州市| 婺源县| 长春市|