C++ Day10 统计圣经文本的词频 &文本查询程序

一、编程题--统计圣经出现的单词以及词频

统计一篇英文(The_Holy_Bible.txt)文章中出现的单词和词频，输入：某篇文章的绝对路径；输出：词典（词典中的内容为每一行都是一个“单词词频”）

词典的存储格式：

 1 |   a 66          |
 2 |   abandon 77    |
 3 |   public 88     |
 4 |    ......       |
 5 |_________________|
 6  
 7 struct Record
 8 {
 9     string _word;
10     int _frequency;
11 };
12  
13 class Dictionary
14 {
15 public:
16     //......
17     void read(const std::string &filename);
18     void store(const std::string &filename);
19 private:
20     vector<Record> _dict;
21 };

要将每一个单词及其出现次数存入vector中，然后遍历vector将单词及频率存储到另一个文件，从The_Holy_Bible.txt中读取单词，若是一个个读，速度较慢，可以先使用 getline ( )读取一行数据

到一个line中,再用一个istringstream 读到每一个单词里面去，读取到的单词可能不是标准单词，比如：abc123 abc？这样的字符串，使用 dealWord函数处理，忽略掉不标准的单词

 1 class Dictionary{
 2 public:
 3     Dictionary(int capa){
 4         m_dict.reserve(capa);  //预留空间
 5     }
 6 
 7     void read(const string &filename){
 8         ifstream ifs(filename);
 9         if(!ifs.good()){
10             cerr << "open" << filename << " error" << endl;
11             return;
12         }
13         //读filename这个文件，然后对每一个单词做处理
14         
15         //逐个单词去读，IO次数过多。所以可读一行到缓冲区, 再用一个istringstream 读到每一个单词里面去，这样比逐个单词读快
16         string line;
17         while(getline(ifs, line)){   //读一行，读到line
18             //再对一行数据的单词处理
19             istringstream iss(line);
20             string word;
21             //用iss每读一个，写道一个单词里面去
22             while(iss >> word){
23                 //处理成正确单词
24                 string newWord = dealWord(word);
25                 //将满足条件的单词存到vevtor中 
26                 insert(newWord);   
27             }
28         }
29 
30         ifs.close();
31     }
32 
33     void store(const string &filename){
34         ofstream ofs(filename);
35         if(!ofs.good()){
36             cerr << "open" << filename << " error" << endl;
37             return;
38         }
39         for(size_t idx = 0; idx != m_dict.size(); ++idx){
40             ofs << m_dict[idx]._word << "      " << m_dict[idx]._frequence << endl;
41         }
42         ofs.close();
43     }
44 
45     string dealWord(const string &word){
46         for(size_t idx = 0; idx != word.size(); ++idx){
47 
48             if(!isalpha(word[idx])){
49                 return string();  //返回一个空串
50             }
51         }
52         return word;
53     }
54 
55     void insert(const string &word){　　//存入vector
56         if(word == string()){ //传入空串
57             return;
58         }
59         size_t idx = 0;
60         for(idx = 0; idx != m_dict.size(); ++idx){
61 
62             if(word == m_dict[idx]._word){  //单词出现了
63                 m_dict[idx]._frequence++;
64                 break;  //频率++后不必再向后循环
65             }
66         }
67 
68         if(idx == m_dict.size()){  //单词第一次出现 vector为空时 & 遍历完还没找到
69             m_dict.push_back (Record(word, 1));
70         }
71     }
72 
73 private:
74     vector<Record> m_dict;
75 };

二、使用map容器实现词频统计

 1 class Dictionary{ 
 2 public:
 3 
 4     void read(const string &filename){
 5         ifstream ifs(filename);
 6         if(!ifs.good()){
 7             cerr << "open" << filename << " fail" << endl;
 8             return;
 9         }
10 
11         string line;
12         while(ifs >> line){
13             istringstream iss(line);
14             string word;
15 
16             while(iss >> word){
17                 string newWord = dealWord(word);
18                 if(newWord != string()){  //不为空串
19                     ++m_dict[newWord];  //不存在单词时，map会将其添加 (word作为key，value及频率++)
20                 }
21             }
22         }
23     }
24     
25     //存储到文件
26     void store(const string &filename){
27         ofstream ofs(filename);
28         if(!ofs.good()){
29             cerr << "ofs open " << filename << " error" << endl;
30             return;
31         }
32 
33         map<string, int>::iterator it;
34         for(it = m_dict.begin(); it != m_dict.end(); it++){
35             ofs << it->first << "   " << it->second << endl;
36         }
37 
38         ofs.close();
39     }
40 
41     string dealWord(const string &word){
42         for(size_t idx = 0; idx != word.size(); idx++){
43             if(!isalpha(word[idx])){
44                 return string();  //不是字母，返回空串
45             }
46         }
47         return word;
48     }
49 
50 private:
51     map<string,int> m_dict;  // key value  所有元素会按照元素的键值自动排序，从小到大
52 };

三、文本查询程序该程序将读取用户指定的任意文本文件【china_daily.txt】，然后允许用户从该文件中查找单词。查询的结果是该单词出现的次数，并列出每次出现所在的行。

如果某单词在同一行中多次出现，程序将只显示该行一次。行号按升序显示。

示例：使用提供的文件内容，然后查找单词 "element"。输出的前几行为：

---------------------------------------------
element occurs 125 times.
(line 62) element with a given key.
(line 64) second element with the same key.
(line 153) element |==| operator.
(line 250) the element type.
(line 398) corresponding element.
---------------------------------------------

思路：

数据结构设计：

element occurs 125 times. 这行使用词频统计算法，用一个map存储，map<int, string> _dict;

每一行的内容 element with a given key ....要用一个vector存储 vector <string> line;

单词和它对应的行号，单词所对应的不同行号用一个set存储，单词与行号对应起来，用map 存储， map<string, set<int>> wordsNumber;

接口

 1 class Query{
 2 
 3 public:
 4     void readFile(const string &filename);  //读取文件
 5 
 6     void preProcessLine(string &line);  //对读取到到的一行中的单词进行处理
 7 
 8     void query(const string &word);   //单词查询
 9 
10 private:
11 
12     vector<string> m_line;  //储存每一行
13     map<string, int> m_dict;  //存储单词及其出现频率
14     map<string, set<int>> wordsNumber;  //存储单词及其对应的行号信息
15 };

实现

 1 void Query::readFile(const string &filename){
 2     ifstream ifs(filename);
 3 
 4     if(!ifs.good()){
 5         std::cerr << "open "  << filename << " fail" << endl;
 6         return;
 7     }
 8 
 9     string line;
10     int lineNumber = 0;
11     while(getline(ifs, line)){
12 
13         m_line.push_back(line);  //读一行，并将处理前的结果存入vector
14 
15         preProcessLine(line);
16 
17         istringstream iss(line);
18         string word;
19         
20         while(iss >> word){   //拿到一行后，对一行中的单词一个一个处理
21             ++m_dict[word];  //单词第一次出现，插入，频率++，否则将对应单词的频率++
22             
23             //记录单词所在的行
24             wordsNumber[word].insert(lineNumber);  //插入行号到set
25         }
26         ++lineNumber;
27     }
28 }
29 
30 void Query::preProcessLine(string &line){
31     for(auto &ch : line){
32         if(!isalpha(ch)){  //不是字母，将其变为空格
33             ch = ' ';
34         }else if(isupper(ch)){
35             ch = tolower(ch);
36         }
37     }
38 }
39 
40 void Query::query(const string &word){
41     int count = m_dict[word];  //获取词频
42     cout << word << " occurs " << count << (count > 1 ? " times." : " time.") << endl;
43 
44     auto lines = wordsNumber[word];  //auto 为set<int>类型
45     for(auto &number : lines){   //遍历lines及遍历set   
46         cout << "   (line " << number + 1 << ") " << m_line[number] << endl;  //得到行号，打印出行号对应的vector行
47     }
48 
49 }

测试

 1 void test(){
 2     Query qe;
 3     qe.readFile("china_daily.txt");
 4     cout << "please input a word" << endl;
 5 
 6     string word;
 7     while(cin >> word){
 8         qe.query(word);
 9     }
10 }