zoukankan      html  css  js  c++  java
  • DocSegment.cpp

      1 /*
      2  * DocSegment.cpp
      3  * Created on: 2011-11-10
      4  *   function:分析网页算法的实现
      5  *   将原始网页库中存储的网页转化为
      6  *   一组词的集合.
      7  */
      8 #include <algorithm>
      9 #include <fstream>
     10 #include <iostream>
     11 #include <map>
     12 #include <vector>
     13 
     14 #include "Md5.h"
     15 #include "Url.h"
     16 #include "Document.h"
     17 #include "ChSeg/Dict.h"
     18 #include "ChSeg/HzSeg.h"
     19 #include "StrFun.h"
     20 
     21 CDict iDict; // dictionary handed to CHzSeg::SegmentSentenceMM in main
     22 
     23 using namespace std;
     24 
     25 const unsigned int HEADER_BUF_SIZE = 1024;// maximum length of the record header and of the page header
     26 //const unsigned int MAX_DOC_ID = 12932;        // you should change according "Doc.idx"
     27 const unsigned int MAX_DOC_ID = 767;
     28 // Number of documents in the raw page store that main processes.
     29 // The count differs from one raw page store to another, so this value
     30 // has to be adjusted; it can be obtained from the URL index file [Url.idx].
     31 
     32 int main(int argc, char* argv[])
     33 {
     34     string strLine, strFileName=argv[1];
     35     CUrl iUrl;
     36     vector<CUrl> vecCUrl;//为什么不是map
     37     CDocument iDocument;
     38     vector<CDocument> vecCDocument;//vector容器保存文档对象
     39     unsigned int docId = 0;
     40 
     41     //ifstream ifs("Tianwang.raw.2559638448");
     42     ifstream ifs(strFileName.c_str());//文档对象
     43 
     44     if (!ifs) {
     45         cerr << "Cannot open tianwang.img.info for input\n";
     46         return -1;
     47     }
     48 
     49     ifstream ifsUrl("Url.idx.sort_uniq");
     50     if (!ifsUrl) {
     51         cerr << "Cannot open Url.idx.sort_uniq for input\n";
     52         return -1;
     53     }
     54     ifstream ifsDoc("Doc.idx");
     55     if (!ifsDoc) {
     56         cout<<"不能打开网页索引文件"<<endl;
     57         cerr << "Cannot open Doc.idx for input\n";
     58         return -1;
     59     }
     60 
     61     while (getline(ifsUrl, strLine)) {//读入md5到id的映射
     62         char chksum[33];
     63         int  docid;
     64 
     65         memset(chksum, 0, 33);
     66         //cout<<strLine.c_str()<<endl<<endl;
     67         sscanf( strLine.c_str(), "%s%d", chksum, &docid );
     68         //cout<<strLine.c_str();
     69         iUrl.m_sChecksum = chksum;
     70         iUrl.m_nDocId = docid;
     71         vecCUrl.push_back(iUrl);
     72     }
     73 
     74     while (getline(ifsDoc,strLine)){
     75         /* docid:文档编号
     76          * pos:偏移
     77          * length:好像和DocIndex的版本有点问题*/
     78 
     79         int docid,pos,length;
     80         char chksum[33];
     81 
     82         //cout<<strLine<<endl<<endl;
     83 
     84         memset(chksum, 0, 33);
     85         sscanf( strLine.c_str(), "%d%d%d%s", &docid, &pos, &length,chksum );
     86         //cout<<endl<<docid<< pos<<length<<endl;
     87         iDocument.m_nDocId = docid;
     88         iDocument.m_nPos = pos;
     89         iDocument.m_nLength = length;
     90         iDocument.m_sChecksum = chksum;//网页的MD5
     91         vecCDocument.push_back(iDocument);
     92     }
     93 
     94 
     95 
     96     //保存网页分析的结果
     97     strFileName += ".seg";
     98     ofstream fout(strFileName.c_str(), ios::in|ios::out|ios::trunc|ios::binary);
     99     for ( docId=0; docId<MAX_DOC_ID; docId++ ){
    100 
    101         // find document according to docId
    102         int length = vecCDocument[docId+1].m_nPos - vecCDocument[docId].m_nPos -1;//当前网页文件的长度
    103         char *pContent = new char[length+1];//记录内容
    104         memset(pContent, 0, length+1);
    105         ifs.seekg(vecCDocument[docId].m_nPos);//移动读取位置
    106         ifs.read(pContent, length);
    107 
    108         char *s;
    109         s = pContent;
    110 
    111         // 过滤记录头
    112         int bytesRead = 0,newlines = 0;
    113         while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) {
    114             if (*s == '\n')
    115                 newlines++;
    116             else
    117                 newlines = 0;
    118             s++;
    119             bytesRead++;
    120         }
    121         if (bytesRead == HEADER_BUF_SIZE-1) continue;
    122 
    123 
    124         // 过滤网页头部信息
    125         bytesRead = 0,newlines = 0;
    126         while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) {
    127             if (*s == '\n')
    128                 newlines++;
    129             else
    130                 newlines = 0;
    131             s++;
    132             bytesRead++;
    133         }
    134         if (bytesRead == HEADER_BUF_SIZE-1) continue;//一般没有1024??
    135 
    136         //iDocument.m_sBody = s;
    137         //过滤网页体的正文信息
    138         iDocument.RemoveTags(s);
    139         iDocument.m_sBodyNoTags = s;
    140 
    141         delete[] pContent;
    142         string strLine = iDocument.m_sBodyNoTags;
    143 
    144         CStrFun::ReplaceStr(strLine, "&nbsp;", " ");//将网页体正文中的"&nbsp"替换成" "
    145         CStrFun::EmptyStr(strLine); // set " \t\r\n" to " "
    146 
    147 
    148         //将网页体正文信息进行分词
    149         CHzSeg iHzSeg;
    150         //cout<<strLine<<endl<<endl;
    151         strLine = iHzSeg.SegmentSentenceMM(iDict,strLine);
    152         cout<<docId<<";";
    153         cout<<strLine;
    154         fout << docId << endl << strLine;
    155         fout << endl;
    156         
    157     }
    158 
    159     return(0);
    160 }
  • 相关阅读:
    c++0.9-----c++ primer之noexcept解读
    c++0.8-----快速定位c++源码位置的小技巧
    c++0.7-----源码分析:iostate及badbit/failbit/eofbit/goodbit以及io文件的包含关系<原创>
    c++0.6-----如何在自己搭建的c++环境中使用extern变量
    c++0.5-----如何在windows下面搭建最简洁的c++环境
    c++0.4-----面向对象的三种关系(继承/复合/委托)
    c++0.3----this指针/static/namespace
    c++0.2-----基于对象的类(包含指针)
    3、静态代理模式
    2、工厂方法模式
  • 原文地址:https://www.cnblogs.com/kakamilan/p/2591408.html
Copyright © 2011-2022 走看看