zoukankan      html  css  js  c++  java
  • DocSegment.cpp

      1 /*
      2  * DocSegment.cpp
      3  * Created on: 2011-11-10
      4  *   function:分析网页算法的实现
      5  *   将原始网页库中存储的网页转化为
      6  *   一组词的集合.
      7  */
      8 #include <algorithm>
      9 #include <fstream>
     10 #include <iostream>
     11 #include <map>
     12 #include <vector>
     13 
     14 #include "Md5.h"
     15 #include "Url.h"
     16 #include "Document.h"
     17 #include "ChSeg/Dict.h"
     18 #include "ChSeg/HzSeg.h"
     19 #include "StrFun.h"
     20 
     21 CDict iDict;
     22 
     23 using namespace std;
     24 
     25 const unsigned int HEADER_BUF_SIZE = 1024;// maximum length of the record header and of the page header
     26 //const unsigned int MAX_DOC_ID = 12932;        // you should change according "Doc.idx"
     27 const unsigned int MAX_DOC_ID = 767;
     28 // Number of documents in the raw page library that is to be processed.
     29 // Different raw page libraries hold different numbers of documents, so this
     30 // value has to be changed; it can be obtained from the URL index file [Url.idx].
     31 
     32 int main(int argc, char* argv[])
     33 {
     34     string strLine, strFileName=argv[1];
     35     CUrl iUrl;
     36     vector<CUrl> vecCUrl;//为什么不是map
     37     CDocument iDocument;
     38     vector<CDocument> vecCDocument;//vector容器保存文档对象
     39     unsigned int docId = 0;
     40 
     41     //ifstream ifs("Tianwang.raw.2559638448");
     42     ifstream ifs(strFileName.c_str());//文档对象
     43 
     44     if (!ifs) {
     45         cerr << "Cannot open tianwang.img.info for input\n";
     46         return -1;
     47     }
     48 
     49     ifstream ifsUrl("Url.idx.sort_uniq");
     50     if (!ifsUrl) {
     51         cerr << "Cannot open Url.idx.sort_uniq for input\n";
     52         return -1;
     53     }
     54     ifstream ifsDoc("Doc.idx");
     55     if (!ifsDoc) {
     56         cout<<"不能打开网页索引文件"<<endl;
     57         cerr << "Cannot open Doc.idx for input\n";
     58         return -1;
     59     }
     60 
     61     while (getline(ifsUrl, strLine)) {//读入md5到id的映射
     62         char chksum[33];
     63         int  docid;
     64 
     65         memset(chksum, 0, 33);
     66         //cout<<strLine.c_str()<<endl<<endl;
     67         sscanf( strLine.c_str(), "%s%d", chksum, &docid );
     68         //cout<<strLine.c_str();
     69         iUrl.m_sChecksum = chksum;
     70         iUrl.m_nDocId = docid;
     71         vecCUrl.push_back(iUrl);
     72     }
     73 
     74     while (getline(ifsDoc,strLine)){
     75         /* docid:文档编号
     76          * pos:偏移
     77          * length:好像和DocIndex的版本有点问题*/
     78 
     79         int docid,pos,length;
     80         char chksum[33];
     81 
     82         //cout<<strLine<<endl<<endl;
     83 
     84         memset(chksum, 0, 33);
     85         sscanf( strLine.c_str(), "%d%d%d%s", &docid, &pos, &length,chksum );
     86         //cout<<endl<<docid<< pos<<length<<endl;
     87         iDocument.m_nDocId = docid;
     88         iDocument.m_nPos = pos;
     89         iDocument.m_nLength = length;
     90         iDocument.m_sChecksum = chksum;//网页的MD5
     91         vecCDocument.push_back(iDocument);
     92     }
     93 
     94 
     95 
     96     //保存网页分析的结果
     97     strFileName += ".seg";
     98     ofstream fout(strFileName.c_str(), ios::in|ios::out|ios::trunc|ios::binary);
     99     for ( docId=0; docId<MAX_DOC_ID; docId++ ){
    100 
    101         // find document according to docId
    102         int length = vecCDocument[docId+1].m_nPos - vecCDocument[docId].m_nPos -1;//当前网页文件的长度
    103         char *pContent = new char[length+1];//记录内容
    104         memset(pContent, 0, length+1);
    105         ifs.seekg(vecCDocument[docId].m_nPos);//移动读取位置
    106         ifs.read(pContent, length);
    107 
    108         char *s;
    109         s = pContent;
    110 
    111         // 过滤记录头
    112         int bytesRead = 0,newlines = 0;
    113         while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) {
    114             if (*s == '\n')
    115                 newlines++;
    116             else
    117                 newlines = 0;
    118             s++;
    119             bytesRead++;
    120         }
    121         if (bytesRead == HEADER_BUF_SIZE-1) continue;
    122 
    123 
    124         // 过滤网页头部信息
    125         bytesRead = 0,newlines = 0;
    126         while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) {
    127             if (*s == '\n')
    128                 newlines++;
    129             else
    130                 newlines = 0;
    131             s++;
    132             bytesRead++;
    133         }
    134         if (bytesRead == HEADER_BUF_SIZE-1) continue;//一般没有1024??
    135 
    136         //iDocument.m_sBody = s;
    137         //过滤网页体的正文信息
    138         iDocument.RemoveTags(s);
    139         iDocument.m_sBodyNoTags = s;
    140 
    141         delete[] pContent;
    142         string strLine = iDocument.m_sBodyNoTags;
    143 
    144         CStrFun::ReplaceStr(strLine, "&nbsp;", " ");//将网页体正文中的"&nbsp"替换成" "
    145         CStrFun::EmptyStr(strLine); // set " \t\r\n" to " "
    146 
    147 
    148         //将网页体正文信息进行分词
    149         CHzSeg iHzSeg;
    150         //cout<<strLine<<endl<<endl;
    151         strLine = iHzSeg.SegmentSentenceMM(iDict,strLine);
    152         cout<<docId<<";";
    153         cout<<strLine;
    154         fout << docId << endl << strLine;
    155         fout << endl;
    156         
    157     }
    158 
    159     return(0);
    160 }
  • 相关阅读:
    VS2008 环境中完美搭建 Qt 4.7.4 静态编译的调试与发布 Inchroy's Blog 博客频道 CSDN.NET
    编写可丢弃的代码
    c++ using namespace std; 海明威 博客园
    解决MySQL server has gone away
    nginx upstream 调度策略
    (2006, 'MySQL server has gone away') 错误解决 dba007的空间 51CTO技术博客
    Linux IO模型漫谈(2) 轩脉刃 博客园
    redis源码笔记 initServer 刘浩de技术博客 博客园
    MySQLdb批量插入数据
    词库的扩充百度百科的抓取你知道这些热词吗? rabbit9898 ITeye技术网站
  • 原文地址:https://www.cnblogs.com/kakamilan/p/2591408.html
Copyright © 2011-2022 走看看