1 /* 2 * DocSegment.cpp 3 * Created on: 2011-11-10 4 * function:分析网页算法的实现 5 * 将原始网页库中存储的网页转化为 6 * 一组词的集合. 7 */ 8 #include <algorithm> 9 #include <fstream> 10 #include <iostream> 11 #include <map> 12 #include <vector> 13 14 #include "Md5.h" 15 #include "Url.h" 16 #include "Document.h" 17 #include "ChSeg/Dict.h" 18 #include "ChSeg/HzSeg.h" 19 #include "StrFun.h" 20 21 CDict iDict; 22 23 using namespace std; 24 25 const unsigned int HEADER_BUF_SIZE = 1024;//记录头和网页头信息的最大长度 26 //const unsigned int MAX_DOC_ID = 12932; // you should change according "Doc.idx" 27 const unsigned int MAX_DOC_ID = 767; 28 //所要处理的原始网页库中文档的个数 29 //不同的原始网页库文档数不同,这个值 30 //需要更改,可以通过URL索引文件[Url.idx]得到 31 32 int main(int argc, char* argv[]) 33 { 34 string strLine, strFileName=argv[1]; 35 CUrl iUrl; 36 vector<CUrl> vecCUrl;//为什么不是map 37 CDocument iDocument; 38 vector<CDocument> vecCDocument;//vector容器保存文档对象 39 unsigned int docId = 0; 40 41 //ifstream ifs("Tianwang.raw.2559638448"); 42 ifstream ifs(strFileName.c_str());//文档对象 43 44 if (!ifs) { 45 cerr << "Cannot open tianwang.img.info for input\n"; 46 return -1; 47 } 48 49 ifstream ifsUrl("Url.idx.sort_uniq"); 50 if (!ifsUrl) { 51 cerr << "Cannot open Url.idx.sort_uniq for input\n"; 52 return -1; 53 } 54 ifstream ifsDoc("Doc.idx"); 55 if (!ifsDoc) { 56 cout<<"不能打开网页索引文件"<<endl; 57 cerr << "Cannot open Doc.idx for input\n"; 58 return -1; 59 } 60 61 while (getline(ifsUrl, strLine)) {//读入md5到id的映射 62 char chksum[33]; 63 int docid; 64 65 memset(chksum, 0, 33); 66 //cout<<strLine.c_str()<<endl<<endl; 67 sscanf( strLine.c_str(), "%s%d", chksum, &docid ); 68 //cout<<strLine.c_str(); 69 iUrl.m_sChecksum = chksum; 70 iUrl.m_nDocId = docid; 71 vecCUrl.push_back(iUrl); 72 } 73 74 while (getline(ifsDoc,strLine)){ 75 /* docid:文档编号 76 * pos:偏移 77 * length:好像和DocIndex的版本有点问题*/ 78 79 int docid,pos,length; 80 char chksum[33]; 81 82 //cout<<strLine<<endl<<endl; 83 84 memset(chksum, 0, 33); 85 sscanf( strLine.c_str(), "%d%d%d%s", &docid, &pos, &length,chksum ); 86 //cout<<endl<<docid<< pos<<length<<endl; 87 iDocument.m_nDocId = docid; 88 iDocument.m_nPos = pos; 89 iDocument.m_nLength = length; 90 iDocument.m_sChecksum = chksum;//网页的MD5 91 vecCDocument.push_back(iDocument); 92 } 93 94 95 96 //保存网页分析的结果 97 strFileName += ".seg"; 98 ofstream fout(strFileName.c_str(), ios::in|ios::out|ios::trunc|ios::binary); 99 for ( docId=0; docId<MAX_DOC_ID; docId++ ){ 100 101 // find document according to docId 102 int length = vecCDocument[docId+1].m_nPos - vecCDocument[docId].m_nPos -1;//当前网页文件的长度 103 char *pContent = new char[length+1];//记录内容 104 memset(pContent, 0, length+1); 105 ifs.seekg(vecCDocument[docId].m_nPos);//移动读取位置 106 ifs.read(pContent, length); 107 108 char *s; 109 s = pContent; 110 111 // 过滤记录头 112 int bytesRead = 0,newlines = 0; 113 while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) { 114 if (*s == '\n') 115 newlines++; 116 else 117 newlines = 0; 118 s++; 119 bytesRead++; 120 } 121 if (bytesRead == HEADER_BUF_SIZE-1) continue; 122 123 124 // 过滤网页头部信息 125 bytesRead = 0,newlines = 0; 126 while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) { 127 if (*s == '\n') 128 newlines++; 129 else 130 newlines = 0; 131 s++; 132 bytesRead++; 133 } 134 if (bytesRead == HEADER_BUF_SIZE-1) continue;//一般没有1024?? 135 136 //iDocument.m_sBody = s; 137 //过滤网页体的正文信息 138 iDocument.RemoveTags(s); 139 iDocument.m_sBodyNoTags = s; 140 141 delete[] pContent; 142 string strLine = iDocument.m_sBodyNoTags; 143 144 CStrFun::ReplaceStr(strLine, " ", " ");//将网页体正文中的" "替换成" " 145 CStrFun::EmptyStr(strLine); // set " \t\r\n" to " " 146 147 148 //将网页体正文信息进行分词 149 CHzSeg iHzSeg; 150 //cout<<strLine<<endl<<endl; 151 strLine = iHzSeg.SegmentSentenceMM(iDict,strLine); 152 cout<<docId<<";"; 153 cout<<strLine; 154 fout << docId << endl << strLine; 155 fout << endl; 156 157 } 158 159 return(0); 160 }