zoukankan      html  css  js  c++  java
  • 一个CHttpFile下载网页的例子

    /*************************************************************************************
    项目是分析各视频的真实地址 生成一个dll, 供其它项目使用, 项目中使用了zlib, boost, 要另下载
    如果是sohu网站则自己分析地址. 如果是其它视频网站才从flvcd上获取结果. 项目中使用了: 1.gzip解压. 2.UTF8与GB2312转码 3.boost正则表达式 boost查找单个匹配, 查找所有匹配 4.sohu视频地址是分了四类视频分析的. 可以用fiddler查找功能查找到所想要的几个字符串 5.文件获取是使用的MFC中的CHttpFile获取的, 尝试用了WinINet和WinHTTP ms的api访问网络的都不怎么行. chrome浏览器第一个版本是用winhttp访问网络的. 也试过socket访问网络 但要跳转什么的太繁琐了 6.函数导出, 可以用def文件. 也可以用__declspec(dllexport) 7.多线程CreateThread 注: 网络访问花了相当多的时间 正则表达式boost中的perl正则表达式. "要写成\" \要写成\\, 要多用查找替换. 匹配多个结果时要迭代搜索查询 网络给的数据是压缩的gzip问题也花了好长时间. utf-8与gb2312转换也花了好长时间. buff最后一次读取时, 字符串没法控制. 内存初始化时置成0就行了 函数导出研究了两种方法, 花了很长时间. 多线程没花多长时间 ************************************************************************************
    */

    Analyzer.cpp

    /*
     * Extracts the elements of the first bracketed list that follows
     * `<strPropertyName>":[` in strJson.
     *
     * The text between '[' and the next ']' is split on ','. Each piece is
     * assumed to be wrapped in double quotes, so its first and last characters
     * are dropped before it is stored.
     *
     * Returns an empty vector when the property is not found. A piece shorter
     * than two characters (for example between two adjacent commas) yields an
     * empty string, so element positions are preserved.
     *
     * NOTE: strPropertyName is inserted into the regular expression unescaped,
     * so any regex metacharacters it contains are interpreted as such.
     */
    vector<string> Analyzer::GetPropertyInIntegratedBrackets(string strPropertyName, string strJson)
    {
    	vector<string> vect;
    	regex regclipsURL("(?<=(" + strPropertyName + "\":\\[))[^]]+?(?=(]))");
    	boost::smatch what;
    	if(!regex_search(strJson, what, regclipsURL))
    	{
    		// Nothing matched: there is no list to split.
    		return vect;
    	}

    	// Copy the match into its own string before working on it; the original
    	// author noted that using the match object directly caused errors.
    	string strList = "";
    	strList = what[0];

    	string::size_type nStart = 0;
    	for(;;)
    	{
    		const string::size_type nComma = strList.find(',', nStart);
    		const string::size_type nEnd = (nComma == string::npos) ? strList.size() : nComma;
    		const string::size_type nLen = nEnd - nStart;

    		if(nLen >= 2)
    			vect.push_back(strList.substr(nStart + 1, nLen - 2)); // strip the surrounding quotes
    		else
    			vect.push_back(string()); // too short to hold a quoted value

    		if(nComma == string::npos)
    			break; // that was the last piece
    		nStart = nComma + 1;
    	}
    	return vect;
    }
    /*
     * Inflates a compressed HTTP body into a caller-supplied buffer using zlib.
     *
     * zdata  - compressed input bytes
     * nzdata - number of input bytes available at zdata
     * data   - output buffer
     * ndata  - in: capacity of data in bytes;
     *          out: number of bytes produced (written only when 0 is returned)
     *
     * Returns 0 on success, -1 on a NULL argument or any zlib failure.
     * Inflation stops at end of stream, when nzdata input bytes have been
     * consumed, or when *ndata output bytes have been produced, whichever
     * happens first; the last two cases still return 0.
     */
    int Analyzer::httpgzdecompress(Byte *zdata, uLong nzdata, Byte *data, uLong *ndata)
    {
    	if(zdata == NULL || data == NULL || ndata == NULL) return -1;

    	int err = 0;
    	z_stream d_stream = {0}; /* decompression stream */
    	// Two-byte zlib header (0x78, 0x01) that is fed to inflate() when the
    	// input is rejected with Z_DATA_ERROR.
    	static char dummy_head[2] = 
    	{
    		0x8 + 0x7 * 0x10,
    		(((0x8 + 0x7 * 0x10) * 0x100 + 30) / 31 * 31) & 0xFF,
    	};
    	d_stream.zalloc = (alloc_func)0;
    	d_stream.zfree = (free_func)0;
    	d_stream.opaque = (voidpf)0;
    	d_stream.next_in  = zdata;
    	d_stream.avail_in = 0;
    	d_stream.next_out = data;
    	// windowBits 47 = 32 + 15; see the zlib manual for inflateInit2.
    	if(inflateInit2(&d_stream, 47) != Z_OK) return -1;

    	// From here on every exit must call inflateEnd() to release the state
    	// allocated by inflateInit2().
    	while (d_stream.total_out < *ndata && d_stream.total_in < nzdata) 
    	{
    		d_stream.avail_in = d_stream.avail_out = 1; /* force small buffers */
    		err = inflate(&d_stream, Z_NO_FLUSH);
    		if(err == Z_STREAM_END) break;
    		if(err == Z_OK) continue;

    		if(err != Z_DATA_ERROR)
    		{
    			inflateEnd(&d_stream);
    			return -1;
    		}

    		// NOTE(review): next_in is not pointed back at zdata after this
    		// step, so if inflate() returns Z_OK here the loop would go on
    		// reading from dummy_head. Confirm how zlib behaves after
    		// Z_DATA_ERROR before relying on this fallback.
    		d_stream.next_in = (Bytef*) dummy_head;
    		d_stream.avail_in = sizeof(dummy_head);
    		if((err = inflate(&d_stream, Z_NO_FLUSH)) != Z_OK) 
    		{
    			inflateEnd(&d_stream);
    			return -1;
    		}
    	}
    	if(inflateEnd(&d_stream) != Z_OK) return -1;
    	*ndata = d_stream.total_out;
    	return 0;
    }
    //ms-help://MS.VSCC.v90/MS.MSDNQTR.v90.chs/intl/unicode_81rn.htm
    /*
     * Converts a UTF-8 byte sequence to a CString encoded in code page 936.
     *
     * pData - UTF-8 bytes (need not be NUL-terminated)
     * size  - number of bytes at pData
     *
     * The text is first widened to UTF-16 and then narrowed to code page 936.
     * Returns an empty CString when the input is NULL or empty, or when either
     * conversion step fails.
     */
    CString Analyzer::ConvertUTF8toGB2312(const char *pData, size_t size)
    {
    	if(pData == NULL || size == 0)
    		return CString();

    	const int cbUtf8 = (int)size;

    	// Step 1: UTF-8 -> UTF-16. The first call only reports the required length.
    	const int cchNeeded = MultiByteToWideChar(CP_UTF8, 0, pData, cbUtf8, NULL, 0);
    	if(cchNeeded <= 0)
    		return CString();

    	std::vector<WCHAR> wide(cchNeeded + 1, 0); // +1 for the terminator
    	const int cchWritten = MultiByteToWideChar(CP_UTF8, 0, pData, cbUtf8, &wide[0], cchNeeded);
    	if(cchWritten <= 0)
    		return CString();
    	wide[cchWritten] = 0;

    	// Step 2: UTF-16 -> code page 936. Passing -1 as the length makes the
    	// reported size include the terminating NUL.
    	const int cbNeeded = WideCharToMultiByte(936, 0, &wide[0], -1, 0, 0, 0, 0);
    	if(cbNeeded <= 0)
    		return CString();

    	std::vector<char> narrow(cbNeeded + 1, 0);
    	if(WideCharToMultiByte(936, 0, &wide[0], -1, &narrow[0], cbNeeded, 0, 0) <= 0)
    		return CString();

    	return CString(&narrow[0]);
    } 
    
    /*
     * Downloads strUrl and returns the response body as text.
     *
     * - Only an HTTP 200 response produces content; any other status returns
     *   an empty string.
     * - If the raw response headers contain "gzip", the body is inflated with
     *   httpgzdecompress (at most 4096 * 500 bytes of output) and returned
     *   without further conversion.
     * - Otherwise, if the headers or the body mention "utf-8" (case-insensitive),
     *   the body is converted with ConvertUTF8toGB2312; if not, it is returned
     *   unchanged.
     * - The body is treated as a C string, so it ends at the first NUL byte.
     *
     * On an MFC exception, or when the URL cannot be opened, m_State is set to
     * Analyzer_State_NetError and an empty string is returned.
     */
    CString Analyzer::GetPageHtml(CString strUrl) 
    {
    	// Closes and frees the CHttpFile on every exit path, including when a
    	// call on it throws.
    	struct HttpFileOwner
    	{
    		CHttpFile* p;
    		HttpFileOwner() : p(NULL) {}
    		~HttpFileOwner()
    		{
    			if(p != NULL)
    			{
    				p->Close();
    				delete p;
    			}
    		}
    	};

    	CString strHtml = ""; // downloaded HTML
    	try
    	{
    		strUrl = strUrl.Trim();
    		CInternetSession session("HttpClient");
    		session.SetOption(INTERNET_OPTION_CONNECT_TIMEOUT, 5000);      // 5 s connect timeout
    		session.SetOption(INTERNET_OPTION_SEND_TIMEOUT, 1000);           // 1 s send timeout
    		session.SetOption(INTERNET_OPTION_RECEIVE_TIMEOUT, 7000);        // 7 s receive timeout
    		session.SetOption(INTERNET_OPTION_DATA_SEND_TIMEOUT, 1000);     // 1 s data-send timeout
    		session.SetOption(INTERNET_OPTION_DATA_RECEIVE_TIMEOUT, 7000);       // 7 s data-receive timeout
    		session.SetOption(INTERNET_OPTION_CONNECT_RETRIES, 1);          // one retry

    		{
    			// Inner scope: the file is released before session.Close() below.
    			HttpFileOwner file;
    			file.p = (CHttpFile*)session.OpenURL((LPCTSTR)strUrl, 1, INTERNET_FLAG_RELOAD | INTERNET_FLAG_TRANSFER_BINARY);
    			if(file.p == NULL)
    			{
    				this->m_State = Analyzer_State_NetError;
    				return "";
    			}

    			DWORD dwStatusCode = 0;
    			file.p->QueryInfoStatusCode(dwStatusCode);

    			if(dwStatusCode == HTTP_STATUS_OK)
    			{
    				CString strHeaders = "";
    				file.p->QueryInfo(HTTP_QUERY_RAW_HEADERS_CRLF, strHeaders);

    				// Read the whole body into a growable buffer so a large
    				// response cannot overrun a fixed-size allocation.
    				std::vector<byte> body;
    				byte chunk[512];
    				UINT nRead = 0;
    				while((nRead = file.p->Read(chunk, sizeof(chunk))) > 0)
    				{
    					body.insert(body.end(), chunk, chunk + nRead);
    				}
    				const size_t cbBody = body.size();
    				body.push_back(0); // terminator, so the body can be used as a C string

    				if(strHeaders.Find("gzip") > -1)
    				{
    					const uLong ulCapacity = 4096 * 500;
    					uLong ulLength = ulCapacity;
    					// One extra zeroed byte keeps the output terminated even when
    					// it fills the buffer or decompression fails part-way.
    					std::vector<byte> inflated(ulCapacity + 1, 0);
    					if(httpgzdecompress(&body[0], (uLong)cbBody, &inflated[0], &ulLength) == 0)
    					{
    						inflated[ulLength] = 0;
    					}
    					// NOTE(review): the inflated text is not passed through
    					// ConvertUTF8toGB2312, unlike the branch below; confirm
    					// whether that is intended.
    					strHtml = (CHAR*)&inflated[0];
    				}
    				else
    				{
    					const CHAR* pszBody = (const CHAR*)&body[0];

    					// Look for "utf-8" in lowercased copies so the returned
    					// text keeps its original case.
    					bool bUtf8 = strHeaders.MakeLower().Find("utf-8") > -1;
    					if(!bUtf8)
    					{
    						CString strLowerBody(pszBody);
    						bUtf8 = strLowerBody.MakeLower().Find("utf-8") > -1;
    					}

    					if(bUtf8)
    					{
    						strHtml = ConvertUTF8toGB2312(pszBody, strlen(pszBody)); // charset conversion
    					}
    					else
    					{
    						strHtml = pszBody;
    					}
    				}
    			}
    		}
    		session.Close();
    		return strHtml;
    	}
    	catch (CException* e)
    	{
    		// MFC exceptions are thrown by pointer and must be released with Delete().
    		e->Delete();
    		this->m_State = Analyzer_State_NetError;
    		return "";
    	}
    }
    
    

      源码下载

  • 相关阅读:
    css之布局
    css之浮动
    白扯之聊聊我们的情怀
    Vue之指令
    Vue之vue.js声明式渲染
    AJAX经常遇到的那些问题
    HTTP之cookie技术
    正则表达式资料
    require.js资料
    AMD 和 CMD 的区别
  • 原文地址:https://www.cnblogs.com/barrysgy/p/3217459.html
Copyright © 2011-2022 走看看