zoukankan      html  css  js  c++  java
  • Windows下比较简单的获取网页源码的方法

    第一个方法是使用MFC里面的 <afxinet.h>

     /**
      * Downloads the resource at strUrl through an MFC CInternetSession and
      * returns everything read from it as a single CString.
      *
      * @param strUrl  URL handed to CInternetSession::OpenURL.
      * @return        Concatenation of every chunk returned by ReadString;
      *                empty if OpenURL yields no file object.
      * @throws CInternetException*  Propagated unchanged from the MFC calls;
      *                the file object is released before rethrowing.
      */
     CString GetHttpFileData(CString strUrl)
     {
         CInternetSession Session("Internet Explorer", 0);
         CString strData;

         // OpenURL hands back a heap-allocated file object that the caller owns.
         // It is kept as the CStdioFile* that OpenURL returns, so no unchecked
         // downcast is needed for ReadString/Close.
         CStdioFile *pFile = NULL;
         try
         {
             pFile = Session.OpenURL(strUrl);
             if (pFile != NULL)
             {
                 CString strClip;
                 // NOTE(review): ReadString appears to return one text line per
                 // call with the line terminator removed, so the result has no
                 // line breaks - confirm against the MFC documentation.
                 while (pFile->ReadString(strClip))
                 {
                     strData += strClip;
                 }
                 pFile->Close();
             }
         }
         catch (CInternetException *)
         {
             // Do not leak the file object when a network error interrupts the
             // transfer; callers still see the original exception.
             delete pFile;
             Session.Close();
             throw;
         }

         delete pFile;
         Session.Close();
         return strData;
     }

    要讲一下,pHttpFile->ReadString() 每次可能只读一个数据片断,读多少次取决于网络状况,所以要把每次读到的数据加到总数据的尾部,用了CString 省去了缓冲区处理:) 
    别忘了包含头文件 #include <afxinet.h>;另外在工程设置里要选择 using MFC,否则无法编译。

    第二种是使用 WinINet 的纯 API 实现的

    #define MAXBLOCKSIZE 1024
    #include <windows.h>
    #include <wininet.h>
    
    #pragma comment(lib, "wininet.lib")
    
    void GetWebSrcCode(const char *Url);
    
    // Demo entry point: fetches one fixed page via GetWebSrcCode and exits.
    int _tmain(int argc, _TCHAR* argv[])
    {
        // Address of the page whose source is downloaded.
        const char *const pageUrl = "http://www.cnblogs.com/";

        GetWebSrcCode(pageUrl);
        return 0;
    }
    
    void GetWebSrcCode(const char *Url)
    {
        HINTERNET hSession = InternetOpen("zwt", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
        if (hSession != NULL)
        {
            HINTERNET hURL = InternetOpenUrl(hSession, Url, NULL, 0, INTERNET_FLAG_DONT_CACHE, 0);
            if (hURL != NULL)
            {
                char Temp[MAXBLOCKSIZE] = {0};
                ULONG Number = 1;
    
                FILE *stream;
                if( (stream = fopen( "E:\test.html", "wb" )) != NULL )
                {
                    while (Number > 0)
                    {
                        InternetReadFile(hURL, Temp, MAXBLOCKSIZE - 1, &Number);
                        fwrite(Temp, sizeof (char), Number , stream);
                    }
                    fclose( stream );
                }
    
                InternetCloseHandle(hURL);
                hURL = NULL;
            }
    
            InternetCloseHandle(hSession);
            hSession = NULL;
        }
    }

    第三种就是使用非封装过的Socket实现了

    int main(int argc, char* argv[])
    {
        SOCKET hsocket;
        SOCKADDR_IN saServer;
        WSADATA wsadata;
        LPHOSTENT lphostent;
        int nRet;
        char Dest[3000];  
        char* host_name="blog.sina.com.cn";
        char* req="GET /s/blog_44acab2f01016gz3.html HTTP/1.1
    "
            "User-Agent: Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0C; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)
    "
            "Host:blog.sina.com.cn
    
    ";
    
    
        // 初始化套接字  
        if(WSAStartup(MAKEWORD(2,2),&wsadata))
            printf("初始化SOCKET出错!");
        lphostent=gethostbyname(host_name);   
        if(lphostent==NULL)   
            printf("lphostent为空!");   
        hsocket = socket(AF_INET,SOCK_STREAM,IPPROTO_TCP);   
        saServer.sin_family = AF_INET;   
        saServer.sin_port = htons(80);   
        saServer.sin_addr =*((LPIN_ADDR)*lphostent->h_addr_list);   
        // 利用SOCKET连接   
        nRet = connect(hsocket,(LPSOCKADDR)&saServer,sizeof(SOCKADDR_IN));   
        if(nRet == SOCKET_ERROR)   
        {
            printf("建立连接时出错!");   
            closesocket(hsocket);
            return 0;
        }
        // 利用SOCKET发送   
    
        nRet = send(hsocket,req,strlen(req),0);   
        if(nRet==SOCKET_ERROR)   
        {   
            printf("发送数据包时出错!");   
            closesocket(hsocket);   
        }   
        nRet=1;   
        while(nRet>0)   
        {   
            // 接收返回数据包   
            nRet=recv(hsocket,(LPSTR)Dest,sizeof(Dest),0);   
            if(nRet>0)
                Dest[nRet]=0;
            else   
                Dest[0]=0;  
            char sDest[3000] = {0};
            UTF8_2_GB2312(sDest,nRet,Dest,nRet);
            // 显示返回数据包的大小、内容  
            //printf("
    Received bytes:%d
    ",nRet);   
            printf("Result:
    %s",sDest);   
        }
    }

    另外,以上我们获取网页的时候,获取到的可能是UTF8,似乎目前大多数网站都用的这种编码吧!下面是编码转换。

    /**
     * Decodes one 3-byte UTF-8 sequence (1110xxxx 10yyyyyy 10zzzzzz) into a
     * single 16-bit code unit.
     *
     * The value is assembled arithmetically and assigned as a whole, so the
     * result does not depend on byte order or on the width of wchar_t.
     *
     * @param pOut   Receives the decoded code unit.
     * @param pText  Points at the lead byte; exactly three bytes are read and
     *               they are not validated.
     */
    void UTF_8ToUnicode(wchar_t* pOut,char *pText)
    {
        const unsigned int lead  = static_cast<unsigned char>(pText[0]);
        const unsigned int mid   = static_cast<unsigned char>(pText[1]);
        const unsigned int trail = static_cast<unsigned char>(pText[2]);

        // 4 payload bits from the lead byte, 6 from each continuation byte.
        const unsigned int codeUnit =
            ((lead & 0x0Fu) << 12) | ((mid & 0x3Fu) << 6) | (trail & 0x3Fu);

        *pOut = static_cast<wchar_t>(codeUnit);
    }
    /**
     * Converts one double-byte character at gbBuffer to a single wide
     * character using the system ANSI code page (CP_ACP).
     *
     * NOTE(review): CP_ACP is only GB2312/GBK on systems whose ANSI code page
     * is 936 - confirm that this is the intended environment.
     *
     * @param pOut      Receives the converted wide character.
     * @param gbBuffer  Source bytes; exactly two are consumed.
     */
    void Gb2312ToUnicode(wchar_t* pOut,char *gbBuffer)
    {
        const int sourceBytes = 2;  // one double-byte character in
        const int destChars = 1;    // one wide character out

        ::MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, gbBuffer, sourceBytes, pOut, destChars);
    }
    /**
     * Converts a UTF-8 buffer to a NUL-terminated double-byte string.
     *
     * ASCII bytes are copied unchanged. Each 3-byte UTF-8 sequence is decoded
     * with UTF_8ToUnicode and converted with UnicodeToGB2312, producing two
     * output bytes. Anything else (2- or 4-byte sequences, stray continuation
     * bytes) is not supported and is replaced by a single '?' so that decoding
     * stays aligned with the input.
     *
     * @param pOut   Destination buffer; needs room for at least pLen + 1 bytes.
     * @param pText  UTF-8 input; does not have to be NUL-terminated.
     * @param pLen   Number of input bytes to process.
     */
    void UTF_8ToGB2312(char*pOut, char *pText, int pLen)
    {
        char Ctemp[4];
        memset(Ctemp, 0, sizeof(Ctemp));

        int i = 0, j = 0;
        while (i < pLen)
        {
            // Inspect the byte as unsigned so the test does not depend on
            // whether plain char is signed on the target platform.
            const unsigned char lead = static_cast<unsigned char>(pText[i]);

            if (lead < 0x80)
            {
                // Plain ASCII.
                pOut[j++] = pText[i++];
            }
            else if ((lead & 0xF0) == 0xE0)
            {
                // 3-byte sequence; a truncated tail would read past the input.
                if (i + 3 > pLen)
                    break;

                WCHAR Wtemp;
                UTF_8ToUnicode(&Wtemp, pText + i);
                UnicodeToGB2312(Ctemp, Wtemp);
                pOut[j++] = Ctemp[0];
                pOut[j++] = Ctemp[1];
                i += 3;
            }
            else
            {
                // Unsupported sequence: skip it as a whole.
                int skip = 1;                 // stray continuation / invalid byte
                if ((lead & 0xE0) == 0xC0)
                    skip = 2;                 // 2-byte sequence
                else if ((lead & 0xF8) == 0xF0)
                    skip = 4;                 // 4-byte sequence
                pOut[j++] = '?';
                i += skip;
            }
        }
        pOut[j] = '\0';
    }

    这是转换成GB2312的代码

  • 相关阅读:
    SVM
    决策树
    神经网络
    机器学习之降维方法
    机器学习之特征选择
    浏览器状态码大全
    哈希表
    社区发现算法总结(二)
    社区发现算法总结(一)
    聚类篇-------度量
  • 原文地址:https://www.cnblogs.com/croot/p/3391003.html
Copyright © 2011-2022 走看看