zoukankan      html  css  js  c++  java
  • 网页抓取总结(一)

    今天在公司学了网页抓取,感觉在学校学 C 语言只学了个皮毛,到了公司才发现很多东西都不懂。这里做个简单的总结。

    1、被调用的函数所在的文件与工程不在同一个文件夹时,#include 里要写上相对路径,如 #include "lyPublic/lyCodeConvert.c"

    2、窗口事件要修改 工程-设置-连接的“/subsystem:console /incremental:yes” ,改为“/subsystem:windows /incremental:yes”

    3、在抓取网页时,要注意所传网址在 UTF-8 和 GBK 之间的转换:必须先将 GBK 转为 UTF-8 后再打开,不然会丢失关键词。

    今天成果:

    #include <stdlib.h>
    #include <stdio.h>
    #include <string.h>
    #include "lyGetHttpResult.h"
    #include "lyPublic/lyCodeConvert.c"
    /*
     * Looks a word up in the chazidian.com synonym index and saves the page
     * that the word links to.
     *
     * Steps:
     *   1. Download the index page and run it through CodeConvert mode 1
     *      (per the author's notes: UTF-8 -> GBK, so it can be searched with
     *      the GBK text typed on the console).
     *   2. Read the word from stdin. While the current page does not contain
     *      it, move on to the page referenced just before the next-page marker.
     *   3. Rebuild the link that surrounds the word, download it, print it and
     *      write it to Text.txt.
     *
     * Returns EXIT_SUCCESS when the page was written, EXIT_FAILURE otherwise.
     */
    int main()
    {
        /* Number of bytes, ending at the first byte of the next-page marker,
         * that are used as the next page's address.
         * NOTE(review): this fixed window is carried over unchanged from the
         * first version; confirm against the real page markup that it yields
         * a clean URL. */
        enum { NEXT_LINK_WINDOW = 60 };

        char szUrl[512] = "";
        char svData[1024 * 40] = "";
        char strFrom[100] = "", strTo[100] = "";
        char *szData = NULL;
        char *p, *q;
        const char *err = NULL;
        FILE *fp;
        size_t len;
        int falg;
        int rc = EXIT_FAILURE;

        strcpy(szUrl, "http://www.chazidian.com/jinyicidaquan/");
        szData = GetDataFromWeb(szUrl, NULL, NULL, 1, 5);
        if (!szData)
        {
            err = "could not download the index page";
            goto done;
        }
        CodeConvert(szData, svData, sizeof(svData), 1);
        svData[sizeof(svData) - 1] = '\0';

        /* fgets instead of gets: input longer than the buffer is truncated
         * rather than written past the end of strFrom. */
        if (!fgets(strFrom, sizeof(strFrom), stdin))
        {
            err = "no word was entered";
            goto done;
        }
        strFrom[strcspn(strFrom, "\r\n")] = '\0';
        if (strFrom[0] == '\0')
        {
            /* An empty word would match at svData[0] and make the backward
             * scans below start before the buffer. */
            err = "no word was entered";
            goto done;
        }

        /* Walk the index pages until one of them contains the word. */
        while (strstr(svData, strFrom) == NULL)
        {
            p = strstr(svData, "下一页");
            if (p == NULL || p - svData < NEXT_LINK_WINDOW - 1)
            {
                err = "word not found and no further page is available";
                goto done;
            }
            memset(szUrl, 0, sizeof(szUrl));
            memcpy(szUrl, p - (NEXT_LINK_WINDOW - 1), NEXT_LINK_WINDOW);

            /* Release the previous page before fetching the next one. */
            free(szData);
            szData = GetDataFromWeb(szUrl, NULL, NULL, 1, 5);
            if (!szData)
            {
                err = "could not download the next index page";
                goto done;
            }
            CodeConvert(szData, svData, sizeof(svData), 1);
            svData[sizeof(svData) - 1] = '\0';
        }

        p = strstr(svData, strFrom); /* start of the word inside the page */
        if (p == NULL || p == svData)
        {
            err = "word is not preceded by a link";
            goto done;
        }

        /* falg == 0: the word directly follows a '/', so the link prefix ends
         *            right before it and the word itself is appended later.
         * falg == 1: the word sits further right; the prefix ends at the
         *            nearest '/' to its left and nothing is appended. */
        falg = 0;
        q = p - 1;
        if (*q != '/')
        {
            while (q > svData && *q != '/')
                q--;
            if (*q != '/')
            {
                err = "no '/' found before the word";
                goto done;
            }
            p = q + 1;
            falg = 1;
        }

        /* The link starts after the closest double quote to the left. */
        while (q > svData && *q != '"')
            q--;
        if (*q != '"')
        {
            err = "no opening quote found before the link";
            goto done;
        }

        len = (size_t)(p - (q + 1));
        if (len >= sizeof(szUrl))
        {
            err = "link is too long";
            goto done;
        }
        memset(szUrl, 0, sizeof(szUrl));
        memcpy(szUrl, q + 1, len);

        if (!falg)
        {
            /* Mode 2 (per the author's notes: GBK -> UTF-8) so that the word
             * survives being sent as part of the address. */
            CodeConvert(strFrom, strTo, sizeof(strTo), 2);
            strTo[sizeof(strTo) - 1] = '\0';
            if (len + strlen(strTo) >= sizeof(szUrl))
            {
                err = "link is too long";
                goto done;
            }
            strcat(szUrl, strTo);
        }
        puts(szUrl);

        free(szData);
        szData = GetDataFromWeb(szUrl, NULL, NULL, 1, 5);
        if (!szData)
        {
            err = "could not download the word page";
            goto done;
        }
        CodeConvert(szData, svData, sizeof(svData), 1);
        svData[sizeof(svData) - 1] = '\0';
        puts(svData);

        /* Overwrite an existing Text.txt in place, or create it when missing.
         * The file is opened exactly once so no handle is leaked. */
        fp = fopen("Text.txt", "r+");
        if (fp == NULL)
            fp = fopen("Text.txt", "w+");
        if (fp == NULL)
        {
            err = "could not open Text.txt";
            goto done;
        }
        if (fputs(svData, fp) == EOF)
            err = "could not write Text.txt";
        if (fclose(fp) != 0 && err == NULL)
            err = "could not write Text.txt";
        if (err == NULL)
            rc = EXIT_SUCCESS;

    done:
        if (err != NULL)
            fprintf(stderr, "%s\n", err);
        free(szData);
        szData = NULL;
        return rc;
    }

     优化1.2版:

     1 #include <stdlib.h>
     2 #include <stdio.h>
     3 #include <string.h>
     4 #include "lyGetHttpResult.h"
     5 #include "lyPublic/lyCodeConvert.c"
     /*
      * Copies one line of page text from src into dst, leaving out the bytes
      * '<', '>', '/', '-' and ASCII lowercase letters (which removes the tag
      * names and tag punctuation). Copying stops at a newline, at the end of
      * src, or when dst is full; dst is always NUL-terminated.
      *
      * Double-byte GBK characters are copied as a unit so that a trail byte
      * that happens to fall in 'a'..'z' is not stripped.
      */
     static void ExtractVisibleText(const char *src, char *dst, size_t dstSize)
     {
         size_t n = 0;

         while (*src != '\0' && *src != '\n' && n + 1 < dstSize)
         {
             unsigned char c = (unsigned char)*src;

             if (c >= 0x81 && (unsigned char)src[1] >= 0x40)
             {
                 /* GBK lead byte followed by a plausible trail byte. */
                 if (n + 2 >= dstSize)
                     break;
                 dst[n++] = src[0];
                 dst[n++] = src[1];
                 src += 2;
                 continue;
             }
             if (c != '<' && c != '>' && c != '/' && c != '-' && (c < 'a' || c > 'z'))
                 dst[n++] = (char)c;
             src++;
         }
         dst[n] = '\0';
     }

     /*
      * Version 1.2: reads words from stdin, one per line, and prints the
      * synonym line for each of them.
      *
      * For every word the program downloads
      * http://www.chazidian.com/jinyici/<word>, looks for the text
      * "<word></span> - " in the page, prints the rest of that line with the
      * markup stripped, and then prints it again without the leading word.
      *
      * Returns EXIT_SUCCESS at end of input, EXIT_FAILURE when a page cannot
      * be downloaded.
      */
     int main()
     {
         static const char urlBase[] = "http://www.chazidian.com/jinyici/";
         static const char andStr[] = "</span> - "; /* marker that follows the word */
         char szUrl[512] = "";
         char svData[1024 * 40] = "";
         char strFrom[100] = "";
         char str2[100] = "";
         char reStr[100] = "";
         /* Large enough for the longest word plus the marker. */
         char findStr[sizeof(strFrom) + sizeof(andStr)] = "";
         char outStr[1024] = "";
         char *szData = NULL;
         char *p;
         size_t len;

         /* fgets instead of gets: over-long input is truncated, not written
          * past the end of strFrom. */
         while (fgets(strFrom, sizeof(strFrom), stdin))
         {
             strFrom[strcspn(strFrom, "\r\n")] = '\0';
             if (strFrom[0] == '\0')
                 continue;

             /* Keep a copy of the word as typed, taken before CodeConvert runs. */
             strcpy(reStr, strFrom);

             /* Convert the word from GBK to UTF-8 (mode 2) before it is
              * appended to the address. */
             memset(str2, 0, sizeof(str2));
             CodeConvert(strFrom, str2, sizeof(str2), 2);
             str2[sizeof(str2) - 1] = '\0';

             /* Plain copies instead of using the word as a printf format;
              * urlBase plus at most 99 bytes always fits in szUrl. */
             strcpy(szUrl, urlBase);
             strcat(szUrl, str2);

             szData = GetDataFromWeb(szUrl, NULL, NULL, 1, 5);
             if (!szData)
                 return EXIT_FAILURE;

             /* The search below uses the GBK text typed by the user, so the
              * page is converted back (mode 1). svData then holds its own
              * copy and the download buffer can be released right away. */
             CodeConvert(szData, svData, sizeof(svData), 1);
             svData[sizeof(svData) - 1] = '\0';
             free(szData);
             szData = NULL;

             strcpy(findStr, strFrom);
             strcat(findStr, andStr);
             p = strstr(svData, findStr);
             if (p == NULL)
             {
                 fprintf(stderr, "no synonym entry found for \"%s\"\n", reStr);
                 continue;
             }

             /* Because of how the page is laid out, the entry ends at the
              * first newline. */
             ExtractVisibleText(p, outStr, sizeof(outStr));
             puts(outStr);

             /* Drop the word itself plus the two bytes that follow it in the
              * stripped line, leaving only the synonyms. */
             p = strstr(outStr, reStr);
             len = strlen(reStr);
             if (p != NULL && strlen(p) >= len + 2)
                 printf("%s\n", p + len + 2);
         }
         return EXIT_SUCCESS;
     }
    View Code
  • 相关阅读:
    vue项目实现路由按需加载
    常用的meta标签
    聊聊https
    Centos 7 忘记root密码修改方法
    find常用命令
    linux上的mysql忘记密码
    kaill 安装zenmap软件
    selenium.common.exceptions.WebDriverException: Message: 'chromedriver'解决
    centos7 开机执行脚本或者命令
    Linux下载常用命令
  • 原文地址:https://www.cnblogs.com/zibuyu/p/3194275.html
Copyright © 2011-2022 走看看