zoukankan      html  css  js  c++  java
  • 网页抓取- 3

    http://www.diyifanwen.com/jinyici/jinyici-A/

    页面抓取

    #include <stdlib.h>
    #include <stdio.h>
    #include <string.h>
    #include "lyGetHttpResult.h"
    #include "lyPublic/lyCodeConvert.h"
    /*
     * Tunables for the scraper.  The look-back and skip distances are hand-tuned
     * to the markup of www.diyifanwen.com and are carried over unchanged from the
     * original implementation.
     */
    enum {
        FETCH_ATTEMPTS = 3,   /* tries per URL before giving up                  */
        URL_CAP        = 256, /* capacity of the URL scratch buffer              */
        HREF_LOOKBACK  = 60,  /* how far before " title=" the href may start     */
        NEXT_LOOKBACK  = 55,  /* how far before the next-page label to look      */
        NEXT_TAIL_SKIP = 18   /* bytes between the link target and the label     */
    };

    /*
     * Download one page, retrying a bounded number of times.
     *
     * NOTE(review): GetDataFromWeb() is declared in lyGetHttpResult.h and its
     * contract is not visible here.  This code assumes it returns NULL on failure
     * and otherwise a NUL-terminated buffer that the caller releases with free()
     * (the original program freed its results the same way) -- confirm.
     *
     * Returns the page text, or NULL when every attempt failed.
     */
    static char *fetch_page(char *url)
    {
        char *body = NULL;
        int attempt;

        for (attempt = 0; attempt < FETCH_ATTEMPTS && body == NULL; attempt++)
            body = GetDataFromWeb(url, NULL, NULL, 1, 5);
        return body;
    }

    /*
     * Append the bytes [from, to) to the NUL-terminated string in dst.
     *
     * dst must already hold a string shorter than cap.  Returns 0 on success and
     * -1 (leaving dst untouched) when the result would not fit in cap bytes.
     */
    static int append_span(char *dst, size_t cap, const char *from, const char *to)
    {
        size_t used = strlen(dst);
        size_t extra = (to > from) ? (size_t)(to - from) : 0;

        if (used + extra >= cap)
            return -1;
        memcpy(dst + used, from, extra);
        dst[used + extra] = '\0';
        return 0;
    }

    /*
     * Copy the synonym list of a detail page to fp (and echo it to stdout).
     *
     * The list is taken to be the text between the first "】</span>" marker and
     * the first "<br><span>" marker.  Markup is removed crudely: '<', '/', '>',
     * '&' and every lowercase ASCII letter are dropped, which strips tags and
     * entity names; each ';' (the end of an entity) becomes one space in the file.
     * Nothing is written when either marker is missing.
     */
    static void write_synonyms(const char *page, FILE *fp)
    {
        const char *s = strstr(page, "】</span>");
        const char *end = strstr(page, "<br><span>");

        if (s == NULL || end == NULL)
            return;

        /* Skip the leading marker bytes; the rest of the tag is filtered below. */
        for (s += 3; s < end; s++) {
            char c = *s;

            if (c == ';') {
                /* Tested first: in the original this branch was unreachable. */
                fputc(' ', fp);
            } else if (c != '\r' /* NOTE(review): literal was lost in the source; '\r' assumed */
                       && c != '<' && c != '/' && c != '>' && c != '&'
                       && (c < 'a' || c > 'z')) {
                putchar(c);
                fputc(c, fp);
            }
        }
    }

    /*
     * Build the absolute URL of the page that the "next page" link points to.
     *
     * The link target is located relative to the first next-page label using the
     * fixed offsets above, then appended to base.  Returns 0 on success, -1 when
     * the label or target cannot be found or the URL does not fit in cap bytes.
     */
    static int build_next_page_url(const char *page, const char *base,
                                   char *url, size_t cap)
    {
        const char *label = strstr(page, "下一页");
        const char *start;
        const char *stop;

        if (label == NULL || label - page < NEXT_TAIL_SKIP)
            return -1;
        stop = label - NEXT_TAIL_SKIP;
        start = (label - page > NEXT_LOOKBACK) ? label - NEXT_LOOKBACK : page;

        while (start < stop && *start != '=')
            start++;
        if (stop - start <= 2)
            return -1;
        start += 2; /* step over '=' and the opening quote */

        snprintf(url, cap, "%s", base);
        return append_span(url, cap, start, stop);
    }

    /*
     * Scrape the synonym dictionary at www.diyifanwen.com into "1.txt".
     *
     * For every link with a title attribute on a listing page, the title and the
     * synonyms from the linked detail page are written as one line of the file
     * (title and URL are also echoed to stdout).  Listing pages are followed via
     * their next-page link; when a letter has no further page the scraper moves on
     * to the next letter, stopping after 'Z'.
     *
     * "1.txt" is opened with mode "rt+", so it must already exist.
     *
     * Returns 0 on success, 1 when the output file or a listing page is
     * unavailable.
     */
    int main()
    {
        static const char base[] = "http://www.diyifanwen.com/";
        char letter_url[] = "http://www.diyifanwen.com//jinyici/jinyici-A/";
        char *letter = &letter_url[sizeof letter_url - 3]; /* the 'A' before the final '/' */
        char url[URL_CAP];
        char *list;         /* current listing page */
        const char *cursor; /* current " title=" match inside list */
        FILE *fp;
        int rc = 0;

        fp = fopen("1.txt", "rt+");
        if (fp == NULL) {
            perror("1.txt");
            return 1;
        }

        snprintf(url, sizeof url, "http://www.diyifanwen.com/jinyici/jinyici-A/");
        list = fetch_page(url);
        if (list == NULL) {
            fprintf(stderr, "fetch failed: %s\n", url);
            fclose(fp);
            return 1;
        }

        cursor = strstr(list, " title=");
        while (cursor != NULL) {
            const char *href;
            const char *href_end;
            const char *title;
            int have_url = 0;

            /*
             * The href value ends at the quote just before " title=" and starts
             * at the first '/' found within HREF_LOOKBACK bytes before the match.
             */
            if (cursor > list) {
                href_end = cursor - 1;
                href = (cursor - list > HREF_LOOKBACK) ? cursor - HREF_LOOKBACK : list;
                while (href < href_end && *href != '/')
                    href++;
                if (href < href_end) {
                    snprintf(url, sizeof url, "%s", base);
                    have_url = (append_span(url, sizeof url, href, href_end) == 0);
                }
            }

            /* The title text follows ` title="`. */
            title = cursor + 7;
            if (*title != '\0')
                title++;

            if (have_url) {
                char *detail;

                while (*title != '"' && *title != '\0') {
                    putchar(*title);
                    fputc(*title, fp);
                    title++;
                }
                fputs("  ", fp);
                puts(url);

                detail = fetch_page(url);
                if (detail != NULL) {
                    write_synonyms(detail, fp);
                    free(detail);
                } else {
                    fprintf(stderr, "fetch failed: %s\n", url);
                }
                fputc('\n', fp);
            }

            /* Resume after the title so the search never runs past the NUL. */
            cursor = strstr(title, " title=");

            if (cursor == NULL) {
                /* Listing page exhausted: go to the next page or next letter. */
                char *next = NULL;

                if (strstr(list, "下一页</a> <a href") == NULL) {
                    printf("oooo");
                    (*letter)++;
                    if (*letter > 'Z')
                        break;
                    next = fetch_page(letter_url);
                } else if (build_next_page_url(list, base, url, sizeof url) == 0) {
                    next = fetch_page(url);
                }

                if (next == NULL) {
                    fprintf(stderr, "could not load the next listing page\n");
                    rc = 1;
                    break;
                }
                free(list);
                list = next;
                cursor = strstr(list, " title=");
            }
        }

        if (fclose(fp) != 0) {
            perror("1.txt");
            rc = 1;
        }
        /* Only the buffer itself is freed; cursor merely points into it. */
        free(list);
        return rc;
    }
  • 相关阅读:
    实验二 结对编程 第二阶段
    实验二 结对编程 第一阶段
    实验一 GIT 代码版本管理
    实验五 单元测试
    实验四 代码审查
    结对编程 第二阶段
    结对编程 第一阶段
    实验一 GIT代码版本管理
    实验五 单元测试
    实验四 代码评审
  • 原文地址:https://www.cnblogs.com/zibuyu/p/3205056.html
Copyright © 2011-2022 走看看