zoukankan      html  css  js  c++  java
  • Linux企业级项目实践之网络爬虫(15)——区分文本文件和二进制文件

    HTTP协议支持文本和二进制文件传输。最常见的HTML格式的页面即文本，图片、音乐等为二进制文件。我们要对这两类文件加以区分并分别处理。


    /* Dot-separated list of file extensions that are treated as binary content. */
    static char * BIN_SUFFIXES = ".jpg.jpeg.gif.png.ico.bmp.swf";

    /*
     * Report whether a URL ends in one of the extensions listed in BIN_SUFFIXES.
     *
     * url: NUL-terminated URL string; NULL is treated as "not binary".
     *
     * Returns 1 when the text from the last '.' to the end of url is exactly
     * one entry of BIN_SUFFIXES, otherwise 0.  The comparison is case-sensitive.
     */
    static int is_bin_url(char *url)
    {
        const char *ext;
        const char *hit;
        size_t ext_len;

        if (url == NULL)
            return 0;

        ext = strrchr(url, '.');
        if (ext == NULL)
            return 0;

        ext_len = strlen(ext);

        /*
         * ext starts with '.', so every strstr hit begins at an entry boundary.
         * A hit only counts when it also ends at one; otherwise a truncated
         * extension such as ".jp" or a bare "." would be accepted as a prefix
         * of ".jpg".
         */
        for (hit = strstr(BIN_SUFFIXES, ext); hit != NULL; hit = strstr(hit + 1, ext)) {
            if (hit[ext_len] == '.' || hit[ext_len] == '\0')
                return 1;
        }
        return 0;
    }

    /*
     * Prefix a site-relative URL with a domain.
     *
     * url:    heap-allocated, NUL-terminated URL (may be NULL).  Unless it is
     *         returned unchanged, this function frees it.
     * domain: NUL-terminated text to place in front of a URL starting with '/'.
     *
     * Returns:
     *   - NULL when url is NULL;
     *   - url itself (not freed, not copied) when it starts with "http";
     *   - a newly malloc'ed string "<domain><url>" when url starts with '/';
     *     url is freed and the caller must free the result;
     *   - NULL, after freeing url, for any other URL, when domain is NULL,
     *     or when the allocation fails.
     */
    char * attach_domain(char *url, const char *domain)
    {
        if (url == NULL)
            return NULL;

        /* Already absolute: hand the same pointer back untouched. */
        if (strncmp(url, "http", 4) == 0)
            return url;

        if (*url == '/' && domain != NULL) {
            size_t ulen = strlen(url);
            size_t dlen = strlen(domain);
            char *joined = malloc(dlen + ulen + 1);

            if (joined != NULL) {
                memcpy(joined, domain, dlen);
                /* ulen + 1 also copies url's terminating NUL. */
                memcpy(joined + dlen, url, ulen + 1);
            }
            free(url);
            return joined;
        }

        /* Unsupported URL form (or no domain to attach): discard it. */
        free(url);
        return NULL;
    }
    
    /*
     * Build a flat file name from a URL: "<domain>_<path>" with every '/'
     * in the path replaced by '_'.
     *
     * url: must be non-NULL; url->domain and url->path are read as
     *      NUL-terminated strings.
     *
     * Returns a newly malloc'ed string that the caller must free, or NULL
     * when the allocation fails.
     */
    char * url2fn(const Url * url)
    {
        size_t dlen = strlen(url->domain);
        size_t plen = strlen(url->path);
        size_t i;
        char *fn = malloc(dlen + plen + 2);   /* domain + '_' + path + NUL */
        char *out;

        if (fn == NULL)
            return NULL;

        memcpy(fn, url->domain, dlen);
        fn[dlen] = '_';

        /* Path separators cannot appear in a file name, so flatten them. */
        out = fn + dlen + 1;
        for (i = 0; i < plen; i++)
            out[i] = (url->path[i] == '/') ? '_' : url->path[i];
        out[plen] = '\0';

        return fn;
    }


  • 相关阅读:
    网络--会话层、表示层、应用层
    Unix C
    操作系统原理
    TCP/UDP
    config OSX firewall programmatically
    Objective-C 浅谈
    OS X background process
    CreateProcessWithToken 1058 error
    WebBrowser keystroke
    MFC加载PNG图片并实现双缓冲
  • 原文地址:https://www.cnblogs.com/new0801/p/6176996.html
Copyright © 2011-2022 走看看