zoukankan      html  css  js  c++  java
  • 百度收录链接抓取小程序

    // Crawl the Baidu result pages of a "site:" query, resolve every Baidu
    // redirect link to the URL it points at, and store the collected URLs in
    // silian.txt next to this script.
    set_time_limit(0); // the crawl performs many sequential HTTP requests
    header("Content-type:text/html;charset=utf-8");

    $updatePoint = date("Y-m-d"); // forwarded to getLists()
    $pageCount   = 76;            // number of result pages to request
    $pageStep    = 10;            // the "pn" query parameter advances by 10 per page

    // A quoted Baidu redirect link followed by exactly one whitespace character.
    // "?" and "." are escaped so they match literally instead of acting as regex
    // operators. The single trailing character matters: getLists() strips the
    // surrounding quotes plus one trailing character from every full match.
    $pat = '|"http://www\.baidu\.com/link\?url=[^">]*"\s|';

    $xmlDatas = '';
    for ($i = 0; $i < $pageCount; $i++) {
        $page  = $i * $pageStep;
        $conts = file_get_contents("http://www.baidu.com/s?wd=site%3Awww.xxxx.com%20%E4%B9%90%E5%A4%A9%E5%A0%82&pn={$page}&oq=site%3Awww.xxxx.com%20%E4%B9%90%E5%A4%A9%E5%A0%82&ie=utf-8&rsv_idx=1&rsv_pq=aff4775f00063733&rsv_t=ff065MbpZuOoe%2B%2BV4iOkvVuzeSXd1n2FRBQwnnwPHtpsy%2F7pPFaTfcrWm4M&f=8&rsv_bp=1&tn=baidu");

        // A failed download yields false; skip that page rather than feeding a
        // non-string into the pattern matcher.
        if ($conts === false) {
            continue;
        }

        $xmlDatas .= getLists($pat, $conts, $updatePoint);
    }

    // file_put_contents() creates the file when it is missing and truncates it
    // otherwise, so no separate "file does not exist yet" branch is needed.
    $outputFile = dirname(__FILE__) . "/silian.txt";
    if (file_put_contents($outputFile, $xmlDatas) === false) {
        trigger_error("Unable to write {$outputFile}", E_USER_WARNING);
    }
        
        /**
         * Extract redirect links from a page and resolve each to its target URL.
         *
         * Every full match of $pattern is expected to be a double-quoted URL
         * followed by exactly one extra character (e.g. the whitespace after an
         * HTML attribute value). The quotes and that trailing character are
         * stripped before the URL is requested.
         *
         * @param string $pattern     PCRE pattern whose full match is `"<url>"` plus one character.
         * @param string $contents    Page source to scan.
         * @param string $updatePoint Not used by this function; kept so existing callers keep working.
         *
         * @return string Concatenated `Location` header values, one per unique link.
         *                Each value keeps the line terminator sent by the server.
         *                Empty string when nothing matched or nothing redirected.
         */
        function getLists($pattern, $contents, $updatePoint){
            // 0 (no match) and false (PCRE error) both mean "nothing to resolve".
            // Return an empty result instead of terminating the whole script, so
            // the caller keeps whatever it has already collected.
            if (!preg_match_all($pattern, (string) $contents, $matches)) {
                return '';
            }

            $xmlData = '';
            // Drop duplicate links so each one is requested only once.
            foreach (array_unique($matches[0]) as $value) {
                $value = trim($value, '"');      // opening quote
                $value = substr($value, 0, -1);  // the one character after the closing quote
                $value = trim($value, '"');      // closing quote

                $xmlData .= resolveRedirectLocation($value);
            }

            return $xmlData;
        }

        /**
         * Send a minimal HTTP/1.0 GET for $url and return its `Location` header value.
         *
         * Only the response headers are read. The response body is never consumed.
         *
         * @param string $url Absolute URL to request over plain TCP.
         *
         * @return string The `Location` value including its original line terminator,
         *                or '' when the URL is unusable, the connection fails, or the
         *                response carries no `Location` header.
         */
        function resolveRedirectLocation($url){
            $info = parse_url($url);
            if (!is_array($info) || empty($info['host'])) {
                return ''; // malformed or relative URL: nothing to connect to
            }

            $port   = isset($info['port']) ? $info['port'] : 80;
            $target = isset($info['path']) ? $info['path'] : '/';
            if (isset($info['query'])) {
                $target .= '?' . $info['query'];
            }

            $fp = fsockopen($info['host'], $port, $errno, $errstr, 30);
            if ($fp === false) {
                return ''; // connection failed; skip this link
            }

            // HTTP header lines must be terminated by CRLF, and the header block
            // ends with an empty line.
            fputs($fp, "GET {$target} HTTP/1.0\r\n");
            fputs($fp, "Host: {$info['host']}\r\n");
            fputs($fp, "Connection: close\r\n");
            fputs($fp, "\r\n");

            $rewrite = '';
            // Read whole lines (no length cap) so long redirect targets are not cut.
            while (($line = fgets($fp)) !== false) {
                if (rtrim($line, "\r\n") === '') {
                    break; // blank line: end of headers, the body is not needed
                }
                // Header names are case-insensitive.
                if (stripos($line, 'Location:') === 0) {
                    $rewrite = ltrim(substr($line, strlen('Location:')));
                }
            }
            fclose($fp);

            return $rewrite;
        }
    

     此抓取程序主要用于查询被百度收录的网址,并不是直接按关键词来查询收录情况。

  • 相关阅读:
    XNA之3D文字
    SQL2005调用C#编写的DLL
    C#绘图工具之Rotate
    ASP.NET中的WebService
    数据库同步之复制技术
    C#之TCP消息的发送和接受
    Tsql清空表数据的两种方式truncate and delete
    Code First Migrations数据迁移方法
    MSSQLSERVER跨服务器连接
    windows下wget命令行下载工具的使用
  • 原文地址:https://www.cnblogs.com/haishashou/p/5600341.html
Copyright © 2011-2022 走看看