zoukankan      html  css  js  c++  java
  • 一个爬虫

    <?php
    // Script entry point: run the import as soon as this file is executed.
    read();

    /**
     * Read the input file line by line, pull the labelled fields out of each
     * line, enrich each kept record with remote lookups and hand everything
     * to cl_slqi() for storage.
     *
     * Per line:
     *  - the fields between the label markers (用户名: / email: / 真名: /
     *    身份证号: / 绑定手机号 / 行卡号:) are extracted in order;
     *  - lines whose ID-number field is not exactly 18 bytes are skipped;
     *  - the ID number is looked up remotely and parsed with getIDinfo();
     *  - a bank card number of 16-19 bytes is looked up remotely and, unless
     *    the response contains "对不起", parsed with getBankinfo().
     *
     * Finally the number of kept lines is echoed.
     *
     * NOTE(review): the lookups send ID and bank card numbers to third-party
     * sites over plain HTTP - confirm that this is acceptable for this data.
     *
     * @return void
     * @throws RuntimeException when the input file cannot be opened
     */
    function read(){
        // Belt and braces: declare UTF-8 in the HTTP header and in the markup.
        header("Content-type:text/html;charset=utf-8");
        echo '<meta charset="utf8">';

        $myfile = fopen('D:歌词.txt.txt','r');
        if ($myfile === false) {
            // Without this check fgets() would be handed `false` instead of a stream.
            throw new RuntimeException('Unable to open input file: D:歌词.txt.txt');
        }
        echo '1';

        // Returns the trimmed text between $startMarker and $endMarker, searching
        // from byte $offset. 'end' is the position of the end marker (the next
        // field's search offset). When either marker is missing the value is ''
        // and the offset is left where it was, so later fields can still be found.
        $extract = function ($line, $startMarker, $endMarker, $offset) {
            $start = strpos($line, $startMarker, $offset);
            if ($start === false) {
                return ['value' => '', 'end' => $offset];
            }
            $valueStart = $start + strlen($startMarker);
            $end = strpos($line, $endMarker, $valueStart);
            if ($end === false) {
                return ['value' => '', 'end' => $offset];
            }
            return [
                'value' => trim((string) substr($line, $valueStart, $end - $valueStart)),
                'end'   => $end,
            ];
        };

        // Fields needed before the ID filter: record key => [start marker, end marker].
        $leadingFields = [
            'net_name' => ['用户名:', 'email:'],
            'email'    => ['email:', '真名:'],
            'name'     => ['真名:', '身份证号:'],
            'idCard'   => ['身份证号:', '绑定手机号'],
        ];
        // Fields only extracted for lines that pass the ID filter.
        $trailingFields = [
            'phone_number' => ['绑定手机号', '账户可'],
            'bankCard'     => ['行卡号:', '银行:'],
        ];

        $info = [];
        $number = 0;
        // Compare against false explicitly so a falsy line (e.g. "0") does not end the loop.
        while (($line = fgets($myfile)) !== false) {
            $record = [];
            $offset = 0;

            foreach ($leadingFields as $key => $markers) {
                $found = $extract($line, $markers[0], $markers[1], $offset);
                $record[$key] = $found['value'];
                $offset = $found['end'];
            }

            // Only lines carrying an 18-byte ID number are processed.
            if (strlen($record['idCard']) != 18) {
                continue;
            }
            $number = $number + 1;

            foreach ($trailingFields as $key => $markers) {
                $found = $extract($line, $markers[0], $markers[1], $offset);
                $record[$key] = $found['value'];
                $offset = $found['end'];
            }

            // Look up the ID number; the remote page is fetched as gb2312.
            $idPage = curl(
                'http://qq.ip138.com/idsearch/index.asp?action=idcard&userid=' . urlencode($record['idCard']),
                'gb2312'
            );
            $record['idCrad_info'] = getIDinfo(is_string($idPage) ? $idPage : '');

            // Look up the bank card only when its length is plausible (16-19 bytes).
            $cardLength = strlen($record['bankCard']);
            if ($cardLength > 15 && $cardLength < 20) {
                $bankPage = curl('http://www.cardcn.com/search.php?word=' . urlencode($record['bankCard']));
                // "对不起" in the response means the lookup found nothing.
                if (is_string($bankPage) && $bankPage !== '' && strpos($bankPage, '对不起') === false) {
                    $record['bankCard_info'] = getBankinfo($bankPage);
                }
            }

            $info[] = $record;
        }
        fclose($myfile);

        cl_slqi($info);
        echo $number;
    }
    
    /**
     * Fetch a URL with cURL and return the response body as a UTF-8 string.
     *
     * Redirects are followed and response headers are not included in the
     * result. Works for https URLs as well.
     *
     * NOTE(review): certificate and host verification are switched off, as in
     * the original code, so https responses are not authenticated. Re-enable
     * CURLOPT_SSL_VERIFYPEER / CURLOPT_SSL_VERIFYHOST if the targets allow it.
     *
     * @param string $url     Address to fetch.
     * @param string $coding  Character encoding of the remote page; anything other
     *                        than utf-8 (case-insensitive) is converted to UTF-8.
     * @param int    $timeout Maximum number of seconds for the whole transfer.
     * @return string Response body in UTF-8, or '' when the request or the
     *                conversion fails.
     */
    function curl($url,$coding='utf-8',$timeout=30) {
        $ch = curl_init();
        if ($ch === false) {
            return '';
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 0);          // do not return the response headers
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);  // return the body instead of printing it
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);  // follow redirects
        // Bound the request so one unresponsive server cannot stall the caller's loop.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
        // Certificate and host are deliberately not verified (see NOTE above).
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);

        $result = curl_exec($ch);
        // Release the cURL handle.
        curl_close($ch);

        // curl_exec() yields false on failure; callers run string functions on
        // the result, so normalise failures to an empty string.
        if (!is_string($result)) {
            return '';
        }

        // Pages that are not UTF-8 must be transcoded.
        if (strcasecmp($coding, 'utf-8') !== 0) {
            $converted = iconv($coding, "utf-8//IGNORE", $result);
            return $converted === false ? '' : $converted;
        }
        return $result;
    }
    
    /**
     * Extract ID-card details from the HTML of an ID lookup result page.
     *
     * The values are located by the label cells that precede them. A field
     * whose label (or, for the place, its terminator) is not present is
     * returned as an empty string instead of arbitrary page content.
     *
     * @param string $crul HTML of the lookup result page (UTF-8).
     * @return array{date:string,sex:string,idCard_space:string}
     *         'date' is the first 4 bytes after the birth-date label (the year
     *         when the page prints the date year-first), 'sex' the first
     *         3 bytes after the sex label (one UTF-8 CJK character),
     *         'idCard_space' the text up to the '<br/></td></t' terminator.
     */
    function getIDinfo($crul){
        $sexMarker      = '别:</td><td class="tdc2">';
        $dateMarker     = '生日期:</td><td class="tdc2">';
        $placeMarker    = ';地:</td><td class="tdc2">';
        $placeEndMarker = '<br/></td></t';

        $id_info = ['date' => '', 'sex' => '', 'idCard_space' => ''];
        if (!is_string($crul) || $crul === '') {
            return $id_info;
        }

        // Each label is searched from the position of the previous one that was
        // found, mirroring the order in which they appear on the page.
        $offset = 0;

        $sexIndex = strpos($crul, $sexMarker, $offset);
        if ($sexIndex !== false) {
            $id_info['sex'] = trim((string) substr($crul, $sexIndex + strlen($sexMarker), 3));
            $offset = $sexIndex;
        }

        $dateIndex = strpos($crul, $dateMarker, $offset);
        if ($dateIndex !== false) {
            $id_info['date'] = trim((string) substr($crul, $dateIndex + strlen($dateMarker), 4));
            $offset = $dateIndex;
        }

        $placeIndex = strpos($crul, $placeMarker, $offset);
        if ($placeIndex !== false) {
            $placeStart = $placeIndex + strlen($placeMarker);
            $placeEnd = strpos($crul, $placeEndMarker, $placeStart);
            if ($placeEnd !== false) {
                $id_info['idCard_space'] = trim((string) substr($crul, $placeStart, $placeEnd - $placeStart));
            }
        }

        return $id_info;
    }
    
    /**
     * Extract bank-card details from the HTML of a card lookup result page.
     *
     * Every value sits between a label and the next '</dt>'. The first three
     * labels are searched in sequence, each from the end of the previous
     * match; the card-kind label is searched from the start of the page.
     * A field whose label or terminator is missing is returned as an empty
     * string instead of arbitrary page content.
     *
     * @param string $bank_crul HTML of the lookup result page (UTF-8).
     * @return array{back_space:string,bank_name:string,bankCard_name:string,bank_kind:string}
     */
    function getBankinfo($bank_crul){
        $bank_info = [
            'back_space'    => '',
            'bank_name'     => '',
            'bankCard_name' => '',
            'bank_kind'     => '',
        ];
        if (!is_string($bank_crul) || $bank_crul === '') {
            return $bank_info;
        }

        // Returns ['value' => text between $startMarker and the next '</dt>',
        // 'end' => position of that '</dt>'], or null when either is missing.
        $between = function ($startMarker, $offset) use ($bank_crul) {
            $start = strpos($bank_crul, $startMarker, $offset);
            if ($start === false) {
                return null;
            }
            $valueStart = $start + strlen($startMarker);
            $end = strpos($bank_crul, '</dt>', $valueStart);
            if ($end === false) {
                return null;
            }
            return [
                'value' => trim((string) substr($bank_crul, $valueStart, $end - $valueStart)),
                'end'   => $end,
            ];
        };

        // Card region, bank name, card name: searched one after another.
        $chainedLabels = [
            'back_space'    => 'e">归属信息:</font>',
            'bank_name'     => 'e">银行名称:</font>',
            'bankCard_name' => 'e">银行卡名:</font>',
        ];
        $offset = 0;
        foreach ($chainedLabels as $field => $label) {
            $found = $between($label, $offset);
            if ($found !== null) {
                $bank_info[$field] = $found['value'];
                $offset = $found['end'];
            }
        }

        // Card kind: searched from the beginning of the page.
        $kind = $between('<dt><font class="con_sub_title">银行卡种:</font>', 0);
        if ($kind !== null) {
            $bank_info['bank_kind'] = $kind['value'];
        }

        return $bank_info;
    }
    
    
    /**
     * Return the trimmed text found between two marker strings (scraping helper).
     *
     * The end marker is searched only after the end of the start marker, so a
     * start marker that happens to contain the end marker cannot produce a
     * negative-length slice.
     *
     * @param string $info      Text to search (e.g. a fetched web page).
     * @param string $first_key Marker that precedes the wanted text.
     * @param string $last_key  Marker that follows the wanted text.
     * @return string The text between the first occurrence of $first_key and the
     *                next occurrence of $last_key, trimmed; '' when either
     *                marker is missing or empty.
     */
    function getKeyWord($info,$first_key,$last_key){
        if (!is_string($info) || $first_key === '' || $last_key === '') {
            return '';
        }

        $first_key_start = strpos($info, $first_key);
        if ($first_key_start === false) {
            return '';
        }

        $value_start = $first_key_start + strlen($first_key);
        $last_key_start = strpos($info, $last_key, $value_start);
        if ($last_key_start === false) {
            return '';
        }

        return trim((string) substr($info, $value_start, $last_key_start - $value_start));
    }
    
    /**
     * Insert the collected records into the `info` table.
     *
     * Records that carry a 'bankCard_info' entry are written with the five
     * additional bank columns; all others with the eight basic columns only.
     * A success or failure message is echoed per record.
     *
     * Values are sent through prepared statements, so quotes or SQL fragments
     * in the parsed / scraped data can neither break nor alter the query.
     *
     * NOTE(review): the connection uses hard-coded root credentials and the
     * server's default connection charset - consider moving the credentials to
     * configuration and confirming the charset matches the UTF-8 input.
     *
     * @param array $arr List of records as built by read(); each record has the
     *                   keys name, idCard, net_name, email, phone_number,
     *                   bankCard, idCrad_info (idCard_space, sex, date) and
     *                   optionally bankCard_info (bank_name, bankCard_name,
     *                   bank_kind, back_space).
     * @return void
     */
    function cl_slqi($arr){
        $con = mysqli_connect('localhost','root','root','aiqiyi');
        if(!$con){
            die('could not connect');
        }

        // Variables the statements are bound to; filled in per record below.
        $name = $idCard = $idCardSpace = $sex = $date = $netName = $email = $phoneNumber = '';
        $bankCard = $bankName = $bankCardName = $bankKind = $bankSpace = '';

        $basicStmt = mysqli_prepare(
            $con,
            'insert into info(name,idCard,idCard_space,sex,date,net_name,email,phone_number) values(?,?,?,?,?,?,?,?)'
        );
        if ($basicStmt) {
            mysqli_stmt_bind_param(
                $basicStmt, 'ssssssss',
                $name, $idCard, $idCardSpace, $sex, $date, $netName, $email, $phoneNumber
            );
        }

        $bankStmt = mysqli_prepare(
            $con,
            'insert into info(name,idCard,idCard_space,sex,date,net_name,email,phone_number,bankCard,back_name,bankCard_name,back_kind,back_space) values(?,?,?,?,?,?,?,?,?,?,?,?,?)'
        );
        if ($bankStmt) {
            mysqli_stmt_bind_param(
                $bankStmt, 'sssssssssssss',
                $name, $idCard, $idCardSpace, $sex, $date, $netName, $email, $phoneNumber,
                $bankCard, $bankName, $bankCardName, $bankKind, $bankSpace
            );
        }

        $temp = 0;
        foreach($arr as $row){
            $name        = $row['name'];
            $idCard      = $row['idCard'];
            $idCardSpace = $row['idCrad_info']['idCard_space'];
            $sex         = $row['idCrad_info']['sex'];
            $date        = $row['idCrad_info']['date'];
            $netName     = $row['net_name'];
            $email       = $row['email'];
            $phoneNumber = $row['phone_number'];

            if(!isset($row['bankCard_info'])){
                $stmt = $basicStmt;
            }else{
                $bankCard     = $row['bankCard'];
                $bankName     = $row['bankCard_info']['bank_name'];
                $bankCardName = $row['bankCard_info']['bankCard_name'];
                $bankKind     = $row['bankCard_info']['bank_kind'];
                $bankSpace    = $row['bankCard_info']['back_space'];
                $stmt = $bankStmt;
            }

            // A statement that failed to prepare counts as a failed insert for the row.
            if($stmt && mysqli_stmt_execute($stmt)){
                echo 'insert成功!这是第'.$temp.'个成功!';
                $temp++;
                echo "
    ";
            }else{
                echo 'insert失败!';echo "
    ";
            }
        }

        if ($basicStmt) {
            mysqli_stmt_close($basicStmt);
        }
        if ($bankStmt) {
            mysqli_stmt_close($bankStmt);
        }
        mysqli_close($con);
    }
    ?>
  • 相关阅读:
    golang实现单链表
    koa中间执行机制
    vuex源码简析
    从浏览器渲染过程看重绘回流
    javascript的this
    js 设计模式:观察者和发布订阅模式
    H5 移动端 键盘遮挡焦点元素解决方案
    webpack4 css modules
    Daily,一个入门级的 React Native 应用
    javascript: 类型转换
  • 原文地址:https://www.cnblogs.com/cl94/p/9020751.html
Copyright © 2011-2022 走看看