zoukankan      html  css  js  c++  java
  • 爬虫之初试

      MD,这个星期简直了~~有时候真是一步错步步错啊,各种套路,好了,废话不多说,因为急求数据,于是按照同事的方法,自己写了一份爬虫,算是初试吧。

      1 <?php
        // Bootstrap for the crawler script: declares the response encoding, loads the
        // project includes and opens the database connection that toDataBase() later
        // picks up through `global $db`.
      2 header("Content-type: text/html; charset=utf-8");
      3 include_once "Connection.php";
      4 include_once "autoload.php";
      5 
        // NewADOConnection() is not defined in this file; presumably it is provided by
        // one of the includes above (ADOdb-style API) -- TODO confirm.
      6 $db = NewADOConnection('pgsql');
        // NOTE(review): the connection arguments are elided ("~~~~~") in this listing;
        // real connection parameters must be supplied before the script can run.
      7 $db->Connect(~~~~~);
        // Ask the server to exchange text with this client as UTF-8.
      8 $db->Execute("set names utf8;");
     // Cities to crawl. Each entry is [slug, name]: the slug is inserted into the
     // history-page URL by getCityData(), the name is stored in the `city` column by
     // toDataBase(). Every city is listed exactly once so that no city is fetched
     // and inserted twice.
     $city = [
         ['beijing', '北京'],
         ['tianjin', '天津'],
         ['shanghai', '上海'],
         ['chongqing', '重庆'],
         ['guangzhou', '广州'],
         ['shenzhen', '深圳'],
         ['shijiazhuang', '石家庄'],
         ['taiyuan', '太原'],
         ['huhehaote', '呼和浩特'],
         ['shenyang', '沈阳'],
         ['changchun', '长春'],
         ['haerbin', '哈尔滨'],
         ['nanjing', '南京'],
         ['hangzhou', '杭州'],
         ['hefei', '合肥'],
         ['fujianfuzhou', '福州'],
         ['nanchang', '南昌'],
         ['jinan', '济南'],
         ['zhengzhou', '郑州'],
         ['wuhan', '武汉'],
         ['changsha', '长沙'],
         ['nanning', '南宁'],
         ['haikou', '海口'],
         ['chengdu', '成都'],
         ['guiyang', '贵阳'],
         ['kunming', '昆明'],
         ['lasa', '拉萨'],
         ['xian', '西安'],
         ['lanzhou', '兰州'],
         ['xining', '西宁'],
         ['yinchuan', '银川'],
         ['wulumuqi', '乌鲁木齐']
     ];
     49 
     50 
     // Crawl each configured city in turn and store its rows. Every entry is
     // visited, including the first one (index 0). The run stops at the first city
     // whose insert fails, after dumping that entry so it can be inspected.
     foreach ($city as $entry) {
         list($slug, $cityName) = $entry;

         $cityData = getCityData($slug);
         var_dump($cityData);

         $res = toDataBase($cityData, $cityName);
         if ($res === false) {
             var_dump($entry);
             break;
         }
         var_dump($res);
     }
     61 
     62 
     63 
     /**
      * Fetch twelve months of daily weather history for one city.
      *
      * For each month the history page is downloaded with httpcurl(), converted
      * from GBK to UTF-8 and parsed with pregData().
      *
      * @param string $city URL slug of the city as used in the history-page path.
      * @param string $year Four-digit year to fetch; defaults to "2014".
      * @return array Twelve entries in month order. Each entry is the list of day
      *               rows returned by pregData(), or an empty array when that
      *               month's page could not be downloaded or converted.
      */
     function getCityData($city = "beijing", $year = "2014")
     {
         $result = [];
         for ($month = 1; $month <= 12; $month++) {
             $time = $year . sprintf("%02d", $month);
             $url = "http://www.tianqihoubao.com/lishi/" . $city . "/month/" . $time . ".html";

             $raw = httpcurl($url);
             if (!is_string($raw) || $raw === '') {
                 // Download failed: keep the month slot but record no rows, rather
                 // than feeding `false` into the converter and the parser.
                 $result[] = [];
                 continue;
             }

             // The site serves GBK; the rest of the pipeline works in UTF-8.
             $utf8 = iconv("GBK", "UTF-8", $raw);
             if ($utf8 === false) {
                 // iconv() gives up on byte sequences that are not valid GBK.
                 $result[] = [];
                 continue;
             }

             $result[] = pregData($utf8);
         }
         return $result;
     }
     81 
     /**
      * Store one city's scraped rows in the air_report table using a single
      * multi-row INSERT.
      *
      * @param array  $data Months, each a list of day rows, each day row a list of
      *                     four cells (date, state, temperature, wind). A cell may
      *                     arrive already wrapped in single quotes; that wrapper is
      *                     removed and the value is re-quoted through the database
      *                     connection so embedded quotes cannot break the statement.
      * @param string $city City name written to the `city` column of every row.
      * @return mixed The value returned by $db->Execute() on success; false when
      *               there is nothing to insert or the statement fails.
      */
     function toDataBase($data, $city)
     {
         global $db;

         if (!$data) {
             return false;
         }

         $cityValue = $db->qstr($city);
         $tuples = [];
         foreach ($data as $days) {
             if (!is_array($days)) {
                 continue;
             }
             foreach ($days as $cells) {
                 // A trailing partial chunk or a malformed row would produce a tuple
                 // with the wrong column count and fail the whole statement.
                 if (!is_array($cells) || count($cells) !== 4) {
                     continue;
                 }
                 $quoted = [];
                 foreach ($cells as $cell) {
                     $cell = (string) $cell;
                     if (strlen($cell) >= 2 && $cell[0] === "'" && substr($cell, -1) === "'") {
                         // Drop the caller-supplied wrapper; the cast covers older
                         // PHP versions where substr() can return false for "''".
                         $cell = (string) substr($cell, 1, -1);
                     }
                     $quoted[] = $db->qstr($cell);
                 }
                 $quoted[] = $cityValue;
                 $tuples[] = "(" . implode(",", $quoted) . ")";
             }
         }

         // Months without rows must not lead to "... values " with no tuples.
         if (!$tuples) {
             return false;
         }

         $sql = "insert into air_report(date,state,temperature,wind,city) values " . implode(",", $tuples);
         $res = $db->Execute($sql);
         if ($res == false) {
             return false;
         }
         return $res;
     }
    110 
    111 
    /**
     * Extract the daily rows from one monthly weather-history page.
     *
     * Takes the first <table width="100%"> in the markup, reads the text of every
     * <td> in it, discards the first four cells (the header row) and groups the
     * rest into rows of four.
     *
     * @param string $str Page markup, already converted to UTF-8.
     * @return array List of rows; each row holds up to four cell texts, every one
     *               wrapped in single quotes. Empty when the page has no matching
     *               table or cells.
     */
    function pregData($str)
    {
        if (!is_string($str) || $str === '') {
            return [];
        }

        // '#' delimiters and single-quoted literals, so neither the double quotes
        // in the attribute nor the slash in the closing tags need escaping.
        if (!preg_match('#<table width="100%".*?>.*?</table>#ism', $str, $table)) {
            return [];
        }
        if (!preg_match_all('#<td.*?>.*?</td>#ism', $table[0], $cells)) {
            return [];
        }

        // The first four cells are the column headings.
        $cells = array_slice($cells[0], 4);

        $values = [];
        foreach ($cells as $cell) {
            $text = str_replace('&nbsp;', ' ', strip_tags($cell));

            // Collapse whitespace runs (including no-break spaces) and drop control
            // and format characters so no invisible characters end up in the
            // stored values. preg_replace() yields null on invalid UTF-8; in that
            // case the text is kept as it was.
            $collapsed = preg_replace('/[\s\x{00A0}]+/u', ' ', $text);
            if ($collapsed !== null) {
                $text = $collapsed;
                $visible = preg_replace('/[\p{Cc}\p{Cf}]+/u', '', $text);
                if ($visible !== null) {
                    $text = $visible;
                }
            }

            $values[] = "'" . trim($text) . "'";
        }

        return array_chunk($values, 4);
    }
    130 
    131 
    /**
     * Perform an HTTP request with cURL and return the response body.
     *
     * @param string            $url       Address to request.
     * @param array|string|null $post_data When not null, the request is sent as a
     *                                     POST with this payload; with the default
     *                                     null a plain GET is issued.
     * @return string|false Response body, or false when the handle could not be
     *                      created or the transfer failed.
     */
    function httpcurl($url, $post_data = null)
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // Bound connect and total time so one stalled request cannot hang a crawl.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);

        if ($post_data !== null) {
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $post_data);
        }

        $output = curl_exec($ch);
        curl_close($ch);
        return $output;
    }

      数据有时取出来后格式还是有问题的,刚好碰上的一个取出来的值里面包含不可见的字符,想要去除暂时还没有想到合适的方法。

    jeyfang
  • 相关阅读:
    [图论分块] HDU 4858 项目管理
    pytorch 自定义权重变量初始化
    Linux系统下命令静默安装weblogic12c
    第六章 类文件结构(2)
    第六章 类文件结构(1)
    第三章 垃圾收集器与内存分配策略
    第二章(3)实战: OutOfMemoryError异常
    第二章(2)HotSpot虚拟机对象探秘
    第二章(1) Java内存区域与内存溢出异常
    第一章 走进java
  • 原文地址:https://www.cnblogs.com/jeyfang/p/6321878.html
Copyright © 2011-2022 走看看