zoukankan      html  css  js  c++  java
  • 抓取“华强电子网”供应商程序

      1 <?php
      2 /** 
      3 * 抓取“华强电子网”供应商主程序 
      4 * author Lee. 
      5 * Last modify $Date: 2012-2-2 12:55:35 $ 
      6 */
      7 require_once './config.inc.php';
      /**
       * Crawler that collects supplier contact details from the hqew.com IC search.
       *
       * For a given part number it walks the paginated search results, derives each
       * supplier's "contact.html" page, extracts the contact fields with regular
       * expressions and stores them in the `huaqiang` table through the
       * project-level Model class.
       */
      class huaqiang {
          /**
           * Safety cap for the page-counting loop, so a markup change on the remote
           * site cannot turn the crawl into an endless loop.
           */
          const MAX_PAGES = 1000;

          private $key;     // part number currently being searched
          private $pageNum; // number of result pages found for $key

          /**
           * Entry point: crawl every supplier listed for one part number.
           *
           * @param string $key part number to search for
           * @return void
           */
          public function go($key) {
              $this->key = $key;
              if ($this->checkIsExistsData()) {
                  $this->pageNum = $this->getPageNum();
                  $this->getInfo();
              }
          }

          /**
           * Download one search-result page for the current part number.
           *
           * @param int $page 1-based page index
           * @return string page HTML, or '' when the download failed
           */
          private function getContent($page=1) {
              return $this->getUrlInfo($this->getUrl($this->key, $page));
          }

          /**
           * Check whether the first result page contains any data.
           *
           * The check looks for the pager element that highlights page "1".
           *
           * @return bool true when the marker is present, false otherwise
           */
          private function checkIsExistsData() {
              return stristr($this->getContent(), '<span class="s_curr g_vm">1</span>') !== false;
          }

          /**
           * Visit every result page, scrape each supplier found and store it.
           *
           * A single loop covers both the one-page and the multi-page case; nothing
           * happens when the page count is below 1.
           *
           * @return void
           */
          private function getInfo() {
              for ($i = 1; $i <= $this->pageNum; $i++) {
                  $shopUrls = $this->shopAddContact($this->shopUrlMatchReArr($this->getContent($i)));
                  foreach ($shopUrls as $shopUrl) {
                      if ($this->execAdd($this->getInfoByShopUrl($shopUrl))) echo 'Add Success!!';
                      $this->sleep(); // throttle between supplier pages
                  }
                  $this->sleep(); // throttle between result pages
              }
          }

          /**
           * Insert one supplier record unless its company is already stored.
           *
           * Records without a company name (failed download or unparsable page) are
           * skipped instead of being inserted as empty rows.
           *
           * @param array $infoArr supplier fields as built by getInfoByShopUrl()
           * @return mixed result of Model::insert(), or 0 when nothing was inserted
           */
          private function execAdd($infoArr) {
              if (!isset($infoArr['company']) || $infoArr['company'] === '') {
                  return 0;
              }
              $fields = array('company','mobile','phone','fax','region','address','website','zip','email','qq','msn','market','shopUrl');
              $m = new Model();
              // NOTE(review): Model::isExists() takes a raw SQL condition and its
              // implementation is not visible here. The scraped name is escaped with
              // addslashes() so a quote cannot break the statement; prefer the
              // driver's own escaping or a prepared statement if Model offers one.
              $where = "company='" . addslashes($infoArr['company']) . "'";
              if ($m->isExists('huaqiang', $where)) {
                  return 0;
              }
              $values = array();
              foreach ($fields as $field) {
                  $values[] = isset($infoArr[$field]) ? $infoArr[$field] : '';
              }
              return $m->insert('huaqiang', $fields, $values);
          }

          /**
           * Download a supplier contact page and extract its fields.
           *
           * @param string $url URL of the supplier's contact page
           * @return array supplier fields, see parseContactHtml()
           */
          private function getInfoByShopUrl($url) {
              return $this->parseContactHtml($this->getUrlInfo($url), $url);
          }

          /**
           * Extract the supplier fields from the HTML of a contact page.
           *
           * Every field is looked up on its own, so a missing row or a different
           * row order only leaves that one field empty.
           *
           * @param string $html contact page HTML ('' yields all-empty fields)
           * @param string $url  URL the HTML came from, stored as 'shopUrl'
           * @return array keys: company, mobile, phone, fax, region, address, zip,
           *               email, qq, msn, market, website, shopUrl
           */
          private function parseContactHtml($html, $url) {
              $website = $this->matchField($html, '网址');
              return array(
                  'company' => trim($this->matchField($html, '公司名称')),
                  'mobile'  => $this->matchField($html, '手机'),
                  'phone'   => $this->stripPhoneTags(trim($this->matchField($html, '电话'))),
                  'fax'     => $this->matchField($html, '传真'),
                  'region'  => trim($this->matchField($html, '所在地区')),
                  'address' => trim($this->matchField($html, '详细地址')),
                  'zip'     => $this->matchField($html, '邮政编码'),
                  // the e-mail value cell carries an extra "cor" class
                  'email'   => $this->matchField($html, '电子邮箱', 'g_fl cont cor'),
                  'qq'      => $this->matchField($html, 'QQ'),
                  'msn'     => $this->matchField($html, 'MSN'),
                  'market'  => $this->matchField($html, '所属电子市场'),
                  'website' => $website === '' ? '' : $this->stripATags($website),
                  'shopUrl' => $url
              );
          }

          /**
           * Return the content of the value cell that follows a "label:" title cell.
           *
           * @param string $html       page HTML
           * @param string $label      label text without the trailing colon
           * @param string $valueClass class attribute of the value <li>
           * @return string captured cell content, or '' when the row is absent
           */
          private function matchField($html, $label, $valueClass = 'g_fl cont') {
              $pattern = '/<li class="g_fl tit">' . preg_quote($label, '/')
                  . ':<\/li><li class="' . preg_quote($valueClass, '/') . '">(.*)<\/li>/Usi';
              if (preg_match($pattern, $html, $m) === 1) {
                  return $m[1];
              }
              return '';
          }

          /**
           * Turn supplier shop URLs into contact-page URLs.
           *
           * A trailing slash is removed first so the result never contains "//".
           *
           * @param array $arr shop URLs
           * @return array the same keys, each value suffixed with "/contact.html"
           */
          private function shopAddContact($arr) {
              foreach ($arr as $k=>$v) {
                  $arr[$k] = rtrim($v, '/') . '/contact.html';
              }
              return $arr;
          }

          /**
           * Replace every <a ...>text</a> element by its inner text.
           *
           * The pattern is non-greedy and case-insensitive, so several links in one
           * string are each unwrapped and other tags starting with "a" are ignored.
           *
           * @param string $site HTML fragment holding the website link(s)
           * @return string fragment with anchor tags removed
           */
          private function stripATags($site) {
              return preg_replace('/<a\b[^>]*>(.*?)<\/a>/is', '$1', $site);
          }

          /**
           * Remove the markup wrapped around phone numbers.
           *
           * "<span>" and "<br />" are dropped; "</span>" becomes a space so that
           * consecutive numbers stay separated.
           *
           * @param string $phone HTML fragment holding the phone number(s)
           * @return string cleaned phone text
           */
          private function stripPhoneTags($phone) {
              return str_replace(array('<span>', '</span>', '<br />'), array('', ' ', ''), $phone);
          }

          /**
           * Extract the distinct supplier shop URLs from a search-result page.
           *
           * @param string $re search-result page HTML
           * @return array unique shop URLs (keys of the first occurrences are kept)
           */
          private function shopUrlMatchReArr($re) {
              preg_match_all('/<li class="col3"><a class=\"company\" target=\"\_blank\" href=\"(.+)\" value=\".+\">.+<\/a>/Usi', $re, $arr);
              return array_unique($arr[1]);
          }

          /**
           * Count the result pages by walking them until the last one is reached.
           *
           * A page is treated as the last one when it contains the "下一页" (next
           * page) element rendered as a plain <span> - presumably the disabled form
           * of the control; confirm against the live markup. The walk also stops
           * when a page cannot be downloaded or MAX_PAGES is reached.
           *
           * @return int number of result pages, at least 1
           */
          private function getPageNum() {
              for ($i = 1; $i < self::MAX_PAGES; $i++) {
                  $re = $this->getContent($i);
                  if ($re === '') {
                      // download failed: report the pages that were readable
                      return max(1, $i - 1);
                  }
                  if (stristr($re, '<span class="g_vm s_f0f s_f0f1"  title="下一页">') !== false) {
                      return $i;
                  }
                  $this->sleep();
              }
              return self::MAX_PAGES;
          }

          /**
           * Build the search-result URL for a part number and page.
           *
           * The part number is percent-encoded because it may contain characters
           * such as "/" that would otherwise change the URL path.
           *
           * @param string $str  part number
           * @param int    $page 1-based page index
           * @return string absolute URL of the result page
           */
          private function getUrl($str, $page=1) {
              $encoded = rawurlencode($str);
              return "http://www.hqew.com/ic/{$encoded}_____0_00_0_{$page}.html";
          }

          /**
           * Download a URL.
           *
           * @param string $url URL to fetch
           * @return string response body, or '' when the download failed
           */
          private function getUrlInfo($url) {
              $re = file_get_contents($url);
              return $re === false ? '' : $re;
          }

          /**
           * Pause between requests to avoid hammering the remote site.
           *
           * @param int $seconds pause length in seconds (default 5)
           * @return void
           */
          private function sleep($seconds=5) {
              sleep($seconds);
          }
      }
    207 /**
    208  * 使用方法:1、先实例化一个类;2、调用 go($param) 方法,$param 为型号
    209  * 程序运行思路:根据“华强电子网”的IC搜索功能,输入型号进行搜索,然后抓取供应商信息
    210  */
    211 /**
    212  * 数据库结构
    213  * 
    214 CREATE TABLE `huaqiang` (
    215     `id` mediumint(8) unsigned NOT NULL auto_increment,
    216     `company` varchar(500) NOT NULL,
    217     `mobile` varchar(500) NOT NULL,
    218     `phone` varchar(500) NOT NULL,
    219     `fax` varchar(500) NOT NULL,
    220     `region` varchar(500) NOT NULL,
    221     `address` varchar(500) NOT NULL,
    222     `website` varchar(200) NOT NULL,
    223     `zip` varchar(100) NOT NULL,
    224     `email` varchar(500) NOT NULL,
    225     `qq` varchar(200) NOT NULL,
    226     `msn` varchar(200) NOT NULL,
    227     `market` varchar(500) NOT NULL,
    228     `shopUrl` varchar(200) NOT NULL,
    229     PRIMARY KEY  (`id`)
    230 ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    231  */
    // Part numbers to crawl; the list contains repeats, which array_unique() drops.
    $models = array_unique(array(
        'MAX3232', 'AML8613', 'MT6225A', 'OM8373PS/N3/A', 'PT7313', 'MAX8212ESA',
        'TL431', 'S3C2440', 'TMS320F2812PGFA', 'PCM1704', 'AN6717', 'CA3162E',
        'CA3161E', 'LM393N', 'DS18B20', 'SHT10', 'AML8613', 'AN6717', 'LM393N',
        'CA3161E', 'CA3162E', 'PCM1704', 'STK392-040', 'K1667', 'MAX232',
        'STM32F103', 'LM358'
    ));

    // One crawler instance is reused for every part number, in list order.
    $crawler = new huaqiang();
    foreach ($models as $model) {
        $crawler->go($model);
    }
    237 ?>

  • 相关阅读:
    复合文字(C99)
    复浮点数(C99)
    字符串的数组形式与指针形式
    《设计模式之禅》学习笔记(十)
    旧关键字的新位置(C99)
    C的存储类、链接和内存管理
    scanf( )函数的格式化输入
    《设计模式之禅》学习笔记(十三)
    yum软件包管理器
    《设计模式之禅》学习笔记(十五)
  • 原文地址:https://www.cnblogs.com/lookyou/p/2646776.html
Copyright © 2011-2022 走看看