<?php
/**
 * Main program that scrapes supplier data from "华强电子网" (hqew.com).
 * author Lee.
 * Last modify $Date: 2012-2-2 12:55:35 $
 */
require_once './config.inc.php';

class huaqiang {
    /**
     * Safety cap on the number of result pages probed by getPageNum(), so a
     * changed page layout (missing last-page marker) cannot loop forever.
     */
    const MAX_PAGES = 500;

    private $key;     // part number (model) being searched
    private $pageNum; // number of result pages for the current part number

    /**
     * Entry point: scrape every supplier listed for one part number.
     *
     * @param string $key part number to search for
     * @return void
     */
    public function go($key) {
        $this->key = $key;
        if ($this->checkIsExistsData()) {
            $this->pageNum = $this->getPageNum();
            $this->getInfo();
        }
    }

    /**
     * Download one search-result page for the current part number.
     *
     * @param int $page 1-based page number
     * @return string page HTML, or '' when the download failed
     */
    private function getContent($page=1) {
        return $this->fetch($this->getUrl($this->key, $page));
    }

    /**
     * Check whether the first result page contains any data, detected by the
     * presence of the "current page = 1" pager element.
     *
     * @return bool true when data is present, false otherwise
     */
    private function checkIsExistsData() {
        return stristr($this->getContent(), '<span class="s_curr g_vm">1</span>') !== false;
    }

    /**
     * Walk every result page, scrape each supplier's contact page and store
     * the record. Prints 'Add Success!!' for every inserted row and sleeps
     * between requests.
     *
     * @return void
     */
    private function getInfo() {
        // A single loop covers both the one-page and the multi-page case.
        for ($page = 1; $page <= $this->pageNum; $page++) {
            $shopUrls = $this->shopAddContact($this->shopUrlMatchReArr($this->getContent($page)));
            foreach ($shopUrls as $shopUrl) {
                $infoArr = $this->getInfoByShopUrl($shopUrl);
                // An empty company means the contact page could not be fetched
                // or parsed; do not store a blank record.
                if ($infoArr['company'] !== '' && $this->execAdd($infoArr)) echo 'Add Success!!';
                $this->sleep();
            }
            $this->sleep();
        }
    }

    /**
     * Insert one supplier record unless a row with the same company exists.
     *
     * @param array $infoArr record as returned by getInfoByShopUrl()
     * @return mixed result of Model::insert(), or 0 when the company already
     *               exists and nothing was inserted
     */
    private function execAdd($infoArr) {
        $columns = array('company','mobile','phone','fax','region','address','website','zip','email','qq','msn','market','shopUrl');
        $values = array();
        foreach ($columns as $column) {
            $values[] = $infoArr[$column];
        }

        $m = new Model();
        // The company name is scraped (untrusted) text and ends up inside a
        // hand-built WHERE fragment, so it must be escaped.
        // NOTE(review): Model is defined outside this file; if it offers its
        // own escaping/parameter binding, prefer that over addslashes().
        $company = addslashes($infoArr['company']);
        if ($m->isExists('huaqiang', "company='{$company}'")) {
            return 0;
        }
        return $m->insert('huaqiang', $columns, $values);
    }

    /**
     * Scrape the supplier details from a shop contact page.
     *
     * Company, phone, region and address are taken from one combined pattern
     * that requires those fields and QQ to appear in that order; when it does
     * not match they are all returned as ''. The remaining fields are
     * optional and matched individually.
     *
     * @param string $url contact page URL
     * @return array associative array with the keys company, mobile, phone,
     *               fax, region, address, zip, email, qq, msn, market,
     *               website and shopUrl
     */
    private function getInfoByShopUrl($url) {
        $html = $this->getUrlInfo($url);

        // Build the combined pattern: each mandatory field is
        // "<li tit>LABEL:</li><li cont>(.+)</li>", joined by ".+".
        $parts = array();
        foreach (array('公司名称', '电话', '所在地区', '详细地址', 'QQ') as $label) {
            $parts[] = '<li class="g_fl tit">' . $label . ':<\/li><li class="g_fl cont">(.+)<\/li>';
        }
        $shop = array('', '', '', '', '', '');
        if (preg_match('/' . implode('.+', $parts) . '/Usi', $html, $m)) {
            $shop = $m;
        }

        $website = $this->matchField($html, '网址');

        return array(
            'company' => trim($shop[1]),
            'mobile'  => $this->matchField($html, '手机'),
            'phone'   => $this->stripPhoneTags(trim($shop[2])),
            'fax'     => $this->matchField($html, '传真'),
            'region'  => trim($shop[3]),
            'address' => trim($shop[4]),
            'zip'     => $this->matchField($html, '邮政编码'),
            'email'   => $this->matchField($html, '电子邮箱', 'g_fl cont cor'),
            'qq'      => $this->matchField($html, 'QQ'),
            'msn'     => $this->matchField($html, 'MSN'),
            'market'  => $this->matchField($html, '所属电子市场'),
            'website' => $website === '' ? '' : $this->stripATags($website),
            'shopUrl' => $url
        );
    }

    /**
     * Extract the content of one optional "label: value" list item.
     *
     * @param string $html         contact page HTML
     * @param string $label        field label as shown on the page (without the colon)
     * @param string $contentClass class attribute of the value <li>
     * @return string the raw matched value, or '' when absent or empty()
     */
    private function matchField($html, $label, $contentClass = 'g_fl cont') {
        $pattern = '/<li class="g_fl tit">' . $label . ':<\/li><li class="' . $contentClass . '">(.*)<\/li>/Usi';
        if (preg_match($pattern, $html, $m) && !empty($m[1])) {
            return $m[1];
        }
        return '';
    }

    /**
     * Append '/contact.html' to every supplier shop URL.
     *
     * @param array $arr shop URLs
     * @return array contact page URLs, keys preserved
     */
    private function shopAddContact($arr) {
        foreach ($arr as $k=>$v) {
            $arr[$k] = $v . '/contact.html';
        }
        return $arr;
    }

    /**
     * Strip the surrounding <a> tag from a website value, keeping its text.
     *
     * @param string $site
     * @return string
     */
    private function stripATags($site) {
        return preg_replace('/<a.+>(.+)<\/a>/', '\1', $site);
    }

    /**
     * Remove the extra markup from a phone value: '<span>' and '<br />' are
     * dropped, '</span>' becomes a single space.
     *
     * @param string $phone
     * @return string
     */
    private function stripPhoneTags($phone) {
        return str_replace(array('<span>', '</span>', '<br />'), array('', ' ', ''), $phone);
    }

    /**
     * Extract the unique supplier shop URLs from a search-result page.
     *
     * @param string $re search-result page HTML
     * @return array unique shop URLs (keys are the original match indexes)
     */
    private function shopUrlMatchReArr($re) {
        preg_match_all('/<li class="col3"><a class=\"company\" target=\"\_blank\" href=\"(.+)\" value=\".+\">.+<\/a>/Usi', $re, $arr);
        return array_unique($arr[1]);
    }

    /**
     * Determine the number of result pages by requesting page after page
     * until one contains the pager marker that stops the scan (presumably
     * the disabled "next page" button shown on the last page).
     *
     * @return int number of the last page found, at least 1
     */
    private function getPageNum() {
        for ($i = 1; $i < self::MAX_PAGES; $i++) {
            $re = $this->getContent($i);
            if ($re === '') {
                // Download failed: retrying forever would hang the run, so
                // settle for the pages seen so far.
                return max(1, $i - 1);
            }
            if (stristr($re, '<span class="g_vm s_f0f s_f0f1" title="下一页">')) {
                return $i;
            }
            $this->sleep();
        }
        return self::MAX_PAGES;
    }

    /**
     * Build the search-result URL for a part number and page.
     *
     * NOTE(review): $str is inserted as-is; part numbers containing '/'
     * (e.g. 'OM8373PS/N3/A') alter the URL path. Confirm how the site
     * expects such values to be encoded.
     *
     * @param string $str  part number
     * @param int    $page page number
     * @return string
     */
    private function getUrl($str, $page=1) {
        return "http://www.hqew.com/ic/{$str}_____0_00_0_{$page}.html";
    }

    /**
     * Download the page at an arbitrary URL.
     *
     * @param string $url
     * @return string page content, or '' when the download failed
     */
    private function getUrlInfo($url) {
        return $this->fetch($url);
    }

    /**
     * Shared download helper: file_get_contents() returns false on failure,
     * which is normalised to '' so callers can always treat the result as a
     * string.
     *
     * @param string $url
     * @return string
     */
    private function fetch($url) {
        $content = file_get_contents($url);
        return $content === false ? '' : $content;
    }

    /**
     * Pause between requests; defaults to 5 seconds.
     *
     * @param int $seconds
     * @return void
     */
    private function sleep($seconds=5) {
        sleep($seconds);
    }
}

/**
 * Usage: 1. instantiate the class; 2. call go($param), where $param is the part number.
 * How it works: the part number is entered into the IC search of "华强电子网",
 * then the supplier information of every result is scraped.
 */
/**
 * Database structure
 *
 CREATE TABLE `huaqiang` (
 `id` mediumint(8) unsigned NOT NULL auto_increment,
 `company` varchar(500) NOT NULL,
 `mobile` varchar(500) NOT NULL,
 `phone` varchar(500) NOT NULL,
 `fax` varchar(500) NOT NULL,
 `region` varchar(500) NOT NULL,
 `address` varchar(500) NOT NULL,
 `website` varchar(200) NOT NULL,
 `zip` varchar(100) NOT NULL,
 `email` varchar(500) NOT NULL,
 `qq` varchar(200) NOT NULL,
 `msn` varchar(200) NOT NULL,
 `market` varchar(500) NOT NULL,
 `shopUrl` varchar(200) NOT NULL,
 PRIMARY KEY (`id`)
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8
 */
$c = new huaqiang();
$arr = array_unique(array('MAX3232', 'AML8613', 'MT6225A', 'OM8373PS/N3/A', 'PT7313', 'MAX8212ESA', 'TL431', 'S3C2440', 'TMS320F2812PGFA', 'PCM1704', 'AN6717', 'CA3162E', 'CA3161E', 'LM393N', 'DS18B20', 'SHT10', 'AML8613', 'AN6717', 'LM393N', 'CA3161E', 'CA3162E', 'PCM1704', 'STK392-040', 'K1667', 'MAX232', 'STM32F103', 'LM358'));
foreach ($arr as $v) {
    $c->go($v);
}