zoukankan      html  css  js  c++  java
  • php 抓取中国统计局 最新县及县以上行政区划代码

    起因:

      前两天突然想找个省市县的行政代码库,发现网上要么不是最新的,要么要账号、要积分、要钱。让人好烦,就写了这个脚本。

    数据库结构:

     1 CREATE TABLE IF NOT EXISTS `area` ( -- third-level rows (districts/counties) written by the script's insert()
     2   `id` int(11) NOT NULL auto_increment,
     3   `code` varchar(6) NOT NULL, -- six-digit administrative division code
     4   `name` varchar(20) NOT NULL,
     5   `citycode` varchar(6) NOT NULL, -- `code` of the parent row in `city`
     6   PRIMARY KEY  (`id`)
     7 ) ENGINE=Innodb  DEFAULT CHARSET=utf8;
     8  
     9 CREATE TABLE IF NOT EXISTS `city` ( -- second-level rows (cities/prefectures) written by the script's insert()
    10   `id` int(11) NOT NULL auto_increment,
    11   `code` varchar(6) NOT NULL, -- six-digit code; the script builds it as a four-digit prefix followed by "00"
    12   `name` varchar(20) NOT NULL,
    13   `provincecode` varchar(6) NOT NULL, -- `code` of the parent row in `province`
    14   PRIMARY KEY  (`id`)
    15 ) ENGINE=Innodb  DEFAULT CHARSET=utf8;
    16 
    17  
    18 CREATE TABLE IF NOT EXISTS `province` ( -- top-level rows (provinces) written by the script's insert()
    19   `id` int(11) NOT NULL auto_increment,
    20   `code` varchar(6) NOT NULL, -- six-digit code; the script builds it as a two-digit prefix followed by "0000"
    21   `name` varchar(20) NOT NULL,
    22   PRIMARY KEY  (`id`)
    23 ) ENGINE=Innodb  DEFAULT CHARSET=utf8 ;

    脚本文件:

      1 <?php
      2 set_time_limit(0); // 0 removes PHP's execution time limit for this run
      /**
       * Downloads the administrative division code page published on
       * stats.gov.cn and stores the parsed codes in the `province`, `city`
       * and `area` MySQL tables.
       *
       * Usage: obtain the singleton with getInstance(), call start() to fetch
       * and parse the page, then insert() to write the rows.
       */
      class get_city_code {
          // Raw HTML of the downloaded page; filled by connect_tongji_html().
          private $html = '';
          // code_result instance holding the parsed data; set by start().
          public  $code_rt;
          // Lazily created singleton instance.
          private static $instance = '';
          // mysqli connection; opened by get_db().
          private $db = '';
          // Page that lists the division codes.
          private $url = 'http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201504/t20150415_712722.html';

          // Private: instances are only created through getInstance().
          private function __construct() {

          }

          /**
           * Returns the shared instance, creating it on first use.
           *
           * @return get_city_code
           */
          public static function getInstance() {
              if ( ! self::$instance instanceof get_city_code ) {
                  self::$instance = new self();
              }
              return self::$instance;
          }

          /**
           * Downloads the page and parses it into $this->code_rt->code_data.
           *
           * @throws RuntimeException when the download fails
           */
          public  function start() {
              $this->connect_tongji_html();
              $this->code_rt = new code_result();
              $this->code_rt->html = $this->html;
              $this->code_rt->filter_all_data();
          }

          /**
           * Fetches the page at getUrl() into $this->html.
           *
           * @throws RuntimeException on transport errors or an HTTP status >= 400
           */
          private function connect_tongji_html() {
              $ch = curl_init();
              if ( $ch === false ) {
                  throw new RuntimeException('Unable to initialise cURL');
              }

              curl_setopt($ch, CURLOPT_URL, $this->getUrl());
              curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
              curl_setopt($ch, CURLOPT_HEADER, 0);
              // The script runs without a PHP time limit, so bound the
              // network wait explicitly instead of hanging forever.
              curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 15);
              curl_setopt($ch, CURLOPT_TIMEOUT, 120);

              $html   = curl_exec($ch);
              $error  = curl_error($ch);
              $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
              curl_close($ch);

              // curl_exec() returns false on failure; never hand that to the parser.
              if ( $html === false ) {
                  throw new RuntimeException('Download failed: ' . $error);
              }
              if ( $status >= 400 ) {
                  throw new RuntimeException('Download failed with HTTP status ' . $status);
              }

              $this->html = $html;
          }

          // Returns the URL of the page to download.
          private function getUrl() {
              return $this->url;
          }

          /**
           * Writes the parsed codes to the province, city and area tables.
           *
           * All rows are inserted inside one transaction, so a failure leaves
           * the tables untouched instead of half filled.
           *
           * @throws RuntimeException when start() has not been called or a
           *                          database operation fails
           */
          public function insert() {
              if ( ! $this->code_rt instanceof code_result ) {
                  throw new RuntimeException('insert() requires start() to be called first');
              }

              $tree = $this->group_codes($this->code_rt->code_data);

              $this->get_db();
              try {
                  $this->db->autocommit(false);
                  $this->insert_tree($tree);
                  if ( ! $this->db->commit() ) {
                      throw new RuntimeException('Commit failed: ' . $this->db->error);
                  }
              } catch ( Exception $e ) {
                  $this->db->rollback();
                  $this->db->close();
                  throw $e;
              }

              $this->db->close();
          }

          /**
           * Groups the flat "code => name" map into province -> city -> area.
           *
           * Grouping follows encounter order: the first code seen for a
           * two-digit prefix names the province, the first code seen for a
           * four-digit prefix names the city, and every later code under that
           * prefix is an area keyed by its last two digits.
           *
           * @param array $code_data six-digit code => name
           * @return array prefix => array('name' => ..., 'cities' => array(
           *               part => array('name' => ..., 'areas' => array(part => name))))
           */
          private function group_codes($code_data) {
              $tree = array();
              if ( ! is_array($code_data) ) {
                  return $tree;
              }

              foreach ( $code_data as $code => $name ) {
                  // Numeric array keys arrive as integers; normalise to text.
                  $code = (string) $code;
                  if ( ! preg_match('/^\d{6}$/', $code) ) {
                      continue;
                  }
                  list($p, $c, $a) = str_split($code, 2);

                  if ( ! isset($tree[$p]) ) {
                      $tree[$p] = array('name' => $name, 'cities' => array());
                  } elseif ( ! isset($tree[$p]['cities'][$c]) ) {
                      $tree[$p]['cities'][$c] = array('name' => $name, 'areas' => array());
                  } else {
                      $tree[$p]['cities'][$c]['areas'][$a] = $name;
                  }
              }

              return $tree;
          }

          /**
           * Inserts every province, city and area of the grouped tree using
           * prepared statements (the names come from a scraped page and must
           * not be concatenated into SQL).
           *
           * @param array $tree structure produced by group_codes()
           * @throws RuntimeException when a statement cannot be prepared or executed
           */
          private function insert_tree($tree) {
              $province_stmt = $this->prepare_statement('insert into province values (NULL, ?, ?)');
              $city_stmt     = $this->prepare_statement('insert into city values (NULL, ?, ?, ?)');
              $area_stmt     = $this->prepare_statement('insert into area values (NULL, ?, ?, ?)');

              // bind_param() binds by reference: assigning to these variables
              // below changes what the next execute() sends.
              $code1 = $name1 = $code2 = $name2 = $code3 = $name3 = '';
              $province_stmt->bind_param('ss', $code1, $name1);
              $city_stmt->bind_param('sss', $code2, $name2, $code1);
              $area_stmt->bind_param('sss', $code3, $name3, $code2);

              foreach ( $tree as $p => $province ) {
                  $code1 = $p . '0000';
                  $name1 = $province['name'];
                  $this->run_statement($province_stmt);

                  foreach ( $province['cities'] as $c => $city ) {
                      // A "00" city part repeats the province code itself.
                      if ( (int) $c === 0 ) {
                          continue;
                      }
                      $code2 = $p . $c . '00';
                      // Municipalities list their districts under a generic
                      // "市辖区" entry; store it under the province name.
                      $name2 = $city['name'] === '市辖区' ? $name1 : $city['name'];
                      // NOTE(review): skipping the generic "县" entry also drops
                      // every county listed under it. Kept as in the original
                      // script; confirm whether those rows are wanted.
                      if ( $name2 === '县' ) {
                          continue;
                      }
                      $this->run_statement($city_stmt);

                      foreach ( $city['areas'] as $a => $area_name ) {
                          // A "00" area part repeats the city code itself.
                          if ( (int) $a === 0 ) {
                              continue;
                          }
                          $code3 = $p . $c . $a;
                          $name3 = $area_name;
                          $this->run_statement($area_stmt);
                      }
                  }
              }

              $province_stmt->close();
              $city_stmt->close();
              $area_stmt->close();
          }

          // Prepares $sql on the open connection or throws RuntimeException.
          private function prepare_statement($sql) {
              $stmt = $this->db->prepare($sql);
              if ( $stmt === false ) {
                  throw new RuntimeException('Prepare failed: ' . $this->db->error);
              }
              return $stmt;
          }

          // Executes a prepared statement or throws RuntimeException.
          private function run_statement($stmt) {
              if ( ! $stmt->execute() ) {
                  throw new RuntimeException('Insert failed: ' . $stmt->error);
              }
          }

          /**
           * Opens the MySQL connection and stores it in $this->db.
           *
           * NOTE(review): host, user, password and schema are hard-coded here;
           * consider moving them to configuration.
           *
           * @throws RuntimeException when the connection or charset setup fails
           */
          private function get_db () {
              $db = new mysqli('localhost','root','sunl','blog');
              if ( $db->connect_errno ) {
                  throw new RuntimeException('Database connection failed: ' . $db->connect_error);
              }
              if ( ! $db->set_charset('utf8') ) {
                  throw new RuntimeException('Unable to set connection charset: ' . $db->error);
              }
              $this->db = $db;
          }
      }
    114 
    /**
     * Parses the downloaded page into a "six-digit code => name" map.
     *
     * Set $html to the raw page, call filter_all_data(), then read
     * $code_data or getCodeData().
     */
    class code_result {
        // Raw page HTML on input; replaced by the cleaned text after parsing.
        public $html = '';
        // Parsed result: six-digit code => name, in page order.
        public $code_data = array();
        // Codes and names of the last parse, in matching order.
        private $code_arr  = array();
        private $name_arr  = array();

        public function __construct () {

        }

        /**
         * Extracts the code/name pairs from the <p> elements of $this->html.
         *
         * Tags, "&nbsp;" entities and all whitespace are removed, then every
         * six-digit number directly followed by Chinese characters is taken as
         * one entry. When nothing matches, $code_data is left as an empty array.
         *
         * @throws RuntimeException when the page text is not valid UTF-8
         */
        public function filter_all_data() {
            $this->code_data = array();
            $this->code_arr  = array();
            $this->name_arr  = array();

            if ( ! is_string($this->html) || $this->html === '' ) {
                return;
            }

            // Collect every <p>...</p> element. \b keeps <pre>, <param>, ...
            // out, and the "s" flag lets a paragraph span several lines.
            if ( ! preg_match_all('/<p\b[^>]*>.*?<\/p>/is', $this->html, $paragraphs) ) {
                $this->html = '';
                return;
            }

            $text = strip_tags(implode('', $paragraphs[0]));
            $text = str_replace('&nbsp;', '', $text);
            // Remove ASCII whitespace plus the no-break (U+00A0) and
            // ideographic (U+3000) spaces, which \s does not cover here.
            $text = preg_replace('/[\s\x{00A0}\x{3000}]+/u', '', $text);
            if ( $text === null ) {
                throw new RuntimeException('Page content is not valid UTF-8');
            }
            $this->html = $text;

            // Match code and name together so the two lists cannot drift
            // apart; the lookarounds reject longer digit runs.
            $pair = '/(?<!\d)(\d{6})(?!\d)([\x{4e00}-\x{9fa5}]+)/u';
            if ( ! preg_match_all($pair, $text, $matches) ) {
                return;
            }

            $this->code_arr  = $matches[1];
            $this->name_arr  = $matches[2];
            $this->code_data = array_combine($matches[1], $matches[2]);
        }

        // Returns the parsed "code => name" map.
        public function getCodeData() {
            return $this->code_data;
        }
    }
    154 
    155 $code = get_city_code::getInstance(); // obtain the shared scraper instance
    156 $code->start(); // download the page and parse it into code => name pairs
    157 $code->insert(); // write the parsed rows to the province, city and area tables
    158 ?>
  • 相关阅读:
    Java Web(八) MVC和三层架构
    Java Web(九) 用户管理系统
    Java Web(十一) 分页功能的实现
    hibernate(四) 双向多对多映射关系
    合并两个排序的链表
    反转链表
    链表中倒数第k个结点
    堆排序
    计算机网络常见面试题
    字节对齐原则
  • 原文地址:https://www.cnblogs.com/faronl/p/4891946.html
Copyright © 2011-2022 走看看