zoukankan      html  css  js  c++  java
  • 用Python实现的一个简单的爬取省市乡镇的行政区划信息的脚本

    img

    # coding=utf-8
    # Creeper
    import os
    import bs4
    import time
    import MySQLdb
    import urllib2
    import datetime
    import warnings
    import traceback
    import ConfigParser
    
    # Locate the directory holding this script.  `__file__` is undefined in
    # some launch contexts, in which case the launched path is used instead.
    try:
        _script_path = __file__
    except NameError:
        import sys
        _script_path = sys.argv[0]
    basedir = os.path.dirname(os.path.abspath(_script_path))
    del _script_path

    # The INI file is expected to sit next to the script.
    SETTINGS_FILE = os.path.join(basedir, 'settings.ini')

    # Built-in defaults, keyed by section name and then by option name.
    # Note that `port` is kept as a string here.
    GLOBAL_CONFIG = dict(
        server=dict(debug=False),
        db=dict(
            host='127.0.0.1',
            port='3306',
            user='root',
            password='',
            dbname='test',
            table='group'
        )
    )
    
    def __config(item, section=None):
        """Copy one option read from settings.ini into GLOBAL_CONFIG.

        item    -- ``(option, value)`` pair as produced by ``parser.items()``.
        section -- name of the section the option belongs to.  When omitted
                   the module-level loop variable ``sec`` is used, so the
                   original one-argument call form keeps working.
        """
        if section is None:
            section = sec
        # setdefault: a section without built-in defaults must not raise
        # KeyError, it simply gets its own dictionary.
        GLOBAL_CONFIG.setdefault(section, {})[item[0]] = item[1]

    try:
        parser = ConfigParser.ConfigParser()
        # Close the file deterministically instead of leaking the handle.
        with open(SETTINGS_FILE) as settings_fp:
            parser.readfp(settings_fp)
        for sec in parser.sections():
            # A plain loop: map() is meant for building values, not for side
            # effects, and the section is now passed explicitly.
            for item in parser.items(sec):
                __config(item, sec)
    except Exception:
        print('settings.ini needed')
        raise
    
    # Running count of rows inserted successfully.  Handle.do_execute bumps it
    # after each committed INSERT and prints a progress line every 100 rows.
    N = 0
    
    class Handle(object):
        """Small MySQLdb wrapper: opens the connection and inserts scraped rows.

        Keyword arguments given to the constructor are remembered in
        ``self.db`` and read as connection settings: ``host``, ``port``,
        ``user``, ``password``, ``dbname``, ``charset`` and ``table``.
        """

        def __init__(self, *args, **kwargs):
            super(Handle, self).__init__()
            self.db = {}        # settings exactly as passed by the caller
            self.conn = None    # MySQLdb connection, set by _cursor()
            self.cursor = None  # single cursor shared by every statement
            self._cursor(**kwargs)

        def _cursor(self, **kwargs):
            """Connect to MySQL, select (or create) the database, open a cursor.

            Missing settings fall back to 127.0.0.1:3306, user ``root`` with an
            empty password, database ``test`` and charset ``utf8``.
            """
            self.db.update(**kwargs)
            host = kwargs.get('host', '127.0.0.1')
            port = int(kwargs.get('port', 3306))  # config values arrive as strings
            user = kwargs.get('user', 'root')
            pwd = kwargs.get('password', '')
            dbname = kwargs.get('dbname', 'test')
            charset = kwargs.get('charset', 'utf8')
            _conn = MySQLdb.connect(user=user, passwd=pwd,
                                    host=host, port=port, charset=charset)
            try:
                _conn.select_db(dbname)
            except Exception:
                # The database could not be selected: create it, then retry.
                # A database name is an identifier and cannot be sent as a
                # query parameter, hence the string formatting.
                sql = """CREATE DATABASE IF NOT EXISTS `%s` DEFAULT CHARSET utf8 COLLATE utf8_general_ci;
                      """ % dbname
                _conn.cursor().execute(sql)
                _conn.select_db(dbname)
            self.conn = _conn
            self.cursor = _conn.cursor()

        @staticmethod
        def _progress(symbol):
            """Write a one-character progress marker followed by a space."""
            import sys
            sys.stdout.write(symbol + ' ')

        def debug(self, *args):
            """Fallback diagnostic hook used by do_execute on failures.

            Subclasses may override this to persist the record; this default
            only prints the arguments so a plain Handle does not fail with
            AttributeError while it is reporting a database error.
            """
            print('debug: %r' % (args,))

        def format_pk(self, pk, index):
            """Truncate a division code to the digits significant at ``index``.

            pk    -- the code, as an integer or a numeric string.
            index -- level 1..5; the levels keep 2, 4, 6, 9 and 12 leading
                     digits respectively.

            Returns the truncated code as an int.  Raises ValueError when
            ``pk`` is not numeric and KeyError when ``index`` is not 1..5.
            """
            digits = {1: 2, 2: 4, 3: 6, 4: 9, 5: 12}
            try:
                pk = int(float(pk))
            except (TypeError, ValueError, OverflowError):
                raise ValueError('the primary key must be integer or string interger')
            return int(str(pk)[:digits[index]])

        def do_execute(self, pk, name, type, parent='NULL'):
            """Insert one row into the configured table and commit it.

            pk     -- value for the `id` column.
            name   -- value for the `name` column.
            type   -- value for the `type` column.
            parent -- id of the parent row; an empty value or the string
                      'NULL' stores SQL NULL.

            Database errors are reported, never raised.  A marker is printed
            per call: '+' inserted, '#' warning, '=' duplicate key, '?' any
            other failure.
            """
            global N
            if N % 100 == 0 and N != 0:
                now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S %f')
                print('Total: %s ---%s' % (N, now))
            if not parent or parent == '':
                parent = 'NULL'
            # Values travel as query parameters so that quotes or other special
            # characters in scraped text cannot break or alter the statement.
            # Only the table name (an identifier) is formatted in.
            parent_param = None if parent == 'NULL' else parent
            sql = ("INSERT INTO `%s` (`id`, `name`, `type`, `parent`) "
                   "VALUES (%%s, %%s, %%s, %%s)") % self.db['table']
            try:
                self.cursor.execute(sql, (pk, name, type, parent_param))
                self.conn.commit()
                self._progress('+')
                N += 1
            except MySQLdb.Warning as w:
                print("\nWarning:%s" % str(w))
                self._progress('#')
            except MySQLdb.Error as e:
                if 'Duplicate entry' not in str(e):
                    print("\nError:%s" % str(e))
                    self.debug(pk, name)
                    self.debug(pk, parent)
                else:
                    self._progress('=')
                    self.debug(pk, name)
            except Exception:
                traceback.print_exc()
                self._progress('?')

        def do_executemany(self, items, params=None):
            """Insert many ``(id, name, type, parent)`` rows and commit once.

            items  -- sequence of 4-item sequences, one per row.
            params -- unused; kept for interface compatibility.

            Errors are printed, never raised.
            """
            sql = """INSERT INTO `%s` (`id`, `name`, `type`, `parent`) VALUES """ % self.db['table']
            sql += "(%s, %s, %s, %s)"
            try:
                self.cursor.executemany(sql, items)
                self.conn.commit()
            except MySQLdb.Error as e:
                print("Error:%s" % str(e))
            except Exception:
                traceback.print_exc()
    
    
    class Creeper(Handle):
        """Crawler that walks the division pages under ``root_url``.

        Every node found is stored through Handle.do_execute.  Levels are
        numbered 1..5 and named in ``self._type``.  ``root_url`` must be set
        by the caller before crawling.
        """

        def __init__(self, *args, **kwargs):
            Handle.__init__(self, *args, **kwargs)
            self.init_db()
            self.root_url = ''
            self._type = {1: 'province', 2: 'city', 3: 'county',
                          4: 'town', 5: 'village'}

        @staticmethod
        def _mark(symbol):
            """Write a one-character progress marker followed by a space."""
            import sys
            sys.stdout.write(symbol + ' ')

        def init_db(self):
            """Create the data table and the `debug` table if they are missing.

            Errors are printed, never raised; "already exists" warnings are
            ignored.
            """
            create_main = """CREATE TABLE IF NOT EXISTS `%s` (
              `id` bigint(20) NOT NULL,
              `name` varchar(30) NOT NULL,
              `type` varchar(30) NOT NULL,
              `parent` bigint(20) DEFAULT NULL,

              PRIMARY KEY (`id`),
              KEY `Group_12345` (`name`),
              KEY `Group_67890` (`parent`),
              FOREIGN KEY(parent) REFERENCES `%s` (`id`)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
            """ % (self.db['table'], self.db['table'])

            try:
                self.cursor.execute(create_main)
                self.cursor.execute("""CREATE TABLE IF NOT EXISTS `debug` (
                      `id` int(11) NOT NULL AUTO_INCREMENT,
                      `badid` bigint(20) NOT NULL,
                      `others` varchar(256) DEFAULT NULL,
                      PRIMARY KEY (`id`)
                    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;""")
                self.conn.commit()
            except MySQLdb.Warning as w:
                if 'already exists' not in str(w):
                    print("Warning:%s" % str(w))
            except MySQLdb.Error as e:
                print("Error:%s" % str(e))
                traceback.print_exc()
            except Exception:
                traceback.print_exc()

        def debug(self, *args):
            """Record a problem in the `debug` table (best effort).

            args[0] -- offending id; callers pass 0 when only a URL is known.
            args[1] -- free-form detail such as a name, a parent id or a URL.

            Failures are printed and otherwise ignored, so reporting a problem
            can never abort the crawl.
            """
            try:
                # Parameterised: the detail text is scraped data or a URL and
                # may contain quotes.
                self.cursor.execute(
                    "INSERT INTO `debug` (`badid`, `others`) VALUES (%s, %s)",
                    (args[0], args[1]))
                self.conn.commit()
            except Exception:
                traceback.print_exc()

        def __get_url(self, tup, index):
            """Build the URL of the page that lists the children of a node.

            tup   -- ``(id, parent)`` row; only the id is used.
            index -- level of the node.

            A page sits below one directory per ancestor level, each named
            after the next two digits of the code, so a level-``index`` node
            has ``index - 1`` directories, e.g. level 3, id 110101 ->
            ``11/01/110101.html``.
            """
            code = str(tup[0])
            # NOTE(review): the previous range(0, index, 2) gave the right
            # depth only for levels 2 and 3; it yielded "11/11.html" at level 1
            # and dropped one directory at level 4.  The layout assumed here
            # matches how format_tag derives child URLs - confirm against the
            # live site.
            parts = [code[i:i + 2] for i in range(0, 2 * (index - 1), 2)]
            parts.append(code)
            return self.root_url + '/'.join(parts) + '.html'

        def format_tag(self, x_tag, url, index, limit=None):
            """Store one node and, when it links to a child page, crawl that too.

            x_tag -- either an ``<a>`` tag whose href names the child page, or
                     a two-item list ``[code_cell, name_cell]`` for a node
                     without a link.
            url   -- URL of the page the node was found on; its file name
                     (without ".html") is the parent id.
            index -- level of the node.
            limit -- deepest level to store; None means no limit.
            """
            if limit and index > limit:
                return
            page_name = url.split('/2013/')[1].split('/')[-1]
            parent = str(page_name)[:-5]  # drop the ".html" suffix
            node_type = self._type[index]
            node_parent = parent if parent else 'NULL'
            if isinstance(x_tag, bs4.element.Tag) and x_tag.has_attr('href'):
                self._mark('.')
                href = x_tag['href']
                # hrefs are relative to the directory of the current page
                base_url = '/'.join(url.split('/')[:-1])
                full_url = '/'.join([base_url, href])
                pk = self.format_pk(href.split('.html')[0].split('/')[-1], index)
                self.do_execute(pk, x_tag.text, node_type, node_parent)
                self.get_info(full_url, index + 1, limit)
            else:
                self._mark('*')
                pk = self.format_pk(x_tag[0].text, index)
                self.do_execute(pk, x_tag[1].text, node_type, node_parent)

        def get_info(self, url, index, limit=None):
            """Fetch one listing page and store every node found on it.

            url   -- page to download (decoded as GBK).
            index -- level of the nodes listed on the page.
            limit -- deepest level to store; None means no limit.

            A page that cannot be fetched or parsed is recorded through
            debug() and skipped.
            """
            # Nothing on this page would be stored, so do not download it.
            if limit and index > limit:
                return
            try:
                response = urllib2.urlopen(url)
                try:
                    html = response.read()
                finally:
                    response.close()
                soup = bs4.BeautifulSoup(html, from_encoding='gbk')
                rows = soup('tr', class_='%str' % self._type[index])
            except Exception:
                # Best effort: note the bad URL and move on.  KeyboardInterrupt
                # is deliberately not swallowed.
                self.debug(0, url)
                return

            anchors = []
            for tr in rows:
                cells = tr('td')
                # On the top-level page each <td> of a row is one province.
                if index == 1:
                    for td in cells:
                        anchors.extend(td('a'))
                    continue
                if not cells:
                    continue

                # Otherwise a row is one node; its link, if any, is in the
                # last cell.
                links = cells[-1]('a')
                if links:
                    anchors.extend(links)
                else:
                    # No child page: build the node from the cell text alone.
                    self.format_tag([cells[0], cells[-1]], url, index, limit)

            for a in anchors:
                self.format_tag(a, url, index, limit)

        def do_get_childs(self, index):
            """Crawl the children of every stored node of level ``index``.

            Stored rows are read in pages of 30 and each row's child page is
            fetched with get_info at level ``index + 1``.
            """
            page_size = 30
            table = self.db['table']
            node_type = self._type[index]

            # 1. How many nodes of this level are stored?
            self.cursor.execute(
                "SELECT COUNT(*) FROM `%s` WHERE `type` = %%s" % table,
                (node_type,))
            total = self.cursor.fetchone()[0]

            # 2. Walk them page by page.  The second LIMIT argument is a row
            #    count, not an end offset, and ORDER BY keeps the paging
            #    stable between queries.
            for offset in range(0, total, page_size):
                self.cursor.execute(
                    "SELECT `id`, `parent` FROM `%s` WHERE `type` = %%s "
                    "ORDER BY `id` LIMIT %%s, %%s" % table,
                    (node_type, offset, page_size))
                # Materialise the page before get_info reuses the cursor.
                id_and_parent = list(self.cursor.fetchall())
                for tup in id_and_parent:
                    url = self.__get_url(tup, index)
                    self.get_info(url, index + 1)
    
    
    # Turn MySQLdb warnings into exceptions so that the `except MySQLdb.Warning`
    # handlers in the classes above can catch them, then drop the module name.
    warnings.filterwarnings(action='error', category=MySQLdb.Warning)
    del warnings
    
    if __name__ == '__main__':
        # Script entry: crawl from the root listing page down to level 5,
        # storing rows with the [db] settings gathered in GLOBAL_CONFIG.
        db_config = GLOBAL_CONFIG['db']
        url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/"
        g = Creeper(**db_config)
        g.root_url = url
        g.init_db()
        lst = g.get_info(url=url, index=1, limit=5)
        # Alternative: continue from rows already stored at level 3.
        # g.do_get_childs(3)
    
  • 相关阅读:
    02-单臂路由实验
    线程高级篇-读写锁ReentrantReadWriteLock
    线程高级篇-Lock锁和Condition条件
    002 flutter的路由管理--命名路由
    flutter的路由管理--普通路由
    003 文档的操作
    002 索引的操作
    001 elasticsearch的核心概念
    002 使用nacos完成服务的注册和发现
    001 Nacos的基础内容
  • 原文地址:https://www.cnblogs.com/MrWho/p/python-creeper-to-get-province-city-county-town-info.html
Copyright © 2011-2022 走看看