zoukankan      html  css  js  c++  java
  • 国家统计局区划码爬取

    目标数据

    oracle存储表格

    -- Create table: one row per scraped administrative division.
    -- NOTE(review): the column comment below calls id an auto-increment ID, but
    -- this script defines no sequence, trigger or identity clause for it —
    -- confirm how id is populated.
    create table VILLAGE_CODE
    (
      id                INTEGER,
      area_code         VARCHAR2(500),
      city_village_code VARCHAR2(500),
      area_name         VARCHAR2(500)
    )
    -- NOTE(review): the table is placed in the SYSTEM tablespace — confirm this is intended.
    tablespace SYSTEM
      pctfree 10
      pctused 40
      initrans 1
      maxtrans 255
      storage
      (
        initial 64K
        next 1M
        minextents 1
        maxextents unlimited
      );
    -- Add comments to the columns
    -- id: "auto-increment ID"
    comment on column VILLAGE_CODE.id
      is '自增ID';
    -- area_code: "division code for statistical use"
    comment on column VILLAGE_CODE.area_code
      is '统计用区划代码';
    -- city_village_code: "urban-rural classification code" (the stored comment text has trailing spaces)
    comment on column VILLAGE_CODE.city_village_code
      is '城乡分类代码    ';
    -- area_name: "name"
    comment on column VILLAGE_CODE.area_name
      is '名称';

    爬取代码

    #!/usr/bin/env python
    # encoding: utf-8
    '''
    @author: lurenjia
    @contact: 1499418300@qq.com
    @file: areacode.py
    @time: 2018/9/29 14:40
    @desc:
    '''
    
    import urllib2, re
    from time import sleep
    from random import random
    from config import DBSession
    
    
    # HTTP headers passed to urllib2.Request by openUrl().
    # The value is the bare browser string: the header name must not be repeated
    # inside the value, otherwise the request carries
    # "User-Agent: User-Agent: Mozilla/...".
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
    }
    # Module-level database session (DBSession is imported from the local
    # config module); insertVillage() executes and commits through it.
    session = DBSession()
    
    
    def insertVillage(code, name, city_village_code='-1'):
        """Insert one division row into village_code and commit immediately.

        Args:
            code: value stored in the area_code column.
            name: value stored in the area_name column.
            city_village_code: value stored in the city_village_code column;
                defaults to '-1' when the caller has none.
        """
        print('%s %s %s' % (code, name, city_village_code))
        # Bind the scraped values as parameters instead of splicing them into
        # the SQL text: a value containing a single quote would otherwise break
        # the statement or inject SQL.
        session.execute(
            "insert into village_code(area_code, area_name, city_village_code) "
            "VALUES (:area_code, :area_name, :city_village_code)",
            {
                'area_code': code,
                'area_name': name,
                'city_village_code': city_village_code,
            },
        )
        session.commit()
    
    
    def openUrl(url, type):
        """Download *url* and return its body decoded as GBK, or None on failure.

        Every request is preceded by a random pause of up to 0.5 seconds. When
        the request or the decoding fails, one line holding the URL and the
        crawl level is appended to error.txt and None is returned.

        Args:
            url: absolute URL of the page to fetch.
            type: crawl level (the callers pass 1-5); only written to error.txt.

        Returns:
            The decoded page text, or None when the fetch failed.
        """
        sleep(random() * 0.5)
        try:
            request = urllib2.Request(url, headers=headers)
            return urllib2.urlopen(request, timeout=10).read().decode('gbk')
        except Exception:
            # Catch Exception rather than using a bare except with a return in
            # finally: that combination also swallowed KeyboardInterrupt and
            # SystemExit, so the crawler could not be stopped with Ctrl+C.
            with open('error.txt', 'a') as f:
                f.write(url + '                   ' + str(type) + '\n')
            return None
        
        
    def parseCode1(baseUrl, lastUrl):
        """Level 1: read the index page and descend into every province link.

        Args:
            baseUrl: URL prefix that the relative links are appended to.
            lastUrl: page name relative to baseUrl.
        """
        page = openUrl(baseUrl + lastUrl, 1)
        if not page:
            return
        for row in re.findall("<tr class='provincetr'>.+?</tr>", page):
            # Each match is (relative link, province name); only the link is used.
            for href, _name in re.findall("<a href='(.+?html)'>(.+?)<br/>", row):
                parseCode2(baseUrl, href)
            
    
    def parseCode2(baseUrl, lastUrl):
        """Level 2: store every linked city row of a province page, then descend.

        Only rows whose code cell and name cell are both hyperlinks match the
        pattern; each match is stored via insertVillage() and its link is
        handed to parseCode3().

        Args:
            baseUrl: URL prefix that the relative links are appended to.
            lastUrl: province page name relative to baseUrl.
        """
        page = openUrl(baseUrl + lastUrl, 2)
        if not page:
            return
        for row in re.findall("<tr class='citytr'>.+?</tr>", page):
            cells = re.findall("<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>", row)
            for href, code, name in cells:
                insertVillage(code, name)
                parseCode3(baseUrl, href)
            
    
    def parseCode3(baseUrl, lastUrl):
        """Level 3: store every linked county row of a city page, then descend.

        The part of lastUrl before its first '/' is moved onto baseUrl, so the
        links found on this page resolve relative to that sub-directory.

        Args:
            baseUrl: URL prefix of the parent page.
            lastUrl: city page link as found on the parent page.
        """
        subdir, _sep, page_name = lastUrl.partition('/')
        baseUrl = baseUrl + subdir + '/'
        page = openUrl(baseUrl + page_name, 3)
        if not page:
            return
        for row in re.findall("<tr class='countytr'>.+?</tr>", page):
            cells = re.findall("<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>", row)
            for href, code, name in cells:
                insertVillage(code, name)
                parseCode4(baseUrl, href)
            
    
    def parseCode4(baseUrl, lastUrl):
        """Level 4: store every linked town row of a county page, then descend.

        The part of lastUrl before its first '/' is moved onto baseUrl, so the
        links found on this page resolve relative to that sub-directory.

        Args:
            baseUrl: URL prefix of the parent page.
            lastUrl: county page link as found on the parent page.
        """
        subdir, _sep, page_name = lastUrl.partition('/')
        baseUrl = baseUrl + subdir + '/'
        page = openUrl(baseUrl + page_name, 4)
        if not page:
            return
        for row in re.findall("<tr class='towntr'>.+?</tr>", page):
            cells = re.findall("<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>", row)
            for href, code, name in cells:
                insertVillage(code, name)
                parseCode5(baseUrl, href)
            
    
    def parseCode5(baseUrl, lastUrl):
        """Level 5: store every village row of a town page (leaf level).

        Each village row has three plain cells; the first is stored as the
        area code, the third as the name and the second as city_village_code.

        Args:
            baseUrl: URL prefix of the parent page.
            lastUrl: town page link as found on the parent page.
        """
        subdir, _sep, page_name = lastUrl.partition('/')
        baseUrl = baseUrl + subdir + '/'
        page = openUrl(baseUrl + page_name, 5)
        if not page:
            return
        for row in re.findall("<tr class='villagetr'>.+?</tr>", page):
            cells = re.findall("<td>(.+?)</td><td>(.+?)</td><td>(.+?)</td>", row)
            for code, category, name in cells:
                insertVillage(code, name, category)
            
    
    if __name__=="__main__":
        # Entry point: start at the 2017 index page; parseCode1 recurses through
        # parseCode2..parseCode5 from there.
        baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        parseCode1(baseUrl, 'index.html')

    分布式爬取

    纯手写

    #!/usr/bin/env python
    # encoding: utf-8
    '''
    @author: lurenjia
    @contact: 1499418300@qq.com
    @file: areacode.py
    @time: 2018/9/29 14:40
    @desc:
    '''
    
    import urllib2, re, os, redis
    from time import sleep
    from random import random
    from sqlalchemy import *
    from sqlalchemy.orm import sessionmaker
    from multiprocessing import Process
    
    # Set NLS_LANG before the engine is created (presumably so the Oracle
    # client exchanges text as UTF-8 — confirm).
    os.environ['NLS_LANG'] = 'AMERICAN_AMERICA.AL32UTF8'
    # Oracle engine and a single module-level session used by insertVillage();
    # the credentials and host in the URL are placeholders ("xxx").
    engine = create_engine('oracle://xxx:xxx@xxx:1521/xe', pool_size=100, encoding='utf8')
    DBSession = sessionmaker(bind=engine)
    session = DBSession()
    # Redis client holding the crawl queues ('area_code2'..'area_code5') and the
    # failed-URL list ('area_code_error'); the host is a placeholder.
    pool = redis.ConnectionPool(host='xxx', port=6379)
    MRedis = redis.Redis(connection_pool=pool)
    
    # HTTP headers passed to urllib2.Request by openUrl().
    # The value is the bare browser string: the header name must not be repeated
    # inside the value, otherwise the request carries
    # "User-Agent: User-Agent: Mozilla/...".
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
    }
    
    
    def insertVillage(code, name, city_village_code='-1'):
        """Insert one division row into village_code and commit immediately.

        Args:
            code: value stored in the area_code column.
            name: value stored in the area_name column.
            city_village_code: value stored in the city_village_code column;
                defaults to '-1' when the caller has none.
        """
        print('%s %s %s' % (code, name, city_village_code))
        # Bind the scraped values as parameters instead of splicing them into
        # the SQL text: a value containing a single quote would otherwise break
        # the statement or inject SQL.
        session.execute(
            "insert into village_code(area_code, area_name, city_village_code) "
            "VALUES (:area_code, :area_name, :city_village_code)",
            {
                'area_code': code,
                'area_name': name,
                'city_village_code': city_village_code,
            },
        )
        session.commit()
    
    
    def openUrl(url):
        """Download *url* and return its body decoded as GBK, or None on failure.

        Every request is preceded by a random pause of up to 0.5 seconds. When
        the request or the decoding fails, the URL is pushed onto the Redis
        list 'area_code_error' and None is returned.

        Args:
            url: absolute URL of the page to fetch.

        Returns:
            The decoded page text, or None when the fetch failed.
        """
        sleep(random() * 0.5)
        try:
            request = urllib2.Request(url, headers=headers)
            return urllib2.urlopen(request, timeout=10).read().decode('gbk')
        except Exception:
            # Catch Exception rather than using a bare except with a return in
            # finally: that combination also swallowed KeyboardInterrupt,
            # SystemExit and any error raised while recording the failure.
            MRedis.lpush('area_code_error', url)
            return None
    
    
    def run():
        """Worker loop: drain the Redis crawl queues level by level, forever.

        Each pass empties 'area_code2', then 'area_code3', 'area_code4' and
        'area_code5' in that order. Every popped URL is fetched with openUrl()
        and, when a page came back, handed to the parser of that level. The
        function never returns.
        """
        # (queue name, parser) pairs in the order the queues are drained.
        levels = (
            ('area_code2', parseCode2),
            ('area_code3', parseCode3),
            ('area_code4', parseCode4),
            # parseCode5 is the leaf level and only needs the page text; the
            # original call passed the URL as a second argument, which raised
            # TypeError against its one-parameter signature.
            ('area_code5', lambda html, url: parseCode5(html)),
        )
        while True:
            handled_any = False
            for queue_name, parse in levels:
                url = MRedis.lpop(queue_name)
                while url:
                    handled_any = True
                    html = openUrl(url)
                    if html:
                        parse(html, url)
                    url = MRedis.lpop(queue_name)
            if not handled_any:
                # All queues were empty: pause instead of polling Redis in a
                # tight loop.
                sleep(1)
    
    
    def parseCode1(baseUrl, lastUrl):
        """Seed step: queue every province page URL found on the index page.

        Args:
            baseUrl: URL prefix that the relative links are appended to.
            lastUrl: index page name relative to baseUrl.
        """
        page = openUrl(baseUrl + lastUrl)
        if not page:
            return
        for row in re.findall("<tr class='provincetr'>.+?</tr>", page):
            # Each match is (relative link, province name); only the link is queued.
            for href, _name in re.findall("<a href='(.+?html)'>(.+?)<br/>", row):
                MRedis.lpush('area_code2', baseUrl + href)
    
    
    def parseCode2(html, url):
        """Store every linked city row of a province page and queue its link.

        Args:
            html: decoded text of the province page.
            url: URL the page was fetched from; links are resolved against its
                directory part.
        """
        # Everything before the last '/' of the page URL.
        parent = '/'.join(url.split('/')[:-1])
        for row in re.findall("<tr class='citytr'>.+?</tr>", html):
            cells = re.findall("<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>", row)
            for href, code, name in cells:
                insertVillage(code, name)
                MRedis.lpush('area_code3', parent + '/' + href)
    
    
    def parseCode3(html, url):
        """Store every linked county row of a city page and queue its link.

        Args:
            html: decoded text of the city page.
            url: URL the page was fetched from; links are resolved against its
                directory part.
        """
        # Everything before the last '/' of the page URL.
        parent = '/'.join(url.split('/')[:-1])
        for row in re.findall("<tr class='countytr'>.+?</tr>", html):
            cells = re.findall("<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>", row)
            for href, code, name in cells:
                insertVillage(code, name)
                MRedis.lpush('area_code4', parent + '/' + href)
    
    
    def parseCode4(html, url):
        """Store every linked town row of a county page and queue its link.

        Args:
            html: decoded text of the county page.
            url: URL the page was fetched from; links are resolved against its
                directory part.
        """
        # Everything before the last '/' of the page URL.
        parent = '/'.join(url.split('/')[:-1])
        for row in re.findall("<tr class='towntr'>.+?</tr>", html):
            cells = re.findall("<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>", row)
            for href, code, name in cells:
                insertVillage(code, name)
                MRedis.lpush('area_code5', parent + '/' + href)
    
    
    def parseCode5(html, url=None):
        """Store every village row of a town page (leaf level).

        Each village row has three plain cells; the first is stored as the
        area code, the third as the name and the second as city_village_code.

        Args:
            html: decoded text of the town page.
            url: URL the page was fetched from. Unused; accepted so the function
                can be called as parseCode5(html, url) like parseCode2..4, which
                is how run() invokes it.
        """
        for row in re.findall("<tr class='villagetr'>.+?</tr>", html):
            cells = re.findall("<td>(.+?)</td><td>(.+?)</td><td>(.+?)</td>", row)
            for code, category, name in cells:
                insertVillage(code, name, category)
            
    
    if __name__=="__main__":
        # Seed step: parseCode1 pushes the province page URLs found on the 2017
        # index page onto the 'area_code2' queue.
        baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
        parseCode1(baseUrl, 'index.html')
        # Worker processes running run() are disabled here (left commented out).
        # p1 = Process(target=run)
        # p1.start()
        # p2 = Process(target=run)
        # p2.start()
        # p3 = Process(target=run)
        # p3.start()
  • 相关阅读:
    步步为营VS 2008 + .NET 3.5(3) C# 3.0新特性之Automatic Properties(自动属性)、Object Initializers(对象初始化器)、Collection Initializers(集合初始化器)和Extension Methods(扩展方法)
    新瓶旧酒ASP.NET AJAX(10) 客户端脚本编程(Sys.Services命名空间下的类)
    步步为营VS 2008 + .NET 3.5(2) VS 2008新特性之JavaScript Intellisense and Debugging(JavaScript的智能感知和调试)
    新瓶旧酒ASP.NET AJAX(8) 客户端脚本编程(Sys.Net命名空间下的WebRequestManager、WebRequest、WebRequestExecutor和XMLHttpExecutor)
    步步为营VS 2008 + .NET 3.5(11) DLINQ(LINQ to SQL)之大数据量分页、延迟执行和日志记录
    步步为营VS 2008 + .NET 3.5(14) XLINQ(LINQ to XML)之针对XML文件的添加、查询、更新和删除
    [翻译]ASP.NET 2.0中的健康监测系统(Health Monitoring)(3) 触发自定义事件
    稳扎稳打Silverlight(2) 1.0实例之支持录音和回放的钢琴(Silverlight+ASP.NET AJAX+DLINQ)
    步步为营VS 2008 + .NET 3.5(7) LINQ查询操作符之First、FirstOrDefault、Last、LastOrDefault、ElementAt、ElementAtOrDefault、Contains、Any、All、Count、LongCount、Sum、Min、Max、Average、Aggregate、Cast、DefaultIfEmpty、SequenceEqual、OfType、ToArray、ToList、ToDictionary
    步步为营VS 2008 + .NET 3.5(1) VS 2008新特性之Multi Targeting(多定向)、Web Designer and CSS(集成了CSS的web设计器)和Nested Master Page(嵌套母版页)
  • 原文地址:https://www.cnblogs.com/lurenjia1994/p/9724372.html
Copyright © 2011-2022 走看看