zoukankan      html  css  js  c++  java
  • python抓取网页例子

    python抓取网页例子

    最近在学习 Python,刚刚完成了一个网页抓取的例子:通过 Python 抓取全世界所有学校及其院系的数据,并保存为 XML 文件。数据源是人人网。

    因为刚学习python,写的代码还不够Pythonic。

    核心代码如下:

    
    
    #!/usr/bin/python
    
    
    import urllib.request
    from html.parser import HTMLParser  
    import json
    import time
    import xml.dom.minidom
    import os
    
    
    class Dept():
        """Base record for a scraped entity, identified by an id and a name.

        Instantiated directly for departments and subclassed by the container
        types below.
        """
        # Immutable defaults, so sharing them at class level is safe.
        id = 0
        name = ''


    class University(Dept):
        """A university together with the departments that belong to it."""

        def __init__(self):
            super().__init__()
            # Created per instance: a class-level ``depts = []`` would be one
            # list shared by every University object.
            self.depts = []


    class City(Dept):
        """A city (province entry) together with its universities."""

        def __init__(self):
            super().__init__()
            # Per-instance list, for the same reason as ``University.depts``.
            self.universities = []


    class Country(Dept):
        """A country together with its cities."""

        def __init__(self):
            super().__init__()
            # Per-instance list; with a class-level list every country would
            # accumulate the cities of all other countries.
            self.cities = []
    
    class MyHtmlParser(HTMLParser):
        """Collect the ``value`` attribute of every ``<option>`` tag fed to the parser.

        After ``feed()``, ``depts`` holds the non-empty option values in
        document order. ``links`` is initialised but never filled by this class.
        """

        def __init__(self):
            HTMLParser.__init__(self)
            self.links = []
            self.depts = []

        def handle_starttag(self, tag, attrs):
            """Record the non-empty ``value`` attribute of an ``<option>`` start tag.

            Args:
                tag: Lower-cased tag name supplied by ``HTMLParser``.
                attrs: List of ``(name, value)`` pairs; ``value`` is ``None``
                    for attributes written without a value.
            """
            if tag != 'option':
                return
            for attrName, attrValue in attrs:
                # Only the value of ``value`` is a department name. Other
                # attribute names/values (``selected``, ``class`` ...) and the
                # None of valueless attributes must not leak into the result.
                if attrName == 'value' and attrValue:
                    self.depts.append(attrValue)
    def readDept(code):
        """Fetch the department list of one university from renren.com.

        Args:
            code: University id; it is converted with ``str()`` and appended to
                the query URL.

        Returns:
            A list of ``Dept`` objects, one per option value found in the
            response. Only ``name`` is set; ``id`` keeps its default.
        """
        url = 'http://www.renren.com/GetDep.do?id=' + str(code)
        # The context manager closes the HTTP response even when reading or
        # decoding fails.
        with urllib.request.urlopen(url) as response:
            # Each line is stripped and decoded as GBK, then joined in one pass
            # instead of growing a string with ``+`` inside the loop.
            html = ''.join(line.strip().decode('gbk')
                           for line in response.readlines())
        hp = MyHtmlParser()
        hp.feed(html)
        depts = []
        for optionValue in hp.depts:
            dept = Dept()
            dept.name = optionValue
            depts.append(dept)
        return depts
    
    def _appendTextElement(dom, parent, tag, text):
        """Append ``<tag>text</tag>`` to *parent* and return the new element."""
        element = dom.createElement(tag)
        element.appendChild(dom.createTextNode(text))
        parent.appendChild(element)
        return element


    def writeXml(city):
        """Serialise one city with its universities and departments to XML.

        The document is written to ``<city.name>.xml`` in the current working
        directory, replacing any existing file of that name.

        Args:
            city: Object with ``name``, ``id`` and ``universities``. Each
                university provides ``name``, ``id`` and ``depts``; each
                department provides ``name`` and ``id``.
        """
        impl = xml.dom.minidom.getDOMImplementation()
        dom = impl.createDocument(None, 'city', None)
        root = dom.documentElement
        _appendTextElement(dom, root, 'name', city.name)
        _appendTextElement(dom, root, 'id', str(city.id))

        univs = dom.createElement('universities')
        root.appendChild(univs)
        for uni in city.universities:
            universityE = dom.createElement('university')
            univs.appendChild(universityE)
            _appendTextElement(dom, universityE, 'name', uni.name)
            _appendTextElement(dom, universityE, 'id', str(uni.id))

            deptsE = dom.createElement('departments')
            universityE.appendChild(deptsE)
            for dep in uni.depts:
                deptE = dom.createElement('department')
                deptsE.appendChild(deptE)
                _appendTextElement(dom, deptE, 'name', dep.name)
                # The id element used to be built but never attached, so it
                # was missing from the output.
                _appendTextElement(dom, deptE, 'id', str(dep.id))

        filename = city.name + '.xml'
        if os.path.isfile(filename):
            os.remove(filename)
        # ``with`` closes the file even if serialisation raises.
        with open(filename, 'w', encoding='utf-8') as f:
            dom.writexml(f, addindent='  ', newl='\n', encoding='utf-8')
        print('write xml :' + city.name + '.xml')
        
    
    def mkdir(path):
        """Create directory *path*, including missing parents, unless it exists.

        Surrounding whitespace and trailing ``/`` characters are removed from
        *path* before it is checked and created.
        """
        target = path.strip().rstrip("/")
        if os.path.exists(target):
            return
        os.makedirs(target)
    
    def readData(content):
        """Parse the university list, fetch departments and write one XML per city.

        For every university the departments are fetched with ``readDept`` and,
        once a city is complete, it is written out with ``writeXml``.

        Args:
            content: JSON text holding a list of country objects. Each country
                is read through its ``name``, ``id`` and ``provs`` keys, each
                prov through ``name``, ``id`` and ``univs``, and each univ
                through ``id`` and ``name``.

        Returns:
            A list of ``Country`` objects with their cities populated.
        """
        countries = []
        # Iterate the parsed list directly: the former ``range(0, 100)`` probe
        # truncated longer inputs, and its ``except IndexError`` also hid
        # IndexErrors raised anywhere deeper in the loop body.
        for entry in json.loads(content):
            country = Country()
            country.name = entry['name']
            country.id = entry['id']
            # Fresh list per country so cities are not accumulated in a list
            # shared between Country objects.
            country.cities = []
            for prov in entry['provs']:
                city = City()
                city.name = prov['name']
                city.id = prov['id']
                city.universities = []
                country.cities.append(city)
                for dic in prov['univs']:
                    university = University()
                    university.id = dic['id']
                    university.name = dic['name']
                    university.depts = readDept(university.id)
                    city.universities.append(university)
                    print('city = ' + city.name + '\tuniversity = ' + university.name)
                writeXml(city)
            countries.append(country)
        return countries
    
    
    # Script entry: log the start time, load the university list from the local
    # file ``data``, scrape everything, then log the end time.
    # ``time.strftime`` without a time tuple formats the current local time.
    print('开始时间:' + time.strftime('%Y-%m-%d %H:%M:%S'))
    # ``with`` closes the input file even if reading fails.
    with open('data', 'r') as f:
        content = f.read()
    counties = readData(content)
    print('结束时间:' + time.strftime('%Y-%m-%d %H:%M:%S'))
    
    
    
    

    其中 data 文件的内容是从如下地址获取的:

    http://s.xnimg.cn/allunivlist.js
    
  • 相关阅读:
    51单片机 第五节 模块化编程与LCD调试工具
    51单片机 第七节 定时器
    第四届蓝桥杯试题
    洛谷题单 【算法17】搜索
    HttpPostedFile 和 HttpPostedFileBase 你真的了解嘛?
    Juqery让世界更美好超级简单实用的(上、下)自动翻的最佳效果,有图为证!
    图片防盗链实现
    gravity与layout_gravity的区别
    color.xml
    SOAPAction Header!
  • 原文地址:https://www.cnblogs.com/wardensky/p/4780808.html
Copyright © 2011-2022 走看看