zoukankan      html  css  js  c++  java
  • 使用Python抓取网页信息

            之前用C#帮朋友写了一个抓取网页信息的程序,搞得好复杂。今天朋友又要让我下载网页数据,好多啊,又想偷懒,可是不想用C#了,于是想到了Python。大概花了两个小时,用记事本敲的,然后在IDLE (Python GUI)里面测试。发现Python之类的解释型语言很不错,不用编译,写个脚本就好了。代码如下:

    # -*- coding:gb2312 -*-
    import sys
    import urllib
    import re
    
    # Parse the title out of the html
    def ParshTitle(html):
        """Return the page title text, used by the caller as a file name prefix.

        The slice starts 29 characters after the position of '<title>': 7 for
        the tag itself plus 22 more that are skipped (presumably an opening
        '<font ...>' tag on the target pages -- TODO confirm against a live
        page). Any '</font>' left inside the result is removed.

        Args:
            html: page source as a string.

        Returns:
            The title text, or '' when '<title>' or '</title>' is not present.
        """
        startPos = html.find('<title>')
        endpos = html.find('</title>')

        # find() returns -1 for a missing tag; slicing with that value would
        # silently return an unrelated chunk of the page instead of a title.
        if startPos == -1 or endpos == -1:
            return ''

        strTmp = html[startPos+29:endpos]
        return strTmp.replace('</font>', '')
    
    # Parse the CPI data out of the html
    def ParshCPI(html):
        """Return the page's <TBODY> section wrapped in a <table> element.

        The extracted fragment runs from the first '<TBODY>' (tag included)
        up to, but not including, the first '</TBODY>'. No individual cells
        are parsed. When a tag is absent, find() yields -1 and the slice is
        taken with that index as-is.
        """
        begin = html.find('<TBODY>')
        finish = html.find('</TBODY>')
        return "<table>" + html[begin:finish] + "</table>"
    
    
    # Parse the urban investment data out of the html
    def ParshUrbanInvestment(html):
        """Return the page's <TBODY> section wrapped in a <table> element.

        Takes everything from the first '<TBODY>' (tag included) up to, but
        not including, the first '</TBODY>'; the table contents are not
        parsed any further. A missing tag makes find() return -1, and that
        index is used for the slice unchanged.
        """
        tableStart = html.find('<TBODY>')
        tableEnd = html.find('</TBODY>')
        fragment = html[tableStart:tableEnd]
        return "".join(("<table>", fragment, "</table>"))
    	
    # Fetch one page and save its data table to a file
    def GetHtmlData(url, htmlfile, bisCPI):
        """Download a page, extract its data table and write it to a file.

        The output file name is the page title (from ParshTitle) followed by
        htmlfile.

        Args:
            url: address of the page to download.
            htmlfile: suffix appended to the page title to form the file name.
            bisCPI: truthy to extract with ParshCPI, falsy to extract with
                ParshUrbanInvestment.
        """
        wp = urllib.urlopen(url)  # open the connection
        try:
            content = wp.read()  # fetch the page content
        finally:
            # Release the connection even when the read fails.
            wp.close()
        content = content.replace('\r\n', '')

        title = ParshTitle(content)

        if bisCPI:
            content = ParshCPI(content)
        else:
            content = ParshUrbanInvestment(content)

        # Decode the byte-string path from GBK into a unicode object before
        # handing it to open().
        fl = unicode(title + htmlfile, 'gbk')

        # The with-block closes the file even if write() raises.
        with open(fl, 'w') as f:
            f.write(content)
    
    
    if __name__ == "__main__":
        # Pass 1: regional consumer price index (CPI) pages.
        # The record id is appended to the end of the query string.
        cpiBaseUrl = "http://www.stats.gov.cn/was40/gjtjj_detail_data.jsp?searchword=%28docTitle%21%3D%B9%FA%C4%DA%C9%FA%B2%FA%D7%DC%D6%B5+and+docTitle%21%3D%B3%C7%D5%F2%B5%A5%CE%BB%B4%D3%D2%B5%C8%CB%D4%B1%C0%CD%B6%AF%B1%A8%B3%EA+and+docTitle%21%3D%C5%A9%C1%D6%C4%C1%D3%E6%D2%B5%D7%DC%B2%FA%D6%B5+and+docTitle%21%3D%B8%F7%B5%D8%C7%F8%C5%A9%C1%D6%C4%C1%D3%E6%D2%B5%D7%DC%B2%FA%D6%B5+and+docTitle%21%3D%D2%DA%D4%AA%D2%D4%C9%CF%C9%CC%C6%B7%BD%BB%D2%D7%CA%D0%B3%A1%D6%F7%D2%AA%D6%B8%B1%EA+and+docTitle%21%3D%B8%F7%B5%D8%C7%F8%D2%DA%D4%AA%D2%D4%C9%CF%C9%CC%C6%B7%BD%BB%D2%D7%CA%D0%B3%A1%BB%F9%B1%BE%C7%E9%BF%F6+and+docTitle%21%3D%B8%F7%B5%D8%C7%F8%C5%A9%B4%E5%BE%D3%C3%F1%BC%D2%CD%A5%C6%BD%BE%F9%C3%BF%C8%CB%CF%D6%BD%F0%CA%D5%C8%EB+and+docTitle%21%3D%B7%D6%B5%D8%C7%F8%B3%C7%D5%F2%B5%A5%CE%BB%BE%CD%D2%B5%C8%CB%D4%B1+and+docTitle%21%3D%B7%D6%B5%D8%C7%F8%B3%C7%D5%F2%B5%A5%CE%BB%BE%CD%D2%B5%C8%CB%D4%B1%C0%CD%B6%AF%B1%A8%B3%EA+and+docTitle%21%3D%C8%AB%B9%FA%D6%F7%D2%AA%C5%A9%B2%FA%C6%B7%C9%FA%B2%FA%BC%DB%B8%F1%D6%B8%CA%FD+and+docTitle%21%3D%B8%F7%B5%D8%C7%F8%C5%A9%B4%E5%BE%D3%C3%F1%BC%D2%CD%A5%C6%BD%BE%F9%C3%BF%C8%CB%CF%D6%BD%F0%D6%A7%B3%F6+and+docTitle%21%3D%C6%F3%D2%B5%BE%B0%C6%F8%D6%B8%CA%FD+and+docTitle%21%3D%C4%BF%C2%BC+and++docTitle%21%3D%B7%D6%B5%D8%C7%F8%B3%C7%D5%F2%B5%A5%CE%BB%BE%CD%D2%B5%C8%CB%D4%B1+and+docTitle%21%3D%B7%D6%B5%D8%C7%F8%B3%C7%D5%F2%B5%A5%CE%BB%BE%CD%D2%B5%C8%CB%D4%B1%C0%CD%B6%AF%B1%A8%B3%EA+and++docTitle%21%3D%C8%AB%B9%FA%D6%F7%D2%AA%C5%A9%B2%FA%C6%B7%C9%FA%B2%FA%BC%DB%B8%F1%D6%B8%CA%FD+and+docTitle%21%3D%B8%F7%B5%D8%C7%F8%C5%A9%B4%E5%BE%D3%C3%F1%BC%D2%CD%A5%C6%BD%BE%F9%C3%BF%C8%CB%CF%D6%BD%F0%D6%A7%B3%F6+and++docTitle%21%3D%B8%F7%B5%D8%C7%F8%B3%C7%D5%F2%BE%D3%C3%F1%BC%D2%CD%A5%CA%D5%D6%A7%BB%F9%B1%BE%C7%E9%BF%F6%29+and+%28docTitle%3D%B8%F7%B5%D8%C7%F8%BE%D3%C3%F1%CF%FB%B7%D1%BC%DB%B8%F1%D6%B8%CA%FD+or+DOCHTMLCONTENT%3D%B8%F7%B5%D8%C7%F8%BE%D3%C3%F1%CF%FB%B7%D1%BC%DB%B8%F1%D6%B8%CA%FD%29&channelid=9951&record="
        # Record ids 1..71 are fetched; id 0 is never requested.
        for recordId in range(1, 72):
            GetHtmlData(cpiBaseUrl + str(recordId), "_CPI.htm", True)
            print (str(recordId) + "/72")

        # Pass 2: regional urban investment pages.
        investBaseUrl = "http://www.stats.gov.cn/was40/gjtjj_detail_data.jsp?searchword=%B8%F7%B5%D8%C7%F8%B3%C7%D5%F2%CD%B6%D7%CA&presearchword=%28docTitle%21%3D%B9%FA%C4%DA%C9%FA%B2%FA%D7%DC%D6%B5+and+docTitle%21%3D%B3%C7%D5%F2%B5%A5%CE%BB%B4%D3%D2%B5%C8%CB%D4%B1%C0%CD%B6%AF%B1%A8%B3%EA+and+docTitle%21%3D%C5%A9%C1%D6%C4%C1%D3%E6%D2%B5%D7%DC%B2%FA%D6%B5+and+docTitle%21%3D%B8%F7%B5%D8%C7%F8%C5%A9%C1%D6%C4%C1%D3%E6%D2%B5%D7%DC%B2%FA%D6%B5+and+docTitle%21%3D%D2%DA%D4%AA%D2%D4%C9%CF%C9%CC%C6%B7%BD%BB%D2%D7%CA%D0%B3%A1%D6%F7%D2%AA%D6%B8%B1%EA+and+docTitle%21%3D%B8%F7%B5%D8%C7%F8%D2%DA%D4%AA%D2%D4%C9%CF%C9%CC%C6%B7%BD%BB%D2%D7%CA%D0%B3%A1%BB%F9%B1%BE%C7%E9%BF%F6+and+docTitle%21%3D%B8%F7%B5%D8%C7%F8%C5%A9%B4%E5%BE%D3%C3%F1%BC%D2%CD%A5%C6%BD%BE%F9%C3%BF%C8%CB%CF%D6%BD%F0%CA%D5%C8%EB+and+docTitle%21%3D%B7%D6%B5%D8%C7%F8%B3%C7%D5%F2%B5%A5%CE%BB%BE%CD%D2%B5%C8%CB%D4%B1+and+docTitle%21%3D%B7%D6%B5%D8%C7%F8%B3%C7%D5%F2%B5%A5%CE%BB%BE%CD%D2%B5%C8%CB%D4%B1%C0%CD%B6%AF%B1%A8%B3%EA+and+docTitle%21%3D%C8%AB%B9%FA%D6%F7%D2%AA%C5%A9%B2%FA%C6%B7%C9%FA%B2%FA%BC%DB%B8%F1%D6%B8%CA%FD+and+docTitle%21%3D%B8%F7%B5%D8%C7%F8%C5%A9%B4%E5%BE%D3%C3%F1%BC%D2%CD%A5%C6%BD%BE%F9%C3%BF%C8%CB%CF%D6%BD%F0%D6%A7%B3%F6+and+docTitle%21%3D%C6%F3%D2%B5%BE%B0%C6%F8%D6%B8%CA%FD+and+docTitle%21%3D%C4%BF%C2%BC+and++docTitle%21%3D%B7%D6%B5%D8%C7%F8%B3%C7%D5%F2%B5%A5%CE%BB%BE%CD%D2%B5%C8%CB%D4%B1+and+docTitle%21%3D%B7%D6%B5%D8%C7%F8%B3%C7%D5%F2%B5%A5%CE%BB%BE%CD%D2%B5%C8%CB%D4%B1%C0%CD%B6%AF%B1%A8%B3%EA+and++docTitle%21%3D%C8%AB%B9%FA%D6%F7%D2%AA%C5%A9%B2%FA%C6%B7%C9%FA%B2%FA%BC%DB%B8%F1%D6%B8%CA%FD+and+docTitle%21%3D%B8%F7%B5%D8%C7%F8%C5%A9%B4%E5%BE%D3%C3%F1%BC%D2%CD%A5%C6%BD%BE%F9%C3%BF%C8%CB%CF%D6%BD%F0%D6%A7%B3%F6+and++docTitle%21%3D%B8%F7%B5%D8%C7%F8%B3%C7%D5%F2%BE%D3%C3%F1%BC%D2%CD%A5%CA%D5%D6%A7%BB%F9%B1%BE%C7%E9%BF%F6%29+and+%28docTitle%3D%B8%F7%B5%D8%C7%F8%BE%D3%C3%F1%CF%FB%B7%D1%BC%DB%B8%F1%D6%B8%CA%FD+or+DOCHTMLCONTENT%3D%B8%F7%B5%D8%C7%F8%BE%D3%C3%F1%CF%FB%B7%D1%BC%DB%B8%F1%D6%B8%CA%FD%29&channelid=9951&record="
        # Record ids 1..55 are fetched; id 0 is never requested.
        for recordId in range(1, 56):
            GetHtmlData(investBaseUrl + str(recordId), "_UI.htm", False)
            print (str(recordId) + "/56")
    

            对上面的代码稍微做个说明:这个主要是用来抓取国家统计局网站的统计数据,包括各地区居民消费价格指数和各地区城镇投资数据。同学要的数据是2006年1月到现在的,在统计局的网站上查询发现,所有数据的URL都一样,只有最后的一个id值不同:id从1开始递增,最新的数据是1,越早的数据id越大。统计局的这个URL好长啊!!

            代码中写了三个函数,第一个是获取网页的Title,用来作为保存的文件的文件名。

            一个函数是从html中解析各地区居民消费价格指数,另外一个提取各地区城镇投资数据,其实目前的代码这两个函数是一样的,我比较懒,没有提取里面的详细信息,目前只是把网页的那个Table的tbody给拿出来了,呵呵。如果可以的话,可以使用正则表达式,只把里面的数据提取出来。

            上次用C#写了好久才搞定,Python果然不一样,写简单的工具的话,还是比较推荐Python的,第一,简单,第二,不用编译,直接写个txt,放到解释器里面就好了。


  • 相关阅读:
    JavaScript或jQuery模拟点击超链接和按钮
    web开发中目录路径问题的解决
    jQuery操作复选框的简单使用
    php中常用魔术方法的举例
    Code-Validator:验证经度、验证维度
    Code-Validator:验证身份证号
    Code-Validator:验证IPv6地址
    Code-Validator:验证IPv4地址
    Code-Validator:验证网址(可以匹配IPv4地址但没对IPv4地址进行格式验证;IPv6暂时没做匹配)
    Code-Validator:验证电子邮箱
  • 原文地址:https://www.cnblogs.com/xiaowangba/p/6314030.html
Copyright © 2011-2022 走看看