python爬虫相关
- 1、爬取图片至本地,目前仅能实现将百度贴吧里面的图片下载至本地,百度图片里面的图片还不能实现,等完善
- 2、抓取糗百的段子,查看作者、段子内容,点赞个数
- 3、更加人性化的查看方式,按下回车显示一个段子,如果按[q|Q]就直接退出
代码
- 1、用到的模块,urllib,urllib2,re
- 2、url匹配用到了re模块
- 3、文件下载使用了urllib.urlretrieve()来将分析出来的图片下载至本地
1、百度贴吧图片
#!/usr/bin/env python
#-*- coding:utf8 -*-
import urllib, urllib2
import re
def getHtml(url):
    """Fetch *url* and return the raw HTML of the page as a string."""
    response = urllib2.urlopen(url)
    return response.read()
def getImage(html):
'''需要注意这里的.*?表示的是非贪婪匹配,如果遇到第一个>,那么就停止匹配'''
re_img = re.compile(r'<img class="BDE_Image" src="(.*?)".*?>')
img_list = re_img.findall( html )
i = 1
for imgurl in img_list:
print imgurl
'''使用urllib.urlretrieve()来将分析出来的图片下载至本地
'''
urllib.urlretrieve(imgurl, filename='%s.jpg' % i)
i += 1
if __name__ == '__main__':
    # Other threads used while testing:
    #   http://tieba.baidu.com/p/3999261766
    #   http://tieba.baidu.com/p/4957363500
    url = 'http://tieba.baidu.com/p/2263349749'
    # Fetch the thread, then pull and save every post image it contains.
    html = getHtml(url)
    getImage(html)
2、一次性获取指定页面的糗百段子
#!/usr/bin/env python
#-*- coding:utf8 -*-
'''
# 爬取糗百段子
1. 攫取段子
2. 过滤带有图片的段子
3. 实现每按一次回车显示一个段子的发布人,段子内容,点赞个数
'''
import urllib, urllib2
import re
page = 2
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
# print( url )
# User-Agent: 封装
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0"}
try:
req = urllib2.Request(url, headers=headers)
rsp = urllib2.urlopen(req)
html = rsp.read()
except urllib2.URLError, e:
if hasattr(e, 'code'):
print e.code
if hasattr(e, 'reason'):
print e.reason
re_page = re.compile(r'<div class="author.*?>.*?<a.*?<img src=".*?" alt="(.*?)"/>.*?<a.*?<div class="content">.*?<span>(.*?)</span>.*?<div class="stats">.*?<i class="number">(d+)</i>', re.S)
items = re_page.findall( html )
# print(items)
for item in items:
for i in item:
print(i)
3、实现每按一次回车显示一个段子的发布人,段子内容,点赞个数
#!/usr/bin/env python
#-*- coding:utf8 -*-
'''
# 爬取糗百段子
1. 攫取段子
2. 实现每按一次回车只显示一个段子
'''
import urllib, urllib2
import re
import sys
def getPage(page_num):
url = 'http://www.qiushibaike.com/hot/page/' + str(page_num)
# print( url )
# User-Agent: 封装
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0"}
try:
req = urllib2.Request(url, headers=headers)
rsp = urllib2.urlopen(req)
html = rsp.read()
# print html
return html # 返回网页源码
except urllib2.URLError, e:
if hasattr(e, 'code'):
print('连接服务器失败, 错误代码: %s' % e.code)
return None
if hasattr(e, 'reason'):
print('连接服务器失败,错误原因: %s' % e.reason)
return None
def getPageContent(page_num=1):
    """Parse one hot-jokes page into (page, author, content, votes) tuples.

    Returns a list of 4-tuples with each text field stripped of
    surrounding whitespace. NOTE(review): if getPage() failed and
    returned None, findall(None) raises TypeError — callers should
    expect the request to have succeeded.
    """
    html = getPage(page_num)
    # Groups: 1 = author name (<h2>), 2 = joke text (<span>), 3 = vote count.
    # BUG FIX: the vote-count group was r'(d+)' — a literal "d" — so the
    # pattern could never match; it must be r'(\d+)' to capture digits.
    re_page = re.compile(r'<div class="author.*?>.*?<a.*?<img src=".*?"/>.*?</a>.*?<a.*?>.*?<h2>(.*?)</h2>.*?</a>.*?<a.*?<div class="content">.*?<span>(.*?)</span>.*?<div class="stats">.*?<i class="number">(\d+)</i>', re.S)
    items = re_page.findall(html)
    page_contents = []
    for author, content, votes in items:
        page_contents.append((page_num, author.strip(), content.strip(), votes.strip()))
    return page_contents
def getOneStory(page_contents):
    """Show one joke per Enter keypress; exit the program on 'q' or 'Q'.

    page_contents: list of (page, author, content, votes) tuples as
    produced elsewhere in this file.
    """
    for story in page_contents:
        # Renamed from `input`, which shadowed the builtin.
        keypress = raw_input()
        if keypress.lower() == 'q':
            sys.exit()
        # BUG FIX: the '\n' escape was mangled into a literal newline in
        # the pasted source, leaving an unterminated string literal.
        print('第%s页 发布人:%s 赞: %s\n%s' % (story[0], story[1], story[3], story[2]))
if '__main__' == __name__:
    # BUG FIX: the banner's '\n' escapes were mangled into literal
    # newlines in the pasted source, breaking the string literal.
    print("Loading web content from web site ...\nPress [q|Q] to exit, and press 'Enter' see next content:\n")
    num = 1
    # Walk through the hot pages one at a time; getOneStory() blocks on
    # stdin between jokes and sys.exit()s when the user presses q/Q.
    while True:
        page_contents = getPageContent(num)
        getOneStory(page_contents)
        num += 1
未完,待续……