# A simple Baidu Tieba spider: collect the thread links from a forum's list
# pages and download every image posted in each thread.
import os
import ssl
import urllib.parse
import urllib.request

from lxml import etree

# Disable HTTPS certificate verification so urlopen() does not fail on hosts
# with certificate problems; fine for a quick demo, not for production code.
ssl._create_default_https_context = ssl._create_unverified_context
def loadPage(url):
    """Fetch one forum list page and crawl every thread linked from it."""
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    request = urllib.request.Request(url, headers=headers)
    html = urllib.request.urlopen(request).read()
    # Parse the HTML document into an element tree (HTML DOM model)
    content = etree.HTML(html)
    # Return the list of all matching thread-title links
    link_list = content.xpath('//li[@class=" j_thread_list clearfix"]'
                              '//div[@class="threadlist_title pull_left j_th_tit "]/a/@href')
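    # Note: an XPath @class="..." test is an exact string comparison, so the
    # leading/trailing spaces inside the quotes above must match the page's
    # class attributes character for character.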
    print(link_list, len(link_list))
    for link in link_list:
        fulllink = "http://tieba.baidu.com" + link  # full URL of each thread
        loadImage(fulllink)
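
# A more robust way to build the thread URL (a sketch, assuming the hrefs are
# relative paths such as "/p/1234567890"):
#     fulllink = urllib.parse.urljoin("http://tieba.baidu.com", link)
# urljoin also copes with hrefs that are already absolute URLs.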
# Extract the link of every image in a single thread
def loadImage(link):
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
    request = urllib.request.Request(link, headers=headers)
    html = urllib.request.urlopen(request).read()
    content = etree.HTML(html)
    # Return the list of all image links in the thread
    link_list = content.xpath('//img[@class="BDE_Image"]/@src')
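    # An XPath expression ending in /@src returns a plain list of attribute
    # strings, one per matching <img> element.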
    for link in link_list:
        # Use the last 15 characters of the URL as the file name
        filename = link[-15:]
        urllib.request.urlretrieve(link, './tieba/' + filename)
        print("Downloaded successfully" + '----' + filename)
def tiebaSpider(url, beginPage, endPage):
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50  # each list page shows 50 threads; pn is the offset
        fullurl = url + "&pn=" + str(pn)
        loadPage(fullurl)
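
# Example: tiebaSpider("https://tieba.baidu.com/f?kw=python", 1, 3) requests
# the list pages with pn=0, pn=50 and pn=100 ("kw=python" is just an
# illustrative forum name).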
if __name__ == "__main__":
    kw = input("Enter the name of the Tieba forum to crawl: ")
    startPage = int(input("Enter the start page: "))
    endPage = int(input("Enter the end page: "))
    url = "https://tieba.baidu.com/f?"
    # urlencode({'kw': kw}) percent-encodes the keyword, e.g.
    # https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3 for kw="美女"
    key = urllib.parse.urlencode({"kw": kw})
    fullurl = url + key
    # fullurl = url + 'kw=' + kw
    # print(fullurl)
    os.makedirs('./tieba', exist_ok=True)  # make sure the download folder exists
    tiebaSpider(fullurl, startPage, endPage)
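
# Sample run (assuming lxml is installed: pip install lxml):
#     Enter the name of the Tieba forum to crawl: python
#     Enter the start page: 1
#     Enter the end page: 1
# Downloaded images are saved under ./tieba/ in the current working directory.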