- Fetch each list page and use XPath to extract the data
- For every blog post, grab the title, body, and read count
- Save the results to a JSON file
# -*- coding:utf-8 -*-
import urllib2
from lxml import etree


def loadPage(url):
    """
    Purpose: request a list page and collect the link of every post on it.
    url: the list-page URL to crawl
    """
    request = urllib2.Request(url)
    html = urllib2.urlopen(request).read()
    # Parse the HTML document into a DOM tree
    content = etree.HTML(html)
    # All post links on this list page
    link_list = content.xpath('//div[@class="postTitle"]/a/@href')
    for link in link_list:
        loadpage(link)


# Fetch one article page
def loadpage(link):
    headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    request = urllib2.Request(link, headers = headers)
    html = urllib2.urlopen(request).read()
    # Parse the article page
    content = etree.HTML(html)
    # For now only the text of the first link inside the post body (the article title) is printed
    link_list = content.xpath('//div[@class="post"]//a')[0].text
    print link_list
    # TODO: extract the title, body and read count of each article
    #for link in link_list:
    #    print link
    #    write(link)


def blogSpider(url, beginPage, endPage):
    """
    Purpose: crawler scheduler, builds and dispatches the URL of every list page.
    url       : the fixed part of the blog URL
    beginPage : first page
    endPage   : last page
    """
    for page in range(beginPage, endPage + 1):
        pn = page
        fullurl = url + str(pn)
        loadPage(fullurl)

    print "Thanks for using"


if __name__ == "__main__":
    beginPage = int(raw_input("Enter the start page: "))
    endPage = int(raw_input("Enter the end page: "))

    url = "http://www.cnblogs.com/wanglinjie/default.html?page="
    blogSpider(url, beginPage, endPage)
To be completed...
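Neither version of the script reaches the last bullet yet: writing the scraped fields into a JSON file. A minimal sketch of that final step, assuming the per-article fields have already been collected into a list of dicts (the name items is just a placeholder), which behaves the same under Python 2 and Python 3:

import json

# items is assumed to be a list of dicts, one per article, e.g.
# {"title": ..., "body": ..., "views": ...}
with open("blogs.json", "w") as f:
    # ensure_ascii is left at its default (True) so the output stays plain ASCII
    # and the same call works identically on Python 2 and Python 3.
    json.dump(items, f, indent=2)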
In Python 3:
import requests
from lxml import etree


def loadPage(url):
    """
    Purpose: request a list page and collect the link of every post on it.
    url: the list-page URL to crawl
    """
    response = requests.get(url)
    html = response.content
    # Parse the HTML document into a DOM tree
    content = etree.HTML(html)
    # All post links on this list page
    link_list = content.xpath('//div[@class="postTitle"]/a/@href')
    for link in link_list:
        loadpage(link)


# Fetch one article page
def loadpage(link):
    headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    response = requests.get(link, headers = headers)
    html = response.content
    # Parse the article page
    content = etree.HTML(html)
    # For now only the text of the first link inside the post body (the article title) is printed
    link_list = content.xpath('//div[@class="post"]//a')[0].text
    print(link_list)
    # TODO: extract the title, body and read count of each article
    #for link in link_list:
    #    print(link)
    #    write(link)


def blogSpider(url, beginPage, endPage):
    """
    Purpose: crawler scheduler, builds and dispatches the URL of every list page.
    url       : the fixed part of the blog URL
    beginPage : first page
    endPage   : last page
    """
    for page in range(beginPage, endPage + 1):
        pn = page
        fullurl = url + str(pn)
        loadPage(fullurl)

    print("Thanks for using")


if __name__ == "__main__":
    beginPage = int(input("Enter the start page: "))
    endPage = int(input("Enter the end page: "))

    url = "http://www.cnblogs.com/wanglinjie/default.html?page="
    blogSpider(url, beginPage, endPage)
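The commented-out loop in loadpage marks where the real extraction should happen: pulling the title, body, and read count out of each article page so they can be collected for the JSON step above. A minimal Python 3 sketch of that idea follows; the element ids used here (cb_post_title_url, cnblogs_post_body, post_view_count) are assumptions about cnblogs' markup rather than verified selectors, and the read count is often filled in by JavaScript, so it may come back empty from the static HTML.

import requests
from lxml import etree

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}


def parse_article(link):
    """Fetch one article page and return its title, body text and read count.

    The XPath selectors below are assumptions about cnblogs' markup, not verified.
    """
    response = requests.get(link, headers=HEADERS)
    content = etree.HTML(response.content)

    title = content.xpath('string(//a[@id="cb_post_title_url"])')
    body = content.xpath('string(//div[@id="cnblogs_post_body"])')
    # The read count is usually injected by JavaScript, so this may be empty.
    views = content.xpath('string(//span[@id="post_view_count"])')

    return {"title": title.strip(), "body": body.strip(), "views": views.strip()}

Calling parse_article from loadPage instead of loadpage, appending each returned dict to a list, and handing that list to json.dump would complete the pipeline described in the goal list.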