zoukankan      html  css  js  c++  java
  • 爬取【王琳杰-博客园】的博文

    1. 获取页面信息,用 XPath 做数据提取

    2. 获取每个 blog 里的标题、正文、阅读次数信息

    3. 保存到 json 文件内

    # -*- coding:utf-8 -*-
    
    import urllib
    import urllib2
    from lxml import etree
    
    def loadPage(url, timeout=10):
        """
            Fetch one index (post-list) page and process every post it links to.

            Downloads ``url``, parses the response as HTML, collects the href
            of each anchor directly inside a <div class="postTitle">, and
            passes every href to ``loadpage``.

            url: address of the index page to crawl
            timeout: seconds to wait on the server before urllib2 gives up
        """
        request = urllib2.Request(url)
        # Bound the wait and close the connection explicitly so a stalled
        # server cannot hang the crawl or leave sockets open.
        response = urllib2.urlopen(request, timeout=timeout)
        try:
            html = response.read()
        finally:
            response.close()

        # Parse the raw HTML into an element tree that supports XPath.
        content = etree.HTML(html)
        # hrefs of all post-title anchors found on this page
        link_list = content.xpath('//div[@class="postTitle"]/a/@href')
        for link in link_list:
            loadpage(link)
    
    # Fetch a single post page and print the text of its first anchor.
    def loadpage(link, timeout=10):
        """
            Download one post and print the text of the first <a> element
            found anywhere inside a <div class="post">.

            NOTE(review): on this blog that anchor is presumably the post
            title link - confirm against the live page markup.

            link: URL of the post page
            timeout: seconds to wait on the server before urllib2 gives up
        """
        headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
        request = urllib2.Request(link, headers = headers)
        # Bound the wait and always release the connection.
        response = urllib2.urlopen(request, timeout=timeout)
        try:
            html = response.read()
        finally:
            response.close()

        content = etree.HTML(html)
        anchors = content.xpath('//div[@class="post"]//a')
        # A page without the expected markup used to raise IndexError and
        # abort the whole crawl; skip such pages instead.
        if not anchors:
            return
        # Parenthesised single-argument print behaves the same on Python 2.
        print(anchors[0].text)
    
    def blogSpider(url, beginPage, endPage):
        """
            Crawl scheduler: build the URL of every index page in the
            requested range and hand each one to ``loadPage``.

            url : leading part of the index-page URL; the page number is
                  appended to it
            beginPage : first page number (inclusive)
            endPage : last page number (inclusive)
        """
        for page in range(beginPage, endPage + 1):
            loadPage(url + str(page))

        # Print the farewell once, after the whole range has been crawled
        # (it used to sit inside the loop and fire after every page).
        print("谢谢使用")
    
    if __name__ == "__main__":
        # Base address of the blog's post list; the page number is appended.
        url = "http://www.cnblogs.com/wanglinjie/default.html?page="

        # Ask for the inclusive range of index pages to crawl.
        beginPage = int(raw_input("请输入起始页:"))
        endPage = int(raw_input("请输入结束页:"))

        blogSpider(url=url, beginPage=beginPage, endPage=endPage)

     未完待续……

    Python 3 版本:

     1 import requests
     2 from lxml import etree
     3 
     4 def loadPage(url):
     5     """
     6         作用:根据url发送请求,获取服务器响应文件
     7         url: 需要爬取的url地址
     8     """
     9     #print url
    10     #headers = {"User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
    11 
    12     #request = urllib2.Request(url)
    13     #html = urllib2.urlopen(request).read()
    14     reseponse = requests.get(url)
    15     # 解析HTML文档为HTML DOM模型
    16     #content = etree.HTML(html)
    17     html = reseponse.content
    18     content = etree.HTML(html)
    19     #content = reseponse.text
    20     #print(content)
    21     # 返回所有匹配成功的列表集合
    22     link_list = content.xpath('//div[@class="postTitle"]/a/@href')
    23     for link in link_list:
    24         
    25         #print link
    26         loadpage(link)
    27 
    28 # 取出每个文章的链接
    29 def loadpage(link):
    30     headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    31     #request = urllib2.Request(link, headers = headers)
    32     #html = urllib2.urlopen(request).read()
    33     reseponse = requests.get(link, headers = headers)
    34     html = reseponse.content
    35     # 解析
    36 
    37     content = etree.HTML(html)
    38     # 取出每层发送的文章链接集合
    39     link_list = content.xpath('//div[@class="post"]//a')[0].text
    40     print(link_list)
    41     # 取出每个标题,正文,阅读的连接
    42     #for link in link_list:
    43     #   print link
    44     #   write(link)
    45 
    46 def blogSpider(url, beginPage, endPage):
    47     """
    48         作用:贴吧爬虫调度器,负责组合处理每个页面的url
    49         url : 贴吧url的前部分
    50         beginPage : 起始页
    51         endPage : 结束页
    52     """
    53     for page in range(beginPage, endPage + 1):
    54         pn = page
    55         fullurl = url + str(pn)
    56         #print fullurl
    57         loadPage(fullurl)
    58         #print html
    59 
    60         print("谢谢使用")
    61 
    62 if __name__ == "__main__":
    63     beginPage = int(input("请输入起始页:"))
    64     endPage = int(input("请输入结束页:"))
    65 
    66     url = "http://www.cnblogs.com/wanglinjie/default.html?page="
    67     blogSpider(url, beginPage, endPage)
  • 相关阅读:
    线程包含CPU现场
    K8S资源操作
    Kubernetes资源管理
    kubernetes部署安装
    kubernetes简介
    Docker总结
    3.11 虚拟局域网
    3.10 以太网交换机生成树协议STP
    3.9 以太网交换机自学习和转发帧的流程
    3.8 集线器与交换机的区别
  • 原文地址:https://www.cnblogs.com/wanglinjie/p/9194068.html
Copyright © 2011-2022 走看看