
    Writing a Simple Web Crawler in Python: Example 3

    by: 授客 (QQ 1033553122)

    Environment

    Python version: 3.3.5 (the script raises errors under Python 2.7)

     

    Objective

    Retrieve specific target URLs from the site http://bbs.51testing.com/forum.php. Analysis shows the following relationship between the target URLs and the site's other URLs:


     
         

     

    [Figure: URL relationship diagram. The entry page forum.php links to board pages (forum-*.html); each board links to thread pages (thread-*.html); the target treasure.php links appear inside thread pages.]

     

    The target URLs are scattered at random through the posts on those sub-pages; the crawler has to dig them out. A quick sketch of the matching idea follows.
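
    Here is a minimal, self-contained sketch of that idea: the target links all have the form http://bbs.51testing.com/treasure/treasure.php?trenum=NNNNN, so a regular expression is enough to pick them out of a page's href values. The sample hrefs below are made up for illustration:

    import re

    # The same target pattern the full script uses further down.
    pattern = re.compile(r"http://bbs\.51testing\.com/treasure/treasure\.php\?trenum=[0-9]{5}")

    # Hypothetical hrefs, as they might be pulled out of a thread page.
    hrefs = [
        "forum-53-1.html",
        "thread-1044711-1-1.html",
        "http://bbs.51testing.com/treasure/treasure.php?trenum=12345",
    ]

    print([url for url in hrefs if pattern.search(url)])  # only the treasure.php link survives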

     

    Python script

    #!/usr/bin/env python

     

    # -*- coding:utf-8 -*-

     

    from urllib.request import Request, urlopen  # import only the names the script uses

    import gzip, re

    from io import BytesIO

    from html.parser import HTMLParser

     

    # Crawler class

    class Reptile:

        """to download web pages"""

     

        def __init__(self):

            self.url_set = set()  # URLs of pages already downloaded

            self.data = ""

        

        # Download a page

        def get_page(self, url, headers):

            request = Request(url, headers=headers)

            request.add_header('Accept-Encoding', 'gzip')  # request a gzip-compressed response to cut network traffic

     

            try:

                response = urlopen(request)  # send the request

       

                if response.code == 200:  # request succeeded

                    page = response.read()  # read the (possibly compressed) body

                  

                    if response.info().get("Content-Encoding") ==  "gzip":       

                        page_data = BytesIO(page)

                        gzipper = gzip.GzipFile(fileobj = page_data)

                        self.data = gzipper.read().decode("utf-8", errors="ignore")  # decompress, then decode to text (the URLs we need are ASCII, so the exact charset is not critical)

                    else:

                        print("gzip unused")

                        self.data = page.decode("utf-8", errors="ignore")  # page was not gzip-compressed; decode the raw body

            except Exception as e:

                print("failed to fetch %s: %s" % (url, e))  # log the failure and keep crawling

           

            self.url_set.add(url)

                     

            return self.data

     

        # Collect the target board URLs of the forum

        def get_forum_url(self, url_set, home, include):

            forum_url_set = set()  # board URLs found

            while len(url_set) > 0:

                url = url_set.pop()

                if re.findall(include, url):

                    # board URLs read from the page are relative, e.g. forum-53-1.html

                    url = home + url

                    forum_url_set.add(url)

            return forum_url_set

                   

        # Collect the thread URLs under the board URLs

        def get_title_url(self, url_set, home, include):

            title_url_set = set()  # thread URLs found

            while len(url_set) > 0:

                url = url_set.pop()

                if re.findall(include, url):

                    # thread URLs read from the page are relative, e.g. thread-1044711-1-1.html

                    url = home + url

                    title_url_set.add(url)

            return title_url_set

      

     

    # Parser class

    class MyHtmlParser(HTMLParser):

        def reset(self):

            HTMLParser.reset(self)  # call the parent reset first; order matters

            self.url_set = set()

           

        def handle_starttag(self, tag, attrs):


            url_list = [value for key, value in attrs if "href" == key]

            if url_list:

                for url in url_list:

                    self.url_set.add(url)

             

       

    ############## Test ################

    # Set request headers to pose as a browser; some sites refuse obvious crawlers

    headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0"}

     

     

    init_url = "http://bbs.51testing.com/forum.php"

     

    # Build the parser

    parser = MyHtmlParser(strict=False)  # strict is accepted on Python 3.3 but was removed in 3.5; drop it on newer interpreters

     

    # Download the entry page

    page_number = 1

    print("program is downloading the frist url page")

    reptile = Reptile()

    page = reptile.get_page(init_url, headers)

     

    print("processing the %dth url page" % page_number)

    # Parse the page and collect its URLs

    parser.feed(page)  # page is already decoded text

     

    # Collect the category board URLs

    home = "http://bbs.51testing.com/"

    # split across several variables purely for readability

    pattern1 = "forum-122-[1-9]|forum-243-[1-9]|forum-40-[1-9]|forum-63-[1-9]"

    pattern2 = "|forum-42-[1-9]|forum-53-[1-9]|forum-275-[1-9]|forum-140-[1-9]"

    pattern3 = "|forum-138-[1-9]|forum-139-[1-9]|forum-141-[1-9]"

    pattern = pattern1 + pattern2 + pattern3

    include = re.compile(pattern)
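
    # For example, include.findall("forum-53-1.html") returns ['forum-53-1'], so
    # get_forum_url keeps that href and prefixes it with home to build an
    # absolute URL such as http://bbs.51testing.com/forum-53-1.html.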

     

    forum_url_set = reptile.get_forum_url(parser.url_set, home, include)

     

    # For each category board, collect the URLs of its paginated pages (the patterns above match pages 1-9)

    result_url_set = set()

    forum_index = 1

    for forum_url in forum_url_set:

        page = reptile.get_page(forum_url, headers)

        parser.feed(page)

       

        print("getting the board urls in the %dth forum page" % forum_index)

        tmp_url_set = reptile.get_forum_url(parser.url_set, home, include)

        forum_index = forum_index + 1

       

        result_url_set = result_url_set | tmp_url_set  # union; ^ (symmetric difference) would drop URLs that appear on more than one page

     

    title_url_set = set()

    forum_index = 1


    for forum_url in result_url_set:

        page = reptile.get_page(forum_url, headers)

        parser.feed(page)

       

        # Patterns for the thread URLs under a board (6- or 7-digit thread id, 1- or 2-digit trailing page number)

        pattern1 = "thread-[0-9]{7}-[0-9]{1}-[0-9]{1}[.]html|"

        pattern2 = "thread-[0-9]{6}-[0-9]{1}-[0-9]{1}[.]html|"

        pattern3 = "thread-[0-9]{7}-[0-9]{1}-[0-9]{2}[.]html|"

        pattern4 = "thread-[0-9]{6}-[0-9]{1}-[0-9]{2}[.]html"

        pattern = pattern1 + pattern2 + pattern3 + pattern4

        include = re.compile(pattern)

       

        print("getting all title urls in the %dth forum board" % forum_index)

        tmp_url_set = reptile.get_title_url(parser.url_set, home, include)

        forum_index = forum_index + 1

       

        title_url_set = title_url_set | tmp_url_set  # union, as above

     

      

    # Extract the target URLs

    target_index = 1

    title_index = 1

    filepath = "d:/url.txt"

    # The target link pattern: treasure.php with a 5-digit trenum parameter (compiled once, outside the loop)
    include = re.compile("http://bbs.51testing.com/treasure/treasure.php[?]trenum=[0-9]{5}")

    for title_url in title_url_set:

        print("processing the %dth title url" % title_index)

        page = reptile.get_page(title_url, headers)

        parser.feed(page)

       

        # Save the target URLs

        with open(filepath, "a") as f:

            while len(parser.url_set) > 0:

                url = parser.url_set.pop()

                if include.search(url):  # keep only hrefs that match the target pattern

                    print("find target! saving the %dth target url in the %dth title page" % (target_index, title_index))

                    f.write("the %dth url: %s" % (target_index, url))

                    target_index = target_index + 1

                    f.write(" ")

        title_index = title_index + 1

     

    print("complete")

     

    Results:

    [Figures: console output of the crawl, and the saved d:/url.txt with the target URLs]
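
    To sanity-check a run, the saved file can be read back and deduplicated. A minimal sketch, assuming the output path and record format written by the script above:

    import re

    # Pull every target URL back out of the "the Nth url: ..." records.
    with open("d:/url.txt") as f:
        urls = re.findall(r"http://bbs\.51testing\.com/treasure/treasure\.php\?trenum=[0-9]{5}", f.read())

    print("%d URLs saved, %d unique" % (len(urls), len(set(urls))))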

    Disclaimer: for learning and research only; do not use this for any illegal purpose.
