
    Writing a Simple Web Crawler in Python: Example 3

    by: 授客 (QQ 1033553122)

    Environment

    Python version: 3.3.5 (the script raises errors under Python 2.7)

     

    Objective

    Retrieve specific target URLs from the site http://bbs.51testing.com/forum.php. Analysis shows the following relationship between the target URLs and the site's other URLs:


     
         

     

    [Figure: URL relationship diagram. The entry page forum.php links to board pages (forum-*.html); each board links to thread pages (thread-*.html); the target treasure.php links appear inside thread pages.]

     

    The target URLs are scattered at random through the posts on those sub-pages; the crawler has to dig them out. A quick sketch of the matching idea follows.
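
    Here is a minimal, self-contained sketch of that idea: the target links all have the form http://bbs.51testing.com/treasure/treasure.php?trenum=NNNNN, so a regular expression is enough to pick them out of a page's href values. The sample hrefs below are made up for illustration:

    import re

    # The same target pattern the full script uses further down.
    pattern = re.compile(r"http://bbs\.51testing\.com/treasure/treasure\.php\?trenum=[0-9]{5}")

    # Hypothetical hrefs, as they might be pulled out of a thread page.
    hrefs = [
        "forum-53-1.html",
        "thread-1044711-1-1.html",
        "http://bbs.51testing.com/treasure/treasure.php?trenum=12345",
    ]

    print([url for url in hrefs if pattern.search(url)])  # only the treasure.php link survives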

     

    Python script

    #!/usr/bin/env python

     

    # -*- coding:utf-8 -*-

     

    from urllib.request import Request, urlopen  # import only the names the script uses

    import gzip, re

    from io import BytesIO

    from html.parser import HTMLParser

     

    # Crawler class

    class Reptile:

        """to download web pages"""

     

        def __init__(self):

            self.url_set = set()  # URLs of pages already downloaded

            self.data = ""

        

        # Download a page

        def get_page(self, url, headers):

            request = Request(url, headers=headers)

            request.add_header('Accept-Encoding', 'gzip')  # request a gzip-compressed response to cut network traffic

     

            try:

                response = urlopen(request)  # send the request

       

                if response.code == 200:  # request succeeded

                    page = response.read()  # read the (possibly compressed) body

                  

                    if response.info().get("Content-Encoding") ==  "gzip":       

                        page_data = BytesIO(page)

                        gzipper = gzip.GzipFile(fileobj = page_data)

                        self.data = gzipper.read().decode("utf-8", errors="ignore")  # decompress, then decode to text (the URLs we need are ASCII, so the exact charset is not critical)

                    else:

                        print("gzip unused")

                        self.data = page.decode("utf-8", errors="ignore")  # page was not gzip-compressed; decode the raw body

            except Exception as e:

                print("failed to fetch %s: %s" % (url, e))  # log the failure and keep crawling

           

            self.url_set.add(url)

                     

            return self.data

     

        # Collect the target board URLs of the forum

        def get_forum_url(self, url_set, home, include):

            forum_url_set = set()  # board URLs found

            while len(url_set) > 0:

                url = url_set.pop()

                if re.findall(include, url):

                    # board URLs read from the page are relative, e.g. forum-53-1.html

                    url = home + url

                    forum_url_set.add(url)

            return forum_url_set

                   

        # Collect the thread URLs under the board URLs

        def get_title_url(self, url_set, home, include):

            title_url_set = set()  # thread URLs found

            while len(url_set) > 0:

                url = url_set.pop()

                if re.findall(include, url):

                    # thread URLs read from the page are relative, e.g. thread-1044711-1-1.html

                    url = home + url

                    title_url_set.add(url)

            return title_url_set

      

     

    # Parser class

    class MyHtmlParser(HTMLParser):

        def reset(self):

            HTMLParser.reset(self)  # call the parent reset first; order matters

            self.url_set = set()

           

        def handle_starttag(self, tag, attrs):


            url_list = [value for key, value in attrs if "href" == key]

            if url_list:

                for url in url_list:

                    self.url_set.add(url)

             

       

    ############## Test ################

    # Set request headers to pose as a browser; some sites refuse obvious crawlers

    headers = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0"}

     

     

    init_url = "http://bbs.51testing.com/forum.php"

     

    # Build the parser

    parser = MyHtmlParser(strict=False)  # strict is accepted on Python 3.3 but was removed in 3.5; drop it on newer interpreters

     

    # Download the entry page

    page_number = 1

    print("program is downloading the frist url page")

    reptile = Reptile()

    page = reptile.get_page(init_url, headers)

     

    print("processing the %dth url page" % page_number)

    # Parse the page and collect its URLs

    parser.feed(page)  # page is already decoded text

     

    # Collect the category board URLs

    home = "http://bbs.51testing.com/"

    # split across several variables purely for readability

    pattern1 = "forum-122-[1-9]|forum-243-[1-9]|forum-40-[1-9]|forum-63-[1-9]"

    pattern2 = "|forum-42-[1-9]|forum-53-[1-9]|forum-275-[1-9]|forum-140-[1-9]"

    pattern3 = "|forum-138-[1-9]|forum-139-[1-9]|forum-141-[1-9]"

    pattern = pattern1 + pattern2 + pattern3

    include = re.compile(pattern)
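
    # For example, include.findall("forum-53-1.html") returns ['forum-53-1'], so
    # get_forum_url keeps that href and prefixes it with home to build an
    # absolute URL such as http://bbs.51testing.com/forum-53-1.html.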

     

    forum_url_set = reptile.get_forum_url(parser.url_set, home, include)

     

    # For each category board, collect the URLs of its paginated pages (the patterns above match pages 1-9)

    result_url_set = set()

    forum_index = 1

    for forum_url in forum_url_set:

        page = reptile.get_page(forum_url, headers)

        parser.feed(page)

       

        print("getting the board urls in the %dth forum page" % forum_index)

        tmp_url_set = reptile.get_forum_url(parser.url_set, home, include)

        forum_index = forum_index + 1

       

        result_url_set = result_url_set | tmp_url_set  # union; ^ (symmetric difference) would drop URLs that appear on more than one page

     

    title_url_set = set()

    forum_index = 1


    for forum_url in result_url_set:

        page = reptile.get_page(forum_url, headers)

        parser.feed(page)

       

        # Patterns for the thread URLs under a board (6- or 7-digit thread id, 1- or 2-digit trailing page number)

        pattern1 = "thread-[0-9]{7}-[0-9]{1}-[0-9]{1}[.]html|"

        pattern2 = "thread-[0-9]{6}-[0-9]{1}-[0-9]{1}[.]html|"

        pattern3 = "thread-[0-9]{7}-[0-9]{1}-[0-9]{2}[.]html|"

        pattern4 = "thread-[0-9]{6}-[0-9]{1}-[0-9]{2}[.]html"

        pattern = pattern1 + pattern2 + pattern3 + pattern4

        include = re.compile(pattern)

       

        print("getting all title urls in the %dth forum board" % forum_index)

        tmp_url_set = reptile.get_title_url(parser.url_set, home, include)

        forum_index = forum_index + 1

       

        title_url_set = title_url_set | tmp_url_set  # union, as above

     

      

    # Extract the target URLs

    target_index = 1

    title_index = 1

    filepath = "d:/url.txt"

    # The target link pattern: treasure.php with a 5-digit trenum parameter (compiled once, outside the loop)
    include = re.compile("http://bbs.51testing.com/treasure/treasure.php[?]trenum=[0-9]{5}")

    for title_url in title_url_set:

        print("processing the %dth title url" % title_index)

        page = reptile.get_page(title_url, headers)

        parser.feed(page)

       

        # Save the target URLs

        with open(filepath, "a") as f:

            while len(parser.url_set) > 0:

                url = parser.url_set.pop()

                if include.search(url):  # keep only hrefs that match the target pattern

                    print("find target! saving the %dth target url in the %dth title page" % (target_index, title_index))

                    f.write("the %dth url: %s" % (target_index, url))

                    target_index = target_index + 1

                    f.write(" ")

        title_index = title_index + 1

     

    print("complete")

     

    Results:

    [Figures: console output of the crawl, and the saved d:/url.txt with the target URLs]
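
    To sanity-check a run, the saved file can be read back and deduplicated. A minimal sketch, assuming the output path and record format written by the script above:

    import re

    # Pull every target URL back out of the "the Nth url: ..." records.
    with open("d:/url.txt") as f:
        urls = re.findall(r"http://bbs\.51testing\.com/treasure/treasure\.php\?trenum=[0-9]{5}", f.read())

    print("%d URLs saved, %d unique" % (len(urls), len(set(urls))))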

    Disclaimer: for learning and research only; do not use this for any illegal purpose.
