  • Scraping Wang Yin's blog and generating PDFs

    Not finished yet; still needs improvement.
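
    Note that pdfkit is a wrapper around the wkhtmltopdf command-line tool, so wkhtmltopdf has to be installed separately; if the binary is not on PATH, pdfkit can be pointed at it with an explicit configuration. A minimal sketch (not part of the script below; the install path is only an example):

    # A minimal configuration sketch: point pdfkit at the wkhtmltopdf binary
    # when it is not on PATH. The path below is only an example.
    import pdfkit

    config = pdfkit.configuration(wkhtmltopdf=r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe")
    pdfkit.from_string("<h1>test</h1>", "test.pdf", configuration=config)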

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    __author__ = 'jiangwenwen'
    import pdfkit
    import time
    import requests
    import random
    from bs4 import BeautifulSoup
    from fake_useragent import UserAgent
    
    # Request headers
    ua = UserAgent()
    
    headers = {
        'cache-control': "no-cache",
        "Host": "www.yinwang.org",
        "User-Agent": ua.random,
        "Referer": "http://www.yinwang.org/",
    }
    
    # IP proxy pool
    ip_pool = ['123.55.114.217:9999',
               '110.52.235.91:9999',
               '183.163.43.61:9999',
               '119.101.126.52:9999',
               '119.101.124.165:9999',
               '119.101.125.38:9999',
               '119.101.125.84:9999',
               '110.52.235.80:9999',
               '119.101.125.49:9999',
               '110.52.235.162:9999',
               '119.101.124.23:9999'
               ]
    
    
    # Render the page at the given URL into a PDF file
    def print_pdf(url, file_name):
        start = time.time()
        print("Printing...")
        # Rotate the User-Agent on every request
        headers["User-Agent"] = ua.random
        print("User-Agent: {0}".format(headers["User-Agent"]))
        content = requests.get(url, headers=headers, timeout=3, proxies=get_proxy(ip_pool)).text
        pdfkit.from_string(content, file_name)
        end = time.time()
        print("Printed successfully in %0.2f seconds" % (end - start))
    
    
    # Return the first working proxy from the pool (None if none respond,
    # in which case requests falls back to a direct connection)
    def get_proxy(ip_pool):
        for ip in ip_pool:
            url = "http://www.yinwang.org/"
            # Use requests to check whether this IP is usable
            try:
                requests.get(url, proxies={"http": "http://{}".format(ip)}, timeout=3)
            except requests.RequestException:
                continue
            else:
                proxies = {
                    "http": "http://{}".format(ip),
                    "https": "http://{}".format(ip),
                }
                return proxies
    
    
    response = requests.get("http://www.yinwang.org/", headers=headers, proxies=get_proxy(ip_pool))
    soup = BeautifulSoup(response.content, 'html.parser')
    tags = soup.find_all("li", class_="list-group-item title")
    
    for child in tags:
        article_url = "http://www.yinwang.org" + child.a.get('href')
        # Note the escaped backslash; "桌面" is the output directory
        article_file_name = "桌面\\" + child.a.string + ".pdf"
        print_pdf(article_url, article_file_name)
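
    One direction for the "needs improvement" note above: right now a single timeout or dead proxy aborts the whole run. A minimal retry sketch around print_pdf (print_pdf_with_retry is not part of the original script; the retry count and sleep range are arbitrary choices):

    # A minimal retry sketch: attempt an article a few times before giving up,
    # letting print_pdf pick a fresh User-Agent/proxy on each attempt.
    def print_pdf_with_retry(url, file_name, retries=3):
        for attempt in range(retries):
            try:
                print_pdf(url, file_name)
                return True
            except Exception as e:
                print("Attempt {0} failed: {1}".format(attempt + 1, e))
                time.sleep(random.uniform(1, 3))
        return False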
    
    
    
    
  • Original post: https://www.cnblogs.com/jiangwenwen1/p/10328339.html