zoukankan      html  css  js  c++  java
  • Python-爬虫-懒得写的部分

    requests

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    import requests
    import re
    
    url = ""  # target page URL (fill in before running)
    hd = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0"}
    # Optional proxy mapping; it is not used below. Pass `proxies=px` to
    # requests.get() to route the request through it.
    px = {"http": "http://127.0.0.1:8888"}
    rst = requests.get(url, headers=hd)
    # Decode the raw body bytes as GBK. Going through rst.text and re-encoding
    # with rst.encoding breaks when no charset was detected (encoding is None)
    # and can lose bytes that were replaced during the first decode.
    data = rst.content.decode("gbk", "ignore")
    # findall returns a list with the text of every <title> element.
    title = re.compile("<title>(.*?)</title>", re.S).findall(data)
    
    

    urllib

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    import urllib
    import urllib.request
    import re
    import random
    
    # Browser disguise: install a global opener that sends a desktop Chrome
    # User-Agent with every urllib.request.urlopen() call.
    opener = urllib.request.build_opener()
    UA = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36")
    opener.addheaders = [UA]
    urllib.request.install_opener(opener)

    url = ""  # target page URL (fill in before running)
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read.
    with urllib.request.urlopen(url) as resp:
        data = resp.read().decode('utf-8', 'ignore')
    
    # Build the User-Agent pool.
    # Every entry must end with a comma: adjacent string literals without one
    # are concatenated into a single string, which would leave the pool with
    # one invalid entry.
    uapools = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
        # Add more User-Agent strings here (do not leave an empty string,
        # or requests may go out with a blank User-Agent).
    ]
    
    def UA():
        """Install a global urllib opener with a randomly chosen User-Agent.

        Picks one entry from the module-level ``uapools`` list and installs an
        opener carrying it, so later ``urllib.request.urlopen`` calls send that
        header. Returns nothing.
        """
        opener = urllib.request.build_opener()
        thisua = random.choice(uapools)
        # The header list lives in `addheaders`; assigning to any other
        # attribute name is silently accepted and leaves the default
        # User-Agent in place.
        opener.addheaders = [("User-Agent", thisua)]
        urllib.request.install_opener(opener)
        
    # Fetch the page ten times, rotating the User-Agent before each request.
    for _ in range(10):
        UA()
        # Context manager closes the response once the body has been read.
        with urllib.request.urlopen(url) as resp:
            data = resp.read().decode('utf-8', 'ignore')
    

    范例

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    import urllib.request
    import re
    import random
    import time
    
    # Pool of desktop browser User-Agent strings that UA() picks from.
    uapools = [
        # Firefox 75 on 64-bit Windows 10
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
        # Chrome 80 on 64-bit Windows 10
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36",
    ]
    
    def UA():
        """Install a global urllib opener with a randomly chosen User-Agent.

        Draws one string from the module-level ``uapools`` list, installs an
        opener that sends it as the ``User-Agent`` header, and prints which
        one was selected.
        """
        new_opener = urllib.request.build_opener()
        chosen = random.choice(uapools)
        new_opener.addheaders = [("User-Agent", chosen)]
        urllib.request.install_opener(new_opener)
        print("当前使用UA: " + str(chosen))
    
    # Build the regex here (placeholder). It does not change between pages,
    # so it is compiled once instead of on every iteration.
    pat = ''
    matcher = re.compile(pat, re.S)

    for i in range(0, 35):  # total number of pages
        UA()  # rotate the User-Agent before each request
        thisurl = ""  # build the URL for page i
        try:
            # Context manager closes the response once the body has been read.
            with urllib.request.urlopen(thisurl) as resp:
                data = resp.read().decode('utf-8', 'ignore')
            rst = matcher.findall(data)
            for item in rst:  # print every match followed by a separator
                print(item)
                print("------")
        except Exception as err:
            # Best-effort crawl: one bad page must not stop the run, but the
            # failure is reported instead of being silently discarded.
            print("page " + str(i) + " failed: " + repr(err))
    
  • 相关阅读:
    AspNetPager多参数传值
    PHP5.3不支持zend debugger, 安装Xdebug调试工具
    WebClient模拟Post发送接收数据
    Newtonsoft.Json序列化和反序列
    VS.PHP 调试错误:Apache Http server已停止工作
    Ubuntu 10.04 下 xampp 安装教程
    java之递归学习
    产品经理值得交的10个朋友
    不用windows安装盘安装64位win7或windows server 2008的方法(32位winpe下安装64位的办法)
    全生命周期研发流程
  • 原文地址:https://www.cnblogs.com/hare1925/p/13083516.html
Copyright © 2011-2022 走看看