zoukankan      html  css  js  c++  java
  • Python-爬虫-懒得写的部分

    requests

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    import requests
    import re
    
    # Request settings. `url` is a placeholder and must be filled in before running.
    url = ""
    hd = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",}
    px = {"http":"http://127.0.0.1:8888"} # proxy (defined here but not passed to the request below)
    rst = requests.get(url, headers = hd)
    # Decode the raw response bytes as GBK, dropping undecodable bytes.
    # Working from `rst.content` avoids re-encoding `rst.text` with
    # `rst.encoding`, which may be None when the server declares no charset.
    data = rst.content.decode("gbk", "ignore")
    # re.S lets "." span newlines, so a <title> broken across lines still matches.
    title = re.compile(r"<title>(.*?)</title>", re.S).findall(data)
    
    

    urllib

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    import urllib
    import urllib.request
    import re
    import random
    
    # Browser disguise: install a global opener whose only default header is a
    # desktop Chrome User-Agent, so that plain urlopen() calls use it.
    UA = (
        "User-Agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36",
    )
    opener = urllib.request.build_opener()
    opener.addheaders = [UA]
    urllib.request.install_opener(opener)

    # Target address (left blank as a placeholder in this snippet).
    url = ""
    # Fetch the page and decode it as UTF-8, ignoring undecodable bytes.
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8', 'ignore')
    
    # Build the User-Agent pool.
    # Every entry needs its own trailing comma: adjacent string literals
    # without one are concatenated into a single string. Do not add empty
    # strings either, since random.choice could pick one and send a blank
    # User-Agent.
    uapools = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
    ]

    def UA():
        """Install a global urllib opener that sends a randomly chosen User-Agent.

        Picks one entry from ``uapools`` and makes it the only default header
        of the opener used by subsequent ``urllib.request.urlopen`` calls.
        """
        opener = urllib.request.build_opener()
        thisua = random.choice(uapools)
        ua = ("User-Agent", thisua)
        # The OpenerDirector attribute is ``addheaders``; assigning to any
        # other attribute name is silently accepted and leaves the default
        # Python-urllib User-Agent in place.
        opener.addheaders = [ua]
        urllib.request.install_opener(opener)
        # Debug aid: print("Current UA: " + str(thisua))
        
    # Fetch the page ten times, calling UA() before each request so a
    # User-Agent is re-drawn from the pool every time.
    for _ in range(10):
        UA()
        response = urllib.request.urlopen(url)
        data = response.read().decode('utf-8', 'ignore')
    

    范例

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    import urllib.request
    import re
    import random
    import time
    
    # Pool of desktop browser User-Agent strings to rotate through.
    uapools = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36",
    ]

    def UA():
        """Pick a random User-Agent from ``uapools`` and install it globally.

        The chosen string becomes the only default header of a new opener,
        which is installed as the opener used by ``urllib.request.urlopen``.
        The selected value is printed.
        """
        picked = random.choice(uapools)
        rotating_opener = urllib.request.build_opener()
        rotating_opener.addheaders = [("User-Agent", picked)]
        urllib.request.install_opener(rotating_opener)
        print("当前使用UA: " + str(picked))
    
    # Crawl every page: rotate the User-Agent, fetch the page, then extract
    # and print each regex match followed by a separator line.
    for i in range(0, 35):  # total number of pages
        UA()
        thisurl = ""  # build the URL for page i here
        try:
            data = urllib.request.urlopen(thisurl).read().decode('utf-8', 'ignore')
            pat = ''  # build the extraction regex here
            rst = re.compile(pat, re.S).findall(data)
            for item in rst:  # print each match
                print(item)
                print("------")
        except Exception as err:
            # Best-effort crawl: one failing page must not abort the remaining
            # pages, but the failure is reported instead of being hidden.
            print("page " + str(i) + " failed: " + repr(err))
    
  • 相关阅读:
    关于在调用JAVAFX相关包时遇到Access restriction: The type 'Application' is not API (restriction on required library)的解决方法
    JS 获取随机颜色值
    JS jQuery 点击页面漂浮出文字
    JQ 获取浏览器窗口宽高
    JQ 操作css
    JQ 遍历--(祖先,后代,同胞,过滤)
    JQ DOM元素 创建 添加 删除
    jQuery 效果
    3
    webpack 打包CSS 引入图片
  • 原文地址:https://www.cnblogs.com/hare1925/p/13083516.html
Copyright © 2011-2022 走看看