  • day01

    1. Introduction to crawlers: a crawler (spider) is a program that automatically sends HTTP requests to web pages and extracts data from the responses; the notes below build one step by step with Python's built-in urllib.

    2. The first crawler program:

    from urllib.request import urlopen

    url = "http://www.baidu.com"  # note: http, not https

    response = urlopen(url)
    info = response.read()       # read the response body (bytes)
    # print(info.decode())       # decode the bytes; decode() defaults to utf-8

    print(response.getcode())    # status code, e.g. 200
    print(response.geturl())     # the final (real) URL: http://www.baidu.com
    print(response.info())       # response headers
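
    A minimal sketch of catching failed requests with urllib.error (not covered above); HTTPError handles non-2xx status codes and URLError handles network-level failures:

    from urllib.request import urlopen
    from urllib.error import HTTPError, URLError

    url = "http://www.baidu.com"
    try:
        response = urlopen(url, timeout=5)   # timeout so the call cannot hang forever
        print(response.getcode())            # 200 on success
    except HTTPError as e:
        print("server returned an error status:", e.code)
    except URLError as e:
        print("could not reach the server:", e.reason)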

    3. Adding request headers with Request:

    from urllib.request import urlopen, Request
    from random import choice

    url = "http://www.baidu.com"  # note: http, not https
    User_Agent = [
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
    ]
    headers = {
        "User-Agent": choice(User_Agent)         # pick a random User-Agent
    }
    request = Request(url, headers=headers)
    print(request.get_header("User-agent"))      # read the User-Agent back from the request
    response = urlopen(request)
    info = response.read()                       # read the response body (bytes)
    print(info.decode())                         # decode the bytes; decode() defaults to utf-8

    4. choice from random (plus fake_useragent):

    from random import choice
    from fake_useragent import UserAgent

    User_Agent = [
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
    ]

    c = choice(User_Agent)  # pick one User-Agent string at random
    print(c)

    ua = UserAgent()        # fake_useragent generates User-Agent strings for you
    print(ua.chrome)
    print(ua.firefox)
    print(ua.ie)
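
    In practice (as in section 6 below) the random attribute is the most convenient, since it returns a different browser string each call; a short sketch:

    from fake_useragent import UserAgent
    from urllib.request import Request, urlopen

    headers = {"User-Agent": UserAgent().random}   # a different User-Agent on every run
    request = Request("http://www.baidu.com", headers=headers)
    print(urlopen(request).getcode())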

    5. Handling Chinese characters in GET requests:

    from urllib.request import urlopen, Request
    from urllib.parse import quote, urlencode

    # url = "https://www.baidu.com/s?wd=尚学堂"   # fails: non-ASCII characters must be percent-encoded

    # Handling Chinese, option 1: quote() percent-encodes a single value
    wd = quote("尚学堂")
    # url = "https://www.baidu.com/s?wd={}".format(wd)

    # Handling Chinese, option 2: urlencode() encodes a whole dict of query parameters
    args = {
        "wd": "尚学堂",
        "id": "11"
    }
    url = "https://www.baidu.com/s?{}".format(urlencode(args))
    # print(url)  # https://www.baidu.com/s?wd=%E5%B0%9A%E5%AD%A6%E5%A0%82&id=11

    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    }
    request = Request(url, headers=headers)
    response = urlopen(request)
    print(response.read().decode())
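
    For reference, urllib.parse can also reverse the encoding, which is handy for checking what a crawler actually sends; a short sketch:

    from urllib.parse import quote, unquote, urlencode, parse_qs

    encoded = quote("尚学堂")
    print(encoded)             # %E5%B0%9A%E5%AD%A6%E5%A0%82
    print(unquote(encoded))    # 尚学堂

    query = urlencode({"wd": "尚学堂", "id": "11"})
    print(parse_qs(query))     # {'wd': ['尚学堂'], 'id': ['11']}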

    6. Crawling Baidu Tieba:

    from urllib.request import urlopen, Request
    from urllib.parse import urlencode
    from fake_useragent import UserAgent

    def get_html(num, title):
        base_url = "http://tieba.baidu.com/f?{}"
        headers = {
            "User-Agent": UserAgent().random
        }
        for pn in range(int(num)):
            args = {
                "kw": title,
                "ie": "utf-8",
                "pn": pn * 50          # Tieba pages advance in steps of 50 posts
            }
            url = base_url.format(urlencode(args))
            request = Request(url, headers=headers)
            print("Crawling page", pn + 1)
            response = urlopen(request)
            filename = "{}-page-{}.html".format(title, pn + 1)
            save(filename, response.read())

    def save(filename, content):
        with open(filename, "wb") as f:   # content is bytes, so write in binary mode
            f.write(content)

    def main():
        num = input("Number of pages to crawl: ")
        title = input("Topic (kw) to crawl: ")
        get_html(num, title)   # call the crawler

    if __name__ == "__main__":
        main()
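
    As a quick check before firing real requests, the URL that get_html() builds can be printed on its own; a small sketch:

    from urllib.parse import urlencode

    base_url = "http://tieba.baidu.com/f?{}"
    args = {"kw": "尚学堂", "ie": "utf-8", "pn": 50}   # pn=50 is the second page
    print(base_url.format(urlencode(args)))
    # http://tieba.baidu.com/f?kw=%E5%B0%9A%E5%AD%A6%E5%A0%82&ie=utf-8&pn=50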

    7. Logging in with a POST request:

    from urllib.request import Request, urlopen
    from urllib.parse import urlencode

    url = "https://www.sxt.cn/index/login/login.html"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"
    }
    data = {
        "user": "17703181473",
        "password": "12346"
    }
    # A Request with data= becomes a POST: the dict is turned into a query string
    # with urlencode(), then into bytes with encode() -- data must be bytes.
    request = Request(url, data=urlencode(data).encode(), headers=headers)
    response = urlopen(request)
    print(response.read().decode())
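
    A login is only useful if the session cookie is kept for later requests; a minimal sketch using the standard-library cookie handler (not part of the original notes), with the same url, data and headers as above:

    from http.cookiejar import CookieJar
    from urllib.request import Request, build_opener, HTTPCookieProcessor
    from urllib.parse import urlencode

    cookie_jar = CookieJar()
    opener = build_opener(HTTPCookieProcessor(cookie_jar))   # opener that remembers cookies

    login = Request("https://www.sxt.cn/index/login/login.html",
                    data=urlencode({"user": "17703181473", "password": "12346"}).encode(),
                    headers={"User-Agent": "Mozilla/5.0"})
    print(opener.open(login).read().decode())
    # any later opener.open(...) call reuses the cookies stored during login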
  • Original post: https://www.cnblogs.com/fangsheng/p/10310329.html