zoukankan      html  css  js  c++  java
  • 爬虫 http原理,梨视频,github登陆实例,requests请求参数小总结

    回顾:HTTP 协议基于请求-响应模式。请求由请求首行、请求头({'key': 'value'})、请求体组成;响应由响应首行、响应头({'key': 'value'})、响应体组成。

    import socket
    
    
    # Minimal HTTP server: every accepted connection gets login.html back.
    sock = socket.socket()
    sock.bind(("127.0.0.1", 8808))
    sock.listen(5)

    while True:
        print("server waiting.....")
        conn, addr = sock.accept()
        try:
            # Raw request bytes sent by the browser (request line + headers + body)
            request = conn.recv(1024)
            print("data", request)

            # Read the HTML file that is served as the response body
            with open("login.html", "rb") as f:
                body = f.read()

            # Status line, one header, blank line, then the body.
            # sendall keeps writing until the whole response has been sent;
            # plain send may stop after a partial write.
            conn.sendall(
                b"HTTP/1.1 200 OK\r\nContent-type:text/html\r\n\r\n%s" % body
            )
        finally:
            # Always release the connection, even if reading the file fails
            conn.close()
    基于socket的浏览器交互
    '''
        GET请求
        # 请求首行
        GET / HTTP/1.1
    
        # get请求后面的参数
        b'GET /?name=wd&age=11 HTTP/1.1
    
        # 请求头
        Host: 127.0.0.1:8008
    
        Connection: keep-alive
    
        Cache-Control: max-age=0
    
        Upgrade-Insecure-Requests: 1
    
        User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36
    
    Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
    Accept-Encoding: gzip, deflate, br
    
    Accept-Language: zh-CN,zh;q=0.9
     Cookie:csrftoken=7xx6BxQDJ6KB0PM7qS8uTA892ACtooNbnnF4LDwlYk1Y7S7nTS81FBqwruizHsxF
    
    '
        # 请求体(get请求,请求体为空)    
        '''
       b''
        '''
        POST请求
        # 请求首行
        b'POST /?name=wd&age=11 HTTP/1.1
    
        # 请求头
        Host: 127.0.0.1:8008
    
    Connection: keep-alive
    
    Content-Length: 21
    
    Cache-Control: max-age=0
    
    Origin: http://127.0.0.1:8008
    
    Upgrade-Insecure-Requests: 1
    
    Content-Type: application/x-www-form-urlencoded
    
    User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36
    
    Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
    
    Referer: http://127.0.0.1:8008/?name=lqz&age=18
    
    Accept-Encoding: gzip, deflate, br
    
    Accept-Language: zh-CN,zh;q=0.9
    
    Cookie:csrftoken=7xx6BxQDJ6KB0PM7qS8uTA892ACtooNbnnF4LDwlYk1Y7S7nTS81FBqwruizHsxF
    
    '
        # 请求体
        b'name=wd&password=11'
        
        '''
     
    请求
    b"HTTP/1.1 200 OK
    
    Content-type:text/html
    
    
    %s"%data
    响应

    http原理

    点击详情

    梨视频案例

    #返回数据3种格式
    #1.text                    匹配需要的东西 
    #2.content(二进制)    保存成图片,视频等
    #3.json                    反序列化成字典或列表
    
    #下载功能
    def download(videos,title):
        """Fetch ``videos`` with an HTTP GET and save the body as ``video/<title>.mp4``.

        Args:
            videos: URL passed to ``requests.get``.
            title: File name (without extension) used inside the ``video`` directory.
        """
        # exist_ok=True: this function runs concurrently in a thread pool, and a
        # separate exists()/mkdir() check can race and raise FileExistsError.
        os.makedirs('video', exist_ok=True)
        path = os.path.join('video', title) + '.mp4'
        res = requests.get(videos)
        # res.content is the raw response body as bytes
        with open(path, 'wb') as f:
            f.write(res.content)
    
    #起线程执行
    if __name__ == '__main__':
        from concurrent.futures import ThreadPoolExecutor

        # Leaving the with-block calls shutdown(wait=True), so all submitted
        # downloads are awaited and the pool is released even if the loop raises.
        with ThreadPoolExecutor(10) as p:
            for i in parser_index(get_index()):
                dic = video_info(get_video(i))
                print(dic)
                # Run each download in a worker thread
                p.submit(download, dic['video'], dic['title'])

    #注意:梨视频页面下滑时加载更多视频,是由请求 url 中的参数控制的(例如某个分类下显示多少条视频),抓取更多视频时需要相应调整这些参数。

    github登陆实例

    #get请求登陆页面 获取csrf随机字符串和cookies

    #post请求登陆操作 携带csrf,输入的用户名密码等(请求体数据) 和 cookies,user-agent,referer等(请求头数据) 必须数据

    数据属于请求体还是请求头?(我的理解:比如 ajax 里的 data 属于请求体数据,django 返回的数据属于响应体数据;而 cookie 是通过头部传递的,例如 django 中 response.set_cookie('islogin', 'true') 设置的 cookie 放在响应头里,浏览器之后再通过请求头带回。)

    """
    1.请求登陆页面 获取token cookie
    2.发送登陆的post请求,将用户名密码 和token 放在请求体中,cookie放在请求头中
    
    """
    import requests
    import re
    login_url = "https://github.com/login"
    # Browser identification (User-Agent) sent with every request
    headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
    # Step 1: request the login page
    res1 = requests.get(login_url, headers=headers)

    print(res1.status_code)
    # Extract the CSRF token from the response body; fail with a clear message
    # instead of an AttributeError on None when the page layout does not match.
    token_match = re.search('name="authenticity_token" value="(.*?)"', res1.text)
    if token_match is None:
        raise RuntimeError("authenticity_token not found in the login page")
    token = token_match.group(1)

    # Keep the cookies set by the login page
    login_cookie = res1.cookies.get_dict()
    print(login_cookie)

    # Step 2: send the login request (token and credentials in the request
    # body, cookies and user-agent in the request headers)
    res2 = requests.post(
        "https://github.com/session",
        headers=headers,
        cookies=login_cookie,
        data={
            "commit": "Sign in",
            "utf8": "",
            "authenticity_token": token,
            "login": "xxxxxxxxxxx",
            "password": "xxxxxxxxxxx",
        },
        # Do not follow redirects automatically, so res2 is the direct
        # response to the login POST
        allow_redirects=False,
    )
    print(res2.status_code)

    # Cookies returned by the login response
    user_cookie = res2.cookies.get_dict()

    # Step 3: request a profile page carrying the user's cookies
    res3 = requests.get("https://github.com/settings/profile", cookies=user_cookie, headers=headers)
    print(res3.status_code)
    print(res3.text)
    # "https://github.com/settings/profile"
    # "https://github.com/settings/profile"

    requests请求参数小总结

    # GET request parameters: passed via `params`
    kwd = "吴秀波出轨门"
    url = "https://www.baidu.com/s"
    query = {"wd": kwd}
    requests.get(url, params=query, headers=headers)

    # POST request parameters: form fields passed via `data`
    session_headers = {
        "user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    }
    form_fields = {
        "commit": "Sign in",
        "utf8": "",
        "authenticity_token": token,
        "login": "ssssss",
        "password": "ssssss",
    }
    requests.post(
        "https://github.com/session",
        headers=session_headers,
        cookies=login_cookie,
        data=form_fields,
        # Whether redirects are followed automatically
        allow_redirects=False,
    )
    # Handling the return value
    # response.cookies.get_dict()  # get the cookies
    # response.status_code  # status code
    # response.text  # result returned as text
    # response.content  # result returned as bytes
    # response.json()  # deserialize the data directly into a dict or list
    主要代码内容
  • 相关阅读:
    AngularJS7那些不得不说的事故
    Python和C++的混合编程(使用Boost编写Python的扩展包)
    为OPENCV添加freetype支持并显示中文字符(在mac上编译opencv及contrib库)
    OpenProject基础使用介绍
    负载均衡
    如何搭建wordpress ,wecenter
    nginx 模块
    Nginx
    http 协议
    ssh
  • 原文地址:https://www.cnblogs.com/3sss-ss-s/p/10301020.html
Copyright © 2011-2022 走看看