zoukankan      html  css  js  c++  java
  • 爬虫学习(四)——post请求爬取

    百度翻译爬取数据

    # Baidu Translate "sug" endpoint: POST a keyword, print the suggestion JSON.
    import urllib.request
    import urllib.parse

    post_url = "https://fanyi.baidu.com/sug"
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
    word = input( "请输入要翻译的内容:" )
    data = {"kw": word}

    # Form data must first be URL-encoded into a string, then encoded to bytes.
    # String form, e.g.:  word=%E5%AE%9D%E8%B4%9D
    encoded = urllib.parse.urlencode(data)
    print(encoded)

    # Bytes form, e.g.:  b'word=%E5%AE%9D%E8%B4%9D'
    # Reuse the already-encoded string instead of calling urlencode() a second time.
    body = encoded.encode("utf8")
    print(body)

    request = urllib.request.Request(post_url, headers=headers)
    # Supplying `data` to urlopen() turns this into a POST request.
    response = urllib.request.urlopen(request, data=body)
    print(response.read().decode("utf8"))

    百度翻译爬取数据

    import urllib.request
    import urllib.parse

    # Baidu Translate "v2transapi" endpoint.
    # NOTE(review): `sign` and `token` below are generated per-session by the
    # site's JavaScript; these hard-coded values only work while the matching
    # Cookie is valid — refresh all three from browser dev tools if requests fail.
    apiurl = "https://fanyi.baidu.com/v2transapi"

    # When scraping a target site, first identify its API endpoint and the
    # exact form fields it expects.
    data = {
        'from': 'en',
        'to': 'zh',
        'query': 'baby',
        'transtype': 'realtime',
        'simple_means_flag': '3',
        'sign': '814534.560887',
        'token': '8b44713bb18ae29ba380245d18270565',
    }
    body = urllib.parse.urlencode(data).encode("utf8")

    # For this POST request the header set must be detailed; the key fields
    # (Cookie, Host, Origin, Referer, User-Agent, X-Requested-With) cannot be
    # omitted or the server rejects the request.
    headers = {
        'Cookie': 'PSTM=1528269920; BIDUPSID=7EE884F5F31114F0BCDC2588805B747F; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; to_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; from_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID=C15EE352EEB61222BDA4C2F95822E5EF:SL=0:NR=10:FG=1; pgv_pvi=4516305920; delPer=0; H_PS_PSSID=1436_21101_28206_28131_26350_28139; PSINO=2; locale=zh; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1546425466,1546425533,1546425602,1546484054; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1546487219',
        'Host': 'fanyi.baidu.com',
        'Origin': 'https://fanyi.baidu.com',
        'Referer': 'https://fanyi.baidu.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    request = urllib.request.Request(url=apiurl, headers=headers)
    response = urllib.request.urlopen(request, data=body)
    print(response.read().decode("utf8"))

    ajax的post请求
    爬取KFC的餐厅地址
    # AJAX POST example: query KFC's restaurant-locator endpoint for the store
    # list of a user-supplied city and print the JSON response.
    import urllib.request
    import urllib.parse

    url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}

    # Read the city name from the console.
    keyword = input("请输入需要查询的城市")

    # Form fields the endpoint expects; only `keyword` varies between runs.
    form = {
        "cname": "",
        "pid": "",
        "keyword": keyword,
        "pageIndex": "1",
        "pageSize": "10",
    }
    # URL-encode the form and convert it to the bytes required for a POST body.
    payload = urllib.parse.urlencode(form).encode("utf8")

    # Build the request carrying the header info, then send it; attaching
    # `data` to urlopen() is what makes this a POST rather than a GET.
    req = urllib.request.Request(url, headers=headers)
    resp = urllib.request.urlopen(req, data=payload)

    # Read and print the decoded response body.
    print(resp.read().decode("utf8"))





     

  • 相关阅读:
    POJ 2723 Get Luffy Out(2-SAT)
    ZOJ 3613 Wormhole Transport
    HDU 4085 Peach Blossom Spring
    NBUT 1221 Intermediary
    NBUT 1223 Friends number
    NBUT 1220 SPY
    NBUT 1218 You are my brother
    PAT 1131. Subway Map (30)
    ZSTU OJ 4273 玩具
    ZSTU OJ 4272 最佳淘汰算法
  • 原文地址:https://www.cnblogs.com/kuangkuangduangduang/p/10366222.html
Copyright © 2011-2022 走看看