  • Basic usage of the requests module

    Writing the results into an Excel sheet:

    import requests
    import xlwt
    import json

    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
    }

    # the same request with the query string inlined:
    # url="https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=0&limit=20"

    url="https://movie.douban.com/j/chart/top_list?"

    params={
        'type': '17',
        'interval_id': '100:90',
        'action': '',
        'start': '1',
        'limit': '300',
    }
    response=requests.get(url=url,params=params,headers=headers)
    page_text=response.text
    data=json.loads(page_text)
    # print(data)

    title=["title","score","types","actors"]

    li=[]
    for dic in data:
        # "types" and "actors" are lists in the JSON, so join them into
        # strings that xlwt can write into a single cell
        li.append([
            dic["title"],dic["score"],",".join(dic["types"]),",".join(dic["actors"])
        ])

    wbk=xlwt.Workbook()
    sheet=wbk.add_sheet("movie")

    # header row
    for i in range(len(title)):
        sheet.write(0,i,title[i])

    # data rows start at row 1, directly below the header
    for i in range(len(li)):
        for j in range(len(title)):
            sheet.write(i+1,j,li[i][j])

    wbk.save("douban_movie.xls")
    print("ok")


    • Add a cell: a (above) / b (below)
    • Delete a cell: x
    • Double-click: enter edit mode
    • Switch a cell's mode:
      • y: markdown -> code
      • m: code -> markdown
    • tab: code completion
    • Run a cell: shift+enter
    • Open the help docs:
      • shift+tab

    Coding workflow:

    • Specify the URL
    • Send the request
    • Get the response data
    • Persist the data
    # Crawl the Sogou homepage source

    import requests
    # Specify the URL
    url="https://www.sogou.com/"
    # Send the request
    response=requests.get(url=url)
    # Get the response data
    page_text=response.text
    # Persist the data
    with open("sogou.html","w",encoding="utf-8") as f:
        f.write(page_text)
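
    A small robustness check (my addition, standard requests API): confirm the request succeeded before writing the file.

    response=requests.get(url=url)
    response.raise_for_status()  # raises requests.HTTPError on a 4xx/5xx status
    print(response.status_code)  # 200 on success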
    # A simple web page collector
    # e.g. https://www.sogou.com/web?query=周杰伦
    import requests
    url="https://www.sogou.com/web"
    # Dynamic request parameter
    wd=input("enter a key word:")
    params={
        "query":wd
    }

    # UA spoofing
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
    }

    # params is a dict carrying the dynamic query parameters
    response=requests.get(url=url,params=params,headers=headers)

    # Set the response encoding so Chinese text decodes correctly
    response.encoding="utf-8"

    page_text=response.text
    fileName=wd+'.html'

    with open(fileName,'w',encoding='utf-8') as f:
        f.write(page_text)

    print(fileName,'saved successfully')
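
    A quick way to see what requests actually sent (my addition): the final URL on the response shows the keyword percent-encoded into the query string.

    print(response.url)  # e.g. https://www.sogou.com/web?query=%E5%91%A8%E6%9D%B0%E4%BC%A6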
    # Crawl KFC restaurant locations
    import requests

    url="http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"

    city=input("enter a city:")
    # UA spoofing
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
    }
    # POST form data
    data={
        'cname': '',
        'pid': '',
        'keyword': city,
        'pageIndex': '1',
        'pageSize': '10'
    }
    response=requests.post(url=url,data=data,headers=headers)

    json_data=response.json()
    for dic in json_data["Table1"]:
        print(dic['storeName']+':'+dic["addressDetail"]+'\n')
    # Crawl drug administration license data
    # landing page: url="http://125.35.6.84:81/xk/"

    import requests

    url="http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList"

    # UA spoofing
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
    }
    page=int(input("number of pages to crawl:"))
    for num in range(1,page+1):
        data={
            'on': 'true',
            'page':str(num),
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': '',
        }

        response=requests.post(url=url,data=data,headers=headers)

        json_data=response.json()

        for dic in json_data["list"]:
            # print(dic["ID"],dic["EPS_NAME"])

            # each list entry only carries an ID; a second POST fetches the detail record
            url1="http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById"
            data1={
                'id':dic["ID"]
            }
            response2=requests.post(url=url1,data=data1,headers=headers)

            dic2=response2.json()
            print(dic2["epsName"],dic2["epsProductAddress"])
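
    Since this makes one POST per list entry, reusing one connection helps; a sketch (my addition) with requests.Session, which pools the underlying TCP connection across calls and applies the headers to every request:

    import requests

    session=requests.Session()
    session.headers.update(headers)  # headers dict from the block above

    # same call signature as requests.post, but the connection is reused
    dic2=session.post(url1,data={'id':'some-id'}).json()  # 'some-id' is a placeholder for a real list-entry ID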
    # Crawl Douban movies

    import requests

    url="https://movie.douban.com/j/chart/top_list?type=24&interval_id=100%3A90&action=&start=0&limit=20"

    # UA spoofing
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
    }

    response=requests.get(url=url,headers=headers)
    json_data=response.json()

    # for dic in json_data:
    #     print(dic["title"],dic["rating"][0])
    # Crawl an image
    import requests

    url="http://img.netbian.com/file/2019/0730/14d3739bf11dd92055abb56e3f792d3f.jpg"

    response=requests.get(url=url)
    content=response.content  # raw bytes of the response body
    with open("./meinv.jpg","wb") as f:
        f.write(content)
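
    For large files, response.content buffers the whole body in memory; a streaming variant (my addition, standard requests API) writes the image in chunks instead:

    response=requests.get(url=url,stream=True)
    with open("./meinv.jpg","wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)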



    # Crawl an image with urllib
    from urllib import request

    url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1564654066124&di=47579504bdd90e51c50c387cbff21391&imgtype=0&src=http%3A%2F%2Fb-ssl.duitang.com%2Fuploads%2Fitem%2F201508%2F05%2F20150805214640_tY24E.jpeg'

    request.urlretrieve(url,filename="./meishaonv.jpg")



    The difference between the two image-crawling approaches:
      whether you can spoof the UA: requests accepts a custom User-Agent through its headers argument, while urllib's urlretrieve exposes no headers parameter.
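
    If a spoofed UA is needed with urllib anyway, one workaround (my addition, not the author's point) is to build a Request with explicit headers and save the bytes manually:

    from urllib import request

    req=request.Request(url,headers={"User-Agent": "Mozilla/5.0"})  # any UA string works here
    with request.urlopen(req) as resp, open("./meishaonv.jpg","wb") as f:
        f.write(resp.read())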



