  • Simple Python Crawler Examples


    import requests
    url = "https://www.sogou.com/web"
    # wrap the search keyword as the request parameter
    wd = input('enter a word')
    param = {
        'query':wd
    }
    response = requests.get(url=url,params=param)
    
    page_text = response.content
    fileName = wd+'.html'
    with open(fileName,'wb') as fp:
        fp.write(page_text)
        print('over')
    Requirement: crawl the Sogou search results page for a user-supplied keyword and save it as a local HTML file.
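    The request above sends no User-Agent header, so Sogou may serve an anti-crawler page instead of real results. Below is a minimal sketch of the same fetch with a browser User-Agent and an explicit status check; the UA string and variable names are illustrative, not part of the original.
    import requests

    url = "https://www.sogou.com/web"
    wd = input('enter a word: ')
    headers = {
        # pretend to be a regular browser; any current UA string will do
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    response = requests.get(url=url, params={'query': wd}, headers=headers)
    response.raise_for_status()        # fail fast on non-2xx responses
    with open(wd + '.html', 'wb') as fp:
        fp.write(response.content)     # raw bytes, so no encoding guesswork
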
    import requests
    url = "https://fanyi.baidu.com/sug"
    wd = input('enter a word: ')
    data = {
        'kw':wd
    }
    response = requests.post(url=url,data=data)
    print(response.json())
    Requirement: query Baidu Translate's sug interface for a keyword and print the JSON response.
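    The sug interface returns JSON containing Chinese text, which prints as a raw dict above; dumping it with ensure_ascii=False keeps the characters readable and makes it easy to save. A small sketch, with the output file name chosen only as an example:
    import json
    import requests

    url = "https://fanyi.baidu.com/sug"
    wd = input('enter a word: ')
    result = requests.post(url=url, data={'kw': wd}).json()

    # ensure_ascii=False keeps Chinese characters readable instead of \uXXXX escapes
    print(json.dumps(result, ensure_ascii=False, indent=2))
    with open(wd + '.json', 'w', encoding='utf-8') as fp:
        json.dump(result, fp, ensure_ascii=False)
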
    import requests
    url = "https://movie.douban.com/j/chart/top_list"
    param = {
        "type": "5",
        "interval_id": "100:90",
        "action": "",
        "start": "40",
        "limit": "100",
    }
    
    movie_data = requests.get(url=url,params=param).json()
    print(movie_data)
    Requirement: scrape movie detail data, by category, from the Douban movie ranking at https://movie.douban.com/.
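    The start and limit parameters page through the ranking, so the whole list can be collected with a loop. A sketch under the assumption that the endpoint returns a JSON array per page and that each entry carries a 'title' field; inspect the real response to confirm:
    import requests

    url = "https://movie.douban.com/j/chart/top_list"
    all_movies = []
    for start in range(0, 60, 20):            # three pages of 20 entries each
        param = {
            "type": "5",
            "interval_id": "100:90",
            "action": "",
            "start": str(start),
            "limit": "20",
        }
        all_movies.extend(requests.get(url=url, params=param).json())

    # 'title' is assumed to be a field of each entry
    print(len(all_movies), [m.get('title') for m in all_movies[:5]])
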
    import requests
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx'
    wd = input('enter a word: ')
    data = {
        "cname": '',
        "pid": '',
        "keyword":wd ,
        "pageIndex": "1",
        "pageSize": "10",
    }
    response = requests.post(url=url,data=data).json()
    print(response)
    Requirement: query the KFC store locator at http://www.kfc.com.cn/kfccda/index.aspx for restaurants at a given location.
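    To collect more than the first ten stores, pageIndex can be incremented until the response stops returning rows. The loop below is only a sketch, and the 'Table1' key it reads is an assumption about the response layout; adjust it to whatever the actual JSON contains.
    import requests

    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx'
    keyword = input('enter a city or keyword: ')
    stores = []
    for page_index in range(1, 6):           # first five pages as an example
        data = {
            "cname": '',
            "pid": '',
            "keyword": keyword,
            "pageIndex": str(page_index),
            "pageSize": "10",
        }
        result = requests.post(url=url, data=data).json()
        rows = result.get('Table1', [])      # assumed key; inspect the real response
        if not rows:
            break                            # no more results
        stores.extend(rows)

    print(len(stores), 'stores collected')
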
    http://125.35.6.84:81/xk/
    import requests
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    
    headers = {
        'User-Agent':"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    
    id_list = []
    for page in range(1,11):
        data = {
            "on": "true",
            "page": str(page),
            "pageSize": "15",
            "productName": '',
            "conditionType": "1",
            "applyname": '',
            "applysn": '',
        }
        json_data = requests.post(url=url,data=data,headers=headers).json()
        for dic in json_data["list"]:
            id = dic["ID"]
            id_list.append(id)
    
    detail_url = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById"
    for id in id_list:
        detail_data = {
            "id":id
        }
        detail_json = requests.post(url=detail_url,data=detail_data,headers=headers).json()
        print(detail_json)
    Requirement: crawl cosmetics production license data from the National Medical Products Administration site at http://125.35.6.84:81/xk/.
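    The detail loop above issues well over a hundred POST requests; reusing one connection with requests.Session makes that faster and politer. A sketch of the same detail fetch through a shared session, collecting the records into a list and optionally saving them as JSON (the function and file names are illustrative, not from the original):
    import json
    import requests

    detail_url = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById"
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }

    def fetch_details(id_list):
        """Fetch every license detail record over one pooled connection."""
        records = []
        with requests.Session() as session:
            session.headers.update(headers)
            for license_id in id_list:
                resp = session.post(detail_url, data={"id": license_id})
                records.append(resp.json())
        return records

    # records = fetch_details(id_list)   # id_list as built in the code above
    # with open('licenses.json', 'w', encoding='utf-8') as fp:
    #     json.dump(records, fp, ensure_ascii=False, indent=2)
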
    import os
    import re
    import urllib.request
    import requests
    
    
    url = 'https://www.qiushibaike.com/pic/page/%d/?s=5170552'
    # page = 1
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    }
    if not os.path.exists('./qiutu'):
        os.mkdir('./qiutu')
        
    start_page = int(input('enter a start pageNum:'))
    end_page = int(input('enter a end pageNum:'))
    
    for page in range(start_page,end_page+1):
        new_url = url % page   # fill the page number into the URL template
    #     print(new_url)
        page_text = requests.get(url=new_url,headers=headers).text
        img_url_list = re.findall('<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>',page_text,re.S)
        for img_url in img_url_list:
            img_url = 'https:'+img_url
            imgName = img_url.split('/')[-1]
            imgPath = 'qiutu/'+imgName
            urllib.request.urlretrieve(url=img_url,filename=imgPath)
            print(imgPath, 'downloaded successfully!')
            
    print('over!!!')
    Requirement: crawl and save all images from the Qiushibaike picture pages.
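    urllib.request.urlretrieve works, but downloading each picture with requests keeps the whole script on one HTTP client and reuses the same headers. A sketch of the per-image step as a small helper; the helper name is my own, not from the original.
    import os
    import requests

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    }

    def save_image(img_url, out_dir='qiutu'):
        """Download one image URL and write it under out_dir; return the local path."""
        os.makedirs(out_dir, exist_ok=True)
        img_name = img_url.split('/')[-1]
        img_path = os.path.join(out_dir, img_name)
        img_bytes = requests.get(img_url, headers=headers).content
        with open(img_path, 'wb') as fp:
            fp.write(img_bytes)
        return img_path

    # save_image('https:' + img_url)   # img_url as extracted by the regex above
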
    import requests
    url = "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1564643415&di=423648f96f24460811fc7a39e23d29f8&imgtype=jpg&er=1&src=http%3A%2F%2Fimg1.replays.net%2Flol.replays.net%2Fuploads%2Fbody%2F2017%2F06%2F1496734520iBi.jpg"
    headers = {
        'User-Agent':"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    img_data = requests.get(url=url,headers=headers).content
    with open('./kapai.jpg','wb') as fp:
        fp.write(img_data)
    Requirement: download a 4K Card Master image.
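    For a single image, .content is fine, but it buffers the whole body in memory. For larger files a streamed download writes chunk by chunk instead; the sketch below uses an arbitrary chunk size:
    import requests

    def download_file(url, filename, headers=None, chunk_size=8192):
        """Stream a large file to disk instead of buffering it all in memory."""
        with requests.get(url, headers=headers, stream=True) as resp:
            resp.raise_for_status()
            with open(filename, 'wb') as fp:
                for chunk in resp.iter_content(chunk_size=chunk_size):
                    fp.write(chunk)

    # download_file(url, 'kapai.jpg', headers=headers)   # url and headers as defined above
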
  • Original post: https://www.cnblogs.com/xiangsikai/p/11251658.html