zoukankan      html  css  js  c++  java
  • 1 爬虫 百度/搜狗/豆瓣/药监局

    爬 爬 爬 -- 

    两个软件 
    Anaconda  内置 Jupyter   编译器
    Fiddler4     一个代理软件
    例1  获取整个页面/搜狗
    # Example 1: fetch the Sogou homepage and persist it as an HTML file.
    import requests

    target = 'https://www.sogou.com/'        # 1. specify the URL
    response = requests.get(url=target)      # 2. issue the GET request, get a Response object
    html = response.text                     # 3. .text returns the response body as a str
    # 4. persist the page to disk
    with open('./sg.html','w',encoding='utf-8') as out:
        out.write(html)
    例2   搜狗搜索的结果页面
    #UA检测  解决办法headers请求头里加 User-Agent(浏览器标识)
    # Example 2: fetch a Sogou search-results page for a user-supplied keyword.
    # (No User-Agent header yet — this version can trip the site's UA detection.)
    import requests

    base = 'https://www.sogou.com/web'
    keyword = input('你要搜啥:')
    query_params = {'query': keyword}
    resp = requests.get(url=base, params=query_params)  # carry the query string
    # print(resp.encoding)  # ISO-8859-1 — inspect the response's declared encoding
    resp.encoding = 'utf-8'  # force the correct decoding
    html = resp.text
    out_name = keyword + '.html'
    with open(out_name,'w',encoding='utf-8') as fh:
        fh.write(html)
        print(out_name,'爬取结束!')
    案例二  更新 添加 请求头 User-Agent 键值对
    # Example 2 (updated): same Sogou search, now sending a User-Agent header
    # so the request passes the site's UA-detection anti-scraping check.
    import requests

    base = 'https://www.sogou.com/web'
    keyword = input('你要搜啥:')
    ua_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    query_params = {'query': keyword}
    # params + headers together defeat the UA-detection anti-crawler mechanism
    resp = requests.get(url=base, params=query_params, headers=ua_headers)
    # print(resp.encoding)  # ISO-8859-1 — inspect the response's declared encoding
    resp.encoding = 'utf-8'  # switch to the correct encoding
    html = resp.text
    out_name = keyword + '.html'
    with open(out_name,'w',encoding='utf-8') as fh:
        fh.write(html)
        print(out_name,'爬取结束!')
    案例3 
    # 获取 百度翻译的结果数据 
    # 页面中有可能存在动态加载的数据
    # Example 3: query Baidu Translate's /sug endpoint directly.
    # The page loads this data dynamically (Ajax), so we POST to the API itself.
    import requests

    api = 'https://fanyi.baidu.com/sug'
    word = input('enter a word: ')
    payload = {'kw': word}
    ua_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    resp = requests.post(url=api, data=payload, headers=ua_headers)  # POST request
    suggestions = resp.json()  # parse the JSON body
    for entry in suggestions['data']:
        print(entry['k'], ' ', entry['v'])
    案例4
    #
    # Example 4: Douban movie chart data. The page loads more entries as the
    # user scrolls (dynamic Ajax loading), so we call the JSON endpoint directly.
    import requests

    url = 'https://movie.douban.com/j/chart/top_list'
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    param = {
        "type": "5",
        "interval_id": "100:90",
        "action": "",
        "start": "0",
        "limit": "50",
    }
    # GET request carrying the query params
    obj_json = requests.get(url=url, params=param, headers=headers).json()
    # print(obj_json)
    print(len(obj_json))
    案例5
    #
    # Example 5: NMPA (药监局) cosmetics-company data, http://125.35.6.84:81/xk/
    # Step 1: the list page is populated via an Ajax POST; harvest company IDs.
    # Step 2: POST each ID to the detail endpoint for the full company record.
    import requests

    post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    all_data = []  # one detail dict per company
    IDs = []       # company IDs harvested from the list pages
    for page in range(1, 3):  # first two pages only
        data = {
            "on": "true",
            "page": str(page),
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }
        # Ajax response backing the list page
        json_obj = requests.post(url=post_url, data=data, headers=headers).json()
        for dic in json_obj["list"]:
            IDs.append(dic['ID'])
    print(len(IDs))

    # Detail endpoint is loop-invariant — hoisted out of the loop.
    detail_post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    for company_id in IDs:  # renamed from `id` to avoid shadowing the builtin
        data = {'id': company_id}
        detail_dic = requests.post(url=detail_post_url, data=data, headers=headers).json()
        all_data.append(detail_dic)
    if all_data:  # guard: indexing [0] on an empty result would raise IndexError
        print(all_data[0])
    print(len(all_data))

     # 下面是 PM2.5 检测网站

    # Goal: scrape every city name from https://www.aqistudy.cn/historydata/
    # NOTE(review): the original fragment relied on `requests`, `etree` and
    # `headers` that were defined elsewhere on the page (the imports only
    # appear in a later snippet); defined here so this runs standalone.
    import requests
    from lxml import etree

    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    url = 'https://www.aqistudy.cn/historydata/'
    page_text = requests.get(url=url, headers=headers).text

    tree = etree.HTML(page_text)
    # hot cities: //div[@class="bottom"]/ul/li/a/text()
    # all cities: //div[@class="bottom"]/ul/div[2]/li/a/text()
    all_city_names = tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')
    print(all_city_names,len(all_city_names))
    # NBD (每经网) industry channel: collect headline titles from the column list.
    import requests
    from lxml import etree

    url = 'http://industry.nbd.com.cn'
    headers = {
         'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }

    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    li_nodes = tree.xpath('//ul[@class="m-columnnews-list"]/li')
    # Each headline lives in the alt attribute of the item's <img>.
    titles = [node.xpath('./a/img/@alt')[0] for node in li_nodes]
    print(titles)
    每经网 热门精选话题 lxml etree xpath
  • 相关阅读:
    Java实现 LeetCode 343 整数拆分(动态规划入门经典)
    Java实现 LeetCode 342 4的幂
    Java实现 LeetCode 342 4的幂
    Java实现 LeetCode 342 4的幂
    Java实现 LeetCode 341 扁平化嵌套列表迭代器
    Java实现 LeetCode 341 扁平化嵌套列表迭代器
    Java实现 LeetCode 341 扁平化嵌套列表迭代器
    Java实现 LeetCode 338 比特位计数
    H264(NAL简介与I帧判断)
    分享一段H264视频和AAC音频的RTP封包代码
  • 原文地址:https://www.cnblogs.com/zhangchen-sx/p/10792461.html
Copyright © 2011-2022 走看看