  • 1 Crawlers: Baidu / Sogou / Douban / the drug administration site (药监局)

    Crawl, crawl, crawl --

    Two tools:
    Anaconda  -- ships with the Jupyter interactive notebook environment
    Fiddler4  -- a local proxy for capturing and inspecting HTTP(S) traffic (see the sketch below)
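    Fiddler works as a local capture proxy, by default on 127.0.0.1:8888. A minimal sketch, assuming that default port, of routing requests traffic through it so the captured requests show up in Fiddler:
    import requests
    # Assumption: Fiddler is listening on its default address 127.0.0.1:8888
    proxies = {
        'http': 'http://127.0.0.1:8888',
        'https': 'http://127.0.0.1:8888',
    }
    # verify=False because Fiddler re-signs HTTPS traffic with its own certificate
    res = requests.get('https://www.sogou.com/', proxies=proxies, verify=False)
    print(res.status_code)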
    Example 1: fetch an entire page (Sogou)
    import requests
    url = 'https://www.sogou.com/'  # 1. specify the URL
    res = requests.get(url=url)     # 2. send the request and get a response object
    page_text = res.text            # 3. the .text attribute returns the response body as a string
    with open('./sg.html','w',encoding='utf-8') as f:    # 4. persist the data
        f.write(page_text)
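    Not covered above: the request can hang or come back with an error status. A small defensive variant (only standard requests features, nothing site-specific) adds a timeout and raises on HTTP errors:
    import requests
    url = 'https://www.sogou.com/'
    res = requests.get(url=url, timeout=10)  # fail fast instead of hanging forever
    res.raise_for_status()                   # raise an exception on 4xx/5xx responses
    with open('./sg.html','w',encoding='utf-8') as f:
        f.write(res.text)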
    Example 2: a Sogou search results page
    # UA detection: the fix is to add a User-Agent (browser identifier) to the request headers -- done in Example 2 (updated) below
    import requests
    url = 'https://www.sogou.com/web'
    wd = input('What do you want to search for: ')
    param = {'query':wd}
    res = requests.get(url=url,params=param)  # carry the query parameter
    # print(res.encoding)  # ISO-8859-1 -- inspect the response's declared encoding
    res.encoding = 'utf-8' # override the encoding
    page_text = res.text
    name = wd + '.html'
    with open(name,'w',encoding='utf-8') as f:
        f.write(page_text)
        print(name,'scrape finished!')
    Example 2 (updated): add a User-Agent key-value pair to the request headers
    import requests
    url = 'https://www.sogou.com/web'
    wd = input('What do you want to search for: ')
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    param = {
        'query':wd
    }
    res = requests.get(url=url,params=param,headers=headers)  # params + headers: the UA header defeats the site's UA-detection anti-crawl check
    # print(res.encoding)  # ISO-8859-1 -- inspect the response's declared encoding
    res.encoding = 'utf-8' # override the encoding
    page_text = res.text
    name = wd + '.html'
    with open(name,'w',encoding='utf-8') as f:
        f.write(page_text)
        print(name,'scrape finished!')
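    When every request needs the same User-Agent, a requests.Session sets it once and reuses it (it also keeps cookies across requests). A minimal sketch using only the standard requests API:
    import requests
    session = requests.Session()
    # every request made through this session now carries the User-Agent header
    session.headers.update({'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'})
    res = session.get('https://www.sogou.com/web', params={'query':'python'})
    res.encoding = 'utf-8'
    print(res.status_code, len(res.text))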
    Example 3
    # fetch result data from Baidu Translate
    # pages may contain dynamically loaded (Ajax) data
    import requests
    url = 'https://fanyi.baidu.com/sug'
    wd = input('enter a word: ')
    data = {'kw':wd}
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    res = requests.post(url=url,data=data,headers=headers) # POST request
    obj_json = res.json()  # deserialize the JSON response into a Python object
    for i in obj_json['data']:
        print(i['k'],' ',i['v'])
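    The earlier examples persist HTML to disk; the JSON result can be persisted the same way. A minimal sketch continuing from wd and obj_json above (the output file name is my own choice):
    import json
    # ensure_ascii=False keeps the Chinese translations readable in the file
    with open(wd + '.json','w',encoding='utf-8') as f:
        json.dump(obj_json, f, ensure_ascii=False, indent=2)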
    Example 4
    # Douban movie ranking data
    # Some pages contain dynamically loaded data: as the mouse wheel scrolls down, more data keeps loading
    import requests
    url = 'https://movie.douban.com/j/chart/top_list'
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    param = {
        "type": "5",
        "interval_id": "100:90",
        "action": "",
        "start": "0",
        "limit": "50",
    }
    obj_json = requests.get(url=url,params=param,headers=headers).json()  # GET request with params
    # print(obj_json)
    print(len(obj_json))
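    The start/limit parameters page through the ranking, so more than 50 movies can be collected by advancing start in steps of limit. A hedged sketch, assuming the endpoint answers later pages the same way and returns an empty list when the data runs out:
    import requests
    url = 'https://movie.douban.com/j/chart/top_list'
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    movies = []
    for start in range(0, 150, 50):  # three pages of 50
        param = {"type": "5", "interval_id": "100:90", "action": "", "start": str(start), "limit": "50"}
        page = requests.get(url=url,params=param,headers=headers).json()
        if not page:  # an empty list means there is no more data
            break
        movies.extend(page)
    print(len(movies))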
    Example 5
    # Cosmetics-company data from the drug administration site: http://125.35.6.84:81/xk/
    import requests
    post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    all_data = []
    IDs = []
    for page in range(1,3):
        data = {
            "on": "true",
            "page": str(page),
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }
        # response data of the listing page's Ajax request
        json_obj = requests.post(url=post_url,data=data,headers=headers).json()
        for dic in json_obj["list"]:
            IDs.append(dic['ID'])
    print(len(IDs))
    for id in IDs:
        detail_post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
        data = {'id':id}
        detail_dic = requests.post(url=detail_post_url,data=data,headers=headers).json()
        all_data.append(detail_dic)
    print(all_data[0])
    print(len(all_data))
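    The detail loop fires one POST per company ID against a slow endpoint. A hedged variant of that loop, continuing from IDs and headers above (the timeout, the error handling, and the delay are my own additions), that tolerates individual failures:
    import time
    import requests
    detail_post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    all_data = []
    for id in IDs:
        try:
            detail_dic = requests.post(url=detail_post_url, data={'id':id},
                                       headers=headers, timeout=10).json()
        except (requests.RequestException, ValueError):
            continue  # skip IDs whose detail request fails or returns bad JSON
        all_data.append(detail_dic)
        time.sleep(0.5)  # be gentle with the server
    print(len(all_data))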

    # Below: the PM2.5 monitoring website

    # Goal: scrape all the city names on the page https://www.aqistudy.cn/historydata/
    import requests
    from lxml import etree

    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    url = 'https://www.aqistudy.cn/historydata/'
    page_text = requests.get(url=url,headers=headers).text

    tree = etree.HTML(page_text)
    # hot cities:  //div[@class="bottom"]/ul/li/a/text()
    # all cities:  //div[@class="bottom"]/ul/div[2]/li/a/text()
    all_city_names = tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')
    print(all_city_names,len(all_city_names))
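    Text nodes pulled with text() often carry stray whitespace and newlines. A small cleanup step (my own addition) normalizes the list:
    # strip whitespace and drop entries that are empty after stripping
    city_names = [name.strip() for name in all_city_names if name.strip()]
    print(city_names[:10], len(city_names))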
    import requests
    from lxml import etree
    
    url='http://industry.nbd.com.cn'
    headers={
         'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    
    page_text=requests.get(url=url,headers=headers).text
    tree=etree.HTML(page_text)
    lis_all=tree.xpath('//ul[@class="m-columnnews-list"]/li')  # every news item <li>
    all_list=[]
    for i in lis_all:
        title=i.xpath('./a/img/@alt')[0]  # the article title lives in the image's alt attribute
        all_list.append(title)
    print(all_list)
    NBD (每经网) hot featured topics, scraped with lxml etree + XPath
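    The same li nodes also carry the article links. A hedged extension of the loop above, assuming each li wraps its image in an <a href> (as the alt-based XPath implies) and that the hrefs are site-relative:
    # continuing from url and lis_all above: pair each alt title with its link
    articles = []
    for li in lis_all:
        title = li.xpath('./a/img/@alt')
        href = li.xpath('./a/@href')
        if title and href:
            articles.append((title[0], url + href[0]))  # assumption: href is site-relative
    print(articles[:5])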