  • Web scraping: requests, regex, bs4

    jupyter notebook
    run this command to start the notebook service
    
    • Web crawler: the process of writing a program that simulates a browser surfing the web, then sending it out to crawl data from the internet.

    • Types of crawlers:

      • general-purpose crawler: crawls whole pages, the way a search engine does
      • focused crawler: crawls only the specific data of interest on a page
      • incremental crawler: crawls only data that has changed since the last run
    • Anti-crawling mechanisms: measures a site takes to block crawlers, such as checking the User-Agent.

    • Anti-anti-crawling mechanisms: the crawler's countermeasures, such as disguising its User-Agent; see the sketch below.
      - robots.txt protocol: a site's stated crawl rules, which a crawler can choose to follow or ignore.
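
    A minimal sketch of both ideas (the target URL here is just an example; UA disguise is the technique these notes use later):

    import requests
    from urllib import robotparser
    
    # optionally honor robots.txt: parse it and ask whether a URL may be fetched
    rp = robotparser.RobotFileParser()
    rp.set_url('https://www.sogou.com/robots.txt')   # example target
    rp.read()
    print(rp.can_fetch('*', 'https://www.sogou.com/web'))
    
    # UA disguise: send a browser User-Agent so a UA check does not reject the request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    }
    response = requests.get('https://www.sogou.com/web', params={'query': 'test'}, headers=headers)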

    • Workflow for writing code with the requests module:

      • specify the url
      • initiate the request
      • get the data out of the response object
      • persist it to storage

    practice.py

    import requests
    # 1. specify the url
    url = 'https://www.sogou.com'
    # 2. initiate the request
    response = requests.get(url=url)
    
    # 3. get the text data out of the response object
    page_text = response.text
    
    # 4. persist it to storage
    with open('./sougou.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
        
    
    

    Requirement: scrape the result page for a user-supplied search term on Sogou

    import requests
    
    url = 'https://www.sogou.com/web'
    # package the search term into the request parameters
    wd = input('enter a word:')
    param = {
        'query': wd
    }
    response = requests.get(url=url, params=param)
    
    # .content returns raw bytes, so the file is opened in binary mode
    page_text = response.content
    fileName = wd + '.html'
    with open(fileName, 'wb') as fp:
        fp.write(page_text)
        print('over')
    

    Scrape Baidu Translate results (a POST interface that returns JSON)

    import requests
    
    url = 'https://fanyi.baidu.com/sug'
    wd = input('enter a word:')
    # the sug interface takes the keyword as POST form data
    data = {
        'kw': wd
    }
    response = requests.post(url=url, data=data)
    
    print(response.json())
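
    To persist the JSON rather than just print it (step 4 of the workflow), one possible sketch with the standard json module (the filename is arbitrary):

    import json
    
    # write the parsed JSON to disk; ensure_ascii=False keeps Chinese characters readable
    with open('fanyi.json', 'w', encoding='utf-8') as fp:
        json.dump(response.json(), fp, ensure_ascii=False)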
    

    Scrape movie detail data from the Douban movie chart at https://movie.douban.com/

    import requests
    
    url = 'https://movie.douban.com/j/chart/top_list'
    # type picks the category, interval_id the rating range,
    # start is the offset and limit the number of entries returned
    params = {
        'type': '5',
        'interval_id': '100:90',
        'action': '',
        'start': '60',
        'limit': '100',
    }
    
    movie_data = requests.get(url=url, params=params).json()
    
    print(movie_data)
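
    Since start and limit drive the paging, one possible way to walk several pages (the offsets are arbitrary, and this assumes the interface returns a JSON list as above):

    # walk the chart in steps of 100 by shifting the start offset
    all_movies = []
    for start in range(0, 300, 100):
        params['start'] = str(start)
        all_movies.extend(requests.get(url=url, params=params).json())
    print(len(all_movies))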
    
    

    Requirement: scrape the cosmetics production license data of the National Medical Products Administration from http://125.35.6.84:81/xk/

    import requests
    
    # the list is loaded dynamically by ajax, so POST the interface directly
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    }
    # first pass: collect the company IDs from list pages 1-10
    id_list = []
    for page in range(1, 11):
        data = {
            'on': 'true',
            'page': str(page),
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': '',
        }
        json_data = requests.post(url=url, data=data, headers=headers).json()
        for dic in json_data['list']:
            id_list.append(dic['ID'])
    
    # second pass: POST each ID to the detail interface
    detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    for id in id_list:
        detail_data = {
            'id': id
        }
        detail_json = requests.post(url=detail_url, data=detail_data, headers=headers).json()
        print(detail_json)
    
    

    Download images

    import requests
    import urllib.request
    
    # method 1: fetch the raw bytes with requests and write them out yourself
    url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1551785494&di=d9329f74ebdc5bd6158447daf4d5a783&imgtype=jpg&er=1&src=http%3A%2F%2Fimg.biaoche.org%2F%3Fimg%3D03.imgmini.eastday.com%2Fmobile%2F20180616%2F0e1faa7f78e9c172db3c73d0cc1be192_wmk.jpeg'
    img_data = requests.get(url=url, headers=headers).content
    with open('./xiaohua.jpg', 'wb') as fp:
        fp.write(img_data)
    
    # method 2: let urllib download straight to a file
    url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1551191414490&di=8db3ee6e5b31215f03cf77e7deaa2077&imgtype=0&src=http%3A%2F%2F00imgmini.eastday.com%2Fmobile%2F20180918%2F20180918171154_1d58954e0491887b39e7122bdd1a9506_2.jpeg'
    urllib.request.urlretrieve(url=url, filename='lulaoye.jpg')
        
      
    

    Regular expressions

    import re
    string = '''fall in love with you
    i love you very much
    i love she
    i love her
    '''
    re.findall('^i.*', string, re.M)
    # re.M: multiline mode, so ^ matches at the start of every line
    ['i love you very much', 'i love she', 'i love her']
    
    
    ###########################################
    # match across all lines
    string1 = """细思极恐
    你的队友在看书
    你的敌人在磨刀
    你的闺蜜在减肥
    隔壁老王在练腰
    
    """
    re.findall('.*', string1, re.S)
    # re.S: dot-all mode, '.' also matches newlines, so the whole string matches at once
    # ['细思极恐\n你的队友在看书\n你的敌人在磨刀\n你的闺蜜在减肥\n隔壁老王在练腰\n\n', '']
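
    The image scraper below depends on non-greedy matching together with re.S; a quick comparison on a made-up HTML snippet:

    # greedy '.*' runs to the last possible match, non-greedy '.*?' stops at the first
    html = '<div class="thumb"><img src="/pic/a.jpg"></div><div class="thumb"><img src="/pic/b.jpg"></div>'
    re.findall('<img src="(.*)">', html)   # ['/pic/a.jpg"></div><div class="thumb"><img src="/pic/b.jpg']
    re.findall('<img src="(.*?)">', html)  # ['/pic/a.jpg', '/pic/b.jpg']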
    

    Scrape and save all the images on Qiushibaike

    import os
    import re
    import urllib.request
    import requests
    
    
    # %d is a placeholder for the page number
    url = 'https://www.qiushibaike.com/pic/page/%d/?s=5170552'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    }
    # create the output directory if it does not exist yet
    if not os.path.exists('./qiutu'):
        os.mkdir('./qiutu')
    
    start_page = int(input('enter a start pageNum:'))
    end_page = int(input('enter a end pageNum:'))
    
    for page in range(start_page, end_page + 1):
        new_url = url % page
        page_text = requests.get(url=new_url, headers=headers).text
        # non-greedy match inside each thumb div; re.S lets '.' cross newlines
        img_url_list = re.findall('<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>', page_text, re.S)
        for img_url in img_url_list:
            img_url = 'https:' + img_url   # the src attribute is protocol-relative
            imgName = img_url.split('/')[-1]
            imgPath = 'qiutu/' + imgName
            urllib.request.urlretrieve(url=img_url, filename=imgPath)
            print(imgPath, 'downloaded!')
    
    print('over!!!')
    
    

    Using bs4

    • bs4 parsing: 1. pip install bs4
      2. pip install lxml
    • Parsing principle:
      • 1. load the page source to be parsed into a BeautifulSoup object
      • 2. call the object's methods or attributes to locate the target tags in the source
      • 3. extract the text or attribute values held by the located tags (see the sketch below)
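
    A minimal sketch of those three steps on a made-up HTML string (the class names mirror the example below, but the content is invented):

    from bs4 import BeautifulSoup
    
    # a made-up page source, just to illustrate the three steps
    html = '<div class="book-mulu"><ul><li><a href="/book/1.html">Chapter 1</a></li></ul></div>'
    soup = BeautifulSoup(html, 'lxml')               # 1. load the source into a bs object
    a = soup.select('.book-mulu > ul > li > a')[0]   # 2. locate the tag with a CSS selector
    print(a.string, a['href'])                       # 3. extract its text and attribute value

    The full example below applies the same steps to download every chapter of a novel from shicimingju.
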
    import requests
    from bs4 import BeautifulSoup
    
    url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    }
    page_text = requests.get(url=url, headers=headers).text
    
    soup = BeautifulSoup(page_text, 'lxml')
    
    # locate every chapter link in the table of contents
    a_list = soup.select('.book-mulu > ul > li > a')
    fp = open('sanguo.txt', 'w', encoding='utf-8')
    for a in a_list:
        title = a.string
        # .string only returns the tag's own direct text child
        detail_url = 'http://www.shicimingju.com' + a['href']
        detail_page_text = requests.get(url=detail_url, headers=headers).text
    
        soup = BeautifulSoup(detail_page_text, 'lxml')
        content = soup.find('div', class_='chapter_content').text
    
        fp.write(title + '\n' + content)
        print(title, 'downloaded')
    print('over')
    fp.close()
    
    