zoukankan      html  css  js  c++  java
  • 爬虫基础案例

    1.数据筛选之BS4模块

    例1:获取红牛分公司信息:http://www.redbull.com.cn/about/branch  [规律比较统一]

    # 获取红牛分公司数据http://www.redbull.com.cn/about/branch
    import requests
    from bs4 import BeautifulSoup
    import re
    import pandas
    import openpyxl
    
    # Fetch the Red Bull branch-office page. The timeout keeps the script from
    # hanging forever on a stalled connection, and raise_for_status() stops
    # early on an HTTP error instead of parsing an error page.
    res = requests.get('http://www.redbull.com.cn/about/branch', timeout=10)
    res.raise_for_status()
    # First check whether a plain request is enough or whether extra
    # conditions (headers, cookies, ...) are required.
    # print(res.text)

    # Fields to extract, with a sample of the markup that carries each one:
    #   company name     <h2>红牛杭州分公司</h2>
    #   company address  <p class='mapIco'>杭州市上城区庆春路29号远洋大厦11楼A座</p>
    #   company "email"  <p class='mailIco'>310009</p>
    #                    (NOTE(review): the sample value looks like a postal code)
    #   company phone    <p class='telIco'>0571-87045279/7792</p>

    # Approach 1: regular expressions
    # title_list = re.findall('<h2>(.*?)</h2>', res.text)
    # addr_list = re.findall("<p class='mapIco'>(.*?)</p>", res.text)
    # email_list = re.findall("<p class='mailIco'>(.*?)</p>", res.text)
    # phone_list = re.findall("<p class='telIco'>(.*?)</p>", res.text)
    # print(phone_list)
    # The four lists line up index by index, so they can be exported together.
    # # 1. Build the column dictionary
    # data_dict = {
    #     "公司名称": title_list,
    #     "公司地址": addr_list,
    #     "公司邮箱": email_list,
    #     "公司电话": phone_list
    # }
    # df = pandas.DataFrame(data_dict)
    # df.to_excel(r'company.xlsx')
    
    # Approach 2: parse the page with BeautifulSoup
    soup = BeautifulSoup(res.text, 'lxml')

    # Company names are the text of the <h2> headings.
    heading_tags = soup.find_all('h2')
    title_list = [tag.get_text() for tag in heading_tags]
    # print(title_list)

    # Address, mail and phone are <p> tags told apart by their CSS class.
    addr_tags = soup.find_all('p', attrs={'class': 'mapIco'})
    addr_list = [tag.get_text() for tag in addr_tags]

    email_tags = soup.find_all('p', attrs={'class': 'mailIco'})
    email_list = [tag.get_text() for tag in email_tags]

    phone_tags = soup.find_all('p', attrs={'class': 'telIco'})
    phone_list = [tag.get_text() for tag in phone_tags]
    
    print(len(title_list))
    # Walk the four lists in lockstep instead of assuming exactly 40 branches:
    # a hard-coded range(40) raises IndexError when the page lists fewer
    # entries and silently drops the rest when it lists more.
    for title, addr, email, phone in zip(title_list, addr_list, email_list, phone_list):
        print("""
            "公司名称": %s,
            "公司地址": %s,
            "公司邮箱": %s,
            "公司电话": %s
        """ % (title, addr, email, phone)
        )

    例2:爬取链家数据(数据处理)

    import requests
    from bs4 import BeautifulSoup
    
    """
    1.研究url规律
        https://sh.lianjia.com/ershoufang/huangpu/
         https://sh.lianjia.com/ershoufang/pudong/
         https://城市首字母缩写.lianjia.com/房屋类型/区域名称/
    2.上海浦东区二手房
        尝试着发送请求
           第一种:先拿存储房屋数据的li标签
        第二种:直接查找对应的标签数据
    """
    # Request the Pudong second-hand listings. The timeout prevents an
    # indefinite hang and raise_for_status() fails fast on an HTTP error
    # instead of parsing an error page.
    res = requests.get('https://sh.lianjia.com/ershoufang/pudong/', timeout=10)
    res.raise_for_status()
    # print(res.text)

    soup = BeautifulSoup(res.text, 'lxml')
    # Candidate containers: every <div> whose class list includes 'info'.
    div_list = soup.find_all(name='div', class_='info')

    # Look up the first <a> of each div once (instead of four times per div)
    # and skip the divs that contain no link at all.
    anchor_list = [div.find(name='a') for div in div_list]
    anchor_list = [anchor for anchor in anchor_list if anchor is not None]
    title_list = [anchor.text for anchor in anchor_list]
    link_list = [anchor.get('href') for anchor in anchor_list]

    # Full text of each <div class="positionInfo">.
    div1_list = soup.find_all(name='div', attrs={"class": 'positionInfo'})
    addr_list = [div1.text for div1 in div1_list]
    # Alternatives for a finer split of the address:
    #   div1.find('a').text          -> text of the first link only
    #   div1.find_all('a')[1].text   -> text of the second link only
    #   address.split('-')           -> break the combined text apart
    # print(addr_list)

    # Raw '|'-separated description of each listing, split into columns later.
    div2_list = soup.find_all(name='div', attrs={"class": "houseInfo"})
    info_list = [div2.text for div2 in div2_list]
    """
    '1室1厅 | 59平米 | 南 | 精装 | 中楼层(共14层) | 2010年建 | 板楼'
    户型
    面积
    朝向
    装修
    楼层
    年代
    楼型
    """
    def _split_house_info(text):
        """Split one '|'-separated houseInfo string into seven column values.

        Returns the first six stripped fields (padded with '' when the listing
        has fewer than six) followed by the last field of the original string.
        For strings with six or more fields this yields exactly the values the
        index lookups [0]..[5] and [-1] would; shorter strings no longer raise
        IndexError.
        """
        parts = [part.strip() for part in text.split('|')]
        last = parts[-1]  # str.split always returns at least one element
        head = (parts + [''] * 6)[:6]
        return head + [last]

    # Split every description once instead of seven times per listing.
    _info_rows = [_split_house_info(i) for i in info_list]
    hx = [row[0] for row in _info_rows]  # layout (rooms / halls)
    mj = [row[1] for row in _info_rows]  # floor area
    cx = [row[2] for row in _info_rows]  # orientation
    zx = [row[3] for row in _info_rows]  # decoration
    lc = [row[4] for row in _info_rows]  # floor
    nd = [row[5] for row in _info_rows]  # year built
    lx = [row[6] for row in _info_rows]  # building type (last field)
    
    # Text of every <div class="followInfo"> on the page.
    div3_list = soup.find_all('div', class_='followInfo')
    gz = [node.get_text() for node in div3_list]

    # Text of every <div class="totalPrice"> on the page.
    div4_list = soup.find_all('div', class_='totalPrice')
    total_price = [node.get_text() for node in div4_list]

    # Text of every <div class="unitPrice"> on the page.
    div5_list = soup.find_all('div', class_='unitPrice')
    unit = [node.get_text() for node in div5_list]
    """效果"""
    import pandas as pd

    # (column header, values) pairs in the order the columns appear in the sheet.
    columns = [
        ("名称", title_list),
        ("地址", addr_list),
        ("户型", hx),
        ("面积", mj),
        ("朝向", cx),
        ("装修", zx),
        ("楼层", lc),
        ("年代", nd),
        ("楼型", lx),
        ("总价", total_price),
        ("单价", unit),
    ]
    data_dict = dict(columns)
    # Build the table and write it to an Excel workbook.
    df = pd.DataFrame(data_dict)
    df.to_excel(r'链家.xlsx')
    
    # 多页规律
        你只需要研究url特点即可(绝对有规律)
          第一页:https://sh.lianjia.com/ershoufang/jingan/
        第二页:https://sh.lianjia.com/ershoufang/jingan/pg2/
        第三页:https://sh.lianjia.com/ershoufang/jingan/pg3/
        ...
        https://sh.lianjia.com/ershoufang/jingan/pgN/
        '''第一页应该可以写成
        https://sh.lianjia.com/ershoufang/jingan/pg1/
        '''
        # Page 1 can also be addressed as .../pg1/, so a single template covers
        # every page. The template is loop-invariant, so define it once.
        base_url = "https://sh.lianjia.com/ershoufang/jingan/pg%s/"
        for i in range(1, 100):
            print(base_url % i)

    例3:爬取天气数据:网站数据不是一次性获取

    """
    有时候网站的数据不是一次性加载的,内部可能是通过js动态请求
    http://tianqi.2345.com/wea_history/58362.htm
    有些网站内容编码查看需要在线json格式化
    通过network检查找内部api接口
    虹口
    http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2020&date%5Bmonth%5D=11
    http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2020&date%5Bmonth%5D=12
    http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2021&date%5Bmonth%5D=1
    
    http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=区域&areaInfo%5BareaType%5D=2&date%5Byear%5D=年份&date%5Bmonth%5D=月份
    """
    
    import requests
    import pandas as pd
    
    from io import StringIO

    # Call the internal history API found via the browser's network panel.
    # The timeout prevents an indefinite hang; raise_for_status() fails fast
    # on an HTTP error.
    res = requests.get("http://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D=71451&areaInfo%5BareaType%5D=2&date%5Byear%5D=2020&date%5Bmonth%5D=12", timeout=10)
    res.raise_for_status()
    json_dict = res.json()
    # The 'data' field is handed to read_html below, i.e. it is expected to
    # hold the HTML fragment that contains the weather table.
    data = json_dict.get('data')
    # read_html returns one DataFrame per <table> found. Wrap the markup in
    # StringIO because passing literal HTML strings to read_html is deprecated.
    # A separate name is used so `res` is not rebound to a different type.
    tables = pd.read_html(StringIO(data))
    tables[0].to_excel(r'weather.xlsx')

     例4:爬取汽车之家新闻:排除干扰

    import requests
    from bs4 import BeautifulSoup
    
    # Fetch the news index. The timeout prevents an indefinite hang and
    # raise_for_status() fails fast on an HTTP error.
    res = requests.get("https://www.autohome.com.cn/news/", timeout=10)
    res.raise_for_status()
    # Decode the body as GBK before reading res.text.
    res.encoding = 'GBK'
    soup = BeautifulSoup(res.text, 'lxml')

    # The articles are the <li> children of <ul class="article">.
    ul_ele = soup.find(name='ul', class_="article")
    li_list = ul_ele.find_all('li')
    # print(li_list)
    title_list = []
    link_list = []
    info_list = []
    time_list = []
    num_list = []
    for li in li_list:
        # The list contains decoys such as
        # <li id="ad_tw_04" style="display: none;"></li>, so every lookup is
        # guarded. Each element is searched once and the result reused.
        a_tag = li.find('a')
        if a_tag is not None:
            # hrefs are prefixed with the scheme to form a full URL
            link_list.append('https:' + a_tag['href'])

        # News title: <h3>
        h3_tag = li.find('h3')
        if h3_tag is not None:
            title_list.append(h3_tag.text)

        # Summary: first <p>
        p_tag = li.find('p')
        if p_tag is not None:
            info_list.append(p_tag.text)

        # Publication time: <span class="fn-left">. Previously the value was
        # computed and thrown away, leaving time_list empty.
        left_spans = li.select('span.fn-left')
        if left_spans:
            time_list.append(left_spans[0].text)

        # Counter inside <span class="fn-right">: text of its first <em>.
        # Previously computed and discarded, leaving num_list empty.
        # The comment count is filled in dynamically by JavaScript and
        # defaults to 0 in the static HTML; the real value has to be located
        # through the site's js requests.
        right_spans = li.select('span.fn-right')
        if right_spans:
            num_list.append(right_spans[0].find('em').text)

    例5:基于openpyxl爬取豆瓣数据

    # 爬取豆瓣电影top250数据
    1.先尝试着爬取一页
    2.再去研究多页
        https://movie.douban.com/top250
        https://movie.douban.com/top250?start=25&filter=
        https://movie.douban.com/top250?start=50&filter=
        ...
        # 推导第一页
        https://movie.douban.com/top250?start=0&filter=
      
    import requests
    from openpyxl import Workbook
    from bs4 import BeautifulSoup
    import time
    
    
    wb = Workbook()
    w1 = wb.create_sheet('电影排行榜', index=0)
    # Header row; append() writes to the first free row, which is row 1 here.
    w1.append(['序号', '名称', '连接', '评分', '人数'])
    # Running sequence number of the movie being written.
    count = 0

    # Loop-invariant request settings, defined once instead of per page.
    base_url = 'https://movie.douban.com/top250?start=%s&filter='
    # Send a browser User-Agent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"
    }
    # The list is paged 25 entries at a time via the `start` query parameter.
    page_starts = range(0, 250, 25)

    for start in page_starts:
        # The timeout prevents an indefinite hang; raise_for_status() gives a
        # clear error on a rejected request instead of a confusing
        # AttributeError further down when the list element is missing.
        res = requests.get(base_url % start, headers=headers, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'lxml')
        ol = soup.find(name='ol', class_='grid_view')
        li_list = ol.find_all(name='li')
        for li in li_list:
            count += 1
            title = li.find(name='span').text  # first <span> of the entry
            link = li.find(name='a').get('href')  # first link of the entry
            num = li.select('.rating_num')[0].text
            # last <span> inside <div class="star">
            comment = li.find(name='div', class_='star').find_all('span')[-1].text
            # Write one row: sequence number, title, link, rating, vote text.
            w1.append([count, title, link, num, comment])
        # Pause between pages to avoid an IP ban; no need to wait after the
        # final page.
        if start != page_starts[-1]:
            time.sleep(5)
    wb.save(r'movie.xlsx')
    """上述代码还可以封装成函数 和 启动脚本的形式
    def get_data(url):
        ...
    
    if __name__ == '__main__':
        for i in range(0,250,25):
            base_url = 'https://movie.douban.com/top250?start=%s&filter='
            url = base_url%i
            get_data(url)
    """

    总结

    1.先尝试爬取一页数据甚至是几条数据
    2.代码逻辑跑通了之后再去考虑多页的情况
  • 相关阅读:
    收集 关于php的博文
    hdwiki中插件开发指南
    MySQL中“”的坑
    SpringBoot生成验证码
    Spring中的循环依赖问题
    深入理解AQS(抽象队列同步器)
    JUC下线程的三种等待唤醒机制
    关于List集合的去重
    MySQL:去除 字符串
    MySQL:Can't connect to MySQL server on 'localhost'(10061)
  • 原文地址:https://www.cnblogs.com/yangmeichong/p/14270271.html
Copyright © 2011-2022 走看看