• Python爬虫bs4解析实战


    1.常用方法

    from bs4 import BeautifulSoup
    
    # Sample data: a Tencent job-listing table used to demo bs4 find/find_all.
    # Kept verbatim — the parsing code below depends on its exact structure
    # (one header row, then one <tr> per job with five <td> cells).
    html = """
    <table class="tablelist" cellpadding="0" cellspacing="0">
        <tr class="h">
            <td class="l" width="374">职位名称</td>
            <td>职位类别</td>
            <td>人数</td>
            <td>地点</td>
            <td>发布时间</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=45021&keywords=python&tid=0&lid=0">22989-腾讯云计费PHP高级开发工程师</a></td>
            <td>技术类</td>
            <td>2</td>
            <td>深圳</td>
            <td>2018-10-23</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=45005&keywords=python&tid=0&lid=0">25663-腾讯云高级后台开发(互联网业务)(北京)</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>北京</td>
            <td>2018-10-23</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=45007&keywords=python&tid=0&lid=0">TEG06-云计算架构师(深圳)</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2018-10-23</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=44980&keywords=python&tid=0&lid=0">PCG04-PCG研发部数据科学家(深圳/北京)</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2018-10-23</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=44981&keywords=python&tid=0&lid=0">PCG04-PCG研发部业务运维工程师(深圳)</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2018-10-23</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=44971&keywords=python&tid=0&lid=0">23674-腾讯新闻大数据分析工程师(北京)</a></td>
            <td>技术类</td>
            <td>2</td>
            <td>北京</td>
            <td>2018-10-23</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=44964&keywords=python&tid=0&lid=0">TEG05-高级数据挖掘工程师(深圳)</a></td>
            <td>技术类</td>
            <td>2</td>
            <td>深圳</td>
            <td>2018-10-23</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=44968&keywords=python&tid=0&lid=0">PCG01-QQ后台推荐算法工程师</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2018-10-23</td>
        </tr>
        <tr class="even">
            <td class="l square"><a target="_blank" href="position_detail.php?id=44969&keywords=python&tid=0&lid=0">PCG01-QQ后台大数据开发工程师</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2018-10-23</td>
        </tr>
        <tr class="odd">
            <td class="l square"><a target="_blank" href="position_detail.php?id=44952&keywords=python&tid=0&lid=0">22989-腾讯云AI产品高级咨询顾问(深圳北京)</a></td>
            <td>技术类</td>
            <td>1</td>
            <td>深圳</td>
            <td>2018-10-23</td>
        </tr>
    </table>    
    """
    
    # Parse the table with bs4 and collect every job row into a dict.
    # (Alternative lookups from the tutorial, kept for reference:
    #   soup.find_all("tr", limit=2)[1]               -> the second <tr>
    #   soup.find_all("tr", class_="even")            -> rows with class "even"
    #   soup.find_all("tr", attrs={"class": "even"})  -> same, via attrs
    #   soup.find_all("a", target="_blank")           -> anchors with a target
    #   a["href"]  /  a.attrs["href"]                 -> two ways to read href)
    soup = BeautifulSoup(html, "lxml")

    FIELDS = ("title", "category", "nums", "city", "pubtime")
    jobs = []
    # Skip the first <tr>: it is the column-heading row, not a job posting.
    for row in soup.find_all("tr")[1:]:
        # stripped_strings yields each text fragment with whitespace removed;
        # every job row produces exactly five fragments, in column order.
        cells = list(row.stripped_strings)
        jobs.append({field: value for field, value in zip(FIELDS, cells)})
    print(jobs)
    View Code

    2.css选择器方法

    # 1. Get every <tr> tag
    # trs = soup.select("tr")
    # 2. Get the second <tr> tag
    # tr = soup.select("tr")[1]
    # 3. Get every <tr> whose class is "even"
    # trs = soup.select("tr.even")
    # trs = soup.select("tr[class='even']")
    # 4. Get the href attribute of every <a> tag
    # aList = soup.select("a")
    # for a in aList:
    #     print(a["href"])
    # 5. Extract all of the job information
    # trs = soup.select("tr")
    # for tr in trs:
    #     infos = list(tr.stripped_strings)
    #     print(infos)
    View Code
    from bs4 import BeautifulSoup
    
    html = """
    <div>
    <!--我是div-->
    </div>    
    """
    
    # soup.find() hands back a bs4 Tag instance, so Tag methods/attributes
    # can be called on the result; the loop below inspects four of them.
    soup = BeautifulSoup(html, "lxml")
    tag = soup.find("div")
    # In order: the concrete type (<class 'bs4.element.Tag'>), .string (the
    # tag's single direct child — here a comment node), .contents (every
    # child as a list), and .children (the same children as an iterator).
    demos = (type(tag), tag.string, tag.contents, tag.children)
    for item in demos:
        print(item)

    3.爬取中国天气网并图文显示

    """Scrape weather.com.cn and chart the cities with the lowest temperatures."""
    import requests
    from bs4 import BeautifulSoup
    from pyecharts import Bar
    
    # Browser-like User-Agent so the site serves the normal page.
    HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36"}
    # Accumulates {"city", "min_temp", "max_temp"} records across all regions.
    ALL_DATA = []
    
    
    def detail_urls(url):
        """Parse one regional forecast page and append its rows to ALL_DATA.

        Each appended record is {"city": str, "min_temp": int, "max_temp": int}.
        """
        response = requests.get(url=url, headers=HEADERS)
        page = response.content.decode(encoding="utf-8")
        # The HK/Macao/Taiwan page ships a broken <table>; html5lib is the
        # parser lenient enough to repair the missing table tags.
        soup = BeautifulSoup(page, "html5lib")
        # First div with class "conMidtab" holds the forecast tables.
        container = soup.find("div", class_="conMidtab")
        for table in container.find_all("table"):
            # The first two rows are column headings, not data.
            rows = table.find_all("tr")[2:]
            for row_no, row in enumerate(rows):
                cells = row.find_all("td")
                # On a province's first row, cell 0 holds the province name,
                # so the city name sits in cell 1 instead.
                name_cell = cells[1] if row_no == 0 else cells[0]
                city = list(name_cell.stripped_strings)[0]
                # Temperatures are addressed from the row's right-hand side.
                low = list(cells[-2].stripped_strings)[0]
                high = list(cells[-5].stripped_strings)[0]
                ALL_DATA.append({"city": city, "min_temp": int(low), "max_temp": int(high)})
    
    
    def spider():
        """Crawl every regional page, then chart the ten coldest cities.

        Renders the chart to "temperature.html" in the working directory.
        """
        base_url = "http://www.weather.com.cn/textFC/{}.shtml"
        # The region list is small and fixed, so it is simply hard-coded.
        address = ["hb", "db", "hd", "hz", "hn", "xb", "xn", "gat"]
        for area in address:
            # detail_urls() appends parsed rows to ALL_DATA as a side effect;
            # it returns None, so its result is deliberately not kept (the
            # original bound it to an unused variable, which was misleading).
            detail_urls(base_url.format(area))
        # Sort ascending by minimum temperature and keep the ten coldest.
        ALL_DATA.sort(key=lambda data: data["min_temp"])
        datas = ALL_DATA[0:10]
        cities = list(map(lambda x: x["city"], datas))
        min_temp = list(map(lambda x: x["min_temp"], datas))
        max_temp = list(map(lambda x: x["max_temp"], datas))
        # Two bar series (low / high) over the same city axis.
        bar = Bar("中国最低气温排行榜")
        bar.add("最低气温", cities, min_temp, mark_line=["average"], mark_point=["max", "min"])
        bar.add("最高气温", cities, max_temp, mark_line=["average"], mark_point=["max", "min"])
        bar.render("temperature.html")
    
    
    if __name__ == '__main__':
        spider()
    View Code

    4.总结

    """Because the network is unreliable, a program must handle every
    exception that may occur in order to keep running normally."""
    from urllib.error import HTTPError, URLError
    from urllib.request import urlopen
    
    from bs4 import BeautifulSoup
    
    
    def getTitle(url):
        """Fetch *url* and return the first <h1> inside <body>, or None.

        Returns None both when the request fails (HTTP error status or a
        network-level failure) and when the page lacks a <body>/<h1> tag.
        """
        try:
            # urlopen raises HTTPError for bad statuses and URLError for
            # network-level failures (DNS, refused connection, ...); the
            # original caught only HTTPError, so a connection error crashed.
            html = urlopen(url)
        except (HTTPError, URLError):
            # Request failed: report it to the caller as a friendly None.
            return None
        try:
            # html is an http.client.HTTPResponse; read() yields bytes, which
            # BeautifulSoup parses with the stdlib "html.parser" backend.
            htmlTag = BeautifulSoup(html.read(), "html.parser")
            # Tag navigation: first <h1> inside <body>.
            title = htmlTag.body.h1
        except AttributeError:
            # The page may be missing <body> or <h1>; treat as "not found".
            return None
        # Success: hand back the located tag.
        return title
    
    # Run the fetch and react to the outcome.
    title = getTitle("http://jandan.net/")
    # None is a singleton: compare with "is", not "==" (original used ==).
    if title is None:
        print("Title could not be found")
    else:
        # Success: show the extracted tag.
        print(title)
    View Code

     map()函数简介: https://www.cnblogs.com/superxuezhazha/p/5714970.html

  • 相关阅读:
    源码分析— java读写锁ReentrantReadWriteLock
    7. SOFAJRaft源码分析—如何实现一个轻量级的对象池?
    深入理解Kafka必知必会(2)
    6. SOFAJRaft源码分析— 透过RheaKV看线性一致性读
    5. SOFAJRaft源码分析— RheaKV中如何存放数据?
    深入理解Kafka必知必会(1)
    4. SOFAJRaft源码分析— RheaKV初始化做了什么?
    3. SOFAJRaft源码分析— 是如何进行选举的?
    2. SOFAJRaft源码分析—JRaft的定时任务调度器是怎么做的?
    pinpoint1.8.5安装及使用指南
  • 原文地址:https://www.cnblogs.com/Guishuzhe/p/9835859.html
走看看 - 开发者的网上家园