  • Python crawler: lxml parsing in practice

    Common XPath rules
    /                    select direct children of the current node
    //                   select descendants of the current node
    .                    select the current node
    ..                   select the parent of the current node
    @                    select an attribute
    *                    wildcard: matches any element node
    @*                   select all attributes
    [@attrib]            select all elements carrying the given attribute
    [@attrib='value']    select all elements whose attribute equals the given value
    [tag]                select all nodes that have the given element as a direct child
    [tag='text']         select all nodes whose given child element has the text "text"
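
    To make the rules concrete, here is a minimal sketch (the HTML snippet and variable names are invented for illustration) that exercises a few of them with lxml:

    from lxml import etree

    # A tiny made-up document to try the rules above
    doc = etree.HTML("""
    <div id="box">
      <ul class="lists">
        <li data-title="A"><a href="/a">first</a></li>
        <li data-title="B"><a href="/b">second</a></li>
      </ul>
    </div>""")

    print(doc.xpath("//li"))                   # // selects descendants at any depth
    print(doc.xpath("//ul/li"))                # / selects direct children
    print(doc.xpath("//li/@data-title"))       # @ selects attributes -> ['A', 'B']
    print(doc.xpath("//li[@data-title='A']"))  # [@attrib='value'] filters by value
    print(doc.xpath("//ul[li]"))               # [tag] keeps nodes with a li child
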
    """爬取豆瓣网站的信息"""
    import requests
    from lxml import etree
    
    # Request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
        "Referer": "https://movie.douban.com/",
    }
    
    url = "https://movie.douban.com/cinema/nowplaying/chongqing/"
    # Send the request
    rep = requests.get(url, headers=headers)
    text = rep.text
    # Parse the response text into an lxml HTML tree
    html = etree.HTML(text)
    # Find the descendant ul tag with class 'lists'
    ul = html.xpath("//ul[@class='lists']")[0]
    # All li tags directly under this ul
    lis = ul.xpath("./li")
    movies = []
    # Loop over each li tag
    for li in lis:
        # Read the values straight from the li tag's data-* attributes with @
        title = li.xpath("@data-title")[0]
        score = li.xpath("@data-score")[0]
        region = li.xpath("@data-region")[0]
        actors = li.xpath("@data-actors")[0]
        director = li.xpath("@data-director")[0]
        liimg = li.xpath(".//img/@src")  # list of poster image URLs under this li
        movie = {
            "title": title,
            "score": score,
            "region": region,
            "actors": actors,
            "director": director,
            "liimg": liimg,
        }
        movies.append(movie)
    print(movies)
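
    One thing to watch above: xpath() always returns a list, so indexing with [0] raises IndexError whenever a node is missing. A small helper along these lines (my own naming, not part of the original code) makes the loop body more forgiving:

    def first(node, path, default=""):
        # xpath() returns a list; take the first hit or fall back to a default
        results = node.xpath(path)
        return results[0] if results else default

    # e.g. inside the loop: title = first(li, "@data-title")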

    电影天堂 (Movie Paradise)

    import requests
    from lxml import etree
    
    BASE_DOMAIN = "http://www.ygdy8.net"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
    }
    
    
    def get_detail_urls(url):
        # Fetch the list page
        rep = requests.get(url=url, headers=HEADERS)
        # Small pitfall: the page contains bytes that are invalid gbk, so pass "ignore" to drop them
        text = rep.content.decode("gbk", "ignore")
        html = etree.HTML(text)
        # The layout is regular: grab the href of every a tag under the movie tables
        detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
        # map applies the lambda to each relative URL, returning an iterator of absolute detail URLs
        detail_urls = map(lambda url: BASE_DOMAIN + url, detail_urls)
        # Return the joined detail-page URLs
        return detail_urls
    
    
    def parse_detail_page(url):
        # Scrape the info on one detail page
        movie = {}
        res = requests.get(url, headers=HEADERS)
        text = res.content.decode("gbk", "ignore")  # detail pages can also contain invalid gbk bytes
        html = etree.HTML(text)
        title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
        movie["title"] = title
        zoomE = html.xpath("//div[@id='Zoom']")[0]
        # All img src values under the Zoom div
        imgs = zoomE.xpath(".//img/@src")
        # Slice instead of indexing so a missing image gives an empty list, not an IndexError
        cover = imgs[0:1]
        movie["cover"] = cover
        poster = imgs[1:2]
        movie["poster"] = poster
        infos = zoomE.xpath(".//text()")
    
        def parse_info(info, rule):
            # The same strip-the-label step repeats for every field, so factor it into a helper
            return info.replace(rule, "").strip()
    
        for index, info in enumerate(infos):
            if info.startswith("◎年  代"):
                text = parse_info(info, "◎年  代")
                movie["year"] = text
            elif info.startswith("◎产  地"):
                text = parse_info(info, "◎产  地")
                movie["country"] = text
            elif info.startswith("◎类  别"):
                text = parse_info(info, "◎类  别")
                movie["category"] = text
            elif info.startswith("◎豆瓣评分"):
                text = parse_info(info, "◎豆瓣评分")
                movie["douban_rating"] = text
            elif info.startswith("◎片  长"):
                text = parse_info(info, "◎片  长")
                movie["duration"] = text
            elif info.startswith("◎导  演"):
                text = parse_info(info, "◎导  演")
                movie["director"] = text
            elif info.startswith("◎主  演"):
                text = parse_info(info, "◎主  演")
                actors = [text]
                for x in range(index + 1, len(infos)):
                    actor = infos[x].strip()
                    if actor.startswith("◎标"):
                        break
                    actors.append(actor)
                # Assign after the loop so the lead actor is kept even when no co-stars follow
                movie["actors"] = actors
            elif info.startswith("◎简  介"):
                text = parse_info(info, "◎简  介")
                for x in range(index+1, len(infos)):
                    profile = infos[x].strip()
                    if profile.startswith("◎获奖情况"):
                        break
                    movie["profile"] = profile
        download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
        movie["download_url"] = download_url
        return movie
    
    
    def spider():
        base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
        movies = []
        # Build the URL of each list page to crawl
        for i in range(1, 180):
            url = base_url.format(i)
            # Collect the detail-page links from this list page
            detail_urls = get_detail_urls(url)
            # Visit every detail URL found
            for detail_url in detail_urls:
                # Parse the detail page and collect its data
                movie = parse_detail_page(detail_url)
                movies.append(movie)
        print(movies)
    
    
    if __name__ == '__main__':
        spider()
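
    The gbk pitfall noted in get_detail_urls bites on many older Chinese sites. A short sketch of two ways to handle it (apparent_encoding is requests' charset guess from the response body; the URL is the one used above):

    import requests

    rep = requests.get("http://www.ygdy8.net")
    # Option 1: decode manually and silently drop the few invalid bytes
    text = rep.content.decode("gbk", "ignore")
    # Option 2: let requests re-guess the charset, then read rep.text as usual
    rep.encoding = rep.apparent_encoding
    text = rep.text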

    猫眼电影 (Maoyan Movies)

    """猫眼电影爬取"""
    import requests
    from lxml import etree
    
    BASE_URL = "http://maoyan.com"
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36"
    }
    
    
    def get_detail_urls(url):
        # Collect the detail-page URLs from one list page
        rep = requests.get(url=url, headers=HEADERS)
        html = etree.HTML(rep.text)
        # The detail links hang off each movie-item div
        detail_urls = html.xpath("//dl//div[@class='movie-item']/a/@href")
        detail_urls = map(lambda url: BASE_URL+url, detail_urls)
        return detail_urls
    
    
    def parse_detail_page(url):
        # Scrape one movie's detail page
        movie = {}
        res = requests.get(url=url, headers=HEADERS)
        text = res.content.decode("utf-8")
        html = etree.HTML(text)
        name = html.xpath("//div[@class='movie-brief-container']/h3/text()")[0]
        movie["name"] = name
        lis = html.xpath("//div[@class='movie-brief-container']//li")
        # The first three li tags hold the genre, country/duration, and release time
        for i in range(len(lis)):
            if i == 0:
                movie["plot"] = lis[i].xpath("./text()")[0]
            if i == 1:
                movie["country"] = lis[i].xpath("./text()")[0].split()[0]
                movie["duration"] = lis[i].xpath("./text()")[0].split()[1]
            if i == 2:
                try:
                    movie["release_time"] = lis[i].xpath("./text()")[0]
                except IndexError:
                    # Some pages omit the release date; skip it
                    continue
    
        avatar = html.xpath("//div[@class='avatar-shadow']/img/@src")
        movie["avatar"] = avatar
        content = html.xpath("//div[@class='mod-content']/span/text()")[0]
        movie["content"] = content
        container = html.xpath("//div[@class='comment-list-container']/ul")
        # Pair each commenter's name with their comment text
        for ul in container:
            names = ul.xpath(".//span[@class='name']/text()")
            contents = ul.xpath(".//div[@class='comment-content']/text()")
            movie["user"] = dict(zip(names, contents))
        return movie
    
    
    def spider():
        # Build the list-page URLs; offset pages through 30 films at a time
        base_url = "http://maoyan.com/films?showType=1&offset={}"
        movies = []
        for i in range(0, 31, 30):
            url = base_url.format(i)
            # From each list page, find the detail-page URLs
            detail_urls = get_detail_urls(url)
            for detail_url in detail_urls:
                # Scrape each detail page
                movie = parse_detail_page(detail_url)
                movies.append(movie)
                print(movie)
        print(movies)
    
    
    if __name__ == '__main__':
        spider()
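
    The comment-scraping step pairs two parallel lists with zip and turns the pairs into a dict; in isolation the pattern looks like this (sample data invented):

    names = ["user1", "user2"]
    comments = ["good", "great"]
    # zip pairs the items positionally; dict() consumes the (key, value) pairs
    print(dict(zip(names, comments)))  # {'user1': 'good', 'user2': 'great'}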

    腾讯招聘网 (Tencent Careers)

    """爬取腾讯招聘网找工作"""
    import requests
    from lxml import etree
    
    HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
               "Referer": "https://hr.tencent.com/position.php?keywords=python&lid=2218&tid=87&start=0"
               }
    BASE_URL = "https://hr.tencent.com/"
    
    
    def get_detail_urls(url):
        rep = requests.get(url=url, headers=HEADERS)
        html = etree.HTML(rep.text)
        detail_urls = html.xpath("//table//td[@class='l square']/a/@href")
        detail_urls = map(lambda url: BASE_URL+url, detail_urls)
        return detail_urls
    
    
    def get_parse_detail(url):
        job_offers = {}
        res = requests.get(url=url, headers=HEADERS)
        html = etree.HTML(res.text)
        position = html.xpath("//table//td[@class='l2 bold size16']/text()")[0]
        job_offers["position"] = position
        tds = html.xpath("//table//tr[@class='c bottomline']/td/text()")
        # The first three cells hold the location, category, and number of openings
        if len(tds) >= 3:
            job_offers["location"] = tds[0]
            job_offers["category"] = tds[1]
            job_offers["recruits"] = tds[2]
        duties = html.xpath("//tr[3][contains(@class, 'c')]//li/text()")
        job_offers["duties"] = duties
        claim = html.xpath("//tr[4][contains(@class, 'c')]//li/text()")
        job_offers["claim"] = claim
        return job_offers
    
    
    def spider():
        base_url = "https://hr.tencent.com/position.php?keywords=python&lid=2218&tid=87&start={}#a"
        jobs = []
        for i in range(0, 340, 10):
            url = base_url.format(i)
            detail_urls = get_detail_urls(url)
            for detail_url in detail_urls:
                job = get_parse_detail(detail_url)
                jobs.append(job)
                print(job)
    
    
    if __name__ == '__main__':
        spider()
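
    All three spiders fire hundreds of requests in a tight loop. A minimal sketch of one common courtesy, a fixed pause between list pages (the one-second figure is arbitrary):

    import time

    import requests

    BASE_URL = "https://hr.tencent.com/position.php?keywords=python&start={}"

    for offset in range(0, 30, 10):
        rep = requests.get(BASE_URL.format(offset))
        print(rep.status_code)
        time.sleep(1)  # a polite one-second pause between pages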

    For a fuller reference, see this blog post (I'm too lazy to write it all up myself): http://www.cnblogs.com/zhangxinqi/p/9210211.html#_label11
