zoukankan      html  css  js  c++  java
  • lxml爬取实验

    1.豆瓣

    爬取单个页面数据

    import requests
    from lxml import etree
    #import os
    
    
    # Scrape the Douban "now playing" page for one city and print one dict per film.
    url = "https://movie.douban.com/cinema/nowplaying/yongzhou/"
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    req = requests.get(url=url, headers=headers)
    text = req.text
    dics = []
    # Parse the downloaded markup so it can be queried with XPath.
    html = etree.HTML(text)
    # Only the first <ul class="lists"> on the page is used.
    ul = html.xpath("//ul[@class='lists']")[0]
    lis = ul.xpath("./li")
    for li in lis:
        title = li.xpath("@data-title")[0]
        # The rating is read from data-score. The previous code read
        # data-actors here, which stored the cast list under 'score'.
        # NOTE(review): confirm the <li> still carries a data-score attribute.
        score = li.xpath("@data-score")[0]
        adress = li.xpath("@data-region")[0]
        img_hai = li.xpath(".//img/@src")[0]
        dic = {
            'title': title,
            'score': score,
            'adress': adress,   # key spelling kept as-is; it is part of the output
            'img': img_hai,
        }
        dics.append(dic)
    print(dics)

    2.电影天堂

    爬取多个页面数据

    import requests
    import json
    from lxml import etree
    # Site root; relative links scraped from list pages are appended to it.
    url = "http://www.dytt8.net"
    # Request headers sent with every fetch in this script.
    HEADERS = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/68.0.3440.106 Safari/537.36'
        ),
        'Referer': 'http://www.dytt8.net/html/gndy/dyzz/list_23_1.html',
    }
    
    def get_url(urls):
        """Fetch one list page and return the detail-page URLs found on it.

        ``urls`` is the address of a single list page. Every ``href`` found
        under a ``<table class="tbspan">`` is prefixed with the module-level
        ``url`` (the site root). The result is a lazy ``map`` iterator, so it
        can be consumed only once.
        """
        page = requests.get(urls, headers=HEADERS)
        # etree.HTML returns an Element, which supports XPath queries.
        tree = etree.HTML(page.text)
        hrefs = tree.xpath("//table[@class='tbspan']//a/@href")
        return map(lambda href: url + href, hrefs)
    
    def _parse_movie_fields(infos):
        """Extract the labelled "◎..." fields from the text nodes of the Zoom div.

        ``infos`` is the list of text nodes in document order. Returns a dict
        holding whichever of the keys 'pian', 'year', 'adress', 'actor', 'lb',
        'db', 'actors' and 'profile' were found.
        """
        # One-line fields: label on the page -> key in the result.
        # The key names (including 'actor' for the director line) are kept
        # unchanged because they are part of the written output.
        # NOTE(review): labels are copied exactly as they appear in this file;
        # confirm the spacing characters match the live page.
        simple_fields = {
            "◎片  名": 'pian',
            "◎年  代": 'year',
            "◎产  地": 'adress',
            "◎导  演": 'actor',
            "◎类  别": 'lb',
            "◎豆瓣评分": 'db',
        }
        actors_label = "◎主  演"
        profile_label = "◎简  介"

        fields = {}
        for index, raw in enumerate(infos):
            label = next((l for l in simple_fields if raw.startswith(l)), None)
            if label is not None:
                fields[simple_fields[label]] = raw.replace(label, "").strip()
            elif raw.startswith(actors_label):
                # The first actor shares the line with the label; the rest
                # follow one per text node until the next "◎" field begins.
                first = raw.replace(actors_label, "").strip()
                actors = [first] if first else []
                for follower in infos[index + 1:]:
                    name = follower.strip()
                    if name.startswith("◎"):
                        break
                    if name:
                        actors.append(name)
                fields['actors'] = actors
            elif raw.startswith(profile_label):
                # The synopsis spans the following text nodes, up to the next
                # labelled field or the download section.
                # NOTE(review): the "【下载地址】" marker is reconstructed (the
                # original literal was empty); confirm against the live page.
                paragraphs = []
                for follower in infos[index + 1:]:
                    chunk = follower.strip()
                    if chunk.startswith(("◎", "【下载地址】")):
                        break
                    if chunk:
                        paragraphs.append(chunk)
                fields['profile'] = "\n".join(paragraphs)
        return fields


    def parse_detail_url(de_ur):
        """Download one film detail page and return its data as a dict.

        The result always contains 'title', 'cover' and 'download_url', plus
        whichever labelled fields ``_parse_movie_fields`` finds in the page.

        Raises IndexError when the title, the Zoom div, the poster image or
        the download link is missing from the page.
        """
        movie = {}
        response = requests.get(de_ur, headers=HEADERS)
        # The page is decoded as GBK. Undecodable bytes are replaced instead
        # of raising, so one stray byte cannot abort the whole crawl.
        text = response.content.decode('gbk', errors='replace')
        html = etree.HTML(text)
        movie['title'] = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
        zoomE = html.xpath("//div[@id='Zoom']")[0]
        # The first image inside the Zoom div is used as the poster.
        movie['cover'] = zoomE.xpath(".//img/@src")[0]
        movie.update(_parse_movie_fields(zoomE.xpath(".//text()")))
        movie['download_url'] = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]
        return movie
    
    def write_to_file(content):
        """Append ``content`` to result.txt as one JSON document per line.

        ``content`` may be any JSON-serialisable value. The file is opened in
        append mode with UTF-8 encoding, and the ``with`` block closes it.
        """
        with open('result.txt', 'a', encoding='utf-8') as f:
            # ensure_ascii=False writes Chinese text as-is instead of \uXXXX escapes.
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
    
    def dianying():
        """Crawl list pages 1-7 and append every film record to the result file.

        For each page the page number is printed, a page marker line is
        written, and then each detail page is parsed and written in turn.
        """
        list_page_template = "http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
        collected = []
        for page_no in range(1, 8):
            print(page_no)
            # Page 5 is skipped on purpose: per the author's note it raised an
            # error (possibly an encoding problem) that was never resolved.
            if page_no != 5:
                page_url = list_page_template.format(page_no)
                detail_links = get_url(page_url)
                write_to_file("第%s页" % page_no)
                for link in detail_links:
                    record = parse_detail_url(link)
                    collected.append(record)
                    write_to_file(record)
    
    # Start the crawl only when this file is run as a script, not when imported.
    if __name__ == '__main__':
        dianying()

    3.腾讯招聘

    跟上一个电影天堂的代码差不多

    import requests
    import json
    from lxml import etree
    # Site root; relative links scraped from result pages are appended to it.
    url = "https://hr.tencent.com/"
    # Request headers sent with every fetch in this script.
    HEADERS = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/68.0.3440.106 Safari/537.36'
        ),
    }
    def get_url(urld):
        """Fetch one result page and return the job detail URLs found on it.

        Every ``href`` inside a table row with class ``even`` or ``odd`` is
        prefixed with the module-level ``url`` (the site root). The result is
        a lazy ``map`` iterator, so it can be consumed only once.
        """
        page = requests.get(urld, headers=HEADERS)
        tree = etree.HTML(page.text)
        hrefs = tree.xpath("//tr[@class='even' or @class='odd']//a/@href")
        return map(lambda href: url + href, hrefs)
    
    def prease_url(detail_url):
        """Download one job posting and return its fields as a dict.

        Keys of the result: 'title', 'adress', 'leibie', 'nums', '工作职责'
        and '工作要求'. The last two hold lists of text nodes; the others hold
        single text nodes.

        Raises IndexError when an expected element is missing from the page.
        """
        page = requests.get(detail_url, headers=HEADERS)
        tree = etree.HTML(page.text)

        heading = tree.xpath("//tr[@class='h']//td[@class='l2 bold size16']//text()")[0]

        # Approach 1: index straight into the flattened text nodes of the row.
        location = tree.xpath("//tr[@class='c bottomline']//td//text()")[1]

        # Approach 2: select the cells first, then take each cell's second text node.
        cells = tree.xpath("//tr[@class='c bottomline']//td")
        category = cells[1].xpath(".//text()")[1]
        headcount = cells[2].xpath(".//text()")[1]

        # The first "squareli" list is stored as duties, the second as requirements.
        sections = tree.xpath("//ul[@class='squareli']")
        duties = sections[0].xpath(".//text()")
        requirements = sections[1].xpath(".//text()")

        return {
            'title': heading,
            'adress': location,
            'leibie': category,
            'nums': headcount,
            '工作职责': duties,
            '工作要求': requirements,
        }
    
    def write_to_file(content):
        """Append ``content`` to tengxun.txt as one JSON document per line.

        ``content`` may be any JSON-serialisable value. The file is opened in
        append mode with UTF-8 encoding, and the ``with`` block closes it.
        """
        with open('tengxun.txt', 'a', encoding='utf-8') as f:
            # ensure_ascii=False writes Chinese text as-is instead of \uXXXX escapes.
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
    def tengxun():
        """Crawl the job search results and append every posting to the output file.

        Result pages are requested with ``start`` offsets 0, 10, ..., 500. For
        each page the offset is printed and written once as a marker line,
        then each posting is parsed and written as its own line.

        Returns the list of all parsed postings.
        """
        urls = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=87&start={}#a"
        movies = []
        for x in range(0, 501, 10):           # offsets advance by 10 per page
            print(x)
            detail_urls = get_url(urls.format(x))
            # Write the offset once per page. Previously the offset and the
            # whole accumulated list were re-written after every posting,
            # which duplicated every earlier record in the file.
            write_to_file(x)
            for detail_url in detail_urls:
                movie = prease_url(detail_url)
                movies.append(movie)
                write_to_file(movie)
        return movies
    
    # Start the crawl only when this file is run as a script, not when imported.
    if __name__ == '__main__':
        tengxun()
  • 相关阅读:
    整除15问题
    软件工程基础Proposal提议
    对在大学阶段软件工程实践的一些想法
    运行web项目端口占用问题
    Day_1
    error C3615: constexpr 函数 "QAlgorithmsPrivate::qt_builtin_ctz" 不会生成常数表达式
    Qt应用程序的打包
    将html代码部署到阿里云服务器,并进行域名解析,以及在部署过程中遇到的问题和解决方法
    linux部署html代码到linux服务器,并进行域名解析
    运行sudo apt-get install nginx时报错有几个软件包无法下载,要不运行 apt-get update 或者加上 --fix-missing 的选项再试试?解决
  • 原文地址:https://www.cnblogs.com/c-pyday/p/9760862.html
Copyright © 2011-2022 走看看