zoukankan      html  css  js  c++  java
  • 02-正则和xpath

    一、正则基本回顾

    1.常用指令

    import re

    # Regex warm-up examples. Most are kept commented out for reference;
    # the active lines at the bottom are the ones actually executed.

    # Extract "python"
    # key = "javapythonc++php"
    # re.findall('python', key)[0]

    # Extract "hello world" (the sample HTML uses <h1> twice, no closing tag)
    # key = "<html><h1>hello world<h1></html>"
    # re.findall('<h1>(.*?)<h1>', key)[0]

    # Extract "170"
    # BUGFIX: the original pattern 'd+' matches literal "d" characters;
    # digits require the escaped class r'\d+'.
    # string = '我喜欢身高为170的女孩'
    # re.findall(r'\d+', string)[0]

    # Extract "http://" and "https://" ('s?' makes the s optional)
    # key = 'http://www.baidu.com and https://boob.com'
    # re.findall('https?://', key)

    # Extract "hello" — expected output <hTml>hello</HtMl>, i.e. a
    # case-insensitive tag match (re.I) would be needed here.
    # key = 'lalala<hTml>hello</HtMl>hahah'

    # Extract "hit." from the e-mail address.
    # BUGFIX: the trailing '.' must be escaped as r'\.'; unescaped it matches
    # any character, so the non-greedy original returned just "hi".
    key = 'bobo@hit.edu.com'  # want to match "hit."
    re.findall(r'h.*?\.', key)

    # Match "sas" and "saas" (one or two a's)
    # key = 'saas and sas and saaas'
    # re.findall('sa{1,2}s', key)

    # Match lines starting with "i" (re.M makes ^ anchor at each line start)
    # string = '''fall in love with you
    # i love you very much
    # i love she
    # i love her'''
    # re.findall('^i.*', string, re.M)

    # Match the whole text across lines (re.S makes '.' also match '\n')
    string1 = """<div>细思极恐
    你的队友在看书
    你的闺蜜在减肥
    你的敌人在磨刀
    隔壁老王在炼药
    </div>"""
    re.findall('.*', string1, re.S)
    

    2.示例

    # Scrape all image data under the Qiushibaike "pic" section.
    import re
    import requests
    import os

    # 1. Checked that the page is statically rendered (img src values appear
    #    in the raw HTML), so plain requests + regex is enough.
    # 2. Fetch the page source.
    os.makedirs('qiutu', exist_ok=True)  # idempotent; replaces exists()+mkdir

    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    url = 'https://www.qiushibaike.com/pic/'
    page_text = requests.get(url=url, headers=headers).text

    # 3. Parse the src attribute out of every thumbnail <img>.
    ex = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'
    img_url_list = re.findall(ex, page_text, re.S)
    for img_url in img_url_list:
        img_url = 'https:' + img_url  # src values are protocol-relative
        imgPath = 'qiutu/' + img_url.split('/')[-1]
        # 4./5. Request the image and persist it to disk.
        # BUGFIX: urllib's urlretrieve sent no User-Agent, unlike every other
        # request in this script; download via requests with the same headers.
        img_data = requests.get(url=img_url, headers=headers).content
        with open(imgPath, 'wb') as fp:
            fp.write(img_data)
        print(imgPath + '下载成功!!!')
    

      

    二、xpath

    1.环境安装:

    pip install lxml
    

    2.xpath解析原理:

    • 标签定位：先定位到目标标签，再提取其文本或属性
    • 通过xpath表达式进行标签的定位
    • xpath表达式必须作用在xpath函数中
    • xpath函数被封装在etree对象中

    3.示例

    1.简历模板爬取
    # Scrape free resume templates from sc.chinaz.com (single page).
    import requests
    import os
    from lxml import etree
    import random

    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    url = 'http://sc.chinaz.com/jianli/free.html'
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'  # force utf-8 so the template names decode correctly
    page_text = response.text

    os.makedirs('jianli', exist_ok=True)  # idempotent; replaces exists()+mkdir

    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        detail_url = div.xpath('./a/@href')[0]
        name = div.xpath('./a/img/@alt')[0]

        detail_page_text = requests.get(url=detail_url, headers=headers).text
        # BUGFIX: use a separate variable — the original rebound `tree` here,
        # shadowing the listing-page tree (harmless only because div_list had
        # already been materialized, but fragile under any refactor).
        detail_tree = etree.HTML(detail_page_text)
        download_url_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
        # Pick a random mirror to spread load across the download servers.
        download_url = random.choice(download_url_list)

        jianli_data = requests.get(url=download_url, headers=headers).content

        file_path = 'jianli/' + name + '.rar'
        with open(file_path, 'wb') as fp:
            fp.write(jianli_data)
        print(file_path + '下载成功')
    
    
    
    
    ###### Multi-page version of the resume-template scraper
    import requests
    import os
    from lxml import etree
    import random

    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'Connection':'close'  # avoid exhausting the connection pool across many requests
    }
    start_page = 1
    end_page = 5

    os.makedirs('jianli', exist_ok=True)  # idempotent; replaces exists()+mkdir

    # Page 1 has a distinct URL; pages 2+ follow the free_%d.html template.
    url = 'http://sc.chinaz.com/jianli/free_%d.html'

    for page in range(start_page, end_page + 1):
        if page == 1:
            new_url = 'http://sc.chinaz.com/jianli/free.html'
        else:
            new_url = url % page  # FIX: original wrapped this in a redundant format()

        response = requests.get(url=new_url, headers=headers)
        response.encoding = 'utf-8'  # force utf-8 so the template names decode correctly
        page_text = response.text

        tree = etree.HTML(page_text)
        div_list = tree.xpath('//div[@id="container"]/div')
        for div in div_list:
            detail_url = div.xpath('./a/@href')[0]
            name = div.xpath('./a/img/@alt')[0]

            detail_page_text = requests.get(url=detail_url, headers=headers).text
            # BUGFIX: separate variable instead of shadowing the listing-page `tree`.
            detail_tree = etree.HTML(detail_page_text)
            download_url_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
            # Pick a random mirror to spread load across the download servers.
            download_url = random.choice(download_url_list)

            jianli_data = requests.get(url=download_url, headers=headers).content

            file_path = 'jianli/' + name + '.rar'
            with open(file_path, 'wb') as fp:
                fp.write(jianli_data)
            print(file_path + '下载成功')
    

      

    幻想毫无价值,计划渺如尘埃,目标不可能达到。这一切的一切毫无意义——除非我们付诸行动。
  • 相关阅读:
    makefile文件编写
    soem函数库的编译
    加秘钥的SSH
    ssh传文件
    ssh1
    安装paramiko的方法
    Ftp客户端(上传文件)
    ftp服务端
    vi编辑器没有颜色的解决办法
    socket服务器
  • 原文地址:https://www.cnblogs.com/TodayWind/p/13767810.html
Copyright © 2011-2022 走看看