Python Web Crawler Study Notes (5)

Data parsing:

·Regular expressions:

Test1 (regex - string splitting):

Code:

import re

# Split a string
one = 'asdfsdfas'
# Split on every 's'
pattern = re.compile('s')
result = pattern.split(one)
print(result)

Output:

    ['a', 'df', 'dfa', '']
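
The same split is available without pre-compiling, and maxsplit caps how many splits are made. A minimal sketch of both variants:

import re

one = 'asdfsdfas'
# re.split is the one-shot equivalent of compile + split
print(re.split('s', one))               # ['a', 'df', 'dfa', '']
# maxsplit=1 stops after the first 's'
print(re.split('s', one, maxsplit=1))   # ['a', 'dfsdfas']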

Test2 (regex - matching Chinese characters):

Code 1:

import re

# Match Chinese characters
two = '<h2 tid="tid-YkerKe" id="hid-Htc8Nb">Test4(正则表达式 - 纯数字的正则):</h2>'

# Chinese characters occupy the Unicode range \u4e00-\u9fa5 (used the way [a-z] is for letters)
pattern = re.compile('[\u4e00-\u9fa5]')
result = pattern.findall(two)
print(result)

Output 1:

['正', '则', '表', '达', '式', '纯', '数', '字', '的', '正', '则']

Code 2:

import re

# Match Chinese characters
two = '<h2 tid="tid-YkerKe" id="hid-Htc8Nb">Test4(正则表达式 - 纯数字的正则):</h2>'

# Adding + matches consecutive Chinese characters as whole runs
pattern = re.compile('[\u4e00-\u9fa5]+')
result = pattern.findall(two)
print(result)

Output 2:

    ['正则表达式', '纯数字的正则']
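
The h2 string above mentions a pure-digit regex; the same findall approach covers that case. A minimal sketch (the sample text is made up for illustration):

import re

text = 'id-32040808 page=2 2021'
# \d+ matches runs of digits
print(re.findall(r'\d+', text))             # ['32040808', '2', '2021']
# fullmatch checks that a string is digits only
print(bool(re.fullmatch(r'\d+', '12345')))  # True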

Test3 (regex - fetching a website):

Code:

import requests

url = 'https://news.baidu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}
data = requests.get(url, headers=headers).content.decode()
with open('02news.html', 'w', encoding='utf-8') as f:
    f.write(data)

Output: the script writes the page to 02news.html; nothing is printed.

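If the decode fails or the saved page looks wrong, it usually pays to check the status code and encoding first. A minimal sketch, assuming the same URL and headers (these checks are an addition, not part of the original notes):

import requests

url = 'https://news.baidu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}
response = requests.get(url, headers=headers)
print(response.status_code)        # expect 200
print(response.apparent_encoding)  # encoding guessed from the body
# decode with the detected encoding instead of assuming UTF-8
data = response.content.decode(response.apparent_encoding)
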
Test4 (regex - simple news-page scraping):

Code 1:

# coding=gbk
import re
import requests

url = 'https://news.baidu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}
data = requests.get(url, headers=headers).content.decode()

# Target markup looks like:
# '<a href="http://politics.people.com.cn/n1/2021/0303/c1001-32040808.html" target="_blank" class="a3" mon="ct=1&amp;a=1&amp;c=top&amp;pn=0">人民的信心和支持就是我们国家奋进的力量</a>'
pattern = re.compile('<a href="(.*?)" target="_blank" mon="(.*?)">(.*?)</a>')
result = pattern.findall(data)
print(result)

Output 1:
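
Note that the sample anchor in the comment carries class="a3" between target and mon, so the strict pattern above skips it; only anchors with exactly that attribute order and spacing match. A looser sketch that tolerates extra attributes (this pattern is an assumption, not from the original notes):

import re
import requests

url = 'https://news.baidu.com/'
headers = {"User-Agent": "Mozilla/5.0"}
data = requests.get(url, headers=headers).content.decode()

# [^>]*? tolerates attributes (such as class="a3") between href and mon
pattern = re.compile('<a href="(.*?)"[^>]*?mon="(.*?)"[^>]*?>(.*?)</a>')
print(pattern.findall(data))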

Code 2:

# coding=gbk
import re
import requests

url = 'https://news.baidu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}
data = requests.get(url, headers=headers).content.decode()

# Target markup looks like:
# '<a href="http://politics.people.com.cn/n1/2021/0303/c1001-32040808.html" target="_blank" class="a3" mon="ct=1&amp;a=1&amp;c=top&amp;pn=0">人民的信心和支持就是我们国家奋进的力量</a>'
# Looser pattern: grab everything between <a and </a>
pattern = re.compile('<a(.*?)</a>')
result = pattern.findall(data)
print(result)

Output 2:
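
One pitfall with these patterns: . does not match newlines by default, so anchors whose content spans lines are silently skipped. Compiling with re.S fixes that. A minimal sketch:

import re

html = '<a href="/x">line one\nline two</a>'
print(re.findall('<a(.*?)</a>', html))        # [] - '.' stops at the newline
print(re.findall('<a(.*?)</a>', html, re.S))  # [' href="/x">line one\nline two']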

·XPath:

Test1 (basic XPath usage):

Code:

# coding=gbk
import requests
from lxml import etree

url = 'https://news.baidu.com/'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}
data = requests.get(url, headers=headers).content.decode()
# 1. Convert to a parseable type
xpath_data = etree.HTML(data)
# 2. Call the xpath method
result1 = xpath_data.xpath('/html/head/title/text()')
result2 = xpath_data.xpath('//a/text()')
result3 = xpath_data.xpath('//a[@mon="ct=1&a=1&c=top&pn=0"]/text()')
result4 = xpath_data.xpath('//a[@mon="ct=1&a=1&c=top&pn=0"]/@href')
result5 = xpath_data.xpath('//li/a/text()')

print(result1)
print(result2)
print(result3)
print(result4)
print(result5)

Output: five lists, one per query (contents vary with the live page).

Note:

XPath syntax:
1. Node: /
2. Skipping levels: //
3. Exact tag by attribute: //a[@attribute="value"]
4. Text wrapped by a tag: /text()
5. Attribute value: @href
6. xpath() always returns a list
XPath indices start at 1, and indexing only selects among sibling tags at the same level (see the sketch below).
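
These rules can be checked without a network request by parsing a small inline snippet. A minimal sketch (the HTML is made up for illustration):

from lxml import etree

html = '''
<html><body>
  <ul>
    <li><a href="/a" mon="pn=1">first</a></li>
    <li><a href="/b" mon="pn=2">second</a></li>
  </ul>
</body></html>'''

root = etree.HTML(html)
print(root.xpath('/html/body/ul/li/a/text()'))  # ['first', 'second'] - full path from the root
print(root.xpath('//a[@mon="pn=2"]/@href'))     # ['/b'] - exact tag by attribute
print(root.xpath('//li[1]/a/text()'))           # ['first'] - indices start at 1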

Test2 (practical example):

Using https://www.cnblogs.com/3cH0-Nu1L/default.html?page= as the example.

Code:

# coding=gbk
import requests
from lxml import etree


class BkySpider(object):
    def __init__(self):
        self.base_url = 'https://www.cnblogs.com/3cH0-Nu1L/default.html?page='
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
        }

    # 1. Send the request
    def get_response(self, url):
        response = requests.get(url, headers=self.headers)
        data = response.content.decode()
        return data

    # 2. Parse the data
    def parse_data(self, data):
        # Use XPath to pull every post title on the page
        # 1. Convert to a parseable type
        x_data = etree.HTML(data)
        # 2. Query by XPath path
        title_list = x_data.xpath('//a[@class="postTitle2 vertical-middle"]/text()')
        url_list = x_data.xpath('//a[@class="postTitle2 vertical-middle"]/@href')
        print(title_list)
        print(url_list)

    # 3. Save the data
    def save_data(self, data):
        with open('05bky.html', 'w', encoding='utf-8') as f:
            f.write(data)

    # 4. Run
    def run(self):
        # 1. Build the full URL
        url = self.base_url + '2'
        # 2. Send the request
        data = self.get_response(url)
        # 3. Parse
        self.parse_data(data)
        # 4. Save
        # self.save_data(data)


BkySpider().run()
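
The run() method hard-codes page 2. One way to cover several pages, assuming the BkySpider class above and that the page parameter keeps behaving the same, is a small subclass. A sketch:

# Hypothetical extension: walk the first three pages instead of only page 2
class BkySpiderPaged(BkySpider):
    def run(self):
        for page in range(1, 4):
            url = self.base_url + str(page)
            data = self.get_response(url)
            self.parse_data(data)


BkySpiderPaged().run()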