zoukankan      html  css  js  c++  java
  • python爬虫相关安装与应用

    1、mysql数据库用于存储大量数据。

    2、Navicat for MySQL:以图形和表格等形式管理数据库的工具。

    3、编程语言python3与环境配置

    4、PyCharm集成开发环境(社区版),不需要激活

    5、Python包管理器Anaconda3(爬虫主要用到两个包requests,pymysql)与环境配置(网上可找安装教程).

    链接:https://pan.baidu.com/s/1Zef6oPmtNZ4sWBXyAMBSgA
    提取码:am9q

    应用:

    1、正则表达式提取猫眼top100电影中的电影名称、主演和上映时间

    import pymysql
    import requests
    import re
    
    def get_text(url, timeout=10):
        """Download a page and return its body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout, requests waits indefinitely on a
                server that never answers.

        Returns:
            The response body decoded with the encoding detected from
            its content.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no response arrives within ``timeout``.
        """
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Decode with the encoding guessed from the body instead of the
        # one taken from the HTTP headers.
        response.encoding = response.apparent_encoding
        return response.text
    
    def parse_html(url, list):
        demo = get_text(url)
        patern = re.compile('class="name".*?title="(.*?)".*?:(.*?)s*?</p>.*?:(d{4}-d{2}-d{2})', re.S)
        results = re.findall(patern, demo)
        for result in results:
            list.append(result)
        return list
    
    # Walk the ten board pages; the offset query parameter grows by 10 per page.
    list = []
    for page in range(10):
        url = 'https://maoyan.com/board/4?offset='+str(page * 10)
        list = parse_html(url, list)
    
    # Show every collected record, followed by the total.
    for record in list:
        print(record)
    count = len(list)
    print("一共有"+str(count)+"条数据!")

    2、正则表达式提取西南大学讲座信息

    import requests
    import pymysql
    import re
    import os
    
    def get_text(url, timeout=10):
        """Download a page and return its body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout, requests waits indefinitely on a
                server that never answers.

        Returns:
            The response body decoded with the encoding detected from
            its content.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no response arrives within ``timeout``.
        """
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Decode with the encoding guessed from the body instead of the
        # one taken from the HTTP headers.
        response.encoding = response.apparent_encoding
        return response.text
    
    
    # Captures (bracketed text, title) for each list entry. The square
    # brackets are escaped so they match literally; unescaped they form a
    # character class and the first capture group disappears.
    LECTURE_PATTERN = re.compile(
        r'<li><span class="fr">\[(.*?)\].*?&nbsp;&nbsp;(.*?)</a>',
        re.S,
    )


    def parse_html(url, list):
        """Fetch one listing page and append its entries to ``list``.

        Args:
            url: Address of the listing page to download via ``get_text``.
            list: Accumulator that receives one ``(bracket_text, title)``
                tuple per match. The name shadows the builtin but is kept
                so existing callers continue to work.

        Returns:
            The same ``list`` object, extended in place.
        """
        demo = get_text(url)
        list.extend(LECTURE_PATTERN.findall(demo))
        return list
    
    
    # The first page has no numeric suffix; pages 2-4 use index_<n>.html.
    url = 'http://computer.swu.edu.cn/s/computer/kxyj2xsky/index.html'
    list = parse_html(url, [])
    for page_no in (2, 3, 4):
        url = "http://computer.swu.edu.cn/s/computer/kxyj2xsky/index_"+str(page_no)+".html"
        list = parse_html(url, list)
    
    # Show every collected record, followed by the total.
    for record in list:
        print(record)
    count = len(list)
    print("一共有"+str(count)+"条数据!")

    3、爬取图片

    import requests
    import pymysql
    import re
    import os
    
    def get_text(url, timeout=10):
        """Download a page and return its body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout, requests waits indefinitely on a
                server that never answers.

        Returns:
            The response body decoded with the encoding detected from
            its content.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no response arrives within ``timeout``.
        """
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Decode with the encoding guessed from the body instead of the
        # one taken from the HTTP headers.
        response.encoding = response.apparent_encoding
        return response.text
    
    
    # Captures (bracketed text, title) for each list entry. The square
    # brackets are escaped so they match literally; unescaped they form a
    # character class and the first capture group disappears.
    LECTURE_PATTERN = re.compile(
        r'<li><span class="fr">\[(.*?)\].*?&nbsp;&nbsp;(.*?)</a>',
        re.S,
    )


    def parse_html(url, list):
        """Fetch one listing page and append its entries to ``list``.

        Args:
            url: Address of the listing page to download via ``get_text``.
            list: Accumulator that receives one ``(bracket_text, title)``
                tuple per match. The name shadows the builtin but is kept
                so existing callers continue to work.

        Returns:
            The same ``list`` object, extended in place.
        """
        demo = get_text(url)
        list.extend(LECTURE_PATTERN.findall(demo))
        return list
    
    
    # The first page has no numeric suffix; pages 2-4 use index_<n>.html.
    url = 'http://computer.swu.edu.cn/s/computer/kxyj2xsky/index.html'
    list = parse_html(url, [])
    for page_no in (2, 3, 4):
        url = "http://computer.swu.edu.cn/s/computer/kxyj2xsky/index_"+str(page_no)+".html"
        list = parse_html(url, list)
    
    # Show every collected record, followed by the total.
    for record in list:
        print(record)
    count = len(list)
    print("一共有"+str(count)+"条数据!")
    import pymysql
    import requests
    from hashlib import md5
    import re
    import os
    
    # db = pymysql.connect('localhost', 'root', '1458555801', 'world')
    # print("数据库连接成功!")
    # print("---------------------------------------------------")
    # r = requests.get("https://python123.io/ws/demo.html")
    # print(r.text)
    
    # r = requests.get("https://python123.io/ws/demo.html")
    # print(r)
    # # 提取网页文本内容
    # print(r.text)
    # # 提取网页编码方式
    # print(r.encoding)
    # print(r.apparent_encoding)
    # r.encoding = r.apparent_encoding
    # # 打印状态码
    # print(r.status_code)
    # # 捕获异常
    # print(r.raise_for_status())
    
    def get_text(url, timeout=10):
        """Download a page and return its body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout, requests waits indefinitely on a
                server that never answers.

        Returns:
            The response body decoded with the encoding detected from
            its content.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no response arrives within ``timeout``.
        """
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Decode with the encoding guessed from the body instead of the
        # one taken from the HTTP headers.
        response.encoding = response.apparent_encoding
        return response.text
    
    # print(get_text('https://python123.io/ws/demo.html'))
    
    # demo = get_text('https://python123.io/ws/demo.html')
    # result = re.search('Th.*?ge', demo)
    # print(result)
    # print(result.group())
    # result2 = re.search('http.*?001', demo)
    # print(result2.group())
    # result3 = re.findall('<p.*?</p>', demo, re.S)
    # print(result3)
    
    # Compiled once so the pattern object can be reused for every page.
    #   .*?                   : lazily matches any run of characters
    #   [\u4e00-\u9fa5]       : matches a Chinese character (not used below)
    #   (\d{4}-\d{2}-\d{2})   : captures a YYYY-MM-DD date
    # The square brackets around the date are escaped so they match
    # literally, and \s / \d are written with their backslashes in a raw
    # string; without them the pattern looks for the letters "s" and "d".
    LECTURE_DATE_PATTERN = re.compile(
        r'<li><span\sclass="fr">\[(\d{4}-\d{2}-\d{2})\].*?&nbsp;&nbsp;(.*?)</a></li>',
        re.S,
    )


    def parse_html(url, list):
        """Fetch one listing page and append its dated entries to ``list``.

        Args:
            url: Address of the listing page to download via ``get_text``.
            list: Accumulator that receives one ``(date, title)`` tuple per
                match. The name shadows the builtin but is kept so existing
                callers continue to work.

        Returns:
            The same ``list`` object, extended in place.
        """
        demo = get_text(url)
        list.extend(LECTURE_DATE_PATTERN.findall(demo))
        return list
    
    # The first page has no numeric suffix; later pages look like
    # http://computer.swu.edu.cn/s/computer/kxyj2xsky/index_2.html
    url = 'http://computer.swu.edu.cn/s/computer/kxyj2xsky/index.html'
    list = parse_html(url, [])
    for page_no in (2, 3, 4):
        url = 'http://computer.swu.edu.cn/s/computer/kxyj2xsky/index_'+str(page_no) + '.html'
        list = parse_html(url, list)
    
    # Show every collected record, followed by the total.
    for record in list:
        print(record)
    count = len(list)
    print("一共有"+str(count)+"条数据!")
    
    # def download_image(url):
    #     r = requests.get(url)
    #     r.raise_for_status()
    #     save_image(r.content)
    #
    # def save_image(content):
    #     file_path = '{0}/{1}.{2}'.format('C:/Users/Think/Desktop/image', md5(content).hexdigest(), 'jpg')
    #     if not os.path.exists(file_path):
    #         with open(file_path, 'wb') as f:
    #             f.write(content)
    #             f.close()
    
    # for i in list:
    #     download_image(i)
    # print("下载成功")
  • 相关阅读:
    jmeter的插件安装
    linux下性能监控工具nmon的使用
    kafka如何保证不重复消费又不丢失数据_Kafka写入的数据如何保证不丢失?
    Goroutine和Panic
    go 并发有趣现象和要避开的坑
    Go语言宕机恢复(recover)——防止程序崩溃
    invalid character 'è' looking for beginning of value
    golang实现RPC的几种方式
    channl与select
    我要在栈上。不,你应该在堆上
  • 原文地址:https://www.cnblogs.com/nonames/p/11144193.html
Copyright © 2011-2022 走看看