  • A native Python crawler example

    1. Project environment

      Python 3.7

      MySQL 5.6

      PyCharm 2020
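
      Apart from the standard library, the scripts below use the third-party packages pymysql and requests; assuming pip is available, a typical install is:

      pip install pymysql requests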

    2. Project structure

      

       crawl_blogs.py  main crawler script

       mysql_db.py  MySQL connection and insert/update/delete helper

       result.txt  storage file for the crawled content

       uploads  directory for downloaded files

    3. Database connection: mysql_db.py

    import pymysql

    readHost = '127.0.0.1'
    writeHost = '110.1.58.75'
    userName = 'root'
    passWord = 'abc@123456'
    dataBase = 'py_db'
    charset = 'utf8'
    # Run a SQL statement.
    # dbtype '' = write (INSERT/UPDATE/DELETE, committed), 'fetchone'/'fetchall' = read
    def run_sql(sql, dbtype=''):
        result = ()
        # Reads go to the read host, writes go to the write host
        if dbtype in ('fetchone', 'fetchall'):
            host = readHost
        else:
            host = writeHost
        try:
            db = pymysql.connect(host=host, user=userName, password=passWord,
                                 database=dataBase, port=3306, charset=charset)
        except pymysql.Error:
            print('Database connection failed, please retry')
            return result

        # Create a cursor object with cursor()
        cursor = db.cursor()
        try:
            # Execute the SQL statement
            cursor.execute(sql)
            if dbtype == 'fetchall':
                # Fetch all rows
                result = cursor.fetchall()
            elif dbtype == 'fetchone':
                # Fetch a single row
                result = cursor.fetchone()
            elif dbtype == '':
                # Commit the insert/update/delete
                db.commit()
        except pymysql.Error:
            db.rollback()  # roll back on error
            print("Error: unable to execute SQL")
        # Close the database connection
        db.close()
        return result
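
    A minimal usage sketch of run_sql, assuming a blogs table with title and href columns already exists (see section 7):

    import mysql_db

    # Read: fetch all rows (routed to the read host)
    rows = mysql_db.run_sql("SELECT title, href FROM blogs", 'fetchall')
    for row in rows:
        print(row)

    # Read: fetch a single row
    one = mysql_db.run_sql("SELECT COUNT(*) FROM blogs", 'fetchone')
    print(one)

    # Write: statements with the default dbtype='' go to the write host and are committed
    mysql_db.run_sql("UPDATE blogs SET update_time=0 WHERE href=''")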

    4. Main crawler script: crawl_blogs.py (crawling a blog as an example)

    import json
    import requests
    from requests.exceptions import RequestException
    import re
    import time
    import mysql_db
    import urllib.request
    import os
    import random
    
    save_path = 'result.txt'
    
    # Fetch the HTML of one listing page
    def get_one_page(url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None
    
    # Normalize a file URL: turn a relative path into an absolute one
    def get_file_path(file_url,prefix=''):
        if file_url.startswith('/https://') or file_url.startswith('/http://'):
            return file_url.lstrip('/')
        else:
            return prefix+file_url
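    # Illustration with hypothetical inputs (the prefix matches the one used in parse_one_page below):
    #   get_file_path('/https://cdn.example.com/a.jpg')                    -> 'https://cdn.example.com/a.jpg'
    #   get_file_path('uploads/article/a.jpg', 'https://baijunyao.com/')   -> 'https://baijunyao.com/uploads/article/a.jpg'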
    
    # Parse the HTML of the current page
    def parse_one_page(html):
        pattern = re.compile('<div class="row b-one-article">.*?"col-xs-12 col-md-12 col-lg-12".*?<a class="b-oa-title".*?href="(.*?)".*?>(.*?)</a>.*?<li class="col-xs-7 col-md-3 col-lg-3">.*?<i class="fa fa-calendar"></i>(.*?)</li>.*?<div class="col-xs-12 col-md-12 col-lg-12">.*?<img class="bjy-lazyload".*?data-src="(.*?)".*?>.*?</div>', re.S)
        items = re.findall(pattern, html)
        if items:
            for item in items:
                yield {
                    'href': item[0],
                    'title': item[1],
                    'time': item[2].strip()[2:],  # strip line breaks; [2:] drops the leading two characters
    
                    # 'img_url':  item[3] if item[3].find('://') != -1 else 'https://baijunyao.com/'+item[3]
                    'img_url':  get_file_path(item[3],'https://baijunyao.com/')
                }
    
    # Append one record to the result file
    def write_to_file(content):
        with open(save_path, 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
    
    # Crawl one listing page and save each parsed item
    def main(offset):
        url = 'https://baijunyao.com/?page=' + str(offset)
        html = get_one_page(url)
        for item in parse_one_page(html):
            print(item)
            write_to_file(item)
    
    # Read the result file and insert the records into the database
    def save_to_db(filepath):
        f = open(filepath,'rb')
    
        # lines = f.readlines()  # read the whole file at once
        # for line in lines:
        date = time.strftime('%Y%m%d')
        save_dir = os.path.abspath('.')+'/uploads/article/'+str(date)+'/'
        # Create the directory if it does not exist
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
    
        begin_time = time.time()
        i = 0
        for line in f:
            i+=1
            item = line.decode()
            item = json.loads(item)
    
            href = item['href']
            title = item['title']
            insert_time = item['time']
            timeArray = time.strptime(insert_time, "%y-%m-%d %H:%M:%S")  # parse into a time struct
            insert_time = int(time.mktime(timeArray))  # convert to a Unix timestamp
            img_url = item['img_url']
            # Download the cover image
            img_suffix = os.path.splitext(img_url)[-1]  # get the file extension
            file_name = str(int(time.time()))+'_'+str(random.randint(1000,9999))+img_suffix
            urllib.request.urlretrieve(img_url,save_dir+file_name)
    
            sql = "INSERT INTO blogs(title,href,insert_time,update_time,img_url) values('%s','%s',%d,%d,'%s')" %(title,href,insert_time,insert_time,img_url)
            print(sql)
            mysql_db.run_sql(sql)
    
        end_time = time.time()
        use_time = end_time-begin_time
        print('Import finished, ' + str(i) + ' rows in total, time: ' + str(use_time) + ' seconds')
    
    
    if __name__ == '__main__':
        # Crawl the data and save it to the result file
        begin_time = time.time()
        for i in range(1,28):
            main(i)
            #time.sleep(1)
        end_time = time.time()
        use_time = end_time-begin_time
        print('Crawling finished, total time: ' + str(use_time) + ' seconds')

        # Read the result file and insert the records into the database
        save_to_db(save_path)

    5. Crawl results
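
    Each line of result.txt is one JSON record with the fields produced by parse_one_page. An illustrative line (placeholder values, not real crawl output) looks roughly like:

    {"href": "https://baijunyao.com/article/1", "title": "Sample title", "time": "20-08-03 12:00:00", "img_url": "https://baijunyao.com/uploads/article/cover.jpg"}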

    6. Downloaded blog cover images
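
    Per save_to_db above, each cover image is saved into a per-day directory; the resulting path follows this pattern (actual values depend on the run time):

    uploads/article/<YYYYMMDD>/<unix_timestamp>_<random 1000-9999><original extension>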

    7. Database table storage
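
    The INSERT statement in save_to_db implies a blogs table roughly like the one below. This is a reconstruction for reference; the original schema (column types, lengths, extra fields) may differ:

    import mysql_db

    # Assumed schema, reconstructed from the INSERT in save_to_db
    mysql_db.run_sql("""
        CREATE TABLE IF NOT EXISTS blogs (
            id INT UNSIGNED NOT NULL AUTO_INCREMENT,
            title VARCHAR(255) NOT NULL DEFAULT '',
            href VARCHAR(255) NOT NULL DEFAULT '',
            insert_time INT UNSIGNED NOT NULL DEFAULT 0,
            update_time INT UNSIGNED NOT NULL DEFAULT 0,
            img_url VARCHAR(255) NOT NULL DEFAULT '',
            PRIMARY KEY (id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8
    """)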
