Python crawler: requests + logging + config file reading + MySQL insertion


    log_config.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # Logging helper: every logger writes to both the screen and a file (Python 2)
    import logging
    import sys

    reload(sys)
    sys.setdefaultencoding('utf-8')


    def getlogger(logName, logFile):
        logger = logging.getLogger(logName)
        logger.setLevel(logging.DEBUG)
        screenHandle = logging.StreamHandler()
        screenHandle.setLevel(logging.DEBUG)
        fileHandle = logging.FileHandler(logFile, 'a')
        fileHandle.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        screenHandle.setFormatter(formatter)
        fileHandle.setFormatter(formatter)
        logger.addHandler(fileHandle)
        logger.addHandler(screenHandle)
        return logger
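A minimal usage sketch for getlogger (the name and file below are just examples; the main script uses 'reference_mysql' and 'reference_mysql.log'):

    import log_config

    logger = log_config.getlogger('demo', 'demo.log')
    logger.info('fetching page 1')  # printed to the screen and appended to demo.log

Note that calling getlogger twice with the same logName attaches a second pair of handlers, so every message would then appear twice; the script below calls it only once, at import time.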

    mysql.conf 

    [mysql]
    user=your_user
    password=your_password
    database=your_database
    host=localhost
    port=3306
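ConfigParser returns every option as a string, which is why the script below casts the port with int(port) before handing it to pymysql. A quick sanity check, assuming mysql.conf sits in the working directory:

    import ConfigParser

    conf = ConfigParser.ConfigParser()
    conf.read("mysql.conf")
    print conf.get("mysql", "port")     # '3306' -- a string
    print conf.getint("mysql", "port")  # 3306 as an int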

    requests_to_mysql.py
    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    import ConfigParser
    import json
    import random
    import sys
    import time
    import pymysql
    import requests
    import log_config
    import datetime
    
    logger = log_config.getlogger('reference_mysql', 'reference_mysql.log')
    conf = ConfigParser.ConfigParser()
    conf.read("mysql.conf")
    user = conf.get("mysql", "user")
    password = conf.get("mysql", "password")
    database = conf.get("mysql", "database")
    host = conf.get("mysql", "host")
    port = conf.get("mysql", "port")
    siteURL = 'the request URL you want to crawl'
    fileurl = 'the domain to prepend when the crawled file paths need to be joined'
    
    headers = {'Host': 'the domain of the target site',
               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                             ' Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3103.400 QQBrowser/9.6.11372.400'}
    # The target site may offer many filter dimensions, so enumerate them all here; this usually includes a date range
    cate_dict = {'key': 'value'}

    module_dict = {'key': 'value'}

    industry_dict = {'key': 'value'}

    date_list = ['2018-10-10']
    
    date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    logger.info("start get %s data" % date)
    # The startup argument decides whether to crawl only today's data or all of the historical data.
    # sys.argv is a list; when launched with no arguments, sys.argv[0] is just the path of this file.
    if len(sys.argv) != 1:
        if sys.argv[1] == 'all':
            date = ''
        else:
            logger.info("input error, please input 'all'")
            exit()
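    # Example invocations:
    #   python requests_to_mysql.py        -> crawl only today's announcements
    #   python requests_to_mysql.py all    -> crawl every date in date_list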
    
    
    # Get the total number of pages
    def get_page(dates, category, mod, industry):
        data = {'seDate': dates,
                'pageNum': 1,
                'pageSize': 30,
                'category': cate_dict[category],
                'column': 'szse',
                'plate': mod,
                'tabName': 'fulltext',
                'trade': industry}
        req = requests.post(siteURL, headers=headers, data=data)
        content = req.text
        content = json.loads(content)
        # filelist = content['announcements']
        filesum = content['totalAnnouncement']
        # print filesum
        if filesum != 0:
            if filesum % 30 == 0:
                pages = filesum / 30
            else:
                pages = filesum / 30 + 1
            return pages
        else:
            return 0
    
    
    # Fetch one page of data
    def get_page_data(dates, category, page, module_type, industry):
        # The current time must be obtained this way, otherwise MySQL's datetime type rejects the parameter
        now_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        data = {'seDate': dates,
                'pageNum': page,
                'pageSize': 30,
                'category': cate_dict[category],
                'column': 'szse',
                'plate': module_type,
                'tabName': 'fulltext',
                'trade': industry}
        logger.info("getting page %s" % str(page))
        retries = 0
        content = ""
        while retries < 3:
            try:
                req = requests.post(siteURL, headers=headers, data=data)
                content = req.text
                break
            except Exception as e:
                logger.error("get data failed", e)
                retries += 1
                logger.info('req error retry %s ' % retries)
                # logger.info('req error retry %s '%retries)
                t = random.uniform(1, 2)
                time.sleep(t)
        try:
            content = json.loads(content)
            filelist = content['announcements']
            logger.info("filelist=%s" % len(filelist))
            page_datas = []
            for fileone in filelist:
                # 文件处理状态,mysql中的
                pro_status = 0
                # java中解析url重试次数,这里不用管,默认设为0
                retry_count = 0
                sec_code = fileone['secCode']
                sec_name = fileone['secName']
                announcement_title = fileone['announcementTitle']
                announcement_time = fileone['announcementTime']
                public_time = date_long_to_str(announcement_time)
                adjunct_url = fileurl + fileone['adjunctUrl']
                page_data = [category, cate_dict[category], industry_dict[industry], module_type, public_time, public_time,
                             sec_code, sec_name, announcement_title, adjunct_url, pro_status, retry_count,
                             now_date, now_date]
                page_datas.append(page_data)
            if len(page_datas) > 0:
                set_data_mysql(page_datas)
    
        except Exception as e:
            logger.error(
                'get this page detail error... [cat:' + category + '  industry:' + industry + ''
                '  module_type:' + module_type + '  date:' + dates + ']', e)
    
    
    # Batch insert into MySQL
    def set_data_mysql(page_datas):
        # open a connection
        conn = pymysql.connect(host=host, port=int(port), user=user, passwd=password, db=database)
        # create a cursor
        cursor = conn.cursor()
        sql = "INSERT INTO test(the 14 column names go here) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        effect_row = cursor.executemany(sql, page_datas)
        # commit the SQL; without a commit nothing reaches MySQL
        conn.commit()
        logger.info("already into database %s" % effect_row)
        # # The two lines below do a single-row insert instead
        # # listOne = ('年度报告', 'category_ndbg_szsh;', dt)
        # # effect_row = cursor.execute(sql, listOne)
        # conn.commit()  # a commit is required for the data to reach the database
        # print effect_row
        cursor.close()
        conn.close()
    
    
    # Convert a long timestamp in milliseconds to a time string, e.g. 1539001526000 -> 2018-10-08 20:25:26
    def date_long_to_str(long_date):
        if long_date == "" or long_date == 0:
            return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        format_time = time.localtime(long(long_date) / 1000)
        time_str = time.strftime("%Y-%m-%d %H:%M:%S", format_time)
        return time_str
    
    
    # Crawl every combination of filters in a loop
    def collect_cate():
        if date == '':
            for seDate in date_list:
                for mod in module_dict:
                    for category in cate_dict:
                        for industry in industry_dict:
                            # logger.info("category=%s, mod=%s, industry=%s" % (category, mod, industry))
                            pages = get_page(seDate, category, module_dict[mod], industry)
                            # logger.info("pages = %s" % pages)
                            for page in range(1, pages + 1):
                                get_page_data(seDate, category, page, module_dict[mod], industry)
        else:
            for mod in module_dict:
                for category in cate_dict:
                    for industry in industry_dict:
                        # logger.info("category = %s, mod=%s, industry=%s" % (category, mod, industry))
                        pages = get_page(date, category, module_dict[mod], industry)
                        # logger.info("pages = %s" % pages)
                        if pages != 0:
                            for page in range(1, pages + 1):
                                get_page_data(date, category, page, module_dict[mod], industry)
    
    
    if __name__ == "__main__":
        collect_cate()
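The INSERT in set_data_mysql expects a test table with 14 columns in the same order as each page_data row (public_time and now_date are each written twice). The original post elides the column names, so the sketch below is only a hypothetical schema that matches that order; the names and types are assumptions, not the author's actual DDL:

    # create_table.py -- hypothetical one-off setup; column names and types are assumed
    import pymysql

    ddl = """
    CREATE TABLE IF NOT EXISTS test (
        category           VARCHAR(64),   -- category key
        category_code      VARCHAR(64),   -- cate_dict[category]
        industry           VARCHAR(64),   -- industry_dict[industry]
        module_type        VARCHAR(64),
        public_time        DATETIME,
        publish_time       DATETIME,      -- public_time is inserted twice
        sec_code           VARCHAR(16),
        sec_name           VARCHAR(64),
        announcement_title VARCHAR(512),
        adjunct_url        VARCHAR(512),
        pro_status         INT,
        retry_count        INT,
        create_time        DATETIME,      -- now_date
        update_time        DATETIME       -- now_date
    )
    """

    conn = pymysql.connect(host='localhost', port=3306, user='your_user',
                           passwd='your_password', db='your_database')
    cursor = conn.cursor()
    cursor.execute(ddl)
    conn.commit()
    cursor.close()
    conn.close()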
Original post: https://www.cnblogs.com/keepMoveForevery/p/9777155.html