zoukankan      html  css  js  c++  java
  • Python3.5爬取cbooo.cn数据并且同步到mysql中

    #!/usr/local/bin/python
    # -*- coding: utf-8 -*-
    # Python:                  3.5
    # Author:                  wucl(),zhenghai.zhang
    # Program:                 爬取CBO网站上所有电影的名称并写入数据库。
    # Version:                 0.1
    # History:                 2017.10.25
    
    import requests,time, pymysql, re, datetime
    from exchangelib import DELEGATE, Account, Credentials, Message, Mailbox, HTMLBody
    
    host = 'xxx'
    user = 'xxx'
    passwd = 'xxx'
    dbme = 'crawl'
    dbtarget = 'back_brace'
    table = 'movie_hotwords'
    tabledelta = 'movie_hotwords_delta'
    tablesync = 'slot_value'
    port = 3306
    tolist = ['xxx@xxx.com']
    
    def get_info():
        try:
            url = 'http://www.cbooo.cn/Mdata/getMdata_movie?area=50&type=0&year=0&initial=%E5%85%A8%E9%83%A8&pIndex=1'
            pData = requests.get(url).json()
            return pData['tPage'], pData['tCount']
        except:
            print("获取总页数和总电影数失败")
    
    
    def get_movies(page):
        try:
            url = 'http://www.cbooo.cn/Mdata/getMdata_movie?area=50&type=0&year=0&initial=%E5%85%A8%E9%83%A8&pIndex=' + str(page)
            pData = requests.get(url).json()
            movies_list = pData['pData']
            return movies_list
        except:
            print('获取第%s页电影列表失败' % page)
    
    
    def Movie_insert(host, user, passwd, dbme, port, table, movies_list):
        conn=pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset="utf8")
        cur=conn.cursor()
        new_movies = []
        punc = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.()::。·"
        punctuation = punc
        for movie in movies_list:
            try:
                movie['MovieName'] = re.sub(r"[%s]+" % punctuation, "", movie["MovieName"])
                cmd = 'insert into %s(movie_id, movie_name) values("%s", "%s")' % (table, movie['ID'], movie['MovieName'])
                cur.execute(cmd)
                new_movies.append(movie)
            except pymysql.Error:
                print(" "*20, movie['MovieName'], "already exists, skip……")
        cur.close()
        conn.commit()
        conn.close()
        return new_movies
    
    
    def Movie_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, movies_list, tablesync):
        conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset="utf8")
        cur = conn.cursor()
        cur.execute("delete from %s " % dbme+"."+tabledelta)
    
        for movie in movies_list:
            try:
                cmd = 'insert into %s(movie_id, movie_name) values("%s", "%s")' % (tabledelta, movie['ID'], movie['MovieName'])
                cmdsync = 'insert into %s(slot_type_id, slot_value, create_by, modify_by, gmt_create, gmt_modify, out_value) values("%s", "%s", "%s", "%s", "%s", "%s", "%s")' % (dbtarget+"."+tablesync, "xxxxxx", movie['MovieName'], "system", "system", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),"")
                cur.execute(cmd)
                cur.execute(cmdsync)
            except pymysql.Error:
                print(" " * 20, movie['MovieName'], "already exists, skip……")
    
        try:
            cmdbacktoskill = 'insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("BACKBRACE","testpass","SLOT","xxxxxx","init","SLOT_BACKBRACE_TESTPASS" ,"zhenghai.zhang","zhenghai.zhang","%s","%s")' % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            cmdskilltoskillpro = 'insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("SKILL","deploy","SLOT","xxxxxx","init","SLOT_SKILL_DEPLOY" ,"zhenghai.zhang","zhenghai.zhang","%s","%s")' % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    
            print(cmdbacktoskill)
            cur.execute(cmdbacktoskill)
            print(cmdskilltoskillpro)
            cur.execute(cmdskilltoskillpro)
        except pymysql.Error:
            print("write into back_brace.release_task error!!!")
        cur.close()
        conn.commit()
        conn.close()
    
    
    def Email(to, subject, body):
        creds = Credentials(
            username='xxxxxx',
            password='xxxxxx')
        account = Account(
            primary_smtp_address='xxx@xxx.com',
            credentials=creds,
            autodiscover=True,
            access_type=DELEGATE)
        m = Message(
            account=account,
            subject=subject,
            body=HTMLBody(body),
            to_recipients=[Mailbox(email_address=to)])
        m.send_and_save()
    
    
    if __name__ == '__main__':
        update_movies = []
        pages, counts = get_info()
        pages = 1
        for i in range(1,pages + 1):
            print("*"*30,i,"*"*30)
            movies_list = get_movies(i)
            new_movies = Movie_insert(host, user, passwd, dbme, port, table, movies_list)
            for new_movie in new_movies:
                print(new_movie['MovieName'],"Added")
                onemovie = {}
                onemovie["ID"] = new_movie["ID"]
                onemovie["MovieName"] = new_movie["MovieName"]
                update_movies.append(onemovie)
            time.sleep(1)
        print(update_movies)
    
        try:
            Movie_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, update_movies, tablesync)  # 将增加的电影写入movie_hotwords_delta表中
        except:
            print("Movie update and sync Error!")
    
        subject = '本次新增电影名称'
        body = "本次新增的电影名称为:<hr>"
        for movie in update_movies:
            body += movie["MovieName"] + "<br>"
        for to in tolist:
            Email(to, subject, body)

    欢迎大侠指点

  • 相关阅读:
    11.SpringBoot学习(十一)——JDBC之 Spring Boot JdbcTemplate
    10.SpringBoot学习(十)——JDBC之 Spring Boot Jpa
    9.SpringBoot学习(九)——Spring Boot Admin使用
    8.SpringBoot学习(八)——Spring Boot Actuator监控
    7.SpringBoot学习(七)——Spring Boot Email发送邮件
    5.Gradle组件说明
    3.Gradle构建Java项目
    6.Gradle多模块项目
    flash解析json格式
    java窗体与Flash交互
  • 原文地址:https://www.cnblogs.com/zhzhang/p/7766928.html
Copyright © 2011-2022 走看看