zoukankan      html  css  js  c++  java
  • 千万条用户特征数据写入 MySQL

    from mysql_tool import *
    import copy
    
    # Template INSERT statement for `qqzone`.`myu`.  It serves only as the
    # source of the table's column list; its VALUES part is discarded below.
    # Written as adjacent string literals so the value does not depend on
    # source indentation and no newline stripping is required.
    s = (
        "INSERT INTO `qqzone`.`myu` (`id`, `uid`, `age`, `gender`, "
        "`marriageStatus`, `education`, `consumptionAbility`, `LBS`, "
        "`interest1`, `interest2`, `interest3`, `interest4`, `interest5`, "
        "`kw1`, `kw2`, `kw3`, `kw4`, `kw5`, "
        "`topic1`, `topic2`, `topic3`, `topic4`, `topic5`, "
        "`ct`, `appIdAction`, `appIdInstall`, `os`, `carrier`, `house`) "
        "VALUES ('33', NULL, NULL, NULL, NULL, NULL, NULL, NULL, "
        "NULL, NULL, NULL, NULL, NULL, NULL, NULL, "
        "NULL, NULL, NULL, NULL, NULL, NULL, NULL, "
        "NULL, NULL, NULL, NULL, NULL, NULL, NULL);"
    )

    # INSERT prefix used for the bulk load: everything before VALUES, with the
    # `id` column removed so that inserted rows carry no explicit id.
    indb_fields_s = '{}{}'.format(s.split('VALUES')[0], ' VALUES ').replace('`id`,', '')

    # Bare column names (including `id`) in table order, taken from the first
    # parenthesised group of the template.
    fields_l = [
        name.replace(' ', '').replace('`', '')
        for name in s.split('(')[1].split(')')[0].split(',')
    ]

    # Per-row defaults: every column except `id` starts out as the string
    # 'NULL'.  Key order follows the column order of `indb_fields_s`, which the
    # row serialisation further down relies on.
    val_d = {name: 'NULL' for name in fields_l if name != 'id'}
    
    # Load 'userFeature.data' into `qqzone`.`myu` in batches of `indb_step` rows.
    #
    # Each input line is a '|'-separated list of "<field> <value...>" items.
    # Fields absent from a line keep the 'NULL' default taken from `val_d`.
    indb_step, indb_step_s = 2000, ''
    f = 'userFeature.data'


    def _user_rows_to_sql(rows):
        """Return one multi-row INSERT statement for `rows`.

        `rows` is a list of dicts whose values are emitted in dict order, so
        each dict must keep the column order of `indb_fields_s`.

        NOTE(review): every value, including the 'NULL' default, is wrapped in
        double quotes, so a missing field is sent as the string "NULL" rather
        than SQL NULL, and values are not escaped -- confirm this is intended.
        """
        tuples = ['("{}")'.format('","'.join(row.values())) for row in rows]
        return '{}{};'.format(indb_fields_s, ','.join(tuples))


    def _write_or_log(statement):
        """Run `statement` through mysql_write(); on failure print it and go on."""
        try:
            mysql_write(statement)
        except Exception as e:  # exception types of mysql_write are not visible here
            # str(e) is required: str.join() rejects non-string items.
            print('||'.join([str(e), statement]))


    with open(f, 'r') as fr:
        sql_d_l, indb_step_c = [], 0
        for i in fr:
            # NOTE(review): this `break` leaves the loop before the first line
            # is processed, i.e. the userFeature import is currently disabled.
            # Remove it to actually run the load.
            break
            sql_d = dict(val_d)  # flat dict of strings: a shallow copy suffices
            for ii in i.replace('\n', '').split('|'):
                try:
                    ix_ = ii.index(' ')
                except ValueError as e:
                    # Item without a "<field> <value>" separator: report and skip.
                    print(e)
                    continue
                sql_d[ii[:ix_]] = ii[ix_ + 1:]

            sql_d_l.append(sql_d)
            indb_step_c += 1
            if indb_step_c % indb_step == 0:
                indb_step_s = _user_rows_to_sql(sql_d_l)
                _write_or_log(indb_step_s)
                # The batch is dropped whether or not the write succeeded.
                sql_d_l, indb_step_c = [], 0

    # Flush the final, partially filled batch.
    if sql_d_l:
        indb_step_s = _user_rows_to_sql(sql_d_l)
        _write_or_log(indb_step_s)
        indb_step_c = 0
    
    def myindb(f, indb_fields_s, indb_step=2000):
        """Bulk-load a comma-separated text file through mysql_write().

        Lines are split on ',' and every field is emitted as a double-quoted
        value.  Rows are grouped into multi-row INSERT statements of at most
        `indb_step` rows each.

        Args:
            f: Path of the input file.
            indb_fields_s: INSERT prefix up to and including ``VALUES ``; the
                row tuples are appended directly to it.
            indb_step: Number of rows per INSERT statement.

        Lines containing the text 'aid' are skipped (presumably the header row
        of the files this is used with), as are blank lines.  If mysql_write()
        raises, the error text and the statement are printed joined by '||',
        the batch is dropped and loading continues with the next batch.

        NOTE(review): values are interpolated into the SQL text without
        escaping; only use this with trusted input files.
        """

        def flush(rows):
            # Build and send one multi-row INSERT for `rows`.
            tuples = ['("{}")'.format('","'.join(row)) for row in rows]
            statement = '{}{};'.format(indb_fields_s, ','.join(tuples))
            try:
                mysql_write(statement)
            except Exception as e:  # exception types of mysql_write are not visible here
                # str(e) is required: str.join() rejects non-string items.
                print('||'.join([str(e), statement]))

        sql_l = []
        with open(f, 'r') as fr:
            for i in fr:
                if 'aid' in i:
                    continue
                line = i.replace('\n', '')
                if not line.strip():
                    # A blank line would otherwise become an invalid ("") row.
                    continue
                sql_l.append(line.split(','))
                if len(sql_l) % indb_step == 0:
                    flush(sql_l)
                    # Always start a fresh batch, so a failed batch is not
                    # resent as part of an ever-growing later one.
                    sql_l = []
        # Flush the final, partially filled batch.
        if sql_l:
            flush(sql_l)
    
    
    # Settings for the three CSV loads done with myindb().  Each section only
    # rebinds the shared variables; the myindb() calls themselves are commented
    # out, so running this part of the script writes nothing.

    # Job 1: train.csv -> `qqzone`.`myt`
    f = 'train.csv'
    indb_step = 10000
    indb_step_s = ''
    indb_fields_s = (
        'INSERT INTO `qqzone`.`myt` '
        '( `aid`, `uid`, `label`) VALUES '
    )
    # myindb(f, indb_fields_s, indb_step)

    # Job 2: test1.csv -> `qqzone`.`myr`
    # The file assignment is disabled as well, so `f` still names train.csv here.
    # f = 'test1.csv'
    indb_step = 10000
    indb_step_s = ''
    indb_fields_s = (
        'INSERT INTO `qqzone`.`myr` '
        '( `aid`, `uid`) VALUES '
    )
    # myindb(f, indb_fields_s, indb_step)

    # Job 3: adFeature.csv -> `qqzone`.`myadf`
    f = 'adFeature.csv'
    indb_step = 200
    indb_step_s = ''
    indb_fields_s = (
        'INSERT INTO `qqzone`.`myadf` '
        '(`aid`, `advertiserId`, `campaignId`, `creativeId`, `creativeSize`,'
        '`adCategoryId`, `productId`, `productType`) VALUES '
    )
    # myindb(f, indb_fields_s, indb_step)

    总的字段数和字段名是确定的,但每行数据可能缺失某些字段。

  • 相关阅读:
    【BZOJ3270】【高斯消元】博物馆
    【CODECHEF】【phollard rho + miller_rabin】The First Cube
    【BZOJ3884】【降幂大法】上帝与集合的正确用法
    【CF521C】【排列组合】Pluses everywhere
    mfc的任务栏的隐藏和显示
    Git配置过程
    AOP概念和7个专业术语
    文件操作IO流
    可扩展标记性语言XML
    深入理解多态
  • 原文地址:https://www.cnblogs.com/rsapaper/p/8973750.html
Copyright © 2011-2022 走看看