zoukankan      html  css  js  c++  java
  • 生成大量小文件 异常

    from ProjectUtil.usingModuleTOMODIFY import getNow
    from pymongo import MongoClient
    
    # MongoDB connection settings.
    # NOTE(review): the credentials are hard-coded in source; consider loading
    # them from the environment or a config file. If the username/password ever
    # contain reserved URI characters they must be percent-escaped first.
    host, username, password = '10.14.14.12', 'ain', 'ad'
    uri = "mongodb://%s:%s@%s" % (username, password, host,)

    # File paths, each paired with a list that starts out empty.
    q_f_export, q_export = '/data/bigdata/mongoexport/superpub-ask-question.csv', []
    q_f_mysql, q_mysql = '/data/bigdata/mongoexport/question.txt', []
    q_f_distinct, q_distinct = '{}-distinct'.format(q_f_export), []

    # Numeric ids for generated files start right after this value.
    # Presumably the largest question id already present in MySQL - TODO confirm.
    MYSQL_max_q_id = 3979647

    # Load the questions, one per line, dropping only the trailing newline so
    # any other whitespace in a question is kept. The `with` block closes the
    # file on exit, so no explicit close() is needed.
    with open(q_f_distinct, 'r', encoding='utf-8') as fr:
        q_distinct = [i.rstrip('\n') for i in fr]

    # Start marker printed alongside getNow() in the progress output.
    # NOTE(review): getNow() is a project helper; assumed to return the
    # current time - confirm.
    start_ = getNow()

    # Handle to the `ask` collection of the `superpub` database.
    mongo_client = MongoClient(uri)
    db = mongo_client.superpub
    mongo_collection = db.ask
    
    
    def get_momgo_res(question):
        """Return the distinct ``answer`` values stored for *question*.

        Queries the module-level ``mongo_collection`` for documents whose
        ``question`` field equals *question*, projecting only ``answer``.

        Args:
            question: The question text to match exactly.

        Returns:
            A list of the answers found, without duplicates, in the order the
            cursor first yielded them. Empty when nothing matches.

        Raises:
            KeyError: If a matching document has no ``answer`` field
                (assuming documents are returned as plain mappings).
        """
        cursor = mongo_collection.find({"question": question}, {'answer': 1})
        answers = []
        # A plain `for` drains a regular cursor completely; polling
        # `cursor.alive` around it adds nothing.
        for doc in cursor:
            answer = doc['answer']
            # List membership (not a set) so unhashable answer values such as
            # embedded documents or arrays keep working.
            if answer not in answers:
                answers.append(answer)
        return answers
    
    
    def w(f, s):
        """Write the string *s* to the file at path *f* as UTF-8.

        The file is created if missing and truncated if it already exists.

        Args:
            f: Path of the file to write.
            s: Text to store in the file.

        Raises:
            OSError: If the file cannot be opened or written.
            TypeError: If *s* is not a string.
        """
        # The context manager closes the file on exit, so no explicit close().
        with open(f, 'w', encoding='utf-8') as fw:
            fw.write(s)
    
    
    # Export every question that has at least one answer into small files under
    # dir_: "<id>q" holds the question text, "<id>a<k>" holds its k-th distinct
    # answer. Ids start right after MYSQL_max_q_id.
    le_ = len(q_distinct)
    c = MYSQL_max_q_id + 1
    dir_ = '/data/bigdata/mongoexport/QA/'
    for q in q_distinct:
        try:
            a = get_momgo_res(q)
            if not a:
                # No answers stored for this question: nothing to export.
                continue
            # Capture the id once so the question file and all of its answer
            # files share it. Previously the counter was bumped in between,
            # which filed the answers under the *next* question's id.
            q_id = c
            f = '{}{}q'.format(dir_, q_id)
            w(f, q)
            c += 1
            for index_, i in enumerate(a):
                f = '{}{}a{}'.format(dir_, q_id, index_)
                w(f, i)
        except Exception as e:
            # Best effort: report the failure and carry on with the next question.
            print(e)
        # Progress report whenever the id counter is a multiple of 10000.
        if c % 10000 == 0:
            print(c - MYSQL_max_q_id, '/', le_, start_, ':',
                  getNow())
    

      

  • 相关阅读:
    systemctl命令
    linux下常用命令查看端口占用
    【PostgreSQL】存取jsonb
    tomcat内存溢出之PermGen space
    Spring事务传播机制
    java框架篇---spring aop两种配置方式
    Hibernate一对多实例
    Github 的系统内部都在用什么开源软件?
    这是一个关于软件开发的博客。
    JavaScript中数组的集合和映射
  • 原文地址:https://www.cnblogs.com/rsapaper/p/10135394.html
Copyright © 2011-2022 走看看