zoukankan      html  css  js  c++  java
  • sphinx测试数据生成

    import json
    from random import sample, randint
    from uuid import uuid4
    
    def gen_random_words(path=r"D:\exp\test_data\dictionary.txt", count=3000):
        """Return a random selection of words read from a word-list file.

        The file is read one entry per line; surrounding whitespace is
        stripped and blank lines are ignored.

        Args:
            path: Location of the dictionary file. The default is a raw
                string so the backslash sequences of the Windows path are
                kept literally (a plain literal would turn backslash-t
                into a tab character and point at a non-existent file).
            count: Maximum number of words to return; must not be negative.

        Returns:
            A list of ``count`` words drawn without replacement, or every
            word in the file (in random order) when the file holds fewer
            than ``count`` entries.

        Raises:
            IOError / OSError: If the file cannot be opened.
        """
        # The ``with`` block closes the file; no explicit close() is needed.
        with open(path) as f:
            words = [w for w in (line.strip() for line in f) if w]
        # Cap the sample size so a short dictionary does not raise ValueError.
        return sample(words, min(count, len(words)))
    
    
    # Running count of every word ever passed to sample_words(), shared by all
    # calls in the process.
    total_words = 0
    def sample_words(search_words, random_words, sample_cnt=1000):
        """Fold ``random_words`` into the ``search_words`` sample, in place.

        Works like a reservoir sample: the first ``sample_cnt`` words are
        appended as they arrive; after that, each new word replaces a randomly
        chosen slot with probability ``sample_cnt / total_words``.

        ``total_words`` is a module-level counter that grows on every call,
        whichever list is passed in, so the replacement probability is only
        meaningful when the same ``search_words`` list is used throughout.

        Args:
            search_words: List holding the current sample; mutated in place.
            random_words: Iterable of words to offer to the sample.
            sample_cnt: Maximum size of the sample (default 1000, the value
                that was previously hard-coded).

        Returns:
            None.
        """
        global total_words
        for word in random_words:
            total_words += 1
            if len(search_words) < sample_cnt:
                # Still filling the sample: keep every word.
                search_words.append(word)
            elif randint(1, total_words) <= sample_cnt:
                # Sample is full: overwrite one uniformly chosen slot.
                search_words[randint(0, sample_cnt - 1)] = word
    
    
    def gen_an_event(words, search_words):
        """Build one synthetic document with a random title and content.

        The title is made of 1-10 words and the content of 1-100 words, each
        drawn from ``words`` without replacement and joined by single spaces.
        Every drawn word is also handed to ``sample_words`` so that
        ``search_words`` is updated as a side effect.

        Args:
            words: Sequence to draw from; ``sample`` raises ValueError if it
                holds fewer words than the randomly chosen length.
            search_words: List passed through to ``sample_words``.

        Returns:
            A dict with the keys "title" and "content".
        """
        def pick_text(max_len):
            # Draw, record in the search-word sample, then join into text.
            chosen = sample(words, randint(1, max_len))
            sample_words(search_words, chosen)
            return " ".join(chosen)

        # Title is built before content so random draws happen in that order.
        headline = pick_text(10)
        body = pick_text(100)
        return {"title": headline, "content": body}
    
    
    if __name__ == "__main__":
        # Script entry point: generate SQL value rows for the Sphinx test table
        # and dump the sampled search words as JSON.
        search_words = []
        lines_cnt = 500000
        words = gen_random_words()

        # One "(id, group_id, group_id2, date_added, title, content)" tuple per
        # event. Ids start at 5; the seed data in ``sql`` below uses ids 1-4.
        # Every row, including the last, ends with ", " and a newline.
        # NOTE: words are interpolated unescaped, so a dictionary entry that
        # contains a single quote will produce an invalid SQL literal.
        rows = []
        for row_idx in range(lines_cnt):
            event = gen_an_event(words, search_words)
            rows.append("        (%d, 2, 9, NOW(), '%s', '%s'), \n"
                        % (row_idx + 5, event["title"], event["content"]))

        file_name = str(uuid4()) + ".txt"
        for dir_name in ["Sphinx"]:
            # Raw string: a plain literal would turn the backslash-t of
            # "\test_data" into a tab character.
            outfile = r"D:\test_data\%s\%s" % (dir_name, file_name)
            with open(outfile, "w") as f:
                f.writelines(rows)
            # Parenthesised single-argument form works on Python 2 and 3.
            print(outfile)

        outfile = r"D:\test_data\search_words2.txt"
        with open(outfile, "w") as f:
            f.write(json.dumps(search_words))
    
    
    # Reference schema and seed data for the test database (tables
    # test.documents and test.tags). Nothing in the code visible in this file
    # reads this string. The seed rows occupy document ids 1-4, which is why
    # the generated rows above start at id 5.
    sql = '''
    DROP TABLE IF EXISTS test.documents;
    CREATE TABLE test.documents
    (
            id                      INTEGER PRIMARY KEY NOT NULL AUTO_INCREMENT,
            group_id        INTEGER NOT NULL,
            group_id2       INTEGER NOT NULL,
            date_added      DATETIME NOT NULL,
            title           VARCHAR(255) NOT NULL,
            content         TEXT NOT NULL
    );
    
    REPLACE INTO test.documents ( id, group_id, group_id2, date_added, title, content ) VALUES
            ( 1, 1, 5, NOW(), 'test one', 'this is my test document number one. also checking search within phrases.' ),
            ( 2, 1, 6, NOW(), 'test two', 'this is my test document number two' ),
            ( 3, 2, 7, NOW(), 'another doc', 'this is another group' ),
            ( 4, 2, 8, NOW(), 'doc number four', 'this is to test groups' );
    
    DROP TABLE IF EXISTS test.tags;
    CREATE TABLE test.tags
    (
            docid INTEGER NOT NULL,
            tagid INTEGER NOT NULL,
            UNIQUE(docid,tagid)
    );
    
    INSERT INTO test.tags VALUES
            (1,1), (1,3), (1,5), (1,7),
            (2,6), (2,4), (2,2),
            (3,15),
            (4,7), (4,40);
    '''
  • 相关阅读:
    初学 Delphi 嵌入汇编[13] 地址参数用 [] 取值
    初学 Delphi 嵌入汇编[17] 逻辑运算
    初学 Delphi 嵌入汇编[11] 用汇编重写一个 Delphi 函数
    初学 Delphi 嵌入汇编[12] 在汇编代码中可以直接使用 Result
    初学 Delphi 嵌入汇编[19] Delphi 的无符号整数类型
    分享:tcpproxy实现
    Socket编程之简单介绍 蓝天下的雨 博客园
    分享:libuv 中文编程指南
    分享:《编程之美》求二叉树中节点的最大距离
    CentOS6.0下编译最新版本boost库
  • 原文地址:https://www.cnblogs.com/bonelee/p/6651403.html
Copyright © 2011-2022 走看看