zoukankan      html  css  js  c++  java
  • bitmap与桶方式对1000万数据进行排序(转+自己实现理解)

    1.  1000万数据的产生,随机数方式

    #include <iostream>
    #include
    <time.h>
    #include
    <assert.h>
    #include
    <stdio.h>
    #include
    <stdlib.h>

    using namespace std;
    // Number of integers to generate. The output file holds a shuffled
    // permutation of the values 0 .. size-1.
    const int size = 10000000;

    // Working array. It lives at namespace scope because ten million ints are far
    // too large for the stack. `::size` is spelled with the scope operator because
    // the preceding `using namespace std;` also makes std::size visible in C++17,
    // which would make the unqualified name ambiguous.
    int num[::size];

    // Returns a pseudo-random index in [0, limit).
    // Two rand() calls are combined in 64-bit unsigned arithmetic: the original
    // expression `rand() * RAND_MAX + rand()` overflowed `int` on platforms where
    // RAND_MAX is 2^31-1, which is undefined behaviour and could produce negative
    // array indices.
    static int random_index(int limit)
    {
        const unsigned long long span = static_cast<unsigned long long>(RAND_MAX) + 1ULL;
        const unsigned long long wide =
            static_cast<unsigned long long>(rand()) * span +
            static_cast<unsigned long long>(rand());
        return static_cast<int>(wide % static_cast<unsigned long long>(limit));
    }

    // Writes a shuffled permutation of 0 .. size-1 to "data.txt", separated by
    // single spaces. Returns 0 on success and 1 if the file cannot be created.
    int main()
    {
        const int total = ::size;

        // Explicit check instead of assert(): assert disappears under NDEBUG.
        FILE *fp = fopen("data.txt", "w");
        if (fp == NULL)
        {
            perror("data.txt");
            return 1;
        }

        // The original loop ran n = 1 .. size, which wrote num[size] (one past
        // the end) while num[0] stayed 0 and was still written to the file.
        // Filling 0 .. size-1 produces the same file contents without the
        // out-of-bounds write.
        for (int n = 0; n < total; n++)
            num[n] = n;

        // Shuffle by performing `size` swaps of two randomly chosen slots.
        srand((unsigned)time(NULL));
        for (int n = 0; n < total; n++)
        {
            const int i = random_index(total);
            const int j = random_index(total);
            std::swap(num[i], num[j]);
        }

        for (int n = 0; n < total; n++)
            fprintf(fp, "%d ", num[n]);
        fclose(fp);

        return 0;
    }

      使用 std::bitset 进行排序

    //位图方式解决海量数据排序,数据不能有重复

    //使用 C++ stl的 bitset
    #include <iostream>
    #include
    <bitset>
    #include
    <assert.h>
    #include
    <time.h>
    #include
    <stdio.h>
    #include
    <stdlib.h>
    using namespace std;
    const int max_each_scan = 5000000;
    int main()
    {
    clock_t begin
    = clock();
    bitset
    <max_each_scan> bit_map;
    bit_map.reset();
    // open the file with the unsorted data
    FILE *fp_unsort_file = fopen("data.txt", "r");
    assert(fp_unsort_file);
    int num;

    // the first time scan to sort the data between 0 - 4999999
    while (fscanf(fp_unsort_file, "%d ", &num) != EOF)
    {
    if (num < max_each_scan)
    //有这个数字,将bit_map的对应的位设置为1
    bit_map.set(num, 1);
    }
    FILE
    *fp_sort_file = fopen("sort.txt", "w");
    assert(fp_sort_file);
    int i;
    // write the sorted data into file
    for (i = 0; i < max_each_scan; i++)
    {
    if (bit_map[i] == 1)
    fprintf(fp_sort_file,
    "%d ", i);
    }

    // the second time scan to sort the data between 5000000 - 9999999
    int result = fseek(fp_unsort_file, 0, SEEK_SET);
    if (result)
    cout
    << "fseek failed!" << endl;
    else
    {
    bit_map.reset();
    while (fscanf(fp_unsort_file, "%d ", &num) != EOF)
    {
    if (num >= max_each_scan && num < 10000000)
    {
    num
    -= max_each_scan;
    bit_map.
    set(num, 1);
    }
    }
    for (i = 0; i < max_each_scan; i++)
    {
    if (bit_map[i] == 1)
    fprintf(fp_sort_file,
    "%d ", i + max_each_scan);
    }
    }
    clock_t end
    = clock();

    cout
    <<"用位图的方法,耗时:"<<endl;
    cout
    << (end - begin) / CLK_TCK << "s" << endl;
    fclose(fp_sort_file);
    fclose(fp_unsort_file);
    return 0;
    }

      位图排序的实现示例

    #include <iostream>
    #include
    <memory.h>
    #define BYTESIZE 8
    using namespace std;
    // Sets bit number `posi` in the bitmap that starts at `p`.
    // Bit 0 is the least significant bit of p[0], bit BYTESIZE is the least
    // significant bit of p[1], and so on.
    //   p    - start of the bitmap; the caller must make sure it holds at least
    //          posi / BYTESIZE + 1 bytes.
    //   posi - zero-based bit index; must be non-negative.
    void setBit(char *p,int posi)
    {
        // Index the target byte directly instead of walking the pointer one
        // byte at a time.
        char &cell = p[posi / BYTESIZE];
        cell = static_cast<char>(cell | (0x01 << (posi % BYTESIZE)));
    }
    // Demonstrates bitmap sorting on a small array of distinct non-negative
    // integers: each value sets one bit, then the set bits are printed in
    // ascending order, separated by spaces.
    int main()
    {
        const int num[] = {3,2,5,7,12,24,9,8,6};
        const int numCount = sizeof(num) / sizeof(num[0]);

        // Size the bitmap from the largest value. The original fixed 2-byte
        // buffer only covered bits 0..15, so setting bit 24 wrote past the end
        // of the allocation and 24 never appeared in the output.
        int maxValue = num[0];
        for(int i = 1;i < numCount;i ++)
        {
            if(num[i] > maxValue)
                maxValue = num[i];
        }
        const int BufferLen = maxValue / BYTESIZE + 1;

        char *pBuffer = new char[BufferLen];
        memset(pBuffer, 0, BufferLen);

        for(int i = 0;i < numCount;i ++)
            setBit(pBuffer,num[i]);

        // Print the sorted result one byte at a time. The buffer is indexed
        // rather than advanced so the base pointer is still valid for delete[].
        for(int i = 0;i < BufferLen;i ++)
        {
            for(int j = 0;j < BYTESIZE;j ++)
            {
                if( (pBuffer[i]&(0x01<<j)) == (0x01<<j))
                    cout << i * BYTESIZE + j << " ";
            }
        }

        delete [] pBuffer;
        return 0;
    }

      归并排序方式实现

    //copyright@ 纯净的天空 && yansha
    //5、July,updated,2010.05.28。
    #include <iostream>
    #include
    <ctime>
    #include
    <fstream>
    #include
    <stdio.h>
    #include
    <stdlib.h>
    #include
    <string.h>
    //#include "ExternSort.h"
    using namespace std;
    //使用多路归并进行外排序的类
    //ExternSort.h
    /*
    * 大数据量的排序
    * 多路归并排序
    * 以千万级整数从小到大排序为例
    * 一个比较简单的例子,没有建立内存缓冲区
    */
    #ifndef EXTERN_SORT_H
    #define EXTERN_SORT_H
    #include
    <cassert>
    /*
     * External multi-way merge sort for a text file of whitespace-separated
     * integers that is too large to sort in memory at once.
     *
     * The input is read in chunks of at most `count` integers. Each chunk is
     * sorted in memory and written to its own temporary file (temp0.txt,
     * temp1.txt, ... in the current directory). The temporary files are then
     * merged into the output file in ascending order. The temporary files are
     * not deleted afterwards.
     */
    class ExternSort
    {
    public:
        // input_file: path of the unsorted input file
        // out_file:   path of the sorted output file
        // count:      maximum number of integers sorted in memory at a time
        ExternSort(const char *input_file, const char * out_file, int count)
        {
            m_count = count;
            m_in_file = new char[strlen(input_file) + 1];
            strcpy(m_in_file, input_file);
            m_out_file = new char[strlen(out_file) + 1];
            strcpy(m_out_file, out_file);
        }

        virtual ~ExternSort()
        {
            delete [] m_in_file;
            delete [] m_out_file;
        }

        // Runs the whole sort (chunking pass followed by merge pass) and prints
        // the elapsed wall-clock time in seconds.
        void sort()
        {
            const time_t start = time(NULL);
            // Sort the input chunk by chunk, one temporary file per chunk.
            const int file_count = memory_sort();
            // Merge the temporary files into the output file.
            merge_sort(file_count);
            const time_t end = time(NULL);
            // time() counts seconds, so difftime() gives the elapsed seconds
            // directly; scaling by CLOCKS_PER_SEC only applies to clock().
            printf("total time:%f\n", difftime(end, start));
        }

    protected:
        // Reads up to n integers from f into a. Stops at end of file or at the
        // first token that is not an integer. Returns the number read.
        int read_data(FILE* f, int a[], int n)
        {
            int i = 0;
            // `== 1` instead of `!= EOF`: a malformed token makes fscanf
            // return 0 forever without consuming input.
            while(i < n && fscanf(f, "%d", &a[i]) == 1)
                ++i;
            printf("read:%d integer\n", i);
            return i;
        }

        // Writes the n integers in a to f, each followed by a space.
        void write_data(FILE* f, int a[], int n)
        {
            for(int i = 0; i < n; ++i)
                fprintf(f, "%d ", a[i]);
        }

        // Returns the name of temporary file number `index` in a buffer
        // allocated with new[]; the caller must release it with delete[].
        char* temp_filename(int index)
        {
            char *tempfile = new char[100];
            sprintf(tempfile, "temp%d.txt", index);
            return tempfile;
        }

        // qsort comparator for ints. Uses comparisons rather than subtraction,
        // which overflows when the operands are far apart.
        static int cmp_int(const void *a, const void *b)
        {
            const int lhs = *static_cast<const int*>(a);
            const int rhs = *static_cast<const int*>(b);
            return (lhs > rhs) - (lhs < rhs);
        }

        // Splits the input into sorted temporary files of at most m_count
        // integers each. Returns the number of temporary files written, or 0
        // if nothing could be produced (bad chunk size, unreadable input, or a
        // temporary file that could not be created).
        int memory_sort()
        {
            if(m_count <= 0)
            {
                fprintf(stderr, "ExternSort: chunk size must be positive\n");
                return 0;
            }
            FILE *fin = fopen(m_in_file, "rt");
            if(fin == NULL)
            {
                perror(m_in_file);
                return 0;
            }

            int *array = new int[m_count];
            int file_count = 0;
            int n = 0;
            // Every time up to m_count integers are read, sort them in memory
            // and write them to the next temporary file.
            while((n = read_data(fin, array, m_count)) > 0)
            {
                qsort(array, n, sizeof(int), cmp_int);
                char *fileName = temp_filename(file_count);
                FILE *tempFile = fopen(fileName, "w");
                if(tempFile == NULL)
                {
                    perror(fileName);
                    delete [] fileName;
                    // Abort rather than merge an incomplete set of chunks.
                    file_count = 0;
                    break;
                }
                // The name came from new[], so it must go back via delete[].
                delete [] fileName;
                write_data(tempFile, array, n);
                fclose(tempFile);
                ++file_count;
            }
            delete [] array;
            fclose(fin);
            return file_count;
        }

        // Merges temporary files 0 .. file_count-1 into the output file in
        // ascending order. Does nothing when file_count <= 0.
        void merge_sort(int file_count)
        {
            if(file_count <= 0)
                return;

            FILE *fout = fopen(m_out_file, "wt");
            if(fout == NULL)
            {
                perror(m_out_file);
                return;
            }

            FILE **farray = new FILE*[file_count];
            int *data = new int[file_count];      // current head value of each file
            bool *hasNext = new bool[file_count]; // whether data[i] is still unconsumed
            int i;
            for(i = 0; i < file_count; ++i)
            {
                char *fileName = temp_filename(i);
                farray[i] = fopen(fileName, "rt");
                if(farray[i] == NULL)
                    perror(fileName);
                delete [] fileName;
                data[i] = 0;
                // Prime each stream with its first value; an unreadable or
                // empty file simply takes no part in the merge.
                hasNext[i] = (farray[i] != NULL) &&
                             (fscanf(farray[i], "%d", &data[i]) == 1);
            }

            while(true)
            {
                // Pick the stream with the smallest head value among those
                // still live. The earlier code seeded the minimum from data[0]
                // even after file 0 was exhausted, which emitted a stale value
                // and dropped the real head of another file.
                int j = -1;
                for(i = 0; i < file_count; ++i)
                {
                    if(hasNext[i] && (j < 0 || data[i] < data[j]))
                        j = i;
                }
                if(j < 0)
                    break; // every stream is exhausted

                fprintf(fout, "%d ", data[j]);
                // Advance the stream that supplied the value.
                if(fscanf(farray[j], "%d", &data[j]) != 1)
                    hasNext[j] = false;
            }

            delete [] hasNext;
            delete [] data;
            for(i = 0; i < file_count; ++i)
            {
                if(farray[i] != NULL)
                    fclose(farray[i]);
            }
            delete [] farray;
            fclose(fout);
        }

    private:
        int m_count;      // maximum number of integers sorted in memory at once
        char *m_in_file;  // path of the input file (owned)
        char *m_out_file; // path of the output file (owned)

        // The class owns raw buffers, so copying would double-delete them.
        // Declared but not defined to forbid copies.
        ExternSort(const ExternSort&);
        ExternSort& operator=(const ExternSort&);
    };
    #endif
    // Test driver for the external sort.
    /*
    * Large-file sorting:
    * the data cannot be loaded into memory all at once.
    * The file to sort contains many integers,
    * separated by spaces.
    */
    // Number of integers written to the data file (main passes it to init_data).
    const unsigned int count = 10000000;
    // Number of integers sorted in memory at a time (main passes it to ExternSort).
    const unsigned int number_to_sort = 1000000;
    // Name of the original, unsorted file.
    const char *unsort_file = "data.txt";
    // Name of the sorted output file.
    const char *sort_file = "sort_data.txt";
    // Generates the data file filled with random integers.
    void init_data(unsigned int num);
    //随机生成数据文件
    int main(int argc, char* *argv)
    {
    srand(time(NULL));
    init_data(count);
    ExternSort extSort(unsort_file, sort_file, number_to_sort);
    extSort.sort();

    return 0;
    }
    // Writes `num` pseudo-random integers (values returned by rand()) to the
    // file named by `unsort_file`, each followed by a space. Any existing file
    // is overwritten. If the file cannot be opened, an error is reported on
    // stderr and nothing is written.
    void init_data(unsigned int num)
    {
        FILE *f = fopen(unsort_file, "wt");
        if(f == NULL)
        {
            // Without this check a failed fopen would pass NULL to fprintf.
            perror(unsort_file);
            return;
        }
        // Unsigned counter to match the type of `num`.
        for(unsigned int i = 0; i < num; ++i)
            fprintf(f, "%d ", rand());
        fclose(f);
    }

      还有一种是桶排序方式实现的

    #include <stdio.h>
    #include
    <stdlib.h>
    #include
    <string.h>
    #include
    <iostream>
    #include
    <ctime>
    #define LOW 18 // number of low-order bits kept inside one bucket
    #define FILE_NUM 39 // number of bucket files

    // Number of counters per bucket: one for each possible value of the low
    // LOW bits (256*1024 == 1 << LOW). Parenthesised so the macro stays a
    // single value inside larger expressions.
    #define MEM_SIZE (256*1024)
    using namespace::std;

    int memory[MEM_SIZE]; // occurrence counters, indexed by the low LOW bits


    // Counting-sorts the contents of one bucket.
    //   ifp - stream positioned at the start of the bucket's values; only the
    //         low LOW bits of each value are used
    //   ofp - stream that receives the sorted values, each followed by a space
    //   i   - bucket number; it supplies the high bits (i << LOW) that are
    //         OR-ed back into every value written
    // Duplicates are preserved. Uses the global `memory` array as scratch space.
    void sort(FILE*ifp, FILE *ofp, int i)
    {
        // Clear by the array's real size rather than a hard-coded byte count.
        memset(memory, 0, sizeof(memory));

        // The high bits come from the bucket number alone. The earlier code
        // overwrote them with `d & 0xfffc0000` taken from the first value read,
        // but bucket files store only the low bits, so that expression was
        // always 0 and every bucket was written without its high bits.
        const int high = i << LOW;
        const int low_mask = (1 << LOW) - 1;

        int d;
        while(fscanf(ifp, "%d", &d) == 1)
        {
            ++memory[d & low_mask]; // count occurrences of this low-bit value
        }

        // Emit each value as many times as it occurred, in ascending order.
        for (int value = 0; value < MEM_SIZE; ++value)
        {
            for (int repeat = memory[value]; repeat > 0; --repeat)
            {
                fprintf(ofp, "%d ", value | high);
            }
        }
    }

    // Bucket-sorts the non-negative integers in "data.txt" into "out.txt".
    // Each value goes to bucket file tmp_<k>.txt, where k is the value shifted
    // right by LOW bits; only the low LOW bits are stored there. The buckets
    // are then counting-sorted in order by sort(), which restores the high
    // bits. Values whose bucket index falls outside [0, FILE_NUM) are skipped
    // and counted. Exits with EXIT_FAILURE if any file cannot be opened.
    int main()
    {
        FILE *fp_tmp[FILE_NUM];

        FILE *fp_data = fopen("data.txt", "r"); // test data
        if(fp_data == NULL)
        {
            perror("data.txt");
            exit(EXIT_FAILURE);
        }

        const time_t start = time(NULL); // start timing

        int i;
        for (i=0; i<FILE_NUM; ++i) // create the FILE_NUM bucket files
        {
            // sprintf replaces the non-standard itoa + strcat sequence; 64
            // bytes is ample for "tmp_" + any int + ".txt".
            char buf[64];
            sprintf(buf, "tmp_%d.txt", i);

            if((fp_tmp[i]=fopen(buf,"w+"))==NULL)
            {
                perror(buf);
                exit(EXIT_FAILURE);
            }
        }

        const int low_mask = (1 << LOW) - 1;
        unsigned long skipped = 0;
        int d;
        while(fscanf(fp_data,"%d",&d)==1) // distribute the input into buckets
        {
            const int bucket = d >> LOW;
            // Guard the array access: a negative value or one at or above
            // FILE_NUM << LOW would index outside fp_tmp.
            if(d < 0 || bucket >= FILE_NUM)
            {
                ++skipped;
                continue;
            }
            fprintf(fp_tmp[bucket], "%d ", d & low_mask);
        }
        fclose(fp_data);
        if(skipped > 0)
            fprintf(stderr, "skipped %lu out-of-range value(s)\n", skipped);

        for (i=0; i<FILE_NUM; ++i) // rewind each bucket for reading
        {
            rewind(fp_tmp[i]);
        }

        FILE *out_fp = fopen("out.txt","w"); // receives the sorted data
        if(out_fp == NULL)
        {
            perror("out.txt");
            exit(EXIT_FAILURE);
        }

        for (i=0; i<FILE_NUM; ++i)
        {
            sort(fp_tmp[i],out_fp,i); // sort each bucket in turn
        }
        fclose(out_fp);

        for (i=0; i<FILE_NUM; ++i) // close the bucket files
        {
            fclose(fp_tmp[i]);
        }

        const time_t end = time(NULL); // stop timing

        // time() counts seconds, so difftime() gives the elapsed seconds
        // directly; scaling by CLOCKS_PER_SEC only applies to clock().
        printf("total time:%f\n", difftime(end, start));

        return 0;
    }

      

  • 相关阅读:
    java基础(六):RabbitMQ 入门
    Spring Boot 入门(六):集成 treetable 和 zTree 实现树形图
    Geoserver+Openlayers拉框查询
    Spring Boot 入门(五):集成 AOP 进行日志管理
    Spring boot 入门(四):集成 Shiro 实现登陆认证和权限管理
    java基础(五):谈谈java中的多线程
    java基础(四):谈谈java中的IO流
    java基础(三):谈谈java异常的处理
    java基础(二):谈谈Java基本数据结构
    Java编译时常量和运行时常量
  • 原文地址:https://www.cnblogs.com/hitwtx/p/2147447.html
Copyright © 2011-2022 走看看