zoukankan      html  css  js  c++  java
  • UNIX环境高级编程APUE练习4.6-实现类似cp(1)的程序,保留文件中的空洞

    1 题面

    编写类似cp(1)的程序,它复制包含空洞的文件,但是不将字节0写到输出文件中去。

    2 基本思路

    • 首先要搞清楚空洞的性质以判断一个文件是否有空洞,以及空洞的位置
    • 知道了空洞的位置之后,读到源文件中的空洞部分时,在目标文件中lseek相应的长度

    3 创建空洞文件,同时探索空洞性质

    交替lseekwrite,逐渐增大间隔长度。比较文件的大小和实际占用的block数目

    • 测试源码
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <string.h>
    
    int holesize[]={1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32*1024};
    int filesize = 64*1024;
    
    int main()
    {
        int i = 0;
        int count = 0;
        int ret = 0, fd = 0;
        char filename[32]={0};
        unsigned char buf[32*1024]={0};
        memset(buf, 1, 32*1024);
        for (; i< sizeof(holesize)/ sizeof(int); ++i) {
            count = 0;
            memset(filename, 0, 32);
            sprintf(filename, "%s%d", "holesize", holesize[i]);
            fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
            if(fd < 0) {
                printf("open file fail
    ");
                return -1;
            }
            while(count < filesize) {
                ret = lseek(fd, holesize[i], SEEK_CUR);
                if(ret < 0) {
                    printf("lseek fail
    ");
                    return -1;
                }
                int remain = holesize[i];
                while(remain) {
                    ret = write(fd, buf, remain);
                    if(ret < 0 ) {
                        perror("write fail
    ");
                        return -1;
                    }
                    remain -= ret;
                }
                count += holesize[i] * 2;
            }
            close(fd);
        }
        return 0;
    }
    
    • MAC OSX 10.1.4.6测试结果
    ^_^$ ll -s
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize1
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize1024
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize128
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize16
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize16384
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize2
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize2048
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize256
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize32
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize32768
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize4
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize4096
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize512
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize64
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize8
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize8192
    

    Mac OSX上创建不了空洞文件,因为默认的文件系统是HFS +,不支持稀疏文件

    • Ubuntu18 4.15.0-60-generic测试结果
    ^_^$ ll -s
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize1
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize1024
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize128
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize16
    32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize16384
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize2
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize2048
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize256
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize32
    32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize32768
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize4
    32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize4096
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize512
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize64
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize8
    32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize8192
    

    4KB以上才实际创建空洞。
    因为在linux的文件系统中,磁盘分配的最小物理单元为簇。(即使文件大小不足以占用满一簇,该簇空余的磁盘存储仍旧是该文件的)

    所以可以根据这个性质,判断文件是否是空洞文件。有空洞的文件,用文件大小计算的block数至少比实际占用的block数大1个簇的block数

    如何可移植地获取簇的大小

    pagesize = sysconf(_SC_PAGESIZE);
    

    初步实现功能

    • 源码
    #include <stdio.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <string.h>
    #include <stdlib.h>
    #include <sys/stat.h>
    #include <sys/errno.h>
    
    int my_cp(const char *from, const char *to)
    {
        int fd1 = -1, fd2 = -1;
        int rev = -1;
        unsigned char *buffer = NULL;
        unsigned char *start_pos = NULL;
        long pagesize = 0;
        long long blocks, blksize, size;
        int read_num, write_num, remain_num, current_pos = 0, last_zero = -1, last_nonzero = -1, have_holes = 0;
        struct stat st;
    
        fd1 = open(from, O_RDONLY);
        if(-1 == fd1){
            perror("open file1 faild");
            goto err;
        }
    
        if(fstat(fd1, &st) !=0) {
            perror("fstat: ");
            goto err;
        }
        else{
    #ifdef _SC_PAGESIZE
            pagesize = sysconf(_SC_PAGESIZE);
            if (pagesize < 0) {
                if (errno != 0) {
                    if (errno == EINVAL) {
                        fputs(" (not supported)
    ", stdout);
                        pagesize = st.st_blksize;
                    }
                    else {
                        perror("sysconf error");
                        goto err;
                    }
                } else {
                    fputs(" (no limit)
    ", stdout);
                    pagesize = st.st_blksize;
                }
            }
            printf("pagesize: %ld
    ", pagesize);
    #else
            pagesize = st.st_blksize;
    #endif
            blocks = st.st_blocks;
            blksize = st.st_blksize;
            size = st.st_size;
            printf("st.st_blocks: %lld
    ", blocks);
            printf("st.st_blksize: %lld
    ", blksize);
            printf("st.st_size: %lld
    ", size);
            /*块大小512,在不同平台上可能不兼容*/
            if(S_ISREG(st.st_mode) && (size / pagesize + (size%pagesize?1:0)) * pagesize > 512 * blocks) {
                have_holes = 1;
                printf("%s is a sparse-block file!
    ", from);
            } else{
                have_holes = 0;
                printf("%s is not a sparse-block file!
    ", from);
            }
        }
        fd2 = open(to, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
        if ( -1 == fd2) {
            perror ("open file2 faild");
            goto err;
        }
    
        buffer = malloc(pagesize);
        if(buffer == NULL) {
            perror ("malloc fail");
            goto err;
        }
        memset(buffer, '', pagesize);
        while((read_num = read(fd1, buffer, pagesize)) > 0) {
            /* 源文件有空洞 */
            if(have_holes){
                last_zero = -1;
                last_nonzero = -1;
                for(current_pos = 0; current_pos < read_num; current_pos++){
                    /* 逐字节判断,效率较低*/
                    if(buffer[current_pos] == 0){
                        if(last_nonzero > last_zero){
                            remain_num = last_nonzero - last_zero;
                            start_pos = buffer + last_zero + 1;
                            while(remain_num){
                                write_num = write(fd2, start_pos, remain_num);
                                if ( -1 == write_num){
                                    perror( "write file2 error");
                                    goto err;
                                }
                                remain_num -= write_num;
                                start_pos += write_num;
                            }
                        }
                        last_zero = current_pos;
                    }
                    else{
                        if(last_zero > last_nonzero){
                            remain_num = last_zero - last_nonzero;
                            if(-1 == lseek(fd2, remain_num, SEEK_CUR)){
                                perror("lseek file2 fail");
                                goto err;
                            }
                        }
                        last_nonzero = current_pos;
                    }
                }
                /* 处理最后剩余数据*/
                remain_num = (last_nonzero > last_zero)?(last_nonzero - last_zero):(last_zero - last_nonzero);
                start_pos = buffer + current_pos - remain_num;
                if(last_nonzero > last_zero){
                    while(remain_num){
                        write_num = write(fd2, start_pos, remain_num);
                        if ( -1 == write_num){
                            perror( "write file2 error");
                            goto err;
                        }
                            remain_num -= write_num;
                            start_pos += write_num;
                        }
                    }
                else{
                    if(-1 == lseek(fd2, remain_num, SEEK_CUR)){
                        perror("lseek file2 fail");
                        goto err;
                    }
                }
            }
            /* 源文件无空洞 */
            else {
                remain_num = read_num;
                start_pos = buffer;
                while(remain_num){
                    write_num = write(fd2, start_pos, remain_num);
                    if ( -1 == write_num){
                        perror( "write file2 error");
                        goto err;
                    }
                    remain_num -= write_num;
                    start_pos += write_num;
                }
            }
        }
        if(-1 == read_num) {
            perror("read file1 error");
            goto err;
        }
        rev = 0;
    err:
        if(buffer) free(buffer);
        close(fd1);
        close(fd2);
        return rev;
    }
    
    int main(int argc, char *argv[])
    {
        if(argc < 3) {
            printf("Usage: %s file1 file2
    ", argv[0]);
            return -1;
        }
        my_cp(argv[1], argv[2]);
        return 0;
    }
    
    • 测试结果
    ^_^$ ./my_cp holesize2048 holesize2048.cp
    pagesize: 4096
    st.st_blocks: 128
    st.st_blksize: 4096
    st.st_size: 65536
    holesize2048 is not a sparse-block file!
    chen@ubuntu18:~/study/apue.3e/exercises/4
    ^_^$ ./my_cp holesize4096 holesize4096.cp
    pagesize: 4096
    st.st_blocks: 72
    st.st_blksize: 4096
    st.st_size: 65536
    holesize4096 is a sparse-block file!
    
    ^_^$ ll -s
    total 1708
    64 -rw-r--r-- 1 chen chen  65536 1月   6 17:27 holesize2048
    64 -rw-r--r-- 1 chen chen  65536 1月   6 17:27 holesize2048.cp
    36 -rw-r--r-- 1 chen chen  65536 1月   6 17:27 holesize4096
    32 -rw-r--r-- 1 chen chen  65536 1月   6 17:27 holesize4096.cp
    

    空洞文件可以正常拷贝

    尝试优化程序

    上面的程序仅在判断文件是否含有空洞时利用的空洞的最小限制。而在实际读写时并没有利用该性质。

    这样较短的0字节也会当成是空洞,导致系统调用次数的增加,性能的降低

    要优化性能,必须进一步探究空洞的性质。在什么样的情况下才创建空洞(不实际占用磁盘空间的块)?

    • 测试程序源码

    此程序创建了3个文件:

    - 文件1先`write`了1K的非零数据,然后`lseek` 7K-1字节。循环2次。
    - 文件2先`write`了1K的非零数据,然后`lseek` 7K字节。循环2次
    - 文件3先`write`了1K的非零数据,然后`lseek` 7K+1字节。循环2次
    
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <string.h>
    
    int holesize[]={4096};
    int filesize = 64*1024;
    
    int main()
    {
        int i = 0;
        int count = 0;
        int ret = 0, fd1 = 0, fd2 = 0, fd3 = 0;
        char filename1[32]={0};
        char filename2[32]={0};
        char filename3[32]={0};
        unsigned char buf[32*1024]={0};
        memset(buf, 1, 32*1024);
        for (; i< sizeof(holesize)/ sizeof(int); ++i) {
            count = 0;
            memset(filename1, 0, 32);
            memset(filename2, 0, 32);
            memset(filename3, 0, 32);
            sprintf(filename1, "%s%d-1", "holesize", holesize[i]);
            sprintf(filename2, "%s%d-2", "holesize", holesize[i]);
            sprintf(filename3, "%s%d-3", "holesize", holesize[i]);
            fd1 = open(filename1, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
            fd2 = open(filename2, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
            fd3 = open(filename3, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
            if(fd1 < 0 || fd2 < 0 || fd3 < 0) {
                printf("open file fail
    ");
                return -1;
            }
            count = 0;
            while(count < 2) {
                int remain = holesize[i] * 1 / 4;
                while(remain) {
                    ret = write(fd1, buf, remain);
                    if(ret < 0 ) {
                        perror("write fail
    ");
                        return -1;
                    }
                    remain -= ret;
                }
                ret = lseek(fd1, holesize[i] * 7 / 4 - 1, SEEK_CUR);
                if(ret < 0) {
                    printf("lseek fail
    ");
                    return -1;
                }
                ++count;
            }
            count = 0;
            while(count < 2) {
                int remain = holesize[i] * 1 / 4;
                while(remain) {
                    ret = write(fd2, buf, remain);
                    if(ret < 0 ) {
                        perror("write fail
    ");
                        return -1;
                    }
                    remain -= ret;
                }
                ret = lseek(fd2, holesize[i] * 7 / 4, SEEK_CUR);
                if(ret < 0) {
                    printf("lseek fail
    ");
                    return -1;
                }
                ++count;
            }
            count = 0;
            while(count < 2) {
                int remain = holesize[i] * 1 / 4;
                while(remain) {
                    ret = write(fd3, buf, remain);
                    if(ret < 0 ) {
                        perror("write fail
    ");
                        return -1;
                    }
                    remain -= ret;
                }
                ret = lseek(fd3, holesize[i] * 7 / 4 + 1, SEEK_CUR);
                if(ret < 0) {
                    printf("lseek fail
    ");
                    return -1;
                }
                ++count;
            }
            close(fd1);
            close(fd2);
            close(fd3);
        }
        return 0;
    }
    
    • 测试结果
    ^_^$ ll -s
    12 -rw-r--r-- 1 chen chen  9215 1月   6 15:07 holesize4096-1
     8 -rw-r--r-- 1 chen chen  9216 1月   6 15:07 holesize4096-2
     8 -rw-r--r-- 1 chen chen  9217 1月   6 15:07 holesize4096-3
    

    可见空洞必须从一页的起始位置开始计算,并且等于或超过pagesize,才不占用实际磁盘空间

    优化后程序

    • 源码
    #include <stdio.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <string.h>
    #include <stdlib.h>
    #include <sys/stat.h>
    #include <sys/errno.h>
    
    ssize_t read_ex(int fd, void *buf, size_t nbyte){
        size_t read_remain = nbyte;
        unsigned char *read_start = (unsigned char*)buf;
        ssize_t read_num = -1;
        ssize_t total_num = 0;
        while(read_remain) {
            read_num = read(fd, read_start, read_remain);
            if(-1 == read_num){
                return -1;
            }
            else if(0 == read_num){
                break;
            }
            else{
                read_remain -= read_num;
                read_start += read_num;
                total_num += read_num;
            }
        }
        return total_num;
    }
    
    ssize_t write_ex(int fd, const void *buf, size_t nbyte){
        size_t write_remain = nbyte;
        unsigned char *write_start = (unsigned char*)buf;
        ssize_t write_num = -1;
        ssize_t total_num = 0;
        while(write_remain) {
            write_num = write(fd, write_start, write_remain);
            if(-1 == write_num){
                return -1;
            }
            else{
                write_remain -= write_num;
                write_start += write_num;
                total_num += write_num;
            }
        }
        return total_num;
    }
    int my_cp(const char *from, const char *to)
    {
        int fd1 = -1, fd2 = -1;
        int rev = -1;
        unsigned char *buffer = NULL, *buffer_zero = NULL;
        long pagesize = 0;
        long long blocks, blksize, size;
        int read_num, write_num, write_remain, have_holes = 0;
        struct stat st;
    
        fd1 = open(from, O_RDONLY);
        if(-1 == fd1){
            perror("open file1 faild");
            goto err;
        }
    
        if(fstat(fd1, &st) !=0) {
            perror("fstat: ");
            goto err;
        }
        else{
    #ifdef _SC_PAGESIZE
            pagesize = sysconf(_SC_PAGESIZE);
            if (pagesize < 0) {
                if (errno != 0) {
                    if (errno == EINVAL) {
                        fputs(" (not supported)
    ", stdout);
                        pagesize = st.st_blksize;
                    }
                    else {
                        perror("sysconf error");
                        goto err;
                    }
                } else {
                    fputs(" (no limit)
    ", stdout);
                    pagesize = st.st_blksize;
                }
            }
            printf("pagesize: %ld
    ", pagesize);
    #else
            pagesize = st.st_blksize;
    #endif
            blocks = st.st_blocks;
            blksize = st.st_blksize;
            size = st.st_size;
            printf("st.st_blocks: %lld
    ", blocks);
            printf("st.st_blksize: %lld
    ", blksize);
            printf("st.st_size: %lld
    ", size);
            /*块大小512,在不同平台上可能不兼容*/
            if(S_ISREG(st.st_mode) && (size / pagesize + (size%pagesize?1:0)) * pagesize > 512 * blocks) {
                have_holes = 1;
                printf("%s is a sparse-block file!
    ", from);
            } else{
                have_holes = 0;
                printf("%s is not a sparse-block file!
    ", from);
            }
        }
        buffer = malloc(pagesize);
        buffer_zero = malloc(pagesize);
        if(buffer == NULL || buffer_zero == NULL) {
            perror ("malloc fail");
            goto err;
        }
        memset(buffer, '', pagesize);
        memset(buffer_zero, '', pagesize);
    
        fd2 = open(to, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
        if (-1 == fd2) {
            perror ("open file2 faild");
            goto err;
        }
    
        while((read_num = read_ex(fd1, buffer, pagesize)) > 0) {
            /* 读取到空洞 */
            if(have_holes && !memcmp(buffer_zero, buffer, read_num)){
                if(-1 == lseek(fd2, read_num, SEEK_CUR)){
                    perror("lseek file2 fail");
                    goto err;
                }
            }
            /* 非空洞 */
            else{
                write_num = write_ex(fd2, buffer, read_num);
                if (-1 == write_num){
                    perror( "write file2 error");
                    goto err;
                }
            }
        }
        if(-1 == read_num){
            perror("read file1 error");
            goto err;
        }
        rev = 0;
    err:
        if(buffer) free(buffer);
        if(buffer_zero) free(buffer_zero);
        close(fd1);
        close(fd2);
        return rev;
    }
    
    int main(int argc, char *argv[])
    {
        if(argc < 3) {
            printf("Usage: %s file1 file2
    ", argv[0]);
            return -1;
        }
        my_cp(argv[1], argv[2]);
        return 0;
    }
    
    • 对比测试

    构造一个文件,除了开头一个空洞,其余数据为0x00,0x01的100000次重复

    用优化前的程序拷贝该文件10000次,大约2000s

    用优化后的程序拷贝该文件10000次,大约30s

  • 相关阅读:
    Windows Server 2003 SP2(32位) 中文版 下载地址 光盘整合方法
    用Recycle()方法对Java对象的重要性
    Lotus中千奇百怪的 $$
    Developing a simple application using steps "User Decision" and "Mail"(1) 沧海
    沟通中的情绪管理(演讲稿) 沧海
    人只有在压力之下,才可能成功,没做一件事,都必须成功,不许言败 沧海
    什么是IDOC,以及IDOC的步骤 沧海
    VS2008 Professional Edition CHS中的deffactory.dat读取错误 沧海
    Including custom text in the step "User Decision" 沧海
    SAP Upgrade Strategy 沧海
  • 原文地址:https://www.cnblogs.com/logchen/p/12157828.html
Copyright © 2011-2022 走看看