zoukankan      html  css  js  c++  java
  • UNIX环境高级编程APUE练习4.6-实现类似cp(1)的程序,保留文件中的空洞

    1 题面

    编写类似cp(1)的程序,它复制包含空洞的文件,但是不将字节0写到输出文件中去。

    2 基本思路

    • 首先要搞清楚空洞的性质以判断一个文件是否有空洞,以及空洞的位置
    • 知道了空洞的位置之后,读到源文件中的空洞部分时,在目标文件中lseek相应的长度

    3 创建空洞文件,同时探索空洞性质

    交替调用lseek和write,逐渐增大间隔长度,然后比较文件的大小和实际占用的block数目。

    • 测试源码
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <string.h>
    
    /* Gap/data chunk sizes in bytes to try; main() generates one test file per entry. */
    int holesize[]={1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32*1024};
    /* Number of bytes each generated file is filled up to (64 KiB). */
    int filesize = 64*1024;
    
    /*
     * Create one file per entry of holesize[], named "holesize<N>".
     * Each file alternates an lseek() gap of N bytes with N bytes of 0x01
     * data until filesize bytes have been covered, so that `ls -s` can show
     * which gap sizes actually save disk blocks.
     *
     * Returns 0 on success, -1 as soon as an open/lseek/write fails.
     */
    int main(void)
    {
        char filename[32];
        unsigned char buf[32*1024];   /* must be at least as large as the biggest holesize[] entry */
        size_t i;

        memset(buf, 1, sizeof buf);
        for (i = 0; i < sizeof(holesize) / sizeof(holesize[0]); ++i) {
            int chunk = holesize[i];
            int count = 0;
            int fd;

            snprintf(filename, sizeof filename, "%s%d", "holesize", chunk);
            fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
            if (fd < 0) {
                printf("open file fail\n");
                return -1;
            }
            while (count < filesize) {
                int remain = chunk;

                /* Skip `chunk` bytes without writing them: this is the candidate hole. */
                if (lseek(fd, chunk, SEEK_CUR) == (off_t)-1) {
                    printf("lseek fail\n");
                    close(fd);
                    return -1;
                }
                /* Follow the gap with `chunk` bytes of real data; retry short writes. */
                while (remain > 0) {
                    ssize_t ret = write(fd, buf, (size_t)remain);
                    if (ret < 0) {
                        perror("write fail");
                        close(fd);
                        return -1;
                    }
                    remain -= (int)ret;
                }
                count += chunk * 2;
            }
            close(fd);
        }
        return 0;
    }
    
    • MAC OSX 10.1.4.6测试结果
    ^_^$ ll -s
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize1
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize1024
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize128
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize16
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize16384
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize2
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize2048
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize256
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize32
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize32768
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize4
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize4096
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize512
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize64
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize8
    128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize8192
    

    Mac OSX上创建不了空洞文件,因为默认的文件系统是HFS+,不支持稀疏文件

    • Ubuntu18 4.15.0-60-generic测试结果
    ^_^$ ll -s
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize1
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize1024
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize128
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize16
    32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize16384
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize2
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize2048
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize256
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize32
    32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize32768
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize4
    32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize4096
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize512
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize64
    64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize8
    32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize8192
    

    4KB以上才实际创建空洞。
    因为在linux的文件系统中,磁盘分配的最小物理单元为簇。(即使文件大小不足以占用满一簇,该簇空余的磁盘存储仍旧是该文件的)

    所以可以根据这个性质,判断文件是否是空洞文件。有空洞的文件,用文件大小计算的block数至少比实际占用的block数大1个簇的block数

    如何可移植地获取簇的大小(这里用内存页大小来近似;严格来说页大小与文件系统的块大小并不是同一个概念,也可以参考fstat返回的st_blksize)

    pagesize = sysconf(_SC_PAGESIZE);
    

    初步实现功能

    • 源码
    #include <stdio.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <string.h>
    #include <stdlib.h>
    #include <sys/stat.h>
    #include <sys/errno.h>
    
    /*
     * Write exactly len bytes from buf to fd, issuing further write() calls
     * after short writes.
     * Returns 0 on success, -1 if write() fails (errno is left as set by write).
     */
    static int write_fully(int fd, const unsigned char *buf, size_t len)
    {
        while (len > 0) {
            ssize_t n = write(fd, buf, len);
            if (n == -1) {
                return -1;
            }
            buf += n;
            len -= (size_t)n;
        }
        return 0;
    }

    /*
     * Copy file `from` to `to`, creating/truncating `to` with mode 0644.
     *
     * The source is judged to contain holes when it is a regular file whose
     * size, rounded up to a multiple of the page size, exceeds st_blocks * 512.
     * In that case zero bytes are never written to the destination: runs of
     * zeros are skipped with lseek() and only non-zero runs are written.
     * Otherwise the data is copied verbatim.
     *
     * Diagnostic information (page size, stat fields, sparse or not) is
     * printed to stdout; errors are reported with perror().
     *
     * Returns 0 on success, -1 on any failure.
     */
    int my_cp(const char *from, const char *to)
    {
        int fd1 = -1, fd2 = -1;
        int rev = -1;
        int have_holes = 0;
        unsigned char *buffer = NULL;
        long pagesize = 0;
        long long blocks, blksize, size;
        off_t pending_hole = 0;   /* zero bytes seen but not yet skipped in fd2 */
        ssize_t read_num;
        struct stat st;

        fd1 = open(from, O_RDONLY);
        if (-1 == fd1) {
            perror("open file1 faild");
            goto err;
        }

        if (fstat(fd1, &st) != 0) {
            perror("fstat: ");
            goto err;
        }

    #ifdef _SC_PAGESIZE
        /* sysconf() returns -1 both for "error" and "no limit"; only errno tells them apart. */
        errno = 0;
        pagesize = sysconf(_SC_PAGESIZE);
        if (pagesize < 0) {
            if (errno == 0) {
                fputs(" (no limit)\n", stdout);
                pagesize = st.st_blksize;
            } else if (errno == EINVAL) {
                fputs(" (not supported)\n", stdout);
                pagesize = st.st_blksize;
            } else {
                perror("sysconf error");
                goto err;
            }
        }
        printf("pagesize: %ld\n", pagesize);
    #else
        pagesize = st.st_blksize;
    #endif
        /* pagesize is used as a divisor and as the buffer size below. */
        if (pagesize <= 0) {
            fputs("invalid page size\n", stderr);
            goto err;
        }

        blocks = st.st_blocks;
        blksize = st.st_blksize;
        size = st.st_size;
        printf("st.st_blocks: %lld\n", blocks);
        printf("st.st_blksize: %lld\n", blksize);
        printf("st.st_size: %lld\n", size);
        /* Assumes st_blocks is counted in 512-byte units; this may not hold on every platform. */
        if (S_ISREG(st.st_mode) && (size / pagesize + (size % pagesize ? 1 : 0)) * pagesize > 512 * blocks) {
            have_holes = 1;
            printf("%s is a sparse-block file!\n", from);
        } else {
            have_holes = 0;
            printf("%s is not a sparse-block file!\n", from);
        }

        fd2 = open(to, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
        if (-1 == fd2) {
            perror("open file2 faild");
            goto err;
        }

        buffer = malloc((size_t)pagesize);
        if (buffer == NULL) {
            perror("malloc fail");
            goto err;
        }

        while ((read_num = read(fd1, buffer, (size_t)pagesize)) > 0) {
            ssize_t pos = 0;

            /* Source has no holes: plain copy. */
            if (!have_holes) {
                if (write_fully(fd2, buffer, (size_t)read_num) == -1) {
                    perror("write file2 error");
                    goto err;
                }
                continue;
            }

            /* Source has holes: split the buffer into zero and non-zero runs
             * (byte-by-byte scan, so this path is slow). */
            while (pos < read_num) {
                ssize_t run_start = pos;

                if (buffer[pos] == 0) {
                    while (pos < read_num && buffer[pos] == 0) {
                        pos++;
                    }
                    /* Only remember the zeros; the seek is deferred until data follows. */
                    pending_hole += pos - run_start;
                } else {
                    while (pos < read_num && buffer[pos] != 0) {
                        pos++;
                    }
                    if (pending_hole > 0) {
                        if (lseek(fd2, pending_hole, SEEK_CUR) == (off_t)-1) {
                            perror("lseek file2 fail");
                            goto err;
                        }
                        pending_hole = 0;
                    }
                    if (write_fully(fd2, buffer + run_start, (size_t)(pos - run_start)) == -1) {
                        perror("write file2 error");
                        goto err;
                    }
                }
            }
        }
        if (-1 == read_num) {
            perror("read file1 error");
            goto err;
        }

        /* A seek past end-of-file does not extend the file until something is
         * written there. If the source ends in zeros, write the very last zero
         * byte for real so the destination gets the full length. */
        if (pending_hole > 0) {
            static const unsigned char zero_byte = 0;

            if (lseek(fd2, pending_hole - 1, SEEK_CUR) == (off_t)-1) {
                perror("lseek file2 fail");
                goto err;
            }
            if (write_fully(fd2, &zero_byte, 1) == -1) {
                perror("write file2 error");
                goto err;
            }
        }
        rev = 0;
    err:
        free(buffer);
        if (fd1 != -1) {
            close(fd1);
        }
        /* A failing close() on the output can be the only report of a write error. */
        if (fd2 != -1 && close(fd2) == -1 && rev == 0) {
            perror("close file2 error");
            rev = -1;
        }
        return rev;
    }
    
    /*
     * Entry point: <program> file1 file2 copies file1 to file2 with my_cp().
     * Exits with EXIT_SUCCESS when the copy succeeded and EXIT_FAILURE on a
     * usage error or a failed copy.
     */
    int main(int argc, char *argv[])
    {
        if (argc < 3) {
            fprintf(stderr, "Usage: %s file1 file2\n", argv[0]);
            return EXIT_FAILURE;
        }
        /* Propagate the copy result so scripts can detect a failed copy. */
        return my_cp(argv[1], argv[2]) == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
    }
    
    • 测试结果
    ^_^$ ./my_cp holesize2048 holesize2048.cp
    pagesize: 4096
    st.st_blocks: 128
    st.st_blksize: 4096
    st.st_size: 65536
    holesize2048 is not a sparse-block file!
    chen@ubuntu18:~/study/apue.3e/exercises/4
    ^_^$ ./my_cp holesize4096 holesize4096.cp
    pagesize: 4096
    st.st_blocks: 72
    st.st_blksize: 4096
    st.st_size: 65536
    holesize4096 is a sparse-block file!
    
    ^_^$ ll -s
    total 1708
    64 -rw-r--r-- 1 chen chen  65536 1月   6 17:27 holesize2048
    64 -rw-r--r-- 1 chen chen  65536 1月   6 17:27 holesize2048.cp
    36 -rw-r--r-- 1 chen chen  65536 1月   6 17:27 holesize4096
    32 -rw-r--r-- 1 chen chen  65536 1月   6 17:27 holesize4096.cp
    

    空洞文件可以正常拷贝

    尝试优化程序

    上面的程序仅在判断文件是否含有空洞时利用了空洞的最小限制,而在实际读写时并没有利用该性质。

    这样较短的0字节也会当成是空洞,导致系统调用次数的增加,性能的降低

    要优化性能,必须进一步探究空洞的性质。在什么样的情况下才创建空洞(不实际占用磁盘空间的块)?

    • 测试程序源码

    此程序创建了3个文件:

    - 文件1先`write`了1K的非零数据,然后`lseek` 7K-1字节。循环2次。
    - 文件2先`write`了1K的非零数据,然后`lseek` 7K字节。循环2次
    - 文件3先`write`了1K的非零数据,然后`lseek` 7K+1字节。循环2次
    
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <string.h>
    
    /* Base size in bytes; main() derives its data chunk (1/4) and gap (7/4, +/-1) lengths from it. */
    int holesize[]={4096};
    /* Not referenced by this program's main(). */
    int filesize = 64*1024;
    
    /*
     * Create (or truncate) `name` and perform `rounds` repetitions of
     * "write data_len bytes taken from data, then lseek() forward gap_len bytes".
     * The final gap is only a seek, so it does not extend the file.
     *
     * Prints a message and returns -1 on failure, returns 0 on success.
     */
    static int make_gapped_file(const char *name, const unsigned char *data,
                                int data_len, int gap_len, int rounds)
    {
        int round;
        int fd = open(name, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);

        if (fd < 0) {
            printf("open file fail\n");
            return -1;
        }
        for (round = 0; round < rounds; ++round) {
            int remain = data_len;

            /* data is uniform filler, so restarting from its beginning after a short write is fine. */
            while (remain > 0) {
                ssize_t ret = write(fd, data, (size_t)remain);
                if (ret < 0) {
                    perror("write fail");
                    close(fd);
                    return -1;
                }
                remain -= (int)ret;
            }
            if (lseek(fd, gap_len, SEEK_CUR) == (off_t)-1) {
                printf("lseek fail\n");
                close(fd);
                return -1;
            }
        }
        close(fd);
        return 0;
    }

    /*
     * For every entry N of holesize[], create three files "holesize<N>-1",
     * "-2" and "-3". Each gets two rounds of N/4 bytes of 0x01 data followed by
     * a gap of 7N/4-1, 7N/4 and 7N/4+1 bytes respectively, so the second data
     * chunk starts just before, exactly on, and just after a 2N boundary.
     *
     * Returns 0 on success, -1 on the first failure.
     */
    int main(void)
    {
        static const int gap_delta[] = { -1, 0, 1 };
        unsigned char buf[32*1024];
        char filename[32];
        size_t i, k;

        memset(buf, 1, sizeof buf);
        for (i = 0; i < sizeof(holesize) / sizeof(holesize[0]); ++i) {
            int data_len = holesize[i] * 1 / 4;
            int gap_len = holesize[i] * 7 / 4;

            for (k = 0; k < sizeof(gap_delta) / sizeof(gap_delta[0]); ++k) {
                snprintf(filename, sizeof filename, "%s%d-%d", "holesize", holesize[i], (int)k + 1);
                if (make_gapped_file(filename, buf, data_len, gap_len + gap_delta[k], 2) != 0) {
                    return -1;
                }
            }
        }
        return 0;
    }
    
    • 测试结果
    ^_^$ ll -s
    12 -rw-r--r-- 1 chen chen  9215 1月   6 15:07 holesize4096-1
     8 -rw-r--r-- 1 chen chen  9216 1月   6 15:07 holesize4096-2
     8 -rw-r--r-- 1 chen chen  9217 1月   6 15:07 holesize4096-3
    

    可见空洞必须从一页的起始位置开始计算,并且等于或超过pagesize,才不占用实际磁盘空间

    优化后程序

    • 源码
    #include <stdio.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <string.h>
    #include <stdlib.h>
    #include <sys/stat.h>
    #include <sys/errno.h>
    
    /*
     * Read from fd until nbyte bytes have been stored in buf or end-of-file is
     * reached, calling read() again after every short read.
     *
     * Returns the number of bytes stored (smaller than nbyte only when
     * end-of-file was hit), or -1 as soon as any read() call fails.
     */
    ssize_t read_ex(int fd, void *buf, size_t nbyte){
        unsigned char *dst = buf;
        size_t filled = 0;

        while (filled < nbyte) {
            ssize_t got = read(fd, dst + filled, nbyte - filled);

            if (got == -1) {
                return -1;
            }
            if (got == 0) {
                /* End of file: report what has been collected so far. */
                break;
            }
            filled += (size_t)got;
        }
        return (ssize_t)filled;
    }
    
    /*
     * Write all nbyte bytes of buf to fd, calling write() again after every
     * short write. The caller's buffer is only read, never modified.
     *
     * Returns nbyte on success, or -1 as soon as any write() call fails.
     */
    ssize_t write_ex(int fd, const void *buf, size_t nbyte){
        const unsigned char *write_start = buf;   /* keep the const qualifier of the caller's buffer */
        size_t write_remain = nbyte;

        while (write_remain > 0) {
            ssize_t write_num = write(fd, write_start, write_remain);

            if (-1 == write_num) {
                return -1;
            }
            write_start += write_num;
            write_remain -= (size_t)write_num;
        }
        return (ssize_t)nbyte;
    }
    /*
     * Copy file `from` to `to`, creating/truncating `to` with mode 0644.
     *
     * The source is judged to contain holes when it is a regular file whose
     * size, rounded up to a multiple of the page size, exceeds st_blocks * 512.
     * The file is then processed one page-sized chunk at a time: a chunk that
     * consists only of zero bytes is skipped in the destination with lseek()
     * instead of being written; every other chunk is written as is. Without
     * holes every chunk is written.
     *
     * Diagnostic information (page size, stat fields, sparse or not) is
     * printed to stdout; errors are reported with perror().
     *
     * Returns 0 on success, -1 on any failure.
     */
    int my_cp(const char *from, const char *to)
    {
        int fd1 = -1, fd2 = -1;
        int rev = -1;
        int have_holes = 0;
        int ends_in_hole = 0;   /* non-zero while the last chunk was skipped, not written */
        unsigned char *buffer = NULL, *buffer_zero = NULL;
        long pagesize = 0;
        long long blocks, blksize, size;
        ssize_t read_num;
        struct stat st;

        fd1 = open(from, O_RDONLY);
        if (-1 == fd1) {
            perror("open file1 faild");
            goto err;
        }

        if (fstat(fd1, &st) != 0) {
            perror("fstat: ");
            goto err;
        }

    #ifdef _SC_PAGESIZE
        /* sysconf() returns -1 both for "error" and "no limit"; only errno tells them apart. */
        errno = 0;
        pagesize = sysconf(_SC_PAGESIZE);
        if (pagesize < 0) {
            if (errno == 0) {
                fputs(" (no limit)\n", stdout);
                pagesize = st.st_blksize;
            } else if (errno == EINVAL) {
                fputs(" (not supported)\n", stdout);
                pagesize = st.st_blksize;
            } else {
                perror("sysconf error");
                goto err;
            }
        }
        printf("pagesize: %ld\n", pagesize);
    #else
        pagesize = st.st_blksize;
    #endif
        /* pagesize is used as a divisor and as the buffer size below. */
        if (pagesize <= 0) {
            fputs("invalid page size\n", stderr);
            goto err;
        }

        blocks = st.st_blocks;
        blksize = st.st_blksize;
        size = st.st_size;
        printf("st.st_blocks: %lld\n", blocks);
        printf("st.st_blksize: %lld\n", blksize);
        printf("st.st_size: %lld\n", size);
        /* Assumes st_blocks is counted in 512-byte units; this may not hold on every platform. */
        if (S_ISREG(st.st_mode) && (size / pagesize + (size % pagesize ? 1 : 0)) * pagesize > 512 * blocks) {
            have_holes = 1;
            printf("%s is a sparse-block file!\n", from);
        } else {
            have_holes = 0;
            printf("%s is not a sparse-block file!\n", from);
        }

        buffer = malloc((size_t)pagesize);
        /* All-zero reference page used to recognise chunks that can become holes. */
        buffer_zero = calloc(1, (size_t)pagesize);
        if (buffer == NULL || buffer_zero == NULL) {
            perror("malloc fail");
            goto err;
        }

        fd2 = open(to, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
        if (-1 == fd2) {
            perror("open file2 faild");
            goto err;
        }

        while ((read_num = read_ex(fd1, buffer, (size_t)pagesize)) > 0) {
            if (have_holes && !memcmp(buffer_zero, buffer, (size_t)read_num)) {
                /* Chunk is all zeros: leave a hole of the same length. */
                if (-1 == lseek(fd2, read_num, SEEK_CUR)) {
                    perror("lseek file2 fail");
                    goto err;
                }
                ends_in_hole = 1;
            } else {
                /* Chunk carries data: write it out completely. */
                if (-1 == write_ex(fd2, buffer, (size_t)read_num)) {
                    perror("write file2 error");
                    goto err;
                }
                ends_in_hole = 0;
            }
        }
        if (-1 == read_num) {
            perror("read file1 error");
            goto err;
        }

        /* A seek past end-of-file does not extend the file until something is
         * written there. If the copy ended with a skipped chunk, step back one
         * byte and write a real zero so the destination gets the full length. */
        if (ends_in_hole) {
            if (-1 == lseek(fd2, -1, SEEK_CUR)) {
                perror("lseek file2 fail");
                goto err;
            }
            if (-1 == write_ex(fd2, buffer_zero, 1)) {
                perror("write file2 error");
                goto err;
            }
        }
        rev = 0;
    err:
        free(buffer);
        free(buffer_zero);
        if (fd1 != -1) {
            close(fd1);
        }
        /* A failing close() on the output can be the only report of a write error. */
        if (fd2 != -1 && close(fd2) == -1 && rev == 0) {
            perror("close file2 error");
            rev = -1;
        }
        return rev;
    }
    
    /*
     * Entry point: <program> file1 file2 copies file1 to file2 with my_cp().
     * Exits with EXIT_SUCCESS when the copy succeeded and EXIT_FAILURE on a
     * usage error or a failed copy.
     */
    int main(int argc, char *argv[])
    {
        if (argc < 3) {
            fprintf(stderr, "Usage: %s file1 file2\n", argv[0]);
            return EXIT_FAILURE;
        }
        /* Propagate the copy result so scripts can detect a failed copy. */
        return my_cp(argv[1], argv[2]) == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
    }
    
    • 对比测试

    构造一个文件,除了开头一个空洞,其余数据为0x00,0x01的100000次重复

    用优化前的程序拷贝该文件10000次,大约2000s

    用优化后的程序拷贝该文件10000次,大约30s

  • 相关阅读:
    linux下的apue.3e安装[Unix环境高级编程]
    mysql将主键序号置为1
    玛丽全开-许愿池
    redis学习之路

    jvm
    oracle版本号含义
    oracle查看所有的表空间
    oracle存储过程 package
    oracle同义词创建(synonym)
  • 原文地址:https://www.cnblogs.com/logchen/p/12157828.html
Copyright © 2011-2022 走看看