zoukankan      html  css  js  c++  java
  • 用于拆解和组合PDF中各个对象的shell脚本

    拆解脚本

    # Split a PDF that uses a classic cross-reference table into raw pieces.
    #
    # Usage: bash <this-script> file.pdf
    #
    # Files written to the current directory (<base> is the input file name
    # without its directory and without a trailing ".pdf"):
    #   tdis_<base>_header.bin             first 15 bytes of the input
    #   tdis_<base>_xref.bin               bytes from "xref" up to "trailer"
    #   tdis_<base>_trailer.bin            bytes from "trailer" to end of file
    #   tdis_tdis_<base>_xref.bin          "<offset> <object number>" index
    #   tdis_metrics_tdis_<base>_xref.bin  "<offset> <length> <object number>"
    #   objects/tdis_<base>_obj_NNN.bin    one file per in-use object
    #
    # Returns 0 on success, 2 on a usage error, non-zero if a step fails.
    # Needs a grep that supports -a, -b, -o and -w, plus awk, sort and dd.
    split_pdf() {
      local pdf=${1-}
      if [[ -z "$pdf" || ! -f "$pdf" ]]; then
        printf 'usage: %s file.pdf\n' "${0##*/}" >&2
        return 2
      fi

      # NOTE(review): 15 presumably covers the "%PDF-x.y" line plus a short
      # binary-marker comment line; confirm it matches your input.
      local -r header_start=0 header_len=15

      # -w stops "startxref" from being reported as an "xref" keyword.
      # Only the first cross-reference table in the file is used.
      local xref_start trailer_start
      xref_start=$(LC_ALL=C grep -a -b -o -w -- 'xref' "$pdf" \
        | awk -F: 'NR == 1 { print $1 }')
      if [[ -z "$xref_start" ]]; then
        printf '%s: no "xref" keyword found in %s\n' "${0##*/}" "$pdf" >&2
        return 1
      fi

      # The trailer that belongs to this table is the first one after it.
      trailer_start=$(LC_ALL=C grep -a -b -o -w -- 'trailer' "$pdf" \
        | awk -F: -v min="$xref_start" \
            '!seen && $1 + 0 > min + 0 { print $1; seen = 1 }')
      if [[ -z "$trailer_start" ]]; then
        printf '%s: no "trailer" keyword after xref in %s\n' "${0##*/}" "$pdf" >&2
        return 1
      fi

      local -r xref_len=$(( trailer_start - xref_start ))

      # Output names are derived from the input's base name.
      local base=${pdf##*/}
      base=${base%.pdf}
      local -r header_dump="tdis_${base}_header.bin"
      local -r xref_dump="tdis_${base}_xref.bin"
      local -r trailer_dump="tdis_${base}_trailer.bin"
      local -r index="tdis_${xref_dump}"
      local -r metrics="tdis_metrics_${xref_dump}"

      dd if="$pdf" of="$header_dump" bs=1 \
        skip="$header_start" count="$header_len" || return
      dd if="$pdf" of="$xref_dump" bs=1 \
        skip="$xref_start" count="$xref_len" || return
      dd if="$pdf" of="$trailer_dump" bs=1 skip="$trailer_start" || return

      # Build "<offset> <object number>" for every in-use entry.
      # A two-field line is a subsection header ("first count"); a three-field
      # line is an entry ("offset generation n|f"). Free entries are skipped
      # but still advance the object number.
      awk '
        NF == 2 { objnum = $1; next }
        NF == 3 {
          if ($3 ~ /^n/) printf("%08d %08d\n", $1, objnum)
          objnum++
        }' "$xref_dump" | sort -n > "$index" || return

      # Sentinel: the last object ends where the xref table begins.
      printf '%08d %08d\n' "$xref_start" 0 >> "$index" || return

      # Each object's length is the distance to the next recorded offset.
      awk '
        NR > 1 { printf("%3d %3d %3d\n", prev_off, $1 - prev_off, prev_obj) }
        { prev_off = $1; prev_obj = $2 }' "$index" > "$metrics" || return

      mkdir -p objects || return

      local offset len objn obj_name
      while read -r offset len objn; do
        printf -v obj_name 'objects/tdis_%s_obj_%03d.bin' "$base" "$objn"
        dd if="$pdf" of="$obj_name" bs=1 skip="$offset" count="$len" || return
      done < "$metrics"
    }

    split_pdf "$@"
    

      

    组合脚本

    # Reassemble a PDF from a header, a directory of objects and a trailer.
    #
    # Usage: bash <this-script> output.pdf
    #
    # Reads from the current directory:
    #   the first file whose name contains "header.bin"  (first 15 bytes used)
    #   every regular file in objects/, in sorted name order; the N-th file
    #     is written as object number N
    #   the first file whose name contains "trailer.bin" (first 2 lines used)
    # Writes:
    #   <output>                         the rebuilt PDF
    #   tas_generated_<output>_xref.bin  the generated in-use xref entries
    #
    # The trailer dictionary is copied unchanged, so its /Size is not updated
    # if the number of objects changed.
    # Returns 0 on success, 2 on a usage error, non-zero if a step fails.
    assemble_pdf() {
      local target=${1-}
      if [[ -z "$target" ]]; then
        printf 'usage: %s output.pdf\n' "${0##*/}" >&2
        return 2
      fi

      local -r header_len=15
      local -r xref_entries="tas_generated_${target}_xref.bin"

      local f header='' trailer=''
      for f in *header.bin*; do
        if [[ -f "$f" ]]; then header=$f; break; fi
      done
      for f in *trailer.bin*; do
        if [[ -f "$f" ]]; then trailer=$f; break; fi
      done
      if [[ -z "$header" || -z "$trailer" || ! -d objects ]]; then
        printf '%s: need *header.bin, *trailer.bin and objects/ here\n' \
          "${0##*/}" >&2
        return 1
      fi

      # Start from an empty entry list so a rerun does not append to stale data.
      : > "$xref_entries" || return

      dd if="$header" of="$target" bs=1 count="$header_len" || return

      # Objects start right after whatever header was actually written.
      local obj_offset obj_len obj_nums=0
      obj_offset=$(( $(wc -c < "$target") ))
      for f in objects/*; do
        [[ -f "$f" ]] || continue
        obj_len=$(( $(wc -c < "$f") ))
        cat -- "$f" >> "$target" || return
        # Cross-reference entries are exactly 20 bytes: note the space before
        # the newline.
        printf '%010d %05d n \n' "$obj_offset" 0 >> "$xref_entries" || return
        obj_offset=$(( obj_offset + obj_len ))
        obj_nums=$(( obj_nums + 1 ))
      done

      {
        printf 'xref\n'
        # The subsection count includes the free entry for object 0.
        printf '0 %d\n' "$(( obj_nums + 1 ))"
        printf '0000000000 65535 f \n'
        cat -- "$xref_entries"
        # "trailer" keyword plus the dictionary line that follows it.
        awk 'NR <= 2' "$trailer"
        # After the loop obj_offset is the offset of the "xref" keyword.
        printf 'startxref\n%d\n%%%%EOF\n' "$obj_offset"
      } >> "$target"
    }

    assemble_pdf "$@"
    

    这样,我们就可以对解析出来的单个pdf对象进行单独操作了。

    手动找出包含 graphic operators stream 的对象，然后使用下面的脚本解压 stream（参数 $1 为对象文件名中的编号，例如 005）。

    # Print COUNT bytes of FILE starting at byte OFFSET as lowercase hex.
    _stream_hex_bytes() {
      dd if="$1" bs=1 skip="$2" count="$3" 2>/dev/null | od -An -tx1 | tr -d ' \n'
    }

    # Extract and decompress the stream of one object file.
    #
    # Usage: bash <this-script> NNN
    #   NNN is the suffix in objects/*_obj_NNN.bin, for example 005.
    #
    # Writes to the current directory:
    #   tdeflate_stream.bin  "<start> <end> " byte offsets of the stream data
    #   flated.bin           the raw (still compressed) stream bytes
    #   deflated.bin         output of `zlib-flate -uncompress`
    #
    # Returns 2 on a usage error, 1 if the object or its stream cannot be
    # located, otherwise the status of dd / zlib-flate.
    inflate_stream() {
      local id=${1-}
      if [[ -z "$id" ]]; then
        printf 'usage: %s object-number\n' "${0##*/}" >&2
        return 2
      fi

      local candidates=(objects/*_obj_"$id".bin)
      local target=${candidates[0]}
      if [[ ! -f "$target" ]]; then
        printf '%s: no objects/*_obj_%s.bin found\n' "${0##*/}" "$id" >&2
        return 1
      fi

      # With -b -o grep reports the offset of each match itself. The data
      # begins after the first "stream" keyword and ends before the last
      # "endstream" keyword.
      local xstart xend
      xstart=$(LC_ALL=C grep -a -b -o -- 'stream' "$target" \
        | awk -F: 'NR == 1 { print $1 + 6 }')
      xend=$(LC_ALL=C grep -a -b -o -- 'endstream' "$target" \
        | awk -F: '{ last = $1 } END { if (NR) print last }')
      if [[ -z "$xstart" || -z "$xend" ]] || (( xend < xstart )); then
        printf '%s: no stream ... endstream pair in %s\n' "${0##*/}" "$target" >&2
        return 1
      fi

      # Skip the end-of-line marker that follows the "stream" keyword:
      # two bytes for CRLF, otherwise one.
      local eol_len=1
      if [[ "$(_stream_hex_bytes "$target" "$xstart" 2)" == 0d0a ]]; then
        eol_len=2
      fi
      xstart=$(( xstart + eol_len ))
      if (( xstart > xend )); then
        xstart=$xend
      fi

      # Drop one end-of-line marker in front of "endstream"; it is not part
      # of the stream data. CRLF is only removed when the file used CRLF
      # after "stream", so a data byte 0x0d followed by LF is kept.
      local avail=$(( xend - xstart ))
      if (( eol_len == 2 && avail >= 2 )) \
        && [[ "$(_stream_hex_bytes "$target" "$(( xend - 2 ))" 2)" == 0d0a ]]; then
        xend=$(( xend - 2 ))
      elif (( avail >= 1 )); then
        case "$(_stream_hex_bytes "$target" "$(( xend - 1 ))" 1)" in
          0a | 0d) xend=$(( xend - 1 )) ;;
        esac
      fi

      printf '%d %d ' "$xstart" "$xend" > tdeflate_stream.bin || return
      dd if="$target" of=flated.bin bs=1 \
        skip="$xstart" count="$(( xend - xstart ))" || return
      zlib-flate -uncompress < flated.bin > deflated.bin
    }

    inflate_stream "$@"
    

    重新编辑 deflated.bin 文件后，再使用下面的脚本压缩并写回对应的对象文件（参数 $1 为对象编号，例如 005）。

    # Compress deflated.bin and write it back as the stream of one object.
    #
    # Usage: bash <this-script> NNN
    #   NNN is the decimal object number as it appears in
    #   objects/*_obj_NNN.bin (leading zeros allowed, for example 005).
    #
    # Reads ./deflated.bin, compresses it with `zlib-flate -compress`, wraps it
    # in "N 0 obj ... endobj" with a /Length that matches the compressed size,
    # and replaces the matching file under objects/. The new dictionary holds
    # only /Length and /Filter. tflate_NNN.bin is used as a staging file.
    #
    # Returns 2 on a usage error, 1 if an input is missing or compression
    # fails, otherwise the status of the final mv.
    deflate_stream() {
      local id=${1-}
      if [[ ! "$id" =~ ^[0-9]+$ ]]; then
        printf 'usage: %s object-number\n' "${0##*/}" >&2
        return 2
      fi
      if [[ ! -f deflated.bin ]]; then
        printf '%s: deflated.bin not found\n' "${0##*/}" >&2
        return 1
      fi

      # Resolve the destination first so nothing is written when it is missing.
      local candidates=(objects/*_obj_"$id".bin)
      local target=${candidates[0]}
      if [[ ! -f "$target" ]]; then
        printf '%s: no objects/*_obj_%s.bin found\n' "${0##*/}" "$id" >&2
        return 1
      fi

      local -r staged="tflate_${id}.bin"
      local payload
      payload=$(mktemp) || return

      if ! zlib-flate -compress < deflated.bin > "$payload"; then
        rm -f -- "$payload"
        return 1
      fi

      local payload_len
      payload_len=$(( $(wc -c < "$payload") ))

      if ! {
        # 10# forces base 10: "008" would be rejected and "010" read as 8
        # if the leading zero were taken as an octal prefix.
        printf '%d 0 obj\n' "$(( 10#$id ))"
        printf '<</Length %d/Filter/FlateDecode>>stream\n' "$payload_len"
        cat -- "$payload"
        printf '\nendstream\nendobj\n'
      } > "$staged"; then
        rm -f -- "$payload"
        return 1
      fi
      rm -f -- "$payload"

      mv -f -- "$staged" "$target"
    }

    deflate_stream "$@"
    
  • 相关阅读:
    第二章作业题
    数据类型及内置方法
    流程控制
    Python入门,基本数据类型
    练习题
    Java中的时间日期Date和Calendar
    String的static方法
    Java中基本类型的包装类
    Java中的API
    Java里的参数类型/返回值类型
  • 原文地址:https://www.cnblogs.com/long123king/p/3931812.html
Copyright © 2011-2022 走看看