zoukankan      html  css  js  c++  java
  • 使用 zlib 压缩爬虫采集到的网页源码并保存到 MongoDB,以减少存储空间

      1 import zlib
      2 import pymongo
      3 
      4 def compress_html(infile,dst,level=9):
      5     '''[summary]
      6     
      7     [压缩文件]
      8     Arguments:
      9         infile {[string]} -- [输入文件路径]
     10         dst {[string]} -- [输出文件路径]
     11     
     12     Keyword Arguments:
     13         level {number} -- [压缩比例,压缩级别是一个0-9的数字,0压缩速度最快(压缩的过程),9压缩速度最慢,压缩率最大,0不压缩数据] (default: {9})
     14     '''
     15     infile = open(infile,'rb')
     16     dst = open(dst,'wb')
     17     compress = zlib.compressobj(level)
     18     data = infile.read(1024)
     19     while data:
     20         dst.write(compress.compress(data))
     21         data = infile.read(1024)
     22     dst.write(compress.flush())
     23     infile.close()
     24     dst.close()
     25 
     26 def decompress(infile, dst):
     27     '''[summary]
     28     
     29     [解压文件]
     30     
     31     Arguments:
     32         infile {[string]} -- [输入文件路径]
     33         dst {[string]} -- [输出文件路径]
     34     '''
     35     infile = open(infile, 'rb')
     36     dst = open(dst, 'wb')
     37     decompress = zlib.decompressobj()
     38     data = infile.read(1024)
     39     while data:
     40       dst.write(decompress.decompress(data))
     41       data = infile.read(1024)
     42     dst.write(decompress.flush())
     43     infile.close()
     44     dst.close()
     45 
     46 
     47 def compress_str(instr):
     48     '''[summary]
     49     
     50     [压缩字符串]
     51     
     52     Arguments:
     53         instr {[string]} -- [待压缩的字符串]
     54     '''
     55     # MONGODB 主机名
     56     MONGODB_HOST = "192.168.0.67"
     57     # MONGODB 端口号
     58     MONGODB_PORT = 27017
     59     # 数据库名称
     60     MONGODB_DBNAME = "CompressHtml"
     61     # 存放数据的表名称
     62     MONGODB_SHEETNAME = "compress_html"
     63 
     64     # 创建MONGODB数据库链接
     65     client = pymongo.MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)
     66     # 指定数据库
     67     mydb = client[MONGODB_DBNAME]
     68     # 存放数据的数据库表名
     69     postdb = mydb[MONGODB_SHEETNAME]
     70     compress_str = zlib.compress(instr.encode(encoding='utf-8'),level=9)
     71     print(type(compress_str))
     72     print(compress_str)
     73     postdb.insert_one({"cas":"350-03-8","英文名称":"3-Acetylpyridine","英文同义词":"NSC 761;FEMA 3424;Imatinib-int A;3-ActylPyridine;3-ACETOPYRIDINE;3-acetyl-pyridin;3-Acetylpiridine;3-Acetalpyridine;FEMA NUMBER 3424;3-ACETYLPYRIDINE"
     74         ,"中文名称":"3-乙酰基吡啶","html":compress_str})
     75     client.close()
     76 
     77 def decompress_str(bytes_data=None):
     78     '''[summary]
     79     
     80     [将二进制html文件解压成str]
     81     
     82     Arguments:
     83         bytes_data {[bytes]} -- [待解压的html]
     84     '''
     85     # MONGODB 主机名
     86     MONGODB_HOST = "192.168.0.67"
     87     # MONGODB 端口号
     88     MONGODB_PORT = 27017
     89     # 数据库名称
     90     MONGODB_DBNAME = "CompressHtml"
     91     # 存放数据的表名称
     92     MONGODB_SHEETNAME = "compress_html"
     93 
     94     # 创建MONGODB数据库链接
     95     client = pymongo.MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)
     96     # 指定数据库
     97     mydb = client[MONGODB_DBNAME]
     98     # 存放数据的数据库表名
     99     postdb = mydb[MONGODB_SHEETNAME]
    100     data = postdb.find_one()
    101     bytes_html = data.get("html")
    102     print(type(bytes_html))
    103     html_source = zlib.decompress(bytes_html)
    104     print(type(html_source))
    105     html_str = html_source.decode('utf-8','ignore')
    106     print(type(html_str))
    107     client.close()
    108 
    109 
    110 
    111 
    112 if __name__ == '__main__':
    113     # compress_html("3-乙酰基吡啶 _ 350-03-8.html","350-03-8_compress.html")
    114     # compress_html("3-乙酰基吡啶 _ 350-03-8_noheaderfooter.txt","350-03-8_compress.txt")
    115 
    116     # with open("3-乙酰基吡啶 _ 350-03-8.html",'r',encoding='utf-8') as f:
    117     #     data = f.read()
    118     #     # print(data)
    119     #     print(type(data))
    120     #     compress_str(data)
    121     decompress_str(bytes_data=None)

    
    
    
    
    

    压缩效果还是非常不错的:源文件由 138 KB 压缩后为 19 KB,体积缩小到原来的约 1/7.2,为大规模存储数据到 MongoDB 节省了很多磁盘存储空间。

    
    

     

     MongoDB 中可以直接存入 Binary(二进制)类型的数据,因此 zlib 压缩后的 bytes 可以直接作为文档字段保存。

  • 相关阅读:
    SAP的PI日志查看工具
    微信小程序调用SAP发布的REST显示数据列表
    SAP发布REST/HTTP接口
    SAP的JSON没有双引号问题
    SAP扩展库位
    函数使用十一:BAPI_BANK_CREATE
    竟然有人在群里谈交钱培训PI。。。。等哥哥有时间,断了你们的财路
    FPM十一:点击POPUP显示明细
    WDA基础十八:Select option配置
    SAP常见查询组合
  • 原文地址:https://www.cnblogs.com/fly-kaka/p/13850649.html
Copyright © 2011-2022 走看看