zoukankan      html  css  js  c++  java
  • python转换html到pdf文件

    1.安装wkhtmltopdf 

    Windows 平台直接在 http://wkhtmltopdf.org/downloads.html 下载稳定版的 wkhtmltopdf 进行安装,安装完成之后把该程序的执行路径加入到系统环境变量 PATH 中,否则 pdfkit 找不到 wkhtmltopdf,就会出现错误 “No wkhtmltopdf executable found”

    2.安装pdfkit

    直接pip install pdfkit

    pdfkit 是 wkhtmltopdf 的Python封装包

    import pdfkit

    # Three ways to produce a PDF: from a URL, from a local HTML file,
    # or from an HTML string. Every call below writes to the same path.
    output_path = 'out.pdf'

    pdfkit.from_url('http://google.com', output_path)
    pdfkit.from_file('test.html', output_path)
    pdfkit.from_string('Hello!', output_path)

    3.合并pdf,使用PyPDF2

    直接pip install PyPDF2

    from PyPDF2 import PdfFileMerger

    # Merge 1.pdf and 2.pdf (in that order) into hql_all.pdf.
    merger = PdfFileMerger()
    # Keep the inputs open until the merged document has been written;
    # closing them earlier is not guaranteed to be safe.
    with open("1.pdf", "rb") as input1, open("2.pdf", "rb") as input2:
        merger.append(input1)
        merger.append(input2)
        # Write the merged result to the output PDF.
        with open("hql_all.pdf", "wb") as output:
            merger.write(output)
    # Release whatever the merger itself is still holding.
    merger.close()

    4.综合示例:

      1 # coding=utf-8  
      2 import os  
      3 import re  
      4 import time  
      5 import logging  
      6 import pdfkit  
      7 import requests  
      8 from bs4 import BeautifulSoup  
      9 from PyPDF2 import PdfFileMerger  
     10 
     # Minimal HTML page that wraps one extracted article body ({content}).
     html_template = """ 
     <!DOCTYPE html> 
     <html lang="en"> 
     <head> 
         <meta charset="UTF-8"> 
     </head> 
     <body> 
     {content} 
     </body> 
     </html> 

     """


     def parse_url_to_html(url, name):
         """Fetch one page and save its article body as a standalone HTML file.

         The text of the first ``<h4>`` is inserted into the body as a centred
         ``<h1>``, relative ``<img src="...">`` paths are made absolute, and the
         result is wrapped in ``html_template`` and written as UTF-8.

         :param url: URL of the page to fetch
         :param name: path of the HTML file to write
         :return: ``name`` on success; ``None`` when anything fails (the error
                  is logged instead of raised, so callers must check)
         """
         try:
             response = requests.get(url)
             soup = BeautifulSoup(response.content, 'html.parser')
             # Article body.
             body = soup.find_all(class_="x-wiki-content")[0]
             # Page title.
             title = soup.find('h4').get_text()

             # Put the title inside the body, centred.
             center_tag = soup.new_tag("center")
             title_tag = soup.new_tag('h1')
             title_tag.string = title
             center_tag.insert(1, title_tag)
             body.insert(1, center_tag)
             html = str(body)

             # Rewrite relative <img src="..."> paths to absolute URLs.
             # Groups: 1 = everything up to the opening quote, 2 = the src
             # value, 3 = the closing quote.
             pattern = r'(<img .*?src=")(.*?)(")'

             def func(m):
                 src = m.group(2)
                 # Already-absolute URLs are left untouched.
                 if src.startswith("http"):
                     return m.group(0)
                 return m.group(1) + "http://www.liaoxuefeng.com" + src + m.group(3)

             html = re.sub(pattern, func, html)
             html = html_template.format(content=html)
             html = html.encode("utf-8")
             with open(name, 'wb') as f:
                 f.write(html)
             return name
         except Exception:
             # Best effort: one broken page is logged with its traceback and
             # reported to the caller as None rather than aborting the run.
             logging.error("解析错误", exc_info=True)
             return None
     66 
     67 
     def get_url_list():
         """Collect the absolute URL of every entry in the tutorial's menu.

         :return: list of page URLs, in the order the ``<li>`` items are found
         """
         index_page = requests.get("http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000")
         soup = BeautifulSoup(index_page.content, "html.parser")
         # The second "uk-nav uk-nav-side" match is used; presumably that is the
         # chapter menu -- verify against the live page markup.
         menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
         return [
             "http://www.liaoxuefeng.com" + item.a.get('href')
             for item in menu_tag.find_all("li")
         ]
     81 
     82 
     def save_pdf(htmls, file_name):
         """Render HTML input to a PDF file through ``pdfkit.from_file``.

         :param htmls: HTML file input, handed unchanged to ``pdfkit.from_file``
         :param file_name: path of the PDF to write
         :return: None
         """
         # All four page margins share one value.
         margin = '0.75in'
         pdf_options = {
             'page-size': 'Letter',
             'margin-top': margin,
             'margin-right': margin,
             'margin-bottom': margin,
             'margin-left': margin,
             'encoding': "UTF-8",
             'custom-header': [
                 ('Accept-Encoding', 'gzip')
             ],
             'cookie': [
                 ('cookie-name1', 'cookie-value1'),
                 ('cookie-name2', 'cookie-value2'),
             ],
             'outline-depth': 10,
         }
         pdfkit.from_file(htmls, file_name, options=pdf_options)
    107 
    108 
    def main():
        """Download every tutorial page, convert each to PDF and merge them.

        Each page is saved as ``<index>.html`` and converted to its own PDF;
        the per-page PDFs are then merged into one output file and all the
        temporary HTML/PDF files are deleted. Pages that could not be
        downloaded or parsed are skipped.
        """
        start = time.time()
        file_name = u"liaoxuefeng_Python3_tutorial"
        urls = get_url_list()

        htmls = []
        pdfs = []
        for index, url in enumerate(urls):
            # parse_url_to_html returns the file name on success and None on
            # failure; a failed page has no HTML file to convert.
            html = parse_url_to_html(url, str(index) + ".html")
            if html is None:
                continue
            htmls.append(html)

            pdf = file_name + str(index) + '.pdf'
            save_pdf(html, pdf)
            pdfs.append(pdf)
            print(u"转换完成第" + str(index) + u'个html')

        merger = PdfFileMerger()
        handles = []
        try:
            for position, pdf in enumerate(pdfs):
                handle = open(pdf, 'rb')
                handles.append(handle)
                merger.append(handle)
                print(u"合并完成第" + str(position) + u'个pdf' + pdf)

            with open(u"廖雪峰Python_all.pdf", "wb") as output:
                merger.write(output)
        finally:
            # Close everything before the cleanup below: a file that is still
            # open cannot be removed on Windows.
            merger.close()
            for handle in handles:
                handle.close()

        print(u"输出PDF成功!")

        for html in htmls:
            os.remove(html)
            print(u"删除临时文件" + html)

        for pdf in pdfs:
            os.remove(pdf)
            print(u"删除临时文件" + pdf)

        total_time = time.time() - start
        print(u"总共耗时:%f 秒" % total_time)


    if __name__ == '__main__':
        main()
  • 相关阅读:
    嗨!亲爱的朋友们,欢迎您光临我的BLOG
    SQL里的各种语句语法
    2000/XP登陆后自动注销解决办法(WORM_FUNNER.A)
    ASP程序加密解密方法全面解析
    双击硬盘盘符打不开文件的处理方法
    关于rs.Open sql,conn语句
    C# 图片被占用资源无法删除或者修改,转换成数据流解决
    asp.net div 使用
    鼠标移到控件上显示,移出控件消失
    C# 文件关联 (jpg等图片格式为例)
  • 原文地址:https://www.cnblogs.com/hushaojun/p/8286893.html
Copyright © 2011-2022 走看看