zoukankan      html  css  js  c++  java
  • 使用Python3将Markdown(.md)文本转换成 html、pdf

    一、Markdown中不同的文本内容会分成不同的文本块,并通过markdown的语法控制进行文本的拼接,组成新的文件。

    二、利用Python3实现(.md)文件转换成(.html)文件

      在cmd命令行下进入(.py)文件所在目录,使用输入/输出重定向执行(脚本从标准输入读取文本,并把生成的 HTML 打印到标准输出):

      >python md2html.py < file.md > file.html

    import sys, re
    
    #生成器模块
    def lines(file):
        """Yield every line of *file*, then one extra newline-only line.

        The extra blank line acts as a sentinel so that a consumer which
        flushes a text block on each blank line also flushes the last block,
        even when the input does not end with an empty line.

        Args:
            file: any iterable of text lines (for example an open text file).

        Yields:
            Each line of *file* unchanged, followed by a single newline string.
        """
        yield from file
        yield '\n'
    
    def blocks(file):
        """Group the lines of *file* into blocks separated by blank lines.

        Consecutive non-blank lines are concatenated and stripped of
        surrounding whitespace; each such run is yielded as one string.
        Runs of blank lines produce nothing.
        """
        pending = []
        for text in lines(file):
            if not text.strip():
                # Blank line: emit whatever has been collected so far.
                if pending:
                    yield ''.join(pending).strip()
                    pending = []
                continue
            pending.append(text)
    
    #文本块处理程序
    class Handler:
        """
        Base class for renderers.

        Events are dispatched by name: ``start('x')`` looks up ``start_x``,
        ``end('x')`` looks up ``end_x`` and ``sub('x')`` builds a regex
        substitution callback around ``sub_x``. Missing methods are ignored.
        """
        def callback(self, prefix, name, *args):
            """Call the method named ``prefix + name`` if it exists.

            Returns the method's result, or None when no callable attribute
            of that name is found.
            """
            target = getattr(self, prefix + name, None)
            if not callable(target):
                return None
            return target(*args)

        def start(self, name):
            """Dispatch the opening event for *name*."""
            self.callback('start_', name)

        def end(self, name):
            """Dispatch the closing event for *name*."""
            self.callback('end_', name)

        def sub(self, name):
            """Return a function usable as the replacement argument of re.sub.

            The returned function forwards the match object to ``sub_<name>``
            and falls back to the unchanged matched text when that method is
            missing or returns None.
            """
            def substitution(match):
                replacement = self.callback('sub_', name, match)
                return match.group(0) if replacement is None else replacement
            return substitution
    
    class HTMLRenderer(Handler):
        """
        Renderer that prints each event as an HTML tag with inline styling.

        Every ``start_*`` / ``end_*`` method prints one opening or closing
        tag; the ``sub_*`` methods return the HTML that replaces an inline
        match; ``feed`` prints the block text itself.
        """
        def _emit(self, markup):
            # Single output point: all markup is written with print().
            print(markup)

        def start_document(self):
            self._emit('<html><head><title>Python文本解析</title></head><body>')

        def end_document(self):
            self._emit('</body></html>')

        def start_paragraph(self):
            self._emit('<p style="color: #444;">')

        def end_paragraph(self):
            self._emit('</p>')

        def start_heading(self):
            self._emit('<h2 style="color: #68BE5D;">')

        def end_heading(self):
            self._emit('</h2>')

        def start_list(self):
            self._emit('<ul style="color: #363736;">')

        def end_list(self):
            self._emit('</ul>')

        def start_listitem(self):
            self._emit('<li>')

        def end_listitem(self):
            self._emit('</li>')

        def start_title(self):
            self._emit('<h1 style="color: #1ABC9C;">')

        def end_title(self):
            self._emit('</h1>')

        def sub_emphasis(self, match):
            """Wrap the first captured group in an <em> element."""
            return '<em>{0}</em>'.format(match.group(1))

        def sub_url(self, match):
            """Turn the captured URL into a link that shows the URL itself."""
            address = match.group(1)
            return '<a target="_blank" style="text-decoration: none;color: #BC1A4B;" href="{0}">{0}</a>'.format(address)

        def sub_mail(self, match):
            """Turn the captured e-mail address into a mailto: link."""
            address = match.group(1)
            return '<a style="text-decoration: none;color: #BC1A4B;" href="mailto:{0}">{0}</a>'.format(address)

        def feed(self, data):
            """Print the text of a block."""
            self._emit(data)
    
    
    #规则,判断每个文本块应该如何处理
    class Rule:
        """
        Base class for block rules.

        Subclasses provide a ``type`` attribute naming the event and a
        ``condition(block)`` method deciding whether the rule applies.
        """
        def action(self, block, handler):
            """
            Wrap *block* in the start/end events of this rule's type.

            Returns True so that the caller can stop trying further rules
            for this block.
            """
            kind = self.type
            handler.start(kind)
            handler.feed(block)
            handler.end(kind)
            return True
    
    class HeadingRule(Rule):
        """
        Heading rule.

        A block counts as a heading when it is a single line of at most
        70 characters that does not end with a colon.
        """
        type = 'heading'

        def condition(self, block):
            """
            Return True when *block* looks like a heading.
            """
            # endswith() instead of block[-1] so an empty block cannot raise.
            return '\n' not in block and len(block) <= 70 and not block.endswith(':')
    
    class TitleRule(HeadingRule):
        """
        Title rule: a heading that is the first block offered to this rule.

        Only the very first block tested can be the title; every later call
        to condition() returns False.
        """
        type = 'title'
        first = True

        def condition(self, block):
            if self.first:
                # Consume the one-shot flag before testing, whatever the outcome.
                self.first = False
                return HeadingRule.condition(self, block)
            return False
    
    class ListItemRule(Rule):
        """
        List item rule: a block whose first character is a hyphen.
        """
        type = 'listitem'

        def condition(self, block):
            marker = block[0]
            return marker == '-'

        def action(self, block, handler):
            """Emit the item with its leading hyphen and padding removed."""
            text = block[1:].strip()
            handler.start(self.type)
            handler.feed(text)
            handler.end(self.type)
            return True
    
    class ListRule(ListItemRule):
        """
        List rule: opens a list before the first item and closes it at the
        first block that is not an item.
        """
        type = 'list'
        inside = False

        def condition(self, block):
            # Runs on every block so that transitions in both directions are seen.
            return True

        def action(self, block, handler):
            is_item = ListItemRule.condition(self, block)
            if is_item and not self.inside:
                handler.start(self.type)
                self.inside = True
            elif self.inside and not is_item:
                handler.end(self.type)
                self.inside = False
            # Never consumes the block: later rules still have to render it.
            return False
    
    class ParagraphRule(Rule):
        """
        Paragraph rule: accepts every block.
        """
        type = 'paragraph'
    
        def condition(self, block):
            # Always matches; BasicTextParser registers this rule last, so it
            # acts as the fallback for blocks no earlier rule consumed.
            return True
    
    class Code(Rule):
        '''
        Placeholder for a code-block rule and a syntax-highlighting rule.
        Not implemented: the class defines neither a type nor a condition.
        '''
        pass
    
    
    # 对整个文本进行解析
    class Parser:
        """
        Base parser: applies inline filters and block rules to every block
        of the input and reports the result to a handler.
        """
        def __init__(self, handler):
            self.handler = handler
            self.rules = []
            self.filters = []

        def addRule(self, rule):
            """
            Register a block rule; rules are tried in registration order.
            """
            self.rules.append(rule)

        def addFilter(self, pattern, name):
            """
            Register an inline filter.

            Every match of the regex *pattern* in a block is replaced by
            whatever the handler's ``sub_<name>`` method returns.
            """
            def apply(block, handler):
                replacer = handler.sub(name)
                return re.sub(pattern, replacer, block)
            self.filters.append(apply)

        def parse(self, file):
            """
            Parse *file* block by block.

            Each block first runs through all filters, then through the rules
            in order; a rule whose action returns a true value ends the rule
            loop for that block.
            """
            handler = self.handler
            handler.start('document')
            for block in blocks(file):
                for apply in self.filters:
                    block = apply(block, handler)
                for rule in self.rules:
                    if not rule.condition(block):
                        continue
                    if rule.action(block, handler):
                        break
            handler.end('document')
    
    class BasicTextParser(Parser):
        """
        Plain-text parser preloaded with the list, title, heading and
        paragraph rules and the emphasis, URL and e-mail filters.
        """
        def __init__(self, handler):
            Parser.__init__(self, handler)
            # Order matters: ListRule only opens/closes lists and never
            # consumes a block; ParagraphRule accepts everything, so it is last.
            for rule in (ListRule(), ListItemRule(), TitleRule(),
                         HeadingRule(), ParagraphRule()):
                self.addRule(rule)

            # The asterisks must be escaped: an unescaped leading '*' is a
            # quantifier with nothing to repeat and makes re.sub raise.
            self.addFilter(r'\*(.+?)\*', 'emphasis')
            self.addFilter(r'(http://[.a-zA-Z/]+)', 'url')
            self.addFilter(r'([.a-zA-Z]+@[.a-zA-Z]+[a-zA-Z]+)', 'mail')
    
    
    # Script body: runs at module level, reading text from standard input
    # and letting HTMLRenderer print the generated HTML.
    """
    运行测试程序
    """
    handler = HTMLRenderer()
    parser = BasicTextParser(handler)
    parser.parse(sys.stdin)

    三、利用Python3将文本转化成pdf文件

      命令>python md2pdf.py 源文件 目标文件 [options]

    Options:
        -h --help     show help document.
        -v --version  show version information.
        -o --output   translate sourcefile into html file.
        -p --print    translate sourcefile into pdf file and html file respectively.
        -P --Print    translate sourcefile into pdf file only.
    import os,re
    import sys,getopt
    from enum import Enum
    from subprocess import call
    from functools import reduce
    
    from docopt import docopt
    
    # Version string handed to docopt in main().
    __version__ = '1.0'
    
    # Three Enum classes describe the line-by-line parser states.
    # Table state.
    class TABLE(Enum):
        # Init:   no table in progress.
        # Format: a candidate header row has been buffered by test_state().
        # Table:  header emitted; following rows are rendered as <tr>.
        Init = 1
        Format = 2
        Table = 3
    
    # Ordered-list state.
    class ORDERLIST(Enum):
        # Init: not inside an <ol>.  List: an <ol> has been opened.
        Init = 1
        List = 2
    
    # Fenced-block state.
    class BLOCK(Enum):
        # Init:  outside any ``` fence.
        # Block: inside a fence (plain or tagged with a language).
        # CodeBlock: defined but not referenced by the code visible here.
        Init = 1
        Block = 2
        CodeBlock = 3
    
    # Global parser state, initialised to the idle value of each state machine.
    table_state = TABLE.Init
    orderList_state = ORDERLIST.Init
    block_state = BLOCK.Init
    # True while inside a fence that was opened with a language tag.
    is_code = False
    # parse() appends "</br>" to a line only while this is True.
    is_normal = True
    
    # Buffered candidate table header: its cells and the raw source line.
    temp_table_first_line = []
    temp_table_first_line_str = ""
    
    # Set by tokenHandler(); run() appends the MathJax scripts when it is True.
    need_mathjax = False
    
    
    def test_state(input):
        """Advance the fence, ordered-list and table state machines by one line.

        The function reads and updates the module-level state variables and
        returns the line rewritten for the structure it belongs to:

        * a fence line opens or closes a <blockquote> (bare fence) or <code>
          (fence tagged python / c++ / c) block;
        * a line inside a fence is returned as a <span> with whitespace
          replaced by non-breaking spaces and is not inspected any further;
        * a line starting with digits and a dot opens or continues an <ol>;
          the first other line closes it;
        * a line containing cell separators is buffered as a candidate table
          header, confirmed by a following row made only of dashes, and later
          rows become <tr> elements; the first other line closes the table.

        Args:
            input: one raw source line, normally ending with a newline.

        Returns:
            The transformed line. It is the empty string while a candidate
            table header is being buffered.
        """
        # is_normal has to be declared global as well; otherwise every
        # assignment below only creates a local and parse() never sees it.
        global table_state, orderList_state, block_state, is_code, is_normal
        global temp_table_first_line, temp_table_first_line_str

        result = input

        # ---- fenced blocks -------------------------------------------------
        bare_fence = re.match(r'```(\s)*\n', input)
        tagged_fence = (
            len(input) > 4
            and input[0:3] == '```'
            and (input[3:9] == "python" or input[3:6] == "c++" or input[3:4] == "c")
        )

        if bare_fence and block_state == BLOCK.Init:
            # Plain block.
            result = "<blockquote>"
            block_state = BLOCK.Block
            is_normal = False
        elif tagged_fence and block_state == BLOCK.Init:
            # Code block with a language tag.
            block_state = BLOCK.Block
            result = "<code></br>"
            is_code = True
            is_normal = False
        elif block_state == BLOCK.Block and input == '```\n':
            # End of block.
            result = "</code>" if is_code else "</blockquote>"
            block_state = BLOCK.Init
            is_code = False
            is_normal = False
        elif block_state == BLOCK.Block:
            # Content line: keep the layout and skip list/table detection, so
            # text such as "1. x" or "a | b" inside a fence stays untouched.
            result = re.sub(r'[\n\r\v\f ]', "&nbsp", result)
            result = re.sub(r'\t', "&nbsp" * 4, result)
            is_normal = False
            return "<span>" + result + "</span></br>"

        # ---- ordered list --------------------------------------------------
        item = re.match(r'\d+\.', input) if len(input) > 2 else None
        if item:
            entry = "<li>" + input[item.end():] + "</li>"
            if orderList_state == ORDERLIST.Init:
                orderList_state = ORDERLIST.List
                entry = "<ol>" + entry
            result = entry
            is_normal = False
        elif orderList_state == ORDERLIST.List:
            # Prepend to result, not input, so a fence tag produced above
            # is not thrown away when the list is closed by a fence line.
            result = "</ol>" + result
            orderList_state = ORDERLIST.Init

        # ---- table ---------------------------------------------------------
        if re.match(r'^((.+)\|)+((.+))$', input):
            cells = input.rstrip('\n').split('|')
            # Drop the empty cells produced by leading/trailing separators.
            if cells[0] == '':
                cells.pop(0)
            if cells[-1] == '':
                cells.pop(-1)

            if table_state == TABLE.Init:
                # Possible header row: hold it back until the next line.
                table_state = TABLE.Format
                temp_table_first_line = cells
                temp_table_first_line_str = input
                result = ""
            elif table_state == TABLE.Format:
                if all(all_same(cell, '-') for cell in cells):
                    # Separator row confirms the table: emit the header.
                    table_state = TABLE.Table
                    header = "".join(
                        "<th>" + cell + "</th>" for cell in temp_table_first_line
                    )
                    result = "<table><thead><tr>" + header + "</tr></thead><tbody>"
                    is_normal = False
                else:
                    # Not a table after all: release the buffered line.
                    result = temp_table_first_line_str + "</br>" + input
                    table_state = TABLE.Init
            elif table_state == TABLE.Table:
                row = "".join("<td>" + cell + "</td>" for cell in cells)
                result = "<tr>" + row + "</tr>"
                is_normal = False
        elif table_state == TABLE.Table:
            table_state = TABLE.Init
            result = "</tbody></table>" + result
        elif table_state == TABLE.Format:
            # The buffered candidate header was an ordinary line; emit it now
            # instead of silently losing it.
            table_state = TABLE.Init
            result = temp_table_first_line_str + "</br>" + result

        return result
    
    # Tell whether lst consists only of repetitions of sym (an empty lst counts).
    def all_same(lst, sym):
        """Return True when *lst* is empty or equals *sym* repeated len(lst) times."""
        if not lst:
            return True
        return lst == sym * len(lst)
    
    # Render a heading line.
    def handleTitle(s, n):
        """Wrap *s* without its first *n* characters in an <hN> element."""
        tag = "h%r" % n
        return "<{0}>{1}</{0}>".format(tag, s[n:])
    
    # Render an unordered-list line.
    def handleUnorderd(s):
        """Drop the first character of *s* and wrap the rest in <ul><li>."""
        return "".join(("<ul><li>", s[1:], "</li></ul>"))
    
    
    def tokenTemplate(s, match):
        """Return the regex that captures text enclosed by the marker *match*.

        Args:
            s: unused; kept so existing callers keep working.
            match: one of '*', '**' or '~~'.

        Returns:
            A regex source string with one capture group for the enclosed
            text, or the empty string for an unknown marker.
        """
        # The asterisks are escaped: a bare '*' is a quantifier, so the
        # unescaped patterns cannot be compiled.
        templates = {
            '*': r'\*([^*]*)\*',
            '~~': r'~~([^~]*)~~',
            '**': r'\*\*([^*]*)\*\*',
        }
        return templates.get(match, "")
    
    # Handle the inline markers **, * and ~~.
    def tokenHandler(s):
        """Replace inline emphasis markers in *s* with HTML tags.

        ``**text**`` becomes ``<b>text</b>``, ``*text*`` becomes
        ``<i>text</i>`` and ``~~text~~`` becomes ``<S>text</S>``. When *s*
        contains a ``$...$`` span the module-level flag ``need_mathjax`` is
        set to True.

        Args:
            s: one line of text.

        Returns:
            The line with the markers replaced.
        """
        global need_mathjax

        # "**" has to be handled before "*", otherwise the single-asterisk
        # pattern would consume the bold markers.
        markers = (
            (r'\*\*([^*]*)\*\*', 'b'),
            (r'\*([^*]*)\*', 'i'),
            (r'~~([^~]*)~~', 'S'),
        )
        for pattern, tag in markers:
            s = re.sub(pattern, r'<%s>\1</%s>' % (tag, tag), s)

        # The dollar signs must be escaped; unescaped they are end-of-string
        # anchors and the pattern would match every line.
        if re.search(r'\$([^$]*)\$', s):
            need_mathjax = True
        return s
    
    # Handle links, images and superscript references.
    def link_image(s):
        """Convert Markdown links, images and ``^[n]`` references in *s* to HTML.

        * ``![alt](url)`` becomes ``<img src="url" alt="alt">``
        * ``[text](url)`` becomes ``<a href="url" target="_blank">text</a>``
        * ``x^[n]`` becomes ``x<sup>n</sup>``

        Args:
            s: one line of text.

        Returns:
            The line with the constructs above replaced.
        """
        from html import escape  # local import: the file does not import html

        def _image(found):
            alt, url = found.group(1, 2)
            return '<img src="%s" alt="%s">' % (escape(url), escape(alt))

        def _link(found):
            text, url = found.group(1, 2)
            # Only the attribute value is escaped; the link text may already
            # contain inline tags produced earlier.
            return '<a href="%s" target="_blank">%s</a>' % (escape(url), text)

        # Images first: their syntax contains the link syntax, so the link
        # pattern would otherwise claim them and leave a stray "!".
        # [^\]]* and [^)]* keep each match inside one construct, so several
        # links on a line are converted separately.
        s = re.sub(r'!\[([^\]]*)\]\(([^)]*)\)', _image, s)
        s = re.sub(r'\[([^\]]*)\]\(([^)]*)\)', _link, s)

        # Superscript reference: any character followed by ^[...].
        s = re.sub(r'(.)\^\[([^\]]*)\]', r'\1<sup>\2</sup>', s)
        return s
    
    
    def parse(input):
        """Convert one Markdown source line to HTML.

        The line first goes through test_state(), which handles fenced
        blocks, ordered lists and tables. Lines inside a fence are returned
        as they come back. Otherwise headings, horizontal rules, unordered
        list items, block quotes, inline markers and links are handled here.

        Args:
            input: one raw source line, normally ending with a newline.

        Returns:
            The HTML for this line; may be the empty string.
        """
        global block_state, is_normal
        is_normal = True

        # Update the parser state for this line.
        result = test_state(input)

        if block_state == BLOCK.Block:
            return result

        # test_state() may have put closing tags (or a released buffered line)
        # in front of the untouched line. Keep that prefix apart so the
        # branches below, which work on the raw line, do not drop it.
        if result != input and result.endswith(input):
            prefix, body = result[:len(result) - len(input)], input
        else:
            prefix, body = "", result

        # Heading: one to six leading '#'.
        title_rank = min(len(input) - len(input.lstrip('#')), 6)
        if title_rank != 0:
            return prefix + handleTitle(input, title_rank)

        # Horizontal rule: a line made only of dashes.
        if len(input) > 2 and all_same(input[:-1], '-') and input[-1] == '\n':
            return prefix + "<hr>"

        # Unordered list item.
        if body != "" and body[0] in ('+', '-'):
            body = handleUnorderd(body)
            is_normal = False

        # Block quote: one nesting level per leading '>'. Counting with
        # lstrip() cannot run past the end of a line made only of '>'.
        count = len(input) - len(input.lstrip('>'))
        if count:
            body = "<blockquote style=\"color:#8fbc8f\"> " * count + "<b>" + input[count:] + "</b>" + "</blockquote>" * count
            is_normal = False

        # Inline markers such as **, * and ~~.
        result = tokenHandler(prefix + body)

        # Links, images and superscript references.
        result = link_image(result)

        # Ordinary non-blank lines get an explicit line break.
        if input.endswith("\n") and is_normal and input.strip():
            result += "</br>"

        return result
    
    
    def run(source_file, dest_file, dest_pdf_file, only_pdf):
        """Translate a Markdown file to HTML and optionally to PDF.

        Args:
            source_file: path of the Markdown file; its suffix must be one of
                .md, .markdown, .mdown or .mkd.
            dest_file: path of the HTML file to write.
            dest_pdf_file: path of the PDF to produce; when it is the empty
                string and *only_pdf* is false, no PDF is produced.
            only_pdf: when true the HTML is written to a temporary file that
                is deleted after the PDF conversion.

        The process exits with status 1 when the suffix is not accepted.
        The PDF conversion calls the external program wkhtmltopdf.
        """
        _, suffix = os.path.splitext(source_file)
        # ".mkd" needs its dot: splitext() always returns the suffix with it.
        if suffix not in (".md", ".markdown", ".mdown", ".mkd"):
            print('Error: the file should be in markdown format')
            sys.exit(1)

        dest_name = ".~temp~.html" if only_pdf else dest_file

        # The output declares charset utf-8 below, so it is written as UTF-8.
        # The with-statement closes both files even if parsing raises.
        with open(source_file, "r") as source, \
                open(dest_name, "w", encoding="utf-8") as target:
            # Page skeleton: three floated columns, content in the middle one.
            target.write("""<style type="text/css">div {display: block;font-family: "Times New Roman",Georgia,Serif}
                #wrapper { width: 100%;height:100%; margin: 0; padding: 0;}#left { float:left; 
                width: 10%;  height: 100%;  }#second {   float:left;   width: 80%;height: 100%;   
                }#right {float:left;  width: 10%;  height: 100%; 
                }</style><div id="wrapper"> <div id="left"></div><div id="second">""")
            target.write("""<meta charset="utf-8"/>""")

            # Translate the Markdown file line by line.
            for eachline in source:
                result = parse(eachline)
                if result != "":
                    target.write(result)

            target.write("""</br></br></div><div id="right"></div></div>""")

            # Formula support. Raw string: the JavaScript source has to contain
            # '\\(' and '\\)' so that MathJax receives the delimiters \( and \).
            # NOTE(review): the cdn.mathjax.org host may no longer be served;
            # confirm the script URL before relying on formula rendering.
            if need_mathjax:
                target.write(r"""<script type="text/x-mathjax-config">
            MathJax.Hub.Config({tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]}});
            </script><script type="text/javascript" 
            src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>""")

        # Convert the HTML file to PDF with the external tool wkhtmltopdf.
        if dest_pdf_file != "" or only_pdf:
            call(["wkhtmltopdf", dest_name, dest_pdf_file])
        # Remove the intermediate HTML file; os.remove works on every
        # platform, unlike spawning "rm".
        if only_pdf:
            os.remove(dest_name)
    
    
    # Command-line entry point.
    def main():
        """Parse the command line with docopt and start the translation.

        The HTML goes to <outputfile> when --output is given, otherwise to
        "translation_result.html". A PDF named <outputfile> is requested when
        --print or --Print is given; --Print additionally asks run() to keep
        only the PDF.

        NOTE(review): docopt(__doc__) needs the usage text as the module
        docstring, which is not part of the code visible here; confirm it
        defines <sourcefile>, <outputfile> and the options read below.
        """
        args = docopt(__doc__, version=__version__)

        if args['--output']:
            dest_file = args['<outputfile>']
        else:
            dest_file = "translation_result.html"

        wants_pdf = args['--print'] or args['--Print']
        dest_pdf_file = args['<outputfile>'] if wants_pdf else ""

        run(args['<sourcefile>'], dest_file, dest_pdf_file, args['--Print'])
    
    
    # Run main() only when the file is executed as a script, not on import.
    if __name__=="__main__":
        main() 
  • 相关阅读:
    render()中添加js函数
    切图相关记录
    Jenkins 配置代理。
    Ant 打包 web 项目 xml 模板
    linux 删除命令
    git 创建 本地 裸 仓库
    swagger spring-boot 配置
    Idea 根据 表 生成 实体类
    java 流 api
    spring eureka 注册显示 ip:端口号
  • 原文地址:https://www.cnblogs.com/null-/p/10053532.html
Copyright © 2011-2022 走看看