zoukankan      html  css  js  c++  java
  • 改写《python基础教程》中的一个例子

    一、前言

    初学python,看《python基础教程》,第20章实现了将文本转化成html的功能。由于本人之前有DIY一个markdown转html的算法,所以对这个例子有兴趣。可仔细一看,发现很难看懂,一个功能分散在几个文件中,各个类的耦合非常紧。虽然自己有几年的c++开发经验,但初看这个python代码也觉得头晕。

    二、原版

    以下是其源码

     1 from __future__ import generators
     2 
     3 
     4 def lines(file):
     5     for line in file:
     6         yield line
     7     yield '\n'
     8 
     9 
    10 def blocks(file):
    11     block = []
    12     for line in lines(file):
    13         if line.strip():
    14             block.append(line)
    15         elif block:
    16             yield ''.join(block).strip()
    17             block = []
    util.py
    # This Python file uses the following encoding: utf-8
    class Rule:
        """
        Common ancestor of every rule.

        Subclasses supply a ``type`` attribute (the block name passed to
        the handler) and a ``condition(block)`` method.
        """
        def action(self, block, handler):
            """
            Emit ``block`` through ``handler`` wrapped in start/end calls
            for this rule's type.

            Always returns True, which tells the caller to stop trying
            further rules for this block.
            """
            kind = self.type
            handler.start(kind)
            handler.feed(block)
            handler.end(kind)
            return True
    
    
    class HeadingRule(Rule):
        """
        A heading is a single line that is at most 70 characters and
        that doesn't end with a colon.
        """
        type = 'heading'
    
        def condition(self, block):
            """
            Return True when ``block`` qualifies as a heading.

            An empty block is never a heading; this also avoids indexing
            into an empty string.
            """
            if not block:
                return False
            # The newline literal must stay on one line: a block containing
            # a line break spans several lines and cannot be a heading.
            return ('\n' not in block
                    and len(block) <= 70
                    and not block.endswith(':'))
    
    
    class TitleRule(HeadingRule):
        """
        Matches only the very first block examined, and only if that
        block also satisfies the heading test.
        """
        type = 'title'
        first = True  # flips to False once the first block has been seen
    
        def condition(self, block):
            """
            Return the heading test result for the first block offered;
            return False for every later block.
            """
            if self.first:
                # Consume the one-shot flag before delegating, so later
                # blocks are rejected regardless of this block's outcome.
                self.first = False
                return HeadingRule.condition(self, block)
            return False
    
    
    class ListItemRule(Rule):
        """
        A list item is a paragraph that begins with a hyphen. As part of
        the formatting, the hyphen is removed.
        """
        type = 'listitem'
    
        def condition(self, block):
            """Return True when ``block`` starts with a hyphen."""
            # startswith() is safe on an empty block, unlike block[0].
            return block.startswith('-')
    
        def action(self, block, handler):
            """
            Emit the item text (leading hyphen and surrounding whitespace
            removed) through ``handler`` between start/end calls.

            Returns True, matching Rule.action, so rule processing stops
            for this block.
            """
            handler.start(self.type)
            handler.feed(block[1:].strip())
            handler.end(self.type)
            return True
    
    
    # start ListRule {
    class ListRule(ListItemRule):
        """
        Tracks transitions into and out of a run of consecutive list
        items: the list is opened when a list item follows a non-item
        block, and closed when a non-item block follows a list item.
        """
        type = 'list'
        inside = False  # True while the current run of blocks is a list
    
        def condition(self, block):
            # Always true, because every block has to be inspected.
            return True
    
        def action(self, block, handler):
            """
            Open or close the list on ``handler`` when ``block`` crosses a
            list boundary; otherwise do nothing.
            """
            is_item = ListItemRule.condition(self, block)
            if is_item and not self.inside:
                handler.start(self.type)
                self.inside = True
            elif self.inside and not is_item:
                handler.end(self.type)
                self.inside = False
            # Always false, so that the remaining rules still process
            # this block.
            return False
    # end ListRule }
    
    
    class ParagraphRule(Rule):
        """
        Fallback rule: any block is accepted as a paragraph.
        """
        type = 'paragraph'
    
        def condition(self, block):
            """Accept every block unconditionally."""
            return True
    rules.py
     1 # start Handler {
     2 class Handler:
     3     """
     4     An object that handles method calls from the Parser.
     5 
     6     The Parser will call the start() and end() methods at the
     7     beginning of each block, with the proper block name as
     8     parameter. The sub() method will be used in regular expression
     9     substitution. When called with a name such as 'emphasis', it will
    10     return a proper substitution function.
    11     """
    12     def callback(self, prefix, name, *args):
    13         method = getattr(self, prefix+name, None)
    14         if callable(method):
    15             return method(*args)
    16 
    17     def start(self, name):
    18         self.callback('start_', name)
    19 
    20     def end(self, name):
    21         self.callback('end_', name)
    22 
    23     def sub(self, name):
    24         return lambda match: \
    25                 self.callback('sub_', name, match) or match.group(0)
    26 # end Handler }
    27 
    28 
    29 # start HTMLHandler {
    30 class HTMLHandler(Handler):
    31     """
    32     A specific handler used for rendering HTML.
    33 
    34     The methods in HTMLHandler are accessed from the superclass
    35     Handler's start(), end(), and sub() methods. They implement basic
    36     markup as used in HTML documents.
    37     """
    38     def start_document(self):
    39         print '<html><head><title>...</title></head><body>'
    40 
    41     def end_document(self):
    42         print '</body></html>'
    43 
    44     def start_paragraph(self):
    45         print '<p>'
    46 
    47     def end_paragraph(self):
    48         print '</p>'
    49 
    50     def start_title(self):
    51         print '<h1>'
    52 
    53     def end_title(self):
    54         print '</h1>'
    55 
    56     def start_heading(self):
    57         print '<h2>'
    58 
    59     def end_heading(self):
    60         print '</h2>'
    61 
    62     def start_list(self):
    63         print '<ul>'
    64 
    65     def end_list(self):
    66         print '</ul>'
    67 
    68     def start_listitem(self):
    69         print '<li>'
    70 
    71     def end_listitem(self):
    72         print '</li>'
    73 
    74     def sub_emphasis(self, match):
    75         return '<em>%s</em>' % match.group(1)
    76 
    77     def sub_url(self, match):
    78         return '<a href="%s">%s</a>' % (match.group(1), match.group(1))
    79 
    80     def sub_mail(self, match):
    81         return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))
    82 
    83     def feed(self, data):
    84         print data
    85 
    86 # end HTMLHandler }
    handles.py
     1 import sys
     2 import re
     3 from handlers import *
     4 from util import *
     5 from rules import *
     6 
     7 
     8 # start Parser {
     9 class Parser:
    10     """
    11     A Parser reads a text file, applying rules and controlling a
    12     handler.
    13     """
    14     def __init__(self, handler):
    15         self.handler = handler
    16         self.rules = []
    17         self.filters = []
    18 
    19     def addRule(self, rule):
    20         self.rules.append(rule)
    21 
    22     def addFilter(self, pattern, name):
    23         def filter(block, handler):
    24             return re.sub(pattern, handler.sub(name), block)
    25         self.filters.append(filter)
    26 
    27     def parse(self, file):
    28         self.handler.start('document')
    29 
    30         for block in blocks(file):
    31             for filter in self.filters:
    32                 block = filter(block, self.handler)
    33 
    34             for rule in self.rules:
    35                 if rule.condition(block):
    36                     last = rule.action(block, self.handler)
    37                     if last:
    38                         break
    39         self.handler.end('document')
    40 # end Parser }
    41 
    42 
    43 # start BaseTextParser {
    44 class BasicTextParser(Parser):
    45     """
    46     A specific Parser that adds rules and filters in its
    47     constructor.
    48     """
    49     def __init__(self, handler):
    50         Parser.__init__(self, handler)
    51         self.addRule(ListRule())
    52         self.addRule(ListItemRule())
    53         self.addRule(TitleRule())
    54         self.addRule(HeadingRule())
    55         self.addRule(ParagraphRule())
    56 
    57         self.addFilter(r'\*(.+?)\*', 'emphasis')
    58         self.addFilter(r'(http://[.a-zA-Z/]+)', 'url')
    59         self.addFilter(r'([.a-zA-Z]+@[.a-zA-Z]+[a-zA-Z]+)', 'mail')
    60 # end BaseTextParser }
    61 
    62 handler = HTMLHandler()
    63 parser = BasicTextParser(handler)
    64 
    65 parser.parse(sys.stdin)
    markup.py

    文本如下

    Welcome to World Wide Spam, Inc.
    
    
    These are the corporate web pages of *World Wide Spam*, Inc. We hope
    you find your stay enjoyable, and that you will sample many of our
    products.
    
    A short history of the company
    
    World Wide Spam was started in the summer of 2000. The business
    concept was to ride the dot-com wave and to make money both through
    bulk email and by selling canned meat online.
    
    After receiving several complaints from customers who weren't
    satisfied by their bulk email, World Wide Spam altered their profile,
    and focused 100% on canned goods. Today, they rank as the world's
    13,892nd online supplier of SPAM.
    
    Destinations
    
    From this page you may visit several of our interesting web pages:
    
      - What is SPAM? (http://wwspam.fu/whatisspam)
    
      - How do they make it? (http://wwspam.fu/howtomakeit)
    
      - Why should I eat it? (http://wwspam.fu/whyeatit)
    
    How to get in touch with us
    
    You can get in touch with us in *many* ways: By phone (555-1234), by
    email (wwspam@wwspam.fu) or by visiting our customer feedback page
    (http://wwspam.fu/feedback).
    test_input.txt

    使用命令行  python markup.py < test_input.txt > out.html  即可将文件转化为有格式的html文件

    上面代码有几点不足之处:

    1. rules.py代码和handles.py代码紧密耦合,rules.py,handles.py一起来实现根据规则来生成转化文本。rules.py中各种rule中定义了'heading', 'listitem'等,而handles.py中有各种start_heading(), end_heading()来响应对应的类型方法。
    2. 对文本中特殊格式的转化Filter功能分布在markup.py和handles.py中。markup.py 57-59行中定义了匹配模式,而替换的方法又在handles.py 74-81行。
    3. ...

    三、改进

    下面是本人改进后的代码

     1 from __future__ import generators
     2 
     3 
     4 def lines(file):
     5     for line in file:
     6         yield line
     7     yield '\n'
     8 
     9 
    10 def lines2(file):
    11     for line in file:
    12         s = line.strip()
    13         if s:
    14             yield s
    15     yield '\n'
    16 
    17 
    18 def blocks(file):
    19     block = []
    20     for line in lines(file):
    21         if line.strip():
    22             block.append(line)
    23         elif block:
    24             yield ''.join(block).strip()
    25             block = []
    util.py
     1 import re
     2 
     3 
     4 def createFilter(pattern, fun):
     5     def filter(line):
     6         return re.sub(pattern, fun, line)
     7     return filter
     8 
     9 
    10 def filterEm():
    11     def subEm(match):
    12         return '<em>%s</em>' % match.group(1)
    13     return createFilter(r'\*(.+?)\*', subEm)
    14 
    15 
    16 def filterUrl():
    17     def subUrl(match):
    18         return '<a href="%s">%s</a>' % (match.group(1), match.group(1))
    19     return createFilter(r'(http://[.a-zA-Z/]+)', subUrl)
    20 
    21 
    22 def filterMail():
    23     def subMail(match):
    24         return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))
    25     return createFilter(r'([.a-zA-Z]+@[.a-zA-Z]+[a-zA-Z]+)', subMail)
    26 
    27 
    28 def createFilters():
    29     filters = []
    30     filters.append(filterEm())
    31     filters.append(filterUrl())
    32     filters.append(filterMail())
    33     return filters
    filters.py
      1 # This Python file uses the following encoding: utf-8
      2 class Rule:
      3     def action(self, line):
      4         self.start(line)
      5         self.feed(line)
      6         self.end(line)
      7         return True
      8 
      9     def start(self, line):
     10         pass
     11 
     12     def end(self, line):
     13         pass
     14 
     15     def feed(self, line):
     16         print line
     17 
     18     def endDoc(self):
     19         pass
     20 
     21 
     22 class HeadingRule(Rule):  # {{{
     23     def condition(self, line):
     24         return '\n' not in line and len(line) <= 30 and not line[-1] == ':'
     25 
     26     def start(self, line):
     27         print '<h2>'
     28 
     29     def end(self, line):
     30         print '</h2>'
     31 
     32 
     33 class TitleRule(HeadingRule):
     34     first = True
     35 
     36     def condition(self, line):
     37         if not self.first:
     38             return False
     39         self.first = False
     40         return HeadingRule.condition(self, line)
     41 
     42     def start(self, line):
     43         print '<h1>'
     44 
     45     def end(self, line):
     46         print '</h1>'  # }}}
     47 
     48 
     49 class ListItemRule(Rule):  # {{{
     50     def condition(self, line):
     51         return line[0] == '-'
     52 
     53     def feed(self, line):
     54         print line[1:].strip()
     55 
     56     def start(self, line):
     57         print '<li>'
     58 
     59     def end(self, line):
     60         print '</li>'
     61 
     62 
     63 class ListRule(ListItemRule):
     64     inside = False
     65     firstIn = False
     66     firstOut = False
     67 
     68     def condition(self, line):
     69         return True
     70 
     71     def action(self, line):
     72         if not self.inside and ListItemRule.condition(self, line):
     73             self.start(line)
     74             self.inside = True
     75         elif self.inside and not ListItemRule.condition(self, line):
     76             self.end(line)
     77             self.inside = False
     78         return False
     79 
     80     def start(self, line):
     81         print '<ul>'
     82 
     83     def end(self, line):
     84         print '</ul>'
     85 
     86     def feed(self, line):
     87         pass  # }}}
     88 
     89 
     90 class ParagraphRule(Rule):
     91 
     92     def condition(self, line):
     93         return True
     94 
     95     def start(self, line):
     96         print '<p>'
     97 
     98     def end(self, line):
     99         print '</p>'
    100 
    101 
    102 class DocumentRule(Rule):
    103     first = True
    104     isStart = False
    105 
    106     def condition(self, line):
    107         if self.first:
    108             self.first = False
    109             self.isStart = True
    110             return True
    111         return False
    112 
    113     def action(self, line):
    114         if self.isStart:
    115             self.start(line)
    116             self.isStart = False
    117         return False
    118 
    119     def start(self, line):
    120         print '<html><head><title>...</title></head><body>'
    121 
    122     def end(self, line):
    123         print '</body></html>'
    124 
    125     def endDoc(self):
    126         self.end('')
    rules.py
     1 # This Python file uses the following encoding: utf-8
     2 from util import *
     3 from rules import *
     4 import re
     5 import sys
     6 
     7 
     8 class MyParser:
     9     def __init__(self):
    10         self.rules = []
    11         self.filters = []
    12 
    13     def addRule(self, rule):
    14         self.rules.append(rule)
    15 
    16     def setFilters(self, filters):
    17         self.filters = filters
    18 
    19     def parse(self, file):
    20         for line in lines2(file):
    21 
    22             for filter in self.filters:
    23                 line = filter(line)
    24 
    25             for rule in self.rules:
    26                 if rule.condition(line):
    27                     last = rule.action(line)
    28                     if last:
    29                         break
    30 
    31         # 文档结束后调用,以处理收尾工作
    32         for rule in self.rules:
    33             rule.endDoc()
    parsers.py
     1 from parsers import *
     2 from util import *
     3 from rules import *
     4 from filters import *
     5 import sys
     6 
     7 
     8 p = MyParser()
     9 p.addRule(DocumentRule())
    10 p.addRule(ListRule())
    11 p.addRule(ListItemRule())
    12 p.addRule(TitleRule())
    13 p.addRule(HeadingRule())
    14 p.addRule(ParagraphRule())
    15 p.setFilters(createFilters())
    16 
    17 p.parse(sys.stdin)
    main.py

    使用命令  python main.py < test_input.txt > out.html  运行

    有如下几点改动:

    1. rules和handles功能合在一起都放在rules.py中实现。
    2. 将Filter都放在filters.py中,并且可以看到匹配模式和替换函数写在一起,文本过滤这个功能容易一眼就看出如何实现。
    3. 添加了一个DocumentRule规则用来处理文档的开始和结束。并且在parsers.py 32行 循环调用每个rule类的endDoc()用以文档结束时的处理。当然现在只有DocumentRule类才会响应这个调用
    4. 在util.py中添加了lines2()函数,并且在parsers.py中使用这个函数来读取文本行

    最后,代码应该写得容易让人看得懂  (尤其是在一本初始教程中)。

    ps: 本人接下来将用上面的框架用python写个markdown转html的算法,然后再将代码转化成c++代码。最后完善自己的笔记软件并且用Qt写个跨windows/mac平台的markdown的编辑器。

  • 相关阅读:
    洛谷 P1019单词接龙
    洛谷 P1091合唱队列
    洛谷 P1141 01迷宫
    洛谷 P1101单词方阵
    NOIP要炸?
    洛谷 P1219八皇后
    洛谷 P1181数列分段Section I
    刷普及-刷爆了。。。。。。
    洛谷 P3952时间复杂度 (本地AC测评RE的伪题解)
    动态数码管
  • 原文地址:https://www.cnblogs.com/xiangism/p/5241096.html
Copyright © 2011-2022 走看看