zoukankan      html  css  js  c++  java
  • 动手写的Python的HTML语法分析器(面向对象)

    主要包括4个文件,util.py文件主要负责截取每个块。

    rules.py文件定义了规则类:超类Rule以及与之对应的各个子类。子类定义了不同的划分块的条件,并包含action函数,调用handler进行处理

    handlers.py定义了处理类,超类定义了方法,子类通过名字调用

    markup.py定义了超类Parser和子类BasicTextParser,超类主要负责创建过滤器、添加规则,并对每个块执行处理。

    # handlers.py
    # -*- coding: cp936 -*-
    class Handler:
        """
        Object that receives the method calls issued by the parser.

        start() and end() are invoked with a block name and dispatch to the
        method called 'start_<name>' / 'end_<name>' when one exists. sub()
        is meant for regular-expression substitution: given a name such as
        'emphasis' it returns a replacement function that dispatches to
        'sub_<name>'.
        """

        def callback(self, prefix, name, *args):
            """
            Call the method named prefix + name with *args, if it exists.

            Returns the method's result, or None when no callable attribute
            of that name is found on the handler.
            """
            target = getattr(self, prefix + name, None)
            if not callable(target):
                return None
            return target(*args)

        def start(self, name):
            """Dispatch to 'start_<name>'; the result is discarded."""
            self.callback('start_', name)

        def end(self, name):
            """Dispatch to 'end_<name>'; the result is discarded."""
            self.callback('end_', name)

        def sub(self, name):
            """
            Return a function usable as the replacement argument of re.sub.

            The returned function dispatches the match object to
            'sub_<name>'. When that yields None (including when no such
            method exists) the text of group 1 is used unchanged.
            """
            def substitution(match):
                replacement = self.callback('sub_', name, match)
                return match.group(1) if replacement is None else replacement
            return substitution
    class HTMLRenderer(Handler):
        """
        Concrete handler that writes HTML markup to standard output.

        The methods here are not called directly by the parser; they are
        reached by name through the start(), end() and sub() methods
        inherited from Handler. Each start_*/end_* method prints one tag,
        each sub_* method returns the replacement markup for a regex match.

        print is written in its parenthesised single-argument form, which
        produces the same output as a Python 2 statement and as a Python 3
        function call.
        """

        def start_document(self):
            print('<html><head><title>...</title></head><body>')

        def end_document(self):
            print('</body></html>')

        def start_paragraph(self):
            print('<p>')

        def end_paragraph(self):
            print('</p>')

        def start_heading(self):
            print('<h2>')

        def end_heading(self):
            # Fixed: this used to print a second opening '<h2>' tag, leaving
            # every heading element unclosed.
            print('</h2>')

        def start_list(self):
            print('<ul>')

        def end_list(self):
            print('</ul>')

        def start_listitem(self):
            print('<li>')

        def end_listitem(self):
            print('</li>')

        def start_title(self):
            print('<h1>')

        def end_title(self):
            print('</h1>')

        def sub_emphasis(self, match):
            """Wrap group 1 of the match in an <em> element."""
            # Fixed: the closing tag was written as '<em>' instead of '</em>'.
            return '<em>%s</em>' % match.group(1)

        def sub_url(self, match):
            """Turn group 1 of the match into a link pointing at itself."""
            target = match.group(1)
            return '<a href="%s">%s</a>' % (target, target)

        def sub_mail(self, match):
            """Turn group 1 of the match into a mailto: link."""
            address = match.group(1)
            return '<a href="mailto:%s">%s</a>' % (address, address)

        def feed(self, data):
            """Print the block text as-is."""
            print(data)
    # -*- coding: cp936 -*-
    # rules.py
    class Rule:
        """Base class of all rules."""

        def action(self, block, handler):
            """
            Emit the block through the handler, wrapped in start/end calls.

            Calls handler.start, handler.feed and handler.end using this
            rule's `type` attribute (supplied by subclasses) as the name.
            Always returns True.
            """
            kind = self.type
            handler.start(kind)
            handler.feed(block)
            handler.end(kind)
            return True

    class HeadingRule(Rule):
        """
        A heading is a single line of at most 70 characters that does not
        end with a colon.
        """
        type = 'heading'

        def condition(self, block):
            """Return True when the block satisfies the heading criteria."""
            # An empty block is never a heading. The original indexed
            # block[-1], which raises IndexError on an empty string.
            if not block:
                return False
            return (
                '\n' not in block
                and len(block) <= 70
                and not block.endswith(':')
            )

    class TitleRule(HeadingRule):
        """
        The title is the first block of the document, provided that block
        also satisfies the heading criteria.
        """
        type = 'title'
        first = True

        def condition(self, block):
            """
            Apply the heading test to the first block seen; reject all others.

            The `first` flag is cleared on the first call regardless of the
            outcome, so only one block is ever considered.
            """
            if self.first:
                self.first = False
                return HeadingRule.condition(self, block)
            return False

    class ListItemRule(Rule):
        """
        A list item is a block that begins with a hyphen. The hyphen is
        removed before the text is passed to the handler.
        """
        type = 'listitem'

        def condition(self, block):
            """Return True when the block starts with '-'."""
            # startswith() returns False for an empty block, whereas the
            # original block[0] raised IndexError.
            return block.startswith('-')

        def action(self, block, handler):
            """
            Emit the block without its leading character and surrounding
            whitespace, wrapped in start/end calls. Always returns True.
            """
            handler.start(self.type)
            handler.feed(block[1:].strip())
            handler.end(self.type)
            return True
    class ListRule(ListItemRule):
        """
        Tracks the transitions into and out of a run of list items.

        The condition accepts every block; the action opens the list when a
        list item is seen while outside a list, and closes it when a
        non-item is seen while inside one.
        """
        type = 'list'
        inside = False

        def condition(self, block):
            """Accept every block so that action() always runs."""
            return True

        def action(self, block, handler):
            """
            Emit the list start or end when the list-item state changes.

            Always returns False, so this rule never consumes the block.
            """
            is_item = ListItemRule.condition(self, block)
            if is_item and not self.inside:
                handler.start(self.type)
                self.inside = True
            elif self.inside and not is_item:
                handler.end(self.type)
                self.inside = False
            return False

    class ParagraphRule(Rule):
        """Rule of type 'paragraph' whose condition accepts every block."""

        type = 'paragraph'

        def condition(self, block):
            """Return True for any block."""
            return True


    #util.py
    def lines(file):
        """
        Yield every item of `file`, followed by one extra '\\n'.

        The trailing newline acts as a final blank line for consumers that
        split the input on blank lines.
        """
        for text in file:
            yield text
        yield '\n'

    def blocks(file):
        """
        Yield the blocks of `file`: runs of consecutive non-blank lines.

        Each block is the concatenation of its lines with leading and
        trailing whitespace stripped. Blank lines separate blocks and are
        never yielded themselves.
        """
        pending = []
        for line in lines(file):
            if not line.strip():
                # A blank line closes the block collected so far, if any.
                if pending:
                    yield ''.join(pending).strip()
                    pending = []
                continue
            pending.append(line)

    #markup.py
    import sys , re
    from handlers import *
    from util import *
    from rules import *

    class Parser:
        """
        Reads the input, applies the filters and rules to each block, and
        drives the handler accordingly.
        """

        def __init__(self, handler):
            """Store the handler and start with no rules and no filters."""
            self.handler = handler
            self.rules = []
            self.filters = []

        def addRule(self, rule):
            """Append a rule; rules are tried in the order they were added."""
            self.rules.append(rule)

        def addFilters(self, patten, name):
            """
            Register a filter that rewrites every match of the regular
            expression `patten` using the handler's sub(name) function.
            """
            def apply_filter(block, handler):
                return re.sub(patten, handler.sub(name), block)
            self.filters.append(apply_filter)

        def parse(self, file):
            """
            Process `file` block by block between document start and end.

            Every filter is applied to the block first. The rules are then
            tried in order; a rule whose condition holds has its action run,
            and a true return value from the action stops further rules
            from being tried for that block.
            """
            self.handler.start('document')
            for block in blocks(file):
                for transform in self.filters:
                    block = transform(block, self.handler)
                for rule in self.rules:
                    if not rule.condition(block):
                        continue
                    if rule.action(block, self.handler):
                        break
            self.handler.end('document')

    class BasicTextParser(Parser):
        """
        Parser preconfigured with the list, list-item, title, heading and
        paragraph rules and the emphasis, url and mail filters.
        """

        def __init__(self, handler):
            """
            Set up the parser with the given handler.

            Fixed: the parameter was misspelled 'hanler' while the body
            passed 'handler' to Parser.__init__, so the argument was ignored
            and a module-level global named 'handler' was used instead (or
            NameError was raised when no such global existed).
            """
            Parser.__init__(self, handler)

            # Order matters: rules are tried in the order they are added.
            self.addRule(ListRule())
            self.addRule(ListItemRule())
            self.addRule(TitleRule())
            self.addRule(HeadingRule())
            self.addRule(ParagraphRule())

            self.addFilters(r'\*(.+?)\*', 'emphasis')
            # Fixed: the character range 'A-z' also matches the punctuation
            # characters [ \ ] ^ _ ` that lie between 'Z' and 'a'; the
            # intended range is 'A-Z'.
            self.addFilters(r'(http://[\.a-zA-Z/]+)', 'url')
            self.addFilters(r'([\.a-zA-Z/]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail')


    # Script driver: render the input file as HTML on standard output.
    handler = HTMLRenderer()
    parser = BasicTextParser(handler)
    # Open the input in a with-statement so the file is closed even when
    # parsing raises; the original never closed the handle.
    with open(r'D://python27/input.txt') as f:
        parser.parse(f)






  • 相关阅读:
    #研发中间件介绍#定时任务调度与管理JobCenter
    分享一个分布式定时任务系统 ( python)
    APScheduler + Gearman 构建分布式定时任务调度-std1984-ITPUB博客
    分布式缓存的一起问题 – 后端技术 by Tim Yang
    新兵训练营系列课程——Feed架构介绍
    Mysql分库分表方案
    可扩展性设计之数据切分
    你的数据库数据量上亿,为了提高效率,要分库还是分表?具体怎么做
    58同城mysql分库分表实践-沈剑
    可动态扩展的分库分表策略浅谈
  • 原文地址:https://www.cnblogs.com/lzhenf/p/2382056.html
Copyright © 2011-2022 走看看