zoukankan      html  css  js  c++  java
  • 表驱动词法分析 (Python)

    • 正则表达式预处理
    • 正则表达式转后缀 RE
    • 后缀 RE 转 NFA
    • NFA 合并
    • NFA 转 DFA
    • DFA 最小化
    • NFA/DFA 可视化
    • 单次匹配
    • 最长前缀连续匹配
    import json
    from os import stat
    import networkx as nx
    import matplotlib.pyplot as plt
    from networkx.algorithms.shortest_paths import weighted
    import copy
    
    
    def isOperator(c):
        """Tell whether ``c`` is one of the internal operator markers.

        The markers are the control characters 0x01, 0x02 and 0x03, which
        re_preprocess substitutes for '&', '|' and '*' respectively.

        Returns True for those three characters and False for anything else.
        """
        return c in ('\x01', '\x02', '\x03')
    
    
    def getPrio(c):
        """Return the precedence used by toSuffix for a marker character.

        0x04 (opening parenthesis) -> 1, 0x02 -> 2, 0x01 -> 3, 0x03 -> 4;
        every other character has precedence 0.
        """
        precedence = {'\x04': 1, '\x02': 2, '\x01': 3, '\x03': 4}
        return precedence.get(c, 0)
    
    
    def toSuffix(expr):
        """Convert a preprocessed infix expression to postfix (suffix) form.

        ``expr`` is a string in which operators and parentheses have already
        been replaced by the control markers 0x01-0x05. Operators are moved
        behind their operands using an operator stack and the precedences
        given by getPrio.

        Returns the postfix expression as a string. An unmatched closing
        parenthesis marker raises IndexError (the stack runs empty).
        """
        pending = []  # operator stack, top at the end
        output = []
        for ch in expr:
            if isOperator(ch):
                # Flush stacked operators that bind at least as tightly.
                while pending and getPrio(ch) <= getPrio(pending[-1]):
                    output.append(pending.pop())
                pending.append(ch)
            elif ch == '\x04':
                pending.append(ch)
            elif ch == '\x05':
                # Unwind to the matching opening marker, then discard it.
                while pending[-1] != '\x04':
                    output.append(pending.pop())
                pending.pop()
            else:
                output.append(ch)
        while pending:
            output.append(pending.pop())
        return "".join(output)
    
    
    def merge_nfa(nfas):
        """Combine several NFAs into one NFA with a shared start state.

        Each input NFA is a dict with "s" (start state), "t" (list of accepting
        states) and "e" (adjacency list of ``[label, target]`` edges). State 0
        of the result is a new start state with a "$" edge to the start of
        every input; the inputs' states follow, renumbered consecutively.

        Returns a dict with "s", "t", "e" and "tt", where ``tt[state]`` lists
        the indices of the input NFAs for which ``state`` is accepting.
        """
        sizes = [len(part["e"]) for part in nfas]
        total = 1 + sum(sizes)
        merged = {
            "s": 0,
            "t": [],
            "e": [[] for _ in range(total)],
            "tt": [[] for _ in range(total)],
        }

        offset = 1  # first state number assigned to the current input NFA
        for index, (part, size) in enumerate(zip(nfas, sizes)):
            merged["e"][0].append(["$", offset + part["s"]])
            for src in range(size):
                for label, dst in part["e"][src]:
                    merged["e"][offset + src].append([label, offset + dst])
            for accept in part["t"]:
                shifted = accept + offset
                merged["tt"][shifted].append(index)
                if shifted not in merged["t"]:
                    merged["t"].append(shifted)
            offset += size

        return merged
    
    
    def printf(dic):
        """Print an automaton dict as indented JSON, one line per edge list.

        Works on a deep copy so ``dic`` is left untouched; each entry of "e"
        is converted to its string form before dumping.
        """
        printable = copy.deepcopy(dic)
        printable["e"] = [str(edges) for edges in printable["e"]]
        print(json.dumps(printable, sort_keys=True,
                         indent=4, separators=(',', ':')))
    
    
    def nfa_atom(a):
        """Build the two-state NFA that accepts the single symbol ``a``.

        State 0 is the start, state 1 the accepting state, joined by one edge
        labelled ``a``.
        """
        return {"s": 0, "t": 1, "e": [[[a, 1]], []]}
    
    
    def nfa_concat(nfa_1, nfa_2):
        """Return the NFA for ``nfa_1`` followed by ``nfa_2``.

        Both arguments are single-accept NFAs ("t" is one state number).
        States of ``nfa_1`` keep their numbers, states of ``nfa_2`` are shifted
        by the size of ``nfa_1``, and a "$" edge links the accepting state of
        ``nfa_1`` to the start of ``nfa_2``.
        """
        shift = len(nfa_1["e"])
        edges = [[] for _ in range(shift + len(nfa_2["e"]))]

        # Linking edge goes in first so it precedes the copied edges.
        edges[nfa_1["t"]].append(["$", shift + nfa_2["s"]])

        for src, outgoing in enumerate(nfa_1["e"]):
            for label, dst in outgoing:
                edges[src].append([label, dst])
        for src, outgoing in enumerate(nfa_2["e"]):
            for label, dst in outgoing:
                edges[shift + src].append([label, shift + dst])

        return {"s": nfa_1["s"], "t": shift + nfa_2["t"], "e": edges}
    
    
    def nfa_union(nfa_1, nfa_2):
        """Return the NFA accepting what either ``nfa_1`` or ``nfa_2`` accepts.

        A new start state 0 and a new accepting state 1 are added. The states
        of ``nfa_1`` are renumbered from 2, those of ``nfa_2`` follow. "$" edges
        run from state 0 to both old starts and from both old accepting states
        to state 1.
        """
        base_1 = 2
        base_2 = 2 + len(nfa_1["e"])
        edges = [[] for _ in range(base_2 + len(nfa_2["e"]))]

        # Extra "$" edges, added before the copied edges.
        edges[0].append(["$", base_1 + nfa_1["s"]])
        edges[0].append(["$", base_2 + nfa_2["s"]])
        edges[base_1 + nfa_1["t"]].append(["$", 1])
        edges[base_2 + nfa_2["t"]].append(["$", 1])

        # Copy both operands with their state numbers shifted.
        for base, part in ((base_1, nfa_1), (base_2, nfa_2)):
            for src, outgoing in enumerate(part["e"]):
                for label, dst in outgoing:
                    edges[base + src].append([label, base + dst])

        return {"s": 0, "t": 1, "e": edges}
    
    
    def nfa_star(nfa_1):
        """Return the NFA for zero or more repetitions of ``nfa_1``.

        A new start state 0 and a new accepting state 1 are added and the
        states of ``nfa_1`` are renumbered from 2. "$" edges allow entering the
        inner automaton, skipping it entirely, leaving it, and looping back
        from its accepting state to its start.
        """
        base = 2
        inner_start = base + nfa_1["s"]
        inner_accept = base + nfa_1["t"]
        edges = [[] for _ in range(base + len(nfa_1["e"]))]

        # Extra "$" edges, added before the copied edges.
        edges[0].append(["$", inner_start])
        edges[0].append(["$", 1])
        edges[inner_accept].append(["$", 1])
        edges[inner_accept].append(["$", inner_start])

        for src, outgoing in enumerate(nfa_1["e"]):
            for label, dst in outgoing:
                edges[base + src].append([label, base + dst])

        return {"s": 0, "t": 1, "e": edges}
    
    
    def re2nfa(sre):
        """Build an NFA from a postfix regular expression.

        ``sre`` is a postfix string as produced by toSuffix: the markers 0x01,
        0x02 and 0x03 apply nfa_concat, nfa_union and nfa_star to the operands
        on the stack; every other character becomes an nfa_atom.

        Returns the NFA at the bottom of the stack with its "t" entry wrapped
        in a list. Raises IndexError when the stack has too few operands
        (including an empty ``sre``).
        """
        operands = []

        for symbol in sre:
            if symbol == "\x01":
                right = operands.pop()
                left = operands.pop()
                operands.append(nfa_concat(left, right))
            elif symbol == "\x02":
                right = operands.pop()
                left = operands.pop()
                operands.append(nfa_union(left, right))
            elif symbol == "\x03":
                operands.append(nfa_star(operands.pop()))
            else:
                operands.append(nfa_atom(symbol))

        result = operands[0]
        # Later stages expect a list of accepting states.
        result["t"] = [result["t"]]

        return result
    
    
    def printf0(dic):
        """Print ``dic`` as indented JSON with sorted keys."""
        text = json.dumps(dic, sort_keys=True, indent=4, separators=(',', ':'))
        print(text)
    
    
    # Memo for eps_closure. Keys are (id(nfa), frozenset(states)); each value is
    # a pair (nfa, closure) so a recycled id() can be told apart from the NFA the
    # closure was actually computed for.
    epsc_buffer = {}


    def eps_closure(nfa, states):
        """Return the epsilon-closure of ``states`` in ``nfa`` as a sorted list.

        The closure contains ``states`` plus every state reachable from them
        through edges labelled "$".

        Results are memoized in ``epsc_buffer`` per NFA object and state set,
        so closures computed for one NFA are never returned for another. The
        NFA is assumed not to change after its first closure was cached.

        Args:
            nfa: dict whose "e" entry is an adjacency list of
                ``[label, target]`` edges.
            states: iterable of state numbers.

        Returns:
            A new sorted list of state numbers; callers may modify it freely.
        """
        key = (id(nfa), frozenset(states))
        cached = epsc_buffer.get(key)
        if cached is not None and cached[0] is nfa:
            return list(cached[1])

        closure = set(states)
        worklist = list(closure)
        while worklist:
            state = worklist.pop()
            for label, target in nfa["e"][state]:
                if label == '$' and target not in closure:
                    closure.add(target)
                    worklist.append(target)

        result = sorted(closure)
        epsc_buffer[key] = (nfa, result)
        # Hand out a copy so the cached list cannot be altered by the caller.
        return list(result)
    
    
    def move(nfa, states, a):
        """Return the states reachable from ``states`` over one ``a`` edge.

        Only edges whose label equals ``a`` are followed; "$" edges are not
        expanded here. The result is a list without duplicates, in no
        particular order.
        """
        reached = {
            target
            for state in states
            for label, target in nfa["e"][state]
            if label == a
        }
        return list(reached)
    
    
    def nfa2dfa(nfa):
        """Convert a merged NFA to a DFA by subset construction.

        ``nfa`` must carry the "tt" table produced by merge_nfa (for every
        state, the list of pattern indices it accepts). Construction starts
        from the epsilon-closure of state 0.

        The input alphabet is processed in sorted order, so DFA state numbers
        and edge order are the same on every run regardless of string hash
        randomization. Discovered subsets are indexed in a dict instead of
        being searched linearly.

        Returns:
            dict with
              's'  - start state (always 0),
              't'  - list of accepting DFA states,
              'e'  - adjacency list of ``[symbol, target]`` edges,
              'tt' - per state, the smallest accepted pattern index, or -1
                     when the state accepts nothing.
        """
        # Every edge label except the epsilon marker "$".
        alphabet = sorted(
            {label for edges in nfa["e"] for label, _ in edges if label != "$"}
        )

        start = eps_closure(nfa, [0])
        subsets = [start]                 # DFA state number -> NFA state list
        index_of = {tuple(start): 0}      # NFA state tuple -> DFA state number
        e = []

        position = 0
        while position < len(subsets):
            current = subsets[position]
            row = []
            for a in alphabet:
                target = eps_closure(nfa, move(nfa, current, a))
                if not target:
                    continue
                key = tuple(target)  # eps_closure returns a sorted list
                if key not in index_of:
                    index_of[key] = len(subsets)
                    subsets.append(target)
                row.append([a, index_of[key]])
            e.append(row)
            position += 1

        ts = []
        tt = []
        for number, subset in enumerate(subsets):
            members = set(subset)
            hits = [j for j in nfa["t"] if j in members]
            if hits:
                ts.append(number)
            # Lowest pattern index wins when several patterns accept here.
            pats = [pat for j in hits for pat in nfa["tt"][j]]
            tt.append(min(pats) if pats else -1)
        return {'s': 0, 't': ts, 'e': e, 'tt': tt}
    
    
    def getdiv(dfa):
        """Partition the DFA states into groups of equivalent states.

        States are first grouped by their ``dfa["tt"]`` value. A group is then
        split whenever its members move, on some symbol, into different groups
        (a missing transition counts as target -1). Splitting repeats until no
        group can be split any further.

        Returns the list of groups (lists of state numbers); the group holding
        state 0 is swapped to index 0 so the start state keeps number 0.
        """
        state_count = len(dfa["e"])

        def target(state, symbol):
            # First transition of ``state`` on ``symbol``, or -1 if there is none.
            for label, dest in dfa["e"][state]:
                if label == symbol:
                    return dest
            return -1

        # Initial partition: one group per distinct "tt" value.
        by_tag = {tag: [] for tag in set(dfa["tt"])}
        for state, tag in enumerate(dfa["tt"]):
            by_tag[tag].append(state)
        P = list(by_tag.values())

        alphabet = list({label for edges in dfa["e"] for label, _ in edges})

        def split_once():
            # Split the first splittable group; report whether one was found.
            owner = [0] * state_count
            for group_id, group in enumerate(P):
                for state in group:
                    owner[state] = group_id
            for group in P:
                for symbol in alphabet:
                    signature = []
                    for state in group:
                        dest = target(state, symbol)
                        signature.append(owner[dest] if dest >= 0 else -1)
                    distinct = list(set(signature))
                    if len(distinct) <= 1:
                        continue
                    pieces = [
                        [s for s, sig in zip(group, signature) if sig == wanted]
                        for wanted in distinct
                    ]
                    P.remove(group)
                    P.extend(pieces)
                    return True
            return False

        while split_once():
            pass

        # By convention, swap so that group 0 contains the start state 0.
        id0 = next((k for k, group in enumerate(P) if 0 in group), -1)
        P[0], P[id0] = P[id0], P[0]
        return P
    
    
    def dfamin(dfa):
        """Minimize a DFA by merging the state groups computed by getdiv.

        Every group becomes one state of the result. A group is accepting when
        all of its members are in ``dfa["t"]``; its "tt" value is then the
        smallest "tt" of its members, otherwise -1. Edges are translated to
        group numbers and duplicates are dropped.

        Returns a dict with 's' (0), 't', 'e' and 'tt' like the input DFA.
        """
        n = len(dfa["e"])
        groups = [list(set(group)) for group in getdiv(dfa)]

        # group_of[state] = index of the group the state belongs to
        group_of = [0] * n
        for index, group in enumerate(groups):
            for state in group:
                group_of[state] = index

        ans = {'s': 0, 't': [], 'e': [[] for _ in groups], 'tt': []}

        # Decide which merged states are accepting.
        for index, group in enumerate(groups):
            if all(state in dfa["t"] for state in group):
                ans["t"].append(index)
                ans["tt"].append(
                    min([99999] + [dfa["tt"][state] for state in group]))
            else:
                ans["tt"].append(-1)

        # Translate all edges; the set removes duplicates.
        merged_edges = set()
        for src in range(n):
            for label, dst in dfa["e"][src]:
                merged_edges.add((group_of[src], label, group_of[dst]))
        for src, label, dst in merged_edges:
            ans['e'][src].append([label, dst])
        return ans
    
    
    def draw_fa(nfa):
        """Draw an automaton with networkx/matplotlib and show the figure.

        Edges are coloured by label: "$" dotted gray, 'a' red, 'b' green, any
        other label blue. The start state is pink, accepting states brown and
        all remaining states gray. Self-loops are also reported on stdout.
        """
        G = nx.DiGraph()
        G.clear()
        state_count = len(nfa["e"])
        G.add_nodes_from(list(range(state_count)))  # one node per state

        def weight_of(label):
            # Edge weight doubles as a colour class for the drawing below.
            if label == '$':
                return 1
            if label == 'a':
                return 2
            if label == 'b':
                return 3
            return 4

        for src in range(state_count):
            for label, dst in nfa["e"][src]:
                G.add_edge(src, dst, weight=weight_of(label))
                if src == dst:
                    print("自环", src, label)

        def edges_with(weight):
            return [(u, v) for (u, v, d) in G.edges(data=True)
                    if d['weight'] == weight]

        elarge1 = edges_with(2)
        elarge2 = edges_with(3)
        elarge3 = edges_with(4)
        esmall = edges_with(1)

        plain_nodes = [x for x, y in G.nodes(data=True)
                       if x != nfa["s"] and x not in nfa["t"]]
        start_nodes = [x for x, y in G.nodes(data=True) if x == nfa["s"]]
        accept_nodes = [x for x, y in G.nodes(data=True) if x in nfa["t"]]

        pos = nx.spring_layout(G)  # positions for all nodes

        for nodes, colour in ((plain_nodes, 'gray'),
                              (start_nodes, 'pink'),
                              (accept_nodes, 'brown')):
            nx.draw_networkx_nodes(G, pos, nodelist=nodes,
                                   node_size=500, node_color=colour)

        for edges, colour in ((elarge1, 'red'),
                              (elarge2, 'green'),
                              (elarge3, 'blue')):
            nx.draw_networkx_edges(G, pos, edgelist=edges,
                                   width=3, edge_color=colour)
        nx.draw_networkx_edges(G, pos, edgelist=esmall, width=2,
                               alpha=1, edge_color='gray', style='dotted')
        nx.draw_networkx_labels(G, pos, font_size=14,
                                font_family='sans-serif', font_color='w')
        plt.axis('off')
        plt.show()
    
    
    def run(dfa, str):
        """Run the DFA over the whole string and report the matched pattern.

        Starting at ``dfa["s"]``, one transition is taken per character. If a
        character has no transition, -1 is returned; otherwise the "tt" value
        of the final state is returned (which is -1 for a non-accepting state).
        """
        state = dfa["s"]
        for ch in str:
            hits = [dest for label, dest in dfa["e"][state] if label == ch]
            # The last matching edge decides, as in a plain scan of the list.
            if not hits or hits[-1] == -1:
                return -1
            state = hits[-1]
        return dfa["tt"][state]
    
    
    def runx(dfa, src):
        """Tokenize ``src`` by repeatedly taking the longest matching prefix.

        At each position the DFA is run over at most 256 characters and the
        longest prefix ending in an accepting state is taken as a token. When
        nothing matches, an error is printed and the rest of the current line
        is skipped. Tokens whose text is a substring of " \\n\\r\\t"
        (whitespace) are dropped.

        Returns a list of ``(pattern_id, text)`` tuples in source order.
        """
        MAXPREREAD = 256  # read ahead at most MAXPREREAD characters
        tokens = []
        total = len(src)
        pos = 0
        while pos < total:
            window = src[pos:pos + MAXPREREAD]
            state = dfa["s"]
            best_len = -1
            best_id = -1
            for consumed, ch in enumerate(window, start=1):
                hits = [dest for label, dest in dfa["e"][state] if label == ch]
                if not hits or hits[-1] == -1:
                    break
                state = hits[-1]
                if dfa["tt"][state] != -1:
                    best_len = consumed
                    best_id = dfa["tt"][state]

            if best_len == -1:
                # Error: skip the rest of this line.
                print("\033[31merror near:", src[pos:pos+32], "\033[0m")
                newline_at = src.find('\n', pos)
                pos = total + 1 if newline_at == -1 else newline_at + 1
                continue

            lexeme = window[:best_len]
            if lexeme not in " \n\r\t":
                tokens.append((best_id, lexeme))
            pos += best_len
        return tokens
    
    
    def re_preprocess(re):
        """Rewrite a user-level regular expression into the internal form.

        Steps, in order:
          1. expand the supported character classes such as "[0~9]" into
             parenthesized alternations;
          2. turn the two-character sequences backslash-n, backslash-r and
             backslash-t into the real control characters;
          3. replace the operators & | * ( ) by the markers 0x01..0x05, while
             a backslash-escaped operator stays a literal character;
          4. insert the concatenation marker 0x01 wherever two adjacent
             items are implicitly concatenated.

        Returns the rewritten string.
        """
        # 1. Character classes; combined classes go first because they expand
        #    into the simple ones.
        char_classes = (
            ("[A~Za~z]", "([A~Z]|[a~z])"),
            ("[A~Za~z0~9]", "([A~Z]|[a~z]|[0~9])"),
            ("[0~9]", "(" + "|".join("0123456789") + ")"),
            ("[1~9]", "(" + "|".join("123456789") + ")"),
            ("[A~Z]", "(" + "|".join("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + ")"),
            ("[a~z]", "(" + "|".join("abcdefghijklmnopqrstuvwxyz") + ")"),
        )
        for pattern, expansion in char_classes:
            re = re.replace(pattern, expansion)

        # 2. Whitespace escapes.
        for escaped, actual in (("\\n", "\n"), ("\\r", "\r"), ("\\t", "\t")):
            re = re.replace(escaped, actual)

        # 3. Operators. 0x06 temporarily protects an escaped operator while
        #    the unescaped ones are turned into markers.
        operators = (("&", "\x01"), ("|", "\x02"), ("*", "\x03"),
                     ("(", "\x04"), (")", "\x05"))
        for symbol, marker in operators:
            re = re.replace("\\" + symbol, "\x06")
            re = re.replace(symbol, marker)
            re = re.replace("\x06", symbol)

        # 4. Implicit concatenation. Every ordinary character is treated as
        #    'a' when looking up the pair of neighbours.
        def kind(ch):
            return ch if ch in "\x02\x01\x03\x04\x05" else 'a'

        needs_concat = {
            'a\x04', 'aa', '\x05a', '\x03a', '\x03\x04', '\x05\x04',
        }
        pieces = []
        for left, right in zip(re, re[1:]):
            pieces.append(left)
            if kind(left) + kind(right) in needs_concat:
                pieces.append('\x01')
        pieces.append(re[-1:])
        return "".join(pieces)
    
    
    # Driver: build the lexer tables from re.txt, tokenize src.txt and write the
    # tokens to tokens.txt. Every file is opened in a ``with`` block so it is
    # closed (and, for the output, flushed) as soon as it has been processed.

    # re.txt holds one regular expression per line; the line index is the
    # pattern id. Only the line terminator is stripped, so a final line without
    # a trailing newline keeps its last character.
    with open("re.txt", "r") as re_file:
        re_list = [line.rstrip("\n") for line in re_file]

    sres = copy.deepcopy(re_list)  # the patterns as written, before preprocessing
    re_list = [re_preprocess(i) for i in re_list]
    nfas = [re2nfa(toSuffix(i)) for i in re_list]
    nfa = merge_nfa(nfas)
    dfa = nfa2dfa(nfa)
    dfa = dfamin(dfa)

    # printf(dfa)  # uncomment to dump the minimized DFA

    with open("src.txt", "r") as src_file:
        src = src_file.read()

    results = runx(dfa, src)

    # map.txt gives every regular expression a name, used later by the parser.
    with open("map.txt", "r") as map_file:
        token_map = [line.rstrip("\n") for line in map_file]

    with open("tokens.txt", "w") as tokens_file:
        for x, y in results:
            print(x,  token_map[x], y, file=tokens_file)
    
    # s = ["a", "b", "ab", "aa", "abc", "1", "0", "123", "1.23", "a11"]
    # ans = [run(dfa, i) for i in s]
    # for i, a in enumerate(ans):
    #     print(i, s[i], a, -1 if a == -1 else sres[a])
    
    # draw_fa(dfa)
    

    RE 输入示例

    #include
    define
    auto
    bool
    break
    case
    catch
    char
    class
    const
    const_cast
    continue
    default
    delete
    do
    double
    dynamic_cast
    else
    enum
    explicit
    extern
    false
    float
    for
    friend
    goto
    if
    inline
    int
    long
    mutable
    namespace
    new
    operator
    private
    protected
    public
    register
    reinterpret_cast
    return
    short
    signed
    sizeof
    static
    static_cast
    struct
    switch
    template
    this
    throw
    true
    try
    typedef
    typeid
    typename
    union
    unsigned
    using
    virtual
    void
    volatile
    while
     
    \n
    \r
    \t
    ,
    ;
    .
    \(
    \)
    [
    ]
    {
    }
    #
    ++
    --
    +
    -
    \*
    /
    %
    <<
    >>
    <=
    >=
    <
    >
    ==
    =
    !=
    !
    \&\&
    \|\|
    (+|-|$)[0~9][0~9]*
    (+|-|$)[0~9][0~9]*((.[0~9]*)|$)(((E|e)(+|-)[0~9][0~9]*)|$)(f|$)
    ([A~Za~z]|_)([A~Za~z0~9]|_)*
    '([A~Za~z0~9]|_|-| |+|\*|/|?|!|@|#|%)'
    "([A~Za~z0~9]|_|-| |+|\*|/|?|!|@|#|%)*"
    //([A~Za~z0~9]|_|-| |+|\*|/|?|!|@|#|%)*\n
    

    Tokens 输出示例

    26 if if
    69 ( (
    95 Const 1
    89 == ==
    95 Const 2
    70 ) )
    26 if if
    69 ( (
    95 Const 3
    89 == ==
    95 Const 4
    70 ) )
    97 Id a
    90 = =
    95 Const 1
    67 ; ;
    17 else else
    97 Id a
    90 = =
    95 Const 2
    67 ; ;
    17 else else
    97 Id a
    90 = =
    95 Const 3
    67 ; ;
    
    
  • 相关阅读:
    c++ Knight Moves 超级升级版
    百度招聘彩蛋
    C++ 八数码问题宽搜
    c++ 动态规划(数塔)
    c++ 深度优先搜索(迷宫)
    c++ 宽搜(倒水)
    c++ matrix逆时针螺旋
    c++ 基数排序
    c++ 二分答案(解方程)
    c++ 二分答案(数组查找)
  • 原文地址:https://www.cnblogs.com/mollnn/p/15785363.html
Copyright © 2011-2022 走看看