正则式到dfa最小化实现

2019-10-30

python实现：

re到NFA
NFA到DFA
DFA最小化

注：1.非完全实现，不支持[a-zA-Z] （当然如果需要的话也可以很轻松地改为完全实现）2.存储变量的数据结构设计很糟糕..

1. re到DFA最小化

_class.py

Regex类：

class Regex():
    def __init__(self, rstr):
        self.rstr = rstr
        self.out = []
        self.ops = []
        self.dic = {'*': 3,  '.': 2, '|': 1}

    def getWeight(self, ch):
        return self.dic[ch]

    def reshape(self):
        i = 0
        while(i < len(self.rstr[:])-1):
            if(
                (re.match("[0-9]", self.rstr[i+1]) and
                 re.match("[0-9\*\)]", self.rstr[i])) or
                re.match("[\(]", self.rstr[i+1]) and
                re.match("[0-9]", self.rstr[i])
            ):
                _ = list(self.rstr)  # string to list
                _.insert(i+1, '.')
                self.rstr = "".join(_)
            i += 1

    def postfix(self):
        for ch in self.rstr:
            if(ch == '('):
                self.ops.append(ch)
            elif(ch == ')'):
                while(self.ops[-1] != '('):
                    self.out.append(self.ops.pop())
                self.ops.pop()
            elif(re.match('[\*\.\|]', ch)):
                while(len(self.ops) != 0 and self.ops[-1] != '(' and self.getWeight(self.ops[-1]) >= self.getWeight(ch)):
                    self.out.append(self.ops.pop())
                self.ops.append(ch)
            else:
                self.out.append(ch)
        while(len(self.ops) != 0):
            self.out.append(self.ops.pop())

reshape：将省略的连接符号*用.代替，注意到对括号的额外处理
postfix：转换为逆波兰式，注意对单目运算符和括号的处理^1
- ops：符号栈
- out：记录最终结果
添加.：先把string转为list，然后调用insert()，再用join()转回来
- string是immutable，所以一般要转为mutable的list

Base基类：

class Base():
    def __init__(self):
        self.set1 = set()
        self.func = defaultdict(set)
        self.cnt = 0
        self.nfq = ()
        self.setS = {'0', '1'}

set1：状态的有限集合
nfq：初态与终态
setS：输入符号有限集合
func：映射关系

NFA类：

class NFA(Regex, Base):
    def __init__(self, rstr):
        Regex.__init__(self, rstr)
        Base.__init__(self)
        self.stack = []

    def nfa(self):
        for o in self.out[:]:
            if(re.match("[0-9]", o)):
                self.calc(self.out.pop(0))
            else:
                if(re.match("\*", o)):
                    op = self.out.pop(0)
                    a = self.stack.pop()
                    self.calc(op, a, a)
                else:
                    op = self.out.pop(0)
                    b = self.stack.pop()
                    a = self.stack.pop()
                    self.calc(op, a, b)
        self.nfq = self.stack.pop()

    def calc(self, op, a=-1, b=-1):
        if(op == '.'):
            self.func[a[1]].add(('e', b[0]))
            self.stack.append((a[0], b[1]))
        else:
            temp = list()
            for i in range(2):
                self.set1.add(str(self.cnt))  # auto remove same element
                temp.append(str(self.cnt))
                self.cnt += 1
            if(op == '*'):
                self.func[temp[0]].add(('e', a[0]))
                self.func[temp[0]].add(('e', temp[1]))
                self.func[a[1]].add(('e', a[0]))
                self.func[a[1]].add(('e', temp[1]))
            elif(op == '|'):
                self.func[temp[0]].add(('e', a[0]))
                self.func[temp[0]].add(('e', b[0]))
                self.func[a[1]].add(('e', temp[1]))
                self.func[b[1]].add(('e', temp[1]))
            else:
                self.func[temp[0]].add((op, temp[1]))
            self.stack.append((temp[0], temp[1]))

nfa.stack：根据逆波兰式计算的临时结果存储^1
Thompson构造法^2

DFA类：

class DFA(NFA):
    def __init__(self, rstr):
        super().__init__(rstr)
        self.cdic = dict()
        self.ddic = defaultdict(set)
        self.edic = defaultdict(set)
        self.set2 = set()
        self.cmax = 0
		self.final=None
        
    def dfa(self):
        tps = set()
        for i in self.set1:
            for j in self.func[i]:
                if (j[0] != 'e'):  
                    self.edic[(j[0], i)].add(j[1])
        self.set2.add(str(self.cmax))
        self.cdic[str(self.cmax)] = self.clousre(self.nfq[0], set(self.nfq[0]))
        dot = 0
        while(dot <= self.cmax):
            for i in self.setS:  
                self.move(dot, i)
            dot += 1

    def clousre(self, i, tset):
        for j in self.func[i]:
            if(j[0] == 'e'):
                tset |= self.clousre(j[1], tset)
                tset.add(j[1])
        return tset

    def move(self, dot, i):
        tp = set()
        tp2 = set()
        for j in self.cdic[str(dot)]:  # move('A',j)
            tp |= self.edic[(i, j)]
        for n in tp:
            tp2 |= self.clousre(n, set())
            tp2 |= tp
        if(tp2):
            if(tp2 not in self.cdic.values()):  # new value
                self.cmax += 1
                self.cdic[str(self.cmax)] = tp2
                if(self.nfq[1] in tp):
                    self.final = str(self.cmax)
                self.set2.add(str(self.cmax))
                self.ddic[str(dot)].add((i, str(self.cmax)))
            else:  # new value
                for k, v in self.cdic.items():  
                    if(v == tp2):
                        self.ddic[str(dot)].add((i, str(k)))

dfa.edic：由于之前没按NFA和DFA的定义设计合适的数据结构，因此要多转换一下
dfa.cdic：记录重命名前后的映射关系
dfa.ddic：映射关系
set2：状态的有限集合

sDFA：

class sDFA(DFA):
    def __init__(self, rstr):
        super().__init__(rstr)
        self.fdic = dict()

    def hopcroft(self):
        self.dfq = ('0', self.final)
        self.set3 = self.set2.copy()
        self.set3.remove(self.dfq[1])


        self.li = list()
        self.li.append(list(self.set3.copy()))
        self.li.append([self.dfq[1]])
        # li = [ [1,2,3], [4] ]
        gdic = defaultdict(int) # set default 0

        for temp, i in enumerate(self.li):
            for j in i:
                gdic[j] = temp 
        self.c = temp

        for i in self.set2:
            for j in self.ddic[i]:
                self.fdic[(j[0], i)] = j[1]

        while True:
            tp = len(self.li)
            flg = -1 # 第一次并记录下标
            flg2 = False # 产生新分组
            for v, i in enumerate(self.li[:]): # v-> 0,1 i-> [0,1,2] [3]
                for n in self.setS:  # n in {'0','1'}
                    for j in i[:]:  # j in [0,1,2]
                        if((n, j) in self.fdic.keys()):
                            if(flg == -1 or flg == gdic[self.fdic[(n, j)]]): 
                                flg = gdic[self.fdic[(n, j)]] 
                            else:
                                self.li[v].remove(j)
                                self.li.append([j])
                                self.c += 1
                                gdic[j] = self.c
                                flg2 = True  
                    flg = -1
                    if(flg2):  
                        flg2 = False
                        break
            if(tp == len(self.li)):
                break
        
        # li分好组，合并同类组
        t = None
        flg = False
        for v,i in enumerate(self.li[:]):
            for j in i[:]:
                if flg == False:  # 有可合并项
                    flg = True
                    t = j
                else:
                    self.conversion(j, t,v)
            flg = False
            t = None
        self.set3 = set(x for y in self.li for x in y)

    def conversion(self, t1, t2,v):
        for i in self.set2:
            tp = self.ddic[i].copy()
            for j in tp:
                if(j[1] == t1):
                    self.ddic[i].add((j[0], t2))
                    self.ddic[i].remove(j)
        self.ddic[t2] |= self.ddic[t1]
        self.li[v].remove(t1)

self.final
- dfa终态的确定：包含nfa终态的cdic代表的结点
- 本算法中，dfa最大序号结点不能代表终态结点，如：
hopcroft
- 子集划分法^3
set3：状态的有限集合
dfq：初态和终态
fdic：同edic
conversion：合并等价状态

一文通：参考链接

2. tkinter包装

_main.py

import tkinter as tk
import graphviz as gv
from PIL import Image, ImageTk
from _class import *


class automata():
    def __init__(self):
        self.rstr = ""
        self.window = None
        self.addr = list()

    def resize(self, w_box, h_box, pil_image):  # 参数是：要适应的窗口宽、高、Image.open后的图片
        w, h = pil_image.size  # 获取图像的原始大小
        f1 = 1.0*w_box/w
        f2 = 1.0*h_box/h
        factor = min([f1, f2])
        width = int(w*factor)
        height = int(h*factor)
        return pil_image.resize((width, height), Image.ANTIALIAS)

    def drawDot(self, g, ch1, ch2, w):
        g.node(name=str(ch1))
        g.node(name=str(ch2))
        g.edge(str(ch1), str(ch2), label=w)
        #g.edge_attr['label']=w

    def drawGraph(self, g, sett, func, filename):
        for i in sett:
            for j in func[i]:
                self.drawDot(g, i, j[1], j[0])
        g.render(filename, format='png')
        self.addr.append(filename+".png")

    def drawWindow(self):
        self.window = tk.Tk()
        self.window.title('DFA CONVERT')
        self.window.geometry('800x600+100+100')
        lb1 = tk.Label(self.window,
                       text='input ur fucking re asshole',    # 标签的文字
                       bg='pink',     # 背景颜色
                       font=('Arial', 12),     # 字体和字体大小
                       height=2)  # 标签长宽
        lb1.pack()
        txt = tk.Entry(self.window)
        txt.insert(40, 'type re expression in 40')
        txt.pack()

        def imgpack(i, sett, dicc, filename):
            g = gv.Digraph()
            self.drawGraph(g, sett, dicc, filename)
            img = Image.open(self.addr[i])
            img = self.resize(600, 400, img)
            img = ImageTk.PhotoImage(img)
            lb2 = tk.Label(self.window, image=img)
            lb2.image = img
            lb2.pack(side=tk.LEFT, padx=25)

        def sGet():
            self.rstr = txt.get()
            N = sDFA(self.rstr)
            N.reshape()
            N.postfix()

            N.nfa()
            imgpack(0, N.set1, N.func, "NFA")

            N.dfa()
            imgpack(1, N.set2, N.ddic, "DFA")

            N.hopcroft()
            imgpack(2, N.set3, N.ddic, "sDFA")
        btn = tk.Button(self.window, text='键入', width=15,
                        height=2, command=sGet)
        btn.pack()
        self.window.mainloop()

    def run(self):
        self.drawWindow()


if __name__ == "__main__":
    t = automata()
    t.run()

wsl-ubuntu不支持绘图真是…

resize：轮子。^4显示本地上的图片并且随着窗口而改变大小的代码（不然容易超出窗口而显示不全）。
不能动态跟新图片，偷懒（不过目前我也不知道怎么实现）

1. re到DFA最小化

2. tkinter包装

3. 结果展示