03_subset_construction.py

  1"""
  203_subset_construction.py - NFA to DFA via Subset Construction
  3
  4Demonstrates the subset construction (powerset construction) algorithm,
  5which converts a Non-deterministic Finite Automaton (NFA) into an
  6equivalent Deterministic Finite Automaton (DFA).
  7
  8Key idea:
  9  Each DFA state corresponds to a *set* of NFA states.
 10  The DFA's transition on symbol 'a' from state {s1,s2,...} is:
 11    epsilon_closure( move({s1,s2,...}, 'a') )
 12
 13Topics covered:
 14  - Subset construction algorithm
 15  - DFA minimization concept (identifying accept states)
 16  - Printing transition tables
 17  - Comparing NFA vs DFA simulation results
 18"""
 19
 20from __future__ import annotations
 21from dataclasses import dataclass, field
 22from typing import Optional
 23
 24# Reuse NFA machinery from 02_thompson_nfa
 25# (copied here to keep this file standalone)
 26
 27EPSILON = ''
 28
 29
 30class State:
 31    _counter = 0
 32
 33    def __init__(self):
 34        State._counter += 1
 35        self.id = State._counter
 36        self.transitions: dict[str, list[State]] = {}
 37        self.is_accept: bool = False
 38
 39    def add_transition(self, symbol: str, target: State) -> None:
 40        self.transitions.setdefault(symbol, []).append(target)
 41
 42    def __repr__(self):
 43        return f"S{self.id}"
 44
 45
 46@dataclass
 47class NFA:
 48    start: State
 49    accept: State
 50
 51
 52# --- Thompson's construction (abbreviated) -----------------------------------
 53
 54def _lit(ch):
 55    s, a = State(), State()
 56    s.add_transition(ch, a)
 57    return NFA(s, a)
 58
 59def _concat(n1, n2):
 60    n1.accept.add_transition(EPSILON, n2.start)
 61    return NFA(n1.start, n2.accept)
 62
 63def _union(n1, n2):
 64    s, a = State(), State()
 65    s.add_transition(EPSILON, n1.start)
 66    s.add_transition(EPSILON, n2.start)
 67    n1.accept.add_transition(EPSILON, a)
 68    n2.accept.add_transition(EPSILON, a)
 69    return NFA(s, a)
 70
 71def _star(n):
 72    s, a = State(), State()
 73    s.add_transition(EPSILON, n.start)
 74    s.add_transition(EPSILON, a)
 75    n.accept.add_transition(EPSILON, n.start)
 76    n.accept.add_transition(EPSILON, a)
 77    return NFA(s, a)
 78
 79PREC = {'|': 1, '.': 2, '*': 3, '+': 3, '?': 3}
 80BINOPS = {'|', '.'}
 81UNOPS  = {'*', '+', '?'}
 82
 83def _add_concat(r):
 84    res = []
 85    for i, c in enumerate(r):
 86        res.append(c)
 87        if i + 1 < len(r):
 88            l, ri = c, r[i+1]
 89            if l not in ('(', '|', '.') and ri not in (')', '|', '.', '*', '+', '?'):
 90                res.append('.')
 91    return ''.join(res)
 92
 93def _postfix(r):
 94    out, stk = [], []
 95    for c in r:
 96        if c == '(':
 97            stk.append(c)
 98        elif c == ')':
 99            while stk and stk[-1] != '(':
100                out.append(stk.pop())
101            if stk: stk.pop()
102        elif c in PREC:
103            while (stk and stk[-1] != '(' and stk[-1] in PREC and
104                   PREC[stk[-1]] >= PREC[c] and c not in UNOPS):
105                out.append(stk.pop())
106            stk.append(c)
107        else:
108            out.append(c)
109    while stk: out.append(stk.pop())
110    return ''.join(out)
111
112def regex_to_nfa(regex: str) -> NFA:
113    State._counter = 0
114    postfix = _postfix(_add_concat(regex))
115    stk: list[NFA] = []
116    for c in postfix:
117        if c == '.':
118            b, a = stk.pop(), stk.pop(); stk.append(_concat(a, b))
119        elif c == '|':
120            b, a = stk.pop(), stk.pop(); stk.append(_union(a, b))
121        elif c == '*':
122            stk.append(_star(stk.pop()))
123        elif c == '+':
124            n = stk.pop()
125            # plus: concat(n, star(n)) -- reuse same states
126            s2, a2 = State(), State()
127            s2.add_transition(EPSILON, n.start)
128            n.accept.add_transition(EPSILON, n.start)
129            n.accept.add_transition(EPSILON, a2)
130            stk.append(NFA(s2, a2))
131        elif c == '?':
132            n = stk.pop()
133            s2, a2 = State(), State()
134            s2.add_transition(EPSILON, n.start)
135            s2.add_transition(EPSILON, a2)
136            n.accept.add_transition(EPSILON, a2)
137            stk.append(NFA(s2, a2))
138        else:
139            stk.append(_lit(c))
140    nfa = stk[0]
141    nfa.accept.is_accept = True
142    return nfa
143
144
145# ---------------------------------------------------------------------------
146# NFA simulation helpers
147# ---------------------------------------------------------------------------
148
149def epsilon_closure(states: frozenset[State]) -> frozenset[State]:
150    closure = set(states)
151    worklist = list(states)
152    while worklist:
153        s = worklist.pop()
154        for t in s.transitions.get(EPSILON, []):
155            if t not in closure:
156                closure.add(t)
157                worklist.append(t)
158    return frozenset(closure)
159
160
161def nfa_move(states: frozenset[State], sym: str) -> frozenset[State]:
162    result: set[State] = set()
163    for s in states:
164        result.update(s.transitions.get(sym, []))
165    return frozenset(result)
166
167
168def alphabet(nfa: NFA) -> set[str]:
169    """Collect all non-epsilon symbols used in the NFA."""
170    syms: set[str] = set()
171    visited: set[int] = set()
172    queue = [nfa.start]
173    while queue:
174        s = queue.pop()
175        if s.id in visited:
176            continue
177        visited.add(s.id)
178        for sym, targets in s.transitions.items():
179            if sym != EPSILON:
180                syms.add(sym)
181            queue.extend(targets)
182    return syms
183
184
185# ---------------------------------------------------------------------------
186# DFA representation
187# ---------------------------------------------------------------------------
188
189@dataclass
190class DFAState:
191    """A DFA state representing a frozenset of NFA states."""
192    id: int
193    nfa_states: frozenset[State]
194    is_accept: bool
195    transitions: dict[str, int] = field(default_factory=dict)  # symbol -> DFA state id
196
197    def name(self) -> str:
198        ids = sorted(s.id for s in self.nfa_states)
199        return '{' + ','.join(f'S{i}' for i in ids) + '}'
200
201
202@dataclass
203class DFA:
204    states: list[DFAState]
205    start_id: int
206    alphabet: set[str]
207
208    def start(self) -> DFAState:
209        return self.states[self.start_id]
210
211    def accepts(self, text: str) -> bool:
212        current = self.start_id
213        for ch in text:
214            state = self.states[current]
215            if ch not in state.transitions:
216                return False
217            current = state.transitions[ch]
218        return self.states[current].is_accept
219
220
221# ---------------------------------------------------------------------------
222# Subset Construction Algorithm
223# ---------------------------------------------------------------------------
224
225def subset_construction(nfa: NFA) -> DFA:
226    """
227    Convert an NFA to a DFA using the subset construction algorithm.
228
229    Steps:
230    1. Start with epsilon_closure({nfa.start}) as the initial DFA state.
231    2. For each unmarked DFA state D and each symbol a in the alphabet:
232       a. Compute T = epsilon_closure(move(D, a))
233       b. If T is not already a DFA state, add it (unmarked)
234       c. Record the transition D --a--> T
235    3. A DFA state is accepting if any of its NFA states is an accept state.
236    """
237    syms = sorted(alphabet(nfa))
238    start_set = epsilon_closure(frozenset({nfa.start}))
239
240    # Map from frozenset[State] -> DFA state id
241    state_map: dict[frozenset[State], int] = {}
242    dfa_states: list[DFAState] = []
243
244    def get_or_create(nfa_set: frozenset[State]) -> int:
245        if nfa_set in state_map:
246            return state_map[nfa_set]
247        sid = len(dfa_states)
248        is_acc = any(s.is_accept for s in nfa_set)
249        ds = DFAState(id=sid, nfa_states=nfa_set, is_accept=is_acc)
250        dfa_states.append(ds)
251        state_map[nfa_set] = sid
252        return sid
253
254    start_id = get_or_create(start_set)
255    worklist = [start_id]
256
257    while worklist:
258        did = worklist.pop()
259        ds = dfa_states[did]
260        for sym in syms:
261            moved = nfa_move(ds.nfa_states, sym)
262            if not moved:
263                continue   # dead state (no transition)
264            closed = epsilon_closure(moved)
265            target_id = get_or_create(closed)
266            ds.transitions[sym] = target_id
267            if target_id == len(dfa_states) - 1:   # newly created
268                worklist.append(target_id)
269
270    return DFA(states=dfa_states, start_id=start_id, alphabet=set(syms))
271
272
273# ---------------------------------------------------------------------------
274# Pretty printing
275# ---------------------------------------------------------------------------
276
277def print_dfa_table(dfa: DFA, regex: str) -> None:
278    """Print the DFA transition table."""
279    syms = sorted(dfa.alphabet)
280    print(f"\nDFA for regex: {regex!r}")
281    print(f"  States: {len(dfa.states)}")
282    print(f"  Alphabet: {{{', '.join(repr(s) for s in syms)}}}")
283    print(f"  Start state: {dfa.states[dfa.start_id].name()}")
284    accept_names = [dfa.states[i].name() for i, s in enumerate(dfa.states) if s.is_accept]
285    print(f"  Accept states: {accept_names}")
286
287    # Header
288    col_w = max(20, max(len(dfa.states[i].name()) for i in range(len(dfa.states))) + 2)
289    sym_w = 12
290    header = f"  {'STATE':<{col_w}}"
291    for sym in syms:
292        header += f"  {repr(sym):^{sym_w}}"
293    header += f"  {'ACCEPT':>6}"
294    print()
295    print(header)
296    print("  " + "-" * (col_w + len(syms) * (sym_w + 2) + 10))
297
298    for ds in dfa.states:
299        marker = " <-- start" if ds.id == dfa.start_id else ""
300        row = f"  {ds.name():<{col_w}}"
301        for sym in syms:
302            if sym in ds.transitions:
303                target_name = dfa.states[ds.transitions[sym]].name()
304                row += f"  {target_name:^{sym_w}}"
305            else:
306                row += f"  {'(dead)':^{sym_w}}"
307        row += f"  {'YES' if ds.is_accept else 'no':>6}"
308        row += marker
309        print(row)
310
311
312# ---------------------------------------------------------------------------
313# Demo
314# ---------------------------------------------------------------------------
315
316def run_demo(regex: str, test_strings: list[str]) -> None:
317    nfa = regex_to_nfa(regex)
318    dfa = subset_construction(nfa)
319    print_dfa_table(dfa, regex)
320
321    print(f"\n  Simulation results:")
322    for text in test_strings:
323        result = dfa.accepts(text)
324        display = repr(text) if text else "''"
325        print(f"    {display:>14}  ->  {'ACCEPT' if result else 'reject'}")
326
327
328def main():
329    print("=" * 60)
330    print("Subset Construction: NFA -> DFA")
331    print("=" * 60)
332
333    demos = [
334        # Simple examples to show the state explosion clearly
335        ("a|b",         ["a", "b", "c", "ab"]),
336        ("a*b",         ["b", "ab", "aab", "aaab", "a", ""]),
337        ("(a|b)*abb",   ["abb", "aabb", "babb", "ab", "abba", ""]),
338        ("ab?c",        ["ac", "abc", "abbc", "bc"]),
339    ]
340
341    for regex, tests in demos:
342        print(f"\n{'='*60}")
343        run_demo(regex, tests)
344
345    # Show NFA state count vs DFA state count for (a|b)*abb
346    print(f"\n{'='*60}")
347    print("NFA vs DFA state count comparison:")
348    print(f"{'REGEX':<20} {'NFA states':>12} {'DFA states':>12}")
349    print("-" * 46)
350    for regex, _ in demos:
351        nfa = regex_to_nfa(regex)
352        # count NFA states
353        visited: set[int] = set()
354        queue = [nfa.start]
355        while queue:
356            s = queue.pop()
357            if s.id in visited: continue
358            visited.add(s.id)
359            for tgts in s.transitions.values():
360                queue.extend(tgts)
361        nfa_count = len(visited)
362        dfa = subset_construction(nfa)
363        print(f"  {regex!r:<18} {nfa_count:>12} {len(dfa.states):>12}")
364
365
366if __name__ == "__main__":
367    main()