1"""
203_subset_construction.py - NFA to DFA via Subset Construction
3
4Demonstrates the subset construction (powerset construction) algorithm,
5which converts a Non-deterministic Finite Automaton (NFA) into an
6equivalent Deterministic Finite Automaton (DFA).
7
8Key idea:
9 Each DFA state corresponds to a *set* of NFA states.
10 The DFA's transition on symbol 'a' from state {s1,s2,...} is:
11 epsilon_closure( move({s1,s2,...}, 'a') )
12
13Topics covered:
14 - Subset construction algorithm
15 - DFA minimization concept (identifying accept states)
16 - Printing transition tables
17 - Comparing NFA vs DFA simulation results
18"""
19
20from __future__ import annotations
21from dataclasses import dataclass, field
22from typing import Optional
23
24# Reuse NFA machinery from 02_thompson_nfa
25# (copied here to keep this file standalone)
26
27EPSILON = ''
28
29
30class State:
31 _counter = 0
32
33 def __init__(self):
34 State._counter += 1
35 self.id = State._counter
36 self.transitions: dict[str, list[State]] = {}
37 self.is_accept: bool = False
38
39 def add_transition(self, symbol: str, target: State) -> None:
40 self.transitions.setdefault(symbol, []).append(target)
41
42 def __repr__(self):
43 return f"S{self.id}"
44
45
46@dataclass
47class NFA:
48 start: State
49 accept: State
50
51
52# --- Thompson's construction (abbreviated) -----------------------------------
53
54def _lit(ch):
55 s, a = State(), State()
56 s.add_transition(ch, a)
57 return NFA(s, a)
58
59def _concat(n1, n2):
60 n1.accept.add_transition(EPSILON, n2.start)
61 return NFA(n1.start, n2.accept)
62
63def _union(n1, n2):
64 s, a = State(), State()
65 s.add_transition(EPSILON, n1.start)
66 s.add_transition(EPSILON, n2.start)
67 n1.accept.add_transition(EPSILON, a)
68 n2.accept.add_transition(EPSILON, a)
69 return NFA(s, a)
70
71def _star(n):
72 s, a = State(), State()
73 s.add_transition(EPSILON, n.start)
74 s.add_transition(EPSILON, a)
75 n.accept.add_transition(EPSILON, n.start)
76 n.accept.add_transition(EPSILON, a)
77 return NFA(s, a)
78
79PREC = {'|': 1, '.': 2, '*': 3, '+': 3, '?': 3}
80BINOPS = {'|', '.'}
81UNOPS = {'*', '+', '?'}
82
83def _add_concat(r):
84 res = []
85 for i, c in enumerate(r):
86 res.append(c)
87 if i + 1 < len(r):
88 l, ri = c, r[i+1]
89 if l not in ('(', '|', '.') and ri not in (')', '|', '.', '*', '+', '?'):
90 res.append('.')
91 return ''.join(res)
92
93def _postfix(r):
94 out, stk = [], []
95 for c in r:
96 if c == '(':
97 stk.append(c)
98 elif c == ')':
99 while stk and stk[-1] != '(':
100 out.append(stk.pop())
101 if stk: stk.pop()
102 elif c in PREC:
103 while (stk and stk[-1] != '(' and stk[-1] in PREC and
104 PREC[stk[-1]] >= PREC[c] and c not in UNOPS):
105 out.append(stk.pop())
106 stk.append(c)
107 else:
108 out.append(c)
109 while stk: out.append(stk.pop())
110 return ''.join(out)
111
112def regex_to_nfa(regex: str) -> NFA:
113 State._counter = 0
114 postfix = _postfix(_add_concat(regex))
115 stk: list[NFA] = []
116 for c in postfix:
117 if c == '.':
118 b, a = stk.pop(), stk.pop(); stk.append(_concat(a, b))
119 elif c == '|':
120 b, a = stk.pop(), stk.pop(); stk.append(_union(a, b))
121 elif c == '*':
122 stk.append(_star(stk.pop()))
123 elif c == '+':
124 n = stk.pop()
125 # plus: concat(n, star(n)) -- reuse same states
126 s2, a2 = State(), State()
127 s2.add_transition(EPSILON, n.start)
128 n.accept.add_transition(EPSILON, n.start)
129 n.accept.add_transition(EPSILON, a2)
130 stk.append(NFA(s2, a2))
131 elif c == '?':
132 n = stk.pop()
133 s2, a2 = State(), State()
134 s2.add_transition(EPSILON, n.start)
135 s2.add_transition(EPSILON, a2)
136 n.accept.add_transition(EPSILON, a2)
137 stk.append(NFA(s2, a2))
138 else:
139 stk.append(_lit(c))
140 nfa = stk[0]
141 nfa.accept.is_accept = True
142 return nfa
143
144
145# ---------------------------------------------------------------------------
146# NFA simulation helpers
147# ---------------------------------------------------------------------------
148
149def epsilon_closure(states: frozenset[State]) -> frozenset[State]:
150 closure = set(states)
151 worklist = list(states)
152 while worklist:
153 s = worklist.pop()
154 for t in s.transitions.get(EPSILON, []):
155 if t not in closure:
156 closure.add(t)
157 worklist.append(t)
158 return frozenset(closure)
159
160
161def nfa_move(states: frozenset[State], sym: str) -> frozenset[State]:
162 result: set[State] = set()
163 for s in states:
164 result.update(s.transitions.get(sym, []))
165 return frozenset(result)
166
167
168def alphabet(nfa: NFA) -> set[str]:
169 """Collect all non-epsilon symbols used in the NFA."""
170 syms: set[str] = set()
171 visited: set[int] = set()
172 queue = [nfa.start]
173 while queue:
174 s = queue.pop()
175 if s.id in visited:
176 continue
177 visited.add(s.id)
178 for sym, targets in s.transitions.items():
179 if sym != EPSILON:
180 syms.add(sym)
181 queue.extend(targets)
182 return syms
183
184
185# ---------------------------------------------------------------------------
186# DFA representation
187# ---------------------------------------------------------------------------
188
189@dataclass
190class DFAState:
191 """A DFA state representing a frozenset of NFA states."""
192 id: int
193 nfa_states: frozenset[State]
194 is_accept: bool
195 transitions: dict[str, int] = field(default_factory=dict) # symbol -> DFA state id
196
197 def name(self) -> str:
198 ids = sorted(s.id for s in self.nfa_states)
199 return '{' + ','.join(f'S{i}' for i in ids) + '}'
200
201
202@dataclass
203class DFA:
204 states: list[DFAState]
205 start_id: int
206 alphabet: set[str]
207
208 def start(self) -> DFAState:
209 return self.states[self.start_id]
210
211 def accepts(self, text: str) -> bool:
212 current = self.start_id
213 for ch in text:
214 state = self.states[current]
215 if ch not in state.transitions:
216 return False
217 current = state.transitions[ch]
218 return self.states[current].is_accept
219
220
221# ---------------------------------------------------------------------------
222# Subset Construction Algorithm
223# ---------------------------------------------------------------------------
224
225def subset_construction(nfa: NFA) -> DFA:
226 """
227 Convert an NFA to a DFA using the subset construction algorithm.
228
229 Steps:
230 1. Start with epsilon_closure({nfa.start}) as the initial DFA state.
231 2. For each unmarked DFA state D and each symbol a in the alphabet:
232 a. Compute T = epsilon_closure(move(D, a))
233 b. If T is not already a DFA state, add it (unmarked)
234 c. Record the transition D --a--> T
235 3. A DFA state is accepting if any of its NFA states is an accept state.
236 """
237 syms = sorted(alphabet(nfa))
238 start_set = epsilon_closure(frozenset({nfa.start}))
239
240 # Map from frozenset[State] -> DFA state id
241 state_map: dict[frozenset[State], int] = {}
242 dfa_states: list[DFAState] = []
243
244 def get_or_create(nfa_set: frozenset[State]) -> int:
245 if nfa_set in state_map:
246 return state_map[nfa_set]
247 sid = len(dfa_states)
248 is_acc = any(s.is_accept for s in nfa_set)
249 ds = DFAState(id=sid, nfa_states=nfa_set, is_accept=is_acc)
250 dfa_states.append(ds)
251 state_map[nfa_set] = sid
252 return sid
253
254 start_id = get_or_create(start_set)
255 worklist = [start_id]
256
257 while worklist:
258 did = worklist.pop()
259 ds = dfa_states[did]
260 for sym in syms:
261 moved = nfa_move(ds.nfa_states, sym)
262 if not moved:
263 continue # dead state (no transition)
264 closed = epsilon_closure(moved)
265 target_id = get_or_create(closed)
266 ds.transitions[sym] = target_id
267 if target_id == len(dfa_states) - 1: # newly created
268 worklist.append(target_id)
269
270 return DFA(states=dfa_states, start_id=start_id, alphabet=set(syms))
271
272
273# ---------------------------------------------------------------------------
274# Pretty printing
275# ---------------------------------------------------------------------------
276
277def print_dfa_table(dfa: DFA, regex: str) -> None:
278 """Print the DFA transition table."""
279 syms = sorted(dfa.alphabet)
280 print(f"\nDFA for regex: {regex!r}")
281 print(f" States: {len(dfa.states)}")
282 print(f" Alphabet: {{{', '.join(repr(s) for s in syms)}}}")
283 print(f" Start state: {dfa.states[dfa.start_id].name()}")
284 accept_names = [dfa.states[i].name() for i, s in enumerate(dfa.states) if s.is_accept]
285 print(f" Accept states: {accept_names}")
286
287 # Header
288 col_w = max(20, max(len(dfa.states[i].name()) for i in range(len(dfa.states))) + 2)
289 sym_w = 12
290 header = f" {'STATE':<{col_w}}"
291 for sym in syms:
292 header += f" {repr(sym):^{sym_w}}"
293 header += f" {'ACCEPT':>6}"
294 print()
295 print(header)
296 print(" " + "-" * (col_w + len(syms) * (sym_w + 2) + 10))
297
298 for ds in dfa.states:
299 marker = " <-- start" if ds.id == dfa.start_id else ""
300 row = f" {ds.name():<{col_w}}"
301 for sym in syms:
302 if sym in ds.transitions:
303 target_name = dfa.states[ds.transitions[sym]].name()
304 row += f" {target_name:^{sym_w}}"
305 else:
306 row += f" {'(dead)':^{sym_w}}"
307 row += f" {'YES' if ds.is_accept else 'no':>6}"
308 row += marker
309 print(row)
310
311
312# ---------------------------------------------------------------------------
313# Demo
314# ---------------------------------------------------------------------------
315
316def run_demo(regex: str, test_strings: list[str]) -> None:
317 nfa = regex_to_nfa(regex)
318 dfa = subset_construction(nfa)
319 print_dfa_table(dfa, regex)
320
321 print(f"\n Simulation results:")
322 for text in test_strings:
323 result = dfa.accepts(text)
324 display = repr(text) if text else "''"
325 print(f" {display:>14} -> {'ACCEPT' if result else 'reject'}")
326
327
328def main():
329 print("=" * 60)
330 print("Subset Construction: NFA -> DFA")
331 print("=" * 60)
332
333 demos = [
334 # Simple examples to show the state explosion clearly
335 ("a|b", ["a", "b", "c", "ab"]),
336 ("a*b", ["b", "ab", "aab", "aaab", "a", ""]),
337 ("(a|b)*abb", ["abb", "aabb", "babb", "ab", "abba", ""]),
338 ("ab?c", ["ac", "abc", "abbc", "bc"]),
339 ]
340
341 for regex, tests in demos:
342 print(f"\n{'='*60}")
343 run_demo(regex, tests)
344
345 # Show NFA state count vs DFA state count for (a|b)*abb
346 print(f"\n{'='*60}")
347 print("NFA vs DFA state count comparison:")
348 print(f"{'REGEX':<20} {'NFA states':>12} {'DFA states':>12}")
349 print("-" * 46)
350 for regex, _ in demos:
351 nfa = regex_to_nfa(regex)
352 # count NFA states
353 visited: set[int] = set()
354 queue = [nfa.start]
355 while queue:
356 s = queue.pop()
357 if s.id in visited: continue
358 visited.add(s.id)
359 for tgts in s.transitions.values():
360 queue.extend(tgts)
361 nfa_count = len(visited)
362 dfa = subset_construction(nfa)
363 print(f" {regex!r:<18} {nfa_count:>12} {len(dfa.states):>12}")
364
365
366if __name__ == "__main__":
367 main()