1"""
202_thompson_nfa.py - Thompson's Construction: Regex to NFA
3
4Demonstrates converting a regular expression to a Non-deterministic
5Finite Automaton (NFA) using Thompson's construction algorithm.
6
7Algorithm overview:
8 1. Parse the regex into postfix notation (handles |, *, +, ?, concatenation)
9 2. Use a stack of NFA fragments
10 3. For each operator, pop fragment(s), apply construction, push result
11 4. Final stack item is the complete NFA
12
13Topics covered:
14 - Infix-to-postfix conversion (Shunting Yard)
15 - NFA state and transition representation
16 - Thompson's construction rules for each operator
17 - Epsilon-closure
18 - NFA simulation (accepts/rejects input strings)
19"""
20
21from __future__ import annotations
22from dataclasses import dataclass, field
23from typing import Optional
24
25EPSILON = '' # Represents an epsilon (empty) transition
26
27
28# ---------------------------------------------------------------------------
29# NFA State and Fragment
30# ---------------------------------------------------------------------------
31
32class State:
33 """A single NFA state. Uses an auto-incrementing ID for display."""
34 _counter = 0
35
36 def __init__(self):
37 State._counter += 1
38 self.id: int = State._counter
39 # transitions: dict mapping symbol -> list[State]
40 self.transitions: dict[str, list[State]] = {}
41 self.is_accept: bool = False
42
43 def add_transition(self, symbol: str, target: State) -> None:
44 self.transitions.setdefault(symbol, []).append(target)
45
46 def __repr__(self) -> str:
47 return f"S{self.id}"
48
49
50@dataclass
51class NFA:
52 """
53 An NFA fragment used during Thompson's construction.
54 Each fragment has exactly one start state and one accept state.
55 After construction, the final NFA's accept state's is_accept is set to True.
56 """
57 start: State
58 accept: State
59
60
61# ---------------------------------------------------------------------------
62# Thompson's Construction
63# ---------------------------------------------------------------------------
64
65def make_literal(ch: str) -> NFA:
66 """NFA that accepts exactly one character 'ch'."""
67 s = State()
68 a = State()
69 s.add_transition(ch, a)
70 return NFA(s, a)
71
72
73def make_epsilon() -> NFA:
74 """NFA that accepts the empty string."""
75 s = State()
76 a = State()
77 s.add_transition(EPSILON, a)
78 return NFA(s, a)
79
80
81def make_concat(nfa1: NFA, nfa2: NFA) -> NFA:
82 """
83 NFA for (nfa1)(nfa2): connect nfa1's accept to nfa2's start via epsilon.
84 """
85 nfa1.accept.add_transition(EPSILON, nfa2.start)
86 return NFA(nfa1.start, nfa2.accept)
87
88
89def make_union(nfa1: NFA, nfa2: NFA) -> NFA:
90 """
91 NFA for (nfa1)|(nfa2):
92 new_start --eps--> nfa1.start
93 new_start --eps--> nfa2.start
94 nfa1.accept --eps--> new_accept
95 nfa2.accept --eps--> new_accept
96 """
97 new_start = State()
98 new_accept = State()
99 new_start.add_transition(EPSILON, nfa1.start)
100 new_start.add_transition(EPSILON, nfa2.start)
101 nfa1.accept.add_transition(EPSILON, new_accept)
102 nfa2.accept.add_transition(EPSILON, new_accept)
103 return NFA(new_start, new_accept)
104
105
106def make_star(nfa: NFA) -> NFA:
107 """
108 NFA for (nfa)*:
109 new_start --eps--> nfa.start
110 new_start --eps--> new_accept (zero repetitions)
111 nfa.accept --eps--> nfa.start (repeat)
112 nfa.accept --eps--> new_accept (exit loop)
113 """
114 new_start = State()
115 new_accept = State()
116 new_start.add_transition(EPSILON, nfa.start)
117 new_start.add_transition(EPSILON, new_accept)
118 nfa.accept.add_transition(EPSILON, nfa.start)
119 nfa.accept.add_transition(EPSILON, new_accept)
120 return NFA(new_start, new_accept)
121
122
123def make_plus(nfa: NFA) -> NFA:
124 """NFA for (nfa)+: one or more. Equivalent to nfa·nfa*."""
125 star = make_star(NFA(nfa.start, nfa.accept))
126 # We need a fresh copy concept; instead use: nfa · (nfa)*
127 # For simplicity in Thompson's, plus(nfa) = concat(nfa, star(nfa_copy))
128 # We reuse existing states: after nfa.accept, loop back
129 # Simpler: make_concat(nfa, make_star(nfa_copy))
130 # Since we can't easily copy states, implement directly:
131 new_start = State()
132 new_accept = State()
133 new_start.add_transition(EPSILON, nfa.start)
134 nfa.accept.add_transition(EPSILON, nfa.start) # repeat
135 nfa.accept.add_transition(EPSILON, new_accept) # exit
136 return NFA(new_start, new_accept)
137
138
139def make_question(nfa: NFA) -> NFA:
140 """NFA for (nfa)?: zero or one. Adds epsilon bypass."""
141 new_start = State()
142 new_accept = State()
143 new_start.add_transition(EPSILON, nfa.start)
144 new_start.add_transition(EPSILON, new_accept) # skip
145 nfa.accept.add_transition(EPSILON, new_accept)
146 return NFA(new_start, new_accept)
147
148
149# ---------------------------------------------------------------------------
150# Regex to NFA: Shunting Yard + Thompson
151# ---------------------------------------------------------------------------
152
153# Operator precedence (higher = tighter binding)
154PRECEDENCE = {'|': 1, '.': 2, '*': 3, '+': 3, '?': 3}
155BINARY_OPS = {'|', '.'}
156UNARY_OPS = {'*', '+', '?'}
157
158
159def add_concat_operator(regex: str) -> str:
160 """
161 Insert explicit concatenation operator '.' between adjacent tokens.
162 E.g., "ab" -> "a.b", "a*b" -> "a*.b", "(a)(b)" -> "(a).(b)"
163 """
164 result = []
165 for i, ch in enumerate(regex):
166 result.append(ch)
167 if i + 1 < len(regex):
168 left = ch
169 right = regex[i + 1]
170 # Insert '.' if left is not an open paren or binary operator,
171 # and right is not a close paren, unary op, or binary op.
172 if (left not in ('(', '|', '.') and
173 right not in (')', '|', '.', '*', '+', '?')):
174 result.append('.')
175 return ''.join(result)
176
177
178def to_postfix(regex: str) -> str:
179 """Convert infix regex (with explicit '.') to postfix using Shunting Yard."""
180 output = []
181 stack = []
182 for ch in regex:
183 if ch == '(':
184 stack.append(ch)
185 elif ch == ')':
186 while stack and stack[-1] != '(':
187 output.append(stack.pop())
188 if stack:
189 stack.pop() # pop '('
190 elif ch in PRECEDENCE:
191 while (stack and stack[-1] != '(' and
192 stack[-1] in PRECEDENCE and
193 PRECEDENCE[stack[-1]] >= PRECEDENCE[ch] and
194 ch not in UNARY_OPS): # right-associative unary
195 output.append(stack.pop())
196 stack.append(ch)
197 else:
198 output.append(ch) # literal character
199 while stack:
200 output.append(stack.pop())
201 return ''.join(output)
202
203
204def regex_to_nfa(regex: str) -> NFA:
205 """Full pipeline: regex string -> NFA."""
206 State._counter = 0 # reset state IDs for clean output
207 explicit = add_concat_operator(regex)
208 postfix = to_postfix(explicit)
209
210 stack: list[NFA] = []
211
212 for ch in postfix:
213 if ch == '.':
214 b = stack.pop()
215 a = stack.pop()
216 stack.append(make_concat(a, b))
217 elif ch == '|':
218 b = stack.pop()
219 a = stack.pop()
220 stack.append(make_union(a, b))
221 elif ch == '*':
222 a = stack.pop()
223 stack.append(make_star(a))
224 elif ch == '+':
225 a = stack.pop()
226 stack.append(make_plus(a))
227 elif ch == '?':
228 a = stack.pop()
229 stack.append(make_question(a))
230 else:
231 stack.append(make_literal(ch))
232
233 nfa = stack[0]
234 nfa.accept.is_accept = True
235 return nfa
236
237
238# ---------------------------------------------------------------------------
239# NFA Simulation
240# ---------------------------------------------------------------------------
241
242def epsilon_closure(states: set[State]) -> set[State]:
243 """Compute the set of states reachable via epsilon transitions."""
244 closure = set(states)
245 worklist = list(states)
246 while worklist:
247 s = worklist.pop()
248 for target in s.transitions.get(EPSILON, []):
249 if target not in closure:
250 closure.add(target)
251 worklist.append(target)
252 return closure
253
254
255def nfa_move(states: set[State], symbol: str) -> set[State]:
256 """Compute states reachable from 'states' via 'symbol' (not epsilon)."""
257 result: set[State] = set()
258 for s in states:
259 result.update(s.transitions.get(symbol, []))
260 return result
261
262
263def nfa_accepts(nfa: NFA, text: str) -> bool:
264 """Simulate the NFA on 'text'. Returns True if any accepting state is reached."""
265 current = epsilon_closure({nfa.start})
266 for ch in text:
267 current = epsilon_closure(nfa_move(current, ch))
268 if not current:
269 return False
270 return any(s.is_accept for s in current)
271
272
273def collect_states(nfa: NFA) -> list[State]:
274 """BFS to collect all reachable states from nfa.start."""
275 visited: set[int] = set()
276 states: list[State] = []
277 queue = [nfa.start]
278 while queue:
279 s = queue.pop(0)
280 if s.id in visited:
281 continue
282 visited.add(s.id)
283 states.append(s)
284 for targets in s.transitions.values():
285 queue.extend(targets)
286 return states
287
288
289def print_nfa(nfa: NFA, regex: str) -> None:
290 """Print a human-readable representation of the NFA."""
291 states = collect_states(nfa)
292 print(f" Regex : {regex}")
293 print(f" Start : S{nfa.start.id}")
294 print(f" Accept : S{nfa.accept.id}")
295 print(f" States : {len(states)}")
296 print(f" Transitions:")
297 for s in sorted(states, key=lambda x: x.id):
298 for sym, targets in sorted(s.transitions.items()):
299 sym_display = 'ε' if sym == EPSILON else repr(sym)
300 for t in targets:
301 marker = ' (accept)' if t.is_accept else ''
302 print(f" S{s.id} --{sym_display}--> S{t.id}{marker}")
303
304
305# ---------------------------------------------------------------------------
306# Demo
307# ---------------------------------------------------------------------------
308
309def main():
310 print("=" * 60)
311 print("Thompson's NFA Construction")
312 print("=" * 60)
313
314 test_cases = [
315 # (regex, strings_to_test)
316 ("a", ["a", "b", ""]),
317 ("ab", ["ab", "a", "abc"]),
318 ("a|b", ["a", "b", "c", ""]),
319 ("a*", ["", "a", "aa", "aaa", "b"]),
320 ("a+", ["", "a", "aa", "b"]),
321 ("a?b", ["b", "ab", "aab"]),
322 ("(a|b)*", ["", "a", "b", "ab", "ba", "aabb", "c"]),
323 ("(a|b)*abb", ["abb", "aabb", "babb", "ab", "abba"]),
324 ("a(b|c)*d", ["ad", "abd", "acd", "abcd", "abbd", "x"]),
325 ]
326
327 for regex, tests in test_cases:
328 print(f"\n{'─'*56}")
329 nfa = regex_to_nfa(regex)
330 print_nfa(nfa, regex)
331 print(f" Simulation:")
332 for text in tests:
333 result = nfa_accepts(nfa, text)
334 symbol = "ACCEPT" if result else "reject"
335 display = repr(text) if text else "''"
336 print(f" {display:>12} -> {symbol}")
337
338
339if __name__ == "__main__":
340 main()