02_thompson_nfa.py

  1"""
  202_thompson_nfa.py - Thompson's Construction: Regex to NFA
  3
  4Demonstrates converting a regular expression to a Non-deterministic
  5Finite Automaton (NFA) using Thompson's construction algorithm.
  6
  7Algorithm overview:
  8  1. Parse the regex into postfix notation (handles |, *, +, ?, concatenation)
  9  2. Use a stack of NFA fragments
 10  3. For each operator, pop fragment(s), apply construction, push result
 11  4. Final stack item is the complete NFA
 12
 13Topics covered:
 14  - Infix-to-postfix conversion (Shunting Yard)
 15  - NFA state and transition representation
 16  - Thompson's construction rules for each operator
 17  - Epsilon-closure
 18  - NFA simulation (accepts/rejects input strings)
 19"""
 20
 21from __future__ import annotations
 22from dataclasses import dataclass, field
 23from typing import Optional
 24
 25EPSILON = ''   # Represents an epsilon (empty) transition
 26
 27
 28# ---------------------------------------------------------------------------
 29# NFA State and Fragment
 30# ---------------------------------------------------------------------------
 31
 32class State:
 33    """A single NFA state. Uses an auto-incrementing ID for display."""
 34    _counter = 0
 35
 36    def __init__(self):
 37        State._counter += 1
 38        self.id: int = State._counter
 39        # transitions: dict mapping symbol -> list[State]
 40        self.transitions: dict[str, list[State]] = {}
 41        self.is_accept: bool = False
 42
 43    def add_transition(self, symbol: str, target: State) -> None:
 44        self.transitions.setdefault(symbol, []).append(target)
 45
 46    def __repr__(self) -> str:
 47        return f"S{self.id}"
 48
 49
 50@dataclass
 51class NFA:
 52    """
 53    An NFA fragment used during Thompson's construction.
 54    Each fragment has exactly one start state and one accept state.
 55    After construction, the final NFA's accept state's is_accept is set to True.
 56    """
 57    start: State
 58    accept: State
 59
 60
 61# ---------------------------------------------------------------------------
 62# Thompson's Construction
 63# ---------------------------------------------------------------------------
 64
 65def make_literal(ch: str) -> NFA:
 66    """NFA that accepts exactly one character 'ch'."""
 67    s = State()
 68    a = State()
 69    s.add_transition(ch, a)
 70    return NFA(s, a)
 71
 72
 73def make_epsilon() -> NFA:
 74    """NFA that accepts the empty string."""
 75    s = State()
 76    a = State()
 77    s.add_transition(EPSILON, a)
 78    return NFA(s, a)
 79
 80
 81def make_concat(nfa1: NFA, nfa2: NFA) -> NFA:
 82    """
 83    NFA for (nfa1)(nfa2): connect nfa1's accept to nfa2's start via epsilon.
 84    """
 85    nfa1.accept.add_transition(EPSILON, nfa2.start)
 86    return NFA(nfa1.start, nfa2.accept)
 87
 88
 89def make_union(nfa1: NFA, nfa2: NFA) -> NFA:
 90    """
 91    NFA for (nfa1)|(nfa2):
 92      new_start --eps--> nfa1.start
 93      new_start --eps--> nfa2.start
 94      nfa1.accept --eps--> new_accept
 95      nfa2.accept --eps--> new_accept
 96    """
 97    new_start = State()
 98    new_accept = State()
 99    new_start.add_transition(EPSILON, nfa1.start)
100    new_start.add_transition(EPSILON, nfa2.start)
101    nfa1.accept.add_transition(EPSILON, new_accept)
102    nfa2.accept.add_transition(EPSILON, new_accept)
103    return NFA(new_start, new_accept)
104
105
106def make_star(nfa: NFA) -> NFA:
107    """
108    NFA for (nfa)*:
109      new_start --eps--> nfa.start
110      new_start --eps--> new_accept  (zero repetitions)
111      nfa.accept --eps--> nfa.start  (repeat)
112      nfa.accept --eps--> new_accept (exit loop)
113    """
114    new_start = State()
115    new_accept = State()
116    new_start.add_transition(EPSILON, nfa.start)
117    new_start.add_transition(EPSILON, new_accept)
118    nfa.accept.add_transition(EPSILON, nfa.start)
119    nfa.accept.add_transition(EPSILON, new_accept)
120    return NFA(new_start, new_accept)
121
122
123def make_plus(nfa: NFA) -> NFA:
124    """NFA for (nfa)+: one or more. Equivalent to nfa·nfa*."""
125    star = make_star(NFA(nfa.start, nfa.accept))
126    # We need a fresh copy concept; instead use: nfa · (nfa)*
127    # For simplicity in Thompson's, plus(nfa) = concat(nfa, star(nfa_copy))
128    # We reuse existing states: after nfa.accept, loop back
129    # Simpler: make_concat(nfa, make_star(nfa_copy))
130    # Since we can't easily copy states, implement directly:
131    new_start = State()
132    new_accept = State()
133    new_start.add_transition(EPSILON, nfa.start)
134    nfa.accept.add_transition(EPSILON, nfa.start)   # repeat
135    nfa.accept.add_transition(EPSILON, new_accept)   # exit
136    return NFA(new_start, new_accept)
137
138
139def make_question(nfa: NFA) -> NFA:
140    """NFA for (nfa)?: zero or one. Adds epsilon bypass."""
141    new_start = State()
142    new_accept = State()
143    new_start.add_transition(EPSILON, nfa.start)
144    new_start.add_transition(EPSILON, new_accept)   # skip
145    nfa.accept.add_transition(EPSILON, new_accept)
146    return NFA(new_start, new_accept)
147
148
149# ---------------------------------------------------------------------------
150# Regex to NFA: Shunting Yard + Thompson
151# ---------------------------------------------------------------------------
152
153# Operator precedence (higher = tighter binding)
154PRECEDENCE = {'|': 1, '.': 2, '*': 3, '+': 3, '?': 3}
155BINARY_OPS = {'|', '.'}
156UNARY_OPS = {'*', '+', '?'}
157
158
159def add_concat_operator(regex: str) -> str:
160    """
161    Insert explicit concatenation operator '.' between adjacent tokens.
162    E.g., "ab" -> "a.b",  "a*b" -> "a*.b",  "(a)(b)" -> "(a).(b)"
163    """
164    result = []
165    for i, ch in enumerate(regex):
166        result.append(ch)
167        if i + 1 < len(regex):
168            left = ch
169            right = regex[i + 1]
170            # Insert '.' if left is not an open paren or binary operator,
171            # and right is not a close paren, unary op, or binary op.
172            if (left not in ('(', '|', '.') and
173                    right not in (')', '|', '.', '*', '+', '?')):
174                result.append('.')
175    return ''.join(result)
176
177
178def to_postfix(regex: str) -> str:
179    """Convert infix regex (with explicit '.') to postfix using Shunting Yard."""
180    output = []
181    stack = []
182    for ch in regex:
183        if ch == '(':
184            stack.append(ch)
185        elif ch == ')':
186            while stack and stack[-1] != '(':
187                output.append(stack.pop())
188            if stack:
189                stack.pop()   # pop '('
190        elif ch in PRECEDENCE:
191            while (stack and stack[-1] != '(' and
192                   stack[-1] in PRECEDENCE and
193                   PRECEDENCE[stack[-1]] >= PRECEDENCE[ch] and
194                   ch not in UNARY_OPS):   # right-associative unary
195                output.append(stack.pop())
196            stack.append(ch)
197        else:
198            output.append(ch)   # literal character
199    while stack:
200        output.append(stack.pop())
201    return ''.join(output)
202
203
204def regex_to_nfa(regex: str) -> NFA:
205    """Full pipeline: regex string -> NFA."""
206    State._counter = 0   # reset state IDs for clean output
207    explicit = add_concat_operator(regex)
208    postfix = to_postfix(explicit)
209
210    stack: list[NFA] = []
211
212    for ch in postfix:
213        if ch == '.':
214            b = stack.pop()
215            a = stack.pop()
216            stack.append(make_concat(a, b))
217        elif ch == '|':
218            b = stack.pop()
219            a = stack.pop()
220            stack.append(make_union(a, b))
221        elif ch == '*':
222            a = stack.pop()
223            stack.append(make_star(a))
224        elif ch == '+':
225            a = stack.pop()
226            stack.append(make_plus(a))
227        elif ch == '?':
228            a = stack.pop()
229            stack.append(make_question(a))
230        else:
231            stack.append(make_literal(ch))
232
233    nfa = stack[0]
234    nfa.accept.is_accept = True
235    return nfa
236
237
238# ---------------------------------------------------------------------------
239# NFA Simulation
240# ---------------------------------------------------------------------------
241
242def epsilon_closure(states: set[State]) -> set[State]:
243    """Compute the set of states reachable via epsilon transitions."""
244    closure = set(states)
245    worklist = list(states)
246    while worklist:
247        s = worklist.pop()
248        for target in s.transitions.get(EPSILON, []):
249            if target not in closure:
250                closure.add(target)
251                worklist.append(target)
252    return closure
253
254
255def nfa_move(states: set[State], symbol: str) -> set[State]:
256    """Compute states reachable from 'states' via 'symbol' (not epsilon)."""
257    result: set[State] = set()
258    for s in states:
259        result.update(s.transitions.get(symbol, []))
260    return result
261
262
263def nfa_accepts(nfa: NFA, text: str) -> bool:
264    """Simulate the NFA on 'text'. Returns True if any accepting state is reached."""
265    current = epsilon_closure({nfa.start})
266    for ch in text:
267        current = epsilon_closure(nfa_move(current, ch))
268        if not current:
269            return False
270    return any(s.is_accept for s in current)
271
272
273def collect_states(nfa: NFA) -> list[State]:
274    """BFS to collect all reachable states from nfa.start."""
275    visited: set[int] = set()
276    states: list[State] = []
277    queue = [nfa.start]
278    while queue:
279        s = queue.pop(0)
280        if s.id in visited:
281            continue
282        visited.add(s.id)
283        states.append(s)
284        for targets in s.transitions.values():
285            queue.extend(targets)
286    return states
287
288
289def print_nfa(nfa: NFA, regex: str) -> None:
290    """Print a human-readable representation of the NFA."""
291    states = collect_states(nfa)
292    print(f"  Regex  : {regex}")
293    print(f"  Start  : S{nfa.start.id}")
294    print(f"  Accept : S{nfa.accept.id}")
295    print(f"  States : {len(states)}")
296    print(f"  Transitions:")
297    for s in sorted(states, key=lambda x: x.id):
298        for sym, targets in sorted(s.transitions.items()):
299            sym_display = 'ε' if sym == EPSILON else repr(sym)
300            for t in targets:
301                marker = ' (accept)' if t.is_accept else ''
302                print(f"    S{s.id} --{sym_display}--> S{t.id}{marker}")
303
304
305# ---------------------------------------------------------------------------
306# Demo
307# ---------------------------------------------------------------------------
308
309def main():
310    print("=" * 60)
311    print("Thompson's NFA Construction")
312    print("=" * 60)
313
314    test_cases = [
315        # (regex, strings_to_test)
316        ("a",           ["a", "b", ""]),
317        ("ab",          ["ab", "a", "abc"]),
318        ("a|b",         ["a", "b", "c", ""]),
319        ("a*",          ["", "a", "aa", "aaa", "b"]),
320        ("a+",          ["", "a", "aa", "b"]),
321        ("a?b",         ["b", "ab", "aab"]),
322        ("(a|b)*",      ["", "a", "b", "ab", "ba", "aabb", "c"]),
323        ("(a|b)*abb",   ["abb", "aabb", "babb", "ab", "abba"]),
324        ("a(b|c)*d",    ["ad", "abd", "acd", "abcd", "abbd", "x"]),
325    ]
326
327    for regex, tests in test_cases:
328        print(f"\n{'─'*56}")
329        nfa = regex_to_nfa(regex)
330        print_nfa(nfa, regex)
331        print(f"  Simulation:")
332        for text in tests:
333            result = nfa_accepts(nfa, text)
334            symbol = "ACCEPT" if result else "reject"
335            display = repr(text) if text else "''"
336            print(f"    {display:>12}  ->  {symbol}")
337
338
339if __name__ == "__main__":
340    main()