1"""
205_ll1_parser.py - LL(1) Table-Driven Parser
3
4Demonstrates the construction and use of an LL(1) parser:
5 1. Represent a context-free grammar
6 2. Compute FIRST sets for all symbols
7 3. Compute FOLLOW sets for all non-terminals
8 4. Build the LL(1) parsing table
9 5. Table-driven parsing using an explicit stack
10
11Grammar (unambiguous arithmetic expressions):
12 E -> T E'
13 E' -> '+' T E' | '-' T E' | ε
14 T -> F T'
15 T' -> '*' F T' | '/' F T' | ε
16 F -> '(' E ')' | num | id
17
18This grammar is already left-recursion-free and suitable for LL(1).
19
20Topics covered:
21 - Grammar representation using Python dicts
22 - FIRST set computation (handles ε productions)
23 - FOLLOW set computation
24 - LL(1) parse table construction
25 - Detecting grammar conflicts (not LL(1) if conflicts exist)
26 - Stack-based parsing with parse trace
27"""
28
29from __future__ import annotations
30from collections import defaultdict
31from typing import Optional
32
# Symbol used on production right-hand sides to denote the empty string.
EPSILON = 'ε'
# End-of-input marker: appended to token streams and seeded at the stack bottom.
EOF_SYM = '$'
35
36
37# ---------------------------------------------------------------------------
38# Grammar Representation
39# ---------------------------------------------------------------------------
40
class Grammar:
    """
    A context-free grammar.

    Productions are stored as:
        { NonTerminal: [ [sym1, sym2, ...], [sym1, ...], ... ] }
    Terminals are single-quoted strings like 'id', 'num', '+', etc.
    Non-terminals are unquoted strings: 'E', 'T', 'F', etc.
    """

    def __init__(self, start: str, productions: dict[str, list[list[str]]]):
        self.start = start
        self.productions = productions

        # Every LHS is a non-terminal; any RHS symbol that is neither a
        # non-terminal nor ε must be a terminal.
        self.non_terminals: set[str] = set(productions)
        self.terminals: set[str] = {
            sym
            for alternatives in productions.values()
            for alternative in alternatives
            for sym in alternative
            if sym != EPSILON and sym not in self.non_terminals
        }

    def is_terminal(self, sym: str) -> bool:
        """True for grammar terminals and for the end-of-input marker."""
        return sym == EOF_SYM or sym in self.terminals

    def is_nonterminal(self, sym: str) -> bool:
        """True iff *sym* appears as the LHS of some production."""
        return sym in self.non_terminals

    def __repr__(self) -> str:
        out = [f"Grammar (start={self.start!r}):"]
        for lhs, alternatives in self.productions.items():
            out.extend(f" {lhs} -> {' '.join(alt)}" for alt in alternatives)
        return '\n'.join(out)
75
76
77# ---------------------------------------------------------------------------
78# FIRST Set Computation
79# ---------------------------------------------------------------------------
80
def compute_first(grammar: Grammar) -> dict[str, set[str]]:
    """
    Compute FIRST(X) for every symbol X in the grammar.

    FIRST(X) = set of terminals that can appear as the first symbol
    of any string derived from X. Includes ε if X =>* ε.

    Rules:
      1. If X is terminal: FIRST(X) = {X}
      2. If X -> ε: ε ∈ FIRST(X)
      3. If X -> Y1 Y2 ... Yk:
           add FIRST(Y1) - {ε} to FIRST(X)
           if ε ∈ FIRST(Y1), add FIRST(Y2) - {ε}, etc.
           if ε ∈ FIRST(Yi) for all i, add ε to FIRST(X)

    Uses fixed-point iteration: passes repeat until no FIRST set grows.
    """
    first: dict[str, set[str]] = defaultdict(set)

    # Base case (rule 1): FIRST(t) = {t} for terminals and the EOF marker.
    for t in grammar.terminals:
        first[t].add(t)
    first[EOF_SYM].add(EOF_SYM)

    changed = True
    while changed:
        changed = False
        for nt, prods in grammar.productions.items():
            for rhs in prods:
                # Rule 2: explicit epsilon production.
                if rhs == [EPSILON]:
                    if EPSILON not in first[nt]:
                        first[nt].add(EPSILON)
                        changed = True
                    continue

                # Rule 3: scan the RHS left to right, stopping at the
                # first symbol that cannot derive ε.
                all_nullable = True
                for sym in rhs:
                    before = len(first[nt])
                    first[nt].update(first[sym] - {EPSILON})
                    if len(first[nt]) > before:
                        changed = True
                    if EPSILON not in first[sym]:
                        all_nullable = False
                        break

                # Every symbol of the RHS is nullable, so the RHS is too.
                if all_nullable and EPSILON not in first[nt]:
                    first[nt].add(EPSILON)
                    changed = True

    return dict(first)
136
137
def first_of_string(symbols: list[str], first: dict[str, set[str]]) -> set[str]:
    """
    FIRST of a symbol sequence (e.g. the RHS of a production).

    Collects FIRST of each symbol left to right, stopping at the first
    symbol that cannot derive ε; contains ε only when every symbol can.
    """
    out: set[str] = set()
    nullable = True
    for symbol in symbols:
        if symbol == EPSILON:
            continue
        symbol_first = first.get(symbol, set())
        out |= symbol_first - {EPSILON}
        if EPSILON not in symbol_first:
            nullable = False
            break
    if nullable:
        out.add(EPSILON)
    return out
154
155
156# ---------------------------------------------------------------------------
157# FOLLOW Set Computation
158# ---------------------------------------------------------------------------
159
def compute_follow(grammar: Grammar, first: dict[str, set[str]]) -> dict[str, set[str]]:
    """
    Compute FOLLOW(A) for every non-terminal A.

    FOLLOW(A) = set of terminals that can appear immediately to the
    right of A in some sentential form.

    Rules:
      1. $ ∈ FOLLOW(start)
      2. B -> α A β: FIRST(β) - {ε} ⊆ FOLLOW(A)
      3. B -> α A, or B -> α A β with ε ∈ FIRST(β): FOLLOW(B) ⊆ FOLLOW(A)

    Iterates to a fixed point, like compute_first.
    """
    follow: dict[str, set[str]] = defaultdict(set)
    follow[grammar.start].add(EOF_SYM)  # rule 1

    changed = True
    while changed:
        changed = False
        for lhs, alternatives in grammar.productions.items():
            for alternative in alternatives:
                for idx, symbol in enumerate(alternative):
                    if not grammar.is_nonterminal(symbol):
                        continue
                    # β = everything to the right of this occurrence.
                    suffix = alternative[idx + 1:]
                    suffix_first = (
                        first_of_string(suffix, first) if suffix else {EPSILON}
                    )

                    target = follow[symbol]
                    size_before = len(target)
                    target |= suffix_first - {EPSILON}   # rule 2
                    if EPSILON in suffix_first:
                        target |= follow[lhs]            # rule 3
                    if len(target) > size_before:
                        changed = True

    return dict(follow)
202
203
204# ---------------------------------------------------------------------------
205# LL(1) Parse Table Construction
206# ---------------------------------------------------------------------------
207
class GrammarConflictError(Exception):
    """Raised by build_ll1_table when two productions claim the same
    table cell, i.e. the grammar is not LL(1)."""
    pass
210
211
def build_ll1_table(
    grammar: Grammar,
    first: dict[str, set[str]],
    follow: dict[str, set[str]],
) -> dict[tuple[str, str], list[str]]:
    """
    Build the LL(1) parse table M[A, a].

    For each production A -> α:
      1. M[A, a] = α for every terminal a ∈ FIRST(α)
      2. if ε ∈ FIRST(α): M[A, b] = α for every b ∈ FOLLOW(A)
         (FOLLOW may contain $, which fills the end-of-input column)

    Raises GrammarConflictError if a cell would be filled twice
    (indicating the grammar is not LL(1)).
    """
    table: dict[tuple[str, str], list[str]] = {}
    conflicts: list[str] = []

    def fill(cell: tuple[str, str], rhs: list[str]) -> None:
        # A second write to the same cell means the grammar is not LL(1);
        # record it and keep going so every conflict is reported at once.
        if cell in table:
            conflicts.append(f"Conflict at M[{cell[0]}, {cell[1]!r}]")
        table[cell] = rhs

    for lhs, alternatives in grammar.productions.items():
        for rhs in alternatives:
            rhs_first = first_of_string(rhs, first)

            for token in rhs_first - {EPSILON}:   # rule 1
                fill((lhs, token), rhs)

            if EPSILON in rhs_first:              # rule 2
                for token in follow.get(lhs, set()):
                    fill((lhs, token), rhs)

    if conflicts:
        raise GrammarConflictError(
            "Grammar is NOT LL(1):\n" + "\n".join(f" {c}" for c in conflicts)
        )
    return table
254
255
256# ---------------------------------------------------------------------------
257# Table-Driven LL(1) Parser
258# ---------------------------------------------------------------------------
259
class ParseError(Exception):
    """Raised by ll1_parse when the input cannot be derived from the grammar."""
    pass
262
263
def ll1_parse(
    tokens: list[str],
    table: dict[tuple[str, str], list[str]],
    grammar: Grammar,
) -> list[str]:
    """
    Parse a token list using the LL(1) table.

    Returns a list of strings describing each parse step (the parse trace).
    Raises ParseError if the input cannot be derived from the grammar.

    Stack-based algorithm:
        Initialize: stack = [$, start]
        Loop:
            Let X = top of stack, a = current input token
            if X == a == $: success
            if X is terminal: if X == a, pop and advance; else error
            if X is non-terminal: look up M[X, a]; push RHS in reverse
    """
    trace: list[str] = []
    input_tokens = tokens + [EOF_SYM]
    stack = [EOF_SYM, grammar.start]
    ip = 0  # input pointer

    while stack:
        X = stack[-1]
        a = input_tokens[ip]

        # Accept: stack is down to the end marker and input is exhausted.
        if X == EOF_SYM and a == EOF_SYM:
            trace.append(" ACCEPT")
            break

        if X == a:  # terminal on top of stack matches the lookahead
            trace.append(f" match {X!r}")
            stack.pop()
            ip += 1
        elif grammar.is_nonterminal(X):
            key = (X, a)
            if key not in table:
                raise ParseError(
                    f"No entry in table for M[{X!r}, {a!r}]. Input: {input_tokens}"
                )
            rhs = table[key]
            stack.pop()
            display_rhs = ' '.join(rhs) if rhs != [EPSILON] else 'ε'
            trace.append(f" {X} -> {display_rhs}")
            # Push the production body reversed so its first symbol ends up
            # on top of the stack; an ε production pushes nothing.
            if rhs != [EPSILON]:
                stack.extend(reversed(rhs))
        else:
            raise ParseError(f"Unexpected token {a!r}, expected {X!r}")

    return trace
311
312
313# ---------------------------------------------------------------------------
314# Pretty printing helpers
315# ---------------------------------------------------------------------------
316
def print_sets(label: str, sets: dict[str, set[str]], symbols: list[str]) -> None:
    """Print one line per symbol in *symbols*: its set from *sets*, sorted.

    Symbols missing from *sets* print as an empty set.
    """
    print(f"\n{label}:")
    for symbol in symbols:
        members = sorted(sets.get(symbol, set()))
        print(f" {symbol:<8} = {{ {', '.join(members)} }}")
323
324
def print_parse_table(
    table: dict[tuple[str, str], list[str]],
    grammar: Grammar,
    terminals: list[str],
) -> None:
    """Render the LL(1) table as a grid: one row per non-terminal, one
    column per terminal plus the end-of-input marker $. Empty cells mean
    a parse error for that (non-terminal, lookahead) pair."""
    columns = sorted(terminals) + [EOF_SYM]
    col_w = 18
    print("\nLL(1) Parse Table:")
    print(f" {'':12}" + ''.join(f" {t!r:^{col_w}}" for t in columns))
    print(" " + "-" * (12 + len(columns) * (col_w + 2)))
    for nt in sorted(grammar.non_terminals):
        cells = []
        for t in columns:
            rhs = table.get((nt, t))
            cells.append(nt + '->' + ' '.join(rhs) if rhs else '')
        print(f" {nt:<12}" + ''.join(f" {c:^{col_w}}" for c in cells))
347
348
349# ---------------------------------------------------------------------------
350# Demo Grammar and Tests
351# ---------------------------------------------------------------------------
352
# Standard unambiguous expression grammar (left-recursion eliminated).
# E' and T' are the "tail" non-terminals introduced by the elimination;
# both derive ε, which exercises the FOLLOW-set and ε-entry table logic.
EXPR_GRAMMAR = Grammar(
    start='E',
    productions={
        'E': [['T', "E'"]],
        "E'": [['+', 'T', "E'"], ['-', 'T', "E'"], [EPSILON]],
        'T': [['F', "T'"]],
        "T'": [['*', 'F', "T'"], ['/', 'F', "T'"], [EPSILON]],
        'F': [['(', 'E', ')'], ['num'], ['id']],
    }
)
364
# Test token sequences (already tokenized; lexing is out of scope here).
# 'num' and 'id' stand for whole number/identifier tokens.
TEST_INPUTS = [
    ['num', '+', 'num', '*', 'num'],          # num + num * num
    ['id', '*', '(', 'num', '+', 'id', ')'],  # id * (num + id)
    ['num'],                                  # just a number
    ['(', 'id', '+', 'num', ')', '*', 'id'],  # (id + num) * id
]
372
373
def main():
    """Run the whole pipeline: grammar -> FIRST/FOLLOW -> table -> traces."""
    banner = "=" * 60
    print(banner)
    print("LL(1) Parser Demo")
    print(banner)

    grammar = EXPR_GRAMMAR
    print(f"\n{grammar}")

    # FIRST/FOLLOW are prerequisites for table construction.
    first_sets = compute_first(grammar)
    follow_sets = compute_follow(grammar, first_sets)

    display_order = ['E', "E'", 'T', "T'", 'F']
    print_sets("FIRST sets", first_sets, display_order)
    print_sets("FOLLOW sets", follow_sets, display_order)

    # Would raise GrammarConflictError if the grammar were not LL(1).
    parse_table = build_ll1_table(grammar, first_sets, follow_sets)
    print_parse_table(parse_table, grammar, sorted(grammar.terminals))

    print("\n--- Parse Traces ---")
    for token_seq in TEST_INPUTS:
        print(f"\nInput: {' '.join(token_seq)}")
        try:
            for step in ll1_parse(token_seq, parse_table, grammar):
                print(step)
        except ParseError as exc:
            print(f" ERROR: {exc}")


if __name__ == "__main__":
    main()