05_ll1_parser.py

Download
python 409 lines 12.9 KB
  1"""
  205_ll1_parser.py - LL(1) Table-Driven Parser
  3
  4Demonstrates the construction and use of an LL(1) parser:
  5  1. Represent a context-free grammar
  6  2. Compute FIRST sets for all symbols
  7  3. Compute FOLLOW sets for all non-terminals
  8  4. Build the LL(1) parsing table
  9  5. Table-driven parsing using an explicit stack
 10
 11Grammar (unambiguous arithmetic expressions):
 12  E  -> T E'
 13  E' -> '+' T E' | '-' T E' | ε
 14  T  -> F T'
 15  T' -> '*' F T' | '/' F T' | ε
 16  F  -> '(' E ')' | num | id
 17
 18This grammar is already left-recursion-free and suitable for LL(1).
 19
 20Topics covered:
 21  - Grammar representation using Python dicts
 22  - FIRST set computation (handles ε productions)
 23  - FOLLOW set computation
 24  - LL(1) parse table construction
 25  - Detecting grammar conflicts (not LL(1) if conflicts exist)
 26  - Stack-based parsing with parse trace
 27"""
 28
 29from __future__ import annotations
 30from collections import defaultdict
 31from typing import Optional
 32
 33EPSILON = 'ε'
 34EOF_SYM = '$'
 35
 36
 37# ---------------------------------------------------------------------------
 38# Grammar Representation
 39# ---------------------------------------------------------------------------
 40
 41class Grammar:
 42    """
 43    A context-free grammar.
 44    Productions are stored as:
 45      { NonTerminal: [ [sym1, sym2, ...], [sym1, ...], ... ] }
 46    Terminals are single-quoted strings like 'id', 'num', '+', etc.
 47    Non-terminals are unquoted strings: 'E', 'T', 'F', etc.
 48    """
 49
 50    def __init__(self, start: str, productions: dict[str, list[list[str]]]):
 51        self.start = start
 52        self.productions = productions
 53
 54        # Identify all non-terminals and terminals
 55        self.non_terminals: set[str] = set(productions.keys())
 56        self.terminals: set[str] = set()
 57        for rhs_list in productions.values():
 58            for rhs in rhs_list:
 59                for sym in rhs:
 60                    if sym not in self.non_terminals and sym != EPSILON:
 61                        self.terminals.add(sym)
 62
 63    def is_terminal(self, sym: str) -> bool:
 64        return sym in self.terminals or sym == EOF_SYM
 65
 66    def is_nonterminal(self, sym: str) -> bool:
 67        return sym in self.non_terminals
 68
 69    def __repr__(self) -> str:
 70        lines = [f"Grammar (start={self.start!r}):"]
 71        for nt, prods in self.productions.items():
 72            for rhs in prods:
 73                lines.append(f"  {nt} -> {' '.join(rhs)}")
 74        return '\n'.join(lines)
 75
 76
 77# ---------------------------------------------------------------------------
 78# FIRST Set Computation
 79# ---------------------------------------------------------------------------
 80
 81def compute_first(grammar: Grammar) -> dict[str, set[str]]:
 82    """
 83    Compute FIRST(X) for every symbol X in the grammar.
 84
 85    FIRST(X) = set of terminals that can appear as the first symbol
 86               of any string derived from X. Includes ε if X =>* ε.
 87
 88    Rules:
 89      1. If X is terminal: FIRST(X) = {X}
 90      2. If X -> ε: ε ∈ FIRST(X)
 91      3. If X -> Y1 Y2 ... Yk:
 92           add FIRST(Y1) - {ε} to FIRST(X)
 93           if ε ∈ FIRST(Y1), add FIRST(Y2) - {ε}, etc.
 94           if ε ∈ FIRST(Yi) for all i, add ε to FIRST(X)
 95    """
 96    first: dict[str, set[str]] = defaultdict(set)
 97
 98    # Terminals: FIRST(t) = {t}
 99    for t in grammar.terminals:
100        first[t].add(t)
101    first[EOF_SYM].add(EOF_SYM)
102
103    # Iteratively compute FIRST for non-terminals
104    changed = True
105    while changed:
106        changed = False
107        for nt, prods in grammar.productions.items():
108            for rhs in prods:
109                # Handle epsilon production
110                if rhs == [EPSILON]:
111                    if EPSILON not in first[nt]:
112                        first[nt].add(EPSILON)
113                        changed = True
114                    continue
115
116                # Compute FIRST of the RHS
117                all_have_epsilon = True
118                for sym in rhs:
119                    sym_first = first[sym] - {EPSILON}
120                    before = len(first[nt])
121                    first[nt].update(sym_first)
122                    if len(first[nt]) > before:
123                        changed = True
124                    if EPSILON not in first[sym]:
125                        all_have_epsilon = False
126                        break
127                else:
128                    pass
129
130                if all_have_epsilon:
131                    if EPSILON not in first[nt]:
132                        first[nt].add(EPSILON)
133                        changed = True
134
135    return dict(first)
136
137
138def first_of_string(symbols: list[str], first: dict[str, set[str]]) -> set[str]:
139    """
140    Compute FIRST of a sequence of symbols (e.g., the RHS of a production).
141    """
142    result: set[str] = set()
143    all_nullable = True
144    for sym in symbols:
145        if sym == EPSILON:
146            continue
147        result.update(first.get(sym, set()) - {EPSILON})
148        if EPSILON not in first.get(sym, set()):
149            all_nullable = False
150            break
151    if all_nullable:
152        result.add(EPSILON)
153    return result
154
155
156# ---------------------------------------------------------------------------
157# FOLLOW Set Computation
158# ---------------------------------------------------------------------------
159
160def compute_follow(grammar: Grammar, first: dict[str, set[str]]) -> dict[str, set[str]]:
161    """
162    Compute FOLLOW(A) for every non-terminal A.
163
164    FOLLOW(A) = set of terminals that can appear immediately to the
165                right of A in some sentential form.
166
167    Rules:
168      1. EOF_SYM ($) ∈ FOLLOW(start)
169      2. If B -> α A β: add FIRST(β) - {ε} to FOLLOW(A)
170      3. If B -> α A or B -> α A β where ε ∈ FIRST(β):
171           add FOLLOW(B) to FOLLOW(A)
172    """
173    follow: dict[str, set[str]] = defaultdict(set)
174    follow[grammar.start].add(EOF_SYM)
175
176    changed = True
177    while changed:
178        changed = False
179        for nt, prods in grammar.productions.items():
180            for rhs in prods:
181                for i, sym in enumerate(rhs):
182                    if not grammar.is_nonterminal(sym):
183                        continue
184                    # Compute FIRST(β) where β = rhs[i+1:]
185                    beta = rhs[i+1:]
186                    beta_first = first_of_string(beta, first) if beta else {EPSILON}
187
188                    # Rule 2: add FIRST(β) - {ε}
189                    before = len(follow[sym])
190                    follow[sym].update(beta_first - {EPSILON})
191                    if len(follow[sym]) > before:
192                        changed = True
193
194                    # Rule 3: if ε ∈ FIRST(β), add FOLLOW(B)
195                    if EPSILON in beta_first:
196                        before = len(follow[sym])
197                        follow[sym].update(follow[nt])
198                        if len(follow[sym]) > before:
199                            changed = True
200
201    return dict(follow)
202
203
204# ---------------------------------------------------------------------------
205# LL(1) Parse Table Construction
206# ---------------------------------------------------------------------------
207
208class GrammarConflictError(Exception):
209    pass
210
211
212def build_ll1_table(
213    grammar: Grammar,
214    first: dict[str, set[str]],
215    follow: dict[str, set[str]],
216) -> dict[tuple[str, str], list[str]]:
217    """
218    Build the LL(1) parse table M[A, a].
219
220    For each production A -> α:
221      1. For each terminal a in FIRST(α): set M[A, a] = α
222      2. If ε ∈ FIRST(α):
223           for each terminal b in FOLLOW(A): set M[A, b] = α
224           if $ ∈ FOLLOW(A): set M[A, $] = α
225
226    Raises GrammarConflictError if a cell would be filled twice
227    (indicating the grammar is not LL(1)).
228    """
229    table: dict[tuple[str, str], list[str]] = {}
230    conflicts: list[str] = []
231
232    for nt, prods in grammar.productions.items():
233        for rhs in prods:
234            f = first_of_string(rhs, first)
235
236            for terminal in f - {EPSILON}:
237                key = (nt, terminal)
238                if key in table:
239                    conflicts.append(f"Conflict at M[{nt}, {terminal!r}]")
240                table[key] = rhs
241
242            if EPSILON in f:
243                for terminal in follow.get(nt, set()):
244                    key = (nt, terminal)
245                    if key in table:
246                        conflicts.append(f"Conflict at M[{nt}, {terminal!r}]")
247                    table[key] = rhs
248
249    if conflicts:
250        raise GrammarConflictError(
251            "Grammar is NOT LL(1):\n" + "\n".join(f"  {c}" for c in conflicts)
252        )
253    return table
254
255
256# ---------------------------------------------------------------------------
257# Table-Driven LL(1) Parser
258# ---------------------------------------------------------------------------
259
260class ParseError(Exception):
261    pass
262
263
264def ll1_parse(tokens: list[str], table: dict, grammar: Grammar) -> list[str]:
265    """
266    Parse a token list using the LL(1) table.
267    Returns a list of strings describing each parse step (the parse trace).
268
269    Stack-based algorithm:
270      Initialize: stack = [$, start]
271      Loop:
272        Let X = top of stack, a = current input token
273        if X == a == $: success
274        if X is terminal: if X == a, pop and advance; else error
275        if X is non-terminal: look up M[X, a]; push RHS in reverse
276    """
277    trace: list[str] = []
278    input_tokens = tokens + [EOF_SYM]
279    stack = [EOF_SYM, grammar.start]
280    ip = 0   # input pointer
281
282    while stack:
283        X = stack[-1]
284        a = input_tokens[ip]
285
286        if X == EOF_SYM and a == EOF_SYM:
287            trace.append(f"  ACCEPT")
288            break
289
290        if X == a:   # terminal match
291            trace.append(f"  match {X!r}")
292            stack.pop()
293            ip += 1
294        elif grammar.is_nonterminal(X):
295            key = (X, a)
296            if key not in table:
297                raise ParseError(
298                    f"No entry in table for M[{X!r}, {a!r}]. Input: {input_tokens}"
299                )
300            rhs = table[key]
301            stack.pop()
302            display_rhs = ' '.join(rhs) if rhs != [EPSILON] else 'ε'
303            trace.append(f"  {X} -> {display_rhs}")
304            if rhs != [EPSILON]:
305                for sym in reversed(rhs):
306                    stack.append(sym)
307        else:
308            raise ParseError(f"Unexpected token {a!r}, expected {X!r}")
309
310    return trace
311
312
313# ---------------------------------------------------------------------------
314# Pretty printing helpers
315# ---------------------------------------------------------------------------
316
317def print_sets(label: str, sets: dict[str, set[str]], symbols: list[str]) -> None:
318    print(f"\n{label}:")
319    for sym in symbols:
320        s = sets.get(sym, set())
321        items = ', '.join(sorted(s))
322        print(f"  {sym:<8} = {{ {items} }}")
323
324
325def print_parse_table(
326    table: dict[tuple[str, str], list[str]],
327    grammar: Grammar,
328    terminals: list[str],
329) -> None:
330    nts = list(grammar.non_terminals)
331    terminals_all = sorted(terminals) + [EOF_SYM]
332    col_w = 18
333    print("\nLL(1) Parse Table:")
334    header = f"  {'':12}" + ''.join(f"  {t!r:^{col_w}}" for t in terminals_all)
335    print(header)
336    print("  " + "-" * (12 + len(terminals_all) * (col_w + 2)))
337    for nt in sorted(nts):
338        row = f"  {nt:<12}"
339        for t in terminals_all:
340            rhs = table.get((nt, t))
341            if rhs:
342                cell = nt + '->' + ' '.join(rhs)
343            else:
344                cell = ''
345            row += f"  {cell:^{col_w}}"
346        print(row)
347
348
349# ---------------------------------------------------------------------------
350# Demo Grammar and Tests
351# ---------------------------------------------------------------------------
352
353# Standard unambiguous expression grammar (left-recursion eliminated)
354EXPR_GRAMMAR = Grammar(
355    start='E',
356    productions={
357        'E':  [['T', "E'"]],
358        "E'": [['+', 'T', "E'"], ['-', 'T', "E'"], [EPSILON]],
359        'T':  [['F', "T'"]],
360        "T'": [['*', 'F', "T'"], ['/', 'F', "T'"], [EPSILON]],
361        'F':  [['(', 'E', ')'], ['num'], ['id']],
362    }
363)
364
365# Test token sequences (already tokenized)
366TEST_INPUTS = [
367    ['num', '+', 'num', '*', 'num'],        # num + num * num
368    ['id', '*', '(', 'num', '+', 'id', ')'],# id * (num + id)
369    ['num'],                                 # just a number
370    ['(', 'id', '+', 'num', ')', '*', 'id'],# (id + num) * id
371]
372
373
374def main():
375    print("=" * 60)
376    print("LL(1) Parser Demo")
377    print("=" * 60)
378
379    grammar = EXPR_GRAMMAR
380    print(f"\n{grammar}")
381
382    # Compute FIRST and FOLLOW sets
383    first = compute_first(grammar)
384    follow = compute_follow(grammar, first)
385
386    nt_order = ['E', "E'", 'T', "T'", 'F']
387    print_sets("FIRST sets", first, nt_order)
388    print_sets("FOLLOW sets", follow, nt_order)
389
390    # Build parse table
391    table = build_ll1_table(grammar, first, follow)
392    print_parse_table(table, grammar, sorted(grammar.terminals))
393
394    # Parse test inputs
395    print("\n--- Parse Traces ---")
396    for tokens in TEST_INPUTS:
397        expr_display = ' '.join(tokens)
398        print(f"\nInput: {expr_display}")
399        try:
400            trace = ll1_parse(tokens, table, grammar)
401            for step in trace:
402                print(step)
403        except ParseError as e:
404            print(f"  ERROR: {e}")
405
406
407if __name__ == "__main__":
408    main()