  1"""
  201_lexer.py - Lexer/Scanner for a simple C-like language
  3
  4Demonstrates lexical analysis: the first phase of compilation.
  5The lexer reads source code (a stream of characters) and produces
  6a stream of tokens, discarding whitespace and comments.
  7
  8Topics covered:
  9  - Regular expressions for token patterns
 10  - Token types and Token dataclass
 11  - Handling keywords vs identifiers
 12  - String literal and character literal scanning
 13  - Line/column tracking for error messages
 14"""

import re
from collections import Counter
from dataclasses import dataclass
from enum import Enum, auto

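# Intended use, as a quick sketch (the doctest-style lines show what
# tokenize() should produce for a tiny input; all names are defined below):
#
#     >>> [t.type.name for t in Lexer("int x = 42;").tokenize()]
#     ['INT', 'IDENTIFIER', 'ASSIGN', 'INT_LITERAL', 'SEMICOLON', 'EOF']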

# ---------------------------------------------------------------------------
# Token types
# ---------------------------------------------------------------------------

class TokenType(Enum):
    # Literals
    INT_LITERAL    = auto()
    FLOAT_LITERAL  = auto()
    STRING_LITERAL = auto()
    CHAR_LITERAL   = auto()
    BOOL_LITERAL   = auto()

    # Keywords
    INT    = auto()
    FLOAT  = auto()
    BOOL   = auto()
    CHAR   = auto()
    VOID   = auto()
    IF     = auto()
    ELSE   = auto()
    WHILE  = auto()
    FOR    = auto()
    RETURN = auto()
    TRUE   = auto()
    FALSE  = auto()
    NULL   = auto()

    # Identifiers
    IDENTIFIER = auto()

    # Arithmetic operators
    PLUS    = auto()   # +
    MINUS   = auto()   # -
    STAR    = auto()   # *
    SLASH   = auto()   # /
    PERCENT = auto()   # %

    # Relational operators
    EQ  = auto()   # ==
    NEQ = auto()   # !=
    LT  = auto()   # <
    GT  = auto()   # >
    LEQ = auto()   # <=
    GEQ = auto()   # >=

    # Logical operators
    AND = auto()   # &&
    OR  = auto()   # ||
    NOT = auto()   # !

    # Assignment
    ASSIGN       = auto()   # =
    PLUS_ASSIGN  = auto()   # +=
    MINUS_ASSIGN = auto()   # -=
    STAR_ASSIGN  = auto()   # *=
    SLASH_ASSIGN = auto()   # /=

    # Bitwise
    AMP    = auto()   # &
    PIPE   = auto()   # |
    CARET  = auto()   # ^
    TILDE  = auto()   # ~
    LSHIFT = auto()   # <<
    RSHIFT = auto()   # >>

    # Punctuation
    LPAREN    = auto()   # (
    RPAREN    = auto()   # )
    LBRACE    = auto()   # {
    RBRACE    = auto()   # }
    LBRACKET  = auto()   # [
    RBRACKET  = auto()   # ]
    SEMICOLON = auto()   # ;
    COLON     = auto()   # :
    COMMA     = auto()   # ,
    DOT       = auto()   # .
    ARROW     = auto()   # ->
    ELLIPSIS  = auto()   # ...

    # Special
    EOF     = auto()
    INVALID = auto()


KEYWORDS = {
    "int":    TokenType.INT,
    "float":  TokenType.FLOAT,
    "bool":   TokenType.BOOL,
    "char":   TokenType.CHAR,
    "void":   TokenType.VOID,
    "if":     TokenType.IF,
    "else":   TokenType.ELSE,
    "while":  TokenType.WHILE,
    "for":    TokenType.FOR,
    "return": TokenType.RETURN,
    "true":   TokenType.TRUE,
    "false":  TokenType.FALSE,
    "null":   TokenType.NULL,
}
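
# The scanner has no separate keyword patterns: it matches the longer
# IDENTIFIER rule (maximal munch) and then reclassifies the lexeme through
# this table, so "while" becomes WHILE but "while_loop" stays a plain
# IDENTIFIER.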


# ---------------------------------------------------------------------------
# Token dataclass
# ---------------------------------------------------------------------------

@dataclass
class Token:
    type: TokenType
    value: str
    line: int
    column: int

    def __repr__(self) -> str:
        return f"Token({self.type.name}, {self.value!r}, line={self.line}, col={self.column})"

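# For example, Token(TokenType.IDENTIFIER, 'fibonacci', 3, 5) prints as
#   Token(IDENTIFIER, 'fibonacci', line=3, col=5)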


# ---------------------------------------------------------------------------
# Lexer
# ---------------------------------------------------------------------------

# Each entry: (TokenType, regex_pattern)
# Patterns are tried in order; first match wins, so multi-character operators
# must be listed before their single-character prefixes (a sanity check
# follows the list).
TOKEN_PATTERNS = [
    # Multi-character operators (must come before single-char)
    (TokenType.ELLIPSIS,     r'\.\.\.'),
    (TokenType.ARROW,        r'->'),
    (TokenType.EQ,           r'=='),
    (TokenType.NEQ,          r'!='),
    (TokenType.LEQ,          r'<='),
    (TokenType.GEQ,          r'>='),
    (TokenType.AND,          r'&&'),
    (TokenType.OR,           r'\|\|'),
    (TokenType.PLUS_ASSIGN,  r'\+='),
    (TokenType.MINUS_ASSIGN, r'-='),
    (TokenType.STAR_ASSIGN,  r'\*='),
    (TokenType.SLASH_ASSIGN, r'/='),
    (TokenType.LSHIFT,       r'<<'),
    (TokenType.RSHIFT,       r'>>'),

    # Float literal (must come before INT_LITERAL, since alternation is
    # first-match, not longest-match)
    (TokenType.FLOAT_LITERAL, r'\d+\.\d*(?:[eE][+-]?\d+)?|\d+[eE][+-]?\d+'),
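    # Accepts "1.", "1.5e-3", and "1e9"; a leading-dot ".5" is NOT matched
    # and would lex as DOT followed by INT_LITERAL instead.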

    # Int literal (hex, octal, decimal)
    (TokenType.INT_LITERAL,  r'0[xX][0-9a-fA-F]+|0[0-7]*|[1-9]\d*'),
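    # A lone "0" matches via the octal branch; an ill-formed octal like "08"
    # splits into two INT_LITERAL tokens ("0", then "8"), which a later phase
    # could flag, much as C rejects invalid octal constants.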

    # String literal
    (TokenType.STRING_LITERAL, r'"(?:[^"\\]|\\.)*"'),
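    # The body alternates "any char except quote/backslash" with "backslash
    # plus anything", so escaped quotes like "\"" stay inside a single token.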

    # Char literal
    (TokenType.CHAR_LITERAL, r"'(?:[^'\\]|\\.)'"),

    # Identifier / keyword
    (TokenType.IDENTIFIER,   r'[A-Za-z_]\w*'),

    # Single-char operators and punctuation
    (TokenType.PLUS,      r'\+'),
    (TokenType.MINUS,     r'-'),
    (TokenType.STAR,      r'\*'),
    (TokenType.SLASH,     r'/'),
    (TokenType.PERCENT,   r'%'),
    (TokenType.LT,        r'<'),
    (TokenType.GT,        r'>'),
    (TokenType.NOT,       r'!'),
    (TokenType.ASSIGN,    r'='),
    (TokenType.AMP,       r'&'),
    (TokenType.PIPE,      r'\|'),
    (TokenType.CARET,     r'\^'),
    (TokenType.TILDE,     r'~'),
    (TokenType.LPAREN,    r'\('),
    (TokenType.RPAREN,    r'\)'),
    (TokenType.LBRACE,    r'\{'),
    (TokenType.RBRACE,    r'\}'),
    (TokenType.LBRACKET,  r'\['),
    (TokenType.RBRACKET,  r'\]'),
    (TokenType.SEMICOLON, r';'),
    (TokenType.COLON,     r':'),
    (TokenType.COMMA,     r','),
    (TokenType.DOT,       r'\.'),
]
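
# Sanity check for the ordering above -- a sketch of expected behaviour using
# only names from this module. With the single-char entries first, "<=" would
# lex as LT then ASSIGN instead of LEQ:
#
#     >>> [t.type.name for t in Lexer("a <= b").tokenize()]
#     ['IDENTIFIER', 'LEQ', 'IDENTIFIER', 'EOF']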

# Compile all patterns into one master regex with named groups.
_MASTER_PATTERN = re.compile(
    '|'.join(f'(?P<T{i}_{tt.name}>{pat})' for i, (tt, pat) in enumerate(TOKEN_PATTERNS))
)
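
# Python's re tries alternation branches left to right at each position, so
# the TOKEN_PATTERNS order survives the merge into one compiled pattern. The
# "T{i}_" prefix keeps every group name unique and lets tokenize() map a
# match back to its TokenType.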


class LexerError(Exception):
    def __init__(self, message: str, line: int, column: int):
        super().__init__(f"Lexer error at line {line}, col {column}: {message}")
        self.line = line
        self.column = column


class Lexer:
    """
    Tokenizes a source string into a list of Tokens.
    Handles:
      - Single-line comments  // ...
      - Block comments        /* ... */
      - Whitespace (ignored)
      - All token types defined above
    """

    def __init__(self, source: str):
        self.source = source
        self.pos = 0
        self.line = 1
        self.column = 1

    def _skip_whitespace_and_comments(self) -> None:
        while self.pos < len(self.source):
            ch = self.source[self.pos]
            if ch in ' \t\r':
                self.pos += 1
                self.column += 1
            elif ch == '\n':
                self.pos += 1
                self.line += 1
                self.column = 1
            elif self.source[self.pos:self.pos+2] == '//':
                # Single-line comment: skip to end of line, keeping the column
                # current so EOF gets a sensible position when the file ends
                # in a comment without a trailing newline.
                while self.pos < len(self.source) and self.source[self.pos] != '\n':
                    self.pos += 1
                    self.column += 1
            elif self.source[self.pos:self.pos+2] == '/*':
                # Block comment
                start_line, start_col = self.line, self.column
                self.pos += 2
                self.column += 2
                while self.pos < len(self.source):
                    if self.source[self.pos:self.pos+2] == '*/':
                        self.pos += 2
                        self.column += 2
                        break
                    elif self.source[self.pos] == '\n':
                        self.pos += 1
                        self.line += 1
                        self.column = 1
                    else:
                        self.pos += 1
                        self.column += 1
                else:
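                    # while/else: this branch runs only when the loop scanned
                    # to end of input without breaking on "*/", i.e. the
                    # comment was never closed.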
                    raise LexerError("Unterminated block comment", start_line, start_col)
            else:
                break

    def tokenize(self) -> list[Token]:
        tokens: list[Token] = []
        while True:
            self._skip_whitespace_and_comments()
            if self.pos >= len(self.source):
                tokens.append(Token(TokenType.EOF, '', self.line, self.column))
                break

            m = _MASTER_PATTERN.match(self.source, self.pos)
            if not m:
                raise LexerError(
                    f"Unexpected character {self.source[self.pos]!r}",
                    self.line, self.column
                )

            value = m.group()
            tok_line, tok_col = self.line, self.column

            # Determine which pattern matched
            token_type = TokenType.INVALID
            for i, (tt, _) in enumerate(TOKEN_PATTERNS):
                group_name = f'T{i}_{tt.name}'
                if m.group(group_name) is not None:
                    token_type = tt
                    break

            # Reclassify identifiers that are keywords
            if token_type == TokenType.IDENTIFIER and value in KEYWORDS:
                token_type = KEYWORDS[value]

            tokens.append(Token(token_type, value, tok_line, tok_col))

            # Advance position and column
            newlines = value.count('\n')
            if newlines:
                self.line += newlines
                self.column = len(value) - value.rfind('\n')
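                # A token can span lines (a string literal may hold a raw
                # newline). Worked example: value "ab\ncd" has len 5 and
                # rfind('\n') 2, so the next column is 3 -- one past the two
                # characters after the last newline.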
            else:
                self.column += len(value)
            self.pos = m.end()

        return tokens


# ---------------------------------------------------------------------------
# Demo
# ---------------------------------------------------------------------------

SAMPLE_PROGRAM = r"""
// Compute the nth Fibonacci number
int fibonacci(int n) {
    if (n <= 1) {
        return n;
    }
    int a = 0;
    int b = 1;
    int i = 2;
    while (i <= n) {
        int tmp = a + b;
        a = b;
        b = tmp;
        i += 1;
    }
    return b;
}

/* Entry point */
int main() {
    float pi = 3.14159;
    char greeting[] = "Hello, World!";
    bool flag = true;
    int result = fibonacci(10);
    return 0;
}
"""


def main():
    print("=" * 60)
    print("Lexer Demo: Tokenizing a C-like program")
    print("=" * 60)
    print("\nSource program:")
    print(SAMPLE_PROGRAM)

    lexer = Lexer(SAMPLE_PROGRAM)
    tokens = lexer.tokenize()

    print(f"\nProduced {len(tokens)} tokens (including EOF):\n")
    print(f"{'TYPE':<22} {'VALUE':<20} {'LINE':>4} {'COL':>4}")
    print("-" * 56)
    for tok in tokens:
        if tok.type == TokenType.EOF:
            break
        print(f"{tok.type.name:<22} {tok.value!r:<20} {tok.line:>4} {tok.column:>4}")
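
    # The first rows of the table should read (line 1 of the sample program
    # is blank and line 2 is a comment, so 'int' lands at line 3, column 1):
    #   INT         'int'           3    1
    #   IDENTIFIER  'fibonacci'     3    5
    #   LPAREN      '('             3   14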

    print("\n--- Token type summary ---")
    counts = Counter(tok.type.name for tok in tokens if tok.type != TokenType.EOF)
    for name, count in sorted(counts.items(), key=lambda x: -x[1]):
        print(f"  {name:<22}: {count}")


if __name__ == "__main__":
    main()