1"""
201_lexer.py - Lexer/Scanner for a simple C-like language
3
4Demonstrates lexical analysis: the first phase of compilation.
5The lexer reads source code (a stream of characters) and produces
6a stream of tokens, discarding whitespace and comments.
7
8Topics covered:
9 - Regular expressions for token patterns
10 - Token types and Token dataclass
11 - Handling keywords vs identifiers
12 - String literal and character literal scanning
13 - Line/column tracking for error messages
14"""
15
16import re
17from dataclasses import dataclass
18from enum import Enum, auto
19from typing import Iterator


# ---------------------------------------------------------------------------
# Token types
# ---------------------------------------------------------------------------

class TokenType(Enum):
    # Literals
    INT_LITERAL = auto()
    FLOAT_LITERAL = auto()
    STRING_LITERAL = auto()
    CHAR_LITERAL = auto()
    BOOL_LITERAL = auto()

    # Keywords
    INT = auto()
    FLOAT = auto()
    BOOL = auto()
    CHAR = auto()
    VOID = auto()
    IF = auto()
    ELSE = auto()
    WHILE = auto()
    FOR = auto()
    RETURN = auto()
    TRUE = auto()
    FALSE = auto()
    NULL = auto()

    # Identifiers
    IDENTIFIER = auto()

    # Arithmetic operators
    PLUS = auto()     # +
    MINUS = auto()    # -
    STAR = auto()     # *
    SLASH = auto()    # /
    PERCENT = auto()  # %

    # Relational operators
    EQ = auto()   # ==
    NEQ = auto()  # !=
    LT = auto()   # <
    GT = auto()   # >
    LEQ = auto()  # <=
    GEQ = auto()  # >=

    # Logical operators
    AND = auto()  # &&
    OR = auto()   # ||
    NOT = auto()  # !

    # Assignment
    ASSIGN = auto()        # =
    PLUS_ASSIGN = auto()   # +=
    MINUS_ASSIGN = auto()  # -=
    STAR_ASSIGN = auto()   # *=
    SLASH_ASSIGN = auto()  # /=

    # Bitwise
    AMP = auto()     # &
    PIPE = auto()    # |
    CARET = auto()   # ^
    TILDE = auto()   # ~
    LSHIFT = auto()  # <<
    RSHIFT = auto()  # >>

    # Punctuation
    LPAREN = auto()     # (
    RPAREN = auto()     # )
    LBRACE = auto()     # {
    RBRACE = auto()     # }
    LBRACKET = auto()   # [
    RBRACKET = auto()   # ]
    SEMICOLON = auto()  # ;
    COLON = auto()      # :
    COMMA = auto()      # ,
    DOT = auto()        # .
    ARROW = auto()      # ->
    ELLIPSIS = auto()   # ...

    # Special
    EOF = auto()
    INVALID = auto()


KEYWORDS = {
    "int": TokenType.INT,
    "float": TokenType.FLOAT,
    "bool": TokenType.BOOL,
    "char": TokenType.CHAR,
    "void": TokenType.VOID,
    "if": TokenType.IF,
    "else": TokenType.ELSE,
    "while": TokenType.WHILE,
    "for": TokenType.FOR,
    "return": TokenType.RETURN,
    "true": TokenType.TRUE,
    "false": TokenType.FALSE,
    "null": TokenType.NULL,
}
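# Note: keywords are matched by the IDENTIFIER pattern first and then
# reclassified via this table in Lexer.tokenize(). An equivalent one-liner
# (a stylistic alternative, not what tokenize() below actually uses) would be:
#     token_type = KEYWORDS.get(value, TokenType.IDENTIFIER)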


# ---------------------------------------------------------------------------
# Token dataclass
# ---------------------------------------------------------------------------

@dataclass
class Token:
    type: TokenType
    value: str
    line: int
    column: int

    def __repr__(self) -> str:
        return f"Token({self.type.name}, {self.value!r}, line={self.line}, col={self.column})"
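
# For example, Token(TokenType.INT_LITERAL, "42", 3, 9) prints as:
#     Token(INT_LITERAL, '42', line=3, col=9)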


# ---------------------------------------------------------------------------
# Lexer
# ---------------------------------------------------------------------------

# Each entry: (TokenType, regex_pattern)
# Patterns are tried in order; Python's '|' alternation is first-match, not
# longest-match, so multi-character operators must precede their prefixes.
TOKEN_PATTERNS = [
    # Multi-character operators (must come before single-char)
    (TokenType.ELLIPSIS, r'\.\.\.'),
    (TokenType.ARROW, r'->'),
    (TokenType.EQ, r'=='),
    (TokenType.NEQ, r'!='),
    (TokenType.LEQ, r'<='),
    (TokenType.GEQ, r'>='),
    (TokenType.AND, r'&&'),
    (TokenType.OR, r'\|\|'),
    (TokenType.PLUS_ASSIGN, r'\+='),
    (TokenType.MINUS_ASSIGN, r'-='),
    (TokenType.STAR_ASSIGN, r'\*='),
    (TokenType.SLASH_ASSIGN, r'/='),
    (TokenType.LSHIFT, r'<<'),
    (TokenType.RSHIFT, r'>>'),

    # Float literal (must come before INT_LITERAL); the exponent group is
    # non-capturing so that only the named top-level groups capture
    (TokenType.FLOAT_LITERAL, r'\d+\.\d*(?:[eE][+-]?\d+)?|\d+[eE][+-]?\d+'),

    # Int literal (hex, octal, decimal)
    (TokenType.INT_LITERAL, r'0[xX][0-9a-fA-F]+|0[0-7]*|[1-9]\d*'),

    # String literal (backslash escapes allowed inside)
    (TokenType.STRING_LITERAL, r'"(?:[^"\\]|\\.)*"'),

    # Char literal (one plain character or one escape sequence)
    (TokenType.CHAR_LITERAL, r"'(?:[^'\\]|\\.)'"),

    # Identifier / keyword
    (TokenType.IDENTIFIER, r'[A-Za-z_]\w*'),

    # Single-char operators and punctuation
    (TokenType.PLUS, r'\+'),
    (TokenType.MINUS, r'-'),
    (TokenType.STAR, r'\*'),
    (TokenType.SLASH, r'/'),
    (TokenType.PERCENT, r'%'),
    (TokenType.LT, r'<'),
    (TokenType.GT, r'>'),
    (TokenType.NOT, r'!'),
    (TokenType.ASSIGN, r'='),
    (TokenType.AMP, r'&'),
    (TokenType.PIPE, r'\|'),
    (TokenType.CARET, r'\^'),
    (TokenType.TILDE, r'~'),
    (TokenType.LPAREN, r'\('),
    (TokenType.RPAREN, r'\)'),
    (TokenType.LBRACE, r'\{'),
    (TokenType.RBRACE, r'\}'),
    (TokenType.LBRACKET, r'\['),
    (TokenType.RBRACKET, r'\]'),
    (TokenType.SEMICOLON, r';'),
    (TokenType.COLON, r':'),
    (TokenType.COMMA, r','),
    (TokenType.DOT, r'\.'),
]
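
# Ordering matters: the master regex below tries these alternatives left to
# right, so '!=' must precede '!' and '<=' must precede '<'. Otherwise
# "a != b" would lex as NOT followed by ASSIGN instead of a single NEQ
# (the classic maximal-munch concern).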

# Compile all patterns into one master regex with named groups
_MASTER_PATTERN = re.compile(
    '|'.join(f'(?P<T{i}_{tt.name}>{pat})' for i, (tt, pat) in enumerate(TOKEN_PATTERNS))
)
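
# A quick sketch of how a match gets classified (group names depend on the
# TOKEN_PATTERNS order above; PLUS_ASSIGN is entry 8):
#     m = _MASTER_PATTERN.match("x += 1", 2)
#     m.group()                  # '+='
#     m.group('T8_PLUS_ASSIGN')  # '+=' -- every other named group is None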


class LexerError(Exception):
    def __init__(self, message: str, line: int, column: int):
        super().__init__(f"Lexer error at line {line}, col {column}: {message}")
        self.line = line
        self.column = column


class Lexer:
    """
    Tokenizes a source string into a list of Tokens.
    Handles:
      - Single-line comments // ...
      - Block comments /* ... */
      - Whitespace (ignored)
      - All token types defined above
    """

    def __init__(self, source: str):
        self.source = source
        self.pos = 0
        self.line = 1
        self.column = 1

    def _skip_whitespace_and_comments(self) -> None:
        while self.pos < len(self.source):
            ch = self.source[self.pos]
            if ch in ' \t\r':
                self.pos += 1
                self.column += 1
            elif ch == '\n':
                self.pos += 1
                self.line += 1
                self.column = 1
            elif self.source[self.pos:self.pos+2] == '//':
                # Single-line comment: skip to end of line; the '\n' branch
                # above then handles the line/column bookkeeping
                while self.pos < len(self.source) and self.source[self.pos] != '\n':
                    self.pos += 1
                    self.column += 1
            elif self.source[self.pos:self.pos+2] == '/*':
                # Block comment; remember where it started for error reporting
                start_line, start_col = self.line, self.column
                self.pos += 2
                self.column += 2
                while self.pos < len(self.source):
                    if self.source[self.pos:self.pos+2] == '*/':
                        self.pos += 2
                        self.column += 2
                        break
                    elif self.source[self.pos] == '\n':
                        self.pos += 1
                        self.line += 1
                        self.column = 1
                    else:
                        self.pos += 1
                        self.column += 1
                else:
                    # while/else: runs only if the loop ended without 'break',
                    # i.e. we hit end of input before finding '*/'
                    raise LexerError("Unterminated block comment", start_line, start_col)
            else:
                break

    def tokenize(self) -> list[Token]:
        tokens: list[Token] = []
        while True:
            self._skip_whitespace_and_comments()
            if self.pos >= len(self.source):
                tokens.append(Token(TokenType.EOF, '', self.line, self.column))
                break

            m = _MASTER_PATTERN.match(self.source, self.pos)
            if not m:
                raise LexerError(
                    f"Unexpected character {self.source[self.pos]!r}",
                    self.line, self.column
                )

            value = m.group()
            tok_line, tok_col = self.line, self.column

            # Determine which pattern matched by probing the named groups
            token_type = TokenType.INVALID
            for i, (tt, _) in enumerate(TOKEN_PATTERNS):
                group_name = f'T{i}_{tt.name}'
                if m.group(group_name) is not None:
                    token_type = tt
                    break

            # Reclassify identifiers that are keywords
            if token_type == TokenType.IDENTIFIER and value in KEYWORDS:
                token_type = KEYWORDS[value]

            tokens.append(Token(token_type, value, tok_line, tok_col))

            # Advance position and column; string literals may contain
            # newlines, so count them and track the column past the last one
            newlines = value.count('\n')
            if newlines:
                self.line += newlines
                self.column = len(value) - value.rfind('\n')
            else:
                self.column += len(value)
            self.pos = m.end()

        return tokens
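

# A streaming variant (a minimal sketch, not used by the demo below): real
# lexers often yield tokens on demand rather than building the whole list up
# front. This wrapper keeps the eager tokenize() as the single implementation
# while exposing an Iterator interface.
def iter_tokens(source: str) -> Iterator[Token]:
    """Yield tokens one at a time, ending with the EOF token."""
    yield from Lexer(source).tokenize()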


# ---------------------------------------------------------------------------
# Demo
# ---------------------------------------------------------------------------

SAMPLE_PROGRAM = r"""
// Compute the nth Fibonacci number
int fibonacci(int n) {
    if (n <= 1) {
        return n;
    }
    int a = 0;
    int b = 1;
    int i = 2;
    while (i <= n) {
        int tmp = a + b;
        a = b;
        b = tmp;
        i += 1;
    }
    return b;
}

/* Entry point */
int main() {
    float pi = 3.14159;
    char greeting[] = "Hello, World!";
    bool flag = true;
    int result = fibonacci(10);
    return 0;
}
"""


def main():
    print("=" * 60)
    print("Lexer Demo: Tokenizing a C-like program")
    print("=" * 60)
    print("\nSource program:")
    print(SAMPLE_PROGRAM)

    lexer = Lexer(SAMPLE_PROGRAM)
    tokens = lexer.tokenize()

    print(f"\nProduced {len(tokens)} tokens:\n")
    print(f"{'TYPE':<22} {'VALUE':<20} {'LINE':>4} {'COL':>4}")
    print("-" * 56)
    for tok in tokens:
        if tok.type == TokenType.EOF:
            break
        print(f"{tok.type.name:<22} {tok.value!r:<20} {tok.line:>4} {tok.column:>4}")

    print("\n--- Token type summary ---")
    counts = Counter(tok.type.name for tok in tokens if tok.type != TokenType.EOF)
    for name, count in sorted(counts.items(), key=lambda x: -x[1]):
        print(f"  {name:<22}: {count}")
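
    # Error-handling demo: '@' matches no token pattern, so the lexer should
    # raise LexerError with an accurate line/column (illustrative check)
    print("\n--- Error handling demo ---")
    try:
        Lexer("int x = @;").tokenize()
    except LexerError as e:
        print(f"  {e}")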


if __name__ == "__main__":
    main()