-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexer.py
48 lines (38 loc) · 1.62 KB
/
lexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import re
class Token:
    """A lexical token: a category tag (``type``) plus the matched text (``value``)."""

    def __init__(self, type, value):
        # Parameter names kept as-is: they are part of the public call signature.
        self.type, self.value = type, value

    def __repr__(self):
        # Debug-friendly rendering, e.g. Token(NUMBER, 42).
        return "Token({}, {})".format(self.type, self.value)
# Vocabulary of the language, grouped by token category.
# NOTE(review): keywords/operators contain Hebrew (RTL) text, including the
# Hebrew alternative plus sign U+FB29 ("﬩") alongside ASCII "+" — presumably
# both spellings are accepted interchangeably; confirm with the language spec.
TOKENS = {
    'KEYWORDS': ["תניא", "בעא_מיניה", "אמר_ליה", "הדרן_עלך", "אי", "אי_נמי", "אידך", "אין", "לא"],
    # Multi-character operators are listed before their single-character
    # prefixes (e.g. ">=" before ">") so regex alternation matches longest-first.
    'OPERATORS': ["﬩=", "+=", "-=", "*=", "/=", "%=", "﬩﬩", "++", "--", "==", "!=", ">=", "<=", ">", "<", "&&", "||", "﬩", "+", "-", "*", "/", "%", "="],
    'DELIMITERS': ["{", "}", "(", ")", "[", "]", ",", ";"]
}
# Ordered (name, regex) pairs. Order is significant: regex alternation tries
# groups left to right, so KEYWORD must precede IDENTIFIER and the pre-sorted
# multi-character operators must precede their single-character prefixes.
TOKEN_PATTERNS = [
    ('KEYWORD', r'\b(?:' + '|'.join(TOKENS['KEYWORDS']) + r')\b'),
    ('NUMBER', r'\b\d+\b'),
    ('STRING', r'"[^"]*"'),
    ('OPERATOR', '|'.join(map(re.escape, TOKENS['OPERATORS']))),
    ('DELIMITER', '|'.join(map(re.escape, TOKENS['DELIMITERS']))),
    ('IDENTIFIER', r'\b\w+\b'),
    ('WHITESPACE', r'\s+'),
    ('COMMENT_SINGLE', r'\\.*'),  # backslash starts a to-end-of-line comment
]

# Fold every pattern into one alternation of named groups; match.lastgroup
# then reports which category fired.
combined_pattern = '|'.join(
    '(?P<{}>{})'.format(name, pattern) for name, pattern in TOKEN_PATTERNS
)
compiled_pattern = re.compile(combined_pattern)
def lexer(code):
    """Tokenize *code* into a list of Token objects.

    Whitespace and single-line comments are discarded; string tokens have
    their surrounding double quotes stripped. Characters matching no
    pattern are silently skipped (finditer simply moves past them).
    """
    result = []
    for m in compiled_pattern.finditer(code):
        kind = m.lastgroup  # name of the alternation group that matched
        if kind in ('WHITESPACE', 'COMMENT_SINGLE'):
            continue
        text = m.group(kind)
        if kind == 'STRING':
            text = text[1:-1]  # drop the enclosing double quotes
        result.append(Token(kind, text))
    return result