From 61c4df1e9f984e176637c251edb6c05fe122b1ae Mon Sep 17 00:00:00 2001 From: lindsay stevens Date: Sat, 30 Nov 2024 20:01:38 +1100 Subject: [PATCH] chg: expression parsing optimisation - usage of lru_cache on `parse_expression` helps performance but it seems to be a diminishing return for cache sizes > 128, and the memory used by the cached strings and ExpressionLexerTokens can become significant if there are lots of long strings being parsed - added option to get the parsed token type only, since calls through `is_single_token_expression` only care about the token type - for token type checks, ignore empty strings or strings that would be too short to be that token type --- pyxform/parsing/expression.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pyxform/parsing/expression.py b/pyxform/parsing/expression.py index 32c6509b..335864e6 100644 --- a/pyxform/parsing/expression.py +++ b/pyxform/parsing/expression.py @@ -3,7 +3,7 @@ from functools import lru_cache -def get_expression_lexer() -> re.Scanner: +def get_expression_lexer(name_only: bool = False) -> re.Scanner: """ Get a expression lexer (scanner) for parsing. """ @@ -61,7 +61,9 @@ def get_expression_lexer() -> re.Scanner: } def get_tokenizer(name): - def tokenizer(scan, value): + def tokenizer(scan, value) -> ExpLexerToken | str: + if name_only: + return name return ExpLexerToken(name, value, scan.match.start(), scan.match.end()) return tokenizer @@ -84,9 +86,10 @@ def __init__(self, name: str, value: str, start: int, end: int) -> None: _EXPRESSION_LEXER = get_expression_lexer() +_TOKEN_NAME_LEXER = get_expression_lexer(name_only=True) -@lru_cache(maxsize=1024) +@lru_cache(maxsize=128) def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]: """ Parse an expression. @@ -104,8 +107,10 @@ def is_single_token_expression(expression: str, token_types: Iterable[str]) -> b """ Does the expression contain single token of one of the provided token types? 
""" - tokens, _ = parse_expression(expression.strip()) - if 1 == len(tokens) and tokens[0].name in token_types: + if not expression: + return False + tokens, _ = _TOKEN_NAME_LEXER.scan(expression.strip()) + if 1 == len(tokens) and tokens[0] in token_types: return True else: return False @@ -115,6 +120,8 @@ def is_pyxform_reference(value: str) -> bool: """ Does the input string contain only a valid Pyxform reference? e.g. ${my_question} """ + if not value or len(value) <= 3: # Needs 3 characters for "${}", plus a name inside. + return False return is_single_token_expression(expression=value, token_types=("PYXFORM_REF",)) @@ -122,4 +129,6 @@ def is_xml_tag(value: str) -> bool: """ Does the input string contain only a valid XML tag / element name? """ + if not value: + return False return is_single_token_expression(expression=value, token_types=("NAME",))