Skip to content

Commit

Permalink
chg: expression parsing optimisation
Browse files Browse the repository at this point in the history
- usage of lru_cache on `parse_expression` helps performance but it
  seems to be a diminishing return for cache sizes > 128, and the
  memory used by the cached strings and ExpressionLexerTokens can become
  significant if there are lots of long strings being parsed
- added option to get the parsed token type only, since calls through
  `is_single_token_expression` only care about the token type
- for token type checks, ignore empty strings or strings that would be
  too short to be that token type
  • Loading branch information
lindsay-stevens committed Nov 30, 2024
1 parent ff99279 commit 61c4df1
Showing 1 changed file with 14 additions and 5 deletions.
19 changes: 14 additions & 5 deletions pyxform/parsing/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from functools import lru_cache


def get_expression_lexer() -> re.Scanner:
def get_expression_lexer(name_only: bool = False) -> re.Scanner:
"""
Get a expression lexer (scanner) for parsing.
"""
Expand Down Expand Up @@ -61,7 +61,9 @@ def get_expression_lexer() -> re.Scanner:
}

def get_tokenizer(name):
def tokenizer(scan, value):
def tokenizer(scan, value) -> ExpLexerToken | str:
if name_only:
return name
return ExpLexerToken(name, value, scan.match.start(), scan.match.end())

return tokenizer
Expand All @@ -84,9 +86,10 @@ def __init__(self, name: str, value: str, start: int, end: int) -> None:


# Module-level lexers, built once at import time and shared by all callers.
_EXPRESSION_LEXER = get_expression_lexer()
# Name-only variant: emits just the token type name per match, avoiding the
# memory cost of full ExpLexerToken objects for simple token-type checks.
_TOKEN_NAME_LEXER = get_expression_lexer(name_only=True)


@lru_cache(maxsize=1024)
@lru_cache(maxsize=128)
def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
"""
Parse an expression.
def is_single_token_expression(expression: str, token_types: Iterable[str]) -> bool:
    """
    Does the expression contain a single token of one of the provided token types?

    :param expression: The expression to check. Leading/trailing whitespace is
      stripped before lexing; an empty or falsy expression is never a match.
    :param token_types: Token type names to match against, e.g. ("PYXFORM_REF",).
    :return: True if the expression lexes to exactly one token whose type name is
      in token_types, otherwise False.
    """
    if not expression:
        return False
    # The name-only lexer is used because only the token type name is needed
    # here; it avoids allocating ExpLexerToken objects.
    tokens, _ = _TOKEN_NAME_LEXER.scan(expression.strip())
    # Exactly one token, of an allowed type. Any unscanned remainder from the
    # lexer is ignored, preserving the original partial-scan behaviour.
    return 1 == len(tokens) and tokens[0] in token_types
def is_pyxform_reference(value: str) -> bool:
    """
    Does the input string contain only a valid Pyxform reference? e.g. ${my_question}

    :param value: String to check.
    :return: True if the string is a single PYXFORM_REF token, otherwise False.
    """
    # "${}" accounts for 3 characters, and a reference needs at least one more
    # character for the name inside, so length <= 3 can never be a reference.
    if not value or len(value) <= 3:
        return False
    return is_single_token_expression(expression=value, token_types=("PYXFORM_REF",))


def is_xml_tag(value: str) -> bool:
    """
    Does the input string contain only a valid XML tag / element name?

    :param value: String to check.
    :return: True if the string lexes to a single NAME token, otherwise False.
    """
    # Only lex non-empty input; anything falsy can never be a valid name.
    if value:
        return is_single_token_expression(expression=value, token_types=("NAME",))
    return False

0 comments on commit 61c4df1

Please sign in to comment.