From 61c4df1e9f984e176637c251edb6c05fe122b1ae Mon Sep 17 00:00:00 2001 From: lindsay stevens Date: Sat, 30 Nov 2024 20:01:38 +1100 Subject: [PATCH] chg: expression parsing optimisation - usage of lru_cache on `parse_expression` helps performance but it seems to be a diminishing return for cache sizes > 128, and the memory used by the cached strings and ExpressionLexerTokens can become significant if there are lots of long strings being parsed - added option to get the parsed token type only, since calls through `is_single_token_expression` only care about the token type - for token type checks, ignore empty strings or strings that would be too short to be that token type --- pyxform/parsing/expression.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pyxform/parsing/expression.py b/pyxform/parsing/expression.py index 32c6509b..335864e6 100644 --- a/pyxform/parsing/expression.py +++ b/pyxform/parsing/expression.py @@ -3,7 +3,7 @@ from functools import lru_cache -def get_expression_lexer() -> re.Scanner: +def get_expression_lexer(name_only: bool = False) -> re.Scanner: """ Get a expression lexer (scanner) for parsing. """ @@ -61,7 +61,9 @@ def get_expression_lexer() -> re.Scanner: } def get_tokenizer(name): - def tokenizer(scan, value): + def tokenizer(scan, value) -> ExpLexerToken | str: + if name_only: + return name return ExpLexerToken(name, value, scan.match.start(), scan.match.end()) return tokenizer @@ -84,9 +86,10 @@ def __init__(self, name: str, value: str, start: int, end: int) -> None: _EXPRESSION_LEXER = get_expression_lexer() +_TOKEN_NAME_LEXER = get_expression_lexer(name_only=True) -@lru_cache(maxsize=1024) +@lru_cache(maxsize=128) def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]: """ Parse an expression. @@ -104,8 +107,10 @@ def is_single_token_expression(expression: str, token_types: Iterable[str]) -> b """ Does the expression contain single token of one of the provided token types? 
""" - tokens, _ = parse_expression(expression.strip()) - if 1 == len(tokens) and tokens[0].name in token_types: + if not expression: + return False + tokens, _ = _TOKEN_NAME_LEXER.scan(expression.strip()) + if 1 == len(tokens) and tokens[0] in token_types: return True else: return False @@ -115,6 +120,8 @@ def is_pyxform_reference(value: str) -> bool: """ Does the input string contain only a valid Pyxform reference? e.g. ${my_question} """ + if not value or len(value) <= 3: # Needs 3 characters for "${}", plus a name inside. + return False return is_single_token_expression(expression=value, token_types=("PYXFORM_REF",)) @@ -122,4 +129,6 @@ def is_xml_tag(value: str) -> bool: """ Does the input string contain only a valid XML tag / element name? """ + if not value: + return False return is_single_token_expression(expression=value, token_types=("NAME",))