@handle_exceptions(default_return_value=[], raise_on_error=False)
def extract_image_urls(text: str) -> list[dict[str, str]]:
    """Extract alt text and URLs from img tags in the given text. Excludes SVG images.

    Only tags whose double-quoted ``alt`` attribute appears before a
    double-quoted ``src`` attribute are matched.

    Example:
        Input:  <img alt="Screenshot 2024-12-12 at 6 25 41 PM" src="https://example.com/shot.png">
        Output: [{"alt": "Screenshot 2024-12-12 at 6 25 41 PM",
                  "url": "https://example.com/shot.png"}]

    Args:
        text: Text that may contain HTML img tags.

    Returns:
        One ``{"alt": ..., "url": ...}`` dict per matched non-SVG image, in
        order of appearance. Empty list when nothing matches.
    """
    # Anchor on "<img" so alt/src pairs inside other tags are not picked up.
    pattern = r'<img[^>]*alt="([^"]*)"[^>]*src="([^"]*)"[^>]*>'

    images: list[dict[str, str]] = []
    for alt, url in findall(pattern, text):
        # Judge the extension on the path only: badge-style URLs such as
        # ".../badge.svg?branch=main" or ".../logo.svg#x" are still SVGs.
        path = url.partition("#")[0].partition("?")[0]
        if path.lower().endswith(".svg"):
            continue
        images.append({"alt": alt, "url": url})
    return images