From 7a6a151d7571ef215480a3a72bc5cf371afec278 Mon Sep 17 00:00:00 2001 From: Robin Date: Wed, 5 Jun 2024 16:19:22 -0400 Subject: [PATCH] Don't consider textual characters to be emoji MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We were using emojibase-regex to match emoji within messages. However, the docs (https://emojibase.dev/docs/regex/) state that this regex matches both emoji and text presentation characters. This is not what we want, and will result in false positives for characters like '↔' that could turn into an emoji if paired with a variation selector. Unfortunately, none of the other regexes provided by Emojibase do what we want either (https://github.com/milesj/emojibase/issues/174). In the meantime, browser support for the RGI_Emoji character sequence class has made it feasible to write an emoji regex by hand, so that's what I've done. --- .eslintrc.js | 10 +++++ src/HtmlUtils.tsx | 17 +++++--- .../views/rooms/SendMessageComposer.tsx | 2 +- src/editor/parts.ts | 11 +++-- test/HtmlUtils-test.tsx | 12 ++++++ test/__snapshots__/HtmlUtils-test.tsx.snap | 41 +++++++++++++++++++ 6 files changed, 81 insertions(+), 12 deletions(-) diff --git a/.eslintrc.js b/.eslintrc.js index 4bec4e832038..ddcd1ac23989 100644 --- a/.eslintrc.js +++ b/.eslintrc.js @@ -70,6 +70,11 @@ module.exports = { name: "matrix-react-sdk/", message: "Please use matrix-react-sdk/src/index instead", }, + { + name: "emojibase-regex", + message: + "This regex doesn't actually test for emoji. See the docs at https://emojibase.dev/docs/regex/ and prefer our own EMOJI_REGEX from HtmlUtils.", + }, ], patterns: [ { @@ -136,6 +141,11 @@ module.exports = { ], message: "Please use matrix-js-sdk/src/matrix instead", }, + { + group: ["emojibase-regex/emoji*"], + message: + "This regex doesn't actually test for emoji. 
See the docs at https://emojibase.dev/docs/regex/ and prefer our own EMOJI_REGEX from HtmlUtils.", + }, ], }, ], diff --git a/src/HtmlUtils.tsx b/src/HtmlUtils.tsx index b63ed1dcf0ce..eba97cd6aeb6 100644 --- a/src/HtmlUtils.tsx +++ b/src/HtmlUtils.tsx @@ -20,7 +20,6 @@ limitations under the License. import React, { LegacyRef, ReactNode } from "react"; import sanitizeHtml from "sanitize-html"; import classNames from "classnames"; -import EMOJIBASE_REGEX from "emojibase-regex"; import katex from "katex"; import { decode } from "html-entities"; import { IContent } from "matrix-js-sdk/src/matrix"; @@ -46,10 +45,18 @@ const SURROGATE_PAIR_PATTERN = /([\ud800-\udbff])([\udc00-\udfff])/; const SYMBOL_PATTERN = /([\u2100-\u2bff])/; // Regex pattern for non-emoji characters that can appear in an "all-emoji" message -// (Zero-Width Joiner, Zero-Width Space, Emoji presentation character, other whitespace) -const EMOJI_SEPARATOR_REGEX = /[\u200D\u200B\s]|\uFE0F/g; +// (Zero-Width Space, other whitespace) +const EMOJI_SEPARATOR_REGEX = /[\u200B\s]/g; -const BIGEMOJI_REGEX = new RegExp(`^(${EMOJIBASE_REGEX.source})+$`, "i"); +// Regex for emoji. This includes any RGI_Emoji sequence followed by an optional +// emoji presentation VS (U+FE0F), but not those sequences that are followed by +// a text presentation VS (U+FE0E). We also count lone regional indicators +// (U+1F1E6-U+1F1FF). Technically this regex produces false negatives for emoji +// followed by U+FE0E when the emoji doesn't have a text variant, but in +// practice this doesn't matter. +export const EMOJI_REGEX = /\p{RGI_Emoji}(?!\uFE0E)(?:(? 
{ expect(html).toMatchInlineSnapshot(`"test foo <b>bar"`); }); + it("generates big emoji for emoji made of multiple characters", () => { + const { asFragment } = render(bodyToHtml({ body: "👨‍👩‍👧‍👦 ↔️ 🇮🇸", msgtype: "m.text" }, [], {}) as ReactElement); + + expect(asFragment()).toMatchSnapshot(); + }); + it("should generate big emoji for an emoji-only reply to a message", () => { const { asFragment } = render( bodyToHtml( @@ -132,6 +138,12 @@ describe("bodyToHtml", () => { expect(asFragment()).toMatchSnapshot(); }); + it("does not mistake characters in text presentation mode for emoji", () => { + const { asFragment } = render(bodyToHtml({ body: "↔ ❗︎", msgtype: "m.text" }, [], {}) as ReactElement); + + expect(asFragment()).toMatchSnapshot(); + }); + describe("feature_latex_maths", () => { beforeEach(() => { jest.spyOn(SettingsStore, "getValue").mockImplementation((feature) => feature === "feature_latex_maths"); diff --git a/test/__snapshots__/HtmlUtils-test.tsx.snap b/test/__snapshots__/HtmlUtils-test.tsx.snap index c33cc46433d3..c69eaa7d952a 100644 --- a/test/__snapshots__/HtmlUtils-test.tsx.snap +++ b/test/__snapshots__/HtmlUtils-test.tsx.snap @@ -1,5 +1,16 @@ // Jest Snapshot v1, https://goo.gl/fbAQLP +exports[`bodyToHtml does not mistake characters in text presentation mode for emoji 1`] = ` + + + ↔ ❗︎ + + +`; + exports[`bodyToHtml feature_latex_maths should not mangle code blocks 1`] = `"

hello

$\\xi$

world

"`; exports[`bodyToHtml feature_latex_maths should not mangle divs 1`] = `"

hello

world
"`; @@ -8,6 +19,36 @@ exports[`bodyToHtml feature_latex_maths should render block katex 1`] = `"

hel exports[`bodyToHtml feature_latex_maths should render inline katex 1`] = `"hello ξ\\xi world"`; +exports[`bodyToHtml generates big emoji for emoji made of multiple characters 1`] = ` + + + + 👨‍👩‍👧‍👦 + + + + ↔️ + + + + 🇮🇸 + + + +`; + exports[`bodyToHtml should generate big emoji for an emoji-only reply to a message 1`] = `