improve detection of emojis when deleting

This commit is contained in:
Helium314 2025-02-03 00:11:34 +01:00
parent ad3086183d
commit 73988394a9
2 changed files with 26 additions and 4 deletions

View file

@ -62,21 +62,32 @@ fun getFullEmojiAtEnd(s: CharSequence): String {
while (offset > 0) { while (offset > 0) {
val codepoint = text.codePointBefore(offset) val codepoint = text.codePointBefore(offset)
// stop if codepoint can't be emoji // stop if codepoint can't be emoji
if (!mightBeEmoji(codepoint)) return "" if (!mightBeEmoji(codepoint))
return text.substring(offset)
offset -= Character.charCount(codepoint) offset -= Character.charCount(codepoint)
// todo: if codepoint in 0x1F3FB..0x1F3FF -> combine with other emojis in front, but only if they actually combine
// why isn't this done with zwj like everything else? skin tones can be emojis by themselves...
if (offset > 0 && text[offset - 1].code == KeyCode.ZWJ) { if (offset > 0 && text[offset - 1].code == KeyCode.ZWJ) {
// todo: this includes the ZWJ in the result in unusual cases like text, ZWJ, emoji
// and detects a single ZWJ as an emoji (at least this is irrelevant for the current use of getFullEmojiAtEnd)
offset -= 1 offset -= 1
continue continue
} }
if (codepoint in 0x1F3FB..0x1F3FF) {
// Skin tones are not joined with ZWJ, but simply appended. This is awkward because a skin tone can also be an emoji on its own,
// but that's how it is done. Assume that an emoji directly before the skin tone will get merged with it (usually correct in practice).
val codepointBefore = text.codePointBefore(offset)
if (isEmoji(codepointBefore)) {
offset -= Character.charCount(codepointBefore)
continue
}
}
// check the whole text after offset // check the whole text after offset
val textToCheck = text.substring(offset) val textToCheck = text.substring(offset)
if (isEmoji(textToCheck)) { if (isEmoji(textToCheck)) {
return textToCheck return textToCheck
} }
} }
return "" return text.substring(offset)
} }
/** split the string on the first of consecutive space only, further consecutive spaces are added to the next split */ /** split the string on the first of consecutive space only, further consecutive spaces are added to the next split */

View file

@ -42,6 +42,8 @@ class StringUtilsTest {
} }
@Test fun detectEmojisAtEnd() { @Test fun detectEmojisAtEnd() {
assertEquals("", getFullEmojiAtEnd("\uD83C\uDF83 "))
assertEquals("", getFullEmojiAtEnd("a"))
assertEquals("\uD83C\uDF83", getFullEmojiAtEnd("\uD83C\uDF83")) assertEquals("\uD83C\uDF83", getFullEmojiAtEnd("\uD83C\uDF83"))
assertEquals("", getFullEmojiAtEnd("")) assertEquals("", getFullEmojiAtEnd(""))
assertEquals("", getFullEmojiAtEnd("")) assertEquals("", getFullEmojiAtEnd(""))
@ -51,6 +53,15 @@ class StringUtilsTest {
assertEquals("\uD83C\uDFF3\u200D\uD83C\uDF08", getFullEmojiAtEnd("\uD83C\uDFF3\u200D\uD83C\uDF08")) assertEquals("\uD83C\uDFF3\u200D\uD83C\uDF08", getFullEmojiAtEnd("\uD83C\uDFF3\u200D\uD83C\uDF08"))
assertEquals("\uD83C\uDFF3\u200D\uD83C\uDF08", getFullEmojiAtEnd("\uD83C\uDFF4\u200D☠️\uD83C\uDFF3\u200D\uD83C\uDF08")) assertEquals("\uD83C\uDFF3\u200D\uD83C\uDF08", getFullEmojiAtEnd("\uD83C\uDFF4\u200D☠️\uD83C\uDFF3\u200D\uD83C\uDF08"))
assertEquals("\uD83C\uDFF3\u200D⚧️", getFullEmojiAtEnd("hello there🏳")) assertEquals("\uD83C\uDFF3\u200D⚧️", getFullEmojiAtEnd("hello there🏳"))
assertEquals("\uD83D\uDD75\uD83C\uDFFC", getFullEmojiAtEnd(" 🕵🏼"))
assertEquals("\uD83D\uDD75\uD83C\uDFFC", getFullEmojiAtEnd("🕵🏼"))
assertEquals("\uD83C\uDFFC", getFullEmojiAtEnd(" \uD83C\uDFFC"))
// fails, but unlikely enough that we leave it unfixed
//assertEquals("\uD83C\uDFFC", getFullEmojiAtEnd("\uD83C\uDF84\uD83C\uDFFC"))
// the cases below also fail, because the ZWJ handling is not suitable for some unusual cases
//assertEquals("", getFullEmojiAtEnd("\u200D"))
//assertEquals("", getFullEmojiAtEnd("a\u200D"))
//assertEquals("\uD83D\uDE22", getFullEmojiAtEnd(" \u200D\uD83D\uDE22"))
} }
// todo: add tests for emoji detection? // todo: add tests for emoji detection?