move some code from RichInputConnection to StringUtils

so we can easily add unit tests and maybe improve the awkward behavior
2025-06-24 18:10:54 +00:00 · 2025-06-09 20:20:27 +02:00 · 2025-06-09 20:20:27 +02:00 · 80ba394b95
commit 80ba394b95
parent 52744b7427
4 changed files with 198 additions and 109 deletions
--- a/app/src/main/java/helium314/keyboard/latin/RichInputConnection.java
+++ b/app/src/main/java/helium314/keyboard/latin/RichInputConnection.java
@ -40,8 +40,6 @@ import helium314.keyboard.latin.settings.SpacingAndPunctuations;
 import helium314.keyboard.latin.utils.CapsModeUtils;
 import helium314.keyboard.latin.utils.DebugLogUtils;
 import helium314.keyboard.latin.utils.NgramContextUtils;
-import helium314.keyboard.latin.utils.ScriptUtils;
-import helium314.keyboard.latin.utils.SpannableStringUtils;
 import helium314.keyboard.latin.utils.StatsUtils;
 import helium314.keyboard.latin.utils.TextRange;

@ -825,15 +823,6 @@ public final class RichInputConnection implements PrivateCommandPerformer {
        return NgramContextUtils.getNgramContextFromNthPreviousWord(prev, spacingAndPunctuations, n);
    }

-    private static boolean isPartOfCompositionForScript(final int codePoint,
-            final SpacingAndPunctuations spacingAndPunctuations, final String script) {
-        // We always consider word connectors part of compositions.
-        return spacingAndPunctuations.isWordConnector(codePoint)
-                // Otherwise, it's part of composition if it's part of script and not a separator.
-                || (!spacingAndPunctuations.isWordSeparator(codePoint)
-                        && ScriptUtils.isLetterPartOfScript(codePoint, script));
-    }
-
    /**
     * Returns the text surrounding the cursor.
     *
@ -860,90 +849,7 @@ public final class RichInputConnection implements PrivateCommandPerformer {
        if (before == null || after == null) {
            return null;
        }
-
-        // Going backward, find the first breaking point (separator)
-        int startIndexInBefore = before.length();
-        int endIndexInAfter = -1;
-        while (startIndexInBefore > 0) {
-            final int codePoint = Character.codePointBefore(before, startIndexInBefore);
-            if (!isPartOfCompositionForScript(codePoint, spacingAndPunctuations, script)) {
-                if (Character.isWhitespace(codePoint) || !spacingAndPunctuations.mCurrentLanguageHasSpaces)
-                    break;
-                // continue to the next whitespace and see whether this contains a sometimesWordConnector
-                for (int i = startIndexInBefore - 1; i >= 0; i--) {
-                    final char c = before.charAt(i);
-                    if (spacingAndPunctuations.isSometimesWordConnector(c)) {
-                        // if yes -> whitespace is the index
-                        startIndexInBefore = Math.max(StringUtils.charIndexOfLastWhitespace(before), 0);
-                        final int firstSpaceAfter = StringUtils.charIndexOfFirstWhitespace(after);
-                        endIndexInAfter = firstSpaceAfter == -1 ? after.length() : firstSpaceAfter -1;
-                        break;
-                    } else if (Character.isWhitespace(c)) {
-                        // if no, just break normally
-                        break;
-                    }
-                }
-                break;
-            }
-            --startIndexInBefore;
-            if (Character.isSupplementaryCodePoint(codePoint)) {
-                --startIndexInBefore;
-            }
-        }
-
-        // Find last word separator after the cursor
-        if (endIndexInAfter == -1) {
-            while (++endIndexInAfter < after.length()) {
-                final int codePoint = Character.codePointAt(after, endIndexInAfter);
-                if (!isPartOfCompositionForScript(codePoint, spacingAndPunctuations, script)) {
-                    if (Character.isWhitespace(codePoint) || !spacingAndPunctuations.mCurrentLanguageHasSpaces)
-                        break;
-                    // continue to the next whitespace and see whether this contains a sometimesWordConnector
-                    for (int i = endIndexInAfter; i < after.length(); i++) {
-                        final char c = after.charAt(i);
-                        if (spacingAndPunctuations.isSometimesWordConnector(c)) {
-                            // if yes -> whitespace is next to the index
-                            startIndexInBefore = Math.max(StringUtils.charIndexOfLastWhitespace(before), 0);
-                            final int firstSpaceAfter = StringUtils.charIndexOfFirstWhitespace(after);
-                            endIndexInAfter = firstSpaceAfter == -1 ? after.length() : firstSpaceAfter - 1;
-                            break;
-                        } else if (Character.isWhitespace(c)) {
-                            // if no, just break normally
-                            break;
-                        }
-                    }
-                    break;
-                }
-                if (Character.isSupplementaryCodePoint(codePoint)) {
-                    ++endIndexInAfter;
-                }
-            }
-        }
-
-        // strip stuff before "//" (i.e. ignore http and other protocols)
-        final String beforeConsideringStart = before.subSequence(startIndexInBefore, before.length()).toString();
-        final int protocolEnd = beforeConsideringStart.lastIndexOf("//");
-        if (protocolEnd != -1)
-            startIndexInBefore += protocolEnd + 1;
-
-        // we don't want the end characters to be word separators
-        while (endIndexInAfter > 0 && spacingAndPunctuations.isWordSeparator(after.charAt(endIndexInAfter - 1))) {
-            --endIndexInAfter;
-        }
-        while (startIndexInBefore < before.length() && spacingAndPunctuations.isWordSeparator(before.charAt(startIndexInBefore))) {
-            ++startIndexInBefore;
-        }
-
-        final boolean hasUrlSpans =
-                SpannableStringUtils.hasUrlSpans(before, startIndexInBefore, before.length())
-                || SpannableStringUtils.hasUrlSpans(after, 0, endIndexInAfter);
-        // We don't use TextUtils#concat because it copies all spans without respect to their
-        // nature. If the text includes a PARAGRAPH span and it has been split, then
-        // TextUtils#concat will crash when it tries to concat both sides of it.
-        return new TextRange(
-                SpannableStringUtils.concatWithNonParagraphSuggestionSpansOnly(before, after),
-                        startIndexInBefore, before.length() + endIndexInAfter, before.length(),
-                        hasUrlSpans);
+        return StringUtilsKt.getTouchedWordRange(before, after, script, spacingAndPunctuations);
    }

    public boolean isCursorTouchingWord(final SpacingAndPunctuations spacingAndPunctuations,
@ -956,19 +862,7 @@ public final class RichInputConnection implements PrivateCommandPerformer {
            // a composing region should always count as a word
            return true;
        }
-        final String textBeforeCursor = mCommittedTextBeforeComposingText.toString();
-        int indexOfCodePointInJavaChars = textBeforeCursor.length();
-        int consideredCodePoint = 0 == indexOfCodePointInJavaChars ? Constants.NOT_A_CODE
-                : textBeforeCursor.codePointBefore(indexOfCodePointInJavaChars);
-        // Search for the first non word-connector char
-        if (spacingAndPunctuations.isWordConnector(consideredCodePoint)) {
-            indexOfCodePointInJavaChars -= Character.charCount(consideredCodePoint);
-            consideredCodePoint = 0 == indexOfCodePointInJavaChars ? Constants.NOT_A_CODE
-                    : textBeforeCursor.codePointBefore(indexOfCodePointInJavaChars);
-        }
-        return !(Constants.NOT_A_CODE == consideredCodePoint
-                || spacingAndPunctuations.isWordSeparator(consideredCodePoint)
-                || spacingAndPunctuations.isWordConnector(consideredCodePoint));
+        return StringUtilsKt.endsWithWordCodepoint(mCommittedTextBeforeComposingText.toString(), spacingAndPunctuations);
    }

    public boolean isCursorFollowedByWordCharacter(
--- a/app/src/main/java/helium314/keyboard/latin/common/StringUtils.kt
+++ b/app/src/main/java/helium314/keyboard/latin/common/StringUtils.kt
@ -6,13 +6,18 @@ import helium314.keyboard.keyboard.internal.keyboard_parser.floris.KeyCode
 import helium314.keyboard.latin.common.StringUtils.mightBeEmoji
 import helium314.keyboard.latin.common.StringUtils.newSingleCodePointString
 import helium314.keyboard.latin.settings.SpacingAndPunctuations
+import helium314.keyboard.latin.utils.ScriptUtils
 import helium314.keyboard.latin.utils.SpacedTokens
+import helium314.keyboard.latin.utils.SpannableStringUtils
+import helium314.keyboard.latin.utils.TextRange
 import java.math.BigInteger
 import java.util.Locale
+import kotlin.math.max

 fun CharSequence.codePointAt(offset: Int) = Character.codePointAt(this, offset)
 fun CharSequence.codePointBefore(offset: Int) = Character.codePointBefore(this, offset)

+/** Loops over the codepoints in [text]. Exits when [loop] returns true */
 inline fun loopOverCodePoints(text: CharSequence, loop: (cp: Int, charCount: Int) -> Boolean) {
    var offset = 0
    while (offset < text.length) {
@ -23,6 +28,7 @@ inline fun loopOverCodePoints(text: CharSequence, loop: (cp: Int, charCount: Int
    }
 }

+/** Loops backwards over the codepoints in [text]. Exits when [loop] returns true */
 inline fun loopOverCodePointsBackwards(text: CharSequence, loop: (cp: Int, charCount: Int) -> Boolean) {
    var offset = text.length
    while (offset > 0) {
@ -88,6 +94,111 @@ fun getFullEmojiAtEnd(text: CharSequence): String {
    return s.substring(offset)
 }

+/**
+ *  Returns whether the last codepoint of [text] is not a word separator, after skipping all trailing word connectors.
+ *  If the [text] is empty or consists only of word connectors, the method returns false.
+ */
+// todo: this returns true on numbers, why isn't Character.isLetter(code) used?
+fun endsWithWordCodepoint(text: String, spacingAndPunctuations: SpacingAndPunctuations): Boolean {
+    if (text.isEmpty()) return false
+    var codePoint = 0 // initial value is irrelevant: text is not empty here, so the loop assigns it at least once
+    loopOverCodePointsBackwards(text) { cp, _ ->
+        codePoint = cp
+        !spacingAndPunctuations.isWordConnector(cp)
+    }
+    // codePoint is still a wordConnector if the whole text consists of wordConnectors
+    return !spacingAndPunctuations.isWordConnector(codePoint) && !spacingAndPunctuations.isWordSeparator(codePoint)
+}
+
+// Computes the TextRange of the word touched by the cursor ([before] / [after] = text before / after the cursor). todo: simplify... maybe compare with original code?
+fun getTouchedWordRange(before: CharSequence, after: CharSequence, script: String, spacingAndPunctuations: SpacingAndPunctuations): TextRange {
+    // Going backward, find the first breaking point (separator)
+    var startIndexInBefore = before.length
+    var endIndexInAfter = -1 // todo: clarify why we might want to set it when checking before
+    loopOverCodePointsBackwards(before) { codePoint, cpLength ->
+        if (!isPartOfCompositionForScript(codePoint, spacingAndPunctuations, script)) {
+            if (Character.isWhitespace(codePoint) || !spacingAndPunctuations.mCurrentLanguageHasSpaces)
+                return@loopOverCodePointsBackwards true
+            // continue to the next whitespace and see whether this contains a sometimesWordConnector
+            for (i in startIndexInBefore - 1 downTo 0) {
+                val c = before[i]
+                if (spacingAndPunctuations.isSometimesWordConnector(c.code)) {
+                    // if yes -> whitespace is the index
+                    startIndexInBefore = max(StringUtils.charIndexOfLastWhitespace(before), 0)
+                    val firstSpaceAfter = StringUtils.charIndexOfFirstWhitespace(after)
+                    endIndexInAfter = if (firstSpaceAfter == -1) after.length else firstSpaceAfter - 1
+                    return@loopOverCodePointsBackwards true
+                } else if (Character.isWhitespace(c)) {
+                    // if no, just break normally
+                    return@loopOverCodePointsBackwards true
+                }
+            }
+            return@loopOverCodePointsBackwards true
+        }
+        startIndexInBefore -= cpLength
+        false
+    }
+
+    // Find last word separator after the cursor
+    if (endIndexInAfter == -1) {
+        endIndexInAfter = 0
+        loopOverCodePoints(after) { codePoint, cpLength ->
+            if (!isPartOfCompositionForScript(codePoint, spacingAndPunctuations, script)) {
+                if (Character.isWhitespace(codePoint) || !spacingAndPunctuations.mCurrentLanguageHasSpaces)
+                    return@loopOverCodePoints true
+                // continue to the next whitespace and see whether this contains a sometimesWordConnector
+                for (i in endIndexInAfter..<after.length) {
+                    val c = after[i]
+                    if (spacingAndPunctuations.isSometimesWordConnector(c.code)) {
+                        // if yes -> whitespace is next to the index
+                        startIndexInBefore = max(StringUtils.charIndexOfLastWhitespace(before), 0)
+                        val firstSpaceAfter = StringUtils.charIndexOfFirstWhitespace(after)
+                        endIndexInAfter = if (firstSpaceAfter == -1) after.length else firstSpaceAfter - 1
+                        return@loopOverCodePoints true
+                    } else if (Character.isWhitespace(c)) {
+                        // if no, just break normally
+                        return@loopOverCodePoints true
+                    }
+                }
+                return@loopOverCodePoints true
+            }
+            endIndexInAfter += cpLength
+            false
+        }
+    }
+
+    // strip text before "//" (i.e. ignore http and other protocols)
+    val beforeConsideringStart = before.substring(startIndexInBefore, before.length)
+    val protocolEnd = beforeConsideringStart.lastIndexOf("//")
+    if (protocolEnd != -1) startIndexInBefore += protocolEnd + 1
+
+    // we don't want the end characters to be word separators
+    while (endIndexInAfter > 0 && spacingAndPunctuations.isWordSeparator(after[endIndexInAfter - 1].code)) {
+        --endIndexInAfter
+    }
+    while (startIndexInBefore < before.length && spacingAndPunctuations.isWordSeparator(before[startIndexInBefore].code)) {
+        ++startIndexInBefore
+    }
+
+    val hasUrlSpans = SpannableStringUtils.hasUrlSpans(before, startIndexInBefore, before.length)
+        || SpannableStringUtils.hasUrlSpans(after, 0, endIndexInAfter)
+
+    // We don't use TextUtils#concat because it copies all spans without respect to their
+    // nature. If the text includes a PARAGRAPH span and it has been split, then
+    // TextUtils#concat will crash when it tries to concat both sides of it.
+    return TextRange(
+        SpannableStringUtils.concatWithNonParagraphSuggestionSpansOnly(before, after),
+        startIndexInBefore, before.length + endIndexInAfter, before.length,
+        hasUrlSpans
+    )
+}
+
+// Strictly speaking this does not belong in StringUtils, but it is only used by getTouchedWordRange.
+private fun isPartOfCompositionForScript(codePoint: Int, spacingAndPunctuations: SpacingAndPunctuations, script: String) =
+    spacingAndPunctuations.isWordConnector(codePoint) // We always consider word connectors part of compositions.
+        // Otherwise, it's part of composition if it's part of script and not a separator.
+        || (!spacingAndPunctuations.isWordSeparator(codePoint) && ScriptUtils.isLetterPartOfScript(codePoint, script))
+
 /** split the string on the first of consecutive space only, further consecutive spaces are added to the next split */
 fun String.splitOnFirstSpacesOnly(): List<String> {
    val out = mutableListOf<String>()
--- a/app/src/main/java/helium314/keyboard/latin/utils/TextRange.java
+++ b/app/src/main/java/helium314/keyboard/latin/utils/TextRange.java
@ -7,9 +7,13 @@
 package helium314.keyboard.latin.utils;

 import android.text.Spanned;
+import android.text.TextUtils;
 import android.text.style.SuggestionSpan;

+import androidx.annotation.NonNull;
+
 import java.util.Arrays;
+import java.util.Objects;

 /**
 * Represents a range of text, relative to the current cursor position.
@ -95,6 +99,28 @@ public final class TextRange {
        return writeIndex == readIndex ? spans : Arrays.copyOfRange(spans, 0, writeIndex);
    }

+    @Override
+    public boolean equals(Object other) { // compares word indices, cursor index, URL-span flag, full text and word
+        if (!(other instanceof TextRange textRange)) return false;
+        return mWordAtCursorStartIndex == textRange.mWordAtCursorStartIndex
+            && mWordAtCursorEndIndex == textRange.mWordAtCursorEndIndex
+            && mCursorIndex == textRange.mCursorIndex
+            && mHasUrlSpans == textRange.mHasUrlSpans
+            && TextUtils.equals(mTextAtCursor, textRange.mTextAtCursor) // character comparison, so different CharSequence types can match
+            && TextUtils.equals(mWord, textRange.mWord);
+    }
+
+    @Override
+    public int hashCode() { // hash the texts by content: equals() uses TextUtils.equals(), so equal ranges may hold different CharSequence types
+        return Objects.hash(String.valueOf(mTextAtCursor), mWordAtCursorStartIndex, mWordAtCursorEndIndex, mCursorIndex, String.valueOf(mWord), mHasUrlSpans);
+    }
+
+    @NonNull
+    @Override
+    public String toString() { // full text, word at cursor and cursor index, separated by ", "
+        return mTextAtCursor + ", " + mWord + ", " + mCursorIndex;
+    }
+
    public TextRange(final CharSequence textAtCursor, final int wordAtCursorStartIndex,
            final int wordAtCursorEndIndex, final int cursorIndex, final boolean hasUrlSpans) {
        if (wordAtCursorStartIndex < 0 || cursorIndex < wordAtCursorStartIndex
@ -109,4 +135,4 @@ public final class TextRange {
        mHasUrlSpans = hasUrlSpans;
        mWord = mTextAtCursor.subSequence(mWordAtCursorStartIndex, mWordAtCursorEndIndex);
    }
-}
+}
--- a/app/src/test/java/helium314/keyboard/latin/StringUtilsTest.kt
+++ b/app/src/test/java/helium314/keyboard/latin/StringUtilsTest.kt
@ -4,9 +4,13 @@ package helium314.keyboard.latin
 import androidx.test.core.app.ApplicationProvider
 import helium314.keyboard.ShadowInputMethodManager2
 import helium314.keyboard.latin.common.StringUtils
+import helium314.keyboard.latin.common.endsWithWordCodepoint
 import helium314.keyboard.latin.common.getFullEmojiAtEnd
+import helium314.keyboard.latin.common.getTouchedWordRange
 import helium314.keyboard.latin.common.nonWordCodePointAndNoSpaceBeforeCursor
 import helium314.keyboard.latin.settings.SpacingAndPunctuations
+import helium314.keyboard.latin.utils.ScriptUtils
+import helium314.keyboard.latin.utils.TextRange
 import org.junit.runner.RunWith
 import org.robolectric.RobolectricTestRunner
 import org.robolectric.annotation.Config
@ -60,6 +64,54 @@ class StringUtilsTest {
        assert(nonWordCodePointAndNoSpaceBeforeCursor("th.is", sp))
    }

+    @Test fun `is word-like at end`() {
+        val sp = SpacingAndPunctuations(ApplicationProvider.getApplicationContext<App>().resources, false)
+        assert(!endsWithWordCodepoint("", sp)) // empty text never ends with a word codepoint
+        assert(endsWithWordCodepoint("don'", sp)) // presumably ' is a word connector here and gets skipped
+        assert(!endsWithWordCodepoint("hello!", sp))
+        assert(!endsWithWordCodepoint("when ", sp))
+        assert(endsWithWordCodepoint("3-", sp)) // todo: this seems wrong
+        assert(endsWithWordCodepoint("5'", sp)) // todo: this seems wrong
+        assert(endsWithWordCodepoint("1", sp)) // todo: this seems wrong
+        assert(endsWithWordCodepoint("a-", sp))
+        assert(!endsWithWordCodepoint("--", sp)) // presumably only word connectors, so nothing word-like remains
+    }
+
+    @Test fun `get touched text range`() {
+        val sp = SpacingAndPunctuations(ApplicationProvider.getApplicationContext<App>().resources, false)
+        val spUrl = SpacingAndPunctuations(ApplicationProvider.getApplicationContext<App>().resources, true) // NOTE(review): the boolean presumably enables URL detection, confirm
+        val script = ScriptUtils.SCRIPT_LATIN
+        checkTextRange("blabla this is v", "ery good", sp, script, 15, 19) // last two arguments: expected word start and end index in before + after
+        checkTextRange(".hel", "lo...", sp, script, 1, 6)
+        checkTextRange("(hi", ")", sp, script, 1, 3)
+        checkTextRange("", "word", sp, script, 0, 4)
+
+        checkTextRange("mail: blorb@", "florb.com or", sp, script, 12, 17)
+        checkTextRange("mail: blorb@", "florb.com or", spUrl, script, 6, 21)
+        checkTextRange("mail: blor", "b@florb.com or", sp, script, 6, 11)
+        checkTextRange("mail: blor", "b@florb.com or", spUrl, script, 6, 21)
+        checkTextRange("mail: blorb@f", "lorb.com or", sp, script, 12, 17)
+        checkTextRange("mail: blorb@f", "lorb.com or", spUrl, script, 6, 21)
+
+        checkTextRange("http://exam", "ple.com", sp, script, 7, 14)
+        checkTextRange("http://exam", "ple.com", spUrl, script, 7, 18)
+        checkTextRange("http://example.", "com", sp, script, 15, 18)
+        checkTextRange("http://example.", "com", spUrl, script, 7, 18)
+        checkTextRange("htt", "p://example.com", sp, script, 0, 4)
+        checkTextRange("htt", "p://example.com", spUrl, script, 0, 18)
+        checkTextRange("http:/", "/example.com", sp, script, 6, 6)
+        checkTextRange("http:/", "/example.com", spUrl, script, 0, 18)
+
+        checkTextRange("..", ".", spUrl, script, 2, 2)
+        checkTextRange("...", "", spUrl, script, 3, 3)
+
+        // todo: these are bad cases of URL detection
+        //  also: sometimesWordConnectors are meant for URLs and should be named accordingly
+        checkTextRange("@@@", "@@@", spUrl, script, 0, 6)
+        checkTextRange("a...", "", spUrl, script, 0, 4)
+        checkTextRange("@@@", "", spUrl, script, 0, 3)
+    }
+
    @Test fun detectEmojisAtEnd() {
        assertEquals("", getFullEmojiAtEnd("\uD83C\uDF83 "))
        assertEquals("", getFullEmojiAtEnd("a"))
@ -87,4 +139,10 @@ class StringUtilsTest {
    //  could help towards fully fixing https://github.com/Helium314/HeliBoard/issues/22
    //  though this might be tricky, as some emojis will show as one on new Android versions, and
    //  as two on older versions
+
+    private fun checkTextRange(before: String, after: String, sp: SpacingAndPunctuations, script: String, wordStart: Int, wordEnd: Int) {
+        val got = getTouchedWordRange(before, after, script, sp)
+        val wanted = TextRange(before + after, wordStart, wordEnd, before.length, false) // cursor index is before.length; expected hasUrlSpans is false
+        assertEquals(wanted, got)
+    }
 }