move some code from RichInputConnection to StringUtils

so we can easily add unit tests
and maybe improve the awkward behavior
This commit is contained in:
Helium314 2025-06-09 20:20:27 +02:00
parent 52744b7427
commit 80ba394b95
4 changed files with 198 additions and 109 deletions

View file

@ -40,8 +40,6 @@ import helium314.keyboard.latin.settings.SpacingAndPunctuations;
import helium314.keyboard.latin.utils.CapsModeUtils;
import helium314.keyboard.latin.utils.DebugLogUtils;
import helium314.keyboard.latin.utils.NgramContextUtils;
import helium314.keyboard.latin.utils.ScriptUtils;
import helium314.keyboard.latin.utils.SpannableStringUtils;
import helium314.keyboard.latin.utils.StatsUtils;
import helium314.keyboard.latin.utils.TextRange;
@ -825,15 +823,6 @@ public final class RichInputConnection implements PrivateCommandPerformer {
return NgramContextUtils.getNgramContextFromNthPreviousWord(prev, spacingAndPunctuations, n);
}
/**
 * Tells whether {@code codePoint} counts as part of a composition for {@code script}.
 *
 * @param codePoint the code point to classify
 * @param spacingAndPunctuations provides the word connector / word separator checks
 * @param script the script the code point is checked against
 * @return true for word connectors, and for non-separators that are letters of the script
 */
private static boolean isPartOfCompositionForScript(final int codePoint,
        final SpacingAndPunctuations spacingAndPunctuations, final String script) {
    // We always consider word connectors part of compositions.
    if (spacingAndPunctuations.isWordConnector(codePoint)) {
        return true;
    }
    // A separator (that is not a connector) is never part of a composition.
    if (spacingAndPunctuations.isWordSeparator(codePoint)) {
        return false;
    }
    // Otherwise, it's part of composition if it's part of script.
    return ScriptUtils.isLetterPartOfScript(codePoint, script);
}
/**
* Returns the text surrounding the cursor.
*
@ -860,90 +849,7 @@ public final class RichInputConnection implements PrivateCommandPerformer {
if (before == null || after == null) {
return null;
}
// Going backward, find the first breaking point (separator)
int startIndexInBefore = before.length();
int endIndexInAfter = -1;
while (startIndexInBefore > 0) {
final int codePoint = Character.codePointBefore(before, startIndexInBefore);
if (!isPartOfCompositionForScript(codePoint, spacingAndPunctuations, script)) {
if (Character.isWhitespace(codePoint) || !spacingAndPunctuations.mCurrentLanguageHasSpaces)
break;
// continue to the next whitespace and see whether this contains a sometimesWordConnector
for (int i = startIndexInBefore - 1; i >= 0; i--) {
final char c = before.charAt(i);
if (spacingAndPunctuations.isSometimesWordConnector(c)) {
// if yes -> whitespace is the index
startIndexInBefore = Math.max(StringUtils.charIndexOfLastWhitespace(before), 0);
final int firstSpaceAfter = StringUtils.charIndexOfFirstWhitespace(after);
endIndexInAfter = firstSpaceAfter == -1 ? after.length() : firstSpaceAfter -1;
break;
} else if (Character.isWhitespace(c)) {
// if no, just break normally
break;
}
}
break;
}
--startIndexInBefore;
if (Character.isSupplementaryCodePoint(codePoint)) {
--startIndexInBefore;
}
}
// Find last word separator after the cursor
if (endIndexInAfter == -1) {
while (++endIndexInAfter < after.length()) {
final int codePoint = Character.codePointAt(after, endIndexInAfter);
if (!isPartOfCompositionForScript(codePoint, spacingAndPunctuations, script)) {
if (Character.isWhitespace(codePoint) || !spacingAndPunctuations.mCurrentLanguageHasSpaces)
break;
// continue to the next whitespace and see whether this contains a sometimesWordConnector
for (int i = endIndexInAfter; i < after.length(); i++) {
final char c = after.charAt(i);
if (spacingAndPunctuations.isSometimesWordConnector(c)) {
// if yes -> whitespace is next to the index
startIndexInBefore = Math.max(StringUtils.charIndexOfLastWhitespace(before), 0);
final int firstSpaceAfter = StringUtils.charIndexOfFirstWhitespace(after);
endIndexInAfter = firstSpaceAfter == -1 ? after.length() : firstSpaceAfter - 1;
break;
} else if (Character.isWhitespace(c)) {
// if no, just break normally
break;
}
}
break;
}
if (Character.isSupplementaryCodePoint(codePoint)) {
++endIndexInAfter;
}
}
}
// strip stuff before "//" (i.e. ignore http and other protocols)
final String beforeConsideringStart = before.subSequence(startIndexInBefore, before.length()).toString();
final int protocolEnd = beforeConsideringStart.lastIndexOf("//");
if (protocolEnd != -1)
startIndexInBefore += protocolEnd + 1;
// we don't want the end characters to be word separators
while (endIndexInAfter > 0 && spacingAndPunctuations.isWordSeparator(after.charAt(endIndexInAfter - 1))) {
--endIndexInAfter;
}
while (startIndexInBefore < before.length() && spacingAndPunctuations.isWordSeparator(before.charAt(startIndexInBefore))) {
++startIndexInBefore;
}
final boolean hasUrlSpans =
SpannableStringUtils.hasUrlSpans(before, startIndexInBefore, before.length())
|| SpannableStringUtils.hasUrlSpans(after, 0, endIndexInAfter);
// We don't use TextUtils#concat because it copies all spans without respect to their
// nature. If the text includes a PARAGRAPH span and it has been split, then
// TextUtils#concat will crash when it tries to concat both sides of it.
return new TextRange(
SpannableStringUtils.concatWithNonParagraphSuggestionSpansOnly(before, after),
startIndexInBefore, before.length() + endIndexInAfter, before.length(),
hasUrlSpans);
return StringUtilsKt.getTouchedWordRange(before, after, script, spacingAndPunctuations);
}
public boolean isCursorTouchingWord(final SpacingAndPunctuations spacingAndPunctuations,
@ -956,19 +862,7 @@ public final class RichInputConnection implements PrivateCommandPerformer {
// a composing region should always count as a word
return true;
}
final String textBeforeCursor = mCommittedTextBeforeComposingText.toString();
int indexOfCodePointInJavaChars = textBeforeCursor.length();
int consideredCodePoint = 0 == indexOfCodePointInJavaChars ? Constants.NOT_A_CODE
: textBeforeCursor.codePointBefore(indexOfCodePointInJavaChars);
// Search for the first non word-connector char
if (spacingAndPunctuations.isWordConnector(consideredCodePoint)) {
indexOfCodePointInJavaChars -= Character.charCount(consideredCodePoint);
consideredCodePoint = 0 == indexOfCodePointInJavaChars ? Constants.NOT_A_CODE
: textBeforeCursor.codePointBefore(indexOfCodePointInJavaChars);
}
return !(Constants.NOT_A_CODE == consideredCodePoint
|| spacingAndPunctuations.isWordSeparator(consideredCodePoint)
|| spacingAndPunctuations.isWordConnector(consideredCodePoint));
return StringUtilsKt.endsWithWordCodepoint(mCommittedTextBeforeComposingText.toString(), spacingAndPunctuations);
}
public boolean isCursorFollowedByWordCharacter(

View file

@ -6,13 +6,18 @@ import helium314.keyboard.keyboard.internal.keyboard_parser.floris.KeyCode
import helium314.keyboard.latin.common.StringUtils.mightBeEmoji
import helium314.keyboard.latin.common.StringUtils.newSingleCodePointString
import helium314.keyboard.latin.settings.SpacingAndPunctuations
import helium314.keyboard.latin.utils.ScriptUtils
import helium314.keyboard.latin.utils.SpacedTokens
import helium314.keyboard.latin.utils.SpannableStringUtils
import helium314.keyboard.latin.utils.TextRange
import java.math.BigInteger
import java.util.Locale
import kotlin.math.max
/** Returns the code point starting at char index [offset], see [Character.codePointAt]. */
fun CharSequence.codePointAt(offset: Int): Int {
    return Character.codePointAt(this, offset)
}
/** Returns the code point ending just before char index [offset], see [Character.codePointBefore]. */
fun CharSequence.codePointBefore(offset: Int): Int {
    return Character.codePointBefore(this, offset)
}
/** Loops over the codepoints in [text]. Exits when [loop] returns true */
inline fun loopOverCodePoints(text: CharSequence, loop: (cp: Int, charCount: Int) -> Boolean) {
var offset = 0
while (offset < text.length) {
@ -23,6 +28,7 @@ inline fun loopOverCodePoints(text: CharSequence, loop: (cp: Int, charCount: Int
}
}
/** Loops backwards over the codepoints in [text]. Exits when [loop] returns true */
inline fun loopOverCodePointsBackwards(text: CharSequence, loop: (cp: Int, charCount: Int) -> Boolean) {
var offset = text.length
while (offset > 0) {
@ -88,6 +94,111 @@ fun getFullEmojiAtEnd(text: CharSequence): String {
return s.substring(offset)
}
/**
 * Checks the last code point of [text] that is not a word connector:
 * returns true if there is such a code point and it is not a word separator.
 * Returns false if [text] is empty or consists of word connectors only.
 */
// todo: this returns true on numbers, why isn't Character.isLetter(code) used?
fun endsWithWordCodepoint(text: String, spacingAndPunctuations: SpacingAndPunctuations): Boolean {
    // stays false when the callback never finds a non-connector (empty or connector-only text)
    var endsWithWord = false
    loopOverCodePointsBackwards(text) { cp, _ ->
        if (spacingAndPunctuations.isWordConnector(cp)) {
            false // trailing connector, keep going backwards
        } else {
            endsWithWord = !spacingAndPunctuations.isWordSeparator(cp)
            true // first non-connector decides the result
        }
    }
    return endsWithWord
}
// todo: simplify... maybe compare with original code?
/**
 * Determines the range of the word touching the cursor, where [before] is the text in front of
 * the cursor and [after] is the text following it.
 *
 * Starting at the cursor, the range grows in both directions while code points are part of a
 * composition for [script] (see [isPartOfCompositionForScript]). When growth stops at a
 * non-whitespace code point and the language has spaces, the text up to the next whitespace is
 * scanned for a "sometimes word connector"; if one is found, start and end are derived from the
 * whitespace positions in [before] and [after] instead.
 * Afterwards the start is moved forward to the last "//" found in the selected part of [before]
 * (to ignore http and other protocols), and word separators are trimmed from both ends.
 *
 * @return a [TextRange] over the concatenation of [before] and [after], with the cursor index
 *  at `before.length`
 */
fun getTouchedWordRange(before: CharSequence, after: CharSequence, script: String, spacingAndPunctuations: SpacingAndPunctuations): TextRange {
    // Going backward, find the first breaking point (separator)
    var startIndexInBefore = before.length
    var endIndexInAfter = -1 // todo: clarify why might we want to set it when checking before
    loopOverCodePointsBackwards(before) { codePoint, cpLength ->
        if (!isPartOfCompositionForScript(codePoint, spacingAndPunctuations, script)) {
            if (Character.isWhitespace(codePoint) || !spacingAndPunctuations.mCurrentLanguageHasSpaces)
                return@loopOverCodePointsBackwards true
            // continue to the next whitespace and see whether this contains a sometimesWordConnector
            for (i in startIndexInBefore - 1 downTo 0) {
                val c = before[i]
                if (spacingAndPunctuations.isSometimesWordConnector(c.code)) {
                    // if yes -> whitespace is the index
                    // (plain Int max, same as in the forward scan below; no Double round trip needed)
                    startIndexInBefore = max(StringUtils.charIndexOfLastWhitespace(before), 0)
                    val firstSpaceAfter = StringUtils.charIndexOfFirstWhitespace(after)
                    endIndexInAfter = if (firstSpaceAfter == -1) after.length else firstSpaceAfter - 1
                    return@loopOverCodePointsBackwards true
                } else if (Character.isWhitespace(c)) {
                    // if no, just break normally
                    return@loopOverCodePointsBackwards true
                }
            }
            return@loopOverCodePointsBackwards true
        }
        startIndexInBefore -= cpLength
        false
    }

    // Find last word separator after the cursor
    // (skipped if the backward scan already determined the end)
    if (endIndexInAfter == -1) {
        endIndexInAfter = 0
        loopOverCodePoints(after) { codePoint, cpLength ->
            if (!isPartOfCompositionForScript(codePoint, spacingAndPunctuations, script)) {
                if (Character.isWhitespace(codePoint) || !spacingAndPunctuations.mCurrentLanguageHasSpaces)
                    return@loopOverCodePoints true
                // continue to the next whitespace and see whether this contains a sometimesWordConnector
                for (i in endIndexInAfter..<after.length) {
                    val c = after[i]
                    if (spacingAndPunctuations.isSometimesWordConnector(c.code)) {
                        // if yes -> whitespace is next to the index
                        startIndexInBefore = max(StringUtils.charIndexOfLastWhitespace(before), 0)
                        val firstSpaceAfter = StringUtils.charIndexOfFirstWhitespace(after)
                        endIndexInAfter = if (firstSpaceAfter == -1) after.length else firstSpaceAfter - 1
                        return@loopOverCodePoints true
                    } else if (Character.isWhitespace(c)) {
                        // if no, just break normally
                        return@loopOverCodePoints true
                    }
                }
                return@loopOverCodePoints true
            }
            endIndexInAfter += cpLength
            false
        }
    }

    // strip text before "//" (i.e. ignore http and other protocols)
    val beforeConsideringStart = before.substring(startIndexInBefore, before.length)
    val protocolEnd = beforeConsideringStart.lastIndexOf("//")
    if (protocolEnd != -1) startIndexInBefore += protocolEnd + 1

    // we don't want the end characters to be word separators
    while (endIndexInAfter > 0 && spacingAndPunctuations.isWordSeparator(after[endIndexInAfter - 1].code)) {
        --endIndexInAfter
    }
    while (startIndexInBefore < before.length && spacingAndPunctuations.isWordSeparator(before[startIndexInBefore].code)) {
        ++startIndexInBefore
    }

    val hasUrlSpans = SpannableStringUtils.hasUrlSpans(before, startIndexInBefore, before.length)
            || SpannableStringUtils.hasUrlSpans(after, 0, endIndexInAfter)

    // We don't use TextUtils#concat because it copies all spans without respect to their
    // nature. If the text includes a PARAGRAPH span and it has been split, then
    // TextUtils#concat will crash when it tries to concat both sides of it.
    return TextRange(
        SpannableStringUtils.concatWithNonParagraphSuggestionSpansOnly(before, after),
        startIndexInBefore, before.length + endIndexInAfter, before.length,
        hasUrlSpans
    )
}
// actually this should not be in STRING Utils, but only used for getTouchedWordRange
/**
 * Tells whether [codePoint] counts as part of a composition for [script]:
 * true for word connectors, and for non-separators that are letters of the script.
 */
private fun isPartOfCompositionForScript(codePoint: Int, spacingAndPunctuations: SpacingAndPunctuations, script: String): Boolean {
    // We always consider word connectors part of compositions.
    if (spacingAndPunctuations.isWordConnector(codePoint)) return true
    // A separator (that is not a connector) is never part of a composition.
    if (spacingAndPunctuations.isWordSeparator(codePoint)) return false
    // Otherwise, it's part of composition if it's part of script.
    return ScriptUtils.isLetterPartOfScript(codePoint, script)
}
/** split the string on the first of consecutive space only, further consecutive spaces are added to the next split */
fun String.splitOnFirstSpacesOnly(): List<String> {
val out = mutableListOf<String>()

View file

@ -7,9 +7,13 @@
package helium314.keyboard.latin.utils;
import android.text.Spanned;
import android.text.TextUtils;
import android.text.style.SuggestionSpan;
import androidx.annotation.NonNull;
import java.util.Arrays;
import java.util.Objects;
/**
* Represents a range of text, relative to the current cursor position.
@ -95,6 +99,28 @@ public final class TextRange {
return writeIndex == readIndex ? spans : Arrays.copyOfRange(spans, 0, writeIndex);
}
/**
 * Two ranges are equal when their word start/end indices, cursor index and URL span flag
 * match, and both the full text and the word have the same characters
 * (compared with {@link TextUtils#equals(CharSequence, CharSequence)}).
 */
@Override
public boolean equals(Object other) {
    if (other instanceof TextRange that) {
        // cheap primitive comparisons first
        final boolean sameIndices = mWordAtCursorStartIndex == that.mWordAtCursorStartIndex
                && mWordAtCursorEndIndex == that.mWordAtCursorEndIndex
                && mCursorIndex == that.mCursorIndex;
        if (!sameIndices || mHasUrlSpans != that.mHasUrlSpans) return false;
        return TextUtils.equals(mTextAtCursor, that.mTextAtCursor)
                && TextUtils.equals(mWord, that.mWord);
    }
    return false;
}
/**
 * Returns a hash code that is consistent with {@link #equals(Object)}.
 *
 * equals() compares the text and the word by their characters (TextUtils.equals), so the
 * hash must be derived from the characters too. Hashing the CharSequence objects directly
 * would use whatever hashCode their implementation provides, and two ranges holding the same
 * characters in different CharSequence types (e.g. a String and a spanned text) could then
 * be equal but have different hash codes.
 */
@Override
public int hashCode() {
    return Objects.hash(String.valueOf(mTextAtCursor), mWordAtCursorStartIndex,
            mWordAtCursorEndIndex, mCursorIndex, String.valueOf(mWord), mHasUrlSpans);
}
/** Returns the full text, the word and the cursor index, separated by ", ". */
@NonNull
@Override
public String toString() {
    final StringBuilder description = new StringBuilder();
    description.append(String.valueOf(mTextAtCursor)).append(", ");
    description.append(String.valueOf(mWord)).append(", ");
    description.append(mCursorIndex);
    return description.toString();
}
public TextRange(final CharSequence textAtCursor, final int wordAtCursorStartIndex,
final int wordAtCursorEndIndex, final int cursorIndex, final boolean hasUrlSpans) {
if (wordAtCursorStartIndex < 0 || cursorIndex < wordAtCursorStartIndex
@ -109,4 +135,4 @@ public final class TextRange {
mHasUrlSpans = hasUrlSpans;
mWord = mTextAtCursor.subSequence(mWordAtCursorStartIndex, mWordAtCursorEndIndex);
}
}
}

View file

@ -4,9 +4,13 @@ package helium314.keyboard.latin
import androidx.test.core.app.ApplicationProvider
import helium314.keyboard.ShadowInputMethodManager2
import helium314.keyboard.latin.common.StringUtils
import helium314.keyboard.latin.common.endsWithWordCodepoint
import helium314.keyboard.latin.common.getFullEmojiAtEnd
import helium314.keyboard.latin.common.getTouchedWordRange
import helium314.keyboard.latin.common.nonWordCodePointAndNoSpaceBeforeCursor
import helium314.keyboard.latin.settings.SpacingAndPunctuations
import helium314.keyboard.latin.utils.ScriptUtils
import helium314.keyboard.latin.utils.TextRange
import org.junit.runner.RunWith
import org.robolectric.RobolectricTestRunner
import org.robolectric.annotation.Config
@ -60,6 +64,54 @@ class StringUtilsTest {
assert(nonWordCodePointAndNoSpaceBeforeCursor("th.is", sp))
}
// Checks endsWithWordCodepoint against a table of texts and expected results, in order.
@Test fun `is word-like at end`() {
    val sp = SpacingAndPunctuations(ApplicationProvider.getApplicationContext<App>().resources, false)
    val expectations = listOf(
        "" to false,
        "don'" to true,
        "hello!" to false,
        "when " to false,
        "3-" to true, // todo: this seems wrong
        "5'" to true, // todo: this seems wrong
        "1" to true, // todo: this seems wrong
        "a-" to true,
        "--" to false
    )
    for ((text, expected) in expectations) {
        assert(endsWithWordCodepoint(text, sp) == expected)
    }
}
// Checks getTouchedWordRange for Latin script with two SpacingAndPunctuations variants:
// one created with `false` and one with `true` as second constructor argument
// (presumably the URL detection setting, going by the original `spUrl` name - confirm).
@Test fun `get touched text range`() {
    val resources = ApplicationProvider.getApplicationContext<App>().resources
    val plainSp = SpacingAndPunctuations(resources, false)
    val urlSp = SpacingAndPunctuations(resources, true)
    val latin = ScriptUtils.SCRIPT_LATIN
    fun plain(before: String, after: String, wordStart: Int, wordEnd: Int) =
        checkTextRange(before, after, plainSp, latin, wordStart, wordEnd)
    fun url(before: String, after: String, wordStart: Int, wordEnd: Int) =
        checkTextRange(before, after, urlSp, latin, wordStart, wordEnd)

    plain("blabla this is v", "ery good", 15, 19)
    plain(".hel", "lo...", 1, 6)
    plain("(hi", ")", 1, 3)
    plain("", "word", 0, 4)

    plain("mail: blorb@", "florb.com or", 12, 17)
    url("mail: blorb@", "florb.com or", 6, 21)
    plain("mail: blor", "b@florb.com or", 6, 11)
    url("mail: blor", "b@florb.com or", 6, 21)
    plain("mail: blorb@f", "lorb.com or", 12, 17)
    url("mail: blorb@f", "lorb.com or", 6, 21)

    plain("http://exam", "ple.com", 7, 14)
    url("http://exam", "ple.com", 7, 18)
    plain("http://example.", "com", 15, 18)
    url("http://example.", "com", 7, 18)
    plain("htt", "p://example.com", 0, 4)
    url("htt", "p://example.com", 0, 18)
    plain("http:/", "/example.com", 6, 6)
    url("http:/", "/example.com", 0, 18)

    url("..", ".", 2, 2)
    url("...", "", 3, 3)

    // todo: these are bad cases of url detection
    //  also: sometimesWordConnectors are for URL and should be named accordingly
    url("@@@", "@@@", 0, 6)
    url("a...", "", 0, 4)
    url("@@@", "", 0, 3)
}
@Test fun detectEmojisAtEnd() {
assertEquals("", getFullEmojiAtEnd("\uD83C\uDF83 "))
assertEquals("", getFullEmojiAtEnd("a"))
@ -87,4 +139,10 @@ class StringUtilsTest {
// could help towards fully fixing https://github.com/Helium314/HeliBoard/issues/22
// though this might be tricky, as some emojis will show as one on new Android versions, and
// as two on older versions
/**
 * Asserts that [getTouchedWordRange] for [before] and [after] yields a range over
 * `before + after` with the word going from [wordStart] to [WordEnd], the cursor at
 * `before.length`, and no URL spans.
 */
private fun checkTextRange(before: String, after: String, sp: SpacingAndPunctuations, script: String, wordStart: Int, WordEnd: Int) {
    val actual = getTouchedWordRange(before, after, script, sp)
    val cursor = before.length
    val expected = TextRange(before + after, wordStart, WordEnd, cursor, false)
    assertEquals(expected, actual)
}
}