tune autocorrections once again

2025-05-19 00:10:20 +00:00 · 2023-08-30 09:05:29 +02:00 · 2023-08-30 09:05:29 +02:00 · b3764239b8
commit b3764239b8
parent 963ceacec9
3 changed files with 92 additions and 21 deletions
--- a/app/src/main/java/org/dslul/openboard/inputmethod/latin/Suggest.java
+++ b/app/src/main/java/org/dslul/openboard/inputmethod/latin/Suggest.java
@ -336,7 +336,7 @@ public final class Suggest {
            } else if (first == null) {
                allowsToBeAutoCorrected = false; // no autocorrect if first suggestion unknown in this context
            } else if (typed == null) {
-                allowsToBeAutoCorrected = true; // autocorrect if typed word not known in this context, todo: this may be too aggressive
+                allowsToBeAutoCorrected = true; // allow autocorrect if typed word not known in this context, todo: this may be too aggressive
            } else {
                // autocorrect if suggested word has clearly higher score for empty word suggestions
                allowsToBeAutoCorrected = (first.mScore - typed.mScore) > 20;
@ -416,20 +416,17 @@ public final class Suggest {
                    // dict locale different -> return the better match
                    return new boolean[]{ true, dictLocale == first.mSourceDict.mLocale };
                }
-                if (first.mScore < typedWordFirstOccurrenceWordInfo.mScore - 100000) {
-                    // don't autocorrect if typed word is clearly the better suggestion
-                    // todo: maybe this should be reduced more, to 50k or even 0
-                    return new boolean[]{ true, false };
-                }
+                // todo: this may need tuning, especially the score difference thing
+                final int firstWordBonusScore = (first.isKindOf(SuggestedWordInfo.KIND_WHITELIST) ? 20 : 0) // large bonus because it's wanted by dictionary
+                        + (StringUtils.isLowerCaseAscii(typedWordString) ? 5 : 0) // small bonus because typically only ascii is typed (applies to latin keyboards only)
+                        + (first.mScore > typedWordFirstOccurrenceWordInfo.mScore ? 5 : 0); // small bonus if score is higher
                putEmptyWordSuggestions.run();
                int firstScoreForEmpty = firstAndTypedWordEmptyInfos.get(0) != null ? firstAndTypedWordEmptyInfos.get(0).mScore : 0;
                int typedScoreForEmpty = firstAndTypedWordEmptyInfos.get(1) != null ? firstAndTypedWordEmptyInfos.get(1).mScore : 0;
-                if (firstScoreForEmpty == 0 && typedScoreForEmpty == 0) {
-                    // both words unknown in this ngram context -> return the correction
-                    return new boolean[]{ true, true };
-                }
-                if (firstScoreForEmpty > typedScoreForEmpty + 20) {
-                    // return the better match for ngram context, biased towards typed word
+                if (firstScoreForEmpty + firstWordBonusScore >= typedScoreForEmpty + 20) {
+                    // return the better match for ngram context
+                    //  biased towards typed word
+                    //  but with bonus depending on whitelist kind, lower case ascii typed word, and higher score of first suggestion
                    return new boolean[]{ true, true };
                }
                hasAutoCorrection = false;
--- a/app/src/main/java/org/dslul/openboard/inputmethod/latin/common/StringUtils.java
+++ b/app/src/main/java/org/dslul/openboard/inputmethod/latin/common/StringUtils.java
@ -712,9 +712,9 @@ public final class StringUtils {
        return false;
    }

-    public static boolean probablyContainsEmoji(String s) {
+    public static boolean probablyContainsEmoji(final String s) {
        int offset = 0;
-        int length = s.length();
+        final int length = s.length();
        while (offset < length) {
            int c = Character.codePointAt(s, offset);
            if (probablyIsEmojiCodePoint(c))
@ -725,10 +725,19 @@ public final class StringUtils {
    }

    // seemingly arbitrary ranges taken from "somewhere on the internet"
-    public static boolean probablyIsEmojiCodePoint(int c) {
+    public static boolean probablyIsEmojiCodePoint(final int c) {
        return (0x200D <= c && c <= 0x3299) // ??
                || (0x1F004 <= c && c <= 0x1F251) // ??
                || (0x1F300 <= c && c <= 0x1FFFF) // ??
                || c == 0xFE0F; // variation selector emoji with color
    }
+
+    public static boolean isLowerCaseAscii(final String s) { // returns true only if every char of s is in the range 'a'..'z'
+        final int length = s.length(); // s must be non-null: length() is called unconditionally
+        for (int i = 0; i < length; i++) {
+            final int c = s.charAt(i); // compares UTF-16 code units, widened to int
+            if (c < 97 || c > 122) return false; // 97 = 'a', 122 = 'z'; anything outside (upper case, digits, accented letters, ...) fails
+        }
+        return true; // also reached for the empty string, because no char was rejected
+    }
 }
--- a/app/src/test/java/org/dslul/openboard/inputmethod/latin/SuggestTest.kt
+++ b/app/src/test/java/org/dslul/openboard/inputmethod/latin/SuggestTest.kt
@ -62,7 +62,7 @@ class SuggestTest {
        // not corrected because first empty score not high enough
    }

-    @Test fun `not "ill" to "I'll" if both were used before in this context`() {
+    @Test fun `"ill" to "I'll" if both have same ngram score`() {
        val locale = Locale.ENGLISH
        val result = shouldBeAutoCorrected(
            "ill",
@ -72,8 +72,20 @@ class SuggestTest {
            locale,
            thresholdModest
        )
+        assert(result.last()) // should be corrected
+    }
+
+    @Test fun `no "ill" to "I'll" if "ill" has somewhat better ngram score`() {
+        val locale = Locale.ENGLISH
+        val result = shouldBeAutoCorrected(
+            "ill",
+            listOf(suggestion("I'll", Int.MAX_VALUE, locale), suggestion("ill", 1500000, locale)),
+            suggestion("I'll", 200, locale),
+            suggestion("ill", 211, locale),
+            locale,
+            thresholdModest
+        )
        assert(!result.last()) // should not be corrected
-        // essentially same as `not "ill" to "I'll" if only "ill" was used before in this context`
    }

    @Test fun `no English "I" for Polish "i" when typing in Polish`() {
@ -119,17 +131,70 @@ class SuggestTest {
        // not corrected because of locale matching
    }

-    @Test fun `no "lé" instead of "le"`() {
+    @Test fun `no "né" instead of "ne"`() {
        val result = shouldBeAutoCorrected(
-            "le",
-            listOf(suggestion("le", 1900000, Locale.FRENCH), suggestion("lé", 1500000, Locale.FRENCH)),
+            "ne",
+            listOf(suggestion("ne", 1900000, Locale.FRENCH), suggestion("né", 1900000-1, Locale.FRENCH)),
            null,
            null,
            Locale.FRENCH,
            thresholdModest
        )
        assert(!result.last()) // should not be corrected
-        // not corrected because of score difference
+        // not corrected because score is lower
+    }
+
+    @Test fun `"né" instead of "ne" if "né" in ngram context`() {
+        val locale = Locale.FRENCH
+        val result = shouldBeAutoCorrected(
+            "ne",
+            listOf(suggestion("ne", 1900000, locale), suggestion("né", 1900000-1, locale)),
+            suggestion("né", 200, locale),
+            null,
+            locale,
+            thresholdModest
+        )
+        assert(result.last()) // should be corrected
+    }
+
+    @Test fun `"né" instead of "ne" if "né" has clearly better score in ngram context`() {
+        val locale = Locale.FRENCH
+        val result = shouldBeAutoCorrected(
+            "ne",
+            listOf(suggestion("ne", 1900000, locale), suggestion("né", 1900000-1, locale)),
+            suggestion("né", 215, locale),
+            suggestion("ne", 200, locale),
+            locale,
+            thresholdModest
+        )
+        assert(result.last()) // should be corrected
+    }
+
+    @Test fun `no "né" instead of "ne" if both with same score in ngram context`() {
+        val locale = Locale.FRENCH
+        val result = shouldBeAutoCorrected(
+            "ne",
+            listOf(suggestion("ne", 1900000, locale), suggestion("né", 1900000-1, locale)),
+            suggestion("né", 200, locale),
+            suggestion("ne", 200, locale),
+            locale,
+            thresholdModest
+        )
+        assert(!result.last()) // should not be corrected
+    }
+
+    @Test fun `no "ne" instead of "né"`() {
+        val locale = Locale.FRENCH
+        val result = shouldBeAutoCorrected(
+            "né",
+            listOf(suggestion("ne", 600000, locale), suggestion("né", 1600000, locale)),
+            suggestion("né", 200, locale),
+            suggestion("ne", 200, locale),
+            locale,
+            thresholdModest
+        )
+        assert(!result.last()) // should not be corrected
+        // not even allowed to check because of low score for ne
    }

 }