diff --git a/app/src/main/java/org/dslul/openboard/inputmethod/latin/Suggest.java b/app/src/main/java/org/dslul/openboard/inputmethod/latin/Suggest.java index 7d91c2559..b9e3e8f7a 100644 --- a/app/src/main/java/org/dslul/openboard/inputmethod/latin/Suggest.java +++ b/app/src/main/java/org/dslul/openboard/inputmethod/latin/Suggest.java @@ -336,7 +336,7 @@ public final class Suggest { } else if (first == null) { allowsToBeAutoCorrected = false; // no autocorrect if first suggestion unknown in this context } else if (typed == null) { - allowsToBeAutoCorrected = true; // autocorrect if typed word not known in this context, todo: this may be too aggressive + allowsToBeAutoCorrected = true; // allow autocorrect if typed word not known in this context, todo: this may be too aggressive } else { // autocorrect if suggested word has clearly higher score for empty word suggestions allowsToBeAutoCorrected = (first.mScore - typed.mScore) > 20; @@ -416,20 +416,17 @@ public final class Suggest { // dict locale different -> return the better match return new boolean[]{ true, dictLocale == first.mSourceDict.mLocale }; } - if (first.mScore < typedWordFirstOccurrenceWordInfo.mScore - 100000) { - // don't autocorrect if typed word is clearly the better suggestion - // todo: maybe this should be reduced more, to 50k or even 0 - return new boolean[]{ true, false }; - } + // todo: this may need tuning, especially the score difference thing + final int firstWordBonusScore = (first.isKindOf(SuggestedWordInfo.KIND_WHITELIST) ? 20 : 0) // large bonus because it's wanted by dictionary + + (StringUtils.isLowerCaseAscii(typedWordString) ? 5 : 0) // small bonus because typically only ascii is typed (applies to latin keyboards only) + + (first.mScore > typedWordFirstOccurrenceWordInfo.mScore ? 5 : 0); // small bonus if score is higher putEmptyWordSuggestions.run(); int firstScoreForEmpty = firstAndTypedWordEmptyInfos.get(0) != null ? 
firstAndTypedWordEmptyInfos.get(0).mScore : 0; int typedScoreForEmpty = firstAndTypedWordEmptyInfos.get(1) != null ? firstAndTypedWordEmptyInfos.get(1).mScore : 0; - if (firstScoreForEmpty == 0 && typedScoreForEmpty == 0) { - // both words unknown in this ngram context -> return the correction - return new boolean[]{ true, true }; - } - if (firstScoreForEmpty > typedScoreForEmpty + 20) { - // return the better match for ngram context, biased towards typed word + if (firstScoreForEmpty + firstWordBonusScore >= typedScoreForEmpty + 20) { + // return the better match for ngram context + // biased towards typed word + // but with bonus depending on whitelist kind, lower-case ascii typed word, and higher suggestion score return new boolean[]{ true, true }; } hasAutoCorrection = false; diff --git a/app/src/main/java/org/dslul/openboard/inputmethod/latin/common/StringUtils.java b/app/src/main/java/org/dslul/openboard/inputmethod/latin/common/StringUtils.java index 142038428..9b18be472 100644 --- a/app/src/main/java/org/dslul/openboard/inputmethod/latin/common/StringUtils.java +++ b/app/src/main/java/org/dslul/openboard/inputmethod/latin/common/StringUtils.java @@ -712,9 +712,9 @@ public final class StringUtils { return false; } - public static boolean probablyContainsEmoji(String s) { + public static boolean probablyContainsEmoji(final String s) { int offset = 0; - int length = s.length(); + final int length = s.length(); while (offset < length) { int c = Character.codePointAt(s, offset); if (probablyIsEmojiCodePoint(c)) @@ -725,10 +725,19 @@ public final class StringUtils { } // seemingly arbitrary ranges taken from "somewhere on the internet" - public static boolean probablyIsEmojiCodePoint(int c) { + public static boolean probablyIsEmojiCodePoint(final int c) { return (0x200D <= c && c <= 0x3299) // ?? || (0x1F004 <= c && c <= 0x1F251) // ?? || (0x1F300 <= c && c <= 0x1FFFF) // ?? 
|| c == 0xFE0F; // variation selector emoji with color } + + public static boolean isLowerCaseAscii(final String s) { + final int length = s.length(); + for (int i = 0; i < length; i++) { + final int c = s.charAt(i); + if (c < 97 || c > 122) return false; + } + return true; + } } diff --git a/app/src/test/java/org/dslul/openboard/inputmethod/latin/SuggestTest.kt b/app/src/test/java/org/dslul/openboard/inputmethod/latin/SuggestTest.kt index 845686a89..ef690cfda 100644 --- a/app/src/test/java/org/dslul/openboard/inputmethod/latin/SuggestTest.kt +++ b/app/src/test/java/org/dslul/openboard/inputmethod/latin/SuggestTest.kt @@ -62,7 +62,7 @@ class SuggestTest { // not corrected because first empty score not high enough } - @Test fun `not "ill" to "I'll" if both were used before in this context`() { + @Test fun `"ill" to "I'll" if both have same ngram score`() { val locale = Locale.ENGLISH val result = shouldBeAutoCorrected( "ill", @@ -72,8 +72,20 @@ class SuggestTest { locale, thresholdModest ) + assert(result.last()) // should be corrected + } + + @Test fun `no "ill" to "I'll" if "ill" has somewhat better ngram score`() { + val locale = Locale.ENGLISH + val result = shouldBeAutoCorrected( + "ill", + listOf(suggestion("I'll", Int.MAX_VALUE, locale), suggestion("ill", 1500000, locale)), + suggestion("I'll", 200, locale), + suggestion("ill", 211, locale), + locale, + thresholdModest + ) assert(!result.last()) // should not be corrected - // essentially same as `not "ill" to "I'll" if only "ill" was used before in this context` } @Test fun `no English "I" for Polish "i" when typing in Polish`() { @@ -119,17 +131,70 @@ class SuggestTest { // not corrected because of locale matching } - @Test fun `no "lé" instead of "le"`() { + @Test fun `no "né" instead of "ne"`() { val result = shouldBeAutoCorrected( - "le", - listOf(suggestion("le", 1900000, Locale.FRENCH), suggestion("lé", 1500000, Locale.FRENCH)), + "ne", + listOf(suggestion("ne", 1900000, Locale.FRENCH), 
suggestion("né", 1900000-1, Locale.FRENCH)), null, null, Locale.FRENCH, thresholdModest ) assert(!result.last()) // should not be corrected - // not corrected because of score difference + // not corrected because score is lower + } + + @Test fun `"né" instead of "ne" if "né" in ngram context`() { + val locale = Locale.FRENCH + val result = shouldBeAutoCorrected( + "ne", + listOf(suggestion("ne", 1900000, locale), suggestion("né", 1900000-1, locale)), + suggestion("né", 200, locale), + null, + locale, + thresholdModest + ) + assert(result.last()) // should be corrected + } + + @Test fun `"né" instead of "ne" if "né" has clearly better score in ngram context`() { + val locale = Locale.FRENCH + val result = shouldBeAutoCorrected( + "ne", + listOf(suggestion("ne", 1900000, locale), suggestion("né", 1900000-1, locale)), + suggestion("né", 215, locale), + suggestion("ne", 200, locale), + locale, + thresholdModest + ) + assert(result.last()) // should be corrected + } + + @Test fun `no "né" instead of "ne" if both with same score in ngram context`() { + val locale = Locale.FRENCH + val result = shouldBeAutoCorrected( + "ne", + listOf(suggestion("ne", 1900000, locale), suggestion("né", 1900000-1, locale)), + suggestion("né", 200, locale), + suggestion("ne", 200, locale), + locale, + thresholdModest + ) + assert(!result.last()) // should not be corrected + } + + @Test fun `no "ne" instead of "né"`() { + val locale = Locale.FRENCH + val result = shouldBeAutoCorrected( + "né", + listOf(suggestion("ne", 600000, locale), suggestion("né", 1600000, locale)), + suggestion("né", 200, locale), + suggestion("ne", 200, locale), + locale, + thresholdModest + ) + assert(!result.last()) // should not be corrected + // not even allowed to check because of low score for ne } }