HeliBoard/app/src/main/java/helium314/keyboard/latin/utils/ScriptUtils.kt

/*
 * Copyright (C) 2012 The Android Open Source Project
 * modified
 * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only
 */
package helium314.keyboard.latin.utils

import java.util.Locale

/**
 * Helpers for working with writing scripts.
 *
 * Scripts are identified by their four-letter ISO 15924 code as used in language tags
 * (e.g. "Latn" for Latin). The list of known scripts is incomplete.
 */
object ScriptUtils {
    // Unicode scripts (ISO 15924), incomplete
    const val SCRIPT_UNKNOWN = "" // Used for hardware keyboards
    const val SCRIPT_ARABIC = "Arab"
    const val SCRIPT_ARMENIAN = "Armn"
    const val SCRIPT_BENGALI = "Beng"
    const val SCRIPT_CYRILLIC = "Cyrl"
    const val SCRIPT_DEVANAGARI = "Deva"
    const val SCRIPT_GEORGIAN = "Geor"
    const val SCRIPT_GREEK = "Grek"
    const val SCRIPT_HEBREW = "Hebr"
    const val SCRIPT_KANNADA = "Knda"
    const val SCRIPT_KHMER = "Khmr"
    const val SCRIPT_LAO = "Laoo"
    const val SCRIPT_LATIN = "Latn"
    const val SCRIPT_MALAYALAM = "Mlym"
    const val SCRIPT_MYANMAR = "Mymr"
    const val SCRIPT_SINHALA = "Sinh"
    const val SCRIPT_TAMIL = "Taml"
    const val SCRIPT_TELUGU = "Telu"
    const val SCRIPT_THAI = "Thai"
    const val SCRIPT_HANGUL = "Hang"
    const val SCRIPT_GUJARATI = "Gujr"

    /**
     * Returns whether the script of [locale] (as resolved by [script]) is treated as
     * having distinct upper and lower case letters.
     *
     * @param locale the locale whose script is checked
     * @return `true` for Latin, Cyrillic, Greek and Armenian, `false` for everything else
     */
    @JvmStatic
    fun scriptSupportsUppercase(locale: Locale): Boolean {
        // only Latin, Cyrillic, Greek and Armenian have upper/lower case
        // https://unicode.org/faq/casemap_charprop.html#3
        return when (locale.script()) {
            SCRIPT_LATIN, SCRIPT_CYRILLIC, SCRIPT_GREEK, SCRIPT_ARMENIAN -> true
            else -> false
        }
    }

    /**
     * Returns whether [codePoint] lies in the code point ranges accepted for [script].
     *
     * For Latin and Cyrillic the code point must additionally satisfy [Character.isLetter].
     * For all other scripts only range membership is tested, so any code point inside
     * the listed ranges is accepted. [SCRIPT_UNKNOWN] accepts every code point.
     *
     * @param codePoint the Unicode code point to test
     * @param script one of the `SCRIPT_*` constants of this object
     * @return `true` if the code point is accepted for the script
     * @throws RuntimeException if [script] is not one of the `SCRIPT_*` constants
     */
    @JvmStatic
    fun isLetterPartOfScript(codePoint: Int, script: String): Boolean {
        return when (script) {
            SCRIPT_ARABIC ->
                // Arabic letters can be in any of the following blocks:
                // Arabic U+0600..U+06FF
                // Arabic Supplement, Thaana U+0750..U+077F, U+0780..U+07BF
                // Arabic Extended-A U+08A0..U+08FF
                // Arabic Presentation Forms-A U+FB50..U+FDFF
                // Arabic Presentation Forms-B U+FE70..U+FEFF
                codePoint in 0x600..0x6FF
                        || codePoint in 0x750..0x7BF
                        || codePoint in 0x8A0..0x8FF
                        || codePoint in 0xFB50..0xFDFF
                        || codePoint in 0xFE70..0xFEFF
            SCRIPT_ARMENIAN ->
                // Armenian letters are in the Armenian unicode block, U+0530..U+058F and
                // Alphabetic Presentation Forms block, U+FB00..U+FB4F, but only in the Armenian part
                // of that block, which is U+FB13..U+FB17.
                codePoint in 0x530..0x58F || codePoint in 0xFB13..0xFB17
            SCRIPT_BENGALI ->
                // Bengali unicode block is U+0980..U+09FF
                codePoint in 0x980..0x9FF
            SCRIPT_CYRILLIC ->
                // All Cyrillic characters are in the 400~52F block. There are some in the upper
                // Unicode range, but they are archaic characters that are not used in modern
                // Russian and are not used by our dictionary.
                codePoint in 0x400..0x52F && Character.isLetter(codePoint)
            SCRIPT_DEVANAGARI ->
                // Devanagari unicode block is U+0900..U+097F
                codePoint in 0x900..0x97F
            SCRIPT_GEORGIAN ->
                // Georgian letters are in the Georgian unicode block, U+10A0..U+10FF,
                // or Georgian supplement block, U+2D00..U+2D2F
                codePoint in 0x10A0..0x10FF || codePoint in 0x2D00..0x2D2F
            SCRIPT_GREEK ->
                // Greek letters are either in the 370~3FF range (Greek & Coptic), or in the
                // 1F00~1FFF range (Greek extended). Our dictionary contains both sort of characters.
                // Our dictionary also contains a few words with 0xF2; it would be best to check
                // if that's correct, but a web search does return results for these words so
                // they are probably okay.
                codePoint in 0x370..0x3FF || codePoint in 0x1F00..0x1FFF || codePoint == 0xF2
            SCRIPT_HEBREW ->
                // Hebrew letters are in the Hebrew unicode block, which spans from U+0590 to U+05FF,
                // or in the Alphabetic Presentation Forms block, U+FB00..U+FB4F, but only in the
                // Hebrew part of that block, which is U+FB1D..U+FB4F.
                codePoint in 0x590..0x5FF || codePoint in 0xFB1D..0xFB4F
            SCRIPT_KANNADA ->
                // Kannada unicode block is U+0C80..U+0CFF
                codePoint in 0xC80..0xCFF
            SCRIPT_KHMER ->
                // Khmer letters are in unicode block U+1780..U+17FF, and the Khmer symbols block
                // is U+19E0..U+19FF
                codePoint in 0x1780..0x17FF || codePoint in 0x19E0..0x19FF
            SCRIPT_LAO ->
                // The Lao block is U+0E80..U+0EFF
                codePoint in 0xE80..0xEFF
            SCRIPT_LATIN ->
                // Our supported latin script dictionaries (EFIGS) at the moment only include
                // characters in the C0, C1, Latin Extended A and B, IPA extensions unicode
                // blocks. As it happens, those are back-to-back in the code range 0x40 to 0x2AF,
                // so the below is a very efficient way to test for it. As for the 0-0x3F, it's
                // excluded from isLetter anyway.
                codePoint <= 0x2AF && Character.isLetter(codePoint)
            SCRIPT_MALAYALAM ->
                // Malayalam unicode block is U+0D00..U+0D7F
                codePoint in 0xD00..0xD7F
            SCRIPT_MYANMAR ->
                // Myanmar has three unicode blocks :
                // Myanmar U+1000..U+109F
                // Myanmar extended-A U+AA60..U+AA7F
                // Myanmar extended-B U+A9E0..U+A9FF
                codePoint in 0x1000..0x109F || codePoint in 0xAA60..0xAA7F || codePoint in 0xA9E0..0xA9FF
            SCRIPT_SINHALA ->
                // Sinhala unicode block is U+0D80..U+0DFF
                codePoint in 0xD80..0xDFF
            SCRIPT_TAMIL ->
                // Tamil unicode block is U+0B80..U+0BFF
                codePoint in 0xB80..0xBFF
            SCRIPT_TELUGU ->
                // Telugu unicode block is U+0C00..U+0C7F
                codePoint in 0xC00..0xC7F
            SCRIPT_THAI ->
                // Thai unicode block is U+0E00..U+0E7F
                codePoint in 0xE00..0xE7F
            SCRIPT_HANGUL -> codePoint in 0xAC00..0xD7A3
                    || codePoint in 0x3131..0x318E
                    || codePoint in 0x1100..0x11FF
                    || codePoint in 0xA960..0xA97C
                    || codePoint in 0xD7B0..0xD7C6
                    || codePoint in 0xD7CB..0xD7FB
            SCRIPT_GUJARATI ->
                // Gujarati unicode block is U+0A80..U+0AFF
                codePoint in 0xA80..0xAFF
            SCRIPT_UNKNOWN -> true
            else -> throw RuntimeException("Unknown value of script: $script")
        }
    }

    /**
     * Returns the script of this locale, falling back to a default script per language.
     *
     * Resolution order:
     * 1. the script explicitly set on the locale, if any;
     * 2. [SCRIPT_LATIN] (with a logged warning) if the country is "ZZ", ignoring case;
     * 3. the default script listed below for the locale's language;
     * 4. [SCRIPT_LATIN] for every language not listed.
     *
     * @return an ISO 15924 script code; a `SCRIPT_*` constant unless the locale carries
     *  an explicit script, in which case that script is returned unchanged
     */
    @JvmStatic
    fun Locale.script(): String {
        if (script.isNotEmpty()) return script
        if (country.equals("ZZ", true)) {
            Log.w("ScriptUtils", "old _ZZ locale found: $this")
            return SCRIPT_LATIN
        }
        return when (language) {
            "ar", "ur", "fa" -> SCRIPT_ARABIC
            "hy" -> SCRIPT_ARMENIAN
            "bn" -> SCRIPT_BENGALI
            "sr", "mk", "ru", "uk", "mn", "be", "kk", "ky", "bg", "xdq", "cv", "mhr", "mns" -> SCRIPT_CYRILLIC
            "ka" -> SCRIPT_GEORGIAN
            "el" -> SCRIPT_GREEK
            // Locale historically rewrote "he" to the obsolete code "iw", but since Java 17
            // it reports the current code "he" by default, so both must be accepted.
            "iw", "he" -> SCRIPT_HEBREW
            "km" -> SCRIPT_KHMER
            "lo" -> SCRIPT_LAO
            "ml" -> SCRIPT_MALAYALAM
            "my" -> SCRIPT_MYANMAR
            "si" -> SCRIPT_SINHALA
            "ta" -> SCRIPT_TAMIL
            "te" -> SCRIPT_TELUGU
            "th" -> SCRIPT_THAI
            "ko" -> SCRIPT_HANGUL
            "hi", "mr", "ne" -> SCRIPT_DEVANAGARI
            "kn" -> SCRIPT_KANNADA
            "gu" -> SCRIPT_GUJARATI
            else -> SCRIPT_LATIN // use as fallback
        }
    }
}
Use language tags (#445) WARNING: due to renames, your existing user history and blacklist files might not be used after this commit. If you build the app with this commit, back up and restore settings to fix it. Use language tags for identifying a string locale, not Locale.toString. This allows to avoid issues with non-default scripts, e.g. we can now use `sr-Latn` instead of the `sr_ZZ` workaround. Existing files are not renamed, but rename will happen when restoring backups. Most of the occurrences of a locale string have been replaced with Locale where possible. One notable exception is in user dictionary settings, where the locale string must be used to retrieve contents from system personal dictionary. Internal script IDs are switched to string as used in language tags, e.g. Latn for latin. This allows for correct interpretation of a Locale with explicitly specified script. 2024-01-28 10:42:42 +01:00			`/*`
			`* Copyright (C) 2012 The Android Open Source Project`
			`* modified`
			`* SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only`
			`*/`
rename package introduces weird bugs for some reason 2024-01-31 18:32:43 +01:00			`package helium314.keyboard.latin.utils`
Use language tags (#445) WARNING: due to renames, your existing user history and blacklist files might not be used after this commit. If you build the app with this commit, back up and restore settings to fix it. Use language tags for identifying a string locale, not Locale.toString. This allows to avoid issues with non-default scripts, e.g. we can now use `sr-Latn` instead of the `sr_ZZ` workaround. Existing files are not renamed, but rename will happen when restoring backups. Most of the occurrences of a locale string have been replaced with Locale where possible. One notable exception is in user dictionary settings, where the locale string must be used to retrieve contents from system personal dictionary. Internal script IDs are switched to string as used in language tags, e.g. Latn for latin. This allows for correct interpretation of a Locale with explicitly specified script. 2024-01-28 10:42:42 +01:00
			`import java.util.Locale`

			`/**`
			`* A class to help with handling different writing scripts.`
			`*/`
			`object ScriptUtils {`
			`// Unicode scripts (ISO 15924), incomplete`
			`const val SCRIPT_UNKNOWN = "" // Used for hardware keyboards`
			`const val SCRIPT_ARABIC = "Arab"`
			`const val SCRIPT_ARMENIAN = "Armn"`
			`const val SCRIPT_BENGALI = "Beng"`
			`const val SCRIPT_CYRILLIC = "Cyrl"`
			`const val SCRIPT_DEVANAGARI = "Deva"`
			`const val SCRIPT_GEORGIAN = "Geor"`
			`const val SCRIPT_GREEK = "Grek"`
			`const val SCRIPT_HEBREW = "Hebr"`
			`const val SCRIPT_KANNADA = "Knda"`
			`const val SCRIPT_KHMER = "Khmr"`
			`const val SCRIPT_LAO = "Laoo"`
			`const val SCRIPT_LATIN = "Latn"`
			`const val SCRIPT_MALAYALAM = "Mlym"`
			`const val SCRIPT_MYANMAR = "Mymr"`
			`const val SCRIPT_SINHALA = "Sinh"`
			`const val SCRIPT_TAMIL = "Taml"`
			`const val SCRIPT_TELUGU = "Telu"`
			`const val SCRIPT_THAI = "Thai"`
			`const val SCRIPT_HANGUL = "Hang"`
			`const val SCRIPT_GUJARATI = "Gujr"`

			`@JvmStatic`
			`fun scriptSupportsUppercase(locale: Locale): Boolean {`
			`// only Latin, Cyrillic, Greek and Armenian have upper/lower case`
			`// https://unicode.org/faq/casemap_charprop.html#3`
			`return when (locale.script()) {`
			`SCRIPT_LATIN, SCRIPT_CYRILLIC, SCRIPT_GREEK, SCRIPT_ARMENIAN -> true`
			`else -> false`
			`}`
			`}`

			`/*`
			`* Returns whether the code point is a letter that makes sense for the specified`
			`* locale for this spell checker.`
			`* The dictionaries supported by Latin IME are described in res/xml/spellchecker.xml`
			`* and is limited to EFIGS languages and Russian.`
			`* Hence at the moment this explicitly tests for Cyrillic characters or Latin characters`
			`* as appropriate, and explicitly excludes CJK, Arabic and Hebrew characters.`
			`*/`
			`@JvmStatic`
			`fun isLetterPartOfScript(codePoint: Int, script: String): Boolean {`
			`return when (script) {`
			`SCRIPT_ARABIC ->`
			`// Arabic letters can be in any of the following blocks:`
			`// Arabic U+0600..U+06FF`
			`// Arabic Supplement, Thaana U+0750..U+077F, U+0780..U+07BF`
			`// Arabic Extended-A U+08A0..U+08FF`
			`// Arabic Presentation Forms-A U+FB50..U+FDFF`
			`// Arabic Presentation Forms-B U+FE70..U+FEFF`
			`codePoint in 0x600..0x6FF`
			`\|\| codePoint in 0x750..0x7BF`
			`\|\| codePoint in 0x8A0..0x8FF`
			`\|\| codePoint in 0xFB50..0xFDFF`
			`\|\| codePoint in 0xFE70..0xFEFF`
			`SCRIPT_ARMENIAN ->`
			`// Armenian letters are in the Armenian unicode block, U+0530..U+058F and`
			`// Alphabetic Presentation Forms block, U+FB00..U+FB4F, but only in the Armenian part`
			`// of that block, which is U+FB13..U+FB17.`
			`codePoint in 0x530..0x58F \|\| codePoint in 0xFB13..0xFB17`
			`SCRIPT_BENGALI ->`
			`// Bengali unicode block is U+0980..U+09FF`
			`codePoint in 0x980..0x9FF`
			`SCRIPT_CYRILLIC ->`
			`// All Cyrillic characters are in the 400~52F block. There are some in the upper`
			`// Unicode range, but they are archaic characters that are not used in modern`
			`// Russian and are not used by our dictionary.`
			`codePoint in 0x400..0x52F && Character.isLetter(codePoint)`
			`SCRIPT_DEVANAGARI ->`
			`// Devanagari unicode block is +0900..U+097F`
			`codePoint in 0x900..0x97F`
			`SCRIPT_GEORGIAN ->`
			`// Georgian letters are in the Georgian unicode block, U+10A0..U+10FF,`
			`// or Georgian supplement block, U+2D00..U+2D2F`
			`codePoint in 0x10A0..0x10FF \|\| codePoint in 0x2D00..0x2D2F`
			`SCRIPT_GREEK ->`
			`// Greek letters are either in the 370~3FF range (Greek & Coptic), or in the`
			`// 1F00~1FFF range (Greek extended). Our dictionary contains both sort of characters.`
			`// Our dictionary also contains a few words with 0xF2; it would be best to check`
			`// if that's correct, but a web search does return results for these words so`
			`// they are probably okay.`
			`codePoint in 0x370..0x3FF \|\| codePoint in 0x1F00..0x1FFF \|\| codePoint == 0xF2`
			`SCRIPT_HEBREW ->`
			`// Hebrew letters are in the Hebrew unicode block, which spans from U+0590 to U+05FF,`
			`// or in the Alphabetic Presentation Forms block, U+FB00..U+FB4F, but only in the`
			`// Hebrew part of that block, which is U+FB1D..U+FB4F.`
			`codePoint in 0x590..0x5FF \|\| codePoint in 0xFB1D..0xFB4F`
			`SCRIPT_KANNADA ->`
			`// Kannada unicode block is U+0C80..U+0CFF`
			`codePoint in 0xC80..0xCFF`
			`SCRIPT_KHMER ->`
			`// Khmer letters are in unicode block U+1780..U+17FF, and the Khmer symbols block`
			`// is U+19E0..U+19FF`
			`codePoint in 0x1780..0x17FF \|\| codePoint in 0x19E0..0x19FF`
			`SCRIPT_LAO ->`
			`// The Lao block is U+0E80..U+0EFF`
			`codePoint in 0xE80..0xEFF`
			`SCRIPT_LATIN ->`
			`// Our supported latin script dictionaries (EFIGS) at the moment only include`
			`// characters in the C0, C1, Latin Extended A and B, IPA extensions unicode`
			`// blocks. As it happens, those are back-to-back in the code range 0x40 to 0x2AF,`
			`// so the below is a very efficient way to test for it. As for the 0-0x3F, it's`
			`// excluded from isLetter anyway.`
			`codePoint <= 0x2AF && Character.isLetter(codePoint)`
			`SCRIPT_MALAYALAM ->`
			`// Malayalam unicode block is U+0D00..U+0D7F`
			`codePoint in 0xD00..0xD7F`
			`SCRIPT_MYANMAR ->`
			`// Myanmar has three unicode blocks :`
			`// Myanmar U+1000..U+109F`
			`// Myanmar extended-A U+AA60..U+AA7F`
			`// Myanmar extended-B U+A9E0..U+A9FF`
			`codePoint in 0x1000..0x109F \|\| codePoint in 0xAA60..0xAA7F \|\| codePoint in 0xA9E0..0xA9FF`
			`SCRIPT_SINHALA ->`
			`// Sinhala unicode block is U+0D80..U+0DFF`
			`codePoint in 0xD80..0xDFF`
			`SCRIPT_TAMIL ->`
			`// Tamil unicode block is U+0B80..U+0BFF`
			`codePoint in 0xB80..0xBFF`
			`SCRIPT_TELUGU ->`
			`// Telugu unicode block is U+0C00..U+0C7F`
			`codePoint in 0xC00..0xC7F`
			`SCRIPT_THAI ->`
			`// Thai unicode block is U+0E00..U+0E7F`
			`codePoint in 0xE00..0xE7F`
			`SCRIPT_HANGUL -> codePoint in 0xAC00..0xD7A3`
			`\|\| codePoint in 0x3131..0x318E`
			`\|\| codePoint in 0x1100..0x11FF`
			`\|\| codePoint in 0xA960..0xA97C`
			`\|\| codePoint in 0xD7B0..0xD7C6`
			`\|\| codePoint in 0xD7CB..0xD7FB`
			`SCRIPT_GUJARATI ->`
			`// Gujarati unicode block is U+0A80..U+0AFF`
			`codePoint in 0xA80..0xAFF`
			`SCRIPT_UNKNOWN -> true`
			`else -> throw RuntimeException("Unknown value of script: $script")`
			`}`
			`}`

			`/**`
			`* returns the locale script with fallback to default scripts`
			`*/`
			`@JvmStatic`
			`fun Locale.script(): String {`
			`if (script.isNotEmpty()) return script`
			`if (country.equals("ZZ", true)) {`
			`Log.w("ScriptUtils", "old _ZZ locale found: $this")`
			`return SCRIPT_LATIN`
			`}`
			`return when (language) {`
			`"ar", "ur", "fa" -> SCRIPT_ARABIC`
			`"hy" -> SCRIPT_ARMENIAN`
			`"bn" -> SCRIPT_BENGALI`
Add Mansi layout (#791) 2024-05-17 23:27:23 +05:30			`"sr", "mk", "ru", "uk", "mn", "be", "kk", "ky", "bg", "xdq", "cv", "mhr", "mns" -> SCRIPT_CYRILLIC`
Use language tags (#445) WARNING: due to renames, your existing user history and blacklist files might not be used after this commit. If you build the app with this commit, back up and restore settings to fix it. Use language tags for identifying a string locale, not Locale.toString. This allows to avoid issues with non-default scripts, e.g. we can now use `sr-Latn` instead of the `sr_ZZ` workaround. Existing files are not renamed, but rename will happen when restoring backups. Most of the occurrences of a locale string have been replaced with Locale where possible. One notable exception is in user dictionary settings, where the locale string must be used to retrieve contents from system personal dictionary. Internal script IDs are switched to string as used in language tags, e.g. Latn for latin. This allows for correct interpretation of a Locale with explicitly specified script. 2024-01-28 10:42:42 +01:00			`"ka" -> SCRIPT_GEORGIAN`
			`"el" -> SCRIPT_GREEK`
			`"iw" -> SCRIPT_HEBREW`
			`"km" -> SCRIPT_KHMER`
			`"lo" -> SCRIPT_LAO`
			`"ml" -> SCRIPT_MALAYALAM`
			`"my" -> SCRIPT_MYANMAR`
			`"si" -> SCRIPT_SINHALA`
			`"ta" -> SCRIPT_TAMIL`
			`"te" -> SCRIPT_TELUGU`
			`"th" -> SCRIPT_THAI`
			`"ko" -> SCRIPT_HANGUL`
			`"hi", "mr", "ne" -> SCRIPT_DEVANAGARI`
			`"kn" -> SCRIPT_KANNADA`
			`"gu" -> SCRIPT_GUJARATI`
			`else -> SCRIPT_LATIN // use as fallback`
			`}`
			`}`
			`}`