/* * Copyright (C) 2012 The Android Open Source Project * modified * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only */ package helium314.keyboard.latin.utils import java.util.Locale /** * A class to help with handling different writing scripts. */ object ScriptUtils { // Unicode scripts (ISO 15924), incomplete const val SCRIPT_UNKNOWN = "" // Used for hardware keyboards const val SCRIPT_ARABIC = "Arab" const val SCRIPT_ARMENIAN = "Armn" const val SCRIPT_BENGALI = "Beng" const val SCRIPT_CYRILLIC = "Cyrl" const val SCRIPT_DEVANAGARI = "Deva" const val SCRIPT_GEORGIAN = "Geor" const val SCRIPT_GREEK = "Grek" const val SCRIPT_HEBREW = "Hebr" const val SCRIPT_KANNADA = "Knda" const val SCRIPT_KHMER = "Khmr" const val SCRIPT_LAO = "Laoo" const val SCRIPT_LATIN = "Latn" const val SCRIPT_MALAYALAM = "Mlym" const val SCRIPT_MYANMAR = "Mymr" const val SCRIPT_SINHALA = "Sinh" const val SCRIPT_TAMIL = "Taml" const val SCRIPT_TELUGU = "Telu" const val SCRIPT_THAI = "Thai" const val SCRIPT_HANGUL = "Hang" const val SCRIPT_GUJARATI = "Gujr" @JvmStatic fun scriptSupportsUppercase(locale: Locale): Boolean { // only Latin, Cyrillic, Greek and Armenian have upper/lower case // https://unicode.org/faq/casemap_charprop.html#3 return when (locale.script()) { SCRIPT_LATIN, SCRIPT_CYRILLIC, SCRIPT_GREEK, SCRIPT_ARMENIAN -> true else -> false } } /* * Returns whether the code point is a letter that makes sense for the specified * locale for this spell checker. * The dictionaries supported by Latin IME are described in res/xml/spellchecker.xml * and is limited to EFIGS languages and Russian. * Hence at the moment this explicitly tests for Cyrillic characters or Latin characters * as appropriate, and explicitly excludes CJK, Arabic and Hebrew characters. */ @JvmStatic fun isLetterPartOfScript(codePoint: Int, script: String): Boolean { return when (script) { SCRIPT_ARABIC -> // Arabic letters can be in any of the following blocks: // Arabic U+0600..U+06FF // Arabic Supplement, Thaana U+0750..U+077F, U+0780..U+07BF // Arabic Extended-A U+08A0..U+08FF // Arabic Presentation Forms-A U+FB50..U+FDFF // Arabic Presentation Forms-B U+FE70..U+FEFF codePoint in 0x600..0x6FF || codePoint in 0x750..0x7BF || codePoint in 0x8A0..0x8FF || codePoint in 0xFB50..0xFDFF || codePoint in 0xFE70..0xFEFF SCRIPT_ARMENIAN -> // Armenian letters are in the Armenian unicode block, U+0530..U+058F and // Alphabetic Presentation Forms block, U+FB00..U+FB4F, but only in the Armenian part // of that block, which is U+FB13..U+FB17. codePoint in 0x530..0x58F || codePoint in 0xFB13..0xFB17 SCRIPT_BENGALI -> // Bengali unicode block is U+0980..U+09FF codePoint in 0x980..0x9FF SCRIPT_CYRILLIC -> // All Cyrillic characters are in the 400~52F block. There are some in the upper // Unicode range, but they are archaic characters that are not used in modern // Russian and are not used by our dictionary. codePoint in 0x400..0x52F && Character.isLetter(codePoint) SCRIPT_DEVANAGARI -> // Devanagari unicode block is +0900..U+097F codePoint in 0x900..0x97F SCRIPT_GEORGIAN -> // Georgian letters are in the Georgian unicode block, U+10A0..U+10FF, // or Georgian supplement block, U+2D00..U+2D2F codePoint in 0x10A0..0x10FF || codePoint in 0x2D00..0x2D2F SCRIPT_GREEK -> // Greek letters are either in the 370~3FF range (Greek & Coptic), or in the // 1F00~1FFF range (Greek extended). Our dictionary contains both sort of characters. // Our dictionary also contains a few words with 0xF2; it would be best to check // if that's correct, but a web search does return results for these words so // they are probably okay. codePoint in 0x370..0x3FF || codePoint in 0x1F00..0x1FFF || codePoint == 0xF2 SCRIPT_HEBREW -> // Hebrew letters are in the Hebrew unicode block, which spans from U+0590 to U+05FF, // or in the Alphabetic Presentation Forms block, U+FB00..U+FB4F, but only in the // Hebrew part of that block, which is U+FB1D..U+FB4F. codePoint in 0x590..0x5FF || codePoint in 0xFB1D..0xFB4F SCRIPT_KANNADA -> // Kannada unicode block is U+0C80..U+0CFF codePoint in 0xC80..0xCFF SCRIPT_KHMER -> // Khmer letters are in unicode block U+1780..U+17FF, and the Khmer symbols block // is U+19E0..U+19FF codePoint in 0x1780..0x17FF || codePoint in 0x19E0..0x19FF SCRIPT_LAO -> // The Lao block is U+0E80..U+0EFF codePoint in 0xE80..0xEFF SCRIPT_LATIN -> // Our supported latin script dictionaries (EFIGS) at the moment only include // characters in the C0, C1, Latin Extended A and B, IPA extensions unicode // blocks. As it happens, those are back-to-back in the code range 0x40 to 0x2AF, // so the below is a very efficient way to test for it. As for the 0-0x3F, it's // excluded from isLetter anyway. codePoint <= 0x2AF && Character.isLetter(codePoint) SCRIPT_MALAYALAM -> // Malayalam unicode block is U+0D00..U+0D7F codePoint in 0xD00..0xD7F SCRIPT_MYANMAR -> // Myanmar has three unicode blocks : // Myanmar U+1000..U+109F // Myanmar extended-A U+AA60..U+AA7F // Myanmar extended-B U+A9E0..U+A9FF codePoint in 0x1000..0x109F || codePoint in 0xAA60..0xAA7F || codePoint in 0xA9E0..0xA9FF SCRIPT_SINHALA -> // Sinhala unicode block is U+0D80..U+0DFF codePoint in 0xD80..0xDFF SCRIPT_TAMIL -> // Tamil unicode block is U+0B80..U+0BFF codePoint in 0xB80..0xBFF SCRIPT_TELUGU -> // Telugu unicode block is U+0C00..U+0C7F codePoint in 0xC00..0xC7F SCRIPT_THAI -> // Thai unicode block is U+0E00..U+0E7F codePoint in 0xE00..0xE7F SCRIPT_HANGUL -> codePoint in 0xAC00..0xD7A3 || codePoint in 0x3131..0x318E || codePoint in 0x1100..0x11FF || codePoint in 0xA960..0xA97C || codePoint in 0xD7B0..0xD7C6 || codePoint in 0xD7CB..0xD7FB SCRIPT_GUJARATI -> // Gujarati unicode block is U+0A80..U+0AFF codePoint in 0xA80..0xAFF SCRIPT_UNKNOWN -> true else -> throw RuntimeException("Unknown value of script: $script") } } /** * returns the locale script with fallback to default scripts */ @JvmStatic fun Locale.script(): String { if (script.isNotEmpty()) return script if (country.equals("ZZ", true)) { Log.w("ScriptUtils", "old _ZZ locale found: $this") return SCRIPT_LATIN } return when (language) { "ar", "ur", "fa" -> SCRIPT_ARABIC "hy" -> SCRIPT_ARMENIAN "bn" -> SCRIPT_BENGALI "sr", "mk", "ru", "uk", "mn", "be", "kk", "ky", "bg", "xdq", "cv", "mhr", "mns" -> SCRIPT_CYRILLIC "ka" -> SCRIPT_GEORGIAN "el" -> SCRIPT_GREEK "iw" -> SCRIPT_HEBREW "km" -> SCRIPT_KHMER "lo" -> SCRIPT_LAO "ml" -> SCRIPT_MALAYALAM "my" -> SCRIPT_MYANMAR "si" -> SCRIPT_SINHALA "ta" -> SCRIPT_TAMIL "te" -> SCRIPT_TELUGU "th" -> SCRIPT_THAI "ko" -> SCRIPT_HANGUL "hi", "mr", "ne" -> SCRIPT_DEVANAGARI "kn" -> SCRIPT_KANNADA "gu" -> SCRIPT_GUJARATI else -> SCRIPT_LATIN // use as fallback } } }