HeliBoard/app/src/main/java/helium314/keyboard/latin/utils/ScriptUtils.kt
2024-05-17 19:57:23 +02:00

187 lines
8.5 KiB
Kotlin

/*
* Copyright (C) 2012 The Android Open Source Project
* modified
* SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only
*/
package helium314.keyboard.latin.utils
import java.util.Locale
/**
 * Helpers for working with writing scripts.
 *
 * Scripts are identified by their four-letter ISO 15924 codes (see the `SCRIPT_*` constants).
 * The list of known scripts is intentionally incomplete and only covers what the keyboard needs.
 */
object ScriptUtils {
    // Unicode scripts (ISO 15924), incomplete
    const val SCRIPT_UNKNOWN = "" // Used for hardware keyboards
    const val SCRIPT_ARABIC = "Arab"
    const val SCRIPT_ARMENIAN = "Armn"
    const val SCRIPT_BENGALI = "Beng"
    const val SCRIPT_CYRILLIC = "Cyrl"
    const val SCRIPT_DEVANAGARI = "Deva"
    const val SCRIPT_GEORGIAN = "Geor"
    const val SCRIPT_GREEK = "Grek"
    const val SCRIPT_HEBREW = "Hebr"
    const val SCRIPT_KANNADA = "Knda"
    const val SCRIPT_KHMER = "Khmr"
    const val SCRIPT_LAO = "Laoo"
    const val SCRIPT_LATIN = "Latn"
    const val SCRIPT_MALAYALAM = "Mlym"
    const val SCRIPT_MYANMAR = "Mymr"
    const val SCRIPT_SINHALA = "Sinh"
    const val SCRIPT_TAMIL = "Taml"
    const val SCRIPT_TELUGU = "Telu"
    const val SCRIPT_THAI = "Thai"
    const val SCRIPT_HANGUL = "Hang"
    const val SCRIPT_GUJARATI = "Gujr"

    /**
     * Returns whether the script of [locale] (as determined by [script]) distinguishes
     * upper and lower case.
     */
    @JvmStatic
    fun scriptSupportsUppercase(locale: Locale): Boolean {
        // only Latin, Cyrillic, Greek and Armenian have upper/lower case
        // https://unicode.org/faq/casemap_charprop.html#3
        return when (locale.script()) {
            SCRIPT_LATIN, SCRIPT_CYRILLIC, SCRIPT_GREEK, SCRIPT_ARMENIAN -> true
            else -> false
        }
    }

    /**
     * Returns whether [codePoint] lies in the code point ranges this class associates
     * with [script].
     *
     * For most scripts this is a pure range check against the relevant Unicode block(s), so
     * non-letter characters of those blocks (digits, signs, ...) are accepted as well.
     * Only [SCRIPT_LATIN] and [SCRIPT_CYRILLIC] additionally require [Character.isLetter].
     * For [SCRIPT_UNKNOWN] every code point is accepted.
     *
     * @param codePoint the Unicode code point to test
     * @param script one of the `SCRIPT_*` constants of this class
     * @return `true` if the code point is considered part of the script
     * @throws RuntimeException if [script] is not one of the scripts handled here
     */
    @JvmStatic
    fun isLetterPartOfScript(codePoint: Int, script: String): Boolean {
        return when (script) {
            SCRIPT_ARABIC ->
                // Arabic letters can be in any of the following blocks:
                // Arabic U+0600..U+06FF
                // Arabic Supplement, Thaana U+0750..U+077F, U+0780..U+07BF
                // Arabic Extended-A U+08A0..U+08FF
                // Arabic Presentation Forms-A U+FB50..U+FDFF
                // Arabic Presentation Forms-B U+FE70..U+FEFF
                codePoint in 0x600..0x6FF
                        || codePoint in 0x750..0x7BF
                        || codePoint in 0x8A0..0x8FF
                        || codePoint in 0xFB50..0xFDFF
                        || codePoint in 0xFE70..0xFEFF
            SCRIPT_ARMENIAN ->
                // Armenian letters are in the Armenian unicode block, U+0530..U+058F and
                // Alphabetic Presentation Forms block, U+FB00..U+FB4F, but only in the Armenian part
                // of that block, which is U+FB13..U+FB17.
                codePoint in 0x530..0x58F || codePoint in 0xFB13..0xFB17
            SCRIPT_BENGALI ->
                // Bengali unicode block is U+0980..U+09FF
                codePoint in 0x980..0x9FF
            SCRIPT_CYRILLIC ->
                // All Cyrillic characters are in the 400~52F block. There are some in the upper
                // Unicode range, but they are archaic characters that are not used in modern
                // Russian and are not used by our dictionary.
                codePoint in 0x400..0x52F && Character.isLetter(codePoint)
            SCRIPT_DEVANAGARI ->
                // Devanagari unicode block is U+0900..U+097F
                codePoint in 0x900..0x97F
            SCRIPT_GEORGIAN ->
                // Georgian letters are in the Georgian unicode block, U+10A0..U+10FF,
                // or Georgian supplement block, U+2D00..U+2D2F
                codePoint in 0x10A0..0x10FF || codePoint in 0x2D00..0x2D2F
            SCRIPT_GREEK ->
                // Greek letters are either in the 370~3FF range (Greek & Coptic), or in the
                // 1F00~1FFF range (Greek extended). Our dictionary contains both sort of characters.
                // Our dictionary also contains a few words with 0xF2; it would be best to check
                // if that's correct, but a web search does return results for these words so
                // they are probably okay.
                codePoint in 0x370..0x3FF || codePoint in 0x1F00..0x1FFF || codePoint == 0xF2
            SCRIPT_HEBREW ->
                // Hebrew letters are in the Hebrew unicode block, which spans from U+0590 to U+05FF,
                // or in the Alphabetic Presentation Forms block, U+FB00..U+FB4F, but only in the
                // Hebrew part of that block, which is U+FB1D..U+FB4F.
                codePoint in 0x590..0x5FF || codePoint in 0xFB1D..0xFB4F
            SCRIPT_KANNADA ->
                // Kannada unicode block is U+0C80..U+0CFF
                codePoint in 0xC80..0xCFF
            SCRIPT_KHMER ->
                // Khmer letters are in unicode block U+1780..U+17FF, and the Khmer symbols block
                // is U+19E0..U+19FF
                codePoint in 0x1780..0x17FF || codePoint in 0x19E0..0x19FF
            SCRIPT_LAO ->
                // The Lao block is U+0E80..U+0EFF
                codePoint in 0xE80..0xEFF
            SCRIPT_LATIN ->
                // Our supported latin script dictionaries (EFIGS) at the moment only include
                // characters in the C0, C1, Latin Extended A and B, IPA extensions unicode
                // blocks. As it happens, those are back-to-back in the code range 0x40 to 0x2AF,
                // so the below is a very efficient way to test for it. As for the 0-0x3F, it's
                // excluded from isLetter anyway.
                codePoint <= 0x2AF && Character.isLetter(codePoint)
            SCRIPT_MALAYALAM ->
                // Malayalam unicode block is U+0D00..U+0D7F
                codePoint in 0xD00..0xD7F
            SCRIPT_MYANMAR ->
                // Myanmar has three unicode blocks :
                // Myanmar U+1000..U+109F
                // Myanmar extended-A U+AA60..U+AA7F
                // Myanmar extended-B U+A9E0..U+A9FF
                codePoint in 0x1000..0x109F || codePoint in 0xAA60..0xAA7F || codePoint in 0xA9E0..0xA9FF
            SCRIPT_SINHALA ->
                // Sinhala unicode block is U+0D80..U+0DFF
                codePoint in 0xD80..0xDFF
            SCRIPT_TAMIL ->
                // Tamil unicode block is U+0B80..U+0BFF
                codePoint in 0xB80..0xBFF
            SCRIPT_TELUGU ->
                // Telugu unicode block is U+0C00..U+0C7F
                codePoint in 0xC00..0xC7F
            SCRIPT_THAI ->
                // Thai unicode block is U+0E00..U+0E7F
                codePoint in 0xE00..0xE7F
            SCRIPT_HANGUL ->
                // Hangul is spread over several blocks:
                // Hangul Syllables U+AC00..U+D7A3
                // Hangul Compatibility Jamo U+3131..U+318E
                // Hangul Jamo U+1100..U+11FF
                // Hangul Jamo Extended-A U+A960..U+A97C
                // Hangul Jamo Extended-B U+D7B0..U+D7C6 and U+D7CB..U+D7FB
                codePoint in 0xAC00..0xD7A3
                        || codePoint in 0x3131..0x318E
                        || codePoint in 0x1100..0x11FF
                        || codePoint in 0xA960..0xA97C
                        || codePoint in 0xD7B0..0xD7C6
                        || codePoint in 0xD7CB..0xD7FB
            SCRIPT_GUJARATI ->
                // Gujarati unicode block is U+0A80..U+0AFF
                codePoint in 0xA80..0xAFF
            // no script information available, so nothing can be ruled out
            SCRIPT_UNKNOWN -> true
            else -> throw RuntimeException("Unknown value of script: $script")
        }
    }

    /**
     * Returns the script of this locale, falling back to a default script derived from the
     * language when the locale carries no explicit script.
     *
     * Resolution order:
     * 1. the locale's own script, if not empty (returned as-is, even if it is not one of the
     *    `SCRIPT_*` constants)
     * 2. [SCRIPT_LATIN] for locales whose country is "ZZ" (case-insensitive); a warning is logged
     * 3. a per-language default, with [SCRIPT_LATIN] for every language not listed
     */
    @JvmStatic
    fun Locale.script(): String {
        if (script.isNotEmpty()) return script
        if (country.equals("ZZ", true)) {
            Log.w("ScriptUtils", "old _ZZ locale found: $this")
            return SCRIPT_LATIN
        }
        return when (language) {
            "ar", "ur", "fa" -> SCRIPT_ARABIC
            "hy" -> SCRIPT_ARMENIAN
            "bn" -> SCRIPT_BENGALI
            "sr", "mk", "ru", "uk", "mn", "be", "kk", "ky", "bg", "xdq", "cv", "mhr", "mns" -> SCRIPT_CYRILLIC
            "ka" -> SCRIPT_GEORGIAN
            "el" -> SCRIPT_GREEK
            // Depending on the runtime, Locale reports Hebrew either with the legacy code "iw"
            // or with the current ISO 639 code "he", so both must map to the Hebrew script.
            "iw", "he" -> SCRIPT_HEBREW
            "km" -> SCRIPT_KHMER
            "lo" -> SCRIPT_LAO
            "ml" -> SCRIPT_MALAYALAM
            "my" -> SCRIPT_MYANMAR
            "si" -> SCRIPT_SINHALA
            "ta" -> SCRIPT_TAMIL
            "te" -> SCRIPT_TELUGU
            "th" -> SCRIPT_THAI
            "ko" -> SCRIPT_HANGUL
            "hi", "mr", "ne" -> SCRIPT_DEVANAGARI
            "kn" -> SCRIPT_KANNADA
            "gu" -> SCRIPT_GUJARATI
            else -> SCRIPT_LATIN // use as fallback
        }
    }
}