diff --git a/README.md b/README.md index d0835c762..7c81aabb7 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,12 @@ Changes: * Debug version can be installed along OpenBoard * Allow users to add and replace built-in dictionaries * modified / improved from https://github.com/openboard-team/openboard/pull/569 and https://github.com/openboard-team/openboard/pull/578 - * some AOSP dictionaries are available [here](dictionaries/dict) + * some AOSP dictionaries are available [here](https://codeberg.org/Helium314/aosp-dictionaries) * experimental dictionaries with next-word suggestions created from sentence lists [are also available](dictionaries/experimental), but they may contain unwanted words, and may be missing other features * dictionary files starting with "main_" replace the built-in dictionary for the language, all other names work as add-on dictionaries * add [Arabic dictionary](https://github.com/remi0s/aosp-dictionary-tools/blob/master/dictsCreated/WikiAndOpenSubtitles/ar_wordlist.combined) for download, from https://github.com/openboard-team/openboard/pull/450 * add [Hebrew dictionary](https://github.com/Hananel-Hazan/aosp-dictionary-tools/blob/master/hebrew-hspell.txt.combined.new) for download, from https://github.com/openboard-team/openboard/pull/300 - * add [Galician dictionary](https://github.com/chavaone/openboard/blob/master/dictionaries/es_GL_wordlist.combined.g) for download, from https://github.com/openboard-team/openboard/pull/291 + * add [Galician dictionary](https://github.com/chavaone/openboard/blob/master/dictionaries/es_GL_wordlist.combined.gz) for download, from https://github.com/openboard-team/openboard/pull/291 * Fix suggestions after some characters, https://github.com/openboard-team/openboard/pull/694, https://github.com/openboard-team/openboard/issues/795 * Fix suggestions sometimes not being shown, https://github.com/openboard-team/openboard/pull/709 * Reduce amount of unwanted automatic space insertions, https://github.com/openboard-team/openboard/pull/576 @@ -41,6 +41,7 @@ Changes: * Fix number row not split in split keyboard view, https://github.com/Helium314/openboard/pull/27 * Fix white background of emoji tab selector on AMOLED theme for some Android versions, https://github.com/Helium314/openboard/pull/26 * Fix issue with spell checker incorrectly flagging words before a period as wrong on newer Android versions, https://github.com/openboard-team/openboard/pull/679 +* Plan / to do: * ~upgrade dependencies~ diff --git a/app/src/main/res/raw/main_hy.dict b/app/src/main/assets/dicts/main_hy.dict similarity index 100% rename from app/src/main/res/raw/main_hy.dict rename to app/src/main/assets/dicts/main_hy.dict diff --git a/app/src/main/java/org/dslul/openboard/inputmethod/latin/settings/DictionarySettingsFragment.kt b/app/src/main/java/org/dslul/openboard/inputmethod/latin/settings/DictionarySettingsFragment.kt index d19b87fa7..276f04d7b 100644 --- a/app/src/main/java/org/dslul/openboard/inputmethod/latin/settings/DictionarySettingsFragment.kt +++ b/app/src/main/java/org/dslul/openboard/inputmethod/latin/settings/DictionarySettingsFragment.kt @@ -347,7 +347,7 @@ class DictionarySettingsFragment : SubScreenFragment() { companion object { private const val DICTIONARY_REQUEST_CODE = 96834 private const val DICTIONARY_URL = - "https://github.com/Helium314/openboard/tree/new/dictionaries/dict" + "https://codeberg.org/Helium314/aosp-dictionaries" private const val USER_DICTIONARY_SUFFIX = "user.dict" private const val DICT_INTERNAL_AND_USER = 2 diff --git a/app/src/main/res/values/strings.xml b/app/src/main/res/values/strings.xml index 838fb0c4c..e024bda1c 100644 --- a/app/src/main/res/values/strings.xml +++ b/app/src/main/res/values/strings.xml @@ -513,11 +513,11 @@ disposition rather than other common dispositions for Latin languages. [CHAR LIM "Reset to default" - "Select a dictionary to add. Dictionaries can be downloaded at %s." + "Select a dictionary to add. Dictionaries in .dict format can be downloaded %s." + + "here" "User-added dictionaries, click to remove:" - - "project repository" "Load dictionary" diff --git a/dictionaries/ar_wordlist.combined.gz b/dictionaries/ar_wordlist.combined.gz deleted file mode 100644 index bb59c456d..000000000 Binary files a/dictionaries/ar_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/bg_wordlist.combined.gz b/dictionaries/bg_wordlist.combined.gz deleted file mode 100644 index 3c4284063..000000000 Binary files a/dictionaries/bg_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/cs_wordlist.combined.gz b/dictionaries/cs_wordlist.combined.gz deleted file mode 100755 index 94ba863a2..000000000 Binary files a/dictionaries/cs_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/da_wordlist.combined.gz b/dictionaries/da_wordlist.combined.gz deleted file mode 100755 index b4baf627f..000000000 Binary files a/dictionaries/da_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/de_wordlist.combined.gz b/dictionaries/de_wordlist.combined.gz deleted file mode 100755 index d51633058..000000000 Binary files a/dictionaries/de_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/dict/emoji_en.dict b/dictionaries/dict/emoji_en.dict deleted file mode 100644 index 3c6a1a4d2..000000000 Binary files a/dictionaries/dict/emoji_en.dict and /dev/null differ diff --git a/dictionaries/dict/emoji_fr.dict b/dictionaries/dict/emoji_fr.dict deleted file mode 100644 index c8dd40763..000000000 Binary files a/dictionaries/dict/emoji_fr.dict and /dev/null differ diff --git a/dictionaries/dict/main_ar.dict b/dictionaries/dict/main_ar.dict deleted file mode 100644 index 33c4e82ee..000000000 Binary files a/dictionaries/dict/main_ar.dict and /dev/null differ diff --git a/dictionaries/dict/main_bg.dict b/dictionaries/dict/main_bg.dict deleted file mode 100644 index b39d7e3f8..000000000 Binary files a/dictionaries/dict/main_bg.dict and /dev/null differ diff --git a/dictionaries/dict/main_cs.dict b/dictionaries/dict/main_cs.dict deleted file mode 100644 index 889ee5e89..000000000 Binary files a/dictionaries/dict/main_cs.dict and /dev/null differ diff --git a/dictionaries/dict/main_da.dict b/dictionaries/dict/main_da.dict deleted file mode 100644 index 60a48640a..000000000 Binary files a/dictionaries/dict/main_da.dict and /dev/null differ diff --git a/dictionaries/dict/main_de.dict b/dictionaries/dict/main_de.dict deleted file mode 100644 index 58aecf9ed..000000000 Binary files a/dictionaries/dict/main_de.dict and /dev/null differ diff --git a/dictionaries/dict/main_el.dict b/dictionaries/dict/main_el.dict deleted file mode 100644 index fb8bbceec..000000000 Binary files a/dictionaries/dict/main_el.dict and /dev/null differ diff --git a/dictionaries/dict/main_en_au.dict b/dictionaries/dict/main_en_au.dict deleted file mode 100644 index 5f61b0319..000000000 Binary files a/dictionaries/dict/main_en_au.dict and /dev/null differ diff --git a/dictionaries/dict/main_en_gb.dict b/dictionaries/dict/main_en_gb.dict deleted file mode 100644 index 77145c7d4..000000000 Binary files a/dictionaries/dict/main_en_gb.dict and /dev/null differ diff --git a/dictionaries/dict/main_en_us.dict b/dictionaries/dict/main_en_us.dict deleted file mode 100644 index 081a8c8c6..000000000 Binary files a/dictionaries/dict/main_en_us.dict and /dev/null differ diff --git a/dictionaries/dict/main_eo.dict b/dictionaries/dict/main_eo.dict deleted file mode 100644 index 1518153f7..000000000 Binary files a/dictionaries/dict/main_eo.dict and /dev/null differ diff --git a/dictionaries/dict/main_es.dict b/dictionaries/dict/main_es.dict deleted file mode 100644 index e2e6925bc..000000000 Binary files a/dictionaries/dict/main_es.dict and /dev/null differ diff --git a/dictionaries/dict/main_fi.dict b/dictionaries/dict/main_fi.dict deleted file mode 100644 index a1cdac3b6..000000000 Binary files a/dictionaries/dict/main_fi.dict and /dev/null differ diff --git a/dictionaries/dict/main_fr.dict b/dictionaries/dict/main_fr.dict deleted file mode 100644 index d1e0df196..000000000 Binary files a/dictionaries/dict/main_fr.dict and /dev/null differ diff --git a/dictionaries/dict/main_gl.dict b/dictionaries/dict/main_gl.dict deleted file mode 100644 index ea32746f8..000000000 Binary files a/dictionaries/dict/main_gl.dict and /dev/null differ diff --git a/dictionaries/dict/main_he.dict b/dictionaries/dict/main_he.dict deleted file mode 100644 index 0d9b81bdc..000000000 Binary files a/dictionaries/dict/main_he.dict and /dev/null differ diff --git a/dictionaries/dict/main_hr.dict b/dictionaries/dict/main_hr.dict deleted file mode 100644 index 57251c10e..000000000 Binary files a/dictionaries/dict/main_hr.dict and /dev/null differ diff --git a/dictionaries/dict/main_hu.dict b/dictionaries/dict/main_hu.dict deleted file mode 100644 index 0b05b265e..000000000 Binary files a/dictionaries/dict/main_hu.dict and /dev/null differ diff --git a/dictionaries/dict/main_hy.dict b/dictionaries/dict/main_hy.dict deleted file mode 100644 index 420e2df6d..000000000 Binary files a/dictionaries/dict/main_hy.dict and /dev/null differ diff --git a/dictionaries/dict/main_it.dict b/dictionaries/dict/main_it.dict deleted file mode 100644 index c516e12e0..000000000 Binary files a/dictionaries/dict/main_it.dict and /dev/null differ diff --git a/dictionaries/dict/main_iw.dict b/dictionaries/dict/main_iw.dict deleted file mode 100644 index d9cb74c9b..000000000 Binary files a/dictionaries/dict/main_iw.dict and /dev/null differ diff --git a/dictionaries/dict/main_ka.dict b/dictionaries/dict/main_ka.dict deleted file mode 100644 index 8e55ebda6..000000000 Binary files a/dictionaries/dict/main_ka.dict and /dev/null differ diff --git a/dictionaries/dict/main_lb.dict b/dictionaries/dict/main_lb.dict deleted file mode 100644 index aaf6e84bb..000000000 Binary files a/dictionaries/dict/main_lb.dict and /dev/null differ diff --git a/dictionaries/dict/main_lt.dict b/dictionaries/dict/main_lt.dict deleted file mode 100644 index 728499315..000000000 Binary files a/dictionaries/dict/main_lt.dict and /dev/null differ diff --git a/dictionaries/dict/main_lv.dict b/dictionaries/dict/main_lv.dict deleted file mode 100644 index 73bb20dab..000000000 Binary files a/dictionaries/dict/main_lv.dict and /dev/null differ diff --git a/dictionaries/dict/main_nb.dict b/dictionaries/dict/main_nb.dict deleted file mode 100644 index bf58e763e..000000000 Binary files a/dictionaries/dict/main_nb.dict and /dev/null differ diff --git a/dictionaries/dict/main_nl.dict b/dictionaries/dict/main_nl.dict deleted file mode 100644 index 4d031d0c2..000000000 Binary files a/dictionaries/dict/main_nl.dict and /dev/null differ diff --git a/dictionaries/dict/main_pl.dict b/dictionaries/dict/main_pl.dict deleted file mode 100644 index f55af662c..000000000 Binary files a/dictionaries/dict/main_pl.dict and /dev/null differ diff --git a/dictionaries/dict/main_pt_br.dict b/dictionaries/dict/main_pt_br.dict deleted file mode 100644 index 6122a8b01..000000000 Binary files a/dictionaries/dict/main_pt_br.dict and /dev/null differ diff --git a/dictionaries/dict/main_pt_pt.dict b/dictionaries/dict/main_pt_pt.dict deleted file mode 100644 index a685e35dc..000000000 Binary files a/dictionaries/dict/main_pt_pt.dict and /dev/null differ diff --git a/dictionaries/dict/main_ro.dict b/dictionaries/dict/main_ro.dict deleted file mode 100644 index 1f69a653b..000000000 Binary files a/dictionaries/dict/main_ro.dict and /dev/null differ diff --git a/dictionaries/dict/main_ru.dict b/dictionaries/dict/main_ru.dict deleted file mode 100644 index f24552dd9..000000000 Binary files a/dictionaries/dict/main_ru.dict and /dev/null differ diff --git a/dictionaries/dict/main_sl.dict b/dictionaries/dict/main_sl.dict deleted file mode 100644 index 573231e27..000000000 Binary files a/dictionaries/dict/main_sl.dict and /dev/null differ diff --git a/dictionaries/dict/main_sr.dict b/dictionaries/dict/main_sr.dict deleted file mode 100644 index 0accc33b6..000000000 Binary files a/dictionaries/dict/main_sr.dict and /dev/null differ diff --git a/dictionaries/dict/main_sv.dict b/dictionaries/dict/main_sv.dict deleted file mode 100644 index 0e7fdda62..000000000 Binary files a/dictionaries/dict/main_sv.dict and /dev/null differ diff --git a/dictionaries/dict/main_tr.dict b/dictionaries/dict/main_tr.dict deleted file mode 100644 index 3951fa237..000000000 Binary files a/dictionaries/dict/main_tr.dict and /dev/null differ diff --git a/dictionaries/dict/main_uk.dict b/dictionaries/dict/main_uk.dict deleted file mode 100644 index b6a20264a..000000000 Binary files a/dictionaries/dict/main_uk.dict and /dev/null differ diff --git a/dictionaries/el_wordlist.combined.gz b/dictionaries/el_wordlist.combined.gz deleted file mode 100755 index 599734cf8..000000000 Binary files a/dictionaries/el_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/en_AU_wordlist.combined.gz b/dictionaries/en_AU_wordlist.combined.gz deleted file mode 100755 index e08ff37b7..000000000 Binary files a/dictionaries/en_AU_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/en_GB_wordlist.combined.gz b/dictionaries/en_GB_wordlist.combined.gz deleted file mode 100755 index 217660fc4..000000000 Binary files a/dictionaries/en_GB_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/en_US_wordlist.combined.gz b/dictionaries/en_US_wordlist.combined.gz deleted file mode 100644 index 1ccedfb9f..000000000 Binary files a/dictionaries/en_US_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/en_emoji.combined.gz b/dictionaries/en_emoji.combined.gz deleted file mode 100755 index 4d9cf1b59..000000000 Binary files a/dictionaries/en_emoji.combined.gz and /dev/null differ diff --git a/dictionaries/eo_wordlist.combined.gz b/dictionaries/eo_wordlist.combined.gz deleted file mode 100644 index f5c58a57a..000000000 Binary files a/dictionaries/eo_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/es_GL_wordlist.combined.gz b/dictionaries/es_GL_wordlist.combined.gz deleted file mode 100644 index 5c2e28699..000000000 Binary files a/dictionaries/es_GL_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/es_wordlist.combined.gz b/dictionaries/es_wordlist.combined.gz deleted file mode 100755 index 71e7309fc..000000000 Binary files a/dictionaries/es_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/experimental/README.md b/dictionaries/experimental/README.md deleted file mode 100644 index 967f11d9c..000000000 --- a/dictionaries/experimental/README.md +++ /dev/null @@ -1,35 +0,0 @@ -This directory contains dictionaries compiled from sentence lists to make use of next-word predictions. -Currently all word dictionaries are based on word lists available at https://wortschatz.uni-leipzig.de/en/download/ under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) license. - -The emoji dictionary is adapted from [gemoji](https://github.com/github/gemoji/blob/master/db/emoji.json) without further processing. - -Dictionaries are created using [`create_wordlist_from_sentences.py`](create_wordlist_from_sentences.py) for `_wordlist.combined` and [`dicttool_aosp.jar`](https://github.com/remi0s/aosp-dictionary-tools) for creating `.dict` files. See the `example_()` functions in the python script for how to use it. You can simply adjust paths and add your sentence (or word) lists. -The script is still experimental, rather slow and may produce bad dictionaries in some languages. Some words seem to be wrongly added (e.g. "i" for English), and names are typically missing, though this depends on how exactly you create the Android dictionaries. - -A "potentially_offensive" attribute is added for some words, which sometimes seems unnecessary. Currently this is coming from the "nosuggest" attribute of the used _hunspell_ dictionaries, which occurs for offensive words as well as for weird / rare word forms. - -Other flags are currently missing, same for shortcuts (e.g. ill -> I'll or écoeuré -> écœuré, as found in AOSP dictionaries). - ------ - -`wordlist.combined` file infos (mostly guessed, didn't find documentation): -* header is necessary - * format like `dictionary=main:en_us,locale=en_US,description=English (US),date=1414726260,version=54` - * all of these fields are necessary, though `description` is not used - * German dictionaries also have `REQUIRES_GERMAN_UMLAUT_PROCESSING=1` -* each word is in a line like ` word=re,f=0,flags=abbreviation,originalFreq=99,possibly_offensive=true` - * `word` is the word (necessary) - * `f` is frequency, from 0 to 255(?) (necessary) - * higher value is more likely to get suggested / corrected - * special value `whitelist`, possibly equal to 15 - * `f=0` will not be suggested if bad words are blocked, and will never be added to user history - * possible bug: words with `possibly_offensive=true` and `f=0` will be suggested when not blocking offensive words, but other words with `f=0` are still not suggested - * `originalFreq`: unclear, is this used? - * `flags`: `medical`, `technical`, `hand-added`, `babytalk`, `abbreviation`, `offensive`, `technical`, `nonword`, and probably more: are they used for anything? - * `possibly_offensive=true` stops the word from being suggested when blocking offensive words - * `not_a_word=true` will not be suggested, use together with `shortcut` - * `shortcut=` (below a ``) will suggest `` when the `` is typed - * which `f` to use? maybe only 0-14 and `whitelist` allowed - * what does `f` do here? - * `bigram=` (below a ``) will suggest `` as next word before typing any letters - * what does `f` do here? Looks like 1, 2, and 3 are used for the usual 3 bigram entries diff --git a/dictionaries/experimental/create_wordlist_from_sentences.py b/dictionaries/experimental/create_wordlist_from_sentences.py deleted file mode 100755 index d3ce84f22..000000000 --- a/dictionaries/experimental/create_wordlist_from_sentences.py +++ /dev/null @@ -1,466 +0,0 @@ -#!/bin/python -import math -import os -import time -import regex -import copy -from spylls.hunspell import Dictionary - -# issues: -# for english got 'i' as word (shouldn't it be 'I' only? or is 'i' the imaginary number?) -# potentially_offensive is not set, where to get info? parse android dicts? -# maybe ignore compound words like 'long-term'? will android actually suggest them? - -# maybe useful -# https://wortschatz.uni-leipzig.de/en/download/ -# really useful source of sentences / fragments, but should be checked against dictionaries as they are taken from news / web -# https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists -# word frequency lists linked, in some cases there are also sentence lists -# https://github.com/wooorm/dictionaries -# hunspell dicts, are they the same as the one included in phunspell? - -# memory usage depends on word lists and language, expect 0.5 - 2 GB -# for some reason, Italian requires 4 GB for unmunch - - -# from https://github.com/zverok/spylls/blob/master/examples/unmunch.py -def unmunch_word(word, aff): - result = set() - - if aff.FORBIDDENWORD and aff.FORBIDDENWORD in word.flags: - return result - - if not (aff.NEEDAFFIX and aff.NEEDAFFIX in word.flags): - result.add(word.stem) - - suffixes = [ - suffix - for flag in word.flags - for suffix in aff.SFX.get(flag, []) - if suffix.cond_regexp.search(word.stem) - ] - prefixes = [ - prefix - for flag in word.flags - for prefix in aff.PFX.get(flag, []) - if prefix.cond_regexp.search(word.stem) - ] - - for suffix in suffixes: - root = word.stem[0:-len(suffix.strip)] if suffix.strip else word.stem - suffixed = root + suffix.add - if not (aff.NEEDAFFIX and aff.NEEDAFFIX in suffix.flags): - result.add(suffixed) - - secondary_suffixes = [ - suffix2 - for flag in suffix.flags - for suffix2 in aff.SFX.get(flag, []) - if suffix2.cond_regexp.search(suffixed) - ] - for suffix2 in secondary_suffixes: - root = suffixed[0:-len(suffix2.strip)] if suffix2.strip else suffixed - result.add(root + suffix2.add) - - for prefix in prefixes: - root = word.stem[len(prefix.strip):] - prefixed = prefix.add + root - if not (aff.NEEDAFFIX and aff.NEEDAFFIX in prefix.flags): - result.add(prefixed) - - if prefix.crossproduct: - additional_suffixes = [ - suffix - for flag in prefix.flags - for suffix in aff.SFX.get(flag, []) - if suffix.crossproduct and not suffix in suffixes and suffix.cond_regexp.search(prefixed) - ] - for suffix in suffixes + additional_suffixes: - root = prefixed[0:-len(suffix.strip)] if suffix.strip else prefixed - suffixed = root + suffix.add - result.add(suffixed) - - secondary_suffixes = [ - suffix2 - for flag in suffix.flags - for suffix2 in aff.SFX.get(flag, []) - if suffix2.crossproduct and suffix2.cond_regexp.search(suffixed) - ] - for suffix2 in secondary_suffixes: - root = suffixed[0:-len(suffix2.strip)] if suffix2.strip else suffixed - result.add(root + suffix2.add) - - return result - - -def unmunch_dictionary(dictionary: Dictionary) -> set[str]: - result = set() - for word in dictionary.dic.words: - result.update(unmunch_word(word, dictionary.aff)) - return result - - -class wordlist: - def __init__(self, - # spylls dictionary - dictionary: Dictionary | None = None, - # words that should be ignored, typically (international) names we don't want in a language word list - neutral_words: set[str] | None = None): - self.dictionary = dictionary - self.dict_words: set[str] = set() - if neutral_words is None: - self.neutral_words = set() - else: - self.neutral_words = neutral_words - # number of identified words - count = 0 - - # number of words used for frequency - count_valid = 0 - - # words to ignore, as they should be in some additional dictionary (mostly names) - # these are not counted as valid or invalid, and not used for next-word data - neutral_word_count = 0 - - # words detected as invalid, these are mostly names and capitalized words (possibly also part of names) - invalid_words: set[str] = set() - not_words: set[str] = set() - - # unclear words with more than one match group in above regex - # check and decide in the end what to do - weird_things: set[str] = set() - - # for each word, contains a dict with: - # count: int (always) - # next: dict[str, int] (not always) - # how often the word is followed by some others (next_word, count) - # nosuggest: bool (usually only if True, as determined by hunspell dict) - word_infos: dict = {} - - # regex for that kicks out things that are definitely not words - # next word will be treated as ngram start - # allow latin letters, and ' and - (but not at start/end) - possible_word_regex = r"(?!['-])([\p{L}\d'-]+)(? None: - previous_word: str | None = None - for word in line.split(): - if word in self.word_infos: - # shortcut: we already know the word, avoid doing the regex check and dict lookup if possible - # only increase count and add next word info - self.add_word(word) - if previous_word is not None: - previous_info = self.word_infos[previous_word] - previous_next = previous_info.get("next", {}) - previous_next[word] = previous_next.get(word, 0) + 1 - previous_info["next"] = previous_next - previous_word = word - continue - if len(word) >= 48: - # android dicttool ignores those, so let's skip them already here - previous_word = None - continue - if word.isspace(): - # don't treat spaces as countable word (assuming a line does not contain a line break) - continue - if word.isnumeric(): - # don't put numbers info not_words - previous_word = None - continue - if not regex.search(r"\p{L}", word): - # no letters, no word (but ngram ends here) - self.not_words.add(word) - previous_word = None - continue - # hunspell dict has ', not ’, but we want to understand both - word = word.replace('’', '\'') - # must find something, because r"\p{L}" non-matches are already removed - re_find = regex.findall(self.possible_word_regex, word) - self.count += 1 - if len(re_find) > 1: - # just write down and end sentence for now - self.weird_things.add(word) - previous_word = None - # treat re_find[0] as the actual word, but need to investigate weird_things to maybe improve possible_word_regex - full_word = word - word = re_find[0] - # if the match is not at the start, treat as ngram start - if not full_word.startswith(word): - previous_word = None - - if word in self.neutral_words: - self.neutral_word_count += 1 - previous_word = None - continue - - if word in self.invalid_words: - previous_word = None - continue - - if word not in self.word_infos: - if add_unknown_words: - if previous_word is None and word[0].isupper(): - continue - else: - try: - valid, word = self.dict_check(word, previous_word is None) - except IndexError: - # happens for "İsmail" when using German dictionary, also for other words starting with "İ" - previous_word = None - continue - if not valid: - if previous_word is not None: - self.invalid_words.add(word) - previous_word = None - continue - - self.count_valid += 1 - self.add_word(word, add_to_count=False) - - if previous_word is not None: - previous_info = self.word_infos[previous_word] - previous_next = previous_info.get("next", {}) - previous_next[word] = previous_next.get(word, 0) + 1 - previous_info["next"] = previous_next - # set new previous word, or None if ngram end is suspected (this could be optimized, but no priority) - if full_word.endswith(word): - previous_word = word - else: - previous_word = None - - # returns whether word is valid according to the dictionary, and, for the case it was capitalized, the valid form - def dict_check(self, word: str, try_decapitalize: bool) -> tuple[bool, str]: - if try_decapitalize and word[0].isupper(): - decapitalized = word[0].lower() + word[1:] - if decapitalized in self.word_infos: - return True, decapitalized - # todo: lookup can be slow, optimize order with capitalization and nosuggest - if not self.dictionary.lookuper(word, capitalization=True, allow_nosuggest=True): - return False, word - # word may be valid, check capitalization and nosuggest - if self.dictionary.lookuper(word, capitalization=False, allow_nosuggest=False): - return True, word - if self.dictionary.lookuper(decapitalized, capitalization=False, allow_nosuggest=False): - return True, decapitalized - if self.dictionary.lookuper(word, capitalization=False, allow_nosuggest=True): - self.word_infos[word] = {"nosuggest": True} - return True, word - if self.dictionary.lookuper(decapitalized, capitalization=False, allow_nosuggest=True): - self.word_infos[decapitalized] = {"nosuggest": True} - return True, decapitalized - return False, word - # we always want correct capitalization - # maybe invert order for better performance, similar to above - if not self.dictionary.lookuper(word, capitalization=False, allow_nosuggest=True): - return False, word - if self.dictionary.lookuper(word, capitalization=False, allow_nosuggest=False): - return True, word - self.word_infos[word] = {"nosuggest": True} - return True, word - - def add_word(self, word: str, nosuggest: bool = False, add_to_count: bool = True): - word_info = self.word_infos.get(word, {}) - word_info["count"] = word_info.get("count", 0) + 1 - if nosuggest: - word_info["nosuggest"] = True - self.word_infos[word] = word_info - if add_to_count: - self.count += 1 - self.count_valid += 1 - - def add_sentence_file(self, filename: str, add_unknown_words: bool = False): - with open(filename) as f: - for line in f: - self.add_line(line, add_unknown_words) - - def add_word_file(self, filename: str): - with open(filename) as f: - for line in f: - for word in line.split(): - self.add_word(word) - - # dicts need all the input, but only type and locale are relevant - # type can be any ASCII string, but typically main is used - # note that only one dict can be loaded for each type - # using main also overrides any built-in dictionary in my version of OpenBoard - # locale should be in compatible format (e.g. en, en_US, fr, fr_CA,...) - def create_android_word_list(self, file_path, dict_type: str, locale: str, description: str, version: int): - with open(file_path, 'w') as f: - t = int(time.time()) - # e.g. dictionary=main:en_us,locale=en_US,description=English (US),date=1414726260,version=54 - header = f"dictionary={dict_type}:{locale.lower()},locale={locale},description={description},date={t},version={version}" - if locale.startswith("de"): - header += ",REQUIRES_GERMAN_UMLAUT_PROCESSING=1" - # any special things for other languages? - # russian dict has MULTIPLE_WORDS_DEMOTION_RATE=50 -> what's this? - f.write(header + "\n") - # deep copy to avoid modifying self.word_infos - word_infos = copy.deepcopy(self.word_infos) - # todo: check android dicts and maybe some documentation about frequencies - add_frequencies(word_infos, 1, 250) - filter_bigrams(word_infos, 3, 2) - - for word, infos in sorted(word_infos.items(), key=lambda item: -item[1]["count"]): - frequency = infos["frequency"] - if infos.get("nosuggest", False): - # todo: frequency of nosuggest words? - # in AOSP dicts there are possibly_offensive words with freq > 0, but profanity has frequency 0 - # dictionaryFacilitator will add freq == 0 to history only as invalid words - # -> what happens here? try and compare "hardcore" (f=112) and "Cid" (f=0) - # hunspell nosuggest english is insults/slurs, which are f=0 in AOSP dictionaries - # other possibly_offensive words found in AOSP dictionaries are not flagged at all - # -> maybe find a way to extract this information from existing dictonaries? - # but hunspell nosuggest german is also weird/rare word forms - f.write(f" word={word},f={frequency},possibly_offensive=true\n") - else: - f.write(f" word={word},f={frequency}\n") - if "next" in infos: - for next_word, freq in infos["next"].items(): - f.write(f" bigram={next_word},f={freq}\n") - - -# adds a "frequency" entry to each entry of word_infos -# frequency is the log of input frequencies, and scaled between min_frequency and max_frequency -def add_frequencies(word_infos: dict[str, int], min_frequency: int, max_frequency: int): - assert max_frequency > min_frequency - max_count = 0 - min_count = 2147483647 # simply start with a very large number (int32 max) - # first get max and min count - for _, infos in word_infos.items(): - count = infos["count"] - if count < min_count: - min_count = count - if count > max_count: - max_count = count - min_f = math.log(min_count) - fdiff = max(math.log(max_count) - min_f, 1) - for word, infos in word_infos.items(): - f = math.log(infos["count"]) - infos["frequency"] = int((f - min_f) * (max_frequency - min_frequency) / fdiff + min_frequency) - - -# modifies word_infos: -# fewer entries per word (limiting to max_bigrams and requiring min_count occurences) -# frequency replaced by order, starting at 1 for the most used, like it seems to be in the AOSP en(_US) dictionary -def filter_bigrams(word_infos: dict, max_bigrams, min_count): - for word, infos in word_infos.items(): - if "next" not in infos: - continue - bigram = infos["next"] - new_bigram = dict() - bigram_count = 1 - for next_word, next_count in sorted(bigram.items(), key=lambda item: -item[1]): - if bigram_count > max_bigrams or next_count < min_count: - break - new_bigram[next_word] = bigram_count - bigram_count += 1 - infos["next"] = new_bigram - - -# highest frequency first -def sort_dict_by_count(d: dict[str, int]): - return sorted(d.items(), key=lambda item: -item[1]) - - -# use existing dictionary for spell check -# use sentence list to build word list -def example_1(): - d = Dictionary.from_files("/home/user/.local/lib/python3.10/site-packages/phunspell/data/dictionary/en/en_US") - w = wordlist(dictionary=d) - w.add_sentence_file("/home/user/eng_news_2020_100K-sentences.txt", - add_unknown_words=False) # will only add words that pass the spell check - w.create_android_word_list("/home/user/en_US_wordlist.compiled", "main", "en_US", "english", 1) - - -# use existing dictionary for spell check -# use words from unmunched (affix-expanded) dictionary -# creates a much larger wordlist in some languages -# use sentence list to build word list -# this is mostly for frequencies and next words, but may also add new words in some languages, e.g. German compund words -def example_2(): - d = Dictionary.from_files("/home/user/.local/lib/python3.10/site-packages/phunspell/data/dictionary/en/en_US") - w = wordlist(dictionary=d) - # unmunched cache not necessary for english, but helps for e.g. german or czech - w.add_unmunched_dictionary(unmunched_cache="/home/user/en_unmunched.txt") # adds all words with frequency 1 - w.add_sentence_file("/home/user/eng_news_2020_100K-sentences.txt", add_unknown_words=False) - w.create_android_word_list("/home/user/en_US_wordlist.compiled", "main", "en_US", "english", 1) - - -# don't use a dictionary, only a word list -# this will produce low-quality suggestions, as word count is the same for all words -# but if the word list contains duplicates, it will affect word count -def example_3(): - w = wordlist() - w.add_word_file("/home/user/some_word_list.txt") - w.create_android_word_list("/home/user/en_US_wordlist.compiled", "main", "en_US", "english", 1) - - -# don't use a dictionary, but provide a word list -# use a sentence file for word count and next word suggestions -def example_4(): - w = wordlist() - w.add_word_file("/home/user/some_word_list.txt") - w.add_sentence_file("/home/user/eng_news_2020_100K-sentences.txt", add_unknown_words=False) - w.create_android_word_list("/home/user/en_US_wordlist.compiled", "main", "en_US", "english", 1) - - -# don't use a dictionary, but a list of sentences -# android word list may contain spelling errors depending on source of the sentences -def example_5(): - w = wordlist() - w.add_sentence_file("/home/user/eng_news_2020_100K/eng_news_2020_100K-sentences.txt", - add_unknown_words=True) # add all words to the word list, except some obvious non-words - w.create_android_word_list("/home/user/en_US_wordlist.compiled", "main", "en_US", "english", 1) diff --git a/dictionaries/experimental/dict/emoji_en.dict b/dictionaries/experimental/dict/emoji_en.dict deleted file mode 100644 index 94c076afb..000000000 Binary files a/dictionaries/experimental/dict/emoji_en.dict and /dev/null differ diff --git a/dictionaries/experimental/dict/main_ar.dict b/dictionaries/experimental/dict/main_ar.dict deleted file mode 100644 index 19b494dbc..000000000 Binary files a/dictionaries/experimental/dict/main_ar.dict and /dev/null differ diff --git a/dictionaries/experimental/dict/main_bn.dict b/dictionaries/experimental/dict/main_bn.dict deleted file mode 100644 index 10e444504..000000000 Binary files a/dictionaries/experimental/dict/main_bn.dict and /dev/null differ diff --git a/dictionaries/experimental/dict/main_cs.dict b/dictionaries/experimental/dict/main_cs.dict deleted file mode 100644 index 1a3eebbba..000000000 Binary files a/dictionaries/experimental/dict/main_cs.dict and /dev/null differ diff --git a/dictionaries/experimental/dict/main_de.dict b/dictionaries/experimental/dict/main_de.dict deleted file mode 100644 index bb23822bf..000000000 Binary files a/dictionaries/experimental/dict/main_de.dict and /dev/null differ diff --git a/dictionaries/experimental/dict/main_en_gb.dict b/dictionaries/experimental/dict/main_en_gb.dict deleted file mode 100644 index c95145e36..000000000 Binary files a/dictionaries/experimental/dict/main_en_gb.dict and /dev/null differ diff --git a/dictionaries/experimental/dict/main_en_us.dict b/dictionaries/experimental/dict/main_en_us.dict deleted file mode 100644 index ecaad6bd4..000000000 Binary files a/dictionaries/experimental/dict/main_en_us.dict and /dev/null differ diff --git a/dictionaries/experimental/dict/main_es.dict b/dictionaries/experimental/dict/main_es.dict deleted file mode 100644 index 388a0a651..000000000 Binary files a/dictionaries/experimental/dict/main_es.dict and /dev/null differ diff --git a/dictionaries/experimental/dict/main_fr.dict b/dictionaries/experimental/dict/main_fr.dict deleted file mode 100644 index 80b54256a..000000000 Binary files a/dictionaries/experimental/dict/main_fr.dict and /dev/null differ diff --git a/dictionaries/experimental/dict/main_it.dict b/dictionaries/experimental/dict/main_it.dict deleted file mode 100644 index 669f04d1a..000000000 Binary files a/dictionaries/experimental/dict/main_it.dict and /dev/null differ diff --git a/dictionaries/experimental/dict/main_ru.dict b/dictionaries/experimental/dict/main_ru.dict deleted file mode 100644 index 145d3b353..000000000 Binary files a/dictionaries/experimental/dict/main_ru.dict and /dev/null differ diff --git a/dictionaries/experimental/dict/main_uk.dict b/dictionaries/experimental/dict/main_uk.dict deleted file mode 100644 index 48b2ceed8..000000000 Binary files a/dictionaries/experimental/dict/main_uk.dict and /dev/null differ diff --git a/dictionaries/experimental/wordlists/ar_wordlist.combined.gz b/dictionaries/experimental/wordlists/ar_wordlist.combined.gz deleted file mode 100644 index ddfe3d694..000000000 Binary files a/dictionaries/experimental/wordlists/ar_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/experimental/wordlists/bn_wordlist.combined.gz b/dictionaries/experimental/wordlists/bn_wordlist.combined.gz deleted file mode 100644 index 829f68d8c..000000000 Binary files a/dictionaries/experimental/wordlists/bn_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/experimental/wordlists/cs_wordlist.combined.gz b/dictionaries/experimental/wordlists/cs_wordlist.combined.gz deleted file mode 100644 index 01f57c549..000000000 Binary files a/dictionaries/experimental/wordlists/cs_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/experimental/wordlists/de_wordlist.combined.gz b/dictionaries/experimental/wordlists/de_wordlist.combined.gz deleted file mode 100644 index a990dfb76..000000000 Binary files a/dictionaries/experimental/wordlists/de_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/experimental/wordlists/en_GB_wordlist.combined.gz b/dictionaries/experimental/wordlists/en_GB_wordlist.combined.gz deleted file mode 100644 index 933fe4a13..000000000 Binary files a/dictionaries/experimental/wordlists/en_GB_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/experimental/wordlists/en_US_wordlist.combined.gz b/dictionaries/experimental/wordlists/en_US_wordlist.combined.gz deleted file mode 100644 index f7a19f6b3..000000000 Binary files a/dictionaries/experimental/wordlists/en_US_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/experimental/wordlists/en_emoji.combined.gz b/dictionaries/experimental/wordlists/en_emoji.combined.gz deleted file mode 100644 index 3eb989068..000000000 Binary files a/dictionaries/experimental/wordlists/en_emoji.combined.gz and /dev/null differ diff --git a/dictionaries/experimental/wordlists/es_wordlist.combined.gz b/dictionaries/experimental/wordlists/es_wordlist.combined.gz deleted file mode 100644 index e385447d2..000000000 Binary files a/dictionaries/experimental/wordlists/es_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/experimental/wordlists/fr_wordlist.combined.gz b/dictionaries/experimental/wordlists/fr_wordlist.combined.gz deleted file mode 100644 index bba081e16..000000000 Binary files a/dictionaries/experimental/wordlists/fr_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/experimental/wordlists/it_wordlist.combined.gz b/dictionaries/experimental/wordlists/it_wordlist.combined.gz deleted file mode 100644 index bac2e0bee..000000000 Binary files a/dictionaries/experimental/wordlists/it_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/experimental/wordlists/ru_wordlist.combined.gz b/dictionaries/experimental/wordlists/ru_wordlist.combined.gz deleted file mode 100644 index b7a87d83d..000000000 Binary files a/dictionaries/experimental/wordlists/ru_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/experimental/wordlists/uk_wordlist.combined.gz b/dictionaries/experimental/wordlists/uk_wordlist.combined.gz deleted file mode 100644 index b49405ad4..000000000 Binary files a/dictionaries/experimental/wordlists/uk_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/fi_wordlist.combined.gz b/dictionaries/fi_wordlist.combined.gz deleted file mode 100755 index b7332ad3e..000000000 Binary files a/dictionaries/fi_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/fr_emoji.combined.gz b/dictionaries/fr_emoji.combined.gz deleted file mode 100755 index 5c9c7a096..000000000 Binary files a/dictionaries/fr_emoji.combined.gz and /dev/null differ diff --git a/dictionaries/fr_wordlist.combined.gz b/dictionaries/fr_wordlist.combined.gz deleted file mode 100755 index afe44a6d9..000000000 Binary files a/dictionaries/fr_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/he_wordlist.combined.gz b/dictionaries/he_wordlist.combined.gz deleted file mode 100644 index c1b380f40..000000000 Binary files a/dictionaries/he_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/hr_wordlist.combined.gz b/dictionaries/hr_wordlist.combined.gz deleted file mode 100755 index 9a2086f1f..000000000 Binary files a/dictionaries/hr_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/hu_wordlist.combined.gz b/dictionaries/hu_wordlist.combined.gz deleted file mode 100644 index 0e1bd8a7c..000000000 Binary files a/dictionaries/hu_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/hy_wordlist.combined.gz b/dictionaries/hy_wordlist.combined.gz deleted file mode 100644 index bfc3b4763..000000000 Binary files a/dictionaries/hy_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/it_wordlist.combined.gz b/dictionaries/it_wordlist.combined.gz deleted file mode 100755 index ed58a12c5..000000000 Binary files a/dictionaries/it_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/iw_wordlist.combined.gz b/dictionaries/iw_wordlist.combined.gz deleted file mode 100755 index 13eab9f17..000000000 Binary files a/dictionaries/iw_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/ka_wordlist.combined.gz b/dictionaries/ka_wordlist.combined.gz deleted file mode 100755 index 02bacba36..000000000 Binary files a/dictionaries/ka_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/lb_wordlist.combined.gz b/dictionaries/lb_wordlist.combined.gz deleted file mode 100755 index 9e0579b38..000000000 Binary files a/dictionaries/lb_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/lt_wordlist.combined.gz b/dictionaries/lt_wordlist.combined.gz deleted file mode 100755 index 961266bb8..000000000 Binary files a/dictionaries/lt_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/lv_wordlist.combined.gz b/dictionaries/lv_wordlist.combined.gz deleted file mode 100755 index ae906a9db..000000000 Binary files a/dictionaries/lv_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/nb_wordlist.combined.gz b/dictionaries/nb_wordlist.combined.gz deleted file mode 100755 index 1c0f2cfb9..000000000 Binary files a/dictionaries/nb_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/nl_wordlist.combined.gz b/dictionaries/nl_wordlist.combined.gz deleted file mode 100755 index 19c3a7ea8..000000000 Binary files a/dictionaries/nl_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/pl_wordlist.combined.gz b/dictionaries/pl_wordlist.combined.gz deleted file mode 100755 index 2b84eecfd..000000000 Binary files a/dictionaries/pl_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/pt_BR_wordlist.combined.gz b/dictionaries/pt_BR_wordlist.combined.gz deleted file mode 100755 index 7aac61e50..000000000 Binary files a/dictionaries/pt_BR_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/pt_PT_wordlist.combined.gz b/dictionaries/pt_PT_wordlist.combined.gz deleted file mode 100755 index 5bf9a60e8..000000000 Binary files a/dictionaries/pt_PT_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/ro_wordlist.combined.gz b/dictionaries/ro_wordlist.combined.gz deleted file mode 100755 index 92cb73ed9..000000000 Binary files a/dictionaries/ro_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/ru_wordlist.combined.gz b/dictionaries/ru_wordlist.combined.gz deleted file mode 100755 index 5e9266221..000000000 Binary files a/dictionaries/ru_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/sample.combined b/dictionaries/sample.combined deleted file mode 100755 index 4fa595e1e..000000000 --- a/dictionaries/sample.combined +++ /dev/null @@ -1,38 +0,0 @@ -# This is a sample wordlist that can be converted to a binary dictionary -# for use by the Latin IME. -# The file is essentially a CSV file, with indent level denoting nesting. -# -# The file starts with a single CSV line with the header attributes. Whatever -# the content, these are included as is in the binary file. The first attribute -# of the file should be `dictionary'. Usual fields are `locale', `description', -# `date', `version', `options'. -# -# Each word has a `word' entry and at least a `f' argument denoting its -# probability, as an integer between 0 and 255 on a logarithmic scale, with -# 255 meaning 1 and each decrement in 1 dividing probability by 1.15. -# As a special case, a weight of 0 is taken to mean profanity - words that -# should not be considered a typo, but that should never be suggested -# explicitly. An entry may be made not a word by adding a `not_a_word' -# field with a value of `true'. The main reason for putting such entries -# into the dictionary is to add shortcut targets and maybe a whitelist -# replacement. -# -# Each word may or may not have any number of shortcut target lines -# starting with a `shortcut' entry and having at least a `f' frequency -# value between 0 and 14, or the special value `whitelist' which becomes -# 15, which is then taken to be the whitelist target of this word. -# -# Each word may also have any number of bigram lines starting with a -# `bigram' entry containing the following word whose frequency should -# override the unigram frequency when following the word this bigram is -# for. -# -dictionary=main:en,locale=en,description=Sample wordlist,date=1351495318,version=1 - word=sample,f=200 - bigram=wordlist,f=243 - word=wordlist,f=180 - word=shortcut,f=176 - shortcut=target,f=10 - word=witelisted,f=10,not_a_word=true - shortcut=whitelisted,f=whitelist - word=profanity,f=0 diff --git a/dictionaries/sl_wordlist.combined.gz b/dictionaries/sl_wordlist.combined.gz deleted file mode 100755 index a7240fe5b..000000000 Binary files a/dictionaries/sl_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/sr_wordlist.combined.gz b/dictionaries/sr_wordlist.combined.gz deleted file mode 100755 index 30ce99670..000000000 Binary files a/dictionaries/sr_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/sv_wordlist.combined.gz b/dictionaries/sv_wordlist.combined.gz deleted file mode 100755 index db44ae4c4..000000000 Binary files a/dictionaries/sv_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/tr_wordlist.combined.gz b/dictionaries/tr_wordlist.combined.gz deleted file mode 100755 index d3c8825b9..000000000 Binary files a/dictionaries/tr_wordlist.combined.gz and /dev/null differ diff --git a/dictionaries/uk_wordlist.combined.gz b/dictionaries/uk_wordlist.combined.gz deleted file mode 100755 index ed8420d7d..000000000 Binary files a/dictionaries/uk_wordlist.combined.gz and /dev/null differ