move dictionaries and related script to a separate repository

https://codeberg.org/Helium314/aosp-dictionaries
Helium314 2023-07-28 19:19:57 +02:00
parent 1956ca43e4
commit 33b64ac34d
107 changed files with 7 additions and 545 deletions

View file: README.md

@@ -10,12 +10,12 @@ Changes:
 * Debug version can be installed along OpenBoard
 * Allow users to add and replace built-in dictionaries
   * modified / improved from https://github.com/openboard-team/openboard/pull/569 and https://github.com/openboard-team/openboard/pull/578
-  * some AOSP dictionaries are available [here](dictionaries/dict)
+  * some AOSP dictionaries are available [here](https://codeberg.org/Helium314/aosp-dictionaries)
   * experimental dictionaries with next-word suggestions created from sentence lists [are also available](dictionaries/experimental), but they may contain unwanted words, and may be missing other features
   * dictionary files starting with "main_" replace the built-in dictionary for the language, all other names work as add-on dictionaries
 * add [Arabic dictionary](https://github.com/remi0s/aosp-dictionary-tools/blob/master/dictsCreated/WikiAndOpenSubtitles/ar_wordlist.combined) for download, from https://github.com/openboard-team/openboard/pull/450
 * add [Hebrew dictionary](https://github.com/Hananel-Hazan/aosp-dictionary-tools/blob/master/hebrew-hspell.txt.combined.new) for download, from https://github.com/openboard-team/openboard/pull/300
-* add [Galician dictionary](https://github.com/chavaone/openboard/blob/master/dictionaries/es_GL_wordlist.combined.g) for download, from https://github.com/openboard-team/openboard/pull/291
+* add [Galician dictionary](https://github.com/chavaone/openboard/blob/master/dictionaries/es_GL_wordlist.combined.gz) for download, from https://github.com/openboard-team/openboard/pull/291
 * Fix suggestions after some characters, https://github.com/openboard-team/openboard/pull/694, https://github.com/openboard-team/openboard/issues/795
 * Fix suggestions sometimes not being shown, https://github.com/openboard-team/openboard/pull/709
 * Reduce amount of unwanted automatic space insertions, https://github.com/openboard-team/openboard/pull/576
@@ -41,6 +41,7 @@ Changes:
 * Fix number row not split in split keyboard view, https://github.com/Helium314/openboard/pull/27
 * Fix white background of emoji tab selector on AMOLED theme for some Android versions, https://github.com/Helium314/openboard/pull/26
 * Fix issue with spell checker incorrectly flagging words before a period as wrong on newer Android versions, https://github.com/openboard-team/openboard/pull/679
+*

 Plan / to do:
 * ~upgrade dependencies~

View file: DictionarySettingsFragment.kt

@@ -347,7 +347,7 @@ class DictionarySettingsFragment : SubScreenFragment() {
     companion object {
         private const val DICTIONARY_REQUEST_CODE = 96834
         private const val DICTIONARY_URL =
-            "https://github.com/Helium314/openboard/tree/new/dictionaries/dict"
+            "https://codeberg.org/Helium314/aosp-dictionaries"
         private const val USER_DICTIONARY_SUFFIX = "user.dict"
         private const val DICT_INTERNAL_AND_USER = 2

View file: strings.xml

@@ -513,11 +513,11 @@ disposition rather than other common dispositions for Latin languages. [CHAR LIM
     <!-- Button text in user dictionary remove dialog for resetting to default -->
     <string name="reset_dictionary">"Reset to default"</string>
     <!-- Message for the user dictionary selection dialog. This string will be interpreted as HTML -->
-    <string name="add_dictionary">"Select a dictionary to add. Dictionaries can be downloaded at %s."</string>
+    <string name="add_dictionary">"Select a dictionary to add. Dictionaries in .dict format can be downloaded %s."</string>
-    <!-- Title of the link to the download page inserted into selection message (above) -->
-    <string name="dictionary_link_text">"here"</string>
     <!-- Message above list of user-added dictionaries -->
     <string name="existing_dictionaries">"User-added dictionaries, click to remove:"</string>
+    <!-- Title of the link to the download page inserted into selection message (above) -->
+    <string name="dictionary_link_text">"project repository"</string>
     <!-- Button text for dictionary file selection -->
     <string name="load_dictionary_file">"Load dictionary"</string>
     <!-- Toast text shown when dictionary file for a locale was added successfully -->

Binary files not shown (47 files).

View file: dictionaries README (deleted)

@@ -1,35 +0,0 @@
This directory contains dictionaries compiled from sentence lists to make use of next-word predictions.
Currently all word dictionaries are based on word lists available at https://wortschatz.uni-leipzig.de/en/download/ under the [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) license.
The emoji dictionary is adapted from [gemoji](https://github.com/github/gemoji/blob/master/db/emoji.json) without further processing.
Dictionaries are created using [`create_wordlist_from_sentences.py`](create_wordlist_from_sentences.py) for `<locale>_wordlist.combined` and [`dicttool_aosp.jar`](https://github.com/remi0s/aosp-dictionary-tools) for creating `.dict` files. See the `example_()` functions in the Python script for how to use it; simply adjust paths and add your sentence (or word) lists.
The script is still experimental, rather slow, and may produce bad dictionaries in some languages. Some words seem to be wrongly added (e.g. "i" for English), and names are typically missing, though this depends on how exactly you create the Android dictionaries.
A "possibly_offensive" attribute is added for some words, which sometimes seems unnecessary. Currently this comes from the "nosuggest" attribute of the _hunspell_ dictionaries used, which is set for offensive words as well as for weird / rare word forms.
Other flags are currently missing, as are shortcuts (e.g. ill -> I'll or écoeuré -> écœuré, as found in AOSP dictionaries). A minimal usage sketch follows below.
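For orientation, here is a minimal usage sketch along the lines of the script's `example_1()`; all paths are placeholders to adjust:

```python
from spylls.hunspell import Dictionary

# hunspell dictionary base path (without .aff/.dic extension), used for spell-checking candidate words
d = Dictionary.from_files("/path/to/hunspell/en_US")
w = wordlist(dictionary=d)
# sentence list, e.g. from https://wortschatz.uni-leipzig.de/en/download/
w.add_sentence_file("/path/to/eng_news_2020_100K-sentences.txt", add_unknown_words=False)
# writes the Android wordlist.combined format described below
w.create_android_word_list("/path/to/en_US_wordlist.combined", "main", "en_US", "English (US)", 1)
```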
-----
`wordlist.combined` file info (mostly guessed; no documentation found):
* header is necessary
  * format like `dictionary=main:en_us,locale=en_US,description=English (US),date=1414726260,version=54`
  * all of these fields are necessary, though `description` is not used
  * German dictionaries also have `REQUIRES_GERMAN_UMLAUT_PROCESSING=1`
* each word is in a line like ` word=re,f=0,flags=abbreviation,originalFreq=99,possibly_offensive=true`
  * `word` is the word (necessary)
  * `f` is frequency, from 0 to 255(?) (necessary)
    * higher value is more likely to get suggested / corrected
    * special value `whitelist`, possibly equal to 15
    * `f=0` will not be suggested if bad words are blocked, and will never be added to user history
      * possible bug: words with `possibly_offensive=true` and `f=0` will be suggested when not blocking offensive words, but other words with `f=0` are still not suggested
  * `originalFreq`: unclear, is this used?
  * `flags`: `medical`, `technical`, `hand-added`, `babytalk`, `abbreviation`, `offensive`, `nonword`, and probably more: are they used for anything?
  * `possibly_offensive=true` stops the word from being suggested when blocking offensive words
  * `not_a_word=true` will not be suggested, use together with `shortcut`
  * `shortcut=<s>` (below a `<word>`) will suggest `<s>` when the `<word>` is typed
    * which `f` to use? maybe only 0-14 and `whitelist` allowed
    * what does `f` do here?
  * `bigram=<b>` (below a `<word>`) will suggest `<b>` as next word before typing any letters
    * what does `f` do here? Looks like 1, 2, and 3 are used for the usual 3 bigram entries
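Putting these notes together, a minimal hand-written `wordlist.combined` could look like this (words and frequencies are made up for illustration):

```
dictionary=main:en_us,locale=en_US,description=English (US),date=1414726260,version=54
 word=the,f=222
  bigram=first,f=1
  bigram=same,f=2
 word=ill,f=0,not_a_word=true
  shortcut=I'll,f=whitelist
 word=damn,f=83,possibly_offensive=true
```

Such a file can then be compiled into a `.dict` file with the `dicttool_aosp.jar` mentioned above (see its repository for the exact invocation).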

View file: create_wordlist_from_sentences.py (deleted)

@@ -1,466 +0,0 @@
#!/bin/python
import math
import os
import time
import regex
import copy
from spylls.hunspell import Dictionary

# issues:
#  for english got 'i' as word (shouldn't it be 'I' only? or is 'i' the imaginary number?)
#  potentially_offensive is not set, where to get info? parse android dicts?
#  maybe ignore compound words like 'long-term'? will android actually suggest them?

# maybe useful
#  https://wortschatz.uni-leipzig.de/en/download/
#   really useful source of sentences / fragments, but should be checked against dictionaries as they are taken from news / web
#  https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists
#   word frequency lists linked, in some cases there are also sentence lists
#  https://github.com/wooorm/dictionaries
#   hunspell dicts, are they the same as the one included in phunspell?

# memory usage depends on word lists and language, expect 0.5 - 2 GB
# for some reason, Italian requires 4 GB for unmunch


# from https://github.com/zverok/spylls/blob/master/examples/unmunch.py
def unmunch_word(word, aff):
    result = set()

    if aff.FORBIDDENWORD and aff.FORBIDDENWORD in word.flags:
        return result

    if not (aff.NEEDAFFIX and aff.NEEDAFFIX in word.flags):
        result.add(word.stem)

    suffixes = [
        suffix
        for flag in word.flags
        for suffix in aff.SFX.get(flag, [])
        if suffix.cond_regexp.search(word.stem)
    ]
    prefixes = [
        prefix
        for flag in word.flags
        for prefix in aff.PFX.get(flag, [])
        if prefix.cond_regexp.search(word.stem)
    ]

    for suffix in suffixes:
        root = word.stem[0:-len(suffix.strip)] if suffix.strip else word.stem
        suffixed = root + suffix.add
        if not (aff.NEEDAFFIX and aff.NEEDAFFIX in suffix.flags):
            result.add(suffixed)

        secondary_suffixes = [
            suffix2
            for flag in suffix.flags
            for suffix2 in aff.SFX.get(flag, [])
            if suffix2.cond_regexp.search(suffixed)
        ]
        for suffix2 in secondary_suffixes:
            root = suffixed[0:-len(suffix2.strip)] if suffix2.strip else suffixed
            result.add(root + suffix2.add)

    for prefix in prefixes:
        root = word.stem[len(prefix.strip):]
        prefixed = prefix.add + root
        if not (aff.NEEDAFFIX and aff.NEEDAFFIX in prefix.flags):
            result.add(prefixed)

        if prefix.crossproduct:
            additional_suffixes = [
                suffix
                for flag in prefix.flags
                for suffix in aff.SFX.get(flag, [])
                if suffix.crossproduct and suffix not in suffixes and suffix.cond_regexp.search(prefixed)
            ]
            for suffix in suffixes + additional_suffixes:
                root = prefixed[0:-len(suffix.strip)] if suffix.strip else prefixed
                suffixed = root + suffix.add
                result.add(suffixed)

                secondary_suffixes = [
                    suffix2
                    for flag in suffix.flags
                    for suffix2 in aff.SFX.get(flag, [])
                    if suffix2.crossproduct and suffix2.cond_regexp.search(suffixed)
                ]
                for suffix2 in secondary_suffixes:
                    root = suffixed[0:-len(suffix2.strip)] if suffix2.strip else suffixed
                    result.add(root + suffix2.add)

    return result


def unmunch_dictionary(dictionary: Dictionary) -> set[str]:
    result = set()
    for word in dictionary.dic.words:
        result.update(unmunch_word(word, dictionary.aff))
    return result


class wordlist:
    def __init__(self,
                 # spylls dictionary
                 dictionary: Dictionary | None = None,
                 # words that should be ignored, typically (international) names we don't want in a language word list
                 neutral_words: set[str] | None = None):
        self.dictionary = dictionary
        self.dict_words: set[str] = set()
        if neutral_words is None:
            self.neutral_words = set()
        else:
            self.neutral_words = neutral_words

    # number of identified words
    count = 0
    # number of words used for frequency
    count_valid = 0
    # words to ignore, as they should be in some additional dictionary (mostly names)
    #  these are not counted as valid or invalid, and not used for next-word data
    neutral_word_count = 0
    # words detected as invalid, these are mostly names and capitalized words (possibly also part of names)
    invalid_words: set[str] = set()
    not_words: set[str] = set()
    # unclear words with more than one match group in possible_word_regex below
    #  check and decide in the end what to do
    weird_things: set[str] = set()
    # for each word, contains a dict with:
    #  count: int (always)
    #  next: dict[str, int] (not always)
    #   how often the word is followed by some others (next_word, count)
    #  nosuggest: bool (usually only if True, as determined by hunspell dict)
    word_infos: dict = {}
    # regex that kicks out things that are definitely not words
    #  next word will be treated as ngram start
    #  allow latin letters, and ' and - (but not at start/end)
    possible_word_regex = r"(?!['-])([\p{L}\d'-]+)(?<!['-])"  # \p{L} requires regex, not re

    # adds words that are valid according to dictionary
    # this is useful for adding many word forms that are valid but not used frequently
    def add_unmunched_dictionary(self, unmunched_cache: str | None = None):
        unmunched: set[str] = set()
        if unmunched_cache is not None and os.path.isfile(unmunched_cache):
            try:
                with open(unmunched_cache) as f:
                    for w in f:
                        unmunched.add(w.strip())
            except:
                print(f"error reading {unmunched_cache}")
        if len(unmunched) == 0:
            s = unmunch_dictionary(self.dictionary)
            # unmunch may create word fragments
            #  remove words that are not valid according to dictionary
            #  or that start or end with -
            # unfortunately this can be really slow depending on language, seen from a few seconds up to hours (cs)
            #  but with the cache it's ok
            for word in s:
                if not word.startswith("-") and not word.endswith("-") and not word.isdigit():
                    # don't care about whether word is already in word_infos, we only want hunspell words
                    if self.dictionary.lookuper(word, capitalization=False, allow_nosuggest=False):
                        unmunched.add(word)
                    elif self.dictionary.lookuper(word, capitalization=False, allow_nosuggest=True):
                        unmunched.add(f"nosuggest:{word}")
            if unmunched_cache is not None:
                try:
                    with open(unmunched_cache, 'w') as f:
                        f.writelines([str(i) + '\n' for i in unmunched])
                except:
                    print(f"could not write to {unmunched_cache}")
        count = 0
        for word in unmunched:
            if word not in self.word_infos:
                if word.startswith("nosuggest:"):
                    word = word[10:]
                    self.add_word(word, True)
                else:
                    self.add_word(word)
                count += 1
        print(count, "words added using add_unmunched_dictionary")

    # tries adding a line, which is a sentence or sentence fragment
    # if next-word information is not wanted, use add_word instead
    # currently ngram ends after every non-word character
    #  like 2, ", ,, ., -
    #  any cases where this shouldn't happen?
    def add_line(self, line: str,
                 # True: all words will be added, except if they start with an uppercase letter and
                 #  previous_word is None (careful, this can easily add spelling mistakes)
                 # False: if no dictionary, no words will be added
                 #  if dictionary, words will still be added if they are found in a case-sensitive lookup
                 add_unknown_words: bool = False) -> None:
        previous_word: str | None = None
        for word in line.split():
            if word in self.word_infos:
                # shortcut: we already know the word, avoid doing the regex check and dict lookup if possible
                #  only increase count and add next word info
                self.add_word(word)
                if previous_word is not None:
                    previous_info = self.word_infos[previous_word]
                    previous_next = previous_info.get("next", {})
                    previous_next[word] = previous_next.get(word, 0) + 1
                    previous_info["next"] = previous_next
                previous_word = word
                continue
            if len(word) >= 48:
                # android dicttool ignores those, so let's skip them already here
                previous_word = None
                continue
            if word.isspace():
                # don't treat spaces as countable word (assuming a line does not contain a line break)
                continue
            if word.isnumeric():
                # don't put numbers into not_words
                previous_word = None
                continue
            if not regex.search(r"\p{L}", word):
                # no letters, no word (but ngram ends here)
                self.not_words.add(word)
                previous_word = None
                continue
            # hunspell dict has ', not ’, but we want to understand both
            word = word.replace('’', '\'')
            # must find something, because r"\p{L}" non-matches are already removed
            re_find = regex.findall(self.possible_word_regex, word)
            self.count += 1
            if len(re_find) > 1:
                # just write down and end sentence for now
                self.weird_things.add(word)
                previous_word = None
            # treat re_find[0] as the actual word, but need to investigate weird_things to maybe improve possible_word_regex
            full_word = word
            word = re_find[0]
            # if the match is not at the start, treat as ngram start
            if not full_word.startswith(word):
                previous_word = None
            if word in self.neutral_words:
                self.neutral_word_count += 1
                previous_word = None
                continue
            if word in self.invalid_words:
                previous_word = None
                continue
            if word not in self.word_infos:
                if add_unknown_words:
                    if previous_word is None and word[0].isupper():
                        continue
                else:
                    try:
                        valid, word = self.dict_check(word, previous_word is None)
                    except IndexError:
                        # happens for "İsmail" when using German dictionary, also for other words starting with "İ"
                        previous_word = None
                        continue
                    if not valid:
                        if previous_word is not None:
                            self.invalid_words.add(word)
                        previous_word = None
                        continue
                self.count_valid += 1
                self.add_word(word, add_to_count=False)
            if previous_word is not None:
                previous_info = self.word_infos[previous_word]
                previous_next = previous_info.get("next", {})
                previous_next[word] = previous_next.get(word, 0) + 1
                previous_info["next"] = previous_next
            # set new previous word, or None if ngram end is suspected (this could be optimized, but no priority)
            if full_word.endswith(word):
                previous_word = word
            else:
                previous_word = None

    # returns whether word is valid according to the dictionary, and, for the case it was capitalized, the valid form
    def dict_check(self, word: str, try_decapitalize: bool) -> tuple[bool, str]:
        if try_decapitalize and word[0].isupper():
            decapitalized = word[0].lower() + word[1:]
            if decapitalized in self.word_infos:
                return True, decapitalized
            # todo: lookup can be slow, optimize order with capitalization and nosuggest
            if not self.dictionary.lookuper(word, capitalization=True, allow_nosuggest=True):
                return False, word
            # word may be valid, check capitalization and nosuggest
            if self.dictionary.lookuper(word, capitalization=False, allow_nosuggest=False):
                return True, word
            if self.dictionary.lookuper(decapitalized, capitalization=False, allow_nosuggest=False):
                return True, decapitalized
            if self.dictionary.lookuper(word, capitalization=False, allow_nosuggest=True):
                self.word_infos[word] = {"nosuggest": True}
                return True, word
            if self.dictionary.lookuper(decapitalized, capitalization=False, allow_nosuggest=True):
                self.word_infos[decapitalized] = {"nosuggest": True}
                return True, decapitalized
            return False, word

        # we always want correct capitalization
        #  maybe invert order for better performance, similar to above
        if not self.dictionary.lookuper(word, capitalization=False, allow_nosuggest=True):
            return False, word
        if self.dictionary.lookuper(word, capitalization=False, allow_nosuggest=False):
            return True, word
        self.word_infos[word] = {"nosuggest": True}
        return True, word

    def add_word(self, word: str, nosuggest: bool = False, add_to_count: bool = True):
        word_info = self.word_infos.get(word, {})
        word_info["count"] = word_info.get("count", 0) + 1
        if nosuggest:
            word_info["nosuggest"] = True
        self.word_infos[word] = word_info
        if add_to_count:
            self.count += 1
            self.count_valid += 1

    def add_sentence_file(self, filename: str, add_unknown_words: bool = False):
        with open(filename) as f:
            for line in f:
                self.add_line(line, add_unknown_words)

    def add_word_file(self, filename: str):
        with open(filename) as f:
            for line in f:
                for word in line.split():
                    self.add_word(word)

    # dicts need all the input, but only type and locale are relevant
    #  type can be any ASCII string, but typically main is used
    #   note that only one dict can be loaded for each type
    #   using main also overrides any built-in dictionary in my version of OpenBoard
    #  locale should be in compatible format (e.g. en, en_US, fr, fr_CA, ...)
    def create_android_word_list(self, file_path, dict_type: str, locale: str, description: str, version: int):
        with open(file_path, 'w') as f:
            t = int(time.time())
            # e.g. dictionary=main:en_us,locale=en_US,description=English (US),date=1414726260,version=54
            header = f"dictionary={dict_type}:{locale.lower()},locale={locale},description={description},date={t},version={version}"
            if locale.startswith("de"):
                header += ",REQUIRES_GERMAN_UMLAUT_PROCESSING=1"
            # any special things for other languages?
            #  russian dict has MULTIPLE_WORDS_DEMOTION_RATE=50 -> what's this?
            f.write(header + "\n")
            # deep copy to avoid modifying self.word_infos
            word_infos = copy.deepcopy(self.word_infos)
            # todo: check android dicts and maybe some documentation about frequencies
            add_frequencies(word_infos, 1, 250)
            filter_bigrams(word_infos, 3, 2)
            for word, infos in sorted(word_infos.items(), key=lambda item: -item[1]["count"]):
                frequency = infos["frequency"]
                if infos.get("nosuggest", False):
                    # todo: frequency of nosuggest words?
                    #  in AOSP dicts there are possibly_offensive words with freq > 0, but profanity has frequency 0
                    #  dictionaryFacilitator will add freq == 0 to history only as invalid words
                    #   -> what happens here? try and compare "hardcore" (f=112) and "Cid" (f=0)
                    #  hunspell nosuggest english is insults/slurs, which are f=0 in AOSP dictionaries
                    #  other possibly_offensive words found in AOSP dictionaries are not flagged at all
                    #   -> maybe find a way to extract this information from existing dictionaries?
                    #  but hunspell nosuggest german is also weird/rare word forms
                    f.write(f" word={word},f={frequency},possibly_offensive=true\n")
                else:
                    f.write(f" word={word},f={frequency}\n")
                if "next" in infos:
                    for next_word, freq in infos["next"].items():
                        f.write(f"  bigram={next_word},f={freq}\n")
# adds a "frequency" entry to each entry of word_infos
# frequency is the log of input frequencies, and scaled between min_frequency and max_frequency
def add_frequencies(word_infos: dict[str, int], min_frequency: int, max_frequency: int):
assert max_frequency > min_frequency
max_count = 0
min_count = 2147483647 # simply start with a very large number (int32 max)
# first get max and min count
for _, infos in word_infos.items():
count = infos["count"]
if count < min_count:
min_count = count
if count > max_count:
max_count = count
min_f = math.log(min_count)
fdiff = max(math.log(max_count) - min_f, 1)
for word, infos in word_infos.items():
f = math.log(infos["count"])
infos["frequency"] = int((f - min_f) * (max_frequency - min_frequency) / fdiff + min_frequency)


# modifies word_infos:
#  fewer entries per word (limiting to max_bigrams and requiring min_count occurrences)
#  frequency replaced by order, starting at 1 for the most used, like it seems to be in the AOSP en(_US) dictionary
def filter_bigrams(word_infos: dict, max_bigrams, min_count):
    for word, infos in word_infos.items():
        if "next" not in infos:
            continue
        bigram = infos["next"]
        new_bigram = dict()
        bigram_count = 1
        for next_word, next_count in sorted(bigram.items(), key=lambda item: -item[1]):
            if bigram_count > max_bigrams or next_count < min_count:
                break
            new_bigram[next_word] = bigram_count
            bigram_count += 1
        infos["next"] = new_bigram


# highest frequency first
def sort_dict_by_count(d: dict[str, int]):
    return sorted(d.items(), key=lambda item: -item[1])


# use existing dictionary for spell check
# use sentence list to build word list
def example_1():
    d = Dictionary.from_files("/home/user/.local/lib/python3.10/site-packages/phunspell/data/dictionary/en/en_US")
    w = wordlist(dictionary=d)
    w.add_sentence_file("/home/user/eng_news_2020_100K-sentences.txt",
                        add_unknown_words=False)  # will only add words that pass the spell check
    w.create_android_word_list("/home/user/en_US_wordlist.compiled", "main", "en_US", "english", 1)


# use existing dictionary for spell check
# use words from unmunched (affix-expanded) dictionary
#  creates a much larger wordlist in some languages
# use sentence list to build word list
#  this is mostly for frequencies and next words, but may also add new words in some languages, e.g. German compound words
def example_2():
    d = Dictionary.from_files("/home/user/.local/lib/python3.10/site-packages/phunspell/data/dictionary/en/en_US")
    w = wordlist(dictionary=d)
    # unmunched cache not necessary for english, but helps for e.g. german or czech
    w.add_unmunched_dictionary(unmunched_cache="/home/user/en_unmunched.txt")  # adds all words with frequency 1
    w.add_sentence_file("/home/user/eng_news_2020_100K-sentences.txt", add_unknown_words=False)
    w.create_android_word_list("/home/user/en_US_wordlist.compiled", "main", "en_US", "english", 1)


# don't use a dictionary, only a word list
#  this will produce low-quality suggestions, as word count is the same for all words
#  but if the word list contains duplicates, it will affect word count
def example_3():
    w = wordlist()
    w.add_word_file("/home/user/some_word_list.txt")
    w.create_android_word_list("/home/user/en_US_wordlist.compiled", "main", "en_US", "english", 1)


# don't use a dictionary, but provide a word list
# use a sentence file for word count and next word suggestions
def example_4():
    w = wordlist()
    w.add_word_file("/home/user/some_word_list.txt")
    w.add_sentence_file("/home/user/eng_news_2020_100K-sentences.txt", add_unknown_words=False)
    w.create_android_word_list("/home/user/en_US_wordlist.compiled", "main", "en_US", "english", 1)


# don't use a dictionary, but a list of sentences
#  android word list may contain spelling errors depending on source of the sentences
def example_5():
    w = wordlist()
    w.add_sentence_file("/home/user/eng_news_2020_100K/eng_news_2020_100K-sentences.txt",
                        add_unknown_words=True)  # add all words to the word list, except some obvious non-words
    w.create_android_word_list("/home/user/en_US_wordlist.compiled", "main", "en_US", "english", 1)

Binary files not shown (17 files).

Some files were not shown because too many files have changed in this diff.