add experimental dictionaries, and script to create them from sentence lists (wip)

This commit is contained in:
Helium314 2023-07-05 18:04:07 +02:00
parent 5b3ddaaadd
commit d9358f64de
15 changed files with 394 additions and 0 deletions

View file

@@ -11,6 +11,7 @@ Changes:
* Allow users to add and replace built-in dictionaries
  * modified / improved from https://github.com/openboard-team/openboard/pull/569 and https://github.com/openboard-team/openboard/pull/578
  * dictionaries are available at https://github.com/Helium314/openboard/tree/new/dictionaries/dict
  * experimental dictionaries with next-word suggestions created from sentence lists in https://github.com/Helium314/openboard/tree/new/dictionaries/experimental, but they may contain unwanted words, and will suggest offensive words even with the option turned off
  * dictionary files starting with "main_" replace the built-in dictionary for the language, all other names work as add-on dictionaries
* add Arabic dictionary for download, from https://github.com/remi0s/aosp-dictionary-tools/blob/master/dictsCreated/WikiAndOpenSubtitles/ar_wordlist.combined / https://github.com/openboard-team/openboard/pull/450
* add Hebrew dictionary for download, from https://github.com/Hananel-Hazan/aosp-dictionary-tools/blob/master/hebrew-hspell.txt.combined.new / https://github.com/openboard-team/openboard/pull/300

View file

@@ -0,0 +1,7 @@
This directory contains dictionaries compiled from sentence lists, to make use of next-word predictions.
Currently all dictionaries are based on word lists available at https://wortschatz.uni-leipzig.de/en/download/ under the [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) license.
Dictionaries are created in two steps: create_wordlist_from_sentences.py (in this directory) produces `<locale>_wordlist.combined`, and [`dicttool_aosp.jar`](https://github.com/remi0s/aosp-dictionary-tools) converts that into a `.dict` file. See the `example_()` functions in the Python script for how to use it: simply adjust the paths and add your sentence lists (or word lists with one word per line), as in the sketch below.
The script is still experimental, rather slow, and may produce bad dictionaries for some languages. Some words seem to be wrongly added (e.g. "i" for English, or "Der" for German), and names are typically missing, though this depends on how exactly you create the Android dictionaries.
Flags (like "offensive") are currently missing, as are shortcuts (e.g. ill -> I'll, or écoeuré -> écœuré, as found in the AOSP dictionaries).

View file

@@ -0,0 +1,386 @@
#!/bin/python
import math
import os
import time

import regex
from spylls.hunspell import Dictionary

# issues:
#  for English we get 'i' as a word (shouldn't it be 'I' only? or is 'i' the imaginary number?)
#  potentially_offensive is not set; where to get the info? parse the android dicts?
#  maybe ignore compound words like 'long-term'? will android actually suggest them?

# maybe useful:
#  https://wortschatz.uni-leipzig.de/en/download/
#   really useful source of sentences / fragments, but they should be checked against dictionaries, as they are taken from news / the web
#  https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists
#   word frequency lists are linked, and in some cases there are also sentence lists
#  https://github.com/wooorm/dictionaries
#   hunspell dicts; are they the same as the ones included in phunspell?

# memory usage depends on the word lists and the language; expect 0.5 - 2 GB
# from https://github.com/zverok/spylls/blob/master/examples/unmunch.py
def unmunch_word(word, aff):
    result = set()

    if aff.FORBIDDENWORD and aff.FORBIDDENWORD in word.flags:
        return result

    if not (aff.NEEDAFFIX and aff.NEEDAFFIX in word.flags):
        result.add(word.stem)

    suffixes = [
        suffix
        for flag in word.flags
        for suffix in aff.SFX.get(flag, [])
        if suffix.cond_regexp.search(word.stem)
    ]
    prefixes = [
        prefix
        for flag in word.flags
        for prefix in aff.PFX.get(flag, [])
        if prefix.cond_regexp.search(word.stem)
    ]

    for suffix in suffixes:
        root = word.stem[0:-len(suffix.strip)] if suffix.strip else word.stem
        suffixed = root + suffix.add
        if not (aff.NEEDAFFIX and aff.NEEDAFFIX in suffix.flags):
            result.add(suffixed)

        secondary_suffixes = [
            suffix2
            for flag in suffix.flags
            for suffix2 in aff.SFX.get(flag, [])
            if suffix2.cond_regexp.search(suffixed)
        ]
        for suffix2 in secondary_suffixes:
            root = suffixed[0:-len(suffix2.strip)] if suffix2.strip else suffixed
            result.add(root + suffix2.add)

    for prefix in prefixes:
        root = word.stem[len(prefix.strip):]
        prefixed = prefix.add + root
        if not (aff.NEEDAFFIX and aff.NEEDAFFIX in prefix.flags):
            result.add(prefixed)

        if prefix.crossproduct:
            additional_suffixes = [
                suffix
                for flag in prefix.flags
                for suffix in aff.SFX.get(flag, [])
                if suffix.crossproduct and suffix not in suffixes and suffix.cond_regexp.search(prefixed)
            ]
            for suffix in suffixes + additional_suffixes:
                root = prefixed[0:-len(suffix.strip)] if suffix.strip else prefixed
                suffixed = root + suffix.add
                result.add(suffixed)

                secondary_suffixes = [
                    suffix2
                    for flag in suffix.flags
                    for suffix2 in aff.SFX.get(flag, [])
                    if suffix2.crossproduct and suffix2.cond_regexp.search(suffixed)
                ]
                for suffix2 in secondary_suffixes:
                    root = suffixed[0:-len(suffix2.strip)] if suffix2.strip else suffixed
                    result.add(root + suffix2.add)

    return result

def unmunch_dictionary(dictionary: Dictionary) -> set[str]:
    result = set()
    for word in dictionary.dic.words:
        result.update(unmunch_word(word, dictionary.aff))
    return result

class wordlist:
    def __init__(self,
                 # spylls dictionary
                 dictionary: Dictionary | None = None,
                 # words that should be ignored, typically (international) names we don't want in a language word list
                 neutral_words: set[str] | None = None,
                 # path where the unmunched dictionary is cached (because creation is really slow);
                 # the file will be written if it could not be read
                 unmunched_cache: str | None = None):
        self.dictionary = dictionary
        self.dict_words: set[str] = set()
        if dictionary is not None:
            # this and the whole unmunch step would possibly not be necessary if hunspell had implemented
            # this 15+ year old feature request: https://github.com/hunspell/hunspell/issues/258
            # use the spell-checked unmunched list from unmunched_cache if available
            if unmunched_cache is not None and os.path.isfile(unmunched_cache):
                try:
                    with open(unmunched_cache) as f:
                        for w in f:
                            self.dict_words.add(w.strip())
                except OSError:
                    print(f"error reading {unmunched_cache}")
                    self.dict_words = set()
            if len(self.dict_words) == 0:
                s = unmunch_dictionary(dictionary)
                # unmunch may create word fragments, so remove words that are not valid
                # according to the dictionary, or that start or end with -
                # unfortunately this can be really slow depending on the language,
                # seen from a few seconds up to hours (cs), but with the cache it's ok
                for word in s:
                    if not word.startswith("-") and not word.endswith("-") and not word.isdigit() and dictionary.lookup(word):
                        self.dict_words.add(word)
                if unmunched_cache is not None:
                    try:
                        with open(unmunched_cache, 'w') as f:
                            f.writelines([str(i) + '\n' for i in self.dict_words])
                    except OSError:
                        print(f"could not write to {unmunched_cache}")
        if neutral_words is None:
            self.neutral_words = set()
        else:
            self.neutral_words = neutral_words

    # number of identified words
    count = 0
    # number of words used for frequency
    count_valid = 0
    # word and number of occurrences
    frequencies: dict[str, int] = {}
    # contains a dict for each word, with the next word and its number of occurrences
    bigrams: dict[str, dict[str, int]] = {}
    # number of occurrences of neutral words, i.e. words to ignore, as they should be in
    # some additional dictionary (mostly names); these are not counted as valid
    neutral_word_count = 0
    # words detected as invalid
    invalid_words: set[str] = set()
    not_words: set[str] = set()
    # unclear words with more than one match group in possible_word_regex below;
    # check these and decide in the end what to do with them
    weird_things: set[str] = set()
    # regex that kicks out things that are definitely not words;
    # the next word will be treated as an ngram start
    # allow latin letters and digits, plus ' and - (but not at start/end)
    possible_word_regex = r"(?!['-])([\p{L}\d'-]+)(?<!['-])"  # \p{L} requires regex, not re
    # possible_word_regex = r"([\w\d]+)"  # this works with re, but is problematic for most languages

    def add_unmunched_dictionary(self):
        count = 0
        for word in self.dict_words:
            if word not in self.frequencies:
                self.frequencies[word] = 1
                count += 1
        print(count, "words added using add_unmunched_dictionary")

    # currently an ngram ends after every non-word character, like 2, ", ,, ., -
    # are there any cases where this shouldn't happen?
    def add_line(self, line: str,
                 # True for proper sentences / sentence fragments, False for word lists
                 add_to_bigrams: bool = True,
                 # set to False if the line may contain spelling mistakes or other words that should not be learned
                 add_new_words: bool = True) -> None:
        previous_word: str | None = None
        for word in line.split():
            if len(word) >= 48:
                # android dicttool ignores such words, so let's skip them already here
                previous_word = None
                continue
            if word.isspace():
                # don't treat spaces as countable words (assuming a line does not contain a line break)
                continue
            if word.isnumeric():
                # don't put numbers into not_words
                previous_word = None
                continue
            if not regex.search(r"\p{L}", word):
                # no letters, no word (but the ngram ends here)
                self.not_words.add(word)
                previous_word = None
                continue
            # the hunspell dict has ', not ’, but we want to understand both
            word = word.replace('’', "'")
            # this must find something, because r"\p{L}" non-matches are already removed
            re_find = regex.findall(self.possible_word_regex, word)
            self.count += 1
            if len(re_find) > 1:
                self.weird_things.add(word)
            # treat re_find[0] as the actual word; weird_things needs investigating to maybe improve possible_word_regex
            full_word = word
            word = re_find[0]
            # if the match is not at the start, treat the next word as an ngram start
            if not full_word.startswith(word):
                previous_word = None
            if word in self.neutral_words:
                self.neutral_word_count += 1
                continue
            if len(self.dict_words) == 0:  # no (valid) dictionary
                if previous_word is None and word[0].isupper() and word not in self.frequencies:
                    # uppercase might come from the sentence start, so better ignore this word
                    continue
            elif word not in self.dict_words:
                # at an ngram start, look up the version with the first letter lowercased;
                # this is not always a sentence start, but should be quite often
                if previous_word is None and word[0].isupper():
                    decapitalized = word[0].lower() + word[1:]
                    if decapitalized in self.dict_words:
                        word = decapitalized
                    else:
                        # don't add it to invalid words; we're not sure, so just ignore it
                        continue
                else:
                    # the word was not capitalized, so checking with hunspell should be fine.
                    # this finds some words that are not in the unmunched dict_words, especially compound words,
                    # but also words we don't want, where capitalization makes the word invalid... hunspell can't deal with this
                    # the only way so far: find such words manually and add them to self.neutral_words
                    # TODO: possible approach for allowing compound words while not allowing capitalized versions of common words:
                    #  treat short words (e.g. fewer than 8 or 10 letters) as always invalid here,
                    #  but this likely has unwanted side effects... just try it and check the invalid word list (use a separate one for initial testing)
                    # we also want to exclude full-uppercase words here (these are not in dict_words, but allowed by hunspell)
                    try:
                        if (self.dictionary is not None and not self.dictionary.lookup(word)) or word.isupper():
                            self.invalid_words.add(word)
                            previous_word = None
                            continue
                    except IndexError:
                        # happens for İsmail and other words starting with 'İ', possibly a spylls issue
                        print(f"dictionary issue when looking up {word}")
                        self.invalid_words.add(word)
                        previous_word = None
                        continue
            if add_new_words or word in self.frequencies:
                self.frequencies[word] = self.frequencies.get(word, 0) + 1
                self.count_valid += 1
            if previous_word is not None and (add_new_words or word in self.frequencies):
                bigram = self.bigrams.get(previous_word, {})
                bigram[word] = bigram.get(word, 0) + 1
                self.bigrams[previous_word] = bigram
            # set the new previous word, or None if an ngram end is suspected (this could be optimized, but it's not a priority)
            if add_to_bigrams and full_word.endswith(word):
                previous_word = word
            else:
                previous_word = None

    def add_file(self, filename: str, add_to_bigrams: bool = True, add_new_words: bool = True):
        with open(filename) as f:
            for line in f:
                self.add_line(line, add_to_bigrams, add_new_words)

    # dicts need all the input, but only type and locale are relevant
    # type can be any ASCII string, but typically "main" is used
    # note that only one dict can be loaded for each type
    # using "main" also overrides any built-in dictionary in my version of OpenBoard
    # locale should be in a compatible format (e.g. en, en_US, fr, fr_CA, ...)
    def create_android_word_list(self, file_path, dict_type: str, locale: str, description: str, version: int):
        with open(file_path, 'w') as f:
            t = int(time.time())
            # e.g. dictionary=main:en_us,locale=en_US,description=English (US),date=1414726260,version=54
            header = f"dictionary={dict_type}:{locale.lower()},locale={locale},description={description},date={t},version={version}"
            if locale.startswith("de"):
                header += ",REQUIRES_GERMAN_UMLAUT_PROCESSING=1"
            # are any special entries needed for other languages?
            # the russian dict has MULTIPLE_WORDS_DEMOTION_RATE=50 -> what's this?
            f.write(header + "\n")
            bigrams = filter_bigrams(self.bigrams, 3, 2)  # todo: maybe set min to 3 to avoid huge dicts because of rare words
            # todo: check android dicts and maybe some documentation about frequencies
            for word, frequency in sort_dict_by_frequency(normalize_frequencies(self.frequencies, 230, 1)):
                # todo: check other dicts whether flags or originalFreq are necessary, and what they actually do
                f.write(f" word={word},f={frequency}\n")
                if word not in bigrams:
                    continue
                for next_word, freq in bigrams[word].items():
                    f.write(f"  bigram={next_word},f={freq}\n")

# returns a dict with the frequencies being the log of the input frequencies, scaled to between min_frequency and max_frequency
def normalize_frequencies(frequencies: dict[str, int], max_frequency: int, min_frequency: int) -> dict[str, int]:
    assert max_frequency > min_frequency
    max_f = 0
    min_f = 2147483647  # simply start with a very large number (int32 max)
    d = {}
    for word, f in frequencies.items():
        f = math.log(f)
        if f < min_f:
            min_f = f
        if f > max_f:
            max_f = f
    fdiff = max(max_f - min_f, 1)
    for word, f in frequencies.items():
        f = math.log(f)
        d[word] = int((f - min_f) * (max_frequency - min_frequency) / fdiff + min_frequency)
    return d

# returns a dict like bigrams, but with
#  fewer entries (limited to max_bigrams per word, and requiring min_bigram_frequency), and
#  the frequency replaced by the rank, starting at 1 for the most frequent, like it seems to be in the AOSP en(_US) dictionary
def filter_bigrams(bigrams: dict[str, dict[str, int]], max_bigrams, min_bigram_frequency) -> dict[str, dict[str, int]]:
    new_bigrams: dict[str, dict[str, int]] = {}
    for word, bigram in bigrams.items():
        new_bigram = dict()
        count = 1
        for next_word, frequency in sort_dict_by_frequency(bigram):
            if count > max_bigrams or frequency < min_bigram_frequency:
                break
            new_bigram[next_word] = count
            count += 1
        new_bigrams[word] = new_bigram
    return new_bigrams

# highest frequency first
def sort_dict_by_frequency(d: dict[str, int]):
    return sorted(d.items(), key=lambda item: -item[1])

# use an existing dictionary for spell checking (not perfect with capitalization)
# use a sentence list to build frequencies and bigrams
def example_1():
    d = Dictionary.from_files("/home/user/.local/lib/python3.10/site-packages/phunspell/data/dictionary/en/en_US")
    # the unmunched cache is not necessary for English, but helps for e.g. German or Czech
    w = wordlist(dictionary=d, unmunched_cache="/home/user/unmunched_en_US.txt")
    w.add_file("/home/user/eng_news_2020_100K/eng_news_2020_100K-sentences.txt",
               add_to_bigrams=True,  # these are sentences, so use them to build bigrams
               add_new_words=True)  # we don't have any words in our frequency list yet, so False would result in nothing happening
    w.create_android_word_list("/home/user/en_US_wordlist.compiled", "main", "en_US", "english", 1)
    print("frequencies", w.frequencies)
    print("bigrams", w.bigrams)

# use an existing dictionary for spell checking (not perfect with capitalization)
# use the words from the unmunched (affix-expanded) dictionary as the only valid words;
# this is bad for languages that allow compound words, such as German
# use a sentence list to build frequencies and bigrams of the words in the list only
def example_2():
    d = Dictionary.from_files("/home/user/.local/lib/python3.10/site-packages/phunspell/data/dictionary/en/en_US")
    # the unmunched cache is not necessary for English, but helps for e.g. German or Czech
    w = wordlist(dictionary=d)
    w.add_unmunched_dictionary()  # adds all words with frequency 1
    w.add_file("/home/user/eng_news_2020_100K/eng_news_2020_100K-sentences.txt",
               add_to_bigrams=True,  # these are sentences, so use them to build bigrams
               add_new_words=False)  # we have a list of valid words and don't want to add more
    w.create_android_word_list("/home/user/en_US_wordlist.compiled", "main", "en_US", "english", 1)

# don't use a dictionary, only a word list;
# this will produce low-quality suggestions, as the frequency is the same for all words
def example_3():
    w = wordlist()
    w.add_file("/home/user/some_word_list.txt",
               add_to_bigrams=False)  # only words, without connections
    w.create_android_word_list("/home/user/en_US_wordlist.compiled", "main", "en_US", "english", 1)

# don't use a dictionary, but a list of sentences;
# the android word list may contain spelling errors, depending on the source of the sentences
def example_4():
    w = wordlist()
    w.add_file("/home/user/eng_news_2020_100K/eng_news_2020_100K-sentences.txt")
    w.create_android_word_list("/home/user/en_US_wordlist.compiled", "main", "en_US", "english", 1)
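
# possible follow-up step, shown here only as a sketch (this hypothetical helper is not one of the
# examples above): convert the generated word list into a binary .dict file using dicttool_aosp.jar
# from https://github.com/remi0s/aosp-dictionary-tools
# the "makedict -s <input> -d <output>" arguments are an assumption based on the AOSP dicttool;
# check the jar's own usage output if it differs
def example_compile_dict():
    import subprocess
    subprocess.run(["java", "-jar", "/home/user/dicttool_aosp.jar", "makedict",
                    "-s", "/home/user/en_US_wordlist.compiled",
                    "-d", "/home/user/main_en_us.dict"],
                   check=True)  # raises CalledProcessError if the conversion fails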

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.