2024-06-01 13:50:06 +02:00
|
|
|
#!/bin/python
|
|
|
|
|
|
|
|
import sys
|
|
|
|
import os
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
2024-06-01 22:18:19 +02:00
|
|
|
# script for generating information about actual diacritics use from language data
|
|
|
|
# input could be word lists (one word per line), or *-words.txt files from https://www.wortschatz.uni-leipzig.de/en/download
|
|
|
|
# diacritics.txt lists each language name on one line, followed by that language's comma-separated diacritics on the next line;
|
|
|
|
# it should ideally cover many languages, because a foreign diacritic can only be detected if some listed language uses it
|
|
|
|
# resulting data is usage count for language diacritics, and list of words containing non-language diacritics
|
|
|
|
|
|
|
|
|
2024-06-01 13:50:06 +02:00
|
|
|
# only files whose name ends with this suffix are picked up as word lists
file_ending_filter = "-words.txt"
# base directory that is searched for one sub-directory per language
word_lists_dir = "../../wordlists/"
# file listing the languages together with their diacritics
diacritics_file = "../../diacritics.txt"
|
2024-06-01 13:50:06 +02:00
|
|
|
|
|
|
|
|
|
|
|
def find_word_lists(language: str) -> list[str]:
    """Collect the word-list files available for ``language``.

    Recursively walks the directory ``word_lists_dir + language`` and returns
    the path of every file whose name ends with ``file_ending_filter``.
    An empty list is returned when that directory does not exist.
    """
    language_dir = word_lists_dir + language
    if not os.path.isdir(language_dir):
        return []
    return [
        folder + "/" + file_name
        for folder, _subdirs, file_names in os.walk(language_dir)
        for file_name in file_names
        if file_name.endswith(file_ending_filter)
    ]
|
|
|
|
|
|
|
|
|
|
|
|
def _foreign_diacritics_pattern(foreign: set[str]) -> "re.Pattern[str] | None":
    """Compile a regex matching any entry of ``foreign``; None if there is nothing to match."""
    # Empty entries are dropped: an empty alternative would match every line.
    # Longest-first ordering keeps the pattern deterministic and prefers the
    # most specific (multi-code-point) entry.
    candidates = sorted((dia for dia in foreign if dia), key=len, reverse=True)
    if not candidates:
        return None
    # Escaped alternation instead of a "[...]" character class: a class breaks
    # on metacharacters (']', '^', '-', '\\') and splits entries consisting of
    # several code points (e.g. base letter + combining mark) into single chars.
    return re.compile("|".join(map(re.escape, candidates)))


def _word_count(line: str) -> int:
    """Return the frequency column of ``line``, or 1 if the line has none."""
    try:
        # assuming the format from https://www.wortschatz.uni-leipzig.de/en/download
        return int(line.split("\t")[2])
    except (IndexError, ValueError):
        # plain word list (one word per line): every line counts once
        return 1


def check_diacritics(language: str, diacritics: list[str], all_diacritics: set[str]):
    """Write a diacritics usage report for ``language``.

    Every word-list file found by ``find_word_lists(language)`` is scanned line
    by line. A line containing a diacritic that is in ``all_diacritics`` but not
    in ``diacritics`` is recorded as "foreign". Otherwise the line's count is
    added to the usage count of each language diacritic it contains.

    The result is written to ``diacritics_report_<language>.txt`` in the current
    working directory. Nothing is done if no word lists are found or if the
    report file already exists.

    Args:
        language: language name, used for the word-list lookup and report name.
        diacritics: diacritics belonging to ``language``.
        all_diacritics: diacritics of all known languages; it is not modified.
    """
    word_lists = find_word_lists(language)
    if not word_lists:
        return

    report_file = f"diacritics_report_{language}.txt"
    if os.path.isfile(report_file):
        return

    # Set difference tolerates duplicates in `diacritics` and entries missing
    # from `all_diacritics` (set.remove would raise KeyError), and leaves the
    # caller's set untouched.
    foreign_pattern = _foreign_diacritics_pattern(all_diacritics.difference(diacritics))

    print("checking", language, "with", diacritics)

    foreigns = []
    # one counter per distinct diacritic, so a duplicated entry is not counted twice
    dia_count = dict.fromkeys(diacritics, 0)

    for word_list in word_lists:
        # explicit encoding: the data is non-ASCII and must not depend on the locale
        with open(word_list, encoding="utf-8") as f:
            for line in f:
                # lines with diacritics that are not in the language's list
                if foreign_pattern is not None and foreign_pattern.search(line):
                    foreigns.append(line.rstrip())
                    continue
                # otherwise count the language diacritics used in this line
                present = [dia for dia in dia_count if dia in line]
                if not present:
                    continue
                count = _word_count(line)
                for dia in present:
                    dia_count[dia] += count

    dia_results = "".join(
        (
            f"language: {language}\n",
            f"diacritics: {diacritics}\n",
            f"language diacritics counts: {dia_count}\n",
            "foreign diacritics:\n",
            "\n".join(foreigns),
        )
    )

    with open(report_file, "w", encoding="utf-8") as f:
        f.write(dia_results)
|
|
|
|
|
|
|
|
|
|
|
|
def make_all_diacritics(dia_lists: list[list[str]]) -> set[str]:
    """Return the set of all diacritics occurring in any of ``dia_lists``."""
    # union() with no arguments yields an empty set, so an empty input is fine
    return set().union(*dia_lists)
|
|
|
|
|
|
|
|
|
|
|
|
def read_diacritics() -> dict[str, list[str]]:
    """Read ``diacritics_file`` into a mapping of language -> list of diacritics.

    The file is read as pairs of lines: a language name, followed by a line of
    comma-separated diacritics for that language. Blank lines in the language
    position are skipped. Entries are stripped of surrounding whitespace and
    empty entries (e.g. from a trailing comma) are dropped.
    """
    d = {}
    language = ""
    # explicit encoding: the file holds non-ASCII characters and must not be
    # decoded with a locale-dependent default
    with open(diacritics_file, encoding="utf-8") as f:
        for line in f:
            if language == "":
                # expecting a language name; a blank line leaves us waiting for one
                language = line.strip()
            else:
                # empty entries are dropped: "" is a substring of every line and
                # would be counted for every word later on
                d[language] = [dia for dia in map(str.strip, line.split(",")) if dia]
                language = ""
    return d
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Generate a diacritics report for every language in the diacritics file."""
    per_language = read_diacritics()
    every_diacritic = make_all_diacritics(list(per_language.values()))
    for language, language_diacritics in per_language.items():
        # each call gets its own copy of the full set
        check_diacritics(language, language_diacritics, every_diacritic.copy())
|
|
|
|
|
|
|
|
|
|
|
|
# run the report generation only when executed as a script, not when imported
if __name__ == "__main__":
    main()
|