mirror of
https://github.com/LibreTranslate/LibreTranslate.git
synced 2025-06-18 23:21:00 +00:00
Use lingua for language detection
This commit is contained in:
parent
7be612419b
commit
6c5fa2a4ee
3 changed files with 40 additions and 88 deletions
|
@ -1,83 +1,36 @@
|
|||
# Originally adapted from https://github.com/aboSamoor/polyglot/blob/master/polyglot/base.py
|
||||
|
||||
import unicodedata
|
||||
|
||||
import pycld2 as cld2
|
||||
|
||||
|
||||
class UnknownLanguageError(Exception):
|
||||
pass
|
||||
import linguars
|
||||
from functools import lru_cache
|
||||
|
||||
class Language:
|
||||
def __init__(self, choice):
|
||||
name, code, confidence, bytesize = choice
|
||||
def __init__(self, code, confidence):
|
||||
self.code = code
|
||||
self.name = name
|
||||
self.confidence = float(confidence)
|
||||
self.read_bytes = int(bytesize)
|
||||
|
||||
def __str__(self):
|
||||
return ("name: {:<12}code: {:<9}confidence: {:>5.1f} "
|
||||
"read bytes:{:>6}".format(self.name, self.code,
|
||||
self.confidence, self.read_bytes))
|
||||
return ("code: {:<9} confidence: {:>5.1f} ".format(
|
||||
self.code,
|
||||
self.confidence))
|
||||
|
||||
@staticmethod
|
||||
def from_code(code):
|
||||
return Language(("", code, 100, 0))
|
||||
@lru_cache(maxsize=None)
def load_detector(langcodes=()):
    """Build a linguars language detector for the given language codes.

    The result is memoized with ``lru_cache``, so repeated calls with the same
    ``langcodes`` reuse the same detector instance.

    Args:
        langcodes: Iterable of ISO 639-1 language codes. It must be hashable
            (e.g. a tuple, not a list) because it is used as the cache key.

    Returns:
        A ``linguars.LanguageDetector`` built from every code that linguars
        could resolve; codes that fail to resolve are left out.
    """
    languages = []
    for lc in langcodes:
        try:
            languages.append(linguars.Language.from_iso_code_639_1(lc))
        except Exception:
            # Best effort: a code linguars cannot resolve is skipped.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate instead of being silently swallowed.
            continue

    return linguars.LanguageDetector(languages=languages)
|
||||
|
||||
|
||||
class Detector:
|
||||
""" Detect the language used in a snippet of text."""
|
||||
|
||||
def __init__(self, text, quiet=False):
|
||||
""" Detector of the language used in `text`.
|
||||
Args:
|
||||
text (string): unicode string.
|
||||
"""
|
||||
self.__text = text
|
||||
self.reliable = True
|
||||
"""False if the detector used Best Effort strategy in detection."""
|
||||
self.quiet = quiet
|
||||
"""If true, exceptions will be silenced."""
|
||||
self.detect(text)
|
||||
|
||||
@staticmethod
|
||||
def supported_languages():
|
||||
"""Returns a list of the languages that can be detected by pycld2."""
|
||||
return [name.capitalize() for name,code in cld2.LANGUAGES if not name.startswith("X_")]
|
||||
def __init__(self, langcodes = ()):
|
||||
self.detector = load_detector(langcodes)
|
||||
|
||||
def detect(self, text):
|
||||
"""Decide which language is used to write the text.
|
||||
The method tries first to detect the language with high reliability. If
|
||||
that is not possible, the method switches to best effort strategy.
|
||||
Args:
|
||||
text (string): A snippet of text, the longer it is the more reliable we
|
||||
can detect the language used to write the text.
|
||||
"""
|
||||
try:
|
||||
reliable, index, top_3_choices = cld2.detect(text, bestEffort=False)
|
||||
except cld2.error as e:
|
||||
if "input contains invalid UTF-8" in str(e):
|
||||
# Fix for https://github.com/LibreTranslate/LibreTranslate/issues/514
|
||||
# related to https://github.com/aboSamoor/polyglot/issues/71#issuecomment-707997790
|
||||
text = ''.join([l for l in text if unicodedata.category(str(l))[0] not in ('S', 'M', 'C')])
|
||||
reliable, index, top_3_choices = cld2.detect(text, bestEffort=False)
|
||||
else:
|
||||
raise e
|
||||
top_3_choices = self.detector.confidence(text)[:3]
|
||||
print(top_3_choices)
|
||||
if top_3_choices[0][1] == 0:
|
||||
return [Language("en", 0)]
|
||||
return [Language(lang.iso_code_639_1, round(conf * 100)) for lang, conf in top_3_choices]
|
||||
|
||||
if not reliable:
|
||||
self.reliable = False
|
||||
reliable, index, top_3_choices = cld2.detect(text, bestEffort=True)
|
||||
|
||||
if not self.quiet and not reliable:
|
||||
raise UnknownLanguageError("Try passing a longer snippet of text")
|
||||
|
||||
self.languages = [Language(x) for x in top_3_choices]
|
||||
self.language = self.languages[0]
|
||||
return self.language
|
||||
|
||||
def __str__(self):
|
||||
text = f"Prediction is reliable: {self.reliable}\n"
|
||||
text += "\n".join([f"Language {i+1}: {str(l)}"
|
||||
for i,l in enumerate(self.languages)])
|
||||
return text
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue