LibreTranslate/libretranslate/detect.py
2025-03-20 11:39:26 -04:00

52 lines
1.4 KiB
Python

from langdetect import DetectorFactory
DetectorFactory.seed = 0
from langdetect import detect_langs, LangDetectException
from langdetect.lang_detect_exception import ErrorCode
from lexilang.detector import detect as lldetect
class Language:
def __init__(self, code, confidence):
self.code = code
self.confidence = float(confidence)
def __str__(self):
return (f"code: {self.code:<9} confidence: {self.confidence:>5.1f} ")
def check_lang(langcodes, lang):
return normalized_lang_code(lang) in langcodes
def normalized_lang_code(lang):
code = lang.lang
# Handle zh-cn
if code.startswith("zh"):
code = "zh"
return code
class Detector:
def __init__(self, langcodes = ()):
self.langcodes = langcodes
def detect(self, text):
if len(text) < 20:
code, conf = lldetect(text, self.langcodes)
if conf > 0:
return [Language(code, round(conf * 100))]
try:
top_3_choices = [lang for lang in detect_langs(text) if check_lang(self.langcodes, lang)][:3]
if not len(top_3_choices):
return [Language("en", 0)]
if top_3_choices[0].prob == 0:
return [Language("en", 0)]
except LangDetectException as e:
if e.code == ErrorCode.CantDetectError:
return [Language("en", 0)]
else:
raise e
return [Language(normalized_lang_code(lang), round(lang.prob * 100)) for lang in top_3_choices]