From 2cd8bf2c6c0e975a71f9c09b6d03383c3acbe5c9 Mon Sep 17 00:00:00 2001 From: physton Date: Thu, 28 Mar 2024 23:38:57 +0800 Subject: [PATCH] Update Translators => 5.9.0 --- scripts/physton_prompt/translators/server.py | 576 +++++++++++++++---- translate_apis.json | 2 +- 2 files changed, 463 insertions(+), 115 deletions(-) diff --git a/scripts/physton_prompt/translators/server.py b/scripts/physton_prompt/translators/server.py index e3a4f7b..0505a29 100644 --- a/scripts/physton_prompt/translators/server.py +++ b/scripts/physton_prompt/translators/server.py @@ -2,7 +2,7 @@ # author=UlionTse """ -Copyright (C) 2017-2023 UlionTse +Copyright (C) 2017 UlionTse This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ Email: uliontse@outlook.com -translators Copyright (C) 2017-2023 UlionTse +translators Copyright (C) 2017 UlionTse This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. @@ -47,10 +47,13 @@ import requests import lxml.etree import pathos.multiprocessing -# import cryptography.hazmat.primitives.asymmetric.rsa as cry_rsa +import cryptography.hazmat.primitives.ciphers as cry_ciphers +import cryptography.hazmat.primitives.padding as cry_padding import cryptography.hazmat.primitives.hashes as cry_hashes -import cryptography.hazmat.primitives.asymmetric.padding as cry_padding import cryptography.hazmat.primitives.serialization as cry_serialization +import cryptography.hazmat.primitives.asymmetric.rsa as cry_asym_rsa +import cryptography.hazmat.primitives.asymmetric.padding as cry_asym_padding + SessionType = requests.sessions.Session @@ -61,15 +64,25 @@ __all__ = [ 'translate_text', 'translate_html', 'translators_pool', - 'alibaba', 'apertium', 'argos', 'baidu', 'bing', 'caiyun', 'cloudTranslation', 'deepl', 'elia', 'google', - 'iciba', 'iflytek', 'iflyrec', 'itranslate', 'judic', 'languageWire', 'lingvanex', 'mglip', 'mirai', 'modernMt', - 'myMemory', 'niutrans', 'papago', 'qqFanyi', 'qqTranSmart', 'reverso', 'sogou', 'sysTran', 'tilde', 'translateCom', - 'translateMe', 'utibet', 'volcEngine', 'yandex', 'yeekit', 'youdao', - '_alibaba', '_apertium', '_argos', '_baidu', '_bing', '_caiyun', '_cloudTranslation', '_deepl', '_elia', '_google', - '_iciba', '_iflytek', '_iflyrec', '_itranslate', '_judic', '_languageWire', '_lingvanex', '_mglip', '_mirai', '_modernMt', - '_myMemory', '_niutrans', '_papago', '_qqFanyi', '_qqTranSmart', '_reverso', '_sogou', '_sysTran', '_tilde', '_translateCom', - '_translateMe', '_utibet', '_volcEngine', '_yandex', '_yeekit', '_youdao', -] # 36 + + 'alibaba', 'apertium', 'argos', 'baidu', 'bing', + 'caiyun', 'cloudTranslation', 'deepl', 'elia', 'google', + 'hujiang', 'iciba', 'iflytek', 'iflyrec', 'itranslate', + 'judic', 'languageWire', 'lingvanex', 'mglip', 'mirai', + 'modernMt', 'myMemory', 'niutrans', 'papago', 'qqFanyi', + 'qqTranSmart', 'reverso', 'sogou', 'sysTran', 'tilde', + 'translateCom', 'translateMe', 'utibet', 'volcEngine', 'yandex', + 'yeekit', 'youdao', + + '_alibaba', '_apertium', '_argos', '_baidu', '_bing', + '_caiyun', '_cloudTranslation', '_deepl', '_elia', '_google', + '_hujiang', '_iciba', '_iflytek', '_iflyrec', '_itranslate', + '_judic', '_languageWire', '_lingvanex', '_mglip', '_mirai', + '_modernMt', '_myMemory', '_niutrans', '_papago', '_qqFanyi', + '_qqTranSmart', '_reverso', '_sogou', '_sysTran', '_tilde', + '_translateCom', '_translateMe', '_utibet', '_volcEngine', '_yandex', + '_yeekit', '_youdao', +] # 37 class TranslatorError(Exception): @@ -108,6 +121,15 @@ def _wrapper(*args, **kwargs): def get_timestamp() -> int: return int(time.time() * 1e3) + @staticmethod + def get_uuid() -> str: + _uuid = '' + for i in range(8): + _uuid += hex(int(65536 * (1 + random.random())))[2:][1:] + if 1 <= i <= 4: + _uuid += '-' + return _uuid + @staticmethod def get_headers(host_url: str, if_api: bool = False, @@ -118,7 +140,7 @@ def get_headers(host_url: str, if_http_override_for_api: bool = False ) -> dict: - user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36" + user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36' url_path = urllib.parse.urlparse(host_url.strip('/')).path host_headers = { 'Referer' if if_referer_for_host else 'Host': host_url, @@ -212,8 +234,11 @@ def make_temp_language_map(from_language: str, to_language: str, default_from_la @functools.wraps(func) def _wrapper(*args, **kwargs): try: - return func(*args, **kwargs) - except TranslatorError as e: + language_map = func(*args, **kwargs) + if not language_map: + raise TranslatorError + return language_map + except Exception as e: if kwargs.get('if_print_warning', True): warnings.warn(f'GetLanguageMapError: {str(e)}.\nThe function make_temp_language_map() works.') return make_temp_language_map(kwargs.get('from_language'), kwargs.get('to_language'), kwargs.get('default_from_language')) @@ -226,12 +251,14 @@ def check_input_limit(query_text: str, input_limit: int) -> None: @staticmethod def check_query(func): - def check_query_text(query_text: str, if_ignore_empty_query: bool, if_ignore_limit_of_length: bool, limit_of_length: int) -> str: + def check_query_text(query_text: str, if_ignore_empty_query: bool, if_ignore_limit_of_length: bool, limit_of_length: int, bias_of_length: int = 10) -> str: if not isinstance(query_text, str): raise TranslatorError query_text = query_text.strip() qt_length = len(query_text) + limit_of_length -= bias_of_length # #154 + if qt_length == 0 and not if_ignore_empty_query: raise TranslatorError("The `query_text` can't be empty!") if qt_length >= limit_of_length and not if_ignore_limit_of_length: @@ -239,12 +266,12 @@ def check_query_text(query_text: str, if_ignore_empty_query: bool, if_ignore_lim else: if qt_length >= limit_of_length: warnings.warn(f'The length of `query_text` is {qt_length}, above {limit_of_length}.') - return query_text[:limit_of_length - 1] + return query_text[:limit_of_length] return query_text @functools.wraps(func) def _wrapper(*args, **kwargs): - if_ignore_empty_query = kwargs.get('if_ignore_empty_query', False) + if_ignore_empty_query = kwargs.get('if_ignore_empty_query', True) if_ignore_limit_of_length = kwargs.get('if_ignore_limit_of_length', False) limit_of_length = kwargs.get('limit_of_length', 20000) is_detail_result = kwargs.get('is_detail_result', False) @@ -285,7 +312,7 @@ def _wrapper(*args, **kwargs): # return _wrapper -class GuestSeverRegion(Tse): +class Region(Tse): def __init__(self): super().__init__() self.get_addr_url = 'https://geolocation.onetrust.com/cookieconsentpub/v1/geo/location' @@ -294,19 +321,20 @@ def __init__(self): self.ip_tb_add_url = 'https://ip.taobao.com/outGetIpInfo' self.default_region = os.environ.get('translators_default_region', None) - @property - def get_server_region(self, if_judge_cn: bool = True) -> str: + def get_region_of_server(self, if_judge_cn: bool = True, if_print_region: bool = True) -> str: if self.default_region: - sys.stderr.write(f'Using customized region {self.default_region} server backend.\n\n') + if if_print_region: + sys.stderr.write(f'Using customized region {self.default_region} server backend.\n\n') return ('CN' if self.default_region == 'China' else 'EN') if if_judge_cn else self.default_region _headers_fn = lambda url: self.get_headers(url, if_api=False, if_referer_for_host=True) try: try: data = json.loads(requests.get(self.get_addr_url, headers=_headers_fn(self.get_addr_url)).text[9:-2]) - sys.stderr.write(f'Using region {data.get("stateName")} server backend.\n\n') + if if_print_region: + sys.stderr.write(f'Using region {data.get("stateName")} server backend.\n\n') return data.get('country') if if_judge_cn else data.get("stateName") - except requests.exceptions.Timeout: + except: ip_address = requests.get(self.get_ip_url, headers=_headers_fn(self.get_ip_url)).json()['origin'] payload = {'ip': ip_address, 'accessKey': 'alibaba-inc'} data = requests.post(url=self.ip_tb_add_url, data=payload, headers=_headers_fn(self.ip_tb_add_url)).json().get('data') @@ -498,6 +526,7 @@ def __init__(self, server_region='EN'): self.en_host_url = 'https://translate.google.com' self.api_url = None self.api_url_path = '/_/TranslateWebserverUi/data/batchexecute' + self.consent_url = 'https://consent.google.com/save' self.server_region = server_region self.host_headers = None self.api_headers = None @@ -525,11 +554,14 @@ def get_info(self, host_html: str) -> dict: data = execjs.eval(data_str) return {'bl': data['cfb2h'], 'f.sid': data['FdrFJe']} - def get_consent_cookie(self, consent_html: str) -> str: # by mercuree. merged but not verify. + def get_consent_data(self, consent_html: str) -> dict: #142 merged but not verify. et = lxml.etree.HTML(consent_html) - input_element = et.xpath('.//input[@type="hidden"][@name="v"]') - cookie_value = input_element[0].attrib.get('value') if input_element else 'cb' - return f'CONSENT=YES+{cookie_value}' # cookie CONSENT=YES+cb works for now + form_element = et.xpath('.//form[1]') + self.consent_url = form_element[0].attrib.get('action') if form_element else self.consent_url + + input_elements = form_element[0].xpath('.//input[@type="hidden"]') + data = {e.attrib.get('name'): e.attrib.get('value') for e in input_elements} + return data @Tse.time_stat @Tse.check_query @@ -588,9 +620,9 @@ def google_api(self, query_text: str, from_language: str = 'auto', to_language: self.begin_time = time.time() self.session = requests.Session() r = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies) - if 'consent.google.com' == urllib.parse.urlparse(r.url).hostname: - self.host_headers.update({'cookie': self.get_consent_cookie(r.text)}) - host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies).text + if urllib.parse.urlparse(self.consent_url).hostname == urllib.parse.urlparse(r.url).hostname: + form_data = self.get_consent_data(r.text) + host_html = self.session.post(self.consent_url, data=form_data, headers=self.host_headers, timeout=timeout, proxies=proxies).text else: host_html = r.text debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, if_print_warning) @@ -639,6 +671,7 @@ def get_language_map(self, lang_url: str, ss: SessionType, headers: dict, timeou lang_list = sorted(list(set(lang_list))) return {}.fromkeys(lang_list, lang_list) + @Tse.uncertified @Tse.time_stat @Tse.check_query def baidu_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', **kwargs: ApiKwargsType) -> Union[str, dict]: @@ -686,6 +719,9 @@ def baidu_api(self, query_text: str, from_language: str = 'auto', to_language: s debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, if_print_warning) self.language_map = self.get_language_map(self.get_lang_url, self.session, self.host_headers, timeout, proxies, **debug_lang_kwargs) + # self.session.cookies.update({'ab_sr': f'1.0.1_{self.absr_v}=='}) + # self.session.cookies.update({k: '1' for k in ['REALTIME_TRANS_SWITCH', 'FANYI_WORD_SWITCH', 'HISTORY_SWITCH', 'SOUND_SPD_SWITCH', 'SOUND_PREFER_SWITCH']}) + from_language, to_language = self.check_language(from_language, to_language, self.language_map, output_zh=self.output_zh) payload = { @@ -761,9 +797,16 @@ def get_tk(self, host_html: str) -> str: tk_list = re.compile("""token: '(.*?)',|token: "(.*?)",""").findall(host_html)[0] return tk_list[0] or tk_list[1] + # def get_new_absr(self, absr): + # absr = base64.b64decode(absr+'==').decode() + # absr = absr[:-32] + hashlib.md5(str(int(time.time())).encode()).hexdigest() + # absr = base64.b64encode(absr.encode()).decode() + # return absr + # def get_acs_token(self): # pass + @Tse.uncertified @Tse.time_stat @Tse.check_query def baidu_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', **kwargs: ApiKwargsType) -> Union[str, dict]: @@ -807,7 +850,7 @@ def baidu_api(self, query_text: str, from_language: str = 'auto', to_language: s if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time and self.token and self.sign): self.begin_time = time.time() self.session = requests.Session() - _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies) # must twice, send cookies. + _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies) # must twice, reload token. host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies).text self.token = self.get_tk(host_html) self.sign = self.get_sign(query_text, host_html, self.session, self.host_headers, timeout, proxies) @@ -818,9 +861,14 @@ def baidu_api(self, query_text: str, from_language: str = 'auto', to_language: s debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, if_print_warning) self.language_map = self.get_language_map(self.get_lang_url, self.session, self.host_headers, timeout, proxies, **debug_lang_kwargs) + # self.session.cookies.update({'ab_sr': f'1.0.1_{self.absr_v}=='}) + # self.session.cookies.update({k: '1' for k in ['REALTIME_TRANS_SWITCH', 'FANYI_WORD_SWITCH', 'HISTORY_SWITCH', 'SOUND_SPD_SWITCH', 'SOUND_PREFER_SWITCH']}) + from_language, to_language = self.check_language(from_language, to_language, self.language_map, output_zh=self.output_zh) + + payload = urllib.parse.urlencode({"query": query_text}) + res = self.session.post(self.langdetect_url, headers=self.api_headers, data=payload, timeout=timeout, proxies=proxies) if from_language == 'auto': - res = self.session.post(self.langdetect_url, headers=self.api_headers, data={"query": query_text}, timeout=timeout, proxies=proxies) from_language = res.json()['lan'] params = {"from": from_language, "to": to_language} @@ -833,8 +881,9 @@ def baidu_api(self, query_text: str, from_language: str = 'auto', to_language: s "sign": self.sign, "token": self.token, "domain": use_domain, + "ts": self.get_timestamp(), } - payload = urllib.parse.urlencode(payload).encode('utf-8') + payload = urllib.parse.urlencode(payload) # self.api_headers.update({'Acs-Token': self.acs_token}) r = self.session.post(self.api_url, params=params, data=payload, headers=self.api_headers, timeout=timeout, proxies=proxies) r.raise_for_status() @@ -1658,8 +1707,7 @@ def get_language_map(self, host_html: str, **kwargs: LangMapKwargsType) -> dict: def get_ig_iid(self, host_html: str) -> dict: et = lxml.etree.HTML(host_html) - # iid = et.xpath('//*[@id="tta_outGDCont"]/@data-iid')[0] # browser page is different between request page. - iid = 'translator.5028' + iid = et.xpath('//*[@id="tta_outGDCont"]/@data-iid')[0] # 'translator.5028' ig = re.compile('IG:"(.*?)"').findall(host_html)[0] return {'iid': iid, 'ig': ig} @@ -1733,11 +1781,17 @@ def bing_api(self, query_text: str, from_language: str = 'auto', to_language: st api_url = ''.join([self.api_url, api_url_param]) r = self.session.post(api_url, headers=self.host_headers, data=payload, timeout=timeout, proxies=proxies) r.raise_for_status() - data = r.json() time.sleep(sleep_seconds) self.query_count += 1 - return data[0] if is_detail_result else data[0]['translations'][0]['text'] + try: + data = r.json() + return data[0] if is_detail_result else data[0]['translations'][0]['text'] + except requests.exceptions.JSONDecodeError: #122 + data_html = r.text + et = lxml.etree.HTML(data_html) + ss = et.xpath('//*/textarea/text()') + return {'data': ss} if is_detail_result else ss[-1] class Sogou(Tse): def __init__(self): @@ -1773,14 +1827,6 @@ def get_language_map(self, host_html: str, lang_old_url: str, ss: SessionType, t lang_list = [item['lang'] for item in lang_item_list if item['play'] == 1] return {}.fromkeys(lang_list, lang_list) - # def get_uuid(self) -> str: - # _uuid = '' - # for i in range(8): - # _uuid += hex(int(65536 * (1 + 0)))[2:][1:] - # if i in range(1, 5): - # _uuid += '-' - # return _uuid - def get_form(self, query_text: str, from_language: str, to_language: str, uid: str) -> dict: sign_text = "" + from_language + to_language + query_text + '109984457' # window.__INITIAL_STATE__.common.CONFIG.secretCode sign = hashlib.md5(sign_text.encode()).hexdigest() @@ -1877,15 +1923,14 @@ def __init__(self): @Tse.debug_language_map def get_language_map(self, js_html: str, **kwargs: LangMapKwargsType) -> dict: - return execjs.eval(re.compile('={auto:\\[(.*?)}').search(js_html).group()[1:]) + lang_text = re.compile('lang:{(.*?)},').search(js_html).group()[5:-1] + lang_pair_list = re.compile('(\\w+):(.*?),').findall(lang_text) + lang_list = sorted([lang for lang, _ in lang_pair_list]) + return {}.fromkeys(lang_list, lang_list) def get_tk(self, js_html: str) -> str: return re.compile('headers\\["X-Authorization"]="(.*?)",').findall(js_html)[0] - # def get_jwt(self, browser_id: str, api_headers: dict, ss: SessionType, timeout: float, proxies: dict) -> str: - # data = {"browser_id": browser_id} - # return ss.post(self.get_jwt_url, json=data, headers=api_headers, timeout=timeout, proxies=proxies).json()['jwt'] - def crypt(self, if_de: bool = True) -> dict: if if_de: return {k: v for k, v in zip(self.cipher_key, self.normal_key)} @@ -1999,6 +2044,7 @@ def __init__(self): self.begin_time = time.time() self.host_url = 'https://www.deepl.com/translator' self.api_url = 'https://www2.deepl.com/jsonrpc' + self.login_url = 'https://login-wall.deepl.com' self.host_headers = self.get_headers(self.host_url, if_api=False) self.api_headers = self.get_headers(self.host_url, if_api=True, if_ajax_for_api=False, if_json_for_api=True) self.params = {'split': {'method': 'LMT_split_text'}, 'handle': {'method': 'LMT_handle_jobs'}} @@ -2012,7 +2058,7 @@ def __init__(self): @Tse.debug_language_map def get_language_map(self, host_html: str, **kwargs: LangMapKwargsType) -> dict: - lang_list = list(set(re.compile('translateIntoLang\\.(\\w+)":').findall(host_html))) + lang_list = sorted(list(set(re.compile("\\['selectLang_source_(\\w+)']").findall(host_html)))) return {}.fromkeys(lang_list, lang_list) def split_sentences_param(self, query_text: str, from_language: str) -> dict: @@ -2031,6 +2077,8 @@ def split_sentences_param(self, query_text: str, from_language: str) -> dict: }, }, } + if from_language != 'auto': + data['params']['lang'].update({'lang_computed': from_language}) return {**self.params['split'], **data} def context_sentences_param(self, sentences: List[str], from_language: str, to_language: str) -> dict: @@ -2045,6 +2093,7 @@ def context_sentences_param(self, sentences: List[str], from_language: str, to_l # 'regionalVariant': 'en-US', 'browserType': 1, 'mode': 'translate', + 'textType': 'plaintext', }, 'jobs': [ { @@ -2054,14 +2103,15 @@ def context_sentences_param(self, sentences: List[str], from_language: str, to_l 'raw_en_context_before': sentences[1:i] if sentences[i-1] else [], 'raw_en_context_after': [sentences[i+1]] if sentences[i+1] else [], 'preferred_num_beams': 1 if len(sentences) >= 4 else 4, # 1 if two sentences else 4, len>=2+2 - } for i in range(1, len(sentences) - 1) + } + for i in range(1, len(sentences) - 1) ], 'lang': { 'preference': { 'weight': {}, 'default': 'default', }, - 'source_lang_user_selected': from_language, # "source_lang_computed" + 'source_lang_computed': from_language, # 'source_lang_user_selected' 'target_lang': to_language, }, }, @@ -2109,6 +2159,7 @@ def deepl_api(self, query_text: str, from_language: str = 'auto', to_language: s host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies).text debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, if_print_warning) self.language_map = self.get_language_map(host_html, **debug_lang_kwargs) + _ = self.session.get(self.login_url, headers=self.host_headers, timeout=timeout, proxies=proxies) from_language, to_language = self.check_language(from_language, to_language, language_map=self.language_map, output_zh=self.output_zh, output_auto='auto') from_language = from_language.upper() if from_language != 'auto' else from_language @@ -2118,17 +2169,17 @@ def deepl_api(self, query_text: str, from_language: str = 'auto', to_language: s r_s = self.session.post(self.api_url, params=self.params['split'], json=ssp_data, headers=self.api_headers, timeout=timeout, proxies=proxies) r_s.raise_for_status() s_data = r_s.json() - + from_language = s_data['result']['lang']['detected'] s_sentences = [it['sentences'][0]['text'] for item in s_data['result']['texts'] for it in item['chunks']] - h_data = self.context_sentences_param(s_sentences, from_language, to_language) + h_data = self.context_sentences_param(s_sentences, from_language, to_language) r_cs = self.session.post(self.api_url, params=self.params['handle'], json=h_data, headers=self.api_headers, timeout=timeout, proxies=proxies) r_cs.raise_for_status() data = r_cs.json() time.sleep(sleep_seconds) self.request_id += 3 self.query_count += 1 - return data if is_detail_result else '\n'.join(item['beams'][0]['sentences'][0]["text"] for item in data['result']['translations']) + return data if is_detail_result else ' '.join(item['beams'][0]['sentences'][0]["text"] for item in data['result']['translations']) # either ' ' or '\n'. class Yandex(Tse): @@ -2282,20 +2333,19 @@ class Argos(Tse): def __init__(self): super().__init__() self.begin_time = time.time() - self.host_url = 'https://translate.argosopentech.com' + self.host_url = 'https://libretranslate.com' self.api_url = f'{self.host_url}/translate' self.language_url = f'{self.host_url}/languages' + self.secret_url = f'{self.host_url}/js/app.js?v=1.5.0' self.host_headers = self.get_headers(self.host_url, if_api=False, if_ajax_for_api=False) self.api_headers = self.get_headers(self.host_url, if_api=True, if_ajax_for_api=False, if_json_for_api=True) self.language_headers = self.get_headers(self.host_url, if_api=False, if_json_for_api=True) - self.host_pool = ['https://translate.argosopentech.com', 'https://libretranslate.de', - 'https://translate.astian.org', 'https://translate.mentality.rip', - 'https://translate.api.skitzen.com', 'https://trans.zillyhuhn.com'] self.language_map = None + self.api_secret = None self.session = None self.query_count = 0 self.output_zh = 'zh' - self.input_limit = int(5e3) # unknown + self.input_limit = int(2e3) self.default_from_language = self.output_zh @Tse.debug_language_map @@ -2304,11 +2354,16 @@ def get_language_map(self, lang_url: str, ss: SessionType, headers: dict, timeou lang_list = sorted([lang['code'] for lang in lang_list]) return {}.fromkeys(lang_list, lang_list) + def get_secret(self, secret_url: str, ss: SessionType, headers: dict, timeout: Optional[float], proxies: Optional[dict]) -> str: + js_html = ss.get(secret_url, headers=headers, timeout=timeout, proxies=proxies).text + secret = re.compile('apiSecret: "(.*?)"').findall(js_html)[0] + return secret + @Tse.time_stat @Tse.check_query def argos_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', **kwargs: ApiKwargsType) -> Union[str, dict]: """ - https://translate.argosopentech.com + https://libretranslate.com :param query_text: str, must. :param from_language: str, default 'auto'. :param to_language: str, default 'en'. @@ -2325,18 +2380,9 @@ def argos_api(self, query_text: str, from_language: str = 'auto', to_language: s :param if_show_time_stat: bool, default False. :param show_time_stat_precision: int, default 2. :param if_print_warning: bool, default True. - :param reset_host_url: str, default None. :return: str or dict """ - reset_host_url = kwargs.get('reset_host_url', None) - if reset_host_url and reset_host_url != self.host_url: - if reset_host_url not in self.host_pool: - raise TranslatorError - self.host_url = reset_host_url - self.api_url = f'{self.host_url}/translate' - self.language_url = f'{self.host_url}/languages' - timeout = kwargs.get('timeout', None) proxies = kwargs.get('proxies', None) sleep_seconds = kwargs.get('sleep_seconds', 0) @@ -2348,15 +2394,23 @@ def argos_api(self, query_text: str, from_language: str = 'auto', to_language: s not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 - if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time): + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time and self.api_secret): self.begin_time = time.time() self.session = requests.Session() - _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies).text + _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies) + self.api_secret = self.get_secret(self.secret_url, self.session, self.host_headers, timeout, proxies) debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, if_print_warning) self.language_map = self.get_language_map(self.language_url, self.session, self.language_headers, timeout, proxies, **debug_lang_kwargs) from_language, to_language = self.check_language(from_language, to_language, self.language_map, output_zh=self.output_zh) - payload = {'q': query_text, 'source': from_language, 'target': to_language, 'format': 'text'} + payload = { + 'q': query_text, + 'source': from_language, + 'target': to_language, + 'format': 'text', + 'api_key': '', + 'secret': self.api_secret, + } r = self.session.post(self.api_url, headers=self.api_headers, json=payload, timeout=timeout, proxies=proxies) r.raise_for_status() data = r.json() @@ -2376,7 +2430,9 @@ def __init__(self): self.language_headers = self.get_headers(self.host_url, if_api=False, if_json_for_api=True) self.language_map = None self.session = None - self.s_y2 = 'ifanyiweb8hc9s98e' + self.sign_key = '6dVjYLFyzfkFkk' # 'ifanyiweb8hc9s98e' + self.encrypt_key = 'L4fBtD5fLC9FQw22' + self.decrypt_key = 'aahc3TfyfCEmER33' self.query_count = 0 self.output_zh = 'zh' self.input_limit = int(3e3) @@ -2389,6 +2445,49 @@ def get_language_map(self, api_url: str, ss: SessionType, headers: dict, timeout lang_list = sorted(list(set([lang for d in dd for lang in dd[d]]))) return {}.fromkeys(lang_list, lang_list) + def encrypt_by_aes_ecb_pkcs7(self, data: str, key: str, if_padding: bool = True) -> bytes: + algorithm = cry_ciphers.base.modes.algorithms.AES(key=key.encode()) + mode = cry_ciphers.base.modes.ECB() + block_size = cry_ciphers.base.modes.algorithms.AES.block_size + + cipher = cry_ciphers.Cipher(algorithm=algorithm, mode=mode) + encryptor = cipher.encryptor() + + if if_padding: + padder = cry_padding.PKCS7(block_size=block_size).padder() + data = padder.update(data=data.encode()) + padder.finalize() # + + data = data if if_padding else data.encode() + encrypted_data = encryptor.update(data=data) + return encrypted_data + + def decrypt_by_aes_ecb_pkcs7(self, data: bytes, key: str, if_padding: bool = True) -> str: + algorithm = cry_ciphers.base.modes.algorithms.AES(key=key.encode()) + mode = cry_ciphers.base.modes.ECB() + block_size = cry_ciphers.base.modes.algorithms.AES.block_size + + cipher = cry_ciphers.Cipher(algorithm=algorithm, mode=mode) + decryptor = cipher.decryptor() + decrypted_data = decryptor.update(data=data) + + if if_padding: + un_padder = cry_padding.PKCS7(block_size=block_size).unpadder() + decrypted_data = un_padder.update(data=decrypted_data) + un_padder.finalize() # + return decrypted_data.decode() + + def get_sign(self, query_text: str) -> str: + cry_text = f"6key_web_new_fanyi{self.sign_key}{query_text}" + sign = hashlib.md5(cry_text.encode()).hexdigest()[:16] + sign = self.encrypt_by_aes_ecb_pkcs7(data=sign, key=self.encrypt_key, if_padding=True) + sign = base64.b64encode(sign).decode() + return sign + + def get_result(self, data: dict) -> dict: + data = base64.b64decode(data['content']) + data_str = self.decrypt_by_aes_ecb_pkcs7(data=data, key=self.decrypt_key, if_padding=True) + data = json.loads(data_str) + return data + @Tse.time_stat @Tse.check_query def iciba_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', **kwargs: ApiKwargsType) -> Union[str, dict]: @@ -2433,15 +2532,25 @@ def iciba_api(self, query_text: str, from_language: str = 'auto', to_language: s from_language, to_language = self.check_language(from_language, to_language, self.language_map, output_zh=self.output_zh) - sign = hashlib.md5(f"6key_web_fanyi{self.s_y2}{query_text}".encode()).hexdigest()[:16] # strip() - params = {'c': 'trans', 'm': 'fy', 'client': 6, 'auth_user': 'key_web_fanyi', 'sign': sign} - payload = {'from': from_language, 'to': to_language, 'q': query_text} + params = { + 'c': 'trans', + 'm': 'fy', + 'client': 6, + 'auth_user': 'key_web_new_fanyi', + 'sign': self.get_sign(query_text), + } + payload = { + 'from': from_language, + 'to': 'auto' if from_language == 'auto' else to_language, + 'q': query_text, + } r = self.session.post(self.api_url, headers=self.api_headers, params=params, data=payload, timeout=timeout, proxies=proxies) r.raise_for_status() data = r.json() + data = self.get_result(data) time.sleep(sleep_seconds) self.query_count += 1 - return data if is_detail_result else data['content'] if data.get('isSensitive') == 1 else data['content']['out'] + return data if is_detail_result else data['out'] class IflytekV1(Tse): @@ -3196,10 +3305,10 @@ class Lingvanex(Tse): def __init__(self): super().__init__() self.begin_time = time.time() - self.host_url = 'https://lingvanex.com/demo/' + self.host_url = 'https://lingvanex.com/translate/' self.api_url = None self.language_url = None - self.auth_url = 'https://lingvanex.com/lingvanex_demo_page/js/api-base.js' + self.auth_url = 'https://lingvanex.com/translate/js/api-base.js' self.host_headers = self.get_headers(self.host_url, if_api=False) self.api_headers = self.get_headers(self.host_url, if_api=True, if_json_for_api=False) self.session = None @@ -3234,7 +3343,7 @@ def get_auth(self, auth_url: str, ss: SessionType, headers: dict, timeout: float @Tse.check_query def lingvanex_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', **kwargs: ApiKwargsType) -> Union[str, dict]: """ - https://lingvanex.com/demo/ + https://lingvanex.com/translate/ :param query_text: str, must. :param from_language: str, default 'auto'. :param to_language: str, default 'en'. @@ -3298,7 +3407,7 @@ def lingvanex_api(self, query_text: str, from_language: str = 'auto', to_languag 'to': to_language, 'text': query_text, 'platform': 'dp', - 'is_return_text_split_ranges': 'true' + # 'is_return_text_split_ranges': 'true' } payload = urllib.parse.urlencode(payload) r = self.session.post(self.api_url, data=payload, headers=self.api_headers, timeout=timeout, proxies=proxies) @@ -3306,10 +3415,10 @@ def lingvanex_api(self, query_text: str, from_language: str = 'auto', to_languag data = r.json() time.sleep(sleep_seconds) self.query_count += 1 - return data if is_detail_result else data['result']['text'] + return data if is_detail_result else data['result'] -class Niutrans(Tse): +class NiutransV1(Tse): def __init__(self): super().__init__() self.begin_time = time.time() @@ -3344,9 +3453,9 @@ def encrypt_rsa(self, message_text: str, public_key_text: str) -> str: public_key_object = cry_serialization.load_pem_public_key(public_key_pem.encode()) cipher_text = base64.b64encode(public_key_object.encrypt( plaintext=message_text.encode(), - # padding=cry_padding.PKCS1v15() - padding=cry_padding.OAEP( - mgf=cry_padding.MGF1(algorithm=cry_hashes.SHA256()), + # padding=cry_asym_padding.PKCS1v15() + padding=cry_asym_padding.OAEP( + mgf=cry_asym_padding.MGF1(algorithm=cry_hashes.SHA256()), algorithm=cry_hashes.SHA256(), label=None ) @@ -3431,6 +3540,158 @@ def niutrans_api(self, query_text: str, from_language: str = 'auto', to_language return data if is_detail_result else '\n'.join([' '.join([it['data'] for it in item['sentences']]) for item in data['data']]) +class NiutransV2(Tse): + def __init__(self): + super().__init__() + self.begin_time = time.time() + self.home_url = 'https://niutrans.com' + self.host_url = 'https://niutrans.com/trans?type=text' + self.api_url = 'https://test.niutrans.com/NiuTransServer/testaligntrans' + self.get_language_url = 'https://niutrans.com/NiuTransFrontPage/language/getAllLanguage' + self.detect_language_url = 'https://test.niutrans.com/NiuTransServer/language' + self.login_url = 'https://niutrans.com/NiuTransConsole/user/isLogin' + self.geetest_host_url = 'https://www.geetest.com' + self.geetest_captcaha_url = 'https://www.geetest.com/adaptive-captcha-demo' + self.geetest_load_url = 'https://gcaptcha4.geetest.com/load' + self.geetest_verify_url = 'https://gcaptcha4.geetest.com/verify' + self.host_headers = self.get_headers(self.host_url, if_api=False) + self.api_headers = self.get_headers(self.host_url, if_api=True, if_json_for_api=True) + self.session = None + self.language_map = None + self.captcha_id = None # '24f56dc13c40dc4a02fd0318567caef5' + self.geetest_load_data = None + self.geetest_verify_data = None + self.query_count = 0 + self.output_zh = 'zh' + self.input_limit = int(5e3) + self.default_from_language = self.output_zh + + @Tse.debug_language_map + def get_language_map(self, lang_url: str, ss: SessionType, headers: dict, timeout: Optional[float], proxies: Optional[dict], **kwargs: LangMapKwargsType) -> dict: + d_lang_map = ss.get(lang_url, headers=headers, timeout=timeout, proxies=proxies).json() + lang_list = sorted(set([it['code'] for item in d_lang_map['languageList'] for it in item['result']])) + return {}.fromkeys(lang_list, lang_list) + + def get_captcha_id(self, captcha_url: str, ss: SessionType, headers: dict, timeout: Optional[float], proxies: Optional[dict]): + captcha_host_html = ss.get(captcha_url, headers=headers, timeout=timeout, proxies=proxies).text + captcha_js_url_path = re.compile('/_next/static/(.*?)/pages/adaptive-captcha-demo.js').search(captcha_host_html).group(0) + captcha_js_url = f'{self.geetest_host_url}{captcha_js_url_path}' + captcha_js_html = ss.get(captcha_js_url, headers=headers, timeout=timeout, proxies=proxies).text + captcha_id = re.compile('captchaId:"(.*?)",').search(captcha_js_html).group(1) + return captcha_id + + def get_geetest_callback(self): + return f'geetest_{int(self.get_timestamp() + int(random.random() * 1e4))}' + + def get_geetest_w(self, k=1088): + pool = list('abcdef' + '0123456789') + return ''.join(random.choices(pool, k=k)) # TODO + + def get_geetest_data(self, timeout, proxies): + gl_params = { + 'callback': self.get_geetest_callback(), + 'captcha_id': self.captcha_id, + 'challenge': str(uuid.uuid4()), + 'client_type': 'web', # 'h5' + 'lang': 'zh-cn', + } + r_gl = self.session.get(self.geetest_load_url, params=gl_params, headers=self.host_headers, timeout=timeout, proxies=proxies) + self.geetest_load_data = json.loads(r_gl.text[22:-1])['data'] + + gv_params = { + 'callback': self.get_geetest_callback(), + 'captcha_id': self.captcha_id, + 'client_type': 'web', # 'h5' + 'lot_number': self.geetest_load_data['lot_number'], + 'payload': self.geetest_load_data['payload'], + 'process_token': self.geetest_load_data['process_token'], + 'payload_protocol': self.geetest_load_data['payload_protocol'], + 'pt': self.geetest_load_data['pt'], + 'w': self.get_geetest_w(), # TODO + } + r_gv = self.session.get(self.geetest_verify_url, params=gv_params, headers=self.host_headers, timeout=timeout, proxies=proxies) + self.geetest_verify_data = json.loads(r_gv.text[22:-1])['data']['seccode'] + return + + @Tse.uncertified + @Tse.time_stat + @Tse.check_query + def niutrans_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', **kwargs: ApiKwargsType) -> Union[str, dict]: + """ + https://niutrans.com/trans?type=text + :param query_text: str, must. + :param from_language: str, default 'auto'. + :param to_language: str, default 'en'. + :param **kwargs: + :param timeout: float, default None. + :param proxies: dict, default None. + :param sleep_seconds: float, default 0. + :param is_detail_result: bool, default False. + :param if_ignore_limit_of_length: bool, default False. + :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. + :param update_session_after_freq: int, default 1000. + :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :return: str or dict + """ + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time and self.captcha_id): + self.begin_time = time.time() + self.session = requests.Session() + _ = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies) + _ = self.session.get(self.login_url, headers=self.host_headers, timeout=timeout, proxies=proxies) + self.captcha_id = self.get_captcha_id(self.geetest_captcaha_url, self.session, self.host_headers, timeout, proxies) + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, if_print_warning) + self.language_map = self.get_language_map(self.get_language_url, self.session, self.api_headers, timeout, proxies, **debug_lang_kwargs) + + from_language, to_language = self.check_language(from_language, to_language, self.language_map, output_zh=self.output_zh) + if from_language == 'auto': + params = { + 'src_text': query_text, + 'time': self.get_timestamp(), + 'source': 'text', + } + res = self.session.get(self.detect_language_url, params=params, headers=self.host_headers, timeout=timeout, proxies=proxies) + from_language = res.json()['language'] + + self.get_geetest_data(timeout, proxies) + trans_params = { + 'src_text': query_text, + 'from': from_language, + 'to': to_language, + 'source': 'text', + 'dictNo': '', + 'memoryNo': '', + 'lot_number': self.geetest_verify_data['lot_number'], + 'captcha_output': self.geetest_verify_data['captcha_output'], + 'pass_token': self.geetest_verify_data['pass_token'], + 'gen_time': self.geetest_verify_data['gen_time'], + 'time': self.get_timestamp(), + 'isUseDict': 0, + 'isUseMemory': 0, + } + r = self.session.get(self.api_url, params=trans_params, headers=self.api_headers, timeout=timeout, proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + return data if is_detail_result else data['tgt_text'] + + class Mglip(Tse): def __init__(self): super().__init__() @@ -4328,7 +4589,7 @@ def __init__(self): self.api_url = 'https://api-translate.systran.net/translation/text/translate' self.get_lang_url = 'https://api-translate.systran.net/translation/supportedLanguages' self.get_token_url = 'https://translate.systran.net/oidc/token' - self.get_client_url = 'https://www.systran.net/wp-content/themes/systran/translator/js/translateBox.bundle.js' + self.get_client_url = 'https://www.systransoft.com/wp-content/themes/systran/dist/translatebox/translateBox.bundle.js' self.host_headers = self.get_headers(self.home_url, if_api=False, if_referer_for_host=True) self.api_ajax_headers = self.get_headers(self.home_url, if_api=True, if_ajax_for_api=True) self.api_json_headers = self.get_headers(self.home_url, if_api=True, if_json_for_api=True) @@ -4374,7 +4635,7 @@ def get_client_data(self, client_url: str, ss: SessionType, headers: dict, timeo @Tse.check_query def sysTran_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', **kwargs: ApiKwargsType) -> Union[str, dict]: """ - https://www.systran.net/translate/ + https://www.systran.net/translate/, https://www.systransoft.com/translate/ :param query_text: str, must. :param from_language: str, default 'auto'. :param to_language: str, default 'en'. @@ -4853,6 +5114,7 @@ def __init__(self): def get_language_map(self, lang_list: List[str], **kwargs: LangMapKwargsType) -> dict: return {}.fromkeys(lang_list, lang_list) + @Tse.uncertified @Tse.time_stat @Tse.check_query def judic_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', **kwargs: ApiKwargsType) -> Union[str, dict]: @@ -4995,10 +5257,91 @@ def yeekit_api(self, query_text: str, from_language: str = 'auto', to_language: return data if is_detail_result else '\n'.join(' '.join(p) for p in json.loads(data[0])['translation'][0]['translated'][0]['translation list']) +class Hujiang(Tse): + def __init__(self): + super().__init__() + self.begin_time = time.time() + self.host_url = 'https://dict.hjenglish.com/app/trans' + self.api_url = 'https://dict.hjenglish.com/v10/dict/translation' + self.host_headers = self.get_headers(self.host_url, if_api=False) + self.api_headers = self.get_headers(self.host_url, if_api=True) + self.language_map = None + self.session = None + self.hj_uid = str(uuid.uuid4()) + self.query_count = 0 + self.output_zh = 'cn' + self.input_limit = int(5e3) + self.default_from_language = self.output_zh + + @Tse.debug_language_map + def get_language_map(self, host_html: str, **kwargs: LangMapKwargsType) -> dict: + et = lxml.etree.HTML(host_html) + lang_list = sorted(list(set(et.xpath('//*/select[@class="translate-fromLang"]/option/@value')))) + return {}.fromkeys(lang_list, lang_list) + + @Tse.time_stat + @Tse.check_query + def hujiang_api(self, query_text: str, from_language: str = 'auto', to_language: str = 'en', **kwargs: ApiKwargsType) -> Union[str, dict]: + """ + https://dict.hjenglish.com/app/trans + :param query_text: str, must. + :param from_language: str, default 'auto'. + :param to_language: str, default 'en'. + :param **kwargs: + :param timeout: float, default None. + :param proxies: dict, default None. + :param sleep_seconds: float, default 0. + :param is_detail_result: bool, default False. + :param if_ignore_limit_of_length: bool, default False. + :param limit_of_length: int, default 20000. + :param if_ignore_empty_query: bool, default False. + :param update_session_after_freq: int, default 1000. + :param update_session_after_seconds: float, default 1500. + :param if_show_time_stat: bool, default False. + :param show_time_stat_precision: int, default 2. + :param if_print_warning: bool, default True. + :return: str or dict + """ + + timeout = kwargs.get('timeout', None) + proxies = kwargs.get('proxies', None) + sleep_seconds = kwargs.get('sleep_seconds', 0) + if_print_warning = kwargs.get('if_print_warning', True) + is_detail_result = kwargs.get('is_detail_result', False) + update_session_after_freq = kwargs.get('update_session_after_freq', self.default_session_freq) + update_session_after_seconds = kwargs.get('update_session_after_seconds', self.default_session_seconds) + self.check_input_limit(query_text, self.input_limit) + + not_update_cond_freq = 1 if self.query_count % update_session_after_freq != 0 else 0 + not_update_cond_time = 1 if time.time() - self.begin_time < update_session_after_seconds else 0 + if not (self.session and self.language_map and not_update_cond_freq and not_update_cond_time): + self.begin_time = time.time() + self.session = requests.Session() + self.session.cookies.update({'HJ_UID': self.hj_uid, 'HJC_USRC': 'uzhi', 'HJC_NUID': '1'}) + host_html = self.session.get(self.host_url, headers=self.host_headers, timeout=timeout, proxies=proxies).text + debug_lang_kwargs = self.debug_lang_kwargs(from_language, to_language, self.default_from_language, if_print_warning) + self.language_map = self.get_language_map(host_html, **debug_lang_kwargs) + + if from_language == 'auto': + from_language = self.warning_auto_lang('hujiang', self.default_from_language, if_print_warning) + from_language, to_language = self.check_language(from_language, to_language, self.language_map, output_zh=self.output_zh) + + payload = urllib.parse.urlencode({'content': query_text}) + api_url = f'{self.api_url}/{from_language}/{to_language}' + r = self.session.post(api_url, headers=self.api_headers, data=payload, timeout=timeout, proxies=proxies) + r.raise_for_status() + data = r.json() + time.sleep(sleep_seconds) + self.query_count += 1 + return data if is_detail_result else data['data']['content'] # supported by baidu. + + class TranslatorsServer: def __init__(self): self.cpu_cnt = os.cpu_count() - self.server_region = GuestSeverRegion().get_server_region + self._region = Region() + self.get_region_of_server = self._region.get_region_of_server + self.server_region = self.get_region_of_server(if_print_region=False) self._alibaba = AlibabaV2() self.alibaba = self._alibaba.alibaba_api self._apertium = Apertium() @@ -5019,6 +5362,8 @@ def __init__(self): self.elia = self._elia.elia_api self._google = GoogleV2(server_region=self.server_region) self.google = self._google.google_api + self._hujiang = Hujiang() + self.hujiang = self._hujiang.hujiang_api self._iciba = Iciba() self.iciba = self._iciba.iciba_api self._iflytek = IflytekV2() @@ -5033,7 +5378,7 @@ def __init__(self): self.languageWire = self._languageWire.languageWire_api self._lingvanex = Lingvanex() self.lingvanex = self._lingvanex.lingvanex_api - self._niutrans = Niutrans() + self._niutrans = NiutransV2() self.niutrans = self._niutrans.niutrans_api self._mglip = Mglip() self.mglip = self._mglip.mglip_api @@ -5074,26 +5419,26 @@ def __init__(self): self._translators_dict = { 'alibaba': self._alibaba, 'apertium': self._apertium, 'argos': self._argos, 'baidu': self._baidu, 'bing': self._bing, 'caiyun': self._caiyun, 'cloudTranslation': self._cloudTranslation, 'deepl': self._deepl, 'elia': self._elia, 'google': self._google, - 'iciba': self._iciba, 'iflytek': self._iflytek, 'iflyrec': self._iflyrec, 'itranslate': self._itranslate, 'judic': self._judic, - 'languageWire': self._languageWire, 'lingvanex': self._lingvanex, 'niutrans': self._niutrans, 'mglip': self._mglip, 'mirai': self._mirai, - 'modernMt': self._modernMt, 'myMemory': self._myMemory, 'papago': self._papago, 'qqFanyi': self._qqFanyi, 'qqTranSmart': self._qqTranSmart, - 'reverso': self._reverso, 'sogou': self._sogou, 'sysTran': self._sysTran, 'tilde': self._tilde, 'translateCom': self._translateCom, - 'translateMe': self._translateMe, 'utibet': self._utibet, 'volcEngine': self._volcEngine, 'yandex': self._yandex, 'yeekit': self._yeekit, - 'youdao': self._youdao, + 'hujiang': self._hujiang, 'iciba': self._iciba, 'iflytek': self._iflytek, 'iflyrec': self._iflyrec, 'itranslate': self._itranslate, + 'judic': self._judic, 'languageWire': self._languageWire, 'lingvanex': self._lingvanex, 'niutrans': self._niutrans, 'mglip': self._mglip, + 'mirai': self._mirai, 'modernMt': self._modernMt, 'myMemory': self._myMemory, 'papago': self._papago, 'qqFanyi': self._qqFanyi, + 'qqTranSmart': self._qqTranSmart, 'reverso': self._reverso, 'sogou': self._sogou, 'sysTran': self._sysTran, 'tilde': self._tilde, + 'translateCom': self._translateCom, 'translateMe': self._translateMe, 'utibet': self._utibet, 'volcEngine': self._volcEngine, 'yandex': self._yandex, + 'yeekit': self._yeekit, 'youdao': self._youdao, } self.translators_dict = { 'alibaba': self.alibaba, 'apertium': self.apertium, 'argos': self.argos, 'baidu': self.baidu, 'bing': self.bing, 'caiyun': self.caiyun, 'cloudTranslation': self.cloudTranslation, 'deepl': self.deepl, 'elia': self.elia, 'google': self.google, - 'iciba': self.iciba, 'iflytek': self.iflytek, 'iflyrec': self.iflyrec, 'itranslate': self.itranslate, 'judic': self.judic, - 'languageWire': self.languageWire, 'lingvanex': self.lingvanex, 'niutrans': self.niutrans, 'mglip': self.mglip, 'mirai': self.mirai, - 'modernMt': self.modernMt, 'myMemory': self.myMemory, 'papago': self.papago, 'qqFanyi': self.qqFanyi, 'qqTranSmart': self.qqTranSmart, - 'reverso': self.reverso, 'sogou': self.sogou, 'sysTran': self.sysTran, 'tilde': self.tilde, 'translateCom': self.translateCom, - 'translateMe': self.translateMe, 'utibet': self.utibet, 'volcEngine': self.volcEngine, 'yandex': self.yandex, 'yeekit': self.yeekit, - 'youdao': self.youdao, + 'hujiang': self.hujiang, 'iciba': self.iciba, 'iflytek': self.iflytek, 'iflyrec': self.iflyrec, 'itranslate': self.itranslate, + 'judic': self.judic, 'languageWire': self.languageWire, 'lingvanex': self.lingvanex, 'niutrans': self.niutrans, 'mglip': self.mglip, + 'mirai': self.mirai, 'modernMt': self.modernMt, 'myMemory': self.myMemory, 'papago': self.papago, 'qqFanyi': self.qqFanyi, + 'qqTranSmart': self.qqTranSmart, 'reverso': self.reverso, 'sogou': self.sogou, 'sysTran': self.sysTran, 'tilde': self.tilde, + 'translateCom': self.translateCom, 'translateMe': self.translateMe, 'utibet': self.utibet, 'volcEngine': self.volcEngine, 'yandex': self.yandex, + 'yeekit': self.yeekit, 'youdao': self.youdao, } self.translators_pool = list(self.translators_dict.keys()) self.not_en_langs = {'utibet': 'ti', 'mglip': 'mon'} - self.not_zh_langs = {'languageWire': 'fr', 'tilde': 'fr', 'elia': 'fr', 'apertium': 'spa'} + self.not_zh_langs = {'languageWire': 'fr', 'tilde': 'fr', 'elia': 'fr', 'apertium': 'spa', 'judic': 'de'} self.pre_acceleration_label = 0 self.example_query_text = '你好。\n欢迎你!' self.success_translators_pool = [] @@ -5122,9 +5467,9 @@ def translate_text(self, :param update_session_after_freq: int, default 1000. :param update_session_after_seconds: float, default 1500. :param if_use_cn_host: bool, default False. Support google(), bing() only. - :param reset_host_url: str, default None. Support google(), argos(), yandex() only. + :param reset_host_url: str, default None. Support google(), yandex() only. :param if_check_reset_host_url: bool, default True. Support google(), yandex() only. - :param if_ignore_empty_query: bool, default False. + :param if_ignore_empty_query: bool, default True. :param if_ignore_limit_of_length: bool, default False. :param limit_of_length: int, default 20000. :param if_show_time_stat: bool, default False. @@ -5148,7 +5493,7 @@ def translate_html(self, translator: str = 'bing', from_language: str = 'auto', to_language: str = 'en', - n_jobs: int = -1, + n_jobs: int = 1, if_use_preacceleration: bool = False, **kwargs: ApiKwargsType, ) -> str: @@ -5158,7 +5503,7 @@ def translate_html(self, :param translator: str, default 'bing'. :param from_language: str, default 'auto'. :param to_language: str, default 'en'. - :param n_jobs: int, default -1, means os.cpu_cnt(). + :param n_jobs: int, default 1. -1 means os.cpu_cnt(). :param if_use_preacceleration: bool, default False. :param **kwargs: :param is_detail_result: bool, default False. @@ -5171,7 +5516,7 @@ def translate_html(self, :param if_use_cn_host: bool, default False. Support google(), bing() only. :param reset_host_url: str, default None. Support google(), argos(), yandex() only. :param if_check_reset_host_url: bool, default True. Support google(), yandex() only. - :param if_ignore_empty_query: bool, default False. + :param if_ignore_empty_query: bool, default True. :param if_ignore_limit_of_length: bool, default False. :param limit_of_length: int, default 20000. :param if_show_time_stat: bool, default False. @@ -5191,14 +5536,14 @@ def translate_html(self, def _translate_text(sentence: str) -> Tuple[str, str]: return sentence, self.translators_dict[translator](query_text=sentence, from_language=from_language, to_language=to_language, **kwargs) - pattern = re.compile("(?:^|(?<=>))([\\s\\S]*?)(?:(?=<)|$)") # TODO:
+ pattern = re.compile('>([\\s\\S]*?)<') # not perfect sentence_list = list(set(pattern.findall(html_text))) n_jobs = self.cpu_cnt if n_jobs <= 0 else n_jobs with pathos.multiprocessing.ProcessPool(n_jobs) as pool: result_list = pool.map(_translate_text, sentence_list) - result_dict = {text: ts_text for text, ts_text in result_list} + result_dict = {text: f'>{ts_text}<' for text, ts_text in result_list} _get_result_func = lambda k: result_dict.get(k.group(1), '') return pattern.sub(repl=_get_result_func, string=html_text) @@ -5290,6 +5635,8 @@ def preaccelerate_and_speedtest(self, timeout: Optional[float] = None, **kwargs: elia = tss.elia _google = tss._google google = tss.google +_hujiang = tss._hujiang +hujiang = tss.hujiang _iciba = tss._iciba iciba = tss.iciba _iflytek = tss._iflytek @@ -5347,6 +5694,7 @@ def preaccelerate_and_speedtest(self, timeout: Optional[float] = None, **kwargs: translate_html = tss.translate_html translators_pool = tss.translators_pool get_languages = tss.get_languages +get_region_of_server = tss.get_region_of_server preaccelerate = tss.preaccelerate speedtest = tss.speedtest diff --git a/translate_apis.json b/translate_apis.json index 80af613..ead7199 100644 --- a/translate_apis.json +++ b/translate_apis.json @@ -1,5 +1,5 @@ { - "default": "baidu_free", + "default": "alibaba_free", "apis": [ { "type": "dont_need_api_key",