Skip to content

Commit

Permalink
introduce doc2x
Browse files Browse the repository at this point in the history
  • Loading branch information
binary-husky committed Apr 14, 2024
1 parent c131ec0 commit 160552c
Show file tree
Hide file tree
Showing 7 changed files with 208 additions and 10 deletions.
4 changes: 4 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,10 @@
MATHPIX_APPKEY = ""


# Mathpix 拥有执行PDF的OCR功能,但是需要注册账号
DOC2X_API_KEY = ""


# 自定义API KEY格式
CUSTOM_API_KEY_PATTERN = ""

Expand Down
112 changes: 111 additions & 1 deletion crazy_functions/PDF批量翻译.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from toolbox import CatchException, report_exception, get_log_folder, gen_time_str, check_packages
from toolbox import update_ui, promote_file_to_downloadzone, update_ui_lastest_msg, disable_auto_promotion
from toolbox import write_history_to_file, promote_file_to_downloadzone
from toolbox import write_history_to_file, promote_file_to_downloadzone, get_conf, extract_archive
from toolbox import get_upload_folder, zip_folder
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
from .crazy_utils import read_and_clean_pdf_text
Expand Down Expand Up @@ -46,13 +47,122 @@ def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
return

# 开始正式执行任务
DOC2X_API_KEY = get_conf("DOC2X_API_KEY")
if len(DOC2X_API_KEY) != 0:
yield from 解析PDF_DOC2X(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request)
return
grobid_url = get_avail_grobid_url()
if grobid_url is not None:
yield from 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url)
return
else:
yield from update_ui_lastest_msg("GROBID服务不可用,请检查config中的GROBID_URL。作为替代,现在将执行效果稍差的旧版代码。", chatbot, history, delay=3)
yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
return



def 解析PDF_DOC2X_单文件(fp, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, DOC2X_API_KEY, user_request):

def pdf2markdown(filepath):
import requests, json, os
markdown_dir = get_log_folder(plugin_name="pdf_ocr")
doc2x_api_key = DOC2X_API_KEY
url = "https://api.doc2x.noedgeai.com/api/v1/pdf"

chatbot.append((None, "加载PDF文件,发送至DOC2X解析..."))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

res = requests.post(
url,
files={"file": open(filepath, "rb")},
data={"ocr": "1"},
headers={"Authorization": "Bearer " + doc2x_api_key}
)
res_json = []
if res.status_code == 200:
decoded = res.content.decode("utf-8")
for z_decoded in decoded.split('\n'):
if len(z_decoded) == 0: continue
assert z_decoded.startswith("data: ")
z_decoded = z_decoded[len("data: "):]
decoded_json = json.loads(z_decoded)
res_json.append(decoded_json)
else:
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
uuid = res_json[0]['uuid']
to = "md" # latex, md, docx
url = "https://api.doc2x.noedgeai.com/api/export"+"?request_id="+uuid+"&to="+to

chatbot.append((None, f"读取解析: {url} ..."))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面

res = requests.get(url, headers={"Authorization": "Bearer " + doc2x_api_key})
md_zip_path = os.path.join(markdown_dir, gen_time_str() + '.zip')
if res.status_code == 200:
with open(md_zip_path, "wb") as f: f.write(res.content)
else:
raise RuntimeError(format("[ERROR] status code: %d, body: %s" % (res.status_code, res.text)))
promote_file_to_downloadzone(md_zip_path, chatbot=chatbot)
chatbot.append((None, f"完成解析 {md_zip_path} ..."))
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
return md_zip_path

def deliver_to_markdown_plugin(md_zip_path, user_request):
from crazy_functions.批量Markdown翻译 import Markdown英译中
import shutil

time_tag = gen_time_str()
target_path_base = get_log_folder(chatbot.get_user())
file_origin_name = os.path.basename(md_zip_path)
this_file_path = os.path.join(target_path_base, file_origin_name)
os.makedirs(target_path_base, exist_ok=True)
shutil.copyfile(md_zip_path, this_file_path)
ex_folder = this_file_path + ".extract"
extract_archive(
file_path=this_file_path, dest_dir=ex_folder
)
chatbot.append((None, f"调用Markdown插件 {ex_folder} ..."))
plugin_kwargs['markdown_expected_output_dir'] = ex_folder

translated_f_name = 'translated_markdown.md'
generated_fp = plugin_kwargs['markdown_expected_output_path'] = os.path.join(ex_folder, translated_f_name)
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
yield from Markdown英译中(ex_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request)
if os.path.exists(generated_fp):
# 修正一些公式问题
with open(generated_fp, 'r', encoding='utf8') as f:
content = f.read()
# 将公式中的\[ \]替换成$$
content = content.replace(r'\[', r'$$').replace(r'\]', r'$$')
# 将公式中的\( \)替换成$
content = content.replace(r'\(', r'$').replace(r'\)', r'$')
content = content.replace('```', '\n').replace('```markdown', '\n')
with open(generated_fp, 'w', encoding='utf8') as f:
f.write(content)
# 生成包含图片的压缩包
dest_folder = get_log_folder(chatbot.get_user())
zip_name = '翻译后的带图文档.zip'
zip_folder(source_folder=ex_folder, dest_folder=dest_folder, zip_name=zip_name)
zip_fp = os.path.join(dest_folder, zip_name)
# 生成在线预览html
file_name = '在线预览翻译' + gen_time_str() + '.html'
with open('crazy_functions/pdf_fns/report_template_v2.html', 'r', encoding='utf8') as f:
html_template = f.read()
html_template = html_template.replace("{MARKDOWN_FILE_PATH}", translated_f_name)
preview_fp = os.path.join(ex_folder, file_name)
with open(preview_fp, 'w', encoding='utf8') as f:
f.write(html_template)
promote_file_to_downloadzone(preview_fp, chatbot=chatbot)
promote_file_to_downloadzone(zip_fp, chatbot=chatbot)
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
md_zip_path = yield from pdf2markdown(fp)
yield from deliver_to_markdown_plugin(md_zip_path, user_request)

def 解析PDF_DOC2X(file_manifest, *args):
for index, fp in enumerate(file_manifest):
yield from 解析PDF_DOC2X_单文件(fp, *args)
return

def 解析PDF_基于GROBID(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, grobid_url):
import copy, json
Expand Down
73 changes: 73 additions & 0 deletions crazy_functions/pdf_fns/report_template_v2.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">

<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>GPT-Academic 翻译报告书</title>
<style>
.centered-a {
color: red;
text-align: center;
margin-bottom: 2%;
font-size: 1.5em;
}
.centered-b {
color: red;
text-align: center;
margin-top: 10%;
margin-bottom: 20%;
font-size: 1.5em;
}
.centered-c {
color: rgba(255, 0, 0, 0);
text-align: center;
margin-top: 2%;
margin-bottom: 20%;
font-size: 7em;
}
</style>
<script>
// Configure MathJax settings
MathJax = {
tex: {
inlineMath: [
['$', '$'],
['\(', '\)']
]
}
}
addEventListener('zero-md-rendered', () => {MathJax.typeset(); console.log('MathJax typeset!');})
</script>
<!-- Load MathJax library -->
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml.js"></script>
<script
type="module"
src="https://cdn.jsdelivr.net/gh/zerodevx/zero-md@2/dist/zero-md.min.js"
></script>

</head>

<body>
<div class="test_temp1" style="width:10%; height: 500px; float:left;">

</div>
<div class="test_temp2" style="width:80%; height: 500px; float:left;">
<!-- Simply set the `src` attribute to your MD file and win -->
<div class="centered-a">
请按Ctrl+S保存此页面,否则该页面可能在几分钟后失效。
</div>
<zero-md src="translated_markdown.md" no-shadow>
</zero-md>
<div class="centered-b">
本报告由GPT-Academic开源项目生成,地址:https://github.com/binary-husky/gpt_academic。
</div>
<div class="centered-c">
本报告由GPT-Academic开源项目生成,地址:https://github.com/binary-husky/gpt_academic。
</div>
</div>
<div class="test_temp3" style="width:10%; height: 500px; float:left;">
</div>

</body>

</html>
15 changes: 10 additions & 5 deletions crazy_functions/批量Markdown翻译.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import glob, time, os, re, logging
import glob, shutil, os, re, logging
from toolbox import update_ui, trimmed_format_exc, gen_time_str, disable_auto_promotion
from toolbox import CatchException, report_exception, get_log_folder
from toolbox import write_history_to_file, promote_file_to_downloadzone
Expand Down Expand Up @@ -69,17 +69,17 @@ def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch

# <-------- 多线程翻译开始 ---------->
if language == 'en->zh':
inputs_array = ["This is a Markdown file, translate it into Chinese, do not modify any existing Markdown commands:" +
inputs_array = ["This is a Markdown file, translate it into Chinese, do NOT modify any existing Markdown commands:" +
f"\n\n{frag}" for frag in pfg.sp_file_contents]
inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
elif language == 'zh->en':
inputs_array = [f"This is a Markdown file, translate it into English, do not modify any existing Markdown commands:" +
inputs_array = [f"This is a Markdown file, translate it into English, do NOT modify any existing Markdown commands:" +
f"\n\n{frag}" for frag in pfg.sp_file_contents]
inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
else:
inputs_array = [f"This is a Markdown file, translate it into {language}, do not modify any existing Markdown commands, only answer me with translated results:" +
inputs_array = [f"This is a Markdown file, translate it into {language}, do NOT modify any existing Markdown commands, do NOT use code wrapper (```), ONLY answer me with translated results:" +
f"\n\n{frag}" for frag in pfg.sp_file_contents]
inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
Expand All @@ -99,7 +99,12 @@ def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, ch
for i_say, gpt_say in zip(gpt_response_collection[0::2], gpt_response_collection[1::2]):
pfg.sp_file_result.append(gpt_say)
pfg.merge_result()
pfg.write_result(language)
output_file_arr = pfg.write_result(language)
for output_file in output_file_arr:
promote_file_to_downloadzone(output_file, chatbot=chatbot)
if 'markdown_expected_output_path' in plugin_kwargs:
expected_f_name = plugin_kwargs['markdown_expected_output_path']
shutil.copyfile(output_file, expected_f_name)
except:
logging.error(trimmed_format_exc())

Expand Down
2 changes: 2 additions & 0 deletions shared_utils/text_mask.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ def apply_gpt_academic_string_mask(string, mode="show_all"):
当字符串中有掩码tag时(<gpt_academic_string_mask><show_...>),根据字符串要给谁看(大模型,还是web渲染),对字符串进行处理,返回处理后的字符串
示意图:https://mermaid.live/edit#pako:eNqlkUtLw0AUhf9KuOta0iaTplkIPlpduFJwoZEwJGNbzItpita2O6tF8QGKogXFtwu7cSHiq3-mk_oznFR8IYLgrGbuOd9hDrcCpmcR0GDW9ubNPKaBMDauuwI_A9M6YN-3y0bODwxsYos4BdMoBrTg5gwHF-d0mBH6-vqFQe58ed5m9XPW2uteX3Tubrj0ljLYcwxxR3h1zB43WeMs3G19yEM9uapDMe_NG9i2dagKw1Fee4c1D9nGEbtc-5n6HbNtJ8IyHOs8tbs7V2HrlDX2w2Y7XD_5haHEtQiNsOwfMVa_7TzsvrWIuJGo02qTrdwLk9gukQylHv3Afv1ML270s-HZUndrmW1tdA-WfvbM_jMFYuAQ6uCCxVdciTJ1CPLEITpo_GphypeouzXuw6XAmyi7JmgBLZEYlHwLB2S4gHMUO-9DH7tTnvf1CVoFFkBLSOk4QmlRTqpIlaWUHINyNFXjaQWpCYRURUKiWovBYo8X4ymEJFlECQUpqaQkJmuvWygPpg
"""
if not string:
return string
if "<gpt_academic_string_mask>" not in string: # No need to process
return string

Expand Down
10 changes: 6 additions & 4 deletions tests/test_plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,12 @@ def validate_path():

# plugin_test(plugin='crazy_functions.Latex输出PDF->Latex翻译中文并重新编译PDF', main_input="2307.07522")

plugin_test(
plugin="crazy_functions.Latex输出PDF->Latex翻译中文并重新编译PDF",
main_input="G:/SEAFILE_LOCAL/50503047/我的资料库/学位/paperlatex/aaai/Fu_8368_with_appendix",
)
plugin_test(plugin='crazy_functions.PDF批量翻译->批量翻译PDF文档', main_input='build/pdf/t1.pdf')

# plugin_test(
# plugin="crazy_functions.Latex输出PDF->Latex翻译中文并重新编译PDF",
# main_input="G:/SEAFILE_LOCAL/50503047/我的资料库/学位/paperlatex/aaai/Fu_8368_with_appendix",
# )

# plugin_test(plugin='crazy_functions.虚空终端->虚空终端', main_input='修改api-key为sk-jhoejriotherjep')

Expand Down
2 changes: 2 additions & 0 deletions toolbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ def get_list(self):
def get_cookies(self):
return self._cookies

def get_user(self):
return self._cookies.get("user_name", default_user_name)

def ArgsGeneralWrapper(f):
"""
Expand Down

0 comments on commit 160552c

Please sign in to comment.