Skip to content

Commit

Permalink
macOS build with two execute-stage versions
Browse files Browse the repository at this point in the history
  • Loading branch information
Naibo_Mac_M2 committed Dec 23, 2023
1 parent e79eecc commit 476cec0
Show file tree
Hide file tree
Showing 9 changed files with 89 additions and 75 deletions.
1 change: 1 addition & 0 deletions .temp_to_pub/.gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
EasySpider_MacOS/easyspider_executestage
EasySpider_MacOS/easyspider_executestage_full
EasySpider_Linux64_x64/user_data
EasySpider_windows_x32/user_data
EasySpider
Expand Down
2 changes: 1 addition & 1 deletion ElectronJS/config.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"D:\\Documents\\Projects\\EasySpider\\ElectronJS\\user_data"}
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":1,"sys_version":"x64","mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"/Users/naibo/Documents/EasySpider/ElectronJS/user_data"}
40 changes: 20 additions & 20 deletions ElectronJS/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,27 +59,27 @@ let chromeBinaryPath = "";
let execute_path = "";
console.log(process.arch);

// Warn Windows 7 users who started the x64 build that they need the x32 build.
// `wmic` is only invoked on Windows, so the probe is skipped on other platforms
// (e.g. macOS) instead of being commented out for everyone.
if (process.platform === "win32") {
  exec(`wmic os get Caption`, function (error, stdout, stderr) {
    if (error) {
      console.error(`执行的错误: ${error}`);
      return;
    }

    if (!stdout.includes("Windows 7")) {
      console.log("Not Windows 7");
      return;
    }

    console.log("Windows 7");
    // NOTE(review): the config.json in this commit has a "sys_version" key but
    // no "sys_arch" key — confirm `config.sys_arch` is really populated,
    // otherwise this dialog can never be shown.
    const sys_arch = config.sys_arch;
    if (sys_arch === "x64") {
      dialog.showMessageBoxSync({
        type: "error",
        title: "Error",
        message:
          "Windows 7系统请下载使用x32版本的软件,不论Win 7系统为x64还是x32版本。\nFor Windows 7, please download and use the x32 version of the software, regardless of whether the Win 7 system is x64 or x32 version.",
      });
    }
  });
}

if (process.platform === "win32" && process.arch === "ia32") {
driverPath = path.join(__dirname, "chrome_win32/chromedriver_win32.exe");
Expand Down
5 changes: 4 additions & 1 deletion ElectronJS/src/taskGrid/executeTask.html
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,9 @@ <h4 class="modal-title" id="myModalLabel">{{"Task Execution Instruction~执行
<div class="modal-body">
<input onkeydown="inputDelete(event)" id="serviceId" type="hidden" name="serviceId" value="-1"></input>
<input onkeydown="inputDelete(event)" id="url" type="hidden" name="url" value="about:blank"></input>
<label>{{ `Please open a terminal (For Windows, please use PowerShell instead of CMD), go to EasySpider's folder, and then copy (Command/Ctrl + c) the following command to run the task (EasySpider cannot quit when executing command, unless --read_type is set to "local"):~请在EasySpider目录下打开命令行工具Terminal (Windows请使用PowerShell而不是CMD),然后复制(Command/Ctrl + c)和运行以下命令以执行任务(执行命令时不能退出EasySpider,除非将--read_type设置为local):` | lang }}</label>
<label><a href="https://github.com/NaiboWang/EasySpider/wiki/Argument-Instruction" target="_blank">{{`Click Here~点击这里` | lang}}</a> {{`Here to see argument instruction.~这里查看参数配置说明。` | lang}}</label>
<!-- macOS-only hint about the two executors; the text is ordered "English~Chinese" like every other string passed to the lang filter. -->
<label v-if="OS=='darwin'">{{`For MacOS system, EasySpider provides two different execution programs, 'easyspider_executestage' and 'easyspider_executestage_full', the former loads faster when executing, and provides all functions except OCR recognition and data deduplication; the latter provides all functions including OCR recognition and data deduplication, but the loading speed is slower when running, and it takes 2-10 minutes to wait for the program to execute, please choose which program to execute according to your needs.~对于MacOS系统,EasySpider提供了两个不同的执行程序,分别为easyspider_executestage和easyspider_executestage_full,前者执行时加载速度较快,并提供了除OCR识别和数据去重以外的全部功能;后者则提供了包括OCR识别和数据去重在内的全部功能,但运行时加载速度较慢,需要等待2-10分钟才能执行程序,请根据自己的需求选择执行哪个程序。` | lang}}</label>
<label>{{ `Please open a terminal (For Windows, please use PowerShell instead of CMD), go to EasySpider's folder, and then copy (Command/Ctrl + c) the following command to run the task (EasySpider cannot quit when executing command, unless --read_type is set to "local"):~请在EasySpider目录下打开命令行工具Terminal (Windows请使用PowerShell而不是CMD),然后复制(Command/Ctrl + c)和运行以下命令以执行任务(执行命令时不能退出EasySpider,除非将--read_type设置为local):` | lang }}</label>
<textarea class="form-control" style="height:150px">cd {{easyspider_location}}
{{command}} --config_folder "{{config_folder}}" --headless 0 --read_type remote --config_file_name config.json --saved_file_name </textarea>
</div>
Expand Down Expand Up @@ -314,6 +315,7 @@ <h4 style="text-align: center;">{{"Task Execution~任务执行" | lang}}</h4>
config_folder: "",
easyspider_location: "",
mysql_config_path: "",
OS: "win32",
}, mounted() {
$.get(this.backEndAddressServiceWrapper + "/getConfig", function (result) {
app.$data.user_data_folder = result.user_data_folder;
Expand Down Expand Up @@ -540,6 +542,7 @@ <h4 style="text-align: center;">{{"Task Execution~任务执行" | lang}}</h4>

function changeCommand() {
$.get(app.$data.backEndAddressServiceWrapper + "/queryOSVersion", function (OSInfo) {
app.$data.OS = OSInfo.version;
if(OSInfo.version == 'win32' && OSInfo.bit == 'x64'){
app.$data.command = "./EasySpider/resources/app/chrome_win64/easyspider_executestage.exe --ids [" + app.$data.ID.toString() + "] --user_data " + (app.$data.with_user_data ? "1" : "0") + " --server_address " + app.$data.backEndAddressServiceWrapper;
} else if(OSInfo.version == 'win32' && OSInfo.bit == 'ia32'){
Expand Down
1 change: 1 addition & 0 deletions ElectronJS/tasks/308.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":308,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2023-12-23 14:21:24","update_time":"2023-12-23 14:23:36","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":1,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","befo
reJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., '手机')]","/html/body/div[last()-6]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":8,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"m5moh4pro4rlqhoa60d","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"m5moh4pro4rlqhoa60d","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]}
23 changes: 16 additions & 7 deletions ExecuteStage/easyspider_executestage.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,25 @@
# import hashlib
import time
import requests
from ddddocr import DdddOcr
from multiprocessing import freeze_support
freeze_support() # 防止无限死循环多开
# OCR support is optional: if ddddocr/onnxruntime cannot be imported, warn the
# user and continue so that tasks which do not need OCR can still run.
try:
    from ddddocr import DdddOcr
    import onnxruntime
    onnxruntime.set_default_logger_severity(3)  # hide onnxruntime's log output
except Exception:
    # Deliberately not a bare `except:` — KeyboardInterrupt and SystemExit must
    # still be able to stop the program during start-up.
    print("OCR识别无法在当前环境下使用(ddddocr库缺失),请使用完整版执行器easyspider_executestage_full来运行需要OCR识别的任务。")
    print("OCR recognition cannot be used in the current environment (ddddocr library is missing), please use the executor with ddddocr 'easyspider_executestage_full' to run the task which requires OCR recognition.")
    time.sleep(2)  # presumably leaves the warning on screen long enough to be read
from urllib.parse import urljoin
from lxml import etree, html
# Data deduplication is optional: if pandas cannot be imported, warn the user
# and continue so that tasks which do not need deduplication can still run.
try:
    import pandas as pd
except Exception:
    # Deliberately not a bare `except:` — KeyboardInterrupt and SystemExit must
    # still be able to stop the program during start-up.
    print("数据去重无法在当前环境下使用(pandas库缺失),请使用完整版执行器easyspider_executestage_full来运行需要去重的任务。")
    print("Data deduplication cannot be used in the current environment (pandas library is missing), please use the executor with pandas 'easyspider_executestage_full' to run the task which requires data deduplication.")
    time.sleep(2)  # presumably leaves the warning on screen long enough to be read

import onnxruntime

onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
import pandas as pd
# import numpy
# import pytesseract
# import uuid
Expand Down Expand Up @@ -2185,8 +2196,6 @@ def getData(self, param, loopElement, isInLoop=True, parentPath="", index=0):
self.OUTPUT.append(line)

if __name__ == '__main__':
from multiprocessing import freeze_support
freeze_support() # 防止无限死循环多开
# 如果需要调试程序,请在命令行参数中加入--keyboard 0 来禁用键盘监听以提升调试速度
# If you need to debug the program, please add --keyboard 0 in the command line parameters to disable keyboard listening to improve debugging speed
config = {
Expand Down
14 changes: 12 additions & 2 deletions ExecuteStage/generateExecutable_Macos.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,17 @@
# Build two macOS executors with PyInstaller: first one without ddddocr and
# pandas, then one with them. The version without ddddocr and pandas runs much faster.

# Remove the output of any previous PyInstaller run.
rm -r build
rm -r dist
# NOTE(review): the next two commands both write dist/easyspider_executestage, so
# the output of this first (full) build looks like it is overwritten by the lean
# build right after it — confirm whether this line is still needed.
pyinstaller -F --icon=favicon.ico --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/onnxruntime/capi/onnxruntime_pybind11_state.so:onnxruntime/capi" --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/ddddocr/common_old.onnx:ddddocr" easyspider_executestage.py
# Lean build: exclude ddddocr, onnxruntime, pandas and the other listed modules.
pyinstaller -F --icon=favicon.ico easyspider_executestage.py --exclude-module ddddocr --exclude-module onnxruntime --exclude-module onnx --exclude-module onnxruntime_pybind11_state.so --exclude-module pillow --exclude-module pandas --exclude-module numpy --exclude-module scipy --exclude-module sklearn

# Replace the previously published lean executor with the fresh build.
rm ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage
rm ../ElectronJS/easyspider_executestage
cp dist/easyspider_executestage ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage
# mv dist/easyspider_executestage ../ElectronJS/easyspider_executestage

echo "With ddddocr and pandas"

# Build the version that bundles ddddocr and pandas.
rm -r build
rm -r dist
# Full build: ship the onnxruntime native library and the ddddocr ONNX model as data files.
pyinstaller -F --icon=favicon.ico --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/onnxruntime/capi/onnxruntime_pybind11_state.so:onnxruntime/capi" --add-data "/Users/naibo/anaconda3/lib/python3.11/site-packages/ddddocr/common_old.onnx:ddddocr" easyspider_executestage.py
# Publish the full build under the "_full" name.
rm ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage_full
cp dist/easyspider_executestage ../.temp_to_pub/EasySpider_MacOS/easyspider_executestage_full
34 changes: 34 additions & 0 deletions ExecuteStage/package_size.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Print the on-disk size of every installed Python distribution, largest first.

Sizes are taken from each distribution's own installed-file record, so packages
whose import name differs from their distribution name (hyphenated names,
single-file modules, renamed top-level packages) are measured instead of being
reported as 0 MB. Distributions without a file record fall back to a top-level
directory lookup in site-packages.
"""
import sysconfig
from importlib import metadata
from pathlib import Path


def directory_size(directory):
    """Return the total size in bytes of all regular files below ``directory``.

    A path that does not exist, or is not a directory, counts as 0 bytes.
    """
    directory = Path(directory)
    if not directory.is_dir():
        return 0
    return sum(f.stat().st_size for f in directory.glob('**/*') if f.is_file())


def fallback_directories(name, site_packages):
    """Return candidate top-level directories for distribution ``name``.

    Tries the name as-is, with hyphens turned into underscores, and lower-cased,
    in that order and without duplicates.
    """
    underscored = name.replace('-', '_')
    candidates = []
    for candidate in (name, underscored, underscored.lower()):
        path = Path(site_packages) / candidate
        if path not in candidates:
            candidates.append(path)
    return candidates


def distribution_size(dist, name, site_packages):
    """Return the installed size in bytes of one distribution.

    ``dist`` must expose ``files`` and ``locate_file`` like
    ``importlib.metadata.Distribution``. When ``dist.files`` is ``None`` (no
    file record was installed) the first existing fallback directory is measured.
    """
    recorded = dist.files
    if recorded is None:
        for directory in fallback_directories(name, site_packages):
            if directory.is_dir():
                return directory_size(directory)
        return 0

    total = 0
    for entry in recorded:
        path = Path(dist.locate_file(entry))
        try:
            # Recorded files (e.g. stale bytecode) may no longer exist on disk.
            if path.is_file():
                total += path.stat().st_size
        except OSError:
            continue
    return total


def collect_package_sizes():
    """Return a ``{distribution name: size in bytes}`` dict for this environment."""
    # Ask the interpreter where site-packages is instead of assuming the POSIX
    # "lib/pythonX.Y/site-packages" layout, which is wrong on Windows.
    site_packages = sysconfig.get_paths()['purelib']
    sizes = {}
    for dist in metadata.distributions():
        meta = dist.metadata
        name = meta.get('Name') if meta is not None else None
        # Skip broken metadata, and copies shadowed by an earlier sys.path entry.
        if not name or name in sizes:
            continue
        sizes[name] = distribution_size(dist, name, site_packages)
    return sizes


def main():
    """Print every distribution as ``name: size MB``, sorted by size descending."""
    package_sizes = collect_package_sizes()
    for name, size in sorted(package_sizes.items(), key=lambda item: item[1], reverse=True):
        print(f"{name}: {size/1024/1024:.2f} MB")


if __name__ == '__main__':
    main()
44 changes: 0 additions & 44 deletions ExecuteStage/test.py

This file was deleted.

0 comments on commit 476cec0

Please sign in to comment.