From 0a5d159b8a4a0a6988db30cf953bb94d316a5614 Mon Sep 17 00:00:00 2001 From: naibo Date: Fri, 2 Jun 2023 01:36:06 +0800 Subject: [PATCH] V0.3.2 --- ElectronJS/main.js | 1 + ElectronJS/tasks/96.json | 1 + ElectronJS/tasks/97.json | 1 + ExecuteStage/.vscode/launch.json | 2 +- ExecuteStage/easyspider_executestage.py | 28 +++++++++++++++++++++++-- 5 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 ElectronJS/tasks/96.json create mode 100644 ElectronJS/tasks/97.json diff --git a/ElectronJS/main.js b/ElectronJS/main.js index ac85e987..65582011 100644 --- a/ElectronJS/main.js +++ b/ElectronJS/main.js @@ -296,6 +296,7 @@ async function runBrowser(lang = "en", user_data_folder = '') { } options.addExtensions(path.join(__dirname, "XPathHelper.crx")); options.setChromeBinaryPath(chromeBinaryPath); + options.add if (user_data_folder != "") { let dir = path.join(task_server.getDir(), user_data_folder); console.log(dir); diff --git a/ElectronJS/tasks/96.json b/ElectronJS/tasks/96.json new file mode 100644 index 00000000..fb5c31b5 --- /dev/null +++ b/ElectronJS/tasks/96.json @@ -0,0 +1 @@ +{"id":96,"name":"[2305.07067] SigRec: Automatic Recovery of Function Signatures in Smart Contracts","url":"https://arxiv.org/abs/2305.07067","links":"https://arxiv.org/pdf/2008.03554.pdf","create_time":"6/2/2023, 1:00:27 AM","version":"0.3.2","containJudge":false,"desc":"https://arxiv.org/abs/2305.07067","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/pdf/2008.03554.pdf","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://arxiv.org/pdf/2008.03554.pdf"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://arxiv.org/abs/2305.07067","links":"https://arxiv.org/pdf/2008.03554.pdf","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","wait":2,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"maxWaitTime":10,"paras":[],"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'PDF')]","//A[@class='abs-button download-pdf']"]}}]} \ No newline at end of file diff --git a/ElectronJS/tasks/97.json b/ElectronJS/tasks/97.json new file mode 100644 index 00000000..cbfda6c9 --- /dev/null +++ b/ElectronJS/tasks/97.json @@ -0,0 +1 @@ +{"id":97,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"6/2/2023, 1:34:59 AM","version":"0.3.2","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"},{"id":1,"name":"inputText_1","nodeName":"输入文字","nodeId":3,"desc":"要输入的文本,如京东搜索框输入:电脑","type":"string","exampleValue":"Field[\"参数1_链接文本\"]123Field[\"参数2_链接地Field[\"参数1_链接文本\"]","value":"Field[\"参数1_链接文本\"]123Field[\"参数2_链接地Field[\"参数1_链接文本\"]"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"string","exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"string","exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2,4,3],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":6,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":1,"contentType":0,"relative":false,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., '手机')]"],"exampleValues":[{"num":0,"value":"手机"}],"unique_index":"yaeqxdsd9dlidernop","default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}},{"id":4,"index":3,"parentId":0,"type":0,"option":4,"title":"输入文字","sequence":[],"isInLoop":false,"position":3,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[@id=\"key\"]","wait":10,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"value":"Field[\"参数1_链接文本\"]123Field[\"参数2_链接地Field[\"参数1_链接文本\"]","allXPaths":["/html/body/div[4]/div[1]/div[2]/div[1]/input[1]","//input[contains(., '')]","id(\"key\")","//INPUT[@class='text defcolor']"]}},{"id":3,"index":4,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[5],"isInLoop":false,"position":2,"parameters":{"history":3,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., '手机')]"]}},{"id":5,"index":5,"parentId":3,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":3,"tabIndex":-1,"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":1,"contentType":0,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"},{"num":1,"value":"数码"},{"num":2,"value":"家用电器"},{"num":3,"value":"电脑"},{"num":4,"value":"办公"},{"num":5,"value":"家纺"},{"num":6,"value":"家居"},{"num":7,"value":"厨具"},{"num":8,"value":"家具"},{"num":9,"value":"家装"},{"num":10,"value":"灯具"},{"num":11,"value":"工业品"},{"num":12,"value":"内衣"},{"num":13,"value":"男装"},{"num":14,"value":"女装"},{"num":15,"value":"童装"},{"num":16,"value":"箱包"},{"num":17,"value":"钟表"},{"num":18,"value":"珠宝"},{"num":19,"value":"女鞋"},{"num":20,"value":"运动"},{"num":21,"value":"户外"},{"num":22,"value":"男鞋"},{"num":23,"value":"汽车用品"},{"num":24,"value":"车载电器"},{"num":25,"value":"母婴"},{"num":26,"value":"洗护喂养"},{"num":27,"value":"玩具乐器"},{"num":28,"value":"宠物生活"},{"num":29,"value":"家庭清洁"},{"num":30,"value":"个人护理"},{"num":31,"value":"计生情趣"},{"num":32,"value":"图书"},{"num":33,"value":"童书"},{"num":34,"value":"文学"}],"unique_index":"jsqj8nvtjfplidevrwo","default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"},{"num":1,"value":"https://shuma.jd.com/"},{"num":2,"value":"https://search.jd.com/Search?keyword=%E5%B0%8F%E5%AE%B6%E7%94%B5&enc=utf-8&wq=%E5%B0%8F%E5%AE%B6%E7%94%B5&pvid=261a350161304c979fa0e7ce95c05671"},{"num":3,"value":"https://diannao.jd.com/"},{"num":4,"value":"https://bg.jd.com/"},{"num":5,"value":"https://channel.jd.com/jf.html"},{"num":6,"value":"https://channel.jd.com/home.html"},{"num":7,"value":"https://channel.jd.com/kitchenware.html"},{"num":8,"value":"https://channel.jd.com/furniture.html"},{"num":9,"value":"https://jzjc.jd.com/"},{"num":10,"value":"https://channel.jd.com/9855-9856.html"},{"num":11,"value":"https://pro.jd.com/mall/active/2u2DR1dUiK34csAE3DqmcG8aXvUK/index.html"},{"num":12,"value":"https://channel.jd.com/underwear.html"},{"num":13,"value":"https://channel.jd.com/1315-1342.html"},{"num":14,"value":"https://channel.jd.com/women.html"},{"num":15,"value":"https://list.jd.com/list.html?cat=1319,11842"},{"num":16,"value":"https://channel.jd.com/bag.html"},{"num":17,"value":"https://channel.jd.com/watch.html"},{"num":18,"value":"https://channel.jd.com/jewellery.html"},{"num":19,"value":"https://channel.jd.com/womensshoes.html"},{"num":20,"value":"https://phat.jd.com/10-109.html"},{"num":21,"value":"https://channel.jd.com/outdoor.html"},{"num":22,"value":"https://channel.jd.com/mensshoes.html"},{"num":23,"value":"https://che.jd.com/"},{"num":24,"value":"https://list.jd.com/list.html?cat=6728,6740&page=1&delivery_glb=1&stock=1&sort=sort_totalsales15_desc&trans=1&JL=4_7_0#J_main"},{"num":25,"value":"https://search.jd.com/Search?keyword=%E6%AF%8D%E5%A9%B4&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%AF%8D%E5%A9%B4&stock=1&gp=2&click=1"},{"num":26,"value":"https://channel.jd.com/feed.html"},{"num":27,"value":"https://toy.jd.com/"},{"num":28,"value":"https://channel.jd.com/pet.html"},{"num":29,"value":"https://channel.jd.com/beauty.html"},{"num":30,"value":"https://lady.jd.com/"},{"num":31,"value":"https://channel.jd.com/9192-9196.html"},{"num":32,"value":"https://book.jd.com/"},{"num":33,"value":"https://book.jd.com/children.html"},{"num":34,"value":"https://channel.jd.com/p_wenxuezongheguan.html"}],"unique_index":"jsqj8nvtjfplidevrwo","default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]} \ No newline at end of file diff --git a/ExecuteStage/.vscode/launch.json b/ExecuteStage/.vscode/launch.json index 50a2ed43..c665b42f 100644 --- a/ExecuteStage/.vscode/launch.json +++ b/ExecuteStage/.vscode/launch.json @@ -12,7 +12,7 @@ "console": "integratedTerminal", "justMyCode": true, // "args": ["--id", "38", "--read_type", "local", "--headless", "1"] - "args": ["--id", "[5]", "--headless", "0", "--user_data", "0"] + "args": ["--id", "[80]", "--headless", "0", "--user_data", "0"] } ] } \ No newline at end of file diff --git a/ExecuteStage/easyspider_executestage.py b/ExecuteStage/easyspider_executestage.py index 54898b18..09198416 100644 --- a/ExecuteStage/easyspider_executestage.py +++ b/ExecuteStage/easyspider_executestage.py @@ -702,10 +702,18 @@ def inputInfo(self, para, loopValue): textbox.send_keys(Keys.SHIFT, Keys.END) # Send the DELETE key textbox.send_keys(Keys.DELETE) + value = "" if para["useLoop"]: - textbox.send_keys(loopValue) + value = loopValue else: - textbox.send_keys(para["value"]) + value = para["value"] + pattern = r'Field\["([^"]+)"\]' # 将value中的Field[""]替换为outputParameters中的键值 + try: + replaced_text = re.sub(pattern, lambda match: self.outputParameters.get(match.group(1), ''), value) + except: + replaced_text = value + value = replaced_text + textbox.send_keys(value) self.execute_code(2, para["afterJS"], para["afterJSWaitTime"], textbox) # 执行后置js # global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText self.bodyText = self.browser.find_element(By.CSS_SELECTOR, "body").text @@ -1089,6 +1097,22 @@ def getData(self, para, loopElement, isInLoop=True, parentPath="", index=0): option.add_argument( "--disable-blink-features=AutomationControlled") # TMALL 反扒 options.add_argument("--disable-blink-features=AutomationControlled") # TMALL 反扒 + options.add_experimental_option("prefs", { + "download.default_directory": "Data/", # 设置文件下载路径 + "download.prompt_for_download": False, # 禁止下载提示框 + "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}], + "download.directory_upgrade": True, + "download.extensions_to_open": "applications/pdf", + "plugins.always_open_pdf_externally": True # 总是在外部程序中打开PDF + }) + option.add_experimental_option("prefs", { + "download.default_directory": "Data/", # 设置文件下载路径 + "download.prompt_for_download": False, # 禁止下载提示框 + "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}], + "download.directory_upgrade": True, + "download.extensions_to_open": "applications/pdf", + "plugins.always_open_pdf_externally": True # 总是在外部程序中打开PDF + }) print(options) threads = [] for i in c.id: