Skip to content

Commit 79b02a1

Browse files
committed
Local version
1 parent f125db1 commit 79b02a1

File tree

5 files changed

+94
-22
lines changed

5 files changed

+94
-22
lines changed

C#/.gitignore

+2
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
obj/
2+
.vs/

ExecuteStage/.gitignore

+2
Original file line number | Diff line number | Diff line change
@@ -9,3 +9,5 @@ __pycache__/
99
*.spec
1010
Chrome/
1111
Data/
12+
tasks/
13+
Application/

ExecuteStage/ServiceWrapper_ExecuteStage.py

+88-21
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@
3333
desired_capabilities["pageLoadStrategy"] = "none"
3434
outputParameters = {}
3535

36+
3637
class Time:
3738
def __init__(self, type1=""):
3839
self.t = int(round(time.time() * 1000))
@@ -65,7 +66,7 @@ def scrollDown(para, rt=""):
6566
for i in range(para["scrollCount"]):
6667
time.sleep(1) # 下拉完等1秒
6768
Log("下拉完等待1秒")
68-
body = browser.find_element(By.CSS_SELECTOR,"body")
69+
body = browser.find_element(By.CSS_SELECTOR, "body")
6970
if para["scrollType"] == 1:
7071
body.send_keys(Keys.PGDN)
7172
else:
@@ -78,7 +79,7 @@ def scrollDown(para, rt=""):
7879
for i in range(para["scrollCount"]):
7980
time.sleep(1) # 下拉完等1秒
8081
Log("下拉完等待1秒")
81-
body = browser.find_element(By.CSS_SELECTOR,"body")
82+
body = browser.find_element(By.CSS_SELECTOR, "body")
8283
if para["scrollType"] == 1:
8384
body.send_keys(Keys.PGDN)
8485
else:
@@ -106,7 +107,8 @@ def excuteNode(nodeId, loopValue="", clickPath="", index=0):
106107
clickElement(node["parameters"], loopValue, clickPath, index)
107108
elif node["option"] == 3: # 提取数据
108109
recordLog("getData")
109-
getData(node["parameters"], loopValue, node["isInLoop"], parentPath = clickPath, index = index)
110+
getData(node["parameters"], loopValue, node["isInLoop"],
111+
parentPath=clickPath, index=index)
110112
elif node["option"] == 4: # 输入文字
111113
inputInfo(node["parameters"], loopValue)
112114
elif node["option"] == 8: # 循环
@@ -184,20 +186,35 @@ def loopExcute(node, loopValue, clickPath="", index=0):
184186
count = 0 # 执行次数
185187
while True: # do while循环
186188
try:
187-
element = browser.find_element(By.XPATH,
188-
node["parameters"]["xpath"])
189+
finished = False
190+
element = browser.find_element(
191+
By.XPATH, node["parameters"]["xpath"])
189192
for i in node["sequence"]: # 挨个执行操作
190193
excuteNode(i, element, node["parameters"]["xpath"], 0)
194+
finished = True
191195
Log("click: ", node["parameters"]["xpath"])
192196
recordLog("click:" + node["parameters"]["xpath"])
193-
# except NoSuchElementException:
194-
except:
197+
except NoSuchElementException:
198+
# except:
199+
print("\n\n-------Get Element Error-------\n\n")
195200
Log("clickNotFound: ", node["parameters"]["xpath"])
196201
recordLog("clickNotFound:" + node["parameters"]["xpath"])
197202
for i in node["sequence"]: # 不带点击元素的把剩余的如提取数据的操作执行一遍
198203
if node["option"] != 2:
199204
excuteNode(i, None, node["parameters"]["xpath"], 0)
205+
finished = True
200206
break # 如果找不到元素,退出循环
207+
finally:
208+
if not finished:
209+
print("\n\n-------Retrying-------\n\n")
210+
Log("-------Retrying-------: ",
211+
node["parameters"]["xpath"])
212+
recordLog("clickNotFound:" + node["parameters"]["xpath"])
213+
for i in node["sequence"]: # 不带点击元素的把剩余的如提取数据的操作执行一遍
214+
if node["option"] != 2:
215+
excuteNode(i, None, node["parameters"]["xpath"], 0)
216+
break # 如果找不到元素,退出循环
217+
201218
count = count + 1
202219
Log("页数:", count)
203220
recordLog("页数:" + str(count))
@@ -274,7 +291,8 @@ def loopExcute(node, loopValue, clickPath="", index=0):
274291
excuteNode(i, text, "", 0)
275292
elif int(node["parameters"]["loopType"]) == 4: # 固定网址列表
276293
# tempList = node["parameters"]["textList"].split("\r\n")
277-
urlList = list(filter(isnull, node["parameters"]["textList"].split("\n"))) # 去空行
294+
urlList = list(
295+
filter(isnull, node["parameters"]["textList"].split("\n"))) # 去空行
278296
# urlList = []
279297
# for url in tempList:
280298
# if url != "":
@@ -292,6 +310,7 @@ def loopExcute(node, loopValue, clickPath="", index=0):
292310
def openPage(para, loopValue):
293311
rt = Time("打开网页")
294312
time.sleep(2) # 打开网页后强行等待至少2秒
313+
time.sleep(random.uniform(1, 10)) # 生成一个a到b的小数等待时间
295314
global links
296315
global urlId
297316
global history
@@ -333,7 +352,7 @@ def openPage(para, loopValue):
333352
if containJudge:
334353
global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText
335354
try:
336-
bodyText = browser.find_element(By.CSS_SELECTOR,"body").text
355+
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
337356
Log('URL Page: ' + url)
338357
recordLog('URL Page: ' + url)
339358
except TimeoutException:
@@ -343,7 +362,7 @@ def openPage(para, loopValue):
343362
time.sleep(1)
344363
Log("获得bodytext等待1秒")
345364
# 再执行一遍
346-
bodyText = browser.find_element(By.CSS_SELECTOR,"body").text
365+
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
347366
rt.end()
348367
except Exception as e:
349368
Log(e)
@@ -374,7 +393,7 @@ def inputInfo(para, loopValue):
374393
else:
375394
textbox.send_keys(para["value"])
376395
global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText
377-
bodyText = browser.find_element(By.CSS_SELECTOR,"body").text
396+
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
378397
rt.end()
379398

380399

@@ -404,6 +423,7 @@ def clickElement(para, loopElement=None, clickPath="", index=0):
404423
recordLog(str(e))
405424
time.sleep(0.5) # 点击之后等半秒
406425
Log("点击之后等待0.5秒")
426+
time.sleep(random.uniform(1, 10)) # 生成一个a到b的小数等待时间
407427
if tempHandleNum != len(browser.window_handles): # 如果有新标签页的行为发生
408428
browser.switch_to.window(browser.window_handles[-1]) # 跳转到新的标签页
409429
history["handle"] = browser.current_window_handle
@@ -425,15 +445,15 @@ def clickElement(para, loopElement=None, clickPath="", index=0):
425445
if containJudge: # 有判断语句才执行以下操作
426446
global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText
427447
try:
428-
bodyText = browser.find_element(By.CSS_SELECTOR,"body").text
448+
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
429449
except TimeoutException:
430450
Log('time out after 10 seconds when getting body text')
431451
recordLog('time out after 10 seconds when getting body text')
432452
browser.execute_script('window.stop()')
433453
time.sleep(1)
434454
Log("bodytext等待1秒")
435455
# 再执行一遍
436-
bodyText = browser.find_element(By.CSS_SELECTOR,"body").text
456+
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
437457
rt.end()
438458
except Exception as e:
439459
Log(e)
@@ -442,7 +462,7 @@ def clickElement(para, loopElement=None, clickPath="", index=0):
442462

443463

444464
# 提取数据事件
445-
def getData(para, loopElement, isInLoop=True, parentPath="", index = 0):
465+
def getData(para, loopElement, isInLoop=True, parentPath="", index=0):
446466
if not isInLoop and para["wait"] == 0:
447467
time.sleep(1) # 如果提取数据字段不在循环内而且设置的等待时间为0,默认等待1秒
448468
Log("提取数据等待1秒")
@@ -454,12 +474,14 @@ def getData(para, loopElement, isInLoop=True, parentPath="", index = 0):
454474
if p["relativeXpath"] == "": # 相对xpath有时候就是元素本身,不需要二次查找
455475
element = loopElement
456476
else:
457-
if p["relativeXpath"].find("//")>=0: # 如果字串里有//即子孙查找,则不动语句
458-
full_path = "(" + parentPath + p["relativeXpath"] + ")" + "[" + str(index + 1) + "]"
477+
if p["relativeXpath"].find("//") >= 0: # 如果字串里有//即子孙查找,则不动语句
478+
full_path = "(" + parentPath + \
479+
p["relativeXpath"] + ")" + \
480+
"[" + str(index + 1) + "]"
459481
element = browser.find_element(By.XPATH, full_path)
460482
else:
461483
element = loopElement.find_element(By.XPATH,
462-
p["relativeXpath"][1:])
484+
p["relativeXpath"][1:])
463485
else:
464486
element = browser.find_element(By.XPATH, p["relativeXpath"])
465487
except NoSuchElementException: # 找不到元素的时候,使用默认值
@@ -638,6 +660,7 @@ def clean():
638660
if __name__ == '__main__':
639661
options = Options()
640662
exe_path = "chromedriver.exe"
663+
option = webdriver.ChromeOptions()
641664
if os.path.exists(os.getcwd()+"/ServiceWrapper"):
642665
print("Finding chromedriver in ServiceWrapper",
643666
os.getcwd()+"/ServiceWrapper")
@@ -651,11 +674,37 @@ def clean():
651674
elif os.getcwd().find("ExecuteStage") >= 0: # 如果直接执行
652675
print("Finding chromedriver in ServiceWrapper",
653676
os.getcwd()+"/Debug")
654-
options.binary_location = "./Chrome/chrome.exe" # 指定chrome位置
655-
exe_path = "./Chrome/chromedriver.exe"
677+
option.binary_location = "./Application/chrome.exe" # 指定chrome位置
678+
# option.binary_location = "C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe"
679+
exe_path = "./Application/chromedriver.exe"
656680
else:
657681
options.binary_location = "chrome.exe" # 指定chrome位置
658-
browser = webdriver.Chrome(options=options, executable_path=exe_path)
682+
683+
option.add_experimental_option(
684+
'excludeSwitches', ['enable-automation']) # 以开发者模式
685+
686+
# user_data_dir = r'' # 注意没有Default!
687+
688+
# options.add_argument('--user-data-dir='+p)
689+
690+
# 总结:
691+
# 0. 带Cookie需要用userdatadir
692+
# 1. chrome_options才是配置用户文件和chrome文件地址的正确选项
693+
# 2. User Profile文件夹的路径是:C:\Users\用户名\AppData\Local\Google\Chrome\User Data不要加Default
694+
# 3. 就算User Profile相同,chrome版本不同所存储的cookie信息也不同,也不能爬
695+
# 4. TMALL如果一直弹出验证码,而且无法通过验证,那么需要在其他浏览器上用
696+
697+
option.add_argument(
698+
'--user-data-dir=C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\User Data') # TMALL 反扒
699+
option.add_argument("--profile-directory=Default")
700+
# options.add_argument(
701+
# '--user-data-dir=C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\User Data') # TMALL 反扒
702+
option.add_argument(
703+
"--disable-blink-features=AutomationControlled") # TMALL 反扒
704+
print(options)
705+
browser = webdriver.Chrome(
706+
options=options, chrome_options=option, executable_path=exe_path)
707+
wait = WebDriverWait(browser, 10)
659708
browser.get('about:blank')
660709
browser.set_page_load_timeout(10) # 加载页面最大超时时间
661710
browser.set_script_timeout(10)
@@ -675,7 +724,25 @@ def clean():
675724
else:
676725
backEndAddress = "http://servicewrapper.naibo.wang"
677726

678-
content = requests.get(backEndAddress + "/backEnd/queryTask?id=" + str(id))
727+
# TODO when transfer to electron, use commandline-config
728+
config = {
729+
"type": "remote",
730+
}
731+
from commandline_config import Config
732+
c = Config(config)
733+
co = c
734+
co = {"type": "remote"}
735+
if len(sys.argv) > 4:
736+
co = sys.argv[4]
737+
if co["type"] == "remote":
738+
print("remote")
739+
content = requests.get(
740+
backEndAddress + "/backEnd/queryTask?id=" + str(id))
741+
service = json.loads(content.text)
742+
else:
743+
print("local")
744+
with open("tasks/" + str(id) + ".json", 'r', encoding='utf-8') as f:
745+
content = f.read()
679746
service = json.loads(content.text) # 加载服务信息
680747
print("name:", service["name"])
681748
procedure = service["graph"] # 程序执行流程

ExecuteStage/packageCommand.cmd

+1
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
pyinstaller -F --icon=favicon.ico .\ServiceWrapper_ExecuteStage.py

ExecuteStage/service_invoke.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ def invokeService(id, data):
3636
count = len(os.listdir("tasks")) + 1
3737
service["id"] = count # 修改id
3838
print(count)
39-
with open("tasks/%d.json" % count, "w", ) as f:
39+
with open("tasks/%d.json" % count, "w", encoding='utf-8') as f:
4040
s = json.dumps(service, ensure_ascii=False)
4141
f.write(s)
4242
return count

0 commit comments

Comments
 (0)