33
33
desired_capabilities ["pageLoadStrategy" ] = "none"
34
34
outputParameters = {}
35
35
36
+
36
37
class Time :
37
38
def __init__ (self , type1 = "" ):
38
39
self .t = int (round (time .time () * 1000 ))
@@ -65,7 +66,7 @@ def scrollDown(para, rt=""):
65
66
for i in range (para ["scrollCount" ]):
66
67
time .sleep (1 ) # 下拉完等1秒
67
68
Log ("下拉完等待1秒" )
68
- body = browser .find_element (By .CSS_SELECTOR ,"body" )
69
+ body = browser .find_element (By .CSS_SELECTOR , "body" )
69
70
if para ["scrollType" ] == 1 :
70
71
body .send_keys (Keys .PGDN )
71
72
else :
@@ -78,7 +79,7 @@ def scrollDown(para, rt=""):
78
79
for i in range (para ["scrollCount" ]):
79
80
time .sleep (1 ) # 下拉完等1秒
80
81
Log ("下拉完等待1秒" )
81
- body = browser .find_element (By .CSS_SELECTOR ,"body" )
82
+ body = browser .find_element (By .CSS_SELECTOR , "body" )
82
83
if para ["scrollType" ] == 1 :
83
84
body .send_keys (Keys .PGDN )
84
85
else :
@@ -106,7 +107,8 @@ def excuteNode(nodeId, loopValue="", clickPath="", index=0):
106
107
clickElement (node ["parameters" ], loopValue , clickPath , index )
107
108
elif node ["option" ] == 3 : # 提取数据
108
109
recordLog ("getData" )
109
- getData (node ["parameters" ], loopValue , node ["isInLoop" ], parentPath = clickPath , index = index )
110
+ getData (node ["parameters" ], loopValue , node ["isInLoop" ],
111
+ parentPath = clickPath , index = index )
110
112
elif node ["option" ] == 4 : # 输入文字
111
113
inputInfo (node ["parameters" ], loopValue )
112
114
elif node ["option" ] == 8 : # 循环
@@ -184,20 +186,35 @@ def loopExcute(node, loopValue, clickPath="", index=0):
184
186
count = 0 # 执行次数
185
187
while True : # do while循环
186
188
try :
187
- element = browser .find_element (By .XPATH ,
188
- node ["parameters" ]["xpath" ])
189
+ finished = False
190
+ element = browser .find_element (
191
+ By .XPATH , node ["parameters" ]["xpath" ])
189
192
for i in node ["sequence" ]: # 挨个执行操作
190
193
excuteNode (i , element , node ["parameters" ]["xpath" ], 0 )
194
+ finished = True
191
195
Log ("click: " , node ["parameters" ]["xpath" ])
192
196
recordLog ("click:" + node ["parameters" ]["xpath" ])
193
- # except NoSuchElementException:
194
- except :
197
+ except NoSuchElementException :
198
+ # except:
199
+ print ("\n \n -------Get Element Error-------\n \n " )
195
200
Log ("clickNotFound: " , node ["parameters" ]["xpath" ])
196
201
recordLog ("clickNotFound:" + node ["parameters" ]["xpath" ])
197
202
for i in node ["sequence" ]: # 不带点击元素的把剩余的如提取数据的操作执行一遍
198
203
if node ["option" ] != 2 :
199
204
excuteNode (i , None , node ["parameters" ]["xpath" ], 0 )
205
+ finished = True
200
206
break # 如果找不到元素,退出循环
207
+ finally :
208
+ if not finished :
209
+ print ("\n \n -------Retrying-------\n \n " )
210
+ Log ("-------Retrying-------: " ,
211
+ node ["parameters" ]["xpath" ])
212
+ recordLog ("clickNotFound:" + node ["parameters" ]["xpath" ])
213
+ for i in node ["sequence" ]: # 不带点击元素的把剩余的如提取数据的操作执行一遍
214
+ if node ["option" ] != 2 :
215
+ excuteNode (i , None , node ["parameters" ]["xpath" ], 0 )
216
+ break # 如果找不到元素,退出循环
217
+
201
218
count = count + 1
202
219
Log ("页数:" , count )
203
220
recordLog ("页数:" + str (count ))
@@ -274,7 +291,8 @@ def loopExcute(node, loopValue, clickPath="", index=0):
274
291
excuteNode (i , text , "" , 0 )
275
292
elif int (node ["parameters" ]["loopType" ]) == 4 : # 固定网址列表
276
293
# tempList = node["parameters"]["textList"].split("\r\n")
277
- urlList = list (filter (isnull , node ["parameters" ]["textList" ].split ("\n " ))) # 去空行
294
+ urlList = list (
295
+ filter (isnull , node ["parameters" ]["textList" ].split ("\n " ))) # 去空行
278
296
# urlList = []
279
297
# for url in tempList:
280
298
# if url != "":
@@ -292,6 +310,7 @@ def loopExcute(node, loopValue, clickPath="", index=0):
292
310
def openPage (para , loopValue ):
293
311
rt = Time ("打开网页" )
294
312
time .sleep (2 ) # 打开网页后强行等待至少2秒
313
+ time .sleep (random .uniform (1 , 10 )) # 生成一个a到b的小数等待时间
295
314
global links
296
315
global urlId
297
316
global history
@@ -333,7 +352,7 @@ def openPage(para, loopValue):
333
352
if containJudge :
334
353
global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText
335
354
try :
336
- bodyText = browser .find_element (By .CSS_SELECTOR ,"body" ).text
355
+ bodyText = browser .find_element (By .CSS_SELECTOR , "body" ).text
337
356
Log ('URL Page: ' + url )
338
357
recordLog ('URL Page: ' + url )
339
358
except TimeoutException :
@@ -343,7 +362,7 @@ def openPage(para, loopValue):
343
362
time .sleep (1 )
344
363
Log ("获得bodytext等待1秒" )
345
364
# 再执行一遍
346
- bodyText = browser .find_element (By .CSS_SELECTOR ,"body" ).text
365
+ bodyText = browser .find_element (By .CSS_SELECTOR , "body" ).text
347
366
rt .end ()
348
367
except Exception as e :
349
368
Log (e )
@@ -374,7 +393,7 @@ def inputInfo(para, loopValue):
374
393
else :
375
394
textbox .send_keys (para ["value" ])
376
395
global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText
377
- bodyText = browser .find_element (By .CSS_SELECTOR ,"body" ).text
396
+ bodyText = browser .find_element (By .CSS_SELECTOR , "body" ).text
378
397
rt .end ()
379
398
380
399
@@ -404,6 +423,7 @@ def clickElement(para, loopElement=None, clickPath="", index=0):
404
423
recordLog (str (e ))
405
424
time .sleep (0.5 ) # 点击之后等半秒
406
425
Log ("点击之后等待0.5秒" )
426
+ time .sleep (random .uniform (1 , 10 )) # 生成一个a到b的小数等待时间
407
427
if tempHandleNum != len (browser .window_handles ): # 如果有新标签页的行为发生
408
428
browser .switch_to .window (browser .window_handles [- 1 ]) # 跳转到新的标签页
409
429
history ["handle" ] = browser .current_window_handle
@@ -425,15 +445,15 @@ def clickElement(para, loopElement=None, clickPath="", index=0):
425
445
if containJudge : # 有判断语句才执行以下操作
426
446
global bodyText # 每次执行点击,输入元素和打开网页操作后,需要更新bodyText
427
447
try :
428
- bodyText = browser .find_element (By .CSS_SELECTOR ,"body" ).text
448
+ bodyText = browser .find_element (By .CSS_SELECTOR , "body" ).text
429
449
except TimeoutException :
430
450
Log ('time out after 10 seconds when getting body text' )
431
451
recordLog ('time out after 10 seconds when getting body text' )
432
452
browser .execute_script ('window.stop()' )
433
453
time .sleep (1 )
434
454
Log ("bodytext等待1秒" )
435
455
# 再执行一遍
436
- bodyText = browser .find_element (By .CSS_SELECTOR ,"body" ).text
456
+ bodyText = browser .find_element (By .CSS_SELECTOR , "body" ).text
437
457
rt .end ()
438
458
except Exception as e :
439
459
Log (e )
@@ -442,7 +462,7 @@ def clickElement(para, loopElement=None, clickPath="", index=0):
442
462
443
463
444
464
# 提取数据事件
445
- def getData (para , loopElement , isInLoop = True , parentPath = "" , index = 0 ):
465
+ def getData (para , loopElement , isInLoop = True , parentPath = "" , index = 0 ):
446
466
if not isInLoop and para ["wait" ] == 0 :
447
467
time .sleep (1 ) # 如果提取数据字段不在循环内而且设置的等待时间为0,默认等待1秒
448
468
Log ("提取数据等待1秒" )
@@ -454,12 +474,14 @@ def getData(para, loopElement, isInLoop=True, parentPath="", index = 0):
454
474
if p ["relativeXpath" ] == "" : # 相对xpath有时候就是元素本身,不需要二次查找
455
475
element = loopElement
456
476
else :
457
- if p ["relativeXpath" ].find ("//" )>= 0 : # 如果字串里有//即子孙查找,则不动语句
458
- full_path = "(" + parentPath + p ["relativeXpath" ] + ")" + "[" + str (index + 1 ) + "]"
477
+ if p ["relativeXpath" ].find ("//" ) >= 0 : # 如果字串里有//即子孙查找,则不动语句
478
+ full_path = "(" + parentPath + \
479
+ p ["relativeXpath" ] + ")" + \
480
+ "[" + str (index + 1 ) + "]"
459
481
element = browser .find_element (By .XPATH , full_path )
460
482
else :
461
483
element = loopElement .find_element (By .XPATH ,
462
- p ["relativeXpath" ][1 :])
484
+ p ["relativeXpath" ][1 :])
463
485
else :
464
486
element = browser .find_element (By .XPATH , p ["relativeXpath" ])
465
487
except NoSuchElementException : # 找不到元素的时候,使用默认值
@@ -638,6 +660,7 @@ def clean():
638
660
if __name__ == '__main__' :
639
661
options = Options ()
640
662
exe_path = "chromedriver.exe"
663
+ option = webdriver .ChromeOptions ()
641
664
if os .path .exists (os .getcwd ()+ "/ServiceWrapper" ):
642
665
print ("Finding chromedriver in ServiceWrapper" ,
643
666
os .getcwd ()+ "/ServiceWrapper" )
@@ -651,11 +674,37 @@ def clean():
651
674
elif os .getcwd ().find ("ExecuteStage" ) >= 0 : # 如果直接执行
652
675
print ("Finding chromedriver in ServiceWrapper" ,
653
676
os .getcwd ()+ "/Debug" )
654
- options .binary_location = "./Chrome/chrome.exe" # 指定chrome位置
655
- exe_path = "./Chrome/chromedriver.exe"
677
+ option .binary_location = "./Application/chrome.exe" # 指定chrome位置
678
+ # option.binary_location = "C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe"
679
+ exe_path = "./Application/chromedriver.exe"
656
680
else :
657
681
options .binary_location = "chrome.exe" # 指定chrome位置
658
- browser = webdriver .Chrome (options = options , executable_path = exe_path )
682
+
683
+ option .add_experimental_option (
684
+ 'excludeSwitches' , ['enable-automation' ]) # 以开发者模式
685
+
686
+ # user_data_dir = r'' # 注意没有Default!
687
+
688
+ # options.add_argument('--user-data-dir='+p)
689
+
690
+ # 总结:
691
+ # 0. 带Cookie需要用userdatadir
692
+ # 1. chrome_options才是配置用户文件和chrome文件地址的正确选项
693
+ # 2. User Profile文件夹的路径是:C:\Users\用户名\AppData\Local\Google\Chrome\User Data不要加Default
694
+ # 3. 就算User Profile相同,chrome版本不同所存储的cookie信息也不同,也不能爬
695
+ # 4. TMALL如果一直弹出验证码,而且无法通过验证,那么需要在其他浏览器上用
696
+
697
+ option .add_argument (
698
+ '--user-data-dir=C:\\ Users\\ q9823\\ AppData\\ Local\\ Google\\ Chrome\\ User Data' ) # TMALL 反扒
699
+ option .add_argument ("--profile-directory=Default" )
700
+ # options.add_argument(
701
+ # '--user-data-dir=C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\User Data') # TMALL 反扒
702
+ option .add_argument (
703
+ "--disable-blink-features=AutomationControlled" ) # TMALL 反扒
704
+ print (options )
705
+ browser = webdriver .Chrome (
706
+ options = options , chrome_options = option , executable_path = exe_path )
707
+ wait = WebDriverWait (browser , 10 )
659
708
browser .get ('about:blank' )
660
709
browser .set_page_load_timeout (10 ) # 加载页面最大超时时间
661
710
browser .set_script_timeout (10 )
@@ -675,7 +724,25 @@ def clean():
675
724
else :
676
725
backEndAddress = "http://servicewrapper.naibo.wang"
677
726
678
- content = requests .get (backEndAddress + "/backEnd/queryTask?id=" + str (id ))
727
+ # TODO when transfer to electron, use commandline-config
728
+ config = {
729
+ "type" : "remote" ,
730
+ }
731
+ from commandline_config import Config
732
+ c = Config (config )
733
+ co = c
734
+ co = {"type" : "remote" }
735
+ if len (sys .argv ) > 4 :
736
+ co = sys .argv [4 ]
737
+ if co ["type" ] == "remote" :
738
+ print ("remote" )
739
+ content = requests .get (
740
+ backEndAddress + "/backEnd/queryTask?id=" + str (id ))
741
+ service = json .loads (content .text )
742
+ else :
743
+ print ("local" )
744
+ with open ("tasks/" + str (id ) + ".json" , 'r' , encoding = 'utf-8' ) as f :
745
+ content = f .read ()
679
746
service = json .loads (content .text ) # 加载服务信息
680
747
print ("name:" , service ["name" ])
681
748
procedure = service ["graph" ] # 程序执行流程
0 commit comments