request_module.py
import threading
import requests
import re
from variables import POWERS, REGEX_PATTERN
import time
from concurrent.futures import ThreadPoolExecutor
import os
from queue import Queue, Empty

# Hashes of URLs that have already been seen, so each link is crawled only once.
HASH_CODES = set()
MODULO = 2147483647
# URLs waiting to be fetched by the worker threads.
LINKS_LIST = Queue()
outFILE = open('links.txt', 'w')
# Guards HASH_CODES and outFILE, which are shared by the worker threads.
SEEN_LOCK = threading.Lock()
# File extensions to skip (static assets rather than crawlable pages).
EXCEPTIONS = {'js', 'png', 'jpg', 'css', 'jpeg', 'webp', 'xml'}
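# The companion module `variables` is not shown in this file. A minimal sketch of what
# it is assumed to provide (names taken from the import above; the exact values in the
# repository may differ):
#
#     # variables.py (hypothetical sketch)
#     BASE = 31
#     # Precomputed powers of BASE modulo 2147483647, one per character position.
#     POWERS = [pow(BASE, i, 2147483647) for i in range(10_000)]
#     # Pattern matching absolute http(s) URLs inside an HTML page.
#     REGEX_PATTERN = r'https?://[^\s"\'<>]+'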
def CreateHash(link: str) -> int:
    """Polynomial rolling hash of a URL; '/' is skipped, so URLs differing only in slashes hash the same."""
    hashCode = 0
    i = 0
    for ch in link:
        if ch != '/':
            hashCode = (hashCode + ord(ch) * POWERS[i]) % MODULO
            i += 1
    return hashCode
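# Example: for 'a/b' the '/' is skipped, so the hash is
# (ord('a') * POWERS[0] + ord('b') * POWERS[1]) % MODULO.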
def GetSufix(string: str) -> str:
    """Return the file extension found in the last five characters of a URL, or None."""
    aux = string[-5:]
    dot = aux.rfind('.')
    if dot != -1:
        return aux[dot + 1:]
    return None
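# Example: GetSufix('https://example.com/logo.png') returns 'png'; a URL with no dot
# in its last five characters returns None and is therefore never filtered out.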
def GetSessionLinks(session, base_url: str):
    """Fetch one page and queue every new, non-asset link found on it."""
    try:
        page = session.get(base_url, timeout=10).text
    except requests.RequestException:
        return
    for link in re.finditer(REGEX_PATTERN, page):
        url = link.group()
        hash_code = CreateHash(url)
        with SEEN_LOCK:
            if hash_code in HASH_CODES:
                continue
            if GetSufix(url) in EXCEPTIONS:
                continue
            HASH_CODES.add(hash_code)
            LINKS_LIST.put(url)
            outFILE.write(url + '\n')
def StartCrawler(base_link):
    """Seed the queue with the base link and dispatch crawl jobs to a thread pool."""
    session = requests.Session()
    LINKS_LIST.put(base_link)
    HASH_CODES.add(CreateHash(base_link))
    with ThreadPoolExecutor(max_workers=35) as executor:
        while True:
            try:
                # Wait briefly for the next URL; workers may still be adding more.
                target_url = LINKS_LIST.get(timeout=5)
            except Empty:
                continue
            executor.submit(GetSessionLinks, session, target_url)
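# Note: the dispatch loop above has no exit condition of its own; the run is bounded
# by the timer in the __main__ block below, which ends the process with os._exit().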
if __name__ == '__main__':
    # data.txt: first line is the start URL, second line is the run time in seconds.
    with open('data.txt', 'r') as inFILE:
        base_link = inFILE.readline().rstrip()
        timeout = int(inFILE.readline())
    new_thread = threading.Thread(target=StartCrawler, args=[base_link])
    new_thread.start()
    # Let the crawler run for the requested time, flush the results, then stop everything.
    time.sleep(timeout)
    outFILE.flush()
    os._exit(0)
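# Usage sketch (assumed invocation; output goes to links.txt in the working directory):
#
#     $ cat data.txt
#     https://example.com
#     30
#     $ python request_module.py
#
# The crawler runs for the number of seconds given on the second line of data.txt and
# then exits via os._exit(), so outFILE is flushed explicitly before that call.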