From 88c85d27e8f81dcdc6399b25df117cdca69165cb Mon Sep 17 00:00:00 2001
From: thewhiteh4t
Date: Sun, 27 Oct 2024 02:41:35 +0530
Subject: [PATCH] refactoring

---
 modules/crawler.py | 79 ++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 45 deletions(-)

diff --git a/modules/crawler.py b/modules/crawler.py
index cb0d87b..aca6ecb 100644
--- a/modules/crawler.py
+++ b/modules/crawler.py
@@ -19,20 +19,19 @@
 
 user_agent = {'User-Agent': 'FinalRecon'}
 
-total = []
-r_total = []
-sm_total = []
-js_total = []
-css_total = []
-int_total = []
-ext_total = []
-img_total = []
-js_crawl_total = []
-sm_crawl_total = []
-
 
 def crawler(target, protocol, netloc, output, data):
-    global r_url, sm_url
+    r_total = []
+    sm_total = []
+    css_total = []
+    js_total = []
+    int_total = []
+    ext_total = []
+    img_total = []
+    sm_crawl_total = []
+    js_crawl_total = []
+    total = []
+
     print(f'\n{Y}[!] Starting Crawler...{W}\n')
 
     try:
@@ -46,26 +45,27 @@ def crawler(target, protocol, netloc, output, data):
         if status == 200:
             page = rqst.content
             soup = bs4.BeautifulSoup(page, 'lxml')
-
             r_url = f'{protocol}://{netloc}/robots.txt'
             sm_url = f'{protocol}://{netloc}/sitemap.xml'
             base_url = f'{protocol}://{netloc}'
-
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
             tasks = asyncio.gather(
-                robots(r_url, base_url, data, output),
-                sitemap(sm_url, data, output),
-                css(target, data, soup, output),
-                js_scan(target, data, soup, output),
-                internal_links(target, data, soup, output),
-                external_links(target, data, soup, output),
-                images(target, data, soup, output),
-                sm_crawl(data, output),
-                js_crawl(data, output))
+                robots(r_url, r_total, sm_total, base_url, data, output),
+                sitemap(sm_url, sm_total, data, output),
+                css(target, css_total, data, soup, output),
+                js_scan(target, js_total, data, soup, output),
+                internal_links(target, int_total, data, soup, output),
+                external_links(target, ext_total, data, soup, output),
+                images(target, img_total, data, soup, output),
+                sm_crawl(data, sm_crawl_total, sm_total, sm_url, output),
+                js_crawl(data, js_crawl_total, js_total, output))
             loop.run_until_complete(tasks)
             loop.close()
-            stats(output, data, soup)
+            stats(output, r_total, sm_total, css_total, js_total,
+                  int_total, ext_total, img_total, sm_crawl_total,
+                  js_crawl_total, total, data, soup
+                  )
             log_writer('[crawler] Completed')
         else:
             print(f'{R}[-] {C}Status : {W}{status}')
@@ -102,8 +102,7 @@ def url_filter(target, link):
     return link
 
 
-async def robots(robo_url, base_url, data, output):
-    global r_total
+async def robots(robo_url, r_total, sm_total, base_url, data, output):
     print(f'{G}[+] {C}Looking for robots.txt{W}', end='', flush=True)
 
     try:
@@ -144,8 +143,7 @@
         log_writer(f'[crawler.robots] Exception = {exc}')
 
 
-async def sitemap(target_url, data, output):
-    global sm_total
+async def sitemap(target_url, sm_total, data, output):
     print(f'{G}[+] {C}Looking for sitemap.xml{W}', end='', flush=True)
     try:
         sm_rqst = requests.get(target_url, headers=user_agent, verify=False, timeout=10)
@@ -173,8 +171,7 @@
         log_writer(f'[crawler.sitemap] Exception = {exc}')
 
 
-async def css(target, data, soup, output):
-    global css_total
+async def css(target, css_total, data, soup, output):
     print(f'{G}[+] {C}Extracting CSS Links{W}', end='', flush=True)
 
     css_links = soup.find_all('link', href=True)
@@ -188,8 +185,7 @@
     exporter(data, output, css_total, 'css')
 
 
-async def js_scan(target, data, soup, output):
-    global js_total
+async def js_scan(target, js_total, data, soup, output):
     print(f'{G}[+] {C}Extracting Javascript Links{W}', end='', flush=True)
 
     scr_tags = soup.find_all('script', src=True)
@@ -205,8 +201,7 @@
     exporter(data, output, js_total, 'javascripts')
 
 
-async def internal_links(target, data, soup, output):
-    global int_total
+async def internal_links(target, int_total, data, soup, output):
     print(f'{G}[+] {C}Extracting Internal Links{W}', end='', flush=True)
 
     ext = tldextract.extract(target)
@@ -224,8 +219,7 @@
     exporter(data, output, int_total, 'internal_urls')
 
 
-async def external_links(target, data, soup, output):
-    global ext_total
+async def external_links(target, ext_total, data, soup, output):
     print(f'{G}[+] {C}Extracting External Links{W}', end='', flush=True)
 
     ext = tldextract.extract(target)
@@ -243,8 +237,7 @@
     exporter(data, output, ext_total, 'external_urls')
 
 
-async def images(target, data, soup, output):
-    global img_total
+async def images(target, img_total, data, soup, output):
     print(f'{G}[+] {C}Extracting Images{W}', end='', flush=True)
 
     image_tags = soup.find_all('img')
@@ -258,8 +251,7 @@
     exporter(data, output, img_total, 'images')
 
 
-async def sm_crawl(data, output):
-    global sm_crawl_total
+async def sm_crawl(data, sm_crawl_total, sm_total, sm_url, output):
     print(f'{G}[+] {C}Crawling Sitemaps{W}', end='', flush=True)
 
     threads = []
@@ -302,8 +294,7 @@ def fetch(site_url):
     exporter(data, output, sm_crawl_total, 'urls_inside_sitemap')
 
 
-async def js_crawl(data, output):
-    global js_crawl_total
+async def js_crawl(data, js_crawl_total, js_total, output):
     print(f'{G}[+] {C}Crawling Javascripts{W}', end='', flush=True)
 
     threads = []
@@ -347,9 +338,7 @@ def exporter(data, output, list_name, file_name):
     export(output, data)
 
 
-def stats(output, data, soup):
-    global total
-
+def stats(output, r_total, sm_total, css_total, js_total, int_total, ext_total, img_total, sm_crawl_total, js_crawl_total, total, data, soup):
     total.extend(r_total)
     total.extend(sm_total)
     total.extend(css_total)
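Note: the whole change follows one pattern: the per-category result lists that used to live as module globals are now created inside crawler() and passed explicitly into each coroutine and into stats(). A minimal sketch of that pattern in plain Python follows; it is an illustration only, not FinalRecon code, and the crawl / fetch_robots / fetch_sitemap names and the example URL are invented for the sketch.

import asyncio

async def fetch_robots(r_total, base_url):
    # Placeholder work: a real task would request base_url + '/robots.txt'
    # and append every discovered URL to the list it was handed.
    r_total.append(f'{base_url}/robots.txt')

async def fetch_sitemap(sm_total, base_url):
    sm_total.append(f'{base_url}/sitemap.xml')

def crawl(base_url):
    # Accumulators are created per call, so repeated runs no longer
    # share or pollute module-level state.
    r_total = []
    sm_total = []
    total = []

    # Same event-loop pattern the patch keeps in crawler().
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    tasks = asyncio.gather(
        fetch_robots(r_total, base_url),
        fetch_sitemap(sm_total, base_url))
    loop.run_until_complete(tasks)
    loop.close()

    # Aggregate the per-call results, mirroring what stats() does
    # with the lists it now receives as parameters.
    total.extend(r_total)
    total.extend(sm_total)
    return total

if __name__ == '__main__':
    print(crawl('https://example.com'))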