diff --git a/utils/crawl-locust.py b/utils/crawl-locust.py
new file mode 100644
index 00000000..20378cfa
--- /dev/null
+++ b/utils/crawl-locust.py
@@ -0,0 +1,82 @@
+import random
+from locust import HttpUser, between, task
+from bs4 import BeautifulSoup
+
+# To run: python3 -m locust -f utils/crawl-locust.py
+# Dynamically querying projects and versions is not implemented; remember to
+# replace the data in the `projects` tuple with a subset of the versions
+# available on the tested instance.
+
+class ElixirUser(HttpUser):
+    wait_time = between(1, 10)
+    projects = (
+        ("linux", (
+            "v6.9.4",
+            "v6.8",
+            "v6.2",
+            "v5.14.15",
+            "v5.9",
+            "v5.4",
+            "v4.17",
+            "v4.10.11",
+            "v4.6",
+            "v3.15",
+            "v3.5.6",
+            "v3.1",
+        )),
+        ("musl", (
+            "v1.2.5",
+        )),
+        ("zephyr", (
+            "v3.7.0",
+            "v3.4.0",
+            "v3.0.0",
+            "v2.7.0",
+            "v2.5.0",
+            "v2.3.0",
+            "v1.12.0",
+            "v1.5.0",
+        )),
+    )
+
+    def on_start(self):
+        # Sleep once before the first request to stagger simulated users
+        self.wait()
+        self.index_page()
+
+    def parse_tree(self, phtml):
+        links = phtml.find_all('a', class_='tree-icon')
+
+        for link in links:
+            link_url = f"{self.host}{link['href']}"
+            self.urls.append(link_url)
+
+    def parse_html(self, r):
+        # Directory pages expose file-tree links, source pages identifier links
+        phtml = BeautifulSoup(r.content, 'html.parser')
+        tree = phtml.find(class_='lxrtree')
+        if tree is not None:
+            self.parse_tree(phtml)
+        else:
+            idents = phtml.find_all(class_='ident')
+            for i in idents:
+                self.urls.append(f"{self.host}{i['href']}")
+
+    @task(1)
+    def index_page(self):
+        project, versions = random.choice(self.projects)
+        version = random.choice(versions)
+        r = self.client.get(f"/{project}/{version}/source")
+        self.urls = []
+        self.parse_html(r)
+
+    @task(100)
+    def load_random_source_page(self):
+        # No links collected yet (or the last page had none): load an index page instead
+        if not self.urls:
+            self.index_page()
+            return
+        url = random.choice(self.urls)
+        r = self.client.get(url)
+        self.parse_html(r)
+
diff --git a/utils/crawl.py b/utils/crawl.py
new file mode 100644
index 00000000..a93f3511
--- /dev/null
+++ b/utils/crawl.py
@@ -0,0 +1,57 @@
+# Randomly crawls an Elixir instance starting from a project's top source page,
+# printing the status code and timing of every request.
+# To run: python3 utils/crawl.py url project version
+
+import time
+import logging
+import sys
+import random
+from bs4 import BeautifulSoup
+import requests
+
+if __name__ == "__main__":
+    if len(sys.argv) != 4:
+        print(f"usage: {sys.argv[0]} url project version")
+        sys.exit(1)
+
+    base_url = sys.argv[1]
+    project = sys.argv[2]
+    version = sys.argv[3]
+
+    first_url = f"{base_url}/{project}/{version}/source"
+    urls_set = set([first_url])
+    urls_list = [first_url]
+
+    # Stop once every discovered URL has been visited
+    while urls_list:
+        url_index = random.randint(0, len(urls_list)-1)
+        url = urls_list.pop(url_index)
+        # Drop it from the set too, so a failed request cannot leave a stale entry
+        urls_set.remove(url)
+
+        try:
+            req = requests.get(url, timeout=30)
+        except Exception:
+            logging.exception("request failed!")
+            time.sleep(1)
+            continue
+
+        if req.status_code != 200:
+            print("===== ERROR", url, req.status_code)
+
+        phtml = BeautifulSoup(req.text, 'html.parser')
+
+        tree = phtml.find(class_='lxrtree')
+        if tree is not None:
+            # Directory page: queue every file-tree entry not seen before
+            links = phtml.find_all('a', class_='tree-icon')
+
+            for link in links:
+                link_url = f"{base_url}{link['href']}"
+                if link_url not in urls_set:
+                    urls_set.add(link_url)
+                    urls_list.append(link_url)
+
+        duration = req.elapsed.total_seconds()
+        print(url, req.status_code, duration, '' if duration < 1 else 'LONG')
+
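Usage notes: the host, project, and version below are placeholders for whichever instance is under test, not values shipped with this patch.

    # Locust load generator; --host points it at the tested instance and the
    # web UI Locust starts (port 8089 by default) launches the swarm:
    python3 -m locust -f utils/crawl-locust.py --host https://elixir.example.org

    # Standalone crawler; arguments match the script's usage string:
    python3 utils/crawl.py https://elixir.example.org linux v6.9.4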
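For the dynamic querying of projects and versions that crawl-locust.py leaves unimplemented, one possible shape is a helper that scrapes version links from a project's source page, so the hard-coded version tuples can be generated at startup. This is a sketch under assumptions, not part of the patch: `fetch_versions` is a hypothetical name, the `latest` version shortcut is assumed to resolve on the instance, and the 'versions' CSS class is a guess that would need checking against the real template.

    import requests
    from bs4 import BeautifulSoup

    def fetch_versions(base_url, project):
        # Hypothetical helper: scrape version links from the project's
        # "latest" source page. The 'versions' class below is an assumed
        # selector; adjust it to the markup of the tested instance.
        r = requests.get(f"{base_url}/{project}/latest/source", timeout=30)
        r.raise_for_status()
        phtml = BeautifulSoup(r.text, 'html.parser')
        menu = phtml.find(class_='versions')
        if menu is None:
            return ()
        return tuple(a.get_text(strip=True) for a in menu.find_all('a'))

    # e.g.: projects = tuple((p, fetch_versions(host, p))
    #                        for p in ("linux", "musl", "zephyr"))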