utils: Add crawlers for testing for crashes/performance #373

Open · wants to merge 1 commit into master
75 changes: 75 additions & 0 deletions utils/crawl-locust.py
@@ -0,0 +1,75 @@
import random
from locust import HttpUser, between, task
from bs4 import BeautifulSoup

# To run: python3 -m locust -f utils/crawl-locust.py
# Dynamically querying projects and versions is not implemented; remember to replace
# the data in the `projects` attribute below with a subset of data from the tested instance.
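# A headless run might look like the following (host, user count, and duration
# are illustrative values, not part of this change):
#   python3 -m locust -f utils/crawl-locust.py --headless -u 10 -r 2 \
#       --run-time 10m --host http://localhost:8080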

class ElixirUser(HttpUser):
    wait_time = between(1, 10)
    projects = (
        ("linux", (
            "v6.9.4",
            "v6.8",
            "v6.2",
            "v5.14.15",
            "v5.9",
            "v5.4",
            "v4.17",
            "v4.10.11",
            "v4.6",
            "v3.15",
            "v3.5.6",
            "v3.1",
        )),
        ("musl", (
            "v1.2.5",
        )),
        ("zephyr", (
            "v3.7.0",
            "v3.4.0",
            "v3.0.0",
            "v2.7.0",
            "v2.5.0",
            "v2.3.0",
            "v1.12.0",
            "v1.5.0",
        )),
    )

    def on_start(self):
        # Seed the shared URL list before the first task runs.
        self.urls = []
        self.wait()
        self.index_page()

    def parse_tree(self, phtml):
        # Collect links to directory and file entries from a source-tree page.
        links = phtml.find_all('a', class_='tree-icon')

        for link in links:
            link_url = f"{self.host}{link['href']}"
            self.urls.append(link_url)

    def parse_html(self, r):
        # Tree pages yield directory/file links, source pages yield identifier links.
        phtml = BeautifulSoup(r.content, 'html.parser')
        tree = phtml.find(class_='lxrtree')
        if tree is not None:
            self.parse_tree(phtml)
        else:
            idents = phtml.find_all(class_='ident')
            for i in idents:
                self.urls.append(f"{self.host}{i['href']}")

    @task(1)
    def index_page(self):
        # Load the top-level source tree of a random project/version and
        # restart URL collection from it.
        project, versions = random.choice(self.projects)
        version = random.choice(versions)
        r = self.client.get(f"/{project}/{version}/source")
        self.urls = []
        self.parse_html(r)

    @task(100)
    def load_random_source_page(self):
        if not self.urls:
            # No links collected yet (e.g. the index request failed); try again.
            self.index_page()
            return
        url = random.choice(self.urls)
        r = self.client.get(url)
        self.parse_html(r)

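The comment in crawl-locust.py notes that dynamically querying projects and versions is not implemented. A minimal sketch of how it could be done, assuming the instance links to its project/version pairs as ordinary /{project}/{version}/source URLs on the page served at the base URL (the parsing below is an illustration, not part of this change):

import re
import requests
from bs4 import BeautifulSoup

def discover_projects(base_url):
    # Fetch the landing page and collect every link that matches the
    # /{project}/{version}/source layout the crawler above already follows.
    html = BeautifulSoup(requests.get(base_url, timeout=30).text, 'html.parser')
    found = {}
    for a in html.find_all('a', href=True):
        m = re.match(r'^/([^/]+)/([^/]+)/source', a['href'])
        if m:
            found.setdefault(m.group(1), set()).add(m.group(2))
    return tuple((project, tuple(sorted(versions))) for project, versions in found.items())

The result has the same shape as the hard-coded `projects` tuple, so it could be assigned in `on_start` instead of editing the file by hand.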
52 changes: 52 additions & 0 deletions utils/crawl.py
@@ -0,0 +1,52 @@
import time
import logging
import sys
import random
from bs4 import BeautifulSoup
import requests

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print(f"usage: {sys.argv[0]} url project version")
        sys.exit(1)

    base_url = sys.argv[1]
    project = sys.argv[2]
    version = sys.argv[3]

    first_url = f"{base_url}/{project}/{version}/source"
    urls_set = set([first_url])
    urls_list = [first_url]

    while True:
        if not urls_list:
            # The crawl ran out of pending links; start over from the entry point.
            urls_set.add(first_url)
            urls_list.append(first_url)

        url_index = random.randint(0, len(urls_list)-1)
        url = urls_list.pop(url_index)

        try:
            req = requests.get(url, timeout=30)
        except Exception:
            logging.exception("request failed!")
            # Put the URL back so a transient failure does not drop it for good.
            urls_list.append(url)
            time.sleep(1)
            continue

        if req.status_code != 200:
            print("===== ERROR", url, req.status_code)

        phtml = BeautifulSoup(req.text, 'html.parser')

        tree = phtml.find(class_='lxrtree')
        if tree is not None:
            links = phtml.find_all('a', class_='tree-icon')

            for link in links:
                link_url = f"{base_url}{link['href']}"
                if link_url not in urls_set:
                    urls_set.add(link_url)
                    urls_list.append(link_url)

        urls_set.remove(url)

        duration = req.elapsed.total_seconds()
        print(url, req.status_code, duration, '' if duration < 1 else 'LONG')
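As a usage note, a run against a local test instance might be invoked as python3 utils/crawl.py http://localhost:8080 linux v6.9.4 (the host and version here are placeholders); the script then follows tree links at random indefinitely, printing each URL with its status code and response time and flagging requests slower than one second with LONG.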