Add common perftest classes for data generation + use in confluence to demonstrate #1590

Merged
2 changes: 1 addition & 1 deletion Dockerfile.ftest
@@ -2,4 +2,4 @@ FROM python:3.10
COPY . /connectors
WORKDIR /connectors
RUN make clean install
RUN pip install -r requirements/ftest.txt
RUN bin/pip install -r requirements/ftest.txt
103 changes: 103 additions & 0 deletions tests/commons.py
@@ -3,6 +3,13 @@
# or more contributor license agreements. Licensed under the Elastic License 2.0;
# you may not use this file except in compliance with the Elastic License 2.0.
#
import math
from functools import cached_property
from random import choices

from faker import Faker


class AsyncIterator:
"""
Async documents generator fake class, which records the args and kwargs it was called with.
@@ -61,3 +68,99 @@ def assert_called_once_with(self, *args, **kwargs):
            raise AssertionError(
                f"Expected kwargs: {kwargs}. Actual kwargs: {self.call_kwargs[0]}."
            )


class WeightedFakeProvider:
    def __init__(self, seed=None, weights=None):
        self.seed = seed
        if weights and len(weights) != 4:
            raise Exception(
                f"Exactly 4 weights should be provided. Got {len(weights)}: {weights}"
            )
        self.weights = weights or [0.58, 0.3, 0.1, 0.02]

        if math.fsum(self.weights) != 1:
            raise Exception(
                f"Sum of weights should be equal to 1. Sum of provided weights {self.weights} is {math.fsum(self.weights)}"
            )
        self.fake_provider = FakeProvider(seed=seed)

    @cached_property
    def _texts(self):
        return [
            self.fake_provider.small_text(),
            self.fake_provider.medium_text(),
            self.fake_provider.large_text(),
            self.fake_provider.extra_large_text(),
        ]

    @cached_property
    def _htmls(self):
        return [
            self.fake_provider.small_html(),
            self.fake_provider.medium_html(),
            self.fake_provider.large_html(),
            self.fake_provider.extra_large_html(),
        ]

    def get_text(self):
        return choices(self._texts, self.weights)[0]

    def get_html(self):
        return choices(self._htmls, self.weights)[0]


class FakeProvider:
    def __init__(self, seed=None):
        self.seed = seed
        self.fake = Faker()
        if seed:
            self.fake.seed_instance(seed)

    @cached_property
    def _cached_random_str(self):
        return self.fake.pystr(min_chars=100 * 1024, max_chars=100 * 1024 + 1)

    def small_text(self):
        # Up to 1KB of text
        return self.generate_text(1 * 1024)

    def medium_text(self):
        # Up to 256KB of text
        return self.generate_text(256 * 1024)

    def large_text(self):
        # Up to 1MB of text
        return self.generate_text(1024 * 1024)

    def extra_large_text(self):
        return self.generate_text(20 * 1024 * 1024)

    def small_html(self):
        # Around 100KB
        return self.generate_html(1)

    def medium_html(self):
        # Around 1MB
        return self.generate_html(1 * 10)

    def large_html(self):
        # Around 8MB
        return self.generate_html(8 * 10)

    def extra_large_html(self):
        # Around 25MB
        return self.generate_html(25 * 10)

    def generate_text(self, max_size):
        return self.fake.text(max_nb_chars=max_size)

    def generate_html(self, images_of_100kb):
        img = self._cached_random_str  # 100kb
        text = self.small_text()

        images = []
        for _ in range(images_of_100kb):
            images.append(f"<img src='{img}'/>")

        return f"<html><head></head><body><div>{text}</div><div>{'<br/>'.join(images)}</div></body></html>"
2 changes: 1 addition & 1 deletion tests/sources/fixtures/confluence/docker-compose.yml
@@ -77,7 +77,7 @@ services:
    build:
      context: ../../../../
      dockerfile: Dockerfile.ftest
    command: python tests/sources/fixtures/confluence/fixture.py
    command: bin/python tests/sources/fixtures/confluence/fixture.py
    ports:
      - "9696:9696"
    volumes:
65 changes: 49 additions & 16 deletions tests/sources/fixtures/confluence/fixture.py
@@ -7,26 +7,50 @@
"""
import io
import os
import random
import string
import time

from flask import Flask, request

DATA_SIZE = os.environ.get("DATA_SIZE", "small").lower()
_SIZES = {"small": 1000000, "medium": 2000000, "large": 6000000}
FILE_SIZE = _SIZES[DATA_SIZE]
LARGE_DATA = "".join([random.choice(string.ascii_letters) for _ in range(FILE_SIZE)])
from tests.commons import WeightedFakeProvider

fake_provider = WeightedFakeProvider()

DATA_SIZE = os.environ.get("DATA_SIZE", "medium")

match DATA_SIZE:
    case "small":
        SPACE_COUNT = 100
        SPACE_OBJECT_COUNT = 100
        ATTACHMENT_COUNT = 3
    case "medium":
        SPACE_COUNT = 100
        SPACE_OBJECT_COUNT = 200
        ATTACHMENT_COUNT = 5
case "large":
SPACE_COUNT = 100
SPACE_OBJECT_COUNT = 250
ATTACHMENT_COUNT = 7


def get_num_docs():
    # The multiplier 2 is there because SPACE_OBJECTs are delivered twice:
    # the test returns SPACE_OBJECT_COUNT objects for each of the 2 content types:
    # - blogpost
    # - page
    # e.g. with DATA_SIZE=medium this prints 100 + 200 * 5 * 2 = 2100.
    print(SPACE_COUNT + SPACE_OBJECT_COUNT * ATTACHMENT_COUNT * 2)


class ConfluenceAPI:
    def __init__(self):
        self.app = Flask(__name__)
        self.space_start_at = 0
        self.space_page_limit = 100
        self.total_spaces = 4000
        self.total_content = 50
        self.total_spaces = SPACE_COUNT
        self.total_content = SPACE_OBJECT_COUNT
        self.attachment_start_at = 1
        self.attachment_end_at = 6
        self.attachment_end_at = self.attachment_start_at + ATTACHMENT_COUNT - 1
        self.attachments = {}

        self.app.route("/rest/api/space", methods=["GET"])(self.get_spaces)
        self.app.route("/rest/api/content/search", methods=["GET"])(self.get_content)
@@ -38,6 +62,10 @@ def __init__(self):
methods=["GET"],
)(self.download)

@self.app.before_request
def before_request():
time.sleep(0.05)

    def get_spaces(self):
        """Function to handle get spaces calls with pagination

@@ -111,7 +139,7 @@ def get_content(self):
"title": f"ES-scrum_{content_count}",
"type": document_type,
"history": {"lastUpdated": {"when": "2023-01-24T04:07:19.672Z"}},
"children": {"attachment": {"size": 5}},
"children": {"attachment": {"size": ATTACHMENT_COUNT}},
"body": {"storage": {"value": f"This is a test {document_type}"}},
"space": {"name": "Demo Space 0"},
"_links": {
@@ -134,20 +162,23 @@ def get_attachments(self, content_id):
"results": [],
"start": 0,
"limit": 100,
"size": 5,
"size": ATTACHMENT_COUNT,
"_links": {"next": None},
}

        for attachment_count in range(self.attachment_start_at, self.attachment_end_at):
            attachment_name = f"attachment_{content_id}_{attachment_count}.html"
            attachment_file = fake_provider.get_html()
            self.attachments[attachment_name] = attachment_file
            attachment = {
                "id": f"attachment_{content_id}_{attachment_count}",
                "title": f"attachment_{content_id}_{attachment_count}.py",
                "title": attachment_name,
                "type": "attachment",
                "version": {"when": "2023-01-03T09:24:50.633Z"},
                "extensions": {"fileSize": FILE_SIZE},
                "extensions": {"fileSize": len(attachment_file.encode("utf-8"))},
                "_links": {
                    "download": f"/download/attachments/{content_id}/attachment_{content_id}_{attachment_count}.py",
                    "webui": f"/pages/viewpageattachments.action?pageId={content_id}&preview=attachment_{content_id}_{attachment_count}.py",
                    "download": f"/download/attachments/{content_id}/attachment_{content_id}_{attachment_count}.html",
                    "webui": f"/pages/viewpageattachments.action?pageId={content_id}&preview=attachment_{content_id}_{attachment_count}.html",
                },
            }
            attachments["results"].append(attachment)
@@ -163,7 +194,9 @@ def download(self, content_id, attachment_id):
        Returns:
            data_reader (io.BytesIO): object of io.BytesIO.
        """
        data_reader = io.BytesIO(bytes(LARGE_DATA, encoding="utf-8"))
        data_reader = io.BytesIO(
            bytes(self.attachments[attachment_id], encoding="utf-8")
        )
        return data_reader


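Not part of the change, but a quick way to sanity-check the running fixture (the docker-compose service above publishes port 9696) might look like the sketch below; the exact query parameters each handler expects are outside the hunks shown here, so adjust as needed:

# Sketch only: assumes the mock Confluence API from fixture.py is reachable
# on localhost:9696, as mapped in docker-compose.yml above.
import requests

BASE_URL = "http://localhost:9696"

# Routes registered in ConfluenceAPI.__init__
for path in ("/rest/api/space", "/rest/api/content/search"):
    response = requests.get(f"{BASE_URL}{path}")
    print(path, response.status_code, len(response.content))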