Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion backend/btrixcloud/basecrawls.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,9 +464,11 @@ async def _resolve_crawl_refs(
raise HTTPException(status_code=400, detail="missing_org")

if hasattr(crawl, "profileid") and crawl.profileid:
crawl.profileName = await self.crawl_configs.profiles.get_profile_name(
profile = await self.crawl_configs.profiles.get_profile(
crawl.profileid, org
)
if profile:
crawl.profileName = profile.name

if (
files
Expand Down
53 changes: 31 additions & 22 deletions backend/btrixcloud/crawlconfigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,23 +265,18 @@ async def add_crawl_config(
proxy_id = config_in.proxyId

profileid = None
# ensure profile is valid, get proxy_id from profile
if isinstance(config_in.profileid, UUID):
profileid = config_in.profileid

# ensure profile is valid, get proxy_id from profile
if profileid:
profile = await self.profiles.get_profile(profileid, org)
proxy_id = profile.proxyId
proxy_id = None
else:
if config_in.config and config_in.config.failOnContentCheck:
raise HTTPException(
status_code=400, detail="fail_on_content_check_requires_profile"
)

# ensure proxy_id is valid and available for org
if proxy_id:
if not self.can_org_use_proxy(org, proxy_id):
raise HTTPException(status_code=404, detail="proxy_not_found")
self.assert_can_org_use_proxy(org, proxy_id)

if config_in.config.exclude:
exclude = config_in.config.exclude
Expand Down Expand Up @@ -602,7 +597,17 @@ async def update_crawl_config(
and ((not update.profileid) != (not orig_crawl_config.profileid))
)

changed = changed or (orig_crawl_config.proxyId != update.proxyId)
# either unsetting profile or no profile set on current config
no_profile = update.profileid == "" or (
update.profileid is None and not orig_crawl_config.profileid
)

changed = changed or (
no_profile
and update.proxyId is not None
and orig_crawl_config.proxyId != update.proxyId
and ((not update.proxyId) != (not orig_crawl_config.proxyId))
)

metadata_changed = self.check_attr_changed(orig_crawl_config, update, "name")
metadata_changed = metadata_changed or self.check_attr_changed(
Expand Down Expand Up @@ -633,8 +638,6 @@ async def update_crawl_config(
last_rev = ConfigRevision(**orig_dict)
last_rev = await self.config_revs.insert_one(last_rev.to_dict())

proxy_id = update.proxyId

# set update query
query = update.dict(exclude_unset=True)
query["modifiedBy"] = user.id
Expand All @@ -647,14 +650,15 @@ async def update_crawl_config(
# else, ensure its a valid profile
elif update.profileid:
profile = await self.profiles.get_profile(cast(UUID, update.profileid), org)
self.assert_can_org_use_proxy(org, profile.proxyId)
query["profileid"] = update.profileid
proxy_id = profile.proxyId
# don't change the proxy if profile is set, as it should match the profile proxy
elif orig_crawl_config.profileid:
proxy_id = None

if proxy_id is not None:
query["proxyId"] = proxy_id
if no_profile:
if update.proxyId == "":
query["proxyId"] = None
elif update.proxyId:
self.assert_can_org_use_proxy(org, update.proxyId)
query["proxyId"] = update.proxyId

if update.config is not None:
query["config"] = update.config.dict()
Expand Down Expand Up @@ -1025,9 +1029,10 @@ async def get_crawl_config_out(self, cid: UUID, org: Organization):
await self._add_running_curr_crawl_stats(crawlconfig)

if crawlconfig.profileid:
crawlconfig.profileName = await self.profiles.get_profile_name(
crawlconfig.profileid, org
)
profile = await self.profiles.get_profile(crawlconfig.profileid, org)
if profile:
crawlconfig.profileName = profile.name
crawlconfig.proxyId = profile.proxyId

crawlconfig.config.seeds = None

Expand Down Expand Up @@ -1241,8 +1246,7 @@ async def run_now_internal(
else:
profile_filename = ""

if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
raise HTTPException(status_code=404, detail="proxy_not_found")
self.assert_can_org_use_proxy(org, crawlconfig.proxyId)

storage_filename = (
crawlconfig.crawlFilenameTemplate or self.default_filename_template
Expand Down Expand Up @@ -1418,6 +1422,11 @@ def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> boo
_proxy.shared and org.allowSharedProxies
) or _proxy.id in org.allowedProxies

def assert_can_org_use_proxy(self, org: Organization, proxy: Optional[str]):
    """Raise an HTTP error if *proxy* is set but unavailable to *org*.

    A falsy ``proxy`` (None or empty string) is treated as "no proxy
    requested" and passes silently.

    # NOTE(review): the pre-existing inline checks this helper replaces
    # (add_crawl_config / run_now_internal) raised 404 for
    # "proxy_not_found"; this helper uses 400 — confirm the status-code
    # change is intentional.
    """
    if not proxy:
        return
    if self.can_org_use_proxy(org, proxy):
        return
    raise HTTPException(status_code=400, detail="proxy_not_found")

def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
"""Generate WARC prefix slug from org slug, name or url
if no name is provided, hostname is used from url, otherwise
Expand Down
2 changes: 1 addition & 1 deletion backend/btrixcloud/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
) = PageOps = BackgroundJobOps = FileUploadOps = CrawlLogOps = CrawlManager = object


CURR_DB_VERSION = "0052"
CURR_DB_VERSION = "0054"


# ============================================================================
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
Migration 0054 -- clear proxyId on workflows that have profile set
using proxyId from profile always
"""

from btrixcloud.migrations import BaseMigration


MIGRATION_VERSION = "0054"


class Migration(BaseMigration):
    """Migration 0054: unset proxyId on workflows that have a profile set.

    From this version on, a workflow with a profile always uses the
    profile's own proxyId, so a workflow-level proxyId alongside a
    profileid is redundant and potentially conflicting.
    """

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

    async def migrate_up(self):
        """Perform migration up.

        Unset proxyId on crawl_configs documents that have a profileid set.
        """
        crawl_configs = self.mdb["crawl_configs"]

        # Clear proxyId wherever a profile is set — the profile's proxyId
        # takes precedence from this version on.
        # (Fixed stale copy-pasted comment about "non-public collections".)
        try:
            await crawl_configs.update_many(
                {"profileid": {"$ne": None}, "proxyId": {"$ne": None}},
                {"$set": {"proxyId": None}},
            )
        # Best-effort: log and continue rather than abort startup.
        # pylint: disable=broad-exception-caught
        except Exception as err:
            print(
                f"Error updating crawl_configs: {err}",
                flush=True,
            )
2 changes: 2 additions & 0 deletions backend/btrixcloud/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2428,6 +2428,8 @@ class ProfileFile(BaseFile):
class Profile(BaseMongoModel):
"""Browser profile"""

id: UUID

name: str
description: Optional[str] = ""

Expand Down
4 changes: 4 additions & 0 deletions backend/btrixcloud/orgs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1300,6 +1300,10 @@ async def import_org(
if not workflow.get("crawlerChannel"):
workflow["crawlerChannel"] = "default"

# Ensure proxyId is unset if profile is set
if workflow.get("profileid"):
workflow["proxyId"] = None

crawl_config = CrawlConfig.from_dict(workflow)
await self.crawl_configs_db.insert_one(crawl_config.to_dict())

Expand Down
6 changes: 6 additions & 0 deletions chart/test/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ operator_resync_seconds: 3

qa_scale: 2

# lower storage sizes
redis_storage: "100Mi"
profile_browser_workdir_size: "100Mi"
crawler_storage: "1Gi"


# for testing only
crawler_extra_cpu_per_browser: 300m

Expand Down
Loading