From eb12da38cdb703531fab9e7e938cfee113c15ac5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?=
Date: Fri, 16 Aug 2024 17:36:02 +0200
Subject: [PATCH 1/2] doc: Specify how to use env vars and whitelist configuration

---
 README.md  | 10 +++++++---
 app/app.py |  5 +++++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 70b4fbb..b22a140 100644
--- a/README.md
+++ b/README.md
@@ -112,10 +112,14 @@ scrape-it-now index run --help
 
 ### Source environment variables
 
-To configure easily the CLI, source environment variables from a `.env` file.
-
-For example:
+To easily configure the CLI, source environment variables from a `.env` file. For example, for the `--azure-storage-connection-string` option:
 
 ```bash
 AZURE_STORAGE_CONNECTION_STRING=xxx
 ```
+
+For arguments that accept multiple values, use a space-separated list. For example, for the `--whitelist` option:
+
+```bash
+WHITELIST=learn\.microsoft\.com,^/(?!en-us).*,^/[^/]+/answers/,^/[^/]+/previous-versions/ go\.microsoft\.com,.*
+```

diff --git a/app/app.py b/app/app.py
index 1cb262a..2971127 100644
--- a/app/app.py
+++ b/app/app.py
@@ -64,6 +64,7 @@ def scrape() -> None:
     "--whitelist",
     "-w",
     envvar="WHITELIST",
+    help="Comma-separated list of domains and paths to whitelist. For example, to whitelist nytimes.com and keep only its pages from 2024, use 'nytimes.com,^/2024/'.",
     multiple=True,
     type=str,
 )
@@ -177,6 +178,8 @@ async def scrape_run(
     for v in viewport:
         width, height = v.split("x")
         viewports_parsed.append((int(width), int(height)))
+    if viewports_parsed:
+        logger.info("Viewports: %s", viewports_parsed)
 
     # Parse whitelist
     whitelist_parsed: dict[re.Pattern, list[re.Pattern]] = {}
@@ -190,6 +193,8 @@ async def scrape_run(
         for path in w.split(",")[1:]:
             path = re.compile(path.strip())
             whitelist_parsed[domain].append(path)
+    if whitelist_parsed:
+        logger.info("Whitelist: %s", whitelist_parsed)
 
     await scrape_backend_run(
         cache_refresh=cache_refresh,

From 1471163523aab20ce489f448c8d4c121e117fd5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?=
Date: Fri, 16 Aug 2024 17:36:09 +0200
Subject: [PATCH 2/2] quality: Code lint

---
 app/helpers/persistence.py | 16 +++++++---------
 app/index.py               | 17 ++++++-----------
 app/models/state.py        |  3 ++-
 app/scrape.py              | 20 +++++++++++++++-----
 4 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/app/helpers/persistence.py b/app/helpers/persistence.py
index bff8595..1249f95 100644
--- a/app/helpers/persistence.py
+++ b/app/helpers/persistence.py
@@ -1,12 +1,14 @@
-from app.helpers.logging import logger
+from contextlib import asynccontextmanager
+from typing import AsyncGenerator
+
 from azure.core.credentials import AzureKeyCredential
 from azure.core.exceptions import ResourceExistsError
 from azure.search.documents.aio import SearchClient
 from azure.storage.blob.aio import BlobServiceClient, ContainerClient
 from azure.storage.queue.aio import QueueClient, QueueServiceClient
-from contextlib import asynccontextmanager
 from openai import AsyncAzureOpenAI
-from typing import AsyncGenerator
+
+from app.helpers.logging import logger
 
 
 @asynccontextmanager
@@ -55,9 +57,7 @@ async def blob_client(
     """
     Get the Azure Blob Storage client.
""" - async with BlobServiceClient.from_connection_string( - connection_string - ) as x: + async with BlobServiceClient.from_connection_string(connection_string) as x: client = x.get_container_client(container) # Create if it does not exist @@ -79,9 +79,7 @@ async def queue_client( """ Get the Azure Queue Storage client. """ - async with QueueServiceClient.from_connection_string( - connection_string - ) as x: + async with QueueServiceClient.from_connection_string(connection_string) as x: client = x.get_queue_client(queue) # Create if it does not exist diff --git a/app/index.py b/app/index.py index d470fe3..e15bd03 100644 --- a/app/index.py +++ b/app/index.py @@ -186,10 +186,7 @@ def _markdown_chunck( The text is split by Markdown headings, and each chunk is as big as possible without exceeding the max_tokens limit. """ contents = [] - if ( - _count_tokens(text) - < max_tokens - ): # If the text is small enough + if _count_tokens(text) < max_tokens: # If the text is small enough contents.append(text) return contents @@ -266,10 +263,7 @@ def _rebuild_headings() -> str: ).strip() # Chunck if is still too big - current_cleaned_count = math.ceil( - _count_tokens(current_cleaned) - / max_tokens - ) + current_cleaned_count = math.ceil(_count_tokens(current_cleaned) / max_tokens) current_cleaned_chunck_size = math.ceil( len(current_cleaned) / current_cleaned_count ) @@ -300,8 +294,7 @@ def _rebuild_headings() -> str: current_chunk += f"### {last_h3_head}\n" for h4_head, h4_content in h3_next.items(): if ( - _count_tokens(current_chunk) - >= max_tokens + _count_tokens(current_chunk) >= max_tokens ): # If the chunk is too big # Re-apply the last heading to the next chunk current_chunk = _split_paragraph( @@ -424,7 +417,9 @@ async def _worker( # TODO: Add a dead-letter queue # TODO: Add a retry mechanism # TODO: Narrow the exception type - logger.error("Error processing %s", blob_name, exc_info=True) + logger.error( + "Error processing %s", blob_name, exc_info=True + ) # Wait 3 sec to avoid spamming the queue if it is empty await asyncio.sleep(3) diff --git a/app/models/state.py b/app/models/state.py index eaf2c12..be21916 100644 --- a/app/models/state.py +++ b/app/models/state.py @@ -1,4 +1,5 @@ -from datetime import datetime, UTC +from datetime import UTC, datetime + from pydantic import BaseModel, Field diff --git a/app/scrape.py b/app/scrape.py index 02bbbc6..553ee14 100644 --- a/app/scrape.py +++ b/app/scrape.py @@ -3,7 +3,7 @@ from urllib.parse import urlparse from uuid import uuid4 -from azure.core.exceptions import ResourceNotFoundError, ResourceExistsError +from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError from azure.storage.blob.aio import ContainerClient from azure.storage.queue.aio import QueueClient from html2text import HTML2Text @@ -121,7 +121,9 @@ async def _add( # Skip if the previous attempt is too recent # Date is now and not the one from the model, on purposes. Otherwise, if its a cached model, the date would match the frefresher date every time. 
             if previous.created_at >= datetime.now(UTC) - cache_refresh:
-                logger.debug("Skipping %s due to recent attempt at %s", url, previous.created_at)
+                logger.debug(
+                    "Skipping %s due to recent attempt at %s", url, previous.created_at
+                )
                 return False
 
         except (ResourceNotFoundError, ValidationError):
@@ -395,8 +397,12 @@ async def _worker(
                 except ResourceExistsError:  # Wait for the lease to expire
                     logger.debug("Lease already exists, waiting")
                     await asyncio.sleep(1)
-                except ResourceNotFoundError:  # Create the blob if it does not exist
-                    logger.debug("State blob does not exist, creating an empty one")
+                except (
+                    ResourceNotFoundError
+                ):  # Create the blob if it does not exist
+                    logger.debug(
+                        "State blob does not exist, creating an empty one"
+                    )
                     await state_blob.upload_blob(
                         data=b"",
                         length=0,
@@ -421,7 +427,11 @@ async def _worker(
                 )
                 # Release the lease
                 await state_lease.release()
-            logger.info("Updated job state to %i processed and %i queued", state.processed, state.queued)
+            logger.info(
+                "Updated job state to %i processed and %i queued",
+                state.processed,
+                state.queued,
+            )
 
         # Wait 3 sec to avoid spamming the queue if it is empty
         await asyncio.sleep(3)
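
As a companion to the whitelist format documented in the README hunk above, here is a minimal, standalone sketch of how one `WHITELIST` entry per domain can be turned into the `dict[re.Pattern, list[re.Pattern]]` structure that `scrape_run` builds and now logs. The `parse_whitelist` helper and the explicit split on spaces are illustrative assumptions for this sketch, not code from the patch.

```python
import re


def parse_whitelist(entries: list[str]) -> dict[re.Pattern, list[re.Pattern]]:
    """Hypothetical helper mirroring the parsing loop shown in scrape_run."""
    parsed: dict[re.Pattern, list[re.Pattern]] = {}
    for entry in entries:
        # First comma-separated element is the domain pattern
        parts = entry.split(",")
        domain = re.compile(parts[0].strip())
        if domain not in parsed:
            parsed[domain] = []
        # Remaining elements are path patterns allowed for that domain
        for path in parts[1:]:
            parsed[domain].append(re.compile(path.strip()))
    return parsed


# The README example: per the docs, arguments that accept multiple values take a
# space-separated list, so each space-separated chunk is one whitelist entry.
raw = r"learn\.microsoft\.com,^/(?!en-us).*,^/[^/]+/answers/,^/[^/]+/previous-versions/ go\.microsoft\.com,.*"
for domain, paths in parse_whitelist(raw.split(" ")).items():
    print(domain.pattern, [p.pattern for p in paths])
```

Each entry keeps its first comma-separated element as the domain pattern and the remaining elements as path patterns, matching the parsing loop shown in the `scrape_run` hunk of `app/app.py`.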