From eb12da38cdb703531fab9e7e938cfee113c15ac5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?=
Date: Fri, 16 Aug 2024 17:36:02 +0200
Subject: [PATCH 1/2] doc: Specify how to use env vars and whitelist configuration

---
 README.md  | 10 +++++++---
 app/app.py |  5 +++++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 70b4fbb..b22a140 100644
--- a/README.md
+++ b/README.md
@@ -112,10 +112,14 @@ scrape-it-now index run --help
 
 ### Source environment variables
 
-To configure easily the CLI, source environment variables from a `.env` file.
-
-For example:
+To easily configure the CLI, source environment variables from a `.env` file. For example, for the `--azure-storage-connection-string` option:
 
 ```bash
 AZURE_STORAGE_CONNECTION_STRING=xxx
 ```
+
+For arguments that accept multiple values, use a space-separated list. For example, for the `--whitelist` option:
+
+```bash
+WHITELIST=learn\.microsoft\.com,^/(?!en-us).*,^/[^/]+/answers/,^/[^/]+/previous-versions/ go\.microsoft\.com,.*
+```

diff --git a/app/app.py b/app/app.py
index 1cb262a..2971127 100644
--- a/app/app.py
+++ b/app/app.py
@@ -64,6 +64,7 @@ def scrape() -> None:
     "--whitelist",
     "-w",
     envvar="WHITELIST",
+    help="Comma-separated list of domains and paths to whitelist. For example, to whitelist nytimes.com and keep only its pages from 2024, use 'nytimes.com,^/2024/'.",
     multiple=True,
     type=str,
 )
@@ -177,6 +178,8 @@ async def scrape_run(
     for v in viewport:
         width, height = v.split("x")
         viewports_parsed.append((int(width), int(height)))
+    if viewports_parsed:
+        logger.info("Viewports: %s", viewports_parsed)
 
     # Parse whitelist
     whitelist_parsed: dict[re.Pattern, list[re.Pattern]] = {}
@@ -190,6 +193,8 @@ async def scrape_run(
         for path in w.split(",")[1:]:
             path = re.compile(path.strip())
             whitelist_parsed[domain].append(path)
+    if whitelist_parsed:
+        logger.info("Whitelist: %s", whitelist_parsed)
 
     await scrape_backend_run(
         cache_refresh=cache_refresh,

From 1471163523aab20ce489f448c8d4c121e117fd5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?=
Date: Fri, 16 Aug 2024 17:36:09 +0200
Subject: [PATCH 2/2] quality: Code lint

---
 app/helpers/persistence.py | 16 +++++++---------
 app/index.py               | 17 ++++++-----------
 app/models/state.py        |  3 ++-
 app/scrape.py              | 20 +++++++++++++++-----
 4 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/app/helpers/persistence.py b/app/helpers/persistence.py
index bff8595..1249f95 100644
--- a/app/helpers/persistence.py
+++ b/app/helpers/persistence.py
@@ -1,12 +1,14 @@
-from app.helpers.logging import logger
+from contextlib import asynccontextmanager
+from typing import AsyncGenerator
+
 from azure.core.credentials import AzureKeyCredential
 from azure.core.exceptions import ResourceExistsError
 from azure.search.documents.aio import SearchClient
 from azure.storage.blob.aio import BlobServiceClient, ContainerClient
 from azure.storage.queue.aio import QueueClient, QueueServiceClient
-from contextlib import asynccontextmanager
 from openai import AsyncAzureOpenAI
-from typing import AsyncGenerator
+
+from app.helpers.logging import logger
 
 
 @asynccontextmanager
@@ -55,9 +57,7 @@ async def blob_client(
     """
     Get the Azure Blob Storage client.
""" - async with BlobServiceClient.from_connection_string( - connection_string - ) as x: + async with BlobServiceClient.from_connection_string(connection_string) as x: client = x.get_container_client(container) # Create if it does not exist @@ -79,9 +79,7 @@ async def queue_client( """ Get the Azure Queue Storage client. """ - async with QueueServiceClient.from_connection_string( - connection_string - ) as x: + async with QueueServiceClient.from_connection_string(connection_string) as x: client = x.get_queue_client(queue) # Create if it does not exist diff --git a/app/index.py b/app/index.py index d470fe3..e15bd03 100644 --- a/app/index.py +++ b/app/index.py @@ -186,10 +186,7 @@ def _markdown_chunck( The text is split by Markdown headings, and each chunk is as big as possible without exceeding the max_tokens limit. """ contents = [] - if ( - _count_tokens(text) - < max_tokens - ): # If the text is small enough + if _count_tokens(text) < max_tokens: # If the text is small enough contents.append(text) return contents @@ -266,10 +263,7 @@ def _rebuild_headings() -> str: ).strip() # Chunck if is still too big - current_cleaned_count = math.ceil( - _count_tokens(current_cleaned) - / max_tokens - ) + current_cleaned_count = math.ceil(_count_tokens(current_cleaned) / max_tokens) current_cleaned_chunck_size = math.ceil( len(current_cleaned) / current_cleaned_count ) @@ -300,8 +294,7 @@ def _rebuild_headings() -> str: current_chunk += f"### {last_h3_head}\n" for h4_head, h4_content in h3_next.items(): if ( - _count_tokens(current_chunk) - >= max_tokens + _count_tokens(current_chunk) >= max_tokens ): # If the chunk is too big # Re-apply the last heading to the next chunk current_chunk = _split_paragraph( @@ -424,7 +417,9 @@ async def _worker( # TODO: Add a dead-letter queue # TODO: Add a retry mechanism # TODO: Narrow the exception type - logger.error("Error processing %s", blob_name, exc_info=True) + logger.error( + "Error processing %s", blob_name, exc_info=True + ) # Wait 3 sec to avoid spamming the queue if it is empty await asyncio.sleep(3) diff --git a/app/models/state.py b/app/models/state.py index eaf2c12..be21916 100644 --- a/app/models/state.py +++ b/app/models/state.py @@ -1,4 +1,5 @@ -from datetime import datetime, UTC +from datetime import UTC, datetime + from pydantic import BaseModel, Field diff --git a/app/scrape.py b/app/scrape.py index 02bbbc6..553ee14 100644 --- a/app/scrape.py +++ b/app/scrape.py @@ -3,7 +3,7 @@ from urllib.parse import urlparse from uuid import uuid4 -from azure.core.exceptions import ResourceNotFoundError, ResourceExistsError +from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError from azure.storage.blob.aio import ContainerClient from azure.storage.queue.aio import QueueClient from html2text import HTML2Text @@ -121,7 +121,9 @@ async def _add( # Skip if the previous attempt is too recent # Date is now and not the one from the model, on purposes. Otherwise, if its a cached model, the date would match the frefresher date every time. 
             if previous.created_at >= datetime.now(UTC) - cache_refresh:
-                logger.debug("Skipping %s due to recent attempt at %s", url, previous.created_at)
+                logger.debug(
+                    "Skipping %s due to recent attempt at %s", url, previous.created_at
+                )
                 return False
 
         except (ResourceNotFoundError, ValidationError):
@@ -395,8 +397,12 @@ async def _worker(
                 except ResourceExistsError:  # Wait for the lease to expire
                     logger.debug("Lease already exists, waiting")
                     await asyncio.sleep(1)
-                except ResourceNotFoundError:  # Create the blob if it does not exist
-                    logger.debug("State blob does not exist, creating an empty one")
+                except (
+                    ResourceNotFoundError
+                ):  # Create the blob if it does not exist
+                    logger.debug(
+                        "State blob does not exist, creating an empty one"
+                    )
                     await state_blob.upload_blob(
                         data=b"",
                         length=0,
@@ -421,7 +427,11 @@ async def _worker(
                 )
                 # Release the lease
                 await state_lease.release()
-            logger.info("Updated job state to %i processed and %i queued", state.processed, state.queued)
+            logger.info(
+                "Updated job state to %i processed and %i queued",
+                state.processed,
+                state.queued,
+            )
 
         # Wait 3 sec to avoid spamming the queue if it is empty
         await asyncio.sleep(3)
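
As a companion to the whitelist format documented in the README hunk above, here is a minimal, standalone sketch of how one `WHITELIST` entry per domain can be turned into the `dict[re.Pattern, list[re.Pattern]]` structure that `scrape_run` builds and now logs. The `parse_whitelist` helper and the explicit split on spaces are illustrative assumptions for this sketch, not code from the patch.

```python
import re


def parse_whitelist(entries: list[str]) -> dict[re.Pattern, list[re.Pattern]]:
    """Hypothetical helper mirroring the parsing loop shown in scrape_run."""
    parsed: dict[re.Pattern, list[re.Pattern]] = {}
    for entry in entries:
        # First comma-separated element is the domain pattern
        parts = entry.split(",")
        domain = re.compile(parts[0].strip())
        if domain not in parsed:
            parsed[domain] = []
        # Remaining elements are path patterns allowed for that domain
        for path in parts[1:]:
            parsed[domain].append(re.compile(path.strip()))
    return parsed


# The README example: per the docs, arguments that accept multiple values take a
# space-separated list, so each space-separated chunk is one whitelist entry.
raw = r"learn\.microsoft\.com,^/(?!en-us).*,^/[^/]+/answers/,^/[^/]+/previous-versions/ go\.microsoft\.com,.*"
for domain, paths in parse_whitelist(raw.split(" ")).items():
    print(domain.pattern, [p.pattern for p in paths])
```

Each entry keeps its first comma-separated element as the domain pattern and the remaining elements as path patterns, matching the parsing loop shown in the `scrape_run` hunk of `app/app.py`.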