
Commit

Merge branch 'develop'
clemlesne committed Aug 16, 2024
2 parents 1359faa + 1471163 commit 555ba0f
Showing 6 changed files with 42 additions and 29 deletions.
10 changes: 7 additions & 3 deletions README.md
@@ -112,10 +112,14 @@ scrape-it-now index run --help

### Source environment variables

To configure easily the CLI, source environment variables from a `.env` file.

For example:
To easily configure the CLI, source environment variables from a `.env` file. For example, for the `--azure-storage-connection-string` option:

```bash
AZURE_STORAGE_CONNECTION_STRING=xxx
```

For arguments that accept multiple values, use a space-separated list. For example, for the `--whitelist` option:

```bash
WHITELIST=learn\.microsoft\.com,^/(?!en-us).*,^/[^/]+/answers/,^/[^/]+/previous-versions/ go\.microsoft\.com,.*
```
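
For a rough illustration (assuming a POSIX shell; the exact `scrape run` arguments depend on your job), one way to load the file before invoking the CLI is to export everything it defines. Values that contain spaces, such as the `WHITELIST` example above, need to be quoted in the `.env` file for this to work:

```bash
# Export every variable defined in .env into the current shell,
# then run the CLI, which reads them through its environment variable bindings.
set -a
source .env
set +a
scrape-it-now scrape run --help
```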
5 changes: 5 additions & 0 deletions app/app.py
@@ -64,6 +64,7 @@ def scrape() -> None:
"--whitelist",
"-w",
envvar="WHITELIST",
help="Comma separated list of domains and paths to whitelist. Example, to filter nytimes.com and only the pages from 2024 pages, use 'nytimes.com,^/2024/'.",
multiple=True,
type=str,
)
@@ -177,6 +178,8 @@ async def scrape_run(
for v in viewport:
width, height = v.split("x")
viewports_parsed.append((int(width), int(height)))
if viewports_parsed:
logger.info("Viewports: %s", viewports_parsed)

# Parse whitelist
whitelist_parsed: dict[re.Pattern, list[re.Pattern]] = {}
@@ -190,6 +193,8 @@ async def scrape_run(
for path in w.split(",")[1:]:
path = re.compile(path.strip())
whitelist_parsed[domain].append(path)
if whitelist_parsed:
logger.info("Whitelist: %s", whitelist_parsed)

await scrape_backend_run(
cache_refresh=cache_refresh,
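
For reference, the whitelist format described by the new help text (a domain regex followed by comma-separated path regexes, with multiple entries passed as separate values) is parsed into a `dict[re.Pattern, list[re.Pattern]]` as shown above. A minimal, self-contained sketch of that parsing, plus a hypothetical `matches` helper (not part of this commit) to show how a URL could be checked against it:

```python
import re
from urllib.parse import urlparse


def parse_whitelist(entries: list[str]) -> dict[re.Pattern, list[re.Pattern]]:
    # Each entry looks like "domain_regex,path_regex1,path_regex2,..."
    parsed: dict[re.Pattern, list[re.Pattern]] = {}
    for entry in entries:
        parts = entry.split(",")
        domain = re.compile(parts[0].strip())
        parsed[domain] = [re.compile(path.strip()) for path in parts[1:]]
    return parsed


def matches(url: str, whitelist: dict[re.Pattern, list[re.Pattern]]) -> bool:
    # Hypothetical helper, for illustration only: a URL passes if its domain
    # matches an entry and, when that entry has path patterns, at least one matches.
    parsed_url = urlparse(url)
    for domain, paths in whitelist.items():
        if domain.search(parsed_url.netloc):
            return not paths or any(p.search(parsed_url.path) for p in paths)
    return False


whitelist = parse_whitelist(["nytimes.com,^/2024/"])
print(matches("https://nytimes.com/2024/05/01/world", whitelist))  # True
print(matches("https://nytimes.com/2023/05/01/world", whitelist))  # False
```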
16 changes: 7 additions & 9 deletions app/helpers/persistence.py
@@ -1,12 +1,14 @@
from app.helpers.logging import logger
from contextlib import asynccontextmanager
from typing import AsyncGenerator

from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import ResourceExistsError
from azure.search.documents.aio import SearchClient
from azure.storage.blob.aio import BlobServiceClient, ContainerClient
from azure.storage.queue.aio import QueueClient, QueueServiceClient
from contextlib import asynccontextmanager
from openai import AsyncAzureOpenAI
from typing import AsyncGenerator

from app.helpers.logging import logger


@asynccontextmanager
@@ -55,9 +57,7 @@ async def blob_client(
"""
Get the Azure Blob Storage client.
"""
async with BlobServiceClient.from_connection_string(
connection_string
) as x:
async with BlobServiceClient.from_connection_string(connection_string) as x:
client = x.get_container_client(container)

# Create if it does not exist
@@ -79,9 +79,7 @@ async def queue_client(
"""
Get the Azure Queue Storage client.
"""
async with QueueServiceClient.from_connection_string(
connection_string
) as x:
async with QueueServiceClient.from_connection_string(connection_string) as x:
client = x.get_queue_client(queue)

# Create if it does not exist
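
For context, the pattern used by both helpers above can be condensed into a short sketch (simplified, with error handling trimmed): open the service client, derive the container or queue client, and swallow `ResourceExistsError` when creating the resource so the helper stays idempotent.

```python
from contextlib import asynccontextmanager
from typing import AsyncGenerator

from azure.core.exceptions import ResourceExistsError
from azure.storage.blob.aio import BlobServiceClient, ContainerClient


@asynccontextmanager
async def blob_client_sketch(
    connection_string: str, container: str
) -> AsyncGenerator[ContainerClient, None]:
    # Simplified sketch of the helper above, not the actual implementation:
    # open the service client, get the container client, create it if missing.
    async with BlobServiceClient.from_connection_string(connection_string) as service:
        client = service.get_container_client(container)
        try:
            await client.create_container()
        except ResourceExistsError:
            pass  # Already created, nothing to do
        yield client
```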
17 changes: 6 additions & 11 deletions app/index.py
@@ -186,10 +186,7 @@ def _markdown_chunck(
The text is split by Markdown headings, and each chunk is as big as possible without exceeding the max_tokens limit.
"""
contents = []
if (
_count_tokens(text)
< max_tokens
): # If the text is small enough
if _count_tokens(text) < max_tokens: # If the text is small enough
contents.append(text)
return contents

@@ -266,10 +263,7 @@ def _rebuild_headings() -> str:
).strip()

# Chunk if it is still too big
current_cleaned_count = math.ceil(
_count_tokens(current_cleaned)
/ max_tokens
)
current_cleaned_count = math.ceil(_count_tokens(current_cleaned) / max_tokens)
current_cleaned_chunck_size = math.ceil(
len(current_cleaned) / current_cleaned_count
)
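
The two `ceil` calls above split an oversized section into roughly equal pieces: first the number of chunks needed (by tokens), then the target character length of each chunk. A quick worked example with hypothetical numbers:

```python
import math

# Hypothetical numbers, for illustration only: a section of 2,500 tokens
# and 10,000 characters, with a budget of 1,000 tokens per chunk.
section_tokens = 2500
section_chars = 10000
max_tokens = 1000

count = math.ceil(section_tokens / max_tokens)  # 3 chunks are needed
chunk_size = math.ceil(section_chars / count)   # each chunk targets 3334 characters
```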
@@ -300,8 +294,7 @@ def _rebuild_headings() -> str:
current_chunk += f"### {last_h3_head}\n"
for h4_head, h4_content in h3_next.items():
if (
_count_tokens(current_chunk)
>= max_tokens
_count_tokens(current_chunk) >= max_tokens
): # If the chunk is too big
# Re-apply the last heading to the next chunk
current_chunk = _split_paragraph(
@@ -424,7 +417,9 @@ async def _worker(
# TODO: Add a dead-letter queue
# TODO: Add a retry mechanism
# TODO: Narrow the exception type
logger.error("Error processing %s", blob_name, exc_info=True)
logger.error(
"Error processing %s", blob_name, exc_info=True
)

# Wait 3 sec to avoid spamming the queue if it is empty
await asyncio.sleep(3)
3 changes: 2 additions & 1 deletion app/models/state.py
@@ -1,4 +1,5 @@
from datetime import datetime, UTC
from datetime import UTC, datetime

from pydantic import BaseModel, Field


20 changes: 15 additions & 5 deletions app/scrape.py
@@ -3,7 +3,7 @@
from urllib.parse import urlparse
from uuid import uuid4

from azure.core.exceptions import ResourceNotFoundError, ResourceExistsError
from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError
from azure.storage.blob.aio import ContainerClient
from azure.storage.queue.aio import QueueClient
from html2text import HTML2Text
@@ -121,7 +121,9 @@ async def _add(
# Skip if the previous attempt is too recent
# Date is now and not the one from the model, on purpose. Otherwise, if it's a cached model, the date would match the refresh date every time.
if previous.created_at >= datetime.now(UTC) - cache_refresh:
logger.debug("Skipping %s due to recent attempt at %s", url, previous.created_at)
logger.debug(
"Skipping %s due to recent attempt at %s", url, previous.created_at
)
return False

except (ResourceNotFoundError, ValidationError):
@@ -395,8 +397,12 @@ async def _worker(
except ResourceExistsError: # Wait for the lease to expire
logger.debug("Lease already exists, waiting")
await asyncio.sleep(1)
except ResourceNotFoundError: # Create the blob if it does not exist
logger.debug("State blob does not exist, creating an empty one")
except (
ResourceNotFoundError
): # Create the blob if it does not exist
logger.debug(
"State blob does not exist, creating an empty one"
)
await state_blob.upload_blob(
data=b"",
length=0,
@@ -421,7 +427,11 @@
)
# Release the lease
await state_lease.release()
logger.info("Updated job state to %i processed and %i queued", state.processed, state.queued)
logger.info(
"Updated job state to %i processed and %i queued",
state.processed,
state.queued,
)

# Wait 3 sec to avoid spamming the queue if it is empty
await asyncio.sleep(3)
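
Taken together, the hunks above form a small locking loop around the shared state blob: acquire a short lease, retry while another worker holds it, create the blob if it does not exist yet, write the updated counters under the lease, then release it. A condensed sketch of that pattern, with hypothetical names and without the surrounding worker loop:

```python
import asyncio

from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError
from azure.storage.blob.aio import ContainerClient


async def update_state(container: ContainerClient, data: bytes) -> None:
    # Hypothetical condensed sketch: serialize writes to a shared state blob
    # with a blob lease, creating the blob first if it does not exist yet.
    state_blob = container.get_blob_client("state")
    while True:
        try:
            lease = await state_blob.acquire_lease(lease_duration=15)
            break
        except ResourceExistsError:  # Lease already held, wait for it to expire
            await asyncio.sleep(1)
        except ResourceNotFoundError:  # Blob missing, create an empty one first
            await state_blob.upload_blob(data=b"", length=0)
    await state_blob.upload_blob(data=data, length=len(data), lease=lease, overwrite=True)
    await lease.release()
```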
