
Commit

Merge branch 'develop'
clemlesne committed Aug 16, 2024
2 parents 1359faa + 1471163 commit 555ba0f
Showing 6 changed files with 42 additions and 29 deletions.
10 changes: 7 additions & 3 deletions README.md
@@ -112,10 +112,14 @@ scrape-it-now index run --help

### Source environment variables

To configure easily the CLI, source environment variables from a `.env` file.

For example:
To easily configure the CLI, source environment variables from a `.env` file. For example, for the `--azure-storage-connection-string` option:

```bash
AZURE_STORAGE_CONNECTION_STRING=xxx
```

For arguments that accept multiple values, use a space-separated list. For example, for the `--whitelist` option:

```bash
WHITELIST=learn\.microsoft\.com,^/(?!en-us).*,^/[^/]+/answers/,^/[^/]+/previous-versions/ go\.microsoft\.com,.*
```
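
For a rough illustration (assuming a POSIX shell; the exact `scrape run` arguments depend on your job), one way to load the file before invoking the CLI is to export everything it defines. Values that contain spaces, such as the `WHITELIST` example above, need to be quoted in the `.env` file for this to work:

```bash
# Export every variable defined in .env into the current shell,
# then run the CLI, which reads them through its environment variable bindings.
set -a
source .env
set +a
scrape-it-now scrape run --help
```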
5 changes: 5 additions & 0 deletions app/app.py
@@ -64,6 +64,7 @@ def scrape() -> None:
"--whitelist",
"-w",
envvar="WHITELIST",
help="Comma separated list of domains and paths to whitelist. Example, to filter nytimes.com and only the pages from 2024 pages, use 'nytimes.com,^/2024/'.",
multiple=True,
type=str,
)
@@ -177,6 +178,8 @@ async def scrape_run(
for v in viewport:
width, height = v.split("x")
viewports_parsed.append((int(width), int(height)))
if viewports_parsed:
logger.info("Viewports: %s", viewports_parsed)

# Parse whitelist
whitelist_parsed: dict[re.Pattern, list[re.Pattern]] = {}
@@ -190,6 +193,8 @@ async def scrape_run(
for path in w.split(",")[1:]:
path = re.compile(path.strip())
whitelist_parsed[domain].append(path)
if whitelist_parsed:
logger.info("Whitelist: %s", whitelist_parsed)

await scrape_backend_run(
cache_refresh=cache_refresh,
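
For reference, the whitelist format described by the new help text (a domain regex followed by comma-separated path regexes, with multiple entries passed as separate values) is parsed into a `dict[re.Pattern, list[re.Pattern]]` as shown above. A minimal, self-contained sketch of that parsing, plus a hypothetical `matches` helper (not part of this commit) to show how a URL could be checked against it:

```python
import re
from urllib.parse import urlparse


def parse_whitelist(entries: list[str]) -> dict[re.Pattern, list[re.Pattern]]:
    # Each entry looks like "domain_regex,path_regex1,path_regex2,..."
    parsed: dict[re.Pattern, list[re.Pattern]] = {}
    for entry in entries:
        parts = entry.split(",")
        domain = re.compile(parts[0].strip())
        parsed[domain] = [re.compile(path.strip()) for path in parts[1:]]
    return parsed


def matches(url: str, whitelist: dict[re.Pattern, list[re.Pattern]]) -> bool:
    # Hypothetical helper, for illustration only: a URL passes if its domain
    # matches an entry and, when that entry has path patterns, at least one matches.
    parsed_url = urlparse(url)
    for domain, paths in whitelist.items():
        if domain.search(parsed_url.netloc):
            return not paths or any(p.search(parsed_url.path) for p in paths)
    return False


whitelist = parse_whitelist(["nytimes.com,^/2024/"])
print(matches("https://nytimes.com/2024/05/01/world", whitelist))  # True
print(matches("https://nytimes.com/2023/05/01/world", whitelist))  # False
```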
16 changes: 7 additions & 9 deletions app/helpers/persistence.py
@@ -1,12 +1,14 @@
from app.helpers.logging import logger
from contextlib import asynccontextmanager
from typing import AsyncGenerator

from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import ResourceExistsError
from azure.search.documents.aio import SearchClient
from azure.storage.blob.aio import BlobServiceClient, ContainerClient
from azure.storage.queue.aio import QueueClient, QueueServiceClient
from contextlib import asynccontextmanager
from openai import AsyncAzureOpenAI
from typing import AsyncGenerator

from app.helpers.logging import logger


@asynccontextmanager
@@ -55,9 +57,7 @@ async def blob_client(
"""
Get the Azure Blob Storage client.
"""
async with BlobServiceClient.from_connection_string(
connection_string
) as x:
async with BlobServiceClient.from_connection_string(connection_string) as x:
client = x.get_container_client(container)

# Create if it does not exist
@@ -79,9 +79,7 @@ async def queue_client(
"""
Get the Azure Queue Storage client.
"""
async with QueueServiceClient.from_connection_string(
connection_string
) as x:
async with QueueServiceClient.from_connection_string(connection_string) as x:
client = x.get_queue_client(queue)

# Create if it does not exist
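
For context, the pattern used by both helpers above can be condensed into a short sketch (simplified, with error handling trimmed): open the service client, derive the container or queue client, and swallow `ResourceExistsError` when creating the resource so the helper stays idempotent.

```python
from contextlib import asynccontextmanager
from typing import AsyncGenerator

from azure.core.exceptions import ResourceExistsError
from azure.storage.blob.aio import BlobServiceClient, ContainerClient


@asynccontextmanager
async def blob_client_sketch(
    connection_string: str, container: str
) -> AsyncGenerator[ContainerClient, None]:
    # Simplified sketch of the helper above, not the actual implementation:
    # open the service client, get the container client, create it if missing.
    async with BlobServiceClient.from_connection_string(connection_string) as service:
        client = service.get_container_client(container)
        try:
            await client.create_container()
        except ResourceExistsError:
            pass  # Already created, nothing to do
        yield client
```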
17 changes: 6 additions & 11 deletions app/index.py
@@ -186,10 +186,7 @@ def _markdown_chunck(
The text is split by Markdown headings, and each chunk is as big as possible without exceeding the max_tokens limit.
"""
contents = []
if (
_count_tokens(text)
< max_tokens
): # If the text is small enough
if _count_tokens(text) < max_tokens: # If the text is small enough
contents.append(text)
return contents

@@ -266,10 +263,7 @@ def _rebuild_headings() -> str:
).strip()

# Chunk if it is still too big
current_cleaned_count = math.ceil(
_count_tokens(current_cleaned)
/ max_tokens
)
current_cleaned_count = math.ceil(_count_tokens(current_cleaned) / max_tokens)
current_cleaned_chunck_size = math.ceil(
len(current_cleaned) / current_cleaned_count
)
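
The two `ceil` calls above split an oversized section into roughly equal pieces: first the number of chunks needed (by tokens), then the target character length of each chunk. A quick worked example with hypothetical numbers:

```python
import math

# Hypothetical numbers, for illustration only: a section of 2,500 tokens
# and 10,000 characters, with a budget of 1,000 tokens per chunk.
section_tokens = 2500
section_chars = 10000
max_tokens = 1000

count = math.ceil(section_tokens / max_tokens)  # 3 chunks are needed
chunk_size = math.ceil(section_chars / count)   # each chunk targets 3334 characters
```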
@@ -300,8 +294,7 @@ def _rebuild_headings() -> str:
current_chunk += f"### {last_h3_head}\n"
for h4_head, h4_content in h3_next.items():
if (
_count_tokens(current_chunk)
>= max_tokens
_count_tokens(current_chunk) >= max_tokens
): # If the chunk is too big
# Re-apply the last heading to the next chunk
current_chunk = _split_paragraph(
@@ -424,7 +417,9 @@ async def _worker(
# TODO: Add a dead-letter queue
# TODO: Add a retry mechanism
# TODO: Narrow the exception type
logger.error("Error processing %s", blob_name, exc_info=True)
logger.error(
"Error processing %s", blob_name, exc_info=True
)

# Wait 3 sec to avoid spamming the queue if it is empty
await asyncio.sleep(3)
3 changes: 2 additions & 1 deletion app/models/state.py
@@ -1,4 +1,5 @@
from datetime import datetime, UTC
from datetime import UTC, datetime

from pydantic import BaseModel, Field


20 changes: 15 additions & 5 deletions app/scrape.py
@@ -3,7 +3,7 @@
from urllib.parse import urlparse
from uuid import uuid4

from azure.core.exceptions import ResourceNotFoundError, ResourceExistsError
from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError
from azure.storage.blob.aio import ContainerClient
from azure.storage.queue.aio import QueueClient
from html2text import HTML2Text
@@ -121,7 +121,9 @@ async def _add(
# Skip if the previous attempt is too recent
# Date is now and not the one from the model, on purpose. Otherwise, if it's a cached model, the date would match the refresh date every time.
if previous.created_at >= datetime.now(UTC) - cache_refresh:
logger.debug("Skipping %s due to recent attempt at %s", url, previous.created_at)
logger.debug(
"Skipping %s due to recent attempt at %s", url, previous.created_at
)
return False

except (ResourceNotFoundError, ValidationError):
@@ -395,8 +397,12 @@ async def _worker(
except ResourceExistsError: # Wait for the lease to expire
logger.debug("Lease already exists, waiting")
await asyncio.sleep(1)
except ResourceNotFoundError: # Create the blob if it does not exist
logger.debug("State blob does not exist, creating an empty one")
except (
ResourceNotFoundError
): # Create the blob if it does not exist
logger.debug(
"State blob does not exist, creating an empty one"
)
await state_blob.upload_blob(
data=b"",
length=0,
@@ -421,7 +427,11 @@
)
# Release the lease
await state_lease.release()
logger.info("Updated job state to %i processed and %i queued", state.processed, state.queued)
logger.info(
"Updated job state to %i processed and %i queued",
state.processed,
state.queued,
)

# Wait 3 sec to avoid spamming the queue if it is empty
await asyncio.sleep(3)
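
Taken together, the hunks above form a small locking loop around the shared state blob: acquire a short lease, retry while another worker holds it, create the blob if it does not exist yet, write the updated counters under the lease, then release it. A condensed sketch of that pattern, with hypothetical names and without the surrounding worker loop:

```python
import asyncio

from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError
from azure.storage.blob.aio import ContainerClient


async def update_state(container: ContainerClient, data: bytes) -> None:
    # Hypothetical condensed sketch: serialize writes to a shared state blob
    # with a blob lease, creating the blob first if it does not exist yet.
    state_blob = container.get_blob_client("state")
    while True:
        try:
            lease = await state_blob.acquire_lease(lease_duration=15)
            break
        except ResourceExistsError:  # Lease already held, wait for it to expire
            await asyncio.sleep(1)
        except ResourceNotFoundError:  # Blob missing, create an empty one first
            await state_blob.upload_blob(data=b"", length=0)
    await state_blob.upload_blob(data=data, length=len(data), lease=lease, overwrite=True)
    await lease.release()
```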
