diff --git a/README.md b/README.md
index 53c37a4..5211dbd 100644
--- a/README.md
+++ b/README.md
@@ -26,8 +26,8 @@ Scraper:
 Indexer:
 
 - [x] AI Search index is created automatically
-- [x] Chunck markdown while keeping the content coherent
-- [x] Embed chuncks with OpenAI embeddings
+- [x] Chunk markdown while keeping the content coherent
+- [x] Embed chunks with OpenAI embeddings
 - [x] Indexed content is semantically searchable with [Azure AI Search](https://learn.microsoft.com/en-us/azure/search)
 
 ## How to use
@@ -156,7 +156,7 @@ graph LR
     web["Website"]
 
     subgraph "Azure Queue Storage"
-        to_chunck["To chunck"]
+        to_chunk["To chunk"]
         to_scrape["To scrape"]
     end
 
@@ -174,7 +174,7 @@ graph LR
     cli -- 4. Update cache --> scraped
     cli -- 5. Push state --> state
     cli -- 6. Add message --> to_scrape
-    cli -- 7. Add message --> to_chunck
+    cli -- 7. Add message --> to_chunk
     cli -- 8. Update state --> job
 ```
@@ -187,7 +187,7 @@ graph LR
     embeddings["Azure OpenAI Embeddings"]
 
     subgraph "Azure Queue Storage"
-        to_chunck["To chunck"]
+        to_chunk["To chunk"]
     end
 
     subgraph "Azure Blob Storage"
@@ -196,7 +196,7 @@ graph LR
             scraped["Scraped"]
         end
     end
 
-    cli -- 1. Pull message --> to_chunck
+    cli -- 1. Pull message --> to_chunk
     cli -- 2. Get cache --> scraped
     cli -- 3. Chunk --> cli
     cli -- 4. Embed --> embeddings
diff --git a/app/index.py b/app/index.py
index 2378e44..8bd4120 100644
--- a/app/index.py
+++ b/app/index.py
@@ -75,13 +75,13 @@ async def _process_one(
         logger.info("%s data is invalid (code %i)", short_name, result.status)
         return
 
-    # Chunck to small markdown files
-    chuncks = _markdown_chunck(
+    # Chunk into small markdown files
+    chunks = _markdown_chunk(
         max_tokens=800,
         text=result.content,
     )
-    doc_ids = [f"{hash_url(result.url)}-{i}" for i in range(len(chuncks))]
-    logger.info("%s chunked into %i parts", short_name, len(chuncks))
+    doc_ids = [f"{hash_url(result.url)}-{i}" for i in range(len(chunks))]
+    logger.info("%s chunked into %i parts", short_name, len(chunks))
 
     # Check if the document is already indexed
     try:
@@ -95,14 +95,14 @@ async def _process_one(
         return
     except (
         ResourceNotFoundError
-    ):  # If a chunck is not found, it is not indexed, thus we can re-process the document
+    ):  # If a chunk is not found, it is not indexed, thus we can re-process the document
         pass
 
     # Generate the embeddings by block (mitigate API throughput limits)
     embeddings = []
     chunks_size = 10
-    for i in range(0, len(chuncks), chunks_size):
-        chunk_input = chuncks[i : i + chunks_size]
+    for i in range(0, len(chunks), chunks_size):
+        chunk_input = chunks[i : i + chunks_size]
         res = await _embeddings(
             embedding_deployment=embedding_deployment,
             embedding_dimensions=embedding_dimensions,
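For context on the hunk above: `_process_one` submits chunks to the embeddings API in blocks of `chunks_size = 10` rather than one request per chunk, to stay under API throughput limits. Below is a minimal sketch of that batching pattern using the `openai` v1 async client; the `embed_in_batches` helper and its parameters are illustrative, not part of this repository, and the `dimensions` argument assumes a `text-embedding-3-*` deployment.

```python
from openai import AsyncAzureOpenAI


async def embed_in_batches(
    client: AsyncAzureOpenAI,
    deployment: str,
    dimensions: int,
    chunks: list[str],
    batch_size: int = 10,
) -> list[list[float]]:
    """Embed chunks in fixed-size batches to mitigate API throughput limits."""
    vectors: list[list[float]] = []
    for i in range(0, len(chunks), batch_size):
        res = await client.embeddings.create(
            dimensions=dimensions,
            input=chunks[i : i + batch_size],
            model=deployment,  # Azure OpenAI takes the deployment name as the model
        )
        # Sort by index so vectors stay aligned with their source chunks
        vectors.extend(e.embedding for e in sorted(res.data, key=lambda e: e.index))
    return vectors
```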
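The deterministic `{hash_url(result.url)}-{i}` IDs built earlier in this file's diff are what make the `ResourceNotFoundError` probe work: if any chunk ID is missing from the index, the whole document is re-processed; if all are present, it is skipped. A hedged sketch of that keying scheme follows; the repository defines `hash_url` elsewhere, so the SHA-256 digest and the `chunk_doc_ids` helper here are assumptions for illustration only.

```python
import hashlib


def hash_url(url: str) -> str:
    # Assumed implementation: any stable digest works as a document key
    return hashlib.sha256(url.encode()).hexdigest()


def chunk_doc_ids(url: str, n_chunks: int) -> list[str]:
    # Stable per-chunk IDs: re-indexing overwrites rather than duplicates,
    # and probing a single ID tells whether the document was already indexed
    return [f"{hash_url(url)}-{i}" for i in range(n_chunks)]
```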
@@ -119,7 +119,7 @@ async def _process_one(
             url=result.url,
             vectors=embedding.embedding,
         )
-        for doc_id, content, embedding in zip(doc_ids, chuncks, embeddings)
+        for doc_id, content, embedding in zip(doc_ids, chunks, embeddings)
     ]
 
     # Index the documents
@@ -181,7 +181,7 @@ async def _embeddings(
     )
 
 
-def _markdown_chunck(
+def _markdown_chunk(
     max_tokens: int,
     text: str,
 ) -> list[str]:
@@ -267,19 +267,19 @@ def _rebuild_headings() -> str:
                 current_chunk.splitlines()[: -(to_remove + 1)]
             ).strip()
 
-            # Chunck if is still too big
+            # Chunk if it is still too big
             current_cleaned_count = math.ceil(_count_tokens(current_cleaned) / max_tokens)
-            current_cleaned_chunck_size = math.ceil(
+            current_cleaned_chunk_size = math.ceil(
                 len(current_cleaned) / current_cleaned_count
             )
             for i in range(current_cleaned_count):  # Iterate over the chunks
-                chunck_content = current_cleaned[
-                    i * current_cleaned_chunck_size : (i + 1) * current_cleaned_chunck_size
+                chunk_content = current_cleaned[
+                    i * current_cleaned_chunk_size : (i + 1) * current_cleaned_chunk_size
                 ]
                 if i == 0:  # Headings only on the first chunk
-                    contents.append(chunck_content)
+                    contents.append(chunk_content)
                 else:  # Re-apply the last heading to the next chunk
-                    contents.append(_rebuild_headings() + chunck_content)
+                    contents.append(_rebuild_headings() + chunk_content)
 
     return _rebuild_headings()
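The final hunk renames the variables in `_markdown_chunk`'s fallback path: when a chunk still exceeds `max_tokens` after cleanup, it is cut into roughly equal character slices, and the headings are re-applied to every slice except the first. Here is a standalone sketch of that fallback, assuming a tiktoken-based `_count_tokens`; the encoding choice and the `split_oversized` helper are illustrative, not the repository's code.

```python
import math

import tiktoken

_encoding = tiktoken.get_encoding("cl100k_base")  # Assumed encoding


def _count_tokens(text: str) -> int:
    return len(_encoding.encode(text))


def split_oversized(text: str, max_tokens: int, headings: str = "") -> list[str]:
    # Enough equal slices that each one fits the token budget (approximately,
    # since the cut is made by characters rather than by tokens)
    count = max(1, math.ceil(_count_tokens(text) / max_tokens))
    size = math.ceil(len(text) / count)
    parts: list[str] = []
    for i in range(count):
        part = text[i * size : (i + 1) * size]
        # Headings are already on the first slice; prepend them to the rest
        parts.append(part if i == 0 else headings + part)
    return parts
```

Slicing by characters keeps the logic simple at the cost of occasionally landing a hair over the token budget; the re-applied headings preserve the document context that the split would otherwise lose.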