
Commit

chore: Fix typo in "chunk" work
clemlesne committed Aug 17, 2024
1 parent 5927f3a commit df9c269
Showing 2 changed files with 21 additions and 21 deletions.
12 changes: 6 additions & 6 deletions README.md
@@ -26,8 +26,8 @@ Scraper:
Indexer:

- [x] AI Search index is created automatically
-- [x] Chunck markdown while keeping the content coherent
-- [x] Embed chuncks with OpenAI embeddings
+- [x] Chunk markdown while keeping the content coherent
+- [x] Embed chunks with OpenAI embeddings
- [x] Indexed content is semantically searchable with [Azure AI Search](https://learn.microsoft.com/en-us/azure/search)

## How to use
@@ -156,7 +156,7 @@ graph LR
web["Website"]
subgraph "Azure Queue Storage"
-to_chunck["To chunck"]
+to_chunk["To chunk"]
to_scrape["To scrape"]
end
@@ -174,7 +174,7 @@ graph LR
cli -- 4. Update cache --> scraped
cli -- 5. Push state --> state
cli -- 6. Add message --> to_scrape
-cli -- 7. Add message --> to_chunck
+cli -- 7. Add message --> to_chunk
cli -- 8. Update state --> job
```
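
(For orientation only, not part of this commit: a minimal sketch of how the two queues named in the diagram could be fed with the azure-storage-queue SDK. The queue names and the edge numbers come from the diagram above; the connection string, variable names and function are illustrative assumptions.)

```python
from azure.storage.queue import QueueClient

# Placeholder connection string; only the queue names are taken from the diagram.
CONNECTION_STRING = "<storage-conn-str>"
to_scrape = QueueClient.from_connection_string(CONNECTION_STRING, "to_scrape")
to_chunk = QueueClient.from_connection_string(CONNECTION_STRING, "to_chunk")

def queue_page(url: str) -> None:
    to_scrape.send_message(url)  # 6. Add message --> to_scrape
    to_chunk.send_message(url)   # 7. Add message --> to_chunk
```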

@@ -187,7 +187,7 @@ graph LR
embeddings["Azure OpenAI Embeddings"]
subgraph "Azure Queue Storage"
-to_chunck["To chunck"]
+to_chunk["To chunk"]
end
subgraph "Azure Blob Storage"
@@ -196,7 +196,7 @@ graph LR
end
end
-cli -- 1. Pull message --> to_chunck
+cli -- 1. Pull message --> to_chunk
cli -- 2. Get cache --> scraped
cli -- 3. Chunk --> cli
cli -- 4. Embed --> embeddings
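
(Also not part of this commit: the indexer diagram above is truncated in this view, but its numbered edges describe pulling a message from the to_chunk queue, reading the cached page from Blob Storage, chunking, embedding, and finally indexing. A hedged sketch of that loop follows; the client setup, queue, container, index and deployment names are placeholders, and the batch size of 10 mirrors the chunks_size used later in app/index.py.)

```python
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.storage.blob import BlobServiceClient
from azure.storage.queue import QueueClient
from openai import AzureOpenAI

# All connection strings, keys and resource names below are placeholders.
queue = QueueClient.from_connection_string("<storage-conn-str>", "to_chunk")
blobs = BlobServiceClient.from_connection_string("<storage-conn-str>")
embeddings_client = AzureOpenAI(
    azure_endpoint="<endpoint>", api_key="<key>", api_version="2024-06-01"
)
search = SearchClient("<search-endpoint>", "<index-name>", AzureKeyCredential("<key>"))

def index_once() -> None:
    for msg in queue.receive_messages(max_messages=1):  # 1. Pull message
        url = msg.content
        blob = blobs.get_blob_client(container="scraped", blob=url)
        markdown = blob.download_blob().readall().decode()  # 2. Get cache
        # 3. Chunk (naive fixed-size split; the real code keeps the markdown coherent)
        chunks = [markdown[i : i + 3000] for i in range(0, len(markdown), 3000)]
        # 4. Embed in batches of 10, mirroring the throughput mitigation in app/index.py
        vectors = []
        for i in range(0, len(chunks), 10):
            res = embeddings_client.embeddings.create(
                model="<embedding-deployment>", input=chunks[i : i + 10]
            )
            vectors.extend(item.embedding for item in res.data)
        # Index documents with the same shape as the diff (id, content, url, vectors);
        # the real code derives ids from hash_url, later steps are truncated in this view.
        docs = [
            {"id": f"{abs(hash(url))}-{i}", "content": c, "url": url, "vectors": v}
            for i, (c, v) in enumerate(zip(chunks, vectors))
        ]
        search.upload_documents(documents=docs)
        queue.delete_message(msg)
```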
30 changes: 15 additions & 15 deletions app/index.py
@@ -75,13 +75,13 @@ async def _process_one(
logger.info("%s data is invalid (code %i)", short_name, result.status)
return

-# Chunck to small markdown files
-chuncks = _markdown_chunck(
+# Chunk to small markdown files
+chunks = _markdown_chunk(
max_tokens=800,
text=result.content,
)
-doc_ids = [f"{hash_url(result.url)}-{i}" for i in range(len(chuncks))]
-logger.info("%s chunked into %i parts", short_name, len(chuncks))
+doc_ids = [f"{hash_url(result.url)}-{i}" for i in range(len(chunks))]
+logger.info("%s chunked into %i parts", short_name, len(chunks))

# Check if the document is already indexed
try:
@@ -95,14 +95,14 @@ async def _process_one(
return
except (
ResourceNotFoundError
-): # If a chunck is not found, it is not indexed, thus we can re-process the document
+): # If a chunk is not found, it is not indexed, thus we can re-process the document
pass

# Generate the embeddings by block (mitigate API throughput limits)
embeddings = []
chunks_size = 10
-for i in range(0, len(chuncks), chunks_size):
-chunk_input = chuncks[i : i + chunks_size]
+for i in range(0, len(chunks), chunks_size):
+chunk_input = chunks[i : i + chunks_size]
res = await _embeddings(
embedding_deployment=embedding_deployment,
embedding_dimensions=embedding_dimensions,
@@ -119,7 +119,7 @@
url=result.url,
vectors=embedding.embedding,
)
-for doc_id, content, embedding in zip(doc_ids, chuncks, embeddings)
+for doc_id, content, embedding in zip(doc_ids, chunks, embeddings)
]

# Index the documents
@@ -181,7 +181,7 @@ async def _embeddings(
)


-def _markdown_chunck(
+def _markdown_chunk(
max_tokens: int,
text: str,
) -> list[str]:
@@ -267,19 +267,19 @@ def _rebuild_headings() -> str:
current_chunk.splitlines()[: -(to_remove + 1)]
).strip()

-# Chunck if is still too big
+# Chunk if is still too big
current_cleaned_count = math.ceil(_count_tokens(current_cleaned) / max_tokens)
-current_cleaned_chunck_size = math.ceil(
+current_cleaned_chunk_size = math.ceil(
len(current_cleaned) / current_cleaned_count
)
for i in range(current_cleaned_count): # Iterate over the chunks
-chunck_content = current_cleaned[
-i * current_cleaned_chunck_size : (i + 1) * current_cleaned_chunck_size
+chunk_content = current_cleaned[
+i * current_cleaned_chunk_size : (i + 1) * current_cleaned_chunk_size
]
if i == 0: # Headings only on the first chunk
-contents.append(chunck_content)
+contents.append(chunk_content)
else: # Re-apply the last heading to the next chunk
-contents.append(_rebuild_headings() + chunck_content)
+contents.append(_rebuild_headings() + chunk_content)

return _rebuild_headings()

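
(Annotation, not part of the diff: the last hunk renames identifiers inside the splitting step of _markdown_chunk, where a chunk that still exceeds the token budget is cut into roughly equal character slices and the current headings are re-applied to every slice after the first. A standalone sketch of that idea follows; token counting is approximated, since _count_tokens and _rebuild_headings are not shown in this hunk.)

```python
import math

def split_oversized(current_cleaned: str, max_tokens: int, headings: str) -> list[str]:
    """Hedged sketch only: tokens are approximated at ~4 characters each, whereas
    the real code calls its own _count_tokens helper, and the headings string
    stands in for _rebuild_headings()."""
    contents: list[str] = []
    approx_tokens = math.ceil(len(current_cleaned) / 4)
    parts = max(1, math.ceil(approx_tokens / max_tokens))  # number of slices needed
    slice_size = math.ceil(len(current_cleaned) / parts)   # roughly equal character slices
    for i in range(parts):
        piece = current_cleaned[i * slice_size : (i + 1) * slice_size]
        if i == 0:
            contents.append(piece)             # headings are already in the first slice
        else:
            contents.append(headings + piece)  # re-apply headings to later slices
    return contents
```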
