
Add Incremental Indexing v1 #1318

Merged: 32 commits from incremental_indexing/main into main (Oct 30, 2024)
Commits
82d1c4a  Create entypoint for cli and api (#1067) · AlonsoGuevara · Aug 30, 2024
41ea554  Merge from main · AlonsoGuevara · Sep 3, 2024
3399e6e  Merge branch 'main' into incremental_indexing/main · AlonsoGuevara · Sep 4, 2024
3295e2b  Merge from main · AlonsoGuevara · Sep 5, 2024
67f4b02  Merge branch 'main' into incremental_indexing/main · AlonsoGuevara · Sep 10, 2024
87fb93f  Merge branch 'main' into incremental_indexing/main · AlonsoGuevara · Sep 11, 2024
8f71a02  Incremental indexing/file delta (#1123) · AlonsoGuevara · Sep 11, 2024
b440e83  Merge branch 'main' into incremental_indexing/main · AlonsoGuevara · Sep 12, 2024
6cee670  Merge branch 'main' into incremental_indexing/main · AlonsoGuevara · Sep 18, 2024
00048b3  Merge from main · AlonsoGuevara · Sep 23, 2024
bf45f42  Merge branch 'main' into incremental_indexing/main · AlonsoGuevara · Sep 25, 2024
4d713f6  Merge branch 'main' into incremental_indexing/main · AlonsoGuevara · Sep 27, 2024
336e6f9  Update relationships after inc index (#1236) · AlonsoGuevara · Oct 1, 2024
a44788b  Collapse create final community reports (#1227) · natoverse · Sep 30, 2024
f259d0c  Collapse create base entity graph (#1233) · natoverse · Sep 30, 2024
3103ae3  Collapse create summarized entities (#1237) · natoverse · Oct 1, 2024
d501813  Collapse create base extracted entities (#1235) · natoverse · Oct 1, 2024
6d23d6a  Incremental indexing/update final text units (#1241) · AlonsoGuevara · Oct 2, 2024
43ec92e  merge from main · AlonsoGuevara · Oct 2, 2024
2704eaf  Merge from main · AlonsoGuevara · Oct 15, 2024
cb7d5ed  Add v1 community merge using time period (#1257) · AlonsoGuevara · Oct 21, 2024
08d2cec  Merge from main · AlonsoGuevara · Oct 23, 2024
3eb5171  Add config for incremental index + Bug fixes (#1317) · AlonsoGuevara · Oct 24, 2024
9832f53  Merge from main · AlonsoGuevara · Oct 24, 2024
c762736  Semversioner · AlonsoGuevara · Oct 24, 2024
e518d43  Small refactor · AlonsoGuevara · Oct 24, 2024
3f55117  Remove unused file · AlonsoGuevara · Oct 24, 2024
41be5d8  Ruff · AlonsoGuevara · Oct 24, 2024
c7b3846  Merge branch 'main' into incremental_indexing/main · AlonsoGuevara · Oct 28, 2024
c8a35f1  Merge branch 'main' into incremental_indexing/main · AlonsoGuevara · Oct 29, 2024
470b35c  Update verb tests inputs · AlonsoGuevara · Oct 30, 2024
ac47578  Update verb tests inputs · AlonsoGuevara · Oct 30, 2024
Files changed
4 changes: 4 additions & 0 deletions .semversioner/next-release/minor-20241024220555036046.json
@@ -0,0 +1,4 @@
+{
+    "type": "minor",
+    "description": "Add Incremental Indexing"
+}

4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20240930234415130922.json
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Add relationship merge"
+}

4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20241002002557586548.json
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Add text units update"
+}

4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20241008011651057484.json
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Add naive community merge using time period"
+}

4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20241024201857589092.json
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Add config for incremental updates"
+}
3 changes: 2 additions & 1 deletion graphrag/api/index.py
@@ -24,7 +24,6 @@ async def build_index(
     config: GraphRagConfig,
     run_id: str = "",
     is_resume_run: bool = False,
-    is_update_run: bool = False,
     memory_profile: bool = False,
     progress_reporter: ProgressReporter | None = None,
     emit: list[TableEmitterType] = [TableEmitterType.Parquet],  # noqa: B006
@@ -54,6 +53,8 @@ async def build_index(
     list[PipelineRunResult]
        The list of pipeline run results
     """
+    is_update_run = bool(config.update_index_storage)
+
     if is_resume_run and is_update_run:
         msg = "Cannot resume and update a run at the same time."
         raise ValueError(msg)
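With this hunk, whether a run is an update run is derived from the config rather than passed as a flag: setting `update_index_storage` is what opts a caller in. A minimal sketch of the resulting API usage; the import paths are assumed from the file layout in this diff, and all paths and values are illustrative:

```python
import asyncio

from graphrag.api import build_index  # module shown in this diff: graphrag/api/index.py
from graphrag.config import create_graphrag_config

# Illustrative settings; load_config in graphrag.cli is the usual entry point.
config = create_graphrag_config(
    {
        "storage": {"type": "file", "base_dir": "output"},
        # The presence of update_index_storage is what now flips build_index
        # into an incremental update run; omit it for a full index run.
        "update_index_storage": {"type": "file", "base_dir": "update_output"},
    },
    root_dir=".",
)

results = asyncio.run(build_index(config=config))
```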
4 changes: 1 addition & 3 deletions graphrag/cli/index.py
@@ -69,7 +69,6 @@ def index_cli(
     root_dir: Path,
     verbose: bool,
     resume: str | None,
-    update_index_id: str | None,
     memprofile: bool,
     cache: bool,
     reporter: ReporterType,
@@ -82,7 +81,7 @@ def index_cli(
     """Run the pipeline with the given config."""
     progress_reporter = create_progress_reporter(reporter)
     info, error, success = _logger(progress_reporter)
-    run_id = resume or update_index_id or time.strftime("%Y%m%d-%H%M%S")
+    run_id = resume or time.strftime("%Y%m%d-%H%M%S")
 
     config = load_config(root_dir, config_filepath)
     config.storage.base_dir = str(output_dir) if output_dir else config.storage.base_dir
@@ -123,7 +122,6 @@ def index_cli(
         config=config,
         run_id=run_id,
         is_resume_run=bool(resume),
-        is_update_run=bool(update_index_id),
         memory_profile=memprofile,
         progress_reporter=progress_reporter,
         emit=emit,
11 changes: 0 additions & 11 deletions graphrag/cli/main.py
@@ -102,12 +102,6 @@ def _index_cli(
             help="Skip any preflight validation. Useful when running no LLM steps."
         ),
     ] = False,
-    update_index: Annotated[
-        str | None,
-        typer.Option(
-            help="Update an index run id, leveraging previous outputs and applying new indexes."
-        ),
-    ] = None,
     output: Annotated[
         Path | None,
         typer.Option(
@@ -119,15 +113,10 @@ def _index_cli(
     ] = None,
 ):
     """Build a knowledge graph index."""
-    if resume and update_index:
-        msg = "Cannot resume and update a run at the same time"
-        raise ValueError(msg)
-
     index_cli(
         root_dir=root,
         verbose=verbose,
         resume=resume,
-        update_index_id=update_index,
         memprofile=memprofile,
         cache=cache,
         reporter=ReporterType(reporter),
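Taken together with the cli/index.py hunk above, the `--update-index` flag is removed entirely: after this PR, `graphrag index --root <project>` performs an incremental update whenever the `update_index_storage` section is enabled in the project's settings, and a full index run otherwise. The resume-vs-update conflict check moves from the CLI layer into `build_index` itself.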
21 changes: 21 additions & 0 deletions graphrag/config/create_graphrag_config.py
@@ -375,6 +375,25 @@ def hydrate_parallelization_params(
             container_name=reader.str(Fragment.container_name),
             base_dir=reader.str(Fragment.base_dir) or defs.STORAGE_BASE_DIR,
         )
+
+        with (
+            reader.envvar_prefix(Section.update_index_storage),
+            reader.use(values.get("update_index_storage")),
+        ):
+            s_type = reader.str(Fragment.type)
+            if s_type:
+                update_index_storage_model = StorageConfig(
+                    type=StorageType(s_type) if s_type else defs.STORAGE_TYPE,
+                    connection_string=reader.str(Fragment.conn_string),
+                    storage_account_blob_url=reader.str(
+                        Fragment.storage_account_blob_url
+                    ),
+                    container_name=reader.str(Fragment.container_name),
+                    base_dir=reader.str(Fragment.base_dir)
+                    or defs.UPDATE_STORAGE_BASE_DIR,
+                )
+            else:
+                update_index_storage_model = None
         with reader.envvar_prefix(Section.chunk), reader.use(values.get("chunks")):
             group_by_columns = reader.list("group_by_columns", "BY_COLUMNS")
             if group_by_columns is None:
@@ -547,6 +566,7 @@ def hydrate_parallelization_params(
         embed_graph=embed_graph_model,
         reporting=reporting_model,
         storage=storage_model,
+        update_index_storage=update_index_storage_model,
         cache=cache_model,
         input=input_model,
         chunks=chunks_model,
@@ -624,6 +644,7 @@ class Section(str, Enum):
     storage = "STORAGE"
     summarize_descriptions = "SUMMARIZE_DESCRIPTIONS"
    umap = "UMAP"
+    update_index_storage = "UPDATE_INDEX_STORAGE"
     local_search = "LOCAL_SEARCH"
     global_search = "GLOBAL_SEARCH"
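The new `Section` entry also wires the block into the environment-variable reader. Assuming the `GRAPHRAG_<SECTION>_<FIELD>` naming convention that the other sections follow (the exact variable names are not shown in this diff), a sketch of configuring the update storage purely through the environment:

```python
import os

from graphrag.config import create_graphrag_config

# Hypothetical variable names derived from Section.update_index_storage;
# the field suffixes follow the same Fragment enum used in the hunk above.
os.environ["GRAPHRAG_UPDATE_INDEX_STORAGE_TYPE"] = "file"
os.environ["GRAPHRAG_UPDATE_INDEX_STORAGE_BASE_DIR"] = "update_output"

config = create_graphrag_config(root_dir=".")
assert config.update_index_storage is not None  # update run now enabled
```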
1 change: 1 addition & 0 deletions graphrag/config/defaults.py
@@ -86,6 +86,7 @@
 STORAGE_TYPE = StorageType.file
 SUMMARIZE_DESCRIPTIONS_MAX_LENGTH = 500
 UMAP_ENABLED = False
+UPDATE_STORAGE_BASE_DIR = "update_output"
 
 VECTOR_STORE = f"""
 type: {VectorStoreType.LanceDB.value}
6 changes: 6 additions & 0 deletions graphrag/config/models/graph_rag_config.py
@@ -54,6 +54,12 @@ def __str__(self):
     )
     """The storage configuration."""
 
+    update_index_storage: StorageConfig | None = Field(
+        description="The storage configuration for the updated index.",
+        default=None,
+    )
+    """The storage configuration for the updated index."""
+
     cache: CacheConfig = Field(
         description="The cache configuration.", default=CacheConfig()
     )
12 changes: 12 additions & 0 deletions graphrag/config/resolve_path.py
@@ -183,6 +183,18 @@ def resolve_paths(
         )
     )
 
+    if (
+        config.update_index_storage
+        and config.update_index_storage.type == StorageType.file
+    ):
+        config.update_index_storage.base_dir = str(
+            resolve_path(
+                config.update_index_storage.base_dir,
+                config.root_dir,
+                pattern_or_timestamp_value,
+            )
+        )
+
     if config.reporting.type == ReportingType.file:
         config.reporting.base_dir = str(
             resolve_path(
5 changes: 5 additions & 0 deletions graphrag/index/config/pipeline.py
@@ -47,6 +47,11 @@ def __str__(self):
     )
     """The storage configuration for the pipeline."""
 
+    update_index_storage: PipelineStorageConfigTypes | None = pydantic_Field(
+        default=None, discriminator="type"
+    )
+    """The storage configuration for the updated index."""
+
     cache: PipelineCacheConfigTypes | None = pydantic_Field(
         default=None, discriminator="type"
     )
29 changes: 16 additions & 13 deletions graphrag/index/create_pipeline_config.py
@@ -14,10 +14,7 @@
     StorageType,
     TextEmbeddingTarget,
 )
-from graphrag.config.models import (
-    GraphRagConfig,
-    TextEmbeddingConfig,
-)
+from graphrag.config.models import GraphRagConfig, StorageConfig, TextEmbeddingConfig
 from graphrag.index.config.cache import (
     PipelineBlobCacheConfig,
     PipelineCacheConfigTypes,
@@ -118,7 +115,10 @@ def create_pipeline_config(settings: GraphRagConfig, verbose=False) -> PipelineC
         root_dir=settings.root_dir,
         input=_get_pipeline_input_config(settings),
         reporting=_get_reporting_config(settings),
-        storage=_get_storage_config(settings),
+        storage=_get_storage_config(settings, settings.storage),
+        update_index_storage=_get_storage_config(
+            settings, settings.update_index_storage
+        ),
         cache=_get_cache_config(settings),
         workflows=[
             *_document_workflows(settings, embedded_fields),
@@ -469,23 +469,26 @@ def _get_reporting_config(
 
 def _get_storage_config(
     settings: GraphRagConfig,
-) -> PipelineStorageConfigTypes:
+    storage_settings: StorageConfig | None,
+) -> PipelineStorageConfigTypes | None:
     """Get the storage type from the settings."""
+    if not storage_settings:
+        return None
     root_dir = settings.root_dir
-    match settings.storage.type:
+    match storage_settings.type:
         case StorageType.memory:
             return PipelineMemoryStorageConfig()
         case StorageType.file:
             # relative to the root_dir
-            base_dir = settings.storage.base_dir
+            base_dir = storage_settings.base_dir
             if base_dir is None:
                 msg = "Base directory must be provided for file storage."
                 raise ValueError(msg)
             return PipelineFileStorageConfig(base_dir=str(Path(root_dir) / base_dir))
         case StorageType.blob:
-            connection_string = settings.storage.connection_string
-            storage_account_blob_url = settings.storage.storage_account_blob_url
-            container_name = settings.storage.container_name
+            connection_string = storage_settings.connection_string
+            storage_account_blob_url = storage_settings.storage_account_blob_url
+            container_name = storage_settings.container_name
             if container_name is None:
                 msg = "Container name must be provided for blob storage."
                 raise ValueError(msg)
@@ -495,12 +498,12 @@ def _get_storage_config(
             return PipelineBlobStorageConfig(
                 connection_string=connection_string,
                 container_name=container_name,
-                base_dir=settings.storage.base_dir,
+                base_dir=storage_settings.base_dir,
                 storage_account_blob_url=storage_account_blob_url,
             )
         case _:
             # relative to the root_dir
-            base_dir = settings.storage.base_dir
+            base_dir = storage_settings.base_dir
             if base_dir is None:
                 msg = "Base directory must be provided for file storage."
                 raise ValueError(msg)
10 changes: 10 additions & 0 deletions graphrag/index/flows/create_final_communities.py
@@ -3,6 +3,8 @@
 
 """All the steps to transform final communities."""
 
+from datetime import datetime, timezone
+
 import pandas as pd
 from datashaper import (
     VerbCallbacks,
@@ -61,6 +63,12 @@ def create_final_communities(
 
     filtered["title"] = "Community " + filtered["id"].astype(str)
 
+    # Add period timestamp to the community reports
+    filtered["period"] = datetime.now(timezone.utc).date().isoformat()
+
+    # Add size of the community
+    filtered["size"] = filtered.loc[:, "text_unit_ids"].apply(lambda x: len(x))
+
     return filtered.loc[
         :,
         [
@@ -69,5 +77,7 @@ def create_final_communities(
             "level",
             "relationship_ids",
             "text_unit_ids",
+            "period",
+            "size",
         ],
     ]
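The two new columns feed the time-based community merge added elsewhere in this PR (#1257): `period` stamps each community with the UTC date of the indexing run, and `size` counts its backing text units. A minimal reproduction of the derivation on toy data:

```python
from datetime import datetime, timezone

import pandas as pd

communities = pd.DataFrame({
    "id": [0, 1],
    "text_unit_ids": [["tu-1", "tu-2"], ["tu-3"]],
})

# UTC date of the run, e.g. "2024-10-30"
communities["period"] = datetime.now(timezone.utc).date().isoformat()
# Number of text units backing each community
communities["size"] = communities["text_unit_ids"].apply(len)
print(communities)
```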
11 changes: 10 additions & 1 deletion graphrag/index/flows/create_final_community_reports.py
@@ -42,6 +42,7 @@
 async def create_final_community_reports(
     nodes_input: pd.DataFrame,
     edges_input: pd.DataFrame,
+    communities_input: pd.DataFrame,
     claims_input: pd.DataFrame | None,
     callbacks: VerbCallbacks,
     cache: PipelineCache,
@@ -118,7 +119,15 @@ async def create_final_community_reports(
         embedding_name="community_report_title",
     )
 
-    return community_reports
+    # Merge by community id with communities to add size and period
+    return community_reports.merge(
+        communities_input.loc[:, ["id", "size", "period"]],
+        left_on="community",
+        right_on="id",
+        how="left",
+        copy=False,
+        suffixes=("", "_y"),
+    ).drop(columns=["id_y"])
 
 
 def _prep_nodes(input: pd.DataFrame) -> pd.DataFrame:
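Both frames carry an `id` column (the report id on the left, the community id on the right), so the merge uses `suffixes=("", "_y")` to keep the left names intact and then drops the duplicated right-hand key. A toy illustration of that behavior:

```python
import pandas as pd

reports = pd.DataFrame({"id": ["r1", "r2"], "community": [0, 1]})
communities = pd.DataFrame(
    {"id": [0, 1], "size": [2, 1], "period": ["2024-10-30", "2024-10-30"]}
)

merged = reports.merge(
    communities.loc[:, ["id", "size", "period"]],
    left_on="community",
    right_on="id",
    how="left",
    suffixes=("", "_y"),  # left columns keep their names; right "id" becomes "id_y"
).drop(columns=["id_y"])
print(merged)  # columns: id, community, size, period
```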
2 changes: 1 addition & 1 deletion graphrag/index/graph/extractors/community_reports/prep_community_report_context.py
@@ -58,7 +58,7 @@ def prep_community_report_context(
             invalid_context_df, max_tokens
         )
         set_context_size(invalid_context_df)
-        invalid_context_df[schemas.CONTEXT_EXCEED_FLAG] = 0
+        invalid_context_df.loc[:, schemas.CONTEXT_EXCEED_FLAG] = 0
         return union(valid_context_df, invalid_context_df)
 
     level_context_df = _antijoin_reports(level_context_df, report_df)
6 changes: 4 additions & 2 deletions graphrag/index/graph/extractors/community_reports/utils.py
@@ -13,12 +13,14 @@
 
 def set_context_size(df: pd.DataFrame) -> None:
     """Measure the number of tokens in the context."""
-    df[schemas.CONTEXT_SIZE] = df[schemas.CONTEXT_STRING].apply(lambda x: num_tokens(x))
+    df.loc[:, schemas.CONTEXT_SIZE] = df.loc[:, schemas.CONTEXT_STRING].apply(
+        lambda x: num_tokens(x)
+    )
 
 
 def set_context_exceeds_flag(df: pd.DataFrame, max_tokens: int) -> None:
     """Set a flag to indicate if the context exceeds the limit."""
-    df[schemas.CONTEXT_EXCEED_FLAG] = df[schemas.CONTEXT_SIZE].apply(
+    df.loc[:, schemas.CONTEXT_EXCEED_FLAG] = df.loc[:, schemas.CONTEXT_SIZE].apply(
         lambda x: x > max_tokens
     )
 
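These `.loc` rewrites (here and in prep_community_report_context above) switch to the explicit pandas assignment idiom, which makes the write target unambiguous when the frame may be a slice of another frame and is the usual way to sidestep SettingWithCopyWarning. A self-contained toy version, with a whitespace tokenizer standing in for `num_tokens`:

```python
import pandas as pd

df = pd.DataFrame({"context_string": ["a b c", "d e"]})

# Equivalent to df["context_size"] = ... in the simple case, but .loc names
# the rows and column being written explicitly:
df.loc[:, "context_size"] = df.loc[:, "context_string"].apply(
    lambda s: len(s.split())  # toy stand-in for num_tokens
)
print(df)
```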
6 changes: 6 additions & 0 deletions graphrag/index/init_content.py
@@ -90,6 +90,12 @@
 #     connection_string: <azure_blob_storage_connection_string>
 #     container_name: <azure_blob_storage_container_name>
 
+update_index_storage: # Storage to save an updated index (for incremental indexing). Enabling this performs an incremental index run
+  # type: {defs.STORAGE_TYPE.value} # or blob
+  # base_dir: "{defs.UPDATE_STORAGE_BASE_DIR}"
+  # connection_string: <azure_blob_storage_connection_string>
+  # container_name: <azure_blob_storage_container_name>
+
 reporting:
   type: {defs.REPORTING_TYPE.value} # or console, blob
   base_dir: "{defs.REPORTING_BASE_DIR}"
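Uncommented and filled in, the new section of the generated settings.yaml might look like the sketch below; `update_output` is `UPDATE_STORAGE_BASE_DIR` from defaults.py above, and the blob lines are placeholders:

```yaml
update_index_storage: # enabling this section makes the next index run an incremental update
  type: file # or blob
  base_dir: "update_output"
  # connection_string: <azure_blob_storage_connection_string>  # blob only
  # container_name: <azure_blob_storage_container_name>  # blob only
```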