Improve CLI speed with lazy imports (#1319)

microsoft · Nov 16, 2024 · 22a57d1
1 parent 9b4f24e
commit 22a57d1
Show file tree

Hide file tree

Showing 237 changed files with 936 additions and 1,383 deletions.
diff --git a/.semversioner/next-release/patch-20241025031711368197.json b/.semversioner/next-release/patch-20241025031711368197.json
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "move import statements out of init files"
+}
diff --git a/.semversioner/next-release/patch-20241031180003172666.json b/.semversioner/next-release/patch-20241031180003172666.json
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "fix autocompletion of existing files/directory paths."
+}
diff --git a/docs/prompt_tuning/auto_prompt_tuning.md b/docs/prompt_tuning/auto_prompt_tuning.md
@@ -20,9 +20,9 @@ Before running auto tuning, ensure you have already initialized your workspace w
 You can run the main script from the command line with various options:
 
 ```bash
-graphrag prompt-tune [--root ROOT] [--domain DOMAIN]  [--method METHOD] [--limit LIMIT] [--language LANGUAGE] \
+graphrag prompt-tune [--root ROOT] [--config CONFIG] [--domain DOMAIN]  [--selection-method METHOD] [--limit LIMIT] [--language LANGUAGE] \
 [--max-tokens MAX_TOKENS] [--chunk-size CHUNK_SIZE] [--n-subset-max N_SUBSET_MAX] [--k K] \
-[--min-examples-required MIN_EXAMPLES_REQUIRED] [--no-entity-types] [--output OUTPUT]
+[--min-examples-required MIN_EXAMPLES_REQUIRED] [--discover-entity-types] [--output OUTPUT]
 ```
 
 ## Command-Line Options
@@ -49,7 +49,7 @@ graphrag prompt-tune [--root ROOT] [--domain DOMAIN]  [--method METHOD] [--limit
 
 - `--min-examples-required` (optional): The minimum number of examples required for entity extraction prompts. Default is 2.
 
-- `--no-entity-types` (optional): Use untyped entity extraction generation. We recommend using this when your data covers a lot of topics or it is highly randomized.
+- `--discover-entity-types` (optional): Allow the LLM to discover and extract entities automatically. We recommend using this when your data covers a lot of topics or it is highly randomized.
 
 - `--output` (optional): The folder to save the generated prompts. Default is "prompts".
 

diff --git a/examples/custom_input/run.py b/examples/custom_input/run.py
@@ -5,7 +5,7 @@
 
 import pandas as pd
 
-from graphrag.index import run_pipeline_with_config
+from graphrag.index.run import run_pipeline_with_config
 
 pipeline_file = os.path.join(
     os.path.dirname(os.path.abspath(__file__)), "./pipeline.yml"

diff --git a/examples/single_verb/run.py b/examples/single_verb/run.py
@@ -5,8 +5,8 @@
 
 import pandas as pd
 
-from graphrag.index import run_pipeline, run_pipeline_with_config
-from graphrag.index.config import PipelineWorkflowReference
+from graphrag.index.config.workflow import PipelineWorkflowReference
+from graphrag.index.run import run_pipeline, run_pipeline_with_config
 
 # our fake dataset
 dataset = pd.DataFrame([{"col1": 2, "col2": 4}, {"col1": 5, "col2": 10}])

diff --git a/examples/use_built_in_workflows/run.py b/examples/use_built_in_workflows/run.py
@@ -3,9 +3,10 @@
 import asyncio
 import os
 
-from graphrag.index import run_pipeline, run_pipeline_with_config
-from graphrag.index.config import PipelineCSVInputConfig, PipelineWorkflowReference
-from graphrag.index.input import load_input
+from graphrag.index.config.input import PipelineCSVInputConfig
+from graphrag.index.config.workflow import PipelineWorkflowReference
+from graphrag.index.input.load_input import load_input
+from graphrag.index.run import run_pipeline, run_pipeline_with_config
 
 sample_data_dir = os.path.join(
     os.path.dirname(os.path.abspath(__file__)), "../_sample_data/"

diff --git a/graphrag/__main__.py b/graphrag/__main__.py
@@ -3,6 +3,6 @@
 
 """The GraphRAG package."""
 
-from .cli.main import app
+from graphrag.cli.main import app
 
 app(prog_name="graphrag")
diff --git a/graphrag/api/__init__.py b/graphrag/api/__init__.py
@@ -8,14 +8,15 @@
 """
 
 from graphrag.api.index import build_index
-from graphrag.api.prompt_tune import DocSelectionType, generate_indexing_prompts
+from graphrag.api.prompt_tune import generate_indexing_prompts
 from graphrag.api.query import (
     drift_search,
     global_search,
     global_search_streaming,
     local_search,
     local_search_streaming,
 )
+from graphrag.prompt_tune.types import DocSelectionType
 
 __all__ = [  # noqa: RUF022
     # index API

diff --git a/graphrag/api/index.py b/graphrag/api/index.py
@@ -10,13 +10,14 @@
 
 from pathlib import Path
 
-from graphrag.config import CacheType, GraphRagConfig
+from graphrag.config.enums import CacheType
+from graphrag.config.models.graph_rag_config import GraphRagConfig
 from graphrag.index.cache.noop_pipeline_cache import NoopPipelineCache
 from graphrag.index.create_pipeline_config import create_pipeline_config
 from graphrag.index.emit.types import TableEmitterType
 from graphrag.index.run import run_pipeline_with_config
 from graphrag.index.typing import PipelineRunResult
-from graphrag.logging import ProgressReporter
+from graphrag.logging.base import ProgressReporter
 from graphrag.vector_stores.factory import VectorStoreType
 
 

diff --git a/graphrag/api/prompt_tune.py b/graphrag/api/prompt_tune.py
@@ -15,25 +15,32 @@
 from pydantic import PositiveInt, validate_call
 
 from graphrag.config.models.graph_rag_config import GraphRagConfig
-from graphrag.index.llm import load_llm
-from graphrag.logging import PrintProgressReporter
-from graphrag.prompt_tune.generator import (
-    MAX_TOKEN_COUNT,
-    create_community_summarization_prompt,
-    create_entity_extraction_prompt,
-    create_entity_summarization_prompt,
-    detect_language,
+from graphrag.index.llm.load_llm import load_llm
+from graphrag.logging.print_progress import PrintProgressReporter
+from graphrag.prompt_tune.defaults import MAX_TOKEN_COUNT
+from graphrag.prompt_tune.generator.community_report_rating import (
     generate_community_report_rating,
+)
+from graphrag.prompt_tune.generator.community_report_summarization import (
+    create_community_summarization_prompt,
+)
+from graphrag.prompt_tune.generator.community_reporter_role import (
     generate_community_reporter_role,
-    generate_domain,
+)
+from graphrag.prompt_tune.generator.domain import generate_domain
+from graphrag.prompt_tune.generator.entity_extraction_prompt import (
+    create_entity_extraction_prompt,
+)
+from graphrag.prompt_tune.generator.entity_relationship import (
     generate_entity_relationship_examples,
-    generate_entity_types,
-    generate_persona,
 )
-from graphrag.prompt_tune.loader import (
-    MIN_CHUNK_SIZE,
-    load_docs_in_chunks,
+from graphrag.prompt_tune.generator.entity_summarization_prompt import (
+    create_entity_summarization_prompt,
 )
+from graphrag.prompt_tune.generator.entity_types import generate_entity_types
+from graphrag.prompt_tune.generator.language import detect_language
+from graphrag.prompt_tune.generator.persona import generate_persona
+from graphrag.prompt_tune.loader.input import MIN_CHUNK_SIZE, load_docs_in_chunks
 from graphrag.prompt_tune.types import DocSelectionType
 
 

diff --git a/graphrag/api/query.py b/graphrag/api/query.py
@@ -24,12 +24,12 @@
 import pandas as pd
 from pydantic import validate_call
 
-from graphrag.config import GraphRagConfig
+from graphrag.config.models.graph_rag_config import GraphRagConfig
 from graphrag.index.config.embeddings import (
     community_full_content_embedding,
     entity_description_embedding,
 )
-from graphrag.logging import PrintProgressReporter
+from graphrag.logging.print_progress import PrintProgressReporter
 from graphrag.query.factories import (
     get_drift_search_engine,
     get_global_search_engine,
@@ -47,8 +47,8 @@
 from graphrag.query.structured_search.base import SearchResult  # noqa: TCH001
 from graphrag.utils.cli import redact
 from graphrag.utils.embeddings import create_collection_name
-from graphrag.vector_stores import VectorStoreFactory, VectorStoreType
 from graphrag.vector_stores.base import BaseVectorStore
+from graphrag.vector_stores.factory import VectorStoreFactory, VectorStoreType
 
 reporter = PrintProgressReporter("")
 

diff --git a/graphrag/callbacks/factories.py b/graphrag/callbacks/factories.py
@@ -8,17 +8,16 @@
 
 from datashaper import WorkflowCallbacks
 
-from graphrag.config import ReportingType
-from graphrag.index.config import (
+from graphrag.callbacks.blob_workflow_callbacks import BlobWorkflowCallbacks
+from graphrag.callbacks.console_workflow_callbacks import ConsoleWorkflowCallbacks
+from graphrag.callbacks.file_workflow_callbacks import FileWorkflowCallbacks
+from graphrag.config.enums import ReportingType
+from graphrag.index.config.reporting import (
     PipelineBlobReportingConfig,
     PipelineFileReportingConfig,
     PipelineReportingConfig,
 )
 
-from .blob_workflow_callbacks import BlobWorkflowCallbacks
-from .console_workflow_callbacks import ConsoleWorkflowCallbacks
-from .file_workflow_callbacks import FileWorkflowCallbacks
-
 
 def create_pipeline_reporter(
     config: PipelineReportingConfig | None, root_dir: str | None

diff --git a/graphrag/callbacks/global_search_callbacks.py b/graphrag/callbacks/global_search_callbacks.py
@@ -3,10 +3,9 @@
 
 """GlobalSearch LLM Callbacks."""
 
+from graphrag.callbacks.llm_callbacks import BaseLLMCallback
 from graphrag.query.structured_search.base import SearchResult
 
-from .llm_callbacks import BaseLLMCallback
-
 
 class GlobalSearchLLMCallback(BaseLLMCallback):
     """GlobalSearch LLM Callbacks."""

diff --git a/graphrag/callbacks/progress_workflow_callbacks.py b/graphrag/callbacks/progress_workflow_callbacks.py
@@ -7,7 +7,7 @@
 
 from datashaper import ExecutionNode, NoopWorkflowCallbacks, Progress, TableContainer
 
-from graphrag.logging import ProgressReporter
+from graphrag.logging.base import ProgressReporter
 
 
 class ProgressWorkflowCallbacks(NoopWorkflowCallbacks):

diff --git a/graphrag/cli/index.py b/graphrag/cli/index.py
@@ -11,15 +11,15 @@
 from pathlib import Path
 
 import graphrag.api as api
-from graphrag.config import (
-    CacheType,
-    enable_logging_with_config,
-    load_config,
-    resolve_paths,
-)
+from graphrag.config.enums import CacheType
+from graphrag.config.load_config import load_config
+from graphrag.config.logging import enable_logging_with_config
+from graphrag.config.resolve_path import resolve_paths
 from graphrag.index.emit.types import TableEmitterType
 from graphrag.index.validate_config import validate_config_names
-from graphrag.logging import ProgressReporter, ReporterType, create_progress_reporter
+from graphrag.logging.base import ProgressReporter
+from graphrag.logging.factories import create_progress_reporter
+from graphrag.logging.types import ReporterType
 from graphrag.utils.cli import redact
 
 # Ignore warnings from numba

diff --git a/graphrag/cli/initialize.py b/graphrag/cli/initialize.py
@@ -6,7 +6,8 @@
 from pathlib import Path
 
 from graphrag.config.init_content import INIT_DOTENV, INIT_YAML
-from graphrag.logging import ReporterType, create_progress_reporter
+from graphrag.logging.factories import create_progress_reporter
+from graphrag.logging.types import ReporterType
 from graphrag.prompts.index.claim_extraction import CLAIM_EXTRACTION_PROMPT
 from graphrag.prompts.index.community_report import (
     COMMUNITY_REPORT_PROMPT,