Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .semversioner/next-release/minor-20250122234420705255.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "minor",
"description": "Add NLP graph extraction."
}
8 changes: 7 additions & 1 deletion dictionary.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,17 @@ cosmosdb
Hnsw
odata

# NLTK Terms
# NLP Terms
chunker
wordnet
maxent
punkt
punct
lemmatizer
PROPN
Syntatic
ents
INTJ

# Libraries
Langchain
Expand Down
15 changes: 9 additions & 6 deletions graphrag/api/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,19 @@
from graphrag.cache.noop_pipeline_cache import NoopPipelineCache
from graphrag.callbacks.reporting import create_pipeline_reporter
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
from graphrag.config.enums import CacheType
from graphrag.config.enums import CacheType, IndexingMethod
from graphrag.config.models.graph_rag_config import GraphRagConfig
from graphrag.index.run.run_workflows import run_workflows
from graphrag.index.run.run_pipeline import run_pipeline
from graphrag.index.typing import PipelineRunResult
from graphrag.index.workflows.factory import create_pipeline
from graphrag.logger.base import ProgressLogger
from graphrag.utils.api import get_workflows_list

log = logging.getLogger(__name__)


async def build_index(
config: GraphRagConfig,
method: IndexingMethod = IndexingMethod.Standard,
memory_profile: bool = False,
callbacks: list[WorkflowCallbacks] | None = None,
progress_logger: ProgressLogger | None = None,
Expand All @@ -35,6 +36,8 @@ async def build_index(
----------
config : GraphRagConfig
The configuration.
method : IndexingMethod default=IndexingMethod.Standard
Style of indexing to perform (full LLM, NLP + LLM, etc.).
memory_profile : bool
Whether to enable memory profiling.
callbacks : list[WorkflowCallbacks] | None default=None
Expand All @@ -61,10 +64,10 @@ async def build_index(
if memory_profile:
log.warning("New pipeline does not yet support memory profiling.")

workflows = get_workflows_list(config)
pipeline = create_pipeline(config, method)

async for output in run_workflows(
workflows,
async for output in run_pipeline(
pipeline,
config,
cache=pipeline_cache,
callbacks=callbacks,
Expand Down
8 changes: 7 additions & 1 deletion graphrag/cli/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pathlib import Path

import graphrag.api as api
from graphrag.config.enums import CacheType
from graphrag.config.enums import CacheType, IndexingMethod
from graphrag.config.load_config import load_config
from graphrag.config.logging import enable_logging_with_config
from graphrag.index.validate_config import validate_config_names
Expand Down Expand Up @@ -63,6 +63,7 @@ def handle_signal(signum, _):

def index_cli(
root_dir: Path,
method: IndexingMethod,
verbose: bool,
memprofile: bool,
cache: bool,
Expand All @@ -81,6 +82,7 @@ def index_cli(

_run_index(
config=config,
method=method,
verbose=verbose,
memprofile=memprofile,
cache=cache,
Expand All @@ -92,6 +94,7 @@ def index_cli(

def update_cli(
root_dir: Path,
method: IndexingMethod,
verbose: bool,
memprofile: bool,
cache: bool,
Expand Down Expand Up @@ -119,6 +122,7 @@ def update_cli(

_run_index(
config=config,
method=method,
verbose=verbose,
memprofile=memprofile,
cache=cache,
Expand All @@ -130,6 +134,7 @@ def update_cli(

def _run_index(
config,
method,
verbose,
memprofile,
cache,
Expand Down Expand Up @@ -170,6 +175,7 @@ def _run_index(
outputs = asyncio.run(
api.build_index(
config=config,
method=method,
memory_profile=memprofile,
progress_logger=progress_logger,
)
Expand Down
33 changes: 14 additions & 19 deletions graphrag/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
import os
import re
from collections.abc import Callable
from enum import Enum
from pathlib import Path
from typing import Annotated

import typer

from graphrag.config.enums import IndexingMethod, SearchMethod
from graphrag.logger.types import LoggerType
from graphrag.prompt_tune.defaults import (
MAX_TOKEN_COUNT,
Expand Down Expand Up @@ -82,19 +82,6 @@ def completer(incomplete: str) -> list[str]:
return completer


class SearchType(Enum):
"""The type of search to run."""

LOCAL = "local"
GLOBAL = "global"
DRIFT = "drift"
BASIC = "basic"

def __str__(self):
"""Return the string representation of the enum value."""
return self.value


@app.command("init")
def _initialize_cli(
root: Annotated[
Expand Down Expand Up @@ -141,6 +128,9 @@ def _index_cli(
),
),
] = Path(), # set default to current directory
method: Annotated[
IndexingMethod, typer.Option(help="The indexing method to use.")
] = IndexingMethod.Standard,
verbose: Annotated[
bool, typer.Option(help="Run the indexing pipeline with verbose logging")
] = False,
Expand Down Expand Up @@ -186,6 +176,7 @@ def _index_cli(
dry_run=dry_run,
skip_validation=skip_validation,
output_dir=output,
method=method,
)


Expand All @@ -207,6 +198,9 @@ def _update_cli(
resolve_path=True,
),
] = Path(), # set default to current directory
method: Annotated[
IndexingMethod, typer.Option(help="The indexing method to use.")
] = IndexingMethod.Standard,
verbose: Annotated[
bool, typer.Option(help="Run the indexing pipeline with verbose logging")
] = False,
Expand Down Expand Up @@ -249,6 +243,7 @@ def _update_cli(
config_filepath=config,
skip_validation=skip_validation,
output_dir=output,
method=method,
)


Expand Down Expand Up @@ -364,7 +359,7 @@ def _prompt_tune_cli(

@app.command("query")
def _query_cli(
method: Annotated[SearchType, typer.Option(help="The query algorithm to use.")],
method: Annotated[SearchMethod, typer.Option(help="The query algorithm to use.")],
query: Annotated[str, typer.Option(help="The query to execute.")],
config: Annotated[
Path | None,
Expand Down Expand Up @@ -433,7 +428,7 @@ def _query_cli(
)

match method:
case SearchType.LOCAL:
case SearchMethod.LOCAL:
run_local_search(
config_filepath=config,
data_dir=data,
Expand All @@ -443,7 +438,7 @@ def _query_cli(
streaming=streaming,
query=query,
)
case SearchType.GLOBAL:
case SearchMethod.GLOBAL:
run_global_search(
config_filepath=config,
data_dir=data,
Expand All @@ -454,7 +449,7 @@ def _query_cli(
streaming=streaming,
query=query,
)
case SearchType.DRIFT:
case SearchMethod.DRIFT:
run_drift_search(
config_filepath=config,
data_dir=data,
Expand All @@ -464,7 +459,7 @@ def _query_cli(
response_type=response_type,
query=query,
)
case SearchType.BASIC:
case SearchMethod.BASIC:
run_basic_search(
config_filepath=config,
data_dir=data,
Expand Down
49 changes: 46 additions & 3 deletions graphrag/config/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
InputFileType,
InputType,
LLMType,
NounPhraseExtractorType,
OutputType,
ReportingType,
TextEmbeddingTarget,
Expand Down Expand Up @@ -42,8 +43,11 @@
LLM_SLEEP_ON_RATE_LIMIT_RECOMMENDATION = True
LLM_CONCURRENT_REQUESTS = 25

PARALLELIZATION_STAGGER = 0.3
PARALLELIZATION_NUM_THREADS = 50

#
# Text Embedding Parameters
# Text embedding
#
EMBEDDING_TYPE = LLMType.OpenAIEmbedding
EMBEDDING_MODEL = "text-embedding-3-small"
Expand All @@ -52,36 +56,67 @@
EMBEDDING_TARGET = TextEmbeddingTarget.required
EMBEDDING_MODEL_ID = DEFAULT_EMBEDDING_MODEL_ID

# LLM response caching
CACHE_TYPE = CacheType.file
CACHE_BASE_DIR = "cache"

# Text chunking
CHUNK_SIZE = 1200
CHUNK_OVERLAP = 100
CHUNK_GROUP_BY_COLUMNS = ["id"]
CHUNK_STRATEGY = ChunkStrategyType.tokens

# Claim extraction
CLAIM_DESCRIPTION = (
"Any claims or facts that could be relevant to information discovery."
)
CLAIM_MAX_GLEANINGS = 1
CLAIM_EXTRACTION_ENABLED = False
CLAIM_EXTRACTION_MODEL_ID = DEFAULT_CHAT_MODEL_ID

# Graph clustering
MAX_CLUSTER_SIZE = 10
USE_LCC = True
CLUSTER_GRAPH_SEED = 0xDEADBEEF

# Community report summarization
COMMUNITY_REPORT_MAX_LENGTH = 2000
COMMUNITY_REPORT_MAX_INPUT_LENGTH = 8000
COMMUNITY_REPORT_MODEL_ID = DEFAULT_CHAT_MODEL_ID

# Graph extraction via LLM
ENTITY_EXTRACTION_ENTITY_TYPES = ["organization", "person", "geo", "event"]
ENTITY_EXTRACTION_MAX_GLEANINGS = 1
ENTITY_EXTRACTION_MODEL_ID = DEFAULT_CHAT_MODEL_ID

# Graph extraction via NLP
NLP_NORMALIZE_EDGE_WEIGHTS = True
NLP_EXTRACTOR_TYPE = NounPhraseExtractorType.RegexEnglish
NLP_MAX_WORD_LENGTH = 15
NLP_MODEL_NAME = "en_core_web_md"
NLP_EXCLUDE_NOUNS = None
NLP_WORD_DELIMITER = " "
NLP_INCLUDE_NAMED_ENTITIES = True
NLP_EXCLUDE_ENTITY_TAGS = ["DATE"]
NLP_EXCLUDE_POS_TAGS = ["DET", "PRON", "INTJ", "X"]
NLP_NOUN_PHRASE_TAGS = ["PROPN", "NOUNS"]
NLP_NOUN_PHRASE_CFG = {
"PROPN,PROPN": "PROPN",
"NOUN,NOUN": "NOUNS",
"NOUNS,NOUN": "NOUNS",
"ADJ,ADJ": "ADJ",
"ADJ,NOUN": "NOUNS",
}

# Input file params
INPUT_FILE_TYPE = InputFileType.text
INPUT_TYPE = InputType.file
INPUT_BASE_DIR = "input"
INPUT_FILE_ENCODING = "utf-8"
INPUT_TEXT_COLUMN = "text"
INPUT_CSV_PATTERN = ".*\\.csv$"
INPUT_TEXT_PATTERN = ".*\\.txt$"
PARALLELIZATION_STAGGER = 0.3
PARALLELIZATION_NUM_THREADS = 50

NODE2VEC_ENABLED = False
NODE2VEC_DIMENSIONS = 1536
NODE2VEC_NUM_WALKS = 10
Expand All @@ -101,6 +136,14 @@
UMAP_ENABLED = False
UPDATE_OUTPUT_BASE_DIR = "update_output"

# Graph Pruning
PRUNE_MIN_NODE_FREQ = 2
PRUNE_MAX_NODE_FREQ_STD = None
PRUNE_MIN_NODE_DEGREE = 1
PRUNE_MAX_NODE_DEGREE_STD = None
PRUNE_MIN_EDGE_WEIGHT_PCT = 40
PRUNE_REMOVE_EGO_NODES = False
PRUNE_LCC_ONLY = False

VECTOR_STORE_TYPE = VectorStoreType.LanceDB.value
VECTOR_STORE_DB_URI = str(Path(OUTPUT_BASE_DIR) / "lancedb")
Expand Down
33 changes: 33 additions & 0 deletions graphrag/config/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,36 @@ class ChunkStrategyType(str, Enum):
def __repr__(self):
    """Return the enum's value wrapped in double quotes (e.g. '"tokens"')."""
    return f'"{self.value}"'


class SearchMethod(Enum):
    """Enumeration of the available query algorithms."""

    LOCAL = "local"
    GLOBAL = "global"
    DRIFT = "drift"
    BASIC = "basic"

    def __str__(self):
        """Render this member as its underlying string value."""
        return str(self.value)


class IndexingMethod(str, Enum):
    """Available styles of index construction."""

    Standard = "standard"
    """Classic GraphRAG indexing: a language model performs all graph construction and summarization."""
    Fast = "fast"
    """Accelerated indexing: NLP builds the graph while a language model handles summarization."""


class NounPhraseExtractorType(str, Enum):
    """Available noun phrase extractor implementations."""

    RegexEnglish = "regex_english"
    """Regex-based extractor. The fastest option, but limited to English."""
    Syntactic = "syntactic_parser"
    """Extractor built on SpaCy dependency parsing combined with NER."""
    CFG = "cfg"
    """Extractor pairing CFG-driven noun-chunk detection with NER."""
Loading
Loading