Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .semversioner/next-release/minor-20250122234420705255.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "minor",
"description": "Add NLP graph extraction."
}
8 changes: 7 additions & 1 deletion dictionary.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,17 @@ cosmosdb
Hnsw
odata

# NLTK Terms
# NLP Terms
chunker
wordnet
maxent
punkt
punct
lemmatizer
PROPN
Syntatic
ents
INTJ

# Libraries
Langchain
Expand Down
15 changes: 9 additions & 6 deletions graphrag/api/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,19 @@
from graphrag.cache.noop_pipeline_cache import NoopPipelineCache
from graphrag.callbacks.reporting import create_pipeline_reporter
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
from graphrag.config.enums import CacheType
from graphrag.config.enums import CacheType, IndexingMethod
from graphrag.config.models.graph_rag_config import GraphRagConfig
from graphrag.index.run.run_workflows import run_workflows
from graphrag.index.run.run_pipeline import run_pipeline
from graphrag.index.typing import PipelineRunResult
from graphrag.index.workflows.factory import create_pipeline
from graphrag.logger.base import ProgressLogger
from graphrag.utils.api import get_workflows_list

log = logging.getLogger(__name__)


async def build_index(
config: GraphRagConfig,
method: IndexingMethod = IndexingMethod.Standard,
memory_profile: bool = False,
callbacks: list[WorkflowCallbacks] | None = None,
progress_logger: ProgressLogger | None = None,
Expand All @@ -35,6 +36,8 @@ async def build_index(
----------
config : GraphRagConfig
The configuration.
method : IndexingMethod default=IndexingMethod.Standard
Style of indexing to perform (full LLM, NLP + LLM, etc.).
memory_profile : bool
Whether to enable memory profiling.
callbacks : list[WorkflowCallbacks] | None default=None
Expand All @@ -61,10 +64,10 @@ async def build_index(
if memory_profile:
log.warning("New pipeline does not yet support memory profiling.")

workflows = get_workflows_list(config)
pipeline = create_pipeline(config, method)

async for output in run_workflows(
workflows,
async for output in run_pipeline(
pipeline,
config,
cache=pipeline_cache,
callbacks=callbacks,
Expand Down
8 changes: 7 additions & 1 deletion graphrag/cli/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pathlib import Path

import graphrag.api as api
from graphrag.config.enums import CacheType
from graphrag.config.enums import CacheType, IndexingMethod
from graphrag.config.load_config import load_config
from graphrag.config.logging import enable_logging_with_config
from graphrag.index.validate_config import validate_config_names
Expand Down Expand Up @@ -63,6 +63,7 @@ def handle_signal(signum, _):

def index_cli(
root_dir: Path,
method: IndexingMethod,
verbose: bool,
memprofile: bool,
cache: bool,
Expand All @@ -81,6 +82,7 @@ def index_cli(

_run_index(
config=config,
method=method,
verbose=verbose,
memprofile=memprofile,
cache=cache,
Expand All @@ -92,6 +94,7 @@ def index_cli(

def update_cli(
root_dir: Path,
method: IndexingMethod,
verbose: bool,
memprofile: bool,
cache: bool,
Expand Down Expand Up @@ -119,6 +122,7 @@ def update_cli(

_run_index(
config=config,
method=method,
verbose=verbose,
memprofile=memprofile,
cache=cache,
Expand All @@ -130,6 +134,7 @@ def update_cli(

def _run_index(
config,
method,
verbose,
memprofile,
cache,
Expand Down Expand Up @@ -170,6 +175,7 @@ def _run_index(
outputs = asyncio.run(
api.build_index(
config=config,
method=method,
memory_profile=memprofile,
progress_logger=progress_logger,
)
Expand Down
33 changes: 14 additions & 19 deletions graphrag/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
import os
import re
from collections.abc import Callable
from enum import Enum
from pathlib import Path
from typing import Annotated

import typer

from graphrag.config.enums import IndexingMethod, SearchMethod
from graphrag.logger.types import LoggerType
from graphrag.prompt_tune.defaults import (
MAX_TOKEN_COUNT,
Expand Down Expand Up @@ -82,19 +82,6 @@ def completer(incomplete: str) -> list[str]:
return completer


class SearchType(Enum):
"""The type of search to run."""

LOCAL = "local"
GLOBAL = "global"
DRIFT = "drift"
BASIC = "basic"

def __str__(self):
"""Return the string representation of the enum value."""
return self.value


@app.command("init")
def _initialize_cli(
root: Annotated[
Expand Down Expand Up @@ -141,6 +128,9 @@ def _index_cli(
),
),
] = Path(), # set default to current directory
method: Annotated[
IndexingMethod, typer.Option(help="The indexing method to use.")
] = IndexingMethod.Standard,
verbose: Annotated[
bool, typer.Option(help="Run the indexing pipeline with verbose logging")
] = False,
Expand Down Expand Up @@ -186,6 +176,7 @@ def _index_cli(
dry_run=dry_run,
skip_validation=skip_validation,
output_dir=output,
method=method,
)


Expand All @@ -207,6 +198,9 @@ def _update_cli(
resolve_path=True,
),
] = Path(), # set default to current directory
method: Annotated[
IndexingMethod, typer.Option(help="The indexing method to use.")
] = IndexingMethod.Standard,
verbose: Annotated[
bool, typer.Option(help="Run the indexing pipeline with verbose logging")
] = False,
Expand Down Expand Up @@ -249,6 +243,7 @@ def _update_cli(
config_filepath=config,
skip_validation=skip_validation,
output_dir=output,
method=method,
)


Expand Down Expand Up @@ -364,7 +359,7 @@ def _prompt_tune_cli(

@app.command("query")
def _query_cli(
method: Annotated[SearchType, typer.Option(help="The query algorithm to use.")],
method: Annotated[SearchMethod, typer.Option(help="The query algorithm to use.")],
query: Annotated[str, typer.Option(help="The query to execute.")],
config: Annotated[
Path | None,
Expand Down Expand Up @@ -433,7 +428,7 @@ def _query_cli(
)

match method:
case SearchType.LOCAL:
case SearchMethod.LOCAL:
run_local_search(
config_filepath=config,
data_dir=data,
Expand All @@ -443,7 +438,7 @@ def _query_cli(
streaming=streaming,
query=query,
)
case SearchType.GLOBAL:
case SearchMethod.GLOBAL:
run_global_search(
config_filepath=config,
data_dir=data,
Expand All @@ -454,7 +449,7 @@ def _query_cli(
streaming=streaming,
query=query,
)
case SearchType.DRIFT:
case SearchMethod.DRIFT:
run_drift_search(
config_filepath=config,
data_dir=data,
Expand All @@ -464,7 +459,7 @@ def _query_cli(
response_type=response_type,
query=query,
)
case SearchType.BASIC:
case SearchMethod.BASIC:
run_basic_search(
config_filepath=config,
data_dir=data,
Expand Down
49 changes: 46 additions & 3 deletions graphrag/config/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
InputFileType,
InputType,
LLMType,
NounPhraseExtractorType,
OutputType,
ReportingType,
TextEmbeddingTarget,
Expand Down Expand Up @@ -42,8 +43,11 @@
LLM_SLEEP_ON_RATE_LIMIT_RECOMMENDATION = True
LLM_CONCURRENT_REQUESTS = 25

PARALLELIZATION_STAGGER = 0.3
PARALLELIZATION_NUM_THREADS = 50

#
# Text Embedding Parameters
# Text embedding
#
EMBEDDING_TYPE = LLMType.OpenAIEmbedding
EMBEDDING_MODEL = "text-embedding-3-small"
Expand All @@ -52,36 +56,67 @@
EMBEDDING_TARGET = TextEmbeddingTarget.required
EMBEDDING_MODEL_ID = DEFAULT_EMBEDDING_MODEL_ID

# LLM response caching
CACHE_TYPE = CacheType.file
CACHE_BASE_DIR = "cache"

# Text chunking
CHUNK_SIZE = 1200
CHUNK_OVERLAP = 100
CHUNK_GROUP_BY_COLUMNS = ["id"]
CHUNK_STRATEGY = ChunkStrategyType.tokens

# Claim extraction
CLAIM_DESCRIPTION = (
"Any claims or facts that could be relevant to information discovery."
)
CLAIM_MAX_GLEANINGS = 1
CLAIM_EXTRACTION_ENABLED = False
CLAIM_EXTRACTION_MODEL_ID = DEFAULT_CHAT_MODEL_ID

# Graph clustering
MAX_CLUSTER_SIZE = 10
USE_LCC = True
CLUSTER_GRAPH_SEED = 0xDEADBEEF

# Community report summarization
COMMUNITY_REPORT_MAX_LENGTH = 2000
COMMUNITY_REPORT_MAX_INPUT_LENGTH = 8000
COMMUNITY_REPORT_MODEL_ID = DEFAULT_CHAT_MODEL_ID

# Graph extraction via LLM
ENTITY_EXTRACTION_ENTITY_TYPES = ["organization", "person", "geo", "event"]
ENTITY_EXTRACTION_MAX_GLEANINGS = 1
ENTITY_EXTRACTION_MODEL_ID = DEFAULT_CHAT_MODEL_ID

# Graph extraction via NLP
NLP_NORMALIZE_EDGE_WEIGHTS = True
NLP_EXTRACTOR_TYPE = NounPhraseExtractorType.RegexEnglish
NLP_MAX_WORD_LENGTH = 15
NLP_MODEL_NAME = "en_core_web_md"
NLP_EXCLUDE_NOUNS = None
NLP_WORD_DELIMITER = " "
NLP_INCLUDE_NAMED_ENTITIES = True
NLP_EXCLUDE_ENTITY_TAGS = ["DATE"]
NLP_EXCLUDE_POS_TAGS = ["DET", "PRON", "INTJ", "X"]
NLP_NOUN_PHRASE_TAGS = ["PROPN", "NOUNS"]
NLP_NOUN_PHRASE_CFG = {
"PROPN,PROPN": "PROPN",
"NOUN,NOUN": "NOUNS",
"NOUNS,NOUN": "NOUNS",
"ADJ,ADJ": "ADJ",
"ADJ,NOUN": "NOUNS",
}

# Input file params
INPUT_FILE_TYPE = InputFileType.text
INPUT_TYPE = InputType.file
INPUT_BASE_DIR = "input"
INPUT_FILE_ENCODING = "utf-8"
INPUT_TEXT_COLUMN = "text"
INPUT_CSV_PATTERN = ".*\\.csv$"
INPUT_TEXT_PATTERN = ".*\\.txt$"
PARALLELIZATION_STAGGER = 0.3
PARALLELIZATION_NUM_THREADS = 50

NODE2VEC_ENABLED = False
NODE2VEC_DIMENSIONS = 1536
NODE2VEC_NUM_WALKS = 10
Expand All @@ -101,6 +136,14 @@
UMAP_ENABLED = False
UPDATE_OUTPUT_BASE_DIR = "update_output"

# Graph Pruning
PRUNE_MIN_NODE_FREQ = 2
PRUNE_MAX_NODE_FREQ_STD = None
PRUNE_MIN_NODE_DEGREE = 1
PRUNE_MAX_NODE_DEGREE_STD = None
PRUNE_MIN_EDGE_WEIGHT_PCT = 40
PRUNE_REMOVE_EGO_NODES = False
PRUNE_LCC_ONLY = False

VECTOR_STORE_TYPE = VectorStoreType.LanceDB.value
VECTOR_STORE_DB_URI = str(Path(OUTPUT_BASE_DIR) / "lancedb")
Expand Down
33 changes: 33 additions & 0 deletions graphrag/config/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,36 @@ class ChunkStrategyType(str, Enum):
def __repr__(self):
    """Return the enum's value wrapped in double quotes (e.g. '"tokens"')."""
    return f'"{self.value}"'


class SearchMethod(Enum):
    """Enumeration of the available query algorithms."""

    LOCAL = "local"
    GLOBAL = "global"
    DRIFT = "drift"
    BASIC = "basic"

    def __str__(self):
        """Render this member as its underlying string value."""
        return str(self.value)


class IndexingMethod(str, Enum):
    """Available styles of index construction."""

    Standard = "standard"
    """Classic GraphRAG indexing: a language model performs all graph construction and summarization."""
    Fast = "fast"
    """Accelerated indexing: NLP builds the graph while a language model handles summarization."""


class NounPhraseExtractorType(str, Enum):
    """Available noun phrase extractor implementations."""

    RegexEnglish = "regex_english"
    """Regex-based extractor. The fastest option, but limited to English."""
    Syntactic = "syntactic_parser"
    """Extractor built on SpaCy dependency parsing combined with NER."""
    CFG = "cfg"
    """Extractor pairing CFG-driven noun-chunk detection with NER."""
Loading
Loading