Fix logic: pass the entities table to create_communities and build the title-to-id mapping there

dayesouza · dayesouza · commit b18ddbcb6131 · 2026-02-19T14:52:39.000-03:00
diff --git a/packages/graphrag/graphrag/index/workflows/create_communities.py b/packages/graphrag/graphrag/index/workflows/create_communities.py
@@ -31,19 +31,17 @@ async def run_workflow(
     reader = DataReader(context.output_table_provider)
     relationships = await reader.relationships()
 
-    title_to_entity_id: dict[str, str] = {}
-    async with context.output_table_provider.open("entities") as entities_table:
-        async for row in entities_table:
-            title_to_entity_id[row["title"]] = row["id"]
-
     max_cluster_size = config.cluster_graph.max_cluster_size
     use_lcc = config.cluster_graph.use_lcc
     seed = config.cluster_graph.seed
 
-    async with context.output_table_provider.open("communities") as communities_table:
+    async with (
+        context.output_table_provider.open("entities") as entities_table,
+        context.output_table_provider.open("communities") as communities_table,
+    ):
         sample_rows = await create_communities(
             communities_table,
-            title_to_entity_id,
+            entities_table,
             relationships,
             max_cluster_size=max_cluster_size,
             use_lcc=use_lcc,
@@ -56,7 +54,7 @@ async def run_workflow(
 
 async def create_communities(
     communities_table: Table,
-    title_to_entity_id: dict[str, str],
+    entities_table: Table,
     relationships: pd.DataFrame,
     max_cluster_size: int,
     use_lcc: bool,
@@ -68,8 +66,8 @@ async def create_communities(
     ----
         communities_table: Table
             Output table to write community rows to.
-        title_to_entity_id: dict[str, str]
-            Mapping of entity title to entity id.
+        entities_table: Table
+            Table containing entity rows.
         relationships: pd.DataFrame
             Relationships DataFrame with source, target, weight,
             text_unit_ids columns.
@@ -92,6 +90,10 @@ async def create_communities(
         seed=seed,
     )
 
+    title_to_entity_id: dict[str, str] = {}
+    async for row in entities_table:
+        title_to_entity_id[row["title"]] = row["id"]
+
     communities = pd.DataFrame(
         clusters, columns=pd.Index(["level", "community", "parent", "title"])
     ).explode("title")
diff --git a/tests/unit/indexing/test_create_communities.py b/tests/unit/indexing/test_create_communities.py
@@ -20,6 +20,7 @@
     create_communities,
 )
 from graphrag_storage.tables.csv_table import CSVTable
+from graphrag_storage.tables.table import Table
 
 
 class FakeTable(CSVTable):
@@ -33,15 +34,55 @@ async def write(self, row: dict[str, Any]) -> None:
         self.rows.append(row)
 
 
+class FakeEntitiesTable(Table):
+    """In-memory read-only table that yields the given rows via async iteration."""
+
+    def __init__(self, rows: list[dict[str, Any]]) -> None:
+        self._rows = rows
+        self._index = 0
+
+    def __aiter__(self):
+        """Reset the cursor to the first row and return self as the async iterator."""
+        self._index = 0
+        return self
+
+    async def __anext__(self) -> dict[str, Any]:
+        """Return the next row, or raise StopAsyncIteration once all rows are consumed."""
+        if self._index >= len(self._rows):
+            raise StopAsyncIteration
+        row = self._rows[self._index]
+        self._index += 1
+        return row
+
+    async def length(self) -> int:
+        """Return the number of rows held in memory."""
+        return len(self._rows)
+
+    async def has(self, row_id: str) -> bool:
+        """Return True if any row's "id" value equals row_id."""
+        return any(r.get("id") == row_id for r in self._rows)
+
+    async def write(self, row: dict[str, Any]) -> None:
+        """Always raise NotImplementedError; this table is read-only."""
+        raise NotImplementedError
+
+    async def close(self) -> None:
+        """No-op; there is nothing to release."""
+
+
 async def _run_create_communities(
     title_to_entity_id: dict[str, str],
     relationships: pd.DataFrame,
     **kwargs: Any,
 ) -> pd.DataFrame:
-    """Helper that runs create_communities with a FakeTable and returns all rows as a DataFrame."""
-    table = FakeTable()
-    await create_communities(table, title_to_entity_id, relationships, **kwargs)
-    return pd.DataFrame(table.rows)
+    """Helper that runs create_communities with fake tables and returns the rows written to the communities table as a DataFrame."""
+    communities_table = FakeTable()
+    entity_rows = [
+        {"id": eid, "title": title} for title, eid in title_to_entity_id.items()
+    ]
+    entities_table = FakeEntitiesTable(entity_rows)
+    await create_communities(communities_table, entities_table, relationships, **kwargs)
+    return pd.DataFrame(communities_table.rows)
 
 
 def _make_title_to_entity_id(