Skip to content

Commit 60036ff

Browse files
authored
Merge pull request #66 from malingatembo/feat/add-ignore-list-param
Add ignore_list parameter to skip URL checks for specific docs
2 parents 3bc992a + 66a6cc6 commit 60036ff

File tree

1 file changed

+26
-4
lines changed

1 file changed

+26
-4
lines changed

src/lightspeed_rag_content/document_processor.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -663,13 +663,19 @@ def process(
663663
required_exts: Optional[list[str]] = None,
664664
file_extractor: Optional[dict[str, BaseReader]] = None,
665665
unreachable_action: Optional[str] = "warn",
666+
ignore_list: Optional[list[str]] = None,
666667
) -> None:
667668
"""Read documents from a path and split them into nodes for a vector database.
668669
669670
unreachable_action:
670671
"warn": Just log a warning message for links that are unreachable
671672
"fail": Fail in case of an unreachable link. Raises RuntimeError
672673
"drop": Drop the document, do not include it into the vector database
674+
675+
ignore_list:
676+
List of document titles to exclude from unreachable URL validation.
677+
Documents with titles in this list will be included in the vector database
678+
regardless of their url_reachable status.
673679
"""
674680
reader = SimpleDirectoryReader(
675681
str(docs_dir),
@@ -684,16 +690,32 @@ def process(
684690

685691
# Check for unreachable URLs if we are not ignoring them
686692
if unreachable_action != "warn":
693+
# Separate docs into those we should check and those in ignore_list
694+
if ignore_list:
695+
docs_to_check = []
696+
ignored_docs = []
697+
for doc in docs:
698+
if doc.metadata.get("title") in ignore_list:
699+
ignored_docs.append(doc)
700+
else:
701+
docs_to_check.append(doc)
702+
else:
703+
docs_to_check = docs
704+
ignored_docs = []
705+
706+
# Find reachable docs among those we're checking
687707
reachable_docs = [
688-
doc for doc in docs if doc.metadata["url_reachable"] is True
708+
doc for doc in docs_to_check if doc.metadata["url_reachable"] is True
689709
]
690-
if len(docs) != len(reachable_docs):
710+
711+
if len(docs_to_check) != len(reachable_docs):
691712
# Optionally fail on unreachable URLs
692713
if unreachable_action == "fail":
693714
raise RuntimeError("Some documents have unreachable URLs. ")
694-
# Optionally drop unreachable URLs
715+
# Optionally drop unreachable URLs (but keep ignored docs)
695716
if unreachable_action == "drop":
696-
docs = reachable_docs
717+
docs = reachable_docs + ignored_docs
718+
697719

698720
self.db.add_docs(docs)
699721

0 commit comments

Comments
 (0)