@@ -663,13 +663,19 @@ def process(
663663 required_exts : Optional [list [str ]] = None ,
664664 file_extractor : Optional [dict [str , BaseReader ]] = None ,
665665 unreachable_action : Optional [str ] = "warn" ,
666+ ignore_list : Optional [list [str ]] = None ,
666667 ) -> None :
667668 """Read documents from a path and split them into nodes for a vector database.
668669
669670 unreachable_action:
670671 "warn": Just log a warning message for links that are unreachable
671672 "fail": Fail in case of an unreachable link. Raises RuntimeError
672673 "drop": Drop the document, do not include it into the vector database
674+
675+ ignore_list:
676+ List of document titles to exclude from unreachable URL validation.
677+ Documents with titles in this list will be included in the vector database
678+ regardless of their url_reachable status.
673679 """
674680 reader = SimpleDirectoryReader (
675681 str (docs_dir ),
@@ -684,16 +690,32 @@ def process(
684690
685691 # Check for unreachable URLs if we are not ignoring them
686692 if unreachable_action != "warn" :
693+ # Separate docs into those we should check and those in ignore_list
694+ if ignore_list :
695+ docs_to_check = []
696+ ignored_docs = []
697+ for doc in docs :
698+ if doc .metadata .get ("title" ) in ignore_list :
699+ ignored_docs .append (doc )
700+ else :
701+ docs_to_check .append (doc )
702+ else :
703+ docs_to_check = docs
704+ ignored_docs = []
705+
706+ # Find reachable docs among those we're checking
687707 reachable_docs = [
688- doc for doc in docs if doc .metadata ["url_reachable" ] is True
708+ doc for doc in docs_to_check if doc .metadata ["url_reachable" ] is True
689709 ]
690- if len (docs ) != len (reachable_docs ):
710+
711+ if len (docs_to_check ) != len (reachable_docs ):
691712 # Optionally fail on unreachable URLs
692713 if unreachable_action == "fail" :
693714 raise RuntimeError ("Some documents have unreachable URLs. " )
694- # Optionally drop unreachable URLs
715+ # Optionally drop unreachable URLs (but keep ignored docs)
695716 if unreachable_action == "drop" :
696- docs = reachable_docs
717+ docs = reachable_docs + ignored_docs
718+
697719
698720 self .db .add_docs (docs )
699721
0 commit comments