11import os
22import fnmatch
33import subprocess
4- from typing import Any , Dict , List , Optional
4+ import json
5+ import asyncio
6+ from typing import Any , Dict , List , Optional , TYPE_CHECKING
7+
58from tree_sitter_language_pack import get_parser
69from tree_sitter import Parser
710
8- MAX_ITEMS_OUT = 15
11+ if TYPE_CHECKING :
12+ from AgentCrew .modules .llm .base import BaseLLMService
13+
14+ MAX_ITEMS_OUT = 20
15+ MAX_FILES_TO_ANALYZE = 400
916
1017
1118class CodeAnalysisService :
@@ -43,8 +50,27 @@ class CodeAnalysisService:
4350 # Add more languages as needed
4451 }
4552
46- def __init__ (self ):
47- """Initialize the code analysis service with tree-sitter parsers."""
53+ def __init__ (self , llm_service : Optional ["BaseLLMService" ] = None ):
54+ """Initialize the code analysis service with tree-sitter parsers.
55+
56+ Args:
57+ llm_service: Optional LLM service for intelligent file selection when
58+ analyzing large repositories (more than MAX_FILES_TO_ANALYZE files, currently 400).
59+ """
60+ self .llm_service = llm_service
61+ if self .llm_service :
62+ if self .llm_service .provider_name == "google" :
63+ self .llm_service .model = "gemini-2.5-flash-lite"
64+ elif self .llm_service .provider_name == "claude" :
65+ self .llm_service .model = "claude-3-5-haiku-latest"
66+ elif self .llm_service .provider_name == "openai" :
67+ self .llm_service .model = "gpt-4.1-nano"
68+ elif self .llm_service .provider_name == "groq" :
69+ self .llm_service .model = "llama-3.3-70b-versatile"
70+ elif self .llm_service .provider_name == "deepinfra" :
71+ self .llm_service .model = "google/gemma-3-27b-it"
72+ elif self .llm_service .provider_name == "github_copilot" :
73+ self .llm_service .model = "gpt-5-mini"
4874 try :
4975 self ._parser_cache = {
5076 "python" : get_parser ("python" ),
@@ -898,6 +924,73 @@ def _count_nodes(self, structure: Dict[str, Any], node_types: set[str]) -> int:
898924
899925 return count
900926
def _select_files_with_llm(
    self, files: List[str], max_files: int = MAX_FILES_TO_ANALYZE
) -> List[str]:
    """Ask the LLM which files of a large repository are worth analyzing.

    The method is best-effort: whenever no LLM service is configured, the
    request fails, the reply cannot be parsed, or too few of the returned
    paths are valid, it falls back to the first ``max_files`` entries of
    ``files``.

    Args:
        files: Relative file paths to choose from.
        max_files: Maximum number of files to return.

    Returns:
        At most ``max_files`` distinct paths, all taken from ``files``.
        When the LLM selection is used, paths are in the order the LLM
        returned them.
    """
    # Local imports: only this method needs them.
    import concurrent.futures
    import logging

    log = logging.getLogger(__name__)
    fallback = files[:max_files]

    if not self.llm_service:
        return fallback

    file_listing = "\n".join(files)
    prompt = (
        f"You are analyzing a code repository with {len(files)} files.\n"
        f"The analysis system can only process {max_files} files at a time.\n"
        "\n"
        f"Please select the {max_files} most important files to analyze based on these criteria:\n"
        "1. Core application logic files (main entry points, core modules)\n"
        "2. Business logic and domain models\n"
        "3. API endpoints and controllers\n"
        "4. Service/utility classes\n"
        "5. Configuration files that define app structure\n"
        "6. Test files are lower priority unless they reveal architecture\n"
        "7. Generated files, lock files, and vendor files should be excluded\n"
        "\n"
        "Here is the complete list of files in the repository:\n"
        f"{file_listing}\n"
        "\n"
        "Return your selection as a JSON array of file paths. Only return the JSON array, nothing else.\n"
        f"Select exactly {max_files} files from the list above.\n"
        "\n"
        "Example response format:\n"
        '["src/main.py", "src/app.py", "src/models/user.py"]'
    )

    async def _ask():
        # Created lazily so no coroutine object is leaked un-awaited when
        # scheduling fails.
        return await self.llm_service.process_message(prompt, temperature=0)

    try:
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            inside_running_loop = False
        else:
            inside_running_loop = True

        if inside_running_loop:
            # run_until_complete() cannot be used on a loop that is already
            # running in this thread; drive the request on a short-lived
            # worker thread that owns its own loop instead.
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                response = pool.submit(lambda: asyncio.run(_ask())).result()
        else:
            try:
                loop = asyncio.get_event_loop()
            except RuntimeError:
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
            response = loop.run_until_complete(_ask())

        # Strip an optional Markdown code fence around the JSON payload.
        text = response.strip()
        text = (
            text.removeprefix("```json")
            .removeprefix("```")
            .removesuffix("```")
            .strip()
        )

        try:
            selected = json.loads(text)
        except json.JSONDecodeError:
            # Models sometimes wrap the array in prose; retry on the
            # outermost [...] span before giving up.
            start, end = text.find("["), text.rfind("]")
            if start == -1 or end < start:
                raise
            selected = json.loads(text[start : end + 1])
    except Exception as exc:
        # Deliberately broad: the LLM backend may raise anything, and the
        # selection is an optimisation, not a requirement.
        log.warning(
            "LLM file selection failed (%s); falling back to the first %d files",
            exc,
            len(fallback),
        )
        return fallback

    if not isinstance(selected, list):
        log.warning("LLM file selection did not return a JSON array; using fallback")
        return fallback

    known = set(files)
    # dict.fromkeys drops duplicates while preserving the LLM's ordering;
    # the isinstance check also guards against unhashable JSON values.
    valid_files = list(
        dict.fromkeys(
            item for item in selected if isinstance(item, str) and item in known
        )
    )

    # Accept the selection only if at least half of the requested number of
    # files survived validation; otherwise the reply is considered unusable.
    if len(valid_files) >= max_files * 0.5:
        return valid_files[:max_files]

    log.warning(
        "LLM returned only %d valid paths (need at least %d); using fallback",
        len(valid_files),
        int(max_files * 0.5),
    )
    return fallback
993+
901994 def analyze_code_structure (
902995 self , path : str , exclude_patterns : List [str ] = []
903996 ) -> Dict [str , Any ] | str :
@@ -911,11 +1004,9 @@ def analyze_code_structure(
9111004 Dictionary containing analysis results for each file or formatted string
9121005 """
9131006 try :
914- # Verify the path exists
9151007 if not os .path .exists (path ):
9161008 return {"error" : f"Path does not exist: { path } " }
9171009
918- # Run git ls-files to get all tracked files
9191010 try :
9201011 result = subprocess .run (
9211012 ["git" , "ls-files" ],
@@ -930,21 +1021,32 @@ def analyze_code_structure(
9301021 "error" : f"Failed to run git ls-files on { path } . Make sure it's a git repository."
9311022 }
9321023
933- # Filter for supported file types
934- supported_files = []
1024+ supported_files_rel = []
9351025 for file_path in files :
9361026 excluded = False
937- if file_path .strip (): # Skip empty lines
938- # Check against glob exclude patterns
1027+ if file_path .strip ():
9391028 for pattern in exclude_patterns :
9401029 if fnmatch .fnmatch (file_path , pattern ):
9411030 excluded = True
9421031 break
9431032 ext = os .path .splitext (file_path )[1 ].lower ()
9441033 if ext in self .LANGUAGE_MAP and not excluded :
945- supported_files .append (os .path .join (path , file_path ))
1034+ supported_files_rel .append (file_path )
1035+
1036+ non_analyzed_files = []
1037+ files_to_analyze = supported_files_rel
1038+
1039+ if len (supported_files_rel ) > MAX_FILES_TO_ANALYZE :
1040+ selected_files = self ._select_files_with_llm (
1041+ supported_files_rel , MAX_FILES_TO_ANALYZE
1042+ )
1043+ non_analyzed_files = [
1044+ f for f in supported_files_rel if f not in selected_files
1045+ ]
1046+ files_to_analyze = selected_files
1047+
1048+ supported_files = [os .path .join (path , f ) for f in files_to_analyze ]
9461049
947- # Analyze each file
9481050 analysis_results = []
9491051 errors = []
9501052 for file_path in supported_files :
@@ -953,15 +1055,13 @@ def analyze_code_structure(
9531055 language = self ._detect_language (file_path )
9541056
9551057 if language == "config" :
956- # Skip problematic file
9571058 if os .path .basename (file_path ) == "package-lock.json" :
9581059 continue
9591060 result = {"type" : "config" , "name" : os .path .basename (file_path )}
9601061 else :
9611062 result = self ._analyze_file (file_path )
9621063
9631064 if result and isinstance (result , dict ) and "error" not in result :
964- # Successfully analyzed file
9651065 analysis_results .append (
9661066 {
9671067 "path" : rel_path ,
@@ -977,7 +1077,11 @@ def analyze_code_structure(
9771077 if not analysis_results :
9781078 return "Analysis completed but no valid results. This may due to excluded patterns is not correct"
9791079 return self ._format_analysis_results (
980- analysis_results , supported_files , errors
1080+ analysis_results ,
1081+ supported_files ,
1082+ errors ,
1083+ non_analyzed_files ,
1084+ len (supported_files_rel ),
9811085 )
9821086
9831087 except Exception as e :
@@ -1318,10 +1422,19 @@ def _format_analysis_results(
13181422 analysis_results : List [Dict [str , Any ]],
13191423 analyzed_files : List [str ],
13201424 errors : List [Dict [str , str ]],
1425+ non_analyzed_files : List [str ] = [],
1426+ total_supported_files : int = 0 ,
13211427 ) -> str :
1322- """Format the analysis results into a clear text format."""
1428+ """Format the analysis results into a clear text format.
1429+
1430+ Args:
1431+ analysis_results: List of analysis results for each file
1432+ analyzed_files: List of files that were analyzed
1433+ errors: List of errors encountered during analysis
1434+ non_analyzed_files: List of files that were skipped due to file limit
1435+ total_supported_files: Total number of supported files in the repository
1436+ """
13231437
1324- # Count statistics
13251438 total_files = len (analyzed_files )
13261439 classes = sum (
13271440 self ._count_nodes (f ["structure" ], self .class_types )
@@ -1336,28 +1449,44 @@ def _format_analysis_results(
13361449 for f in analysis_results
13371450 )
13381451 error_count = len (errors )
1452+ non_analyzed_count = len (non_analyzed_files )
13391453
1340- # Build output sections
13411454 sections = []
13421455
1343- # Add statistics section
13441456 sections .append ("\n ===ANALYSIS STATISTICS===\n " )
13451457 sections .append (f"Total files analyzed: { total_files } " )
1458+ if non_analyzed_count > 0 :
1459+ sections .append (
1460+ f"Total files skipped (repository too large): { non_analyzed_count } "
1461+ )
1462+ sections .append (
1463+ f"Total supported files in repository: { total_supported_files } "
1464+ )
13461465 sections .append (f"Total errors: { error_count } " )
13471466 sections .append (f"Total classes found: { classes } " )
13481467 sections .append (f"Total functions found: { functions } " )
13491468 sections .append (f"Total decorated functions: { decorated_functions } " )
13501469
1351- # Add errors section if any
13521470 if errors :
13531471 sections .append ("\n ===ERRORS===" )
13541472 for error in errors :
13551473 error_first_line = error ["error" ].split ("\n " )[0 ]
13561474 sections .append (f"{ error ['path' ]} : { error_first_line } " )
13571475
1358- # Add repository map
13591476 sections .append ("\n ===REPOSITORY STRUCTURE===" )
13601477 sections .append (self ._generate_text_map (analysis_results ))
13611478
1362- # Join all sections with newlines
1479+ if non_analyzed_files :
1480+ sections .append ("\n ===NON-ANALYZED FILES (repository too large)===" )
1481+ sections .append (
1482+ f"The following { non_analyzed_count } files were not analyzed due to the { MAX_FILES_TO_ANALYZE } file limit:"
1483+ )
1484+ max_non_analyzed_to_show = int (MAX_FILES_TO_ANALYZE / 2 )
1485+ for file_path in sorted (non_analyzed_files [:max_non_analyzed_to_show ]):
1486+ sections .append (f" { file_path } " )
1487+ if len (non_analyzed_files ) > max_non_analyzed_to_show :
1488+ sections .append (
1489+ f" ...and { len (non_analyzed_files ) - max_non_analyzed_to_show } more files."
1490+ )
1491+
13631492 return "\n " .join (sections )
0 commit comments