Skip to content

Commit 4ab5a1a

Browse files
committed
feat: limit the number of files analyzed to 400 to prevent context overflow
1 parent 983d9b3 commit 4ab5a1a

File tree

7 files changed

+160
-27
lines changed

7 files changed

+160
-27
lines changed

AgentCrew/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.8.11"
1+
__version__ = "0.8.12"

AgentCrew/app.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,8 @@ def setup_services(
265265
search_service = None
266266

267267
try:
268-
code_analysis_service = CodeAnalysisService()
268+
code_analysis_llm = llm_manager.initialize_standalone_service(provider)
269+
code_analysis_service = CodeAnalysisService(llm_service=code_analysis_llm)
269270
except Exception as e:
270271
click.echo(f"⚠️ Code analysis tool not available: {str(e)}")
271272
code_analysis_service = None

AgentCrew/modules/code_analysis/service.py

Lines changed: 151 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,18 @@
11
import os
22
import fnmatch
33
import subprocess
4-
from typing import Any, Dict, List, Optional
4+
import json
5+
import asyncio
6+
from typing import Any, Dict, List, Optional, TYPE_CHECKING
7+
58
from tree_sitter_language_pack import get_parser
69
from tree_sitter import Parser
710

8-
MAX_ITEMS_OUT = 15
11+
if TYPE_CHECKING:
12+
from AgentCrew.modules.llm.base import BaseLLMService
13+
14+
MAX_ITEMS_OUT = 20
15+
MAX_FILES_TO_ANALYZE = 400
916

1017

1118
class CodeAnalysisService:
@@ -43,8 +50,27 @@ class CodeAnalysisService:
4350
# Add more languages as needed
4451
}
4552

46-
def __init__(self):
47-
"""Initialize the code analysis service with tree-sitter parsers."""
53+
def __init__(self, llm_service: Optional["BaseLLMService"] = None):
54+
"""Initialize the code analysis service with tree-sitter parsers.
55+
56+
Args:
57+
llm_service: Optional LLM service for intelligent file selection when
58+
analyzing large repositories (>500 files).
59+
"""
60+
self.llm_service = llm_service
61+
if self.llm_service:
62+
if self.llm_service.provider_name == "google":
63+
self.llm_service.model = "gemini-2.5-flash-lite"
64+
elif self.llm_service.provider_name == "claude":
65+
self.llm_service.model = "claude-3-5-haiku-latest"
66+
elif self.llm_service.provider_name == "openai":
67+
self.llm_service.model = "gpt-4.1-nano"
68+
elif self.llm_service.provider_name == "groq":
69+
self.llm_service.model = "llama-3.3-70b-versatile"
70+
elif self.llm_service.provider_name == "deepinfra":
71+
self.llm_service.model = "google/gemma-3-27b-it"
72+
elif self.llm_service.provider_name == "github_copilot":
73+
self.llm_service.model = "gpt-5-mini"
4874
try:
4975
self._parser_cache = {
5076
"python": get_parser("python"),
@@ -898,6 +924,73 @@ def _count_nodes(self, structure: Dict[str, Any], node_types: set[str]) -> int:
898924

899925
return count
900926

927+
def _select_files_with_llm(
    self, files: List[str], max_files: Optional[int] = None
) -> List[str]:
    """Use LLM to intelligently select which files to analyze from a large repository.

    Best-effort: falls back to a simple head-truncation of ``files`` whenever
    no LLM service is configured, the LLM call fails, the response is not
    valid JSON, or too many of the returned paths are hallucinated.

    Args:
        files: List of relative file paths to select from
        max_files: Maximum number of files to select; defaults to
            MAX_FILES_TO_ANALYZE when omitted (backward compatible with the
            previous ``max_files=MAX_FILES_TO_ANALYZE`` default).

    Returns:
        List of selected file paths that should be analyzed
    """
    if max_files is None:
        max_files = MAX_FILES_TO_ANALYZE
    if not self.llm_service:
        return files[:max_files]

    prompt = f"""You are analyzing a code repository with {len(files)} files.
The analysis system can only process {max_files} files at a time.

Please select the {max_files} most important files to analyze based on these criteria:
1. Core application logic files (main entry points, core modules)
2. Business logic and domain models
3. API endpoints and controllers
4. Service/utility classes
5. Configuration files that define app structure
6. Test files are lower priority unless they reveal architecture
7. Generated files, lock files, and vendor files should be excluded

Here is the complete list of files in the repository:
{chr(10).join(files)}

Return your selection as a JSON array of file paths. Only return the JSON array, nothing else.
Select exactly {max_files} files from the list above.

Example response format:
["src/main.py", "src/app.py", "src/models/user.py"]"""

    try:
        # asyncio.run creates, runs, and *closes* a fresh event loop
        # (the previous get_event_loop()/run_until_complete pattern is
        # deprecated since Python 3.10 and leaked the loop it created).
        # If a loop is already running this raises RuntimeError, which —
        # like any other failure here — lands in the fallback below.
        response = asyncio.run(
            self.llm_service.process_message(prompt, temperature=0)
        )

        # Strip an optional markdown code fence around the JSON payload.
        response = response.strip()
        if response.startswith("```json"):
            response = response[7:]
        if response.startswith("```"):
            response = response[3:]
        if response.endswith("```"):
            response = response[:-3]
        response = response.strip()

        selected_files = json.loads(response)

        if isinstance(selected_files, list):
            # Keep only real, not-yet-seen paths so hallucinated or
            # duplicated entries cannot consume the selection budget.
            known = set(files)
            seen: set = set()
            valid_files = []
            for candidate in selected_files:
                if candidate in known and candidate not in seen:
                    seen.add(candidate)
                    valid_files.append(candidate)
            # Trust the selection only if at least half of the budget
            # resolved to genuine files; otherwise fall back.
            if len(valid_files) >= max_files * 0.5:
                return valid_files[:max_files]
    except Exception:
        # Deliberate best-effort: any LLM/parsing failure degrades to
        # the deterministic truncation fallback below.
        pass

    return files[:max_files]
901994
def analyze_code_structure(
902995
self, path: str, exclude_patterns: List[str] = []
903996
) -> Dict[str, Any] | str:
@@ -911,11 +1004,9 @@ def analyze_code_structure(
9111004
Dictionary containing analysis results for each file or formatted string
9121005
"""
9131006
try:
914-
# Verify the path exists
9151007
if not os.path.exists(path):
9161008
return {"error": f"Path does not exist: {path}"}
9171009

918-
# Run git ls-files to get all tracked files
9191010
try:
9201011
result = subprocess.run(
9211012
["git", "ls-files"],
@@ -930,21 +1021,32 @@ def analyze_code_structure(
9301021
"error": f"Failed to run git ls-files on {path}. Make sure it's a git repository."
9311022
}
9321023

933-
# Filter for supported file types
934-
supported_files = []
1024+
supported_files_rel = []
9351025
for file_path in files:
9361026
excluded = False
937-
if file_path.strip(): # Skip empty lines
938-
# Check against glob exclude patterns
1027+
if file_path.strip():
9391028
for pattern in exclude_patterns:
9401029
if fnmatch.fnmatch(file_path, pattern):
9411030
excluded = True
9421031
break
9431032
ext = os.path.splitext(file_path)[1].lower()
9441033
if ext in self.LANGUAGE_MAP and not excluded:
945-
supported_files.append(os.path.join(path, file_path))
1034+
supported_files_rel.append(file_path)
1035+
1036+
non_analyzed_files = []
1037+
files_to_analyze = supported_files_rel
1038+
1039+
if len(supported_files_rel) > MAX_FILES_TO_ANALYZE:
1040+
selected_files = self._select_files_with_llm(
1041+
supported_files_rel, MAX_FILES_TO_ANALYZE
1042+
)
1043+
non_analyzed_files = [
1044+
f for f in supported_files_rel if f not in selected_files
1045+
]
1046+
files_to_analyze = selected_files
1047+
1048+
supported_files = [os.path.join(path, f) for f in files_to_analyze]
9461049

947-
# Analyze each file
9481050
analysis_results = []
9491051
errors = []
9501052
for file_path in supported_files:
@@ -953,15 +1055,13 @@ def analyze_code_structure(
9531055
language = self._detect_language(file_path)
9541056

9551057
if language == "config":
956-
# Skip problematic file
9571058
if os.path.basename(file_path) == "package-lock.json":
9581059
continue
9591060
result = {"type": "config", "name": os.path.basename(file_path)}
9601061
else:
9611062
result = self._analyze_file(file_path)
9621063

9631064
if result and isinstance(result, dict) and "error" not in result:
964-
# Successfully analyzed file
9651065
analysis_results.append(
9661066
{
9671067
"path": rel_path,
@@ -977,7 +1077,11 @@ def analyze_code_structure(
9771077
if not analysis_results:
9781078
return "Analysis completed but no valid results. This may due to excluded patterns is not correct"
9791079
return self._format_analysis_results(
980-
analysis_results, supported_files, errors
1080+
analysis_results,
1081+
supported_files,
1082+
errors,
1083+
non_analyzed_files,
1084+
len(supported_files_rel),
9811085
)
9821086

9831087
except Exception as e:
@@ -1318,10 +1422,19 @@ def _format_analysis_results(
13181422
analysis_results: List[Dict[str, Any]],
13191423
analyzed_files: List[str],
13201424
errors: List[Dict[str, str]],
1425+
non_analyzed_files: List[str] = [],
1426+
total_supported_files: int = 0,
13211427
) -> str:
1322-
"""Format the analysis results into a clear text format."""
1428+
"""Format the analysis results into a clear text format.
1429+
1430+
Args:
1431+
analysis_results: List of analysis results for each file
1432+
analyzed_files: List of files that were analyzed
1433+
errors: List of errors encountered during analysis
1434+
non_analyzed_files: List of files that were skipped due to file limit
1435+
total_supported_files: Total number of supported files in the repository
1436+
"""
13231437

1324-
# Count statistics
13251438
total_files = len(analyzed_files)
13261439
classes = sum(
13271440
self._count_nodes(f["structure"], self.class_types)
@@ -1336,28 +1449,44 @@ def _format_analysis_results(
13361449
for f in analysis_results
13371450
)
13381451
error_count = len(errors)
1452+
non_analyzed_count = len(non_analyzed_files)
13391453

1340-
# Build output sections
13411454
sections = []
13421455

1343-
# Add statistics section
13441456
sections.append("\n===ANALYSIS STATISTICS===\n")
13451457
sections.append(f"Total files analyzed: {total_files}")
1458+
if non_analyzed_count > 0:
1459+
sections.append(
1460+
f"Total files skipped (repository too large): {non_analyzed_count}"
1461+
)
1462+
sections.append(
1463+
f"Total supported files in repository: {total_supported_files}"
1464+
)
13461465
sections.append(f"Total errors: {error_count}")
13471466
sections.append(f"Total classes found: {classes}")
13481467
sections.append(f"Total functions found: {functions}")
13491468
sections.append(f"Total decorated functions: {decorated_functions}")
13501469

1351-
# Add errors section if any
13521470
if errors:
13531471
sections.append("\n===ERRORS===")
13541472
for error in errors:
13551473
error_first_line = error["error"].split("\n")[0]
13561474
sections.append(f"{error['path']}: {error_first_line}")
13571475

1358-
# Add repository map
13591476
sections.append("\n===REPOSITORY STRUCTURE===")
13601477
sections.append(self._generate_text_map(analysis_results))
13611478

1362-
# Join all sections with newlines
1479+
if non_analyzed_files:
1480+
sections.append("\n===NON-ANALYZED FILES (repository too large)===")
1481+
sections.append(
1482+
f"The following {non_analyzed_count} files were not analyzed due to the {MAX_FILES_TO_ANALYZE} file limit:"
1483+
)
1484+
max_non_analyzed_to_show = int(MAX_FILES_TO_ANALYZE / 2)
1485+
for file_path in sorted(non_analyzed_files[:max_non_analyzed_to_show]):
1486+
sections.append(f" {file_path}")
1487+
if len(non_analyzed_files) > max_non_analyzed_to_show:
1488+
sections.append(
1489+
f" ...and {len(non_analyzed_files) - max_non_analyzed_to_show} more files."
1490+
)
1491+
13631492
return "\n".join(sections)

docker/pyproject.docker.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "agentcrew-ai"
3-
version = "0.8.11"
3+
version = "0.8.12"
44
requires-python = ">=3.12"
55
classifiers = [
66
"Programming Language :: Python :: 3",

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "agentcrew-ai"
3-
version = "0.8.11"
3+
version = "0.8.12"
44
requires-python = ">=3.12"
55
classifiers = [
66
"Programming Language :: Python :: 3",

tests/code_analysis_test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import requests
33

44
from AgentCrew.modules.code_analysis import CodeAnalysisService
5+
from AgentCrew.modules.llm.service_manager import ServiceManager
56

67

78
def count_tokens(content: str, model: str = "claude-opus-4-5-20251101") -> dict:
@@ -21,6 +22,8 @@ def count_tokens(content: str, model: str = "claude-opus-4-5-20251101") -> dict:
2122

2223

2324
if __name__ == "__main__":
25+
llm_manager = ServiceManager.get_instance()
26+
code_analysis_llm = llm_manager.initialize_standalone_service("github_copilot")
2427
analyze = CodeAnalysisService()
2528
result = analyze.analyze_code_structure(
2629
"./",

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)