InternVL3 is a vision-language model built with Native Multimodal Pre-Training - it learned vision and language together from scratch rather than bolting vision onto an existing LLM. Released April 2025.
| Benchmark | InternVL3-78B | GPT-4o | Improvement |
|---|---|---|---|
| MMMU | 72.2 | 69.1 | +4.5% |
| VSI-Bench | 48.9 | ~45 | +8.7% |
| WebArena | 11.7 | 1.9 | +6x |
| RealWorldQA | 72.3 | 70.8 | +2.1% |
Key advantages:
- Superior temporal reasoning across video frames
- Better 3D spatial understanding
- Excellent at GUI automation tasks
- Strong scientific diagram analysis
| Variant | Params | Min GPU | Throughput | Best For |
|---|---|---|---|---|
| 1B | 1B | 8GB | ~40 img/s | Edge/mobile |
| 2B | 2B | 12GB | ~30 img/s | Lightweight |
| 8B | 8B | 24GB | ~20 img/s | Best value |
| 38B | 38B | 2x 80GB | ~8 img/s | High accuracy |
| 78B | 78B | 4x 80GB | ~5 img/s | Maximum |
Recommendation: The 8B model offers the best price/performance ratio and fits on consumer GPUs.
pip install torch transformers accelerate pillow
pip install lmdeploy>=0.6.0 # For production
pip install decord opencv-python  # For video processing

from src.internvl3 import InternVL3
# Load model
model = InternVL3(variant="8B").load()
# Analyze an image
response = model.chat(
"Describe what's happening in this scene",
"street.jpg"
)
print(response)

from src.internvl3 import VideoAnalyzer
analyzer = VideoAnalyzer(variant="8B")
# Comprehensive video analysis
result = analyzer.analyze_video(
"factory.mp4",
analysis_type="industrial",
sample_interval=2.0, # Analyze every 2 seconds
max_frames=30,
)
print(f"Duration: {result.duration_seconds}s")
print(f"Frames analyzed: {result.total_frames_analyzed}")
print(f"Summary: {result.summary}")

from src.internvl3 import VideoAnalyzer
with VideoAnalyzer() as analyzer:
# Detect anomalies with context
anomalies = analyzer.detect_anomalies(
"conveyor.mp4",
context="Normal: cardboard boxes moving left to right on belt, evenly spaced",
check_interval=1.0,
anomaly_type="industrial",
)
for anomaly in anomalies:
    print(f"[{anomaly['timestamp']:.1f}s] {anomaly['severity']}: {anomaly['description']}")

InternVL3's strength is reasoning across multiple images:
from src.internvl3 import VideoAnalyzer
analyzer = VideoAnalyzer()
# Analyze sequence of frames
frames = ["frame_001.jpg", "frame_002.jpg", "frame_003.jpg", "frame_004.jpg"]
response = analyzer.analyze_frames_batch(
frames,
"What object moved between these frames and where did it go?"
)
print(response)

# Track a specific object through video
tracking = analyzer.track_object(
"warehouse.mp4",
object_description="red forklift",
sample_interval=0.5,
max_frames=60,
)
for result in tracking:
if result["found"]:
        print(f"[{result['timestamp']:.1f}s] {result['location']} - {result['state']}")

For high-throughput production:
from src.internvl3 import InternVL3
# LMDeploy optimizes KV cache and batching
model = InternVL3(
variant="8B",
backend="lmdeploy",
).load()

# 38B requires 2x A100
model = InternVL3(
variant="38B",
backend="lmdeploy", # Handles tensor parallelism
).load()
# 78B requires 4x A100
model = InternVL3(variant="78B", backend="lmdeploy").load()

analyzer = VideoAnalyzer()
def quality_check_callback(timestamp, frame, anomalies):
    """Alert on high-severity anomalies and archive the offending frame.

    Args:
        timestamp: Time (seconds) in the video at which this check ran.
        frame: The sampled frame passed by the analyzer.
        anomalies: List of anomaly dicts; each may carry a "severity"
            and "description" key.
    """
    if anomalies:
        # Trigger alert
        for a in anomalies:
            if a.get("severity") == "high":
                # NOTE(review): send_alert / save_frame are app-defined hooks,
                # not shown in this document.
                send_alert(f"Critical defect at {timestamp}s: {a['description']}")
                # assumes the frame save belongs inside the high-severity
                # branch as well — original indentation was lost; TODO confirm
                save_frame(frame, f"defect_{timestamp}.jpg")
anomalies = analyzer.detect_anomalies(
"production_line.mp4",
context="""Normal conditions:
- Products move left to right on conveyor
- Each product is rectangular, undamaged
- Spacing between products is 30-50cm
- Belt speed is constant""",
check_interval=0.5, # Fast sampling for production
anomaly_type="industrial",
callback=quality_check_callback,
)

result = analyzer.analyze_video(
"parking_lot.mp4",
analysis_type="surveillance",
sample_interval=5.0, # Every 5 seconds for long videos
)
# Find specific events
for frame_analysis in result.timeline:
if "person" in frame_analysis.description.lower():
        print(f"[{frame_analysis.timestamp}s] Activity detected")

# Compare two frames for changes
changes = analyzer.compare_frames(
"before.jpg",
"after.jpg",
context="Warehouse inventory check",
)
print(f"Items moved: {changes['moved']}")
print(f"New items: {changes['appeared']}")
print(f"Missing items: {changes['disappeared']}")

InternVL3 scores 11.7 on WebArena vs GPT-4o's 1.9:
model = InternVL3(variant="8B").load()
# Analyze UI screenshot
response = model.chat(
"""Analyze this UI screenshot:
1. What application is this?
2. What is the current state?
3. What actions are available?
4. If I want to submit a form, what should I click?""",
"app_screenshot.png"
)

Best for continuous monitoring:
# Every 2 seconds
frames = extract_video_frames(video, interval_seconds=2.0)

For videos with distinct scenes, pre-filter frames:
import cv2
import numpy as np
def detect_scene_changes(video_path, threshold=30):
"""Extract frames where significant changes occur."""
cap = cv2.VideoCapture(video_path)
prev_frame = None
scene_frames = []
while True:
ret, frame = cap.read()
if not ret:
break
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
if prev_frame is not None:
diff = cv2.absdiff(prev_frame, gray)
if np.mean(diff) > threshold:
scene_frames.append(frame)
prev_frame = gray
return scene_framesFor edited videos, extract I-frames:
from decord import VideoReader
vr = VideoReader(video_path)
keyframes = vr.get_key_indices()
frames = [vr[i].asnumpy() for i in keyframes]

Always provide context for anomaly detection:
# Good: Specific context
context = """Manufacturing line producing smartphone cases.
Normal: Cases are white, rectangular, no cracks.
Belt moves at 0.5m/s. One case every 2 seconds."""
# Less good: Vague context
context = "Factory video"

InternVL3 handles 8+ frames well, but quality may degrade:
# Optimal: 4-8 frames for temporal reasoning
frames = video_frames[:8]
# For longer sequences, summarize in chunks
chunk_summaries = []
for i in range(0, len(frames), 8):
chunk = frames[i:i+8]
summary = analyzer.analyze_frames_batch(chunk, "Summarize activity")
    chunk_summaries.append(summary)

from src.utils import resize_for_model
# Higher resolution = more detail but slower
# Lower resolution = faster but may miss small details
# For surveillance (need to see faces/plates)
frame = resize_for_model(frame, model="internvl", max_tokens=4096)
# For activity detection (don't need fine detail)
frame = resize_for_model(frame, model="internvl", max_tokens=1024)

# Process frames one at a time instead of loading all
from src.utils import extract_video_frames
# Generator - doesn't load all frames at once
for timestamp, frame in extract_video_frames(video, interval=1.0):
result = model.chat(prompt, frame)
    # Process and discard frame

# 1. Use smaller model
analyzer = VideoAnalyzer(variant="2B") # Faster, still good
# 2. Increase sampling interval
anomalies = analyzer.detect_anomalies(video, check_interval=5.0)
# 3. Use LMDeploy backend
analyzer = VideoAnalyzer(variant="8B", backend="lmdeploy")

For better temporal consistency:
# Include temporal context in prompt
prompt = """These 4 frames are from a video, 1 second apart.
Frame 1 (0s), Frame 2 (1s), Frame 3 (2s), Frame 4 (3s).
Track the movement of people between frames.
Describe what each person does from start to end."""

See source code documentation:
- src/internvl3/loader.py — Model loading
- src/internvl3/video_analysis.py — Video analysis pipeline