-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathserve.py
More file actions
132 lines (104 loc) · 3.77 KB
/
serve.py
File metadata and controls
132 lines (104 loc) · 3.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
Serve CEFR model as a FastAPI endpoint on Modal.
Usage:
uv run modal deploy serve.py
# Then: curl https://your-app.modal.run/score -X POST -H "Content-Type: application/json" -d '{"text": "..."}'
"""
import modal
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
# Modal application under which the service class in this file is registered.
app = modal.App("cefr-api")

# Python packages installed into the container image.
_PIP_REQUIREMENTS = (
    "torch>=2.1.0",
    "transformers>=4.40.0",
    "fastapi>=0.104.0",
    "sentencepiece>=0.1.99",
)

# Container image: Debian slim with Python 3.12, the packages above, and the
# local model.py copied to /app/model.py so it can be imported at runtime.
_base_image = modal.Image.debian_slim(python_version="3.12")
image = _base_image.pip_install(*_PIP_REQUIREMENTS).add_local_file(
    "model.py", "/app/model.py"
)

# Named Modal volume; the service mounts it at /vol and reads the model
# weights and tokenizer files from there.
volume = modal.Volume.from_name("cefr-models")

# FastAPI app
web_app = FastAPI(title="CEFR Scoring API")
class ScoreRequest(BaseModel):
    """Request body accepted by ``POST /score``."""

    # Raw essay text to score. The /score handler rejects it with HTTP 400
    # when fewer than 10 characters remain after stripping whitespace.
    text: str
class ScoreResponse(BaseModel):
    """Response body returned by ``POST /score``."""

    # Model output clamped to the range 1.0-6.0 and rounded to two decimals.
    score: float

    # Level label produced by score_to_cefr() for the clamped score.
    cefr_level: str

    # One of "high", "medium" or "low", derived from how far the score lies
    # from the nearest boundary between levels.
    confidence: str
# Directory inside the image that holds model.py (see add_local_file above).
_MODEL_CODE_DIR = "/app"

# Scores at which the level changes; used only by the confidence heuristic.
_CEFR_BOUNDARIES = (1.5, 2.5, 3.5, 4.5, 5.5)


def _ensure_model_code_importable():
    """Put the directory holding model.py on ``sys.path`` at most once.

    The guard matters: inserting unconditionally on every request would make
    ``sys.path`` grow for as long as the container stays warm.
    """
    import sys

    if _MODEL_CODE_DIR not in sys.path:
        sys.path.insert(0, _MODEL_CODE_DIR)


def _confidence_label(score):
    """Return "high", "medium" or "low" for a clamped score.

    Confidence is a heuristic based on the distance to the nearest CEFR
    boundary: scores close to a boundary (e.g. 2.48 between A2/B1) are
    less certain.
    """
    nearest = min(abs(score - boundary) for boundary in _CEFR_BOUNDARIES)
    if nearest > 0.3:
        return "high"
    if nearest > 0.15:
        return "medium"
    return "low"


@app.cls(
    image=image,
    gpu="T4",
    volumes={"/vol": volume},
    scaledown_window=60,  # Keep warm for 1 minute
)
class CEFRService:
    """CEFR scoring service with model lifecycle management.

    ``startup`` loads the model and tokenizer from the mounted volume when
    the container starts; ``serve`` registers the HTTP routes on the
    module-level FastAPI app and returns it.
    """

    def __init__(self):
        # Populated by startup(); None means "not loaded yet".
        self.model = None
        self.tokenizer = None
        self.device = None

    @modal.enter()
    def startup(self):
        """Load model on container startup."""
        import torch
        from transformers import AutoTokenizer

        _ensure_model_code_importable()
        from model import CEFRModel

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Loading model on {self.device}...")

        self.model = CEFRModel().float()
        # weights_only=True restricts unpickling to tensor data.
        self.model.load_state_dict(
            torch.load("/vol/best_model.pt", map_location=self.device, weights_only=True)
        )
        self.model = self.model.to(self.device)
        self.model.eval()

        self.tokenizer = AutoTokenizer.from_pretrained("/vol/tokenizer")
        print("Model loaded!")

    @modal.asgi_app()
    def serve(self):
        """Return the FastAPI app.

        Registers ``GET /health`` and ``POST /score`` on ``web_app``. The
        imports needed by the handlers are resolved here, once, instead of
        on every request.
        """
        import math

        import torch

        _ensure_model_code_importable()
        from model import score_to_cefr

        @web_app.get("/health")
        def health():
            return {"status": "healthy", "model_loaded": self.model is not None}

        @web_app.post("/score", response_model=ScoreResponse)
        def score_essay(request: ScoreRequest):
            """Score an essay and return CEFR level.

            Raises:
                HTTPException: 500 when the model is not loaded or produces
                    a non-finite score; 400 when the stripped text is
                    shorter than 10 characters.
            """
            if self.model is None:
                raise HTTPException(500, "Model not loaded")

            if len(request.text.strip()) < 10:
                raise HTTPException(400, "Text too short (min 10 characters)")

            # Tokenize
            encoding = self.tokenizer(
                request.text,
                truncation=True,
                padding="max_length",
                max_length=512,
                return_tensors="pt",
            )

            # Move inputs to same device as model
            input_ids = encoding["input_ids"].to(self.device)
            attention_mask = encoding["attention_mask"].to(self.device)

            # Predict
            with torch.no_grad():
                score = self.model(input_ids, attention_mask).item()

            # A NaN would pass through the clamp below as 6.0 (comparisons
            # with NaN are False), silently reporting the top level, so
            # reject non-finite output explicitly.
            if not math.isfinite(score):
                raise HTTPException(500, "Model produced a non-finite score")

            # Clamp to valid range
            score = max(1.0, min(6.0, score))
            cefr = score_to_cefr(score)

            return ScoreResponse(
                score=round(score, 2),
                cefr_level=cefr,
                confidence=_confidence_label(score),
            )

        return web_app