Skip to content

Commit 84b29c9

Browse files
committed
amd update
1 parent da8b599 commit 84b29c9

File tree

5 files changed

+66
-167
lines changed

5 files changed

+66
-167
lines changed

.github/workflows/amd_workflow.yml

Lines changed: 20 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,102 +1,60 @@
11
name: AMD PyTorch Job
2-
32
on:
43
workflow_dispatch:
54
inputs:
6-
script_content:
7-
description: 'Content of Python script'
8-
required: true
9-
type: string
10-
filename:
11-
description: 'Name of Python script'
5+
payload:
6+
description: 'Content of the user submission, as json string'
127
required: true
138
type: string
14-
reference_content:
15-
description: 'Content of the reference code script (optional)'
16-
required: false
17-
type: string
18-
reference_filename:
19-
description: 'Name of reference script (supports .py or .cu)'
20-
required: false
21-
type: string
22-
eval_content:
23-
description: 'Content of the outer eval code script (optional)'
24-
required: false
25-
type: string
26-
eval_filename:
27-
description: 'Name of outer eval script (supports .py or .cu)'
9+
requirements:
10+
description: 'Contents for a requirements.txt file'
2811
required: false
2912
type: string
3013

3114
jobs:
32-
train:
15+
run:
3316
runs-on: [amdgpu-mi250-x86-64]
3417
timeout-minutes: 10
3518
env:
3619
VENV_DIR: /groups/aig_sharks/pytorch_venv
3720
steps:
21+
- uses: actions/checkout@v3
3822
- name: Setup Python
3923
uses: actions/setup-python@v5
4024
with:
4125
python-version: '3.10'
4226

43-
- name: Create script
44-
shell: python
45-
run: |
46-
with open('${{ github.event.inputs.filename }}', 'w') as f:
47-
f.write('''${{ github.event.inputs.script_content }}''')
48-
49-
- name: Create reference scripts if provided
50-
shell: bash
51-
run: |
52-
if [[ -n "${{ github.event.inputs.reference_filename }}" ]]; then
53-
echo "Creating reference script..."
54-
cat > "${{ github.event.inputs.reference_filename }}" <<EOL
55-
${{ github.event.inputs.reference_content }}
56-
EOL
57-
cat "${{ github.event.inputs.reference_filename }}" # Debug: Show file contents
58-
else
59-
echo "No reference content provided."
60-
fi
6127

62-
- name: Create eval scripts if provided
28+
- name: Create input files
6329
shell: bash
6430
run: |
65-
if [[ -n "${{ github.event.inputs.eval_filename }}" ]]; then
66-
echo "Creating reference script..."
67-
cat > "${{ github.event.inputs.eval_filename }}" <<EOL
68-
${{ github.event.inputs.eval_content }}
31+
cat > "payload.json" <<'EOL'
32+
${{ github.event.inputs.payload }}
6933
EOL
70-
cat "${{ github.event.inputs.eval_filename }}" # Debug: Show file contents
71-
else
72-
echo "No eval content provided."
73-
fi
7434
7535
- name: Setup Virtual Environment and Install Dependencies
36+
shell: bash
7637
run: |
7738
python -m venv ${VENV_DIR}
7839
source ${VENV_DIR}/bin/activate
7940
pip install --upgrade pip
80-
pip install --pre pytorch-triton-rocm==3.1.0+cf34004b8a torch==2.6.0.dev20241023+rocm6.2 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
8141
42+
if [[ -n "${{ github.event.inputs.requirements }}" ]]; then
43+
cat > "requirements.txt" <<'EOL'
44+
${{ github.event.inputs.requirements }}
45+
EOL
46+
pip install -r "requirements.txt"
47+
fi
8248
- name: Run script
8349
shell: bash
8450
run: |
85-
if [[ -n "${{ github.event.inputs.eval_content }}" ]]; then
86-
echo "Running Python file..."
87-
python3 "${{ github.event.inputs.eval_filename }}" > training.log 2>&1
88-
cat training.log # Debug: show output
89-
else
90-
echo "Running Python file..."
91-
python3 "${{ github.event.inputs.filename }}" > training.log 2>&1
92-
cat training.log # Debug: show output
93-
fi
51+
python3 .github/workflows/runner.py
52+
cat result.json # Debug: show output
9453
9554
- name: Upload training artifacts
9655
uses: actions/upload-artifact@v4
9756
if: always()
9857
with:
99-
name: training-artifacts
58+
name: run-result
10059
path: |
101-
training.log
102-
${{ github.event.inputs.filename }}
60+
result.json

.github/workflows/runner.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,10 @@
2121
result = {"compile": asdict(comp), "run": asdict(run)}
2222
else:
2323
run = run_pytorch_script(
24-
config.get("eval.py", py_eval), config.get("reference.py", None), config.get("submission.py", None), arch=None
24+
config.get("eval.py", py_eval),
25+
config.get("reference.py", None),
26+
config.get("submission.py", None),
27+
arch=None,
2528
)
2629
result = {"run": asdict(run)}
2730

src/discord-cluster-manager/cogs/github_cog.py

Lines changed: 26 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from discord.ext import commands
1313
from env import GITHUB_REPO, GITHUB_TOKEN
1414
from github import Github
15-
from leaderboard_eval import cu_eval, py_eval
15+
from leaderboard_eval import amd_requirements, nvidia_requirements
1616
from report import generate_report
1717
from run_eval import CompileResult, FullResult, RunResult
1818
from utils import get_github_branch_name, send_discord_message, setup_logging
@@ -68,34 +68,12 @@ async def run_github(
6868
else:
6969
reference_content = None
7070

71-
if gpu_type.value == "nvidia":
72-
run_id = await self.trigger_github_nvidia(
73-
lang=lang,
74-
script_content=script_content,
75-
reference_content=reference_content,
76-
)
77-
else:
78-
##########
79-
# OLD CODE
80-
filename = "train.py" if script.filename.endswith(".py") else "train.cu"
81-
if reference_script is not None or reference_code is not None:
82-
reference_content = (
83-
reference_code
84-
if reference_code is not None
85-
else (await reference_script.read()).decode("utf-8")
86-
)
87-
eval_code = py_eval if script.filename.endswith(".py") else cu_eval
88-
89-
run_id = await self.trigger_github_amd(
90-
script_content,
91-
filename,
92-
selected_gpu,
93-
reference_content,
94-
eval_code,
95-
)
96-
else:
97-
run_id = await self.trigger_github_amd(script_content, filename, selected_gpu)
98-
##########
71+
run_id = await self.trigger_github_run(
72+
lang=lang,
73+
gpu_type=selected_gpu,
74+
script_content=script_content,
75+
reference_content=reference_content,
76+
)
9977

10078
if run_id:
10179
await thread.send(
@@ -128,88 +106,42 @@ async def run_github(
128106
await thread.send(f"Error processing request: {str(e)}")
129107
raise
130108

131-
async def trigger_github_nvidia(
132-
self, lang: str, script_content: str, reference_content: Optional[str]
109+
async def trigger_github_run(
110+
self, lang: str, gpu_type: GPUType, script_content: str, reference_content: Optional[str]
133111
):
112+
if lang == "cu" and gpu_type == GPUType.AMD:
113+
# TODO: implement HIP support; until then, CUDA submissions are rejected for AMD GPUs
114+
raise ValueError("Cannot use CUDA runs with AMD GPUs")
115+
134116
eval_name = {"py": "eval.py", "cu": "eval.cu"}[lang]
135117
ref_name = {"py": "reference.py", "cu": "reference.cuh"}[lang]
136118
sub_name = {"py": "submission.py", "cu": "submission.cuh"}[lang]
119+
lang_name = {"py": "Python", "cu": "CUDA"}[lang]
137120

138121
if reference_content is None:
139122
config = {eval_name: script_content, "lang": lang}
140123
else:
141124
config = {ref_name: reference_content, sub_name: script_content, "lang": lang}
142125

143-
logger.info("Attempting to trigger GitHub action for NVIDIA")
126+
logger.info(f"Attempting to trigger GitHub action for {lang_name} on {gpu_type.name}")
144127
gh = Github(GITHUB_TOKEN)
145128
repo = gh.get_repo(GITHUB_REPO)
146129

147130
try:
148131
trigger_time = datetime.now(timezone.utc)
149-
workflow_file = "nvidia_workflow.yml"
132+
workflow_file = gpu_type.value
150133
workflow = repo.get_workflow(workflow_file)
151134

152135
payload = json.dumps(config)
153136

154137
inputs = {"payload": payload}
155138
if lang == "py":
156-
inputs["requirements"] = "numpy\ntorch\nsetuptools\nninja\ntriton"
157-
158-
success = workflow.create_dispatch(
159-
get_github_branch_name(),
160-
inputs=inputs
161-
)
162-
if success:
163-
await asyncio.sleep(2)
164-
runs = list(workflow.get_runs())
165-
166-
for run in runs:
167-
if run.created_at.replace(tzinfo=timezone.utc) > trigger_time:
168-
return run.id
169-
return None
170-
171-
except Exception as e:
172-
logger.error(f"Error in trigger_github_action: {str(e)}", exc_info=True)
173-
return None
174-
175-
async def trigger_github_amd(
176-
self,
177-
script_content,
178-
filename,
179-
gpu_type,
180-
reference_content=None,
181-
eval_content=None,
182-
):
183-
logger.info(f"Attempting to trigger GitHub action for {gpu_type.name} GPU")
184-
gh = Github(GITHUB_TOKEN)
185-
repo = gh.get_repo(GITHUB_REPO)
186-
187-
try:
188-
trigger_time = datetime.now(timezone.utc)
189-
workflow_file = gpu_type.value
190-
workflow = repo.get_workflow(workflow_file)
191-
192-
if reference_content is not None:
193-
eval_filename = "eval.py" if filename.endswith(".py") else "eval.cu"
194-
reference_filename = "reference.py" if filename.endswith(".py") else "reference.cuh"
195-
filename = "train.py" if filename.endswith(".py") else "train.cuh"
196-
success = workflow.create_dispatch(
197-
get_github_branch_name(),
198-
{
199-
"script_content": script_content,
200-
"filename": filename,
201-
"reference_content": reference_content,
202-
"reference_filename": reference_filename,
203-
"eval_content": eval_content,
204-
"eval_filename": eval_filename,
205-
},
206-
)
207-
else:
208-
success = workflow.create_dispatch(
209-
get_github_branch_name(),
210-
{"script_content": script_content, "filename": filename},
211-
)
139+
if gpu_type == GPUType.NVIDIA:
140+
inputs["requirements"] = nvidia_requirements
141+
else:
142+
inputs["requirements"] = amd_requirements
212143

144+
success = workflow.create_dispatch(get_github_branch_name(), inputs=inputs)
213145
if success:
214146
await asyncio.sleep(2)
215147
runs = list(workflow.get_runs())
@@ -258,10 +190,7 @@ async def check_workflow_status(self, run_id, thread, gpu_type):
258190
)
259191

260192
if run.status == "completed":
261-
if gpu_type.value == "nvidia":
262-
result = await self.download_results(run_id)
263-
else:
264-
result = await self.handle_training_log(run_id)
193+
result = await self.download_results(run_id)
265194
return run.conclusion, result, run.html_url
266195

267196
await thread.send(
@@ -271,6 +200,7 @@ async def check_workflow_status(self, run_id, thread, gpu_type):
271200
)
272201
await asyncio.sleep(20)
273202
except Exception as e:
203+
logger.error("Error", exc_info=e)
274204
return "error", str(e), None
275205

276206
async def download_results(self, run_id) -> FullResult:
@@ -285,21 +215,14 @@ async def download_results(self, run_id) -> FullResult:
285215
run = RunResult(**data["run"])
286216
return FullResult(success=True, error="", compile=comp, run=run)
287217
except Exception as e:
218+
logger.error("Error downloading artifacts", exc_info=e)
288219
return FullResult(
289220
success=False,
290-
error=f"Error downloading artifacts: {str(e)}",
221+
error=f"Error downloading artifacts: {repr(e)}",
291222
compile=None,
292223
run=None,
293224
)
294225

295-
async def handle_training_log(self, run_id):
296-
try:
297-
data = await self.download_artifact(run_id, name="training-artifacts")
298-
logs = data["training.log"].decode("utf-8")
299-
return logs
300-
except Exception as e:
301-
return f"Error downloading artifacts: {str(e)}"
302-
303226
async def download_artifact(self, run_id, name: str):
304227
logger.info(f"Attempting to download artifact {name} for run {run_id}")
305228
gh = Github(GITHUB_TOKEN)
@@ -330,4 +253,4 @@ async def download_artifact(self, run_id, name: str):
330253
raise RuntimeError(
331254
f"Failed to download artifact. Status code: {response.status_code}"
332255
)
333-
return RuntimeError(f"Could not find artifact {name}")
256+
raise RuntimeError(f"Could not find artifact {name}")

src/discord-cluster-manager/leaderboard_eval.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,17 @@
66

77
py_eval = Path.read_text(Path(__file__).parent / "eval.py")
88
cu_eval = Path.read_text(Path(__file__).parent / "eval.cu")
9+
10+
nvidia_requirements = """
11+
numpy
12+
torch
13+
setuptools
14+
ninja
15+
triton
16+
"""
17+
18+
amd_requirements = """
19+
--index-url https://download.pytorch.org/whl/nightly/rocm6.2
20+
pytorch-triton-rocm==3.1.0+cf34004b8a
21+
torch==2.6.0.dev20241023+rocm6.2
22+
"""

src/discord-cluster-manager/run_eval.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,8 @@ def run_program(args: list[str]) -> RunResult:
150150
result_dict = {}
151151
for line in result.splitlines():
152152
key, _, value = line.partition(":")
153-
result_dict[key.strip()] = value.strip()
153+
if key != "" or value != "":
154+
result_dict[key.strip()] = value.strip()
154155

155156
return RunResult(
156157
success=(

0 commit comments

Comments
 (0)