AMD update: switch the AMD workflow to the JSON payload + runner.py path already used for NVIDIA

ngc92 · ngc92 · commit 84b29c960f66 · 2025-01-15T12:27:33.000+02:00
diff --git a/.github/workflows/amd_workflow.yml b/.github/workflows/amd_workflow.yml
@@ -1,102 +1,60 @@
 name: AMD PyTorch Job
-
 on:
   workflow_dispatch:
     inputs:
-      script_content:
-        description: 'Content of Python script'
-        required: true
-        type: string
-      filename:
-        description: 'Name of Python script'
+      payload:
+        description: 'Content of the user submission, as json string'
         required: true
         type: string
-      reference_content:
-        description: 'Content of the reference code script (optional)'
-        required: false
-        type: string
-      reference_filename:
-        description: 'Name of reference script (supports .py or .cu)'
-        required: false
-        type: string
-      eval_content:
-        description: 'Content of the outer eval code script (optional)'
-        required: false
-        type: string
-      eval_filename:
-        description: 'Name of outer eval script (supports .py or .cu)'
+      requirements:
+        description: 'Contents for a requirements.txt file'
         required: false
         type: string
 
 jobs:
-  train:
+  run:
     runs-on: [amdgpu-mi250-x86-64]
     timeout-minutes: 10
     env:
       VENV_DIR: /groups/aig_sharks/pytorch_venv
     steps:
+    - uses: actions/checkout@v3
     - name: Setup Python
       uses: actions/setup-python@v5
       with:
         python-version: '3.10'
 
-    - name: Create script
-      shell: python
-      run: |
-        with open('${{ github.event.inputs.filename }}', 'w') as f:
-            f.write('''${{ github.event.inputs.script_content }}''')
-
-    - name: Create reference scripts if provided
-      shell: bash
-      run: |
-        if [[ -n "${{ github.event.inputs.reference_filename }}" ]]; then
-          echo "Creating reference script..."
-          cat > "${{ github.event.inputs.reference_filename }}" <<EOL
-        ${{ github.event.inputs.reference_content }}
-        EOL
-            cat "${{ github.event.inputs.reference_filename }}"  # Debug: Show file contents
-          else
-              echo "No reference content provided."
-          fi
 
-    - name: Create eval scripts if provided
+    - name: Create input files
       shell: bash
       run: |
-        if [[ -n "${{ github.event.inputs.eval_filename }}" ]]; then
-          echo "Creating reference script..."
-          cat > "${{ github.event.inputs.eval_filename }}" <<EOL
-        ${{ github.event.inputs.eval_content }}
+        cat > "payload.json" <<'EOL'
+        ${{ github.event.inputs.payload }}
         EOL
-            cat "${{ github.event.inputs.eval_filename }}"  # Debug: Show file contents
-          else
-              echo "No eval content provided."
-          fi
 
     - name: Setup Virtual Environment and Install Dependencies
+      shell: bash
       run: |
         python -m venv ${VENV_DIR}
         source ${VENV_DIR}/bin/activate
         pip install --upgrade pip
-        pip install --pre pytorch-triton-rocm==3.1.0+cf34004b8a torch==2.6.0.dev20241023+rocm6.2 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
 
+        if [[ -n "${{ github.event.inputs.requirements }}" ]]; then
+          cat > "requirements.txt" <<'EOL'
+          ${{ github.event.inputs.requirements }}
+        EOL
+        pip install -r "requirements.txt"
+        fi
     - name: Run script
       shell: bash
       run: |
-        if [[ -n "${{ github.event.inputs.eval_content }}" ]]; then
-          echo "Running Python file..."
-          python3 "${{ github.event.inputs.eval_filename }}" > training.log 2>&1
-          cat training.log  # Debug: show output
-        else
-          echo "Running Python file..."
-          python3 "${{ github.event.inputs.filename }}" > training.log 2>&1
-          cat training.log  # Debug: show output
-        fi
+        python3 .github/workflows/runner.py
+        cat result.json  # Debug: show output
 
     - name: Upload training artifacts
       uses: actions/upload-artifact@v4
       if: always()
       with:
-        name: training-artifacts
+        name: run-result
         path: |
-          training.log
-          ${{ github.event.inputs.filename }}
+          result.json
diff --git a/.github/workflows/runner.py b/.github/workflows/runner.py
@@ -21,7 +21,10 @@
     result = {"compile": asdict(comp), "run": asdict(run)}
 else:
     run = run_pytorch_script(
-        config.get("eval.py", py_eval), config.get("reference.py", None), config.get("submission.py", None), arch=None
+        config.get("eval.py", py_eval),
+        config.get("reference.py", None),
+        config.get("submission.py", None),
+        arch=None,
     )
     result = {"run": asdict(run)}
 
diff --git a/src/discord-cluster-manager/cogs/github_cog.py b/src/discord-cluster-manager/cogs/github_cog.py
@@ -12,7 +12,7 @@
 from discord.ext import commands
 from env import GITHUB_REPO, GITHUB_TOKEN
 from github import Github
-from leaderboard_eval import cu_eval, py_eval
+from leaderboard_eval import amd_requirements, nvidia_requirements
 from report import generate_report
 from run_eval import CompileResult, FullResult, RunResult
 from utils import get_github_branch_name, send_discord_message, setup_logging
@@ -68,34 +68,12 @@ async def run_github(
             else:
                 reference_content = None
 
-            if gpu_type.value == "nvidia":
-                run_id = await self.trigger_github_nvidia(
-                    lang=lang,
-                    script_content=script_content,
-                    reference_content=reference_content,
-                )
-            else:
-                ##########
-                # OLD CODE
-                filename = "train.py" if script.filename.endswith(".py") else "train.cu"
-                if reference_script is not None or reference_code is not None:
-                    reference_content = (
-                        reference_code
-                        if reference_code is not None
-                        else (await reference_script.read()).decode("utf-8")
-                    )
-                    eval_code = py_eval if script.filename.endswith(".py") else cu_eval
-
-                    run_id = await self.trigger_github_amd(
-                        script_content,
-                        filename,
-                        selected_gpu,
-                        reference_content,
-                        eval_code,
-                    )
-                else:
-                    run_id = await self.trigger_github_amd(script_content, filename, selected_gpu)
-                ##########
+            run_id = await self.trigger_github_run(
+                lang=lang,
+                gpu_type=selected_gpu,
+                script_content=script_content,
+                reference_content=reference_content,
+            )
 
             if run_id:
                 await thread.send(
@@ -128,88 +106,42 @@ async def run_github(
                 await thread.send(f"Error processing request: {str(e)}")
             raise
 
-    async def trigger_github_nvidia(
-        self, lang: str, script_content: str, reference_content: Optional[str]
+    async def trigger_github_run(
+        self, lang: str, gpu_type: GPUType, script_content: str, reference_content: Optional[str]
     ):
+        if lang == "cu" and gpu_type == GPUType.AMD:
+            # TODO implement HIP
+            raise ValueError("Cannot use CUDA runs with AMD GPUs")
+
         eval_name = {"py": "eval.py", "cu": "eval.cu"}[lang]
         ref_name = {"py": "reference.py", "cu": "reference.cuh"}[lang]
         sub_name = {"py": "submission.py", "cu": "submission.cuh"}[lang]
+        lang_name = {"py": "Python", "cu": "CUDA"}[lang]
 
         if reference_content is None:
             config = {eval_name: script_content, "lang": lang}
         else:
             config = {ref_name: reference_content, sub_name: script_content, "lang": lang}
 
-        logger.info("Attempting to trigger GitHub action for NVIDIA")
+        logger.info(f"Attempting to trigger GitHub action for {lang_name} on {gpu_type.name}")
         gh = Github(GITHUB_TOKEN)
         repo = gh.get_repo(GITHUB_REPO)
 
         try:
             trigger_time = datetime.now(timezone.utc)
-            workflow_file = "nvidia_workflow.yml"
+            workflow_file = gpu_type.value
             workflow = repo.get_workflow(workflow_file)
 
             payload = json.dumps(config)
 
             inputs = {"payload": payload}
             if lang == "py":
-                inputs["requirements"] = "numpy\ntorch\nsetuptools\nninja\ntriton"
-
-            success = workflow.create_dispatch(
-                get_github_branch_name(),
-                inputs=inputs
-            )
-            if success:
-                await asyncio.sleep(2)
-                runs = list(workflow.get_runs())
-
-                for run in runs:
-                    if run.created_at.replace(tzinfo=timezone.utc) > trigger_time:
-                        return run.id
-            return None
-
-        except Exception as e:
-            logger.error(f"Error in trigger_github_action: {str(e)}", exc_info=True)
-            return None
-
-    async def trigger_github_amd(
-        self,
-        script_content,
-        filename,
-        gpu_type,
-        reference_content=None,
-        eval_content=None,
-    ):
-        logger.info(f"Attempting to trigger GitHub action for {gpu_type.name} GPU")
-        gh = Github(GITHUB_TOKEN)
-        repo = gh.get_repo(GITHUB_REPO)
-
-        try:
-            trigger_time = datetime.now(timezone.utc)
-            workflow_file = gpu_type.value
-            workflow = repo.get_workflow(workflow_file)
-
-            if reference_content is not None:
-                eval_filename = "eval.py" if filename.endswith(".py") else "eval.cu"
-                reference_filename = "reference.py" if filename.endswith(".py") else "reference.cuh"
-                filename = "train.py" if filename.endswith(".py") else "train.cuh"
-                success = workflow.create_dispatch(
-                    get_github_branch_name(),
-                    {
-                        "script_content": script_content,
-                        "filename": filename,
-                        "reference_content": reference_content,
-                        "reference_filename": reference_filename,
-                        "eval_content": eval_content,
-                        "eval_filename": eval_filename,
-                    },
-                )
-            else:
-                success = workflow.create_dispatch(
-                    get_github_branch_name(),
-                    {"script_content": script_content, "filename": filename},
-                )
+                if gpu_type == GPUType.NVIDIA:
+                    inputs["requirements"] = nvidia_requirements
+                else:
+                    inputs["requirements"] = amd_requirements
 
+            success = workflow.create_dispatch(get_github_branch_name(), inputs=inputs)
             if success:
                 await asyncio.sleep(2)
                 runs = list(workflow.get_runs())
@@ -258,10 +190,7 @@ async def check_workflow_status(self, run_id, thread, gpu_type):
                     )
 
                 if run.status == "completed":
-                    if gpu_type.value == "nvidia":
-                        result = await self.download_results(run_id)
-                    else:
-                        result = await self.handle_training_log(run_id)
+                    result = await self.download_results(run_id)
                     return run.conclusion, result, run.html_url
 
                 await thread.send(
@@ -271,6 +200,7 @@ async def check_workflow_status(self, run_id, thread, gpu_type):
                 )
                 await asyncio.sleep(20)
             except Exception as e:
+                logger.error("Error", exc_info=e)
                 return "error", str(e), None
 
     async def download_results(self, run_id) -> FullResult:
@@ -285,21 +215,14 @@ async def download_results(self, run_id) -> FullResult:
             run = RunResult(**data["run"])
             return FullResult(success=True, error="", compile=comp, run=run)
         except Exception as e:
+            logger.error("Error downloading artifacts", exc_info=e)
             return FullResult(
                 success=False,
-                error=f"Error downloading artifacts: {str(e)}",
+                error=f"Error downloading artifacts: {repr(e)}",
                 compile=None,
                 run=None,
             )
 
-    async def handle_training_log(self, run_id):
-        try:
-            data = await self.download_artifact(run_id, name="training-artifacts")
-            logs = data["training.log"].decode("utf-8")
-            return logs
-        except Exception as e:
-            return f"Error downloading artifacts: {str(e)}"
-
     async def download_artifact(self, run_id, name: str):
         logger.info(f"Attempting to download artifact {name} for run {run_id}")
         gh = Github(GITHUB_TOKEN)
@@ -330,4 +253,4 @@ async def download_artifact(self, run_id, name: str):
                     raise RuntimeError(
                         f"Failed to download artifact. Status code: {response.status_code}"
                     )
-        return RuntimeError(f"Could not find artifact {name}")
+        raise RuntimeError(f"Could not find artifact {name}")
diff --git a/src/discord-cluster-manager/leaderboard_eval.py b/src/discord-cluster-manager/leaderboard_eval.py
@@ -6,3 +6,17 @@
 
 py_eval = Path.read_text(Path(__file__).parent / "eval.py")
 cu_eval = Path.read_text(Path(__file__).parent / "eval.cu")
+
+nvidia_requirements = """
+numpy
+torch
+setuptools
+ninja
+triton
+"""
+
+amd_requirements = """
+--index-url https://download.pytorch.org/whl/nightly/rocm6.2
+pytorch-triton-rocm==3.1.0+cf34004b8a
+torch==2.6.0.dev20241023+rocm6.2
+"""
diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py
@@ -150,7 +150,8 @@ def run_program(args: list[str]) -> RunResult:
     result_dict = {}
     for line in result.splitlines():
         key, _, value = line.partition(":")
-        result_dict[key.strip()] = value.strip()
+        if key != "" or value != "":
+            result_dict[key.strip()] = value.strip()
 
     return RunResult(
         success=(