Skip to content

pod running error #146

@bilbilmyc

Description

@bilbilmyc

Describe the bug

I used Helm to install nvidia_gpu_exporter, changing only values.yml. The pod is running normally, but it keeps logging errors: error="command failed. stderr: err: exit status 12"

To Reproduce
Steps to reproduce the behavior:
https://artifacthub.io/packages/helm/utkuozdemir/nvidia-gpu-exporter

  1. This is my values.yml
image:
  repository: docker.io/utkuozdemir/nvidia_gpu_exporter
  pullPolicy: IfNotPresent
  tag: ""

imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""

serviceAccount:
  create: true
  annotations: {}
  name: ""

podAnnotations: {}

podSecurityContext: {}

securityContext:
  privileged: true

service:
  type: NodePort
  port: 9835
  nodePort: 30235

ingress:
  enabled: false
  className: ""
  annotations: {}
  hosts:
    - host: chart-example.local
      paths:
        - path: /
          pathType: ImplementationSpecific
  tls: []

resources: {}

nodeSelector: {}

tolerations: {}

affinity:
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
      - matchExpressions:
        - key: kubernetes.io/hostname
          operator: NotIn
          values:
          - pt01
          - pt02
          - pt03

port: 9835

hostPort:
  enabled: false
  port: 9835

log:
  level: info
  format: logfmt

queryFieldNames:
  - AUTO

nvidiaSmiCommand: nvidia-smi

telemetryPath: /metrics

volumes:
  - name: nvidiactl
    hostPath:
      path: /dev/nvidiactl
  - name: nvidia0
    hostPath:
      path: /dev/nvidia0
  - name: nvidia-smi
    hostPath:
      path: /usr/bin/nvidia-smi
  - name: libnvidia-ml-so
    hostPath:
      path: /usr/lib/libnvidia-ml.so
  - name: libnvidia-ml-so-1
    hostPath:
      path: /usr/lib/libnvidia-ml.so.1

volumeMounts:
  - name: nvidiactl
    mountPath: /dev/nvidiactl
  - name: nvidia0
    mountPath: /dev/nvidia0
  - name: nvidia-smi
    mountPath: /usr/bin/nvidia-smi
  - name: libnvidia-ml-so
    mountPath: /usr/lib/x86_64-linux-gnu/libnvidia-ml.so
  - name: libnvidia-ml-so-1
    mountPath: /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1

serviceMonitor:
  enabled: false
  additionalLabels: {}
  scheme: http
  bearerTokenFile:
  interval:
  tlsConfig: {}
  proxyUrl: ""
  relabelings: []
  metricRelabelings: []
  scrapeTimeout: 10s
  2. This is my driver
[root@g105 ~]# ll /usr/lib/libnvidia-ml.so.1 /usr/lib/libnvidia-ml.so /dev/nvidiactl /dev/nvidia0 /usr/bin/nvidia-smi
crw-rw-rw- 1 root root 195,   0 Jan 18 17:34 /dev/nvidia0
crw-rw-rw- 1 root root 195, 255 Jan 18 17:34 /dev/nvidiactl
-rwxr-xr-x 1 root root   634504 Jan 17 13:25 /usr/bin/nvidia-smi
lrwxrwxrwx 1 root root       17 Jan 17 13:25 /usr/lib/libnvidia-ml.so -> libnvidia-ml.so.1
lrwxrwxrwx 1 root root       25 Jan 17 13:25 /usr/lib/libnvidia-ml.so.1 -> libnvidia-ml.so.530.30.02
  3. This is the pod status
$ kubectl get pod -n mayunchao
NAME                                     READY   STATUS    RESTARTS   AGE
gpu-exporter-nvidia-gpu-exporter-2lh74   1/1     Running   0          8m7s
gpu-exporter-nvidia-gpu-exporter-474rj   1/1     Running   0          8m44s
gpu-exporter-nvidia-gpu-exporter-6sdxd   1/1     Running   0          8m39s
gpu-exporter-nvidia-gpu-exporter-9xssr   1/1     Running   0          7m40s
gpu-exporter-nvidia-gpu-exporter-b5cpq   1/1     Running   0          6m56s
gpu-exporter-nvidia-gpu-exporter-brrlx   1/1     Running   0          7m30s
gpu-exporter-nvidia-gpu-exporter-dv4z7   1/1     Running   0          7m15s
gpu-exporter-nvidia-gpu-exporter-fcbbn   1/1     Running   0          6m39s
gpu-exporter-nvidia-gpu-exporter-g8gwq   1/1     Running   0          8m27s
gpu-exporter-nvidia-gpu-exporter-grbrt   1/1     Running   0          7m1s
gpu-exporter-nvidia-gpu-exporter-ms5dn   1/1     Running   0          6m49s
gpu-exporter-nvidia-gpu-exporter-pjfpj   1/1     Running   0          8m20s
gpu-exporter-nvidia-gpu-exporter-qzqg6   1/1     Running   0          7m52s
gpu-exporter-nvidia-gpu-exporter-z6sxz   1/1     Running   0          9m7s
gpu-exporter-nvidia-gpu-exporter-zt82b   1/1     Running   0          8m58s

Expected behavior

I expect the pod to run properly and collect data.

Console output

$ kubectl logs -n mayunchao gpu-exporter-nvidia-gpu-exporter-6sdxd

level=warn ts=2024-03-07T08:49:46.506Z caller=exporter.go:101 msg="Failed to auto-determine query field names, falling back to the built-in list"
level=info ts=2024-03-07T08:49:46.509Z caller=main.go:65 msg="Listening on address" address=:9835
level=info ts=2024-03-07T08:49:46.510Z caller=tls_config.go:191 msg="TLS is disabled." http2=false
level=error ts=2024-03-07T08:49:50.685Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:50:04.185Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:50:05.663Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:50:19.164Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:50:20.663Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:50:34.163Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:50:35.663Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:50:49.163Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:50:50.663Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:51:04.163Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:51:05.662Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:51:19.164Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:51:20.668Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:51:34.164Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:51:35.662Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:51:49.164Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:51:50.663Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:52:04.164Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"
level=error ts=2024-03-07T08:52:05.663Z caller=exporter.go:148 error="command failed. stderr:  err: exit status 12"

Model and Version

  • GPU Model [e.g. NVIDIA GeForce RTX 4090]
  • App version and architecture [e.g. appVersion: 0.3.0, helm chart]
  • Installation method [e.g. helm]
  • Operating System [e.g. CentOS Linux release 7.9.2009 (Core), ]
  • Nvidia GPU driver version [e.g. NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1]

Additional context
Add any other context about the problem here.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions