Prometheus 有两个指标可用于对 GPU 卡的 ECC 错误进行告警：
# HELP dcgm_ecc_dbe_volatile_total Total number of double-bit volatile ECC errors. 双比特易失性ECC错误的总数
# TYPE dcgm_ecc_dbe_volatile_total counter
dcgm_ecc_dbe_volatile_total{gpu="0",uuid="GPU-4d52e430-b8c7-a0b9-7fda-4aa825af5c97"} 0
# HELP dcgm_ecc_sbe_volatile_total Total number of single-bit volatile ECC errors. 单比特易失性ECC错误的总数
# TYPE dcgm_ecc_sbe_volatile_total counter
dcgm_ecc_sbe_volatile_total{gpu="0",uuid="GPU-4d52e430-b8c7-a0b9-7fda-4aa825af5c97"} 0