diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..bc6a225 --- /dev/null +++ b/go.mod @@ -0,0 +1,21 @@ +module github.com/ROCm/gpu-agent + +go 1.23.2 + +require ( + github.com/gogo/protobuf v1.3.2 + github.com/satori/go.uuid v1.2.0 + github.com/spf13/cobra v1.9.1 + google.golang.org/grpc v1.72.1 + google.golang.org/protobuf v1.36.6 + gopkg.in/yaml.v2 v2.4.0 +) + +require ( + github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/spf13/pflag v1.0.6 // indirect + golang.org/x/net v0.35.0 // indirect + golang.org/x/sys v0.30.0 // indirect + golang.org/x/text v0.22.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..0658c76 --- /dev/null +++ b/go.sum @@ -0,0 +1,80 @@ +github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/inconshreveable/mousetrap v1.1.0 
h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww= +github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= +github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= +github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= +github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= +go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +go.opentelemetry.io/otel v1.34.0 h1:zRLXxLCgL1WyKsPVrgbSdMN4c0FMkDAskSTQP+0hdUY= +go.opentelemetry.io/otel v1.34.0/go.mod h1:OWFPOQ+h4G8xpyjgqo4SxJYdDQ/qmRH+wivy7zzx9oI= +go.opentelemetry.io/otel/metric v1.34.0 h1:+eTR3U0MyfWjRDhmFMxe2SsW64QrZ84AOhvqS7Y+PoQ= +go.opentelemetry.io/otel/metric v1.34.0/go.mod h1:CEDrp0fy2D0MvkXE+dPV7cMi8tWZwX3dmaIhwPOaqHE= +go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= +go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= +go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= +go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod 
h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= +go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC8mh/k= +go.opentelemetry.io/otel/trace v1.34.0/go.mod h1:Svm7lSjQD7kG7KJ/MUHPVXSDGz2OX4h0M2jHBhmSfRE= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8= +golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.30.0 
h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= +golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= +golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a h1:51aaUVRocpvUOSQKM6Q7VuoaktNIaMCLuhZB6DKksq4= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a/go.mod h1:uRxBH1mhmO8PGhU89cMcHaXKZqO+OfakD8QQO0oYwlQ= +google.golang.org/grpc v1.72.1 h1:HR03wO6eyZ7lknl75XlxABNVLLFc2PAb6mHlYh756mA= +google.golang.org/grpc v1.72.1/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= +google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= +google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 
h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/sw/nic/gpuagent/cli/cmd/events.go b/sw/nic/gpuagent/cli/cmd/events.go index 01e98aa..ae89ee4 100644 --- a/sw/nic/gpuagent/cli/cmd/events.go +++ b/sw/nic/gpuagent/cli/cmd/events.go @@ -32,8 +32,8 @@ import ( uuid "github.com/satori/go.uuid" "github.com/spf13/cobra" - "github.com/pensando/sw/nic/gpuagent/cli/utils" - aga "github.com/pensando/sw/nic/gpuagent/gen/go" + "github.com/ROCm/gpu-agent/sw/nic/gpuagent/cli/utils" + aga "github.com/ROCm/gpu-agent/sw/nic/gpuagent/gen/go" ) var ( diff --git a/sw/nic/gpuagent/cli/cmd/gpu.go b/sw/nic/gpuagent/cli/cmd/gpu.go index da1b3d1..d906c5d 100644 --- a/sw/nic/gpuagent/cli/cmd/gpu.go +++ b/sw/nic/gpuagent/cli/cmd/gpu.go @@ -35,8 +35,8 @@ import ( "github.com/spf13/cobra" yaml "gopkg.in/yaml.v2" - "github.com/pensando/sw/nic/gpuagent/cli/utils" - aga "github.com/pensando/sw/nic/gpuagent/gen/go" + "github.com/ROCm/gpu-agent/sw/nic/gpuagent/cli/utils" + aga "github.com/ROCm/gpu-agent/sw/nic/gpuagent/gen/go" ) var ( diff --git a/sw/nic/gpuagent/cli/cmd/gpu_watch.go b/sw/nic/gpuagent/cli/cmd/gpu_watch.go index 4e662a8..af6bb94 100644 --- a/sw/nic/gpuagent/cli/cmd/gpu_watch.go +++ b/sw/nic/gpuagent/cli/cmd/gpu_watch.go @@ -34,8 +34,8 @@ import ( "github.com/spf13/cobra" yaml "gopkg.in/yaml.v2" - "github.com/pensando/sw/nic/gpuagent/cli/utils" - aga "github.com/pensando/sw/nic/gpuagent/gen/go" + "github.com/ROCm/gpu-agent/sw/nic/gpuagent/cli/utils" + aga "github.com/ROCm/gpu-agent/sw/nic/gpuagent/gen/go" ) var ( diff --git a/sw/nic/gpuagent/cli/cmd/root.go b/sw/nic/gpuagent/cli/cmd/root.go index 5487ef7..f889d7f 100644 --- 
a/sw/nic/gpuagent/cli/cmd/root.go +++ b/sw/nic/gpuagent/cli/cmd/root.go @@ -27,7 +27,7 @@ package cmd import ( "os" - "github.com/pensando/sw/nic/gpuagent/cli/utils" + "github.com/ROCm/gpu-agent/sw/nic/gpuagent/cli/utils" "github.com/spf13/cobra" ) diff --git a/sw/nic/gpuagent/cli/cmd/system.go b/sw/nic/gpuagent/cli/cmd/system.go index d4c549c..7dde0dd 100644 --- a/sw/nic/gpuagent/cli/cmd/system.go +++ b/sw/nic/gpuagent/cli/cmd/system.go @@ -30,8 +30,8 @@ import ( "github.com/spf13/cobra" - "github.com/pensando/sw/nic/gpuagent/cli/utils" - aga "github.com/pensando/sw/nic/gpuagent/gen/go" + "github.com/ROCm/gpu-agent/sw/nic/gpuagent/cli/utils" + aga "github.com/ROCm/gpu-agent/sw/nic/gpuagent/gen/go" ) var ( diff --git a/sw/nic/gpuagent/cli/cmd/topo.go b/sw/nic/gpuagent/cli/cmd/topo.go index 48bc044..1e2666f 100644 --- a/sw/nic/gpuagent/cli/cmd/topo.go +++ b/sw/nic/gpuagent/cli/cmd/topo.go @@ -31,8 +31,8 @@ import ( "github.com/spf13/cobra" - "github.com/pensando/sw/nic/gpuagent/cli/utils" - aga "github.com/pensando/sw/nic/gpuagent/gen/go" + "github.com/ROCm/gpu-agent/sw/nic/gpuagent/cli/utils" + aga "github.com/ROCm/gpu-agent/sw/nic/gpuagent/gen/go" ) var deviceShowCmd = &cobra.Command{ diff --git a/sw/nic/gpuagent/cli/main.go b/sw/nic/gpuagent/cli/main.go index 3d744f2..4129dd5 100644 --- a/sw/nic/gpuagent/cli/main.go +++ b/sw/nic/gpuagent/cli/main.go @@ -17,7 +17,7 @@ package main -import "github.com/pensando/sw/nic/gpuagent/cli/cmd" +import "github.com/ROCm/gpu-agent/sw/nic/gpuagent/cli/cmd" func main() { cmd.Execute() diff --git a/sw/nic/gpuagent/cli/utils/client.go b/sw/nic/gpuagent/cli/utils/client.go index 66ed732..351f5df 100644 --- a/sw/nic/gpuagent/cli/utils/client.go +++ b/sw/nic/gpuagent/cli/utils/client.go @@ -69,13 +69,19 @@ func getClientReqTimeout() (uint, error) { // createNewGRPCClient creates a grpc connection to HAL // we first check if secure grpc exists and if not fallback // to regular grpc -func createNewGRPCClient() (*grpc.ClientConn, error) 
{ +func createNewGRPCClient(baseURL string) (*grpc.ClientConn, error) { // unsecure grpc agaPort := os.Getenv("AGA_GRPC_PORT") if agaPort == "" { agaPort = GRPCDefaultPort } - srvURL := GRPCDefaultBaseURL + ":" + agaPort + var srvURL string + + if baseURL == "" { + srvURL = GRPCDefaultBaseURL + ":" + agaPort + } else { + srvURL = baseURL + ":" + agaPort + } timeout, err := getClientPortConnTimeout() if err != nil { return nil, err @@ -98,12 +104,17 @@ func createNewGRPCClient() (*grpc.ClientConn, error) { return rpcClient, err } -func CreateNewAGAGRPClient() (*grpc.ClientConn, context.Context, +func CreateNewAGAGRPClient(baseURL ...string) (*grpc.ClientConn, context.Context, context.CancelFunc, error) { var ctxt context.Context var cancel context.CancelFunc + var url string + + if len(baseURL) != 0 { + url = baseURL[0] + } - client, err := createNewGRPCClient() + client, err := createNewGRPCClient(url) if err != nil { return nil, nil, nil, err } diff --git a/sw/nic/gpuagent/protos/Makefile b/sw/nic/gpuagent/protos/Makefile new file mode 100644 index 0000000..1bbc808 --- /dev/null +++ b/sw/nic/gpuagent/protos/Makefile @@ -0,0 +1,27 @@ +export PROTOC ?= protoc +TOP_DIR = ../../.. 
+GEN_DIR = ${TOP_DIR}/nic/gpuagent/gen/go
+GPU_PROTO_FILES = $(shell ls ${CURDIR} | grep '\.proto$$' | grep -v types.proto | grep -v module | tr " " "\n" | LC_ALL=C sort | tr "\n" " ")
+$(info ${GPU_PROTO_FILES})
+
+default: create-gen-dir gpu-proto
+
+gpu-proto: gen-clean
+	LC_ALL=C ${PROTOC} --proto_path=${CURDIR} -I${CURDIR} \
+	-I${TOP_DIR} -I$(TOP_DIR)/vendor/github.com/gogo/protobuf/gogoproto \
+	--gogofast_out=Mgoogle/protobuf/any.proto=github.com/gogo/protobuf/types,Mgogo.proto=github.com/gogo/protobuf/gogoproto,plugins=grpc:${GEN_DIR} types.proto ${GPU_PROTO_FILES}
+
+create-gen-dir:
+	mkdir -p ${GEN_DIR}
+
+clean-proto:
+	rm -f ${GEN_DIR}/*.go
+
+gen-clean:
+	rm -f ${GEN_DIR}/*.go
+
+proto-sync: default
+	@$(eval DIFFS=`git ls-files --exclude-standard --modified --directory ${CURDIR}`)
+	@echo "Found the following uncommitted pb.go files, if any"
+	@echo $(DIFFS)
+	@test -z "$(DIFFS)"
diff --git a/sw/nic/gpuagent/tests/Dockerfile b/sw/nic/gpuagent/tests/Dockerfile
new file mode 100644
index 0000000..357bcee
--- /dev/null
+++ b/sw/nic/gpuagent/tests/Dockerfile
@@ -0,0 +1,42 @@
+FROM ubuntu:22.04
+
+ENV PROTOBUF_VERSION=23.4
+ENV GOPATH=/go
+ENV GOROOT=/usr/local/go
+ENV PATH=$PATH:$GOROOT/bin:$GOPATH/bin
+
+# Install build tooling, the Go toolchain and protoc in a single layer, then
+# remove downloads and caches so they are not baked into the image.
+RUN export DEBIAN_FRONTEND=noninteractive && \
+    apt-get update && \
+    apt-get install -y wget git make unzip build-essential && \
+    echo 'Install golang' && \
+    wget https://dl.google.com/go/go1.23.2.linux-amd64.tar.gz && \
+    tar -C /usr/local -xzf go1.23.2.linux-amd64.tar.gz && \
+    rm go1.23.2.linux-amd64.tar.gz && \
+    export PATH=/usr/local/go/bin:$PATH && \
+    export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH && \
+    echo 'export PATH=/usr/local/go/bin:$PATH' >> ~/.bashrc && \
+    echo 'export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH' >> ~/.bashrc && \
+    echo 'Install go protoc' && \
+    export GOPATH=/usr/local && \
+    export PATH=$PATH:/usr/local/go/bin:/root/go/bin && \
+    wget "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOBUF_VERSION}/protoc-${PROTOBUF_VERSION}-linux-x86_64.zip" && \
+    unzip "protoc-${PROTOBUF_VERSION}-linux-x86_64.zip" -d /usr/local && \
+    rm "protoc-${PROTOBUF_VERSION}-linux-x86_64.zip" && \
+    rm -rf /var/lib/apt/lists/* && \
+    rm -rf /usr/lib/locale && \
+    rm -rf /usr/share/locale && \
+    rm -rf /usr/local/src && \
+    rm -rf /root/.cache && \
+    apt-get clean && \
+    apt-get autoclean
+
+WORKDIR /gpu-agent
+RUN go mod init temp
+
+# protoc-gen-gogofast is the protoc plugin behind the --gogofast_out flag used by the protos Makefile.
+RUN go get github.com/gogo/protobuf/protoc-gen-gogofast
+RUN go install github.com/gogo/protobuf/protoc-gen-gogofast
+
+CMD ["/usr/bin/bash"]
diff --git a/sw/nic/gpuagent/tests/Makefile b/sw/nic/gpuagent/tests/Makefile
new file mode 100644
index 0000000..3fa08d4
--- /dev/null
+++ b/sw/nic/gpuagent/tests/Makefile
@@ -0,0 +1,47 @@
+#
+# Copyright(C) Advanced Micro Devices, Inc. All rights reserved.
+#
+# You may not use this software and documentation (if any) (collectively,
+# the "Materials") except in compliance with the terms and conditions of
+# the Software License Agreement included with the Materials or otherwise as
+# set forth in writing and signed by you and an authorized signatory of AMD.
+# If you do not have a copy of the Software License Agreement, contact your
+# AMD representative for a copy.
+#
+# You agree that you will not reverse engineer or decompile the Materials,
+# in whole or in part, except as allowed by applicable law.
+#
+# THE MATERIALS ARE DISTRIBUTED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR
+# REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+
+# Image name for the Docker container
+IMAGE_NAME := gpuagent-go-test-env
+GRPC_BASE_URL ?= "127.0.0.1"
+
+HOST_MOUNT_DIR := $(CURDIR)/../../../..
+PROTOS_DIR :=./sw/nic/gpuagent/protos
+
+# Container mount point
+CONTAINER_MOUNT_POINT := /gpu-agent
+
+DOCKERFILE := Dockerfile
+
+# Default target
+all: test clean-image
+
+# Build the Docker image
+build-image:
+	docker build -t $(IMAGE_NAME) -f $(DOCKERFILE) .
+
+# Run the tests in a container; generated code is removed even if they fail
+test: build-image
+	docker run --rm \
+		-e GRPC_BASE_URL=$(GRPC_BASE_URL) \
+		-v $(HOST_MOUNT_DIR):$(CONTAINER_MOUNT_POINT) $(IMAGE_NAME) \
+		sh -c "make -C $(CONTAINER_MOUNT_POINT)/sw/nic/gpuagent/protos && go test -v $(CONTAINER_MOUNT_POINT)/sw/nic/gpuagent/tests/; rc=\$$?; rm -rf $(CONTAINER_MOUNT_POINT)/sw/nic/gpuagent/gen; exit \$$rc"
+
+# Clean up the Docker image
+clean-image:
+	docker rmi -f $(IMAGE_NAME)
+
+.PHONY: all build-image test clean-image
diff --git a/sw/nic/gpuagent/tests/gpuagent_test.go b/sw/nic/gpuagent/tests/gpuagent_test.go
new file mode 100644
index 0000000..1f262fe
--- /dev/null
+++ b/sw/nic/gpuagent/tests/gpuagent_test.go
@@ -0,0 +1,262 @@
+//
+// Copyright(C) Advanced Micro Devices, Inc. All rights reserved.
+//
+// You may not use this software and documentation (if any) (collectively,
+// the "Materials") except in compliance with the terms and conditions of
+// the Software License Agreement included with the Materials or otherwise as
+// set forth in writing and signed by you and an authorized signatory of AMD.
+// If you do not have a copy of the Software License Agreement, contact your
+// AMD representative for a copy.
+//
+// You agree that you will not reverse engineer or decompile the Materials,
+// in whole or in part, except as allowed by applicable law.
+//
+// THE MATERIALS ARE DISTRIBUTED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR
+// REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+//
+
+// Package main holds integration tests that drive a running GPU agent over
+// gRPC. The agent host is taken from the GRPC_BASE_URL environment variable.
+package main
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"testing"
+
+	"github.com/ROCm/gpu-agent/sw/nic/gpuagent/cli/utils"
+	amdgpu "github.com/ROCm/gpu-agent/sw/nic/gpuagent/gen/go"
+	"google.golang.org/grpc"
+)
+
+// Connection state shared by all tests; populated once by runTests.
+var (
+	grpcBaseURL   = os.Getenv("GRPC_BASE_URL")
+	gpuSvcClient  amdgpu.GPUSvcClient
+	topoSvcClient amdgpu.TopoSvcClient
+	ctxt          context.Context
+	cancel        context.CancelFunc
+	conn          *grpc.ClientConn
+)
+
+// Assert fails the calling test immediately with errString when b is false.
+func Assert(t *testing.T, b bool, errString string) {
+	t.Helper()
+	if !b {
+		// errString is already fully formatted; passing it to Fatalf would
+		// misinterpret any '%' it contains as a formatting verb.
+		t.Fatal(errString)
+	}
+}
+
+// getGpu returns the GPUs reported by the agent. A nil gpuID requests every
+// GPU; otherwise only that ID is requested. The test fails if the RPC returns
+// an error or the response status is not API_STATUS_OK.
+func getGpu(t *testing.T, gpuID []byte) []*amdgpu.GPU {
+	t.Helper()
+
+	req := &amdgpu.GPUGetRequest{
+		Id: [][]byte{},
+	}
+	if gpuID != nil {
+		req.Id = append(req.Id, gpuID)
+	}
+
+	respMsg, err := gpuSvcClient.GPUGet(ctxt, req)
+	Assert(t, err == nil, fmt.Sprintf("Failed to get GPUs err: %v", err))
+	Assert(t, respMsg.ApiStatus == amdgpu.ApiStatus_API_STATUS_OK, fmt.Sprintf("Operation failed with %v error", respMsg.ApiStatus))
+
+	return respMsg.Response
+}
+
+// runTests connects to the agent, runs the suite and returns its exit code.
+// It is separate from TestMain because os.Exit does not run deferred calls;
+// returning from here first lets the connection cleanup execute.
+func runTests(m *testing.M) int {
+	fmt.Printf("grpcbaseurl: %v\n", grpcBaseURL)
+	fmt.Println("Running Unit test cases for gpuagent")
+
+	var err error
+	conn, ctxt, cancel, err = utils.CreateNewAGAGRPClient(grpcBaseURL)
+	if err != nil {
+		fmt.Printf("Could not connect to the GPU agent, is agent running? err: %v\n", err)
+		return 1
+	}
+	defer conn.Close()
+	defer cancel()
+
+	gpuSvcClient = amdgpu.NewGPUSvcClient(conn)
+	topoSvcClient = amdgpu.NewTopoSvcClient(conn)
+
+	return m.Run()
+}
+
+// TestMain sets up the shared gRPC clients and runs the tests.
+func TestMain(m *testing.M) {
+	os.Exit(runTests(m))
+}
+
+// TestGetGpu fetches all GPUs, then fetches the first one again by its ID.
+func TestGetGpu(t *testing.T) {
+	gpus := getGpu(t, nil)
+
+	// at least one GPU has to be returned
+	Assert(t, len(gpus) != 0, "No GPUs returned")
+
+	t.Logf("TestGetGpusAll: No of GPUs returned: %d", len(gpus))
+
+	// verify vendor is AMD
+	for _, gpu := range gpus {
+		Assert(t, gpu.Status.CardVendor == "AMD", fmt.Sprintf("Expected card vendor to be AMD, got: %v", gpu.Status.CardVendor))
+	}
+
+	gpuID := gpus[0].Spec.Id
+	gpus = getGpu(t, gpuID)
+
+	// exactly one GPU has to be returned when querying by ID
+	Assert(t, len(gpus) == 1, fmt.Sprintf("No GPUs returned for ID: %v", utils.IdToStr(gpuID)))
+
+	// verify vendor is AMD
+	Assert(t, gpus[0].Status.CardVendor == "AMD", fmt.Sprintf("Expected card vendor to be AMD, got: %v", gpus[0].Status.CardVendor))
+
+	// verify the ID
+	wantID, gotID := utils.IdToStr(gpuID), utils.IdToStr(gpus[0].Spec.Id)
+	Assert(t, wantID == gotID, fmt.Sprintf("Expected gpu ID: %v, got: %v", wantID, gotID))
+}
+
+// TestGpuAdminStUpdate toggles the admin state of the first GPU and expects the
+// agent to answer OPERATION_NOT_SUPPORTED and leave the state untouched.
+func TestGpuAdminStUpdate(t *testing.T) {
+	gpus := getGpu(t, nil)
+	Assert(t, len(gpus) != 0, "No GPUs returned")
+
+	gpuSpec := gpus[0].GetSpec()
+	Assert(t, gpuSpec != nil, "GPU spec is missing")
+
+	updateSpec := *gpuSpec
+
+	updateSpec.AdminState = amdgpu.GPUAdminState_GPU_ADMIN_STATE_UP
+	if gpuSpec.AdminState == amdgpu.GPUAdminState_GPU_ADMIN_STATE_UP {
+		updateSpec.AdminState = amdgpu.GPUAdminState_GPU_ADMIN_STATE_DOWN
+	}
+
+	reqMsg := &amdgpu.GPUUpdateRequest{
+		Spec: []*amdgpu.GPUSpec{
+			&updateSpec,
+		},
+	}
+	// GPU agent call
+	updateRespMsg, err := gpuSvcClient.GPUUpdate(ctxt, reqMsg)
+	Assert(t, err == nil, fmt.Sprintf("Updating GPU failed, err %v", err))
+	Assert(t, updateRespMsg.ApiStatus == amdgpu.ApiStatus_API_STATUS_OPERATION_NOT_SUPPORTED, fmt.Sprintf("Operation failed with error %v, error code %v",
+		updateRespMsg.ApiStatus, updateRespMsg.ErrorCode))
+
+	// GET to verify the admin-state
+	retGpu := getGpu(t, gpuSpec.Id)
+	Assert(t, len(retGpu) == 1, "GPU not returned after update")
+	Assert(t, retGpu[0].Spec.AdminState != updateSpec.AdminState, "admin-state got updated")
+}
+
+// TestGpuPerfLevelUpdate requests a different performance level for the first
+// GPU and expects OPERATION_NOT_SUPPORTED with the level left untouched.
+func TestGpuPerfLevelUpdate(t *testing.T) {
+	gpus := getGpu(t, nil)
+	Assert(t, len(gpus) != 0, "No GPUs returned")
+
+	// getNewPerfLevel returns the first performance level that differs from perf
+	getNewPerfLevel := func(perf amdgpu.GPUPerformanceLevel) amdgpu.GPUPerformanceLevel {
+		for l := amdgpu.GPUPerformanceLevel_GPU_PERF_LEVEL_NONE; l <= amdgpu.GPUPerformanceLevel_GPU_PERF_LEVEL_MANUAL; l++ {
+			if perf != l {
+				return l
+			}
+		}
+		return amdgpu.GPUPerformanceLevel_GPU_PERF_LEVEL_NONE
+	}
+
+	gpuSpec := gpus[0].GetSpec()
+	Assert(t, gpuSpec != nil, "GPU spec is missing")
+
+	updateSpec := *gpuSpec
+	updateSpec.PerformanceLevel = getNewPerfLevel(gpuSpec.PerformanceLevel)
+
+	reqMsg := &amdgpu.GPUUpdateRequest{
+		Spec: []*amdgpu.GPUSpec{
+			&updateSpec,
+		},
+	}
+	// GPU agent call
+	updateRespMsg, err := gpuSvcClient.GPUUpdate(ctxt, reqMsg)
+	Assert(t, err == nil, fmt.Sprintf("Updating GPU failed, err %v", err))
+	Assert(t, updateRespMsg.ApiStatus == amdgpu.ApiStatus_API_STATUS_OPERATION_NOT_SUPPORTED, fmt.Sprintf("Operation failed with error %v, error code %v",
+		updateRespMsg.ApiStatus, updateRespMsg.ErrorCode))
+
+	// GET to verify the perf-level
+	retGpu := getGpu(t, gpuSpec.Id)
+	Assert(t, len(retGpu) == 1, "GPU not returned after update")
+	Assert(t, retGpu[0].Spec.PerformanceLevel != updateSpec.PerformanceLevel, "perf-level got updated")
+}
+
+// TestGpuOverdriveLevelUpdate requests overdrive level 13 for the first GPU and
+// expects OPERATION_NOT_SUPPORTED with the original level still in place.
+func TestGpuOverdriveLevelUpdate(t *testing.T) {
+	gpus := getGpu(t, nil)
+	Assert(t, len(gpus) != 0, "No GPUs returned")
+
+	gpuSpec := gpus[0].GetSpec()
+	Assert(t, gpuSpec != nil, "GPU spec is missing")
+
+	updateSpec := *gpuSpec
+
+	origOverdriveLevel := gpuSpec.OverDriveLevel
+	updateSpec.OverDriveLevel = 13
+
+	reqMsg := &amdgpu.GPUUpdateRequest{
+		Spec: []*amdgpu.GPUSpec{
+			&updateSpec,
+		},
+	}
+	// GPU agent call
+	updateRespMsg, err := gpuSvcClient.GPUUpdate(ctxt, reqMsg)
+	Assert(t, err == nil, fmt.Sprintf("Updating GPU failed, err %v", err))
+	Assert(t, updateRespMsg.ApiStatus == amdgpu.ApiStatus_API_STATUS_OPERATION_NOT_SUPPORTED, fmt.Sprintf("Operation failed with error %v, error code %v",
+		updateRespMsg.ApiStatus, updateRespMsg.ErrorCode))
+
+	// GET to verify the overdrive-level
+	retGpu := getGpu(t, gpuSpec.Id)
+	Assert(t, len(retGpu) == 1, "GPU not returned after update")
+	Assert(t, retGpu[0].Spec.OverDriveLevel == origOverdriveLevel, "overdrive-level got updated")
+}
+
+// TestGpuDeviceTopology checks that the agent returns a non-empty topology.
+func TestGpuDeviceTopology(t *testing.T) {
+	req := &amdgpu.DeviceTopologyGetRequest{}
+	// GPU agent call
+	resp, err := topoSvcClient.DeviceTopologyGet(ctxt, req)
+	Assert(t, err == nil, fmt.Sprintf("Device topology get failed, err %v", err))
+
+	Assert(t, resp.ApiStatus == amdgpu.ApiStatus_API_STATUS_OK, fmt.Sprintf("Operation failed with error %v, error code %v",
+		resp.ApiStatus, resp.ErrorCode))
+
+	Assert(t, len(resp.GetDeviceTopology()) != 0, "Get device topology response is empty")
+}
+
+// TestGpuReset requests a fan reset on the first GPU and expects the agent to
+// answer OPERATION_NOT_SUPPORTED.
+func TestGpuReset(t *testing.T) {
+	gpus := getGpu(t, nil)
+	Assert(t, len(gpus) != 0, "No GPUs returned")
+
+	reqMsg := &amdgpu.GPUResetRequest{}
+	reqMsg.Id = append(reqMsg.Id, gpus[0].Spec.Id)
+	reqMsg.Reset_ = &amdgpu.GPUResetRequest_ResetFans{
+		ResetFans: true,
+	}
+
+	// GPU agent call
+	updateRespMsg, err := gpuSvcClient.GPUReset(ctxt, reqMsg)
+	Assert(t, err == nil, fmt.Sprintf("GPU Reset failed, err %v", err))
+
+	Assert(t, updateRespMsg.ApiStatus == amdgpu.ApiStatus_API_STATUS_OPERATION_NOT_SUPPORTED, fmt.Sprintf("Operation failed with error %v, error code %v",
+		updateRespMsg.ApiStatus, updateRespMsg.ErrorCode))
+}