From 2dbe5b5685e7526d5c740002316921d316e89578 Mon Sep 17 00:00:00 2001 From: Ingo Oppermann Date: Thu, 24 Oct 2024 15:08:26 +0200 Subject: [PATCH] Add GPU support --- app/api/api.go | 8 +- cluster/about.go | 42 +- cluster/api.go | 13 + cluster/client/client.go | 31 +- cluster/leader_rebalance.go | 8 +- cluster/leader_relocate.go | 10 +- cluster/leader_synchronize.go | 14 +- cluster/leader_test.go | 634 +++++++----- cluster/node/core.go | 80 +- cluster/node/node.go | 45 +- cluster/resources.go | 159 ++- cluster/resources_test.go | 603 +++++++++++ config/config.go | 17 +- config/data.go | 6 +- config/value/primitives.go | 54 + config/value/primitives_test.go | 26 + ffmpeg/ffmpeg.go | 74 +- ffmpeg/parse/parser.go | 2 +- ffmpeg/parse/types.go | 21 + http/api/process.go | 136 ++- http/api/process_test.go | 35 +- internal/.gitignore | 3 +- internal/testhelper/nvidia-smi/nvidia-smi.go | 973 ++++++++++++++++++ monitor/cpu.go | 6 +- monitor/disk.go | 2 +- monitor/mem.go | 6 +- monitor/net.go | 2 +- process/limiter.go | 503 +++++---- process/limiter_test.go | 154 ++- process/process.go | 205 ++-- process/process_test.go | 18 +- psutil/gpu/gpu.go | 28 +- psutil/gpu/nvidia/fixtures/process.txt | 54 + .../nvidia/fixtures/{data1.xml => query1.xml} | 0 .../nvidia/fixtures/{data2.xml => query2.xml} | 18 + .../nvidia/fixtures/{data3.xml => query3.xml} | 0 psutil/gpu/nvidia/nvidia.go | 294 ++++-- psutil/gpu/nvidia/nvidia_test.go | 406 +++++++- psutil/process.go | 41 +- psutil/psutil.go | 174 ++-- resources/resources.go | 359 +++++-- resources/resources_test.go | 867 +++++++++++++++- restream/app/process.go | 114 +- restream/app/process_test.go | 10 +- restream/core.go | 194 ++-- restream/core_test.go | 4 +- restream/task.go | 59 +- session/registry_test.go | 1 + 48 files changed, 5375 insertions(+), 1138 deletions(-) create mode 100644 cluster/resources_test.go create mode 100644 internal/testhelper/nvidia-smi/nvidia-smi.go create mode 100644 psutil/gpu/nvidia/fixtures/process.txt 
rename psutil/gpu/nvidia/fixtures/{data1.xml => query1.xml} (100%) rename psutil/gpu/nvidia/fixtures/{data2.xml => query2.xml} (98%) rename psutil/gpu/nvidia/fixtures/{data3.xml => query3.xml} (100%) diff --git a/app/api/api.go b/app/api/api.go index fa017a88..042354e4 100644 --- a/app/api/api.go +++ b/app/api/api.go @@ -371,9 +371,11 @@ func (a *api) start(ctx context.Context) error { } resources, err := resources.New(resources.Config{ - MaxCPU: cfg.Resources.MaxCPUUsage, - MaxMemory: cfg.Resources.MaxMemoryUsage, - Logger: a.log.logger.core.WithComponent("Resources"), + MaxCPU: cfg.Resources.MaxCPUUsage, + MaxMemory: cfg.Resources.MaxMemoryUsage, + MaxGPU: cfg.Resources.MaxGPUUsage, + MaxGPUMemory: cfg.Resources.MaxGPUMemoryUsage, + Logger: a.log.logger.core.WithComponent("Resources"), }) if err != nil { return fmt.Errorf("failed to initialize resource manager: %w", err) diff --git a/cluster/about.go b/cluster/about.go index 3356faec..60585a38 100644 --- a/cluster/about.go +++ b/cluster/about.go @@ -18,18 +18,29 @@ type ClusterRaft struct { } type ClusterNodeResources struct { - IsThrottling bool // Whether this core is currently throttling - NCPU float64 // Number of CPU on this node - CPU float64 // Current CPU load, 0-100*ncpu - CPULimit float64 // Defined CPU load limit, 0-100*ncpu - CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu - Mem uint64 // Currently used memory in bytes - MemLimit uint64 // Defined memory limit in bytes - MemTotal uint64 // Total available memory in bytes - MemCore uint64 // Current used memory of the core itself in bytes + IsThrottling bool // Whether this core is currently throttling + NCPU float64 // Number of CPU on this node + CPU float64 // Current CPU load, 0-100*ncpu + CPULimit float64 // Defined CPU load limit, 0-100*ncpu + CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu + Mem uint64 // Currently used memory in bytes + MemLimit uint64 // Defined memory limit in bytes + MemTotal uint64 // 
Total available memory in bytes + MemCore uint64 // Current used memory of the core itself in bytes + GPU []ClusterNodeGPUResources // GPU resources Error error } +type ClusterNodeGPUResources struct { + Mem uint64 // Currently used memory in bytes + MemLimit uint64 // Defined memory limit in bytes + MemTotal uint64 // Total available memory in bytes + Usage float64 // Current general usage, 0-100 + UsageLimit float64 // Defined general usage limit, 0-100 + Encoder float64 // Current encoder usage, 0-100 + Decoder float64 // Current decoder usage, 0-100 +} + type ClusterNode struct { ID string Name string @@ -157,6 +168,19 @@ func (c *cluster) About() (ClusterAbout, error) { }, } + if len(nodeAbout.Resources.GPU) != 0 { + node.Resources.GPU = make([]ClusterNodeGPUResources, len(nodeAbout.Resources.GPU)) + for i, gpu := range nodeAbout.Resources.GPU { + node.Resources.GPU[i].Mem = gpu.Mem + node.Resources.GPU[i].MemLimit = gpu.MemLimit + node.Resources.GPU[i].MemTotal = gpu.MemTotal + node.Resources.GPU[i].Usage = gpu.Usage + node.Resources.GPU[i].UsageLimit = gpu.UsageLimit + node.Resources.GPU[i].Encoder = gpu.Encoder + node.Resources.GPU[i].Decoder = gpu.Decoder + } + } + if s, ok := serversMap[nodeAbout.ID]; ok { node.Voter = s.Voter node.Leader = s.Leader diff --git a/cluster/api.go b/cluster/api.go index de2f865b..38b21695 100644 --- a/cluster/api.go +++ b/cluster/api.go @@ -195,6 +195,19 @@ func (a *api) About(c echo.Context) error { }, } + if len(resources.GPU.GPU) != 0 { + about.Resources.GPU = make([]client.AboutResponseGPUResources, len(resources.GPU.GPU)) + for i, gpu := range resources.GPU.GPU { + about.Resources.GPU[i].Mem = gpu.MemoryUsed + about.Resources.GPU[i].MemLimit = gpu.MemoryLimit + about.Resources.GPU[i].MemTotal = gpu.MemoryTotal + about.Resources.GPU[i].Usage = gpu.Usage + about.Resources.GPU[i].UsageLimit = gpu.UsageLimit + about.Resources.GPU[i].Encoder = gpu.Encoder + about.Resources.GPU[i].Decoder = gpu.Decoder + } + } + if err != nil 
{ about.Resources.Error = err.Error() } diff --git a/cluster/client/client.go b/cluster/client/client.go index 84ab0230..214bf34d 100644 --- a/cluster/client/client.go +++ b/cluster/client/client.go @@ -83,17 +83,28 @@ type AboutResponse struct { Resources AboutResponseResources `json:"resources"` } +type AboutResponseGPUResources struct { + Mem uint64 `json:"memory_bytes"` // Currently used memory in bytes + MemLimit uint64 `json:"memory_limit_bytes"` // Defined memory limit in bytes + MemTotal uint64 `json:"memory_total_bytes"` // Total available memory in bytes + Usage float64 `json:"usage"` // Current general usage, 0-100 + Encoder float64 `json:"encoder"` // Current encoder usage, 0-100 + Decoder float64 `json:"decoder"` // Current decoder usage, 0-100 + UsageLimit float64 `json:"usage_limit"` // Defined general usage limit, 0-100 +} + type AboutResponseResources struct { - IsThrottling bool `json:"is_throttling"` // Whether this core is currently throttling - NCPU float64 `json:"ncpu"` // Number of CPU on this node - CPU float64 `json:"cpu"` // Current CPU load, 0-100*ncpu - CPULimit float64 `json:"cpu_limit"` // Defined CPU load limit, 0-100*ncpu - CPUCore float64 `json:"cpu_core"` // Current CPU load of the core itself, 0-100*ncpu - Mem uint64 `json:"memory_bytes"` // Currently used memory in bytes - MemLimit uint64 `json:"memory_limit_bytes"` // Defined memory limit in bytes - MemTotal uint64 `json:"memory_total_bytes"` // Total available memory in bytes - MemCore uint64 `json:"memory_core_bytes"` // Current used memory of the core itself in bytes - Error string `json:"error"` // Last error + IsThrottling bool `json:"is_throttling"` // Whether this core is currently throttling + NCPU float64 `json:"ncpu"` // Number of CPU on this node + CPU float64 `json:"cpu"` // Current CPU load, 0-100*ncpu + CPULimit float64 `json:"cpu_limit"` // Defined CPU load limit, 0-100*ncpu + CPUCore float64 `json:"cpu_core"` // Current CPU load of the core itself, 0-100*ncpu + 
Mem uint64 `json:"memory_bytes"` // Currently used memory in bytes + MemLimit uint64 `json:"memory_limit_bytes"` // Defined memory limit in bytes + MemTotal uint64 `json:"memory_total_bytes"` // Total available memory in bytes + MemCore uint64 `json:"memory_core_bytes"` // Current used memory of the core itself in bytes + GPU []AboutResponseGPUResources `json:"gpu"` // Currently used GPU resources + Error string `json:"error"` // Last error } type SetNodeStateRequest struct { diff --git a/cluster/leader_rebalance.go b/cluster/leader_rebalance.go index c583f1ac..3ef2b8f7 100644 --- a/cluster/leader_rebalance.go +++ b/cluster/leader_rebalance.go @@ -78,7 +78,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{}, // Mark nodes as throttling where at least one process is still throttling for _, haveP := range have { - if haveP.Throttling { + if haveP.Resources.Throttling { resources.Throttling(haveP.NodeID, true) } } @@ -126,7 +126,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{}, continue } - if resources.HasNodeEnough(raNodeid, p.Config.LimitCPU, p.Config.LimitMemory) { + if resources.HasNodeEnough(raNodeid, ResourcesFromConfig(p.Config)) { availableNodeid = raNodeid break } @@ -135,7 +135,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{}, // Find the best node with enough resources available. if len(availableNodeid) == 0 { - nodes := resources.FindBestNodes(p.Config.LimitCPU, p.Config.LimitMemory) + nodes := resources.FindBestNodes(ResourcesFromConfig(p.Config)) for _, nodeid := range nodes { if nodeid == overloadedNodeid { continue @@ -169,7 +169,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{}, processes[i] = p // Adjust the resources. - resources.Move(availableNodeid, overloadedNodeid, p.CPU, p.Mem) + resources.Move(availableNodeid, overloadedNodeid, ResourcesFromProcess(p.Resources)) // Adjust the reference affinity. 
haveReferenceAffinity.Move(p.Config.Reference, p.Config.Domain, overloadedNodeid, availableNodeid) diff --git a/cluster/leader_relocate.go b/cluster/leader_relocate.go index dc5a057a..27ab847b 100644 --- a/cluster/leader_relocate.go +++ b/cluster/leader_relocate.go @@ -95,7 +95,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[ // Mark nodes as throttling where at least one process is still throttling for _, haveP := range have { - if haveP.Throttling { + if haveP.Resources.Throttling { resources.Throttling(haveP.NodeID, true) } } @@ -136,7 +136,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[ if len(targetNodeid) != 0 { _, hasNode := nodes[targetNodeid] - if !hasNode || !resources.HasNodeEnough(targetNodeid, process.Config.LimitCPU, process.Config.LimitMemory) { + if !hasNode || !resources.HasNodeEnough(targetNodeid, ResourcesFromConfig(process.Config)) { targetNodeid = "" } } @@ -152,7 +152,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[ continue } - if resources.HasNodeEnough(raNodeid, process.Config.LimitCPU, process.Config.LimitMemory) { + if resources.HasNodeEnough(raNodeid, ResourcesFromConfig(process.Config)) { targetNodeid = raNodeid break } @@ -161,7 +161,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[ // Find the best node with enough resources available. if len(targetNodeid) == 0 { - nodes := resources.FindBestNodes(process.Config.LimitCPU, process.Config.LimitMemory) + nodes := resources.FindBestNodes(ResourcesFromConfig(process.Config)) for _, nodeid := range nodes { if nodeid == sourceNodeid { continue @@ -194,7 +194,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[ opBudget -= 5 // Adjust the resources. 
- resources.Move(targetNodeid, sourceNodeid, process.CPU, process.Mem) + resources.Move(targetNodeid, sourceNodeid, ResourcesFromProcess(process.Resources)) // Adjust the reference affinity. haveReferenceAffinity.Move(process.Config.Reference, process.Config.Domain, sourceNodeid, targetNodeid) diff --git a/cluster/leader_synchronize.go b/cluster/leader_synchronize.go index b597d78e..c56e4ad8 100644 --- a/cluster/leader_synchronize.go +++ b/cluster/leader_synchronize.go @@ -143,7 +143,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce // Mark nodes as throttling where at least one process is still throttling for _, haveP := range have { - if haveP.Throttling { + if haveP.Resources.Throttling { resources.Throttling(haveP.NodeID, true) } } @@ -182,7 +182,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce processid: haveP.Config.ProcessID(), }) - resources.Remove(haveP.NodeID, haveP.CPU, haveP.Mem) + resources.Remove(haveP.NodeID, ResourcesFromProcess(haveP.Resources)) continue } @@ -219,7 +219,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce }) // Release the resources. - resources.Remove(haveP.NodeID, haveP.CPU, haveP.Mem) + resources.Remove(haveP.NodeID, ResourcesFromProcess(haveP.Resources)) } } @@ -229,7 +229,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce for _, haveP := range wantOrderStart { nodeid := haveP.NodeID - resources.Add(nodeid, haveP.Config.LimitCPU, haveP.Config.LimitMemory) + resources.Add(nodeid, ResourcesFromConfig(haveP.Config)) // TODO: check if the current node has actually enough resources available, // otherwise it needs to be moved somewhere else. If the node doesn't @@ -347,7 +347,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce // Try to add the process to a node where other processes with the same reference currently reside. 
raNodes := haveReferenceAffinity.Nodes(wantP.Config.Reference, wantP.Config.Domain) for _, raNodeid := range raNodes { - if resources.HasNodeEnough(raNodeid, wantP.Config.LimitCPU, wantP.Config.LimitMemory) { + if resources.HasNodeEnough(raNodeid, ResourcesFromConfig(wantP.Config)) { nodeid = raNodeid break } @@ -355,7 +355,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce // Find the node with the most resources available. if len(nodeid) == 0 { - nodes := resources.FindBestNodes(wantP.Config.LimitCPU, wantP.Config.LimitMemory) + nodes := resources.FindBestNodes(ResourcesFromConfig(wantP.Config)) if len(nodes) > 0 { nodeid = nodes[0] } @@ -372,7 +372,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce opBudget -= 3 // Consume the resources - resources.Add(nodeid, wantP.Config.LimitCPU, wantP.Config.LimitMemory) + resources.Add(nodeid, ResourcesFromConfig(wantP.Config)) reality[pid] = nodeid diff --git a/cluster/leader_test.go b/cluster/leader_test.go index 4f1d6bba..af17d9a6 100644 --- a/cluster/leader_test.go +++ b/cluster/leader_test.go @@ -193,11 +193,13 @@ func TestSynchronizeOrderStop(t *testing.T) { have := []node.Process{ { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, UpdatedAt: now, Config: &app.Config{ @@ -285,11 +287,13 @@ func TestSynchronizeOrderStart(t *testing.T) { have := []node.Process{ { - NodeID: "node1", - Order: "stop", - State: "finished", - CPU: 0, - Mem: 0, + NodeID: "node1", + Order: "stop", + State: "finished", + Resources: node.ProcessResources{ + CPU: 0, + Mem: 0, + }, Runtime: 42, UpdatedAt: now, Config: &app.Config{ @@ -388,11 +392,13 @@ func TestSynchronizeAddReferenceAffinity(t *testing.T) { have := []node.Process{ { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: 
"node2", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, UpdatedAt: now, Config: &app.Config{ @@ -490,11 +496,13 @@ func TestSynchronizeAddReferenceAffinityMultiple(t *testing.T) { have := []node.Process{ { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node2", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 2, + }, Runtime: 42, UpdatedAt: now, Config: &app.Config{ @@ -882,11 +890,13 @@ func TestSynchronizeRemove(t *testing.T) { have := []node.Process{ { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: "node2", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, Config: &app.Config{ ID: "foobar", @@ -967,11 +977,13 @@ func TestSynchronizeAddRemove(t *testing.T) { have := []node.Process{ { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: "node2", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, Config: &app.Config{ ID: "foobar2", @@ -1064,11 +1076,13 @@ func TestSynchronizeNoUpdate(t *testing.T) { have := []node.Process{ { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, Config: &app.Config{ ID: "foobar", @@ -1133,11 +1147,13 @@ func TestSynchronizeUpdate(t *testing.T) { have := []node.Process{ { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, Config: &app.Config{ ID: "foobar", @@ -1217,11 +1233,13 @@ func TestSynchronizeUpdateMetadata(t *testing.T) { have := []node.Process{ { - NodeID: "node1", - Order: 
"start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, Config: &app.Config{ ID: "foobar", @@ -1313,11 +1331,13 @@ func TestSynchronizeWaitDisconnectedNode(t *testing.T) { have := []node.Process{ { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, UpdatedAt: now, Config: &app.Config{ @@ -1397,11 +1417,13 @@ func TestSynchronizeWaitDisconnectedNodeNoWish(t *testing.T) { have := []node.Process{ { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, UpdatedAt: now, Config: &app.Config{ @@ -1493,11 +1515,13 @@ func TestSynchronizeWaitDisconnectedNodeUnrealisticWish(t *testing.T) { have := []node.Process{ { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, UpdatedAt: now, Config: &app.Config{ @@ -1589,11 +1613,13 @@ func TestSynchronizeTimeoutDisconnectedNode(t *testing.T) { have := []node.Process{ { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, UpdatedAt: now, Config: &app.Config{ @@ -1655,22 +1681,26 @@ func TestSynchronizeTimeoutDisconnectedNode(t *testing.T) { func TestRebalanceNothingToDo(t *testing.T) { processes := []node.Process{ { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 35, - Mem: 20, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 
35, + Mem: 20, + }, Runtime: 42, Config: &app.Config{ ID: "foobar1", }, }, { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: "node2", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, Config: &app.Config{ ID: "foobar2", @@ -1711,33 +1741,39 @@ func TestRebalanceNothingToDo(t *testing.T) { func TestRebalanceOverload(t *testing.T) { processes := []node.Process{ { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 35, - Mem: 20, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 35, + Mem: 20, + }, Runtime: 42, Config: &app.Config{ ID: "foobar1", }, }, { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 17, - Mem: 31, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 17, + Mem: 31, + }, Runtime: 27, Config: &app.Config{ ID: "foobar3", }, }, { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: "node2", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, Config: &app.Config{ ID: "foobar2", @@ -1806,33 +1842,39 @@ func TestRebalanceOverload(t *testing.T) { func TestRebalanceSkip(t *testing.T) { processes := []node.Process{ { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 35, - Mem: 20, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 35, + Mem: 20, + }, Runtime: 42, Config: &app.Config{ ID: "foobar1", }, }, { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 17, - Mem: 31, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 17, + Mem: 31, + }, Runtime: 27, Config: &app.Config{ ID: "foobar3", }, }, { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: "node2", + Order: "start", + State: 
"running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, Config: &app.Config{ ID: "foobar2", @@ -1908,22 +1950,26 @@ func TestRebalanceSkip(t *testing.T) { func TestRebalanceReferenceAffinity(t *testing.T) { processes := []node.Process{ { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 42, Config: &app.Config{ ID: "foobar1", }, }, { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 1, Config: &app.Config{ ID: "foobar2", @@ -1931,11 +1977,13 @@ func TestRebalanceReferenceAffinity(t *testing.T) { }, }, { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node2", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 42, Config: &app.Config{ ID: "foobar3", @@ -1943,11 +1991,13 @@ func TestRebalanceReferenceAffinity(t *testing.T) { }, }, { - NodeID: "node3", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node3", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 42, Config: &app.Config{ ID: "foobar4", @@ -1955,11 +2005,13 @@ func TestRebalanceReferenceAffinity(t *testing.T) { }, }, { - NodeID: "node3", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node3", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 42, Config: &app.Config{ ID: "foobar5", @@ -2048,33 +2100,39 @@ func TestRebalanceReferenceAffinity(t *testing.T) { func TestRebalanceRelocateTarget(t *testing.T) { processes := []node.Process{ { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 35, - Mem: 20, + NodeID: 
"node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 35, + Mem: 20, + }, Runtime: 42, Config: &app.Config{ ID: "foobar1", }, }, { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 17, - Mem: 31, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 17, + Mem: 31, + }, Runtime: 27, Config: &app.Config{ ID: "foobar3", }, }, { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: "node2", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, Config: &app.Config{ ID: "foobar2", @@ -2165,33 +2223,39 @@ func TestRebalanceRelocateTarget(t *testing.T) { func TestRebalanceRelocateAny(t *testing.T) { processes := []node.Process{ { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 35, - Mem: 20, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 35, + Mem: 20, + }, Runtime: 42, Config: &app.Config{ ID: "foobar1", }, }, { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 17, - Mem: 31, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 17, + Mem: 31, + }, Runtime: 27, Config: &app.Config{ ID: "foobar3", }, }, { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 12, - Mem: 5, + NodeID: "node2", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 12, + Mem: 5, + }, Runtime: 42, Config: &app.Config{ ID: "foobar2", @@ -2319,7 +2383,10 @@ func TestFindBestNodesForProcess(t *testing.T) { resources := NewResourcePlanner(nodes) - list := resources.FindBestNodes(35, 20) + list := resources.FindBestNodes(Resources{ + CPU: 35, + Mem: 20, + }) require.Equal(t, []string{"node3", "node2", "node1"}, list) } @@ -2433,7 +2500,10 @@ func TestFindBestNodesForProcess2(t *testing.T) { }, } - list := resources.FindBestNodes(4.0, 45*1024*1024) + 
list := resources.FindBestNodes(Resources{ + CPU: 4.0, + Mem: 45 * 1024 * 1024, + }) require.Equal(t, []string{"node10", "node8", "node7", "node1", "node5", "node12", "node4", "node3", "node13", "node6", "node11", "node2"}, list) } @@ -2441,11 +2511,13 @@ func TestFindBestNodesForProcess2(t *testing.T) { func TestCreateNodeProcessMap(t *testing.T) { processes := []node.Process{ { - NodeID: "node1", - Order: "start", - State: "finished", - CPU: 1, - Mem: 1, + NodeID: "node1", + Order: "start", + State: "finished", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 1, Config: &app.Config{ ID: "foobar7", @@ -2453,11 +2525,13 @@ func TestCreateNodeProcessMap(t *testing.T) { }, }, { - NodeID: "node1", - Order: "start", - State: "failed", - CPU: 1, - Mem: 1, + NodeID: "node1", + Order: "start", + State: "failed", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 1, Config: &app.Config{ ID: "foobar8", @@ -2465,22 +2539,26 @@ func TestCreateNodeProcessMap(t *testing.T) { }, }, { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 42, Config: &app.Config{ ID: "foobar1", }, }, { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 1, Config: &app.Config{ ID: "foobar2", @@ -2488,11 +2566,13 @@ func TestCreateNodeProcessMap(t *testing.T) { }, }, { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node2", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 67, Config: &app.Config{ ID: "foobar3", @@ -2500,11 +2580,13 @@ func TestCreateNodeProcessMap(t *testing.T) { }, }, { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + 
NodeID: "node2", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 42, Config: &app.Config{ ID: "foobar6", @@ -2512,11 +2594,13 @@ func TestCreateNodeProcessMap(t *testing.T) { }, }, { - NodeID: "node3", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node3", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 41, Config: &app.Config{ ID: "foobar4", @@ -2524,11 +2608,13 @@ func TestCreateNodeProcessMap(t *testing.T) { }, }, { - NodeID: "node3", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node3", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 42, Config: &app.Config{ ID: "foobar5", @@ -2542,11 +2628,13 @@ func TestCreateNodeProcessMap(t *testing.T) { require.Equal(t, map[string][]node.Process{ "node1": { { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 1, Config: &app.Config{ ID: "foobar2", @@ -2554,11 +2642,13 @@ func TestCreateNodeProcessMap(t *testing.T) { }, }, { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 42, Config: &app.Config{ ID: "foobar1", @@ -2567,11 +2657,13 @@ func TestCreateNodeProcessMap(t *testing.T) { }, "node2": { { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node2", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 42, Config: &app.Config{ ID: "foobar6", @@ -2579,11 +2671,13 @@ func TestCreateNodeProcessMap(t *testing.T) { }, }, { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: 
"node2", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 67, Config: &app.Config{ ID: "foobar3", @@ -2593,11 +2687,13 @@ func TestCreateNodeProcessMap(t *testing.T) { }, "node3": { { - NodeID: "node3", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node3", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 41, Config: &app.Config{ ID: "foobar4", @@ -2605,11 +2701,13 @@ func TestCreateNodeProcessMap(t *testing.T) { }, }, { - NodeID: "node3", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node3", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 42, Config: &app.Config{ ID: "foobar5", @@ -2623,22 +2721,26 @@ func TestCreateNodeProcessMap(t *testing.T) { func TestCreateReferenceAffinityNodeMap(t *testing.T) { processes := []node.Process{ { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 42, Config: &app.Config{ ID: "foobar1", }, }, { - NodeID: "node1", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node1", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 1, Config: &app.Config{ ID: "foobar2", @@ -2646,11 +2748,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) { }, }, { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node2", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 42, Config: &app.Config{ ID: "foobar3", @@ -2658,11 +2762,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) { }, }, { - NodeID: "node2", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node2", + Order: "start", + 
State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 42, Config: &app.Config{ ID: "foobar3", @@ -2670,11 +2776,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) { }, }, { - NodeID: "node3", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node3", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 42, Config: &app.Config{ ID: "foobar4", @@ -2682,11 +2790,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) { }, }, { - NodeID: "node3", - Order: "start", - State: "running", - CPU: 1, - Mem: 1, + NodeID: "node3", + Order: "start", + State: "running", + Resources: node.ProcessResources{ + CPU: 1, + Mem: 1, + }, Runtime: 42, Config: &app.Config{ ID: "foobar5", diff --git a/cluster/node/core.go b/cluster/node/core.go index 5341db06..9dc87e87 100644 --- a/cluster/node/core.go +++ b/cluster/node/core.go @@ -747,16 +747,62 @@ func (n *Core) MediaGetInfo(prefix, path string) (int64, time.Time, error) { } type Process struct { - NodeID string - Order string - State string + NodeID string + Order string + State string + Resources ProcessResources + Runtime time.Duration + UpdatedAt time.Time + Config *app.Config + Metadata map[string]interface{} +} + +type ProcessResources struct { CPU float64 // Current CPU load of this process, 0-100*ncpu Mem uint64 // Currently consumed memory of this process in bytes + GPU ProcessGPUResources Throttling bool - Runtime time.Duration - UpdatedAt time.Time - Config *app.Config - Metadata map[string]interface{} +} + +type ProcessGPUResources struct { + Index int // GPU number + Usage float64 // Current GPU load, 0-100 + Encoder float64 // Current GPU encoder load, 0-100 + Decoder float64 // Current GPU decoder load, 0-100 + Mem uint64 // Currently consumed GPU memory of this process in bytes +} + +func (p *ProcessResources) Marshal(a *api.ProcessUsage) { + p.Throttling = a.CPU.IsThrottling + + if x, err := 
a.CPU.Current.Float64(); err == nil { + p.CPU = x + } else { + p.CPU = 0 + } + + p.Mem = a.Memory.Current + + if x, err := a.GPU.Usage.Current.Float64(); err == nil { + p.GPU.Usage = x + } else { + p.GPU.Usage = 0 + } + + if x, err := a.GPU.Encoder.Current.Float64(); err == nil { + p.GPU.Encoder = x + } else { + p.GPU.Encoder = 0 + } + + if x, err := a.GPU.Decoder.Current.Float64(); err == nil { + p.GPU.Decoder = x + } else { + p.GPU.Decoder = 0 + } + + p.GPU.Mem = a.GPU.Memory.Current + p.GPU.Index = a.GPU.Index } func (n *Core) ClusterProcessList() ([]Process, error) { @@ -780,21 +826,15 @@ func (n *Core) ClusterProcessList() ([]Process, error) { p.Config = &api.ProcessConfig{} } - cpu, err := p.State.Resources.CPU.Current.Float64() - if err != nil { - cpu = 0 + process := Process{ + NodeID: nodeid, + Order: p.State.Order, + State: p.State.State, + Runtime: time.Duration(p.State.Runtime) * time.Second, + UpdatedAt: time.Unix(p.UpdatedAt, 0), } - process := Process{ - NodeID: nodeid, - Order: p.State.Order, - State: p.State.State, - Mem: p.State.Resources.Memory.Current, - CPU: cpu, - Throttling: p.State.Resources.CPU.IsThrottling, - Runtime: time.Duration(p.State.Runtime) * time.Second, - UpdatedAt: time.Unix(p.UpdatedAt, 0), - } + process.Resources.Marshal(&p.State.Resources) config, _ := p.Config.Marshal() diff --git a/cluster/node/node.go b/cluster/node/node.go index 078da13a..c1daf191 100644 --- a/cluster/node/node.go +++ b/cluster/node/node.go @@ -138,17 +138,28 @@ type About struct { Resources Resources } +type ResourcesGPU struct { + Mem uint64 // Currently used memory in bytes + MemLimit uint64 // Defined memory limit in bytes + MemTotal uint64 // Total available memory in bytes + Usage float64 // Current general usage, 0-100 + UsageLimit float64 // Defined general usage limit, 0-100 + Encoder float64 // Current encoder usage, 0-100 + Decoder float64 // Current decoder usage, 0-100 +} + type Resources struct { - IsThrottling bool // Whether this core is 
currently throttling - NCPU float64 // Number of CPU on this node - CPU float64 // Current CPU load, 0-100*ncpu - CPULimit float64 // Defined CPU load limit, 0-100*ncpu - CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu - Mem uint64 // Currently used memory in bytes - MemLimit uint64 // Defined memory limit in bytes - MemTotal uint64 // Total available memory in bytes - MemCore uint64 // Current used memory of the core itself in bytes - Error error // Last error + IsThrottling bool // Whether this core is currently throttling + NCPU float64 // Number of CPU on this node + CPU float64 // Current CPU load, 0-100*ncpu + CPULimit float64 // Defined CPU load limit, 0-100*ncpu + CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu + Mem uint64 // Currently used memory in bytes + MemLimit uint64 // Defined memory limit in bytes + MemTotal uint64 // Total available memory in bytes + MemCore uint64 // Current used memory of the core itself in bytes + GPU []ResourcesGPU // Currently used GPU resources + Error error // Last error } func (n *Node) About() About { @@ -514,6 +525,20 @@ func (n *Node) ping(ctx context.Context, interval time.Duration) { Error: nil, }, } + + if len(about.Resources.GPU) != 0 { + n.nodeAbout.Resources.GPU = make([]ResourcesGPU, len(about.Resources.GPU)) + for i, gpu := range about.Resources.GPU { + n.nodeAbout.Resources.GPU[i].Mem = gpu.Mem + n.nodeAbout.Resources.GPU[i].MemLimit = gpu.MemLimit + n.nodeAbout.Resources.GPU[i].MemTotal = gpu.MemTotal + n.nodeAbout.Resources.GPU[i].Usage = gpu.Usage + n.nodeAbout.Resources.GPU[i].UsageLimit = gpu.UsageLimit + n.nodeAbout.Resources.GPU[i].Encoder = gpu.Encoder + n.nodeAbout.Resources.GPU[i].Decoder = gpu.Decoder + } + } + if len(about.Resources.Error) != 0 { n.nodeAbout.Resources.Error = errors.New(about.Resources.Error) } diff --git a/cluster/resources.go b/cluster/resources.go index 2b5bb2c9..cc81b828 100644 --- a/cluster/resources.go +++ b/cluster/resources.go @@ -4,8 
+4,69 @@ import ( "sort" "github.com/datarhei/core/v16/cluster/node" + "github.com/datarhei/core/v16/restream/app" ) +type Resources struct { + CPU float64 // CPU 0-100*ncpu + Mem uint64 // Memory in bytes + GPU ResourcesGPU // GPU resources +} + +type ResourcesGPU struct { + Index int // GPU number + Usage float64 // GPU general, 0-100 + Encoder float64 // GPU encoder, 0-100 + Decoder float64 // GPU decoder, 0-100 + Mem uint64 // GPU memory in bytes +} + +func ResourcesFromConfig(c *app.Config) Resources { + r := Resources{} + r.MarshalConfig(c) + return r +} + +func ResourcesFromProcess(c node.ProcessResources) Resources { + r := Resources{} + r.MarshalProcess(c) + return r +} + +func (r *Resources) MarshalConfig(c *app.Config) { + r.CPU = c.LimitCPU + r.Mem = c.LimitMemory + r.GPU.Usage = c.LimitGPU.Usage + r.GPU.Encoder = c.LimitGPU.Encoder + r.GPU.Decoder = c.LimitGPU.Decoder + r.GPU.Index = -1 +} + +func (r *Resources) MarshalProcess(c node.ProcessResources) { + r.CPU = c.CPU + r.Mem = c.Mem + r.GPU.Usage = c.GPU.Usage + r.GPU.Encoder = c.GPU.Encoder + r.GPU.Decoder = c.GPU.Decoder + r.GPU.Index = c.GPU.Index +} + +func (r *Resources) HasGPU() bool { + if r.GPU.Usage > 0 || r.GPU.Encoder > 0 || r.GPU.Decoder > 0 || r.GPU.Mem > 0 { + return true + } + + return false +} + +func (r *Resources) DoesFitGPU(g node.ResourcesGPU) bool { + if g.Usage+r.GPU.Usage < g.UsageLimit && g.Encoder+r.GPU.Encoder < g.UsageLimit && g.Decoder+r.GPU.Decoder < g.UsageLimit && g.Mem+r.GPU.Mem < g.MemLimit { + return true + } + + return false +} + type resourcePlanner struct { nodes map[string]node.Resources blocked map[string]struct{} @@ -39,8 +100,8 @@ func (r *resourcePlanner) Throttling(nodeid string, throttling bool) { } // HasNodeEnough returns whether a node has enough resources available for the -// requested cpu and memory consumption. -func (r *resourcePlanner) HasNodeEnough(nodeid string, cpu float64, mem uint64) bool { +// requested cpu, memory, and gpu consumption. 
+func (r *resourcePlanner) HasNodeEnough(nodeid string, req Resources) bool { res, hasNode := r.nodes[nodeid] if !hasNode { return false @@ -50,20 +111,39 @@ func (r *resourcePlanner) HasNodeEnough(nodeid string, cpu float64, mem uint64) return false } - if res.Error == nil && res.CPU+cpu < res.CPULimit && res.Mem+mem < res.MemLimit && !res.IsThrottling { - return true + if res.Error != nil || res.IsThrottling { + return false } - return false + if res.CPU+req.CPU >= res.CPULimit || res.Mem+req.Mem >= res.MemLimit { + return false + } + + if req.HasGPU() { + found := false + + for _, g := range res.GPU { + if req.DoesFitGPU(g) { + found = true + break + } + } + + if !found { + return false + } + } + + return true } -// FindBestNodes returns an array of nodeids that can fit the requested cpu and memory requirements. If no +// FindBestNodes returns an array of nodeids that can fit the requested cpu, memory, and gpu requirements. If no // such node is available, an empty array is returned. The array is sorted by the most suitable node first. -func (r *resourcePlanner) FindBestNodes(cpu float64, mem uint64) []string { +func (r *resourcePlanner) FindBestNodes(req Resources) []string { nodes := []string{} for id := range r.nodes { - if r.HasNodeEnough(id, cpu, mem) { + if r.HasNodeEnough(id, req) { nodes = append(nodes, id) } } @@ -81,43 +161,72 @@ func (r *resourcePlanner) FindBestNodes(cpu float64, mem uint64) []string { return nodes } -// Add adds the resources of the node according to the cpu and memory utilization. -func (r *resourcePlanner) Add(nodeid string, cpu float64, mem uint64) { +// Add adds the resources of the node according to the cpu, memory, and gpu utilization. 
+func (r *resourcePlanner) Add(nodeid string, req Resources) { res, hasRes := r.nodes[nodeid] if !hasRes { return } - res.CPU += cpu - res.Mem += mem + res.CPU += req.CPU + res.Mem += req.Mem + + if req.HasGPU() { + for i, g := range res.GPU { + if req.DoesFitGPU(g) { + g.Usage += req.GPU.Usage + g.Encoder += req.GPU.Encoder + g.Decoder += req.GPU.Decoder + g.Mem += req.GPU.Mem + res.GPU[i] = g + break + } + } + } + r.nodes[nodeid] = res } -// Remove subtracts the resources from the node according to the cpu and memory utilization. -func (r *resourcePlanner) Remove(nodeid string, cpu float64, mem uint64) { +// Remove subtracts the resources from the node according to the cpu, memory, and gpu utilization. +func (r *resourcePlanner) Remove(nodeid string, req Resources) { res, hasRes := r.nodes[nodeid] if !hasRes { return } - res.CPU -= cpu - if res.CPU < 0 { - res.CPU = 0 - } - if mem >= res.Mem { - res.Mem = 0 - } else { - res.Mem -= mem + res.CPU -= min(res.CPU, req.CPU) + res.Mem -= min(res.Mem, req.Mem) + + if req.HasGPU() { + if req.GPU.Index >= 0 && req.GPU.Index < len(res.GPU) { + gpu := res.GPU[req.GPU.Index] + gpu.Usage -= min(gpu.Usage, req.GPU.Usage) + gpu.Encoder -= min(gpu.Encoder, req.GPU.Encoder) + gpu.Decoder -= min(gpu.Decoder, req.GPU.Decoder) + gpu.Mem -= min(gpu.Mem, req.GPU.Mem) + res.GPU[req.GPU.Index] = gpu + } } + r.nodes[nodeid] = res } // Move adjusts the resources from the target and source node according to the cpu and memory utilization. 
-func (r *resourcePlanner) Move(target, source string, cpu float64, mem uint64) { - r.Add(target, cpu, mem) - r.Remove(source, cpu, mem) +func (r *resourcePlanner) Move(target, source string, req Resources) { + r.Add(target, req) + r.Remove(source, req) } func (r *resourcePlanner) Map() map[string]node.Resources { return r.nodes } + +func (r *resourcePlanner) Blocked() []string { + nodes := []string{} + + for nodeid := range r.blocked { + nodes = append(nodes, nodeid) + } + + return nodes +} diff --git a/cluster/resources_test.go b/cluster/resources_test.go new file mode 100644 index 00000000..2f938a31 --- /dev/null +++ b/cluster/resources_test.go @@ -0,0 +1,603 @@ +package cluster + +import ( + "testing" + + "github.com/datarhei/core/v16/cluster/node" + "github.com/stretchr/testify/require" +) + +func TestResources(t *testing.T) { + r := Resources{ + CPU: 1, + Mem: 1, + } + + require.False(t, r.HasGPU()) + + r.GPU = ResourcesGPU{ + Index: 0, + Usage: 1, + Encoder: 0, + Decoder: 0, + Mem: 1, + } + + require.True(t, r.HasGPU()) +} + +func TestResourcePlanner(t *testing.T) { + nodes := map[string]node.About{ + "node1": { + State: "online", + Resources: node.Resources{ + NCPU: 1, + CPU: 7, + Mem: 35, + CPULimit: 90, + MemLimit: 90, + }, + }, + "node2": { + State: "online", + Resources: node.Resources{ + NCPU: 1, + CPU: 85, + Mem: 11, + CPULimit: 90, + MemLimit: 90, + }, + }, + } + + planner := NewResourcePlanner(nodes) + + require.Equal(t, map[string]node.Resources{ + "node1": { + NCPU: 1, + CPU: 7, + Mem: 35, + CPULimit: 90, + MemLimit: 90, + }, + "node2": { + NCPU: 1, + CPU: 85, + Mem: 11, + CPULimit: 90, + MemLimit: 90, + }, + }, planner.Map()) +} + +func TestResourcePlannerBlocked(t *testing.T) { + nodes := map[string]node.About{ + "node1": { + State: "degraded", + Resources: node.Resources{ + NCPU: 1, + CPU: 7, + Mem: 35, + CPULimit: 90, + MemLimit: 90, + }, + }, + "node2": { + State: "online", + Resources: node.Resources{ + NCPU: 1, + CPU: 85, + Mem: 11, + 
CPULimit: 90, + MemLimit: 90, + }, + }, + } + + planner := NewResourcePlanner(nodes) + + require.Equal(t, []string{"node1"}, planner.Blocked()) +} + +func TestResourcePlannerThrottling(t *testing.T) { + nodes := map[string]node.About{ + "node1": { + State: "online", + Resources: node.Resources{ + NCPU: 1, + CPU: 7, + Mem: 35, + CPULimit: 90, + MemLimit: 90, + }, + }, + "node2": { + State: "online", + Resources: node.Resources{ + NCPU: 1, + CPU: 85, + Mem: 11, + CPULimit: 90, + MemLimit: 90, + }, + }, + } + + planner := NewResourcePlanner(nodes) + + require.True(t, planner.HasNodeEnough("node1", Resources{ + CPU: 30, + Mem: 5, + })) + + planner.Throttling("node1", true) + + require.False(t, planner.HasNodeEnough("node1", Resources{ + CPU: 30, + Mem: 5, + })) + + planner.Throttling("node1", false) + + require.True(t, planner.HasNodeEnough("node1", Resources{ + CPU: 30, + Mem: 5, + })) +} + +func TestResourcePlannerHasNodeEnough(t *testing.T) { + nodes := map[string]node.About{ + "node1": { + State: "online", + Resources: node.Resources{ + NCPU: 1, + CPU: 7, + Mem: 35, + CPULimit: 90, + MemLimit: 90, + GPU: []node.ResourcesGPU{ + { + Mem: 5, + MemLimit: 90, + Usage: 53, + UsageLimit: 90, + Encoder: 32, + Decoder: 26, + }, + { + Mem: 85, + MemLimit: 90, + Usage: 64, + UsageLimit: 90, + Encoder: 43, + Decoder: 12, + }, + }, + }, + }, + "node2": { + State: "online", + Resources: node.Resources{ + NCPU: 1, + CPU: 85, + Mem: 11, + CPULimit: 90, + MemLimit: 90, + GPU: []node.ResourcesGPU{ + { + Mem: 5, + MemLimit: 90, + Usage: 53, + UsageLimit: 90, + Encoder: 32, + Decoder: 26, + }, + }, + }, + }, + } + + planner := NewResourcePlanner(nodes) + + require.True(t, planner.HasNodeEnough("node1", Resources{ + CPU: 30, + Mem: 5, + })) + + require.False(t, planner.HasNodeEnough("node2", Resources{ + CPU: 30, + Mem: 5, + })) + + require.True(t, planner.HasNodeEnough("node1", Resources{ + CPU: 30, + Mem: 5, + GPU: ResourcesGPU{ + Usage: 0, + Encoder: 0, + Decoder: 0, + Mem: 50, + }, 
+ })) + + require.False(t, planner.HasNodeEnough("node1", Resources{ + CPU: 30, + Mem: 5, + GPU: ResourcesGPU{ + Usage: 0, + Encoder: 0, + Decoder: 0, + Mem: 86, + }, + })) + + require.True(t, planner.HasNodeEnough("node1", Resources{ + CPU: 30, + Mem: 5, + GPU: ResourcesGPU{ + Usage: 0, + Encoder: 50, + Decoder: 0, + Mem: 50, + }, + })) +} + +func TestResourcePlannerAdd(t *testing.T) { + nodes := map[string]node.About{ + "node1": { + State: "online", + Resources: node.Resources{ + NCPU: 1, + CPU: 7, + Mem: 35, + CPULimit: 90, + MemLimit: 90, + }, + }, + } + + planner := NewResourcePlanner(nodes) + + planner.Add("node1", Resources{ + CPU: 42, + Mem: 33, + }) + + require.Equal(t, map[string]node.Resources{ + "node1": { + NCPU: 1, + CPU: 49, + Mem: 68, + CPULimit: 90, + MemLimit: 90, + }, + }, planner.Map()) +} + +func TestResourcePlannerNoGPUAddGPU(t *testing.T) { + nodes := map[string]node.About{ + "node1": { + State: "online", + Resources: node.Resources{ + NCPU: 1, + CPU: 7, + Mem: 35, + CPULimit: 90, + MemLimit: 90, + }, + }, + } + + planner := NewResourcePlanner(nodes) + + planner.Add("node1", Resources{ + CPU: 42, + Mem: 33, + GPU: ResourcesGPU{ + Index: 0, + Usage: 1, + Encoder: 2, + Decoder: 3, + Mem: 4, + }, + }) + + require.Equal(t, map[string]node.Resources{ + "node1": { + NCPU: 1, + CPU: 49, + Mem: 68, + CPULimit: 90, + MemLimit: 90, + }, + }, planner.Map()) +} + +func TestResourcePlannerAddGPU(t *testing.T) { + nodes := map[string]node.About{ + "node1": { + State: "online", + Resources: node.Resources{ + NCPU: 1, + CPU: 7, + Mem: 35, + CPULimit: 90, + MemLimit: 90, + GPU: []node.ResourcesGPU{ + { + Mem: 0, + MemLimit: 0, + Usage: 0, + UsageLimit: 0, + Encoder: 0, + Decoder: 0, + }, + { + Mem: 0, + MemLimit: 100, + Usage: 0, + UsageLimit: 100, + Encoder: 0, + Decoder: 0, + }, + }, + }, + }, + } + + planner := NewResourcePlanner(nodes) + + planner.Add("node1", Resources{ + CPU: 42, + Mem: 33, + GPU: ResourcesGPU{ + Usage: 1, + Encoder: 2, + Decoder: 3, + 
Mem: 4, + }, + }) + + require.Equal(t, map[string]node.Resources{ + "node1": { + NCPU: 1, + CPU: 49, + Mem: 68, + CPULimit: 90, + MemLimit: 90, + GPU: []node.ResourcesGPU{ + { + Mem: 0, + MemLimit: 0, + Usage: 0, + UsageLimit: 0, + Encoder: 0, + Decoder: 0, + }, + { + Mem: 4, + MemLimit: 100, + Usage: 1, + UsageLimit: 100, + Encoder: 2, + Decoder: 3, + }, + }, + }, + }, planner.Map()) +} + +func TestResourcePlannerRemove(t *testing.T) { + nodes := map[string]node.About{ + "node1": { + State: "online", + Resources: node.Resources{ + NCPU: 1, + CPU: 53, + Mem: 35, + CPULimit: 90, + MemLimit: 90, + }, + }, + } + + planner := NewResourcePlanner(nodes) + + planner.Remove("node1", Resources{ + CPU: 13, + Mem: 20, + }) + + require.Equal(t, map[string]node.Resources{ + "node1": { + NCPU: 1, + CPU: 40, + Mem: 15, + CPULimit: 90, + MemLimit: 90, + }, + }, planner.Map()) +} + +func TestResourcePlannerRemoveTooMuch(t *testing.T) { + nodes := map[string]node.About{ + "node1": { + State: "online", + Resources: node.Resources{ + NCPU: 1, + CPU: 53, + Mem: 35, + CPULimit: 90, + MemLimit: 90, + }, + }, + } + + planner := NewResourcePlanner(nodes) + + planner.Remove("node1", Resources{ + CPU: 100, + Mem: 100, + }) + + require.Equal(t, map[string]node.Resources{ + "node1": { + NCPU: 1, + CPU: 0, + Mem: 0, + CPULimit: 90, + MemLimit: 90, + }, + }, planner.Map()) +} + +func TestResourcePlannerRemoveGPU(t *testing.T) { + nodes := map[string]node.About{ + "node1": { + State: "online", + Resources: node.Resources{ + NCPU: 1, + CPU: 53, + Mem: 35, + CPULimit: 90, + MemLimit: 90, + GPU: []node.ResourcesGPU{ + { + Mem: 4, + MemLimit: 100, + Usage: 1, + UsageLimit: 100, + Encoder: 2, + Decoder: 3, + }, + { + Mem: 23, + MemLimit: 100, + Usage: 43, + UsageLimit: 100, + Encoder: 95, + Decoder: 12, + }, + }, + }, + }, + } + + planner := NewResourcePlanner(nodes) + + planner.Remove("node1", Resources{ + CPU: 13, + Mem: 20, + GPU: ResourcesGPU{ + Index: 1, + Usage: 3, + Encoder: 40, + Decoder: 0, + 
Mem: 5, + }, + }) + + require.Equal(t, map[string]node.Resources{ + "node1": { + NCPU: 1, + CPU: 40, + Mem: 15, + CPULimit: 90, + MemLimit: 90, + GPU: []node.ResourcesGPU{ + { + Mem: 4, + MemLimit: 100, + Usage: 1, + UsageLimit: 100, + Encoder: 2, + Decoder: 3, + }, + { + Mem: 18, + MemLimit: 100, + Usage: 40, + UsageLimit: 100, + Encoder: 55, + Decoder: 12, + }, + }, + }, + }, planner.Map()) +} + +func TestResourcePlannerRemoveGPUTooMuch(t *testing.T) { + nodes := map[string]node.About{ + "node1": { + State: "online", + Resources: node.Resources{ + NCPU: 1, + CPU: 53, + Mem: 35, + CPULimit: 90, + MemLimit: 90, + GPU: []node.ResourcesGPU{ + { + Mem: 4, + MemLimit: 100, + Usage: 1, + UsageLimit: 100, + Encoder: 2, + Decoder: 3, + }, + { + Mem: 23, + MemLimit: 100, + Usage: 43, + UsageLimit: 100, + Encoder: 95, + Decoder: 12, + }, + }, + }, + }, + } + + planner := NewResourcePlanner(nodes) + + planner.Remove("node1", Resources{ + CPU: 13, + Mem: 20, + GPU: ResourcesGPU{ + Index: 1, + Usage: 100, + Encoder: 100, + Decoder: 100, + Mem: 100, + }, + }) + + require.Equal(t, map[string]node.Resources{ + "node1": { + NCPU: 1, + CPU: 40, + Mem: 15, + CPULimit: 90, + MemLimit: 90, + GPU: []node.ResourcesGPU{ + { + Mem: 4, + MemLimit: 100, + Usage: 1, + UsageLimit: 100, + Encoder: 2, + Decoder: 3, + }, + { + Mem: 0, + MemLimit: 100, + Usage: 0, + UsageLimit: 100, + Encoder: 0, + Decoder: 0, + }, + }, + }, + }, planner.Map()) +} diff --git a/config/config.go b/config/config.go index e0364c03..a878065b 100644 --- a/config/config.go +++ b/config/config.go @@ -306,8 +306,10 @@ func (d *Config) init() { d.vars.Register(value.NewDir(&d.Router.UIPath, "", d.fs), "router.ui_path", "CORE_ROUTER_UI_PATH", nil, "Path to a directory holding UI files mounted as /ui", false, false) // Resources - d.vars.Register(value.NewFloat(&d.Resources.MaxCPUUsage, 0), "resources.max_cpu_usage", "CORE_RESOURCES_MAX_CPU_USAGE", nil, "Maximum system CPU usage in percent, from 0 (no limit) to 100", false, 
false) - d.vars.Register(value.NewFloat(&d.Resources.MaxMemoryUsage, 0), "resources.max_memory_usage", "CORE_RESOURCES_MAX_MEMORY_USAGE", nil, "Maximum system usage in percent, from 0 (no limit) to 100", false, false) + d.vars.Register(value.NewFloatRange(&d.Resources.MaxCPUUsage, 0, 0, 100), "resources.max_cpu_usage", "CORE_RESOURCES_MAX_CPU_USAGE", nil, "Maximum system CPU usage in percent, from 0 (no limit) to 100", false, false) + d.vars.Register(value.NewFloatRange(&d.Resources.MaxMemoryUsage, 0, 0, 100), "resources.max_memory_usage", "CORE_RESOURCES_MAX_MEMORY_USAGE", nil, "Maximum system usage in percent, from 0 (no limit) to 100", false, false) + d.vars.Register(value.NewFloatRange(&d.Resources.MaxGPUUsage, 0, 0, 100), "resources.max_gpu_usage", "CORE_RESOURCES_MAX_GPU_USAGE", nil, "Maximum general, encoder, and decoder GPU usage in percent per GPU, from 0 (no limit) to 100", false, false) + d.vars.Register(value.NewFloatRange(&d.Resources.MaxGPUMemoryUsage, 0, 0, 100), "resources.max_gpu_memory_usage", "CORE_RESOURCES_MAX_GPU_MEMORY_USAGE", nil, "Maximum GPU memory usage in percent per GPU, from 0 (no limit) to 100", false, false) // Cluster d.vars.Register(value.NewBool(&d.Cluster.Enable, false), "cluster.enable", "CORE_CLUSTER_ENABLE", nil, "Enable cluster mode", false, false) @@ -494,17 +496,6 @@ func (d *Config) Validate(resetLogs bool) { } } - // If resource limits are given, all values must be set - if d.Resources.MaxCPUUsage > 0 || d.Resources.MaxMemoryUsage > 0 { - if d.Resources.MaxCPUUsage <= 0 || d.Resources.MaxCPUUsage > 100 { - d.vars.Log("error", "resources.max_cpu_usage", "must be greater than 0 and smaller or equal to 100") - } - - if d.Resources.MaxMemoryUsage <= 0 { - d.vars.Log("error", "resources.max_memory_usage", "must be greater than 0 and smaller or equal to 100") - } - } - // If cluster mode is enabled, a proper address must be provided if d.Cluster.Enable { if len(d.Cluster.Address) == 0 { diff --git a/config/data.go 
b/config/data.go index 26c77054..f7f057cf 100644 --- a/config/data.go +++ b/config/data.go @@ -184,8 +184,10 @@ type Data struct { UIPath string `json:"ui_path"` } `json:"router"` Resources struct { - MaxCPUUsage float64 `json:"max_cpu_usage"` // percent 0-100 - MaxMemoryUsage float64 `json:"max_memory_usage"` // percent 0-100 + MaxCPUUsage float64 `json:"max_cpu_usage"` // percent 0-100 + MaxMemoryUsage float64 `json:"max_memory_usage"` // percent 0-100 + MaxGPUUsage float64 `json:"max_gpu_usage"` // percent 0-100 + MaxGPUMemoryUsage float64 `json:"max_gpu_memory_usage"` // percent 0-100 } `json:"resources"` Cluster struct { Enable bool `json:"enable"` diff --git a/config/value/primitives.go b/config/value/primitives.go index 4d1258fd..4c1ae54a 100644 --- a/config/value/primitives.go +++ b/config/value/primitives.go @@ -1,6 +1,7 @@ package value import ( + "fmt" "sort" "strconv" "strings" @@ -310,3 +311,56 @@ func (u *Float64) Validate() error { func (u *Float64) IsEmpty() bool { return float64(*u) == 0 } + +// float64 range + +type Float64Range struct { + p *float64 + from float64 + to float64 +} + +func NewFloatRange(p *float64, val, from, to float64) *Float64Range { + v := &Float64Range{ + p: p, + from: from, + to: to, + } + + *p = val + + return v +} + +func (s *Float64Range) Set(val string) error { + v, err := strconv.ParseFloat(val, 64) + if err != nil { + return err + } + + *s.p = v + + return nil +} + +func (s *Float64Range) String() string { + if s.IsEmpty() { + return "(empty)" + } + + return fmt.Sprintf("%.3f", *s.p) +} + +func (s *Float64Range) Validate() error { + val := *s.p + + if val < s.from || val > s.to { + return fmt.Errorf("value %f is not in range [%f, %f]", val, s.from, s.to) + } + + return nil +} + +func (s *Float64Range) IsEmpty() bool { + return *s.p == 0 +} diff --git a/config/value/primitives_test.go b/config/value/primitives_test.go index 4406d8b0..2ee865ff 100644 --- a/config/value/primitives_test.go +++ 
b/config/value/primitives_test.go @@ -165,3 +165,29 @@ func TestFloat64Value(t *testing.T) { require.Equal(t, float64(77.7), x) } + +func TestFloat64RangeValue(t *testing.T) { + var x float64 + + val := NewFloatRange(&x, 11.1, 0, 100) + + require.Equal(t, "11.100", val.String()) + require.NoError(t, val.Validate()) + require.Equal(t, false, val.IsEmpty()) + + x = 42.5 + + require.Equal(t, "42.500", val.String()) + require.NoError(t, val.Validate()) + require.Equal(t, false, val.IsEmpty()) + + val.Set("77.7") + + require.Equal(t, float64(77.7), x) + + val.Set("101.9") + + require.Equal(t, "101.900", val.String()) + require.Error(t, val.Validate()) + require.Equal(t, false, val.IsEmpty()) +} diff --git a/ffmpeg/ffmpeg.go b/ffmpeg/ffmpeg.go index 3b1e9710..1b3c96af 100644 --- a/ffmpeg/ffmpeg.go +++ b/ffmpeg/ffmpeg.go @@ -29,23 +29,26 @@ type FFmpeg interface { } type ProcessConfig struct { - Reconnect bool // Whether to reconnect - ReconnectDelay time.Duration // Duration until next reconnect - StaleTimeout time.Duration // Duration to wait until killing the process if there is no progress in the process - Timeout time.Duration // Duration to wait until killing the process - LimitCPU float64 // Kill the process if the CPU usage in percent is above this value. - LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value. - LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration. - LimitMode string // How to limit the process, "hard" or "soft" - Scheduler string // A scheduler for starting the process, either a concrete date (RFC3339) or in crontab syntax - Args []string // Arguments for the process - Parser process.Parser // Parser for the process output - Logger log.Logger // Logger - OnArgs func([]string) []string // Callback before starting the process to retrieve new arguments - OnBeforeStart func() error // Callback which is called before the process will be started. 
If error is non-nil, the start will be refused. - OnStart func() // Callback called after process has been started - OnExit func(state string) // Callback called after the process stopped with exit state as argument - OnStateChange func(from, to string) // Callback called on state change + Reconnect bool // Whether to reconnect + ReconnectDelay time.Duration // Duration until next reconnect + StaleTimeout time.Duration // Duration to wait until killing the process if there is no progress in the process + Timeout time.Duration // Duration to wait until killing the process + LimitCPU float64 // Kill the process if the CPU usage in percent is above this value. + LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value. + LimitGPUUsage float64 // Kill the process if the GPU usage (general) in percent is above this value. + LimitGPUEncoder float64 // Kill the process if the GPU usage (encoder) in percent is above this value. + LimitGPUDecoder float64 // Kill the process if the GPU usage (decoder) in percent is above this value. + LimitGPUMemory uint64 // Kill the process if the GPU memory consumption in bytes is above this value. + LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration. + LimitMode string // How to limit the process, "hard" or "soft" + Scheduler string // A scheduler for starting the process, either a concrete date (RFC3339) or in crontab syntax + Args []string // Arguments for the process + Parser process.Parser // Parser for the process output + Logger log.Logger // Logger + OnBeforeStart func([]string) ([]string, error) // Callback which is called before the process will be started. The string slice is the list of arguments which can be modified. If error is non-nil, the start will be refused. 
+ OnStart func() // Callback called after process has been started + OnExit func(state string) // Callback called after the process stopped with exit state as argument + OnStateChange func(from, to string) // Callback called on state change } // Config is the configuration for ffmpeg that is part of the configuration @@ -138,23 +141,26 @@ func (f *ffmpeg) New(config ProcessConfig) (process.Process, error) { } ffmpeg, err := process.New(process.Config{ - Binary: f.binary, - Args: config.Args, - Reconnect: config.Reconnect, - ReconnectDelay: config.ReconnectDelay, - StaleTimeout: config.StaleTimeout, - Timeout: config.Timeout, - LimitCPU: config.LimitCPU, - LimitMemory: config.LimitMemory, - LimitDuration: config.LimitDuration, - LimitMode: limitMode, - Scheduler: scheduler, - Parser: config.Parser, - Logger: config.Logger, - OnArgs: config.OnArgs, - OnBeforeStart: config.OnBeforeStart, - OnStart: config.OnStart, - OnExit: config.OnExit, + Binary: f.binary, + Args: config.Args, + Reconnect: config.Reconnect, + ReconnectDelay: config.ReconnectDelay, + StaleTimeout: config.StaleTimeout, + Timeout: config.Timeout, + LimitCPU: config.LimitCPU, + LimitMemory: config.LimitMemory, + LimitGPUUsage: config.LimitGPUUsage, + LimitGPUEncoder: config.LimitGPUEncoder, + LimitGPUDecoder: config.LimitGPUDecoder, + LimitGPUMemory: config.LimitGPUMemory, + LimitDuration: config.LimitDuration, + LimitMode: limitMode, + Scheduler: scheduler, + Parser: config.Parser, + Logger: config.Logger, + OnBeforeStart: config.OnBeforeStart, + OnStart: config.OnStart, + OnExit: config.OnExit, OnStateChange: func(from, to string) { f.statesLock.Lock() switch to { diff --git a/ffmpeg/parse/parser.go b/ffmpeg/parse/parser.go index b4912af1..2259159c 100644 --- a/ffmpeg/parse/parser.go +++ b/ffmpeg/parse/parser.go @@ -619,7 +619,7 @@ func (p *parser) Stop(state string, pusage process.Usage) { usage.CPU.Max = pusage.CPU.Max usage.CPU.Limit = pusage.CPU.Limit - usage.Memory.Average = pusage.Memory.Average 
+ usage.Memory.Average = uint64(pusage.Memory.Average) usage.Memory.Max = pusage.Memory.Max usage.Memory.Limit = pusage.Memory.Limit diff --git a/ffmpeg/parse/types.go b/ffmpeg/parse/types.go index a3eb31fc..1c98f6e8 100644 --- a/ffmpeg/parse/types.go +++ b/ffmpeg/parse/types.go @@ -576,6 +576,7 @@ type AVstream struct { type Usage struct { CPU UsageCPU Memory UsageMemory + GPU UsageGPU } type UsageCPU struct { @@ -586,7 +587,27 @@ type UsageCPU struct { } type UsageMemory struct { + Average uint64 + Max uint64 + Limit uint64 +} + +type UsageGPU struct { + Index int + Usage UsageGPUUsage + Encoder UsageGPUUsage + Decoder UsageGPUUsage + Memory UsageGPUMemory +} + +type UsageGPUUsage struct { Average float64 + Max float64 + Limit float64 +} + +type UsageGPUMemory struct { + Average uint64 Max uint64 Limit uint64 } diff --git a/http/api/process.go b/http/api/process.go index baf87707..43a9fce9 100644 --- a/http/api/process.go +++ b/http/api/process.go @@ -155,9 +155,13 @@ type ProcessConfigIOCleanup struct { } type ProcessConfigLimits struct { - CPU float64 `json:"cpu_usage" jsonschema:"minimum=0"` - Memory uint64 `json:"memory_mbytes" jsonschema:"minimum=0" format:"uint64"` - WaitFor uint64 `json:"waitfor_seconds" jsonschema:"minimum=0" format:"uint64"` + CPU float64 `json:"cpu_usage" jsonschema:"minimum=0"` + Memory uint64 `json:"memory_mbytes" jsonschema:"minimum=0" format:"uint64"` + GPUUsage float64 `json:"gpu_usage" jsonschema:"minimum=0"` + GPUEncoder float64 `json:"gpu_encoder" jsonschema:"minimum=0"` + GPUDecoder float64 `json:"gpu_decoder" jsonschema:"minimum=0"` + GPUMemory uint64 `json:"gpu_memory_mbytes" jsonschema:"minimum=0" format:"uint64"` + WaitFor uint64 `json:"waitfor_seconds" jsonschema:"minimum=0" format:"uint64"` } // ProcessConfig represents the configuration of an ffmpeg process @@ -197,7 +201,13 @@ func (cfg *ProcessConfig) Marshal() (*app.Config, map[string]interface{}) { Scheduler: cfg.Scheduler, LimitCPU: cfg.Limits.CPU, LimitMemory: 
cfg.Limits.Memory * 1024 * 1024, - LimitWaitFor: cfg.Limits.WaitFor, + LimitGPU: app.ConfigLimitGPU{ + Usage: cfg.Limits.GPUUsage, + Encoder: cfg.Limits.GPUEncoder, + Decoder: cfg.Limits.GPUDecoder, + Memory: cfg.Limits.GPUMemory * 1024 * 1024, + }, + LimitWaitFor: cfg.Limits.WaitFor, } cfg.generateInputOutputIDs(cfg.Input) @@ -283,6 +293,10 @@ func (cfg *ProcessConfig) Unmarshal(c *app.Config, metadata map[string]interface cfg.Scheduler = c.Scheduler cfg.Limits.CPU = c.LimitCPU cfg.Limits.Memory = c.LimitMemory / 1024 / 1024 + cfg.Limits.GPUUsage = c.LimitGPU.Usage + cfg.Limits.GPUEncoder = c.LimitGPU.Encoder + cfg.Limits.GPUDecoder = c.LimitGPU.Decoder + cfg.Limits.GPUMemory = c.LimitGPU.Memory / 1024 / 1024 cfg.Limits.WaitFor = c.LimitWaitFor cfg.Options = make([]string, len(c.Options)) @@ -364,20 +378,7 @@ func (s *ProcessState) Unmarshal(state *app.State) { s.Memory = state.Memory s.CPU = json.ToNumber(state.CPU) s.LimitMode = state.LimitMode - s.Resources.CPU = ProcessUsageCPU{ - NCPU: json.ToNumber(state.Resources.CPU.NCPU), - Current: json.ToNumber(state.Resources.CPU.Current), - Average: json.ToNumber(state.Resources.CPU.Average), - Max: json.ToNumber(state.Resources.CPU.Max), - Limit: json.ToNumber(state.Resources.CPU.Limit), - IsThrottling: state.Resources.CPU.IsThrottling, - } - s.Resources.Memory = ProcessUsageMemory{ - Current: state.Resources.Memory.Current, - Average: json.ToNumber(state.Resources.Memory.Average), - Max: state.Resources.Memory.Max, - Limit: state.Resources.Memory.Limit, - } + s.Resources.Unmarshal(&state.Resources) s.Command = state.Command s.Progress.Unmarshal(&state.Progress) @@ -430,15 +431,15 @@ func (p *ProcessUsageCPU) Marshal() app.ProcessUsageCPU { } type ProcessUsageMemory struct { - Current uint64 `json:"cur" format:"uint64"` - Average json.Number `json:"avg" swaggertype:"number" jsonschema:"type=number"` - Max uint64 `json:"max" format:"uint64"` - Limit uint64 `json:"limit" format:"uint64"` + Current uint64 `json:"cur" 
format:"uint64"` + Average uint64 `json:"avg" format:"uint64"` + Max uint64 `json:"max" format:"uint64"` + Limit uint64 `json:"limit" format:"uint64"` } func (p *ProcessUsageMemory) Unmarshal(pp *app.ProcessUsageMemory) { p.Current = pp.Current - p.Average = json.ToNumber(pp.Average) + p.Average = pp.Average p.Max = pp.Max p.Limit = pp.Limit } @@ -446,31 +447,120 @@ func (p *ProcessUsageMemory) Unmarshal(pp *app.ProcessUsageMemory) { func (p *ProcessUsageMemory) Marshal() app.ProcessUsageMemory { pp := app.ProcessUsageMemory{ Current: p.Current, + Average: p.Average, Max: p.Max, Limit: p.Limit, } + return pp +} + +type ProcessUsageGPUMemory struct { + Current uint64 `json:"cur" format:"uint64"` + Average uint64 `json:"avg" format:"uint64"` + Max uint64 `json:"max" format:"uint64"` + Limit uint64 `json:"limit" format:"uint64"` +} + +func (p *ProcessUsageGPUMemory) Unmarshal(pp *app.ProcessUsageGPUMemory) { + p.Current = pp.Current + p.Average = pp.Average + p.Max = pp.Max + p.Limit = pp.Limit +} + +func (p *ProcessUsageGPUMemory) Marshal() app.ProcessUsageGPUMemory { + pp := app.ProcessUsageGPUMemory{ + Current: p.Current, + Average: p.Average, + Max: p.Max, + Limit: p.Limit, + } + + return pp +} + +type ProcessUsageGPUUsage struct { + Current json.Number `json:"cur" swaggertype:"number" jsonschema:"type=number"` + Average json.Number `json:"avg" swaggertype:"number" jsonschema:"type=number"` + Max json.Number `json:"max" swaggertype:"number" jsonschema:"type=number"` + Limit json.Number `json:"limit" swaggertype:"number" jsonschema:"type=number"` +} + +func (p *ProcessUsageGPUUsage) Unmarshal(pp *app.ProcessUsageGPUUsage) { + p.Current = json.ToNumber(pp.Current) + p.Average = json.ToNumber(pp.Average) + p.Max = json.ToNumber(pp.Max) + p.Limit = json.ToNumber(pp.Limit) +} + +func (p *ProcessUsageGPUUsage) Marshal() app.ProcessUsageGPUUsage { + pp := app.ProcessUsageGPUUsage{} + + if x, err := p.Current.Float64(); err == nil { + pp.Current = x + } + if x, err := 
p.Average.Float64(); err == nil { pp.Average = x } + if x, err := p.Max.Float64(); err == nil { + pp.Max = x + } + + if x, err := p.Limit.Float64(); err == nil { + pp.Limit = x + } + + return pp +} + +type ProcessUsageGPU struct { + Index int `json:"index"` + Memory ProcessUsageGPUMemory `json:"memory_bytes"` + Usage ProcessUsageGPUUsage `json:"usage"` + Encoder ProcessUsageGPUUsage `json:"encoder"` + Decoder ProcessUsageGPUUsage `json:"decoder"` +} + +func (p *ProcessUsageGPU) Unmarshal(pp *app.ProcessUsageGPU) { + p.Index = pp.Index + p.Memory.Unmarshal(&pp.Memory) + p.Usage.Unmarshal(&pp.Usage) + p.Encoder.Unmarshal(&pp.Encoder) + p.Decoder.Unmarshal(&pp.Decoder) +} + +func (p *ProcessUsageGPU) Marshal() app.ProcessUsageGPU { + pp := app.ProcessUsageGPU{ + Index: p.Index, + Memory: p.Memory.Marshal(), + Usage: p.Usage.Marshal(), + Encoder: p.Encoder.Marshal(), + Decoder: p.Decoder.Marshal(), + } + return pp } type ProcessUsage struct { CPU ProcessUsageCPU `json:"cpu_usage"` Memory ProcessUsageMemory `json:"memory_bytes"` + GPU ProcessUsageGPU `json:"gpu"` } func (p *ProcessUsage) Unmarshal(pp *app.ProcessUsage) { p.CPU.Unmarshal(&pp.CPU) p.Memory.Unmarshal(&pp.Memory) + p.GPU.Unmarshal(&pp.GPU) } func (p *ProcessUsage) Marshal() app.ProcessUsage { pp := app.ProcessUsage{ CPU: p.CPU.Marshal(), Memory: p.Memory.Marshal(), + GPU: p.GPU.Marshal(), } return pp diff --git a/http/api/process_test.go b/http/api/process_test.go index 6dddce39..ddbdfbf8 100644 --- a/http/api/process_test.go +++ b/http/api/process_test.go @@ -56,6 +56,33 @@ func TestProcessUsage(t *testing.T) { Max: 150, Limit: 200, }, + GPU: app.ProcessUsageGPU{ + Index: 3, + Memory: app.ProcessUsageGPUMemory{ + Current: 48, + Average: 43, + Max: 88, + Limit: 34, + }, + Usage: app.ProcessUsageGPUUsage{ + Current: 47, + Average: 22, + Max: 90, + Limit: 80, + }, + Encoder: app.ProcessUsageGPUUsage{ + Current: 48, + Average: 46, + Max: 74, + Limit: 46, + }, + Decoder: app.ProcessUsageGPUUsage{ + Current: 21, 
+ Average: 42, + Max: 30, + Limit: 99, + }, + }, } p := ProcessUsage{} @@ -103,7 +130,13 @@ func TestProcessConfig(t *testing.T) { LogPatterns: []string{"bla", "blubb"}, LimitCPU: 10, LimitMemory: 100 * 1024 * 1024, - LimitWaitFor: 20, + LimitGPU: app.ConfigLimitGPU{ + Usage: 50, + Encoder: 90, + Decoder: 80, + Memory: 24 * 1024 * 1024 * 1024, + }, + LimitWaitFor: 20, } p := ProcessConfig{} diff --git a/internal/.gitignore b/internal/.gitignore index 9872bd8c..ad8efa9c 100644 --- a/internal/.gitignore +++ b/internal/.gitignore @@ -2,4 +2,5 @@ testhelper/ignoresigint/ignoresigint testhelper/sigint/sigint testhelper/sigintwait/sigintwait testhelper/sigpropagate/sigpropagate -testhelper/ffmpeg/ffmpeg \ No newline at end of file +testhelper/ffmpeg/ffmpeg +testhelper/nvidia-smi/nvidia-smi \ No newline at end of file diff --git a/internal/testhelper/nvidia-smi/nvidia-smi.go b/internal/testhelper/nvidia-smi/nvidia-smi.go new file mode 100644 index 00000000..36f6a78c --- /dev/null +++ b/internal/testhelper/nvidia-smi/nvidia-smi.go @@ -0,0 +1,973 @@ +package main + +import ( + "context" + "fmt" + "os" + "os/signal" + "time" +) + +var pmondata = `# gpu pid type sm mem enc dec fb command +# Idx # C/G % % % % MB name + 0 7372 C 2 0 2 - 136 ffmpeg + 0 12176 C 5 2 3 7 782 ffmpeg + 1 20035 C 8 2 4 1 1145 ffmpeg + 1 20141 C 2 1 1 3 429 ffmpeg + 0 29591 C 2 1 - 2 435 ffmpeg ` + +var querydata = ` + + + Mon Jul 15 13:41:56 2024 + 555.42.06 + 12.5 + 2 + + NVIDIA L4 + NVIDIA + Ada Lovelace + Enabled + Disabled + Disabled + None + + N/A + N/A + + + None + + Disabled + 4000 + + N/A + N/A + + 1654523003308 + GPU-c5533cd4-5a60-059e-348d-b6d7466932e4 + 1 + 95.04.29.00.06 + No + 0x100 + 900-2G193-0000-001 + 27B8-895-A1 + N/A + 1 + + G193.0200.00.01 + 2.1 + 6.16 + N/A + + + N/A + N/A + + + N/A + N/A + + N/A + + None + N/A + N/A + + + No + N/A + + 555.42.06 + + N/A + + + 01 + 00 + 0000 + 3 + 2 + 27B810DE + 00000000:01:00.0 + 16CA10DE + + + 4 + 4 + 4 + 4 + 5 + + + 16x + 16x + + + + N/A + N/A + 
+ 0 + 0 + 0 KB/s + 0 KB/s + N/A + N/A + + N/A + P0 + + Active + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + + N/A + + 23034 MiB + 434 MiB + 1 MiB + 22601 MiB + + + 32768 MiB + 1 MiB + 32767 MiB + + + 0 MiB + 0 MiB + 0 MiB + + Default + + 2 % + 0 % + 0 % + 0 % + 0 % + 0 % + + + 0 + 0 + 0 + + + 0 + 0 + 0 + + + Enabled + Enabled + + + + 0 + 0 + 0 + 0 + 0 + + + 0 + 0 + 0 + 0 + 0 + No + + + 0 + 0 + 0 + 0 + 0 + + + + + N/A + N/A + + + N/A + N/A + + N/A + N/A + + + 0 + 0 + No + No + + 96 bank(s) + 0 bank(s) + 0 bank(s) + 0 bank(s) + 0 bank(s) + + + + 45 C + 39 C + -5 C + -2 C + 0 C + N/A + N/A + N/A + + + N/A + N/A + + + P0 + 27.22 W + 72.00 W + 72.00 W + 72.00 W + 40.00 W + 72.00 W + + + N/A + + + P0 + N/A + N/A + N/A + N/A + N/A + N/A + + + 2040 MHz + 2040 MHz + 6250 MHz + 1770 MHz + + + 2040 MHz + 6251 MHz + + + 2040 MHz + 6251 MHz + + + N/A + + + 2040 MHz + 2040 MHz + 6251 MHz + 1770 MHz + + + 2040 MHz + + + N/A + N/A + + + 885.000 mV + + + N/A + N/A + N/A + N/A + + N/A + + + + + 6251 MHz + 2040 MHz + 2025 MHz + 2010 MHz + 1995 MHz + 1980 MHz + 1965 MHz + 1950 MHz + 1935 MHz + 1920 MHz + 1905 MHz + 1890 MHz + 1875 MHz + 1860 MHz + 1845 MHz + 1830 MHz + 1815 MHz + 1800 MHz + 1785 MHz + 1770 MHz + 1755 MHz + 1740 MHz + 1725 MHz + 1710 MHz + 1695 MHz + 1680 MHz + 1665 MHz + 1650 MHz + 1635 MHz + 1620 MHz + 1605 MHz + 1590 MHz + 1575 MHz + 1560 MHz + 1545 MHz + 1530 MHz + 1515 MHz + 1500 MHz + 1485 MHz + 1470 MHz + 1455 MHz + 1440 MHz + 1425 MHz + 1410 MHz + 1395 MHz + 1380 MHz + 1365 MHz + 1350 MHz + 1335 MHz + 1320 MHz + 1305 MHz + 1290 MHz + 1275 MHz + 1260 MHz + 1245 MHz + 1230 MHz + 1215 MHz + 1200 MHz + 1185 MHz + 1170 MHz + 1155 MHz + 1140 MHz + 1125 MHz + 1110 MHz + 1095 MHz + 1080 MHz + 1065 MHz + 1050 MHz + 1035 MHz + 1020 MHz + 1005 MHz + 990 MHz + 975 MHz + 960 MHz + 945 MHz + 930 MHz + 915 MHz + 900 MHz + 885 MHz + 870 MHz + 855 MHz + 840 MHz + 825 MHz + 810 MHz + 795 MHz + 780 MHz + 765 MHz + 750 
MHz + 735 MHz + 720 MHz + 705 MHz + 690 MHz + 675 MHz + 660 MHz + 645 MHz + 630 MHz + 615 MHz + 600 MHz + 585 MHz + 570 MHz + 555 MHz + 540 MHz + 525 MHz + 510 MHz + 495 MHz + 480 MHz + 465 MHz + 450 MHz + 435 MHz + 420 MHz + 405 MHz + 390 MHz + 375 MHz + 360 MHz + 345 MHz + 330 MHz + 315 MHz + 300 MHz + 285 MHz + 270 MHz + 255 MHz + 240 MHz + 225 MHz + 210 MHz + + + 405 MHz + 645 MHz + 630 MHz + 615 MHz + 600 MHz + 585 MHz + 570 MHz + 555 MHz + 540 MHz + 525 MHz + 510 MHz + 495 MHz + 480 MHz + 465 MHz + 450 MHz + 435 MHz + 420 MHz + 405 MHz + 390 MHz + 375 MHz + 360 MHz + 345 MHz + 330 MHz + 315 MHz + 300 MHz + 285 MHz + 270 MHz + 255 MHz + 240 MHz + 225 MHz + 210 MHz + + + + + 10131 + C + ffmpeg + 389 MiB + + + 13597 + C + ffmpeg + 1054 MiB + + + + + + disabled + + + + + NVIDIA L4 + NVIDIA + Ada Lovelace + Enabled + Disabled + Disabled + None + + N/A + N/A + + + None + + Disabled + 4000 + + N/A + N/A + + 1654523001128 + GPU-128ab6fb-6ec9-fd74-b479-4a5fd14f55bd + 0 + 95.04.29.00.06 + No + 0xc100 + 900-2G193-0000-001 + 27B8-895-A1 + N/A + 1 + + G193.0200.00.01 + 2.1 + 6.16 + N/A + + + N/A + N/A + + + N/A + N/A + + N/A + + None + N/A + N/A + + + No + N/A + + 555.42.06 + + N/A + + + C1 + 00 + 0000 + 3 + 2 + 27B810DE + 00000000:C1:00.0 + 16CA10DE + + + 4 + 4 + 4 + 4 + 5 + + + 16x + 1x + + + + N/A + N/A + + 0 + 0 + 0 KB/s + 0 KB/s + N/A + N/A + + N/A + P0 + + Active + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + + N/A + + 23034 MiB + 434 MiB + 1 MiB + 22601 MiB + + + 32768 MiB + 1 MiB + 32767 MiB + + + 0 MiB + 0 MiB + 0 MiB + + Default + + 3 % + 0 % + 0 % + 0 % + 0 % + 0 % + + + 0 + 0 + 0 + + + 0 + 0 + 0 + + + Enabled + Enabled + + + + 0 + 0 + 0 + 0 + 0 + + + 0 + 0 + 0 + 0 + 0 + No + + + 0 + 0 + 0 + 0 + 0 + + + + + N/A + N/A + + + N/A + N/A + + N/A + N/A + + + 0 + 0 + No + No + + 96 bank(s) + 0 bank(s) + 0 bank(s) + 0 bank(s) + 0 bank(s) + + + + 40 C + 43 C + -5 C + -2 C + 0 C + N/A + N/A + N/A + + + N/A + N/A 
+ + + P0 + 29.54 W + 72.00 W + 72.00 W + 72.00 W + 40.00 W + 72.00 W + + + N/A + + + P0 + N/A + N/A + N/A + N/A + N/A + N/A + + + 2040 MHz + 2040 MHz + 6250 MHz + 1770 MHz + + + 2040 MHz + 6251 MHz + + + 2040 MHz + 6251 MHz + + + N/A + + + 2040 MHz + 2040 MHz + 6251 MHz + 1770 MHz + + + 2040 MHz + + + N/A + N/A + + + 910.000 mV + + + N/A + N/A + N/A + N/A + + N/A + + + + + 6251 MHz + 2040 MHz + 2025 MHz + 2010 MHz + 1995 MHz + 1980 MHz + 1965 MHz + 1950 MHz + 1935 MHz + 1920 MHz + 1905 MHz + 1890 MHz + 1875 MHz + 1860 MHz + 1845 MHz + 1830 MHz + 1815 MHz + 1800 MHz + 1785 MHz + 1770 MHz + 1755 MHz + 1740 MHz + 1725 MHz + 1710 MHz + 1695 MHz + 1680 MHz + 1665 MHz + 1650 MHz + 1635 MHz + 1620 MHz + 1605 MHz + 1590 MHz + 1575 MHz + 1560 MHz + 1545 MHz + 1530 MHz + 1515 MHz + 1500 MHz + 1485 MHz + 1470 MHz + 1455 MHz + 1440 MHz + 1425 MHz + 1410 MHz + 1395 MHz + 1380 MHz + 1365 MHz + 1350 MHz + 1335 MHz + 1320 MHz + 1305 MHz + 1290 MHz + 1275 MHz + 1260 MHz + 1245 MHz + 1230 MHz + 1215 MHz + 1200 MHz + 1185 MHz + 1170 MHz + 1155 MHz + 1140 MHz + 1125 MHz + 1110 MHz + 1095 MHz + 1080 MHz + 1065 MHz + 1050 MHz + 1035 MHz + 1020 MHz + 1005 MHz + 990 MHz + 975 MHz + 960 MHz + 945 MHz + 930 MHz + 915 MHz + 900 MHz + 885 MHz + 870 MHz + 855 MHz + 840 MHz + 825 MHz + 810 MHz + 795 MHz + 780 MHz + 765 MHz + 750 MHz + 735 MHz + 720 MHz + 705 MHz + 690 MHz + 675 MHz + 660 MHz + 645 MHz + 630 MHz + 615 MHz + 600 MHz + 585 MHz + 570 MHz + 555 MHz + 540 MHz + 525 MHz + 510 MHz + 495 MHz + 480 MHz + 465 MHz + 450 MHz + 435 MHz + 420 MHz + 405 MHz + 390 MHz + 375 MHz + 360 MHz + 345 MHz + 330 MHz + 315 MHz + 300 MHz + 285 MHz + 270 MHz + 255 MHz + 240 MHz + 225 MHz + 210 MHz + + + 405 MHz + 645 MHz + 630 MHz + 615 MHz + 600 MHz + 585 MHz + 570 MHz + 555 MHz + 540 MHz + 525 MHz + 510 MHz + 495 MHz + 480 MHz + 465 MHz + 450 MHz + 435 MHz + 420 MHz + 405 MHz + 390 MHz + 375 MHz + 360 MHz + 345 MHz + 330 MHz + 315 MHz + 300 MHz + 285 MHz + 270 MHz + 255 MHz + 240 MHz + 225 MHz + 210 MHz 
+ + + + + 16870 + C + ffmpeg + 549 MiB + + + + + + disabled + + + +` + +func main() { + if len(os.Args) == 1 { + os.Exit(1) + } + + ctx, cancel := context.WithCancel(context.Background()) + + if os.Args[1] == "pmon" { + go func(ctx context.Context) { + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + fmt.Fprintf(os.Stdout, "%s\n", pmondata) + } + } + }(ctx) + } else { + go func(ctx context.Context) { + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + fmt.Fprintf(os.Stdout, "%s\n", querydata) + } + } + }(ctx) + } + + // Wait for interrupt signal to gracefully shutdown the app + quit := make(chan os.Signal, 1) + signal.Notify(quit, os.Interrupt) + <-quit + + cancel() + + os.Exit(0) +} diff --git a/monitor/cpu.go b/monitor/cpu.go index 83869653..8a10850a 100644 --- a/monitor/cpu.go +++ b/monitor/cpu.go @@ -33,7 +33,7 @@ func NewCPUCollector(rsc resources.Resources) metric.Collector { c.limitDescr = metric.NewDesc("cpu_limit", "Percentage of CPU to be consumed", nil) c.throttleDescr = metric.NewDesc("cpu_throttling", "Whether the CPU is currently throttled", nil) - if ncpu, err := psutil.CPUCounts(true); err == nil { + if ncpu, err := psutil.CPUCounts(); err == nil { c.ncpu = ncpu } @@ -63,11 +63,11 @@ func (c *cpuCollector) Collect() metric.Metrics { metrics.Add(metric.NewValue(c.ncpuDescr, c.ncpu)) - limit, _ := c.resources.Limits() + limit, _, _, _ := c.resources.Limits() metrics.Add(metric.NewValue(c.limitDescr, limit)) - cpu, _ := c.resources.ShouldLimit() + cpu, _, _ := c.resources.ShouldLimit() throttling := .0 if cpu { throttling = 1 diff --git a/monitor/disk.go b/monitor/disk.go index 7e1ba86d..fda2f24d 100644 --- a/monitor/disk.go +++ b/monitor/disk.go @@ -37,7 +37,7 @@ func (c *diskCollector) Describe() []*metric.Description { func (c *diskCollector) Collect() metric.Metrics { metrics := 
metric.NewMetrics() - stat, err := psutil.DiskUsage(c.path) + stat, err := psutil.Disk(c.path) if err != nil { return metrics } diff --git a/monitor/mem.go b/monitor/mem.go index 10a66f7f..986b2be5 100644 --- a/monitor/mem.go +++ b/monitor/mem.go @@ -44,11 +44,11 @@ func (c *memCollector) Describe() []*metric.Description { func (c *memCollector) Collect() metric.Metrics { metrics := metric.NewMetrics() - _, limit := c.resources.Limits() + _, limit, _, _ := c.resources.Limits() metrics.Add(metric.NewValue(c.limitDescr, float64(limit))) - _, memory := c.resources.ShouldLimit() + _, memory, _ := c.resources.ShouldLimit() throttling := .0 if memory { throttling = 1 @@ -56,7 +56,7 @@ func (c *memCollector) Collect() metric.Metrics { metrics.Add(metric.NewValue(c.throttleDescr, throttling)) - stat, err := psutil.VirtualMemory() + stat, err := psutil.Memory() if err != nil { return metrics } diff --git a/monitor/net.go b/monitor/net.go index 87b2b8a3..270e0948 100644 --- a/monitor/net.go +++ b/monitor/net.go @@ -33,7 +33,7 @@ func (c *netCollector) Describe() []*metric.Description { func (c *netCollector) Collect() metric.Metrics { metrics := metric.NewMetrics() - devs, err := psutil.NetIOCounters(true) + devs, err := psutil.Network() if err != nil { return metrics } diff --git a/process/limiter.go b/process/limiter.go index ea5df9c2..699294dc 100644 --- a/process/limiter.go +++ b/process/limiter.go @@ -25,9 +25,36 @@ type Usage struct { Max uint64 // bytes Limit uint64 // bytes } + GPU struct { + Index int // number of the GPU + Memory struct { + Current uint64 // bytes + Average float64 // bytes + Max uint64 // bytes + Limit uint64 // bytes + } + Usage struct { + Current float64 // percent 0-100 + Average float64 // percent 0-100 + Max float64 // percent 0-100 + Limit float64 // percent 0-100 + } + Encoder struct { + Current float64 // percent 0-100 + Average float64 // percent 0-100 + Max float64 // percent 0-100 + Limit float64 // percent 0-100 + } + Decoder struct { 
+ Current float64 // percent 0-100 + Average float64 // percent 0-100 + Max float64 // percent 0-100 + Limit float64 // percent 0-100 + } + } } -type LimitFunc func(cpu float64, memory uint64) +type LimitFunc func(cpu float64, memory uint64, gpuusage, gpuencoder, gpudecoder float64, gpumemory uint64) type LimitMode int @@ -44,18 +71,22 @@ func (m LimitMode) String() string { } const ( - LimitModeHard LimitMode = 0 // Killing the process if either CPU or memory is above the limit for a certain time - LimitModeSoft LimitMode = 1 // Throttling the CPU if activated, killing the process if memory is above the limit for a certain time + LimitModeHard LimitMode = 0 // Killing the process if either resource is above the limit for a certain time. + LimitModeSoft LimitMode = 1 // If activated, will throttle the CPU, otherwise killing the process if resources are above the limit. ) type LimiterConfig struct { - CPU float64 // Max. CPU usage in percent 0-100 in hard mode, 0-100*ncpu in softmode - Memory uint64 // Max. memory usage in bytes - WaitFor time.Duration // Duration for one of the limits has to be above the limit until OnLimit gets triggered - OnLimit LimitFunc // Function to be triggered if limits are exceeded - Mode LimitMode // How to limit CPU usage - PSUtil psutil.Util - Logger log.Logger + CPU float64 // Max. CPU usage in percent 0-100 in hard mode, 0-100*ncpu in soft mode. + Memory uint64 // Max. memory usage in bytes. + GPUUsage float64 // Max. GPU general usage in percent 0-100. + GPUEncoder float64 // Max. GPU encoder usage in percent 0-100. + GPUDecoder float64 // Max. GPU decoder usage in percent 0-100. + GPUMemory uint64 // Max. GPU memory usage in bytes. + WaitFor time.Duration // Duration for one of the limits has to be above the limit until OnLimit gets triggered. + OnLimit LimitFunc // Function to be triggered if limits are exceeded. + Mode LimitMode // How to limit CPU usage. 
+ PSUtil psutil.Util + Logger log.Logger } type Limiter interface { @@ -65,26 +96,135 @@ type Limiter interface { // Stop stops the limiter. The limiter can be reused by calling Start() again Stop() - // Current returns the current CPU and memory values - // Deprecated: use Usage() - Current() (cpu float64, memory uint64) - - // Limits returns the defined CPU and memory limits. Values <= 0 means no limit - // Deprecated: use Usage() - Limits() (cpu float64, memory uint64) - // Usage returns the current state of the limiter, such as current, average, max, and // limit values for CPU and memory. Usage() Usage // Limit enables or disables the throttling of the CPU or killing because of to much - // memory consumption. - Limit(cpu, memory bool) error + // memory or GPU consumption. + Limit(cpu, memory, gpu bool) error // Mode returns in which mode the limiter is running in. Mode() LimitMode } +type numbers interface { + ~uint64 | ~float64 +} + +type metric[T numbers] struct { + limit T // Limit + current T // Current load value + last T // Last load value + max T // Max. load value + top T // Decaying max. 
load value + avg float64 // Average load value + avgCounter uint64 // Counter for average calculation + limitSince time.Time // Time when the limit has been reached (hard limiter mode) + limitEnable bool +} + +func (x *metric[T]) Reset() { + var zero T + + x.current = zero + x.last = zero + x.max = zero + x.top = zero + x.avg = 0 + x.avgCounter = 0 + x.limitEnable = false +} + +func (x *metric[T]) Current() T { + return x.current +} + +func (x *metric[T]) Top() T { + return x.top +} + +func (x *metric[T]) Max() T { + return x.max +} + +func (x *metric[T]) Avg() float64 { + return x.avg +} + +func (x *metric[T]) SetLimit(limit T) { + x.limit = limit +} + +func (x *metric[T]) Limit() T { + return x.limit +} + +func (x *metric[T]) DoLimit(limit bool) (enabled, changed bool) { + if x.limitEnable != limit { + x.limitEnable = limit + changed = true + } + + enabled = x.limitEnable + + return +} + +func (x *metric[T]) IsLimitEnabled() bool { + return x.limitEnable +} + +func (x *metric[T]) Update(value T) { + x.last, x.current = x.current, value + + if x.current > x.max { + x.max = x.current + } + + if x.current > x.top { + x.top = x.current + } else { + x.top = T(float64(x.top) * 0.95) + } + + x.avgCounter++ + + x.avg = ((x.avg * float64(x.avgCounter-1)) + float64(x.current)) / float64(x.avgCounter) +} + +func (x *metric[T]) IsExceeded(waitFor time.Duration, mode LimitMode) bool { + if x.limit <= 0 { + return false + } + + if mode == LimitModeSoft { + // Check if we actually should limit. + if !x.limitEnable { + return false + } + + // If we are currently above the limit, the limit is exceeded. + if x.current > x.limit { + return true + } + } else { + if x.current > x.limit { + // Current value is higher than the limit. + if x.last <= x.limit { + // If the previous value is below the limit, then we reached the limit as of now. 
+ x.limitSince = time.Now() + } + + if time.Since(x.limitSince) >= waitFor { + return true + } + } + } + + return false +} + type limiter struct { psutil psutil.Util @@ -98,40 +238,27 @@ type limiter struct { lastUsage Usage lastUsageLock sync.RWMutex - cpu float64 // CPU limit - cpuCurrent float64 // Current CPU load of this process - cpuLast float64 // Last CPU load of this process - cpuMax float64 // Max. CPU load of this process - cpuTop float64 // Decaying max. CPU load of this process - cpuAvg float64 // Average CPU load of this process - cpuAvgCounter uint64 // Counter for average calculation - cpuLimitSince time.Time // Time when the CPU limit has been reached (hard limiter mode) - cpuLimitEnable bool // Whether CPU throttling is enabled (soft limiter mode) - cpuThrottling bool // Whether CPU throttling is currently active (soft limiter mode) + cpu metric[float64] // CPU limit + cpuThrottling bool // Whether CPU throttling is currently active (soft limiter mode) - memory uint64 // Memory limit (bytes) - memoryCurrent uint64 // Current memory usage - memoryLast uint64 // Last memory usage - memoryMax uint64 // Max. memory usage - memoryTop uint64 // Decaying max. 
memory usage - memoryAvg float64 // Average memory usage - memoryAvgCounter uint64 // Counter for average memory calculation - memoryLimitSince time.Time // Time when the memory limit has been reached (hard limiter mode) - memoryLimitEnable bool // Whether memory limiting is enabled (soft limiter mode) + memory metric[uint64] // Memory limit (bytes) + + gpu struct { + memory metric[uint64] // GPU memory limit (0-100 percent) + usage metric[float64] // GPU load limit (0-100 percent) + encoder metric[float64] // GPU encoder limit (0-100 percent) + decoder metric[float64] // GPU decoder limit (0-100 percent) + } waitFor time.Duration mode LimitMode - cancelLimit context.CancelFunc - logger log.Logger } // NewLimiter returns a new Limiter func NewLimiter(config LimiterConfig) Limiter { l := &limiter{ - cpu: config.CPU, - memory: config.Memory, waitFor: config.WaitFor, onLimit: config.OnLimit, mode: config.Mode, @@ -139,6 +266,13 @@ func NewLimiter(config LimiterConfig) Limiter { logger: config.Logger, } + l.cpu.SetLimit(config.CPU / 100) + l.memory.SetLimit(config.Memory) + l.gpu.memory.SetLimit(config.GPUMemory) + l.gpu.usage.SetLimit(config.GPUUsage / 100) + l.gpu.encoder.SetLimit(config.GPUEncoder / 100) + l.gpu.decoder.SetLimit(config.GPUDecoder / 100) + if l.logger == nil { l.logger = log.New("") } @@ -147,57 +281,56 @@ func NewLimiter(config LimiterConfig) Limiter { l.psutil = psutil.DefaultUtil } - if ncpu, err := l.psutil.CPUCounts(true); err != nil { + if ncpu, err := l.psutil.CPUCounts(); err != nil { l.ncpu = 1 } else { l.ncpu = ncpu } l.lastUsage.CPU.NCPU = l.ncpu - l.lastUsage.CPU.Limit = l.cpu * l.ncpu - l.lastUsage.Memory.Limit = l.memory + l.lastUsage.CPU.Limit = l.cpu.Limit() * 100 * l.ncpu + l.lastUsage.Memory.Limit = l.memory.Limit() + l.lastUsage.GPU.Memory.Limit = l.gpu.memory.Limit() + l.lastUsage.GPU.Usage.Limit = l.gpu.usage.Limit() * 100 + l.lastUsage.GPU.Encoder.Limit = l.gpu.encoder.Limit() * 100 + l.lastUsage.GPU.Decoder.Limit = 
l.gpu.decoder.Limit() * 100 l.ncpuFactor = 1 mode := "hard" if l.mode == LimitModeSoft { mode = "soft" - l.cpu /= l.ncpu + l.cpu.SetLimit(l.cpu.Limit() / l.ncpu) l.ncpuFactor = l.ncpu } - l.cpu /= 100 - if l.onLimit == nil { - l.onLimit = func(float64, uint64) {} + l.onLimit = func(float64, uint64, float64, float64, float64, uint64) {} } l.logger = l.logger.WithFields(log.Fields{ - "cpu": l.cpu * l.ncpuFactor, - "memory": l.memory, - "mode": mode, + "cpu": l.cpu.Limit() * l.ncpuFactor, + "memory": l.memory.Limit(), + "gpumemory": l.gpu.memory.Limit(), + "gpuusage": l.gpu.usage.Limit(), + "gpuencoder": l.gpu.encoder.Limit(), + "gpudecoder": l.gpu.decoder.Limit(), + "mode": mode, }) return l } func (l *limiter) reset() { - l.cpuCurrent = 0 - l.cpuLast = 0 - l.cpuAvg = 0 - l.cpuAvgCounter = 0 - l.cpuMax = 0 - l.cpuTop = 0 - l.cpuLimitEnable = false + l.cpu.Reset() l.cpuThrottling = false - l.memoryCurrent = 0 - l.memoryLast = 0 - l.memoryAvg = 0 - l.memoryAvgCounter = 0 - l.memoryMax = 0 - l.memoryTop = 0 - l.memoryLimitEnable = false + l.memory.Reset() + + l.gpu.memory.Reset() + l.gpu.usage.Reset() + l.gpu.encoder.Reset() + l.gpu.decoder.Reset() } func (l *limiter) Start(process psutil.Process) error { @@ -218,10 +351,7 @@ func (l *limiter) Start(process psutil.Process) error { go l.ticker(ctx, time.Second) if l.mode == LimitModeSoft { - ctx, cancel = context.WithCancel(context.Background()) - l.cancelLimit = cancel - - go l.limitCPU(ctx, l.cpu, time.Second) + go l.limitCPU(ctx, l.cpu.Limit(), time.Second) } return nil @@ -237,11 +367,6 @@ func (l *limiter) Stop() { l.cancel() - if l.cancelLimit != nil { - l.cancelLimit() - l.cancelLimit = nil - } - l.proc.Stop() l.proc = nil @@ -256,13 +381,13 @@ func (l *limiter) ticker(ctx context.Context, interval time.Duration) { select { case <-ctx.Done(): return - case t := <-ticker.C: - l.collect(t) + case <-ticker.C: + l.collect() } } } -func (l *limiter) collect(_ time.Time) { +func (l *limiter) collect() { l.lock.Lock() 
proc := l.proc l.lock.Unlock() @@ -271,118 +396,108 @@ func (l *limiter) collect(_ time.Time) { return } - mstat, merr := proc.VirtualMemory() - cpustat, cerr := proc.CPUPercent() + mstat, merr := proc.Memory() + cpustat, cerr := proc.CPU() + gstat, gerr := proc.GPU() + gindex := -1 l.lock.Lock() + defer l.lock.Unlock() if merr == nil { - l.memoryLast, l.memoryCurrent = l.memoryCurrent, mstat - - if l.memoryCurrent > l.memoryMax { - l.memoryMax = l.memoryCurrent - } - - if l.memoryCurrent > l.memoryTop { - l.memoryTop = l.memoryCurrent - } else { - l.memoryTop = uint64(float64(l.memoryTop) * 0.95) - } - - l.memoryAvgCounter++ - - l.memoryAvg = ((l.memoryAvg * float64(l.memoryAvgCounter-1)) + float64(l.memoryCurrent)) / float64(l.memoryAvgCounter) + l.memory.Update(mstat) } if cerr == nil { - l.cpuLast, l.cpuCurrent = l.cpuCurrent, (cpustat.System+cpustat.User+cpustat.Other)/100 + l.cpu.Update((cpustat.System + cpustat.User + cpustat.Other) / 100) + } - if l.cpuCurrent > l.cpuMax { - l.cpuMax = l.cpuCurrent - } - - if l.cpuCurrent > l.cpuTop { - l.cpuTop = l.cpuCurrent - } else { - l.cpuTop = l.cpuTop * 0.95 - } - - l.cpuAvgCounter++ - - l.cpuAvg = ((l.cpuAvg * float64(l.cpuAvgCounter-1)) + l.cpuCurrent) / float64(l.cpuAvgCounter) + if gerr == nil { + l.gpu.memory.Update(gstat.MemoryUsed) + l.gpu.usage.Update(gstat.Usage / 100) + l.gpu.encoder.Update(gstat.Encoder / 100) + l.gpu.decoder.Update(gstat.Decoder / 100) + gindex = gstat.Index } isLimitExceeded := false if l.mode == LimitModeHard { - if l.cpu > 0 { - if l.cpuCurrent > l.cpu { - // Current value is higher than the limit - if l.cpuLast <= l.cpu { - // If the previous value is below the limit, then we reached the - // limit as of now - l.cpuLimitSince = time.Now() - } - - if time.Since(l.cpuLimitSince) >= l.waitFor { - l.logger.Warn().Log("CPU limit exceeded") - isLimitExceeded = true - } - } + if l.cpu.IsExceeded(l.waitFor, l.mode) { + l.logger.Warn().Log("CPU limit exceeded") + isLimitExceeded = true } + } 
- if l.memory > 0 { - if l.memoryCurrent > l.memory { - // Current value is higher than the limit - if l.memoryLast <= l.memory { - // If the previous value is below the limit, then we reached the - // limit as of now - l.memoryLimitSince = time.Now() - } + if l.memory.IsExceeded(l.waitFor, l.mode) { + l.logger.Warn().Log("Memory limit exceeded") + isLimitExceeded = true + } - if time.Since(l.memoryLimitSince) >= l.waitFor { - l.logger.Warn().Log("Memory limit exceeded") - isLimitExceeded = true - } - } - } - } else { - if l.memory > 0 && l.memoryLimitEnable { - if l.memoryCurrent > l.memory { - // Current value is higher than the limit - l.logger.Warn().Log("Memory limit exceeded") - isLimitExceeded = true - } - } + if l.gpu.memory.IsExceeded(l.waitFor, l.mode) { + l.logger.Warn().Log("GPU memory limit exceeded") + isLimitExceeded = true + } + + if l.gpu.usage.IsExceeded(l.waitFor, l.mode) { + l.logger.Warn().Log("GPU usage limit exceeded") + isLimitExceeded = true + } + + if l.gpu.encoder.IsExceeded(l.waitFor, l.mode) { + l.logger.Warn().Log("GPU encoder limit exceeded") + isLimitExceeded = true + } + + if l.gpu.decoder.IsExceeded(l.waitFor, l.mode) { + l.logger.Warn().Log("GPU decoder limit exceeded") + isLimitExceeded = true } l.logger.Debug().WithFields(log.Fields{ - "cur_cpu": l.cpuCurrent * l.ncpuFactor, - "top_cpu": l.cpuTop * l.ncpuFactor, - "cur_mem": l.memoryCurrent, - "top_mem": l.memoryTop, - "exceeded": isLimitExceeded, + "cur_cpu": l.cpu.Current() * l.ncpuFactor, + "top_cpu": l.cpu.Top() * l.ncpuFactor, + "cur_mem": l.memory.Current(), + "top_mem": l.memory.Top(), + "cur_gpu_mem": l.gpu.memory.Current(), + "top_gpu_mem": l.gpu.memory.Top(), + "exceeded": isLimitExceeded, }).Log("Observation") if isLimitExceeded { - go l.onLimit(l.cpuCurrent*l.ncpuFactor*100, l.memoryCurrent) + go l.onLimit(l.cpu.Current()*l.ncpuFactor*100, l.memory.Current(), l.gpu.usage.Current(), l.gpu.encoder.Current(), l.gpu.decoder.Current(), l.gpu.memory.Current()) } 
l.lastUsageLock.Lock() - l.lastUsage.CPU.Current = l.cpuCurrent * l.ncpu * 100 - l.lastUsage.CPU.Average = l.cpuAvg * l.ncpu * 100 - l.lastUsage.CPU.Max = l.cpuMax * l.ncpu * 100 + l.lastUsage.CPU.Current = l.cpu.Current() * l.ncpu * 100 + l.lastUsage.CPU.Average = l.cpu.Avg() * l.ncpu * 100 + l.lastUsage.CPU.Max = l.cpu.Max() * l.ncpu * 100 l.lastUsage.CPU.IsThrottling = l.cpuThrottling - l.lastUsage.Memory.Current = l.memoryCurrent - l.lastUsage.Memory.Average = l.memoryAvg - l.lastUsage.Memory.Max = l.memoryMax - l.lastUsageLock.Unlock() + l.lastUsage.Memory.Current = l.memory.Current() + l.lastUsage.Memory.Average = l.memory.Avg() + l.lastUsage.Memory.Max = l.memory.Max() - l.lock.Unlock() + l.lastUsage.GPU.Index = gindex + l.lastUsage.GPU.Memory.Current = l.gpu.memory.Current() * 100 + l.lastUsage.GPU.Memory.Average = l.gpu.memory.Avg() * 100 + l.lastUsage.GPU.Memory.Max = l.gpu.memory.Max() * 100 + + l.lastUsage.GPU.Usage.Current = l.gpu.usage.Current() * 100 + l.lastUsage.GPU.Usage.Average = l.gpu.usage.Avg() * 100 + l.lastUsage.GPU.Usage.Max = l.gpu.usage.Max() * 100 + + l.lastUsage.GPU.Encoder.Current = l.gpu.encoder.Current() * 100 + l.lastUsage.GPU.Encoder.Average = l.gpu.encoder.Avg() * 100 + l.lastUsage.GPU.Encoder.Max = l.gpu.encoder.Max() * 100 + + l.lastUsage.GPU.Decoder.Current = l.gpu.decoder.Current() * 100 + l.lastUsage.GPU.Decoder.Average = l.gpu.decoder.Avg() * 100 + l.lastUsage.GPU.Decoder.Max = l.gpu.decoder.Max() * 100 + l.lastUsageLock.Unlock() } -func (l *limiter) Limit(cpu, memory bool) error { +func (l *limiter) Limit(cpu, memory, gpu bool) error { l.lock.Lock() defer l.lock.Unlock() @@ -390,35 +505,31 @@ func (l *limiter) Limit(cpu, memory bool) error { return nil } - if memory { - if !l.memoryLimitEnable { - l.memoryLimitEnable = true - - l.logger.Debug().Log("Memory limiter enabled") - } - } else { - if l.memoryLimitEnable { - l.memoryLimitEnable = false - - l.logger.Debug().Log("Memory limiter disabled") - } + enabled, changed := 
l.cpu.DoLimit(cpu) + if enabled && changed { + l.logger.Debug().Log("CPU limiter enabled") + } else if !enabled && changed { + l.logger.Debug().Log("CPU limiter disabled") } - if cpu { - if !l.cpuLimitEnable { - l.cpuLimitEnable = true - - l.logger.Debug().Log("CPU limiter enabled") - } - } else { - if l.cpuLimitEnable { - l.cpuLimitEnable = false - - l.logger.Debug().Log("CPU limiter disabled") - } - + enabled, changed = l.memory.DoLimit(memory) + if enabled && changed { + l.logger.Debug().Log("Memory limiter enabled") + } else if !enabled && changed { + l.logger.Debug().Log("Memory limiter disabled") } + enabled, changed = l.gpu.memory.DoLimit(gpu) + if enabled && changed { + l.logger.Debug().Log("GPU limiter enabled") + } else if !enabled && changed { + l.logger.Debug().Log("GPU limiter disabled") + } + + l.gpu.usage.DoLimit(gpu) + l.gpu.encoder.DoLimit(gpu) + l.gpu.decoder.DoLimit(gpu) + return nil } @@ -453,7 +564,7 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur l.lock.Lock() - if !l.cpuLimitEnable { + if !l.cpu.IsLimitEnabled() { if factorTopLimit > 0 { factorTopLimit -= 10 } else { @@ -469,7 +580,7 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur } } else { factorTopLimit = 100 - topLimit = l.cpuTop - limit + topLimit = l.cpu.Top() - limit l.cpuThrottling = true } @@ -482,7 +593,7 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur lim += (100 - factorTopLimit) / 100 * topLimit } - pcpu := l.cpuCurrent + pcpu := l.cpu.Current() l.lock.Unlock() @@ -526,16 +637,6 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur } } -func (l *limiter) Current() (cpu float64, memory uint64) { - l.lastUsageLock.RLock() - defer l.lastUsageLock.RUnlock() - - cpu = l.lastUsage.CPU.Current / l.ncpu - memory = l.lastUsage.Memory.Current - - return -} - func (l *limiter) Usage() Usage { l.lastUsageLock.RLock() defer l.lastUsageLock.RUnlock() @@ 
-543,10 +644,6 @@ func (l *limiter) Usage() Usage { return l.lastUsage } -func (l *limiter) Limits() (cpu float64, memory uint64) { - return l.cpu * 100, l.memory -} - func (l *limiter) Mode() LimitMode { return l.mode } diff --git a/process/limiter_test.go b/process/limiter_test.go index c9e31127..0ec98333 100644 --- a/process/limiter_test.go +++ b/process/limiter_test.go @@ -7,13 +7,13 @@ import ( "github.com/datarhei/core/v16/psutil" - "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) type psproc struct{} -func (p *psproc) CPUPercent() (*psutil.CPUInfoStat, error) { - return &psutil.CPUInfoStat{ +func (p *psproc) CPU() (*psutil.CPUInfo, error) { + return &psutil.CPUInfo{ System: 50, User: 0, Idle: 0, @@ -21,10 +21,22 @@ func (p *psproc) CPUPercent() (*psutil.CPUInfoStat, error) { }, nil } -func (p *psproc) VirtualMemory() (uint64, error) { +func (p *psproc) Memory() (uint64, error) { return 197, nil } +func (p *psproc) GPU() (*psutil.GPUInfo, error) { + return &psutil.GPUInfo{ + Index: 0, + Name: "L4", + MemoryTotal: 128, + MemoryUsed: 91, + Usage: 3, + Encoder: 9, + Decoder: 5, + }, nil +} + func (p *psproc) Stop() {} func (p *psproc) Suspend() error { return nil } func (p *psproc) Resume() error { return nil } @@ -42,7 +54,7 @@ func TestCPULimit(t *testing.T) { l := NewLimiter(LimiterConfig{ CPU: 42, - OnLimit: func(float64, uint64) { + OnLimit: func(float64, uint64, float64, float64, float64, uint64) { wg.Done() }, }) @@ -57,7 +69,7 @@ func TestCPULimit(t *testing.T) { lock.Unlock() }() - assert.Eventually(t, func() bool { + require.Eventually(t, func() bool { lock.Lock() defer lock.Unlock() @@ -79,7 +91,7 @@ func TestCPULimitWaitFor(t *testing.T) { l := NewLimiter(LimiterConfig{ CPU: 42, WaitFor: 3 * time.Second, - OnLimit: func(float64, uint64) { + OnLimit: func(float64, uint64, float64, float64, float64, uint64) { wg.Done() }, }) @@ -94,7 +106,7 @@ func TestCPULimitWaitFor(t *testing.T) { lock.Unlock() }() - 
assert.Eventually(t, func() bool { + require.Eventually(t, func() bool { lock.Lock() defer lock.Unlock() @@ -115,7 +127,7 @@ func TestMemoryLimit(t *testing.T) { l := NewLimiter(LimiterConfig{ Memory: 42, - OnLimit: func(float64, uint64) { + OnLimit: func(float64, uint64, float64, float64, float64, uint64) { wg.Done() }, }) @@ -130,7 +142,7 @@ func TestMemoryLimit(t *testing.T) { lock.Unlock() }() - assert.Eventually(t, func() bool { + require.Eventually(t, func() bool { lock.Lock() defer lock.Unlock() @@ -152,7 +164,7 @@ func TestMemoryLimitWaitFor(t *testing.T) { l := NewLimiter(LimiterConfig{ Memory: 42, WaitFor: 3 * time.Second, - OnLimit: func(float64, uint64) { + OnLimit: func(float64, uint64, float64, float64, float64, uint64) { wg.Done() }, }) @@ -167,7 +179,80 @@ func TestMemoryLimitWaitFor(t *testing.T) { lock.Unlock() }() - assert.Eventually(t, func() bool { + require.Eventually(t, func() bool { + lock.Lock() + defer lock.Unlock() + + return done + }, 10*time.Second, 1*time.Second) +} + +func TestGPUMemoryLimit(t *testing.T) { + lock := sync.Mutex{} + + lock.Lock() + done := false + lock.Unlock() + + go func() { + wg := sync.WaitGroup{} + wg.Add(1) + + l := NewLimiter(LimiterConfig{ + GPUMemory: 42, + OnLimit: func(float64, uint64, float64, float64, float64, uint64) { + wg.Done() + }, + }) + + l.Start(&psproc{}) + defer l.Stop() + + wg.Wait() + + lock.Lock() + done = true + lock.Unlock() + }() + + require.Eventually(t, func() bool { + lock.Lock() + defer lock.Unlock() + + return done + }, 2*time.Second, 100*time.Millisecond) +} + +func TestGPUMemoryLimitWaitFor(t *testing.T) { + lock := sync.Mutex{} + + lock.Lock() + done := false + lock.Unlock() + + go func() { + wg := sync.WaitGroup{} + wg.Add(1) + + l := NewLimiter(LimiterConfig{ + GPUMemory: 42, + WaitFor: 3 * time.Second, + OnLimit: func(float64, uint64, float64, float64, float64, uint64) { + wg.Done() + }, + }) + + l.Start(&psproc{}) + defer l.Stop() + + wg.Wait() + + lock.Lock() + done = true + 
lock.Unlock() + }() + + require.Eventually(t, func() bool { lock.Lock() defer lock.Unlock() @@ -189,7 +274,7 @@ func TestMemoryLimitSoftMode(t *testing.T) { l := NewLimiter(LimiterConfig{ Memory: 42, Mode: LimitModeSoft, - OnLimit: func(float64, uint64) { + OnLimit: func(float64, uint64, float64, float64, float64, uint64) { wg.Done() }, }) @@ -197,7 +282,7 @@ func TestMemoryLimitSoftMode(t *testing.T) { l.Start(&psproc{}) defer l.Stop() - l.Limit(false, true) + l.Limit(false, true, false) wg.Wait() @@ -206,7 +291,46 @@ func TestMemoryLimitSoftMode(t *testing.T) { lock.Unlock() }() - assert.Eventually(t, func() bool { + require.Eventually(t, func() bool { + lock.Lock() + defer lock.Unlock() + + return done + }, 2*time.Second, 100*time.Millisecond) +} + +func TestGPUMemoryLimitSoftMode(t *testing.T) { + lock := sync.Mutex{} + + lock.Lock() + done := false + lock.Unlock() + + go func() { + wg := sync.WaitGroup{} + wg.Add(1) + + l := NewLimiter(LimiterConfig{ + GPUMemory: 42, + Mode: LimitModeSoft, + OnLimit: func(float64, uint64, float64, float64, float64, uint64) { + wg.Done() + }, + }) + + l.Start(&psproc{}) + defer l.Stop() + + l.Limit(false, false, true) + + wg.Wait() + + lock.Lock() + done = true + lock.Unlock() + }() + + require.Eventually(t, func() bool { lock.Lock() defer lock.Unlock() diff --git a/process/process.go b/process/process.go index 0fe0d45f..430cb7e2 100644 --- a/process/process.go +++ b/process/process.go @@ -46,29 +46,32 @@ type Process interface { // Limit enables or disables CPU and memory limiting. CPU will be throttled // into the configured limit. If memory consumption is above the configured // limit, the process will be killed. - Limit(cpu, memory bool) error + Limit(cpu, memory, gpu bool) error } // Config is the configuration of a process type Config struct { - Binary string // Path to the ffmpeg binary. - Args []string // List of arguments for the binary. - Reconnect bool // Whether to restart the process if it exited. 
- ReconnectDelay time.Duration // Duration to wait before restarting the process. - StaleTimeout time.Duration // Kill the process after this duration if it doesn't produce any output. - Timeout time.Duration // Kill the process after this duration. - LimitCPU float64 // Kill the process if the CPU usage in percent is above this value. - LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value. - LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration. - LimitMode LimitMode // Select limiting mode - Scheduler Scheduler // A scheduler. - Parser Parser // A parser for the output of the process. - OnArgs func(args []string) []string // A callback which is called right before the process will start with the command args. - OnBeforeStart func() error // A callback which is called before the process will be started. If error is non-nil, the start will be refused. - OnStart func() // A callback which is called after the process started. - OnExit func(state string) // A callback which is called after the process exited with the exit state. - OnStateChange func(from, to string) // A callback which is called after a state changed. - Logger log.Logger + Binary string // Path to the ffmpeg binary. + Args []string // List of arguments for the binary. + Reconnect bool // Whether to restart the process if it exited. + ReconnectDelay time.Duration // Duration to wait before restarting the process. + StaleTimeout time.Duration // Kill the process after this duration if it doesn't produce any output. + Timeout time.Duration // Kill the process after this duration. + LimitCPU float64 // Kill the process if the CPU usage in percent is above this value, in percent 0-100 in hard mode, 0-100*ncpu in soft mode. + LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value. + LimitGPUUsage float64 // Kill the process if the GPU usage in percent is above this value, in percent 0-100. 
+ LimitGPUEncoder float64 // Kill the process if the GPU encoder usage in percent is above this value, in percent 0-100. + LimitGPUDecoder float64 // Kill the process if the GPU decoder usage in percent is above this value, in percent 0-100. + LimitGPUMemory uint64 // Kill the process if the GPU memory consumption in bytes is above this value. + LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration. + LimitMode LimitMode // Select limiting mode + Scheduler Scheduler // A scheduler. + Parser Parser // A parser for the output of the process. + OnBeforeStart func(args []string) ([]string, error) // A callback which is called before the process will be started. The string slice is the arguments of the command line. If error is non-nil, the start will be refused. + OnStart func() // A callback which is called after the process started. + OnExit func(state string) // A callback which is called after the process exited with the exit state. + OnStateChange func(from, to string) // A callback which is called after a state changed. + Logger log.Logger } // Status represents the current status of a process @@ -81,20 +84,47 @@ type Status struct { Time time.Time // Time is the time of the last change of the state CommandArgs []string // Currently running command arguments LimitMode string // The limiting mode - CPU struct { - NCPU float64 // Number of logical CPUs - Current float64 // Currently consumed CPU in percent - Average float64 // Average consumed CPU in percent - Max float64 // Max. consumed CPU in percent - Limit float64 // Usage limit in percent - IsThrottling bool // Whether the CPU is currently limited - } // Used CPU in percent - Memory struct { - Current uint64 // Currently consumed memory in bytes - Average float64 // Average consumed memory in bytes - Max uint64 // Max. 
consumed memory in bytes - Limit uint64 // Usage limit in bytes - } // Used memory in bytes + CPU StatusCPU // CPU consumption in percent + Memory StatusMemory // Memory consumption in bytes + GPU StatusGPU // GPU consumption +} + +type StatusCPU struct { + NCPU float64 // Number of logical CPUs + Current float64 // Currently consumed CPU in percent + Average float64 // Average consumed CPU in percent + Max float64 // Max. consumed CPU in percent + Limit float64 // Usage limit in percent + IsThrottling bool // Whether the CPU is currently limited +} + +type StatusMemory struct { + Current uint64 // Currently consumed memory in bytes + Average uint64 // Average consumed memory in bytes + Max uint64 // Max. consumed memory in bytes + Limit uint64 // Usage limit in bytes +} + +type StatusGPUMemory struct { + Current uint64 // Currently consumed memory in bytes + Average uint64 // Average consumed memory in bytes + Max uint64 // Max. consumed memory in bytes + Limit uint64 // Usage limit in bytes +} + +type StatusGPUUsage struct { + Current float64 // Currently consumed GPU usage in percent + Average float64 // Average consumed GPU usage in percent + Max float64 // Max. 
consumed GPU usage in percent + Limit float64 // Usage limit in percent +} + +type StatusGPU struct { + Index int + Memory StatusGPUMemory // GPU memory consumption + Usage StatusGPUUsage // GPU usage in percent + Encoder StatusGPUUsage // GPU encoder usage in percent + Decoder StatusGPUUsage // GPU decoder usage in percent } // States @@ -206,8 +236,7 @@ type process struct { logger log.Logger debuglogger log.Logger callbacks struct { - onArgs func(args []string) []string - onBeforeStart func() error + onBeforeStart func(args []string) ([]string, error) onStart func() onExit func(state string) onStateChange func(from, to string) @@ -263,28 +292,35 @@ func New(config Config) (Process, error) { p.stale.last = time.Now() p.stale.timeout = config.StaleTimeout - p.callbacks.onArgs = config.OnArgs p.callbacks.onBeforeStart = config.OnBeforeStart p.callbacks.onStart = config.OnStart p.callbacks.onExit = config.OnExit p.callbacks.onStateChange = config.OnStateChange p.limits = NewLimiter(LimiterConfig{ - CPU: config.LimitCPU, - Memory: config.LimitMemory, - WaitFor: config.LimitDuration, - Mode: config.LimitMode, - Logger: p.logger.WithComponent("ProcessLimiter"), - OnLimit: func(cpu float64, memory uint64) { + CPU: config.LimitCPU, + Memory: config.LimitMemory, + GPUUsage: config.LimitGPUUsage, + GPUEncoder: config.LimitGPUEncoder, + GPUDecoder: config.LimitGPUDecoder, + GPUMemory: config.LimitGPUMemory, + WaitFor: config.LimitDuration, + Mode: config.LimitMode, + Logger: p.logger.WithComponent("ProcessLimiter"), + OnLimit: func(cpu float64, memory uint64, gpuusage, gpuencoder, gpudecoder float64, gpumemory uint64) { if !p.isRunning() { return } p.logger.WithFields(log.Fields{ - "cpu": cpu, - "memory": memory, + "cpu": cpu, + "memory": memory, + "gpuusage": gpuusage, + "gpuencoder": gpuencoder, + "gpudecoder": gpudecoder, + "gpumemmory": gpumemory, }).Warn().Log("Killed because limits are exceeded") - p.Kill(false, fmt.Sprintf("Killed because limits are exceeded (mode: 
%s, tolerance: %s): %.2f (%.2f) CPU, %d (%d) bytes memory", config.LimitMode.String(), config.LimitDuration.String(), cpu, config.LimitCPU, memory, config.LimitMemory)) + p.Kill(false, fmt.Sprintf("Killed because limits are exceeded (mode: %s, tolerance: %s): %.2f (%.2f) CPU, %d (%d) bytes memory, %.2f/%.2f/%.2f (%.2f) GPU usage, %d (%d) bytes GPU memory", config.LimitMode.String(), config.LimitDuration.String(), cpu, config.LimitCPU, memory, config.LimitMemory, gpuusage, gpuencoder, gpudecoder, config.LimitGPUUsage, gpumemory, config.LimitGPUMemory)) }, }) @@ -467,8 +503,47 @@ func (p *process) Status() Status { Duration: time.Since(stateTime), Time: stateTime, LimitMode: p.limits.Mode().String(), - CPU: usage.CPU, - Memory: usage.Memory, + CPU: StatusCPU{ + NCPU: usage.CPU.NCPU, + Current: usage.CPU.Current, + Average: usage.CPU.Average, + Max: usage.CPU.Max, + Limit: usage.CPU.Limit, + IsThrottling: usage.CPU.IsThrottling, + }, + Memory: StatusMemory{ + Current: usage.Memory.Current, + Average: uint64(usage.Memory.Average), + Max: usage.Memory.Max, + Limit: usage.Memory.Limit, + }, + GPU: StatusGPU{ + Index: usage.GPU.Index, + Memory: StatusGPUMemory{ + Current: usage.GPU.Memory.Current, + Average: uint64(usage.GPU.Memory.Average), + Max: usage.GPU.Memory.Max, + Limit: usage.GPU.Memory.Limit, + }, + Usage: StatusGPUUsage{ + Current: usage.GPU.Usage.Current, + Average: usage.GPU.Usage.Average, + Max: usage.GPU.Usage.Max, + Limit: usage.GPU.Usage.Limit, + }, + Encoder: StatusGPUUsage{ + Current: usage.GPU.Encoder.Current, + Average: usage.GPU.Encoder.Average, + Max: usage.GPU.Encoder.Max, + Limit: usage.GPU.Encoder.Limit, + }, + Decoder: StatusGPUUsage{ + Current: usage.GPU.Decoder.Current, + Average: usage.GPU.Decoder.Average, + Max: usage.GPU.Decoder.Max, + Limit: usage.GPU.Decoder.Limit, + }, + }, } s.CommandArgs = make([]string, len(p.args)) @@ -488,7 +563,7 @@ func (p *process) IsRunning() bool { return p.isRunning() } -func (p *process) Limit(cpu, memory 
bool) error { +func (p *process) Limit(cpu, memory, gpu bool) error { if !p.isRunning() { return nil } @@ -498,11 +573,12 @@ func (p *process) Limit(cpu, memory bool) error { } p.logger.Warn().WithFields(log.Fields{ - "limit_cpu": cpu, - "limit_memory": memory, + "limit_cpu": cpu, + "limit_memory": memory, + "limit_gpumemory": gpu, }).Log("Limiter triggered") - return p.limits.Limit(cpu, memory) + return p.limits.Limit(cpu, memory, gpu) } // Start will start the process and sets the order to "start". If the @@ -559,11 +635,21 @@ func (p *process) start() error { args := p.args - if p.callbacks.onArgs != nil { + if p.callbacks.onBeforeStart != nil { args = make([]string, len(p.args)) copy(args, p.args) - args = p.callbacks.onArgs(args) + args, err = p.callbacks.onBeforeStart(args) + if err != nil { + p.setState(stateFailed) + + p.parser.Parse([]byte(err.Error())) + p.logger.WithError(err).Error().Log("Starting failed") + + p.reconnect(p.delay(stateFailed)) + + return err + } } p.cmd = exec.Command(p.binary, args...) 
@@ -582,19 +668,6 @@ func (p *process) start() error { return err } - if p.callbacks.onBeforeStart != nil { - if err := p.callbacks.onBeforeStart(); err != nil { - p.setState(stateFailed) - - p.parser.Parse([]byte(err.Error())) - p.logger.WithError(err).Error().Log("Starting failed") - - p.reconnect(p.delay(stateFailed)) - - return err - } - } - if err := p.cmd.Start(); err != nil { p.setState(stateFailed) diff --git a/process/process_test.go b/process/process_test.go index 11c669b9..6ddba58a 100644 --- a/process/process_test.go +++ b/process/process_test.go @@ -606,21 +606,15 @@ func TestProcessCallbacks(t *testing.T) { "2", }, Reconnect: false, - OnArgs: func(a []string) []string { - lock.Lock() - defer lock.Unlock() - - args = make([]string, len(a)) - copy(args, a) - return a - }, - OnBeforeStart: func() error { + OnBeforeStart: func(a []string) ([]string, error) { lock.Lock() defer lock.Unlock() onBeforeStart = true - return nil + args = make([]string, len(a)) + copy(args, a) + return a, nil }, OnStart: func() { lock.Lock() @@ -681,8 +675,8 @@ func TestProcessCallbacksOnBeforeStart(t *testing.T) { Parser: parser, Reconnect: true, ReconnectDelay: 10 * time.Second, - OnBeforeStart: func() error { - return fmt.Errorf("no, not now") + OnBeforeStart: func(a []string) ([]string, error) { + return a, fmt.Errorf("no, not now") }, }) require.NoError(t, err) diff --git a/psutil/gpu/gpu.go b/psutil/gpu/gpu.go index 7feb19bd..cb8dcf00 100644 --- a/psutil/gpu/gpu.go +++ b/psutil/gpu/gpu.go @@ -3,21 +3,25 @@ package gpu import "errors" type Process struct { - PID int32 - Memory uint64 + PID int32 + Index int + Memory uint64 // bytes + Usage float64 // percent 0-100 + Encoder float64 // percent 0-100 + Decoder float64 // percent 0-100 } type Stats struct { + ID string Name string Architecture string - MemoryTotal uint64 - MemoryUsed uint64 + MemoryTotal uint64 // bytes + MemoryUsed uint64 // bytes - Usage float64 - MemoryUsage float64 - EncoderUsage float64 - DecoderUsage 
float64 + Usage float64 // percent 0-100 + Encoder float64 // percent 0-100 + Decoder float64 // percent 0-100 Process []Process @@ -25,9 +29,17 @@ type Stats struct { } type GPU interface { + // Count returns the number of GPU in the system. Count() (int, error) + + // Stats returns current GPU stats. Stats() ([]Stats, error) + + // Process returns a Process. Process(pid int32) (Process, error) + + // Close stops all GPU collection processes + Close() } var ErrProcessNotFound = errors.New("process not found") diff --git a/psutil/gpu/nvidia/fixtures/process.txt b/psutil/gpu/nvidia/fixtures/process.txt new file mode 100644 index 00000000..55d7bcf4 --- /dev/null +++ b/psutil/gpu/nvidia/fixtures/process.txt @@ -0,0 +1,54 @@ +# gpu pid type sm mem enc dec fb command +# Idx # C/G % % % % MB name + 0 7372 C 2 0 2 - 136 ffmpeg + 0 12176 C 5 2 3 7 782 ffmpeg + 0 20035 C 8 2 4 1 1145 ffmpeg + 0 20141 C 2 1 1 3 429 ffmpeg + 0 29591 C 2 1 - 2 435 ffmpeg + 0 7372 C 2 0 - - 136 ffmpeg + 0 12176 C 8 3 7 9 782 ffmpeg + 0 20035 C 8 2 3 1 1145 ffmpeg + 0 20141 C - - 1 1 429 ffmpeg + 0 29591 C 3 1 - 2 435 ffmpeg + 0 7372 C 2 1 1 - 136 ffmpeg + 0 12176 C 5 1 5 7 782 ffmpeg + 0 20035 C 8 3 1 4 1145 ffmpeg + 0 20141 C 2 0 1 - 429 ffmpeg + 0 29591 C 2 0 1 3 435 ffmpeg + 0 7372 C 2 0 - - 136 ffmpeg + 0 12176 C 5 1 5 3 782 ffmpeg + 0 20035 C 8 2 5 4 1145 ffmpeg + 0 20141 C 3 1 - 5 429 ffmpeg + 0 29591 C 2 0 - 1 435 ffmpeg + 0 7372 C 2 1 - - 136 ffmpeg + 0 12176 C 10 3 6 8 782 ffmpeg + 0 20035 C 3 1 1 1 1145 ffmpeg + 0 20141 C - - 4 1 429 ffmpeg + 0 29591 C 5 2 - 2 435 ffmpeg + 0 7372 C 5 1 2 - 136 ffmpeg + 0 12176 C 6 2 4 7 782 ffmpeg + 0 20035 C - - - - 1145 ffmpeg + 0 20141 C 5 1 1 3 429 ffmpeg + 0 29591 C 5 2 2 4 435 ffmpeg + 0 7372 C - - 1 - 136 ffmpeg + 0 12176 C 7 2 3 4 782 ffmpeg + 0 20035 C 2 0 - 1 1145 ffmpeg + 0 20141 C 7 2 4 4 429 ffmpeg + 0 29591 C 5 1 2 3 435 ffmpeg + 0 7372 C 2 0 1 - 136 ffmpeg + 0 12176 C 9 3 3 6 782 ffmpeg + 0 20035 C 2 1 - 1 1145 ffmpeg + 0 20141 C 4 1 4 
5 429 ffmpeg + 0 29591 C 2 0 2 1 435 ffmpeg + 0 7372 C - - - - 136 ffmpeg + 0 12176 C 10 3 4 8 782 ffmpeg + 0 20035 C 4 1 2 1 1145 ffmpeg + 0 20141 C 7 2 3 3 429 ffmpeg +# gpu pid type sm mem enc dec fb command +# Idx # C/G % % % % MB name + 0 29591 C - - 1 1 435 ffmpeg + 0 7372 C 2 0 2 - 136 ffmpeg + 0 12176 C 7 2 2 6 782 ffmpeg + 0 20035 C 7 2 4 3 1145 ffmpeg + 0 20141 C 5 1 1 3 429 ffmpeg + 0 29591 C - - 1 1 435 ffmpeg \ No newline at end of file diff --git a/psutil/gpu/nvidia/fixtures/data1.xml b/psutil/gpu/nvidia/fixtures/query1.xml similarity index 100% rename from psutil/gpu/nvidia/fixtures/data1.xml rename to psutil/gpu/nvidia/fixtures/query1.xml diff --git a/psutil/gpu/nvidia/fixtures/data2.xml b/psutil/gpu/nvidia/fixtures/query2.xml similarity index 98% rename from psutil/gpu/nvidia/fixtures/data2.xml rename to psutil/gpu/nvidia/fixtures/query2.xml index cd45d707..4d93cac0 100644 --- a/psutil/gpu/nvidia/fixtures/data2.xml +++ b/psutil/gpu/nvidia/fixtures/query2.xml @@ -438,6 +438,18 @@ + + 10131 + C + ffmpeg + 389 MiB + + + 13597 + C + ffmpeg + 1054 MiB + @@ -879,6 +891,12 @@ + + 16870 + C + ffmpeg + 549 MiB + diff --git a/psutil/gpu/nvidia/fixtures/data3.xml b/psutil/gpu/nvidia/fixtures/query3.xml similarity index 100% rename from psutil/gpu/nvidia/fixtures/data3.xml rename to psutil/gpu/nvidia/fixtures/query3.xml diff --git a/psutil/gpu/nvidia/nvidia.go b/psutil/gpu/nvidia/nvidia.go index ba45e2fa..98ad1520 100644 --- a/psutil/gpu/nvidia/nvidia.go +++ b/psutil/gpu/nvidia/nvidia.go @@ -6,6 +6,9 @@ import ( "encoding/xml" "fmt" "os/exec" + "regexp" + "slices" + "strconv" "sync" "time" @@ -47,11 +50,19 @@ func (u *Utilization) UnmarshalText(text []byte) error { } type Process struct { - PID int32 `xml:"pid"` - Memory Megabytes `xml:"used_memory"` + Index int + PID int32 + Memory uint64 // bytes + + Usage float64 // percent 0-100 + Encoder float64 // percent 0-100 + Decoder float64 // percent 0-100 + + lastSeen time.Time } type GPUStats struct { + ID string 
`xml:"id,attr"` Name string `xml:"product_name"` Architecture string `xml:"product_architecture"` @@ -59,31 +70,17 @@ type GPUStats struct { MemoryUsed Megabytes `xml:"fb_memory_usage>used"` Usage Utilization `xml:"utilization>gpu_util"` - MemoryUsage Utilization `xml:"utilization>memory_util"` - EncoderUsage Utilization `xml:"utilization>encoder_util"` - DecoderUsage Utilization `xml:"utilization>decoder_util"` - - Process []Process `xml:"processes>process_info"` + UsageEncoder Utilization `xml:"utilization>encoder_util"` + UsageDecoder Utilization `xml:"utilization>decoder_util"` } type Stats struct { GPU []GPUStats `xml:"gpu"` } -func parse(data []byte) (Stats, error) { - nv := Stats{} - - err := xml.Unmarshal(data, &nv) - if err != nil { - return nv, fmt.Errorf("parsing report: %w", err) - } - - return nv, nil -} - type nvidia struct { - cmd *exec.Cmd - wr *writer + wrQuery *writerQuery + wrProcess *writerProcess lock sync.RWMutex cancel context.CancelFunc @@ -97,33 +94,33 @@ type dummy struct{} func (d *dummy) Count() (int, error) { return 0, nil } func (d *dummy) Stats() ([]gpu.Stats, error) { return nil, nil } func (d *dummy) Process(pid int32) (gpu.Process, error) { return gpu.Process{}, gpu.ErrProcessNotFound } +func (d *dummy) Close() {} -type writer struct { - buf bytes.Buffer - ch chan Stats +type writerQuery struct { + buf bytes.Buffer + ch chan Stats + terminator []byte } -var terminator = []byte("\n") - -func (w *writer) Write(data []byte) (int, error) { +func (w *writerQuery) Write(data []byte) (int, error) { n, err := w.buf.Write(data) if err != nil { return n, err } for { - idx := bytes.Index(w.buf.Bytes(), terminator) + idx := bytes.Index(w.buf.Bytes(), w.terminator) if idx == -1 { break } - content := make([]byte, idx+len(terminator)) + content := make([]byte, idx+len(w.terminator)) n, err := w.buf.Read(content) if err != nil || n != len(content) { break } - s, err := parse(content) + s, err := w.parse(content) if err != nil { continue } @@ 
-134,19 +131,132 @@ func (w *writer) Write(data []byte) (int, error) { return n, nil } +func (w *writerQuery) parse(data []byte) (Stats, error) { + nv := Stats{} + + err := xml.Unmarshal(data, &nv) + if err != nil { + return nv, fmt.Errorf("parsing report: %w", err) + } + + return nv, nil +} + +type writerProcess struct { + buf bytes.Buffer + ch chan Process + re *regexp.Regexp + terminator []byte +} + +func (w *writerProcess) Write(data []byte) (int, error) { + n, err := w.buf.Write(data) + if err != nil { + return n, err + } + + for { + idx := bytes.Index(w.buf.Bytes(), w.terminator) + if idx == -1 { + break + } + + content := make([]byte, idx+len(w.terminator)) + n, err := w.buf.Read(content) + if err != nil || n != len(content) { + break + } + + s, err := w.parse(content) + if err != nil { + continue + } + + w.ch <- s + } + + return n, nil +} + +func (w *writerProcess) parse(data []byte) (Process, error) { + p := Process{} + + if len(data) == 0 { + return p, fmt.Errorf("empty line") + } + + if data[0] == '#' { + return p, fmt.Errorf("comment") + } + + matches := w.re.FindStringSubmatch(string(data)) + if matches == nil { + return p, fmt.Errorf("no matches found") + } + + if len(matches) != 7 { + return p, fmt.Errorf("not the expected number of matches found") + } + + if d, err := strconv.ParseInt(matches[1], 10, 0); err == nil { + p.Index = int(d) + } + + if d, err := strconv.ParseInt(matches[2], 10, 32); err == nil { + p.PID = int32(d) + } + + if matches[3][0] != '-' { + if d, err := strconv.ParseFloat(matches[3], 64); err == nil { + p.Usage = d + } + } + + if matches[4][0] != '-' { + if d, err := strconv.ParseFloat(matches[4], 64); err == nil { + p.Encoder = d + } + } + + if matches[5][0] != '-' { + if d, err := strconv.ParseFloat(matches[5], 64); err == nil { + p.Decoder = d + } + } + + if d, err := strconv.ParseUint(matches[6], 10, 64); err == nil { + p.Memory = d * 1024 * 1024 + } + + return p, nil +} + func New(path string) gpu.GPU { if len(path) == 0 { 
path = "nvidia-smi" } - _, err := exec.LookPath(path) + path, err := exec.LookPath(path) if err != nil { return &dummy{} } n := &nvidia{ - wr: &writer{ - ch: make(chan Stats, 1), + wrQuery: &writerQuery{ + ch: make(chan Stats, 1), + terminator: []byte("\n"), + }, + wrProcess: &writerProcess{ + ch: make(chan Process, 32), + // # gpu pid type sm mem enc dec fb command + // # Idx # C/G % % % % MB name + // 0 7372 C 2 0 2 - 136 ffmpeg + // 0 12176 C 5 2 3 7 782 ffmpeg + // 0 20035 C 8 2 4 1 1145 ffmpeg + // 0 20141 C 2 1 1 3 429 ffmpeg + // 0 29591 C 2 1 - 2 435 ffmpeg + re: regexp.MustCompile(`^\s*([0-9]+)\s+([0-9]+)\s+[A-Z]\s+([0-9-]+)\s+[0-9-]+\s+([0-9-]+)\s+([0-9-]+)\s+([0-9]+).*`), + terminator: []byte("\n"), }, process: map[int32]Process{}, } @@ -154,7 +264,8 @@ func New(path string) gpu.GPU { ctx, cancel := context.WithCancel(context.Background()) n.cancel = cancel - go n.runner(ctx, path) + go n.runnerQuery(ctx, path) + go n.runnerProcess(ctx, path) go n.reader(ctx) return n @@ -165,13 +276,18 @@ func (n *nvidia) reader(ctx context.Context) { select { case <-ctx.Done(): return - case stats := <-n.wr.ch: + case stats := <-n.wrQuery.ch: n.lock.Lock() n.stats = stats - n.process = map[int32]Process{} - for _, g := range n.stats.GPU { - for _, p := range g.Process { - n.process[p.PID] = p + n.lock.Unlock() + case process := <-n.wrProcess.ch: + process.lastSeen = time.Now() + n.lock.Lock() + n.process[process.PID] = process + + for pid, p := range n.process { + if time.Since(p.lastSeen) > 11*time.Second { + delete(n.process, pid) } } n.lock.Unlock() @@ -179,11 +295,11 @@ func (n *nvidia) reader(ctx context.Context) { } } -func (n *nvidia) runner(ctx context.Context, path string) { +func (n *nvidia) runnerQuery(ctx context.Context, path string) { for { - n.cmd = exec.Command(path, "-q", "-x", "-l", "1") - n.cmd.Stdout = n.wr - err := n.cmd.Start() + cmd := exec.CommandContext(ctx, path, "-q", "-x", "-l", "1") + cmd.Stdout = n.wrQuery + err := cmd.Start() if err != 
nil { n.lock.Lock() n.err = err @@ -193,7 +309,35 @@ func (n *nvidia) runner(ctx context.Context, path string) { continue } - err = n.cmd.Wait() + err = cmd.Wait() + + n.lock.Lock() + n.err = err + n.lock.Unlock() + + select { + case <-ctx.Done(): + return + default: + } + } +} + +func (n *nvidia) runnerProcess(ctx context.Context, path string) { + for { + cmd := exec.CommandContext(ctx, path, "pmon", "-s", "um", "-d", "5") + cmd.Stdout = n.wrProcess + err := cmd.Start() + if err != nil { + n.lock.Lock() + n.err = err + n.lock.Unlock() + + time.Sleep(3 * time.Second) + continue + } + + err = cmd.Wait() n.lock.Lock() n.err = err @@ -219,39 +363,55 @@ func (n *nvidia) Count() (int, error) { } func (n *nvidia) Stats() ([]gpu.Stats, error) { - s := []gpu.Stats{} + stats := []gpu.Stats{} n.lock.RLock() defer n.lock.RUnlock() if n.err != nil { - return s, n.err + return stats, n.err } for _, nv := range n.stats.GPU { - stats := gpu.Stats{ + s := gpu.Stats{ + ID: nv.ID, Name: nv.Name, Architecture: nv.Architecture, MemoryTotal: uint64(nv.MemoryTotal), MemoryUsed: uint64(nv.MemoryUsed), Usage: float64(nv.Usage), - MemoryUsage: float64(nv.MemoryUsage), - EncoderUsage: float64(nv.EncoderUsage), - DecoderUsage: float64(nv.DecoderUsage), + Encoder: float64(nv.UsageEncoder), + Decoder: float64(nv.UsageDecoder), Process: []gpu.Process{}, } - for _, p := range nv.Process { - stats.Process = append(stats.Process, gpu.Process{ - PID: p.PID, - Memory: uint64(p.Memory), - }) - } - - s = append(s, stats) + stats = append(stats, s) } - return s, nil + for _, p := range n.process { + if p.Index >= len(stats) { + continue + } + + stats[p.Index].Process = append(stats[p.Index].Process, gpu.Process{ + PID: p.PID, + Index: p.Index, + Memory: p.Memory, + Usage: p.Usage, + Encoder: p.Encoder, + Decoder: p.Decoder, + }) + } + + for i := range stats { + p := stats[i].Process + slices.SortFunc(p, func(a, b gpu.Process) int { + return int(a.PID - b.PID) + }) + stats[i].Process = p + } + + return 
stats, nil } func (n *nvidia) Process(pid int32) (gpu.Process, error) { @@ -259,14 +419,18 @@ func (n *nvidia) Process(pid int32) (gpu.Process, error) { defer n.lock.RUnlock() p, hasProcess := n.process[pid] - if !hasProcess { - return gpu.Process{}, gpu.ErrProcessNotFound + if hasProcess { + return gpu.Process{ + PID: p.PID, + Index: p.Index, + Memory: p.Memory, + Usage: p.Usage, + Encoder: p.Encoder, + Decoder: p.Decoder, + }, nil } - return gpu.Process{ - PID: p.PID, - Memory: uint64(p.Memory), - }, nil + return gpu.Process{Index: -1}, gpu.ErrProcessNotFound } func (n *nvidia) Close() { @@ -279,6 +443,4 @@ func (n *nvidia) Close() { n.cancel() n.cancel = nil - - n.cmd.Process.Kill() } diff --git a/psutil/gpu/nvidia/nvidia_test.go b/psutil/gpu/nvidia/nvidia_test.go index f18310b2..51954eb8 100644 --- a/psutil/gpu/nvidia/nvidia_test.go +++ b/psutil/gpu/nvidia/nvidia_test.go @@ -1,102 +1,430 @@ package nvidia import ( + "bytes" "os" + "regexp" + "sync" "testing" + "time" + "github.com/datarhei/core/v16/internal/testhelper" + "github.com/datarhei/core/v16/psutil/gpu" "github.com/stretchr/testify/require" ) -func TestParseNV(t *testing.T) { - data, err := os.ReadFile("./fixtures/data1.xml") +func TestParseQuery(t *testing.T) { + data, err := os.ReadFile("./fixtures/query1.xml") require.NoError(t, err) - nv, err := parse(data) + wr := &writerQuery{} + + nv, err := wr.parse(data) require.NoError(t, err) require.Equal(t, Stats{ GPU: []GPUStats{ { + ID: "00000000:01:00.0", Name: "NVIDIA GeForce GTX 1080", Architecture: "Pascal", MemoryTotal: 8119 * 1024 * 1024, MemoryUsed: 918 * 1024 * 1024, Usage: 15, - MemoryUsage: 7, - EncoderUsage: 3, - DecoderUsage: 0, - Process: []Process{ - { - PID: 18179, - Memory: 916 * 1024 * 1024, - }, - }, + UsageEncoder: 3, + UsageDecoder: 0, }, }, }, nv) - data, err = os.ReadFile("./fixtures/data2.xml") + data, err = os.ReadFile("./fixtures/query2.xml") require.NoError(t, err) - nv, err = parse(data) + nv, err = wr.parse(data) 
require.NoError(t, err) require.Equal(t, Stats{ GPU: []GPUStats{ { + ID: "00000000:01:00.0", Name: "NVIDIA L4", Architecture: "Ada Lovelace", MemoryTotal: 23034 * 1024 * 1024, MemoryUsed: 1 * 1024 * 1024, Usage: 2, - MemoryUsage: 0, - EncoderUsage: 0, - DecoderUsage: 0, + UsageEncoder: 0, + UsageDecoder: 0, }, { + ID: "00000000:C1:00.0", Name: "NVIDIA L4", Architecture: "Ada Lovelace", MemoryTotal: 23034 * 1024 * 1024, MemoryUsed: 1 * 1024 * 1024, Usage: 3, - MemoryUsage: 0, - EncoderUsage: 0, - DecoderUsage: 0, + UsageEncoder: 0, + UsageDecoder: 0, }, }, }, nv) - data, err = os.ReadFile("./fixtures/data3.xml") + data, err = os.ReadFile("./fixtures/query3.xml") require.NoError(t, err) - nv, err = parse(data) + nv, err = wr.parse(data) require.NoError(t, err) require.Equal(t, Stats{ GPU: []GPUStats{ { + ID: "00000000:01:00.0", Name: "GeForce GTX 1080", MemoryTotal: 8119 * 1024 * 1024, MemoryUsed: 2006 * 1024 * 1024, Usage: 32, - MemoryUsage: 11, - EncoderUsage: 17, - DecoderUsage: 25, - Process: []Process{ - { - PID: 10131, - Memory: 389 * 1024 * 1024, - }, - { - PID: 13597, - Memory: 1054 * 1024 * 1024, - }, - { - PID: 16870, - Memory: 549 * 1024 * 1024, - }, - }, + UsageEncoder: 17, + UsageDecoder: 25, }, }, }, nv) } + +func TestParseProcess(t *testing.T) { + data, err := os.ReadFile("./fixtures/process.txt") + require.NoError(t, err) + + wr := &writerProcess{ + re: regexp.MustCompile(`^\s*([0-9]+)\s+([0-9]+)\s+[A-Z]\s+([0-9-]+)\s+[0-9-]+\s+([0-9-]+)\s+([0-9-]+)\s+([0-9]+).*`), + } + + lines := bytes.Split(data, []byte("\n")) + process := map[int32]Process{} + + for _, line := range lines { + p, err := wr.parse(line) + if err != nil { + continue + } + + process[p.PID] = p + } + + require.Equal(t, map[int32]Process{ + 7372: { + Index: 0, + PID: 7372, + Memory: 136 * 1024 * 1024, + Usage: 2, + Encoder: 2, + Decoder: 0, + }, + 12176: { + Index: 0, + PID: 12176, + Memory: 782 * 1024 * 1024, + Usage: 7, + Encoder: 2, + Decoder: 6, + }, + 20035: { + Index: 0, + PID: 
20035, + Memory: 1145 * 1024 * 1024, + Usage: 7, + Encoder: 4, + Decoder: 3, + }, + 20141: { + Index: 0, + PID: 20141, + Memory: 429 * 1024 * 1024, + Usage: 5, + Encoder: 1, + Decoder: 3, + }, + 29591: { + Index: 0, + PID: 29591, + Memory: 435 * 1024 * 1024, + Usage: 0, + Encoder: 1, + Decoder: 1, + }, + }, process) +} + +func TestWriterQuery(t *testing.T) { + data, err := os.ReadFile("./fixtures/query2.xml") + require.NoError(t, err) + + wr := &writerQuery{ + ch: make(chan Stats, 1), + terminator: []byte(""), + } + + stats := Stats{} + wg := sync.WaitGroup{} + wg.Add(1) + + go func() { + defer wg.Done() + + for s := range wr.ch { + stats = s + } + }() + + _, err = wr.Write(data) + require.NoError(t, err) + + close(wr.ch) + + wg.Wait() + + require.Equal(t, Stats{ + GPU: []GPUStats{ + { + ID: "00000000:01:00.0", + Name: "NVIDIA L4", + Architecture: "Ada Lovelace", + MemoryTotal: 23034 * 1024 * 1024, + MemoryUsed: 1 * 1024 * 1024, + Usage: 2, + UsageEncoder: 0, + UsageDecoder: 0, + }, + { + ID: "00000000:C1:00.0", + Name: "NVIDIA L4", + Architecture: "Ada Lovelace", + MemoryTotal: 23034 * 1024 * 1024, + MemoryUsed: 1 * 1024 * 1024, + Usage: 3, + UsageEncoder: 0, + UsageDecoder: 0, + }, + }, + }, stats) +} + +func TestWriterProcess(t *testing.T) { + data, err := os.ReadFile("./fixtures/process.txt") + require.NoError(t, err) + + wr := &writerProcess{ + ch: make(chan Process, 32), + re: regexp.MustCompile(`^\s*([0-9]+)\s+([0-9]+)\s+[A-Z]\s+([0-9-]+)\s+[0-9-]+\s+([0-9-]+)\s+([0-9-]+)\s+([0-9]+).*`), + terminator: []byte("\n"), + } + + process := map[int32]Process{} + wg := sync.WaitGroup{} + wg.Add(1) + + go func() { + defer wg.Done() + for p := range wr.ch { + process[p.PID] = p + } + }() + + _, err = wr.Write(data) + require.NoError(t, err) + + close(wr.ch) + + wg.Wait() + + require.Equal(t, map[int32]Process{ + 7372: { + Index: 0, + PID: 7372, + Memory: 136 * 1024 * 1024, + Usage: 2, + Encoder: 2, + Decoder: 0, + }, + 12176: { + Index: 0, + PID: 12176, + Memory: 782 
* 1024 * 1024, + Usage: 7, + Encoder: 2, + Decoder: 6, + }, + 20035: { + Index: 0, + PID: 20035, + Memory: 1145 * 1024 * 1024, + Usage: 7, + Encoder: 4, + Decoder: 3, + }, + 20141: { + Index: 0, + PID: 20141, + Memory: 429 * 1024 * 1024, + Usage: 5, + Encoder: 1, + Decoder: 3, + }, + 29591: { + Index: 0, + PID: 29591, + Memory: 435 * 1024 * 1024, + Usage: 0, + Encoder: 1, + Decoder: 1, + }, + }, process) +} + +func TestNvidiaGPUCount(t *testing.T) { + binary, err := testhelper.BuildBinary("nvidia-smi", "../../../internal/testhelper") + require.NoError(t, err, "Failed to build helper program") + + nv := New(binary) + + t.Cleanup(func() { + nv.Close() + }) + + _, ok := nv.(*dummy) + require.False(t, ok) + + require.Eventually(t, func() bool { + count, _ := nv.Count() + return count != 0 + }, 5*time.Second, time.Second) +} + +func TestNvidiaGPUStats(t *testing.T) { + binary, err := testhelper.BuildBinary("nvidia-smi", "../../../internal/testhelper") + require.NoError(t, err, "Failed to build helper program") + + nv := New(binary) + + t.Cleanup(func() { + nv.Close() + }) + + _, ok := nv.(*dummy) + require.False(t, ok) + + require.Eventually(t, func() bool { + stats, _ := nv.Stats() + + if len(stats) != 2 { + return false + } + + if len(stats[0].Process) != 3 { + return false + } + + if len(stats[1].Process) != 2 { + return false + } + + return true + }, 5*time.Second, time.Second) + + stats, err := nv.Stats() + require.NoError(t, err) + require.Equal(t, []gpu.Stats{ + { + ID: "00000000:01:00.0", + Name: "NVIDIA L4", + Architecture: "Ada Lovelace", + MemoryTotal: 23034 * 1024 * 1024, + MemoryUsed: 1 * 1024 * 1024, + Usage: 2, + Encoder: 0, + Decoder: 0, + Process: []gpu.Process{ + { + Index: 0, + PID: 7372, + Memory: 136 * 1024 * 1024, + Usage: 2, + Encoder: 2, + Decoder: 0, + }, + { + Index: 0, + PID: 12176, + Memory: 782 * 1024 * 1024, + Usage: 5, + Encoder: 3, + Decoder: 7, + }, + { + Index: 0, + PID: 29591, + Memory: 435 * 1024 * 1024, + Usage: 2, + Encoder: 0, + 
Decoder: 2, + }, + }, + }, + { + ID: "00000000:C1:00.0", + Name: "NVIDIA L4", + Architecture: "Ada Lovelace", + MemoryTotal: 23034 * 1024 * 1024, + MemoryUsed: 1 * 1024 * 1024, + Usage: 3, + Encoder: 0, + Decoder: 0, + Process: []gpu.Process{ + { + Index: 1, + PID: 20035, + Memory: 1145 * 1024 * 1024, + Usage: 8, + Encoder: 4, + Decoder: 1, + }, + { + Index: 1, + PID: 20141, + Memory: 429 * 1024 * 1024, + Usage: 2, + Encoder: 1, + Decoder: 3, + }, + }, + }, + }, stats) +} + +func TestNvidiaGPUProcess(t *testing.T) { + binary, err := testhelper.BuildBinary("nvidia-smi", "../../../internal/testhelper") + require.NoError(t, err, "Failed to build helper program") + + nv := New(binary) + + t.Cleanup(func() { + nv.Close() + }) + + _, ok := nv.(*dummy) + require.False(t, ok) + + require.Eventually(t, func() bool { + _, err := nv.Process(12176) + return err == nil + }, 5*time.Second, time.Second) + + proc, err := nv.Process(12176) + require.NoError(t, err) + require.Equal(t, gpu.Process{ + Index: 0, + PID: 12176, + Memory: 782 * 1024 * 1024, + Usage: 5, + Encoder: 3, + Decoder: 7, + }, proc) +} diff --git a/psutil/process.go b/psutil/process.go index 0789f553..bec312ca 100644 --- a/psutil/process.go +++ b/psutil/process.go @@ -5,24 +5,28 @@ import ( "sync" "time" + "github.com/datarhei/core/v16/psutil/gpu/nvidia" psprocess "github.com/shirou/gopsutil/v3/process" ) type Process interface { - // CPUPercent returns the current CPU load for this process only. The values + // CPU returns the current CPU load for this process only. The values // are normed to the range of 0 to 100. - CPUPercent() (*CPUInfoStat, error) + CPU() (*CPUInfo, error) - // VirtualMemory returns the current memory usage in bytes of this process only. - VirtualMemory() (uint64, error) + // Memory returns the current memory usage in bytes of this process only. + Memory() (uint64, error) + + // GPU returns the current GPU memory in bytes and usage in percent (0-100) of this process only. 
+ GPU() (*GPUInfo, error) // Stop will stop collecting CPU and memory data for this process. Stop() - // Suspend will send SIGSTOP to the process + // Suspend will send SIGSTOP to the process. Suspend() error - // Resume will send SIGCONT to the process + // Resume will send SIGCONT to the process. Resume() error } @@ -142,7 +146,7 @@ func (p *process) Resume() error { return p.proc.Resume() } -func (p *process) CPUPercent() (*CPUInfoStat, error) { +func (p *process) CPU() (*CPUInfo, error) { var diff float64 for { @@ -167,7 +171,7 @@ func (p *process) CPUPercent() (*CPUInfoStat, error) { diff = p.statCurrentTime.Sub(p.statPreviousTime).Seconds() * p.ncpu } - s := &CPUInfoStat{ + s := &CPUInfo{ System: 0, User: 0, Idle: 0, @@ -186,9 +190,28 @@ func (p *process) CPUPercent() (*CPUInfoStat, error) { return s, nil } -func (p *process) VirtualMemory() (uint64, error) { +func (p *process) Memory() (uint64, error) { p.lock.RLock() defer p.lock.RUnlock() return p.memRSS, nil } + +func (p *process) GPU() (*GPUInfo, error) { + info := &GPUInfo{ + Index: -1, + } + + proc, err := nvidia.Default.Process(p.pid) + if err != nil { + return info, nil + } + + info.Index = proc.Index + info.MemoryUsed = proc.Memory + info.Usage = proc.Usage + info.Encoder = proc.Encoder + info.Decoder = proc.Decoder + + return info, nil +} diff --git a/psutil/psutil.go b/psutil/psutil.go index be5e1844..079e933d 100644 --- a/psutil/psutil.go +++ b/psutil/psutil.go @@ -47,29 +47,44 @@ func init() { DefaultUtil, _ = New("/sys/fs/cgroup") } -type MemoryInfoStat struct { +type DiskInfo struct { + Path string + Fstype string + Total uint64 + Used uint64 + InodesTotal uint64 + InodesUsed uint64 +} + +type MemoryInfo struct { Total uint64 // bytes Available uint64 // bytes Used uint64 // bytes } -type CPUInfoStat struct { +type NetworkInfo struct { + Name string // interface name + BytesSent uint64 // number of bytes sent + BytesRecv uint64 // number of bytes received +} + +type CPUInfo struct { System 
float64 // percent 0-100 User float64 // percent 0-100 Idle float64 // percent 0-100 Other float64 // percent 0-100 } -type GPUInfoStat struct { - Name string +type GPUInfo struct { + Index int // Index of the GPU + Name string // Name of the GPU (not populated for a specific process) - MemoryTotal uint64 // bytes + MemoryTotal uint64 // bytes (not populated for a specific process) MemoryUsed uint64 // bytes - Usage float64 // percent 0-100 - MemoryUsage float64 // percent 0-100 - EncoderUsage float64 // percent 0-100 - DecoderUsage float64 // percent 0-100 + Usage float64 // percent 0-100 + Encoder float64 // percent 0-100 + Decoder float64 // percent 0-100 } type cpuTimesStat struct { @@ -85,18 +100,23 @@ type Util interface { Stop() // CPUCounts returns the number of cores, either logical or physical. - CPUCounts(logical bool) (float64, error) + CPUCounts() (float64, error) - // GPUCounts returns the number of GPU cores. - GPUCounts() (float64, error) - - // CPUPercent returns the current CPU load in percent. The values range + // CPU returns the current CPU load in percent. The values range // from 0 to 100, independently of the number of logical cores. - CPUPercent() (*CPUInfoStat, error) - DiskUsage(path string) (*disk.UsageStat, error) - VirtualMemory() (*MemoryInfoStat, error) - NetIOCounters(pernic bool) ([]net.IOCountersStat, error) - GPUStats() ([]GPUInfoStat, error) + CPU() (*CPUInfo, error) + + // Disk returns the current usage of the partition specified by the path. + Disk(path string) (*DiskInfo, error) + + // Memory return the current memory usage. + Memory() (*MemoryInfo, error) + + // Network returns the current network interface statistics per network adapter. + Network() ([]NetworkInfo, error) + + // GPU return the current usage for each CPU. + GPU() ([]GPUInfo, error) // Process returns a process observer for a process with the given pid. 
Process(pid int32) (Process, error) @@ -120,7 +140,7 @@ type util struct { statPrevious cpuTimesStat statPreviousTime time.Time nTicks uint64 - mem MemoryInfoStat + mem MemoryInfo } // New returns a new util, it will be started automatically @@ -140,7 +160,7 @@ func New(root string) (Util, error) { if u.ncpu == 0 { var err error - u.ncpu, err = u.CPUCounts(true) + u.ncpu, err = u.CPUCounts() if err != nil { return nil, err } @@ -311,7 +331,7 @@ func (u *util) tickMemory(ctx context.Context, interval time.Duration) { } } -func (u *util) collectMemory() *MemoryInfoStat { +func (u *util) collectMemory() *MemoryInfo { stat, err := u.virtualMemory() if err != nil { return nil @@ -320,12 +340,12 @@ func (u *util) collectMemory() *MemoryInfoStat { return stat } -func (u *util) CPUCounts(logical bool) (float64, error) { +func (u *util) CPUCounts() (float64, error) { if u.hasCgroup && u.ncpu > 0 { return u.ncpu, nil } - ncpu, err := cpu.Counts(logical) + ncpu, err := cpu.Counts(true) if err != nil { return 0, err } @@ -333,18 +353,8 @@ func (u *util) CPUCounts(logical bool) (float64, error) { return float64(ncpu), nil } -func CPUCounts(logical bool) (float64, error) { - return DefaultUtil.CPUCounts(logical) -} - -func (u *util) GPUCounts() (float64, error) { - count, err := nvidia.Default.Count() - - return float64(count), err -} - -func GPUCounts() (float64, error) { - return DefaultUtil.GPUCounts() +func CPUCounts() (float64, error) { + return DefaultUtil.CPUCounts() } // cpuTimes returns the current cpu usage times in seconds. 
@@ -381,7 +391,7 @@ func (u *util) cpuTimes() (*cpuTimesStat, error) { return s, nil } -func (u *util) CPUPercent() (*CPUInfoStat, error) { +func (u *util) CPU() (*CPUInfo, error) { var total float64 for { @@ -406,7 +416,7 @@ func (u *util) CPUPercent() (*CPUInfoStat, error) { total = (u.statCurrent.total - u.statPrevious.total) } - s := &CPUInfoStat{ + s := &CPUInfo{ System: 0, User: 0, Idle: 100, @@ -429,8 +439,8 @@ func (u *util) CPUPercent() (*CPUInfoStat, error) { return s, nil } -func CPUPercent() (*CPUInfoStat, error) { - return DefaultUtil.CPUPercent() +func CPUPercent() (*CPUInfo, error) { + return DefaultUtil.CPU() } func (u *util) cgroupCPUTimes(version int) (*cpuTimesStat, error) { @@ -466,15 +476,29 @@ func (u *util) cgroupCPUTimes(version int) (*cpuTimesStat, error) { return info, nil } -func (u *util) DiskUsage(path string) (*disk.UsageStat, error) { - return disk.Usage(path) +func (u *util) Disk(path string) (*DiskInfo, error) { + usage, err := disk.Usage(path) + if err != nil { + return nil, err + } + + info := &DiskInfo{ + Path: usage.Path, + Fstype: usage.Fstype, + Total: usage.Total, + Used: usage.Used, + InodesTotal: usage.InodesTotal, + InodesUsed: usage.InodesUsed, + } + + return info, nil } -func DiskUsage(path string) (*disk.UsageStat, error) { - return DefaultUtil.DiskUsage(path) +func Disk(path string) (*DiskInfo, error) { + return DefaultUtil.Disk(path) } -func (u *util) virtualMemory() (*MemoryInfoStat, error) { +func (u *util) virtualMemory() (*MemoryInfo, error) { info, err := mem.VirtualMemory() if err != nil { return nil, err @@ -489,18 +513,18 @@ func (u *util) virtualMemory() (*MemoryInfoStat, error) { } } - return &MemoryInfoStat{ + return &MemoryInfo{ Total: info.Total, Available: info.Available, Used: info.Used, }, nil } -func (u *util) VirtualMemory() (*MemoryInfoStat, error) { +func (u *util) Memory() (*MemoryInfo, error) { u.lock.RLock() defer u.lock.RUnlock() - stat := &MemoryInfoStat{ + stat := &MemoryInfo{ Total: 
u.mem.Total, Available: u.mem.Available, Used: u.mem.Used, @@ -509,12 +533,12 @@ func (u *util) VirtualMemory() (*MemoryInfoStat, error) { return stat, nil } -func VirtualMemory() (*MemoryInfoStat, error) { - return DefaultUtil.VirtualMemory() +func Memory() (*MemoryInfo, error) { + return DefaultUtil.Memory() } -func (u *util) cgroupVirtualMemory(version int) (*MemoryInfoStat, error) { - info := &MemoryInfoStat{} +func (u *util) cgroupVirtualMemory(version int) (*MemoryInfo, error) { + info := &MemoryInfo{} if version == 1 { lines, err := u.readFile("memory/memory.limit_in_bytes") @@ -569,12 +593,27 @@ func (u *util) cgroupVirtualMemory(version int) (*MemoryInfoStat, error) { return info, nil } -func (u *util) NetIOCounters(pernic bool) ([]net.IOCountersStat, error) { - return net.IOCounters(pernic) +func (u *util) Network() ([]NetworkInfo, error) { + netio, err := net.IOCounters(true) + if err != nil { + return nil, err + } + + info := []NetworkInfo{} + + for _, io := range netio { + info = append(info, NetworkInfo{ + Name: io.Name, + BytesSent: io.BytesSent, + BytesRecv: io.BytesRecv, + }) + } + + return info, nil } -func NetIOCounters(pernic bool) ([]net.IOCountersStat, error) { - return DefaultUtil.NetIOCounters(pernic) +func Network() ([]NetworkInfo, error) { + return DefaultUtil.Network() } func (u *util) readFile(path string) ([]string, error) { @@ -613,29 +652,28 @@ func cpuTotal(c *cpu.TimesStat) float64 { c.Softirq + c.Steal + c.Guest + c.GuestNice } -func (u *util) GPUStats() ([]GPUInfoStat, error) { +func (u *util) GPU() ([]GPUInfo, error) { nvstats, err := nvidia.Default.Stats() if err != nil { return nil, err } - stats := []GPUInfoStat{} + stats := []GPUInfo{} for _, nv := range nvstats { - stats = append(stats, GPUInfoStat{ - Name: nv.Name, - MemoryTotal: nv.MemoryTotal, - MemoryUsed: nv.MemoryUsed, - Usage: nv.Usage, - MemoryUsage: nv.MemoryUsage, - EncoderUsage: nv.EncoderUsage, - DecoderUsage: nv.DecoderUsage, + stats = append(stats, GPUInfo{ + 
Name: nv.Name, + MemoryTotal: nv.MemoryTotal, + MemoryUsed: nv.MemoryUsed, + Usage: nv.Usage, + Encoder: nv.Encoder, + Decoder: nv.Decoder, }) } return stats, nil } -func GPUStats() ([]GPUInfoStat, error) { - return DefaultUtil.GPUStats() +func GPU() ([]GPUInfo, error) { + return DefaultUtil.GPU() } diff --git a/resources/resources.go b/resources/resources.go index d7255f05..5a4043d5 100644 --- a/resources/resources.go +++ b/resources/resources.go @@ -9,11 +9,13 @@ import ( "github.com/datarhei/core/v16/log" "github.com/datarhei/core/v16/psutil" + "github.com/datarhei/core/v16/slices" ) type Info struct { Mem MemoryInfo CPU CPUInfo + GPU GPUInfo } type MemoryInfo struct { @@ -38,6 +40,44 @@ type CPUInfo struct { Error error } +type GPUInfo struct { + NGPU float64 // number of gpus + GPU []GPUInfoStat + Error error +} + +type GPUInfoStat struct { + Index int + Name string + + // Memory + MemoryTotal uint64 // bytes + MemoryUsed uint64 // bytes + MemoryAvailable uint64 // bytes + MemoryLimit uint64 // bytes + + // GPU + Usage float64 // percent 0-100 + Encoder float64 // percent 0-100 + Decoder float64 // percent 0-100 + UsageLimit float64 // percent 0-100 + + Throttling bool +} + +type Request struct { + CPU float64 // percent 0-100*ncpu + Memory uint64 // bytes + GPUUsage float64 // percent 0-100 + GPUEncoder float64 // percent 0-100 + GPUDecoder float64 // percent 0-100 + GPUMemory uint64 // bytes +} + +type Response struct { + GPU int // GPU number, hwdevice +} + type resources struct { psutil psutil.Util @@ -45,9 +85,14 @@ type resources struct { maxCPU float64 // percent 0-100*ncpu maxMemory uint64 // bytes + ngpu int + maxGPU float64 // general usage, percent 0-100 + maxGPUMemory float64 // memory usage, percent 0-100 + isUnlimited bool isCPULimiting bool isMemoryLimiting bool + isGPULimiting []bool self psutil.Process @@ -67,30 +112,46 @@ type Resources interface { // HasLimits returns whether any limits have been set. 
HasLimits() bool - // Limits returns the CPU (percent 0-100) and memory (bytes) limits. - Limits() (float64, uint64) + // Limits returns the CPU (percent 0-100), memory (bytes) limits, and GPU limits (usage and memory each in percent 0-100). + Limits() (float64, uint64, float64, float64) - // ShouldLimit returns whether cpu and/or memory is currently limited. - ShouldLimit() (bool, bool) + // ShouldLimit returns whether cpu, memory, and/or GPU is currently limited. + ShouldLimit() (bool, bool, []bool) // Request checks whether the requested resources are available. - Request(cpu float64, memory uint64) error + Request(req Request) (Response, error) - // Info returns the current resource usage + // Info returns the current resource usage. Info() Info } type Config struct { - MaxCPU float64 // percent 0-100 - MaxMemory float64 // percent 0-100 - PSUtil psutil.Util - Logger log.Logger + MaxCPU float64 // percent 0-100 + MaxMemory float64 // percent 0-100 + MaxGPU float64 // general,encoder,decoder usage, percent 0-100 + MaxGPUMemory float64 // memory usage, percent 0-100 + PSUtil psutil.Util + Logger log.Logger } func New(config Config) (Resources, error) { + if config.PSUtil == nil { + config.PSUtil = psutil.DefaultUtil + } + + gpu, err := config.PSUtil.GPU() + if err != nil { + return nil, fmt.Errorf("unable to determine number of GPUs: %w", err) + } + + if len(gpu) == 0 { + config.MaxGPU = 0 + config.MaxGPUMemory = 0 + } + isUnlimited := false - if config.MaxCPU <= 0 && config.MaxMemory <= 0 { + if config.MaxCPU <= 0 && config.MaxMemory <= 0 && config.MaxGPU <= 0 && config.MaxGPUMemory <= 0 { isUnlimited = true } @@ -102,31 +163,39 @@ func New(config Config) (Resources, error) { config.MaxMemory = 100 } - if config.MaxCPU > 100 || config.MaxMemory > 100 { - return nil, fmt.Errorf("both MaxCPU and MaxMemory must have a range of 0-100") + if config.MaxGPU <= 0 { + config.MaxGPU = 100 + } + + if config.MaxGPUMemory <= 0 { + config.MaxGPUMemory = 100 + } + + if 
config.MaxCPU > 100 || config.MaxMemory > 100 || config.MaxGPU > 100 || config.MaxGPUMemory > 100 { + return nil, fmt.Errorf("all Max... values must have a range of 0-100") } r := &resources{ - maxCPU: config.MaxCPU, - psutil: config.PSUtil, - isUnlimited: isUnlimited, - logger: config.Logger, + maxCPU: config.MaxCPU, + maxGPU: config.MaxGPU, + maxGPUMemory: config.MaxGPUMemory, + psutil: config.PSUtil, + isUnlimited: isUnlimited, + ngpu: len(gpu), + isGPULimiting: make([]bool, len(gpu)), + logger: config.Logger, } if r.logger == nil { r.logger = log.New("") } - if r.psutil == nil { - r.psutil = psutil.DefaultUtil - } - - vmstat, err := r.psutil.VirtualMemory() + vmstat, err := r.psutil.Memory() if err != nil { return nil, fmt.Errorf("unable to determine available memory: %w", err) } - ncpu, err := r.psutil.CPUCounts(true) + ncpu, err := r.psutil.CPUCounts() if err != nil { return nil, fmt.Errorf("unable to determine number of logical CPUs: %w", err) } @@ -137,12 +206,15 @@ func New(config Config) (Resources, error) { r.maxMemory = uint64(float64(vmstat.Total) * config.MaxMemory / 100) r.logger = r.logger.WithFields(log.Fields{ - "ncpu": r.ncpu, - "max_cpu": r.maxCPU, - "max_memory": r.maxMemory, + "ncpu": r.ncpu, + "max_cpu": r.maxCPU, + "max_memory": r.maxMemory, + "ngpu": len(gpu), + "max_gpu": r.maxGPU, + "max_gpu_memory": r.maxGPUMemory, }) - r.self, err = psutil.NewProcess(int32(os.Getpid()), false) + r.self, err = r.psutil.Process(int32(os.Getpid())) if err != nil { return nil, fmt.Errorf("unable to create process observer for self: %w", err) } @@ -189,7 +261,12 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) { case <-ctx.Done(): return case <-ticker.C: - cpustat, err := r.psutil.CPUPercent() + if r.isUnlimited { + // If there aren't any limits imposed, don't do anything + continue + } + + cpustat, err := r.psutil.CPU() if err != nil { r.logger.Warn().WithError(err).Log("Failed to determine system CPU usage") continue @@ -197,12 
+274,18 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) { cpuload := (cpustat.User + cpustat.System + cpustat.Other) * r.ncpu - vmstat, err := r.psutil.VirtualMemory() + vmstat, err := r.psutil.Memory() if err != nil { r.logger.Warn().WithError(err).Log("Failed to determine system memory usage") continue } + gpustat, err := r.psutil.GPU() + if err != nil { + r.logger.Warn().WithError(err).Log("Failed to determine GPU usage") + continue + } + r.logger.Debug().WithFields(log.Fields{ "cur_cpu": cpuload, "cur_memory": vmstat.Used, @@ -210,34 +293,46 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) { doCPULimit := false - if !r.isUnlimited { - if !r.isCPULimiting { - if cpuload >= r.maxCPU { - r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit reached") - doCPULimit = true - } - } else { + if !r.isCPULimiting { + if cpuload >= r.maxCPU { + r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit reached") doCPULimit = true - if cpuload < r.maxCPU { - r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit released") - doCPULimit = false - } + } + } else { + doCPULimit = true + if cpuload < r.maxCPU { + r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit released") + doCPULimit = false } } doMemoryLimit := false - if !r.isUnlimited { - if !r.isMemoryLimiting { - if vmstat.Used >= r.maxMemory { - r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit reached") - doMemoryLimit = true + if !r.isMemoryLimiting { + if vmstat.Used >= r.maxMemory { + r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit reached") + doMemoryLimit = true + } + } else { + doMemoryLimit = true + if vmstat.Used < r.maxMemory { + r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit released") + doMemoryLimit = false + } + } + + doGPULimit := make([]bool, r.ngpu) + + for i, limiting := range r.isGPULimiting { + maxMemory := uint64(r.maxGPUMemory * float64(gpustat[i].MemoryTotal) / 100) + if 
!limiting { + if gpustat[i].MemoryUsed >= maxMemory || (gpustat[i].Usage >= r.maxGPU && gpustat[i].Encoder >= r.maxGPU && gpustat[i].Decoder >= r.maxGPU) { + doGPULimit[i] = true } } else { - doMemoryLimit = true - if vmstat.Used < r.maxMemory { - r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit released") - doMemoryLimit = false + doGPULimit[i] = true + if gpustat[i].MemoryUsed < maxMemory && (gpustat[i].Usage < r.maxGPU || gpustat[i].Encoder < r.maxGPU || gpustat[i].Decoder < r.maxGPU) { + doGPULimit[i] = false } } } @@ -247,17 +342,26 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) { r.logger.Warn().WithFields(log.Fields{ "enabled": doCPULimit, }).Log("Limiting CPU") - - r.isCPULimiting = doCPULimit } + r.isCPULimiting = doCPULimit if r.isMemoryLimiting != doMemoryLimit { r.logger.Warn().WithFields(log.Fields{ "enabled": doMemoryLimit, }).Log("Limiting memory") - - r.isMemoryLimiting = doMemoryLimit } + r.isMemoryLimiting = doMemoryLimit + + for i, limiting := range r.isGPULimiting { + if limiting != doGPULimit[i] { + r.logger.Warn().WithFields(log.Fields{ + "enabled": doGPULimit, + "index": i, + }).Log("Limiting GPU") + } + } + r.isGPULimiting = doGPULimit + r.lock.Unlock() } } @@ -267,60 +371,136 @@ func (r *resources) HasLimits() bool { return !r.isUnlimited } -func (r *resources) Limits() (float64, uint64) { - return r.maxCPU / r.ncpu, r.maxMemory +func (r *resources) Limits() (float64, uint64, float64, float64) { + return r.maxCPU / r.ncpu, r.maxMemory, r.maxGPU, r.maxGPUMemory } -func (r *resources) ShouldLimit() (bool, bool) { +func (r *resources) ShouldLimit() (bool, bool, []bool) { r.lock.RLock() defer r.lock.RUnlock() - return r.isCPULimiting, r.isMemoryLimiting + return r.isCPULimiting, r.isMemoryLimiting, slices.Copy(r.isGPULimiting) } -func (r *resources) Request(cpu float64, memory uint64) error { +func (r *resources) Request(req Request) (Response, error) { + res := Response{ + GPU: -1, + } + 
r.lock.RLock() defer r.lock.RUnlock() logger := r.logger.WithFields(log.Fields{ - "req_cpu": cpu, - "req_memory": memory, + "req_cpu": req.CPU, + "req_memory": req.Memory, + "req_gpu": req.GPUUsage, + "req_gpu_encoder": req.GPUEncoder, + "req_gpu_decoder": req.GPUDecoder, + "req_gpu_memory": req.GPUMemory, }) logger.Debug().Log("Request for acquiring resources") + // Check if anything is currently limiting. if r.isCPULimiting || r.isMemoryLimiting { logger.Debug().Log("Rejected, currently limiting") - return fmt.Errorf("resources are currenlty actively limited") + return res, fmt.Errorf("resources are currenlty actively limited") } - if cpu <= 0 || memory == 0 { + // Check if the requested resources are valid. + if req.CPU <= 0 || req.Memory == 0 { logger.Debug().Log("Rejected, invalid values") - return fmt.Errorf("the cpu and/or memory values are invalid: cpu=%f, memory=%d", cpu, memory) + return res, fmt.Errorf("the cpu and/or memory values are invalid: cpu=%f, memory=%d", req.CPU, req.Memory) } - cpustat, err := r.psutil.CPUPercent() + // Get current CPU and memory values. 
+ cpustat, err := r.psutil.CPU() if err != nil { r.logger.Warn().WithError(err).Log("Failed to determine system CPU usage") - return fmt.Errorf("the system CPU usage couldn't be determined") + return res, fmt.Errorf("the system CPU usage couldn't be determined") } cpuload := (cpustat.User + cpustat.System + cpustat.Other) * r.ncpu - vmstat, err := r.psutil.VirtualMemory() + vmstat, err := r.psutil.Memory() if err != nil { r.logger.Warn().WithError(err).Log("Failed to determine system memory usage") - return fmt.Errorf("the system memory usage couldn't be determined") + return res, fmt.Errorf("the system memory usage couldn't be determined") } - if cpuload+cpu > r.maxCPU { + // Check if enough resources are available + if cpuload+req.CPU > r.maxCPU { logger.Debug().WithField("cur_cpu", cpuload).Log("Rejected, CPU limit exceeded") - return fmt.Errorf("the CPU limit would be exceeded: %f + %f > %f", cpuload, cpu, r.maxCPU) + return res, fmt.Errorf("the CPU limit would be exceeded: %f + %f > %f", cpuload, req.CPU, r.maxCPU) } - if vmstat.Used+memory > r.maxMemory { + if vmstat.Used+req.Memory > r.maxMemory { logger.Debug().WithField("cur_memory", vmstat.Used).Log("Rejected, memory limit exceeded") - return fmt.Errorf("the memory limit would be exceeded: %d + %d > %d", vmstat.Used, memory, r.maxMemory) + return res, fmt.Errorf("the memory limit would be exceeded: %d + %d > %d", vmstat.Used, req.Memory, r.maxMemory) + } + + // Check if any GPU resources are requested + if req.GPUUsage > 0 || req.GPUEncoder > 0 || req.GPUDecoder > 0 || req.GPUMemory > 0 { + if req.GPUUsage < 0 || req.GPUEncoder < 0 || req.GPUDecoder < 0 || req.GPUMemory == 0 { + logger.Debug().Log("Rejected, invalid values") + return res, fmt.Errorf("the gpu usage and memory values are invalid: usage=%f, encoder=%f, decoder=%f, memory=%d", req.GPUUsage, req.GPUEncoder, req.GPUDecoder, req.GPUMemory) + } + + // Get current GPU values + gpustat, err := r.psutil.GPU() + if err != nil { + 
r.logger.Warn().WithError(err).Log("Failed to determine GPU usage") + return res, fmt.Errorf("the GPU usage couldn't be determined") + } + + if len(gpustat) == 0 { + r.logger.Debug().WithError(err).Log("GPU resources requested but no GPU available") + return res, fmt.Errorf("some GPU resources requested but no GPU available") + } + + foundGPU := -1 + for _, g := range gpustat { + if req.GPUUsage > 0 && g.Usage+req.GPUUsage > r.maxGPU { + logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu": g.Usage}).Log("Rejected, GPU usage limit exceeded") + continue + } + + if req.GPUEncoder > 0 && g.Encoder+req.GPUEncoder > r.maxGPU { + logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu_encoder": g.Usage}).Log("Rejected, GPU encoder usage limit exceeded") + continue + } + + if req.GPUDecoder > 0 && g.Decoder+req.GPUDecoder > r.maxGPU { + logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu_decoder": g.Usage}).Log("Rejected, GPU decoder usage limit exceeded") + continue + } + + gpuMemoryUsage := float64(g.MemoryUsed) / float64(g.MemoryTotal) * 100 + requestedGPUMemoryUsage := float64(req.GPUMemory) / float64(g.MemoryTotal) * 100 + + if gpuMemoryUsage+requestedGPUMemoryUsage > r.maxGPUMemory { + logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu_memory": gpuMemoryUsage}).Log("Rejected, GPU memory usage limit exceeded") + continue + } + + foundGPU = g.Index + + logger = logger.Debug().WithFields(log.Fields{ + "cur_gpu": foundGPU, + "cur_gpu_general": g.Usage, + "cur_gpu_encoder": g.Encoder, + "cur_gpu_decoder": g.Decoder, + "cur_gpu_memory": gpuMemoryUsage, + }) + + break + } + + if foundGPU < 0 { + return res, fmt.Errorf("all GPU usage limits are exceeded") + } + + res.GPU = foundGPU } logger.Debug().WithFields(log.Fields{ @@ -328,17 +508,18 @@ func (r *resources) Request(cpu float64, memory uint64) error { "cur_memory": vmstat.Used, }).Log("Acquiring approved") - return nil + return res, nil } func (r *resources) Info() Info { - cpulimit, 
memlimit := r.Limits() - cputhrottling, memthrottling := r.ShouldLimit() + cpulimit, memlimit, gpulimit, gpumemlimit := r.Limits() + cputhrottling, memthrottling, gputhrottling := r.ShouldLimit() - cpustat, cpuerr := r.psutil.CPUPercent() - memstat, memerr := r.psutil.VirtualMemory() - selfcpu, _ := r.self.CPUPercent() - selfmem, _ := r.self.VirtualMemory() + cpustat, cpuerr := r.psutil.CPU() + memstat, memerr := r.psutil.Memory() + gpustat, gpuerr := r.psutil.GPU() + selfcpu, _ := r.self.CPU() + selfmem, _ := r.self.Memory() cpuinfo := CPUInfo{ NCPU: r.ncpu, @@ -362,9 +543,31 @@ func (r *resources) Info() Info { Error: memerr, } + gpuinfo := GPUInfo{ + NGPU: float64(len(gpustat)), + Error: gpuerr, + } + + for i, g := range gpustat { + gpuinfo.GPU = append(gpuinfo.GPU, GPUInfoStat{ + Index: g.Index, + Name: g.Name, + MemoryTotal: g.MemoryTotal, + MemoryUsed: g.MemoryUsed, + MemoryAvailable: g.MemoryTotal - g.MemoryUsed, + MemoryLimit: uint64(float64(g.MemoryTotal) * gpumemlimit / 100), + Usage: g.Usage, + Encoder: g.Encoder, + Decoder: g.Decoder, + UsageLimit: gpulimit, + Throttling: gputhrottling[i], + }) + } + i := Info{ CPU: cpuinfo, Mem: meminfo, + GPU: gpuinfo, } return i diff --git a/resources/resources_test.go b/resources/resources_test.go index 3d26c40c..a1ee4244 100644 --- a/resources/resources_test.go +++ b/resources/resources_test.go @@ -1,68 +1,170 @@ package resources import ( + "slices" "sync" "testing" "time" "github.com/datarhei/core/v16/psutil" - "github.com/shirou/gopsutil/v3/disk" - "github.com/shirou/gopsutil/v3/net" "github.com/stretchr/testify/require" ) -type util struct{} +type util struct { + lock sync.Mutex + + cpu psutil.CPUInfo + mem psutil.MemoryInfo + gpu []psutil.GPUInfo +} + +func newUtil(ngpu int) *util { + u := &util{ + cpu: psutil.CPUInfo{ + System: 10, + User: 50, + Idle: 35, + Other: 5, + }, + mem: psutil.MemoryInfo{ + Total: 200, + Available: 40, + Used: 160, + }, + } + + for i := 0; i < ngpu; i++ { + u.gpu = append(u.gpu, 
psutil.GPUInfo{ + Index: i, + Name: "L4", + MemoryTotal: 24 * 1024 * 1024 * 1024, + MemoryUsed: uint64(12+i) * 1024 * 1024 * 1024, + Usage: 50 - float64((i+1)*5), + Encoder: 50 - float64((i+1)*10), + Decoder: 50 - float64((i+1)*3), + }) + } + + return u +} func (u *util) Start() {} func (u *util) Stop() {} -func (u *util) CPUCounts(logical bool) (float64, error) { +func (u *util) CPUCounts() (float64, error) { return 2, nil } -func (u *util) GPUCounts() (float64, error) { - return 0, nil +func (u *util) CPU() (*psutil.CPUInfo, error) { + u.lock.Lock() + defer u.lock.Unlock() + + cpu := u.cpu + + return &cpu, nil } -func (u *util) CPUPercent() (*psutil.CPUInfoStat, error) { - return &psutil.CPUInfoStat{ - System: 10, - User: 50, - Idle: 35, - Other: 5, - }, nil +func (u *util) Disk(path string) (*psutil.DiskInfo, error) { + return &psutil.DiskInfo{}, nil } -func (u *util) DiskUsage(path string) (*disk.UsageStat, error) { - return &disk.UsageStat{}, nil +func (u *util) Memory() (*psutil.MemoryInfo, error) { + u.lock.Lock() + defer u.lock.Unlock() + + mem := u.mem + + return &mem, nil } -func (u *util) VirtualMemory() (*psutil.MemoryInfoStat, error) { - return &psutil.MemoryInfoStat{ - Total: 200, - Available: 40, - Used: 160, - }, nil -} - -func (u *util) NetIOCounters(pernic bool) ([]net.IOCountersStat, error) { +func (u *util) Network() ([]psutil.NetworkInfo, error) { return nil, nil } -func (u *util) GPUStats() ([]psutil.GPUInfoStat, error) { - return nil, nil +func (u *util) GPU() ([]psutil.GPUInfo, error) { + u.lock.Lock() + defer u.lock.Unlock() + + gpu := []psutil.GPUInfo{} + + gpu = append(gpu, u.gpu...) 
+ + return gpu, nil } func (u *util) Process(pid int32) (psutil.Process, error) { - return nil, nil + return &process{}, nil +} + +type process struct{} + +func (p *process) CPU() (*psutil.CPUInfo, error) { + s := &psutil.CPUInfo{ + System: 1, + User: 2, + Idle: 0, + Other: 3, + } + + return s, nil +} + +func (p *process) Memory() (uint64, error) { return 42, nil } +func (p *process) GPU() (*psutil.GPUInfo, error) { + return &psutil.GPUInfo{ + Index: 0, + Name: "L4", + MemoryTotal: 128, + MemoryUsed: 42, + Usage: 5, + Encoder: 9, + Decoder: 7, + }, nil +} +func (p *process) Stop() {} +func (p *process) Suspend() error { return nil } +func (p *process) Resume() error { return nil } + +func TestConfigNoLimits(t *testing.T) { + _, err := New(Config{ + PSUtil: newUtil(0), + }) + require.NoError(t, err) +} + +func TestConfigWrongLimits(t *testing.T) { + _, err := New(Config{ + MaxCPU: 102, + MaxMemory: 573, + PSUtil: newUtil(0), + }) + require.Error(t, err) + + _, err = New(Config{ + MaxCPU: 0, + MaxMemory: 0, + MaxGPU: 101, + MaxGPUMemory: 103, + PSUtil: newUtil(0), + }) + require.NoError(t, err) + + _, err = New(Config{ + MaxCPU: 0, + MaxMemory: 0, + MaxGPU: 101, + MaxGPUMemory: 103, + PSUtil: newUtil(1), + }) + require.Error(t, err) } func TestMemoryLimit(t *testing.T) { r, err := New(Config{ MaxCPU: 100, MaxMemory: 150. / 200. * 100, - PSUtil: &util{}, + PSUtil: newUtil(0), Logger: nil, }) require.NoError(t, err) @@ -86,7 +188,7 @@ func TestMemoryLimit(t *testing.T) { for { select { case <-ticker.C: - _, limit = r.ShouldLimit() + _, limit, _ = r.ShouldLimit() if limit { return } @@ -102,6 +204,95 @@ func TestMemoryLimit(t *testing.T) { require.True(t, limit) + _, err = r.Request(Request{CPU: 5, Memory: 10}) + require.Error(t, err) + + r.Stop() +} + +func TestMemoryUnlimit(t *testing.T) { + util := newUtil(0) + + r, err := New(Config{ + MaxCPU: 100, + MaxMemory: 150. / 200. 
* 100, + PSUtil: util, + Logger: nil, + }) + require.NoError(t, err) + + wg := sync.WaitGroup{} + wg.Add(1) + + limit := false + + go func() { + defer func() { + wg.Done() + }() + + timer := time.NewTimer(10 * time.Second) + defer timer.Stop() + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + _, limit, _ = r.ShouldLimit() + if limit { + return + } + case <-timer.C: + return + } + } + }() + + r.Start() + + wg.Wait() + + require.True(t, limit) + + _, limit, _ = r.ShouldLimit() + require.True(t, limit) + + util.lock.Lock() + util.mem.Used = 140 + util.lock.Unlock() + + wg.Add(1) + + go func() { + defer func() { + wg.Done() + }() + + timer := time.NewTimer(10 * time.Second) + defer timer.Stop() + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + _, limit, _ = r.ShouldLimit() + if !limit { + return + } + case <-timer.C: + return + } + } + }() + + wg.Wait() + + require.False(t, limit) + r.Stop() } @@ -109,7 +300,7 @@ func TestCPULimit(t *testing.T) { r, err := New(Config{ MaxCPU: 50., MaxMemory: 100, - PSUtil: &util{}, + PSUtil: newUtil(0), Logger: nil, }) require.NoError(t, err) @@ -133,7 +324,7 @@ func TestCPULimit(t *testing.T) { for { select { case <-ticker.C: - limit, _ = r.ShouldLimit() + limit, _, _ = r.ShouldLimit() if limit { return } @@ -149,36 +340,541 @@ func TestCPULimit(t *testing.T) { require.True(t, limit) + _, err = r.Request(Request{CPU: 5, Memory: 10}) + require.Error(t, err) + r.Stop() } -func TestRequest(t *testing.T) { +func TestCPUUnlimit(t *testing.T) { + util := newUtil(0) + r, err := New(Config{ - MaxCPU: 70., - MaxMemory: 170. / 200. 
* 100, - PSUtil: &util{}, + MaxCPU: 50., + MaxMemory: 100, + PSUtil: util, Logger: nil, }) require.NoError(t, err) - err = r.Request(-1, 0) - require.Error(t, err) + wg := sync.WaitGroup{} + wg.Add(1) - err = r.Request(5, 10) + limit := false + + go func() { + defer func() { + wg.Done() + }() + + timer := time.NewTimer(10 * time.Second) + defer timer.Stop() + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + limit, _, _ = r.ShouldLimit() + if limit { + return + } + case <-timer.C: + return + } + } + }() + + r.Start() + + wg.Wait() + + require.True(t, limit) + + limit, _, _ = r.ShouldLimit() + require.True(t, limit) + + util.lock.Lock() + util.cpu.User = 20 + util.lock.Unlock() + + wg.Add(1) + + go func() { + defer func() { + wg.Done() + }() + + timer := time.NewTimer(10 * time.Second) + defer timer.Stop() + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + limit, _, _ = r.ShouldLimit() + if !limit { + return + } + case <-timer.C: + return + } + } + }() + + wg.Wait() + + require.False(t, limit) + + r.Stop() +} + +func TestGPULimitMemory(t *testing.T) { + r, err := New(Config{ + MaxCPU: 100, + MaxMemory: 100, + MaxGPU: 100, + MaxGPUMemory: 20, + PSUtil: newUtil(2), + Logger: nil, + }) require.NoError(t, err) - err = r.Request(5, 20) + wg := sync.WaitGroup{} + wg.Add(1) + + limit := []bool{} + + go func() { + defer func() { + wg.Done() + }() + + timer := time.NewTimer(10 * time.Second) + defer timer.Stop() + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + _, _, limit = r.ShouldLimit() + if slices.Contains(limit, true) { + return + } + case <-timer.C: + return + } + } + }() + + r.Start() + + wg.Wait() + + require.Contains(t, limit, true) + + _, err = r.Request(Request{CPU: 5, Memory: 10, GPUUsage: 10, GPUMemory: 10}) require.Error(t, err) - err = r.Request(10, 10) + r.Stop() +} + +func TestGPUUnlimitMemory(t 
*testing.T) { + util := newUtil(2) + + r, err := New(Config{ + MaxCPU: 100, + MaxMemory: 100, + MaxGPU: 100, + MaxGPUMemory: 20, + PSUtil: util, + Logger: nil, + }) require.NoError(t, err) + + wg := sync.WaitGroup{} + wg.Add(1) + + limit := []bool{} + + go func() { + defer func() { + wg.Done() + }() + + timer := time.NewTimer(10 * time.Second) + defer timer.Stop() + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + _, _, limit = r.ShouldLimit() + if slices.Contains(limit, true) { + return + } + case <-timer.C: + return + } + } + }() + + r.Start() + + wg.Wait() + + require.Contains(t, limit, true) + + util.lock.Lock() + util.gpu[0].MemoryUsed = 10 + util.gpu[1].MemoryUsed = 10 + util.lock.Unlock() + + wg.Add(1) + + go func() { + defer func() { + wg.Done() + }() + + timer := time.NewTimer(10 * time.Second) + defer timer.Stop() + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + _, _, limit = r.ShouldLimit() + if !slices.Contains(limit, true) { + return + } + case <-timer.C: + return + } + } + }() + + wg.Wait() + + require.NotContains(t, limit, true) + + r.Stop() +} + +func TestGPULimitMemorySome(t *testing.T) { + r, err := New(Config{ + MaxCPU: 100, + MaxMemory: 100, + MaxGPU: 100, + MaxGPUMemory: 14. / 24. 
* 100., + PSUtil: newUtil(4), + Logger: nil, + }) + require.NoError(t, err) + + wg := sync.WaitGroup{} + wg.Add(1) + + limit := []bool{} + + go func() { + defer func() { + wg.Done() + }() + + timer := time.NewTimer(10 * time.Second) + defer timer.Stop() + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + _, _, limit = r.ShouldLimit() + if slices.Contains(limit, true) { + return + } + case <-timer.C: + return + } + } + }() + + r.Start() + + wg.Wait() + + require.Equal(t, []bool{false, false, true, true}, limit) + + _, err = r.Request(Request{CPU: 5, Memory: 10, GPUUsage: 10, GPUMemory: 10}) + require.NoError(t, err) + + r.Stop() +} + +func TestGPULimitUsage(t *testing.T) { + r, err := New(Config{ + MaxCPU: 100, + MaxMemory: 100, + MaxGPU: 40, + MaxGPUMemory: 100, + PSUtil: newUtil(3), + Logger: nil, + }) + require.NoError(t, err) + + wg := sync.WaitGroup{} + wg.Add(1) + + limit := []bool{} + + go func() { + defer func() { + wg.Done() + }() + + timer := time.NewTimer(10 * time.Second) + defer timer.Stop() + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + _, _, limit = r.ShouldLimit() + if slices.Contains(limit, true) { + return + } + case <-timer.C: + return + } + } + }() + + r.Start() + + wg.Wait() + + require.Equal(t, []bool{true, false, false}, limit) + + _, err = r.Request(Request{CPU: 5, Memory: 10, GPUUsage: 10, GPUMemory: 10}) + require.Error(t, err) + + _, err = r.Request(Request{CPU: 5, Memory: 10, GPUEncoder: 10, GPUMemory: 10}) + require.NoError(t, err) + + r.Stop() +} + +func TestGPUUnlimitUsage(t *testing.T) { + util := newUtil(3) + + r, err := New(Config{ + MaxCPU: 100, + MaxMemory: 100, + MaxGPU: 40, + MaxGPUMemory: 100, + PSUtil: util, + Logger: nil, + }) + require.NoError(t, err) + + wg := sync.WaitGroup{} + wg.Add(1) + + limit := []bool{} + + go func() { + defer func() { + wg.Done() + }() + + timer := time.NewTimer(10 * time.Second) + defer 
timer.Stop() + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + _, _, limit = r.ShouldLimit() + if slices.Contains(limit, true) { + return + } + case <-timer.C: + return + } + } + }() + + r.Start() + + wg.Wait() + + require.Equal(t, []bool{true, false, false}, limit) + + util.lock.Lock() + util.gpu[0].Usage = 30 + util.gpu[0].Encoder = 30 + util.gpu[0].Decoder = 30 + util.lock.Unlock() + + wg.Add(1) + + go func() { + defer func() { + wg.Done() + }() + + timer := time.NewTimer(10 * time.Second) + defer timer.Stop() + + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + _, _, limit = r.ShouldLimit() + if !slices.Contains(limit, true) { + return + } + case <-timer.C: + return + } + } + }() + + wg.Wait() + + require.Equal(t, []bool{false, false, false}, limit) + + r.Stop() +} + +func TestRequestCPU(t *testing.T) { + r, err := New(Config{ + MaxCPU: 70., + PSUtil: newUtil(0), + }) + require.NoError(t, err) + + _, err = r.Request(Request{CPU: 0, Memory: 0}) + require.Error(t, err) + + _, err = r.Request(Request{CPU: 5, Memory: 10}) + require.NoError(t, err) + + _, err = r.Request(Request{CPU: 30, Memory: 10}) + require.Error(t, err) +} + +func TestRequestMemory(t *testing.T) { + r, err := New(Config{ + MaxMemory: 170. / 200. 
* 100, + PSUtil: newUtil(0), + }) + require.NoError(t, err) + + _, err = r.Request(Request{CPU: 5, Memory: 0}) + require.Error(t, err) + + _, err = r.Request(Request{CPU: 5, Memory: 10}) + require.NoError(t, err) + + _, err = r.Request(Request{CPU: 50, Memory: 20}) + require.Error(t, err) +} + +func TestRequestNoGPU(t *testing.T) { + r, err := New(Config{ + MaxCPU: 100, + MaxMemory: 100, + PSUtil: newUtil(0), + }) + require.NoError(t, err) + + _, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 30, GPUMemory: 10}) + require.Error(t, err) +} + +func TestRequestInvalidGPURequest(t *testing.T) { + r, err := New(Config{ + MaxCPU: 100, + MaxMemory: 100, + PSUtil: newUtil(1), + }) + require.NoError(t, err) + + _, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 30, GPUMemory: 0}) + require.Error(t, err) + + _, err = r.Request(Request{CPU: 10, Memory: 10, GPUUsage: -1, GPUEncoder: 30, GPUMemory: 0}) + require.Error(t, err) +} + +func TestRequestGPULimitsOneGPU(t *testing.T) { + r, err := New(Config{ + MaxCPU: 100, + MaxMemory: 100, + MaxGPU: 50, + MaxGPUMemory: 60, + PSUtil: newUtil(1), + }) + require.NoError(t, err) + + _, err = r.Request(Request{CPU: 10, Memory: 10, GPUUsage: 50, GPUMemory: 10}) + require.Error(t, err) + + _, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 50, GPUMemory: 10}) + require.Error(t, err) + + _, err = r.Request(Request{CPU: 10, Memory: 10, GPUDecoder: 50, GPUMemory: 10}) + require.Error(t, err) + + _, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 10, GPUMemory: 5 * 1024 * 1024 * 1024}) + require.Error(t, err) + + res, err := r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 10, GPUMemory: 10}) + require.NoError(t, err) + require.Equal(t, 0, res.GPU) +} + +func TestRequestGPULimitsMoreGPU(t *testing.T) { + r, err := New(Config{ + MaxCPU: 100, + MaxMemory: 100, + MaxGPU: 60, + MaxGPUMemory: 60, + PSUtil: newUtil(2), + }) + require.NoError(t, err) + + _, err = r.Request(Request{CPU: 10, Memory: 10, 
GPUEncoder: 50, GPUMemory: 10}) + require.Error(t, err) + + res, err := r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 30, GPUMemory: 10}) + require.NoError(t, err) + require.Equal(t, 1, res.GPU) } func TestHasLimits(t *testing.T) { r, err := New(Config{ MaxCPU: 70., MaxMemory: 170. / 200. * 100, - PSUtil: &util{}, + PSUtil: newUtil(0), Logger: nil, }) require.NoError(t, err) @@ -188,7 +884,7 @@ func TestHasLimits(t *testing.T) { r, err = New(Config{ MaxCPU: 100, MaxMemory: 100, - PSUtil: &util{}, + PSUtil: newUtil(0), Logger: nil, }) require.NoError(t, err) @@ -198,10 +894,95 @@ func TestHasLimits(t *testing.T) { r, err = New(Config{ MaxCPU: 0, MaxMemory: 0, - PSUtil: &util{}, + PSUtil: newUtil(0), + Logger: nil, + }) + require.NoError(t, err) + + require.False(t, r.HasLimits()) + + r, err = New(Config{ + MaxCPU: 0, + MaxMemory: 0, + MaxGPU: 10, + PSUtil: newUtil(1), + Logger: nil, + }) + require.NoError(t, err) + + require.True(t, r.HasLimits()) + + r, err = New(Config{ + MaxCPU: 0, + MaxMemory: 0, + MaxGPU: 10, + PSUtil: newUtil(0), Logger: nil, }) require.NoError(t, err) require.False(t, r.HasLimits()) } + +func TestInfo(t *testing.T) { + r, err := New(Config{ + MaxCPU: 90, + MaxMemory: 90, + MaxGPU: 11, + MaxGPUMemory: 50, + PSUtil: newUtil(2), + }) + require.NoError(t, err) + + info := r.Info() + + require.Equal(t, Info{ + Mem: MemoryInfo{ + Total: 200, + Available: 40, + Used: 160, + Limit: 180, + Core: 42, + Throttling: false, + Error: nil, + }, + CPU: CPUInfo{ + NCPU: 2, + System: 10, + User: 50, + Idle: 35, + Other: 5, + Limit: 90, + Core: 6, + Throttling: false, + Error: nil, + }, + GPU: GPUInfo{ + NGPU: 2, + GPU: []GPUInfoStat{{ + Index: 0, + Name: "L4", + MemoryTotal: 24 * 1024 * 1024 * 1024, + MemoryUsed: 12 * 1024 * 1024 * 1024, + MemoryAvailable: 12 * 1024 * 1024 * 1024, + MemoryLimit: 12 * 1024 * 1024 * 1024, + Usage: 45, + Encoder: 40, + Decoder: 47, + UsageLimit: 11, + }, { + Index: 1, + Name: "L4", + MemoryTotal: 24 * 1024 * 1024 * 1024, + 
MemoryUsed: 13 * 1024 * 1024 * 1024, + MemoryAvailable: 11 * 1024 * 1024 * 1024, + MemoryLimit: 12 * 1024 * 1024 * 1024, + Usage: 40, + Encoder: 30, + Decoder: 44, + UsageLimit: 11, + }}, + Error: nil, + }, + }, info) +} diff --git a/restream/app/process.go b/restream/app/process.go index 974309cb..e02cd69a 100644 --- a/restream/app/process.go +++ b/restream/app/process.go @@ -79,13 +79,21 @@ type Config struct { Reconnect bool ReconnectDelay uint64 // seconds Autostart bool - StaleTimeout uint64 // seconds - Timeout uint64 // seconds - Scheduler string // crontab pattern or RFC3339 timestamp - LogPatterns []string // will be interpreted as regular expressions - LimitCPU float64 // percent - LimitMemory uint64 // bytes - LimitWaitFor uint64 // seconds + StaleTimeout uint64 // seconds + Timeout uint64 // seconds + Scheduler string // crontab pattern or RFC3339 timestamp + LogPatterns []string // will be interpreted as regular expressions + LimitCPU float64 // percent + LimitMemory uint64 // bytes + LimitGPU ConfigLimitGPU // GPU limits + LimitWaitFor uint64 // seconds +} + +type ConfigLimitGPU struct { + Usage float64 // percent 0-100 + Encoder float64 // percent 0-100 + Decoder float64 // percent 0-100 + Memory uint64 // bytes } func (config *Config) Clone() *Config { @@ -103,6 +111,7 @@ func (config *Config) Clone() *Config { Scheduler: config.Scheduler, LimitCPU: config.LimitCPU, LimitMemory: config.LimitMemory, + LimitGPU: config.LimitGPU, LimitWaitFor: config.LimitWaitFor, } @@ -175,6 +184,10 @@ func (config *Config) Hash() []byte { b.WriteString(strconv.FormatUint(config.LimitMemory, 10)) b.WriteString(strconv.FormatUint(config.LimitWaitFor, 10)) b.WriteString(strconv.FormatFloat(config.LimitCPU, 'f', -1, 64)) + b.WriteString(strconv.FormatFloat(config.LimitGPU.Usage, 'f', -1, 64)) + b.WriteString(strconv.FormatFloat(config.LimitGPU.Encoder, 'f', -1, 64)) + b.WriteString(strconv.FormatFloat(config.LimitGPU.Decoder, 'f', -1, 64)) + 
b.WriteString(strconv.FormatUint(config.LimitGPU.Memory, 10)) for _, x := range config.Input { b.WriteString(x.HashString()) @@ -294,7 +307,7 @@ type State struct { Memory uint64 // Current memory consumption in bytes CPU float64 // Current CPU consumption in percent LimitMode string // How the process is limited (hard or soft) - Resources ProcessUsage // Current resource usage, include CPU and memory consumption + Resources ProcessUsage // Current resource usage, include CPU, memory and GPU consumption Command []string // ffmpeg command line parameters } @@ -326,10 +339,10 @@ func (p *ProcessUsageCPU) MarshalParser() parse.UsageCPU { } type ProcessUsageMemory struct { - Current uint64 // bytes - Average float64 // bytes - Max uint64 // bytes - Limit uint64 // bytes + Current uint64 // bytes + Average uint64 // bytes + Max uint64 // bytes + Limit uint64 // bytes } func (p *ProcessUsageMemory) UnmarshalParser(pp *parse.UsageMemory) { @@ -348,20 +361,97 @@ func (p *ProcessUsageMemory) MarshalParser() parse.UsageMemory { return pp } +type ProcessUsageGPU struct { + Index int + Usage ProcessUsageGPUUsage + Encoder ProcessUsageGPUUsage + Decoder ProcessUsageGPUUsage + Memory ProcessUsageGPUMemory +} + +func (p *ProcessUsageGPU) UnmarshalParser(pp *parse.UsageGPU) { + p.Index = pp.Index + p.Usage.UnmarshalParser(&pp.Usage) + p.Encoder.UnmarshalParser(&pp.Encoder) + p.Decoder.UnmarshalParser(&pp.Decoder) + p.Memory.UnmarshalParser(&pp.Memory) +} + +func (p *ProcessUsageGPU) MarshalParser() parse.UsageGPU { + pp := parse.UsageGPU{ + Index: p.Index, + Usage: p.Usage.MarshalParser(), + Encoder: p.Encoder.MarshalParser(), + Decoder: p.Decoder.MarshalParser(), + Memory: p.Memory.MarshalParser(), + } + + return pp +} + +type ProcessUsageGPUUsage struct { + Current float64 // percent 0-100 + Average float64 // percent 0-100 + Max float64 // percent 0-100 + Limit float64 // percent 0-100 +} + +func (p *ProcessUsageGPUUsage) UnmarshalParser(pp *parse.UsageGPUUsage) { + p.Average = 
pp.Average + p.Max = pp.Max + p.Limit = pp.Limit +} + +func (p *ProcessUsageGPUUsage) MarshalParser() parse.UsageGPUUsage { + pp := parse.UsageGPUUsage{ + Average: p.Average, + Max: p.Max, + Limit: p.Limit, + } + + return pp +} + +type ProcessUsageGPUMemory struct { + Current uint64 // bytes + Average uint64 // bytes + Max uint64 // bytes + Limit uint64 // bytes +} + +func (p *ProcessUsageGPUMemory) UnmarshalParser(pp *parse.UsageGPUMemory) { + p.Average = pp.Average + p.Max = pp.Max + p.Limit = pp.Limit +} + +func (p *ProcessUsageGPUMemory) MarshalParser() parse.UsageGPUMemory { + pp := parse.UsageGPUMemory{ + Average: p.Average, + Max: p.Max, + Limit: p.Limit, + } + + return pp +} + type ProcessUsage struct { CPU ProcessUsageCPU Memory ProcessUsageMemory + GPU ProcessUsageGPU } func (p *ProcessUsage) UnmarshalParser(pp *parse.Usage) { p.CPU.UnmarshalParser(&pp.CPU) p.Memory.UnmarshalParser(&pp.Memory) + p.GPU.UnmarshalParser(&pp.GPU) } func (p *ProcessUsage) MarshalParser() parse.Usage { pp := parse.Usage{ CPU: p.CPU.MarshalParser(), Memory: p.Memory.MarshalParser(), + GPU: p.GPU.MarshalParser(), } return pp diff --git a/restream/app/process_test.go b/restream/app/process_test.go index 96889697..2aa6168b 100644 --- a/restream/app/process_test.go +++ b/restream/app/process_test.go @@ -46,12 +46,18 @@ func TestConfigHash(t *testing.T) { LogPatterns: []string{"^libx264"}, LimitCPU: 50, LimitMemory: 3 * 1024 * 1024, - LimitWaitFor: 20, + LimitGPU: ConfigLimitGPU{ + Usage: 10, + Encoder: 42, + Decoder: 14, + Memory: 500 * 1024 * 1024, + }, + LimitWaitFor: 20, } hash1 := config.Hash() - require.Equal(t, []byte{0x7e, 0xae, 0x5b, 0xc3, 0xad, 0xe3, 0x9a, 0xfc, 0xd3, 0x49, 0x15, 0x28, 0x93, 0x17, 0xc5, 0xbf}, hash1) + require.Equal(t, []byte{0x5e, 0x85, 0xc3, 0xc5, 0x44, 0xfd, 0x3e, 0x10, 0x13, 0x76, 0x36, 0x8b, 0xbe, 0x7e, 0xa6, 0xbb}, hash1) config.Reconnect = false diff --git a/restream/core.go b/restream/core.go index e3f64f9d..bbe1a72c 100644 --- a/restream/core.go 
+++ b/restream/core.go @@ -279,13 +279,14 @@ func (r *restream) resourceObserver(ctx context.Context, rsc resources.Resources defer ticker.Stop() limitCPU, limitMemory := false, false + var limitGPUs []bool = nil for { select { case <-ctx.Done(): return case <-ticker.C: - cpu, memory := rsc.ShouldLimit() + cpu, memory, gpu := rsc.ShouldLimit() hasChanges := false @@ -299,17 +300,34 @@ func (r *restream) resourceObserver(ctx context.Context, rsc resources.Resources hasChanges = true } + if limitGPUs == nil { + limitGPUs = make([]bool, len(gpu)) + } + + for i, g := range gpu { + if g != limitGPUs[i] { + limitGPUs[i] = g + hasChanges = true + } + } + if !hasChanges { break } r.tasks.Range(func(id app.ProcessID, t *task) bool { - if t.Limit(limitCPU, limitMemory) { + limitGPU := false + gpuindex := t.GetHWDevice() + if gpuindex >= 0 { + limitGPU = limitGPUs[gpuindex] + } + if t.Limit(limitCPU, limitMemory, limitGPU) { r.logger.Debug().WithFields(log.Fields{ "limit_cpu": limitCPU, "limit_memory": limitMemory, + "limit_gpu": limitGPU, "id": id, - }).Log("Limiting process CPU and memory consumption") + }).Log("Limiting process CPU, memory, and GPU consumption") } return true @@ -391,7 +409,11 @@ func (r *restream) load() error { // Validate config with all placeholders replaced. However, we need to take care // that the config with the task keeps its dynamic placeholders for process starts. 
config := t.config.Clone() - resolveDynamicPlaceholder(config, r.replace) + resolveDynamicPlaceholder(config, r.replace, map[string]string{ + "hwdevice": "0", + }, map[string]string{ + "timestamp": time.Now().UTC().Format(time.RFC3339), + }) t.usesDisk, err = validateConfig(config, r.fs.list, r.ffmpeg) if err != nil { @@ -414,30 +436,23 @@ func (r *restream) load() error { } ffmpeg, err := r.ffmpeg.New(ffmpeg.ProcessConfig{ - Reconnect: t.config.Reconnect, - ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second, - StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second, - Timeout: time.Duration(t.config.Timeout) * time.Second, - LimitCPU: t.config.LimitCPU, - LimitMemory: t.config.LimitMemory, - LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second, - LimitMode: limitMode, - Scheduler: t.config.Scheduler, - Args: t.command, - Parser: t.parser, - Logger: t.logger, - OnArgs: r.onArgs(t.config.Clone()), - OnBeforeStart: func() error { - if !r.enableSoftLimit { - return nil - } - - if err := r.resources.Request(t.config.LimitCPU, t.config.LimitMemory); err != nil { - return err - } - - return nil - }, + Reconnect: t.config.Reconnect, + ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second, + StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second, + Timeout: time.Duration(t.config.Timeout) * time.Second, + LimitCPU: t.config.LimitCPU, + LimitMemory: t.config.LimitMemory, + LimitGPUUsage: t.config.LimitGPU.Usage, + LimitGPUEncoder: t.config.LimitGPU.Encoder, + LimitGPUDecoder: t.config.LimitGPU.Decoder, + LimitGPUMemory: t.config.LimitGPU.Memory, + LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second, + LimitMode: limitMode, + Scheduler: t.config.Scheduler, + Args: t.command, + Parser: t.parser, + Logger: t.logger, + OnBeforeStart: r.onBeforeStart(t.config.Clone()), }) if err != nil { return true @@ -578,7 +593,11 @@ func (r *restream) createTask(config *app.Config) (*task, error) { // Validate 
config with all placeholders replaced. However, we need to take care // that the config with the task keeps its dynamic placeholders for process starts. config := t.config.Clone() - resolveDynamicPlaceholder(config, r.replace) + resolveDynamicPlaceholder(config, r.replace, map[string]string{ + "hwdevice": "0", + }, map[string]string{ + "timestamp": time.Now().UTC().Format(time.RFC3339), + }) t.usesDisk, err = validateConfig(config, r.fs.list, r.ffmpeg) if err != nil { @@ -600,30 +619,23 @@ func (r *restream) createTask(config *app.Config) (*task, error) { } ffmpeg, err := r.ffmpeg.New(ffmpeg.ProcessConfig{ - Reconnect: t.config.Reconnect, - ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second, - StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second, - Timeout: time.Duration(t.config.Timeout) * time.Second, - LimitCPU: t.config.LimitCPU, - LimitMemory: t.config.LimitMemory, - LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second, - LimitMode: limitMode, - Scheduler: t.config.Scheduler, - Args: t.command, - Parser: t.parser, - Logger: t.logger, - OnArgs: r.onArgs(t.config.Clone()), - OnBeforeStart: func() error { - if !r.enableSoftLimit { - return nil - } - - if err := r.resources.Request(t.config.LimitCPU, t.config.LimitMemory); err != nil { - return err - } - - return nil - }, + Reconnect: t.config.Reconnect, + ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second, + StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second, + Timeout: time.Duration(t.config.Timeout) * time.Second, + LimitCPU: t.config.LimitCPU, + LimitMemory: t.config.LimitMemory, + LimitGPUUsage: t.config.LimitGPU.Usage, + LimitGPUEncoder: t.config.LimitGPU.Encoder, + LimitGPUDecoder: t.config.LimitGPU.Decoder, + LimitGPUMemory: t.config.LimitGPU.Memory, + LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second, + LimitMode: limitMode, + Scheduler: t.config.Scheduler, + Args: t.command, + Parser: t.parser, + Logger: t.logger, 
+ OnBeforeStart: r.onBeforeStart(t.config.Clone()), }) if err != nil { return nil, err @@ -636,21 +648,45 @@ func (r *restream) createTask(config *app.Config) (*task, error) { return t, nil } -// onArgs is a callback that gets called by a process before it will be started. -// It evalutes the dynamic placeholders in a process config and returns the -// resulting command line to the process. -func (r *restream) onArgs(cfg *app.Config) func([]string) []string { - return func(args []string) []string { +// onBeforeStart is a callback that gets called by a process before it will be started. +// It evalutes the dynamic placeholders in a process config and returns the resulting command line to the process. +func (r *restream) onBeforeStart(cfg *app.Config) func([]string) ([]string, error) { + return func(args []string) ([]string, error) { + selectedGPU := -1 + if r.enableSoftLimit { + res, err := r.resources.Request(resources.Request{ + CPU: cfg.LimitCPU, + Memory: cfg.LimitMemory, + GPUUsage: cfg.LimitGPU.Usage, + GPUEncoder: cfg.LimitGPU.Encoder, + GPUDecoder: cfg.LimitGPU.Decoder, + GPUMemory: cfg.LimitGPU.Memory, + }) + if err != nil { + return []string{}, err + } + + selectedGPU = res.GPU + } + + if t, hasTask := r.tasks.Load(cfg.ProcessID()); hasTask { + t.SetHWDevice(selectedGPU) + } + config := cfg.Clone() - resolveDynamicPlaceholder(config, r.replace) + resolveDynamicPlaceholder(config, r.replace, map[string]string{ + "hwdevice": fmt.Sprintf("%d", selectedGPU), + }, map[string]string{ + "timestamp": time.Now().UTC().Format(time.RFC3339), + }) _, err := validateConfig(config, r.fs.list, r.ffmpeg) if err != nil { - return []string{} + return []string{}, err } - return config.CreateCommand() + return config.CreateCommand(), nil } } @@ -1448,7 +1484,11 @@ func (r *restream) Probe(config *app.Config, timeout time.Duration) app.Probe { return probe } - resolveDynamicPlaceholder(config, r.replace) + resolveDynamicPlaceholder(config, r.replace, map[string]string{ + 
"hwdevice": "0", + }, map[string]string{ + "timestamp": time.Now().UTC().Format(time.RFC3339), + }) _, err = validateConfig(config, r.fs.list, r.ffmpeg) if err != nil { @@ -1712,22 +1752,26 @@ func resolveStaticPlaceholders(config *app.Config, r replace.Replacer) { // resolveDynamicPlaceholder replaces placeholders in the config that should be replaced at process start. // The config will be modified in place. -func resolveDynamicPlaceholder(config *app.Config, r replace.Replacer) { - vars := map[string]string{ - "timestamp": time.Now().UTC().Format(time.RFC3339), - } +func resolveDynamicPlaceholder(config *app.Config, r replace.Replacer, values map[string]string, vars map[string]string) { + placeholders := []string{"date", "hwdevice"} for i, option := range config.Options { - option = r.Replace(option, "date", "", vars, config, "global") + for _, placeholder := range placeholders { + option = r.Replace(option, placeholder, values[placeholder], vars, config, "global") + } config.Options[i] = option } for i, input := range config.Input { - input.Address = r.Replace(input.Address, "date", "", vars, config, "input") + for _, placeholder := range placeholders { + input.Address = r.Replace(input.Address, placeholder, values[placeholder], vars, config, "input") + } for j, option := range input.Options { - option = r.Replace(option, "date", "", vars, config, "input") + for _, placeholder := range placeholders { + option = r.Replace(option, placeholder, values[placeholder], vars, config, "input") + } input.Options[j] = option } @@ -1736,16 +1780,22 @@ func resolveDynamicPlaceholder(config *app.Config, r replace.Replacer) { } for i, output := range config.Output { - output.Address = r.Replace(output.Address, "date", "", vars, config, "output") + for _, placeholder := range placeholders { + output.Address = r.Replace(output.Address, placeholder, values[placeholder], vars, config, "output") + } for j, option := range output.Options { - option = r.Replace(option, "date", "", 
vars, config, "output") + for _, placeholder := range placeholders { + option = r.Replace(option, placeholder, values[placeholder], vars, config, "output") + } output.Options[j] = option } for j, cleanup := range output.Cleanup { - cleanup.Pattern = r.Replace(cleanup.Pattern, "date", "", vars, config, "output") + for _, placeholder := range placeholders { + cleanup.Pattern = r.Replace(cleanup.Pattern, placeholder, values[placeholder], vars, config, "output") + } output.Cleanup[j] = cleanup } diff --git a/restream/core_test.go b/restream/core_test.go index 3d9e1a68..48d79d89 100644 --- a/restream/core_test.go +++ b/restream/core_test.go @@ -1261,7 +1261,7 @@ func TestReplacer(t *testing.T) { require.Equal(t, wantprocess, process) - resolveDynamicPlaceholder(process, replacer) + resolveDynamicPlaceholder(process, replacer, nil, nil) wantprocess.Input = []app.ConfigIO{ { @@ -1531,7 +1531,7 @@ func TestProcessLimit(t *testing.T) { status := task.ffmpeg.Status() - ncpu, err := psutil.CPUCounts(true) + ncpu, err := psutil.CPUCounts() require.NoError(t, err) require.Equal(t, ncpu*process.LimitCPU, status.CPU.Limit) diff --git a/restream/task.go b/restream/task.go index 3073b506..40cb74c4 100644 --- a/restream/task.go +++ b/restream/task.go @@ -3,6 +3,7 @@ package restream import ( "errors" "maps" + "sync/atomic" "time" "github.com/datarhei/core/v16/ffmpeg/parse" @@ -31,7 +32,8 @@ type task struct { parser parse.Parser playout map[string]int logger log.Logger - usesDisk bool // Whether this task uses the disk + usesDisk bool // Whether this task uses the disk + hwdevice atomic.Int32 // Index of the GPU this task uses metadata map[string]interface{} lock *xsync.RBMutex @@ -234,8 +236,47 @@ func (t *task) State() (*app.State, error) { state.Memory = status.Memory.Current state.CPU = status.CPU.Current / status.CPU.NCPU state.LimitMode = status.LimitMode - state.Resources.CPU = status.CPU - state.Resources.Memory = status.Memory + state.Resources.CPU = app.ProcessUsageCPU{ + 
NCPU: status.CPU.NCPU, + Current: status.CPU.Current, + Average: status.CPU.Average, + Max: status.CPU.Max, + Limit: status.CPU.Limit, + IsThrottling: status.CPU.IsThrottling, + } + state.Resources.Memory = app.ProcessUsageMemory{ + Current: status.Memory.Current, + Average: status.Memory.Average, + Max: status.Memory.Max, + Limit: status.Memory.Limit, + } + state.Resources.GPU = app.ProcessUsageGPU{ + Index: status.GPU.Index, + Usage: app.ProcessUsageGPUUsage{ + Current: status.GPU.Usage.Current, + Average: status.GPU.Usage.Average, + Max: status.GPU.Usage.Max, + Limit: status.GPU.Usage.Limit, + }, + Encoder: app.ProcessUsageGPUUsage{ + Current: status.GPU.Encoder.Current, + Average: status.GPU.Encoder.Average, + Max: status.GPU.Encoder.Max, + Limit: status.GPU.Encoder.Limit, + }, + Decoder: app.ProcessUsageGPUUsage{ + Current: status.GPU.Decoder.Current, + Average: status.GPU.Decoder.Average, + Max: status.GPU.Decoder.Max, + Limit: status.GPU.Decoder.Limit, + }, + Memory: app.ProcessUsageGPUMemory{ + Current: status.GPU.Memory.Current, + Average: status.GPU.Memory.Average, + Max: status.GPU.Memory.Max, + Limit: status.GPU.Memory.Limit, + }, + } state.Duration = status.Duration.Round(10 * time.Millisecond).Seconds() state.Reconnect = -1 state.Command = status.CommandArgs @@ -420,7 +461,7 @@ func (t *task) ExportMetadata() map[string]interface{} { return t.metadata } -func (t *task) Limit(cpu, memory bool) bool { +func (t *task) Limit(cpu, memory, gpu bool) bool { token := t.lock.RLock() defer t.lock.RUnlock(token) @@ -428,11 +469,19 @@ func (t *task) Limit(cpu, memory bool) bool { return false } - t.ffmpeg.Limit(cpu, memory) + t.ffmpeg.Limit(cpu, memory, gpu) return true } +func (t *task) SetHWDevice(index int) { + t.hwdevice.Store(int32(index)) +} + +func (t *task) GetHWDevice() int { + return int(t.hwdevice.Load()) +} + func (t *task) Equal(config *app.Config) bool { token := t.lock.RLock() defer t.lock.RUnlock(token) diff --git a/session/registry_test.go 
b/session/registry_test.go index 7b1d987d..5cba9ec1 100644 --- a/session/registry_test.go +++ b/session/registry_test.go @@ -8,6 +8,7 @@ import ( "time" "github.com/datarhei/core/v16/io/fs" + "github.com/lestrrat-go/strftime" "github.com/stretchr/testify/require" )