diff --git a/app/api/api.go b/app/api/api.go
index fa017a88..042354e4 100644
--- a/app/api/api.go
+++ b/app/api/api.go
@@ -371,9 +371,11 @@ func (a *api) start(ctx context.Context) error {
}
resources, err := resources.New(resources.Config{
- MaxCPU: cfg.Resources.MaxCPUUsage,
- MaxMemory: cfg.Resources.MaxMemoryUsage,
- Logger: a.log.logger.core.WithComponent("Resources"),
+ MaxCPU: cfg.Resources.MaxCPUUsage,
+ MaxMemory: cfg.Resources.MaxMemoryUsage,
+ MaxGPU: cfg.Resources.MaxGPUUsage,
+ MaxGPUMemory: cfg.Resources.MaxGPUMemoryUsage,
+ Logger: a.log.logger.core.WithComponent("Resources"),
})
if err != nil {
return fmt.Errorf("failed to initialize resource manager: %w", err)
diff --git a/cluster/about.go b/cluster/about.go
index 3356faec..60585a38 100644
--- a/cluster/about.go
+++ b/cluster/about.go
@@ -18,18 +18,29 @@ type ClusterRaft struct {
}
type ClusterNodeResources struct {
- IsThrottling bool // Whether this core is currently throttling
- NCPU float64 // Number of CPU on this node
- CPU float64 // Current CPU load, 0-100*ncpu
- CPULimit float64 // Defined CPU load limit, 0-100*ncpu
- CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu
- Mem uint64 // Currently used memory in bytes
- MemLimit uint64 // Defined memory limit in bytes
- MemTotal uint64 // Total available memory in bytes
- MemCore uint64 // Current used memory of the core itself in bytes
+ IsThrottling bool // Whether this core is currently throttling
+ NCPU float64 // Number of CPU on this node
+ CPU float64 // Current CPU load, 0-100*ncpu
+ CPULimit float64 // Defined CPU load limit, 0-100*ncpu
+ CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu
+ Mem uint64 // Currently used memory in bytes
+ MemLimit uint64 // Defined memory limit in bytes
+ MemTotal uint64 // Total available memory in bytes
+ MemCore uint64 // Current used memory of the core itself in bytes
+ GPU []ClusterNodeGPUResources // GPU resources
Error error
}
+type ClusterNodeGPUResources struct {
+ Mem uint64 // Currently used memory in bytes
+ MemLimit uint64 // Defined memory limit in bytes
+ MemTotal uint64 // Total available memory in bytes
+ Usage float64 // Current general usage, 0-100
+ UsageLimit float64 // Defined general usage limit, 0-100
+ Encoder float64 // Current encoder usage, 0-100
+ Decoder float64 // Current decoder usage, 0-100
+}
+
type ClusterNode struct {
ID string
Name string
@@ -157,6 +168,19 @@ func (c *cluster) About() (ClusterAbout, error) {
},
}
+ if len(nodeAbout.Resources.GPU) != 0 {
+ node.Resources.GPU = make([]ClusterNodeGPUResources, len(nodeAbout.Resources.GPU))
+ for i, gpu := range nodeAbout.Resources.GPU {
+ node.Resources.GPU[i].Mem = gpu.Mem
+ node.Resources.GPU[i].MemLimit = gpu.MemLimit
+ node.Resources.GPU[i].MemTotal = gpu.MemTotal
+ node.Resources.GPU[i].Usage = gpu.Usage
+ node.Resources.GPU[i].UsageLimit = gpu.UsageLimit
+ node.Resources.GPU[i].Encoder = gpu.Encoder
+ node.Resources.GPU[i].Decoder = gpu.Decoder
+ }
+ }
+
if s, ok := serversMap[nodeAbout.ID]; ok {
node.Voter = s.Voter
node.Leader = s.Leader
diff --git a/cluster/api.go b/cluster/api.go
index de2f865b..38b21695 100644
--- a/cluster/api.go
+++ b/cluster/api.go
@@ -195,6 +195,19 @@ func (a *api) About(c echo.Context) error {
},
}
+ if len(resources.GPU.GPU) != 0 {
+ about.Resources.GPU = make([]client.AboutResponseGPUResources, len(resources.GPU.GPU))
+ for i, gpu := range resources.GPU.GPU {
+ about.Resources.GPU[i].Mem = gpu.MemoryUsed
+ about.Resources.GPU[i].MemLimit = gpu.MemoryLimit
+ about.Resources.GPU[i].MemTotal = gpu.MemoryTotal
+ about.Resources.GPU[i].Usage = gpu.Usage
+ about.Resources.GPU[i].UsageLimit = gpu.UsageLimit
+ about.Resources.GPU[i].Encoder = gpu.Encoder
+ about.Resources.GPU[i].Decoder = gpu.Decoder
+ }
+ }
+
if err != nil {
about.Resources.Error = err.Error()
}
diff --git a/cluster/client/client.go b/cluster/client/client.go
index 84ab0230..214bf34d 100644
--- a/cluster/client/client.go
+++ b/cluster/client/client.go
@@ -83,17 +83,28 @@ type AboutResponse struct {
Resources AboutResponseResources `json:"resources"`
}
+type AboutResponseGPUResources struct {
+ Mem uint64 `json:"memory_bytes"` // Currently used memory in bytes
+ MemLimit uint64 `json:"memory_limit_bytes"` // Defined memory limit in bytes
+ MemTotal uint64 `json:"memory_total_bytes"` // Total available memory in bytes
+	Usage      float64 `json:"usage"`       // Current general usage, 0-100
+	UsageLimit float64 `json:"usage_limit"` // Defined general usage limit, 0-100
+	Encoder    float64 `json:"encoder"`     // Current encoder usage, 0-100
+	Decoder    float64 `json:"decoder"`     // Current decoder usage, 0-100
+}
+
type AboutResponseResources struct {
- IsThrottling bool `json:"is_throttling"` // Whether this core is currently throttling
- NCPU float64 `json:"ncpu"` // Number of CPU on this node
- CPU float64 `json:"cpu"` // Current CPU load, 0-100*ncpu
- CPULimit float64 `json:"cpu_limit"` // Defined CPU load limit, 0-100*ncpu
- CPUCore float64 `json:"cpu_core"` // Current CPU load of the core itself, 0-100*ncpu
- Mem uint64 `json:"memory_bytes"` // Currently used memory in bytes
- MemLimit uint64 `json:"memory_limit_bytes"` // Defined memory limit in bytes
- MemTotal uint64 `json:"memory_total_bytes"` // Total available memory in bytes
- MemCore uint64 `json:"memory_core_bytes"` // Current used memory of the core itself in bytes
- Error string `json:"error"` // Last error
+ IsThrottling bool `json:"is_throttling"` // Whether this core is currently throttling
+ NCPU float64 `json:"ncpu"` // Number of CPU on this node
+ CPU float64 `json:"cpu"` // Current CPU load, 0-100*ncpu
+ CPULimit float64 `json:"cpu_limit"` // Defined CPU load limit, 0-100*ncpu
+ CPUCore float64 `json:"cpu_core"` // Current CPU load of the core itself, 0-100*ncpu
+ Mem uint64 `json:"memory_bytes"` // Currently used memory in bytes
+ MemLimit uint64 `json:"memory_limit_bytes"` // Defined memory limit in bytes
+ MemTotal uint64 `json:"memory_total_bytes"` // Total available memory in bytes
+ MemCore uint64 `json:"memory_core_bytes"` // Current used memory of the core itself in bytes
+ GPU []AboutResponseGPUResources `json:"gpu"` // Currently used GPU resources
+ Error string `json:"error"` // Last error
}
type SetNodeStateRequest struct {
diff --git a/cluster/leader_rebalance.go b/cluster/leader_rebalance.go
index c583f1ac..3ef2b8f7 100644
--- a/cluster/leader_rebalance.go
+++ b/cluster/leader_rebalance.go
@@ -78,7 +78,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{},
// Mark nodes as throttling where at least one process is still throttling
for _, haveP := range have {
- if haveP.Throttling {
+ if haveP.Resources.Throttling {
resources.Throttling(haveP.NodeID, true)
}
}
@@ -126,7 +126,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{},
continue
}
- if resources.HasNodeEnough(raNodeid, p.Config.LimitCPU, p.Config.LimitMemory) {
+ if resources.HasNodeEnough(raNodeid, ResourcesFromConfig(p.Config)) {
availableNodeid = raNodeid
break
}
@@ -135,7 +135,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{},
// Find the best node with enough resources available.
if len(availableNodeid) == 0 {
- nodes := resources.FindBestNodes(p.Config.LimitCPU, p.Config.LimitMemory)
+ nodes := resources.FindBestNodes(ResourcesFromConfig(p.Config))
for _, nodeid := range nodes {
if nodeid == overloadedNodeid {
continue
@@ -169,7 +169,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{},
processes[i] = p
// Adjust the resources.
- resources.Move(availableNodeid, overloadedNodeid, p.CPU, p.Mem)
+ resources.Move(availableNodeid, overloadedNodeid, ResourcesFromProcess(p.Resources))
// Adjust the reference affinity.
haveReferenceAffinity.Move(p.Config.Reference, p.Config.Domain, overloadedNodeid, availableNodeid)
diff --git a/cluster/leader_relocate.go b/cluster/leader_relocate.go
index dc5a057a..27ab847b 100644
--- a/cluster/leader_relocate.go
+++ b/cluster/leader_relocate.go
@@ -95,7 +95,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[
// Mark nodes as throttling where at least one process is still throttling
for _, haveP := range have {
- if haveP.Throttling {
+ if haveP.Resources.Throttling {
resources.Throttling(haveP.NodeID, true)
}
}
@@ -136,7 +136,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[
if len(targetNodeid) != 0 {
_, hasNode := nodes[targetNodeid]
- if !hasNode || !resources.HasNodeEnough(targetNodeid, process.Config.LimitCPU, process.Config.LimitMemory) {
+ if !hasNode || !resources.HasNodeEnough(targetNodeid, ResourcesFromConfig(process.Config)) {
targetNodeid = ""
}
}
@@ -152,7 +152,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[
continue
}
- if resources.HasNodeEnough(raNodeid, process.Config.LimitCPU, process.Config.LimitMemory) {
+ if resources.HasNodeEnough(raNodeid, ResourcesFromConfig(process.Config)) {
targetNodeid = raNodeid
break
}
@@ -161,7 +161,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[
// Find the best node with enough resources available.
if len(targetNodeid) == 0 {
- nodes := resources.FindBestNodes(process.Config.LimitCPU, process.Config.LimitMemory)
+ nodes := resources.FindBestNodes(ResourcesFromConfig(process.Config))
for _, nodeid := range nodes {
if nodeid == sourceNodeid {
continue
@@ -194,7 +194,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[
opBudget -= 5
// Adjust the resources.
- resources.Move(targetNodeid, sourceNodeid, process.CPU, process.Mem)
+ resources.Move(targetNodeid, sourceNodeid, ResourcesFromProcess(process.Resources))
// Adjust the reference affinity.
haveReferenceAffinity.Move(process.Config.Reference, process.Config.Domain, sourceNodeid, targetNodeid)
diff --git a/cluster/leader_synchronize.go b/cluster/leader_synchronize.go
index b597d78e..c56e4ad8 100644
--- a/cluster/leader_synchronize.go
+++ b/cluster/leader_synchronize.go
@@ -143,7 +143,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
// Mark nodes as throttling where at least one process is still throttling
for _, haveP := range have {
- if haveP.Throttling {
+ if haveP.Resources.Throttling {
resources.Throttling(haveP.NodeID, true)
}
}
@@ -182,7 +182,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
processid: haveP.Config.ProcessID(),
})
- resources.Remove(haveP.NodeID, haveP.CPU, haveP.Mem)
+ resources.Remove(haveP.NodeID, ResourcesFromProcess(haveP.Resources))
continue
}
@@ -219,7 +219,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
})
// Release the resources.
- resources.Remove(haveP.NodeID, haveP.CPU, haveP.Mem)
+ resources.Remove(haveP.NodeID, ResourcesFromProcess(haveP.Resources))
}
}
@@ -229,7 +229,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
for _, haveP := range wantOrderStart {
nodeid := haveP.NodeID
- resources.Add(nodeid, haveP.Config.LimitCPU, haveP.Config.LimitMemory)
+ resources.Add(nodeid, ResourcesFromConfig(haveP.Config))
// TODO: check if the current node has actually enough resources available,
// otherwise it needs to be moved somewhere else. If the node doesn't
@@ -347,7 +347,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
// Try to add the process to a node where other processes with the same reference currently reside.
raNodes := haveReferenceAffinity.Nodes(wantP.Config.Reference, wantP.Config.Domain)
for _, raNodeid := range raNodes {
- if resources.HasNodeEnough(raNodeid, wantP.Config.LimitCPU, wantP.Config.LimitMemory) {
+ if resources.HasNodeEnough(raNodeid, ResourcesFromConfig(wantP.Config)) {
nodeid = raNodeid
break
}
@@ -355,7 +355,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
// Find the node with the most resources available.
if len(nodeid) == 0 {
- nodes := resources.FindBestNodes(wantP.Config.LimitCPU, wantP.Config.LimitMemory)
+ nodes := resources.FindBestNodes(ResourcesFromConfig(wantP.Config))
if len(nodes) > 0 {
nodeid = nodes[0]
}
@@ -372,7 +372,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
opBudget -= 3
// Consume the resources
- resources.Add(nodeid, wantP.Config.LimitCPU, wantP.Config.LimitMemory)
+ resources.Add(nodeid, ResourcesFromConfig(wantP.Config))
reality[pid] = nodeid
diff --git a/cluster/leader_test.go b/cluster/leader_test.go
index 4f1d6bba..af17d9a6 100644
--- a/cluster/leader_test.go
+++ b/cluster/leader_test.go
@@ -193,11 +193,13 @@ func TestSynchronizeOrderStop(t *testing.T) {
have := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@@ -285,11 +287,13 @@ func TestSynchronizeOrderStart(t *testing.T) {
have := []node.Process{
{
- NodeID: "node1",
- Order: "stop",
- State: "finished",
- CPU: 0,
- Mem: 0,
+ NodeID: "node1",
+ Order: "stop",
+ State: "finished",
+ Resources: node.ProcessResources{
+ CPU: 0,
+ Mem: 0,
+ },
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@@ -388,11 +392,13 @@ func TestSynchronizeAddReferenceAffinity(t *testing.T) {
have := []node.Process{
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@@ -490,11 +496,13 @@ func TestSynchronizeAddReferenceAffinityMultiple(t *testing.T) {
have := []node.Process{
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+				Mem: 1,
+ },
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@@ -882,11 +890,13 @@ func TestSynchronizeRemove(t *testing.T) {
have := []node.Process{
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar",
@@ -967,11 +977,13 @@ func TestSynchronizeAddRemove(t *testing.T) {
have := []node.Process{
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar2",
@@ -1064,11 +1076,13 @@ func TestSynchronizeNoUpdate(t *testing.T) {
have := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar",
@@ -1133,11 +1147,13 @@ func TestSynchronizeUpdate(t *testing.T) {
have := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar",
@@ -1217,11 +1233,13 @@ func TestSynchronizeUpdateMetadata(t *testing.T) {
have := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar",
@@ -1313,11 +1331,13 @@ func TestSynchronizeWaitDisconnectedNode(t *testing.T) {
have := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@@ -1397,11 +1417,13 @@ func TestSynchronizeWaitDisconnectedNodeNoWish(t *testing.T) {
have := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@@ -1493,11 +1515,13 @@ func TestSynchronizeWaitDisconnectedNodeUnrealisticWish(t *testing.T) {
have := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@@ -1589,11 +1613,13 @@ func TestSynchronizeTimeoutDisconnectedNode(t *testing.T) {
have := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@@ -1655,22 +1681,26 @@ func TestSynchronizeTimeoutDisconnectedNode(t *testing.T) {
func TestRebalanceNothingToDo(t *testing.T) {
processes := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 35,
- Mem: 20,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 35,
+ Mem: 20,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar2",
@@ -1711,33 +1741,39 @@ func TestRebalanceNothingToDo(t *testing.T) {
func TestRebalanceOverload(t *testing.T) {
processes := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 35,
- Mem: 20,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 35,
+ Mem: 20,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 17,
- Mem: 31,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 17,
+ Mem: 31,
+ },
Runtime: 27,
Config: &app.Config{
ID: "foobar3",
},
},
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar2",
@@ -1806,33 +1842,39 @@ func TestRebalanceOverload(t *testing.T) {
func TestRebalanceSkip(t *testing.T) {
processes := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 35,
- Mem: 20,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 35,
+ Mem: 20,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 17,
- Mem: 31,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 17,
+ Mem: 31,
+ },
Runtime: 27,
Config: &app.Config{
ID: "foobar3",
},
},
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar2",
@@ -1908,22 +1950,26 @@ func TestRebalanceSkip(t *testing.T) {
func TestRebalanceReferenceAffinity(t *testing.T) {
processes := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 1,
Config: &app.Config{
ID: "foobar2",
@@ -1931,11 +1977,13 @@ func TestRebalanceReferenceAffinity(t *testing.T) {
},
},
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar3",
@@ -1943,11 +1991,13 @@ func TestRebalanceReferenceAffinity(t *testing.T) {
},
},
{
- NodeID: "node3",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node3",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar4",
@@ -1955,11 +2005,13 @@ func TestRebalanceReferenceAffinity(t *testing.T) {
},
},
{
- NodeID: "node3",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node3",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar5",
@@ -2048,33 +2100,39 @@ func TestRebalanceReferenceAffinity(t *testing.T) {
func TestRebalanceRelocateTarget(t *testing.T) {
processes := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 35,
- Mem: 20,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 35,
+ Mem: 20,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 17,
- Mem: 31,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 17,
+ Mem: 31,
+ },
Runtime: 27,
Config: &app.Config{
ID: "foobar3",
},
},
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar2",
@@ -2165,33 +2223,39 @@ func TestRebalanceRelocateTarget(t *testing.T) {
func TestRebalanceRelocateAny(t *testing.T) {
processes := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 35,
- Mem: 20,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 35,
+ Mem: 20,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 17,
- Mem: 31,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 17,
+ Mem: 31,
+ },
Runtime: 27,
Config: &app.Config{
ID: "foobar3",
},
},
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 12,
- Mem: 5,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 12,
+ Mem: 5,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar2",
@@ -2319,7 +2383,10 @@ func TestFindBestNodesForProcess(t *testing.T) {
resources := NewResourcePlanner(nodes)
- list := resources.FindBestNodes(35, 20)
+ list := resources.FindBestNodes(Resources{
+ CPU: 35,
+ Mem: 20,
+ })
require.Equal(t, []string{"node3", "node2", "node1"}, list)
}
@@ -2433,7 +2500,10 @@ func TestFindBestNodesForProcess2(t *testing.T) {
},
}
- list := resources.FindBestNodes(4.0, 45*1024*1024)
+ list := resources.FindBestNodes(Resources{
+ CPU: 4.0,
+ Mem: 45 * 1024 * 1024,
+ })
require.Equal(t, []string{"node10", "node8", "node7", "node1", "node5", "node12", "node4", "node3", "node13", "node6", "node11", "node2"}, list)
}
@@ -2441,11 +2511,13 @@ func TestFindBestNodesForProcess2(t *testing.T) {
func TestCreateNodeProcessMap(t *testing.T) {
processes := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "finished",
- CPU: 1,
- Mem: 1,
+ NodeID: "node1",
+ Order: "start",
+ State: "finished",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 1,
Config: &app.Config{
ID: "foobar7",
@@ -2453,11 +2525,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
- NodeID: "node1",
- Order: "start",
- State: "failed",
- CPU: 1,
- Mem: 1,
+ NodeID: "node1",
+ Order: "start",
+ State: "failed",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 1,
Config: &app.Config{
ID: "foobar8",
@@ -2465,22 +2539,26 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 1,
Config: &app.Config{
ID: "foobar2",
@@ -2488,11 +2566,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 67,
Config: &app.Config{
ID: "foobar3",
@@ -2500,11 +2580,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar6",
@@ -2512,11 +2594,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
- NodeID: "node3",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node3",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 41,
Config: &app.Config{
ID: "foobar4",
@@ -2524,11 +2608,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
- NodeID: "node3",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node3",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar5",
@@ -2542,11 +2628,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
require.Equal(t, map[string][]node.Process{
"node1": {
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 1,
Config: &app.Config{
ID: "foobar2",
@@ -2554,11 +2642,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
@@ -2567,11 +2657,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
"node2": {
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar6",
@@ -2579,11 +2671,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 67,
Config: &app.Config{
ID: "foobar3",
@@ -2593,11 +2687,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
"node3": {
{
- NodeID: "node3",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node3",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 41,
Config: &app.Config{
ID: "foobar4",
@@ -2605,11 +2701,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
- NodeID: "node3",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node3",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar5",
@@ -2623,22 +2721,26 @@ func TestCreateNodeProcessMap(t *testing.T) {
func TestCreateReferenceAffinityNodeMap(t *testing.T) {
processes := []node.Process{
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
- NodeID: "node1",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node1",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 1,
Config: &app.Config{
ID: "foobar2",
@@ -2646,11 +2748,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) {
},
},
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar3",
@@ -2658,11 +2762,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) {
},
},
{
- NodeID: "node2",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node2",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar3",
@@ -2670,11 +2776,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) {
},
},
{
- NodeID: "node3",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node3",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar4",
@@ -2682,11 +2790,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) {
},
},
{
- NodeID: "node3",
- Order: "start",
- State: "running",
- CPU: 1,
- Mem: 1,
+ NodeID: "node3",
+ Order: "start",
+ State: "running",
+ Resources: node.ProcessResources{
+ CPU: 1,
+ Mem: 1,
+ },
Runtime: 42,
Config: &app.Config{
ID: "foobar5",
diff --git a/cluster/node/core.go b/cluster/node/core.go
index 5341db06..9dc87e87 100644
--- a/cluster/node/core.go
+++ b/cluster/node/core.go
@@ -747,16 +747,62 @@ func (n *Core) MediaGetInfo(prefix, path string) (int64, time.Time, error) {
}
type Process struct {
- NodeID string
- Order string
- State string
+ NodeID string
+ Order string
+ State string
+ Resources ProcessResources
+ Runtime time.Duration
+ UpdatedAt time.Time
+ Config *app.Config
+ Metadata map[string]interface{}
+}
+
+type ProcessResources struct {
CPU float64 // Current CPU load of this process, 0-100*ncpu
Mem uint64 // Currently consumed memory of this process in bytes
+ GPU ProcessGPUResources
Throttling bool
- Runtime time.Duration
- UpdatedAt time.Time
- Config *app.Config
- Metadata map[string]interface{}
+}
+
+type ProcessGPUResources struct {
+ Index int // GPU number
+ Usage float64 // Current GPU load, 0-100
+ Encoder float64 // Current GPU encoder load, 0-100
+ Decoder float64 // Current GPU decoder load, 0-100
+ Mem uint64 // Currently consumed GPU memory of this process in bytes
+}
+
+func (p *ProcessResources) Marshal(a *api.ProcessUsage) {
+ p.Throttling = a.CPU.IsThrottling
+
+ if x, err := a.CPU.Current.Float64(); err == nil {
+ p.CPU = x
+ } else {
+ p.CPU = 0
+ }
+
+ p.Mem = a.Memory.Current
+
+ if x, err := a.GPU.Usage.Current.Float64(); err == nil {
+ p.GPU.Usage = x
+ } else {
+ p.GPU.Usage = 0
+ }
+
+ if x, err := a.GPU.Encoder.Current.Float64(); err == nil {
+ p.GPU.Encoder = x
+ } else {
+ p.GPU.Encoder = 0
+ }
+
+ if x, err := a.GPU.Decoder.Current.Float64(); err == nil {
+ p.GPU.Decoder = x
+ } else {
+ p.GPU.Decoder = 0
+ }
+
+ p.GPU.Mem = a.GPU.Memory.Current
+ p.GPU.Index = a.GPU.Index
}
func (n *Core) ClusterProcessList() ([]Process, error) {
@@ -780,21 +826,15 @@ func (n *Core) ClusterProcessList() ([]Process, error) {
p.Config = &api.ProcessConfig{}
}
- cpu, err := p.State.Resources.CPU.Current.Float64()
- if err != nil {
- cpu = 0
+ process := Process{
+ NodeID: nodeid,
+ Order: p.State.Order,
+ State: p.State.State,
+ Runtime: time.Duration(p.State.Runtime) * time.Second,
+ UpdatedAt: time.Unix(p.UpdatedAt, 0),
}
- process := Process{
- NodeID: nodeid,
- Order: p.State.Order,
- State: p.State.State,
- Mem: p.State.Resources.Memory.Current,
- CPU: cpu,
- Throttling: p.State.Resources.CPU.IsThrottling,
- Runtime: time.Duration(p.State.Runtime) * time.Second,
- UpdatedAt: time.Unix(p.UpdatedAt, 0),
- }
+ process.Resources.Marshal(&p.State.Resources)
config, _ := p.Config.Marshal()
diff --git a/cluster/node/node.go b/cluster/node/node.go
index 078da13a..c1daf191 100644
--- a/cluster/node/node.go
+++ b/cluster/node/node.go
@@ -138,17 +138,28 @@ type About struct {
Resources Resources
}
+type ResourcesGPU struct {
+ Mem uint64 // Currently used memory in bytes
+ MemLimit uint64 // Defined memory limit in bytes
+ MemTotal uint64 // Total available memory in bytes
+ Usage float64 // Current general usage, 0-100
+ UsageLimit float64 // Defined general usage limit, 0-100
+ Encoder float64 // Current encoder usage, 0-100
+ Decoder float64 // Current decoder usage, 0-100
+}
+
type Resources struct {
- IsThrottling bool // Whether this core is currently throttling
- NCPU float64 // Number of CPU on this node
- CPU float64 // Current CPU load, 0-100*ncpu
- CPULimit float64 // Defined CPU load limit, 0-100*ncpu
- CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu
- Mem uint64 // Currently used memory in bytes
- MemLimit uint64 // Defined memory limit in bytes
- MemTotal uint64 // Total available memory in bytes
- MemCore uint64 // Current used memory of the core itself in bytes
- Error error // Last error
+ IsThrottling bool // Whether this core is currently throttling
+ NCPU float64 // Number of CPU on this node
+ CPU float64 // Current CPU load, 0-100*ncpu
+ CPULimit float64 // Defined CPU load limit, 0-100*ncpu
+ CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu
+ Mem uint64 // Currently used memory in bytes
+ MemLimit uint64 // Defined memory limit in bytes
+ MemTotal uint64 // Total available memory in bytes
+ MemCore uint64 // Current used memory of the core itself in bytes
+ GPU []ResourcesGPU // Currently used GPU resources
+ Error error // Last error
}
func (n *Node) About() About {
@@ -514,6 +525,20 @@ func (n *Node) ping(ctx context.Context, interval time.Duration) {
Error: nil,
},
}
+
+ if len(about.Resources.GPU) != 0 {
+ n.nodeAbout.Resources.GPU = make([]ResourcesGPU, len(about.Resources.GPU))
+ for i, gpu := range about.Resources.GPU {
+ n.nodeAbout.Resources.GPU[i].Mem = gpu.Mem
+ n.nodeAbout.Resources.GPU[i].MemLimit = gpu.MemLimit
+ n.nodeAbout.Resources.GPU[i].MemTotal = gpu.MemTotal
+ n.nodeAbout.Resources.GPU[i].Usage = gpu.Usage
+ n.nodeAbout.Resources.GPU[i].UsageLimit = gpu.UsageLimit
+ n.nodeAbout.Resources.GPU[i].Encoder = gpu.Encoder
+ n.nodeAbout.Resources.GPU[i].Decoder = gpu.Decoder
+ }
+ }
+
if len(about.Resources.Error) != 0 {
n.nodeAbout.Resources.Error = errors.New(about.Resources.Error)
}
diff --git a/cluster/resources.go b/cluster/resources.go
index 2b5bb2c9..cc81b828 100644
--- a/cluster/resources.go
+++ b/cluster/resources.go
@@ -4,8 +4,69 @@ import (
"sort"
"github.com/datarhei/core/v16/cluster/node"
+ "github.com/datarhei/core/v16/restream/app"
)
+type Resources struct {
+ CPU float64 // CPU 0-100*ncpu
+ Mem uint64 // Memory in bytes
+ GPU ResourcesGPU // GPU resources
+}
+
+type ResourcesGPU struct {
+ Index int // GPU number
+ Usage float64 // GPU general, 0-100
+ Encoder float64 // GPU encoder, 0-100
+ Decoder float64 // GPU decoder, 0-100
+ Mem uint64 // GPU memory in bytes
+}
+
+func ResourcesFromConfig(c *app.Config) Resources {
+ r := Resources{}
+ r.MarshalConfig(c)
+ return r
+}
+
+func ResourcesFromProcess(c node.ProcessResources) Resources {
+ r := Resources{}
+ r.MarshalProcess(c)
+ return r
+}
+
+func (r *Resources) MarshalConfig(c *app.Config) {
+ r.CPU = c.LimitCPU
+ r.Mem = c.LimitMemory
+ r.GPU.Usage = c.LimitGPU.Usage
+ r.GPU.Encoder = c.LimitGPU.Encoder
+ r.GPU.Decoder = c.LimitGPU.Decoder
+ r.GPU.Index = -1
+}
+
+func (r *Resources) MarshalProcess(c node.ProcessResources) {
+ r.CPU = c.CPU
+ r.Mem = c.Mem
+ r.GPU.Usage = c.GPU.Usage
+ r.GPU.Encoder = c.GPU.Encoder
+ r.GPU.Decoder = c.GPU.Decoder
+ r.GPU.Index = c.GPU.Index
+}
+
+func (r *Resources) HasGPU() bool {
+ if r.GPU.Usage > 0 || r.GPU.Encoder > 0 || r.GPU.Decoder > 0 || r.GPU.Mem > 0 {
+ return true
+ }
+
+ return false
+}
+
+func (r *Resources) DoesFitGPU(g node.ResourcesGPU) bool {
+ if g.Usage+r.GPU.Usage < g.UsageLimit && g.Encoder+r.GPU.Encoder < g.UsageLimit && g.Decoder+r.GPU.Decoder < g.UsageLimit && g.Mem+r.GPU.Mem < g.MemLimit {
+ return true
+ }
+
+ return false
+}
+
type resourcePlanner struct {
nodes map[string]node.Resources
blocked map[string]struct{}
@@ -39,8 +100,8 @@ func (r *resourcePlanner) Throttling(nodeid string, throttling bool) {
}
// HasNodeEnough returns whether a node has enough resources available for the
-// requested cpu and memory consumption.
-func (r *resourcePlanner) HasNodeEnough(nodeid string, cpu float64, mem uint64) bool {
+// requested cpu, memory, and gpu consumption.
+func (r *resourcePlanner) HasNodeEnough(nodeid string, req Resources) bool {
res, hasNode := r.nodes[nodeid]
if !hasNode {
return false
@@ -50,20 +111,39 @@ func (r *resourcePlanner) HasNodeEnough(nodeid string, cpu float64, mem uint64)
return false
}
- if res.Error == nil && res.CPU+cpu < res.CPULimit && res.Mem+mem < res.MemLimit && !res.IsThrottling {
- return true
+ if res.Error != nil || res.IsThrottling {
+ return false
}
- return false
+ if res.CPU+req.CPU >= res.CPULimit || res.Mem+req.Mem >= res.MemLimit {
+ return false
+ }
+
+ if req.HasGPU() {
+ found := false
+
+ for _, g := range res.GPU {
+ if req.DoesFitGPU(g) {
+ found = true
+ break
+ }
+ }
+
+ if !found {
+ return false
+ }
+ }
+
+ return true
}
-// FindBestNodes returns an array of nodeids that can fit the requested cpu and memory requirements. If no
+// FindBestNodes returns an array of nodeids that can fit the requested cpu, memory, and gpu requirements. If no
// such node is available, an empty array is returned. The array is sorted by the most suitable node first.
-func (r *resourcePlanner) FindBestNodes(cpu float64, mem uint64) []string {
+func (r *resourcePlanner) FindBestNodes(req Resources) []string {
nodes := []string{}
for id := range r.nodes {
- if r.HasNodeEnough(id, cpu, mem) {
+ if r.HasNodeEnough(id, req) {
nodes = append(nodes, id)
}
}
@@ -81,43 +161,72 @@ func (r *resourcePlanner) FindBestNodes(cpu float64, mem uint64) []string {
return nodes
}
-// Add adds the resources of the node according to the cpu and memory utilization.
-func (r *resourcePlanner) Add(nodeid string, cpu float64, mem uint64) {
+// Add adds the resources of the node according to the cpu, memory, and gpu utilization.
+func (r *resourcePlanner) Add(nodeid string, req Resources) {
res, hasRes := r.nodes[nodeid]
if !hasRes {
return
}
- res.CPU += cpu
- res.Mem += mem
+ res.CPU += req.CPU
+ res.Mem += req.Mem
+
+ if req.HasGPU() {
+ for i, g := range res.GPU {
+ if req.DoesFitGPU(g) {
+ g.Usage += req.GPU.Usage
+ g.Encoder += req.GPU.Encoder
+ g.Decoder += req.GPU.Decoder
+ g.Mem += req.GPU.Mem
+ res.GPU[i] = g
+ break
+ }
+ }
+ }
+
r.nodes[nodeid] = res
}
-// Remove subtracts the resources from the node according to the cpu and memory utilization.
-func (r *resourcePlanner) Remove(nodeid string, cpu float64, mem uint64) {
+// Remove subtracts the resources from the node according to the cpu, memory, and gpu utilization.
+func (r *resourcePlanner) Remove(nodeid string, req Resources) {
 res, hasRes := r.nodes[nodeid]
 if !hasRes {
 return
 }
- res.CPU -= cpu
- if res.CPU < 0 {
- res.CPU = 0
- }
- if mem >= res.Mem {
- res.Mem = 0
- } else {
- res.Mem -= mem
+ res.CPU -= min(res.CPU, req.CPU)
+ res.Mem -= min(res.Mem, req.Mem)
+
+ if req.HasGPU() {
+ if req.GPU.Index >= 0 && req.GPU.Index < len(res.GPU) {
+ gpu := res.GPU[req.GPU.Index]
+ gpu.Usage -= min(gpu.Usage, req.GPU.Usage)
+ gpu.Encoder -= min(gpu.Encoder, req.GPU.Encoder)
+ gpu.Decoder -= min(gpu.Decoder, req.GPU.Decoder)
+ gpu.Mem -= min(gpu.Mem, req.GPU.Mem)
+ res.GPU[req.GPU.Index] = gpu
+ }
 }
+
 r.nodes[nodeid] = res
 }
// Move adjusts the resources from the target and source node according to the cpu and memory utilization.
-func (r *resourcePlanner) Move(target, source string, cpu float64, mem uint64) {
- r.Add(target, cpu, mem)
- r.Remove(source, cpu, mem)
+func (r *resourcePlanner) Move(target, source string, req Resources) {
+ r.Add(target, req)
+ r.Remove(source, req)
}
func (r *resourcePlanner) Map() map[string]node.Resources {
return r.nodes
}
+
+func (r *resourcePlanner) Blocked() []string {
+ nodes := []string{}
+
+ for nodeid := range r.blocked {
+ nodes = append(nodes, nodeid)
+ }
+
+ return nodes
+}
diff --git a/cluster/resources_test.go b/cluster/resources_test.go
new file mode 100644
index 00000000..2f938a31
--- /dev/null
+++ b/cluster/resources_test.go
@@ -0,0 +1,603 @@
+package cluster
+
+import (
+ "testing"
+
+ "github.com/datarhei/core/v16/cluster/node"
+ "github.com/stretchr/testify/require"
+)
+
+func TestResources(t *testing.T) {
+ r := Resources{
+ CPU: 1,
+ Mem: 1,
+ }
+
+ require.False(t, r.HasGPU())
+
+ r.GPU = ResourcesGPU{
+ Index: 0,
+ Usage: 1,
+ Encoder: 0,
+ Decoder: 0,
+ Mem: 1,
+ }
+
+ require.True(t, r.HasGPU())
+}
+
+func TestResourcePlanner(t *testing.T) {
+ nodes := map[string]node.About{
+ "node1": {
+ State: "online",
+ Resources: node.Resources{
+ NCPU: 1,
+ CPU: 7,
+ Mem: 35,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ },
+ "node2": {
+ State: "online",
+ Resources: node.Resources{
+ NCPU: 1,
+ CPU: 85,
+ Mem: 11,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ },
+ }
+
+ planner := NewResourcePlanner(nodes)
+
+ require.Equal(t, map[string]node.Resources{
+ "node1": {
+ NCPU: 1,
+ CPU: 7,
+ Mem: 35,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ "node2": {
+ NCPU: 1,
+ CPU: 85,
+ Mem: 11,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ }, planner.Map())
+}
+
+func TestResourcePlannerBlocked(t *testing.T) {
+ nodes := map[string]node.About{
+ "node1": {
+ State: "degraded",
+ Resources: node.Resources{
+ NCPU: 1,
+ CPU: 7,
+ Mem: 35,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ },
+ "node2": {
+ State: "online",
+ Resources: node.Resources{
+ NCPU: 1,
+ CPU: 85,
+ Mem: 11,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ },
+ }
+
+ planner := NewResourcePlanner(nodes)
+
+ require.Equal(t, []string{"node1"}, planner.Blocked())
+}
+
+func TestResourcePlannerThrottling(t *testing.T) {
+ nodes := map[string]node.About{
+ "node1": {
+ State: "online",
+ Resources: node.Resources{
+ NCPU: 1,
+ CPU: 7,
+ Mem: 35,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ },
+ "node2": {
+ State: "online",
+ Resources: node.Resources{
+ NCPU: 1,
+ CPU: 85,
+ Mem: 11,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ },
+ }
+
+ planner := NewResourcePlanner(nodes)
+
+ require.True(t, planner.HasNodeEnough("node1", Resources{
+ CPU: 30,
+ Mem: 5,
+ }))
+
+ planner.Throttling("node1", true)
+
+ require.False(t, planner.HasNodeEnough("node1", Resources{
+ CPU: 30,
+ Mem: 5,
+ }))
+
+ planner.Throttling("node1", false)
+
+ require.True(t, planner.HasNodeEnough("node1", Resources{
+ CPU: 30,
+ Mem: 5,
+ }))
+}
+
+func TestResourcePlannerHasNodeEnough(t *testing.T) {
+ nodes := map[string]node.About{
+ "node1": {
+ State: "online",
+ Resources: node.Resources{
+ NCPU: 1,
+ CPU: 7,
+ Mem: 35,
+ CPULimit: 90,
+ MemLimit: 90,
+ GPU: []node.ResourcesGPU{
+ {
+ Mem: 5,
+ MemLimit: 90,
+ Usage: 53,
+ UsageLimit: 90,
+ Encoder: 32,
+ Decoder: 26,
+ },
+ {
+ Mem: 85,
+ MemLimit: 90,
+ Usage: 64,
+ UsageLimit: 90,
+ Encoder: 43,
+ Decoder: 12,
+ },
+ },
+ },
+ },
+ "node2": {
+ State: "online",
+ Resources: node.Resources{
+ NCPU: 1,
+ CPU: 85,
+ Mem: 11,
+ CPULimit: 90,
+ MemLimit: 90,
+ GPU: []node.ResourcesGPU{
+ {
+ Mem: 5,
+ MemLimit: 90,
+ Usage: 53,
+ UsageLimit: 90,
+ Encoder: 32,
+ Decoder: 26,
+ },
+ },
+ },
+ },
+ }
+
+ planner := NewResourcePlanner(nodes)
+
+ require.True(t, planner.HasNodeEnough("node1", Resources{
+ CPU: 30,
+ Mem: 5,
+ }))
+
+ require.False(t, planner.HasNodeEnough("node2", Resources{
+ CPU: 30,
+ Mem: 5,
+ }))
+
+ require.True(t, planner.HasNodeEnough("node1", Resources{
+ CPU: 30,
+ Mem: 5,
+ GPU: ResourcesGPU{
+ Usage: 0,
+ Encoder: 0,
+ Decoder: 0,
+ Mem: 50,
+ },
+ }))
+
+ require.False(t, planner.HasNodeEnough("node1", Resources{
+ CPU: 30,
+ Mem: 5,
+ GPU: ResourcesGPU{
+ Usage: 0,
+ Encoder: 0,
+ Decoder: 0,
+ Mem: 86,
+ },
+ }))
+
+ require.True(t, planner.HasNodeEnough("node1", Resources{
+ CPU: 30,
+ Mem: 5,
+ GPU: ResourcesGPU{
+ Usage: 0,
+ Encoder: 50,
+ Decoder: 0,
+ Mem: 50,
+ },
+ }))
+}
+
+func TestResourcePlannerAdd(t *testing.T) {
+ nodes := map[string]node.About{
+ "node1": {
+ State: "online",
+ Resources: node.Resources{
+ NCPU: 1,
+ CPU: 7,
+ Mem: 35,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ },
+ }
+
+ planner := NewResourcePlanner(nodes)
+
+ planner.Add("node1", Resources{
+ CPU: 42,
+ Mem: 33,
+ })
+
+ require.Equal(t, map[string]node.Resources{
+ "node1": {
+ NCPU: 1,
+ CPU: 49,
+ Mem: 68,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ }, planner.Map())
+}
+
+func TestResourcePlannerNoGPUAddGPU(t *testing.T) {
+ nodes := map[string]node.About{
+ "node1": {
+ State: "online",
+ Resources: node.Resources{
+ NCPU: 1,
+ CPU: 7,
+ Mem: 35,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ },
+ }
+
+ planner := NewResourcePlanner(nodes)
+
+ planner.Add("node1", Resources{
+ CPU: 42,
+ Mem: 33,
+ GPU: ResourcesGPU{
+ Index: 0,
+ Usage: 1,
+ Encoder: 2,
+ Decoder: 3,
+ Mem: 4,
+ },
+ })
+
+ require.Equal(t, map[string]node.Resources{
+ "node1": {
+ NCPU: 1,
+ CPU: 49,
+ Mem: 68,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ }, planner.Map())
+}
+
+func TestResourcePlannerAddGPU(t *testing.T) {
+ nodes := map[string]node.About{
+ "node1": {
+ State: "online",
+ Resources: node.Resources{
+ NCPU: 1,
+ CPU: 7,
+ Mem: 35,
+ CPULimit: 90,
+ MemLimit: 90,
+ GPU: []node.ResourcesGPU{
+ {
+ Mem: 0,
+ MemLimit: 0,
+ Usage: 0,
+ UsageLimit: 0,
+ Encoder: 0,
+ Decoder: 0,
+ },
+ {
+ Mem: 0,
+ MemLimit: 100,
+ Usage: 0,
+ UsageLimit: 100,
+ Encoder: 0,
+ Decoder: 0,
+ },
+ },
+ },
+ },
+ }
+
+ planner := NewResourcePlanner(nodes)
+
+ planner.Add("node1", Resources{
+ CPU: 42,
+ Mem: 33,
+ GPU: ResourcesGPU{
+ Usage: 1,
+ Encoder: 2,
+ Decoder: 3,
+ Mem: 4,
+ },
+ })
+
+ require.Equal(t, map[string]node.Resources{
+ "node1": {
+ NCPU: 1,
+ CPU: 49,
+ Mem: 68,
+ CPULimit: 90,
+ MemLimit: 90,
+ GPU: []node.ResourcesGPU{
+ {
+ Mem: 0,
+ MemLimit: 0,
+ Usage: 0,
+ UsageLimit: 0,
+ Encoder: 0,
+ Decoder: 0,
+ },
+ {
+ Mem: 4,
+ MemLimit: 100,
+ Usage: 1,
+ UsageLimit: 100,
+ Encoder: 2,
+ Decoder: 3,
+ },
+ },
+ },
+ }, planner.Map())
+}
+
+func TestResourcePlannerRemove(t *testing.T) {
+ nodes := map[string]node.About{
+ "node1": {
+ State: "online",
+ Resources: node.Resources{
+ NCPU: 1,
+ CPU: 53,
+ Mem: 35,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ },
+ }
+
+ planner := NewResourcePlanner(nodes)
+
+ planner.Remove("node1", Resources{
+ CPU: 13,
+ Mem: 20,
+ })
+
+ require.Equal(t, map[string]node.Resources{
+ "node1": {
+ NCPU: 1,
+ CPU: 40,
+ Mem: 15,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ }, planner.Map())
+}
+
+func TestResourcePlannerRemoveTooMuch(t *testing.T) {
+ nodes := map[string]node.About{
+ "node1": {
+ State: "online",
+ Resources: node.Resources{
+ NCPU: 1,
+ CPU: 53,
+ Mem: 35,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ },
+ }
+
+ planner := NewResourcePlanner(nodes)
+
+ planner.Remove("node1", Resources{
+ CPU: 100,
+ Mem: 100,
+ })
+
+ require.Equal(t, map[string]node.Resources{
+ "node1": {
+ NCPU: 1,
+ CPU: 0,
+ Mem: 0,
+ CPULimit: 90,
+ MemLimit: 90,
+ },
+ }, planner.Map())
+}
+
+func TestResourcePlannerRemoveGPU(t *testing.T) {
+ nodes := map[string]node.About{
+ "node1": {
+ State: "online",
+ Resources: node.Resources{
+ NCPU: 1,
+ CPU: 53,
+ Mem: 35,
+ CPULimit: 90,
+ MemLimit: 90,
+ GPU: []node.ResourcesGPU{
+ {
+ Mem: 4,
+ MemLimit: 100,
+ Usage: 1,
+ UsageLimit: 100,
+ Encoder: 2,
+ Decoder: 3,
+ },
+ {
+ Mem: 23,
+ MemLimit: 100,
+ Usage: 43,
+ UsageLimit: 100,
+ Encoder: 95,
+ Decoder: 12,
+ },
+ },
+ },
+ },
+ }
+
+ planner := NewResourcePlanner(nodes)
+
+ planner.Remove("node1", Resources{
+ CPU: 13,
+ Mem: 20,
+ GPU: ResourcesGPU{
+ Index: 1,
+ Usage: 3,
+ Encoder: 40,
+ Decoder: 0,
+ Mem: 5,
+ },
+ })
+
+ require.Equal(t, map[string]node.Resources{
+ "node1": {
+ NCPU: 1,
+ CPU: 40,
+ Mem: 15,
+ CPULimit: 90,
+ MemLimit: 90,
+ GPU: []node.ResourcesGPU{
+ {
+ Mem: 4,
+ MemLimit: 100,
+ Usage: 1,
+ UsageLimit: 100,
+ Encoder: 2,
+ Decoder: 3,
+ },
+ {
+ Mem: 18,
+ MemLimit: 100,
+ Usage: 40,
+ UsageLimit: 100,
+ Encoder: 55,
+ Decoder: 12,
+ },
+ },
+ },
+ }, planner.Map())
+}
+
+func TestResourcePlannerRemoveGPUTooMuch(t *testing.T) {
+ nodes := map[string]node.About{
+ "node1": {
+ State: "online",
+ Resources: node.Resources{
+ NCPU: 1,
+ CPU: 53,
+ Mem: 35,
+ CPULimit: 90,
+ MemLimit: 90,
+ GPU: []node.ResourcesGPU{
+ {
+ Mem: 4,
+ MemLimit: 100,
+ Usage: 1,
+ UsageLimit: 100,
+ Encoder: 2,
+ Decoder: 3,
+ },
+ {
+ Mem: 23,
+ MemLimit: 100,
+ Usage: 43,
+ UsageLimit: 100,
+ Encoder: 95,
+ Decoder: 12,
+ },
+ },
+ },
+ },
+ }
+
+ planner := NewResourcePlanner(nodes)
+
+ planner.Remove("node1", Resources{
+ CPU: 13,
+ Mem: 20,
+ GPU: ResourcesGPU{
+ Index: 1,
+ Usage: 100,
+ Encoder: 100,
+ Decoder: 100,
+ Mem: 100,
+ },
+ })
+
+ require.Equal(t, map[string]node.Resources{
+ "node1": {
+ NCPU: 1,
+ CPU: 40,
+ Mem: 15,
+ CPULimit: 90,
+ MemLimit: 90,
+ GPU: []node.ResourcesGPU{
+ {
+ Mem: 4,
+ MemLimit: 100,
+ Usage: 1,
+ UsageLimit: 100,
+ Encoder: 2,
+ Decoder: 3,
+ },
+ {
+ Mem: 0,
+ MemLimit: 100,
+ Usage: 0,
+ UsageLimit: 100,
+ Encoder: 0,
+ Decoder: 0,
+ },
+ },
+ },
+ }, planner.Map())
+}
diff --git a/config/config.go b/config/config.go
index e0364c03..a878065b 100644
--- a/config/config.go
+++ b/config/config.go
@@ -306,8 +306,10 @@ func (d *Config) init() {
d.vars.Register(value.NewDir(&d.Router.UIPath, "", d.fs), "router.ui_path", "CORE_ROUTER_UI_PATH", nil, "Path to a directory holding UI files mounted as /ui", false, false)
// Resources
- d.vars.Register(value.NewFloat(&d.Resources.MaxCPUUsage, 0), "resources.max_cpu_usage", "CORE_RESOURCES_MAX_CPU_USAGE", nil, "Maximum system CPU usage in percent, from 0 (no limit) to 100", false, false)
- d.vars.Register(value.NewFloat(&d.Resources.MaxMemoryUsage, 0), "resources.max_memory_usage", "CORE_RESOURCES_MAX_MEMORY_USAGE", nil, "Maximum system usage in percent, from 0 (no limit) to 100", false, false)
+ d.vars.Register(value.NewFloatRange(&d.Resources.MaxCPUUsage, 0, 0, 100), "resources.max_cpu_usage", "CORE_RESOURCES_MAX_CPU_USAGE", nil, "Maximum system CPU usage in percent, from 0 (no limit) to 100", false, false)
+ d.vars.Register(value.NewFloatRange(&d.Resources.MaxMemoryUsage, 0, 0, 100), "resources.max_memory_usage", "CORE_RESOURCES_MAX_MEMORY_USAGE", nil, "Maximum system usage in percent, from 0 (no limit) to 100", false, false)
+ d.vars.Register(value.NewFloatRange(&d.Resources.MaxGPUUsage, 0, 0, 100), "resources.max_gpu_usage", "CORE_RESOURCES_MAX_GPU_USAGE", nil, "Maximum general, encoder, and decoder GPU usage in percent per GPU, from 0 (no limit) to 100", false, false)
+ d.vars.Register(value.NewFloatRange(&d.Resources.MaxGPUMemoryUsage, 0, 0, 100), "resources.max_gpu_memory_usage", "CORE_RESOURCES_MAX_GPU_MEMORY_USAGE", nil, "Maximum GPU memory usage in percent per GPU, from 0 (no limit) to 100", false, false)
// Cluster
d.vars.Register(value.NewBool(&d.Cluster.Enable, false), "cluster.enable", "CORE_CLUSTER_ENABLE", nil, "Enable cluster mode", false, false)
@@ -494,17 +496,6 @@ func (d *Config) Validate(resetLogs bool) {
}
}
- // If resource limits are given, all values must be set
- if d.Resources.MaxCPUUsage > 0 || d.Resources.MaxMemoryUsage > 0 {
- if d.Resources.MaxCPUUsage <= 0 || d.Resources.MaxCPUUsage > 100 {
- d.vars.Log("error", "resources.max_cpu_usage", "must be greater than 0 and smaller or equal to 100")
- }
-
- if d.Resources.MaxMemoryUsage <= 0 {
- d.vars.Log("error", "resources.max_memory_usage", "must be greater than 0 and smaller or equal to 100")
- }
- }
-
// If cluster mode is enabled, a proper address must be provided
if d.Cluster.Enable {
if len(d.Cluster.Address) == 0 {
diff --git a/config/data.go b/config/data.go
index 26c77054..f7f057cf 100644
--- a/config/data.go
+++ b/config/data.go
@@ -184,8 +184,10 @@ type Data struct {
UIPath string `json:"ui_path"`
} `json:"router"`
Resources struct {
- MaxCPUUsage float64 `json:"max_cpu_usage"` // percent 0-100
- MaxMemoryUsage float64 `json:"max_memory_usage"` // percent 0-100
+ MaxCPUUsage float64 `json:"max_cpu_usage"` // percent 0-100
+ MaxMemoryUsage float64 `json:"max_memory_usage"` // percent 0-100
+ MaxGPUUsage float64 `json:"max_gpu_usage"` // percent 0-100
+ MaxGPUMemoryUsage float64 `json:"max_gpu_memory_usage"` // percent 0-100
} `json:"resources"`
Cluster struct {
Enable bool `json:"enable"`
diff --git a/config/value/primitives.go b/config/value/primitives.go
index 4d1258fd..4c1ae54a 100644
--- a/config/value/primitives.go
+++ b/config/value/primitives.go
@@ -1,6 +1,7 @@
package value
import (
+ "fmt"
"sort"
"strconv"
"strings"
@@ -310,3 +311,56 @@ func (u *Float64) Validate() error {
func (u *Float64) IsEmpty() bool {
return float64(*u) == 0
}
+
+// float64 range
+
+type Float64Range struct {
+ p *float64
+ from float64
+ to float64
+}
+
+func NewFloatRange(p *float64, val, from, to float64) *Float64Range {
+ v := &Float64Range{
+ p: p,
+ from: from,
+ to: to,
+ }
+
+ *p = val
+
+ return v
+}
+
+func (s *Float64Range) Set(val string) error {
+ v, err := strconv.ParseFloat(val, 64)
+ if err != nil {
+ return err
+ }
+
+ *s.p = v
+
+ return nil
+}
+
+func (s *Float64Range) String() string {
+ if s.IsEmpty() {
+ return "(empty)"
+ }
+
+ return fmt.Sprintf("%.3f", *s.p)
+}
+
+func (s *Float64Range) Validate() error {
+ val := *s.p
+
+ if val < s.from || val > s.to {
+ return fmt.Errorf("value %f is not in range [%f, %f]", val, s.from, s.to)
+ }
+
+ return nil
+}
+
+func (s *Float64Range) IsEmpty() bool {
+ return *s.p == 0
+}
diff --git a/config/value/primitives_test.go b/config/value/primitives_test.go
index 4406d8b0..2ee865ff 100644
--- a/config/value/primitives_test.go
+++ b/config/value/primitives_test.go
@@ -165,3 +165,29 @@ func TestFloat64Value(t *testing.T) {
require.Equal(t, float64(77.7), x)
}
+
+func TestFloat64RangeValue(t *testing.T) {
+ var x float64
+
+ val := NewFloatRange(&x, 11.1, 0, 100)
+
+ require.Equal(t, "11.100", val.String())
+ require.NoError(t, val.Validate())
+ require.Equal(t, false, val.IsEmpty())
+
+ x = 42.5
+
+ require.Equal(t, "42.500", val.String())
+ require.NoError(t, val.Validate())
+ require.Equal(t, false, val.IsEmpty())
+
+ val.Set("77.7")
+
+ require.Equal(t, float64(77.7), x)
+
+ val.Set("101.9")
+
+ require.Equal(t, "101.900", val.String())
+ require.Error(t, val.Validate())
+ require.Equal(t, false, val.IsEmpty())
+}
diff --git a/ffmpeg/ffmpeg.go b/ffmpeg/ffmpeg.go
index 3b1e9710..1b3c96af 100644
--- a/ffmpeg/ffmpeg.go
+++ b/ffmpeg/ffmpeg.go
@@ -29,23 +29,26 @@ type FFmpeg interface {
}
type ProcessConfig struct {
- Reconnect bool // Whether to reconnect
- ReconnectDelay time.Duration // Duration until next reconnect
- StaleTimeout time.Duration // Duration to wait until killing the process if there is no progress in the process
- Timeout time.Duration // Duration to wait until killing the process
- LimitCPU float64 // Kill the process if the CPU usage in percent is above this value.
- LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value.
- LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration.
- LimitMode string // How to limit the process, "hard" or "soft"
- Scheduler string // A scheduler for starting the process, either a concrete date (RFC3339) or in crontab syntax
- Args []string // Arguments for the process
- Parser process.Parser // Parser for the process output
- Logger log.Logger // Logger
- OnArgs func([]string) []string // Callback before starting the process to retrieve new arguments
- OnBeforeStart func() error // Callback which is called before the process will be started. If error is non-nil, the start will be refused.
- OnStart func() // Callback called after process has been started
- OnExit func(state string) // Callback called after the process stopped with exit state as argument
- OnStateChange func(from, to string) // Callback called on state change
+ Reconnect bool // Whether to reconnect
+ ReconnectDelay time.Duration // Duration until next reconnect
+ StaleTimeout time.Duration // Duration to wait until killing the process if there is no progress in the process
+ Timeout time.Duration // Duration to wait until killing the process
+ LimitCPU float64 // Kill the process if the CPU usage in percent is above this value.
+ LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value.
+ LimitGPUUsage float64 // Kill the process if the GPU usage (general) in percent is above this value.
+ LimitGPUEncoder float64 // Kill the process if the GPU usage (encoder) in percent is above this value.
+ LimitGPUDecoder float64 // Kill the process if the GPU usage (decoder) in percent is above this value.
+ LimitGPUMemory uint64 // Kill the process if the GPU memory consumption in bytes is above this value.
+ LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration.
+ LimitMode string // How to limit the process, "hard" or "soft"
+ Scheduler string // A scheduler for starting the process, either a concrete date (RFC3339) or in crontab syntax
+ Args []string // Arguments for the process
+ Parser process.Parser // Parser for the process output
+ Logger log.Logger // Logger
+ OnBeforeStart func([]string) ([]string, error) // Callback which is called before the process will be started. The string slice is the list of arguments which can be modified. If error is non-nil, the start will be refused.
+ OnStart func() // Callback called after process has been started
+ OnExit func(state string) // Callback called after the process stopped with exit state as argument
+ OnStateChange func(from, to string) // Callback called on state change
}
// Config is the configuration for ffmpeg that is part of the configuration
@@ -138,23 +141,26 @@ func (f *ffmpeg) New(config ProcessConfig) (process.Process, error) {
}
ffmpeg, err := process.New(process.Config{
- Binary: f.binary,
- Args: config.Args,
- Reconnect: config.Reconnect,
- ReconnectDelay: config.ReconnectDelay,
- StaleTimeout: config.StaleTimeout,
- Timeout: config.Timeout,
- LimitCPU: config.LimitCPU,
- LimitMemory: config.LimitMemory,
- LimitDuration: config.LimitDuration,
- LimitMode: limitMode,
- Scheduler: scheduler,
- Parser: config.Parser,
- Logger: config.Logger,
- OnArgs: config.OnArgs,
- OnBeforeStart: config.OnBeforeStart,
- OnStart: config.OnStart,
- OnExit: config.OnExit,
+ Binary: f.binary,
+ Args: config.Args,
+ Reconnect: config.Reconnect,
+ ReconnectDelay: config.ReconnectDelay,
+ StaleTimeout: config.StaleTimeout,
+ Timeout: config.Timeout,
+ LimitCPU: config.LimitCPU,
+ LimitMemory: config.LimitMemory,
+ LimitGPUUsage: config.LimitGPUUsage,
+ LimitGPUEncoder: config.LimitGPUEncoder,
+ LimitGPUDecoder: config.LimitGPUDecoder,
+ LimitGPUMemory: config.LimitGPUMemory,
+ LimitDuration: config.LimitDuration,
+ LimitMode: limitMode,
+ Scheduler: scheduler,
+ Parser: config.Parser,
+ Logger: config.Logger,
+ OnBeforeStart: config.OnBeforeStart,
+ OnStart: config.OnStart,
+ OnExit: config.OnExit,
OnStateChange: func(from, to string) {
f.statesLock.Lock()
switch to {
diff --git a/ffmpeg/parse/parser.go b/ffmpeg/parse/parser.go
index b4912af1..2259159c 100644
--- a/ffmpeg/parse/parser.go
+++ b/ffmpeg/parse/parser.go
@@ -619,7 +619,7 @@ func (p *parser) Stop(state string, pusage process.Usage) {
usage.CPU.Max = pusage.CPU.Max
usage.CPU.Limit = pusage.CPU.Limit
- usage.Memory.Average = pusage.Memory.Average
+ usage.Memory.Average = uint64(pusage.Memory.Average)
usage.Memory.Max = pusage.Memory.Max
usage.Memory.Limit = pusage.Memory.Limit
diff --git a/ffmpeg/parse/types.go b/ffmpeg/parse/types.go
index a3eb31fc..1c98f6e8 100644
--- a/ffmpeg/parse/types.go
+++ b/ffmpeg/parse/types.go
@@ -576,6 +576,7 @@ type AVstream struct {
type Usage struct {
CPU UsageCPU
Memory UsageMemory
+ GPU UsageGPU
}
type UsageCPU struct {
@@ -586,7 +587,27 @@ type UsageCPU struct {
}
type UsageMemory struct {
+ Average uint64
+ Max uint64
+ Limit uint64
+}
+
+type UsageGPU struct {
+ Index int
+ Usage UsageGPUUsage
+ Encoder UsageGPUUsage
+ Decoder UsageGPUUsage
+ Memory UsageGPUMemory
+}
+
+type UsageGPUUsage struct {
Average float64
+ Max float64
+ Limit float64
+}
+
+type UsageGPUMemory struct {
+ Average uint64
Max uint64
Limit uint64
}
diff --git a/http/api/process.go b/http/api/process.go
index baf87707..43a9fce9 100644
--- a/http/api/process.go
+++ b/http/api/process.go
@@ -155,9 +155,13 @@ type ProcessConfigIOCleanup struct {
}
type ProcessConfigLimits struct {
- CPU float64 `json:"cpu_usage" jsonschema:"minimum=0"`
- Memory uint64 `json:"memory_mbytes" jsonschema:"minimum=0" format:"uint64"`
- WaitFor uint64 `json:"waitfor_seconds" jsonschema:"minimum=0" format:"uint64"`
+ CPU float64 `json:"cpu_usage" jsonschema:"minimum=0"`
+ Memory uint64 `json:"memory_mbytes" jsonschema:"minimum=0" format:"uint64"`
+ GPUUsage float64 `json:"gpu_usage" jsonschema:"minimum=0"`
+ GPUEncoder float64 `json:"gpu_encoder" jsonschema:"minimum=0"`
+ GPUDecoder float64 `json:"gpu_decoder" jsonschema:"minimum=0"`
+ GPUMemory uint64 `json:"gpu_memory_mbytes" jsonschema:"minimum=0" format:"uint64"`
+ WaitFor uint64 `json:"waitfor_seconds" jsonschema:"minimum=0" format:"uint64"`
}
// ProcessConfig represents the configuration of an ffmpeg process
@@ -197,7 +201,13 @@ func (cfg *ProcessConfig) Marshal() (*app.Config, map[string]interface{}) {
Scheduler: cfg.Scheduler,
LimitCPU: cfg.Limits.CPU,
LimitMemory: cfg.Limits.Memory * 1024 * 1024,
- LimitWaitFor: cfg.Limits.WaitFor,
+ LimitGPU: app.ConfigLimitGPU{
+ Usage: cfg.Limits.GPUUsage,
+ Encoder: cfg.Limits.GPUEncoder,
+ Decoder: cfg.Limits.GPUDecoder,
+ Memory: cfg.Limits.GPUMemory * 1024 * 1024,
+ },
+ LimitWaitFor: cfg.Limits.WaitFor,
}
cfg.generateInputOutputIDs(cfg.Input)
@@ -283,6 +293,10 @@ func (cfg *ProcessConfig) Unmarshal(c *app.Config, metadata map[string]interface
cfg.Scheduler = c.Scheduler
cfg.Limits.CPU = c.LimitCPU
cfg.Limits.Memory = c.LimitMemory / 1024 / 1024
+ cfg.Limits.GPUUsage = c.LimitGPU.Usage
+ cfg.Limits.GPUEncoder = c.LimitGPU.Encoder
+ cfg.Limits.GPUDecoder = c.LimitGPU.Decoder
+ cfg.Limits.GPUMemory = c.LimitGPU.Memory / 1024 / 1024
cfg.Limits.WaitFor = c.LimitWaitFor
cfg.Options = make([]string, len(c.Options))
@@ -364,20 +378,7 @@ func (s *ProcessState) Unmarshal(state *app.State) {
s.Memory = state.Memory
s.CPU = json.ToNumber(state.CPU)
s.LimitMode = state.LimitMode
- s.Resources.CPU = ProcessUsageCPU{
- NCPU: json.ToNumber(state.Resources.CPU.NCPU),
- Current: json.ToNumber(state.Resources.CPU.Current),
- Average: json.ToNumber(state.Resources.CPU.Average),
- Max: json.ToNumber(state.Resources.CPU.Max),
- Limit: json.ToNumber(state.Resources.CPU.Limit),
- IsThrottling: state.Resources.CPU.IsThrottling,
- }
- s.Resources.Memory = ProcessUsageMemory{
- Current: state.Resources.Memory.Current,
- Average: json.ToNumber(state.Resources.Memory.Average),
- Max: state.Resources.Memory.Max,
- Limit: state.Resources.Memory.Limit,
- }
+ s.Resources.Unmarshal(&state.Resources)
s.Command = state.Command
s.Progress.Unmarshal(&state.Progress)
@@ -430,15 +431,15 @@ func (p *ProcessUsageCPU) Marshal() app.ProcessUsageCPU {
}
type ProcessUsageMemory struct {
- Current uint64 `json:"cur" format:"uint64"`
- Average json.Number `json:"avg" swaggertype:"number" jsonschema:"type=number"`
- Max uint64 `json:"max" format:"uint64"`
- Limit uint64 `json:"limit" format:"uint64"`
+ Current uint64 `json:"cur" format:"uint64"`
+ Average uint64 `json:"avg" format:"uint64"`
+ Max uint64 `json:"max" format:"uint64"`
+ Limit uint64 `json:"limit" format:"uint64"`
}
func (p *ProcessUsageMemory) Unmarshal(pp *app.ProcessUsageMemory) {
p.Current = pp.Current
- p.Average = json.ToNumber(pp.Average)
+ p.Average = pp.Average
p.Max = pp.Max
p.Limit = pp.Limit
}
@@ -446,31 +447,120 @@ func (p *ProcessUsageMemory) Unmarshal(pp *app.ProcessUsageMemory) {
func (p *ProcessUsageMemory) Marshal() app.ProcessUsageMemory {
pp := app.ProcessUsageMemory{
Current: p.Current,
+ Average: p.Average,
Max: p.Max,
Limit: p.Limit,
}
+ return pp
+}
+
+type ProcessUsageGPUMemory struct {
+ Current uint64 `json:"cur" format:"uint64"`
+ Average uint64 `json:"avg" format:"uint64"`
+ Max uint64 `json:"max" format:"uint64"`
+ Limit uint64 `json:"limit" format:"uint64"`
+}
+
+func (p *ProcessUsageGPUMemory) Unmarshal(pp *app.ProcessUsageGPUMemory) {
+ p.Current = pp.Current
+ p.Average = pp.Average
+ p.Max = pp.Max
+ p.Limit = pp.Limit
+}
+
+func (p *ProcessUsageGPUMemory) Marshal() app.ProcessUsageGPUMemory {
+ pp := app.ProcessUsageGPUMemory{
+ Current: p.Current,
+ Average: p.Average,
+ Max: p.Max,
+ Limit: p.Limit,
+ }
+
+ return pp
+}
+
+type ProcessUsageGPUUsage struct {
+ Current json.Number `json:"cur" swaggertype:"number" jsonschema:"type=number"`
+ Average json.Number `json:"avg" swaggertype:"number" jsonschema:"type=number"`
+ Max json.Number `json:"max" swaggertype:"number" jsonschema:"type=number"`
+ Limit json.Number `json:"limit" swaggertype:"number" jsonschema:"type=number"`
+}
+
+func (p *ProcessUsageGPUUsage) Unmarshal(pp *app.ProcessUsageGPUUsage) {
+ p.Current = json.ToNumber(pp.Current)
+ p.Average = json.ToNumber(pp.Average)
+ p.Max = json.ToNumber(pp.Max)
+ p.Limit = json.ToNumber(pp.Limit)
+}
+
+func (p *ProcessUsageGPUUsage) Marshal() app.ProcessUsageGPUUsage {
+ pp := app.ProcessUsageGPUUsage{}
+
+ if x, err := p.Current.Float64(); err == nil {
+ pp.Current = x
+ }
+
if x, err := p.Average.Float64(); err == nil {
pp.Average = x
}
+ if x, err := p.Max.Float64(); err == nil {
+ pp.Max = x
+ }
+
+ if x, err := p.Limit.Float64(); err == nil {
+ pp.Limit = x
+ }
+
+ return pp
+}
+
+type ProcessUsageGPU struct {
+ Index int `json:"index"`
+ Memory ProcessUsageGPUMemory `json:"memory_bytes"`
+ Usage ProcessUsageGPUUsage `json:"usage"`
+ Encoder ProcessUsageGPUUsage `json:"encoder"`
+ Decoder ProcessUsageGPUUsage `json:"decoder"`
+}
+
+func (p *ProcessUsageGPU) Unmarshal(pp *app.ProcessUsageGPU) {
+ p.Index = pp.Index
+ p.Memory.Unmarshal(&pp.Memory)
+ p.Usage.Unmarshal(&pp.Usage)
+ p.Encoder.Unmarshal(&pp.Encoder)
+ p.Decoder.Unmarshal(&pp.Decoder)
+}
+
+func (p *ProcessUsageGPU) Marshal() app.ProcessUsageGPU {
+ pp := app.ProcessUsageGPU{
+ Index: p.Index,
+ Memory: p.Memory.Marshal(),
+ Usage: p.Usage.Marshal(),
+ Encoder: p.Encoder.Marshal(),
+ Decoder: p.Decoder.Marshal(),
+ }
+
return pp
}
type ProcessUsage struct {
CPU ProcessUsageCPU `json:"cpu_usage"`
Memory ProcessUsageMemory `json:"memory_bytes"`
+ GPU ProcessUsageGPU `json:"gpu"`
}
func (p *ProcessUsage) Unmarshal(pp *app.ProcessUsage) {
p.CPU.Unmarshal(&pp.CPU)
p.Memory.Unmarshal(&pp.Memory)
+ p.GPU.Unmarshal(&pp.GPU)
}
func (p *ProcessUsage) Marshal() app.ProcessUsage {
pp := app.ProcessUsage{
CPU: p.CPU.Marshal(),
Memory: p.Memory.Marshal(),
+ GPU: p.GPU.Marshal(),
}
return pp
diff --git a/http/api/process_test.go b/http/api/process_test.go
index 6dddce39..ddbdfbf8 100644
--- a/http/api/process_test.go
+++ b/http/api/process_test.go
@@ -56,6 +56,33 @@ func TestProcessUsage(t *testing.T) {
Max: 150,
Limit: 200,
},
+ GPU: app.ProcessUsageGPU{
+ Index: 3,
+ Memory: app.ProcessUsageGPUMemory{
+ Current: 48,
+ Average: 43,
+ Max: 88,
+ Limit: 34,
+ },
+ Usage: app.ProcessUsageGPUUsage{
+ Current: 47,
+ Average: 22,
+ Max: 90,
+ Limit: 80,
+ },
+ Encoder: app.ProcessUsageGPUUsage{
+ Current: 48,
+ Average: 46,
+ Max: 74,
+ Limit: 46,
+ },
+ Decoder: app.ProcessUsageGPUUsage{
+ Current: 21,
+ Average: 42,
+ Max: 30,
+ Limit: 99,
+ },
+ },
}
p := ProcessUsage{}
@@ -103,7 +130,13 @@ func TestProcessConfig(t *testing.T) {
LogPatterns: []string{"bla", "blubb"},
LimitCPU: 10,
LimitMemory: 100 * 1024 * 1024,
- LimitWaitFor: 20,
+ LimitGPU: app.ConfigLimitGPU{
+ Usage: 50,
+ Encoder: 90,
+ Decoder: 80,
+ Memory: 24 * 1024 * 1024 * 1024,
+ },
+ LimitWaitFor: 20,
}
p := ProcessConfig{}
diff --git a/internal/.gitignore b/internal/.gitignore
index 9872bd8c..ad8efa9c 100644
--- a/internal/.gitignore
+++ b/internal/.gitignore
@@ -2,4 +2,5 @@ testhelper/ignoresigint/ignoresigint
testhelper/sigint/sigint
testhelper/sigintwait/sigintwait
testhelper/sigpropagate/sigpropagate
-testhelper/ffmpeg/ffmpeg
\ No newline at end of file
+testhelper/ffmpeg/ffmpeg
+testhelper/nvidia-smi/nvidia-smi
\ No newline at end of file
diff --git a/internal/testhelper/nvidia-smi/nvidia-smi.go b/internal/testhelper/nvidia-smi/nvidia-smi.go
new file mode 100644
index 00000000..36f6a78c
--- /dev/null
+++ b/internal/testhelper/nvidia-smi/nvidia-smi.go
@@ -0,0 +1,973 @@
+package main
+
+import (
+ "context"
+ "fmt"
+ "os"
+ "os/signal"
+ "time"
+)
+
+var pmondata = `# gpu pid type sm mem enc dec fb command
+# Idx # C/G % % % % MB name
+ 0 7372 C 2 0 2 - 136 ffmpeg
+ 0 12176 C 5 2 3 7 782 ffmpeg
+ 1 20035 C 8 2 4 1 1145 ffmpeg
+ 1 20141 C 2 1 1 3 429 ffmpeg
+ 0 29591 C 2 1 - 2 435 ffmpeg `
+
+var querydata = `
+
+
+ Mon Jul 15 13:41:56 2024
+ 555.42.06
+ 12.5
+ 2
+
+ NVIDIA L4
+ NVIDIA
+ Ada Lovelace
+ Enabled
+ Disabled
+ Disabled
+ None
+
+ N/A
+ N/A
+
+
+ None
+
+ Disabled
+ 4000
+
+ N/A
+ N/A
+
+ 1654523003308
+ GPU-c5533cd4-5a60-059e-348d-b6d7466932e4
+ 1
+ 95.04.29.00.06
+ No
+ 0x100
+ 900-2G193-0000-001
+ 27B8-895-A1
+ N/A
+ 1
+
+ G193.0200.00.01
+ 2.1
+ 6.16
+ N/A
+
+
+ N/A
+ N/A
+
+
+ N/A
+ N/A
+
+ N/A
+
+ None
+ N/A
+ N/A
+
+
+ No
+ N/A
+
+ 555.42.06
+
+ N/A
+
+
+ 01
+ 00
+ 0000
+ 3
+ 2
+ 27B810DE
+ 00000000:01:00.0
+ 16CA10DE
+
+
+ 4
+ 4
+ 4
+ 4
+ 5
+
+
+ 16x
+ 16x
+
+
+
+ N/A
+ N/A
+
+ 0
+ 0
+ 0 KB/s
+ 0 KB/s
+ N/A
+ N/A
+
+ N/A
+ P0
+
+ Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+
+ N/A
+
+ 23034 MiB
+ 434 MiB
+ 1 MiB
+ 22601 MiB
+
+
+ 32768 MiB
+ 1 MiB
+ 32767 MiB
+
+
+ 0 MiB
+ 0 MiB
+ 0 MiB
+
+ Default
+
+ 2 %
+ 0 %
+ 0 %
+ 0 %
+ 0 %
+ 0 %
+
+
+ 0
+ 0
+ 0
+
+
+ 0
+ 0
+ 0
+
+
+ Enabled
+ Enabled
+
+
+
+ 0
+ 0
+ 0
+ 0
+ 0
+
+
+ 0
+ 0
+ 0
+ 0
+ 0
+ No
+
+
+ 0
+ 0
+ 0
+ 0
+ 0
+
+
+
+
+ N/A
+ N/A
+
+
+ N/A
+ N/A
+
+ N/A
+ N/A
+
+
+ 0
+ 0
+ No
+ No
+
+ 96 bank(s)
+ 0 bank(s)
+ 0 bank(s)
+ 0 bank(s)
+ 0 bank(s)
+
+
+
+ 45 C
+ 39 C
+ -5 C
+ -2 C
+ 0 C
+ N/A
+ N/A
+ N/A
+
+
+ N/A
+ N/A
+
+
+ P0
+ 27.22 W
+ 72.00 W
+ 72.00 W
+ 72.00 W
+ 40.00 W
+ 72.00 W
+
+
+ N/A
+
+
+ P0
+ N/A
+ N/A
+ N/A
+ N/A
+ N/A
+ N/A
+
+
+ 2040 MHz
+ 2040 MHz
+ 6250 MHz
+ 1770 MHz
+
+
+ 2040 MHz
+ 6251 MHz
+
+
+ 2040 MHz
+ 6251 MHz
+
+
+ N/A
+
+
+ 2040 MHz
+ 2040 MHz
+ 6251 MHz
+ 1770 MHz
+
+
+ 2040 MHz
+
+
+ N/A
+ N/A
+
+
+ 885.000 mV
+
+
+ N/A
+ N/A
+ N/A
+ N/A
+
+ N/A
+
+
+
+
+ 6251 MHz
+ 2040 MHz
+ 2025 MHz
+ 2010 MHz
+ 1995 MHz
+ 1980 MHz
+ 1965 MHz
+ 1950 MHz
+ 1935 MHz
+ 1920 MHz
+ 1905 MHz
+ 1890 MHz
+ 1875 MHz
+ 1860 MHz
+ 1845 MHz
+ 1830 MHz
+ 1815 MHz
+ 1800 MHz
+ 1785 MHz
+ 1770 MHz
+ 1755 MHz
+ 1740 MHz
+ 1725 MHz
+ 1710 MHz
+ 1695 MHz
+ 1680 MHz
+ 1665 MHz
+ 1650 MHz
+ 1635 MHz
+ 1620 MHz
+ 1605 MHz
+ 1590 MHz
+ 1575 MHz
+ 1560 MHz
+ 1545 MHz
+ 1530 MHz
+ 1515 MHz
+ 1500 MHz
+ 1485 MHz
+ 1470 MHz
+ 1455 MHz
+ 1440 MHz
+ 1425 MHz
+ 1410 MHz
+ 1395 MHz
+ 1380 MHz
+ 1365 MHz
+ 1350 MHz
+ 1335 MHz
+ 1320 MHz
+ 1305 MHz
+ 1290 MHz
+ 1275 MHz
+ 1260 MHz
+ 1245 MHz
+ 1230 MHz
+ 1215 MHz
+ 1200 MHz
+ 1185 MHz
+ 1170 MHz
+ 1155 MHz
+ 1140 MHz
+ 1125 MHz
+ 1110 MHz
+ 1095 MHz
+ 1080 MHz
+ 1065 MHz
+ 1050 MHz
+ 1035 MHz
+ 1020 MHz
+ 1005 MHz
+ 990 MHz
+ 975 MHz
+ 960 MHz
+ 945 MHz
+ 930 MHz
+ 915 MHz
+ 900 MHz
+ 885 MHz
+ 870 MHz
+ 855 MHz
+ 840 MHz
+ 825 MHz
+ 810 MHz
+ 795 MHz
+ 780 MHz
+ 765 MHz
+ 750 MHz
+ 735 MHz
+ 720 MHz
+ 705 MHz
+ 690 MHz
+ 675 MHz
+ 660 MHz
+ 645 MHz
+ 630 MHz
+ 615 MHz
+ 600 MHz
+ 585 MHz
+ 570 MHz
+ 555 MHz
+ 540 MHz
+ 525 MHz
+ 510 MHz
+ 495 MHz
+ 480 MHz
+ 465 MHz
+ 450 MHz
+ 435 MHz
+ 420 MHz
+ 405 MHz
+ 390 MHz
+ 375 MHz
+ 360 MHz
+ 345 MHz
+ 330 MHz
+ 315 MHz
+ 300 MHz
+ 285 MHz
+ 270 MHz
+ 255 MHz
+ 240 MHz
+ 225 MHz
+ 210 MHz
+
+
+ 405 MHz
+ 645 MHz
+ 630 MHz
+ 615 MHz
+ 600 MHz
+ 585 MHz
+ 570 MHz
+ 555 MHz
+ 540 MHz
+ 525 MHz
+ 510 MHz
+ 495 MHz
+ 480 MHz
+ 465 MHz
+ 450 MHz
+ 435 MHz
+ 420 MHz
+ 405 MHz
+ 390 MHz
+ 375 MHz
+ 360 MHz
+ 345 MHz
+ 330 MHz
+ 315 MHz
+ 300 MHz
+ 285 MHz
+ 270 MHz
+ 255 MHz
+ 240 MHz
+ 225 MHz
+ 210 MHz
+
+
+
+
+ 10131
+ C
+ ffmpeg
+ 389 MiB
+
+
+ 13597
+ C
+ ffmpeg
+ 1054 MiB
+
+
+
+
+
+ disabled
+
+
+
+
+ NVIDIA L4
+ NVIDIA
+ Ada Lovelace
+ Enabled
+ Disabled
+ Disabled
+ None
+
+ N/A
+ N/A
+
+
+ None
+
+ Disabled
+ 4000
+
+ N/A
+ N/A
+
+ 1654523001128
+ GPU-128ab6fb-6ec9-fd74-b479-4a5fd14f55bd
+ 0
+ 95.04.29.00.06
+ No
+ 0xc100
+ 900-2G193-0000-001
+ 27B8-895-A1
+ N/A
+ 1
+
+ G193.0200.00.01
+ 2.1
+ 6.16
+ N/A
+
+
+ N/A
+ N/A
+
+
+ N/A
+ N/A
+
+ N/A
+
+ None
+ N/A
+ N/A
+
+
+ No
+ N/A
+
+ 555.42.06
+
+ N/A
+
+
+ C1
+ 00
+ 0000
+ 3
+ 2
+ 27B810DE
+ 00000000:C1:00.0
+ 16CA10DE
+
+
+ 4
+ 4
+ 4
+ 4
+ 5
+
+
+ 16x
+ 1x
+
+
+
+ N/A
+ N/A
+
+ 0
+ 0
+ 0 KB/s
+ 0 KB/s
+ N/A
+ N/A
+
+ N/A
+ P0
+
+ Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+ Not Active
+
+ N/A
+
+ 23034 MiB
+ 434 MiB
+ 1 MiB
+ 22601 MiB
+
+
+ 32768 MiB
+ 1 MiB
+ 32767 MiB
+
+
+ 0 MiB
+ 0 MiB
+ 0 MiB
+
+ Default
+
+ 3 %
+ 0 %
+ 0 %
+ 0 %
+ 0 %
+ 0 %
+
+
+ 0
+ 0
+ 0
+
+
+ 0
+ 0
+ 0
+
+
+ Enabled
+ Enabled
+
+
+
+ 0
+ 0
+ 0
+ 0
+ 0
+
+
+ 0
+ 0
+ 0
+ 0
+ 0
+ No
+
+
+ 0
+ 0
+ 0
+ 0
+ 0
+
+
+
+
+ N/A
+ N/A
+
+
+ N/A
+ N/A
+
+ N/A
+ N/A
+
+
+ 0
+ 0
+ No
+ No
+
+ 96 bank(s)
+ 0 bank(s)
+ 0 bank(s)
+ 0 bank(s)
+ 0 bank(s)
+
+
+
+ 40 C
+ 43 C
+ -5 C
+ -2 C
+ 0 C
+ N/A
+ N/A
+ N/A
+
+
+ N/A
+ N/A
+
+
+ P0
+ 29.54 W
+ 72.00 W
+ 72.00 W
+ 72.00 W
+ 40.00 W
+ 72.00 W
+
+
+ N/A
+
+
+ P0
+ N/A
+ N/A
+ N/A
+ N/A
+ N/A
+ N/A
+
+
+ 2040 MHz
+ 2040 MHz
+ 6250 MHz
+ 1770 MHz
+
+
+ 2040 MHz
+ 6251 MHz
+
+
+ 2040 MHz
+ 6251 MHz
+
+
+ N/A
+
+
+ 2040 MHz
+ 2040 MHz
+ 6251 MHz
+ 1770 MHz
+
+
+ 2040 MHz
+
+
+ N/A
+ N/A
+
+
+ 910.000 mV
+
+
+ N/A
+ N/A
+ N/A
+ N/A
+
+ N/A
+
+
+
+
+ 6251 MHz
+ 2040 MHz
+ 2025 MHz
+ 2010 MHz
+ 1995 MHz
+ 1980 MHz
+ 1965 MHz
+ 1950 MHz
+ 1935 MHz
+ 1920 MHz
+ 1905 MHz
+ 1890 MHz
+ 1875 MHz
+ 1860 MHz
+ 1845 MHz
+ 1830 MHz
+ 1815 MHz
+ 1800 MHz
+ 1785 MHz
+ 1770 MHz
+ 1755 MHz
+ 1740 MHz
+ 1725 MHz
+ 1710 MHz
+ 1695 MHz
+ 1680 MHz
+ 1665 MHz
+ 1650 MHz
+ 1635 MHz
+ 1620 MHz
+ 1605 MHz
+ 1590 MHz
+ 1575 MHz
+ 1560 MHz
+ 1545 MHz
+ 1530 MHz
+ 1515 MHz
+ 1500 MHz
+ 1485 MHz
+ 1470 MHz
+ 1455 MHz
+ 1440 MHz
+ 1425 MHz
+ 1410 MHz
+ 1395 MHz
+ 1380 MHz
+ 1365 MHz
+ 1350 MHz
+ 1335 MHz
+ 1320 MHz
+ 1305 MHz
+ 1290 MHz
+ 1275 MHz
+ 1260 MHz
+ 1245 MHz
+ 1230 MHz
+ 1215 MHz
+ 1200 MHz
+ 1185 MHz
+ 1170 MHz
+ 1155 MHz
+ 1140 MHz
+ 1125 MHz
+ 1110 MHz
+ 1095 MHz
+ 1080 MHz
+ 1065 MHz
+ 1050 MHz
+ 1035 MHz
+ 1020 MHz
+ 1005 MHz
+ 990 MHz
+ 975 MHz
+ 960 MHz
+ 945 MHz
+ 930 MHz
+ 915 MHz
+ 900 MHz
+ 885 MHz
+ 870 MHz
+ 855 MHz
+ 840 MHz
+ 825 MHz
+ 810 MHz
+ 795 MHz
+ 780 MHz
+ 765 MHz
+ 750 MHz
+ 735 MHz
+ 720 MHz
+ 705 MHz
+ 690 MHz
+ 675 MHz
+ 660 MHz
+ 645 MHz
+ 630 MHz
+ 615 MHz
+ 600 MHz
+ 585 MHz
+ 570 MHz
+ 555 MHz
+ 540 MHz
+ 525 MHz
+ 510 MHz
+ 495 MHz
+ 480 MHz
+ 465 MHz
+ 450 MHz
+ 435 MHz
+ 420 MHz
+ 405 MHz
+ 390 MHz
+ 375 MHz
+ 360 MHz
+ 345 MHz
+ 330 MHz
+ 315 MHz
+ 300 MHz
+ 285 MHz
+ 270 MHz
+ 255 MHz
+ 240 MHz
+ 225 MHz
+ 210 MHz
+
+
+ 405 MHz
+ 645 MHz
+ 630 MHz
+ 615 MHz
+ 600 MHz
+ 585 MHz
+ 570 MHz
+ 555 MHz
+ 540 MHz
+ 525 MHz
+ 510 MHz
+ 495 MHz
+ 480 MHz
+ 465 MHz
+ 450 MHz
+ 435 MHz
+ 420 MHz
+ 405 MHz
+ 390 MHz
+ 375 MHz
+ 360 MHz
+ 345 MHz
+ 330 MHz
+ 315 MHz
+ 300 MHz
+ 285 MHz
+ 270 MHz
+ 255 MHz
+ 240 MHz
+ 225 MHz
+ 210 MHz
+
+
+
+
+ 16870
+ C
+ ffmpeg
+ 549 MiB
+
+
+
+
+
+ disabled
+
+
+
+`
+
+func main() {
+ if len(os.Args) == 1 {
+ os.Exit(1)
+ }
+
+ ctx, cancel := context.WithCancel(context.Background())
+
+ if os.Args[1] == "pmon" {
+ go func(ctx context.Context) {
+ ticker := time.NewTicker(time.Second)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-ctx.Done():
+ return
+ case <-ticker.C:
+ fmt.Fprintf(os.Stdout, "%s\n", pmondata)
+ }
+ }
+ }(ctx)
+ } else {
+ go func(ctx context.Context) {
+ ticker := time.NewTicker(time.Second)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-ctx.Done():
+ return
+ case <-ticker.C:
+ fmt.Fprintf(os.Stdout, "%s\n", querydata)
+ }
+ }
+ }(ctx)
+ }
+
+ // Wait for interrupt signal to gracefully shutdown the app
+ quit := make(chan os.Signal, 1)
+ signal.Notify(quit, os.Interrupt)
+ <-quit
+
+ cancel()
+
+ os.Exit(0)
+}
diff --git a/monitor/cpu.go b/monitor/cpu.go
index 83869653..8a10850a 100644
--- a/monitor/cpu.go
+++ b/monitor/cpu.go
@@ -33,7 +33,7 @@ func NewCPUCollector(rsc resources.Resources) metric.Collector {
c.limitDescr = metric.NewDesc("cpu_limit", "Percentage of CPU to be consumed", nil)
c.throttleDescr = metric.NewDesc("cpu_throttling", "Whether the CPU is currently throttled", nil)
- if ncpu, err := psutil.CPUCounts(true); err == nil {
+ if ncpu, err := psutil.CPUCounts(); err == nil {
c.ncpu = ncpu
}
@@ -63,11 +63,11 @@ func (c *cpuCollector) Collect() metric.Metrics {
metrics.Add(metric.NewValue(c.ncpuDescr, c.ncpu))
- limit, _ := c.resources.Limits()
+ limit, _, _, _ := c.resources.Limits()
metrics.Add(metric.NewValue(c.limitDescr, limit))
- cpu, _ := c.resources.ShouldLimit()
+ cpu, _, _ := c.resources.ShouldLimit()
throttling := .0
if cpu {
throttling = 1
diff --git a/monitor/disk.go b/monitor/disk.go
index 7e1ba86d..fda2f24d 100644
--- a/monitor/disk.go
+++ b/monitor/disk.go
@@ -37,7 +37,7 @@ func (c *diskCollector) Describe() []*metric.Description {
func (c *diskCollector) Collect() metric.Metrics {
metrics := metric.NewMetrics()
- stat, err := psutil.DiskUsage(c.path)
+ stat, err := psutil.Disk(c.path)
if err != nil {
return metrics
}
diff --git a/monitor/mem.go b/monitor/mem.go
index 10a66f7f..986b2be5 100644
--- a/monitor/mem.go
+++ b/monitor/mem.go
@@ -44,11 +44,11 @@ func (c *memCollector) Describe() []*metric.Description {
func (c *memCollector) Collect() metric.Metrics {
metrics := metric.NewMetrics()
- _, limit := c.resources.Limits()
+ _, limit, _, _ := c.resources.Limits()
metrics.Add(metric.NewValue(c.limitDescr, float64(limit)))
- _, memory := c.resources.ShouldLimit()
+ _, memory, _ := c.resources.ShouldLimit()
throttling := .0
if memory {
throttling = 1
@@ -56,7 +56,7 @@ func (c *memCollector) Collect() metric.Metrics {
metrics.Add(metric.NewValue(c.throttleDescr, throttling))
- stat, err := psutil.VirtualMemory()
+ stat, err := psutil.Memory()
if err != nil {
return metrics
}
diff --git a/monitor/net.go b/monitor/net.go
index 87b2b8a3..270e0948 100644
--- a/monitor/net.go
+++ b/monitor/net.go
@@ -33,7 +33,7 @@ func (c *netCollector) Describe() []*metric.Description {
func (c *netCollector) Collect() metric.Metrics {
metrics := metric.NewMetrics()
- devs, err := psutil.NetIOCounters(true)
+ devs, err := psutil.Network()
if err != nil {
return metrics
}
diff --git a/process/limiter.go b/process/limiter.go
index ea5df9c2..699294dc 100644
--- a/process/limiter.go
+++ b/process/limiter.go
@@ -25,9 +25,36 @@ type Usage struct {
Max uint64 // bytes
Limit uint64 // bytes
}
+ GPU struct {
+ Index int // number of the GPU
+ Memory struct {
+ Current uint64 // bytes
+ Average float64 // bytes
+ Max uint64 // bytes
+ Limit uint64 // bytes
+ }
+ Usage struct {
+ Current float64 // percent 0-100
+ Average float64 // percent 0-100
+ Max float64 // percent 0-100
+ Limit float64 // percent 0-100
+ }
+ Encoder struct {
+ Current float64 // percent 0-100
+ Average float64 // percent 0-100
+ Max float64 // percent 0-100
+ Limit float64 // percent 0-100
+ }
+ Decoder struct {
+ Current float64 // percent 0-100
+ Average float64 // percent 0-100
+ Max float64 // percent 0-100
+ Limit float64 // percent 0-100
+ }
+ }
}
-type LimitFunc func(cpu float64, memory uint64)
+type LimitFunc func(cpu float64, memory uint64, gpuusage, gpuencoder, gpudecoder float64, gpumemory uint64)
type LimitMode int
@@ -44,18 +71,22 @@ func (m LimitMode) String() string {
}
const (
- LimitModeHard LimitMode = 0 // Killing the process if either CPU or memory is above the limit for a certain time
- LimitModeSoft LimitMode = 1 // Throttling the CPU if activated, killing the process if memory is above the limit for a certain time
+ LimitModeHard LimitMode = 0 // Killing the process if either resource is above the limit for a certain time.
+ LimitModeSoft LimitMode = 1 // Throttling the CPU if activated, otherwise killing the process if resources are above the limit for a certain time.
)
type LimiterConfig struct {
- CPU float64 // Max. CPU usage in percent 0-100 in hard mode, 0-100*ncpu in softmode
- Memory uint64 // Max. memory usage in bytes
- WaitFor time.Duration // Duration for one of the limits has to be above the limit until OnLimit gets triggered
- OnLimit LimitFunc // Function to be triggered if limits are exceeded
- Mode LimitMode // How to limit CPU usage
- PSUtil psutil.Util
- Logger log.Logger
+ CPU float64 // Max. CPU usage in percent 0-100 in hard mode, 0-100*ncpu in soft mode.
+ Memory uint64 // Max. memory usage in bytes.
+ GPUUsage float64 // Max. GPU general usage in percent 0-100.
+ GPUEncoder float64 // Max. GPU encoder usage in percent 0-100.
+ GPUDecoder float64 // Max. GPU decoder usage in percent 0-100.
+ GPUMemory uint64 // Max. GPU memory usage in bytes.
+ WaitFor time.Duration // Duration for one of the limits has to be above the limit until OnLimit gets triggered.
+ OnLimit LimitFunc // Function to be triggered if limits are exceeded.
+ Mode LimitMode // How to limit CPU usage.
+ PSUtil psutil.Util
+ Logger log.Logger
}
type Limiter interface {
@@ -65,26 +96,135 @@ type Limiter interface {
// Stop stops the limiter. The limiter can be reused by calling Start() again
Stop()
- // Current returns the current CPU and memory values
- // Deprecated: use Usage()
- Current() (cpu float64, memory uint64)
-
- // Limits returns the defined CPU and memory limits. Values <= 0 means no limit
- // Deprecated: use Usage()
- Limits() (cpu float64, memory uint64)
-
// Usage returns the current state of the limiter, such as current, average, max, and
// limit values for CPU and memory.
Usage() Usage
// Limit enables or disables the throttling of the CPU or killing because of to much
- // memory consumption.
- Limit(cpu, memory bool) error
+ // memory or GPU consumption.
+ Limit(cpu, memory, gpu bool) error
// Mode returns in which mode the limiter is running in.
Mode() LimitMode
}
+type numbers interface {
+ ~uint64 | ~float64
+}
+
+type metric[T numbers] struct {
+ limit T // Limit
+ current T // Current load value
+ last T // Last load value
+ max T // Max. load value
+ top T // Decaying max. load value
+ avg float64 // Average load value
+ avgCounter uint64 // Counter for average calculation
+ limitSince time.Time // Time when the limit has been reached (hard limiter mode)
+ limitEnable bool
+}
+
+func (x *metric[T]) Reset() {
+ var zero T
+
+ x.current = zero
+ x.last = zero
+ x.max = zero
+ x.top = zero
+ x.avg = 0
+ x.avgCounter = 0
+ x.limitEnable = false
+}
+
+func (x *metric[T]) Current() T {
+ return x.current
+}
+
+func (x *metric[T]) Top() T {
+ return x.top
+}
+
+func (x *metric[T]) Max() T {
+ return x.max
+}
+
+func (x *metric[T]) Avg() float64 {
+ return x.avg
+}
+
+func (x *metric[T]) SetLimit(limit T) {
+ x.limit = limit
+}
+
+func (x *metric[T]) Limit() T {
+ return x.limit
+}
+
+func (x *metric[T]) DoLimit(limit bool) (enabled, changed bool) {
+ if x.limitEnable != limit {
+ x.limitEnable = limit
+ changed = true
+ }
+
+ enabled = x.limitEnable
+
+ return
+}
+
+func (x *metric[T]) IsLimitEnabled() bool {
+ return x.limitEnable
+}
+
+func (x *metric[T]) Update(value T) {
+ x.last, x.current = x.current, value
+
+ if x.current > x.max {
+ x.max = x.current
+ }
+
+ if x.current > x.top {
+ x.top = x.current
+ } else {
+ x.top = T(float64(x.top) * 0.95)
+ }
+
+ x.avgCounter++
+
+ x.avg = ((x.avg * float64(x.avgCounter-1)) + float64(x.current)) / float64(x.avgCounter)
+}
+
+func (x *metric[T]) IsExceeded(waitFor time.Duration, mode LimitMode) bool {
+ if x.limit <= 0 {
+ return false
+ }
+
+ if mode == LimitModeSoft {
+ // Check if we actually should limit.
+ if !x.limitEnable {
+ return false
+ }
+
+ // If we are currently above the limit, the limit is exceeded.
+ if x.current > x.limit {
+ return true
+ }
+ } else {
+ if x.current > x.limit {
+ // Current value is higher than the limit.
+ if x.last <= x.limit {
+ // If the previous value is below the limit, then we reached the limit as of now.
+ x.limitSince = time.Now()
+ }
+
+ if time.Since(x.limitSince) >= waitFor {
+ return true
+ }
+ }
+ }
+
+ return false
+}
+
type limiter struct {
psutil psutil.Util
@@ -98,40 +238,27 @@ type limiter struct {
lastUsage Usage
lastUsageLock sync.RWMutex
- cpu float64 // CPU limit
- cpuCurrent float64 // Current CPU load of this process
- cpuLast float64 // Last CPU load of this process
- cpuMax float64 // Max. CPU load of this process
- cpuTop float64 // Decaying max. CPU load of this process
- cpuAvg float64 // Average CPU load of this process
- cpuAvgCounter uint64 // Counter for average calculation
- cpuLimitSince time.Time // Time when the CPU limit has been reached (hard limiter mode)
- cpuLimitEnable bool // Whether CPU throttling is enabled (soft limiter mode)
- cpuThrottling bool // Whether CPU throttling is currently active (soft limiter mode)
+ cpu metric[float64] // CPU limit
+ cpuThrottling bool // Whether CPU throttling is currently active (soft limiter mode)
- memory uint64 // Memory limit (bytes)
- memoryCurrent uint64 // Current memory usage
- memoryLast uint64 // Last memory usage
- memoryMax uint64 // Max. memory usage
- memoryTop uint64 // Decaying max. memory usage
- memoryAvg float64 // Average memory usage
- memoryAvgCounter uint64 // Counter for average memory calculation
- memoryLimitSince time.Time // Time when the memory limit has been reached (hard limiter mode)
- memoryLimitEnable bool // Whether memory limiting is enabled (soft limiter mode)
+ memory metric[uint64] // Memory limit (bytes)
+
+ gpu struct {
+ memory metric[uint64] // GPU memory limit (bytes)
+ usage metric[float64] // GPU load limit (0-100 percent)
+ encoder metric[float64] // GPU encoder limit (0-100 percent)
+ decoder metric[float64] // GPU decoder limit (0-100 percent)
+ }
waitFor time.Duration
mode LimitMode
- cancelLimit context.CancelFunc
-
logger log.Logger
}
// NewLimiter returns a new Limiter
func NewLimiter(config LimiterConfig) Limiter {
l := &limiter{
- cpu: config.CPU,
- memory: config.Memory,
waitFor: config.WaitFor,
onLimit: config.OnLimit,
mode: config.Mode,
@@ -139,6 +266,13 @@ func NewLimiter(config LimiterConfig) Limiter {
logger: config.Logger,
}
+ l.cpu.SetLimit(config.CPU / 100)
+ l.memory.SetLimit(config.Memory)
+ l.gpu.memory.SetLimit(config.GPUMemory)
+ l.gpu.usage.SetLimit(config.GPUUsage / 100)
+ l.gpu.encoder.SetLimit(config.GPUEncoder / 100)
+ l.gpu.decoder.SetLimit(config.GPUDecoder / 100)
+
if l.logger == nil {
l.logger = log.New("")
}
@@ -147,57 +281,56 @@ func NewLimiter(config LimiterConfig) Limiter {
l.psutil = psutil.DefaultUtil
}
- if ncpu, err := l.psutil.CPUCounts(true); err != nil {
+ if ncpu, err := l.psutil.CPUCounts(); err != nil {
l.ncpu = 1
} else {
l.ncpu = ncpu
}
l.lastUsage.CPU.NCPU = l.ncpu
- l.lastUsage.CPU.Limit = l.cpu * l.ncpu
- l.lastUsage.Memory.Limit = l.memory
+ l.lastUsage.CPU.Limit = l.cpu.Limit() * 100 * l.ncpu
+ l.lastUsage.Memory.Limit = l.memory.Limit()
+ l.lastUsage.GPU.Memory.Limit = l.gpu.memory.Limit()
+ l.lastUsage.GPU.Usage.Limit = l.gpu.usage.Limit() * 100
+ l.lastUsage.GPU.Encoder.Limit = l.gpu.encoder.Limit() * 100
+ l.lastUsage.GPU.Decoder.Limit = l.gpu.decoder.Limit() * 100
l.ncpuFactor = 1
mode := "hard"
if l.mode == LimitModeSoft {
mode = "soft"
- l.cpu /= l.ncpu
+ l.cpu.SetLimit(l.cpu.Limit() / l.ncpu)
l.ncpuFactor = l.ncpu
}
- l.cpu /= 100
-
if l.onLimit == nil {
- l.onLimit = func(float64, uint64) {}
+ l.onLimit = func(float64, uint64, float64, float64, float64, uint64) {}
}
l.logger = l.logger.WithFields(log.Fields{
- "cpu": l.cpu * l.ncpuFactor,
- "memory": l.memory,
- "mode": mode,
+ "cpu": l.cpu.Limit() * l.ncpuFactor,
+ "memory": l.memory.Limit(),
+ "gpumemory": l.gpu.memory.Limit(),
+ "gpuusage": l.gpu.usage.Limit(),
+ "gpuencoder": l.gpu.encoder.Limit(),
+ "gpudecoder": l.gpu.decoder.Limit(),
+ "mode": mode,
})
return l
}
func (l *limiter) reset() {
- l.cpuCurrent = 0
- l.cpuLast = 0
- l.cpuAvg = 0
- l.cpuAvgCounter = 0
- l.cpuMax = 0
- l.cpuTop = 0
- l.cpuLimitEnable = false
+ l.cpu.Reset()
l.cpuThrottling = false
- l.memoryCurrent = 0
- l.memoryLast = 0
- l.memoryAvg = 0
- l.memoryAvgCounter = 0
- l.memoryMax = 0
- l.memoryTop = 0
- l.memoryLimitEnable = false
+ l.memory.Reset()
+
+ l.gpu.memory.Reset()
+ l.gpu.usage.Reset()
+ l.gpu.encoder.Reset()
+ l.gpu.decoder.Reset()
}
func (l *limiter) Start(process psutil.Process) error {
@@ -218,10 +351,7 @@ func (l *limiter) Start(process psutil.Process) error {
go l.ticker(ctx, time.Second)
if l.mode == LimitModeSoft {
- ctx, cancel = context.WithCancel(context.Background())
- l.cancelLimit = cancel
-
- go l.limitCPU(ctx, l.cpu, time.Second)
+ go l.limitCPU(ctx, l.cpu.Limit(), time.Second)
}
return nil
@@ -237,11 +367,6 @@ func (l *limiter) Stop() {
l.cancel()
- if l.cancelLimit != nil {
- l.cancelLimit()
- l.cancelLimit = nil
- }
-
l.proc.Stop()
l.proc = nil
@@ -256,13 +381,13 @@ func (l *limiter) ticker(ctx context.Context, interval time.Duration) {
select {
case <-ctx.Done():
return
- case t := <-ticker.C:
- l.collect(t)
+ case <-ticker.C:
+ l.collect()
}
}
}
-func (l *limiter) collect(_ time.Time) {
+func (l *limiter) collect() {
l.lock.Lock()
proc := l.proc
l.lock.Unlock()
@@ -271,118 +396,108 @@ func (l *limiter) collect(_ time.Time) {
return
}
- mstat, merr := proc.VirtualMemory()
- cpustat, cerr := proc.CPUPercent()
+ mstat, merr := proc.Memory()
+ cpustat, cerr := proc.CPU()
+ gstat, gerr := proc.GPU()
+ gindex := -1
l.lock.Lock()
+ defer l.lock.Unlock()
if merr == nil {
- l.memoryLast, l.memoryCurrent = l.memoryCurrent, mstat
-
- if l.memoryCurrent > l.memoryMax {
- l.memoryMax = l.memoryCurrent
- }
-
- if l.memoryCurrent > l.memoryTop {
- l.memoryTop = l.memoryCurrent
- } else {
- l.memoryTop = uint64(float64(l.memoryTop) * 0.95)
- }
-
- l.memoryAvgCounter++
-
- l.memoryAvg = ((l.memoryAvg * float64(l.memoryAvgCounter-1)) + float64(l.memoryCurrent)) / float64(l.memoryAvgCounter)
+ l.memory.Update(mstat)
}
if cerr == nil {
- l.cpuLast, l.cpuCurrent = l.cpuCurrent, (cpustat.System+cpustat.User+cpustat.Other)/100
+ l.cpu.Update((cpustat.System + cpustat.User + cpustat.Other) / 100)
+ }
- if l.cpuCurrent > l.cpuMax {
- l.cpuMax = l.cpuCurrent
- }
-
- if l.cpuCurrent > l.cpuTop {
- l.cpuTop = l.cpuCurrent
- } else {
- l.cpuTop = l.cpuTop * 0.95
- }
-
- l.cpuAvgCounter++
-
- l.cpuAvg = ((l.cpuAvg * float64(l.cpuAvgCounter-1)) + l.cpuCurrent) / float64(l.cpuAvgCounter)
+ if gerr == nil {
+ l.gpu.memory.Update(gstat.MemoryUsed)
+ l.gpu.usage.Update(gstat.Usage / 100)
+ l.gpu.encoder.Update(gstat.Encoder / 100)
+ l.gpu.decoder.Update(gstat.Decoder / 100)
+ gindex = gstat.Index
}
isLimitExceeded := false
if l.mode == LimitModeHard {
- if l.cpu > 0 {
- if l.cpuCurrent > l.cpu {
- // Current value is higher than the limit
- if l.cpuLast <= l.cpu {
- // If the previous value is below the limit, then we reached the
- // limit as of now
- l.cpuLimitSince = time.Now()
- }
-
- if time.Since(l.cpuLimitSince) >= l.waitFor {
- l.logger.Warn().Log("CPU limit exceeded")
- isLimitExceeded = true
- }
- }
+ if l.cpu.IsExceeded(l.waitFor, l.mode) {
+ l.logger.Warn().Log("CPU limit exceeded")
+ isLimitExceeded = true
}
+ }
- if l.memory > 0 {
- if l.memoryCurrent > l.memory {
- // Current value is higher than the limit
- if l.memoryLast <= l.memory {
- // If the previous value is below the limit, then we reached the
- // limit as of now
- l.memoryLimitSince = time.Now()
- }
+ if l.memory.IsExceeded(l.waitFor, l.mode) {
+ l.logger.Warn().Log("Memory limit exceeded")
+ isLimitExceeded = true
+ }
- if time.Since(l.memoryLimitSince) >= l.waitFor {
- l.logger.Warn().Log("Memory limit exceeded")
- isLimitExceeded = true
- }
- }
- }
- } else {
- if l.memory > 0 && l.memoryLimitEnable {
- if l.memoryCurrent > l.memory {
- // Current value is higher than the limit
- l.logger.Warn().Log("Memory limit exceeded")
- isLimitExceeded = true
- }
- }
+ if l.gpu.memory.IsExceeded(l.waitFor, l.mode) {
+ l.logger.Warn().Log("GPU memory limit exceeded")
+ isLimitExceeded = true
+ }
+
+ if l.gpu.usage.IsExceeded(l.waitFor, l.mode) {
+ l.logger.Warn().Log("GPU usage limit exceeded")
+ isLimitExceeded = true
+ }
+
+ if l.gpu.encoder.IsExceeded(l.waitFor, l.mode) {
+ l.logger.Warn().Log("GPU encoder limit exceeded")
+ isLimitExceeded = true
+ }
+
+ if l.gpu.decoder.IsExceeded(l.waitFor, l.mode) {
+ l.logger.Warn().Log("GPU decoder limit exceeded")
+ isLimitExceeded = true
}
l.logger.Debug().WithFields(log.Fields{
- "cur_cpu": l.cpuCurrent * l.ncpuFactor,
- "top_cpu": l.cpuTop * l.ncpuFactor,
- "cur_mem": l.memoryCurrent,
- "top_mem": l.memoryTop,
- "exceeded": isLimitExceeded,
+ "cur_cpu": l.cpu.Current() * l.ncpuFactor,
+ "top_cpu": l.cpu.Top() * l.ncpuFactor,
+ "cur_mem": l.memory.Current(),
+ "top_mem": l.memory.Top(),
+ "cur_gpu_mem": l.gpu.memory.Current(),
+ "top_gpu_mem": l.gpu.memory.Top(),
+ "exceeded": isLimitExceeded,
}).Log("Observation")
if isLimitExceeded {
- go l.onLimit(l.cpuCurrent*l.ncpuFactor*100, l.memoryCurrent)
+ go l.onLimit(l.cpu.Current()*l.ncpuFactor*100, l.memory.Current(), l.gpu.usage.Current()*100, l.gpu.encoder.Current()*100, l.gpu.decoder.Current()*100, l.gpu.memory.Current())
}
l.lastUsageLock.Lock()
- l.lastUsage.CPU.Current = l.cpuCurrent * l.ncpu * 100
- l.lastUsage.CPU.Average = l.cpuAvg * l.ncpu * 100
- l.lastUsage.CPU.Max = l.cpuMax * l.ncpu * 100
+ l.lastUsage.CPU.Current = l.cpu.Current() * l.ncpu * 100
+ l.lastUsage.CPU.Average = l.cpu.Avg() * l.ncpu * 100
+ l.lastUsage.CPU.Max = l.cpu.Max() * l.ncpu * 100
l.lastUsage.CPU.IsThrottling = l.cpuThrottling
- l.lastUsage.Memory.Current = l.memoryCurrent
- l.lastUsage.Memory.Average = l.memoryAvg
- l.lastUsage.Memory.Max = l.memoryMax
- l.lastUsageLock.Unlock()
+ l.lastUsage.Memory.Current = l.memory.Current()
+ l.lastUsage.Memory.Average = l.memory.Avg()
+ l.lastUsage.Memory.Max = l.memory.Max()
- l.lock.Unlock()
+ l.lastUsage.GPU.Index = gindex
+ l.lastUsage.GPU.Index = gindex
+ l.lastUsage.GPU.Memory.Current = l.gpu.memory.Current()
+ l.lastUsage.GPU.Memory.Average = l.gpu.memory.Avg()
+ l.lastUsage.GPU.Memory.Max = l.gpu.memory.Max()
+
+ l.lastUsage.GPU.Usage.Current = l.gpu.usage.Current() * 100
+ l.lastUsage.GPU.Usage.Average = l.gpu.usage.Avg() * 100
+ l.lastUsage.GPU.Usage.Max = l.gpu.usage.Max() * 100
+
+ l.lastUsage.GPU.Encoder.Current = l.gpu.encoder.Current() * 100
+ l.lastUsage.GPU.Encoder.Average = l.gpu.encoder.Avg() * 100
+ l.lastUsage.GPU.Encoder.Max = l.gpu.encoder.Max() * 100
+
+ l.lastUsage.GPU.Decoder.Current = l.gpu.decoder.Current() * 100
+ l.lastUsage.GPU.Decoder.Average = l.gpu.decoder.Avg() * 100
+ l.lastUsage.GPU.Decoder.Max = l.gpu.decoder.Max() * 100
+ l.lastUsageLock.Unlock()
}
-func (l *limiter) Limit(cpu, memory bool) error {
+func (l *limiter) Limit(cpu, memory, gpu bool) error {
l.lock.Lock()
defer l.lock.Unlock()
@@ -390,35 +505,31 @@ func (l *limiter) Limit(cpu, memory bool) error {
return nil
}
- if memory {
- if !l.memoryLimitEnable {
- l.memoryLimitEnable = true
-
- l.logger.Debug().Log("Memory limiter enabled")
- }
- } else {
- if l.memoryLimitEnable {
- l.memoryLimitEnable = false
-
- l.logger.Debug().Log("Memory limiter disabled")
- }
+ enabled, changed := l.cpu.DoLimit(cpu)
+ if enabled && changed {
+ l.logger.Debug().Log("CPU limiter enabled")
+ } else if !enabled && changed {
+ l.logger.Debug().Log("CPU limiter disabled")
}
- if cpu {
- if !l.cpuLimitEnable {
- l.cpuLimitEnable = true
-
- l.logger.Debug().Log("CPU limiter enabled")
- }
- } else {
- if l.cpuLimitEnable {
- l.cpuLimitEnable = false
-
- l.logger.Debug().Log("CPU limiter disabled")
- }
-
+ enabled, changed = l.memory.DoLimit(memory)
+ if enabled && changed {
+ l.logger.Debug().Log("Memory limiter enabled")
+ } else if !enabled && changed {
+ l.logger.Debug().Log("Memory limiter disabled")
}
+ enabled, changed = l.gpu.memory.DoLimit(gpu)
+ if enabled && changed {
+ l.logger.Debug().Log("GPU limiter enabled")
+ } else if !enabled && changed {
+ l.logger.Debug().Log("GPU limiter disabled")
+ }
+
+ l.gpu.usage.DoLimit(gpu)
+ l.gpu.encoder.DoLimit(gpu)
+ l.gpu.decoder.DoLimit(gpu)
+
return nil
}
@@ -453,7 +564,7 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur
l.lock.Lock()
- if !l.cpuLimitEnable {
+ if !l.cpu.IsLimitEnabled() {
if factorTopLimit > 0 {
factorTopLimit -= 10
} else {
@@ -469,7 +580,7 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur
}
} else {
factorTopLimit = 100
- topLimit = l.cpuTop - limit
+ topLimit = l.cpu.Top() - limit
l.cpuThrottling = true
}
@@ -482,7 +593,7 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur
lim += (100 - factorTopLimit) / 100 * topLimit
}
- pcpu := l.cpuCurrent
+ pcpu := l.cpu.Current()
l.lock.Unlock()
@@ -526,16 +637,6 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur
}
}
-func (l *limiter) Current() (cpu float64, memory uint64) {
- l.lastUsageLock.RLock()
- defer l.lastUsageLock.RUnlock()
-
- cpu = l.lastUsage.CPU.Current / l.ncpu
- memory = l.lastUsage.Memory.Current
-
- return
-}
-
func (l *limiter) Usage() Usage {
l.lastUsageLock.RLock()
defer l.lastUsageLock.RUnlock()
@@ -543,10 +644,6 @@ func (l *limiter) Usage() Usage {
return l.lastUsage
}
-func (l *limiter) Limits() (cpu float64, memory uint64) {
- return l.cpu * 100, l.memory
-}
-
func (l *limiter) Mode() LimitMode {
return l.mode
}
diff --git a/process/limiter_test.go b/process/limiter_test.go
index c9e31127..0ec98333 100644
--- a/process/limiter_test.go
+++ b/process/limiter_test.go
@@ -7,13 +7,13 @@ import (
"github.com/datarhei/core/v16/psutil"
- "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
)
type psproc struct{}
-func (p *psproc) CPUPercent() (*psutil.CPUInfoStat, error) {
- return &psutil.CPUInfoStat{
+func (p *psproc) CPU() (*psutil.CPUInfo, error) {
+ return &psutil.CPUInfo{
System: 50,
User: 0,
Idle: 0,
@@ -21,10 +21,22 @@ func (p *psproc) CPUPercent() (*psutil.CPUInfoStat, error) {
}, nil
}
-func (p *psproc) VirtualMemory() (uint64, error) {
+func (p *psproc) Memory() (uint64, error) {
return 197, nil
}
+func (p *psproc) GPU() (*psutil.GPUInfo, error) {
+ return &psutil.GPUInfo{
+ Index: 0,
+ Name: "L4",
+ MemoryTotal: 128,
+ MemoryUsed: 91,
+ Usage: 3,
+ Encoder: 9,
+ Decoder: 5,
+ }, nil
+}
+
func (p *psproc) Stop() {}
func (p *psproc) Suspend() error { return nil }
func (p *psproc) Resume() error { return nil }
@@ -42,7 +54,7 @@ func TestCPULimit(t *testing.T) {
l := NewLimiter(LimiterConfig{
CPU: 42,
- OnLimit: func(float64, uint64) {
+ OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
wg.Done()
},
})
@@ -57,7 +69,7 @@ func TestCPULimit(t *testing.T) {
lock.Unlock()
}()
- assert.Eventually(t, func() bool {
+ require.Eventually(t, func() bool {
lock.Lock()
defer lock.Unlock()
@@ -79,7 +91,7 @@ func TestCPULimitWaitFor(t *testing.T) {
l := NewLimiter(LimiterConfig{
CPU: 42,
WaitFor: 3 * time.Second,
- OnLimit: func(float64, uint64) {
+ OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
wg.Done()
},
})
@@ -94,7 +106,7 @@ func TestCPULimitWaitFor(t *testing.T) {
lock.Unlock()
}()
- assert.Eventually(t, func() bool {
+ require.Eventually(t, func() bool {
lock.Lock()
defer lock.Unlock()
@@ -115,7 +127,7 @@ func TestMemoryLimit(t *testing.T) {
l := NewLimiter(LimiterConfig{
Memory: 42,
- OnLimit: func(float64, uint64) {
+ OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
wg.Done()
},
})
@@ -130,7 +142,7 @@ func TestMemoryLimit(t *testing.T) {
lock.Unlock()
}()
- assert.Eventually(t, func() bool {
+ require.Eventually(t, func() bool {
lock.Lock()
defer lock.Unlock()
@@ -152,7 +164,7 @@ func TestMemoryLimitWaitFor(t *testing.T) {
l := NewLimiter(LimiterConfig{
Memory: 42,
WaitFor: 3 * time.Second,
- OnLimit: func(float64, uint64) {
+ OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
wg.Done()
},
})
@@ -167,7 +179,80 @@ func TestMemoryLimitWaitFor(t *testing.T) {
lock.Unlock()
}()
- assert.Eventually(t, func() bool {
+ require.Eventually(t, func() bool {
+ lock.Lock()
+ defer lock.Unlock()
+
+ return done
+ }, 10*time.Second, 1*time.Second)
+}
+
+func TestGPUMemoryLimit(t *testing.T) {
+ lock := sync.Mutex{}
+
+ lock.Lock()
+ done := false
+ lock.Unlock()
+
+ go func() {
+ wg := sync.WaitGroup{}
+ wg.Add(1)
+
+ l := NewLimiter(LimiterConfig{
+ GPUMemory: 42,
+ OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
+ wg.Done()
+ },
+ })
+
+ l.Start(&psproc{})
+ defer l.Stop()
+
+ wg.Wait()
+
+ lock.Lock()
+ done = true
+ lock.Unlock()
+ }()
+
+ require.Eventually(t, func() bool {
+ lock.Lock()
+ defer lock.Unlock()
+
+ return done
+ }, 2*time.Second, 100*time.Millisecond)
+}
+
+func TestGPUMemoryLimitWaitFor(t *testing.T) {
+ lock := sync.Mutex{}
+
+ lock.Lock()
+ done := false
+ lock.Unlock()
+
+ go func() {
+ wg := sync.WaitGroup{}
+ wg.Add(1)
+
+ l := NewLimiter(LimiterConfig{
+ GPUMemory: 42,
+ WaitFor: 3 * time.Second,
+ OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
+ wg.Done()
+ },
+ })
+
+ l.Start(&psproc{})
+ defer l.Stop()
+
+ wg.Wait()
+
+ lock.Lock()
+ done = true
+ lock.Unlock()
+ }()
+
+ require.Eventually(t, func() bool {
lock.Lock()
defer lock.Unlock()
@@ -189,7 +274,7 @@ func TestMemoryLimitSoftMode(t *testing.T) {
l := NewLimiter(LimiterConfig{
Memory: 42,
Mode: LimitModeSoft,
- OnLimit: func(float64, uint64) {
+ OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
wg.Done()
},
})
@@ -197,7 +282,7 @@ func TestMemoryLimitSoftMode(t *testing.T) {
l.Start(&psproc{})
defer l.Stop()
- l.Limit(false, true)
+ l.Limit(false, true, false)
wg.Wait()
@@ -206,7 +291,46 @@ func TestMemoryLimitSoftMode(t *testing.T) {
lock.Unlock()
}()
- assert.Eventually(t, func() bool {
+ require.Eventually(t, func() bool {
+ lock.Lock()
+ defer lock.Unlock()
+
+ return done
+ }, 2*time.Second, 100*time.Millisecond)
+}
+
+func TestGPUMemoryLimitSoftMode(t *testing.T) {
+ lock := sync.Mutex{}
+
+ lock.Lock()
+ done := false
+ lock.Unlock()
+
+ go func() {
+ wg := sync.WaitGroup{}
+ wg.Add(1)
+
+ l := NewLimiter(LimiterConfig{
+ GPUMemory: 42,
+ Mode: LimitModeSoft,
+ OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
+ wg.Done()
+ },
+ })
+
+ l.Start(&psproc{})
+ defer l.Stop()
+
+ l.Limit(false, false, true)
+
+ wg.Wait()
+
+ lock.Lock()
+ done = true
+ lock.Unlock()
+ }()
+
+ require.Eventually(t, func() bool {
lock.Lock()
defer lock.Unlock()
diff --git a/process/process.go b/process/process.go
index 0fe0d45f..430cb7e2 100644
--- a/process/process.go
+++ b/process/process.go
@@ -46,29 +46,32 @@ type Process interface {
// Limit enables or disables CPU and memory limiting. CPU will be throttled
// into the configured limit. If memory consumption is above the configured
// limit, the process will be killed.
- Limit(cpu, memory bool) error
+ Limit(cpu, memory, gpu bool) error
}
// Config is the configuration of a process
type Config struct {
- Binary string // Path to the ffmpeg binary.
- Args []string // List of arguments for the binary.
- Reconnect bool // Whether to restart the process if it exited.
- ReconnectDelay time.Duration // Duration to wait before restarting the process.
- StaleTimeout time.Duration // Kill the process after this duration if it doesn't produce any output.
- Timeout time.Duration // Kill the process after this duration.
- LimitCPU float64 // Kill the process if the CPU usage in percent is above this value.
- LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value.
- LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration.
- LimitMode LimitMode // Select limiting mode
- Scheduler Scheduler // A scheduler.
- Parser Parser // A parser for the output of the process.
- OnArgs func(args []string) []string // A callback which is called right before the process will start with the command args.
- OnBeforeStart func() error // A callback which is called before the process will be started. If error is non-nil, the start will be refused.
- OnStart func() // A callback which is called after the process started.
- OnExit func(state string) // A callback which is called after the process exited with the exit state.
- OnStateChange func(from, to string) // A callback which is called after a state changed.
- Logger log.Logger
+ Binary string // Path to the ffmpeg binary.
+ Args []string // List of arguments for the binary.
+ Reconnect bool // Whether to restart the process if it exited.
+ ReconnectDelay time.Duration // Duration to wait before restarting the process.
+ StaleTimeout time.Duration // Kill the process after this duration if it doesn't produce any output.
+ Timeout time.Duration // Kill the process after this duration.
+ LimitCPU float64 // Kill the process if the CPU usage in percent is above this value, in percent 0-100 in hard mode, 0-100*ncpu in soft mode.
+ LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value.
+ LimitGPUUsage float64 // Kill the process if the GPU usage in percent is above this value, in percent 0-100.
+ LimitGPUEncoder float64 // Kill the process if the GPU encoder usage in percent is above this value, in percent 0-100.
+ LimitGPUDecoder float64 // Kill the process if the GPU decoder usage in percent is above this value, in percent 0-100.
+ LimitGPUMemory uint64 // Kill the process if the GPU memory consumption in bytes is above this value.
+ LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration.
+ LimitMode LimitMode // Select limiting mode
+ Scheduler Scheduler // A scheduler.
+ Parser Parser // A parser for the output of the process.
+ OnBeforeStart func(args []string) ([]string, error) // A callback which is called before the process will be started. The string slice is the arguments of the command line. If error is non-nil, the start will be refused.
+ OnStart func() // A callback which is called after the process started.
+ OnExit func(state string) // A callback which is called after the process exited with the exit state.
+ OnStateChange func(from, to string) // A callback which is called after a state changed.
+ Logger log.Logger
}
// Status represents the current status of a process
@@ -81,20 +84,47 @@ type Status struct {
Time time.Time // Time is the time of the last change of the state
CommandArgs []string // Currently running command arguments
LimitMode string // The limiting mode
- CPU struct {
- NCPU float64 // Number of logical CPUs
- Current float64 // Currently consumed CPU in percent
- Average float64 // Average consumed CPU in percent
- Max float64 // Max. consumed CPU in percent
- Limit float64 // Usage limit in percent
- IsThrottling bool // Whether the CPU is currently limited
- } // Used CPU in percent
- Memory struct {
- Current uint64 // Currently consumed memory in bytes
- Average float64 // Average consumed memory in bytes
- Max uint64 // Max. consumed memory in bytes
- Limit uint64 // Usage limit in bytes
- } // Used memory in bytes
+ CPU StatusCPU // CPU consumption in percent
+ Memory StatusMemory // Memory consumption in bytes
+ GPU StatusGPU // GPU consumption
+}
+
+type StatusCPU struct {
+ NCPU float64 // Number of logical CPUs
+ Current float64 // Currently consumed CPU in percent
+ Average float64 // Average consumed CPU in percent
+ Max float64 // Max. consumed CPU in percent
+ Limit float64 // Usage limit in percent
+ IsThrottling bool // Whether the CPU is currently limited
+}
+
+type StatusMemory struct {
+ Current uint64 // Currently consumed memory in bytes
+ Average uint64 // Average consumed memory in bytes
+ Max uint64 // Max. consumed memory in bytes
+ Limit uint64 // Usage limit in bytes
+}
+
+type StatusGPUMemory struct {
+ Current uint64 // Currently consumed memory in bytes
+ Average uint64 // Average consumed memory in bytes
+ Max uint64 // Max. consumed memory in bytes
+ Limit uint64 // Usage limit in bytes
+}
+
+type StatusGPUUsage struct {
+ Current float64 // Currently consumed GPU usage in percent
+ Average float64 // Average consumed GPU usage in percent
+ Max float64 // Max. consumed GPU usage in percent
+ Limit float64 // Usage limit in percent
+}
+
+type StatusGPU struct {
+ Index int
+ Memory StatusGPUMemory // GPU memory consumption
+ Usage StatusGPUUsage // GPU usage in percent
+ Encoder StatusGPUUsage // GPU encoder usage in percent
+ Decoder StatusGPUUsage // GPU decoder usage in percent
}
// States
@@ -206,8 +236,7 @@ type process struct {
logger log.Logger
debuglogger log.Logger
callbacks struct {
- onArgs func(args []string) []string
- onBeforeStart func() error
+ onBeforeStart func(args []string) ([]string, error)
onStart func()
onExit func(state string)
onStateChange func(from, to string)
@@ -263,28 +292,35 @@ func New(config Config) (Process, error) {
p.stale.last = time.Now()
p.stale.timeout = config.StaleTimeout
- p.callbacks.onArgs = config.OnArgs
p.callbacks.onBeforeStart = config.OnBeforeStart
p.callbacks.onStart = config.OnStart
p.callbacks.onExit = config.OnExit
p.callbacks.onStateChange = config.OnStateChange
p.limits = NewLimiter(LimiterConfig{
- CPU: config.LimitCPU,
- Memory: config.LimitMemory,
- WaitFor: config.LimitDuration,
- Mode: config.LimitMode,
- Logger: p.logger.WithComponent("ProcessLimiter"),
- OnLimit: func(cpu float64, memory uint64) {
+ CPU: config.LimitCPU,
+ Memory: config.LimitMemory,
+ GPUUsage: config.LimitGPUUsage,
+ GPUEncoder: config.LimitGPUEncoder,
+ GPUDecoder: config.LimitGPUDecoder,
+ GPUMemory: config.LimitGPUMemory,
+ WaitFor: config.LimitDuration,
+ Mode: config.LimitMode,
+ Logger: p.logger.WithComponent("ProcessLimiter"),
+ OnLimit: func(cpu float64, memory uint64, gpuusage, gpuencoder, gpudecoder float64, gpumemory uint64) {
if !p.isRunning() {
return
}
p.logger.WithFields(log.Fields{
- "cpu": cpu,
- "memory": memory,
+ "cpu": cpu,
+ "memory": memory,
+ "gpuusage": gpuusage,
+ "gpuencoder": gpuencoder,
+ "gpudecoder": gpudecoder,
+ "gpumemory": gpumemory,
}).Warn().Log("Killed because limits are exceeded")
- p.Kill(false, fmt.Sprintf("Killed because limits are exceeded (mode: %s, tolerance: %s): %.2f (%.2f) CPU, %d (%d) bytes memory", config.LimitMode.String(), config.LimitDuration.String(), cpu, config.LimitCPU, memory, config.LimitMemory))
+ p.Kill(false, fmt.Sprintf("Killed because limits are exceeded (mode: %s, tolerance: %s): %.2f (%.2f) CPU, %d (%d) bytes memory, %.2f/%.2f/%.2f (%.2f) GPU usage, %d (%d) bytes GPU memory", config.LimitMode.String(), config.LimitDuration.String(), cpu, config.LimitCPU, memory, config.LimitMemory, gpuusage, gpuencoder, gpudecoder, config.LimitGPUUsage, gpumemory, config.LimitGPUMemory))
},
})
@@ -467,8 +503,47 @@ func (p *process) Status() Status {
Duration: time.Since(stateTime),
Time: stateTime,
LimitMode: p.limits.Mode().String(),
- CPU: usage.CPU,
- Memory: usage.Memory,
+ CPU: StatusCPU{
+ NCPU: usage.CPU.NCPU,
+ Current: usage.CPU.Current,
+ Average: usage.CPU.Average,
+ Max: usage.CPU.Max,
+ Limit: usage.CPU.Limit,
+ IsThrottling: usage.CPU.IsThrottling,
+ },
+ Memory: StatusMemory{
+ Current: usage.Memory.Current,
+ Average: uint64(usage.Memory.Average),
+ Max: usage.Memory.Max,
+ Limit: usage.Memory.Limit,
+ },
+ GPU: StatusGPU{
+ Index: usage.GPU.Index,
+ Memory: StatusGPUMemory{
+ Current: usage.GPU.Memory.Current,
+ Average: uint64(usage.GPU.Memory.Average),
+ Max: usage.GPU.Memory.Max,
+ Limit: usage.GPU.Memory.Limit,
+ },
+ Usage: StatusGPUUsage{
+ Current: usage.GPU.Usage.Current,
+ Average: usage.GPU.Usage.Average,
+ Max: usage.GPU.Usage.Max,
+ Limit: usage.GPU.Usage.Limit,
+ },
+ Encoder: StatusGPUUsage{
+ Current: usage.GPU.Encoder.Current,
+ Average: usage.GPU.Encoder.Average,
+ Max: usage.GPU.Encoder.Max,
+ Limit: usage.GPU.Encoder.Limit,
+ },
+ Decoder: StatusGPUUsage{
+ Current: usage.GPU.Decoder.Current,
+ Average: usage.GPU.Decoder.Average,
+ Max: usage.GPU.Decoder.Max,
+ Limit: usage.GPU.Decoder.Limit,
+ },
+ },
}
s.CommandArgs = make([]string, len(p.args))
@@ -488,7 +563,7 @@ func (p *process) IsRunning() bool {
return p.isRunning()
}
-func (p *process) Limit(cpu, memory bool) error {
+func (p *process) Limit(cpu, memory, gpu bool) error {
if !p.isRunning() {
return nil
}
@@ -498,11 +573,12 @@ func (p *process) Limit(cpu, memory bool) error {
}
p.logger.Warn().WithFields(log.Fields{
- "limit_cpu": cpu,
- "limit_memory": memory,
+ "limit_cpu": cpu,
+ "limit_memory": memory,
+ "limit_gpumemory": gpu,
}).Log("Limiter triggered")
- return p.limits.Limit(cpu, memory)
+ return p.limits.Limit(cpu, memory, gpu)
}
// Start will start the process and sets the order to "start". If the
@@ -559,11 +635,21 @@ func (p *process) start() error {
args := p.args
- if p.callbacks.onArgs != nil {
+ if p.callbacks.onBeforeStart != nil {
args = make([]string, len(p.args))
copy(args, p.args)
- args = p.callbacks.onArgs(args)
+ args, err = p.callbacks.onBeforeStart(args)
+ if err != nil {
+ p.setState(stateFailed)
+
+ p.parser.Parse([]byte(err.Error()))
+ p.logger.WithError(err).Error().Log("Starting failed")
+
+ p.reconnect(p.delay(stateFailed))
+
+ return err
+ }
}
p.cmd = exec.Command(p.binary, args...)
@@ -582,19 +668,6 @@ func (p *process) start() error {
return err
}
- if p.callbacks.onBeforeStart != nil {
- if err := p.callbacks.onBeforeStart(); err != nil {
- p.setState(stateFailed)
-
- p.parser.Parse([]byte(err.Error()))
- p.logger.WithError(err).Error().Log("Starting failed")
-
- p.reconnect(p.delay(stateFailed))
-
- return err
- }
- }
-
if err := p.cmd.Start(); err != nil {
p.setState(stateFailed)
diff --git a/process/process_test.go b/process/process_test.go
index 11c669b9..6ddba58a 100644
--- a/process/process_test.go
+++ b/process/process_test.go
@@ -606,21 +606,15 @@ func TestProcessCallbacks(t *testing.T) {
"2",
},
Reconnect: false,
- OnArgs: func(a []string) []string {
- lock.Lock()
- defer lock.Unlock()
-
- args = make([]string, len(a))
- copy(args, a)
- return a
- },
- OnBeforeStart: func() error {
+ OnBeforeStart: func(a []string) ([]string, error) {
lock.Lock()
defer lock.Unlock()
onBeforeStart = true
- return nil
+ args = make([]string, len(a))
+ copy(args, a)
+ return a, nil
},
OnStart: func() {
lock.Lock()
@@ -681,8 +675,8 @@ func TestProcessCallbacksOnBeforeStart(t *testing.T) {
Parser: parser,
Reconnect: true,
ReconnectDelay: 10 * time.Second,
- OnBeforeStart: func() error {
- return fmt.Errorf("no, not now")
+ OnBeforeStart: func(a []string) ([]string, error) {
+ return a, fmt.Errorf("no, not now")
},
})
require.NoError(t, err)
diff --git a/psutil/gpu/gpu.go b/psutil/gpu/gpu.go
index 7feb19bd..cb8dcf00 100644
--- a/psutil/gpu/gpu.go
+++ b/psutil/gpu/gpu.go
@@ -3,21 +3,25 @@ package gpu
import "errors"
type Process struct {
- PID int32
- Memory uint64
+ PID int32
+ Index int
+ Memory uint64 // bytes
+ Usage float64 // percent 0-100
+ Encoder float64 // percent 0-100
+ Decoder float64 // percent 0-100
}
type Stats struct {
+ ID string
Name string
Architecture string
- MemoryTotal uint64
- MemoryUsed uint64
+ MemoryTotal uint64 // bytes
+ MemoryUsed uint64 // bytes
- Usage float64
- MemoryUsage float64
- EncoderUsage float64
- DecoderUsage float64
+ Usage float64 // percent 0-100
+ Encoder float64 // percent 0-100
+ Decoder float64 // percent 0-100
Process []Process
@@ -25,9 +29,17 @@ type Stats struct {
}
type GPU interface {
+ // Count returns the number of GPU in the system.
Count() (int, error)
+
+ // Stats returns current GPU stats.
Stats() ([]Stats, error)
+
+ // Process returns a Process.
Process(pid int32) (Process, error)
+
+ // Close stops all GPU collection processes
+ Close()
}
var ErrProcessNotFound = errors.New("process not found")
diff --git a/psutil/gpu/nvidia/fixtures/process.txt b/psutil/gpu/nvidia/fixtures/process.txt
new file mode 100644
index 00000000..55d7bcf4
--- /dev/null
+++ b/psutil/gpu/nvidia/fixtures/process.txt
@@ -0,0 +1,54 @@
+# gpu pid type sm mem enc dec fb command
+# Idx # C/G % % % % MB name
+ 0 7372 C 2 0 2 - 136 ffmpeg
+ 0 12176 C 5 2 3 7 782 ffmpeg
+ 0 20035 C 8 2 4 1 1145 ffmpeg
+ 0 20141 C 2 1 1 3 429 ffmpeg
+ 0 29591 C 2 1 - 2 435 ffmpeg
+ 0 7372 C 2 0 - - 136 ffmpeg
+ 0 12176 C 8 3 7 9 782 ffmpeg
+ 0 20035 C 8 2 3 1 1145 ffmpeg
+ 0 20141 C - - 1 1 429 ffmpeg
+ 0 29591 C 3 1 - 2 435 ffmpeg
+ 0 7372 C 2 1 1 - 136 ffmpeg
+ 0 12176 C 5 1 5 7 782 ffmpeg
+ 0 20035 C 8 3 1 4 1145 ffmpeg
+ 0 20141 C 2 0 1 - 429 ffmpeg
+ 0 29591 C 2 0 1 3 435 ffmpeg
+ 0 7372 C 2 0 - - 136 ffmpeg
+ 0 12176 C 5 1 5 3 782 ffmpeg
+ 0 20035 C 8 2 5 4 1145 ffmpeg
+ 0 20141 C 3 1 - 5 429 ffmpeg
+ 0 29591 C 2 0 - 1 435 ffmpeg
+ 0 7372 C 2 1 - - 136 ffmpeg
+ 0 12176 C 10 3 6 8 782 ffmpeg
+ 0 20035 C 3 1 1 1 1145 ffmpeg
+ 0 20141 C - - 4 1 429 ffmpeg
+ 0 29591 C 5 2 - 2 435 ffmpeg
+ 0 7372 C 5 1 2 - 136 ffmpeg
+ 0 12176 C 6 2 4 7 782 ffmpeg
+ 0 20035 C - - - - 1145 ffmpeg
+ 0 20141 C 5 1 1 3 429 ffmpeg
+ 0 29591 C 5 2 2 4 435 ffmpeg
+ 0 7372 C - - 1 - 136 ffmpeg
+ 0 12176 C 7 2 3 4 782 ffmpeg
+ 0 20035 C 2 0 - 1 1145 ffmpeg
+ 0 20141 C 7 2 4 4 429 ffmpeg
+ 0 29591 C 5 1 2 3 435 ffmpeg
+ 0 7372 C 2 0 1 - 136 ffmpeg
+ 0 12176 C 9 3 3 6 782 ffmpeg
+ 0 20035 C 2 1 - 1 1145 ffmpeg
+ 0 20141 C 4 1 4 5 429 ffmpeg
+ 0 29591 C 2 0 2 1 435 ffmpeg
+ 0 7372 C - - - - 136 ffmpeg
+ 0 12176 C 10 3 4 8 782 ffmpeg
+ 0 20035 C 4 1 2 1 1145 ffmpeg
+ 0 20141 C 7 2 3 3 429 ffmpeg
+# gpu pid type sm mem enc dec fb command
+# Idx # C/G % % % % MB name
+ 0 29591 C - - 1 1 435 ffmpeg
+ 0 7372 C 2 0 2 - 136 ffmpeg
+ 0 12176 C 7 2 2 6 782 ffmpeg
+ 0 20035 C 7 2 4 3 1145 ffmpeg
+ 0 20141 C 5 1 1 3 429 ffmpeg
+ 0 29591 C - - 1 1 435 ffmpeg
\ No newline at end of file
diff --git a/psutil/gpu/nvidia/fixtures/data1.xml b/psutil/gpu/nvidia/fixtures/query1.xml
similarity index 100%
rename from psutil/gpu/nvidia/fixtures/data1.xml
rename to psutil/gpu/nvidia/fixtures/query1.xml
diff --git a/psutil/gpu/nvidia/fixtures/data2.xml b/psutil/gpu/nvidia/fixtures/query2.xml
similarity index 98%
rename from psutil/gpu/nvidia/fixtures/data2.xml
rename to psutil/gpu/nvidia/fixtures/query2.xml
index cd45d707..4d93cac0 100644
--- a/psutil/gpu/nvidia/fixtures/data2.xml
+++ b/psutil/gpu/nvidia/fixtures/query2.xml
@@ -438,6 +438,18 @@
+
+ 10131
+ C
+ ffmpeg
+ 389 MiB
+
+
+ 13597
+ C
+ ffmpeg
+ 1054 MiB
+
@@ -879,6 +891,12 @@
+
+ 16870
+ C
+ ffmpeg
+ 549 MiB
+
diff --git a/psutil/gpu/nvidia/fixtures/data3.xml b/psutil/gpu/nvidia/fixtures/query3.xml
similarity index 100%
rename from psutil/gpu/nvidia/fixtures/data3.xml
rename to psutil/gpu/nvidia/fixtures/query3.xml
diff --git a/psutil/gpu/nvidia/nvidia.go b/psutil/gpu/nvidia/nvidia.go
index ba45e2fa..98ad1520 100644
--- a/psutil/gpu/nvidia/nvidia.go
+++ b/psutil/gpu/nvidia/nvidia.go
@@ -6,6 +6,9 @@ import (
"encoding/xml"
"fmt"
"os/exec"
+ "regexp"
+ "slices"
+ "strconv"
"sync"
"time"
@@ -47,11 +50,19 @@ func (u *Utilization) UnmarshalText(text []byte) error {
}
type Process struct {
- PID int32 `xml:"pid"`
- Memory Megabytes `xml:"used_memory"`
+ Index int
+ PID int32
+ Memory uint64 // bytes
+
+ Usage float64 // percent 0-100
+ Encoder float64 // percent 0-100
+ Decoder float64 // percent 0-100
+
+ lastSeen time.Time
}
type GPUStats struct {
+ ID string `xml:"id,attr"`
Name string `xml:"product_name"`
Architecture string `xml:"product_architecture"`
@@ -59,31 +70,17 @@ type GPUStats struct {
MemoryUsed Megabytes `xml:"fb_memory_usage>used"`
Usage Utilization `xml:"utilization>gpu_util"`
- MemoryUsage Utilization `xml:"utilization>memory_util"`
- EncoderUsage Utilization `xml:"utilization>encoder_util"`
- DecoderUsage Utilization `xml:"utilization>decoder_util"`
-
- Process []Process `xml:"processes>process_info"`
+ UsageEncoder Utilization `xml:"utilization>encoder_util"`
+ UsageDecoder Utilization `xml:"utilization>decoder_util"`
}
type Stats struct {
GPU []GPUStats `xml:"gpu"`
}
-func parse(data []byte) (Stats, error) {
- nv := Stats{}
-
- err := xml.Unmarshal(data, &nv)
- if err != nil {
- return nv, fmt.Errorf("parsing report: %w", err)
- }
-
- return nv, nil
-}
-
type nvidia struct {
- cmd *exec.Cmd
- wr *writer
+ wrQuery *writerQuery
+ wrProcess *writerProcess
lock sync.RWMutex
cancel context.CancelFunc
@@ -97,33 +94,33 @@ type dummy struct{}
func (d *dummy) Count() (int, error) { return 0, nil }
func (d *dummy) Stats() ([]gpu.Stats, error) { return nil, nil }
func (d *dummy) Process(pid int32) (gpu.Process, error) { return gpu.Process{}, gpu.ErrProcessNotFound }
+func (d *dummy) Close() {}
-type writer struct {
- buf bytes.Buffer
- ch chan Stats
+type writerQuery struct {
+ buf bytes.Buffer
+ ch chan Stats
+ terminator []byte
}
-var terminator = []byte("\n")
-
-func (w *writer) Write(data []byte) (int, error) {
+func (w *writerQuery) Write(data []byte) (int, error) {
n, err := w.buf.Write(data)
if err != nil {
return n, err
}
for {
- idx := bytes.Index(w.buf.Bytes(), terminator)
+ idx := bytes.Index(w.buf.Bytes(), w.terminator)
if idx == -1 {
break
}
- content := make([]byte, idx+len(terminator))
+ content := make([]byte, idx+len(w.terminator))
n, err := w.buf.Read(content)
if err != nil || n != len(content) {
break
}
- s, err := parse(content)
+ s, err := w.parse(content)
if err != nil {
continue
}
@@ -134,19 +131,132 @@ func (w *writer) Write(data []byte) (int, error) {
return n, nil
}
+func (w *writerQuery) parse(data []byte) (Stats, error) {
+ nv := Stats{}
+
+ err := xml.Unmarshal(data, &nv)
+ if err != nil {
+ return nv, fmt.Errorf("parsing report: %w", err)
+ }
+
+ return nv, nil
+}
+
+type writerProcess struct {
+ buf bytes.Buffer
+ ch chan Process
+ re *regexp.Regexp
+ terminator []byte
+}
+
+func (w *writerProcess) Write(data []byte) (int, error) {
+ n, err := w.buf.Write(data)
+ if err != nil {
+ return n, err
+ }
+
+ for {
+ idx := bytes.Index(w.buf.Bytes(), w.terminator)
+ if idx == -1 {
+ break
+ }
+
+ content := make([]byte, idx+len(w.terminator))
+ n, err := w.buf.Read(content)
+ if err != nil || n != len(content) {
+ break
+ }
+
+ s, err := w.parse(content)
+ if err != nil {
+ continue
+ }
+
+ w.ch <- s
+ }
+
+ return n, nil
+}
+
+func (w *writerProcess) parse(data []byte) (Process, error) {
+ p := Process{}
+
+ if len(data) == 0 {
+ return p, fmt.Errorf("empty line")
+ }
+
+ if data[0] == '#' {
+ return p, fmt.Errorf("comment")
+ }
+
+ matches := w.re.FindStringSubmatch(string(data))
+ if matches == nil {
+ return p, fmt.Errorf("no matches found")
+ }
+
+ if len(matches) != 7 {
+ return p, fmt.Errorf("not the expected number of matches found")
+ }
+
+ if d, err := strconv.ParseInt(matches[1], 10, 0); err == nil {
+ p.Index = int(d)
+ }
+
+ if d, err := strconv.ParseInt(matches[2], 10, 32); err == nil {
+ p.PID = int32(d)
+ }
+
+ if matches[3][0] != '-' {
+ if d, err := strconv.ParseFloat(matches[3], 64); err == nil {
+ p.Usage = d
+ }
+ }
+
+ if matches[4][0] != '-' {
+ if d, err := strconv.ParseFloat(matches[4], 64); err == nil {
+ p.Encoder = d
+ }
+ }
+
+ if matches[5][0] != '-' {
+ if d, err := strconv.ParseFloat(matches[5], 64); err == nil {
+ p.Decoder = d
+ }
+ }
+
+ if d, err := strconv.ParseUint(matches[6], 10, 64); err == nil {
+ p.Memory = d * 1024 * 1024
+ }
+
+ return p, nil
+}
+
func New(path string) gpu.GPU {
if len(path) == 0 {
path = "nvidia-smi"
}
- _, err := exec.LookPath(path)
+ path, err := exec.LookPath(path)
if err != nil {
return &dummy{}
}
n := &nvidia{
- wr: &writer{
- ch: make(chan Stats, 1),
+ wrQuery: &writerQuery{
+ ch: make(chan Stats, 1),
+ terminator: []byte("\n"),
+ },
+ wrProcess: &writerProcess{
+ ch: make(chan Process, 32),
+ // # gpu pid type sm mem enc dec fb command
+ // # Idx # C/G % % % % MB name
+ // 0 7372 C 2 0 2 - 136 ffmpeg
+ // 0 12176 C 5 2 3 7 782 ffmpeg
+ // 0 20035 C 8 2 4 1 1145 ffmpeg
+ // 0 20141 C 2 1 1 3 429 ffmpeg
+ // 0 29591 C 2 1 - 2 435 ffmpeg
+ re: regexp.MustCompile(`^\s*([0-9]+)\s+([0-9]+)\s+[A-Z]\s+([0-9-]+)\s+[0-9-]+\s+([0-9-]+)\s+([0-9-]+)\s+([0-9]+).*`),
+ terminator: []byte("\n"),
},
process: map[int32]Process{},
}
@@ -154,7 +264,8 @@ func New(path string) gpu.GPU {
ctx, cancel := context.WithCancel(context.Background())
n.cancel = cancel
- go n.runner(ctx, path)
+ go n.runnerQuery(ctx, path)
+ go n.runnerProcess(ctx, path)
go n.reader(ctx)
return n
@@ -165,13 +276,18 @@ func (n *nvidia) reader(ctx context.Context) {
select {
case <-ctx.Done():
return
- case stats := <-n.wr.ch:
+ case stats := <-n.wrQuery.ch:
n.lock.Lock()
n.stats = stats
- n.process = map[int32]Process{}
- for _, g := range n.stats.GPU {
- for _, p := range g.Process {
- n.process[p.PID] = p
+ n.lock.Unlock()
+ case process := <-n.wrProcess.ch:
+ process.lastSeen = time.Now()
+ n.lock.Lock()
+ n.process[process.PID] = process
+
+ for pid, p := range n.process {
+ if time.Since(p.lastSeen) > 11*time.Second {
+ delete(n.process, pid)
}
}
n.lock.Unlock()
@@ -179,11 +295,11 @@ func (n *nvidia) reader(ctx context.Context) {
}
}
-func (n *nvidia) runner(ctx context.Context, path string) {
+func (n *nvidia) runnerQuery(ctx context.Context, path string) {
for {
- n.cmd = exec.Command(path, "-q", "-x", "-l", "1")
- n.cmd.Stdout = n.wr
- err := n.cmd.Start()
+ cmd := exec.CommandContext(ctx, path, "-q", "-x", "-l", "1")
+ cmd.Stdout = n.wrQuery
+ err := cmd.Start()
if err != nil {
n.lock.Lock()
n.err = err
@@ -193,7 +309,35 @@ func (n *nvidia) runner(ctx context.Context, path string) {
continue
}
- err = n.cmd.Wait()
+ err = cmd.Wait()
+
+ n.lock.Lock()
+ n.err = err
+ n.lock.Unlock()
+
+ select {
+ case <-ctx.Done():
+ return
+ default:
+ }
+ }
+}
+
+func (n *nvidia) runnerProcess(ctx context.Context, path string) {
+ for {
+ cmd := exec.CommandContext(ctx, path, "pmon", "-s", "um", "-d", "5")
+ cmd.Stdout = n.wrProcess
+ err := cmd.Start()
+ if err != nil {
+ n.lock.Lock()
+ n.err = err
+ n.lock.Unlock()
+
+ time.Sleep(3 * time.Second)
+ continue
+ }
+
+ err = cmd.Wait()
n.lock.Lock()
n.err = err
@@ -219,39 +363,55 @@ func (n *nvidia) Count() (int, error) {
}
func (n *nvidia) Stats() ([]gpu.Stats, error) {
- s := []gpu.Stats{}
+ stats := []gpu.Stats{}
n.lock.RLock()
defer n.lock.RUnlock()
if n.err != nil {
- return s, n.err
+ return stats, n.err
}
for _, nv := range n.stats.GPU {
- stats := gpu.Stats{
+ s := gpu.Stats{
+ ID: nv.ID,
Name: nv.Name,
Architecture: nv.Architecture,
MemoryTotal: uint64(nv.MemoryTotal),
MemoryUsed: uint64(nv.MemoryUsed),
Usage: float64(nv.Usage),
- MemoryUsage: float64(nv.MemoryUsage),
- EncoderUsage: float64(nv.EncoderUsage),
- DecoderUsage: float64(nv.DecoderUsage),
+ Encoder: float64(nv.UsageEncoder),
+ Decoder: float64(nv.UsageDecoder),
Process: []gpu.Process{},
}
- for _, p := range nv.Process {
- stats.Process = append(stats.Process, gpu.Process{
- PID: p.PID,
- Memory: uint64(p.Memory),
- })
- }
-
- s = append(s, stats)
+ stats = append(stats, s)
}
- return s, nil
+ for _, p := range n.process {
+ if p.Index >= len(stats) {
+ continue
+ }
+
+ stats[p.Index].Process = append(stats[p.Index].Process, gpu.Process{
+ PID: p.PID,
+ Index: p.Index,
+ Memory: p.Memory,
+ Usage: p.Usage,
+ Encoder: p.Encoder,
+ Decoder: p.Decoder,
+ })
+ }
+
+ for i := range stats {
+ p := stats[i].Process
+ slices.SortFunc(p, func(a, b gpu.Process) int {
+ return int(a.PID - b.PID)
+ })
+ stats[i].Process = p
+ }
+
+ return stats, nil
}
func (n *nvidia) Process(pid int32) (gpu.Process, error) {
@@ -259,14 +419,18 @@ func (n *nvidia) Process(pid int32) (gpu.Process, error) {
defer n.lock.RUnlock()
p, hasProcess := n.process[pid]
- if !hasProcess {
- return gpu.Process{}, gpu.ErrProcessNotFound
+ if hasProcess {
+ return gpu.Process{
+ PID: p.PID,
+ Index: p.Index,
+ Memory: p.Memory,
+ Usage: p.Usage,
+ Encoder: p.Encoder,
+ Decoder: p.Decoder,
+ }, nil
}
- return gpu.Process{
- PID: p.PID,
- Memory: uint64(p.Memory),
- }, nil
+ return gpu.Process{Index: -1}, gpu.ErrProcessNotFound
}
func (n *nvidia) Close() {
@@ -279,6 +443,4 @@ func (n *nvidia) Close() {
n.cancel()
n.cancel = nil
-
- n.cmd.Process.Kill()
}
diff --git a/psutil/gpu/nvidia/nvidia_test.go b/psutil/gpu/nvidia/nvidia_test.go
index f18310b2..51954eb8 100644
--- a/psutil/gpu/nvidia/nvidia_test.go
+++ b/psutil/gpu/nvidia/nvidia_test.go
@@ -1,102 +1,430 @@
package nvidia
import (
+ "bytes"
"os"
+ "regexp"
+ "sync"
"testing"
+ "time"
+ "github.com/datarhei/core/v16/internal/testhelper"
+ "github.com/datarhei/core/v16/psutil/gpu"
"github.com/stretchr/testify/require"
)
-func TestParseNV(t *testing.T) {
- data, err := os.ReadFile("./fixtures/data1.xml")
+func TestParseQuery(t *testing.T) {
+ data, err := os.ReadFile("./fixtures/query1.xml")
require.NoError(t, err)
- nv, err := parse(data)
+ wr := &writerQuery{}
+
+ nv, err := wr.parse(data)
require.NoError(t, err)
require.Equal(t, Stats{
GPU: []GPUStats{
{
+ ID: "00000000:01:00.0",
Name: "NVIDIA GeForce GTX 1080",
Architecture: "Pascal",
MemoryTotal: 8119 * 1024 * 1024,
MemoryUsed: 918 * 1024 * 1024,
Usage: 15,
- MemoryUsage: 7,
- EncoderUsage: 3,
- DecoderUsage: 0,
- Process: []Process{
- {
- PID: 18179,
- Memory: 916 * 1024 * 1024,
- },
- },
+ UsageEncoder: 3,
+ UsageDecoder: 0,
},
},
}, nv)
- data, err = os.ReadFile("./fixtures/data2.xml")
+ data, err = os.ReadFile("./fixtures/query2.xml")
require.NoError(t, err)
- nv, err = parse(data)
+ nv, err = wr.parse(data)
require.NoError(t, err)
require.Equal(t, Stats{
GPU: []GPUStats{
{
+ ID: "00000000:01:00.0",
Name: "NVIDIA L4",
Architecture: "Ada Lovelace",
MemoryTotal: 23034 * 1024 * 1024,
MemoryUsed: 1 * 1024 * 1024,
Usage: 2,
- MemoryUsage: 0,
- EncoderUsage: 0,
- DecoderUsage: 0,
+ UsageEncoder: 0,
+ UsageDecoder: 0,
},
{
+ ID: "00000000:C1:00.0",
Name: "NVIDIA L4",
Architecture: "Ada Lovelace",
MemoryTotal: 23034 * 1024 * 1024,
MemoryUsed: 1 * 1024 * 1024,
Usage: 3,
- MemoryUsage: 0,
- EncoderUsage: 0,
- DecoderUsage: 0,
+ UsageEncoder: 0,
+ UsageDecoder: 0,
},
},
}, nv)
- data, err = os.ReadFile("./fixtures/data3.xml")
+ data, err = os.ReadFile("./fixtures/query3.xml")
require.NoError(t, err)
- nv, err = parse(data)
+ nv, err = wr.parse(data)
require.NoError(t, err)
require.Equal(t, Stats{
GPU: []GPUStats{
{
+ ID: "00000000:01:00.0",
Name: "GeForce GTX 1080",
MemoryTotal: 8119 * 1024 * 1024,
MemoryUsed: 2006 * 1024 * 1024,
Usage: 32,
- MemoryUsage: 11,
- EncoderUsage: 17,
- DecoderUsage: 25,
- Process: []Process{
- {
- PID: 10131,
- Memory: 389 * 1024 * 1024,
- },
- {
- PID: 13597,
- Memory: 1054 * 1024 * 1024,
- },
- {
- PID: 16870,
- Memory: 549 * 1024 * 1024,
- },
- },
+ UsageEncoder: 17,
+ UsageDecoder: 25,
},
},
}, nv)
}
+
+func TestParseProcess(t *testing.T) {
+ data, err := os.ReadFile("./fixtures/process.txt")
+ require.NoError(t, err)
+
+ wr := &writerProcess{
+ re: regexp.MustCompile(`^\s*([0-9]+)\s+([0-9]+)\s+[A-Z]\s+([0-9-]+)\s+[0-9-]+\s+([0-9-]+)\s+([0-9-]+)\s+([0-9]+).*`),
+ }
+
+ lines := bytes.Split(data, []byte("\n"))
+ process := map[int32]Process{}
+
+ for _, line := range lines {
+ p, err := wr.parse(line)
+ if err != nil {
+ continue
+ }
+
+ process[p.PID] = p
+ }
+
+ require.Equal(t, map[int32]Process{
+ 7372: {
+ Index: 0,
+ PID: 7372,
+ Memory: 136 * 1024 * 1024,
+ Usage: 2,
+ Encoder: 2,
+ Decoder: 0,
+ },
+ 12176: {
+ Index: 0,
+ PID: 12176,
+ Memory: 782 * 1024 * 1024,
+ Usage: 7,
+ Encoder: 2,
+ Decoder: 6,
+ },
+ 20035: {
+ Index: 0,
+ PID: 20035,
+ Memory: 1145 * 1024 * 1024,
+ Usage: 7,
+ Encoder: 4,
+ Decoder: 3,
+ },
+ 20141: {
+ Index: 0,
+ PID: 20141,
+ Memory: 429 * 1024 * 1024,
+ Usage: 5,
+ Encoder: 1,
+ Decoder: 3,
+ },
+ 29591: {
+ Index: 0,
+ PID: 29591,
+ Memory: 435 * 1024 * 1024,
+ Usage: 0,
+ Encoder: 1,
+ Decoder: 1,
+ },
+ }, process)
+}
+
+func TestWriterQuery(t *testing.T) {
+ data, err := os.ReadFile("./fixtures/query2.xml")
+ require.NoError(t, err)
+
+ wr := &writerQuery{
+ ch: make(chan Stats, 1),
+ terminator: []byte(""),
+ }
+
+ stats := Stats{}
+ wg := sync.WaitGroup{}
+ wg.Add(1)
+
+ go func() {
+ defer wg.Done()
+
+ for s := range wr.ch {
+ stats = s
+ }
+ }()
+
+ _, err = wr.Write(data)
+ require.NoError(t, err)
+
+ close(wr.ch)
+
+ wg.Wait()
+
+ require.Equal(t, Stats{
+ GPU: []GPUStats{
+ {
+ ID: "00000000:01:00.0",
+ Name: "NVIDIA L4",
+ Architecture: "Ada Lovelace",
+ MemoryTotal: 23034 * 1024 * 1024,
+ MemoryUsed: 1 * 1024 * 1024,
+ Usage: 2,
+ UsageEncoder: 0,
+ UsageDecoder: 0,
+ },
+ {
+ ID: "00000000:C1:00.0",
+ Name: "NVIDIA L4",
+ Architecture: "Ada Lovelace",
+ MemoryTotal: 23034 * 1024 * 1024,
+ MemoryUsed: 1 * 1024 * 1024,
+ Usage: 3,
+ UsageEncoder: 0,
+ UsageDecoder: 0,
+ },
+ },
+ }, stats)
+}
+
+func TestWriterProcess(t *testing.T) {
+ data, err := os.ReadFile("./fixtures/process.txt")
+ require.NoError(t, err)
+
+ wr := &writerProcess{
+ ch: make(chan Process, 32),
+ re: regexp.MustCompile(`^\s*([0-9]+)\s+([0-9]+)\s+[A-Z]\s+([0-9-]+)\s+[0-9-]+\s+([0-9-]+)\s+([0-9-]+)\s+([0-9]+).*`),
+ terminator: []byte("\n"),
+ }
+
+ process := map[int32]Process{}
+ wg := sync.WaitGroup{}
+ wg.Add(1)
+
+ go func() {
+ defer wg.Done()
+ for p := range wr.ch {
+ process[p.PID] = p
+ }
+ }()
+
+ _, err = wr.Write(data)
+ require.NoError(t, err)
+
+ close(wr.ch)
+
+ wg.Wait()
+
+ require.Equal(t, map[int32]Process{
+ 7372: {
+ Index: 0,
+ PID: 7372,
+ Memory: 136 * 1024 * 1024,
+ Usage: 2,
+ Encoder: 2,
+ Decoder: 0,
+ },
+ 12176: {
+ Index: 0,
+ PID: 12176,
+ Memory: 782 * 1024 * 1024,
+ Usage: 7,
+ Encoder: 2,
+ Decoder: 6,
+ },
+ 20035: {
+ Index: 0,
+ PID: 20035,
+ Memory: 1145 * 1024 * 1024,
+ Usage: 7,
+ Encoder: 4,
+ Decoder: 3,
+ },
+ 20141: {
+ Index: 0,
+ PID: 20141,
+ Memory: 429 * 1024 * 1024,
+ Usage: 5,
+ Encoder: 1,
+ Decoder: 3,
+ },
+ 29591: {
+ Index: 0,
+ PID: 29591,
+ Memory: 435 * 1024 * 1024,
+ Usage: 0,
+ Encoder: 1,
+ Decoder: 1,
+ },
+ }, process)
+}
+
+func TestNvidiaGPUCount(t *testing.T) {
+ binary, err := testhelper.BuildBinary("nvidia-smi", "../../../internal/testhelper")
+ require.NoError(t, err, "Failed to build helper program")
+
+ nv := New(binary)
+
+ t.Cleanup(func() {
+ nv.Close()
+ })
+
+ _, ok := nv.(*dummy)
+ require.False(t, ok)
+
+ require.Eventually(t, func() bool {
+ count, _ := nv.Count()
+ return count != 0
+ }, 5*time.Second, time.Second)
+}
+
+func TestNvidiaGPUStats(t *testing.T) {
+ binary, err := testhelper.BuildBinary("nvidia-smi", "../../../internal/testhelper")
+ require.NoError(t, err, "Failed to build helper program")
+
+ nv := New(binary)
+
+ t.Cleanup(func() {
+ nv.Close()
+ })
+
+ _, ok := nv.(*dummy)
+ require.False(t, ok)
+
+ require.Eventually(t, func() bool {
+ stats, _ := nv.Stats()
+
+ if len(stats) != 2 {
+ return false
+ }
+
+ if len(stats[0].Process) != 3 {
+ return false
+ }
+
+ if len(stats[1].Process) != 2 {
+ return false
+ }
+
+ return true
+ }, 5*time.Second, time.Second)
+
+ stats, err := nv.Stats()
+ require.NoError(t, err)
+ require.Equal(t, []gpu.Stats{
+ {
+ ID: "00000000:01:00.0",
+ Name: "NVIDIA L4",
+ Architecture: "Ada Lovelace",
+ MemoryTotal: 23034 * 1024 * 1024,
+ MemoryUsed: 1 * 1024 * 1024,
+ Usage: 2,
+ Encoder: 0,
+ Decoder: 0,
+ Process: []gpu.Process{
+ {
+ Index: 0,
+ PID: 7372,
+ Memory: 136 * 1024 * 1024,
+ Usage: 2,
+ Encoder: 2,
+ Decoder: 0,
+ },
+ {
+ Index: 0,
+ PID: 12176,
+ Memory: 782 * 1024 * 1024,
+ Usage: 5,
+ Encoder: 3,
+ Decoder: 7,
+ },
+ {
+ Index: 0,
+ PID: 29591,
+ Memory: 435 * 1024 * 1024,
+ Usage: 2,
+ Encoder: 0,
+ Decoder: 2,
+ },
+ },
+ },
+ {
+ ID: "00000000:C1:00.0",
+ Name: "NVIDIA L4",
+ Architecture: "Ada Lovelace",
+ MemoryTotal: 23034 * 1024 * 1024,
+ MemoryUsed: 1 * 1024 * 1024,
+ Usage: 3,
+ Encoder: 0,
+ Decoder: 0,
+ Process: []gpu.Process{
+ {
+ Index: 1,
+ PID: 20035,
+ Memory: 1145 * 1024 * 1024,
+ Usage: 8,
+ Encoder: 4,
+ Decoder: 1,
+ },
+ {
+ Index: 1,
+ PID: 20141,
+ Memory: 429 * 1024 * 1024,
+ Usage: 2,
+ Encoder: 1,
+ Decoder: 3,
+ },
+ },
+ },
+ }, stats)
+}
+
+func TestNvidiaGPUProcess(t *testing.T) {
+ binary, err := testhelper.BuildBinary("nvidia-smi", "../../../internal/testhelper")
+ require.NoError(t, err, "Failed to build helper program")
+
+ nv := New(binary)
+
+ t.Cleanup(func() {
+ nv.Close()
+ })
+
+ _, ok := nv.(*dummy)
+ require.False(t, ok)
+
+ require.Eventually(t, func() bool {
+ _, err := nv.Process(12176)
+ return err == nil
+ }, 5*time.Second, time.Second)
+
+ proc, err := nv.Process(12176)
+ require.NoError(t, err)
+ require.Equal(t, gpu.Process{
+ Index: 0,
+ PID: 12176,
+ Memory: 782 * 1024 * 1024,
+ Usage: 5,
+ Encoder: 3,
+ Decoder: 7,
+ }, proc)
+}
diff --git a/psutil/process.go b/psutil/process.go
index 0789f553..bec312ca 100644
--- a/psutil/process.go
+++ b/psutil/process.go
@@ -5,24 +5,28 @@ import (
"sync"
"time"
+ "github.com/datarhei/core/v16/psutil/gpu/nvidia"
psprocess "github.com/shirou/gopsutil/v3/process"
)
type Process interface {
- // CPUPercent returns the current CPU load for this process only. The values
+ // CPU returns the current CPU load for this process only. The values
// are normed to the range of 0 to 100.
- CPUPercent() (*CPUInfoStat, error)
+ CPU() (*CPUInfo, error)
- // VirtualMemory returns the current memory usage in bytes of this process only.
- VirtualMemory() (uint64, error)
+ // Memory returns the current memory usage in bytes of this process only.
+ Memory() (uint64, error)
+
+ // GPU returns the current GPU memory in bytes and usage in percent (0-100) of this process only.
+ GPU() (*GPUInfo, error)
// Stop will stop collecting CPU and memory data for this process.
Stop()
- // Suspend will send SIGSTOP to the process
+ // Suspend will send SIGSTOP to the process.
Suspend() error
- // Resume will send SIGCONT to the process
+ // Resume will send SIGCONT to the process.
Resume() error
}
@@ -142,7 +146,7 @@ func (p *process) Resume() error {
return p.proc.Resume()
}
-func (p *process) CPUPercent() (*CPUInfoStat, error) {
+func (p *process) CPU() (*CPUInfo, error) {
var diff float64
for {
@@ -167,7 +171,7 @@ func (p *process) CPUPercent() (*CPUInfoStat, error) {
diff = p.statCurrentTime.Sub(p.statPreviousTime).Seconds() * p.ncpu
}
- s := &CPUInfoStat{
+ s := &CPUInfo{
System: 0,
User: 0,
Idle: 0,
@@ -186,9 +190,28 @@ func (p *process) CPUPercent() (*CPUInfoStat, error) {
return s, nil
}
-func (p *process) VirtualMemory() (uint64, error) {
+func (p *process) Memory() (uint64, error) {
p.lock.RLock()
defer p.lock.RUnlock()
return p.memRSS, nil
}
+
+func (p *process) GPU() (*GPUInfo, error) {
+ info := &GPUInfo{
+ Index: -1,
+ }
+
+ proc, err := nvidia.Default.Process(p.pid)
+ if err != nil {
+ return info, nil
+ }
+
+ info.Index = proc.Index
+ info.MemoryUsed = proc.Memory
+ info.Usage = proc.Usage
+ info.Encoder = proc.Encoder
+ info.Decoder = proc.Decoder
+
+ return info, nil
+}
diff --git a/psutil/psutil.go b/psutil/psutil.go
index be5e1844..079e933d 100644
--- a/psutil/psutil.go
+++ b/psutil/psutil.go
@@ -47,29 +47,44 @@ func init() {
DefaultUtil, _ = New("/sys/fs/cgroup")
}
-type MemoryInfoStat struct {
+type DiskInfo struct {
+ Path string
+ Fstype string
+ Total uint64
+ Used uint64
+ InodesTotal uint64
+ InodesUsed uint64
+}
+
+type MemoryInfo struct {
Total uint64 // bytes
Available uint64 // bytes
Used uint64 // bytes
}
-type CPUInfoStat struct {
+type NetworkInfo struct {
+ Name string // interface name
+ BytesSent uint64 // number of bytes sent
+ BytesRecv uint64 // number of bytes received
+}
+
+type CPUInfo struct {
System float64 // percent 0-100
User float64 // percent 0-100
Idle float64 // percent 0-100
Other float64 // percent 0-100
}
-type GPUInfoStat struct {
- Name string
+type GPUInfo struct {
+ Index int // Index of the GPU
+ Name string // Name of the GPU (not populated for a specific process)
- MemoryTotal uint64 // bytes
+ MemoryTotal uint64 // bytes (not populated for a specific process)
MemoryUsed uint64 // bytes
- Usage float64 // percent 0-100
- MemoryUsage float64 // percent 0-100
- EncoderUsage float64 // percent 0-100
- DecoderUsage float64 // percent 0-100
+ Usage float64 // percent 0-100
+ Encoder float64 // percent 0-100
+ Decoder float64 // percent 0-100
}
type cpuTimesStat struct {
@@ -85,18 +100,23 @@ type Util interface {
Stop()
// CPUCounts returns the number of cores, either logical or physical.
- CPUCounts(logical bool) (float64, error)
+ CPUCounts() (float64, error)
- // GPUCounts returns the number of GPU cores.
- GPUCounts() (float64, error)
-
- // CPUPercent returns the current CPU load in percent. The values range
+ // CPU returns the current CPU load in percent. The values range
// from 0 to 100, independently of the number of logical cores.
- CPUPercent() (*CPUInfoStat, error)
- DiskUsage(path string) (*disk.UsageStat, error)
- VirtualMemory() (*MemoryInfoStat, error)
- NetIOCounters(pernic bool) ([]net.IOCountersStat, error)
- GPUStats() ([]GPUInfoStat, error)
+ CPU() (*CPUInfo, error)
+
+ // Disk returns the current usage of the partition specified by the path.
+ Disk(path string) (*DiskInfo, error)
+
+ // Memory returns the current memory usage.
+ Memory() (*MemoryInfo, error)
+
+ // Network returns the current network interface statistics per network adapter.
+ Network() ([]NetworkInfo, error)
+
+ // GPU returns the current usage for each GPU.
+ GPU() ([]GPUInfo, error)
// Process returns a process observer for a process with the given pid.
Process(pid int32) (Process, error)
@@ -120,7 +140,7 @@ type util struct {
statPrevious cpuTimesStat
statPreviousTime time.Time
nTicks uint64
- mem MemoryInfoStat
+ mem MemoryInfo
}
// New returns a new util, it will be started automatically
@@ -140,7 +160,7 @@ func New(root string) (Util, error) {
if u.ncpu == 0 {
var err error
- u.ncpu, err = u.CPUCounts(true)
+ u.ncpu, err = u.CPUCounts()
if err != nil {
return nil, err
}
@@ -311,7 +331,7 @@ func (u *util) tickMemory(ctx context.Context, interval time.Duration) {
}
}
-func (u *util) collectMemory() *MemoryInfoStat {
+func (u *util) collectMemory() *MemoryInfo {
stat, err := u.virtualMemory()
if err != nil {
return nil
@@ -320,12 +340,12 @@ func (u *util) collectMemory() *MemoryInfoStat {
return stat
}
-func (u *util) CPUCounts(logical bool) (float64, error) {
+func (u *util) CPUCounts() (float64, error) {
if u.hasCgroup && u.ncpu > 0 {
return u.ncpu, nil
}
- ncpu, err := cpu.Counts(logical)
+ ncpu, err := cpu.Counts(true)
if err != nil {
return 0, err
}
@@ -333,18 +353,8 @@ func (u *util) CPUCounts(logical bool) (float64, error) {
return float64(ncpu), nil
}
-func CPUCounts(logical bool) (float64, error) {
- return DefaultUtil.CPUCounts(logical)
-}
-
-func (u *util) GPUCounts() (float64, error) {
- count, err := nvidia.Default.Count()
-
- return float64(count), err
-}
-
-func GPUCounts() (float64, error) {
- return DefaultUtil.GPUCounts()
+func CPUCounts() (float64, error) {
+ return DefaultUtil.CPUCounts()
}
// cpuTimes returns the current cpu usage times in seconds.
@@ -381,7 +391,7 @@ func (u *util) cpuTimes() (*cpuTimesStat, error) {
return s, nil
}
-func (u *util) CPUPercent() (*CPUInfoStat, error) {
+func (u *util) CPU() (*CPUInfo, error) {
var total float64
for {
@@ -406,7 +416,7 @@ func (u *util) CPUPercent() (*CPUInfoStat, error) {
total = (u.statCurrent.total - u.statPrevious.total)
}
- s := &CPUInfoStat{
+ s := &CPUInfo{
System: 0,
User: 0,
Idle: 100,
@@ -429,8 +439,8 @@ func (u *util) CPUPercent() (*CPUInfoStat, error) {
return s, nil
}
-func CPUPercent() (*CPUInfoStat, error) {
- return DefaultUtil.CPUPercent()
+func CPUPercent() (*CPUInfo, error) {
+ return DefaultUtil.CPU()
}
func (u *util) cgroupCPUTimes(version int) (*cpuTimesStat, error) {
@@ -466,15 +476,29 @@ func (u *util) cgroupCPUTimes(version int) (*cpuTimesStat, error) {
return info, nil
}
-func (u *util) DiskUsage(path string) (*disk.UsageStat, error) {
- return disk.Usage(path)
+func (u *util) Disk(path string) (*DiskInfo, error) {
+ usage, err := disk.Usage(path)
+ if err != nil {
+ return nil, err
+ }
+
+ info := &DiskInfo{
+ Path: usage.Path,
+ Fstype: usage.Fstype,
+ Total: usage.Total,
+ Used: usage.Used,
+ InodesTotal: usage.InodesTotal,
+ InodesUsed: usage.InodesUsed,
+ }
+
+ return info, nil
}
-func DiskUsage(path string) (*disk.UsageStat, error) {
- return DefaultUtil.DiskUsage(path)
+func Disk(path string) (*DiskInfo, error) {
+ return DefaultUtil.Disk(path)
}
-func (u *util) virtualMemory() (*MemoryInfoStat, error) {
+func (u *util) virtualMemory() (*MemoryInfo, error) {
info, err := mem.VirtualMemory()
if err != nil {
return nil, err
@@ -489,18 +513,18 @@ func (u *util) virtualMemory() (*MemoryInfoStat, error) {
}
}
- return &MemoryInfoStat{
+ return &MemoryInfo{
Total: info.Total,
Available: info.Available,
Used: info.Used,
}, nil
}
-func (u *util) VirtualMemory() (*MemoryInfoStat, error) {
+func (u *util) Memory() (*MemoryInfo, error) {
u.lock.RLock()
defer u.lock.RUnlock()
- stat := &MemoryInfoStat{
+ stat := &MemoryInfo{
Total: u.mem.Total,
Available: u.mem.Available,
Used: u.mem.Used,
@@ -509,12 +533,12 @@ func (u *util) VirtualMemory() (*MemoryInfoStat, error) {
return stat, nil
}
-func VirtualMemory() (*MemoryInfoStat, error) {
- return DefaultUtil.VirtualMemory()
+func Memory() (*MemoryInfo, error) {
+ return DefaultUtil.Memory()
}
-func (u *util) cgroupVirtualMemory(version int) (*MemoryInfoStat, error) {
- info := &MemoryInfoStat{}
+func (u *util) cgroupVirtualMemory(version int) (*MemoryInfo, error) {
+ info := &MemoryInfo{}
if version == 1 {
lines, err := u.readFile("memory/memory.limit_in_bytes")
@@ -569,12 +593,27 @@ func (u *util) cgroupVirtualMemory(version int) (*MemoryInfoStat, error) {
return info, nil
}
-func (u *util) NetIOCounters(pernic bool) ([]net.IOCountersStat, error) {
- return net.IOCounters(pernic)
+func (u *util) Network() ([]NetworkInfo, error) {
+ netio, err := net.IOCounters(true)
+ if err != nil {
+ return nil, err
+ }
+
+ info := []NetworkInfo{}
+
+ for _, io := range netio {
+ info = append(info, NetworkInfo{
+ Name: io.Name,
+ BytesSent: io.BytesSent,
+ BytesRecv: io.BytesRecv,
+ })
+ }
+
+ return info, nil
}
-func NetIOCounters(pernic bool) ([]net.IOCountersStat, error) {
- return DefaultUtil.NetIOCounters(pernic)
+func Network() ([]NetworkInfo, error) {
+ return DefaultUtil.Network()
}
func (u *util) readFile(path string) ([]string, error) {
@@ -613,29 +652,28 @@ func cpuTotal(c *cpu.TimesStat) float64 {
c.Softirq + c.Steal + c.Guest + c.GuestNice
}
-func (u *util) GPUStats() ([]GPUInfoStat, error) {
+func (u *util) GPU() ([]GPUInfo, error) {
nvstats, err := nvidia.Default.Stats()
if err != nil {
return nil, err
}
- stats := []GPUInfoStat{}
+ stats := []GPUInfo{}
for _, nv := range nvstats {
- stats = append(stats, GPUInfoStat{
- Name: nv.Name,
- MemoryTotal: nv.MemoryTotal,
- MemoryUsed: nv.MemoryUsed,
- Usage: nv.Usage,
- MemoryUsage: nv.MemoryUsage,
- EncoderUsage: nv.EncoderUsage,
- DecoderUsage: nv.DecoderUsage,
+ stats = append(stats, GPUInfo{
+ Name: nv.Name,
+ MemoryTotal: nv.MemoryTotal,
+ MemoryUsed: nv.MemoryUsed,
+ Usage: nv.Usage,
+ Encoder: nv.Encoder,
+ Decoder: nv.Decoder,
})
}
return stats, nil
}
-func GPUStats() ([]GPUInfoStat, error) {
- return DefaultUtil.GPUStats()
+func GPU() ([]GPUInfo, error) {
+ return DefaultUtil.GPU()
}
diff --git a/resources/resources.go b/resources/resources.go
index d7255f05..5a4043d5 100644
--- a/resources/resources.go
+++ b/resources/resources.go
@@ -9,11 +9,13 @@ import (
"github.com/datarhei/core/v16/log"
"github.com/datarhei/core/v16/psutil"
+ "github.com/datarhei/core/v16/slices"
)
type Info struct {
Mem MemoryInfo
CPU CPUInfo
+ GPU GPUInfo
}
type MemoryInfo struct {
@@ -38,6 +40,44 @@ type CPUInfo struct {
Error error
}
+type GPUInfo struct {
+ NGPU float64 // number of gpus
+ GPU []GPUInfoStat
+ Error error
+}
+
+type GPUInfoStat struct {
+ Index int
+ Name string
+
+ // Memory
+ MemoryTotal uint64 // bytes
+ MemoryUsed uint64 // bytes
+ MemoryAvailable uint64 // bytes
+ MemoryLimit uint64 // bytes
+
+ // GPU
+ Usage float64 // percent 0-100
+ Encoder float64 // percent 0-100
+ Decoder float64 // percent 0-100
+ UsageLimit float64 // percent 0-100
+
+ Throttling bool
+}
+
+type Request struct {
+ CPU float64 // percent 0-100*ncpu
+ Memory uint64 // bytes
+ GPUUsage float64 // percent 0-100
+ GPUEncoder float64 // percent 0-100
+ GPUDecoder float64 // percent 0-100
+ GPUMemory uint64 // bytes
+}
+
+type Response struct {
+ GPU int // GPU number, hwdevice
+}
+
type resources struct {
psutil psutil.Util
@@ -45,9 +85,14 @@ type resources struct {
maxCPU float64 // percent 0-100*ncpu
maxMemory uint64 // bytes
+ ngpu int
+ maxGPU float64 // general usage, percent 0-100
+ maxGPUMemory float64 // memory usage, percent 0-100
+
isUnlimited bool
isCPULimiting bool
isMemoryLimiting bool
+ isGPULimiting []bool
self psutil.Process
@@ -67,30 +112,46 @@ type Resources interface {
// HasLimits returns whether any limits have been set.
HasLimits() bool
- // Limits returns the CPU (percent 0-100) and memory (bytes) limits.
- Limits() (float64, uint64)
+ // Limits returns the CPU (percent 0-100), memory (bytes) limits, and GPU limits (usage and memory each in percent 0-100).
+ Limits() (float64, uint64, float64, float64)
- // ShouldLimit returns whether cpu and/or memory is currently limited.
- ShouldLimit() (bool, bool)
+ // ShouldLimit returns whether cpu, memory, and/or GPU is currently limited.
+ ShouldLimit() (bool, bool, []bool)
// Request checks whether the requested resources are available.
- Request(cpu float64, memory uint64) error
+ Request(req Request) (Response, error)
- // Info returns the current resource usage
+ // Info returns the current resource usage.
Info() Info
}
type Config struct {
- MaxCPU float64 // percent 0-100
- MaxMemory float64 // percent 0-100
- PSUtil psutil.Util
- Logger log.Logger
+ MaxCPU float64 // percent 0-100
+ MaxMemory float64 // percent 0-100
+ MaxGPU float64 // general,encoder,decoder usage, percent 0-100
+ MaxGPUMemory float64 // memory usage, percent 0-100
+ PSUtil psutil.Util
+ Logger log.Logger
}
func New(config Config) (Resources, error) {
+ if config.PSUtil == nil {
+ config.PSUtil = psutil.DefaultUtil
+ }
+
+ gpu, err := config.PSUtil.GPU()
+ if err != nil {
+ return nil, fmt.Errorf("unable to determine number of GPUs: %w", err)
+ }
+
+ if len(gpu) == 0 {
+ config.MaxGPU = 0
+ config.MaxGPUMemory = 0
+ }
+
isUnlimited := false
- if config.MaxCPU <= 0 && config.MaxMemory <= 0 {
+ if config.MaxCPU <= 0 && config.MaxMemory <= 0 && config.MaxGPU <= 0 && config.MaxGPUMemory <= 0 {
isUnlimited = true
}
@@ -102,31 +163,39 @@ func New(config Config) (Resources, error) {
config.MaxMemory = 100
}
- if config.MaxCPU > 100 || config.MaxMemory > 100 {
- return nil, fmt.Errorf("both MaxCPU and MaxMemory must have a range of 0-100")
+ if config.MaxGPU <= 0 {
+ config.MaxGPU = 100
+ }
+
+ if config.MaxGPUMemory <= 0 {
+ config.MaxGPUMemory = 100
+ }
+
+ if config.MaxCPU > 100 || config.MaxMemory > 100 || config.MaxGPU > 100 || config.MaxGPUMemory > 100 {
+ return nil, fmt.Errorf("all Max... values must have a range of 0-100")
}
r := &resources{
- maxCPU: config.MaxCPU,
- psutil: config.PSUtil,
- isUnlimited: isUnlimited,
- logger: config.Logger,
+ maxCPU: config.MaxCPU,
+ maxGPU: config.MaxGPU,
+ maxGPUMemory: config.MaxGPUMemory,
+ psutil: config.PSUtil,
+ isUnlimited: isUnlimited,
+ ngpu: len(gpu),
+ isGPULimiting: make([]bool, len(gpu)),
+ logger: config.Logger,
}
if r.logger == nil {
r.logger = log.New("")
}
- if r.psutil == nil {
- r.psutil = psutil.DefaultUtil
- }
-
- vmstat, err := r.psutil.VirtualMemory()
+ vmstat, err := r.psutil.Memory()
if err != nil {
return nil, fmt.Errorf("unable to determine available memory: %w", err)
}
- ncpu, err := r.psutil.CPUCounts(true)
+ ncpu, err := r.psutil.CPUCounts()
if err != nil {
return nil, fmt.Errorf("unable to determine number of logical CPUs: %w", err)
}
@@ -137,12 +206,15 @@ func New(config Config) (Resources, error) {
r.maxMemory = uint64(float64(vmstat.Total) * config.MaxMemory / 100)
r.logger = r.logger.WithFields(log.Fields{
- "ncpu": r.ncpu,
- "max_cpu": r.maxCPU,
- "max_memory": r.maxMemory,
+ "ncpu": r.ncpu,
+ "max_cpu": r.maxCPU,
+ "max_memory": r.maxMemory,
+ "ngpu": len(gpu),
+ "max_gpu": r.maxGPU,
+ "max_gpu_memory": r.maxGPUMemory,
})
- r.self, err = psutil.NewProcess(int32(os.Getpid()), false)
+ r.self, err = r.psutil.Process(int32(os.Getpid()))
if err != nil {
return nil, fmt.Errorf("unable to create process observer for self: %w", err)
}
@@ -189,7 +261,12 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) {
case <-ctx.Done():
return
case <-ticker.C:
- cpustat, err := r.psutil.CPUPercent()
+ if r.isUnlimited {
+ // If there aren't any limits imposed, don't do anything
+ continue
+ }
+
+ cpustat, err := r.psutil.CPU()
if err != nil {
r.logger.Warn().WithError(err).Log("Failed to determine system CPU usage")
continue
@@ -197,12 +274,18 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) {
cpuload := (cpustat.User + cpustat.System + cpustat.Other) * r.ncpu
- vmstat, err := r.psutil.VirtualMemory()
+ vmstat, err := r.psutil.Memory()
if err != nil {
r.logger.Warn().WithError(err).Log("Failed to determine system memory usage")
continue
}
+ gpustat, err := r.psutil.GPU()
+ if err != nil {
+ r.logger.Warn().WithError(err).Log("Failed to determine GPU usage")
+ continue
+ }
+
r.logger.Debug().WithFields(log.Fields{
"cur_cpu": cpuload,
"cur_memory": vmstat.Used,
@@ -210,34 +293,46 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) {
doCPULimit := false
- if !r.isUnlimited {
- if !r.isCPULimiting {
- if cpuload >= r.maxCPU {
- r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit reached")
- doCPULimit = true
- }
- } else {
+ if !r.isCPULimiting {
+ if cpuload >= r.maxCPU {
+ r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit reached")
doCPULimit = true
- if cpuload < r.maxCPU {
- r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit released")
- doCPULimit = false
- }
+ }
+ } else {
+ doCPULimit = true
+ if cpuload < r.maxCPU {
+ r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit released")
+ doCPULimit = false
}
}
doMemoryLimit := false
- if !r.isUnlimited {
- if !r.isMemoryLimiting {
- if vmstat.Used >= r.maxMemory {
- r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit reached")
- doMemoryLimit = true
+ if !r.isMemoryLimiting {
+ if vmstat.Used >= r.maxMemory {
+ r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit reached")
+ doMemoryLimit = true
+ }
+ } else {
+ doMemoryLimit = true
+ if vmstat.Used < r.maxMemory {
+ r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit released")
+ doMemoryLimit = false
+ }
+ }
+
+ doGPULimit := make([]bool, r.ngpu)
+
+ for i, limiting := range r.isGPULimiting {
+ maxMemory := uint64(r.maxGPUMemory * float64(gpustat[i].MemoryTotal) / 100)
+ if !limiting {
+ if gpustat[i].MemoryUsed >= maxMemory || (gpustat[i].Usage >= r.maxGPU && gpustat[i].Encoder >= r.maxGPU && gpustat[i].Decoder >= r.maxGPU) {
+ doGPULimit[i] = true
}
} else {
- doMemoryLimit = true
- if vmstat.Used < r.maxMemory {
- r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit released")
- doMemoryLimit = false
+ doGPULimit[i] = true
+ if gpustat[i].MemoryUsed < maxMemory && (gpustat[i].Usage < r.maxGPU || gpustat[i].Encoder < r.maxGPU || gpustat[i].Decoder < r.maxGPU) {
+ doGPULimit[i] = false
}
}
}
@@ -247,17 +342,26 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) {
r.logger.Warn().WithFields(log.Fields{
"enabled": doCPULimit,
}).Log("Limiting CPU")
-
- r.isCPULimiting = doCPULimit
}
+ r.isCPULimiting = doCPULimit
if r.isMemoryLimiting != doMemoryLimit {
r.logger.Warn().WithFields(log.Fields{
"enabled": doMemoryLimit,
}).Log("Limiting memory")
-
- r.isMemoryLimiting = doMemoryLimit
}
+ r.isMemoryLimiting = doMemoryLimit
+
+ for i, limiting := range r.isGPULimiting {
+ if limiting != doGPULimit[i] {
+ r.logger.Warn().WithFields(log.Fields{
+ "enabled": doGPULimit,
+ "index": i,
+ }).Log("Limiting GPU")
+ }
+ }
+ r.isGPULimiting = doGPULimit
+
r.lock.Unlock()
}
}
@@ -267,60 +371,136 @@ func (r *resources) HasLimits() bool {
return !r.isUnlimited
}
-func (r *resources) Limits() (float64, uint64) {
- return r.maxCPU / r.ncpu, r.maxMemory
+func (r *resources) Limits() (float64, uint64, float64, float64) {
+ return r.maxCPU / r.ncpu, r.maxMemory, r.maxGPU, r.maxGPUMemory
}
-func (r *resources) ShouldLimit() (bool, bool) {
+func (r *resources) ShouldLimit() (bool, bool, []bool) {
r.lock.RLock()
defer r.lock.RUnlock()
- return r.isCPULimiting, r.isMemoryLimiting
+ return r.isCPULimiting, r.isMemoryLimiting, slices.Copy(r.isGPULimiting)
}
-func (r *resources) Request(cpu float64, memory uint64) error {
+func (r *resources) Request(req Request) (Response, error) {
+ res := Response{
+ GPU: -1,
+ }
+
r.lock.RLock()
defer r.lock.RUnlock()
logger := r.logger.WithFields(log.Fields{
- "req_cpu": cpu,
- "req_memory": memory,
+ "req_cpu": req.CPU,
+ "req_memory": req.Memory,
+ "req_gpu": req.GPUUsage,
+ "req_gpu_encoder": req.GPUEncoder,
+ "req_gpu_decoder": req.GPUDecoder,
+ "req_gpu_memory": req.GPUMemory,
})
logger.Debug().Log("Request for acquiring resources")
+ // Check if anything is currently limiting.
if r.isCPULimiting || r.isMemoryLimiting {
logger.Debug().Log("Rejected, currently limiting")
- return fmt.Errorf("resources are currenlty actively limited")
+ return res, fmt.Errorf("resources are currently actively limited")
}
- if cpu <= 0 || memory == 0 {
+ // Check if the requested resources are valid.
+ if req.CPU <= 0 || req.Memory == 0 {
logger.Debug().Log("Rejected, invalid values")
- return fmt.Errorf("the cpu and/or memory values are invalid: cpu=%f, memory=%d", cpu, memory)
+ return res, fmt.Errorf("the cpu and/or memory values are invalid: cpu=%f, memory=%d", req.CPU, req.Memory)
}
- cpustat, err := r.psutil.CPUPercent()
+ // Get current CPU and memory values.
+ cpustat, err := r.psutil.CPU()
if err != nil {
r.logger.Warn().WithError(err).Log("Failed to determine system CPU usage")
- return fmt.Errorf("the system CPU usage couldn't be determined")
+ return res, fmt.Errorf("the system CPU usage couldn't be determined")
}
cpuload := (cpustat.User + cpustat.System + cpustat.Other) * r.ncpu
- vmstat, err := r.psutil.VirtualMemory()
+ vmstat, err := r.psutil.Memory()
if err != nil {
r.logger.Warn().WithError(err).Log("Failed to determine system memory usage")
- return fmt.Errorf("the system memory usage couldn't be determined")
+ return res, fmt.Errorf("the system memory usage couldn't be determined")
}
- if cpuload+cpu > r.maxCPU {
+ // Check if enough resources are available
+ if cpuload+req.CPU > r.maxCPU {
logger.Debug().WithField("cur_cpu", cpuload).Log("Rejected, CPU limit exceeded")
- return fmt.Errorf("the CPU limit would be exceeded: %f + %f > %f", cpuload, cpu, r.maxCPU)
+ return res, fmt.Errorf("the CPU limit would be exceeded: %f + %f > %f", cpuload, req.CPU, r.maxCPU)
}
- if vmstat.Used+memory > r.maxMemory {
+ if vmstat.Used+req.Memory > r.maxMemory {
logger.Debug().WithField("cur_memory", vmstat.Used).Log("Rejected, memory limit exceeded")
- return fmt.Errorf("the memory limit would be exceeded: %d + %d > %d", vmstat.Used, memory, r.maxMemory)
+ return res, fmt.Errorf("the memory limit would be exceeded: %d + %d > %d", vmstat.Used, req.Memory, r.maxMemory)
+ }
+
+ // Check if any GPU resources are requested
+ if req.GPUUsage > 0 || req.GPUEncoder > 0 || req.GPUDecoder > 0 || req.GPUMemory > 0 {
+ if req.GPUUsage < 0 || req.GPUEncoder < 0 || req.GPUDecoder < 0 || req.GPUMemory == 0 {
+ logger.Debug().Log("Rejected, invalid values")
+ return res, fmt.Errorf("the gpu usage and memory values are invalid: usage=%f, encoder=%f, decoder=%f, memory=%d", req.GPUUsage, req.GPUEncoder, req.GPUDecoder, req.GPUMemory)
+ }
+
+ // Get current GPU values
+ gpustat, err := r.psutil.GPU()
+ if err != nil {
+ r.logger.Warn().WithError(err).Log("Failed to determine GPU usage")
+ return res, fmt.Errorf("the GPU usage couldn't be determined")
+ }
+
+ if len(gpustat) == 0 {
+ r.logger.Debug().WithError(err).Log("GPU resources requested but no GPU available")
+ return res, fmt.Errorf("some GPU resources requested but no GPU available")
+ }
+
+ foundGPU := -1
+ for _, g := range gpustat {
+ if req.GPUUsage > 0 && g.Usage+req.GPUUsage > r.maxGPU {
+ logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu": g.Usage}).Log("Rejected, GPU usage limit exceeded")
+ continue
+ }
+
+ if req.GPUEncoder > 0 && g.Encoder+req.GPUEncoder > r.maxGPU {
+ logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu_encoder": g.Usage}).Log("Rejected, GPU encoder usage limit exceeded")
+ continue
+ }
+
+ if req.GPUDecoder > 0 && g.Decoder+req.GPUDecoder > r.maxGPU {
+ logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu_decoder": g.Usage}).Log("Rejected, GPU decoder usage limit exceeded")
+ continue
+ }
+
+ gpuMemoryUsage := float64(g.MemoryUsed) / float64(g.MemoryTotal) * 100
+ requestedGPUMemoryUsage := float64(req.GPUMemory) / float64(g.MemoryTotal) * 100
+
+ if gpuMemoryUsage+requestedGPUMemoryUsage > r.maxGPUMemory {
+ logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu_memory": gpuMemoryUsage}).Log("Rejected, GPU memory usage limit exceeded")
+ continue
+ }
+
+ foundGPU = g.Index
+
+ logger = logger.Debug().WithFields(log.Fields{
+ "cur_gpu": foundGPU,
+ "cur_gpu_general": g.Usage,
+ "cur_gpu_encoder": g.Encoder,
+ "cur_gpu_decoder": g.Decoder,
+ "cur_gpu_memory": gpuMemoryUsage,
+ })
+
+ break
+ }
+
+ if foundGPU < 0 {
+ return res, fmt.Errorf("all GPU usage limits are exceeded")
+ }
+
+ res.GPU = foundGPU
}
logger.Debug().WithFields(log.Fields{
@@ -328,17 +508,18 @@ func (r *resources) Request(cpu float64, memory uint64) error {
"cur_memory": vmstat.Used,
}).Log("Acquiring approved")
- return nil
+ return res, nil
}
func (r *resources) Info() Info {
- cpulimit, memlimit := r.Limits()
- cputhrottling, memthrottling := r.ShouldLimit()
+ cpulimit, memlimit, gpulimit, gpumemlimit := r.Limits()
+ cputhrottling, memthrottling, gputhrottling := r.ShouldLimit()
- cpustat, cpuerr := r.psutil.CPUPercent()
- memstat, memerr := r.psutil.VirtualMemory()
- selfcpu, _ := r.self.CPUPercent()
- selfmem, _ := r.self.VirtualMemory()
+ cpustat, cpuerr := r.psutil.CPU()
+ memstat, memerr := r.psutil.Memory()
+ gpustat, gpuerr := r.psutil.GPU()
+ selfcpu, _ := r.self.CPU()
+ selfmem, _ := r.self.Memory()
cpuinfo := CPUInfo{
NCPU: r.ncpu,
@@ -362,9 +543,31 @@ func (r *resources) Info() Info {
Error: memerr,
}
+ gpuinfo := GPUInfo{
+ NGPU: float64(len(gpustat)),
+ Error: gpuerr,
+ }
+
+ for i, g := range gpustat {
+ gpuinfo.GPU = append(gpuinfo.GPU, GPUInfoStat{
+ Index: g.Index,
+ Name: g.Name,
+ MemoryTotal: g.MemoryTotal,
+ MemoryUsed: g.MemoryUsed,
+ MemoryAvailable: g.MemoryTotal - g.MemoryUsed,
+ MemoryLimit: uint64(float64(g.MemoryTotal) * gpumemlimit / 100),
+ Usage: g.Usage,
+ Encoder: g.Encoder,
+ Decoder: g.Decoder,
+ UsageLimit: gpulimit,
+ Throttling: gputhrottling[i],
+ })
+ }
+
i := Info{
CPU: cpuinfo,
Mem: meminfo,
+ GPU: gpuinfo,
}
return i
diff --git a/resources/resources_test.go b/resources/resources_test.go
index 3d26c40c..a1ee4244 100644
--- a/resources/resources_test.go
+++ b/resources/resources_test.go
@@ -1,68 +1,170 @@
package resources
import (
+ "slices"
"sync"
"testing"
"time"
"github.com/datarhei/core/v16/psutil"
- "github.com/shirou/gopsutil/v3/disk"
- "github.com/shirou/gopsutil/v3/net"
"github.com/stretchr/testify/require"
)
-type util struct{}
+type util struct {
+ lock sync.Mutex
+
+ cpu psutil.CPUInfo
+ mem psutil.MemoryInfo
+ gpu []psutil.GPUInfo
+}
+
+func newUtil(ngpu int) *util {
+ u := &util{
+ cpu: psutil.CPUInfo{
+ System: 10,
+ User: 50,
+ Idle: 35,
+ Other: 5,
+ },
+ mem: psutil.MemoryInfo{
+ Total: 200,
+ Available: 40,
+ Used: 160,
+ },
+ }
+
+ for i := 0; i < ngpu; i++ {
+ u.gpu = append(u.gpu, psutil.GPUInfo{
+ Index: i,
+ Name: "L4",
+ MemoryTotal: 24 * 1024 * 1024 * 1024,
+ MemoryUsed: uint64(12+i) * 1024 * 1024 * 1024,
+ Usage: 50 - float64((i+1)*5),
+ Encoder: 50 - float64((i+1)*10),
+ Decoder: 50 - float64((i+1)*3),
+ })
+ }
+
+ return u
+}
func (u *util) Start() {}
func (u *util) Stop() {}
-func (u *util) CPUCounts(logical bool) (float64, error) {
+func (u *util) CPUCounts() (float64, error) {
return 2, nil
}
-func (u *util) GPUCounts() (float64, error) {
- return 0, nil
+func (u *util) CPU() (*psutil.CPUInfo, error) {
+ u.lock.Lock()
+ defer u.lock.Unlock()
+
+ cpu := u.cpu
+
+ return &cpu, nil
}
-func (u *util) CPUPercent() (*psutil.CPUInfoStat, error) {
- return &psutil.CPUInfoStat{
- System: 10,
- User: 50,
- Idle: 35,
- Other: 5,
- }, nil
+func (u *util) Disk(path string) (*psutil.DiskInfo, error) {
+ return &psutil.DiskInfo{}, nil
}
-func (u *util) DiskUsage(path string) (*disk.UsageStat, error) {
- return &disk.UsageStat{}, nil
+func (u *util) Memory() (*psutil.MemoryInfo, error) {
+ u.lock.Lock()
+ defer u.lock.Unlock()
+
+ mem := u.mem
+
+ return &mem, nil
}
-func (u *util) VirtualMemory() (*psutil.MemoryInfoStat, error) {
- return &psutil.MemoryInfoStat{
- Total: 200,
- Available: 40,
- Used: 160,
- }, nil
-}
-
-func (u *util) NetIOCounters(pernic bool) ([]net.IOCountersStat, error) {
+func (u *util) Network() ([]psutil.NetworkInfo, error) {
return nil, nil
}
-func (u *util) GPUStats() ([]psutil.GPUInfoStat, error) {
- return nil, nil
+func (u *util) GPU() ([]psutil.GPUInfo, error) {
+ u.lock.Lock()
+ defer u.lock.Unlock()
+
+ gpu := []psutil.GPUInfo{}
+
+ gpu = append(gpu, u.gpu...)
+
+ return gpu, nil
}
func (u *util) Process(pid int32) (psutil.Process, error) {
- return nil, nil
+ return &process{}, nil
+}
+
+type process struct{}
+
+func (p *process) CPU() (*psutil.CPUInfo, error) {
+ s := &psutil.CPUInfo{
+ System: 1,
+ User: 2,
+ Idle: 0,
+ Other: 3,
+ }
+
+ return s, nil
+}
+
+func (p *process) Memory() (uint64, error) { return 42, nil }
+func (p *process) GPU() (*psutil.GPUInfo, error) {
+ return &psutil.GPUInfo{
+ Index: 0,
+ Name: "L4",
+ MemoryTotal: 128,
+ MemoryUsed: 42,
+ Usage: 5,
+ Encoder: 9,
+ Decoder: 7,
+ }, nil
+}
+func (p *process) Stop() {}
+func (p *process) Suspend() error { return nil }
+func (p *process) Resume() error { return nil }
+
+func TestConfigNoLimits(t *testing.T) {
+ _, err := New(Config{
+ PSUtil: newUtil(0),
+ })
+ require.NoError(t, err)
+}
+
+func TestConfigWrongLimits(t *testing.T) {
+ _, err := New(Config{
+ MaxCPU: 102,
+ MaxMemory: 573,
+ PSUtil: newUtil(0),
+ })
+ require.Error(t, err)
+
+ _, err = New(Config{
+ MaxCPU: 0,
+ MaxMemory: 0,
+ MaxGPU: 101,
+ MaxGPUMemory: 103,
+ PSUtil: newUtil(0),
+ })
+ require.NoError(t, err)
+
+ _, err = New(Config{
+ MaxCPU: 0,
+ MaxMemory: 0,
+ MaxGPU: 101,
+ MaxGPUMemory: 103,
+ PSUtil: newUtil(1),
+ })
+ require.Error(t, err)
}
func TestMemoryLimit(t *testing.T) {
r, err := New(Config{
MaxCPU: 100,
MaxMemory: 150. / 200. * 100,
- PSUtil: &util{},
+ PSUtil: newUtil(0),
Logger: nil,
})
require.NoError(t, err)
@@ -86,7 +188,7 @@ func TestMemoryLimit(t *testing.T) {
for {
select {
case <-ticker.C:
- _, limit = r.ShouldLimit()
+ _, limit, _ = r.ShouldLimit()
if limit {
return
}
@@ -102,6 +204,95 @@ func TestMemoryLimit(t *testing.T) {
require.True(t, limit)
+ _, err = r.Request(Request{CPU: 5, Memory: 10})
+ require.Error(t, err)
+
+ r.Stop()
+}
+
+func TestMemoryUnlimit(t *testing.T) {
+ util := newUtil(0)
+
+ r, err := New(Config{
+ MaxCPU: 100,
+ MaxMemory: 150. / 200. * 100,
+ PSUtil: util,
+ Logger: nil,
+ })
+ require.NoError(t, err)
+
+ wg := sync.WaitGroup{}
+ wg.Add(1)
+
+ limit := false
+
+ go func() {
+ defer func() {
+ wg.Done()
+ }()
+
+ timer := time.NewTimer(10 * time.Second)
+ defer timer.Stop()
+
+ ticker := time.NewTicker(time.Second)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-ticker.C:
+ _, limit, _ = r.ShouldLimit()
+ if limit {
+ return
+ }
+ case <-timer.C:
+ return
+ }
+ }
+ }()
+
+ r.Start()
+
+ wg.Wait()
+
+ require.True(t, limit)
+
+ _, limit, _ = r.ShouldLimit()
+ require.True(t, limit)
+
+ util.lock.Lock()
+ util.mem.Used = 140
+ util.lock.Unlock()
+
+ wg.Add(1)
+
+ go func() {
+ defer func() {
+ wg.Done()
+ }()
+
+ timer := time.NewTimer(10 * time.Second)
+ defer timer.Stop()
+
+ ticker := time.NewTicker(time.Second)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-ticker.C:
+ _, limit, _ = r.ShouldLimit()
+ if !limit {
+ return
+ }
+ case <-timer.C:
+ return
+ }
+ }
+ }()
+
+ wg.Wait()
+
+ require.False(t, limit)
+
r.Stop()
}
@@ -109,7 +300,7 @@ func TestCPULimit(t *testing.T) {
r, err := New(Config{
MaxCPU: 50.,
MaxMemory: 100,
- PSUtil: &util{},
+ PSUtil: newUtil(0),
Logger: nil,
})
require.NoError(t, err)
@@ -133,7 +324,7 @@ func TestCPULimit(t *testing.T) {
for {
select {
case <-ticker.C:
- limit, _ = r.ShouldLimit()
+ limit, _, _ = r.ShouldLimit()
if limit {
return
}
@@ -149,36 +340,541 @@ func TestCPULimit(t *testing.T) {
require.True(t, limit)
+ _, err = r.Request(Request{CPU: 5, Memory: 10})
+ require.Error(t, err)
+
r.Stop()
}
-func TestRequest(t *testing.T) {
+func TestCPUUnlimit(t *testing.T) {
+ util := newUtil(0)
+
r, err := New(Config{
- MaxCPU: 70.,
- MaxMemory: 170. / 200. * 100,
- PSUtil: &util{},
+ MaxCPU: 50.,
+ MaxMemory: 100,
+ PSUtil: util,
Logger: nil,
})
require.NoError(t, err)
- err = r.Request(-1, 0)
- require.Error(t, err)
+ wg := sync.WaitGroup{}
+ wg.Add(1)
- err = r.Request(5, 10)
+ limit := false
+
+ go func() {
+ defer func() {
+ wg.Done()
+ }()
+
+ timer := time.NewTimer(10 * time.Second)
+ defer timer.Stop()
+
+ ticker := time.NewTicker(time.Second)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-ticker.C:
+ limit, _, _ = r.ShouldLimit()
+ if limit {
+ return
+ }
+ case <-timer.C:
+ return
+ }
+ }
+ }()
+
+ r.Start()
+
+ wg.Wait()
+
+ require.True(t, limit)
+
+ limit, _, _ = r.ShouldLimit()
+ require.True(t, limit)
+
+ util.lock.Lock()
+ util.cpu.User = 20
+ util.lock.Unlock()
+
+ wg.Add(1)
+
+ go func() {
+ defer func() {
+ wg.Done()
+ }()
+
+ timer := time.NewTimer(10 * time.Second)
+ defer timer.Stop()
+
+ ticker := time.NewTicker(time.Second)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-ticker.C:
+ limit, _, _ = r.ShouldLimit()
+ if !limit {
+ return
+ }
+ case <-timer.C:
+ return
+ }
+ }
+ }()
+
+ wg.Wait()
+
+ require.False(t, limit)
+
+ r.Stop()
+}
+
+func TestGPULimitMemory(t *testing.T) {
+ r, err := New(Config{
+ MaxCPU: 100,
+ MaxMemory: 100,
+ MaxGPU: 100,
+ MaxGPUMemory: 20,
+ PSUtil: newUtil(2),
+ Logger: nil,
+ })
require.NoError(t, err)
- err = r.Request(5, 20)
+ wg := sync.WaitGroup{}
+ wg.Add(1)
+
+ limit := []bool{}
+
+ go func() {
+ defer func() {
+ wg.Done()
+ }()
+
+ timer := time.NewTimer(10 * time.Second)
+ defer timer.Stop()
+
+ ticker := time.NewTicker(time.Second)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-ticker.C:
+ _, _, limit = r.ShouldLimit()
+ if slices.Contains(limit, true) {
+ return
+ }
+ case <-timer.C:
+ return
+ }
+ }
+ }()
+
+ r.Start()
+
+ wg.Wait()
+
+ require.Contains(t, limit, true)
+
+ _, err = r.Request(Request{CPU: 5, Memory: 10, GPUUsage: 10, GPUMemory: 10})
require.Error(t, err)
- err = r.Request(10, 10)
+ r.Stop()
+}
+
+func TestGPUUnlimitMemory(t *testing.T) {
+ util := newUtil(2)
+
+ r, err := New(Config{
+ MaxCPU: 100,
+ MaxMemory: 100,
+ MaxGPU: 100,
+ MaxGPUMemory: 20,
+ PSUtil: util,
+ Logger: nil,
+ })
require.NoError(t, err)
+
+ wg := sync.WaitGroup{}
+ wg.Add(1)
+
+ limit := []bool{}
+
+ go func() {
+ defer func() {
+ wg.Done()
+ }()
+
+ timer := time.NewTimer(10 * time.Second)
+ defer timer.Stop()
+
+ ticker := time.NewTicker(time.Second)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-ticker.C:
+ _, _, limit = r.ShouldLimit()
+ if slices.Contains(limit, true) {
+ return
+ }
+ case <-timer.C:
+ return
+ }
+ }
+ }()
+
+ r.Start()
+
+ wg.Wait()
+
+ require.Contains(t, limit, true)
+
+ util.lock.Lock()
+ util.gpu[0].MemoryUsed = 10
+ util.gpu[1].MemoryUsed = 10
+ util.lock.Unlock()
+
+ wg.Add(1)
+
+ go func() {
+ defer func() {
+ wg.Done()
+ }()
+
+ timer := time.NewTimer(10 * time.Second)
+ defer timer.Stop()
+
+ ticker := time.NewTicker(time.Second)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-ticker.C:
+ _, _, limit = r.ShouldLimit()
+ if !slices.Contains(limit, true) {
+ return
+ }
+ case <-timer.C:
+ return
+ }
+ }
+ }()
+
+ wg.Wait()
+
+ require.NotContains(t, limit, true)
+
+ r.Stop()
+}
+
+func TestGPULimitMemorySome(t *testing.T) {
+ r, err := New(Config{
+ MaxCPU: 100,
+ MaxMemory: 100,
+ MaxGPU: 100,
+ MaxGPUMemory: 14. / 24. * 100.,
+ PSUtil: newUtil(4),
+ Logger: nil,
+ })
+ require.NoError(t, err)
+
+ wg := sync.WaitGroup{}
+ wg.Add(1)
+
+ limit := []bool{}
+
+ go func() {
+ defer func() {
+ wg.Done()
+ }()
+
+ timer := time.NewTimer(10 * time.Second)
+ defer timer.Stop()
+
+ ticker := time.NewTicker(time.Second)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-ticker.C:
+ _, _, limit = r.ShouldLimit()
+ if slices.Contains(limit, true) {
+ return
+ }
+ case <-timer.C:
+ return
+ }
+ }
+ }()
+
+ r.Start()
+
+ wg.Wait()
+
+ require.Equal(t, []bool{false, false, true, true}, limit)
+
+ _, err = r.Request(Request{CPU: 5, Memory: 10, GPUUsage: 10, GPUMemory: 10})
+ require.NoError(t, err)
+
+ r.Stop()
+}
+
+func TestGPULimitUsage(t *testing.T) {
+ r, err := New(Config{
+ MaxCPU: 100,
+ MaxMemory: 100,
+ MaxGPU: 40,
+ MaxGPUMemory: 100,
+ PSUtil: newUtil(3),
+ Logger: nil,
+ })
+ require.NoError(t, err)
+
+ wg := sync.WaitGroup{}
+ wg.Add(1)
+
+ limit := []bool{}
+
+ go func() {
+ defer func() {
+ wg.Done()
+ }()
+
+ timer := time.NewTimer(10 * time.Second)
+ defer timer.Stop()
+
+ ticker := time.NewTicker(time.Second)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-ticker.C:
+ _, _, limit = r.ShouldLimit()
+ if slices.Contains(limit, true) {
+ return
+ }
+ case <-timer.C:
+ return
+ }
+ }
+ }()
+
+ r.Start()
+
+ wg.Wait()
+
+ require.Equal(t, []bool{true, false, false}, limit)
+
+ _, err = r.Request(Request{CPU: 5, Memory: 10, GPUUsage: 10, GPUMemory: 10})
+ require.Error(t, err)
+
+ _, err = r.Request(Request{CPU: 5, Memory: 10, GPUEncoder: 10, GPUMemory: 10})
+ require.NoError(t, err)
+
+ r.Stop()
+}
+
+func TestGPUUnlimitUsage(t *testing.T) {
+ util := newUtil(3)
+
+ r, err := New(Config{
+ MaxCPU: 100,
+ MaxMemory: 100,
+ MaxGPU: 40,
+ MaxGPUMemory: 100,
+ PSUtil: util,
+ Logger: nil,
+ })
+ require.NoError(t, err)
+
+ wg := sync.WaitGroup{}
+ wg.Add(1)
+
+ limit := []bool{}
+
+ go func() {
+ defer func() {
+ wg.Done()
+ }()
+
+ timer := time.NewTimer(10 * time.Second)
+ defer timer.Stop()
+
+ ticker := time.NewTicker(time.Second)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-ticker.C:
+ _, _, limit = r.ShouldLimit()
+ if slices.Contains(limit, true) {
+ return
+ }
+ case <-timer.C:
+ return
+ }
+ }
+ }()
+
+ r.Start()
+
+ wg.Wait()
+
+ require.Equal(t, []bool{true, false, false}, limit)
+
+ util.lock.Lock()
+ util.gpu[0].Usage = 30
+ util.gpu[0].Encoder = 30
+ util.gpu[0].Decoder = 30
+ util.lock.Unlock()
+
+ wg.Add(1)
+
+ go func() {
+ defer func() {
+ wg.Done()
+ }()
+
+ timer := time.NewTimer(10 * time.Second)
+ defer timer.Stop()
+
+ ticker := time.NewTicker(time.Second)
+ defer ticker.Stop()
+
+ for {
+ select {
+ case <-ticker.C:
+ _, _, limit = r.ShouldLimit()
+ if !slices.Contains(limit, true) {
+ return
+ }
+ case <-timer.C:
+ return
+ }
+ }
+ }()
+
+ wg.Wait()
+
+ require.Equal(t, []bool{false, false, false}, limit)
+
+ r.Stop()
+}
+
+func TestRequestCPU(t *testing.T) {
+ r, err := New(Config{
+ MaxCPU: 70.,
+ PSUtil: newUtil(0),
+ })
+ require.NoError(t, err)
+
+ _, err = r.Request(Request{CPU: 0, Memory: 0})
+ require.Error(t, err)
+
+ _, err = r.Request(Request{CPU: 5, Memory: 10})
+ require.NoError(t, err)
+
+ _, err = r.Request(Request{CPU: 30, Memory: 10})
+ require.Error(t, err)
+}
+
+func TestRequestMemory(t *testing.T) {
+ r, err := New(Config{
+ MaxMemory: 170. / 200. * 100,
+ PSUtil: newUtil(0),
+ })
+ require.NoError(t, err)
+
+ _, err = r.Request(Request{CPU: 5, Memory: 0})
+ require.Error(t, err)
+
+ _, err = r.Request(Request{CPU: 5, Memory: 10})
+ require.NoError(t, err)
+
+ _, err = r.Request(Request{CPU: 50, Memory: 20})
+ require.Error(t, err)
+}
+
+func TestRequestNoGPU(t *testing.T) {
+ r, err := New(Config{
+ MaxCPU: 100,
+ MaxMemory: 100,
+ PSUtil: newUtil(0),
+ })
+ require.NoError(t, err)
+
+ _, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 30, GPUMemory: 10})
+ require.Error(t, err)
+}
+
+func TestRequestInvalidGPURequest(t *testing.T) {
+ r, err := New(Config{
+ MaxCPU: 100,
+ MaxMemory: 100,
+ PSUtil: newUtil(1),
+ })
+ require.NoError(t, err)
+
+ _, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 30, GPUMemory: 0})
+ require.Error(t, err)
+
+ _, err = r.Request(Request{CPU: 10, Memory: 10, GPUUsage: -1, GPUEncoder: 30, GPUMemory: 0})
+ require.Error(t, err)
+}
+
+func TestRequestGPULimitsOneGPU(t *testing.T) {
+ r, err := New(Config{
+ MaxCPU: 100,
+ MaxMemory: 100,
+ MaxGPU: 50,
+ MaxGPUMemory: 60,
+ PSUtil: newUtil(1),
+ })
+ require.NoError(t, err)
+
+ _, err = r.Request(Request{CPU: 10, Memory: 10, GPUUsage: 50, GPUMemory: 10})
+ require.Error(t, err)
+
+ _, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 50, GPUMemory: 10})
+ require.Error(t, err)
+
+ _, err = r.Request(Request{CPU: 10, Memory: 10, GPUDecoder: 50, GPUMemory: 10})
+ require.Error(t, err)
+
+ _, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 10, GPUMemory: 5 * 1024 * 1024 * 1024})
+ require.Error(t, err)
+
+ res, err := r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 10, GPUMemory: 10})
+ require.NoError(t, err)
+ require.Equal(t, 0, res.GPU)
+}
+
+func TestRequestGPULimitsMoreGPU(t *testing.T) {
+ r, err := New(Config{
+ MaxCPU: 100,
+ MaxMemory: 100,
+ MaxGPU: 60,
+ MaxGPUMemory: 60,
+ PSUtil: newUtil(2),
+ })
+ require.NoError(t, err)
+
+ _, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 50, GPUMemory: 10})
+ require.Error(t, err)
+
+ res, err := r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 30, GPUMemory: 10})
+ require.NoError(t, err)
+ require.Equal(t, 1, res.GPU)
}
func TestHasLimits(t *testing.T) {
r, err := New(Config{
MaxCPU: 70.,
MaxMemory: 170. / 200. * 100,
- PSUtil: &util{},
+ PSUtil: newUtil(0),
Logger: nil,
})
require.NoError(t, err)
@@ -188,7 +884,7 @@ func TestHasLimits(t *testing.T) {
r, err = New(Config{
MaxCPU: 100,
MaxMemory: 100,
- PSUtil: &util{},
+ PSUtil: newUtil(0),
Logger: nil,
})
require.NoError(t, err)
@@ -198,10 +894,95 @@ func TestHasLimits(t *testing.T) {
r, err = New(Config{
MaxCPU: 0,
MaxMemory: 0,
- PSUtil: &util{},
+ PSUtil: newUtil(0),
+ Logger: nil,
+ })
+ require.NoError(t, err)
+
+ require.False(t, r.HasLimits())
+
+ r, err = New(Config{
+ MaxCPU: 0,
+ MaxMemory: 0,
+ MaxGPU: 10,
+ PSUtil: newUtil(1),
+ Logger: nil,
+ })
+ require.NoError(t, err)
+
+ require.True(t, r.HasLimits())
+
+ r, err = New(Config{
+ MaxCPU: 0,
+ MaxMemory: 0,
+ MaxGPU: 10,
+ PSUtil: newUtil(0),
Logger: nil,
})
require.NoError(t, err)
require.False(t, r.HasLimits())
}
+
+func TestInfo(t *testing.T) {
+ r, err := New(Config{
+ MaxCPU: 90,
+ MaxMemory: 90,
+ MaxGPU: 11,
+ MaxGPUMemory: 50,
+ PSUtil: newUtil(2),
+ })
+ require.NoError(t, err)
+
+ info := r.Info()
+
+ require.Equal(t, Info{
+ Mem: MemoryInfo{
+ Total: 200,
+ Available: 40,
+ Used: 160,
+ Limit: 180,
+ Core: 42,
+ Throttling: false,
+ Error: nil,
+ },
+ CPU: CPUInfo{
+ NCPU: 2,
+ System: 10,
+ User: 50,
+ Idle: 35,
+ Other: 5,
+ Limit: 90,
+ Core: 6,
+ Throttling: false,
+ Error: nil,
+ },
+ GPU: GPUInfo{
+ NGPU: 2,
+ GPU: []GPUInfoStat{{
+ Index: 0,
+ Name: "L4",
+ MemoryTotal: 24 * 1024 * 1024 * 1024,
+ MemoryUsed: 12 * 1024 * 1024 * 1024,
+ MemoryAvailable: 12 * 1024 * 1024 * 1024,
+ MemoryLimit: 12 * 1024 * 1024 * 1024,
+ Usage: 45,
+ Encoder: 40,
+ Decoder: 47,
+ UsageLimit: 11,
+ }, {
+ Index: 1,
+ Name: "L4",
+ MemoryTotal: 24 * 1024 * 1024 * 1024,
+ MemoryUsed: 13 * 1024 * 1024 * 1024,
+ MemoryAvailable: 11 * 1024 * 1024 * 1024,
+ MemoryLimit: 12 * 1024 * 1024 * 1024,
+ Usage: 40,
+ Encoder: 30,
+ Decoder: 44,
+ UsageLimit: 11,
+ }},
+ Error: nil,
+ },
+ }, info)
+}
diff --git a/restream/app/process.go b/restream/app/process.go
index 974309cb..e02cd69a 100644
--- a/restream/app/process.go
+++ b/restream/app/process.go
@@ -79,13 +79,21 @@ type Config struct {
Reconnect bool
ReconnectDelay uint64 // seconds
Autostart bool
- StaleTimeout uint64 // seconds
- Timeout uint64 // seconds
- Scheduler string // crontab pattern or RFC3339 timestamp
- LogPatterns []string // will be interpreted as regular expressions
- LimitCPU float64 // percent
- LimitMemory uint64 // bytes
- LimitWaitFor uint64 // seconds
+ StaleTimeout uint64 // seconds
+ Timeout uint64 // seconds
+ Scheduler string // crontab pattern or RFC3339 timestamp
+ LogPatterns []string // will be interpreted as regular expressions
+ LimitCPU float64 // percent
+ LimitMemory uint64 // bytes
+ LimitGPU ConfigLimitGPU // GPU limits
+ LimitWaitFor uint64 // seconds
+}
+
+type ConfigLimitGPU struct {
+ Usage float64 // percent 0-100
+ Encoder float64 // percent 0-100
+ Decoder float64 // percent 0-100
+ Memory uint64 // bytes
}
func (config *Config) Clone() *Config {
@@ -103,6 +111,7 @@ func (config *Config) Clone() *Config {
Scheduler: config.Scheduler,
LimitCPU: config.LimitCPU,
LimitMemory: config.LimitMemory,
+ LimitGPU: config.LimitGPU,
LimitWaitFor: config.LimitWaitFor,
}
@@ -175,6 +184,10 @@ func (config *Config) Hash() []byte {
b.WriteString(strconv.FormatUint(config.LimitMemory, 10))
b.WriteString(strconv.FormatUint(config.LimitWaitFor, 10))
b.WriteString(strconv.FormatFloat(config.LimitCPU, 'f', -1, 64))
+ b.WriteString(strconv.FormatFloat(config.LimitGPU.Usage, 'f', -1, 64))
+ b.WriteString(strconv.FormatFloat(config.LimitGPU.Encoder, 'f', -1, 64))
+ b.WriteString(strconv.FormatFloat(config.LimitGPU.Decoder, 'f', -1, 64))
+ b.WriteString(strconv.FormatUint(config.LimitGPU.Memory, 10))
for _, x := range config.Input {
b.WriteString(x.HashString())
@@ -294,7 +307,7 @@ type State struct {
Memory uint64 // Current memory consumption in bytes
CPU float64 // Current CPU consumption in percent
LimitMode string // How the process is limited (hard or soft)
- Resources ProcessUsage // Current resource usage, include CPU and memory consumption
+ Resources ProcessUsage // Current resource usage, including CPU, memory and GPU consumption
Command []string // ffmpeg command line parameters
}
@@ -326,10 +339,10 @@ func (p *ProcessUsageCPU) MarshalParser() parse.UsageCPU {
}
type ProcessUsageMemory struct {
- Current uint64 // bytes
- Average float64 // bytes
- Max uint64 // bytes
- Limit uint64 // bytes
+ Current uint64 // bytes
+ Average uint64 // bytes
+ Max uint64 // bytes
+ Limit uint64 // bytes
}
func (p *ProcessUsageMemory) UnmarshalParser(pp *parse.UsageMemory) {
@@ -348,20 +361,97 @@ func (p *ProcessUsageMemory) MarshalParser() parse.UsageMemory {
return pp
}
+type ProcessUsageGPU struct {
+ Index int
+ Usage ProcessUsageGPUUsage
+ Encoder ProcessUsageGPUUsage
+ Decoder ProcessUsageGPUUsage
+ Memory ProcessUsageGPUMemory
+}
+
+func (p *ProcessUsageGPU) UnmarshalParser(pp *parse.UsageGPU) {
+ p.Index = pp.Index
+ p.Usage.UnmarshalParser(&pp.Usage)
+ p.Encoder.UnmarshalParser(&pp.Encoder)
+ p.Decoder.UnmarshalParser(&pp.Decoder)
+ p.Memory.UnmarshalParser(&pp.Memory)
+}
+
+func (p *ProcessUsageGPU) MarshalParser() parse.UsageGPU {
+ pp := parse.UsageGPU{
+ Index: p.Index,
+ Usage: p.Usage.MarshalParser(),
+ Encoder: p.Encoder.MarshalParser(),
+ Decoder: p.Decoder.MarshalParser(),
+ Memory: p.Memory.MarshalParser(),
+ }
+
+ return pp
+}
+
+type ProcessUsageGPUUsage struct {
+ Current float64 // percent 0-100
+ Average float64 // percent 0-100
+ Max float64 // percent 0-100
+ Limit float64 // percent 0-100
+}
+
+func (p *ProcessUsageGPUUsage) UnmarshalParser(pp *parse.UsageGPUUsage) {
+ p.Average = pp.Average
+ p.Max = pp.Max
+ p.Limit = pp.Limit
+}
+
+func (p *ProcessUsageGPUUsage) MarshalParser() parse.UsageGPUUsage {
+ pp := parse.UsageGPUUsage{
+ Average: p.Average,
+ Max: p.Max,
+ Limit: p.Limit,
+ }
+
+ return pp
+}
+
+type ProcessUsageGPUMemory struct {
+ Current uint64 // bytes
+ Average uint64 // bytes
+ Max uint64 // bytes
+ Limit uint64 // bytes
+}
+
+func (p *ProcessUsageGPUMemory) UnmarshalParser(pp *parse.UsageGPUMemory) {
+ p.Average = pp.Average
+ p.Max = pp.Max
+ p.Limit = pp.Limit
+}
+
+func (p *ProcessUsageGPUMemory) MarshalParser() parse.UsageGPUMemory {
+ pp := parse.UsageGPUMemory{
+ Average: p.Average,
+ Max: p.Max,
+ Limit: p.Limit,
+ }
+
+ return pp
+}
+
type ProcessUsage struct {
CPU ProcessUsageCPU
Memory ProcessUsageMemory
+ GPU ProcessUsageGPU
}
func (p *ProcessUsage) UnmarshalParser(pp *parse.Usage) {
p.CPU.UnmarshalParser(&pp.CPU)
p.Memory.UnmarshalParser(&pp.Memory)
+ p.GPU.UnmarshalParser(&pp.GPU)
}
func (p *ProcessUsage) MarshalParser() parse.Usage {
pp := parse.Usage{
CPU: p.CPU.MarshalParser(),
Memory: p.Memory.MarshalParser(),
+ GPU: p.GPU.MarshalParser(),
}
return pp
diff --git a/restream/app/process_test.go b/restream/app/process_test.go
index 96889697..2aa6168b 100644
--- a/restream/app/process_test.go
+++ b/restream/app/process_test.go
@@ -46,12 +46,18 @@ func TestConfigHash(t *testing.T) {
LogPatterns: []string{"^libx264"},
LimitCPU: 50,
LimitMemory: 3 * 1024 * 1024,
- LimitWaitFor: 20,
+ LimitGPU: ConfigLimitGPU{
+ Usage: 10,
+ Encoder: 42,
+ Decoder: 14,
+ Memory: 500 * 1024 * 1024,
+ },
+ LimitWaitFor: 20,
}
hash1 := config.Hash()
- require.Equal(t, []byte{0x7e, 0xae, 0x5b, 0xc3, 0xad, 0xe3, 0x9a, 0xfc, 0xd3, 0x49, 0x15, 0x28, 0x93, 0x17, 0xc5, 0xbf}, hash1)
+ require.Equal(t, []byte{0x5e, 0x85, 0xc3, 0xc5, 0x44, 0xfd, 0x3e, 0x10, 0x13, 0x76, 0x36, 0x8b, 0xbe, 0x7e, 0xa6, 0xbb}, hash1)
config.Reconnect = false
diff --git a/restream/core.go b/restream/core.go
index e3f64f9d..bbe1a72c 100644
--- a/restream/core.go
+++ b/restream/core.go
@@ -279,13 +279,14 @@ func (r *restream) resourceObserver(ctx context.Context, rsc resources.Resources
defer ticker.Stop()
limitCPU, limitMemory := false, false
+ var limitGPUs []bool = nil
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
- cpu, memory := rsc.ShouldLimit()
+ cpu, memory, gpu := rsc.ShouldLimit()
hasChanges := false
@@ -299,17 +300,34 @@ func (r *restream) resourceObserver(ctx context.Context, rsc resources.Resources
hasChanges = true
}
+ if limitGPUs == nil {
+ limitGPUs = make([]bool, len(gpu))
+ }
+
+ for i, g := range gpu {
+ if g != limitGPUs[i] {
+ limitGPUs[i] = g
+ hasChanges = true
+ }
+ }
+
if !hasChanges {
break
}
r.tasks.Range(func(id app.ProcessID, t *task) bool {
- if t.Limit(limitCPU, limitMemory) {
+ limitGPU := false
+ gpuindex := t.GetHWDevice()
+ if gpuindex >= 0 {
+ limitGPU = limitGPUs[gpuindex]
+ }
+ if t.Limit(limitCPU, limitMemory, limitGPU) {
r.logger.Debug().WithFields(log.Fields{
"limit_cpu": limitCPU,
"limit_memory": limitMemory,
+ "limit_gpu": limitGPU,
"id": id,
- }).Log("Limiting process CPU and memory consumption")
+ }).Log("Limiting process CPU, memory, and GPU consumption")
}
return true
@@ -391,7 +409,11 @@ func (r *restream) load() error {
// Validate config with all placeholders replaced. However, we need to take care
// that the config with the task keeps its dynamic placeholders for process starts.
config := t.config.Clone()
- resolveDynamicPlaceholder(config, r.replace)
+ resolveDynamicPlaceholder(config, r.replace, map[string]string{
+ "hwdevice": "0",
+ }, map[string]string{
+ "timestamp": time.Now().UTC().Format(time.RFC3339),
+ })
t.usesDisk, err = validateConfig(config, r.fs.list, r.ffmpeg)
if err != nil {
@@ -414,30 +436,23 @@ func (r *restream) load() error {
}
ffmpeg, err := r.ffmpeg.New(ffmpeg.ProcessConfig{
- Reconnect: t.config.Reconnect,
- ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second,
- StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second,
- Timeout: time.Duration(t.config.Timeout) * time.Second,
- LimitCPU: t.config.LimitCPU,
- LimitMemory: t.config.LimitMemory,
- LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second,
- LimitMode: limitMode,
- Scheduler: t.config.Scheduler,
- Args: t.command,
- Parser: t.parser,
- Logger: t.logger,
- OnArgs: r.onArgs(t.config.Clone()),
- OnBeforeStart: func() error {
- if !r.enableSoftLimit {
- return nil
- }
-
- if err := r.resources.Request(t.config.LimitCPU, t.config.LimitMemory); err != nil {
- return err
- }
-
- return nil
- },
+ Reconnect: t.config.Reconnect,
+ ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second,
+ StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second,
+ Timeout: time.Duration(t.config.Timeout) * time.Second,
+ LimitCPU: t.config.LimitCPU,
+ LimitMemory: t.config.LimitMemory,
+ LimitGPUUsage: t.config.LimitGPU.Usage,
+ LimitGPUEncoder: t.config.LimitGPU.Encoder,
+ LimitGPUDecoder: t.config.LimitGPU.Decoder,
+ LimitGPUMemory: t.config.LimitGPU.Memory,
+ LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second,
+ LimitMode: limitMode,
+ Scheduler: t.config.Scheduler,
+ Args: t.command,
+ Parser: t.parser,
+ Logger: t.logger,
+ OnBeforeStart: r.onBeforeStart(t.config.Clone()),
})
if err != nil {
return true
@@ -578,7 +593,11 @@ func (r *restream) createTask(config *app.Config) (*task, error) {
// Validate config with all placeholders replaced. However, we need to take care
// that the config with the task keeps its dynamic placeholders for process starts.
config := t.config.Clone()
- resolveDynamicPlaceholder(config, r.replace)
+ resolveDynamicPlaceholder(config, r.replace, map[string]string{
+ "hwdevice": "0",
+ }, map[string]string{
+ "timestamp": time.Now().UTC().Format(time.RFC3339),
+ })
t.usesDisk, err = validateConfig(config, r.fs.list, r.ffmpeg)
if err != nil {
@@ -600,30 +619,23 @@ func (r *restream) createTask(config *app.Config) (*task, error) {
}
ffmpeg, err := r.ffmpeg.New(ffmpeg.ProcessConfig{
- Reconnect: t.config.Reconnect,
- ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second,
- StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second,
- Timeout: time.Duration(t.config.Timeout) * time.Second,
- LimitCPU: t.config.LimitCPU,
- LimitMemory: t.config.LimitMemory,
- LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second,
- LimitMode: limitMode,
- Scheduler: t.config.Scheduler,
- Args: t.command,
- Parser: t.parser,
- Logger: t.logger,
- OnArgs: r.onArgs(t.config.Clone()),
- OnBeforeStart: func() error {
- if !r.enableSoftLimit {
- return nil
- }
-
- if err := r.resources.Request(t.config.LimitCPU, t.config.LimitMemory); err != nil {
- return err
- }
-
- return nil
- },
+ Reconnect: t.config.Reconnect,
+ ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second,
+ StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second,
+ Timeout: time.Duration(t.config.Timeout) * time.Second,
+ LimitCPU: t.config.LimitCPU,
+ LimitMemory: t.config.LimitMemory,
+ LimitGPUUsage: t.config.LimitGPU.Usage,
+ LimitGPUEncoder: t.config.LimitGPU.Encoder,
+ LimitGPUDecoder: t.config.LimitGPU.Decoder,
+ LimitGPUMemory: t.config.LimitGPU.Memory,
+ LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second,
+ LimitMode: limitMode,
+ Scheduler: t.config.Scheduler,
+ Args: t.command,
+ Parser: t.parser,
+ Logger: t.logger,
+ OnBeforeStart: r.onBeforeStart(t.config.Clone()),
})
if err != nil {
return nil, err
@@ -636,21 +648,45 @@ func (r *restream) createTask(config *app.Config) (*task, error) {
return t, nil
}
-// onArgs is a callback that gets called by a process before it will be started.
-// It evalutes the dynamic placeholders in a process config and returns the
-// resulting command line to the process.
-func (r *restream) onArgs(cfg *app.Config) func([]string) []string {
- return func(args []string) []string {
+// onBeforeStart is a callback that gets called by a process before it will be started.
+// It evaluates the dynamic placeholders in a process config and returns the resulting command line to the process.
+func (r *restream) onBeforeStart(cfg *app.Config) func([]string) ([]string, error) {
+ return func(args []string) ([]string, error) {
+ selectedGPU := -1
+ if r.enableSoftLimit {
+ res, err := r.resources.Request(resources.Request{
+ CPU: cfg.LimitCPU,
+ Memory: cfg.LimitMemory,
+ GPUUsage: cfg.LimitGPU.Usage,
+ GPUEncoder: cfg.LimitGPU.Encoder,
+ GPUDecoder: cfg.LimitGPU.Decoder,
+ GPUMemory: cfg.LimitGPU.Memory,
+ })
+ if err != nil {
+ return []string{}, err
+ }
+
+ selectedGPU = res.GPU
+ }
+
+ if t, hasTask := r.tasks.Load(cfg.ProcessID()); hasTask {
+ t.SetHWDevice(selectedGPU)
+ }
+
config := cfg.Clone()
- resolveDynamicPlaceholder(config, r.replace)
+ resolveDynamicPlaceholder(config, r.replace, map[string]string{
+ "hwdevice": fmt.Sprintf("%d", selectedGPU),
+ }, map[string]string{
+ "timestamp": time.Now().UTC().Format(time.RFC3339),
+ })
_, err := validateConfig(config, r.fs.list, r.ffmpeg)
if err != nil {
- return []string{}
+ return []string{}, err
}
- return config.CreateCommand()
+ return config.CreateCommand(), nil
}
}
@@ -1448,7 +1484,11 @@ func (r *restream) Probe(config *app.Config, timeout time.Duration) app.Probe {
return probe
}
- resolveDynamicPlaceholder(config, r.replace)
+ resolveDynamicPlaceholder(config, r.replace, map[string]string{
+ "hwdevice": "0",
+ }, map[string]string{
+ "timestamp": time.Now().UTC().Format(time.RFC3339),
+ })
_, err = validateConfig(config, r.fs.list, r.ffmpeg)
if err != nil {
@@ -1712,22 +1752,26 @@ func resolveStaticPlaceholders(config *app.Config, r replace.Replacer) {
// resolveDynamicPlaceholder replaces placeholders in the config that should be replaced at process start.
// The config will be modified in place.
-func resolveDynamicPlaceholder(config *app.Config, r replace.Replacer) {
- vars := map[string]string{
- "timestamp": time.Now().UTC().Format(time.RFC3339),
- }
+func resolveDynamicPlaceholder(config *app.Config, r replace.Replacer, values map[string]string, vars map[string]string) {
+ placeholders := []string{"date", "hwdevice"}
for i, option := range config.Options {
- option = r.Replace(option, "date", "", vars, config, "global")
+ for _, placeholder := range placeholders {
+ option = r.Replace(option, placeholder, values[placeholder], vars, config, "global")
+ }
config.Options[i] = option
}
for i, input := range config.Input {
- input.Address = r.Replace(input.Address, "date", "", vars, config, "input")
+ for _, placeholder := range placeholders {
+ input.Address = r.Replace(input.Address, placeholder, values[placeholder], vars, config, "input")
+ }
for j, option := range input.Options {
- option = r.Replace(option, "date", "", vars, config, "input")
+ for _, placeholder := range placeholders {
+ option = r.Replace(option, placeholder, values[placeholder], vars, config, "input")
+ }
input.Options[j] = option
}
@@ -1736,16 +1780,22 @@ func resolveDynamicPlaceholder(config *app.Config, r replace.Replacer) {
}
for i, output := range config.Output {
- output.Address = r.Replace(output.Address, "date", "", vars, config, "output")
+ for _, placeholder := range placeholders {
+ output.Address = r.Replace(output.Address, placeholder, values[placeholder], vars, config, "output")
+ }
for j, option := range output.Options {
- option = r.Replace(option, "date", "", vars, config, "output")
+ for _, placeholder := range placeholders {
+ option = r.Replace(option, placeholder, values[placeholder], vars, config, "output")
+ }
output.Options[j] = option
}
for j, cleanup := range output.Cleanup {
- cleanup.Pattern = r.Replace(cleanup.Pattern, "date", "", vars, config, "output")
+ for _, placeholder := range placeholders {
+ cleanup.Pattern = r.Replace(cleanup.Pattern, placeholder, values[placeholder], vars, config, "output")
+ }
output.Cleanup[j] = cleanup
}
diff --git a/restream/core_test.go b/restream/core_test.go
index 3d9e1a68..48d79d89 100644
--- a/restream/core_test.go
+++ b/restream/core_test.go
@@ -1261,7 +1261,7 @@ func TestReplacer(t *testing.T) {
require.Equal(t, wantprocess, process)
- resolveDynamicPlaceholder(process, replacer)
+ resolveDynamicPlaceholder(process, replacer, nil, nil)
wantprocess.Input = []app.ConfigIO{
{
@@ -1531,7 +1531,7 @@ func TestProcessLimit(t *testing.T) {
status := task.ffmpeg.Status()
- ncpu, err := psutil.CPUCounts(true)
+ ncpu, err := psutil.CPUCounts()
require.NoError(t, err)
require.Equal(t, ncpu*process.LimitCPU, status.CPU.Limit)
diff --git a/restream/task.go b/restream/task.go
index 3073b506..40cb74c4 100644
--- a/restream/task.go
+++ b/restream/task.go
@@ -3,6 +3,7 @@ package restream
import (
"errors"
"maps"
+ "sync/atomic"
"time"
"github.com/datarhei/core/v16/ffmpeg/parse"
@@ -31,7 +32,8 @@ type task struct {
parser parse.Parser
playout map[string]int
logger log.Logger
- usesDisk bool // Whether this task uses the disk
+ usesDisk bool // Whether this task uses the disk
+ hwdevice atomic.Int32 // Index of the GPU this task uses
metadata map[string]interface{}
lock *xsync.RBMutex
@@ -234,8 +236,47 @@ func (t *task) State() (*app.State, error) {
state.Memory = status.Memory.Current
state.CPU = status.CPU.Current / status.CPU.NCPU
state.LimitMode = status.LimitMode
- state.Resources.CPU = status.CPU
- state.Resources.Memory = status.Memory
+ state.Resources.CPU = app.ProcessUsageCPU{
+ NCPU: status.CPU.NCPU,
+ Current: status.CPU.Current,
+ Average: status.CPU.Average,
+ Max: status.CPU.Max,
+ Limit: status.CPU.Limit,
+ IsThrottling: status.CPU.IsThrottling,
+ }
+ state.Resources.Memory = app.ProcessUsageMemory{
+ Current: status.Memory.Current,
+ Average: status.Memory.Average,
+ Max: status.Memory.Max,
+ Limit: status.Memory.Limit,
+ }
+ state.Resources.GPU = app.ProcessUsageGPU{
+ Index: status.GPU.Index,
+ Usage: app.ProcessUsageGPUUsage{
+ Current: status.GPU.Usage.Current,
+ Average: status.GPU.Usage.Average,
+ Max: status.GPU.Usage.Max,
+ Limit: status.GPU.Usage.Limit,
+ },
+ Encoder: app.ProcessUsageGPUUsage{
+ Current: status.GPU.Encoder.Current,
+ Average: status.GPU.Encoder.Average,
+ Max: status.GPU.Encoder.Max,
+ Limit: status.GPU.Encoder.Limit,
+ },
+ Decoder: app.ProcessUsageGPUUsage{
+ Current: status.GPU.Decoder.Current,
+ Average: status.GPU.Decoder.Average,
+ Max: status.GPU.Decoder.Max,
+ Limit: status.GPU.Decoder.Limit,
+ },
+ Memory: app.ProcessUsageGPUMemory{
+ Current: status.GPU.Memory.Current,
+ Average: status.GPU.Memory.Average,
+ Max: status.GPU.Memory.Max,
+ Limit: status.GPU.Memory.Limit,
+ },
+ }
state.Duration = status.Duration.Round(10 * time.Millisecond).Seconds()
state.Reconnect = -1
state.Command = status.CommandArgs
@@ -420,7 +461,7 @@ func (t *task) ExportMetadata() map[string]interface{} {
return t.metadata
}
-func (t *task) Limit(cpu, memory bool) bool {
+func (t *task) Limit(cpu, memory, gpu bool) bool {
token := t.lock.RLock()
defer t.lock.RUnlock(token)
@@ -428,11 +469,19 @@ func (t *task) Limit(cpu, memory bool) bool {
return false
}
- t.ffmpeg.Limit(cpu, memory)
+ t.ffmpeg.Limit(cpu, memory, gpu)
return true
}
+func (t *task) SetHWDevice(index int) {
+ t.hwdevice.Store(int32(index))
+}
+
+func (t *task) GetHWDevice() int {
+ return int(t.hwdevice.Load())
+}
+
func (t *task) Equal(config *app.Config) bool {
token := t.lock.RLock()
defer t.lock.RUnlock(token)
diff --git a/session/registry_test.go b/session/registry_test.go
index 7b1d987d..5cba9ec1 100644
--- a/session/registry_test.go
+++ b/session/registry_test.go
@@ -8,6 +8,7 @@ import (
"time"
"github.com/datarhei/core/v16/io/fs"
+
"github.com/lestrrat-go/strftime"
"github.com/stretchr/testify/require"
)