Add GPU support

This commit is contained in:
Ingo Oppermann 2024-10-24 15:08:26 +02:00
parent df30a6b8e3
commit 2dbe5b5685
No known key found for this signature in database
GPG Key ID: 2AB32426E9DD229E
48 changed files with 5375 additions and 1138 deletions

View File

@ -371,9 +371,11 @@ func (a *api) start(ctx context.Context) error {
}
resources, err := resources.New(resources.Config{
MaxCPU: cfg.Resources.MaxCPUUsage,
MaxMemory: cfg.Resources.MaxMemoryUsage,
Logger: a.log.logger.core.WithComponent("Resources"),
MaxCPU: cfg.Resources.MaxCPUUsage,
MaxMemory: cfg.Resources.MaxMemoryUsage,
MaxGPU: cfg.Resources.MaxGPUUsage,
MaxGPUMemory: cfg.Resources.MaxGPUMemoryUsage,
Logger: a.log.logger.core.WithComponent("Resources"),
})
if err != nil {
return fmt.Errorf("failed to initialize resource manager: %w", err)

View File

@ -18,18 +18,29 @@ type ClusterRaft struct {
}
type ClusterNodeResources struct {
IsThrottling bool // Whether this core is currently throttling
NCPU float64 // Number of CPU on this node
CPU float64 // Current CPU load, 0-100*ncpu
CPULimit float64 // Defined CPU load limit, 0-100*ncpu
CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu
Mem uint64 // Currently used memory in bytes
MemLimit uint64 // Defined memory limit in bytes
MemTotal uint64 // Total available memory in bytes
MemCore uint64 // Current used memory of the core itself in bytes
IsThrottling bool // Whether this core is currently throttling
NCPU float64 // Number of CPU on this node
CPU float64 // Current CPU load, 0-100*ncpu
CPULimit float64 // Defined CPU load limit, 0-100*ncpu
CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu
Mem uint64 // Currently used memory in bytes
MemLimit uint64 // Defined memory limit in bytes
MemTotal uint64 // Total available memory in bytes
MemCore uint64 // Current used memory of the core itself in bytes
GPU []ClusterNodeGPUResources // GPU resources
Error error
}
type ClusterNodeGPUResources struct {
Mem uint64 // Currently used memory in bytes
MemLimit uint64 // Defined memory limit in bytes
MemTotal uint64 // Total available memory in bytes
Usage float64 // Current general usage, 0-100
UsageLimit float64 // Defined general usage limit, 0-100
Encoder float64 // Current encoder usage, 0-100
Decoder float64 // Current decoder usage, 0-100
}
type ClusterNode struct {
ID string
Name string
@ -157,6 +168,19 @@ func (c *cluster) About() (ClusterAbout, error) {
},
}
if len(nodeAbout.Resources.GPU) != 0 {
node.Resources.GPU = make([]ClusterNodeGPUResources, len(nodeAbout.Resources.GPU))
for i, gpu := range nodeAbout.Resources.GPU {
node.Resources.GPU[i].Mem = gpu.Mem
node.Resources.GPU[i].MemLimit = gpu.MemLimit
node.Resources.GPU[i].MemTotal = gpu.MemTotal
node.Resources.GPU[i].Usage = gpu.Usage
node.Resources.GPU[i].UsageLimit = gpu.UsageLimit
node.Resources.GPU[i].Encoder = gpu.Encoder
node.Resources.GPU[i].Decoder = gpu.Decoder
}
}
if s, ok := serversMap[nodeAbout.ID]; ok {
node.Voter = s.Voter
node.Leader = s.Leader

View File

@ -195,6 +195,19 @@ func (a *api) About(c echo.Context) error {
},
}
if len(resources.GPU.GPU) != 0 {
about.Resources.GPU = make([]client.AboutResponseGPUResources, len(resources.GPU.GPU))
for i, gpu := range resources.GPU.GPU {
about.Resources.GPU[i].Mem = gpu.MemoryUsed
about.Resources.GPU[i].MemLimit = gpu.MemoryLimit
about.Resources.GPU[i].MemTotal = gpu.MemoryTotal
about.Resources.GPU[i].Usage = gpu.Usage
about.Resources.GPU[i].UsageLimit = gpu.UsageLimit
about.Resources.GPU[i].Encoder = gpu.Encoder
about.Resources.GPU[i].Decoder = gpu.Decoder
}
}
if err != nil {
about.Resources.Error = err.Error()
}

View File

@ -83,17 +83,28 @@ type AboutResponse struct {
Resources AboutResponseResources `json:"resources"`
}
type AboutResponseGPUResources struct {
Mem uint64 `json:"memory_bytes"` // Currently used memory in bytes
MemLimit uint64 `json:"memory_limit_bytes"` // Defined memory limit in bytes
MemTotal uint64 `json:"memory_total_bytes"` // Total available memory in bytes
Usage float64 `json:"usage"` // Current general usage, 0-100
Encoder float64 `json:"encoder"` // Current encoder usage, 0-100
Decoder float64 `json:"decoder"` // Current decoder usage, 0-100
UsageLimit float64 `json:"usage_limit"` // Defined general usage limit, 0-100
}
type AboutResponseResources struct {
IsThrottling bool `json:"is_throttling"` // Whether this core is currently throttling
NCPU float64 `json:"ncpu"` // Number of CPU on this node
CPU float64 `json:"cpu"` // Current CPU load, 0-100*ncpu
CPULimit float64 `json:"cpu_limit"` // Defined CPU load limit, 0-100*ncpu
CPUCore float64 `json:"cpu_core"` // Current CPU load of the core itself, 0-100*ncpu
Mem uint64 `json:"memory_bytes"` // Currently used memory in bytes
MemLimit uint64 `json:"memory_limit_bytes"` // Defined memory limit in bytes
MemTotal uint64 `json:"memory_total_bytes"` // Total available memory in bytes
MemCore uint64 `json:"memory_core_bytes"` // Current used memory of the core itself in bytes
Error string `json:"error"` // Last error
IsThrottling bool `json:"is_throttling"` // Whether this core is currently throttling
NCPU float64 `json:"ncpu"` // Number of CPU on this node
CPU float64 `json:"cpu"` // Current CPU load, 0-100*ncpu
CPULimit float64 `json:"cpu_limit"` // Defined CPU load limit, 0-100*ncpu
CPUCore float64 `json:"cpu_core"` // Current CPU load of the core itself, 0-100*ncpu
Mem uint64 `json:"memory_bytes"` // Currently used memory in bytes
MemLimit uint64 `json:"memory_limit_bytes"` // Defined memory limit in bytes
MemTotal uint64 `json:"memory_total_bytes"` // Total available memory in bytes
MemCore uint64 `json:"memory_core_bytes"` // Current used memory of the core itself in bytes
GPU []AboutResponseGPUResources `json:"gpu"` // Currently used GPU resources
Error string `json:"error"` // Last error
}
type SetNodeStateRequest struct {

View File

@ -78,7 +78,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{},
// Mark nodes as throttling where at least one process is still throttling
for _, haveP := range have {
if haveP.Throttling {
if haveP.Resources.Throttling {
resources.Throttling(haveP.NodeID, true)
}
}
@ -126,7 +126,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{},
continue
}
if resources.HasNodeEnough(raNodeid, p.Config.LimitCPU, p.Config.LimitMemory) {
if resources.HasNodeEnough(raNodeid, ResourcesFromConfig(p.Config)) {
availableNodeid = raNodeid
break
}
@ -135,7 +135,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{},
// Find the best node with enough resources available.
if len(availableNodeid) == 0 {
nodes := resources.FindBestNodes(p.Config.LimitCPU, p.Config.LimitMemory)
nodes := resources.FindBestNodes(ResourcesFromConfig(p.Config))
for _, nodeid := range nodes {
if nodeid == overloadedNodeid {
continue
@ -169,7 +169,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{},
processes[i] = p
// Adjust the resources.
resources.Move(availableNodeid, overloadedNodeid, p.CPU, p.Mem)
resources.Move(availableNodeid, overloadedNodeid, ResourcesFromProcess(p.Resources))
// Adjust the reference affinity.
haveReferenceAffinity.Move(p.Config.Reference, p.Config.Domain, overloadedNodeid, availableNodeid)

View File

@ -95,7 +95,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[
// Mark nodes as throttling where at least one process is still throttling
for _, haveP := range have {
if haveP.Throttling {
if haveP.Resources.Throttling {
resources.Throttling(haveP.NodeID, true)
}
}
@ -136,7 +136,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[
if len(targetNodeid) != 0 {
_, hasNode := nodes[targetNodeid]
if !hasNode || !resources.HasNodeEnough(targetNodeid, process.Config.LimitCPU, process.Config.LimitMemory) {
if !hasNode || !resources.HasNodeEnough(targetNodeid, ResourcesFromConfig(process.Config)) {
targetNodeid = ""
}
}
@ -152,7 +152,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[
continue
}
if resources.HasNodeEnough(raNodeid, process.Config.LimitCPU, process.Config.LimitMemory) {
if resources.HasNodeEnough(raNodeid, ResourcesFromConfig(process.Config)) {
targetNodeid = raNodeid
break
}
@ -161,7 +161,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[
// Find the best node with enough resources available.
if len(targetNodeid) == 0 {
nodes := resources.FindBestNodes(process.Config.LimitCPU, process.Config.LimitMemory)
nodes := resources.FindBestNodes(ResourcesFromConfig(process.Config))
for _, nodeid := range nodes {
if nodeid == sourceNodeid {
continue
@ -194,7 +194,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[
opBudget -= 5
// Adjust the resources.
resources.Move(targetNodeid, sourceNodeid, process.CPU, process.Mem)
resources.Move(targetNodeid, sourceNodeid, ResourcesFromProcess(process.Resources))
// Adjust the reference affinity.
haveReferenceAffinity.Move(process.Config.Reference, process.Config.Domain, sourceNodeid, targetNodeid)

View File

@ -143,7 +143,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
// Mark nodes as throttling where at least one process is still throttling
for _, haveP := range have {
if haveP.Throttling {
if haveP.Resources.Throttling {
resources.Throttling(haveP.NodeID, true)
}
}
@ -182,7 +182,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
processid: haveP.Config.ProcessID(),
})
resources.Remove(haveP.NodeID, haveP.CPU, haveP.Mem)
resources.Remove(haveP.NodeID, ResourcesFromProcess(haveP.Resources))
continue
}
@ -219,7 +219,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
})
// Release the resources.
resources.Remove(haveP.NodeID, haveP.CPU, haveP.Mem)
resources.Remove(haveP.NodeID, ResourcesFromProcess(haveP.Resources))
}
}
@ -229,7 +229,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
for _, haveP := range wantOrderStart {
nodeid := haveP.NodeID
resources.Add(nodeid, haveP.Config.LimitCPU, haveP.Config.LimitMemory)
resources.Add(nodeid, ResourcesFromConfig(haveP.Config))
// TODO: check if the current node has actually enough resources available,
// otherwise it needs to be moved somewhere else. If the node doesn't
@ -347,7 +347,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
// Try to add the process to a node where other processes with the same reference currently reside.
raNodes := haveReferenceAffinity.Nodes(wantP.Config.Reference, wantP.Config.Domain)
for _, raNodeid := range raNodes {
if resources.HasNodeEnough(raNodeid, wantP.Config.LimitCPU, wantP.Config.LimitMemory) {
if resources.HasNodeEnough(raNodeid, ResourcesFromConfig(wantP.Config)) {
nodeid = raNodeid
break
}
@ -355,7 +355,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
// Find the node with the most resources available.
if len(nodeid) == 0 {
nodes := resources.FindBestNodes(wantP.Config.LimitCPU, wantP.Config.LimitMemory)
nodes := resources.FindBestNodes(ResourcesFromConfig(wantP.Config))
if len(nodes) > 0 {
nodeid = nodes[0]
}
@ -372,7 +372,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
opBudget -= 3
// Consume the resources
resources.Add(nodeid, wantP.Config.LimitCPU, wantP.Config.LimitMemory)
resources.Add(nodeid, ResourcesFromConfig(wantP.Config))
reality[pid] = nodeid

View File

@ -193,11 +193,13 @@ func TestSynchronizeOrderStop(t *testing.T) {
have := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@ -285,11 +287,13 @@ func TestSynchronizeOrderStart(t *testing.T) {
have := []node.Process{
{
NodeID: "node1",
Order: "stop",
State: "finished",
CPU: 0,
Mem: 0,
NodeID: "node1",
Order: "stop",
State: "finished",
Resources: node.ProcessResources{
CPU: 0,
Mem: 0,
},
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@ -388,11 +392,13 @@ func TestSynchronizeAddReferenceAffinity(t *testing.T) {
have := []node.Process{
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@ -490,11 +496,13 @@ func TestSynchronizeAddReferenceAffinityMultiple(t *testing.T) {
have := []node.Process{
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 2,
},
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@ -882,11 +890,13 @@ func TestSynchronizeRemove(t *testing.T) {
have := []node.Process{
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar",
@ -967,11 +977,13 @@ func TestSynchronizeAddRemove(t *testing.T) {
have := []node.Process{
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar2",
@ -1064,11 +1076,13 @@ func TestSynchronizeNoUpdate(t *testing.T) {
have := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar",
@ -1133,11 +1147,13 @@ func TestSynchronizeUpdate(t *testing.T) {
have := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar",
@ -1217,11 +1233,13 @@ func TestSynchronizeUpdateMetadata(t *testing.T) {
have := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar",
@ -1313,11 +1331,13 @@ func TestSynchronizeWaitDisconnectedNode(t *testing.T) {
have := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@ -1397,11 +1417,13 @@ func TestSynchronizeWaitDisconnectedNodeNoWish(t *testing.T) {
have := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@ -1493,11 +1515,13 @@ func TestSynchronizeWaitDisconnectedNodeUnrealisticWish(t *testing.T) {
have := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@ -1589,11 +1613,13 @@ func TestSynchronizeTimeoutDisconnectedNode(t *testing.T) {
have := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
UpdatedAt: now,
Config: &app.Config{
@ -1655,22 +1681,26 @@ func TestSynchronizeTimeoutDisconnectedNode(t *testing.T) {
func TestRebalanceNothingToDo(t *testing.T) {
processes := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 35,
Mem: 20,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 35,
Mem: 20,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar2",
@ -1711,33 +1741,39 @@ func TestRebalanceNothingToDo(t *testing.T) {
func TestRebalanceOverload(t *testing.T) {
processes := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 35,
Mem: 20,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 35,
Mem: 20,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 17,
Mem: 31,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 17,
Mem: 31,
},
Runtime: 27,
Config: &app.Config{
ID: "foobar3",
},
},
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar2",
@ -1806,33 +1842,39 @@ func TestRebalanceOverload(t *testing.T) {
func TestRebalanceSkip(t *testing.T) {
processes := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 35,
Mem: 20,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 35,
Mem: 20,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 17,
Mem: 31,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 17,
Mem: 31,
},
Runtime: 27,
Config: &app.Config{
ID: "foobar3",
},
},
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar2",
@ -1908,22 +1950,26 @@ func TestRebalanceSkip(t *testing.T) {
func TestRebalanceReferenceAffinity(t *testing.T) {
processes := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 1,
Config: &app.Config{
ID: "foobar2",
@ -1931,11 +1977,13 @@ func TestRebalanceReferenceAffinity(t *testing.T) {
},
},
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar3",
@ -1943,11 +1991,13 @@ func TestRebalanceReferenceAffinity(t *testing.T) {
},
},
{
NodeID: "node3",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node3",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar4",
@ -1955,11 +2005,13 @@ func TestRebalanceReferenceAffinity(t *testing.T) {
},
},
{
NodeID: "node3",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node3",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar5",
@ -2048,33 +2100,39 @@ func TestRebalanceReferenceAffinity(t *testing.T) {
func TestRebalanceRelocateTarget(t *testing.T) {
processes := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 35,
Mem: 20,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 35,
Mem: 20,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 17,
Mem: 31,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 17,
Mem: 31,
},
Runtime: 27,
Config: &app.Config{
ID: "foobar3",
},
},
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar2",
@ -2165,33 +2223,39 @@ func TestRebalanceRelocateTarget(t *testing.T) {
func TestRebalanceRelocateAny(t *testing.T) {
processes := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 35,
Mem: 20,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 35,
Mem: 20,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 17,
Mem: 31,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 17,
Mem: 31,
},
Runtime: 27,
Config: &app.Config{
ID: "foobar3",
},
},
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 12,
Mem: 5,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 12,
Mem: 5,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar2",
@ -2319,7 +2383,10 @@ func TestFindBestNodesForProcess(t *testing.T) {
resources := NewResourcePlanner(nodes)
list := resources.FindBestNodes(35, 20)
list := resources.FindBestNodes(Resources{
CPU: 35,
Mem: 20,
})
require.Equal(t, []string{"node3", "node2", "node1"}, list)
}
@ -2433,7 +2500,10 @@ func TestFindBestNodesForProcess2(t *testing.T) {
},
}
list := resources.FindBestNodes(4.0, 45*1024*1024)
list := resources.FindBestNodes(Resources{
CPU: 4.0,
Mem: 45 * 1024 * 1024,
})
require.Equal(t, []string{"node10", "node8", "node7", "node1", "node5", "node12", "node4", "node3", "node13", "node6", "node11", "node2"}, list)
}
@ -2441,11 +2511,13 @@ func TestFindBestNodesForProcess2(t *testing.T) {
func TestCreateNodeProcessMap(t *testing.T) {
processes := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "finished",
CPU: 1,
Mem: 1,
NodeID: "node1",
Order: "start",
State: "finished",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 1,
Config: &app.Config{
ID: "foobar7",
@ -2453,11 +2525,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
NodeID: "node1",
Order: "start",
State: "failed",
CPU: 1,
Mem: 1,
NodeID: "node1",
Order: "start",
State: "failed",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 1,
Config: &app.Config{
ID: "foobar8",
@ -2465,22 +2539,26 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 1,
Config: &app.Config{
ID: "foobar2",
@ -2488,11 +2566,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 67,
Config: &app.Config{
ID: "foobar3",
@ -2500,11 +2580,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar6",
@ -2512,11 +2594,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
NodeID: "node3",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node3",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 41,
Config: &app.Config{
ID: "foobar4",
@ -2524,11 +2608,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
NodeID: "node3",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node3",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar5",
@ -2542,11 +2628,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
require.Equal(t, map[string][]node.Process{
"node1": {
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 1,
Config: &app.Config{
ID: "foobar2",
@ -2554,11 +2642,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
@ -2567,11 +2657,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
"node2": {
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar6",
@ -2579,11 +2671,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 67,
Config: &app.Config{
ID: "foobar3",
@ -2593,11 +2687,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
"node3": {
{
NodeID: "node3",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node3",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 41,
Config: &app.Config{
ID: "foobar4",
@ -2605,11 +2701,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
},
},
{
NodeID: "node3",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node3",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar5",
@ -2623,22 +2721,26 @@ func TestCreateNodeProcessMap(t *testing.T) {
func TestCreateReferenceAffinityNodeMap(t *testing.T) {
processes := []node.Process{
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar1",
},
},
{
NodeID: "node1",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node1",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 1,
Config: &app.Config{
ID: "foobar2",
@ -2646,11 +2748,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) {
},
},
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar3",
@ -2658,11 +2762,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) {
},
},
{
NodeID: "node2",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node2",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar3",
@ -2670,11 +2776,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) {
},
},
{
NodeID: "node3",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node3",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar4",
@ -2682,11 +2790,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) {
},
},
{
NodeID: "node3",
Order: "start",
State: "running",
CPU: 1,
Mem: 1,
NodeID: "node3",
Order: "start",
State: "running",
Resources: node.ProcessResources{
CPU: 1,
Mem: 1,
},
Runtime: 42,
Config: &app.Config{
ID: "foobar5",

View File

@ -747,16 +747,62 @@ func (n *Core) MediaGetInfo(prefix, path string) (int64, time.Time, error) {
}
type Process struct {
NodeID string
Order string
State string
NodeID string
Order string
State string
Resources ProcessResources
Runtime time.Duration
UpdatedAt time.Time
Config *app.Config
Metadata map[string]interface{}
}
type ProcessResources struct {
CPU float64 // Current CPU load of this process, 0-100*ncpu
Mem uint64 // Currently consumed memory of this process in bytes
GPU ProcessGPUResources
Throttling bool
Runtime time.Duration
UpdatedAt time.Time
Config *app.Config
Metadata map[string]interface{}
}
type ProcessGPUResources struct {
Index int // GPU number
Usage float64 // Current GPU load, 0-100
Encoder float64 // Current GPU encoder load, 0-100
Decoder float64 // Current GPU decoder load, 0-100
Mem uint64 // Currently consumed GPU memory of this process in bytes
}
func (p *ProcessResources) Marshal(a *api.ProcessUsage) {
p.Throttling = a.CPU.IsThrottling
if x, err := a.CPU.Current.Float64(); err == nil {
p.CPU = x
} else {
p.CPU = 0
}
p.Mem = a.Memory.Current
if x, err := a.GPU.Usage.Current.Float64(); err == nil {
p.GPU.Usage = x
} else {
p.GPU.Usage = 0
}
if x, err := a.GPU.Encoder.Current.Float64(); err == nil {
p.GPU.Encoder = x
} else {
p.GPU.Encoder = 0
}
if x, err := a.GPU.Decoder.Current.Float64(); err == nil {
p.GPU.Decoder = x
} else {
p.GPU.Decoder = 0
}
p.GPU.Mem = a.GPU.Memory.Current
p.GPU.Index = a.GPU.Index
}
func (n *Core) ClusterProcessList() ([]Process, error) {
@ -780,21 +826,15 @@ func (n *Core) ClusterProcessList() ([]Process, error) {
p.Config = &api.ProcessConfig{}
}
cpu, err := p.State.Resources.CPU.Current.Float64()
if err != nil {
cpu = 0
process := Process{
NodeID: nodeid,
Order: p.State.Order,
State: p.State.State,
Runtime: time.Duration(p.State.Runtime) * time.Second,
UpdatedAt: time.Unix(p.UpdatedAt, 0),
}
process := Process{
NodeID: nodeid,
Order: p.State.Order,
State: p.State.State,
Mem: p.State.Resources.Memory.Current,
CPU: cpu,
Throttling: p.State.Resources.CPU.IsThrottling,
Runtime: time.Duration(p.State.Runtime) * time.Second,
UpdatedAt: time.Unix(p.UpdatedAt, 0),
}
process.Resources.Marshal(&p.State.Resources)
config, _ := p.Config.Marshal()

View File

@ -138,17 +138,28 @@ type About struct {
Resources Resources
}
type ResourcesGPU struct {
Mem uint64 // Currently used memory in bytes
MemLimit uint64 // Defined memory limit in bytes
MemTotal uint64 // Total available memory in bytes
Usage float64 // Current general usage, 0-100
UsageLimit float64 // Defined general usage limit, 0-100
Encoder float64 // Current encoder usage, 0-100
Decoder float64 // Current decoder usage, 0-100
}
type Resources struct {
IsThrottling bool // Whether this core is currently throttling
NCPU float64 // Number of CPU on this node
CPU float64 // Current CPU load, 0-100*ncpu
CPULimit float64 // Defined CPU load limit, 0-100*ncpu
CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu
Mem uint64 // Currently used memory in bytes
MemLimit uint64 // Defined memory limit in bytes
MemTotal uint64 // Total available memory in bytes
MemCore uint64 // Current used memory of the core itself in bytes
Error error // Last error
IsThrottling bool // Whether this core is currently throttling
NCPU float64 // Number of CPU on this node
CPU float64 // Current CPU load, 0-100*ncpu
CPULimit float64 // Defined CPU load limit, 0-100*ncpu
CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu
Mem uint64 // Currently used memory in bytes
MemLimit uint64 // Defined memory limit in bytes
MemTotal uint64 // Total available memory in bytes
MemCore uint64 // Current used memory of the core itself in bytes
GPU []ResourcesGPU // Currently used GPU resources
Error error // Last error
}
func (n *Node) About() About {
@ -514,6 +525,20 @@ func (n *Node) ping(ctx context.Context, interval time.Duration) {
Error: nil,
},
}
if len(about.Resources.GPU) != 0 {
n.nodeAbout.Resources.GPU = make([]ResourcesGPU, len(about.Resources.GPU))
for i, gpu := range about.Resources.GPU {
n.nodeAbout.Resources.GPU[i].Mem = gpu.Mem
n.nodeAbout.Resources.GPU[i].MemLimit = gpu.MemLimit
n.nodeAbout.Resources.GPU[i].MemTotal = gpu.MemTotal
n.nodeAbout.Resources.GPU[i].Usage = gpu.Usage
n.nodeAbout.Resources.GPU[i].UsageLimit = gpu.UsageLimit
n.nodeAbout.Resources.GPU[i].Encoder = gpu.Encoder
n.nodeAbout.Resources.GPU[i].Decoder = gpu.Decoder
}
}
if len(about.Resources.Error) != 0 {
n.nodeAbout.Resources.Error = errors.New(about.Resources.Error)
}

View File

@ -4,8 +4,69 @@ import (
"sort"
"github.com/datarhei/core/v16/cluster/node"
"github.com/datarhei/core/v16/restream/app"
)
// Resources describes the resource demand of a process as handled by the
// resource planner: CPU and memory, plus an optional GPU share.
type Resources struct {
	CPU float64      // CPU load, 0-100*ncpu
	Mem uint64       // Memory in bytes
	GPU ResourcesGPU // GPU resources
}
// ResourcesGPU describes the GPU share of a resource demand. Index is -1
// when no concrete GPU is assigned (see MarshalConfig).
type ResourcesGPU struct {
	Index   int     // GPU number
	Usage   float64 // GPU general, 0-100
	Encoder float64 // GPU encoder, 0-100
	Decoder float64 // GPU decoder, 0-100
	Mem     uint64  // GPU memory in bytes
}
// ResourcesFromConfig returns the resource demand derived from the limits of
// a process config.
func ResourcesFromConfig(c *app.Config) Resources {
	var res Resources
	res.MarshalConfig(c)

	return res
}
// ResourcesFromProcess returns the resource demand derived from the current
// usage of a process.
func ResourcesFromProcess(c node.ProcessResources) Resources {
	var res Resources
	res.MarshalProcess(c)

	return res
}
// MarshalConfig fills r with the limits from the process config c. The GPU
// index is set to -1 because a config does not pin a concrete GPU.
//
// NOTE(review): c.LimitGPU.Memory is not copied into r.GPU.Mem — confirm
// whether the GPU memory demand is intentionally ignored here.
func (r *Resources) MarshalConfig(c *app.Config) {
	r.CPU = c.LimitCPU
	r.Mem = c.LimitMemory

	r.GPU.Index = -1
	r.GPU.Usage = c.LimitGPU.Usage
	r.GPU.Encoder = c.LimitGPU.Encoder
	r.GPU.Decoder = c.LimitGPU.Decoder
}
// MarshalProcess fills r with the current resource usage of a process,
// including which GPU it runs on.
//
// NOTE(review): no GPU memory value is copied into r.GPU.Mem — confirm this
// is intentional (mirrors MarshalConfig).
func (r *Resources) MarshalProcess(c node.ProcessResources) {
	r.CPU = c.CPU
	r.Mem = c.Mem

	r.GPU.Index = c.GPU.Index
	r.GPU.Usage = c.GPU.Usage
	r.GPU.Encoder = c.GPU.Encoder
	r.GPU.Decoder = c.GPU.Decoder
}
// HasGPU reports whether any GPU demand (general, encoder, decoder usage, or
// memory) is set on r.
func (r *Resources) HasGPU() bool {
	return r.GPU.Usage > 0 || r.GPU.Encoder > 0 || r.GPU.Decoder > 0 || r.GPU.Mem > 0
}
// DoesFitGPU reports whether the GPU demand of r fits onto the GPU g: adding
// the demand must keep general, encoder, and decoder usage strictly below the
// usage limit, and memory strictly below the memory limit.
func (r *Resources) DoesFitGPU(g node.ResourcesGPU) bool {
	if g.Usage+r.GPU.Usage >= g.UsageLimit {
		return false
	}

	if g.Encoder+r.GPU.Encoder >= g.UsageLimit {
		return false
	}

	if g.Decoder+r.GPU.Decoder >= g.UsageLimit {
		return false
	}

	if g.Mem+r.GPU.Mem >= g.MemLimit {
		return false
	}

	return true
}
type resourcePlanner struct {
nodes map[string]node.Resources
blocked map[string]struct{}
@ -39,8 +100,8 @@ func (r *resourcePlanner) Throttling(nodeid string, throttling bool) {
}
// HasNodeEnough returns whether a node has enough resources available for the
// requested cpu and memory consumption.
func (r *resourcePlanner) HasNodeEnough(nodeid string, cpu float64, mem uint64) bool {
// requested cpu, memory, and gpu consumption.
func (r *resourcePlanner) HasNodeEnough(nodeid string, req Resources) bool {
res, hasNode := r.nodes[nodeid]
if !hasNode {
return false
@ -50,20 +111,39 @@ func (r *resourcePlanner) HasNodeEnough(nodeid string, cpu float64, mem uint64)
return false
}
if res.Error == nil && res.CPU+cpu < res.CPULimit && res.Mem+mem < res.MemLimit && !res.IsThrottling {
return true
if res.Error != nil || res.IsThrottling {
return false
}
return false
if res.CPU+req.CPU >= res.CPULimit || res.Mem+req.Mem >= res.MemLimit {
return false
}
if req.HasGPU() {
found := false
for _, g := range res.GPU {
if req.DoesFitGPU(g) {
found = true
break
}
}
if !found {
return false
}
}
return true
}
// FindBestNodes returns an array of nodeids that can fit the requested cpu and memory requirements. If no
// FindBestNodes returns an array of nodeids that can fit the requested cpu, memory, and gpu requirements. If no
// such node is available, an empty array is returned. The array is sorted by the most suitable node first.
func (r *resourcePlanner) FindBestNodes(cpu float64, mem uint64) []string {
func (r *resourcePlanner) FindBestNodes(req Resources) []string {
nodes := []string{}
for id := range r.nodes {
if r.HasNodeEnough(id, cpu, mem) {
if r.HasNodeEnough(id, req) {
nodes = append(nodes, id)
}
}
@ -81,43 +161,72 @@ func (r *resourcePlanner) FindBestNodes(cpu float64, mem uint64) []string {
return nodes
}
// Add adds the resources of the node according to the cpu and memory utilization.
func (r *resourcePlanner) Add(nodeid string, cpu float64, mem uint64) {
// Add adds the resources of the node according to the cpu, memory, and gpu utilization.
func (r *resourcePlanner) Add(nodeid string, req Resources) {
res, hasRes := r.nodes[nodeid]
if !hasRes {
return
}
res.CPU += cpu
res.Mem += mem
res.CPU += req.CPU
res.Mem += req.Mem
if req.HasGPU() {
for i, g := range res.GPU {
if req.DoesFitGPU(g) {
g.Usage += req.GPU.Usage
g.Encoder += req.GPU.Encoder
g.Decoder += req.GPU.Decoder
g.Mem += req.GPU.Mem
res.GPU[i] = g
break
}
}
}
r.nodes[nodeid] = res
}
// Remove subtracts the resources from the node according to the cpu and memory utilization.
func (r *resourcePlanner) Remove(nodeid string, cpu float64, mem uint64) {
// Remove subtracts the resources from the node according to the cpu, memory, and gpu utilization.
func (r *resourcePlanner) Remove(nodeid string, req Resources) {
res, hasRes := r.nodes[nodeid]
if !hasRes {
return
}
res.CPU -= cpu
if res.CPU < 0 {
res.CPU = 0
}
if mem >= res.Mem {
res.Mem = 0
} else {
res.Mem -= mem
res.CPU -= min(res.CPU, req.CPU)
res.Mem -= min(res.Mem, req.Mem)
if req.HasGPU() {
if req.GPU.Index > 0 && req.GPU.Index < len(res.GPU) {
gpu := res.GPU[req.GPU.Index]
gpu.Usage -= min(gpu.Usage, req.GPU.Usage)
gpu.Encoder -= min(gpu.Encoder, req.GPU.Encoder)
gpu.Decoder -= min(gpu.Decoder, req.GPU.Decoder)
gpu.Mem -= min(gpu.Mem, req.GPU.Mem)
res.GPU[req.GPU.Index] = gpu
}
}
r.nodes[nodeid] = res
}
// Move adjusts the resources from the target and source node according to the cpu and memory utilization.
func (r *resourcePlanner) Move(target, source string, cpu float64, mem uint64) {
r.Add(target, cpu, mem)
r.Remove(source, cpu, mem)
func (r *resourcePlanner) Move(target, source string, req Resources) {
r.Add(target, req)
r.Remove(source, req)
}
// Map returns the planner's per-node resource bookkeeping.
//
// NOTE(review): the internal map is returned without copying; callers share
// state with the planner.
func (r *resourcePlanner) Map() map[string]node.Resources {
	return r.nodes
}
// Blocked returns the IDs of all nodes that are currently excluded from
// scheduling.
func (r *resourcePlanner) Blocked() []string {
	ids := make([]string, 0, len(r.blocked))
	for id := range r.blocked {
		ids = append(ids, id)
	}

	return ids
}

603
cluster/resources_test.go Normal file
View File

@ -0,0 +1,603 @@
package cluster
import (
"testing"
"github.com/datarhei/core/v16/cluster/node"
"github.com/stretchr/testify/require"
)
// TestResources verifies that HasGPU only reports true once some GPU demand
// has been set.
func TestResources(t *testing.T) {
	r := Resources{
		CPU: 1,
		Mem: 1,
	}

	require.False(t, r.HasGPU())

	r.GPU = ResourcesGPU{
		Index: 0,
		Usage: 1,
		Mem:   1,
	}

	require.True(t, r.HasGPU())
}
func TestResourcePlanner(t *testing.T) {
nodes := map[string]node.About{
"node1": {
State: "online",
Resources: node.Resources{
NCPU: 1,
CPU: 7,
Mem: 35,
CPULimit: 90,
MemLimit: 90,
},
},
"node2": {
State: "online",
Resources: node.Resources{
NCPU: 1,
CPU: 85,
Mem: 11,
CPULimit: 90,
MemLimit: 90,
},
},
}
planner := NewResourcePlanner(nodes)
require.Equal(t, map[string]node.Resources{
"node1": {
NCPU: 1,
CPU: 7,
Mem: 35,
CPULimit: 90,
MemLimit: 90,
},
"node2": {
NCPU: 1,
CPU: 85,
Mem: 11,
CPULimit: 90,
MemLimit: 90,
},
}, planner.Map())
}
func TestResourcePlannerBlocked(t *testing.T) {
nodes := map[string]node.About{
"node1": {
State: "degraded",
Resources: node.Resources{
NCPU: 1,
CPU: 7,
Mem: 35,
CPULimit: 90,
MemLimit: 90,
},
},
"node2": {
State: "online",
Resources: node.Resources{
NCPU: 1,
CPU: 85,
Mem: 11,
CPULimit: 90,
MemLimit: 90,
},
},
}
planner := NewResourcePlanner(nodes)
require.Equal(t, []string{"node1"}, planner.Blocked())
}
func TestResourcePlannerThrottling(t *testing.T) {
nodes := map[string]node.About{
"node1": {
State: "online",
Resources: node.Resources{
NCPU: 1,
CPU: 7,
Mem: 35,
CPULimit: 90,
MemLimit: 90,
},
},
"node2": {
State: "online",
Resources: node.Resources{
NCPU: 1,
CPU: 85,
Mem: 11,
CPULimit: 90,
MemLimit: 90,
},
},
}
planner := NewResourcePlanner(nodes)
require.True(t, planner.HasNodeEnough("node1", Resources{
CPU: 30,
Mem: 5,
}))
planner.Throttling("node1", true)
require.False(t, planner.HasNodeEnough("node1", Resources{
CPU: 30,
Mem: 5,
}))
planner.Throttling("node1", false)
require.True(t, planner.HasNodeEnough("node1", Resources{
CPU: 30,
Mem: 5,
}))
}
// TestResourcePlannerHasNodeEnough checks the cpu/mem fit as well as the GPU
// fit against nodes with and without free GPU capacity.
//
// Fix: the function was named "TestRecourcePlannerHasNodeEnough" (typo),
// which breaks name-based filtering with `go test -run TestResourcePlanner`.
func TestResourcePlannerHasNodeEnough(t *testing.T) {
	nodes := map[string]node.About{
		"node1": {
			State: "online",
			Resources: node.Resources{
				NCPU:     1,
				CPU:      7,
				Mem:      35,
				CPULimit: 90,
				MemLimit: 90,
				GPU: []node.ResourcesGPU{
					{
						Mem:        5,
						MemLimit:   90,
						Usage:      53,
						UsageLimit: 90,
						Encoder:    32,
						Decoder:    26,
					},
					{
						Mem:        85,
						MemLimit:   90,
						Usage:      64,
						UsageLimit: 90,
						Encoder:    43,
						Decoder:    12,
					},
				},
			},
		},
		"node2": {
			State: "online",
			Resources: node.Resources{
				NCPU:     1,
				CPU:      85,
				Mem:      11,
				CPULimit: 90,
				MemLimit: 90,
				GPU: []node.ResourcesGPU{
					{
						Mem:        5,
						MemLimit:   90,
						Usage:      53,
						UsageLimit: 90,
						Encoder:    32,
						Decoder:    26,
					},
				},
			},
		},
	}

	planner := NewResourcePlanner(nodes)

	// Plain cpu/mem fit: node2 is already too loaded on CPU.
	require.True(t, planner.HasNodeEnough("node1", Resources{
		CPU: 30,
		Mem: 5,
	}))

	require.False(t, planner.HasNodeEnough("node2", Resources{
		CPU: 30,
		Mem: 5,
	}))

	// GPU memory demand that fits on node1's first GPU.
	require.True(t, planner.HasNodeEnough("node1", Resources{
		CPU: 30,
		Mem: 5,
		GPU: ResourcesGPU{
			Mem: 50,
		},
	}))

	// GPU memory demand too large for either of node1's GPUs.
	require.False(t, planner.HasNodeEnough("node1", Resources{
		CPU: 30,
		Mem: 5,
		GPU: ResourcesGPU{
			Mem: 86,
		},
	}))

	// Encoder + memory demand that still fits on node1's first GPU.
	require.True(t, planner.HasNodeEnough("node1", Resources{
		CPU: 30,
		Mem: 5,
		GPU: ResourcesGPU{
			Encoder: 50,
			Mem:     50,
		},
	}))
}
func TestResourcePlannerAdd(t *testing.T) {
nodes := map[string]node.About{
"node1": {
State: "online",
Resources: node.Resources{
NCPU: 1,
CPU: 7,
Mem: 35,
CPULimit: 90,
MemLimit: 90,
},
},
}
planner := NewResourcePlanner(nodes)
planner.Add("node1", Resources{
CPU: 42,
Mem: 33,
})
require.Equal(t, map[string]node.Resources{
"node1": {
NCPU: 1,
CPU: 49,
Mem: 68,
CPULimit: 90,
MemLimit: 90,
},
}, planner.Map())
}
func TestResourcePlannerNoGPUAddGPU(t *testing.T) {
nodes := map[string]node.About{
"node1": {
State: "online",
Resources: node.Resources{
NCPU: 1,
CPU: 7,
Mem: 35,
CPULimit: 90,
MemLimit: 90,
},
},
}
planner := NewResourcePlanner(nodes)
planner.Add("node1", Resources{
CPU: 42,
Mem: 33,
GPU: ResourcesGPU{
Index: 0,
Usage: 1,
Encoder: 2,
Decoder: 3,
Mem: 4,
},
})
require.Equal(t, map[string]node.Resources{
"node1": {
NCPU: 1,
CPU: 49,
Mem: 68,
CPULimit: 90,
MemLimit: 90,
},
}, planner.Map())
}
func TestResourcePlannerAddGPU(t *testing.T) {
nodes := map[string]node.About{
"node1": {
State: "online",
Resources: node.Resources{
NCPU: 1,
CPU: 7,
Mem: 35,
CPULimit: 90,
MemLimit: 90,
GPU: []node.ResourcesGPU{
{
Mem: 0,
MemLimit: 0,
Usage: 0,
UsageLimit: 0,
Encoder: 0,
Decoder: 0,
},
{
Mem: 0,
MemLimit: 100,
Usage: 0,
UsageLimit: 100,
Encoder: 0,
Decoder: 0,
},
},
},
},
}
planner := NewResourcePlanner(nodes)
planner.Add("node1", Resources{
CPU: 42,
Mem: 33,
GPU: ResourcesGPU{
Usage: 1,
Encoder: 2,
Decoder: 3,
Mem: 4,
},
})
require.Equal(t, map[string]node.Resources{
"node1": {
NCPU: 1,
CPU: 49,
Mem: 68,
CPULimit: 90,
MemLimit: 90,
GPU: []node.ResourcesGPU{
{
Mem: 0,
MemLimit: 0,
Usage: 0,
UsageLimit: 0,
Encoder: 0,
Decoder: 0,
},
{
Mem: 4,
MemLimit: 100,
Usage: 1,
UsageLimit: 100,
Encoder: 2,
Decoder: 3,
},
},
},
}, planner.Map())
}
func TestResourcePlannerRemove(t *testing.T) {
nodes := map[string]node.About{
"node1": {
State: "online",
Resources: node.Resources{
NCPU: 1,
CPU: 53,
Mem: 35,
CPULimit: 90,
MemLimit: 90,
},
},
}
planner := NewResourcePlanner(nodes)
planner.Remove("node1", Resources{
CPU: 13,
Mem: 20,
})
require.Equal(t, map[string]node.Resources{
"node1": {
NCPU: 1,
CPU: 40,
Mem: 15,
CPULimit: 90,
MemLimit: 90,
},
}, planner.Map())
}
func TestResourcePlannerRemoveTooMuch(t *testing.T) {
nodes := map[string]node.About{
"node1": {
State: "online",
Resources: node.Resources{
NCPU: 1,
CPU: 53,
Mem: 35,
CPULimit: 90,
MemLimit: 90,
},
},
}
planner := NewResourcePlanner(nodes)
planner.Remove("node1", Resources{
CPU: 100,
Mem: 100,
})
require.Equal(t, map[string]node.Resources{
"node1": {
NCPU: 1,
CPU: 0,
Mem: 0,
CPULimit: 90,
MemLimit: 90,
},
}, planner.Map())
}
func TestResourcePlannerRemoveGPU(t *testing.T) {
nodes := map[string]node.About{
"node1": {
State: "online",
Resources: node.Resources{
NCPU: 1,
CPU: 53,
Mem: 35,
CPULimit: 90,
MemLimit: 90,
GPU: []node.ResourcesGPU{
{
Mem: 4,
MemLimit: 100,
Usage: 1,
UsageLimit: 100,
Encoder: 2,
Decoder: 3,
},
{
Mem: 23,
MemLimit: 100,
Usage: 43,
UsageLimit: 100,
Encoder: 95,
Decoder: 12,
},
},
},
},
}
planner := NewResourcePlanner(nodes)
planner.Remove("node1", Resources{
CPU: 13,
Mem: 20,
GPU: ResourcesGPU{
Index: 1,
Usage: 3,
Encoder: 40,
Decoder: 0,
Mem: 5,
},
})
require.Equal(t, map[string]node.Resources{
"node1": {
NCPU: 1,
CPU: 40,
Mem: 15,
CPULimit: 90,
MemLimit: 90,
GPU: []node.ResourcesGPU{
{
Mem: 4,
MemLimit: 100,
Usage: 1,
UsageLimit: 100,
Encoder: 2,
Decoder: 3,
},
{
Mem: 18,
MemLimit: 100,
Usage: 40,
UsageLimit: 100,
Encoder: 55,
Decoder: 12,
},
},
},
}, planner.Map())
}
func TestResourcePlannerRemoveGPUTooMuch(t *testing.T) {
nodes := map[string]node.About{
"node1": {
State: "online",
Resources: node.Resources{
NCPU: 1,
CPU: 53,
Mem: 35,
CPULimit: 90,
MemLimit: 90,
GPU: []node.ResourcesGPU{
{
Mem: 4,
MemLimit: 100,
Usage: 1,
UsageLimit: 100,
Encoder: 2,
Decoder: 3,
},
{
Mem: 23,
MemLimit: 100,
Usage: 43,
UsageLimit: 100,
Encoder: 95,
Decoder: 12,
},
},
},
},
}
planner := NewResourcePlanner(nodes)
planner.Remove("node1", Resources{
CPU: 13,
Mem: 20,
GPU: ResourcesGPU{
Index: 1,
Usage: 100,
Encoder: 100,
Decoder: 100,
Mem: 100,
},
})
require.Equal(t, map[string]node.Resources{
"node1": {
NCPU: 1,
CPU: 40,
Mem: 15,
CPULimit: 90,
MemLimit: 90,
GPU: []node.ResourcesGPU{
{
Mem: 4,
MemLimit: 100,
Usage: 1,
UsageLimit: 100,
Encoder: 2,
Decoder: 3,
},
{
Mem: 0,
MemLimit: 100,
Usage: 0,
UsageLimit: 100,
Encoder: 0,
Decoder: 0,
},
},
},
}, planner.Map())
}

View File

@ -306,8 +306,10 @@ func (d *Config) init() {
d.vars.Register(value.NewDir(&d.Router.UIPath, "", d.fs), "router.ui_path", "CORE_ROUTER_UI_PATH", nil, "Path to a directory holding UI files mounted as /ui", false, false)
// Resources
d.vars.Register(value.NewFloat(&d.Resources.MaxCPUUsage, 0), "resources.max_cpu_usage", "CORE_RESOURCES_MAX_CPU_USAGE", nil, "Maximum system CPU usage in percent, from 0 (no limit) to 100", false, false)
d.vars.Register(value.NewFloat(&d.Resources.MaxMemoryUsage, 0), "resources.max_memory_usage", "CORE_RESOURCES_MAX_MEMORY_USAGE", nil, "Maximum system usage in percent, from 0 (no limit) to 100", false, false)
d.vars.Register(value.NewFloatRange(&d.Resources.MaxCPUUsage, 0, 0, 100), "resources.max_cpu_usage", "CORE_RESOURCES_MAX_CPU_USAGE", nil, "Maximum system CPU usage in percent, from 0 (no limit) to 100", false, false)
d.vars.Register(value.NewFloatRange(&d.Resources.MaxMemoryUsage, 0, 0, 100), "resources.max_memory_usage", "CORE_RESOURCES_MAX_MEMORY_USAGE", nil, "Maximum system usage in percent, from 0 (no limit) to 100", false, false)
d.vars.Register(value.NewFloatRange(&d.Resources.MaxGPUUsage, 0, 0, 100), "resources.max_gpu_usage", "CORE_RESOURCES_MAX_GPU_USAGE", nil, "Maximum general, encoder, and decoder GPU usage in percent per GPU, from 0 (no limit) to 100", false, false)
d.vars.Register(value.NewFloatRange(&d.Resources.MaxGPUMemoryUsage, 0, 0, 100), "resources.max_gpu_memory_usage", "CORE_RESOURCES_MAX_GPU_MEMORY_USAGE", nil, "Maximum GPU memory usage in percent per GPU, from 0 (no limit) to 100", false, false)
// Cluster
d.vars.Register(value.NewBool(&d.Cluster.Enable, false), "cluster.enable", "CORE_CLUSTER_ENABLE", nil, "Enable cluster mode", false, false)
@ -494,17 +496,6 @@ func (d *Config) Validate(resetLogs bool) {
}
}
// If resource limits are given, all values must be set
if d.Resources.MaxCPUUsage > 0 || d.Resources.MaxMemoryUsage > 0 {
if d.Resources.MaxCPUUsage <= 0 || d.Resources.MaxCPUUsage > 100 {
d.vars.Log("error", "resources.max_cpu_usage", "must be greater than 0 and smaller or equal to 100")
}
if d.Resources.MaxMemoryUsage <= 0 {
d.vars.Log("error", "resources.max_memory_usage", "must be greater than 0 and smaller or equal to 100")
}
}
// If cluster mode is enabled, a proper address must be provided
if d.Cluster.Enable {
if len(d.Cluster.Address) == 0 {

View File

@ -184,8 +184,10 @@ type Data struct {
UIPath string `json:"ui_path"`
} `json:"router"`
Resources struct {
MaxCPUUsage float64 `json:"max_cpu_usage"` // percent 0-100
MaxMemoryUsage float64 `json:"max_memory_usage"` // percent 0-100
MaxCPUUsage float64 `json:"max_cpu_usage"` // percent 0-100
MaxMemoryUsage float64 `json:"max_memory_usage"` // percent 0-100
MaxGPUUsage float64 `json:"max_gpu_usage"` // percent 0-100
MaxGPUMemoryUsage float64 `json:"max_gpu_memory_usage"` // percent 0-100
} `json:"resources"`
Cluster struct {
Enable bool `json:"enable"`

View File

@ -1,6 +1,7 @@
package value
import (
"fmt"
"sort"
"strconv"
"strings"
@ -310,3 +311,56 @@ func (u *Float64) Validate() error {
func (u *Float64) IsEmpty() bool {
return float64(*u) == 0
}
// float64 range

// Float64Range holds a float64 config value that is considered valid only
// within the inclusive range [from, to].
type Float64Range struct {
	p    *float64
	from float64
	to   float64
}

// NewFloatRange binds a Float64Range to p, stores val as the initial value,
// and records the valid range [from, to]. The initial value is not
// range-checked; validation happens in Validate.
func NewFloatRange(p *float64, val, from, to float64) *Float64Range {
	*p = val

	return &Float64Range{
		p:    p,
		from: from,
		to:   to,
	}
}

// Set parses val as a float64 and stores it. The range is not enforced here;
// an out-of-range value is reported by Validate.
func (s *Float64Range) Set(val string) error {
	v, err := strconv.ParseFloat(val, 64)
	if err != nil {
		return err
	}

	*s.p = v

	return nil
}

// String formats the value with three decimals, or "(empty)" for the zero
// value.
func (s *Float64Range) String() string {
	if s.IsEmpty() {
		return "(empty)"
	}

	return fmt.Sprintf("%.3f", *s.p)
}

// Validate returns an error if the current value lies outside [from, to].
func (s *Float64Range) Validate() error {
	if v := *s.p; v < s.from || v > s.to {
		return fmt.Errorf("value %f is not in range [%f, %f]", v, s.from, s.to)
	}

	return nil
}

// IsEmpty reports whether the value is exactly zero.
func (s *Float64Range) IsEmpty() bool {
	return *s.p == 0
}

View File

@ -165,3 +165,29 @@ func TestFloat64Value(t *testing.T) {
require.Equal(t, float64(77.7), x)
}
// TestFloat64RangeValue exercises Float64Range: formatting, validation,
// emptiness, and Set parsing. Set deliberately accepts out-of-range values;
// only Validate reports them.
func TestFloat64RangeValue(t *testing.T) {
	var x float64

	val := NewFloatRange(&x, 11.1, 0, 100)

	require.Equal(t, "11.100", val.String())
	require.NoError(t, val.Validate())
	require.Equal(t, false, val.IsEmpty())

	x = 42.5

	require.Equal(t, "42.500", val.String())
	require.NoError(t, val.Validate())
	require.Equal(t, false, val.IsEmpty())

	// Fix: the error returns of Set were previously ignored.
	require.NoError(t, val.Set("77.7"))
	require.Equal(t, float64(77.7), x)

	// Out-of-range values pass Set but fail Validate.
	require.NoError(t, val.Set("101.9"))
	require.Equal(t, "101.900", val.String())
	require.Error(t, val.Validate())
	require.Equal(t, false, val.IsEmpty())
}

View File

@ -29,23 +29,26 @@ type FFmpeg interface {
}
type ProcessConfig struct {
Reconnect bool // Whether to reconnect
ReconnectDelay time.Duration // Duration until next reconnect
StaleTimeout time.Duration // Duration to wait until killing the process if there is no progress in the process
Timeout time.Duration // Duration to wait until killing the process
LimitCPU float64 // Kill the process if the CPU usage in percent is above this value.
LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value.
LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration.
LimitMode string // How to limit the process, "hard" or "soft"
Scheduler string // A scheduler for starting the process, either a concrete date (RFC3339) or in crontab syntax
Args []string // Arguments for the process
Parser process.Parser // Parser for the process output
Logger log.Logger // Logger
OnArgs func([]string) []string // Callback before starting the process to retrieve new arguments
OnBeforeStart func() error // Callback which is called before the process will be started. If error is non-nil, the start will be refused.
OnStart func() // Callback called after process has been started
OnExit func(state string) // Callback called after the process stopped with exit state as argument
OnStateChange func(from, to string) // Callback called on state change
Reconnect bool // Whether to reconnect
ReconnectDelay time.Duration // Duration until next reconnect
StaleTimeout time.Duration // Duration to wait until killing the process if there is no progress in the process
Timeout time.Duration // Duration to wait until killing the process
LimitCPU float64 // Kill the process if the CPU usage in percent is above this value.
LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value.
	LimitGPUUsage   float64 // Kill the process if the GPU usage (general) in percent is above this value.
	LimitGPUEncoder float64 // Kill the process if the GPU usage (encoder) in percent is above this value.
	LimitGPUDecoder float64 // Kill the process if the GPU usage (decoder) in percent is above this value.
LimitGPUMemory uint64 // Kill the process if the GPU memory consumption in bytes is above this value.
LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration.
LimitMode string // How to limit the process, "hard" or "soft"
Scheduler string // A scheduler for starting the process, either a concrete date (RFC3339) or in crontab syntax
Args []string // Arguments for the process
Parser process.Parser // Parser for the process output
Logger log.Logger // Logger
OnBeforeStart func([]string) ([]string, error) // Callback which is called before the process will be started. The string slice is the list of arguments which can be modified. If error is non-nil, the start will be refused.
OnStart func() // Callback called after process has been started
OnExit func(state string) // Callback called after the process stopped with exit state as argument
OnStateChange func(from, to string) // Callback called on state change
}
// Config is the configuration for ffmpeg that is part of the configuration
@ -138,23 +141,26 @@ func (f *ffmpeg) New(config ProcessConfig) (process.Process, error) {
}
ffmpeg, err := process.New(process.Config{
Binary: f.binary,
Args: config.Args,
Reconnect: config.Reconnect,
ReconnectDelay: config.ReconnectDelay,
StaleTimeout: config.StaleTimeout,
Timeout: config.Timeout,
LimitCPU: config.LimitCPU,
LimitMemory: config.LimitMemory,
LimitDuration: config.LimitDuration,
LimitMode: limitMode,
Scheduler: scheduler,
Parser: config.Parser,
Logger: config.Logger,
OnArgs: config.OnArgs,
OnBeforeStart: config.OnBeforeStart,
OnStart: config.OnStart,
OnExit: config.OnExit,
Binary: f.binary,
Args: config.Args,
Reconnect: config.Reconnect,
ReconnectDelay: config.ReconnectDelay,
StaleTimeout: config.StaleTimeout,
Timeout: config.Timeout,
LimitCPU: config.LimitCPU,
LimitMemory: config.LimitMemory,
LimitGPUUsage: config.LimitGPUUsage,
LimitGPUEncoder: config.LimitGPUEncoder,
LimitGPUDecoder: config.LimitGPUDecoder,
LimitGPUMemory: config.LimitGPUMemory,
LimitDuration: config.LimitDuration,
LimitMode: limitMode,
Scheduler: scheduler,
Parser: config.Parser,
Logger: config.Logger,
OnBeforeStart: config.OnBeforeStart,
OnStart: config.OnStart,
OnExit: config.OnExit,
OnStateChange: func(from, to string) {
f.statesLock.Lock()
switch to {

View File

@ -619,7 +619,7 @@ func (p *parser) Stop(state string, pusage process.Usage) {
usage.CPU.Max = pusage.CPU.Max
usage.CPU.Limit = pusage.CPU.Limit
usage.Memory.Average = pusage.Memory.Average
usage.Memory.Average = uint64(pusage.Memory.Average)
usage.Memory.Max = pusage.Memory.Max
usage.Memory.Limit = pusage.Memory.Limit

View File

@ -576,6 +576,7 @@ type AVstream struct {
type Usage struct {
CPU UsageCPU
Memory UsageMemory
GPU UsageGPU
}
type UsageCPU struct {
@ -586,7 +587,27 @@ type UsageCPU struct {
}
type UsageMemory struct {
Average uint64
Max uint64
Limit uint64
}
type UsageGPU struct {
Index int
Usage UsageGPUUsage
Encoder UsageGPUUsage
Decoder UsageGPUUsage
Memory UsageGPUMemory
}
type UsageGPUUsage struct {
Average float64
Max float64
Limit float64
}
type UsageGPUMemory struct {
Average uint64
Max uint64
Limit uint64
}

View File

@ -155,9 +155,13 @@ type ProcessConfigIOCleanup struct {
}
type ProcessConfigLimits struct {
CPU float64 `json:"cpu_usage" jsonschema:"minimum=0"`
Memory uint64 `json:"memory_mbytes" jsonschema:"minimum=0" format:"uint64"`
WaitFor uint64 `json:"waitfor_seconds" jsonschema:"minimum=0" format:"uint64"`
CPU float64 `json:"cpu_usage" jsonschema:"minimum=0"`
Memory uint64 `json:"memory_mbytes" jsonschema:"minimum=0" format:"uint64"`
GPUUsage float64 `json:"gpu_usage" jsonschema:"minimum=0"`
GPUEncoder float64 `json:"gpu_encoder" jsonschema:"minimum=0"`
GPUDecoder float64 `json:"gpu_decoder" jsonschema:"minimum=0"`
GPUMemory uint64 `json:"gpu_memory_mbytes" jsonschema:"minimum=0" format:"uint64"`
WaitFor uint64 `json:"waitfor_seconds" jsonschema:"minimum=0" format:"uint64"`
}
// ProcessConfig represents the configuration of an ffmpeg process
@ -197,7 +201,13 @@ func (cfg *ProcessConfig) Marshal() (*app.Config, map[string]interface{}) {
Scheduler: cfg.Scheduler,
LimitCPU: cfg.Limits.CPU,
LimitMemory: cfg.Limits.Memory * 1024 * 1024,
LimitWaitFor: cfg.Limits.WaitFor,
LimitGPU: app.ConfigLimitGPU{
Usage: cfg.Limits.GPUUsage,
Encoder: cfg.Limits.GPUEncoder,
Decoder: cfg.Limits.GPUDecoder,
Memory: cfg.Limits.GPUMemory * 1024 * 1024,
},
LimitWaitFor: cfg.Limits.WaitFor,
}
cfg.generateInputOutputIDs(cfg.Input)
@ -283,6 +293,10 @@ func (cfg *ProcessConfig) Unmarshal(c *app.Config, metadata map[string]interface
cfg.Scheduler = c.Scheduler
cfg.Limits.CPU = c.LimitCPU
cfg.Limits.Memory = c.LimitMemory / 1024 / 1024
cfg.Limits.GPUUsage = c.LimitGPU.Usage
cfg.Limits.GPUEncoder = c.LimitGPU.Encoder
cfg.Limits.GPUDecoder = c.LimitGPU.Decoder
cfg.Limits.GPUMemory = c.LimitGPU.Memory / 1024 / 1024
cfg.Limits.WaitFor = c.LimitWaitFor
cfg.Options = make([]string, len(c.Options))
@ -364,20 +378,7 @@ func (s *ProcessState) Unmarshal(state *app.State) {
s.Memory = state.Memory
s.CPU = json.ToNumber(state.CPU)
s.LimitMode = state.LimitMode
s.Resources.CPU = ProcessUsageCPU{
NCPU: json.ToNumber(state.Resources.CPU.NCPU),
Current: json.ToNumber(state.Resources.CPU.Current),
Average: json.ToNumber(state.Resources.CPU.Average),
Max: json.ToNumber(state.Resources.CPU.Max),
Limit: json.ToNumber(state.Resources.CPU.Limit),
IsThrottling: state.Resources.CPU.IsThrottling,
}
s.Resources.Memory = ProcessUsageMemory{
Current: state.Resources.Memory.Current,
Average: json.ToNumber(state.Resources.Memory.Average),
Max: state.Resources.Memory.Max,
Limit: state.Resources.Memory.Limit,
}
s.Resources.Unmarshal(&state.Resources)
s.Command = state.Command
s.Progress.Unmarshal(&state.Progress)
@ -430,15 +431,15 @@ func (p *ProcessUsageCPU) Marshal() app.ProcessUsageCPU {
}
type ProcessUsageMemory struct {
Current uint64 `json:"cur" format:"uint64"`
Average json.Number `json:"avg" swaggertype:"number" jsonschema:"type=number"`
Max uint64 `json:"max" format:"uint64"`
Limit uint64 `json:"limit" format:"uint64"`
Current uint64 `json:"cur" format:"uint64"`
Average uint64 `json:"avg" format:"uint64"`
Max uint64 `json:"max" format:"uint64"`
Limit uint64 `json:"limit" format:"uint64"`
}
func (p *ProcessUsageMemory) Unmarshal(pp *app.ProcessUsageMemory) {
p.Current = pp.Current
p.Average = json.ToNumber(pp.Average)
p.Average = pp.Average
p.Max = pp.Max
p.Limit = pp.Limit
}
@ -446,31 +447,120 @@ func (p *ProcessUsageMemory) Unmarshal(pp *app.ProcessUsageMemory) {
func (p *ProcessUsageMemory) Marshal() app.ProcessUsageMemory {
pp := app.ProcessUsageMemory{
Current: p.Current,
Average: p.Average,
Max: p.Max,
Limit: p.Limit,
}
return pp
}
// ProcessUsageGPUMemory reflects the GPU memory usage of a process in bytes.
type ProcessUsageGPUMemory struct {
	Current uint64 `json:"cur" format:"uint64"`
	Average uint64 `json:"avg" format:"uint64"`
	Max     uint64 `json:"max" format:"uint64"`
	Limit   uint64 `json:"limit" format:"uint64"`
}

// Unmarshal copies the GPU memory usage from its app counterpart.
func (p *ProcessUsageGPUMemory) Unmarshal(pp *app.ProcessUsageGPUMemory) {
	*p = ProcessUsageGPUMemory{
		Current: pp.Current,
		Average: pp.Average,
		Max:     pp.Max,
		Limit:   pp.Limit,
	}
}

// Marshal converts the GPU memory usage to its app counterpart.
func (p *ProcessUsageGPUMemory) Marshal() app.ProcessUsageGPUMemory {
	return app.ProcessUsageGPUMemory{
		Current: p.Current,
		Average: p.Average,
		Max:     p.Max,
		Limit:   p.Limit,
	}
}
// ProcessUsageGPUUsage reflects a GPU usage metric of a process in percent,
// carried as json.Number on the API.
type ProcessUsageGPUUsage struct {
	Current json.Number `json:"cur" swaggertype:"number" jsonschema:"type=number"`
	Average json.Number `json:"avg" swaggertype:"number" jsonschema:"type=number"`
	Max     json.Number `json:"max" swaggertype:"number" jsonschema:"type=number"`
	Limit   json.Number `json:"limit" swaggertype:"number" jsonschema:"type=number"`
}

// Unmarshal copies the GPU usage metric from its app counterpart.
func (p *ProcessUsageGPUUsage) Unmarshal(pp *app.ProcessUsageGPUUsage) {
	*p = ProcessUsageGPUUsage{
		Current: json.ToNumber(pp.Current),
		Average: json.ToNumber(pp.Average),
		Max:     json.ToNumber(pp.Max),
		Limit:   json.ToNumber(pp.Limit),
	}
}

// Marshal converts the GPU usage metric to its app counterpart. A value that
// fails to parse stays at its zero value.
func (p *ProcessUsageGPUUsage) Marshal() app.ProcessUsageGPUUsage {
	toFloat := func(n json.Number) float64 {
		x, err := n.Float64()
		if err != nil {
			return 0
		}
		return x
	}

	return app.ProcessUsageGPUUsage{
		Current: toFloat(p.Current),
		Average: toFloat(p.Average),
		Max:     toFloat(p.Max),
		Limit:   toFloat(p.Limit),
	}
}
// ProcessUsageGPU reflects the complete GPU usage of a process: which GPU it
// runs on plus memory and the three usage metrics.
type ProcessUsageGPU struct {
	Index   int                   `json:"index"`
	Memory  ProcessUsageGPUMemory `json:"memory_bytes"`
	Usage   ProcessUsageGPUUsage  `json:"usage"`
	Encoder ProcessUsageGPUUsage  `json:"encoder"`
	Decoder ProcessUsageGPUUsage  `json:"decoder"`
}

// Unmarshal copies the GPU usage from its app counterpart.
func (p *ProcessUsageGPU) Unmarshal(pp *app.ProcessUsageGPU) {
	p.Index = pp.Index
	p.Memory.Unmarshal(&pp.Memory)
	p.Usage.Unmarshal(&pp.Usage)
	p.Encoder.Unmarshal(&pp.Encoder)
	p.Decoder.Unmarshal(&pp.Decoder)
}

// Marshal converts the GPU usage to its app counterpart.
func (p *ProcessUsageGPU) Marshal() app.ProcessUsageGPU {
	return app.ProcessUsageGPU{
		Index:   p.Index,
		Memory:  p.Memory.Marshal(),
		Usage:   p.Usage.Marshal(),
		Encoder: p.Encoder.Marshal(),
		Decoder: p.Decoder.Marshal(),
	}
}
// ProcessUsage bundles the CPU, memory, and GPU usage of a process.
type ProcessUsage struct {
	CPU    ProcessUsageCPU    `json:"cpu_usage"`
	Memory ProcessUsageMemory `json:"memory_bytes"`
	GPU    ProcessUsageGPU    `json:"gpu"`
}

// Unmarshal copies the usage values from their app counterparts.
func (p *ProcessUsage) Unmarshal(pp *app.ProcessUsage) {
	p.CPU.Unmarshal(&pp.CPU)
	p.Memory.Unmarshal(&pp.Memory)
	p.GPU.Unmarshal(&pp.GPU)
}
func (p *ProcessUsage) Marshal() app.ProcessUsage {
pp := app.ProcessUsage{
CPU: p.CPU.Marshal(),
Memory: p.Memory.Marshal(),
GPU: p.GPU.Marshal(),
}
return pp

View File

@ -56,6 +56,33 @@ func TestProcessUsage(t *testing.T) {
Max: 150,
Limit: 200,
},
GPU: app.ProcessUsageGPU{
Index: 3,
Memory: app.ProcessUsageGPUMemory{
Current: 48,
Average: 43,
Max: 88,
Limit: 34,
},
Usage: app.ProcessUsageGPUUsage{
Current: 47,
Average: 22,
Max: 90,
Limit: 80,
},
Encoder: app.ProcessUsageGPUUsage{
Current: 48,
Average: 46,
Max: 74,
Limit: 46,
},
Decoder: app.ProcessUsageGPUUsage{
Current: 21,
Average: 42,
Max: 30,
Limit: 99,
},
},
}
p := ProcessUsage{}
@ -103,7 +130,13 @@ func TestProcessConfig(t *testing.T) {
LogPatterns: []string{"bla", "blubb"},
LimitCPU: 10,
LimitMemory: 100 * 1024 * 1024,
LimitWaitFor: 20,
LimitGPU: app.ConfigLimitGPU{
Usage: 50,
Encoder: 90,
Decoder: 80,
Memory: 24 * 1024 * 1024 * 1024,
},
LimitWaitFor: 20,
}
p := ProcessConfig{}

3
internal/.gitignore vendored
View File

@ -2,4 +2,5 @@ testhelper/ignoresigint/ignoresigint
testhelper/sigint/sigint
testhelper/sigintwait/sigintwait
testhelper/sigpropagate/sigpropagate
testhelper/ffmpeg/ffmpeg
testhelper/ffmpeg/ffmpeg
testhelper/nvidia-smi/nvidia-smi

View File

@ -0,0 +1,973 @@
package main
import (
"context"
"fmt"
"os"
"os/signal"
"time"
)
var pmondata = `# gpu pid type sm mem enc dec fb command
# Idx # C/G % % % % MB name
0 7372 C 2 0 2 - 136 ffmpeg
0 12176 C 5 2 3 7 782 ffmpeg
1 20035 C 8 2 4 1 1145 ffmpeg
1 20141 C 2 1 1 3 429 ffmpeg
0 29591 C 2 1 - 2 435 ffmpeg `
var querydata = `<?xml version="1.0" ?>
<!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v12.dtd">
<nvidia_smi_log>
<timestamp>Mon Jul 15 13:41:56 2024</timestamp>
<driver_version>555.42.06</driver_version>
<cuda_version>12.5</cuda_version>
<attached_gpus>2</attached_gpus>
<gpu id="00000000:01:00.0">
<product_name>NVIDIA L4</product_name>
<product_brand>NVIDIA</product_brand>
<product_architecture>Ada Lovelace</product_architecture>
<display_mode>Enabled</display_mode>
<display_active>Disabled</display_active>
<persistence_mode>Disabled</persistence_mode>
<addressing_mode>None</addressing_mode>
<mig_mode>
<current_mig>N/A</current_mig>
<pending_mig>N/A</pending_mig>
</mig_mode>
<mig_devices>
None
</mig_devices>
<accounting_mode>Disabled</accounting_mode>
<accounting_mode_buffer_size>4000</accounting_mode_buffer_size>
<driver_model>
<current_dm>N/A</current_dm>
<pending_dm>N/A</pending_dm>
</driver_model>
<serial>1654523003308</serial>
<uuid>GPU-c5533cd4-5a60-059e-348d-b6d7466932e4</uuid>
<minor_number>1</minor_number>
<vbios_version>95.04.29.00.06</vbios_version>
<multigpu_board>No</multigpu_board>
<board_id>0x100</board_id>
<board_part_number>900-2G193-0000-001</board_part_number>
<gpu_part_number>27B8-895-A1</gpu_part_number>
<gpu_fru_part_number>N/A</gpu_fru_part_number>
<gpu_module_id>1</gpu_module_id>
<inforom_version>
<img_version>G193.0200.00.01</img_version>
<oem_object>2.1</oem_object>
<ecc_object>6.16</ecc_object>
<pwr_object>N/A</pwr_object>
</inforom_version>
<inforom_bbx_flush>
<latest_timestamp>N/A</latest_timestamp>
<latest_duration>N/A</latest_duration>
</inforom_bbx_flush>
<gpu_operation_mode>
<current_gom>N/A</current_gom>
<pending_gom>N/A</pending_gom>
</gpu_operation_mode>
<c2c_mode>N/A</c2c_mode>
<gpu_virtualization_mode>
<virtualization_mode>None</virtualization_mode>
<host_vgpu_mode>N/A</host_vgpu_mode>
<vgpu_heterogeneous_mode>N/A</vgpu_heterogeneous_mode>
</gpu_virtualization_mode>
<gpu_reset_status>
<reset_required>No</reset_required>
<drain_and_reset_recommended>N/A</drain_and_reset_recommended>
</gpu_reset_status>
<gsp_firmware_version>555.42.06</gsp_firmware_version>
<ibmnpu>
<relaxed_ordering_mode>N/A</relaxed_ordering_mode>
</ibmnpu>
<pci>
<pci_bus>01</pci_bus>
<pci_device>00</pci_device>
<pci_domain>0000</pci_domain>
<pci_base_class>3</pci_base_class>
<pci_sub_class>2</pci_sub_class>
<pci_device_id>27B810DE</pci_device_id>
<pci_bus_id>00000000:01:00.0</pci_bus_id>
<pci_sub_system_id>16CA10DE</pci_sub_system_id>
<pci_gpu_link_info>
<pcie_gen>
<max_link_gen>4</max_link_gen>
<current_link_gen>4</current_link_gen>
<device_current_link_gen>4</device_current_link_gen>
<max_device_link_gen>4</max_device_link_gen>
<max_host_link_gen>5</max_host_link_gen>
</pcie_gen>
<link_widths>
<max_link_width>16x</max_link_width>
<current_link_width>16x</current_link_width>
</link_widths>
</pci_gpu_link_info>
<pci_bridge_chip>
<bridge_chip_type>N/A</bridge_chip_type>
<bridge_chip_fw>N/A</bridge_chip_fw>
</pci_bridge_chip>
<replay_counter>0</replay_counter>
<replay_rollover_counter>0</replay_rollover_counter>
<tx_util>0 KB/s</tx_util>
<rx_util>0 KB/s</rx_util>
<atomic_caps_inbound>N/A</atomic_caps_inbound>
<atomic_caps_outbound>N/A</atomic_caps_outbound>
</pci>
<fan_speed>N/A</fan_speed>
<performance_state>P0</performance_state>
<clocks_event_reasons>
<clocks_event_reason_gpu_idle>Active</clocks_event_reason_gpu_idle>
<clocks_event_reason_applications_clocks_setting>Not Active</clocks_event_reason_applications_clocks_setting>
<clocks_event_reason_sw_power_cap>Not Active</clocks_event_reason_sw_power_cap>
<clocks_event_reason_hw_slowdown>Not Active</clocks_event_reason_hw_slowdown>
<clocks_event_reason_hw_thermal_slowdown>Not Active</clocks_event_reason_hw_thermal_slowdown>
<clocks_event_reason_hw_power_brake_slowdown>Not Active</clocks_event_reason_hw_power_brake_slowdown>
<clocks_event_reason_sync_boost>Not Active</clocks_event_reason_sync_boost>
<clocks_event_reason_sw_thermal_slowdown>Not Active</clocks_event_reason_sw_thermal_slowdown>
<clocks_event_reason_display_clocks_setting>Not Active</clocks_event_reason_display_clocks_setting>
</clocks_event_reasons>
<sparse_operation_mode>N/A</sparse_operation_mode>
<fb_memory_usage>
<total>23034 MiB</total>
<reserved>434 MiB</reserved>
<used>1 MiB</used>
<free>22601 MiB</free>
</fb_memory_usage>
<bar1_memory_usage>
<total>32768 MiB</total>
<used>1 MiB</used>
<free>32767 MiB</free>
</bar1_memory_usage>
<cc_protected_memory_usage>
<total>0 MiB</total>
<used>0 MiB</used>
<free>0 MiB</free>
</cc_protected_memory_usage>
<compute_mode>Default</compute_mode>
<utilization>
<gpu_util>2 %</gpu_util>
<memory_util>0 %</memory_util>
<encoder_util>0 %</encoder_util>
<decoder_util>0 %</decoder_util>
<jpeg_util>0 %</jpeg_util>
<ofa_util>0 %</ofa_util>
</utilization>
<encoder_stats>
<session_count>0</session_count>
<average_fps>0</average_fps>
<average_latency>0</average_latency>
</encoder_stats>
<fbc_stats>
<session_count>0</session_count>
<average_fps>0</average_fps>
<average_latency>0</average_latency>
</fbc_stats>
<ecc_mode>
<current_ecc>Enabled</current_ecc>
<pending_ecc>Enabled</pending_ecc>
</ecc_mode>
<ecc_errors>
<volatile>
<sram_correctable>0</sram_correctable>
<sram_uncorrectable_parity>0</sram_uncorrectable_parity>
<sram_uncorrectable_secded>0</sram_uncorrectable_secded>
<dram_correctable>0</dram_correctable>
<dram_uncorrectable>0</dram_uncorrectable>
</volatile>
<aggregate>
<sram_correctable>0</sram_correctable>
<sram_uncorrectable_parity>0</sram_uncorrectable_parity>
<sram_uncorrectable_secded>0</sram_uncorrectable_secded>
<dram_correctable>0</dram_correctable>
<dram_uncorrectable>0</dram_uncorrectable>
<sram_threshold_exceeded>No</sram_threshold_exceeded>
</aggregate>
<aggregate_uncorrectable_sram_sources>
<sram_l2>0</sram_l2>
<sram_sm>0</sram_sm>
<sram_microcontroller>0</sram_microcontroller>
<sram_pcie>0</sram_pcie>
<sram_other>0</sram_other>
</aggregate_uncorrectable_sram_sources>
</ecc_errors>
<retired_pages>
<multiple_single_bit_retirement>
<retired_count>N/A</retired_count>
<retired_pagelist>N/A</retired_pagelist>
</multiple_single_bit_retirement>
<double_bit_retirement>
<retired_count>N/A</retired_count>
<retired_pagelist>N/A</retired_pagelist>
</double_bit_retirement>
<pending_blacklist>N/A</pending_blacklist>
<pending_retirement>N/A</pending_retirement>
</retired_pages>
<remapped_rows>
<remapped_row_corr>0</remapped_row_corr>
<remapped_row_unc>0</remapped_row_unc>
<remapped_row_pending>No</remapped_row_pending>
<remapped_row_failure>No</remapped_row_failure>
<row_remapper_histogram>
<row_remapper_histogram_max>96 bank(s)</row_remapper_histogram_max>
<row_remapper_histogram_high>0 bank(s)</row_remapper_histogram_high>
<row_remapper_histogram_partial>0 bank(s)</row_remapper_histogram_partial>
<row_remapper_histogram_low>0 bank(s)</row_remapper_histogram_low>
<row_remapper_histogram_none>0 bank(s)</row_remapper_histogram_none>
</row_remapper_histogram>
</remapped_rows>
<temperature>
<gpu_temp>45 C</gpu_temp>
<gpu_temp_tlimit>39 C</gpu_temp_tlimit>
<gpu_temp_max_tlimit_threshold>-5 C</gpu_temp_max_tlimit_threshold>
<gpu_temp_slow_tlimit_threshold>-2 C</gpu_temp_slow_tlimit_threshold>
<gpu_temp_max_gpu_tlimit_threshold>0 C</gpu_temp_max_gpu_tlimit_threshold>
<gpu_target_temperature>N/A</gpu_target_temperature>
<memory_temp>N/A</memory_temp>
<gpu_temp_max_mem_tlimit_threshold>N/A</gpu_temp_max_mem_tlimit_threshold>
</temperature>
<supported_gpu_target_temp>
<gpu_target_temp_min>N/A</gpu_target_temp_min>
<gpu_target_temp_max>N/A</gpu_target_temp_max>
</supported_gpu_target_temp>
<gpu_power_readings>
<power_state>P0</power_state>
<power_draw>27.22 W</power_draw>
<current_power_limit>72.00 W</current_power_limit>
<requested_power_limit>72.00 W</requested_power_limit>
<default_power_limit>72.00 W</default_power_limit>
<min_power_limit>40.00 W</min_power_limit>
<max_power_limit>72.00 W</max_power_limit>
</gpu_power_readings>
<gpu_memory_power_readings>
<power_draw>N/A</power_draw>
</gpu_memory_power_readings>
<module_power_readings>
<power_state>P0</power_state>
<power_draw>N/A</power_draw>
<current_power_limit>N/A</current_power_limit>
<requested_power_limit>N/A</requested_power_limit>
<default_power_limit>N/A</default_power_limit>
<min_power_limit>N/A</min_power_limit>
<max_power_limit>N/A</max_power_limit>
</module_power_readings>
<clocks>
<graphics_clock>2040 MHz</graphics_clock>
<sm_clock>2040 MHz</sm_clock>
<mem_clock>6250 MHz</mem_clock>
<video_clock>1770 MHz</video_clock>
</clocks>
<applications_clocks>
<graphics_clock>2040 MHz</graphics_clock>
<mem_clock>6251 MHz</mem_clock>
</applications_clocks>
<default_applications_clocks>
<graphics_clock>2040 MHz</graphics_clock>
<mem_clock>6251 MHz</mem_clock>
</default_applications_clocks>
<deferred_clocks>
<mem_clock>N/A</mem_clock>
</deferred_clocks>
<max_clocks>
<graphics_clock>2040 MHz</graphics_clock>
<sm_clock>2040 MHz</sm_clock>
<mem_clock>6251 MHz</mem_clock>
<video_clock>1770 MHz</video_clock>
</max_clocks>
<max_customer_boost_clocks>
<graphics_clock>2040 MHz</graphics_clock>
</max_customer_boost_clocks>
<clock_policy>
<auto_boost>N/A</auto_boost>
<auto_boost_default>N/A</auto_boost_default>
</clock_policy>
<voltage>
<graphics_volt>885.000 mV</graphics_volt>
</voltage>
<fabric>
<state>N/A</state>
<status>N/A</status>
<cliqueId>N/A</cliqueId>
<clusterUuid>N/A</clusterUuid>
<health>
<bandwidth>N/A</bandwidth>
</health>
</fabric>
<supported_clocks>
<supported_mem_clock>
<value>6251 MHz</value>
<supported_graphics_clock>2040 MHz</supported_graphics_clock>
<supported_graphics_clock>2025 MHz</supported_graphics_clock>
<supported_graphics_clock>2010 MHz</supported_graphics_clock>
<supported_graphics_clock>1995 MHz</supported_graphics_clock>
<supported_graphics_clock>1980 MHz</supported_graphics_clock>
<supported_graphics_clock>1965 MHz</supported_graphics_clock>
<supported_graphics_clock>1950 MHz</supported_graphics_clock>
<supported_graphics_clock>1935 MHz</supported_graphics_clock>
<supported_graphics_clock>1920 MHz</supported_graphics_clock>
<supported_graphics_clock>1905 MHz</supported_graphics_clock>
<supported_graphics_clock>1890 MHz</supported_graphics_clock>
<supported_graphics_clock>1875 MHz</supported_graphics_clock>
<supported_graphics_clock>1860 MHz</supported_graphics_clock>
<supported_graphics_clock>1845 MHz</supported_graphics_clock>
<supported_graphics_clock>1830 MHz</supported_graphics_clock>
<supported_graphics_clock>1815 MHz</supported_graphics_clock>
<supported_graphics_clock>1800 MHz</supported_graphics_clock>
<supported_graphics_clock>1785 MHz</supported_graphics_clock>
<supported_graphics_clock>1770 MHz</supported_graphics_clock>
<supported_graphics_clock>1755 MHz</supported_graphics_clock>
<supported_graphics_clock>1740 MHz</supported_graphics_clock>
<supported_graphics_clock>1725 MHz</supported_graphics_clock>
<supported_graphics_clock>1710 MHz</supported_graphics_clock>
<supported_graphics_clock>1695 MHz</supported_graphics_clock>
<supported_graphics_clock>1680 MHz</supported_graphics_clock>
<supported_graphics_clock>1665 MHz</supported_graphics_clock>
<supported_graphics_clock>1650 MHz</supported_graphics_clock>
<supported_graphics_clock>1635 MHz</supported_graphics_clock>
<supported_graphics_clock>1620 MHz</supported_graphics_clock>
<supported_graphics_clock>1605 MHz</supported_graphics_clock>
<supported_graphics_clock>1590 MHz</supported_graphics_clock>
<supported_graphics_clock>1575 MHz</supported_graphics_clock>
<supported_graphics_clock>1560 MHz</supported_graphics_clock>
<supported_graphics_clock>1545 MHz</supported_graphics_clock>
<supported_graphics_clock>1530 MHz</supported_graphics_clock>
<supported_graphics_clock>1515 MHz</supported_graphics_clock>
<supported_graphics_clock>1500 MHz</supported_graphics_clock>
<supported_graphics_clock>1485 MHz</supported_graphics_clock>
<supported_graphics_clock>1470 MHz</supported_graphics_clock>
<supported_graphics_clock>1455 MHz</supported_graphics_clock>
<supported_graphics_clock>1440 MHz</supported_graphics_clock>
<supported_graphics_clock>1425 MHz</supported_graphics_clock>
<supported_graphics_clock>1410 MHz</supported_graphics_clock>
<supported_graphics_clock>1395 MHz</supported_graphics_clock>
<supported_graphics_clock>1380 MHz</supported_graphics_clock>
<supported_graphics_clock>1365 MHz</supported_graphics_clock>
<supported_graphics_clock>1350 MHz</supported_graphics_clock>
<supported_graphics_clock>1335 MHz</supported_graphics_clock>
<supported_graphics_clock>1320 MHz</supported_graphics_clock>
<supported_graphics_clock>1305 MHz</supported_graphics_clock>
<supported_graphics_clock>1290 MHz</supported_graphics_clock>
<supported_graphics_clock>1275 MHz</supported_graphics_clock>
<supported_graphics_clock>1260 MHz</supported_graphics_clock>
<supported_graphics_clock>1245 MHz</supported_graphics_clock>
<supported_graphics_clock>1230 MHz</supported_graphics_clock>
<supported_graphics_clock>1215 MHz</supported_graphics_clock>
<supported_graphics_clock>1200 MHz</supported_graphics_clock>
<supported_graphics_clock>1185 MHz</supported_graphics_clock>
<supported_graphics_clock>1170 MHz</supported_graphics_clock>
<supported_graphics_clock>1155 MHz</supported_graphics_clock>
<supported_graphics_clock>1140 MHz</supported_graphics_clock>
<supported_graphics_clock>1125 MHz</supported_graphics_clock>
<supported_graphics_clock>1110 MHz</supported_graphics_clock>
<supported_graphics_clock>1095 MHz</supported_graphics_clock>
<supported_graphics_clock>1080 MHz</supported_graphics_clock>
<supported_graphics_clock>1065 MHz</supported_graphics_clock>
<supported_graphics_clock>1050 MHz</supported_graphics_clock>
<supported_graphics_clock>1035 MHz</supported_graphics_clock>
<supported_graphics_clock>1020 MHz</supported_graphics_clock>
<supported_graphics_clock>1005 MHz</supported_graphics_clock>
<supported_graphics_clock>990 MHz</supported_graphics_clock>
<supported_graphics_clock>975 MHz</supported_graphics_clock>
<supported_graphics_clock>960 MHz</supported_graphics_clock>
<supported_graphics_clock>945 MHz</supported_graphics_clock>
<supported_graphics_clock>930 MHz</supported_graphics_clock>
<supported_graphics_clock>915 MHz</supported_graphics_clock>
<supported_graphics_clock>900 MHz</supported_graphics_clock>
<supported_graphics_clock>885 MHz</supported_graphics_clock>
<supported_graphics_clock>870 MHz</supported_graphics_clock>
<supported_graphics_clock>855 MHz</supported_graphics_clock>
<supported_graphics_clock>840 MHz</supported_graphics_clock>
<supported_graphics_clock>825 MHz</supported_graphics_clock>
<supported_graphics_clock>810 MHz</supported_graphics_clock>
<supported_graphics_clock>795 MHz</supported_graphics_clock>
<supported_graphics_clock>780 MHz</supported_graphics_clock>
<supported_graphics_clock>765 MHz</supported_graphics_clock>
<supported_graphics_clock>750 MHz</supported_graphics_clock>
<supported_graphics_clock>735 MHz</supported_graphics_clock>
<supported_graphics_clock>720 MHz</supported_graphics_clock>
<supported_graphics_clock>705 MHz</supported_graphics_clock>
<supported_graphics_clock>690 MHz</supported_graphics_clock>
<supported_graphics_clock>675 MHz</supported_graphics_clock>
<supported_graphics_clock>660 MHz</supported_graphics_clock>
<supported_graphics_clock>645 MHz</supported_graphics_clock>
<supported_graphics_clock>630 MHz</supported_graphics_clock>
<supported_graphics_clock>615 MHz</supported_graphics_clock>
<supported_graphics_clock>600 MHz</supported_graphics_clock>
<supported_graphics_clock>585 MHz</supported_graphics_clock>
<supported_graphics_clock>570 MHz</supported_graphics_clock>
<supported_graphics_clock>555 MHz</supported_graphics_clock>
<supported_graphics_clock>540 MHz</supported_graphics_clock>
<supported_graphics_clock>525 MHz</supported_graphics_clock>
<supported_graphics_clock>510 MHz</supported_graphics_clock>
<supported_graphics_clock>495 MHz</supported_graphics_clock>
<supported_graphics_clock>480 MHz</supported_graphics_clock>
<supported_graphics_clock>465 MHz</supported_graphics_clock>
<supported_graphics_clock>450 MHz</supported_graphics_clock>
<supported_graphics_clock>435 MHz</supported_graphics_clock>
<supported_graphics_clock>420 MHz</supported_graphics_clock>
<supported_graphics_clock>405 MHz</supported_graphics_clock>
<supported_graphics_clock>390 MHz</supported_graphics_clock>
<supported_graphics_clock>375 MHz</supported_graphics_clock>
<supported_graphics_clock>360 MHz</supported_graphics_clock>
<supported_graphics_clock>345 MHz</supported_graphics_clock>
<supported_graphics_clock>330 MHz</supported_graphics_clock>
<supported_graphics_clock>315 MHz</supported_graphics_clock>
<supported_graphics_clock>300 MHz</supported_graphics_clock>
<supported_graphics_clock>285 MHz</supported_graphics_clock>
<supported_graphics_clock>270 MHz</supported_graphics_clock>
<supported_graphics_clock>255 MHz</supported_graphics_clock>
<supported_graphics_clock>240 MHz</supported_graphics_clock>
<supported_graphics_clock>225 MHz</supported_graphics_clock>
<supported_graphics_clock>210 MHz</supported_graphics_clock>
</supported_mem_clock>
<supported_mem_clock>
<value>405 MHz</value>
<supported_graphics_clock>645 MHz</supported_graphics_clock>
<supported_graphics_clock>630 MHz</supported_graphics_clock>
<supported_graphics_clock>615 MHz</supported_graphics_clock>
<supported_graphics_clock>600 MHz</supported_graphics_clock>
<supported_graphics_clock>585 MHz</supported_graphics_clock>
<supported_graphics_clock>570 MHz</supported_graphics_clock>
<supported_graphics_clock>555 MHz</supported_graphics_clock>
<supported_graphics_clock>540 MHz</supported_graphics_clock>
<supported_graphics_clock>525 MHz</supported_graphics_clock>
<supported_graphics_clock>510 MHz</supported_graphics_clock>
<supported_graphics_clock>495 MHz</supported_graphics_clock>
<supported_graphics_clock>480 MHz</supported_graphics_clock>
<supported_graphics_clock>465 MHz</supported_graphics_clock>
<supported_graphics_clock>450 MHz</supported_graphics_clock>
<supported_graphics_clock>435 MHz</supported_graphics_clock>
<supported_graphics_clock>420 MHz</supported_graphics_clock>
<supported_graphics_clock>405 MHz</supported_graphics_clock>
<supported_graphics_clock>390 MHz</supported_graphics_clock>
<supported_graphics_clock>375 MHz</supported_graphics_clock>
<supported_graphics_clock>360 MHz</supported_graphics_clock>
<supported_graphics_clock>345 MHz</supported_graphics_clock>
<supported_graphics_clock>330 MHz</supported_graphics_clock>
<supported_graphics_clock>315 MHz</supported_graphics_clock>
<supported_graphics_clock>300 MHz</supported_graphics_clock>
<supported_graphics_clock>285 MHz</supported_graphics_clock>
<supported_graphics_clock>270 MHz</supported_graphics_clock>
<supported_graphics_clock>255 MHz</supported_graphics_clock>
<supported_graphics_clock>240 MHz</supported_graphics_clock>
<supported_graphics_clock>225 MHz</supported_graphics_clock>
<supported_graphics_clock>210 MHz</supported_graphics_clock>
</supported_mem_clock>
</supported_clocks>
<processes>
<process_info>
<pid>10131</pid>
<type>C</type>
<process_name>ffmpeg</process_name>
<used_memory>389 MiB</used_memory>
</process_info>
<process_info>
<pid>13597</pid>
<type>C</type>
<process_name>ffmpeg</process_name>
<used_memory>1054 MiB</used_memory>
</process_info>
</processes>
<accounted_processes>
</accounted_processes>
<capabilities>
<egm>disabled</egm>
</capabilities>
</gpu>
<gpu id="00000000:C1:00.0">
<product_name>NVIDIA L4</product_name>
<product_brand>NVIDIA</product_brand>
<product_architecture>Ada Lovelace</product_architecture>
<display_mode>Enabled</display_mode>
<display_active>Disabled</display_active>
<persistence_mode>Disabled</persistence_mode>
<addressing_mode>None</addressing_mode>
<mig_mode>
<current_mig>N/A</current_mig>
<pending_mig>N/A</pending_mig>
</mig_mode>
<mig_devices>
None
</mig_devices>
<accounting_mode>Disabled</accounting_mode>
<accounting_mode_buffer_size>4000</accounting_mode_buffer_size>
<driver_model>
<current_dm>N/A</current_dm>
<pending_dm>N/A</pending_dm>
</driver_model>
<serial>1654523001128</serial>
<uuid>GPU-128ab6fb-6ec9-fd74-b479-4a5fd14f55bd</uuid>
<minor_number>0</minor_number>
<vbios_version>95.04.29.00.06</vbios_version>
<multigpu_board>No</multigpu_board>
<board_id>0xc100</board_id>
<board_part_number>900-2G193-0000-001</board_part_number>
<gpu_part_number>27B8-895-A1</gpu_part_number>
<gpu_fru_part_number>N/A</gpu_fru_part_number>
<gpu_module_id>1</gpu_module_id>
<inforom_version>
<img_version>G193.0200.00.01</img_version>
<oem_object>2.1</oem_object>
<ecc_object>6.16</ecc_object>
<pwr_object>N/A</pwr_object>
</inforom_version>
<inforom_bbx_flush>
<latest_timestamp>N/A</latest_timestamp>
<latest_duration>N/A</latest_duration>
</inforom_bbx_flush>
<gpu_operation_mode>
<current_gom>N/A</current_gom>
<pending_gom>N/A</pending_gom>
</gpu_operation_mode>
<c2c_mode>N/A</c2c_mode>
<gpu_virtualization_mode>
<virtualization_mode>None</virtualization_mode>
<host_vgpu_mode>N/A</host_vgpu_mode>
<vgpu_heterogeneous_mode>N/A</vgpu_heterogeneous_mode>
</gpu_virtualization_mode>
<gpu_reset_status>
<reset_required>No</reset_required>
<drain_and_reset_recommended>N/A</drain_and_reset_recommended>
</gpu_reset_status>
<gsp_firmware_version>555.42.06</gsp_firmware_version>
<ibmnpu>
<relaxed_ordering_mode>N/A</relaxed_ordering_mode>
</ibmnpu>
<pci>
<pci_bus>C1</pci_bus>
<pci_device>00</pci_device>
<pci_domain>0000</pci_domain>
<pci_base_class>3</pci_base_class>
<pci_sub_class>2</pci_sub_class>
<pci_device_id>27B810DE</pci_device_id>
<pci_bus_id>00000000:C1:00.0</pci_bus_id>
<pci_sub_system_id>16CA10DE</pci_sub_system_id>
<pci_gpu_link_info>
<pcie_gen>
<max_link_gen>4</max_link_gen>
<current_link_gen>4</current_link_gen>
<device_current_link_gen>4</device_current_link_gen>
<max_device_link_gen>4</max_device_link_gen>
<max_host_link_gen>5</max_host_link_gen>
</pcie_gen>
<link_widths>
<max_link_width>16x</max_link_width>
<current_link_width>1x</current_link_width>
</link_widths>
</pci_gpu_link_info>
<pci_bridge_chip>
<bridge_chip_type>N/A</bridge_chip_type>
<bridge_chip_fw>N/A</bridge_chip_fw>
</pci_bridge_chip>
<replay_counter>0</replay_counter>
<replay_rollover_counter>0</replay_rollover_counter>
<tx_util>0 KB/s</tx_util>
<rx_util>0 KB/s</rx_util>
<atomic_caps_inbound>N/A</atomic_caps_inbound>
<atomic_caps_outbound>N/A</atomic_caps_outbound>
</pci>
<fan_speed>N/A</fan_speed>
<performance_state>P0</performance_state>
<clocks_event_reasons>
<clocks_event_reason_gpu_idle>Active</clocks_event_reason_gpu_idle>
<clocks_event_reason_applications_clocks_setting>Not Active</clocks_event_reason_applications_clocks_setting>
<clocks_event_reason_sw_power_cap>Not Active</clocks_event_reason_sw_power_cap>
<clocks_event_reason_hw_slowdown>Not Active</clocks_event_reason_hw_slowdown>
<clocks_event_reason_hw_thermal_slowdown>Not Active</clocks_event_reason_hw_thermal_slowdown>
<clocks_event_reason_hw_power_brake_slowdown>Not Active</clocks_event_reason_hw_power_brake_slowdown>
<clocks_event_reason_sync_boost>Not Active</clocks_event_reason_sync_boost>
<clocks_event_reason_sw_thermal_slowdown>Not Active</clocks_event_reason_sw_thermal_slowdown>
<clocks_event_reason_display_clocks_setting>Not Active</clocks_event_reason_display_clocks_setting>
</clocks_event_reasons>
<sparse_operation_mode>N/A</sparse_operation_mode>
<fb_memory_usage>
<total>23034 MiB</total>
<reserved>434 MiB</reserved>
<used>1 MiB</used>
<free>22601 MiB</free>
</fb_memory_usage>
<bar1_memory_usage>
<total>32768 MiB</total>
<used>1 MiB</used>
<free>32767 MiB</free>
</bar1_memory_usage>
<cc_protected_memory_usage>
<total>0 MiB</total>
<used>0 MiB</used>
<free>0 MiB</free>
</cc_protected_memory_usage>
<compute_mode>Default</compute_mode>
<utilization>
<gpu_util>3 %</gpu_util>
<memory_util>0 %</memory_util>
<encoder_util>0 %</encoder_util>
<decoder_util>0 %</decoder_util>
<jpeg_util>0 %</jpeg_util>
<ofa_util>0 %</ofa_util>
</utilization>
<encoder_stats>
<session_count>0</session_count>
<average_fps>0</average_fps>
<average_latency>0</average_latency>
</encoder_stats>
<fbc_stats>
<session_count>0</session_count>
<average_fps>0</average_fps>
<average_latency>0</average_latency>
</fbc_stats>
<ecc_mode>
<current_ecc>Enabled</current_ecc>
<pending_ecc>Enabled</pending_ecc>
</ecc_mode>
<ecc_errors>
<volatile>
<sram_correctable>0</sram_correctable>
<sram_uncorrectable_parity>0</sram_uncorrectable_parity>
<sram_uncorrectable_secded>0</sram_uncorrectable_secded>
<dram_correctable>0</dram_correctable>
<dram_uncorrectable>0</dram_uncorrectable>
</volatile>
<aggregate>
<sram_correctable>0</sram_correctable>
<sram_uncorrectable_parity>0</sram_uncorrectable_parity>
<sram_uncorrectable_secded>0</sram_uncorrectable_secded>
<dram_correctable>0</dram_correctable>
<dram_uncorrectable>0</dram_uncorrectable>
<sram_threshold_exceeded>No</sram_threshold_exceeded>
</aggregate>
<aggregate_uncorrectable_sram_sources>
<sram_l2>0</sram_l2>
<sram_sm>0</sram_sm>
<sram_microcontroller>0</sram_microcontroller>
<sram_pcie>0</sram_pcie>
<sram_other>0</sram_other>
</aggregate_uncorrectable_sram_sources>
</ecc_errors>
<retired_pages>
<multiple_single_bit_retirement>
<retired_count>N/A</retired_count>
<retired_pagelist>N/A</retired_pagelist>
</multiple_single_bit_retirement>
<double_bit_retirement>
<retired_count>N/A</retired_count>
<retired_pagelist>N/A</retired_pagelist>
</double_bit_retirement>
<pending_blacklist>N/A</pending_blacklist>
<pending_retirement>N/A</pending_retirement>
</retired_pages>
<remapped_rows>
<remapped_row_corr>0</remapped_row_corr>
<remapped_row_unc>0</remapped_row_unc>
<remapped_row_pending>No</remapped_row_pending>
<remapped_row_failure>No</remapped_row_failure>
<row_remapper_histogram>
<row_remapper_histogram_max>96 bank(s)</row_remapper_histogram_max>
<row_remapper_histogram_high>0 bank(s)</row_remapper_histogram_high>
<row_remapper_histogram_partial>0 bank(s)</row_remapper_histogram_partial>
<row_remapper_histogram_low>0 bank(s)</row_remapper_histogram_low>
<row_remapper_histogram_none>0 bank(s)</row_remapper_histogram_none>
</row_remapper_histogram>
</remapped_rows>
<temperature>
<gpu_temp>40 C</gpu_temp>
<gpu_temp_tlimit>43 C</gpu_temp_tlimit>
<gpu_temp_max_tlimit_threshold>-5 C</gpu_temp_max_tlimit_threshold>
<gpu_temp_slow_tlimit_threshold>-2 C</gpu_temp_slow_tlimit_threshold>
<gpu_temp_max_gpu_tlimit_threshold>0 C</gpu_temp_max_gpu_tlimit_threshold>
<gpu_target_temperature>N/A</gpu_target_temperature>
<memory_temp>N/A</memory_temp>
<gpu_temp_max_mem_tlimit_threshold>N/A</gpu_temp_max_mem_tlimit_threshold>
</temperature>
<supported_gpu_target_temp>
<gpu_target_temp_min>N/A</gpu_target_temp_min>
<gpu_target_temp_max>N/A</gpu_target_temp_max>
</supported_gpu_target_temp>
<gpu_power_readings>
<power_state>P0</power_state>
<power_draw>29.54 W</power_draw>
<current_power_limit>72.00 W</current_power_limit>
<requested_power_limit>72.00 W</requested_power_limit>
<default_power_limit>72.00 W</default_power_limit>
<min_power_limit>40.00 W</min_power_limit>
<max_power_limit>72.00 W</max_power_limit>
</gpu_power_readings>
<gpu_memory_power_readings>
<power_draw>N/A</power_draw>
</gpu_memory_power_readings>
<module_power_readings>
<power_state>P0</power_state>
<power_draw>N/A</power_draw>
<current_power_limit>N/A</current_power_limit>
<requested_power_limit>N/A</requested_power_limit>
<default_power_limit>N/A</default_power_limit>
<min_power_limit>N/A</min_power_limit>
<max_power_limit>N/A</max_power_limit>
</module_power_readings>
<clocks>
<graphics_clock>2040 MHz</graphics_clock>
<sm_clock>2040 MHz</sm_clock>
<mem_clock>6250 MHz</mem_clock>
<video_clock>1770 MHz</video_clock>
</clocks>
<applications_clocks>
<graphics_clock>2040 MHz</graphics_clock>
<mem_clock>6251 MHz</mem_clock>
</applications_clocks>
<default_applications_clocks>
<graphics_clock>2040 MHz</graphics_clock>
<mem_clock>6251 MHz</mem_clock>
</default_applications_clocks>
<deferred_clocks>
<mem_clock>N/A</mem_clock>
</deferred_clocks>
<max_clocks>
<graphics_clock>2040 MHz</graphics_clock>
<sm_clock>2040 MHz</sm_clock>
<mem_clock>6251 MHz</mem_clock>
<video_clock>1770 MHz</video_clock>
</max_clocks>
<max_customer_boost_clocks>
<graphics_clock>2040 MHz</graphics_clock>
</max_customer_boost_clocks>
<clock_policy>
<auto_boost>N/A</auto_boost>
<auto_boost_default>N/A</auto_boost_default>
</clock_policy>
<voltage>
<graphics_volt>910.000 mV</graphics_volt>
</voltage>
<fabric>
<state>N/A</state>
<status>N/A</status>
<cliqueId>N/A</cliqueId>
<clusterUuid>N/A</clusterUuid>
<health>
<bandwidth>N/A</bandwidth>
</health>
</fabric>
<supported_clocks>
<supported_mem_clock>
<value>6251 MHz</value>
<supported_graphics_clock>2040 MHz</supported_graphics_clock>
<supported_graphics_clock>2025 MHz</supported_graphics_clock>
<supported_graphics_clock>2010 MHz</supported_graphics_clock>
<supported_graphics_clock>1995 MHz</supported_graphics_clock>
<supported_graphics_clock>1980 MHz</supported_graphics_clock>
<supported_graphics_clock>1965 MHz</supported_graphics_clock>
<supported_graphics_clock>1950 MHz</supported_graphics_clock>
<supported_graphics_clock>1935 MHz</supported_graphics_clock>
<supported_graphics_clock>1920 MHz</supported_graphics_clock>
<supported_graphics_clock>1905 MHz</supported_graphics_clock>
<supported_graphics_clock>1890 MHz</supported_graphics_clock>
<supported_graphics_clock>1875 MHz</supported_graphics_clock>
<supported_graphics_clock>1860 MHz</supported_graphics_clock>
<supported_graphics_clock>1845 MHz</supported_graphics_clock>
<supported_graphics_clock>1830 MHz</supported_graphics_clock>
<supported_graphics_clock>1815 MHz</supported_graphics_clock>
<supported_graphics_clock>1800 MHz</supported_graphics_clock>
<supported_graphics_clock>1785 MHz</supported_graphics_clock>
<supported_graphics_clock>1770 MHz</supported_graphics_clock>
<supported_graphics_clock>1755 MHz</supported_graphics_clock>
<supported_graphics_clock>1740 MHz</supported_graphics_clock>
<supported_graphics_clock>1725 MHz</supported_graphics_clock>
<supported_graphics_clock>1710 MHz</supported_graphics_clock>
<supported_graphics_clock>1695 MHz</supported_graphics_clock>
<supported_graphics_clock>1680 MHz</supported_graphics_clock>
<supported_graphics_clock>1665 MHz</supported_graphics_clock>
<supported_graphics_clock>1650 MHz</supported_graphics_clock>
<supported_graphics_clock>1635 MHz</supported_graphics_clock>
<supported_graphics_clock>1620 MHz</supported_graphics_clock>
<supported_graphics_clock>1605 MHz</supported_graphics_clock>
<supported_graphics_clock>1590 MHz</supported_graphics_clock>
<supported_graphics_clock>1575 MHz</supported_graphics_clock>
<supported_graphics_clock>1560 MHz</supported_graphics_clock>
<supported_graphics_clock>1545 MHz</supported_graphics_clock>
<supported_graphics_clock>1530 MHz</supported_graphics_clock>
<supported_graphics_clock>1515 MHz</supported_graphics_clock>
<supported_graphics_clock>1500 MHz</supported_graphics_clock>
<supported_graphics_clock>1485 MHz</supported_graphics_clock>
<supported_graphics_clock>1470 MHz</supported_graphics_clock>
<supported_graphics_clock>1455 MHz</supported_graphics_clock>
<supported_graphics_clock>1440 MHz</supported_graphics_clock>
<supported_graphics_clock>1425 MHz</supported_graphics_clock>
<supported_graphics_clock>1410 MHz</supported_graphics_clock>
<supported_graphics_clock>1395 MHz</supported_graphics_clock>
<supported_graphics_clock>1380 MHz</supported_graphics_clock>
<supported_graphics_clock>1365 MHz</supported_graphics_clock>
<supported_graphics_clock>1350 MHz</supported_graphics_clock>
<supported_graphics_clock>1335 MHz</supported_graphics_clock>
<supported_graphics_clock>1320 MHz</supported_graphics_clock>
<supported_graphics_clock>1305 MHz</supported_graphics_clock>
<supported_graphics_clock>1290 MHz</supported_graphics_clock>
<supported_graphics_clock>1275 MHz</supported_graphics_clock>
<supported_graphics_clock>1260 MHz</supported_graphics_clock>
<supported_graphics_clock>1245 MHz</supported_graphics_clock>
<supported_graphics_clock>1230 MHz</supported_graphics_clock>
<supported_graphics_clock>1215 MHz</supported_graphics_clock>
<supported_graphics_clock>1200 MHz</supported_graphics_clock>
<supported_graphics_clock>1185 MHz</supported_graphics_clock>
<supported_graphics_clock>1170 MHz</supported_graphics_clock>
<supported_graphics_clock>1155 MHz</supported_graphics_clock>
<supported_graphics_clock>1140 MHz</supported_graphics_clock>
<supported_graphics_clock>1125 MHz</supported_graphics_clock>
<supported_graphics_clock>1110 MHz</supported_graphics_clock>
<supported_graphics_clock>1095 MHz</supported_graphics_clock>
<supported_graphics_clock>1080 MHz</supported_graphics_clock>
<supported_graphics_clock>1065 MHz</supported_graphics_clock>
<supported_graphics_clock>1050 MHz</supported_graphics_clock>
<supported_graphics_clock>1035 MHz</supported_graphics_clock>
<supported_graphics_clock>1020 MHz</supported_graphics_clock>
<supported_graphics_clock>1005 MHz</supported_graphics_clock>
<supported_graphics_clock>990 MHz</supported_graphics_clock>
<supported_graphics_clock>975 MHz</supported_graphics_clock>
<supported_graphics_clock>960 MHz</supported_graphics_clock>
<supported_graphics_clock>945 MHz</supported_graphics_clock>
<supported_graphics_clock>930 MHz</supported_graphics_clock>
<supported_graphics_clock>915 MHz</supported_graphics_clock>
<supported_graphics_clock>900 MHz</supported_graphics_clock>
<supported_graphics_clock>885 MHz</supported_graphics_clock>
<supported_graphics_clock>870 MHz</supported_graphics_clock>
<supported_graphics_clock>855 MHz</supported_graphics_clock>
<supported_graphics_clock>840 MHz</supported_graphics_clock>
<supported_graphics_clock>825 MHz</supported_graphics_clock>
<supported_graphics_clock>810 MHz</supported_graphics_clock>
<supported_graphics_clock>795 MHz</supported_graphics_clock>
<supported_graphics_clock>780 MHz</supported_graphics_clock>
<supported_graphics_clock>765 MHz</supported_graphics_clock>
<supported_graphics_clock>750 MHz</supported_graphics_clock>
<supported_graphics_clock>735 MHz</supported_graphics_clock>
<supported_graphics_clock>720 MHz</supported_graphics_clock>
<supported_graphics_clock>705 MHz</supported_graphics_clock>
<supported_graphics_clock>690 MHz</supported_graphics_clock>
<supported_graphics_clock>675 MHz</supported_graphics_clock>
<supported_graphics_clock>660 MHz</supported_graphics_clock>
<supported_graphics_clock>645 MHz</supported_graphics_clock>
<supported_graphics_clock>630 MHz</supported_graphics_clock>
<supported_graphics_clock>615 MHz</supported_graphics_clock>
<supported_graphics_clock>600 MHz</supported_graphics_clock>
<supported_graphics_clock>585 MHz</supported_graphics_clock>
<supported_graphics_clock>570 MHz</supported_graphics_clock>
<supported_graphics_clock>555 MHz</supported_graphics_clock>
<supported_graphics_clock>540 MHz</supported_graphics_clock>
<supported_graphics_clock>525 MHz</supported_graphics_clock>
<supported_graphics_clock>510 MHz</supported_graphics_clock>
<supported_graphics_clock>495 MHz</supported_graphics_clock>
<supported_graphics_clock>480 MHz</supported_graphics_clock>
<supported_graphics_clock>465 MHz</supported_graphics_clock>
<supported_graphics_clock>450 MHz</supported_graphics_clock>
<supported_graphics_clock>435 MHz</supported_graphics_clock>
<supported_graphics_clock>420 MHz</supported_graphics_clock>
<supported_graphics_clock>405 MHz</supported_graphics_clock>
<supported_graphics_clock>390 MHz</supported_graphics_clock>
<supported_graphics_clock>375 MHz</supported_graphics_clock>
<supported_graphics_clock>360 MHz</supported_graphics_clock>
<supported_graphics_clock>345 MHz</supported_graphics_clock>
<supported_graphics_clock>330 MHz</supported_graphics_clock>
<supported_graphics_clock>315 MHz</supported_graphics_clock>
<supported_graphics_clock>300 MHz</supported_graphics_clock>
<supported_graphics_clock>285 MHz</supported_graphics_clock>
<supported_graphics_clock>270 MHz</supported_graphics_clock>
<supported_graphics_clock>255 MHz</supported_graphics_clock>
<supported_graphics_clock>240 MHz</supported_graphics_clock>
<supported_graphics_clock>225 MHz</supported_graphics_clock>
<supported_graphics_clock>210 MHz</supported_graphics_clock>
</supported_mem_clock>
<supported_mem_clock>
<value>405 MHz</value>
<supported_graphics_clock>645 MHz</supported_graphics_clock>
<supported_graphics_clock>630 MHz</supported_graphics_clock>
<supported_graphics_clock>615 MHz</supported_graphics_clock>
<supported_graphics_clock>600 MHz</supported_graphics_clock>
<supported_graphics_clock>585 MHz</supported_graphics_clock>
<supported_graphics_clock>570 MHz</supported_graphics_clock>
<supported_graphics_clock>555 MHz</supported_graphics_clock>
<supported_graphics_clock>540 MHz</supported_graphics_clock>
<supported_graphics_clock>525 MHz</supported_graphics_clock>
<supported_graphics_clock>510 MHz</supported_graphics_clock>
<supported_graphics_clock>495 MHz</supported_graphics_clock>
<supported_graphics_clock>480 MHz</supported_graphics_clock>
<supported_graphics_clock>465 MHz</supported_graphics_clock>
<supported_graphics_clock>450 MHz</supported_graphics_clock>
<supported_graphics_clock>435 MHz</supported_graphics_clock>
<supported_graphics_clock>420 MHz</supported_graphics_clock>
<supported_graphics_clock>405 MHz</supported_graphics_clock>
<supported_graphics_clock>390 MHz</supported_graphics_clock>
<supported_graphics_clock>375 MHz</supported_graphics_clock>
<supported_graphics_clock>360 MHz</supported_graphics_clock>
<supported_graphics_clock>345 MHz</supported_graphics_clock>
<supported_graphics_clock>330 MHz</supported_graphics_clock>
<supported_graphics_clock>315 MHz</supported_graphics_clock>
<supported_graphics_clock>300 MHz</supported_graphics_clock>
<supported_graphics_clock>285 MHz</supported_graphics_clock>
<supported_graphics_clock>270 MHz</supported_graphics_clock>
<supported_graphics_clock>255 MHz</supported_graphics_clock>
<supported_graphics_clock>240 MHz</supported_graphics_clock>
<supported_graphics_clock>225 MHz</supported_graphics_clock>
<supported_graphics_clock>210 MHz</supported_graphics_clock>
</supported_mem_clock>
</supported_clocks>
<processes>
<process_info>
<pid>16870</pid>
<type>C</type>
<process_name>ffmpeg</process_name>
<used_memory>549 MiB</used_memory>
</process_info>
</processes>
<accounted_processes>
</accounted_processes>
<capabilities>
<egm>disabled</egm>
</capabilities>
</gpu>
</nvidia_smi_log>`
func main() {
if len(os.Args) == 1 {
os.Exit(1)
}
ctx, cancel := context.WithCancel(context.Background())
if os.Args[1] == "pmon" {
go func(ctx context.Context) {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
fmt.Fprintf(os.Stdout, "%s\n", pmondata)
}
}
}(ctx)
} else {
go func(ctx context.Context) {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
fmt.Fprintf(os.Stdout, "%s\n", querydata)
}
}
}(ctx)
}
// Wait for interrupt signal to gracefully shutdown the app
quit := make(chan os.Signal, 1)
signal.Notify(quit, os.Interrupt)
<-quit
cancel()
os.Exit(0)
}

View File

@ -33,7 +33,7 @@ func NewCPUCollector(rsc resources.Resources) metric.Collector {
c.limitDescr = metric.NewDesc("cpu_limit", "Percentage of CPU to be consumed", nil)
c.throttleDescr = metric.NewDesc("cpu_throttling", "Whether the CPU is currently throttled", nil)
if ncpu, err := psutil.CPUCounts(true); err == nil {
if ncpu, err := psutil.CPUCounts(); err == nil {
c.ncpu = ncpu
}
@ -63,11 +63,11 @@ func (c *cpuCollector) Collect() metric.Metrics {
metrics.Add(metric.NewValue(c.ncpuDescr, c.ncpu))
limit, _ := c.resources.Limits()
limit, _, _, _ := c.resources.Limits()
metrics.Add(metric.NewValue(c.limitDescr, limit))
cpu, _ := c.resources.ShouldLimit()
cpu, _, _ := c.resources.ShouldLimit()
throttling := .0
if cpu {
throttling = 1

View File

@ -37,7 +37,7 @@ func (c *diskCollector) Describe() []*metric.Description {
func (c *diskCollector) Collect() metric.Metrics {
metrics := metric.NewMetrics()
stat, err := psutil.DiskUsage(c.path)
stat, err := psutil.Disk(c.path)
if err != nil {
return metrics
}

View File

@ -44,11 +44,11 @@ func (c *memCollector) Describe() []*metric.Description {
func (c *memCollector) Collect() metric.Metrics {
metrics := metric.NewMetrics()
_, limit := c.resources.Limits()
_, limit, _, _ := c.resources.Limits()
metrics.Add(metric.NewValue(c.limitDescr, float64(limit)))
_, memory := c.resources.ShouldLimit()
_, memory, _ := c.resources.ShouldLimit()
throttling := .0
if memory {
throttling = 1
@ -56,7 +56,7 @@ func (c *memCollector) Collect() metric.Metrics {
metrics.Add(metric.NewValue(c.throttleDescr, throttling))
stat, err := psutil.VirtualMemory()
stat, err := psutil.Memory()
if err != nil {
return metrics
}

View File

@ -33,7 +33,7 @@ func (c *netCollector) Describe() []*metric.Description {
func (c *netCollector) Collect() metric.Metrics {
metrics := metric.NewMetrics()
devs, err := psutil.NetIOCounters(true)
devs, err := psutil.Network()
if err != nil {
return metrics
}

View File

@ -25,9 +25,36 @@ type Usage struct {
Max uint64 // bytes
Limit uint64 // bytes
}
GPU struct {
Index int // number of the GPU
Memory struct {
Current uint64 // bytes
Average float64 // bytes
Max uint64 // bytes
Limit uint64 // bytes
}
Usage struct {
Current float64 // percent 0-100
Average float64 // percent 0-100
Max float64 // percent 0-100
Limit float64 // percent 0-100
}
Encoder struct {
Current float64 // percent 0-100
Average float64 // percent 0-100
Max float64 // percent 0-100
Limit float64 // percent 0-100
}
Decoder struct {
Current float64 // percent 0-100
Average float64 // percent 0-100
Max float64 // percent 0-100
Limit float64 // percent 0-100
}
}
}
type LimitFunc func(cpu float64, memory uint64)
type LimitFunc func(cpu float64, memory uint64, gpuusage, gpuencoder, gpudecoder float64, gpumemory uint64)
type LimitMode int
@ -44,18 +71,22 @@ func (m LimitMode) String() string {
}
const (
LimitModeHard LimitMode = 0 // Killing the process if either CPU or memory is above the limit for a certain time
LimitModeSoft LimitMode = 1 // Throttling the CPU if activated, killing the process if memory is above the limit for a certain time
LimitModeHard LimitMode = 0 // Killing the process if either resource is above the limit for a certain time.
LimitModeSoft LimitMode = 1 // If activated, will throttle the CPU, otherwise killing the process if resources are above the limit.
)
type LimiterConfig struct {
CPU float64 // Max. CPU usage in percent 0-100 in hard mode, 0-100*ncpu in softmode
Memory uint64 // Max. memory usage in bytes
WaitFor time.Duration // Duration for one of the limits has to be above the limit until OnLimit gets triggered
OnLimit LimitFunc // Function to be triggered if limits are exceeded
Mode LimitMode // How to limit CPU usage
PSUtil psutil.Util
Logger log.Logger
CPU float64 // Max. CPU usage in percent 0-100 in hard mode, 0-100*ncpu in soft mode.
Memory uint64 // Max. memory usage in bytes.
GPUUsage float64 // Max. GPU general usage in percent 0-100.
GPUEncoder float64 // Max. GPU encoder usage in percent 0-100.
GPUDecoder float64 // Max. GPU decoder usage in percent 0-100.
GPUMemory uint64 // Max. GPU memory usage in bytes.
WaitFor time.Duration // Duration for one of the limits has to be above the limit until OnLimit gets triggered.
OnLimit LimitFunc // Function to be triggered if limits are exceeded.
Mode LimitMode // How to limit CPU usage.
PSUtil psutil.Util
Logger log.Logger
}
type Limiter interface {
@ -65,26 +96,135 @@ type Limiter interface {
// Stop stops the limiter. The limiter can be reused by calling Start() again
Stop()
// Current returns the current CPU and memory values
// Deprecated: use Usage()
Current() (cpu float64, memory uint64)
// Limits returns the defined CPU and memory limits. Values <= 0 means no limit
// Deprecated: use Usage()
Limits() (cpu float64, memory uint64)
// Usage returns the current state of the limiter, such as current, average, max, and
// limit values for CPU and memory.
Usage() Usage
// Limit enables or disables the throttling of the CPU or killing because of to much
// memory consumption.
Limit(cpu, memory bool) error
// memory or GPU consumption.
Limit(cpu, memory, gpu bool) error
// Mode returns in which mode the limiter is running in.
Mode() LimitMode
}
// numbers constrains the value types a metric can track: byte counts
// (uint64-kinded) and load factors (float64-kinded).
type numbers interface {
	~uint64 | ~float64
}

// metric accumulates observations of a single resource: the most recent and
// previous value, the all-time maximum, a decaying peak, and a running
// average, together with a configurable limit and the soft-limit enable flag.
type metric[T numbers] struct {
	limit       T         // Limit
	current     T         // Current load value
	last        T         // Last load value
	max         T         // Max. load value
	top         T         // Decaying max. load value
	avg         float64   // Average load value
	avgCounter  uint64    // Counter for average calculation
	limitSince  time.Time // Time when the limit has been reached (hard limiter mode)
	limitEnable bool
}

// Reset discards all collected observations and disables soft limiting.
// The configured limit itself is kept.
func (m *metric[T]) Reset() {
	// Zero everything except the limit (and the bookkeeping timestamp,
	// which is only meaningful while the limit is being exceeded).
	*m = metric[T]{
		limit:      m.limit,
		limitSince: m.limitSince,
	}
}

// Current returns the most recently observed value.
func (m *metric[T]) Current() T {
	return m.current
}

// Top returns the decaying peak value.
func (m *metric[T]) Top() T {
	return m.top
}

// Max returns the largest value observed since the last reset.
func (m *metric[T]) Max() T {
	return m.max
}

// Avg returns the running average of all observed values.
func (m *metric[T]) Avg() float64 {
	return m.avg
}

// SetLimit sets the limit against which IsExceeded compares.
func (m *metric[T]) SetLimit(limit T) {
	m.limit = limit
}

// Limit returns the configured limit.
func (m *metric[T]) Limit() T {
	return m.limit
}

// DoLimit enables or disables soft limiting. It reports the resulting state
// and whether the call actually changed it.
func (m *metric[T]) DoLimit(limit bool) (enabled, changed bool) {
	changed = m.limitEnable != limit
	m.limitEnable = limit

	return m.limitEnable, changed
}

// IsLimitEnabled reports whether soft limiting is currently enabled.
func (m *metric[T]) IsLimitEnabled() bool {
	return m.limitEnable
}

// Update records a new observation: it rotates current into last, tracks the
// maximum, lets the decaying peak either jump to the new value or decay by
// 5%, and folds the value into the running average.
func (m *metric[T]) Update(value T) {
	m.last = m.current
	m.current = value

	if value > m.max {
		m.max = value
	}

	if value > m.top {
		m.top = value
	} else {
		// Exponential decay of the peak while below it.
		m.top = T(float64(m.top) * 0.95)
	}

	m.avgCounter++
	m.avg = ((m.avg * float64(m.avgCounter-1)) + float64(value)) / float64(m.avgCounter)
}
// IsExceeded reports whether the tracked value violates the configured
// limit under the given mode. A limit <= 0 disables the check entirely.
// In soft mode the check only applies while limiting has been enabled via
// DoLimit and triggers as soon as the current value is above the limit.
// In hard mode the value must stay above the limit for at least waitFor
// before the limit counts as exceeded.
//
// Note: the hard-mode branch mutates limitSince as a side effect, so this
// method must be called exactly once per observation cycle.
func (x *metric[T]) IsExceeded(waitFor time.Duration, mode LimitMode) bool {
if x.limit <= 0 {
return false
}
if mode == LimitModeSoft {
// Check if we actually should limit.
if !x.limitEnable {
return false
}
// If we are currently above the limit, the limit is exceeded.
if x.current > x.limit {
return true
}
} else {
if x.current > x.limit {
// Current value is higher than the limit.
if x.last <= x.limit {
// If the previous value is below the limit, then we reached the limit as of now.
x.limitSince = time.Now()
}
// Only report exceeded after the value has been above the limit
// continuously for the waitFor grace period.
if time.Since(x.limitSince) >= waitFor {
return true
}
}
}
return false
}
type limiter struct {
psutil psutil.Util
@ -98,40 +238,27 @@ type limiter struct {
lastUsage Usage
lastUsageLock sync.RWMutex
cpu float64 // CPU limit
cpuCurrent float64 // Current CPU load of this process
cpuLast float64 // Last CPU load of this process
cpuMax float64 // Max. CPU load of this process
cpuTop float64 // Decaying max. CPU load of this process
cpuAvg float64 // Average CPU load of this process
cpuAvgCounter uint64 // Counter for average calculation
cpuLimitSince time.Time // Time when the CPU limit has been reached (hard limiter mode)
cpuLimitEnable bool // Whether CPU throttling is enabled (soft limiter mode)
cpuThrottling bool // Whether CPU throttling is currently active (soft limiter mode)
cpu metric[float64] // CPU limit
cpuThrottling bool // Whether CPU throttling is currently active (soft limiter mode)
memory uint64 // Memory limit (bytes)
memoryCurrent uint64 // Current memory usage
memoryLast uint64 // Last memory usage
memoryMax uint64 // Max. memory usage
memoryTop uint64 // Decaying max. memory usage
memoryAvg float64 // Average memory usage
memoryAvgCounter uint64 // Counter for average memory calculation
memoryLimitSince time.Time // Time when the memory limit has been reached (hard limiter mode)
memoryLimitEnable bool // Whether memory limiting is enabled (soft limiter mode)
memory metric[uint64] // Memory limit (bytes)
gpu struct {
memory metric[uint64] // GPU memory limit (0-100 percent)
usage metric[float64] // GPU load limit (0-100 percent)
encoder metric[float64] // GPU encoder limit (0-100 percent)
decoder metric[float64] // GPU decoder limit (0-100 percent)
}
waitFor time.Duration
mode LimitMode
cancelLimit context.CancelFunc
logger log.Logger
}
// NewLimiter returns a new Limiter
func NewLimiter(config LimiterConfig) Limiter {
l := &limiter{
cpu: config.CPU,
memory: config.Memory,
waitFor: config.WaitFor,
onLimit: config.OnLimit,
mode: config.Mode,
@ -139,6 +266,13 @@ func NewLimiter(config LimiterConfig) Limiter {
logger: config.Logger,
}
l.cpu.SetLimit(config.CPU / 100)
l.memory.SetLimit(config.Memory)
l.gpu.memory.SetLimit(config.GPUMemory)
l.gpu.usage.SetLimit(config.GPUUsage / 100)
l.gpu.encoder.SetLimit(config.GPUEncoder / 100)
l.gpu.decoder.SetLimit(config.GPUDecoder / 100)
if l.logger == nil {
l.logger = log.New("")
}
@ -147,57 +281,56 @@ func NewLimiter(config LimiterConfig) Limiter {
l.psutil = psutil.DefaultUtil
}
if ncpu, err := l.psutil.CPUCounts(true); err != nil {
if ncpu, err := l.psutil.CPUCounts(); err != nil {
l.ncpu = 1
} else {
l.ncpu = ncpu
}
l.lastUsage.CPU.NCPU = l.ncpu
l.lastUsage.CPU.Limit = l.cpu * l.ncpu
l.lastUsage.Memory.Limit = l.memory
l.lastUsage.CPU.Limit = l.cpu.Limit() * 100 * l.ncpu
l.lastUsage.Memory.Limit = l.memory.Limit()
l.lastUsage.GPU.Memory.Limit = l.gpu.memory.Limit()
l.lastUsage.GPU.Usage.Limit = l.gpu.usage.Limit() * 100
l.lastUsage.GPU.Encoder.Limit = l.gpu.encoder.Limit() * 100
l.lastUsage.GPU.Decoder.Limit = l.gpu.decoder.Limit() * 100
l.ncpuFactor = 1
mode := "hard"
if l.mode == LimitModeSoft {
mode = "soft"
l.cpu /= l.ncpu
l.cpu.SetLimit(l.cpu.Limit() / l.ncpu)
l.ncpuFactor = l.ncpu
}
l.cpu /= 100
if l.onLimit == nil {
l.onLimit = func(float64, uint64) {}
l.onLimit = func(float64, uint64, float64, float64, float64, uint64) {}
}
l.logger = l.logger.WithFields(log.Fields{
"cpu": l.cpu * l.ncpuFactor,
"memory": l.memory,
"mode": mode,
"cpu": l.cpu.Limit() * l.ncpuFactor,
"memory": l.memory.Limit(),
"gpumemory": l.gpu.memory.Limit(),
"gpuusage": l.gpu.usage.Limit(),
"gpuencoder": l.gpu.encoder.Limit(),
"gpudecoder": l.gpu.decoder.Limit(),
"mode": mode,
})
return l
}
func (l *limiter) reset() {
l.cpuCurrent = 0
l.cpuLast = 0
l.cpuAvg = 0
l.cpuAvgCounter = 0
l.cpuMax = 0
l.cpuTop = 0
l.cpuLimitEnable = false
l.cpu.Reset()
l.cpuThrottling = false
l.memoryCurrent = 0
l.memoryLast = 0
l.memoryAvg = 0
l.memoryAvgCounter = 0
l.memoryMax = 0
l.memoryTop = 0
l.memoryLimitEnable = false
l.memory.Reset()
l.gpu.memory.Reset()
l.gpu.usage.Reset()
l.gpu.encoder.Reset()
l.gpu.decoder.Reset()
}
func (l *limiter) Start(process psutil.Process) error {
@ -218,10 +351,7 @@ func (l *limiter) Start(process psutil.Process) error {
go l.ticker(ctx, time.Second)
if l.mode == LimitModeSoft {
ctx, cancel = context.WithCancel(context.Background())
l.cancelLimit = cancel
go l.limitCPU(ctx, l.cpu, time.Second)
go l.limitCPU(ctx, l.cpu.Limit(), time.Second)
}
return nil
@ -237,11 +367,6 @@ func (l *limiter) Stop() {
l.cancel()
if l.cancelLimit != nil {
l.cancelLimit()
l.cancelLimit = nil
}
l.proc.Stop()
l.proc = nil
@ -256,13 +381,13 @@ func (l *limiter) ticker(ctx context.Context, interval time.Duration) {
select {
case <-ctx.Done():
return
case t := <-ticker.C:
l.collect(t)
case <-ticker.C:
l.collect()
}
}
}
func (l *limiter) collect(_ time.Time) {
func (l *limiter) collect() {
l.lock.Lock()
proc := l.proc
l.lock.Unlock()
@ -271,118 +396,108 @@ func (l *limiter) collect(_ time.Time) {
return
}
mstat, merr := proc.VirtualMemory()
cpustat, cerr := proc.CPUPercent()
mstat, merr := proc.Memory()
cpustat, cerr := proc.CPU()
gstat, gerr := proc.GPU()
gindex := -1
l.lock.Lock()
defer l.lock.Unlock()
if merr == nil {
l.memoryLast, l.memoryCurrent = l.memoryCurrent, mstat
if l.memoryCurrent > l.memoryMax {
l.memoryMax = l.memoryCurrent
}
if l.memoryCurrent > l.memoryTop {
l.memoryTop = l.memoryCurrent
} else {
l.memoryTop = uint64(float64(l.memoryTop) * 0.95)
}
l.memoryAvgCounter++
l.memoryAvg = ((l.memoryAvg * float64(l.memoryAvgCounter-1)) + float64(l.memoryCurrent)) / float64(l.memoryAvgCounter)
l.memory.Update(mstat)
}
if cerr == nil {
l.cpuLast, l.cpuCurrent = l.cpuCurrent, (cpustat.System+cpustat.User+cpustat.Other)/100
l.cpu.Update((cpustat.System + cpustat.User + cpustat.Other) / 100)
}
if l.cpuCurrent > l.cpuMax {
l.cpuMax = l.cpuCurrent
}
if l.cpuCurrent > l.cpuTop {
l.cpuTop = l.cpuCurrent
} else {
l.cpuTop = l.cpuTop * 0.95
}
l.cpuAvgCounter++
l.cpuAvg = ((l.cpuAvg * float64(l.cpuAvgCounter-1)) + l.cpuCurrent) / float64(l.cpuAvgCounter)
if gerr == nil {
l.gpu.memory.Update(gstat.MemoryUsed)
l.gpu.usage.Update(gstat.Usage / 100)
l.gpu.encoder.Update(gstat.Encoder / 100)
l.gpu.decoder.Update(gstat.Decoder / 100)
gindex = gstat.Index
}
isLimitExceeded := false
if l.mode == LimitModeHard {
if l.cpu > 0 {
if l.cpuCurrent > l.cpu {
// Current value is higher than the limit
if l.cpuLast <= l.cpu {
// If the previous value is below the limit, then we reached the
// limit as of now
l.cpuLimitSince = time.Now()
}
if time.Since(l.cpuLimitSince) >= l.waitFor {
l.logger.Warn().Log("CPU limit exceeded")
isLimitExceeded = true
}
}
if l.cpu.IsExceeded(l.waitFor, l.mode) {
l.logger.Warn().Log("CPU limit exceeded")
isLimitExceeded = true
}
}
if l.memory > 0 {
if l.memoryCurrent > l.memory {
// Current value is higher than the limit
if l.memoryLast <= l.memory {
// If the previous value is below the limit, then we reached the
// limit as of now
l.memoryLimitSince = time.Now()
}
if l.memory.IsExceeded(l.waitFor, l.mode) {
l.logger.Warn().Log("Memory limit exceeded")
isLimitExceeded = true
}
if time.Since(l.memoryLimitSince) >= l.waitFor {
l.logger.Warn().Log("Memory limit exceeded")
isLimitExceeded = true
}
}
}
} else {
if l.memory > 0 && l.memoryLimitEnable {
if l.memoryCurrent > l.memory {
// Current value is higher than the limit
l.logger.Warn().Log("Memory limit exceeded")
isLimitExceeded = true
}
}
if l.gpu.memory.IsExceeded(l.waitFor, l.mode) {
l.logger.Warn().Log("GPU memory limit exceeded")
isLimitExceeded = true
}
if l.gpu.usage.IsExceeded(l.waitFor, l.mode) {
l.logger.Warn().Log("GPU usage limit exceeded")
isLimitExceeded = true
}
if l.gpu.encoder.IsExceeded(l.waitFor, l.mode) {
l.logger.Warn().Log("GPU encoder limit exceeded")
isLimitExceeded = true
}
if l.gpu.decoder.IsExceeded(l.waitFor, l.mode) {
l.logger.Warn().Log("GPU decoder limit exceeded")
isLimitExceeded = true
}
l.logger.Debug().WithFields(log.Fields{
"cur_cpu": l.cpuCurrent * l.ncpuFactor,
"top_cpu": l.cpuTop * l.ncpuFactor,
"cur_mem": l.memoryCurrent,
"top_mem": l.memoryTop,
"exceeded": isLimitExceeded,
"cur_cpu": l.cpu.Current() * l.ncpuFactor,
"top_cpu": l.cpu.Top() * l.ncpuFactor,
"cur_mem": l.memory.Current(),
"top_mem": l.memory.Top(),
"cur_gpu_mem": l.gpu.memory.Current(),
"top_gpu_mem": l.gpu.memory.Top(),
"exceeded": isLimitExceeded,
}).Log("Observation")
if isLimitExceeded {
go l.onLimit(l.cpuCurrent*l.ncpuFactor*100, l.memoryCurrent)
go l.onLimit(l.cpu.Current()*l.ncpuFactor*100, l.memory.Current(), l.gpu.usage.Current(), l.gpu.encoder.Current(), l.gpu.decoder.Current(), l.gpu.memory.Current())
}
l.lastUsageLock.Lock()
l.lastUsage.CPU.Current = l.cpuCurrent * l.ncpu * 100
l.lastUsage.CPU.Average = l.cpuAvg * l.ncpu * 100
l.lastUsage.CPU.Max = l.cpuMax * l.ncpu * 100
l.lastUsage.CPU.Current = l.cpu.Current() * l.ncpu * 100
l.lastUsage.CPU.Average = l.cpu.Avg() * l.ncpu * 100
l.lastUsage.CPU.Max = l.cpu.Max() * l.ncpu * 100
l.lastUsage.CPU.IsThrottling = l.cpuThrottling
l.lastUsage.Memory.Current = l.memoryCurrent
l.lastUsage.Memory.Average = l.memoryAvg
l.lastUsage.Memory.Max = l.memoryMax
l.lastUsageLock.Unlock()
l.lastUsage.Memory.Current = l.memory.Current()
l.lastUsage.Memory.Average = l.memory.Avg()
l.lastUsage.Memory.Max = l.memory.Max()
l.lock.Unlock()
l.lastUsage.GPU.Index = gindex
l.lastUsage.GPU.Memory.Current = l.gpu.memory.Current() * 100
l.lastUsage.GPU.Memory.Average = l.gpu.memory.Avg() * 100
l.lastUsage.GPU.Memory.Max = l.gpu.memory.Max() * 100
l.lastUsage.GPU.Usage.Current = l.gpu.usage.Current() * 100
l.lastUsage.GPU.Usage.Average = l.gpu.usage.Avg() * 100
l.lastUsage.GPU.Usage.Max = l.gpu.usage.Max() * 100
l.lastUsage.GPU.Encoder.Current = l.gpu.encoder.Current() * 100
l.lastUsage.GPU.Encoder.Average = l.gpu.encoder.Avg() * 100
l.lastUsage.GPU.Encoder.Max = l.gpu.encoder.Max() * 100
l.lastUsage.GPU.Decoder.Current = l.gpu.decoder.Current() * 100
l.lastUsage.GPU.Decoder.Average = l.gpu.decoder.Avg() * 100
l.lastUsage.GPU.Decoder.Max = l.gpu.decoder.Max() * 100
l.lastUsageLock.Unlock()
}
func (l *limiter) Limit(cpu, memory bool) error {
func (l *limiter) Limit(cpu, memory, gpu bool) error {
l.lock.Lock()
defer l.lock.Unlock()
@ -390,35 +505,31 @@ func (l *limiter) Limit(cpu, memory bool) error {
return nil
}
if memory {
if !l.memoryLimitEnable {
l.memoryLimitEnable = true
l.logger.Debug().Log("Memory limiter enabled")
}
} else {
if l.memoryLimitEnable {
l.memoryLimitEnable = false
l.logger.Debug().Log("Memory limiter disabled")
}
enabled, changed := l.cpu.DoLimit(cpu)
if enabled && changed {
l.logger.Debug().Log("CPU limiter enabled")
} else if !enabled && changed {
l.logger.Debug().Log("CPU limiter disabled")
}
if cpu {
if !l.cpuLimitEnable {
l.cpuLimitEnable = true
l.logger.Debug().Log("CPU limiter enabled")
}
} else {
if l.cpuLimitEnable {
l.cpuLimitEnable = false
l.logger.Debug().Log("CPU limiter disabled")
}
enabled, changed = l.memory.DoLimit(memory)
if enabled && changed {
l.logger.Debug().Log("Memory limiter enabled")
} else if !enabled && changed {
l.logger.Debug().Log("Memory limiter disabled")
}
enabled, changed = l.gpu.memory.DoLimit(gpu)
if enabled && changed {
l.logger.Debug().Log("GPU limiter enabled")
} else if !enabled && changed {
l.logger.Debug().Log("GPU limiter disabled")
}
l.gpu.usage.DoLimit(gpu)
l.gpu.encoder.DoLimit(gpu)
l.gpu.decoder.DoLimit(gpu)
return nil
}
@ -453,7 +564,7 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur
l.lock.Lock()
if !l.cpuLimitEnable {
if !l.cpu.IsLimitEnabled() {
if factorTopLimit > 0 {
factorTopLimit -= 10
} else {
@ -469,7 +580,7 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur
}
} else {
factorTopLimit = 100
topLimit = l.cpuTop - limit
topLimit = l.cpu.Top() - limit
l.cpuThrottling = true
}
@ -482,7 +593,7 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur
lim += (100 - factorTopLimit) / 100 * topLimit
}
pcpu := l.cpuCurrent
pcpu := l.cpu.Current()
l.lock.Unlock()
@ -526,16 +637,6 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur
}
}
func (l *limiter) Current() (cpu float64, memory uint64) {
l.lastUsageLock.RLock()
defer l.lastUsageLock.RUnlock()
cpu = l.lastUsage.CPU.Current / l.ncpu
memory = l.lastUsage.Memory.Current
return
}
func (l *limiter) Usage() Usage {
l.lastUsageLock.RLock()
defer l.lastUsageLock.RUnlock()
@ -543,10 +644,6 @@ func (l *limiter) Usage() Usage {
return l.lastUsage
}
func (l *limiter) Limits() (cpu float64, memory uint64) {
return l.cpu * 100, l.memory
}
// Mode returns the limit mode (hard or soft) this limiter was configured with.
func (l *limiter) Mode() LimitMode {
return l.mode
}

View File

@ -7,13 +7,13 @@ import (
"github.com/datarhei/core/v16/psutil"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// psproc is a stub process implementation that returns fixed CPU, memory,
// and GPU readings so the limiter tests trigger deterministically.
type psproc struct{}
func (p *psproc) CPUPercent() (*psutil.CPUInfoStat, error) {
return &psutil.CPUInfoStat{
func (p *psproc) CPU() (*psutil.CPUInfo, error) {
return &psutil.CPUInfo{
System: 50,
User: 0,
Idle: 0,
@ -21,10 +21,22 @@ func (p *psproc) CPUPercent() (*psutil.CPUInfoStat, error) {
}, nil
}
func (p *psproc) VirtualMemory() (uint64, error) {
func (p *psproc) Memory() (uint64, error) {
return 197, nil
}
// GPU returns fixed GPU statistics for the stub process. The used memory (91)
// and the usage percentages are above the small limits (e.g. 42) used in the
// GPU limiter tests, so those limits are exceeded immediately.
func (p *psproc) GPU() (*psutil.GPUInfo, error) {
return &psutil.GPUInfo{
Index: 0,
Name: "L4",
MemoryTotal: 128,
MemoryUsed: 91,
Usage: 3,
Encoder: 9,
Decoder: 5,
}, nil
}
// Stop is a no-op for the stub process.
func (p *psproc) Stop() {}
// Suspend is a no-op for the stub process and always succeeds.
func (p *psproc) Suspend() error { return nil }
// Resume is a no-op for the stub process and always succeeds.
func (p *psproc) Resume() error { return nil }
@ -42,7 +54,7 @@ func TestCPULimit(t *testing.T) {
l := NewLimiter(LimiterConfig{
CPU: 42,
OnLimit: func(float64, uint64) {
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
wg.Done()
},
})
@ -57,7 +69,7 @@ func TestCPULimit(t *testing.T) {
lock.Unlock()
}()
assert.Eventually(t, func() bool {
require.Eventually(t, func() bool {
lock.Lock()
defer lock.Unlock()
@ -79,7 +91,7 @@ func TestCPULimitWaitFor(t *testing.T) {
l := NewLimiter(LimiterConfig{
CPU: 42,
WaitFor: 3 * time.Second,
OnLimit: func(float64, uint64) {
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
wg.Done()
},
})
@ -94,7 +106,7 @@ func TestCPULimitWaitFor(t *testing.T) {
lock.Unlock()
}()
assert.Eventually(t, func() bool {
require.Eventually(t, func() bool {
lock.Lock()
defer lock.Unlock()
@ -115,7 +127,7 @@ func TestMemoryLimit(t *testing.T) {
l := NewLimiter(LimiterConfig{
Memory: 42,
OnLimit: func(float64, uint64) {
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
wg.Done()
},
})
@ -130,7 +142,7 @@ func TestMemoryLimit(t *testing.T) {
lock.Unlock()
}()
assert.Eventually(t, func() bool {
require.Eventually(t, func() bool {
lock.Lock()
defer lock.Unlock()
@ -152,7 +164,7 @@ func TestMemoryLimitWaitFor(t *testing.T) {
l := NewLimiter(LimiterConfig{
Memory: 42,
WaitFor: 3 * time.Second,
OnLimit: func(float64, uint64) {
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
wg.Done()
},
})
@ -167,7 +179,80 @@ func TestMemoryLimitWaitFor(t *testing.T) {
lock.Unlock()
}()
assert.Eventually(t, func() bool {
require.Eventually(t, func() bool {
lock.Lock()
defer lock.Unlock()
return done
}, 10*time.Second, 1*time.Second)
}
// TestGPUMemoryLimit verifies that a limiter configured with only a GPU
// memory limit fires OnLimit when the stub process reports GPU memory
// usage (91) above the limit (42), within the Eventually deadline.
func TestGPUMemoryLimit(t *testing.T) {
// done is guarded by lock and flipped once OnLimit has fired.
lock := sync.Mutex{}
lock.Lock()
done := false
lock.Unlock()
go func() {
wg := sync.WaitGroup{}
wg.Add(1)
l := NewLimiter(LimiterConfig{
GPUMemory: 42,
// NOTE(review): OnLimit fires on every collection cycle while the limit
// is exceeded; a second invocation would call wg.Done again and panic.
// The 2s deadline below presumably keeps this to one cycle — verify.
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
wg.Done()
},
})
l.Start(&psproc{})
defer l.Stop()
// Block until OnLimit has been triggered once.
wg.Wait()
lock.Lock()
done = true
lock.Unlock()
}()
require.Eventually(t, func() bool {
lock.Lock()
defer lock.Unlock()
return done
}, 2*time.Second, 100*time.Millisecond)
}
func TestGPUMemoryLimitWaitFor(t *testing.T) {
lock := sync.Mutex{}
lock.Lock()
done := false
lock.Unlock()
go func() {
wg := sync.WaitGroup{}
wg.Add(1)
l := NewLimiter(LimiterConfig{
GPUMemory: 42,
WaitFor: 3 * time.Second,
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
wg.Done()
},
})
l.Start(&psproc{})
defer l.Stop()
wg.Wait()
lock.Lock()
done = true
lock.Unlock()
}()
require.Eventually(t, func() bool {
lock.Lock()
defer lock.Unlock()
@ -189,7 +274,7 @@ func TestMemoryLimitSoftMode(t *testing.T) {
l := NewLimiter(LimiterConfig{
Memory: 42,
Mode: LimitModeSoft,
OnLimit: func(float64, uint64) {
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
wg.Done()
},
})
@ -197,7 +282,7 @@ func TestMemoryLimitSoftMode(t *testing.T) {
l.Start(&psproc{})
defer l.Stop()
l.Limit(false, true)
l.Limit(false, true, false)
wg.Wait()
@ -206,7 +291,46 @@ func TestMemoryLimitSoftMode(t *testing.T) {
lock.Unlock()
}()
assert.Eventually(t, func() bool {
require.Eventually(t, func() bool {
lock.Lock()
defer lock.Unlock()
return done
}, 2*time.Second, 100*time.Millisecond)
}
func TestGPUMemoryLimitSoftMode(t *testing.T) {
lock := sync.Mutex{}
lock.Lock()
done := false
lock.Unlock()
go func() {
wg := sync.WaitGroup{}
wg.Add(1)
l := NewLimiter(LimiterConfig{
GPUMemory: 42,
Mode: LimitModeSoft,
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
wg.Done()
},
})
l.Start(&psproc{})
defer l.Stop()
l.Limit(false, false, true)
wg.Wait()
lock.Lock()
done = true
lock.Unlock()
}()
require.Eventually(t, func() bool {
lock.Lock()
defer lock.Unlock()

View File

@ -46,29 +46,32 @@ type Process interface {
// Limit enables or disables CPU and memory limiting. CPU will be throttled
// into the configured limit. If memory consumption is above the configured
// limit, the process will be killed.
Limit(cpu, memory bool) error
Limit(cpu, memory, gpu bool) error
}
// Config is the configuration of a process
type Config struct {
Binary string // Path to the ffmpeg binary.
Args []string // List of arguments for the binary.
Reconnect bool // Whether to restart the process if it exited.
ReconnectDelay time.Duration // Duration to wait before restarting the process.
StaleTimeout time.Duration // Kill the process after this duration if it doesn't produce any output.
Timeout time.Duration // Kill the process after this duration.
LimitCPU float64 // Kill the process if the CPU usage in percent is above this value.
LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value.
LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration.
LimitMode LimitMode // Select limiting mode
Scheduler Scheduler // A scheduler.
Parser Parser // A parser for the output of the process.
OnArgs func(args []string) []string // A callback which is called right before the process will start with the command args.
OnBeforeStart func() error // A callback which is called before the process will be started. If error is non-nil, the start will be refused.
OnStart func() // A callback which is called after the process started.
OnExit func(state string) // A callback which is called after the process exited with the exit state.
OnStateChange func(from, to string) // A callback which is called after a state changed.
Logger log.Logger
Binary string // Path to the ffmpeg binary.
Args []string // List of arguments for the binary.
Reconnect bool // Whether to restart the process if it exited.
ReconnectDelay time.Duration // Duration to wait before restarting the process.
StaleTimeout time.Duration // Kill the process after this duration if it doesn't produce any output.
Timeout time.Duration // Kill the process after this duration.
LimitCPU float64 // Kill the process if the CPU usage in percent is above this value, in percent 0-100 in hard mode, 0-100*ncpu in soft mode.
LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value.
LimitGPUUsage float64 // Kill the process if the GPU usage in percent is above this value, in percent 0-100.
LimitGPUEncoder float64 // Kill the process if the GPU encoder usage in percent is above this value, in percent 0-100.
LimitGPUDecoder float64 // Kill the process if the GPU decoder usage in percent is above this value, in percent 0-100.
LimitGPUMemory uint64 // Kill the process if the GPU memory consumption in bytes is above this value.
LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration.
LimitMode LimitMode // Select limiting mode
Scheduler Scheduler // A scheduler.
Parser Parser // A parser for the output of the process.
OnBeforeStart func(args []string) ([]string, error) // A callback which is called before the process will be started. The string slice is the arguments of the command line. If error is non-nil, the start will be refused.
OnStart func() // A callback which is called after the process started.
OnExit func(state string) // A callback which is called after the process exited with the exit state.
OnStateChange func(from, to string) // A callback which is called after a state changed.
Logger log.Logger
}
// Status represents the current status of a process
@ -81,20 +84,47 @@ type Status struct {
Time time.Time // Time is the time of the last change of the state
CommandArgs []string // Currently running command arguments
LimitMode string // The limiting mode
CPU struct {
NCPU float64 // Number of logical CPUs
Current float64 // Currently consumed CPU in percent
Average float64 // Average consumed CPU in percent
Max float64 // Max. consumed CPU in percent
Limit float64 // Usage limit in percent
IsThrottling bool // Whether the CPU is currently limited
} // Used CPU in percent
Memory struct {
Current uint64 // Currently consumed memory in bytes
Average float64 // Average consumed memory in bytes
Max uint64 // Max. consumed memory in bytes
Limit uint64 // Usage limit in bytes
} // Used memory in bytes
CPU StatusCPU // CPU consumption in percent
Memory StatusMemory // Memory consumption in bytes
GPU StatusGPU // GPU consumption
}
// StatusCPU describes the CPU consumption of a process. Percentages are
// relative to the configured limits/cores as reported by the limiter.
type StatusCPU struct {
	NCPU         float64 // Number of logical CPUs
	Current      float64 // Currently consumed CPU in percent
	Average      float64 // Average consumed CPU in percent
	Max          float64 // Max. consumed CPU in percent
	Limit        float64 // Usage limit in percent
	IsThrottling bool    // Whether the CPU is currently limited
}

// StatusMemory describes the memory consumption of a process in bytes.
type StatusMemory struct {
	Current uint64 // Currently consumed memory in bytes
	Average uint64 // Average consumed memory in bytes
	Max     uint64 // Max. consumed memory in bytes
	Limit   uint64 // Usage limit in bytes
}

// StatusGPUMemory describes the GPU memory consumption of a process in bytes.
type StatusGPUMemory struct {
	Current uint64 // Currently consumed memory in bytes
	Average uint64 // Average consumed memory in bytes
	Max     uint64 // Max. consumed memory in bytes
	Limit   uint64 // Usage limit in bytes
}

// StatusGPUUsage describes one GPU utilization metric (general, encoder,
// or decoder usage) of a process in percent.
type StatusGPUUsage struct {
	Current float64 // Currently consumed GPU usage in percent
	Average float64 // Average consumed GPU usage in percent
	Max     float64 // Max. consumed GPU usage in percent
	Limit   float64 // Usage limit in percent
}

// StatusGPU describes the consumption of a single GPU by a process.
type StatusGPU struct {
	Index   int             // Index of the GPU; presumably -1 when the process uses no GPU — TODO confirm against the collector
	Memory  StatusGPUMemory // GPU memory consumption
	Usage   StatusGPUUsage  // GPU usage in percent
	Encoder StatusGPUUsage  // GPU encoder usage in percent
	Decoder StatusGPUUsage  // GPU decoder usage in percent
}
// States
@ -206,8 +236,7 @@ type process struct {
logger log.Logger
debuglogger log.Logger
callbacks struct {
onArgs func(args []string) []string
onBeforeStart func() error
onBeforeStart func(args []string) ([]string, error)
onStart func()
onExit func(state string)
onStateChange func(from, to string)
@ -263,28 +292,35 @@ func New(config Config) (Process, error) {
p.stale.last = time.Now()
p.stale.timeout = config.StaleTimeout
p.callbacks.onArgs = config.OnArgs
p.callbacks.onBeforeStart = config.OnBeforeStart
p.callbacks.onStart = config.OnStart
p.callbacks.onExit = config.OnExit
p.callbacks.onStateChange = config.OnStateChange
p.limits = NewLimiter(LimiterConfig{
CPU: config.LimitCPU,
Memory: config.LimitMemory,
WaitFor: config.LimitDuration,
Mode: config.LimitMode,
Logger: p.logger.WithComponent("ProcessLimiter"),
OnLimit: func(cpu float64, memory uint64) {
CPU: config.LimitCPU,
Memory: config.LimitMemory,
GPUUsage: config.LimitGPUUsage,
GPUEncoder: config.LimitGPUEncoder,
GPUDecoder: config.LimitGPUDecoder,
GPUMemory: config.LimitGPUMemory,
WaitFor: config.LimitDuration,
Mode: config.LimitMode,
Logger: p.logger.WithComponent("ProcessLimiter"),
OnLimit: func(cpu float64, memory uint64, gpuusage, gpuencoder, gpudecoder float64, gpumemory uint64) {
if !p.isRunning() {
return
}
p.logger.WithFields(log.Fields{
"cpu": cpu,
"memory": memory,
"cpu": cpu,
"memory": memory,
"gpuusage": gpuusage,
"gpuencoder": gpuencoder,
"gpudecoder": gpudecoder,
"gpumemmory": gpumemory,
}).Warn().Log("Killed because limits are exceeded")
p.Kill(false, fmt.Sprintf("Killed because limits are exceeded (mode: %s, tolerance: %s): %.2f (%.2f) CPU, %d (%d) bytes memory", config.LimitMode.String(), config.LimitDuration.String(), cpu, config.LimitCPU, memory, config.LimitMemory))
p.Kill(false, fmt.Sprintf("Killed because limits are exceeded (mode: %s, tolerance: %s): %.2f (%.2f) CPU, %d (%d) bytes memory, %.2f/%.2f/%.2f (%.2f) GPU usage, %d (%d) bytes GPU memory", config.LimitMode.String(), config.LimitDuration.String(), cpu, config.LimitCPU, memory, config.LimitMemory, gpuusage, gpuencoder, gpudecoder, config.LimitGPUUsage, gpumemory, config.LimitGPUMemory))
},
})
@ -467,8 +503,47 @@ func (p *process) Status() Status {
Duration: time.Since(stateTime),
Time: stateTime,
LimitMode: p.limits.Mode().String(),
CPU: usage.CPU,
Memory: usage.Memory,
CPU: StatusCPU{
NCPU: usage.CPU.NCPU,
Current: usage.CPU.Current,
Average: usage.CPU.Average,
Max: usage.CPU.Max,
Limit: usage.CPU.Limit,
IsThrottling: usage.CPU.IsThrottling,
},
Memory: StatusMemory{
Current: usage.Memory.Current,
Average: uint64(usage.Memory.Average),
Max: usage.Memory.Max,
Limit: usage.Memory.Limit,
},
GPU: StatusGPU{
Index: usage.GPU.Index,
Memory: StatusGPUMemory{
Current: usage.GPU.Memory.Current,
Average: uint64(usage.GPU.Memory.Average),
Max: usage.GPU.Memory.Max,
Limit: usage.GPU.Memory.Limit,
},
Usage: StatusGPUUsage{
Current: usage.GPU.Usage.Current,
Average: usage.GPU.Usage.Average,
Max: usage.GPU.Usage.Max,
Limit: usage.GPU.Usage.Limit,
},
Encoder: StatusGPUUsage{
Current: usage.GPU.Encoder.Current,
Average: usage.GPU.Encoder.Average,
Max: usage.GPU.Encoder.Max,
Limit: usage.GPU.Encoder.Limit,
},
Decoder: StatusGPUUsage{
Current: usage.GPU.Decoder.Current,
Average: usage.GPU.Decoder.Average,
Max: usage.GPU.Decoder.Max,
Limit: usage.GPU.Decoder.Limit,
},
},
}
s.CommandArgs = make([]string, len(p.args))
@ -488,7 +563,7 @@ func (p *process) IsRunning() bool {
return p.isRunning()
}
func (p *process) Limit(cpu, memory bool) error {
func (p *process) Limit(cpu, memory, gpu bool) error {
if !p.isRunning() {
return nil
}
@ -498,11 +573,12 @@ func (p *process) Limit(cpu, memory bool) error {
}
p.logger.Warn().WithFields(log.Fields{
"limit_cpu": cpu,
"limit_memory": memory,
"limit_cpu": cpu,
"limit_memory": memory,
"limit_gpumemory": gpu,
}).Log("Limiter triggered")
return p.limits.Limit(cpu, memory)
return p.limits.Limit(cpu, memory, gpu)
}
// Start will start the process and sets the order to "start". If the
@ -559,11 +635,21 @@ func (p *process) start() error {
args := p.args
if p.callbacks.onArgs != nil {
if p.callbacks.onBeforeStart != nil {
args = make([]string, len(p.args))
copy(args, p.args)
args = p.callbacks.onArgs(args)
args, err = p.callbacks.onBeforeStart(args)
if err != nil {
p.setState(stateFailed)
p.parser.Parse([]byte(err.Error()))
p.logger.WithError(err).Error().Log("Starting failed")
p.reconnect(p.delay(stateFailed))
return err
}
}
p.cmd = exec.Command(p.binary, args...)
@ -582,19 +668,6 @@ func (p *process) start() error {
return err
}
if p.callbacks.onBeforeStart != nil {
if err := p.callbacks.onBeforeStart(); err != nil {
p.setState(stateFailed)
p.parser.Parse([]byte(err.Error()))
p.logger.WithError(err).Error().Log("Starting failed")
p.reconnect(p.delay(stateFailed))
return err
}
}
if err := p.cmd.Start(); err != nil {
p.setState(stateFailed)

View File

@ -606,21 +606,15 @@ func TestProcessCallbacks(t *testing.T) {
"2",
},
Reconnect: false,
OnArgs: func(a []string) []string {
lock.Lock()
defer lock.Unlock()
args = make([]string, len(a))
copy(args, a)
return a
},
OnBeforeStart: func() error {
OnBeforeStart: func(a []string) ([]string, error) {
lock.Lock()
defer lock.Unlock()
onBeforeStart = true
return nil
args = make([]string, len(a))
copy(args, a)
return a, nil
},
OnStart: func() {
lock.Lock()
@ -681,8 +675,8 @@ func TestProcessCallbacksOnBeforeStart(t *testing.T) {
Parser: parser,
Reconnect: true,
ReconnectDelay: 10 * time.Second,
OnBeforeStart: func() error {
return fmt.Errorf("no, not now")
OnBeforeStart: func(a []string) ([]string, error) {
return a, fmt.Errorf("no, not now")
},
})
require.NoError(t, err)

View File

@ -3,21 +3,25 @@ package gpu
import "errors"
type Process struct {
PID int32
Memory uint64
PID int32
Index int
Memory uint64 // bytes
Usage float64 // percent 0-100
Encoder float64 // percent 0-100
Decoder float64 // percent 0-100
}
type Stats struct {
ID string
Name string
Architecture string
MemoryTotal uint64
MemoryUsed uint64
MemoryTotal uint64 // bytes
MemoryUsed uint64 // bytes
Usage float64
MemoryUsage float64
EncoderUsage float64
DecoderUsage float64
Usage float64 // percent 0-100
Encoder float64 // percent 0-100
Decoder float64 // percent 0-100
Process []Process
@ -25,9 +29,17 @@ type Stats struct {
}
// GPU provides access to the GPUs in the system and to per-process
// GPU usage data.
type GPU interface {
	// Count returns the number of GPU in the system.
	Count() (int, error)

	// Stats returns current GPU stats.
	Stats() ([]Stats, error)

	// Process returns the GPU usage of the process with the given PID.
	// If the PID is not known to any GPU, ErrProcessNotFound is returned.
	Process(pid int32) (Process, error)

	// Close stops all GPU collection processes
	Close()
}
var ErrProcessNotFound = errors.New("process not found")

View File

@ -0,0 +1,54 @@
# gpu pid type sm mem enc dec fb command
# Idx # C/G % % % % MB name
0 7372 C 2 0 2 - 136 ffmpeg
0 12176 C 5 2 3 7 782 ffmpeg
0 20035 C 8 2 4 1 1145 ffmpeg
0 20141 C 2 1 1 3 429 ffmpeg
0 29591 C 2 1 - 2 435 ffmpeg
0 7372 C 2 0 - - 136 ffmpeg
0 12176 C 8 3 7 9 782 ffmpeg
0 20035 C 8 2 3 1 1145 ffmpeg
0 20141 C - - 1 1 429 ffmpeg
0 29591 C 3 1 - 2 435 ffmpeg
0 7372 C 2 1 1 - 136 ffmpeg
0 12176 C 5 1 5 7 782 ffmpeg
0 20035 C 8 3 1 4 1145 ffmpeg
0 20141 C 2 0 1 - 429 ffmpeg
0 29591 C 2 0 1 3 435 ffmpeg
0 7372 C 2 0 - - 136 ffmpeg
0 12176 C 5 1 5 3 782 ffmpeg
0 20035 C 8 2 5 4 1145 ffmpeg
0 20141 C 3 1 - 5 429 ffmpeg
0 29591 C 2 0 - 1 435 ffmpeg
0 7372 C 2 1 - - 136 ffmpeg
0 12176 C 10 3 6 8 782 ffmpeg
0 20035 C 3 1 1 1 1145 ffmpeg
0 20141 C - - 4 1 429 ffmpeg
0 29591 C 5 2 - 2 435 ffmpeg
0 7372 C 5 1 2 - 136 ffmpeg
0 12176 C 6 2 4 7 782 ffmpeg
0 20035 C - - - - 1145 ffmpeg
0 20141 C 5 1 1 3 429 ffmpeg
0 29591 C 5 2 2 4 435 ffmpeg
0 7372 C - - 1 - 136 ffmpeg
0 12176 C 7 2 3 4 782 ffmpeg
0 20035 C 2 0 - 1 1145 ffmpeg
0 20141 C 7 2 4 4 429 ffmpeg
0 29591 C 5 1 2 3 435 ffmpeg
0 7372 C 2 0 1 - 136 ffmpeg
0 12176 C 9 3 3 6 782 ffmpeg
0 20035 C 2 1 - 1 1145 ffmpeg
0 20141 C 4 1 4 5 429 ffmpeg
0 29591 C 2 0 2 1 435 ffmpeg
0 7372 C - - - - 136 ffmpeg
0 12176 C 10 3 4 8 782 ffmpeg
0 20035 C 4 1 2 1 1145 ffmpeg
0 20141 C 7 2 3 3 429 ffmpeg
# gpu pid type sm mem enc dec fb command
# Idx # C/G % % % % MB name
0 29591 C - - 1 1 435 ffmpeg
0 7372 C 2 0 2 - 136 ffmpeg
0 12176 C 7 2 2 6 782 ffmpeg
0 20035 C 7 2 4 3 1145 ffmpeg
0 20141 C 5 1 1 3 429 ffmpeg
0 29591 C - - 1 1 435 ffmpeg

View File

@ -438,6 +438,18 @@
</supported_mem_clock>
</supported_clocks>
<processes>
<process_info>
<pid>10131</pid>
<type>C</type>
<process_name>ffmpeg</process_name>
<used_memory>389 MiB</used_memory>
</process_info>
<process_info>
<pid>13597</pid>
<type>C</type>
<process_name>ffmpeg</process_name>
<used_memory>1054 MiB</used_memory>
</process_info>
</processes>
<accounted_processes>
</accounted_processes>
@ -879,6 +891,12 @@
</supported_mem_clock>
</supported_clocks>
<processes>
<process_info>
<pid>16870</pid>
<type>C</type>
<process_name>ffmpeg</process_name>
<used_memory>549 MiB</used_memory>
</process_info>
</processes>
<accounted_processes>
</accounted_processes>

View File

@ -6,6 +6,9 @@ import (
"encoding/xml"
"fmt"
"os/exec"
"regexp"
"slices"
"strconv"
"sync"
"time"
@ -47,11 +50,19 @@ func (u *Utilization) UnmarshalText(text []byte) error {
}
type Process struct {
PID int32 `xml:"pid"`
Memory Megabytes `xml:"used_memory"`
Index int
PID int32
Memory uint64 // bytes
Usage float64 // percent 0-100
Encoder float64 // percent 0-100
Decoder float64 // percent 0-100
lastSeen time.Time
}
type GPUStats struct {
ID string `xml:"id,attr"`
Name string `xml:"product_name"`
Architecture string `xml:"product_architecture"`
@ -59,31 +70,17 @@ type GPUStats struct {
MemoryUsed Megabytes `xml:"fb_memory_usage>used"`
Usage Utilization `xml:"utilization>gpu_util"`
MemoryUsage Utilization `xml:"utilization>memory_util"`
EncoderUsage Utilization `xml:"utilization>encoder_util"`
DecoderUsage Utilization `xml:"utilization>decoder_util"`
Process []Process `xml:"processes>process_info"`
UsageEncoder Utilization `xml:"utilization>encoder_util"`
UsageDecoder Utilization `xml:"utilization>decoder_util"`
}
type Stats struct {
GPU []GPUStats `xml:"gpu"`
}
func parse(data []byte) (Stats, error) {
nv := Stats{}
err := xml.Unmarshal(data, &nv)
if err != nil {
return nv, fmt.Errorf("parsing report: %w", err)
}
return nv, nil
}
type nvidia struct {
cmd *exec.Cmd
wr *writer
wrQuery *writerQuery
wrProcess *writerProcess
lock sync.RWMutex
cancel context.CancelFunc
@ -97,33 +94,33 @@ type dummy struct{}
// The dummy implementation is used when no usable nvidia-smi binary is
// found. It reports "no GPUs" for all queries.

// Count reports zero GPUs.
func (d *dummy) Count() (int, error) { return 0, nil }

// Stats reports no GPU stats.
func (d *dummy) Stats() ([]gpu.Stats, error) { return nil, nil }

// Process always fails with ErrProcessNotFound. Index is set to -1 to
// mark "not on any GPU", consistent with the not-found result of the
// real nvidia implementation.
func (d *dummy) Process(pid int32) (gpu.Process, error) {
	return gpu.Process{Index: -1}, gpu.ErrProcessNotFound
}

// Close is a no-op; the dummy starts no collection processes.
func (d *dummy) Close() {}
type writer struct {
buf bytes.Buffer
ch chan Stats
// writerQuery consumes the XML stream produced by "nvidia-smi -q -x -l 1"
// and emits one Stats value per complete report.
type writerQuery struct {
	buf        bytes.Buffer // accumulates partial writes until a full report has arrived
	ch         chan Stats   // parsed reports are sent here
	terminator []byte       // byte sequence that marks the end of one XML report
}
var terminator = []byte("</nvidia_smi_log>\n")
func (w *writer) Write(data []byte) (int, error) {
func (w *writerQuery) Write(data []byte) (int, error) {
n, err := w.buf.Write(data)
if err != nil {
return n, err
}
for {
idx := bytes.Index(w.buf.Bytes(), terminator)
idx := bytes.Index(w.buf.Bytes(), w.terminator)
if idx == -1 {
break
}
content := make([]byte, idx+len(terminator))
content := make([]byte, idx+len(w.terminator))
n, err := w.buf.Read(content)
if err != nil || n != len(content) {
break
}
s, err := parse(content)
s, err := w.parse(content)
if err != nil {
continue
}
@ -134,19 +131,132 @@ func (w *writer) Write(data []byte) (int, error) {
return n, nil
}
// parse decodes one complete nvidia-smi XML report into a Stats value.
func (w *writerQuery) parse(data []byte) (Stats, error) {
	var nv Stats

	if err := xml.Unmarshal(data, &nv); err != nil {
		return nv, fmt.Errorf("parsing report: %w", err)
	}

	return nv, nil
}
// writerProcess consumes the line-based output of "nvidia-smi pmon" and
// emits one Process value per successfully parsed line.
type writerProcess struct {
	buf        bytes.Buffer   // accumulates partial writes until a full line has arrived
	ch         chan Process   // parsed per-process samples are sent here
	re         *regexp.Regexp // extracts the columns of one pmon line
	terminator []byte         // line terminator, i.e. "\n"
}
// Write implements io.Writer. Incoming data is appended to an internal
// buffer; every complete line (ended by w.terminator) is parsed and, on
// success, sent to w.ch. Lines that do not parse (comments, headers)
// are dropped silently.
// NOTE(review): the send to w.ch blocks when the channel is full — the
// consumer must keep draining it, or Write stalls the producing command.
func (w *writerProcess) Write(data []byte) (int, error) {
	// n is the value returned to the caller: the number of bytes
	// accepted into the buffer (the inner n below shadows it).
	n, err := w.buf.Write(data)
	if err != nil {
		return n, err
	}

	// Drain all complete lines currently in the buffer.
	for {
		idx := bytes.Index(w.buf.Bytes(), w.terminator)
		if idx == -1 {
			// No complete line left; keep the remainder buffered.
			break
		}

		// Consume the line including its terminator.
		content := make([]byte, idx+len(w.terminator))

		n, err := w.buf.Read(content)
		if err != nil || n != len(content) {
			break
		}

		s, err := w.parse(content)
		if err != nil {
			// Not a process line (comment/header); skip it.
			continue
		}

		w.ch <- s
	}

	return n, nil
}
// parse extracts one Process sample from a single pmon output line.
// Comment lines (starting with '#') and lines not matching w.re yield an
// error. In the percent columns, '-' means "no data" and maps to 0.
func (w *writerProcess) parse(data []byte) (Process, error) {
	var p Process

	if len(data) == 0 {
		return p, fmt.Errorf("empty line")
	}

	if data[0] == '#' {
		return p, fmt.Errorf("comment")
	}

	m := w.re.FindStringSubmatch(string(data))
	if m == nil {
		return p, fmt.Errorf("no matches found")
	}

	if len(m) != 7 {
		return p, fmt.Errorf("not the expected number of matches found")
	}

	// percent converts a pmon percent column to a float; '-' (no data)
	// and unparsable values both map to 0.
	percent := func(s string) float64 {
		if s[0] == '-' {
			return 0
		}

		v, err := strconv.ParseFloat(s, 64)
		if err != nil {
			return 0
		}

		return v
	}

	if v, err := strconv.ParseInt(m[1], 10, 0); err == nil {
		p.Index = int(v)
	}

	if v, err := strconv.ParseInt(m[2], 10, 32); err == nil {
		p.PID = int32(v)
	}

	p.Usage = percent(m[3])
	p.Encoder = percent(m[4])
	p.Decoder = percent(m[5])

	if v, err := strconv.ParseUint(m[6], 10, 64); err == nil {
		p.Memory = v * 1024 * 1024 // pmon reports framebuffer memory in MiB
	}

	return p, nil
}
func New(path string) gpu.GPU {
if len(path) == 0 {
path = "nvidia-smi"
}
_, err := exec.LookPath(path)
path, err := exec.LookPath(path)
if err != nil {
return &dummy{}
}
n := &nvidia{
wr: &writer{
ch: make(chan Stats, 1),
wrQuery: &writerQuery{
ch: make(chan Stats, 1),
terminator: []byte("</nvidia_smi_log>\n"),
},
wrProcess: &writerProcess{
ch: make(chan Process, 32),
// # gpu pid type sm mem enc dec fb command
// # Idx # C/G % % % % MB name
// 0 7372 C 2 0 2 - 136 ffmpeg
// 0 12176 C 5 2 3 7 782 ffmpeg
// 0 20035 C 8 2 4 1 1145 ffmpeg
// 0 20141 C 2 1 1 3 429 ffmpeg
// 0 29591 C 2 1 - 2 435 ffmpeg
re: regexp.MustCompile(`^\s*([0-9]+)\s+([0-9]+)\s+[A-Z]\s+([0-9-]+)\s+[0-9-]+\s+([0-9-]+)\s+([0-9-]+)\s+([0-9]+).*`),
terminator: []byte("\n"),
},
process: map[int32]Process{},
}
@ -154,7 +264,8 @@ func New(path string) gpu.GPU {
ctx, cancel := context.WithCancel(context.Background())
n.cancel = cancel
go n.runner(ctx, path)
go n.runnerQuery(ctx, path)
go n.runnerProcess(ctx, path)
go n.reader(ctx)
return n
@ -165,13 +276,18 @@ func (n *nvidia) reader(ctx context.Context) {
select {
case <-ctx.Done():
return
case stats := <-n.wr.ch:
case stats := <-n.wrQuery.ch:
n.lock.Lock()
n.stats = stats
n.process = map[int32]Process{}
for _, g := range n.stats.GPU {
for _, p := range g.Process {
n.process[p.PID] = p
n.lock.Unlock()
case process := <-n.wrProcess.ch:
process.lastSeen = time.Now()
n.lock.Lock()
n.process[process.PID] = process
for pid, p := range n.process {
if time.Since(p.lastSeen) > 11*time.Second {
delete(n.process, pid)
}
}
n.lock.Unlock()
@ -179,11 +295,11 @@ func (n *nvidia) reader(ctx context.Context) {
}
}
func (n *nvidia) runner(ctx context.Context, path string) {
func (n *nvidia) runnerQuery(ctx context.Context, path string) {
for {
n.cmd = exec.Command(path, "-q", "-x", "-l", "1")
n.cmd.Stdout = n.wr
err := n.cmd.Start()
cmd := exec.CommandContext(ctx, path, "-q", "-x", "-l", "1")
cmd.Stdout = n.wrQuery
err := cmd.Start()
if err != nil {
n.lock.Lock()
n.err = err
@ -193,7 +309,35 @@ func (n *nvidia) runner(ctx context.Context, path string) {
continue
}
err = n.cmd.Wait()
err = cmd.Wait()
n.lock.Lock()
n.err = err
n.lock.Unlock()
select {
case <-ctx.Done():
return
default:
}
}
}
func (n *nvidia) runnerProcess(ctx context.Context, path string) {
for {
cmd := exec.CommandContext(ctx, path, "pmon", "-s", "um", "-d", "5")
cmd.Stdout = n.wrProcess
err := cmd.Start()
if err != nil {
n.lock.Lock()
n.err = err
n.lock.Unlock()
time.Sleep(3 * time.Second)
continue
}
err = cmd.Wait()
n.lock.Lock()
n.err = err
@ -219,39 +363,55 @@ func (n *nvidia) Count() (int, error) {
}
func (n *nvidia) Stats() ([]gpu.Stats, error) {
s := []gpu.Stats{}
stats := []gpu.Stats{}
n.lock.RLock()
defer n.lock.RUnlock()
if n.err != nil {
return s, n.err
return stats, n.err
}
for _, nv := range n.stats.GPU {
stats := gpu.Stats{
s := gpu.Stats{
ID: nv.ID,
Name: nv.Name,
Architecture: nv.Architecture,
MemoryTotal: uint64(nv.MemoryTotal),
MemoryUsed: uint64(nv.MemoryUsed),
Usage: float64(nv.Usage),
MemoryUsage: float64(nv.MemoryUsage),
EncoderUsage: float64(nv.EncoderUsage),
DecoderUsage: float64(nv.DecoderUsage),
Encoder: float64(nv.UsageEncoder),
Decoder: float64(nv.UsageDecoder),
Process: []gpu.Process{},
}
for _, p := range nv.Process {
stats.Process = append(stats.Process, gpu.Process{
PID: p.PID,
Memory: uint64(p.Memory),
})
}
s = append(s, stats)
stats = append(stats, s)
}
return s, nil
for _, p := range n.process {
if p.Index >= len(stats) {
continue
}
stats[p.Index].Process = append(stats[p.Index].Process, gpu.Process{
PID: p.PID,
Index: p.Index,
Memory: p.Memory,
Usage: p.Usage,
Encoder: p.Encoder,
Decoder: p.Decoder,
})
}
for i := range stats {
p := stats[i].Process
slices.SortFunc(p, func(a, b gpu.Process) int {
return int(a.PID - b.PID)
})
stats[i].Process = p
}
return stats, nil
}
func (n *nvidia) Process(pid int32) (gpu.Process, error) {
@ -259,14 +419,18 @@ func (n *nvidia) Process(pid int32) (gpu.Process, error) {
defer n.lock.RUnlock()
p, hasProcess := n.process[pid]
if !hasProcess {
return gpu.Process{}, gpu.ErrProcessNotFound
if hasProcess {
return gpu.Process{
PID: p.PID,
Index: p.Index,
Memory: p.Memory,
Usage: p.Usage,
Encoder: p.Encoder,
Decoder: p.Decoder,
}, nil
}
return gpu.Process{
PID: p.PID,
Memory: uint64(p.Memory),
}, nil
return gpu.Process{Index: -1}, gpu.ErrProcessNotFound
}
func (n *nvidia) Close() {
@ -279,6 +443,4 @@ func (n *nvidia) Close() {
n.cancel()
n.cancel = nil
n.cmd.Process.Kill()
}

View File

@ -1,102 +1,430 @@
package nvidia
import (
"bytes"
"os"
"regexp"
"sync"
"testing"
"time"
"github.com/datarhei/core/v16/internal/testhelper"
"github.com/datarhei/core/v16/psutil/gpu"
"github.com/stretchr/testify/require"
)
func TestParseNV(t *testing.T) {
data, err := os.ReadFile("./fixtures/data1.xml")
func TestParseQuery(t *testing.T) {
data, err := os.ReadFile("./fixtures/query1.xml")
require.NoError(t, err)
nv, err := parse(data)
wr := &writerQuery{}
nv, err := wr.parse(data)
require.NoError(t, err)
require.Equal(t, Stats{
GPU: []GPUStats{
{
ID: "00000000:01:00.0",
Name: "NVIDIA GeForce GTX 1080",
Architecture: "Pascal",
MemoryTotal: 8119 * 1024 * 1024,
MemoryUsed: 918 * 1024 * 1024,
Usage: 15,
MemoryUsage: 7,
EncoderUsage: 3,
DecoderUsage: 0,
Process: []Process{
{
PID: 18179,
Memory: 916 * 1024 * 1024,
},
},
UsageEncoder: 3,
UsageDecoder: 0,
},
},
}, nv)
data, err = os.ReadFile("./fixtures/data2.xml")
data, err = os.ReadFile("./fixtures/query2.xml")
require.NoError(t, err)
nv, err = parse(data)
nv, err = wr.parse(data)
require.NoError(t, err)
require.Equal(t, Stats{
GPU: []GPUStats{
{
ID: "00000000:01:00.0",
Name: "NVIDIA L4",
Architecture: "Ada Lovelace",
MemoryTotal: 23034 * 1024 * 1024,
MemoryUsed: 1 * 1024 * 1024,
Usage: 2,
MemoryUsage: 0,
EncoderUsage: 0,
DecoderUsage: 0,
UsageEncoder: 0,
UsageDecoder: 0,
},
{
ID: "00000000:C1:00.0",
Name: "NVIDIA L4",
Architecture: "Ada Lovelace",
MemoryTotal: 23034 * 1024 * 1024,
MemoryUsed: 1 * 1024 * 1024,
Usage: 3,
MemoryUsage: 0,
EncoderUsage: 0,
DecoderUsage: 0,
UsageEncoder: 0,
UsageDecoder: 0,
},
},
}, nv)
data, err = os.ReadFile("./fixtures/data3.xml")
data, err = os.ReadFile("./fixtures/query3.xml")
require.NoError(t, err)
nv, err = parse(data)
nv, err = wr.parse(data)
require.NoError(t, err)
require.Equal(t, Stats{
GPU: []GPUStats{
{
ID: "00000000:01:00.0",
Name: "GeForce GTX 1080",
MemoryTotal: 8119 * 1024 * 1024,
MemoryUsed: 2006 * 1024 * 1024,
Usage: 32,
MemoryUsage: 11,
EncoderUsage: 17,
DecoderUsage: 25,
Process: []Process{
{
PID: 10131,
Memory: 389 * 1024 * 1024,
},
{
PID: 13597,
Memory: 1054 * 1024 * 1024,
},
{
PID: 16870,
Memory: 549 * 1024 * 1024,
},
},
UsageEncoder: 17,
UsageDecoder: 25,
},
},
}, nv)
}
// TestParseProcess feeds every line of a recorded "nvidia-smi pmon"
// fixture through writerProcess.parse. For each PID, later lines
// overwrite earlier ones, so the expectation is the last parsable
// sample per PID in the fixture.
func TestParseProcess(t *testing.T) {
	data, err := os.ReadFile("./fixtures/process.txt")
	require.NoError(t, err)

	wr := &writerProcess{
		re: regexp.MustCompile(`^\s*([0-9]+)\s+([0-9]+)\s+[A-Z]\s+([0-9-]+)\s+[0-9-]+\s+([0-9-]+)\s+([0-9-]+)\s+([0-9]+).*`),
	}

	lines := bytes.Split(data, []byte("\n"))

	process := map[int32]Process{}

	for _, line := range lines {
		p, err := wr.parse(line)
		if err != nil {
			// Comment/header lines are expected to fail parsing.
			continue
		}

		process[p.PID] = p
	}

	require.Equal(t, map[int32]Process{
		7372: {
			Index:   0,
			PID:     7372,
			Memory:  136 * 1024 * 1024,
			Usage:   2,
			Encoder: 2,
			Decoder: 0,
		},
		12176: {
			Index:   0,
			PID:     12176,
			Memory:  782 * 1024 * 1024,
			Usage:   7,
			Encoder: 2,
			Decoder: 6,
		},
		20035: {
			Index:   0,
			PID:     20035,
			Memory:  1145 * 1024 * 1024,
			Usage:   7,
			Encoder: 4,
			Decoder: 3,
		},
		20141: {
			Index:   0,
			PID:     20141,
			Memory:  429 * 1024 * 1024,
			Usage:   5,
			Encoder: 1,
			Decoder: 3,
		},
		29591: {
			Index:   0,
			PID:     29591,
			Memory:  435 * 1024 * 1024,
			Usage:   0,
			Encoder: 1,
			Decoder: 1,
		},
	}, process)
}
// TestWriterQuery writes a full XML fixture into writerQuery in one Write
// call and checks that the parsed Stats arrive on the channel.
// NOTE(review): the terminator here omits the trailing "\n" used in
// production (New sets "</nvidia_smi_log>\n") — presumably so the test
// does not depend on the fixture's trailing newline; confirm intended.
func TestWriterQuery(t *testing.T) {
	data, err := os.ReadFile("./fixtures/query2.xml")
	require.NoError(t, err)

	wr := &writerQuery{
		ch:         make(chan Stats, 1),
		terminator: []byte("</nvidia_smi_log>"),
	}

	stats := Stats{}

	// Drain the channel concurrently; the last received report wins.
	// wg.Wait() after close(wr.ch) makes the final write to stats
	// happen-before the assertion below.
	wg := sync.WaitGroup{}
	wg.Add(1)

	go func() {
		defer wg.Done()

		for s := range wr.ch {
			stats = s
		}
	}()

	_, err = wr.Write(data)
	require.NoError(t, err)

	close(wr.ch)

	wg.Wait()

	require.Equal(t, Stats{
		GPU: []GPUStats{
			{
				ID:           "00000000:01:00.0",
				Name:         "NVIDIA L4",
				Architecture: "Ada Lovelace",
				MemoryTotal:  23034 * 1024 * 1024,
				MemoryUsed:   1 * 1024 * 1024,
				Usage:        2,
				UsageEncoder: 0,
				UsageDecoder: 0,
			},
			{
				ID:           "00000000:C1:00.0",
				Name:         "NVIDIA L4",
				Architecture: "Ada Lovelace",
				MemoryTotal:  23034 * 1024 * 1024,
				MemoryUsed:   1 * 1024 * 1024,
				Usage:        3,
				UsageEncoder: 0,
				UsageDecoder: 0,
			},
		},
	}, stats)
}
// TestWriterProcess writes a full pmon fixture into writerProcess in one
// Write call and checks that the per-PID samples received on the channel
// match the last parsable line per PID in the fixture.
func TestWriterProcess(t *testing.T) {
	data, err := os.ReadFile("./fixtures/process.txt")
	require.NoError(t, err)

	wr := &writerProcess{
		ch:         make(chan Process, 32),
		re:         regexp.MustCompile(`^\s*([0-9]+)\s+([0-9]+)\s+[A-Z]\s+([0-9-]+)\s+[0-9-]+\s+([0-9-]+)\s+([0-9-]+)\s+([0-9]+).*`),
		terminator: []byte("\n"),
	}

	process := map[int32]Process{}

	// Drain the channel concurrently; wg.Wait() after close(wr.ch)
	// synchronizes the map writes with the assertion below.
	wg := sync.WaitGroup{}
	wg.Add(1)

	go func() {
		defer wg.Done()

		for p := range wr.ch {
			process[p.PID] = p
		}
	}()

	_, err = wr.Write(data)
	require.NoError(t, err)

	close(wr.ch)

	wg.Wait()

	require.Equal(t, map[int32]Process{
		7372: {
			Index:   0,
			PID:     7372,
			Memory:  136 * 1024 * 1024,
			Usage:   2,
			Encoder: 2,
			Decoder: 0,
		},
		12176: {
			Index:   0,
			PID:     12176,
			Memory:  782 * 1024 * 1024,
			Usage:   7,
			Encoder: 2,
			Decoder: 6,
		},
		20035: {
			Index:   0,
			PID:     20035,
			Memory:  1145 * 1024 * 1024,
			Usage:   7,
			Encoder: 4,
			Decoder: 3,
		},
		20141: {
			Index:   0,
			PID:     20141,
			Memory:  429 * 1024 * 1024,
			Usage:   5,
			Encoder: 1,
			Decoder: 3,
		},
		29591: {
			Index:   0,
			PID:     29591,
			Memory:  435 * 1024 * 1024,
			Usage:   0,
			Encoder: 1,
			Decoder: 1,
		},
	}, process)
}
// TestNvidiaGPUCount checks that the collector, backed by the fake
// nvidia-smi helper binary, eventually reports a non-zero GPU count.
func TestNvidiaGPUCount(t *testing.T) {
	binary, err := testhelper.BuildBinary("nvidia-smi", "../../../internal/testhelper")
	require.NoError(t, err, "Failed to build helper program")

	nv := New(binary)
	t.Cleanup(func() {
		nv.Close()
	})

	// New must not have fallen back to the dummy implementation.
	_, ok := nv.(*dummy)
	require.False(t, ok)

	// Stats are collected asynchronously; poll until the count shows up.
	require.Eventually(t, func() bool {
		count, _ := nv.Count()
		return count != 0
	}, 5*time.Second, time.Second)
}
// TestNvidiaGPUStats checks that the collector, backed by the fake
// nvidia-smi helper binary, eventually merges query stats (two GPUs)
// with per-process pmon samples (3 + 2 processes) and reports the
// expected combined result, with processes sorted by PID.
func TestNvidiaGPUStats(t *testing.T) {
	binary, err := testhelper.BuildBinary("nvidia-smi", "../../../internal/testhelper")
	require.NoError(t, err, "Failed to build helper program")

	nv := New(binary)
	t.Cleanup(func() {
		nv.Close()
	})

	// New must not have fallen back to the dummy implementation.
	_, ok := nv.(*dummy)
	require.False(t, ok)

	// Query and process data arrive asynchronously; wait until both
	// GPUs and all expected processes have been observed.
	require.Eventually(t, func() bool {
		stats, _ := nv.Stats()
		if len(stats) != 2 {
			return false
		}

		if len(stats[0].Process) != 3 {
			return false
		}

		if len(stats[1].Process) != 2 {
			return false
		}

		return true
	}, 5*time.Second, time.Second)

	stats, err := nv.Stats()
	require.NoError(t, err)

	require.Equal(t, []gpu.Stats{
		{
			ID:           "00000000:01:00.0",
			Name:         "NVIDIA L4",
			Architecture: "Ada Lovelace",
			MemoryTotal:  23034 * 1024 * 1024,
			MemoryUsed:   1 * 1024 * 1024,
			Usage:        2,
			Encoder:      0,
			Decoder:      0,
			Process: []gpu.Process{
				{
					Index:   0,
					PID:     7372,
					Memory:  136 * 1024 * 1024,
					Usage:   2,
					Encoder: 2,
					Decoder: 0,
				},
				{
					Index:   0,
					PID:     12176,
					Memory:  782 * 1024 * 1024,
					Usage:   5,
					Encoder: 3,
					Decoder: 7,
				},
				{
					Index:   0,
					PID:     29591,
					Memory:  435 * 1024 * 1024,
					Usage:   2,
					Encoder: 0,
					Decoder: 2,
				},
			},
		},
		{
			ID:           "00000000:C1:00.0",
			Name:         "NVIDIA L4",
			Architecture: "Ada Lovelace",
			MemoryTotal:  23034 * 1024 * 1024,
			MemoryUsed:   1 * 1024 * 1024,
			Usage:        3,
			Encoder:      0,
			Decoder:      0,
			Process: []gpu.Process{
				{
					Index:   1,
					PID:     20035,
					Memory:  1145 * 1024 * 1024,
					Usage:   8,
					Encoder: 4,
					Decoder: 1,
				},
				{
					Index:   1,
					PID:     20141,
					Memory:  429 * 1024 * 1024,
					Usage:   2,
					Encoder: 1,
					Decoder: 3,
				},
			},
		},
	}, stats)
}
// TestNvidiaGPUProcess checks that a single process known to the fake
// nvidia-smi helper can be looked up by PID and that its sample matches
// the helper's data.
func TestNvidiaGPUProcess(t *testing.T) {
	binary, err := testhelper.BuildBinary("nvidia-smi", "../../../internal/testhelper")
	require.NoError(t, err, "Failed to build helper program")

	nv := New(binary)
	t.Cleanup(func() {
		nv.Close()
	})

	// New must not have fallen back to the dummy implementation.
	_, ok := nv.(*dummy)
	require.False(t, ok)

	// pmon samples arrive asynchronously; wait until the PID is known.
	require.Eventually(t, func() bool {
		_, err := nv.Process(12176)
		return err == nil
	}, 5*time.Second, time.Second)

	proc, err := nv.Process(12176)
	require.NoError(t, err)

	require.Equal(t, gpu.Process{
		Index:   0,
		PID:     12176,
		Memory:  782 * 1024 * 1024,
		Usage:   5,
		Encoder: 3,
		Decoder: 7,
	}, proc)
}

View File

@ -5,24 +5,28 @@ import (
"sync"
"time"
"github.com/datarhei/core/v16/psutil/gpu/nvidia"
psprocess "github.com/shirou/gopsutil/v3/process"
)
type Process interface {
// CPUPercent returns the current CPU load for this process only. The values
// CPU returns the current CPU load for this process only. The values
// are normed to the range of 0 to 100.
CPUPercent() (*CPUInfoStat, error)
CPU() (*CPUInfo, error)
// VirtualMemory returns the current memory usage in bytes of this process only.
VirtualMemory() (uint64, error)
// Memory returns the current memory usage in bytes of this process only.
Memory() (uint64, error)
// GPU returns the current GPU memory in bytes and usage in percent (0-100) of this process only.
GPU() (*GPUInfo, error)
// Stop will stop collecting CPU and memory data for this process.
Stop()
// Suspend will send SIGSTOP to the process
// Suspend will send SIGSTOP to the process.
Suspend() error
// Resume will send SIGCONT to the process
// Resume will send SIGCONT to the process.
Resume() error
}
@ -142,7 +146,7 @@ func (p *process) Resume() error {
return p.proc.Resume()
}
func (p *process) CPUPercent() (*CPUInfoStat, error) {
func (p *process) CPU() (*CPUInfo, error) {
var diff float64
for {
@ -167,7 +171,7 @@ func (p *process) CPUPercent() (*CPUInfoStat, error) {
diff = p.statCurrentTime.Sub(p.statPreviousTime).Seconds() * p.ncpu
}
s := &CPUInfoStat{
s := &CPUInfo{
System: 0,
User: 0,
Idle: 0,
@ -186,9 +190,28 @@ func (p *process) CPUPercent() (*CPUInfoStat, error) {
return s, nil
}
func (p *process) VirtualMemory() (uint64, error) {
func (p *process) Memory() (uint64, error) {
p.lock.RLock()
defer p.lock.RUnlock()
return p.memRSS, nil
}
// GPU returns the current GPU usage of this process only, as reported
// by the NVIDIA wrapper. If the pid has no GPU process entry (or no
// NVIDIA GPU is available), a GPUInfo with Index -1 and zero values is
// returned without an error — "not on a GPU" is not a failure.
func (p *process) GPU() (*GPUInfo, error) {
	info := &GPUInfo{
		Index: -1,
	}

	// An error here means the driver doesn't know this pid; treat it
	// as "process uses no GPU" rather than propagating the error.
	proc, err := nvidia.Default.Process(p.pid)
	if err != nil {
		return info, nil
	}

	info.Index = proc.Index
	info.MemoryUsed = proc.Memory
	info.Usage = proc.Usage
	info.Encoder = proc.Encoder
	info.Decoder = proc.Decoder

	return info, nil
}

View File

@ -47,29 +47,44 @@ func init() {
DefaultUtil, _ = New("/sys/fs/cgroup")
}
type MemoryInfoStat struct {
type DiskInfo struct {
Path string
Fstype string
Total uint64
Used uint64
InodesTotal uint64
InodesUsed uint64
}
type MemoryInfo struct {
Total uint64 // bytes
Available uint64 // bytes
Used uint64 // bytes
}
type CPUInfoStat struct {
type NetworkInfo struct {
Name string // interface name
BytesSent uint64 // number of bytes sent
BytesRecv uint64 // number of bytes received
}
type CPUInfo struct {
System float64 // percent 0-100
User float64 // percent 0-100
Idle float64 // percent 0-100
Other float64 // percent 0-100
}
type GPUInfoStat struct {
Name string
type GPUInfo struct {
Index int // Index of the GPU
Name string // Name of the GPU (not populated for a specific process)
MemoryTotal uint64 // bytes
MemoryTotal uint64 // bytes (not populated for a specific process)
MemoryUsed uint64 // bytes
Usage float64 // percent 0-100
MemoryUsage float64 // percent 0-100
EncoderUsage float64 // percent 0-100
DecoderUsage float64 // percent 0-100
Usage float64 // percent 0-100
Encoder float64 // percent 0-100
Decoder float64 // percent 0-100
}
type cpuTimesStat struct {
@ -85,18 +100,23 @@ type Util interface {
Stop()
// CPUCounts returns the number of cores, either logical or physical.
CPUCounts(logical bool) (float64, error)
CPUCounts() (float64, error)
// GPUCounts returns the number of GPU cores.
GPUCounts() (float64, error)
// CPUPercent returns the current CPU load in percent. The values range
// CPU returns the current CPU load in percent. The values range
// from 0 to 100, independently of the number of logical cores.
CPUPercent() (*CPUInfoStat, error)
DiskUsage(path string) (*disk.UsageStat, error)
VirtualMemory() (*MemoryInfoStat, error)
NetIOCounters(pernic bool) ([]net.IOCountersStat, error)
GPUStats() ([]GPUInfoStat, error)
CPU() (*CPUInfo, error)
// Disk returns the current usage of the partition specified by the path.
Disk(path string) (*DiskInfo, error)
// Memory return the current memory usage.
Memory() (*MemoryInfo, error)
// Network returns the current network interface statistics per network adapter.
Network() ([]NetworkInfo, error)
// GPU return the current usage for each CPU.
GPU() ([]GPUInfo, error)
// Process returns a process observer for a process with the given pid.
Process(pid int32) (Process, error)
@ -120,7 +140,7 @@ type util struct {
statPrevious cpuTimesStat
statPreviousTime time.Time
nTicks uint64
mem MemoryInfoStat
mem MemoryInfo
}
// New returns a new util, it will be started automatically
@ -140,7 +160,7 @@ func New(root string) (Util, error) {
if u.ncpu == 0 {
var err error
u.ncpu, err = u.CPUCounts(true)
u.ncpu, err = u.CPUCounts()
if err != nil {
return nil, err
}
@ -311,7 +331,7 @@ func (u *util) tickMemory(ctx context.Context, interval time.Duration) {
}
}
func (u *util) collectMemory() *MemoryInfoStat {
func (u *util) collectMemory() *MemoryInfo {
stat, err := u.virtualMemory()
if err != nil {
return nil
@ -320,12 +340,12 @@ func (u *util) collectMemory() *MemoryInfoStat {
return stat
}
func (u *util) CPUCounts(logical bool) (float64, error) {
func (u *util) CPUCounts() (float64, error) {
if u.hasCgroup && u.ncpu > 0 {
return u.ncpu, nil
}
ncpu, err := cpu.Counts(logical)
ncpu, err := cpu.Counts(true)
if err != nil {
return 0, err
}
@ -333,18 +353,8 @@ func (u *util) CPUCounts(logical bool) (float64, error) {
return float64(ncpu), nil
}
func CPUCounts(logical bool) (float64, error) {
return DefaultUtil.CPUCounts(logical)
}
func (u *util) GPUCounts() (float64, error) {
count, err := nvidia.Default.Count()
return float64(count), err
}
func GPUCounts() (float64, error) {
return DefaultUtil.GPUCounts()
func CPUCounts() (float64, error) {
return DefaultUtil.CPUCounts()
}
// cpuTimes returns the current cpu usage times in seconds.
@ -381,7 +391,7 @@ func (u *util) cpuTimes() (*cpuTimesStat, error) {
return s, nil
}
func (u *util) CPUPercent() (*CPUInfoStat, error) {
func (u *util) CPU() (*CPUInfo, error) {
var total float64
for {
@ -406,7 +416,7 @@ func (u *util) CPUPercent() (*CPUInfoStat, error) {
total = (u.statCurrent.total - u.statPrevious.total)
}
s := &CPUInfoStat{
s := &CPUInfo{
System: 0,
User: 0,
Idle: 100,
@ -429,8 +439,8 @@ func (u *util) CPUPercent() (*CPUInfoStat, error) {
return s, nil
}
func CPUPercent() (*CPUInfoStat, error) {
return DefaultUtil.CPUPercent()
func CPUPercent() (*CPUInfo, error) {
return DefaultUtil.CPU()
}
func (u *util) cgroupCPUTimes(version int) (*cpuTimesStat, error) {
@ -466,15 +476,29 @@ func (u *util) cgroupCPUTimes(version int) (*cpuTimesStat, error) {
return info, nil
}
func (u *util) DiskUsage(path string) (*disk.UsageStat, error) {
return disk.Usage(path)
// Disk returns the current usage of the partition the given path
// resides on, translated into a DiskInfo.
func (u *util) Disk(path string) (*DiskInfo, error) {
	usage, err := disk.Usage(path)
	if err != nil {
		return nil, err
	}

	return &DiskInfo{
		Path:        usage.Path,
		Fstype:      usage.Fstype,
		Total:       usage.Total,
		Used:        usage.Used,
		InodesTotal: usage.InodesTotal,
		InodesUsed:  usage.InodesUsed,
	}, nil
}
func DiskUsage(path string) (*disk.UsageStat, error) {
return DefaultUtil.DiskUsage(path)
// Disk returns the current usage of the partition specified by the
// path, using the package-wide DefaultUtil.
func Disk(path string) (*DiskInfo, error) {
	return DefaultUtil.Disk(path)
}
func (u *util) virtualMemory() (*MemoryInfoStat, error) {
func (u *util) virtualMemory() (*MemoryInfo, error) {
info, err := mem.VirtualMemory()
if err != nil {
return nil, err
@ -489,18 +513,18 @@ func (u *util) virtualMemory() (*MemoryInfoStat, error) {
}
}
return &MemoryInfoStat{
return &MemoryInfo{
Total: info.Total,
Available: info.Available,
Used: info.Used,
}, nil
}
func (u *util) VirtualMemory() (*MemoryInfoStat, error) {
func (u *util) Memory() (*MemoryInfo, error) {
u.lock.RLock()
defer u.lock.RUnlock()
stat := &MemoryInfoStat{
stat := &MemoryInfo{
Total: u.mem.Total,
Available: u.mem.Available,
Used: u.mem.Used,
@ -509,12 +533,12 @@ func (u *util) VirtualMemory() (*MemoryInfoStat, error) {
return stat, nil
}
func VirtualMemory() (*MemoryInfoStat, error) {
return DefaultUtil.VirtualMemory()
func Memory() (*MemoryInfo, error) {
return DefaultUtil.Memory()
}
func (u *util) cgroupVirtualMemory(version int) (*MemoryInfoStat, error) {
info := &MemoryInfoStat{}
func (u *util) cgroupVirtualMemory(version int) (*MemoryInfo, error) {
info := &MemoryInfo{}
if version == 1 {
lines, err := u.readFile("memory/memory.limit_in_bytes")
@ -569,12 +593,27 @@ func (u *util) cgroupVirtualMemory(version int) (*MemoryInfoStat, error) {
return info, nil
}
func (u *util) NetIOCounters(pernic bool) ([]net.IOCountersStat, error) {
return net.IOCounters(pernic)
// Network returns the current network I/O statistics, one entry per
// network interface.
func (u *util) Network() ([]NetworkInfo, error) {
	counters, err := net.IOCounters(true)
	if err != nil {
		return nil, err
	}

	info := make([]NetworkInfo, 0, len(counters))

	for _, c := range counters {
		info = append(info, NetworkInfo{
			Name:      c.Name,
			BytesSent: c.BytesSent,
			BytesRecv: c.BytesRecv,
		})
	}

	return info, nil
}
func NetIOCounters(pernic bool) ([]net.IOCountersStat, error) {
return DefaultUtil.NetIOCounters(pernic)
// Network returns the current network interface statistics per
// adapter, using the package-wide DefaultUtil.
func Network() ([]NetworkInfo, error) {
	return DefaultUtil.Network()
}
func (u *util) readFile(path string) ([]string, error) {
@ -613,29 +652,28 @@ func cpuTotal(c *cpu.TimesStat) float64 {
c.Softirq + c.Steal + c.Guest + c.GuestNice
}
func (u *util) GPUStats() ([]GPUInfoStat, error) {
func (u *util) GPU() ([]GPUInfo, error) {
nvstats, err := nvidia.Default.Stats()
if err != nil {
return nil, err
}
stats := []GPUInfoStat{}
stats := []GPUInfo{}
for _, nv := range nvstats {
stats = append(stats, GPUInfoStat{
Name: nv.Name,
MemoryTotal: nv.MemoryTotal,
MemoryUsed: nv.MemoryUsed,
Usage: nv.Usage,
MemoryUsage: nv.MemoryUsage,
EncoderUsage: nv.EncoderUsage,
DecoderUsage: nv.DecoderUsage,
stats = append(stats, GPUInfo{
Name: nv.Name,
MemoryTotal: nv.MemoryTotal,
MemoryUsed: nv.MemoryUsed,
Usage: nv.Usage,
Encoder: nv.Encoder,
Decoder: nv.Decoder,
})
}
return stats, nil
}
func GPUStats() ([]GPUInfoStat, error) {
return DefaultUtil.GPUStats()
// GPU returns the current usage of each GPU, using the package-wide
// DefaultUtil.
func GPU() ([]GPUInfo, error) {
	return DefaultUtil.GPU()
}

View File

@ -9,11 +9,13 @@ import (
"github.com/datarhei/core/v16/log"
"github.com/datarhei/core/v16/psutil"
"github.com/datarhei/core/v16/slices"
)
type Info struct {
Mem MemoryInfo
CPU CPUInfo
GPU GPUInfo
}
type MemoryInfo struct {
@ -38,6 +40,44 @@ type CPUInfo struct {
Error error
}
// GPUInfo aggregates the usage of all GPUs on this node.
type GPUInfo struct {
	NGPU float64       // Number of GPUs
	GPU  []GPUInfoStat // Per-GPU usage statistics

	Error error // Error from collecting the GPU statistics, if any
}
// GPUInfoStat describes the current usage and configured limits of a
// single GPU.
type GPUInfoStat struct {
	Index int    // Index of the GPU
	Name  string // Model name of the GPU

	// Memory
	MemoryTotal     uint64 // bytes
	MemoryUsed      uint64 // bytes
	MemoryAvailable uint64 // bytes, MemoryTotal - MemoryUsed
	MemoryLimit     uint64 // bytes, derived from the configured GPU memory limit

	// GPU
	Usage      float64 // percent 0-100
	Encoder    float64 // percent 0-100
	Decoder    float64 // percent 0-100
	UsageLimit float64 // percent 0-100, the configured usage limit

	Throttling bool // Whether this GPU is currently being limited
}
// Request describes the resources a caller wants to acquire. CPU and
// Memory are mandatory; the GPU fields are optional and only
// considered if at least one of them is non-zero.
type Request struct {
	CPU        float64 // percent 0-100*ncpu
	Memory     uint64  // bytes
	GPUUsage   float64 // percent 0-100
	GPUEncoder float64 // percent 0-100
	GPUDecoder float64 // percent 0-100
	GPUMemory  uint64  // bytes
}
// Response is the result of a granted resource request.
type Response struct {
	GPU int // GPU number, hwdevice; -1 if no GPU was selected
}
type resources struct {
psutil psutil.Util
@ -45,9 +85,14 @@ type resources struct {
maxCPU float64 // percent 0-100*ncpu
maxMemory uint64 // bytes
ngpu int
maxGPU float64 // general usage, percent 0-100
maxGPUMemory float64 // memory usage, percent 0-100
isUnlimited bool
isCPULimiting bool
isMemoryLimiting bool
isGPULimiting []bool
self psutil.Process
@ -67,30 +112,46 @@ type Resources interface {
// HasLimits returns whether any limits have been set.
HasLimits() bool
// Limits returns the CPU (percent 0-100) and memory (bytes) limits.
Limits() (float64, uint64)
// Limits returns the CPU (percent 0-100), memory (bytes) limits, and GPU limits (usage and memory each in percent 0-100).
Limits() (float64, uint64, float64, float64)
// ShouldLimit returns whether cpu and/or memory is currently limited.
ShouldLimit() (bool, bool)
// ShouldLimit returns whether cpu, memory, and/or GPU is currently limited.
ShouldLimit() (bool, bool, []bool)
// Request checks whether the requested resources are available.
Request(cpu float64, memory uint64) error
Request(req Request) (Response, error)
// Info returns the current resource usage
// Info returns the current resource usage.
Info() Info
}
type Config struct {
MaxCPU float64 // percent 0-100
MaxMemory float64 // percent 0-100
PSUtil psutil.Util
Logger log.Logger
MaxCPU float64 // percent 0-100
MaxMemory float64 // percent 0-100
MaxGPU float64 // general,encoder,decoder usage, percent 0-100
MaxGPUMemory float64 // memory usage, percent 0-100
PSUtil psutil.Util
Logger log.Logger
}
func New(config Config) (Resources, error) {
if config.PSUtil == nil {
config.PSUtil = psutil.DefaultUtil
}
gpu, err := config.PSUtil.GPU()
if err != nil {
return nil, fmt.Errorf("unable to determine number of GPUs: %w", err)
}
if len(gpu) == 0 {
config.MaxGPU = 0
config.MaxGPUMemory = 0
}
isUnlimited := false
if config.MaxCPU <= 0 && config.MaxMemory <= 0 {
if config.MaxCPU <= 0 && config.MaxMemory <= 0 && config.MaxGPU <= 0 && config.MaxGPUMemory <= 0 {
isUnlimited = true
}
@ -102,31 +163,39 @@ func New(config Config) (Resources, error) {
config.MaxMemory = 100
}
if config.MaxCPU > 100 || config.MaxMemory > 100 {
return nil, fmt.Errorf("both MaxCPU and MaxMemory must have a range of 0-100")
if config.MaxGPU <= 0 {
config.MaxGPU = 100
}
if config.MaxGPUMemory <= 0 {
config.MaxGPUMemory = 100
}
if config.MaxCPU > 100 || config.MaxMemory > 100 || config.MaxGPU > 100 || config.MaxGPUMemory > 100 {
return nil, fmt.Errorf("all Max... values must have a range of 0-100")
}
r := &resources{
maxCPU: config.MaxCPU,
psutil: config.PSUtil,
isUnlimited: isUnlimited,
logger: config.Logger,
maxCPU: config.MaxCPU,
maxGPU: config.MaxGPU,
maxGPUMemory: config.MaxGPUMemory,
psutil: config.PSUtil,
isUnlimited: isUnlimited,
ngpu: len(gpu),
isGPULimiting: make([]bool, len(gpu)),
logger: config.Logger,
}
if r.logger == nil {
r.logger = log.New("")
}
if r.psutil == nil {
r.psutil = psutil.DefaultUtil
}
vmstat, err := r.psutil.VirtualMemory()
vmstat, err := r.psutil.Memory()
if err != nil {
return nil, fmt.Errorf("unable to determine available memory: %w", err)
}
ncpu, err := r.psutil.CPUCounts(true)
ncpu, err := r.psutil.CPUCounts()
if err != nil {
return nil, fmt.Errorf("unable to determine number of logical CPUs: %w", err)
}
@ -137,12 +206,15 @@ func New(config Config) (Resources, error) {
r.maxMemory = uint64(float64(vmstat.Total) * config.MaxMemory / 100)
r.logger = r.logger.WithFields(log.Fields{
"ncpu": r.ncpu,
"max_cpu": r.maxCPU,
"max_memory": r.maxMemory,
"ncpu": r.ncpu,
"max_cpu": r.maxCPU,
"max_memory": r.maxMemory,
"ngpu": len(gpu),
"max_gpu": r.maxGPU,
"max_gpu_memory": r.maxGPUMemory,
})
r.self, err = psutil.NewProcess(int32(os.Getpid()), false)
r.self, err = r.psutil.Process(int32(os.Getpid()))
if err != nil {
return nil, fmt.Errorf("unable to create process observer for self: %w", err)
}
@ -189,7 +261,12 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) {
case <-ctx.Done():
return
case <-ticker.C:
cpustat, err := r.psutil.CPUPercent()
if r.isUnlimited {
// If there aren't any limits imposed, don't do anything
continue
}
cpustat, err := r.psutil.CPU()
if err != nil {
r.logger.Warn().WithError(err).Log("Failed to determine system CPU usage")
continue
@ -197,12 +274,18 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) {
cpuload := (cpustat.User + cpustat.System + cpustat.Other) * r.ncpu
vmstat, err := r.psutil.VirtualMemory()
vmstat, err := r.psutil.Memory()
if err != nil {
r.logger.Warn().WithError(err).Log("Failed to determine system memory usage")
continue
}
gpustat, err := r.psutil.GPU()
if err != nil {
r.logger.Warn().WithError(err).Log("Failed to determine GPU usage")
continue
}
r.logger.Debug().WithFields(log.Fields{
"cur_cpu": cpuload,
"cur_memory": vmstat.Used,
@ -210,34 +293,46 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) {
doCPULimit := false
if !r.isUnlimited {
if !r.isCPULimiting {
if cpuload >= r.maxCPU {
r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit reached")
doCPULimit = true
}
} else {
if !r.isCPULimiting {
if cpuload >= r.maxCPU {
r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit reached")
doCPULimit = true
if cpuload < r.maxCPU {
r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit released")
doCPULimit = false
}
}
} else {
doCPULimit = true
if cpuload < r.maxCPU {
r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit released")
doCPULimit = false
}
}
doMemoryLimit := false
if !r.isUnlimited {
if !r.isMemoryLimiting {
if vmstat.Used >= r.maxMemory {
r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit reached")
doMemoryLimit = true
if !r.isMemoryLimiting {
if vmstat.Used >= r.maxMemory {
r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit reached")
doMemoryLimit = true
}
} else {
doMemoryLimit = true
if vmstat.Used < r.maxMemory {
r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit released")
doMemoryLimit = false
}
}
doGPULimit := make([]bool, r.ngpu)
for i, limiting := range r.isGPULimiting {
maxMemory := uint64(r.maxGPUMemory * float64(gpustat[i].MemoryTotal) / 100)
if !limiting {
if gpustat[i].MemoryUsed >= maxMemory || (gpustat[i].Usage >= r.maxGPU && gpustat[i].Encoder >= r.maxGPU && gpustat[i].Decoder >= r.maxGPU) {
doGPULimit[i] = true
}
} else {
doMemoryLimit = true
if vmstat.Used < r.maxMemory {
r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit released")
doMemoryLimit = false
doGPULimit[i] = true
if gpustat[i].MemoryUsed < maxMemory && (gpustat[i].Usage < r.maxGPU || gpustat[i].Encoder < r.maxGPU || gpustat[i].Decoder < r.maxGPU) {
doGPULimit[i] = false
}
}
}
@ -247,17 +342,26 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) {
r.logger.Warn().WithFields(log.Fields{
"enabled": doCPULimit,
}).Log("Limiting CPU")
r.isCPULimiting = doCPULimit
}
r.isCPULimiting = doCPULimit
if r.isMemoryLimiting != doMemoryLimit {
r.logger.Warn().WithFields(log.Fields{
"enabled": doMemoryLimit,
}).Log("Limiting memory")
r.isMemoryLimiting = doMemoryLimit
}
r.isMemoryLimiting = doMemoryLimit
for i, limiting := range r.isGPULimiting {
if limiting != doGPULimit[i] {
r.logger.Warn().WithFields(log.Fields{
"enabled": doGPULimit,
"index": i,
}).Log("Limiting GPU")
}
}
r.isGPULimiting = doGPULimit
r.lock.Unlock()
}
}
@ -267,60 +371,136 @@ func (r *resources) HasLimits() bool {
return !r.isUnlimited
}
func (r *resources) Limits() (float64, uint64) {
return r.maxCPU / r.ncpu, r.maxMemory
// Limits returns the configured CPU limit (percent 0-100, normalized
// by the number of cores), the memory limit in bytes, and the GPU
// usage and GPU memory limits (each percent 0-100).
func (r *resources) Limits() (float64, uint64, float64, float64) {
	return r.maxCPU / r.ncpu, r.maxMemory, r.maxGPU, r.maxGPUMemory
}
func (r *resources) ShouldLimit() (bool, bool) {
func (r *resources) ShouldLimit() (bool, bool, []bool) {
r.lock.RLock()
defer r.lock.RUnlock()
return r.isCPULimiting, r.isMemoryLimiting
return r.isCPULimiting, r.isMemoryLimiting, slices.Copy(r.isGPULimiting)
}
func (r *resources) Request(cpu float64, memory uint64) error {
func (r *resources) Request(req Request) (Response, error) {
res := Response{
GPU: -1,
}
r.lock.RLock()
defer r.lock.RUnlock()
logger := r.logger.WithFields(log.Fields{
"req_cpu": cpu,
"req_memory": memory,
"req_cpu": req.CPU,
"req_memory": req.Memory,
"req_gpu": req.GPUUsage,
"req_gpu_encoder": req.GPUEncoder,
"req_gpu_decoder": req.GPUDecoder,
"req_gpu_memory": req.GPUMemory,
})
logger.Debug().Log("Request for acquiring resources")
// Check if anything is currently limiting.
if r.isCPULimiting || r.isMemoryLimiting {
logger.Debug().Log("Rejected, currently limiting")
return fmt.Errorf("resources are currenlty actively limited")
return res, fmt.Errorf("resources are currenlty actively limited")
}
if cpu <= 0 || memory == 0 {
// Check if the requested resources are valid.
if req.CPU <= 0 || req.Memory == 0 {
logger.Debug().Log("Rejected, invalid values")
return fmt.Errorf("the cpu and/or memory values are invalid: cpu=%f, memory=%d", cpu, memory)
return res, fmt.Errorf("the cpu and/or memory values are invalid: cpu=%f, memory=%d", req.CPU, req.Memory)
}
cpustat, err := r.psutil.CPUPercent()
// Get current CPU and memory values.
cpustat, err := r.psutil.CPU()
if err != nil {
r.logger.Warn().WithError(err).Log("Failed to determine system CPU usage")
return fmt.Errorf("the system CPU usage couldn't be determined")
return res, fmt.Errorf("the system CPU usage couldn't be determined")
}
cpuload := (cpustat.User + cpustat.System + cpustat.Other) * r.ncpu
vmstat, err := r.psutil.VirtualMemory()
vmstat, err := r.psutil.Memory()
if err != nil {
r.logger.Warn().WithError(err).Log("Failed to determine system memory usage")
return fmt.Errorf("the system memory usage couldn't be determined")
return res, fmt.Errorf("the system memory usage couldn't be determined")
}
if cpuload+cpu > r.maxCPU {
// Check if enough resources are available
if cpuload+req.CPU > r.maxCPU {
logger.Debug().WithField("cur_cpu", cpuload).Log("Rejected, CPU limit exceeded")
return fmt.Errorf("the CPU limit would be exceeded: %f + %f > %f", cpuload, cpu, r.maxCPU)
return res, fmt.Errorf("the CPU limit would be exceeded: %f + %f > %f", cpuload, req.CPU, r.maxCPU)
}
if vmstat.Used+memory > r.maxMemory {
if vmstat.Used+req.Memory > r.maxMemory {
logger.Debug().WithField("cur_memory", vmstat.Used).Log("Rejected, memory limit exceeded")
return fmt.Errorf("the memory limit would be exceeded: %d + %d > %d", vmstat.Used, memory, r.maxMemory)
return res, fmt.Errorf("the memory limit would be exceeded: %d + %d > %d", vmstat.Used, req.Memory, r.maxMemory)
}
// Check if any GPU resources are requested
if req.GPUUsage > 0 || req.GPUEncoder > 0 || req.GPUDecoder > 0 || req.GPUMemory > 0 {
if req.GPUUsage < 0 || req.GPUEncoder < 0 || req.GPUDecoder < 0 || req.GPUMemory == 0 {
logger.Debug().Log("Rejected, invalid values")
return res, fmt.Errorf("the gpu usage and memory values are invalid: usage=%f, encoder=%f, decoder=%f, memory=%d", req.GPUUsage, req.GPUEncoder, req.GPUDecoder, req.GPUMemory)
}
// Get current GPU values
gpustat, err := r.psutil.GPU()
if err != nil {
r.logger.Warn().WithError(err).Log("Failed to determine GPU usage")
return res, fmt.Errorf("the GPU usage couldn't be determined")
}
if len(gpustat) == 0 {
r.logger.Debug().WithError(err).Log("GPU resources requested but no GPU available")
return res, fmt.Errorf("some GPU resources requested but no GPU available")
}
foundGPU := -1
for _, g := range gpustat {
if req.GPUUsage > 0 && g.Usage+req.GPUUsage > r.maxGPU {
logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu": g.Usage}).Log("Rejected, GPU usage limit exceeded")
continue
}
if req.GPUEncoder > 0 && g.Encoder+req.GPUEncoder > r.maxGPU {
logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu_encoder": g.Usage}).Log("Rejected, GPU encoder usage limit exceeded")
continue
}
if req.GPUDecoder > 0 && g.Decoder+req.GPUDecoder > r.maxGPU {
logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu_decoder": g.Usage}).Log("Rejected, GPU decoder usage limit exceeded")
continue
}
gpuMemoryUsage := float64(g.MemoryUsed) / float64(g.MemoryTotal) * 100
requestedGPUMemoryUsage := float64(req.GPUMemory) / float64(g.MemoryTotal) * 100
if gpuMemoryUsage+requestedGPUMemoryUsage > r.maxGPUMemory {
logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu_memory": gpuMemoryUsage}).Log("Rejected, GPU memory usage limit exceeded")
continue
}
foundGPU = g.Index
logger = logger.Debug().WithFields(log.Fields{
"cur_gpu": foundGPU,
"cur_gpu_general": g.Usage,
"cur_gpu_encoder": g.Encoder,
"cur_gpu_decoder": g.Decoder,
"cur_gpu_memory": gpuMemoryUsage,
})
break
}
if foundGPU < 0 {
return res, fmt.Errorf("all GPU usage limits are exceeded")
}
res.GPU = foundGPU
}
logger.Debug().WithFields(log.Fields{
@ -328,17 +508,18 @@ func (r *resources) Request(cpu float64, memory uint64) error {
"cur_memory": vmstat.Used,
}).Log("Acquiring approved")
return nil
return res, nil
}
func (r *resources) Info() Info {
cpulimit, memlimit := r.Limits()
cputhrottling, memthrottling := r.ShouldLimit()
cpulimit, memlimit, gpulimit, gpumemlimit := r.Limits()
cputhrottling, memthrottling, gputhrottling := r.ShouldLimit()
cpustat, cpuerr := r.psutil.CPUPercent()
memstat, memerr := r.psutil.VirtualMemory()
selfcpu, _ := r.self.CPUPercent()
selfmem, _ := r.self.VirtualMemory()
cpustat, cpuerr := r.psutil.CPU()
memstat, memerr := r.psutil.Memory()
gpustat, gpuerr := r.psutil.GPU()
selfcpu, _ := r.self.CPU()
selfmem, _ := r.self.Memory()
cpuinfo := CPUInfo{
NCPU: r.ncpu,
@ -362,9 +543,31 @@ func (r *resources) Info() Info {
Error: memerr,
}
gpuinfo := GPUInfo{
NGPU: float64(len(gpustat)),
Error: gpuerr,
}
for i, g := range gpustat {
gpuinfo.GPU = append(gpuinfo.GPU, GPUInfoStat{
Index: g.Index,
Name: g.Name,
MemoryTotal: g.MemoryTotal,
MemoryUsed: g.MemoryUsed,
MemoryAvailable: g.MemoryTotal - g.MemoryUsed,
MemoryLimit: uint64(float64(g.MemoryTotal) * gpumemlimit / 100),
Usage: g.Usage,
Encoder: g.Encoder,
Decoder: g.Decoder,
UsageLimit: gpulimit,
Throttling: gputhrottling[i],
})
}
i := Info{
CPU: cpuinfo,
Mem: meminfo,
GPU: gpuinfo,
}
return i

View File

@ -1,68 +1,170 @@
package resources
import (
"slices"
"sync"
"testing"
"time"
"github.com/datarhei/core/v16/psutil"
"github.com/shirou/gopsutil/v3/disk"
"github.com/shirou/gopsutil/v3/net"
"github.com/stretchr/testify/require"
)
type util struct{}
// util is a psutil.Util stub with adjustable CPU, memory, and GPU
// readings. The lock allows tests to mutate the readings while the
// resource observer polls them concurrently.
type util struct {
	lock sync.Mutex
	cpu  psutil.CPUInfo
	mem  psutil.MemoryInfo
	gpu  []psutil.GPUInfo
}
// newUtil builds a util stub with fixed CPU (65% busy) and memory
// (160 of 200 bytes used) readings plus ngpu simulated GPUs whose
// usage values decrease with the GPU index.
func newUtil(ngpu int) *util {
	gpus := make([]psutil.GPUInfo, ngpu)
	for i := range gpus {
		gpus[i] = psutil.GPUInfo{
			Index:       i,
			Name:        "L4",
			MemoryTotal: 24 * 1024 * 1024 * 1024,
			MemoryUsed:  uint64(12+i) * 1024 * 1024 * 1024,
			Usage:       50 - float64((i+1)*5),
			Encoder:     50 - float64((i+1)*10),
			Decoder:     50 - float64((i+1)*3),
		}
	}

	return &util{
		cpu: psutil.CPUInfo{
			System: 10,
			User:   50,
			Idle:   35,
			Other:  5,
		},
		mem: psutil.MemoryInfo{
			Total:     200,
			Available: 40,
			Used:      160,
		},
		gpu: gpus,
	}
}
// Start is a no-op for the stub.
func (u *util) Start() {}

// Stop is a no-op for the stub.
func (u *util) Stop() {}
func (u *util) CPUCounts(logical bool) (float64, error) {
// CPUCounts reports a fixed count of 2 logical cores.
func (u *util) CPUCounts() (float64, error) {
	return 2, nil
}
func (u *util) GPUCounts() (float64, error) {
return 0, nil
// CPU returns a copy of the stub's current CPU reading.
func (u *util) CPU() (*psutil.CPUInfo, error) {
	u.lock.Lock()
	defer u.lock.Unlock()

	// Copy so the caller can't observe later mutations of u.cpu.
	stat := u.cpu

	return &stat, nil
}
func (u *util) CPUPercent() (*psutil.CPUInfoStat, error) {
return &psutil.CPUInfoStat{
System: 10,
User: 50,
Idle: 35,
Other: 5,
}, nil
// Disk returns empty disk info; disk usage is irrelevant for these tests.
func (u *util) Disk(path string) (*psutil.DiskInfo, error) {
	return &psutil.DiskInfo{}, nil
}
func (u *util) DiskUsage(path string) (*disk.UsageStat, error) {
return &disk.UsageStat{}, nil
// Memory returns a copy of the stub's current memory reading.
func (u *util) Memory() (*psutil.MemoryInfo, error) {
	u.lock.Lock()
	defer u.lock.Unlock()

	// Copy so the caller can't observe later mutations of u.mem.
	stat := u.mem

	return &stat, nil
}
func (u *util) VirtualMemory() (*psutil.MemoryInfoStat, error) {
return &psutil.MemoryInfoStat{
Total: 200,
Available: 40,
Used: 160,
}, nil
}
func (u *util) NetIOCounters(pernic bool) ([]net.IOCountersStat, error) {
// Network returns no interface statistics; networking is irrelevant
// for these tests.
func (u *util) Network() ([]psutil.NetworkInfo, error) {
	return nil, nil
}
func (u *util) GPUStats() ([]psutil.GPUInfoStat, error) {
return nil, nil
// GPU returns a copy of the stub's current GPU readings.
func (u *util) GPU() ([]psutil.GPUInfo, error) {
	u.lock.Lock()
	defer u.lock.Unlock()

	// Hand out a copy so callers can't mutate the stub's state.
	stats := make([]psutil.GPUInfo, len(u.gpu))
	copy(stats, u.gpu)

	return stats, nil
}
func (u *util) Process(pid int32) (psutil.Process, error) {
return nil, nil
return &process{}, nil
}
// process is a psutil.Process stub that returns fixed readings.
type process struct{}
// CPU returns fixed per-process CPU usage values for the stub.
func (p *process) CPU() (*psutil.CPUInfo, error) {
	return &psutil.CPUInfo{
		System: 1,
		User:   2,
		Idle:   0,
		Other:  3,
	}, nil
}
// Memory returns a fixed per-process memory usage of 42 bytes.
func (p *process) Memory() (uint64, error) { return 42, nil }
// GPU returns fixed per-process GPU usage values for the stub.
func (p *process) GPU() (*psutil.GPUInfo, error) {
	info := psutil.GPUInfo{
		Index:       0,
		Name:        "L4",
		MemoryTotal: 128,
		MemoryUsed:  42,
		Usage:       5,
		Encoder:     9,
		Decoder:     7,
	}

	return &info, nil
}
// Stop is a no-op for the stub.
func (p *process) Stop() {}

// Suspend is a no-op for the stub.
func (p *process) Suspend() error { return nil }

// Resume is a no-op for the stub.
func (p *process) Resume() error { return nil }
// TestConfigNoLimits verifies that the resource manager initializes
// without any limits configured.
func TestConfigNoLimits(t *testing.T) {
	_, err := New(Config{
		PSUtil: newUtil(0),
	})
	require.NoError(t, err)
}
// TestConfigWrongLimits verifies the validation of the Max... config
// values: out-of-range CPU/memory limits are rejected, and
// out-of-range GPU limits are only rejected when a GPU is actually
// present (without GPUs they are zeroed and ignored).
func TestConfigWrongLimits(t *testing.T) {
	_, err := New(Config{
		MaxCPU:    102,
		MaxMemory: 573,
		PSUtil:    newUtil(0),
	})
	require.Error(t, err)

	// No GPUs: the invalid GPU limits are reset and not validated.
	_, err = New(Config{
		MaxCPU:       0,
		MaxMemory:    0,
		MaxGPU:       101,
		MaxGPUMemory: 103,
		PSUtil:       newUtil(0),
	})
	require.NoError(t, err)

	// One GPU: the same invalid GPU limits must now be rejected.
	_, err = New(Config{
		MaxCPU:       0,
		MaxMemory:    0,
		MaxGPU:       101,
		MaxGPUMemory: 103,
		PSUtil:       newUtil(1),
	})
	require.Error(t, err)
}
func TestMemoryLimit(t *testing.T) {
r, err := New(Config{
MaxCPU: 100,
MaxMemory: 150. / 200. * 100,
PSUtil: &util{},
PSUtil: newUtil(0),
Logger: nil,
})
require.NoError(t, err)
@ -86,7 +188,7 @@ func TestMemoryLimit(t *testing.T) {
for {
select {
case <-ticker.C:
_, limit = r.ShouldLimit()
_, limit, _ = r.ShouldLimit()
if limit {
return
}
@ -102,6 +204,95 @@ func TestMemoryLimit(t *testing.T) {
require.True(t, limit)
_, err = r.Request(Request{CPU: 5, Memory: 10})
require.Error(t, err)
r.Stop()
}
// TestMemoryUnlimit verifies that the memory limiter engages while the
// reported usage (160) exceeds the limit (150 of 200 bytes) and
// disengages again once the usage drops below it.
func TestMemoryUnlimit(t *testing.T) {
	util := newUtil(0)
	r, err := New(Config{
		MaxCPU:    100,
		MaxMemory: 150. / 200. * 100,
		PSUtil:    util,
		Logger:    nil,
	})
	require.NoError(t, err)

	wg := sync.WaitGroup{}
	wg.Add(1)

	limit := false

	// Poll ShouldLimit until the observer flags memory limiting, or
	// give up after 10 seconds.
	go func() {
		defer func() {
			wg.Done()
		}()

		timer := time.NewTimer(10 * time.Second)
		defer timer.Stop()

		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()

		for {
			select {
			case <-ticker.C:
				_, limit, _ = r.ShouldLimit()
				if limit {
					return
				}
			case <-timer.C:
				return
			}
		}
	}()

	r.Start()

	wg.Wait()

	require.True(t, limit)

	_, limit, _ = r.ShouldLimit()
	require.True(t, limit)

	// Drop the reported usage below the limit ...
	util.lock.Lock()
	util.mem.Used = 140
	util.lock.Unlock()

	wg.Add(1)

	// ... and poll until the limiter disengages, or give up after 10s.
	go func() {
		defer func() {
			wg.Done()
		}()

		timer := time.NewTimer(10 * time.Second)
		defer timer.Stop()

		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()

		for {
			select {
			case <-ticker.C:
				_, limit, _ = r.ShouldLimit()
				if !limit {
					return
				}
			case <-timer.C:
				return
			}
		}
	}()

	wg.Wait()

	require.False(t, limit)

	r.Stop()
}
@ -109,7 +300,7 @@ func TestCPULimit(t *testing.T) {
r, err := New(Config{
MaxCPU: 50.,
MaxMemory: 100,
PSUtil: &util{},
PSUtil: newUtil(0),
Logger: nil,
})
require.NoError(t, err)
@ -133,7 +324,7 @@ func TestCPULimit(t *testing.T) {
for {
select {
case <-ticker.C:
limit, _ = r.ShouldLimit()
limit, _, _ = r.ShouldLimit()
if limit {
return
}
@ -149,36 +340,541 @@ func TestCPULimit(t *testing.T) {
require.True(t, limit)
_, err = r.Request(Request{CPU: 5, Memory: 10})
require.Error(t, err)
r.Stop()
}
func TestRequest(t *testing.T) {
func TestCPUUnlimit(t *testing.T) {
util := newUtil(0)
r, err := New(Config{
MaxCPU: 70.,
MaxMemory: 170. / 200. * 100,
PSUtil: &util{},
MaxCPU: 50.,
MaxMemory: 100,
PSUtil: util,
Logger: nil,
})
require.NoError(t, err)
err = r.Request(-1, 0)
require.Error(t, err)
wg := sync.WaitGroup{}
wg.Add(1)
err = r.Request(5, 10)
limit := false
go func() {
defer func() {
wg.Done()
}()
timer := time.NewTimer(10 * time.Second)
defer timer.Stop()
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
limit, _, _ = r.ShouldLimit()
if limit {
return
}
case <-timer.C:
return
}
}
}()
r.Start()
wg.Wait()
require.True(t, limit)
limit, _, _ = r.ShouldLimit()
require.True(t, limit)
util.lock.Lock()
util.cpu.User = 20
util.lock.Unlock()
wg.Add(1)
go func() {
defer func() {
wg.Done()
}()
timer := time.NewTimer(10 * time.Second)
defer timer.Stop()
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
limit, _, _ = r.ShouldLimit()
if !limit {
return
}
case <-timer.C:
return
}
}
}()
wg.Wait()
require.False(t, limit)
r.Stop()
}
func TestGPULimitMemory(t *testing.T) {
r, err := New(Config{
MaxCPU: 100,
MaxMemory: 100,
MaxGPU: 100,
MaxGPUMemory: 20,
PSUtil: newUtil(2),
Logger: nil,
})
require.NoError(t, err)
err = r.Request(5, 20)
wg := sync.WaitGroup{}
wg.Add(1)
limit := []bool{}
go func() {
defer func() {
wg.Done()
}()
timer := time.NewTimer(10 * time.Second)
defer timer.Stop()
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
_, _, limit = r.ShouldLimit()
if slices.Contains(limit, true) {
return
}
case <-timer.C:
return
}
}
}()
r.Start()
wg.Wait()
require.Contains(t, limit, true)
_, err = r.Request(Request{CPU: 5, Memory: 10, GPUUsage: 10, GPUMemory: 10})
require.Error(t, err)
err = r.Request(10, 10)
r.Stop()
}
// TestGPUUnlimitMemory verifies that GPU memory limiting engages while
// the simulated usage (12 resp. 13 GiB of 24 GiB) exceeds the 20%
// limit and disengages again once the usage drops below it.
func TestGPUUnlimitMemory(t *testing.T) {
	util := newUtil(2)
	r, err := New(Config{
		MaxCPU:       100,
		MaxMemory:    100,
		MaxGPU:       100,
		MaxGPUMemory: 20,
		PSUtil:       util,
		Logger:       nil,
	})
	require.NoError(t, err)

	wg := sync.WaitGroup{}
	wg.Add(1)

	limit := []bool{}

	// Poll ShouldLimit until at least one GPU is flagged, or give up
	// after 10 seconds.
	go func() {
		defer func() {
			wg.Done()
		}()

		timer := time.NewTimer(10 * time.Second)
		defer timer.Stop()

		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()

		for {
			select {
			case <-ticker.C:
				_, _, limit = r.ShouldLimit()
				if slices.Contains(limit, true) {
					return
				}
			case <-timer.C:
				return
			}
		}
	}()

	r.Start()

	wg.Wait()

	require.Contains(t, limit, true)

	// Drop the reported GPU memory usage below the limit ...
	util.lock.Lock()
	util.gpu[0].MemoryUsed = 10
	util.gpu[1].MemoryUsed = 10
	util.lock.Unlock()

	wg.Add(1)

	// ... and poll until no GPU is flagged anymore, or give up after 10s.
	go func() {
		defer func() {
			wg.Done()
		}()

		timer := time.NewTimer(10 * time.Second)
		defer timer.Stop()

		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()

		for {
			select {
			case <-ticker.C:
				_, _, limit = r.ShouldLimit()
				if !slices.Contains(limit, true) {
					return
				}
			case <-timer.C:
				return
			}
		}
	}()

	wg.Wait()

	require.NotContains(t, limit, true)

	r.Stop()
}
// TestGPULimitMemorySome verifies that only the GPUs whose memory
// usage reaches the limit are flagged: with 4 GPUs using 12..15 GiB
// of 24 GiB and a limit of 14 GiB, only GPUs 2 and 3 are limited,
// and a request can still be served by an unconstrained GPU.
func TestGPULimitMemorySome(t *testing.T) {
	r, err := New(Config{
		MaxCPU:       100,
		MaxMemory:    100,
		MaxGPU:       100,
		MaxGPUMemory: 14. / 24. * 100.,
		PSUtil:       newUtil(4),
		Logger:       nil,
	})
	require.NoError(t, err)

	wg := sync.WaitGroup{}
	wg.Add(1)

	limit := []bool{}

	// Poll ShouldLimit until at least one GPU is flagged, or give up
	// after 10 seconds.
	go func() {
		defer func() {
			wg.Done()
		}()

		timer := time.NewTimer(10 * time.Second)
		defer timer.Stop()

		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()

		for {
			select {
			case <-ticker.C:
				_, _, limit = r.ShouldLimit()
				if slices.Contains(limit, true) {
					return
				}
			case <-timer.C:
				return
			}
		}
	}()

	r.Start()

	wg.Wait()

	// Only GPUs 2 (14 GiB) and 3 (15 GiB) are at or above the 14 GiB limit.
	require.Equal(t, []bool{false, false, true, true}, limit)

	// GPUs 0 and 1 still have headroom, so this request succeeds.
	_, err = r.Request(Request{CPU: 5, Memory: 10, GPUUsage: 10, GPUMemory: 10})
	require.NoError(t, err)

	r.Stop()
}
// TestGPULimitUsage verifies that a GPU exceeding the configured usage limit
// is flagged, and that subsequent requests are admitted or rejected
// accordingly.
func TestGPULimitUsage(t *testing.T) {
	r, err := New(Config{
		MaxCPU:       100,
		MaxMemory:    100,
		MaxGPU:       40,
		MaxGPUMemory: 100,
		PSUtil:       newUtil(3),
		Logger:       nil,
	})
	require.NoError(t, err)

	// Poll ShouldLimit once per second for at most 10 seconds until at
	// least one GPU is flagged, then hand the flags back to the test.
	done := make(chan []bool, 1)
	go func() {
		var flags []bool
		deadline := time.NewTimer(10 * time.Second)
		defer deadline.Stop()
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				_, _, flags = r.ShouldLimit()
				if slices.Contains(flags, true) {
					done <- flags
					return
				}
			case <-deadline.C:
				done <- flags
				return
			}
		}
	}()

	r.Start()
	limit := <-done
	require.Equal(t, []bool{true, false, false}, limit)

	_, err = r.Request(Request{CPU: 5, Memory: 10, GPUUsage: 10, GPUMemory: 10})
	require.Error(t, err)

	_, err = r.Request(Request{CPU: 5, Memory: 10, GPUEncoder: 10, GPUMemory: 10})
	require.NoError(t, err)

	r.Stop()
}
// TestGPUUnlimitUsage verifies that a GPU usage limit violation is detected
// and that the flag clears again once the simulated load on the affected
// GPU drops below the limit.
func TestGPUUnlimitUsage(t *testing.T) {
	util := newUtil(3)
	r, err := New(Config{
		MaxCPU:       100,
		MaxMemory:    100,
		MaxGPU:       40,
		MaxGPUMemory: 100,
		PSUtil:       util,
		Logger:       nil,
	})
	require.NoError(t, err)

	// pollLimits samples ShouldLimit once per second (for at most 10
	// seconds) in the background and delivers the last observed GPU
	// limit flags as soon as cond holds or the deadline expires.
	pollLimits := func(cond func([]bool) bool) <-chan []bool {
		out := make(chan []bool, 1)
		go func() {
			var flags []bool
			deadline := time.NewTimer(10 * time.Second)
			defer deadline.Stop()
			ticker := time.NewTicker(time.Second)
			defer ticker.Stop()
			for {
				select {
				case <-ticker.C:
					_, _, flags = r.ShouldLimit()
					if cond(flags) {
						out <- flags
						return
					}
				case <-deadline.C:
					out <- flags
					return
				}
			}
		}()
		return out
	}

	overLimit := pollLimits(func(flags []bool) bool { return slices.Contains(flags, true) })
	r.Start()
	limit := <-overLimit
	require.Equal(t, []bool{true, false, false}, limit)

	// Lower the simulated load on the first GPU below the usage limit.
	util.lock.Lock()
	util.gpu[0].Usage = 30
	util.gpu[0].Encoder = 30
	util.gpu[0].Decoder = 30
	util.lock.Unlock()

	limit = <-pollLimits(func(flags []bool) bool { return !slices.Contains(flags, true) })
	require.Equal(t, []bool{false, false, false}, limit)

	r.Stop()
}
// TestRequestCPU checks that resource requests are validated against the
// configured CPU limit.
func TestRequestCPU(t *testing.T) {
	r, err := New(Config{
		MaxCPU: 70.,
		PSUtil: newUtil(0),
	})
	require.NoError(t, err)

	for _, tc := range []struct {
		req     Request
		wantErr bool
	}{
		{Request{CPU: 0, Memory: 0}, true},   // a request without a CPU share is rejected
		{Request{CPU: 5, Memory: 10}, false}, // fits below the limit
		{Request{CPU: 30, Memory: 10}, true}, // would push consumption past the limit
	} {
		_, err = r.Request(tc.req)
		if tc.wantErr {
			require.Error(t, err, "request %+v", tc.req)
		} else {
			require.NoError(t, err, "request %+v", tc.req)
		}
	}
}
// TestRequestMemory checks that resource requests are validated against the
// configured memory limit.
func TestRequestMemory(t *testing.T) {
	r, err := New(Config{
		MaxMemory: 170. / 200. * 100,
		PSUtil:    newUtil(0),
	})
	require.NoError(t, err)

	for _, tc := range []struct {
		req     Request
		wantErr bool
	}{
		{Request{CPU: 5, Memory: 0}, true},   // a request without memory is rejected
		{Request{CPU: 5, Memory: 10}, false}, // fits below the limit
		{Request{CPU: 50, Memory: 20}, true}, // would push consumption past the limit
	} {
		_, err = r.Request(tc.req)
		if tc.wantErr {
			require.Error(t, err, "request %+v", tc.req)
		} else {
			require.NoError(t, err, "request %+v", tc.req)
		}
	}
}
// TestRequestNoGPU ensures that a request asking for GPU resources fails
// when the system has no GPUs (newUtil(0) simulates zero GPU devices).
func TestRequestNoGPU(t *testing.T) {
	r, err := New(Config{
		PSUtil:    newUtil(0),
		MaxCPU:    100,
		MaxMemory: 100,
	})
	require.NoError(t, err)

	req := Request{CPU: 10, Memory: 10, GPUEncoder: 30, GPUMemory: 10}
	_, err = r.Request(req)
	require.Error(t, err)
}
// TestRequestInvalidGPURequest ensures that malformed GPU requests are
// rejected even though a GPU is available.
func TestRequestInvalidGPURequest(t *testing.T) {
	r, err := New(Config{
		MaxCPU:    100,
		MaxMemory: 100,
		PSUtil:    newUtil(1),
	})
	require.NoError(t, err)

	for _, req := range []Request{
		{CPU: 10, Memory: 10, GPUEncoder: 30, GPUMemory: 0},               // encoder share without GPU memory
		{CPU: 10, Memory: 10, GPUUsage: -1, GPUEncoder: 30, GPUMemory: 0}, // negative usage value
	} {
		_, err = r.Request(req)
		require.Error(t, err, "request %+v", req)
	}
}
// TestRequestGPULimitsOneGPU checks request admission against the usage,
// encoder, decoder, and memory limits of a single GPU.
func TestRequestGPULimitsOneGPU(t *testing.T) {
	r, err := New(Config{
		MaxCPU:       100,
		MaxMemory:    100,
		MaxGPU:       50,
		MaxGPUMemory: 60,
		PSUtil:       newUtil(1),
	})
	require.NoError(t, err)

	// Each of these requests exceeds one of the configured GPU limits.
	for _, req := range []Request{
		{CPU: 10, Memory: 10, GPUUsage: 50, GPUMemory: 10},
		{CPU: 10, Memory: 10, GPUEncoder: 50, GPUMemory: 10},
		{CPU: 10, Memory: 10, GPUDecoder: 50, GPUMemory: 10},
		{CPU: 10, Memory: 10, GPUEncoder: 10, GPUMemory: 5 * 1024 * 1024 * 1024},
	} {
		_, err = r.Request(req)
		require.Error(t, err, "request %+v", req)
	}

	// A modest request fits and is assigned the only GPU (index 0).
	res, err := r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 10, GPUMemory: 10})
	require.NoError(t, err)
	require.Equal(t, 0, res.GPU)
}
// TestRequestGPULimitsMoreGPU checks GPU selection when more than one GPU
// is available: an oversized request is rejected, and a fitting one is
// expected to be placed on the GPU with index 1.
func TestRequestGPULimitsMoreGPU(t *testing.T) {
	r, err := New(Config{
		MaxCPU:       100,
		MaxMemory:    100,
		MaxGPU:       60,
		MaxGPUMemory: 60,
		PSUtil:       newUtil(2),
	})
	require.NoError(t, err)

	// Too much encoder capacity for either of the two GPUs.
	_, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 50, GPUMemory: 10})
	require.Error(t, err)

	// This one fits; the expected placement is GPU index 1.
	res, err := r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 30, GPUMemory: 10})
	require.NoError(t, err)
	require.Equal(t, 1, res.GPU)
}
func TestHasLimits(t *testing.T) {
r, err := New(Config{
MaxCPU: 70.,
MaxMemory: 170. / 200. * 100,
PSUtil: &util{},
PSUtil: newUtil(0),
Logger: nil,
})
require.NoError(t, err)
@ -188,7 +884,7 @@ func TestHasLimits(t *testing.T) {
r, err = New(Config{
MaxCPU: 100,
MaxMemory: 100,
PSUtil: &util{},
PSUtil: newUtil(0),
Logger: nil,
})
require.NoError(t, err)
@ -198,10 +894,95 @@ func TestHasLimits(t *testing.T) {
r, err = New(Config{
MaxCPU: 0,
MaxMemory: 0,
PSUtil: &util{},
PSUtil: newUtil(0),
Logger: nil,
})
require.NoError(t, err)
require.False(t, r.HasLimits())
r, err = New(Config{
MaxCPU: 0,
MaxMemory: 0,
MaxGPU: 10,
PSUtil: newUtil(1),
Logger: nil,
})
require.NoError(t, err)
require.True(t, r.HasLimits())
r, err = New(Config{
MaxCPU: 0,
MaxMemory: 0,
MaxGPU: 10,
PSUtil: newUtil(0),
Logger: nil,
})
require.NoError(t, err)
require.False(t, r.HasLimits())
}
// TestInfo verifies that Info reports the expected CPU, memory, and GPU
// statistics for the simulated system provided by newUtil(2).
func TestInfo(t *testing.T) {
	r, err := New(Config{
		MaxCPU:       90,
		MaxMemory:    90,
		MaxGPU:       11,
		MaxGPUMemory: 50,
		PSUtil:       newUtil(2),
	})
	require.NoError(t, err)

	info := r.Info()

	require.Equal(t, Info{
		Mem: MemoryInfo{
			Total:      200,
			Available:  40,
			Used:       160,
			Limit:      180, // MaxMemory (90%) of the total of 200
			Core:       42,
			Throttling: false,
			Error:      nil,
		},
		CPU: CPUInfo{
			NCPU:       2,
			System:     10,
			User:       50,
			Idle:       35,
			Other:      5,
			Limit:      90, // MaxCPU
			Core:       6,
			Throttling: false,
			Error:      nil,
		},
		GPU: GPUInfo{
			NGPU: 2,
			GPU: []GPUInfoStat{{
				Index:           0,
				Name:            "L4",
				MemoryTotal:     24 * 1024 * 1024 * 1024,
				MemoryUsed:      12 * 1024 * 1024 * 1024,
				MemoryAvailable: 12 * 1024 * 1024 * 1024,
				MemoryLimit:     12 * 1024 * 1024 * 1024, // MaxGPUMemory (50%) of the 24 GiB total
				Usage:           45,
				Encoder:         40,
				Decoder:         47,
				UsageLimit:      11, // MaxGPU
			}, {
				Index:           1,
				Name:            "L4",
				MemoryTotal:     24 * 1024 * 1024 * 1024,
				MemoryUsed:      13 * 1024 * 1024 * 1024,
				MemoryAvailable: 11 * 1024 * 1024 * 1024,
				MemoryLimit:     12 * 1024 * 1024 * 1024,
				Usage:           40,
				Encoder:         30,
				Decoder:         44,
				UsageLimit:      11,
			}},
			Error: nil,
		},
	}, info)
}

View File

@ -79,13 +79,21 @@ type Config struct {
Reconnect bool
ReconnectDelay uint64 // seconds
Autostart bool
StaleTimeout uint64 // seconds
Timeout uint64 // seconds
Scheduler string // crontab pattern or RFC3339 timestamp
LogPatterns []string // will be interpreted as regular expressions
LimitCPU float64 // percent
LimitMemory uint64 // bytes
LimitWaitFor uint64 // seconds
StaleTimeout uint64 // seconds
Timeout uint64 // seconds
Scheduler string // crontab pattern or RFC3339 timestamp
LogPatterns []string // will be interpreted as regular expressions
LimitCPU float64 // percent
LimitMemory uint64 // bytes
LimitGPU ConfigLimitGPU // GPU limits
LimitWaitFor uint64 // seconds
}
// ConfigLimitGPU holds the GPU resource limits of a process config.
type ConfigLimitGPU struct {
	Usage   float64 // percent 0-100
	Encoder float64 // percent 0-100
	Decoder float64 // percent 0-100
	Memory  uint64  // bytes
}
func (config *Config) Clone() *Config {
@ -103,6 +111,7 @@ func (config *Config) Clone() *Config {
Scheduler: config.Scheduler,
LimitCPU: config.LimitCPU,
LimitMemory: config.LimitMemory,
LimitGPU: config.LimitGPU,
LimitWaitFor: config.LimitWaitFor,
}
@ -175,6 +184,10 @@ func (config *Config) Hash() []byte {
b.WriteString(strconv.FormatUint(config.LimitMemory, 10))
b.WriteString(strconv.FormatUint(config.LimitWaitFor, 10))
b.WriteString(strconv.FormatFloat(config.LimitCPU, 'f', -1, 64))
b.WriteString(strconv.FormatFloat(config.LimitGPU.Usage, 'f', -1, 64))
b.WriteString(strconv.FormatFloat(config.LimitGPU.Encoder, 'f', -1, 64))
b.WriteString(strconv.FormatFloat(config.LimitGPU.Decoder, 'f', -1, 64))
b.WriteString(strconv.FormatUint(config.LimitGPU.Memory, 10))
for _, x := range config.Input {
b.WriteString(x.HashString())
@ -294,7 +307,7 @@ type State struct {
Memory uint64 // Current memory consumption in bytes
CPU float64 // Current CPU consumption in percent
LimitMode string // How the process is limited (hard or soft)
Resources ProcessUsage // Current resource usage, include CPU and memory consumption
Resources ProcessUsage // Current resource usage, include CPU, memory and GPU consumption
Command []string // ffmpeg command line parameters
}
@ -326,10 +339,10 @@ func (p *ProcessUsageCPU) MarshalParser() parse.UsageCPU {
}
type ProcessUsageMemory struct {
Current uint64 // bytes
Average float64 // bytes
Max uint64 // bytes
Limit uint64 // bytes
Current uint64 // bytes
Average uint64 // bytes
Max uint64 // bytes
Limit uint64 // bytes
}
func (p *ProcessUsageMemory) UnmarshalParser(pp *parse.UsageMemory) {
@ -348,20 +361,97 @@ func (p *ProcessUsageMemory) MarshalParser() parse.UsageMemory {
return pp
}
// ProcessUsageGPU describes the GPU resource usage of a process on a
// single GPU device.
type ProcessUsageGPU struct {
	Index   int                   // index of the GPU device this process is assigned to
	Usage   ProcessUsageGPUUsage  // general GPU utilization
	Encoder ProcessUsageGPUUsage  // hardware encoder utilization
	Decoder ProcessUsageGPUUsage  // hardware decoder utilization
	Memory  ProcessUsageGPUMemory // GPU memory consumption
}
// UnmarshalParser copies the GPU usage values from their parser
// representation into p.
func (p *ProcessUsageGPU) UnmarshalParser(pp *parse.UsageGPU) {
	p.Index = pp.Index
	p.Usage.UnmarshalParser(&pp.Usage)
	p.Encoder.UnmarshalParser(&pp.Encoder)
	p.Decoder.UnmarshalParser(&pp.Decoder)
	p.Memory.UnmarshalParser(&pp.Memory)
}
// MarshalParser converts the GPU usage values into their parser
// representation.
func (p *ProcessUsageGPU) MarshalParser() parse.UsageGPU {
	pp := parse.UsageGPU{
		Index:   p.Index,
		Usage:   p.Usage.MarshalParser(),
		Encoder: p.Encoder.MarshalParser(),
		Decoder: p.Decoder.MarshalParser(),
		Memory:  p.Memory.MarshalParser(),
	}

	return pp
}
// ProcessUsageGPUUsage describes one GPU utilization metric of a process
// (current, averaged, and peak values plus the configured limit).
type ProcessUsageGPUUsage struct {
	Current float64 // percent 0-100
	Average float64 // percent 0-100
	Max     float64 // percent 0-100
	Limit   float64 // percent 0-100
}
// UnmarshalParser copies the usage values from their parser representation.
// NOTE(review): pp.Current is not copied — presumably Current is tracked
// live rather than restored from the parser; confirm this is intentional.
func (p *ProcessUsageGPUUsage) UnmarshalParser(pp *parse.UsageGPUUsage) {
	p.Average = pp.Average
	p.Max = pp.Max
	p.Limit = pp.Limit
}
// MarshalParser converts the usage values into their parser representation.
// NOTE(review): Current is not marshalled, mirroring UnmarshalParser —
// confirm this is intentional.
func (p *ProcessUsageGPUUsage) MarshalParser() parse.UsageGPUUsage {
	pp := parse.UsageGPUUsage{
		Average: p.Average,
		Max:     p.Max,
		Limit:   p.Limit,
	}

	return pp
}
// ProcessUsageGPUMemory describes the GPU memory consumption of a process
// (current, averaged, and peak values plus the configured limit).
type ProcessUsageGPUMemory struct {
	Current uint64 // bytes
	Average uint64 // bytes
	Max     uint64 // bytes
	Limit   uint64 // bytes
}
// UnmarshalParser copies the memory values from their parser representation.
// NOTE(review): pp.Current is not copied — presumably Current is tracked
// live rather than restored from the parser; confirm this is intentional.
func (p *ProcessUsageGPUMemory) UnmarshalParser(pp *parse.UsageGPUMemory) {
	p.Average = pp.Average
	p.Max = pp.Max
	p.Limit = pp.Limit
}
// MarshalParser converts the memory values into their parser representation.
// NOTE(review): Current is not marshalled, mirroring UnmarshalParser —
// confirm this is intentional.
func (p *ProcessUsageGPUMemory) MarshalParser() parse.UsageGPUMemory {
	pp := parse.UsageGPUMemory{
		Average: p.Average,
		Max:     p.Max,
		Limit:   p.Limit,
	}

	return pp
}
// ProcessUsage bundles the CPU, memory, and GPU resource usage of a process.
type ProcessUsage struct {
	CPU    ProcessUsageCPU
	Memory ProcessUsageMemory
	GPU    ProcessUsageGPU
}
// UnmarshalParser copies all resource usage values from their parser
// representation into p.
func (p *ProcessUsage) UnmarshalParser(pp *parse.Usage) {
	p.CPU.UnmarshalParser(&pp.CPU)
	p.Memory.UnmarshalParser(&pp.Memory)
	p.GPU.UnmarshalParser(&pp.GPU)
}
func (p *ProcessUsage) MarshalParser() parse.Usage {
pp := parse.Usage{
CPU: p.CPU.MarshalParser(),
Memory: p.Memory.MarshalParser(),
GPU: p.GPU.MarshalParser(),
}
return pp

View File

@ -46,12 +46,18 @@ func TestConfigHash(t *testing.T) {
LogPatterns: []string{"^libx264"},
LimitCPU: 50,
LimitMemory: 3 * 1024 * 1024,
LimitWaitFor: 20,
LimitGPU: ConfigLimitGPU{
Usage: 10,
Encoder: 42,
Decoder: 14,
Memory: 500 * 1024 * 1024,
},
LimitWaitFor: 20,
}
hash1 := config.Hash()
require.Equal(t, []byte{0x7e, 0xae, 0x5b, 0xc3, 0xad, 0xe3, 0x9a, 0xfc, 0xd3, 0x49, 0x15, 0x28, 0x93, 0x17, 0xc5, 0xbf}, hash1)
require.Equal(t, []byte{0x5e, 0x85, 0xc3, 0xc5, 0x44, 0xfd, 0x3e, 0x10, 0x13, 0x76, 0x36, 0x8b, 0xbe, 0x7e, 0xa6, 0xbb}, hash1)
config.Reconnect = false

View File

@ -279,13 +279,14 @@ func (r *restream) resourceObserver(ctx context.Context, rsc resources.Resources
defer ticker.Stop()
limitCPU, limitMemory := false, false
var limitGPUs []bool = nil
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
cpu, memory := rsc.ShouldLimit()
cpu, memory, gpu := rsc.ShouldLimit()
hasChanges := false
@ -299,17 +300,34 @@ func (r *restream) resourceObserver(ctx context.Context, rsc resources.Resources
hasChanges = true
}
if limitGPUs == nil {
limitGPUs = make([]bool, len(gpu))
}
for i, g := range gpu {
if g != limitGPUs[i] {
limitGPUs[i] = g
hasChanges = true
}
}
if !hasChanges {
break
}
r.tasks.Range(func(id app.ProcessID, t *task) bool {
if t.Limit(limitCPU, limitMemory) {
limitGPU := false
gpuindex := t.GetHWDevice()
if gpuindex >= 0 {
limitGPU = limitGPUs[gpuindex]
}
if t.Limit(limitCPU, limitMemory, limitGPU) {
r.logger.Debug().WithFields(log.Fields{
"limit_cpu": limitCPU,
"limit_memory": limitMemory,
"limit_gpu": limitGPU,
"id": id,
}).Log("Limiting process CPU and memory consumption")
}).Log("Limiting process CPU, memory, and GPU consumption")
}
return true
@ -391,7 +409,11 @@ func (r *restream) load() error {
// Validate config with all placeholders replaced. However, we need to take care
// that the config with the task keeps its dynamic placeholders for process starts.
config := t.config.Clone()
resolveDynamicPlaceholder(config, r.replace)
resolveDynamicPlaceholder(config, r.replace, map[string]string{
"hwdevice": "0",
}, map[string]string{
"timestamp": time.Now().UTC().Format(time.RFC3339),
})
t.usesDisk, err = validateConfig(config, r.fs.list, r.ffmpeg)
if err != nil {
@ -414,30 +436,23 @@ func (r *restream) load() error {
}
ffmpeg, err := r.ffmpeg.New(ffmpeg.ProcessConfig{
Reconnect: t.config.Reconnect,
ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second,
StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second,
Timeout: time.Duration(t.config.Timeout) * time.Second,
LimitCPU: t.config.LimitCPU,
LimitMemory: t.config.LimitMemory,
LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second,
LimitMode: limitMode,
Scheduler: t.config.Scheduler,
Args: t.command,
Parser: t.parser,
Logger: t.logger,
OnArgs: r.onArgs(t.config.Clone()),
OnBeforeStart: func() error {
if !r.enableSoftLimit {
return nil
}
if err := r.resources.Request(t.config.LimitCPU, t.config.LimitMemory); err != nil {
return err
}
return nil
},
Reconnect: t.config.Reconnect,
ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second,
StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second,
Timeout: time.Duration(t.config.Timeout) * time.Second,
LimitCPU: t.config.LimitCPU,
LimitMemory: t.config.LimitMemory,
LimitGPUUsage: t.config.LimitGPU.Usage,
LimitGPUEncoder: t.config.LimitGPU.Encoder,
LimitGPUDecoder: t.config.LimitGPU.Decoder,
LimitGPUMemory: t.config.LimitGPU.Memory,
LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second,
LimitMode: limitMode,
Scheduler: t.config.Scheduler,
Args: t.command,
Parser: t.parser,
Logger: t.logger,
OnBeforeStart: r.onBeforeStart(t.config.Clone()),
})
if err != nil {
return true
@ -578,7 +593,11 @@ func (r *restream) createTask(config *app.Config) (*task, error) {
// Validate config with all placeholders replaced. However, we need to take care
// that the config with the task keeps its dynamic placeholders for process starts.
config := t.config.Clone()
resolveDynamicPlaceholder(config, r.replace)
resolveDynamicPlaceholder(config, r.replace, map[string]string{
"hwdevice": "0",
}, map[string]string{
"timestamp": time.Now().UTC().Format(time.RFC3339),
})
t.usesDisk, err = validateConfig(config, r.fs.list, r.ffmpeg)
if err != nil {
@ -600,30 +619,23 @@ func (r *restream) createTask(config *app.Config) (*task, error) {
}
ffmpeg, err := r.ffmpeg.New(ffmpeg.ProcessConfig{
Reconnect: t.config.Reconnect,
ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second,
StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second,
Timeout: time.Duration(t.config.Timeout) * time.Second,
LimitCPU: t.config.LimitCPU,
LimitMemory: t.config.LimitMemory,
LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second,
LimitMode: limitMode,
Scheduler: t.config.Scheduler,
Args: t.command,
Parser: t.parser,
Logger: t.logger,
OnArgs: r.onArgs(t.config.Clone()),
OnBeforeStart: func() error {
if !r.enableSoftLimit {
return nil
}
if err := r.resources.Request(t.config.LimitCPU, t.config.LimitMemory); err != nil {
return err
}
return nil
},
Reconnect: t.config.Reconnect,
ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second,
StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second,
Timeout: time.Duration(t.config.Timeout) * time.Second,
LimitCPU: t.config.LimitCPU,
LimitMemory: t.config.LimitMemory,
LimitGPUUsage: t.config.LimitGPU.Usage,
LimitGPUEncoder: t.config.LimitGPU.Encoder,
LimitGPUDecoder: t.config.LimitGPU.Decoder,
LimitGPUMemory: t.config.LimitGPU.Memory,
LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second,
LimitMode: limitMode,
Scheduler: t.config.Scheduler,
Args: t.command,
Parser: t.parser,
Logger: t.logger,
OnBeforeStart: r.onBeforeStart(t.config.Clone()),
})
if err != nil {
return nil, err
@ -636,21 +648,45 @@ func (r *restream) createTask(config *app.Config) (*task, error) {
return t, nil
}
// onArgs is a callback that gets called by a process before it will be started.
// It evalutes the dynamic placeholders in a process config and returns the
// resulting command line to the process.
func (r *restream) onArgs(cfg *app.Config) func([]string) []string {
return func(args []string) []string {
// onBeforeStart is a callback that gets called by a process before it will be started.
// It evalutes the dynamic placeholders in a process config and returns the resulting command line to the process.
func (r *restream) onBeforeStart(cfg *app.Config) func([]string) ([]string, error) {
return func(args []string) ([]string, error) {
selectedGPU := -1
if r.enableSoftLimit {
res, err := r.resources.Request(resources.Request{
CPU: cfg.LimitCPU,
Memory: cfg.LimitMemory,
GPUUsage: cfg.LimitGPU.Usage,
GPUEncoder: cfg.LimitGPU.Encoder,
GPUDecoder: cfg.LimitGPU.Decoder,
GPUMemory: cfg.LimitGPU.Memory,
})
if err != nil {
return []string{}, err
}
selectedGPU = res.GPU
}
if t, hasTask := r.tasks.Load(cfg.ProcessID()); hasTask {
t.SetHWDevice(selectedGPU)
}
config := cfg.Clone()
resolveDynamicPlaceholder(config, r.replace)
resolveDynamicPlaceholder(config, r.replace, map[string]string{
"hwdevice": fmt.Sprintf("%d", selectedGPU),
}, map[string]string{
"timestamp": time.Now().UTC().Format(time.RFC3339),
})
_, err := validateConfig(config, r.fs.list, r.ffmpeg)
if err != nil {
return []string{}
return []string{}, err
}
return config.CreateCommand()
return config.CreateCommand(), nil
}
}
@ -1448,7 +1484,11 @@ func (r *restream) Probe(config *app.Config, timeout time.Duration) app.Probe {
return probe
}
resolveDynamicPlaceholder(config, r.replace)
resolveDynamicPlaceholder(config, r.replace, map[string]string{
"hwdevice": "0",
}, map[string]string{
"timestamp": time.Now().UTC().Format(time.RFC3339),
})
_, err = validateConfig(config, r.fs.list, r.ffmpeg)
if err != nil {
@ -1712,22 +1752,26 @@ func resolveStaticPlaceholders(config *app.Config, r replace.Replacer) {
// resolveDynamicPlaceholder replaces placeholders in the config that should be replaced at process start.
// The config will be modified in place.
func resolveDynamicPlaceholder(config *app.Config, r replace.Replacer) {
vars := map[string]string{
"timestamp": time.Now().UTC().Format(time.RFC3339),
}
func resolveDynamicPlaceholder(config *app.Config, r replace.Replacer, values map[string]string, vars map[string]string) {
placeholders := []string{"date", "hwdevice"}
for i, option := range config.Options {
option = r.Replace(option, "date", "", vars, config, "global")
for _, placeholder := range placeholders {
option = r.Replace(option, placeholder, values[placeholder], vars, config, "global")
}
config.Options[i] = option
}
for i, input := range config.Input {
input.Address = r.Replace(input.Address, "date", "", vars, config, "input")
for _, placeholder := range placeholders {
input.Address = r.Replace(input.Address, placeholder, values[placeholder], vars, config, "input")
}
for j, option := range input.Options {
option = r.Replace(option, "date", "", vars, config, "input")
for _, placeholder := range placeholders {
option = r.Replace(option, placeholder, values[placeholder], vars, config, "input")
}
input.Options[j] = option
}
@ -1736,16 +1780,22 @@ func resolveDynamicPlaceholder(config *app.Config, r replace.Replacer) {
}
for i, output := range config.Output {
output.Address = r.Replace(output.Address, "date", "", vars, config, "output")
for _, placeholder := range placeholders {
output.Address = r.Replace(output.Address, placeholder, values[placeholder], vars, config, "output")
}
for j, option := range output.Options {
option = r.Replace(option, "date", "", vars, config, "output")
for _, placeholder := range placeholders {
option = r.Replace(option, placeholder, values[placeholder], vars, config, "output")
}
output.Options[j] = option
}
for j, cleanup := range output.Cleanup {
cleanup.Pattern = r.Replace(cleanup.Pattern, "date", "", vars, config, "output")
for _, placeholder := range placeholders {
cleanup.Pattern = r.Replace(cleanup.Pattern, placeholder, values[placeholder], vars, config, "output")
}
output.Cleanup[j] = cleanup
}

View File

@ -1261,7 +1261,7 @@ func TestReplacer(t *testing.T) {
require.Equal(t, wantprocess, process)
resolveDynamicPlaceholder(process, replacer)
resolveDynamicPlaceholder(process, replacer, nil, nil)
wantprocess.Input = []app.ConfigIO{
{
@ -1531,7 +1531,7 @@ func TestProcessLimit(t *testing.T) {
status := task.ffmpeg.Status()
ncpu, err := psutil.CPUCounts(true)
ncpu, err := psutil.CPUCounts()
require.NoError(t, err)
require.Equal(t, ncpu*process.LimitCPU, status.CPU.Limit)

View File

@ -3,6 +3,7 @@ package restream
import (
"errors"
"maps"
"sync/atomic"
"time"
"github.com/datarhei/core/v16/ffmpeg/parse"
@ -31,7 +32,8 @@ type task struct {
parser parse.Parser
playout map[string]int
logger log.Logger
usesDisk bool // Whether this task uses the disk
usesDisk bool // Whether this task uses the disk
hwdevice atomic.Int32 // Index of the GPU this task uses
metadata map[string]interface{}
lock *xsync.RBMutex
@ -234,8 +236,47 @@ func (t *task) State() (*app.State, error) {
state.Memory = status.Memory.Current
state.CPU = status.CPU.Current / status.CPU.NCPU
state.LimitMode = status.LimitMode
state.Resources.CPU = status.CPU
state.Resources.Memory = status.Memory
state.Resources.CPU = app.ProcessUsageCPU{
NCPU: status.CPU.NCPU,
Current: status.CPU.Current,
Average: status.CPU.Average,
Max: status.CPU.Max,
Limit: status.CPU.Limit,
IsThrottling: status.CPU.IsThrottling,
}
state.Resources.Memory = app.ProcessUsageMemory{
Current: status.Memory.Current,
Average: status.Memory.Average,
Max: status.Memory.Max,
Limit: status.Memory.Limit,
}
state.Resources.GPU = app.ProcessUsageGPU{
Index: status.GPU.Index,
Usage: app.ProcessUsageGPUUsage{
Current: status.GPU.Usage.Current,
Average: status.GPU.Usage.Average,
Max: status.GPU.Usage.Max,
Limit: status.GPU.Usage.Limit,
},
Encoder: app.ProcessUsageGPUUsage{
Current: status.GPU.Encoder.Current,
Average: status.GPU.Encoder.Average,
Max: status.GPU.Encoder.Max,
Limit: status.GPU.Encoder.Limit,
},
Decoder: app.ProcessUsageGPUUsage{
Current: status.GPU.Decoder.Current,
Average: status.GPU.Decoder.Average,
Max: status.GPU.Decoder.Max,
Limit: status.GPU.Decoder.Limit,
},
Memory: app.ProcessUsageGPUMemory{
Current: status.GPU.Memory.Current,
Average: status.GPU.Memory.Average,
Max: status.GPU.Memory.Max,
Limit: status.GPU.Memory.Limit,
},
}
state.Duration = status.Duration.Round(10 * time.Millisecond).Seconds()
state.Reconnect = -1
state.Command = status.CommandArgs
@ -420,7 +461,7 @@ func (t *task) ExportMetadata() map[string]interface{} {
return t.metadata
}
func (t *task) Limit(cpu, memory bool) bool {
func (t *task) Limit(cpu, memory, gpu bool) bool {
token := t.lock.RLock()
defer t.lock.RUnlock(token)
@ -428,11 +469,19 @@ func (t *task) Limit(cpu, memory bool) bool {
return false
}
t.ffmpeg.Limit(cpu, memory)
t.ffmpeg.Limit(cpu, memory, gpu)
return true
}
// SetHWDevice records the index of the GPU this task has been assigned to.
// A negative index means no GPU is assigned.
func (t *task) SetHWDevice(index int) {
	t.hwdevice.Store(int32(index))
}
// GetHWDevice returns the index of the GPU this task is assigned to, or a
// negative value if no GPU is assigned.
func (t *task) GetHWDevice() int {
	return int(t.hwdevice.Load())
}
func (t *task) Equal(config *app.Config) bool {
token := t.lock.RLock()
defer t.lock.RUnlock(token)

View File

@ -8,6 +8,7 @@ import (
"time"
"github.com/datarhei/core/v16/io/fs"
"github.com/lestrrat-go/strftime"
"github.com/stretchr/testify/require"
)