Add GPU support
This commit is contained in:
parent
df30a6b8e3
commit
2dbe5b5685
@ -371,9 +371,11 @@ func (a *api) start(ctx context.Context) error {
|
||||
}
|
||||
|
||||
resources, err := resources.New(resources.Config{
|
||||
MaxCPU: cfg.Resources.MaxCPUUsage,
|
||||
MaxMemory: cfg.Resources.MaxMemoryUsage,
|
||||
Logger: a.log.logger.core.WithComponent("Resources"),
|
||||
MaxCPU: cfg.Resources.MaxCPUUsage,
|
||||
MaxMemory: cfg.Resources.MaxMemoryUsage,
|
||||
MaxGPU: cfg.Resources.MaxGPUUsage,
|
||||
MaxGPUMemory: cfg.Resources.MaxGPUMemoryUsage,
|
||||
Logger: a.log.logger.core.WithComponent("Resources"),
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to initialize resource manager: %w", err)
|
||||
|
||||
@ -18,18 +18,29 @@ type ClusterRaft struct {
|
||||
}
|
||||
|
||||
type ClusterNodeResources struct {
|
||||
IsThrottling bool // Whether this core is currently throttling
|
||||
NCPU float64 // Number of CPU on this node
|
||||
CPU float64 // Current CPU load, 0-100*ncpu
|
||||
CPULimit float64 // Defined CPU load limit, 0-100*ncpu
|
||||
CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu
|
||||
Mem uint64 // Currently used memory in bytes
|
||||
MemLimit uint64 // Defined memory limit in bytes
|
||||
MemTotal uint64 // Total available memory in bytes
|
||||
MemCore uint64 // Current used memory of the core itself in bytes
|
||||
IsThrottling bool // Whether this core is currently throttling
|
||||
NCPU float64 // Number of CPU on this node
|
||||
CPU float64 // Current CPU load, 0-100*ncpu
|
||||
CPULimit float64 // Defined CPU load limit, 0-100*ncpu
|
||||
CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu
|
||||
Mem uint64 // Currently used memory in bytes
|
||||
MemLimit uint64 // Defined memory limit in bytes
|
||||
MemTotal uint64 // Total available memory in bytes
|
||||
MemCore uint64 // Current used memory of the core itself in bytes
|
||||
GPU []ClusterNodeGPUResources // GPU resources
|
||||
Error error
|
||||
}
|
||||
|
||||
type ClusterNodeGPUResources struct {
|
||||
Mem uint64 // Currently used memory in bytes
|
||||
MemLimit uint64 // Defined memory limit in bytes
|
||||
MemTotal uint64 // Total available memory in bytes
|
||||
Usage float64 // Current general usage, 0-100
|
||||
UsageLimit float64 // Defined general usage limit, 0-100
|
||||
Encoder float64 // Current encoder usage, 0-100
|
||||
Decoder float64 // Current decoder usage, 0-100
|
||||
}
|
||||
|
||||
type ClusterNode struct {
|
||||
ID string
|
||||
Name string
|
||||
@ -157,6 +168,19 @@ func (c *cluster) About() (ClusterAbout, error) {
|
||||
},
|
||||
}
|
||||
|
||||
if len(nodeAbout.Resources.GPU) != 0 {
|
||||
node.Resources.GPU = make([]ClusterNodeGPUResources, len(nodeAbout.Resources.GPU))
|
||||
for i, gpu := range nodeAbout.Resources.GPU {
|
||||
node.Resources.GPU[i].Mem = gpu.Mem
|
||||
node.Resources.GPU[i].MemLimit = gpu.MemLimit
|
||||
node.Resources.GPU[i].MemTotal = gpu.MemTotal
|
||||
node.Resources.GPU[i].Usage = gpu.Usage
|
||||
node.Resources.GPU[i].UsageLimit = gpu.UsageLimit
|
||||
node.Resources.GPU[i].Encoder = gpu.Encoder
|
||||
node.Resources.GPU[i].Decoder = gpu.Decoder
|
||||
}
|
||||
}
|
||||
|
||||
if s, ok := serversMap[nodeAbout.ID]; ok {
|
||||
node.Voter = s.Voter
|
||||
node.Leader = s.Leader
|
||||
|
||||
@ -195,6 +195,19 @@ func (a *api) About(c echo.Context) error {
|
||||
},
|
||||
}
|
||||
|
||||
if len(resources.GPU.GPU) != 0 {
|
||||
about.Resources.GPU = make([]client.AboutResponseGPUResources, len(resources.GPU.GPU))
|
||||
for i, gpu := range resources.GPU.GPU {
|
||||
about.Resources.GPU[i].Mem = gpu.MemoryUsed
|
||||
about.Resources.GPU[i].MemLimit = gpu.MemoryLimit
|
||||
about.Resources.GPU[i].MemTotal = gpu.MemoryTotal
|
||||
about.Resources.GPU[i].Usage = gpu.Usage
|
||||
about.Resources.GPU[i].UsageLimit = gpu.UsageLimit
|
||||
about.Resources.GPU[i].Encoder = gpu.Encoder
|
||||
about.Resources.GPU[i].Decoder = gpu.Decoder
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
about.Resources.Error = err.Error()
|
||||
}
|
||||
|
||||
@ -83,17 +83,28 @@ type AboutResponse struct {
|
||||
Resources AboutResponseResources `json:"resources"`
|
||||
}
|
||||
|
||||
type AboutResponseGPUResources struct {
|
||||
Mem uint64 `json:"memory_bytes"` // Currently used memory in bytes
|
||||
MemLimit uint64 `json:"memory_limit_bytes"` // Defined memory limit in bytes
|
||||
MemTotal uint64 `json:"memory_total_bytes"` // Total available memory in bytes
|
||||
Usage float64 `json:"usage"` // Current general usage, 0-100
|
||||
Encoder float64 `json:"encoder"` // Current encoder usage, 0-100
|
||||
Decoder float64 `json:"decoder"` // Current decoder usage, 0-100
|
||||
UsageLimit float64 `json:"usage_limit"` // Defined general usage limit, 0-100
|
||||
}
|
||||
|
||||
type AboutResponseResources struct {
|
||||
IsThrottling bool `json:"is_throttling"` // Whether this core is currently throttling
|
||||
NCPU float64 `json:"ncpu"` // Number of CPU on this node
|
||||
CPU float64 `json:"cpu"` // Current CPU load, 0-100*ncpu
|
||||
CPULimit float64 `json:"cpu_limit"` // Defined CPU load limit, 0-100*ncpu
|
||||
CPUCore float64 `json:"cpu_core"` // Current CPU load of the core itself, 0-100*ncpu
|
||||
Mem uint64 `json:"memory_bytes"` // Currently used memory in bytes
|
||||
MemLimit uint64 `json:"memory_limit_bytes"` // Defined memory limit in bytes
|
||||
MemTotal uint64 `json:"memory_total_bytes"` // Total available memory in bytes
|
||||
MemCore uint64 `json:"memory_core_bytes"` // Current used memory of the core itself in bytes
|
||||
Error string `json:"error"` // Last error
|
||||
IsThrottling bool `json:"is_throttling"` // Whether this core is currently throttling
|
||||
NCPU float64 `json:"ncpu"` // Number of CPU on this node
|
||||
CPU float64 `json:"cpu"` // Current CPU load, 0-100*ncpu
|
||||
CPULimit float64 `json:"cpu_limit"` // Defined CPU load limit, 0-100*ncpu
|
||||
CPUCore float64 `json:"cpu_core"` // Current CPU load of the core itself, 0-100*ncpu
|
||||
Mem uint64 `json:"memory_bytes"` // Currently used memory in bytes
|
||||
MemLimit uint64 `json:"memory_limit_bytes"` // Defined memory limit in bytes
|
||||
MemTotal uint64 `json:"memory_total_bytes"` // Total available memory in bytes
|
||||
MemCore uint64 `json:"memory_core_bytes"` // Current used memory of the core itself in bytes
|
||||
GPU []AboutResponseGPUResources `json:"gpu"` // Currently used GPU resources
|
||||
Error string `json:"error"` // Last error
|
||||
}
|
||||
|
||||
type SetNodeStateRequest struct {
|
||||
|
||||
@ -78,7 +78,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{},
|
||||
|
||||
// Mark nodes as throttling where at least one process is still throttling
|
||||
for _, haveP := range have {
|
||||
if haveP.Throttling {
|
||||
if haveP.Resources.Throttling {
|
||||
resources.Throttling(haveP.NodeID, true)
|
||||
}
|
||||
}
|
||||
@ -126,7 +126,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{},
|
||||
continue
|
||||
}
|
||||
|
||||
if resources.HasNodeEnough(raNodeid, p.Config.LimitCPU, p.Config.LimitMemory) {
|
||||
if resources.HasNodeEnough(raNodeid, ResourcesFromConfig(p.Config)) {
|
||||
availableNodeid = raNodeid
|
||||
break
|
||||
}
|
||||
@ -135,7 +135,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{},
|
||||
|
||||
// Find the best node with enough resources available.
|
||||
if len(availableNodeid) == 0 {
|
||||
nodes := resources.FindBestNodes(p.Config.LimitCPU, p.Config.LimitMemory)
|
||||
nodes := resources.FindBestNodes(ResourcesFromConfig(p.Config))
|
||||
for _, nodeid := range nodes {
|
||||
if nodeid == overloadedNodeid {
|
||||
continue
|
||||
@ -169,7 +169,7 @@ func rebalance(have []node.Process, nodes map[string]node.About) ([]interface{},
|
||||
processes[i] = p
|
||||
|
||||
// Adjust the resources.
|
||||
resources.Move(availableNodeid, overloadedNodeid, p.CPU, p.Mem)
|
||||
resources.Move(availableNodeid, overloadedNodeid, ResourcesFromProcess(p.Resources))
|
||||
|
||||
// Adjust the reference affinity.
|
||||
haveReferenceAffinity.Move(p.Config.Reference, p.Config.Domain, overloadedNodeid, availableNodeid)
|
||||
|
||||
@ -95,7 +95,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[
|
||||
|
||||
// Mark nodes as throttling where at least one process is still throttling
|
||||
for _, haveP := range have {
|
||||
if haveP.Throttling {
|
||||
if haveP.Resources.Throttling {
|
||||
resources.Throttling(haveP.NodeID, true)
|
||||
}
|
||||
}
|
||||
@ -136,7 +136,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[
|
||||
if len(targetNodeid) != 0 {
|
||||
_, hasNode := nodes[targetNodeid]
|
||||
|
||||
if !hasNode || !resources.HasNodeEnough(targetNodeid, process.Config.LimitCPU, process.Config.LimitMemory) {
|
||||
if !hasNode || !resources.HasNodeEnough(targetNodeid, ResourcesFromConfig(process.Config)) {
|
||||
targetNodeid = ""
|
||||
}
|
||||
}
|
||||
@ -152,7 +152,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[
|
||||
continue
|
||||
}
|
||||
|
||||
if resources.HasNodeEnough(raNodeid, process.Config.LimitCPU, process.Config.LimitMemory) {
|
||||
if resources.HasNodeEnough(raNodeid, ResourcesFromConfig(process.Config)) {
|
||||
targetNodeid = raNodeid
|
||||
break
|
||||
}
|
||||
@ -161,7 +161,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[
|
||||
|
||||
// Find the best node with enough resources available.
|
||||
if len(targetNodeid) == 0 {
|
||||
nodes := resources.FindBestNodes(process.Config.LimitCPU, process.Config.LimitMemory)
|
||||
nodes := resources.FindBestNodes(ResourcesFromConfig(process.Config))
|
||||
for _, nodeid := range nodes {
|
||||
if nodeid == sourceNodeid {
|
||||
continue
|
||||
@ -194,7 +194,7 @@ func relocate(have []node.Process, nodes map[string]node.About, relocateMap map[
|
||||
opBudget -= 5
|
||||
|
||||
// Adjust the resources.
|
||||
resources.Move(targetNodeid, sourceNodeid, process.CPU, process.Mem)
|
||||
resources.Move(targetNodeid, sourceNodeid, ResourcesFromProcess(process.Resources))
|
||||
|
||||
// Adjust the reference affinity.
|
||||
haveReferenceAffinity.Move(process.Config.Reference, process.Config.Domain, sourceNodeid, targetNodeid)
|
||||
|
||||
@ -143,7 +143,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
|
||||
|
||||
// Mark nodes as throttling where at least one process is still throttling
|
||||
for _, haveP := range have {
|
||||
if haveP.Throttling {
|
||||
if haveP.Resources.Throttling {
|
||||
resources.Throttling(haveP.NodeID, true)
|
||||
}
|
||||
}
|
||||
@ -182,7 +182,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
|
||||
processid: haveP.Config.ProcessID(),
|
||||
})
|
||||
|
||||
resources.Remove(haveP.NodeID, haveP.CPU, haveP.Mem)
|
||||
resources.Remove(haveP.NodeID, ResourcesFromProcess(haveP.Resources))
|
||||
|
||||
continue
|
||||
}
|
||||
@ -219,7 +219,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
|
||||
})
|
||||
|
||||
// Release the resources.
|
||||
resources.Remove(haveP.NodeID, haveP.CPU, haveP.Mem)
|
||||
resources.Remove(haveP.NodeID, ResourcesFromProcess(haveP.Resources))
|
||||
}
|
||||
}
|
||||
|
||||
@ -229,7 +229,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
|
||||
for _, haveP := range wantOrderStart {
|
||||
nodeid := haveP.NodeID
|
||||
|
||||
resources.Add(nodeid, haveP.Config.LimitCPU, haveP.Config.LimitMemory)
|
||||
resources.Add(nodeid, ResourcesFromConfig(haveP.Config))
|
||||
|
||||
// TODO: check if the current node has actually enough resources available,
|
||||
// otherwise it needs to be moved somewhere else. If the node doesn't
|
||||
@ -347,7 +347,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
|
||||
// Try to add the process to a node where other processes with the same reference currently reside.
|
||||
raNodes := haveReferenceAffinity.Nodes(wantP.Config.Reference, wantP.Config.Domain)
|
||||
for _, raNodeid := range raNodes {
|
||||
if resources.HasNodeEnough(raNodeid, wantP.Config.LimitCPU, wantP.Config.LimitMemory) {
|
||||
if resources.HasNodeEnough(raNodeid, ResourcesFromConfig(wantP.Config)) {
|
||||
nodeid = raNodeid
|
||||
break
|
||||
}
|
||||
@ -355,7 +355,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
|
||||
|
||||
// Find the node with the most resources available.
|
||||
if len(nodeid) == 0 {
|
||||
nodes := resources.FindBestNodes(wantP.Config.LimitCPU, wantP.Config.LimitMemory)
|
||||
nodes := resources.FindBestNodes(ResourcesFromConfig(wantP.Config))
|
||||
if len(nodes) > 0 {
|
||||
nodeid = nodes[0]
|
||||
}
|
||||
@ -372,7 +372,7 @@ func synchronize(wish map[string]string, want []store.Process, have []node.Proce
|
||||
opBudget -= 3
|
||||
|
||||
// Consume the resources
|
||||
resources.Add(nodeid, wantP.Config.LimitCPU, wantP.Config.LimitMemory)
|
||||
resources.Add(nodeid, ResourcesFromConfig(wantP.Config))
|
||||
|
||||
reality[pid] = nodeid
|
||||
|
||||
|
||||
@ -193,11 +193,13 @@ func TestSynchronizeOrderStop(t *testing.T) {
|
||||
|
||||
have := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
UpdatedAt: now,
|
||||
Config: &app.Config{
|
||||
@ -285,11 +287,13 @@ func TestSynchronizeOrderStart(t *testing.T) {
|
||||
|
||||
have := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "stop",
|
||||
State: "finished",
|
||||
CPU: 0,
|
||||
Mem: 0,
|
||||
NodeID: "node1",
|
||||
Order: "stop",
|
||||
State: "finished",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 0,
|
||||
Mem: 0,
|
||||
},
|
||||
Runtime: 42,
|
||||
UpdatedAt: now,
|
||||
Config: &app.Config{
|
||||
@ -388,11 +392,13 @@ func TestSynchronizeAddReferenceAffinity(t *testing.T) {
|
||||
|
||||
have := []node.Process{
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
UpdatedAt: now,
|
||||
Config: &app.Config{
|
||||
@ -490,11 +496,13 @@ func TestSynchronizeAddReferenceAffinityMultiple(t *testing.T) {
|
||||
|
||||
have := []node.Process{
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 2,
|
||||
},
|
||||
Runtime: 42,
|
||||
UpdatedAt: now,
|
||||
Config: &app.Config{
|
||||
@ -882,11 +890,13 @@ func TestSynchronizeRemove(t *testing.T) {
|
||||
|
||||
have := []node.Process{
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar",
|
||||
@ -967,11 +977,13 @@ func TestSynchronizeAddRemove(t *testing.T) {
|
||||
|
||||
have := []node.Process{
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar2",
|
||||
@ -1064,11 +1076,13 @@ func TestSynchronizeNoUpdate(t *testing.T) {
|
||||
|
||||
have := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar",
|
||||
@ -1133,11 +1147,13 @@ func TestSynchronizeUpdate(t *testing.T) {
|
||||
|
||||
have := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar",
|
||||
@ -1217,11 +1233,13 @@ func TestSynchronizeUpdateMetadata(t *testing.T) {
|
||||
|
||||
have := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar",
|
||||
@ -1313,11 +1331,13 @@ func TestSynchronizeWaitDisconnectedNode(t *testing.T) {
|
||||
|
||||
have := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
UpdatedAt: now,
|
||||
Config: &app.Config{
|
||||
@ -1397,11 +1417,13 @@ func TestSynchronizeWaitDisconnectedNodeNoWish(t *testing.T) {
|
||||
|
||||
have := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
UpdatedAt: now,
|
||||
Config: &app.Config{
|
||||
@ -1493,11 +1515,13 @@ func TestSynchronizeWaitDisconnectedNodeUnrealisticWish(t *testing.T) {
|
||||
|
||||
have := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
UpdatedAt: now,
|
||||
Config: &app.Config{
|
||||
@ -1589,11 +1613,13 @@ func TestSynchronizeTimeoutDisconnectedNode(t *testing.T) {
|
||||
|
||||
have := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
UpdatedAt: now,
|
||||
Config: &app.Config{
|
||||
@ -1655,22 +1681,26 @@ func TestSynchronizeTimeoutDisconnectedNode(t *testing.T) {
|
||||
func TestRebalanceNothingToDo(t *testing.T) {
|
||||
processes := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 35,
|
||||
Mem: 20,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 35,
|
||||
Mem: 20,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar1",
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar2",
|
||||
@ -1711,33 +1741,39 @@ func TestRebalanceNothingToDo(t *testing.T) {
|
||||
func TestRebalanceOverload(t *testing.T) {
|
||||
processes := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 35,
|
||||
Mem: 20,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 35,
|
||||
Mem: 20,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar1",
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 17,
|
||||
Mem: 31,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 17,
|
||||
Mem: 31,
|
||||
},
|
||||
Runtime: 27,
|
||||
Config: &app.Config{
|
||||
ID: "foobar3",
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar2",
|
||||
@ -1806,33 +1842,39 @@ func TestRebalanceOverload(t *testing.T) {
|
||||
func TestRebalanceSkip(t *testing.T) {
|
||||
processes := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 35,
|
||||
Mem: 20,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 35,
|
||||
Mem: 20,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar1",
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 17,
|
||||
Mem: 31,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 17,
|
||||
Mem: 31,
|
||||
},
|
||||
Runtime: 27,
|
||||
Config: &app.Config{
|
||||
ID: "foobar3",
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar2",
|
||||
@ -1908,22 +1950,26 @@ func TestRebalanceSkip(t *testing.T) {
|
||||
func TestRebalanceReferenceAffinity(t *testing.T) {
|
||||
processes := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar1",
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 1,
|
||||
Config: &app.Config{
|
||||
ID: "foobar2",
|
||||
@ -1931,11 +1977,13 @@ func TestRebalanceReferenceAffinity(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar3",
|
||||
@ -1943,11 +1991,13 @@ func TestRebalanceReferenceAffinity(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar4",
|
||||
@ -1955,11 +2005,13 @@ func TestRebalanceReferenceAffinity(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar5",
|
||||
@ -2048,33 +2100,39 @@ func TestRebalanceReferenceAffinity(t *testing.T) {
|
||||
func TestRebalanceRelocateTarget(t *testing.T) {
|
||||
processes := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 35,
|
||||
Mem: 20,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 35,
|
||||
Mem: 20,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar1",
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 17,
|
||||
Mem: 31,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 17,
|
||||
Mem: 31,
|
||||
},
|
||||
Runtime: 27,
|
||||
Config: &app.Config{
|
||||
ID: "foobar3",
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar2",
|
||||
@ -2165,33 +2223,39 @@ func TestRebalanceRelocateTarget(t *testing.T) {
|
||||
func TestRebalanceRelocateAny(t *testing.T) {
|
||||
processes := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 35,
|
||||
Mem: 20,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 35,
|
||||
Mem: 20,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar1",
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 17,
|
||||
Mem: 31,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 17,
|
||||
Mem: 31,
|
||||
},
|
||||
Runtime: 27,
|
||||
Config: &app.Config{
|
||||
ID: "foobar3",
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 12,
|
||||
Mem: 5,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar2",
|
||||
@ -2319,7 +2383,10 @@ func TestFindBestNodesForProcess(t *testing.T) {
|
||||
|
||||
resources := NewResourcePlanner(nodes)
|
||||
|
||||
list := resources.FindBestNodes(35, 20)
|
||||
list := resources.FindBestNodes(Resources{
|
||||
CPU: 35,
|
||||
Mem: 20,
|
||||
})
|
||||
|
||||
require.Equal(t, []string{"node3", "node2", "node1"}, list)
|
||||
}
|
||||
@ -2433,7 +2500,10 @@ func TestFindBestNodesForProcess2(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
list := resources.FindBestNodes(4.0, 45*1024*1024)
|
||||
list := resources.FindBestNodes(Resources{
|
||||
CPU: 4.0,
|
||||
Mem: 45 * 1024 * 1024,
|
||||
})
|
||||
|
||||
require.Equal(t, []string{"node10", "node8", "node7", "node1", "node5", "node12", "node4", "node3", "node13", "node6", "node11", "node2"}, list)
|
||||
}
|
||||
@ -2441,11 +2511,13 @@ func TestFindBestNodesForProcess2(t *testing.T) {
|
||||
func TestCreateNodeProcessMap(t *testing.T) {
|
||||
processes := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "finished",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "finished",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 1,
|
||||
Config: &app.Config{
|
||||
ID: "foobar7",
|
||||
@ -2453,11 +2525,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "failed",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "failed",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 1,
|
||||
Config: &app.Config{
|
||||
ID: "foobar8",
|
||||
@ -2465,22 +2539,26 @@ func TestCreateNodeProcessMap(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar1",
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 1,
|
||||
Config: &app.Config{
|
||||
ID: "foobar2",
|
||||
@ -2488,11 +2566,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 67,
|
||||
Config: &app.Config{
|
||||
ID: "foobar3",
|
||||
@ -2500,11 +2580,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar6",
|
||||
@ -2512,11 +2594,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 41,
|
||||
Config: &app.Config{
|
||||
ID: "foobar4",
|
||||
@ -2524,11 +2608,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar5",
|
||||
@ -2542,11 +2628,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
|
||||
require.Equal(t, map[string][]node.Process{
|
||||
"node1": {
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 1,
|
||||
Config: &app.Config{
|
||||
ID: "foobar2",
|
||||
@ -2554,11 +2642,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar1",
|
||||
@ -2567,11 +2657,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
|
||||
},
|
||||
"node2": {
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar6",
|
||||
@ -2579,11 +2671,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 67,
|
||||
Config: &app.Config{
|
||||
ID: "foobar3",
|
||||
@ -2593,11 +2687,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
|
||||
},
|
||||
"node3": {
|
||||
{
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 41,
|
||||
Config: &app.Config{
|
||||
ID: "foobar4",
|
||||
@ -2605,11 +2701,13 @@ func TestCreateNodeProcessMap(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar5",
|
||||
@ -2623,22 +2721,26 @@ func TestCreateNodeProcessMap(t *testing.T) {
|
||||
func TestCreateReferenceAffinityNodeMap(t *testing.T) {
|
||||
processes := []node.Process{
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar1",
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node1",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 1,
|
||||
Config: &app.Config{
|
||||
ID: "foobar2",
|
||||
@ -2646,11 +2748,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar3",
|
||||
@ -2658,11 +2762,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node2",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar3",
|
||||
@ -2670,11 +2776,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar4",
|
||||
@ -2682,11 +2790,13 @@ func TestCreateReferenceAffinityNodeMap(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
NodeID: "node3",
|
||||
Order: "start",
|
||||
State: "running",
|
||||
Resources: node.ProcessResources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
},
|
||||
Runtime: 42,
|
||||
Config: &app.Config{
|
||||
ID: "foobar5",
|
||||
|
||||
@ -747,16 +747,62 @@ func (n *Core) MediaGetInfo(prefix, path string) (int64, time.Time, error) {
|
||||
}
|
||||
|
||||
type Process struct {
|
||||
NodeID string
|
||||
Order string
|
||||
State string
|
||||
NodeID string
|
||||
Order string
|
||||
State string
|
||||
Resources ProcessResources
|
||||
Runtime time.Duration
|
||||
UpdatedAt time.Time
|
||||
Config *app.Config
|
||||
Metadata map[string]interface{}
|
||||
}
|
||||
|
||||
type ProcessResources struct {
|
||||
CPU float64 // Current CPU load of this process, 0-100*ncpu
|
||||
Mem uint64 // Currently consumed memory of this process in bytes
|
||||
GPU ProcessGPUResources
|
||||
Throttling bool
|
||||
Runtime time.Duration
|
||||
UpdatedAt time.Time
|
||||
Config *app.Config
|
||||
Metadata map[string]interface{}
|
||||
}
|
||||
|
||||
// ProcessGPUResources holds the current GPU consumption of a single process.
type ProcessGPUResources struct {
	Index   int     // GPU number
	Usage   float64 // Current GPU load, 0-100
	Encoder float64 // Current GPU encoder load, 0-100
	Decoder float64 // Current GPU decoder load, 0-100
	Mem     uint64  // Currently consumed GPU memory of this process in bytes
}
|
||||
|
||||
func (p *ProcessResources) Marshal(a *api.ProcessUsage) {
|
||||
p.Throttling = a.CPU.IsThrottling
|
||||
|
||||
if x, err := a.CPU.Current.Float64(); err == nil {
|
||||
p.CPU = x
|
||||
} else {
|
||||
p.CPU = 0
|
||||
}
|
||||
|
||||
p.Mem = a.Memory.Current
|
||||
|
||||
if x, err := a.GPU.Usage.Current.Float64(); err == nil {
|
||||
p.GPU.Usage = x
|
||||
} else {
|
||||
p.GPU.Usage = 0
|
||||
}
|
||||
|
||||
if x, err := a.GPU.Encoder.Current.Float64(); err == nil {
|
||||
p.GPU.Encoder = x
|
||||
} else {
|
||||
p.GPU.Encoder = 0
|
||||
}
|
||||
|
||||
if x, err := a.GPU.Decoder.Current.Float64(); err == nil {
|
||||
p.GPU.Decoder = x
|
||||
} else {
|
||||
p.GPU.Decoder = 0
|
||||
}
|
||||
|
||||
p.GPU.Mem = a.GPU.Memory.Current
|
||||
p.GPU.Index = a.GPU.Index
|
||||
}
|
||||
|
||||
func (n *Core) ClusterProcessList() ([]Process, error) {
|
||||
@ -780,21 +826,15 @@ func (n *Core) ClusterProcessList() ([]Process, error) {
|
||||
p.Config = &api.ProcessConfig{}
|
||||
}
|
||||
|
||||
cpu, err := p.State.Resources.CPU.Current.Float64()
|
||||
if err != nil {
|
||||
cpu = 0
|
||||
process := Process{
|
||||
NodeID: nodeid,
|
||||
Order: p.State.Order,
|
||||
State: p.State.State,
|
||||
Runtime: time.Duration(p.State.Runtime) * time.Second,
|
||||
UpdatedAt: time.Unix(p.UpdatedAt, 0),
|
||||
}
|
||||
|
||||
process := Process{
|
||||
NodeID: nodeid,
|
||||
Order: p.State.Order,
|
||||
State: p.State.State,
|
||||
Mem: p.State.Resources.Memory.Current,
|
||||
CPU: cpu,
|
||||
Throttling: p.State.Resources.CPU.IsThrottling,
|
||||
Runtime: time.Duration(p.State.Runtime) * time.Second,
|
||||
UpdatedAt: time.Unix(p.UpdatedAt, 0),
|
||||
}
|
||||
process.Resources.Marshal(&p.State.Resources)
|
||||
|
||||
config, _ := p.Config.Marshal()
|
||||
|
||||
|
||||
@ -138,17 +138,28 @@ type About struct {
|
||||
Resources Resources
|
||||
}
|
||||
|
||||
// ResourcesGPU describes the current utilization and the limits of a single GPU.
type ResourcesGPU struct {
	Mem        uint64  // Currently used memory in bytes
	MemLimit   uint64  // Defined memory limit in bytes
	MemTotal   uint64  // Total available memory in bytes
	Usage      float64 // Current general usage, 0-100
	UsageLimit float64 // Defined general usage limit, 0-100
	Encoder    float64 // Current encoder usage, 0-100
	Decoder    float64 // Current decoder usage, 0-100
}
|
||||
|
||||
type Resources struct {
|
||||
IsThrottling bool // Whether this core is currently throttling
|
||||
NCPU float64 // Number of CPU on this node
|
||||
CPU float64 // Current CPU load, 0-100*ncpu
|
||||
CPULimit float64 // Defined CPU load limit, 0-100*ncpu
|
||||
CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu
|
||||
Mem uint64 // Currently used memory in bytes
|
||||
MemLimit uint64 // Defined memory limit in bytes
|
||||
MemTotal uint64 // Total available memory in bytes
|
||||
MemCore uint64 // Current used memory of the core itself in bytes
|
||||
Error error // Last error
|
||||
IsThrottling bool // Whether this core is currently throttling
|
||||
NCPU float64 // Number of CPU on this node
|
||||
CPU float64 // Current CPU load, 0-100*ncpu
|
||||
CPULimit float64 // Defined CPU load limit, 0-100*ncpu
|
||||
CPUCore float64 // Current CPU load of the core itself, 0-100*ncpu
|
||||
Mem uint64 // Currently used memory in bytes
|
||||
MemLimit uint64 // Defined memory limit in bytes
|
||||
MemTotal uint64 // Total available memory in bytes
|
||||
MemCore uint64 // Current used memory of the core itself in bytes
|
||||
GPU []ResourcesGPU // Currently used GPU resources
|
||||
Error error // Last error
|
||||
}
|
||||
|
||||
func (n *Node) About() About {
|
||||
@ -514,6 +525,20 @@ func (n *Node) ping(ctx context.Context, interval time.Duration) {
|
||||
Error: nil,
|
||||
},
|
||||
}
|
||||
|
||||
if len(about.Resources.GPU) != 0 {
|
||||
n.nodeAbout.Resources.GPU = make([]ResourcesGPU, len(about.Resources.GPU))
|
||||
for i, gpu := range about.Resources.GPU {
|
||||
n.nodeAbout.Resources.GPU[i].Mem = gpu.Mem
|
||||
n.nodeAbout.Resources.GPU[i].MemLimit = gpu.MemLimit
|
||||
n.nodeAbout.Resources.GPU[i].MemTotal = gpu.MemTotal
|
||||
n.nodeAbout.Resources.GPU[i].Usage = gpu.Usage
|
||||
n.nodeAbout.Resources.GPU[i].UsageLimit = gpu.UsageLimit
|
||||
n.nodeAbout.Resources.GPU[i].Encoder = gpu.Encoder
|
||||
n.nodeAbout.Resources.GPU[i].Decoder = gpu.Decoder
|
||||
}
|
||||
}
|
||||
|
||||
if len(about.Resources.Error) != 0 {
|
||||
n.nodeAbout.Resources.Error = errors.New(about.Resources.Error)
|
||||
}
|
||||
|
||||
@ -4,8 +4,69 @@ import (
|
||||
"sort"
|
||||
|
||||
"github.com/datarhei/core/v16/cluster/node"
|
||||
"github.com/datarhei/core/v16/restream/app"
|
||||
)
|
||||
|
||||
type Resources struct {
|
||||
CPU float64 // CPU 0-100*ncpu
|
||||
Mem uint64 // Memoryin bytes
|
||||
GPU ResourcesGPU // GPU resources
|
||||
}
|
||||
|
||||
// ResourcesGPU describes the GPU resources requested or consumed by a process.
type ResourcesGPU struct {
	Index   int     // GPU number
	Usage   float64 // GPU general, 0-100
	Encoder float64 // GPU encoder, 0-100
	Decoder float64 // GPU decoder, 0-100
	Mem     uint64  // GPU memory in bytes
}
|
||||
|
||||
func ResourcesFromConfig(c *app.Config) Resources {
|
||||
r := Resources{}
|
||||
r.MarshalConfig(c)
|
||||
return r
|
||||
}
|
||||
|
||||
func ResourcesFromProcess(c node.ProcessResources) Resources {
|
||||
r := Resources{}
|
||||
r.MarshalProcess(c)
|
||||
return r
|
||||
}
|
||||
|
||||
func (r *Resources) MarshalConfig(c *app.Config) {
|
||||
r.CPU = c.LimitCPU
|
||||
r.Mem = c.LimitMemory
|
||||
r.GPU.Usage = c.LimitGPU.Usage
|
||||
r.GPU.Encoder = c.LimitGPU.Encoder
|
||||
r.GPU.Decoder = c.LimitGPU.Decoder
|
||||
r.GPU.Index = -1
|
||||
}
|
||||
|
||||
func (r *Resources) MarshalProcess(c node.ProcessResources) {
|
||||
r.CPU = c.CPU
|
||||
r.Mem = c.Mem
|
||||
r.GPU.Usage = c.GPU.Usage
|
||||
r.GPU.Encoder = c.GPU.Encoder
|
||||
r.GPU.Decoder = c.GPU.Decoder
|
||||
r.GPU.Index = c.GPU.Index
|
||||
}
|
||||
|
||||
func (r *Resources) HasGPU() bool {
|
||||
if r.GPU.Usage > 0 || r.GPU.Encoder > 0 || r.GPU.Decoder > 0 || r.GPU.Mem > 0 {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (r *Resources) DoesFitGPU(g node.ResourcesGPU) bool {
|
||||
if g.Usage+r.GPU.Usage < g.UsageLimit && g.Encoder+r.GPU.Encoder < g.UsageLimit && g.Decoder+r.GPU.Decoder < g.UsageLimit && g.Mem+r.GPU.Mem < g.MemLimit {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
type resourcePlanner struct {
|
||||
nodes map[string]node.Resources
|
||||
blocked map[string]struct{}
|
||||
@ -39,8 +100,8 @@ func (r *resourcePlanner) Throttling(nodeid string, throttling bool) {
|
||||
}
|
||||
|
||||
// HasNodeEnough returns whether a node has enough resources available for the
|
||||
// requested cpu and memory consumption.
|
||||
func (r *resourcePlanner) HasNodeEnough(nodeid string, cpu float64, mem uint64) bool {
|
||||
// requested cpu, memory, and gpu consumption.
|
||||
func (r *resourcePlanner) HasNodeEnough(nodeid string, req Resources) bool {
|
||||
res, hasNode := r.nodes[nodeid]
|
||||
if !hasNode {
|
||||
return false
|
||||
@ -50,20 +111,39 @@ func (r *resourcePlanner) HasNodeEnough(nodeid string, cpu float64, mem uint64)
|
||||
return false
|
||||
}
|
||||
|
||||
if res.Error == nil && res.CPU+cpu < res.CPULimit && res.Mem+mem < res.MemLimit && !res.IsThrottling {
|
||||
return true
|
||||
if res.Error != nil || res.IsThrottling {
|
||||
return false
|
||||
}
|
||||
|
||||
return false
|
||||
if res.CPU+req.CPU >= res.CPULimit || res.Mem+req.Mem >= res.MemLimit {
|
||||
return false
|
||||
}
|
||||
|
||||
if req.HasGPU() {
|
||||
found := false
|
||||
|
||||
for _, g := range res.GPU {
|
||||
if req.DoesFitGPU(g) {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !found {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// FindBestNodes returns an array of nodeids that can fit the requested cpu and memory requirements. If no
|
||||
// FindBestNodes returns an array of nodeids that can fit the requested cpu, memory, and gpu requirements. If no
|
||||
// such node is available, an empty array is returned. The array is sorted by the most suitable node first.
|
||||
func (r *resourcePlanner) FindBestNodes(cpu float64, mem uint64) []string {
|
||||
func (r *resourcePlanner) FindBestNodes(req Resources) []string {
|
||||
nodes := []string{}
|
||||
|
||||
for id := range r.nodes {
|
||||
if r.HasNodeEnough(id, cpu, mem) {
|
||||
if r.HasNodeEnough(id, req) {
|
||||
nodes = append(nodes, id)
|
||||
}
|
||||
}
|
||||
@ -81,43 +161,72 @@ func (r *resourcePlanner) FindBestNodes(cpu float64, mem uint64) []string {
|
||||
return nodes
|
||||
}
|
||||
|
||||
// Add adds the resources of the node according to the cpu and memory utilization.
|
||||
func (r *resourcePlanner) Add(nodeid string, cpu float64, mem uint64) {
|
||||
// Add adds the resources of the node according to the cpu, memory, and gpu utilization.
|
||||
func (r *resourcePlanner) Add(nodeid string, req Resources) {
|
||||
res, hasRes := r.nodes[nodeid]
|
||||
if !hasRes {
|
||||
return
|
||||
}
|
||||
|
||||
res.CPU += cpu
|
||||
res.Mem += mem
|
||||
res.CPU += req.CPU
|
||||
res.Mem += req.Mem
|
||||
|
||||
if req.HasGPU() {
|
||||
for i, g := range res.GPU {
|
||||
if req.DoesFitGPU(g) {
|
||||
g.Usage += req.GPU.Usage
|
||||
g.Encoder += req.GPU.Encoder
|
||||
g.Decoder += req.GPU.Decoder
|
||||
g.Mem += req.GPU.Mem
|
||||
res.GPU[i] = g
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
r.nodes[nodeid] = res
|
||||
}
|
||||
|
||||
// Remove subtracts the resources from the node according to the cpu and memory utilization.
|
||||
func (r *resourcePlanner) Remove(nodeid string, cpu float64, mem uint64) {
|
||||
// Remove subtracts the resources from the node according to the cpu, memory, and gpu utilization.
|
||||
func (r *resourcePlanner) Remove(nodeid string, req Resources) {
|
||||
res, hasRes := r.nodes[nodeid]
|
||||
if !hasRes {
|
||||
return
|
||||
}
|
||||
|
||||
res.CPU -= cpu
|
||||
if res.CPU < 0 {
|
||||
res.CPU = 0
|
||||
}
|
||||
if mem >= res.Mem {
|
||||
res.Mem = 0
|
||||
} else {
|
||||
res.Mem -= mem
|
||||
res.CPU -= min(res.CPU, req.CPU)
|
||||
res.Mem -= min(res.Mem, req.Mem)
|
||||
|
||||
if req.HasGPU() {
|
||||
if req.GPU.Index > 0 && req.GPU.Index < len(res.GPU) {
|
||||
gpu := res.GPU[req.GPU.Index]
|
||||
gpu.Usage -= min(gpu.Usage, req.GPU.Usage)
|
||||
gpu.Encoder -= min(gpu.Encoder, req.GPU.Encoder)
|
||||
gpu.Decoder -= min(gpu.Decoder, req.GPU.Decoder)
|
||||
gpu.Mem -= min(gpu.Mem, req.GPU.Mem)
|
||||
res.GPU[req.GPU.Index] = gpu
|
||||
}
|
||||
}
|
||||
|
||||
r.nodes[nodeid] = res
|
||||
}
|
||||
|
||||
// Move adjusts the resources from the target and source node according to the cpu, memory, and gpu utilization.
|
||||
func (r *resourcePlanner) Move(target, source string, cpu float64, mem uint64) {
|
||||
r.Add(target, cpu, mem)
|
||||
r.Remove(source, cpu, mem)
|
||||
func (r *resourcePlanner) Move(target, source string, req Resources) {
|
||||
r.Add(target, req)
|
||||
r.Remove(source, req)
|
||||
}
|
||||
|
||||
func (r *resourcePlanner) Map() map[string]node.Resources {
|
||||
return r.nodes
|
||||
}
|
||||
|
||||
func (r *resourcePlanner) Blocked() []string {
|
||||
nodes := []string{}
|
||||
|
||||
for nodeid := range r.blocked {
|
||||
nodes = append(nodes, nodeid)
|
||||
}
|
||||
|
||||
return nodes
|
||||
}
|
||||
|
||||
603
cluster/resources_test.go
Normal file
603
cluster/resources_test.go
Normal file
@ -0,0 +1,603 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/datarhei/core/v16/cluster/node"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestResources(t *testing.T) {
|
||||
r := Resources{
|
||||
CPU: 1,
|
||||
Mem: 1,
|
||||
}
|
||||
|
||||
require.False(t, r.HasGPU())
|
||||
|
||||
r.GPU = ResourcesGPU{
|
||||
Index: 0,
|
||||
Usage: 1,
|
||||
Encoder: 0,
|
||||
Decoder: 0,
|
||||
Mem: 1,
|
||||
}
|
||||
|
||||
require.True(t, r.HasGPU())
|
||||
}
|
||||
|
||||
func TestResourcePlanner(t *testing.T) {
|
||||
nodes := map[string]node.About{
|
||||
"node1": {
|
||||
State: "online",
|
||||
Resources: node.Resources{
|
||||
NCPU: 1,
|
||||
CPU: 7,
|
||||
Mem: 35,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
},
|
||||
"node2": {
|
||||
State: "online",
|
||||
Resources: node.Resources{
|
||||
NCPU: 1,
|
||||
CPU: 85,
|
||||
Mem: 11,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
planner := NewResourcePlanner(nodes)
|
||||
|
||||
require.Equal(t, map[string]node.Resources{
|
||||
"node1": {
|
||||
NCPU: 1,
|
||||
CPU: 7,
|
||||
Mem: 35,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
"node2": {
|
||||
NCPU: 1,
|
||||
CPU: 85,
|
||||
Mem: 11,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
}, planner.Map())
|
||||
}
|
||||
|
||||
func TestResourcePlannerBlocked(t *testing.T) {
|
||||
nodes := map[string]node.About{
|
||||
"node1": {
|
||||
State: "degraded",
|
||||
Resources: node.Resources{
|
||||
NCPU: 1,
|
||||
CPU: 7,
|
||||
Mem: 35,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
},
|
||||
"node2": {
|
||||
State: "online",
|
||||
Resources: node.Resources{
|
||||
NCPU: 1,
|
||||
CPU: 85,
|
||||
Mem: 11,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
planner := NewResourcePlanner(nodes)
|
||||
|
||||
require.Equal(t, []string{"node1"}, planner.Blocked())
|
||||
}
|
||||
|
||||
func TestResourcePlannerThrottling(t *testing.T) {
|
||||
nodes := map[string]node.About{
|
||||
"node1": {
|
||||
State: "online",
|
||||
Resources: node.Resources{
|
||||
NCPU: 1,
|
||||
CPU: 7,
|
||||
Mem: 35,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
},
|
||||
"node2": {
|
||||
State: "online",
|
||||
Resources: node.Resources{
|
||||
NCPU: 1,
|
||||
CPU: 85,
|
||||
Mem: 11,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
planner := NewResourcePlanner(nodes)
|
||||
|
||||
require.True(t, planner.HasNodeEnough("node1", Resources{
|
||||
CPU: 30,
|
||||
Mem: 5,
|
||||
}))
|
||||
|
||||
planner.Throttling("node1", true)
|
||||
|
||||
require.False(t, planner.HasNodeEnough("node1", Resources{
|
||||
CPU: 30,
|
||||
Mem: 5,
|
||||
}))
|
||||
|
||||
planner.Throttling("node1", false)
|
||||
|
||||
require.True(t, planner.HasNodeEnough("node1", Resources{
|
||||
CPU: 30,
|
||||
Mem: 5,
|
||||
}))
|
||||
}
|
||||
|
||||
func TestRecourcePlannerHasNodeEnough(t *testing.T) {
|
||||
nodes := map[string]node.About{
|
||||
"node1": {
|
||||
State: "online",
|
||||
Resources: node.Resources{
|
||||
NCPU: 1,
|
||||
CPU: 7,
|
||||
Mem: 35,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
GPU: []node.ResourcesGPU{
|
||||
{
|
||||
Mem: 5,
|
||||
MemLimit: 90,
|
||||
Usage: 53,
|
||||
UsageLimit: 90,
|
||||
Encoder: 32,
|
||||
Decoder: 26,
|
||||
},
|
||||
{
|
||||
Mem: 85,
|
||||
MemLimit: 90,
|
||||
Usage: 64,
|
||||
UsageLimit: 90,
|
||||
Encoder: 43,
|
||||
Decoder: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
"node2": {
|
||||
State: "online",
|
||||
Resources: node.Resources{
|
||||
NCPU: 1,
|
||||
CPU: 85,
|
||||
Mem: 11,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
GPU: []node.ResourcesGPU{
|
||||
{
|
||||
Mem: 5,
|
||||
MemLimit: 90,
|
||||
Usage: 53,
|
||||
UsageLimit: 90,
|
||||
Encoder: 32,
|
||||
Decoder: 26,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
planner := NewResourcePlanner(nodes)
|
||||
|
||||
require.True(t, planner.HasNodeEnough("node1", Resources{
|
||||
CPU: 30,
|
||||
Mem: 5,
|
||||
}))
|
||||
|
||||
require.False(t, planner.HasNodeEnough("node2", Resources{
|
||||
CPU: 30,
|
||||
Mem: 5,
|
||||
}))
|
||||
|
||||
require.True(t, planner.HasNodeEnough("node1", Resources{
|
||||
CPU: 30,
|
||||
Mem: 5,
|
||||
GPU: ResourcesGPU{
|
||||
Usage: 0,
|
||||
Encoder: 0,
|
||||
Decoder: 0,
|
||||
Mem: 50,
|
||||
},
|
||||
}))
|
||||
|
||||
require.False(t, planner.HasNodeEnough("node1", Resources{
|
||||
CPU: 30,
|
||||
Mem: 5,
|
||||
GPU: ResourcesGPU{
|
||||
Usage: 0,
|
||||
Encoder: 0,
|
||||
Decoder: 0,
|
||||
Mem: 86,
|
||||
},
|
||||
}))
|
||||
|
||||
require.True(t, planner.HasNodeEnough("node1", Resources{
|
||||
CPU: 30,
|
||||
Mem: 5,
|
||||
GPU: ResourcesGPU{
|
||||
Usage: 0,
|
||||
Encoder: 50,
|
||||
Decoder: 0,
|
||||
Mem: 50,
|
||||
},
|
||||
}))
|
||||
}
|
||||
|
||||
func TestResourcePlannerAdd(t *testing.T) {
|
||||
nodes := map[string]node.About{
|
||||
"node1": {
|
||||
State: "online",
|
||||
Resources: node.Resources{
|
||||
NCPU: 1,
|
||||
CPU: 7,
|
||||
Mem: 35,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
planner := NewResourcePlanner(nodes)
|
||||
|
||||
planner.Add("node1", Resources{
|
||||
CPU: 42,
|
||||
Mem: 33,
|
||||
})
|
||||
|
||||
require.Equal(t, map[string]node.Resources{
|
||||
"node1": {
|
||||
NCPU: 1,
|
||||
CPU: 49,
|
||||
Mem: 68,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
}, planner.Map())
|
||||
}
|
||||
|
||||
func TestResourcePlannerNoGPUAddGPU(t *testing.T) {
|
||||
nodes := map[string]node.About{
|
||||
"node1": {
|
||||
State: "online",
|
||||
Resources: node.Resources{
|
||||
NCPU: 1,
|
||||
CPU: 7,
|
||||
Mem: 35,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
planner := NewResourcePlanner(nodes)
|
||||
|
||||
planner.Add("node1", Resources{
|
||||
CPU: 42,
|
||||
Mem: 33,
|
||||
GPU: ResourcesGPU{
|
||||
Index: 0,
|
||||
Usage: 1,
|
||||
Encoder: 2,
|
||||
Decoder: 3,
|
||||
Mem: 4,
|
||||
},
|
||||
})
|
||||
|
||||
require.Equal(t, map[string]node.Resources{
|
||||
"node1": {
|
||||
NCPU: 1,
|
||||
CPU: 49,
|
||||
Mem: 68,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
}, planner.Map())
|
||||
}
|
||||
|
||||
func TestResourcePlannerAddGPU(t *testing.T) {
|
||||
nodes := map[string]node.About{
|
||||
"node1": {
|
||||
State: "online",
|
||||
Resources: node.Resources{
|
||||
NCPU: 1,
|
||||
CPU: 7,
|
||||
Mem: 35,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
GPU: []node.ResourcesGPU{
|
||||
{
|
||||
Mem: 0,
|
||||
MemLimit: 0,
|
||||
Usage: 0,
|
||||
UsageLimit: 0,
|
||||
Encoder: 0,
|
||||
Decoder: 0,
|
||||
},
|
||||
{
|
||||
Mem: 0,
|
||||
MemLimit: 100,
|
||||
Usage: 0,
|
||||
UsageLimit: 100,
|
||||
Encoder: 0,
|
||||
Decoder: 0,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
planner := NewResourcePlanner(nodes)
|
||||
|
||||
planner.Add("node1", Resources{
|
||||
CPU: 42,
|
||||
Mem: 33,
|
||||
GPU: ResourcesGPU{
|
||||
Usage: 1,
|
||||
Encoder: 2,
|
||||
Decoder: 3,
|
||||
Mem: 4,
|
||||
},
|
||||
})
|
||||
|
||||
require.Equal(t, map[string]node.Resources{
|
||||
"node1": {
|
||||
NCPU: 1,
|
||||
CPU: 49,
|
||||
Mem: 68,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
GPU: []node.ResourcesGPU{
|
||||
{
|
||||
Mem: 0,
|
||||
MemLimit: 0,
|
||||
Usage: 0,
|
||||
UsageLimit: 0,
|
||||
Encoder: 0,
|
||||
Decoder: 0,
|
||||
},
|
||||
{
|
||||
Mem: 4,
|
||||
MemLimit: 100,
|
||||
Usage: 1,
|
||||
UsageLimit: 100,
|
||||
Encoder: 2,
|
||||
Decoder: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
}, planner.Map())
|
||||
}
|
||||
|
||||
func TestResourcePlannerRemove(t *testing.T) {
|
||||
nodes := map[string]node.About{
|
||||
"node1": {
|
||||
State: "online",
|
||||
Resources: node.Resources{
|
||||
NCPU: 1,
|
||||
CPU: 53,
|
||||
Mem: 35,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
planner := NewResourcePlanner(nodes)
|
||||
|
||||
planner.Remove("node1", Resources{
|
||||
CPU: 13,
|
||||
Mem: 20,
|
||||
})
|
||||
|
||||
require.Equal(t, map[string]node.Resources{
|
||||
"node1": {
|
||||
NCPU: 1,
|
||||
CPU: 40,
|
||||
Mem: 15,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
}, planner.Map())
|
||||
}
|
||||
|
||||
func TestResourcePlannerRemoveTooMuch(t *testing.T) {
|
||||
nodes := map[string]node.About{
|
||||
"node1": {
|
||||
State: "online",
|
||||
Resources: node.Resources{
|
||||
NCPU: 1,
|
||||
CPU: 53,
|
||||
Mem: 35,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
planner := NewResourcePlanner(nodes)
|
||||
|
||||
planner.Remove("node1", Resources{
|
||||
CPU: 100,
|
||||
Mem: 100,
|
||||
})
|
||||
|
||||
require.Equal(t, map[string]node.Resources{
|
||||
"node1": {
|
||||
NCPU: 1,
|
||||
CPU: 0,
|
||||
Mem: 0,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
},
|
||||
}, planner.Map())
|
||||
}
|
||||
|
||||
func TestResourcePlannerRemoveGPU(t *testing.T) {
|
||||
nodes := map[string]node.About{
|
||||
"node1": {
|
||||
State: "online",
|
||||
Resources: node.Resources{
|
||||
NCPU: 1,
|
||||
CPU: 53,
|
||||
Mem: 35,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
GPU: []node.ResourcesGPU{
|
||||
{
|
||||
Mem: 4,
|
||||
MemLimit: 100,
|
||||
Usage: 1,
|
||||
UsageLimit: 100,
|
||||
Encoder: 2,
|
||||
Decoder: 3,
|
||||
},
|
||||
{
|
||||
Mem: 23,
|
||||
MemLimit: 100,
|
||||
Usage: 43,
|
||||
UsageLimit: 100,
|
||||
Encoder: 95,
|
||||
Decoder: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
planner := NewResourcePlanner(nodes)
|
||||
|
||||
planner.Remove("node1", Resources{
|
||||
CPU: 13,
|
||||
Mem: 20,
|
||||
GPU: ResourcesGPU{
|
||||
Index: 1,
|
||||
Usage: 3,
|
||||
Encoder: 40,
|
||||
Decoder: 0,
|
||||
Mem: 5,
|
||||
},
|
||||
})
|
||||
|
||||
require.Equal(t, map[string]node.Resources{
|
||||
"node1": {
|
||||
NCPU: 1,
|
||||
CPU: 40,
|
||||
Mem: 15,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
GPU: []node.ResourcesGPU{
|
||||
{
|
||||
Mem: 4,
|
||||
MemLimit: 100,
|
||||
Usage: 1,
|
||||
UsageLimit: 100,
|
||||
Encoder: 2,
|
||||
Decoder: 3,
|
||||
},
|
||||
{
|
||||
Mem: 18,
|
||||
MemLimit: 100,
|
||||
Usage: 40,
|
||||
UsageLimit: 100,
|
||||
Encoder: 55,
|
||||
Decoder: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
}, planner.Map())
|
||||
}
|
||||
|
||||
func TestResourcePlannerRemoveGPUTooMuch(t *testing.T) {
|
||||
nodes := map[string]node.About{
|
||||
"node1": {
|
||||
State: "online",
|
||||
Resources: node.Resources{
|
||||
NCPU: 1,
|
||||
CPU: 53,
|
||||
Mem: 35,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
GPU: []node.ResourcesGPU{
|
||||
{
|
||||
Mem: 4,
|
||||
MemLimit: 100,
|
||||
Usage: 1,
|
||||
UsageLimit: 100,
|
||||
Encoder: 2,
|
||||
Decoder: 3,
|
||||
},
|
||||
{
|
||||
Mem: 23,
|
||||
MemLimit: 100,
|
||||
Usage: 43,
|
||||
UsageLimit: 100,
|
||||
Encoder: 95,
|
||||
Decoder: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
planner := NewResourcePlanner(nodes)
|
||||
|
||||
planner.Remove("node1", Resources{
|
||||
CPU: 13,
|
||||
Mem: 20,
|
||||
GPU: ResourcesGPU{
|
||||
Index: 1,
|
||||
Usage: 100,
|
||||
Encoder: 100,
|
||||
Decoder: 100,
|
||||
Mem: 100,
|
||||
},
|
||||
})
|
||||
|
||||
require.Equal(t, map[string]node.Resources{
|
||||
"node1": {
|
||||
NCPU: 1,
|
||||
CPU: 40,
|
||||
Mem: 15,
|
||||
CPULimit: 90,
|
||||
MemLimit: 90,
|
||||
GPU: []node.ResourcesGPU{
|
||||
{
|
||||
Mem: 4,
|
||||
MemLimit: 100,
|
||||
Usage: 1,
|
||||
UsageLimit: 100,
|
||||
Encoder: 2,
|
||||
Decoder: 3,
|
||||
},
|
||||
{
|
||||
Mem: 0,
|
||||
MemLimit: 100,
|
||||
Usage: 0,
|
||||
UsageLimit: 100,
|
||||
Encoder: 0,
|
||||
Decoder: 0,
|
||||
},
|
||||
},
|
||||
},
|
||||
}, planner.Map())
|
||||
}
|
||||
@ -306,8 +306,10 @@ func (d *Config) init() {
|
||||
d.vars.Register(value.NewDir(&d.Router.UIPath, "", d.fs), "router.ui_path", "CORE_ROUTER_UI_PATH", nil, "Path to a directory holding UI files mounted as /ui", false, false)
|
||||
|
||||
// Resources
|
||||
d.vars.Register(value.NewFloat(&d.Resources.MaxCPUUsage, 0), "resources.max_cpu_usage", "CORE_RESOURCES_MAX_CPU_USAGE", nil, "Maximum system CPU usage in percent, from 0 (no limit) to 100", false, false)
|
||||
d.vars.Register(value.NewFloat(&d.Resources.MaxMemoryUsage, 0), "resources.max_memory_usage", "CORE_RESOURCES_MAX_MEMORY_USAGE", nil, "Maximum system usage in percent, from 0 (no limit) to 100", false, false)
|
||||
d.vars.Register(value.NewFloatRange(&d.Resources.MaxCPUUsage, 0, 0, 100), "resources.max_cpu_usage", "CORE_RESOURCES_MAX_CPU_USAGE", nil, "Maximum system CPU usage in percent, from 0 (no limit) to 100", false, false)
|
||||
d.vars.Register(value.NewFloatRange(&d.Resources.MaxMemoryUsage, 0, 0, 100), "resources.max_memory_usage", "CORE_RESOURCES_MAX_MEMORY_USAGE", nil, "Maximum system usage in percent, from 0 (no limit) to 100", false, false)
|
||||
d.vars.Register(value.NewFloatRange(&d.Resources.MaxGPUUsage, 0, 0, 100), "resources.max_gpu_usage", "CORE_RESOURCES_MAX_GPU_USAGE", nil, "Maximum general, encoder, and decoder GPU usage in percent per GPU, from 0 (no limit) to 100", false, false)
|
||||
d.vars.Register(value.NewFloatRange(&d.Resources.MaxGPUMemoryUsage, 0, 0, 100), "resources.max_gpu_memory_usage", "CORE_RESOURCES_MAX_GPU_MEMORY_USAGE", nil, "Maximum GPU memory usage in percent per GPU, from 0 (no limit) to 100", false, false)
|
||||
|
||||
// Cluster
|
||||
d.vars.Register(value.NewBool(&d.Cluster.Enable, false), "cluster.enable", "CORE_CLUSTER_ENABLE", nil, "Enable cluster mode", false, false)
|
||||
@ -494,17 +496,6 @@ func (d *Config) Validate(resetLogs bool) {
|
||||
}
|
||||
}
|
||||
|
||||
// If resource limits are given, all values must be set
|
||||
if d.Resources.MaxCPUUsage > 0 || d.Resources.MaxMemoryUsage > 0 {
|
||||
if d.Resources.MaxCPUUsage <= 0 || d.Resources.MaxCPUUsage > 100 {
|
||||
d.vars.Log("error", "resources.max_cpu_usage", "must be greater than 0 and smaller or equal to 100")
|
||||
}
|
||||
|
||||
if d.Resources.MaxMemoryUsage <= 0 {
|
||||
d.vars.Log("error", "resources.max_memory_usage", "must be greater than 0 and smaller or equal to 100")
|
||||
}
|
||||
}
|
||||
|
||||
// If cluster mode is enabled, a proper address must be provided
|
||||
if d.Cluster.Enable {
|
||||
if len(d.Cluster.Address) == 0 {
|
||||
|
||||
@ -184,8 +184,10 @@ type Data struct {
|
||||
UIPath string `json:"ui_path"`
|
||||
} `json:"router"`
|
||||
Resources struct {
|
||||
MaxCPUUsage float64 `json:"max_cpu_usage"` // percent 0-100
|
||||
MaxMemoryUsage float64 `json:"max_memory_usage"` // percent 0-100
|
||||
MaxCPUUsage float64 `json:"max_cpu_usage"` // percent 0-100
|
||||
MaxMemoryUsage float64 `json:"max_memory_usage"` // percent 0-100
|
||||
MaxGPUUsage float64 `json:"max_gpu_usage"` // percent 0-100
|
||||
MaxGPUMemoryUsage float64 `json:"max_gpu_memory_usage"` // percent 0-100
|
||||
} `json:"resources"`
|
||||
Cluster struct {
|
||||
Enable bool `json:"enable"`
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package value
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
@ -310,3 +311,56 @@ func (u *Float64) Validate() error {
|
||||
func (u *Float64) IsEmpty() bool {
|
||||
return float64(*u) == 0
|
||||
}
|
||||
|
||||
// float64 range
|
||||
|
||||
// Float64Range is a configuration value backed by a caller-owned float64 that
// is only considered valid inside the closed interval [from, to].
type Float64Range struct {
	p    *float64 // storage owned by the caller; written by NewFloatRange and Set
	from float64  // lower bound, inclusive
	to   float64  // upper bound, inclusive
}

// NewFloatRange binds p to a range-checked value, stores val in *p as the
// initial value and returns the wrapper. val is not checked against the range
// here; call Validate for that. p must not be nil.
func NewFloatRange(p *float64, val, from, to float64) *Float64Range {
	v := &Float64Range{
		p:    p,
		from: from,
		to:   to,
	}

	*p = val

	return v
}

// Set parses val as a 64-bit float and stores it in the bound variable. If
// parsing fails the error is returned and the current value is left untouched.
// The range is not enforced here, only in Validate.
func (s *Float64Range) Set(val string) error {
	v, err := strconv.ParseFloat(val, 64)
	if err != nil {
		return err
	}

	*s.p = v

	return nil
}

// String returns the current value with three decimals, or "(empty)" if the
// value is zero.
func (s *Float64Range) String() string {
	if s.IsEmpty() {
		return "(empty)"
	}

	return fmt.Sprintf("%.3f", *s.p)
}

// Validate returns an error if the current value is not inside [from, to].
func (s *Float64Range) Validate() error {
	val := *s.p

	// Deliberately written as a negated "inside" test: every ordered
	// comparison with NaN is false, so "val < from || val > to" would accept
	// NaN, which strconv.ParseFloat happily parses from the string "NaN".
	if !(val >= s.from && val <= s.to) {
		return fmt.Errorf("value %f is not in range [%f, %f]", val, s.from, s.to)
	}

	return nil
}

// IsEmpty reports whether the current value is zero.
func (s *Float64Range) IsEmpty() bool {
	return *s.p == 0
}
|
||||
|
||||
@ -165,3 +165,29 @@ func TestFloat64Value(t *testing.T) {
|
||||
|
||||
require.Equal(t, float64(77.7), x)
|
||||
}
|
||||
|
||||
func TestFloat64RangeValue(t *testing.T) {
|
||||
var x float64
|
||||
|
||||
val := NewFloatRange(&x, 11.1, 0, 100)
|
||||
|
||||
require.Equal(t, "11.100", val.String())
|
||||
require.NoError(t, val.Validate())
|
||||
require.Equal(t, false, val.IsEmpty())
|
||||
|
||||
x = 42.5
|
||||
|
||||
require.Equal(t, "42.500", val.String())
|
||||
require.NoError(t, val.Validate())
|
||||
require.Equal(t, false, val.IsEmpty())
|
||||
|
||||
val.Set("77.7")
|
||||
|
||||
require.Equal(t, float64(77.7), x)
|
||||
|
||||
val.Set("101.9")
|
||||
|
||||
require.Equal(t, "101.900", val.String())
|
||||
require.Error(t, val.Validate())
|
||||
require.Equal(t, false, val.IsEmpty())
|
||||
}
|
||||
|
||||
@ -29,23 +29,26 @@ type FFmpeg interface {
|
||||
}
|
||||
|
||||
type ProcessConfig struct {
|
||||
Reconnect bool // Whether to reconnect
|
||||
ReconnectDelay time.Duration // Duration until next reconnect
|
||||
StaleTimeout time.Duration // Duration to wait until killing the process if there is no progress in the process
|
||||
Timeout time.Duration // Duration to wait until killing the process
|
||||
LimitCPU float64 // Kill the process if the CPU usage in percent is above this value.
|
||||
LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value.
|
||||
LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration.
|
||||
LimitMode string // How to limit the process, "hard" or "soft"
|
||||
Scheduler string // A scheduler for starting the process, either a concrete date (RFC3339) or in crontab syntax
|
||||
Args []string // Arguments for the process
|
||||
Parser process.Parser // Parser for the process output
|
||||
Logger log.Logger // Logger
|
||||
OnArgs func([]string) []string // Callback before starting the process to retrieve new arguments
|
||||
OnBeforeStart func() error // Callback which is called before the process will be started. If error is non-nil, the start will be refused.
|
||||
OnStart func() // Callback called after process has been started
|
||||
OnExit func(state string) // Callback called after the process stopped with exit state as argument
|
||||
OnStateChange func(from, to string) // Callback called on state change
|
||||
Reconnect bool // Whether to reconnect
|
||||
ReconnectDelay time.Duration // Duration until next reconnect
|
||||
StaleTimeout time.Duration // Duration to wait until killing the process if there is no progress in the process
|
||||
Timeout time.Duration // Duration to wait until killing the process
|
||||
LimitCPU float64 // Kill the process if the CPU usage in percent is above this value.
|
||||
LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value.
|
||||
LimitGPUUsage float64 // Kill the process if the GPU usage (general) in percent is above this value.
|
||||
LimitGPUEncoder float64 // Kill the process if the GPU usage (encoder) in percent is above this value.
|
||||
LimitGPUDecoder float64 // Kill the process if the GPU usage (decoder) in percent is above this value.
|
||||
LimitGPUMemory uint64 // Kill the process if the GPU memory consumption in bytes is above this value.
|
||||
LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration.
|
||||
LimitMode string // How to limit the process, "hard" or "soft"
|
||||
Scheduler string // A scheduler for starting the process, either a concrete date (RFC3339) or in crontab syntax
|
||||
Args []string // Arguments for the process
|
||||
Parser process.Parser // Parser for the process output
|
||||
Logger log.Logger // Logger
|
||||
OnBeforeStart func([]string) ([]string, error) // Callback which is called before the process will be started. The string slice is the list of arguments which can be modified. If error is non-nil, the start will be refused.
|
||||
OnStart func() // Callback called after process has been started
|
||||
OnExit func(state string) // Callback called after the process stopped with exit state as argument
|
||||
OnStateChange func(from, to string) // Callback called on state change
|
||||
}
|
||||
|
||||
// Config is the configuration for ffmpeg that is part of the configuration
|
||||
@ -138,23 +141,26 @@ func (f *ffmpeg) New(config ProcessConfig) (process.Process, error) {
|
||||
}
|
||||
|
||||
ffmpeg, err := process.New(process.Config{
|
||||
Binary: f.binary,
|
||||
Args: config.Args,
|
||||
Reconnect: config.Reconnect,
|
||||
ReconnectDelay: config.ReconnectDelay,
|
||||
StaleTimeout: config.StaleTimeout,
|
||||
Timeout: config.Timeout,
|
||||
LimitCPU: config.LimitCPU,
|
||||
LimitMemory: config.LimitMemory,
|
||||
LimitDuration: config.LimitDuration,
|
||||
LimitMode: limitMode,
|
||||
Scheduler: scheduler,
|
||||
Parser: config.Parser,
|
||||
Logger: config.Logger,
|
||||
OnArgs: config.OnArgs,
|
||||
OnBeforeStart: config.OnBeforeStart,
|
||||
OnStart: config.OnStart,
|
||||
OnExit: config.OnExit,
|
||||
Binary: f.binary,
|
||||
Args: config.Args,
|
||||
Reconnect: config.Reconnect,
|
||||
ReconnectDelay: config.ReconnectDelay,
|
||||
StaleTimeout: config.StaleTimeout,
|
||||
Timeout: config.Timeout,
|
||||
LimitCPU: config.LimitCPU,
|
||||
LimitMemory: config.LimitMemory,
|
||||
LimitGPUUsage: config.LimitGPUUsage,
|
||||
LimitGPUEncoder: config.LimitGPUEncoder,
|
||||
LimitGPUDecoder: config.LimitGPUDecoder,
|
||||
LimitGPUMemory: config.LimitGPUMemory,
|
||||
LimitDuration: config.LimitDuration,
|
||||
LimitMode: limitMode,
|
||||
Scheduler: scheduler,
|
||||
Parser: config.Parser,
|
||||
Logger: config.Logger,
|
||||
OnBeforeStart: config.OnBeforeStart,
|
||||
OnStart: config.OnStart,
|
||||
OnExit: config.OnExit,
|
||||
OnStateChange: func(from, to string) {
|
||||
f.statesLock.Lock()
|
||||
switch to {
|
||||
|
||||
@ -619,7 +619,7 @@ func (p *parser) Stop(state string, pusage process.Usage) {
|
||||
usage.CPU.Max = pusage.CPU.Max
|
||||
usage.CPU.Limit = pusage.CPU.Limit
|
||||
|
||||
usage.Memory.Average = pusage.Memory.Average
|
||||
usage.Memory.Average = uint64(pusage.Memory.Average)
|
||||
usage.Memory.Max = pusage.Memory.Max
|
||||
usage.Memory.Limit = pusage.Memory.Limit
|
||||
|
||||
|
||||
@ -576,6 +576,7 @@ type AVstream struct {
|
||||
type Usage struct {
|
||||
CPU UsageCPU
|
||||
Memory UsageMemory
|
||||
GPU UsageGPU
|
||||
}
|
||||
|
||||
type UsageCPU struct {
|
||||
@ -586,7 +587,27 @@ type UsageCPU struct {
|
||||
}
|
||||
|
||||
// UsageMemory holds the average, the maximum and the limit of the memory
// usage. NOTE(review): values are presumably in bytes — confirm.
type UsageMemory struct {
	Average, Max, Limit uint64
}
|
||||
|
||||
// UsageGPU holds the GPU usage figures for the GPU identified by Index: the
// general, encoder and decoder utilization, and the GPU memory usage.
type UsageGPU struct {
	Index                   int
	Usage, Encoder, Decoder UsageGPUUsage
	Memory                  UsageGPUMemory
}

// UsageGPUUsage holds the average, the maximum and the limit of one GPU
// utilization figure. NOTE(review): presumably in percent — confirm.
type UsageGPUUsage struct {
	Average, Max, Limit float64
}

// UsageGPUMemory holds the average, the maximum and the limit of the GPU
// memory usage. NOTE(review): presumably in bytes — confirm.
type UsageGPUMemory struct {
	Average, Max, Limit uint64
}
|
||||
|
||||
@ -155,9 +155,13 @@ type ProcessConfigIOCleanup struct {
|
||||
}
|
||||
|
||||
type ProcessConfigLimits struct {
|
||||
CPU float64 `json:"cpu_usage" jsonschema:"minimum=0"`
|
||||
Memory uint64 `json:"memory_mbytes" jsonschema:"minimum=0" format:"uint64"`
|
||||
WaitFor uint64 `json:"waitfor_seconds" jsonschema:"minimum=0" format:"uint64"`
|
||||
CPU float64 `json:"cpu_usage" jsonschema:"minimum=0"`
|
||||
Memory uint64 `json:"memory_mbytes" jsonschema:"minimum=0" format:"uint64"`
|
||||
GPUUsage float64 `json:"gpu_usage" jsonschema:"minimum=0"`
|
||||
GPUEncoder float64 `json:"gpu_encoder" jsonschema:"minimum=0"`
|
||||
GPUDecoder float64 `json:"gpu_decoder" jsonschema:"minimum=0"`
|
||||
GPUMemory uint64 `json:"gpu_memory_mbytes" jsonschema:"minimum=0" format:"uint64"`
|
||||
WaitFor uint64 `json:"waitfor_seconds" jsonschema:"minimum=0" format:"uint64"`
|
||||
}
|
||||
|
||||
// ProcessConfig represents the configuration of an ffmpeg process
|
||||
@ -197,7 +201,13 @@ func (cfg *ProcessConfig) Marshal() (*app.Config, map[string]interface{}) {
|
||||
Scheduler: cfg.Scheduler,
|
||||
LimitCPU: cfg.Limits.CPU,
|
||||
LimitMemory: cfg.Limits.Memory * 1024 * 1024,
|
||||
LimitWaitFor: cfg.Limits.WaitFor,
|
||||
LimitGPU: app.ConfigLimitGPU{
|
||||
Usage: cfg.Limits.GPUUsage,
|
||||
Encoder: cfg.Limits.GPUEncoder,
|
||||
Decoder: cfg.Limits.GPUDecoder,
|
||||
Memory: cfg.Limits.GPUMemory * 1024 * 1024,
|
||||
},
|
||||
LimitWaitFor: cfg.Limits.WaitFor,
|
||||
}
|
||||
|
||||
cfg.generateInputOutputIDs(cfg.Input)
|
||||
@ -283,6 +293,10 @@ func (cfg *ProcessConfig) Unmarshal(c *app.Config, metadata map[string]interface
|
||||
cfg.Scheduler = c.Scheduler
|
||||
cfg.Limits.CPU = c.LimitCPU
|
||||
cfg.Limits.Memory = c.LimitMemory / 1024 / 1024
|
||||
cfg.Limits.GPUUsage = c.LimitGPU.Usage
|
||||
cfg.Limits.GPUEncoder = c.LimitGPU.Encoder
|
||||
cfg.Limits.GPUDecoder = c.LimitGPU.Decoder
|
||||
cfg.Limits.GPUMemory = c.LimitGPU.Memory / 1024 / 1024
|
||||
cfg.Limits.WaitFor = c.LimitWaitFor
|
||||
|
||||
cfg.Options = make([]string, len(c.Options))
|
||||
@ -364,20 +378,7 @@ func (s *ProcessState) Unmarshal(state *app.State) {
|
||||
s.Memory = state.Memory
|
||||
s.CPU = json.ToNumber(state.CPU)
|
||||
s.LimitMode = state.LimitMode
|
||||
s.Resources.CPU = ProcessUsageCPU{
|
||||
NCPU: json.ToNumber(state.Resources.CPU.NCPU),
|
||||
Current: json.ToNumber(state.Resources.CPU.Current),
|
||||
Average: json.ToNumber(state.Resources.CPU.Average),
|
||||
Max: json.ToNumber(state.Resources.CPU.Max),
|
||||
Limit: json.ToNumber(state.Resources.CPU.Limit),
|
||||
IsThrottling: state.Resources.CPU.IsThrottling,
|
||||
}
|
||||
s.Resources.Memory = ProcessUsageMemory{
|
||||
Current: state.Resources.Memory.Current,
|
||||
Average: json.ToNumber(state.Resources.Memory.Average),
|
||||
Max: state.Resources.Memory.Max,
|
||||
Limit: state.Resources.Memory.Limit,
|
||||
}
|
||||
s.Resources.Unmarshal(&state.Resources)
|
||||
s.Command = state.Command
|
||||
|
||||
s.Progress.Unmarshal(&state.Progress)
|
||||
@ -430,15 +431,15 @@ func (p *ProcessUsageCPU) Marshal() app.ProcessUsageCPU {
|
||||
}
|
||||
|
||||
type ProcessUsageMemory struct {
|
||||
Current uint64 `json:"cur" format:"uint64"`
|
||||
Average json.Number `json:"avg" swaggertype:"number" jsonschema:"type=number"`
|
||||
Max uint64 `json:"max" format:"uint64"`
|
||||
Limit uint64 `json:"limit" format:"uint64"`
|
||||
Current uint64 `json:"cur" format:"uint64"`
|
||||
Average uint64 `json:"avg" format:"uint64"`
|
||||
Max uint64 `json:"max" format:"uint64"`
|
||||
Limit uint64 `json:"limit" format:"uint64"`
|
||||
}
|
||||
|
||||
func (p *ProcessUsageMemory) Unmarshal(pp *app.ProcessUsageMemory) {
|
||||
p.Current = pp.Current
|
||||
p.Average = json.ToNumber(pp.Average)
|
||||
p.Average = pp.Average
|
||||
p.Max = pp.Max
|
||||
p.Limit = pp.Limit
|
||||
}
|
||||
@ -446,31 +447,120 @@ func (p *ProcessUsageMemory) Unmarshal(pp *app.ProcessUsageMemory) {
|
||||
func (p *ProcessUsageMemory) Marshal() app.ProcessUsageMemory {
|
||||
pp := app.ProcessUsageMemory{
|
||||
Current: p.Current,
|
||||
Average: p.Average,
|
||||
Max: p.Max,
|
||||
Limit: p.Limit,
|
||||
}
|
||||
|
||||
return pp
|
||||
}
|
||||
|
||||
type ProcessUsageGPUMemory struct {
|
||||
Current uint64 `json:"cur" format:"uint64"`
|
||||
Average uint64 `json:"avg" format:"uint64"`
|
||||
Max uint64 `json:"max" format:"uint64"`
|
||||
Limit uint64 `json:"limit" format:"uint64"`
|
||||
}
|
||||
|
||||
func (p *ProcessUsageGPUMemory) Unmarshal(pp *app.ProcessUsageGPUMemory) {
|
||||
p.Current = pp.Current
|
||||
p.Average = pp.Average
|
||||
p.Max = pp.Max
|
||||
p.Limit = pp.Limit
|
||||
}
|
||||
|
||||
func (p *ProcessUsageGPUMemory) Marshal() app.ProcessUsageGPUMemory {
|
||||
pp := app.ProcessUsageGPUMemory{
|
||||
Current: p.Current,
|
||||
Average: p.Average,
|
||||
Max: p.Max,
|
||||
Limit: p.Limit,
|
||||
}
|
||||
|
||||
return pp
|
||||
}
|
||||
|
||||
type ProcessUsageGPUUsage struct {
|
||||
Current json.Number `json:"cur" swaggertype:"number" jsonschema:"type=number"`
|
||||
Average json.Number `json:"avg" swaggertype:"number" jsonschema:"type=number"`
|
||||
Max json.Number `json:"max" swaggertype:"number" jsonschema:"type=number"`
|
||||
Limit json.Number `json:"limit" swaggertype:"number" jsonschema:"type=number"`
|
||||
}
|
||||
|
||||
func (p *ProcessUsageGPUUsage) Unmarshal(pp *app.ProcessUsageGPUUsage) {
|
||||
p.Current = json.ToNumber(pp.Current)
|
||||
p.Average = json.ToNumber(pp.Average)
|
||||
p.Max = json.ToNumber(pp.Max)
|
||||
p.Limit = json.ToNumber(pp.Limit)
|
||||
}
|
||||
|
||||
func (p *ProcessUsageGPUUsage) Marshal() app.ProcessUsageGPUUsage {
|
||||
pp := app.ProcessUsageGPUUsage{}
|
||||
|
||||
if x, err := p.Current.Float64(); err == nil {
|
||||
pp.Current = x
|
||||
}
|
||||
|
||||
if x, err := p.Average.Float64(); err == nil {
|
||||
pp.Average = x
|
||||
}
|
||||
|
||||
if x, err := p.Max.Float64(); err == nil {
|
||||
pp.Max = x
|
||||
}
|
||||
|
||||
if x, err := p.Limit.Float64(); err == nil {
|
||||
pp.Limit = x
|
||||
}
|
||||
|
||||
return pp
|
||||
}
|
||||
|
||||
type ProcessUsageGPU struct {
|
||||
Index int `json:"index"`
|
||||
Memory ProcessUsageGPUMemory `json:"memory_bytes"`
|
||||
Usage ProcessUsageGPUUsage `json:"usage"`
|
||||
Encoder ProcessUsageGPUUsage `json:"encoder"`
|
||||
Decoder ProcessUsageGPUUsage `json:"decoder"`
|
||||
}
|
||||
|
||||
func (p *ProcessUsageGPU) Unmarshal(pp *app.ProcessUsageGPU) {
|
||||
p.Index = pp.Index
|
||||
p.Memory.Unmarshal(&pp.Memory)
|
||||
p.Usage.Unmarshal(&pp.Usage)
|
||||
p.Encoder.Unmarshal(&pp.Encoder)
|
||||
p.Decoder.Unmarshal(&pp.Decoder)
|
||||
}
|
||||
|
||||
func (p *ProcessUsageGPU) Marshal() app.ProcessUsageGPU {
|
||||
pp := app.ProcessUsageGPU{
|
||||
Index: p.Index,
|
||||
Memory: p.Memory.Marshal(),
|
||||
Usage: p.Usage.Marshal(),
|
||||
Encoder: p.Encoder.Marshal(),
|
||||
Decoder: p.Decoder.Marshal(),
|
||||
}
|
||||
|
||||
return pp
|
||||
}
|
||||
|
||||
type ProcessUsage struct {
|
||||
CPU ProcessUsageCPU `json:"cpu_usage"`
|
||||
Memory ProcessUsageMemory `json:"memory_bytes"`
|
||||
GPU ProcessUsageGPU `json:"gpu"`
|
||||
}
|
||||
|
||||
func (p *ProcessUsage) Unmarshal(pp *app.ProcessUsage) {
|
||||
p.CPU.Unmarshal(&pp.CPU)
|
||||
p.Memory.Unmarshal(&pp.Memory)
|
||||
p.GPU.Unmarshal(&pp.GPU)
|
||||
}
|
||||
|
||||
func (p *ProcessUsage) Marshal() app.ProcessUsage {
|
||||
pp := app.ProcessUsage{
|
||||
CPU: p.CPU.Marshal(),
|
||||
Memory: p.Memory.Marshal(),
|
||||
GPU: p.GPU.Marshal(),
|
||||
}
|
||||
|
||||
return pp
|
||||
|
||||
@ -56,6 +56,33 @@ func TestProcessUsage(t *testing.T) {
|
||||
Max: 150,
|
||||
Limit: 200,
|
||||
},
|
||||
GPU: app.ProcessUsageGPU{
|
||||
Index: 3,
|
||||
Memory: app.ProcessUsageGPUMemory{
|
||||
Current: 48,
|
||||
Average: 43,
|
||||
Max: 88,
|
||||
Limit: 34,
|
||||
},
|
||||
Usage: app.ProcessUsageGPUUsage{
|
||||
Current: 47,
|
||||
Average: 22,
|
||||
Max: 90,
|
||||
Limit: 80,
|
||||
},
|
||||
Encoder: app.ProcessUsageGPUUsage{
|
||||
Current: 48,
|
||||
Average: 46,
|
||||
Max: 74,
|
||||
Limit: 46,
|
||||
},
|
||||
Decoder: app.ProcessUsageGPUUsage{
|
||||
Current: 21,
|
||||
Average: 42,
|
||||
Max: 30,
|
||||
Limit: 99,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
p := ProcessUsage{}
|
||||
@ -103,7 +130,13 @@ func TestProcessConfig(t *testing.T) {
|
||||
LogPatterns: []string{"bla", "blubb"},
|
||||
LimitCPU: 10,
|
||||
LimitMemory: 100 * 1024 * 1024,
|
||||
LimitWaitFor: 20,
|
||||
LimitGPU: app.ConfigLimitGPU{
|
||||
Usage: 50,
|
||||
Encoder: 90,
|
||||
Decoder: 80,
|
||||
Memory: 24 * 1024 * 1024 * 1024,
|
||||
},
|
||||
LimitWaitFor: 20,
|
||||
}
|
||||
|
||||
p := ProcessConfig{}
|
||||
|
||||
3
internal/.gitignore
vendored
3
internal/.gitignore
vendored
@ -2,4 +2,5 @@ testhelper/ignoresigint/ignoresigint
|
||||
testhelper/sigint/sigint
|
||||
testhelper/sigintwait/sigintwait
|
||||
testhelper/sigpropagate/sigpropagate
|
||||
testhelper/ffmpeg/ffmpeg
|
||||
testhelper/ffmpeg/ffmpeg
|
||||
testhelper/nvidia-smi/nvidia-smi
|
||||
973
internal/testhelper/nvidia-smi/nvidia-smi.go
Normal file
973
internal/testhelper/nvidia-smi/nvidia-smi.go
Normal file
@ -0,0 +1,973 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/signal"
|
||||
"time"
|
||||
)
|
||||
|
||||
// pmondata is canned per-process GPU monitoring output. According to its two
// header rows the columns are: GPU index, PID, type (C/G), sm, mem, enc and dec
// in percent, fb in MB, and the command name. A "-" stands in where a row has
// no number for a column.
// NOTE(review): presumably mirrors the output of `nvidia-smi pmon` — confirm.
var pmondata = `# gpu pid type sm mem enc dec fb command
|
||||
# Idx # C/G % % % % MB name
|
||||
0 7372 C 2 0 2 - 136 ffmpeg
|
||||
0 12176 C 5 2 3 7 782 ffmpeg
|
||||
1 20035 C 8 2 4 1 1145 ffmpeg
|
||||
1 20141 C 2 1 1 3 429 ffmpeg
|
||||
0 29591 C 2 1 - 2 435 ffmpeg `
|
||||
|
||||
var querydata = `<?xml version="1.0" ?>
|
||||
<!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v12.dtd">
|
||||
<nvidia_smi_log>
|
||||
<timestamp>Mon Jul 15 13:41:56 2024</timestamp>
|
||||
<driver_version>555.42.06</driver_version>
|
||||
<cuda_version>12.5</cuda_version>
|
||||
<attached_gpus>2</attached_gpus>
|
||||
<gpu id="00000000:01:00.0">
|
||||
<product_name>NVIDIA L4</product_name>
|
||||
<product_brand>NVIDIA</product_brand>
|
||||
<product_architecture>Ada Lovelace</product_architecture>
|
||||
<display_mode>Enabled</display_mode>
|
||||
<display_active>Disabled</display_active>
|
||||
<persistence_mode>Disabled</persistence_mode>
|
||||
<addressing_mode>None</addressing_mode>
|
||||
<mig_mode>
|
||||
<current_mig>N/A</current_mig>
|
||||
<pending_mig>N/A</pending_mig>
|
||||
</mig_mode>
|
||||
<mig_devices>
|
||||
None
|
||||
</mig_devices>
|
||||
<accounting_mode>Disabled</accounting_mode>
|
||||
<accounting_mode_buffer_size>4000</accounting_mode_buffer_size>
|
||||
<driver_model>
|
||||
<current_dm>N/A</current_dm>
|
||||
<pending_dm>N/A</pending_dm>
|
||||
</driver_model>
|
||||
<serial>1654523003308</serial>
|
||||
<uuid>GPU-c5533cd4-5a60-059e-348d-b6d7466932e4</uuid>
|
||||
<minor_number>1</minor_number>
|
||||
<vbios_version>95.04.29.00.06</vbios_version>
|
||||
<multigpu_board>No</multigpu_board>
|
||||
<board_id>0x100</board_id>
|
||||
<board_part_number>900-2G193-0000-001</board_part_number>
|
||||
<gpu_part_number>27B8-895-A1</gpu_part_number>
|
||||
<gpu_fru_part_number>N/A</gpu_fru_part_number>
|
||||
<gpu_module_id>1</gpu_module_id>
|
||||
<inforom_version>
|
||||
<img_version>G193.0200.00.01</img_version>
|
||||
<oem_object>2.1</oem_object>
|
||||
<ecc_object>6.16</ecc_object>
|
||||
<pwr_object>N/A</pwr_object>
|
||||
</inforom_version>
|
||||
<inforom_bbx_flush>
|
||||
<latest_timestamp>N/A</latest_timestamp>
|
||||
<latest_duration>N/A</latest_duration>
|
||||
</inforom_bbx_flush>
|
||||
<gpu_operation_mode>
|
||||
<current_gom>N/A</current_gom>
|
||||
<pending_gom>N/A</pending_gom>
|
||||
</gpu_operation_mode>
|
||||
<c2c_mode>N/A</c2c_mode>
|
||||
<gpu_virtualization_mode>
|
||||
<virtualization_mode>None</virtualization_mode>
|
||||
<host_vgpu_mode>N/A</host_vgpu_mode>
|
||||
<vgpu_heterogeneous_mode>N/A</vgpu_heterogeneous_mode>
|
||||
</gpu_virtualization_mode>
|
||||
<gpu_reset_status>
|
||||
<reset_required>No</reset_required>
|
||||
<drain_and_reset_recommended>N/A</drain_and_reset_recommended>
|
||||
</gpu_reset_status>
|
||||
<gsp_firmware_version>555.42.06</gsp_firmware_version>
|
||||
<ibmnpu>
|
||||
<relaxed_ordering_mode>N/A</relaxed_ordering_mode>
|
||||
</ibmnpu>
|
||||
<pci>
|
||||
<pci_bus>01</pci_bus>
|
||||
<pci_device>00</pci_device>
|
||||
<pci_domain>0000</pci_domain>
|
||||
<pci_base_class>3</pci_base_class>
|
||||
<pci_sub_class>2</pci_sub_class>
|
||||
<pci_device_id>27B810DE</pci_device_id>
|
||||
<pci_bus_id>00000000:01:00.0</pci_bus_id>
|
||||
<pci_sub_system_id>16CA10DE</pci_sub_system_id>
|
||||
<pci_gpu_link_info>
|
||||
<pcie_gen>
|
||||
<max_link_gen>4</max_link_gen>
|
||||
<current_link_gen>4</current_link_gen>
|
||||
<device_current_link_gen>4</device_current_link_gen>
|
||||
<max_device_link_gen>4</max_device_link_gen>
|
||||
<max_host_link_gen>5</max_host_link_gen>
|
||||
</pcie_gen>
|
||||
<link_widths>
|
||||
<max_link_width>16x</max_link_width>
|
||||
<current_link_width>16x</current_link_width>
|
||||
</link_widths>
|
||||
</pci_gpu_link_info>
|
||||
<pci_bridge_chip>
|
||||
<bridge_chip_type>N/A</bridge_chip_type>
|
||||
<bridge_chip_fw>N/A</bridge_chip_fw>
|
||||
</pci_bridge_chip>
|
||||
<replay_counter>0</replay_counter>
|
||||
<replay_rollover_counter>0</replay_rollover_counter>
|
||||
<tx_util>0 KB/s</tx_util>
|
||||
<rx_util>0 KB/s</rx_util>
|
||||
<atomic_caps_inbound>N/A</atomic_caps_inbound>
|
||||
<atomic_caps_outbound>N/A</atomic_caps_outbound>
|
||||
</pci>
|
||||
<fan_speed>N/A</fan_speed>
|
||||
<performance_state>P0</performance_state>
|
||||
<clocks_event_reasons>
|
||||
<clocks_event_reason_gpu_idle>Active</clocks_event_reason_gpu_idle>
|
||||
<clocks_event_reason_applications_clocks_setting>Not Active</clocks_event_reason_applications_clocks_setting>
|
||||
<clocks_event_reason_sw_power_cap>Not Active</clocks_event_reason_sw_power_cap>
|
||||
<clocks_event_reason_hw_slowdown>Not Active</clocks_event_reason_hw_slowdown>
|
||||
<clocks_event_reason_hw_thermal_slowdown>Not Active</clocks_event_reason_hw_thermal_slowdown>
|
||||
<clocks_event_reason_hw_power_brake_slowdown>Not Active</clocks_event_reason_hw_power_brake_slowdown>
|
||||
<clocks_event_reason_sync_boost>Not Active</clocks_event_reason_sync_boost>
|
||||
<clocks_event_reason_sw_thermal_slowdown>Not Active</clocks_event_reason_sw_thermal_slowdown>
|
||||
<clocks_event_reason_display_clocks_setting>Not Active</clocks_event_reason_display_clocks_setting>
|
||||
</clocks_event_reasons>
|
||||
<sparse_operation_mode>N/A</sparse_operation_mode>
|
||||
<fb_memory_usage>
|
||||
<total>23034 MiB</total>
|
||||
<reserved>434 MiB</reserved>
|
||||
<used>1 MiB</used>
|
||||
<free>22601 MiB</free>
|
||||
</fb_memory_usage>
|
||||
<bar1_memory_usage>
|
||||
<total>32768 MiB</total>
|
||||
<used>1 MiB</used>
|
||||
<free>32767 MiB</free>
|
||||
</bar1_memory_usage>
|
||||
<cc_protected_memory_usage>
|
||||
<total>0 MiB</total>
|
||||
<used>0 MiB</used>
|
||||
<free>0 MiB</free>
|
||||
</cc_protected_memory_usage>
|
||||
<compute_mode>Default</compute_mode>
|
||||
<utilization>
|
||||
<gpu_util>2 %</gpu_util>
|
||||
<memory_util>0 %</memory_util>
|
||||
<encoder_util>0 %</encoder_util>
|
||||
<decoder_util>0 %</decoder_util>
|
||||
<jpeg_util>0 %</jpeg_util>
|
||||
<ofa_util>0 %</ofa_util>
|
||||
</utilization>
|
||||
<encoder_stats>
|
||||
<session_count>0</session_count>
|
||||
<average_fps>0</average_fps>
|
||||
<average_latency>0</average_latency>
|
||||
</encoder_stats>
|
||||
<fbc_stats>
|
||||
<session_count>0</session_count>
|
||||
<average_fps>0</average_fps>
|
||||
<average_latency>0</average_latency>
|
||||
</fbc_stats>
|
||||
<ecc_mode>
|
||||
<current_ecc>Enabled</current_ecc>
|
||||
<pending_ecc>Enabled</pending_ecc>
|
||||
</ecc_mode>
|
||||
<ecc_errors>
|
||||
<volatile>
|
||||
<sram_correctable>0</sram_correctable>
|
||||
<sram_uncorrectable_parity>0</sram_uncorrectable_parity>
|
||||
<sram_uncorrectable_secded>0</sram_uncorrectable_secded>
|
||||
<dram_correctable>0</dram_correctable>
|
||||
<dram_uncorrectable>0</dram_uncorrectable>
|
||||
</volatile>
|
||||
<aggregate>
|
||||
<sram_correctable>0</sram_correctable>
|
||||
<sram_uncorrectable_parity>0</sram_uncorrectable_parity>
|
||||
<sram_uncorrectable_secded>0</sram_uncorrectable_secded>
|
||||
<dram_correctable>0</dram_correctable>
|
||||
<dram_uncorrectable>0</dram_uncorrectable>
|
||||
<sram_threshold_exceeded>No</sram_threshold_exceeded>
|
||||
</aggregate>
|
||||
<aggregate_uncorrectable_sram_sources>
|
||||
<sram_l2>0</sram_l2>
|
||||
<sram_sm>0</sram_sm>
|
||||
<sram_microcontroller>0</sram_microcontroller>
|
||||
<sram_pcie>0</sram_pcie>
|
||||
<sram_other>0</sram_other>
|
||||
</aggregate_uncorrectable_sram_sources>
|
||||
</ecc_errors>
|
||||
<retired_pages>
|
||||
<multiple_single_bit_retirement>
|
||||
<retired_count>N/A</retired_count>
|
||||
<retired_pagelist>N/A</retired_pagelist>
|
||||
</multiple_single_bit_retirement>
|
||||
<double_bit_retirement>
|
||||
<retired_count>N/A</retired_count>
|
||||
<retired_pagelist>N/A</retired_pagelist>
|
||||
</double_bit_retirement>
|
||||
<pending_blacklist>N/A</pending_blacklist>
|
||||
<pending_retirement>N/A</pending_retirement>
|
||||
</retired_pages>
|
||||
<remapped_rows>
|
||||
<remapped_row_corr>0</remapped_row_corr>
|
||||
<remapped_row_unc>0</remapped_row_unc>
|
||||
<remapped_row_pending>No</remapped_row_pending>
|
||||
<remapped_row_failure>No</remapped_row_failure>
|
||||
<row_remapper_histogram>
|
||||
<row_remapper_histogram_max>96 bank(s)</row_remapper_histogram_max>
|
||||
<row_remapper_histogram_high>0 bank(s)</row_remapper_histogram_high>
|
||||
<row_remapper_histogram_partial>0 bank(s)</row_remapper_histogram_partial>
|
||||
<row_remapper_histogram_low>0 bank(s)</row_remapper_histogram_low>
|
||||
<row_remapper_histogram_none>0 bank(s)</row_remapper_histogram_none>
|
||||
</row_remapper_histogram>
|
||||
</remapped_rows>
|
||||
<temperature>
|
||||
<gpu_temp>45 C</gpu_temp>
|
||||
<gpu_temp_tlimit>39 C</gpu_temp_tlimit>
|
||||
<gpu_temp_max_tlimit_threshold>-5 C</gpu_temp_max_tlimit_threshold>
|
||||
<gpu_temp_slow_tlimit_threshold>-2 C</gpu_temp_slow_tlimit_threshold>
|
||||
<gpu_temp_max_gpu_tlimit_threshold>0 C</gpu_temp_max_gpu_tlimit_threshold>
|
||||
<gpu_target_temperature>N/A</gpu_target_temperature>
|
||||
<memory_temp>N/A</memory_temp>
|
||||
<gpu_temp_max_mem_tlimit_threshold>N/A</gpu_temp_max_mem_tlimit_threshold>
|
||||
</temperature>
|
||||
<supported_gpu_target_temp>
|
||||
<gpu_target_temp_min>N/A</gpu_target_temp_min>
|
||||
<gpu_target_temp_max>N/A</gpu_target_temp_max>
|
||||
</supported_gpu_target_temp>
|
||||
<gpu_power_readings>
|
||||
<power_state>P0</power_state>
|
||||
<power_draw>27.22 W</power_draw>
|
||||
<current_power_limit>72.00 W</current_power_limit>
|
||||
<requested_power_limit>72.00 W</requested_power_limit>
|
||||
<default_power_limit>72.00 W</default_power_limit>
|
||||
<min_power_limit>40.00 W</min_power_limit>
|
||||
<max_power_limit>72.00 W</max_power_limit>
|
||||
</gpu_power_readings>
|
||||
<gpu_memory_power_readings>
|
||||
<power_draw>N/A</power_draw>
|
||||
</gpu_memory_power_readings>
|
||||
<module_power_readings>
|
||||
<power_state>P0</power_state>
|
||||
<power_draw>N/A</power_draw>
|
||||
<current_power_limit>N/A</current_power_limit>
|
||||
<requested_power_limit>N/A</requested_power_limit>
|
||||
<default_power_limit>N/A</default_power_limit>
|
||||
<min_power_limit>N/A</min_power_limit>
|
||||
<max_power_limit>N/A</max_power_limit>
|
||||
</module_power_readings>
|
||||
<clocks>
|
||||
<graphics_clock>2040 MHz</graphics_clock>
|
||||
<sm_clock>2040 MHz</sm_clock>
|
||||
<mem_clock>6250 MHz</mem_clock>
|
||||
<video_clock>1770 MHz</video_clock>
|
||||
</clocks>
|
||||
<applications_clocks>
|
||||
<graphics_clock>2040 MHz</graphics_clock>
|
||||
<mem_clock>6251 MHz</mem_clock>
|
||||
</applications_clocks>
|
||||
<default_applications_clocks>
|
||||
<graphics_clock>2040 MHz</graphics_clock>
|
||||
<mem_clock>6251 MHz</mem_clock>
|
||||
</default_applications_clocks>
|
||||
<deferred_clocks>
|
||||
<mem_clock>N/A</mem_clock>
|
||||
</deferred_clocks>
|
||||
<max_clocks>
|
||||
<graphics_clock>2040 MHz</graphics_clock>
|
||||
<sm_clock>2040 MHz</sm_clock>
|
||||
<mem_clock>6251 MHz</mem_clock>
|
||||
<video_clock>1770 MHz</video_clock>
|
||||
</max_clocks>
|
||||
<max_customer_boost_clocks>
|
||||
<graphics_clock>2040 MHz</graphics_clock>
|
||||
</max_customer_boost_clocks>
|
||||
<clock_policy>
|
||||
<auto_boost>N/A</auto_boost>
|
||||
<auto_boost_default>N/A</auto_boost_default>
|
||||
</clock_policy>
|
||||
<voltage>
|
||||
<graphics_volt>885.000 mV</graphics_volt>
|
||||
</voltage>
|
||||
<fabric>
|
||||
<state>N/A</state>
|
||||
<status>N/A</status>
|
||||
<cliqueId>N/A</cliqueId>
|
||||
<clusterUuid>N/A</clusterUuid>
|
||||
<health>
|
||||
<bandwidth>N/A</bandwidth>
|
||||
</health>
|
||||
</fabric>
|
||||
<supported_clocks>
|
||||
<supported_mem_clock>
|
||||
<value>6251 MHz</value>
|
||||
<supported_graphics_clock>2040 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>2025 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>2010 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1995 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1980 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1965 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1950 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1935 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1920 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1905 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1890 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1875 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1860 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1845 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1830 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1815 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1800 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1785 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1770 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1755 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1740 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1725 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1710 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1695 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1680 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1665 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1650 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1635 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1620 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1605 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1590 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1575 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1560 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1545 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1530 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1515 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1500 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1485 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1470 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1455 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1440 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1425 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1410 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1395 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1380 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1365 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1350 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1335 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1320 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1305 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1290 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1275 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1260 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1245 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1230 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1215 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1200 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1185 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1170 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1155 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1140 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1125 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1110 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1095 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1080 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1065 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1050 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1035 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1020 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1005 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>990 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>975 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>960 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>945 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>930 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>915 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>900 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>885 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>870 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>855 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>840 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>825 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>810 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>795 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>780 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>765 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>750 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>735 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>720 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>705 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>690 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>675 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>660 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>645 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>630 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>615 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>600 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>585 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>570 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>555 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>540 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>525 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>510 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>495 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>480 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>465 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>450 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>435 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>420 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>405 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>390 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>375 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>360 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>345 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>330 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>315 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>300 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>285 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>270 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>255 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>240 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>225 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>210 MHz</supported_graphics_clock>
|
||||
</supported_mem_clock>
|
||||
<supported_mem_clock>
|
||||
<value>405 MHz</value>
|
||||
<supported_graphics_clock>645 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>630 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>615 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>600 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>585 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>570 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>555 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>540 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>525 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>510 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>495 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>480 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>465 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>450 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>435 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>420 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>405 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>390 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>375 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>360 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>345 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>330 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>315 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>300 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>285 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>270 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>255 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>240 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>225 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>210 MHz</supported_graphics_clock>
|
||||
</supported_mem_clock>
|
||||
</supported_clocks>
|
||||
<processes>
|
||||
<process_info>
|
||||
<pid>10131</pid>
|
||||
<type>C</type>
|
||||
<process_name>ffmpeg</process_name>
|
||||
<used_memory>389 MiB</used_memory>
|
||||
</process_info>
|
||||
<process_info>
|
||||
<pid>13597</pid>
|
||||
<type>C</type>
|
||||
<process_name>ffmpeg</process_name>
|
||||
<used_memory>1054 MiB</used_memory>
|
||||
</process_info>
|
||||
</processes>
|
||||
<accounted_processes>
|
||||
</accounted_processes>
|
||||
<capabilities>
|
||||
<egm>disabled</egm>
|
||||
</capabilities>
|
||||
</gpu>
|
||||
|
||||
<gpu id="00000000:C1:00.0">
|
||||
<product_name>NVIDIA L4</product_name>
|
||||
<product_brand>NVIDIA</product_brand>
|
||||
<product_architecture>Ada Lovelace</product_architecture>
|
||||
<display_mode>Enabled</display_mode>
|
||||
<display_active>Disabled</display_active>
|
||||
<persistence_mode>Disabled</persistence_mode>
|
||||
<addressing_mode>None</addressing_mode>
|
||||
<mig_mode>
|
||||
<current_mig>N/A</current_mig>
|
||||
<pending_mig>N/A</pending_mig>
|
||||
</mig_mode>
|
||||
<mig_devices>
|
||||
None
|
||||
</mig_devices>
|
||||
<accounting_mode>Disabled</accounting_mode>
|
||||
<accounting_mode_buffer_size>4000</accounting_mode_buffer_size>
|
||||
<driver_model>
|
||||
<current_dm>N/A</current_dm>
|
||||
<pending_dm>N/A</pending_dm>
|
||||
</driver_model>
|
||||
<serial>1654523001128</serial>
|
||||
<uuid>GPU-128ab6fb-6ec9-fd74-b479-4a5fd14f55bd</uuid>
|
||||
<minor_number>0</minor_number>
|
||||
<vbios_version>95.04.29.00.06</vbios_version>
|
||||
<multigpu_board>No</multigpu_board>
|
||||
<board_id>0xc100</board_id>
|
||||
<board_part_number>900-2G193-0000-001</board_part_number>
|
||||
<gpu_part_number>27B8-895-A1</gpu_part_number>
|
||||
<gpu_fru_part_number>N/A</gpu_fru_part_number>
|
||||
<gpu_module_id>1</gpu_module_id>
|
||||
<inforom_version>
|
||||
<img_version>G193.0200.00.01</img_version>
|
||||
<oem_object>2.1</oem_object>
|
||||
<ecc_object>6.16</ecc_object>
|
||||
<pwr_object>N/A</pwr_object>
|
||||
</inforom_version>
|
||||
<inforom_bbx_flush>
|
||||
<latest_timestamp>N/A</latest_timestamp>
|
||||
<latest_duration>N/A</latest_duration>
|
||||
</inforom_bbx_flush>
|
||||
<gpu_operation_mode>
|
||||
<current_gom>N/A</current_gom>
|
||||
<pending_gom>N/A</pending_gom>
|
||||
</gpu_operation_mode>
|
||||
<c2c_mode>N/A</c2c_mode>
|
||||
<gpu_virtualization_mode>
|
||||
<virtualization_mode>None</virtualization_mode>
|
||||
<host_vgpu_mode>N/A</host_vgpu_mode>
|
||||
<vgpu_heterogeneous_mode>N/A</vgpu_heterogeneous_mode>
|
||||
</gpu_virtualization_mode>
|
||||
<gpu_reset_status>
|
||||
<reset_required>No</reset_required>
|
||||
<drain_and_reset_recommended>N/A</drain_and_reset_recommended>
|
||||
</gpu_reset_status>
|
||||
<gsp_firmware_version>555.42.06</gsp_firmware_version>
|
||||
<ibmnpu>
|
||||
<relaxed_ordering_mode>N/A</relaxed_ordering_mode>
|
||||
</ibmnpu>
|
||||
<pci>
|
||||
<pci_bus>C1</pci_bus>
|
||||
<pci_device>00</pci_device>
|
||||
<pci_domain>0000</pci_domain>
|
||||
<pci_base_class>3</pci_base_class>
|
||||
<pci_sub_class>2</pci_sub_class>
|
||||
<pci_device_id>27B810DE</pci_device_id>
|
||||
<pci_bus_id>00000000:C1:00.0</pci_bus_id>
|
||||
<pci_sub_system_id>16CA10DE</pci_sub_system_id>
|
||||
<pci_gpu_link_info>
|
||||
<pcie_gen>
|
||||
<max_link_gen>4</max_link_gen>
|
||||
<current_link_gen>4</current_link_gen>
|
||||
<device_current_link_gen>4</device_current_link_gen>
|
||||
<max_device_link_gen>4</max_device_link_gen>
|
||||
<max_host_link_gen>5</max_host_link_gen>
|
||||
</pcie_gen>
|
||||
<link_widths>
|
||||
<max_link_width>16x</max_link_width>
|
||||
<current_link_width>1x</current_link_width>
|
||||
</link_widths>
|
||||
</pci_gpu_link_info>
|
||||
<pci_bridge_chip>
|
||||
<bridge_chip_type>N/A</bridge_chip_type>
|
||||
<bridge_chip_fw>N/A</bridge_chip_fw>
|
||||
</pci_bridge_chip>
|
||||
<replay_counter>0</replay_counter>
|
||||
<replay_rollover_counter>0</replay_rollover_counter>
|
||||
<tx_util>0 KB/s</tx_util>
|
||||
<rx_util>0 KB/s</rx_util>
|
||||
<atomic_caps_inbound>N/A</atomic_caps_inbound>
|
||||
<atomic_caps_outbound>N/A</atomic_caps_outbound>
|
||||
</pci>
|
||||
<fan_speed>N/A</fan_speed>
|
||||
<performance_state>P0</performance_state>
|
||||
<clocks_event_reasons>
|
||||
<clocks_event_reason_gpu_idle>Active</clocks_event_reason_gpu_idle>
|
||||
<clocks_event_reason_applications_clocks_setting>Not Active</clocks_event_reason_applications_clocks_setting>
|
||||
<clocks_event_reason_sw_power_cap>Not Active</clocks_event_reason_sw_power_cap>
|
||||
<clocks_event_reason_hw_slowdown>Not Active</clocks_event_reason_hw_slowdown>
|
||||
<clocks_event_reason_hw_thermal_slowdown>Not Active</clocks_event_reason_hw_thermal_slowdown>
|
||||
<clocks_event_reason_hw_power_brake_slowdown>Not Active</clocks_event_reason_hw_power_brake_slowdown>
|
||||
<clocks_event_reason_sync_boost>Not Active</clocks_event_reason_sync_boost>
|
||||
<clocks_event_reason_sw_thermal_slowdown>Not Active</clocks_event_reason_sw_thermal_slowdown>
|
||||
<clocks_event_reason_display_clocks_setting>Not Active</clocks_event_reason_display_clocks_setting>
|
||||
</clocks_event_reasons>
|
||||
<sparse_operation_mode>N/A</sparse_operation_mode>
|
||||
<fb_memory_usage>
|
||||
<total>23034 MiB</total>
|
||||
<reserved>434 MiB</reserved>
|
||||
<used>1 MiB</used>
|
||||
<free>22601 MiB</free>
|
||||
</fb_memory_usage>
|
||||
<bar1_memory_usage>
|
||||
<total>32768 MiB</total>
|
||||
<used>1 MiB</used>
|
||||
<free>32767 MiB</free>
|
||||
</bar1_memory_usage>
|
||||
<cc_protected_memory_usage>
|
||||
<total>0 MiB</total>
|
||||
<used>0 MiB</used>
|
||||
<free>0 MiB</free>
|
||||
</cc_protected_memory_usage>
|
||||
<compute_mode>Default</compute_mode>
|
||||
<utilization>
|
||||
<gpu_util>3 %</gpu_util>
|
||||
<memory_util>0 %</memory_util>
|
||||
<encoder_util>0 %</encoder_util>
|
||||
<decoder_util>0 %</decoder_util>
|
||||
<jpeg_util>0 %</jpeg_util>
|
||||
<ofa_util>0 %</ofa_util>
|
||||
</utilization>
|
||||
<encoder_stats>
|
||||
<session_count>0</session_count>
|
||||
<average_fps>0</average_fps>
|
||||
<average_latency>0</average_latency>
|
||||
</encoder_stats>
|
||||
<fbc_stats>
|
||||
<session_count>0</session_count>
|
||||
<average_fps>0</average_fps>
|
||||
<average_latency>0</average_latency>
|
||||
</fbc_stats>
|
||||
<ecc_mode>
|
||||
<current_ecc>Enabled</current_ecc>
|
||||
<pending_ecc>Enabled</pending_ecc>
|
||||
</ecc_mode>
|
||||
<ecc_errors>
|
||||
<volatile>
|
||||
<sram_correctable>0</sram_correctable>
|
||||
<sram_uncorrectable_parity>0</sram_uncorrectable_parity>
|
||||
<sram_uncorrectable_secded>0</sram_uncorrectable_secded>
|
||||
<dram_correctable>0</dram_correctable>
|
||||
<dram_uncorrectable>0</dram_uncorrectable>
|
||||
</volatile>
|
||||
<aggregate>
|
||||
<sram_correctable>0</sram_correctable>
|
||||
<sram_uncorrectable_parity>0</sram_uncorrectable_parity>
|
||||
<sram_uncorrectable_secded>0</sram_uncorrectable_secded>
|
||||
<dram_correctable>0</dram_correctable>
|
||||
<dram_uncorrectable>0</dram_uncorrectable>
|
||||
<sram_threshold_exceeded>No</sram_threshold_exceeded>
|
||||
</aggregate>
|
||||
<aggregate_uncorrectable_sram_sources>
|
||||
<sram_l2>0</sram_l2>
|
||||
<sram_sm>0</sram_sm>
|
||||
<sram_microcontroller>0</sram_microcontroller>
|
||||
<sram_pcie>0</sram_pcie>
|
||||
<sram_other>0</sram_other>
|
||||
</aggregate_uncorrectable_sram_sources>
|
||||
</ecc_errors>
|
||||
<retired_pages>
|
||||
<multiple_single_bit_retirement>
|
||||
<retired_count>N/A</retired_count>
|
||||
<retired_pagelist>N/A</retired_pagelist>
|
||||
</multiple_single_bit_retirement>
|
||||
<double_bit_retirement>
|
||||
<retired_count>N/A</retired_count>
|
||||
<retired_pagelist>N/A</retired_pagelist>
|
||||
</double_bit_retirement>
|
||||
<pending_blacklist>N/A</pending_blacklist>
|
||||
<pending_retirement>N/A</pending_retirement>
|
||||
</retired_pages>
|
||||
<remapped_rows>
|
||||
<remapped_row_corr>0</remapped_row_corr>
|
||||
<remapped_row_unc>0</remapped_row_unc>
|
||||
<remapped_row_pending>No</remapped_row_pending>
|
||||
<remapped_row_failure>No</remapped_row_failure>
|
||||
<row_remapper_histogram>
|
||||
<row_remapper_histogram_max>96 bank(s)</row_remapper_histogram_max>
|
||||
<row_remapper_histogram_high>0 bank(s)</row_remapper_histogram_high>
|
||||
<row_remapper_histogram_partial>0 bank(s)</row_remapper_histogram_partial>
|
||||
<row_remapper_histogram_low>0 bank(s)</row_remapper_histogram_low>
|
||||
<row_remapper_histogram_none>0 bank(s)</row_remapper_histogram_none>
|
||||
</row_remapper_histogram>
|
||||
</remapped_rows>
|
||||
<temperature>
|
||||
<gpu_temp>40 C</gpu_temp>
|
||||
<gpu_temp_tlimit>43 C</gpu_temp_tlimit>
|
||||
<gpu_temp_max_tlimit_threshold>-5 C</gpu_temp_max_tlimit_threshold>
|
||||
<gpu_temp_slow_tlimit_threshold>-2 C</gpu_temp_slow_tlimit_threshold>
|
||||
<gpu_temp_max_gpu_tlimit_threshold>0 C</gpu_temp_max_gpu_tlimit_threshold>
|
||||
<gpu_target_temperature>N/A</gpu_target_temperature>
|
||||
<memory_temp>N/A</memory_temp>
|
||||
<gpu_temp_max_mem_tlimit_threshold>N/A</gpu_temp_max_mem_tlimit_threshold>
|
||||
</temperature>
|
||||
<supported_gpu_target_temp>
|
||||
<gpu_target_temp_min>N/A</gpu_target_temp_min>
|
||||
<gpu_target_temp_max>N/A</gpu_target_temp_max>
|
||||
</supported_gpu_target_temp>
|
||||
<gpu_power_readings>
|
||||
<power_state>P0</power_state>
|
||||
<power_draw>29.54 W</power_draw>
|
||||
<current_power_limit>72.00 W</current_power_limit>
|
||||
<requested_power_limit>72.00 W</requested_power_limit>
|
||||
<default_power_limit>72.00 W</default_power_limit>
|
||||
<min_power_limit>40.00 W</min_power_limit>
|
||||
<max_power_limit>72.00 W</max_power_limit>
|
||||
</gpu_power_readings>
|
||||
<gpu_memory_power_readings>
|
||||
<power_draw>N/A</power_draw>
|
||||
</gpu_memory_power_readings>
|
||||
<module_power_readings>
|
||||
<power_state>P0</power_state>
|
||||
<power_draw>N/A</power_draw>
|
||||
<current_power_limit>N/A</current_power_limit>
|
||||
<requested_power_limit>N/A</requested_power_limit>
|
||||
<default_power_limit>N/A</default_power_limit>
|
||||
<min_power_limit>N/A</min_power_limit>
|
||||
<max_power_limit>N/A</max_power_limit>
|
||||
</module_power_readings>
|
||||
<clocks>
|
||||
<graphics_clock>2040 MHz</graphics_clock>
|
||||
<sm_clock>2040 MHz</sm_clock>
|
||||
<mem_clock>6250 MHz</mem_clock>
|
||||
<video_clock>1770 MHz</video_clock>
|
||||
</clocks>
|
||||
<applications_clocks>
|
||||
<graphics_clock>2040 MHz</graphics_clock>
|
||||
<mem_clock>6251 MHz</mem_clock>
|
||||
</applications_clocks>
|
||||
<default_applications_clocks>
|
||||
<graphics_clock>2040 MHz</graphics_clock>
|
||||
<mem_clock>6251 MHz</mem_clock>
|
||||
</default_applications_clocks>
|
||||
<deferred_clocks>
|
||||
<mem_clock>N/A</mem_clock>
|
||||
</deferred_clocks>
|
||||
<max_clocks>
|
||||
<graphics_clock>2040 MHz</graphics_clock>
|
||||
<sm_clock>2040 MHz</sm_clock>
|
||||
<mem_clock>6251 MHz</mem_clock>
|
||||
<video_clock>1770 MHz</video_clock>
|
||||
</max_clocks>
|
||||
<max_customer_boost_clocks>
|
||||
<graphics_clock>2040 MHz</graphics_clock>
|
||||
</max_customer_boost_clocks>
|
||||
<clock_policy>
|
||||
<auto_boost>N/A</auto_boost>
|
||||
<auto_boost_default>N/A</auto_boost_default>
|
||||
</clock_policy>
|
||||
<voltage>
|
||||
<graphics_volt>910.000 mV</graphics_volt>
|
||||
</voltage>
|
||||
<fabric>
|
||||
<state>N/A</state>
|
||||
<status>N/A</status>
|
||||
<cliqueId>N/A</cliqueId>
|
||||
<clusterUuid>N/A</clusterUuid>
|
||||
<health>
|
||||
<bandwidth>N/A</bandwidth>
|
||||
</health>
|
||||
</fabric>
|
||||
<supported_clocks>
|
||||
<supported_mem_clock>
|
||||
<value>6251 MHz</value>
|
||||
<supported_graphics_clock>2040 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>2025 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>2010 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1995 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1980 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1965 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1950 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1935 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1920 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1905 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1890 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1875 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1860 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1845 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1830 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1815 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1800 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1785 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1770 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1755 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1740 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1725 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1710 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1695 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1680 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1665 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1650 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1635 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1620 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1605 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1590 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1575 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1560 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1545 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1530 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1515 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1500 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1485 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1470 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1455 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1440 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1425 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1410 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1395 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1380 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1365 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1350 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1335 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1320 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1305 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1290 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1275 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1260 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1245 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1230 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1215 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1200 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1185 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1170 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1155 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1140 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1125 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1110 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1095 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1080 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1065 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1050 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1035 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1020 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>1005 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>990 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>975 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>960 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>945 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>930 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>915 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>900 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>885 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>870 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>855 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>840 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>825 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>810 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>795 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>780 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>765 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>750 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>735 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>720 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>705 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>690 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>675 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>660 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>645 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>630 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>615 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>600 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>585 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>570 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>555 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>540 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>525 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>510 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>495 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>480 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>465 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>450 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>435 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>420 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>405 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>390 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>375 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>360 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>345 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>330 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>315 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>300 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>285 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>270 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>255 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>240 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>225 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>210 MHz</supported_graphics_clock>
|
||||
</supported_mem_clock>
|
||||
<supported_mem_clock>
|
||||
<value>405 MHz</value>
|
||||
<supported_graphics_clock>645 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>630 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>615 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>600 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>585 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>570 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>555 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>540 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>525 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>510 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>495 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>480 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>465 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>450 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>435 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>420 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>405 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>390 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>375 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>360 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>345 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>330 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>315 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>300 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>285 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>270 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>255 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>240 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>225 MHz</supported_graphics_clock>
|
||||
<supported_graphics_clock>210 MHz</supported_graphics_clock>
|
||||
</supported_mem_clock>
|
||||
</supported_clocks>
|
||||
<processes>
|
||||
<process_info>
|
||||
<pid>16870</pid>
|
||||
<type>C</type>
|
||||
<process_name>ffmpeg</process_name>
|
||||
<used_memory>549 MiB</used_memory>
|
||||
</process_info>
|
||||
</processes>
|
||||
<accounted_processes>
|
||||
</accounted_processes>
|
||||
<capabilities>
|
||||
<egm>disabled</egm>
|
||||
</capabilities>
|
||||
</gpu>
|
||||
|
||||
</nvidia_smi_log>`
|
||||
|
||||
func main() {
|
||||
if len(os.Args) == 1 {
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
if os.Args[1] == "pmon" {
|
||||
go func(ctx context.Context) {
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
fmt.Fprintf(os.Stdout, "%s\n", pmondata)
|
||||
}
|
||||
}
|
||||
}(ctx)
|
||||
} else {
|
||||
go func(ctx context.Context) {
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
fmt.Fprintf(os.Stdout, "%s\n", querydata)
|
||||
}
|
||||
}
|
||||
}(ctx)
|
||||
}
|
||||
|
||||
// Wait for interrupt signal to gracefully shutdown the app
|
||||
quit := make(chan os.Signal, 1)
|
||||
signal.Notify(quit, os.Interrupt)
|
||||
<-quit
|
||||
|
||||
cancel()
|
||||
|
||||
os.Exit(0)
|
||||
}
|
||||
@ -33,7 +33,7 @@ func NewCPUCollector(rsc resources.Resources) metric.Collector {
|
||||
c.limitDescr = metric.NewDesc("cpu_limit", "Percentage of CPU to be consumed", nil)
|
||||
c.throttleDescr = metric.NewDesc("cpu_throttling", "Whether the CPU is currently throttled", nil)
|
||||
|
||||
if ncpu, err := psutil.CPUCounts(true); err == nil {
|
||||
if ncpu, err := psutil.CPUCounts(); err == nil {
|
||||
c.ncpu = ncpu
|
||||
}
|
||||
|
||||
@ -63,11 +63,11 @@ func (c *cpuCollector) Collect() metric.Metrics {
|
||||
|
||||
metrics.Add(metric.NewValue(c.ncpuDescr, c.ncpu))
|
||||
|
||||
limit, _ := c.resources.Limits()
|
||||
limit, _, _, _ := c.resources.Limits()
|
||||
|
||||
metrics.Add(metric.NewValue(c.limitDescr, limit))
|
||||
|
||||
cpu, _ := c.resources.ShouldLimit()
|
||||
cpu, _, _ := c.resources.ShouldLimit()
|
||||
throttling := .0
|
||||
if cpu {
|
||||
throttling = 1
|
||||
|
||||
@ -37,7 +37,7 @@ func (c *diskCollector) Describe() []*metric.Description {
|
||||
func (c *diskCollector) Collect() metric.Metrics {
|
||||
metrics := metric.NewMetrics()
|
||||
|
||||
stat, err := psutil.DiskUsage(c.path)
|
||||
stat, err := psutil.Disk(c.path)
|
||||
if err != nil {
|
||||
return metrics
|
||||
}
|
||||
|
||||
@ -44,11 +44,11 @@ func (c *memCollector) Describe() []*metric.Description {
|
||||
func (c *memCollector) Collect() metric.Metrics {
|
||||
metrics := metric.NewMetrics()
|
||||
|
||||
_, limit := c.resources.Limits()
|
||||
_, limit, _, _ := c.resources.Limits()
|
||||
|
||||
metrics.Add(metric.NewValue(c.limitDescr, float64(limit)))
|
||||
|
||||
_, memory := c.resources.ShouldLimit()
|
||||
_, memory, _ := c.resources.ShouldLimit()
|
||||
throttling := .0
|
||||
if memory {
|
||||
throttling = 1
|
||||
@ -56,7 +56,7 @@ func (c *memCollector) Collect() metric.Metrics {
|
||||
|
||||
metrics.Add(metric.NewValue(c.throttleDescr, throttling))
|
||||
|
||||
stat, err := psutil.VirtualMemory()
|
||||
stat, err := psutil.Memory()
|
||||
if err != nil {
|
||||
return metrics
|
||||
}
|
||||
|
||||
@ -33,7 +33,7 @@ func (c *netCollector) Describe() []*metric.Description {
|
||||
func (c *netCollector) Collect() metric.Metrics {
|
||||
metrics := metric.NewMetrics()
|
||||
|
||||
devs, err := psutil.NetIOCounters(true)
|
||||
devs, err := psutil.Network()
|
||||
if err != nil {
|
||||
return metrics
|
||||
}
|
||||
|
||||
@ -25,9 +25,36 @@ type Usage struct {
|
||||
Max uint64 // bytes
|
||||
Limit uint64 // bytes
|
||||
}
|
||||
GPU struct {
|
||||
Index int // number of the GPU
|
||||
Memory struct {
|
||||
Current uint64 // bytes
|
||||
Average float64 // bytes
|
||||
Max uint64 // bytes
|
||||
Limit uint64 // bytes
|
||||
}
|
||||
Usage struct {
|
||||
Current float64 // percent 0-100
|
||||
Average float64 // percent 0-100
|
||||
Max float64 // percent 0-100
|
||||
Limit float64 // percent 0-100
|
||||
}
|
||||
Encoder struct {
|
||||
Current float64 // percent 0-100
|
||||
Average float64 // percent 0-100
|
||||
Max float64 // percent 0-100
|
||||
Limit float64 // percent 0-100
|
||||
}
|
||||
Decoder struct {
|
||||
Current float64 // percent 0-100
|
||||
Average float64 // percent 0-100
|
||||
Max float64 // percent 0-100
|
||||
Limit float64 // percent 0-100
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type LimitFunc func(cpu float64, memory uint64)
|
||||
type LimitFunc func(cpu float64, memory uint64, gpuusage, gpuencoder, gpudecoder float64, gpumemory uint64)
|
||||
|
||||
type LimitMode int
|
||||
|
||||
@ -44,18 +71,22 @@ func (m LimitMode) String() string {
|
||||
}
|
||||
|
||||
const (
|
||||
LimitModeHard LimitMode = 0 // Killing the process if either CPU or memory is above the limit for a certain time
|
||||
LimitModeSoft LimitMode = 1 // Throttling the CPU if activated, killing the process if memory is above the limit for a certain time
|
||||
LimitModeHard LimitMode = 0 // Killing the process if either resource is above the limit for a certain time.
|
||||
LimitModeSoft LimitMode = 1 // If activated, will throttle the CPU, otherwise killing the process if resources are above the limit.
|
||||
)
|
||||
|
||||
type LimiterConfig struct {
|
||||
CPU float64 // Max. CPU usage in percent 0-100 in hard mode, 0-100*ncpu in softmode
|
||||
Memory uint64 // Max. memory usage in bytes
|
||||
WaitFor time.Duration // Duration for one of the limits has to be above the limit until OnLimit gets triggered
|
||||
OnLimit LimitFunc // Function to be triggered if limits are exceeded
|
||||
Mode LimitMode // How to limit CPU usage
|
||||
PSUtil psutil.Util
|
||||
Logger log.Logger
|
||||
CPU float64 // Max. CPU usage in percent 0-100 in hard mode, 0-100*ncpu in soft mode.
|
||||
Memory uint64 // Max. memory usage in bytes.
|
||||
GPUUsage float64 // Max. GPU general usage in percent 0-100.
|
||||
GPUEncoder float64 // Max. GPU encoder usage in percent 0-100.
|
||||
GPUDecoder float64 // Max. GPU decoder usage in percent 0-100.
|
||||
GPUMemory uint64 // Max. GPU memory usage in bytes.
|
||||
WaitFor time.Duration // Duration for one of the limits has to be above the limit until OnLimit gets triggered.
|
||||
OnLimit LimitFunc // Function to be triggered if limits are exceeded.
|
||||
Mode LimitMode // How to limit CPU usage.
|
||||
PSUtil psutil.Util
|
||||
Logger log.Logger
|
||||
}
|
||||
|
||||
type Limiter interface {
|
||||
@ -65,26 +96,135 @@ type Limiter interface {
|
||||
// Stop stops the limiter. The limiter can be reused by calling Start() again
|
||||
Stop()
|
||||
|
||||
// Current returns the current CPU and memory values
|
||||
// Deprecated: use Usage()
|
||||
Current() (cpu float64, memory uint64)
|
||||
|
||||
// Limits returns the defined CPU and memory limits. Values <= 0 means no limit
|
||||
// Deprecated: use Usage()
|
||||
Limits() (cpu float64, memory uint64)
|
||||
|
||||
// Usage returns the current state of the limiter, such as current, average, max, and
|
||||
// limit values for CPU and memory.
|
||||
Usage() Usage
|
||||
|
||||
// Limit enables or disables the throttling of the CPU or killing because of too much
|
||||
// memory consumption.
|
||||
Limit(cpu, memory bool) error
|
||||
// memory or GPU consumption.
|
||||
Limit(cpu, memory, gpu bool) error
|
||||
|
||||
// Mode returns in which mode the limiter is running in.
|
||||
Mode() LimitMode
|
||||
}
|
||||
|
||||
// numbers constrains the value types a metric can track: uint64 and float64,
// including named types derived from them.
type numbers interface {
	~uint64 | ~float64
}

// metric tracks a single observed value together with its configured limit
// and a few running statistics. It does no locking of its own; callers have
// to serialize access.
type metric[T numbers] struct {
	limit       T         // Limit
	current     T         // Current load value
	last        T         // Last load value
	max         T         // Max. load value
	top         T         // Decaying max. load value
	avg         float64   // Average load value
	avgCounter  uint64    // Counter for average calculation
	limitSince  time.Time // Time when the limit has been reached (hard limiter mode)
	limitEnable bool      // Whether limiting is enabled
}

// Reset clears all observed values and statistics and disables limiting.
// The configured limit itself is kept.
func (x *metric[T]) Reset() {
	var zero T

	x.current = zero
	x.last = zero
	x.max = zero
	x.top = zero
	x.avg = 0
	x.avgCounter = 0
	x.limitEnable = false
}

// Current returns the most recently recorded value.
func (x *metric[T]) Current() T {
	return x.current
}

// Top returns the decaying maximum of the recorded values.
func (x *metric[T]) Top() T {
	return x.top
}

// Max returns the largest value recorded since the last Reset.
func (x *metric[T]) Max() T {
	return x.max
}

// Avg returns the arithmetic mean of all values recorded since the last Reset.
func (x *metric[T]) Avg() float64 {
	return x.avg
}

// SetLimit sets the limit for this metric.
func (x *metric[T]) SetLimit(limit T) {
	x.limit = limit
}

// Limit returns the configured limit.
func (x *metric[T]) Limit() T {
	return x.limit
}

// DoLimit switches limiting on or off. It returns the resulting state and
// whether this call changed it.
func (x *metric[T]) DoLimit(limit bool) (enabled, changed bool) {
	changed = x.limitEnable != limit
	x.limitEnable = limit

	return x.limitEnable, changed
}

// IsLimitEnabled reports whether limiting is currently switched on.
func (x *metric[T]) IsLimitEnabled() bool {
	return x.limitEnable
}

// Update records a new sample: the previous current value becomes the last
// value, and max, decaying max and average are brought up to date.
func (x *metric[T]) Update(value T) {
	x.last, x.current = x.current, value

	if x.current > x.max {
		x.max = x.current
	}

	// Let the decaying maximum shrink by 5% per sample, but never below the
	// current sample. Decaying unconditionally would report a top value
	// lower than the current value under steady load.
	decayed := T(float64(x.top) * 0.95)
	if x.current > decayed {
		x.top = x.current
	} else {
		x.top = decayed
	}

	x.avgCounter++

	// Incremental mean over all samples since the last Reset.
	x.avg = ((x.avg * float64(x.avgCounter-1)) + float64(x.current)) / float64(x.avgCounter)
}
|
||||
|
||||
func (x *metric[T]) IsExceeded(waitFor time.Duration, mode LimitMode) bool {
|
||||
if x.limit <= 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
if mode == LimitModeSoft {
|
||||
// Check if we actually should limit.
|
||||
if !x.limitEnable {
|
||||
return false
|
||||
}
|
||||
|
||||
// If we are currently above the limit, the limit is exceeded.
|
||||
if x.current > x.limit {
|
||||
return true
|
||||
}
|
||||
} else {
|
||||
if x.current > x.limit {
|
||||
// Current value is higher than the limit.
|
||||
if x.last <= x.limit {
|
||||
// If the previous value is below the limit, then we reached the limit as of now.
|
||||
x.limitSince = time.Now()
|
||||
}
|
||||
|
||||
if time.Since(x.limitSince) >= waitFor {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
type limiter struct {
|
||||
psutil psutil.Util
|
||||
|
||||
@ -98,40 +238,27 @@ type limiter struct {
|
||||
lastUsage Usage
|
||||
lastUsageLock sync.RWMutex
|
||||
|
||||
cpu float64 // CPU limit
|
||||
cpuCurrent float64 // Current CPU load of this process
|
||||
cpuLast float64 // Last CPU load of this process
|
||||
cpuMax float64 // Max. CPU load of this process
|
||||
cpuTop float64 // Decaying max. CPU load of this process
|
||||
cpuAvg float64 // Average CPU load of this process
|
||||
cpuAvgCounter uint64 // Counter for average calculation
|
||||
cpuLimitSince time.Time // Time when the CPU limit has been reached (hard limiter mode)
|
||||
cpuLimitEnable bool // Whether CPU throttling is enabled (soft limiter mode)
|
||||
cpuThrottling bool // Whether CPU throttling is currently active (soft limiter mode)
|
||||
cpu metric[float64] // CPU limit
|
||||
cpuThrottling bool // Whether CPU throttling is currently active (soft limiter mode)
|
||||
|
||||
memory uint64 // Memory limit (bytes)
|
||||
memoryCurrent uint64 // Current memory usage
|
||||
memoryLast uint64 // Last memory usage
|
||||
memoryMax uint64 // Max. memory usage
|
||||
memoryTop uint64 // Decaying max. memory usage
|
||||
memoryAvg float64 // Average memory usage
|
||||
memoryAvgCounter uint64 // Counter for average memory calculation
|
||||
memoryLimitSince time.Time // Time when the memory limit has been reached (hard limiter mode)
|
||||
memoryLimitEnable bool // Whether memory limiting is enabled (soft limiter mode)
|
||||
memory metric[uint64] // Memory limit (bytes)
|
||||
|
||||
gpu struct {
|
||||
memory metric[uint64] // GPU memory limit (0-100 percent)
|
||||
usage metric[float64] // GPU load limit (0-100 percent)
|
||||
encoder metric[float64] // GPU encoder limit (0-100 percent)
|
||||
decoder metric[float64] // GPU decoder limit (0-100 percent)
|
||||
}
|
||||
|
||||
waitFor time.Duration
|
||||
mode LimitMode
|
||||
|
||||
cancelLimit context.CancelFunc
|
||||
|
||||
logger log.Logger
|
||||
}
|
||||
|
||||
// NewLimiter returns a new Limiter
|
||||
func NewLimiter(config LimiterConfig) Limiter {
|
||||
l := &limiter{
|
||||
cpu: config.CPU,
|
||||
memory: config.Memory,
|
||||
waitFor: config.WaitFor,
|
||||
onLimit: config.OnLimit,
|
||||
mode: config.Mode,
|
||||
@ -139,6 +266,13 @@ func NewLimiter(config LimiterConfig) Limiter {
|
||||
logger: config.Logger,
|
||||
}
|
||||
|
||||
l.cpu.SetLimit(config.CPU / 100)
|
||||
l.memory.SetLimit(config.Memory)
|
||||
l.gpu.memory.SetLimit(config.GPUMemory)
|
||||
l.gpu.usage.SetLimit(config.GPUUsage / 100)
|
||||
l.gpu.encoder.SetLimit(config.GPUEncoder / 100)
|
||||
l.gpu.decoder.SetLimit(config.GPUDecoder / 100)
|
||||
|
||||
if l.logger == nil {
|
||||
l.logger = log.New("")
|
||||
}
|
||||
@ -147,57 +281,56 @@ func NewLimiter(config LimiterConfig) Limiter {
|
||||
l.psutil = psutil.DefaultUtil
|
||||
}
|
||||
|
||||
if ncpu, err := l.psutil.CPUCounts(true); err != nil {
|
||||
if ncpu, err := l.psutil.CPUCounts(); err != nil {
|
||||
l.ncpu = 1
|
||||
} else {
|
||||
l.ncpu = ncpu
|
||||
}
|
||||
|
||||
l.lastUsage.CPU.NCPU = l.ncpu
|
||||
l.lastUsage.CPU.Limit = l.cpu * l.ncpu
|
||||
l.lastUsage.Memory.Limit = l.memory
|
||||
l.lastUsage.CPU.Limit = l.cpu.Limit() * 100 * l.ncpu
|
||||
l.lastUsage.Memory.Limit = l.memory.Limit()
|
||||
l.lastUsage.GPU.Memory.Limit = l.gpu.memory.Limit()
|
||||
l.lastUsage.GPU.Usage.Limit = l.gpu.usage.Limit() * 100
|
||||
l.lastUsage.GPU.Encoder.Limit = l.gpu.encoder.Limit() * 100
|
||||
l.lastUsage.GPU.Decoder.Limit = l.gpu.decoder.Limit() * 100
|
||||
|
||||
l.ncpuFactor = 1
|
||||
|
||||
mode := "hard"
|
||||
if l.mode == LimitModeSoft {
|
||||
mode = "soft"
|
||||
l.cpu /= l.ncpu
|
||||
l.cpu.SetLimit(l.cpu.Limit() / l.ncpu)
|
||||
l.ncpuFactor = l.ncpu
|
||||
}
|
||||
|
||||
l.cpu /= 100
|
||||
|
||||
if l.onLimit == nil {
|
||||
l.onLimit = func(float64, uint64) {}
|
||||
l.onLimit = func(float64, uint64, float64, float64, float64, uint64) {}
|
||||
}
|
||||
|
||||
l.logger = l.logger.WithFields(log.Fields{
|
||||
"cpu": l.cpu * l.ncpuFactor,
|
||||
"memory": l.memory,
|
||||
"mode": mode,
|
||||
"cpu": l.cpu.Limit() * l.ncpuFactor,
|
||||
"memory": l.memory.Limit(),
|
||||
"gpumemory": l.gpu.memory.Limit(),
|
||||
"gpuusage": l.gpu.usage.Limit(),
|
||||
"gpuencoder": l.gpu.encoder.Limit(),
|
||||
"gpudecoder": l.gpu.decoder.Limit(),
|
||||
"mode": mode,
|
||||
})
|
||||
|
||||
return l
|
||||
}
|
||||
|
||||
func (l *limiter) reset() {
|
||||
l.cpuCurrent = 0
|
||||
l.cpuLast = 0
|
||||
l.cpuAvg = 0
|
||||
l.cpuAvgCounter = 0
|
||||
l.cpuMax = 0
|
||||
l.cpuTop = 0
|
||||
l.cpuLimitEnable = false
|
||||
l.cpu.Reset()
|
||||
l.cpuThrottling = false
|
||||
|
||||
l.memoryCurrent = 0
|
||||
l.memoryLast = 0
|
||||
l.memoryAvg = 0
|
||||
l.memoryAvgCounter = 0
|
||||
l.memoryMax = 0
|
||||
l.memoryTop = 0
|
||||
l.memoryLimitEnable = false
|
||||
l.memory.Reset()
|
||||
|
||||
l.gpu.memory.Reset()
|
||||
l.gpu.usage.Reset()
|
||||
l.gpu.encoder.Reset()
|
||||
l.gpu.decoder.Reset()
|
||||
}
|
||||
|
||||
func (l *limiter) Start(process psutil.Process) error {
|
||||
@ -218,10 +351,7 @@ func (l *limiter) Start(process psutil.Process) error {
|
||||
go l.ticker(ctx, time.Second)
|
||||
|
||||
if l.mode == LimitModeSoft {
|
||||
ctx, cancel = context.WithCancel(context.Background())
|
||||
l.cancelLimit = cancel
|
||||
|
||||
go l.limitCPU(ctx, l.cpu, time.Second)
|
||||
go l.limitCPU(ctx, l.cpu.Limit(), time.Second)
|
||||
}
|
||||
|
||||
return nil
|
||||
@ -237,11 +367,6 @@ func (l *limiter) Stop() {
|
||||
|
||||
l.cancel()
|
||||
|
||||
if l.cancelLimit != nil {
|
||||
l.cancelLimit()
|
||||
l.cancelLimit = nil
|
||||
}
|
||||
|
||||
l.proc.Stop()
|
||||
l.proc = nil
|
||||
|
||||
@ -256,13 +381,13 @@ func (l *limiter) ticker(ctx context.Context, interval time.Duration) {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case t := <-ticker.C:
|
||||
l.collect(t)
|
||||
case <-ticker.C:
|
||||
l.collect()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (l *limiter) collect(_ time.Time) {
|
||||
func (l *limiter) collect() {
|
||||
l.lock.Lock()
|
||||
proc := l.proc
|
||||
l.lock.Unlock()
|
||||
@ -271,118 +396,108 @@ func (l *limiter) collect(_ time.Time) {
|
||||
return
|
||||
}
|
||||
|
||||
mstat, merr := proc.VirtualMemory()
|
||||
cpustat, cerr := proc.CPUPercent()
|
||||
mstat, merr := proc.Memory()
|
||||
cpustat, cerr := proc.CPU()
|
||||
gstat, gerr := proc.GPU()
|
||||
gindex := -1
|
||||
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
if merr == nil {
|
||||
l.memoryLast, l.memoryCurrent = l.memoryCurrent, mstat
|
||||
|
||||
if l.memoryCurrent > l.memoryMax {
|
||||
l.memoryMax = l.memoryCurrent
|
||||
}
|
||||
|
||||
if l.memoryCurrent > l.memoryTop {
|
||||
l.memoryTop = l.memoryCurrent
|
||||
} else {
|
||||
l.memoryTop = uint64(float64(l.memoryTop) * 0.95)
|
||||
}
|
||||
|
||||
l.memoryAvgCounter++
|
||||
|
||||
l.memoryAvg = ((l.memoryAvg * float64(l.memoryAvgCounter-1)) + float64(l.memoryCurrent)) / float64(l.memoryAvgCounter)
|
||||
l.memory.Update(mstat)
|
||||
}
|
||||
|
||||
if cerr == nil {
|
||||
l.cpuLast, l.cpuCurrent = l.cpuCurrent, (cpustat.System+cpustat.User+cpustat.Other)/100
|
||||
l.cpu.Update((cpustat.System + cpustat.User + cpustat.Other) / 100)
|
||||
}
|
||||
|
||||
if l.cpuCurrent > l.cpuMax {
|
||||
l.cpuMax = l.cpuCurrent
|
||||
}
|
||||
|
||||
if l.cpuCurrent > l.cpuTop {
|
||||
l.cpuTop = l.cpuCurrent
|
||||
} else {
|
||||
l.cpuTop = l.cpuTop * 0.95
|
||||
}
|
||||
|
||||
l.cpuAvgCounter++
|
||||
|
||||
l.cpuAvg = ((l.cpuAvg * float64(l.cpuAvgCounter-1)) + l.cpuCurrent) / float64(l.cpuAvgCounter)
|
||||
if gerr == nil {
|
||||
l.gpu.memory.Update(gstat.MemoryUsed)
|
||||
l.gpu.usage.Update(gstat.Usage / 100)
|
||||
l.gpu.encoder.Update(gstat.Encoder / 100)
|
||||
l.gpu.decoder.Update(gstat.Decoder / 100)
|
||||
gindex = gstat.Index
|
||||
}
|
||||
|
||||
isLimitExceeded := false
|
||||
|
||||
if l.mode == LimitModeHard {
|
||||
if l.cpu > 0 {
|
||||
if l.cpuCurrent > l.cpu {
|
||||
// Current value is higher than the limit
|
||||
if l.cpuLast <= l.cpu {
|
||||
// If the previous value is below the limit, then we reached the
|
||||
// limit as of now
|
||||
l.cpuLimitSince = time.Now()
|
||||
}
|
||||
|
||||
if time.Since(l.cpuLimitSince) >= l.waitFor {
|
||||
l.logger.Warn().Log("CPU limit exceeded")
|
||||
isLimitExceeded = true
|
||||
}
|
||||
}
|
||||
if l.cpu.IsExceeded(l.waitFor, l.mode) {
|
||||
l.logger.Warn().Log("CPU limit exceeded")
|
||||
isLimitExceeded = true
|
||||
}
|
||||
}
|
||||
|
||||
if l.memory > 0 {
|
||||
if l.memoryCurrent > l.memory {
|
||||
// Current value is higher than the limit
|
||||
if l.memoryLast <= l.memory {
|
||||
// If the previous value is below the limit, then we reached the
|
||||
// limit as of now
|
||||
l.memoryLimitSince = time.Now()
|
||||
}
|
||||
if l.memory.IsExceeded(l.waitFor, l.mode) {
|
||||
l.logger.Warn().Log("Memory limit exceeded")
|
||||
isLimitExceeded = true
|
||||
}
|
||||
|
||||
if time.Since(l.memoryLimitSince) >= l.waitFor {
|
||||
l.logger.Warn().Log("Memory limit exceeded")
|
||||
isLimitExceeded = true
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if l.memory > 0 && l.memoryLimitEnable {
|
||||
if l.memoryCurrent > l.memory {
|
||||
// Current value is higher than the limit
|
||||
l.logger.Warn().Log("Memory limit exceeded")
|
||||
isLimitExceeded = true
|
||||
}
|
||||
}
|
||||
if l.gpu.memory.IsExceeded(l.waitFor, l.mode) {
|
||||
l.logger.Warn().Log("GPU memory limit exceeded")
|
||||
isLimitExceeded = true
|
||||
}
|
||||
|
||||
if l.gpu.usage.IsExceeded(l.waitFor, l.mode) {
|
||||
l.logger.Warn().Log("GPU usage limit exceeded")
|
||||
isLimitExceeded = true
|
||||
}
|
||||
|
||||
if l.gpu.encoder.IsExceeded(l.waitFor, l.mode) {
|
||||
l.logger.Warn().Log("GPU encoder limit exceeded")
|
||||
isLimitExceeded = true
|
||||
}
|
||||
|
||||
if l.gpu.decoder.IsExceeded(l.waitFor, l.mode) {
|
||||
l.logger.Warn().Log("GPU decoder limit exceeded")
|
||||
isLimitExceeded = true
|
||||
}
|
||||
|
||||
l.logger.Debug().WithFields(log.Fields{
|
||||
"cur_cpu": l.cpuCurrent * l.ncpuFactor,
|
||||
"top_cpu": l.cpuTop * l.ncpuFactor,
|
||||
"cur_mem": l.memoryCurrent,
|
||||
"top_mem": l.memoryTop,
|
||||
"exceeded": isLimitExceeded,
|
||||
"cur_cpu": l.cpu.Current() * l.ncpuFactor,
|
||||
"top_cpu": l.cpu.Top() * l.ncpuFactor,
|
||||
"cur_mem": l.memory.Current(),
|
||||
"top_mem": l.memory.Top(),
|
||||
"cur_gpu_mem": l.gpu.memory.Current(),
|
||||
"top_gpu_mem": l.gpu.memory.Top(),
|
||||
"exceeded": isLimitExceeded,
|
||||
}).Log("Observation")
|
||||
|
||||
if isLimitExceeded {
|
||||
go l.onLimit(l.cpuCurrent*l.ncpuFactor*100, l.memoryCurrent)
|
||||
go l.onLimit(l.cpu.Current()*l.ncpuFactor*100, l.memory.Current(), l.gpu.usage.Current(), l.gpu.encoder.Current(), l.gpu.decoder.Current(), l.gpu.memory.Current())
|
||||
}
|
||||
|
||||
l.lastUsageLock.Lock()
|
||||
l.lastUsage.CPU.Current = l.cpuCurrent * l.ncpu * 100
|
||||
l.lastUsage.CPU.Average = l.cpuAvg * l.ncpu * 100
|
||||
l.lastUsage.CPU.Max = l.cpuMax * l.ncpu * 100
|
||||
l.lastUsage.CPU.Current = l.cpu.Current() * l.ncpu * 100
|
||||
l.lastUsage.CPU.Average = l.cpu.Avg() * l.ncpu * 100
|
||||
l.lastUsage.CPU.Max = l.cpu.Max() * l.ncpu * 100
|
||||
l.lastUsage.CPU.IsThrottling = l.cpuThrottling
|
||||
|
||||
l.lastUsage.Memory.Current = l.memoryCurrent
|
||||
l.lastUsage.Memory.Average = l.memoryAvg
|
||||
l.lastUsage.Memory.Max = l.memoryMax
|
||||
l.lastUsageLock.Unlock()
|
||||
l.lastUsage.Memory.Current = l.memory.Current()
|
||||
l.lastUsage.Memory.Average = l.memory.Avg()
|
||||
l.lastUsage.Memory.Max = l.memory.Max()
|
||||
|
||||
l.lock.Unlock()
|
||||
l.lastUsage.GPU.Index = gindex
|
||||
l.lastUsage.GPU.Memory.Current = l.gpu.memory.Current() * 100
|
||||
l.lastUsage.GPU.Memory.Average = l.gpu.memory.Avg() * 100
|
||||
l.lastUsage.GPU.Memory.Max = l.gpu.memory.Max() * 100
|
||||
|
||||
l.lastUsage.GPU.Usage.Current = l.gpu.usage.Current() * 100
|
||||
l.lastUsage.GPU.Usage.Average = l.gpu.usage.Avg() * 100
|
||||
l.lastUsage.GPU.Usage.Max = l.gpu.usage.Max() * 100
|
||||
|
||||
l.lastUsage.GPU.Encoder.Current = l.gpu.encoder.Current() * 100
|
||||
l.lastUsage.GPU.Encoder.Average = l.gpu.encoder.Avg() * 100
|
||||
l.lastUsage.GPU.Encoder.Max = l.gpu.encoder.Max() * 100
|
||||
|
||||
l.lastUsage.GPU.Decoder.Current = l.gpu.decoder.Current() * 100
|
||||
l.lastUsage.GPU.Decoder.Average = l.gpu.decoder.Avg() * 100
|
||||
l.lastUsage.GPU.Decoder.Max = l.gpu.decoder.Max() * 100
|
||||
l.lastUsageLock.Unlock()
|
||||
}
|
||||
|
||||
func (l *limiter) Limit(cpu, memory bool) error {
|
||||
func (l *limiter) Limit(cpu, memory, gpu bool) error {
|
||||
l.lock.Lock()
|
||||
defer l.lock.Unlock()
|
||||
|
||||
@ -390,35 +505,31 @@ func (l *limiter) Limit(cpu, memory bool) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
if memory {
|
||||
if !l.memoryLimitEnable {
|
||||
l.memoryLimitEnable = true
|
||||
|
||||
l.logger.Debug().Log("Memory limiter enabled")
|
||||
}
|
||||
} else {
|
||||
if l.memoryLimitEnable {
|
||||
l.memoryLimitEnable = false
|
||||
|
||||
l.logger.Debug().Log("Memory limiter disabled")
|
||||
}
|
||||
enabled, changed := l.cpu.DoLimit(cpu)
|
||||
if enabled && changed {
|
||||
l.logger.Debug().Log("CPU limiter enabled")
|
||||
} else if !enabled && changed {
|
||||
l.logger.Debug().Log("CPU limiter disabled")
|
||||
}
|
||||
|
||||
if cpu {
|
||||
if !l.cpuLimitEnable {
|
||||
l.cpuLimitEnable = true
|
||||
|
||||
l.logger.Debug().Log("CPU limiter enabled")
|
||||
}
|
||||
} else {
|
||||
if l.cpuLimitEnable {
|
||||
l.cpuLimitEnable = false
|
||||
|
||||
l.logger.Debug().Log("CPU limiter disabled")
|
||||
}
|
||||
|
||||
enabled, changed = l.memory.DoLimit(memory)
|
||||
if enabled && changed {
|
||||
l.logger.Debug().Log("Memory limiter enabled")
|
||||
} else if !enabled && changed {
|
||||
l.logger.Debug().Log("Memory limiter disabled")
|
||||
}
|
||||
|
||||
enabled, changed = l.gpu.memory.DoLimit(gpu)
|
||||
if enabled && changed {
|
||||
l.logger.Debug().Log("GPU limiter enabled")
|
||||
} else if !enabled && changed {
|
||||
l.logger.Debug().Log("GPU limiter disabled")
|
||||
}
|
||||
|
||||
l.gpu.usage.DoLimit(gpu)
|
||||
l.gpu.encoder.DoLimit(gpu)
|
||||
l.gpu.decoder.DoLimit(gpu)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -453,7 +564,7 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur
|
||||
|
||||
l.lock.Lock()
|
||||
|
||||
if !l.cpuLimitEnable {
|
||||
if !l.cpu.IsLimitEnabled() {
|
||||
if factorTopLimit > 0 {
|
||||
factorTopLimit -= 10
|
||||
} else {
|
||||
@ -469,7 +580,7 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur
|
||||
}
|
||||
} else {
|
||||
factorTopLimit = 100
|
||||
topLimit = l.cpuTop - limit
|
||||
topLimit = l.cpu.Top() - limit
|
||||
l.cpuThrottling = true
|
||||
}
|
||||
|
||||
@ -482,7 +593,7 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur
|
||||
lim += (100 - factorTopLimit) / 100 * topLimit
|
||||
}
|
||||
|
||||
pcpu := l.cpuCurrent
|
||||
pcpu := l.cpu.Current()
|
||||
|
||||
l.lock.Unlock()
|
||||
|
||||
@ -526,16 +637,6 @@ func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Dur
|
||||
}
|
||||
}
|
||||
|
||||
func (l *limiter) Current() (cpu float64, memory uint64) {
|
||||
l.lastUsageLock.RLock()
|
||||
defer l.lastUsageLock.RUnlock()
|
||||
|
||||
cpu = l.lastUsage.CPU.Current / l.ncpu
|
||||
memory = l.lastUsage.Memory.Current
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func (l *limiter) Usage() Usage {
|
||||
l.lastUsageLock.RLock()
|
||||
defer l.lastUsageLock.RUnlock()
|
||||
@ -543,10 +644,6 @@ func (l *limiter) Usage() Usage {
|
||||
return l.lastUsage
|
||||
}
|
||||
|
||||
func (l *limiter) Limits() (cpu float64, memory uint64) {
|
||||
return l.cpu * 100, l.memory
|
||||
}
|
||||
|
||||
func (l *limiter) Mode() LimitMode {
|
||||
return l.mode
|
||||
}
|
||||
|
||||
@ -7,13 +7,13 @@ import (
|
||||
|
||||
"github.com/datarhei/core/v16/psutil"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
type psproc struct{}
|
||||
|
||||
func (p *psproc) CPUPercent() (*psutil.CPUInfoStat, error) {
|
||||
return &psutil.CPUInfoStat{
|
||||
func (p *psproc) CPU() (*psutil.CPUInfo, error) {
|
||||
return &psutil.CPUInfo{
|
||||
System: 50,
|
||||
User: 0,
|
||||
Idle: 0,
|
||||
@ -21,10 +21,22 @@ func (p *psproc) CPUPercent() (*psutil.CPUInfoStat, error) {
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (p *psproc) VirtualMemory() (uint64, error) {
|
||||
func (p *psproc) Memory() (uint64, error) {
|
||||
return 197, nil
|
||||
}
|
||||
|
||||
func (p *psproc) GPU() (*psutil.GPUInfo, error) {
|
||||
return &psutil.GPUInfo{
|
||||
Index: 0,
|
||||
Name: "L4",
|
||||
MemoryTotal: 128,
|
||||
MemoryUsed: 91,
|
||||
Usage: 3,
|
||||
Encoder: 9,
|
||||
Decoder: 5,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (p *psproc) Stop() {}
|
||||
func (p *psproc) Suspend() error { return nil }
|
||||
func (p *psproc) Resume() error { return nil }
|
||||
@ -42,7 +54,7 @@ func TestCPULimit(t *testing.T) {
|
||||
|
||||
l := NewLimiter(LimiterConfig{
|
||||
CPU: 42,
|
||||
OnLimit: func(float64, uint64) {
|
||||
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
|
||||
wg.Done()
|
||||
},
|
||||
})
|
||||
@ -57,7 +69,7 @@ func TestCPULimit(t *testing.T) {
|
||||
lock.Unlock()
|
||||
}()
|
||||
|
||||
assert.Eventually(t, func() bool {
|
||||
require.Eventually(t, func() bool {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
|
||||
@ -79,7 +91,7 @@ func TestCPULimitWaitFor(t *testing.T) {
|
||||
l := NewLimiter(LimiterConfig{
|
||||
CPU: 42,
|
||||
WaitFor: 3 * time.Second,
|
||||
OnLimit: func(float64, uint64) {
|
||||
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
|
||||
wg.Done()
|
||||
},
|
||||
})
|
||||
@ -94,7 +106,7 @@ func TestCPULimitWaitFor(t *testing.T) {
|
||||
lock.Unlock()
|
||||
}()
|
||||
|
||||
assert.Eventually(t, func() bool {
|
||||
require.Eventually(t, func() bool {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
|
||||
@ -115,7 +127,7 @@ func TestMemoryLimit(t *testing.T) {
|
||||
|
||||
l := NewLimiter(LimiterConfig{
|
||||
Memory: 42,
|
||||
OnLimit: func(float64, uint64) {
|
||||
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
|
||||
wg.Done()
|
||||
},
|
||||
})
|
||||
@ -130,7 +142,7 @@ func TestMemoryLimit(t *testing.T) {
|
||||
lock.Unlock()
|
||||
}()
|
||||
|
||||
assert.Eventually(t, func() bool {
|
||||
require.Eventually(t, func() bool {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
|
||||
@ -152,7 +164,7 @@ func TestMemoryLimitWaitFor(t *testing.T) {
|
||||
l := NewLimiter(LimiterConfig{
|
||||
Memory: 42,
|
||||
WaitFor: 3 * time.Second,
|
||||
OnLimit: func(float64, uint64) {
|
||||
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
|
||||
wg.Done()
|
||||
},
|
||||
})
|
||||
@ -167,7 +179,80 @@ func TestMemoryLimitWaitFor(t *testing.T) {
|
||||
lock.Unlock()
|
||||
}()
|
||||
|
||||
assert.Eventually(t, func() bool {
|
||||
require.Eventually(t, func() bool {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
|
||||
return done
|
||||
}, 10*time.Second, 1*time.Second)
|
||||
}
|
||||
|
||||
func TestGPUMemoryLimit(t *testing.T) {
|
||||
lock := sync.Mutex{}
|
||||
|
||||
lock.Lock()
|
||||
done := false
|
||||
lock.Unlock()
|
||||
|
||||
go func() {
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
|
||||
l := NewLimiter(LimiterConfig{
|
||||
GPUMemory: 42,
|
||||
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
|
||||
wg.Done()
|
||||
},
|
||||
})
|
||||
|
||||
l.Start(&psproc{})
|
||||
defer l.Stop()
|
||||
|
||||
wg.Wait()
|
||||
|
||||
lock.Lock()
|
||||
done = true
|
||||
lock.Unlock()
|
||||
}()
|
||||
|
||||
require.Eventually(t, func() bool {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
|
||||
return done
|
||||
}, 2*time.Second, 100*time.Millisecond)
|
||||
}
|
||||
|
||||
func TestGPUMemoryLimitWaitFor(t *testing.T) {
|
||||
lock := sync.Mutex{}
|
||||
|
||||
lock.Lock()
|
||||
done := false
|
||||
lock.Unlock()
|
||||
|
||||
go func() {
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
|
||||
l := NewLimiter(LimiterConfig{
|
||||
GPUMemory: 42,
|
||||
WaitFor: 3 * time.Second,
|
||||
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
|
||||
wg.Done()
|
||||
},
|
||||
})
|
||||
|
||||
l.Start(&psproc{})
|
||||
defer l.Stop()
|
||||
|
||||
wg.Wait()
|
||||
|
||||
lock.Lock()
|
||||
done = true
|
||||
lock.Unlock()
|
||||
}()
|
||||
|
||||
require.Eventually(t, func() bool {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
|
||||
@ -189,7 +274,7 @@ func TestMemoryLimitSoftMode(t *testing.T) {
|
||||
l := NewLimiter(LimiterConfig{
|
||||
Memory: 42,
|
||||
Mode: LimitModeSoft,
|
||||
OnLimit: func(float64, uint64) {
|
||||
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
|
||||
wg.Done()
|
||||
},
|
||||
})
|
||||
@ -197,7 +282,7 @@ func TestMemoryLimitSoftMode(t *testing.T) {
|
||||
l.Start(&psproc{})
|
||||
defer l.Stop()
|
||||
|
||||
l.Limit(false, true)
|
||||
l.Limit(false, true, false)
|
||||
|
||||
wg.Wait()
|
||||
|
||||
@ -206,7 +291,46 @@ func TestMemoryLimitSoftMode(t *testing.T) {
|
||||
lock.Unlock()
|
||||
}()
|
||||
|
||||
assert.Eventually(t, func() bool {
|
||||
require.Eventually(t, func() bool {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
|
||||
return done
|
||||
}, 2*time.Second, 100*time.Millisecond)
|
||||
}
|
||||
|
||||
func TestGPUMemoryLimitSoftMode(t *testing.T) {
|
||||
lock := sync.Mutex{}
|
||||
|
||||
lock.Lock()
|
||||
done := false
|
||||
lock.Unlock()
|
||||
|
||||
go func() {
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
|
||||
l := NewLimiter(LimiterConfig{
|
||||
GPUMemory: 42,
|
||||
Mode: LimitModeSoft,
|
||||
OnLimit: func(float64, uint64, float64, float64, float64, uint64) {
|
||||
wg.Done()
|
||||
},
|
||||
})
|
||||
|
||||
l.Start(&psproc{})
|
||||
defer l.Stop()
|
||||
|
||||
l.Limit(false, false, true)
|
||||
|
||||
wg.Wait()
|
||||
|
||||
lock.Lock()
|
||||
done = true
|
||||
lock.Unlock()
|
||||
}()
|
||||
|
||||
require.Eventually(t, func() bool {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
|
||||
|
||||
@ -46,29 +46,32 @@ type Process interface {
|
||||
// Limit enables or disables CPU and memory limiting. CPU will be throttled
|
||||
// into the configured limit. If memory consumption is above the configured
|
||||
// limit, the process will be killed.
|
||||
Limit(cpu, memory bool) error
|
||||
Limit(cpu, memory, gpu bool) error
|
||||
}
|
||||
|
||||
// Config is the configuration of a process
|
||||
type Config struct {
|
||||
Binary string // Path to the ffmpeg binary.
|
||||
Args []string // List of arguments for the binary.
|
||||
Reconnect bool // Whether to restart the process if it exited.
|
||||
ReconnectDelay time.Duration // Duration to wait before restarting the process.
|
||||
StaleTimeout time.Duration // Kill the process after this duration if it doesn't produce any output.
|
||||
Timeout time.Duration // Kill the process after this duration.
|
||||
LimitCPU float64 // Kill the process if the CPU usage in percent is above this value.
|
||||
LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value.
|
||||
LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration.
|
||||
LimitMode LimitMode // Select limiting mode
|
||||
Scheduler Scheduler // A scheduler.
|
||||
Parser Parser // A parser for the output of the process.
|
||||
OnArgs func(args []string) []string // A callback which is called right before the process will start with the command args.
|
||||
OnBeforeStart func() error // A callback which is called before the process will be started. If error is non-nil, the start will be refused.
|
||||
OnStart func() // A callback which is called after the process started.
|
||||
OnExit func(state string) // A callback which is called after the process exited with the exit state.
|
||||
OnStateChange func(from, to string) // A callback which is called after a state changed.
|
||||
Logger log.Logger
|
||||
Binary string // Path to the ffmpeg binary.
|
||||
Args []string // List of arguments for the binary.
|
||||
Reconnect bool // Whether to restart the process if it exited.
|
||||
ReconnectDelay time.Duration // Duration to wait before restarting the process.
|
||||
StaleTimeout time.Duration // Kill the process after this duration if it doesn't produce any output.
|
||||
Timeout time.Duration // Kill the process after this duration.
|
||||
LimitCPU float64 // Kill the process if the CPU usage in percent is above this value, in percent 0-100 in hard mode, 0-100*ncpu in soft mode.
|
||||
LimitMemory uint64 // Kill the process if the memory consumption in bytes is above this value.
|
||||
LimitGPUUsage float64 // Kill the process if the GPU usage in percent is above this value, in percent 0-100.
|
||||
LimitGPUEncoder float64 // Kill the process if the GPU encoder usage in percent is above this value, in percent 0-100.
|
||||
LimitGPUDecoder float64 // Kill the process if the GPU decoder usage in percent is above this value, in percent 0-100.
|
||||
LimitGPUMemory uint64 // Kill the process if the GPU memory consumption in bytes is above this value.
|
||||
LimitDuration time.Duration // Kill the process if the limits are exceeded for this duration.
|
||||
LimitMode LimitMode // Select limiting mode
|
||||
Scheduler Scheduler // A scheduler.
|
||||
Parser Parser // A parser for the output of the process.
|
||||
OnBeforeStart func(args []string) ([]string, error) // A callback which is called before the process will be started. The string slice is the arguments of the command line. If error is non-nil, the start will be refused.
|
||||
OnStart func() // A callback which is called after the process started.
|
||||
OnExit func(state string) // A callback which is called after the process exited with the exit state.
|
||||
OnStateChange func(from, to string) // A callback which is called after a state changed.
|
||||
Logger log.Logger
|
||||
}
|
||||
|
||||
// Status represents the current status of a process
|
||||
@ -81,20 +84,47 @@ type Status struct {
|
||||
Time time.Time // Time is the time of the last change of the state
|
||||
CommandArgs []string // Currently running command arguments
|
||||
LimitMode string // The limiting mode
|
||||
CPU struct {
|
||||
NCPU float64 // Number of logical CPUs
|
||||
Current float64 // Currently consumed CPU in percent
|
||||
Average float64 // Average consumed CPU in percent
|
||||
Max float64 // Max. consumed CPU in percent
|
||||
Limit float64 // Usage limit in percent
|
||||
IsThrottling bool // Whether the CPU is currently limited
|
||||
} // Used CPU in percent
|
||||
Memory struct {
|
||||
Current uint64 // Currently consumed memory in bytes
|
||||
Average float64 // Average consumed memory in bytes
|
||||
Max uint64 // Max. consumed memory in bytes
|
||||
Limit uint64 // Usage limit in bytes
|
||||
} // Used memory in bytes
|
||||
CPU StatusCPU // CPU consumption in percent
|
||||
Memory StatusMemory // Memory consumption in bytes
|
||||
GPU StatusGPU // GPU consumption
|
||||
}
|
||||
|
||||
// StatusCPU describes the CPU consumption of a process. It is the type of the
// CPU field of Status.
type StatusCPU struct {
|
||||
NCPU float64 // Number of logical CPUs
|
||||
Current float64 // Currently consumed CPU in percent
|
||||
Average float64 // Average consumed CPU in percent
|
||||
Max float64 // Max. consumed CPU in percent
|
||||
Limit float64 // Usage limit in percent
|
||||
IsThrottling bool // Whether the CPU is currently limited
|
||||
}
|
||||
|
||||
// StatusMemory describes the memory consumption of a process. It is the type
// of the Memory field of Status. All values are in bytes.
type StatusMemory struct {
|
||||
Current uint64 // Currently consumed memory in bytes
|
||||
Average uint64 // Average consumed memory in bytes
|
||||
Max uint64 // Max. consumed memory in bytes
|
||||
Limit uint64 // Usage limit in bytes
|
||||
}
|
||||
|
||||
// StatusGPUMemory describes the GPU memory consumption of a process. It is the
// type of the Memory field of StatusGPU. All values are in bytes.
type StatusGPUMemory struct {
|
||||
Current uint64 // Currently consumed memory in bytes
|
||||
Average uint64 // Average consumed memory in bytes
|
||||
Max uint64 // Max. consumed memory in bytes
|
||||
Limit uint64 // Usage limit in bytes
|
||||
}
|
||||
|
||||
// StatusGPUUsage describes a GPU utilization value of a process in percent.
// StatusGPU uses it for the general usage as well as for the encoder and the
// decoder usage.
type StatusGPUUsage struct {
|
||||
Current float64 // Currently consumed GPU usage in percent
|
||||
Average float64 // Average consumed GPU usage in percent
|
||||
Max float64 // Max. consumed GPU usage in percent
|
||||
Limit float64 // Usage limit in percent
|
||||
}
|
||||
|
||||
// StatusGPU describes the GPU consumption of a process. It is the type of the
// GPU field of Status.
type StatusGPU struct {
|
||||
// NOTE(review): presumably the index of the GPU the process is running
// on; confirm against the limiter that fills this value.
Index int
|
||||
Memory StatusGPUMemory // GPU memory consumption
|
||||
Usage StatusGPUUsage // GPU usage in percent
|
||||
Encoder StatusGPUUsage // GPU encoder usage in percent
|
||||
Decoder StatusGPUUsage // GPU decoder usage in percent
|
||||
}
|
||||
|
||||
// States
|
||||
@ -206,8 +236,7 @@ type process struct {
|
||||
logger log.Logger
|
||||
debuglogger log.Logger
|
||||
callbacks struct {
|
||||
onArgs func(args []string) []string
|
||||
onBeforeStart func() error
|
||||
onBeforeStart func(args []string) ([]string, error)
|
||||
onStart func()
|
||||
onExit func(state string)
|
||||
onStateChange func(from, to string)
|
||||
@ -263,28 +292,35 @@ func New(config Config) (Process, error) {
|
||||
p.stale.last = time.Now()
|
||||
p.stale.timeout = config.StaleTimeout
|
||||
|
||||
p.callbacks.onArgs = config.OnArgs
|
||||
p.callbacks.onBeforeStart = config.OnBeforeStart
|
||||
p.callbacks.onStart = config.OnStart
|
||||
p.callbacks.onExit = config.OnExit
|
||||
p.callbacks.onStateChange = config.OnStateChange
|
||||
|
||||
p.limits = NewLimiter(LimiterConfig{
|
||||
CPU: config.LimitCPU,
|
||||
Memory: config.LimitMemory,
|
||||
WaitFor: config.LimitDuration,
|
||||
Mode: config.LimitMode,
|
||||
Logger: p.logger.WithComponent("ProcessLimiter"),
|
||||
OnLimit: func(cpu float64, memory uint64) {
|
||||
CPU: config.LimitCPU,
|
||||
Memory: config.LimitMemory,
|
||||
GPUUsage: config.LimitGPUUsage,
|
||||
GPUEncoder: config.LimitGPUEncoder,
|
||||
GPUDecoder: config.LimitGPUDecoder,
|
||||
GPUMemory: config.LimitGPUMemory,
|
||||
WaitFor: config.LimitDuration,
|
||||
Mode: config.LimitMode,
|
||||
Logger: p.logger.WithComponent("ProcessLimiter"),
|
||||
OnLimit: func(cpu float64, memory uint64, gpuusage, gpuencoder, gpudecoder float64, gpumemory uint64) {
|
||||
if !p.isRunning() {
|
||||
return
|
||||
}
|
||||
|
||||
p.logger.WithFields(log.Fields{
|
||||
"cpu": cpu,
|
||||
"memory": memory,
|
||||
"cpu": cpu,
|
||||
"memory": memory,
|
||||
"gpuusage": gpuusage,
|
||||
"gpuencoder": gpuencoder,
|
||||
"gpudecoder": gpudecoder,
|
||||
"gpumemmory": gpumemory,
|
||||
}).Warn().Log("Killed because limits are exceeded")
|
||||
p.Kill(false, fmt.Sprintf("Killed because limits are exceeded (mode: %s, tolerance: %s): %.2f (%.2f) CPU, %d (%d) bytes memory", config.LimitMode.String(), config.LimitDuration.String(), cpu, config.LimitCPU, memory, config.LimitMemory))
|
||||
p.Kill(false, fmt.Sprintf("Killed because limits are exceeded (mode: %s, tolerance: %s): %.2f (%.2f) CPU, %d (%d) bytes memory, %.2f/%.2f/%.2f (%.2f) GPU usage, %d (%d) bytes GPU memory", config.LimitMode.String(), config.LimitDuration.String(), cpu, config.LimitCPU, memory, config.LimitMemory, gpuusage, gpuencoder, gpudecoder, config.LimitGPUUsage, gpumemory, config.LimitGPUMemory))
|
||||
},
|
||||
})
|
||||
|
||||
@ -467,8 +503,47 @@ func (p *process) Status() Status {
|
||||
Duration: time.Since(stateTime),
|
||||
Time: stateTime,
|
||||
LimitMode: p.limits.Mode().String(),
|
||||
CPU: usage.CPU,
|
||||
Memory: usage.Memory,
|
||||
CPU: StatusCPU{
|
||||
NCPU: usage.CPU.NCPU,
|
||||
Current: usage.CPU.Current,
|
||||
Average: usage.CPU.Average,
|
||||
Max: usage.CPU.Max,
|
||||
Limit: usage.CPU.Limit,
|
||||
IsThrottling: usage.CPU.IsThrottling,
|
||||
},
|
||||
Memory: StatusMemory{
|
||||
Current: usage.Memory.Current,
|
||||
Average: uint64(usage.Memory.Average),
|
||||
Max: usage.Memory.Max,
|
||||
Limit: usage.Memory.Limit,
|
||||
},
|
||||
GPU: StatusGPU{
|
||||
Index: usage.GPU.Index,
|
||||
Memory: StatusGPUMemory{
|
||||
Current: usage.GPU.Memory.Current,
|
||||
Average: uint64(usage.GPU.Memory.Average),
|
||||
Max: usage.GPU.Memory.Max,
|
||||
Limit: usage.GPU.Memory.Limit,
|
||||
},
|
||||
Usage: StatusGPUUsage{
|
||||
Current: usage.GPU.Usage.Current,
|
||||
Average: usage.GPU.Usage.Average,
|
||||
Max: usage.GPU.Usage.Max,
|
||||
Limit: usage.GPU.Usage.Limit,
|
||||
},
|
||||
Encoder: StatusGPUUsage{
|
||||
Current: usage.GPU.Encoder.Current,
|
||||
Average: usage.GPU.Encoder.Average,
|
||||
Max: usage.GPU.Encoder.Max,
|
||||
Limit: usage.GPU.Encoder.Limit,
|
||||
},
|
||||
Decoder: StatusGPUUsage{
|
||||
Current: usage.GPU.Decoder.Current,
|
||||
Average: usage.GPU.Decoder.Average,
|
||||
Max: usage.GPU.Decoder.Max,
|
||||
Limit: usage.GPU.Decoder.Limit,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
s.CommandArgs = make([]string, len(p.args))
|
||||
@ -488,7 +563,7 @@ func (p *process) IsRunning() bool {
|
||||
return p.isRunning()
|
||||
}
|
||||
|
||||
func (p *process) Limit(cpu, memory bool) error {
|
||||
func (p *process) Limit(cpu, memory, gpu bool) error {
|
||||
if !p.isRunning() {
|
||||
return nil
|
||||
}
|
||||
@ -498,11 +573,12 @@ func (p *process) Limit(cpu, memory bool) error {
|
||||
}
|
||||
|
||||
p.logger.Warn().WithFields(log.Fields{
|
||||
"limit_cpu": cpu,
|
||||
"limit_memory": memory,
|
||||
"limit_cpu": cpu,
|
||||
"limit_memory": memory,
|
||||
"limit_gpumemory": gpu,
|
||||
}).Log("Limiter triggered")
|
||||
|
||||
return p.limits.Limit(cpu, memory)
|
||||
return p.limits.Limit(cpu, memory, gpu)
|
||||
}
|
||||
|
||||
// Start will start the process and sets the order to "start". If the
|
||||
@ -559,11 +635,21 @@ func (p *process) start() error {
|
||||
|
||||
args := p.args
|
||||
|
||||
if p.callbacks.onArgs != nil {
|
||||
if p.callbacks.onBeforeStart != nil {
|
||||
args = make([]string, len(p.args))
|
||||
copy(args, p.args)
|
||||
|
||||
args = p.callbacks.onArgs(args)
|
||||
args, err = p.callbacks.onBeforeStart(args)
|
||||
if err != nil {
|
||||
p.setState(stateFailed)
|
||||
|
||||
p.parser.Parse([]byte(err.Error()))
|
||||
p.logger.WithError(err).Error().Log("Starting failed")
|
||||
|
||||
p.reconnect(p.delay(stateFailed))
|
||||
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
p.cmd = exec.Command(p.binary, args...)
|
||||
@ -582,19 +668,6 @@ func (p *process) start() error {
|
||||
return err
|
||||
}
|
||||
|
||||
if p.callbacks.onBeforeStart != nil {
|
||||
if err := p.callbacks.onBeforeStart(); err != nil {
|
||||
p.setState(stateFailed)
|
||||
|
||||
p.parser.Parse([]byte(err.Error()))
|
||||
p.logger.WithError(err).Error().Log("Starting failed")
|
||||
|
||||
p.reconnect(p.delay(stateFailed))
|
||||
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if err := p.cmd.Start(); err != nil {
|
||||
p.setState(stateFailed)
|
||||
|
||||
|
||||
@ -606,21 +606,15 @@ func TestProcessCallbacks(t *testing.T) {
|
||||
"2",
|
||||
},
|
||||
Reconnect: false,
|
||||
OnArgs: func(a []string) []string {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
|
||||
args = make([]string, len(a))
|
||||
copy(args, a)
|
||||
return a
|
||||
},
|
||||
OnBeforeStart: func() error {
|
||||
OnBeforeStart: func(a []string) ([]string, error) {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
|
||||
onBeforeStart = true
|
||||
|
||||
return nil
|
||||
args = make([]string, len(a))
|
||||
copy(args, a)
|
||||
return a, nil
|
||||
},
|
||||
OnStart: func() {
|
||||
lock.Lock()
|
||||
@ -681,8 +675,8 @@ func TestProcessCallbacksOnBeforeStart(t *testing.T) {
|
||||
Parser: parser,
|
||||
Reconnect: true,
|
||||
ReconnectDelay: 10 * time.Second,
|
||||
OnBeforeStart: func() error {
|
||||
return fmt.Errorf("no, not now")
|
||||
OnBeforeStart: func(a []string) ([]string, error) {
|
||||
return a, fmt.Errorf("no, not now")
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
@ -3,21 +3,25 @@ package gpu
|
||||
import "errors"
|
||||
|
||||
type Process struct {
|
||||
PID int32
|
||||
Memory uint64
|
||||
PID int32
|
||||
Index int
|
||||
Memory uint64 // bytes
|
||||
Usage float64 // percent 0-100
|
||||
Encoder float64 // percent 0-100
|
||||
Decoder float64 // percent 0-100
|
||||
}
|
||||
|
||||
type Stats struct {
|
||||
ID string
|
||||
Name string
|
||||
Architecture string
|
||||
|
||||
MemoryTotal uint64
|
||||
MemoryUsed uint64
|
||||
MemoryTotal uint64 // bytes
|
||||
MemoryUsed uint64 // bytes
|
||||
|
||||
Usage float64
|
||||
MemoryUsage float64
|
||||
EncoderUsage float64
|
||||
DecoderUsage float64
|
||||
Usage float64 // percent 0-100
|
||||
Encoder float64 // percent 0-100
|
||||
Decoder float64 // percent 0-100
|
||||
|
||||
Process []Process
|
||||
|
||||
@ -25,9 +29,17 @@ type Stats struct {
|
||||
}
|
||||
|
||||
// GPU provides access to GPU statistics, for the devices as a whole and for
// individual processes.
type GPU interface {
|
||||
// Count returns the number of GPUs in the system.
|
||||
Count() (int, error)
|
||||
|
||||
// Stats returns the current GPU stats.
|
||||
Stats() ([]Stats, error)
|
||||
|
||||
// Process returns the GPU consumption known for the process with the
// given PID. The implementations in this package tree return
// ErrProcessNotFound if nothing is known about that PID.
|
||||
Process(pid int32) (Process, error)
|
||||
|
||||
// Close stops all GPU collection processes.
|
||||
Close()
|
||||
}
|
||||
|
||||
var ErrProcessNotFound = errors.New("process not found")
|
||||
|
||||
54
psutil/gpu/nvidia/fixtures/process.txt
Normal file
54
psutil/gpu/nvidia/fixtures/process.txt
Normal file
@ -0,0 +1,54 @@
|
||||
# gpu pid type sm mem enc dec fb command
|
||||
# Idx # C/G % % % % MB name
|
||||
0 7372 C 2 0 2 - 136 ffmpeg
|
||||
0 12176 C 5 2 3 7 782 ffmpeg
|
||||
0 20035 C 8 2 4 1 1145 ffmpeg
|
||||
0 20141 C 2 1 1 3 429 ffmpeg
|
||||
0 29591 C 2 1 - 2 435 ffmpeg
|
||||
0 7372 C 2 0 - - 136 ffmpeg
|
||||
0 12176 C 8 3 7 9 782 ffmpeg
|
||||
0 20035 C 8 2 3 1 1145 ffmpeg
|
||||
0 20141 C - - 1 1 429 ffmpeg
|
||||
0 29591 C 3 1 - 2 435 ffmpeg
|
||||
0 7372 C 2 1 1 - 136 ffmpeg
|
||||
0 12176 C 5 1 5 7 782 ffmpeg
|
||||
0 20035 C 8 3 1 4 1145 ffmpeg
|
||||
0 20141 C 2 0 1 - 429 ffmpeg
|
||||
0 29591 C 2 0 1 3 435 ffmpeg
|
||||
0 7372 C 2 0 - - 136 ffmpeg
|
||||
0 12176 C 5 1 5 3 782 ffmpeg
|
||||
0 20035 C 8 2 5 4 1145 ffmpeg
|
||||
0 20141 C 3 1 - 5 429 ffmpeg
|
||||
0 29591 C 2 0 - 1 435 ffmpeg
|
||||
0 7372 C 2 1 - - 136 ffmpeg
|
||||
0 12176 C 10 3 6 8 782 ffmpeg
|
||||
0 20035 C 3 1 1 1 1145 ffmpeg
|
||||
0 20141 C - - 4 1 429 ffmpeg
|
||||
0 29591 C 5 2 - 2 435 ffmpeg
|
||||
0 7372 C 5 1 2 - 136 ffmpeg
|
||||
0 12176 C 6 2 4 7 782 ffmpeg
|
||||
0 20035 C - - - - 1145 ffmpeg
|
||||
0 20141 C 5 1 1 3 429 ffmpeg
|
||||
0 29591 C 5 2 2 4 435 ffmpeg
|
||||
0 7372 C - - 1 - 136 ffmpeg
|
||||
0 12176 C 7 2 3 4 782 ffmpeg
|
||||
0 20035 C 2 0 - 1 1145 ffmpeg
|
||||
0 20141 C 7 2 4 4 429 ffmpeg
|
||||
0 29591 C 5 1 2 3 435 ffmpeg
|
||||
0 7372 C 2 0 1 - 136 ffmpeg
|
||||
0 12176 C 9 3 3 6 782 ffmpeg
|
||||
0 20035 C 2 1 - 1 1145 ffmpeg
|
||||
0 20141 C 4 1 4 5 429 ffmpeg
|
||||
0 29591 C 2 0 2 1 435 ffmpeg
|
||||
0 7372 C - - - - 136 ffmpeg
|
||||
0 12176 C 10 3 4 8 782 ffmpeg
|
||||
0 20035 C 4 1 2 1 1145 ffmpeg
|
||||
0 20141 C 7 2 3 3 429 ffmpeg
|
||||
# gpu pid type sm mem enc dec fb command
|
||||
# Idx # C/G % % % % MB name
|
||||
0 29591 C - - 1 1 435 ffmpeg
|
||||
0 7372 C 2 0 2 - 136 ffmpeg
|
||||
0 12176 C 7 2 2 6 782 ffmpeg
|
||||
0 20035 C 7 2 4 3 1145 ffmpeg
|
||||
0 20141 C 5 1 1 3 429 ffmpeg
|
||||
0 29591 C - - 1 1 435 ffmpeg
|
||||
@ -438,6 +438,18 @@
|
||||
</supported_mem_clock>
|
||||
</supported_clocks>
|
||||
<processes>
|
||||
<process_info>
|
||||
<pid>10131</pid>
|
||||
<type>C</type>
|
||||
<process_name>ffmpeg</process_name>
|
||||
<used_memory>389 MiB</used_memory>
|
||||
</process_info>
|
||||
<process_info>
|
||||
<pid>13597</pid>
|
||||
<type>C</type>
|
||||
<process_name>ffmpeg</process_name>
|
||||
<used_memory>1054 MiB</used_memory>
|
||||
</process_info>
|
||||
</processes>
|
||||
<accounted_processes>
|
||||
</accounted_processes>
|
||||
@ -879,6 +891,12 @@
|
||||
</supported_mem_clock>
|
||||
</supported_clocks>
|
||||
<processes>
|
||||
<process_info>
|
||||
<pid>16870</pid>
|
||||
<type>C</type>
|
||||
<process_name>ffmpeg</process_name>
|
||||
<used_memory>549 MiB</used_memory>
|
||||
</process_info>
|
||||
</processes>
|
||||
<accounted_processes>
|
||||
</accounted_processes>
|
||||
@ -6,6 +6,9 @@ import (
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@ -47,11 +50,19 @@ func (u *Utilization) UnmarshalText(text []byte) error {
|
||||
}
|
||||
|
||||
type Process struct {
|
||||
PID int32 `xml:"pid"`
|
||||
Memory Megabytes `xml:"used_memory"`
|
||||
Index int
|
||||
PID int32
|
||||
Memory uint64 // bytes
|
||||
|
||||
Usage float64 // percent 0-100
|
||||
Encoder float64 // percent 0-100
|
||||
Decoder float64 // percent 0-100
|
||||
|
||||
lastSeen time.Time
|
||||
}
|
||||
|
||||
type GPUStats struct {
|
||||
ID string `xml:"id,attr"`
|
||||
Name string `xml:"product_name"`
|
||||
Architecture string `xml:"product_architecture"`
|
||||
|
||||
@ -59,31 +70,17 @@ type GPUStats struct {
|
||||
MemoryUsed Megabytes `xml:"fb_memory_usage>used"`
|
||||
|
||||
Usage Utilization `xml:"utilization>gpu_util"`
|
||||
MemoryUsage Utilization `xml:"utilization>memory_util"`
|
||||
EncoderUsage Utilization `xml:"utilization>encoder_util"`
|
||||
DecoderUsage Utilization `xml:"utilization>decoder_util"`
|
||||
|
||||
Process []Process `xml:"processes>process_info"`
|
||||
UsageEncoder Utilization `xml:"utilization>encoder_util"`
|
||||
UsageDecoder Utilization `xml:"utilization>decoder_util"`
|
||||
}
|
||||
|
||||
// Stats is the value a single XML report is unmarshalled into. Each "gpu"
// element of the report becomes one entry in GPU.
type Stats struct {
|
||||
GPU []GPUStats `xml:"gpu"`
|
||||
}
|
||||
|
||||
func parse(data []byte) (Stats, error) {
|
||||
nv := Stats{}
|
||||
|
||||
err := xml.Unmarshal(data, &nv)
|
||||
if err != nil {
|
||||
return nv, fmt.Errorf("parsing report: %w", err)
|
||||
}
|
||||
|
||||
return nv, nil
|
||||
}
|
||||
|
||||
type nvidia struct {
|
||||
cmd *exec.Cmd
|
||||
wr *writer
|
||||
wrQuery *writerQuery
|
||||
wrProcess *writerProcess
|
||||
|
||||
lock sync.RWMutex
|
||||
cancel context.CancelFunc
|
||||
@ -97,33 +94,33 @@ type dummy struct{}
|
||||
// Count always reports zero GPUs and no error.
func (d *dummy) Count() (int, error) { return 0, nil }
|
||||
// Stats always returns a nil slice and no error.
func (d *dummy) Stats() ([]gpu.Stats, error) { return nil, nil }
|
||||
// Process always fails with gpu.ErrProcessNotFound because the dummy tracks no processes.
func (d *dummy) Process(pid int32) (gpu.Process, error) { return gpu.Process{}, gpu.ErrProcessNotFound }
|
||||
// Close is a no-op; the dummy holds no resources.
func (d *dummy) Close() {}
|
||||
|
||||
type writer struct {
|
||||
buf bytes.Buffer
|
||||
ch chan Stats
|
||||
type writerQuery struct {
|
||||
buf bytes.Buffer
|
||||
ch chan Stats
|
||||
terminator []byte
|
||||
}
|
||||
|
||||
var terminator = []byte("</nvidia_smi_log>\n")
|
||||
|
||||
func (w *writer) Write(data []byte) (int, error) {
|
||||
func (w *writerQuery) Write(data []byte) (int, error) {
|
||||
n, err := w.buf.Write(data)
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
|
||||
for {
|
||||
idx := bytes.Index(w.buf.Bytes(), terminator)
|
||||
idx := bytes.Index(w.buf.Bytes(), w.terminator)
|
||||
if idx == -1 {
|
||||
break
|
||||
}
|
||||
|
||||
content := make([]byte, idx+len(terminator))
|
||||
content := make([]byte, idx+len(w.terminator))
|
||||
n, err := w.buf.Read(content)
|
||||
if err != nil || n != len(content) {
|
||||
break
|
||||
}
|
||||
|
||||
s, err := parse(content)
|
||||
s, err := w.parse(content)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
@ -134,19 +131,132 @@ func (w *writer) Write(data []byte) (int, error) {
|
||||
return n, nil
|
||||
}
|
||||
|
||||
func (w *writerQuery) parse(data []byte) (Stats, error) {
|
||||
nv := Stats{}
|
||||
|
||||
err := xml.Unmarshal(data, &nv)
|
||||
if err != nil {
|
||||
return nv, fmt.Errorf("parsing report: %w", err)
|
||||
}
|
||||
|
||||
return nv, nil
|
||||
}
|
||||
|
||||
// writerProcess is a writer that splits the bytes written to it into
// terminator-delimited records, parses each record into a Process and
// sends the result on ch.
type writerProcess struct {
|
||||
// buf accumulates written data until a complete record is available.
buf bytes.Buffer
|
||||
// ch receives every successfully parsed Process.
ch chan Process
|
||||
// re extracts the fields of a record; parse expects six capture groups.
re *regexp.Regexp
|
||||
// terminator marks the end of a record in the written data.
terminator []byte
|
||||
}
|
||||
|
||||
func (w *writerProcess) Write(data []byte) (int, error) {
|
||||
n, err := w.buf.Write(data)
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
|
||||
for {
|
||||
idx := bytes.Index(w.buf.Bytes(), w.terminator)
|
||||
if idx == -1 {
|
||||
break
|
||||
}
|
||||
|
||||
content := make([]byte, idx+len(w.terminator))
|
||||
n, err := w.buf.Read(content)
|
||||
if err != nil || n != len(content) {
|
||||
break
|
||||
}
|
||||
|
||||
s, err := w.parse(content)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
w.ch <- s
|
||||
}
|
||||
|
||||
return n, nil
|
||||
}
|
||||
|
||||
func (w *writerProcess) parse(data []byte) (Process, error) {
|
||||
p := Process{}
|
||||
|
||||
if len(data) == 0 {
|
||||
return p, fmt.Errorf("empty line")
|
||||
}
|
||||
|
||||
if data[0] == '#' {
|
||||
return p, fmt.Errorf("comment")
|
||||
}
|
||||
|
||||
matches := w.re.FindStringSubmatch(string(data))
|
||||
if matches == nil {
|
||||
return p, fmt.Errorf("no matches found")
|
||||
}
|
||||
|
||||
if len(matches) != 7 {
|
||||
return p, fmt.Errorf("not the expected number of matches found")
|
||||
}
|
||||
|
||||
if d, err := strconv.ParseInt(matches[1], 10, 0); err == nil {
|
||||
p.Index = int(d)
|
||||
}
|
||||
|
||||
if d, err := strconv.ParseInt(matches[2], 10, 32); err == nil {
|
||||
p.PID = int32(d)
|
||||
}
|
||||
|
||||
if matches[3][0] != '-' {
|
||||
if d, err := strconv.ParseFloat(matches[3], 64); err == nil {
|
||||
p.Usage = d
|
||||
}
|
||||
}
|
||||
|
||||
if matches[4][0] != '-' {
|
||||
if d, err := strconv.ParseFloat(matches[4], 64); err == nil {
|
||||
p.Encoder = d
|
||||
}
|
||||
}
|
||||
|
||||
if matches[5][0] != '-' {
|
||||
if d, err := strconv.ParseFloat(matches[5], 64); err == nil {
|
||||
p.Decoder = d
|
||||
}
|
||||
}
|
||||
|
||||
if d, err := strconv.ParseUint(matches[6], 10, 64); err == nil {
|
||||
p.Memory = d * 1024 * 1024
|
||||
}
|
||||
|
||||
return p, nil
|
||||
}
|
||||
|
||||
func New(path string) gpu.GPU {
|
||||
if len(path) == 0 {
|
||||
path = "nvidia-smi"
|
||||
}
|
||||
|
||||
_, err := exec.LookPath(path)
|
||||
path, err := exec.LookPath(path)
|
||||
if err != nil {
|
||||
return &dummy{}
|
||||
}
|
||||
|
||||
n := &nvidia{
|
||||
wr: &writer{
|
||||
ch: make(chan Stats, 1),
|
||||
wrQuery: &writerQuery{
|
||||
ch: make(chan Stats, 1),
|
||||
terminator: []byte("</nvidia_smi_log>\n"),
|
||||
},
|
||||
wrProcess: &writerProcess{
|
||||
ch: make(chan Process, 32),
|
||||
// # gpu pid type sm mem enc dec fb command
|
||||
// # Idx # C/G % % % % MB name
|
||||
// 0 7372 C 2 0 2 - 136 ffmpeg
|
||||
// 0 12176 C 5 2 3 7 782 ffmpeg
|
||||
// 0 20035 C 8 2 4 1 1145 ffmpeg
|
||||
// 0 20141 C 2 1 1 3 429 ffmpeg
|
||||
// 0 29591 C 2 1 - 2 435 ffmpeg
|
||||
re: regexp.MustCompile(`^\s*([0-9]+)\s+([0-9]+)\s+[A-Z]\s+([0-9-]+)\s+[0-9-]+\s+([0-9-]+)\s+([0-9-]+)\s+([0-9]+).*`),
|
||||
terminator: []byte("\n"),
|
||||
},
|
||||
process: map[int32]Process{},
|
||||
}
|
||||
@ -154,7 +264,8 @@ func New(path string) gpu.GPU {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
n.cancel = cancel
|
||||
|
||||
go n.runner(ctx, path)
|
||||
go n.runnerQuery(ctx, path)
|
||||
go n.runnerProcess(ctx, path)
|
||||
go n.reader(ctx)
|
||||
|
||||
return n
|
||||
@ -165,13 +276,18 @@ func (n *nvidia) reader(ctx context.Context) {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case stats := <-n.wr.ch:
|
||||
case stats := <-n.wrQuery.ch:
|
||||
n.lock.Lock()
|
||||
n.stats = stats
|
||||
n.process = map[int32]Process{}
|
||||
for _, g := range n.stats.GPU {
|
||||
for _, p := range g.Process {
|
||||
n.process[p.PID] = p
|
||||
n.lock.Unlock()
|
||||
case process := <-n.wrProcess.ch:
|
||||
process.lastSeen = time.Now()
|
||||
n.lock.Lock()
|
||||
n.process[process.PID] = process
|
||||
|
||||
for pid, p := range n.process {
|
||||
if time.Since(p.lastSeen) > 11*time.Second {
|
||||
delete(n.process, pid)
|
||||
}
|
||||
}
|
||||
n.lock.Unlock()
|
||||
@ -179,11 +295,11 @@ func (n *nvidia) reader(ctx context.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
func (n *nvidia) runner(ctx context.Context, path string) {
|
||||
func (n *nvidia) runnerQuery(ctx context.Context, path string) {
|
||||
for {
|
||||
n.cmd = exec.Command(path, "-q", "-x", "-l", "1")
|
||||
n.cmd.Stdout = n.wr
|
||||
err := n.cmd.Start()
|
||||
cmd := exec.CommandContext(ctx, path, "-q", "-x", "-l", "1")
|
||||
cmd.Stdout = n.wrQuery
|
||||
err := cmd.Start()
|
||||
if err != nil {
|
||||
n.lock.Lock()
|
||||
n.err = err
|
||||
@ -193,7 +309,35 @@ func (n *nvidia) runner(ctx context.Context, path string) {
|
||||
continue
|
||||
}
|
||||
|
||||
err = n.cmd.Wait()
|
||||
err = cmd.Wait()
|
||||
|
||||
n.lock.Lock()
|
||||
n.err = err
|
||||
n.lock.Unlock()
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (n *nvidia) runnerProcess(ctx context.Context, path string) {
|
||||
for {
|
||||
cmd := exec.CommandContext(ctx, path, "pmon", "-s", "um", "-d", "5")
|
||||
cmd.Stdout = n.wrProcess
|
||||
err := cmd.Start()
|
||||
if err != nil {
|
||||
n.lock.Lock()
|
||||
n.err = err
|
||||
n.lock.Unlock()
|
||||
|
||||
time.Sleep(3 * time.Second)
|
||||
continue
|
||||
}
|
||||
|
||||
err = cmd.Wait()
|
||||
|
||||
n.lock.Lock()
|
||||
n.err = err
|
||||
@ -219,39 +363,55 @@ func (n *nvidia) Count() (int, error) {
|
||||
}
|
||||
|
||||
func (n *nvidia) Stats() ([]gpu.Stats, error) {
|
||||
s := []gpu.Stats{}
|
||||
stats := []gpu.Stats{}
|
||||
|
||||
n.lock.RLock()
|
||||
defer n.lock.RUnlock()
|
||||
|
||||
if n.err != nil {
|
||||
return s, n.err
|
||||
return stats, n.err
|
||||
}
|
||||
|
||||
for _, nv := range n.stats.GPU {
|
||||
stats := gpu.Stats{
|
||||
s := gpu.Stats{
|
||||
ID: nv.ID,
|
||||
Name: nv.Name,
|
||||
Architecture: nv.Architecture,
|
||||
MemoryTotal: uint64(nv.MemoryTotal),
|
||||
MemoryUsed: uint64(nv.MemoryUsed),
|
||||
Usage: float64(nv.Usage),
|
||||
MemoryUsage: float64(nv.MemoryUsage),
|
||||
EncoderUsage: float64(nv.EncoderUsage),
|
||||
DecoderUsage: float64(nv.DecoderUsage),
|
||||
Encoder: float64(nv.UsageEncoder),
|
||||
Decoder: float64(nv.UsageDecoder),
|
||||
Process: []gpu.Process{},
|
||||
}
|
||||
|
||||
for _, p := range nv.Process {
|
||||
stats.Process = append(stats.Process, gpu.Process{
|
||||
PID: p.PID,
|
||||
Memory: uint64(p.Memory),
|
||||
})
|
||||
}
|
||||
|
||||
s = append(s, stats)
|
||||
stats = append(stats, s)
|
||||
}
|
||||
|
||||
return s, nil
|
||||
for _, p := range n.process {
|
||||
if p.Index >= len(stats) {
|
||||
continue
|
||||
}
|
||||
|
||||
stats[p.Index].Process = append(stats[p.Index].Process, gpu.Process{
|
||||
PID: p.PID,
|
||||
Index: p.Index,
|
||||
Memory: p.Memory,
|
||||
Usage: p.Usage,
|
||||
Encoder: p.Encoder,
|
||||
Decoder: p.Decoder,
|
||||
})
|
||||
}
|
||||
|
||||
for i := range stats {
|
||||
p := stats[i].Process
|
||||
slices.SortFunc(p, func(a, b gpu.Process) int {
|
||||
return int(a.PID - b.PID)
|
||||
})
|
||||
stats[i].Process = p
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func (n *nvidia) Process(pid int32) (gpu.Process, error) {
|
||||
@ -259,14 +419,18 @@ func (n *nvidia) Process(pid int32) (gpu.Process, error) {
|
||||
defer n.lock.RUnlock()
|
||||
|
||||
p, hasProcess := n.process[pid]
|
||||
if !hasProcess {
|
||||
return gpu.Process{}, gpu.ErrProcessNotFound
|
||||
if hasProcess {
|
||||
return gpu.Process{
|
||||
PID: p.PID,
|
||||
Index: p.Index,
|
||||
Memory: p.Memory,
|
||||
Usage: p.Usage,
|
||||
Encoder: p.Encoder,
|
||||
Decoder: p.Decoder,
|
||||
}, nil
|
||||
}
|
||||
|
||||
return gpu.Process{
|
||||
PID: p.PID,
|
||||
Memory: uint64(p.Memory),
|
||||
}, nil
|
||||
return gpu.Process{Index: -1}, gpu.ErrProcessNotFound
|
||||
}
|
||||
|
||||
func (n *nvidia) Close() {
|
||||
@ -279,6 +443,4 @@ func (n *nvidia) Close() {
|
||||
|
||||
n.cancel()
|
||||
n.cancel = nil
|
||||
|
||||
n.cmd.Process.Kill()
|
||||
}
|
||||
|
||||
@ -1,102 +1,430 @@
|
||||
package nvidia
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"os"
|
||||
"regexp"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/datarhei/core/v16/internal/testhelper"
|
||||
"github.com/datarhei/core/v16/psutil/gpu"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestParseNV(t *testing.T) {
|
||||
data, err := os.ReadFile("./fixtures/data1.xml")
|
||||
func TestParseQuery(t *testing.T) {
|
||||
data, err := os.ReadFile("./fixtures/query1.xml")
|
||||
require.NoError(t, err)
|
||||
|
||||
nv, err := parse(data)
|
||||
wr := &writerQuery{}
|
||||
|
||||
nv, err := wr.parse(data)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Equal(t, Stats{
|
||||
GPU: []GPUStats{
|
||||
{
|
||||
ID: "00000000:01:00.0",
|
||||
Name: "NVIDIA GeForce GTX 1080",
|
||||
Architecture: "Pascal",
|
||||
MemoryTotal: 8119 * 1024 * 1024,
|
||||
MemoryUsed: 918 * 1024 * 1024,
|
||||
Usage: 15,
|
||||
MemoryUsage: 7,
|
||||
EncoderUsage: 3,
|
||||
DecoderUsage: 0,
|
||||
Process: []Process{
|
||||
{
|
||||
PID: 18179,
|
||||
Memory: 916 * 1024 * 1024,
|
||||
},
|
||||
},
|
||||
UsageEncoder: 3,
|
||||
UsageDecoder: 0,
|
||||
},
|
||||
},
|
||||
}, nv)
|
||||
|
||||
data, err = os.ReadFile("./fixtures/data2.xml")
|
||||
data, err = os.ReadFile("./fixtures/query2.xml")
|
||||
require.NoError(t, err)
|
||||
|
||||
nv, err = parse(data)
|
||||
nv, err = wr.parse(data)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Equal(t, Stats{
|
||||
GPU: []GPUStats{
|
||||
{
|
||||
ID: "00000000:01:00.0",
|
||||
Name: "NVIDIA L4",
|
||||
Architecture: "Ada Lovelace",
|
||||
MemoryTotal: 23034 * 1024 * 1024,
|
||||
MemoryUsed: 1 * 1024 * 1024,
|
||||
Usage: 2,
|
||||
MemoryUsage: 0,
|
||||
EncoderUsage: 0,
|
||||
DecoderUsage: 0,
|
||||
UsageEncoder: 0,
|
||||
UsageDecoder: 0,
|
||||
},
|
||||
{
|
||||
ID: "00000000:C1:00.0",
|
||||
Name: "NVIDIA L4",
|
||||
Architecture: "Ada Lovelace",
|
||||
MemoryTotal: 23034 * 1024 * 1024,
|
||||
MemoryUsed: 1 * 1024 * 1024,
|
||||
Usage: 3,
|
||||
MemoryUsage: 0,
|
||||
EncoderUsage: 0,
|
||||
DecoderUsage: 0,
|
||||
UsageEncoder: 0,
|
||||
UsageDecoder: 0,
|
||||
},
|
||||
},
|
||||
}, nv)
|
||||
|
||||
data, err = os.ReadFile("./fixtures/data3.xml")
|
||||
data, err = os.ReadFile("./fixtures/query3.xml")
|
||||
require.NoError(t, err)
|
||||
|
||||
nv, err = parse(data)
|
||||
nv, err = wr.parse(data)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Equal(t, Stats{
|
||||
GPU: []GPUStats{
|
||||
{
|
||||
ID: "00000000:01:00.0",
|
||||
Name: "GeForce GTX 1080",
|
||||
MemoryTotal: 8119 * 1024 * 1024,
|
||||
MemoryUsed: 2006 * 1024 * 1024,
|
||||
Usage: 32,
|
||||
MemoryUsage: 11,
|
||||
EncoderUsage: 17,
|
||||
DecoderUsage: 25,
|
||||
Process: []Process{
|
||||
{
|
||||
PID: 10131,
|
||||
Memory: 389 * 1024 * 1024,
|
||||
},
|
||||
{
|
||||
PID: 13597,
|
||||
Memory: 1054 * 1024 * 1024,
|
||||
},
|
||||
{
|
||||
PID: 16870,
|
||||
Memory: 549 * 1024 * 1024,
|
||||
},
|
||||
},
|
||||
UsageEncoder: 17,
|
||||
UsageDecoder: 25,
|
||||
},
|
||||
},
|
||||
}, nv)
|
||||
}
|
||||
|
||||
func TestParseProcess(t *testing.T) {
|
||||
data, err := os.ReadFile("./fixtures/process.txt")
|
||||
require.NoError(t, err)
|
||||
|
||||
wr := &writerProcess{
|
||||
re: regexp.MustCompile(`^\s*([0-9]+)\s+([0-9]+)\s+[A-Z]\s+([0-9-]+)\s+[0-9-]+\s+([0-9-]+)\s+([0-9-]+)\s+([0-9]+).*`),
|
||||
}
|
||||
|
||||
lines := bytes.Split(data, []byte("\n"))
|
||||
process := map[int32]Process{}
|
||||
|
||||
for _, line := range lines {
|
||||
p, err := wr.parse(line)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
process[p.PID] = p
|
||||
}
|
||||
|
||||
require.Equal(t, map[int32]Process{
|
||||
7372: {
|
||||
Index: 0,
|
||||
PID: 7372,
|
||||
Memory: 136 * 1024 * 1024,
|
||||
Usage: 2,
|
||||
Encoder: 2,
|
||||
Decoder: 0,
|
||||
},
|
||||
12176: {
|
||||
Index: 0,
|
||||
PID: 12176,
|
||||
Memory: 782 * 1024 * 1024,
|
||||
Usage: 7,
|
||||
Encoder: 2,
|
||||
Decoder: 6,
|
||||
},
|
||||
20035: {
|
||||
Index: 0,
|
||||
PID: 20035,
|
||||
Memory: 1145 * 1024 * 1024,
|
||||
Usage: 7,
|
||||
Encoder: 4,
|
||||
Decoder: 3,
|
||||
},
|
||||
20141: {
|
||||
Index: 0,
|
||||
PID: 20141,
|
||||
Memory: 429 * 1024 * 1024,
|
||||
Usage: 5,
|
||||
Encoder: 1,
|
||||
Decoder: 3,
|
||||
},
|
||||
29591: {
|
||||
Index: 0,
|
||||
PID: 29591,
|
||||
Memory: 435 * 1024 * 1024,
|
||||
Usage: 0,
|
||||
Encoder: 1,
|
||||
Decoder: 1,
|
||||
},
|
||||
}, process)
|
||||
}
|
||||
|
||||
func TestWriterQuery(t *testing.T) {
|
||||
data, err := os.ReadFile("./fixtures/query2.xml")
|
||||
require.NoError(t, err)
|
||||
|
||||
wr := &writerQuery{
|
||||
ch: make(chan Stats, 1),
|
||||
terminator: []byte("</nvidia_smi_log>"),
|
||||
}
|
||||
|
||||
stats := Stats{}
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
|
||||
for s := range wr.ch {
|
||||
stats = s
|
||||
}
|
||||
}()
|
||||
|
||||
_, err = wr.Write(data)
|
||||
require.NoError(t, err)
|
||||
|
||||
close(wr.ch)
|
||||
|
||||
wg.Wait()
|
||||
|
||||
require.Equal(t, Stats{
|
||||
GPU: []GPUStats{
|
||||
{
|
||||
ID: "00000000:01:00.0",
|
||||
Name: "NVIDIA L4",
|
||||
Architecture: "Ada Lovelace",
|
||||
MemoryTotal: 23034 * 1024 * 1024,
|
||||
MemoryUsed: 1 * 1024 * 1024,
|
||||
Usage: 2,
|
||||
UsageEncoder: 0,
|
||||
UsageDecoder: 0,
|
||||
},
|
||||
{
|
||||
ID: "00000000:C1:00.0",
|
||||
Name: "NVIDIA L4",
|
||||
Architecture: "Ada Lovelace",
|
||||
MemoryTotal: 23034 * 1024 * 1024,
|
||||
MemoryUsed: 1 * 1024 * 1024,
|
||||
Usage: 3,
|
||||
UsageEncoder: 0,
|
||||
UsageDecoder: 0,
|
||||
},
|
||||
},
|
||||
}, stats)
|
||||
}
|
||||
|
||||
func TestWriterProcess(t *testing.T) {
|
||||
data, err := os.ReadFile("./fixtures/process.txt")
|
||||
require.NoError(t, err)
|
||||
|
||||
wr := &writerProcess{
|
||||
ch: make(chan Process, 32),
|
||||
re: regexp.MustCompile(`^\s*([0-9]+)\s+([0-9]+)\s+[A-Z]\s+([0-9-]+)\s+[0-9-]+\s+([0-9-]+)\s+([0-9-]+)\s+([0-9]+).*`),
|
||||
terminator: []byte("\n"),
|
||||
}
|
||||
|
||||
process := map[int32]Process{}
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for p := range wr.ch {
|
||||
process[p.PID] = p
|
||||
}
|
||||
}()
|
||||
|
||||
_, err = wr.Write(data)
|
||||
require.NoError(t, err)
|
||||
|
||||
close(wr.ch)
|
||||
|
||||
wg.Wait()
|
||||
|
||||
require.Equal(t, map[int32]Process{
|
||||
7372: {
|
||||
Index: 0,
|
||||
PID: 7372,
|
||||
Memory: 136 * 1024 * 1024,
|
||||
Usage: 2,
|
||||
Encoder: 2,
|
||||
Decoder: 0,
|
||||
},
|
||||
12176: {
|
||||
Index: 0,
|
||||
PID: 12176,
|
||||
Memory: 782 * 1024 * 1024,
|
||||
Usage: 7,
|
||||
Encoder: 2,
|
||||
Decoder: 6,
|
||||
},
|
||||
20035: {
|
||||
Index: 0,
|
||||
PID: 20035,
|
||||
Memory: 1145 * 1024 * 1024,
|
||||
Usage: 7,
|
||||
Encoder: 4,
|
||||
Decoder: 3,
|
||||
},
|
||||
20141: {
|
||||
Index: 0,
|
||||
PID: 20141,
|
||||
Memory: 429 * 1024 * 1024,
|
||||
Usage: 5,
|
||||
Encoder: 1,
|
||||
Decoder: 3,
|
||||
},
|
||||
29591: {
|
||||
Index: 0,
|
||||
PID: 29591,
|
||||
Memory: 435 * 1024 * 1024,
|
||||
Usage: 0,
|
||||
Encoder: 1,
|
||||
Decoder: 1,
|
||||
},
|
||||
}, process)
|
||||
}
|
||||
|
||||
func TestNvidiaGPUCount(t *testing.T) {
|
||||
binary, err := testhelper.BuildBinary("nvidia-smi", "../../../internal/testhelper")
|
||||
require.NoError(t, err, "Failed to build helper program")
|
||||
|
||||
nv := New(binary)
|
||||
|
||||
t.Cleanup(func() {
|
||||
nv.Close()
|
||||
})
|
||||
|
||||
_, ok := nv.(*dummy)
|
||||
require.False(t, ok)
|
||||
|
||||
require.Eventually(t, func() bool {
|
||||
count, _ := nv.Count()
|
||||
return count != 0
|
||||
}, 5*time.Second, time.Second)
|
||||
}
|
||||
|
||||
func TestNvidiaGPUStats(t *testing.T) {
|
||||
binary, err := testhelper.BuildBinary("nvidia-smi", "../../../internal/testhelper")
|
||||
require.NoError(t, err, "Failed to build helper program")
|
||||
|
||||
nv := New(binary)
|
||||
|
||||
t.Cleanup(func() {
|
||||
nv.Close()
|
||||
})
|
||||
|
||||
_, ok := nv.(*dummy)
|
||||
require.False(t, ok)
|
||||
|
||||
require.Eventually(t, func() bool {
|
||||
stats, _ := nv.Stats()
|
||||
|
||||
if len(stats) != 2 {
|
||||
return false
|
||||
}
|
||||
|
||||
if len(stats[0].Process) != 3 {
|
||||
return false
|
||||
}
|
||||
|
||||
if len(stats[1].Process) != 2 {
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
}, 5*time.Second, time.Second)
|
||||
|
||||
stats, err := nv.Stats()
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, []gpu.Stats{
|
||||
{
|
||||
ID: "00000000:01:00.0",
|
||||
Name: "NVIDIA L4",
|
||||
Architecture: "Ada Lovelace",
|
||||
MemoryTotal: 23034 * 1024 * 1024,
|
||||
MemoryUsed: 1 * 1024 * 1024,
|
||||
Usage: 2,
|
||||
Encoder: 0,
|
||||
Decoder: 0,
|
||||
Process: []gpu.Process{
|
||||
{
|
||||
Index: 0,
|
||||
PID: 7372,
|
||||
Memory: 136 * 1024 * 1024,
|
||||
Usage: 2,
|
||||
Encoder: 2,
|
||||
Decoder: 0,
|
||||
},
|
||||
{
|
||||
Index: 0,
|
||||
PID: 12176,
|
||||
Memory: 782 * 1024 * 1024,
|
||||
Usage: 5,
|
||||
Encoder: 3,
|
||||
Decoder: 7,
|
||||
},
|
||||
{
|
||||
Index: 0,
|
||||
PID: 29591,
|
||||
Memory: 435 * 1024 * 1024,
|
||||
Usage: 2,
|
||||
Encoder: 0,
|
||||
Decoder: 2,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
ID: "00000000:C1:00.0",
|
||||
Name: "NVIDIA L4",
|
||||
Architecture: "Ada Lovelace",
|
||||
MemoryTotal: 23034 * 1024 * 1024,
|
||||
MemoryUsed: 1 * 1024 * 1024,
|
||||
Usage: 3,
|
||||
Encoder: 0,
|
||||
Decoder: 0,
|
||||
Process: []gpu.Process{
|
||||
{
|
||||
Index: 1,
|
||||
PID: 20035,
|
||||
Memory: 1145 * 1024 * 1024,
|
||||
Usage: 8,
|
||||
Encoder: 4,
|
||||
Decoder: 1,
|
||||
},
|
||||
{
|
||||
Index: 1,
|
||||
PID: 20141,
|
||||
Memory: 429 * 1024 * 1024,
|
||||
Usage: 2,
|
||||
Encoder: 1,
|
||||
Decoder: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
}, stats)
|
||||
}
|
||||
|
||||
func TestNvidiaGPUProcess(t *testing.T) {
|
||||
binary, err := testhelper.BuildBinary("nvidia-smi", "../../../internal/testhelper")
|
||||
require.NoError(t, err, "Failed to build helper program")
|
||||
|
||||
nv := New(binary)
|
||||
|
||||
t.Cleanup(func() {
|
||||
nv.Close()
|
||||
})
|
||||
|
||||
_, ok := nv.(*dummy)
|
||||
require.False(t, ok)
|
||||
|
||||
require.Eventually(t, func() bool {
|
||||
_, err := nv.Process(12176)
|
||||
return err == nil
|
||||
}, 5*time.Second, time.Second)
|
||||
|
||||
proc, err := nv.Process(12176)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, gpu.Process{
|
||||
Index: 0,
|
||||
PID: 12176,
|
||||
Memory: 782 * 1024 * 1024,
|
||||
Usage: 5,
|
||||
Encoder: 3,
|
||||
Decoder: 7,
|
||||
}, proc)
|
||||
}
|
||||
|
||||
@ -5,24 +5,28 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/datarhei/core/v16/psutil/gpu/nvidia"
|
||||
psprocess "github.com/shirou/gopsutil/v3/process"
|
||||
)
|
||||
|
||||
type Process interface {
|
||||
// CPUPercent returns the current CPU load for this process only. The values
|
||||
// CPU returns the current CPU load for this process only. The values
|
||||
// are normed to the range of 0 to 100.
|
||||
CPUPercent() (*CPUInfoStat, error)
|
||||
CPU() (*CPUInfo, error)
|
||||
|
||||
// VirtualMemory returns the current memory usage in bytes of this process only.
|
||||
VirtualMemory() (uint64, error)
|
||||
// Memory returns the current memory usage in bytes of this process only.
|
||||
Memory() (uint64, error)
|
||||
|
||||
// GPU returns the current GPU memory in bytes and usage in percent (0-100) of this process only.
|
||||
GPU() (*GPUInfo, error)
|
||||
|
||||
// Stop will stop collecting CPU and memory data for this process.
|
||||
Stop()
|
||||
|
||||
// Suspend will send SIGSTOP to the process
|
||||
// Suspend will send SIGSTOP to the process.
|
||||
Suspend() error
|
||||
|
||||
// Resume will send SIGCONT to the process
|
||||
// Resume will send SIGCONT to the process.
|
||||
Resume() error
|
||||
}
|
||||
|
||||
@ -142,7 +146,7 @@ func (p *process) Resume() error {
|
||||
return p.proc.Resume()
|
||||
}
|
||||
|
||||
func (p *process) CPUPercent() (*CPUInfoStat, error) {
|
||||
func (p *process) CPU() (*CPUInfo, error) {
|
||||
var diff float64
|
||||
|
||||
for {
|
||||
@ -167,7 +171,7 @@ func (p *process) CPUPercent() (*CPUInfoStat, error) {
|
||||
diff = p.statCurrentTime.Sub(p.statPreviousTime).Seconds() * p.ncpu
|
||||
}
|
||||
|
||||
s := &CPUInfoStat{
|
||||
s := &CPUInfo{
|
||||
System: 0,
|
||||
User: 0,
|
||||
Idle: 0,
|
||||
@ -186,9 +190,28 @@ func (p *process) CPUPercent() (*CPUInfoStat, error) {
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func (p *process) VirtualMemory() (uint64, error) {
|
||||
func (p *process) Memory() (uint64, error) {
|
||||
p.lock.RLock()
|
||||
defer p.lock.RUnlock()
|
||||
|
||||
return p.memRSS, nil
|
||||
}
|
||||
|
||||
func (p *process) GPU() (*GPUInfo, error) {
|
||||
info := &GPUInfo{
|
||||
Index: -1,
|
||||
}
|
||||
|
||||
proc, err := nvidia.Default.Process(p.pid)
|
||||
if err != nil {
|
||||
return info, nil
|
||||
}
|
||||
|
||||
info.Index = proc.Index
|
||||
info.MemoryUsed = proc.Memory
|
||||
info.Usage = proc.Usage
|
||||
info.Encoder = proc.Encoder
|
||||
info.Decoder = proc.Decoder
|
||||
|
||||
return info, nil
|
||||
}
|
||||
|
||||
174
psutil/psutil.go
174
psutil/psutil.go
@ -47,29 +47,44 @@ func init() {
|
||||
DefaultUtil, _ = New("/sys/fs/cgroup")
|
||||
}
|
||||
|
||||
type MemoryInfoStat struct {
|
||||
// DiskInfo describes the usage of one partition. Disk fills every field
// from the equally named field of the gopsutil disk.Usage result.
// NOTE(review): Total and Used are presumably bytes — confirm against the
// gopsutil disk.UsageStat documentation.
type DiskInfo struct {
|
||||
Path string
|
||||
Fstype string
|
||||
Total uint64
|
||||
Used uint64
|
||||
InodesTotal uint64
|
||||
InodesUsed uint64
|
||||
}
|
||||
|
||||
// MemoryInfo holds total, available and used memory, all in bytes. It is
// the type returned by Memory.
type MemoryInfo struct {
|
||||
Total uint64 // bytes
|
||||
Available uint64 // bytes
|
||||
Used uint64 // bytes
|
||||
}
|
||||
|
||||
type CPUInfoStat struct {
|
||||
// NetworkInfo holds the byte counters of one network interface. Network
// returns one entry per element of gopsutil's net.IOCounters(true) result.
type NetworkInfo struct {
|
||||
Name string // interface name
|
||||
BytesSent uint64 // number of bytes sent
|
||||
BytesRecv uint64 // number of bytes received
|
||||
}
|
||||
|
||||
// CPUInfo is a CPU load breakdown, each part in percent (0-100). It is the
// type returned by the CPU methods of both Util and Process.
type CPUInfo struct {
|
||||
System float64 // percent 0-100
|
||||
User float64 // percent 0-100
|
||||
Idle float64 // percent 0-100
|
||||
Other float64 // percent 0-100
|
||||
}
|
||||
|
||||
type GPUInfoStat struct {
|
||||
Name string
|
||||
type GPUInfo struct {
|
||||
Index int // Index of the GPU
|
||||
Name string // Name of the GPU (not populated for a specific process)
|
||||
|
||||
MemoryTotal uint64 // bytes
|
||||
MemoryTotal uint64 // bytes (not populated for a specific process)
|
||||
MemoryUsed uint64 // bytes
|
||||
|
||||
Usage float64 // percent 0-100
|
||||
MemoryUsage float64 // percent 0-100
|
||||
EncoderUsage float64 // percent 0-100
|
||||
DecoderUsage float64 // percent 0-100
|
||||
Usage float64 // percent 0-100
|
||||
Encoder float64 // percent 0-100
|
||||
Decoder float64 // percent 0-100
|
||||
}
|
||||
|
||||
type cpuTimesStat struct {
|
||||
@ -85,18 +100,23 @@ type Util interface {
|
||||
Stop()
|
||||
|
||||
// CPUCounts returns the number of logical CPU cores.
|
||||
CPUCounts(logical bool) (float64, error)
|
||||
CPUCounts() (float64, error)
|
||||
|
||||
// GPUCounts returns the number of GPU cores.
|
||||
GPUCounts() (float64, error)
|
||||
|
||||
// CPUPercent returns the current CPU load in percent. The values range
|
||||
// CPU returns the current CPU load in percent. The values range
|
||||
// from 0 to 100, independently of the number of logical cores.
|
||||
CPUPercent() (*CPUInfoStat, error)
|
||||
DiskUsage(path string) (*disk.UsageStat, error)
|
||||
VirtualMemory() (*MemoryInfoStat, error)
|
||||
NetIOCounters(pernic bool) ([]net.IOCountersStat, error)
|
||||
GPUStats() ([]GPUInfoStat, error)
|
||||
CPU() (*CPUInfo, error)
|
||||
|
||||
// Disk returns the current usage of the partition specified by the path.
|
||||
Disk(path string) (*DiskInfo, error)
|
||||
|
||||
// Memory returns the current memory usage.
|
||||
Memory() (*MemoryInfo, error)
|
||||
|
||||
// Network returns the current network interface statistics per network adapter.
|
||||
Network() ([]NetworkInfo, error)
|
||||
|
||||
// GPU returns the current usage for each GPU.
|
||||
GPU() ([]GPUInfo, error)
|
||||
|
||||
// Process returns a process observer for a process with the given pid.
|
||||
Process(pid int32) (Process, error)
|
||||
@ -120,7 +140,7 @@ type util struct {
|
||||
statPrevious cpuTimesStat
|
||||
statPreviousTime time.Time
|
||||
nTicks uint64
|
||||
mem MemoryInfoStat
|
||||
mem MemoryInfo
|
||||
}
|
||||
|
||||
// New returns a new util, it will be started automatically
|
||||
@ -140,7 +160,7 @@ func New(root string) (Util, error) {
|
||||
|
||||
if u.ncpu == 0 {
|
||||
var err error
|
||||
u.ncpu, err = u.CPUCounts(true)
|
||||
u.ncpu, err = u.CPUCounts()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -311,7 +331,7 @@ func (u *util) tickMemory(ctx context.Context, interval time.Duration) {
|
||||
}
|
||||
}
|
||||
|
||||
func (u *util) collectMemory() *MemoryInfoStat {
|
||||
func (u *util) collectMemory() *MemoryInfo {
|
||||
stat, err := u.virtualMemory()
|
||||
if err != nil {
|
||||
return nil
|
||||
@ -320,12 +340,12 @@ func (u *util) collectMemory() *MemoryInfoStat {
|
||||
return stat
|
||||
}
|
||||
|
||||
func (u *util) CPUCounts(logical bool) (float64, error) {
|
||||
func (u *util) CPUCounts() (float64, error) {
|
||||
if u.hasCgroup && u.ncpu > 0 {
|
||||
return u.ncpu, nil
|
||||
}
|
||||
|
||||
ncpu, err := cpu.Counts(logical)
|
||||
ncpu, err := cpu.Counts(true)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
@ -333,18 +353,8 @@ func (u *util) CPUCounts(logical bool) (float64, error) {
|
||||
return float64(ncpu), nil
|
||||
}
|
||||
|
||||
func CPUCounts(logical bool) (float64, error) {
|
||||
return DefaultUtil.CPUCounts(logical)
|
||||
}
|
||||
|
||||
func (u *util) GPUCounts() (float64, error) {
|
||||
count, err := nvidia.Default.Count()
|
||||
|
||||
return float64(count), err
|
||||
}
|
||||
|
||||
func GPUCounts() (float64, error) {
|
||||
return DefaultUtil.GPUCounts()
|
||||
func CPUCounts() (float64, error) {
|
||||
return DefaultUtil.CPUCounts()
|
||||
}
|
||||
|
||||
// cpuTimes returns the current cpu usage times in seconds.
|
||||
@ -381,7 +391,7 @@ func (u *util) cpuTimes() (*cpuTimesStat, error) {
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func (u *util) CPUPercent() (*CPUInfoStat, error) {
|
||||
func (u *util) CPU() (*CPUInfo, error) {
|
||||
var total float64
|
||||
|
||||
for {
|
||||
@ -406,7 +416,7 @@ func (u *util) CPUPercent() (*CPUInfoStat, error) {
|
||||
total = (u.statCurrent.total - u.statPrevious.total)
|
||||
}
|
||||
|
||||
s := &CPUInfoStat{
|
||||
s := &CPUInfo{
|
||||
System: 0,
|
||||
User: 0,
|
||||
Idle: 100,
|
||||
@ -429,8 +439,8 @@ func (u *util) CPUPercent() (*CPUInfoStat, error) {
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func CPUPercent() (*CPUInfoStat, error) {
|
||||
return DefaultUtil.CPUPercent()
|
||||
func CPUPercent() (*CPUInfo, error) {
|
||||
return DefaultUtil.CPU()
|
||||
}
|
||||
|
||||
func (u *util) cgroupCPUTimes(version int) (*cpuTimesStat, error) {
|
||||
@ -466,15 +476,29 @@ func (u *util) cgroupCPUTimes(version int) (*cpuTimesStat, error) {
|
||||
return info, nil
|
||||
}
|
||||
|
||||
func (u *util) DiskUsage(path string) (*disk.UsageStat, error) {
|
||||
return disk.Usage(path)
|
||||
func (u *util) Disk(path string) (*DiskInfo, error) {
|
||||
usage, err := disk.Usage(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
info := &DiskInfo{
|
||||
Path: usage.Path,
|
||||
Fstype: usage.Fstype,
|
||||
Total: usage.Total,
|
||||
Used: usage.Used,
|
||||
InodesTotal: usage.InodesTotal,
|
||||
InodesUsed: usage.InodesUsed,
|
||||
}
|
||||
|
||||
return info, nil
|
||||
}
|
||||
|
||||
func DiskUsage(path string) (*disk.UsageStat, error) {
|
||||
return DefaultUtil.DiskUsage(path)
|
||||
func Disk(path string) (*DiskInfo, error) {
|
||||
return DefaultUtil.Disk(path)
|
||||
}
|
||||
|
||||
func (u *util) virtualMemory() (*MemoryInfoStat, error) {
|
||||
func (u *util) virtualMemory() (*MemoryInfo, error) {
|
||||
info, err := mem.VirtualMemory()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -489,18 +513,18 @@ func (u *util) virtualMemory() (*MemoryInfoStat, error) {
|
||||
}
|
||||
}
|
||||
|
||||
return &MemoryInfoStat{
|
||||
return &MemoryInfo{
|
||||
Total: info.Total,
|
||||
Available: info.Available,
|
||||
Used: info.Used,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (u *util) VirtualMemory() (*MemoryInfoStat, error) {
|
||||
func (u *util) Memory() (*MemoryInfo, error) {
|
||||
u.lock.RLock()
|
||||
defer u.lock.RUnlock()
|
||||
|
||||
stat := &MemoryInfoStat{
|
||||
stat := &MemoryInfo{
|
||||
Total: u.mem.Total,
|
||||
Available: u.mem.Available,
|
||||
Used: u.mem.Used,
|
||||
@ -509,12 +533,12 @@ func (u *util) VirtualMemory() (*MemoryInfoStat, error) {
|
||||
return stat, nil
|
||||
}
|
||||
|
||||
func VirtualMemory() (*MemoryInfoStat, error) {
|
||||
return DefaultUtil.VirtualMemory()
|
||||
func Memory() (*MemoryInfo, error) {
|
||||
return DefaultUtil.Memory()
|
||||
}
|
||||
|
||||
func (u *util) cgroupVirtualMemory(version int) (*MemoryInfoStat, error) {
|
||||
info := &MemoryInfoStat{}
|
||||
func (u *util) cgroupVirtualMemory(version int) (*MemoryInfo, error) {
|
||||
info := &MemoryInfo{}
|
||||
|
||||
if version == 1 {
|
||||
lines, err := u.readFile("memory/memory.limit_in_bytes")
|
||||
@ -569,12 +593,27 @@ func (u *util) cgroupVirtualMemory(version int) (*MemoryInfoStat, error) {
|
||||
return info, nil
|
||||
}
|
||||
|
||||
func (u *util) NetIOCounters(pernic bool) ([]net.IOCountersStat, error) {
|
||||
return net.IOCounters(pernic)
|
||||
func (u *util) Network() ([]NetworkInfo, error) {
|
||||
netio, err := net.IOCounters(true)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
info := []NetworkInfo{}
|
||||
|
||||
for _, io := range netio {
|
||||
info = append(info, NetworkInfo{
|
||||
Name: io.Name,
|
||||
BytesSent: io.BytesSent,
|
||||
BytesRecv: io.BytesRecv,
|
||||
})
|
||||
}
|
||||
|
||||
return info, nil
|
||||
}
|
||||
|
||||
func NetIOCounters(pernic bool) ([]net.IOCountersStat, error) {
|
||||
return DefaultUtil.NetIOCounters(pernic)
|
||||
func Network() ([]NetworkInfo, error) {
|
||||
return DefaultUtil.Network()
|
||||
}
|
||||
|
||||
func (u *util) readFile(path string) ([]string, error) {
|
||||
@ -613,29 +652,28 @@ func cpuTotal(c *cpu.TimesStat) float64 {
|
||||
c.Softirq + c.Steal + c.Guest + c.GuestNice
|
||||
}
|
||||
|
||||
func (u *util) GPUStats() ([]GPUInfoStat, error) {
|
||||
func (u *util) GPU() ([]GPUInfo, error) {
|
||||
nvstats, err := nvidia.Default.Stats()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
stats := []GPUInfoStat{}
|
||||
stats := []GPUInfo{}
|
||||
|
||||
for _, nv := range nvstats {
|
||||
stats = append(stats, GPUInfoStat{
|
||||
Name: nv.Name,
|
||||
MemoryTotal: nv.MemoryTotal,
|
||||
MemoryUsed: nv.MemoryUsed,
|
||||
Usage: nv.Usage,
|
||||
MemoryUsage: nv.MemoryUsage,
|
||||
EncoderUsage: nv.EncoderUsage,
|
||||
DecoderUsage: nv.DecoderUsage,
|
||||
stats = append(stats, GPUInfo{
|
||||
Name: nv.Name,
|
||||
MemoryTotal: nv.MemoryTotal,
|
||||
MemoryUsed: nv.MemoryUsed,
|
||||
Usage: nv.Usage,
|
||||
Encoder: nv.Encoder,
|
||||
Decoder: nv.Decoder,
|
||||
})
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func GPUStats() ([]GPUInfoStat, error) {
|
||||
return DefaultUtil.GPUStats()
|
||||
func GPU() ([]GPUInfo, error) {
|
||||
return DefaultUtil.GPU()
|
||||
}
|
||||
|
||||
@ -9,11 +9,13 @@ import (
|
||||
|
||||
"github.com/datarhei/core/v16/log"
|
||||
"github.com/datarhei/core/v16/psutil"
|
||||
"github.com/datarhei/core/v16/slices"
|
||||
)
|
||||
|
||||
// Info is the value returned by Resources.Info. It groups the current
// resource usage into a memory, a CPU and a GPU section.
type Info struct {
|
||||
Mem MemoryInfo
|
||||
CPU CPUInfo
|
||||
GPU GPUInfo
|
||||
}
|
||||
|
||||
type MemoryInfo struct {
|
||||
@ -38,6 +40,44 @@ type CPUInfo struct {
|
||||
Error error
|
||||
}
|
||||
|
||||
// GPUInfo is the GPU section of Info: the number of GPUs and one
// GPUInfoStat per GPU.
// NOTE(review): Error presumably carries a failure to collect the GPU
// figures — confirm in the Info implementation.
type GPUInfo struct {
|
||||
NGPU float64 // number of gpus
|
||||
GPU []GPUInfoStat
|
||||
Error error
|
||||
}
|
||||
|
||||
// GPUInfoStat describes a single GPU inside GPUInfo: its memory figures in
// bytes, its usage figures in percent (0-100), and a throttling flag.
// NOTE(review): MemoryLimit and UsageLimit are presumably derived from the
// configured MaxGPUMemory and MaxGPU values — confirm in the Info
// implementation.
type GPUInfoStat struct {
|
||||
Index int
|
||||
Name string
|
||||
|
||||
// Memory
|
||||
MemoryTotal uint64 // bytes
|
||||
MemoryUsed uint64 // bytes
|
||||
MemoryAvailable uint64 // bytes
|
||||
MemoryLimit uint64 // bytes
|
||||
|
||||
// GPU
|
||||
Usage float64 // percent 0-100
|
||||
Encoder float64 // percent 0-100
|
||||
Decoder float64 // percent 0-100
|
||||
UsageLimit float64 // percent 0-100
|
||||
|
||||
Throttling bool
|
||||
}
|
||||
|
||||
// Request describes the resources a caller asks for via Resources.Request.
// Resources.Request rejects a request whose CPU is <= 0 or whose Memory
// is 0.
type Request struct {
|
||||
CPU float64 // percent 0-100*ncpu
|
||||
Memory uint64 // bytes
|
||||
GPUUsage float64 // percent 0-100
|
||||
GPUEncoder float64 // percent 0-100
|
||||
GPUDecoder float64 // percent 0-100
|
||||
GPUMemory uint64 // bytes
|
||||
}
|
||||
|
||||
// Response is the result of Resources.Request. Request initializes GPU to
// -1 before it evaluates the request.
// NOTE(review): a non-negative GPU presumably identifies the GPU selected
// for the request — confirm in the Request implementation.
type Response struct {
|
||||
GPU int // GPU number, hwdevice
|
||||
}
|
||||
|
||||
type resources struct {
|
||||
psutil psutil.Util
|
||||
|
||||
@ -45,9 +85,14 @@ type resources struct {
|
||||
maxCPU float64 // percent 0-100*ncpu
|
||||
maxMemory uint64 // bytes
|
||||
|
||||
ngpu int
|
||||
maxGPU float64 // general usage, percent 0-100
|
||||
maxGPUMemory float64 // memory usage, percent 0-100
|
||||
|
||||
isUnlimited bool
|
||||
isCPULimiting bool
|
||||
isMemoryLimiting bool
|
||||
isGPULimiting []bool
|
||||
|
||||
self psutil.Process
|
||||
|
||||
@ -67,30 +112,46 @@ type Resources interface {
|
||||
// HasLimits returns whether any limits have been set.
|
||||
HasLimits() bool
|
||||
|
||||
// Limits returns the CPU (percent 0-100) and memory (bytes) limits.
|
||||
Limits() (float64, uint64)
|
||||
// Limits returns the CPU (percent 0-100), memory (bytes) limits, and GPU limits (usage and memory each in percent 0-100).
|
||||
Limits() (float64, uint64, float64, float64)
|
||||
|
||||
// ShouldLimit returns whether cpu and/or memory is currently limited.
|
||||
ShouldLimit() (bool, bool)
|
||||
// ShouldLimit returns whether cpu, memory, and/or GPU is currently limited.
|
||||
ShouldLimit() (bool, bool, []bool)
|
||||
|
||||
// Request checks whether the requested resources are available.
|
||||
Request(cpu float64, memory uint64) error
|
||||
Request(req Request) (Response, error)
|
||||
|
||||
// Info returns the current resource usage
|
||||
// Info returns the current resource usage.
|
||||
Info() Info
|
||||
}
|
||||
|
||||
type Config struct {
|
||||
MaxCPU float64 // percent 0-100
|
||||
MaxMemory float64 // percent 0-100
|
||||
PSUtil psutil.Util
|
||||
Logger log.Logger
|
||||
MaxCPU float64 // percent 0-100
|
||||
MaxMemory float64 // percent 0-100
|
||||
MaxGPU float64 // general,encoder,decoder usage, percent 0-100
|
||||
MaxGPUMemory float64 // memory usage, percent 0-100
|
||||
PSUtil psutil.Util
|
||||
Logger log.Logger
|
||||
}
|
||||
|
||||
func New(config Config) (Resources, error) {
|
||||
if config.PSUtil == nil {
|
||||
config.PSUtil = psutil.DefaultUtil
|
||||
}
|
||||
|
||||
gpu, err := config.PSUtil.GPU()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to determine number of GPUs: %w", err)
|
||||
}
|
||||
|
||||
if len(gpu) == 0 {
|
||||
config.MaxGPU = 0
|
||||
config.MaxGPUMemory = 0
|
||||
}
|
||||
|
||||
isUnlimited := false
|
||||
|
||||
if config.MaxCPU <= 0 && config.MaxMemory <= 0 {
|
||||
if config.MaxCPU <= 0 && config.MaxMemory <= 0 && config.MaxGPU <= 0 && config.MaxGPUMemory <= 0 {
|
||||
isUnlimited = true
|
||||
}
|
||||
|
||||
@ -102,31 +163,39 @@ func New(config Config) (Resources, error) {
|
||||
config.MaxMemory = 100
|
||||
}
|
||||
|
||||
if config.MaxCPU > 100 || config.MaxMemory > 100 {
|
||||
return nil, fmt.Errorf("both MaxCPU and MaxMemory must have a range of 0-100")
|
||||
if config.MaxGPU <= 0 {
|
||||
config.MaxGPU = 100
|
||||
}
|
||||
|
||||
if config.MaxGPUMemory <= 0 {
|
||||
config.MaxGPUMemory = 100
|
||||
}
|
||||
|
||||
if config.MaxCPU > 100 || config.MaxMemory > 100 || config.MaxGPU > 100 || config.MaxGPUMemory > 100 {
|
||||
return nil, fmt.Errorf("all Max... values must have a range of 0-100")
|
||||
}
|
||||
|
||||
r := &resources{
|
||||
maxCPU: config.MaxCPU,
|
||||
psutil: config.PSUtil,
|
||||
isUnlimited: isUnlimited,
|
||||
logger: config.Logger,
|
||||
maxCPU: config.MaxCPU,
|
||||
maxGPU: config.MaxGPU,
|
||||
maxGPUMemory: config.MaxGPUMemory,
|
||||
psutil: config.PSUtil,
|
||||
isUnlimited: isUnlimited,
|
||||
ngpu: len(gpu),
|
||||
isGPULimiting: make([]bool, len(gpu)),
|
||||
logger: config.Logger,
|
||||
}
|
||||
|
||||
if r.logger == nil {
|
||||
r.logger = log.New("")
|
||||
}
|
||||
|
||||
if r.psutil == nil {
|
||||
r.psutil = psutil.DefaultUtil
|
||||
}
|
||||
|
||||
vmstat, err := r.psutil.VirtualMemory()
|
||||
vmstat, err := r.psutil.Memory()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to determine available memory: %w", err)
|
||||
}
|
||||
|
||||
ncpu, err := r.psutil.CPUCounts(true)
|
||||
ncpu, err := r.psutil.CPUCounts()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to determine number of logical CPUs: %w", err)
|
||||
}
|
||||
@ -137,12 +206,15 @@ func New(config Config) (Resources, error) {
|
||||
r.maxMemory = uint64(float64(vmstat.Total) * config.MaxMemory / 100)
|
||||
|
||||
r.logger = r.logger.WithFields(log.Fields{
|
||||
"ncpu": r.ncpu,
|
||||
"max_cpu": r.maxCPU,
|
||||
"max_memory": r.maxMemory,
|
||||
"ncpu": r.ncpu,
|
||||
"max_cpu": r.maxCPU,
|
||||
"max_memory": r.maxMemory,
|
||||
"ngpu": len(gpu),
|
||||
"max_gpu": r.maxGPU,
|
||||
"max_gpu_memory": r.maxGPUMemory,
|
||||
})
|
||||
|
||||
r.self, err = psutil.NewProcess(int32(os.Getpid()), false)
|
||||
r.self, err = r.psutil.Process(int32(os.Getpid()))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to create process observer for self: %w", err)
|
||||
}
|
||||
@ -189,7 +261,12 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
cpustat, err := r.psutil.CPUPercent()
|
||||
if r.isUnlimited {
|
||||
// If there aren't any limits imposed, don't do anything
|
||||
continue
|
||||
}
|
||||
|
||||
cpustat, err := r.psutil.CPU()
|
||||
if err != nil {
|
||||
r.logger.Warn().WithError(err).Log("Failed to determine system CPU usage")
|
||||
continue
|
||||
@ -197,12 +274,18 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) {
|
||||
|
||||
cpuload := (cpustat.User + cpustat.System + cpustat.Other) * r.ncpu
|
||||
|
||||
vmstat, err := r.psutil.VirtualMemory()
|
||||
vmstat, err := r.psutil.Memory()
|
||||
if err != nil {
|
||||
r.logger.Warn().WithError(err).Log("Failed to determine system memory usage")
|
||||
continue
|
||||
}
|
||||
|
||||
gpustat, err := r.psutil.GPU()
|
||||
if err != nil {
|
||||
r.logger.Warn().WithError(err).Log("Failed to determine GPU usage")
|
||||
continue
|
||||
}
|
||||
|
||||
r.logger.Debug().WithFields(log.Fields{
|
||||
"cur_cpu": cpuload,
|
||||
"cur_memory": vmstat.Used,
|
||||
@ -210,34 +293,46 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) {
|
||||
|
||||
doCPULimit := false
|
||||
|
||||
if !r.isUnlimited {
|
||||
if !r.isCPULimiting {
|
||||
if cpuload >= r.maxCPU {
|
||||
r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit reached")
|
||||
doCPULimit = true
|
||||
}
|
||||
} else {
|
||||
if !r.isCPULimiting {
|
||||
if cpuload >= r.maxCPU {
|
||||
r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit reached")
|
||||
doCPULimit = true
|
||||
if cpuload < r.maxCPU {
|
||||
r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit released")
|
||||
doCPULimit = false
|
||||
}
|
||||
}
|
||||
} else {
|
||||
doCPULimit = true
|
||||
if cpuload < r.maxCPU {
|
||||
r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit released")
|
||||
doCPULimit = false
|
||||
}
|
||||
}
|
||||
|
||||
doMemoryLimit := false
|
||||
|
||||
if !r.isUnlimited {
|
||||
if !r.isMemoryLimiting {
|
||||
if vmstat.Used >= r.maxMemory {
|
||||
r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit reached")
|
||||
doMemoryLimit = true
|
||||
if !r.isMemoryLimiting {
|
||||
if vmstat.Used >= r.maxMemory {
|
||||
r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit reached")
|
||||
doMemoryLimit = true
|
||||
}
|
||||
} else {
|
||||
doMemoryLimit = true
|
||||
if vmstat.Used < r.maxMemory {
|
||||
r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit released")
|
||||
doMemoryLimit = false
|
||||
}
|
||||
}
|
||||
|
||||
doGPULimit := make([]bool, r.ngpu)
|
||||
|
||||
for i, limiting := range r.isGPULimiting {
|
||||
maxMemory := uint64(r.maxGPUMemory * float64(gpustat[i].MemoryTotal) / 100)
|
||||
if !limiting {
|
||||
if gpustat[i].MemoryUsed >= maxMemory || (gpustat[i].Usage >= r.maxGPU && gpustat[i].Encoder >= r.maxGPU && gpustat[i].Decoder >= r.maxGPU) {
|
||||
doGPULimit[i] = true
|
||||
}
|
||||
} else {
|
||||
doMemoryLimit = true
|
||||
if vmstat.Used < r.maxMemory {
|
||||
r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit released")
|
||||
doMemoryLimit = false
|
||||
doGPULimit[i] = true
|
||||
if gpustat[i].MemoryUsed < maxMemory && (gpustat[i].Usage < r.maxGPU || gpustat[i].Encoder < r.maxGPU || gpustat[i].Decoder < r.maxGPU) {
|
||||
doGPULimit[i] = false
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -247,17 +342,26 @@ func (r *resources) observe(ctx context.Context, interval time.Duration) {
|
||||
r.logger.Warn().WithFields(log.Fields{
|
||||
"enabled": doCPULimit,
|
||||
}).Log("Limiting CPU")
|
||||
|
||||
r.isCPULimiting = doCPULimit
|
||||
}
|
||||
r.isCPULimiting = doCPULimit
|
||||
|
||||
if r.isMemoryLimiting != doMemoryLimit {
|
||||
r.logger.Warn().WithFields(log.Fields{
|
||||
"enabled": doMemoryLimit,
|
||||
}).Log("Limiting memory")
|
||||
|
||||
r.isMemoryLimiting = doMemoryLimit
|
||||
}
|
||||
r.isMemoryLimiting = doMemoryLimit
|
||||
|
||||
for i, limiting := range r.isGPULimiting {
|
||||
if limiting != doGPULimit[i] {
|
||||
r.logger.Warn().WithFields(log.Fields{
|
||||
"enabled": doGPULimit,
|
||||
"index": i,
|
||||
}).Log("Limiting GPU")
|
||||
}
|
||||
}
|
||||
r.isGPULimiting = doGPULimit
|
||||
|
||||
r.lock.Unlock()
|
||||
}
|
||||
}
|
||||
@ -267,60 +371,136 @@ func (r *resources) HasLimits() bool {
|
||||
return !r.isUnlimited
|
||||
}
|
||||
|
||||
func (r *resources) Limits() (float64, uint64) {
|
||||
return r.maxCPU / r.ncpu, r.maxMemory
|
||||
func (r *resources) Limits() (float64, uint64, float64, float64) {
|
||||
return r.maxCPU / r.ncpu, r.maxMemory, r.maxGPU, r.maxGPUMemory
|
||||
}
|
||||
|
||||
func (r *resources) ShouldLimit() (bool, bool) {
|
||||
func (r *resources) ShouldLimit() (bool, bool, []bool) {
|
||||
r.lock.RLock()
|
||||
defer r.lock.RUnlock()
|
||||
|
||||
return r.isCPULimiting, r.isMemoryLimiting
|
||||
return r.isCPULimiting, r.isMemoryLimiting, slices.Copy(r.isGPULimiting)
|
||||
}
|
||||
|
||||
func (r *resources) Request(cpu float64, memory uint64) error {
|
||||
func (r *resources) Request(req Request) (Response, error) {
|
||||
res := Response{
|
||||
GPU: -1,
|
||||
}
|
||||
|
||||
r.lock.RLock()
|
||||
defer r.lock.RUnlock()
|
||||
|
||||
logger := r.logger.WithFields(log.Fields{
|
||||
"req_cpu": cpu,
|
||||
"req_memory": memory,
|
||||
"req_cpu": req.CPU,
|
||||
"req_memory": req.Memory,
|
||||
"req_gpu": req.GPUUsage,
|
||||
"req_gpu_encoder": req.GPUEncoder,
|
||||
"req_gpu_decoder": req.GPUDecoder,
|
||||
"req_gpu_memory": req.GPUMemory,
|
||||
})
|
||||
|
||||
logger.Debug().Log("Request for acquiring resources")
|
||||
|
||||
// Check if anything is currently limiting.
|
||||
if r.isCPULimiting || r.isMemoryLimiting {
|
||||
logger.Debug().Log("Rejected, currently limiting")
|
||||
return fmt.Errorf("resources are currenlty actively limited")
|
||||
return res, fmt.Errorf("resources are currenlty actively limited")
|
||||
}
|
||||
|
||||
if cpu <= 0 || memory == 0 {
|
||||
// Check if the requested resources are valid.
|
||||
if req.CPU <= 0 || req.Memory == 0 {
|
||||
logger.Debug().Log("Rejected, invalid values")
|
||||
return fmt.Errorf("the cpu and/or memory values are invalid: cpu=%f, memory=%d", cpu, memory)
|
||||
return res, fmt.Errorf("the cpu and/or memory values are invalid: cpu=%f, memory=%d", req.CPU, req.Memory)
|
||||
}
|
||||
|
||||
cpustat, err := r.psutil.CPUPercent()
|
||||
// Get current CPU and memory values.
|
||||
cpustat, err := r.psutil.CPU()
|
||||
if err != nil {
|
||||
r.logger.Warn().WithError(err).Log("Failed to determine system CPU usage")
|
||||
return fmt.Errorf("the system CPU usage couldn't be determined")
|
||||
return res, fmt.Errorf("the system CPU usage couldn't be determined")
|
||||
}
|
||||
|
||||
cpuload := (cpustat.User + cpustat.System + cpustat.Other) * r.ncpu
|
||||
|
||||
vmstat, err := r.psutil.VirtualMemory()
|
||||
vmstat, err := r.psutil.Memory()
|
||||
if err != nil {
|
||||
r.logger.Warn().WithError(err).Log("Failed to determine system memory usage")
|
||||
return fmt.Errorf("the system memory usage couldn't be determined")
|
||||
return res, fmt.Errorf("the system memory usage couldn't be determined")
|
||||
}
|
||||
|
||||
if cpuload+cpu > r.maxCPU {
|
||||
// Check if enough resources are available
|
||||
if cpuload+req.CPU > r.maxCPU {
|
||||
logger.Debug().WithField("cur_cpu", cpuload).Log("Rejected, CPU limit exceeded")
|
||||
return fmt.Errorf("the CPU limit would be exceeded: %f + %f > %f", cpuload, cpu, r.maxCPU)
|
||||
return res, fmt.Errorf("the CPU limit would be exceeded: %f + %f > %f", cpuload, req.CPU, r.maxCPU)
|
||||
}
|
||||
|
||||
if vmstat.Used+memory > r.maxMemory {
|
||||
if vmstat.Used+req.Memory > r.maxMemory {
|
||||
logger.Debug().WithField("cur_memory", vmstat.Used).Log("Rejected, memory limit exceeded")
|
||||
return fmt.Errorf("the memory limit would be exceeded: %d + %d > %d", vmstat.Used, memory, r.maxMemory)
|
||||
return res, fmt.Errorf("the memory limit would be exceeded: %d + %d > %d", vmstat.Used, req.Memory, r.maxMemory)
|
||||
}
|
||||
|
||||
// Check if any GPU resources are requested
|
||||
if req.GPUUsage > 0 || req.GPUEncoder > 0 || req.GPUDecoder > 0 || req.GPUMemory > 0 {
|
||||
if req.GPUUsage < 0 || req.GPUEncoder < 0 || req.GPUDecoder < 0 || req.GPUMemory == 0 {
|
||||
logger.Debug().Log("Rejected, invalid values")
|
||||
return res, fmt.Errorf("the gpu usage and memory values are invalid: usage=%f, encoder=%f, decoder=%f, memory=%d", req.GPUUsage, req.GPUEncoder, req.GPUDecoder, req.GPUMemory)
|
||||
}
|
||||
|
||||
// Get current GPU values
|
||||
gpustat, err := r.psutil.GPU()
|
||||
if err != nil {
|
||||
r.logger.Warn().WithError(err).Log("Failed to determine GPU usage")
|
||||
return res, fmt.Errorf("the GPU usage couldn't be determined")
|
||||
}
|
||||
|
||||
if len(gpustat) == 0 {
|
||||
r.logger.Debug().WithError(err).Log("GPU resources requested but no GPU available")
|
||||
return res, fmt.Errorf("some GPU resources requested but no GPU available")
|
||||
}
|
||||
|
||||
foundGPU := -1
|
||||
for _, g := range gpustat {
|
||||
if req.GPUUsage > 0 && g.Usage+req.GPUUsage > r.maxGPU {
|
||||
logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu": g.Usage}).Log("Rejected, GPU usage limit exceeded")
|
||||
continue
|
||||
}
|
||||
|
||||
if req.GPUEncoder > 0 && g.Encoder+req.GPUEncoder > r.maxGPU {
|
||||
logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu_encoder": g.Usage}).Log("Rejected, GPU encoder usage limit exceeded")
|
||||
continue
|
||||
}
|
||||
|
||||
if req.GPUDecoder > 0 && g.Decoder+req.GPUDecoder > r.maxGPU {
|
||||
logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu_decoder": g.Usage}).Log("Rejected, GPU decoder usage limit exceeded")
|
||||
continue
|
||||
}
|
||||
|
||||
gpuMemoryUsage := float64(g.MemoryUsed) / float64(g.MemoryTotal) * 100
|
||||
requestedGPUMemoryUsage := float64(req.GPUMemory) / float64(g.MemoryTotal) * 100
|
||||
|
||||
if gpuMemoryUsage+requestedGPUMemoryUsage > r.maxGPUMemory {
|
||||
logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu_memory": gpuMemoryUsage}).Log("Rejected, GPU memory usage limit exceeded")
|
||||
continue
|
||||
}
|
||||
|
||||
foundGPU = g.Index
|
||||
|
||||
logger = logger.Debug().WithFields(log.Fields{
|
||||
"cur_gpu": foundGPU,
|
||||
"cur_gpu_general": g.Usage,
|
||||
"cur_gpu_encoder": g.Encoder,
|
||||
"cur_gpu_decoder": g.Decoder,
|
||||
"cur_gpu_memory": gpuMemoryUsage,
|
||||
})
|
||||
|
||||
break
|
||||
}
|
||||
|
||||
if foundGPU < 0 {
|
||||
return res, fmt.Errorf("all GPU usage limits are exceeded")
|
||||
}
|
||||
|
||||
res.GPU = foundGPU
|
||||
}
|
||||
|
||||
logger.Debug().WithFields(log.Fields{
|
||||
@ -328,17 +508,18 @@ func (r *resources) Request(cpu float64, memory uint64) error {
|
||||
"cur_memory": vmstat.Used,
|
||||
}).Log("Acquiring approved")
|
||||
|
||||
return nil
|
||||
return res, nil
|
||||
}
|
||||
|
||||
func (r *resources) Info() Info {
|
||||
cpulimit, memlimit := r.Limits()
|
||||
cputhrottling, memthrottling := r.ShouldLimit()
|
||||
cpulimit, memlimit, gpulimit, gpumemlimit := r.Limits()
|
||||
cputhrottling, memthrottling, gputhrottling := r.ShouldLimit()
|
||||
|
||||
cpustat, cpuerr := r.psutil.CPUPercent()
|
||||
memstat, memerr := r.psutil.VirtualMemory()
|
||||
selfcpu, _ := r.self.CPUPercent()
|
||||
selfmem, _ := r.self.VirtualMemory()
|
||||
cpustat, cpuerr := r.psutil.CPU()
|
||||
memstat, memerr := r.psutil.Memory()
|
||||
gpustat, gpuerr := r.psutil.GPU()
|
||||
selfcpu, _ := r.self.CPU()
|
||||
selfmem, _ := r.self.Memory()
|
||||
|
||||
cpuinfo := CPUInfo{
|
||||
NCPU: r.ncpu,
|
||||
@ -362,9 +543,31 @@ func (r *resources) Info() Info {
|
||||
Error: memerr,
|
||||
}
|
||||
|
||||
gpuinfo := GPUInfo{
|
||||
NGPU: float64(len(gpustat)),
|
||||
Error: gpuerr,
|
||||
}
|
||||
|
||||
for i, g := range gpustat {
|
||||
gpuinfo.GPU = append(gpuinfo.GPU, GPUInfoStat{
|
||||
Index: g.Index,
|
||||
Name: g.Name,
|
||||
MemoryTotal: g.MemoryTotal,
|
||||
MemoryUsed: g.MemoryUsed,
|
||||
MemoryAvailable: g.MemoryTotal - g.MemoryUsed,
|
||||
MemoryLimit: uint64(float64(g.MemoryTotal) * gpumemlimit / 100),
|
||||
Usage: g.Usage,
|
||||
Encoder: g.Encoder,
|
||||
Decoder: g.Decoder,
|
||||
UsageLimit: gpulimit,
|
||||
Throttling: gputhrottling[i],
|
||||
})
|
||||
}
|
||||
|
||||
i := Info{
|
||||
CPU: cpuinfo,
|
||||
Mem: meminfo,
|
||||
GPU: gpuinfo,
|
||||
}
|
||||
|
||||
return i
|
||||
|
||||
@ -1,68 +1,170 @@
|
||||
package resources
|
||||
|
||||
import (
|
||||
"slices"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/datarhei/core/v16/psutil"
|
||||
|
||||
"github.com/shirou/gopsutil/v3/disk"
|
||||
"github.com/shirou/gopsutil/v3/net"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
type util struct{}
|
||||
type util struct {
|
||||
lock sync.Mutex
|
||||
|
||||
cpu psutil.CPUInfo
|
||||
mem psutil.MemoryInfo
|
||||
gpu []psutil.GPUInfo
|
||||
}
|
||||
|
||||
func newUtil(ngpu int) *util {
|
||||
u := &util{
|
||||
cpu: psutil.CPUInfo{
|
||||
System: 10,
|
||||
User: 50,
|
||||
Idle: 35,
|
||||
Other: 5,
|
||||
},
|
||||
mem: psutil.MemoryInfo{
|
||||
Total: 200,
|
||||
Available: 40,
|
||||
Used: 160,
|
||||
},
|
||||
}
|
||||
|
||||
for i := 0; i < ngpu; i++ {
|
||||
u.gpu = append(u.gpu, psutil.GPUInfo{
|
||||
Index: i,
|
||||
Name: "L4",
|
||||
MemoryTotal: 24 * 1024 * 1024 * 1024,
|
||||
MemoryUsed: uint64(12+i) * 1024 * 1024 * 1024,
|
||||
Usage: 50 - float64((i+1)*5),
|
||||
Encoder: 50 - float64((i+1)*10),
|
||||
Decoder: 50 - float64((i+1)*3),
|
||||
})
|
||||
}
|
||||
|
||||
return u
|
||||
}
|
||||
|
||||
func (u *util) Start() {}
|
||||
func (u *util) Stop() {}
|
||||
|
||||
func (u *util) CPUCounts(logical bool) (float64, error) {
|
||||
func (u *util) CPUCounts() (float64, error) {
|
||||
return 2, nil
|
||||
}
|
||||
|
||||
func (u *util) GPUCounts() (float64, error) {
|
||||
return 0, nil
|
||||
func (u *util) CPU() (*psutil.CPUInfo, error) {
|
||||
u.lock.Lock()
|
||||
defer u.lock.Unlock()
|
||||
|
||||
cpu := u.cpu
|
||||
|
||||
return &cpu, nil
|
||||
}
|
||||
|
||||
func (u *util) CPUPercent() (*psutil.CPUInfoStat, error) {
|
||||
return &psutil.CPUInfoStat{
|
||||
System: 10,
|
||||
User: 50,
|
||||
Idle: 35,
|
||||
Other: 5,
|
||||
}, nil
|
||||
func (u *util) Disk(path string) (*psutil.DiskInfo, error) {
|
||||
return &psutil.DiskInfo{}, nil
|
||||
}
|
||||
|
||||
func (u *util) DiskUsage(path string) (*disk.UsageStat, error) {
|
||||
return &disk.UsageStat{}, nil
|
||||
func (u *util) Memory() (*psutil.MemoryInfo, error) {
|
||||
u.lock.Lock()
|
||||
defer u.lock.Unlock()
|
||||
|
||||
mem := u.mem
|
||||
|
||||
return &mem, nil
|
||||
}
|
||||
|
||||
func (u *util) VirtualMemory() (*psutil.MemoryInfoStat, error) {
|
||||
return &psutil.MemoryInfoStat{
|
||||
Total: 200,
|
||||
Available: 40,
|
||||
Used: 160,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (u *util) NetIOCounters(pernic bool) ([]net.IOCountersStat, error) {
|
||||
func (u *util) Network() ([]psutil.NetworkInfo, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (u *util) GPUStats() ([]psutil.GPUInfoStat, error) {
|
||||
return nil, nil
|
||||
func (u *util) GPU() ([]psutil.GPUInfo, error) {
|
||||
u.lock.Lock()
|
||||
defer u.lock.Unlock()
|
||||
|
||||
gpu := []psutil.GPUInfo{}
|
||||
|
||||
gpu = append(gpu, u.gpu...)
|
||||
|
||||
return gpu, nil
|
||||
}
|
||||
|
||||
func (u *util) Process(pid int32) (psutil.Process, error) {
|
||||
return nil, nil
|
||||
return &process{}, nil
|
||||
}
|
||||
|
||||
type process struct{}
|
||||
|
||||
func (p *process) CPU() (*psutil.CPUInfo, error) {
|
||||
s := &psutil.CPUInfo{
|
||||
System: 1,
|
||||
User: 2,
|
||||
Idle: 0,
|
||||
Other: 3,
|
||||
}
|
||||
|
||||
return s, nil
|
||||
}
|
||||
|
||||
func (p *process) Memory() (uint64, error) { return 42, nil }
|
||||
func (p *process) GPU() (*psutil.GPUInfo, error) {
|
||||
return &psutil.GPUInfo{
|
||||
Index: 0,
|
||||
Name: "L4",
|
||||
MemoryTotal: 128,
|
||||
MemoryUsed: 42,
|
||||
Usage: 5,
|
||||
Encoder: 9,
|
||||
Decoder: 7,
|
||||
}, nil
|
||||
}
|
||||
func (p *process) Stop() {}
|
||||
func (p *process) Suspend() error { return nil }
|
||||
func (p *process) Resume() error { return nil }
|
||||
|
||||
func TestConfigNoLimits(t *testing.T) {
|
||||
_, err := New(Config{
|
||||
PSUtil: newUtil(0),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
func TestConfigWrongLimits(t *testing.T) {
|
||||
_, err := New(Config{
|
||||
MaxCPU: 102,
|
||||
MaxMemory: 573,
|
||||
PSUtil: newUtil(0),
|
||||
})
|
||||
require.Error(t, err)
|
||||
|
||||
_, err = New(Config{
|
||||
MaxCPU: 0,
|
||||
MaxMemory: 0,
|
||||
MaxGPU: 101,
|
||||
MaxGPUMemory: 103,
|
||||
PSUtil: newUtil(0),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = New(Config{
|
||||
MaxCPU: 0,
|
||||
MaxMemory: 0,
|
||||
MaxGPU: 101,
|
||||
MaxGPUMemory: 103,
|
||||
PSUtil: newUtil(1),
|
||||
})
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestMemoryLimit(t *testing.T) {
|
||||
r, err := New(Config{
|
||||
MaxCPU: 100,
|
||||
MaxMemory: 150. / 200. * 100,
|
||||
PSUtil: &util{},
|
||||
PSUtil: newUtil(0),
|
||||
Logger: nil,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
@ -86,7 +188,7 @@ func TestMemoryLimit(t *testing.T) {
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
_, limit = r.ShouldLimit()
|
||||
_, limit, _ = r.ShouldLimit()
|
||||
if limit {
|
||||
return
|
||||
}
|
||||
@ -102,6 +204,95 @@ func TestMemoryLimit(t *testing.T) {
|
||||
|
||||
require.True(t, limit)
|
||||
|
||||
_, err = r.Request(Request{CPU: 5, Memory: 10})
|
||||
require.Error(t, err)
|
||||
|
||||
r.Stop()
|
||||
}
|
||||
|
||||
func TestMemoryUnlimit(t *testing.T) {
|
||||
util := newUtil(0)
|
||||
|
||||
r, err := New(Config{
|
||||
MaxCPU: 100,
|
||||
MaxMemory: 150. / 200. * 100,
|
||||
PSUtil: util,
|
||||
Logger: nil,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
|
||||
limit := false
|
||||
|
||||
go func() {
|
||||
defer func() {
|
||||
wg.Done()
|
||||
}()
|
||||
|
||||
timer := time.NewTimer(10 * time.Second)
|
||||
defer timer.Stop()
|
||||
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
_, limit, _ = r.ShouldLimit()
|
||||
if limit {
|
||||
return
|
||||
}
|
||||
case <-timer.C:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
r.Start()
|
||||
|
||||
wg.Wait()
|
||||
|
||||
require.True(t, limit)
|
||||
|
||||
_, limit, _ = r.ShouldLimit()
|
||||
require.True(t, limit)
|
||||
|
||||
util.lock.Lock()
|
||||
util.mem.Used = 140
|
||||
util.lock.Unlock()
|
||||
|
||||
wg.Add(1)
|
||||
|
||||
go func() {
|
||||
defer func() {
|
||||
wg.Done()
|
||||
}()
|
||||
|
||||
timer := time.NewTimer(10 * time.Second)
|
||||
defer timer.Stop()
|
||||
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
_, limit, _ = r.ShouldLimit()
|
||||
if !limit {
|
||||
return
|
||||
}
|
||||
case <-timer.C:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
wg.Wait()
|
||||
|
||||
require.False(t, limit)
|
||||
|
||||
r.Stop()
|
||||
}
|
||||
|
||||
@ -109,7 +300,7 @@ func TestCPULimit(t *testing.T) {
|
||||
r, err := New(Config{
|
||||
MaxCPU: 50.,
|
||||
MaxMemory: 100,
|
||||
PSUtil: &util{},
|
||||
PSUtil: newUtil(0),
|
||||
Logger: nil,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
@ -133,7 +324,7 @@ func TestCPULimit(t *testing.T) {
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
limit, _ = r.ShouldLimit()
|
||||
limit, _, _ = r.ShouldLimit()
|
||||
if limit {
|
||||
return
|
||||
}
|
||||
@ -149,36 +340,541 @@ func TestCPULimit(t *testing.T) {
|
||||
|
||||
require.True(t, limit)
|
||||
|
||||
_, err = r.Request(Request{CPU: 5, Memory: 10})
|
||||
require.Error(t, err)
|
||||
|
||||
r.Stop()
|
||||
}
|
||||
|
||||
func TestRequest(t *testing.T) {
|
||||
func TestCPUUnlimit(t *testing.T) {
|
||||
util := newUtil(0)
|
||||
|
||||
r, err := New(Config{
|
||||
MaxCPU: 70.,
|
||||
MaxMemory: 170. / 200. * 100,
|
||||
PSUtil: &util{},
|
||||
MaxCPU: 50.,
|
||||
MaxMemory: 100,
|
||||
PSUtil: util,
|
||||
Logger: nil,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
err = r.Request(-1, 0)
|
||||
require.Error(t, err)
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
|
||||
err = r.Request(5, 10)
|
||||
limit := false
|
||||
|
||||
go func() {
|
||||
defer func() {
|
||||
wg.Done()
|
||||
}()
|
||||
|
||||
timer := time.NewTimer(10 * time.Second)
|
||||
defer timer.Stop()
|
||||
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
limit, _, _ = r.ShouldLimit()
|
||||
if limit {
|
||||
return
|
||||
}
|
||||
case <-timer.C:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
r.Start()
|
||||
|
||||
wg.Wait()
|
||||
|
||||
require.True(t, limit)
|
||||
|
||||
limit, _, _ = r.ShouldLimit()
|
||||
require.True(t, limit)
|
||||
|
||||
util.lock.Lock()
|
||||
util.cpu.User = 20
|
||||
util.lock.Unlock()
|
||||
|
||||
wg.Add(1)
|
||||
|
||||
go func() {
|
||||
defer func() {
|
||||
wg.Done()
|
||||
}()
|
||||
|
||||
timer := time.NewTimer(10 * time.Second)
|
||||
defer timer.Stop()
|
||||
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
limit, _, _ = r.ShouldLimit()
|
||||
if !limit {
|
||||
return
|
||||
}
|
||||
case <-timer.C:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
wg.Wait()
|
||||
|
||||
require.False(t, limit)
|
||||
|
||||
r.Stop()
|
||||
}
|
||||
|
||||
func TestGPULimitMemory(t *testing.T) {
|
||||
r, err := New(Config{
|
||||
MaxCPU: 100,
|
||||
MaxMemory: 100,
|
||||
MaxGPU: 100,
|
||||
MaxGPUMemory: 20,
|
||||
PSUtil: newUtil(2),
|
||||
Logger: nil,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
err = r.Request(5, 20)
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
|
||||
limit := []bool{}
|
||||
|
||||
go func() {
|
||||
defer func() {
|
||||
wg.Done()
|
||||
}()
|
||||
|
||||
timer := time.NewTimer(10 * time.Second)
|
||||
defer timer.Stop()
|
||||
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
_, _, limit = r.ShouldLimit()
|
||||
if slices.Contains(limit, true) {
|
||||
return
|
||||
}
|
||||
case <-timer.C:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
r.Start()
|
||||
|
||||
wg.Wait()
|
||||
|
||||
require.Contains(t, limit, true)
|
||||
|
||||
_, err = r.Request(Request{CPU: 5, Memory: 10, GPUUsage: 10, GPUMemory: 10})
|
||||
require.Error(t, err)
|
||||
|
||||
err = r.Request(10, 10)
|
||||
r.Stop()
|
||||
}
|
||||
|
||||
func TestGPUUnlimitMemory(t *testing.T) {
|
||||
util := newUtil(2)
|
||||
|
||||
r, err := New(Config{
|
||||
MaxCPU: 100,
|
||||
MaxMemory: 100,
|
||||
MaxGPU: 100,
|
||||
MaxGPUMemory: 20,
|
||||
PSUtil: util,
|
||||
Logger: nil,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
|
||||
limit := []bool{}
|
||||
|
||||
go func() {
|
||||
defer func() {
|
||||
wg.Done()
|
||||
}()
|
||||
|
||||
timer := time.NewTimer(10 * time.Second)
|
||||
defer timer.Stop()
|
||||
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
_, _, limit = r.ShouldLimit()
|
||||
if slices.Contains(limit, true) {
|
||||
return
|
||||
}
|
||||
case <-timer.C:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
r.Start()
|
||||
|
||||
wg.Wait()
|
||||
|
||||
require.Contains(t, limit, true)
|
||||
|
||||
util.lock.Lock()
|
||||
util.gpu[0].MemoryUsed = 10
|
||||
util.gpu[1].MemoryUsed = 10
|
||||
util.lock.Unlock()
|
||||
|
||||
wg.Add(1)
|
||||
|
||||
go func() {
|
||||
defer func() {
|
||||
wg.Done()
|
||||
}()
|
||||
|
||||
timer := time.NewTimer(10 * time.Second)
|
||||
defer timer.Stop()
|
||||
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
_, _, limit = r.ShouldLimit()
|
||||
if !slices.Contains(limit, true) {
|
||||
return
|
||||
}
|
||||
case <-timer.C:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
wg.Wait()
|
||||
|
||||
require.NotContains(t, limit, true)
|
||||
|
||||
r.Stop()
|
||||
}
|
||||
|
||||
func TestGPULimitMemorySome(t *testing.T) {
|
||||
r, err := New(Config{
|
||||
MaxCPU: 100,
|
||||
MaxMemory: 100,
|
||||
MaxGPU: 100,
|
||||
MaxGPUMemory: 14. / 24. * 100.,
|
||||
PSUtil: newUtil(4),
|
||||
Logger: nil,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
|
||||
limit := []bool{}
|
||||
|
||||
go func() {
|
||||
defer func() {
|
||||
wg.Done()
|
||||
}()
|
||||
|
||||
timer := time.NewTimer(10 * time.Second)
|
||||
defer timer.Stop()
|
||||
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
_, _, limit = r.ShouldLimit()
|
||||
if slices.Contains(limit, true) {
|
||||
return
|
||||
}
|
||||
case <-timer.C:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
r.Start()
|
||||
|
||||
wg.Wait()
|
||||
|
||||
require.Equal(t, []bool{false, false, true, true}, limit)
|
||||
|
||||
_, err = r.Request(Request{CPU: 5, Memory: 10, GPUUsage: 10, GPUMemory: 10})
|
||||
require.NoError(t, err)
|
||||
|
||||
r.Stop()
|
||||
}
|
||||
|
||||
func TestGPULimitUsage(t *testing.T) {
|
||||
r, err := New(Config{
|
||||
MaxCPU: 100,
|
||||
MaxMemory: 100,
|
||||
MaxGPU: 40,
|
||||
MaxGPUMemory: 100,
|
||||
PSUtil: newUtil(3),
|
||||
Logger: nil,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
|
||||
limit := []bool{}
|
||||
|
||||
go func() {
|
||||
defer func() {
|
||||
wg.Done()
|
||||
}()
|
||||
|
||||
timer := time.NewTimer(10 * time.Second)
|
||||
defer timer.Stop()
|
||||
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
_, _, limit = r.ShouldLimit()
|
||||
if slices.Contains(limit, true) {
|
||||
return
|
||||
}
|
||||
case <-timer.C:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
r.Start()
|
||||
|
||||
wg.Wait()
|
||||
|
||||
require.Equal(t, []bool{true, false, false}, limit)
|
||||
|
||||
_, err = r.Request(Request{CPU: 5, Memory: 10, GPUUsage: 10, GPUMemory: 10})
|
||||
require.Error(t, err)
|
||||
|
||||
_, err = r.Request(Request{CPU: 5, Memory: 10, GPUEncoder: 10, GPUMemory: 10})
|
||||
require.NoError(t, err)
|
||||
|
||||
r.Stop()
|
||||
}
|
||||
|
||||
func TestGPUUnlimitUsage(t *testing.T) {
|
||||
util := newUtil(3)
|
||||
|
||||
r, err := New(Config{
|
||||
MaxCPU: 100,
|
||||
MaxMemory: 100,
|
||||
MaxGPU: 40,
|
||||
MaxGPUMemory: 100,
|
||||
PSUtil: util,
|
||||
Logger: nil,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
|
||||
limit := []bool{}
|
||||
|
||||
go func() {
|
||||
defer func() {
|
||||
wg.Done()
|
||||
}()
|
||||
|
||||
timer := time.NewTimer(10 * time.Second)
|
||||
defer timer.Stop()
|
||||
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
_, _, limit = r.ShouldLimit()
|
||||
if slices.Contains(limit, true) {
|
||||
return
|
||||
}
|
||||
case <-timer.C:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
r.Start()
|
||||
|
||||
wg.Wait()
|
||||
|
||||
require.Equal(t, []bool{true, false, false}, limit)
|
||||
|
||||
util.lock.Lock()
|
||||
util.gpu[0].Usage = 30
|
||||
util.gpu[0].Encoder = 30
|
||||
util.gpu[0].Decoder = 30
|
||||
util.lock.Unlock()
|
||||
|
||||
wg.Add(1)
|
||||
|
||||
go func() {
|
||||
defer func() {
|
||||
wg.Done()
|
||||
}()
|
||||
|
||||
timer := time.NewTimer(10 * time.Second)
|
||||
defer timer.Stop()
|
||||
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
_, _, limit = r.ShouldLimit()
|
||||
if !slices.Contains(limit, true) {
|
||||
return
|
||||
}
|
||||
case <-timer.C:
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
wg.Wait()
|
||||
|
||||
require.Equal(t, []bool{false, false, false}, limit)
|
||||
|
||||
r.Stop()
|
||||
}
|
||||
|
||||
func TestRequestCPU(t *testing.T) {
|
||||
r, err := New(Config{
|
||||
MaxCPU: 70.,
|
||||
PSUtil: newUtil(0),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = r.Request(Request{CPU: 0, Memory: 0})
|
||||
require.Error(t, err)
|
||||
|
||||
_, err = r.Request(Request{CPU: 5, Memory: 10})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = r.Request(Request{CPU: 30, Memory: 10})
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestRequestMemory(t *testing.T) {
|
||||
r, err := New(Config{
|
||||
MaxMemory: 170. / 200. * 100,
|
||||
PSUtil: newUtil(0),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = r.Request(Request{CPU: 5, Memory: 0})
|
||||
require.Error(t, err)
|
||||
|
||||
_, err = r.Request(Request{CPU: 5, Memory: 10})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = r.Request(Request{CPU: 50, Memory: 20})
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestRequestNoGPU(t *testing.T) {
|
||||
r, err := New(Config{
|
||||
MaxCPU: 100,
|
||||
MaxMemory: 100,
|
||||
PSUtil: newUtil(0),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 30, GPUMemory: 10})
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestRequestInvalidGPURequest(t *testing.T) {
|
||||
r, err := New(Config{
|
||||
MaxCPU: 100,
|
||||
MaxMemory: 100,
|
||||
PSUtil: newUtil(1),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 30, GPUMemory: 0})
|
||||
require.Error(t, err)
|
||||
|
||||
_, err = r.Request(Request{CPU: 10, Memory: 10, GPUUsage: -1, GPUEncoder: 30, GPUMemory: 0})
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestRequestGPULimitsOneGPU(t *testing.T) {
|
||||
r, err := New(Config{
|
||||
MaxCPU: 100,
|
||||
MaxMemory: 100,
|
||||
MaxGPU: 50,
|
||||
MaxGPUMemory: 60,
|
||||
PSUtil: newUtil(1),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = r.Request(Request{CPU: 10, Memory: 10, GPUUsage: 50, GPUMemory: 10})
|
||||
require.Error(t, err)
|
||||
|
||||
_, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 50, GPUMemory: 10})
|
||||
require.Error(t, err)
|
||||
|
||||
_, err = r.Request(Request{CPU: 10, Memory: 10, GPUDecoder: 50, GPUMemory: 10})
|
||||
require.Error(t, err)
|
||||
|
||||
_, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 10, GPUMemory: 5 * 1024 * 1024 * 1024})
|
||||
require.Error(t, err)
|
||||
|
||||
res, err := r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 10, GPUMemory: 10})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, res.GPU)
|
||||
}
|
||||
|
||||
func TestRequestGPULimitsMoreGPU(t *testing.T) {
|
||||
r, err := New(Config{
|
||||
MaxCPU: 100,
|
||||
MaxMemory: 100,
|
||||
MaxGPU: 60,
|
||||
MaxGPUMemory: 60,
|
||||
PSUtil: newUtil(2),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 50, GPUMemory: 10})
|
||||
require.Error(t, err)
|
||||
|
||||
res, err := r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 30, GPUMemory: 10})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, res.GPU)
|
||||
}
|
||||
|
||||
func TestHasLimits(t *testing.T) {
|
||||
r, err := New(Config{
|
||||
MaxCPU: 70.,
|
||||
MaxMemory: 170. / 200. * 100,
|
||||
PSUtil: &util{},
|
||||
PSUtil: newUtil(0),
|
||||
Logger: nil,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
@ -188,7 +884,7 @@ func TestHasLimits(t *testing.T) {
|
||||
r, err = New(Config{
|
||||
MaxCPU: 100,
|
||||
MaxMemory: 100,
|
||||
PSUtil: &util{},
|
||||
PSUtil: newUtil(0),
|
||||
Logger: nil,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
@ -198,10 +894,95 @@ func TestHasLimits(t *testing.T) {
|
||||
r, err = New(Config{
|
||||
MaxCPU: 0,
|
||||
MaxMemory: 0,
|
||||
PSUtil: &util{},
|
||||
PSUtil: newUtil(0),
|
||||
Logger: nil,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
require.False(t, r.HasLimits())
|
||||
|
||||
r, err = New(Config{
|
||||
MaxCPU: 0,
|
||||
MaxMemory: 0,
|
||||
MaxGPU: 10,
|
||||
PSUtil: newUtil(1),
|
||||
Logger: nil,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
require.True(t, r.HasLimits())
|
||||
|
||||
r, err = New(Config{
|
||||
MaxCPU: 0,
|
||||
MaxMemory: 0,
|
||||
MaxGPU: 10,
|
||||
PSUtil: newUtil(0),
|
||||
Logger: nil,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
require.False(t, r.HasLimits())
|
||||
}
|
||||
|
||||
func TestInfo(t *testing.T) {
|
||||
r, err := New(Config{
|
||||
MaxCPU: 90,
|
||||
MaxMemory: 90,
|
||||
MaxGPU: 11,
|
||||
MaxGPUMemory: 50,
|
||||
PSUtil: newUtil(2),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
info := r.Info()
|
||||
|
||||
require.Equal(t, Info{
|
||||
Mem: MemoryInfo{
|
||||
Total: 200,
|
||||
Available: 40,
|
||||
Used: 160,
|
||||
Limit: 180,
|
||||
Core: 42,
|
||||
Throttling: false,
|
||||
Error: nil,
|
||||
},
|
||||
CPU: CPUInfo{
|
||||
NCPU: 2,
|
||||
System: 10,
|
||||
User: 50,
|
||||
Idle: 35,
|
||||
Other: 5,
|
||||
Limit: 90,
|
||||
Core: 6,
|
||||
Throttling: false,
|
||||
Error: nil,
|
||||
},
|
||||
GPU: GPUInfo{
|
||||
NGPU: 2,
|
||||
GPU: []GPUInfoStat{{
|
||||
Index: 0,
|
||||
Name: "L4",
|
||||
MemoryTotal: 24 * 1024 * 1024 * 1024,
|
||||
MemoryUsed: 12 * 1024 * 1024 * 1024,
|
||||
MemoryAvailable: 12 * 1024 * 1024 * 1024,
|
||||
MemoryLimit: 12 * 1024 * 1024 * 1024,
|
||||
Usage: 45,
|
||||
Encoder: 40,
|
||||
Decoder: 47,
|
||||
UsageLimit: 11,
|
||||
}, {
|
||||
Index: 1,
|
||||
Name: "L4",
|
||||
MemoryTotal: 24 * 1024 * 1024 * 1024,
|
||||
MemoryUsed: 13 * 1024 * 1024 * 1024,
|
||||
MemoryAvailable: 11 * 1024 * 1024 * 1024,
|
||||
MemoryLimit: 12 * 1024 * 1024 * 1024,
|
||||
Usage: 40,
|
||||
Encoder: 30,
|
||||
Decoder: 44,
|
||||
UsageLimit: 11,
|
||||
}},
|
||||
Error: nil,
|
||||
},
|
||||
}, info)
|
||||
}
|
||||
|
||||
@ -79,13 +79,21 @@ type Config struct {
|
||||
Reconnect bool
|
||||
ReconnectDelay uint64 // seconds
|
||||
Autostart bool
|
||||
StaleTimeout uint64 // seconds
|
||||
Timeout uint64 // seconds
|
||||
Scheduler string // crontab pattern or RFC3339 timestamp
|
||||
LogPatterns []string // will be interpreted as regular expressions
|
||||
LimitCPU float64 // percent
|
||||
LimitMemory uint64 // bytes
|
||||
LimitWaitFor uint64 // seconds
|
||||
StaleTimeout uint64 // seconds
|
||||
Timeout uint64 // seconds
|
||||
Scheduler string // crontab pattern or RFC3339 timestamp
|
||||
LogPatterns []string // will be interpreted as regular expressions
|
||||
LimitCPU float64 // percent
|
||||
LimitMemory uint64 // bytes
|
||||
LimitGPU ConfigLimitGPU // GPU limits
|
||||
LimitWaitFor uint64 // seconds
|
||||
}
|
||||
|
||||
// ConfigLimitGPU holds the GPU limits of a process config: the three load
// figures are percentages in the range 0-100, the memory limit is in bytes.
type ConfigLimitGPU struct {
	Usage   float64 // general GPU load limit, percent 0-100
	Encoder float64 // encoder load limit, percent 0-100
	Decoder float64 // decoder load limit, percent 0-100
	Memory  uint64  // GPU memory limit, bytes
}
|
||||
|
||||
func (config *Config) Clone() *Config {
|
||||
@ -103,6 +111,7 @@ func (config *Config) Clone() *Config {
|
||||
Scheduler: config.Scheduler,
|
||||
LimitCPU: config.LimitCPU,
|
||||
LimitMemory: config.LimitMemory,
|
||||
LimitGPU: config.LimitGPU,
|
||||
LimitWaitFor: config.LimitWaitFor,
|
||||
}
|
||||
|
||||
@ -175,6 +184,10 @@ func (config *Config) Hash() []byte {
|
||||
b.WriteString(strconv.FormatUint(config.LimitMemory, 10))
|
||||
b.WriteString(strconv.FormatUint(config.LimitWaitFor, 10))
|
||||
b.WriteString(strconv.FormatFloat(config.LimitCPU, 'f', -1, 64))
|
||||
b.WriteString(strconv.FormatFloat(config.LimitGPU.Usage, 'f', -1, 64))
|
||||
b.WriteString(strconv.FormatFloat(config.LimitGPU.Encoder, 'f', -1, 64))
|
||||
b.WriteString(strconv.FormatFloat(config.LimitGPU.Decoder, 'f', -1, 64))
|
||||
b.WriteString(strconv.FormatUint(config.LimitGPU.Memory, 10))
|
||||
|
||||
for _, x := range config.Input {
|
||||
b.WriteString(x.HashString())
|
||||
@ -294,7 +307,7 @@ type State struct {
|
||||
Memory uint64 // Current memory consumption in bytes
|
||||
CPU float64 // Current CPU consumption in percent
|
||||
LimitMode string // How the process is limited (hard or soft)
|
||||
Resources ProcessUsage // Current resource usage, include CPU and memory consumption
|
||||
Resources ProcessUsage // Current resource usage, include CPU, memory and GPU consumption
|
||||
Command []string // ffmpeg command line parameters
|
||||
}
|
||||
|
||||
@ -326,10 +339,10 @@ func (p *ProcessUsageCPU) MarshalParser() parse.UsageCPU {
|
||||
}
|
||||
|
||||
type ProcessUsageMemory struct {
|
||||
Current uint64 // bytes
|
||||
Average float64 // bytes
|
||||
Max uint64 // bytes
|
||||
Limit uint64 // bytes
|
||||
Current uint64 // bytes
|
||||
Average uint64 // bytes
|
||||
Max uint64 // bytes
|
||||
Limit uint64 // bytes
|
||||
}
|
||||
|
||||
func (p *ProcessUsageMemory) UnmarshalParser(pp *parse.UsageMemory) {
|
||||
@ -348,20 +361,97 @@ func (p *ProcessUsageMemory) MarshalParser() parse.UsageMemory {
|
||||
return pp
|
||||
}
|
||||
|
||||
// ProcessUsageGPU describes the GPU consumption of a process, split into
// general usage, encoder, decoder and memory figures.
type ProcessUsageGPU struct {
|
||||
Index int // presumably the index of the GPU the figures refer to (confirm against the parser)
|
||||
Usage ProcessUsageGPUUsage // GPU usage figures
|
||||
Encoder ProcessUsageGPUUsage // GPU encoder figures
|
||||
Decoder ProcessUsageGPUUsage // GPU decoder figures
|
||||
Memory ProcessUsageGPUMemory // GPU memory figures
|
||||
}
|
||||
|
||||
// UnmarshalParser fills p from the parser representation pp. The index is
// copied directly; usage, encoder, decoder and memory are delegated to the
// UnmarshalParser methods of the respective fields.
// pp is dereferenced without a nil check, so it must not be nil.
func (p *ProcessUsageGPU) UnmarshalParser(pp *parse.UsageGPU) {
|
||||
p.Index = pp.Index
|
||||
p.Usage.UnmarshalParser(&pp.Usage)
|
||||
p.Encoder.UnmarshalParser(&pp.Encoder)
|
||||
p.Decoder.UnmarshalParser(&pp.Decoder)
|
||||
p.Memory.UnmarshalParser(&pp.Memory)
|
||||
}
|
||||
|
||||
// MarshalParser converts p into its parser representation. The index is
// copied directly; the remaining fields are converted by their own
// MarshalParser methods. The result is returned by value.
func (p *ProcessUsageGPU) MarshalParser() parse.UsageGPU {
|
||||
pp := parse.UsageGPU{
|
||||
Index: p.Index,
|
||||
Usage: p.Usage.MarshalParser(),
|
||||
Encoder: p.Encoder.MarshalParser(),
|
||||
Decoder: p.Decoder.MarshalParser(),
|
||||
Memory: p.Memory.MarshalParser(),
|
||||
}
|
||||
|
||||
return pp
|
||||
}
|
||||
|
||||
// ProcessUsageGPUUsage holds percentage-based GPU figures. It is used for the
// Usage, Encoder and Decoder fields of ProcessUsageGPU.
type ProcessUsageGPUUsage struct {
|
||||
Current float64 // current value in percent, 0-100
|
||||
Average float64 // average value in percent, 0-100
|
||||
Max float64 // maximum value in percent, 0-100
|
||||
Limit float64 // limit in percent, 0-100
|
||||
}
|
||||
|
||||
// UnmarshalParser copies Average, Max and Limit from the parser
// representation pp into p. Current is not touched and keeps whatever value
// p already had. pp is dereferenced without a nil check.
func (p *ProcessUsageGPUUsage) UnmarshalParser(pp *parse.UsageGPUUsage) {
|
||||
p.Average = pp.Average
|
||||
p.Max = pp.Max
|
||||
p.Limit = pp.Limit
|
||||
}
|
||||
|
||||
// MarshalParser returns the parser representation of p. Only Average, Max
// and Limit are transferred; Current is not part of the conversion.
func (p *ProcessUsageGPUUsage) MarshalParser() parse.UsageGPUUsage {
|
||||
pp := parse.UsageGPUUsage{
|
||||
Average: p.Average,
|
||||
Max: p.Max,
|
||||
Limit: p.Limit,
|
||||
}
|
||||
|
||||
return pp
|
||||
}
|
||||
|
||||
// ProcessUsageGPUMemory holds the GPU memory figures of a process. All
// values are in bytes.
type ProcessUsageGPUMemory struct {
|
||||
Current uint64 // current GPU memory in bytes
|
||||
Average uint64 // average GPU memory in bytes
|
||||
Max uint64 // maximum GPU memory in bytes
|
||||
Limit uint64 // GPU memory limit in bytes
|
||||
}
|
||||
|
||||
// UnmarshalParser copies Average, Max and Limit from the parser
// representation pp into p. Current is not touched and keeps whatever value
// p already had. pp is dereferenced without a nil check.
func (p *ProcessUsageGPUMemory) UnmarshalParser(pp *parse.UsageGPUMemory) {
|
||||
p.Average = pp.Average
|
||||
p.Max = pp.Max
|
||||
p.Limit = pp.Limit
|
||||
}
|
||||
|
||||
// MarshalParser returns the parser representation of p. Only Average, Max
// and Limit are transferred; Current is not part of the conversion.
func (p *ProcessUsageGPUMemory) MarshalParser() parse.UsageGPUMemory {
|
||||
pp := parse.UsageGPUMemory{
|
||||
Average: p.Average,
|
||||
Max: p.Max,
|
||||
Limit: p.Limit,
|
||||
}
|
||||
|
||||
return pp
|
||||
}
|
||||
|
||||
// ProcessUsage bundles the CPU, memory and GPU resource usage of a process.
type ProcessUsage struct {
|
||||
CPU ProcessUsageCPU // CPU usage figures
|
||||
Memory ProcessUsageMemory // memory usage figures
|
||||
GPU ProcessUsageGPU // GPU usage figures
|
||||
}
|
||||
|
||||
// UnmarshalParser fills p from the parser representation pp by delegating to
// the UnmarshalParser methods of the CPU, Memory and GPU fields.
// pp is dereferenced without a nil check, so it must not be nil.
func (p *ProcessUsage) UnmarshalParser(pp *parse.Usage) {
|
||||
p.CPU.UnmarshalParser(&pp.CPU)
|
||||
p.Memory.UnmarshalParser(&pp.Memory)
|
||||
p.GPU.UnmarshalParser(&pp.GPU)
|
||||
}
|
||||
|
||||
func (p *ProcessUsage) MarshalParser() parse.Usage {
|
||||
pp := parse.Usage{
|
||||
CPU: p.CPU.MarshalParser(),
|
||||
Memory: p.Memory.MarshalParser(),
|
||||
GPU: p.GPU.MarshalParser(),
|
||||
}
|
||||
|
||||
return pp
|
||||
|
||||
@ -46,12 +46,18 @@ func TestConfigHash(t *testing.T) {
|
||||
LogPatterns: []string{"^libx264"},
|
||||
LimitCPU: 50,
|
||||
LimitMemory: 3 * 1024 * 1024,
|
||||
LimitWaitFor: 20,
|
||||
LimitGPU: ConfigLimitGPU{
|
||||
Usage: 10,
|
||||
Encoder: 42,
|
||||
Decoder: 14,
|
||||
Memory: 500 * 1024 * 1024,
|
||||
},
|
||||
LimitWaitFor: 20,
|
||||
}
|
||||
|
||||
hash1 := config.Hash()
|
||||
|
||||
require.Equal(t, []byte{0x7e, 0xae, 0x5b, 0xc3, 0xad, 0xe3, 0x9a, 0xfc, 0xd3, 0x49, 0x15, 0x28, 0x93, 0x17, 0xc5, 0xbf}, hash1)
|
||||
require.Equal(t, []byte{0x5e, 0x85, 0xc3, 0xc5, 0x44, 0xfd, 0x3e, 0x10, 0x13, 0x76, 0x36, 0x8b, 0xbe, 0x7e, 0xa6, 0xbb}, hash1)
|
||||
|
||||
config.Reconnect = false
|
||||
|
||||
|
||||
194
restream/core.go
194
restream/core.go
@ -279,13 +279,14 @@ func (r *restream) resourceObserver(ctx context.Context, rsc resources.Resources
|
||||
defer ticker.Stop()
|
||||
|
||||
limitCPU, limitMemory := false, false
|
||||
var limitGPUs []bool = nil
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
cpu, memory := rsc.ShouldLimit()
|
||||
cpu, memory, gpu := rsc.ShouldLimit()
|
||||
|
||||
hasChanges := false
|
||||
|
||||
@ -299,17 +300,34 @@ func (r *restream) resourceObserver(ctx context.Context, rsc resources.Resources
|
||||
hasChanges = true
|
||||
}
|
||||
|
||||
if limitGPUs == nil {
|
||||
limitGPUs = make([]bool, len(gpu))
|
||||
}
|
||||
|
||||
for i, g := range gpu {
|
||||
if g != limitGPUs[i] {
|
||||
limitGPUs[i] = g
|
||||
hasChanges = true
|
||||
}
|
||||
}
|
||||
|
||||
if !hasChanges {
|
||||
break
|
||||
}
|
||||
|
||||
r.tasks.Range(func(id app.ProcessID, t *task) bool {
|
||||
if t.Limit(limitCPU, limitMemory) {
|
||||
limitGPU := false
|
||||
gpuindex := t.GetHWDevice()
|
||||
if gpuindex >= 0 {
|
||||
limitGPU = limitGPUs[gpuindex]
|
||||
}
|
||||
if t.Limit(limitCPU, limitMemory, limitGPU) {
|
||||
r.logger.Debug().WithFields(log.Fields{
|
||||
"limit_cpu": limitCPU,
|
||||
"limit_memory": limitMemory,
|
||||
"limit_gpu": limitGPU,
|
||||
"id": id,
|
||||
}).Log("Limiting process CPU and memory consumption")
|
||||
}).Log("Limiting process CPU, memory, and GPU consumption")
|
||||
}
|
||||
|
||||
return true
|
||||
@ -391,7 +409,11 @@ func (r *restream) load() error {
|
||||
// Validate config with all placeholders replaced. However, we need to take care
|
||||
// that the config with the task keeps its dynamic placeholders for process starts.
|
||||
config := t.config.Clone()
|
||||
resolveDynamicPlaceholder(config, r.replace)
|
||||
resolveDynamicPlaceholder(config, r.replace, map[string]string{
|
||||
"hwdevice": "0",
|
||||
}, map[string]string{
|
||||
"timestamp": time.Now().UTC().Format(time.RFC3339),
|
||||
})
|
||||
|
||||
t.usesDisk, err = validateConfig(config, r.fs.list, r.ffmpeg)
|
||||
if err != nil {
|
||||
@ -414,30 +436,23 @@ func (r *restream) load() error {
|
||||
}
|
||||
|
||||
ffmpeg, err := r.ffmpeg.New(ffmpeg.ProcessConfig{
|
||||
Reconnect: t.config.Reconnect,
|
||||
ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second,
|
||||
StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second,
|
||||
Timeout: time.Duration(t.config.Timeout) * time.Second,
|
||||
LimitCPU: t.config.LimitCPU,
|
||||
LimitMemory: t.config.LimitMemory,
|
||||
LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second,
|
||||
LimitMode: limitMode,
|
||||
Scheduler: t.config.Scheduler,
|
||||
Args: t.command,
|
||||
Parser: t.parser,
|
||||
Logger: t.logger,
|
||||
OnArgs: r.onArgs(t.config.Clone()),
|
||||
OnBeforeStart: func() error {
|
||||
if !r.enableSoftLimit {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := r.resources.Request(t.config.LimitCPU, t.config.LimitMemory); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
},
|
||||
Reconnect: t.config.Reconnect,
|
||||
ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second,
|
||||
StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second,
|
||||
Timeout: time.Duration(t.config.Timeout) * time.Second,
|
||||
LimitCPU: t.config.LimitCPU,
|
||||
LimitMemory: t.config.LimitMemory,
|
||||
LimitGPUUsage: t.config.LimitGPU.Usage,
|
||||
LimitGPUEncoder: t.config.LimitGPU.Encoder,
|
||||
LimitGPUDecoder: t.config.LimitGPU.Decoder,
|
||||
LimitGPUMemory: t.config.LimitGPU.Memory,
|
||||
LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second,
|
||||
LimitMode: limitMode,
|
||||
Scheduler: t.config.Scheduler,
|
||||
Args: t.command,
|
||||
Parser: t.parser,
|
||||
Logger: t.logger,
|
||||
OnBeforeStart: r.onBeforeStart(t.config.Clone()),
|
||||
})
|
||||
if err != nil {
|
||||
return true
|
||||
@ -578,7 +593,11 @@ func (r *restream) createTask(config *app.Config) (*task, error) {
|
||||
// Validate config with all placeholders replaced. However, we need to take care
|
||||
// that the config with the task keeps its dynamic placeholders for process starts.
|
||||
config := t.config.Clone()
|
||||
resolveDynamicPlaceholder(config, r.replace)
|
||||
resolveDynamicPlaceholder(config, r.replace, map[string]string{
|
||||
"hwdevice": "0",
|
||||
}, map[string]string{
|
||||
"timestamp": time.Now().UTC().Format(time.RFC3339),
|
||||
})
|
||||
|
||||
t.usesDisk, err = validateConfig(config, r.fs.list, r.ffmpeg)
|
||||
if err != nil {
|
||||
@ -600,30 +619,23 @@ func (r *restream) createTask(config *app.Config) (*task, error) {
|
||||
}
|
||||
|
||||
ffmpeg, err := r.ffmpeg.New(ffmpeg.ProcessConfig{
|
||||
Reconnect: t.config.Reconnect,
|
||||
ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second,
|
||||
StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second,
|
||||
Timeout: time.Duration(t.config.Timeout) * time.Second,
|
||||
LimitCPU: t.config.LimitCPU,
|
||||
LimitMemory: t.config.LimitMemory,
|
||||
LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second,
|
||||
LimitMode: limitMode,
|
||||
Scheduler: t.config.Scheduler,
|
||||
Args: t.command,
|
||||
Parser: t.parser,
|
||||
Logger: t.logger,
|
||||
OnArgs: r.onArgs(t.config.Clone()),
|
||||
OnBeforeStart: func() error {
|
||||
if !r.enableSoftLimit {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := r.resources.Request(t.config.LimitCPU, t.config.LimitMemory); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
},
|
||||
Reconnect: t.config.Reconnect,
|
||||
ReconnectDelay: time.Duration(t.config.ReconnectDelay) * time.Second,
|
||||
StaleTimeout: time.Duration(t.config.StaleTimeout) * time.Second,
|
||||
Timeout: time.Duration(t.config.Timeout) * time.Second,
|
||||
LimitCPU: t.config.LimitCPU,
|
||||
LimitMemory: t.config.LimitMemory,
|
||||
LimitGPUUsage: t.config.LimitGPU.Usage,
|
||||
LimitGPUEncoder: t.config.LimitGPU.Encoder,
|
||||
LimitGPUDecoder: t.config.LimitGPU.Decoder,
|
||||
LimitGPUMemory: t.config.LimitGPU.Memory,
|
||||
LimitDuration: time.Duration(t.config.LimitWaitFor) * time.Second,
|
||||
LimitMode: limitMode,
|
||||
Scheduler: t.config.Scheduler,
|
||||
Args: t.command,
|
||||
Parser: t.parser,
|
||||
Logger: t.logger,
|
||||
OnBeforeStart: r.onBeforeStart(t.config.Clone()),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -636,21 +648,45 @@ func (r *restream) createTask(config *app.Config) (*task, error) {
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// onArgs is a callback that gets called by a process before it will be started.
|
||||
// It evaluates the dynamic placeholders in a process config and returns the
|
||||
// resulting command line to the process.
|
||||
func (r *restream) onArgs(cfg *app.Config) func([]string) []string {
|
||||
return func(args []string) []string {
|
||||
// onBeforeStart returns a callback that a process calls before it is started.
|
||||
// The callback evaluates the dynamic placeholders in a clone of cfg and returns the resulting command line to the process.
// If soft limiting is enabled, it first requests the configured CPU, memory and GPU
// resources from the resource manager and fails with that error if the request is denied.
// The selected GPU index (-1 if no request was made) is stored on the task, if one exists
// for the process ID, and is substituted for the "hwdevice" placeholder.
// The args passed to the callback are not used; the command is rebuilt from cfg.
|
||||
func (r *restream) onBeforeStart(cfg *app.Config) func([]string) ([]string, error) {
|
||||
return func(args []string) ([]string, error) {
|
||||
// -1 is the value used when no GPU has been selected.
selectedGPU := -1
|
||||
if r.enableSoftLimit {
|
||||
res, err := r.resources.Request(resources.Request{
|
||||
CPU: cfg.LimitCPU,
|
||||
Memory: cfg.LimitMemory,
|
||||
GPUUsage: cfg.LimitGPU.Usage,
|
||||
GPUEncoder: cfg.LimitGPU.Encoder,
|
||||
GPUDecoder: cfg.LimitGPU.Decoder,
|
||||
GPUMemory: cfg.LimitGPU.Memory,
|
||||
})
|
||||
if err != nil {
|
||||
return []string{}, err
|
||||
}
|
||||
|
||||
selectedGPU = res.GPU
|
||||
}
|
||||
|
||||
// Remember the selected GPU on the task so that it can be read back via GetHWDevice.
if t, hasTask := r.tasks.Load(cfg.ProcessID()); hasTask {
|
||||
t.SetHWDevice(selectedGPU)
|
||||
}
|
||||
|
||||
// Work on a clone so that cfg keeps its dynamic placeholders for later starts.
config := cfg.Clone()
|
||||
|
||||
resolveDynamicPlaceholder(config, r.replace)
|
||||
resolveDynamicPlaceholder(config, r.replace, map[string]string{
|
||||
"hwdevice": fmt.Sprintf("%d", selectedGPU),
|
||||
}, map[string]string{
|
||||
"timestamp": time.Now().UTC().Format(time.RFC3339),
|
||||
})
|
||||
|
||||
// Validate the config after the placeholders have been replaced.
_, err := validateConfig(config, r.fs.list, r.ffmpeg)
|
||||
if err != nil {
|
||||
return []string{}
|
||||
return []string{}, err
|
||||
}
|
||||
|
||||
return config.CreateCommand()
|
||||
return config.CreateCommand(), nil
|
||||
}
|
||||
}
|
||||
|
||||
@ -1448,7 +1484,11 @@ func (r *restream) Probe(config *app.Config, timeout time.Duration) app.Probe {
|
||||
return probe
|
||||
}
|
||||
|
||||
resolveDynamicPlaceholder(config, r.replace)
|
||||
resolveDynamicPlaceholder(config, r.replace, map[string]string{
|
||||
"hwdevice": "0",
|
||||
}, map[string]string{
|
||||
"timestamp": time.Now().UTC().Format(time.RFC3339),
|
||||
})
|
||||
|
||||
_, err = validateConfig(config, r.fs.list, r.ffmpeg)
|
||||
if err != nil {
|
||||
@ -1712,22 +1752,26 @@ func resolveStaticPlaceholders(config *app.Config, r replace.Replacer) {
|
||||
|
||||
// resolveDynamicPlaceholder replaces placeholders in the config that should be replaced at process start.
|
||||
// The config will be modified in place.
|
||||
func resolveDynamicPlaceholder(config *app.Config, r replace.Replacer) {
|
||||
vars := map[string]string{
|
||||
"timestamp": time.Now().UTC().Format(time.RFC3339),
|
||||
}
|
||||
func resolveDynamicPlaceholder(config *app.Config, r replace.Replacer, values map[string]string, vars map[string]string) {
|
||||
placeholders := []string{"date", "hwdevice"}
|
||||
|
||||
for i, option := range config.Options {
|
||||
option = r.Replace(option, "date", "", vars, config, "global")
|
||||
for _, placeholder := range placeholders {
|
||||
option = r.Replace(option, placeholder, values[placeholder], vars, config, "global")
|
||||
}
|
||||
|
||||
config.Options[i] = option
|
||||
}
|
||||
|
||||
for i, input := range config.Input {
|
||||
input.Address = r.Replace(input.Address, "date", "", vars, config, "input")
|
||||
for _, placeholder := range placeholders {
|
||||
input.Address = r.Replace(input.Address, placeholder, values[placeholder], vars, config, "input")
|
||||
}
|
||||
|
||||
for j, option := range input.Options {
|
||||
option = r.Replace(option, "date", "", vars, config, "input")
|
||||
for _, placeholder := range placeholders {
|
||||
option = r.Replace(option, placeholder, values[placeholder], vars, config, "input")
|
||||
}
|
||||
|
||||
input.Options[j] = option
|
||||
}
|
||||
@ -1736,16 +1780,22 @@ func resolveDynamicPlaceholder(config *app.Config, r replace.Replacer) {
|
||||
}
|
||||
|
||||
for i, output := range config.Output {
|
||||
output.Address = r.Replace(output.Address, "date", "", vars, config, "output")
|
||||
for _, placeholder := range placeholders {
|
||||
output.Address = r.Replace(output.Address, placeholder, values[placeholder], vars, config, "output")
|
||||
}
|
||||
|
||||
for j, option := range output.Options {
|
||||
option = r.Replace(option, "date", "", vars, config, "output")
|
||||
for _, placeholder := range placeholders {
|
||||
option = r.Replace(option, placeholder, values[placeholder], vars, config, "output")
|
||||
}
|
||||
|
||||
output.Options[j] = option
|
||||
}
|
||||
|
||||
for j, cleanup := range output.Cleanup {
|
||||
cleanup.Pattern = r.Replace(cleanup.Pattern, "date", "", vars, config, "output")
|
||||
for _, placeholder := range placeholders {
|
||||
cleanup.Pattern = r.Replace(cleanup.Pattern, placeholder, values[placeholder], vars, config, "output")
|
||||
}
|
||||
|
||||
output.Cleanup[j] = cleanup
|
||||
}
|
||||
|
||||
@ -1261,7 +1261,7 @@ func TestReplacer(t *testing.T) {
|
||||
|
||||
require.Equal(t, wantprocess, process)
|
||||
|
||||
resolveDynamicPlaceholder(process, replacer)
|
||||
resolveDynamicPlaceholder(process, replacer, nil, nil)
|
||||
|
||||
wantprocess.Input = []app.ConfigIO{
|
||||
{
|
||||
@ -1531,7 +1531,7 @@ func TestProcessLimit(t *testing.T) {
|
||||
|
||||
status := task.ffmpeg.Status()
|
||||
|
||||
ncpu, err := psutil.CPUCounts(true)
|
||||
ncpu, err := psutil.CPUCounts()
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Equal(t, ncpu*process.LimitCPU, status.CPU.Limit)
|
||||
|
||||
@ -3,6 +3,7 @@ package restream
|
||||
import (
|
||||
"errors"
|
||||
"maps"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/datarhei/core/v16/ffmpeg/parse"
|
||||
@ -31,7 +32,8 @@ type task struct {
|
||||
parser parse.Parser
|
||||
playout map[string]int
|
||||
logger log.Logger
|
||||
usesDisk bool // Whether this task uses the disk
|
||||
usesDisk bool // Whether this task uses the disk
|
||||
hwdevice atomic.Int32 // Index of the GPU this task uses
|
||||
metadata map[string]interface{}
|
||||
|
||||
lock *xsync.RBMutex
|
||||
@ -234,8 +236,47 @@ func (t *task) State() (*app.State, error) {
|
||||
state.Memory = status.Memory.Current
|
||||
state.CPU = status.CPU.Current / status.CPU.NCPU
|
||||
state.LimitMode = status.LimitMode
|
||||
state.Resources.CPU = status.CPU
|
||||
state.Resources.Memory = status.Memory
|
||||
state.Resources.CPU = app.ProcessUsageCPU{
|
||||
NCPU: status.CPU.NCPU,
|
||||
Current: status.CPU.Current,
|
||||
Average: status.CPU.Average,
|
||||
Max: status.CPU.Max,
|
||||
Limit: status.CPU.Limit,
|
||||
IsThrottling: status.CPU.IsThrottling,
|
||||
}
|
||||
state.Resources.Memory = app.ProcessUsageMemory{
|
||||
Current: status.Memory.Current,
|
||||
Average: status.Memory.Average,
|
||||
Max: status.Memory.Max,
|
||||
Limit: status.Memory.Limit,
|
||||
}
|
||||
state.Resources.GPU = app.ProcessUsageGPU{
|
||||
Index: status.GPU.Index,
|
||||
Usage: app.ProcessUsageGPUUsage{
|
||||
Current: status.GPU.Usage.Current,
|
||||
Average: status.GPU.Usage.Average,
|
||||
Max: status.GPU.Usage.Max,
|
||||
Limit: status.GPU.Usage.Limit,
|
||||
},
|
||||
Encoder: app.ProcessUsageGPUUsage{
|
||||
Current: status.GPU.Encoder.Current,
|
||||
Average: status.GPU.Encoder.Average,
|
||||
Max: status.GPU.Encoder.Max,
|
||||
Limit: status.GPU.Encoder.Limit,
|
||||
},
|
||||
Decoder: app.ProcessUsageGPUUsage{
|
||||
Current: status.GPU.Decoder.Current,
|
||||
Average: status.GPU.Decoder.Average,
|
||||
Max: status.GPU.Decoder.Max,
|
||||
Limit: status.GPU.Decoder.Limit,
|
||||
},
|
||||
Memory: app.ProcessUsageGPUMemory{
|
||||
Current: status.GPU.Memory.Current,
|
||||
Average: status.GPU.Memory.Average,
|
||||
Max: status.GPU.Memory.Max,
|
||||
Limit: status.GPU.Memory.Limit,
|
||||
},
|
||||
}
|
||||
state.Duration = status.Duration.Round(10 * time.Millisecond).Seconds()
|
||||
state.Reconnect = -1
|
||||
state.Command = status.CommandArgs
|
||||
@ -420,7 +461,7 @@ func (t *task) ExportMetadata() map[string]interface{} {
|
||||
return t.metadata
|
||||
}
|
||||
|
||||
func (t *task) Limit(cpu, memory bool) bool {
|
||||
func (t *task) Limit(cpu, memory, gpu bool) bool {
|
||||
token := t.lock.RLock()
|
||||
defer t.lock.RUnlock(token)
|
||||
|
||||
@ -428,11 +469,19 @@ func (t *task) Limit(cpu, memory bool) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
t.ffmpeg.Limit(cpu, memory)
|
||||
t.ffmpeg.Limit(cpu, memory, gpu)
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// SetHWDevice records the index of the GPU this task uses. The value is
// converted to int32 and stored atomically in the task's hwdevice field,
// without taking the task lock.
func (t *task) SetHWDevice(index int) {
|
||||
t.hwdevice.Store(int32(index))
|
||||
}
|
||||
|
||||
// GetHWDevice returns the GPU index last stored with SetHWDevice, read
// atomically from the task's hwdevice field.
// NOTE(review): the zero value of the atomic field is 0, so this presumably
// returns 0 until SetHWDevice has been called; confirm how the task is initialized.
func (t *task) GetHWDevice() int {
|
||||
return int(t.hwdevice.Load())
|
||||
}
|
||||
|
||||
func (t *task) Equal(config *app.Config) bool {
|
||||
token := t.lock.RLock()
|
||||
defer t.lock.RUnlock(token)
|
||||
|
||||
@ -8,6 +8,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/datarhei/core/v16/io/fs"
|
||||
|
||||
"github.com/lestrrat-go/strftime"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user