
Commit 18c0b68

Improved NodeTypeIterator loop detection (armadaproject#3435)
* Improved NodeTypeIterator loop detection
* Comments
* Comments
* Include job ids in scheduler errors
1 parent dba5383 commit 18c0b68

8 files changed: +163 -32 lines changed

internal/scheduler/context/context.go

Lines changed: 9 additions & 0 deletions
@@ -559,6 +559,15 @@ func NewGangSchedulingContext(jctxs []*JobSchedulingContext) *GangSchedulingCont
     }
 }
 
+// JobIds returns a slice composed of the ids of the jobs that make up the gang.
+func (gctx *GangSchedulingContext) JobIds() []string {
+    rv := make([]string, len(gctx.JobSchedulingContexts))
+    for i, jctx := range gctx.JobSchedulingContexts {
+        rv[i] = jctx.JobId
+    }
+    return rv
+}
+
 // Cardinality returns the number of jobs in the gang.
 func (gctx *GangSchedulingContext) Cardinality() int {
     return len(gctx.JobSchedulingContexts)

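For illustration, a minimal, self-contained version of the new helper; GangSchedulingContext and JobSchedulingContext are stubbed down to the one field the method touches, so this is a sketch rather than the real scheduler types:

package main

import "fmt"

// Stubbed-down stand-ins for the scheduler context types.
type JobSchedulingContext struct{ JobId string }

type GangSchedulingContext struct {
    JobSchedulingContexts []*JobSchedulingContext
}

// JobIds mirrors the method added in this commit: it collects the id of
// every job in the gang into a slice.
func (gctx *GangSchedulingContext) JobIds() []string {
    rv := make([]string, len(gctx.JobSchedulingContexts))
    for i, jctx := range gctx.JobSchedulingContexts {
        rv[i] = jctx.JobId
    }
    return rv
}

func main() {
    gctx := &GangSchedulingContext{
        JobSchedulingContexts: []*JobSchedulingContext{{JobId: "job-a"}, {JobId: "job-b"}},
    }
    fmt.Println(gctx.JobIds()) // [job-a job-b]
}
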
internal/scheduler/gang_scheduler.go

Lines changed: 3 additions & 1 deletion
@@ -4,6 +4,7 @@ import (
     "fmt"
 
     "github.com/hashicorp/go-memdb"
+    "github.com/pkg/errors"
 
     "github.com/armadaproject/armada/internal/common/armadacontext"
     "github.com/armadaproject/armada/internal/common/util"
@@ -106,8 +107,9 @@ func (sch *GangScheduler) Schedule(ctx *armadacontext.Context, gctx *schedulerco
     // This deferred function ensures unschedulable jobs are registered as such.
     gangAddedToSchedulingContext := false
     defer func() {
-        // Do nothing if an error occurred.
+        // If an error occurred, augment the error message and return.
         if err != nil {
+            err = errors.WithMessagef(err, "failed scheduling gang %s composed of jobs %v", gctx.Id, gctx.JobIds())
             return
         }

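A small sketch of what the new deferred wrapping produces, using github.com/pkg/errors as the diff does; the gang id and job ids below are hypothetical stand-ins for gctx.Id and gctx.JobIds():

package main

import (
    "fmt"

    "github.com/pkg/errors"
)

func main() {
    // A base error, standing in for whatever the scheduler returned.
    err := errors.New("node type mismatch")
    // The deferred function annotates it with the gang id and job ids.
    err = errors.WithMessagef(err, "failed scheduling gang %s composed of jobs %v", "gang-0", []string{"job-a", "job-b"})
    fmt.Println(err)
    // Output: failed scheduling gang gang-0 composed of jobs [job-a job-b]: node type mismatch
}
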
internal/scheduler/nodedb/encoding.go

Lines changed: 26 additions & 6 deletions
@@ -8,12 +8,16 @@ import (
     "github.com/armadaproject/armada/internal/scheduler/schedulerobjects"
 )
 
-// NodeIndexKey returns a []byte to be used as a key with the NodeIndex memdb index with layout
+// NodeIndexKey returns a []byte to be used as a key with the NodeIndex memdb index.
+// This key should be used for lookup. Use the rounded version below for inserts.
 //
-//    0            8              16             32
-//    | nodeTypeId | resources[0] | resources[1] | ... |
+// The layout of the key is:
 //
-// where the numbers indicate number of bytes.
+//    0            8              16             32     x           x+8
+//    | nodeTypeId | resources[0] | resources[1] | ... | nodeIndex |
+//
+// where the numbers indicate byte index.
+// The nodeIndex ensures each key is unique and so must itself be unique across all nodes.
 //
 // The key layout is such that an index is ordered first by nodeTypeId, then resources[0], and so on.
 // The byte representation is appended to out, which is returned.
@@ -22,20 +26,32 @@ func NodeIndexKey(out []byte, nodeTypeId uint64, resources []resource.Quantity)
     for _, q := range resources {
         out = EncodeQuantity(out, q)
     }
+    // Because the key returned by this function should be used with a lower-bound operation on allocatable resources,
+    // we set the nodeIndex to 0.
+    out = EncodeUint64(out, 0)
     return out
 }
 
 // RoundedNodeIndexKeyFromResourceList works like NodeIndexKey, except that prior to constructing the key
 // the i-th resource is rounded down to the closest multiple of resourceResolutionMillis[i].
+// This rounding makes iterating over nodes with at least some amount of available resources more efficient.
 // It also takes as arguments a list of resource names and a resourceList, instead of a list of resources.
-func RoundedNodeIndexKeyFromResourceList(out []byte, nodeTypeId uint64, resourceNames []string, resourceResolutionMillis []int64, rl schedulerobjects.ResourceList) []byte {
+func RoundedNodeIndexKeyFromResourceList(
+    out []byte,
+    nodeTypeId uint64,
+    resourceNames []string,
+    resourceResolutionMillis []int64,
+    rl schedulerobjects.ResourceList,
+    nodeIndex uint64,
+) []byte {
     out = EncodeUint64(out, nodeTypeId)
     for i, name := range resourceNames {
         resolution := resourceResolutionMillis[i]
         q := rl.Get(name)
         q = roundQuantityToResolution(q, resolution)
         out = EncodeQuantity(out, q)
     }
+    out = EncodeUint64(out, nodeIndex)
     return out
 }
 
@@ -52,7 +68,7 @@ func EncodeQuantity(out []byte, val resource.Quantity) []byte {
     return EncodeInt64(out, val.MilliValue())
 }
 
-// EncodeInt64 returns the canonical byte representation of a int64 used within the nodeDb.
+// EncodeInt64 returns the canonical byte representation of an int64 used within the nodeDb.
 // The resulting []byte is such that for two int64 a and b, a.Cmp(b) = bytes.Compare(enc(a), enc(b)).
 // The byte representation is appended to out, which is returned.
 func EncodeInt64(out []byte, val int64) []byte {
@@ -65,6 +81,10 @@ func EncodeInt64(out []byte, val int64) []byte {
     // becomes the maximum positive uint.
     scaled := val ^ int64(-1<<(size*8-1))
 
+    // TODO(albin): It's possible (though unlikely) that this shifting causes nodeType clashes,
+    // since they're computed by hashing labels etc. and so may be big integers.
+    // This would reduce the efficiency of nodeType indexing but shouldn't affect correctness.
+
     binary.BigEndian.PutUint64(out[len(out)-8:], uint64(scaled))
     return out
 }

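The sign-bit flip in EncodeInt64 is what makes these keys comparable with bytes.Compare. A standalone sketch of the same trick (encodeInt64 below is a local copy for demonstration, not the nodedb function):

package main

import (
    "bytes"
    "encoding/binary"
    "fmt"
)

// encodeInt64 mirrors the XOR in EncodeInt64 above: flipping the sign bit
// maps math.MinInt64 to 0 and math.MaxInt64 to the largest uint64, so
// big-endian byte order equals numeric order.
func encodeInt64(val int64) []byte {
    scaled := val ^ int64(-1<<63)
    var buf [8]byte
    binary.BigEndian.PutUint64(buf[:], uint64(scaled))
    return buf[:]
}

func main() {
    for _, pair := range [][2]int64{{-5, 3}, {-5, -4}, {3, 4}} {
        // Prints -1 each time: the encoding preserves numeric order.
        fmt.Println(bytes.Compare(encodeInt64(pair[0]), encodeInt64(pair[1])))
    }
}
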
internal/scheduler/nodedb/encoding_test.go

Lines changed: 47 additions & 0 deletions
@@ -99,6 +99,51 @@ func TestEncodeQuantity(t *testing.T) {
     }
 }
 
+func TestRoundQuantityToResolution(t *testing.T) {
+    tests := map[string]struct {
+        q                resource.Quantity
+        resolutionMillis int64
+        expected         resource.Quantity
+    }{
+        "1Ki": {
+            q:                resource.MustParse("1Ki"),
+            resolutionMillis: 1,
+            expected:         resource.MustParse("1Ki"),
+        },
+        "resolution equal to quantity": {
+            q:                resource.MustParse("1Ki"),
+            resolutionMillis: 1024 * 1000,
+            expected:         resource.MustParse("1Ki"),
+        },
+        "0": {
+            q:                resource.MustParse("0"),
+            resolutionMillis: 1,
+            expected:         resource.MustParse("0"),
+        },
+        "1m": {
+            q:                resource.MustParse("1m"),
+            resolutionMillis: 1,
+            expected:         resource.MustParse("1m"),
+        },
+        "1": {
+            q:                resource.MustParse("1"),
+            resolutionMillis: 1,
+            expected:         resource.MustParse("1"),
+        },
+        "resolution 3": {
+            q:                resource.MustParse("1"),
+            resolutionMillis: 3,
+            expected:         resource.MustParse("999m"),
+        },
+    }
+    for name, tc := range tests {
+        t.Run(name, func(t *testing.T) {
+            actual := roundQuantityToResolution(tc.q, tc.resolutionMillis)
+            assert.Truef(t, actual.Equal(tc.expected), "expected %s, but got %s", tc.expected.String(), actual.String())
+        })
+    }
+}
+
 func TestNodeIndexKey(t *testing.T) {
     type nodeIndexKeyValues struct {
         nodeTypeId uint64
@@ -205,6 +250,7 @@ func TestRoundedNodeIndexKeyFromResourceList(t *testing.T) {
             schedulerobjects.ResourceList{
                 Resources: map[string]resource.Quantity{"foo": resource.MustParse("1"), "bar": resource.MustParse("2")},
             },
+            0,
         ),
     )
     assert.NotEqual(
@@ -218,6 +264,7 @@
             schedulerobjects.ResourceList{
                 Resources: map[string]resource.Quantity{"foo": resource.MustParse("1"), "bar": resource.MustParse("2")},
             },
+            0,
         ),
     )
 }

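The diff adds this test but not roundQuantityToResolution itself. A hedged sketch consistent with the cases above, assuming the function rounds the quantity's milli-value down to the nearest multiple of the resolution (roundQuantityToResolutionSketch is a hypothetical stand-in, not the real implementation):

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/api/resource"
)

// roundQuantityToResolutionSketch rounds q's milli-value down to the
// nearest multiple of resolutionMillis, matching the test cases: e.g.
// 1 (i.e. 1000m) at resolution 3 becomes 999m.
func roundQuantityToResolutionSketch(q resource.Quantity, resolutionMillis int64) resource.Quantity {
    rounded := (q.MilliValue() / resolutionMillis) * resolutionMillis
    return *resource.NewMilliQuantity(rounded, q.Format)
}

func main() {
    q := resource.MustParse("1")
    r := roundQuantityToResolutionSketch(q, 3)
    fmt.Println(r.String()) // 999m
}
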
internal/scheduler/nodedb/nodedb.go

Lines changed: 39 additions & 17 deletions
@@ -39,9 +39,15 @@ const (
 var empty struct{}
 
 type Node struct {
-    Id       string
-    Name     string
+    // Unique id and index of this node.
+    // TODO(albin): Having both id and index is redundant.
+    // Currently, the id is "cluster name" + "node name" and the index is an integer assigned on node creation.
+    Id    string
+    Index uint64
+
+    // Executor this node belongs to and node name, which must be unique per executor.
     Executor string
+    Name     string
 
     // We need to store taints and labels separately from the node type: the latter only includes
     // indexed taints and labels, but we need all of them when checking pod requirements.
@@ -65,9 +71,11 @@
 // shallow copies of fields that are not mutated by methods of NodeDb.
 func (node *Node) UnsafeCopy() *Node {
     return &Node{
-        Id:       node.Id,
-        Name:     node.Name,
+        Id:    node.Id,
+        Index: node.Index,
+
         Executor: node.Executor,
+        Name:     node.Name,
 
         Taints: node.Taints,
         Labels: node.Labels,
@@ -139,16 +147,19 @@ func (nodeDb *NodeDb) create(node *schedulerobjects.Node) (*Node, error) {
             nodeDb.indexedNodeLabelValues[key][value] = empty
         }
     }
+    index := uint64(nodeDb.numNodes)
     nodeDb.numNodes++
     nodeDb.numNodesByNodeType[nodeType.Id]++
     nodeDb.totalResources.Add(totalResources)
     nodeDb.nodeTypes[nodeType.Id] = nodeType
     nodeDb.mu.Unlock()
 
     entry := &Node{
-        Id:       node.Id,
-        Name:     node.Name,
+        Id:    node.Id,
+        Index: index,
+
         Executor: node.Executor,
+        Name:     node.Name,
 
         Taints: taints,
         Labels: labels,
@@ -256,8 +267,10 @@ type NodeDb struct {
     //
     // Lower resolution makes scheduling faster, but may lead to jobs incorrectly being considered unschedulable.
     indexedResourceResolutionMillis []int64
-    // Map from priority class priority to the index tracking allocatable resources at that priority.
+    // Map from priority class priority to the database index tracking allocatable resources at that priority.
     indexNameByPriority map[int32]string
+    // Map from priority class priority to the index of node.Keys corresponding to that priority.
+    keyIndexByPriority map[int32]int
     // Taint keys to create indexes for.
     // Should include taints frequently used for scheduling.
     // Since the NodeDb can efficiently sort out nodes with taints not tolerated
@@ -317,7 +330,7 @@ func NewNodeDb(
     nodeDbPriorities = append(nodeDbPriorities, types.AllowedPriorities(priorityClasses)...)
 
     indexedResourceNames := util.Map(indexedResources, func(v configuration.IndexedResource) string { return v.Name })
-    schema, indexNameByPriority := nodeDbSchema(nodeDbPriorities, indexedResourceNames)
+    schema, indexNameByPriority, keyIndexByPriority := nodeDbSchema(nodeDbPriorities, indexedResourceNames)
     db, err := memdb.NewMemDB(schema)
     if err != nil {
         return nil, errors.WithStack(err)
@@ -359,6 +372,7 @@
            func(v configuration.IndexedResource) int64 { return v.Resolution.MilliValue() },
         ),
         indexNameByPriority:    indexNameByPriority,
+        keyIndexByPriority:     keyIndexByPriority,
         indexedTaints:          mapFromSlice(indexedTaints),
         indexedNodeLabels:      mapFromSlice(indexedNodeLabels),
         indexedNodeLabelValues: indexedNodeLabelValues,
@@ -432,7 +446,7 @@ func (nodeDb *NodeDb) IndexedNodeLabelValues(label string) (map[string]struct{},
 func (nodeDb *NodeDb) NumNodes() int {
     nodeDb.mu.Lock()
     defer nodeDb.mu.Unlock()
-    return nodeDb.numNodes
+    return int(nodeDb.numNodes)
 }
 
 func (nodeDb *NodeDb) TotalResources() schedulerobjects.ResourceList {
@@ -791,11 +805,16 @@ func (nodeDb *NodeDb) selectNodeForPodAtPriority(
     if !ok {
         return nil, errors.Errorf("no index for priority %d; must be in %v", priority, nodeDb.indexNameByPriority)
     }
+    keyIndex, ok := nodeDb.keyIndexByPriority[priority]
+    if !ok {
+        return nil, errors.Errorf("no key index for priority %d; must be in %v", priority, nodeDb.keyIndexByPriority)
+    }
     it, err := NewNodeTypesIterator(
         txn,
         matchingNodeTypeIds,
         indexName,
         priority,
+        keyIndex,
         nodeDb.indexedResources,
         indexResourceRequests,
         nodeDb.indexedResourceResolutionMillis,
@@ -1158,7 +1177,7 @@ func (nodeDb *NodeDb) Upsert(node *Node) error {
 func (nodeDb *NodeDb) UpsertWithTxn(txn *memdb.Txn, node *Node) error {
     keys := make([][]byte, len(nodeDb.nodeDbPriorities))
     for i, p := range nodeDb.nodeDbPriorities {
-        keys[i] = nodeDb.nodeDbKey(keys[i], node.NodeTypeId, node.AllocatableByPriority[p])
+        keys[i] = nodeDb.nodeDbKey(keys[i], node.NodeTypeId, node.AllocatableByPriority[p], node.Index)
     }
     node.Keys = keys
 
@@ -1204,38 +1223,40 @@ func (nodeDb *NodeDb) AddEvictedJobSchedulingContextWithTxn(txn *memdb.Txn, inde
     return nil
 }
 
-func nodeDbSchema(priorities []int32, resources []string) (*memdb.DBSchema, map[int32]string) {
-    nodesTable, indexNameByPriority := nodesTableSchema(priorities, resources)
+func nodeDbSchema(priorities []int32, resources []string) (*memdb.DBSchema, map[int32]string, map[int32]int) {
+    nodesTable, indexNameByPriority, keyIndexByPriority := nodesTableSchema(priorities, resources)
     evictionsTable := evictionsTableSchema()
     return &memdb.DBSchema{
         Tables: map[string]*memdb.TableSchema{
             nodesTable.Name:     nodesTable,
             evictionsTable.Name: evictionsTable,
         },
-    }, indexNameByPriority
+    }, indexNameByPriority, keyIndexByPriority
 }
 
-func nodesTableSchema(priorities []int32, resources []string) (*memdb.TableSchema, map[int32]string) {
+func nodesTableSchema(priorities []int32, resources []string) (*memdb.TableSchema, map[int32]string, map[int32]int) {
     indexes := make(map[string]*memdb.IndexSchema, len(priorities)+1)
     indexes["id"] = &memdb.IndexSchema{
         Name:    "id",
         Unique:  true,
         Indexer: &memdb.StringFieldIndex{Field: "Id"},
     }
     indexNameByPriority := make(map[int32]string, len(priorities))
+    keyIndexByPriority := make(map[int32]int, len(priorities))
     for i, priority := range priorities {
         name := nodeIndexName(i)
         indexNameByPriority[priority] = name
+        keyIndexByPriority[priority] = i
         indexes[name] = &memdb.IndexSchema{
             Name:    name,
-            Unique:  false,
+            Unique:  true,
             Indexer: &NodeIndex{KeyIndex: i},
         }
     }
     return &memdb.TableSchema{
         Name:    "nodes",
         Indexes: indexes,
-    }, indexNameByPriority
+    }, indexNameByPriority, keyIndexByPriority
 }
 
 func evictionsTableSchema() *memdb.TableSchema {
@@ -1278,12 +1299,13 @@ func (nodeDb *NodeDb) stringFromPodRequirementsNotMetReason(reason PodRequiremen
 // nodeDbKey returns the index key for a particular node.
 // Allocatable resources are rounded down to the closest multiple of nodeDb.indexedResourceResolutionMillis.
 // This improves efficiency by reducing the number of distinct values in the index.
-func (nodeDb *NodeDb) nodeDbKey(out []byte, nodeTypeId uint64, allocatable schedulerobjects.ResourceList) []byte {
+func (nodeDb *NodeDb) nodeDbKey(out []byte, nodeTypeId uint64, allocatable schedulerobjects.ResourceList, nodeIndex uint64) []byte {
     return RoundedNodeIndexKeyFromResourceList(
         out,
         nodeTypeId,
         nodeDb.indexedResources,
         nodeDb.indexedResourceResolutionMillis,
         allocatable,
+        nodeIndex,
     )
 }

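A conceptual sketch of why the per-priority index can now be Unique and why lookups still work: every key ends in the node's index, so no two nodes collide, and a lookup key built with nodeIndex 0 (as NodeIndexKey does) sorts at or before every stored key sharing its prefix. The one-byte prefix below stands in for the (nodeTypeId, rounded resources) part of the key; this is not the real NodeTypesIterator or go-memdb:

package main

import (
    "bytes"
    "encoding/binary"
    "fmt"
    "sort"
)

// key builds prefix || nodeIndex, echoing the layout documented in encoding.go.
func key(prefix byte, nodeIndex uint64) []byte {
    out := []byte{prefix}
    var buf [8]byte
    binary.BigEndian.PutUint64(buf[:], nodeIndex)
    return append(out, buf[:]...)
}

func main() {
    // Stored keys for four nodes; each is unique thanks to the index suffix.
    keys := [][]byte{key(2, 1), key(1, 0), key(2, 0), key(1, 1)}
    sort.Slice(keys, func(i, j int) bool { return bytes.Compare(keys[i], keys[j]) < 0 })

    // A lower-bound seek with nodeIndex 0 lands on the first node whose key
    // is at least the requested prefix; uniqueness means iteration from here
    // can never revisit a node.
    lookup := key(2, 0)
    i := sort.Search(len(keys), func(i int) bool { return bytes.Compare(keys[i], lookup) >= 0 })
    fmt.Println(keys[i]) // [2 0 0 0 0 0 0 0 0]
}
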
internal/scheduler/nodedb/nodedb_test.go

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ import (
 )
 
 func TestNodeDbSchema(t *testing.T) {
-    schema, _ := nodeDbSchema(testfixtures.TestPriorities, testfixtures.TestResourceNames)
+    schema, _, _ := nodeDbSchema(testfixtures.TestPriorities, testfixtures.TestResourceNames)
     assert.NoError(t, schema.Validate())
 }
