Skip to content

Commit

Permalink
GKE Autopilot: Separate game server workloads (#2840)
Browse files Browse the repository at this point in the history
This PR does a couple of things towards #2777:

* Enforce that on Autopilot the scheduling strategy is `Packed`

* If the user does not provide a nodeSelector or tolerations
in the PodSpec, we add:

```
    nodeSelector:
      agones.dev/role: gameserver
    tolerations:
    - effect: NoSchedule
      key: agones.dev/role
      operator: Equal
      value: gameserver
```

This has the effect of splitting the game server pods off to their
own nodes ala https://wdenniss.com/autopilot-workload-separation.

If the user has either defined, we assume they know what they're
doing and skip this.

Note: In `Packed` we already set an affinity:

```
    affinity:
      podAffinity:
        preferredDuringSchedulingIgnoredDuringExecution:
        - podAffinityTerm:
            labelSelector:
              matchLabels:
                agones.dev/role: gameserver
            topologyKey: kubernetes.io/hostname
          weight: 100
```

We keep this affinity, but it becomes mostly a no-op after using
a `nodeSelector`.
  • Loading branch information
zmerlynn committed Dec 9, 2022
1 parent e4144ef commit fd29174
Show file tree
Hide file tree
Showing 6 changed files with 128 additions and 7 deletions.
3 changes: 3 additions & 0 deletions pkg/cloudproduct/cloudproduct.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ type CloudProduct interface {
// ValidateGameServer is called by GameServer.Validate to allow for product specific validation.
ValidateGameServer(*agonesv1.GameServer) []metav1.StatusCause

// MutateGameServerPod is called by createGameServerPod to allow for product specific pod mutation.
MutateGameServerPod(*agonesv1.GameServer, *corev1.Pod) error

// NewPortAllocator creates a PortAllocator. See gameservers.NewPortAllocator for parameters.
NewPortAllocator(int32, int32, informers.SharedInformerFactory, externalversions.SharedInformerFactory) portallocator.Interface
}
Expand Down
6 changes: 3 additions & 3 deletions pkg/cloudproduct/generic/generic.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,6 @@ func (*generic) NewPortAllocator(minPort, maxPort int32,
return portallocator.New(minPort, maxPort, kubeInformerFactory, agonesInformerFactory)
}

func (*generic) ValidateGameServer(*agonesv1.GameServer) []metav1.StatusCause {
return nil
}
func (*generic) ValidateGameServer(*agonesv1.GameServer) []metav1.StatusCause { return nil }

func (*generic) MutateGameServerPod(gs *agonesv1.GameServer, pod *corev1.Pod) error { return nil }
30 changes: 30 additions & 0 deletions pkg/cloudproduct/gke/gke.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"encoding/json"
"fmt"

"agones.dev/agones/pkg/apis"
agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
"agones.dev/agones/pkg/client/informers/externalversions"
"agones.dev/agones/pkg/portallocator"
Expand All @@ -36,6 +37,7 @@ const (
hostPortAssignmentAnnotation = "autopilot.gke.io/host-port-assignment"

errPortPolicyMustBeDynamic = "PortPolicy must be Dynamic on GKE Autopilot"
errSchedulingMustBePacked = "Scheduling strategy must be Packed on GKE Autopilot"
)

var logger = runtime.NewLoggerWithSource("gke")
Expand Down Expand Up @@ -108,9 +110,37 @@ func (*gkeAutopilot) ValidateGameServer(gs *agonesv1.GameServer) []metav1.Status
})
}
}
if gs.Spec.Scheduling != apis.Packed {
causes = append(causes, metav1.StatusCause{
Type: metav1.CauseTypeFieldValueInvalid,
Field: "scheduling",
Message: errSchedulingMustBePacked,
})
}
return causes
}

func (*gkeAutopilot) MutateGameServerPod(gs *agonesv1.GameServer, pod *corev1.Pod) error {
if gs.Spec.Scheduling != apis.Packed {
return fmt.Errorf("Scheduling strategy %s != Packed, which webhook should have rejected on Autopilot", gs.Spec.Scheduling)
}
if len(pod.Spec.Tolerations) > 0 || len(pod.Spec.NodeSelector) > 0 {
// If the user has specified tolerations or a node selector already,
// we assume they know what they're doing.
return nil
}
pod.Spec.Tolerations = []corev1.Toleration{{
Key: agonesv1.RoleLabel,
Value: agonesv1.GameServerLabelRole,
Operator: corev1.TolerationOpEqual,
Effect: corev1.TaintEffectNoSchedule,
}}
pod.Spec.NodeSelector = map[string]string{
agonesv1.RoleLabel: agonesv1.GameServerLabelRole,
}
return nil
}

type autopilotPortAllocator struct {
minPort int32
maxPort int32
Expand Down
83 changes: 79 additions & 4 deletions pkg/cloudproduct/gke/gke_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package gke
import (
"testing"

"agones.dev/agones/pkg/apis"
agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -75,10 +76,11 @@ func TestSyncPodPortsToGameServer(t *testing.T) {

func TestValidateGameServer(t *testing.T) {
for name, tc := range map[string]struct {
ports []agonesv1.GameServerPort
want []metav1.StatusCause
ports []agonesv1.GameServerPort
scheduling apis.SchedulingStrategy
want []metav1.StatusCause
}{
"no ports => validated": {},
"no ports => validated": {scheduling: apis.Packed},
"good ports => validated": {
ports: []agonesv1.GameServerPort{
{
Expand All @@ -100,6 +102,7 @@ func TestValidateGameServer(t *testing.T) {
Protocol: corev1.ProtocolTCP,
},
},
scheduling: apis.Packed,
},
"bad policy => fails validation": {
ports: []agonesv1.GameServerPort{
Expand All @@ -122,6 +125,7 @@ func TestValidateGameServer(t *testing.T) {
Protocol: corev1.ProtocolUDP,
},
},
scheduling: apis.Distributed,
want: []metav1.StatusCause{
{
Type: "FieldValueInvalid",
Expand All @@ -133,16 +137,87 @@ func TestValidateGameServer(t *testing.T) {
Message: "PortPolicy must be Dynamic on GKE Autopilot",
Field: "another-bad-udp.portPolicy",
},
{
Type: "FieldValueInvalid",
Message: "Scheduling strategy must be Packed on GKE Autopilot",
Field: "scheduling",
},
},
},
} {
t.Run(name, func(t *testing.T) {
causes := (&gkeAutopilot{}).ValidateGameServer(&agonesv1.GameServer{Spec: agonesv1.GameServerSpec{Ports: tc.ports}})
causes := (&gkeAutopilot{}).ValidateGameServer(&agonesv1.GameServer{Spec: agonesv1.GameServerSpec{
Ports: tc.ports,
Scheduling: tc.scheduling,
}})
require.Equal(t, tc.want, causes)
})
}
}

func TestMutateGameServerPod(t *testing.T) {
packedGS := &agonesv1.GameServer{Spec: agonesv1.GameServerSpec{Scheduling: apis.Packed}}
distributedGS := &agonesv1.GameServer{Spec: agonesv1.GameServerSpec{Scheduling: apis.Distributed}}

kvToleration := func(k, v string) []corev1.Toleration {
return []corev1.Toleration{{
Key: k,
Value: v,
Operator: corev1.TolerationOpEqual,
Effect: corev1.TaintEffectNoSchedule,
}}
}

for name, tc := range map[string]struct {
gs *agonesv1.GameServer
pod *corev1.Pod
wantPod *corev1.Pod
wantErr bool
}{
"good": {
gs: packedGS,
pod: &corev1.Pod{},
wantPod: &corev1.Pod{Spec: corev1.PodSpec{
Tolerations: kvToleration(agonesv1.RoleLabel, agonesv1.GameServerLabelRole),
NodeSelector: map[string]string{agonesv1.RoleLabel: agonesv1.GameServerLabelRole},
}},
},
"toleration already set": {
gs: packedGS,
pod: &corev1.Pod{Spec: corev1.PodSpec{
Tolerations: kvToleration("moose", "cookie"),
}},
},
"node selector already set": {
gs: packedGS,
pod: &corev1.Pod{Spec: corev1.PodSpec{
NodeSelector: map[string]string{"moose": "cookie"},
}},
},
"not Packed": {
gs: distributedGS,
wantErr: true,
},
} {
t.Run(name, func(t *testing.T) {
pod := tc.pod.DeepCopy()
err := (&gkeAutopilot{}).MutateGameServerPod(tc.gs, pod)
if tc.wantErr {
assert.NotNil(t, err)
return
}
// allow nil to indicate no change
want := tc.wantPod
if want == nil {
want = tc.pod
}
if assert.NoError(t, err) {
require.Equal(t, want, pod)
}
})
}
}

func TestAutopilotPortAllocator(t *testing.T) {
for name, tc := range map[string]struct {
ports []agonesv1.GameServerPort
Expand Down
6 changes: 6 additions & 0 deletions pkg/gameservers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,12 @@ func (c *Controller) createGameServerPod(ctx context.Context, gs *agonesv1.GameS
gs, err = c.moveToErrorState(ctx, gs, err.Error())
return gs, err
}
if err := c.cloudProduct.MutateGameServerPod(gs, pod); err != nil {
// this shouldn't happen, but if it does.
c.loggerForGameServer(gs).WithError(err).Error("error from cloud product mutation hook")
gs, err = c.moveToErrorState(ctx, gs, err.Error())
return gs, err
}

// if the service account is not set, then you are in the "opinionated"
// mode. If the user sets the service account, we assume they know what they are
Expand Down
7 changes: 7 additions & 0 deletions pkg/gameservers/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1028,6 +1028,13 @@ func TestControllerCreateGameServerPod(t *testing.T) {
assert.Equal(t, fixture.ObjectMeta.Name, pod.ObjectMeta.Labels[agonesv1.GameServerPodLabel])
assert.True(t, metav1.IsControlledBy(pod, fixture))

// gke.MutateGameServerPod assumes that a non-empty NodeSelector / Tolerations are user
// intent. The generic cloudproduct that we use in unit tests does not manipulate these.
// So we verify using the generic cloudproduct that NodeSelector/Tolerations are empty
// as a change detector - if this test fails, gke.MutateGameServerPod will not work.
assert.Empty(t, pod.Spec.NodeSelector)
assert.Empty(t, pod.Spec.Tolerations)

assert.Len(t, pod.Spec.Containers, 2, "Should have a sidecar container")

sidecarContainer := pod.Spec.Containers[0]
Expand Down

0 comments on commit fd29174

Please sign in to comment.