Skip to content

Commit

Permalink
fix: emit events when node template fails to resolve (aws#4512)
Browse files Browse the repository at this point in the history
  • Loading branch information
njtran authored Aug 29, 2023
1 parent 717d10d commit 2aed007
Show file tree
Hide file tree
Showing 13 changed files with 122 additions and 14 deletions.
1 change: 1 addition & 0 deletions cmd/controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ func main() {
awsCloudProvider := cloudprovider.New(
op.InstanceTypesProvider,
op.InstanceProvider,
op.EventRecorder,
op.GetClient(),
op.AMIProvider,
op.SecurityGroupProvider,
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ require (
github.com/PuerkitoBio/goquery v1.8.1
github.com/avast/retry-go v3.0.0+incompatible
github.com/aws/aws-sdk-go v1.44.328
github.com/aws/karpenter-core v0.30.0-rc.0.0.20230824190041-a4c05cede32f
github.com/aws/karpenter-core v0.30.0-rc.0.0.20230829175231-8ad12e5df1cb
github.com/imdario/mergo v0.3.16
github.com/mitchellh/hashstructure/v2 v2.0.2
github.com/onsi/ginkgo/v2 v2.11.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHS
github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
github.com/aws/aws-sdk-go v1.44.328 h1:WBwlf8ym9SDQ/GTIBO9eXyvwappKJyOetWJKl4mT7ZU=
github.com/aws/aws-sdk-go v1.44.328/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
github.com/aws/karpenter-core v0.30.0-rc.0.0.20230824190041-a4c05cede32f h1:AT2yPv2RzY2rx/GP4aBd+G52FRmx9yXYNz5isH45ThA=
github.com/aws/karpenter-core v0.30.0-rc.0.0.20230824190041-a4c05cede32f/go.mod h1:AQl8m8OtgO2N8IlZlzAU6MTrJTJSbe6K4GwdRUNSJVc=
github.com/aws/karpenter-core v0.30.0-rc.0.0.20230829175231-8ad12e5df1cb h1:JN5JeajfAO/v9ONyXscRTwEIrSQPGMa/Atblpo/iX4A=
github.com/aws/karpenter-core v0.30.0-rc.0.0.20230829175231-8ad12e5df1cb/go.mod h1:AQl8m8OtgO2N8IlZlzAU6MTrJTJSbe6K4GwdRUNSJVc=
github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
github.com/benbjohnson/clock v1.3.0 h1:ip6w0uFQkncKQ979AypyG0ER7mqUSBdKLOgAle/AT8A=
github.com/benbjohnson/clock v1.3.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
Expand Down
3 changes: 2 additions & 1 deletion hack/docs/instancetypes_gen_docs.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ func main() {
Manager: &FakeManager{},
KubernetesInterface: kubernetes.NewForConfigOrDie(&rest.Config{}),
})
cp := awscloudprovider.New(op.InstanceTypesProvider, op.InstanceProvider, op.GetClient(), op.AMIProvider, op.SecurityGroupProvider, op.SubnetProvider)
cp := awscloudprovider.New(op.InstanceTypesProvider, op.InstanceProvider,
op.EventRecorder, op.GetClient(), op.AMIProvider, op.SecurityGroupProvider, op.SubnetProvider)

provider := v1alpha1.AWS{SubnetSelector: map[string]string{
"*": "*",
Expand Down
15 changes: 14 additions & 1 deletion pkg/cloudprovider/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/runtime/schema"

"github.com/aws/karpenter-core/pkg/events"
"github.com/aws/karpenter-core/pkg/utils/functional"
"github.com/aws/karpenter/pkg/apis"
"github.com/aws/karpenter/pkg/apis/v1alpha1"
Expand All @@ -40,6 +41,7 @@ import (
"knative.dev/pkg/logging"
"sigs.k8s.io/controller-runtime/pkg/client"

cloudproviderevents "github.com/aws/karpenter/pkg/cloudprovider/events"
"github.com/aws/karpenter/pkg/providers/amifamily"
"github.com/aws/karpenter/pkg/providers/instance"
"github.com/aws/karpenter/pkg/providers/instancetype"
Expand All @@ -65,9 +67,10 @@ type CloudProvider struct {
amiProvider *amifamily.Provider
securityGroupProvider *securitygroup.Provider
subnetProvider *subnet.Provider
recorder events.Recorder
}

func New(instanceTypeProvider *instancetype.Provider, instanceProvider *instance.Provider,
func New(instanceTypeProvider *instancetype.Provider, instanceProvider *instance.Provider, recorder events.Recorder,
kubeClient client.Client, amiProvider *amifamily.Provider, securityGroupProvider *securitygroup.Provider, subnetProvider *subnet.Provider) *CloudProvider {
return &CloudProvider{
instanceTypeProvider: instanceTypeProvider,
Expand All @@ -76,6 +79,7 @@ func New(instanceTypeProvider *instancetype.Provider, instanceProvider *instance
amiProvider: amiProvider,
securityGroupProvider: securityGroupProvider,
subnetProvider: subnetProvider,
recorder: recorder,
}
}

Expand All @@ -85,6 +89,9 @@ func (c *CloudProvider) Create(ctx context.Context, machine *v1alpha5.Machine) (
Annotations[v1alpha5.ProviderCompatabilityAnnotationKey]), machine.
Spec.MachineTemplateRef)
if err != nil {
if errors.IsNotFound(err) {
c.recorder.Publish(cloudproviderevents.MachineFailedToResolveNodeTemplate(machine))
}
return nil, fmt.Errorf("resolving node template, %w", err)
}
instanceTypes, err := c.resolveInstanceTypes(ctx, machine, nodeTemplate)
Expand Down Expand Up @@ -167,6 +174,9 @@ func (c *CloudProvider) GetInstanceTypes(ctx context.Context, provisioner *v1alp
}
nodeTemplate, err := c.resolveNodeTemplate(ctx, rawProvider, provisioner.Spec.ProviderRef)
if err != nil {
if errors.IsNotFound(err) {
c.recorder.Publish(cloudproviderevents.ProvisionerFailedToResolveNodeTemplate(provisioner))
}
return nil, client.IgnoreNotFound(err)
}
// TODO, break this coupling
Expand Down Expand Up @@ -200,6 +210,9 @@ func (c *CloudProvider) IsMachineDrifted(ctx context.Context, machine *v1alpha5.
}
nodeTemplate, err := c.resolveNodeTemplate(ctx, nil, provisioner.Spec.ProviderRef)
if err != nil {
if errors.IsNotFound(err) {
c.recorder.Publish(cloudproviderevents.ProvisionerFailedToResolveNodeTemplate(provisioner))
}
return "", client.IgnoreNotFound(fmt.Errorf("resolving node template, %w", err))
}
driftReason, err := c.isNodeTemplateDrifted(ctx, machine, provisioner, nodeTemplate)
Expand Down
40 changes: 40 additions & 0 deletions pkg/cloudprovider/events/events.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package events

import (
v1 "k8s.io/api/core/v1"

"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
"github.com/aws/karpenter-core/pkg/events"
)

func ProvisionerFailedToResolveNodeTemplate(provisioner *v1alpha5.Provisioner) events.Event {
return events.Event{
InvolvedObject: provisioner,
Type: v1.EventTypeWarning,
Message: "Failed resolving AWSNodeTemplate",
DedupeValues: []string{string(provisioner.UID)},
}
}

func MachineFailedToResolveNodeTemplate(machine *v1alpha5.Machine) events.Event {
return events.Event{
InvolvedObject: machine,
Type: v1.EventTypeWarning,
Message: "Failed resolving AWSNodeTemplate",
DedupeValues: []string{string(machine.UID)},
}
}
9 changes: 6 additions & 3 deletions pkg/cloudprovider/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ var fakeClock *clock.FakeClock
var provisioner *v1alpha5.Provisioner
var nodeTemplate *v1alpha1.AWSNodeTemplate
var machine *v1alpha5.Machine
var recorder events.Recorder

func TestAWS(t *testing.T) {
ctx = TestContextWithLogger(t)
Expand All @@ -85,9 +86,11 @@ var _ = BeforeSuite(func() {
ctx, stop = context.WithCancel(ctx)
awsEnv = test.NewEnvironment(ctx, env)
fakeClock = clock.NewFakeClock(time.Now())
cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider)
recorder = events.NewRecorder(&record.FakeRecorder{})
cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, recorder,
env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider)
cluster = state.NewCluster(fakeClock, env.Client, cloudProvider)
prov = provisioning.NewProvisioner(env.Client, env.KubernetesInterface.CoreV1(), events.NewRecorder(&record.FakeRecorder{}), cloudProvider, cluster)
prov = provisioning.NewProvisioner(env.Client, env.KubernetesInterface.CoreV1(), recorder, cloudProvider, cluster)
})

var _ = AfterSuite(func() {
Expand Down Expand Up @@ -452,7 +455,7 @@ var _ = Describe("CloudProvider", func() {
Entry("DetailedMonitoring Drift", v1alpha1.AWSNodeTemplateSpec{DetailedMonitoring: aws.Bool(true)}),
Entry("AMIFamily Drift", v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{AMIFamily: aws.String(v1alpha1.AMIFamilyBottlerocket)}}),
)
DescribeTable("should not return drifted if dynamic felids are updated",
DescribeTable("should not return drifted if dynamic fields are updated",
func(awsnodetemplatespec v1alpha1.AWSNodeTemplateSpec) {
ExpectApplied(ctx, env.Client, provisioner, nodeTemplate)
isDrifted, err := cloudProvider.IsMachineDrifted(ctx, machine)
Expand Down
5 changes: 4 additions & 1 deletion pkg/controllers/machine/garbagecollection/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,14 @@ import (
"github.com/patrickmn/go-cache"
"github.com/samber/lo"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/record"
. "knative.dev/pkg/logging/testing"
"sigs.k8s.io/controller-runtime/pkg/client"

coresettings "github.com/aws/karpenter-core/pkg/apis/settings"
"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
corecloudprovider "github.com/aws/karpenter-core/pkg/cloudprovider"
"github.com/aws/karpenter-core/pkg/events"
"github.com/aws/karpenter-core/pkg/operator/controller"
"github.com/aws/karpenter-core/pkg/operator/scheme"
coretest "github.com/aws/karpenter-core/pkg/test"
Expand Down Expand Up @@ -67,7 +69,8 @@ var _ = BeforeSuite(func() {
ctx = settings.ToContext(ctx, test.Settings())
env = coretest.NewEnvironment(scheme.Scheme, coretest.WithCRDs(apis.CRDs...))
awsEnv = test.NewEnvironment(ctx, env)
cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider)
cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}),
env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider)
linkedMachineCache = cache.New(time.Minute*10, time.Second*10)
linkController := &link.Controller{
Cache: linkedMachineCache,
Expand Down
5 changes: 4 additions & 1 deletion pkg/controllers/machine/link/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/tools/record"
. "knative.dev/pkg/logging/testing"
"sigs.k8s.io/controller-runtime/pkg/client"

coresettings "github.com/aws/karpenter-core/pkg/apis/settings"
"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
"github.com/aws/karpenter-core/pkg/events"
"github.com/aws/karpenter-core/pkg/operator/controller"
"github.com/aws/karpenter-core/pkg/operator/scheme"
coretest "github.com/aws/karpenter-core/pkg/test"
Expand Down Expand Up @@ -66,7 +68,8 @@ var _ = BeforeSuite(func() {
ctx = settings.ToContext(ctx, test.Settings())
env = coretest.NewEnvironment(scheme.Scheme, coretest.WithCRDs(apis.CRDs...))
awsEnv = test.NewEnvironment(ctx, env)
cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider)
cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}),
env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider)
linkController = link.NewController(env.Client, cloudProvider)
})
var _ = AfterSuite(func() {
Expand Down
5 changes: 4 additions & 1 deletion pkg/providers/instance/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,13 @@ import (
"github.com/samber/lo"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/record"
. "knative.dev/pkg/logging/testing"

coresettings "github.com/aws/karpenter-core/pkg/apis/settings"
"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
corecloudprovider "github.com/aws/karpenter-core/pkg/cloudprovider"
"github.com/aws/karpenter-core/pkg/events"
"github.com/aws/karpenter-core/pkg/operator/injection"
"github.com/aws/karpenter-core/pkg/operator/options"
"github.com/aws/karpenter-core/pkg/operator/scheme"
Expand Down Expand Up @@ -62,7 +64,8 @@ var _ = BeforeSuite(func() {
ctx = coresettings.ToContext(ctx, coretest.Settings())
ctx = settings.ToContext(ctx, test.Settings())
awsEnv = test.NewEnvironment(ctx, env)
cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider)
cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}),
env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider)
})

var _ = AfterSuite(func() {
Expand Down
5 changes: 4 additions & 1 deletion pkg/providers/instancetype/instancetype.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,11 @@ func (p *Provider) List(ctx context.Context, kc *v1alpha5.KubeletConfiguration,
if item, ok := p.cache.Get(key); ok {
return item.([]*cloudprovider.InstanceType), nil
}
result := lo.Map(instanceTypes, func(i *ec2.InstanceTypeInfo, _ int) *cloudprovider.InstanceType {
// Reject any instance types that don't have any offerings due to zone
result := lo.Reject(lo.Map(instanceTypes, func(i *ec2.InstanceTypeInfo, _ int) *cloudprovider.InstanceType {
return NewInstanceType(ctx, i, kc, p.region, nodeTemplate, p.createOfferings(ctx, i, instanceTypeZones[aws.StringValue(i.InstanceType)]))
}), func(i *cloudprovider.InstanceType, _ int) bool {
return len(i.Offerings) == 0
})
for _, instanceType := range instanceTypes {
InstanceTypeVCPU.With(prometheus.Labels{
Expand Down
39 changes: 38 additions & 1 deletion pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ var _ = BeforeSuite(func() {
awsEnv = test.NewEnvironment(ctx, env)

fakeClock = &clock.FakeClock{}
cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider)
cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}),
env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider)
cluster = state.NewCluster(fakeClock, env.Client, cloudProvider)
prov = provisioning.NewProvisioner(env.Client, env.KubernetesInterface.CoreV1(), events.NewRecorder(&record.FakeRecorder{}), cloudProvider, cluster)
})
Expand Down Expand Up @@ -1135,6 +1136,42 @@ var _ = Describe("Instance Types", func() {
}
})
It("shouldn't report more resources than are actually available on instances", func() {
awsEnv.EC2API.DescribeSubnetsOutput.Set(&ec2.DescribeSubnetsOutput{
Subnets: []*ec2.Subnet{
{
AvailabilityZone: aws.String("us-west-2a"),
SubnetId: aws.String("subnet-12345"),
},
},
})
awsEnv.EC2API.DescribeInstanceTypeOfferingsOutput.Set(&ec2.DescribeInstanceTypeOfferingsOutput{
InstanceTypeOfferings: []*ec2.InstanceTypeOffering{
{
InstanceType: aws.String("t4g.small"),
Location: aws.String("us-west-2a"),
},
{
InstanceType: aws.String("t4g.medium"),
Location: aws.String("us-west-2a"),
},
{
InstanceType: aws.String("t4g.xlarge"),
Location: aws.String("us-west-2a"),
},
{
InstanceType: aws.String("m5.large"),
Location: aws.String("us-west-2a"),
},
},
})
awsEnv.EC2API.DescribeInstanceTypesOutput.Set(&ec2.DescribeInstanceTypesOutput{
InstanceTypes: []*ec2.InstanceTypeInfo{
{InstanceType: aws.String("t4g.small")},
{InstanceType: aws.String("t4g.medium")},
{InstanceType: aws.String("t4g.xlarge")},
{InstanceType: aws.String("m5.large")},
},
})

ExpectApplied(ctx, env.Client, provisioner, nodeTemplate)
its, err := cloudProvider.GetInstanceTypes(ctx, provisioner)
Expand Down
3 changes: 2 additions & 1 deletion pkg/providers/launchtemplate/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ var _ = BeforeSuite(func() {
awsEnv = test.NewEnvironment(ctx, env)

fakeClock = &clock.FakeClock{}
cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider)
cloudProvider = cloudprovider.New(awsEnv.InstanceTypesProvider, awsEnv.InstanceProvider, events.NewRecorder(&record.FakeRecorder{}),
env.Client, awsEnv.AMIProvider, awsEnv.SecurityGroupProvider, awsEnv.SubnetProvider)
cluster = state.NewCluster(fakeClock, env.Client, cloudProvider)
prov = provisioning.NewProvisioner(env.Client, env.KubernetesInterface.CoreV1(), events.NewRecorder(&record.FakeRecorder{}), cloudProvider, cluster)
})
Expand Down

0 comments on commit 2aed007

Please sign in to comment.