diff --git a/charts/karpenter-crd/templates/karpenter.k8s.aws_ec2nodeclasses.yaml b/charts/karpenter-crd/templates/karpenter.k8s.aws_ec2nodeclasses.yaml index c2453a1152e3..a35e3dff300c 100644 --- a/charts/karpenter-crd/templates/karpenter.k8s.aws_ec2nodeclasses.yaml +++ b/charts/karpenter-crd/templates/karpenter.k8s.aws_ec2nodeclasses.yaml @@ -516,6 +516,23 @@ spec: - optional type: string type: object + placementGroupSelector: + description: PlacementGroupSelector defines the name or the id of the placement to resolve with the nodeclass. + properties: + id: + description: ID is the placement group id in EC2 + pattern: ^pg-[0-9a-z]+$ + type: string + name: + description: Name is the placement group name in EC2 + minLength: 1 + type: string + type: object + x-kubernetes-validations: + - message: expected at least one, got none, ['name', 'id'] + rule: has(self.name) || has(self.id) + - message: '''name'' and ''id'' are mutually exclusive' + rule: '!(has(self.name) && has(self.id))' role: description: |- Role is the AWS identity that nodes use. @@ -814,6 +831,50 @@ spec: instanceProfile: description: InstanceProfile contains the resolved instance profile for the role type: string + placementGroups: + description: PlacementGroups contains the placement group values that are available to this NodeClass. + items: + properties: + id: + description: The id for the placement group. + pattern: ^pg-[0-9a-z]+$ + type: string + name: + description: The name for the placement group. + type: string + partitionCount: + description: The partition count for the partition placement group. + format: int32 + type: integer + spreadLevel: + description: The spread level for the spread placement group. + enum: + - host + - rack + type: string + state: + default: available + description: The state of the placement group. + enum: + - available + - pending + - deleting + - deleted + type: string + strategy: + description: The strategy for the placement group. 
+ enum: + - cluster + - partition + - spread + type: string + required: + - id + - name + - state + - strategy + type: object + type: array securityGroups: description: |- SecurityGroups contains the current security group values that are available to the diff --git a/charts/karpenter-crd/templates/karpenter.sh_nodeclaims.yaml b/charts/karpenter-crd/templates/karpenter.sh_nodeclaims.yaml index ca6ebd520f0c..1287acbcc60e 100644 --- a/charts/karpenter-crd/templates/karpenter.sh_nodeclaims.yaml +++ b/charts/karpenter-crd/templates/karpenter.sh_nodeclaims.yaml @@ -137,7 +137,7 @@ spec: - message: label "kubernetes.io/hostname" is restricted rule: self != "kubernetes.io/hostname" - message: label domain "karpenter.k8s.aws" is restricted - rule: self in ["karpenter.k8s.aws/instance-tenancy", "karpenter.k8s.aws/capacity-reservation-type", "karpenter.k8s.aws/capacity-reservation-id", "karpenter.k8s.aws/capacity-reservation-interruptible", "karpenter.k8s.aws/ec2nodeclass", "karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu", "karpenter.k8s.aws/instance-cpu-manufacturer", "karpenter.k8s.aws/instance-cpu-sustained-clock-speed-mhz", "karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-ebs-bandwidth", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count", "karpenter.k8s.aws/instance-capability-flex"] || !self.find("^([^/]+)").endsWith("karpenter.k8s.aws") + rule: self in 
["karpenter.k8s.aws/instance-tenancy", "karpenter.k8s.aws/capacity-reservation-type", "karpenter.k8s.aws/capacity-reservation-id", "karpenter.k8s.aws/capacity-reservation-interruptible", "karpenter.k8s.aws/ec2nodeclass", "karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu", "karpenter.k8s.aws/instance-cpu-manufacturer", "karpenter.k8s.aws/instance-cpu-sustained-clock-speed-mhz", "karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-ebs-bandwidth", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count", "karpenter.k8s.aws/instance-capability-flex", "karpenter.k8s.aws/placement-group-id", "karpenter.k8s.aws/placement-group-partition"] || !self.find("^([^/]+)").endsWith("karpenter.k8s.aws") minValues: description: |- This field is ALPHA and can be dropped or replaced at any time diff --git a/charts/karpenter-crd/templates/karpenter.sh_nodepools.yaml b/charts/karpenter-crd/templates/karpenter.sh_nodepools.yaml index 3864abd09533..907869493f9a 100644 --- a/charts/karpenter-crd/templates/karpenter.sh_nodepools.yaml +++ b/charts/karpenter-crd/templates/karpenter.sh_nodepools.yaml @@ -225,7 +225,7 @@ spec: - message: label "kubernetes.io/hostname" is restricted rule: self.all(x, x != "kubernetes.io/hostname") - message: label domain "karpenter.k8s.aws" is restricted - rule: self.all(x, x in ["karpenter.k8s.aws/instance-tenancy", "karpenter.k8s.aws/capacity-reservation-type", 
"karpenter.k8s.aws/capacity-reservation-id", "karpenter.k8s.aws/capacity-reservation-interruptible", "karpenter.k8s.aws/ec2nodeclass", "karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu", "karpenter.k8s.aws/instance-cpu-manufacturer", "karpenter.k8s.aws/instance-cpu-sustained-clock-speed-mhz", "karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-ebs-bandwidth", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count", "karpenter.k8s.aws/instance-capability-flex"] || !x.find("^([^/]+)").endsWith("karpenter.k8s.aws")) + rule: self.all(x, x in ["karpenter.k8s.aws/instance-tenancy", "karpenter.k8s.aws/capacity-reservation-type", "karpenter.k8s.aws/capacity-reservation-id", "karpenter.k8s.aws/capacity-reservation-interruptible", "karpenter.k8s.aws/ec2nodeclass", "karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu", "karpenter.k8s.aws/instance-cpu-manufacturer", "karpenter.k8s.aws/instance-cpu-sustained-clock-speed-mhz", "karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-ebs-bandwidth", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", 
"karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count", "karpenter.k8s.aws/instance-capability-flex", "karpenter.k8s.aws/placement-group-id", "karpenter.k8s.aws/placement-group-partition"] || !x.find("^([^/]+)").endsWith("karpenter.k8s.aws")) type: object spec: description: |- @@ -294,7 +294,7 @@ spec: - message: label "kubernetes.io/hostname" is restricted rule: self != "kubernetes.io/hostname" - message: label domain "karpenter.k8s.aws" is restricted - rule: self in ["karpenter.k8s.aws/instance-tenancy", "karpenter.k8s.aws/capacity-reservation-type", "karpenter.k8s.aws/capacity-reservation-id", "karpenter.k8s.aws/capacity-reservation-interruptible", "karpenter.k8s.aws/ec2nodeclass", "karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu", "karpenter.k8s.aws/instance-cpu-manufacturer", "karpenter.k8s.aws/instance-cpu-sustained-clock-speed-mhz", "karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-ebs-bandwidth", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count", "karpenter.k8s.aws/instance-capability-flex"] || !self.find("^([^/]+)").endsWith("karpenter.k8s.aws") + rule: self in ["karpenter.k8s.aws/instance-tenancy", 
"karpenter.k8s.aws/capacity-reservation-type", "karpenter.k8s.aws/capacity-reservation-id", "karpenter.k8s.aws/capacity-reservation-interruptible", "karpenter.k8s.aws/ec2nodeclass", "karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu", "karpenter.k8s.aws/instance-cpu-manufacturer", "karpenter.k8s.aws/instance-cpu-sustained-clock-speed-mhz", "karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-ebs-bandwidth", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count", "karpenter.k8s.aws/instance-capability-flex", "karpenter.k8s.aws/placement-group-id", "karpenter.k8s.aws/placement-group-partition"] || !self.find("^([^/]+)").endsWith("karpenter.k8s.aws") minValues: description: |- This field is ALPHA and can be dropped or replaced at any time diff --git a/cmd/controller/main.go b/cmd/controller/main.go index bec13d6e27c9..e3d509f8d7a2 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -17,6 +17,7 @@ package main import ( v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" "github.com/aws/karpenter-provider-aws/pkg/cloudprovider" + "github.com/aws/karpenter-provider-aws/pkg/cloudprovider/registrationhooks" "github.com/aws/karpenter-provider-aws/pkg/controllers" "github.com/aws/karpenter-provider-aws/pkg/operator" @@ -60,6 +61,7 @@ func main() { overlayUndecoratedCloudProvider, clusterState, op.InstanceTypeStore, + 
corecontrollers.WithRegistrationHook(registrationhooks.NewPlacementGroupRegistrationHook(op.GetClient(), op.InstanceProvider)), )...). WithControllers(ctx, controllers.NewControllers( ctx, @@ -84,6 +86,7 @@ func main() { op.VersionProvider, op.InstanceTypesProvider, op.CapacityReservationProvider, + op.PlacementGroupProvider, op.AMIResolver, )...). Start(ctx) diff --git a/designs/placement-groups-support.md b/designs/placement-groups-support.md new file mode 100644 index 000000000000..ce2df98111cf --- /dev/null +++ b/designs/placement-groups-support.md @@ -0,0 +1,569 @@ +# Placement Groups Support + +This document proposes supporting placement groups in Karpenter + +- [Placement Groups Support](#placement-groups-support) + * [Overview](#overview) + + [Placement Groups](#placement-groups) + + [Placement Group Placement Strategies](#placement-group-placement-strategies) + * [Customer Use Cases](#customer-use-cases) + + [ML Training with EFA (Cluster Placement Group)](#ml-training-with-efa-cluster-placement-group) + + [Kafka with Partition Isolation (Partition Placement Group)](#kafka-with-partition-isolation-partition-placement-group) + + [ETCD with Hardware Spread (Spread Placement Group)](#etcd-with-hardware-spread-spread-placement-group) + * [Goals](#goals) + * [Non-Goals](#non-goals) + * [Placement Group Selection](#placement-group-selection) + + [EC2NodeClass API](#ec2nodeclass-api) + + [Labels](#labels) + + [NodePool API](#nodepool-api) + * [Scheduling and Launch Behavior](#scheduling-and-launch-behavior) + + [Strategy-Specific Behavior](#strategy-specific-behavior) + + [Partition Label Assignment via Registration Hook](#partition-label-assignment-via-registration-hook) + + [Placement Groups as Constraints on Instance Type Offerings](#placement-groups-as-constraints-on-instance-type-offerings) + + [Interaction with Capacity Reservations (ODCRs)](#interaction-with-capacity-reservations-odcrs) + * [Placement Group-Aware ICE 
Cache](#placement-group-aware-ice-cache) + * [Pricing/Consolidation](#pricingconsolidation) + * [Drift](#drift) + * [Spread Placement Group Disruption Limitations](#spread-placement-group-disruption-limitations) + * [Appendix](#appendix) + + [Input/Output for CreateFleet with Placement Groups](#inputoutput-for-createfleet-with-placement-groups) + + [Strategy-Specific Limitations](#strategy-specific-limitations) + +## Overview + +In AWS, [Placement Groups](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html) allow users to launch a group of interdependent EC2 instances with an influence on placement that the workload requires. + +This RFC outlines the proposed API and implementation of support for Placement Groups within Karpenter. This feature will allow users to select a single placement group through `placementGroupSelector` in their EC2NodeClass. Karpenter will then discover and use this placement group during scheduling and disruption (including consolidation) simulations to ensure instances are launched respecting the placement constraints. + +**Key design principle:** Each EC2NodeClass maps to exactly one placement group. All instances launched from that EC2NodeClass go into the resolved placement group — this is not conditional on application topology requirements. 
+ +### Placement Groups + +Each [Placement Group](https://pkg.go.dev/github.com/aws/aws-sdk-go-v2/service/ec2/types#PlacementGroup) is defined with: + +- The placement strategy for which to launch instances + - [Cluster](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-strategies.html#placement-groups-cluster) -- logical grouping of instances within a single Availability Zone that enjoy a higher per-flow throughput limit for TCP/IP traffic and are placed in the same high-bisection bandwidth segment of the network + - [Partition](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-strategies.html#placement-groups-partition) -- multiple logical groupings of instances across one or more Availability Zones called partitions where no two partitions within the placement group share the same racks, allowing you to isolate the impact of hardware failure within your application + - [Spread](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-strategies.html#placement-groups-spread) -- logical grouping of instances across a single Region with access to distinct hardware to minimize the risk of simultaneous failures that might occur when instances share the same equipment +- The partition count for partition strategy placement groups +- The spread level for spread strategy placement groups + +Placement Groups also have these limitations: + +- You can't launch Dedicated Hosts in placement groups. +- You can't launch a Spot Instance that is configured to stop or hibernate on interruption in a placement group. Since Karpenter always uses the `terminate` interruption behavior, spot instances are fully compatible with all placement group strategies. + +### Placement Group Placement Strategies + +Currently, [Placement Groups](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html) supports three placement strategies -- cluster, partition, and spread. Each strategy has its own sets of limitations that are relevant for scheduling. 
See also [Strategy-Specific Limitations](#strategy-specific-limitations) in the Appendix for a summary table. + +- [Cluster Placement Groups](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-strategies.html#placement-groups-cluster) + - A cluster placement group can't span multiple Availability Zones + - Recommended to use a single launch request to launch the number of instances that you need in the placement group and to use the same instance type for all instances in the placement group. (Note: this limitation can be mitigated by creating an On-Demand Capacity Reservation linked to the cluster placement group, which reserves capacity and avoids incremental ICE risk when adding instances over time.) + - If you try to add more instances to the placement group later, or if you try to launch more than one instance type in the placement group, you increase your chances of getting an insufficient capacity error. + - If you receive a capacity error when launching an instance in a placement group that already has running instances, stop and start all of the instances in the placement group, and try the launch again. Starting the instances may migrate them to hardware that has capacity for all of the requested instances. + - There is an instance type restriction: only the following are supported — previous generation instances (A1, C3, C4, I2, M4, R3, and R4) and current generation instances, except for [burstable performance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/burstable-performance-instances.html) instances (for example, T2), [Mac1](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-mac-instances.html) instances, and M7i-flex instances. +- [Partition Placement Groups](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-strategies.html#placement-groups-partition) + - A partition placement group can have a maximum of seven partitions per Availability Zone. 
+ - When instances are launched into a partition placement group, Amazon EC2 tries to evenly distribute the instances across all partitions. Amazon EC2 doesn't guarantee an even distribution of instances across all partitions. + - A partition placement group with Dedicated Instances can have a maximum of two partitions. + - Capacity Reservations do not reserve capacity in a partition placement group. +- [Spread Placement Groups](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-strategies.html#placement-groups-spread) + - There are two types of spread placement groups: rack-level spread placement groups (available in AWS Regions and on AWS Outposts) and host-level spread placement groups (available on AWS Outposts only). + - A rack-level spread placement group can span multiple Availability Zones in the same Region. + - In a Region, a rack-level spread placement group can have a maximum of seven running instances per Availability Zone per group. + - If you need more than seven instances in an Availability Zone, we recommend that you use multiple spread placement groups. + - Using multiple spread placement groups does not provide guarantees about the spread of instances between groups, but it does help ensure the spread for each group, thus limiting the impact from certain classes of failures. + - Spread placement groups are not supported for Dedicated Instances. + - Capacity Reservations do not reserve capacity in a spread placement group. + +## Customer Use Cases + +The following use cases motivate the design decisions in this RFC. + +### ML Training with EFA (Cluster Placement Group) + +Distributed ML training workloads require low-latency, high-throughput networking between GPU nodes. A cluster placement group ensures all training nodes are physically colocated in the same AZ on the same network segment, which is critical for EFA (Elastic Fabric Adapter) performance. 
+ +```yaml +apiVersion: karpenter.k8s.aws/v1 +kind: EC2NodeClass +metadata: + name: ml-training +spec: + placementGroupSelector: + name: "ml-training-pg" + role: "KarpenterNodeRole-my-cluster" + amiSelectorTerms: + - alias: al2023@latest + subnetSelectorTerms: + - tags: + karpenter.sh/discovery: "my-cluster" + securityGroupSelectorTerms: + - tags: + karpenter.sh/discovery: "my-cluster" +--- +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: ml-training +spec: + template: + spec: + nodeClassRef: + group: karpenter.k8s.aws + kind: EC2NodeClass + name: ml-training + requirements: + - key: node.kubernetes.io/instance-type + operator: In + values: ["p5.48xlarge", "p5e.48xlarge"] + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + - key: topology.kubernetes.io/zone + operator: In + values: ["us-east-1a"] + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 1h + limits: + cpu: "1000" + nvidia.com/gpu: "64" +``` + +### Kafka with Partition Isolation (Partition Placement Group) + +Kafka brokers benefit from partition placement groups to achieve hardware fault isolation between brokers. By using TSCs with the `karpenter.k8s.aws/placement-group-partition` label, brokers are spread across different partitions, ensuring no two brokers share the same underlying rack. + +```yaml +apiVersion: karpenter.k8s.aws/v1 +kind: EC2NodeClass +metadata: + name: kafka-brokers +spec: + placementGroupSelector: + name: "kafka-partitioned-pg" + ... 
+ blockDeviceMappings: + - deviceName: /dev/xvda + ebs: + volumeSize: 100Gi + volumeType: gp3 +--- +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: kafka-brokers +spec: + template: + spec: + nodeClassRef: + group: karpenter.k8s.aws + kind: EC2NodeClass + name: kafka-brokers + requirements: + - key: node.kubernetes.io/instance-type + operator: In + values: ["i4i.8xlarge", "i8g.8xlarge"] + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + taints: + - key: workload-type + value: kafka + effect: NoSchedule + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: Never + limits: + cpu: "200" +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: kafka +spec: + replicas: 7 + template: + spec: + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: karpenter.k8s.aws/placement-group-partition + whenUnsatisfiable: DoNotSchedule + labelSelector: + matchLabels: + app: kafka + tolerations: + - key: workload-type + value: kafka + effect: NoSchedule +``` + +### ETCD with Hardware Spread (Spread Placement Group) + +ETCD clusters require high availability with each member on distinct hardware to minimize the risk of correlated failures. A spread placement group ensures each ETCD node is placed on a separate rack, providing hardware-level fault isolation. + +```yaml +apiVersion: karpenter.k8s.aws/v1 +kind: EC2NodeClass +metadata: + name: etcd +spec: + placementGroupSelector: + name: "etcd-spread-pg" + ... 
+--- +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: etcd +spec: + template: + spec: + nodeClassRef: + group: karpenter.k8s.aws + kind: EC2NodeClass + name: etcd + requirements: + - key: node.kubernetes.io/instance-type + operator: In + values: ["m7g.xlarge", "m7g.2xlarge"] + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: Never + limits: + cpu: "40" +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: etcd +spec: + replicas: 5 + template: + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app: etcd + topologyKey: kubernetes.io/hostname +``` + +Since the spread placement group guarantees each instance is on distinct hardware, the `podAntiAffinity` on `kubernetes.io/hostname` ensures one ETCD member per node, and the placement group ensures those nodes are on separate racks. + +## Goals + +1. Allow selection of cluster Placement Groups with Karpenter +2. Allow selection of partition, rack-level spread, and host-level spread Placement Groups with Karpenter +3. Ensure Karpenter launches capacity into a Placement Group respecting strategy-specific constraints (e.g., single AZ for cluster, 7 instances per AZ for spread) +4. Ensure Karpenter consolidates respecting Placement Group constraints expressed through offerings and pod scheduling rules +5. Allow users to constrain a NodePool to only launch into a specified Placement Group + +## Non-Goals + +Below lists the non-goals for _this RFC design._ Each of these items represents potential follow-ups for the initial implementation and are features we will consider based on feature requests. + +1. 
Create/Manage/Delete Placement Groups based on application topology requirements + +## Placement Group Selection + +### EC2NodeClass API + +- Add a new struct under `spec` for `placementGroupSelector` to `EC2NodeClass` for defining which Placement Group to be used for a specific `EC2NodeClass` + - Each EC2NodeClass maps to exactly one Placement Group + - The struct accepts either a placement group name or id as a string value +- Add a new field under `status` for the resolved Placement Group by the `spec.placementGroupSelector` for the `EC2NodeClass` + +```yaml +apiVersion: karpenter.k8s.aws/v1 +kind: EC2NodeClass +metadata: + name: example-node-class +spec: + # placementGroupSelector specifies a placement group name or id to identify + # a single Placement Group via the EC2 DescribePlacementGroups API. + placementGroupSelector: + name: String | None + id: String | None + # CEL validation ensures placementGroupSelector is not empty and is either + # a placement group name or a placement group id (pg-* prefix) is populated +status: + conditions: + - # PlacementGroupReady indicates whether the placement group specified + # by spec.placementGroupSelector has been successfully resolved. + # The EC2NodeClass is not ready if this condition is False, + # blocking all launches from this NodeClass. + type: PlacementGroupReady + # placementGroup contains the resolved placement group details. 
+ placementGroup: + # Id for the Placement Group + id: String + # Name of the Placement Group + name: String + # Number of partitions for partition Placement Groups + partitionCount: int | None + # Spread level for spread Placement Groups {host, rack} + spreadLevel: String | None + # State of the placement group + # {pending, available, deleting, deleted} + # Karpenter sets PlacementGroupReady to False for any state other than "available" + state: String + # Strategy of the placement group + # {cluster, partition, spread} + strategy: String +``` + +This API closely follows how [DescribePlacementGroups](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribePlacementGroups.html) can filter placement groups -- allowing Karpenter to receive the server-side filtered version of the placement groups to store in its status. + +### Labels + +When Karpenter launches an instance into a placement group, it will apply the following well-known labels to the Node/NodeClaim: + +| Label | Values | Description | +|-------|--------|-------------| +| `karpenter.k8s.aws/placement-group-id` | Placement group ID (e.g., `"pg-0123456789abcdef0"`) | Uniquely identifies the placement group the node belongs to. Used as the primary key for offering requirements and ICE cache scoping. | +| `karpenter.k8s.aws/placement-group-partition` | Partition number as a string (e.g., `"2"`) | The partition number (**partition strategy only**) | + +These labels serve multiple purposes: + +1. **Pod Scheduling** -- Applications can use node selectors or node affinities on these labels to ensure pods land on nodes in specific placement groups or partitions +2. 
**Drift Detection** -- Karpenter uses these labels to detect when a node's placement group membership has changed relative to the EC2NodeClass's `placementGroupSelector` + +Example Node labels for an instance in a partition placement group: + +```yaml +metadata: + labels: + karpenter.k8s.aws/placement-group-id: "pg-0123456789abcdef0" + karpenter.k8s.aws/placement-group-partition: "2" # partition strategy only +``` + +### NodePool API + +The EC2NodeClass determines the placement group; the NodePool expresses additional constraints via `requirements` using the labels defined above (e.g., constraining to a specific partition). + +**Cluster** — pin the AZ since cluster PGs are single-AZ: +```yaml +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: cluster-placement +spec: + template: + spec: + nodeClassRef: + group: karpenter.k8s.aws + kind: EC2NodeClass + name: cluster-placement-node-class +``` + +**Partition** — constrain instance types; use TSCs in workloads to spread across partitions: +```yaml +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: partition-placement +spec: + template: + spec: + nodeClassRef: + group: karpenter.k8s.aws + kind: EC2NodeClass + name: partition-placement-node-class + requirements: + - key: node.kubernetes.io/instance-type + operator: In + values: ["i4i.8xlarge", "i8g.8xlarge"] + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] +``` + +An application can then use TSCs to spread across partitions: + +```yaml +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: kafka +spec: + replicas: 7 + template: + spec: + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: karpenter.k8s.aws/placement-group-partition + whenUnsatisfiable: DoNotSchedule + labelSelector: + matchLabels: + app: kafka +``` + +**Spread** — no additional requirements needed; EC2 enforces the 7-instance-per-AZ limit and Karpenter handles the resulting errors: +```yaml +apiVersion: karpenter.sh/v1 +kind: NodePool 
+metadata: + name: spread-placement +spec: + template: + spec: + nodeClassRef: + group: karpenter.k8s.aws + kind: EC2NodeClass + name: spread-placement-node-class + # No additional requirements needed -- EC2 enforces + # the 7-instance-per-AZ limit; Karpenter handles the errors + requirements: [] +``` + +## Scheduling and Launch Behavior + +Since Placement Groups are an AWS-specific concept, there needs to be a mechanism to pass down these placement constraints for the scheduler to reason about. The placement group ID is specified in the launch template's `Placement` block when calling `CreateFleet`. Karpenter will use the `lowest-price` allocation strategy for on-demand instances and the `price-capacity-optimized` allocation strategy for spot instances, consistent with current behavior. The placement group constraint is applied via the launch template rather than through the allocation strategy. + +### Strategy-Specific Behavior + +Each placement group strategy constrains the scheduler and `CreateFleet` call differently: + +| Strategy | AZ Behavior | Targeting | Capacity Limit | ICE Handling | +|----------|------------|-----------|----------------|--------------| +| **Cluster** | Single AZ. If PG is empty and NodePool allows multiple AZs, Karpenter passes all valid subnet overrides (across AZs) in the CreateFleet request; EC2's allocation strategy (`lowest-price` for OD, `price-capacity-optimized` for Spot) selects the AZ based on availability and price. If PG already has instances, Karpenter determines the existing AZ via `DescribeInstances` filtered by placement group (to account for instances launched outside of Karpenter) and constrains subsequent launches to that AZ. | N/A | No hard limit, but ICE risk increases when adding instances incrementally. Karpenter retries with alternative instance types. 
| ICE scoped to `(PG ID, instance type, zone)` — does not block non-PG launches of the same instance type + zone | +| **Partition** | Multi-AZ | Karpenter does not target a specific partition unless pod scheduling constraints require it (via node selector/affinity on `karpenter.k8s.aws/placement-group-partition`). EC2 auto-assigns partitions. The partition label is populated during node registration via a `RegistrationHook` (see [Partition Label Assignment via Registration Hook](#partition-label-assignment-via-registration-hook)). | 7 partitions per AZ. Capacity Reservations do not reserve capacity within the group. | ICE scoped to `(PG ID, [partition,] instance type, zone)` — when a specific partition is targeted, only that partition is marked unavailable; other partitions remain eligible | +| **Spread** | Multi-AZ | N/A | **Hard limit: 7 instances per AZ per group**, enforced by EC2. When a `CreateFleet` call exceeds the limit, EC2 returns an `InsufficientInstanceCapacity` error (error code `UnfulfillableCapacity`) with the message `"You've reached the limit of instances in this spread placement group. A spread placement group can have up to seven instances per Availability Zone."` Karpenter parses this error message to distinguish it from genuine capacity shortages and marks the AZ as unavailable for that placement group in the ICE cache. Capacity Reservations do not reserve capacity within the group. | Spread limit: all instance types in the AZ marked unavailable for this PG. Genuine capacity error: scoped to `(PG ID, instance type, zone)` | + +**Note on cluster PG launches:** A cluster placement group is confined to a single AZ. Karpenter does not proactively discover the AZ of a non-empty cluster PG — instead, it relies on EC2 to enforce the single-AZ constraint. When a cluster PG is empty, all AZs are eligible and Karpenter passes all valid subnet overrides in the `CreateFleet` request; EC2 selects the AZ. During the initial scale-up, parallel `CreateFleet` calls may target different AZs. 
The first successful call pins the PG to an AZ, and calls targeting other AZs will fail. When the PG already has instances, EC2 automatically constrains new launches to the pinned AZ and returns `InsufficientInstanceCapacity` for overrides targeting other AZs. These failures are expected and handled as typical ICE — Karpenter retries with alternative instance types on subsequent scheduling loops. Users can avoid these transient failures by pinning the AZ in their NodePool requirements. + +### Partition Label Assignment via Registration Hook + +For partition placement groups, the `karpenter.k8s.aws/placement-group-partition` label cannot be determined until after launch, since EC2 auto-assigns the partition and the assignment is only discoverable via `DescribeInstances`. To ensure TopologySpreadConstraints using `karpenter.k8s.aws/placement-group-partition` as the topology key always see accurate partition data, Karpenter uses a `RegistrationHook` to gate node registration until the partition label is populated. + +**How it works:** + +1. When a NodeClaim is created for an EC2NodeClass with a partition placement group, the instance is launched via `CreateFleet` without specifying a `PartitionNumber`, allowing EC2 to auto-assign the partition. +2. During node registration, before the `karpenter.sh/unregistered` taint is removed, the `PlacementGroupRegistrationHook` runs as part of the NodeClaim lifecycle controller's registration phase. +3. The hook resolves the EC2NodeClass from the NodeClaim's `spec.nodeClassRef` and checks `status.placementGroups` to determine if this is a partition placement group. If not, the hook passes through immediately. +4. For partition placement groups, the hook calls `DescribeInstances` to discover the EC2-assigned partition number from `Placement.PartitionNumber`. +5. Once the partition number is available, the hook sets the `karpenter.k8s.aws/placement-group-partition` label on the NodeClaim and allows registration to proceed. 
The label is then synced to the Node as part of the normal registration sync. +6. If the partition number is not yet available (e.g., the instance is still initializing), the hook returns `false`, causing the lifecycle controller to requeue after 1 second and retry. + +This approach leverages the existing `karpenter.sh/unregistered` taint to block pod scheduling until the partition label is set, without requiring any additional startup taints. The hook is registered via `WithRegistrationHook()` in `cmd/controller/main.go` and is evaluated alongside any other registration hooks before the unregistered taint is removed. + +### Placement Groups as Constraints on Instance Type Offerings + +Unlike ODCRs which add additional offerings, placement groups primarily _constrain_ existing offerings. When an EC2NodeClass resolves a placement group, Karpenter filters instance type offerings based on the strategy and adds placement group labels as requirements on offerings so the scheduler can match them against NodePool/pod constraints. This is analogous to how `karpenter.sh/capacity-type: reserved` is used in the ODCR design. + +**Strategy-specific filtering:** + +- **Cluster**: Filter out unsupported instance types (burstable, Mac1, M7i-flex). If the PG already has instances, filter offerings to only the existing AZ. +- **Partition**: No AZ or instance type filtering. Offerings are expanded into per-partition variants (one per partition count) to support TopologySpreadConstraints on `karpenter.k8s.aws/placement-group-partition`. When a specific partition is targeted (via nodeSelector/affinity), the scheduler picks only offerings for that partition, and the partition number is passed through to the launch template's `Placement.PartitionNumber`. +- **Spread**: No proactive filtering during offering resolution. 
The 7-instance-per-AZ limit is enforced reactively by EC2 — when the limit is exceeded, EC2 returns an `InsufficientInstanceCapacity` error with a spread-specific message, and Karpenter marks the AZ as unavailable for that placement group in the ICE cache. + +Example offering for a `p5.48xlarge` in a placement group: + +```yaml +name: p5.48xlarge +offerings: + - price: 98.32 + available: 4294967295 + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + - key: topology.kubernetes.io/zone + operator: In + values: ["us-east-1a"] + - key: karpenter.k8s.aws/placement-group-id + operator: In + values: ["pg-dsf832nr1232"] +``` + +### Interaction with Capacity Reservations (ODCRs) + +Placement groups and capacity reservations are orthogonal constraints applied at the EC2 launch template level — placement groups control physical placement via `Placement.GroupId`, while capacity reservations control capacity sourcing via `CapacityReservationSpecification`. Both can be specified simultaneously on a single launch. + +When an EC2NodeClass specifies both `placementGroupSelector` and `capacityReservationSelectorTerms`, Karpenter's existing launch ordering naturally handles the interaction: + +1. **Reserved capacity first**: The scheduler considers reserved offerings (ODCR ∩ NodePool requirements). If a match is found, CreateFleet targets both the capacity reservation and the placement group. The instance consumes the reserved capacity and is physically placed within the placement group. +2. **Fallback to on-demand/spot**: If no reserved capacity is available, the scheduler falls back to on-demand or spot offerings. The instance is launched into the placement group without a capacity reservation, with a broader set of instance types to choose from. + +This behavior applies uniformly to all placement group strategies (cluster, partition, and spread). 
Note that while EC2 supports creating capacity reservations *within* a cluster placement group, capacity reservations cannot be scoped to partition or spread placement groups. However, standalone capacity reservations (not associated with any placement group) can still be consumed by instances launched into any placement group strategy. + +**Spread placement group limit and capacity reservations:** When a reserved launch fails due to the spread placement group 7-instance-per-AZ limit, Karpenter does not mark the capacity reservation as unavailable. The spread limit is a placement constraint failure, not a capacity reservation failure — the CR still has available capacity. This ensures that when a PG slot frees up (e.g., after a node is deleted), subsequent launches can still target the reserved capacity rather than falling back to on-demand. + +## Placement Group-Aware ICE Cache + +Karpenter's existing ICE (Insufficient Capacity Error) cache tracks instance type + AZ combinations that have recently failed. This cache must be extended to be placement group-aware so that an ICE from a placement group launch does not incorrectly prevent launches of the same instance type + AZ combination outside of that placement group. + +When an ICE occurs, Karpenter caches at the most granular failure domain that was targeted. The placement group ID (`pg-*`) is used as the cache key rather than the placement group name, since IDs are immutable and globally unique: + +| Strategy | ICE Cache Key | What remains unaffected | +|----------|--------------|------------------------| +| **Cluster** | `(placement group ID, instance type)` | Same instance type + AZ without this PG | +| **Partition** | `(placement group ID, [partition,] instance type, AZ)` | Other partitions, other AZs within this PG, non-PG launches in same AZ | +| **Spread** | `(placement group ID, AZ, instance type)` for genuine capacity errors; `(placement group ID, AZ)` when the 7-instance limit is reached. 
Karpenter distinguishes these by parsing the EC2 error message — if it contains `"spread placement group"`, it's the 7-instance limit and the entire AZ is marked unavailable for that PG; otherwise, it's a genuine capacity error scoped to the instance type. | Other PGs, non-PG launches in same AZ; for genuine capacity errors, other instance types in the same AZ within this PG | + +**Example:** NodePool A targets cluster PG `pg-0123456789abcdef0` (`ml-training-pg`) in `us-east-1a`. NodePool B has no placement group. If `p5.48xlarge` in `us-east-1a` returns ICE when launching into `pg-0123456789abcdef0`, the cache entry is `(pg-0123456789abcdef0, p5.48xlarge)`. NodePool B can still launch `p5.48xlarge` in `us-east-1a`. + +## Pricing/Consolidation + +Placement groups do not directly affect pricing, but they constrain the set of valid instance types and availability zones, which indirectly affects cost optimization. + +**No additional consolidation logic is added for placement groups.** Karpenter's standard consolidation behavior applies — consolidation works through scheduling simulation against available offerings. Since the placement group ID is a requirement on the offering (set by the EC2NodeClass), placement group membership is naturally preserved when a pod is re-scheduled onto a replacement node within the same NodePool/NodeClass. + +**Cross-placement-group consolidation:** With this implementation, placement group membership is a NodeClass-level constraint, not an application-level constraint. If a pod has no placement-group-related scheduling constraints (no `nodeSelector`, `nodeAffinity`, or `podAffinity` on placement group labels), and multiple NodePools can serve it (one with a PG, one without, or two with different PGs), Karpenter's consolidation may move the pod to a different placement group or out of a placement group entirely if it finds a cheaper option. 
Applications that require placement group membership should express this via pod-level constraints (e.g., `nodeSelector` on `karpenter.k8s.aws/placement-group-id`) to ensure consolidation respects their placement requirements. + +**Within-placement-group consolidation:** When consolidating within the same NodePool/NodeClass, the replacement node naturally launches into the same placement group because the offering requirements include the placement group ID. EC2 strategy-specific limits (single AZ for cluster, 7 instances per AZ for spread) are enforced by EC2 at launch time — if a replacement launch violates these limits, it fails with an ICE and Karpenter handles it through the standard ICE cache mechanism. + +## Drift + +Nodes are marked as drifted when their placement group ID label (`karpenter.k8s.aws/placement-group-id`) no longer matches the EC2NodeClass's resolved placement group ID (the `id` field of the resolved entry in `status.placementGroups`). This is checked explicitly in the `isPlacementGroupDrifted` function, which compares the NodeClaim's `placement-group-id` label against the resolved PG ID from the NodeClass status. 
+ +| Scenario | Detection | Recovery | +|----------|-----------|----------| +| `placementGroupSelector` added to an EC2NodeClass that previously had none | Existing nodes lack `placement-group-id` label | Nodes drifted, replaced into the placement group | +| `placementGroupSelector` removed | Existing nodes have a `placement-group-id` label that no longer matches | Nodes drifted, replaced without placement group | +| `placementGroupSelector` changed to a different placement group | `placement-group-id` label value differs from resolved PG ID | Nodes drifted, replaced into new placement group | +| Placement group deleted externally | `PlacementGroupReady` condition → `False`; EC2NodeClass becomes not ready, blocking all launches | Since a placement group cannot be deleted while it still contains instances, this scenario implies all instances have already been terminated and there are no existing nodes to drift. The only effect is that the EC2NodeClass becomes not ready, blocking future launches until `placementGroupSelector` is updated to a valid placement group. | + +Karpenter detects a deleted placement group via `DescribePlacementGroups` returning no results. Note that EC2 prevents deletion of a placement group that still contains running instances, so this scenario only arises after all instances in the group have been terminated. + +## Spread Placement Group Disruption Limitations + +Spread placement groups impose a hard limit of 7 running instances per AZ per group, enforced by EC2. This limit creates a known limitation for Karpenter's disruption (drift, consolidation) workflows: + +**Replace-then-delete is blocked at capacity:** Karpenter's disruption model launches a replacement node before terminating the old one. When a spread PG is at 7 instances in an AZ, launching an 8th instance in that AZ fails with an `InsufficientInstanceCapacity` error. 
If all AZs are at capacity (7 × number of AZs), no replacement can be scheduled and disruption is blocked entirely — drifted or consolidation-candidate nodes remain running until capacity frees up. + +**No fallback to non-placement-group launches:** The current implementation does not attempt to launch replacement instances outside the placement group. All offerings from an EC2NodeClass with a placement group unconditionally include the placement group ID as a scheduling requirement, and all launch templates include the placement group in the `Placement` block. + +Users who need consolidation to function at spread PG capacity limits should use `WhenEmpty` consolidation (`spec.disruption.consolidationPolicy: WhenEmpty` with a `consolidateAfter` duration). With `WhenEmpty`, Karpenter deletes a node only after all non-daemonset pods have been drained from it — freeing a placement group slot without requiring a replacement launch first. Users are responsible for ensuring pods are moved off the node (e.g., by scaling down the workload or using pod disruption budgets), at which point Karpenter detects the node as empty, waits the `consolidateAfter` duration, and deletes it. Note that `WhenEmpty` only applies to consolidation — drift always uses replace-then-delete regardless of the consolidation policy, so drift remains blocked when the spread PG is at capacity. 
+ +## Appendix + +### Input/Output for CreateFleet with Placement Groups + +The following table documents the `CreateFleet` API behavior when specifying placement groups through the launch template's `Placement` block: + +| Scenario | Placement Configuration | Result | +|----------|------------------------|--------| +| Single placement group targeting | `Placement.GroupId` set in launch template | Instances launched into the specified placement group | +| Empty cluster PG, multiple AZs allowed | `Placement.GroupId` set, no AZ constraint | EC2 selects AZ to maximize instance type availability | +| Non-empty cluster PG | `Placement.GroupId` set, PG already has instances | Instances launched in the same AZ as existing instances; ICE if no capacity | +| Cluster PG, instance type not available | `Placement.GroupId` set, specific instance type | `InsufficientInstanceCapacity` error; Karpenter retries with alternative instance types | +| Partition PG, no partition specified | `Placement.GroupId` set, no `PartitionNumber` | EC2 auto-assigns partition, distributing instances across partitions | +| Partition PG, specific partition | `Placement.GroupId` + `Placement.PartitionNumber` set | Instance launched into the specified partition | +| Spread PG at capacity | `Placement.GroupId` set, 7 instances already in AZ | `InsufficientInstanceCapacity` error | + +### Strategy-Specific Limitations + +| Strategy | AZ Constraint | Instance Limit | Instance Type Restrictions | Capacity Reservation Support | Spot Support | +|----------|--------------|----------------|---------------------------|------------------------------|-------------| +| Cluster | Single AZ | No hard limit (ICE risk increases) | No burstable, Mac1, M7i-flex | Yes | Yes (terminate only) | +| Partition | Per-AZ partitions | 7 partitions per AZ | None | No | Yes (terminate only) | +| Spread (rack) | Multi-AZ | 7 instances per AZ per group | None | No | Yes (terminate only) | +| Spread (host) | Single AZ (Outposts only) 
| 1 instance per host | None | No | Yes (terminate only) | diff --git a/go.mod b/go.mod index 6051cc6e9bef..085dacfcfba5 100644 --- a/go.mod +++ b/go.mod @@ -50,7 +50,7 @@ require ( k8s.io/klog/v2 v2.130.1 k8s.io/utils v0.0.0-20251222233032-718f0e51e6d2 sigs.k8s.io/controller-runtime v0.22.4 - sigs.k8s.io/karpenter v1.10.0 + sigs.k8s.io/karpenter v1.10.1-0.20260323181619-defdfae64097 sigs.k8s.io/yaml v1.6.0 ) diff --git a/go.sum b/go.sum index 98cc0aceda74..8e44ad7669b6 100644 --- a/go.sum +++ b/go.sum @@ -375,8 +375,8 @@ sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327U sigs.k8s.io/controller-runtime v0.22.4/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= -sigs.k8s.io/karpenter v1.10.0 h1:F8cupDXyn5c7TQDgTSj86nPmUJxFaV0wxu5HIdp+TJc= -sigs.k8s.io/karpenter v1.10.0/go.mod h1:XQtYAxoCysLHjytci7Fx5zw2txgcW2Vxc+qq6DDiFX8= +sigs.k8s.io/karpenter v1.10.1-0.20260323181619-defdfae64097 h1:5ISIicLPMylfC6SxAJlXnlXeAWEJyPmF/gptXvKtdno= +sigs.k8s.io/karpenter v1.10.1-0.20260323181619-defdfae64097/go.mod h1:XQtYAxoCysLHjytci7Fx5zw2txgcW2Vxc+qq6DDiFX8= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/structured-merge-diff/v6 v6.3.1 h1:JrhdFMqOd/+3ByqlP2I45kTOZmTRLBUm5pvRjeheg7E= diff --git a/hack/code/instancetype_testdata_gen/main.go b/hack/code/instancetype_testdata_gen/main.go index 341fb14d5dc6..809ac21db8dd 100644 --- a/hack/code/instancetype_testdata_gen/main.go +++ b/hack/code/instancetype_testdata_gen/main.go @@ -237,6 +237,11 @@ func getInstanceTypeInfo(info ec2types.InstanceTypeInfo) string { } fmt.Fprintf(src, "},\n") fmt.Fprintf(src, "},\n") + if info.PlacementGroupInfo != nil { + fmt.Fprintf(src, 
"PlacementGroupInfo: &ec2types.PlacementGroupInfo{\n") + fmt.Fprintf(src, "SupportedStrategies: []ec2types.PlacementGroupStrategy{%s},\n", getStringSliceData(info.PlacementGroupInfo.SupportedStrategies)) + fmt.Fprintf(src, "},\n") + } return src.String() } @@ -279,6 +284,6 @@ func getGPUDeviceInfo(info ec2types.GpuDeviceInfo) string { return src.String() } -func getStringSliceData[T ec2types.UsageClassType | ec2types.VirtualizationType | ec2types.ArchitectureType](slice []T) string { +func getStringSliceData[T ec2types.UsageClassType | ec2types.VirtualizationType | ec2types.ArchitectureType | ec2types.PlacementGroupStrategy](slice []T) string { return strings.Join(lo.Map(slice, func(s T, _ int) string { return fmt.Sprintf(`"%s"`, s) }), ",") } diff --git a/hack/validation/labels.sh b/hack/validation/labels.sh index e453b3cdd058..6f23bac228ac 100755 --- a/hack/validation/labels.sh +++ b/hack/validation/labels.sh @@ -2,7 +2,7 @@ function injectDomainLabelRestrictions() { domain=$1 - rule="self.all(x, x in [\"${domain}/instance-tenancy\", \"${domain}/capacity-reservation-type\", \"${domain}/capacity-reservation-id\", \"${domain}/capacity-reservation-interruptible\", \"${domain}/ec2nodeclass\", \"${domain}/instance-encryption-in-transit-supported\", \"${domain}/instance-category\", \"${domain}/instance-hypervisor\", \"${domain}/instance-family\", \"${domain}/instance-generation\", \"${domain}/instance-local-nvme\", \"${domain}/instance-size\", \"${domain}/instance-cpu\", \"${domain}/instance-cpu-manufacturer\", \"${domain}/instance-cpu-sustained-clock-speed-mhz\", \"${domain}/instance-memory\", \"${domain}/instance-ebs-bandwidth\", \"${domain}/instance-network-bandwidth\", \"${domain}/instance-gpu-name\", \"${domain}/instance-gpu-manufacturer\", \"${domain}/instance-gpu-count\", \"${domain}/instance-gpu-memory\", \"${domain}/instance-accelerator-name\", \"${domain}/instance-accelerator-manufacturer\", \"${domain}/instance-accelerator-count\", 
\"${domain}/instance-capability-flex\"] || !x.find(\"^([^/]+)\").endsWith(\"${domain}\"))" + rule="self.all(x, x in [\"${domain}/instance-tenancy\", \"${domain}/capacity-reservation-type\", \"${domain}/capacity-reservation-id\", \"${domain}/capacity-reservation-interruptible\", \"${domain}/ec2nodeclass\", \"${domain}/instance-encryption-in-transit-supported\", \"${domain}/instance-category\", \"${domain}/instance-hypervisor\", \"${domain}/instance-family\", \"${domain}/instance-generation\", \"${domain}/instance-local-nvme\", \"${domain}/instance-size\", \"${domain}/instance-cpu\", \"${domain}/instance-cpu-manufacturer\", \"${domain}/instance-cpu-sustained-clock-speed-mhz\", \"${domain}/instance-memory\", \"${domain}/instance-ebs-bandwidth\", \"${domain}/instance-network-bandwidth\", \"${domain}/instance-gpu-name\", \"${domain}/instance-gpu-manufacturer\", \"${domain}/instance-gpu-count\", \"${domain}/instance-gpu-memory\", \"${domain}/instance-accelerator-name\", \"${domain}/instance-accelerator-manufacturer\", \"${domain}/instance-accelerator-count\", \"${domain}/instance-capability-flex\", \"${domain}/placement-group-id\", \"${domain}/placement-group-partition\"] || !x.find(\"^([^/]+)\").endsWith(\"${domain}\"))" message="label domain \"${domain}\" is restricted" MSG="${message}" RULE="${rule}" yq eval '.spec.versions[0].schema.openAPIV3Schema.properties.spec.properties.template.properties.metadata.properties.labels.x-kubernetes-validations += [{"message": strenv(MSG), "rule": strenv(RULE)}]' -i pkg/apis/crds/karpenter.sh_nodepools.yaml } diff --git a/hack/validation/requirements.sh b/hack/validation/requirements.sh index 7b17d596f12c..365811ae9eec 100755 --- a/hack/validation/requirements.sh +++ b/hack/validation/requirements.sh @@ -2,7 +2,7 @@ function injectDomainRequirementRestrictions() { domain=$1 - rule="self in [\"${domain}/instance-tenancy\", \"${domain}/capacity-reservation-type\", \"${domain}/capacity-reservation-id\", 
\"${domain}/capacity-reservation-interruptible\", \"${domain}/ec2nodeclass\", \"${domain}/instance-encryption-in-transit-supported\", \"${domain}/instance-category\", \"${domain}/instance-hypervisor\", \"${domain}/instance-family\", \"${domain}/instance-generation\", \"${domain}/instance-local-nvme\", \"${domain}/instance-size\", \"${domain}/instance-cpu\", \"${domain}/instance-cpu-manufacturer\", \"${domain}/instance-cpu-sustained-clock-speed-mhz\", \"${domain}/instance-memory\", \"${domain}/instance-ebs-bandwidth\", \"${domain}/instance-network-bandwidth\", \"${domain}/instance-gpu-name\", \"${domain}/instance-gpu-manufacturer\", \"${domain}/instance-gpu-count\", \"${domain}/instance-gpu-memory\", \"${domain}/instance-accelerator-name\", \"${domain}/instance-accelerator-manufacturer\", \"${domain}/instance-accelerator-count\", \"${domain}/instance-capability-flex\"] || !self.find(\"^([^/]+)\").endsWith(\"${domain}\")" + rule="self in [\"${domain}/instance-tenancy\", \"${domain}/capacity-reservation-type\", \"${domain}/capacity-reservation-id\", \"${domain}/capacity-reservation-interruptible\", \"${domain}/ec2nodeclass\", \"${domain}/instance-encryption-in-transit-supported\", \"${domain}/instance-category\", \"${domain}/instance-hypervisor\", \"${domain}/instance-family\", \"${domain}/instance-generation\", \"${domain}/instance-local-nvme\", \"${domain}/instance-size\", \"${domain}/instance-cpu\", \"${domain}/instance-cpu-manufacturer\", \"${domain}/instance-cpu-sustained-clock-speed-mhz\", \"${domain}/instance-memory\", \"${domain}/instance-ebs-bandwidth\", \"${domain}/instance-network-bandwidth\", \"${domain}/instance-gpu-name\", \"${domain}/instance-gpu-manufacturer\", \"${domain}/instance-gpu-count\", \"${domain}/instance-gpu-memory\", \"${domain}/instance-accelerator-name\", \"${domain}/instance-accelerator-manufacturer\", \"${domain}/instance-accelerator-count\", \"${domain}/instance-capability-flex\", \"${domain}/placement-group-id\", 
\"${domain}/placement-group-partition\"] || !self.find(\"^([^/]+)\").endsWith(\"${domain}\")" message="label domain \"${domain}\" is restricted" MSG="${message}" RULE="${rule}" yq eval '.spec.versions[0].schema.openAPIV3Schema.properties.spec.properties.requirements.items.properties.key.x-kubernetes-validations += [{"message": strenv(MSG), "rule": strenv(RULE)}]' -i pkg/apis/crds/karpenter.sh_nodeclaims.yaml MSG="${message}" RULE="${rule}" yq eval '.spec.versions[0].schema.openAPIV3Schema.properties.spec.properties.template.properties.spec.properties.requirements.items.properties.key.x-kubernetes-validations += [{"message": strenv(MSG), "rule": strenv(RULE)}]' -i pkg/apis/crds/karpenter.sh_nodepools.yaml diff --git a/kwok/main.go b/kwok/main.go index 5e26efcb78f6..6da8203d0671 100644 --- a/kwok/main.go +++ b/kwok/main.go @@ -99,6 +99,7 @@ func main() { op.VersionProvider, op.InstanceTypesProvider, op.CapacityReservationProvider, + op.PlacementGroupProvider, op.AMIResolver, )...). Start(ctx) diff --git a/kwok/operator/operator.go b/kwok/operator/operator.go index b0b246d6ad8a..b06e04ae77ac 100644 --- a/kwok/operator/operator.go +++ b/kwok/operator/operator.go @@ -61,6 +61,7 @@ import ( "github.com/aws/karpenter-provider-aws/pkg/providers/instanceprofile" "github.com/aws/karpenter-provider-aws/pkg/providers/instancetype" "github.com/aws/karpenter-provider-aws/pkg/providers/launchtemplate" + "github.com/aws/karpenter-provider-aws/pkg/providers/placementgroup" "github.com/aws/karpenter-provider-aws/pkg/providers/pricing" "github.com/aws/karpenter-provider-aws/pkg/providers/securitygroup" ssmp "github.com/aws/karpenter-provider-aws/pkg/providers/ssm" @@ -93,6 +94,7 @@ type Operator struct { InstanceProvider instance.Provider SSMProvider ssmp.Provider CapacityReservationProvider capacityreservation.Provider + PlacementGroupProvider placementgroup.Provider EC2API *kwokec2.Client } @@ -165,6 +167,10 @@ func NewOperator(ctx context.Context, operator *operator.Operator) 
(context.Cont cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval), cache.New(awscache.CapacityReservationAvailabilityTTL, awscache.DefaultCleanupInterval), ) + placementGroupProvider := placementgroup.NewProvider( + ec2api, + cache.New(awscache.PlacementGroupTTL, awscache.DefaultCleanupInterval), + ) instanceTypeProvider := instancetype.NewDefaultProvider( cache.New(awscache.InstanceTypesZonesAndOfferingsTTL, awscache.DefaultCleanupInterval), cache.New(awscache.InstanceTypesZonesAndOfferingsTTL, awscache.DefaultCleanupInterval), @@ -215,6 +221,7 @@ func NewOperator(ctx context.Context, operator *operator.Operator) (context.Cont InstanceProvider: instanceProvider, SSMProvider: ssmProvider, CapacityReservationProvider: capacityReservationProvider, + PlacementGroupProvider: placementGroupProvider, EC2API: ec2api, } } diff --git a/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml b/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml index 8679a865c48f..34b0c5617645 100644 --- a/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml +++ b/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml @@ -513,6 +513,23 @@ spec: - optional type: string type: object + placementGroupSelector: + description: PlacementGroupSelector defines the name or the id of the placement to resolve with the nodeclass. + properties: + id: + description: ID is the placement group id in EC2 + pattern: ^pg-[0-9a-z]+$ + type: string + name: + description: Name is the placement group name in EC2 + minLength: 1 + type: string + type: object + x-kubernetes-validations: + - message: expected at least one, got none, ['name', 'id'] + rule: has(self.name) || has(self.id) + - message: '''name'' and ''id'' are mutually exclusive' + rule: '!(has(self.name) && has(self.id))' role: description: |- Role is the AWS identity that nodes use. 
@@ -811,6 +828,50 @@ spec: instanceProfile: description: InstanceProfile contains the resolved instance profile for the role type: string + placementGroups: + description: PlacementGroups contains the placement group values that are available to this NodeClass. + items: + properties: + id: + description: The id for the placement group. + pattern: ^pg-[0-9a-z]+$ + type: string + name: + description: The name for the placement group. + type: string + partitionCount: + description: The partition count for the partition placement group. + format: int32 + type: integer + spreadLevel: + description: The spread level for the spread placement group. + enum: + - host + - rack + type: string + state: + default: available + description: The state of the placement group. + enum: + - available + - pending + - deleting + - deleted + type: string + strategy: + description: The strategy for the placement group. + enum: + - cluster + - partition + - spread + type: string + required: + - id + - name + - state + - strategy + type: object + type: array securityGroups: description: |- SecurityGroups contains the current security group values that are available to the diff --git a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml index bc1420d17f91..c0c7b898389a 100644 --- a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml +++ b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml @@ -134,7 +134,7 @@ spec: - message: label "kubernetes.io/hostname" is restricted rule: self != "kubernetes.io/hostname" - message: label domain "karpenter.k8s.aws" is restricted - rule: self in ["karpenter.k8s.aws/instance-tenancy", "karpenter.k8s.aws/capacity-reservation-type", "karpenter.k8s.aws/capacity-reservation-id", "karpenter.k8s.aws/capacity-reservation-interruptible", "karpenter.k8s.aws/ec2nodeclass", "karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", 
"karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu", "karpenter.k8s.aws/instance-cpu-manufacturer", "karpenter.k8s.aws/instance-cpu-sustained-clock-speed-mhz", "karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-ebs-bandwidth", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count", "karpenter.k8s.aws/instance-capability-flex"] || !self.find("^([^/]+)").endsWith("karpenter.k8s.aws") + rule: self in ["karpenter.k8s.aws/instance-tenancy", "karpenter.k8s.aws/capacity-reservation-type", "karpenter.k8s.aws/capacity-reservation-id", "karpenter.k8s.aws/capacity-reservation-interruptible", "karpenter.k8s.aws/ec2nodeclass", "karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu", "karpenter.k8s.aws/instance-cpu-manufacturer", "karpenter.k8s.aws/instance-cpu-sustained-clock-speed-mhz", "karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-ebs-bandwidth", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count", "karpenter.k8s.aws/instance-capability-flex", 
"karpenter.k8s.aws/placement-group-id", "karpenter.k8s.aws/placement-group-partition"] || !self.find("^([^/]+)").endsWith("karpenter.k8s.aws") minValues: description: |- This field is ALPHA and can be dropped or replaced at any time diff --git a/pkg/apis/crds/karpenter.sh_nodepools.yaml b/pkg/apis/crds/karpenter.sh_nodepools.yaml index ea269529fb12..96bfc85cae58 100644 --- a/pkg/apis/crds/karpenter.sh_nodepools.yaml +++ b/pkg/apis/crds/karpenter.sh_nodepools.yaml @@ -222,7 +222,7 @@ spec: - message: label "kubernetes.io/hostname" is restricted rule: self.all(x, x != "kubernetes.io/hostname") - message: label domain "karpenter.k8s.aws" is restricted - rule: self.all(x, x in ["karpenter.k8s.aws/instance-tenancy", "karpenter.k8s.aws/capacity-reservation-type", "karpenter.k8s.aws/capacity-reservation-id", "karpenter.k8s.aws/capacity-reservation-interruptible", "karpenter.k8s.aws/ec2nodeclass", "karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu", "karpenter.k8s.aws/instance-cpu-manufacturer", "karpenter.k8s.aws/instance-cpu-sustained-clock-speed-mhz", "karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-ebs-bandwidth", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count", "karpenter.k8s.aws/instance-capability-flex"] || !x.find("^([^/]+)").endsWith("karpenter.k8s.aws")) + rule: self.all(x, x in ["karpenter.k8s.aws/instance-tenancy", "karpenter.k8s.aws/capacity-reservation-type", 
"karpenter.k8s.aws/capacity-reservation-id", "karpenter.k8s.aws/capacity-reservation-interruptible", "karpenter.k8s.aws/ec2nodeclass", "karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu", "karpenter.k8s.aws/instance-cpu-manufacturer", "karpenter.k8s.aws/instance-cpu-sustained-clock-speed-mhz", "karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-ebs-bandwidth", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count", "karpenter.k8s.aws/instance-capability-flex", "karpenter.k8s.aws/placement-group-id", "karpenter.k8s.aws/placement-group-partition"] || !x.find("^([^/]+)").endsWith("karpenter.k8s.aws")) type: object spec: description: |- @@ -291,7 +291,7 @@ spec: - message: label "kubernetes.io/hostname" is restricted rule: self != "kubernetes.io/hostname" - message: label domain "karpenter.k8s.aws" is restricted - rule: self in ["karpenter.k8s.aws/instance-tenancy", "karpenter.k8s.aws/capacity-reservation-type", "karpenter.k8s.aws/capacity-reservation-id", "karpenter.k8s.aws/capacity-reservation-interruptible", "karpenter.k8s.aws/ec2nodeclass", "karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu", 
"karpenter.k8s.aws/instance-cpu-manufacturer", "karpenter.k8s.aws/instance-cpu-sustained-clock-speed-mhz", "karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-ebs-bandwidth", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count", "karpenter.k8s.aws/instance-capability-flex"] || !self.find("^([^/]+)").endsWith("karpenter.k8s.aws") + rule: self in ["karpenter.k8s.aws/instance-tenancy", "karpenter.k8s.aws/capacity-reservation-type", "karpenter.k8s.aws/capacity-reservation-id", "karpenter.k8s.aws/capacity-reservation-interruptible", "karpenter.k8s.aws/ec2nodeclass", "karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu", "karpenter.k8s.aws/instance-cpu-manufacturer", "karpenter.k8s.aws/instance-cpu-sustained-clock-speed-mhz", "karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-ebs-bandwidth", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count", "karpenter.k8s.aws/instance-capability-flex", "karpenter.k8s.aws/placement-group-id", "karpenter.k8s.aws/placement-group-partition"] || !self.find("^([^/]+)").endsWith("karpenter.k8s.aws") minValues: description: |- This 
field is ALPHA and can be dropped or replaced at any time diff --git a/pkg/apis/v1/ec2nodeclass.go b/pkg/apis/v1/ec2nodeclass.go index b76671430ebc..10b669f99657 100644 --- a/pkg/apis/v1/ec2nodeclass.go +++ b/pkg/apis/v1/ec2nodeclass.go @@ -52,6 +52,11 @@ type EC2NodeClassSpec struct { // +kubebuilder:validation:MaxItems:=30 // +optional CapacityReservationSelectorTerms []CapacityReservationSelectorTerm `json:"capacityReservationSelectorTerms" hash:"ignore"` + // PlacementGroupSelector defines the name or the id of the placement to resolve with the nodeclass. + // +kubebuilder:validation:XValidation:message="expected at least one, got none, ['name', 'id']",rule="has(self.name) || has(self.id)" + // +kubebuilder:validation:XValidation:message="'name' and 'id' are mutually exclusive",rule="!(has(self.name) && has(self.id))" + // +optional + PlacementGroupSelector *PlacementGroupSelectorTerm `json:"placementGroupSelector,omitempty"` // AssociatePublicIPAddress controls if public IP addresses are assigned to instances that are launched with the nodeclass. // +optional AssociatePublicIPAddress *bool `json:"associatePublicIPAddress,omitempty"` @@ -199,6 +204,17 @@ type CapacityReservationSelectorTerm struct { InstanceMatchCriteria string `json:"instanceMatchCriteria,omitempty"` } +type PlacementGroupSelectorTerm struct { + // Name is the placement group name in EC2 + // +kubebuilder:validation:MinLength:=1 + // +optional + Name string `json:"name,omitempty"` + // ID is the placement group id in EC2 + // +kubebuilder:validation:Pattern:="^pg-[0-9a-z]+$" + // +optional + ID string `json:"id,omitempty"` +} + // AMISelectorTerm defines selection logic for an ami used by Karpenter to launch nodes. // If multiple fields are used for selection, the requirements are ANDed. 
type AMISelectorTerm struct { diff --git a/pkg/apis/v1/ec2nodeclass_status.go b/pkg/apis/v1/ec2nodeclass_status.go index 068dfc49edeb..d76017b0c948 100644 --- a/pkg/apis/v1/ec2nodeclass_status.go +++ b/pkg/apis/v1/ec2nodeclass_status.go @@ -38,6 +38,7 @@ const ( ConditionTypeInstanceProfileReady = "InstanceProfileReady" ConditionTypeCapacityReservationsReady = "CapacityReservationsReady" ConditionTypeValidationSucceeded = "ValidationSucceeded" + ConditionTypePlacementGroupReady = "PlacementGroupReady" ) // Subnet contains resolved Subnet selector values utilized for node launch @@ -139,6 +140,80 @@ const ( CapacityReservationStateExpiring CapacityReservationState = "expiring" ) +type PlacementGroup struct { + // The id for the placement group. + // +kubebuilder:validation:Pattern:="^pg-[0-9a-z]+$" + // +required + ID string `json:"id"` + // The name for the placement group. + // +required + Name string `json:"name"` + // The partition count for the partition placement group. + // +optional + PartitionCount int32 `json:"partitionCount,omitempty"` + // The spread level for the spread placement group. + // +kubebuilder:validation:Enum:={host,rack} + // +optional + SpreadLevel PlacementGroupSpreadLevel `json:"spreadLevel,omitempty"` + // The state of the placement group. + // +kubebuilder:validation:Enum:={available,pending,deleting,deleted} + // +kubebuilder:default=available + // +required + State PlacementGroupState `json:"state"` + // The strategy for the placement group. 
+ // +kubebuilder:validation:Enum:={cluster,partition,spread} + // +required + Strategy PlacementGroupStrategy `json:"strategy"` +} + +type PlacementGroupSpreadLevel string + +const ( + PlacementGroupSpreadLevelRack PlacementGroupSpreadLevel = "rack" + PlacementGroupSpreadLevelHost PlacementGroupSpreadLevel = "host" +) + +func (PlacementGroupSpreadLevel) Values() []PlacementGroupSpreadLevel { + return []PlacementGroupSpreadLevel{ + PlacementGroupSpreadLevelRack, + PlacementGroupSpreadLevelHost, + } +} + +type PlacementGroupState string + +const ( + PlacementGroupStateAvailable PlacementGroupState = "available" + PlacementGroupStatePending PlacementGroupState = "pending" + PlacementGroupStateDeleting PlacementGroupState = "deleting" + PlacementGroupStateDeleted PlacementGroupState = "deleted" +) + +func (PlacementGroupState) Values() []PlacementGroupState { + return []PlacementGroupState{ + PlacementGroupStateAvailable, + PlacementGroupStatePending, + PlacementGroupStateDeleting, + PlacementGroupStateDeleted, + } +} + +type PlacementGroupStrategy string + +const ( + PlacementGroupStrategyCluster PlacementGroupStrategy = "cluster" + PlacementGroupStrategyPartition PlacementGroupStrategy = "partition" + PlacementGroupStrategySpread PlacementGroupStrategy = "spread" +) + +func (PlacementGroupStrategy) Values() []PlacementGroupStrategy { + return []PlacementGroupStrategy{ + PlacementGroupStrategyCluster, + PlacementGroupStrategyPartition, + PlacementGroupStrategySpread, + } +} + // EC2NodeClassStatus contains the resolved state of the EC2NodeClass type EC2NodeClassStatus struct { // Subnets contains the current subnet values that are available to the @@ -153,6 +228,9 @@ type EC2NodeClassStatus struct { // CapacityReservation selectors. // +optional CapacityReservations []CapacityReservation `json:"capacityReservations,omitempty"` + // PlacementGroups contains the placement group values that are available to this NodeClass.
+ // +optional + PlacementGroups []PlacementGroup `json:"placementGroups,omitempty"` // AMI contains the current AMI values that are available to the // cluster under the AMI selectors. // +optional @@ -176,6 +254,9 @@ func (in *EC2NodeClass) StatusConditions() status.ConditionSet { if CapacityReservationsEnabled { conds = append(conds, ConditionTypeCapacityReservationsReady) } + if in.Spec.PlacementGroupSelector != nil { + conds = append(conds, ConditionTypePlacementGroupReady) + } return status.NewReadyConditions(conds...).For(in) } @@ -195,6 +276,10 @@ func (in *EC2NodeClass) CapacityReservations() []CapacityReservation { return in.Status.CapacityReservations } +func (in *EC2NodeClass) PlacementGroups() []PlacementGroup { + return in.Status.PlacementGroups +} + type ZoneInfo struct { Zone string ZoneID string @@ -265,3 +350,72 @@ func CapacityReservationFromEC2(clk clock.Clock, cr *ec2types.CapacityReservatio State: state, }, nil } + +func PlacementGroupSpreadLevelFromEC2(spreadLevel ec2types.SpreadLevel) (PlacementGroupSpreadLevel, error) { + if spreadLevel == "" { + return "", nil + } + resolvedType, ok := lo.Find(PlacementGroupSpreadLevel("").Values(), func(crt PlacementGroupSpreadLevel) bool { + return string(crt) == string(spreadLevel) + }) + if !ok { + return "", serrors.Wrap( + fmt.Errorf("received placement group with unsupported spread level from ec2"), + "spread-level", string(spreadLevel), + ) + } + return resolvedType, nil +} + +func PlacementGroupStateFromEC2(state ec2types.PlacementGroupState) (PlacementGroupState, error) { + if state == "" { + return "", serrors.Wrap(fmt.Errorf("received placement group with no state from ec2")) + } + resolvedType, ok := lo.Find(PlacementGroupState("").Values(), func(crt PlacementGroupState) bool { + return string(crt) == string(state) + }) + if !ok { + return "", serrors.Wrap( + fmt.Errorf("received placement group with unrecognized state from ec2"), + "state", string(state), + ) + } + return resolvedType, nil +} 
+ +func PlacementGroupStrategyFromEC2(strategy ec2types.PlacementStrategy) (PlacementGroupStrategy, error) { + resolvedType, ok := lo.Find(PlacementGroupStrategy("").Values(), func(crt PlacementGroupStrategy) bool { + return string(crt) == string(strategy) + }) + if !ok { + return "", serrors.Wrap( + fmt.Errorf("received placement group with unsupported strategy from ec2"), + "strategy", string(strategy), + ) + } + return resolvedType, nil +} + +func PlacementGroupFromEC2(pg *ec2types.PlacementGroup) (PlacementGroup, error) { + spreadLevel, err := PlacementGroupSpreadLevelFromEC2(pg.SpreadLevel) + if err != nil { + return PlacementGroup{}, serrors.Wrap(err, "placement-group", *pg.GroupId) + } + state, err := PlacementGroupStateFromEC2(pg.State) + if err != nil { + return PlacementGroup{}, serrors.Wrap(err, "state", pg.State) + } + strategy, err := PlacementGroupStrategyFromEC2(pg.Strategy) + if err != nil { + return PlacementGroup{}, serrors.Wrap(err, "strategy", pg.Strategy) + } + + return PlacementGroup{ + ID: *pg.GroupId, + Name: *pg.GroupName, + PartitionCount: lo.FromPtr(pg.PartitionCount), + SpreadLevel: spreadLevel, + State: state, + Strategy: strategy, + }, nil +} diff --git a/pkg/apis/v1/labels.go b/pkg/apis/v1/labels.go index c2740def4df5..1ddf5a690196 100644 --- a/pkg/apis/v1/labels.go +++ b/pkg/apis/v1/labels.go @@ -58,6 +58,8 @@ func init() { LabelTopologyZoneID, LabelInstanceTenancy, corev1.LabelWindowsBuild, + LabelPlacementGroupID, + LabelPlacementGroupPartition, ) karpv1.WellKnownValuesForRequirements[LabelInstanceTenancy] = sets.New(string(ec2types.TenancyDedicated), string(ec2types.TenancyDefault)) @@ -163,6 +165,8 @@ var ( LabelInstanceAcceleratorCount = apis.Group + "/instance-accelerator-count" LabelNodeClass = apis.Group + "/ec2nodeclass" LabelInstanceTenancy = apis.Group + "/instance-tenancy" + LabelPlacementGroupID = apis.Group + "/placement-group-id" + LabelPlacementGroupPartition = apis.Group + "/placement-group-partition" 
LabelTopologyZoneID = "topology.k8s.aws/zone-id" diff --git a/pkg/apis/v1/zz_generated.deepcopy.go b/pkg/apis/v1/zz_generated.deepcopy.go index 6b283ed4ca03..8b777a6983d9 100644 --- a/pkg/apis/v1/zz_generated.deepcopy.go +++ b/pkg/apis/v1/zz_generated.deepcopy.go @@ -293,6 +293,11 @@ func (in *EC2NodeClassSpec) DeepCopyInto(out *EC2NodeClassSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.PlacementGroupSelector != nil { + in, out := &in.PlacementGroupSelector, &out.PlacementGroupSelector + *out = new(PlacementGroupSelectorTerm) + **out = **in + } if in.AssociatePublicIPAddress != nil { in, out := &in.AssociatePublicIPAddress, &out.AssociatePublicIPAddress *out = new(bool) @@ -400,6 +405,11 @@ func (in *EC2NodeClassStatus) DeepCopyInto(out *EC2NodeClassStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.PlacementGroups != nil { + in, out := &in.PlacementGroups, &out.PlacementGroups + *out = make([]PlacementGroup, len(*in)) + copy(*out, *in) + } if in.AMIs != nil { in, out := &in.AMIs, &out.AMIs *out = make([]AMI, len(*in)) @@ -546,6 +556,36 @@ func (in *MetadataOptions) DeepCopy() *MetadataOptions { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PlacementGroup) DeepCopyInto(out *PlacementGroup) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PlacementGroup. +func (in *PlacementGroup) DeepCopy() *PlacementGroup { + if in == nil { + return nil + } + out := new(PlacementGroup) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PlacementGroupSelectorTerm) DeepCopyInto(out *PlacementGroupSelectorTerm) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PlacementGroupSelectorTerm. 
+func (in *PlacementGroupSelectorTerm) DeepCopy() *PlacementGroupSelectorTerm { + if in == nil { + return nil + } + out := new(PlacementGroupSelectorTerm) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SecurityGroup) DeepCopyInto(out *SecurityGroup) { *out = *in diff --git a/pkg/aws/sdk.go b/pkg/aws/sdk.go index a594c9e53646..df8ea4bee3bb 100644 --- a/pkg/aws/sdk.go +++ b/pkg/aws/sdk.go @@ -28,6 +28,7 @@ import ( type EC2API interface { DescribeCapacityReservations(context.Context, *ec2.DescribeCapacityReservationsInput, ...func(*ec2.Options)) (*ec2.DescribeCapacityReservationsOutput, error) + DescribePlacementGroups(context.Context, *ec2.DescribePlacementGroupsInput, ...func(*ec2.Options)) (*ec2.DescribePlacementGroupsOutput, error) DescribeImages(context.Context, *ec2.DescribeImagesInput, ...func(*ec2.Options)) (*ec2.DescribeImagesOutput, error) DescribeLaunchTemplates(context.Context, *ec2.DescribeLaunchTemplatesInput, ...func(*ec2.Options)) (*ec2.DescribeLaunchTemplatesOutput, error) DescribeSubnets(context.Context, *ec2.DescribeSubnetsInput, ...func(*ec2.Options)) (*ec2.DescribeSubnetsOutput, error) diff --git a/pkg/cache/cache.go b/pkg/cache/cache.go index e1d750267183..cfd70c7712d3 100644 --- a/pkg/cache/cache.go +++ b/pkg/cache/cache.go @@ -31,6 +31,9 @@ const ( // updated every minute, but we want to persist the data longer in the event of an EC2 API outage. 24 hours was the // compormise made for API outage reseliency and gargage collecting entries for orphaned reservations. CapacityReservationAvailabilityTTL = 24 * time.Hour + // PlacementGroupTTL is the time we will persist cached placement group data. Like capacity reservations, we persist + // longer than the reconciliation interval to maintain availability during EC2 API outages. 
+ PlacementGroupTTL = 6 * time.Hour // InstanceTypesZonesAndOfferingsTTL is the time before we refresh instance types, zones, and offerings at EC2 InstanceTypesZonesAndOfferingsTTL = 5 * time.Minute // InstanceProfileTTL is the time before we refresh checking instance profile existence at IAM diff --git a/pkg/cache/unavailableofferings.go b/pkg/cache/unavailableofferings.go index f8edd784d9bb..59d03fccd716 100644 --- a/pkg/cache/unavailableofferings.go +++ b/pkg/cache/unavailableofferings.go @@ -27,6 +27,16 @@ import ( "github.com/patrickmn/go-cache" ) +// PlacementGroupScope optionally scopes ICE cache entries to a specific placement group and partition. +// When set, ICE entries are isolated so that placement-group-specific failures don't block non-PG launches. +type PlacementGroupScope struct { + // ID is the placement group ID (e.g., "pg-0123456789abcdef0") + ID string + // Partition is the partition number as a string (e.g., "3"). Empty when no specific partition is targeted + // or for non-partition placement groups. + Partition string +} + // UnavailableOfferings stores any offerings that return ICE (insufficient capacity errors) when // attempting to launch the capacity. 
These offerings are ignored as long as they are in the cache on // GetInstanceTypes responses @@ -54,8 +64,8 @@ func NewUnavailableOfferings() *UnavailableOfferings { } uo.offeringCache.OnEvicted(func(k string, _ any) { elems := strings.Split(k, ":") - if len(elems) != 3 { - panic("unavailable offerings cache key is not of expected format <capacity-type>:<instance-type>:<zone>") + if len(elems) < 3 || len(elems) > 5 { + panic("unavailable offerings cache key is not of expected format <capacity-type>:<instance-type>:<zone>[:<placement-group-id>[:<partition>]]") } uo.offeringCacheSeqNumMu.Lock() uo.offeringCacheSeqNum[ec2types.InstanceType(elems[1])]++ @@ -79,16 +89,22 @@ func (u *UnavailableOfferings) SeqNum(instanceType ec2types.InstanceType) uint64 return v + u.capacityTypeCacheSeqNum.Load() + u.azCacheSeqNum.Load() } -// IsUnavailable returns true if the offering appears in the cache -func (u *UnavailableOfferings) IsUnavailable(instanceType ec2types.InstanceType, zone, capacityType string) bool { - _, offeringFound := u.offeringCache.Get(u.key(instanceType, zone, capacityType)) +// IsUnavailable returns true if the offering appears in the cache. +// The pgScope parameter scopes the lookup so that an ICE from a placement group launch +// does not incorrectly prevent launches of the same instance type + zone without that placement group. +// When a partition is specified in the scope, the lookup is further scoped to that partition.
+func (u *UnavailableOfferings) IsUnavailable(instanceType ec2types.InstanceType, zone, capacityType string, pgScope ...PlacementGroupScope) bool { + _, offeringFound := u.offeringCache.Get(u.key(instanceType, zone, capacityType, pgScope...)) _, capacityTypeFound := u.capacityTypeCache.Get(capacityType) _, azFound := u.azCache.Get(zone) return offeringFound || capacityTypeFound || azFound } -// MarkUnavailable communicates recently observed temporary capacity shortages in the provided offerings -func (u *UnavailableOfferings) MarkUnavailable(ctx context.Context, instanceType ec2types.InstanceType, zone, capacityType string, unavailableReason map[string]string) { +// MarkUnavailable communicates recently observed temporary capacity shortages in the provided offerings. +// The pgScope parameter scopes the cache entry so that placement-group-specific ICEs don't +// block non-PG launches of the same instance type + zone. When a partition is specified, the cache +// entry is further scoped so only that partition is marked unavailable. 
+func (u *UnavailableOfferings) MarkUnavailable(ctx context.Context, instanceType ec2types.InstanceType, zone, capacityType string, unavailableReason map[string]string, pgScope ...PlacementGroupScope) { // even if the key is already in the cache, we still need to call Set to extend the cached entry's TTL logValues := []any{ "reason", unavailableReason["reason"], @@ -97,6 +113,12 @@ func (u *UnavailableOfferings) MarkUnavailable(ctx context.Context, instanceType "capacity-type", capacityType, "ttl", UnavailableOfferingsTTL, } + if len(pgScope) > 0 && pgScope[0].ID != "" { + logValues = append(logValues, "placement-group-id", pgScope[0].ID) + if pgScope[0].Partition != "" { + logValues = append(logValues, "placement-group-partition", pgScope[0].Partition) + } + } // Add fleetID if provided key := "fleet-id" _, ok := unavailableReason[key] @@ -104,7 +126,7 @@ func (u *UnavailableOfferings) MarkUnavailable(ctx context.Context, instanceType logValues = append(logValues, key, unavailableReason[key]) } log.FromContext(ctx).WithValues(logValues...).V(1).Info("removing offering from offerings") - u.offeringCache.SetDefault(u.key(instanceType, zone, capacityType), struct{}{}) + u.offeringCache.SetDefault(u.key(instanceType, zone, capacityType, pgScope...), struct{}{}) u.offeringCacheSeqNumMu.Lock() u.offeringCacheSeqNum[instanceType]++ u.offeringCacheSeqNumMu.Unlock() @@ -130,7 +152,16 @@ func (u *UnavailableOfferings) Flush() { u.azCache.Flush() } -// key returns the cache key for all offerings in the cache -func (u *UnavailableOfferings) key(instanceType ec2types.InstanceType, zone string, capacityType string) string { +// key returns the cache key for all offerings in the cache. +// When a placement group scope is provided, the PG ID (and optionally partition) is included in the key +// to scope ICE entries per placement group and partition. 
+// Format: <capacity-type>:<instance-type>:<zone>[:<placement-group-id>[:<partition>]] +func (u *UnavailableOfferings) key(instanceType ec2types.InstanceType, zone string, capacityType string, pgScope ...PlacementGroupScope) string { + if len(pgScope) > 0 && pgScope[0].ID != "" { + if pgScope[0].Partition != "" { + return fmt.Sprintf("%s:%s:%s:%s:%s", capacityType, instanceType, zone, pgScope[0].ID, pgScope[0].Partition) + } + return fmt.Sprintf("%s:%s:%s:%s", capacityType, instanceType, zone, pgScope[0].ID) + } return fmt.Sprintf("%s:%s:%s", capacityType, instanceType, zone) } diff --git a/pkg/cloudprovider/cloudprovider.go b/pkg/cloudprovider/cloudprovider.go index 7e2ca909ec69..d4b3c91a2e46 100644 --- a/pkg/cloudprovider/cloudprovider.go +++ b/pkg/cloudprovider/cloudprovider.go @@ -460,6 +460,11 @@ func (c *CloudProvider) instanceToNodeClaim(i *instance.Instance, instanceType * labels[v1.LabelCapacityReservationType] = string(i.CapacityReservationDetails.Type) labels[v1.LabelCapacityReservationInterruptible] = fmt.Sprintf("%t", i.CapacityReservationDetails.Interruptible) } + // Placement group labels + if nodeClass != nil && len(nodeClass.Status.PlacementGroups) > 0 { + pg := nodeClass.Status.PlacementGroups[0] + labels[v1.LabelPlacementGroupID] = pg.ID + } if v, ok := i.Tags[karpv1.NodePoolLabelKey]; ok { labels[karpv1.NodePoolLabelKey] = v } diff --git a/pkg/cloudprovider/drift.go b/pkg/cloudprovider/drift.go index 5fb1bfc4ad49..db2c39fafead 100644 --- a/pkg/cloudprovider/drift.go +++ b/pkg/cloudprovider/drift.go @@ -37,6 +37,7 @@ const ( SubnetDrift cloudprovider.DriftReason = "SubnetDrift" SecurityGroupDrift cloudprovider.DriftReason = "SecurityGroupDrift" CapacityReservationDrift cloudprovider.DriftReason = "CapacityReservationDrift" + PlacementGroupDrift cloudprovider.DriftReason = "PlacementGroupDrift" NodeClassDrift cloudprovider.DriftReason = "NodeClassDrift" ) @@ -65,10 +66,12 @@ func (c *CloudProvider) isNodeClassDrifted(ctx context.Context, nodeClaim *karpv return "", fmt.Errorf("calculating subnet drift, %w", err)
} capacityReservationsDrifted := c.isCapacityReservationDrifted(instance, nodeClass) + placementGroupDrifted := c.isPlacementGroupDrifted(nodeClaim, nodeClass) drifted := lo.FindOrElse([]cloudprovider.DriftReason{ securitygroupDrifted, subnetDrifted, capacityReservationsDrifted, + placementGroupDrifted, }, "", func(i cloudprovider.DriftReason) bool { return string(i) != "" }) @@ -146,6 +149,20 @@ func (c *CloudProvider) isCapacityReservationDrifted(instance *instance.Instance return "" } +// isPlacementGroupDrifted checks if the node's placement group ID label no longer matches the EC2NodeClass's +// resolved placement group. This covers scenarios where placementGroupSelector was added, removed, or changed. +func (c *CloudProvider) isPlacementGroupDrifted(nodeClaim *karpv1.NodeClaim, nodeClass *v1.EC2NodeClass) cloudprovider.DriftReason { + nodeClaimPGID := nodeClaim.Labels[v1.LabelPlacementGroupID] + var nodeClassPGID string + if len(nodeClass.Status.PlacementGroups) > 0 { + nodeClassPGID = nodeClass.Status.PlacementGroups[0].ID + } + if nodeClaimPGID != nodeClassPGID { + return PlacementGroupDrift + } + return "" +} + func (c *CloudProvider) areStaticFieldsDrifted(nodeClaim *karpv1.NodeClaim, nodeClass *v1.EC2NodeClass) cloudprovider.DriftReason { nodeClassHash, foundNodeClassHash := nodeClass.Annotations[v1.AnnotationEC2NodeClassHash] nodeClassHashVersion, foundNodeClassHashVersion := nodeClass.Annotations[v1.AnnotationEC2NodeClassHashVersion] diff --git a/pkg/cloudprovider/registrationhooks/placementgrouphook.go b/pkg/cloudprovider/registrationhooks/placementgrouphook.go new file mode 100644 index 000000000000..7d468a466ce9 --- /dev/null +++ b/pkg/cloudprovider/registrationhooks/placementgrouphook.go @@ -0,0 +1,98 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package registrationhooks + +import ( + "context" + "fmt" + "strconv" + + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/cloudprovider" + + v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" + "github.com/aws/karpenter-provider-aws/pkg/providers/instance" + "github.com/aws/karpenter-provider-aws/pkg/utils" +) + +// PlacementGroupRegistrationHook gates node registration until the placement group partition +// label is populated. For partition placement groups, EC2 auto-assigns the partition number +// and it is only discoverable via DescribeInstances after launch. This hook ensures the +// karpenter.k8s.aws/placement-group-partition label is set before the karpenter.sh/unregistered +// taint is removed, so that TopologySpreadConstraints using the partition topology key always +// see accurate partition data. 
+type PlacementGroupRegistrationHook struct { + kubeClient client.Client + instanceProvider instance.Provider +} + +func NewPlacementGroupRegistrationHook(kubeClient client.Client, instanceProvider instance.Provider) *PlacementGroupRegistrationHook { + return &PlacementGroupRegistrationHook{ + kubeClient: kubeClient, + instanceProvider: instanceProvider, + } +} + +func (h *PlacementGroupRegistrationHook) Name() string { + return "PlacementGroupRegistrationHook" +} + +func (h *PlacementGroupRegistrationHook) Registered(ctx context.Context, nodeClaim *karpv1.NodeClaim) (cloudprovider.NodeLifecycleHookResult, error) { + // Resolve the EC2NodeClass from the NodeClaim's nodeClassRef + nodeClass := &v1.EC2NodeClass{} + if err := h.kubeClient.Get(ctx, types.NamespacedName{Name: nodeClaim.Spec.NodeClassRef.Name}, nodeClass); err != nil { + return cloudprovider.NodeLifecycleHookResult{}, fmt.Errorf("resolving ec2nodeclass for placement group hook, %w", err) + } + + // Check if the EC2NodeClass has a partition placement group + if len(nodeClass.Status.PlacementGroups) == 0 || + nodeClass.Status.PlacementGroups[0].Strategy != v1.PlacementGroupStrategyPartition { + return cloudprovider.NodeLifecycleHookResult{}, nil + } + + // Check if the partition label is already populated on the NodeClaim + if _, ok := nodeClaim.Labels[v1.LabelPlacementGroupPartition]; ok { + return cloudprovider.NodeLifecycleHookResult{}, nil + } + + // We need the providerID to look up the instance + if nodeClaim.Status.ProviderID == "" { + return cloudprovider.NodeLifecycleHookResult{Requeue: true}, nil + } + + // Parse the instance ID from the provider ID + instanceID, err := utils.ParseInstanceID(nodeClaim.Status.ProviderID) + if err != nil { + return cloudprovider.NodeLifecycleHookResult{Requeue: true}, fmt.Errorf("parsing instance ID from provider ID, %w", err) + } + + // Get the instance details, skipping cache to get fresh partition data + inst, err := h.instanceProvider.Get(ctx, instanceID, 
instance.SkipCache) + if err != nil { + return cloudprovider.NodeLifecycleHookResult{Requeue: true}, fmt.Errorf("describing instance for partition number, %w", err) + } + + // Check if the partition number has been assigned + if inst.PartitionNumber == nil { + return cloudprovider.NodeLifecycleHookResult{Requeue: true}, nil + } + + // Set the partition label on the NodeClaim so it gets synced to the Node during registration + nodeClaim.Labels[v1.LabelPlacementGroupPartition] = strconv.FormatInt(int64(*inst.PartitionNumber), 10) + return cloudprovider.NodeLifecycleHookResult{}, nil +} diff --git a/pkg/cloudprovider/registrationhooks/placementgrouphook_test.go b/pkg/cloudprovider/registrationhooks/placementgrouphook_test.go new file mode 100644 index 000000000000..143fe5033ad3 --- /dev/null +++ b/pkg/cloudprovider/registrationhooks/placementgrouphook_test.go @@ -0,0 +1,277 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package registrationhooks_test + +import ( + "fmt" + + ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/samber/lo" + + opstatus "github.com/awslabs/operatorpkg/status" + + v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" + "github.com/aws/karpenter-provider-aws/pkg/cloudprovider/registrationhooks" + "github.com/aws/karpenter-provider-aws/pkg/fake" + "github.com/aws/karpenter-provider-aws/pkg/test" + + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" + coretest "sigs.k8s.io/karpenter/pkg/test" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + . "sigs.k8s.io/karpenter/pkg/test/expectations" +) + +var _ = Describe("PlacementGroupRegistrationHook", func() { + var hook *registrationhooks.PlacementGroupRegistrationHook + var nodeClass *v1.EC2NodeClass + var nc *karpv1.NodeClaim + + BeforeEach(func() { + hook = registrationhooks.NewPlacementGroupRegistrationHook(env.Client, awsEnv.InstanceProvider) + nodeClass = test.EC2NodeClass(v1.EC2NodeClass{ + Status: v1.EC2NodeClassStatus{ + InstanceProfile: "test-profile", + SecurityGroups: []v1.SecurityGroup{ + {ID: "sg-test1", Name: "securityGroup-test1"}, + }, + Subnets: []v1.Subnet{ + {ID: "subnet-test1", Zone: "test-zone-1a", ZoneID: "tstz1-1a"}, + }, + }, + }) + nodeClass.StatusConditions().SetTrue(opstatus.ConditionReady) + nc = coretest.NodeClaim(karpv1.NodeClaim{ + Spec: karpv1.NodeClaimSpec{ + NodeClassRef: &karpv1.NodeClassReference{ + Group: "karpenter.k8s.aws", + Kind: "EC2NodeClass", + Name: nodeClass.Name, + }, + }, + }) + nc.Labels = map[string]string{} + }) + + It("should pass through immediately when EC2NodeClass has no placement group", func() { + nodeClass.Status.PlacementGroups = nil + ExpectApplied(ctx, env.Client, nodeClass, nc) + + result, err := hook.Registered(ctx, nc) + Expect(err).ToNot(HaveOccurred()) + Expect(result.Requeue).To(BeFalse()) + }) + + It("should pass through immediately when EC2NodeClass has a cluster placement group", func() { + nodeClass.Status.PlacementGroups = []v1.PlacementGroup{ + { + ID: "pg-cluster123", + Name: "cluster-pg", + Strategy: v1.PlacementGroupStrategyCluster, + State: v1.PlacementGroupStateAvailable, + }, + } + ExpectApplied(ctx, env.Client, nodeClass, nc) + + result, err := hook.Registered(ctx, nc) + Expect(err).ToNot(HaveOccurred()) + Expect(result.Requeue).To(BeFalse()) + }) + + It("should pass through immediately when EC2NodeClass has a spread placement group", func() { + nodeClass.Status.PlacementGroups = []v1.PlacementGroup{ + { + ID: "pg-spread123", + Name: "spread-pg", + 
Strategy: v1.PlacementGroupStrategySpread, + SpreadLevel: v1.PlacementGroupSpreadLevelRack, + State: v1.PlacementGroupStateAvailable, + }, + } + ExpectApplied(ctx, env.Client, nodeClass, nc) + + result, err := hook.Registered(ctx, nc) + Expect(err).ToNot(HaveOccurred()) + Expect(result.Requeue).To(BeFalse()) + }) + + It("should pass through when partition label is already set", func() { + nodeClass.Status.PlacementGroups = []v1.PlacementGroup{ + { + ID: "pg-partition123", + Name: "partition-pg", + PartitionCount: 7, + Strategy: v1.PlacementGroupStrategyPartition, + State: v1.PlacementGroupStateAvailable, + }, + } + nc.Labels[v1.LabelPlacementGroupPartition] = "3" + ExpectApplied(ctx, env.Client, nodeClass, nc) + + result, err := hook.Registered(ctx, nc) + Expect(err).ToNot(HaveOccurred()) + Expect(result.Requeue).To(BeFalse()) + }) + + It("should block registration when providerID is empty for partition placement group", func() { + nodeClass.Status.PlacementGroups = []v1.PlacementGroup{ + { + ID: "pg-partition123", + Name: "partition-pg", + PartitionCount: 7, + Strategy: v1.PlacementGroupStrategyPartition, + State: v1.PlacementGroupStateAvailable, + }, + } + nc.Status.ProviderID = "" + ExpectApplied(ctx, env.Client, nodeClass, nc) + + result, err := hook.Registered(ctx, nc) + Expect(err).ToNot(HaveOccurred()) + Expect(result.Requeue).To(BeTrue()) + }) + + It("should set partition label and proceed when DescribeInstances returns partition number", func() { + instanceID := fake.InstanceID() + nodeClass.Status.PlacementGroups = []v1.PlacementGroup{ + { + ID: "pg-partition123", + Name: "partition-pg", + PartitionCount: 7, + Strategy: v1.PlacementGroupStrategyPartition, + State: v1.PlacementGroupStateAvailable, + }, + } + nc.Status.ProviderID = fmt.Sprintf("aws:///test-zone-1a/%s", instanceID) + ExpectApplied(ctx, env.Client, nodeClass, nc) + nc.Labels = map[string]string{} + + // Configure the EC2 API to return an instance with a partition number + 
awsEnv.EC2API.Instances.Store(instanceID, ec2types.Instance{ + InstanceId: lo.ToPtr(instanceID), + InstanceType: "m5.large", + Placement: &ec2types.Placement{ + AvailabilityZone: lo.ToPtr("test-zone-1a"), + PartitionNumber: lo.ToPtr[int32](3), + }, + State: &ec2types.InstanceState{ + Name: ec2types.InstanceStateNameRunning, + }, + }) + + result, err := hook.Registered(ctx, nc) + Expect(err).ToNot(HaveOccurred()) + Expect(result.Requeue).To(BeFalse()) + Expect(nc.Labels[v1.LabelPlacementGroupPartition]).To(Equal("3")) + }) + + It("should block registration when instance has no partition number yet", func() { + instanceID := fake.InstanceID() + nodeClass.Status.PlacementGroups = []v1.PlacementGroup{ + { + ID: "pg-partition123", + Name: "partition-pg", + PartitionCount: 7, + Strategy: v1.PlacementGroupStrategyPartition, + State: v1.PlacementGroupStateAvailable, + }, + } + nc.Status.ProviderID = fmt.Sprintf("aws:///test-zone-1a/%s", instanceID) + ExpectApplied(ctx, env.Client, nodeClass, nc) + + // Configure the EC2 API to return an instance without a partition number + awsEnv.EC2API.Instances.Store(instanceID, ec2types.Instance{ + InstanceId: lo.ToPtr(instanceID), + InstanceType: "m5.large", + Placement: &ec2types.Placement{ + AvailabilityZone: lo.ToPtr("test-zone-1a"), + }, + State: &ec2types.InstanceState{ + Name: ec2types.InstanceStateNameRunning, + }, + }) + + result, err := hook.Registered(ctx, nc) + Expect(err).ToNot(HaveOccurred()) + Expect(result.Requeue).To(BeTrue()) + }) + + It("should return error when instance is not found", func() { + instanceID := fake.InstanceID() + nodeClass.Status.PlacementGroups = []v1.PlacementGroup{ + { + ID: "pg-partition123", + Name: "partition-pg", + PartitionCount: 7, + Strategy: v1.PlacementGroupStrategyPartition, + State: v1.PlacementGroupStateAvailable, + }, + } + nc.Status.ProviderID = fmt.Sprintf("aws:///test-zone-1a/%s", instanceID) + ExpectApplied(ctx, env.Client, nodeClass, nc) + + // Don't store any instance — the 
instance won't be found + + _, err := hook.Registered(ctx, nc) + Expect(err).To(HaveOccurred()) + }) + + It("should handle multiple partition numbers correctly", func() { + for _, partitionNum := range []int32{1, 4, 7} { + instanceID := fake.InstanceID() + nodeClass.Status.PlacementGroups = []v1.PlacementGroup{ + { + ID: "pg-partition123", + Name: "partition-pg", + PartitionCount: 7, + Strategy: v1.PlacementGroupStrategyPartition, + State: v1.PlacementGroupStateAvailable, + }, + } + + testNC := coretest.NodeClaim(karpv1.NodeClaim{ + Spec: karpv1.NodeClaimSpec{ + NodeClassRef: &karpv1.NodeClassReference{ + Group: "karpenter.k8s.aws", + Kind: "EC2NodeClass", + Name: nodeClass.Name, + }, + }, + }) + testNC.Labels = map[string]string{} + testNC.Status.ProviderID = fmt.Sprintf("aws:///test-zone-1a/%s", instanceID) + ExpectApplied(ctx, env.Client, nodeClass, testNC) + testNC.Labels = map[string]string{} + + awsEnv.EC2API.Instances.Store(instanceID, ec2types.Instance{ + InstanceId: lo.ToPtr(instanceID), + InstanceType: "m5.large", + Placement: &ec2types.Placement{ + AvailabilityZone: lo.ToPtr("test-zone-1a"), + PartitionNumber: lo.ToPtr(partitionNum), + }, + State: &ec2types.InstanceState{ + Name: ec2types.InstanceStateNameRunning, + }, + }) + + result, err := hook.Registered(ctx, testNC) + Expect(err).ToNot(HaveOccurred()) + Expect(result.Requeue).To(BeFalse()) + Expect(testNC.Labels[v1.LabelPlacementGroupPartition]).To(Equal(fmt.Sprintf("%d", partitionNum))) + } + }) +}) diff --git a/pkg/cloudprovider/registrationhooks/suite_test.go b/pkg/cloudprovider/registrationhooks/suite_test.go new file mode 100644 index 000000000000..2b8c61377801 --- /dev/null +++ b/pkg/cloudprovider/registrationhooks/suite_test.go @@ -0,0 +1,74 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package registrationhooks_test + +import ( + "context" + "testing" + + "sigs.k8s.io/karpenter/pkg/test/v1alpha1" + + "github.com/samber/lo" + + "github.com/aws/karpenter-provider-aws/pkg/apis" + "github.com/aws/karpenter-provider-aws/pkg/operator/options" + "github.com/aws/karpenter-provider-aws/pkg/test" + + coreoptions "sigs.k8s.io/karpenter/pkg/operator/options" + coretest "sigs.k8s.io/karpenter/pkg/test" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + . "sigs.k8s.io/karpenter/pkg/test/expectations" + . "sigs.k8s.io/karpenter/pkg/utils/testing" +) + +var ctx context.Context +var stop context.CancelFunc +var env *coretest.Environment +var awsEnv *test.Environment + +func TestRegistrationHooks(t *testing.T) { + ctx = TestContextWithLogger(t) + RegisterFailHandler(Fail) + RunSpecs(t, "RegistrationHooks") +} + +var _ = BeforeSuite(func() { + env = coretest.NewEnvironment( + coretest.WithCRDs(test.DisableCapacityReservationIDValidation(test.RemoveNodeClassTagValidation(apis.CRDs))...), + coretest.WithCRDs(v1alpha1.CRDs...), + coretest.WithFieldIndexers(coretest.NodePoolNodeClassRefFieldIndexer(ctx)), + ) + ctx = coreoptions.ToContext(ctx, coretest.Options(coretest.OptionsFields{FeatureGates: coretest.FeatureGates{ReservedCapacity: lo.ToPtr(true)}})) + ctx = options.ToContext(ctx, test.Options()) + ctx, stop = context.WithCancel(ctx) + awsEnv = test.NewEnvironment(ctx, env) +}) + +var _ = AfterSuite(func() { + stop() + Expect(env.Stop()).To(Succeed(), "Failed to stop environment") +}) + +var _ = BeforeEach(func() { + ctx = coreoptions.ToContext(ctx, 
coretest.Options(coretest.OptionsFields{FeatureGates: coretest.FeatureGates{ReservedCapacity: lo.ToPtr(true)}})) + ctx = options.ToContext(ctx, test.Options()) + awsEnv.Reset() +}) + +var _ = AfterEach(func() { + ExpectCleanedUp(ctx, env.Client) +}) diff --git a/pkg/cloudprovider/suite_test.go b/pkg/cloudprovider/suite_test.go index 1ffd14a4bea6..ce8b3b47c5e0 100644 --- a/pkg/cloudprovider/suite_test.go +++ b/pkg/cloudprovider/suite_test.go @@ -1233,7 +1233,7 @@ var _ = Describe("CloudProvider", func() { {SubnetId: aws.String("test-subnet-2"), AvailabilityZone: aws.String("test-zone-1a"), AvailabilityZoneId: aws.String("tstz1-1a"), AvailableIpAddressCount: aws.Int32(100), Tags: []ec2types.Tag{{Key: aws.String("Name"), Value: aws.String("test-subnet-2")}}}, }}) - controller := nodeclass.NewController(awsEnv.Clock, env.Client, cloudProvider, recorder, fake.DefaultRegion, awsEnv.SubnetProvider, awsEnv.SecurityGroupProvider, awsEnv.AMIProvider, awsEnv.InstanceProfileProvider, awsEnv.InstanceTypesProvider, awsEnv.LaunchTemplateProvider, awsEnv.CapacityReservationProvider, awsEnv.EC2API, awsEnv.ValidationCache, awsEnv.RecreationCache, awsEnv.AMIResolver, options.FromContext(ctx).DisableDryRun) + controller := nodeclass.NewController(awsEnv.Clock, env.Client, cloudProvider, recorder, fake.DefaultRegion, awsEnv.SubnetProvider, awsEnv.SecurityGroupProvider, awsEnv.AMIProvider, awsEnv.InstanceProfileProvider, awsEnv.InstanceTypesProvider, awsEnv.LaunchTemplateProvider, awsEnv.CapacityReservationProvider, awsEnv.PlacementGroupProvider, awsEnv.EC2API, awsEnv.ValidationCache, awsEnv.RecreationCache, awsEnv.AMIResolver, options.FromContext(ctx).DisableDryRun) ExpectApplied(ctx, env.Client, nodePool, nodeClass) ExpectObjectReconciled(ctx, env.Client, controller, nodeClass) pod := coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{corev1.LabelTopologyZone: "test-zone-1a"}}) @@ -1250,7 +1250,7 @@ var _ = Describe("CloudProvider", func() { {SubnetId: 
aws.String("test-subnet-2"), AvailabilityZone: aws.String("test-zone-1a"), AvailabilityZoneId: aws.String("tstz1-1a"), AvailableIpAddressCount: aws.Int32(11), Tags: []ec2types.Tag{{Key: aws.String("Name"), Value: aws.String("test-subnet-2")}}}, }}) - controller := nodeclass.NewController(awsEnv.Clock, env.Client, cloudProvider, recorder, fake.DefaultRegion, awsEnv.SubnetProvider, awsEnv.SecurityGroupProvider, awsEnv.AMIProvider, awsEnv.InstanceProfileProvider, awsEnv.InstanceTypesProvider, awsEnv.LaunchTemplateProvider, awsEnv.CapacityReservationProvider, awsEnv.EC2API, awsEnv.ValidationCache, awsEnv.RecreationCache, awsEnv.AMIResolver, options.FromContext(ctx).DisableDryRun) + controller := nodeclass.NewController(awsEnv.Clock, env.Client, cloudProvider, recorder, fake.DefaultRegion, awsEnv.SubnetProvider, awsEnv.SecurityGroupProvider, awsEnv.AMIProvider, awsEnv.InstanceProfileProvider, awsEnv.InstanceTypesProvider, awsEnv.LaunchTemplateProvider, awsEnv.CapacityReservationProvider, awsEnv.PlacementGroupProvider, awsEnv.EC2API, awsEnv.ValidationCache, awsEnv.RecreationCache, awsEnv.AMIResolver, options.FromContext(ctx).DisableDryRun) nodeClass.Spec.Kubelet = &v1.KubeletConfiguration{ MaxPods: aws.Int32(1), } @@ -1299,7 +1299,7 @@ var _ = Describe("CloudProvider", func() { }) nodeClass.Spec.SubnetSelectorTerms = []v1.SubnetSelectorTerm{{Tags: map[string]string{"Name": "test-subnet-1"}}} ExpectApplied(ctx, env.Client, nodePool, nodeClass) - controller := nodeclass.NewController(awsEnv.Clock, env.Client, cloudProvider, recorder, fake.DefaultRegion, awsEnv.SubnetProvider, awsEnv.SecurityGroupProvider, awsEnv.AMIProvider, awsEnv.InstanceProfileProvider, awsEnv.InstanceTypesProvider, awsEnv.LaunchTemplateProvider, awsEnv.CapacityReservationProvider, awsEnv.EC2API, awsEnv.ValidationCache, awsEnv.RecreationCache, awsEnv.AMIResolver, options.FromContext(ctx).DisableDryRun) + controller := nodeclass.NewController(awsEnv.Clock, env.Client, cloudProvider, recorder, 
fake.DefaultRegion, awsEnv.SubnetProvider, awsEnv.SecurityGroupProvider, awsEnv.AMIProvider, awsEnv.InstanceProfileProvider, awsEnv.InstanceTypesProvider, awsEnv.LaunchTemplateProvider, awsEnv.CapacityReservationProvider, awsEnv.PlacementGroupProvider, awsEnv.EC2API, awsEnv.ValidationCache, awsEnv.RecreationCache, awsEnv.AMIResolver, options.FromContext(ctx).DisableDryRun) ExpectObjectReconciled(ctx, env.Client, controller, nodeClass) podSubnet1 := coretest.UnschedulablePod() ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, podSubnet1) diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index af66cb7cdba3..21e7be9c1919 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -40,6 +40,7 @@ import ( controllersversion "github.com/aws/karpenter-provider-aws/pkg/controllers/providers/version" capacityreservationprovider "github.com/aws/karpenter-provider-aws/pkg/providers/capacityreservation" "github.com/aws/karpenter-provider-aws/pkg/providers/launchtemplate" + "github.com/aws/karpenter-provider-aws/pkg/providers/placementgroup" "github.com/aws/karpenter-provider-aws/pkg/providers/version" "k8s.io/utils/clock" @@ -86,11 +87,12 @@ func NewControllers( versionProvider *version.DefaultProvider, instanceTypeProvider *instancetype.DefaultProvider, capacityReservationProvider capacityreservationprovider.Provider, + placementGroupProvider placementgroup.Provider, amiResolver amifamily.Resolver, ) []controller.Controller { controllers := []controller.Controller{ nodeclasshash.NewController(kubeClient), - nodeclass.NewController(clk, kubeClient, cloudProvider, recorder, cfg.Region, subnetProvider, securityGroupProvider, amiProvider, instanceProfileProvider, instanceTypeProvider, launchTemplateProvider, capacityReservationProvider, ec2api, validationCache, recreationCache, amiResolver, options.FromContext(ctx).DisableDryRun), + nodeclass.NewController(clk, kubeClient, cloudProvider, recorder, cfg.Region, 
subnetProvider, securityGroupProvider, amiProvider, instanceProfileProvider, instanceTypeProvider, launchTemplateProvider, capacityReservationProvider, placementGroupProvider, ec2api, validationCache, recreationCache, amiResolver, options.FromContext(ctx).DisableDryRun), nodeclaimgarbagecollection.NewController(kubeClient, cloudProvider), nodeclaimtagging.NewController(kubeClient, cloudProvider, instanceProvider), controllerspricing.NewController(pricingProvider), diff --git a/pkg/controllers/nodeclass/controller.go b/pkg/controllers/nodeclass/controller.go index 8609e683f207..813b6a206f32 100644 --- a/pkg/controllers/nodeclass/controller.go +++ b/pkg/controllers/nodeclass/controller.go @@ -57,6 +57,7 @@ import ( "github.com/aws/karpenter-provider-aws/pkg/providers/instanceprofile" "github.com/aws/karpenter-provider-aws/pkg/providers/instancetype" "github.com/aws/karpenter-provider-aws/pkg/providers/launchtemplate" + "github.com/aws/karpenter-provider-aws/pkg/providers/placementgroup" "github.com/aws/karpenter-provider-aws/pkg/providers/securitygroup" "github.com/aws/karpenter-provider-aws/pkg/providers/subnet" ) @@ -84,6 +85,7 @@ func NewController( instanceTypeProvider instancetype.Provider, launchTemplateProvider launchtemplate.Provider, capacityReservationProvider capacityreservation.Provider, + placementGroupProvider placementgroup.Provider, ec2api sdk.EC2API, validationCache *cache.Cache, recreationCache *cache.Cache, @@ -101,6 +103,7 @@ func NewController( reconcilers: []reconcile.TypedReconciler[*v1.EC2NodeClass]{ NewAMIReconciler(amiProvider), NewCapacityReservationReconciler(clk, capacityReservationProvider), + NewPlacementGroupReconciler(placementGroupProvider), NewSubnetReconciler(subnetProvider), NewSecurityGroupReconciler(securityGroupProvider), NewInstanceProfileReconciler(instanceProfileProvider, region, recreationCache), diff --git a/pkg/controllers/nodeclass/placementgroup.go b/pkg/controllers/nodeclass/placementgroup.go new file mode 100644 index 
000000000000..386adb9eca77 --- /dev/null +++ b/pkg/controllers/nodeclass/placementgroup.go @@ -0,0 +1,91 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodeclass + +import ( + "context" + "fmt" + + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/karpenter/pkg/utils/pretty" + + "github.com/samber/lo" + + v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" + "github.com/aws/karpenter-provider-aws/pkg/providers/placementgroup" +) + +type PlacementGroupReconciler struct { + provider placementgroup.Provider + cm *pretty.ChangeMonitor +} + +func NewPlacementGroupReconciler(provider placementgroup.Provider) *PlacementGroupReconciler { + return &PlacementGroupReconciler{ + provider: provider, + cm: pretty.NewChangeMonitor(), + } +} + +func (p *PlacementGroupReconciler) Reconcile(ctx context.Context, nc *v1.EC2NodeClass) (reconcile.Result, error) { + // If no placement group selector is specified, clear status and remove any stale + // PlacementGroupReady condition from a previous reconciliation when the selector was set. + // Since PlacementGroupReady is no longer in the StatusConditions set when the selector is nil, + // we need to explicitly clear it. 
+ if nc.Spec.PlacementGroupSelector == nil { + nc.Status.PlacementGroups = nil + err := nc.StatusConditions().Clear(v1.ConditionTypePlacementGroupReady) + return reconcile.Result{}, err + } + + term := lo.FromPtr(nc.Spec.PlacementGroupSelector) + selector := term.Name + if selector == "" { + selector = term.ID + } + pg, err := p.provider.Get(ctx, term) + if err != nil { + nc.Status.PlacementGroups = nil + nc.StatusConditions().SetFalse(v1.ConditionTypePlacementGroupReady, "PlacementGroupResolutionFailed", fmt.Sprintf("Failed to resolve placement group %q: %s", selector, err)) + return reconcile.Result{}, fmt.Errorf("resolving placement group %q, %w", selector, err) + } + if pg == nil { + nc.Status.PlacementGroups = nil + nc.StatusConditions().SetFalse(v1.ConditionTypePlacementGroupReady, "PlacementGroupNotFound", fmt.Sprintf("Placement group %q not found", selector)) + return reconcile.Result{}, nil + } + + resolved, err := v1.PlacementGroupFromEC2(pg) + if err != nil { + nc.Status.PlacementGroups = nil + nc.StatusConditions().SetFalse(v1.ConditionTypePlacementGroupReady, "PlacementGroupResolutionFailed", fmt.Sprintf("Failed to parse placement group %q: %s", selector, err)) + return reconcile.Result{}, fmt.Errorf("parsing placement group %q, %w", selector, err) + } + + if resolved.State != v1.PlacementGroupStateAvailable { + nc.Status.PlacementGroups = nil + nc.StatusConditions().SetFalse(v1.ConditionTypePlacementGroupReady, "PlacementGroupNotAvailable", fmt.Sprintf("Placement group %q is in state %q, expected %q", selector, resolved.State, v1.PlacementGroupStateAvailable)) + return reconcile.Result{}, nil + } + + if p.cm.HasChanged(nc.Name, resolved.ID) { + log.FromContext(ctx).V(1).WithValues("id", resolved.ID, "name", resolved.Name, "strategy", resolved.Strategy).Info("discovered placement group") + } + + nc.Status.PlacementGroups = []v1.PlacementGroup{resolved} + nc.StatusConditions().SetTrue(v1.ConditionTypePlacementGroupReady) + return reconcile.Result{}, nil 
+} diff --git a/pkg/controllers/nodeclass/placementgroup_test.go b/pkg/controllers/nodeclass/placementgroup_test.go new file mode 100644 index 000000000000..5812ee644795 --- /dev/null +++ b/pkg/controllers/nodeclass/placementgroup_test.go @@ -0,0 +1,183 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodeclass_test + +import ( + "github.com/aws/aws-sdk-go-v2/service/ec2" + ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/samber/lo" + . 
"sigs.k8s.io/karpenter/pkg/test/expectations" + + v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" +) + +var _ = Describe("NodeClass Placement Group Reconciler", func() { + BeforeEach(func() { + awsEnv.EC2API.DescribePlacementGroupsOutput.Set(&ec2.DescribePlacementGroupsOutput{ + PlacementGroups: []ec2types.PlacementGroup{ + { + GroupId: lo.ToPtr("pg-cluster123"), + GroupName: lo.ToPtr("my-cluster-pg"), + State: ec2types.PlacementGroupStateAvailable, + Strategy: ec2types.PlacementStrategyCluster, + }, + { + GroupId: lo.ToPtr("pg-partition456"), + GroupName: lo.ToPtr("my-partition-pg"), + State: ec2types.PlacementGroupStateAvailable, + Strategy: ec2types.PlacementStrategyPartition, + PartitionCount: lo.ToPtr[int32](7), + }, + { + GroupId: lo.ToPtr("pg-spread789"), + GroupName: lo.ToPtr("my-spread-pg"), + State: ec2types.PlacementGroupStateAvailable, + Strategy: ec2types.PlacementStrategySpread, + SpreadLevel: ec2types.SpreadLevelRack, + }, + { + GroupId: lo.ToPtr("pg-pending000"), + GroupName: lo.ToPtr("my-pending-pg"), + State: ec2types.PlacementGroupStatePending, + Strategy: ec2types.PlacementStrategyCluster, + }, + }, + }) + }) + + It("should not have PlacementGroupReady condition when no placement group selector is specified", func() { + // nodeClass has no PlacementGroupSelector by default + ExpectApplied(ctx, env.Client, nodeClass) + ExpectObjectReconciled(ctx, env.Client, controller, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + Expect(nodeClass.StatusConditions().Get(v1.ConditionTypePlacementGroupReady)).To(BeNil()) + Expect(nodeClass.Status.PlacementGroups).To(BeNil()) + }) + It("should resolve a cluster placement group by name", func() { + nodeClass.Spec.PlacementGroupSelector = &v1.PlacementGroupSelectorTerm{Name: "my-cluster-pg"} + ExpectApplied(ctx, env.Client, nodeClass) + ExpectObjectReconciled(ctx, env.Client, controller, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + 
Expect(nodeClass.StatusConditions().Get(v1.ConditionTypePlacementGroupReady).IsTrue()).To(BeTrue()) + Expect(nodeClass.Status.PlacementGroups).To(HaveLen(1)) + Expect(nodeClass.Status.PlacementGroups[0]).To(Equal(v1.PlacementGroup{ + ID: "pg-cluster123", + Name: "my-cluster-pg", + State: v1.PlacementGroupStateAvailable, + Strategy: v1.PlacementGroupStrategyCluster, + })) + }) + It("should resolve a placement group by ID", func() { + nodeClass.Spec.PlacementGroupSelector = &v1.PlacementGroupSelectorTerm{ID: "pg-cluster123"} + ExpectApplied(ctx, env.Client, nodeClass) + ExpectObjectReconciled(ctx, env.Client, controller, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + Expect(nodeClass.StatusConditions().Get(v1.ConditionTypePlacementGroupReady).IsTrue()).To(BeTrue()) + Expect(nodeClass.Status.PlacementGroups).To(HaveLen(1)) + Expect(nodeClass.Status.PlacementGroups[0].ID).To(Equal("pg-cluster123")) + Expect(nodeClass.Status.PlacementGroups[0].Name).To(Equal("my-cluster-pg")) + }) + It("should resolve a partition placement group with partition count", func() { + nodeClass.Spec.PlacementGroupSelector = &v1.PlacementGroupSelectorTerm{Name: "my-partition-pg"} + ExpectApplied(ctx, env.Client, nodeClass) + ExpectObjectReconciled(ctx, env.Client, controller, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + Expect(nodeClass.StatusConditions().Get(v1.ConditionTypePlacementGroupReady).IsTrue()).To(BeTrue()) + Expect(nodeClass.Status.PlacementGroups).To(HaveLen(1)) + Expect(nodeClass.Status.PlacementGroups[0]).To(Equal(v1.PlacementGroup{ + ID: "pg-partition456", + Name: "my-partition-pg", + State: v1.PlacementGroupStateAvailable, + Strategy: v1.PlacementGroupStrategyPartition, + PartitionCount: 7, + })) + }) + It("should resolve a spread placement group with spread level", func() { + nodeClass.Spec.PlacementGroupSelector = &v1.PlacementGroupSelectorTerm{Name: "my-spread-pg"} + ExpectApplied(ctx, env.Client, nodeClass) + 
ExpectObjectReconciled(ctx, env.Client, controller, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + Expect(nodeClass.StatusConditions().Get(v1.ConditionTypePlacementGroupReady).IsTrue()).To(BeTrue()) + Expect(nodeClass.Status.PlacementGroups).To(HaveLen(1)) + Expect(nodeClass.Status.PlacementGroups[0]).To(Equal(v1.PlacementGroup{ + ID: "pg-spread789", + Name: "my-spread-pg", + State: v1.PlacementGroupStateAvailable, + Strategy: v1.PlacementGroupStrategySpread, + SpreadLevel: v1.PlacementGroupSpreadLevelRack, + })) + }) + It("should set condition false when placement group is not found", func() { + nodeClass.Spec.PlacementGroupSelector = &v1.PlacementGroupSelectorTerm{Name: "nonexistent-pg"} + ExpectApplied(ctx, env.Client, nodeClass) + ExpectObjectReconciled(ctx, env.Client, controller, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + condition := nodeClass.StatusConditions().Get(v1.ConditionTypePlacementGroupReady) + Expect(condition.IsFalse()).To(BeTrue()) + Expect(condition.Reason).To(Equal("PlacementGroupNotFound")) + Expect(condition.Message).To(ContainSubstring("nonexistent-pg")) + Expect(nodeClass.Status.PlacementGroups).To(BeNil()) + }) + It("should set condition false when placement group is not in available state", func() { + // The DescribePlacementGroupsInput always filters by state=available, so a pending PG + // is filtered out at the EC2 API level. The reconciler sees nil from the provider and + // sets "PlacementGroupNotFound". 
+ nodeClass.Spec.PlacementGroupSelector = &v1.PlacementGroupSelectorTerm{Name: "my-pending-pg"} + ExpectApplied(ctx, env.Client, nodeClass) + ExpectObjectReconciled(ctx, env.Client, controller, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + condition := nodeClass.StatusConditions().Get(v1.ConditionTypePlacementGroupReady) + Expect(condition.IsFalse()).To(BeTrue()) + Expect(condition.Reason).To(Equal("PlacementGroupNotFound")) + Expect(condition.Message).To(ContainSubstring("my-pending-pg")) + Expect(nodeClass.Status.PlacementGroups).To(BeNil()) + }) + It("should clear status and condition when placement group selector is removed", func() { + // First, set up with a placement group + nodeClass.Spec.PlacementGroupSelector = &v1.PlacementGroupSelectorTerm{Name: "my-cluster-pg"} + ExpectApplied(ctx, env.Client, nodeClass) + ExpectObjectReconciled(ctx, env.Client, controller, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + Expect(nodeClass.Status.PlacementGroups).To(HaveLen(1)) + + // Now remove the selector - PlacementGroupReady condition should be cleared + nodeClass.Spec.PlacementGroupSelector = nil + ExpectApplied(ctx, env.Client, nodeClass) + ExpectObjectReconciled(ctx, env.Client, controller, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + Expect(nodeClass.StatusConditions().Get(v1.ConditionTypePlacementGroupReady)).To(BeNil()) + Expect(nodeClass.Status.PlacementGroups).To(BeNil()) + }) + It("should update status when placement group selector changes", func() { + // Start with cluster PG + nodeClass.Spec.PlacementGroupSelector = &v1.PlacementGroupSelectorTerm{Name: "my-cluster-pg"} + ExpectApplied(ctx, env.Client, nodeClass) + ExpectObjectReconciled(ctx, env.Client, controller, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + Expect(nodeClass.Status.PlacementGroups).To(HaveLen(1)) + Expect(nodeClass.Status.PlacementGroups[0].ID).To(Equal("pg-cluster123")) + + // Switch to spread PG + 
nodeClass.Spec.PlacementGroupSelector = &v1.PlacementGroupSelectorTerm{Name: "my-spread-pg"} + ExpectApplied(ctx, env.Client, nodeClass) + ExpectObjectReconciled(ctx, env.Client, controller, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) + Expect(nodeClass.Status.PlacementGroups).To(HaveLen(1)) + Expect(nodeClass.Status.PlacementGroups[0].ID).To(Equal("pg-spread789")) + Expect(nodeClass.Status.PlacementGroups[0].Strategy).To(Equal(v1.PlacementGroupStrategySpread)) + }) +}) diff --git a/pkg/controllers/nodeclass/suite_test.go b/pkg/controllers/nodeclass/suite_test.go index 64ccfe25fae4..71487bbcd56f 100644 --- a/pkg/controllers/nodeclass/suite_test.go +++ b/pkg/controllers/nodeclass/suite_test.go @@ -100,6 +100,7 @@ var _ = BeforeEach(func() { awsEnv.InstanceTypesProvider, awsEnv.LaunchTemplateProvider, awsEnv.CapacityReservationProvider, + awsEnv.PlacementGroupProvider, awsEnv.EC2API, awsEnv.ValidationCache, awsEnv.RecreationCache, diff --git a/pkg/controllers/nodeclass/validation_test.go b/pkg/controllers/nodeclass/validation_test.go index 06d8973cb17c..36914df037de 100644 --- a/pkg/controllers/nodeclass/validation_test.go +++ b/pkg/controllers/nodeclass/validation_test.go @@ -523,6 +523,7 @@ var _ = Describe("NodeClass Validation Status Controller", func() { awsEnv.InstanceTypesProvider, awsEnv.LaunchTemplateProvider, awsEnv.CapacityReservationProvider, + awsEnv.PlacementGroupProvider, awsEnv.EC2API, awsEnv.ValidationCache, awsEnv.RecreationCache, diff --git a/pkg/errors/errors.go b/pkg/errors/errors.go index 1f0ff0379cb9..8bf50f2eda73 100644 --- a/pkg/errors/errors.go +++ b/pkg/errors/errors.go @@ -183,6 +183,15 @@ func IsInsufficientFreeAddressesInSubnet(err ec2types.CreateFleetError) bool { return *err.ErrorCode == InsufficientFreeAddressesInSubnetErrorCode } +// IsSpreadPlacementGroupLimitError returns true if the fleet error indicates that +// the 7-instance-per-AZ limit for a spread placement group has been reached. 
+// EC2 returns this as an InsufficientInstanceCapacity error with the message: +// "You've reached the limit of instances in this spread placement group. A spread +// placement group can have up to seven instances per Availability Zone." +func IsSpreadPlacementGroupLimitError(err ec2types.CreateFleetError) bool { + return err.ErrorMessage != nil && strings.Contains(*err.ErrorMessage, "limit of instances in this spread placement group") +} + // IsReservationCapacityExceeded returns true if the fleet error means there is no remaining capacity for the provided // capacity reservation. func IsReservationCapacityExceeded(err ec2types.CreateFleetError) bool { diff --git a/pkg/fake/ec2api.go b/pkg/fake/ec2api.go index 3f1e3b1b1d62..ca3b34ddebe3 100644 --- a/pkg/fake/ec2api.go +++ b/pkg/fake/ec2api.go @@ -49,6 +49,7 @@ type CapacityPool struct { // pollute each other. type EC2Behavior struct { DescribeCapacityReservationsOutput AtomicPtr[ec2.DescribeCapacityReservationsOutput] + DescribePlacementGroupsOutput AtomicPtr[ec2.DescribePlacementGroupsOutput] DescribeImagesOutput AtomicPtr[ec2.DescribeImagesOutput] DescribeLaunchTemplatesOutput AtomicPtr[ec2.DescribeLaunchTemplatesOutput] DescribeInstanceTypesOutput AtomicPtr[ec2.DescribeInstanceTypesOutput] @@ -384,6 +385,19 @@ func filterInstances(instances []ec2types.Instance, filters []ec2types.Filter) [ return ret } +func (e *EC2API) DescribePlacementGroups(_ context.Context, input *ec2.DescribePlacementGroupsInput, _ ...func(*ec2.Options)) (*ec2.DescribePlacementGroupsOutput, error) { + if !e.NextError.IsNil() { + defer e.NextError.Reset() + return nil, e.NextError.Get() + } + if !e.DescribePlacementGroupsOutput.IsNil() { + out := e.DescribePlacementGroupsOutput.Clone() + out.PlacementGroups = FilterDescribePlacementGroups(out.PlacementGroups, input.GroupIds, input.GroupNames, input.Filters) + return out, nil + } + return &ec2.DescribePlacementGroupsOutput{}, nil +} + func (e *EC2API) DescribeCapacityReservations(ctx 
context.Context, input *ec2.DescribeCapacityReservationsInput, _ ...func(*ec2.Options)) (*ec2.DescribeCapacityReservationsOutput, error) { if !e.NextError.IsNil() { defer e.NextError.Reset() diff --git a/pkg/fake/utils.go b/pkg/fake/utils.go index ff92c4a242a7..5b8602172a70 100644 --- a/pkg/fake/utils.go +++ b/pkg/fake/utils.go @@ -111,6 +111,20 @@ func FilterDescribeCapacityReservations(crs []ec2types.CapacityReservation, ids }) } +func FilterDescribePlacementGroups(pgs []ec2types.PlacementGroup, ids []string, names []string, filters []ec2types.Filter) []ec2types.PlacementGroup { + idSet := sets.New(ids...) + nameSet := sets.New(names...) + return lo.Filter(pgs, func(pg ec2types.PlacementGroup, _ int) bool { + if len(ids) != 0 && !idSet.Has(lo.FromPtr(pg.GroupId)) { + return false + } + if len(names) != 0 && !nameSet.Has(lo.FromPtr(pg.GroupName)) { + return false + } + return Filter(filters, lo.FromPtr(pg.GroupId), lo.FromPtr(pg.GroupName), "", string(pg.State), pg.Tags) + }) +} + func FilterDescribeImages(images []ec2types.Image, filters []ec2types.Filter) []ec2types.Image { return lo.Filter(images, func(image ec2types.Image, _ int) bool { return Filter(filters, *image.ImageId, *image.Name, "", string(image.State), image.Tags) @@ -201,6 +215,13 @@ func MakeInstances() []ec2types.InstanceTypeInfo { MaximumNetworkInterfaces: aws.Int32(3), }}, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{ + ec2types.PlacementGroupStrategyCluster, + ec2types.PlacementGroupStrategyPartition, + ec2types.PlacementGroupStrategySpread, + }, + }, SupportedUsageClasses: DefaultSupportedUsageClasses, }) } diff --git a/pkg/fake/zz_generated.describe_instance_types.go b/pkg/fake/zz_generated.describe_instance_types.go index 4a9068c987a0..908b01049a2a 100644 --- a/pkg/fake/zz_generated.describe_instance_types.go +++ b/pkg/fake/zz_generated.describe_instance_types.go @@ -73,6 +73,9 @@ var defaultDescribeInstanceTypesOutput = 
&ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"cluster", "partition", "spread"}, + }, }, { InstanceType: "dl1.24xlarge", @@ -148,6 +151,9 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"cluster", "partition", "spread"}, + }, }, { InstanceType: "g4ad.16xlarge", @@ -208,6 +214,9 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"cluster", "partition", "spread"}, + }, }, { InstanceType: "g4dn.8xlarge", @@ -271,6 +280,9 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"cluster", "partition", "spread"}, + }, }, { InstanceType: "inf2.24xlarge", @@ -331,6 +343,9 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"cluster", "partition", "spread"}, + }, }, { InstanceType: "inf2.xlarge", @@ -391,6 +406,9 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"cluster", "partition", "spread"}, + }, }, { InstanceType: "m5.large", @@ -436,6 +454,9 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"cluster", "partition", "spread"}, + }, }, { InstanceType: "m5.metal", @@ -481,6 +502,9 @@ var 
defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"cluster", "partition", "spread"}, + }, }, { InstanceType: "m5.xlarge", @@ -526,6 +550,9 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"cluster", "partition", "spread"}, + }, }, { InstanceType: "m6idn.32xlarge", @@ -581,6 +608,9 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"cluster", "partition", "spread"}, + }, }, { InstanceType: "m7i-flex.large", @@ -626,6 +656,9 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"partition", "spread"}, + }, }, { InstanceType: "p3.8xlarge", @@ -683,6 +716,9 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"cluster", "partition", "spread"}, + }, }, { InstanceType: "t3.large", @@ -728,6 +764,9 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"partition", "spread"}, + }, }, { InstanceType: "t4g.medium", @@ -773,6 +812,9 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"partition", "spread"}, + }, }, { InstanceType: "t4g.small", @@ -818,6 +860,9 @@ var 
defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"partition", "spread"}, + }, }, { InstanceType: "t4g.xlarge", @@ -863,6 +908,9 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"partition", "spread"}, + }, }, { InstanceType: "trn1.2xlarge", @@ -926,6 +974,9 @@ var defaultDescribeInstanceTypesOutput = &ec2.DescribeInstanceTypesOutput{ }, }, }, + PlacementGroupInfo: &ec2types.PlacementGroupInfo{ + SupportedStrategies: []ec2types.PlacementGroupStrategy{"cluster", "partition", "spread"}, + }, }, }, } diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go index 1ce87461f537..0d1df2e71a0a 100644 --- a/pkg/operator/operator.go +++ b/pkg/operator/operator.go @@ -59,6 +59,7 @@ import ( "github.com/aws/karpenter-provider-aws/pkg/providers/instanceprofile" "github.com/aws/karpenter-provider-aws/pkg/providers/instancetype" "github.com/aws/karpenter-provider-aws/pkg/providers/launchtemplate" + "github.com/aws/karpenter-provider-aws/pkg/providers/placementgroup" "github.com/aws/karpenter-provider-aws/pkg/providers/pricing" "github.com/aws/karpenter-provider-aws/pkg/providers/securitygroup" ssmp "github.com/aws/karpenter-provider-aws/pkg/providers/ssm" @@ -91,6 +92,7 @@ type Operator struct { InstanceProvider instance.Provider SSMProvider ssmp.Provider CapacityReservationProvider capacityreservation.Provider + PlacementGroupProvider placementgroup.Provider EC2API *ec2.Client } @@ -174,6 +176,10 @@ func NewOperator(ctx context.Context, operator *operator.Operator) (context.Cont cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval), cache.New(awscache.CapacityReservationAvailabilityTTL, awscache.DefaultCleanupInterval), ) + placementGroupProvider := placementgroup.NewProvider( + 
ec2api, + cache.New(awscache.PlacementGroupTTL, awscache.DefaultCleanupInterval), + ) instanceTypeProvider := instancetype.NewDefaultProvider( cache.New(awscache.InstanceTypesZonesAndOfferingsTTL, awscache.DefaultCleanupInterval), cache.New(awscache.InstanceTypesZonesAndOfferingsTTL, awscache.DefaultCleanupInterval), @@ -224,6 +230,7 @@ func NewOperator(ctx context.Context, operator *operator.Operator) (context.Cont InstanceProvider: instanceProvider, SSMProvider: ssmProvider, CapacityReservationProvider: capacityReservationProvider, + PlacementGroupProvider: placementGroupProvider, EC2API: ec2api, } } diff --git a/pkg/providers/amifamily/resolver.go b/pkg/providers/amifamily/resolver.go index 36e5c0b35768..f91d7f889b11 100644 --- a/pkg/providers/amifamily/resolver.go +++ b/pkg/providers/amifamily/resolver.go @@ -18,6 +18,7 @@ import ( "context" "fmt" "net" + "strconv" "strings" "github.com/aws/aws-sdk-go-v2/aws" @@ -87,6 +88,8 @@ type LaunchTemplate struct { CapacityReservationType v1.CapacityReservationType CapacityReservationInterruptible bool Tenancy string + PlacementGroupID string + PlacementGroupPartition int32 } // AMIFamily can be implemented to override the default logic for generating dynamic launch template parameters @@ -133,6 +136,8 @@ func NewDefaultResolver(region string) *DefaultResolver { // Resolve generates launch templates using the static options and dynamically generates launch template parameters. // Multiple ResolvedTemplates are returned based on the instanceTypes passed in to support special AMIs for certain instance types like GPUs. 
+// +//nolint:gocyclo func (r DefaultResolver) Resolve(nodeClass *v1.EC2NodeClass, nodeClaim *karpv1.NodeClaim, instanceTypes []*cloudprovider.InstanceType, capacityType string, tenancyType string, options *Options) ([]*LaunchTemplate, error) { amiFamily := GetAMIFamily(nodeClass.AMIFamily(), options) if len(nodeClass.Status.AMIs) == 0 { @@ -142,6 +147,22 @@ func (r DefaultResolver) Resolve(nodeClass *v1.EC2NodeClass, nodeClaim *karpv1.N if len(mappedAMIs) == 0 { return nil, fmt.Errorf("no instance types satisfy requirements of amis %v", lo.Uniq(lo.Map(nodeClass.Status.AMIs, func(a v1.AMI, _ int) string { return a.ID }))) } + // Resolve placement group ID and partition targeting from nodeClass status and nodeClaim requirements. + var placementGroupID string + var placementGroupPartition int32 + if len(nodeClass.Status.PlacementGroups) > 0 { + placementGroupID = nodeClass.Status.PlacementGroups[0].ID + // If the NodeClaim targets a specific partition (via nodeSelector/affinity on the partition label), + // pass the partition number through to the launch template so EC2 places the instance in that partition. + if nodeClass.Status.PlacementGroups[0].Strategy == v1.PlacementGroupStrategyPartition { + reqs := scheduling.NewNodeSelectorRequirementsWithMinValues(nodeClaim.Spec.Requirements...) 
+ if partitionReq := reqs.Get(v1.LabelPlacementGroupPartition); partitionReq != nil && partitionReq.Len() == 1 { + if parsed, err := strconv.ParseInt(partitionReq.Any(), 10, 32); err == nil { + placementGroupPartition = int32(parsed) + } + } + } + } var resolvedTemplates []*LaunchTemplate for amiID, instanceTypes := range mappedAMIs { // In order to support reserved ENIs for CNI custom networking setups, @@ -195,7 +216,7 @@ func (r DefaultResolver) Resolve(nodeClass *v1.EC2NodeClass, nodeClaim *karpv1.N for params, instanceTypes := range paramsToInstanceTypes { reservationIDs := strings.Split(params.reservationIDs, ",") - resolvedTemplates = append(resolvedTemplates, r.resolveLaunchTemplates(nodeClass, nodeClaim, instanceTypes, capacityType, amiFamily, amiID, params.maxPods, params.efaCount, reservationIDs, params.reservationType, params.reservationInterruptible, options, tenancyType)...) + resolvedTemplates = append(resolvedTemplates, r.resolveLaunchTemplates(nodeClass, nodeClaim, instanceTypes, capacityType, amiFamily, amiID, params.maxPods, params.efaCount, reservationIDs, params.reservationType, params.reservationInterruptible, options, tenancyType, placementGroupID, placementGroupPartition)...) 
} } return resolvedTemplates, nil @@ -246,6 +267,7 @@ func (r DefaultResolver) defaultClusterDNS(opts *Options, kubeletConfig *v1.Kube return newKubeletConfig } +//nolint:gocyclo func (r DefaultResolver) resolveLaunchTemplates( nodeClass *v1.EC2NodeClass, nodeClaim *karpv1.NodeClaim, @@ -260,6 +282,8 @@ func (r DefaultResolver) resolveLaunchTemplates( capacityReservationInterruptible bool, options *Options, tenancyType string, + placementGroupID string, + placementGroupPartition int32, ) []*LaunchTemplate { kubeletConfig := &v1.KubeletConfiguration{} if nodeClass.Spec.Kubelet != nil { @@ -319,6 +343,8 @@ func (r DefaultResolver) resolveLaunchTemplates( CapacityReservationType: capacityReservationType, CapacityReservationInterruptible: capacityReservationInterruptible, Tenancy: tenancyType, + PlacementGroupID: placementGroupID, + PlacementGroupPartition: placementGroupPartition, } if len(resolved.BlockDeviceMappings) == 0 { resolved.BlockDeviceMappings = amiFamily.DefaultBlockDeviceMappings() diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index 924cb7e5c021..5e15fa11e3bc 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -353,7 +353,7 @@ func (p *DefaultProvider) launchInstance( } return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("creating fleet request, %w", err), reason, fmt.Sprintf("Error creating fleet request: %s", message)) } - p.updateUnavailableOfferingsCache(ctx, createFleetOutput.Errors, capacityType, nodeClaim, instanceTypes, aws.ToString(createFleetOutput.FleetId)) + p.updateUnavailableOfferingsCache(ctx, createFleetOutput.Errors, capacityType, nodeClaim, nodeClass, instanceTypes, aws.ToString(createFleetOutput.FleetId)) if len(createFleetOutput.Instances) == 0 || len(createFleetOutput.Instances[0].InstanceIds) == 0 { requestID, _ := awsmiddleware.GetRequestIDMetadata(createFleetOutput.ResultMetadata) return ec2types.CreateFleetInstance{}, 
serrors.Wrap( @@ -485,9 +485,23 @@ func (p *DefaultProvider) updateUnavailableOfferingsCache( errs []ec2types.CreateFleetError, capacityType string, nodeClaim *karpv1.NodeClaim, + nodeClass *v1.EC2NodeClass, instanceTypes []*cloudprovider.InstanceType, fleetID string, ) { + // Resolve the placement group scope from the nodeClass and nodeClaim for scoping ICE cache entries. + // When a specific partition is targeted (via NodeClaim requirements), the ICE entry is scoped to + // that partition so other partitions remain available for scheduling. + var pgScope awscache.PlacementGroupScope + if len(nodeClass.Status.PlacementGroups) > 0 { + pgScope.ID = nodeClass.Status.PlacementGroups[0].ID + // Check if a specific partition was targeted for this launch + reqs := scheduling.NewNodeSelectorRequirementsWithMinValues(nodeClaim.Spec.Requirements...) + if partitionReq := reqs.Get(v1.LabelPlacementGroupPartition); partitionReq != nil && partitionReq.Len() == 1 { + pgScope.Partition = partitionReq.Any() + } + } + for _, err := range errs { zone := lo.FromPtr(err.LaunchTemplateAndOverrides.Overrides.AvailabilityZone) if awserrors.IsInsufficientFreeAddressesInSubnet(err) && zone != "" { @@ -506,7 +520,29 @@ func (p *DefaultProvider) updateUnavailableOfferingsCache( if fleetID != "" { unavailableReason["fleet-id"] = fleetID } - p.unavailableOfferings.MarkUnavailable(ctx, instanceType, zone, capacityType, unavailableReason) + // For spread placement groups, detect the 7-instance-per-AZ limit error. + // When this limit is reached, mark all instance types in the AZ as unavailable + // for this placement group, since the limit is per-AZ per-group (not per instance type). 
+ if awserrors.IsSpreadPlacementGroupLimitError(err) && pgScope.ID != "" { + log.FromContext(ctx).WithValues( + "placement-group-id", pgScope.ID, + "zone", zone, + ).V(1).Info("spread placement group AZ limit reached, marking all instance types unavailable for this PG in AZ") + // Mark every instance type in this AZ as unavailable for this placement group. + // Spread PGs don't have partitions, so we only scope by PG ID. + spreadScope := awscache.PlacementGroupScope{ID: pgScope.ID} + for _, it := range instanceTypes { + p.unavailableOfferings.MarkUnavailable(ctx, ec2types.InstanceType(it.Name), zone, capacityType, unavailableReason, spreadScope) + } + continue + } + // For all other ICE errors, scope the cache entry to the placement group + // (and partition if targeted) so PG-specific ICEs don't block non-PG launches + if pgScope.ID != "" { + p.unavailableOfferings.MarkUnavailable(ctx, instanceType, zone, capacityType, unavailableReason, pgScope) + } else { + p.unavailableOfferings.MarkUnavailable(ctx, instanceType, zone, capacityType, unavailableReason) + } } if awserrors.IsServiceLinkedRoleCreationNotPermitted(err) { p.unavailableOfferings.MarkCapacityTypeUnavailable(karpv1.CapacityTypeSpot) @@ -520,6 +556,9 @@ func (p *DefaultProvider) updateUnavailableOfferingsCache( reservationIDs := make([]string, 0, len(errs)) for i := range errs { if awserrors.IsUnfulfillableCapacity(errs[i]) { + if awserrors.IsSpreadPlacementGroupLimitError(errs[i]) { + continue + } capacityReservationDetails := p.getCapacityReservationDetailsForInstance( string(errs[i].LaunchTemplateAndOverrides.Overrides.InstanceType), lo.FromPtr(errs[i].LaunchTemplateAndOverrides.Overrides.AvailabilityZone), diff --git a/pkg/providers/instance/types.go b/pkg/providers/instance/types.go index cf33c9123553..c56176fac8b1 100644 --- a/pkg/providers/instance/types.go +++ b/pkg/providers/instance/types.go @@ -48,12 +48,14 @@ type Instance struct { EFAEnabled bool CapacityReservationDetails 
*CapacityReservationDetails Tenancy string + PartitionNumber *int32 } type CapacityReservationDetails struct { - ID string - Type v1.CapacityReservationType - Interruptible bool + ID string + Type v1.CapacityReservationType + Interruptible bool + PartitionNumber *int32 } func NewInstance(ctx context.Context, instance ec2types.Instance) *Instance { @@ -89,9 +91,17 @@ func NewInstance(ctx context.Context, instance ec2types.Instance) *Instance { }), CapacityReservationDetails: capacityReservationDetails, Tenancy: tenancyFromInstance(instance), + PartitionNumber: partitionNumberFromInstance(instance), } } +func partitionNumberFromInstance(instance ec2types.Instance) *int32 { + if instance.Placement != nil && instance.Placement.PartitionNumber != nil && *instance.Placement.PartitionNumber != 0 { + return instance.Placement.PartitionNumber + } + return nil +} + func tenancyFromInstance(instance ec2types.Instance) string { tenancy := instance.Placement.Tenancy return string(lo.Ternary(tenancy == "", ec2types.TenancyDefault, tenancy)) diff --git a/pkg/providers/instancetype/compatibility/compatibility.go b/pkg/providers/instancetype/compatibility/compatibility.go index 8b369e577304..eb89a697b734 100644 --- a/pkg/providers/instancetype/compatibility/compatibility.go +++ b/pkg/providers/instancetype/compatibility/compatibility.go @@ -18,12 +18,14 @@ import ( "strings" ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/samber/lo" v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" ) type NodeClass interface { AMIFamily() string + PlacementGroups() []v1.PlacementGroup } type CompatibleCheck interface { @@ -33,6 +35,7 @@ type CompatibleCheck interface { func IsCompatibleWithNodeClass(info ec2types.InstanceTypeInfo, nodeClass NodeClass) bool { for _, check := range []CompatibleCheck{ amiFamilyCompatibility(nodeClass.AMIFamily()), + placementGroupCompatibility(nodeClass.PlacementGroups()), } { if !check.compatibleCheck(info) { return false @@ -58,3 +61,27 
@@ func (c amiFamilyCheck) compatibleCheck(info ec2types.InstanceTypeInfo) bool { } return true } + +type placementGroupCheck struct { + placementGroups []v1.PlacementGroup +} + +func placementGroupCompatibility(placementGroups []v1.PlacementGroup) CompatibleCheck { + return &placementGroupCheck{ + placementGroups: placementGroups, + } +} + +func (c placementGroupCheck) compatibleCheck(info ec2types.InstanceTypeInfo) bool { + if len(c.placementGroups) == 0 { + return true + } + return lo.Contains(info.PlacementGroupInfo.SupportedStrategies, PlacementGroupStrategyToEC2(c.placementGroups[0].Strategy)) +} + +func PlacementGroupStrategyToEC2(strategy v1.PlacementGroupStrategy) ec2types.PlacementGroupStrategy { + resolvedType, _ := lo.Find(ec2types.PlacementGroupStrategy("").Values(), func(crt ec2types.PlacementGroupStrategy) bool { + return string(crt) == string(strategy) + }) + return resolvedType +} diff --git a/pkg/providers/instancetype/compatibility/suite_test.go b/pkg/providers/instancetype/compatibility/suite_test.go index 3a4fd269e1f5..9dd54e069357 100644 --- a/pkg/providers/instancetype/compatibility/suite_test.go +++ b/pkg/providers/instancetype/compatibility/suite_test.go @@ -36,7 +36,7 @@ var _ = Describe("CompatibilityTest", func() { DescribeTable("should handle various instance types across different AMI families", func(instanceType string, amiFamily string, expected bool) { info := makeInstanceTypeInfo(instanceType) - nc := newMockNodeClass(amiFamily) + nc := newMockNodeClass(amiFamily, nil) result := compatibility.IsCompatibleWithNodeClass(info, nc) Expect(result).To(Equal(expected)) }, @@ -48,22 +48,65 @@ var _ = Describe("CompatibilityTest", func() { Entry("a1.large w/ Bottlerocket", "a1.large", v1.AMIFamilyBottlerocket, true), ) }) + Context("PlacementGroupCompatibility", func() { + DescribeTable("should handle placement group strategy compatibility", + func(instanceType string, supportedStrategies []ec2types.PlacementGroupStrategy, placementGroups 
[]v1.PlacementGroup, expected bool) { + info := makeInstanceTypeInfo(instanceType) + info.PlacementGroupInfo = &ec2types.PlacementGroupInfo{ + SupportedStrategies: supportedStrategies, + } + nc := newMockNodeClass(v1.AMIFamilyAL2023, placementGroups) + result := compatibility.IsCompatibleWithNodeClass(info, nc) + Expect(result).To(Equal(expected)) + }, + Entry("nil placement groups", "m5.large", + []ec2types.PlacementGroupStrategy{ec2types.PlacementGroupStrategyCluster}, + nil, true), + Entry("empty placement groups", "m5.large", + []ec2types.PlacementGroupStrategy{ec2types.PlacementGroupStrategyCluster}, + []v1.PlacementGroup{}, true), + Entry("cluster strategy supported", "m5.large", + []ec2types.PlacementGroupStrategy{ec2types.PlacementGroupStrategyCluster, ec2types.PlacementGroupStrategyPartition, ec2types.PlacementGroupStrategySpread}, + []v1.PlacementGroup{{Strategy: v1.PlacementGroupStrategyCluster}}, true), + Entry("cluster strategy not supported", "t3.medium", + []ec2types.PlacementGroupStrategy{ec2types.PlacementGroupStrategyPartition, ec2types.PlacementGroupStrategySpread}, + []v1.PlacementGroup{{Strategy: v1.PlacementGroupStrategyCluster}}, false), + Entry("partition strategy supported", "m5.large", + []ec2types.PlacementGroupStrategy{ec2types.PlacementGroupStrategyCluster, ec2types.PlacementGroupStrategyPartition, ec2types.PlacementGroupStrategySpread}, + []v1.PlacementGroup{{Strategy: v1.PlacementGroupStrategyPartition}}, true), + Entry("partition strategy not supported", "t3.medium", + []ec2types.PlacementGroupStrategy{ec2types.PlacementGroupStrategySpread}, + []v1.PlacementGroup{{Strategy: v1.PlacementGroupStrategyPartition}}, false), + Entry("spread strategy supported", "m5.large", + []ec2types.PlacementGroupStrategy{ec2types.PlacementGroupStrategySpread}, + []v1.PlacementGroup{{Strategy: v1.PlacementGroupStrategySpread}}, true), + Entry("spread strategy not supported", "t3.medium", + 
[]ec2types.PlacementGroupStrategy{ec2types.PlacementGroupStrategyCluster}, + []v1.PlacementGroup{{Strategy: v1.PlacementGroupStrategySpread}}, false), + ) + }) }) -func newMockNodeClass(amiFamily string) *mockNodeClass { +func newMockNodeClass(amiFamily string, placementGroups []v1.PlacementGroup) *mockNodeClass { return &mockNodeClass{ - amiFamily: amiFamily, + amiFamily: amiFamily, + placementGroups: placementGroups, } } type mockNodeClass struct { - amiFamily string + amiFamily string + placementGroups []v1.PlacementGroup } func (m mockNodeClass) AMIFamily() string { return m.amiFamily } +func (m mockNodeClass) PlacementGroups() []v1.PlacementGroup { + return m.placementGroups +} + func makeInstanceTypeInfo(instanceType string) ec2types.InstanceTypeInfo { return ec2types.InstanceTypeInfo{ InstanceType: ec2types.InstanceType(instanceType), diff --git a/pkg/providers/instancetype/instancetype.go b/pkg/providers/instancetype/instancetype.go index 57825e18ee55..74d56454099e 100644 --- a/pkg/providers/instancetype/instancetype.go +++ b/pkg/providers/instancetype/instancetype.go @@ -57,6 +57,7 @@ type NodeClass interface { AMIs() []v1.AMI BlockDeviceMappings() []*v1.BlockDeviceMapping CapacityReservations() []v1.CapacityReservation + PlacementGroups() []v1.PlacementGroup InstanceStorePolicy() *v1.InstanceStorePolicy KubeletConfiguration() *v1.KubeletConfiguration ZoneInfo() []v1.ZoneInfo diff --git a/pkg/providers/instancetype/offering/offering.go b/pkg/providers/instancetype/offering/offering.go index 728bbf0000db..196b633d5dbf 100644 --- a/pkg/providers/instancetype/offering/offering.go +++ b/pkg/providers/instancetype/offering/offering.go @@ -43,6 +43,7 @@ type Provider interface { type NodeClass interface { CapacityReservations() []v1.CapacityReservation + PlacementGroups() []v1.PlacementGroup ZoneInfo() []v1.ZoneInfo AMIFamily() string } @@ -92,6 +93,8 @@ func (p *DefaultProvider) InjectOfferings( allZones, subnetZonesToZoneIDs, ) + // For partition placement 
groups, expand each offering into N offerings (one per partition) + offerings = p.expandPartitionOfferings(offerings, nodeClass) // NOTE: By making this copy one level deep, we can modify the offerings without mutating the results from previous // GetInstanceTypes calls. This should still be done with caution - it is currently done here in the provider, and // once in the instance provider (filterReservedInstanceTypes) @@ -130,6 +133,11 @@ func (p *DefaultProvider) createOfferings( if ofs, ok := p.cache.Get(p.cacheKeyFromInstanceType(it)); ok && lastSeqNum == seqNum { offerings = append(offerings, ofs.([]*cloudprovider.Offering)...) } else { + // Resolve the placement group scope for scoping ICE cache lookups + var pgScope awscache.PlacementGroupScope + if pgs := nodeClass.PlacementGroups(); len(pgs) > 0 { + pgScope.ID = pgs[0].ID + } var cachedOfferings []*cloudprovider.Offering for zone := range allZones { for _, capacityType := range it.Requirements.Get(karpv1.CapacityTypeLabelKey).Values() { @@ -137,7 +145,12 @@ func (p *DefaultProvider) createOfferings( if capacityType == karpv1.CapacityTypeReserved { continue } - isUnavailable := p.unavailableOfferings.IsUnavailable(ec2types.InstanceType(it.Name), zone, capacityType) + var isUnavailable bool + if pgScope.ID != "" { + isUnavailable = p.unavailableOfferings.IsUnavailable(ec2types.InstanceType(it.Name), zone, capacityType, pgScope) + } else { + isUnavailable = p.unavailableOfferings.IsUnavailable(ec2types.InstanceType(it.Name), zone, capacityType) + } var price float64 var hasPrice bool switch capacityType { @@ -169,45 +182,91 @@ func (p *DefaultProvider) createOfferings( p.lastUnavailableOfferingsSeqNum.Store(ec2types.InstanceType(it.Name), seqNum) offerings = append(offerings, cachedOfferings...) 
} - if !options.FromContext(ctx).FeatureGates.ReservedCapacity { - return offerings - } - - capacityReservations := nodeClass.CapacityReservations() - for i := range capacityReservations { - if capacityReservations[i].InstanceType != it.Name { - continue - } - reservation := &capacityReservations[i] - price := 0.0 - if odPrice, ok := p.pricingProvider.OnDemandPrice(ec2types.InstanceType(it.Name)); ok { - // Divide the on-demand price by a sufficiently large constant. This allows us to treat the reservation as "free", - // while maintaining relative ordering for consolidation. If the pricing details are unavailable for whatever reason, - // still succeed to create the offering and leave the price at zero. This will break consolidation, but will allow - // users to utilize the instances they're already paying for. - price = odPrice / 10_000_000.0 - } - reservationCapacity := p.capacityReservationProvider.GetAvailableInstanceCount(reservation.ID) - offering := &cloudprovider.Offering{ - Requirements: scheduling.NewRequirements( - scheduling.NewRequirement(karpv1.CapacityTypeLabelKey, corev1.NodeSelectorOpIn, karpv1.CapacityTypeReserved), - scheduling.NewRequirement(corev1.LabelTopologyZone, corev1.NodeSelectorOpIn, reservation.AvailabilityZone), - scheduling.NewRequirement(cloudprovider.ReservationIDLabel, corev1.NodeSelectorOpIn, reservation.ID), - scheduling.NewRequirement(v1.LabelCapacityReservationType, corev1.NodeSelectorOpIn, string(reservation.ReservationType)), - scheduling.NewRequirement(v1.LabelCapacityReservationInterruptible, corev1.NodeSelectorOpIn, fmt.Sprintf("%t", reservation.Interruptible)), - ), - Price: price, - Available: isCompatibleWithNodeClass && reservationCapacity != 0 && itZones.Has(reservation.AvailabilityZone) && reservation.State != v1.CapacityReservationStateExpiring, - ReservationCapacity: reservationCapacity, + if options.FromContext(ctx).FeatureGates.ReservedCapacity { + capacityReservations := nodeClass.CapacityReservations() + for i 
:= range capacityReservations { + if capacityReservations[i].InstanceType != it.Name { + continue + } + reservation := &capacityReservations[i] + price := 0.0 + if odPrice, ok := p.pricingProvider.OnDemandPrice(ec2types.InstanceType(it.Name)); ok { + // Divide the on-demand price by a sufficiently large constant. This allows us to treat the reservation as "free", + // while maintaining relative ordering for consolidation. If the pricing details are unavailable for whatever reason, + // still succeed to create the offering and leave the price at zero. This will break consolidation, but will allow + // users to utilize the instances they're already paying for. + price = odPrice / 10_000_000.0 + } + reservationCapacity := p.capacityReservationProvider.GetAvailableInstanceCount(reservation.ID) + offering := &cloudprovider.Offering{ + Requirements: scheduling.NewRequirements( + scheduling.NewRequirement(karpv1.CapacityTypeLabelKey, corev1.NodeSelectorOpIn, karpv1.CapacityTypeReserved), + scheduling.NewRequirement(corev1.LabelTopologyZone, corev1.NodeSelectorOpIn, reservation.AvailabilityZone), + scheduling.NewRequirement(cloudprovider.ReservationIDLabel, corev1.NodeSelectorOpIn, reservation.ID), + scheduling.NewRequirement(v1.LabelCapacityReservationType, corev1.NodeSelectorOpIn, string(reservation.ReservationType)), + scheduling.NewRequirement(v1.LabelCapacityReservationInterruptible, corev1.NodeSelectorOpIn, fmt.Sprintf("%t", reservation.Interruptible)), + ), + Price: price, + Available: isCompatibleWithNodeClass && reservationCapacity != 0 && itZones.Has(reservation.AvailabilityZone) && reservation.State != v1.CapacityReservationStateExpiring, + ReservationCapacity: reservationCapacity, + } + if id, ok := subnetZonesToZoneIDs[reservation.AvailabilityZone]; ok { + offering.Requirements.Add(scheduling.NewRequirement(v1.LabelTopologyZoneID, corev1.NodeSelectorOpIn, id)) + } + offerings = append(offerings, offering) } - if id, ok := 
subnetZonesToZoneIDs[reservation.AvailabilityZone]; ok { - offering.Requirements.Add(scheduling.NewRequirement(v1.LabelTopologyZoneID, corev1.NodeSelectorOpIn, id)) + } + // Add placement group ID requirement to all offerings (on-demand, spot, and reserved) when a placement group + // is configured. This enables the scheduler to match offerings against NodePool/pod constraints on placement + // group membership, and enables drift detection when the EC2NodeClass's placement group changes. + if pgs := nodeClass.PlacementGroups(); len(pgs) > 0 { + for i, offering := range offerings { + // Copy the offering and its requirements before mutating to avoid concurrent map writes + // on cached offering objects shared across goroutines. + reqs := scheduling.NewRequirements(offering.Requirements.Values()...) + reqs.Add( + scheduling.NewRequirement(v1.LabelPlacementGroupID, corev1.NodeSelectorOpIn, pgs[0].ID), + ) + offerings[i] = &cloudprovider.Offering{ + Requirements: reqs, + Price: offering.Price, + Available: isCompatibleWithNodeClass && offering.Available, + ReservationCapacity: offering.ReservationCapacity, + } } - offerings = append(offerings, offering) } return offerings } +// expandPartitionOfferings expands each offering into N offerings (one per partition) for partition placement groups. +// This enables the scheduler to use TopologySpreadConstraints with the partition topology key. +func (p *DefaultProvider) expandPartitionOfferings(offerings cloudprovider.Offerings, nodeClass NodeClass) cloudprovider.Offerings { + pgs := nodeClass.PlacementGroups() + if len(pgs) == 0 || pgs[0].Strategy != v1.PlacementGroupStrategyPartition { + return offerings + } + partitionCount := int(pgs[0].PartitionCount) + if partitionCount <= 0 { + return offerings + } + var expanded []*cloudprovider.Offering + for _, offering := range offerings {
+ for partition := 1; partition <= partitionCount; partition++ { + // Construct a fresh copy of the base offering's requirements for every partition. Requirements + // is map-backed, so mutating one shared instance would leave every expanded offering carrying + // the last partition's label instead of one offering per partition. + reqs := scheduling.NewRequirements(offering.Requirements.Values()...) + reqs.Add(scheduling.NewRequirement(v1.LabelPlacementGroupPartition, corev1.NodeSelectorOpIn, fmt.Sprintf("%d", partition))) + expanded = append(expanded, &cloudprovider.Offering{ + Requirements: reqs, + Price: offering.Price, + Available: offering.Available, + ReservationCapacity: offering.ReservationCapacity, + }) + } + } + return expanded +} + func (p *DefaultProvider) cacheKeyFromInstanceType(it *cloudprovider.InstanceType) string { zonesHash, _ := hashstructure.Hash( it.Requirements.Get(corev1.LabelTopologyZone).Values(), @@ -219,10 +278,22 @@ func (p *DefaultProvider) cacheKeyFromInstanceTyp hashstructure.FormatV2, &hashstructure.HashOptions{SlicesAsSets: true}, ) + placementGroupsHash, _ := hashstructure.Hash( + it.Requirements.Get(v1.LabelPlacementGroupID).Values(), + hashstructure.FormatV2, + &hashstructure.HashOptions{SlicesAsSets: true}, + ) + placementGroupPartitionsHash, _ := hashstructure.Hash( + it.Requirements.Get(v1.LabelPlacementGroupPartition).Values(), + hashstructure.FormatV2, + &hashstructure.HashOptions{SlicesAsSets: true}, + ) return fmt.Sprintf( - "%s-%016x-%016x", + "%s-%016x-%016x-%016x-%016x", it.Name, zonesHash, capacityTypesHash, + placementGroupsHash, + placementGroupPartitionsHash, ) } diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index a7977d85e051..39660391c779 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -266,6 +266,9 @@ var _ = Describe("InstanceTypeProvider", func() { v1.LabelCapacityReservationID, v1.LabelCapacityReservationType, v1.LabelCapacityReservationInterruptible, + // Placement group labels are only present when a placement group is configured on the NodeClass + v1.LabelPlacementGroupID, + v1.LabelPlacementGroupPartition, )).UnsortedList(), 
lo.Keys(karpv1.NormalizedLabels)...))) var pods []*corev1.Pod @@ -319,7 +322,7 @@ var _ = Describe("InstanceTypeProvider", func() { "topology.ebs.csi.aws.com/zone": "test-zone-1a", } - // Ensure that we're exercising all well known labels except for the accelerator and capacity reservation labels + // Ensure that we're exercising all well known labels except for the accelerator, capacity reservation, and placement group labels Expect(lo.Keys(nodeSelector)).To(ContainElements( append( karpv1.WellKnownLabels.Difference(sets.New( @@ -329,6 +332,8 @@ var _ = Describe("InstanceTypeProvider", func() { v1.LabelInstanceAcceleratorCount, v1.LabelInstanceAcceleratorName, v1.LabelInstanceAcceleratorManufacturer, + v1.LabelPlacementGroupID, + v1.LabelPlacementGroupPartition, corev1.LabelWindowsBuild, )).UnsortedList(), lo.Keys(karpv1.NormalizedLabels)...))) @@ -376,7 +381,7 @@ var _ = Describe("InstanceTypeProvider", func() { "topology.ebs.csi.aws.com/zone": "test-zone-1a", } - // Ensure that we're exercising all well known labels except for the gpu, nvme and capacity reservation id labels + // Ensure that we're exercising all well known labels except for the gpu, nvme, capacity reservation, and placement group labels expectedLabels := append(karpv1.WellKnownLabels.Difference(sets.New( v1.LabelCapacityReservationID, v1.LabelCapacityReservationType, @@ -386,6 +391,8 @@ var _ = Describe("InstanceTypeProvider", func() { v1.LabelInstanceGPUManufacturer, v1.LabelInstanceGPUMemory, v1.LabelInstanceLocalNVME, + v1.LabelPlacementGroupID, + v1.LabelPlacementGroupPartition, corev1.LabelWindowsBuild, )).UnsortedList(), lo.Keys(karpv1.NormalizedLabels)...) 
Expect(lo.Keys(nodeSelector)).To(ContainElements(expectedLabels)) diff --git a/pkg/providers/launchtemplate/suite_test.go b/pkg/providers/launchtemplate/suite_test.go index 8626adeb8835..4669e481b6a0 100644 --- a/pkg/providers/launchtemplate/suite_test.go +++ b/pkg/providers/launchtemplate/suite_test.go @@ -2225,7 +2225,7 @@ eviction-max-pod-grace-period = 10 } ExpectApplied(ctx, env.Client, nodeClass, nodePool) - controller := nodeclass.NewController(awsEnv.Clock, env.Client, cloudProvider, recorder, fake.DefaultRegion, awsEnv.SubnetProvider, awsEnv.SecurityGroupProvider, awsEnv.AMIProvider, awsEnv.InstanceProfileProvider, awsEnv.InstanceTypesProvider, awsEnv.LaunchTemplateProvider, awsEnv.CapacityReservationProvider, awsEnv.EC2API, awsEnv.ValidationCache, awsEnv.RecreationCache, awsEnv.AMIResolver, options.FromContext(ctx).DisableDryRun) + controller := nodeclass.NewController(awsEnv.Clock, env.Client, cloudProvider, recorder, fake.DefaultRegion, awsEnv.SubnetProvider, awsEnv.SecurityGroupProvider, awsEnv.AMIProvider, awsEnv.InstanceProfileProvider, awsEnv.InstanceTypesProvider, awsEnv.LaunchTemplateProvider, awsEnv.CapacityReservationProvider, awsEnv.PlacementGroupProvider, awsEnv.EC2API, awsEnv.ValidationCache, awsEnv.RecreationCache, awsEnv.AMIResolver, options.FromContext(ctx).DisableDryRun) ExpectObjectReconciled(ctx, env.Client, controller, nodeClass) pod := coretest.UnschedulablePod() diff --git a/pkg/providers/launchtemplate/types.go b/pkg/providers/launchtemplate/types.go index 8efecc7640da..290858ad7f73 100644 --- a/pkg/providers/launchtemplate/types.go +++ b/pkg/providers/launchtemplate/types.go @@ -133,9 +133,7 @@ func (b *CreateLaunchTemplateInputBuilder) Build(ctx context.Context) *ec2.Creat }, NetworkInterfaces: networkInterfaces, TagSpecifications: launchTemplateDataTags, - Placement: &ec2types.LaunchTemplatePlacementRequest{ - Tenancy: ec2types.Tenancy(b.options.Tenancy), - }, + Placement: b.buildPlacement(), }, TagSpecifications: 
[]ec2types.TagSpecification{ { @@ -176,3 +174,16 @@ func (b *CreateLaunchTemplateInputBuilder) Build(ctx context.Context) *ec2.Creat } return lt } + +func (b *CreateLaunchTemplateInputBuilder) buildPlacement() *ec2types.LaunchTemplatePlacementRequest { + placement := &ec2types.LaunchTemplatePlacementRequest{ + Tenancy: ec2types.Tenancy(b.options.Tenancy), + } + if b.options.PlacementGroupID != "" { + placement.GroupId = lo.ToPtr(b.options.PlacementGroupID) + } + if b.options.PlacementGroupPartition != 0 { + placement.PartitionNumber = lo.ToPtr(b.options.PlacementGroupPartition) + } + return placement +} diff --git a/pkg/providers/placementgroup/provider.go b/pkg/providers/placementgroup/provider.go new file mode 100644 index 000000000000..8fa28c0c98a2 --- /dev/null +++ b/pkg/providers/placementgroup/provider.go @@ -0,0 +1,77 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package placementgroup + +import ( + "context" + "fmt" + "sync" + + ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/patrickmn/go-cache" + "sigs.k8s.io/karpenter/pkg/utils/pretty" + + v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" + sdk "github.com/aws/karpenter-provider-aws/pkg/aws" + awserrors "github.com/aws/karpenter-provider-aws/pkg/errors" +) + +type Provider interface { + // Get resolves a single placement group from a PlacementGroupSelectorTerm. 
+ Get(context.Context, v1.PlacementGroupSelectorTerm) (*ec2types.PlacementGroup, error) +} + +type DefaultProvider struct { + sync.Mutex + + ec2api sdk.EC2API + cache *cache.Cache + cm *pretty.ChangeMonitor +} + +func NewProvider(ec2api sdk.EC2API, placementGroupCache *cache.Cache) *DefaultProvider { + return &DefaultProvider{ + ec2api: ec2api, + cache: placementGroupCache, + cm: pretty.NewChangeMonitor(), + } +} + +func (p *DefaultProvider) Get(ctx context.Context, term v1.PlacementGroupSelectorTerm) (*ec2types.PlacementGroup, error) { + p.Lock() + defer p.Unlock() + + q := &Query{ID: term.ID, Name: term.Name} + + if entry, ok := p.cache.Get(q.CacheKey()); ok { + return entry.(*ec2types.PlacementGroup), nil + } + + out, err := p.ec2api.DescribePlacementGroups(ctx, q.DescribePlacementGroupsInput()) + if err != nil { + if awserrors.IsNotFound(err) { + p.cache.Delete(q.CacheKey()) + return nil, nil + } + return nil, fmt.Errorf("describing placement groups, %w", err) + } + if len(out.PlacementGroups) == 0 { + return nil, nil + } + + pg := &out.PlacementGroups[0] + p.cache.SetDefault(q.CacheKey(), pg) + return pg, nil +} diff --git a/pkg/providers/placementgroup/suite_test.go b/pkg/providers/placementgroup/suite_test.go new file mode 100644 index 000000000000..21238b81db6f --- /dev/null +++ b/pkg/providers/placementgroup/suite_test.go @@ -0,0 +1,208 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package placementgroup_test + +import ( + "context" + "fmt" + "testing" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/ec2" + ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/samber/lo" + coreoptions "sigs.k8s.io/karpenter/pkg/operator/options" + coretest "sigs.k8s.io/karpenter/pkg/test" + + "github.com/aws/karpenter-provider-aws/pkg/apis" + v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" + "github.com/aws/karpenter-provider-aws/pkg/operator/options" + "github.com/aws/karpenter-provider-aws/pkg/providers/placementgroup" + "github.com/aws/karpenter-provider-aws/pkg/test" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "sigs.k8s.io/karpenter/pkg/test/v1alpha1" + . "sigs.k8s.io/karpenter/pkg/utils/testing" +) + +var ctx context.Context +var env *coretest.Environment +var awsEnv *test.Environment + +func TestAPIs(t *testing.T) { + ctx = TestContextWithLogger(t) + RegisterFailHandler(Fail) + RunSpecs(t, "PlacementGroup") +} + +var _ = BeforeSuite(func() { + env = coretest.NewEnvironment( + coretest.WithCRDs(test.DisableCapacityReservationIDValidation(test.RemoveNodeClassTagValidation(apis.CRDs))...), + coretest.WithCRDs(v1alpha1.CRDs...), + ) + ctx = coreoptions.ToContext(ctx, coretest.Options(coretest.OptionsFields{FeatureGates: coretest.FeatureGates{ReservedCapacity: lo.ToPtr(true)}})) + ctx = options.ToContext(ctx, test.Options()) + awsEnv = test.NewEnvironment(ctx, env) +}) + +var _ = AfterSuite(func() { + Expect(env.Stop()).To(Succeed(), "Failed to stop environment") +}) + +var _ = BeforeEach(func() { + awsEnv.Reset() +}) + +var _ = Describe("Query", func() { + It("should use GroupNames when query specifies Name", func() { + q := &placementgroup.Query{Name: "my-placement-group"} + input := q.DescribePlacementGroupsInput() + Expect(input.GroupNames).To(ConsistOf("my-placement-group")) + Expect(input.GroupIds).To(BeEmpty()) + Expect(input.Filters).To(HaveLen(1)) + 
Expect(aws.ToString(input.Filters[0].Name)).To(Equal("state")) + Expect(input.Filters[0].Values).To(ConsistOf(string(ec2types.PlacementGroupStateAvailable))) + }) + It("should use GroupIds when query specifies ID", func() { + q := &placementgroup.Query{ID: "pg-0123456789abcdef0"} + input := q.DescribePlacementGroupsInput() + Expect(input.GroupIds).To(ConsistOf("pg-0123456789abcdef0")) + Expect(input.GroupNames).To(BeEmpty()) + Expect(input.Filters).To(HaveLen(1)) + Expect(aws.ToString(input.Filters[0].Name)).To(Equal("state")) + Expect(input.Filters[0].Values).To(ConsistOf(string(ec2types.PlacementGroupStateAvailable))) + }) + It("should use GroupNames even when name has pg- prefix", func() { + q := &placementgroup.Query{Name: "pg-mygroup"} + input := q.DescribePlacementGroupsInput() + Expect(input.GroupNames).To(ConsistOf("pg-mygroup")) + Expect(input.GroupIds).To(BeEmpty()) + }) + It("should produce consistent cache keys for the same query", func() { + q1 := &placementgroup.Query{Name: "my-pg"} + q2 := &placementgroup.Query{Name: "my-pg"} + Expect(q1.CacheKey()).To(Equal(q2.CacheKey())) + }) + It("should produce different cache keys for different queries", func() { + q1 := &placementgroup.Query{Name: "my-pg"} + q2 := &placementgroup.Query{ID: "pg-123"} + Expect(q1.CacheKey()).ToNot(Equal(q2.CacheKey())) + }) +}) + +var _ = Describe("Placement Group Provider", func() { + var clusterPG ec2types.PlacementGroup + + BeforeEach(func() { + clusterPG = ec2types.PlacementGroup{ + GroupId: lo.ToPtr("pg-cluster123"), + GroupName: lo.ToPtr("my-cluster-pg"), + State: ec2types.PlacementGroupStateAvailable, + Strategy: ec2types.PlacementStrategyCluster, + } + awsEnv.EC2API.DescribePlacementGroupsOutput.Set(&ec2.DescribePlacementGroupsOutput{ + PlacementGroups: []ec2types.PlacementGroup{clusterPG}, + }) + }) + + It("should return a placement group by name from the EC2 API", func() { + pg, err := awsEnv.PlacementGroupProvider.Get(ctx, v1.PlacementGroupSelectorTerm{Name: 
"my-cluster-pg"}) + Expect(err).ToNot(HaveOccurred()) + Expect(pg).ToNot(BeNil()) + Expect(aws.ToString(pg.GroupId)).To(Equal("pg-cluster123")) + Expect(aws.ToString(pg.GroupName)).To(Equal("my-cluster-pg")) + Expect(pg.Strategy).To(Equal(ec2types.PlacementStrategyCluster)) + }) + It("should return a placement group by ID from the EC2 API", func() { + pg, err := awsEnv.PlacementGroupProvider.Get(ctx, v1.PlacementGroupSelectorTerm{ID: "pg-cluster123"}) + Expect(err).ToNot(HaveOccurred()) + Expect(pg).ToNot(BeNil()) + Expect(aws.ToString(pg.GroupId)).To(Equal("pg-cluster123")) + }) + It("should return nil when placement group is not found", func() { + awsEnv.EC2API.DescribePlacementGroupsOutput.Set(&ec2.DescribePlacementGroupsOutput{ + PlacementGroups: []ec2types.PlacementGroup{}, + }) + pg, err := awsEnv.PlacementGroupProvider.Get(ctx, v1.PlacementGroupSelectorTerm{Name: "nonexistent"}) + Expect(err).ToNot(HaveOccurred()) + Expect(pg).To(BeNil()) + }) + It("should return nil when no matching placement group is found by name", func() { + pg, err := awsEnv.PlacementGroupProvider.Get(ctx, v1.PlacementGroupSelectorTerm{Name: "does-not-exist"}) + Expect(err).ToNot(HaveOccurred()) + Expect(pg).To(BeNil()) + }) + It("should return an error when EC2 API returns a non-not-found error", func() { + awsEnv.EC2API.NextError.Set(fmt.Errorf("InternalError: something went wrong")) + pg, err := awsEnv.PlacementGroupProvider.Get(ctx, v1.PlacementGroupSelectorTerm{Name: "my-cluster-pg"}) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("describing placement groups")) + Expect(pg).To(BeNil()) + }) + It("should cache results and return from cache on subsequent calls", func() { + pg1, err := awsEnv.PlacementGroupProvider.Get(ctx, v1.PlacementGroupSelectorTerm{Name: "my-cluster-pg"}) + Expect(err).ToNot(HaveOccurred()) + Expect(pg1).ToNot(BeNil()) + + awsEnv.EC2API.DescribePlacementGroupsOutput.Set(&ec2.DescribePlacementGroupsOutput{ + PlacementGroups: 
[]ec2types.PlacementGroup{}, + }) + + pg2, err := awsEnv.PlacementGroupProvider.Get(ctx, v1.PlacementGroupSelectorTerm{Name: "my-cluster-pg"}) + Expect(err).ToNot(HaveOccurred()) + Expect(pg2).ToNot(BeNil()) + Expect(aws.ToString(pg2.GroupId)).To(Equal("pg-cluster123")) + }) + It("should not return a cached entry for a different selector", func() { + pg1, err := awsEnv.PlacementGroupProvider.Get(ctx, v1.PlacementGroupSelectorTerm{Name: "my-cluster-pg"}) + Expect(err).ToNot(HaveOccurred()) + Expect(pg1).ToNot(BeNil()) + + pg2, err := awsEnv.PlacementGroupProvider.Get(ctx, v1.PlacementGroupSelectorTerm{Name: "other-pg"}) + Expect(err).ToNot(HaveOccurred()) + Expect(pg2).To(BeNil()) + }) + It("should return nil when the output has no placement groups", func() { + awsEnv.EC2API.DescribePlacementGroupsOutput.Set(&ec2.DescribePlacementGroupsOutput{ + PlacementGroups: []ec2types.PlacementGroup{}, + }) + pg, err := awsEnv.PlacementGroupProvider.Get(ctx, v1.PlacementGroupSelectorTerm{Name: "empty"}) + Expect(err).ToNot(HaveOccurred()) + Expect(pg).To(BeNil()) + }) + It("should return the first placement group when multiple are returned", func() { + awsEnv.EC2API.DescribePlacementGroupsOutput.Set(&ec2.DescribePlacementGroupsOutput{ + PlacementGroups: []ec2types.PlacementGroup{ + { + GroupId: lo.ToPtr("pg-first"), + GroupName: lo.ToPtr("first-pg"), + State: ec2types.PlacementGroupStateAvailable, + Strategy: ec2types.PlacementStrategyCluster, + }, + { + GroupId: lo.ToPtr("pg-second"), + GroupName: lo.ToPtr("second-pg"), + State: ec2types.PlacementGroupStateAvailable, + Strategy: ec2types.PlacementStrategySpread, + }, + }, + }) + pg, err := awsEnv.PlacementGroupProvider.Get(ctx, v1.PlacementGroupSelectorTerm{Name: "first-pg"}) + Expect(err).ToNot(HaveOccurred()) + Expect(pg).ToNot(BeNil()) + Expect(aws.ToString(pg.GroupId)).To(Equal("pg-first")) + }) +}) diff --git a/pkg/providers/placementgroup/types.go b/pkg/providers/placementgroup/types.go new file mode 100644 index 
000000000000..35fc23ec1c03 --- /dev/null +++ b/pkg/providers/placementgroup/types.go @@ -0,0 +1,52 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package placementgroup + +import ( + "fmt" + + "github.com/aws/aws-sdk-go-v2/service/ec2" + ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/mitchellh/hashstructure/v2" + "github.com/samber/lo" +) + +type Query struct { + ID string + Name string +} + +func (q *Query) CacheKey() string { + return fmt.Sprintf("%d", lo.Must(hashstructure.Hash(q, hashstructure.FormatV2, &hashstructure.HashOptions{ + SlicesAsSets: true, + }))) +} + +func (q *Query) DescribePlacementGroupsInput() *ec2.DescribePlacementGroupsInput { + input := &ec2.DescribePlacementGroupsInput{ + Filters: []ec2types.Filter{ + { + Name: lo.ToPtr("state"), + Values: []string{string(ec2types.PlacementGroupStateAvailable)}, + }, + }, + } + if q.ID != "" { + input.GroupIds = []string{q.ID} + } else if q.Name != "" { + input.GroupNames = []string{q.Name} + } + return input +} diff --git a/pkg/test/environment.go b/pkg/test/environment.go index 5ff5f8bab1bd..e26af5be982d 100644 --- a/pkg/test/environment.go +++ b/pkg/test/environment.go @@ -37,6 +37,7 @@ import ( "github.com/aws/karpenter-provider-aws/pkg/providers/instanceprofile" "github.com/aws/karpenter-provider-aws/pkg/providers/instancetype" "github.com/aws/karpenter-provider-aws/pkg/providers/launchtemplate" + "github.com/aws/karpenter-provider-aws/pkg/providers/placementgroup" 
"github.com/aws/karpenter-provider-aws/pkg/providers/pricing" "github.com/aws/karpenter-provider-aws/pkg/providers/securitygroup" ssmp "github.com/aws/karpenter-provider-aws/pkg/providers/ssm" @@ -85,12 +86,14 @@ type Environment struct { DiscoveredCapacityCache *cache.Cache CapacityReservationCache *cache.Cache CapacityReservationAvailabilityCache *cache.Cache + PlacementGroupCache *cache.Cache ValidationCache *cache.Cache RecreationCache *cache.Cache ProtectedProfilesCache *cache.Cache // Providers CapacityReservationProvider *capacityreservation.DefaultProvider + PlacementGroupProvider *placementgroup.DefaultProvider InstanceTypesResolver *instancetype.DefaultResolver InstanceTypesProvider *instancetype.DefaultProvider InstanceProvider *instance.DefaultProvider @@ -135,6 +138,7 @@ func NewEnvironment(ctx context.Context, env *coretest.Environment) *Environment ssmCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval) capacityReservationCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval) capacityReservationAvailabilityCache := cache.New(24*time.Hour, awscache.DefaultCleanupInterval) + placementGroupCache := cache.New(awscache.PlacementGroupTTL, awscache.DefaultCleanupInterval) validationCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval) recreationCache := cache.New(awscache.DefaultTTL, awscache.DefaultCleanupInterval) fakePricingAPI := &fake.PricingAPI{} @@ -155,6 +159,7 @@ func NewEnvironment(ctx context.Context, env *coretest.Environment) *Environment amiResolver := amifamily.NewDefaultResolver(fake.DefaultRegion) instanceTypesResolver := instancetype.NewDefaultResolver(fake.DefaultRegion) capacityReservationProvider := capacityreservation.NewProvider(ec2api, clock, capacityReservationCache, capacityReservationAvailabilityCache) + placementGroupProvider := placementgroup.NewProvider(ec2api, placementGroupCache) instanceTypesProvider := instancetype.NewDefaultProvider(instanceTypeCache, offeringCache, 
discoveredCapacityCache, ec2api, subnetProvider, pricingProvider, capacityReservationProvider, unavailableOfferingsCache, instanceTypesResolver) // Ensure we're able to hydrate instance types before starting any reliant controllers. // Instance type updates are hydrated asynchronously after this by controllers. @@ -218,11 +223,13 @@ func NewEnvironment(ctx context.Context, env *coretest.Environment) *Environment DiscoveredCapacityCache: discoveredCapacityCache, CapacityReservationCache: capacityReservationCache, CapacityReservationAvailabilityCache: capacityReservationAvailabilityCache, + PlacementGroupCache: placementGroupCache, ValidationCache: validationCache, RecreationCache: recreationCache, ProtectedProfilesCache: protectedProfilesCache, CapacityReservationProvider: capacityReservationProvider, + PlacementGroupProvider: placementGroupProvider, InstanceTypesResolver: instanceTypesResolver, InstanceTypesProvider: instanceTypesProvider, InstanceProvider: instanceProvider, @@ -262,6 +269,7 @@ func (env *Environment) Reset() { env.SSMCache.Flush() env.DiscoveredCapacityCache.Flush() env.CapacityReservationCache.Flush() + env.PlacementGroupCache.Flush() env.ValidationCache.Flush() env.RecreationCache.Flush() env.ProtectedProfilesCache.Flush() diff --git a/website/content/en/preview/concepts/nodeclasses.md b/website/content/en/preview/concepts/nodeclasses.md index b8bf25e19c19..e1d031564078 100644 --- a/website/content/en/preview/concepts/nodeclasses.md +++ b/website/content/en/preview/concepts/nodeclasses.md @@ -115,6 +115,11 @@ spec: - id: cr-123 - instanceMatchCriteria: open + # Optional, resolves the placement group; 'name' and 'id' are mutually exclusive, so set only one + placementGroupSelector: + name: my-pg + # id: pg-123 + # Optional, propagates tags to underlying EC2 resources tags: team: team-a @@ -213,6 +218,14 @@ status: reservationType: default state: active + # Placement groups resolved by the placementGroupSelector + placementGroups: + - id: pg-01234567890123456 + name: my-pg + partitionCount: 7 + state: available + strategy: partition 
+ # Generated instance profile name from "role" instanceProfile: "${CLUSTER_NAME}-0123456778901234567789" conditions: @@ -962,6 +976,39 @@ spec: key: foo ``` +## spec.placementGroupSelector + +Placement Group Selector allows you to select a [placement group](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html) for instances launched by this EC2NodeClass. Each EC2NodeClass maps to exactly one placement group — all instances launched from that EC2NodeClass are placed into the resolved placement group. + +Placement groups can be selected by either name or ID. Only one of `name` or `id` may be specified. + +Karpenter supports all three placement group strategies: +- **Cluster** — instances are placed in a single AZ on the same network segment for low-latency, high-throughput networking (e.g., EFA workloads) +- **Partition** — instances are distributed across isolated partitions (up to 7 per AZ) for hardware fault isolation. Applications can use `topologySpreadConstraints` with the `karpenter.k8s.aws/placement-group-partition` label to spread workloads across partitions. +- **Spread** — each instance is placed on distinct hardware (up to 7 instances per AZ per group) for maximum fault isolation + +{{% alert title="Note" color="primary" %}} +The IAM role Karpenter assumes must have permissions for the [ec2:DescribePlacementGroups](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribePlacementGroups.html) action to discover placement groups and the [ec2:RunInstances](https://docs.aws.amazon.com/service-authorization/latest/reference/list_amazonec2.html#amazonec2-RunInstances) / [ec2:CreateFleet](https://docs.aws.amazon.com/service-authorization/latest/reference/list_amazonec2.html#amazonec2-CreateFleet) actions to launch instances into the placement group. 
+{{% /alert %}} + +#### Examples + +Select the placement group with the given ID: + +```yaml +spec: + placementGroupSelector: + id: pg-123 +``` + +Select the placement group with the given name: + +```yaml +spec: + placementGroupSelector: + name: my-pg-a +``` + ## spec.tags Karpenter adds tags to all resources it creates, including EC2 Instances, EBS volumes, and Launch Templates. The default set of tags are listed below. diff --git a/website/content/en/preview/concepts/scheduling.md b/website/content/en/preview/concepts/scheduling.md index fa6285608d9e..a19b1852c7d4 100755 --- a/website/content/en/preview/concepts/scheduling.md +++ b/website/content/en/preview/concepts/scheduling.md @@ -184,6 +184,8 @@ Take care to ensure the label domains are correct. A well known label like `karp | karpenter.k8s.aws/instance-local-nvme | 900 | [AWS Specific] Number of gibibytes of local nvme storage on the instance | | karpenter.k8s.aws/instance-capability-flex | true | [AWS Specific] Instance with capacity flex | | karpenter.k8s.aws/instance-tenancy | default | [AWS Specific] Tenancy types include `default`, and `dedicated` | +| karpenter.k8s.aws/placement-group-id | pg-0fa32af67ed0f8da0 | [AWS Specific] The placement group ID. | +| karpenter.k8s.aws/placement-group-partition | 7 | [AWS Specific] The partition number of the partition placement group the instance is in. | 
| topology.k8s.aws/zone-id | use1-az1 | [AWS Specific] Globally consistent [zone id](https://docs.aws.amazon.com/global-infrastructure/latest/regions/az-ids.html) | diff --git a/website/content/en/preview/reference/cloudformation.md b/website/content/en/preview/reference/cloudformation.md index e1734f7cc1ac..000f0a871f73 100644 --- a/website/content/en/preview/reference/cloudformation.md +++ b/website/content/en/preview/reference/cloudformation.md @@ -113,7 +113,7 @@ NodeLifecyclePolicy: The AllowScopedEC2InstanceAccessActions statement ID (Sid) identifies a set of EC2 resources that are allowed to be accessed with [RunInstances](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_RunInstances.html) and [CreateFleet](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_CreateFleet.html) actions. -For `RunInstances` and `CreateFleet` actions, the Karpenter controller can read (but not create) `image`, `snapshot`, `security-group`, `subnet` and `capacity-reservation` EC2 resources, scoped for the particular AWS partition and region. +For `RunInstances` and `CreateFleet` actions, the Karpenter controller can read (but not create) `image`, `snapshot`, `security-group`, `subnet`, `capacity-reservation`, and `placement-group` EC2 resources, scoped for the particular AWS partition and region. 
```json { @@ -124,7 +124,8 @@ For `RunInstances` and `CreateFleet` actions, the Karpenter controller can read "arn:${AWS::Partition}:ec2:${AWS::Region}::snapshot/*", "arn:${AWS::Partition}:ec2:${AWS::Region}:*:security-group/*", "arn:${AWS::Partition}:ec2:${AWS::Region}:*:subnet/*", - "arn:${AWS::Partition}:ec2:${AWS::Region}:*:capacity-reservation/*" + "arn:${AWS::Partition}:ec2:${AWS::Region}:*:capacity-reservation/*", + "arn:${AWS::Partition}:ec2:${AWS::Region}:*:placement-group/*" ], "Action": [ "ec2:RunInstances", @@ -496,7 +497,7 @@ ResourceDiscoveryPolicy: #### AllowRegionalReadActions -The AllowRegionalReadActions Sid allows [DescribeAvailabilityZones](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeAvailabilityZones.html), [DescribeImages](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeImages.html), [DescribeInstances](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeInstances.html), [DescribeInstanceTypeOfferings](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeInstanceTypeOfferings.html), [DescribeInstanceTypes](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeInstanceTypes.html), [DescribeLaunchTemplates](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeLaunchTemplates.html), [DescribeSecurityGroups](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeSecurityGroups.html), [DescribeSpotPriceHistory](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeSpotPriceHistory.html), and [DescribeSubnets](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeSubnets.html) actions for the current AWS region. 
+The AllowRegionalReadActions Sid allows [DescribeAvailabilityZones](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeAvailabilityZones.html), [DescribeCapacityReservations](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeCapacityReservations.html), [DescribeImages](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeImages.html), [DescribeInstances](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeInstances.html), [DescribeInstanceTypeOfferings](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeInstanceTypeOfferings.html), [DescribeInstanceTypes](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeInstanceTypes.html), [DescribeLaunchTemplates](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeLaunchTemplates.html), [DescribePlacementGroups](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribePlacementGroups.html), [DescribeSecurityGroups](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeSecurityGroups.html), [DescribeSpotPriceHistory](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeSpotPriceHistory.html), and [DescribeSubnets](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeSubnets.html) actions for the current AWS region. This allows the Karpenter controller to do any of those read-only actions across all related resources for that AWS region. ```json @@ -511,6 +512,7 @@ This allows the Karpenter controller to do any of those read-only actions across "ec2:DescribeInstanceTypeOfferings", "ec2:DescribeInstanceTypes", "ec2:DescribeLaunchTemplates", + "ec2:DescribePlacementGroups", "ec2:DescribeSecurityGroups", "ec2:DescribeSpotPriceHistory", "ec2:DescribeSubnets"