From 06ebd4517db1bdd4d4ca699d276a67f4de1071c3 Mon Sep 17 00:00:00 2001 From: Aaron Lehmann Date: Fri, 2 Sep 2016 14:12:05 -0700 Subject: [PATCH] Service update failure thresholds and rollback This adds support for two enhancements to swarm service rolling updates: - Failure thresholds: In Docker 1.12, a service update could be set up to either pause or continue after a single failure occurs. This adds an --update-max-failure-ratio flag that controls what fraction of tasks must fail to update before the update as a whole is considered a failure. A counterpart flag, --update-monitor, controls how long to monitor each task for a failure after starting it during the update. - Rollback flag: service update --rollback reverts the service to its previous version. If a service update encounters task failures, or fails to function properly for some other reason, the user can roll back the update. SwarmKit also has the ability to roll back updates automatically after hitting the failure thresholds, but we've decided not to expose this in the Docker API/CLI for now, favoring a workflow where the decision to roll back is always made by an admin. Depending on user feedback, we may add a "rollback" option to --update-failure-action in the future. 
Signed-off-by: Aaron Lehmann --- command/formatter/service.go | 18 +++++- command/service/opts.go | 104 +++++++++++++++++++---------------- command/service/update.go | 34 ++++++++++-- 3 files changed, 103 insertions(+), 53 deletions(-) diff --git a/command/formatter/service.go b/command/formatter/service.go index 71ee4d656a..1549047b72 100644 --- a/command/formatter/service.go +++ b/command/formatter/service.go @@ -41,10 +41,14 @@ Placement: {{- if .HasUpdateConfig }} UpdateConfig: Parallelism: {{ .UpdateParallelism }} -{{- if .HasUpdateDelay -}} +{{- if .HasUpdateDelay}} Delay: {{ .UpdateDelay }} {{- end }} On failure: {{ .UpdateOnFailure }} +{{- if .HasUpdateMonitor}} + Monitoring Period: {{ .UpdateMonitor }} +{{- end }} + Max failure ratio: {{ .UpdateMaxFailureRatio }} {{- end }} ContainerSpec: Image: {{ .ContainerImage }} @@ -218,6 +222,18 @@ func (ctx *serviceInspectContext) UpdateOnFailure() string { return ctx.Service.Spec.UpdateConfig.FailureAction } +func (ctx *serviceInspectContext) HasUpdateMonitor() bool { + return ctx.Service.Spec.UpdateConfig.Monitor.Nanoseconds() > 0 +} + +func (ctx *serviceInspectContext) UpdateMonitor() time.Duration { + return ctx.Service.Spec.UpdateConfig.Monitor +} + +func (ctx *serviceInspectContext) UpdateMaxFailureRatio() float32 { + return ctx.Service.Spec.UpdateConfig.MaxFailureRatio +} + func (ctx *serviceInspectContext) ContainerImage() string { return ctx.Service.Spec.TaskTemplate.ContainerSpec.Image } diff --git a/command/service/opts.go b/command/service/opts.go index 1e966f90c6..cf25b78273 100644 --- a/command/service/opts.go +++ b/command/service/opts.go @@ -267,9 +267,11 @@ func (m *MountOpt) Value() []mounttypes.Mount { } type updateOptions struct { - parallelism uint64 - delay time.Duration - onFailure string + parallelism uint64 + delay time.Duration + monitor time.Duration + onFailure string + maxFailureRatio float32 } type resourceOptions struct { @@ -458,9 +460,11 @@ func (opts *serviceOptions) ToService() 
(swarm.ServiceSpec, error) { Networks: convertNetworks(opts.networks), Mode: swarm.ServiceMode{}, UpdateConfig: &swarm.UpdateConfig{ - Parallelism: opts.update.parallelism, - Delay: opts.update.delay, - FailureAction: opts.update.onFailure, + Parallelism: opts.update.parallelism, + Delay: opts.update.delay, + Monitor: opts.update.monitor, + FailureAction: opts.update.onFailure, + MaxFailureRatio: opts.update.maxFailureRatio, }, EndpointSpec: opts.endpoint.ToEndpointSpec(), } @@ -507,7 +511,9 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) { flags.Uint64Var(&opts.update.parallelism, flagUpdateParallelism, 1, "Maximum number of tasks updated simultaneously (0 to update all at once)") flags.DurationVar(&opts.update.delay, flagUpdateDelay, time.Duration(0), "Delay between updates") + flags.DurationVar(&opts.update.monitor, flagUpdateMonitor, time.Duration(0), "Duration after each task update to monitor for failure") flags.StringVar(&opts.update.onFailure, flagUpdateFailureAction, "pause", "Action on update failure (pause|continue)") + flags.Float32Var(&opts.update.maxFailureRatio, flagUpdateMaxFailureRatio, 0, "Failure rate to tolerate during an update") flags.StringVar(&opts.endpoint.mode, flagEndpointMode, "", "Endpoint mode (vip or dnsrr)") @@ -518,46 +524,48 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) { } const ( - flagConstraint = "constraint" - flagConstraintRemove = "constraint-rm" - flagConstraintAdd = "constraint-add" - flagContainerLabel = "container-label" - flagContainerLabelRemove = "container-label-rm" - flagContainerLabelAdd = "container-label-add" - flagEndpointMode = "endpoint-mode" - flagEnv = "env" - flagEnvRemove = "env-rm" - flagEnvAdd = "env-add" - flagGroupAdd = "group-add" - flagGroupRemove = "group-rm" - flagLabel = "label" - flagLabelRemove = "label-rm" - flagLabelAdd = "label-add" - flagLimitCPU = "limit-cpu" - flagLimitMemory = "limit-memory" - flagMode = "mode" - flagMount = "mount" - flagMountRemove 
= "mount-rm" - flagMountAdd = "mount-add" - flagName = "name" - flagNetwork = "network" - flagPublish = "publish" - flagPublishRemove = "publish-rm" - flagPublishAdd = "publish-add" - flagReplicas = "replicas" - flagReserveCPU = "reserve-cpu" - flagReserveMemory = "reserve-memory" - flagRestartCondition = "restart-condition" - flagRestartDelay = "restart-delay" - flagRestartMaxAttempts = "restart-max-attempts" - flagRestartWindow = "restart-window" - flagStopGracePeriod = "stop-grace-period" - flagUpdateDelay = "update-delay" - flagUpdateFailureAction = "update-failure-action" - flagUpdateParallelism = "update-parallelism" - flagUser = "user" - flagWorkdir = "workdir" - flagRegistryAuth = "with-registry-auth" - flagLogDriver = "log-driver" - flagLogOpt = "log-opt" + flagConstraint = "constraint" + flagConstraintRemove = "constraint-rm" + flagConstraintAdd = "constraint-add" + flagContainerLabel = "container-label" + flagContainerLabelRemove = "container-label-rm" + flagContainerLabelAdd = "container-label-add" + flagEndpointMode = "endpoint-mode" + flagEnv = "env" + flagEnvRemove = "env-rm" + flagEnvAdd = "env-add" + flagGroupAdd = "group-add" + flagGroupRemove = "group-rm" + flagLabel = "label" + flagLabelRemove = "label-rm" + flagLabelAdd = "label-add" + flagLimitCPU = "limit-cpu" + flagLimitMemory = "limit-memory" + flagMode = "mode" + flagMount = "mount" + flagMountRemove = "mount-rm" + flagMountAdd = "mount-add" + flagName = "name" + flagNetwork = "network" + flagPublish = "publish" + flagPublishRemove = "publish-rm" + flagPublishAdd = "publish-add" + flagReplicas = "replicas" + flagReserveCPU = "reserve-cpu" + flagReserveMemory = "reserve-memory" + flagRestartCondition = "restart-condition" + flagRestartDelay = "restart-delay" + flagRestartMaxAttempts = "restart-max-attempts" + flagRestartWindow = "restart-window" + flagStopGracePeriod = "stop-grace-period" + flagUpdateDelay = "update-delay" + flagUpdateFailureAction = "update-failure-action" + 
flagUpdateMaxFailureRatio = "update-max-failure-ratio" + flagUpdateMonitor = "update-monitor" + flagUpdateParallelism = "update-parallelism" + flagUser = "user" + flagWorkdir = "workdir" + flagRegistryAuth = "with-registry-auth" + flagLogDriver = "log-driver" + flagLogOpt = "log-opt" ) diff --git a/command/service/update.go b/command/service/update.go index be3218ed60..797c989271 100644 --- a/command/service/update.go +++ b/command/service/update.go @@ -36,6 +36,7 @@ func newUpdateCommand(dockerCli *command.DockerCli) *cobra.Command { flags := cmd.Flags() flags.String("image", "", "Service image tag") flags.String("args", "", "Service command args") + flags.Bool("rollback", false, "Rollback to previous specification") addServiceFlags(cmd, opts) flags.Var(newListOptsVar(), flagEnvRemove, "Remove an environment variable") @@ -68,7 +69,20 @@ func runUpdate(dockerCli *command.DockerCli, flags *pflag.FlagSet, serviceID str return err } - err = updateService(flags, &service.Spec) + rollback, err := flags.GetBool("rollback") + if err != nil { + return err + } + + spec := &service.Spec + if rollback { + spec = service.PreviousSpec + if spec == nil { + return fmt.Errorf("service does not have a previous specification to roll back to") + } + } + + err = updateService(flags, spec) if err != nil { return err } @@ -81,15 +95,19 @@ func runUpdate(dockerCli *command.DockerCli, flags *pflag.FlagSet, serviceID str if sendAuth { // Retrieve encoded auth token from the image reference // This would be the old image if it didn't change in this update - image := service.Spec.TaskTemplate.ContainerSpec.Image + image := spec.TaskTemplate.ContainerSpec.Image encodedAuth, err := command.RetrieveAuthTokenFromImage(ctx, dockerCli, image) if err != nil { return err } updateOpts.EncodedRegistryAuth = encodedAuth + } else if rollback { + updateOpts.RegistryAuthFrom = types.RegistryAuthFromPreviousSpec + } else { + updateOpts.RegistryAuthFrom = types.RegistryAuthFromSpec } - err = 
apiClient.ServiceUpdate(ctx, service.ID, service.Version, service.Spec, updateOpts) + err = apiClient.ServiceUpdate(ctx, service.ID, service.Version, *spec, updateOpts) if err != nil { return err } @@ -111,6 +129,12 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error { } } + updateFloat32 := func(flag string, field *float32) { + if flags.Changed(flag) { + *field, _ = flags.GetFloat32(flag) + } + } + updateDuration := func(flag string, field *time.Duration) { if flags.Changed(flag) { *field, _ = flags.GetDuration(flag) @@ -195,13 +219,15 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error { return err } - if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateFailureAction) { + if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateMonitor, flagUpdateFailureAction, flagUpdateMaxFailureRatio) { if spec.UpdateConfig == nil { spec.UpdateConfig = &swarm.UpdateConfig{} } updateUint64(flagUpdateParallelism, &spec.UpdateConfig.Parallelism) updateDuration(flagUpdateDelay, &spec.UpdateConfig.Delay) + updateDuration(flagUpdateMonitor, &spec.UpdateConfig.Monitor) updateString(flagUpdateFailureAction, &spec.UpdateConfig.FailureAction) + updateFloat32(flagUpdateMaxFailureRatio, &spec.UpdateConfig.MaxFailureRatio) } if flags.Changed(flagEndpointMode) {