Service update failure thresholds and rollback

This adds support for two enhancements to swarm service rolling updates:

- Failure thresholds: In Docker 1.12, a service update could be set up
  to either pause or continue after a single failure occurs. This adds
  an --update-max-failure-ratio flag that controls how many tasks need to
  fail to update for the update as a whole to be considered a failure. A
  counterpart flag, --update-monitor, controls how long to monitor each
  task for a failure after starting it during the update.

- Rollback flag: service update --rollback reverts the service to its
  previous version. If a service update encounters task failures, or
  fails to function properly for some other reason, the user can roll back
  the update.

SwarmKit also has the ability to roll back updates automatically after
hitting the failure thresholds, but we've decided not to expose this in
the Docker API/CLI for now, favoring a workflow where the decision to
roll back is always made by an admin. Depending on user feedback, we may
add a "rollback" option to --update-failure-action in the future.

Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>
This commit is contained in:
Aaron Lehmann 2016-09-02 14:12:05 -07:00
parent 879b4d1fba
commit 06ebd4517d
3 changed files with 103 additions and 53 deletions

View File

@ -41,10 +41,14 @@ Placement:
{{- if .HasUpdateConfig }}
UpdateConfig:
Parallelism: {{ .UpdateParallelism }}
{{- if .HasUpdateDelay -}}
{{- if .HasUpdateDelay}}
Delay: {{ .UpdateDelay }}
{{- end }}
On failure: {{ .UpdateOnFailure }}
{{- if .HasUpdateMonitor}}
Monitoring Period: {{ .UpdateMonitor }}
{{- end }}
Max failure ratio: {{ .UpdateMaxFailureRatio }}
{{- end }}
ContainerSpec:
Image: {{ .ContainerImage }}
@ -218,6 +222,18 @@ func (ctx *serviceInspectContext) UpdateOnFailure() string {
return ctx.Service.Spec.UpdateConfig.FailureAction
}
func (ctx *serviceInspectContext) HasUpdateMonitor() bool {
return ctx.Service.Spec.UpdateConfig.Monitor.Nanoseconds() > 0
}
func (ctx *serviceInspectContext) UpdateMonitor() time.Duration {
return ctx.Service.Spec.UpdateConfig.Monitor
}
func (ctx *serviceInspectContext) UpdateMaxFailureRatio() float32 {
return ctx.Service.Spec.UpdateConfig.MaxFailureRatio
}
func (ctx *serviceInspectContext) ContainerImage() string {
return ctx.Service.Spec.TaskTemplate.ContainerSpec.Image
}

View File

@ -267,9 +267,11 @@ func (m *MountOpt) Value() []mounttypes.Mount {
}
type updateOptions struct {
parallelism uint64
delay time.Duration
onFailure string
parallelism uint64
delay time.Duration
monitor time.Duration
onFailure string
maxFailureRatio float32
}
type resourceOptions struct {
@ -458,9 +460,11 @@ func (opts *serviceOptions) ToService() (swarm.ServiceSpec, error) {
Networks: convertNetworks(opts.networks),
Mode: swarm.ServiceMode{},
UpdateConfig: &swarm.UpdateConfig{
Parallelism: opts.update.parallelism,
Delay: opts.update.delay,
FailureAction: opts.update.onFailure,
Parallelism: opts.update.parallelism,
Delay: opts.update.delay,
Monitor: opts.update.monitor,
FailureAction: opts.update.onFailure,
MaxFailureRatio: opts.update.maxFailureRatio,
},
EndpointSpec: opts.endpoint.ToEndpointSpec(),
}
@ -507,7 +511,9 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) {
flags.Uint64Var(&opts.update.parallelism, flagUpdateParallelism, 1, "Maximum number of tasks updated simultaneously (0 to update all at once)")
flags.DurationVar(&opts.update.delay, flagUpdateDelay, time.Duration(0), "Delay between updates")
flags.DurationVar(&opts.update.monitor, flagUpdateMonitor, time.Duration(0), "Duration after each task update to monitor for failure")
flags.StringVar(&opts.update.onFailure, flagUpdateFailureAction, "pause", "Action on update failure (pause|continue)")
flags.Float32Var(&opts.update.maxFailureRatio, flagUpdateMaxFailureRatio, 0, "Failure rate to tolerate during an update")
flags.StringVar(&opts.endpoint.mode, flagEndpointMode, "", "Endpoint mode (vip or dnsrr)")
@ -518,46 +524,48 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) {
}
const (
flagConstraint = "constraint"
flagConstraintRemove = "constraint-rm"
flagConstraintAdd = "constraint-add"
flagContainerLabel = "container-label"
flagContainerLabelRemove = "container-label-rm"
flagContainerLabelAdd = "container-label-add"
flagEndpointMode = "endpoint-mode"
flagEnv = "env"
flagEnvRemove = "env-rm"
flagEnvAdd = "env-add"
flagGroupAdd = "group-add"
flagGroupRemove = "group-rm"
flagLabel = "label"
flagLabelRemove = "label-rm"
flagLabelAdd = "label-add"
flagLimitCPU = "limit-cpu"
flagLimitMemory = "limit-memory"
flagMode = "mode"
flagMount = "mount"
flagMountRemove = "mount-rm"
flagMountAdd = "mount-add"
flagName = "name"
flagNetwork = "network"
flagPublish = "publish"
flagPublishRemove = "publish-rm"
flagPublishAdd = "publish-add"
flagReplicas = "replicas"
flagReserveCPU = "reserve-cpu"
flagReserveMemory = "reserve-memory"
flagRestartCondition = "restart-condition"
flagRestartDelay = "restart-delay"
flagRestartMaxAttempts = "restart-max-attempts"
flagRestartWindow = "restart-window"
flagStopGracePeriod = "stop-grace-period"
flagUpdateDelay = "update-delay"
flagUpdateFailureAction = "update-failure-action"
flagUpdateParallelism = "update-parallelism"
flagUser = "user"
flagWorkdir = "workdir"
flagRegistryAuth = "with-registry-auth"
flagLogDriver = "log-driver"
flagLogOpt = "log-opt"
flagConstraint = "constraint"
flagConstraintRemove = "constraint-rm"
flagConstraintAdd = "constraint-add"
flagContainerLabel = "container-label"
flagContainerLabelRemove = "container-label-rm"
flagContainerLabelAdd = "container-label-add"
flagEndpointMode = "endpoint-mode"
flagEnv = "env"
flagEnvRemove = "env-rm"
flagEnvAdd = "env-add"
flagGroupAdd = "group-add"
flagGroupRemove = "group-rm"
flagLabel = "label"
flagLabelRemove = "label-rm"
flagLabelAdd = "label-add"
flagLimitCPU = "limit-cpu"
flagLimitMemory = "limit-memory"
flagMode = "mode"
flagMount = "mount"
flagMountRemove = "mount-rm"
flagMountAdd = "mount-add"
flagName = "name"
flagNetwork = "network"
flagPublish = "publish"
flagPublishRemove = "publish-rm"
flagPublishAdd = "publish-add"
flagReplicas = "replicas"
flagReserveCPU = "reserve-cpu"
flagReserveMemory = "reserve-memory"
flagRestartCondition = "restart-condition"
flagRestartDelay = "restart-delay"
flagRestartMaxAttempts = "restart-max-attempts"
flagRestartWindow = "restart-window"
flagStopGracePeriod = "stop-grace-period"
flagUpdateDelay = "update-delay"
flagUpdateFailureAction = "update-failure-action"
flagUpdateMaxFailureRatio = "update-max-failure-ratio"
flagUpdateMonitor = "update-monitor"
flagUpdateParallelism = "update-parallelism"
flagUser = "user"
flagWorkdir = "workdir"
flagRegistryAuth = "with-registry-auth"
flagLogDriver = "log-driver"
flagLogOpt = "log-opt"
)

View File

@ -36,6 +36,7 @@ func newUpdateCommand(dockerCli *command.DockerCli) *cobra.Command {
flags := cmd.Flags()
flags.String("image", "", "Service image tag")
flags.String("args", "", "Service command args")
flags.Bool("rollback", false, "Rollback to previous specification")
addServiceFlags(cmd, opts)
flags.Var(newListOptsVar(), flagEnvRemove, "Remove an environment variable")
@ -68,7 +69,20 @@ func runUpdate(dockerCli *command.DockerCli, flags *pflag.FlagSet, serviceID str
return err
}
err = updateService(flags, &service.Spec)
rollback, err := flags.GetBool("rollback")
if err != nil {
return err
}
spec := &service.Spec
if rollback {
spec = service.PreviousSpec
if spec == nil {
return fmt.Errorf("service does not have a previous specification to roll back to")
}
}
err = updateService(flags, spec)
if err != nil {
return err
}
@ -81,15 +95,19 @@ func runUpdate(dockerCli *command.DockerCli, flags *pflag.FlagSet, serviceID str
if sendAuth {
// Retrieve encoded auth token from the image reference
// This would be the old image if it didn't change in this update
image := service.Spec.TaskTemplate.ContainerSpec.Image
image := spec.TaskTemplate.ContainerSpec.Image
encodedAuth, err := command.RetrieveAuthTokenFromImage(ctx, dockerCli, image)
if err != nil {
return err
}
updateOpts.EncodedRegistryAuth = encodedAuth
} else if rollback {
updateOpts.RegistryAuthFrom = types.RegistryAuthFromPreviousSpec
} else {
updateOpts.RegistryAuthFrom = types.RegistryAuthFromSpec
}
err = apiClient.ServiceUpdate(ctx, service.ID, service.Version, service.Spec, updateOpts)
err = apiClient.ServiceUpdate(ctx, service.ID, service.Version, *spec, updateOpts)
if err != nil {
return err
}
@ -111,6 +129,12 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error {
}
}
updateFloat32 := func(flag string, field *float32) {
if flags.Changed(flag) {
*field, _ = flags.GetFloat32(flag)
}
}
updateDuration := func(flag string, field *time.Duration) {
if flags.Changed(flag) {
*field, _ = flags.GetDuration(flag)
@ -195,13 +219,15 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error {
return err
}
if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateFailureAction) {
if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateMonitor, flagUpdateFailureAction, flagUpdateMaxFailureRatio) {
if spec.UpdateConfig == nil {
spec.UpdateConfig = &swarm.UpdateConfig{}
}
updateUint64(flagUpdateParallelism, &spec.UpdateConfig.Parallelism)
updateDuration(flagUpdateDelay, &spec.UpdateConfig.Delay)
updateDuration(flagUpdateMonitor, &spec.UpdateConfig.Monitor)
updateString(flagUpdateFailureAction, &spec.UpdateConfig.FailureAction)
updateFloat32(flagUpdateMaxFailureRatio, &spec.UpdateConfig.MaxFailureRatio)
}
if flags.Changed(flagEndpointMode) {