From 8c03c1201bfb389aea3b2c59d21e1c87c4562b13 Mon Sep 17 00:00:00 2001 From: Aaron Lehmann Date: Fri, 2 Sep 2016 14:12:05 -0700 Subject: [PATCH] Service update failure thresholds and rollback This adds support for two enhancements to swarm service rolling updates: - Failure thresholds: In Docker 1.12, a service update could be set up to either pause or continue after a single failure occurs. This adds an --update-max-failure-ratio flag that controls how many tasks need to fail to update for the update as a whole to be considered a failure. A counterpart flag, --update-monitor, controls how long to monitor each task for a failure after starting it during the update. - Rollback flag: service update --rollback reverts the service to its previous version. If a service update encounters task failures, or fails to function properly for some other reason, the user can roll back the update. SwarmKit also has the ability to roll back updates automatically after hitting the failure thresholds, but we've decided not to expose this in the Docker API/CLI for now, favoring a workflow where the decision to roll back is always made by an admin. Depending on user feedback, we may add a "rollback" option to --update-failure-action in the future. 
Signed-off-by: Aaron Lehmann --- contrib/completion/bash/docker | 3 + contrib/completion/zsh/_docker | 3 + docs/reference/commandline/service_create.md | 62 ++++++++-------- docs/reference/commandline/service_update.md | 77 ++++++++++---------- 4 files changed, 78 insertions(+), 67 deletions(-) diff --git a/contrib/completion/bash/docker b/contrib/completion/bash/docker index cb24bd0841..43197c7d5f 100644 --- a/contrib/completion/bash/docker +++ b/contrib/completion/bash/docker @@ -1809,9 +1809,12 @@ _docker_service_update() { --restart-delay --restart-max-attempts --restart-window + --rollback --stop-grace-period --update-delay --update-failure-action + --update-max-failure-ratio + --update-monitor --update-parallelism --user -u --workdir -w diff --git a/contrib/completion/zsh/_docker b/contrib/completion/zsh/_docker index d9246105b9..cb73073905 100644 --- a/contrib/completion/zsh/_docker +++ b/contrib/completion/zsh/_docker @@ -1108,6 +1108,8 @@ __docker_service_subcommand() { "($help)--stop-grace-period=[Time to wait before force killing a container]:grace period: " "($help)--update-delay=[Delay between updates]:delay: " "($help)--update-failure-action=[Action on update failure]:mode:(pause continue)" + "($help)--update-max-failure-ratio=[Failure rate to tolerate during an update]:fraction: " + "($help)--update-monitor=[Duration after each task update to monitor for failure]:window: " "($help)--update-parallelism=[Maximum number of tasks updated simultaneously]:number: " "($help -u --user)"{-u=,--user=}"[Username or UID]:user:_users" "($help)--with-registry-auth[Send registry authentication details to swarm agents]" @@ -1185,6 +1187,7 @@ __docker_service_subcommand() { "($help)*--container-label-rm=[Remove a container label by its key]:label: " \ "($help)*--group-rm=[Remove previously added user groups from the container]:group:_groups" \ "($help)--image=[Service image tag]:image:__docker_repositories" \ + "($help)--rollback[Rollback to previous specification]" 
\ "($help -)1:service:__docker_complete_services" && ret=0 ;; (help) diff --git a/docs/reference/commandline/service_create.md b/docs/reference/commandline/service_create.md index f4d0815070..93ffb0e9a9 100644 --- a/docs/reference/commandline/service_create.md +++ b/docs/reference/commandline/service_create.md @@ -12,36 +12,38 @@ Usage: docker service create [OPTIONS] IMAGE [COMMAND] [ARG...] Create a new service Options: - --constraint value Placement constraints (default []) - --container-label value Service container labels (default []) - --endpoint-mode string Endpoint mode (vip or dnsrr) - -e, --env value Set environment variables (default []) - --group-add value Add additional user groups to the container (default []) - --help Print usage - -l, --label value Service labels (default []) - --limit-cpu value Limit CPUs (default 0.000) - --limit-memory value Limit Memory (default 0 B) - --log-driver string Logging driver for service - --log-opt value Logging driver options (default []) - --mode string Service mode (replicated or global) (default "replicated") - --mount value Attach a mount to the service - --name string Service name - --network value Network attachments (default []) - -p, --publish value Publish a port as a node port (default []) - --replicas value Number of tasks (default none) - --reserve-cpu value Reserve CPUs (default 0.000) - --reserve-memory value Reserve Memory (default 0 B) - --restart-condition string Restart when condition is met (none, on-failure, or any) - --restart-delay value Delay between restart attempts (default none) - --restart-max-attempts value Maximum number of restarts before giving up (default none) - --restart-window value Window used to evaluate the restart policy (default none) - --stop-grace-period value Time to wait before force killing a container (default none) - --update-delay duration Delay between updates - --update-failure-action string Action on update failure (pause|continue) (default "pause") - 
--update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1) - -u, --user string Username or UID (format: <name|uid>[:<group|gid>]) - --with-registry-auth Send registry authentication details to Swarm agents - -w, --workdir string Working directory inside the container + --constraint value Placement constraints (default []) + --container-label value Service container labels (default []) + --endpoint-mode string Endpoint mode (vip or dnsrr) + -e, --env value Set environment variables (default []) + --group-add value Add additional user groups to the container (default []) + --help Print usage + -l, --label value Service labels (default []) + --limit-cpu value Limit CPUs (default 0.000) + --limit-memory value Limit Memory (default 0 B) + --log-driver string Logging driver for service + --log-opt value Logging driver options (default []) + --mode string Service mode (replicated or global) (default "replicated") + --mount value Attach a mount to the service + --name string Service name + --network value Network attachments (default []) + -p, --publish value Publish a port as a node port (default []) + --replicas value Number of tasks (default none) + --reserve-cpu value Reserve CPUs (default 0.000) + --reserve-memory value Reserve Memory (default 0 B) + --restart-condition string Restart when condition is met (none, on-failure, or any) + --restart-delay value Delay between restart attempts (default none) + --restart-max-attempts value Maximum number of restarts before giving up (default none) + --restart-window value Window used to evaluate the restart policy (default none) + --stop-grace-period value Time to wait before force killing a container (default none) + --update-delay duration Delay between updates + --update-failure-action string Action on update failure (pause|continue) (default "pause") + --update-max-failure-ratio value Failure rate to tolerate during an update + --update-monitor duration Duration after each task update to 
monitor for failure (default 0s) + --update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1) + -u, --user string Username or UID (format: <name|uid>[:<group|gid>]) + --with-registry-auth Send registry authentication details to Swarm agents + -w, --workdir string Working directory inside the container ``` Creates a service as described by the specified parameters. You must run this diff --git a/docs/reference/commandline/service_update.md b/docs/reference/commandline/service_update.md index f1698c3e01..d70a656837 100644 --- a/docs/reference/commandline/service_update.md +++ b/docs/reference/commandline/service_update.md @@ -12,43 +12,46 @@ Usage: docker service update [OPTIONS] SERVICE Update a service Options: - --args string Service command args - --constraint-add value Add or update placement constraints (default []) - --constraint-rm value Remove a constraint (default []) - --container-label-add value Add or update container labels (default []) - --container-label-rm value Remove a container label by its key (default []) - --endpoint-mode string Endpoint mode (vip or dnsrr) - --env-add value Add or update environment variables (default []) - --env-rm value Remove an environment variable (default []) - --group-add value Add additional user groups to the container (default []) - --group-rm value Remove previously added user groups from the container (default []) - --help Print usage - --image string Service image tag - --label-add value Add or update service labels (default []) - --label-rm value Remove a label by its key (default []) - --limit-cpu value Limit CPUs (default 0.000) - --limit-memory value Limit Memory (default 0 B) - --log-driver string Logging driver for service - --log-opt value Logging driver options (default []) - --mount-add value Add or update a mount on a service - --mount-rm value Remove a mount by its target path (default []) - --name string Service name - --publish-add value Add or update a published port 
(default []) - --publish-rm value Remove a published port by its target port (default []) - --replicas value Number of tasks (default none) - --reserve-cpu value Reserve CPUs (default 0.000) - --reserve-memory value Reserve Memory (default 0 B) - --restart-condition string Restart when condition is met (none, on-failure, or any) - --restart-delay value Delay between restart attempts (default none) - --restart-max-attempts value Maximum number of restarts before giving up (default none) - --restart-window value Window used to evaluate the restart policy (default none) - --stop-grace-period value Time to wait before force killing a container (default none) - --update-delay duration Delay between updates - --update-failure-action string Action on update failure (pause|continue) (default "pause") - --update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1) - -u, --user string Username or UID (format: <name|uid>[:<group|gid>]) - --with-registry-auth Send registry authentication details to Swarm agents - -w, --workdir string Working directory inside the container + --args string Service command args + --constraint-add value Add or update placement constraints (default []) + --constraint-rm value Remove a constraint (default []) + --container-label-add value Add or update container labels (default []) + --container-label-rm value Remove a container label by its key (default []) + --endpoint-mode string Endpoint mode (vip or dnsrr) + --env-add value Add or update environment variables (default []) + --env-rm value Remove an environment variable (default []) + --group-add value Add additional user groups to the container (default []) + --group-rm value Remove previously added user groups from the container (default []) + --help Print usage + --image string Service image tag + --label-add value Add or update service labels (default []) + --label-rm value Remove a label by its key (default []) + --limit-cpu value Limit CPUs (default 0.000) + 
--limit-memory value Limit Memory (default 0 B) + --log-driver string Logging driver for service + --log-opt value Logging driver options (default []) + --mount-add value Add or update a mount on a service + --mount-rm value Remove a mount by its target path (default []) + --name string Service name + --publish-add value Add or update a published port (default []) + --publish-rm value Remove a published port by its target port (default []) + --replicas value Number of tasks (default none) + --reserve-cpu value Reserve CPUs (default 0.000) + --reserve-memory value Reserve Memory (default 0 B) + --restart-condition string Restart when condition is met (none, on-failure, or any) + --restart-delay value Delay between restart attempts (default none) + --restart-max-attempts value Maximum number of restarts before giving up (default none) + --restart-window value Window used to evaluate the restart policy (default none) + --rollback Rollback to previous specification + --stop-grace-period value Time to wait before force killing a container (default none) + --update-delay duration Delay between updates + --update-failure-action string Action on update failure (pause|continue) (default "pause") + --update-max-failure-ratio value Failure rate to tolerate during an update + --update-monitor duration Duration after each task update to monitor for failure (default 0s) + --update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1) + -u, --user string Username or UID (format: <name|uid>[:<group|gid>]) + --with-registry-auth Send registry authentication details to Swarm agents + -w, --workdir string Working directory inside the container ``` Updates a service as described by the specified parameters. This command has to be run targeting a manager node.