Add a configurable escalation behaviour (#162)
* Add a configurable escalation behaviour

# Motivation
The service group often runs multiple services and orchestrates their shutdown. We have seen that some services never shut down, nor do they respond properly to task cancellation. This becomes quite problematic when the whole application is waiting for a service to cancel: it otherwise appears to be healthy, but in reality it can't serve any traffic.

# Modification
This PR adds new configuration options to escalate both graceful shutdown and task cancellation. The escalation order is graceful shutdown -> task cancellation -> `fatalError`. The `fatalError` acts as a last resort to make sure applications never get stuck.
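
For illustration, a minimal sketch of how an application might opt into this. The `ServiceGroupConfiguration` initializer parameters and the `MyService` type are assumptions made for the example; only the two new duration properties come from this change.

```swift
import Logging
import ServiceLifecycle

// Hypothetical service used only for this sketch; any `Service`-conforming type works.
struct MyService: Service {
    func run() async throws { /* do work until cancelled or gracefully shut down */ }
}

// The initializer parameters below are assumptions for the sketch; the two
// duration properties are the ones introduced by this change.
var configuration = ServiceGroupConfiguration(
    services: [.init(service: MyService())],
    gracefulShutdownSignals: [.sigterm],
    logger: Logger(label: "ServiceGroup")
)

// Escalate graceful shutdown to task cancellation after 30 seconds ...
configuration.maximumGracefulShutdownDuration = .seconds(30)
// ... and escalate task cancellation to a `fatalError` after another 10 seconds.
configuration.maximumCancellationDuration = .seconds(10)

let serviceGroup = ServiceGroup(configuration: configuration)
try await serviceGroup.run()
```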

* George review

* Fix some smaller issues
FranzBusch authored Oct 13, 2023
1 parent b71a961 commit d673fdc
Showing 3 changed files with 273 additions and 17 deletions.
120 changes: 112 additions & 8 deletions Sources/ServiceLifecycle/ServiceGroup.swift
@@ -33,6 +33,10 @@ public actor ServiceGroup: Sendable {
private let logger: Logger
/// The logging configuration.
private let loggingConfiguration: ServiceGroupConfiguration.LoggingConfiguration
/// The maximum amount of time that graceful shutdown is allowed to take.
private let maximumGracefulShutdownDuration: (secondsComponent: Int64, attosecondsComponent: Int64)?
/// The maximum amount of time that task cancellation is allowed to take.
private let maximumCancellationDuration: (secondsComponent: Int64, attosecondsComponent: Int64)?
/// The signals that lead to graceful shutdown.
private let gracefulShutdownSignals: [UnixSignal]
/// The signals that lead to cancellation.
@@ -57,6 +61,8 @@ public actor ServiceGroup: Sendable {
self.cancellationSignals = configuration.cancellationSignals
self.logger = configuration.logger
self.loggingConfiguration = configuration.logging
self.maximumGracefulShutdownDuration = configuration._maximumGracefulShutdownDuration
self.maximumCancellationDuration = configuration._maximumCancellationDuration
}

/// Initializes a new ``ServiceGroup``.
@@ -94,6 +100,8 @@ public actor ServiceGroup: Sendable {
self.cancellationSignals = configuration.cancellationSignals
self.logger = logger
self.loggingConfiguration = configuration.logging
self.maximumGracefulShutdownDuration = configuration._maximumGracefulShutdownDuration
self.maximumCancellationDuration = configuration._maximumCancellationDuration
}

/// Runs all the services by spinning up a child task per service.
@@ -176,6 +184,8 @@ public actor ServiceGroup: Sendable {
case signalSequenceFinished
case gracefulShutdownCaught
case gracefulShutdownFinished
case gracefulShutdownTimedOut
case cancellationCaught
}

private func _run(
@@ -191,6 +201,10 @@ public actor ServiceGroup: Sendable {
]
)

// A task that is spawned when we got cancelled or
// we cancel the task group to keep track of a timeout.
var cancellationTimeoutTask: Task<Void, Never>?

// Using a result here since we want a task group that has non-throwing child tasks
// but the body itself is throwing
let result = try await withThrowingTaskGroup(of: ChildTaskResult.self, returning: Result<Void, Error>.self) { group in
@@ -267,6 +281,13 @@ public actor ServiceGroup: Sendable {
}
}

group.addTask {
// This child task is waiting forever until the group gets cancelled.
let (stream, _) = AsyncStream.makeStream(of: Void.self)
await stream.first { _ in true }
return .cancellationCaught
}

// We are storing the services in an optional array now. When a slot in the array is
// empty it indicates that the service has been shutdown.
var services = services.map { Optional($0) }
@@ -293,7 +314,7 @@ public actor ServiceGroup: Sendable {
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
]
)
group.cancelAll()
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
return .failure(ServiceGroupError.serviceFinishedUnexpectedly())

case .gracefullyShutdownGroup:
@@ -307,6 +328,7 @@ public actor ServiceGroup: Sendable {
do {
try await self.shutdownGracefully(
services: services,
cancellationTimeoutTask: &cancellationTimeoutTask,
group: &group,
gracefulShutdownManagers: gracefulShutdownManagers
)
@@ -327,7 +349,7 @@ public actor ServiceGroup: Sendable {
self.logger.debug(
"All services finished."
)
group.cancelAll()
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
return .success(())
}
}
@@ -342,7 +364,7 @@ public actor ServiceGroup: Sendable {
self.loggingConfiguration.keys.errorKey: "\(serviceError)",
]
)
group.cancelAll()
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
return .failure(serviceError)

case .gracefullyShutdownGroup:
@@ -358,6 +380,7 @@ public actor ServiceGroup: Sendable {
do {
try await self.shutdownGracefully(
services: services,
cancellationTimeoutTask: &cancellationTimeoutTask,
group: &group,
gracefulShutdownManagers: gracefulShutdownManagers
)
@@ -381,7 +404,7 @@ public actor ServiceGroup: Sendable {
"All services finished."
)

group.cancelAll()
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
return .success(())
}
}
@@ -398,6 +421,7 @@ public actor ServiceGroup: Sendable {
do {
try await self.shutdownGracefully(
services: services,
cancellationTimeoutTask: &cancellationTimeoutTask,
group: &group,
gracefulShutdownManagers: gracefulShutdownManagers
)
@@ -413,7 +437,7 @@ public actor ServiceGroup: Sendable {
]
)

group.cancelAll()
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
}

case .gracefulShutdownCaught:
Expand All @@ -423,19 +447,29 @@ public actor ServiceGroup: Sendable {
do {
try await self.shutdownGracefully(
services: services,
cancellationTimeoutTask: &cancellationTimeoutTask,
group: &group,
gracefulShutdownManagers: gracefulShutdownManagers
)
} catch {
return .failure(error)
}

case .cancellationCaught:
// We caught cancellation in our child task so we have to spawn
// our cancellation timeout task if needed
self.logger.debug("Caught cancellation.")
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)

case .signalSequenceFinished, .gracefulShutdownFinished:
// This can happen when we are either cancelling everything or
// when the user did not specify any shutdown signals. We just have to tolerate
// this.
continue

case .gracefulShutdownTimedOut:
fatalError("Received gracefulShutdownTimedOut but never triggered a graceful shutdown")

case nil:
fatalError("Invalid result from group.next(). We checked if the group is empty before and still got nil")
}
@@ -447,18 +481,30 @@ public actor ServiceGroup: Sendable {
self.logger.debug(
"Service lifecycle ended"
)
cancellationTimeoutTask?.cancel()
try result.get()
}

private func shutdownGracefully(
services: [ServiceGroupConfiguration.ServiceConfiguration?],
cancellationTimeoutTask: inout Task<Void, Never>?,
group: inout ThrowingTaskGroup<ChildTaskResult, Error>,
gracefulShutdownManagers: [GracefulShutdownManager]
) async throws {
guard case .running = self.state else {
fatalError("Unexpected state")
}

if #available(macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0, *), let maximumGracefulShutdownDuration = self.maximumGracefulShutdownDuration {
group.addTask {
try? await Task.sleep(for: Duration(
secondsComponent: maximumGracefulShutdownDuration.secondsComponent,
attosecondsComponent: maximumGracefulShutdownDuration.attosecondsComponent
))
return .gracefulShutdownTimedOut
}
}

// We are storing the first error of a service that threw here.
var error: Error?

@@ -509,7 +555,7 @@ public actor ServiceGroup: Sendable {
]
)

group.cancelAll()
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
throw ServiceGroupError.serviceFinishedUnexpectedly()
}

@@ -561,9 +607,26 @@ public actor ServiceGroup: Sendable {
]
)

group.cancelAll()
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
}

case .gracefulShutdownTimedOut:
// Gracefully shutting down took longer than the user configured
// so we have to escalate it now.
self.logger.debug(
"Graceful shutdown took longer than allowed by the configuration. Cancelling the group now.",
metadata: [
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
]
)
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)

case .cancellationCaught:
// We caught cancellation in our child task so we have to spawn
// our cancellation timeout task if needed
self.logger.debug("Caught cancellation.")
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)

case .signalSequenceFinished, .gracefulShutdownCaught, .gracefulShutdownFinished:
// We just have to tolerate this since signals and parent graceful shutdowns can race.
continue
@@ -575,7 +638,9 @@ public actor ServiceGroup: Sendable {

// If we hit this then all services are shutdown. The only thing remaining
// are the tasks that listen to the various graceful shutdown signals. We
// just have to cancel those
// just have to cancel those.
// In this case we don't have to spawn our cancellation timeout task since
// we are sure all other child tasks are handling cancellation appropriately.
group.cancelAll()

// If we saw an error during graceful shutdown from a service that triggers graceful
Expand All @@ -584,6 +649,45 @@ public actor ServiceGroup: Sendable {
throw error
}
}

private func cancelGroupAndSpawnTimeoutIfNeeded(
group: inout ThrowingTaskGroup<ChildTaskResult, Error>,
cancellationTimeoutTask: inout Task<Void, Never>?
) {
guard cancellationTimeoutTask == nil else {
// We already have a cancellation timeout task running.
self.logger.debug(
"Task cancellation timeout task already running."
)
return
}
group.cancelAll()

if #available(macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0, *), let maximumCancellationDuration = self.maximumCancellationDuration {
// We have to spawn an unstructured task here because the call to our `run`
// method might have already been cancelled and we need to protect the sleep
// from being cancelled.
cancellationTimeoutTask = Task {
do {
self.logger.debug(
"Task cancellation timeout task started."
)
try await Task.sleep(for: Duration(
secondsComponent: maximumCancellationDuration.secondsComponent,
attosecondsComponent: maximumCancellationDuration.attosecondsComponent
))
self.logger.debug(
"Cancellation took longer than allowed by the configuration."
)
fatalError("Cancellation took longer than allowed by the configuration.")
} catch {
// We got cancelled so our services must have finished up.
}
}
} else {
cancellationTimeoutTask = nil
}
}
}

// This should be removed once we support Swift 5.9+
55 changes: 55 additions & 0 deletions Sources/ServiceLifecycle/ServiceGroupConfiguration.swift
@@ -111,6 +111,61 @@ public struct ServiceGroupConfiguration: Sendable {
/// The group's logging configuration.
public var logging = LoggingConfiguration()

/// The maximum amount of time that graceful shutdown is allowed to take.
///
/// After this time has elapsed graceful shutdown will be escalated to task cancellation.
@available(macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0, *)
public var maximumGracefulShutdownDuration: Duration? {
get {
if let maximumGracefulShutdownDuration = self._maximumGracefulShutdownDuration {
return .init(
secondsComponent: maximumGracefulShutdownDuration.secondsComponent,
attosecondsComponent: maximumGracefulShutdownDuration.attosecondsComponent
)
} else {
return nil
}
}
set {
if let newValue = newValue {
self._maximumGracefulShutdownDuration = (newValue.components.seconds, newValue.components.attoseconds)
} else {
self._maximumGracefulShutdownDuration = nil
}
}
}

internal var _maximumGracefulShutdownDuration: (secondsComponent: Int64, attosecondsComponent: Int64)?

/// The maximum amount of time that task cancellation is allowed to take.
///
/// After this time has elapsed task cancellation will be escalated to a `fatalError`.
///
/// - Important: This setting is useful to guarantee that your application will exit at some point and
/// should be used to identify APIs that are not properly implementing task cancellation.
@available(macOS 13.0, iOS 16.0, watchOS 9.0, tvOS 16.0, *)
public var maximumCancellationDuration: Duration? {
get {
if let maximumCancellationDuration = self._maximumCancellationDuration {
return .init(
secondsComponent: maximumCancellationDuration.secondsComponent,
attosecondsComponent: maximumCancellationDuration.attosecondsComponent
)
} else {
return nil
}
}
set {
if let newValue = newValue {
self._maximumCancellationDuration = (newValue.components.seconds, newValue.components.attoseconds)
} else {
self._maximumCancellationDuration = nil
}
}
}

internal var _maximumCancellationDuration: (secondsComponent: Int64, attosecondsComponent: Int64)?

/// Initializes a new ``ServiceGroupConfiguration``.
///
/// - Parameters: