1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
// Copyright (c) The Diem Core Contributors
// Copyright (c) 2022, Mysten Labs, Inc.
// SPDX-License-Identifier: Apache-2.0

#![forbid(unsafe_code)]

//! A bounded tokio [`Handle`]. Only a bounded number of tasks can run
//! concurrently when spawned through this executor, defined by the initial
//! `capacity`.

use futures::{future::Future, FutureExt};
use std::sync::Arc;
use tokio::{
    runtime::Handle,
    sync::{OwnedSemaphorePermit, Semaphore},
    task::JoinHandle,
};

use tracing::debug;

use thiserror::Error;

/// Error returned when a future cannot be spawned on a [`BoundedExecutor`].
///
/// The error is generic over the future type `F` so that a rejected future can
/// be handed back to the caller instead of being dropped.
#[derive(Error)]
pub enum BoundedExecutionError<F>
where
    F: Future + Send + 'static,
    F::Output: Send + 'static,
{
    /// No permit was available when trying to spawn; carries the future that
    /// was not spawned.
    #[error("Concurrent execution limit reached")]
    Full(F),
}

impl<F> std::fmt::Debug for BoundedExecutionError<F>
where
    F: Future + Send + 'static,
    F::Output: Send + 'static,
{
    // Hand-written so that `F` itself does not have to implement `Debug`:
    // only the variant name is printed and the wrapped future is skipped.
    // This keeps `unwrap()` / `unwrap_err()` usable on results carrying this
    // error.
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Irrefutable today; stops compiling if another variant is ever added.
        let Self::Full(_rejected) = self;
        formatter.write_str("Full")
    }
}

/// Spawns tasks on a tokio runtime while capping how many of them may run at
/// the same time.
///
/// The semaphore is held in an [`Arc`], so every clone of a `BoundedExecutor`
/// shares the same pool of permits.
#[derive(Clone, Debug)]
pub struct BoundedExecutor {
    /// Hands out one permit per running task; created with `capacity` permits.
    semaphore: Arc<Semaphore>,
    /// Runtime handle on which the permit-holding futures are spawned.
    executor: Handle,
}

impl BoundedExecutor {
    /// Builds a `BoundedExecutor` that spawns onto the runtime behind the
    /// given tokio [`Handle`] and lets at most `capacity` spawned tasks hold
    /// a permit at any one time.
    pub fn new(capacity: usize, executor: Handle) -> Self {
        Self {
            semaphore: Arc::new(Semaphore::new(capacity)),
            executor,
        }
    }

    // Obtains a permit from `semaphore`, waiting for one if necessary.
    //
    // A non-waiting attempt is made first; only when it fails do we log that
    // the executor is at capacity and then queue on the semaphore.
    //
    // Panics only if the semaphore has been closed, which nothing in this
    // type ever does.
    async fn acquire_permit(semaphore: Arc<Semaphore>) -> OwnedSemaphorePermit {
        // `try_acquire_owned` consumes its `Arc`, so keep ours for the slow path.
        match semaphore.clone().try_acquire_owned() {
            Ok(permit) => permit,
            Err(_) => {
                debug!("concurrent task limit reached, waiting...");
                semaphore
                    .acquire_owned()
                    .await
                    .expect("BoundedExecutor never closes its semaphore")
            }
        }
    }

    /// Returns the number of permits currently available, i.e. how many more
    /// tasks could be started right now without waiting.
    ///
    /// The value is read from the semaphore shared by all clones of this
    /// executor, so it is only a snapshot and may already be outdated when
    /// the caller looks at it.
    pub fn available_capacity(&self) -> usize {
        self.semaphore.available_permits()
    }

    /// Spawn a [`Future`] on the `BoundedExecutor`. This function is async and
    /// will block if the executor is at capacity until one of the other spawned
    /// futures completes. This function returns a [`JoinHandle`] that the caller
    /// can `.await` on for the results of the [`Future`].
    pub async fn spawn<F>(&self, f: F) -> JoinHandle<F::Output>
    where
        F: Future + Send + 'static,
        F::Output: Send + 'static,
    {
        let permit = Self::acquire_permit(self.semaphore.clone()).await;

        self.spawn_with_permit(f, permit)
    }

    /// Try to spawn a [`Future`] on the `BoundedExecutor`. If the `BoundedExecutor`
    /// is at capacity, this will return an `Err(F)`, passing back the future the
    /// caller attempted to spawn. Otherwise, this will spawn the future on the
    /// executor and send back a [`JoinHandle`] that the caller can `.await` on
    /// for the results of the [`Future`].
    pub fn try_spawn<F>(&self, f: F) -> Result<JoinHandle<F::Output>, BoundedExecutionError<F>>
    where
        F: Future + Send + 'static,
        F::Output: Send + 'static,
    {
        match self.semaphore.clone().try_acquire_owned() {
            Ok(permit) => Ok(self.spawn_with_permit(f, permit)),
            Err(_) => Err(BoundedExecutionError::Full(f)),
        }
    }

    // Spawns `f` on the runtime with an already-acquired permit attached.
    // The permit travels with the task and is handed back to the semaphore
    // once the task has produced its output.
    #[must_use]
    fn spawn_with_permit<F>(
        &self,
        f: F,
        spawn_permit: OwnedSemaphorePermit,
    ) -> JoinHandle<F::Output>
    where
        F: Future + Send + 'static,
        F::Output: Send + 'static,
    {
        self.executor.spawn(Self::with_permit(f, spawn_permit))
    }

    // Runs `f` in place (without spawning a task) while honouring the
    // executor's capacity: first waits until a permit can be obtained from
    // `semaphore`, then drives `f` to completion and releases the permit once
    // `f` has produced its output.
    async fn run_on_semaphore<F>(semaphore: Arc<Semaphore>, f: F) -> F::Output
    where
        F: Future + Send + 'static,
        F::Output: Send + 'static,
    {
        // `semaphore` is owned and not needed afterwards, so it is moved into
        // `acquire_permit` rather than cloned.
        let permit = Self::acquire_permit(semaphore).await;
        Self::with_permit(f, permit).await
    }

    /// Unconditionally spawns a task driving retries of a [`Future`] on the `BoundedExecutor`.
    /// This [`Future`] will be executed in the form of attempts, one after the other, run on
    /// our bounded executor, each according to the provided [`crate::RetryConfig`].
    ///
    /// Each attempt is async and will block if the executor is at capacity until
    /// one of the other attempts completes. In case the attempt completes with an error,
    /// the driver completes a backoff (according to the retry configuration) without holding
    /// a permit, before, queueing an attempt on the executor again.
    ///
    /// This function returns a [`JoinHandle`] that the caller can `.await` on for
    /// the results of the overall retry driver.
    ///
    /// TODO: this still spawns one task, unconditionally, per call.
    /// We would instead like to have one central task that drives all retries
    /// for the whole executor.
    #[must_use]
    pub(crate) fn spawn_with_retries<F, Fut, T, E>(
        &self,
        retry_config: crate::RetryConfig,
        mut f: F,
    ) -> JoinHandle<Result<T, E>>
    where
        F: FnMut() -> Fut + Send + 'static,
        Fut: Future<Output = Result<T, backoff::Error<E>>> + Send + 'static,
        T: Send + 'static,
        E: Send + 'static,
    {
        let retrier = {
            let semaphore = self.semaphore.clone();

            let executor = move || {
                let semaphore = semaphore.clone();
                BoundedExecutor::run_on_semaphore(semaphore, f())
            };

            retry_config.retry(executor)
        };
        self.executor.spawn(retrier)
    }

    // Wraps `f` so that `spawn_permit` is dropped right after `f` yields its
    // output. The permit is owned by the wrapping future, so it lives exactly
    // as long as the wrapped work does.
    async fn with_permit<F>(f: F, spawn_permit: OwnedSemaphorePermit) -> F::Output
    where
        F: Future + Send + 'static,
        F::Output: Send + 'static,
    {
        let release_permit = move |output: F::Output| -> F::Output {
            drop(spawn_permit);
            output
        };

        f.map(release_permit).await
    }
}

#[cfg(test)]
mod test {
    use crate::RetryConfig;

    use super::*;
    use futures::{channel::oneshot, executor::block_on, future::Future, FutureExt};
    use std::{
        sync::{
            atomic::{AtomicU32, Ordering},
            mpsc,
        },
        thread,
        time::Duration,
    };
    use tokio::{runtime::Runtime, time::sleep};

    // a task that panics must still hand its permit back to the executor
    #[test]
    fn try_spawn_panicking() {
        let runtime = Runtime::new().unwrap();
        let bounded = BoundedExecutor::new(1, runtime.handle().clone());

        // the only slot is free, so the panicking task is accepted
        let doomed = bounded.try_spawn(panicking()).unwrap();
        // joining it reports the panic as an error
        block_on(doomed).unwrap_err();

        // the permit of the panicking task is dropped while unwinding, so the
        // slot must be available again for the next task
        let (release, gate) = oneshot::channel();
        let follow_up = bounded.try_spawn(gate).unwrap();

        // cleanup
        release.send(()).unwrap();
        block_on(follow_up).unwrap().unwrap();
    }

    // Future that panics when it is run; used to check that a panicking task
    // does not keep hold of its permit.
    async fn panicking() {
        panic!();
    }

    // `try_spawn` accepts work while a slot is free and hands the future back
    // when the executor is full
    #[test]
    fn try_spawn() {
        let runtime = Runtime::new().unwrap();
        let bounded = BoundedExecutor::new(1, runtime.handle().clone());

        let (first_tx, first_rx) = oneshot::channel();
        let (second_tx, second_rx) = oneshot::channel();

        // nothing is running yet: the single slot is free
        assert_eq!(bounded.available_capacity(), 1);

        let first = bounded.try_spawn(first_rx).unwrap();

        // the slot is taken now, so the second future is rejected and handed
        // back inside the error
        let BoundedExecutionError::Full(second_rx) = bounded.try_spawn(second_rx).unwrap_err();

        // the first task still holds the only permit
        assert_eq!(bounded.available_capacity(), 0);

        // let the first task finish, which frees its slot
        first_tx.send(()).unwrap();
        block_on(first).unwrap().unwrap();

        // the future that was rejected earlier is accepted now
        let second = bounded.try_spawn(second_rx).unwrap();

        // cleanup
        second_tx.send(()).unwrap();
        block_on(second).unwrap().unwrap();

        // with nothing left running the whole capacity is available again
        assert_eq!(bounded.available_capacity(), 1);
    }

    // a retry driver whose attempts keep failing must not hog the semaphore
    #[test]
    fn test_spawn_with_semaphore() {
        // the timeout turns a misbehaving executor into a test failure rather
        // than a test that hangs
        panic_after(Duration::from_secs(10), || {
            let runtime = Runtime::new().unwrap();
            let bounded = BoundedExecutor::new(1, runtime.handle().clone());

            // no limit on the total time spent retrying
            let retry_forever = RetryConfig {
                retrying_max_elapsed_time: None,
                ..Default::default()
            };

            // queue an operation that fails on every attempt
            let failing_driver = bounded.spawn_with_retries(retry_forever, always_failing);

            // another task must still be able to get the single permit
            let (release, gate) = oneshot::channel();
            let task = block_on(bounded.spawn(gate));

            // let that task finish and check it completed successfully
            release.send(()).unwrap();
            block_on(task).unwrap().unwrap();

            // cleanup
            failing_driver.abort();
        })
    }

    // Operation that fails on every call; the error is converted into a
    // `backoff::Error` through its `From` implementation.
    async fn always_failing() -> Result<(), backoff::Error<eyre::Report>> {
        Err(eyre::eyre!("oops").into())
    }

    // Runs `f` on a separate thread and returns its result.
    //
    // Panics with "Thread took too long" if `f` has not finished within `d`;
    // in that case the worker thread is left running detached. If `f` itself
    // panics, that panic is re-raised on the calling thread with its original
    // payload instead of being reported as a timeout.
    fn panic_after<T, F>(d: Duration, f: F) -> T
    where
        T: Send + 'static,
        F: FnOnce() -> T,
        F: Send + 'static,
    {
        let (done_tx, done_rx) = mpsc::channel();
        let handle = thread::spawn(move || {
            let val = f();
            done_tx.send(()).expect("Unable to send completion signal");
            val
        });

        match done_rx.recv_timeout(d) {
            Ok(()) => handle.join().expect("Thread panicked"),
            Err(mpsc::RecvTimeoutError::Timeout) => panic!("Thread took too long"),
            // The sender was dropped without signalling, which happens when
            // the worker unwinds out of `f`. Join it and surface the real
            // panic rather than a misleading timeout message.
            Err(mpsc::RecvTimeoutError::Disconnected) => match handle.join() {
                Ok(val) => val,
                Err(payload) => std::panic::resume_unwind(payload),
            },
        }
    }

    // spawn NUM_TASKS futures on a BoundedExecutor, ensuring that no more than
    // MAX_WORKERS ever enter the critical section.
    #[test]
    fn concurrent_bounded_executor() {
        const MAX_WORKERS: u32 = 20;
        const NUM_TASKS: u32 = 1000;
        static WORKERS: AtomicU32 = AtomicU32::new(0);
        static COMPLETED_TASKS: AtomicU32 = AtomicU32::new(0);

        let rt = Runtime::new().unwrap();
        let executor = rt.handle().clone();
        let executor = BoundedExecutor::new(MAX_WORKERS as usize, executor);

        let mut handles = Vec::with_capacity(NUM_TASKS as usize);
        for _ in 0..NUM_TASKS {
            // `spawn` only resolves once a permit was obtained, so this loop
            // is throttled by the executor's capacity.
            let handle = block_on(executor.spawn(async move {
                // acquired permit, there should only ever be MAX_WORKERS in this
                // critical section

                let prev_workers = WORKERS.fetch_add(1, Ordering::SeqCst);
                assert!(prev_workers < MAX_WORKERS);

                // hand control back to the tokio scheduler for a moment
                yield_task().await;

                let prev_workers = WORKERS.fetch_sub(1, Ordering::SeqCst);
                assert!(prev_workers > 0 && prev_workers <= MAX_WORKERS);

                COMPLETED_TASKS.fetch_add(1, Ordering::Relaxed);
            }));
            handles.push(handle);
        }

        // Join every task instead of spinning on the completion counter: if an
        // assertion inside a task fails, the counter would never reach
        // NUM_TASKS and a spin loop would hang, whereas joining reports the
        // panic and fails the test.
        for handle in handles {
            block_on(handle).expect("spawned task panicked");
        }

        assert_eq!(COMPLETED_TASKS.load(Ordering::Relaxed), NUM_TASKS);
    }

    // Gives other tasks a chance to run by sleeping for one millisecond on
    // the tokio timer; the sleep's output is discarded.
    fn yield_task() -> impl Future<Output = ()> {
        const PAUSE: Duration = Duration::from_millis(1);
        sleep(PAUSE).map(drop)
    }
}