
I have a custom "CachedEnumerable" class (inspired by Caching IEnumerable) that I need to make thread safe for my ASP.NET Core web app.

Is the following implementation of the Enumerator thread safe? (All other reads/writes to IList _cache are locked appropriately) (Possibly related to Does the C# Yield free a lock?)

And more specifically, if there are 2 threads accessing the enumerator, how do I protect against one thread incrementing "index" and thereby causing a second enumerating thread to get the wrong element from the _cache (i.e. the element at index + 1 instead of at index)? Is this race condition a real concern?

public IEnumerator<T> GetEnumerator()
{
    var index = 0;

    while (true)
    {
        T current;
        lock (_enumeratorLock)
        {
            if (index >= _cache.Count && !MoveNext()) break;
            current = _cache[index];
            index++;
        }
        yield return current;
    }
}

Full code of my version of CachedEnumerable:

using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;

public static class HelpersAndExtentions //Containing static class (required for the Cached extension method below; this is the HelpersAndExtentions class referenced in the comments).
{
    public class CachedEnumerable<T> : IDisposable, IEnumerable<T>
    {
        IEnumerator<T> _enumerator;
        private IList<T> _cache = new List<T>();
        public bool CachingComplete { get; private set; } = false;

        public CachedEnumerable(IEnumerable<T> enumerable)
        {
            switch (enumerable)
            {
                case CachedEnumerable<T> cachedEnumerable: //This case is actually dealt with by the extension method.
                    _cache = cachedEnumerable._cache;
                    CachingComplete = cachedEnumerable.CachingComplete;
                    _enumerator = cachedEnumerable.GetEnumerator();

                    break;
                case IList<T> list:
                    //_cache = list; //without clone...
                    //Clone:
                    _cache = new T[list.Count];
                    list.CopyTo((T[]) _cache, 0);
                    CachingComplete = true;
                    break;
                default:
                    _enumerator = enumerable.GetEnumerator();
                    break;
            }
        }

        public CachedEnumerable(IEnumerator<T> enumerator)
        {
            _enumerator = enumerator;
        }

        private int CurCacheCount
        {
            get
            {
                lock (_enumeratorLock)
                {
                    return _cache.Count;
                }
            }
        }

        public IEnumerator<T> GetEnumerator()
        {
            var index = 0;

            while (true)
            {
                T current;
                lock (_enumeratorLock)
                {
                    if (index >= _cache.Count && !MoveNext()) break;
                    current = _cache[index];
                    index++;
                }
                yield return current;
            }
        }

        //private readonly AsyncLock _enumeratorLock = new AsyncLock();
        private readonly object _enumeratorLock = new object();

        private bool MoveNext()
        {
            if (CachingComplete) return false;

            if (_enumerator != null && _enumerator.MoveNext()) //The null check should have been unnecessary b/c of the lock...
            {
                _cache.Add(_enumerator.Current);
                return true;
            }
            else
            {
                CachingComplete = true;
                DisposeWrappedEnumerator(); //Release the enumerator, as it is no longer needed.
            }

            return false;
        }

        public T ElementAt(int index)
        {
            lock (_enumeratorLock)
            {
                if (index < _cache.Count)
                {
                    return _cache[index];
                }
            }

            EnumerateUntil(index);

            lock (_enumeratorLock)
            {
                if (_cache.Count <= index) throw new ArgumentOutOfRangeException(nameof(index));
                return _cache[index];
            }
        }


        public bool TryGetElementAt(int index, out T value)
        {
            lock (_enumeratorLock)
            {
                value = default;
                if (index < CurCacheCount)
                {
                    value = _cache[index];
                    return true;
                }
            }

            EnumerateUntil(index);

            lock (_enumeratorLock)
            {
                if (_cache.Count <= index) return false;
                value = _cache[index];
            }

            return true;
        }

        private void EnumerateUntil(int index)
        {
            while (true)
            {
                lock (_enumeratorLock)
                {
                    if (_cache.Count > index || !MoveNext()) break;
                }
            }
        }


        public void Dispose()
        {
            DisposeWrappedEnumerator();
        }

        private void DisposeWrappedEnumerator()
        {
            if (_enumerator != null)
            {
                _enumerator.Dispose();
                _enumerator = null;
                if (_cache is List<T> list)
                {
                    list.TrimExcess(); //List<T> has no Trim() method; TrimExcess() releases the list's unused capacity.
                }
            }
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }

        public int CachedCount
        {
            get
            {
                lock (_enumeratorLock)
                {
                    return _cache.Count;
                }
            }
        }

        public int Count()
        {
            if (CachingComplete)
            {
                return _cache.Count;
            }

            EnsureCachingComplete();

            return _cache.Count;
        }

        private void EnsureCachingComplete()
        {
            if (CachingComplete)
            {
                return;
            }

            //Enumerate the rest of the collection
            while (!CachingComplete)
            {
                lock (_enumeratorLock)
                {
                    if (!MoveNext()) break;
                }
            }
        }

        public T[] ToArray()
        {
            EnsureCachingComplete();
            //Once Caching is complete, we don't need to lock
            if (!(_cache is T[] array))
            {
                array = _cache.ToArray();
                _cache = array;
            }

            return array;
        }

        public T this[int index] => ElementAt(index);
    }

    public static CachedEnumerable<T> Cached<T>(this IEnumerable<T> source)
    {
        //no gain in caching a cache.
        if (source is CachedEnumerable<T> cached)
        {
            return cached;
        }

        return new CachedEnumerable<T>(source);
    }
}

Basic usage (although not a meaningful use case):

var cached = expensiveEnumerable.Cached();
foreach (var element in cached) {
   Console.WriteLine(element);
}
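
A sketch of the memoization scenario it is actually meant for (the generator below is hypothetical): a lazily produced sequence is consumed partially and more than once, so only the elements actually requested are ever computed:

IEnumerable<int> ExpensiveCalculations()
{
    for (var i = 0; ; i++)
    {
        Thread.Sleep(100); //Stand-in for an expensive computation.
        yield return i * i;
    }
}

var cached = ExpensiveCalculations().Cached();
var firstTen = cached.Take(10).ToList(); //Computes and caches 10 elements.
var firstFive = cached.Take(5).ToList(); //Served entirely from the cache, no recomputation.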

Update

I tested the current implementation based on @Theodor's answer https://stackoverflow.com/a/58547863/5683904 and confirmed (AFAICT) that it is thread-safe when enumerated with a foreach, without creating duplicate values (Thread-safe Cached Enumerator - lock with yield):

using System;
using System.Collections.Concurrent;
using System.Linq;
using System.Threading.Tasks;

class Program
{
    static async Task Main(string[] args)
    {
        var enumerable = Enumerable.Range(0, 1_000_000);
        var cachedEnumerable = new CachedEnumerable<int>(enumerable);
        var c = new ConcurrentDictionary<int, List<int>>();
        var tasks = Enumerable.Range(1, 100).Select(id => Test(id, cachedEnumerable, c));
        await Task.WhenAll(tasks); //Await instead of blocking, since Main is async.
        foreach (var keyValuePair in c)
        {
            var hasDuplicates = keyValuePair.Value.Distinct().Count() != keyValuePair.Value.Count;
            Console.WriteLine($"Task #{keyValuePair.Key} count: {keyValuePair.Value.Count}. Has duplicates? {hasDuplicates}");
        }
    }

    static async Task Test(int id, IEnumerable<int> cache, ConcurrentDictionary<int, List<int>> c)
    {
        foreach (var i in cache)
        {
            //await Task.Delay(10);
            c.AddOrUpdate(id, v => new List<int>() {i}, (k, v) =>
            {
                v.Add(i);
                return v;
            });
        }
    }
}
akraines
  • Thread-safe or not, the enumeration will simply not be correct if `_cache` is modified while enumerating. If `_cache` is initialized once in a thread-safe manner, further access doesn't need to be guarded with a lock. If it's not, all bets are off anyway. – Jeroen Mostert Oct 24 '19 at 12:37
  • I provided the full code of the CachedEnumerable. _cache is a private field and is only modified by growing. It provides readonly random-access to elements in the cache. – akraines Oct 24 '19 at 12:40
  • All this stuff seems absurdly over-engineered compared to just squirreling away the `.ToList()` of an enumerable and using that, by the way, which can be made *obviously* thread-safe. These shenanigans might possibly save some memory/responsiveness of the first request compared to that, but you more than lose out on ability to reason about behavior. Having to ask random people on the Internet if it's thread-safe is never a good sign. – Jeroen Mostert Oct 24 '19 at 12:42
  • I use this only for memoizing expensive calculations that are process and memory intensive, that are lazily generated and used multiple times. – akraines Oct 24 '19 at 12:46
  • Can you show us how this is used (both the producer and the consumer sides)? – tymtam Oct 24 '19 at 12:53
  • You may be able to greatly simplify threading logic by using `ImmutableList`, which is thread-safe by virtue of being immutable. You would only need to replace the reference with each operation. – Jeroen Mostert Oct 24 '19 at 12:56
  • It seems that the issue here is not the cache, but the index variable in the enumerator – akraines Oct 24 '19 at 15:38
  • Isn't it a problem for an ASP.NET application to block thread-pool threads? How about modifying your class to cache [`IAsyncEnumerable`](https://learn.microsoft.com/en-us/dotnet/api/system.collections.generic.iasyncenumerable-1)s instead of `IEnumerable`s? – Theodor Zoulias Oct 25 '19 at 07:20
  • I'll need to update to 3.0 first. But I'll keep it in mind. Thanks – akraines Oct 25 '19 at 12:42
  • @JeroenMostert You made good points about how to deal with thread-safety issues in general. However I needed a lazy enumerated memoizer for my use case, and my question about thread safety was not about the _cache, but about the *enumerator*, as I mentioned in the edit in the OP. "over-engineered" obviously depends on context and the use case. – akraines Nov 04 '19 at 13:35

2 Answers


Your class is not thread safe, because shared state is mutated in unprotected regions inside your class. The unprotected regions are:

  1. The constructor
  2. The Dispose method

The shared state is:

  1. The _enumerator private field
  2. The _cache private field
  3. The CachingComplete public property

Some other issues regarding your class:

  1. Implementing IDisposable places on the caller the responsibility to dispose your class. There is no need for IEnumerables to be disposable. On the contrary, IEnumerators are disposable, but there is language support for their automatic disposal (a feature of the foreach statement).
  2. Your class offers extended functionality not expected from an IEnumerable (ElementAt, Count etc). Maybe you intended to implement a CachedList instead? Without implementing the IList<T> interface, LINQ methods like Count() and ToArray() cannot take advantage of your extended functionality, and will use the slow path like they do with plain vanilla IEnumerables (see the sketch below).
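
For illustration, this is roughly the decision that LINQ's Count() makes (a simplified sketch; the real implementation also checks a couple of additional interfaces):

static int Count<TSource>(IEnumerable<TSource> source)
{
    if (source is ICollection<TSource> collection)
        return collection.Count; //Fast path: O(1), available to types implementing IList<T>/ICollection<T>.

    //Slow path: a full enumeration, which is what a plain IEnumerable (or CachedEnumerable) gets.
    int count = 0;
    using (var e = source.GetEnumerator())
        while (e.MoveNext())
            count++;
    return count;
}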

Update: I just noticed another thread-safety issue. This one is related to the public IEnumerator<T> GetEnumerator() method. The enumerator is compiler-generated, since the method is an iterator (utilizes yield return). Compiler-generated enumerators are not thread safe. Consider this code for example:

var enumerable = Enumerable.Range(0, 1_000_000);
var cachedEnumerable = new CachedEnumerable<int>(enumerable);
var enumerator = cachedEnumerable.GetEnumerator();
var tasks = Enumerable.Range(1, 4).Select(id => Task.Run(() =>
{
    int count = 0;
    while (enumerator.MoveNext())
    {
        count++;
    }
    Console.WriteLine($"Task #{id} count: {count}");
})).ToArray();
Task.WaitAll(tasks);

Four threads are concurrently using the same IEnumerator. The enumerable has 1,000,000 items. You may expect that each thread would enumerate ~250,000 items, but that's not what happens.

Output:

Task #1 count: 0
Task #4 count: 0
Task #3 count: 0
Task #2 count: 1000000

The MoveNext in the line while (enumerator.MoveNext()) is not your safe MoveNext. It is the compiler-generated, unsafe MoveNext. Although unsafe, it includes a mechanism, probably intended for dealing with exceptions, that temporarily marks the enumerator as finished before calling the externally provided code. So when multiple threads call MoveNext concurrently, all but the first will get a return value of false and will terminate the enumeration instantly, having completed zero loops. To solve this you would probably have to code your own IEnumerator class.
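
To illustrate, here is a minimal sketch of such a hand-written enumerator (the class name is illustrative; it reuses the TryGetElementAt method that the question's class already exposes). Note that the MoveNext+Current pair is still not atomic - see the update below:

class SharedCachedEnumerator<T> : IEnumerator<T>
{
    private readonly CachedEnumerable<T> _owner;
    private readonly object _sync = new object();
    private int _index = -1;
    private T _current;

    public SharedCachedEnumerator(CachedEnumerable<T> owner) => _owner = owner;

    public T Current => _current;
    object IEnumerator.Current => Current;

    public bool MoveNext()
    {
        lock (_sync)
        {
            //Check-and-advance is atomic here, so a shared instance never
            //skips or repeats an index, and never short-circuits the way
            //the compiler-generated state machine does.
            if (_owner.TryGetElementAt(_index + 1, out var value))
            {
                _index++;
                _current = value;
                return true;
            }
            return false;
        }
    }

    public void Reset() { lock (_sync) _index = -1; }
    public void Dispose() { }
}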


Update: Actually my last point about thread-safe enumeration is a bit unfair, because enumerating with the IEnumerator interface is an inherently unsafe operation, which is impossible to fix without the cooperation of the calling code. This is because obtaining the next element is not an atomic operation, since it involves two steps (call MoveNext() + read Current). So your thread-safety concerns are limited to the protection of the internal state of your class (the fields _enumerator, _cache and CachingComplete). These are left unprotected only in the constructor and in the Dispose method, but I suppose that the normal use of your class may not follow code paths that create the race conditions that would result in internal state corruption.

Personally I would prefer to take care of these code paths too, and I wouldn't leave it to the whims of chance.
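
For example, even a fully synchronized enumerator leaves a window between the two steps of the consuming pattern:

while (enumerator.MoveNext()) //Thread A advances the enumerator...
{
    var item = enumerator.Current; //...but thread B may call MoveNext in
                                   //between, so thread A observes B's element.
}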


Update: I wrote a cache for IAsyncEnumerables, to demonstrate an alternative technique. The enumeration of the source IAsyncEnumerable is not driven by the callers, using locks or semaphores to obtain exclusive access, but by a separate worker-task. The first caller starts the worker-task. Each caller at first yields all items that are already cached, and then awaits more items, or a notification that there are no more items. As the notification mechanism I used a TaskCompletionSource<bool>. A lock is still used to ensure that all access to shared resources is synchronized.

using System;
using System.Collections.Generic;
using System.Runtime.ExceptionServices;
using System.Threading;
using System.Threading.Tasks;

public class CachedAsyncEnumerable<T> : IAsyncEnumerable<T>
{
    private readonly object _locker = new object();
    private IAsyncEnumerable<T> _source;
    private Task _sourceEnumerationTask;
    private List<T> _buffer;
    private TaskCompletionSource<bool> _moveNextTCS;
    private Exception _sourceEnumerationException;
    private int _sourceEnumerationVersion; // Incremented on exception

    public CachedAsyncEnumerable(IAsyncEnumerable<T> source)
    {
        _source = source ?? throw new ArgumentNullException(nameof(source));
    }

    public async IAsyncEnumerator<T> GetAsyncEnumerator(
        CancellationToken cancellationToken = default)
    {
        lock (_locker)
        {
            if (_sourceEnumerationTask == null)
            {
                _buffer = new List<T>();
                _moveNextTCS = new TaskCompletionSource<bool>();
                _sourceEnumerationTask = Task.Run(
                    () => EnumerateSourceAsync(cancellationToken));
            }
        }
        int index = 0;
        int localVersion = -1;
        while (true)
        {
            T current = default;
            Task<bool> moveNextTask = null;
            lock (_locker)
            {
                if (localVersion == -1)
                {
                    localVersion = _sourceEnumerationVersion;
                }
                else if (_sourceEnumerationVersion != localVersion)
                {
                    ExceptionDispatchInfo
                        .Capture(_sourceEnumerationException).Throw();
                }
                if (index < _buffer.Count)
                {
                    current = _buffer[index];
                    index++;
                }
                else
                {
                    moveNextTask = _moveNextTCS.Task;
                }
            }
            if (moveNextTask == null)
            {
                yield return current;
                continue;
            }
            var moved = await moveNextTask;
            if (!moved) yield break;
            lock (_locker)
            {
                current = _buffer[index];
                index++;
            }
            yield return current;
        }
    }

    private async Task EnumerateSourceAsync(CancellationToken cancellationToken)
    {
        TaskCompletionSource<bool> localMoveNextTCS;
        try
        {
            await foreach (var item in _source.WithCancellation(cancellationToken))
            {
                lock (_locker)
                {
                    _buffer.Add(item);
                    localMoveNextTCS = _moveNextTCS;
                    _moveNextTCS = new TaskCompletionSource<bool>();
                }
                localMoveNextTCS.SetResult(true);
            }
            lock (_locker)
            {
                localMoveNextTCS = _moveNextTCS;
                _buffer.TrimExcess();
                _source = null;
            }
            localMoveNextTCS.SetResult(false);
        }
        catch (Exception ex)
        {
            lock (_locker)
            {
                localMoveNextTCS = _moveNextTCS;
                _sourceEnumerationException = ex;
                _sourceEnumerationVersion++;
                _sourceEnumerationTask = null;
            }
            localMoveNextTCS.SetException(ex);
        }
    }
}

This implementation follows a specific strategy for dealing with exceptions. If an exception occurs while enumerating the source IAsyncEnumerable, the exception will be propagated to all current callers, the currently used IAsyncEnumerator will be discarded, and the incomplete cached data will be discarded too. A new worker-task may start again later, when the next enumeration request is received.
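
Here is a possible usage sketch (ProduceAsync and the consumer names are illustrative): two consumers enumerate concurrently, the first caller starts the worker-task, and the source is enumerated only once:

static async IAsyncEnumerable<int> ProduceAsync()
{
    for (int i = 0; i < 5; i++)
    {
        await Task.Delay(100); //Stand-in for slow production of items.
        yield return i;
    }
}

static async Task Main()
{
    var cached = new CachedAsyncEnumerable<int>(ProduceAsync());

    async Task ConsumeAsync(string name)
    {
        await foreach (var item in cached)
            Console.WriteLine($"{name}: {item}");
    }

    //Both consumers receive all five items, which are produced only once.
    await Task.WhenAll(ConsumeAsync("A"), ConsumeAsync("B"));
}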

Theodor Zoulias
  • Added one more observation. – Theodor Zoulias Oct 24 '19 at 20:41
  • Wow! Thanks - that was exactly the test that I was planning on writing (https://stackoverflow.com/questions/58541336/thread-safe-cached-enumerator-lock-with-yield#comment103410272_58542142). I was concerned about what yield expands to, but I never came across this point before. Regarding your other points: I didn't implement IList, since it caused issues in other places where LINQ or Json.NET fully materialized the list, even though I specifically wanted it to be lazy. – akraines Oct 24 '19 at 21:40
  • However, I tested it with a foreach, and it works correctly. The issue that you are pointing out is that IEnumerator is not thread safe. var enumerable = Enumerable.Range(0, 10); var cachedEnumerable = new HelpersAndExtentions.CachedEnumerable(enumerable); var tasks = Enumerable.Range(1, 4).Select(id => Task.Run(() => { foreach (var i in cachedEnumerable) { Console.WriteLine($"Task #{id} count: {i}"); } })).ToArray(); Task.WaitAll(tasks); – akraines Oct 24 '19 at 21:41
  • Regarding implementing ElementAt - you are correct. The use case was casting to CachedEnumerable in those places where I knew I had a CachedEnumerable and then calling ElementAt – akraines Oct 24 '19 at 22:20
  • Good point regarding the explanation why `CachedEnumerable` is not implementing `IList`! I updated my answer once more with a correction, since I think that my previous point was unfair. – Theodor Zoulias Oct 25 '19 at 01:05
  • I added a `CachedAsyncEnumerable` implementation, to demonstrate another approach for solving the same problem. – Theodor Zoulias Oct 25 '19 at 17:22
  • One aspect that CachedAsyncEnumerable doesn't mimic is laziness ("IAsyncEnumerable is not driven by the callers") - one of the goals of CachedEnumerable is that it doesn't do work unless needed. My use case is: a user can make potentially expensive queries to the app (a query describes how to take data, transform it and search it with a predicate), but doesn't necessarily scroll through all the rows. The CachedEnumerable only materialises the minimum number of rows needed to serve those requests, and doesn't use up processing power or memory to store results that are not/won't be needed. – akraines Oct 27 '19 at 06:01
  • Hm, so what should happen with a `CachedEnumerable` instance if a caller performs a partial enumeration? Would you still store this instance in the application's cache, so that it can serve a future request? Are you comfortable with caching for an indefinite amount of time an active incomplete enumerator, that may hold a handle to an open physical file, or a reference to an open database connection? – Theodor Zoulias Oct 27 '19 at 06:32
  • You are really on the mark. I expect the user to do a partial enumeration, and I also expect multiple users to make similar queries (that is why I'm caching). I store the CachedEnumerable in a LazyCache (https://github.com/alastairtree/LazyCache) which is evicted if not accessed within a few minutes - as soon as the reference is released, GC can collect it. I just need to be extra careful not to store references to it anywhere else, in order not to cause a "memory leak". The data-source is in-memory, so I don't have an issue with IO or DB connections. – akraines Oct 27 '19 at 08:34
  • I see. I guess that the production of the cache-worthy data involves some heavy computation over the raw data stored in memory, otherwise it wouldn't be worth the trouble to cache them. I would say that your case is quite unique. – Theodor Zoulias Oct 27 '19 at 09:05

Yes, access to the cache is thread safe: only one thread at a time can read from the _cache object.

But that way you can't assure that all threads get elements in the same order in which they called GetEnumerator.

Check these two examples; if the behavior is what you expect, you can use the lock in that way.

Example 1:

THREAD1 Calls GetEnumerator

THREAD1 Initialize T current;

THREAD2 Calls GetEnumerator

THREAD2 Initialize T current;

THREAD2 LOCK THREAD

THREAD1 WAIT

THREAD2 read from cache safely _cache[0]

THREAD2 index++

THREAD2 UNLOCK

THREAD1 LOCK

THREAD1 read from cache safely _cache[1]

THREAD1 index++

THREAD1 UNLOCK

THREAD2 yield return current;

THREAD1 yield return current;


Example 2:

THREAD2 Initialize T current;

THREAD2 LOCK THREAD

THREAD2 read from cache safely

THREAD2 UNLOCK

THREAD1 Initialize T current;

THREAD1 LOCK THREAD

THREAD1 read from cache safely

THREAD1 UNLOCK

THREAD1 yield return current;

THREAD2 yield return current;

Stefano Balzarotti
  • I checked only the method you asked for; I have not checked whether the whole class is thread safe. – Stefano Balzarotti Oct 24 '19 at 13:11
  • In Example 1, THREAD 1 is supposed to return _cache[0], but since THREAD 2 incremented index, will it return current = _cache[1]? That is the bug that I'm most concerned about. – akraines Oct 24 '19 at 13:18
  • If that is a bug for you, you need to wrap the whole while loop inside a lock. – Stefano Balzarotti Oct 24 '19 at 13:27
  • Thanks. That is what I was worried about. Do you have a way for me to solve this? – akraines Oct 24 '19 at 13:28
  • Anyway, it is hard to say which thread should get access first and in what order; in my example I have given you an order, but the accesses can happen at the same time. It is the responsibility of the caller to sync the threads in that case. – Stefano Balzarotti Oct 24 '19 at 13:29
  • I don't think that is the problem; the class itself is thread safe, but you can't assure the correct order with a sync inside GetEnumerator. It is the responsibility of the caller to do the proper synchronization, if you want to preserve the order. How can you decide which thread comes first, if both access it at the same time? – Stefano Balzarotti Oct 24 '19 at 14:32
  • Anyway, take a look at https://referencesource.microsoft.com/#mscorlib/system/Collections/Concurrent/ConcurrentQueue.cs,497; it can give you a better way to implement a thread-safe GetEnumerator without deadlocks. As you see, it uses SpinWait instead of a lock statement. – Stefano Balzarotti Oct 24 '19 at 14:35
  • Thanks - I'll look into it. I think that the first step is to create a test case to prove that it is faulty, and then to experiment and see how to solve the issue based on the reference you just cited. – akraines Oct 24 '19 at 15:21
  • I tested it https://stackoverflow.com/questions/58541336/thread-safe-cached-enumerator-lock-with-yield#comment103419879_58547863 and it seems to perform correctly in a foreach loop (emitting the numbers in the correct sequence, without missing), even when shared between threads. – akraines Oct 24 '19 at 21:44
  • I did a more rigorous test with 100 threads and 1_000_000 values, stored the returned values in a list for each thread, and verified that each thread produced all 1,000,000 values. – akraines Oct 24 '19 at 21:51
  • I am not really sure of the test that you did. I consider the code thread safe in the sense that all elements are always enumerated without throwing exceptions and conflicts. But as different threads execute at the same time, they can't each return all the values. With the lock you assure only that each element is processed one time by the different threads. I consider @Theodor Zoulias' answer more complete and reliable, since he also analyzed the compiler-generated code and showed a reproducible example. – Stefano Balzarotti Oct 24 '19 at 23:14
  • I updated my OP with the code I used for my sample test - just play around with the parameters. I'll mark his answer as accepted, although you provided a lot of insight into sharpening and answering the question. – akraines Oct 25 '19 at 06:24