none
Any way to avoid the performance overhead of ThreadPool.RegisterWaitForSingleObject?

    Domanda

  • I have a simple class to handle requests posted to a queue using a thread that I create. The thread uses a ManualResetEvent in the thread's worker function to wait for a signal indicating that some other thread has written an item to the queue. I have experimented with an alternative that uses ThreadPool.RegisterWaitForSingleObject instead of starting my own thread. It works fine, but some (admittedly quite crude) performance testing indicates that the latter approach is relatively slow.

    I was wondering whether there is some particular overhead inherent in using the thread pool to wait in this way. I am unregistering the wait handle each time as indicated in the documentation. Code to reproduce my performance testing is listed below. It takes 1.5 seconds to complete using a thread to wait on the queue and over 10 seconds using the thread pool (for 1 million operations). I understand that this represents a small hit for each request, but taken cumulatively it looks fishy.

    using System.Collections;
    using System.Collections.Generic;
    using System.Diagnostics;
    using System.Threading;
     
    namespace SpeedTest
    {
    	// Benchmark driver: times COUNT enqueue/consume round-trips for the
    	// dedicated-thread consumer vs. the ThreadPool.RegisterWaitForSingleObject consumer.
    	class Program
    	{
    		// Number of requests pushed through each queue implementation.
    		public const int COUNT = 1000000;
     
    		static void Main(string[] args)
    		{
    			// --- Run 1: consumer on a dedicated thread ---
    			Foo.Reset();
    			QueueWithThread qThread = new QueueWithThread();
    			qThread.Run();
     
    			Stopwatch sw = Stopwatch.StartNew();
     
    			for (int i = 0; i < COUNT; i++)
    			{
    				qThread.QueueRequest();
    			}
     
    			// Wait (up to 10 s) for the consumer to signal that COUNT items were processed.
    			Foo._evt.WaitOne(10000);
    			long time = sw.ElapsedMilliseconds;
    			// FIX: the original call read `…μs each""Thread", …` — the comma between the
    			// format string and the first argument was missing, which does not compile.
    			Debug.WriteLine("{0,-14}{1,8}ms    {2,8}μs each", "Thread", time, (1000.0 * time / COUNT).ToString("0.0"));
    			qThread.Stop();
     
    			// --- Run 2: consumer via ThreadPool.RegisterWaitForSingleObject ---
    			Foo.Reset();
    			QueueWithThreadPool qPool = new QueueWithThreadPool();
    			qPool.Run();
     
    			sw = Stopwatch.StartNew();
     
    			for (int i = 0; i < COUNT; i++)
    			{
    				qPool.QueueRequest();
    			}
     
    			Foo._evt.WaitOne(10000);
    			time = sw.ElapsedMilliseconds;
    			// FIX: same missing comma as above.
    			Debug.WriteLine("{0,-14}{1,8}ms    {2,8}μs each", "Pool", time, (1000.0 * time / COUNT).ToString("0.0"));
    			qPool.Stop();
    		}
    	}
     
    	// Producer/consumer queue serviced by a dedicated worker thread that
    	// waits on a ManualResetEvent for "queue is non-empty" notifications.
    	public class QueueWithThread
    	{
    		// volatile: read in the worker loop, written by Stop() on another
    		// thread; without it the JIT may legally cache the flag and the
    		// worker could never observe the exit request.
    		private volatile bool _bRequestExit = false;
    		private Queue<object> _queue = new Queue<object>();
    		// Signaled while the queue (probably) has items; reset by the worker
    		// once it observes an empty queue.
    		private ManualResetEvent _evItemOnQueue = new ManualResetEvent(false);
     
    		// Producer side: enqueue one item and wake the worker.
    		public void QueueRequest()
    		{
    			lock (((ICollection)_queue).SyncRoot)
    			{
    				_queue.Enqueue(new object());
    			}
     
    			_evItemOnQueue.Set();
    		}
     
    		// Start the dedicated consumer thread.
    		public void Run()
    		{
    			_evItemOnQueue.Set();
     
    			Thread worker = new Thread(new ThreadStart(WorkerFunction));
    			worker.Start();
    		}
     
    		// Ask the worker to exit; the Set() wakes it if it is waiting.
    		public void Stop()
    		{
    			_bRequestExit = true;
    			_evItemOnQueue.Set();
    		}
     
    		private void WorkerFunction()
    		{
    			for (; ; )
    			{
    				// 1 s timeout guards against a lost wake-up from the
    				// Set()/Reset() race between producer and this loop.
    				_evItemOnQueue.WaitOne(1000, false);
     
    				if (_bRequestExit)
    				{
    					break;
    				}
     
    				object objOnQueue = null;
     
    				lock (((ICollection)_queue).SyncRoot)
    				{
    					if (_queue.Count > 0)
    					{
    						objOnQueue = _queue.Dequeue();
    					}
    				}
     
    				if (objOnQueue != null)
    				{
    					Foo.DoNothing();
    				}
     
    				// Only reset when the queue is observed empty, so the next
    				// WaitOne returns immediately while items remain.
    				lock (((ICollection)_queue).SyncRoot)
    				{
    					if (_queue.Count == 0)
    					{
    						_evItemOnQueue.Reset();
    					}
    				}
    			}
    		}
    	}
     
    	// Producer/consumer queue whose consumer is scheduled via
    	// ThreadPool.RegisterWaitForSingleObject (executeOnlyOnce = true,
    	// re-registered at the end of each callback).
    	public class QueueWithThreadPool
    	{
    		// volatile: read in the pool callback, written by Stop() on another thread.
    		private volatile bool _bRequestExit = false;
    		private Queue<object> _queue = new Queue<object>();
    		// Signaled while the queue (probably) has items.
    		private ManualResetEvent _evItemOnQueue = new ManualResetEvent(false);
    		// Current registration; ownership is transferred atomically (see below).
    		private RegisteredWaitHandle _waitHandle = null;
     
    		// Producer side: enqueue one item and signal the registered wait.
    		public void QueueRequest()
    		{
    			lock (((ICollection)_queue).SyncRoot)
    			{
    				_queue.Enqueue(new object());
    			}
     
    			_evItemOnQueue.Set();
    		}
     
    		public void Run()
    		{
    			_evItemOnQueue.Set();
    			WaitOnQueue();
    		}
     
    		public void Stop()
    		{
    			_bRequestExit = true;
    			_evItemOnQueue.Set();
    			UnregisterWait();
    		}
     
    		// Atomically take ownership of the current registration and
    		// unregister it. FIX: the original did a check-then-act on
    		// _waitHandle in both Stop() and the callback; the two could race,
    		// double-unregistering or dereferencing a field already nulled by
    		// the other thread.
    		private void UnregisterWait()
    		{
    			RegisteredWaitHandle handle = Interlocked.Exchange(ref _waitHandle, null);
     
    			if (handle != null)
    			{
    				handle.Unregister(null);
    			}
    		}
     
    		// Register a one-shot wait (1 s timeout) on the queue event.
    		private void WaitOnQueue()
    		{
    			_waitHandle = ThreadPool.RegisterWaitForSingleObject(_evItemOnQueue,
    				new WaitOrTimerCallback(WaitOnQueueCallback),
    				null,
    				1000,
    				true);
    		}
     
    		// Runs on a thread-pool thread each time the event is signaled or
    		// the 1 s timeout elapses; processes at most one item, then re-arms.
    		private void WaitOnQueueCallback(object state, bool bTimedOut)
    		{
    			UnregisterWait();
     
    			object objOnQueue = null;
     
    			lock (((ICollection)_queue).SyncRoot)
    			{
    				if (_queue.Count > 0)
    				{
    					objOnQueue = _queue.Dequeue();
    				}
    			}
     
    			if (objOnQueue != null)
    			{
    				Foo.DoNothing();
    			}
     
    			// Only reset when the queue is observed empty, so the next
    			// registered wait completes immediately while items remain.
    			lock (((ICollection)_queue).SyncRoot)
    			{
    				if (_queue.Count == 0)
    				{
    					_evItemOnQueue.Reset();
    				}
    			}
     
    			if (_bRequestExit == false)
    			{
    				WaitOnQueue();
    			}
    		}
    	}
     
    	// Shared completion tracker: counts processed items and signals _evt
    	// once Program.COUNT items have been handled.
    	public static class Foo
    	{
    		private static int _count;
    		// Set when _count reaches Program.COUNT; Main waits on this.
    		public static ManualResetEvent _evt;
     
    		// Re-arm the tracker before each benchmark run.
    		public static void Reset()
    		{
    			_count = 0;
     
    			if (_evt != null)
    			{
    				_evt.Dispose();
    			}
     
    			_evt = new ManualResetEvent(false);
    		}
     
    		public static void DoNothing()
    		{
    			// FIX: ++_count is a non-atomic read-modify-write, but this is
    			// called from multiple thread-pool threads in the pool scenario;
    			// lost increments could keep the count below COUNT and stall
    			// Main until its 10 s WaitOne timeout.
    			if (Interlocked.Increment(ref _count) >= Program.COUNT)
    			{
    				_evt.Set();
    			}
    		}
    	}
    }
    


    R

    giovedì 23 febbraio 2012 21:16

Risposte

  • I think the answer to the original question is 'no', but thanks for helpful responses.

    Using ConcurrentQueue rather than locking on a normal Queue improves performance, as does taking items from the queue while there are any there (not blindly waiting each time).

    The underlying issue is that calling WaitOne on a ManualResetEvent that is already set has practically no overhead, while calling RegisterWaitForSingleObject on a wait handle that is already set has significantly more.


    R

    • Contrassegnato come risposta Ron Idaho martedì 28 febbraio 2012 08:38
    martedì 28 febbraio 2012 08:37

Tutte le risposte

  • If you're using .NET 4, I'd recommend using BlockingCollection<T> instead. 

    It was designed specifically for multithreaded producer/consumer scenarios, and allows you to dramatically simplify your processing, while maintaining very good performance.  It even allows multiple threads to produce (add to queue) and/or consume in any combination.  This is done using the new lockless collections in System.Collections.Concurrent, which will likely outperform your lock-based approach above.

    The code for this ends up looking similar to:

    // Using:
    
    BlockingCollection<Foo> queue;
    
    // Producer - Just add items as needed from any thread
    queue.Add(new Foo("Bar"));
    
    
    // Consumer (in another thread):
    // This automatically handles the wait handles + signalling, etc
    foreach(Foo item in queue.GetConsumingEnumerable())
    {
       // Process items
       item.DoSomething();
    }


    Reed Copsey, Jr. - http://reedcopsey.com
    If a post answers your question, please click "Mark As Answer" on that post and "Mark as Helpful".

    giovedì 23 febbraio 2012 21:21
  • I think the way you call Unregister() is incorrect. You should pass the WaitHandle object (callback) to this method to unregister the timer which has been created in the background by ThreadPool.RegisterWaitForSingleObject().

    As you have passed a no-named object to this method, I suggest you to apply a little change to your code. Declare a variable for your callback so that you are able to pass the variable to Unregister() as well.

    giovedì 23 febbraio 2012 21:47
  • Using BlockingCollection<T> rather than Queue<T> makes little difference to the results and in particular the relative performance difference between the thread and thread pool versions of the code remains. The underlying issue that I would like to understand/address is why the thread pool version is relatively so much slower.


    R

    giovedì 23 febbraio 2012 21:48
  • This (calling unregister with the wait handle as a parameter) does not make any difference to the performance in the test.

    R


    • Modificato Ron Idaho giovedì 23 febbraio 2012 21:55
    giovedì 23 febbraio 2012 21:52
  • I noticed that you instantiate Thread objects. Why don't you use pooled threads (ThreadPool.QueueUserWorkItem)?
    giovedì 23 febbraio 2012 21:59
  • The class that starts the thread will, in the real code where it is used, live for an indeterminate period, perhaps the life of the application. It waits for requests to be queued and then services them in the thread it creates. The version of the code using the thread pool and RegisterWaitForSingleObject is an attempt to avoid starting a thread, but as noted it is relatively slow. I can't use a thread from the thread pool to wait on a queue when it may be waiting for a long time.

    R

    giovedì 23 febbraio 2012 22:09
  • I think the problem with your code is that you are calling WaitOnQueue() recursively (it calls RegisterWaitForSingleObject, which then invokes the callback, and the callback calls WaitOnQueue again). Each time this method is called it overwrites the _waitHandle variable without the older WaitHandle being disposed immediately. So the old WaitHandles will remain in memory until they are garbage collected, and it is not deterministic when that happens. I think you'd better review your algorithm.

    giovedì 23 febbraio 2012 22:19
  • The ThreadPool is a funny thing in that its qualities tend not to shine in little proofs of concept like this one. In addition, you're not exactly comparing apples to apples. In your handwritten QueueWithThread the worker thread can get called once, loop as long as items are in the queue, and empty the queue without ever waiting on the ManualResetEvent. In QueueWithThreadPool, an instance of the callback gets called for every item in the queue. Calling a delegate is far more expensive than performing one iteration of a for loop. So the performance loss comes more from calling a delegate 1000000 times vs. checking _bRequestExit 1000000 times than from anything related to RegisterWaitForSingleObject.
    venerdì 24 febbraio 2012 05:54
  • It is not recursive because the callback completes in a thread provided by the thread pool. As the new wait operation starts just before the previous callback exits the worst that can happen is that two threads are in the callback at the same time, but the wait handle will have been tidied by the first before the second enters. Run the code, add trace logging, step through, do whatever to prove the case, but it does not behave as you assert. Recursion on this scale would lead to a stack overflow, which does not happen.


    R


    • Modificato Ron Idaho venerdì 24 febbraio 2012 08:14
    venerdì 24 febbraio 2012 07:56
  • Interesting, but I don't think the reasons for the performance loss have been identified and that the examples tested are at least different varieties of apples:

    1. If I change QueueWithThread to invoke DoNothing() using a delegate, so that the delegate is called 1000000 times (in fact, created and called 1000000 times) as part of the test, then it adds 400ms to the timing, still leaving a gap of nearly 8 seconds in performance.
    2. Both examples check the state of _bRequestExit 1000000 times.
    3. QueueWithThread calls WaitOne on the ManualResetEvent 1000000 times.

    I was expecting that the thread pool version would show a small overhead compared with using a thread directly, but the deviation is still relatively very large. Is it taking the thread pool 8 seconds to invoke a delegate 1000000 times when doing the same thing directly in code takes 1/20th of that time? If you still believe that the test is not a fair one, can you suggest a variation that would establish the performance difference more accurately?


    R

    venerdì 24 febbraio 2012 08:13
  • If I change QueueWithThread to invoke DoNothing() using a delegate, so that the delegate is called 1000000 times

    But does it recreate the delegate every time?

    Both examples check the state of _bRequestExit 1000000 times.

    True, I was using that as a way to contrast the amount of overhead between the two methods.

    QueueWithThread calls WaitOne on the ManualResetEvent 1000000 times

    It does, but WaitOne never blocks because the event is never unsignaled. I ran it with the Concurrency Visualizer and checked.

    A more apples to apples comparison would be one where the consumer blocked on an event (for at least a quantum), created a delegate and then invoked it. Or you could have QueueWithThreadPool keep taking objects out of the queue as long as there were objects in it.

    venerdì 24 febbraio 2012 17:27
  • "The underlying issue that I would like to understand/address is why the thread pool version is relatively so much slower."

    The difference in performance between these two approaches is fairly obvious - 

    The Thread approach here empties the entire queue - where the ThreadPool approach only dequeues one item per callback.  The overhead of setting up the wait handle, registering a callback, signaling, and then rescheduling on a threadpool thread for every item is going to add some overhead.

    However, that's probably not the main culprit - I highly suspect that you're maxing out the ThreadPool threads here, as well.  Since the ThreadPool version is using the ThreadPool, and scheduling one thread pool thread per request, as soon as you run out of thread pool threads, you're going to be blocking until the previous items can be handled.  This effectively throttles the queue processing to only allow a fixed number of thread pool threads to execute.  Even if you never hit the max thread pool threads - the threadpool starts off with a limited number of threads, and waits (by design) before "ramping up".  A few seconds of extra processing time could easily be explained by the ThreadPool not having enough threads to process the items at the same time (or at least, as quickly as you're adding them).


    Reed Copsey, Jr. - http://reedcopsey.com
    If a post answers your question, please click "Mark As Answer" on that post and "Mark as Helpful".

    venerdì 24 febbraio 2012 18:04
  •  I highly suspect that you're maxing out the ThreadPool threads here, as well.

    Actually it's not. When I ran it under the Concurrency Visualizer the ThreadPool didn't create any additional threads once it warmed up. On my four core machine there were 4 (maybe 5) ThreadPool threads and none of them were destroyed, and no additional ones were created after the base number of threads were created.
    venerdì 24 febbraio 2012 19:21
  • I think the answer to the original question is 'no', but thanks for helpful responses.

    Using ConcurrentQueue rather than locking on a normal Queue improves performance, as does taking items from the queue while there are any there (not blindly waiting each time).

    The underlying issue is that calling WaitOne on a ManualResetEvent that is already set has practically no overhead, while calling RegisterWaitForSingleObject on a wait handle that is already set has significantly more.


    R

    • Contrassegnato come risposta Ron Idaho martedì 28 febbraio 2012 08:38
    martedì 28 febbraio 2012 08:37