File: system\threading\Tasks\ParallelRangeManager.cs
Project: ndp\clr\src\bcl\mscorlib.csproj (mscorlib)
// ==++==
//
//   Copyright (c) Microsoft Corporation.  All rights reserved.
// 
// ==--==
// =+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+
//
// ParallelRangeManager.cs
//
// <OWNER>Microsoft</OWNER>
//
// Implements the algorithm for distributing loop indices to parallel loop workers
//
// =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 
using System;
using System.Threading;
using System.Diagnostics.Contracts;
using System.Runtime.InteropServices;
 
#pragma warning disable 0420
 
namespace System.Threading.Tasks
{
    /// <summary>
    /// Represents an index range
    /// </summary>
    [StructLayout(LayoutKind.Auto)]
    internal struct IndexRange
    {
        // the From and To values for this range. These do not change.
        internal long m_nFromInclusive;
        internal long m_nToExclusive;
 
        // The shared index, stored as the offset from m_nFromInclusive. Using an offset rather than the
        // actual value saves us from overflows that can happen due to multiple workers racing to
        // increment this. All updates to this field need to be interlocked. To avoid split interlocked
        // operations across cache lines in 32-bit processes, when the range fits in a 32-bit value we
        // prefer to use a 32-bit field and just use the first 32 bits of the long. And to minimize
        // false sharing, each value is stored in its own heap-allocated object, which is lazily
        // allocated by the thread using that range, minimizing the chances it'll be near the objects
        // from other threads.
        internal volatile Shared<long> m_nSharedCurrentIndexOffset;
 
        // set to 1 by a worker that finds this range exhausted, so others can skip it quickly.
        // A plain write would be OK here; the code uses Interlocked.Exchange to publish the flag promptly.
        internal int m_bRangeFinished;
    }
 
 
    /// <summary>
    /// The RangeWorker struct wraps the state needed by a task that services the parallel loop
    /// </summary>
    [StructLayout(LayoutKind.Auto)]
    internal struct RangeWorker
    {
        // reference to the IndexRange array allocated by the range manager
        internal readonly IndexRange[] m_indexRanges;
 
        // index of the current index range that this worker is grabbing chunks from
        internal int m_nCurrentIndexRange;        
 
        // the step for this loop. Duplicated here for quick access (rather than jumping to the RangeManager)
        internal long m_nStep;
 
        // increment value is the current amount that this worker will use 
        // to increment the shared index of the range it's working on
        internal long m_nIncrementValue;
 
        // the increment value is doubled each time this worker finds work, and is capped at this value
        internal readonly long m_nMaxIncrementValue;
 
        // whether to use 32-bits or 64-bits of current index in each range
        internal readonly bool _use32BitCurrentIndex;
 
        /// <summary>
        /// Initializes a RangeWorker struct
        /// </summary>
        internal RangeWorker(IndexRange[] ranges, int nInitialRange, long nStep, bool use32BitCurrentIndex)
        {
            m_indexRanges = ranges;
            m_nCurrentIndexRange = nInitialRange;
            _use32BitCurrentIndex = use32BitCurrentIndex;
            m_nStep = nStep;
 
            m_nIncrementValue = nStep;
 
            m_nMaxIncrementValue = Parallel.DEFAULT_LOOP_STRIDE * nStep;
        }
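 
        // Illustrative note (assuming Parallel.DEFAULT_LOOP_STRIDE == 16): with nStep == 1, a worker's
        // chunk sizes grow 1, 2, 4, 8, 16, 16, ... as it keeps finding work, so short loops pay little
        // startup cost while long loops amortize each interlocked increment over many iterations.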
 
        /// <summary>
        /// Implements the core work search algorithm that will be used for this range worker. 
        /// </summary> 
        /// 
        /// Usage pattern is:
        ///    1) the thread associated with this RangeWorker calls FindNewWork
        ///    2) if we return true, the worker uses the nFromInclusiveLocal and nToExclusiveLocal values
        ///       to execute the sequential loop
        ///    3) if we return false it means there is no more work left. It's time to quit.        
        ///    
        internal bool FindNewWork(out long nFromInclusiveLocal, out long nToExclusiveLocal)
        {
            // since we iterate over index ranges circularly, we will use the
            // count of visited ranges as our exit condition
            int numIndexRangesToVisit = m_indexRanges.Length;
 
            do
            {
                // local snap to save array access bounds checks in places where we only read fields
                IndexRange currentRange = m_indexRanges[m_nCurrentIndexRange];
 
                if (currentRange.m_bRangeFinished == 0)
                {
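                    // Lazily allocate the shared offset object on first use; if the CompareExchange race
                    // is lost, every worker simply uses whichever Shared<long> instance won.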
                    if (m_indexRanges[m_nCurrentIndexRange].m_nSharedCurrentIndexOffset == null)
                    {
                        Interlocked.CompareExchange(ref m_indexRanges[m_nCurrentIndexRange].m_nSharedCurrentIndexOffset, new Shared<long>(0), null);
                    }
 
                    // this update must target the field in the array slot itself, not the local
                    // snapshot above, so that all workers share the same interlocked location
                    long nMyOffset;
                    if (IntPtr.Size == 4 && _use32BitCurrentIndex)
                    {
                        // In 32-bit processes, we prefer to use 32-bit interlocked operations, to avoid the possibility of doing
                        // a 64-bit interlocked when the target value crosses a cache line, as that can be super expensive.
                        // We use the first 32 bits of the Int64 index in such cases. (On the little-endian
                        // architectures where 32-bit CLR processes run, those are the low 32 bits, which hold
                        // the entire offset for a range whose length fits in an Int32.)
                        unsafe
                        {
                            fixed (long* indexPtr = &m_indexRanges[m_nCurrentIndexRange].m_nSharedCurrentIndexOffset.Value)
                            {
                                nMyOffset = Interlocked.Add(ref *(int*)indexPtr, (int)m_nIncrementValue) - m_nIncrementValue;
                            }
                        }
                    }
                    else
                    {
                        nMyOffset = Interlocked.Add(ref m_indexRanges[m_nCurrentIndexRange].m_nSharedCurrentIndexOffset.Value, m_nIncrementValue) - m_nIncrementValue;
                    }
 
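                    // nMyOffset is the previous value of the shared offset, i.e. the start of the chunk
                    // this worker has exclusively claimed (Interlocked.Add returns the new value, so the
                    // increment is subtracted back out). If it already lies at or beyond the end of the
                    // range, other workers have consumed the whole range.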
                    if (currentRange.m_nToExclusive - currentRange.m_nFromInclusive > nMyOffset)
                    {
                        // we found work
 
                        nFromInclusiveLocal = currentRange.m_nFromInclusive + nMyOffset;
                        nToExclusiveLocal = nFromInclusiveLocal + m_nIncrementValue;
 
                        // Check for going past end of range, or wrapping
                        if ( (nToExclusiveLocal > currentRange.m_nToExclusive) || (nToExclusiveLocal < currentRange.m_nFromInclusive) )
                        {
                            nToExclusiveLocal = currentRange.m_nToExclusive;
                        }
 
                        // We will double our unit of increment until it reaches the maximum.
                        if (m_nIncrementValue < m_nMaxIncrementValue)
                        {
                            m_nIncrementValue *= 2;
                            if (m_nIncrementValue > m_nMaxIncrementValue)
                            {
                                m_nIncrementValue = m_nMaxIncrementValue;
                            }
                        }
 
                        return true;
                    }
                    else
                    {
                        // this index range is completed, mark it so that others can skip it quickly
                        Interlocked.Exchange(ref m_indexRanges[m_nCurrentIndexRange].m_bRangeFinished, 1);
                    }
                }
 
                // move on to the next index range, in circular order.
                m_nCurrentIndexRange = (m_nCurrentIndexRange + 1) % m_indexRanges.Length;
                numIndexRangesToVisit--;
 
            } while (numIndexRangesToVisit > 0);
            // we've visited all index ranges possible => there's no work remaining
 
            nFromInclusiveLocal = 0;
            nToExclusiveLocal = 0;
 
            return false;
        }
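 
        // Usage sketch (hypothetical caller, mirroring the pattern described in the summary above;
        // 'body' and 'nStep' are placeholders for the loop delegate and step):
        //
        //     RangeWorker worker = rangeManager.RegisterNewWorker();
        //     long nFrom, nTo;
        //     while (worker.FindNewWork(out nFrom, out nTo))
        //     {
        //         for (long i = nFrom; i < nTo; i += nStep)
        //             body(i);
        //     }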
 
 
        /// <summary>
        /// 32-bit integer version of FindNewWork. Assumes the ranges were initialized with 32-bit values.
        /// </summary> 
        internal bool FindNewWork32(out int nFromInclusiveLocal32, out int nToExclusiveLocal32)
        {
            long nFromInclusiveLocal;
            long nToExclusiveLocal;
 
            bool bRetVal = FindNewWork(out nFromInclusiveLocal, out nToExclusiveLocal);
 
            Contract.Assert((nFromInclusiveLocal <= Int32.MaxValue) && (nFromInclusiveLocal >= Int32.MinValue) &&
                            (nToExclusiveLocal <= Int32.MaxValue) && (nToExclusiveLocal >= Int32.MinValue));
            
            // convert to 32 bit before returning
            nFromInclusiveLocal32 = (int)nFromInclusiveLocal;
            nToExclusiveLocal32 = (int)nToExclusiveLocal;
            
            return bRetVal;
        }
    }
 
 
    /// <summary>
    /// Represents the entire loop operation, keeping track of workers and ranges.
    /// </summary>
    /// 
    /// The usage pattern is:
    ///    1) The Parallel loop entry function (ForWorker) creates an instance of this class
    ///    2) Every thread joining to service the parallel loop calls RegisterNewWorker to grab a
    ///       RangeWorker struct wrapping the state it will need to find and execute work,
    ///       and keeps interacting with that struct until the end of the loop
    internal class RangeManager
    {
        internal readonly IndexRange[] m_indexRanges;
        internal readonly bool _use32BitCurrentIndex;
 
        internal int m_nCurrentIndexRangeToAssign;
        internal long m_nStep;
        
        /// <summary>
        /// Initializes a RangeManager with the given loop parameters, and the desired number of outer ranges
        /// </summary>
        internal RangeManager(long nFromInclusive, long nToExclusive, long nStep, int nNumExpectedWorkers)
        {
            m_nCurrentIndexRangeToAssign = 0;
            m_nStep = nStep;
 
            // Our signed math breaks down w/ nNumExpectedWorkers == 1.  So change it to 2.
            if (nNumExpectedWorkers == 1)
                nNumExpectedWorkers = 2;
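 
            // (With one worker, uRangeSize below could equal the entire uSpan, which may not fit in a
            // signed Int64 when the loop covers more than half the range of long; dividing by at least
            // two keeps the (long)uRangeSize cast further down safe.)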
 
            //
            // calculate the size of each index range
            //
 
            ulong uSpan = (ulong)(nToExclusive - nFromInclusive);
            ulong uRangeSize = uSpan / (ulong) nNumExpectedWorkers; // rough estimate first
            
            uRangeSize -= uRangeSize % (ulong) nStep; // snap to multiples of nStep 
                                                      // otherwise index range transitions will derail us from nStep
 
            if (uRangeSize == 0)
            {
                uRangeSize = (ulong) nStep;
            }
 
            //
            // find the actual number of index ranges we will need
            //
            Contract.Assert((uSpan / uRangeSize) < Int32.MaxValue);
 
            int nNumRanges = (int)(uSpan / uRangeSize);
            
            if (uSpan % uRangeSize != 0)
            {
                nNumRanges++;
            }
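 
            // Worked example (illustrative values only): nFromInclusive = 0, nToExclusive = 103,
            // nStep = 2, nNumExpectedWorkers = 4 gives uSpan = 103, uRangeSize = 25 snapped down to 24,
            // and nNumRanges = 5 (four full ranges plus one partial range of 7 indices).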
 
 
            // Convert to signed so the rest of the logic works.
            // Should be fine so long as uRangeSize < Int64.MaxValue, which we guaranteed by setting #workers >= 2. 
            long nRangeSize = (long)uRangeSize; 
            _use32BitCurrentIndex = IntPtr.Size == 4 && nRangeSize <= int.MaxValue;
 
            // allocate the array of index ranges
            m_indexRanges = new IndexRange[nNumRanges];
 
            long nCurrentIndex = nFromInclusive;
            for (int i = 0; i < nNumRanges; i++)
            {
                // the fromInclusive of the new index range is always on nCurrentIndex
                m_indexRanges[i].m_nFromInclusive = nCurrentIndex;
                m_indexRanges[i].m_nSharedCurrentIndexOffset = null;
                m_indexRanges[i].m_bRangeFinished = 0;
 
                // now increment it to find the toExclusive value for our range
                nCurrentIndex += nRangeSize;
 
                // detect integer overflow or range overage and snap to nToExclusive
                if (nCurrentIndex < nCurrentIndex - nRangeSize ||
                    nCurrentIndex > nToExclusive)
                {
                    // this should only happen at the last index
                    Contract.Assert(i == nNumRanges - 1);
 
                    nCurrentIndex = nToExclusive;
                }
 
                // now that the end point of the new range is calculated, assign it.
                m_indexRanges[i].m_nToExclusive = nCurrentIndex;
            }
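 
            // Continuing the example above, this loop yields [0,24), [24,48), [48,72), [72,96) and
            // [96,103); the final range's end is snapped to nToExclusive by the overflow/overage check.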
        }
 
        /// <summary>
        /// The function that needs to be called by each new worker thread servicing the parallel loop
        /// in order to get a RangeWorker struct that wraps the state for finding and executing indices
        /// </summary>
        internal RangeWorker RegisterNewWorker()
        {
            Contract.Assert(m_indexRanges != null && m_indexRanges.Length != 0);
 
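            // Interlocked.Increment hands each registering worker a distinct starting range in
            // round-robin order, spreading workers across ranges so contention on any single
            // shared index stays low.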
            int nInitialRange = (Interlocked.Increment(ref m_nCurrentIndexRangeToAssign) - 1) % m_indexRanges.Length;
 
            return new RangeWorker(m_indexRanges, nInitialRange, m_nStep, _use32BitCurrentIndex);
        }
    }
}
#pragma warning restore 0420