// -*- C++ -*-
// Copyright (C) 2007, 2008 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the terms
// of the GNU General Public License as published by the Free Software
// Foundation; either version 2, or (at your option) any later
// version.

// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with this library; see the file COPYING.  If not, write to
// the Free Software Foundation, 59 Temple Place - Suite 330, Boston,
// MA 02111-1307, USA.

// As a special exception, you may use this file as part of a free
// software library without restriction.  Specifically, if other files
// instantiate templates or use macros or inline functions from this
// file, or you compile this file and link it with other files to
// produce an executable, this file does not by itself cause the
// resulting executable to be covered by the GNU General Public
// License.  This exception does not however invalidate any other
// reasons why the executable file might be covered by the GNU General
// Public License.
/** @file parallel/multiway_mergesort.h
 *  @brief Parallel multiway merge sort.
 *  This file is a GNU parallel extension to the Standard C++ Library.
 */

// Written by Johannes Singler.
#ifndef _GLIBCXX_PARALLEL_MERGESORT_H
#define _GLIBCXX_PARALLEL_MERGESORT_H 1

#include <vector>

#include <parallel/basic_iterator.h>
#include <bits/stl_algo.h>
#include <parallel/parallel.h>
#include <parallel/multiway_merge.h>
namespace __gnu_parallel
{
  /** @brief Subsequence description. */
  template<typename _DifferenceTp>
    struct Piece
    {
      typedef _DifferenceTp difference_type;

      /** @brief Begin of subsequence. */
      difference_type begin;

      /** @brief End of subsequence. */
      difference_type end;
    };
  /** @brief Data accessed by all threads.
   *
   *  PMWMS = parallel multiway mergesort */
  template<typename RandomAccessIterator>
    struct PMWMSSortingData
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::value_type value_type;
      typedef typename traits_type::difference_type difference_type;

      /** @brief Number of threads involved. */
      thread_index_t num_threads;

      /** @brief Input begin. */
      RandomAccessIterator source;

      /** @brief Start indices, per thread. */
      difference_type* starts;

      /** @brief Temporary arrays for each thread.
       *
       *  Indirection allows using the temporary storage in different
       *  ways, without code duplication.
       *  @see _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST */
      value_type** temporaries;

#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
      /** @brief Storage in which to sort. */
      RandomAccessIterator* sorting_places;

      /** @brief Storage into which to merge. */
      value_type** merging_places;
#else
      /** @brief Storage in which to sort. */
      value_type** sorting_places;

      /** @brief Storage into which to merge. */
      RandomAccessIterator* merging_places;
#endif

      /** @brief Samples. */
      value_type* samples;

      /** @brief Offsets to add to the found positions. */
      difference_type* offsets;

      /** @brief Pieces of data to merge @c [thread][sequence]. */
      std::vector<Piece<difference_type> >* pieces;

      /** @brief Stable sorting desired. */
      bool stable;
    };
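
  // Note on _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST: if the macro is set,
  // each thread sorts its chunk in the input sequence itself, merges
  // into a temporary buffer, and copies the result back at the end; if
  // it is unset, each thread first copies its chunk into a temporary
  // buffer, sorts there, and merges directly back into the input
  // sequence.  The member types above switch accordingly.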
  /**
   *  @brief Select samples from a sequence.
   *  @param sd Pointer to algorithm data. Result will be placed in
   *  @c sd->samples.
   *  @param num_samples Number of samples to select; computed inside
   *  this function and returned by reference.
   */
  template<typename RandomAccessIterator, typename _DifferenceTp>
    void
    determine_samples(PMWMSSortingData<RandomAccessIterator>* sd,
                      _DifferenceTp& num_samples)
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::value_type value_type;
      typedef _DifferenceTp difference_type;

      thread_index_t iam = omp_get_thread_num();
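
      // Oversampling: each thread draws (sort_mwms_oversampling
      // * num_threads - 1) samples from its chunk, which makes the
      // splitters chosen later more robust against skewed inputs.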
      num_samples =
        _Settings::get().sort_mwms_oversampling * sd->num_threads - 1;

      difference_type* es = new difference_type[num_samples + 2];

      equally_split(sd->starts[iam + 1] - sd->starts[iam],
                    num_samples + 1, es);
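
      // equally_split writes (num_samples + 1) nearly equal chunk
      // borders for this thread's range into es; the samples are then
      // taken at the interior borders es[1] .. es[num_samples].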
      for (difference_type i = 0; i < num_samples; ++i)
        ::new(&(sd->samples[iam * num_samples + i]))
            value_type(sd->source[sd->starts[iam] + es[i + 1]]);

      delete[] es;
    }
  /** @brief PMWMS code executed by each thread.
   *  @param sd Pointer to algorithm data.
   *  @param comp Comparator.
   */
  template<typename RandomAccessIterator, typename Comparator>
    void
    parallel_sort_mwms_pu(PMWMSSortingData<RandomAccessIterator>* sd,
                          Comparator& comp)
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::value_type value_type;
      typedef typename traits_type::difference_type difference_type;

      thread_index_t iam = omp_get_thread_num();

      // Length of this thread's chunk, before merging.
      difference_type length_local = sd->starts[iam + 1] - sd->starts[iam];

#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
      typedef RandomAccessIterator SortingPlacesIterator;

      // Sort in input storage.
      sd->sorting_places[iam] = sd->source + sd->starts[iam];
#else
      typedef value_type* SortingPlacesIterator;

      // Sort in temporary storage, leave space for sentinel.
      sd->sorting_places[iam] = sd->temporaries[iam] =
        static_cast<value_type*>(
          ::operator new(sizeof(value_type) * (length_local + 1)));
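
      // The "+ 1" reserves room for one element past the end of the
      // chunk, which sentinel-based merge variants can use to avoid
      // end-of-sequence tests.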
      // Copy there.
      std::uninitialized_copy(sd->source + sd->starts[iam],
                              sd->source + sd->starts[iam] + length_local,
                              sd->sorting_places[iam]);
#endif

      // Sort locally.
      if (sd->stable)
        __gnu_sequential::stable_sort(sd->sorting_places[iam],
                                      sd->sorting_places[iam]
                                      + length_local,
                                      comp);
      else
        __gnu_sequential::sort(sd->sorting_places[iam],
                               sd->sorting_places[iam] + length_local,
                               comp);

      // Invariant: locally sorted subsequence in sd->sorting_places[iam],
      // sd->sorting_places[iam] + length_local.

      const _Settings& __s = _Settings::get();

      if (__s.sort_splitting == SAMPLING)
        {
          difference_type num_samples;
          determine_samples(sd, num_samples);

#         pragma omp barrier

#         pragma omp single
          __gnu_sequential::sort(sd->samples,
                                 sd->samples
                                 + (num_samples * sd->num_threads),
                                 comp);

#         pragma omp barrier
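
          // All samples are now globally sorted.  The sample of rank
          // (num_samples * iam) serves as the lower splitter for thread
          // iam: searching for it in every locally sorted run yields the
          // begin of this thread's piece in each run.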
          for (int s = 0; s < sd->num_threads; ++s)
            {
              // For each sequence.
              if (num_samples * iam > 0)
                sd->pieces[iam][s].begin =
                  std::lower_bound(sd->sorting_places[s],
                                   sd->sorting_places[s]
                                   + (sd->starts[s + 1] - sd->starts[s]),
                                   sd->samples[num_samples * iam],
                                   comp)
                  - sd->sorting_places[s];
              else
                // Absolute beginning.
                sd->pieces[iam][s].begin = 0;

              if ((num_samples * (iam + 1))
                  < (num_samples * sd->num_threads))
                sd->pieces[iam][s].end =
                  std::lower_bound(sd->sorting_places[s],
                                   sd->sorting_places[s]
                                   + (sd->starts[s + 1] - sd->starts[s]),
                                   sd->samples[num_samples * (iam + 1)],
                                   comp)
                  - sd->sorting_places[s];
              else
                // Absolute end.
                sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s];
            }
        }
      else if (__s.sort_splitting == EXACT)
        {
#         pragma omp barrier

          std::vector<std::pair<SortingPlacesIterator,
                                SortingPlacesIterator> >
            seqs(sd->num_threads);
          for (int s = 0; s < sd->num_threads; ++s)
            seqs[s] = std::make_pair(sd->sorting_places[s],
                                     sd->sorting_places[s]
                                     + (sd->starts[s + 1]
                                        - sd->starts[s]));

          std::vector<SortingPlacesIterator> offsets(sd->num_threads);

          // If not the last thread.
          if (iam < sd->num_threads - 1)
            multiseq_partition(seqs.begin(), seqs.end(),
                               sd->starts[iam + 1], offsets.begin(), comp);
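
          // multiseq_partition finds, for the global rank
          // sd->starts[iam + 1], a splitting position in every sorted
          // run such that the elements below the splits add up to
          // exactly that rank; this gives exact rather than sampled
          // splitters.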
          for (int seq = 0; seq < sd->num_threads; ++seq)
            {
              // For each sequence.
              if (iam < (sd->num_threads - 1))
                sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first;
              else
                // Very end of this sequence.
                sd->pieces[iam][seq].end = (sd->starts[seq + 1]
                                            - sd->starts[seq]);
            }

#         pragma omp barrier

          for (int seq = 0; seq < sd->num_threads; ++seq)
            {
              // For each sequence.
              if (iam > 0)
                sd->pieces[iam][seq].begin = sd->pieces[iam - 1][seq].end;
              else
                // Absolute beginning.
                sd->pieces[iam][seq].begin = 0;
            }
        }

      // Offset from target begin, length after merging.
      difference_type offset = 0, length_am = 0;
      for (int s = 0; s < sd->num_threads; ++s)
        {
          length_am += sd->pieces[iam][s].end - sd->pieces[iam][s].begin;
          offset += sd->pieces[iam][s].begin;
        }
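
      // Summing the piece begins over all runs gives the number of
      // elements that precede this thread's output globally, i.e. its
      // write offset in the target sequence; length_am is the total
      // number of elements this thread will merge.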
#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
      // Merge into temporary storage.  Uninitialized creation is not
      // possible here, since there is no multiway_merge variant that
      // uses placement new instead of the assignment operator.
      // XXX incorrect (de)construction
      sd->merging_places[iam] = sd->temporaries[iam] =
        static_cast<value_type*>(::operator new(sizeof(value_type)
                                                * length_am));
#else
      // Merge directly to target.
      sd->merging_places[iam] = sd->source + offset;
#endif

      std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
        seqs(sd->num_threads);
      for (int s = 0; s < sd->num_threads; ++s)
        {
          seqs[s] =
            std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin,
                           sd->sorting_places[s] + sd->pieces[iam][s].end);
        }

      multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam],
                     comp, length_am, sd->stable, false, sequential_tag());
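
      // sequential_tag() forces a sequential merge within each thread;
      // the parallelism comes from all threads merging their disjoint
      // pieces concurrently.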

#     pragma omp barrier

#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
      // Write back.
      std::copy(sd->merging_places[iam],
                sd->merging_places[iam] + length_am,
                sd->source + offset);
#endif

      ::operator delete(sd->temporaries[iam]);
    }
  /** @brief PMWMS main call.
   *  @param begin Begin iterator of sequence.
   *  @param end End iterator of sequence.
   *  @param comp Comparator.
   *  @param n Length of sequence.
   *  @param num_threads Number of threads to use.
   *  @param stable Stable sorting.
   */
  template<typename RandomAccessIterator, typename Comparator>
    void
    parallel_sort_mwms(RandomAccessIterator begin, RandomAccessIterator end,
                       Comparator comp,
                       typename std::iterator_traits<RandomAccessIterator>
                         ::difference_type n,
                       int num_threads, bool stable)
    {
      _GLIBCXX_CALL(n)

      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::value_type value_type;
      typedef typename traits_type::difference_type difference_type;

      if (n <= 1)
        return;

      // At least one element per thread.
      if (num_threads > n)
        num_threads = static_cast<thread_index_t>(n);

      // Shared variables.
      PMWMSSortingData<RandomAccessIterator> sd;
      difference_type* starts;
      const _Settings& __s = _Settings::get();

#     pragma omp parallel num_threads(num_threads)
      {
        // No more threads than requested.
        num_threads = omp_get_num_threads();

#       pragma omp single
        {
          sd.num_threads = num_threads;
          sd.source = begin;
          sd.temporaries = new value_type*[num_threads];

#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
          sd.sorting_places = new RandomAccessIterator[num_threads];
          sd.merging_places = new value_type*[num_threads];
#else
          sd.sorting_places = new value_type*[num_threads];
          sd.merging_places = new RandomAccessIterator[num_threads];
#endif

          if (__s.sort_splitting == SAMPLING)
            {
              unsigned int size =
                (__s.sort_mwms_oversampling * num_threads - 1)
                * num_threads;
              sd.samples = static_cast<value_type*>(
                ::operator new(size * sizeof(value_type)));
            }
          else
            sd.samples = NULL;

          sd.offsets = new difference_type[num_threads - 1];
          sd.pieces =
            new std::vector<Piece<difference_type> >[num_threads];
          for (int s = 0; s < num_threads; ++s)
            sd.pieces[s].resize(num_threads);
          starts = sd.starts = new difference_type[num_threads + 1];
          sd.stable = stable;

          difference_type chunk_length = n / num_threads;
          difference_type split = n % num_threads;
          difference_type pos = 0;
          for (int i = 0; i < num_threads; ++i)
            {
              starts[i] = pos;
              pos += (i < split) ? (chunk_length + 1) : chunk_length;
            }
          starts[num_threads] = pos;
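
          // Example: n = 10, num_threads = 3 gives chunk_length = 3 and
          // split = 1, so starts = {0, 4, 7, 10}; the first thread gets
          // one extra element.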
        }

        // Now sort in parallel.
        parallel_sort_mwms_pu(&sd, comp);
      } // parallel

      delete[] starts;
      delete[] sd.temporaries;
      delete[] sd.sorting_places;
      delete[] sd.merging_places;
      if (__s.sort_splitting == SAMPLING)
        ::operator delete(sd.samples);
      delete[] sd.offsets;
      delete[] sd.pieces;
    }
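
  // A minimal usage sketch (hypothetical driver code; in parallel mode
  // this function is normally reached through the library's sort
  // dispatch rather than called directly):
  //
  //   std::vector<int> v(1000000);
  //   // ... fill v ...
  //   __gnu_parallel::parallel_sort_mwms(v.begin(), v.end(),
  //                                      std::less<int>(),
  //                                      v.size(), 4, false);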
} // namespace __gnu_parallel

#endif