OPAL (Object Oriented Parallel Accelerator Library) 2022.1
BinaryBalancer.hpp
1// -*- C++ -*-
2/***************************************************************************
3 *
4 * The IPPL Framework
5 *
6 * This program was prepared by PSI.
7 * All rights in the program are reserved by PSI.
8 * Neither PSI nor the author(s)
9 * makes any warranty, express or implied, or assumes any liability or
10 * responsibility for the use of this software
11 *
12 * Visit www.amas.web.psi for more details
13 *
14 ***************************************************************************/
15
16// -*- C++ -*-
17/***************************************************************************
18 *
19 * The IPPL Framework
20 *
21 *
22 * Visit http://people.web.psi.ch/adelmann/ for more details
23 *
24 ***************************************************************************/
25
26// include files
27#include "FieldLayout/BinaryBalancer.h"
28#include "FieldLayout/FieldLayout.h"
29#include "Field/BareField.h"
30#include "Utility/PAssert.h"
31
32
34
35/*
36
37 Implementation of BinaryBalancer.
38
39 The general strategy is that you do log(P) splits on the domain. It
40 starts with the whole domain, does a reduction to find where to
41 split it, then does reductions on each of the resulting domains to
42 find where to split those, then reductions on those to split them,
43 and so on until it is done.
44
 45 Suppose you're on the n'th split, so there are 2**n domains figured
 46 out so far, and after the split there will be 2**(n+1) domains. In
47 each of those 2**n domains you need to find
48
49 a) The axis to split on. This is done by just finding the longest
50 axis in that domain.
51
52 b) The location within that domain to make the split. This is done
53 by reducing the weights on all dimensions except the axis to be
54 split and finding the location within that array that puts half
55 the weight on each side.
56
 57 The reduction for b) is done in a scalable way. It is a parallel
58 reduction, and if there are 2**n domains being split, the reductions
59 are accumulated onto processors 0..2**n-1. Those processors
60 calculate the split locations and broadcast them to all the
61 processors.
62
63 At every stage of the process all the processors know all the
64 domains. This is necessary because the weight array could be
65 distributed arbitrarily, so the reductions could involve any
66 processors.
67
68 Nevertheless, the reductions are performed efficiently. Using
69 DomainMaps, only the processors that need to participate in a
70 reduction do participate.
71
72*/
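/*
  A minimal serial sketch of the strategy described above (names here are
  illustrative only and not part of IPPL): recursively halve a 1-D weight
  profile, giving each half a share of the processors, until every piece
  has exactly one processor.  The zero-weight special case and the parallel
  reduction machinery of the real code are left out.
*/
#include <utility>
#include <vector>

// Split the index range [lo,hi) among nprocs processors by weight;
// append one (lo,hi) pair per resulting piece.
inline void bisectSketch(const std::vector<double>& w, int lo, int hi,
                         int nprocs, std::vector< std::pair<int,int> >& pieces)
{
    if (nprocs <= 1) { pieces.push_back(std::make_pair(lo, hi)); return; }
    double total = 0;
    for (int i = lo; i < hi; ++i) total += w[i];
    int lprocs = nprocs / 2;
    // Advance the cut until roughly lprocs/nprocs of the weight lies to its left.
    double target = total * lprocs / nprocs, half = 0;
    int cut = lo;
    while (cut < hi - 1 && half + w[cut] <= target) half += w[cut++];
    bisectSketch(w, lo, cut, lprocs, pieces);           // left piece
    bisectSketch(w, cut, hi, nprocs - lprocs, pieces);  // right piece
}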
73
75
76//
77// Given an NDIndex<Dim>, find the axis with the longest length that is
78// not SERIAL.
79//
80template <unsigned Dim>
81static int
82FindCutAxis(const NDIndex<Dim> &domain, const FieldLayout<Dim> &layout)
83{
84
85
86
87 // CutAxis will be the dimension to cut.
88 int cutAxis=-1;
89 // MaxLength will have the maximum length of any dimension.
90 unsigned int maxLength=0;
91 // Loop over dimension.
92 for (unsigned int d=0; d<Dim; ++d) {
93 if (layout.getDistribution(d) != SERIAL ||
94 layout.getRequestedDistribution(d) != SERIAL) {
95 // Check if this axis is longer than the current max.
96 unsigned int length = domain[d].length();
97 if ( maxLength < length ) {
98 // If so, remember it.
99 cutAxis = d;
100 maxLength = length;
101 }
102 }
103 }
104
105 // Make sure we found one.
106 //PAssert_GE(cutAxis, 0);
107
 108 if(cutAxis<0)
 109 throw BinaryRepartitionFailed();
 110
111 // Return the longest axis.
112 return cutAxis;
113}
114
115
116//
117// Find the median point in a container.
118// The first argument is the number of processors to split across.
119// The next two are begin and end iterators, and the last is a dummy
120// argument of type T, where the container holds objects of type T.
121//
122
123template<class RandomIterator, class T>
124static RandomIterator
125FindMedian(int nprocs,RandomIterator begin, RandomIterator end, T)
126{
127 // First find the total weight.
128 T w = 0;
 129 // w accumulates the total weight of the range [begin,end).
130
131
132 // If we have only one processor, cut at the left.
133 if ( nprocs == 1 )
134 return begin;
135
136 int lprocs = nprocs/2;
137 RandomIterator rp, median;
138 for (rp=begin; rp!=end; ++rp)
139 w += *rp;
140
141 // If the total weight is zero, we need to do things specially.
142 if ( w==0 )
143 {
144 // The total weight is zero.
 145 // Split the zero-weight elements in proportion to the processor counts.
146 median = begin + ((end-begin)*lprocs)/nprocs;
147 }
148 else
149 {
150 // The total weight is nonzero.
 151 // Give the left and right sides weight in proportion to their processor counts.
152 T w2 = (w*lprocs)/nprocs;
 153 // Find the point with weight w2 to the left of it.
154 bool found = false;
155 w = T(0);
156 for (rp=begin; (rp!=end)&&(!found); ++rp) {
157 // Add current element to running total
158 w += *rp;
159 if (w>=w2) {
160 found = true;
161 if ( (w-w2) > (*rp/T(2)) )
162 median = rp;
163 else
164 median = (rp+1 != end) ? rp+1 : rp;
165 }
166 }
167 }
168 // Found it. Exit.
169 return median;
170}
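/*
  Illustrative use of FindMedian (the weights and the wrapper function are
  hypothetical, introduced only for this example): splitting the reduced
  row weights {2,2,2,2} between nprocs = 2 groups of processors puts the
  cut at index 2, i.e. weight 4 on each side.
*/
inline void findMedianExample()
{
    std::vector<double> rowWeights(4, 2.0);
    std::vector<double>::iterator cut =
        FindMedian(2, rowWeights.begin(), rowWeights.end(), double());
    // cut - rowWeights.begin() == 2: elements 0..1 form the left half,
    // elements 2..3 the right half.
    (void)cut;
}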
171
173
174//
175// Routines for doing reductions over all dimensions but one of a
176// local BrickIterator.
177//
178// These will only work for 1, 2 and 3 dimensions right now.
179// I'm sure there is a way to make this work efficiently
180// for arbitrary dimension, but this works for now.
181//
182
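/*
  One possible dimension-generic version of PerpReduce, sketched on a plain
  strided buffer rather than on BrickIterator (whose offset() interface is
  overloaded separately for 1, 2 and 3 dimensions).  This helper is an
  illustration only and is not part of IPPL.
*/
template<unsigned Dim>
inline double
PerpReduceGeneric(const double* data, const int* sizes, const long* strides,
                  int i, int cutAxis)
{
    // If any non-cut axis is empty there is nothing to reduce.
    for (int d = 0; d < (int)Dim; ++d)
        if (d != cutAxis && sizes[d] <= 0)
            return 0.0;

    // Odometer-style walk over every index combination of the non-cut axes,
    // holding the cut axis fixed at position i.
    int idx[Dim];
    for (unsigned d = 0; d < Dim; ++d) idx[d] = 0;
    idx[cutAxis] = i;

    double r = 0;
    while (true) {
        long off = 0;
        for (unsigned d = 0; d < Dim; ++d) off += idx[d] * strides[d];
        r += data[off];

        // Advance the next non-cut axis; carry into the following one on wrap.
        int d = 0;
        for (; d < (int)Dim; ++d) {
            if (d == cutAxis) continue;
            if (++idx[d] < sizes[d]) break;
            idx[d] = 0;
        }
        if (d == (int)Dim) break;   // every non-cut axis has wrapped: done
    }
    return r;
}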
183// Reduce over a 3 dimensional BrickIterator
184// given the axis not to reduce on
185// and where in that dimension to reduce.
186
187static inline double
188PerpReduce(BrickIterator<double,3>& data, int i, int cutAxis)
189{
190 double r=0;
191 if (cutAxis==0)
192 {
193 int l1 = data.size(1);
194 int l2 = data.size(2);
195 if ( (l1>0) && (l2>0) )
196 for (int j2=0; j2<l2; ++j2)
197 for (int j1=0; j1<l1; ++j1)
198 r += data.offset(i,j1,j2);
199 }
200 else if (cutAxis==1)
201 {
202 int l0 = data.size(0);
203 int l2 = data.size(2);
204 if ( (l0>0) && (l2>0) )
205 for (int j2=0; j2<l2; ++j2)
206 for (int j0=0; j0<l0; ++j0)
207 r += data.offset(j0,i,j2);
208 }
209 else if (cutAxis==2)
210 {
211 int l0 = data.size(0);
212 int l1 = data.size(1);
213 if ( (l0>0) && (l1>0) )
214 for (int j1=0; j1<l1; ++j1)
215 for (int j0=0; j0<l0; ++j0)
216 r += data.offset(j0,j1,i);
217 }
218 return r;
219}
220
221// Reduce over a 2 dimensional BrickIterator
222// given the axis not to reduce on
223// and where in that dimension to reduce.
224
225static inline double
226PerpReduce(BrickIterator<double,2>& data, int i, int cutAxis)
227{
228 double r=0;
229 if (cutAxis==0)
230 {
231 int length = data.size(1);
232 for (int j=0; j<length; ++j)
233 r += data.offset(i,j);
234 }
235 else
236 {
237 int length = data.size(0);
238 for (int j=0; j<length; ++j)
239 r += data.offset(j,i);
240 }
241 return r;
242}
243
244//
245// Reduce over a 1 dimensional BrickIterator
246// given the axis not to reduce on
247// and where in that dimension to reduce.
248//
249
250static inline double
251PerpReduce(BrickIterator<double,1>& data, int i, int )
252{
253 return data.offset(i);
254}
255
256//
257// Reduce over all the dimensions but one of a brick iterator.
258// Put the results in an array of doubles.
259//
260
261template<unsigned Dim>
262static void
263LocalReduce(double *reduced, int cutAxis, BrickIterator<double,Dim> data)
264{
265
266
267
268 int length = data.size(cutAxis);
269 for (int i=0; i<length; ++i)
270 reduced[i] = PerpReduce(data,i,cutAxis);
271}
272
274
275//
276// For each domain, do the local reduction of
277// data from weights, and send that to the node
278// that is accumulating stuff for that domain.
279//
280// The local reductions take place all across the machine.
281// The reductions for each domain are finished on a single processor.
282// Each of those final reductions are on different processors.
283//
284
285template<class IndexIterator, unsigned Dim>
286static void
287SendReduce(IndexIterator domainsBegin, IndexIterator domainsEnd,
288 BareField<double,Dim>& weights, int tag)
289{
290
291 // Buffers to store up domains and blocks of reduced data.
292 std::vector<double*> reducedBuffer;
293 std::vector<Index> domainBuffer;
294 // Loop over all of the domains. Keep a counter of which one you're on.
295 int di;
296 IndexIterator dp;
297 /*out << "SendReduce, ndomains=" << domainsEnd-domainsBegin << endl;*/
298 for (dp=domainsBegin, di=0; dp!=domainsEnd; ++dp, ++di)
299 {
300 /*out << "SendReduce, domain=" << *dp << endl;*/
301 // Find the dimension we'll be cutting on.
302 // We'll reduce in the dimensions perpendicular to this.
303 int cutAxis = FindCutAxis(*dp, weights.getLayout());
 304 // Find the LFields on this processor that touch this domain.
 305 typename BareField<double,Dim>::iterator_if lf_p;
 306 for (lf_p=weights.begin_if(); lf_p != weights.end_if(); ++lf_p)
307 if ( (*dp).touches( (*lf_p).second->getOwned() ) )
308 {
309 // Find the intersection with this LField.
310 NDIndex<Dim> intersection =
311 (*dp).intersect( (*lf_p).second->getOwned() );
312 // Allocate the accumulation buffer.
313 int length = intersection[cutAxis].length();
314 double *reduced = new double[length];
315 // Reduce into the local buffer.
316 /*out << "LocalReduce " << intersection << endl;*/
317 LocalReduce(reduced,cutAxis,(*lf_p).second->begin(intersection));
318 // Save the domain and the data.
319 reducedBuffer.push_back(reduced);
320 domainBuffer.push_back(intersection[cutAxis]);
321 }
322
323 // If we found any hits, send them out.
324 int nrdomains = reducedBuffer.size();
325 /*out << "nrdomains=" << nrdomains << endl;*/
326 if ( nrdomains>0 )
327 {
328 // Build a message to hold everything for this domain.
329 Message *mess = new Message;
330 // The number of reduced domains is the first thing in the message.
331 mess->put(nrdomains);
332 // Loop over the reduced domains, storing in the message each time.
333 std::vector<Index>::iterator dbp = domainBuffer.begin();
334 std::vector<double*>::iterator rbp = reducedBuffer.begin();
335 for (int i=0; i<nrdomains; ++i, ++dbp, ++rbp)
336 {
337 // First store the domain.
338 /*out << "putMessage " << *dbp << endl;*/
339 putMessage(*mess,*dbp);
340 // Then the reduced data using begin/end iterators.
341 // Tell the message to delete the memory when it is done.
342 double *p = *rbp;
343 mess->setCopy(false).setDelete(true).put(p,p+(*dbp).length());
344 }
345 // Send the message to proc di.
346 Ippl::Comm->send(mess, di, tag);
347 }
348 // Clear out the buffers.
349 domainBuffer.erase( domainBuffer.begin(), domainBuffer.end() );
350 reducedBuffer.erase( reducedBuffer.begin(), reducedBuffer.end() );
351 /*out << "Bottom of SendReduce loop" << endl;*/
352 }
353}
354
356
357//
358// Receive the messages with reduced data sent out in SendReduce.
359// Finish the reduction.
360// Return begin and end iterators for the reduced data.
361//
362
363template<unsigned Dim>
364static void
365ReceiveReduce(NDIndex<Dim>& domain, BareField<double,Dim>& weights,
366 int reduce_tag, int nprocs,
367 int& cutLoc, int& cutAxis)
368{
369
370
371 // Build a place to accumulate the reduced data.
372 cutAxis = FindCutAxis(domain, weights.getLayout());
373 /*out << "ReceiveReduce, cutAxis=" << cutAxis << endl;*/
374 int i, length = domain[cutAxis].length();
375 int offset = domain[cutAxis].first();
376 std::vector<double> reduced(length);
377 std::vector<double> subReduced(length);
378 for (i=0; i<length; ++i)
379 reduced[i] = 0;
380
381 // Build a count of the number of messages to expect.
382 // We get *one message* from each node that has a touch.
383 int expected = 0;
384 int nodes = Ippl::getNodes();
385 int mynode = Ippl::myNode();
386 bool* found_touch = new bool[nodes];
387 for (i=0; i<nodes; ++i) found_touch[i] = false;
388 // First look in the local vnodes of weights.
389 typename BareField<double,Dim>::iterator_if lf_p, lf_end = weights.end_if();
390 for (lf_p = weights.begin_if();
391 lf_p != lf_end && !(found_touch[mynode]); ++lf_p) {
392 // Expect a message if it touches.
393 if ( (*lf_p).second->getOwned().touches(domain) )
394 found_touch[mynode] = true;
395 }
 396 // Now look in the remote parts of weights.
 397 typename FieldLayout<Dim>::touch_iterator_dv rf_p;
 398 // Get the range of remote vnodes that touch domain.
399 typename FieldLayout<Dim>::touch_range_dv range =
400 weights.getLayout().touch_range_rdv( domain );
401 // Record the processors who have touches
402 for (rf_p = range.first; rf_p != range.second ; ++rf_p) {
403 int owner = (*((*rf_p).second)).getNode();
404 found_touch[owner] = true;
405 }
406 // now just count up the number of messages to receive
407 for (i=0; i<nodes; ++i)
408 if (found_touch[i]) expected++;
409 delete [] found_touch;
410
411 // Receive messages until we're done.
412 while ( --expected >= 0 )
413 {
414 // Receive a message.
415 int any_node = COMM_ANY_NODE;
416 Message *mess = Ippl::Comm->receive_block(any_node,reduce_tag);
417 PAssert(mess);
418 // Loop over all the domains in this message.
419 int received_domains = 0;
420 mess->get(received_domains);
421 while ( --received_domains>=0 )
422 {
423 // Get the domain for the next part.
424 Index rdomain;
425 getMessage( *mess, rdomain );
426 /*out << "ReceiveReduce, rdomain=" << rdomain << endl;*/
427 // Get the incoming reduced data.
428 int rfirst = rdomain.first() - offset;
429 mess->get(subReduced[rfirst]);
430 // Accumulate it with the rest.
431 int rlast = rdomain.last() - offset;
432 for (int i=rfirst; i<=rlast; ++i)
433 reduced[i] += subReduced[i];
434 }
435 // Delete the message, we're done with it
436 delete mess;
437 }
438
439 // Get the median.
440 cutLoc =
441 FindMedian(nprocs,reduced.begin(),reduced.begin()+length,double())
442 -reduced.begin() + domain[cutAxis].first();
443 /*out << "ReceiveReduce, cutLoc=" << cutLoc << endl;*/
444}
445
447
448//
449// Given the location and axis of the cut,
450// Broadcast to everybody.
451//
452
453inline void
454BcastCuts(int cutLoc, int cutAxis, int bcast_tag)
455{
456
457
458 // Make a message.
459 Message *mess = new Message();
460 // Add the data to it.
461 mess->put(cutLoc);
462 mess->put(cutAxis);
463 // Send it out.
464 Ippl::Comm->broadcast_all(mess,bcast_tag);
465}
466
468
469//
470// Receive the broadcast cuts.
471// Cut up each of the domains using the cuts.
472//
473
474template<unsigned Dim>
475static void
476ReceiveCuts(std::vector< NDIndex<Dim> > &domains,
477 std::vector< int >& nprocs,
478 int bcast_tag)
479{
480
481
482
483 // Make a container to hold the split domains.
484 int nDomains = domains.size();
485 std::vector< NDIndex<Dim> > cutDomains(nDomains*2);
486 std::vector<int> cutProcs(std::vector<int>::size_type(nDomains*2));
487
488 // Everybody receives the broadcasts.
489 // There will be one for each domain in the list.
490 for (int expected = 0; expected < nDomains; ++expected)
491 {
492 // Receive each broadcast.
493 // The processor number will correspond to the location
494 // in the domains vector.
495 int whichDomain = COMM_ANY_NODE;
496 int cutLocation = 0, cutAxis = 0;
497 Message *mess = Ippl::Comm->receive_block(whichDomain,bcast_tag);
498 PAssert(mess);
499 mess->get(cutLocation);
500 mess->get(cutAxis);
501 delete mess;
502
503 // Split this domain.
504 const NDIndex<Dim>& domain = domains[whichDomain];
505 NDIndex<Dim>& left = cutDomains[ whichDomain*2 ];
506 NDIndex<Dim>& right = cutDomains[ whichDomain*2+1 ];
507 // Build the left and right domains.
508 left = domain ;
509 right = domain ;
510 /*out << "Build indexes from : "
511 << domain[cutAxis].first() << " "
512 << cutLocation<< " "
513 << domain[cutAxis].last()<< " "
514 << endl;*/
515 left[ cutAxis ] = Index( domain[cutAxis].first(), cutLocation-1 );
516 right[ cutAxis ] = Index( cutLocation, domain[cutAxis].last() );
517
518 int procs = nprocs[whichDomain];
519 cutProcs[ whichDomain*2 ] = procs/2;
520 cutProcs[ whichDomain*2+1 ] = procs - procs/2;
521 }
522
523 // Put the domains you've just built into the input containers.
524 // Strip out the domains with no processors assigned.
525 domains.clear();
526 nprocs.clear();
527 PAssert_EQ(cutProcs.size(), cutDomains.size());
528 for (unsigned int i=0; i<cutProcs.size(); ++i)
529 {
530 if ( cutProcs[i] != 0 )
531 {
532 domains.push_back(cutDomains[i]);
533 nprocs.push_back(cutProcs[i]);
534 }
535 else
536 {
537 PAssert_EQ(cutDomains[i].size(), 0);
538 }
539 }
540}
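/*
  Worked illustration of one split in ReceiveCuts, using plain integers
  instead of IPPL Index objects (the numbers are made up): a domain covering
  0..7 along the cut axis, with cutLocation = 4 and 3 processors assigned,
  is divided as follows.
*/
inline void splitExample()
{
    int first = 0, last = 7, cutLocation = 4, procs = 3;
    int leftFirst  = first,       leftLast  = cutLocation - 1;  // left  covers 0..3
    int rightFirst = cutLocation, rightLast = last;             // right covers 4..7
    int leftProcs  = procs / 2;          // 1 processor continues with the left piece
    int rightProcs = procs - procs / 2;  // 2 processors continue with the right piece
    (void)leftFirst; (void)leftLast; (void)rightFirst; (void)rightLast;
    (void)leftProcs; (void)rightProcs;
}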
541
543
544//
545// Sweep through a list of domains, splitting each one
546// according to the weights in a BareField.
547//
548
549template<unsigned Dim>
550static void
551CutEach(std::vector< NDIndex<Dim> >& domains,
552 std::vector< int >& nprocs,
553 BareField<double,Dim>& weights)
554{
555
556 // Get tags for the reduction and the broadcast.
557 int reduce_tag = Ippl::Comm->next_tag( F_REDUCE_PERP_TAG , F_TAG_CYCLE );
558 int bcast_tag = Ippl::Comm->next_tag( F_REDUCE_PERP_TAG , F_TAG_CYCLE );
559 /*out << "reduce_tag=" << reduce_tag << endl;*/
560 /*out << "bcast_tag=" << bcast_tag << endl;*/
561
562 // Do the sends for the reduces.
563 SendReduce(domains.begin(),domains.end(),weights,reduce_tag);
564
565 // On the appropriate processors, receive the data for the reduce,
566 // and broadcast the cuts.
567 unsigned int mynode = Ippl::Comm->myNode();
568 if ( mynode < domains.size() )
569 {
570 // Receive partially reduced data, finish the reduction, find the median.
571 int cutAxis, cutLoc;
572 ReceiveReduce(domains[mynode],weights,reduce_tag,
573 nprocs[mynode],cutLoc,cutAxis);
574 // Broadcast those cuts out to everybody.
575 BcastCuts(cutLoc,cutAxis,bcast_tag);
576 }
577
578 // Receive the broadcast cuts and slice up the domains.
579 ReceiveCuts(domains,nprocs,bcast_tag);
580}
581
583
584template<unsigned Dim>
585NDIndex<Dim>
586CalcBinaryRepartition(FieldLayout<Dim>& layout, BareField<double,Dim>& weights)
587{
588// Build a list of domains as we go.
 589 std::vector< NDIndex<Dim> > domains;
590 std::vector<int> procs;
591
592 /*out << "Starting CalcBinaryRepartition, outstanding msgs="
593 << Ippl::Comm->getReceived()
594 << endl;*/
595
596 // Get the processors we'll be dealing with.
597 int nprocs = Ippl::Comm->getNodes();
598 int myproc = Ippl::Comm->myNode();
599 domains.reserve(nprocs);
600 procs.reserve(nprocs);
601 // Start the list with just the top level domain.
602 domains.push_back( layout.getDomain() );
603 procs.push_back( nprocs );
604
605 // mprocs is the max number of procs assigned to a domain.
606 int mprocs=nprocs;
607
608 // Loop as long as some domain has more than one proc assigned to it.
609 while ( mprocs>1 )
610 {
611 // Cut all the domains in half.
612 CutEach(domains,procs,weights);
613
614 // Find the max number of procs assigned to a domain.
615 mprocs = 0;
616 for (unsigned int i=0; i<procs.size(); ++i)
617 if (mprocs<procs[i]) mprocs = procs[i];
618 }
619 // Return the domain on this processor.
620
621
 622 // Workaround: reject repartitions that produce a degenerate domain (one cell along some axis).
623 typename std::vector< NDIndex<Dim> >::iterator i;
624
625 bool degenerated = false;
626
627 for(i = domains.begin();i!=domains.end();++i)
628 {
629 for(unsigned int d = 0;d<Dim;++d)
630 if((*i)[d].first() == (*i)[d].last())
631 {
632 degenerated = true;
633 break;
634 }
635 if(degenerated)
636 break;
637 }
638
639 if(!degenerated)
640 return domains.begin()[myproc];
641 else
 642 {
 643 throw BinaryRepartitionFailed();
 644 }
645
646}
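/*
  Hypothetical call pattern (a sketch only; rebalanceSketch is not part of
  IPPL, and the layout and weight field are assumed to exist already):
  compute the domain this node should own after rebalancing, and fall back
  to the current decomposition if the repartition is rejected.
*/
template<unsigned Dim>
inline void
rebalanceSketch(FieldLayout<Dim>& layout, BareField<double,Dim>& weights)
{
    try {
        NDIndex<Dim> myNewDomain = CalcBinaryRepartition(layout, weights);
        // ... hand myNewDomain (together with the other nodes' domains)
        // back to the layout's repartitioning machinery ...
        (void)myNewDomain;
    }
    catch (...) {
        // CalcBinaryRepartition signals a degenerate (single-cell) cut by
        // throwing; keep the existing decomposition in that case.
    }
}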
647
649
650
651/***************************************************************************
652 * $RCSfile: BinaryBalancer.cpp,v $ $Author: adelmann $
653 * $Revision: 1.1.1.1 $ $Date: 2003/01/23 07:40:27 $
654 * IPPL_VERSION_ID: $Id: BinaryBalancer.cpp,v 1.1.1.1 2003/01/23 07:40:27 adelmann Exp $
655 ***************************************************************************/