The document you referred to is the previous work of my group. Since you asked about copyright, the code is available in GPL2 from the web page of our group: http://padas.ices.utexas.edu/software
The new code (pvfmm I'm working on) is also available for download at the link above and is available under GPL3. It has a simpler and very efficient algorithm for balancing. The algorithm is described in this article: http://padas.ices.utexas.edu/static/papers/pvfmm.pdf
I applied the code below to remove the parallelism distributed memory. The balanceOctree function implements the algorithm.
#include <vector> #include <set> #include <cstdlib> #include <cmath> #include <stdint.h> #include <omp.h> #include <algorithm> #include <cstring> #ifndef MAX_DEPTH #define MAX_DEPTH 30 #endif #if MAX_DEPTH < 7 #define UINT_T uint8_t #define INT_T int8_t #elif MAX_DEPTH < 15 #define UINT_T uint16_t #define INT_T int16_t #elif MAX_DEPTH < 31 #define UINT_T uint32_t #define INT_T int32_t #elif MAX_DEPTH < 63 #define UINT_T uint64_t #define INT_T int64_t #endif namespace omp_par{ template <class T,class StrictWeakOrdering> void merge(T A_,T A_last,T B_,T B_last,T C_,int p,StrictWeakOrdering comp){ typedef typename std::iterator_traits<T>::difference_type _DiffType; typedef typename std::iterator_traits<T>::value_type _ValType; _DiffType N1=A_last-A_; _DiffType N2=B_last-B_; if(N1==0 && N2==0) return; if(N1==0 || N2==0){ _ValType* A=(N1==0? &B_[0]: &A_[0]); _DiffType N=(N1==0? N2 : N1 ); #pragma omp parallel for for(int i=0;i<p;i++){ _DiffType indx1=( i *N)/p; _DiffType indx2=((i+1)*N)/p; memcpy(&C_[indx1], &A[indx1], (indx2-indx1)*sizeof(_ValType)); } return; } //Split both arrays ( A and B ) into n equal parts. //Find the position of each split in the final merged array. int n=10; _ValType* split=new _ValType[p*n*2]; _DiffType* split_size=new _DiffType[p*n*2]; #pragma omp parallel for for(int i=0;i<p;i++){ for(int j=0;j<n;j++){ int indx=i*n+j; _DiffType indx1=(indx*N1)/(p*n); split [indx]=A_[indx1]; split_size[indx]=indx1+(std::lower_bound(B_,B_last,split[indx],comp)-B_); indx1=(indx*N2)/(p*n); indx+=p*n; split [indx]=B_[indx1]; split_size[indx]=indx1+(std::lower_bound(A_,A_last,split[indx],comp)-A_); } } //Find the closest split position for each thread that will //divide the final array equally between the threads. _DiffType* split_indx_A=new _DiffType[p+1]; _DiffType* split_indx_B=new _DiffType[p+1]; split_indx_A[0]=0; split_indx_B[0]=0; split_indx_A[p]=N1; split_indx_B[p]=N2; #pragma omp parallel for for(int i=1;i<p;i++){ _DiffType req_size=(i*(N1+N2))/p; int j=std::lower_bound(&split_size[0],&split_size[p*n],req_size,std::less<_DiffType>())-&split_size[0]; if(j>=p*n) j=p*n-1; _ValType split1 =split [j]; _DiffType split_size1=split_size[j]; j=(std::lower_bound(&split_size[p*n],&split_size[p*n*2],req_size,std::less<_DiffType>())-&split_size[p*n])+p*n; if(j>=2*p*n) j=2*p*n-1; if(abs(split_size[j]-req_size)<abs(split_size1-req_size)){ split1 =split [j]; split_size1=split_size[j]; } split_indx_A[i]=std::lower_bound(A_,A_last,split1,comp)-A_; split_indx_B[i]=std::lower_bound(B_,B_last,split1,comp)-B_; } delete[] split; delete[] split_size; //Merge for each thread independently. #pragma omp parallel for for(int i=0;i<p;i++){ TC=C_+split_indx_A[i]+split_indx_B[i]; std::merge(A_+split_indx_A[i],A_+split_indx_A[i+1],B_+split_indx_B[i],B_+split_indx_B[i+1],C,comp); } delete[] split_indx_A; delete[] split_indx_B; } template <class T,class StrictWeakOrdering> void merge_sort(TA,T A_last,StrictWeakOrdering comp){ typedef typename std::iterator_traits<T>::difference_type _DiffType; typedef typename std::iterator_traits<T>::value_type _ValType; int p=omp_get_max_threads(); _DiffType N=A_last-A; if(N<2*p){ std::sort(A,A_last,comp); return; } //Split the array A into p equal parts. _DiffType* split=new _DiffType[p+1]; split[p]=N; #pragma omp parallel for for(int id=0;id<p;id++){ split[id]=(id*N)/p; } //Sort each part independently. #pragma omp parallel for for(int id=0;id<p;id++){ std::sort(A+split[id],A+split[id+1],comp); } //Merge two parts at a time. _ValType* B=new _ValType[N]; _ValType* A_=&A[0]; _ValType* B_=&B[0]; for(int j=1;j<p;j=j*2){ for(int i=0;i<p;i=i+2*j){ if(i+j<p){ merge(A_+split[i],A_+split[i+j],A_+split[i+j],A_+split[(i+2*j<=p?i+2*j:p)],B_+split[i],p,comp); }else{ #pragma omp parallel for for(int k=split[i];k<split[p];k++) B_[k]=A_[k]; } } _ValType* tmp_swap=A_; A_=B_; B_=tmp_swap; } //The final result should be in A. if(A_!=&A[0]){ #pragma omp parallel for for(int i=0;i<N;i++) A[i]=A_[i]; } //Free memory. delete[] split; delete[] B; } template <class T> void merge_sort(TA,T A_last){ typedef typename std::iterator_traits<T>::value_type _ValType; merge_sort(A,A_last,std::less<_ValType>()); } template <class T, class I> T reduce(T* A, I cnt){ T sum=0; #pragma omp parallel for reduction(+:sum) for(I i = 0; i < cnt; i++) sum+=A[i]; return sum; } template <class T, class I> void scan(T* A, T* B,I cnt){ int p=omp_get_max_threads(); if(cnt<(I)100*p){ for(I i=1;i<cnt;i++) B[i]=B[i-1]+A[i-1]; return; } I step_size=cnt/p; #pragma omp parallel for for(int i=0; i<p; i++){ int start=i*step_size; int end=start+step_size; if(i==p-1) end=cnt; if(i!=0)B[start]=0; for(I j=(I)start+1; j<(I)end; j++) B[j]=B[j-1]+A[j-1]; } T* sum=new T[p]; sum[0]=0; for(int i=1;i<p;i++) sum[i]=sum[i-1]+B[i*step_size-1]+A[i*step_size-1]; #pragma omp parallel for for(int i=1; i<p; i++){ int start=i*step_size; int end=start+step_size; if(i==p-1) end=cnt; T sum_=sum[i]; for(I j=(I)start; j<(I)end; j++) B[j]+=sum_; } delete[] sum; } } class MortonId{ public: MortonId(); MortonId(MortonId m, unsigned char depth); template <class T> MortonId(T x_f,T y_f, T z_f, unsigned char depth=MAX_DEPTH); template <class T> MortonId(T* coord, unsigned char depth=MAX_DEPTH); unsigned int GetDepth() const; template <class T> void GetCoord(T* coord); MortonId NextId() const; MortonId getAncestor(unsigned char ancestor_level) const; /** * \brief Returns the deepest first descendant. */ MortonId getDFD(unsigned char level=MAX_DEPTH) const; void NbrList(std::vector<MortonId>& nbrs,unsigned char level, int periodic) const; std::vector<MortonId> Children() const; int operator<(const MortonId& m) const; int operator>(const MortonId& m) const; int operator==(const MortonId& m) const; int operator!=(const MortonId& m) const; int operator<=(const MortonId& m) const; int operator>=(const MortonId& m) const; int isAncestor(MortonId const & other) const; private: UINT_T x,y,z; unsigned char depth; }; inline MortonId::MortonId():x(0), y(0), z(0), depth(0){} inline MortonId::MortonId(MortonId m, unsigned char depth_):x(mx), y(my), z(mz), depth(depth_){} template <class T> inline MortonId::MortonId(T x_f,T y_f, T z_f, unsigned char depth_): depth(depth_){ UINT_T max_int=((UINT_T)1)<<(MAX_DEPTH); x=(UINT_T)floor(x_f*max_int); y=(UINT_T)floor(y_f*max_int); z=(UINT_T)floor(z_f*max_int); } template <class T> inline MortonId::MortonId(T* coord, unsigned char depth_): depth(depth_){ UINT_T max_int=((UINT_T)1)<<(MAX_DEPTH); x=(UINT_T)floor(coord[0]*max_int); y=(UINT_T)floor(coord[1]*max_int); z=(UINT_T)floor(coord[2]*max_int); } template <class T> inline void MortonId::GetCoord(T* coord){ UINT_T max_int=((UINT_T)1)<<(MAX_DEPTH); T s=1.0/((T)max_int); coord[0]=x*s; coord[1]=y*s; coord[2]=z*s; } inline unsigned int MortonId::GetDepth() const{ return depth; } inline MortonId MortonId::NextId() const{ MortonId m=*this; UINT_T mask=((UINT_T)1)<<(MAX_DEPTH-depth); int i; for(i=depth;i>=0;i--){ mx=(mx^mask); if((mx & mask)) break; my=(my^mask); if((my & mask)) break; mz=(mz^mask); if((mz & mask)) break; mask=(mask<<1); } m.depth=i; return m; } inline MortonId MortonId::getAncestor(unsigned char ancestor_level) const{ MortonId m=*this; m.depth=ancestor_level; UINT_T mask=(((UINT_T)1)<<(MAX_DEPTH))-(((UINT_T)1)<<(MAX_DEPTH-ancestor_level)); mx=(mx & mask); my=(my & mask); mz=(mz & mask); return m; } inline MortonId MortonId::getDFD(unsigned char level) const{ MortonId m=*this; m.depth=level; return m; } inline int MortonId::operator<(const MortonId& m) const{ if(x==mx && y==my && z==mz) return depth<m.depth; UINT_T x_=(x^mx); UINT_T y_=(y^my); UINT_T z_=(z^mz); if((z_>x_ || ((z_^x_)<x_ && (z_^x_)<z_)) && (z_>y_ || ((z_^y_)<y_ && (z_^y_)<z_))) return z<mz; if(y_>x_ || ((y_^x_)<x_ && (y_^x_)<y_)) return y<my; return x<mx; } inline int MortonId::operator>(const MortonId& m) const{ if(x==mx && y==my && z==mz) return depth>m.depth; UINT_T x_=(x^mx); UINT_T y_=(y^my); UINT_T z_=(z^mz); if((z_>x_ || ((z_^x_)<x_ && (z_^x_)<z_)) && (z_>y_ || ((z_^y_)<y_ && (z_^y_)<z_))) return z>mz; if((y_>x_ || ((y_^x_)<x_ && (y_^x_)<y_))) return y>my; return x>mx; } inline int MortonId::operator==(const MortonId& m) const{ return (x==mx && y==my && z==mz && depth==m.depth); } inline int MortonId::operator!=(const MortonId& m) const{ return !(*this==m); } inline int MortonId::operator<=(const MortonId& m) const{ return !(*this>m); } inline int MortonId::operator>=(const MortonId& m) const{ return !(*this<m); } inline int MortonId::isAncestor(MortonId const & other) const { return other.depth>depth && other.getAncestor(depth)==*this; } void MortonId::NbrList(std::vector<MortonId>& nbrs, unsigned char level, int periodic) const{ nbrs.clear(); static int dim=3; static int nbr_cnt=(int)(pow(3.0,dim)+0.5); static UINT_T maxCoord=(((UINT_T)1)<<(MAX_DEPTH)); UINT_T mask=maxCoord-(((UINT_T)1)<<(MAX_DEPTH-level)); UINT_T pX=x & mask; UINT_T pY=y & mask; UINT_T pZ=z & mask; MortonId mid_tmp; mask=(((UINT_T)1)<<(MAX_DEPTH-level)); for(int i=0; i<nbr_cnt; i++){ INT_T dX = ((i/1)%3-1)*mask; INT_T dY = ((i/3)%3-1)*mask; INT_T dZ = ((i/9)%3-1)*mask; INT_T newX=(INT_T)pX+dX; INT_T newY=(INT_T)pY+dY; INT_T newZ=(INT_T)pZ+dZ; if(!periodic){ if(newX>=0 && newX<(INT_T)maxCoord) if(newY>=0 && newY<(INT_T)maxCoord) if(newZ>=0 && newZ<(INT_T)maxCoord){ mid_tmp.x=newX; mid_tmp.y=newY; mid_tmp.z=newZ; mid_tmp.depth=level; nbrs.push_back(mid_tmp); } }else{ if(newX<0) newX+=maxCoord; if(newX>=(INT_T)maxCoord) newX-=maxCoord; if(newY<0) newY+=maxCoord; if(newY>=(INT_T)maxCoord) newY-=maxCoord; if(newZ<0) newZ+=maxCoord; if(newZ>=(INT_T)maxCoord) newZ-=maxCoord; mid_tmp.x=newX; mid_tmp.y=newY; mid_tmp.z=newZ; mid_tmp.depth=level; nbrs.push_back(mid_tmp); } } } std::vector<MortonId> MortonId::Children() const{ static int dim=3; static int c_cnt=(1UL<<dim); static UINT_T maxCoord=(((UINT_T)1)<<(MAX_DEPTH)); std::vector<MortonId> child(c_cnt); UINT_T mask=maxCoord-(((UINT_T)1)<<(MAX_DEPTH-depth)); UINT_T pX=x & mask; UINT_T pY=y & mask; UINT_T pZ=z & mask; mask=(((UINT_T)1)<<(MAX_DEPTH-(depth+1))); for(int i=0; i<c_cnt; i++){ child[i].x=pX+mask*((i/1)%2); child[i].y=pY+mask*((i/2)%2); child[i].z=pZ+mask*((i/4)%2); child[i].depth=depth+1; } return child; } //list must be sorted. void lineariseList(std::vector<MortonId> & list) { if(!list.empty()) {// Remove duplicates and ancestors. std::vector<MortonId> tmp; for(unsigned int i = 0; i < (list.size()-1); i++) { if( (!(list[i].isAncestor(list[i+1]))) && (list[i] != list[i+1]) ) { tmp.push_back(list[i]); } } tmp.push_back(list[list.size()-1]); list.swap(tmp); } } void balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &out, unsigned int maxDepth, bool periodic) { unsigned int dim=3; int omp_p=omp_get_max_threads(); if(in.size()==1){ out=in; return 0; } //Build level-by-level set of nodes. std::vector<std::set<MortonId> > nodes((maxDepth+1)*omp_p); #pragma omp parallel for for(int p=0;p<omp_p;p++){ size_t a=( p *in.size())/omp_p; size_t b=((p+1)*in.size())/omp_p; for(size_t i=a;i<b;){ size_t d=in[i].GetDepth(); if(d==0){i++; continue;} MortonId pnode=in[i].getAncestor(d-1); nodes[d-1+(maxDepth+1)*p].insert(pnode); while(i<b && d==in[i].GetDepth() && pnode==in[i].getAncestor(d-1)) i++; } //Add new nodes level-by-level. std::vector<MortonId> nbrs; unsigned int num_chld=1UL<<dim; for(unsigned int l=maxDepth;l>=1;l--){ //Build set of parents of balancing nodes. std::set<MortonId> nbrs_parent; std::set<MortonId>::iterator start=nodes[l+(maxDepth+1)*p].begin(); std::set<MortonId>::iterator end =nodes[l+(maxDepth+1)*p].end(); for(std::set<MortonId>::iterator node=start; node != end;){ node->NbrList(nbrs, l, periodic); int nbr_cnt=nbrs.size(); for(int i=0;i<nbr_cnt;i++) nbrs_parent.insert(nbrs[i].getAncestor(l-1)); node++; } //Get the balancing nodes. std::set<MortonId>& ancestor_nodes=nodes[l-1+(maxDepth+1)*p]; start=nbrs_parent.begin(); end =nbrs_parent.end(); ancestor_nodes.insert(start,end); } //Remove ancestors nodes. (optional) for(unsigned int l=1;l<=maxDepth;l++){ std::set<MortonId>::iterator start=nodes[l +(maxDepth+1)*p].begin(); std::set<MortonId>::iterator end =nodes[l +(maxDepth+1)*p].end(); std::set<MortonId>& ancestor_nodes=nodes[l-1+(maxDepth+1)*p]; for(std::set<MortonId>::iterator node=start; node != end; node++){ MortonId parent=node->getAncestor(node->GetDepth()-1); ancestor_nodes.erase(parent); } } } //Resize out. std::vector<size_t> node_cnt(omp_p,0); std::vector<size_t> node_dsp(omp_p,0); #pragma omp parallel for for(int i=0;i<omp_p;i++){ for(unsigned int j=0;j<=maxDepth;j++) node_cnt[i]+=nodes[j+i*(maxDepth+1)].size(); } omp_par::scan(&node_cnt[0],&node_dsp[0], omp_p); out.resize(node_cnt[omp_p-1]+node_dsp[omp_p-1]); //Copy leaf nodes to out. #pragma omp parallel for for(int p=0;p<omp_p;p++){ size_t node_iter=node_dsp[p]; for(unsigned int l=0;l<=maxDepth;l++){ std::set<MortonId>::iterator start=nodes[l +(maxDepth+1)*p].begin(); std::set<MortonId>::iterator end =nodes[l +(maxDepth+1)*p].end(); for(std::set<MortonId>::iterator node=start; node != end; node++) out[node_iter++]=*node; } } //Sort, Linearise, Redistribute. omp_par::merge_sort(&out[0], &out[out.size()]); lineariseList(out); { // Add children MortonId nxt_mid(0,0,0,0); std::vector<MortonId> out1; std::vector<MortonId> children; for(size_t i=0;i<out.size();i++){ while(nxt_mid.getDFD()<out[i]){ while(nxt_mid.isAncestor(out[i])){ nxt_mid=nxt_mid.getAncestor(nxt_mid.GetDepth()+1); } out1.push_back(nxt_mid); nxt_mid=nxt_mid.NextId(); } children=out[i].Children(); for(size_t j=0;j<8;j++){ out1.push_back(children[j]); } nxt_mid=out[i].NextId(); } while(nxt_mid.GetDepth()>0){ out1.push_back(nxt_mid); nxt_mid=nxt_mid.NextId(); } out.swap(out1); } }
The above code is optimized for performance, and OpenMP further complicates the situation, but the basic idea is this:
- Build a set of level nodes by level (MortonIds).
- Iterate over levels from the top level to the root level.
- At each level, collapse the entire node tree at that level and calculate the MortonId for your parent neighbors. Add them to the set of tree nodes at a grosser level (without adding duplicate nodes).
- Finally, take all the nodes of the tree (from all levels) and sort them by MortonId.
- Remove the ancestor nodes for a balanced linear octet.