MPI, 2D array, buffers overlap by 바죠

This is an introduction to parallel computation techniques using the MPI library.
There are many ways to build schemes (using MPI library routines) for exchanging two-dimensional array data computed on each node. The actual implementation depends on the requirements and the conditions at hand.

      buffers overlap, array (2D), contiguous data 
      
In the example below, the work array is a two-dimensional array, accessible under the same name on every node (buffers overlap).

Assume that each node computes work(:,1:njobs) only over its own particular range of columns.
Each column always keeps the fixed size natomxyz*3, so the data that is produced and transferred is contiguous.
Only this case is discussed here.
In this situation, one may need to assemble the completed data set on node 0.
That is, using the MPI communication library, let us collect on node 0 the pieces that each node has computed (each node, of course, has finished only its own range; that is the parallel work).

njobs : the total number of work units; given nproc, we try to split the workload as evenly as possible.
By computing jstart and jfinish on each node, every node can be assigned only its own particular range of the work (a minimal driver is sketched below).

nproc : the number of nodes (the total number of processes)
myid  : 0, 1, 2,..., nproc-1
       call MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr )
       call MPI_COMM_SIZE( MPI_COMM_WORLD, nproc, ierr )
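
Putting these pieces together, a minimal driver might look like the sketch below. This is only an illustration: it assumes the equal_load routine listed near the end of this post is compiled with it, njobs=100 is a hypothetical value, and the loop body is a placeholder for the actual per-unit work.

       program equal_load_sketch
       implicit none
       include 'mpif.h'
       integer myid, nproc, ierr
       integer njobs, j, jstart, jfinish

       call MPI_INIT(ierr)
       call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
       call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)

       njobs=100                                ! hypothetical total number of work units
       call equal_load(1,njobs,nproc,myid,jstart,jfinish)
       write(6,*) myid, jstart, jfinish, ' myid, jstart, jfinish'

       do j=jstart,jfinish                      ! each rank touches only its own range
!      ... do the work for unit j here ...
       enddo

       call MPI_FINALIZE(ierr)
       end program equal_load_sketch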

--------------------------------------------------------------
--------------------------------------------------------------


       include 'mpif.h'     ! needed for MPI_STATUS_SIZE, MPI_DOUBLE_PRECISION, etc.
       integer myid,nproc
       integer ireq,irank,n1,n2, njobs,jstart,jfinish
       integer, allocatable :: jjsta(:),jjlen(:),iireq(:)
       integer istatus(MPI_STATUS_SIZE)



!--------{   work(3*natomxyz,njobs)
       n1=1 ; n2=njobs
       allocate(jjsta(0:nproc-1), jjlen(0:nproc-1), iireq(0:nproc-1))
       do irank=0, nproc-1
       call equal_load(n1,n2,nproc,irank,jstart,jfinish)
       jjsta(irank)=jstart                           ! first column of work owned by rank irank
       jjlen(irank)=(natomxyz*3)*(jfinish-jstart+1)  ! number of array elements owned by rank irank
       enddo
!                                                                       parallel processing
       call equal_load(n1,n2,nproc,myid,jstart,jfinish)
       do j=jstart,jfinish
       ij=iseed1+iter   ; kl=iseed2+j
       ij=mod(ij,31328  +1) 
       kl=mod(kl,30081  +1)
       call rmarin(ij,kl)
          xxx(1,1)=work(1,j)+(ranmar()-0.5)*0.0001d0
          xxx(1,2)=work(2,j)+(ranmar()-0.5)*0.0001d0
          xxx(1,3)=work(3,j)+(ranmar()-0.5)*0.0001d0
       call md_cannonical(xxx,vvv,force,ymass)
          work(1,j)=xxx(1,1)
          work(2,j)=xxx(1,2)
          work(3,j)=xxx(1,3)
       enddo
!
       if(myid == 0)then   ! -----[   process id = 0
!                          post nonblocking receives directly into node 0's own work array,
!                          each at the proper column offset (this is why the buffers may overlap)
       do irank=1,nproc-1
       call MPI_IRECV(work(1,jjsta(irank)),jjlen(irank),MPI_DOUBLE_PRECISION, irank, 1, MPI_COMM_WORLD, iireq(irank),ierr)
       enddo
       do irank=1,nproc-1
       call MPI_WAIT(iireq(irank),istatus,ierr)
       enddo
                    else   ! -----~   process id = 0
       call MPI_ISEND(work(1,jstart),jjlen(myid), MPI_DOUBLE_PRECISION, 0, 1, MPI_COMM_WORLD, ireq, ierr)
       call MPI_WAIT(ireq,istatus,ierr)
                    endif  ! -----]   process id = 0
       deallocate(jjsta,jjlen,iireq)
!--------}
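
As a side note, the loop of MPI_WAIT calls on node 0 can be collapsed into a single MPI_WAITALL. A minimal sketch, assuming an MPI-2 (or later) library so that MPI_STATUSES_IGNORE is available; the requests posted above live in iireq(1:nproc-1):

!      wait for all nproc-1 pending receives at once
       call MPI_WAITALL(nproc-1, iireq(1), MPI_STATUSES_IGNORE, ierr)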

--------------------------------------------------------------



When the updated data should be shared by every node, that is, when synchronization is required:
 synchronizing data, buffers overlap
       
       integer myid,nproc
       integer ierr,irank,jsta,jend,n1,n2
       real*8 aaaa(m,n)
       integer, allocatable :: jjsta(:), jjlen(:)


       n1=1 ; n2=n
       allocate(jjsta(0:nproc-1), jjlen(0:nproc-1))
        do irank=0,nproc-1
        call equal_load(n1,n2,nproc,irank,jsta,jend)
         jjsta(irank)=jsta
         jjlen(irank)=m*(jend-jsta+1)
        enddo
        call equal_load(n1,n2,nproc,myid,jsta,jend)


 
!                                 each rank broadcasts the block of columns it owns;
!                                 after the loop, every node holds the complete aaaa
         do irank=0,nproc-1
         call MPI_BCAST(aaaa(1,jjsta(irank)), jjlen(irank), MPI_DOUBLE_PRECISION, irank, MPI_COMM_WORLD, ierr)
         enddo
         deallocate(jjsta,jjlen)
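
If the MPI library supports MPI-2's MPI_IN_PLACE, the broadcast loop above can also be replaced by a single MPI_ALLGATHERV that fills the overlapping buffer directly. A sketch, reusing jjsta and jjlen (before they are deallocated) and adding a hypothetical element-offset array idisp:

         integer, allocatable :: idisp(:)
         allocate(idisp(0:nproc-1))
         do irank=0,nproc-1
         idisp(irank)=m*(jjsta(irank)-1)   ! offset (in elements) of rank irank's block inside aaaa
         enddo
         call MPI_ALLGATHERV(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, aaaa, jjlen, idisp, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD, ierr)
         deallocate(idisp)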



--------------------------------------------------------------


      program main
      include "mpif.h"
      real*8, parameter :: PI25DT = 3.141592653589793238462643d0
      real*8  xmypi, gpi, hh, xsum, x, f
      integer npoints, myid, nproc, i, ierr
!                                 function to integrate
      f(x) = 4.d0 / (1.d0 + x*x)
      call MPI_INIT(ierr)
      call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
      call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)
 10   if ( myid == 0 ) then
         write(6,*) 'Enter the number of intervals: (0 to quit) '
         read(5,*) npoints
                             endif
!                                 broadcast npoints
      call MPI_BCAST(npoints,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)
!                                 check for quit signal
      if ( npoints <=  0 ) goto 30
!                                 calculate the interval size
      hh = 1.0d0/dble(npoints)
      xsum  = 0.0d0
      do i = myid+1, npoints, nproc
         x = hh * (dble(i) - 0.5d0)
         xsum = xsum + f(x)
      enddo
      xmypi = hh * xsum
!                                 collect all the partial xsum
      call MPI_REDUCE(xmypi,gpi,1,MPI_DOUBLE_PRECISION,MPI_SUM,0, MPI_COMM_WORLD,ierr)
!                                 node 0 prints the answer.
      if (myid ==   0) then
         write(6,*) 'gpi is ', gpi, ' Error is', abs(gpi - PI25DT)
                            endif
       goto 10
 30   call MPI_FINALIZE(ierr)
       stop
       end
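
For reference, this is the classic midpoint-rule estimate of

      pi = integral from 0 to 1 of 4/(1+x*x) dx
         ~ hh * [ f(x_1) + f(x_2) + ... + f(x_npoints) ],   x_i = hh*(i-0.5),  hh = 1/npoints,

with the index i distributed cyclically over the ranks (i = myid+1, myid+1+nproc, ...) and MPI_REDUCE summing the partial results on node 0.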


--------------------------------------------------------------


       implicit none
       integer n1,n2,irank,nproc,istart,ifinish

       n1=-1
       n2=8

       nproc=5
       write(6,*) n1,n2,' n1,n2'
       write(6,*) nproc,' nproc'
       do irank=0,nproc-1
       call equal_load(n1,n2,nproc,irank,istart,ifinish)
       write(6,*) istart,ifinish
       enddo

       stop
       end

Sample output:
           -1            8  n1,n2
            5  nproc
           -1            0
            1            2
            3            4
            5            6
            7            8
FORTRAN STOP

--------------------------------------------------------------



       subroutine equal_load(n1,n2,nproc,myid,istart,ifinish)
       implicit none
       integer nproc,myid,istart,ifinish,n1,n2
       integer iw1,iw2
!      iw1 : base number of items per rank,  iw2 : number of ranks that receive one extra item
       iw1=(n2-n1+1)/nproc ; iw2=mod(n2-n1+1,nproc)
       istart=myid*iw1+n1+min(myid,iw2)
       ifinish=istart+iw1-1 ; if(iw2 > myid) ifinish=ifinish+1
!      print*, n1,n2,myid,nproc,istart,ifinish
       if(n2 < istart) ifinish=istart-1   ! empty range when there are more ranks than items
       return
       end

--------------------------------------------------------------


An approach that wastes some memory but keeps things simple and easy. Because exforce and exvofqj are zero-initialized and each rank fills in only its own range of j, summing them with MPI_REDUCE (MPI_SUM) reconstructs the complete arrays on node 0.

       allocate(exqq(natom_ac,3,0:np_ac), exforce(natom_ac,3,0:np_ac), exvofqj(0:np_ac))
       if(myid == 0)then    ! -----{ PROCESS ID = 0
       exqq=qq
                    endif   ! -----} PROCESS ID = 0
       iroot=0 ; kount=3*natom_ac*(np_ac+1)
       call MPI_BCAST(exqq, kount,MPI_REAL8, iroot,MPI_COMM_WORLD,ierr)

       n1=0 ; n2=np_ac
       call equal_load(n1,n2,nproc,myid,jstart,jfinish)
!
       exforce=0.0d0 ; exvofqj=0.0d0
       do j=jstart,jfinish
       call xtinker(exqq(1,1,j),exforce(1,1,j),exvofqj(j),natom_ac,isequence,isymbol,iattyp,ii12)
       enddo

       iroot=0 ; kount=3*natom_ac*(np_ac+1)
       call MPI_REDUCE(exforce,force_ac,kount,MPI_DOUBLE_PRECISION,MPI_SUM,iroot,MPI_COMM_WORLD,ierr)
       iroot=0 ; kount=(np_ac+1)
       call MPI_REDUCE(exvofqj,vofqj,kount,MPI_DOUBLE_PRECISION,MPI_SUM,iroot,MPI_COMM_WORLD,ierr)
!      
       deallocate(exforce, exvofqj, exqq)

--------------------------------------------------------------


        real*8 aa(n,n), bb(n,n)


        do j=1,n
        do i=1,n
        aa(i,j)=0.d0
        bb(i,j)=0.0d0
        enddo
        enddo
        
         do j=1,n
         do i=1+myid,n, nproc
         aa(i,j)=...
         enddo
         enddo
          
         call MPI_REDUCE(aa,bb,n*n, MPI_DOUBLE_PRECISION, MPI_SUM, 0, MPI_COMM_WORLD, ierr)
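
A self-contained version of the same pattern is sketched below. The fill aa(i,j)=dble(i+j) is purely hypothetical, just so the program runs end to end; because each row i is written by exactly one rank and aa is zero elsewhere, the MPI_SUM reduction assembles the complete matrix in bb on node 0.

          program reduce_matrix_sketch       ! illustrative sketch only
          implicit none
          include 'mpif.h'
          integer, parameter :: n=8
          real*8 aa(n,n), bb(n,n)
          integer myid,nproc,ierr,i,j

          call MPI_INIT(ierr)
          call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
          call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)

          aa=0.0d0 ; bb=0.0d0
          do j=1,n
          do i=1+myid,n,nproc                ! row-cyclic distribution over the ranks
          aa(i,j)=dble(i+j)                  ! hypothetical work
          enddo
          enddo

          call MPI_REDUCE(aa,bb,n*n, MPI_DOUBLE_PRECISION, MPI_SUM, 0, MPI_COMM_WORLD, ierr)

          if(myid == 0) write(6,*) bb(1,1), bb(n,n), ' bb(1,1), bb(n,n)'
          call MPI_FINALIZE(ierr)
          end program reduce_matrix_sketch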


--------------------------------------------------------------


          array shrinking
!234567890
          program equal_load_sum
          implicit none
          include 'mpif.h'
          integer nn
          real*8, allocatable :: aa(:)
          integer nproc,myid,ierr,istart,ifinish
          integer i
          real*8 xsum,xxsum

          nn=10000

          call MPI_INIT(ierr)
          call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)
          call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)

          call equal_load(1,nn,nproc,myid,istart,ifinish)

          allocate(aa(istart:ifinish)) ! not just the index range: the memory itself is split across the nodes


          do i=istart,ifinish
          aa(i)=dble(i)
          enddo

          xsum=0.0d0
          do i=istart,ifinish
          xsum=xsum+aa(i)
          enddo


         call MPI_REDUCE(xsum,xxsum,1,MPI_DOUBLE_PRECISION,MPI_SUM,0,MPI_COMM_WORLD,ierr)
         xsum=xxsum


         if(myid == 0)then
         write(6,*) xsum,' xsum'   ! expected value: nn*(nn+1)/2 = 50005000
         endif

        deallocate(aa)
         call MPI_FINALIZE(ierr)
         end program equal_load_sum


--------------------------------------------------------------


two buffers (do not overlap), contiguous data


aaaa(m,n) and bbbb(m,n) are both two-dimensional arrays.
Assume that the bbbb array is needed only on node 0 (it receives the assembled result).
The aaaa array exists on every node; it is the array involved in the parallel work.


        real*8 aaaa(m,n), bbbb(m,n)
        integer, allocatable :: idisp(:), jjlen(:)
        integer irank,n1,n2,ierr,jsta,jend
        integer myid,nproc

        n1=1 ; n2=n
        allocate(idisp(0:nproc-1), jjlen(0:nproc-1))
        do irank=0,nproc-1
        call equal_load(n1,n2,nproc,irank,jsta,jend)
        jjlen(irank)=m*(jend-jsta+1)   ! number of elements contributed by rank irank
        idisp(irank)=m*(jsta-1)        ! displacement (in elements) of that block within bbbb
        enddo
        call equal_load(n1,n2,nproc,myid,jsta,jend)

        call MPI_GATHERV(aaaa(1,jsta), jjlen(myid), MPI_DOUBLE_PRECISION, bbbb, jjlen, idisp, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD,  ierr)

         deallocate(idisp, jjlen)


--------------------------------------------------------------


        
        real*8 f(n)      

         do i=1,n
         f(i)=0.0d0
        enddo

        do i=1,n-1
        do j=i+1,n
          f(i)=f(i)+.......
          f(j)=f(j)+........
        enddo
        enddo


-->
        real*8 f(n),  ff(n)
  
        do i=1,n
         f(i)=0.d0
        enddo
        do i=1+myid,n-1,nproc
        do j=i+1,n
          f(i)=f(i)+.....
          f(j)=f(j)+.....
        enddo
        enddo
  
        call MPI_ALLREDUCE(f, ff, n, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)
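
If the library supports MPI-2's MPI_IN_PLACE, the temporary array ff can be dropped and the summed result kept in f itself; a sketch under that assumption:

        call MPI_ALLREDUCE(MPI_IN_PLACE, f, n, MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD, ierr)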


     
        

--------------------------------------------------------------


synchronization, buffers do not overlap

        real*8 aaaa(m,n),bbbb(m,n)
        integer, allocatable :: idisp(:), jjlen(:)


        allocate(idisp(0:nproc-1), jjlen(0:nproc-1))

         do irank=0,nproc-1
         call equal_load(1,n,nproc,irank,jsta,jend)
           jjlen(irank)=m * (jend-jsta+1)
           idisp(irank)=m * (jsta-1)
         enddo
         call equal_load(1,n,nproc,myid,jsta,jend)


         call MPI_ALLGATHERV(aaaa(1,jsta),jjlen(myid),MPI_DOUBLE_PRECISION, bbbb, jjlen,idisp, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD, ierr)

         deallocate(idisp,jjlen)


--------------------------------------------------------------




Reference sites:
http://incredible.egloos.com/3302455
http://incredible.egloos.com/2950371
http://incredible.egloos.com/3755171
http://incredible.egloos.com/4086075

