module force_mbl_solv
!-- 全ての項は d (U,V) / d t で計算されている.

  use derivation
  use mpi
  use mpi_mod
  use fftsub_mod
  use diag_function

contains

subroutine HADV_term_MBL( u_isp, v_isp, HADVu, HADVv )
!-- calculating horizontal advection terms
  use Math_Const
  use savegloval_define
  implicit none
  double precision, dimension(jynt,jxnt,0:nzp+1), intent(in) :: u_isp  ! u for ISPACK
  double precision, dimension(jynt,jxnt,0:nzp+1), intent(in) :: v_isp  ! v for ISPACK
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: HADVu  ! advection term for U
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: HADVv  ! advection term for V

  integer :: i, j, k
  double precision :: pi2, lxi, lyi
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1) :: auukl, auvkl, avvkl
  double precision, dimension(jynt,jxnt,0:nzp+1) :: auu_isp, auv_isp, avv_isp

  lxi=1.0d0/Lx
  lyi=1.0d0/Ly
  pi2=2.0d0*pi_dp

  auu_isp=0.0d0
  auv_isp=0.0d0
  avv_isp=0.0d0
  auukl=0.0d0
  auvkl=0.0d0
  avvkl=0.0d0

  HADVu=0.0d0
  HADVv=0.0d0

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i,j,k)

  do k=1,nzp
     do i=1,jxnt
        do j=1,jynt
           auu_isp(j,i,k)=u_isp(j,i,k)*u_isp(j,i,k)
           auv_isp(j,i,k)=u_isp(j,i,k)*v_isp(j,i,k)
           avv_isp(j,i,k)=v_isp(j,i,k)*v_isp(j,i,k)
        end do
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(k)

  do k=1,nzp
     call phys2spec_isp( auu_isp(1:jynt,1:jxnt,k), auukl(1:kxnt,1:kynt,k) )
     call phys2spec_isp( auv_isp(1:jynt,1:jxnt,k), auvkl(1:kxnt,1:kynt,k) )
     call phys2spec_isp( avv_isp(1:jynt,1:jxnt,k), avvkl(1:kxnt,1:kynt,k) )
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(i,k)

  do k=1,nzp
     do i=2,hxnt+1
        HADVu(i,1,k)=-img_cdp*pi2*(dble(i-1)*lxi*auukl(i,1,k))
        HADVv(i,1,k)=-img_cdp*pi2*(dble(i-1)*lxi*auvkl(i,1,k))
        HADVu(kxnt-i+2,1,k)=dconjg(HADVu(i,1,k))
        HADVv(kxnt-i+2,1,k)=dconjg(HADVv(i,1,k))
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(j,k)

  do k=1,nzp
     do j=2,hynt+1
        HADVu(1,j,k)=-img_cdp*pi2*(dble(j-1)*lyi*auvkl(1,j,k))
        HADVv(1,j,k)=-img_cdp*pi2*(dble(j-1)*lyi*avvkl(1,j,k))
        HADVu(1,kynt-j+2,k)=img_cdp*pi2*(dble(j-1)*lyi*auvkl(1,kynt-j+2,k))
        HADVv(1,kynt-j+2,k)=img_cdp*pi2*(dble(j-1)*lyi*avvkl(1,kynt-j+2,k))
!-- ここでは, ISPACK の仕様により, l 方向は -l について独立
!-- なので, jy-j+2 では l 方向の波数が奇数乗でかかる場合,
!-- (-l) でかける必要がある (偶数乗はかけなくても同じ結果). 
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(i,j,k)

  do k=1,nzp
     do j=2,hynt+1
        do i=2,hxnt+1
           HADVu(i,j,k)=-img_cdp*pi2  &
  &                    *(dble(i-1)*lxi*auukl(i,j,k)  &
  &                     +dble(j-1)*lyi*auvkl(i,j,k))
           HADVv(i,j,k)=-img_cdp*pi2  &
  &                    *(dble(i-1)*lxi*auvkl(i,j,k)  &
  &                     +dble(j-1)*lyi*avvkl(i,j,k))
           HADVu(kxnt-i+2,kynt-j+2,k)=dconjg(HADVu(i,j,k))
           HADVv(kxnt-i+2,kynt-j+2,k)=dconjg(HADVv(i,j,k))
           HADVu(i,kynt-j+2,k)=-img_cdp*pi2  &
  &                           *(dble(i-1)*lxi*auukl(i,kynt-j+2,k)  &
  &                            -dble(j-1)*lyi*auvkl(i,kynt-j+2,k))
           HADVv(i,kynt-j+2,k)=-img_cdp*pi2  &
  &                           *(dble(i-1)*lxi*auvkl(i,kynt-j+2,k)  &
  &                            -dble(j-1)*lyi*avvkl(i,kynt-j+2,k))
           HADVu(kxnt-i+2,j,k)=dconjg(HADVu(i,kynt-j+2,k))
           HADVv(kxnt-i+2,j,k)=dconjg(HADVv(i,kynt-j+2,k))
!-- ここでは, ISPACK の仕様により, l 方向は -l について独立
!-- なので, jy-j+2 では l 方向の波数が奇数乗でかかる場合,
!-- (-l) でかける必要がある (偶数乗はかけなくても同じ結果). 
        end do
     end do
  end do

!$omp end do
!$omp end parallel

end subroutine HADV_term_MBL


subroutine VADV_term_MBL( u_isp, v_isp, w_isp, u_nbm_isp, v_nbm_isp,  &
  &                       VADVu, VADVv )
!-- calculating horizontal advection terms
  use Math_Const
  use savegloval_define
  implicit none
  double precision, dimension(jynt,jxnt,0:nzp+1), intent(in) :: u_isp  ! u for ISPACK
  double precision, dimension(jynt,jxnt,0:nzp+1), intent(in) :: v_isp  ! v for ISPACK
  double precision, dimension(jynt,jxnt,0:nzp+1), intent(in) :: w_isp  ! w for ISPACK
  double precision, dimension(jynt,jxnt), intent(in) :: u_nbm_isp   ! u for ISPACK
  double precision, dimension(jynt,jxnt), intent(in) :: v_nbm_isp   ! v for ISPACK
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: VADVu  ! advection term for U
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: VADVv  ! advection term for V

  integer :: i, j, k, IERROR
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1) :: awukl, awvkl
  double precision, dimension(jynt,jxnt,0:nzp+1) :: awu_isp, awv_isp

  awu_isp=0.0d0
  awv_isp=0.0d0
  awukl=0.0d0
  awvkl=0.0d0

  VADVu=0.0d0
  VADVv=0.0d0

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i,j,k)

  do k=1,nzp
     do i=1,jxnt
        do j=1,jynt
           awu_isp(j,i,k)=w_isp(j,i,k)*u_isp(j,i,k)
           awv_isp(j,i,k)=w_isp(j,i,k)*v_isp(j,i,k)
        end do
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(k)

  do k=1,nzp
     call phys2spec_isp( awu_isp(1:jynt,1:jxnt,k), awukl(1:kxnt,1:kynt,k) )
     call phys2spec_isp( awv_isp(1:jynt,1:jxnt,k), awvkl(1:kxnt,1:kynt,k) )
  end do

!$omp end do
!$omp end parallel

  call mpi_sendrecv_valcpd( IERROR, awukl, awvkl )

  if(MY_RANK==PETOT-1)then  ! aw{u,v}kl(nzp+1) == bw{u,v}kl

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i,j)

     do i=1,jxnt
        do j=1,jynt
           awu_isp(j,i,nzp+1)=w_isp(j,i,nzp)*u_nbm_isp(j,i)
           awv_isp(j,i,nzp+1)=w_isp(j,i,nzp)*v_nbm_isp(j,i)
        end do
     end do

!$omp end do
!$omp end parallel

     call phys2spec_isp( awu_isp(1:jynt,1:jxnt,nzp+1), awukl(1:kxnt,1:kynt,nzp+1) )
     call phys2spec_isp( awv_isp(1:jynt,1:jxnt,nzp+1), awvkl(1:kxnt,1:kynt,nzp+1) )

  end if

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i,k)

  do k=1,nzp
     do i=2,hxnt+1
        VADVu(i,1,k)=-(awukl(i,1,k+1)-awukl(i,1,k-1))/(z_pe(k+1)-z_pe(k-1))
        VADVv(i,1,k)=-(awvkl(i,1,k+1)-awvkl(i,1,k-1))/(z_pe(k+1)-z_pe(k-1))
        VADVu(kxnt-i+2,1,k)=dconjg(VADVu(i,1,k))
        VADVv(kxnt-i+2,1,k)=dconjg(VADVv(i,1,k))
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(j,k)

  do k=1,nzp
     do j=2,hynt+1
        VADVu(1,j,k)=-(awukl(1,j,k+1)-awukl(1,j,k-1))/(z_pe(k+1)-z_pe(k-1))
        VADVv(1,j,k)=-(awvkl(1,j,k+1)-awvkl(1,j,k-1))/(z_pe(k+1)-z_pe(k-1))
        VADVu(1,kynt-j+2,k)=-(awukl(1,kynt-j+2,k+1)-awukl(1,kynt-j+2,k-1))/(z_pe(k+1)-z_pe(k-1))
        VADVv(1,kynt-j+2,k)=-(awvkl(1,kynt-j+2,k+1)-awvkl(1,kynt-j+2,k-1))/(z_pe(k+1)-z_pe(k-1))
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(i,j,k)

  do k=1,nzp
     do j=2,hynt+1
        do i=2,hxnt+1
           VADVu(i,j,k)=-(awukl(i,j,k+1)-awukl(i,j,k-1))/(z_pe(k+1)-z_pe(k-1))
           VADVv(i,j,k)=-(awvkl(i,j,k+1)-awvkl(i,j,k-1))/(z_pe(k+1)-z_pe(k-1))
           VADVu(kxnt-i+2,kynt-j+2,k)=dconjg(VADVu(i,j,k))
           VADVv(kxnt-i+2,kynt-j+2,k)=dconjg(VADVv(i,j,k))
           VADVu(i,kynt-j+2,k)=-(awukl(i,kynt-j+2,k+1)-awukl(i,kynt-j+2,k-1))/(z_pe(k+1)-z_pe(k-1))
           VADVv(i,kynt-j+2,k)=-(awvkl(i,kynt-j+2,k+1)-awvkl(i,kynt-j+2,k-1))/(z_pe(k+1)-z_pe(k-1))
           VADVu(kxnt-i+2,j,k)=dconjg(VADVu(i,kynt-j+2,k))
           VADVv(kxnt-i+2,j,k)=dconjg(VADVv(i,kynt-j+2,k))
        end do
     end do
  end do

!$omp end do
!$omp end parallel

!-- 以下は上に含まれるので特別に行う必要はない. 
!-- k==1 では aw{u,v}kl(:,:,0) == 0 なので, 
!-- 上の操作と同じ.
!  if(MY_RANK==0)then  ! Bottom k==1
!
!!$omp parallel default(shared)
!!$omp do schedule(runtime) private(i)
!
!     do i=2,hxnt+1
!        VADVu(i,1,1)=-awukl(i,1,2)/z_pe(2)
!        VADVv(i,1,1)=-awvkl(i,1,2)/z_pe(2)
!        VADVu(kxnt-i+2,1,1)=dconjg(VADVu(i,1,1))
!        VADVv(kxnt-i+2,1,1)=dconjg(VADVv(i,1,1))
!     end do
!
!!$omp end do
!!$omp barrier
!!$omp do schedule(runtime) private(j)
!
!     do j=2,hynt+1
!        VADVu(1,j,1)=-awukl(1,j,2)/z_pe(2)
!        VADVv(1,j,1)=-awvkl(1,j,2)/z_pe(2)
!        VADVu(1,kynt-j+2,1)=-awukl(1,kynt-j+2,2)/z_pe(2)
!        VADVv(1,kynt-j+2,1)=-awvkl(1,kynt-j+2,2)/z_pe(2)
!     end do
!
!!$omp end do
!!$omp barrier
!!$omp do schedule(runtime) private(i,j)
!
!     do j=2,hynt+1
!        do i=2,hxnt+1
!           VADVu(i,j,1)=-awukl(i,j,2)/z_pe(2)
!           VADVv(i,j,1)=-awvkl(i,j,2)/z_pe(2)
!           VADVu(kxnt-i+2,kynt-j+2,1)=dconjg(VADVu(i,j,1))
!           VADVv(kxnt-i+2,kynt-j+2,1)=dconjg(VADVv(i,j,1))
!           VADVu(i,kynt-j+2,1)=-awukl(i,kynt-j+2,2)/z_pe(2)
!           VADVv(i,kynt-j+2,1)=-awvkl(i,kynt-j+2,2)/z_pe(2)
!           VADVu(kxnt-i+2,j,1)=dconjg(VADVu(i,kynt-j+2,1))
!           VADVv(kxnt-i+2,j,1)=dconjg(VADVv(i,kynt-j+2,1))
!        end do
!     end do
!
!!$omp end do
!!$omp end parallel
!
!-- k==nzp では aw{u,v}kl(:,:,nzp+1) を上で定義しているので, 
!-- z_pe(nzp+1)-z_pe(nzp-1) 区間で微分すればよい.
!-- (z_pe(nzp+1) は val_alloc で定義している.)
!  else if(MY_RANK==PETOT-1)then  ! Top k==nzp
!
!!$omp parallel default(shared)
!!$omp do schedule(runtime) private(i)
!
!     do i=2,hxnt+1
!        VADVu(i,1,nzp)=-0.5d0*(awukl(i,1,nzp+1)-awukl(i,1,nzp-1))  &
!  &                     /(z_pe(nzp)-z_pe(nzp-1))
!        VADVv(i,1,nzp)=-0.5d0*(awvkl(i,1,nzp+1)-awvkl(i,1,nzp-1))  &
!  &                     /(z_pe(nzp)-z_pe(nzp-1))
!        VADVu(kxnt-i+2,1,nzp)=dconjg(VADVu(i,1,nzp))
!        VADVv(kxnt-i+2,1,nzp)=dconjg(VADVv(i,1,nzp))
!     end do
!
!!$omp end do
!!$omp barrier
!!$omp do schedule(runtime) private(j)
!
!     do j=2,hynt+1
!        VADVu(1,j,nzp)=-0.5d0*(awukl(1,j,nzp+1)-awukl(1,j,nzp-1))  &
!  &                     /(z_pe(nzp)-z_pe(nzp-1))
!        VADVv(1,j,nzp)=-0.5d0*(awvkl(1,j,nzp+1)-awvkl(1,j,nzp-1))  &
!  &                     /(z_pe(nzp)-z_pe(nzp-1))
!        VADVu(1,kynt-j+2,nzp)=-0.5d0*(awukl(1,kynt-j+2,nzp+1)-awukl(1,kynt-j+2,nzp-1))  &
!  &                     /(z_pe(nzp)-z_pe(nzp-1))
!        VADVv(1,kynt-j+2,nzp)=-0.5d0*(awvkl(1,kynt-j+2,nzp+1)-awvkl(1,kynt-j+2,nzp-1))  &
!  &                     /(z_pe(nzp)-z_pe(nzp-1))
!     end do
!
!!$omp end do
!!$omp barrier
!!$omp do schedule(runtime) private(i,j)
!
!     do j=2,hynt+1
!        do i=2,hxnt+1
!           VADVu(i,j,nzp)=-0.5d0*(awukl(i,j,nzp+1)-awukl(i,j,nzp-1))  &
!  &                        /(z_pe(nzp)-z_pe(nzp-1))
!           VADVv(i,j,nzp)=-0.5d0*(awvkl(i,j,nzp+1)-awvkl(i,j,nzp-1))  &
!  &                        /(z_pe(nzp)-z_pe(nzp-1))
!           VADVu(kxnt-i+2,kynt-j+2,nzp)=dconjg(VADVu(i,j,nzp))
!           VADVv(kxnt-i+2,kynt-j+2,nzp)=dconjg(VADVv(i,j,nzp))
!           VADVu(i,kynt-j+2,nzp)=-0.5d0*(awukl(i,kynt-j+2,nzp+1)  &
!  &                                     -awukl(i,kynt-j+2,nzp-1))  &
!  &                               /(z_pe(nzp)-z_pe(nzp-1))
!           VADVv(i,kynt-j+2,nzp)=-0.5d0*(awvkl(i,kynt-j+2,nzp+1)  &
!  &                                     -awvkl(i,kynt-j+2,nzp-1))  &
!  &                               /(z_pe(nzp)-z_pe(nzp-1))
!           VADVu(kxnt-i+2,j,nzp)=dconjg(VADVu(i,kynt-j+2,nzp))
!           VADVv(kxnt-i+2,j,nzp)=dconjg(VADVv(i,kynt-j+2,nzp))
!        end do
!     end do
!
!!$omp end do
!!$omp end parallel
!
!  end if

end subroutine VADV_term_MBL


subroutine VADV_term_MBL2( u_isp, v_isp, w_isp, u_nbm_isp, v_nbm_isp,  &
  &                        VADVu, VADVv )
!-- calculating horizontal advection terms
  use Math_Const
  use savegloval_define
  implicit none
  double precision, dimension(jynt,jxnt,0:nzp+1), intent(in) :: u_isp  ! u for ISPACK
  double precision, dimension(jynt,jxnt,0:nzp+1), intent(in) :: v_isp  ! v for ISPACK
  double precision, dimension(jynt,jxnt,0:nzp+1), intent(in) :: w_isp  ! w for ISPACK
  double precision, dimension(jynt,jxnt), intent(in) :: u_nbm_isp   ! u for ISPACK
  double precision, dimension(jynt,jxnt), intent(in) :: v_nbm_isp   ! v for ISPACK
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: VADVu  ! advection term for U
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: VADVv  ! advection term for V

  integer :: i, j, k, IERROR
!  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1) :: awukl, awvkl
  double precision, dimension(jynt,jxnt,0:nzp+1) :: awu_isp, awv_isp
  double precision, dimension(jynt,jxnt,0:nzp+1) :: tmpu_isp, tmpv_isp
  double precision :: wdtdz, dti

  dti=1.0d0/dt

  awu_isp=0.0d0
  awv_isp=0.0d0
  tmpu_isp=u_isp
  tmpv_isp=v_isp
!  awukl=0.0d0
!  awvkl=0.0d0

  VADVu=0.0d0
  VADVv=0.0d0

  call mpi_sendrecv_vald( IERROR, tmpu_isp, tmpv_isp )

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i,j,k,wdtdz)

  do k=1,nzp
     do i=1,jxnt
        do j=1,jynt
           wdtdz=w_isp(j,i,k)*dt/(0.5d0*(z_pe(k+1)-z_pe(k-1)))
           awu_isp(j,i,k)=0.5d0*wdtdz*(wdtdz-1.0d0)*tmpu_isp(j,i,k+1)  &
  &                      +0.5d0*wdtdz*(wdtdz+1.0d0)*tmpu_isp(j,i,k-1)  &
  &                      -(wdtdz**2)*tmpu_isp(j,i,k)
           awv_isp(j,i,k)=0.5d0*wdtdz*(wdtdz-1.0d0)*tmpv_isp(j,i,k+1)  &
  &                      +0.5d0*wdtdz*(wdtdz+1.0d0)*tmpv_isp(j,i,k-1)  &
  &                      -(wdtdz**2)*tmpv_isp(j,i,k)
           awu_isp(j,i,k)=awu_isp(j,i,k)*dti
           awv_isp(j,i,k)=awv_isp(j,i,k)*dti
        end do
     end do
  end do

!$omp end do
!$omp end parallel

  if(MY_RANK==PETOT-1)then  ! aw{u,v}kl(nzp+1) == bw{u,v}kl

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i,j,wdtdz)

     do i=1,jxnt
        do j=1,jynt
           if(w_isp(j,i,nzp)>=0.0d0)then
              wdtdz=w_isp(j,i,nzp)/(z_pe(nzp)-z_pe(nzp-1))
              awu_isp(j,i,nzp)=-wdtdz*(tmpu_isp(j,i,nzp)-tmpu_isp(j,i,nzp-1))
              awv_isp(j,i,nzp)=-wdtdz*(tmpv_isp(j,i,nzp)-tmpv_isp(j,i,nzp-1))
           else
              wdtdz=w_isp(j,i,nzp)/(z_pe(nzp+1)-z_pe(nzp))
              awu_isp(j,i,nzp)=-wdtdz*(u_nbm_isp(j,i)-tmpu_isp(j,i,nzp))
              awv_isp(j,i,nzp)=-wdtdz*(v_nbm_isp(j,i)-tmpv_isp(j,i,nzp))
           end if
        end do
     end do

!$omp end do
!$omp end parallel

  else if(MY_RANK==0)then  ! aw{u,v}kl(0) == bw{u,v}kl

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i,j,wdtdz)

     do i=1,jxnt
        do j=1,jynt
           if(w_isp(j,i,1)<0.0d0)then
              wdtdz=w_isp(j,i,1)/(z_pe(2)-z_pe(1))
              awu_isp(j,i,1)=-wdtdz*(tmpu_isp(j,i,2)-tmpu_isp(j,i,1))
              awv_isp(j,i,1)=-wdtdz*(tmpv_isp(j,i,2)-tmpv_isp(j,i,1))
           else
              awu_isp(j,i,1)=0.0d0
              awv_isp(j,i,1)=0.0d0
           end if
        end do
     end do

!$omp end do
!$omp end parallel

  end if

!$omp parallel default(shared)
!$omp do schedule(runtime) private(k)

  do k=1,nzp
     call phys2spec_isp( awu_isp(1:jynt,1:jxnt,k), VADVu(1:kxnt,1:kynt,k) )
     call phys2spec_isp( awv_isp(1:jynt,1:jxnt,k), VADVv(1:kxnt,1:kynt,k) )
  end do

!$omp end do
!$omp end parallel

!  call mpi_sendrecv_valcpd( IERROR, awukl, awvkl )

end subroutine VADV_term_MBL2


subroutine CORIL_term_MBL( uk, vk, CORILu, CORILv )
!-- calculating Coriolis term
!-- +fvk, -fuk
  use savegloval_define
  implicit none
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(in) :: uk        ! uk
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(in) :: vk        ! vk
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: CORILu ! Coriolis term for U
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: CORILv ! Coriolis term for V

  integer :: i, j, k

  CORILu=0.0d0
  CORILv=0.0d0

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i,k)

  do k=1,nzp
     do i=2,hxnt+1
        CORILu(i,1,k)=f0*vk(i,1,k)
        CORILv(i,1,k)=-f0*uk(i,1,k)
        CORILu(kxnt-i+2,1,k)=dconjg(vk(i,1,k))
        CORILv(kxnt-i+2,1,k)=dconjg(uk(i,1,k))
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(j,k)

  do k=1,nzp
     do j=2,hynt+1
        CORILu(1,j,k)=f0*vk(1,j,k)
        CORILv(1,j,k)=-f0*uk(1,j,k)
        CORILu(1,kynt-j+2,k)=f0*vk(1,kynt-j+2,k)
        CORILv(1,kynt-j+2,k)=-f0*uk(1,kynt-j+2,k)
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(i,j,k)

  do k=1,nzp
     do j=2,hynt+1
        do i=2,hxnt+1
           CORILu(i,j,k)=f0*vk(i,j,k)
           CORILv(i,j,k)=-f0*uk(i,j,k)
           CORILu(kxnt-i+2,kynt-j+2,k)=dconjg(CORILu(i,j,k))
           CORILv(kxnt-i+2,kynt-j+2,k)=dconjg(CORILv(i,j,k))
           CORILu(i,kynt-j+2,k)=f0*vk(i,kynt-j+2,k)
           CORILv(i,kynt-j+2,k)=-f0*uk(i,kynt-j+2,k)
           CORILu(kxnt-i+2,j,k)=dconjg(CORILu(i,kynt-j+2,k))
           CORILv(kxnt-i+2,j,k)=dconjg(CORILv(i,kynt-j+2,k))
        end do
     end do
  end do

!$omp end do
!$omp end parallel

end subroutine CORIL_term_MBL


subroutine HDIFF_term_MBL( uk, vk, HDIFFu, HDIFFv )
!-- calculating horizontal diffusion term
! K*lap(val)
  use Math_Const
  use savegloval_define
  implicit none
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(in) :: uk        ! uk
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(in) :: vk        ! vk
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: HDIFFu ! diffusion term for U
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: HDIFFv ! diffusion term for V

  integer :: i, j, k
  double precision :: lxi, lyi, pi4, kcoe

  HDIFFu=0.0d0
  HDIFFv=0.0d0

  lxi=1.0d0/Lx
  lyi=1.0d0/Ly
  pi4=4.0d0*pi_dp*pi_dp

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i,k,kcoe)

  do k=1,nzp
     do i=2,hxnt+1
        kcoe=pi4*(dble(i-1)*lxi)**2
!ORG        HDIFFu(i,1,k)=-KH_mbl*pi4*((dble(i-1)*lxi)**2)*uk(i,1,k)
!ORG        HDIFFv(i,1,k)=-KH_mbl*pi4*((dble(i-1)*lxi)**2)*vk(i,1,k)
        HDIFFu(i,1,k)=-KH_mbl*(kcoe**ndiff_mbl)*uk(i,1,k)
        HDIFFv(i,1,k)=-KH_mbl*(kcoe**ndiff_mbl)*vk(i,1,k)
        HDIFFu(kxnt-i+2,1,k)=dconjg(HDIFFu(i,1,k))
        HDIFFv(kxnt-i+2,1,k)=dconjg(HDIFFv(i,1,k))
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(j,k,kcoe)

  do k=1,nzp
     do j=2,hynt+1
        kcoe=pi4*(dble(j-1)*lyi)**2
!ORG        HDIFFu(1,j,k)=-KH_mbl*pi4*((dble(j-1)*lyi)**2)*uk(1,j,k)
!ORG        HDIFFv(1,j,k)=-KH_mbl*pi4*((dble(j-1)*lyi)**2)*vk(1,j,k)
        HDIFFu(1,j,k)=-KH_mbl*(kcoe**ndiff_mbl)*uk(1,j,k)
        HDIFFv(1,j,k)=-KH_mbl*(kcoe**ndiff_mbl)*vk(1,j,k)
!ORG        HDIFFu(1,kynt-j+2,k)=-KH_mbl*pi4*((dble(j-1)*lyi)**2)*uk(1,kynt-j+2,k)
!ORG        HDIFFv(1,kynt-j+2,k)=-KH_mbl*pi4*((dble(j-1)*lyi)**2)*vk(1,kynt-j+2,k)
        HDIFFu(1,kynt-j+2,k)=-KH_mbl*(kcoe**ndiff_mbl)*uk(1,kynt-j+2,k)
        HDIFFv(1,kynt-j+2,k)=-KH_mbl*(kcoe**ndiff_mbl)*vk(1,kynt-j+2,k)
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(i,j,k,kcoe)

  do k=1,nzp
     do j=2,hynt+1
        do i=2,hxnt+1
           kcoe=pi4*((dble(i-1)*lxi)**2+(dble(j-1)*lyi)**2)
!ORG           HDIFFu(i,j,k)=-KH_mbl*pi4*((dble(i-1)*lxi)**2+(dble(j-1)*lyi)**2)*uk(i,j,k)
!ORG           HDIFFv(i,j,k)=-KH_mbl*pi4*((dble(i-1)*lxi)**2+(dble(j-1)*lyi)**2)*vk(i,j,k)
           HDIFFu(i,j,k)=-KH_mbl*(kcoe**ndiff_mbl)*uk(i,j,k)
           HDIFFv(i,j,k)=-KH_mbl*(kcoe**ndiff_mbl)*vk(i,j,k)
           HDIFFu(kxnt-i+2,kynt-j+2,k)=dconjg(HDIFFu(i,j,k))
           HDIFFv(kxnt-i+2,kynt-j+2,k)=dconjg(HDIFFv(i,j,k))
!ORG           HDIFFu(i,kynt-j+2,k)=-KH_mbl*pi4*((dble(i-1)*lxi)**2+(dble(j-1)*lyi)**2)  &
!ORG  &                          *uk(i,kynt-j+2,k)
!ORG           HDIFFv(i,kynt-j+2,k)=-KH_mbl*pi4*((dble(i-1)*lxi)**2+(dble(j-1)*lyi)**2)  &
!ORG  &                          *vk(i,kynt-j+2,k)
           HDIFFu(i,kynt-j+2,k)=-KH_mbl*((kcoe**ndiff_mbl))*uk(i,kynt-j+2,k)
           HDIFFv(i,kynt-j+2,k)=-KH_mbl*((kcoe**ndiff_mbl))*vk(i,kynt-j+2,k)
           HDIFFu(kxnt-i+2,j,k)=dconjg(HDIFFu(i,kynt-j+2,k))
           HDIFFv(kxnt-i+2,j,k)=dconjg(HDIFFv(i,kynt-j+2,k))
        end do
     end do
  end do

!$omp end do
!$omp end parallel

end subroutine HDIFF_term_MBL


subroutine VDIFF_term_MBL( uk, vk, VDIFFu, VDIFFv )
!-- calculating vertical diffusion term
!-- (旧) MY_RANK==0 の k==1 と MY_RANK==PETOT-1 の k==nzp は後で置き換えられる.
!-- (新) MY_RANK==0 の k==1 は後で置き換えられる.
  use savegloval_define
  implicit none
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(in) :: uk        ! uk
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(in) :: vk        ! vk
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: VDIFFu  ! diffusion term for U
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: VDIFFv  ! diffusion term for V

  integer :: i, j, k

  VDIFFu=0.0d0
  VDIFFv=0.0d0

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i,k)

  do k=1,nzp
     do i=2,hxnt+1
        VDIFFu(i,1,k)=(K_mbl/(0.25d0*((z_pe(k+1)-z_pe(k-1))**2)))  &
  &                  *(uk(i,1,k+1)-2.0d0*uk(i,1,k)+uk(i,1,k-1))
        VDIFFv(i,1,k)=(K_mbl/(0.25d0*((z_pe(k+1)-z_pe(k-1))**2)))  &
  &                  *(vk(i,1,k+1)-2.0d0*vk(i,1,k)+vk(i,1,k-1))
        VDIFFu(kxnt-i+2,1,k)=dconjg(VDIFFu(i,1,k))
        VDIFFv(kxnt-i+2,1,k)=dconjg(VDIFFv(i,1,k))
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(j,k)

  do k=1,nzp
     do j=2,hynt+1
        VDIFFu(1,j,k)=(K_mbl/(0.25d0*((z_pe(k+1)-z_pe(k-1))**2)))  &
  &                  *(uk(1,j,k+1)-2.0d0*uk(1,j,k)+uk(1,j,k-1))
        VDIFFv(1,j,k)=(K_mbl/(0.25d0*((z_pe(k+1)-z_pe(k-1))**2)))  &
  &                  *(vk(1,j,k+1)-2.0d0*vk(1,j,k)+vk(1,j,k-1))
        VDIFFu(1,kynt-j+2,k)=(K_mbl/(0.25d0*((z_pe(k+1)-z_pe(k-1))**2)))  &
  &                  *(uk(1,kynt-j+2,k+1)-2.0d0*uk(1,kynt-j+2,k)+uk(1,kynt-j+2,k-1))
        VDIFFv(1,kynt-j+2,k)=(K_mbl/(0.25d0*((z_pe(k+1)-z_pe(k-1))**2)))  &
  &                  *(vk(1,kynt-j+2,k+1)-2.0d0*vk(1,kynt-j+2,k)+vk(1,kynt-j+2,k-1))
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(i,j,k)

  do k=1,nzp
     do j=2,hynt+1
        do i=2,hxnt+1
           VDIFFu(i,j,k)=(K_mbl/(0.25d0*((z_pe(k+1)-z_pe(k-1))**2)))  &
  &                     *(uk(i,j,k+1)-2.0d0*uk(i,j,k)+uk(i,j,k-1))
           VDIFFv(i,j,k)=(K_mbl/(0.25d0*((z_pe(k+1)-z_pe(k-1))**2)))  &
  &                     *(vk(i,j,k+1)-2.0d0*vk(i,j,k)+vk(i,j,k-1))
           VDIFFu(kxnt-i+2,kynt-j+2,k)=dconjg(VDIFFu(i,j,k))
           VDIFFv(kxnt-i+2,kynt-j+2,k)=dconjg(VDIFFv(i,j,k))
           VDIFFu(i,kynt-j+2,k)=(K_mbl/(0.25d0*((z_pe(k+1)-z_pe(k-1))**2)))  &
  &                            *(uk(i,kynt-j+2,k+1)-2.0d0*uk(i,kynt-j+2,k)+uk(i,kynt-j+2,k-1))
           VDIFFv(i,kynt-j+2,k)=(K_mbl/(0.25d0*((z_pe(k+1)-z_pe(k-1))**2)))  &
  &                            *(vk(i,kynt-j+2,k+1)-2.0d0*vk(i,kynt-j+2,k)+vk(i,kynt-j+2,k-1))
           VDIFFu(kxnt-i+2,j,k)=dconjg(VDIFFu(i,kynt-j+2,k))
           VDIFFv(kxnt-i+2,j,k)=dconjg(VDIFFv(i,kynt-j+2,k))
        end do
     end do
  end do

!$omp end do
!$omp end parallel

end subroutine VDIFF_term_MBL


subroutine VDIFF_MY2_term_MBL( u_isp, v_isp, VDIFFu, VDIFFv )
!-- calculating vertical diffusion term based on Mellor and Yamada (1973)
!-- MY_RANK==0 の k==1 と MY_RANK==PETOT-1 の k==nzp は後で置き換えられる.
!!!!!!!!!!!!!!!![開発途中]
  use savegloval_define
  implicit none
  double precision, dimension(jynt,jxnt,0:nzp+1), intent(in) :: u_isp  ! u for ISPACK
  double precision, dimension(jynt,jxnt,0:nzp+1), intent(in) :: v_isp  ! v for ISPACK
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: VDIFFu ! diffusion term for U
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: VDIFFv ! diffusion term for V
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1) :: dudzkl, dvdzkl
  double precision, dimension(jynt,jxnt,0:nzp+1) :: dudz_isp, dvdz_isp
  integer :: i, j, k

  dudz_isp=0.0d0
  dvdz_isp=0.0d0
  dudzkl=0.0d0
  dvdzkl=0.0d0

  VDIFFu=0.0d0
  VDIFFv=0.0d0

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i,j)

  do i=1,jxnt
     do j=1,jynt
        call grad_1d( z_pe(0:nzp+1), u_isp(j,i,0:nzp+1), dudz_isp(j,i,0:nzp+1) )
        call grad_1d( z_pe(0:nzp+1), v_isp(j,i,0:nzp+1), dvdz_isp(j,i,0:nzp+1) )
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(k)

  do k=1,nzp
     call phys2spec_isp( dudz_isp(1:jynt,1:jxnt,k), dudzkl(1:kxnt,1:kynt,k) )
     call phys2spec_isp( dvdz_isp(1:jynt,1:jxnt,k), dvdzkl(1:kxnt,1:kynt,k) )
  end do

!!!!!!!!!!!!!!!!!!!-- これ以降は HADV のルーチンからコピーした方がよいかも

!$omp end do
!$omp end parallel

end subroutine VDIFF_MY2_term_MBL


subroutine FRICTION_term_phy_MBL( u_mbl_isp, v_mbl_isp,  &
  &                               FRICTIONu_isp, FRICTIONv_isp )
!-- calculating friction terms (for ISPACK on physical space)
  use Math_Const
  use savegloval_define
  implicit none
  double precision, dimension(jynt,jxnt), intent(in) :: u_mbl_isp   ! u for ISPACK
  double precision, dimension(jynt,jxnt), intent(in) :: v_mbl_isp   ! v for ISPACK
  double precision, dimension(jynt,jxnt), intent(inout) :: FRICTIONu_isp
                                                  ! friction term of U for ISPACK
  double precision, dimension(jynt,jxnt), intent(inout) :: FRICTIONv_isp
                                                  ! friction term of V for ISPACK

  integer :: i, j
  double precision :: vs, Cd_diag, coef

  FRICTIONu_isp=0.0d0
  FRICTIONv_isp=0.0d0

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i,j,vs,Cd_diag,coef)

     do i=1,jxnt
        do j=1,jynt
           Cd_diag=Cd_K16( u_mbl_isp(j,i), v_mbl_isp(j,i) )
           vs=dsqrt(u_mbl_isp(j,i)**2+v_mbl_isp(j,i)**2)
           coef=Cd_diag*vs
           FRICTIONu_isp(j,i)=coef*u_mbl_isp(j,i)
           FRICTIONv_isp(j,i)=coef*v_mbl_isp(j,i)
        end do
     end do

!$omp end do
!$omp end parallel

end subroutine FRICTION_term_phy_MBL


subroutine SET_BOUNDARIES( VDIFFu, VDIFFv, uk, vk, FRICu_isp, FRICv_isp )
!-- calculating stretching terms (on physical space)
!-- VDIFF{u,v} are replaced at k==1 (MY_RANK==0) and k==nzp (MY_RANK==PETOT-1)
  use savegloval_define
  implicit none
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: VDIFFu  ! diffusion term for U
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: VDIFFv  ! diffusion term for V
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(in) :: uk        ! uk
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(in) :: vk        ! vk
  double precision, dimension(jynt,jxnt), intent(inout) :: FRICu_isp
                                                  ! friction term of U for ISPACK
  double precision, dimension(jynt,jxnt), intent(inout) :: FRICv_isp
                                                  ! friction term of V for ISPACK
  complex(kind(0d0)), dimension(kxnt,kynt) :: tauukl    ! bottom friction uk
  complex(kind(0d0)), dimension(kxnt,kynt) :: tauvkl    ! bottom friction vk
  complex(kind(0d0)), dimension(kxnt,kynt) :: uk3, vk3
  double precision :: z3

  integer :: i, j, IERROR

  if(nzp==1)then  ! 1 プロセスが 1 層しか担当しない場合 (隣のプロセスが uk3 を持つ)
     if(MY_RANK==0.or.MY_RANK==1)then
        call mpi_sendrecvp2p_valcpd( IERROR, 1, 0,  &
  &                                  uk(1:kxnt,1:kynt,2),  &
  &                                  vk(1:kxnt,1:kynt,2),  &
  &                                  uk3(1:kxnt,1:kynt),  &
  &                                  vk3(1:kxnt,1:kynt) )
        if(MY_RANK==0)then
           z3=z(3)
        end if
     else if(MY_RANK==PETOT-2.or.MY_RANK==PETOT-1)then
        call mpi_sendrecvp2p_valcpd( IERROR, PETOT-2, PETOT-1,  &
  &                                  uk(1:kxnt,1:kynt,0),  &
  &                                  vk(1:kxnt,1:kynt,0),  &
  &                                  uk3(1:kxnt,1:kynt),  &
  &                                  vk3(1:kxnt,1:kynt) )
        if(MY_RANK==PETOT-1)then
           z3=z(nz-2)
        end if
     end if
  else  ! 1 プロセスが 2 層以上担当する場合 (自プロセスに uk3 がある)
     if(MY_RANK==0)then
        uk3(1:kxnt,1:kynt)=uk(1:kxnt,1:kynt,3)
        vk3(1:kxnt,1:kynt)=vk(1:kxnt,1:kynt,3)
        z3=z(3)
     else if(MY_RANK==PETOT-1)then
        uk3(1:kxnt,1:kynt)=uk(1:kxnt,1:kynt,nzp-2)
        vk3(1:kxnt,1:kynt)=vk(1:kxnt,1:kynt,nzp-2)
        z3=z(nz-2)
     end if
  end if

  if(MY_RANK==0)then  ! Bottom boundary

     call phys2spec_isp( FRICu_isp(1:jynt,1:jxnt), tauukl(1:kxnt,1:kynt) )
     call phys2spec_isp( FRICv_isp(1:jynt,1:jxnt), tauvkl(1:kxnt,1:kynt) )

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i)

     do i=2,hxnt+1
        VDIFFu(i,1,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                   *(uk3(i,1)-uk(i,1,1))-tauukl(i,1)/z_pe(2)
        VDIFFv(i,1,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                   *(vk3(i,1)-vk(i,1,1))-tauvkl(i,1)/z_pe(2)
        VDIFFu(kxnt-i+2,1,1)=dconjg(VDIFFu(i,1,1))
        VDIFFv(kxnt-i+2,1,1)=dconjg(VDIFFv(i,1,1))
     end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(j)

     do j=2,hynt+1
        VDIFFu(1,j,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                   *(uk3(1,j)-uk(1,j,1))-tauukl(1,j)/z_pe(2)
        VDIFFv(1,j,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                   *(vk3(1,j)-vk(1,j,1))-tauvkl(1,j)/z_pe(2)
        VDIFFu(1,kynt-j+2,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                          *(uk3(1,kynt-j+2)-uk(1,kynt-j+2,1))  &
  &                         -tauukl(1,kynt-j+2)/z_pe(2)
        VDIFFv(1,kynt-j+2,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                          *(vk3(1,kynt-j+2)-vk(1,kynt-j+2,1))  &
  &                         -tauvkl(1,kynt-j+2)/z_pe(2)
     end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(i,j)

     do j=2,hynt+1
        do i=2,hxnt+1
           VDIFFu(i,j,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                      *(uk3(i,j)-uk(i,j,1))-tauukl(i,j)/z_pe(2)
           VDIFFv(i,j,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                      *(vk3(i,j)-vk(i,j,1))-tauvkl(i,j)/z_pe(2)
           VDIFFu(kxnt-i+2,kynt-j+2,1)=dconjg(VDIFFu(i,j,1))
           VDIFFv(kxnt-i+2,kynt-j+2,1)=dconjg(VDIFFv(i,j,1))
           VDIFFu(i,kynt-j+2,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                             *(uk3(i,kynt-j+2)-uk(i,kynt-j+2,1))  &
  &                            -tauukl(i,kynt-j+2)/z_pe(2)
           VDIFFv(i,kynt-j+2,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                            *(vk3(i,kynt-j+2)-vk(i,kynt-j+2,1))  &
  &                            -tauvkl(i,kynt-j+2)/z_pe(2)
           VDIFFu(kxnt-i+2,j,1)=dconjg(VDIFFu(i,kynt-j+2,1))
           VDIFFv(kxnt-i+2,j,1)=dconjg(VDIFFv(i,kynt-j+2,1))
        end do
     end do

!$omp end do
!$omp end parallel

  else if(MY_RANK==PETOT-1)then

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i)

     do i=2,hxnt+1
        VDIFFu(i,1,nzp)=-0.5d0*(K_mbl/((z_pe(nzp)-z_pe(nzp-1))*(z_pe(nzp)-z3)))  &
  &                     *(uk(i,1,nzp)-uk3(i,1))
        VDIFFv(i,1,nzp)=-0.5d0*(K_mbl/((z_pe(nzp)-z_pe(nzp-1))*(z_pe(nzp)-z3)))  &
  &                     *(vk(i,1,nzp)-vk3(i,1))
        VDIFFu(kxnt-i+2,1,nzp)=dconjg(VDIFFu(i,1,nzp))
        VDIFFv(kxnt-i+2,1,nzp)=dconjg(VDIFFv(i,1,nzp))
     end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(j)

     do j=2,hynt+1
        VDIFFu(1,j,nzp)=-0.5d0*(K_mbl/((z_pe(nzp)-z_pe(nzp-1))*(z_pe(nzp)-z3)))  &
  &                     *(uk(1,j,nzp)-uk3(1,j))
        VDIFFv(1,j,nzp)=-0.5d0*(K_mbl/((z_pe(nzp)-z_pe(nzp-1))*(z_pe(nzp)-z3)))  &
  &                     *(vk(1,j,nzp)-vk3(1,j))
        VDIFFu(1,kynt-j+2,nzp)=-0.5d0*(K_mbl/((z_pe(nzp)-z_pe(nzp-1))*(z_pe(nzp)-z3)))  &
  &                     *(uk(1,kynt-j+2,nzp)-uk3(1,kynt-j+2))
        VDIFFv(1,kynt-j+2,nzp)=-0.5d0*(K_mbl/((z_pe(nzp)-z_pe(nzp-1))*(z_pe(nzp)-z3)))  &
  &                     *(vk(1,kynt-j+2,nzp)-vk3(1,kynt-j+2))
     end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(i,j)

     do j=2,hynt+1
        do i=2,hxnt+1
           VDIFFu(i,j,nzp)=-0.5d0*(K_mbl/((z_pe(nzp)-z_pe(nzp-1))*(z_pe(nzp)-z3)))  &
  &                        *(uk(i,j,nzp)-uk3(i,j))
           VDIFFv(i,j,nzp)=-0.5d0*(K_mbl/((z_pe(nzp)-z_pe(nzp-1))*(z_pe(nzp)-z3)))  &
  &                        *(vk(i,j,nzp)-vk3(i,j))
           VDIFFu(kxnt-i+2,kynt-j+2,nzp)=dconjg(VDIFFu(i,j,nzp))
           VDIFFv(kxnt-i+2,kynt-j+2,nzp)=dconjg(VDIFFv(i,j,nzp))
           VDIFFu(i,kynt-j+2,nzp)=-0.5d0*(K_mbl/((z_pe(nzp)-z_pe(nzp-1))*(z_pe(nzp)-z3)))  &
  &                              *(uk(i,kynt-j+2,nzp)-uk3(i,kynt-j+2))
           VDIFFv(i,kynt-j+2,nzp)=-0.5d0*(K_mbl/((z_pe(nzp)-z_pe(nzp-1))*(z_pe(nzp)-z3)))  &
  &                              *(vk(i,kynt-j+2,nzp)-vk3(i,kynt-j+2))
           VDIFFu(kxnt-i+2,j,nzp)=dconjg(VDIFFu(i,kynt-j+2,nzp))
           VDIFFv(kxnt-i+2,j,nzp)=dconjg(VDIFFv(i,kynt-j+2,nzp))
        end do
     end do

!$omp end do
!$omp end parallel

  end if

end subroutine SET_BOUNDARIES


subroutine SET_BOUNDARIES2( VDIFFu, VDIFFv, uk, vk, FRICu_isp, FRICv_isp )
!-- calculating stretching terms (on physical space)
!-- VDIFF{u,v} are replaced at k==1 (MY_RANK==0) and k==nzp (MY_RANK==PETOT-1)
  use savegloval_define
  implicit none
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: VDIFFu  ! diffusion term for U
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: VDIFFv  ! diffusion term for V
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(in) :: uk        ! uk
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(in) :: vk        ! vk
  double precision, dimension(jynt,jxnt), intent(inout) :: FRICu_isp
                                                  ! friction term of U for ISPACK
  double precision, dimension(jynt,jxnt), intent(inout) :: FRICv_isp
                                                  ! friction term of V for ISPACK
  complex(kind(0d0)), dimension(kxnt,kynt) :: tauukl    ! bottom friction uk
  complex(kind(0d0)), dimension(kxnt,kynt) :: tauvkl    ! bottom friction vk
  complex(kind(0d0)), dimension(kxnt,kynt) :: uk3, vk3
  double precision :: z3

  integer :: i, j, IERROR

  if(nzp==1)then  ! 1 プロセスが 1 層しか担当しない場合 (隣のプロセスが uk3 を持つ)
     if(MY_RANK==0.or.MY_RANK==1)then
        call mpi_sendrecvp2p_valcpd( IERROR, 1, 0,  &
  &                                  uk(1:kxnt,1:kynt,2),  &
  &                                  vk(1:kxnt,1:kynt,2),  &
  &                                  uk3(1:kxnt,1:kynt),  &
  &                                  vk3(1:kxnt,1:kynt) )
        if(MY_RANK==0)then
           z3=z(3)
        end if
     end if
  else  ! 1 プロセスが 2 層以上担当する場合 (自プロセスに uk3 がある)
     if(MY_RANK==0)then
        uk3(1:kxnt,1:kynt)=uk(1:kxnt,1:kynt,3)
        vk3(1:kxnt,1:kynt)=vk(1:kxnt,1:kynt,3)
        z3=z(3)
     end if
  end if

  if(MY_RANK==0)then  ! Bottom boundary

     call phys2spec_isp( FRICu_isp(1:jynt,1:jxnt), tauukl(1:kxnt,1:kynt) )
     call phys2spec_isp( FRICv_isp(1:jynt,1:jxnt), tauvkl(1:kxnt,1:kynt) )

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i)

     do i=2,hxnt+1
        VDIFFu(i,1,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                   *(uk3(i,1)-uk(i,1,1))-tauukl(i,1)/z_pe(2)
        VDIFFv(i,1,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                   *(vk3(i,1)-vk(i,1,1))-tauvkl(i,1)/z_pe(2)
        VDIFFu(kxnt-i+2,1,1)=dconjg(VDIFFu(i,1,1))
        VDIFFv(kxnt-i+2,1,1)=dconjg(VDIFFv(i,1,1))
     end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(j)

     do j=2,hynt+1
        VDIFFu(1,j,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                   *(uk3(1,j)-uk(1,j,1))-tauukl(1,j)/z_pe(2)
        VDIFFv(1,j,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                   *(vk3(1,j)-vk(1,j,1))-tauvkl(1,j)/z_pe(2)
        VDIFFu(1,kynt-j+2,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                          *(uk3(1,kynt-j+2)-uk(1,kynt-j+2,1))  &
  &                         -tauukl(1,kynt-j+2)/z_pe(2)
        VDIFFv(1,kynt-j+2,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                          *(vk3(1,kynt-j+2)-vk(1,kynt-j+2,1))  &
  &                         -tauvkl(1,kynt-j+2)/z_pe(2)
     end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(i,j)

     do j=2,hynt+1
        do i=2,hxnt+1
           VDIFFu(i,j,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                      *(uk3(i,j)-uk(i,j,1))-tauukl(i,j)/z_pe(2)
           VDIFFv(i,j,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                      *(vk3(i,j)-vk(i,j,1))-tauvkl(i,j)/z_pe(2)
           VDIFFu(kxnt-i+2,kynt-j+2,1)=dconjg(VDIFFu(i,j,1))
           VDIFFv(kxnt-i+2,kynt-j+2,1)=dconjg(VDIFFv(i,j,1))
           VDIFFu(i,kynt-j+2,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                             *(uk3(i,kynt-j+2)-uk(i,kynt-j+2,1))  &
  &                            -tauukl(i,kynt-j+2)/z_pe(2)
           VDIFFv(i,kynt-j+2,1)=(K_mbl/((z3-z_pe(1))*z_pe(2)))  &
  &                            *(vk3(i,kynt-j+2)-vk(i,kynt-j+2,1))  &
  &                            -tauvkl(i,kynt-j+2)/z_pe(2)
           VDIFFu(kxnt-i+2,j,1)=dconjg(VDIFFu(i,kynt-j+2,1))
           VDIFFv(kxnt-i+2,j,1)=dconjg(VDIFFv(i,kynt-j+2,1))
        end do
     end do

!$omp end do
!$omp end parallel

  end if

end subroutine SET_BOUNDARIES2


subroutine PGRAD_term_MBL( pk, PGRADu, PGRADv )
!-- calculating pressure gradient terms
! -mu*zeta
  use Math_Const
  use savegloval_define
  implicit none
  complex(kind(0d0)), dimension(kxnt,kynt), intent(in) :: pk      ! pk
  complex(kind(0d0)), dimension(kxnt,kynt), intent(inout) :: PGRADu
                                            ! pressure gradient term for U
  complex(kind(0d0)), dimension(kxnt,kynt), intent(inout) :: PGRADv
                                            ! pressure gradient term for V

  integer :: i, j
  double precision :: lxi, lyi, pi2, rhoi

  PGRADu=0.0d0
  PGRADv=0.0d0

  lxi=1.0d0/Lx
  lyi=1.0d0/Ly
  pi2=2.0d0*pi_dp
  rhoi=1.0d0/rho0

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i)

  do i=2,hxnt+1
     PGRADu(i,1)=-img_cdp*pi2*(dble(i-1)*lxi)*pk(i,1)
     PGRADu(kxnt-i+2,1)=dconjg(PGRADu(i,1))
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(j)

  do j=2,hynt+1
     PGRADv(1,j)=-img_cdp*pi2*(dble(j-1)*lyi)*pk(1,j)
     PGRADv(1,kynt-j+2)=img_cdp*pi2*(dble(j-1)*lyi)*pk(1,kynt-j+2)
!-- ここでは, ISPACK の仕様により, l 方向は -l について独立
!-- なので, jy-j+2 では l 方向の波数が奇数乗でかかる場合,
!-- (-l) でかける必要がある (偶数乗はかけなくても同じ結果). 
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(i,j)

  do j=2,hynt+1
     do i=2,hxnt+1
        PGRADu(i,j)=-img_cdp*pi2*(dble(i-1)*lxi)*pk(i,j)
        PGRADv(i,j)=-img_cdp*pi2*(dble(j-1)*lyi)*pk(i,j)
        PGRADu(kxnt-i+2,kynt-j+2)=dconjg(PGRADu(i,j))
        PGRADv(kxnt-i+2,kynt-j+2)=dconjg(PGRADv(i,j))
        PGRADu(i,kynt-j+2)=-img_cdp*pi2*(dble(i-1)*lxi)*pk(i,kynt-j+2)
        PGRADv(i,kynt-j+2)=img_cdp*pi2*(dble(j-1)*lyi)*pk(i,kynt-j+2)
        PGRADu(kxnt-i+2,j)=dconjg(PGRADu(i,kynt-j+2))
        PGRADv(kxnt-i+2,j)=dconjg(PGRADv(i,kynt-j+2))
!-- ここでは, ISPACK の仕様により, l 方向は -l について独立
!-- なので, jy-j+2 では l 方向の波数が奇数乗でかかる場合,
!-- (-l) でかける必要がある (偶数乗はかけなくても同じ結果). 
     end do
  end do

!$omp end do
!$omp end parallel

  PGRADu=PGRADu*rhoi
  PGRADv=PGRADv*rhoi

end subroutine PGRAD_term_MBL


subroutine smooth_val_MBL( valk )
!-- calculating vertical flow based on a mass continuity
  use Math_Const
  use savegloval_define
  implicit none
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: valk

  integer :: i, j, k
  double precision :: coefk, coefl
  double precision, dimension(kxnt,kynt) :: smt_coef

  coefk=pi_dp/hxnt
  coefl=pi_dp/hynt

  smt_coef=0.0d0

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i)

  do i=2,hxnt+1
     smt_coef(i,1)=dsin(dble(i-1)*coefk)/(dble(i-1)*coefk)
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(j)

  do j=2,hynt+1
     smt_coef(1,j)=dsin(dble(j-1)*coefl)/(dble(j-1)*coefl)
     smt_coef(1,kynt-j+2)=dsin(dble(j-1)*coefl)/(dble(j-1)*coefl)
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(i,j)

  do j=2,hynt+1
     do i=2,hxnt+1
        smt_coef(i,j)=dsin(dble(i-1)*coefk)/(dble(i-1)*coefk)  &
  &                  *dsin(dble(j-1)*coefl)/(dble(j-1)*coefl)
        smt_coef(i,kynt-j+2)=dsin(dble(i-1)*coefk)/(dble(i-1)*coefk)  &
  &                         *dsin(dble(j-1)*coefl)/(dble(j-1)*coefl)
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(i,k)

  do k=1,nzp
     do i=2,hxnt+1
        valk(i,1,k)=valk(i,1,k)*smt_coef(i,1)
        valk(kxnt-i+2,1,k)=dconjg(valk(i,1,k))
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(j,k)

  do k=1,nzp
     do j=2,hynt+1
        valk(1,j,k)=valk(1,j,k)*smt_coef(1,j)
        valk(1,kynt-j+2,k)=valk(1,kynt-j+2,k)*smt_coef(1,kynt-j+2)
     end do
  end do

!$omp end do
!$omp barrier
!$omp do schedule(runtime) private(i,j,k)

  do k=1,nzp
  do j=2,hynt+1
     do i=2,hxnt+1
        valk(i,j,k)=valk(i,j,k)*smt_coef(i,j)
        valk(kxnt-i+2,kynt-j+2,k)=dconjg(valk(i,j,k))
        valk(i,kynt-j+2,k)=valk(i,kynt-j+2,k)*smt_coef(i,kynt-j+2)
        valk(kxnt-i+2,j,k)=dconjg(valk(i,kynt-j+2,k))
     end do
     end do
  end do

!$omp end do
!$omp end parallel

end subroutine smooth_val_MBL


subroutine force_MBL( psik_nbm, u_nbm_isp, v_nbm_isp, pk, uk_mbl, vk_mbl,  &
  &                   forceu, forcev )
!-- calculating total forcing terms in NBM
  use savegloval_define
  use mpi
  use mpi_mod
  implicit none
  complex(kind(0d0)), dimension(kxnt,kynt), intent(in) :: psik_nbm  ! psik for NBM
  double precision, dimension(jynt,jxnt), intent(in) :: u_nbm_isp   ! U for NBM on jxnt, jynt
  double precision, dimension(jynt,jxnt), intent(in) :: v_nbm_isp   ! V for NBM on jxnt, jynt
  complex(kind(0d0)), dimension(kxnt,kynt), intent(in) :: pk        ! pk
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: uk_mbl    ! uk for MBL
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: vk_mbl    ! vk for MBL
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: forceu ! total force for U
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1), intent(inout) :: forcev ! total force for V

  integer :: i, j, k, IERROR
  double precision, dimension(jynt,jxnt) :: FRICu_isp, FRICv_isp
  double precision, dimension(jynt,jxnt,0:nzp+1) :: u_mbl_isp, v_mbl_isp, w_mbl_isp
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1) :: HADVu, HADVv, VADVu, VADVv
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1) :: CORILu, CORILv
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1) :: HDIFFu, HDIFFv, VDIFFu, VDIFFv
  complex(kind(0d0)), dimension(kxnt,kynt,0:nzp+1) :: wk_mbl, PGRADu, PGRADv

  forceu=0.0d0
  forcev=0.0d0

!-- calculating wk based on mass continuity
!-- Due to vertical integration, call MPI_gather

  call W_divergence2( uk_mbl, vk_mbl, wk_mbl )

!-- Sending and receiving shadow layer in each process

  call mpi_sendrecv_valcpd( IERROR, uk_mbl, vk_mbl )
  !-- replacing {u,v}k_mbl(nzp+1) in PETOT-1 with {u,v}k_mbl
  !-- VDIFF_term_MBL で PETOT-1 の上端 (nzp) も他と同じ定式化で計算する.
  if(MY_RANK==PETOT-1)then
     call psik2ukvk( psik_nbm, uk_mbl(1:kxnt,1:kynt,nzp+1),  &
  &                  vk_mbl(1:kxnt,1:kynt,nzp+1) )
  end if

!-- converting {u,v,w}k to {u,v,w}_mbl (for calculating nonlinear terms)

!$omp parallel default(shared)
!$omp do schedule(runtime) private(k)
  do k=1,nzp
     call spec2phys_isp( uk_mbl(1:kxnt,1:kynt,k), u_mbl_isp(1:jynt,1:jxnt,k) )
     call spec2phys_isp( vk_mbl(1:kxnt,1:kynt,k), v_mbl_isp(1:jynt,1:jxnt,k) )
     call spec2phys_isp( wk_mbl(1:kxnt,1:kynt,k), w_mbl_isp(1:jynt,1:jxnt,k) )
  end do
!$omp end do
!$omp end parallel

!-- calculating advecting term 

  call HADV_term_MBL( u_mbl_isp, v_mbl_isp, HADVu, HADVv )
!ORG  call VADV_term_MBL( u_mbl_isp, v_mbl_isp, w_mbl_isp,  &
!ORG  &                   u_nbm_isp, v_nbm_isp, VADVu, VADVv )
  call VADV_term_MBL2( u_mbl_isp, v_mbl_isp, w_mbl_isp,  &
  &                    u_nbm_isp, v_nbm_isp, VADVu, VADVv )

!-- calculating Coriolis term

  call CORIL_term_MBL( uk_mbl, vk_mbl, CORILu, CORILv )

!-- calculating diffusion term

  call HDIFF_term_MBL( uk_mbl, vk_mbl, HDIFFu, HDIFFv )
  call VDIFF_term_MBL( uk_mbl, vk_mbl, VDIFFu, VDIFFv )

!-- calculating pressure gradient term

  call PGRAD_term_MBL( pk, PGRADu(1:kxnt,1:kynt,1), PGRADv(1:kxnt,1:kynt,1) )
  call fill_val_3d( PGRADu(1:kxnt,1:kynt,1), PGRADu )
  call fill_val_3d( PGRADv(1:kxnt,1:kynt,1), PGRADv )

!-- calculating surface friction term

  if(MY_RANK==0)then  ! only on the surface
     call FRICTION_term_phy_MBL( u_mbl_isp(1:jynt,1:jxnt,1),  &
  &                              v_mbl_isp(1:jynt,1:jxnt,1),  &
  &                              FRICu_isp(1:jynt,1:jxnt),  &
  &                              FRICv_isp(1:jynt,1:jxnt) )
  end if

!-- setting bottom and top boundary conditions

  call SET_BOUNDARIES2( VDIFFu, VDIFFv, uk_mbl, vk_mbl,  &
  &                    FRICu_isp(1:jynt,1:jxnt),  &
  &                    FRICv_isp(1:jynt,1:jxnt) )
!ORG  call SET_BOUNDARIES( VDIFFu, VDIFFv, uk_mbl, vk_mbl,  &
!ORG  &                    FRICu_isp(1:jynt,1:jxnt),  &
!ORG  &                    FRICv_isp(1:jynt,1:jxnt) )

!$omp parallel default(shared)
!$omp do schedule(runtime) private(i,j,k)

  do k=1,nzp
     do j=1,2*hynt+1
        do i=1,hxnt+1
           forceu(i,j,k)=HADVu(i,j,k)+VADVu(i,j,k)+CORILu(i,j,k)  &
  &                     +HDIFFu(i,j,k)+VDIFFu(i,j,k)+PGRADu(i,j,k)
           forcev(i,j,k)=HADVv(i,j,k)+VADVv(i,j,k)+CORILv(i,j,k)  &
  &                     +HDIFFv(i,j,k)+VDIFFv(i,j,k)+PGRADv(i,j,k)
        end do
     end do
  end do

!$omp end do
!$omp end parallel

!-- For debug

  HADVud=HADVu
  HADVvd=HADVv
  VADVud=VADVu
  VADVvd=VADVv
  HDIFFud=HDIFFu
  HDIFFvd=HDIFFv
  VDIFFud=VDIFFu
  VDIFFvd=VDIFFv
  CORILud=CORILu
  CORILvd=CORILv
  PGRADud=forceu
  PGRADvd=forcev

end subroutine force_MBL


end module force_mbl_solv
