      MODULE TRIDIAG_MOD
!@sum TRIDIAG_MOD contains subroutine TRIDIAG
      PRIVATE
      PUBLIC TRIDIAG
      PUBLIC TRIDIAG_CYCLIC
      PUBLIC TRIDIAG_NEW

      Interface Tridiag_new
      Module Procedure tridiag
#ifndef OFFLINE_RUN
      Module procedure tridiag_2d_dist_new
      Module procedure tridiag_3d_dist_new
#endif
      End Interface

      interface
        subroutine dgtsv(n, nrhs, dl, d, du, b, ldb, info)
        integer, intent(in) :: n, nrhs, ldb
        integer, intent(out) :: info
        real*8, intent(inout) :: dl(*), d(*), du(*), b(ldb,*)
        end subroutine dgtsv
      end interface

      contains

      subroutine tridiag(a, b, c, r, u, n)
      implicit none
      integer, intent(in) :: n
      real*8, intent(in) :: a(n), b(n), c(n), r(n)
      real*8, intent(out) :: u(n)

      real*8 :: dl(n-1), d(n), du(n-1)
      real*8 :: bmat(n,1)
      integer :: info

      dl = a(2:n)
      d = b
      du = c(1:n-1)
      bmat(:,1) = r

      call dgtsv(n, 1, dl, d, du, bmat, n, info)
      if (info /= 0) call stop_model( 'dgtsv ', -info )
      u = bmat(:,1)
      end subroutine tridiag

      
      SUBROUTINE TRIDIAG_cyclic(A,B,C,R,U,N)
!@sum TRIDIAG_cyclic solves a cyclic tridiagonal matrix equation (A,B,C)U=R
!@+   having nonzero A(1) and C(N), using the Thomas algorithm and
!@+   Sherman-Morrison formula.
      IMPLICIT NONE
      INTEGER :: N         !@var N    dimension of arrays
      REAL*8  :: A(N)   !@var A    coefficients of u_i-1
      REAL*8  :: B(N)   !@var B    coefficients of u_i
      REAL*8  :: C(N)   !@var C    coefficients of u_i+1
      REAL*8  :: R(N)   !@var R    RHS vector
      INTENT(IN) :: N,A,B,C,R
      REAL*8, INTENT(OUT):: U(N)   !@var U    solution vector
      REAL*8 :: BET                !@var BET  work variable
      REAL*8 :: GAM(Size(A))       !@var GAM  work array
      REAL*8 :: Q(Size(A))         !@var Q    work array
      INTEGER :: J                 !@var J    loop variable
      REAL*8 :: QCOEFF,A1,B1,C1,R1

      A1 = A(1)
      B1 = B(1)
      C1 = C(1)
      R1 = R(1)
      IF(B1.EQ.1D0) THEN
        A1 = A1*2D0
        B1 = B1*2D0
        C1 = C1*2D0
        R1 = R1*2D0
      ENDIF
      BET=B1-1d0
      U(1)=R1/BET
      Q(1)=1d0/BET
      GAM(2)=C1/BET
      DO J=2,N-1
        BET=B(J)-A(J)*GAM(J)
        IF (BET.eq.0) call stop_model("TRIDIAG: DENOMINATOR = ZERO",255)
c        IF (BET.eq.0) stop 'BET==0'
        U(J)=(R(J)-A(J)*U(J-1))/BET
        Q(J)=     -A(J)*Q(J-1) /BET
        GAM(J+1) = C(J)/BET
      END DO
      J=N
        BET=B(J)-A(J)*GAM(J)-A1*C(N)
        IF (BET.eq.0) call stop_model("TRIDIAG: DENOMINATOR = ZERO",255)
c        IF (BET.eq.0) stop 'BET==0'
        U(J)=(R(J)-A(J)*U(J-1))/BET
        Q(J)=(C(N)-A(J)*Q(J-1))/BET
      DO J=N-1,1,-1
        U(J)=U(J)-GAM(J+1)*U(J+1)
        Q(J)=Q(J)-GAM(J+1)*Q(J+1)
      END DO
      BET=1d0+Q(1)+A1*Q(N)
        IF (BET.eq.0) call stop_model("TRIDIAG: DENOMINATOR = ZERO",255)
c        IF (BET.eq.0) stop 'BET==0'
      QCOEFF = (U(1)+A1*U(N))/BET
      DO J=1,N
        U(J) = U(J) - QCOEFF*Q(J)
      ENDDO
      RETURN
      END SUBROUTINE TRIDIAG_cyclic

#ifndef OFFLINE_RUN
      SUBROUTINE TRIDIAG_2D_DIST_new(A_dist, B_dist, C_dist, R_dist,
     &                           U_dist,grid, j_lower, j_upper )
!@sum  TRIDIAG  solves an array of tridiagonal matrix equations (A,B,C)U=R
!@auth Max Kelley
      USE DOMAIN_DECOMP_1D, ONLY : DIST_GRID
      USE DOMAIN_DECOMP_1D, ONLY : TRANSP,TRANSPOSE_COLUMN
      IMPLICIT NONE

      Type (DIST_GRID), Intent(IN) :: grid
      REAL*8, INTENT(INOUT) :: A_dist(:,grid%j_strt_halo:)
      REAL*8, INTENT(INOUT) :: B_dist(:,grid%j_strt_halo:)
      REAL*8, INTENT(INOUT) :: C_dist(:,grid%j_strt_halo:)
      REAL*8, INTENT(INOUT) :: R_dist(:,grid%j_strt_halo:)
      REAL*8, INTENT(OUT)   :: U_dist(:,grid%j_strt_halo:)
      INTEGER, INTENT(IN)   :: J_LOWER, J_UPPER

      REAL*8, ALLOCATABLE :: ABCR(:,:,:,:),ABCR_tr(:,:,:,:),
     &                       U_tr(:,:)

      REAL*8 :: BET
      REAL*8, allocatable :: BYBET(:),GAM(:,:) !@var BET,GAM  work arrays

      Integer :: i, j
      Integer :: N, N_i, IM


! Determine the size of the global arrays
      N = grid%jm_world
      n_i = grid%ni_loc
      allocate( bybet(n_i), gam(n_i,n) )

! Matrix size consistent with array size?
      if ( J_upper > N ) then
        print*, 'TRIDIAG: upper bound of matrix arrays is too large'
        print*, 'j_upper = ', j_upper, 'jm =', n, ' ( need j_upper<=jm)'
        call stop_model('TRIDIAG: j_upper argument too large', 255)
      end if

! Copy j-distributed A,B,C,R into single array to do all tranposes together
      IM = grid%im_world
      allocate(abcr(4,IM,grid%j_strt_halo:grid%j_stop_halo,1))
      do j=grid%j_strt,grid%j_stop
        do i=1,IM
          abcr(1,i,j,1) = a_dist(i,j)
          abcr(2,i,j,1) = b_dist(i,j)
          abcr(3,i,j,1) = c_dist(i,j)
          abcr(4,i,j,1) = r_dist(i,j)
        enddo
      enddo

! Allocate the transposed arrays
      allocate( abcr_tr(4,n_i,n,1) )
      allocate( u_tr(n_i,n) )

! Do the transpose of A,B,C,R
      call transpose_column(grid, abcr, abcr_tr)

! Solve
      do i=1,n_i
        BET=ABCR_tr(2,i,j_lower,1)
        IF (BET.eq.0) then
          print*, "TRIDIAG_2D_DIST: DENOM. = ZERO  i,j= ", i,' 1'
          stop
        end if
        BYBET(I) = 1D0/BET
        U_tr(i,j_lower)=ABCR_tr(4,i,j_lower,1)*BYBET(I)
      enddo
      do J=j_lower+1, j_upper
        do i=1,n_i
          GAM(I,J)=ABCR_tr(3,i,J-1,1)*BYBET(I)
          BET=ABCR_tr(2,i,J,1)-ABCR_tr(1,i,J,1)*GAM(I,J)
          IF (BET.eq.0) then
            print*, "TRIDIAG_2D_DIST: DENOM. = ZERO i,j= ", i, j
            stop
          end if
          BYBET(I) = 1D0/BET
          U_tr(i,J)=( ABCR_tr(4,i,J,1)-ABCR_tr(1,i,J,1)*U_tr(i,J-1) )
     &         *BYBET(I)
        enddo
      enddo
      do J=j_upper-1,j_lower,-1
        do i=1,n_i
          U_tr(i,J)=U_tr(i,J)-GAM(I,J+1)*U_tr(i,J+1)
        enddo
      enddo

! Transfer the solution to the j-distributed array
      call transp( grid, u_dist, u_tr, reverse=.true.)

      deallocate( abcr, abcr_tr, u_tr, bybet, gam )

      RETURN
      END SUBROUTINE TRIDIAG_2D_DIST_new

      SUBROUTINE TRIDIAG_3D_DIST_new(A_dist, B_dist, C_dist, R_dist,
     &                           U_dist,grid, j_lower, j_upper )
!@sum  TRIDIAG  solves an array of tridiagonal matrix equations (A,B,C)U=R
!@auth Max Kelley
      USE DOMAIN_DECOMP_1D, ONLY : DIST_GRID
      USE DOMAIN_DECOMP_1D, ONLY : TRANSP,TRANSPOSE_COLUMN
      IMPLICIT NONE

      Type (DIST_GRID), Intent(IN) :: grid
      REAL*8, INTENT(INOUT) :: A_dist(:,grid%j_strt_halo:,:)
      REAL*8, INTENT(INOUT) :: B_dist(:,grid%j_strt_halo:,:)
      REAL*8, INTENT(INOUT) :: C_dist(:,grid%j_strt_halo:,:)
      REAL*8, INTENT(INOUT) :: R_dist(:,grid%j_strt_halo:,:)
      REAL*8, INTENT(OUT)   :: U_dist(:,grid%j_strt_halo:,:)
      INTEGER, INTENT(IN)   :: J_LOWER, J_UPPER

      REAL*8, ALLOCATABLE :: ABCR(:,:,:,:),ABCR_tr(:,:,:,:),
     &                       U_tr(:,:,:)

      REAL*8 :: BET
      REAL*8, allocatable :: BYBET(:),GAM(:,:) !@var BET,GAM  work arrays

      Integer :: i, j, l
      Integer :: N, N_i, N_l, IM


! Determine the size of the global arrays
      N = grid%jm_world
      n_i = grid%ni_loc
      n_l = size(A_dist,3)
      allocate( bybet(n_i), gam(n_i,n) )

! Matrix size consistent with array size?
      if ( J_upper > N ) then
        print*, 'TRIDIAG: upper bound of matrix arrays is too large'
        print*, 'j_upper = ', j_upper, 'jm =', n, ' ( need j_upper<=jm)'
        call stop_model('TRIDIAG: j_upper argument too large', 255)
      end if

! Copy j-distributed A,B,C,R into single array to do all tranposes together
      IM = grid%im_world
      allocate(abcr(4,IM,grid%j_strt_halo:grid%j_stop_halo,n_l))
      do l=1,n_l
        do j=grid%j_strt,grid%j_stop
          do i=1,IM
            abcr(1,i,j,l) = a_dist(i,j,l)
            abcr(2,i,j,l) = b_dist(i,j,l)
            abcr(3,i,j,l) = c_dist(i,j,l)
            abcr(4,i,j,l) = r_dist(i,j,l)
          enddo
        enddo
      enddo

! Allocate the transposed arrays
      allocate( abcr_tr(4,n_i,n,n_l) )
      allocate( u_tr(n_i,n,n_l) )

! Do the transpose of A,B,C,R
      call transpose_column(grid, abcr, abcr_tr)

! Solve
      do l=1,n_l
      do i=1,n_i
        BET=ABCR_tr(2,i,j_lower,l)
        IF (BET.eq.0) then
          print*, "TRIDIAG_3D_DIST: DENOM. = ZERO  i,j= ", i,' 1'
          stop
        end if
        BYBET(I) = 1D0/BET
        U_tr(i,j_lower,l)=ABCR_tr(4,i,j_lower,l)*BYBET(I)
      enddo
      do J=j_lower+1, j_upper
        do i=1,n_i
          GAM(I,J)=ABCR_tr(3,i,J-1,l)*BYBET(I)
          BET=ABCR_tr(2,i,J,l)-ABCR_tr(1,i,J,l)*GAM(I,J)
          IF (BET.eq.0) then
            print*, "TRIDIAG_3D_DIST: DENOM. = ZERO i,j= ", i, j
            stop
          end if
          BYBET(I) = 1D0/BET
          U_tr(i,J,l)=(ABCR_tr(4,i,J,l)-ABCR_tr(1,i,J,l)*U_tr(i,J-1,l))
     &         *BYBET(I)
        enddo
      enddo
      do J=j_upper-1,j_lower,-1
        do i=1,n_i
          U_tr(i,J,l)=U_tr(i,J,l)-GAM(I,J+1)*U_tr(i,J+1,l)
        enddo
      enddo
      enddo

! Transfer the solution to the j-distributed array
      call transp( grid, u_dist, u_tr, reverse=.true.)

      deallocate( abcr, abcr_tr, u_tr, bybet, gam )

      RETURN
      END SUBROUTINE TRIDIAG_3D_DIST_new
#endif



      END MODULE TRIDIAG_MOD
