!===============================================================================
! Copyright 2021-2022 Intel Corporation.
!
! This software and the related documents are Intel copyrighted  materials,  and
! your use of  them is  governed by the  express license  under which  they were
! provided to you (License).  Unless the License provides otherwise, you may not
! use, modify, copy, publish, distribute,  disclose or transmit this software or
! the related documents without Intel's prior written permission.
!
! This software and the related documents  are provided as  is,  with no express
! or implied  warranties,  other  than those  that are  expressly stated  in the
! License.
!===============================================================================

! Content:
! A simple example of batch double-precision complex-to-complex out-of-place
! 1D FFTs using the Intel(R) oneAPI Math Kernel Library (oneMKL) DFTI interface
! with OpenMP offload
!
!*****************************************************************************

include "mkl_dfti_omp_offload.f90"

program dp_complex_1d_batch_outofplace

  ! Rename-on-use: the module's own DFTI_DOUBLE is bound to the throwaway local
  ! name "forget", and the module's DFTI_DOUBLE_R is made visible here under
  ! the name DFTI_DOUBLE.
  use MKL_DFTI_OMP_OFFLOAD, forget => DFTI_DOUBLE, DFTI_DOUBLE => DFTI_DOUBLE_R
  use omp_lib, ONLY : omp_get_num_devices
  use, intrinsic :: ISO_C_BINDING

  ! Every entity in this program and in its internal procedures is declared
  ! explicitly; forbid implicit typing so that misspelled names are rejected
  ! at compile time instead of silently becoming new variables.
  implicit none

  ! Size of 1D transform
  integer, parameter :: N = 135
  
  ! Number of transforms
  integer, parameter :: M = 100
  
  ! Total number of elements in each data array (M transforms of length N)
  integer, parameter :: numPoints = N*M
  
  ! Arbitrary harmonic used to verify FFT
  integer, parameter :: H = -N/2

  ! Working precision is double precision
  integer, parameter :: WP = selected_real_kind(15,307)

  ! Execution status (0 means success); ignored_status receives the result of
  ! the final descriptor release so that it cannot overwrite a real error code
  integer :: status = 0, ignored_status

  ! The input data array
  complex(WP), allocatable :: x (:)
  ! The output data array
  complex(WP), allocatable :: y (:)

  ! DFTI descriptor handle
  type(DFTI_DESCRIPTOR), POINTER :: hand

  hand => null()

  print *,"Example dp_complex_1d_batch_outofplace"
  print *,"Batch forward and backward double-precision",        &
    &      " complex-to-complex out-of-place 1D transform"
  print *,"Configuration parameters:"
  print *,"DFTI_PRECISION      = DFTI_DOUBLE"
  print *,"DFTI_FORWARD_DOMAIN = DFTI_COMPLEX"
  print *,"DFTI_DIMENSION      = 1"
  print '(" DFTI_NUMBER_OF_TRANSFORMS = ",I0)', M
  print '(" DFTI_LENGTHS        = /",I0,"/")', N

  ! Every step below jumps to label 999 on a non-zero status; 999 reports the
  ! status and then joins the common clean-up code at label 100.

  print *,"Create DFTI descriptor"
  status = DftiCreateDescriptor(hand, DFTI_DOUBLE, DFTI_COMPLEX, 1, N)
  if (0 /= status) goto 999
  
  print *,"Set DFTI descriptor for number of batch out-of-place transforms"
  status = DftiSetValue(hand, DFTI_NUMBER_OF_TRANSFORMS, M)
  if (0 /= status) goto 999

  ! Consecutive transforms are stored back to back, so the distance between
  ! them is N elements in both the output and the input array.
  print *,"Set DFTI descriptor for size of individual transforms (output)"
  status = DftiSetValue(hand, DFTI_OUTPUT_DISTANCE, N)
  if (0 /= status) goto 999
  
  print *,"Set DFTI descriptor for size of individual transforms (input)"
  status = DftiSetValue(hand, DFTI_INPUT_DISTANCE, N)
  if (0 /= status) goto 999
  
  print *,"Set DFTI descriptor for out-of-place computation"
  status = DftiSetValue(hand, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
  if (0 /= status) goto 999

  ! When ONEMKL_USE_OPENMP_VERSION is defined and >= 202011 the "dispatch"
  ! construct is used; otherwise the "target variant dispatch" form, which
  ! needs an explicit end directive, is used. The same selection is repeated
  ! for both compute calls below.
  print *,"Commit DFTI descriptor"
#if defined(ONEMKL_USE_OPENMP_VERSION) && (ONEMKL_USE_OPENMP_VERSION >= 202011)
  !$omp dispatch
#else
  !$omp target variant dispatch
#endif
  status = DftiCommitDescriptor(hand)
#if !defined(ONEMKL_USE_OPENMP_VERSION) || (ONEMKL_USE_OPENMP_VERSION < 202011)
  !$omp end target variant dispatch
#endif
  if (0 /= status) goto 999

  print *,"Allocate array for input and output data"
  allocate ( x(numPoints), STAT = status)
  if (0 /= status) goto 999
  allocate ( y(numPoints), STAT = status)
  if (0 /= status) goto 999

  print *,"Initialize input for forward transforms"
  call init(x, M, N, H)

  ! Forward transform: x is mapped to the device, y is mapped back from it.
  print *,"Compute forward transforms"
  !$omp target data map(to:x) map(from:y)
#if defined(ONEMKL_USE_OPENMP_VERSION) && (ONEMKL_USE_OPENMP_VERSION >= 202011)
  !$omp dispatch
#else
  !$omp target variant dispatch use_device_ptr(x, y)
#endif
  status = DftiComputeForward(hand, x, y)
#if !defined(ONEMKL_USE_OPENMP_VERSION) || (ONEMKL_USE_OPENMP_VERSION < 202011)
  !$omp end target variant dispatch
#endif
  !$omp end target data
  if (0 /= status) goto 999

  print *,"Verify the result"
  status = verify(y, M, N, H)
  if (0 /= status) goto 999

  ! The backward pass reuses the arrays with their roles swapped: y is the
  ! input (initialized with harmonic -H) and x receives the result.
  print *,"Initialize input for backward transforms"
  call init(y, M, N, -H)

  print *,"Compute backward transforms"
  !$omp target data map(to:y) map(from:x)
#if defined(ONEMKL_USE_OPENMP_VERSION) && (ONEMKL_USE_OPENMP_VERSION >= 202011)
  !$omp dispatch
#else
  !$omp target variant dispatch use_device_ptr(x, y)
#endif
  status = DftiComputeBackward(hand, y, x)
#if !defined(ONEMKL_USE_OPENMP_VERSION) || (ONEMKL_USE_OPENMP_VERSION < 202011)
  !$omp end target variant dispatch
#endif
  !$omp end target data
  if (0 /= status) goto 999

  print *,"Verify the results"
  status = verify(x, M, N, H)
  if (0 /= status) goto 999

  ! Common clean-up, reached both on success and from the error handler
100 continue

  print *,"Release the DFTI descriptor"
  ignored_status = DftiFreeDescriptor(hand)

  if (allocated(x)) then
      print *,"Deallocate input data array"
      deallocate(x)
  endif
  if (allocated(y)) then
      print *,"Deallocate output data array"
      deallocate(y)
  endif

  ! Report the verdict and end the program with exit code 0 or 1
  if (status == 0) then
    print *,"TEST PASSED"
    call exit(0)
  else
    print *,"TEST FAILED"
    call exit(1)
  endif

  ! Error handler: only reachable through "goto 999"
999 print '("  Error, status = ",I0)', status
  goto 100

contains

  ! Compute mod(k*l, m) without overflowing the intermediate product.
  !
  ! The product k*l is formed in an integer kind that holds at least 18
  ! decimal digits, so it cannot overflow the default integer range before
  ! the remainder is taken. The remainder follows the MOD intrinsic (it has
  ! the sign of the product k*l) and is returned as a real of kind WP.
  pure real(WP) function moda(k,l,m)
    implicit none
    integer, intent(in) :: k,l,m
    ! Wide integer kind used only for the intermediate product
    integer, parameter :: I8 = selected_int_kind(18)
    integer(I8) :: prod
    prod = int(k, I8) * int(l, I8)
    ! The standard requires both MOD arguments to have the same kind
    moda = real(mod(prod, int(m, I8)), WP)
  end function moda

  ! Fill x with M identical back-to-back batches of length N, each holding
  ! the harmonic H scaled by 1/N:
  !   x((j-1)*N + k) = exp(2*pi*i * mod((k-1)*H, N) / N) / N,  k = 1..N, j = 1..M
  !
  ! x : array to fill; only its first M*N elements are written
  ! M : number of batches
  ! N : length of each batch
  ! H : harmonic index (may be negative)
  subroutine init(x, M, N, H)
    implicit none
    integer, intent(in) :: M, N, H
    ! intent(inout) so that any elements beyond M*N keep their values
    complex(WP), intent(inout) :: x(:)

    integer :: j, k
    real(WP) :: rn
    complex(WP), parameter :: I_TWOPI = (0.0_WP,6.2831853071795864769_WP)

    ! Nothing to write when there are no batches
    if (M < 1) return

    ! Convert N once, in working precision. CMPLX(N) without a KIND argument
    ! would give a default-kind complex value instead.
    rn = real(N, WP)

    ! The values do not depend on the batch index, so compute one batch ...
    do k = 1, N
      x(k) = exp(I_TWOPI*moda(k-1, H, N)/rn)/rn
    end do

    ! ... and replicate it into the remaining M-1 batches
    do j = 2, M
      x((j-1)*N + 1 : j*N) = x(1:N)
    end do
  end subroutine init

  ! Check that each of the M back-to-back batches of length N in x is a unit
  ! peak: the element whose zero-based position p satisfies mod(p-H, N) == 0
  ! must be 1 and every other element must be 0, within the threshold errthr.
  !
  ! x : data to check; its first M*N elements are read
  ! M : number of batches
  ! N : length of each batch
  ! H : harmonic index at which the peak is expected
  !
  ! Returns 0 when every element is within the threshold and 100 on the first
  ! element that is not (or when x is too short). Progress and the first
  ! failure are reported on standard output.
  integer function verify(x, M, N, H)
    implicit none
    integer, intent(in) :: M, N, H
    complex(WP), intent(in) :: x(:)

    integer :: j, k
    real(WP) :: err, errthr, maxerr
    complex(WP) :: res_exp, res_got

    ! Refuse to index past the end of x
    if (size(x) < M*N) then
      print *," Verification FAILED: array has fewer than M*N elements"
      verify = 100
      return
    end if

    ! Note, this simple error bound doesn't take into account error of
    ! input data
    errthr = 5.0_WP * log(real(N, WP)) / log(2.0_WP) * epsilon(1.0_WP)
    print '("  Check if err is below errthr ",G10.3)', errthr

    maxerr = 0.0_WP
    do j = 1, M
      do k = 1, N
        if (mod(k-1-H,N)==0) then
          res_exp = 1.0_WP
        else
          res_exp = 0.0_WP
        end if
        res_got = x((j-1)*N + k)
        err = abs(res_got - res_exp)
        maxerr = max(err,maxerr)
        ! Written as .not.(a < b) so that a NaN error also counts as a failure
        if (.not.(err < errthr)) then
          ! The $ edit descriptor is a compiler extension kept from the
          ! original example; it is used to keep the report on one line.
          print '("  x(",I0,"): ",$)', (j-1)*N + k
          print '(" expected (",G24.17,", ",G24.17,"),",$)', res_exp
          print '(" got (",G24.17,", ",G24.17,"),",$)', res_got
          print '(" err ",G10.3)', err
          print *," Verification FAILED"
          verify = 100
          return
        end if
      end do
    end do
    print '("  Verified,  maximum error was ",G10.3)', maxerr
    verify = 0
  end function verify

end program dp_complex_1d_batch_outofplace
