|  | >Can you give us some declarations?  For example, are tim(*,*) and
>tau(*,*) arrays or functions?  What datatype do they use?  What are
>the expected values of nsub, ntau, ntimept(*), nwldet(*)?  
tim and tau are real*4 arrays
yfit is a real array (because I use -r8 as a comp. option it will be real*8)
>At first glance, it seems you would do better by moving the j loop to
>be innermost, so that exp(...k,m,i,n) can be moved outside the loop.
>This looks like it may have more payoff (~100 cycles per call) than
>reordering the loops for optimal memory behavior.
GREAT ! the elapsed time for the run drops from 54" to 34" !!!
>What operating system and version are you running this program on?  On Unix,
>several versions back, there was a severe performance problem when doing
>exp on some (very large negative?) values.
digital unix 4.0 564  ; f77 v4.1-92
I use f77 -O5 -fast -tune host -speculate_all -r8
I also did manual blocking on Bill's code, by introducing additional
loops to have the l,i,n,j,k loops varying on a range sized nb, and I found 
the best result (about 28" ) using nb=20 and compiling with
f77 -O4 -fast -tune host  -r8
(Here -O5 slows it down to one minute, perhaps because it interferes
with the manual unrolling ?? )
By the way I was kind of surprised that the min. lies on nb=20;
it is an nb**5 domain (l, k, j, i, n) there are 2 doubles (crap and yfit )
and 4 real*4 (tau, tim, das ). To fit D-cache it should go like this :
      8000 = (8+4+4+4+8)* (nb**5) 
which should give nb=3 ???
Here follows the modified subroutine (i.e. Bill's idea+blocking )
Thanks to all of you, in particular Bill for the grand idea!
-Joseph
	subroutine calc_decay
c
c include global values
c
	include	'kfdimdef.inc'
	include	'kfgbldef.inc'
c
c define variables
c
c        real*8 crap(DIM_TAU,DIM_SUB,DIM_TIMEPNT)
         real*8 crap
	integer*4	i, j, k, l, m, n
c
      nb =20 
      do l = 1, nwlexc
            m = dataset(2,l)
       do k=1,ntimepnt(m)
          do j=1,nwldet(l)
                  yfit(k,j,l) = 0.0
          end do
       end do
      end do 
c
c       
      do ll=1,nwlexc,nb
       do l=ll,min(nwlexc,ll+nb-1)
            m = dataset(2,l)
         do kk=1, ntimepnt(m),nb
          do nn=1,nsub,nb
           do ii=1,ntau,nb 
            do jj=1,nwldet(l),nb
             do k=kk,min( ntimepnt(m),kk+nb-1)
               do n=nn,min(nsub,nn+nb-1)
                do i=ii,min(ntau,ii+nb-1)
                        crap=exp(-tim(k,m)/tau(i,n))
                  do j = jj,min(nwldet(l),jj+nb-1)
			    yfit(k,j,l) = yfit(k,j,l)
     1                 + das(j,i,l,n) * crap 
                  end do                               ! end do j
                end do                                 ! end do i
               end do                                  ! end do n
             end do                                    ! end do k
            end do                                     !end do jj
           end do                                        ! end do ii
          end do                                   ! end do nn
         end do                                   ! end do kk
       end do                                   ! end do l
      end do                                       ! end do ll
c
	return
	end
c
c  Filename:		kfgbldef.inc
c  Author:		I. Martin
c  Date:		V1.0	01-dec-1995
c  Description:		global values
c
c -----
c
c define variables in common blocks
c
	integer*4	ierr, lenlis
	character*80	fillis, fillst
	integer*4	mode, nabs, nflu, dataset(DIM_SASTYP,DIM_WLEXC),
	1		nwlexc, nwldet(DIM_WLEXC), mwldet(DIM_SASTYP), ntau,
	1		nsub, nsas(DIM_TAU,DIM_SASTYP),
	1		ngf(DIM_TAU,DIM_SASTYP), ntimepnt(DIM_SASTYP)
	real*4		wlexc(DIM_WLEXC),
	1		wldet(DIM_WLDET*DIM_WLEXC,DIM_SASTYP)
	integer*4	inddas(DIM_WLDET,DIM_WLEXC,DIM_SASTYP)
	real		amplinpsum(DIM_WLDET,DIM_WLEXC)
	logical*4	exccalc
	real*4		excfacdec(DIM_WLDET,DIM_WLEXC),
	1		prob(DIM_TAU), problb(DIM_TAU), probub(DIM_TAU)
	integer*4	probfit(DIM_TAU)
	real*4		ratmat(DIM_TAU,DIM_TAU,DIM_SUB),
	1		ratlb(DIM_TAU,DIM_TAU), ratub(DIM_TAU,DIM_TAU)
	integer*4	ratfit(DIM_TAU,DIM_TAU)
	real*4		excfac(DIM_WLEXC), excrel(DIM_TAU,DIM_WLEXC),
	1		excmat(DIM_TAU,DIM_WLEXC,DIM_SUB),
	1		exclb(DIM_WLEXC), excub(DIM_WLEXC)
	integer*4	excfit(DIM_WLEXC)
	integer*4	sasmode
	real*4		gfamp(DIM_GF,DIM_TAU,DIM_SASTYP),
	1		gfpos(DIM_GF,DIM_TAU,DIM_SASTYP),
	1		gfhw(DIM_GF,DIM_TAU,DIM_SASTYP),
	1		gfint(DIM_TAU,DIM_SASTYP),	! intercept and
	1		gfsl(DIM_TAU,DIM_SASTYP),	! slope for subground
	1		gfamplb(DIM_GF,DIM_TAU,DIM_SASTYP),
	1		gfposlb(DIM_GF,DIM_TAU,DIM_SASTYP),
	1		gfhwlb(DIM_GF,DIM_TAU,DIM_SASTYP),
	1		gfintlb(DIM_TAU,DIM_SASTYP),
	1		gfsllb(DIM_TAU,DIM_SASTYP),
	1		gfampub(DIM_GF,DIM_TAU,DIM_SASTYP),
	1		gfposub(DIM_GF,DIM_TAU,DIM_SASTYP),
	1		gfhwub(DIM_GF,DIM_TAU,DIM_SASTYP),
	1		gfintub(DIM_TAU,DIM_SASTYP),
	1		gfslub(DIM_TAU,DIM_SASTYP)
	integer*4	gfampfit(DIM_GF,DIM_TAU,DIM_SASTYP),
	1		gfposfit(DIM_GF,DIM_TAU,DIM_SASTYP),
	1		gfhwfit(DIM_GF,DIM_TAU,DIM_SASTYP),
	1		gfintfit(DIM_TAU,DIM_SASTYP),
	1		gfslfit(DIM_TAU,DIM_SASTYP)
	real*4		gfbeg(DIM_GF,DIM_TAU,DIM_SASTYP),
	1		gfend(DIM_GF,DIM_TAU,DIM_SASTYP),
	1		gfbegl(DIM_TAU,DIM_SASTYP), gfendl(DIM_TAU,DIM_SASTYP),
	1		sas(DIM_WLDET*DIM_WLEXC,DIM_TAU,DIM_SASTYP),
	1		saslb(DIM_TAU,DIM_SASTYP), sasub(DIM_TAU,DIM_SASTYP)
	integer*4	sasfit(DIM_TAU,DIM_SASTYP), g
 | 
|  | I had the best result  using the following silicon-graphics 
code, compiled with -O5 -fast -tune host -speculate-all -r8
It is about 20 "
------------------------------------------------
	subroutine calc_decay
c
c include global values
c
	include	'kfdimdef.inc'
	include	'kfgbldef.inc'
c
c define variables
c
	integer*4	i, j, k, l, m, n
c
        real*4 tau_inv(DIM_SUB,DIM_TAU)
        real*4 exp_vec(DIM_SASTYP,DIM_TIMEPNT,DIM_SUB,DIM_TAU)
        real*4 exp_arg(DIM_TIMEPNT,DIM_SUB,DIM_TAU)
        do n = 1, nsub
           do i = 1, ntau
              tau_inv(n,i) = 1.0/tau(i,n)
           enddo
        enddo
        do l = 1, nwlexc
            m = dataset(2,l)
            do k = 1, ntimepnt(m)
                do n = 1, nsub
                    do i = 1, ntau
                        exp_vec(m,k,n,i) = exp(-tim(k,m)*tau_inv(n,i))
                    end do
                end do
            end do
        end do
	do l = 1, nwlexc
	    m = dataset(2,l)
	    do j = 1, nwldet(l)
		do k = 1, ntimepnt(m)
		    yfit(k,j,l) = 0.0
Cum		    do n = 1, nsub
Cum			do i = 1, ntau
Cum			    yfit(k,j,l) = yfit(k,j,l)
Cum	1			+ das(j,i,l,n) * exp(-tim(k,m)/tau(i,n))
Cum			end do
Cum		    end do
		    do n = 1, nsub
			do i = 1, ntau
			    yfit(k,j,l) = yfit(k,j,l)
	1			+ das(j,i,l,n) * exp_vec(m,k,n,i)
			end do
		    end do
		end do
	    end do
	end do
c
	return
	end
 | 
|  | 
Joseph,
I'd be curious to know how the code below performs. It does essentially the
same thing your best code does, but uses less memory and accesses temporary
data arrays contiguously.
Also, if there's any way you can redefine "das" so that memory is accessed
contiguously, [ make das(j,i,l,n) into das(i,n,j,l) ] do it. It can only
help.
	- Dwight -
--------------------------------------------------------------------------------
	subroutine calc_decay
c
c include global values
c
	include	'kfdimdef.inc'
	include	'kfgbldef.inc'
c
c define variables
c
	integer*4	i, j, k, l, m, n
c
        real*4 tau_inv(DIM_TAU,DIM_SUB)
        real*4 exp_vec(DIM_TAU,DIM_SUB)
        real*4 temp
c
        do n = 1, nsub
           do i = 1, ntau
              tau_inv(i,n) = 1.0/tau(i,n)
           enddo
        enddo
        do l = 1, nwlexc
            m = dataset(2,l)
            do k = 1, ntimepnt(m)
               do n = 1, nsub
                  do i = 1, ntau
                     exp_vec(i,n) = exp(-tim(k,m)*tau_inv(i,n))
                  end do
               end do
	       do j = 1, nwldet(l)
		    temp = 0.0
		    do n = 1, nsub
			do i = 1, ntau
			    temp = temp + das(j,i,l,n) * exp_vec(i,n)
			end do
		    end do
		    yfit(k,j,l) = temp
		end do
	    end do
	end do
c
	return
	end
 |