########################################################################
# ISPACK FORTRAN SUBROUTINE LIBRARY FOR SCIENTIFIC COMPUTING
# Copyright (C) 1998--2017 Keiichi Ishioka <ishioka@gfd-dennou.org>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
# 
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA.
########################################################################
.text
.globl fvzqf0_
.globl _fvzqf0_
#-----------------------------------------------------------------------
# fvzqf0_(NHH, X)          (the same entry is also exported as _fvzqf0_)
#
# In-place, scaled four-input butterfly pass over X (AVX + FMA3, AT&T
# syntax, System V AMD64 register arguments).
#
# In:    rdi = address of a 32-bit integer NHH
#        rsi = address of X, an array of doubles.  All accesses use
#              vmovapd on 32-byte vectors, so X must be 32-byte aligned.
# Out:   X is overwritten; nothing is returned in registers.
#        If NHH <= 0 the routine returns without touching X.
#
# Layout: X is treated as four consecutive blocks of NHH*64 bytes each,
# called a, b, c, d below (byte offsets 0, S, 2S, 3S with S = NHH*64).
# Every 64-byte group inside a block is one 32-byte vector followed by
# another; following the R/I tags of the original comments these are
# four real parts followed by the four matching imaginary parts.
#
# With s = 2/(8*NHH), each group position is replaced by
#        block a  <-  s*( a + b + c + d )
#        block b  <-  s*( a - b + c - d )
#        block c  <-  s*( (a - c) - i*(b - d) )
#        block d  <-  s*( (a - c) + i*(b - d) )
#
# Clobbers: rax, rcx, rdx, rsi, rdi, ymm0-ymm7, ymm9, ymm11, ymm12,
#           flags, and the 8 bytes at -8(%rsp).  The routine makes no
#           calls, so that slot lies in the System V red zone.
#
# Only VEX-encoded instructions are used and vzeroupper is issued before
# returning, so no AVX/SSE transition cost is incurred here or left
# behind for a caller that uses legacy SSE encodings.
#-----------------------------------------------------------------------
fvzqf0_:
_fvzqf0_:
	movl	(%rdi), %edi		# edi = NHH; the 32-bit write clears the upper half of rdi
	testl	%edi, %edi
	jle	2f			# NHH <= 0: the loop below could never reach its end pointer

	vbroadcastsd C2(%rip), %ymm11	# ymm11 = 2.0 in all four lanes

	shlq	$3, %rdi		# rdi = NHH*8
	vxorpd	%xmm0, %xmm0, %xmm0	# break the false dependency of the conversion on old xmm0
	vcvtsi2sdq %rdi, %xmm0, %xmm0	# xmm0 = (double)(NHH*8); 64-bit source so NHH*8 cannot wrap

	vmovsd	C2(%rip), %xmm1		# xmm1 = 2.0
	vdivsd	%xmm0, %xmm1, %xmm1	# xmm1 = s = 2.0/(NHH*8)
	vmovsd	%xmm1, -8(%rsp)		# spill s so it can be broadcast from memory; the
	vbroadcastsd -8(%rsp), %ymm12	# register-source vbroadcastsd would need AVX2.  ymm12 = s

	shlq	$3, %rdi		# rdi = S = NHH*64, the byte size of one block

	lea	(%rdi,%rdi), %rcx	# rcx = 2*S  (offset of block c)
	lea	(%rdi,%rcx), %rdx	# rdx = 3*S  (offset of block d)
	lea	(%rdi,%rsi), %rax	# rax = X + S, end of block a = loop end pointer

# Loop invariant: rsi points at the current 64-byte group of block a,
# and X <= rsi < rax.  Each iteration handles that group and the groups
# at the same position in blocks b, c and d.
1:
	vmovapd	  (%rsi),      %ymm0	# aR
	vmovapd	32(%rsi),      %ymm1	# aI
	vmovapd	  (%rsi,%rcx), %ymm2	# cR
	vmovapd	32(%rsi,%rcx), %ymm3	# cI
	vmovapd	  (%rsi,%rdi), %ymm4	# bR
	vmovapd	32(%rsi,%rdi), %ymm5	# bI
	vmovapd	  (%rsi,%rdx), %ymm6	# dR
	vmovapd	32(%rsi,%rdx), %ymm7	# dI

	# Scale a and b explicitly.  c and d are scaled inside the
	# fused multiply-subtracts that follow, saving four multiplies.
	vmulpd	%ymm12, %ymm0, %ymm0	# ymm0 = s*aR
	vmulpd	%ymm12, %ymm1, %ymm1	# ymm1 = s*aI
	vmulpd	%ymm12, %ymm4, %ymm4	# ymm4 = s*bR
	vmulpd	%ymm12, %ymm5, %ymm5	# ymm5 = s*bI

	# Difference first, then sum recovered as 2*x - (x - y) = x + y.
	vfnmadd213pd %ymm0, %ymm12, %ymm2	# ymm2 = s*aR - s*cR = s*(aR - cR)
	vfnmadd213pd %ymm1, %ymm12, %ymm3	# ymm3 = s*(aI - cI)
	vfmsub213pd  %ymm2, %ymm11, %ymm0	# ymm0 = 2*s*aR - ymm2 = s*(aR + cR)
	vfmsub213pd  %ymm3, %ymm11, %ymm1	# ymm1 = s*(aI + cI)

	vfnmadd213pd %ymm4, %ymm12, %ymm6	# ymm6 = s*(bR - dR)
	vfnmadd213pd %ymm5, %ymm12, %ymm7	# ymm7 = s*(bI - dI)
	vfmsub213pd  %ymm6, %ymm11, %ymm4	# ymm4 = s*(bR + dR)
	vfmsub213pd  %ymm7, %ymm11, %ymm5	# ymm5 = s*(bI + dI)

	# Block b <- s*((a + c) - (b + d))
	vsubpd	%ymm4, %ymm0, %ymm9		# ymm9 = ymm0 - ymm4
	vmovapd	%ymm9,   (%rsi,%rdi)		# store bR
	vsubpd	%ymm5, %ymm1, %ymm5		# ymm5 = ymm1 - ymm5
	vmovapd	%ymm5, 32(%rsi,%rdi)		# store bI

	# Block a <- s*((a + c) + (b + d)), again via 2*x - (x - y)
	vfmsub213pd %ymm9, %ymm11, %ymm0	# ymm0 = 2*ymm0 - ymm9
	vmovapd	%ymm0,   (%rsi)			# store aR
	vfmsub213pd %ymm5, %ymm11, %ymm1	# ymm1 = 2*ymm1 - ymm5
	vmovapd	%ymm1, 32(%rsi)			# store aI

	# Block d <- s*((a - c) + i*(b - d))
	vsubpd	%ymm7, %ymm2, %ymm9		# real: s*((aR - cR) - (bI - dI))
	vmovapd	%ymm9,   (%rsi,%rdx)		# store dR
	vaddpd	%ymm6, %ymm3, %ymm7		# imag: s*((aI - cI) + (bR - dR))
	vmovapd	%ymm7, 32(%rsi,%rdx)		# store dI

	# Block c <- s*((a - c) - i*(b - d)), from the block d results
	vfmsub213pd %ymm9, %ymm11, %ymm2	# real: 2*ymm2 - ymm9 = s*((aR - cR) + (bI - dI))
	vmovapd	%ymm2,   (%rsi,%rcx)		# store cR
	vfmsub213pd %ymm7, %ymm11, %ymm3	# imag: 2*ymm3 - ymm7 = s*((aI - cI) - (bR - dR))
	vmovapd	%ymm3, 32(%rsi,%rcx)		# store cI

	addq	$64, %rsi		# advance to the next 64-byte group
	cmpq	%rsi, %rax
	jne	1b			# until rsi reaches the end of block a

	vzeroupper			# leave the upper ymm halves clean for SSE-encoded callers
2:
	ret

C2:	# the double-precision constant 2.0 (low word first)
	.long   0x00000000,0x40000000
	
