o
    i4                     @   s   d Z ddlZddlmZmZ ddlmZ edejdejdejd	ejfd
dZedejdejfddZ	edejdejdejdejdejf
ddZ
edejdejdejdejdejf
ddZG dd dejjZejZdS )ao  
Fused Attention
===============
This is a Triton implementation of the Flash Attention algorithm
(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf)

Sequence Parallel implementation inspired by HazyResearch
(see https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py)
    N   )cdivjit)languageBLOCK_MBLOCK_DMODELBLOCK_N	IS_CAUSALc           6   	   C   s  t d}t d}|| }t j| | ||f||	f|| df||fdd} t j|| ||f||fd||fdd}!t j|| ||f||fd||fdd}"|| t d| }#t d|}$t j|gt jdtd }%t j|gt jd}&t j||gt jd}'|d	 }(t | })|)|( |j	j
})d}*|r|d | n|}+t|*|+|D ]},t |!}-t |"}.t j||gt jd}/|rt |#d d d f |,|$d d d f  k|/td
}/|/t j|)|-dd7 }/t |%t |/d}0t j|%|0 }1t j|/|0d d d f  }2|&d |1 }3|'|3d d d f 9 }'|'t j|2|j	j
|.dd7 }'|&|1 t |2d }&|0}%t |!d|f}!t |"|df}"q|'|&d d d f  }'|||  |# }4t |4|%t j|&  t j|| ||f||f|| df||fdd}5t |5|'|j	j
 d S )Nr      )r
   r   )baseshapestridesoffsetsZblock_shapeorder)r   r   )r   r
   dtypeinf/ldG?-infTZ
allow_tf32)tl
program_idZmake_block_ptrarangezerosfloat32floatloadtor   
element_tyrangewheredotmaximummaxmathexp2sumadvancestorelog2)6QKVsm_scaleLOut	stride_qz	stride_qh	stride_qm	stride_qk	stride_kz	stride_kh	stride_kn	stride_kk	stride_vz	stride_vh	stride_vk	stride_vnZ	stride_ozZ	stride_ohZ	stride_omZ	stride_onZHN_CTXr   r   r   r	   start_moff_hzZ
qvk_offsetZQ_block_ptrZK_block_ptrZV_block_ptroffs_moffs_nZm_il_iaccqk_scaleqlohistart_nkvqkZm_i_newalphapZ	acc_scalel_ptrsZO_block_ptr rP   i/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/triton/ops/flash_attention.py_fwd_kernel   s   


	


2
rR   D_HEADc           
      C   s   t d| t d| }t d|}t | |d d d f |  |d d d f  t j}t ||d d d f |  |d d d f  t j}t j|| dd}	t || |	 d S )Nr   r
   )Zaxis)r   r   r   r   r   r   r&   r(   )
r/   DODeltar   rS   Zoff_mZoff_nododeltarP   rP   rQ   _bwd_preprocessr   s   66rY   SEQUENCE_PARALLELCAUSALc$           A      C   s  |"r|| tj| 7 }|#r|| }$nd}$|$td| }%|| td| }&td|!}'td| }(| |%d d d f | |(d d d f |   })||&d d d f | |(d d d f |   }*||&d d d f | |(d d d f |   }+||%d d d f | |(d d d f |   },||%d d d f | |(d d d f |   }-|||  }.|
||  }/tj|| gtjd}0tj|| gtjd}1t|*}2t|+}3t|$|| |D ]}4|4|' }5t|)}6|#rt|5d d d f |&d d d f kt	dt	d}7n
tj||!gtjd}7|7t
|6t|27 }7|7|9 }7t|/|5 }8tj|7|8d d d f  }9t|,}:|0tj
t|9 | jj|:dd7 }0t|.|5 };tj
|:t|3dd}<|9|<|;d d d f   |  | jj}=|1tj
t|=|6dd7 }1|"st|-}>|>tj
|=|2dd7 }>t|-|> n|"rttj
t|2t|=dd}>t|-|> |-|| 7 }-|)|| 7 })|,|| 7 },q|	|&d d d f | |(d d d f |   }?||&d d d f | |(d d d f |   }@t|?|0 t|@|1 d S )Nr   r   g        r   Tr   )r   r   Zint64r   r   r   r   r   r    r   r!   Ztransr$   r%   r   r   r(   )Ar*   r+   r,   r-   rE   r/   rT   DQDKDVr.   D
stride_dqar0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r@   rI   Z	num_blockr   r   r   rZ   r[   rG   Zoffs_qmrB   rA   Zoffs_kZq_ptrsZk_ptrsZv_ptrsZdo_ptrsZdq_ptrsZD_ptrsrO   dvdkrJ   rK   r?   Zoffs_m_currrF   rL   rC   rN   rW   ZDiZdpZdsdqZdv_ptrsZdk_ptrsrP   rP   rQ   _bwd_kernel_one_col_block   sd   
,,,,,


4
$&
",,rd   c            &   	   C   s  |d } t d}!|!| }"|!| }#| |"| |#|  7 } ||"| |#|  7 }||"| |#|  7 }||"| |#|  7 }||"| |#|  7 }||"| |#|  7 }||"| |#|  7 }t ||}$|std|$D ]L}%tg | |||| ||||||	|
|||||||||||||||||!|%|$R |||||d qdd S t d}%tg | |||| ||||||	|
|||||||||||||||||!|%|$R |||||d d S )Nr   r   )r   r   r   rZ   r[   r
   )r   r   r   r   rd   )&r*   r+   r,   r-   r/   rT   r\   r]   r^   r.   r_   r`   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r   r   r   rZ   r[   rE   r@   Zoff_zZoff_hZnum_block_nrI   rP   rP   rQ   _bwd_kernel   s0  
			


			
re   c                   @   s&   e Zd ZedddZedd ZdS )
_attentionFc                 C   s  t j }|d dk rtdd}d}	|jd |jd |jd }
}}|
|kr,||ks.J |dv s4J t |}t|jd ||jd |jd	  d	f}t j|jd |jd	  |jd f|jt j	d
}|dkridnd}t
| |||||||d|d	|d|d|d|d	|d|d|d|d	|d|d|d|d	|d|d|jd |jd	 |jd f||	|||dd | ||||| || _|| _|| _|| _|| _|S )Nr      zEFlash attention currently only supported for compute capability >= 80   @   >          ri   rh   r   r
   devicer         )r   r   r   r	   	num_warps
num_stages)torchcudaZget_device_capabilityRuntimeErrorr   
empty_liker   emptyrn   r   rR   strideZsave_for_backwardgridr-   r   causalsequence_parallel)ctxrF   rJ   rK   rz   r-   r{   Z
capabilityr   r   ZLqZLkZLvrV   ry   r.   rq   rP   rP   rQ   forward#  sD   
"
&.    	z_attention.forwardc                 C   s  d}| j \}}}}}| j}|jd }	| }|r.t|	|}
|
f|j }tj||j|jd}ntj	|tj
d}t|}t|}t|}t| jd | jd  f ||||| jd t| jd |rft|	|ndf |||| j|||||||| |d|d|d|d|d|d|d|d|d|d|d|d|jd |jd |jd f||| j|| jd	dd
 t|jdkr|jdd}|||d d d fS )Nrh   r   rm   r   r   r
   )r   rS   rp   rg   )r   r   r   rZ   r[   rq   rr      )dim)Zsaved_tensorsr{   r   
contiguousr   rs   r   rn   r   Z
zeros_liker   rv   rY   ry   r   re   r-   Znumelrx   rz   lenr&   )r|   rW   ZBLOCKrF   rJ   rK   rV   r.   r{   Z
seq_len_kvZreplicasZnew_dq_shaperc   rb   ra   rX   rP   rP   rQ   backwardI  sN   





&  
z_attention.backwardN)F)__name__
__module____qualname__staticmethodr}   r   rP   rP   rP   rQ   rf   !  s
    %rf   )__doc__rs    r   r   r   r   Z	constexprrR   rY   rd   re   ZautogradFunctionrf   applyZ	attentionrP   rP   rP   rQ   <module>   s\    
		
`YC
T