o
    i                     @   s   d dl Z ddlmZ ddlmZ ddlmZ dd Zedejd	ejd
ejfddZedejd	ejd
ejfddZ	G dd de j
jZG dd dZdS )    N   )jit)language)next_power_of_2c                 C   s4   | dkrdS | dkrdS | dkrdS | dkrdS d	S )
N            i      i          )nr   r   m/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/triton/ops/blocksparse/softmax.py	num_warps   s   r   ROW_SIZE
BLOCK_SIZEIS_DENSEc                  C   s  t d}t d}t d}|t d | }t d|
| }t d|
| }||| d  }t |d }t |d }|| }||| | | 7 }||| | 7 }|r]t d|
}n$|dt d t d |  }t j|| | ||k dd}|| | }||k }t j|| | |td d}|t j}|}||9 }|d ur||| 7 }||| 7 }|| d | }|dk||k @ }t j|||  | |dd}||7 }|t j}t ||k|	@ td |}t 	|}t j
| | | ||d d S )Nr   r   r	   maskotherinf        r   )tl
program_idnum_programsarangeloadfloattofloat32wheresoftmaxstore) OutAZ	stride_xzLUTRextent	stride_zr	stride_hrscale	is_causalr   r   r   hmzhmlane_nblock_nheadersizeoffsetZoff_ansoff_lutstart_nr   aoutoff_lomask_lo
rel_logitsr   r   r   _blocksparse_softmax_fwd   sB   
	

 
r?   c           '      C   s   t d}t d}t d}|t d | }t d|| }t d|| }||| d  }t |d }t |d }|| | | }||| | 7 }||k }|||  | }|||  | }|rkt d|}n"|dt d t d |  }t j|| | |dd} | | | }t j|| |dd}!|!t j}!t j|| |dd}"|"t j}"t ||k|@ |!|!k@ d|!}!|!|"t |!|" d  }#|d ur|||
 7 }||| 7 }|	| d | }$|$dk|$|	k @ |@ }%t j	|||	  |$ |#|%d |#| }#| ||  | }&t j	|&| |#|d d S )Nr   r   r	   r   r   r   )
r   r   r   r   r   r    r!   r"   sumr$   )'ZDAZ
stride_zdxZDOutZstride_zdoutr%   Zstride_zoutr,   r'   ZDRr)   r*   r+   Z	stride_err-   r   r   r   r.   r/   r0   r1   r2   r3   r4   r5   r6   Zoff_mnr   ZAsZDOutsr7   r8   r9   r:   doutdar<   r=   ZDAsr   r   r   _blocksparse_softmax_bwdK   sD   


 rC   c                   @   s0   e Zd Zedd Zedd Zedd ZdS )_softmaxc              	   C   s   t jg t j| jd}| }t| jd D ]}t || |d d d d f df}q|| }t 	|}t j
|d d dd|dd < | jddd d df }t j||fddd}	t |	|ft j|}
|
t| fS )	Ndtypedevicer   )dimr   F)as_tupler	   )torchZtensorZint64rG   clonerangeshapecatr@   Z
zeros_likeZcumsumZnonzerostackviewtypeZint32r    intmax)layoutblockrG   _emptysizesr.   Ztotal_sizesoffsetscolumnsr4   lutr   r   r   make_lut   s   (
z_softmax.make_lutc
                 C   s  |d urt |tjr|jjdksJ | }|jd }
|d |d | |
g}|d u r,dn|j}|d u r5dn| }t|}t	| |||d|||d |d |d |||t
||	t|d | || || _|| _|| _|| _|| _|| _|j| _|	| _|| _|S )Ncpur   r   )r   r   r   r   rH   r   r   r   r   )
isinstancerK   ZTensorrG   rR   itemrN   stride
empty_liker?   r   r   Zsave_for_backwardspdimsrV   maxlutr,   	rel_shaperel_stridesrF   	rel_dtypeis_denser-   )ctxr:   r,   r>   r-   rc   rV   r[   rd   rh   Mgridre   rf   r;   r   r   r   forward   s:   

z_softmax.forwardc                 C   s   | j \}}d }| jd rtj| j| j|jd}|jd }| jd | jd | j	 |f}t
|}t| ||d||d||d| j||| jd | jd | jd | jd | j| j	t| j| jt| jd |d d |d d d d d d d d d d d d d d fS )Nr   rE   r   r   rH   r	   r^   )Zsaved_tensorsZneeds_input_gradrK   Zzerosre   rg   rG   rN   rc   rV   rb   rC   ra   r,   rf   r-   r   rd   rh   r   )ri   rA   r;   r[   Zdrrj   rk   rB   r   r   r   backward   s6   






"

z_softmax.backwardN)__name__
__module____qualname__staticmethodr\   rl   rm   r   r   r   r   rD      s    

%rD   c                   @   s(   e Zd Zd	ddZddddddZdS )
r#   Fc                 C   s8   |j | _|| _|| _t| j| j|\| _| _|| _d S )N)	rN   rc   rU   rV   rD   r\   r[   rd   rh   )selfrU   rV   rG   rh   r   r   r   __init__   s
   
zsoftmax.__init__g      ?N)r,   r>   r-   c                C   sL   |d ur|j |j krtd|j  t||||| j| j| j| j| j	}|S )Nz$relative position embedding must be )	rF   
ValueErrorrD   applyrc   rV   r[   rd   rh   )rr   r:   r,   r>   r-   r   r   r   __call__   s   zsoftmax.__call__)F)rn   ro   rp   rs   rv   r   r   r   r   r#      s    
r#   )rK    r   r   r   r   r   Z	constexprr?   rC   ZautogradFunctionrD   r#   r   r   r   r   <module>   s.    6	
;Z