o
    åiÿ<  ã                   @   sä   d dl Z ddlmZmZmZ ddlmZ eddd„ iƒedejd	ejd
ejdejdejf
dd„ƒƒZddd„Z	dd„ Z
edejd	ejd
ejdejdejf
dd„ƒZddd„Zdd„ Zddd„ZG dd„ de jjƒZG dd„ dƒZdS ) é    Né   )ÚcdivÚ
heuristicsÚjit)ÚlanguageÚEVEN_Kc                 C   s   | d | d  dkS )NÚKÚTILE_Kr   © )Únargsr
   r
   úl/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/triton/ops/blocksparse/matmul.pyÚ<lambda>   ó    r   ÚTILE_MÚTILE_Nr	   ÚBLOCKc           *      C   s<  t  d¡| }||d 7 }t  d¡}t  |d ¡}t  |d ¡}|| t  d|¡|  }t  d|¡}| ||  ||  |d d …d f |  |d d d …f |  }t  |d ¡}|| t  d|¡|  }t  d|¡} |||  ||  |d d d …f |
  | d d …d f |	  }!t j||ft jd}"t|d| ƒD ]F}#|r§t  |¡}$t  |!¡}%n t j||d d d …f |#k dd}$t j|!| d d …d f |#k dd}%|"t j|$|%t jd7 }"||| 7 }|!||	 7 }!q˜|" |j	j
¡}&t  d|¡| }'t  d|¡| }(|||  ||  |'d d …d f |  |(d d d …f |  })t j|)|&d	d
 d S )Nr   r   é   é   ©Údtypeg        )ÚmaskÚother©Z	out_dtypeT©r   )ÚtlÚ
program_idÚloadÚarangeÚzerosÚfloat32ÚrangeÚdotÚtor   Ú
element_tyÚstore)*ÚAÚBÚCZ	stride_zaÚ	stride_haZ	stride_maÚ	stride_akÚ	stride_zbÚ	stride_hbÚ	stride_bkZ	stride_nbÚ	stride_zcÚ	stride_hcZ	stride_mcZ	stride_ncr   Zgrid_offsetÚlutr   r   r	   r   r   Úblock_idZoff_zÚoff_hZstart_amÚoffs_amÚoffs_akZa_ptrsZstart_bnÚoffs_bnÚoffs_bkZb_ptrsÚaccÚkÚaÚbÚcÚoffs_cmÚoffs_cnÚpcr
   r
   r   Ú_sdd_kernel   sf   
ÿþýüÿþýü
  ÿþýür>   c
                 C   sÆ  |   d¡dkr|   d¡dkr|  ¡ } |  d¡dkr$|  d¡dkr$| ¡ }|r2|| } }| | }}|r6dnd}
|r<dnd}| j|
 |j| }}||krXtd|› d|› dƒ‚|	d u rqtj| jd	 |jd	 ||f| j| jd
}n|	j| jd	 |jd	 ||fks‚J ‚|	}|jd d|jd	 g}t| | |||   d	¡|   d¡|   |r£dnd¡|   |r«dnd¡|  d	¡|  d¡|  |r»dnd¡|  |rÃdnd¡|  d	¡|  d¡|  d¡|  d¡|d	|||d|ddd |S )Nr   r   r   éþÿÿÿéÿÿÿÿzInner dimension mismatch (A: z vs B: ú)r   ©r   Údeviceé    é   )r   r   r	   r   Ú
num_stagesÚ	num_warps)	ÚstrideÚ
contiguousÚshapeÚ
ValueErrorÚtorchÚemptyr   rC   r>   )r8   r9   Útrans_aÚtrans_bÚtrans_cÚspdimsÚblockr/   ÚwidthsÚoutZa_dimZb_dimZKaZKbr:   Úgridr
   r
   r   Ú
sdd_matmulT   s6   
*"00 
ù	rV   c                 C   s&   | j dd |¡ ¡ }| ¡ }|d fS )NF©Úas_tuple)Únonzeror"   ÚintrI   )ÚlayoutrR   rC   r/   r
   r
   r   Úsdd_lutv   s   r\   ÚGROUP_SIZE_Mc           4      C   sü  t  d¡}t  d¡}t  d¡}t  d¡}t  |||||¡\}}t  d¡}||d  }t  |d ¡}t  |d ¡}t  |d ¡}t  |d ¡} || }!t  |!d ¡}"t  |"d¡}"t  d|¡}#t  d|¡}$| ||  |"|  |#d d …d f |  |$d d d …f |  }%|| t  d|¡ }&t  t  |&| |¡|¡}&t  |!¡}'t  |'d¡}'|'t  d|¡ }(|||  | |  |&d d d …f |
  |(d d …d f |	  })t j||ft j	d}*|!d7 }!t  |!d ¡}+t  |+d¡}+t  |!¡},t  |,d¡},t
|d| ƒD ]=}-t  |%¡}.t  |)¡}/|*t j|.|/t j	d7 }*|%|+7 }%|)|,|	 7 })|!d7 }!t  |!d ¡}+t  |+d¡}+t  |!¡},t  |,d¡},qõ|* |jj¡}0|| t  d|¡ }1|| t  d|¡ }2|| |  ||  |1d d …d f |  |2d d d …f |  }3t j|3|0|2d d d …f |k d	 d S )
Nr   r   r   rE   r   é   r   r   r   )r   r   Znum_programsZ	swizzle2dr   Zmultiple_ofr   Zmax_contiguousr   r   r    r!   r"   r   r#   r$   )4r%   r&   r'   Z	stride_azr(   Z	stride_amr)   r*   r+   r,   Z	stride_bnr-   r.   Z	stride_cmZ	stride_cnZDS0ZDS1r/   r   r   r	   r]   r   Zpid_mZpid_nZ	num_pid_mZ	num_pid_nZpidzÚheaderÚoffsetr   Úcolumnr1   Zpincr0   r2   r3   Úpar4   Zstart_bkr5   Zpbr6   Zinc_aZinc_br7   r8   r9   r:   r;   r<   r=   r
   r
   r   Ú_dsd_kernel‚   s‚   





ÿþý

ÿþý



ÿþýü$rc   c
                    s¾  |   d¡dkr|   d¡dkr|  ¡ } |  d¡dkr$|  d¡dkr$| ¡ }|||r*dnd  }
| d¡‰ | d¡}| |r>dnd¡‰| j}ˆ }|}|rLˆn|
}|rR|
nˆ}|	d u rftj||||f|| jd}n|	j||||fksqJ ‚|	}d}‡ ‡‡fdd„}t| | |||   d¡|   d¡|   |r‘dnd¡|   |r™dnd¡|  d¡|  d¡|  |r©dnd¡|  |r±dnd¡|  d¡|  d¡|  |rÁdnd¡|  |rÉdnd¡ˆ|
|f||t	|d	ƒ|d
d
d
dœŽ |S )Nr   r   r   r   rB   é€   c                    s   t ˆ| d ƒˆˆ gS )Nr   )r   )Úmeta©ÚBS0ZBS3Úwidthr
   r   r   æ   r   zdsd_matmul.<locals>.<lambda>rD   rE   )r   r   r	   r   rF   rG   r]   )
rH   rI   Úsizer   rL   rM   rC   rJ   rc   Úmin)r8   r9   rN   rO   rP   rQ   rR   r/   rh   rT   ZAS1ÚBS1r   ZCS0ZCS1ZCS2ZCS3r:   r   rU   r
   rf   r   Ú
dsd_matmulÎ   s<   

000ûù
rl   c                 C   sž  t  | |rdnd¡}t  |¡jdd\}}| ¡ }|| }|r&| jdd}	n
|  dd¡jdd}	|	 d¡}
t  |¡}t j|dd… dd	|dd…< t  	||
d t  |¡ ¡}|	dd…df | }| 
¡ }|dd…  |dd… 8  < || }| dd¡ d|¡}||dd…dd…f< |dd…df  |d | 8  < |||dk  |||dk df< | d¡}|rºt j|
| jd
}nLt jg t j| jd}d}t|  d¡ƒD ]8}| |dd…dd…f  
¡  ¡ }| ¡ }dt j|| jd
 ||dk< t  |||j|jdk  d f¡}||7 }qÍ|| | }|dd…  |dd… | | 8  < | dd¡ d|¡}|rF||dd…dd…f< |dd…df  |d | 8  < n|| |dd…dd…f< |dd…df  |d | | 8  < |||dk  |||dk df< | d¡}| d¡}|d | d|  }|| }t j||||fdd	 d¡ ¡ }t j||fdd	 d¡ ¡ }t jd|j|jd}t  ||f¡}t  ||f¡}| t j¡ |¡}||fS )a  
    Generates the look-up table for incrementing pointers in the DSD/DDS matmul.
    Example (BLOCK=32, STEP=16)
    [[1, 0, 0, 1, 0],
     [0, 1, 1, 0, 1],
     [1, 0, 1, 0, 0]]

    Then the offsets for A are
     [0 , 16, 32, 48] <- row 0
      \----/  \----/
      col=0   col=3
     [64, 80, 96, 112, 128, 144] <- row 1
      \----/   \----/  \------/
       col=1    col=2    col=3
     [160, 176, 192, 208]
    which leads to increments table
    [0, 16, 16, 16, || 64, 16, 16, 16, 16, 16, || 160, 16, 16, 16]

    Because B is dense, the offsets are
    [0, 16, 96, 112] <- row 0
    [32, 48, 64, 80]  <- row 1
    [0, 16, 64, 80]   <- row 2
    r   r   TrW   Fr   Nr@   )Údim)rC   rB   rE   é   )rC   r   )rL   ÚsumZ	ones_likerY   ÚflattenZ	transposeri   Z
zeros_likeZcumsumrj   ÚcloneÚviewÚrepeatr   rC   ZtensorZint64r    ÚlongÚcatÚTÚstackrI   r   r   ÚtypeZint32r"   )r[   rR   ÚstepZtransrC   ÚsizesZhead_idZcol_idÚsegmentsZnnzZ
num_blocksÚoffsetsZB_idxZB_incsÚdivZA_idxÚcurrent_offsetÚzZlayoutwZmsumZA_incsrh   r_   ZincsÚpadr/   r
   r
   r   Údsd_lutô   sd   

  
"
$"$ 

 r   c
           
      C   s"   t || | | | |||||	d
S ©N)rT   )rl   )
r8   r9   rN   rO   rP   rQ   rR   r/   rh   rT   r
   r
   r   Ú
dds_matmulZ  s   "rƒ   c                   @   s0   e Zd ZeeedœZedd„ ƒZedd„ ƒZ	dS )Ú_matmul©ÚsddÚdsdÚddsc                 C   sx   t j| ||||||||	|
|d
}|  ||¡ || _|| _|| _|| _|| _|| _|| _	|| _
|| _|| _|d u| _|S r‚   )r„   ÚfnZsave_for_backwardÚda_lutÚda_widthÚdb_lutÚdb_widthÚmoderQ   rR   rN   rO   rP   Úhas_out)Úctxr8   r9   rN   rO   rP   rŽ   rQ   rR   Úc_lutÚc_widthrŠ   r‹   rŒ   r   rT   r:   r
   r
   r   Úforwardf  s   "
z_matmul.forwardc           
      C   sè   | j \}}d\}}| j}| jd r4|d |d  |d  }tj| ||| j| j | j| j| j	| j
| jƒ	}| jd r\|d |d  |d  }tj| ||| j | j| j| j| j	| j| jƒ	}| jra|nd }	||d d d d d d d d d d d d |	fS )N)NNr   r   r   )Zsaved_tensorsrŽ   Zneeds_input_gradr„   r‰   rP   rO   rN   rQ   rR   rŠ   r‹   rŒ   r   r   )
r   Zdcr8   r9   ÚdaÚdbrŽ   Zmode_daZmode_dbZdoutr
   r
   r   Úbackward{  s$   

"ÿ
"ÿ
þz_matmul.backwardN)
Ú__name__Ú
__module__Ú__qualname__rV   rl   rƒ   r‰   Ústaticmethodr“   r–   r
   r
   r
   r   r„   b  s    
r„   c                   @   s    e Zd Zddd„Zddd„ZdS )	ÚmatmulFc           	      C   sD  |dvrt dƒ‚|| _|| _|| _|| _|| _|| _|j| _t	|dƒ}| jdkrJt
|||ƒ\| _| _t|||d|ƒ\| _| _t|||d|ƒ\| _| _| jdkrtt|||| j |ƒ\| _| _t
|||ƒ\| _| _t|||| j|ƒ\| _| _| jdkr t|||| j|ƒ\| _| _t|||| j |ƒ\| _| _t
|||ƒ\| _| _d S d S )	Nr…   z"Supported modes are: sdd, dsd, ddsrD   r†   TFr‡   rˆ   )ÚNotImplementedErrorrR   rŽ   rN   rO   rP   r[   rJ   rQ   rj   r\   r‘   r’   r   rŠ   r‹   rŒ   r   )	Úselfr[   rR   rŽ   rC   rN   rO   rP   ry   r
   r
   r   Ú__init__•  s.   



ýzmatmul.__init__Nc                 C   sB   t  ||| j| j| j| j| j| j| j| j	| j
| j| j| j|¡}|S ©N)r„   ÚapplyrN   rO   rP   rŽ   rQ   rR   r‘   r’   rŠ   r‹   rŒ   r   )r   r8   r9   rT   r:   r
   r
   r   Ú__call__­  s   ûzmatmul.__call__)FFFrŸ   )r—   r˜   r™   rž   r¡   r
   r
   r
   r   r›   “  s    
r›   rŸ   )rL   Ú r   r   r   r   r   Z	constexprr>   rV   r\   rc   rl   r   rƒ   ZautogradÚFunctionr„   r›   r
   r
   r
   r   Ú<module>   sJ    ÿúúúùù
@"úúúùù
K&
f1