o
    iZ                     @   sz  d dl Z ddlmZmZmZmZmZ ddlmZ ddl	m
Z
mZ e je je jgZdd Zd	d
 Zdd Zeeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddeddddddddge  g de
edddeddd iedejdejd ejd!ejd"ejd#ejdejfd$d%ZG d&d' d'e jjZejZdS )(    N   )Configautotunecdiv
heuristicsjit)language   )early_config_pruneestimate_matmul_timec                 C   sR   | |u r| S | t v sJ |t v sJ t D ]}| |u r|  S ||u r&|   S qd S N)_ordered_datatypes)abd r   `/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/triton/ops/matmul.pyget_higher_dtype
   s   r   c                    s    fddS )Nc                    s   |     S r   )Zzero_)nargsnamer   r   <lambda>   s    zinit_to_zero.<locals>.<lambda>r   r   r   r   r   init_to_zero   s   r   c                  C   s   g } dD ]?}dD ]:}dD ]5}dD ]0}|dkrdnd}|  t|||dd	||d
 dD ]}|  t||||d	||tdd q+qqqq| S )N)r               )       )r   @   )r   r         r   r   r   r	   BLOCK_MBLOCK_NBLOCK_KSPLIT_K
num_stages	num_warps)r   r      r   C)r(   r)   Zpre_hook)appendr   r   )configsr(   Zblock_mZblock_kZblock_nr)   Zsplit_kr   r   r   get_configs_io_bound   s(   

r.   r    r!   r   r"   r   r*   r'   r   r   r   )MNK
   )r
   Z
perf_modelZtop_k)r-   keyZprune_configs_byEVEN_Kc                 C   s   | d | d | d   dkS )Nr1   r%   r&   r   r   )argsr   r   r   r   L   s    r   dot_out_dtyper#   r$   r%   GROUP_Mr&   c           (      C   s  t d}t d}t ||}t ||}|| }|| }t|||  |}|| ||  }|| | }|| t d| }|| t d| }t t || ||}t t || ||}|| t d| } | |d d d f | | d d d f |   } || d d d f | |d d d f |	   }t j||f|d}!tdt ||| D ]h}"|rt 	| }#t 	|}$n1||"||   }%t jd|j
jd}&t j	| | d d d f |%k |&d}#t j	|| d d d f |%k |&d}$|#|j
j}#|$|j
j}$|!t j|#|$|d7 }!| || | 7 } ||| | 7 }q|!|j
j}!|| t d| }|| t d| }||d d d f |
 |d d d f |   }||k d d d f ||k d d d f @ }'|dkrht j||!|'d d S t j||!|'d d S )Nr   r	   )dtype)r	   r	   )maskother)Z	out_dtype)r9   )tlZ
program_idr   minZarangeZmax_contiguousZmultiple_ofZzerosrangeloadr8   Z
element_tytodotstoreZ
atomic_add)(ABr+   r/   r0   r1   Z	stride_amZ	stride_akZ	stride_bkZ	stride_bnZ	stride_cmZ	stride_cnr6   r#   r$   r%   r7   r&   r4   pidZpid_zZgrid_mZgrid_nwidthZgroup_idZ
group_sizeZpid_mZpid_nZrmZrnramZrbnZrkacckr   r   Zk_remainingZ_0r9   r   r   r   _kernel-   sL   
+
,,
  ,(
rI   c                   @   s.   e Zd ZeZi Zedd ZedddZdS )_matmulc           	         s  | j }| ddkr| ddkr|  } |ddkr'|ddkr'| }| jd |jd ks5J d| j\ }|j\}| jtjtjtjfv sU|jtjtjtjfv rYt	j
}nt| j|j}t	j f||d}|d u r|t	j
t	jt	jfv r|tj}n&tj}n"t|t	jsJ d|t	j
krtj
}n|t	jt	jfv rtj}ntj} fdd}t| | || || d| d|d|d|d|d|dd	 |S )
Nr   r	   zincompatible dimensions)devicer8   z#dot_out_dtype must be a torch.dtypec                    s$   t  | d t | d  | d fS )Nr#   r$   r&   )r   )ZMETAr/   r0   r   r   r      s   $ z_matmul._call.<locals>.<lambda>r*   )r6   r7   )rK   Zstride
contiguousshaper8   r;   Zfloat8e4Zfloat8e4b15Zfloat8e5torchfloat16r   emptyfloat32bfloat16Zint32
isinstancerI   )	r   r   r6   rK   r1   _Zc_dtypecgridr   rL   r   _call   s@   


z_matmul._callNc                 C   s   t j|||dS )N)r6   )rJ   rX   )ctxr   r   r6   r   r   r   forward   s   z_matmul.forwardr   )	__name__
__module____qualname__rI   ZkernelZ_locksstaticmethodrX   rZ   r   r   r   r   rJ      s    
)rJ   )rO    r   r   r   r   r   r   r;   Zmatmul_perf_modelr
   r   rP   rS   rR   r   r   r   r.   Z	constexprrI   ZautogradFunctionrJ   applymatmulr   r   r   r   <module>   sr    8
4