o
    i                     @   sx   d dl Z d dlZddlmZ ddlmZ ddlmZ ddlmZm	Z	m
Z
 dd Zd	d
 Zdd Z	dddZdd ZdS )    N   )cdiv)runtime)driver)get_dram_gbpsget_max_simd_tflopsget_max_tensorcore_tflopsc                 C   @   |t |d }tj|d d }t ||| t|| | }|S z# return compute throughput in TOPS    multiprocessor_count)minr   utilsget_device_propertiesr   backenddevicenum_ctas	num_warpsdtypeZtotal_warpsZnum_subcoresZtflops r   k/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/triton/ops/matmul_perf_model.pyget_tensorcore_tflops      r   c                 C   r	   r
   )r   r   r   r   r   r   r   r   r   get_simd_tflops   r   r   c                 C   sB   t j|}|d dk r|t jkrt| ||||S t| ||||S )Nr      )torchcudaget_device_capabilityfloat32r   r   )r   r   r   r   r   
capabilityr   r   r   
get_tflops   s   r!   Fc           ,      K   s  t jj}tj }|j}| }t||}t||	}|}|| | }t	||t	||	}}d| | | d }t
|||| |}|| }tj|d }td|| }td|d }t	td|d d d}t|||d |d	   }|d
 }|| | dd|d    }|| | d |d  } || | dd|d    }!|| | d |d  }"||! d }#| |" d }$|#| |$|  }%|d }&|| | | d }'|dkr|'|& }(n|&})|'|) }(|| d d |& }*|(|*7 }(t	||%|( }+|r	td|+ d| d|% d|( d|d  d |+S )zO return estimated running time in ms
          = max(compute, loading) + store r   i   @r          L   r   gffffff?g?r   g?g?i   g333333?zTotal time: zms, compute time: zms, loading time: zms, store time: zms, Activate CTAs: d   %)r   r   CUDAr   r   current_devicer   element_sizer   maxr!   r   r   r   r   r   print),r   
num_stagesABCMNKBLOCK_MBLOCK_NBLOCK_KSPLIT_Kdebugkwargsr   r   r   dtsizeZ	num_cta_mZ	num_cta_nZ	num_cta_kr   Z	total_opsZtputZ
compute_msZnum_smZactive_cta_ratioZactive_cta_ratio_bw1Zactive_cta_ratio_bw2Zdram_bwZl2_bwZload_a_dramZ	load_a_l2Zload_b_dramZ	load_b_l2Z
total_dramZtotal_l2Zload_msZstore_bwZstore_c_dramZstore_msZ	reduce_bwZzero_msZtotal_time_msr   r   r   estimate_matmul_time"   sV   





r:   c                    s  t j }t j }|d  }|d j}g }| D ]1}|j}|d |d |d |jf\}	}
}}tj	
|d }|	|
 | | | }||krJ|| q|} |t jt jfvr\dd | D } i }| D ]9}|j}|d |d |d |d |j|jf\}	}
}}}}|	|
|||f}||v r|| ||f q`||fg||< q`g }| D ]O\}}|\}	}
}}}|d	 d
kr|	|
 | d }|td| d
 }d}||  tjd| fddd}|D ]	}||d	  qq|d	 d	 }d|_|| q|S )Nr-   r3   r4   r5   Zmax_shared_memc                 S   s   g | ]}|j d  dkr|qS )r6   r"   )r8   ).0configr   r   r   
<listcomp>z   s    z&early_config_prune.<locals>.<listcomp>r6   r   r   i   r   i,  r   c                    s0   | d   dk rdt | d    S | d   S )Nr"   r   
   )abs)xZoptimal_num_stagesr   r   <lambda>   s   z$early_config_prune.<locals>.<lambda>)key)r   r   r(   r   r)   r   r8   r,   r   r   r   appendZfloat16r   r   itemsr   heapq	nsmallest)ZconfigsZ
named_argsr   r    r9   r   Zpruned_configsr<   kwr3   r4   r5   r,   Zmax_shared_memoryZrequired_shared_memoryZconfigs_mapr6   r   rC   kvZmmasZ
mma_cyclesZldgsts_latencyZnearestnZrandom_configr   rA   r   early_config_pruned   sT   




"rL   )F)rF   r    r   Z_C.libtriton.tritonr   r   testingr   r   r   r   r   r!   r:   rL   r   r   r   r   <module>   s    
B