o
    i(E                     @   s   d dl Z d dlZd dlZd dlZd dlmZ ddlmZ dd Zd'dd	Z					d(ddZ
d)ddZG dd dZG dd dZdd Zd*ddZd+ddZdd Zdd  Zed,d#d$Zd*d%d&ZdS )-    N)contextmanager   )runtimec                 C   L   d | } dddd|  dg}t|}|tjjd}dd |D }|S )	N,
nvidia-smi-i0--query-gpu=--format=csv,noheader,nounitsc                 S      g | ]}t |qS  int.0xr   r   ]/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/triton/testing.py
<listcomp>       znvsmi.<locals>.<listcomp>join
subprocesscheck_outputdecodesysstdoutencodingsplitattrscmdoutretr   r   r   nvsmi
   s   

r$      c                    s  dd l 	 j j krtd|   |d ur+|D ]}|  |d d |_qj  j	  |   W d    n1 sDw   Y  j
   fdd} jjdd}jjdd}|  |   |  j
  ||}tdt|| }fdd	t|D }fd
d	t|D }g }d}	t|	D ]C}
j
  t|D ]}|d ur|D ]}d |_q||   |   ||   qj
  dd	 t||D }|| q| S )Nr   zQCannot capture graph in default stream. Please use side stream in benchmark code.Tc                      s      S N)Zreplayr   )gr   r   <lambda>,   s    z$do_bench_cudagraph.<locals>.<lambda>Zenable_timingr   c                       g | ]	} j jd dqS Tr)   cudaEventr   itorchr   r   r   8       z&do_bench_cudagraph.<locals>.<listcomp>c                    r*   r+   r,   r/   r1   r   r   r   9   r3   2   c                 S      g | ]	\}}| |qS r   elapsed_timer   ser   r   r   r   K   r3   )r2   r-   Zcurrent_streamZdefault_streamRuntimeErrorZdetach_Zrequires_grad_gradZ	CUDAGraphgraphsynchronizer.   recordr7   maxr   rangetensorzipappendminmeanitem)fnrepgrad_to_noner   start_event	end_eventestimate_msn_repeatr#   Z	n_retries_r0   timesr   )r'   r2   r   do_bench_cudagraph   sR   







rQ      d   TrF   c                    s  |dv sJ dd l  	 |    j  |r! jtd jdd}n jtd jdd} jjdd} jjdd}	|  td	D ]	}
|	  |   qB|	   j  |
|	d	 }td
t|| }td
t|| } fddt|D } fddt|D }	t|D ]}
|   qt|D ]!}|d ur|D ]}d |_q|	  ||   |   |	|   q j   jdd t||	D  jd}|d ur | j| jd }t|d
kr|d }|S t || S )N)rE   r@   rF   Zmedianr   g    Ar-   )dtypedeviceg    ATr)      r   c                    r*   r+   r,   r/   r1   r   r   r      r3   zdo_bench.<locals>.<listcomp>c                    r*   r+   r,   r/   r1   r   r   r      r3   c                 S   r5   r   r6   r8   r   r   r   r      r3   )rT   )r2   r-   r>   emptyr   int8r.   r?   rA   Zzero_r7   r@   r<   rB   rC   floatZquantiletolistlengetattrrG   )rH   ZwarmuprI   rJ   Z	quantilesZ
fast_flushZreturn_modecacherK   rL   rO   rM   Zn_warmuprN   r0   r   rP   r#   r   r1   r   do_benchP   sN   


 r^    c                 C   sJ  dd l }dd l}t| |js|| } t||js||}|d u r$d}t|r-|| jn|}|d u r5d}t|r>|| jn|}t| |jrX| j|jkrP|  } | 	 
   } t||jrp|j|jkrh| }|	 
   }| jdksz|jdkr|jj| |||dd d S |j| |||dst| d|  d	| d
| d| d
d S )Nr   g{Gz?g        r   T)atolrtolZ	equal_nan)r`   ra    z is not close to z (atol=z, rtol=))numpyr2   
isinstanceZTensorrB   callablerT   bfloat16rY   cpudetachsizetestingZassert_allcloseZallcloseAssertionError)r   yr`   ra   err_msgnpr2   r   r   r   assert_close   s4   

&rp   c                   @   s&   e Zd ZdZ						dddZdS )	Benchmarkzk
    This class is used by the :code:`perf_report` function to generate line plots with a concise API.
    r_   FNc                 C   sL   || _ || _|
| _|| _|| _|| _|| _|| _|| _|	| _	|| _
|| _dS )a  
        Constructor

        :param x_names: Name of the arguments that should appear on the x axis of the plot. If the list contains more than one element, all the arguments are assumed to have the same value.
        :type x_names: List[str]
        :param x_vals: List of values to use for the arguments in :code:`x_names`.
        :type x_vals: List[Any]
        :param line_arg: Argument name for which different values correspond to different lines in the plot.
        :type line_arg: str
        :param line_vals: List of values to use for the arguments in :code:`line_arg`.
        :type line_vals: List[str]
        :param line_names: Label names for the different lines.
        :type line_names: List[str]
        :param plot_name: Name of the plot.
        :type plot_name: str
        :param args: List of arguments to remain fixed throughout the benchmark.
        :type args: List[str]
        :param xlabel: Label for the x axis of the plot.
        :type xlabel: str, optional
        :param ylabel: Label for the y axis of the plot.
        :type ylabel: str, optional
        :param x_log: Whether the x axis should be log scale.
        :type x_log: bool, optional
        :param y_log: Whether the y axis should be log scale.
        :type y_log: bool, optional
        N)x_namesx_valsx_logline_arg	line_vals
line_namesy_logstylesxlabelylabel	plot_nameargs)selfrr   rs   ru   rv   rw   r|   r}   rz   r{   rt   rx   colorry   r   r   r   __init__   s   *
zBenchmark.__init__)r_   r_   FFNN)__name__
__module____qualname____doc__r   r   r   r   r   rq      s    rq   c                   @   s&   e Zd Zdd Zdd Zd
ddZd	S )Markc                 C   s   || _ || _d S r&   )rH   
benchmarks)r~   rH   r   r   r   r   r     s   
zMark.__init__c              
      s  dd l }dd lm} dd l}|j}dd |jD }	dd |jD }
|j|jd g| |	 |
 d}|jD ]`  fdd|jD }g g g }}}|jD ]:}| j	di ||j
|i|j}z|\}}	}
W n tyu   |d d }}	}
Y nw ||g7 }||	g7 }||
g7 }qK g| | | |jt|< q4|jrA|  | }|jd  t|jD ]K\}}||d  ||d	  }	}
|jr|j| d nd }|jr|j| d
 nd }|j|  || |||d |	d ur|
d ur|j|  |	|
d|d q|  |jr|jnd|j}|| ||j ||jrdnd ||jr&dnd |r0|   |rA|!|j"||j d ||jd g|j  }|rZt#|jd  t#| |rp|j$|j"||j dddd d S d S )Nr   c                 S      g | ]}| d qS )-minr   r   r   r   r   r         zMark._run.<locals>.<listcomp>c                 S   r   )-maxr   r   r   r   r   r     r   )columnsc                    s   i | ]}| qS r   r   )r   Zx_namer   r   r   
<dictcomp>  s    zMark._run.<locals>.<dictcomp>r   r   r   )labelr   Zlsg333333?)alphar   z = logZlinearz.png:z.csvz%.1fF)Zfloat_formatindexr   )%osZmatplotlib.pyplotZpyplotZpandasrw   Z	DataFramerr   rs   rv   rH   ru   r}   	TypeErrorlocr[   r|   ZfigureZsubplot	enumeratery   ZplotZfill_betweenZlegendrz   r   Z
set_xlabelZ
set_ylabelr{   Z
set_xscalert   Z
set_yscalerx   showZsavefigpathprintZto_csv)r~   bench	save_path
show_plots
print_datar   ZpltpdZy_meanZy_minZy_maxZdfZx_argsZrow_meanZrow_minZrow_maxrm   r#   axr0   colZstyrz   r   r   r   _run  sd    

 



&z	Mark._runFr_   c                 C   s   t | jt}|r| jgn| j}|r ttj|dd}|d |D ]}| |||| |r8|d|j	 d q"|rB|d d S d S )Nzresults.htmlwz<html><body>
z<image src="z.png"/>
z</body></html>
)
re   r   rq   openr   r   r   writer   r|   )r~   r   r   r   Zhas_single_benchr   htmlr   r   r   r   run:  s   
zMark.runN)FFr_   )r   r   r   r   r   r   r   r   r   r   r     s    3r   c                        fdd}|S )z
    Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value.

    :param benchmarks: Benchmarking configurations.
    :type benchmarks: List of :class:`Benchmark`
    c                    s
   t |  S r&   )r   )rH   r   r   r   r(   O  s   
 zperf_report.<locals>.<lambda>r   )r   wrapperr   r   r   perf_reportH  s   r   c                 C   sf   ddl }ddlm} | stjj} |s|j }|j|d }|j|d }|| d d d	 }|S )
z return DRAM bandwidth in GB/s r   Nr   driverZmem_clock_rateZmem_bus_width   g    .A   )	r2   r   r   backendCUDAr-   current_deviceutilsget_device_properties)r   rU   r2   r   Zmem_clock_khzZ	bus_widthZbw_gbpsr   r   r   get_dram_gbpsS  s   
r   c           
      C   s   dd l }ddlm} |stjj}|s|j }|j|d d }|s+|j|d }|j	|}|d dk rA| |j
ks>J d}n| |jkrId}n| |j
|jfv rTd	}n| |jkr\d
}ntd|| | d }	|	S )Nr   r   r   multiprocessor_count   sm_clock_rater      i   i   dtype not supported&.>)r2   r   r   r   r   r-   r   r   r   get_device_capabilityfloat16float32rg   rX   r;   )
rT   r   rU   
clock_rater2   r   num_subcores
capabilityops_per_sub_coretflopsr   r   r   get_max_tensorcore_tflopsb  s,   


r   c                     r   )Nc                    s   t   fdd}|S )Nc            
         s   dd l }|t  }  | k}|rg|dkrgtjjd }tj	d dd}d|v s4J d|d j
jj}| d	j d
| d}tjddd|gd|d}	|	jdks\J ddt|	jv seJ d S | i | d S )Nr   zcuda-memcheck__file__PATH1)r   ZPYTORCH_NO_CUDA_MEMORY_CACHINGrequestz@memcheck'ed test must have a (possibly unused) `request` fixturez::[]Zpytestz-vsT)capture_outputenvz7cuda-memcheck returned an error: bounds checking failedzERROR SUMMARY: 0 errors)psutilProcessr   getppidnameitemsr   realpath__globals__environnodeZcallspecidr   r   r   
returncodestrr   )
r}   kwargsr   Z	ppid_nameZrun_cuda_memcheckr   r   Ztest_idr!   r"   )target_kwargstest_fnr   r   r     s   z1cuda_memcheck.<locals>.decorator.<locals>.wrapper)	functoolswraps)r   r   r   )r   r   	decorator  s   z cuda_memcheck.<locals>.decoratorr   )r   r   r   r   r   cuda_memcheck  s   r   c                 C   r   )	Nr   r   r   r	   r
   r   c                 S   r   r   r   r   r   r   r   r     r   znvsmi_attr.<locals>.<listcomp>r   r   r   r   r   
nvsmi_attr  s   

r   F    c              
   c   s$   zzt g d t dddd|  d|  g t dddd| d| g tdgd	 }td
gd	 }t||  dk sEJ d|  dt|| dk sUJ d| dd|  }d| d }||fV  W t g d t g d t g d d S t g d t g d t g d w )N)r   r   r	   -pmr   r   r   r	   z--lock-gpu-clocks=r   z--lock-memory-clocks=zclocks.current.smr   zclocks.current.memory
   zGPU SMs must run at z MHzg 3O?i   gMbP?)r   r   r	   r   r	   )r   r   r	   z-rgc)r   r   r	   z-rmc)r   r   r   abs)Zref_sm_clockZref_mem_clockZcur_sm_clockZcur_mem_clockr   Zgbpsr   r   r   set_gpu_clock  s<     r   c           
      C   s   dd l }ddlm} |stjj}|s|j }|j|d d }|j|d }|j	 }|d dk rH| |j
kr<d}n#| |jkrDd	}ntd
| |j
krPd}n| |j|jfv r[d	}ntd
|| | d }	|	S )Nr   r   r   r   r   r   r       @   r   r   )r2   r   r   r   r   r-   r   r   r   r   r   r   r;   rg   )
rT   r   rU   r2   r   r   r   r   r   r   r   r   r   get_max_simd_tflops  s,   




r   )r%   N)rR   rS   NNTrF   )NNr_   )NN)NNN)r   r   )r   r   r   r   
contextlibr   Z_C.libtriton.tritonr   r$   rQ   r^   rp   rq   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s0    
	=

O%>F

  