o
    iY                     @  s(  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZ ddlmZmZmZmZmZmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, dd Z-dd Z.dd Z/dd Z0dd Z1dd Z2dd Z3e4 dYd!d"Z5dZd[d(d)Z6d\d+d,Z7d-d. Z8d/d0 Z9d]d5d6Z:d^d9d:Z;d;d< Z<d=d> Z=d?Z>d@Z?e>e>e?dAZ@dBZAdCZBeAeAeBdAZCdDZDdEdF ZEdGdH ZFe
dIdJdKgeG eG gdLZHdMdN ZIdOdP ZJdQdR ZKdSdT ZLdUdV ZMG dWdX dXZNdS )_    )annotationsN)
namedtuple)Path)AnyTuple   )add_external_libscompile_ptx_to_cubinget_shared_memory_sizeirtranslate_llvmir_to_hsacotranslate_llvmir_to_ptxtranslate_triton_gpu_to_llvmir)get_backendpath_to_ptxas)OutOfResources)get_cache_manager)driver)JITFunctionget_cuda_streamget_current_deviceget_device_capabilityversion_key)extract   )ast_to_ttir)	make_stubc                 C  s*   t | j}|  |  ||  | S N)r   pass_managercontextenable_debugadd_inliner_passrun)modpm r%   g/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/triton/compiler/compiler.pyinline_triton_ir   s
   
r'   c                 C  s4   t | j}|  t|r|| ||  | S r   )r   r   r   r    _is_cudaZadd_rewrite_tensor_pointer_passr"   r#   archr$   r%   r%   r&   ttir_compute_capability_rewrite'   s   

r+   c                 C  sl   t | } t| |} t| j}|  |  |  |  |	  |
  |  |  ||  | S r   )r'   r+   r   r   r   r    r!   Zadd_triton_combine_passZadd_canonicalizer_passZadd_reorder_broadcast_passadd_cse_passZadd_licm_passadd_symbol_dce_passr"   r)   r%   r%   r&   optimize_ttir2   s   

r.   c                 C  s,   t | j}|  || ||  | S r   )r   r   r   r    Z$add_convert_triton_to_tritongpu_passr"   )r#   	num_warpsr$   r%   r%   r&   ttir_to_ttgirB   s
   

r0   c                 C  s   t | j}|  |  |  t|tr|| |  |	  |
| |  |	  |  |  |  |  |  ||  | S r   )r   r   r   r    Zadd_tritongpu_coalesce_passZ,add_tritongpu_remove_layout_conversions_pass
isinstanceintZ$add_tritongpu_accelerate_matmul_passZ(add_tritongpu_optimize_dot_operands_passZadd_tritongpu_pipeline_passZadd_tritongpu_prefetch_passZ(add_tritongpu_decompose_conversions_passZ'add_tritongpu_reorder_instructions_passr,   r-   r"   )r#   
num_stagesr*   r$   r%   r%   r&   optimize_ttgirJ   s$   



r4   c                 C  sP   |  D ]\}}t|dkst|dkr d S qt| t| t|  d S )Nr   )itemslenr   listkeysvalues)r#   Zlibsnamepathr%   r%   r&   _add_external_libs_   s
    r<   c                 C  s.   |rt | | t|rt| |dS t| ddS )NFr   T)r<   r(   r   )r#   extern_libsr*   r%   r%   r&   ttgir_to_llirf   s
   
r>   returnr2   c                 C  sZ   t | tsJ tt| d\}}|dkrd| S |dkr!d| S |dkr)d| S td)	zK
    Get the highest PTX version supported by the current CUDA driver.
    .   P      F   
   ?   z'Triton only support CUDA 10.0 or higher)r1   strmapr2   splitRuntimeError)cuda_versionmajorminorr%   r%   r&   ptx_get_versionr   s   rN   r#   r   r*   ptx_versionrG   c                 C  s&   |du rt  \}}t|}t| ||S )zr
    Translate TritonGPU module to PTX code.
    :param mod: a TritonGPU dialect module
    :return: PTX code
    N)r   rN   r   )r#   r*   rO   _rK   r%   r%   r&   llir_to_ptx   s   
rQ   ptxc                 C  s   t  \}}t| ||S )z
    Compile TritonGPU module to cubin.
    :param ptx: ptx code
    :param compute_capability: compute capability
    :return: str
    )r   r	   )rR   r*   ZptxasrP   r%   r%   r&   ptx_to_cubin   s   
rS   c                 C  s   g d}| d }t d|d }d| d }tjttj	
 d}i }d}|D ]}|| }	tj|	rC|	|dt| < |d7 }q+|| }
tj|
rV|
|dt| < |S )N)z	opencl.bczocml.bczockl.bczoclc_finite_only_off.bczoclc_daz_opt_off.bcz!oclc_correctly_rounded_sqrt_on.bczoclc_unsafe_math_off.bczoclc_wavefrontsize64_on.bcr   zgfx(\w+)Zoclc_isa_version_z.bczthird_party/rocm/lib/bitcode/Zlibrary_)researchgroupstriposr;   joinr   __file__parentresolveexistsrG   )r*   Z#gpu_arch_agnostic_bitcode_librariesgfx_archZgfx_arch_idZ!gpu_arch_specific_bitcode_libraryZbitcode_path_dirZamdgcn_bitcode_pathsiZbc_libZbc_pathZbc_gfx_pathr%   r%   r&   get_amdgcn_bitcode_paths   s"   	r`   c                  C  s   zUt jddd} t| d  }td|d 	d}|d }|d 	d	}|d }d
}t
|dkrPdtd|d d d td|d d }|||gW S  ty_   Y dS w )z
    get the amdgpu fulll ISA details for compiling:
    i.e., arch_triple: amdgcn-amd-amdhsa; arch_name: gfx906; arch_features: sramecc+:xnack-
    Z	ROCM_PATHz	/opt/rocm)defaultz/bin/rocminfozamd.*r   z--r   :    +z\w+z,-r   N)rX   getenv
subprocesscheck_outputdecoderT   rU   rV   rW   rI   r6   BaseException)Zrocm_path_dirZrocminfoZgfx_arch_detailsZarch_tripleZarch_name_featuresZ	arch_nameZarch_featuresr%   r%   r&   get_amdgpu_arch_fulldetails   s    rk   r^   
gfx_triplegfx_featuresTuple[str, str]c                 C  s   t | |||S )z
    Translate TritonGPU module to HSACO code based on full details of gpu architecture.
    :param mod: a TritonGPU dialect module
    :return:
        - AMDGCN code
        - Path to HSACO object
    )r   )r#   r^   rl   rm   r%   r%   r&   llir_to_amdgcn_and_hsaco   s   ro   srcpatternc                 C  s>   | sJ |  dD ]}| }||r|  d   S q	dS )zd
    Get kernel name from PTX code.
    This Kernel name is required when launching the kernel.
    
N)rI   rW   
startswith)rp   rq   liner%   r%   r&   get_kernel_name   s   
rv   c                 C  s*   t d| }|d urdt|d S | S )Nz!tt\.ptr<(.*)>*r   )rT   rU   convert_type_reprrV   )xmatchr%   r%   r&   rx      s   rx   c                   s   t | tr\|d }|d }|dt }|dd}|dd}|dd	}d
d   fdd|D }	| j dd|  d|	 d| d| d| d| d| }
t|
	d
 S t | tscJ tt|  t  	d
 S )Nconfigs	signature	constantsr/      r3   rd   debugFc                 S  s   t | jt | jfS r   )sorteddivisible_by_16
equal_to_1)confr%   r%   r&   <lambda>       zmake_hash.<locals>.<lambda>c                   s   g | ]} |qS r%   r%   ).0r   Zget_conf_keyr%   r&   
<listcomp>   r   zmake_hash.<locals>.<listcomp>-rc   zutf-8)r1   r   getdict	cache_keyrY   r9   hashlibmd5encode	hexdigestrG   r   	read_textr   )fnr*   kwargsr{   r|   r}   r/   r3   r   Zconfigs_keykeyr%   r   r&   	make_hash   s   
>"r   z`^\s*tt\.func\s+(?:public\s+)?(@\w+)(\((?:%\w+: \S+(?: \{\S+ = \S+ : \S+\})?(?:, )?)*\))\s*\{\s*$z=\.(?:visible|extern)\s+\.(?:entry|func)\s+(\w+)\s*\(([^)]*)\))ttirttgirrR   z-%\w+: ([^,^\)\s]+)(?: \{\S+ = \S+ : \S+\})?,?z\.param\s+\.(\w+)z&"triton_gpu.num-warps"\s?=\s?(\d+)\s?:c                 C  s2   dd }i }| D ]}|| | r| | ||< q|S )Nc              	   S  s*   zt |  W dS  ttfy   Y dS w )NTF)jsondumps	TypeErrorOverflowError)ry   r%   r%   r&   _is_jsonable  s   
z-_get_jsonable_constants.<locals>._is_jsonabler%   )r}   r   Zserialized_constantsZconstantr%   r%   r&   _get_jsonable_constants  s   r   c                 C  s   t | |}||_|S r   )r   parse_mlir_moduler   )r;   r   moduler%   r%   r&   r   +  s   r   instance_descriptorr   r   )defaultsc                 C  s
   t | tS r   )r1   r2   r*   r%   r%   r&   r(   6  s   
r(   c                 C  sh   zdd l }W n ty   tdw | d u r2|jjd u r/t }t|} | d d | d  } | S t } | S )Nr   z'Triton requires PyTorch to be installedrE   r   )torchImportErrorversionhipr   r   rk   )Z
capabilityr   devicer%   r%   r&   get_architecture_descriptor:  s   r   c                   s   | t|  t|D ]}|| dks|| d u r|| q| tjdd   d u r2tddd  fddf|d< d S )	Nrc   ZMI_GPU_ARCHr   z gfx_arch is None (not specified)c                 S     t |  S r   r   r   r;   r%   r%   r&   r   T      z!add_rocm_stages.<locals>.<lambda>c                   s   t |  d d S )Nr   r   )ro   rp   r^   Zgfx_arch_full_detailsr%   r&   r   U  s    amdgcn)updater`   r7   poprX   environr   rJ   )r*   r=   stagesr   r%   r   r&   add_rocm_stagesI  s   
r   c                   s4   dd  fddf|d< dd  fddf|d< d S )Nc                 S  r   r   r   r   r%   r%   r&   r   \  r   z!add_cuda_stages.<locals>.<lambda>c                   
   t |  S r   )rQ   r   r   r%   r&   r   ]     
 rR   c                 S  r   r   )r   
read_bytesr   r%   r%   r&   r   ^  r   c                   r   r   )rS   r   r   r%   r&   r   _  r   cubinr%   )r*   r=   r   r%   r   r&   add_cuda_stagesZ  s   

r   c           "        s8  | dd}t|}|dv rt| dd  nt|}|sJ |jd7i | |dko.t }|dv o5| }t | dt | dd| d|rR d	krRd
nd| dt d u rct | ddt }fddd f|d< fdd 	fddf|d< fdd fddf|d< dd  fddf|d< |rt | n|rt | n|	 | t
tr| dd |d 	d u rt gtdksJ |d< j}d}t
	trdd  t	d!D 		|d< nt
tsJ tjd"\}	}
t }dd l}|t|
 ||j}|d|d}	|t|
 	}|
dkrd|t|}t|dksLJ d#d|vs^t|d ks^J d$t|d d%d& |D }d'd  t|D 	t |! "|
}|s|rt#|	}n|$|	}t%t& fi |}t
trjd}}ntjd"\}}d }| d(}|'|pi }| |}|d urt(|}t)*|}W d    n	1 sw   Y  nt+ d)}|d*krd+|v s J d,|d+ |d+< ||d< t |! "|}t }}t |, |d  D ]\}
\}}| d"|
 }|
|kr9|}na| |}|d u ru||}td-krf| d.} |-|d |||< |-|d | || < n4|-||||< |-|| n%|
d-kr| d.} | | }!|!d usJ d/||||!f}n||}|
d0kr|||
< n|
d-krt|d ||
< nt|||
< |
dkrd+|vrt.||d+< |
d*krt/|d1d2|d3< |
d-krt/|d d4d2|d3< |d |d5< |s|s|0|
|||| |}q"|d u r|j-t)1||dd6||< |2|| t3|||S )8Ndevice_typecudar   r   ccr}   r/   r~   r3   K   rd   r   r=   r   Fc                   s    S r   r%   r   )r   r%   r&   r   {  s    zcompile.<locals>.<lambda>astc                   r   r   r   r   r   r%   r&   r   |  r   c              	     s   t t| d  d S )Nr   )r   r*   )r.   r   r   )r*   r{   r}   r   r|   r%   r&   r   }  s    r   c                   r   r   r   r   r   r%   r&   r   ~  r   c                   s   t t|  S r   )r4   r0   r   )r*   r3   r/   r%   r&   r     s    r   c                 S  r   r   r   r   r%   r%   r&   r     r   c                   s   t |  S r   )r>   r   )r*   r=   r%   r&   r     r   Zllirr{   r|   r   r   c                 S  s   i | ]	\}}||  qS r%   )rW   r   kvr%   r%   r&   
<dictcomp>  s    zcompile.<locals>.<dictcomp>,r@   z(Expected exactly one match for num_warpsz6num_warps in ttgir does not match num_warps in compilec                 S  s   g | ]}t |qS r%   )rx   )r   tyr%   r%   r&   r     r   zcompile.<locals>.<listcomp>c                 S  s   i | ]\}}||qS r%   r%   r   r%   r%   r&   r     s    z.json)r/   r3   r}   r   r*   rR   sharedz/ptx compilation must provide shared memory sizer   z.hsaco_pathz?Expected to have hsaco_path in metadata when we have the amdgcnr   z	// .globl)rq   r:   z.globl
hsaco_path)binaryr%   )4r   r   r   r(   r   r   r   r   r   Z
add_stagesr1   r   r   r6   __name__rG   	enumeraterI   rX   r;   basenamer   r   rT   rU   prototype_pattern	MULTILINErV   findallarg_type_patternttgir_num_warps_patternr2   r7   r8   indexr   Zmake_launcher_stubr   r   Z	get_groupopenr   loadr   r5   putr
   rv   Zadd_meta_infor   Z	put_groupCompiledKernel)"r   r   r   Z_device_backendZis_cudaZis_hipr   r:   Zfirst_stagerP   Zir_namerp   rT   rz   typesZnum_warps_matchesZ	param_tysso_pathZfn_cache_managerextmetadataZmetadata_filenameZmetadata_groupZmetadata_pathfasmr   parseZcompile_kernelZir_filenameZnext_moduler;   Zextra_file_nameZ
hasco_pathr%   )
r*   r{   r}   r   r   r=   r   r3   r/   r|   r&   compileb  s  





$




 














r   c                      sF   e Zd ZdZdZdd Zdd Z fddZdd	 Zdd
dZ	  Z
S )r   Nc                 C  s   dd l }|jd|}|j|}|| _|j| t|d| _d|v r(|d nd| _	|d | _
|d | _|d | _|d | _| jd	vrIt| jnd | _|| _|| _d | _d | _d S )
Nr   Z__triton_launcherZlaunchr   r/   r3   r}   r   r   )importlib.utilutilspec_from_file_locationmodule_from_specr   loaderexec_modulegetattr	c_wrapperr   r/   r3   r}   r   r   device_backendr   r   	cu_modulecu_function)selfr   r   r   r   	importlibspecr#   r%   r%   r&   __init__  s    




zCompiledKernel.__init__c           	      C  s   | j d urd S | jdv r't }tjdtjditj }tj|d }tjj	}n| j
s,J | j
 }| j
 }| j
|d }| j
 }| j|krOt| j|d|| jd | j| | j|\}}}}|| _|| _|| _ || _d S )Nr   r   r   Zmax_shared_memzshared memoryr:   )r   r   r   r   ZHIPCUDAbackendutilsZget_device_propertiesZload_binaryr   Zget_kernel_binZget_load_binary_fnr   r   r   r   n_spillsn_regsr   )	r   r   Zbin_pathZ
max_sharedZfn_load_binaryr#   funcr   r   r%   r%   r&   _init_handles$  s.   







$
zCompiledKernel._init_handlesc                   s   |dkr|    t |S )Nr   )r   super__getattribute__)r   r:   	__class__r%   r&   r   A  s   zCompiledKernel.__getattribute__c                   s       d d fdd
}|S )N)streamc                   sh   | d u rj dv rt } ntj d } j d  d  d jj| jtj	tj
g
|R   d S )N)r   Zrocmr   r   r   )r   r   r   
get_streamr   r/   r   r   r   launch_enter_hooklaunch_exit_hook)r   argsgridr   r%   r&   runnerI  s   
$
z*CompiledKernel.__getitem__.<locals>.runner)r   )r   r   r  r%   r   r&   __getitem__F  s   zCompiledKernel.__getitem__c              	   C  s   d| j v r
| j d S t \}}z*t|d}|| j d  W d    n1 s)w   Y  t||| _W t| nt| w | j| j d< | jS )Nsasswbr   )	r   tempfilemkstempr   writer   r  rX   remove)r   Zfunfdr;   r   r%   r%   r&   get_sassS  s   

zCompiledKernel.get_sassr   )r   
__module____qualname__r   r   r   r   r   r  r
  __classcell__r%   r%   r   r&   r     s    r   )r?   r2   r   )r#   r   r*   r2   rO   r2   r?   rG   )rR   rG   r*   r2   )
r#   r   r^   rG   rl   rG   rm   rG   r?   rn   )rp   rG   rq   rG   r?   rG   )O
__future__r   	functoolsr   r   rX   rT   rg   r  collectionsr   pathlibr   typingr   r   Z_C.libtriton.tritonr   r	   r
   r   r   r   r   Zcommon.backendr   r   Zruntime.autotunerr   Zruntime.cacher   Zruntime.driverr   Zruntime.jitr   r   r   r   r   Ztools.disasmr   Zcode_generatorr   Zmake_launcherr   r'   r+   r.   r0   r4   r<   r>   	lru_cacherN   rQ   rS   r`   rk   ro   rv   rx   r   Zmlir_prototype_patternZptx_prototype_patternr   Zmlir_arg_type_patternZptx_arg_type_patternr   r   r   r   setr   r(   r   r   r   r   r   r%   r%   r%   r&   <module>   sx    $


 %