o
    i                  
   @   s  d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d
dlmZ d dlmZ d dlmZmZmZ d dlm Z m!Z!m"Z" e
 Z#e#j$Z$e#j%Z&e#j'Z'dd Z(e&e)eddd Z*e&e)eddd Z+e&e)eddd Z,e&e)eddd Z-e&e)eddd Z.e&e d d!d" Z/e&e d#d$d% Z0e&e d&d'd( Z1e$ej2j3d)d* Z4e$d+e!d,d- Z5e$ej6j7ej8d.d/ Z9d a:d0d1 Z;e$ej<j=ej>ej?d2d3 Z@e$ej<j=ejAej?e$ej<j=ejBej?d4d5 ZCe$ejDj=ej>ej?d6d7 ZEe$ejDj=ejAej?e$ejDj=ejBej?d8d9 ZFe$ejGd:d; ZHe$ejId<d= ZJe$ejKd>d? ZLe$ejMd@dA ZNe$ejMejOdBdC ZPe$ejQejOejOejOejOejOe$ejQejOejOejRejOejOe$ejQejOejOejSejOejOe$ejQejOejOejTejOejOdDdE ZUe$ejVejOejOejWdFdG ZXe$ejYejOejOe$ejYejOejRe$ejYejOejSe$ejYejOejTdHdI ZZe$ej[ejOejOe$ej[ejOejRe$ej[ejOejSe$ej[ejOejTdJdK Z\e$ej]dLdM Z^e$ej_dNdO Z`e$ejaej?dPdQ Zbe$ejcej?ej?ej?dRdS ZddTdU ZeeejfejgdVdW ZheejgejfdXdY ZidZd[ Zjeejfejkd\d] Zleejkejfeej>ejfd^d_ Zmd`da Znenejojpdb enejqdb enejrdb enejojsdc enejtdc enejudc enejojvdd enejwdd enejxdd e$ejojyejfdedf Zze$ej{ejfdgdh Z|e$ejoj}ejfdidj Z~e$eejfdkdl Ze$ejojejfejfejfdmdn Ze$ejejfejfe$ejejfejfdodp ZdqZdrds Ze$ejojejfejfedt e$ejejfejfedt e$ejojejfejfedu e$ejejfejfedu e$ejojejfejfedv e$ejejfejfedv e$ejojejfejfedw e$ejejfejfedw e$ejojejfejfedx e$ejejfejfedx e$ejojejfejfedy e$ejejfejfedy dzd{ Zeejojd|dw eejojd}dy ejd~ejdiZe$ejeje$ejejdd Ze$ejejdd Ze$ejejdd Ze$ejej?dd Ze$ejejOe$ejejdd Ze$ejejRe$ejejdd Ze$ejej?ej?ej?dd Ze$eejSejSdd Ze$eejTejSe$eejSejTe$eejTejTdd Ze$eejSejSdd Ze$eejTejSe$eejSejTe$eejTejTdd Ze$eejSe$eejTdd Ze$eejSejke$eejTejkdd Zdd Zejd Zdej Ze$ejejSee e$ejejTee e$ejejSee e$ejejTee dd Zdd Ze$ejjqej8ejej?e$ejjqej8ejBej?e$ejjqej8ejAej?edd Ze$ejjtej8ejej?e$ejjtej8ejBej?e$ejjtej8ejAej?edd Ze$ejjej8ejej?e$ejjej8ejBej?e$ejjej8ejAej?edd Ze$ejjej8ejej?e$ejjej8ejBej?e$ejjej8ejAej?edd Zdd Zeejjd eejjd eejjd e$ejjej8ejej?e$ejjej8ejBej?e$ejjej8ejAej?edd Ze$ejjej8ejej?e$ejjej8ejAej?e$ejjej8ejBej?edd Ze$ejjej8ejej?e$ejjej8ejAej?e$ejjej8ejBej?edd Ze$ejjej8ejej?e$ejjej8ejAej?e$ejjej8ejBej?edd Ze$ejjej8ejej?e$ejjej8ejAej?e$ejjej8ejBej?edd Ze$ejjej8ej?ej?dd Ze$ejjej8ejej?ej?e$ejjej8ejAej?ej?e$ejjej8ejBej?ej?dd Ze$ejej҃dd Z	dddZe'e"dd Zee֡ e$ dS )    )reduceN)ir)Registry
lower_cast)parse_dtype)models)typescgutils)ufunc_db)register_ufuncs   )nvvm)cuda)	nvvmutilsstubserrors)dim3
grid_groupCUDADispatcherc                 C   sB   t | d| }t | d| }t | d| }t| |||fS )Nz%s.xz%s.yz%s.z)r   	call_sregr	   Zpack_struct)builderprefixxyz r   b/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/numba/cuda/cudaimpl.pyinitialize_dim3   s   r   Z	threadIdxc                 C   
   t |dS )Ntidr   contextr   sigargsr   r   r   cuda_threadIdx       
r%   ZblockDimc                 C   r   )NZntidr    r!   r   r   r   cuda_blockDim%   r&   r'   ZblockIdxc                 C   r   )NZctaidr    r!   r   r   r   cuda_blockIdx*   r&   r(   ZgridDimc                 C   r   )NZnctaidr    r!   r   r   r   cuda_gridDim/   r&   r)   laneidc                 C   s   t |dS )Nr*   )r   r   r!   r   r   r   cuda_laneid4      r+   r   c                 C      | |dS Nr   extract_valuer!   r   r   r   dim3_x9   r,   r1   r   c                 C   r-   Nr   r/   r!   r   r   r   dim3_y>   r,   r3   r   c                 C   r-   )N   r/   r!   r   r   r   dim3_zC   r,   r5   c                 C   s(   |  tjd}|j}|t||fS r2   )get_constantr   int32modulecallr   Z declare_cudaCGGetIntrinsicHandle)r"   r   r#   r$   onelmodr   r   r   cg_this_gridH   s   r<   zGridGroup.syncc                 C   s0   |  tjd}|j}|t|g ||R S r.   )r6   r   r7   r8   r9   r   Zdeclare_cudaCGSynchronize)r"   r   r#   r$   flagsr;   r   r   r   ptx_sync_groupQ   s   r>   c                 C   s   |d S r.   r   r!   r   r   r   cuda_const_array_like\   s   r?   c                 C   s   t d7 a d| t S )zDue to bug with NVVM invalid internalizing of shared memory in the
    PTX output.  We can't mark shared memory to be internal. We have to
    ensure unique name is generated for shared memory symbol.
    r   z{0}_{1})_unique_smem_idformatnamer   r   r   _get_unique_smem_idf   s   rD   c              	   C   s8   |j d j}t|j d }t| ||f|tdtjddS )Nr   r   _cudapy_smemTshapedtypesymbol_name	addrspacecan_dynsized)r$   literal_valuer   _generic_arrayrD   r   ADDRSPACE_SHAREDr"   r   r#   r$   lengthrH   r   r   r   cuda_shared_array_integerp   s   rQ   c              	   C   s>   dd |j d D }t|j d }t| |||tdtjddS )Nc                 S      g | ]}|j qS r   rL   .0sr   r   r   
<listcomp>}       z+cuda_shared_array_tuple.<locals>.<listcomp>r   r   rE   TrF   )r$   r   rM   rD   r   rN   r"   r   r#   r$   rG   rH   r   r   r   cuda_shared_array_tuplez   s   
rZ   c              	   C   s4   |j d j}t|j d }t| ||f|dtjddS )Nr   r   _cudapy_lmemFrF   )r$   rL   r   rM   r   ADDRSPACE_LOCALrO   r   r   r   cuda_local_array_integer   s   r]   c              	   C   s:   dd |j d D }t|j d }t| |||dtjddS )Nc                 S   rR   r   rS   rT   r   r   r   rW      rX   z(ptx_lmem_alloc_array.<locals>.<listcomp>r   r   r[   FrF   )r$   r   rM   r   r\   rY   r   r   r   ptx_lmem_alloc_array   s   
r^   c                 C   D   |rJ d}|j }tt d}t|||}||d |  S )Nzllvm.nvvm.membar.ctar   r8   r   FunctionTypeVoidTyper	   get_or_insert_functionr9   get_dummy_valuer"   r   r#   r$   fnamer;   fntysyncr   r   r   ptx_threadfence_block      ri   c                 C   r_   )Nzllvm.nvvm.membar.sysr   r`   re   r   r   r   ptx_threadfence_system   rj   rk   c                 C   r_   )Nzllvm.nvvm.membar.glr   r`   re   r   r   r   ptx_threadfence_device   rj   rl   c                 C   s*   |  tjd}ttj}t| |||gS )Nl    )r6   r   r7   noneptx_syncwarp_mask)r"   r   r#   r$   maskZmask_sigr   r   r   ptx_syncwarp   s   rp   c                 C   sD   d}|j }tt tdf}t|||}||| |  S )Nzllvm.nvvm.bar.warp.sync    )	r8   r   ra   rb   IntTyper	   rc   r9   rd   re   r   r   r   rn      s   rn   c              
   C   s  |\}}}}}|j d }	|	tjv r||t|	j}d}
|j}tt	tdtdftdtdtdtdtdf}t
|||
}|	jdkr|||||||f}|	tjkr}||d}||d}||t }t
|||f}|S ||td}||| tjd}||td}|||||||f}|||||||f}||d}||d}||d}||td}||td}||| tjd}|||}|	tjkr||t }t
|||f}|S )a  
    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
    function supports both 32 and 64 bit ints and floats, so for feature parity,
    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
    an int, then shuffling, then bitcasting back. And 64-bit values by packing
    them into 2 32bit values, shuffling thoose, and then packing back together.
    r4   zllvm.nvvm.shfl.sync.i32rq   r   r   @   )r$   r   real_domainbitcastr   rr   bitwidthr8   ra   LiteralStructTyper	   rc   r9   float32r0   	FloatTypeZmake_anonymous_structtruncZlshrr6   i8zextZshlor_float64
DoubleType)r"   r   r#   r$   ro   modevalueindexclampZ
value_typerf   r;   rg   funcretrvpredZfvZvalue1Z
value_lshrZvalue2Zret1Zret2Zrv1Zrv2Zrv1_64Zrv2_64Zrv_shlr   r   r   ptx_shfl_sync_i32   sJ   




r   c                 C   s^   d}|j }tttdtdftdtdtdf}t|||}|||S )Nzllvm.nvvm.vote.syncrq   r   )r8   r   ra   rw   rr   r	   rc   r9   )r"   r   r#   r$   rf   r;   rg   r   r   r   r   ptx_vote_sync  s   r   c                 C   s   |\}}|j d j}|j d tjv r||t|}d|}|j}t	tdtdt|f}	t
||	|}
||
||fS )Nr   zllvm.nvvm.match.any.sync.i{}rq   )r$   rv   r   rt   ru   r   rr   rA   r8   ra   r	   rc   r9   r"   r   r#   r$   ro   r   widthrf   r;   rg   r   r   r   r   ptx_match_any_sync  s   
"r   c                 C   s   |\}}|j d j}|j d tjv r||t|}d|}|j}t	t
tdtdftdt|f}	t||	|}
||
||fS )Nr   zllvm.nvvm.match.all.sync.i{}rq   )r$   rv   r   rt   ru   r   rr   rA   r8   ra   rw   r	   rc   r9   r   r   r   r   ptx_match_all_sync  s   
r   c                 C   ,   t jt t dg dddd}||g S )Nrq   zactivemask.b32 $0;=rTZside_effectr   	InlineAsmra   rr   r9   r"   r   r#   r$   
activemaskr   r   r   ptx_activemask/  s   r   c                 C   r   )Nrq   zmov.u32 $0, %lanemask_lt;r   Tr   r   r   r   r   r   ptx_lanemask_lt6  s
   r   c                 C   s   | |d S r.   )Zctpopr!   r   r   r   ptx_popc>     r   c                 C   s
   |j | S N)fmar!   r   r   r   ptx_fmaC  r&   r   c                 C   s:   ddd}z||  W S  t y   d|  d}t|w )N)Zf32f)Zf64d)rq   rs   z$Conversion between float16 and float unsupportedKeyErrorr   ZCudaLoweringErrorrv   typemapmsgr   r   r   float16_float_ty_constraintH  s   


r   c           	      C   sd   |j |j kr|S t|j \}}t| |tdg}t|d| dd| d}|||gS )N   zcvt..f16 $0, $1;=,h)rv   r   r   ra   get_value_typerr   r   r9   	r"   r   fromtytotyvalty
constraintrg   asmr   r   r   float16_to_float_castR  s   r   c           	      C   sb   |j |j kr|S t|j \}}ttd| |g}t|d| dd| }|||gS )Nr   cvt.rn.f16. $0, $1;=h,)rv   r   r   ra   rr   r   r   r9   r   r   r   r   float_to_float16_cast^  s   r   c                 C   s>   ddddd}z||  W S  t y   d|  d}t|w )Nchrl)   r   rq   rs   z"Conversion between float16 and intr   r   r   r   r   r   float16_int_constraintj  s   

r   c           
      C   sf   |j }t|}|jrdnd}t| |tdg}t|d| | dd| d}	||	|gS )NrV   ur   zcvt.rni.r   r   r   )	rv   r   signedr   ra   r   rr   r   r9   
r"   r   r   r   r   rv   r   Z
signednessrg   r   r   r   r   float16_to_integer_castt  s   
r   c           
      C   sd   |j }t|}|jrdnd}ttd| |g}t|d| | dd| }	||	|gS )NrV   r   r   r   r   r   )	rv   r   r   r   ra   rr   r   r   r9   r   r   r   r   integer_to_float16_cast  s   
r   c                    s    t | tjtj fdd}d S )Nc                    sB   t t dt dt dg}t |  dd}|||S )Nr   z.f16 $0,$1,$2;=h,h,hr   ra   rr   r   r9   r"   r   r#   r$   rg   r   opr   r   ptx_fp16_binary  s
   z*lower_fp16_binary.<locals>.ptx_fp16_binarylowerr   float16)fnr   r   r   r   r   lower_fp16_binary     r   addsubmulc                 C   4   t t dt dg}t |dd}|||S )Nr   zneg.f16 $0, $1;=h,hr   r   r   r   r   ptx_fp16_hneg     r   c                 C      t | |||S r   )r   r!   r   r   r   operator_hneg  r   r   c                 C   r   )Nr   zabs.f16 $0, $1;r   r   r   r   r   r   ptx_fp16_habs  r   r   c                 C   r   r   )r   r!   r   r   r   operator_habs  r   r   c                 C   sH   t dt dt dg}t t d|}t |dd}|||S )Nr   zfma.rn.f16 $0,$1,$2,$3;z=h,h,h,h)r   rr   ra   r   r9   )r"   r   r#   r$   Zargtysrg   r   r   r   r   ptx_hfma  s   r   c                 C      dd }|  ||||S )Nc                 S   s   t j| |S r   )r   fp16Zhdiv)r   r   r   r   r   fp16_div  s   zfp16_div_impl.<locals>.fp16_divZcompile_internal)r"   r   r#   r$   r   r   r   r   fp16_div_impl  s   r   z{{
          .reg .pred __$$f16_cmp_tmp;
          setp.{op}.f16 __$$f16_cmp_tmp, $1, $2;
          selp.u16 $0, 1, 0, __$$f16_cmp_tmp;
        }}c                        fdd}|S )Nc           	         sr   t t dt dt dg}t |tj dd}|||}| tj	d}|
|t d}|d||S )Nr   r   r   r   z!=)r   ra   rr   r   	_fp16_cmprA   r9   r6   r   Zint16ru   Zicmp_unsigned)	r"   r   r#   r$   rg   r   resultzeroZ
int_resultr   r   r   ptx_fp16_comparison  s   "z*_gen_fp16_cmp.<locals>.ptx_fp16_comparisonr   )r   r   r   r   r   _gen_fp16_cmp  s   r   eqnegegtleltc                    s    t | tjtj fdd}d S )Nc                    s(   t  | |||}|||d |d S )Nr   r   )r   select)r"   r   r#   r$   choicer   r   r   ptx_fp16_minmax  s   z*lower_fp16_minmax.<locals>.ptx_fp16_minmaxr   )r   rf   r   r   r   r   r   lower_fp16_minmax  r   r   maxminZ
__nv_cbrtfZ	__nv_cbrtc           
      C   sF   |j }t| }| |}|j}t||g}t|||}	||	|S r   )	return_type
cbrt_funcsr   r8   r   ra   r	   rc   r9   )
r"   r   r#   r$   r   rf   Zftyr;   rg   r   r   r   r   ptx_cbrt  s   
r   c              	   C   2   t |jttdtdfd}|||S )Nrq   Z	__nv_brevr	   rc   r8   r   ra   rr   r9   r"   r   r#   r$   r   r   r   r   ptx_brev_u4     r   c              	   C   r   )Nrs   Z__nv_brevllr   r   r   r   r   ptx_brev_u8  r   r   c                 C   s   | |d | tjdS r.   )Zctlzr6   r   booleanr!   r   r   r   ptx_clz'  s   r   c              	   C   r   )Nrq   Z__nv_ffsr   r   r   r   r   
ptx_ffs_32.     r   c              	   C   s2   t |jttdtdfd}|||S )Nrq   rs   Z
__nv_ffsllr   r   r   r   r   
ptx_ffs_648  r   r   c                 C   s   |\}}}| |||S r   )r   )r"   r   r#   r$   testabr   r   r   ptx_selpB  s   
r  c              	   C   4   t |jtt t t fd}|||S )NZ
__nv_fmaxfr	   rc   r8   r   ra   ry   r9   r   r   r   r   
ptx_max_f4H     r  c              
   C   h   t |jtt t t fd}||| ||d |jd t	j
| ||d |jd t	j
gS )NZ	__nv_fmaxr   r   r	   rc   r8   r   ra   r   r9   castr$   r   doubler   r   r   r   
ptx_max_f8S     r
  c              	   C   r  )NZ
__nv_fminfr  r   r   r   r   
ptx_min_f4d  r  r  c              
   C   r  )NZ	__nv_fminr   r   r  r   r   r   r   
ptx_min_f8o  r  r  c              	   C   sJ   t |jttdt fd}||| ||d |j	d t
jgS )Nrs   Z__nv_llrintr   )r	   rc   r8   r   ra   rr   r   r9   r  r$   r   r	  r   r   r   r   	ptx_round  s   r  c                 C   r   )Nc                 S   s   t | s
t | r| S |dkr1|dkrd|d  }d}nd| }d}| | | }t |r0| S n	d|  }| | }t|}t || dkrOdt|d  }|dkr[|| | }|S ||9 }|S )Nr      g      $@gMDg      ?g      ?g       @)mathisinfisnanroundfabs)r   ndigitsZpow1Zpow2r   r   r   r   r   round_ndigits  s,   

z$round_to_impl.<locals>.round_ndigitsr   )r"   r   r#   r$   r  r   r   r   round_to_impl  s   !r  c                    r   )Nc                    s$   |j \}| | }|||d S r.   )r$   r6   Zfmul)r"   r   r#   r$   Zargtyfactorconstr   r   impl  s   zgen_deg_rad.<locals>.implr   )r  r  r   r  r   gen_deg_rad  s   r  g     f@c                    s   |t jv rt j|dd}|g}n
tj |t|d} fddt||D }|j}||kr6td||f |j	t|krHtd|j	t|f ||fS )z4
    Convert integer indices into tuple of intp
    r   )rH   count)r  c                    s"   g | ]\}}  ||tjqS r   )r  r   intp)rU   tir   r"   r   r   rW     s    z&_normalize_indices.<locals>.<listcomp>zexpect %s but got %sz#indexing %d-D array with %d-D index)
r   Zinteger_domainUniTupler	   Zunpack_tuplelenziprH   	TypeErrorndim)r"   r   indtyindsarytyvaltyindicesrH   r   r!  r   _normalize_indices  s   
r,  c                    r   )Nc                    sj   |j \}}}|\}}}	|j}
t| |||||\}}| || ||}tj| ||||dd} | ||
||	S )NTZ
wraparound)r$   rH   r,  
make_arrayr	   get_item_pointer)r"   r   r#   r$   r)  r'  r*  aryr(  r   rH   r+  laryptrdispatch_fnr   r   imp  s   

z_atomic_dispatcher.<locals>.impr   )r4  r5  r   r3  r   _atomic_dispatcher  s   r6  c                 C   \   |t jkr|j}|t|||fS |t jkr&|j}|t|||fS |d||dS )Nr   	monotonic)	r   rx   r8   r9   r   Zdeclare_atomic_add_float32r~   Zdeclare_atomic_add_float64
atomic_rmwr"   r   rH   r2  r   r;   r   r   r   ptx_atomic_add_tuple     

r;  c                 C   r7  )Nr   r8  )	r   rx   r8   r9   r   Zdeclare_atomic_sub_float32r~   Zdeclare_atomic_sub_float64r9  r:  r   r   r   ptx_atomic_sub  r<  r=  c                 C   L   |t jjv r|j}|j}ttd| }|||||fS td| d)NZdeclare_atomic_inc_intzUnimplemented atomic inc with  array	r   cudadeclZunsigned_int_numba_typesrv   r8   getattrr   r9   r%  r"   r   rH   r2  r   bwr;   r   r   r   r   ptx_atomic_inc     rE  c                 C   r>  )NZdeclare_atomic_dec_intzUnimplemented atomic dec with r?  r@  rC  r   r   r   ptx_atomic_dec"  rF  rG  c                    s@   t  fdd}tjtjtjfD ]}t| tj|tj| qd S )Nc                    s2   |t jjv r| ||dS td  d| d)Nr8  zUnimplemented atomic z with r?  r   rA  integer_numba_typesr9  r%  r"   r   rH   r2  r   r   r   r   impl_ptx_atomic1  s   z+ptx_atomic_bitwise.<locals>.impl_ptx_atomic)r6  r   r  r"  Tupler   ArrayAny)Zstubr   rK  r   r   r   r   ptx_atomic_bitwise0  s
   rO  andorxorc                 C   s,   |t jjv r|d||dS td| d)NZxchgr8  zUnimplemented atomic exch with r?  rH  rJ  r   r   r   ptx_atomic_exchA  s   rS  c                 C      |j }|tjkr|t|||fS |tjkr#|t|||fS |tjtj	fv r4|j
d||ddS |tjtjfv rE|j
d||ddS td| Nr   r8  ZorderingZumaxz&Unimplemented atomic max with %s array)r8   r   r~   r9   r   Zdeclare_atomic_max_float64rx   Zdeclare_atomic_max_float32r7   int64r9  uint32uint64r%  r:  r   r   r   ptx_atomic_maxL     

rZ  c                 C   rT  Nr   r8  rV  Zuminz&Unimplemented atomic min with %s array)r8   r   r~   r9   r   Zdeclare_atomic_min_float64rx   Zdeclare_atomic_min_float32r7   rW  r9  rX  rY  r%  r:  r   r   r   ptx_atomic_min`  r[  r]  c                 C   rT  rU  )r8   r   r~   r9   r   Zdeclare_atomic_nanmax_float64rx   Zdeclare_atomic_nanmax_float32r7   rW  r9  rX  rY  r%  r:  r   r   r   ptx_atomic_nanmaxt  r[  r^  c                 C   rT  r\  )r8   r   r~   r9   r   Zdeclare_atomic_nanmin_float64rx   Zdeclare_atomic_nanmin_float32r7   rW  r9  rX  rY  r%  r:  r   r   r   ptx_atomic_nanmin  r[  r_  c                 C   sT   | |jd tj|jd |jd }|d | tjd|d |d f}t| |||S )Nr   r   r4   )r   r$   r   r  r6   ptx_atomic_casr!   r   r   r   ptx_atomic_compare_and_swap  s   $"ra  c                 C   s   |j \}}}}|\}}	}
}t| |||	||\}}| || ||}tj| ||||dd}|jtjjv rD|j	}|jj
}t|||||
|S td|j )NTr-  z&Unimplemented atomic cas with %s array)r$   r,  r.  r	   r/  rH   r   rA  rI  r8   rv   r   Zatomic_cmpxchgr%  )r"   r   r#   r$   r)  r'  Zoldtyr*  r0  r(  oldr   r+  r1  r2  r;   rv   r   r   r   r`    s   r`  c                 C   s@   t jt t  t dgdddd}|d }|||g d S )Nrq   znanosleep.u32 $0;r   Tr   r   )r   r   ra   rb   rr   r9   )r"   r   r#   r$   	nanosleepnsr   r   r   ptx_nanosleep  s
   re  Fc               	      sb  t tj|d}|dko|ot|dk}|dkr|std j| }	t|tjtj	fp5t|	t
jp5|tjk}
|tjvrC|
sCtd|  |}t||}|tjkr\tj|||d}n4|j}t||||} |}d|d  > |_|r{d|_nt|tj|_||t t!dd}t"#t$ j%} |}|&|}|}g }t't(|D ]\}}|)| ||9 }qd	d
 t(|D } fdd
|D }|rtj*t+t!dg dddd}|,|-|g t!d} .tj/|}|0||g}n	 fdd
|D }t|}tj1||dd} 2| |} j3||4||j5j6|| .tj/|d d |7 S )Nr   r   zarray length <= 0zunsupported type: %srB   Zexternalr   Zgenericc                 S   s   g | ]}|qS r   r   rT   r   r   r   rW     s    z"_generic_array.<locals>.<listcomp>c                       g | ]	}  tj|qS r   r6   r   r  rT   r"   r   r   rW         rq   zmov.u32 $0, %dynamic_smem_size;r   Tr   rs   c                    rf  r   rg  rT   rh  r   r   rW     ri  C)rH   r&  Zlayout)datarG   stridesitemsizeZmeminfo)8r   operatorr   r#  
ValueErrorZdata_model_manager
isinstancer   ZRecordBooleanr   ZStructModelr   Znumber_domainr%  Zget_data_typer   	ArrayTyper   r\   r	   Zalloca_oncer8   Zadd_global_variableZget_abi_sizeof
bit_lengthalignlinkageConstant	UndefinedZinitializerZaddrspacecastZPointerTyperr   llZcreate_target_dataZNVVMZdata_layoutZget_abi_size	enumeratereversedappendr   ra   r|   r9   r6   r  ZudivrM  r.  Zpopulate_arrayru   rk  typeZ	_getvalue) r"   r   rG   rH   rI   rJ   rK   Z	elemcountZdynamic_smemZ
data_modelZother_supported_typeZlldtypeZlarytyZdataptrr;   Zgvmemrt  Z
targetdatarm  Z
laststrideZrstridesr   Zlastsizerl  ZkstridesZget_dynshared_sizeZdynsmem_sizeZ	kitemsizeZkshaper&  r)  r0  r   rh  r   rM     sx   









rM   c                 C   s   |   S r   )rd   )r"   r   r   Zpyvalr   r   r   cuda_dispatcher_const*  s   r}  )F)	functoolsr   rn  r  Zllvmliter   Zllvmlite.bindingZbindingrx  Znumba.core.imputilsr   r   Znumba.core.typing.npydeclr   Znumba.core.datamodelr   Z
numba.corer   r	   Znumba.npr
   Znumba.np.npyimplr   Zcudadrvr   Znumbar   Z
numba.cudar   r   r   Znumba.cuda.typesr   r   r   registryr   Zlower_getattrZ
lower_attrZlower_constantr   Moduler%   r'   r(   r)   r+   r1   r3   r5   ZcgZ	this_gridr<   r>   r  Z
array_likerM  r?   r@   rD   ZsharedarrayZIntegerLiteralrN  rQ   rL  r"  rZ   localr]   r^   Zthreadfence_blockri   Zthreadfence_systemrk   Zthreadfencerl   Zsyncwarprp   i4rn   Zshfl_sync_intrinsicr{   Zf4Zf8r   Zvote_sync_intrinsicr   r   Zmatch_any_syncr   Zmatch_all_syncr   r   r   Zlanemask_ltr   Zpopcr   r   r   r   r   Floatr   r   r   Integerr   r   r   r   Zhaddr   iaddZhsubr   isubZhmulr   imulZhnegr   negr   Zhabsr   absr   Zhfmar   truedivitruedivr   r   r   Zheqr   hner   Zhger   Zhgtr   hler   Zhltr   r   ZhmaxZhminrx   r~   r   Zcbrtr   ZbrevZu4r   u8r   Zclzr   Zffsr   r   Zselpr  r   r  r
  r   r  r  r  r  r  r  piZ_deg2radZ_rad2degradiansdegreesr,  r6  Zatomicr  r;  r=  incrE  decrG  rO  and_r}   rR  ZexchrS  rZ  r]  Znanmaxr^  Znanminr_  Zcompare_and_swapra  Zcasr`  rc  rX  re  rM   r}  Z
get_ufuncsr   r   r   r   <module>   s   














		
		







	.










	
















%






d
