o
    i"                     @   s   d dl mZ d dlmZ d dlmZ d dlmZm	Z	m
Z
 G dd deZG dd de
ZG d	d
 d
e	ZG dd deZdZG dd dejZdZG dd dejZdS )    )cuda)array)deviceufunc)UFuncMechanismGeneralizedUFuncGUFuncCallStepsc                   @   s2   e Zd ZdZdd Zdd ZdddZd	d
 ZdS )CUDAUFuncDispatcherzD
    Invoke the CUDA ufunc specialization for the given inputs.
    c                 C   s   || _ |j| _d S N)	functions__name__)selfZtypes_to_retty_kernelspyfunc r   e/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/numba/cuda/vectorizers.py__init__   s   zCUDAUFuncDispatcher.__init__c                 O   s   t | j||S )a  
        *args: numpy arrays or DeviceArrayBase (created by cuda.to_device).
               Cannot mix the two types in one call.

        **kws:
            stream -- cuda stream; when defined, asynchronous mode is used.
            out    -- output array. Can be a numpy array or DeviceArrayBase
                      depending on the input arguments.  Type must match
                      the input arguments.
        )CUDAUFuncMechanismcallr
   )r   argskwsr   r   r   __call__   s   zCUDAUFuncDispatcher.__call__r   c                 C   s   t t| j d dksJ d|jdksJ d|jd }g }|dkr)td|dkr1|d S |p6t }|	 0 tj
j|rF|}nt||}| |||}td|jd}|j||d	 W d    |d S 1 snw   Y  |d S )
Nr      zmust be a binary ufunc   zmust use 1d arrayzReduction on an empty array.)r   )dtypestream)lenlistr
   keysndimshape	TypeErrorr   r   Zauto_synchronizecudadrvdevicearrayis_cuda_ndarray	to_device_CUDAUFuncDispatcher__reducenp_arrayr   copy_to_host)r   argr   ngpu_memsmemoutbufr   r   r   reduce   s(   "


zCUDAUFuncDispatcher.reducec           
      C   s   |j d }|d dkr2||d \}}|| || | |||}|| | ||||dS ||d \}}	|| ||	 | ||	||d |d dkrZ| |||S |S )Nr   r   r   )r,   r   )r   splitappendr%   )
r   r+   r*   r   r)   ZfatcutZthincutr,   leftrightr   r   r   Z__reduce;   s   





zCUDAUFuncDispatcher.__reduceNr   )r   
__module____qualname____doc__r   r   r.   r%   r   r   r   r   r      s    
r   c                       sR   e Zd ZdgZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
  ZS )_CUDAGUFuncCallSteps_streamc                    s$   t  |||| |dd| _d S )Nr   r   )superr   getr8   )r   ZninZnoutr   kwargs	__class__r   r   r   X   s   z_CUDAGUFuncCallSteps.__init__c                 C   
   t |S r	   r   Zis_cuda_arrayr   objr   r   r   is_device_array\      
z$_CUDAGUFuncCallSteps.is_device_arrayc                 C      t jj|r	|S t |S r	   r   r!   r"   r#   Zas_cuda_arrayr@   r   r   r   as_device_array_      
z$_CUDAGUFuncCallSteps.as_device_arrayc                 C   s   t j|| jdS Nr   )r   r$   r8   )r   hostaryr   r   r   r$   i      z_CUDAGUFuncCallSteps.to_devicec                 C   s   |j || jd}|S rH   )r'   r8   )r   devaryrI   r,   r   r   r   to_hostl   s   z_CUDAGUFuncCallSteps.to_hostc                 C   s   t j||| jdS N)r   r   r   )r   device_arrayr8   )r   r   r   r   r   r   allocate_device_arrayp   s   z*_CUDAGUFuncCallSteps.allocate_device_arrayc                 C   s   |j || jd|  d S rH   )forallr8   )r   ZkernelZnelemr   r   r   r   launch_kernels   s   z"_CUDAGUFuncCallSteps.launch_kernel)r   r4   r5   	__slots__r   rB   rF   r$   rL   rO   rQ   __classcell__r   r   r<   r   r7   S   s    
r7   c                       s8   e Zd Z fddZedd Zdd Zdd Z  ZS )	CUDAGeneralizedUFuncc                    s   |j | _ t || d S r	   )r   r9   r   )r   	kernelmapenginer   r<   r   r   r   x   s   zCUDAGeneralizedUFunc.__init__c                 C      t S r	   )r7   r   r   r   r   _call_steps|      z CUDAGeneralizedUFunc._call_stepsc                 C   s   t jjj|d|j|jdS Nr3   r   stridesr   gpu_data)r   r!   r"   DeviceNDArrayr   r^   )r   aryr   r   r   r   _broadcast_scalar_input   s
   
z,CUDAGeneralizedUFunc._broadcast_scalar_inputc                 C   s:   t |t |j }d| |j }tjjj|||j|jdS r[   )	r   r   r]   r   r!   r"   r_   r   r^   )r   r`   ZnewshapeZnewaxZ
newstridesr   r   r   _broadcast_add_axis   s   
z(CUDAGeneralizedUFunc._broadcast_add_axis)	r   r4   r5   r   propertyrY   ra   rb   rS   r   r   r<   r   rT   w   s    
rT   c                   @   sL   e Zd ZdZdZdd Zdd Zdd Zd	d
 Zdd Z	dd Z
dd ZdS )r   z%
    Provide CUDA specialization
    r   c                 C   s   |j ||d|  d S rH   )rP   )r   funccountr   r   r   r   r   launch   s   zCUDAUFuncMechanism.launchc                 C   r>   r	   r?   r@   r   r   r   rB      rC   z"CUDAUFuncMechanism.is_device_arrayc                 C   rD   r	   rE   r@   r   r   r   rF      rG   z"CUDAUFuncMechanism.as_device_arrayc                 C   s   t j||dS rH   )r   r$   )r   rI   r   r   r   r   r$         zCUDAUFuncMechanism.to_devicec                 C   s   |j |dS rH   )r'   )r   rK   r   r   r   r   rL      s   zCUDAUFuncMechanism.to_hostc                 C   s   t j|||dS rM   )r   rN   )r   r   r   r   r   r   r   rO      rJ   z(CUDAUFuncMechanism.allocate_device_arrayc                    sn    fddt tD }tt j }dg| t j }|D ]}d||< q#tjjj| j	 j
dS )Nc                    s,   g | ]}| j ks j| | kr|qS r   )r   r   ).0axr`   r   r   r   
<listcomp>   s
    
z7CUDAUFuncMechanism.broadcast_device.<locals>.<listcomp>r   r\   )ranger   r   r   r]   r   r!   r"   r_   r   r^   )r   r`   r   Z
ax_differsZ
missingdimr]   ri   r   rj   r   broadcast_device   s   

z#CUDAUFuncMechanism.broadcast_deviceN)r   r4   r5   r6   ZDEFAULT_STREAMrf   rB   rF   r$   rL   rO   rm   r   r   r   r   r      s    
r   z
def __vectorized_{name}({args}, __out__):
    __tid__ = __cuda__.grid(1)
    if __tid__ < __out__.shape[0]:
        __out__[__tid__] = __core__({argitems})
c                   @   s8   e Zd Zdd Zdd Zdd Zdd Zed	d
 ZdS )CUDAVectorizec                 C   s*   t j|ddd| j}||j|j jjfS )NT)deviceinline)r   jitr   Z	overloadsr   	signaturereturn_type)r   sigZcudevfnr   r   r   _compile_core   s   zCUDAVectorize._compile_corec                 C   s    | j j }|t|d |S )NZ__cuda__Z__core__)r   __globals__copyupdater   )r   corefnZglblr   r   r   _get_globals   s
   zCUDAVectorize._get_globalsc                 C   r>   r	   r   rq   r   Zfnobjrt   r   r   r   _compile_kernel   rC   zCUDAVectorize._compile_kernelc                 C   s   t | j| jS r	   )r   rU   r   rX   r   r   r   build_ufunc   rg   zCUDAVectorize.build_ufuncc                 C   rW   r	   )vectorizer_stager_sourcerX   r   r   r   _kernel_template   rZ   zCUDAVectorize._kernel_templateN)	r   r4   r5   ru   r{   r~   r   rc   r   r   r   r   r   rn      s    rn   zy
def __gufunc_{name}({args}):
    __tid__ = __cuda__.grid(1)
    if __tid__ < {checkedarg}:
        __core__({argitems})
c                   @   s0   e Zd Zdd Zdd Zedd Zdd Zd	S )
CUDAGUFuncVectorizec                 C   s"   t | j| j}t| j|| jdS )N)rU   rV   r   )r   ZGUFuncEngineZinputsigZ	outputsigrT   rU   r   )r   rV   r   r   r   r      s
   zCUDAGUFuncVectorize.build_ufuncc                 C   s   t ||S r	   r|   r}   r   r   r   r~      rg   z#CUDAGUFuncVectorize._compile_kernelc                 C   rW   r	   )_gufunc_stager_sourcerX   r   r   r   r      rZ   z$CUDAGUFuncVectorize._kernel_templatec                 C   s4   t j|dd| j}| jj }|t |d |S )NT)ro   rv   )r   rq   r   Zpy_funcrw   rx   ry   )r   rt   rz   Zglblsr   r   r   r{      s   z CUDAGUFuncVectorize._get_globalsN)r   r4   r5   r   r~   rc   r   r{   r   r   r   r   r      s    
r   N)Znumbar   numpyr   r&   Znumba.np.ufuncr   Znumba.np.ufunc.deviceufuncr   r   r   objectr   r7   rT   r   r   ZDeviceVectorizern   r   ZDeviceGUFuncVectorizer   r   r   r   r   <module>   s    K$0