o
    i                     @   s:   d dl mZ d dlmZ d dlZd dlmZ dddZdS )    )cuda)driverN)numpy_supportc                    s   t | dd}|s%| j\}}| jj| | jjf}tjjj||f|| j|d}t	| j t
 j}ttdt|dd }t|| }||d ftj fdd}	t|jd | d t|jd | d f}
||f}|	|
||f | | |S )a  Compute the transpose of 'a' and store it into 'b', if given,
    and return it. If 'b' is not given, allocate a new array
    and return that.

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to copy
        `b` to the device if necessary).
    streamr   )dtyper         c           	         s   t jj d}t jj}t jj}t jjt jj }t jjt jj }|| }|| }|| | jd k rH|| | jd k rH| || || f |||f< t 	  ||jd k rf||jd k rh|||f |||f< d S d S d S )N)shaper   r   r   )
r   ZsharedarrayZ	threadIdxxyZblockIdxZblockDimr	   Zsyncthreads)	inputoutputZtileZtxtybxZbyr   r   dtZ
tile_shape k/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/numba/cuda/kernels/transpose.pykernel)   s   $ztranspose.<locals>.kernel)getattrr	   r   itemsizer   ZcudadrvZdevicearrayZDeviceNDArraynpsZ
from_dtyper   Z
get_deviceZMAX_THREADS_PER_BLOCKintmathpowlogZjit)abr   colsrowsstridesZtpbZ
tile_widthZtile_heightr   blocksthreadsr   r   r   	transpose   s*   

,r$   )N)	Znumbar   Znumba.cuda.cudadrv.driverr   r   Znumba.npr   r   r$   r   r   r   r   <module>   s
    