o
    !‘i1T  ã                   @   s¾   d dl Z d dlZd dlmZmZ ddlmZ e	ej
dƒs4edƒej
jd< edƒej
jd< edƒej
jd< d d	lmZmZmZ d
d„ Zdd„ ZG dd„ dej
jƒZG dd„ dƒZ	ddd„ZdS )é    N)Útree_flattenÚtree_unflattené   )Ú_dummy_typeZ_CudaStreamBaseÚ
_CUDAGraphÚ_graph_pool_handleÚ_cuda_isCurrentStreamCapturing)r   r   r   c                   C   ó   t ƒ S )zÒ
    Returns True if CUDA graph capture is underway on the current CUDA stream, False otherwise.

    If a CUDA context does not exist on the current device, returns False without initializing the context.
    )r   © r
   r
   ú`/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/torch/cuda/graphs.pyÚis_current_stream_capturing   s   r   c                   C   r	   )zß
    Returns an opaque token representing the id of a graph memory pool.
    See :ref:`Graph memory management<graph-memory-management>`.

    .. warning::
        This API is in beta and may change in future releases.
    )r   r
   r
   r
   r   Úgraph_pool_handle$   s   r   c                       sv   e Zd ZdZ‡ fdd„Zd‡ fdd„	Z‡ fdd	„Z‡ fd
d„Z‡ fdd„Z‡ fdd„Z	‡ fdd„Z
‡ fdd„Z‡  ZS )Ú	CUDAGraphzw
    Wrapper around a CUDA graph.

    .. warning::
        This API is in beta and may change in future releases.
    c                    s   t ƒ  | ¡S ©N)ÚsuperÚ__new__)Úcls©Ú	__class__r
   r   r   8   s   zCUDAGraph.__new__NÚglobalc                    s   t ƒ j||d dS )aú  
        Begins capturing CUDA work on the current stream.

        Typically, you shouldn't call ``capture_begin`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_begin`` internally.

        Arguments:
            pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
                :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
                with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
            capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
                Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
                may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
                actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting
                unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_
        )ÚpoolÚcapture_error_modeN)r   Úcapture_begin)Úselfr   r   r   r
   r   r   ;   s   zCUDAGraph.capture_beginc                    ó   t ƒ  ¡  dS )aP  
        Ends CUDA graph capture on the current stream.
        After ``capture_end``, ``replay`` may be called on this instance.

        Typically, you shouldn't call ``capture_end`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_end`` internally.
        N)r   Úcapture_end©r   r   r
   r   r   O   s   	zCUDAGraph.capture_endc                    r   )z?
        Replays the CUDA work captured by this graph.
        N)r   Úreplayr   r   r
   r   r   Z   ó   zCUDAGraph.replayc                    r   )zD
        Deletes the graph currently held by this instance.
        N)r   Úresetr   r   r
   r   r   `   r   zCUDAGraph.resetc                    ó
   t ƒ  ¡ S )zí
        Returns an opaque token representing the id of this graph's memory pool.
        This id can optionally be passed to another graph's ``capture_begin``,
        which hints the other graph may share the same memory pool.
        )r   r   r   r   r
   r   r   f   s   
zCUDAGraph.poolc                    r    )zB
        Enables debugging mode for CUDAGraph.debug_dump.
        )r   Úenable_debug_moder   r   r
   r   r!   n   s   
zCUDAGraph.enable_debug_modec                    s   t ƒ  |¡S )zÖ
        Arguments:
            debug_path (required): Path to dump the graph to.

        Calls a debugging function to dump the graph if the debugging is
        enabled via CUDAGraph.enable_debug_mode()
        )r   Ú
debug_dump)r   Z
debug_pathr   r
   r   r"   t   s   zCUDAGraph.debug_dump)Nr   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   r   r   r   r!   r"   Ú__classcell__r
   r
   r   r   r   0   s    r   c                   @   s:   e Zd ZdZdZ			ddefdd„Zdd„ Zd	d
„ ZdS )ÚgraphaÎ  
    Context-manager that captures CUDA work into a :class:`torch.cuda.CUDAGraph`
    object for later replay.

    See :ref:`CUDA Graphs <cuda-graph-semantics>` for a general introduction,
    detailed use, and constraints.

    Arguments:
        cuda_graph (torch.cuda.CUDAGraph): Graph object used for capture.
        pool (optional): Opaque token (returned by a call to :func:`~torch.cuda.graph_pool_handle()` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) hinting this graph's capture
            may share memory from the specified pool. See :ref:`Graph memory management<graph-memory-management>`.
        stream (torch.cuda.Stream, optional): If supplied, will be set as the current stream in the context.
            If not supplied, ``graph`` sets its own internal side stream as the current stream in the context.
        capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
            Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
            may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
            actions in the current thread, and "relaxed" will not error on actions. Do NOT change this setting
            unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_

    .. note::
        For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture
        used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture.

    .. warning::
        This API is in beta and may change in future releases.

    .. _cudaStreamCaptureMode:
        https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85
    Nr   r   c                 C   sr   | j jd u rtj ¡ | j _|d u rdn|f| _|d ur|n| j j| _| jd us)J ‚tj | j¡| _|| _	|| _
d S )Nr
   )r   Údefault_capture_streamÚtorchÚcudaÚStreamr   Zcapture_streamÚstreamÚ
stream_ctxÚ
cuda_graphr   )r   r/   r   r-   r   r
   r
   r   Ú__init__    s   
ÿ
zgraph.__init__c                 C   s@   t j ¡  t ¡  t j ¡  | j ¡  | jj	| j
d| jiŽ d S )Nr   )r*   r+   ÚsynchronizeÚgcZcollectZempty_cacher.   Ú	__enter__r/   r   r   r   r   r
   r
   r   r3   ¶   s   


ÿ
ÿzgraph.__enter__c                 C   s   | j  ¡  | j |||¡ d S r   )r/   r   r.   Ú__exit__)r   Úexc_typeÚ	exc_valueÚ	tracebackr
   r
   r   r4   Ä   s   
zgraph.__exit__)NNr   )	r#   r$   r%   r&   r)   Ústrr0   r3   r4   r
   r
   r
   r   r(      s    û
ûr(   é   Fc           %         sn  t  ¡ rt  ¡ rtdƒ‚d}t| tƒsd}| f} |f}g ‰ t| |ƒD ]M\}}t|t jjƒrUt	|j
ƒdkrBt	|jƒdkrBt	|jƒdksFJ dƒ‚tdd„ | ¡ D ƒƒsUJ dƒ‚t|ƒ\}}ˆ  t|ƒ¡ td	d„ |D ƒƒsoJ d
ƒ‚q"dd„ ˆ D ƒ}	dd„ | D ƒ‰‡ ‡fdd„tt	| ƒƒD ƒ}
dd„ tt	| ƒƒD ƒ}dd„ tt	| ƒƒD ƒ}tƒ }t j ¡  t j t j ¡ ¡E t| ||
ƒD ]6\}}}t|ƒD ]*}t||Ž ƒ\}}t jjtdd„ |D ƒƒtdd„ |D ƒƒtdd„ |D ƒƒd|d}qÃ~~qºW d  ƒ n1 sûw   Y  t j ¡  g }g }t| ||ƒD ]5\}}}t jj||d ||Ž }W d  ƒ n	1 s,w   Y  t|ƒ\}}| t|ƒ¡ | |¡ qg }g }tt|
ƒt|ƒt|ƒtˆƒƒD ]w\}}}}tdd„ |D ƒƒ}t jj||d( t jjtdd„ |D ƒƒtdd„ |D ƒƒtdd„ |D ƒƒd|d}W d  ƒ n	1 s›w   Y  g }d}|D ]}|jr¹| || ¡ |d7 }q¦| d¡ q¦t|ƒ}| |¡ | |¡ qXtt|ƒƒ}tt|ƒƒ}dd„ } g }!t| ƒD ]E\}"}| ||" ||" ˆ|" |	|" ||" |
|" ||" ||" ||" ƒ	}#t|t jjƒr%dd„ }$|$||j|#|jƒ|_|! |¡ qæ|! |#¡ qæ|r3|!d S t|!ƒS ) a¸  
    Accepts callables (functions or :class:`nn.Module<torch.nn.Module>`\ s)
    and returns graphed versions.

    Each graphed callable's forward pass runs its source callable's
    forward CUDA work as a CUDA graph inside a single autograd node.

    The graphed callable's forward pass also appends
    a backward node to the autograd graph. During backward, this node runs the
    callable's backward work as a CUDA graph.

    Therefore, each graphed callable should be a drop-in replacement for its source callable
    in an autograd-enabled training loop.

    See :ref:`Partial-network capture<partial-network-capture>` for detailed use and constraints.

    If you pass a tuple of several callables, their captures will use the same memory pool.
    See :ref:`Graph memory management<graph-memory-management>` for when this is appropriate.

    Arguments:
        callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph.
            See :ref:`Graph memory management<graph-memory-management>` for when passing a tuple of callables
            is appropriate.  If you pass a tuple of callables, their order in the tuple must be the same order
            they'll run in the live workload.
        sample_args (tuple of Tensors, or tuple of tuples of Tensors): Samples args for each callable.
            If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors.
            If a tuple of callables was passed, ``sample_args`` must be tuple of tuples of argument Tensors.
        num_warmup_iters (int): The number of warmup iterations. Currently, ``DataDistributedParallel`` needs
            11 iterations for warm up. Default: ``3``.
        allow_unused_input (bool): If False, specifying inputs that were not used when computing outputs
            (and therefore their grad is always zero) is an error. Defaults to False.

    .. note::
        The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state
        that's expected for the corresponding real input in the training loop.

    .. warning::
        This API is in beta and may change in future releases.

    .. warning::
        ``sample_args`` for each callable must contain only Tensors. Other types are not allowed.

    .. warning::
        Returned callables do not support higher order differentiation (e.g., double backward).

    .. warning::
        In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters
        may be trainable. Buffers must have ``requires_grad=False``.

    .. warning::
        After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`,
        you may not add or remove any of that Module's parameters or buffers.

    .. warning::
        :class:`torch.nn.Module`\s passed to :func:`~torch.cuda.make_graphed_callables` must not have module hooks
        registered on them at the time they are passed. However, registering hooks on modules *after* passing them
        through :func:`~torch.cuda.make_graphed_callables` is allowed.

    .. warning::
        When running a graphed callable, you must pass its arguments in the same order and format
        they appeared in that callable's ``sample_args``.

    .. warning::
        The automatic mixed precision is supported in :func:`~torch.cuda.make_graphed_callables` only with disabled
        caching. The context manager `torch.cuda.amp.autocast()` must have `cache_enabled=False`.
    z_make_graphed_callables does not support the autocast caching. Please set `cache_enabled=False`.FTr   z§Modules must not have hooks registered at the time they are passed. However, registering hooks on modules after passing them through make_graphed_callables is allowed.c                 s   s    | ]}|j d u V  qdS )FN©Úrequires_grad©Ú.0Úbr
   r
   r   Ú	<genexpr>'  ó   € z)make_graphed_callables.<locals>.<genexpr>zœIn any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters may be trainable. All buffers must have ``requires_grad=False``.c                 s   s    | ]	}t |tjƒV  qd S r   )Ú
isinstancer*   ZTensor)r=   Úargr
   r
   r   r?   .  ó   € zfIn the beta API, sample_args for each callable must contain only Tensors. Other types are not allowed.c                 S   s   g | ]}t |ƒ‘qS r
   )Úlen)r=   Úargsr
   r
   r   Ú
<listcomp>5  s    z*make_graphed_callables.<locals>.<listcomp>c                 S   s*   g | ]}t |tjjƒrt| ¡ ƒnd ‘qS )r
   )rA   r*   ÚnnÚModuleÚtupleÚ
parameters)r=   Úcr
   r
   r   rF   6  s    ÿÿc                    s   g | ]
}ˆ | ˆ|  ‘qS r
   r
   ©r=   Úi©Zflatten_sample_argsZper_callable_module_paramsr
   r   rF   :  s    ÿÿc                 S   ó   g | ]}t j ¡ ‘qS r
   ©r*   r+   r   ©r=   Ú_r
   r
   r   rF   ?  ó    c                 S   rO   r
   rP   rQ   r
   r
   r   rF   @  rS   c                 s   ó    | ]}|j r|V  qd S r   r:   ©r=   Úor
   r
   r   r?   O  r@   c                 s   rT   r   r:   rL   r
   r
   r   r?   P  r@   c                 s   s     | ]}|j rt |¡V  qd S r   ©r;   r*   Z
empty_likerU   r
   r
   r   r?   Q  s   € ÿ
ÿ)ÚoutputsÚinputsZgrad_outputsZonly_inputsZallow_unusedN)r   c                 s   s$    | ]}|j rt |¡nd V  qd S r   rW   rU   r
   r
   r   r?   t  ó   € 
ÿc                 s   rT   r   r:   rU   r
   r
   r   r?   z  r@   c                 s   rT   r   r:   rL   r
   r
   r   r?   {  r@   c                 s   s    | ]	}|d ur|V  qd S r   r
   rU   r
   r
   r   r?   |  rC   r   c	           
         s8   G ‡‡‡‡‡‡‡	fdd„dt jjƒ‰ ‡ ‡‡fdd„}	|	S )Nc                       s@   e Zd Ze‡‡‡‡fdd„ƒZeejjj‡ ‡‡fdd„ƒƒZ	dS )zOmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphedc                    s`   t ˆƒD ]}ˆ|  ¡ ||  ¡ krˆ|  || ¡ qˆ  ¡  tˆtƒs'J ‚tdd„ ˆD ƒƒS )Nc                 s   s    | ]}|  ¡ V  qd S r   ©ÚdetachrU   r
   r
   r   r?   ª  s   € zjmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forward.<locals>.<genexpr>)ÚrangeÚdata_ptrÚcopy_r   rA   rI   )ÚctxrY   rM   )Ú	fwd_graphÚlen_user_argsÚstatic_input_surfaceÚstatic_outputsr
   r   Úforward¢  s   €zWmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forwardc                    sr   t |ƒt ˆƒks
J ‚tˆ|ƒD ]\}}|d ur$| ¡ | ¡ kr$| |¡ qˆ  ¡  tˆtƒs0J ‚tdd„ ˆD ƒƒS )Nc                 s   s$    | ]}|d ur|  ¡ n|V  qd S r   r[   r<   r
   r
   r   r?   º  rZ   zkmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backward.<locals>.<genexpr>)rD   Úzipr^   r_   r   rA   rI   )r`   ZgradsÚgÚgrad)Ú	bwd_graphÚstatic_grad_inputsÚstatic_grad_outputsr
   r   Úbackward¬  s   
€ÿzXmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backwardN)
r#   r$   r%   Ústaticmethodre   r*   ÚautogradÚfunctionZonce_differentiablerl   r
   )ri   ra   rb   rj   rk   rc   rd   r
   r   ÚGraphed¡  s    	rp   c                     s(   t | ƒ\}}ˆ jt|ƒˆ Ž }t|ˆƒS r   )Ú_tree_flattenÚapplyrI   Ú_tree_unflatten)Ú	user_argsZflatten_user_argsrR   Úout)rp   Úmodule_paramsÚoutput_unflatten_specr
   r   Úfunctionalized¾  s   
zVmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.functionalized)r*   rn   ÚFunction)
ra   ri   rv   rb   rw   rc   rd   rk   rj   rx   r
   )
rp   ri   ra   rb   rv   rw   rj   rk   rc   rd   r   Úmake_graphed_autograd_function–  s   $z>make_graphed_callables.<locals>.make_graphed_autograd_functionc                    s   ‡ ‡‡‡fdd„}|S )Nc                     s   ˆ j ˆkr	ˆ| Ž S ˆ| Ž S r   )Útraining)rt   ©ÚfuncÚgraph_training_stateÚgraphedÚorig_fwdr
   r   Únew_fwdÚ  s   
zEmake_graphed_callables.<locals>.make_graphed_forward.<locals>.new_fwdr
   )r}   r~   r   r€   r   r
   r|   r   Úmake_graphed_forwardÙ  s   z4make_graphed_callables.<locals>.make_graphed_forward) r*   Zis_autocast_enabledZis_autocast_cache_enabledÚRuntimeErrorrA   rI   rf   rG   rH   rD   Z_backward_hooksZ_forward_hooksZ_forward_pre_hooksÚallÚbuffersrq   Úappendr]   r   r+   r1   r-   r,   rn   rh   r(   Úreversedr;   ÚlistÚ	enumerater{   re   )%Z	callablesZsample_argsZnum_warmup_itersZallow_unused_inputZjust_one_callablerK   rE   Zflatten_argrR   Zper_callable_len_user_argsZ"per_callable_static_input_surfacesZ
fwd_graphsZ
bwd_graphsZmempoolr}   rc   rX   Zgrad_inputsZper_callable_static_outputsZ"per_callable_output_unflatten_specra   Zflatten_outputsÚspecZ per_callable_static_grad_outputsZper_callable_static_grad_inputsrd   ri   rv   rk   rj   Zgrad_idxrB   rz   ÚretrM   r   r‚   r
   rN   r   Úmake_graphed_callablesÊ   sì   Eÿ
þÿÿþ
þ
ÿÿù	òÿ

ÿüÿûÿ
3÷rŒ   )r9   F)r2   r*   Ztorch.utils._pytreer   rq   r   rs   Ú_utilsr   ÚhasattrZ_CÚ__dict__Ztorch._Cr   r   r   r   r   r   r(   rŒ   r
   r
   r
   r   Ú<module>   s"    ÿ
OLÿ