o
    *iT'                     @   s   d dl mZmZ d dlZdgZdejdejfddZG dd	 d	ejjZ	G d
d dejjZ
G dd dejjZG dd dejjZdS )    )OptionalTupleN	Conformerlengthsreturnc                 C   sF   | j d }tt|  }tj|| j| jd||| 	dk}|S )Nr   )devicedtype   )
shapeinttorchmaxitemZaranger   r   expandZ	unsqueeze)r   Z
batch_size
max_lengthZpadding_mask r   j/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/torchaudio/models/conformer.py_lengths_to_padding_mask	   s   
r   c                       s\   e Zd ZdZ			ddededededed	ed
df fddZdej	d
ej	fddZ
  ZS )_ConvolutionModulea  Conformer convolution module.

    Args:
        input_dim (int): input dimension.
        num_channels (int): number of depthwise convolution layer input channels.
        depthwise_kernel_size (int): kernel size of depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        bias (bool, optional): indicates whether to add bias term to each convolution layer. (Default: ``False``)
        use_group_norm (bool, optional): use GroupNorm rather than BatchNorm. (Default: ``False``)
            F	input_dimnum_channelsdepthwise_kernel_sizedropoutbiasuse_group_normr   Nc                    s   t    |d d dkrtdtj|| _tjtjj|d| ddd|dtjj	ddtjj|||d|d d ||d|rHtjj
d|dntj|tj tjj||ddd|d	tj|| _d S )
Nr	      r   z<depthwise_kernel_size must be odd to achieve 'SAME' padding.)stridepaddingr   )dim)r   r   groupsr   )Z
num_groupsr   )Zkernel_sizer   r   r   )super__init__
ValueErrorr   nn	LayerNorm
layer_norm
SequentialZConv1dZGLUZ	GroupNormZBatchNorm1dSiLUDropout
sequential)selfr   r   r   r   r   r   	__class__r   r   r"      sJ   
	




z_ConvolutionModule.__init__inputc                 C   s,   |  |}|dd}| |}|ddS )z
        Args:
            input (torch.Tensor): with shape `(B, T, D)`.

        Returns:
            torch.Tensor: output, with shape `(B, T, D)`.
        r	   r   )r&   	transposer*   )r+   r.   xr   r   r   forwardM   s   

z_ConvolutionModule.forwardr   FF)__name__
__module____qualname____doc__r   floatboolr"   r   Tensorr1   __classcell__r   r   r,   r   r      s*    /r   c                	       sJ   e Zd ZdZddedededdf fdd	Zd
ejdejfddZ	  Z
S )_FeedForwardModulezPositionwise feed forward layer.

    Args:
        input_dim (int): input dimension.
        hidden_dim (int): hidden dimension.
        dropout (float, optional): dropout probability. (Default: 0.0)
    r   r   
hidden_dimr   r   Nc                    s`   t    tjtj|tjj||ddtj tj|tjj||ddtj|| _	d S )NT)r   )
r!   r"   r   r$   r'   r%   ZLinearr(   r)   r*   )r+   r   r<   r   r,   r   r   r"   d   s   




z_FeedForwardModule.__init__r.   c                 C   s
   |  |S )z
        Args:
            input (torch.Tensor): with shape `(*, D)`.

        Returns:
            torch.Tensor: output, with shape `(*, D)`.
        )r*   )r+   r.   r   r   r   r1   o   s   
z_FeedForwardModule.forward)r   )r3   r4   r5   r6   r   r7   r"   r   r9   r1   r:   r   r   r,   r   r;   [   s     r;   c                       s   e Zd ZdZ			ddededededed	ed
eddf fddZdej	dej	fddZ
dej	deej	 dej	fddZ  ZS )ConformerLayera  Conformer layer that constitutes Conformer.

    Args:
        input_dim (int): input dimension.
        ffn_dim (int): hidden layer dimension of feedforward network.
        num_attention_heads (int): number of attention heads.
        depthwise_conv_kernel_size (int): kernel size of depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d``
            in the convolution module. (Default: ``False``)
        convolution_first (bool, optional): apply the convolution module ahead of
            the attention module. (Default: ``False``)
    r   Fr   ffn_dimnum_attention_headsdepthwise_conv_kernel_sizer   r   convolution_firstr   Nc                    s   t    t|||d| _tj|| _tjj|||d| _	tj
|| _t||||d|d| _t|||d| _tj|| _|| _d S )N)r   T)r   r   r   r   r   r   )r!   r"   r;   ffn1r   r$   r%   self_attn_layer_normZMultiheadAttention	self_attnr)   self_attn_dropoutr   conv_moduleffn2final_layer_normrA   )r+   r   r>   r?   r@   r   r   rA   r,   r   r   r"      s    

	
zConformerLayer.__init__r.   c                 C   s2   |}| dd}| |}| dd}|| }|S )Nr   r	   )r/   rF   )r+   r.   residualr   r   r   _apply_convolution   s   
z!ConformerLayer._apply_convolutionkey_padding_maskc                 C   s   |}|  |}|d | }| jr| |}|}| |}| j||||dd\}}| |}|| }| js9| |}|}| |}|d | }| |}|S )a
  
        Args:
            input (torch.Tensor): input, with shape `(T, B, D)`.
            key_padding_mask (torch.Tensor or None): key padding mask to use in self attention layer.

        Returns:
            torch.Tensor: output, with shape `(T, B, D)`.
        g      ?F)querykeyvaluerK   Zneed_weights)rB   rA   rJ   rC   rD   rE   rG   rH   )r+   r.   rK   rI   r0   _r   r   r   r1      s.   	







zConformerLayer.forwardr2   )r3   r4   r5   r6   r   r7   r8   r"   r   r9   rJ   r   r1   r:   r   r   r,   r   r=   z   s0    	(r=   c                       sp   e Zd ZdZ			ddededededed	ed
edef fddZdej	dej	de
ej	ej	f fddZ  ZS )r   a(  Conformer architecture introduced in
    *Conformer: Convolution-augmented Transformer for Speech Recognition*
    :cite:`gulati2020conformer`.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Conformer layer.
        ffn_dim (int): hidden layer dimension of feedforward networks.
        num_layers (int): number of Conformer layers to instantiate.
        depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d``
            in the convolution module. (Default: ``False``)
        convolution_first (bool, optional): apply the convolution module ahead of
            the attention module. (Default: ``False``)

    Examples:
        >>> conformer = Conformer(
        >>>     input_dim=80,
        >>>     num_heads=4,
        >>>     ffn_dim=128,
        >>>     num_layers=4,
        >>>     depthwise_conv_kernel_size=31,
        >>> )
        >>> lengths = torch.randint(1, 400, (10,))  # (batch,)
        >>> input = torch.rand(10, int(lengths.max()), input_dim)  # (batch, num_frames, input_dim)
        >>> output = conformer(input, lengths)
    r   Fr   	num_headsr>   
num_layersr@   r   r   rA   c	           	   	      s:   t    tj fddt|D | _d S )Nc                    s"   g | ]}t  d qS ))r   r   rA   )r=   ).0rO   rA   r@   r   r>   r   rP   r   r   r   
<listcomp>  s    
z&Conformer.__init__.<locals>.<listcomp>)r!   r"   r   r$   Z
ModuleListrangeconformer_layers)	r+   r   rP   r>   rQ   r@   r   r   rA   r,   rS   r   r"      s   


zConformer.__init__r.   r   r   c                 C   s:   t |}|dd}| jD ]}|||}q|dd|fS )aX  
        Args:
            input (torch.Tensor): with shape `(B, T, input_dim)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.

        Returns:
            (torch.Tensor, torch.Tensor)
                torch.Tensor
                    output frames, with shape `(B, T, input_dim)`
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output frames.
        r   r	   )r   r/   rV   )r+   r.   r   Zencoder_padding_maskr0   layerr   r   r   r1     s
   
zConformer.forwardr2   )r3   r4   r5   r6   r   r7   r8   r"   r   r9   r   r1   r:   r   r   r,   r   r      s.    $	.)typingr   r   r   __all__r9   r   r$   Moduler   r;   r=   r   r   r   r   r   <module>   s    	I]