o
    *i0                     @   sH  U d dl Z d dlmZmZmZ d dlZd dlmZ d dlm  m	Z
 dedefddZdedfZeeef ed	< G d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdedededejjfddZ	d&dededededededed ed!ee defd"d#Zdefd$d%ZdS )'    N)ListOptionalTuplexreturnc                 C   s   dddt d|  d    S )a  The metric defined by ITU-T P.862 is often called 'PESQ score', which is defined
    for narrow-band signals and has a value range of [-0.5, 4.5] exactly. Here, we use the metric
    defined by ITU-T P.862.2, commonly known as 'wide-band PESQ' and will be referred to as "PESQ score".

    Args:
        x (float): Narrow-band PESQ score.

    Returns:
        (float): Wide-band PESQ score.
    g+?g@   g;pΈgׁsF@)mathexp)r    r
   p/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/torchaudio/models/squim/objective.pytransform_wb_pesq_range	   s   r         ?g      @	PESQRangec                       sF   e Zd Zd
deeef ddf fddZdejdejfdd	Z  Z	S )RangeSigmoid        r   	val_ranger   Nc                    s<   t t|   t|trt|dksJ || _t | _	d S )N   )
superr   __init__
isinstancetuplelenr   nnZSigmoidsigmoid)selfr   	__class__r
   r   r       s   zRangeSigmoid.__init__r   c                 C   s,   |  || jd | jd   | jd  }|S )Nr   r   )r   r   r   r   outr
   r
   r   forward&   s   (zRangeSigmoid.forward)r   )
__name__
__module____qualname__r   floatr   torchTensorr    __classcell__r
   r
   r   r   r      s     r   c                       sF   e Zd ZdZddededdf fdd	Zd
ejdejfddZ  Z	S )EncoderzEncoder module that transform 1D waveform to 2D representations.

    Args:
        feat_dim (int, optional): The feature dimension after Encoder module. (Default: 512)
        win_len (int, optional): kernel size in the Conv1D layer. (Default: 32)
           feat_dimwin_lenr   Nc                    s,   t t|   tjd|||d dd| _d S )Nr   r   F)ZstrideZbias)r   r(   r   r   ZConv1dconv1d)r   r+   r,   r   r
   r   r   3   s   zEncoder.__init__r   c                 C   s    |j dd}t| |}|S )a  Apply waveforms to convolutional layer and ReLU layer.

        Args:
            x (torch.Tensor): Input waveforms. Tensor with dimensions `(batch, time)`.

        Returns:
            (torch,Tensor): Feature Tensor with dimensions `(batch, channel, frame)`.
        r   dim)Z	unsqueezeFZrelur-   r   r
   r
   r   r    8   s   	zEncoder.forward)r)   r*   )
r!   r"   r#   __doc__intr   r%   r&   r    r'   r
   r
   r   r   r(   +   s    r(   c                       sJ   e Zd Zddededededdf
 fdd	Zd
ejdejfddZ	  Z
S )	SingleRNNr   rnn_type
input_sizehidden_sizedropoutr   Nc                    sR   t t|   || _|| _|| _tt|||d|ddd| _t	|d || _
d S )Nr   T)r7   batch_firstbidirectionalr   )r   r3   r   r4   r5   r6   getattrr   rnnLinearproj)r   r4   r5   r6   r7   r   r
   r   r   G   s   	zSingleRNN.__init__r   c                 C   s   |  |\}}| |}|S N)r;   r=   )r   r   r   _r
   r
   r   r    Y   s   
zSingleRNN.forward)r   )r!   r"   r#   strr2   r$   r   r%   r&   r    r'   r
   r
   r   r   r3   F   s    $r3   c                       s   e Zd ZdZ							dd	ed
ededededededdf fddZdejde	ejef fddZ
dejde	ejef fddZdejdedejfddZdejdejfddZ  ZS )DPRNNa  *Dual-path recurrent neural networks (DPRNN)* :cite:`luo2020dual`.

    Args:
        feat_dim (int, optional): The feature dimension after Encoder module. (Default: 64)
        hidden_dim (int, optional): Hidden dimension in the RNN layer of DPRNN. (Default: 128)
        num_blocks (int, optional): Number of DPRNN layers. (Default: 6)
        rnn_type (str, optional): Type of RNN in DPRNN. Valid options are ["RNN", "LSTM", "GRU"]. (Default: "LSTM")
        d_model (int, optional): The number of expected features in the input. (Default: 256)
        chunk_size (int, optional): Chunk size of input for DPRNN. (Default: 100)
        chunk_stride (int, optional): Stride of chunk input for DPRNN. (Default: 50)
    @         LSTM   d   2   r+   
hidden_dim
num_blocksr4   d_model
chunk_sizechunk_strider   Nc           	         s   t t|   || _tg | _tg | _tg | _tg | _	t
|D ].}| jt||| | jt||| | jtjd|dd | j	tjd|dd q&tt||dt | _|| _|| _d S )Nr   g:0yE>)Zeps)r   rA   r   rJ   r   
ModuleListrow_rnncol_rnnrow_normcol_normrangeappendr3   Z	GroupNorm
SequentialZConv2dPReLUconvrL   rM   )	r   r+   rI   rJ   r4   rK   rL   rM   r?   r   r
   r   r   m   s"   

zDPRNN.__init__r   c                 C   sF   |j d }| j| j|| j  | j  }t|| j|| j g}||fS )N)shaperL   rM   r0   pad)r   r   seq_lenrestr   r
   r
   r   	pad_chunk   s   
zDPRNN.pad_chunkc           	      C   s   |  |\}}|j\}}}|d d d d d | j f  ||d| j}|d d d d | jd f  ||d| j}tj||gdd}|||d| jdd }||fS )NrX      r.   r   )	r]   rY   rM   
contiguousviewrL   r%   cat	transpose)	r   r   r   r\   
batch_sizer+   r[   Z	segments1Z	segments2r
   r
   r   chunking   s   0.zDPRNN.chunkingr\   c           	      C   s   |j \}}}}|dd ||d| jd }|d d d d d d d | jf  ||dd d d d | jd f }|d d d d d d | jd f  ||dd d d d d | j f }|| }|dkrw|d d d d d | f }| }|S )Nr   r^   rX   r   )rY   rb   r_   r`   rL   rM   )	r   r   r\   rc   r/   r?   r   Zout1Zout2r
   r
   r   merging   s   "HJzDPRNN.mergingc                 C   s*  |  |\}}|j\}}}}|}t| j| j| j| jD ]d\}}	}
}|dddd 	|| |d }||}|	|||ddddd }|	|}|| }|dddd 	|| |d }|
|}|	|||ddddd }||}|| }q| 
|}| ||}|dd }|S )Nr   r^   r   r   rX   )rd   rY   ziprO   rQ   rP   rR   Zpermuter_   r`   rW   re   rb   )r   r   r\   rc   r?   Zdim1Zdim2r   rO   rQ   rP   rR   Zrow_inZrow_outZcol_inZcol_outr
   r
   r   r       s$   "& & 

zDPRNN.forward)rB   rC   rD   rE   rF   rG   rH   )r!   r"   r#   r1   r2   r@   r   r%   r&   r   r]   rd   re   r    r'   r
   r
   r   r   rA   `   s<    		rA   c                       s>   e Zd Zd
deddf fddZdejdejfdd	Z  ZS )AutoPoolr   pool_dimr   Nc                    s>   t t|   || _tj|d| _| dtt	
d d S )Nr.   alphar   )r   rg   r   rh   r   ZSoftmaxsoftmaxZregister_parameter	Parameterr%   Zones)r   rh   r   r
   r   r      s   zAutoPool.__init__r   c                 C   s0   |  t|| j}tjt||| jd}|S )Nr.   )rj   r%   mulri   sumrh   )r   r   weightr   r
   r
   r   r       s   zAutoPool.forward)r   )	r!   r"   r#   r2   r   r%   r&   r    r'   r
   r
   r   r   rg      s    rg   c                       sN   e Zd ZdZdejdejdejf fddZdej	de
ej	 fd	d
Z  ZS )SquimObjectivea  Speech Quality and Intelligibility Measures (SQUIM) model that predicts **objective** metric scores
    for speech enhancement (e.g., STOI, PESQ, and SI-SDR).

    Args:
        encoder (torch.nn.Module): Encoder module to transform 1D waveform to 2D feature representation.
        dprnn (torch.nn.Module): DPRNN module to model sequential feature.
        branches (torch.nn.ModuleList): Transformer branches in which each branch estimate one objective metirc score.
    encoderdprnnbranchesc                    s$   t t|   || _|| _|| _d S r>   )r   ro   r   rp   rq   rr   )r   rp   rq   rr   r   r
   r   r      s   
zSquimObjective.__init__r   r   c                 C   sz   |j dkrtd|j  d|tj|d dddd d  }| |}| |}g }| jD ]}|||jdd	 q-|S )
z
        Args:
            x (torch.Tensor): Input waveforms. Tensor with dimensions `(batch, time)`.

        Returns:
            List(torch.Tensor): List of score Tenosrs. Each Tensor is with dimension `(batch,)`.
        r   z/The input must be a 2D Tensor. Found dimension .r   T)r/   Zkeepdimg      ?   r.   )	ndim
ValueErrorr%   meanrp   rq   rr   rT   Zsqueeze)r   r   r   Zscoresbranchr
   r
   r   r       s   
 


zSquimObjective.forward)r!   r"   r#   r1   r   ModulerN   r   r%   r&   r   r    r'   r
   r
   r   r   ro      s    	"ro   rK   nheadmetricc                 C   s   t j| || d ddd}t }|dkr't t | | t  t | dt }n+|dkrAt t | | t  t | dttd}nt t | | t  t | d}t |||S )	al  Create branch module after DPRNN model for predicting metric score.

    Args:
        d_model (int): The number of expected features in the input.
        nhead (int): Number of heads in the multi-head attention model.
        metric (str): The metric name to predict.

    Returns:
        (nn.Module): Returned module to predict corresponding metric score.
       r   T)r7   r8   stoir   pesq)r   )r   ZTransformerEncoderLayerrg   rU   r<   rV   r   r   )rK   rz   r{   Zlayer1Zlayer2Zlayer3r
   r
   r   _create_branch   s$   



"r   r+   r,   rI   rJ   r4   rL   rM   c	                 C   sb   |du r|d }t | |}	t| ||||||}
tt||dt||dt||dg}t|	|
|S )a  Build a custome :class:`torchaudio.prototype.models.SquimObjective` model.

    Args:
        feat_dim (int, optional): The feature dimension after Encoder module.
        win_len (int): Kernel size in the Encoder module.
        d_model (int): The number of expected features in the input.
        nhead (int): Number of heads in the multi-head attention model.
        hidden_dim (int): Hidden dimension in the RNN layer of DPRNN.
        num_blocks (int): Number of DPRNN layers.
        rnn_type (str): Type of RNN in DPRNN. Valid options are ["RNN", "LSTM", "GRU"].
        chunk_size (int): Chunk size of input for DPRNN.
        chunk_stride (int or None, optional): Stride of chunk input for DPRNN.
    Nr   r}   r~   Zsisdr)r(   rA   r   rN   r   ro   )r+   r,   rK   rz   rI   rJ   r4   rL   rM   rp   rq   rr   r
   r
   r   squim_objective_model  s   



r   c                
   C   s   t dddddddddS )zWBuild :class:`torchaudio.prototype.models.SquimObjective` model with default arguments.rF   rB   r|   r   rE   G   )r+   r,   rK   rz   rI   rJ   r4   rL   )r   r
   r
   r
   r   squim_objective_base;  s   r   r>   )r   typingr   r   r   r%   Ztorch.nnr   Ztorch.nn.functionalZ
functionalr0   r$   r   r   __annotations__ry   r   r(   r3   rA   rg   ro   r2   r@   modulesr   r   r   r
   r
   r
   r   <module>   sN   
 `()	

&