o
    *iV<                     @   s   d dl Z d dlmZmZmZ d dlZd dlm  mZ	 d dlmZm
Z
 g dZG dd dejZG dd dejZG d	d
 d
ejZG dd dejZG dd dejZdS )    N)ListOptionalTuple)nnTensor)ResBlock	MelResNet	Stretch2dUpsampleNetworkWaveRNNc                       s>   e Zd ZdZddeddf fddZdedefd	d
Z  ZS )r   af  ResNet block based on *Efficient Neural Audio Synthesis* :cite:`kalchbrenner2018efficient`.

    Args:
        n_freq: the number of bins in a spectrogram. (Default: ``128``)

    Examples
        >>> resblock = ResBlock()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = resblock(input)  # shape: (10, 128, 512)
       n_freqreturnNc                    sR   t    ttj||dddt|tjddtj||dddt|| _d S )N   Fin_channelsout_channelskernel_sizebiasTZinplace)super__init__r   
SequentialConv1dBatchNorm1dReLUresblock_model)selfr   	__class__ h/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/torchaudio/models/wavernn.pyr      s   


zResBlock.__init__specgramc                 C   s   |  || S )zPass the input through the ResBlock layer.
        Args:
            specgram (Tensor): the input sequence to the ResBlock layer (n_batch, n_freq, n_time).

        Return:
            Tensor shape: (n_batch, n_freq, n_time)
        )r   r   r"   r    r    r!   forward(   s   	zResBlock.forward)r   	__name__
__module____qualname____doc__intr   r   r$   __classcell__r    r    r   r!   r      s    r   c                       sP   e Zd ZdZ	ddedededed	ed
df fddZded
efddZ  ZS )r   a  MelResNet layer uses a stack of ResBlocks on spectrogram.

    Args:
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)

    Examples
        >>> melresnet = MelResNet()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = melresnet(input)  # shape: (10, 128, 508)
    
   r      n_res_blockr   n_hiddenn_outputr   r   Nc                    sh   t     fddt|D }tjtj| |ddt tjddg|tj |ddR  | _d S )	Nc                    s   g | ]}t  qS r    )r   ).0_r/   r    r!   
<listcomp>I   s    z&MelResNet.__init__.<locals>.<listcomp>Fr   Tr   r   )r   r   r   )	r   r   ranger   r   r   r   r   melresnet_model)r   r.   r   r/   r0   r   Z	ResBlocksr   r3   r!   r   D   s   

zMelResNet.__init__r"   c                 C   s
   |  |S )zPass the input through the MelResNet layer.
        Args:
            specgram (Tensor): the input sequence to the MelResNet layer (n_batch, n_freq, n_time).

        Return:
            Tensor shape: (n_batch, n_output, n_time - kernel_size + 1)
        )r6   r#   r    r    r!   r$   S   s   
	zMelResNet.forwardr,   r   r   r   r-   r%   r    r    r   r!   r   4   s"    r   c                       s@   e Zd ZdZdededdf fddZdedefd	d
Z  ZS )r	   a  Upscale the frequency and time dimensions of a spectrogram.

    Args:
        time_scale: the scale factor in time dimension
        freq_scale: the scale factor in frequency dimension

    Examples
        >>> stretch2d = Stretch2d(time_scale=10, freq_scale=5)

        >>> input = torch.rand(10, 100, 512)  # a random spectrogram
        >>> output = stretch2d(input)  # shape: (10, 500, 5120)
    
time_scale
freq_scaler   Nc                    s   t    || _|| _d S N)r   r   r9   r8   )r   r8   r9   r   r    r!   r   m   s   

zStretch2d.__init__r"   c                 C   s   | | jd | jdS )zPass the input through the Stretch2d layer.

        Args:
            specgram (Tensor): the input sequence to the Stretch2d layer (..., n_freq, n_time).

        Return:
            Tensor shape: (..., n_freq * freq_scale, n_time * time_scale)
        )Zrepeat_interleaver9   r8   r#   r    r    r!   r$   s   s   
zStretch2d.forwardr%   r    r    r   r!   r	   _   s    r	   c                       sh   e Zd ZdZ					ddee dededed	ed
eddf fddZdedeeef fddZ	  Z
S )r
   a  Upscale the dimensions of a spectrogram.

    Args:
        upsample_scales: the list of upsample scales.
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)

    Examples
        >>> upsamplenetwork = UpsampleNetwork(upsample_scales=[4, 4, 16])
        >>> input = torch.rand(10, 128, 10)  # a random spectrogram
        >>> output = upsamplenetwork(input)  # shape: (10, 128, 1536), (10, 128, 1536)
    r,   r   r-   upsample_scalesr.   r   r/   r0   r   r   Nc                    s   t    d}|D ]}||9 }q	|| _|d d | | _t|||||| _t|d| _g }	|D ]2}
t|
d}tj	ddd|
d d fd|
fdd}t
jj|jd|
d d   |	| |	| q/tj|	 | _d S )Nr      r   F)r   r   r   paddingr         ?)r   r   total_scaleindentr   resnetr	   resnet_stretchr   ZConv2dtorchinitZ	constant_weightappendr   upsample_layers)r   r=   r.   r   r/   r0   r   rA   upsample_scaleZ	up_layersscaleZstretchconvr   r    r!   r      s$   
	


zUpsampleNetwork.__init__r"   c                 C   sf   |  |d}| |}|d}|d}| |}|ddddd| j| j f }||fS )a  Pass the input through the UpsampleNetwork layer.

        Args:
            specgram (Tensor): the input sequence to the UpsampleNetwork layer (n_batch, n_freq, n_time)

        Return:
            Tensor shape: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale),
                          (n_batch, n_output, (n_time - kernel_size + 1) * total_scale)
        where total_scale is the product of all elements in upsample_scales.
        r   N)rC   	unsqueezerD   squeezerI   rB   )r   r"   Zresnet_outputZupsampling_outputr    r    r!   r$      s   



&zUpsampleNetwork.forwardr7   )r&   r'   r(   r)   r   r*   r   r   r   r$   r+   r    r    r   r!   r
      s.    "r
   c                       s   e Zd ZdZ							ddee deded	ed
ededededededdf fddZdededefddZe	j
jddedee deeee f fddZ  ZS )r   aW  WaveRNN model from *Efficient Neural Audio Synthesis* :cite:`wavernn`
    based on the implementation from `fatchord/WaveRNN <https://github.com/fatchord/WaveRNN>`_.

    The original implementation was introduced in *Efficient Neural Audio Synthesis*
    :cite:`kalchbrenner2018efficient`. The input channels of waveform and spectrogram have to be 1.
    The product of `upsample_scales` must equal `hop_length`.

    See Also:
        * `Training example <https://github.com/pytorch/audio/tree/release/0.12/examples/pipeline_wavernn>`__
        * :class:`torchaudio.pipelines.Tacotron2TTSBundle`: TTS pipeline with pretrained model.

    Args:
        upsample_scales: the list of upsample scales.
        n_classes: the number of output classes.
        hop_length: the number of samples between the starts of consecutive frames.
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_rnn: the dimension of RNN layer. (Default: ``512``)
        n_fc: the dimension of fully connected layer. (Default: ``512``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)

    Example
        >>> wavernn = WaveRNN(upsample_scales=[5,5,8], n_classes=512, hop_length=200)
        >>> waveform, sample_rate = torchaudio.load(file)
        >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
        >>> specgram = MelSpectrogram(sample_rate)(waveform)  # shape: (n_batch, n_channel, n_freq, n_time)
        >>> output = wavernn(waveform, specgram)
        >>> # output shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, n_classes)
    r,      r-   r   r=   	n_classes
hop_lengthr.   n_rnnn_fcr   r   r/   r0   r   Nc                    s:  t    || _|d r|d n|d | _|| _|
d | _|| _|| _tt	
| j| _d}|D ]}||9 }q0|| jkrFtd| d| t||||	|
|| _t|| j d || _tj||dd| _tj|| j |dd| _tjdd| _tjdd| _t|| j || _t|| j || _t|| j| _d S )	Nr>   r      z/Expected: total_scale == hop_length, but found z != T)Zbatch_firstr   )r   r   r   _padrR   n_auxrQ   rP   r*   mathlog2n_bits
ValueErrorr
   upsampler   ZLinearfcZGRUrnn1rnn2r   relu1relu2fc1fc2fc3)r   r=   rP   rQ   r.   rR   rS   r   r   r/   r0   rA   rJ   r   r    r!   r      s,   



zWaveRNN.__init__waveformr"   c                    s  | ddkrtd| ddkrtd|d|d}}| d}tjd| j|j|jd}tjd| j|j|jd} |\}}|	dd}|	dd} fddt
d	D }|d
d
d
d
|d |d f }|d
d
d
d
|d |d f }	|d
d
d
d
|d |d f }
|d
d
d
d
|d |d f }tj|d||gdd} |}|} ||\}}|| }|}tj||	gdd} ||\}}|| }tj||
gdd} |} |}tj||gdd} |} |} |}|dS )a  Pass the input through the WaveRNN model.

        Args:
            waveform: the input waveform to the WaveRNN layer (n_batch, 1, (n_time - kernel_size + 1) * hop_length)
            specgram: the input spectrogram to the WaveRNN layer (n_batch, 1, n_freq, n_time)

        Return:
            Tensor: shape (n_batch, 1, (n_time - kernel_size + 1) * hop_length, n_classes)
        r   z*Require the input channel of waveform is 1z*Require the input channel of specgram is 1r   )dtypedevicer>   c                    s   g | ]} j | qS r    rV   r1   ir   r    r!   r4   .  s    z#WaveRNN.forward.<locals>.<listcomp>r-   N   rT   r<   dim)sizerZ   rN   rE   zerosrR   re   rf   r[   Z	transposer5   catrM   r\   r]   r^   ra   r_   rb   r`   rc   )r   rd   r"   Z
batch_sizeh1h2auxZaux_idxZa1Za2a3Za4xresr2   r    rj   r!   r$     sB   
""""






zWaveRNN.forwardlengthsc                    s  |j }|j}tjj|jjf}|\} |dur#|jj }g }|	 \}}}tj
d|jf||d}	tj
d|jf||d}
tj
|df||d} fddtdD }t|D ]|ddddf }fdd|D \}}}}tj|||gdd}|}|d|	\}}	||	d	  }tj||gdd}|d|
\}}
||
d	  }tj||gdd}t|}tj||gdd}t|}|}tj|dd}t|d }d
| d
j d  d }|| q^t|dd
d	|fS )a  Inference method of WaveRNN.

        This function currently only supports multinomial sampling, which assumes the
        network is trained on cross entropy loss.

        Args:
            specgram (Tensor):
                Batch of spectrograms. Shape: `(n_batch, n_freq, n_time)`.
            lengths (Tensor or None, optional):
                Indicates the valid length of each audio in the batch.
                Shape: `(batch, )`.
                When the ``specgram`` contains spectrograms with different durations,
                by providing ``lengths`` argument, the model will compute
                the corresponding valid output lengths.
                If ``None``, it is assumed that all the audio in ``waveforms``
                have valid length. Default: ``None``.

        Returns:
            (Tensor, Optional[Tensor]):
            Tensor
                The inferred waveform of size `(n_batch, 1, n_time)`.
                1 stands for a single channel.
            Tensor or None
                If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
                is returned.
                It indicates the valid length in time axis of the output Tensor.
        Nr   )rf   re   c                    s6   g | ]} d d j | j |d  d d f qS )Nr   rg   rh   )rs   r   r    r!   r4   x  s   6 z!WaveRNN.infer.<locals>.<listcomp>rT   c                    s"   g | ]}|d d d d  f qS r:   r    )r1   a)ri   r    r!   r4   ~  s   " rl   r   r>   r@   )rf   re   rE   r   
functionalpadrU   r[   rA   rn   ro   rR   r5   rp   r\   r]   rM   r^   FZrelura   rb   rc   ZsoftmaxZmultinomialfloatrY   rH   stackZpermute)r   r"   rw   rf   re   outputZb_sizer2   Zseq_lenrq   rr   ru   Z	aux_splitZm_tZa1_tZa2_tZa3_tZa4_tinpZlogitsZ	posteriorr    )rs   ri   r   r!   inferK  s@   

zWaveRNN.infer)r,   rO   rO   r-   r   r   r   r:   )r&   r'   r(   r)   r   r*   r   r   r$   rE   ZjitZexportr   r   r   r+   r    r    r   r!   r      sF    %	
*92r   )rW   typingr   r   r   rE   Ztorch.nn.functionalr   ry   r{   r   __all__Moduler   r   r	   r
   r   r    r    r    r!   <module>   s    	#+!G