o
    *i5                     @   s`  d dl Z d dlZd dlmZmZ d dlmZ d dlmZ d dl	m
Z
mZmZ d dlZd dlZd dlmZ d dlmZmZmZ g Zdeeejj Zed	d
e Zdd ZG dd dejjZ G dd dejjZ!G dd deZ"G dd deZ#G dd dejje"Z$G dd de#Z%eG dd dZ&e&deeddddd d!d"d#d$d%d&d d'Z'd(e'_(dS ))    N)ABCabstractmethod)	dataclass)partial)CallableListTuple)module_utils)emformer_rnnt_baseRNNTRNNTBeamSearch(   
   g?c                 C   s@   t | | tjk | | tjk< | | tjk tj | | tjk< | S N)torchlogmathex r   q/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/torchaudio/pipelines/rnnt_pipeline.py_piecewise_linear_log   s   r   c                       $   e Zd Z fddZdd Z  ZS )_FunctionalModulec                       t    || _d S r   )super__init__
functional)selfr   	__class__r   r   r         

z_FunctionalModule.__init__c                 C   s
   |  |S r   )r   r   inputr   r   r   forward   s   
z_FunctionalModule.forward__name__
__module____qualname__r   r%   __classcell__r   r   r    r   r      s    r   c                       r   )_GlobalStatsNormalizationc                    sp   t    t|}t| }W d    n1 sw   Y  | dt|d  | dt|d  d S )Nmean	invstddev)	r   r   openjsonloadsreadZregister_bufferr   tensor)r   Zglobal_stats_pathfZblobr    r   r   r   $   s   

z"_GlobalStatsNormalization.__init__c                 C   s   || j  | j S r   )r,   r-   r#   r   r   r   r%   -   s   z!_GlobalStatsNormalization.forwardr&   r   r   r    r   r+   #   s    	r+   c                   @   s0   e Zd Zedejdeejejf fddZdS )_FeatureExtractorr$   returnc                 C      dS )X  Generates features and length output from the given input tensor.

        Args:
            input (torch.Tensor): input tensor.

        Returns:
            (torch.Tensor, torch.Tensor):
            torch.Tensor:
                Features, with shape `(length, *)`.
            torch.Tensor:
                Length, with shape `(1,)`.
        Nr   r#   r   r   r   __call__2       z_FeatureExtractor.__call__N)r'   r(   r)   r   r   Tensorr   r8   r   r   r   r   r4   1   s    &r4   c                   @   s&   e Zd Zedee defddZdS )_TokenProcessortokensr5   c                 K   r6   )zDecodes given list of tokens to text sequence.

        Args:
            tokens (List[int]): list of tokens to decode.

        Returns:
            str:
                Decoded text sequence.
        Nr   )r   r<   kwargsr   r   r   r8   C   r9   z_TokenProcessor.__call__N)r'   r(   r)   r   r   intstrr8   r   r   r   r   r;   B   s    r;   c                       sN   e Zd ZdZdejjddf fddZdejde	ejejf fdd	Z
  ZS )
_ModuleFeatureExtractorz``torch.nn.Module``-based feature extraction pipeline.

    Args:
        pipeline (torch.nn.Module): module that implements feature extraction logic.
    pipeliner5   Nc                    r   r   )r   r   rA   )r   rA   r    r   r   r   W   r"   z _ModuleFeatureExtractor.__init__r$   c                 C   s$   |  |}t|jd g}||fS )r7   r   )rA   r   r2   shape)r   r$   featureslengthr   r   r   r%   [   s   
z_ModuleFeatureExtractor.forward)r'   r(   r)   __doc__r   nnModuler   r:   r   r%   r*   r   r   r    r   r@   P   s    (r@   c                   @   s>   e Zd ZdZdeddfddZddee d	edefd
dZ	dS )_SentencePieceTokenProcessorztSentencePiece-model-based token processor.

    Args:
        sp_model_path (str): path to SentencePiece model.
    sp_model_pathr5   Nc                 C   sJ   t ds	tddd l}|j|d| _| j | j | j h| _	d S )Nsentencepiecez2SentencePiece is not available. Please install it.r   )Z
model_file)
r	   Zis_module_availableRuntimeErrorrJ   ZSentencePieceProcessorsp_modelZunk_idZeos_idZpad_idpost_process_remove_list)r   rI   Zspmr   r   r   r   t   s   

z%_SentencePieceTokenProcessor.__init__Tr<   lstripc                    sD    fdd|dd D }d  j|dd}|r | S |S )aX  Decodes given list of tokens to text sequence.

        Args:
            tokens (List[int]): list of tokens to decode.
            lstrip (bool, optional): if ``True``, returns text sequence with leading whitespace
                removed. (Default: ``True``).

        Returns:
            str:
                Decoded text sequence.
        c                    s   g | ]	}| j vr|qS r   )rM   ).0Ztoken_indexr   r   r   
<listcomp>   s    z9_SentencePieceTokenProcessor.__call__.<locals>.<listcomp>   N u   ▁ )joinrL   Zid_to_piecereplacerN   )r   r<   rN   Zfiltered_hypo_tokensZoutput_stringr   rP   r   r8      s   

z%_SentencePieceTokenProcessor.__call__)T)
r'   r(   r)   rE   r?   r   r   r>   boolr8   r   r   r   r   rH   m   s     rH   c                   @   sL  e Zd ZU dZG dd deZG dd deZee	d< e
g ef e	d< ee	d< ee	d	< ee	d
< ee	d< ee	d< ee	d< ee	d< ee	d< ee	d< ee	d< defddZedefddZedefddZedefddZedefddZedefddZedefdd Zdefd!d"Zdefd#d$Zdefd%d&Zdefd'd(Zd)S )*
RNNTBundleu  Dataclass that bundles components for performing automatic speech recognition (ASR, speech-to-text)
    inference with an RNN-T model.

    More specifically, the class provides methods that produce the featurization pipeline,
    decoder wrapping the specified RNN-T model, and output token post-processor that together
    constitute a complete end-to-end ASR inference pipeline that produces a text sequence
    given a raw waveform.

    It can support non-streaming (full-context) inference as well as streaming inference.

    Users should not directly instantiate objects of this class; rather, users should use the
    instances (representing pre-trained models) that exist within the module,
    e.g. :data:`torchaudio.pipelines.EMFORMER_RNNT_BASE_LIBRISPEECH`.

    Example
        >>> import torchaudio
        >>> from torchaudio.pipelines import EMFORMER_RNNT_BASE_LIBRISPEECH
        >>> import torch
        >>>
        >>> # Non-streaming inference.
        >>> # Build feature extractor, decoder with RNN-T model, and token processor.
        >>> feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_feature_extractor()
        100%|███████████████████████████████| 3.81k/3.81k [00:00<00:00, 4.22MB/s]
        >>> decoder = EMFORMER_RNNT_BASE_LIBRISPEECH.get_decoder()
        Downloading: "https://download.pytorch.org/torchaudio/models/emformer_rnnt_base_librispeech.pt"
        100%|███████████████████████████████| 293M/293M [00:07<00:00, 42.1MB/s]
        >>> token_processor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_token_processor()
        100%|███████████████████████████████| 295k/295k [00:00<00:00, 25.4MB/s]
        >>>
        >>> # Instantiate LibriSpeech dataset; retrieve waveform for first sample.
        >>> dataset = torchaudio.datasets.LIBRISPEECH("/home/librispeech", url="test-clean")
        >>> waveform = next(iter(dataset))[0].squeeze()
        >>>
        >>> with torch.no_grad():
        >>>     # Produce mel-scale spectrogram features.
        >>>     features, length = feature_extractor(waveform)
        >>>
        >>>     # Generate top-10 hypotheses.
        >>>     hypotheses = decoder(features, length, 10)
        >>>
        >>> # For top hypothesis, convert predicted tokens to text.
        >>> text = token_processor(hypotheses[0][0])
        >>> print(text)
        he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to [...]
        >>>
        >>>
        >>> # Streaming inference.
        >>> hop_length = EMFORMER_RNNT_BASE_LIBRISPEECH.hop_length
        >>> num_samples_segment = EMFORMER_RNNT_BASE_LIBRISPEECH.segment_length * hop_length
        >>> num_samples_segment_right_context = (
        >>>     num_samples_segment + EMFORMER_RNNT_BASE_LIBRISPEECH.right_context_length * hop_length
        >>> )
        >>>
        >>> # Build streaming inference feature extractor.
        >>> streaming_feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_streaming_feature_extractor()
        >>>
        >>> # Process same waveform as before, this time sequentially across overlapping segments
        >>> # to simulate streaming inference. Note the usage of ``streaming_feature_extractor`` and ``decoder.infer``.
        >>> state, hypothesis = None, None
        >>> for idx in range(0, len(waveform), num_samples_segment):
        >>>     segment = waveform[idx: idx + num_samples_segment_right_context]
        >>>     segment = torch.nn.functional.pad(segment, (0, num_samples_segment_right_context - len(segment)))
        >>>     with torch.no_grad():
        >>>         features, length = streaming_feature_extractor(segment)
        >>>         hypotheses, state = decoder.infer(features, length, 10, state=state, hypothesis=hypothesis)
        >>>     hypothesis = hypotheses[0]
        >>>     transcript = token_processor(hypothesis[0])
        >>>     if transcript:
        >>>         print(transcript, end=" ", flush=True)
        he hoped there would be stew for dinner turn ips and car rots and bru 'd oes and fat mut ton pieces to [...]
    c                   @      e Zd ZdZdS )zRNNTBundle.FeatureExtractorz:Interface of the feature extraction part of RNN-T pipelineNr'   r(   r)   rE   r   r   r   r   FeatureExtractor       r[   c                   @   rY   )zRNNTBundle.TokenProcessorz7Interface of the token processor part of RNN-T pipelineNrZ   r   r   r   r   TokenProcessor   r\   r]   
_rnnt_path_rnnt_factory_func_global_stats_path_sp_model_path_right_padding_blank_sample_rate_n_fft_n_mels_hop_length_segment_length_right_context_lengthr5   c                 C   s6   |   }tj| j}t|}|| |  |S r   )	r_   
torchaudioutilsdownload_assetr^   r   loadZload_state_dicteval)r   modelpathZ
state_dictr   r   r   
_get_model   s   

zRNNTBundle._get_modelc                 C      | j S )zSSample rate (in cycles per second) of input waveforms.

        :type: int
        )rd   rP   r   r   r   sample_rate      zRNNTBundle.sample_ratec                 C   rr   )z7Size of FFT window to use.

        :type: int
        )re   rP   r   r   r   n_fft  rt   zRNNTBundle.n_fftc                 C   rr   )z`Number of mel spectrogram features to extract from input waveforms.

        :type: int
        )rf   rP   r   r   r   n_mels  rt   zRNNTBundle.n_melsc                 C   rr   )zdNumber of samples between successive frames in input expected by model.

        :type: int
        )rg   rP   r   r   r   
hop_length  rt   zRNNTBundle.hop_lengthc                 C   rr   )zTNumber of frames in segment in input expected by model.

        :type: int
        )rh   rP   r   r   r   segment_length  rt   zRNNTBundle.segment_lengthc                 C   rr   )zcNumber of frames in right contextual block in input expected by model.

        :type: int
        )ri   rP   r   r   r   right_context_length%  rt   zRNNTBundle.right_context_lengthc                 C   s   |   }t|| jS )zOConstructs RNN-T decoder.

        Returns:
            RNNTBeamSearch
        )rq   r   rc   )r   ro   r   r   r   get_decoder-  s   zRNNTBundle.get_decoderc                    s^   t j j}ttjt jj	 j
 j j jdtdd tdd t|t fddS )zzConstructs feature extractor for non-streaming (full-context) ASR.

        Returns:
            FeatureExtractor
        rs   ru   rv   rw   c                 S      |  ddS NrR   r   Z	transposer   r   r   r   <lambda>B      z2RNNTBundle.get_feature_extractor.<locals>.<lambda>c                 S      t | t S r   r   _gainr   r   r   r   r   C  r   c                    s   t jj| ddd jfS )Nr   )r   rF   r   padrb   r   rP   r   r   r   E  s    rj   rk   rl   r`   r@   r   rF   Z
SequentialZ
transformsZMelSpectrogramrs   ru   rv   rw   r   r+   r   Z
local_pathr   rP   r   get_feature_extractor6  s   

z RNNTBundle.get_feature_extractorc              	   C   sP   t j| j}ttjt jj	| j
| j| j| jdtdd tdd t|S )zvConstructs feature extractor for streaming (simultaneous) ASR.

        Returns:
            FeatureExtractor
        r{   c                 S   r|   r}   r~   r   r   r   r   r   U  r   z<RNNTBundle.get_streaming_feature_extractor.<locals>.<lambda>c                 S   r   r   r   r   r   r   r   r   V  r   r   r   r   r   r   get_streaming_feature_extractorI  s   

z*RNNTBundle.get_streaming_feature_extractorc                 C   s   t j| j}t|S )zQConstructs token processor.

        Returns:
            TokenProcessor
        )rj   rk   rl   ra   rH   r   r   r   r   get_token_processor[  s   zRNNTBundle.get_token_processorN)r'   r(   r)   rE   r4   r[   r;   r]   r?   __annotations__r   r   r>   rq   propertyrs   ru   rv   rw   rx   ry   r   rz   r   r   r   r   r   r   r   rX      sB   
 H	rX   z(models/emformer_rnnt_base_librispeech.pti  )Znum_symbolsz2pipeline-assets/global_stats_rnnt_librispeech.jsonz.pipeline-assets/spm_bpe_4096_librispeech.model   i   i>  i  P         )r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   a  ASR pipeline based on Emformer-RNNT,
pretrained on *LibriSpeech* dataset :cite:`7178964`,
capable of performing both streaming and non-streaming inference.

The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base`
and utilizes weights trained on LibriSpeech using training script ``train.py``
`here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__ with default arguments.

Please refer to :py:class:`RNNTBundle` for usage instructions.
))r/   r   abcr   r   dataclassesr   	functoolsr   typingr   r   r   r   rj   Ztorchaudio._internalr	   Ztorchaudio.modelsr
   r   r   __all__log10ZiinfoZint16maxZ_decibelpowr   r   rF   rG   r   r+   r4   r;   r@   rH   rX   ZEMFORMER_RNNT_BASE_LIBRISPEECHrE   r   r   r   r   <module>   sL    	+ M

