o
    *i'2                     @   s  d dl mZmZmZmZmZ d dlZd dlmZ ddgZ	eee
 ejeeej  ef Zde_dedee
 fd	d
ZdedejfddZdedeeej  fddZdedefddZdedefddZdee deeej  fddZdeeej  de
dejdeeej  fddZdedefddZdee dejde
deejejejf fdd Zded!ee ddfd"d#ZG d$d dejjZdS )%    )CallableDictListOptionalTupleN)RNNT
HypothesisRNNTBeamSearchzHypothesis generated by RNN-T beam search decoder,
    represented as tuple of (tokens, prediction network output, prediction network state, score).
    hyporeturnc                 C      | d S Nr    r
   r   r   m/var/www/html/eduruby.in/lip-sync/lip-sync-env/lib/python3.10/site-packages/torchaudio/models/rnnt_decoder.py_get_hypo_tokens      r   c                 C   r   N   r   r   r   r   r   _get_hypo_predictor_out   r   r   c                 C   r   )N   r   r   r   r   r   _get_hypo_state   r   r   c                 C   r   )N   r   r   r   r   r   _get_hypo_score   r   r   c                 C   s   t | d S r   )strr   r   r   r   _get_hypo_key    s   r   hyposc              	      sn   g }t tt| d D ]( g }t tt| d   D ]|t fdd| D  q|| q|S )Nr   c                    s   g | ]
}t |   qS r   )r   .0r
   ijr   r   
<listcomp>)   s    z _batch_state.<locals>.<listcomp>)rangelenr   appendtorchcat)r   statesZbatched_state_componentsr   r   r   _batch_state$   s   "r)   r(   idxdevicec                    s"   t j|g|d  fdd| D S )Nr+   c                    s   g | ]} fd d|D qS )c                    s   g | ]}| d  qS )r   )Zindex_select)r   stateZ
idx_tensorr   r   r"   0   s    z+_slice_state.<locals>.<listcomp>.<listcomp>r   )r   Zstate_tupler.   r   r   r"   0   s    z _slice_state.<locals>.<listcomp>)r&   tensor)r(   r*   r+   r   r.   r   _slice_state.   s   r0   c                 C   s   t | tt| d  S r   )r   r$   r   r   r   r   r   _default_hypo_sort_key3   s   r1   next_token_probs
beam_widthc           	      C   sr   t dd | D d}||d d d df  }|d|\}}|j|jd dd}||jd  }|||fS )Nc                 S      g | ]}t |qS r   r   r   hr   r   r   r"   <       z+_compute_updated_scores.<locals>.<listcomp>r   trunc)Zrounding_mode)r&   r/   	unsqueezeZreshapetopkdivshape)	r   r2   r3   Zhypo_scoresZnonblank_scoresnonblank_nbest_scoresZnonblank_nbest_idxnonblank_nbest_hypo_idxnonblank_nbest_tokenr   r   r   _compute_updated_scores7   s   
rB   	hypo_listc                 C   s2   t |D ]\}}t| t|kr||=  d S qd S N)	enumerater   )r
   rC   r    elemr   r   r   _remove_hypoD   s   rG   c                       s  e Zd ZdZ			d.dedededeee	gef  d	ed
df fddZ
dejd
ee	 fddZdejdee	 dejd
ejfddZdee	 dee	 dejdeee	f d
ee	 f
ddZdee	 dee	 dejdededejd
ee	 fddZdee	 dee d ee dedejd
ee	 fd!d"Zdejd#eee	  ded
ee	 fd$d%Zd&ejd'ejded
ee	 fd(d)Zejj		d/d&ejd'ejded*eeeej   d+eee	  d
eee	 eeej  f fd,d-Z  ZS )0r	   a)  Beam search decoder for RNN-T model.

    See Also:
        * :class:`torchaudio.pipelines.RNNTBundle`: ASR pipeline with pretrained model.

    Args:
        model (RNNT): RNN-T model to use.
        blank (int): index of blank token in vocabulary.
        temperature (float, optional): temperature to apply to joint network output.
            Larger values yield more uniform samples. (Default: 1.0)
        hypo_sort_key (Callable[[Hypothesis], float] or None, optional): callable that computes a score
            for a given hypothesis to rank hypotheses by. If ``None``, defaults to callable that returns
            hypothesis score normalized by token sequence length. (Default: None)
        step_max_tokens (int, optional): maximum number of tokens to emit per input time step. (Default: 100)
          ?Nd   modelblanktemperaturehypo_sort_keystep_max_tokensr   c                    s<   t    || _|| _|| _|d u rt| _n|| _|| _d S rD   )super__init__rJ   rK   rL   r1   rM   rN   )selfrJ   rK   rL   rM   rN   	__class__r   r   rP   \   s   

zRNNTBeamSearch.__init__r+   c           	      C   sZ   | j }d }tjdg|d}| jtj|gg|d||\}}}|g|d  |df}|gS )Nr   r,   r   g        )rK   r&   r/   rJ   predictdetach)	rQ   r+   tokenr-   
one_tensorpred_out_Z
pred_stateZ	init_hypor   r   r   _init_b_hyposp   s   $
zRNNTBeamSearch._init_b_hyposenc_outr   c              	   C   s~   t jdg|d}t jdd |D dd}| j|||t jdgt| |d\}}}t jjj|| j	 dd}|d d ddf S )Nr   r,   c                 S   r4   r   )r   r6   r   r   r   r"      r8   z8RNNTBeamSearch._gen_next_token_probs.<locals>.<listcomp>r   )dimr   )
r&   r/   stackrJ   joinr$   nnZ
functionalZlog_softmaxrL   )rQ   r[   r   r+   rW   Zpredictor_outZ
joined_outrY   r   r   r   _gen_next_token_probs~   s   
z$RNNTBeamSearch._gen_next_token_probsb_hyposa_hyposr2   key_to_b_hypoc                    s   t t|D ]I}|| }t|||df  }t||v r4|t| }t|  ttt||}	nt|}	t	|t
|t||	f} | ||t|< qtdd  D  \}
} fdd|D S )Nr9   c                 S   r4   r   r5   r   r   r   r   r"      r8   z/RNNTBeamSearch._gen_b_hypos.<locals>.<listcomp>c                       g | ]} | qS r   r   r   r*   ra   r   r   r"      r8   )r#   r$   r   r   rG   floatr&   r/   Z	logaddexpr   r   r   r%   sort)rQ   ra   rb   r2   rc   r    h_aZappend_blank_scoreZh_bscorerY   
sorted_idxr   rf   r   _gen_b_hypos   s"   

zRNNTBeamSearch._gen_b_hypostr3   c                 C   s   t |||\}}}	t||k rtd }
nt||  }
g }g }g }t|D ]'}t|| }||
krMt|| }|||  |t|	|  || q&|r[| |||||}|S g }|S )Ninf)rB   r$   rg   r   r#   intr%   _gen_new_hypos)rQ   rb   ra   r2   rm   r3   r+   r?   r@   rA   Zb_nbest_score
base_hypos
new_tokensZ
new_scoresr    rj   Z
a_hypo_idx	new_hyposr   r   r   _gen_a_hypos   s0   

zRNNTBeamSearch._gen_a_hyposrq   tokensscoresc              	   C   s   t jdd |D |d}t|}| j|t jdgt| |d|\}}	}
g }t|D ] \}}t||| g }|||| 	 t
|
|||| f q+|S )Nc                 S   s   g | ]}|gqS r   r   )r   rV   r   r   r   r"      s    z1RNNTBeamSearch._gen_new_hypos.<locals>.<listcomp>r,   r   )r&   r/   r)   rJ   rT   r$   rE   r   r%   rU   r0   )rQ   rq   ru   rv   rm   r+   Z
tgt_tokensr(   rX   rY   Zpred_statesrs   r    ri   rr   r   r   r   rp      s   
(zRNNTBeamSearch._gen_new_hyposr
   c              	      s   |j d }|j}g }|d u r|n| t|D ]b} }tjtt g  i }d}	|ra	|d d ||d f ||}
|

 }
 ||
| |	jkrOn| |
|||}|r_|	d7 }	|s,tfdd D |\}} fdd|D  q S )Nr   r   c                    s   g | ]}  |qS r   )rM   )r   Zhyp)rQ   r   r   r"     s    z*RNNTBeamSearch._search.<locals>.<listcomp>c                    rd   r   r   re   rf   r   r   r"     r8   )r>   r+   rZ   r#   r&   jitZannotater   r   r`   cpurl   rN   rt   r/   r<   )rQ   r[   r
   r3   Zn_time_stepsr+   rb   rm   rc   Zsymbols_current_tr2   rY   rk   r   )ra   rQ   r   _search   s:   
"
"zRNNTBeamSearch._searchinputlengthc                 C   s   |  dkr|  dkr|jd dkstd|  dkr"|d}|jdkr0|jdkr0td|  dkr;|d}| j||\}}| |d	|S )
a  Performs beam search for the given input sequence.

        T: number of frames;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D).
            length (torch.Tensor): number of valid frames in input
                sequence, with shape () or (1,).
            beam_width (int): beam size to use during search.

        Returns:
            List[Hypothesis]: top-``beam_width`` hypotheses found by beam search.
        r   r   r   r   *input must be of shape (T, D) or (1, T, D)r   r   "length must be of shape () or (1,)N)r\   r>   
ValueErrorr;   rJ   Z
transcribery   )rQ   rz   r{   r3   r[   rY   r   r   r   forward  s   &

zRNNTBeamSearch.forwardr-   
hypothesisc                 C   s   |  dkr|  dkr|jd dkstd|  dkr"|d}|jdkr0|jdkr0td|  dkr;|d}| j|||\}}}| ||||fS )	a  Performs beam search for the given input sequence in streaming mode.

        T: number of frames;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D).
            length (torch.Tensor): number of valid frames in input
                sequence, with shape () or (1,).
            beam_width (int): beam size to use during search.
            state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing transcription network internal state generated in preceding
                invocation. (Default: ``None``)
            hypothesis (List[Hypothesis] or None): hypotheses from preceding invocation to seed
                search with. (Default: ``None``)

        Returns:
            (List[Hypothesis], List[List[torch.Tensor]]):
                List[Hypothesis]
                    top-``beam_width`` hypotheses found by beam search.
                List[List[torch.Tensor]]
                    list of lists of tensors representing transcription network
                    internal state generated in current invocation.
        r   r   r   r   r|   r   r}   r~   )r\   r>   r   r;   rJ   Ztranscribe_streamingry   )rQ   rz   r{   r3   r-   r   r[   rY   r   r   r   infer'  s   &!

zRNNTBeamSearch.infer)rH   NrI   )NN)__name__
__module____qualname____doc__r   ro   rg   r   r   r   rP   r&   r+   r   rZ   Tensorr`   r   r   rl   rt   rp   ry   r   rw   Zexportr   r   __classcell__r   r   rR   r   r	   K   s    



&


")
)typingr   r   r   r   r   r&   Ztorchaudio.modelsr   __all__ro   r   rg   r   r   r   r   r   r   r   r   r)   r+   r0   r1   rB   rG   r_   Moduler	   r   r   r   r   <module>   s2      0

