import math
from typing import List, Optional, Tuple

import torch


__all__ = ["Emformer"]


def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor:
    batch_size = lengths.shape[0]
    max_length = int(torch.max(lengths).item())
    padding_mask = torch.arange(max_length, device=lengths.device, dtype=lengths.dtype).expand(
        batch_size, max_length
    ) >= lengths.unsqueeze(1)
    return padding_mask


def _gen_padding_mask(
    utterance: torch.Tensor,
    right_context: torch.Tensor,
    summary: torch.Tensor,
    lengths: torch.Tensor,
    mems: torch.Tensor,
    left_context_key: Optional[torch.Tensor] = None,
) -> Optional[torch.Tensor]:
    T = right_context.size(0) + utterance.size(0) + summary.size(0)
    B = right_context.size(1)
    if B == 1:
        padding_mask = None
    else:
        right_context_blocks_length = T - torch.max(lengths).int() - summary.size(0)
        left_context_blocks_length = left_context_key.size(0) if left_context_key is not None else 0
        klengths = lengths + mems.size(0) + right_context_blocks_length + left_context_blocks_length
        padding_mask = _lengths_to_padding_mask(lengths=klengths)
    return padding_mask

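
# Illustrative sketch (added commentary, not part of the upstream module): given
# per-batch valid lengths, `_lengths_to_padding_mask` marks each row's padded tail
# with ``True``. For lengths [2, 3, 1]:
#
#   >>> _lengths_to_padding_mask(torch.tensor([2, 3, 1]))
#   tensor([[False, False,  True],
#           [False, False, False],
#           [False,  True,  True]])
#
# `_gen_padding_mask` above applies the same idea to the concatenated attention key
# sequence [mems, right context, (left context,) utterance], short-circuiting to
# ``None`` for batch size 1, where no position is padding.
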
def _get_activation_module(activation: str) -> torch.nn.Module:
    if activation == "relu":
        return torch.nn.ReLU()
    elif activation == "gelu":
        return torch.nn.GELU()
    elif activation == "silu":
        return torch.nn.SiLU()
    else:
        raise ValueError(f"Unsupported activation {activation}")


def _get_weight_init_gains(weight_init_scale_strategy: Optional[str], num_layers: int) -> List[Optional[float]]:
    if weight_init_scale_strategy is None:
        return [None for _ in range(num_layers)]
    elif weight_init_scale_strategy == "depthwise":
        return [1.0 / math.sqrt(layer_idx + 1) for layer_idx in range(num_layers)]
    elif weight_init_scale_strategy == "constant":
        return [1.0 / math.sqrt(2) for layer_idx in range(num_layers)]
    else:
        raise ValueError(f"Unsupported weight_init_scale_strategy value {weight_init_scale_strategy}")


def _gen_attention_mask_block(
    col_widths: List[int], col_mask: List[bool], num_rows: int, device: torch.device
) -> torch.Tensor:
    if len(col_widths) != len(col_mask):
        raise ValueError("Length of col_widths must match that of col_mask")
    mask_block = [
        torch.ones(num_rows, col_width, device=device)
        if is_ones_col
        else torch.zeros(num_rows, col_width, device=device)
        for col_width, is_ones_col in zip(col_widths, col_mask)
    ]
    return torch.cat(mask_block, dim=1)

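
# Illustrative sketch (added commentary, not part of the upstream module):
# `_gen_attention_mask_block` concatenates all-ones and all-zeros blocks column-wise,
# one block per column group of the attention mask:
#
#   >>> _gen_attention_mask_block([1, 2], [True, False], 2, torch.device("cpu"))
#   tensor([[1., 0., 0.],
#           [1., 0., 0.]])
#
# `_get_weight_init_gains("depthwise", n)` yields gains 1/sqrt(1), 1/sqrt(2), ...,
# 1/sqrt(n), shrinking the initialization of deeper layers' attention projections.
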
edef fddZde	j
de	j
dee	j
e	j
f fddZde	j
de	j
dee	j
 de	j
fddZ		d%de	j
de	j
de	j
de	j
de	j
de	j
dee	j
 dee	j
 dee	j
e	j
e	j
e	j
f fddZde	j
de	j
de	j
de	j
de	j
de	j
dee	j
e	j
f fd d!Ze	jjde	j
de	j
de	j
de	j
de	j
de	j
de	j
dee	j
e	j
e	j
e	j
f fd"d#Z  ZS )&_EmformerAttentiona_  Emformer layer attention module.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Emformer layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        weight_init_gain (float or None, optional): scale factor to apply when initializing
            attention module parameters. (Default: ``None``)
        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
            NF    ח	input_dim	num_headsdropoutweight_init_gaintanh_on_memnegative_infc                    s   t    || dkrtd| d| d|| _|| _|| _|| _|| _| j| j d | _t	j
j|d| dd| _t	j
j||dd| _t	j
j||dd| _|rht	j
jj| jj|d	 t	j
jj| jj|d	 d S d S )
Nr   zinput_dim (z") is not a multiple of num_heads (z).g      r4   T)Zbias)Zgain)super__init__r%   rF   rG   rH   rJ   rK   scalingr   r$   Linearemb_to_key_valueemb_to_queryout_projinitZxavier_uniform_weight)selfrF   rG   rH   rI   rJ   rK   	__class__r   r   rM   Y   s    
	z_EmformerAttention.__init__inputr   r   c           
      C   sX   |j \}}}|dd }|d ||  }t||g}| |jddd\}}	||	fS )Nr   r
   r4   chunksr>   )r   r   r   rA   rP   chunk)
rU   rX   r   r   r+   summary_lengthZright_ctx_utterance_blockZmems_right_ctx_utterance_blockkeyvaluer   r   r   _gen_key_valuew   s   z!_EmformerAttention._gen_key_valueattention_weightsattention_maskr   c                 C   s   |  }||d| j}|d}|d| j }|d urC||| j|d}||ddtj	| j}||| j |d}tj
jj|dd|}tj
jj|t | j| jdS )Nr   r
   r4   r=   )ptraining)floatZmasked_fillr   rK   r   rG   viewtor   boolr$   Z
functionalZsoftmaxZtype_asrH   rd   )rU   r`   ra   r   Zattention_weights_floatr   r   attention_probsr   r   r   _gen_attention_probs   s   
z'_EmformerAttention._gen_attention_probsr   r   r   r   r   left_context_valc	                    s   | d | d| d | d }	t|||g}
t|||gjddd\}}|d ur{|d ur{|	t|  | d }t|d | d|  ||| d| d  g}t|d | d|  ||| d| d  g} fdd|
||fD \}}}t|j	 |
dd}t||||||}|||}t||}|j j |	jj fkrtd|
dd |	 j}|}| d}|d |	|  }||	| d  }jrt|}ntj|dd	d
}||||fS )Nr
   r   r4   rY   c                    s4   g | ]}|  d  j jj ddqS )rb   r   r
   )
contiguousrf   rG   rF   	transpose)r*   Ztensorr   rU   r   r   r,      s    &z4_EmformerAttention._forward_impl.<locals>.<listcomp>z+Computed attention has incorrect dimensionsi
   )minr   )r   rQ   r   rA   rP   r[   r   r   ZbmmrN   rm   r!   rj   r   rG   rF   AssertionErrorrl   rf   rR   rJ   tanhclamp)rU   r   r   r   r   r   ra   r   rk   r   queryr]   r^   r    Zreshaped_queryZreshaped_keyZreshaped_valuer`   r   ri   	attentionZoutput_right_context_memsr\   output_right_contextoutput_memsr   rn   r   _forward_impl   sP   
$	


z _EmformerAttention._forward_implc           
      C   s,   |  ||||||\}}}	}	||dd fS )ac  Forward pass for training.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        S: number of summary elements;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
            attention_mask (torch.Tensor): attention mask for underlying attention module.

        Returns:
            (Tensor, Tensor):
                Tensor
                    output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
        Nrb   )rx   )
rU   r   r   r   r   r   ra   outputrw   r+   r   r   r   forward   s   "z_EmformerAttention.forwardc              
   C   s   | d| d | d }| d| d | d | d }	t||	jtj|jd}
d|
dd| df< | j||||||
||d\}}}}|||| d| d d || d| d d fS )a  Forward pass for inference.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        S: number of summary elements;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
            left_context_key (torch.Tensor): left context attention key computed from preceding invocation.
            left_context_val (torch.Tensor): left context attention value computed from preceding invocation.

        Returns:
            (Tensor, Tensor, Tensor, and Tensor):
                Tensor
                    output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
                Tensor
                    attention key computed for left context and utterance.
                Tensor
                    attention value computed for left context and utterance.
        r   r	   r   Trb   N)r   rk   )r   r   r;   rg   rh   r   rx   )rU   r   r   r   r   r   r   rk   Z	query_dimZkey_dimra   ry   rw   r]   r^   r   r   r   infer   s&   )(z_EmformerAttention.infer)rD   NFrE   )NN)__name__
__module____qualname____doc__r   re   r   rh   rM   r   Tensorr   r_   rj   rx   rz   jitexportr|   __classcell__r   r   rV   r   rC   L   s    &
	

I
%	rC   c                       sh  e Zd ZdZ							d6ded	ed
ededededededee dedef fddZ	dedee
j dee
j fddZdee
j dee
je
je
jf fddZde
jde
jded e
jdee
j dee
j fd!d"Zd#e
jd$e
jd%e
jde
jfd&d'Zd$e
jd%e
jdee
je
jf fd(d)Zd#e
jd$e
jd%e
jdee
je
jf fd*d+Zd$e
jd,e
jd%e
jd e
jd-ee
j dee
je
jf fd.d/Zd$e
jd,e
jd%e
jd e
jdeee
j  dee
je
jee
j f fd0d1Zd$e
jd,e
jd%e
jd e
jd-e
jdee
je
je
jf fd2d3Ze
jjd$e
jd,e
jd%e
jdeee
j  d e
jdee
je
jee
j e
jf fd4d5Z  ZS )7_EmformerLayera$  Emformer layer that constitutes Emformer.

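# Minimal shape-check sketch for the attention module above (added commentary, not
# part of the upstream module; the sizes are arbitrary and follow the docstring
# conventions: T utterance frames, R right-context frames, S summary elements,
# M memory elements):
#
#   >>> attn = _EmformerAttention(input_dim=64, num_heads=4)
#   >>> T, B, R, S, M = 10, 2, 2, 1, 0
#   >>> utterance = torch.randn(T, B, 64)
#   >>> lengths = torch.full((B,), T)
#   >>> right_context = torch.randn(R, B, 64)
#   >>> summary = torch.randn(S, B, 64)
#   >>> mems = torch.empty(M, B, 64)
#   >>> mask = torch.zeros(R + T + S, M + R + T, dtype=torch.bool)  # queries x keys
#   >>> output, out_mems = attn(utterance, lengths, right_context, summary, mems, mask)
#   >>> output.shape, out_mems.shape
#   (torch.Size([12, 2, 64]), torch.Size([0, 2, 64]))
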
class _EmformerLayer(torch.nn.Module):
    r"""Emformer layer that constitutes Emformer.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads.
        ffn_dim (int): hidden layer dimension of feedforward network.
        segment_length (int): length of each input segment.
        dropout (float, optional): dropout probability. (Default: 0.0)
        activation (str, optional): activation function to use in feedforward network.
            Must be one of ("relu", "gelu", "silu"). (Default: "relu")
        left_context_length (int, optional): length of left context. (Default: 0)
        max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
        weight_init_gain (float or None, optional): scale factor to apply when initializing
            attention module parameters. (Default: ``None``)
        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
    """

    def __init__(
        self,
        input_dim: int,
        num_heads: int,
        ffn_dim: int,
        segment_length: int,
        dropout: float = 0.0,
        activation: str = "relu",
        left_context_length: int = 0,
        max_memory_size: int = 0,
        weight_init_gain: Optional[float] = None,
        tanh_on_mem: bool = False,
        negative_inf: float = -1e8,
    ):
        super().__init__()

        self.attention = _EmformerAttention(
            input_dim=input_dim,
            num_heads=num_heads,
            dropout=dropout,
            weight_init_gain=weight_init_gain,
            tanh_on_mem=tanh_on_mem,
            negative_inf=negative_inf,
        )
        self.dropout = torch.nn.Dropout(dropout)
        self.memory_op = torch.nn.AvgPool1d(kernel_size=segment_length, stride=segment_length, ceil_mode=True)

        activation_module = _get_activation_module(activation)
        self.pos_ff = torch.nn.Sequential(
            torch.nn.LayerNorm(input_dim),
            torch.nn.Linear(input_dim, ffn_dim),
            activation_module,
            torch.nn.Dropout(dropout),
            torch.nn.Linear(ffn_dim, input_dim),
            torch.nn.Dropout(dropout),
        )
        self.layer_norm_input = torch.nn.LayerNorm(input_dim)
        self.layer_norm_output = torch.nn.LayerNorm(input_dim)

        self.left_context_length = left_context_length
        self.segment_length = segment_length
        self.max_memory_size = max_memory_size
        self.input_dim = input_dim

        self.use_mem = max_memory_size > 0

    def _init_state(self, batch_size: int, device: Optional[torch.device]) -> List[torch.Tensor]:
        empty_memory = torch.zeros(self.max_memory_size, batch_size, self.input_dim, device=device)
        left_context_key = torch.zeros(self.left_context_length, batch_size, self.input_dim, device=device)
        left_context_val = torch.zeros(self.left_context_length, batch_size, self.input_dim, device=device)
        past_length = torch.zeros(1, batch_size, dtype=torch.int32, device=device)
        return [empty_memory, left_context_key, left_context_val, past_length]

    def _unpack_state(self, state: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        past_length = state[3][0][0].item()
        past_left_context_length = min(self.left_context_length, past_length)
        past_mem_length = min(self.max_memory_size, math.ceil(past_length / self.segment_length))
        pre_mems = state[0][self.max_memory_size - past_mem_length :]
        lc_key = state[1][self.left_context_length - past_left_context_length :]
        lc_val = state[2][self.left_context_length - past_left_context_length :]
        return pre_mems, lc_key, lc_val

    def _pack_state(
        self,
        next_k: torch.Tensor,
        next_v: torch.Tensor,
        update_length: int,
        mems: torch.Tensor,
        state: List[torch.Tensor],
    ) -> List[torch.Tensor]:
        new_k = torch.cat([state[1], next_k])
        new_v = torch.cat([state[2], next_v])
        state[0] = torch.cat([state[0], mems])[-self.max_memory_size :]
        state[1] = new_k[new_k.shape[0] - self.left_context_length :]
        state[2] = new_v[new_v.shape[0] - self.left_context_length :]
        state[3] = state[3] + update_length
        return state

    def _process_attention_output(
        self,
        rc_output: torch.Tensor,
        utterance: torch.Tensor,
        right_context: torch.Tensor,
    ) -> torch.Tensor:
        result = self.dropout(rc_output) + torch.cat([right_context, utterance])
        result = self.pos_ff(result) + result
        result = self.layer_norm_output(result)
        return result

    def _apply_pre_attention_layer_norm(
        self, utterance: torch.Tensor, right_context: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        layer_norm_input = self.layer_norm_input(torch.cat([right_context, utterance]))
        return (
            layer_norm_input[right_context.size(0) :],
            layer_norm_input[: right_context.size(0)],
        )

    def _apply_post_attention_ffn(
        self, rc_output: torch.Tensor, utterance: torch.Tensor, right_context: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        rc_output = self._process_attention_output(rc_output, utterance, right_context)
        return rc_output[right_context.size(0) :], rc_output[: right_context.size(0)]

    def _apply_attention_forward(
        self,
        utterance: torch.Tensor,
        lengths: torch.Tensor,
        right_context: torch.Tensor,
        mems: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if attention_mask is None:
            raise ValueError("attention_mask must be not None when for_inference is False")

        if self.use_mem:
            summary = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)
        else:
            summary = torch.empty(0).to(dtype=utterance.dtype, device=utterance.device)
        rc_output, next_m = self.attention(
            utterance=utterance,
            lengths=lengths,
            right_context=right_context,
            summary=summary,
            mems=mems,
            attention_mask=attention_mask,
        )
        return rc_output, next_m

    def _apply_attention_infer(
        self,
        utterance: torch.Tensor,
        lengths: torch.Tensor,
        right_context: torch.Tensor,
        mems: torch.Tensor,
        state: Optional[List[torch.Tensor]],
    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
        if state is None:
            state = self._init_state(utterance.size(1), device=utterance.device)
        pre_mems, lc_key, lc_val = self._unpack_state(state)
        if self.use_mem:
            summary = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)
            summary = summary[:1]
        else:
            summary = torch.empty(0).to(dtype=utterance.dtype, device=utterance.device)
        rc_output, next_m, next_k, next_v = self.attention.infer(
            utterance=utterance,
            lengths=lengths,
            right_context=right_context,
            summary=summary,
            mems=pre_mems,
            left_context_key=lc_key,
            left_context_val=lc_val,
        )
        state = self._pack_state(next_k, next_v, utterance.size(0), mems, state)
        return rc_output, next_m, state

    def forward(
        self,
        utterance: torch.Tensor,
        lengths: torch.Tensor,
        right_context: torch.Tensor,
        mems: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        r"""Forward pass for training.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
            attention_mask (torch.Tensor): attention mask for underlying attention module.

        Returns:
            (Tensor, Tensor, Tensor):
                Tensor
                    encoded utterance frames, with shape `(T, B, D)`.
                Tensor
                    updated right context frames, with shape `(R, B, D)`.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
        """
        (
            layer_norm_utterance,
            layer_norm_right_context,
        ) = self._apply_pre_attention_layer_norm(utterance, right_context)
        rc_output, output_mems = self._apply_attention_forward(
            layer_norm_utterance,
            lengths,
            layer_norm_right_context,
            mems,
            attention_mask,
        )
        output_utterance, output_right_context = self._apply_post_attention_ffn(rc_output, utterance, right_context)
        return output_utterance, output_right_context, output_mems

    @torch.jit.export
    def infer(
        self,
        utterance: torch.Tensor,
        lengths: torch.Tensor,
        right_context: torch.Tensor,
        state: Optional[List[torch.Tensor]],
        mems: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], torch.Tensor]:
        r"""Forward pass for inference.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            state (List[torch.Tensor] or None): list of tensors representing layer internal state
                generated in preceding invocation of ``infer``.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.

        Returns:
            (Tensor, Tensor, List[torch.Tensor], Tensor):
                Tensor
                    encoded utterance frames, with shape `(T, B, D)`.
                Tensor
                    updated right context frames, with shape `(R, B, D)`.
                List[Tensor]
                    list of tensors representing layer internal state
                    generated in current invocation of ``infer``.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
        """
        (
            layer_norm_utterance,
            layer_norm_right_context,
        ) = self._apply_pre_attention_layer_norm(utterance, right_context)
        rc_output, output_mems, output_state = self._apply_attention_infer(
            layer_norm_utterance, lengths, layer_norm_right_context, mems, state
        )
        output_utterance, output_right_context = self._apply_post_attention_ffn(rc_output, utterance, right_context)
        return output_utterance, output_right_context, output_state, output_mems
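
# Added commentary (not part of the upstream module): each layer's streaming state
# is the list built by `_init_state` above: [memory bank, left-context key,
# left-context value, past-length counter], all with fixed-size buffers.
# `_pack_state` shifts new entries in from the right so that only the newest
# `left_context_length` key/value frames and `max_memory_size` memory slots survive
# between `infer` calls, keeping the state constant-sized and TorchScript-friendly.
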
class _EmformerImpl(torch.nn.Module):
    def __init__(
        self,
        emformer_layers: torch.nn.ModuleList,
        segment_length: int,
        left_context_length: int = 0,
        right_context_length: int = 0,
        max_memory_size: int = 0,
    ):
        super().__init__()

        self.use_mem = max_memory_size > 0
        self.memory_op = torch.nn.AvgPool1d(
            kernel_size=segment_length,
            stride=segment_length,
            ceil_mode=True,
        )
        self.emformer_layers = emformer_layers
        self.left_context_length = left_context_length
        self.right_context_length = right_context_length
        self.segment_length = segment_length
        self.max_memory_size = max_memory_size

    def _gen_right_context(self, input: torch.Tensor) -> torch.Tensor:
        T = input.shape[0]
        num_segs = math.ceil((T - self.right_context_length) / self.segment_length)
        right_context_blocks = []
        for seg_idx in range(num_segs - 1):
            start = (seg_idx + 1) * self.segment_length
            end = start + self.right_context_length
            right_context_blocks.append(input[start:end])
        right_context_blocks.append(input[T - self.right_context_length :])
        return torch.cat(right_context_blocks)

    def _gen_attention_mask_col_widths(self, seg_idx: int, utterance_length: int) -> List[int]:
        num_segs = math.ceil(utterance_length / self.segment_length)
        rc = self.right_context_length
        lc = self.left_context_length
        rc_start = seg_idx * rc
        rc_end = rc_start + rc
        seg_start = max(seg_idx * self.segment_length - lc, 0)
        seg_end = min((seg_idx + 1) * self.segment_length, utterance_length)
        rc_length = self.right_context_length * num_segs

        if self.use_mem:
            m_start = max(seg_idx - self.max_memory_size, 0)
            mem_length = num_segs - 1
            col_widths = [
                m_start,  # before memory
                seg_idx - m_start,  # memory
                mem_length - seg_idx,  # after memory
                rc_start,  # before right context
                rc,  # right context
                rc_length - rc_end,  # after right context
                seg_start,  # before query segment
                seg_end - seg_start,  # query segment
                utterance_length - seg_end,  # after query segment
            ]
        else:
            col_widths = [
                rc_start,  # before right context
                rc,  # right context
                rc_length - rc_end,  # after right context
                seg_start,  # before query segment
                seg_end - seg_start,  # query segment
                utterance_length - seg_end,  # after query segment
            ]

        return col_widths

    def _gen_attention_mask(self, input: torch.Tensor) -> torch.Tensor:
        utterance_length = input.size(0)
        num_segs = math.ceil(utterance_length / self.segment_length)

        rc_mask = []
        query_mask = []
        summary_mask = []

        if self.use_mem:
            num_cols = 9
            # memory, right context, query segment
            rc_q_cols_mask = [idx in [1, 4, 7] for idx in range(num_cols)]
            # right context, query segment
            s_cols_mask = [idx in [4, 7] for idx in range(num_cols)]
            masks_to_concat = [rc_mask, query_mask, summary_mask]
        else:
            num_cols = 6
            # right context, query segment
            rc_q_cols_mask = [idx in [1, 4] for idx in range(num_cols)]
            s_cols_mask = None
            masks_to_concat = [rc_mask, query_mask]

        for seg_idx in range(num_segs):
            col_widths = self._gen_attention_mask_col_widths(seg_idx, utterance_length)

            rc_mask_block = _gen_attention_mask_block(
                col_widths, rc_q_cols_mask, self.right_context_length, input.device
            )
            rc_mask.append(rc_mask_block)

            query_mask_block = _gen_attention_mask_block(
                col_widths,
                rc_q_cols_mask,
                min(
                    self.segment_length,
                    utterance_length - seg_idx * self.segment_length,
                ),
                input.device,
            )
            query_mask.append(query_mask_block)

            if s_cols_mask is not None:
                summary_mask_block = _gen_attention_mask_block(col_widths, s_cols_mask, 1, input.device)
                summary_mask.append(summary_mask_block)

        attention_mask = (1 - torch.cat([torch.cat(mask) for mask in masks_to_concat])).to(torch.bool)
        return attention_mask

    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        r"""Forward pass for training and non-streaming inference.

        B: batch size;
        T: max number of input frames in batch;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): utterance frames right-padded with right context frames, with
                shape `(B, T + right_context_length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid utterance frames for i-th batch element in ``input``.

        Returns:
            (Tensor, Tensor):
                Tensor
                    output frames, with shape `(B, T, D)`.
                Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output frames.
        """
        input = input.permute(1, 0, 2)
        right_context = self._gen_right_context(input)
        utterance = input[: input.size(0) - self.right_context_length]
        attention_mask = self._gen_attention_mask(utterance)
        mems = (
            self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[:-1]
            if self.use_mem
            else torch.empty(0).to(dtype=input.dtype, device=input.device)
        )
        output = utterance
        for layer in self.emformer_layers:
            output, right_context, mems = layer(output, lengths, right_context, mems, attention_mask)
        return output.permute(1, 0, 2), lengths

    @torch.jit.export
    def infer(
        self,
        input: torch.Tensor,
        lengths: torch.Tensor,
        states: Optional[List[List[torch.Tensor]]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        r"""Forward pass for streaming inference.

        B: batch size;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): utterance frames right-padded with right context frames, with
                shape `(B, segment_length + right_context_length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.
            states (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing internal state generated in preceding invocation of ``infer``. (Default: ``None``)

        Returns:
            (Tensor, Tensor, List[List[Tensor]]):
                Tensor
                    output frames, with shape `(B, segment_length, D)`.
                Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output frames.
                List[List[Tensor]]
                    output states; list of lists of tensors representing internal state
                    generated in current invocation of ``infer``.
        """
        if input.size(1) != self.segment_length + self.right_context_length:
            raise ValueError(
                "Per configured segment_length and right_context_length"
                f", expected size of {self.segment_length + self.right_context_length} for dimension 1 of input"
                f", but got {input.size(1)}."
            )
        input = input.permute(1, 0, 2)
        right_context_start_idx = input.size(0) - self.right_context_length
        right_context = input[right_context_start_idx:]
        utterance = input[:right_context_start_idx]
        output_lengths = torch.clamp(lengths - self.right_context_length, min=0)
        mems = (
            self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)
            if self.use_mem
            else torch.empty(0).to(dtype=input.dtype, device=input.device)
        )
        output = utterance
        output_states: List[List[torch.Tensor]] = []
        for layer_idx, layer in enumerate(self.emformer_layers):
            output, right_context, output_state, mems = layer.infer(
                output,
                output_lengths,
                right_context,
                None if states is None else states[layer_idx],
                mems,
            )
            output_states.append(output_state)

        return output.permute(1, 0, 2), output_lengths, output_states

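
# Added commentary (not part of the upstream module): `_EmformerImpl.forward`
# consumes a whole right-padded utterance at once and builds the attention mask
# for every segment up front, while `infer` consumes exactly one chunk of
# `segment_length + right_context_length` frames per call and threads `states`
# through the layers; both paths share the per-layer attention code above.
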
ededededededededee dedef fddZ	  Z
S )r   a_  Emformer architecture introduced in
    *Emformer: Efficient Memory Transformer Based Acoustic Model for Low Latency Streaming Speech Recognition*
    :cite:`shi2021emformer`.

    See Also:
        * :func:`~torchaudio.models.emformer_rnnt_model`,
          :func:`~torchaudio.models.emformer_rnnt_base`: factory functions.
        * :class:`torchaudio.pipelines.RNNTBundle`: ASR pipelines with pretrained model.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Emformer layer.
        ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
        num_layers (int): number of Emformer layers to instantiate.
        segment_length (int): length of each input segment.
        dropout (float, optional): dropout probability. (Default: 0.0)
        activation (str, optional): activation function to use in each Emformer layer's
            feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu")
        left_context_length (int, optional): length of left context. (Default: 0)
        right_context_length (int, optional): length of right context. (Default: 0)
        max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
        weight_init_scale_strategy (str or None, optional): per-layer weight initialization scaling
            strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise")
        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)

    Examples:
        >>> emformer = Emformer(512, 8, 2048, 20, 4, right_context_length=1)
        >>> input = torch.rand(128, 400, 512)  # batch, num_frames, feature_dim
        >>> lengths = torch.randint(1, 200, (128,))  # batch
        >>> output, lengths = emformer(input, lengths)
        >>> input = torch.rand(128, 5, 512)
        >>> lengths = torch.ones(128) * 5
        >>> output, lengths, states = emformer.infer(input, lengths, None)
    """

    def __init__(
        self,
        input_dim: int,
        num_heads: int,
        ffn_dim: int,
        num_layers: int,
        segment_length: int,
        dropout: float = 0.0,
        activation: str = "relu",
        left_context_length: int = 0,
        right_context_length: int = 0,
        max_memory_size: int = 0,
        weight_init_scale_strategy: Optional[str] = "depthwise",
        tanh_on_mem: bool = False,
        negative_inf: float = -1e8,
    ):
        weight_init_gains = _get_weight_init_gains(weight_init_scale_strategy, num_layers)
        emformer_layers = torch.nn.ModuleList(
            [
                _EmformerLayer(
                    input_dim,
                    num_heads,
                    ffn_dim,
                    segment_length,
                    dropout=dropout,
                    activation=activation,
                    left_context_length=left_context_length,
                    max_memory_size=max_memory_size,
                    weight_init_gain=weight_init_gains[layer_idx],
                    tanh_on_mem=tanh_on_mem,
                    negative_inf=negative_inf,
                )
                for layer_idx in range(num_layers)
            ]
        )
        super().__init__(
            emformer_layers,
            segment_length,
            left_context_length=left_context_length,
            right_context_length=right_context_length,
            max_memory_size=max_memory_size,
        )
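

# Streaming usage sketch (added commentary, mirroring the class docstring example;
# chunk count and sizes are arbitrary):
#
#   >>> emformer = Emformer(512, 8, 2048, 20, segment_length=4, right_context_length=1)
#   >>> states = None
#   >>> for _ in range(3):  # feed successive 5-frame chunks: 4 segment + 1 right context
#   ...     chunk = torch.rand(128, 5, 512)
#   ...     chunk_lengths = torch.full((128,), 5)
#   ...     output, out_lengths, states = emformer.infer(chunk, chunk_lengths, states)
#   >>> output.shape
#   torch.Size([128, 4, 512])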