
from typing import Optional, TypedDict

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...processing_utils import Unpack
from ...utils import logging
from ...utils.deprecation import deprecate_kwarg
from ..granitemoe.modeling_granitemoe import (
    GraniteMoeDecoderLayer,
    GraniteMoeForCausalLM,
    GraniteMoeModel,
    GraniteMoePreTrainedModel,
)
from .configuration_granitemoeshared import GraniteMoeSharedConfig


logger = logging.get_logger(__name__)


class GraniteFlashAttentionKwargs(TypedDict, total=False):
    """
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`)
            Gets cumulative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`)
            Gets cumulative sequence length for key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor`):
            Index of each packed sequence.
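
    Example (illustrative sketch only): the values below show what these kwargs typically
    hold when two sequences of lengths 3 and 5 are packed into one padding-free row. In
    practice a padding-free data collator produces them; the variable names are made up
    for the example.

    ```python
    import torch

    lengths = [3, 5]  # lengths of the two packed sequences
    flash_kwargs = {
        "cu_seq_lens_q": torch.tensor([0, 3, 8], dtype=torch.long),  # cumulative sequence lengths (query)
        "cu_seq_lens_k": torch.tensor([0, 3, 8], dtype=torch.long),  # cumulative sequence lengths (key)
        "max_length_q": max(lengths),
        "max_length_k": max(lengths),
        "seq_idx": torch.tensor([[0, 0, 0, 1, 1, 1, 1, 1]], dtype=torch.int32),  # sequence index per token
    }
    ```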
    """

    cu_seq_lens_q: torch.LongTensor
    cu_seq_lens_k: torch.LongTensor
    max_length_q: int
    max_length_k: int
    seq_idx: torch.IntTensor


class GraniteMoeSharedMLP(nn.Module):
    """
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
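
    Example (a minimal sketch using toy sizes chosen only for illustration):

    ```python
    import torch

    config = GraniteMoeSharedConfig(hidden_size=8, shared_intermediate_size=16)
    mlp = GraniteMoeSharedMLP(config)

    x = torch.randn(2, 4, config.hidden_size)  # (batch, seq_len, hidden_size)
    out = mlp(x)  # gated MLP: activation(first half of input_linear(x)) * second half, then output_linear
    assert out.shape == x.shape
    ```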
    """

    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__()

        self.input_size = config.hidden_size
        self.hidden_size = config.shared_intermediate_size
        self.activation = ACT2FN[config.hidden_act]
        self.input_linear = nn.Linear(self.input_size, self.hidden_size * 2, bias=False)
        self.output_linear = nn.Linear(self.hidden_size, self.input_size, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.input_linear(hidden_states)
        chunked_hidden_states = hidden_states.chunk(2, dim=-1)
        hidden_states = self.activation(chunked_hidden_states[0]) * chunked_hidden_states[1]
        hidden_states = self.output_linear(hidden_states)
        return hidden_states


class GraniteMoeSharedDecoderLayer(GraniteMoeDecoderLayer):
    def __init__(self, config: GraniteMoeSharedConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        self.shared_mlp = None if config.shared_intermediate_size == 0 else GraniteMoeSharedMLP(config)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        output_router_logits: Optional[bool] = False,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[GraniteFlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )

        hidden_states = residual + hidden_states * self.residual_multiplier

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        moe_hidden_states, router_logits = self.block_sparse_moe(hidden_states)

        if self.shared_mlp is None:
            hidden_states = moe_hidden_states
        else:
            hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
        del moe_hidden_states

        hidden_states = residual + hidden_states * self.residual_multiplier

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs


class GraniteMoeSharedPreTrainedModel(GraniteMoePreTrainedModel):
    config: GraniteMoeSharedConfig
    _no_split_modules = ["GraniteMoeSharedDecoderLayer"]


class GraniteMoeSharedModel(GraniteMoeModel):
    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [GraniteMoeSharedDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )


class GraniteMoeSharedForCausalLM(GraniteMoeForCausalLM):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__(config)
        self.model = GraniteMoeSharedModel(config)
        # Initialize weights and apply final processing
        self.post_init()


__all__ = ["GraniteMoeSharedForCausalLM", "GraniteMoeSharedModel", "GraniteMoeSharedPreTrainedModel"]