
    h                        d dl mZmZ d dlZd dlmZ d dlmZ ddlm	Z	m
Z
 ddlmZ ddlmZmZmZ dd	lmZmZmZmZ  G d
 de      Ze G d de             Z G d dee      Z ed       G d dee             Zg dZy)    )OptionalUnionN)IJepaConfig   )BaseModelOutputWithPoolingImageClassifierOutput)Unpack)TransformersKwargsauto_docstring	torch_int   )ViTEmbeddingsViTForImageClassificationViTModelViTPreTrainedModelc            	            e Zd Zddededdf fdZdej                  dededej                  fd	Z		 	 dd
ej                  de
ej                     dedej                  fdZ xZS )IJepaEmbeddingsconfiguse_mask_tokenreturnNc                     t         |   ||       | `| j                  j                  }t        j                  t        j                  d||j                              | _
        y )N   )super__init__	cls_tokenpatch_embeddingsnum_patchesnn	Parametertorchrandnhidden_sizeposition_embeddings)selfr   r   r   	__class__s       e/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/transformers/models/ijepa/modular_ijepa.pyr   zIJepaEmbeddings.__init__   sL    0N++77#%<<A{FL^L^0_#`     
embeddingsheightwidthc                 0   |j                   d   }| j                  j                   d   }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  }|j                   d   }|| j
                  z  }|| j
                  z  }	t        |dz        }
|j                  d|
|
|      }|j                  dddd      }t        j                  j                  |||	fdd	      }|j                  dddd      j                  dd|      }|S )
a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   g      ?r   r   r   bicubicF)sizemodealign_corners)shaper#   r    jit
is_tracing
patch_sizer   reshapepermuter   
functionalinterpolateview)r$   r(   r)   r*   r   num_positionspatch_pos_embeddim
new_height	new_widthsqrt_num_positionss              r&   interpolate_pos_encodingz(IJepaEmbeddings.interpolate_pos_encoding   s#    !&&q)0066q9 yy##%+*F6UZ?+++22r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nr'   pixel_valuesbool_masked_posr@   c                 x   |j                   \  }}}}| j                  ||      }|Z|j                   d   }	| j                  j                  ||	d      }
|j	                  d      j                  |
      }|d|z
  z  |
|z  z   }|r|| j                  |||      z   }n|| j                  z   }| j                  |      }|S )N)r@   r   r,         ?)	r1   r   
mask_tokenexpand	unsqueezetype_asr@   r#   dropout)r$   rA   rB   r@   
batch_size_r)   r*   r(   
seq_lengthmask_tokensmasks               r&   forwardzIJepaEmbeddings.forward=   s     (4'9'9$
Avu**<Rj*k
&#))!,J//00ZLK",,R088ED#sTz2[45GGJ $#d&C&CJPVX]&^^J#d&>&>>J\\*-
r'   )F)NF)__name__
__module____qualname__r   boolr   r    Tensorintr@   r   
BoolTensorrO   __classcell__r%   s   @r&   r   r      s    a{ aD aT a%5<< % %UX %]b]i]i %T 7;).	ll "%"2"23 #'	
 
r'   r   c                   d    e Zd Zdeej
                  ej                  ej                  f   ddfdZy)IJepaPreTrainedModelmoduler   Nc                 l   t        |t        j                  t        j                  f      rt        j                  j                  |j                  j                  j                  t        j                        d| j                  j                        j                  |j                  j                        |j                  _        |j                  %|j                  j                  j                          yyt        |t        j                         rJ|j                  j                  j                          |j                  j                  j#                  d       yt        |t$              rt        j                  j                  |j&                  j                  j                  t        j                        d| j                  j                        j                  |j&                  j                        |j&                  _        |j(                  %|j(                  j                  j                          yyy)zInitialize the weightsg        )meanstdNrD   )
isinstancer   LinearConv2dinittrunc_normal_weightdatator    float32r   initializer_rangedtypebiaszero_	LayerNormfill_r   r#   rE   )r$   r[   s     r&   _init_weightsz"IJepaPreTrainedModel._init_weightsZ   s   fryy"))45 "$!6!6""%%emm43DKKDaDa "7 "b$$% MM {{&  &&( '-KK""$MM$$S)0.0gg.C.C**//225==AKK11 /D / b++112	 &&+
   ,!!&&,,. - 1r'   )	rP   rQ   rR   r   r   r`   ra   rl   rn    r'   r&   rZ   rZ   X   s.    /E"))RYY*L$M /RV /r'   rZ   c                   .     e Zd Zddededef fdZ xZS )
IJepaModelr   add_pooling_layerr   c                 V    t         |   |       || _        t        ||      | _        y)z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )r   N)r   r   r   r   r(   )r$   r   rr   r   r%   s       r&   r   zIJepaModel.__init__r   s'     	 )&Pr'   )FF)rP   rQ   rR   r   rS   r   rW   rX   s   @r&   rq   rq   q   s(    	Q{ 	Qt 	Q]a 	Q 	Qr'   rq   a  
    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
    e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    )custom_introc                        e Zd Zdef fdZ	 	 	 	 d
deej                     deej                     deej                     dee   de	e
   defd	Z xZS )IJepaForImageClassificationr   c                 h    t         |   |       t        |d      | _        | j	                          y )NF)rr   )r   r   rq   ijepa	post_init)r$   r   r%   s     r&   r   z$IJepaForImageClassification.__init__   s(     %@
r'   rA   	head_masklabelsr@   kwargsr   c                     | j                   |f||d|}|j                  }| j                  |j                  d            }d}	| | j                  ||| j
                  fi |}	t        |	||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )rz   r@   r   )r<   N)losslogitshidden_states
attentions)	rx   last_hidden_state
classifierr]   loss_functionr   r   r   r   )
r$   rA   rz   r{   r@   r|   outputssequence_outputr   r~   s
             r&   rO   z#IJepaForImageClassification.forward   s     /9djj/
%=/
 	/
 "33!5!5!!5!<=%4%%ffdkkLVLD$!//))	
 	
r'   )NNNN)rP   rQ   rR   r   r   r   r    rT   rS   r	   r
   r   rO   rW   rX   s   @r&   rv   rv   ~   s    {  04,0)-37!
u||,!
 ELL)!
 &	!

 #+4.!
 +,!
 
!
r'   rv   )rZ   rq   rv   )typingr   r   r    torch.nnr   -transformers.models.ijepa.configuration_ijepar   modeling_outputsr   r   processing_utilsr	   utilsr
   r   r   vit.modeling_vitr   r   r   r   r   rZ   rq   rv   __all__ro   r'   r&   <module>r      s    "   E Q & B B e eGm GT /- / /0
Q%x 
Q '
"68Q '
'
Tr'   