
import math
from typing import Tuple, Type

import torch
from torch import Tensor, nn

from ultralytics.nn.modules import MLPBlock


class TwoWayTransformer(nn.Module):
    """
    A Two-Way Transformer module for simultaneous attention to image and query points.

    This class implements a specialized transformer decoder that attends to an input image using queries with
    supplied positional embeddings. It's useful for tasks like object detection, image segmentation, and point
    cloud processing.

    Attributes:
        depth (int): Number of layers in the transformer.
        embedding_dim (int): Channel dimension for input embeddings.
        num_heads (int): Number of heads for multihead attention.
        mlp_dim (int): Internal channel dimension for the MLP block.
        layers (nn.ModuleList): List of TwoWayAttentionBlock layers composing the transformer.
        final_attn_token_to_image (Attention): Final attention layer from queries to image.
        norm_final_attn (nn.LayerNorm): Layer normalization applied to final queries.

    Methods:
        forward: Process image and point embeddings through the transformer.

    Examples:
        >>> transformer = TwoWayTransformer(depth=6, embedding_dim=256, num_heads=8, mlp_dim=2048)
        >>> image_embedding = torch.randn(1, 256, 32, 32)
        >>> image_pe = torch.randn(1, 256, 32, 32)
        >>> point_embedding = torch.randn(1, 100, 256)
        >>> output_queries, output_image = transformer(image_embedding, image_pe, point_embedding)
        >>> print(output_queries.shape, output_image.shape)
       depthembedding_dim	num_headsmlp_dim
activationattention_downsample_ratereturnNc                 `   t         |           || _        || _        || _        || _        t        j                         | _        t        |      D ]/  }| j                  j                  t        ||||||dk(               1 t        |||      | _        t        j                  |      | _        y)ak  
        Initialize a Two-Way Transformer for simultaneous attention to image and query points.

        Args:
            depth (int): Number of layers in the transformer.
            embedding_dim (int): Channel dimension for input embeddings.
            num_heads (int): Number of heads for multihead attention. Must divide embedding_dim.
            mlp_dim (int): Internal channel dimension for the MLP block.
            activation (Type[nn.Module], optional): Activation function to use in the MLP block.
            attention_downsample_rate (int, optional): Downsampling rate for attention mechanism.
        """
        super().__init__()
        self.depth = depth
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.mlp_dim = mlp_dim

        # Stack of two-way attention blocks; only the first block skips the positional encoding on the queries
        self.layers = nn.ModuleList()
        for i in range(depth):
            self.layers.append(
                TwoWayAttentionBlock(
                    embedding_dim=embedding_dim,
                    num_heads=num_heads,
                    mlp_dim=mlp_dim,
                    activation=activation,
                    attention_downsample_rate=attention_downsample_rate,
                    skip_first_layer_pe=(i == 0),
                )
            )

        # Final attention from the point queries to the image, followed by a layer norm
        self.final_attn_token_to_image = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate)
        self.norm_final_attn = nn.LayerNorm(embedding_dim)

    def forward(self, image_embedding: Tensor, image_pe: Tensor, point_embedding: Tensor) -> Tuple[Tensor, Tensor]:
        """
        Process image and point embeddings through the Two-Way Transformer.

        Args:
            image_embedding (torch.Tensor): Image to attend to, with shape (B, embedding_dim, H, W).
            image_pe (torch.Tensor): Positional encoding to add to the image, with same shape as image_embedding.
            point_embedding (torch.Tensor): Embedding to add to query points, with shape (B, N_points, embedding_dim).

        Returns:
            queries (torch.Tensor): Processed point embeddings with shape (B, N_points, embedding_dim).
            keys (torch.Tensor): Processed image embeddings with shape (B, H*W, embedding_dim).
        """
        # Flatten the image embedding and its positional encoding: (B, C, H, W) -> (B, H*W, C)
        image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
        image_pe = image_pe.flatten(2).permute(0, 2, 1)

        # Prepare queries (point tokens) and keys (image tokens)
        queries = point_embedding
        keys = image_embedding

        # Apply the stack of two-way attention blocks
        for layer in self.layers:
            queries, keys = layer(queries=queries, keys=keys, query_pe=point_embedding, key_pe=image_pe)

        # Final attention layer from the points to the image, then layer norm
        q = queries + point_embedding
        k = keys + image_pe
        attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm_final_attn(queries)

        return queries, keys


class TwoWayAttentionBlock(nn.Module):
    """
    A two-way attention block for simultaneous attention to image and query points.

    This class implements a specialized transformer block with four main layers: self-attention on sparse inputs,
    cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention of dense
    inputs to sparse inputs.

    Attributes:
        self_attn (Attention): Self-attention layer for queries.
        norm1 (nn.LayerNorm): Layer normalization after self-attention.
        cross_attn_token_to_image (Attention): Cross-attention layer from queries to keys.
        norm2 (nn.LayerNorm): Layer normalization after token-to-image attention.
        mlp (MLPBlock): MLP block for transforming query embeddings.
        norm3 (nn.LayerNorm): Layer normalization after MLP block.
        norm4 (nn.LayerNorm): Layer normalization after image-to-token attention.
        cross_attn_image_to_token (Attention): Cross-attention layer from keys to queries.
        skip_first_layer_pe (bool): Whether to skip positional encoding in the first layer.

    Methods:
        forward: Apply self-attention and cross-attention to queries and keys.

    Examples:
        >>> embedding_dim, num_heads = 256, 8
        >>> block = TwoWayAttentionBlock(embedding_dim, num_heads)
        >>> queries = torch.randn(1, 100, embedding_dim)
        >>> keys = torch.randn(1, 1000, embedding_dim)
        >>> query_pe = torch.randn(1, 100, embedding_dim)
        >>> key_pe = torch.randn(1, 1000, embedding_dim)
        >>> processed_queries, processed_keys = block(queries, keys, query_pe, key_pe)
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        mlp_dim: int = 2048,
        activation: Type[nn.Module] = nn.ReLU,
        attention_downsample_rate: int = 2,
        skip_first_layer_pe: bool = False,
    ) -> None:
        """
        Initialize a TwoWayAttentionBlock for simultaneous attention to image and query points.

        This block implements a specialized transformer layer with four main components: self-attention on sparse
        inputs, cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention
        of dense inputs to sparse inputs.

        Args:
            embedding_dim (int): Channel dimension of the embeddings.
            num_heads (int): Number of attention heads in the attention layers.
            mlp_dim (int, optional): Hidden dimension of the MLP block.
            activation (Type[nn.Module], optional): Activation function for the MLP block.
            attention_downsample_rate (int, optional): Downsampling rate for the attention mechanism.
            skip_first_layer_pe (bool, optional): Whether to skip positional encoding in the first layer.
        """
        super().__init__()
        self.self_attn = Attention(embedding_dim, num_heads)
        self.norm1 = nn.LayerNorm(embedding_dim)

        self.cross_attn_token_to_image = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate)
        self.norm2 = nn.LayerNorm(embedding_dim)

        self.mlp = MLPBlock(embedding_dim, mlp_dim, activation)
        self.norm3 = nn.LayerNorm(embedding_dim)

        self.norm4 = nn.LayerNorm(embedding_dim)
        self.cross_attn_image_to_token = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate)

        self.skip_first_layer_pe = skip_first_layer_pe

    def forward(self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor) -> Tuple[Tensor, Tensor]:
        """
        Apply two-way attention to process query and key embeddings in a transformer block.

        Args:
            queries (torch.Tensor): Query embeddings with shape (B, N_queries, embedding_dim).
            keys (torch.Tensor): Key embeddings with shape (B, N_keys, embedding_dim).
            query_pe (torch.Tensor): Positional encodings for queries with same shape as queries.
            key_pe (torch.Tensor): Positional encodings for keys with same shape as keys.

        Returns:
            queries (torch.Tensor): Processed query embeddings with shape (B, N_queries, embedding_dim).
            keys (torch.Tensor): Processed key embeddings with shape (B, N_keys, embedding_dim).
        """
        # Self-attention block (positional encoding is skipped on the first layer of the transformer)
        if self.skip_first_layer_pe:
            queries = self.self_attn(q=queries, k=queries, v=queries)
        else:
            q = queries + query_pe
            attn_out = self.self_attn(q=q, k=q, v=queries)
            queries = queries + attn_out
        queries = self.norm1(queries)

        # Cross-attention block, tokens attending to the image embedding
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
        queries = queries + attn_out
        queries = self.norm2(queries)

        # MLP block on the query tokens
        mlp_out = self.mlp(queries)
        queries = queries + mlp_out
        queries = self.norm3(queries)

        # Cross-attention block, image embedding attending to the tokens
        q = queries + query_pe
        k = keys + key_pe
        attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
        keys = keys + attn_out
        keys = self.norm4(keys)

        return queries, keys


class Attention(nn.Module):
    """
    An attention layer with downscaling capability for embedding size after projection.

    This class implements a multi-head attention mechanism with the option to downsample the internal
    dimension of queries, keys, and values.

    Attributes:
        embedding_dim (int): Dimensionality of input embeddings.
        kv_in_dim (int): Dimensionality of key and value inputs.
        internal_dim (int): Internal dimension after downsampling.
        num_heads (int): Number of attention heads.
        q_proj (nn.Linear): Linear projection for queries.
        k_proj (nn.Linear): Linear projection for keys.
        v_proj (nn.Linear): Linear projection for values.
        out_proj (nn.Linear): Linear projection for output.

    Methods:
        _separate_heads: Separate input tensor into attention heads.
        _recombine_heads: Recombine separated attention heads.
        forward: Compute attention output for given query, key, and value tensors.

    Examples:
        >>> attn = Attention(embedding_dim=256, num_heads=8, downsample_rate=2)
        >>> q = torch.randn(1, 100, 256)
        >>> k = v = torch.randn(1, 50, 256)
        >>> output = attn(q, k, v)
        >>> print(output.shape)
        torch.Size([1, 100, 256])
    """

    def __init__(
        self,
        embedding_dim: int,
        num_heads: int,
        downsample_rate: int = 1,
        kv_in_dim: int = None,
    ) -> None:
        """
        Initialize the Attention module with specified dimensions and settings.

        Args:
            embedding_dim (int): Dimensionality of input embeddings.
            num_heads (int): Number of attention heads.
            downsample_rate (int, optional): Factor by which internal dimensions are downsampled.
            kv_in_dim (int | None, optional): Dimensionality of key and value inputs. If None, uses embedding_dim.

        Raises:
            AssertionError: If num_heads does not evenly divide the internal dim (embedding_dim / downsample_rate).
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.kv_in_dim = kv_in_dim if kv_in_dim is not None else embedding_dim
        self.internal_dim = embedding_dim // downsample_rate
        self.num_heads = num_heads
        assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim."

        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
        self.k_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
        self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)

    @staticmethod
    def _separate_heads(x: Tensor, num_heads: int) -> Tensor:
        """Separate the input tensor into the specified number of attention heads."""
        b, n, c = x.shape
        x = x.reshape(b, n, num_heads, c // num_heads)
        return x.transpose(1, 2)  # (B, N_heads, N_tokens, C_per_head)

    @staticmethod
    def _recombine_heads(x: Tensor) -> Tensor:
        """Recombine separated attention heads into a single tensor."""
        b, n_heads, n_tokens, c_per_head = x.shape
        x = x.transpose(1, 2)
        return x.reshape(b, n_tokens, n_heads * c_per_head)  # (B, N_tokens, C)

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
        """
        Apply multi-head attention to query, key, and value tensors with optional downsampling.

        Args:
            q (torch.Tensor): Query tensor with shape (B, N_q, embedding_dim).
            k (torch.Tensor): Key tensor with shape (B, N_k, embedding_dim).
            v (torch.Tensor): Value tensor with shape (B, N_k, embedding_dim).

        Returns:
            (torch.Tensor): Output tensor after attention with shape (B, N_q, embedding_dim).
        """
        # Input projections
        q = self.q_proj(q)
        k = self.k_proj(k)
        v = self.v_proj(v)

        # Separate into heads
        q = self._separate_heads(q, self.num_heads)
        k = self._separate_heads(k, self.num_heads)
        v = self._separate_heads(v, self.num_heads)

        # Scaled dot-product attention
        _, _, _, c_per_head = q.shape
        attn = q @ k.permute(0, 1, 3, 2)  # (B, N_heads, N_q, N_k)
        attn = attn / math.sqrt(c_per_head)
        attn = torch.softmax(attn, dim=-1)

        # Aggregate values, recombine heads, and project back to embedding_dim
        out = attn @ v
        out = self._recombine_heads(out)
        return self.out_proj(out)
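

# Illustrative usage sketch (not part of the library API): wires the classes above together on random tensors,
# mirroring the shapes from the docstring examples. The depth of 2, the 32x32 spatial size, and the 100 query
# points are assumptions chosen only for a quick shape check, not values prescribed by this module.
if __name__ == "__main__":
    transformer = TwoWayTransformer(depth=2, embedding_dim=256, num_heads=8, mlp_dim=2048)
    image_embedding = torch.randn(1, 256, 32, 32)  # (B, C, H, W) image features
    image_pe = torch.randn(1, 256, 32, 32)  # positional encoding, same shape as the image embedding
    point_embedding = torch.randn(1, 100, 256)  # (B, N_points, C) query tokens
    queries, keys = transformer(image_embedding, image_pe, point_embedding)
    print(queries.shape, keys.shape)  # expected: torch.Size([1, 100, 256]) torch.Size([1, 1024, 256])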