
from __future__ import annotations

import logging
from typing import Literal

import torch
from torch import Tensor

from sentence_transformers.models.InputModule import InputModule

from .tokenizer import WhitespaceTokenizer

logger = logging.getLogger(__name__)


class BoW(InputModule):
    """Implements a Bag-of-Words (BoW) model to derive sentence embeddings.

    A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab.
    """

    save_in_root: bool = False
    config_keys: list[str] = ["vocab", "word_weights", "unknown_word_weight", "cumulative_term_frequency"]

    def __init__(
        self,
        vocab: list[str],
        word_weights: dict[str, float] = {},
        unknown_word_weight: float = 1,
        cumulative_term_frequency: bool = True,
    ) -> None:
        super().__init__()
        vocab = list(dict.fromkeys(vocab))  # Remove duplicates while preserving order
        self.vocab = vocab
        self.word_weights = word_weights
        self.unknown_word_weight = unknown_word_weight
        self.cumulative_term_frequency = cumulative_term_frequency

        # Map each vocab index to its weight; words missing from word_weights
        # (also after lowercasing) fall back to unknown_word_weight
        self.weights = []
        num_unknown_words = 0
        for word in vocab:
            weight = unknown_word_weight
            if word in word_weights:
                weight = word_weights[word]
            elif word.lower() in word_weights:
                weight = word_weights[word.lower()]
            else:
                num_unknown_words += 1
            self.weights.append(weight)

        logger.info(
            f"{num_unknown_words} out of {len(vocab)} words without a weighting value. Set weight to {unknown_word_weight}"
        )

        self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False)
        self.sentence_embedding_dimension = len(vocab)

    def forward(self, features: dict[str, Tensor]):
        # The BoW vectors are already computed in get_sentence_features, so nothing to do here
        return features

    def tokenize(self, texts: list[str], **kwargs) -> list[int]:
        tokenized = [self.tokenizer.tokenize(text, **kwargs) for text in texts]
        return self.get_sentence_features(tokenized)

    def get_sentence_embedding_dimension(self):
        return self.sentence_embedding_dimension

    def get_sentence_features(
        self, tokenized_texts: list[list[int]], pad_seq_length: int = 0
    ) -> dict[Literal["sentence_embedding"], torch.Tensor]:
        vectors = []

        for tokens in tokenized_texts:
            vector = torch.zeros(self.get_sentence_embedding_dimension(), dtype=torch.float32)
            for token in tokens:
                if self.cumulative_term_frequency:
                    vector[token] += self.weights[token]
                else:
                    vector[token] = self.weights[token]
            vectors.append(vector)

        return {"sentence_embedding": torch.stack(vectors)}

    def save(self, output_path: str, *args, safe_serialization: bool = True, **kwargs) -> None:
        self.save_config(output_path)
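

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): illustrates how the
# class above turns whitespace-tokenized text into weighted bag-of-words
# vectors. The vocabulary and weights below are illustrative placeholders.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    vocab = ["the", "quick", "brown", "fox", "jumps"]
    word_weights = {"the": 0.1, "fox": 2.0}  # e.g. idf-style weights
    bow = BoW(vocab, word_weights=word_weights, unknown_word_weight=1.0)

    # tokenize() already returns the finished vectors via get_sentence_features()
    features = bow.tokenize(["the quick brown fox", "the the fox"])
    print(features["sentence_embedding"].shape)  # torch.Size([2, 5])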