
    hy                    8   d Z ddlmZ ddlZddlZddlZddlZddlZddl	Z	ddl
m
Z
mZ ddlmZmZmZmZmZmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlm Z m!Z!m"Z" ddlm#Z$ ddlm%Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2m3Z3 ddl1m4Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZF ddlGmHZH ddlImJZJmKZKmLZL ddlMmNZO ddlPm#ZQ ddlPmRZRmSZS erddlTZU ej                  eW      ZXeeg eeFe f   f   eeeYgef   e eFf   ZZeeg eeFe f   f   ef   Z[ G d de\      Z] G d  d!eY      Z^ G d" d#eY      Z_	 dM	 	 	 	 	 dNd$Z`dOd%Za G d& d'eD      ZbdPd(Zc	 	 	 	 	 	 dQd)Zd	 	 	 	 	 	 	 	 dRd*Ze	 	 	 	 	 	 	 	 dSd+Zf	 	 	 	 	 	 	 	 	 	 dTd,Zg	 	 	 	 	 	 dUd-Zh	 	 	 	 	 	 dVd.Zi	 	 	 	 	 	 dWd/Zj	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dXd0Zk	 	 	 	 	 	 	 	 	 	 dYd1Zl	 	 	 	 	 	 	 	 	 	 	 	 	 	 dZd2Zmddddd3	 	 	 	 	 	 	 	 	 	 	 	 	 d[d4Zndddd5	 	 	 	 	 	 	 	 	 	 	 	 	 d\d6Zodd7	 	 	 	 	 	 	 	 	 d]d8Zpdddd5	 	 	 	 	 	 	 	 	 	 	 	 	 d^d9Zqdddd5	 	 	 	 	 	 	 	 	 	 	 	 	 d\d:Zrdd7	 	 	 	 	 	 	 	 	 d]d;Zs	 	 	 d_	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d`d<Zt G d= d>eDd?@      Zuej                   G dA dB             ZwdadCZxdbdDZydEZzdddFddd?ddG	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dcdHZ{dddFddd?ddG	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dcdIZ|dJZ}e}e|_         e}j                  dKdL      e{_         y)dz>Utilities for running language models or Chains over datasets.    )annotationsN)datetimetimezone)TYPE_CHECKINGAnyCallableOptionalUnioncast)warn_deprecated)	Callbacks)BaseLanguageModel)BaseMessagemessages_from_dict)
ChatResult	LLMResult)RunnableRunnableConfigRunnableLambdaconfig)utils)EvaluatorCallbackHandlerwait_for_all_evaluators)LangChainTracer)Client)get_git_infoget_langchain_env_var_metadata)EvaluationResultRunEvaluator)run_evaluator)as_runnableis_traceable_function)DatasetDataTypeExampleRunTracerSession)LangSmithError)	HTTPError)	TypedDict)Chain)load_evaluator)EvaluatorTypePairwiseStringEvaluatorStringEvaluator)
evaluation)name_generationprogressc                      e Zd ZdZy)InputFormatErrorz(Raised when the input format is invalid.N)__name__
__module____qualname____doc__     e/var/www/html/eduruby.in/venv/lib/python3.12/site-packages/langchain/smith/evaluation/runner_utils.pyr5   r5   K   s    2r;   r5   c                  $    e Zd ZdZ	 	 ddZddZy)
TestResultz1A dictionary of the results of a single test run.c                    | j                         }|j                  D cg c]  }|j                  d      s|dv r| }}|j                  d      j	                  |d      S c c}w )zReturn quantiles for the feedback scores.

        This method calculates and prints the quantiles for the feedback scores
        across all feedback keys.

        Returns:
            A DataFrame containing the quantiles for each feedback key.
        )inputs.outputs.	reference>   inputoutputall)include   )axis)to_dataframecolumns
startswithdescribedrop)selfdfcolto_drops       r<   get_aggregate_feedbackz!TestResult.get_aggregate_feedbackU   ss       zz
~~BC)) 
 
 {{5{)..wQ.??
s   A"c           	        	 ddl }g }g }| d   j                         D ]e  \  }}|d   }|j                  d      }	t	        |	t
              r'|	j                         D 
ci c]  \  }
}d|
 | }}
}n	|	i }nd|	i}i |d   j                         D 
ci c]  \  }
}d	|
 | c}}
|}d
|v rUt	        |d
   t
              r:|j                  |d
   j                         D 
ci c]  \  }
}d|
 | c}}
       n|d
   |d
<   |j                  i |D ci c]  }d|j                   |j                   c}|j                  d      |d   |j                  d      d       |j                  |       |j                  |       h  |j                  ||      S # t        $ r}d}t        |      |d}~ww xY wc c}}
w c c}}
w c c}}
w c c}w )z#Convert the results to a dataframe.r   NzfPandas is required to convert the results to a dataframe. to install pandas, run `pip install pandas`.resultsfeedbackrD   rA   rC   r@   rB   z
reference.z	feedback.Errorexecution_timerun_id)errorrW   rX   )index)pandasImportErroritemsget
isinstancedictupdatekeyscoreappend	DataFrame)rN   pdemsgindicesrecords
example_idresultrU   output_kvrD   rfs                  r<   rI   zTestResult.to_dataframej   s   	* "&y/"7"7"9 	'Jj)Hjj*G'4(8?H1HQC.!+HH"G,06w0E0E0GH1WQC=!#HA f$f[148HH9?9L9R9R9TUA:aS)1,U &,K%8AkNHH=EF155'*AGG3F#ZZ0&,-=&>$jj2	 NN1NN:&=	'@ r||G733S  	*@  c")	* I I V Gs/   F F9F?
 G!G	F6#F11F6N)returnpd.DataFrame)r6   r7   r8   r9   rR   rI   r:   r;   r<   r>   r>   R   s    ;@	@*-4r;   r>   c                  ,     e Zd ZdZd fdZddZ xZS )	EvalErrorz"Your architecture raised an error.c                (    t        |   dd|i| y )NrV   r:   )super__init__)rN   rV   kwargs	__class__s      r<   rx   zEvalError.__init__   s    /u//r;   c                T    	 | |   S # t         $ r}d| d}t        |      |d }~ww xY w)Nz%'EvalError' object has no attribute '')KeyErrorAttributeError)rN   namerg   rh   s       r<   __getattr__zEvalError.__getattr__   s<    	-: 	-9$qAC %1,	-s    	'"')rV   BaseExceptionry   r   rr   None)r   strrr   r   )r6   r7   r8   r9   rx   r   __classcell__)rz   s   @r<   ru   ru      s    ,0-r;   ru   c                N  	
 t        | t              r`| j                  j                  }| j                  7j                  j                  j                  }d| d| d| d}t        |      fdS t        | t              r| S t        | t              r| 

fdS t        |       rt        |       rt        t        t        |             fdS 	  |        }t        t        |       	t        |t              r|S t        t        t        |            rt        t        t        |            fd
S t        |t              s	fdS 	S | S # t        $ rN t        t        |       }t        j                  |      }t         j#                  d|       t%        |      fd	cY S w xY w)zForgive the user if they pass in a chain without memory instead of a chain
    factory. It's a common mistake. Raise a more helpful error message as well.a$  Cannot directly evaluate a chain with stateful memory. To evaluate this chain, pass in a chain constructor that initializes fresh memory each time it is called.  This will safegaurd against information leakage between dataset examples.
For example:

def chain_constructor():
    new_memory = z(...)
    return z*(memory=new_memory, ...)

run_on_dataset("z", chain_constructor, ...)c                      S Nr:   )chains   r<   <lambda>z(_wrap_in_chain_factory.<locals>.<lambda>   s    u r;   c                      S r   r:   )lcfs   r<   r   z(_wrap_in_chain_factory.<locals>.<lambda>   s    s r;   c                      S r   r:   	runnable_s   r<   r   z(_wrap_in_chain_factory.<locals>.<lambda>       9 r;   z'Wrapping function %s as RunnableLambda.c                      S r   r:   )wrappeds   r<   r   z(_wrap_in_chain_factory.<locals>.<lambda>   s    7 r;   c                      S r   r:   r   s   r<   r   z(_wrap_in_chain_factory.<locals>.<lambda>   r   r;   c                     t               S r   )r   )constructors   r<   r   z(_wrap_in_chain_factory.<locals>.<lambda>   s    >+6 r;   )r_   r,   rz   r6   memory
ValueErrorr   r   callabler#   r"   r   r   	TypeErrorinspect	signatureloggerinfor   )llm_or_chain_factorydataset_namechain_classmemory_classrh   _model	user_funcsigr   r   r   r   r   s           @@@@@r<   _wrap_in_chain_factoryr      s    &.$oo..&&2 <<11::L$ %1> 2)]##/.0J
L  S/!&(9:##&1"$% !56#D3G$HII$$	#)+F 8%9:f/0 M h!78#D6$:;I$$&(+66)  	#X';<I##I.CKKA3G$Y/G""	#s   E AF$#F$c                   | sd}t        |      g }d| v r@t        | d   t              s&dt        | d         j                   }t        |      | d   g}nd| v rTt        | d   t
              rt        d | d   D              s&dt        | d         j                   }t        |      | d   }nt        |       dk(  rkt        t        | j                                     }t        |t              r|g}nEt        |t
              rt        d |D              r|}n d	|  }t        |      d
|  }t        |      t        |      dk(  r|d   S dt        |       d}t        |      )zGet prompt from inputs.

    Args:
        inputs: The input dictionary.

    Returns:
        A string prompt.
    Raises:
        InputFormatError: If the input format is invalid.
    Inputs should not be empty.promptz"Expected string for 'prompt', got promptsc              3  <   K   | ]  }t        |t                y wr   r_   r   .0is     r<   	<genexpr>z_get_prompt.<locals>.<genexpr>   s      >
#$Jq#>
   z,Expected list of strings for 'prompts', got rG   c              3  <   K   | ]  }t        |t                y wr   r   r   s     r<   r   z_get_prompt.<locals>.<genexpr>  s     .Saz!S/A.Sr   z)LLM Run expects string prompt input. Got z5LLM Run expects 'prompt' or 'prompts' in inputs. Got r   z)LLM Run expects single prompt input. Got z	 prompts.)r5   r_   r   typer6   listrE   lennextitervalues)inputsrh   r   prompt_s       r<   _get_promptr      s    +s##G6&*C06tF8<L7M7V7V6WXC"3''(#$	f	&+T2# >
(.y(9>
 ;
VI./889;  #3''#	V	tFMMO,-gs#iG&3.S7.S+SG=fXFC"3''EfXNs##
7|qqz5c'l^9
MC
3
r;   c                      e Zd ZU dZded<   y)ChatModelInputzVInput for a chat model.

    Parameters:
        messages: List of chat messages.
    zlist[BaseMessage]messagesNr6   r7   r8   r9   __annotations__r:   r;   r<   r   r     s      r;   r   c                   | sd}t        |      | j                         }d| v r|j                  d      |d<   n3t        |       dk(  r%t	        t        | j                                     |d<   d|v rX|d   }t        |t              rt        d |D              r|g}t        |      dk(  rt        |d         |d<   |S d}t        |      d|  }t        |      )	zGet Chat Messages from inputs.

    Args:
        inputs: The input dictionary.

    Returns:
        A list of chat messages.
    Raises:
        InputFormatError: If the input format is invalid.
    r   r   rC   rG   c              3  <   K   | ]  }t        |t                y wr   )r_   r`   r   s     r<   r   z _get_messages.<locals>.<genexpr>5  s      2
$%Jq$2
r   r   zGBatch messages not supported. Please provide a single list of messages.zMChat Run expects single List[dict] or List[List[dict]] 'messages' input. Got )r5   copypopr   r   r   r   r_   r   rE   r   )r   rh   
input_copyraw_messagess       r<   _get_messagesr      s     +s##JV(nnZ8
7	V	"4#89
7*!'*lD)c 2
)52
 /
 )>L|!"4\!_"EJw 	,  #3''h	   3
r;   c                   |rg || j                   xs i       }t        |t              s@t        |t              rt	        d |D              sd| dt        |       d}t        |      y y 	 t        | j                   xs i        y # t        $ rJ 	 t        | j                   xs i        Y y # t        $ r!}d| j                    d}t        |      |d }~ww xY ww xY w)Nc              3  <   K   | ]  }t        |t                y wr   r_   r   r   rh   s     r<   r   z>_validate_example_inputs_for_language_model.<locals>.<genexpr>R  s     ISJsK0Ir   zWhen using an input_mapper to prepare dataset example inputs for an LLM or chat model, the output must a single string or a list of chat messages.
Got: 	 of type .zvExample inputs do not match language model input format. Expected a dictionary with messages or a single prompt. Got: z Please update your dataset OR provide an input_mapper to convert the example.inputs to a compatible format for the llm or chat model you wish to evaluate.)	r   r_   r   r   rE   r   r5   r   r   )first_exampleinput_mapperprompt_inputrh   err2s        r<   +_validate_example_inputs_for_language_modelr   J  s     #M$8$8$>B?,,|T*ILII 'yl1C0DAG  #3'' J -	6,,23 	66m228b9# 	6*112GG  's+5	6	6s*   +B 	CB**	C3CCCc                :   |r || j                   xs i       }t        |j                        j                  |      }t	        |t
              sd| dt        |       d}t        |      |r+d|j                   d|j                          }t        |      y| j                   }t        |j                        j                  |      }t        |      dk(  rt        |j                        dk(  ry|r+d|j                   d|j                          }t        |      y)	z<Validate that the example inputs match the chain input keys.zvWhen using an input_mapper to prepare dataset example inputs for a chain, the mapped value must be a dictionary.
Got: r   r   zAMissing keys after loading example using input_mapper.
Expected: z. Got: rG   zExample inputs missing expected chain input keys. Please provide an input_mapper to convert the example.inputs to a compatible format for the chain you wish to evaluate.Expected: N)
r   set
input_keys
differencer_   r`   r   r5   keysr   )r   r   r   first_inputsmissing_keysrh   s         r<   "_validate_example_inputs_for_chainr   m  sD    #M$8$8$>B?5++,77E,-&yl1C0DAG 
 #3''$//08I8I8K7LN  #3''  %++5++,77E|!c%*:*:&;q&@  #--. /$))+,	.  #3'' r;   c                    t        |t              rt        | |       y |       }t        |t              rt	        | ||       yt        |t
              rt        j                  d|       yy)z9Validate that the example inputs are valid for the model.z Skipping input validation for %sN)r_   r   r   r,   r   r   r   debug)exampler   r   r   s       r<   _validate_example_inputsr     sV     &(9:3G\J$&eU#.w|Lx(LL;UC )r;   c           	     :   |rt        | t              rd\  }}d}nEd} |        }t        |t              r|j                  nd}t        |t              r|j                  nd}t        ||||d   j                  rt        |d   j                        nd||      }|S d}|S )z<Configure the evaluators to run on the results of the chain.)NNllmr   Nr   )r_   r   r,   r   output_keys_load_run_evaluatorsoutputsr   )	r   examplesr1   	data_type
run_inputsrun_outputsrun_typer   run_evaluatorss	            r<   _setup_evaluationr     s     *,=>&0#JHH(*E-7u-E))4J/9%/G%++TK-)1!)<)<D!$$%$
  r;   c                    d }| j                   r+| j                   }|r||vrt        j                  d||       |S |rt        |      dk(  r|d   }|S |$t        |      dkD  rt        j                  d|       |S )NzZInput key %s not in chain's specified input keys %s. Evaluation behavior may be undefined.rG   r   zChain expects multiple input keys: %s, Evaluator is likely to fail. Evaluation behavior may be undefined. Specify an input_key in the RunEvalConfig to avoid this warning.)	input_keyr   warningr   )r   r   r   s      r<   _determine_input_keyr     s     I$$	):5NN8	   
J1,qM	  
	C
Oa$7P 		
 r;   c                    d }| j                   r+| j                   }|r||vrt        j                  d||       |S |rt        |      dk(  r|d   }|S |$t        |      dkD  rt        j                  d|       |S )Nz`Prediction key %s not in chain's specified output keys %s. Evaluation behavior may be undefined.rG   r   zChain expects multiple output keys: %s, Evaluation behavior may be undefined. Specify a prediction_key in the RunEvalConfig to avoid this warning.)prediction_keyr   r   r   )r   r   r   s      r<   _determine_prediction_keyr     s     N..><NN8	  
[)Q.$Q  
	 S%5%9; 		
 r;   c                    | j                   r'| j                   }|r||vrd| d| }t        |      |S |r$t        |      dk(  rt        t	        |            }|S d }|S )NzReference key z! not in Dataset example outputs: rG   )reference_keyr   r   r   r   )r   example_outputsr   rh   s       r<   _determine_reference_keyr     s     ,,}OC  0%%4$57  S/!
 	 
S1Q6T/23  r;   c           	     ~   t        | t              r| S t        | t        t        f      r5t        | t              st        |       } t	        | |      }| j
                  }	nt        | t        j                        rd|i| j                         }
t	        | j                  fi |
}| j                  j
                  }	t        | t        j                        r`| j                  xs |}| j                  xs |}| j                  xs |}n/t        |       rt        |       S dt!        |        }t#        |      t        |t$              rK|j&                  r|d|	 d| d}t#        |      t(        j*                  j-                  |||||||	g      }|S t        |t.              rd|	 d	}t1        |      d|	 d
}t1        |      )N)r   r   zUnknown evaluator type: zPMust specify reference_key in smith_eval.RunEvalConfig to use evaluator of type z) with dataset with multiple output keys: r   )r   r   r   tagszRun evaluator for z is not implemented. PairwiseStringEvaluators compare the outputs of two different models rather than the output of a single model. Did you mean to use a StringEvaluator instead?
See: https://python.langchain.com/docs/guides/evaluation/string/z is not implemented)r_   r    r.   r   r-   valuesmith_eval_config
EvalConfig
get_kwargsevaluator_typeSingleKeyEvalConfigr   r   r   r   run_evaluator_decr   r   r0   requires_reference
smith_evalStringRunEvaluatorChainfrom_run_and_data_typer/   NotImplementedError)eval_configeval_llmr   r   r   r   r   r   
evaluator_eval_type_tagry   rh   r!   s                r<   _construct_run_evaluatorr    s    +|,+s34+}5'4K#KX>
#))	K!2!=!=	>>[%;%;%=>#K$>$>I&I
#2288k#4#H#HI#--:I(77I>N'55FM	+	 --(k):(;<o*o.((]-B&&3_ 577F6GqJ 
 S/!"::QQ)' R 
,  
J 7	8  0Q Q 	 "#&& #=/1DE!#&&r;   c                T    t        | |      }t        | |      }t        | |      }|||fS r   )r   r   r   )r   r   r   r   r   r   r   s          r<   	_get_keysr  T  s5     %VZ8I.v{CN,V_EMnm33r;   c                   g }d\  }}}	| j                   s(| j                  r.t        d | j                  D              rt        | |||      \  }}}	| j                   D ]/  }
t	        |
| j
                  ||||	||      }|j                  |       1 | j                  xs g }|D ]  }t        |t              r|j                  |       %t        |t              r5|j                  t        j                  j                  ||||||	             jt        |      r|j                  t        |             d| d}t        |       |S )z
    Load run evaluators from a configuration.

    Args:
        config: Configuration for the run evaluators.

    Returns:
        A list of run evaluators.
    NNNc              3  <   K   | ]  }t        |t                y wr   )r_   r0   )r   rg   s     r<   r   z'_load_run_evaluators.<locals>.<genexpr>u  s     Q1
1o.Qr   )r   r   r   zUnsupported custom evaluator: z+. Expected RunEvaluator or StringEvaluator.)
evaluatorscustom_evaluatorsanyr  r  r  rd   r_   r    r0   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r  r!   r  custom_evaluatorrh   s                  r<   r   r   `  sv   " N/?,I~}  Q8P8PQQ3<	4
0	>= (( -0OO	
 	m,- 006B- "&5!!"23(/:!!22II$'#1"/ J 	 &'!!"34D"EF 11A0B C= >  S/!+". r;   r   	callbacksr   metadatac                  K   |z ||      }t        |t              s"t        |t              rAt        d |D              r/| j	                  |t        ||xs g |xs i              d{   S d| d}t        |      	 t        |      }| j	                  |t        ||xs g |xs i              d{   }	|	S 7 R7 # t        $ rB t        |      }
 | j                  di |
dt        ||xs g |xs i       i d{  7  }	Y |	S w xY ww)	a  Asynchronously run the language model.

    Args:
        llm: The language model to run.
        inputs: The input dictionary.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.
        input_mapper: Optional function to map inputs to the expected format.

    Returns:
        The LLMResult or ChatResult.
    Raises:
        ValueError: If the LLM type is unsupported.
        InputFormatError: If the input format is invalid.
    Nc              3  <   K   | ]  }t        |t                y wr   r   r   s     r<   r   z_arun_llm.<locals>.<genexpr>       OSJsK0Or   r  r   r  r   z%Input mapper returned invalid format 3
Expected a single string or list of chat messages.r   r:   )	r_   r   r   rE   ainvoker   r5   r   r   )r   r   r   r  r   r  prompt_or_messagesrh   r   
llm_output
llm_inputss              r<   	_arun_llmr    sU    0 )&1(#.)40O<NOO"%'%^ %   "#BC 	
 s##
V$47KK!#ZR!R 5@ 5
 /

$ G"/
  	
"6*
&3;; 

!#ZR!R
 
 

 	
sU   A'D)B<*D 5C  5B>6C  :D>C   ADDDD
DDr   r   r  c          	       K   ||n ||      }t        | t              r}t        |t              rmt        |      dk(  r_| j                  rSt        t        |j                                     }| j                  |t        ||xs g |xs i              d{   }|S t        |xs g ||xs i       }	| j                  ||	       d{   }|S 7 97 w)z%Run a chain asynchronously on inputs.NrG   r  r   r   r  r  )
r_   r,   r`   r   r   r   r   r   r  r   
r   r   r  r   r   r  inputs_valrD   runnable_configs
             r<   _arun_chainr$    s      %,f,v2FG5% w%LA4()*}}!#ZR!R % 
 
 M )^

 }}W_}EEM
 Fs$   BCC2CC	CC)r   c          
       K   t        |t              rdnd}d}	 t        |t              r=t        || j                  xs i |d   |d   ||j	                  d             d{   }nC |       }t        || j                  xs i |d   |d   ||j	                  d             d{   }|}|S 7 L7 
# t        $ rD}t        j                  d|| j                  | j                  |       t        |	      }Y d}~|S d}~ww xY ww)
a  Asynchronously run the Chain or language model.

    Args:
        example: The example to run.
        llm_or_chain_factory: The Chain or language model constructor to run.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.
        input_mapper: Optional function to map the input to the expected format.

    Returns:
        A list of outputs.
    LLMr,   Nr   r  r  r  z*%s failed for example %s with inputs %s
%srV   )r_   r   r  r   r^   r$  	Exceptionr   r   idru   )	r   r   r   r   chain_or_llmrl   rD   r   rg   s	            r<   _arun_llm_or_chainr+    s    ( 02CD'  F$*,=> )$$"F^ -)J/! F )*E&$"F^ -)J/ F  M9  $9JJNN	
 #M$sT   DAB2 !B."AB2 %B0&B2 ,D.B2 0B2 2	C?;9C:4D:C??Dc                  |t ||      }t        |t              s"t        |t              r;t        d |D              r)| j	                  |t        ||xs g |xs i             }|S d| d}t        |      	 t        |      }	| j	                  |	t        ||xs g |xs i             }|S # t        $ r4 t        |      }
 | j                  di |
dt        ||xs i       i}Y |S w xY w)	a  
    Run the language model on the example.

    Args:
        llm: The language model to run.
        inputs: The input dictionary.
        callbacks: The callbacks to use during the run.
        tags: Optional tags to add to the run.
        input_mapper: function to map to the inputs dictionary from an Example
    Returns:
        The LLMResult or ChatResult.
    Raises:
        ValueError: If the LLM type is unsupported.
        InputFormatError: If the input format is invalid.
    c              3  <   K   | ]  }t        |t                y wr   r   r   s     r<   r   z_run_llm.<locals>.<genexpr>g  r  r   r  r   z'Input mapper returned invalid format:  r  r   )r  r  r:   )	r_   r   r   rE   invoker   r5   r   r   )r   r   r  r   r   r  r  r  rh   llm_promptsr  s              r<   _run_llmr0  J  s-   2 )&1(#.)40O<NOO25**"%'%^ 3= 3J@ -&'FG 
 #3''	%f-K%'%^ $ J    	&v.J# %	HNPRSJ 	s   82B, ,9C)(C)c          	     z   ||n ||      }t        | t              rut        |t              ret        |      dk(  rW| j                  rKt        t        |j                                     }| j                  |t        ||xs g |xs i             }|S t        |xs g ||xs i       }	| j                  ||	      }|S )zRun a chain on inputs.rG   r  r   r  )
r_   r,   r`   r   r   r   r   r   r.  r   r   s
             r<   
_run_chainr2    s     %,f,v2FG5% w%LA4()*!#ZR!R  
 M )^

 go>Mr;   c          
        t        |t              rdnd}d}	 t        |t              r5t        || j                  xs i |d   |d   ||j	                  d            }n; |       }t        || j                  xs i |d   |d   ||j	                  d            }|}|S # t        $ rZ}t        |      j                  }	t        j                  d|| j                  | j                  |	|       t        |	      }Y d}~|S d}~ww xY w)
a  
    Run the Chain or language model synchronously.

    Args:
        example: The example to run.
        llm_or_chain_factory: The Chain or language model constructor to run.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.

    Returns:
        Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
          The outputs of the model or chain.
    r&  r,   Nr  r   r  r  zC%s failed for example %s with inputs %s
Error Type: %s, Message: %sr'  )r_   r   r0  r   r^   r2  r(  r   r6   r   r   r)  ru   )
r   r   r   r   r*  rl   rD   r   rg   
error_types
             r<   _run_llm_or_chainr5    s   * 02CD'  F$*,=>"$$"{#F^)J/F )*E$"{#F^)J/F  M  
$!W%%
RJJNN	
 #M
$s   BB 	C?%AC::C?c           
        t        ||      }| j                  |      }t        | j                  |j                  |            }	|	sd| d}
t        |
      |	D cg c]  }|j                  s|j                   }}|rt        |      nd }|r|j                         nd }	 |xs i }t               }|ri |d|i}||d<   | j                  ||j                  |rd|ini |      }|j                   d|j                   z   }t#        d| d| d| d|j                    d       ||||	fS c c}w # t        t
        t        f$ rG}d	t        |      vr t        j                         }d
| d| d| d}d| d| }
t        |
      |d }~ww xY w)N)r   )
dataset_idas_ofzDataset z has no example rows.gitdataset_versionr   )reference_dataset_idproject_extrar  zalready exists z+
run_on_dataset(
    ...
    project_name="z - z", # Update since z already exists
)
zTest project z/ already exists. Please use a different name:

z/compare?selectedSessions=z)View the evaluation results for project 'z' at:
z

View all tests for Dataset z at:
T)flush)r   read_datasetr   list_examplesr)  r   modified_atmax	isoformatr   create_projectr*   r)   r   uuiduuid4urlprint)clientr   r   project_nameproject_metadatar   r:  wrapped_modeldatasetr   rh   exr@  max_modified_atinferred_versiongit_infoprojectrg   uidexample_msgcomparison_urls                        r<   _prepare_eval_runrU    s    ++?NM!!|!<GF((GJJo(VWH&;<o,4Gb2>>GKG +6c+&4O6E0024%+1r> " x 
 /?*+''!(,064.b%	 ( 
* [[%?

|#LLN	
3L> B  !&&2^6'++	H 	 '7H44Y H, z>2 %CF*jjl  .C5(:<. I L> *-! 	 o1$%s&   D+.D+"AD0 0FAFFc                  0    e Zd ZU dZded<   ded<   ded<   y)	
_RowResultz5A dictionary of the results for a single example row.z Optional[list[EvaluationResult]]rU   zOptional[float]rW   Optional[str]rX   Nr   r:   r;   r<   rW  rW  *  s    ?..##r;   rW  F)totalc                      e Zd ZU dZded<   ded<   ded<   ded	<   d
ed<   dZded<   	 	 	 	 	 	 ddZddZddZ	 	 	 	 ddZ		 d	 	 	 	 	 ddZ
e	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd       Zy)_DatasetRunContainerz3A container to help manage the state of a eval run.r   rH  r(   rQ  MCFrK  list[Example]r   zlist[RunnableConfig]configsNz6Optional[list[smith_eval_config.BATCH_EVALUATOR_LIKE]]batch_evaluatorsc           	     `   i }t        | j                  |      D ]  \  }}t        t        |j	                  t        |j                        i             }|j                  |j	                  dg       |j	                  d      |j	                  d      d|t        |j                        <   t        |t              r&|j                  |t        |j                           d<   n||t        |j                           d<   |j                  s|j                  |t        |j                           d<    |S )NrU   rW   rX   )rC   rU   rW   rX   rV   rD   rB   )zipr   r   rW  r^   r   r)  r   r_   ru   rV   r   )rN   batch_resultsall_eval_resultsrT   r   rD   
row_results          r<   _merge_test_outputsz(_DatasetRunContainer._merge_test_outputs=  s    
 "4==-@ 	HOGVj*:*>*>s7::PR*STJ &NN:r:",..1A"B$..2	(GC

O$ &),4:LLGJJ(15;GJJ(28?GJJ(5	H r;   c           	        | j                   }|sg S | j                  D cg c]  }|t        |j                            }}g }t        j
                  j                         5 }|D ]  }	  ||| j                        }t        |t              r|j                         }|j                  t        t        |              |j                  | j                  j                  fi |d | j                  j                  d  	 d d d        |S c c}w # t         $ r" t"        j%                  dt'        |             Y w xY w# 1 sw Y   |S xY w)N)rX   
project_idz Error running batch evaluator %s)r_  r   r   r)  
concurrentfuturesThreadPoolExecutorr_   r   r`   rd   r   submitrH  create_feedbackrQ  r(  r   	exceptionrepr)	rN   runsr  r   	runs_listaggregate_feedbackexecutor	evaluatorrl   s	            r<   _run_batch_evaluatorsz*_DatasetRunContainer._run_batch_evaluatorsS  s(   **
I:>--HwT#gjj/*H	H224 	' 	&y$--@F!&*:;!'&--d4.@A#HOO33   $#'<<??		" "!' I ! $$:DO	" "!s6   D D9'BD9D9(D63D95D66D99Ec                   i }i }| j                   D ]>  }t        t        |d         D ]$  }t        |t              rT|j
                  }|j                         D ]4  \  \  }}}|j                  t        |      i       j                  d|i       6 ht        |t              sy|j                  }	|	r3|	j                  r'|	j                  |	j                  z
  j                         nd }
|	rt        |	j                        nd }|j                  t        |j                         i       j                  |
||	d       |	|t        |j                         <   ' A t        t"        t        t$        f   |      |fS )Nr  rU   )rW   rX   run)r^  r   r   r_   r   logged_eval_resultsr]   
setdefaultr   ra   r   
latest_runend_time
start_timetotal_secondsr)  rk   r`   rW  )rN   rc  all_runsccallbackeval_results_rk   ro   rv  rW   rX   s               r<   _collect_metricsz%_DatasetRunContainer._collect_metricsl  sX   !# 	=A q~6 =h(@A#+#?#?L.:.@.@.B *J(33C
ORHOO'O  /:"--C 3<< 6EEG! #
 -0S[TF$//H4G4G0H"MTT.<&,#& :=HS!4!456-=	=0 Dj)+;<hFFr;   c                6   t         j                  d       t                | j                         \  }}d }| j                  r&t         j                  d       | j                  |      }| j                  ||      }t        | j                  j                  ||      S )Nz#Waiting for evaluators to complete.zRunning session evaluators.)rI  rT   aggregate_metrics)
r   r   r   r  r_  rt  re  r>   rQ  r   )rN   rb  rc  r}  rq  rT   s         r<   _collect_test_resultsz*_DatasetRunContainer._collect_test_results  s     	9:!%)%:%:%<"(!  KK56!%!;!;H!E**=:JK**0
 	
r;   c                   | j                  |      }|r	 |j                         }t        |       	 | j                  j                  | j                  j                  t        j                  t        j                               |S # t        $ r"}t        j                  d|d       Y d }~{d }~ww xY w# t        $ r#}t        j                  d|d       Y d }~|S d }~ww xY w)Nz&Failed to print aggregate feedback: %sT)exc_info)rz  zFailed to close project: %s)r  rR   _display_aggregate_resultsr(  r   r   rH  update_projectrQ  r)  r   nowr   utc)rN   rb  verboserT   agg_feedbackrg   s         r<   finishz_DatasetRunContainer.finish  s    
 ,,];Y&==?*<8	JKK&&!hll3 '    YEqSWXXY  	JLL6DLII	Js/   B AB3 	B0B++B03	C<CCc                ,   |xs t        j                         }|
r|	si }	|	j                  d|
i       t        |||||	||      \  }}}}|xs g }|j                  j                  d      xs i j                         D ]  \  }}|j                  d| d|         d|j                  d   i}|
r|
|d<   t        |      }t        ||||j                  xs t        j                        }t        |d   ||       t        j                  t!        |            }|D cg c]O  }t#        t%        |j&                  ||j(                        t+        |xs g ||j(                  d	      |g|||
      Q }} | ||||||r|j,                        S d       S c c}w )Nrevision_id)rJ  r   r:  r9  zgit:=r:  r   )rI  rH  rk   )r  rH  rk   max_concurrency)r  r   r  r  )rH  rQ  rK  r   r^  r_  )r2   random_namera   rU  r  r^   r]   rd   r   r   r   r%   kvr   r3   ProgressBarCallbackr   r   r   r   r)  r   r_  )clsrH  r   r   rI  r1   r   r   concurrency_levelrJ  r  r:  rK  rQ  rL  r   rn   ro   run_metadatar   progress_barr   r^  s                          r<   preparez_DatasetRunContainer.prepare  s    $D'B'B'D##% ##]K$@A4E -+5
1w zr%%))%06B==? 	(DAqKK$qc1#'	()7+;+;<M+NO*5L'./CD*,	
 	!!m\J33CMB* $)
( ' #%,\\%#*::
 -#1#7R%#*::()	 !  1%#
 
, '<FZ88
 	
 MQ
 	
-
s   AF)rb  r   rc  zdict[str, _RowResult]rr   r`   )ro  zdict[str, Run]rr   z
list[dict])rr   z,tuple[dict[str, _RowResult], dict[str, Run]])rb  z-list[Union[dict, str, LLMResult, ChatResult]]rr   r>   )F)rb  r   r  boolrr   r>   )NNN   NNN)rH  r   r   r   r   MODEL_OR_CHAIN_FACTORYrI  rX  r1   "Optional[smith_eval.RunEvalConfig]r   Optional[list[str]]r   Optional[Callable[[dict], Any]]r  intrJ  Optional[dict[str, Any]]r  rX  r:  Optional[Union[datetime, str]]rr   r[  )r6   r7   r8   r9   r   r_  re  rt  r  r  r  classmethodr  r:   r;   r<   r[  r[  2  sF   =N!!OSLS 0 
	,"2G:
D
 

*   
	,  :>$(8<!"59%):>H
H
 H
 5	H

 $H
 7H
 "H
 6H
 H
 3H
 #H
 8H
 
H
 H
r;   r[  c                 ~    	 ddl m}   |        } |        d uxr dt        t        |            v S # t        $ r Y yw xY w)Nr   )get_ipythonzmqshellF)IPython.core.getipythonr  r   r   r\   )r  ress     r<   _is_jupyter_environmentr    sB    7m}D(IZ3tCy>-II s   -0 	<<c                    t               rddlm}m}  | |d              ||        y | j	                  d d      }t        d       t        |       y )Nr   )HTMLdisplayz<h3>Experiment Results:</h3>c                
    | dS )Nz.2fr:   )xs    r<   r   z,_display_aggregate_results.<locals>.<lambda>  s
    aW r;   right)float_formatjustifyz
 Experiment Results:)r  IPython.displayr  r  	to_stringrG  )aggregate_resultsr  r  formatted_strings       r<   r  r    sR     1345!",66- 7 
 	&'r;   a  The input_mapper argument is deprecated and will be removed in a future release. Please add a  RunnableLambda to your chain to map inputs to the expected format instead. Example:
def construct_chain():
    my_chain = ...
    input_mapper = {'other_key': 'MyOtherInput', 'my_input_key': x}
    return input_mapper | my_chain
run_on_dataset(..., llm_or_chain_factory=construct_chain)
(See https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.base.RunnableLambda.html)r  )r1   r:  r  rI  rJ  r  r  c                 K   |
j                  dd       }|rt        dt        d       |	t               j	                  d      }	|
j                  dd       }|rt        ddd       |
r t        dd	|
j                          d
d       | xs
 t               } t        j                  | |||||||||	|      }t        j                  |j                  d   j	                  d      gt        t        j                  t        |j                   |      |j"                  |j                          d {   }|j%                  ||      S 7 w)Nr   0.0.305Tmessagependingr  r   0.1.9qThe tags argument is deprecated and will be removed in a future release. Please specify project_metadata instead.PThe following arguments are deprecated and will be removed in a future release: r   r  removalrJ  r  r:  r   r  r   r   r  )r   r   _INPUT_MAPPER_DEP_WARNINGr   r^   r   r   r[  r  runnable_utilsgather_with_concurrencyr^  map	functoolspartialr+  rK  r   r  )rH  r   r   r1   r:  r  rI  rJ  r  r  ry   r   r   	containerrb  s                  r<   arun_on_datasetr  '  sd     ::nd3L	+DdS46::=I::fd#DU		
 4{{}oQ  	
 vxF$,,)' - I )@@!  !23	"%.%<%<)
 

 M M7;;s   D,E.E/Ec               N   |
j                  dd       }|rt        dt        d       |
j                  dd       }|rt        ddd       |	t               j	                  d      }	|
r t        dd	|
j                          d
d       | xs
 t               } t        j                  | |||||||||	|      }|dk(  rJt        |j                  |j                        D cg c]  \  }}t        |||j                  |        }}}nt        j                  |j                  d         5 }t!        |j#                  t%        j&                  t        |j                  |      |j                  |j                              }d d d        |j)                  |      S c c}}w # 1 sw Y   "xY w)Nr   r  Tr  r   r  r  r  r  r   r  r  r   r  r  )r   r   r  r   r^   r   r   r[  r  ra  r   r^  r5  rK  r#  get_executor_for_configr   r  r  r  r  )rH  r   r   r1   r:  r  rI  rJ  r  r  ry   r   r   r  r   r   rb  rr  s                     r<   run_on_datasetr  h  s    ::nd3L	+DdS::fd#DU		
 46::=I4{{}oQ  	
 vxF$,,)' - I A $'y'9'99;L;L#M
   %.%<%<)	
 
 44Y5F5Fq5IJ 	h %%)-6-D-D%1
 &&%%
M	 M7;;/
	 	s   #F%AFF$a1  
Run the Chain or language model on a dataset and store traces
to the specified project name.

Args:
    dataset_name: Name of the dataset to run the chain on.
    llm_or_chain_factory: Language model or Chain constructor to run
        over the dataset. The Chain constructor is used to permit
        independent calls on each example without carrying over state.
    evaluation: Configuration for evaluators to run on the
        results of the chain
    concurrency_level: The number of async tasks to run concurrently.
    project_name: Name of the project to store the traces in.
        Defaults to {dataset_name}-{chain class name}-{datetime}.
    project_metadata: Optional metadata to add to the project.
        Useful for storing information the test variant.
        (prompt version, model version, etc.)
    client: LangSmith client to use to access the dataset and to
        log feedback and run traces.
    verbose: Whether to print progress.
    tags: Tags to add to each run in the project.
    revision_id: Optional revision identifier to assign this test run to
        track the performance of different versions of your system.
Returns:
    A dictionary containing the run's project name and the resulting model outputs.


For the (usually faster) async version of this function, see :func:`arun_on_dataset`.

Examples
--------

.. code-block:: python

    from langsmith import Client
    from langchain_openai import ChatOpenAI
    from langchain.chains import LLMChain
    from langchain.smith import smith_eval.RunEvalConfig, run_on_dataset

    # Chains may have memory. Passing in a constructor function lets the
    # evaluation framework avoid cross-contamination between runs.
    def construct_chain():
        llm = ChatOpenAI(temperature=0)
        chain = LLMChain.from_string(
            llm,
            "What's the answer to {your_input_key}"
        )
        return chain

    # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
    evaluation_config = smith_eval.RunEvalConfig(
        evaluators=[
            "qa",  # "Correctness" against a reference answer
            "embedding_distance",
            smith_eval.RunEvalConfig.Criteria("helpfulness"),
            smith_eval.RunEvalConfig.Criteria({
                "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
            }),
        ]
    )

    client = Client()
    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )

You can also create custom evaluators by subclassing the
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
or LangSmith's `RunEvaluator` classes.

.. code-block:: python

    from typing import Optional
    from langchain.evaluation import StringEvaluator

    class MyStringEvaluator(StringEvaluator):

        @property
        def requires_input(self) -> bool:
            return False

        @property
        def requires_reference(self) -> bool:
            return True

        @property
        def evaluation_name(self) -> str:
            return "exact_match"

        def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
            return {"score": prediction == reference}


    evaluation_config = smith_eval.RunEvalConfig(
        custom_evaluators = [MyStringEvaluator()],
    )

    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )
zrun_on_dataset(zawait arun_on_dataset()z<my_dataset>)r   r  r   r   rr   r\  )r   dict[str, Any]rr   r   )r   r  rr   r`   )r   r&   r   r  rr   r   )r   r&   r   r,   r   r  rr   r   )r   r&   r   r\  r   r  rr   r   )
r   r\  r   r]  r1   r  r   r%   rr   zOptional[list[RunEvaluator]])r   smith_eval.RunEvalConfigr   r  rr   rX  )r   r  r   r  rr   rX  )r   r  r   r  rr   rX  )r  zYUnion[smith_eval_config.SINGLE_EVAL_CONFIG_TYPE, smith_eval_config.CUSTOM_EVALUATOR_TYPE]r  zOptional[BaseLanguageModel]r   r   r   r%   r   r  r   rX  r   rX  r   rX  rr   r    )
r   r  r   r  r   r  r   r  rr   z2tuple[Optional[str], Optional[str], Optional[str]])r   r  r   r   r   r%   r   r  r   r  r   r  rr   zlist[RunEvaluator])r   r   r   r  r   r  r  r   r   r  r  r  rr   Union[str, BaseMessage])r   zUnion[Chain, Runnable]r   r  r  r   r   r  r   r  r  r  rr   zUnion[dict, str])
r   r&   r   r   r   r\  r   r  rr   z'Union[dict, str, LLMResult, ChatResult])r   r   r   r  r  r   r   r  r   r  r  r  rr   r  r
  )rH  r   r   r   r   r  rI  r   rJ  r  r   r  r:  zOptional[Union[str, datetime]]rr   z1tuple[MCF, TracerSession, Dataset, list[Example]])rr   r  )r  rs   rr   r   )rH  zOptional[Client]r   r   r   r  r1   r  r:  r  r  r  rI  rX  rJ  r  r  r  r  rX  ry   r   rr   r  )r9   
__future__r   concurrent.futuresrh  dataclassesr  r   loggingrD  r   r   typingr   r   r   r	   r
   r   langchain_core._apir   langchain_core.callbacksr   langchain_core.language_modelsr   langchain_core.messagesr   r   langchain_core.outputsr   r   langchain_core.runnablesr   r   r   r   r#  r   r  !langchain_core.tracers.evaluationr   r    langchain_core.tracers.langchainr   langsmith.clientr   langsmith.envr   r   langsmith.evaluationr   r    r!   r   langsmith.run_helpersr"   r#   langsmith.schemasr$   r%   r&   r'   r(   langsmith.utilsr)   requestsr*   typing_extensionsr+   langchain.chains.baser,   langchain.evaluation.loadingr-   langchain.evaluation.schemar.   r/   r0   langchain.smithr1   r   langchain.smith.evaluationr   r2   r3   r[   rf   	getLoggerr6   r   r`   r  r\  r(  r5   r>   ru   r   r   r   r   r   r   r   r   r   r   r   r  r  r   r  r$  r+  r0  r2  r5  rU  rW  	dataclassr[  r  r  r  r  r  _RUN_ON_DATASET_DOCSTRINGreplacer:   r;   r<   <module>r     s+   D "       '  0 . < C 8 M M > < = # F E L L *  ' ' 7 
 5 B @			8	$Ruh''(dVS[		  HRuh//02CCD3y 3E4 E4P- -  ': 0: :  	: z. b Y  & T 6 61 6 
 6F%(%(%( 2%( 
	%(PDDD 2D 
	D( 3 	
 "<$# 6$$ 4$( &BB
 *B B B )B !B B "B BJ	4$	4#	4 %	4 )		4
 8	4A$AA A )	A
 $A %A AV !%48)-A	AA 	A
 A 2A 'A AR !%48)-!!!! !
 ! 2! '! !R 59555 	5
 25 -5@ !%48)-?	?? ?
 ? 2? '? ?N !%48)-!!!! !
 ! 2! '! !R 59888 	8
 28 -8@ 26 $6:<5<5<5 1<5 	<5
 /<5 <5 4<5 7<5~%  G
 G
 G
T  
: , 6:6:"&15!%><><>< 1><
 3>< 4>< ><  >< />< >< >< >< ><L 6:6:"&15!%J<J<J< 1J<
 3J< 4J< J<  J< /J< J< J< J< J<Zj V 3 3;; r;   