# This file contains the evaluation configuration for majority voting. Here, we initialize
# the embeddings model used to calculate quantitative metrics such as
# cosine similarity. The other part of this evaluation uses a subjective
# evaluation method: majority voting. When a ground truth
# is provided, FMBench can use majority voting with the help of a 'panel of judges' to reach a verdict [correct, incorrect].
# For more information, see this paper: https://arxiv.org/pdf/2404.18796. Majority voting using a panel of LLM evaluators
# helps in getting close to human-level evaluation, reduces the cost of evaluations, and mitigates intra-model bias.
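# As an illustrative sketch (not part of the configuration itself): with a 3-judge panel,
# each judge independently returns a verdict for a candidate response and the majority wins.
# For example, verdicts of [correct, correct, incorrect] yield a final verdict of 'correct'.
# An odd-sized panel avoids ties, which is one reason a 3-judge panel is configured below.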
model_evaluations:
ground_truth_col: {ground_truth}
question_col: {question}
PoLL_Composition_and_Voting:
method: majority_vote
# Set this variable to yes if you want some correct/incorrect decisions to be made directly
# from quantitative metrics like cosine similarity, Levenshtein score, and token set ratio,
# without involving the judge panel. Set this to yes only if you have a very direct QnA use case.
use_quantitative_metrics: yes
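# Illustrative example (hypothetical strings, not part of the configuration): for a direct QnA
# pair such as ground truth "Paris" and response "The answer is Paris", a token set ratio
# comparison scores high because the ground-truth tokens are a subset of the response tokens,
# whereas a raw Levenshtein (edit distance) score on the same pair would be much lower. This is
# why these metrics are only reliable for short, direct answers.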
model_eval_dir:
# This is the directory, in S3 and locally, where all the evaluation instructions are stored for
# evaluating the candidate model responses using majority voting
eval_prompts_dir: eval_criteria
# This directory contains a folder with all the files that hold the rules for evaluations,
# and other directories that store the standard prompt templates used for evaluation
# of different answers at runtime. For example, `claude_eval_prompt_templates` contains the
# prompt template that Claude will use for majority voting, etc.
eval_prompt_template_dir_list:
- claude_eval_prompt_templates
- llama3_eval_prompt_templates
- cohere_eval_prompt_templates
- mistral_eval_prompt_templates
# These are the rules that are prefilled within the
# prompt templates used for majority voting
eval_instructions_dir: eval_instructions
eval_instructions_files:
- evaluation_instructions_majority_vote.txt
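# An illustrative layout of `eval_criteria` based on the settings above (the resolved template
# file names under the template directories are assumptions, not part of this configuration):
#
#   eval_criteria/
#   ├── eval_instructions/
#   │   └── evaluation_instructions_majority_vote.txt
#   ├── claude_eval_prompt_templates/
#   │   └── claude_eval_majority_vote    # hypothetical resolved template name
#   ├── llama3_eval_prompt_templates/
#   ├── cohere_eval_prompt_templates/
#   └── mistral_eval_prompt_templates/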
# This represents the information that is used to get the quantitative metrics
# from the evaluation step. This includes calculating the cosine similarity.
# If a ground truth is provided, the cosine similarity is measured against the ground truth;
# otherwise, it is measured against the provided context. We use the `sentence-transformers/all-mpnet-base-v2`
# embeddings model. There is also an option to use the Titan embeddings model (WIP).
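# For reference, cosine_similarity(a, b) = (a · b) / (‖a‖ ‖b‖), computed over the embedding
# vectors of the two texts; higher values indicate closer semantic similarity.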
quantitative_eval_info:
embeddings_model_id:
model_id: sentence-transformers/all-mpnet-base-v2
# This contains the quantitative metric thresholds that are used while
# evaluating whether a candidate model response is correct or incorrect without passing it through
# the panel of LLM evaluators.
# Two cosine similarity verdict thresholds are used: one to confirm that a candidate model
# response is incorrect and another to confirm that it is correct. If an LLM evaluator
# provides an 'incorrect' verdict, the response is only finalized as incorrect once its
# cosine similarity also falls below the incorrect threshold (for example, 0.40).
# If an LLM evaluator provides a 'correct' verdict and the cosine similarity exceeds the
# correct threshold (for example, 0.01), then the response is finalized as "correct".
incorrect_verdict_cosine_similarity_threshold: 0.40
correct_verdict_cosine_similarity_threshold: 0.01
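# Illustrative reading of the defaults above (hypothetical scores, not part of the
# configuration): a response with an 'incorrect' judge verdict and a cosine similarity of
# 0.35 (below 0.40) is finalized as incorrect; a response with a 'correct' judge verdict
# and a cosine similarity of 0.72 (above 0.01) is finalized as correct.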
# This represents the information that is used to get subjective evaluations on the
# generated content. It uses LLMs as judges (which are configurable) and evaluates
# each piece of content from the inference step against different evaluation criteria.
# Information about the panel of LLM judges used in the majority voting is given below.
subjective_eval_info:
# this is the judge panel list that is used in the evaluation process
judge_panel_list:
# Information on judge 1 on the evaluation judge panel
- model_id: meta.llama3-70b-instruct-v1:0
# this is the prompt template that is used in the evaluation process
# based on the method: majority voting
eval_prompt_template_dir: "llama3_eval_prompt_templates"
eval_prompt_template_name: "llama3_eval_{method_name}"
# Information on judge 2 on the evaluation judge panel
- model_id: anthropic.claude-3-sonnet-20240229-v1:0
# this is the prompt template that is used in the evaluation process
# based on the method: majority voting
eval_prompt_template_dir: "claude_eval_prompt_templates"
eval_prompt_template_name: "claude_eval_{method_name}"
# Information on judge 3 on the evaluation judge panel
# We use the most powerful Cohere model: Cohere Command R+
- model_id: cohere.command-r-plus-v1:0
# this is the prompt template that is used in the evaluation process
# based on the method: majority voting
eval_prompt_template_dir: "cohere_eval_prompt_templates"
eval_prompt_template_name: "cohere_eval_{method_name}"
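# For example, with method `majority_vote` configured above, a template name such as
# `cohere_eval_{method_name}` would resolve to `cohere_eval_majority_vote` at runtime,
# since the `{method_name}` placeholder is filled in from the configured method.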
# number of parallel calls made asynchronously to Bedrock using Ray
run_parallel_inference_count: 10
# Common inference parameters used in the evaluation process
# We use LiteLLM for interfacing with Bedrock
inference_parameters:
temperature: 0.1
max_tokens: 300
top_p: 0.92
caching: False
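# A minimal sketch (Python, kept inside comments so this file stays valid YAML) of how a
# single judge call with the parameters above could look via LiteLLM; the prompt variable
# is a hypothetical placeholder, not part of this configuration:
#
#   import litellm
#   response = litellm.completion(
#       model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",  # judge 2 above
#       messages=[{"role": "user", "content": filled_eval_prompt}],  # hypothetical prompt
#       temperature=0.1, max_tokens=300, top_p=0.92, caching=False,
#   )
#   verdict_text = response.choices[0].message.content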