
Commit 4a1e83f

Author: Adam Holm

Enhance LlmAsJudgeEvalsAsPlugins functionality
- Updated LlmAsJudgeEvalsAsPlugins.Demo.csproj with a new package reference for OpenAI.
- Introduced LlmAsJudgeEvalsAsPlugins.Tests project for unit testing.
- Enhanced evaluation criteria descriptions in GptGroundedness.yaml files.
- Significant updates to EvalService.cs, including new constructors and evaluation methods.
- Improved input model creation methods in InputModels.cs for better documentation.
- Updated Readme.md with new features, installation instructions, and usage examples.
- Created EvalServiceTests.cs with unit tests for evaluating functions.
- Modified TokenString.cs to improve token data handling.
- Configured LlmAsJudgeEvalsAsPlugins.Tests.csproj with necessary testing dependencies.
1 parent d644707

18 files changed: 555 additions & 101 deletions
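Before the per-file diffs, a minimal usage sketch of the reworked EvalService API, inferred entirely from the new tests in this commit. The connector wiring, model id, key lookup, and the AnswerInput record are illustrative assumptions, not part of the diff:

    using HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
    using Microsoft.SemanticKernel;

    // Any chat-completion connector should work; the model id and key lookup are placeholders.
    var kernel = Kernel.CreateBuilder()
        .AddOpenAIChatCompletion("gpt-4o-mini", Environment.GetEnvironmentVariable("OPENAI_API_KEY")!)
        .Build();
    var evalService = new EvalService(kernel);

    // Register a custom eval from a YAML definition (same shape as the TestYaml constant below).
    var yamlText = """
        name: binary_quality
        description: Return 4 if the answer is perfect, 0 otherwise.
        template: |
          Give me just "4".
        """;
    evalService.AddEvalFunctionFromYaml(yamlText, "binary_quality", true);

    // Run the eval against an input model and read back the parsed score.
    var score = await evalService.ExecuteEval(new AnswerInput("some answer"));
    Console.WriteLine($"{score.EvalName}: {score.Score}");

    // Hypothetical input model; mirrors the TestInput record in EvalServiceTests below.
    public sealed record AnswerInput(string Answer) : IInputModel
    {
        public string FunctionName => "binary_quality";
        public KernelArguments RequiredInputs => new() { ["answer"] = Answer };
    }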

Demo/LlmAsJudgeEvalsAsPlugins.Demo/EvalManager.cs

Lines changed: 3 additions & 0 deletions
@@ -1,13 +1,16 @@
 using System.Text;
 using System.Text.Json;
 using HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
+using Microsoft.Extensions.AI;
 using Microsoft.SemanticKernel;
 using Microsoft.SemanticKernel.ChatCompletion;
 using Microsoft.SemanticKernel.Connectors.OpenAI;
 using Microsoft.SemanticKernel.Memory;
 using Microsoft.SemanticKernel.Text;
+using OpenAI;
 using UglyToad.PdfPig;
 
+
 namespace LlmAsJudgeEvalsAsPlugins.Demo;
 
 public class EvalManager(IConfiguration configuration, ILoggerFactory loggerFactory)

Demo/LlmAsJudgeEvalsAsPlugins.Demo/LlmAsJudgeEvalsAsPlugins.Demo.csproj

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@
 
   <ItemGroup>
     <PackageReference Include="Markdig" Version="0.37.0" />
+    <PackageReference Include="Microsoft.Extensions.AI.OpenAI" Version="9.4.3-preview.1.25230.7" />
     <PackageReference Include="PdfPig" Version="0.1.8" />
     <PackageReference Include="Radzen.Blazor" Version="6.0.9" />
     <PackageReference Include="Tiktoken" Version="2.0.3" />
LlmAsJudgeEvalsAsPlugins.Tests/EvalServiceTests.cs

Lines changed: 169 additions & 0 deletions
@@ -0,0 +1,169 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Threading.Tasks;
+using HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.Connectors.OpenAI;
+using Xunit;
+using Moq;
+
+namespace LlmAsJudgeEvalsAsPlugins.Tests;
+//[Collection(nameof(EvalServiceFixture))]
+public class EvalServiceTests(EvalServiceFixture fixture) : IClassFixture<EvalServiceFixture>
+{
+    private const string TestYaml = """
+        name: binary_quality
+        description: Return 4 if the answer is perfect, 0 otherwise.
+        template: |
+          Give me just "4".
+        """;
+
+    private sealed record TestInput(string SystemAnswer) : IInputModel
+    {
+        public string FunctionName => "binary_quality";
+        public KernelArguments RequiredInputs => new()
+        {
+            ["answer"] = SystemAnswer
+        };
+    }
+
+    [Fact]
+    public void AddEvalFunction_AddsFunction_WhenNotExists()
+    {
+        // Arrange
+        var kernel = Kernel.CreateBuilder().Build();
+        var service = new EvalService(kernel);
+        var function = KernelFunctionFactory.CreateFromPrompt("prompt", new OpenAIPromptExecutionSettings(), "testFunction");
+        var name = "testFunction";
+
+        // Act
+        service.AddEvalFunction(name, function);
+
+        // Assert
+        var evalFunctions = typeof(EvalService).GetProperty("EvalFunctions", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)!.GetValue(service) as Dictionary<string, KernelFunction>;
+        Assert.True(evalFunctions!.ContainsKey(name));
+        Assert.Equal(function, evalFunctions[name]);
+    }
+
+    [Fact]
+    public void AddEvalFunction_Override_ReplacesFunction()
+    {
+        // Arrange
+        var kernel = Kernel.CreateBuilder().Build();
+        var service = new EvalService(kernel);
+        var function1 = KernelFunctionFactory.CreateFromPrompt("prompt1", new OpenAIPromptExecutionSettings(), "testFunction");
+        var function2 = KernelFunctionFactory.CreateFromPrompt("prompt2", new OpenAIPromptExecutionSettings(), "testFunction");
+        var name = "testFunction";
+        service.AddEvalFunction(name, function1);
+
+        // Act
+        service.AddEvalFunction(name, function2, overrideExisting: true);
+
+        // Assert
+        var evalFunctions = typeof(EvalService).GetProperty("EvalFunctions", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)!.GetValue(service) as Dictionary<string, KernelFunction>;
+        Assert.Equal(function2, evalFunctions![name]);
+    }
+
+    [Fact]
+    public void AddEvalFunction_FromPrompt_AddsFunction()
+    {
+        // Arrange
+        var kernel = Kernel.CreateBuilder().Build();
+        var service = new EvalService(kernel);
+        var prompt = "Test prompt";
+        var settings = new OpenAIPromptExecutionSettings();
+        var name = "promptFunction";
+
+        // Act
+        fixture.Sut.AddEvalFunction(name, prompt, settings);
+
+        // Assert
+        var evalFunctions = typeof(EvalService).GetProperty("EvalFunctions", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)!.GetValue(fixture.Sut) as Dictionary<string, KernelFunction>;
+        Assert.True(evalFunctions!.ContainsKey(name));
+    }
+
+    [Fact]
+    public void AddEvalFunction_FromYaml_AddsFunction()
+    {
+        // Act
+        fixture.Sut.AddEvalFunctionFromYaml(TestYaml, "binary_quality", true);
+
+        // Assert
+        var evalFunctions = typeof(EvalService).GetProperty("EvalFunctions", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)!.GetValue(fixture.Sut) as Dictionary<string, KernelFunction>;
+        Assert.True(evalFunctions!.ContainsKey("binary_quality"));
+    }
+
+    [Fact]
+    public void AggregateResults_ReturnsAverageScore()
+    {
+        // Arrange
+        var scores = new List<ResultScore>
+        {
+            new ResultScore("Eval1", "2"),
+            new ResultScore("Eval1", "4"),
+            new ResultScore("Eval2", "3")
+        };
+        scores[0].ProbScore = 0.5;
+        scores[1].ProbScore = 0.7;
+        scores[2].ProbScore = 0.2;
+
+        // Act
+        var result = EvalService.AggregateResults(scores);
+        var resultProb = EvalService.AggregateResults(scores, useLogProbs: true);
+
+        // Assert
+        Assert.Equal(3.0, result["Eval1"]);
+        Assert.Equal(3.0, result["Eval2"]);
+        Assert.Equal(0.6, resultProb["Eval1"], 1);
+        Assert.Equal(0.2, resultProb["Eval2"]);
+    }
+
+    [Fact]
+    public void AddEvalFunctionFromYaml_AddsFunction()
+    {
+        // Arrange
+        var function = KernelFunctionFactory.CreateFromPrompt("prompt", new OpenAIPromptExecutionSettings(), "yamlFunction");
+        var kernel = Kernel.CreateBuilder().Build();
+        Func<string, KernelFunction> factory = yaml => function;
+        var service = new EvalService(kernel, factory);
+        var yaml = TestYaml;
+        var name = "binary_quality";
+
+        // Act
+        service.AddEvalFunctionFromYaml(yaml, name);
+
+        // Assert
+        var evalFunctions = typeof(EvalService).GetProperty("EvalFunctions", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)!.GetValue(service) as Dictionary<string, KernelFunction>;
+        Assert.True(evalFunctions!.ContainsKey(name));
+    }
+
+    [Fact]
+    public void AddEvalFunctionFromYaml_Stream_AddsFunction()
+    {
+        // Arrange
+        var service = fixture.Sut;
+        var yaml = TestYaml;
+        var name = "binary_quality";
+        using var stream = new MemoryStream(System.Text.Encoding.UTF8.GetBytes(yaml));
+
+        // Act
+        service.AddEvalFunctionFromYaml(stream, name);
+
+        // Assert
+        var evalFunctions = typeof(EvalService).GetProperty("EvalFunctions", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)!.GetValue(service) as Dictionary<string, KernelFunction>;
+        Assert.True(evalFunctions!.ContainsKey(name));
+    }
+
+    [Fact]
+    public async Task ExecuteEval_ReturnsExpectedScore()
+    {
+        // Arrange
+        fixture.Sut.AddEvalFunctionFromYaml(TestYaml, "binary_quality", true);
+
+        // Act
+        var score = await fixture.Sut.ExecuteEval(new TestInput("any"));
+
+        // Assert
+        Assert.Equal(4, score.Score); // parsed from assistant reply
+        Assert.Equal("binary_quality", score.EvalName);
+    }
+}
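The AggregateResults_ReturnsAverageScore expectations above boil down to a per-eval mean. A hedged sketch of the equivalent computation, assuming ResultScore exposes EvalName, a numeric Score, and a double ProbScore as the test implies (the library's actual implementation may differ):

    using System.Collections.Generic;
    using System.Linq;
    using HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;

    static Dictionary<string, double> Aggregate(IEnumerable<ResultScore> scores, bool useLogProbs = false) =>
        scores.GroupBy(s => s.EvalName)
              .ToDictionary(
                  g => g.Key,
                  // Average either the raw integer Score or the log-prob-weighted ProbScore.
                  g => g.Average(s => useLogProbs ? s.ProbScore : (double)s.Score));

With the test data, Eval1 averages (2 + 4) / 2 = 3.0 and (0.5 + 0.7) / 2 = 0.6, matching the assertions.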
LlmAsJudgeEvalsAsPlugins.Tests/LlmAsJudgeEvalsAsPlugins.Tests.csproj

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>net8.0</TargetFramework>
+    <LangVersion>latest</LangVersion>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.12.0" />
+    <PackageReference Include="Moq" Version="4.20.72" />
+    <PackageReference Include="MSTest" Version="3.6.4" />
+    <PackageReference Include="xunit" Version="2.9.3" />
+    <PackageReference Include="xunit.runner.visualstudio" Version="3.1.0">
+      <PrivateAssets>all</PrivateAssets>
+      <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
+    </PackageReference>
+  </ItemGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\LlmAsJudgeEvalsAsPlugins\LlmAsJudgeEvalsAsPlugins.csproj" />
+  </ItemGroup>
+
+
+</Project>
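With this project file and the solution entries added further below, the suite should run with the standard runner, using the project path the solution diff records:

    dotnet test LlmAsJudgeEvalsAsPlugins.Tests/LlmAsJudgeEvalsAsPlugins.Tests.csproj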
Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+using HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
+using Microsoft.SemanticKernel.ChatCompletion;
+using Microsoft.SemanticKernel;
+using Moq;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Microsoft.Extensions.DependencyInjection;
+using OpenAI.Chat;
+using Xunit;
+using ChatMessageContent = Microsoft.SemanticKernel.ChatMessageContent;
+
+namespace LlmAsJudgeEvalsAsPlugins.Tests;
+
+// TestFixture.cs – one per test class
+public sealed class EvalServiceFixture : IAsyncLifetime
+{
+    public Kernel Kernel { get; private set; } = default!;
+    public IChatCompletionService FakeChat { get; } = CreateFakeChat();
+
+    public EvalService Sut { get; private set; } = default!; // system-under-test
+
+    public Task InitializeAsync()
+    {
+        var builder = Kernel.CreateBuilder();
+        builder.Services.AddSingleton(FakeChat); // <-- inject the fake LLM
+        Kernel = builder.Build();
+        Sut = new EvalService(Kernel);
+        return Task.CompletedTask;
+    }
+
+    public Task DisposeAsync() => Task.CompletedTask;
+
+    // Creates a Moq that pretends to be the LLM.
+    private static IChatCompletionService CreateFakeChat()
+    {
+        var mock = new Mock<IChatCompletionService>();
+
+        // Fake assistant reply that includes log-probs in plain JSON-friendly form
+        var fakeAssistantReply = new ChatMessageContent(
+            AuthorRole.Assistant,
+            "4", // the model's text
+            metadata: new Dictionary<string, object?>
+            {
+                ["ContentTokenLogProbabilities"] = new[]
+                {
+                    new
+                    {
+                        Tokens = new[] { "1", "2", "3", "4", "5" },
+                        LogProbabilities = new[] { -2.1, -1.9, -1.7, -0.01, -0.05 }
+                    }
+                }
+            });
+
+        mock.Setup(x => x.GetChatMessageContentsAsync(
+                It.IsAny<ChatHistory>(),
+                It.IsAny<PromptExecutionSettings>(),
+                It.IsAny<Kernel>(),
+                It.IsAny<CancellationToken>()))
+            .ReturnsAsync([fakeAssistantReply]);
+
+        return mock.Object;
+    }
+}
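The fixture's fake reply is the hook for the log-prob scoring path: EvalService presumably reads the ContentTokenLogProbabilities metadata and turns the per-token log probabilities into a probability-weighted score (ProbScore). A hedged sketch of that conversion under those assumptions; the library's exact normalization may differ:

    using System;

    // Expected score over the candidate digit tokens, scaled into [0, 1].
    static double WeightedScore(string[] tokens, double[] logProbs, int maxScore = 5)
    {
        double weighted = 0, total = 0;
        for (var i = 0; i < tokens.Length; i++)
        {
            if (!int.TryParse(tokens[i], out var value)) continue; // digit tokens only
            var p = Math.Exp(logProbs[i]); // log probability -> probability
            weighted += value * p;
            total += p;
        }
        return weighted / (total * maxScore);
    }

With the fake data above, exp(-0.01) ≈ 0.99 and exp(-0.05) ≈ 0.95 dominate, so the weighted average lands near 4 (≈ 0.81 after scaling by the maximum score of 5).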

LlmAsJudgeEvalsAsPlugins.sln

Lines changed: 6 additions & 0 deletions
@@ -7,6 +7,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LlmAsJudgeEvalsAsPlugins",
 EndProject
 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LlmAsJudgeEvalsAsPlugins.Demo", "Demo\LlmAsJudgeEvalsAsPlugins.Demo\LlmAsJudgeEvalsAsPlugins.Demo.csproj", "{B6DC903A-619D-4910-B943-C50292333746}"
 EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LlmAsJudgeEvalsAsPlugins.Tests", "LlmAsJudgeEvalsAsPlugins.Tests\LlmAsJudgeEvalsAsPlugins.Tests.csproj", "{7AA2988A-3728-48F8-A6F3-89ECD0583D5C}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -21,6 +23,10 @@ Global
 		{B6DC903A-619D-4910-B943-C50292333746}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{B6DC903A-619D-4910-B943-C50292333746}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{B6DC903A-619D-4910-B943-C50292333746}.Release|Any CPU.Build.0 = Release|Any CPU
+		{7AA2988A-3728-48F8-A6F3-89ECD0583D5C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{7AA2988A-3728-48F8-A6F3-89ECD0583D5C}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{7AA2988A-3728-48F8-A6F3-89ECD0583D5C}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{7AA2988A-3728-48F8-A6F3-89ECD0583D5C}.Release|Any CPU.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE

LlmAsJudgeEvalsAsPlugins/EvalPluginYaml/ExplainPlusScore/GptGroundednessExplain.yaml

Lines changed: 8 additions & 0 deletions
@@ -47,6 +47,14 @@ template: |
   {"CONTEXT": "Some are reported as not having been wanted at all.", "ANSWER": "All are reported as being completely and fully wanted."}
   #### Example Task #4 Output:
   1
+  #### Example Task #5 Input:
+  {"CONTEXT": "The Eiffel Tower is located in Paris, France, and was completed in 1889 as the entrance arch to the 1889 World's Fair.", "ANSWER": "The Eiffel Tower was built in the 19th century in France."}
+  #### Example Task #5 Output:
+  3
+  #### Example Task #6 Input:
+  {"CONTEXT": "The Amazon rainforest is the largest tropical rainforest in the world, known for its biodiversity and size, covering much of northwestern Brazil and extending into Colombia, Peru and other South American countries.", "ANSWER": "The Amazon rainforest is the largest forest in Africa."}
+  #### Example Task #6 Output:
+  2
 
   ### Output Format
 

LlmAsJudgeEvalsAsPlugins/EvalPluginYaml/GptGroundedness.yaml

Lines changed: 13 additions & 9 deletions
@@ -1,5 +1,4 @@
-description: Evaluate the groundedness, using a score of 1-5, of generated answer
-  in the retreived context
+description: Evaluate the groundedness (whether the answer follows logically from the context provided), using a score of 1-5, of the generated answer in the retrieved context
 execution_settings:
   default:
     max_tokens: 128
@@ -19,13 +18,10 @@ input_variables:
     default: ''
 name: GptGroundedness
 template: |
-  System:
-  You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
-  User:
   You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following ratings:
   1. 5: The ANSWER follows logically from the information contained in the CONTEXT.
   2. 1: The ANSWER is logically false from the information contained in the CONTEXT.
-  3. an (ASCII) integer score between 1 and 5 and if such integer score does not exists, use 1: It is not possible to determine whether the ANSWER is true or false without further information.
+  3. an (ASCII) integer score between 1 and 5, and if no such integer score exists, use 1: The ANSWER is, to some degree, partially true or false from the information contained in the CONTEXT.
 
   Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails.
 
@@ -47,10 +43,18 @@ template: |
   {"CONTEXT": "Some are reported as not having been wanted at all.", "ANSWER": "All are reported as being completely and fully wanted."}
   #### Example Task #4 Output:
   1
-
-  Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context.
+  #### Example Task #5 Input:
+  {"CONTEXT": "The Eiffel Tower is located in Paris, France, and was completed in 1889 as the entrance arch to the 1889 World's Fair.", "ANSWER": "The Eiffel Tower was built in the 19th century in France."}
+  #### Example Task #5 Output:
+  3
+  #### Example Task #6 Input:
+  {"CONTEXT": "The Amazon rainforest is the largest tropical rainforest in the world, known for its biodiversity and size, covering much of northwestern Brazil and extending into Colombia, Peru and other South American countries.", "ANSWER": "The Amazon rainforest is the largest forest in Africa."}
+  #### Example Task #6 Output:
+  2
+
+  Reminder: The return values for each task should be correctly formatted as an ASCII integer between 1 and 5. Do not repeat the context.
 
   #### Actual Task Input:
   {"CONTEXT": {{$context}}, "ANSWER": {{$answer}}}
 
-  #### Actual Task Output:
+  #### Actual Task Output:
LlmAsJudgeEvalsAsPlugins/EvalPluginYaml/GptGroundedness2.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
description: Evaluate the groundedness, using a score of 1-5, of generated answer
1+
description: Evaluate the groundedness (whether the answer is anchored in the provided context), using a score of 1-5, of generated answer
22
in the retreived context
33
execution_settings:
44
default:
@@ -95,6 +95,6 @@ template: |
9595

9696

9797
# Tasks
98-
## Please provide your assessment Score for the previous RESPONSE in relation to the CONTEXT and QUERY based on the Definitions above. Your output MUST be a integer score (i.e., 1, 2...) based on the levels of the definitions.
98+
## Please provide your assessment Score for the previous RESPONSE in relation to the CONTEXT and QUERY based on the Definitions above. Your output MUST be an ASCII integer score (i.e., 1, 2...) based on the levels of the definitions.
9999

100100
# Output

LlmAsJudgeEvalsAsPlugins/EvalPluginYaml/PerceivedIntelligence.yaml

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ template: |
   - Score 10 means the answer is excellent for perceived intelligence
   - Score 1 means the answer is poor for perceived intelligence
   - Score 5 means the answer is normal for perceived intelligence
-  - Just respond with the score, nothing else.
+  - Just respond with the score as an ASCII integer between 1 and 10, nothing else.
 
   # Real work
 
