
Commit 4a1e83f

Author: Adam Holm

Enhance LlmAsJudgeEvalsAsPlugins functionality
- Updated LlmAsJudgeEvalsAsPlugins.Demo.csproj with a new package reference for OpenAI.
- Introduced LlmAsJudgeEvalsAsPlugins.Tests project for unit testing.
- Enhanced evaluation criteria descriptions in GptGroundedness.yaml files.
- Significant updates to EvalService.cs, including new constructors and evaluation methods.
- Improved input model creation methods in InputModels.cs for better documentation.
- Updated Readme.md with new features, installation instructions, and usage examples.
- Created EvalServiceTests.cs with unit tests for evaluating functions.
- Modified TokenString.cs to improve token data handling.
- Configured LlmAsJudgeEvalsAsPlugins.Tests.csproj with necessary testing dependencies.
1 parent d644707

18 files changed: 555 additions & 101 deletions
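Before the per-file diffs, a minimal usage sketch of the reworked EvalService API, inferred entirely from the new tests in this commit. The connector wiring, model id, key lookup, and the AnswerInput record are illustrative assumptions, not part of the diff:

    using HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
    using Microsoft.SemanticKernel;

    // Any chat-completion connector should work; the model id and key lookup are placeholders.
    var kernel = Kernel.CreateBuilder()
        .AddOpenAIChatCompletion("gpt-4o-mini", Environment.GetEnvironmentVariable("OPENAI_API_KEY")!)
        .Build();
    var evalService = new EvalService(kernel);

    // Register a custom eval from a YAML definition (same shape as the TestYaml constant below).
    var yamlText = """
        name: binary_quality
        description: Return 4 if the answer is perfect, 0 otherwise.
        template: |
          Give me just "4".
        """;
    evalService.AddEvalFunctionFromYaml(yamlText, "binary_quality", true);

    // Run the eval against an input model and read back the parsed score.
    var score = await evalService.ExecuteEval(new AnswerInput("some answer"));
    Console.WriteLine($"{score.EvalName}: {score.Score}");

    // Hypothetical input model; mirrors the TestInput record in EvalServiceTests below.
    public sealed record AnswerInput(string Answer) : IInputModel
    {
        public string FunctionName => "binary_quality";
        public KernelArguments RequiredInputs => new() { ["answer"] = Answer };
    }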

Demo/LlmAsJudgeEvalsAsPlugins.Demo/EvalManager.cs

Lines changed: 3 additions & 0 deletions
@@ -1,13 +1,16 @@
 using System.Text;
 using System.Text.Json;
 using HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
+using Microsoft.Extensions.AI;
 using Microsoft.SemanticKernel;
 using Microsoft.SemanticKernel.ChatCompletion;
 using Microsoft.SemanticKernel.Connectors.OpenAI;
 using Microsoft.SemanticKernel.Memory;
 using Microsoft.SemanticKernel.Text;
+using OpenAI;
 using UglyToad.PdfPig;
 
+
 namespace LlmAsJudgeEvalsAsPlugins.Demo;
 
 public class EvalManager(IConfiguration configuration, ILoggerFactory loggerFactory)

Demo/LlmAsJudgeEvalsAsPlugins.Demo/LlmAsJudgeEvalsAsPlugins.Demo.csproj

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@
 
   <ItemGroup>
     <PackageReference Include="Markdig" Version="0.37.0" />
+    <PackageReference Include="Microsoft.Extensions.AI.OpenAI" Version="9.4.3-preview.1.25230.7" />
     <PackageReference Include="PdfPig" Version="0.1.8" />
     <PackageReference Include="Radzen.Blazor" Version="6.0.9" />
     <PackageReference Include="Tiktoken" Version="2.0.3" />
LlmAsJudgeEvalsAsPlugins.Tests/EvalServiceTests.cs

Lines changed: 169 additions & 0 deletions
@@ -0,0 +1,169 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Threading.Tasks;
+using HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.Connectors.OpenAI;
+using Xunit;
+using Moq;
+
+namespace LlmAsJudgeEvalsAsPlugins.Tests;
+//[Collection(nameof(EvalServiceFixture))]
+public class EvalServiceTests(EvalServiceFixture fixture) : IClassFixture<EvalServiceFixture>
+{
+    private const string TestYaml = """
+        name: binary_quality
+        description: Return 4 if the answer is perfect, 0 otherwise.
+        template: |
+          Give me just "4".
+        """;
+
+    private sealed record TestInput(string SystemAnswer) : IInputModel
+    {
+        public string FunctionName => "binary_quality";
+        public KernelArguments RequiredInputs => new()
+        {
+            ["answer"] = SystemAnswer
+        };
+    }
+
+    [Fact]
+    public void AddEvalFunction_AddsFunction_WhenNotExists()
+    {
+        // Arrange
+        var kernel = Kernel.CreateBuilder().Build();
+        var service = new EvalService(kernel);
+        var function = KernelFunctionFactory.CreateFromPrompt("prompt", new OpenAIPromptExecutionSettings(), "testFunction");
+        var name = "testFunction";
+
+        // Act
+        service.AddEvalFunction(name, function);
+
+        // Assert
+        var evalFunctions = typeof(EvalService).GetProperty("EvalFunctions", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)!.GetValue(service) as Dictionary<string, KernelFunction>;
+        Assert.True(evalFunctions!.ContainsKey(name));
+        Assert.Equal(function, evalFunctions[name]);
+    }
+
+    [Fact]
+    public void AddEvalFunction_Override_ReplacesFunction()
+    {
+        // Arrange
+        var kernel = Kernel.CreateBuilder().Build();
+        var service = new EvalService(kernel);
+        var function1 = KernelFunctionFactory.CreateFromPrompt("prompt1", new OpenAIPromptExecutionSettings(), "testFunction");
+        var function2 = KernelFunctionFactory.CreateFromPrompt("prompt2", new OpenAIPromptExecutionSettings(), "testFunction");
+        var name = "testFunction";
+        service.AddEvalFunction(name, function1);
+
+        // Act
+        service.AddEvalFunction(name, function2, overrideExisting: true);
+
+        // Assert
+        var evalFunctions = typeof(EvalService).GetProperty("EvalFunctions", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)!.GetValue(service) as Dictionary<string, KernelFunction>;
+        Assert.Equal(function2, evalFunctions![name]);
+    }
+
+    [Fact]
+    public void AddEvalFunction_FromPrompt_AddsFunction()
+    {
+        // Arrange
+        var kernel = Kernel.CreateBuilder().Build();
+        var service = new EvalService(kernel);
+        var prompt = "Test prompt";
+        var settings = new OpenAIPromptExecutionSettings();
+        var name = "promptFunction";
+
+        // Act
+        fixture.Sut.AddEvalFunction(name, prompt, settings);
+
+        // Assert
+        var evalFunctions = typeof(EvalService).GetProperty("EvalFunctions", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)!.GetValue(fixture.Sut) as Dictionary<string, KernelFunction>;
+        Assert.True(evalFunctions!.ContainsKey(name));
+    }
+
+    [Fact]
+    public void AddEvalFunction_FromYaml_AddsFunction()
+    {
+        // Act
+        fixture.Sut.AddEvalFunctionFromYaml(TestYaml, "binary_quality", true);
+
+        // Assert
+        var evalFunctions = typeof(EvalService).GetProperty("EvalFunctions", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)!.GetValue(fixture.Sut) as Dictionary<string, KernelFunction>;
+        Assert.True(evalFunctions!.ContainsKey("binary_quality"));
+    }
+
+    [Fact]
+    public void AggregateResults_ReturnsAverageScore()
+    {
+        // Arrange
+        var scores = new List<ResultScore>
+        {
+            new ResultScore("Eval1", "2"),
+            new ResultScore("Eval1", "4"),
+            new ResultScore("Eval2", "3")
+        };
+        scores[0].ProbScore = 0.5;
+        scores[1].ProbScore = 0.7;
+        scores[2].ProbScore = 0.2;
+
+        // Act
+        var result = EvalService.AggregateResults(scores);
+        var resultProb = EvalService.AggregateResults(scores, useLogProbs: true);
+
+        // Assert
+        Assert.Equal(3.0, result["Eval1"]);
+        Assert.Equal(3.0, result["Eval2"]);
+        Assert.Equal(0.6, resultProb["Eval1"], 1);
+        Assert.Equal(0.2, resultProb["Eval2"]);
+    }
+
+    [Fact]
+    public void AddEvalFunctionFromYaml_AddsFunction()
+    {
+        // Arrange
+        var function = KernelFunctionFactory.CreateFromPrompt("prompt", new OpenAIPromptExecutionSettings(), "yamlFunction");
+        var kernel = Kernel.CreateBuilder().Build();
+        Func<string, KernelFunction> factory = yaml => function;
+        var service = new EvalService(kernel, factory);
+        var yaml = TestYaml;
+        var name = "binary_quality";
+
+        // Act
+        service.AddEvalFunctionFromYaml(yaml, name);
+
+        // Assert
+        var evalFunctions = typeof(EvalService).GetProperty("EvalFunctions", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)!.GetValue(service) as Dictionary<string, KernelFunction>;
+        Assert.True(evalFunctions!.ContainsKey(name));
+    }
+
+    [Fact]
+    public void AddEvalFunctionFromYaml_Stream_AddsFunction()
+    {
+        // Arrange
+        var service = fixture.Sut;
+        var yaml = TestYaml;
+        var name = "binary_quality";
+        using var stream = new MemoryStream(System.Text.Encoding.UTF8.GetBytes(yaml));
+
+        // Act
+        service.AddEvalFunctionFromYaml(stream, name);
+
+        // Assert
+        var evalFunctions = typeof(EvalService).GetProperty("EvalFunctions", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)!.GetValue(service) as Dictionary<string, KernelFunction>;
+        Assert.True(evalFunctions!.ContainsKey(name));
+    }
+
+    [Fact]
+    public async Task ExecuteEval_ReturnsExpectedScore()
+    {
+        // Arrange
+        fixture.Sut.AddEvalFunctionFromYaml(TestYaml, "binary_quality", true);
+
+        // Act
+        var score = await fixture.Sut.ExecuteEval(new TestInput("any"));
+
+        // Assert
+        Assert.Equal(4, score.Score); // parsed from assistant reply
+        Assert.Equal("binary_quality", score.EvalName);
+    }
+}
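The AggregateResults_ReturnsAverageScore expectations above boil down to a per-eval mean. A hedged sketch of the equivalent computation, assuming ResultScore exposes EvalName, a numeric Score, and a double ProbScore as the test implies (the library's actual implementation may differ):

    using System.Collections.Generic;
    using System.Linq;
    using HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;

    static Dictionary<string, double> Aggregate(IEnumerable<ResultScore> scores, bool useLogProbs = false) =>
        scores.GroupBy(s => s.EvalName)
              .ToDictionary(
                  g => g.Key,
                  // Average either the raw integer Score or the log-prob-weighted ProbScore.
                  g => g.Average(s => useLogProbs ? s.ProbScore : (double)s.Score));

With the test data, Eval1 averages (2 + 4) / 2 = 3.0 and (0.5 + 0.7) / 2 = 0.6, matching the assertions.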
LlmAsJudgeEvalsAsPlugins.Tests/LlmAsJudgeEvalsAsPlugins.Tests.csproj

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>net8.0</TargetFramework>
+    <LangVersion>latest</LangVersion>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.12.0" />
+    <PackageReference Include="Moq" Version="4.20.72" />
+    <PackageReference Include="MSTest" Version="3.6.4" />
+    <PackageReference Include="xunit" Version="2.9.3" />
+    <PackageReference Include="xunit.runner.visualstudio" Version="3.1.0">
+      <PrivateAssets>all</PrivateAssets>
+      <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
+    </PackageReference>
+  </ItemGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\LlmAsJudgeEvalsAsPlugins\LlmAsJudgeEvalsAsPlugins.csproj" />
+  </ItemGroup>
+
+
+</Project>
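With this project file and the solution entries added further below, the suite should run with the standard runner, using the project path the solution diff records:

    dotnet test LlmAsJudgeEvalsAsPlugins.Tests/LlmAsJudgeEvalsAsPlugins.Tests.csproj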
Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+using HillPhelmuth.SemanticKernel.LlmAsJudgeEvals;
+using Microsoft.SemanticKernel.ChatCompletion;
+using Microsoft.SemanticKernel;
+using Moq;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Microsoft.Extensions.DependencyInjection;
+using OpenAI.Chat;
+using Xunit;
+using ChatMessageContent = Microsoft.SemanticKernel.ChatMessageContent;
+
+namespace LlmAsJudgeEvalsAsPlugins.Tests;
+
+// TestFixture.cs – one per test class
+public sealed class EvalServiceFixture : IAsyncLifetime
+{
+    public Kernel Kernel { get; private set; } = default!;
+    public IChatCompletionService FakeChat { get; } = CreateFakeChat();
+
+    public EvalService Sut { get; private set; } = default!; // system-under-test
+
+    public Task InitializeAsync()
+    {
+        var builder = Kernel.CreateBuilder();
+        builder.Services.AddSingleton(FakeChat); // <-- inject the fake LLM
+        Kernel = builder.Build();
+        Sut = new EvalService(Kernel);
+        return Task.CompletedTask;
+    }
+
+    public Task DisposeAsync() => Task.CompletedTask;
+
+    // Creates a Moq that pretends to be the LLM.
+    private static IChatCompletionService CreateFakeChat()
+    {
+        var mock = new Mock<IChatCompletionService>();
+
+        // Fake assistant reply that includes log-probs in plain JSON-friendly form
+        var fakeAssistantReply = new ChatMessageContent(
+            AuthorRole.Assistant,
+            "4", // the model's text
+            metadata: new Dictionary<string, object?>
+            {
+                ["ContentTokenLogProbabilities"] = new[]
+                {
+                    new
+                    {
+                        Tokens = new[] { "1", "2", "3", "4", "5" },
+                        LogProbabilities = new[] { -2.1, -1.9, -1.7, -0.01, -0.05 }
+                    }
+                }
+            });
+
+        mock.Setup(x => x.GetChatMessageContentsAsync(
+                It.IsAny<ChatHistory>(),
+                It.IsAny<PromptExecutionSettings>(),
+                It.IsAny<Kernel>(),
+                It.IsAny<CancellationToken>()))
+            .ReturnsAsync([fakeAssistantReply]);
+
+        return mock.Object;
+    }
+}
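The fixture's fake reply is the hook for the log-prob scoring path: EvalService presumably reads the ContentTokenLogProbabilities metadata and turns the per-token log probabilities into a probability-weighted score (ProbScore). A hedged sketch of that conversion under those assumptions; the library's exact normalization may differ:

    using System;

    // Expected score over the candidate digit tokens, scaled into [0, 1].
    static double WeightedScore(string[] tokens, double[] logProbs, int maxScore = 5)
    {
        double weighted = 0, total = 0;
        for (var i = 0; i < tokens.Length; i++)
        {
            if (!int.TryParse(tokens[i], out var value)) continue; // digit tokens only
            var p = Math.Exp(logProbs[i]); // log probability -> probability
            weighted += value * p;
            total += p;
        }
        return weighted / (total * maxScore);
    }

With the fake data above, exp(-0.01) ≈ 0.99 and exp(-0.05) ≈ 0.95 dominate, so the weighted average lands near 4 (≈ 0.81 after scaling by the maximum score of 5).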

LlmAsJudgeEvalsAsPlugins.sln

Lines changed: 6 additions & 0 deletions
@@ -7,6 +7,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "LlmAsJudgeEvalsAsPlugins",
 EndProject
 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LlmAsJudgeEvalsAsPlugins.Demo", "Demo\LlmAsJudgeEvalsAsPlugins.Demo\LlmAsJudgeEvalsAsPlugins.Demo.csproj", "{B6DC903A-619D-4910-B943-C50292333746}"
 EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LlmAsJudgeEvalsAsPlugins.Tests", "LlmAsJudgeEvalsAsPlugins.Tests\LlmAsJudgeEvalsAsPlugins.Tests.csproj", "{7AA2988A-3728-48F8-A6F3-89ECD0583D5C}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -21,6 +23,10 @@ Global
 		{B6DC903A-619D-4910-B943-C50292333746}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{B6DC903A-619D-4910-B943-C50292333746}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{B6DC903A-619D-4910-B943-C50292333746}.Release|Any CPU.Build.0 = Release|Any CPU
+		{7AA2988A-3728-48F8-A6F3-89ECD0583D5C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{7AA2988A-3728-48F8-A6F3-89ECD0583D5C}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{7AA2988A-3728-48F8-A6F3-89ECD0583D5C}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{7AA2988A-3728-48F8-A6F3-89ECD0583D5C}.Release|Any CPU.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE

LlmAsJudgeEvalsAsPlugins/EvalPluginYaml/ExplainPlusScore/GptGroundednessExplain.yaml

Lines changed: 8 additions & 0 deletions
@@ -47,6 +47,14 @@ template: |
   {"CONTEXT": "Some are reported as not having been wanted at all.", "ANSWER": "All are reported as being completely and fully wanted."}
   #### Example Task #4 Output:
   1
+  #### Example Task #5 Input:
+  {"CONTEXT": "The Eiffel Tower is located in Paris, France, and was completed in 1889 as the entrance arch to the 1889 World's Fair.", "ANSWER": "The Eiffel Tower was built in the 19th century in France."}
+  #### Example Task #5 Output:
+  3
+  #### Example Task #6 Input:
+  {"CONTEXT": "The Amazon rainforest is the largest tropical rainforest in the world, known for its biodiversity and size, covering much of northwestern Brazil and extending into Colombia, Peru and other South American countries.", "ANSWER": "The Amazon rainforest is the largest forest in Africa."}
+  #### Example Task #6 Output:
+  2
 
   ### Output Format
 

LlmAsJudgeEvalsAsPlugins/EvalPluginYaml/GptGroundedness.yaml

Lines changed: 13 additions & 9 deletions
@@ -1,5 +1,4 @@
-description: Evaluate the groundedness, using a score of 1-5, of generated answer
-  in the retreived context
+description: Evaluate the groundedness (whether the answer follows logically from the context provided), using a score of 1-5, of the generated answer in the retrieved context
 execution_settings:
   default:
     max_tokens: 128
@@ -19,13 +18,10 @@ input_variables:
     default: ''
 name: GptGroundedness
 template: |
-  System:
-  You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric.
-  User:
   You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following ratings:
   1. 5: The ANSWER follows logically from the information contained in the CONTEXT.
   2. 1: The ANSWER is logically false from the information contained in the CONTEXT.
-  3. an (ASCII) integer score between 1 and 5 and if such integer score does not exists, use 1: It is not possible to determine whether the ANSWER is true or false without further information.
+  3. an (ASCII) integer score between 1 and 5, and if no such integer score exists, use 1: The ANSWER is, to some degree, partially true or false from the information contained in the CONTEXT.
 
   Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails.
 
@@ -47,10 +43,18 @@ template: |
   {"CONTEXT": "Some are reported as not having been wanted at all.", "ANSWER": "All are reported as being completely and fully wanted."}
   #### Example Task #4 Output:
   1
-
-  Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context.
+  #### Example Task #5 Input:
+  {"CONTEXT": "The Eiffel Tower is located in Paris, France, and was completed in 1889 as the entrance arch to the 1889 World's Fair.", "ANSWER": "The Eiffel Tower was built in the 19th century in France."}
+  #### Example Task #5 Output:
+  3
+  #### Example Task #6 Input:
+  {"CONTEXT": "The Amazon rainforest is the largest tropical rainforest in the world, known for its biodiversity and size, covering much of northwestern Brazil and extending into Colombia, Peru and other South American countries.", "ANSWER": "The Amazon rainforest is the largest forest in Africa."}
+  #### Example Task #6 Output:
+  2
+
+  Reminder: The return values for each task should be correctly formatted as an ASCII integer between 1 and 5. Do not repeat the context.
 
   #### Actual Task Input:
   {"CONTEXT": {{$context}}, "ANSWER": {{$answer}}}
 
-  #### Actual Task Output:
+  #### Actual Task Output:
LlmAsJudgeEvalsAsPlugins/EvalPluginYaml/GptGroundedness2.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
description: Evaluate the groundedness, using a score of 1-5, of generated answer
1+
description: Evaluate the groundedness (whether the answer is anchored in the provided context), using a score of 1-5, of generated answer
22
in the retreived context
33
execution_settings:
44
default:
@@ -95,6 +95,6 @@ template: |
9595

9696

9797
# Tasks
98-
## Please provide your assessment Score for the previous RESPONSE in relation to the CONTEXT and QUERY based on the Definitions above. Your output MUST be a integer score (i.e., 1, 2...) based on the levels of the definitions.
98+
## Please provide your assessment Score for the previous RESPONSE in relation to the CONTEXT and QUERY based on the Definitions above. Your output MUST be an ASCII integer score (i.e., 1, 2...) based on the levels of the definitions.
9999

100100
# Output

LlmAsJudgeEvalsAsPlugins/EvalPluginYaml/PerceivedIntelligence.yaml

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ template: |
   - Score 10 means the answer is excellent for perceived intelligence
   - Score 1 means the answer is poor for perceived intelligence
   - Score 5 means the answer is normal for perceived intelligence
-  - Just respond with the score, nothing else.
+  - Just respond with the score as an ASCII integer between 1 and 10, nothing else.
 
   # Real work
 
