Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 42 additions & 1 deletion Apps.OpenAI/Actions/AudioActions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ public async Task<TranscriptionResponse> CreateTranscription(

return new()
{
Transcription = response.Text,
Transcription = BuildTranscription(response, isDiarizationModel),
Words = JsonConvert.SerializeObject(words),
Segments = JsonConvert.SerializeObject(segments)
};
Expand Down Expand Up @@ -111,6 +111,47 @@ static void ValidateTranscriptionRequest(
throw new PluginMisconfigurationException("Timestamp granularities are only supported when using the 'whisper-1' model.");
}
}

static string BuildTranscription(TranscriptionDto response, bool isDiarizationModel)
{
    // Non-diarization models (or responses without segment data) fall back
    // to the raw transcription text unchanged.
    var segments = response.Segments;
    if (!isDiarizationModel || segments is null || segments.Length == 0)
    {
        return response.Text;
    }

    var turns = new List<string>();
    string? activeSpeaker = null;
    var buffer = new List<string>();

    foreach (var segment in segments)
    {
        if (string.IsNullOrWhiteSpace(segment.Text))
        {
            continue;
        }

        var speaker = string.IsNullOrWhiteSpace(segment.Speaker)
            ? "Unknown"
            : segment.Speaker.Trim();
        var text = segment.Text.Trim();

        if (string.Equals(activeSpeaker, speaker, StringComparison.Ordinal))
        {
            // Same speaker as the previous segment: keep accumulating.
            buffer.Add(text);
        }
        else
        {
            // Speaker changed: flush the finished turn and start a new one.
            AddSpeakerTurn(turns, activeSpeaker, buffer);
            activeSpeaker = speaker;
            buffer = new List<string> { text };
        }
    }

    // Flush the final speaker turn.
    AddSpeakerTurn(turns, activeSpeaker, buffer);

    return turns.Count == 0 ? response.Text : string.Join(Environment.NewLine, turns);
}

static void AddSpeakerTurn(List<string> speakerTurns, string? speaker, List<string> currentText)
{
    // A turn is only emitted when there is both a named speaker and
    // at least one accumulated text fragment.
    var hasSpeaker = !string.IsNullOrWhiteSpace(speaker);
    var hasText = currentText.Count > 0;
    if (hasSpeaker && hasText)
    {
        var joined = string.Join(" ", currentText);
        speakerTurns.Add($"{speaker}: {joined}");
    }
}
}

[Action("Create speech", Description = "Generates speech audio from input text.")]
Expand Down
2 changes: 1 addition & 1 deletion Apps.OpenAI/Apps.OpenAI.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<TargetFramework>net8.0</TargetFramework>
<Product>OpenAI</Product>
<Description>Creating safe artificial general intelligence that benefits all of humanity</Description>
<Version>2.8.36</Version>
<Version>2.8.37</Version>
<AssemblyName>Apps.OpenAI</AssemblyName>
</PropertyGroup>

Expand Down
34 changes: 30 additions & 4 deletions Apps.OpenAI/Dtos/TranscriptionDto.cs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
using Apps.OpenAI.Utils;
using Apps.OpenAI.Utils;
using Newtonsoft.Json;

namespace Apps.OpenAI.Dtos;

public record TranscriptionDto(string Text) : TextDto(Text)
{
public WordDto[] Words { get; init; }
public WordDto[]? Words { get; init; }

public SegmentDto[] Segments { get; init; }
public SegmentDto[]? Segments { get; init; }

public double Temperature { get; init; }

Expand All @@ -20,4 +20,30 @@ public record TranscriptionDto(string Text) : TextDto(Text)

public record WordDto(string Word, double Start, double End);

public record SegmentDto([property: JsonConverter(typeof(FlexibleIdConverter))] string Id, int Seek, double Start, double End, string Text, int[] Tokens, double Temperature, double AvgLogprob, double CompressionRatio, double NoSpeechProb);
/// <summary>
/// One transcription segment as deserialized from the OpenAI audio API response.
/// Most members are nullable because different response formats populate
/// different subsets of fields — presumably the diarized format omits the
/// whisper-style fields and vice versa; verify against the API responses.
/// </summary>
public class SegmentDto
{
    /// <summary>Segment type discriminator; null when the response does not include it.</summary>
    public string? Type { get; init; }

    // FlexibleIdConverter normalizes the id to a string regardless of the
    // JSON representation (project-local converter; see Apps.OpenAI.Utils).
    [JsonConverter(typeof(FlexibleIdConverter))]
    public string Id { get; init; } = string.Empty;

    /// <summary>Seek offset of the segment; null when not provided.</summary>
    public int? Seek { get; init; }

    /// <summary>Segment start time (seconds, per the whisper-style schema — TODO confirm units).</summary>
    public double Start { get; init; }

    /// <summary>Segment end time (same units as <see cref="Start"/>).</summary>
    public double End { get; init; }

    /// <summary>Transcribed text of this segment; defaults to empty rather than null.</summary>
    public string Text { get; init; } = string.Empty;

    /// <summary>Speaker label; only populated by diarization-capable models.</summary>
    public string? Speaker { get; init; }

    /// <summary>Token ids for the segment; null when the response omits them.</summary>
    public int[]? Tokens { get; init; }

    /// <summary>Sampling temperature reported for this segment, when present.</summary>
    public double? Temperature { get; init; }

    /// <summary>Average log-probability reported for this segment, when present.</summary>
    public double? AvgLogprob { get; init; }

    /// <summary>Compression ratio reported for this segment, when present.</summary>
    public double? CompressionRatio { get; init; }

    /// <summary>No-speech probability reported for this segment, when present.</summary>
    public double? NoSpeechProb { get; init; }
}
4 changes: 4 additions & 0 deletions Apps.OpenAI/Models/Responses/Audio/TranscriptionResponse.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,12 @@ public record SegmentResponse(SegmentDto dto)
[Display("Segment ID")]
public string Id { get; set; } = dto.Id;

public string? Type { get; set; } = dto.Type;

public string Text { get; set; } = dto.Text;

public string? Speaker { get; set; } = dto.Speaker;

public double Start { get; set; } = dto.Start;

public double End { get; set; } = dto.End;
Expand Down
30 changes: 17 additions & 13 deletions Tests.OpenAI/AudioServiceTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using Blackbird.Applications.Sdk.Common.Files;
using Blackbird.Applications.Sdk.Common.Exceptions;
using Blackbird.Applications.Sdk.Common.Invocation;
using Newtonsoft.Json.Linq;

namespace Tests.OpenAI;

Expand All @@ -21,50 +22,52 @@ public async Task CreateTranscription_OpenAi_ReturnsTranscription_DiarizedJsonFo
var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe-diarize" };
var request = new TranscriptionRequest
{
File = new FileReference { Name = "tts delorean.mp3" },
Language = "en",
File = new FileReference { Name = "Transcription sample short.mp3" },
Language = "pt",
};

// Act
var result = await handler.CreateTranscription(model, request);
var segments = JArray.Parse(result.Segments);

// Assert
TestContext.WriteLine(result.Transcription);
TestContext.WriteLine(result.Segments);
Assert.IsNotNull(result);
Assert.IsTrue(segments.Count > 0);
Assert.IsTrue(segments.Any(x => x["Speaker"] != null));
}

[TestMethod, ContextDataSource(ConnectionTypes.OpenAiEmbedded, ConnectionTypes.OpenAi)]
public async Task CreateTranscription_OpenAi_ReturnsTranscription_VerboseJsonFormat(InvocationContext context)
public async Task CreateTranscription_OpenAi_DiarizedModel_AssemblesTranscriptionBySpeaker(InvocationContext context)
{
// Arrange
var handler = new AudioActions(context, FileManagementClient);
var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe-diarize" };
var request = new TranscriptionRequest
{
File = new FileReference { Name = "tts delorean.mp3" },
Language = "en",
File = new FileReference { Name = "Transcription sample short.mp3" },
Language = "pt",
};

// Act
var result = await handler.CreateTranscription(model, request);

// Assert
TestContext.WriteLine(result.Transcription);
TestContext.WriteLine(result.Segments);
Console.WriteLine(Newtonsoft.Json.JsonConvert.SerializeObject(result, Newtonsoft.Json.Formatting.Indented));
Assert.IsNotNull(result);
}

[TestMethod, ContextDataSource(ConnectionTypes.OpenAiEmbedded, ConnectionTypes.OpenAi)]
public async Task CreateTranscription_OpenAi_ReturnsTranscription_JsonFormat(InvocationContext context)
public async Task CreateTranscription_OpenAi_StandardModel_ReturnsSingleBlobText(InvocationContext context)
{
// Arrange
var handler = new AudioActions(context, FileManagementClient);
var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe-diarize" };
var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe" };
var request = new TranscriptionRequest
{
File = new FileReference { Name = "tts delorean.mp3" },
Language = "en",
File = new FileReference { Name = "Transcription sample short.mp3" },
Language = "pt",
};

// Act
Expand All @@ -74,6 +77,7 @@ public async Task CreateTranscription_OpenAi_ReturnsTranscription_JsonFormat(Inv
TestContext.WriteLine(result.Transcription);
TestContext.WriteLine(result.Segments);
Assert.IsNotNull(result);
Assert.IsFalse(result.Transcription.Contains("A:"));
}

[TestMethod, ContextDataSource(ConnectionTypes.AzureOpenAi)]
Expand Down Expand Up @@ -104,7 +108,7 @@ public async Task CreateTranscription_OpenAi_Prompt_WithGpt4oTranscribeDiarize_T
var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe-diarize" };
var request = new TranscriptionRequest
{
File = new FileReference { Name = "tts delorean.mp3" },
File = new FileReference { Name = "Transcription sample short.mp3" },
Prompt = "Some prompt",
};

Expand All @@ -125,7 +129,7 @@ public async Task CreateTranscription_OpenAi_TimestampGranularities_WithoutWhisp
var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe-diarize" };
var request = new TranscriptionRequest
{
File = new FileReference { Name = "tts delorean.mp3" },
File = new FileReference { Name = "Transcription sample short.mp3" },
TimestampGranularities = ["word"]
};

Expand Down
2 changes: 1 addition & 1 deletion Tests.OpenAI/ConnectionValidatorTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace Tests.OpenAI;
[TestClass]
public class ConnectionValidatorTests : TestBaseWithContext
{
[TestMethod, ContextDataSource(ConnectionTypes.OpenAi)]
[TestMethod, ContextDataSource(ConnectionTypes.AzureOpenAi)]
public async Task ValidateConnection_WithCorrectCredentials_ReturnsValidResult(InvocationContext context)
{
var validator = new ConnectionValidator();
Expand Down
12 changes: 7 additions & 5 deletions Tests.OpenAI/EditTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,21 @@ public async Task Edit_xliff(InvocationContext context)
var modelIdentifier = new TextChatModelIdentifier { ModelId = "gpt-4o" };
var editRequest = new EditContentRequest
{
File = new FileReference { Name = "GUID-99E95005-E212-481D-AEBC-67DFA3BD38E8_1_en-US-en-zh_cn-Tr.mxliff" },
File = new FileReference { Name = "pmdm-13460-all-locales5832465158620282096-en-ja-Pe.mxliff" },
OutputFileHandling = "xliff1",
ProcessOnlySegmentState = "Initial",
FilterGlossary = true,

ModifiedBy = "1441948"
};
var reasoningEffortRequest = new ReasoningEffortRequest
{
//ReasoningEffort = "low"

};
string? systemMessage = "Your task is to post-edit translation segments by correcting critical errors, comparing each target to its source. Critical errors include tag misplacements, malformed tags, number mismatches, translation omissions, or glossary term violations. Tags appear as combinations of {, }, <, or > with a number (e.g., {1}, <2}, {3>), and these must match the source exactly. Tags define font styles of texts between two tags or represent inserted links and line breaks. \nDo not revert any translation to English.\nDo no change translation style. ";
var glossaryRequest = new GlossaryRequest();
string? systemMessage = "Perform critical-errors-only post-editing.\r\n\r\nAssume every target segment is already final and approved.\r\nLeave the target unchanged unless there is a clear critical error.\r\n\r\nCritical errors are limited to:\r\n- number or unit mismatch\r\n- omission or unjustified addition\r\n- clear mistranslation that changes meaning\r\n- broken tags/formatting\r\n- explicit glossary violation where a Japanese glossary target exists\r\n\r\nDo not edit for fluency, style, consistency, or terminology normalization alone.\r\nDo not replace approved English terms in the Japanese target unless clearly required by an explicit Japanese glossary entry.\r\nIf no clear critical error exists, return the target exactly unchanged.\r\nWhen unsure, do not change anything.\r\n";
var glossaryRequest = new GlossaryRequest { Glossary= new FileReference { Name= "PMDM TB.tbx" } };

var result = await actions.EditContent(modelIdentifier, editRequest, systemMessage, glossaryRequest, reasoningEffortRequest);
var result = await actions.EditContent(modelIdentifier, editRequest, systemMessage, glossaryRequest, reasoningEffortRequest, bucketSize: 25, ProcessLockedSegments: false);

Assert.IsNotNull(result);
//Assert.Contains("contentful", result.File.Name);
Expand Down