From 3d659f5525470f4139e626b5bddae85393289243 Mon Sep 17 00:00:00 2001 From: Artem Riabushenko Date: Mon, 27 Apr 2026 17:55:22 +0300 Subject: [PATCH 1/2] Extension of transcript outputs --- Apps.OpenAI/Actions/AudioActions.cs | 43 ++++++++++++++++++- Apps.OpenAI/Dtos/TranscriptionDto.cs | 34 +++++++++++++-- .../Responses/Audio/TranscriptionResponse.cs | 4 ++ Tests.OpenAI/AudioServiceTests.cs | 30 +++++++------ Tests.OpenAI/ConnectionValidatorTests.cs | 2 +- Tests.OpenAI/EditTests.cs | 12 +++--- 6 files changed, 101 insertions(+), 24 deletions(-) diff --git a/Apps.OpenAI/Actions/AudioActions.cs b/Apps.OpenAI/Actions/AudioActions.cs index 509865c..bee9c58 100644 --- a/Apps.OpenAI/Actions/AudioActions.cs +++ b/Apps.OpenAI/Actions/AudioActions.cs @@ -82,7 +82,7 @@ public async Task CreateTranscription( return new() { - Transcription = response.Text, + Transcription = BuildTranscription(response, isDiarizationModel), Words = JsonConvert.SerializeObject(words), Segments = JsonConvert.SerializeObject(segments) }; @@ -111,6 +111,47 @@ static void ValidateTranscriptionRequest( throw new PluginMisconfigurationException("Timestamp granularities are only supported when using the 'whisper-1' model."); } } + + static string BuildTranscription(TranscriptionDto response, bool isDiarizationModel) + { + if (!isDiarizationModel || response.Segments is null || !response.Segments.Any()) + { + return response.Text; + } + + var speakerTurns = new List(); + string? currentSpeaker = null; + var currentText = new List(); + + foreach (var segment in response.Segments.Where(x => !string.IsNullOrWhiteSpace(x.Text))) + { + var speaker = string.IsNullOrWhiteSpace(segment.Speaker) ? 
"Unknown" : segment.Speaker.Trim(); + var text = segment.Text.Trim(); + + if (!string.Equals(currentSpeaker, speaker, StringComparison.Ordinal)) + { + AddSpeakerTurn(speakerTurns, currentSpeaker, currentText); + currentSpeaker = speaker; + currentText = [text]; + continue; + } + + currentText.Add(text); + } + + AddSpeakerTurn(speakerTurns, currentSpeaker, currentText); + return speakerTurns.Count > 0 ? string.Join(Environment.NewLine, speakerTurns) : response.Text; + } + + static void AddSpeakerTurn(List speakerTurns, string? speaker, List currentText) + { + if (string.IsNullOrWhiteSpace(speaker) || currentText.Count == 0) + { + return; + } + + speakerTurns.Add($"{speaker}: {string.Join(" ", currentText)}"); + } } [Action("Create speech", Description = "Generates speech audio from input text.")] diff --git a/Apps.OpenAI/Dtos/TranscriptionDto.cs b/Apps.OpenAI/Dtos/TranscriptionDto.cs index 3868b97..564b305 100644 --- a/Apps.OpenAI/Dtos/TranscriptionDto.cs +++ b/Apps.OpenAI/Dtos/TranscriptionDto.cs @@ -1,13 +1,13 @@ -using Apps.OpenAI.Utils; +using Apps.OpenAI.Utils; using Newtonsoft.Json; namespace Apps.OpenAI.Dtos; public record TranscriptionDto(string Text) : TextDto(Text) { - public WordDto[] Words { get; init; } + public WordDto[]? Words { get; init; } - public SegmentDto[] Segments { get; init; } + public SegmentDto[]? Segments { get; init; } public double Temperature { get; init; } @@ -20,4 +20,30 @@ public record TranscriptionDto(string Text) : TextDto(Text) public record WordDto(string Word, double Start, double End); -public record SegmentDto([property: JsonConverter(typeof(FlexibleIdConverter))] string Id, int Seek, double Start, double End, string Text, int[] Tokens, double Temperature, double AvgLogprob, double CompressionRatio, double NoSpeechProb); \ No newline at end of file +public class SegmentDto +{ + public string? Type { get; init; } + + [JsonConverter(typeof(FlexibleIdConverter))] + public string Id { get; init; } = string.Empty; + + public int? 
Seek { get; init; } + + public double Start { get; init; } + + public double End { get; init; } + + public string Text { get; init; } = string.Empty; + + public string? Speaker { get; init; } + + public int[]? Tokens { get; init; } + + public double? Temperature { get; init; } + + public double? AvgLogprob { get; init; } + + public double? CompressionRatio { get; init; } + + public double? NoSpeechProb { get; init; } +} diff --git a/Apps.OpenAI/Models/Responses/Audio/TranscriptionResponse.cs b/Apps.OpenAI/Models/Responses/Audio/TranscriptionResponse.cs index 83e5e50..955cd32 100644 --- a/Apps.OpenAI/Models/Responses/Audio/TranscriptionResponse.cs +++ b/Apps.OpenAI/Models/Responses/Audio/TranscriptionResponse.cs @@ -28,8 +28,12 @@ public record SegmentResponse(SegmentDto dto) [Display("Segment ID")] public string Id { get; set; } = dto.Id; + public string? Type { get; set; } = dto.Type; + public string Text { get; set; } = dto.Text; + public string? Speaker { get; set; } = dto.Speaker; + public double Start { get; set; } = dto.Start; public double End { get; set; } = dto.End; diff --git a/Tests.OpenAI/AudioServiceTests.cs b/Tests.OpenAI/AudioServiceTests.cs index 7339b96..f7e42d3 100644 --- a/Tests.OpenAI/AudioServiceTests.cs +++ b/Tests.OpenAI/AudioServiceTests.cs @@ -7,6 +7,7 @@ using Blackbird.Applications.Sdk.Common.Files; using Blackbird.Applications.Sdk.Common.Exceptions; using Blackbird.Applications.Sdk.Common.Invocation; +using Newtonsoft.Json.Linq; namespace Tests.OpenAI; @@ -21,50 +22,52 @@ public async Task CreateTranscription_OpenAi_ReturnsTranscription_DiarizedJsonFo var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe-diarize" }; var request = new TranscriptionRequest { - File = new FileReference { Name = "tts delorean.mp3" }, - Language = "en", + File = new FileReference { Name = "Transcription sample short.mp3" }, + Language = "pt", }; // Act var result = await handler.CreateTranscription(model, request); + var segments = 
JArray.Parse(result.Segments); // Assert TestContext.WriteLine(result.Transcription); TestContext.WriteLine(result.Segments); Assert.IsNotNull(result); + Assert.IsTrue(segments.Count > 0); + Assert.IsTrue(segments.Any(x => x["Speaker"] != null)); } [TestMethod, ContextDataSource(ConnectionTypes.OpenAiEmbedded, ConnectionTypes.OpenAi)] - public async Task CreateTranscription_OpenAi_ReturnsTranscription_VerboseJsonFormat(InvocationContext context) + public async Task CreateTranscription_OpenAi_DiarizedModel_AssemblesTranscriptionBySpeaker(InvocationContext context) { // Arrange var handler = new AudioActions(context, FileManagementClient); var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe-diarize" }; var request = new TranscriptionRequest { - File = new FileReference { Name = "tts delorean.mp3" }, - Language = "en", + File = new FileReference { Name = "Transcription sample short.mp3" }, + Language = "pt", }; // Act var result = await handler.CreateTranscription(model, request); // Assert - TestContext.WriteLine(result.Transcription); - TestContext.WriteLine(result.Segments); + Console.WriteLine(Newtonsoft.Json.JsonConvert.SerializeObject(result, Newtonsoft.Json.Formatting.Indented)); Assert.IsNotNull(result); } [TestMethod, ContextDataSource(ConnectionTypes.OpenAiEmbedded, ConnectionTypes.OpenAi)] - public async Task CreateTranscription_OpenAi_ReturnsTranscription_JsonFormat(InvocationContext context) + public async Task CreateTranscription_OpenAi_StandardModel_ReturnsSingleBlobText(InvocationContext context) { // Arrange var handler = new AudioActions(context, FileManagementClient); - var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe-diarize" }; + var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe" }; var request = new TranscriptionRequest { - File = new FileReference { Name = "tts delorean.mp3" }, - Language = "en", + File = new FileReference { Name = "Transcription sample short.mp3" }, + Language = "pt", }; // 
Act @@ -74,6 +77,7 @@ public async Task CreateTranscription_OpenAi_ReturnsTranscription_JsonFormat(Inv TestContext.WriteLine(result.Transcription); TestContext.WriteLine(result.Segments); Assert.IsNotNull(result); + Assert.IsFalse(result.Transcription.Contains("A:")); } [TestMethod, ContextDataSource(ConnectionTypes.AzureOpenAi)] @@ -104,7 +108,7 @@ public async Task CreateTranscription_OpenAi_Prompt_WithGpt4oTranscribeDiarize_T var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe-diarize" }; var request = new TranscriptionRequest { - File = new FileReference { Name = "tts delorean.mp3" }, + File = new FileReference { Name = "Transcription sample short.mp3" }, Prompt = "Some prompt", }; @@ -125,7 +129,7 @@ public async Task CreateTranscription_OpenAi_TimestampGranularities_WithoutWhisp var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe-diarize" }; var request = new TranscriptionRequest { - File = new FileReference { Name = "tts delorean.mp3" }, + File = new FileReference { Name = "Transcription sample short.mp3" }, TimestampGranularities = ["word"] }; diff --git a/Tests.OpenAI/ConnectionValidatorTests.cs b/Tests.OpenAI/ConnectionValidatorTests.cs index d2143a7..a3adc0a 100644 --- a/Tests.OpenAI/ConnectionValidatorTests.cs +++ b/Tests.OpenAI/ConnectionValidatorTests.cs @@ -9,7 +9,7 @@ namespace Tests.OpenAI; [TestClass] public class ConnectionValidatorTests : TestBaseWithContext { - [TestMethod, ContextDataSource(ConnectionTypes.OpenAi)] + [TestMethod, ContextDataSource(ConnectionTypes.AzureOpenAi)] public async Task ValidateConnection_WithCorrectCredentials_ReturnsValidResult(InvocationContext context) { var validator = new ConnectionValidator(); diff --git a/Tests.OpenAI/EditTests.cs b/Tests.OpenAI/EditTests.cs index d5fcb31..7cfa450 100644 --- a/Tests.OpenAI/EditTests.cs +++ b/Tests.OpenAI/EditTests.cs @@ -21,19 +21,21 @@ public async Task Edit_xliff(InvocationContext context) var modelIdentifier = new TextChatModelIdentifier { 
ModelId = "gpt-4o" }; var editRequest = new EditContentRequest { - File = new FileReference { Name = "GUID-99E95005-E212-481D-AEBC-67DFA3BD38E8_1_en-US-en-zh_cn-Tr.mxliff" }, + File = new FileReference { Name = "pmdm-13460-all-locales5832465158620282096-en-ja-Pe.mxliff" }, OutputFileHandling = "xliff1", ProcessOnlySegmentState = "Initial", + FilterGlossary = true, + ModifiedBy = "1441948" }; var reasoningEffortRequest = new ReasoningEffortRequest { - //ReasoningEffort = "low" + }; - string? systemMessage = "Your task is to post-edit translation segments by correcting critical errors, comparing each target to its source. Critical errors include tag misplacements, malformed tags, number mismatches, translation omissions, or glossary term violations. Tags appear as combinations of {, }, <, or > with a number (e.g., {1}, <2}, {3>), and these must match the source exactly. Tags define font styles of texts between two tags or represent inserted links and line breaks. \nDo not revert any translation to English.\nDo no change translation style. "; - var glossaryRequest = new GlossaryRequest(); + string? 
systemMessage = "Perform critical-errors-only post-editing.\r\n\r\nAssume every target segment is already final and approved.\r\nLeave the target unchanged unless there is a clear critical error.\r\n\r\nCritical errors are limited to:\r\n- number or unit mismatch\r\n- omission or unjustified addition\r\n- clear mistranslation that changes meaning\r\n- broken tags/formatting\r\n- explicit glossary violation where a Japanese glossary target exists\r\n\r\nDo not edit for fluency, style, consistency, or terminology normalization alone.\r\nDo not replace approved English terms in the Japanese target unless clearly required by an explicit Japanese glossary entry.\r\nIf no clear critical error exists, return the target exactly unchanged.\r\nWhen unsure, do not change anything.\r\n"; + var glossaryRequest = new GlossaryRequest { Glossary= new FileReference { Name= "PMDM TB.tbx" } }; - var result = await actions.EditContent(modelIdentifier, editRequest, systemMessage, glossaryRequest, reasoningEffortRequest); + var result = await actions.EditContent(modelIdentifier, editRequest, systemMessage, glossaryRequest, reasoningEffortRequest, bucketSize: 25, ProcessLockedSegments: false); Assert.IsNotNull(result); //Assert.Contains("contentful", result.File.Name); From 04842bf34d64b1f6dd6f069c011c58f79e58597e Mon Sep 17 00:00:00 2001 From: Artem Riabushenko Date: Mon, 27 Apr 2026 18:25:27 +0300 Subject: [PATCH 2/2] Bump version --- Apps.OpenAI/Apps.OpenAI.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Apps.OpenAI/Apps.OpenAI.csproj b/Apps.OpenAI/Apps.OpenAI.csproj index 7209837..1099552 100644 --- a/Apps.OpenAI/Apps.OpenAI.csproj +++ b/Apps.OpenAI/Apps.OpenAI.csproj @@ -4,7 +4,7 @@ net8.0 OpenAI Creating safe artificial general intelligence that benefits all of humanity - 2.8.36 + 2.8.37 Apps.OpenAI