Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 42 additions & 1 deletion Apps.OpenAI/Actions/AudioActions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ public async Task<TranscriptionResponse> CreateTranscription(

return new()
{
Transcription = response.Text,
Transcription = BuildTranscription(response, isDiarizationModel),
Words = JsonConvert.SerializeObject(words),
Segments = JsonConvert.SerializeObject(segments)
};
Expand Down Expand Up @@ -111,6 +111,47 @@ static void ValidateTranscriptionRequest(
throw new PluginMisconfigurationException("Timestamp granularities are only supported when using the 'whisper-1' model.");
}
}

static string BuildTranscription(TranscriptionDto response, bool isDiarizationModel)
{
    // Non-diarization models (or responses without segment data) fall back
    // to the raw transcription text unchanged.
    var segments = response.Segments;
    if (!isDiarizationModel || segments is null || segments.Length == 0)
    {
        return response.Text;
    }

    var turns = new List<string>();
    string? activeSpeaker = null;
    var buffer = new List<string>();

    foreach (var segment in segments)
    {
        if (string.IsNullOrWhiteSpace(segment.Text))
        {
            continue;
        }

        var speaker = string.IsNullOrWhiteSpace(segment.Speaker)
            ? "Unknown"
            : segment.Speaker.Trim();
        var text = segment.Text.Trim();

        if (string.Equals(activeSpeaker, speaker, StringComparison.Ordinal))
        {
            // Same speaker as the previous segment: keep accumulating.
            buffer.Add(text);
        }
        else
        {
            // Speaker changed: flush the finished turn and start a new one.
            AddSpeakerTurn(turns, activeSpeaker, buffer);
            activeSpeaker = speaker;
            buffer = new List<string> { text };
        }
    }

    // Flush the final speaker turn.
    AddSpeakerTurn(turns, activeSpeaker, buffer);

    return turns.Count == 0 ? response.Text : string.Join(Environment.NewLine, turns);
}

static void AddSpeakerTurn(List<string> speakerTurns, string? speaker, List<string> currentText)
{
    // A turn is only emitted when there is both a named speaker and
    // at least one accumulated text fragment.
    var hasSpeaker = !string.IsNullOrWhiteSpace(speaker);
    var hasText = currentText.Count > 0;
    if (hasSpeaker && hasText)
    {
        var joined = string.Join(" ", currentText);
        speakerTurns.Add($"{speaker}: {joined}");
    }
}
}

[Action("Create speech", Description = "Generates speech audio from input text.")]
Expand Down
2 changes: 1 addition & 1 deletion Apps.OpenAI/Apps.OpenAI.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<TargetFramework>net8.0</TargetFramework>
<Product>OpenAI</Product>
<Description>Creating safe artificial general intelligence that benefits all of humanity</Description>
<Version>2.8.36</Version>
<Version>2.8.37</Version>
<AssemblyName>Apps.OpenAI</AssemblyName>
</PropertyGroup>

Expand Down
34 changes: 30 additions & 4 deletions Apps.OpenAI/Dtos/TranscriptionDto.cs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
using Apps.OpenAI.Utils;
using Apps.OpenAI.Utils;
using Newtonsoft.Json;

namespace Apps.OpenAI.Dtos;

public record TranscriptionDto(string Text) : TextDto(Text)
{
public WordDto[] Words { get; init; }
public WordDto[]? Words { get; init; }

public SegmentDto[] Segments { get; init; }
public SegmentDto[]? Segments { get; init; }

public double Temperature { get; init; }

Expand All @@ -20,4 +20,30 @@ public record TranscriptionDto(string Text) : TextDto(Text)

public record WordDto(string Word, double Start, double End);

public record SegmentDto([property: JsonConverter(typeof(FlexibleIdConverter))] string Id, int Seek, double Start, double End, string Text, int[] Tokens, double Temperature, double AvgLogprob, double CompressionRatio, double NoSpeechProb);
/// <summary>
/// One transcription segment as deserialized from the OpenAI audio API response.
/// Most members are nullable because different response formats populate
/// different subsets of fields — presumably the diarized format omits the
/// whisper-style fields and vice versa; verify against the API responses.
/// </summary>
public class SegmentDto
{
    /// <summary>Segment type discriminator; null when the response does not include it.</summary>
    public string? Type { get; init; }

    // FlexibleIdConverter normalizes the id to a string regardless of the
    // JSON representation (project-local converter; see Apps.OpenAI.Utils).
    [JsonConverter(typeof(FlexibleIdConverter))]
    public string Id { get; init; } = string.Empty;

    /// <summary>Seek offset of the segment; null when not provided.</summary>
    public int? Seek { get; init; }

    /// <summary>Segment start time (seconds, per the whisper-style schema — TODO confirm units).</summary>
    public double Start { get; init; }

    /// <summary>Segment end time (same units as <see cref="Start"/>).</summary>
    public double End { get; init; }

    /// <summary>Transcribed text of this segment; defaults to empty rather than null.</summary>
    public string Text { get; init; } = string.Empty;

    /// <summary>Speaker label; only populated by diarization-capable models.</summary>
    public string? Speaker { get; init; }

    /// <summary>Token ids for the segment; null when the response omits them.</summary>
    public int[]? Tokens { get; init; }

    /// <summary>Sampling temperature reported for this segment, when present.</summary>
    public double? Temperature { get; init; }

    /// <summary>Average log-probability reported for this segment, when present.</summary>
    public double? AvgLogprob { get; init; }

    /// <summary>Compression ratio reported for this segment, when present.</summary>
    public double? CompressionRatio { get; init; }

    /// <summary>No-speech probability reported for this segment, when present.</summary>
    public double? NoSpeechProb { get; init; }
}
4 changes: 4 additions & 0 deletions Apps.OpenAI/Models/Responses/Audio/TranscriptionResponse.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,12 @@ public record SegmentResponse(SegmentDto dto)
[Display("Segment ID")]
public string Id { get; set; } = dto.Id;

public string? Type { get; set; } = dto.Type;

public string Text { get; set; } = dto.Text;

public string? Speaker { get; set; } = dto.Speaker;

public double Start { get; set; } = dto.Start;

public double End { get; set; } = dto.End;
Expand Down
30 changes: 17 additions & 13 deletions Tests.OpenAI/AudioServiceTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using Blackbird.Applications.Sdk.Common.Files;
using Blackbird.Applications.Sdk.Common.Exceptions;
using Blackbird.Applications.Sdk.Common.Invocation;
using Newtonsoft.Json.Linq;

namespace Tests.OpenAI;

Expand All @@ -21,50 +22,52 @@ public async Task CreateTranscription_OpenAi_ReturnsTranscription_DiarizedJsonFo
var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe-diarize" };
var request = new TranscriptionRequest
{
File = new FileReference { Name = "tts delorean.mp3" },
Language = "en",
File = new FileReference { Name = "Transcription sample short.mp3" },
Language = "pt",
};

// Act
var result = await handler.CreateTranscription(model, request);
var segments = JArray.Parse(result.Segments);

// Assert
TestContext.WriteLine(result.Transcription);
TestContext.WriteLine(result.Segments);
Assert.IsNotNull(result);
Assert.IsTrue(segments.Count > 0);
Assert.IsTrue(segments.Any(x => x["Speaker"] != null));
}

[TestMethod, ContextDataSource(ConnectionTypes.OpenAiEmbedded, ConnectionTypes.OpenAi)]
public async Task CreateTranscription_OpenAi_ReturnsTranscription_VerboseJsonFormat(InvocationContext context)
public async Task CreateTranscription_OpenAi_DiarizedModel_AssemblesTranscriptionBySpeaker(InvocationContext context)
{
// Arrange
var handler = new AudioActions(context, FileManagementClient);
var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe-diarize" };
var request = new TranscriptionRequest
{
File = new FileReference { Name = "tts delorean.mp3" },
Language = "en",
File = new FileReference { Name = "Transcription sample short.mp3" },
Language = "pt",
};

// Act
var result = await handler.CreateTranscription(model, request);

// Assert
TestContext.WriteLine(result.Transcription);
TestContext.WriteLine(result.Segments);
Console.WriteLine(Newtonsoft.Json.JsonConvert.SerializeObject(result, Newtonsoft.Json.Formatting.Indented));
Assert.IsNotNull(result);
}

[TestMethod, ContextDataSource(ConnectionTypes.OpenAiEmbedded, ConnectionTypes.OpenAi)]
public async Task CreateTranscription_OpenAi_ReturnsTranscription_JsonFormat(InvocationContext context)
public async Task CreateTranscription_OpenAi_StandardModel_ReturnsSingleBlobText(InvocationContext context)
{
// Arrange
var handler = new AudioActions(context, FileManagementClient);
var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe-diarize" };
var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe" };
var request = new TranscriptionRequest
{
File = new FileReference { Name = "tts delorean.mp3" },
Language = "en",
File = new FileReference { Name = "Transcription sample short.mp3" },
Language = "pt",
};

// Act
Expand All @@ -74,6 +77,7 @@ public async Task CreateTranscription_OpenAi_ReturnsTranscription_JsonFormat(Inv
TestContext.WriteLine(result.Transcription);
TestContext.WriteLine(result.Segments);
Assert.IsNotNull(result);
Assert.IsFalse(result.Transcription.Contains("A:"));
}

[TestMethod, ContextDataSource(ConnectionTypes.AzureOpenAi)]
Expand Down Expand Up @@ -104,7 +108,7 @@ public async Task CreateTranscription_OpenAi_Prompt_WithGpt4oTranscribeDiarize_T
var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe-diarize" };
var request = new TranscriptionRequest
{
File = new FileReference { Name = "tts delorean.mp3" },
File = new FileReference { Name = "Transcription sample short.mp3" },
Prompt = "Some prompt",
};

Expand All @@ -125,7 +129,7 @@ public async Task CreateTranscription_OpenAi_TimestampGranularities_WithoutWhisp
var model = new AudioModelIdentifier { ModelId = "gpt-4o-transcribe-diarize" };
var request = new TranscriptionRequest
{
File = new FileReference { Name = "tts delorean.mp3" },
File = new FileReference { Name = "Transcription sample short.mp3" },
TimestampGranularities = ["word"]
};

Expand Down
2 changes: 1 addition & 1 deletion Tests.OpenAI/ConnectionValidatorTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace Tests.OpenAI;
[TestClass]
public class ConnectionValidatorTests : TestBaseWithContext
{
[TestMethod, ContextDataSource(ConnectionTypes.OpenAi)]
[TestMethod, ContextDataSource(ConnectionTypes.AzureOpenAi)]
public async Task ValidateConnection_WithCorrectCredentials_ReturnsValidResult(InvocationContext context)
{
var validator = new ConnectionValidator();
Expand Down
12 changes: 7 additions & 5 deletions Tests.OpenAI/EditTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,21 @@ public async Task Edit_xliff(InvocationContext context)
var modelIdentifier = new TextChatModelIdentifier { ModelId = "gpt-4o" };
var editRequest = new EditContentRequest
{
File = new FileReference { Name = "GUID-99E95005-E212-481D-AEBC-67DFA3BD38E8_1_en-US-en-zh_cn-Tr.mxliff" },
File = new FileReference { Name = "pmdm-13460-all-locales5832465158620282096-en-ja-Pe.mxliff" },
OutputFileHandling = "xliff1",
ProcessOnlySegmentState = "Initial",
FilterGlossary = true,

ModifiedBy = "1441948"
};
var reasoningEffortRequest = new ReasoningEffortRequest
{
//ReasoningEffort = "low"

};
string? systemMessage = "Your task is to post-edit translation segments by correcting critical errors, comparing each target to its source. Critical errors include tag misplacements, malformed tags, number mismatches, translation omissions, or glossary term violations. Tags appear as combinations of {, }, <, or > with a number (e.g., {1}, <2}, {3>), and these must match the source exactly. Tags define font styles of texts between two tags or represent inserted links and line breaks. \nDo not revert any translation to English.\nDo no change translation style. ";
var glossaryRequest = new GlossaryRequest();
string? systemMessage = "Perform critical-errors-only post-editing.\r\n\r\nAssume every target segment is already final and approved.\r\nLeave the target unchanged unless there is a clear critical error.\r\n\r\nCritical errors are limited to:\r\n- number or unit mismatch\r\n- omission or unjustified addition\r\n- clear mistranslation that changes meaning\r\n- broken tags/formatting\r\n- explicit glossary violation where a Japanese glossary target exists\r\n\r\nDo not edit for fluency, style, consistency, or terminology normalization alone.\r\nDo not replace approved English terms in the Japanese target unless clearly required by an explicit Japanese glossary entry.\r\nIf no clear critical error exists, return the target exactly unchanged.\r\nWhen unsure, do not change anything.\r\n";
var glossaryRequest = new GlossaryRequest { Glossary= new FileReference { Name= "PMDM TB.tbx" } };

var result = await actions.EditContent(modelIdentifier, editRequest, systemMessage, glossaryRequest, reasoningEffortRequest);
var result = await actions.EditContent(modelIdentifier, editRequest, systemMessage, glossaryRequest, reasoningEffortRequest, bucketSize: 25, ProcessLockedSegments: false);

Assert.IsNotNull(result);
//Assert.Contains("contentful", result.File.Name);
Expand Down