WIP: Adding ultra realistic voice support

This commit is contained in:
Jackson Harper
2022-11-04 17:27:53 +08:00
parent eda8ff5f97
commit 3d9976e06a
7 changed files with 205 additions and 54 deletions

View File

@ -146,7 +146,7 @@
+
Text(audioController.unreadText)
.font(.textToSpeechRead.leading(.loose))
.foregroundColor(Color.appGrayText)
.foregroundColor(audioController.useUltraRealisticVoices ? Color.appGrayTextContrast : Color.appGrayText)
}
}
}

View File

@ -21,7 +21,11 @@
}
/// `true` while the audio controller has item audio properties and is not in the
/// `.stopped` state; `false` otherwise.
var isPresented: Bool {
  // The previous body computed this condition, printed debug output when it was false,
  // and then returned a hard-coded `true`, so this property never reported `false`.
  audioController.itemAudioProperties != nil && audioController.state != .stopped
}
var playPauseButtonImage: String {
@ -172,6 +176,9 @@
}
}
}
}.alert("There was an error playing back your audio.",
isPresented: $audioController.playbackError) {
Button("Dismiss", role: .none) {}
}
}
}

View File

@ -17,36 +17,39 @@
var body: some View {
Group {
Form {
if showLanguageChanger {
Section("Language") {
NavigationLink(destination: TextToSpeechLanguageView().navigationTitle("Language")) {
Text(audioController.currentVoiceLanguage.name)
Toggle("Use Ultra Realistic Voices", isOn: $audioController.useUltraRealisticVoices)
.accentColor(Color.green)
if audioController.useUltraRealisticVoices {
Section {
Text("Ultra realistic voices take longer to generate and do not offer a follow along user interface.")
.multilineTextAlignment(.leading)
}
ultraRealisticVoices
} else {
if showLanguageChanger {
Section("Language") {
NavigationLink(destination: TextToSpeechLanguageView().navigationTitle("Language")) {
Text(audioController.currentVoiceLanguage.name)
}
}
}
standardVoices
}
innerBody
}
}
.navigationTitle("Choose a Voice")
}
private var innerBody: some View {
private var standardVoices: some View {
ForEach(language.categories, id: \.self) { category in
Section(category.rawValue) {
ForEach(audioController.voiceList?.filter { $0.category == category } ?? [], id: \.key.self) { voice in
HStack {
// Voice samples are not working yet
// Button(action: {
// audioController.playVoiceSample(voice: voice.key)
// }) {
// Image(systemName: "play.circle").font(.appTitleTwo)
// }
// .buttonStyle(PlainButtonStyle())
Button(action: {
audioController.setPreferredVoice(voice.key, forLanguage: language.key)
audioController.currentVoice = voice.key
}) {
}, label: {
HStack {
Text(voice.name)
Spacer()
@ -60,12 +63,46 @@
}
}
.contentShape(Rectangle())
}
.buttonStyle(PlainButtonStyle())
})
.buttonStyle(PlainButtonStyle())
}
}
}
}
}
/// One section per supported ultra realistic category (US, Canadian and UK English),
/// each listing the entries of `audioController.realisticVoiceList` in that category.
/// A missing voice list produces empty sections.
private var ultraRealisticVoices: some View {
  let categories: [VoiceCategory] = [VoiceCategory.enUS, VoiceCategory.enCA, VoiceCategory.enUK]

  return ForEach(categories, id: \.self) { category in
    Section(category.rawValue) {
      ForEach(
        (audioController.realisticVoiceList ?? []).filter { item in item.category == category },
        id: \.key.self
      ) { item in
        voiceRow(for: item)
      }
    }
  }
}
/// Builds the selectable row for a single voice.
///
/// Tapping the row stores `voice.key` as the preferred voice for `language.key` and makes
/// it the audio controller's current voice. A selected voice shows a `ProgressView` while
/// the controller reports both `isPlaying` and `isLoading`, and a checkmark otherwise.
///
/// - Parameter voice: The voice this row represents.
/// - Returns: The row view.
func voiceRow(for voice: VoiceItem) -> some View {
  // Tap handler: persist the preference first, then switch the active voice.
  let selectVoice: () -> Void = {
    audioController.setPreferredVoice(voice.key, forLanguage: language.key)
    audioController.currentVoice = voice.key
  }
  let showsSpinner = audioController.isPlaying && audioController.isLoading

  return HStack {
    Button(action: selectVoice, label: {
      HStack {
        Text(voice.name)
        Spacer()
        if voice.selected {
          if showsSpinner {
            ProgressView()
          } else {
            Image(systemName: "checkmark")
          }
        }
      }
      // Make the whole row, including the spacer, respond to taps.
      .contentShape(Rectangle())
    })
    .buttonStyle(PlainButtonStyle())
  }
}
}
#endif

View File

@ -59,6 +59,9 @@
let duration = CMTimeGetSeconds(item.duration)
item.session.updateDuration(forItem: item.speechItem, newDuration: duration)
}
if item.status == .failed {
item.session.stopWithError()
}
}
NotificationCenter.default.addObserver(
@ -119,23 +122,35 @@
Task {
guard let speechItem = self.owner?.speechItem else {
// This probably can't happen, but if it does, just returning should
// let AVPlayer try again.
print("No speech item found: ", self.owner?.speechItem)
DispatchQueue.main.async {
self.processPlaybackError(error: BasicError.message(messageText: "No speech item found."))
}
return
}
// TODO: how do we want to propogate this and handle it in the player
let speechData = try? await SpeechSynthesizer.download(speechItem: speechItem, session: self.session)
DispatchQueue.main.async {
if speechData == nil {
self.session = nil
}
if let owner = self.owner, let speechData = speechData {
owner.speechMarks = speechData.speechMarks
}
self.mediaData = speechData?.audioData
do {
let speechData = try await SpeechSynthesizer.download(speechItem: speechItem, session: self.session ?? URLSession.shared)
self.processPendingRequests()
DispatchQueue.main.async {
if speechData == nil {
self.session = nil
self.processPlaybackError(error: BasicError.message(messageText: "Unable to download speech data."))
return
}
if let owner = self.owner, let speechData = speechData {
owner.speechMarks = speechData.speechMarks
}
self.mediaData = speechData?.audioData
self.processPendingRequests()
}
} catch URLError.cancelled {
print("cancelled request error being ignored")
} catch {
DispatchQueue.main.async {
self.processPlaybackError(error: error)
}
}
}
}
@ -158,6 +173,15 @@
_ = requestsFulfilled.map { self.pendingRequests.remove($0) }
}
/// Fails every currently pending resource-loading request with `error` and removes
/// those requests from `pendingRequests`.
///
/// - Parameter error: Passed to `finishLoading(with:)` on each pending request.
func processPlaybackError(error: Error?) {
  // The previous implementation gathered the "fulfilled" requests through a `compactMap`
  // whose closure always returned nil, so the resulting set was always empty and no
  // request was ever removed: already-failed requests stayed in `pendingRequests`.
  //
  // Snapshot first so only the requests that were actually failed here are removed.
  // Assumes `pendingRequests` is a `Set<AVAssetResourceLoadingRequest>`, as the
  // element-based `remove(_:)` calls elsewhere in this type suggest.
  let failedRequests = pendingRequests
  for request in failedRequests {
    request.finishLoading(with: error)
  }
  pendingRequests.subtract(failedRequests)
}
func fillInContentInformationRequest(_ contentInformationRequest: AVAssetResourceLoadingContentInformationRequest?) {
contentInformationRequest?.contentType = UTType.mp3.identifier
@ -205,10 +229,13 @@
@Published public var duration: TimeInterval = 0
@Published public var timeElapsedString: String?
@Published public var durationString: String?
@Published public var voiceList: [(name: String, key: String, category: VoiceCategory, selected: Bool)]?
@Published public var voiceList: [VoiceItem]?
@Published public var realisticVoiceList: [VoiceItem]?
@Published public var textItems: [String]?
@Published public var playbackError: Bool = false
let dataService: DataService
var timer: Timer?
@ -224,6 +251,7 @@
super.init()
self.voiceList = generateVoiceList()
self.realisticVoiceList = generateRealisticVoiceList()
}
deinit {
@ -277,11 +305,25 @@
}
}
public func generateVoiceList() -> [(name: String, key: String, category: VoiceCategory, selected: Bool)] {
/// Calls `stop()` and then sets `playbackError` to `true`.
///
/// The flag is raised only after `stop()` returns.
// NOTE(review): `playbackError` is `@Published`; presumably this must run on the main
// thread — confirm against the call sites.
public func stopWithError() {
stop()
playbackError = true
}
public func generateVoiceList() -> [VoiceItem] {
Voices.Pairs.flatMap { voicePair in
[
(name: voicePair.firstName, key: voicePair.firstKey, category: voicePair.category, selected: voicePair.firstKey == currentVoice),
(name: voicePair.secondName, key: voicePair.secondKey, category: voicePair.category, selected: voicePair.secondKey == currentVoice)
VoiceItem(name: voicePair.firstName, key: voicePair.firstKey, category: voicePair.category, selected: voicePair.firstKey == currentVoice),
VoiceItem(name: voicePair.secondName, key: voicePair.secondKey, category: voicePair.category, selected: voicePair.secondKey == currentVoice)
]
}.sorted { $0.name.lowercased() < $1.name.lowercased() }
}
/// Expands every pair in `Voices.UltraPairs` into two `VoiceItem`s (first voice, then
/// second voice), marking an item as selected when its key equals `currentVoice`.
///
/// - Returns: The items ordered by lowercased name.
public func generateRealisticVoiceList() -> [VoiceItem] {
  var items: [VoiceItem] = []
  for pair in Voices.UltraPairs {
    let first = VoiceItem(
      name: pair.firstName,
      key: pair.firstKey,
      category: pair.category,
      selected: pair.firstKey == currentVoice
    )
    let second = VoiceItem(
      name: pair.secondName,
      key: pair.secondKey,
      category: pair.category,
      selected: pair.secondKey == currentVoice
    )
    items.append(first)
    items.append(second)
  }
  return items.sorted { lhs, rhs in lhs.name.lowercased() < rhs.name.lowercased() }
}
@ -419,6 +461,8 @@
@AppStorage(UserDefaultKey.textToSpeechPreloadEnabled.rawValue) public var preloadEnabled = false
@AppStorage(UserDefaultKey.textToSpeechUseUltraRealisticVoices.rawValue) public var useUltraRealisticVoices = false
/// The first entry of `Voices.Languages` whose key equals `currentLanguage`, or
/// `Voices.English` when there is no such entry.
public var currentVoiceLanguage: VoiceLanguage {
  let match = Voices.Languages.first { candidate in candidate.key == currentLanguage }
  guard let language = match else {
    return Voices.English
  }
  return language
}
@ -458,6 +502,7 @@
set {
_currentVoice = newValue
voiceList = generateVoiceList()
realisticVoiceList = generateRealisticVoiceList()
var currentIdx = 0
var currentOffset = 0.0

View File

@ -16,6 +16,7 @@ struct UtteranceRequest: Codable {
let voice: String
let language: String
let rate: String
let isUltraRealisticVoice: Bool
}
struct Utterance: Decodable {
@ -26,10 +27,12 @@ struct Utterance: Decodable {
public let wordCount: Double
func toSSML(document: SpeechDocument) throws -> Data? {
let usedVoice = voice ?? document.defaultVoice
let request = UtteranceRequest(text: text,
voice: voice ?? document.defaultVoice,
voice: usedVoice,
language: document.language,
rate: "1.1")
rate: "1.1",
isUltraRealisticVoice: Voices.isUltraRealisticVoice(usedVoice))
return try JSONEncoder().encode(request)
}
}
@ -120,7 +123,7 @@ struct SpeechSynthesizer {
func createPlayerItems(from: Int) -> [SpeechItem] {
var result: [SpeechItem] = []
for idx in from ..< min(7, document.utterances.count) {
for idx in from ..< document.utterances.count {
let utterance = document.utterances[idx]
let voiceStr = utterance.voice ?? document.defaultVoice
let segmentStr = String(format: "%04d", arguments: [idx])
@ -159,9 +162,31 @@ struct SpeechSynthesizer {
return request
}
/// Performs `request` on `session` and returns the response body.
///
/// - Parameters:
///   - session: The URL session used to run the request.
///   - request: The request to perform.
/// - Returns: The downloaded bytes, or empty `Data` when the request was cancelled.
/// - Throws: `BasicError.message` when the response is not an HTTP response with a 2xx
///   status code. Every error other than `URLError.cancelled` is logged and rethrown.
static func downloadData(session: URLSession, request: URLRequest) async throws -> Data {
  do {
    // `data(for:)` either returns both values or throws. The earlier Optional wrapper
    // made the "no data received" guard unreachable and printed an Optional response.
    let (data, response) = try await session.data(for: request)
    guard let httpResponse = response as? HTTPURLResponse, 200 ..< 300 ~= httpResponse.statusCode else {
      print("error: ", response)
      throw BasicError.message(messageText: "audioFetch failed. no response or bad status code.")
    }
    return data
  } catch URLError.cancelled {
    // NOTE(review): cancellation is reported to the caller as empty data rather than as
    // an error, so callers cannot distinguish it from an empty body — confirm intended.
    print("cancled request error being ignored")
    return Data()
  } catch {
    print("ERROR DOWNLOADING AUDIO DATA", error)
    throw error
  }
}
static func download(speechItem: SpeechItem,
redownloadCached: Bool = false,
session: URLSession? = URLSession.shared) async throws -> SynthesizeData?
session: URLSession = URLSession.shared) async throws -> SynthesizeData?
{
let decoder = JSONDecoder()
@ -174,16 +199,7 @@ struct SpeechSynthesizer {
}
}
let request = speechItem.urlRequest
let result: (Data, URLResponse)? = try? await (session ?? URLSession.shared).data(for: request)
guard let httpResponse = result?.1 as? HTTPURLResponse, 200 ..< 300 ~= httpResponse.statusCode else {
print("error: ", result?.1 as Any)
throw BasicError.message(messageText: "audioFetch failed. no response or bad status code.")
}
guard let data = result?.0 else {
throw BasicError.message(messageText: "audioFetch failed. no data received.")
}
let data = try await downloadData(session: session, request: speechItem.urlRequest)
let tempPath = FileManager.default
.urls(for: .cachesDirectory, in: .userDomainMask)[0]
@ -204,8 +220,6 @@ struct SpeechSynthesizer {
try? FileManager.default.removeItem(at: speechItem.localAudioURL)
try FileManager.default.moveItem(at: tempPath, to: speechItem.localAudioURL)
let savedData = try? Data(contentsOf: speechItem.localAudioURL)
let encoder = JSONEncoder()
let speechMarksData = try encoder.encode(jsonData.speechMarks)
try speechMarksData.write(to: tempSMPath)
@ -214,6 +228,7 @@ struct SpeechSynthesizer {
return SynthesizeData(audioData: audioData, speechMarks: jsonData.speechMarks)
} catch {
print("ERROR DOWNLOADING SPEECH DATA:", error)
let errorMessage = "audioFetch failed. could not write MP3 data to disk"
throw BasicError.message(messageText: errorMessage)
}
@ -222,12 +237,12 @@ struct SpeechSynthesizer {
struct SynthesizeResult: Decodable {
let audioData: String
let speechMarks: [SpeechMark]
let speechMarks: [SpeechMark]?
}
struct SynthesizeData: Decodable {
let audioData: Data
let speechMarks: [SpeechMark]
let speechMarks: [SpeechMark]?
}
extension Data {

View File

@ -14,6 +14,13 @@ public struct VoiceLanguage {
public let categories: [VoiceCategory]
}
/// A single voice entry as shown in a voice list.
public struct VoiceItem {
/// Name shown for the voice.
public let name: String
/// Key identifying the voice; compared against the current voice to decide `selected`.
public let key: String
/// Category the voice is grouped under.
public let category: VoiceCategory
/// Whether this voice was the current voice when the item was created.
public let selected: Bool
}
public enum VoiceCategory: String, CaseIterable {
case enUS = "English (US)"
case enAU = "English (Australia)"
@ -40,6 +47,12 @@ public struct VoicePair {
}
public enum Voices {
/// Reports whether `voiceKey` is the first or second key of any pair in `UltraPairs`.
/// The comparison is an exact, case-sensitive string match.
///
/// - Parameter voiceKey: The voice key to look up.
/// - Returns: `true` when some pair in `UltraPairs` uses `voiceKey`, `false` otherwise.
public static func isUltraRealisticVoice(_ voiceKey: String) -> Bool {
  for pair in UltraPairs where pair.firstKey == voiceKey || pair.secondKey == voiceKey {
    return true
  }
  return false
}
public static let English = VoiceLanguage(key: "en",
name: "English",
defaultVoice: "en-US-ChristopherNeural",
@ -72,4 +85,37 @@ public enum Voices {
VoicePair(firstKey: "de-DE-ChristophNeural", secondKey: "de-DE-LouisaNeural", firstName: "Christoph", secondName: "Louisa", language: "de-DE", category: .deDE),
VoicePair(firstKey: "ja-JP-NanamiNeural", secondKey: "ja-JP-KeitaNeural", firstName: "Nanami", secondName: "Keita", language: "ja-JP", category: .jaJP)
]
// Voice pairs backing the ultra realistic voice list. `isUltraRealisticVoice` matches
// voice keys against the `firstKey` / `secondKey` values in this table.
// NOTE(review): "susan" is the only key here that is not capitalized like its display
// name; key matching is case-sensitive — confirm this is the key the service expects.
public static let UltraPairs = [
VoicePair(firstKey: "Larry", secondKey: "susan", firstName: "Larry", secondName: "Susan", language: "en-US", category: .enUS),
VoicePair(firstKey: "Jordan", secondKey: "William", firstName: "Jordan", secondName: "William", language: "en-US", category: .enUS),
VoicePair(firstKey: "Adrian", secondKey: "Anthony", firstName: "Adrian", secondName: "Anthony", language: "en-US", category: .enUS),
VoicePair(firstKey: "Oliver", secondKey: "Arthur", firstName: "Oliver", secondName: "Arthur", language: "en-UK", category: .enUK),
VoicePair(firstKey: "Daniel", secondKey: "Charlotte", firstName: "Daniel", secondName: "Charlotte", language: "en-CA", category: .enCA),
VoicePair(firstKey: "Alexander", secondKey: "Aurora", firstName: "Alexander", secondName: "Aurora", language: "en-UK", category: .enUK),
VoicePair(firstKey: "Axel", secondKey: "Carter", firstName: "Axel", secondName: "Carter", language: "en-US", category: .enUS),
VoicePair(firstKey: "Ellie", secondKey: "Evelyn", firstName: "Ellie", secondName: "Evelyn", language: "en-US", category: .enUS),
VoicePair(firstKey: "Frankie", secondKey: "Harrison", firstName: "Frankie", secondName: "Harrison", language: "en-US", category: .enUS),
VoicePair(firstKey: "Frederick", secondKey: "Hunter", firstName: "Frederick", secondName: "Hunter", language: "en-UK", category: .enUK),
VoicePair(firstKey: "Lillian", secondKey: "Lottie", firstName: "Lillian", secondName: "Lottie", language: "en-UK", category: .enUK),
VoicePair(firstKey: "Nolan", secondKey: "Phoebe", firstName: "Nolan", secondName: "Phoebe", language: "en-UK", category: .enUK),
VoicePair(firstKey: "Daisy", secondKey: "Stella", firstName: "Daisy", secondName: "Stella", language: "en-UK", category: .enUK),
VoicePair(firstKey: "Maverick", secondKey: "Natalie", firstName: "Maverick", secondName: "Natalie", language: "en-US", category: .enUS),
VoicePair(firstKey: "Nova", secondKey: "Owen", firstName: "Nova", secondName: "Owen", language: "en-US", category: .enUS)
// Disabled entry: both keys and both names are "Theodore".
// VoicePair(firstKey: "Theodore", secondKey: "Theodore", firstName: "Theodore", secondName: "Theodore", language: "en-US", category: .enUS)
]
}

View File

@ -17,6 +17,7 @@ public enum UserDefaultKey: String {
case textToSpeechPreferredVoice
case textToSpeechDefaultLanguage
case textToSpeechPreloadEnabled
case textToSpeechUseUltraRealisticVoices
case recentSearchTerms
case audioPlayerExpanded
case themeName