WIP: Adding ultra realistic voice support
This commit is contained in:
@ -146,7 +146,7 @@
|
||||
+
|
||||
Text(audioController.unreadText)
|
||||
.font(.textToSpeechRead.leading(.loose))
|
||||
.foregroundColor(Color.appGrayText)
|
||||
.foregroundColor(audioController.useUltraRealisticVoices ? Color.appGrayTextContrast : Color.appGrayText)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -21,7 +21,11 @@
|
||||
}
|
||||
|
||||
/// Whether the audio player UI should currently be presented.
///
/// True only while the audio controller has audio properties for an item and
/// its playback state is anything other than `.stopped`.
var isPresented: Bool {
  let presented = audioController.itemAudioProperties != nil && audioController.state != .stopped
  if !presented {
    // Diagnostic output kept from the work-in-progress change; `as Any` avoids
    // the implicit Optional-to-Any coercion warning.
    print("isPresented: ", audioController.itemAudioProperties as Any, audioController.state)
  }
  // Return the computed value; a hard-coded `true` here would keep the player
  // presented even after playback has stopped or no item is loaded.
  return presented
}
|
||||
|
||||
var playPauseButtonImage: String {
|
||||
@ -172,6 +176,9 @@
|
||||
}
|
||||
}
|
||||
}
|
||||
}.alert("There was an error playing back your audio.",
|
||||
isPresented: $audioController.playbackError) {
|
||||
Button("Dismiss", role: .none) {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -17,36 +17,39 @@
|
||||
/// Voice picker form.
///
/// The form always starts with the "Use Ultra Realistic Voices" toggle. When the
/// toggle is on, an explanatory note and the ultra realistic voice sections are
/// shown; otherwise the optional language link and the standard voice sections
/// are shown.
var body: some View {
  Group {
    Form {
      Toggle("Use Ultra Realistic Voices", isOn: $audioController.useUltraRealisticVoices)
        .accentColor(Color.green)

      if audioController.useUltraRealisticVoices {
        Section {
          Text("Ultra realistic voices take longer to generate and do not offer a follow along user interface.")
            .multilineTextAlignment(.leading)
        }
        ultraRealisticVoices
      } else {
        // The language changer is only offered for standard voices.
        if showLanguageChanger {
          Section("Language") {
            NavigationLink(destination: TextToSpeechLanguageView().navigationTitle("Language")) {
              Text(audioController.currentVoiceLanguage.name)
            }
          }
        }
        standardVoices
      }
    }
  }
  .navigationTitle("Choose a Voice")
}
|
||||
|
||||
private var innerBody: some View {
|
||||
private var standardVoices: some View {
|
||||
ForEach(language.categories, id: \.self) { category in
|
||||
Section(category.rawValue) {
|
||||
ForEach(audioController.voiceList?.filter { $0.category == category } ?? [], id: \.key.self) { voice in
|
||||
HStack {
|
||||
// Voice samples are not working yet
|
||||
// Button(action: {
|
||||
// audioController.playVoiceSample(voice: voice.key)
|
||||
// }) {
|
||||
// Image(systemName: "play.circle").font(.appTitleTwo)
|
||||
// }
|
||||
// .buttonStyle(PlainButtonStyle())
|
||||
|
||||
Button(action: {
|
||||
audioController.setPreferredVoice(voice.key, forLanguage: language.key)
|
||||
audioController.currentVoice = voice.key
|
||||
}) {
|
||||
}, label: {
|
||||
HStack {
|
||||
Text(voice.name)
|
||||
Spacer()
|
||||
@ -60,12 +63,46 @@
|
||||
}
|
||||
}
|
||||
.contentShape(Rectangle())
|
||||
}
|
||||
.buttonStyle(PlainButtonStyle())
|
||||
})
|
||||
.buttonStyle(PlainButtonStyle())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// One section per supported English locale (US, Canada, UK), each listing the
/// ultra realistic voices that belong to that category.
private var ultraRealisticVoices: some View {
  let supportedCategories: [VoiceCategory] = [VoiceCategory.enUS, VoiceCategory.enCA, VoiceCategory.enUK]
  return ForEach(supportedCategories, id: \.self) { voiceCategory in
    Section(voiceCategory.rawValue) {
      ForEach(realisticVoices(in: voiceCategory), id: \.key) { item in
        voiceRow(for: item)
      }
    }
  }
}

/// Ultra realistic voices from the audio controller that belong to `category`;
/// empty when the controller has no realistic voice list.
private func realisticVoices(in category: VoiceCategory) -> [VoiceItem] {
  guard let allVoices = audioController.realisticVoiceList else {
    return []
  }
  return allVoices.filter { $0.category == category }
}
|
||||
|
||||
/// Builds a tappable row for a single voice.
///
/// Tapping the row stores the voice as the preferred voice for the current
/// language and makes it the controller's current voice. A selected voice shows
/// either a progress indicator or a checkmark at the trailing edge.
///
/// - Parameter voice: The voice to render.
/// - Returns: The row view.
func voiceRow(for voice: VoiceItem) -> some View {
  HStack {
    Button(
      action: { applyVoiceSelection(voice) },
      label: {
        HStack {
          Text(voice.name)
          Spacer()

          if voice.selected {
            selectedVoiceIndicator
          }
        }
        // Make the whole row (including the spacer) respond to taps.
        .contentShape(Rectangle())
      }
    )
    .buttonStyle(PlainButtonStyle())
  }
}

/// Persists `voice` as the preferred voice for the current language and switches to it.
private func applyVoiceSelection(_ voice: VoiceItem) {
  audioController.setPreferredVoice(voice.key, forLanguage: language.key)
  audioController.currentVoice = voice.key
}

/// Trailing indicator for the selected voice: a spinner while the controller
/// reports both playing and loading, otherwise a checkmark.
@ViewBuilder
private var selectedVoiceIndicator: some View {
  if audioController.isPlaying && audioController.isLoading {
    ProgressView()
  } else {
    Image(systemName: "checkmark")
  }
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -59,6 +59,9 @@
|
||||
let duration = CMTimeGetSeconds(item.duration)
|
||||
item.session.updateDuration(forItem: item.speechItem, newDuration: duration)
|
||||
}
|
||||
if item.status == .failed {
|
||||
item.session.stopWithError()
|
||||
}
|
||||
}
|
||||
|
||||
NotificationCenter.default.addObserver(
|
||||
@ -119,23 +122,35 @@
|
||||
Task {
|
||||
guard let speechItem = self.owner?.speechItem else {
|
||||
// This probably can't happen, but if it does, just returning should
|
||||
// let AVPlayer try again.
|
||||
print("No speech item found: ", self.owner?.speechItem)
|
||||
DispatchQueue.main.async {
|
||||
self.processPlaybackError(error: BasicError.message(messageText: "No speech item found."))
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// TODO: how do we want to propagate this and handle it in the player
|
||||
let speechData = try? await SpeechSynthesizer.download(speechItem: speechItem, session: self.session)
|
||||
DispatchQueue.main.async {
|
||||
if speechData == nil {
|
||||
self.session = nil
|
||||
}
|
||||
if let owner = self.owner, let speechData = speechData {
|
||||
owner.speechMarks = speechData.speechMarks
|
||||
}
|
||||
self.mediaData = speechData?.audioData
|
||||
do {
|
||||
let speechData = try await SpeechSynthesizer.download(speechItem: speechItem, session: self.session ?? URLSession.shared)
|
||||
|
||||
self.processPendingRequests()
|
||||
DispatchQueue.main.async {
|
||||
if speechData == nil {
|
||||
self.session = nil
|
||||
self.processPlaybackError(error: BasicError.message(messageText: "Unable to download speech data."))
|
||||
return
|
||||
}
|
||||
|
||||
if let owner = self.owner, let speechData = speechData {
|
||||
owner.speechMarks = speechData.speechMarks
|
||||
}
|
||||
self.mediaData = speechData?.audioData
|
||||
|
||||
self.processPendingRequests()
|
||||
}
|
||||
} catch URLError.cancelled {
|
||||
print("cancelled request error being ignored")
|
||||
} catch {
|
||||
DispatchQueue.main.async {
|
||||
self.processPlaybackError(error: error)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -158,6 +173,15 @@
|
||||
_ = requestsFulfilled.map { self.pendingRequests.remove($0) }
|
||||
}
|
||||
|
||||
/// Fails every pending resource-loading request with `error` and forgets them.
///
/// - Parameter error: The error handed to each request's `finishLoading(with:)`;
///   may be nil.
func processPlaybackError(error: Error?) {
  for request in pendingRequests {
    request.finishLoading(with: error)
  }
  // Every request has now been finished, so none may stay queued. Collecting the
  // finished requests through a closure that always returns nil leaves the
  // removal set empty and the requests stuck in `pendingRequests`.
  pendingRequests.removeAll()
}
|
||||
|
||||
func fillInContentInformationRequest(_ contentInformationRequest: AVAssetResourceLoadingContentInformationRequest?) {
|
||||
contentInformationRequest?.contentType = UTType.mp3.identifier
|
||||
|
||||
@ -205,10 +229,13 @@
|
||||
@Published public var duration: TimeInterval = 0
|
||||
@Published public var timeElapsedString: String?
|
||||
@Published public var durationString: String?
|
||||
@Published public var voiceList: [(name: String, key: String, category: VoiceCategory, selected: Bool)]?
|
||||
@Published public var voiceList: [VoiceItem]?
|
||||
@Published public var realisticVoiceList: [VoiceItem]?
|
||||
|
||||
@Published public var textItems: [String]?
|
||||
|
||||
@Published public var playbackError: Bool = false
|
||||
|
||||
let dataService: DataService
|
||||
|
||||
var timer: Timer?
|
||||
@ -224,6 +251,7 @@
|
||||
|
||||
super.init()
|
||||
self.voiceList = generateVoiceList()
|
||||
self.realisticVoiceList = generateRealisticVoiceList()
|
||||
}
|
||||
|
||||
deinit {
|
||||
@ -277,11 +305,25 @@
|
||||
}
|
||||
}
|
||||
|
||||
public func generateVoiceList() -> [(name: String, key: String, category: VoiceCategory, selected: Bool)] {
|
||||
/// Stops playback, then raises the published `playbackError` flag.
public func stopWithError() {
  // The deferred assignment runs after `stop()` returns, keeping the original order.
  defer { playbackError = true }
  stop()
}
|
||||
|
||||
/// Builds the list of standard voices from `Voices.Pairs`.
///
/// Each pair contributes two items (first and second voice); an item is marked
/// selected when its key equals `currentVoice`.
///
/// - Returns: The voice items ordered by lowercased name.
public func generateVoiceList() -> [VoiceItem] {
  Voices.Pairs.flatMap { voicePair in
    [
      VoiceItem(name: voicePair.firstName, key: voicePair.firstKey, category: voicePair.category, selected: voicePair.firstKey == currentVoice),
      VoiceItem(name: voicePair.secondName, key: voicePair.secondKey, category: voicePair.category, selected: voicePair.secondKey == currentVoice)
    ]
  }.sorted { $0.name.lowercased() < $1.name.lowercased() }
}
|
||||
|
||||
/// Builds the list of ultra realistic voices from `Voices.UltraPairs`.
///
/// Each pair contributes its first and second voice, in that order; an item is
/// marked selected when its key equals `currentVoice`.
///
/// - Returns: The voice items ordered by lowercased name.
public func generateRealisticVoiceList() -> [VoiceItem] {
  var items: [VoiceItem] = []
  for pair in Voices.UltraPairs {
    let first = VoiceItem(name: pair.firstName, key: pair.firstKey, category: pair.category, selected: pair.firstKey == currentVoice)
    let second = VoiceItem(name: pair.secondName, key: pair.secondKey, category: pair.category, selected: pair.secondKey == currentVoice)
    items.append(first)
    items.append(second)
  }
  return items.sorted { lhs, rhs in
    lhs.name.lowercased() < rhs.name.lowercased()
  }
}
|
||||
@ -419,6 +461,8 @@
|
||||
|
||||
@AppStorage(UserDefaultKey.textToSpeechPreloadEnabled.rawValue) public var preloadEnabled = false
|
||||
|
||||
@AppStorage(UserDefaultKey.textToSpeechUseUltraRealisticVoices.rawValue) public var useUltraRealisticVoices = false
|
||||
|
||||
public var currentVoiceLanguage: VoiceLanguage {
|
||||
Voices.Languages.first(where: { $0.key == currentLanguage }) ?? Voices.English
|
||||
}
|
||||
@ -458,6 +502,7 @@
|
||||
set {
|
||||
_currentVoice = newValue
|
||||
voiceList = generateVoiceList()
|
||||
realisticVoiceList = generateRealisticVoiceList()
|
||||
|
||||
var currentIdx = 0
|
||||
var currentOffset = 0.0
|
||||
|
||||
@ -16,6 +16,7 @@ struct UtteranceRequest: Codable {
|
||||
let voice: String
|
||||
let language: String
|
||||
let rate: String
|
||||
let isUltraRealisticVoice: Bool
|
||||
}
|
||||
|
||||
struct Utterance: Decodable {
|
||||
@ -26,10 +27,12 @@ struct Utterance: Decodable {
|
||||
public let wordCount: Double
|
||||
|
||||
/// Encodes this utterance as the JSON body of an `UtteranceRequest`.
///
/// The utterance's own voice is used when set, otherwise the document's default
/// voice. The request is flagged as ultra realistic according to
/// `Voices.isUltraRealisticVoice` for that voice.
///
/// - Parameter document: Supplies the fallback voice and the language.
/// - Returns: The JSON-encoded request.
/// - Throws: Any error raised by `JSONEncoder.encode`.
func toSSML(document: SpeechDocument) throws -> Data? {
  let usedVoice = voice ?? document.defaultVoice
  let request = UtteranceRequest(text: text,
                                 voice: usedVoice,
                                 language: document.language,
                                 rate: "1.1",
                                 isUltraRealisticVoice: Voices.isUltraRealisticVoice(usedVoice))
  return try JSONEncoder().encode(request)
}
|
||||
}
|
||||
@ -120,7 +123,7 @@ struct SpeechSynthesizer {
|
||||
func createPlayerItems(from: Int) -> [SpeechItem] {
|
||||
var result: [SpeechItem] = []
|
||||
|
||||
for idx in from ..< min(7, document.utterances.count) {
|
||||
for idx in from ..< document.utterances.count {
|
||||
let utterance = document.utterances[idx]
|
||||
let voiceStr = utterance.voice ?? document.defaultVoice
|
||||
let segmentStr = String(format: "%04d", arguments: [idx])
|
||||
@ -159,9 +162,31 @@ struct SpeechSynthesizer {
|
||||
return request
|
||||
}
|
||||
|
||||
/// Performs `request` on `session` and returns the response body.
///
/// - Parameters:
///   - session: The URL session used for the transfer.
///   - request: The request to perform.
/// - Returns: The response body for a 2xx HTTP response, or empty `Data` when the
///   request was cancelled.
/// - Throws: `BasicError.message` when the response is not an HTTP response with a
///   2xx status code; any other transfer error is logged and rethrown unchanged.
static func downloadData(session: URLSession, request: URLRequest) async throws -> Data {
  do {
    // `data(for:)` either throws or yields a complete (Data, URLResponse) pair,
    // so no Optional wrapper or "no data received" check is needed.
    let (data, response) = try await session.data(for: request)

    guard let httpResponse = response as? HTTPURLResponse, 200 ..< 300 ~= httpResponse.statusCode else {
      print("error: ", response)
      throw BasicError.message(messageText: "audioFetch failed. no response or bad status code.")
    }

    return data
  } catch URLError.cancelled {
    // NOTE(review): cancellation is swallowed and reported as empty data, so a
    // caller's own `catch URLError.cancelled` will not fire for this path —
    // confirm callers can handle an empty payload.
    print("cancled request error being ignored")
    return Data()
  } catch {
    print("ERROR DOWNLOADING AUDIO DATA", error)
    throw error
  }
}
|
||||
|
||||
static func download(speechItem: SpeechItem,
|
||||
redownloadCached: Bool = false,
|
||||
session: URLSession? = URLSession.shared) async throws -> SynthesizeData?
|
||||
session: URLSession = URLSession.shared) async throws -> SynthesizeData?
|
||||
{
|
||||
let decoder = JSONDecoder()
|
||||
|
||||
@ -174,16 +199,7 @@ struct SpeechSynthesizer {
|
||||
}
|
||||
}
|
||||
|
||||
let request = speechItem.urlRequest
|
||||
let result: (Data, URLResponse)? = try? await (session ?? URLSession.shared).data(for: request)
|
||||
guard let httpResponse = result?.1 as? HTTPURLResponse, 200 ..< 300 ~= httpResponse.statusCode else {
|
||||
print("error: ", result?.1 as Any)
|
||||
throw BasicError.message(messageText: "audioFetch failed. no response or bad status code.")
|
||||
}
|
||||
|
||||
guard let data = result?.0 else {
|
||||
throw BasicError.message(messageText: "audioFetch failed. no data received.")
|
||||
}
|
||||
let data = try await downloadData(session: session, request: speechItem.urlRequest)
|
||||
|
||||
let tempPath = FileManager.default
|
||||
.urls(for: .cachesDirectory, in: .userDomainMask)[0]
|
||||
@ -204,8 +220,6 @@ struct SpeechSynthesizer {
|
||||
try? FileManager.default.removeItem(at: speechItem.localAudioURL)
|
||||
try FileManager.default.moveItem(at: tempPath, to: speechItem.localAudioURL)
|
||||
|
||||
let savedData = try? Data(contentsOf: speechItem.localAudioURL)
|
||||
|
||||
let encoder = JSONEncoder()
|
||||
let speechMarksData = try encoder.encode(jsonData.speechMarks)
|
||||
try speechMarksData.write(to: tempSMPath)
|
||||
@ -214,6 +228,7 @@ struct SpeechSynthesizer {
|
||||
|
||||
return SynthesizeData(audioData: audioData, speechMarks: jsonData.speechMarks)
|
||||
} catch {
|
||||
print("ERROR DOWNLOADING SPEECH DATA:", error)
|
||||
let errorMessage = "audioFetch failed. could not write MP3 data to disk"
|
||||
throw BasicError.message(messageText: errorMessage)
|
||||
}
|
||||
@ -222,12 +237,12 @@ struct SpeechSynthesizer {
|
||||
|
||||
/// Decoded synthesis response.
struct SynthesizeResult: Decodable {
  /// Audio payload as text. NOTE(review): the text encoding is not visible in
  /// this block — confirm against the decoding code.
  let audioData: String
  /// Speech marks for the audio; optional, so a response without them still decodes.
  let speechMarks: [SpeechMark]?
}
|
||||
|
||||
/// Synthesis result holding the audio as raw bytes.
struct SynthesizeData: Decodable {
  /// The audio bytes.
  let audioData: Data
  /// Speech marks for the audio; nil when none are available.
  let speechMarks: [SpeechMark]?
}
|
||||
|
||||
extension Data {
|
||||
|
||||
@ -14,6 +14,13 @@ public struct VoiceLanguage {
|
||||
public let categories: [VoiceCategory]
|
||||
}
|
||||
|
||||
/// A single selectable text-to-speech voice as shown in a voice list.
///
/// `Identifiable` (by `key`) and `Hashable` so it can be used directly in
/// SwiftUI lists and in sets.
public struct VoiceItem: Identifiable, Hashable {
  /// Display name of the voice.
  public let name: String
  /// Key that identifies the voice.
  public let key: String
  /// Category the voice is grouped under.
  public let category: VoiceCategory
  /// Whether this voice is the currently selected one.
  public let selected: Bool

  /// Stable identity, equal to `key`.
  public var id: String { key }

  /// Explicit public initializer: the synthesized memberwise initializer of a
  /// public struct is only internal, which would prevent creating items from
  /// other modules.
  public init(name: String, key: String, category: VoiceCategory, selected: Bool) {
    self.name = name
    self.key = key
    self.category = category
    self.selected = selected
  }
}
|
||||
|
||||
public enum VoiceCategory: String, CaseIterable {
|
||||
case enUS = "English (US)"
|
||||
case enAU = "English (Australia)"
|
||||
@ -40,6 +47,12 @@ public struct VoicePair {
|
||||
}
|
||||
|
||||
public enum Voices {
|
||||
/// Reports whether `voiceKey` is the first or second key of any pair in `UltraPairs`.
///
/// - Parameter voiceKey: The voice key to look up (compared exactly, case-sensitively).
/// - Returns: `true` when the key belongs to an ultra realistic voice pair.
public static func isUltraRealisticVoice(_ voiceKey: String) -> Bool {
  for pair in UltraPairs where pair.firstKey == voiceKey || pair.secondKey == voiceKey {
    return true
  }
  return false
}
|
||||
|
||||
public static let English = VoiceLanguage(key: "en",
|
||||
name: "English",
|
||||
defaultVoice: "en-US-ChristopherNeural",
|
||||
@ -72,4 +85,37 @@ public enum Voices {
|
||||
VoicePair(firstKey: "de-DE-ChristophNeural", secondKey: "de-DE-LouisaNeural", firstName: "Christoph", secondName: "Louisa", language: "de-DE", category: .deDE),
|
||||
VoicePair(firstKey: "ja-JP-NanamiNeural", secondKey: "ja-JP-KeitaNeural", firstName: "Nanami", secondName: "Keita", language: "ja-JP", category: .jaJP)
|
||||
]
|
||||
|
||||
/// Voice pairs available as ultra realistic voices. For every entry the keys
/// match the display names except "susan", whose key is lowercase.
// NOTE(review): confirm the lowercase "susan" key is what the backend expects.
public static let UltraPairs: [VoicePair] = [
  VoicePair(firstKey: "Larry", secondKey: "susan", firstName: "Larry", secondName: "Susan", language: "en-US", category: .enUS),
  VoicePair(firstKey: "Jordan", secondKey: "William", firstName: "Jordan", secondName: "William", language: "en-US", category: .enUS),
  VoicePair(firstKey: "Adrian", secondKey: "Anthony", firstName: "Adrian", secondName: "Anthony", language: "en-US", category: .enUS),
  VoicePair(firstKey: "Oliver", secondKey: "Arthur", firstName: "Oliver", secondName: "Arthur", language: "en-UK", category: .enUK),
  VoicePair(firstKey: "Daniel", secondKey: "Charlotte", firstName: "Daniel", secondName: "Charlotte", language: "en-CA", category: .enCA),
  VoicePair(firstKey: "Alexander", secondKey: "Aurora", firstName: "Alexander", secondName: "Aurora", language: "en-UK", category: .enUK),
  VoicePair(firstKey: "Axel", secondKey: "Carter", firstName: "Axel", secondName: "Carter", language: "en-US", category: .enUS),
  VoicePair(firstKey: "Ellie", secondKey: "Evelyn", firstName: "Ellie", secondName: "Evelyn", language: "en-US", category: .enUS),
  VoicePair(firstKey: "Frankie", secondKey: "Harrison", firstName: "Frankie", secondName: "Harrison", language: "en-US", category: .enUS),
  VoicePair(firstKey: "Frederick", secondKey: "Hunter", firstName: "Frederick", secondName: "Hunter", language: "en-UK", category: .enUK),
  VoicePair(firstKey: "Lillian", secondKey: "Lottie", firstName: "Lillian", secondName: "Lottie", language: "en-UK", category: .enUK),
  VoicePair(firstKey: "Nolan", secondKey: "Phoebe", firstName: "Nolan", secondName: "Phoebe", language: "en-UK", category: .enUK),
  VoicePair(firstKey: "Daisy", secondKey: "Stella", firstName: "Daisy", secondName: "Stella", language: "en-UK", category: .enUK),
  VoicePair(firstKey: "Maverick", secondKey: "Natalie", firstName: "Maverick", secondName: "Natalie", language: "en-US", category: .enUS),
  VoicePair(firstKey: "Nova", secondKey: "Owen", firstName: "Nova", secondName: "Owen", language: "en-US", category: .enUS)

  // VoicePair(firstKey: "Theodore", secondKey: "Theodore", firstName: "Theodore", secondName: "Theodore", language: "en-US", category: .enUS)
]
|
||||
}
|
||||
|
||||
@ -17,6 +17,7 @@ public enum UserDefaultKey: String {
|
||||
case textToSpeechPreferredVoice
|
||||
case textToSpeechDefaultLanguage
|
||||
case textToSpeechPreloadEnabled
|
||||
case textToSpeechUseUltraRealisticVoices
|
||||
case recentSearchTerms
|
||||
case audioPlayerExpanded
|
||||
case themeName
|
||||
|
||||
Reference in New Issue
Block a user