WIP: Adding ultra realistic voice support

2022-11-04 17:27:53 +08:00
parent eda8ff5f97
commit 3d9976e06a
7 changed files with 205 additions and 54 deletions
--- a/apple/OmnivoreKit/Sources/App/Views/AudioPlayer/ExpandedPlayer.swift
+++ b/apple/OmnivoreKit/Sources/App/Views/AudioPlayer/ExpandedPlayer.swift
@ -146,7 +146,7 @@
                +
                Text(audioController.unreadText)
                .font(.textToSpeechRead.leading(.loose))
-                .foregroundColor(Color.appGrayText)
+                .foregroundColor(audioController.useUltraRealisticVoices ? Color.appGrayTextContrast : Color.appGrayText)
            }
          }
        }
--- a/apple/OmnivoreKit/Sources/App/Views/AudioPlayer/MiniPlayer.swift
+++ b/apple/OmnivoreKit/Sources/App/Views/AudioPlayer/MiniPlayer.swift
@ -21,7 +21,11 @@
    }

    var isPresented: Bool {
-      audioController.itemAudioProperties != nil && audioController.state != .stopped
+      let presented = audioController.itemAudioProperties != nil && audioController.state != .stopped
+      if !presented {
+        print("isPresented: ", audioController.itemAudioProperties, audioController.state)
+      }
+      return true // presented
    }

    var playPauseButtonImage: String {
@ -172,6 +176,9 @@
            }
          }
        }
+      }.alert("There was an error playing back your audio.",
+              isPresented: $audioController.playbackError) {
+        Button("Dismiss", role: .none) {}
      }
    }
  }
--- a/apple/OmnivoreKit/Sources/App/Views/Profile/TextToSpeechVoiceSelectionView.swift
+++ b/apple/OmnivoreKit/Sources/App/Views/Profile/TextToSpeechVoiceSelectionView.swift
@ -17,36 +17,39 @@
    var body: some View {
      Group {
        Form {
-          if showLanguageChanger {
-            Section("Language") {
-              NavigationLink(destination: TextToSpeechLanguageView().navigationTitle("Language")) {
-                Text(audioController.currentVoiceLanguage.name)
+          Toggle("Use Ultra Realistic Voices", isOn: $audioController.useUltraRealisticVoices)
+            .accentColor(Color.green)
+
+          if audioController.useUltraRealisticVoices {
+            Section {
+              Text("Ultra realistic voices take longer to generate and do not offer a follow along user interface.")
+                .multilineTextAlignment(.leading)
+            }
+            ultraRealisticVoices
+          } else {
+            if showLanguageChanger {
+              Section("Language") {
+                NavigationLink(destination: TextToSpeechLanguageView().navigationTitle("Language")) {
+                  Text(audioController.currentVoiceLanguage.name)
+                }
              }
            }
+            standardVoices
          }
-          innerBody
        }
      }
      .navigationTitle("Choose a Voice")
    }

-    private var innerBody: some View {
+    private var standardVoices: some View {
      ForEach(language.categories, id: \.self) { category in
        Section(category.rawValue) {
          ForEach(audioController.voiceList?.filter { $0.category == category } ?? [], id: \.key.self) { voice in
            HStack {
-              // Voice samples are not working yet
-//            Button(action: {
-//              audioController.playVoiceSample(voice: voice.key)
-//            }) {
-//              Image(systemName: "play.circle").font(.appTitleTwo)
-//            }
-//            .buttonStyle(PlainButtonStyle())
-
              Button(action: {
                audioController.setPreferredVoice(voice.key, forLanguage: language.key)
                audioController.currentVoice = voice.key
-              }) {
+              }, label: {
                HStack {
                  Text(voice.name)
                  Spacer()
@ -60,12 +63,46 @@
                  }
                }
                .contentShape(Rectangle())
-              }
-              .buttonStyle(PlainButtonStyle())
+              })
+                .buttonStyle(PlainButtonStyle())
            }
          }
        }
      }
    }
+
+    private var ultraRealisticVoices: some View {
+      ForEach([VoiceCategory.enUS, VoiceCategory.enCA, VoiceCategory.enUK], id: \.self) { category in
+        Section(category.rawValue) {
+          ForEach(audioController.realisticVoiceList?.filter { $0.category == category } ?? [], id: \.key.self) { voice in
+            voiceRow(for: voice)
+          }
+        }
+      }
+    }
+
+    func voiceRow(for voice: VoiceItem) -> some View {
+      HStack {
+        Button(action: {
+          audioController.setPreferredVoice(voice.key, forLanguage: language.key)
+          audioController.currentVoice = voice.key
+        }, label: {
+          HStack {
+            Text(voice.name)
+            Spacer()
+
+            if voice.selected {
+              if audioController.isPlaying, audioController.isLoading {
+                ProgressView()
+              } else {
+                Image(systemName: "checkmark")
+              }
+            }
+          }
+          .contentShape(Rectangle())
+        })
+          .buttonStyle(PlainButtonStyle())
+      }
+    }
  }
 #endif
--- a/apple/OmnivoreKit/Sources/Services/AudioSession/AudioController.swift
+++ b/apple/OmnivoreKit/Sources/Services/AudioSession/AudioController.swift
@ -59,6 +59,9 @@
          let duration = CMTimeGetSeconds(item.duration)
          item.session.updateDuration(forItem: item.speechItem, newDuration: duration)
        }
+        if item.status == .failed {
+          item.session.stopWithError()
+        }
      }

      NotificationCenter.default.addObserver(
@ -119,23 +122,35 @@
        Task {
          guard let speechItem = self.owner?.speechItem else {
            // This probably can't happen, but if it does, just returning should
-            // let AVPlayer try again.
-            print("No speech item found: ", self.owner?.speechItem)
+            DispatchQueue.main.async {
+              self.processPlaybackError(error: BasicError.message(messageText: "No speech item found."))
+            }
            return
          }

-          // TODO: how do we want to propogate this and handle it in the player
-          let speechData = try? await SpeechSynthesizer.download(speechItem: speechItem, session: self.session)
-          DispatchQueue.main.async {
-            if speechData == nil {
-              self.session = nil
-            }
-            if let owner = self.owner, let speechData = speechData {
-              owner.speechMarks = speechData.speechMarks
-            }
-            self.mediaData = speechData?.audioData
+          do {
+            let speechData = try await SpeechSynthesizer.download(speechItem: speechItem, session: self.session ?? URLSession.shared)

-            self.processPendingRequests()
+            DispatchQueue.main.async {
+              if speechData == nil {
+                self.session = nil
+                self.processPlaybackError(error: BasicError.message(messageText: "Unable to download speech data."))
+                return
+              }
+
+              if let owner = self.owner, let speechData = speechData {
+                owner.speechMarks = speechData.speechMarks
+              }
+              self.mediaData = speechData?.audioData
+
+              self.processPendingRequests()
+            }
+          } catch URLError.cancelled {
+            print("cancelled request error being ignored")
+          } catch {
+            DispatchQueue.main.async {
+              self.processPlaybackError(error: error)
+            }
          }
        }
      }
@ -158,6 +173,15 @@
        _ = requestsFulfilled.map { self.pendingRequests.remove($0) }
      }

+      /// Fails every pending resource-loading request with `error` and empties `pendingRequests`.
+      ///
+      /// - Parameter error: The error handed to each request's `finishLoading(with:)`.
+      func processPlaybackError(error: Error?) {
+        for request in pendingRequests {
+          request.finishLoading(with: error)
+        }
+
+        // Every request has now been finished, so none of them may be retained or serviced again.
+        pendingRequests.removeAll()
+      }
+
      func fillInContentInformationRequest(_ contentInformationRequest: AVAssetResourceLoadingContentInformationRequest?) {
        contentInformationRequest?.contentType = UTType.mp3.identifier

@ -205,10 +229,13 @@
    @Published public var duration: TimeInterval = 0
    @Published public var timeElapsedString: String?
    @Published public var durationString: String?
-    @Published public var voiceList: [(name: String, key: String, category: VoiceCategory, selected: Bool)]?
+    @Published public var voiceList: [VoiceItem]?
+    @Published public var realisticVoiceList: [VoiceItem]?

    @Published public var textItems: [String]?

+    @Published public var playbackError: Bool = false
+
    let dataService: DataService

    var timer: Timer?
@ -224,6 +251,7 @@

      super.init()
      self.voiceList = generateVoiceList()
+      self.realisticVoiceList = generateRealisticVoiceList()
    }

    deinit {
@ -277,11 +305,25 @@
      }
    }

-    public func generateVoiceList() -> [(name: String, key: String, category: VoiceCategory, selected: Bool)] {
+    public func stopWithError() {
+      stop()
+      playbackError = true
+    }
+
+    public func generateVoiceList() -> [VoiceItem] {
      Voices.Pairs.flatMap { voicePair in
        [
-          (name: voicePair.firstName, key: voicePair.firstKey, category: voicePair.category, selected: voicePair.firstKey == currentVoice),
-          (name: voicePair.secondName, key: voicePair.secondKey, category: voicePair.category, selected: voicePair.secondKey == currentVoice)
+          VoiceItem(name: voicePair.firstName, key: voicePair.firstKey, category: voicePair.category, selected: voicePair.firstKey == currentVoice),
+          VoiceItem(name: voicePair.secondName, key: voicePair.secondKey, category: voicePair.category, selected: voicePair.secondKey == currentVoice)
+        ]
+      }.sorted { $0.name.lowercased() < $1.name.lowercased() }
+    }
+
+    public func generateRealisticVoiceList() -> [VoiceItem] {
+      Voices.UltraPairs.flatMap { voicePair in
+        [
+          VoiceItem(name: voicePair.firstName, key: voicePair.firstKey, category: voicePair.category, selected: voicePair.firstKey == currentVoice),
+          VoiceItem(name: voicePair.secondName, key: voicePair.secondKey, category: voicePair.category, selected: voicePair.secondKey == currentVoice)
        ]
      }.sorted { $0.name.lowercased() < $1.name.lowercased() }
    }
@ -419,6 +461,8 @@

    @AppStorage(UserDefaultKey.textToSpeechPreloadEnabled.rawValue) public var preloadEnabled = false

+    @AppStorage(UserDefaultKey.textToSpeechUseUltraRealisticVoices.rawValue) public var useUltraRealisticVoices = false
+
    public var currentVoiceLanguage: VoiceLanguage {
      Voices.Languages.first(where: { $0.key == currentLanguage }) ?? Voices.English
    }
@ -458,6 +502,7 @@
      set {
        _currentVoice = newValue
        voiceList = generateVoiceList()
+        realisticVoiceList = generateRealisticVoiceList()

        var currentIdx = 0
        var currentOffset = 0.0
--- a/apple/OmnivoreKit/Sources/Services/AudioSession/SpeechSynthesizer.swift
+++ b/apple/OmnivoreKit/Sources/Services/AudioSession/SpeechSynthesizer.swift
@ -16,6 +16,7 @@ struct UtteranceRequest: Codable {
  let voice: String
  let language: String
  let rate: String
+  let isUltraRealisticVoice: Bool
 }

 struct Utterance: Decodable {
@ -26,10 +27,12 @@ struct Utterance: Decodable {
  public let wordCount: Double

  func toSSML(document: SpeechDocument) throws -> Data? {
+    let usedVoice = voice ?? document.defaultVoice
    let request = UtteranceRequest(text: text,
-                                   voice: voice ?? document.defaultVoice,
+                                   voice: usedVoice,
                                   language: document.language,
-                                   rate: "1.1")
+                                   rate: "1.1",
+                                   isUltraRealisticVoice: Voices.isUltraRealisticVoice(usedVoice))
    return try JSONEncoder().encode(request)
  }
 }
@ -120,7 +123,7 @@ struct SpeechSynthesizer {
  func createPlayerItems(from: Int) -> [SpeechItem] {
    var result: [SpeechItem] = []

-    for idx in from ..< min(7, document.utterances.count) {
+    for idx in from ..< document.utterances.count {
      let utterance = document.utterances[idx]
      let voiceStr = utterance.voice ?? document.defaultVoice
      let segmentStr = String(format: "%04d", arguments: [idx])
@ -159,9 +162,31 @@ struct SpeechSynthesizer {
    return request
  }

+  /// Performs `request` on `session` and returns the response body.
+  ///
+  /// - Parameters:
+  ///   - session: The URL session used to perform the request.
+  ///   - request: The request to perform.
+  /// - Returns: The bytes of the response body.
+  /// - Throws: `BasicError.message` when the response is not an HTTP response with a 2xx status
+  ///   code; otherwise rethrows the error raised by `session.data(for:)`. A cancelled request is
+  ///   rethrown as-is (and not logged as a failure) so callers can tell it apart from a real
+  ///   download error instead of receiving empty data.
+  static func downloadData(session: URLSession, request: URLRequest) async throws -> Data {
+    do {
+      let (data, response) = try await session.data(for: request)
+      guard let httpResponse = response as? HTTPURLResponse, 200 ..< 300 ~= httpResponse.statusCode else {
+        print("error: ", response)
+        throw BasicError.message(messageText: "audioFetch failed. no response or bad status code.")
+      }
+
+      return data
+    } catch let urlError as URLError where urlError.code == .cancelled {
+      // Cancellation is not a failure: propagate it untouched so the caller decides how to ignore it.
+      throw urlError
+    } catch {
+      print("ERROR DOWNLOADING AUDIO DATA", error)
+      throw error
+    }
+  }
+
  static func download(speechItem: SpeechItem,
                       redownloadCached: Bool = false,
-                       session: URLSession? = URLSession.shared) async throws -> SynthesizeData?
+                       session: URLSession = URLSession.shared) async throws -> SynthesizeData?
  {
    let decoder = JSONDecoder()

@ -174,16 +199,7 @@ struct SpeechSynthesizer {
      }
    }

-    let request = speechItem.urlRequest
-    let result: (Data, URLResponse)? = try? await (session ?? URLSession.shared).data(for: request)
-    guard let httpResponse = result?.1 as? HTTPURLResponse, 200 ..< 300 ~= httpResponse.statusCode else {
-      print("error: ", result?.1 as Any)
-      throw BasicError.message(messageText: "audioFetch failed. no response or bad status code.")
-    }
-
-    guard let data = result?.0 else {
-      throw BasicError.message(messageText: "audioFetch failed. no data received.")
-    }
+    let data = try await downloadData(session: session, request: speechItem.urlRequest)

    let tempPath = FileManager.default
      .urls(for: .cachesDirectory, in: .userDomainMask)[0]
@ -204,8 +220,6 @@ struct SpeechSynthesizer {
      try? FileManager.default.removeItem(at: speechItem.localAudioURL)
      try FileManager.default.moveItem(at: tempPath, to: speechItem.localAudioURL)

-      let savedData = try? Data(contentsOf: speechItem.localAudioURL)
-
      let encoder = JSONEncoder()
      let speechMarksData = try encoder.encode(jsonData.speechMarks)
      try speechMarksData.write(to: tempSMPath)
@ -214,6 +228,7 @@ struct SpeechSynthesizer {

      return SynthesizeData(audioData: audioData, speechMarks: jsonData.speechMarks)
    } catch {
+      print("ERROR DOWNLOADING SPEECH DATA:", error)
      let errorMessage = "audioFetch failed. could not write MP3 data to disk"
      throw BasicError.message(messageText: errorMessage)
    }
@ -222,12 +237,12 @@ struct SpeechSynthesizer {

 struct SynthesizeResult: Decodable {
  let audioData: String
-  let speechMarks: [SpeechMark]
+  let speechMarks: [SpeechMark]?
 }

 struct SynthesizeData: Decodable {
  let audioData: Data
-  let speechMarks: [SpeechMark]
+  let speechMarks: [SpeechMark]?
 }

 extension Data {
--- a/apple/OmnivoreKit/Sources/Services/AudioSession/Voices.swift
+++ b/apple/OmnivoreKit/Sources/Services/AudioSession/Voices.swift
@ -14,6 +14,13 @@ public struct VoiceLanguage {
  public let categories: [VoiceCategory]
 }

+public struct VoiceItem {
+  public let name: String
+  public let key: String
+  public let category: VoiceCategory
+  public let selected: Bool
+}
+
 public enum VoiceCategory: String, CaseIterable {
  case enUS = "English (US)"
  case enAU = "English (Australia)"
@ -40,6 +47,12 @@ public struct VoicePair {
 }

 public enum Voices {
+  /// Returns `true` when `voiceKey` is the first or second key of any pair in `UltraPairs`.
+  public static func isUltraRealisticVoice(_ voiceKey: String) -> Bool {
+    UltraPairs.contains { pair in
+      [pair.firstKey, pair.secondKey].contains(voiceKey)
+    }
+  }
+
  public static let English = VoiceLanguage(key: "en",
                                            name: "English",
                                            defaultVoice: "en-US-ChristopherNeural",
@ -72,4 +85,37 @@ public enum Voices {
    VoicePair(firstKey: "de-DE-ChristophNeural", secondKey: "de-DE-LouisaNeural", firstName: "Christoph", secondName: "Louisa", language: "de-DE", category: .deDE),
    VoicePair(firstKey: "ja-JP-NanamiNeural", secondKey: "ja-JP-KeitaNeural", firstName: "Nanami", secondName: "Keita", language: "ja-JP", category: .jaJP)
  ]
+
+  public static let UltraPairs = [
+    VoicePair(firstKey: "Larry", secondKey: "susan", firstName: "Larry", secondName: "Susan", language: "en-US", category: .enUS),
+
+    VoicePair(firstKey: "Jordan", secondKey: "William", firstName: "Jordan", secondName: "William", language: "en-US", category: .enUS),
+    VoicePair(firstKey: "Adrian", secondKey: "Anthony", firstName: "Adrian", secondName: "Anthony", language: "en-US", category: .enUS),
+
+    VoicePair(firstKey: "Oliver", secondKey: "Arthur", firstName: "Oliver", secondName: "Arthur", language: "en-UK", category: .enUK),
+
+    VoicePair(firstKey: "Daniel", secondKey: "Charlotte", firstName: "Daniel", secondName: "Charlotte", language: "en-CA", category: .enCA),
+
+    VoicePair(firstKey: "Alexander", secondKey: "Aurora", firstName: "Alexander", secondName: "Aurora", language: "en-UK", category: .enUK),
+
+    VoicePair(firstKey: "Axel", secondKey: "Carter", firstName: "Axel", secondName: "Carter", language: "en-US", category: .enUS),
+
+    VoicePair(firstKey: "Ellie", secondKey: "Evelyn", firstName: "Ellie", secondName: "Evelyn", language: "en-US", category: .enUS),
+
+    VoicePair(firstKey: "Frankie", secondKey: "Harrison", firstName: "Frankie", secondName: "Harrison", language: "en-US", category: .enUS),
+
+    VoicePair(firstKey: "Frederick", secondKey: "Hunter", firstName: "Frederick", secondName: "Hunter", language: "en-UK", category: .enUK),
+
+    VoicePair(firstKey: "Lillian", secondKey: "Lottie", firstName: "Lillian", secondName: "Lottie", language: "en-UK", category: .enUK),
+
+    VoicePair(firstKey: "Nolan", secondKey: "Phoebe", firstName: "Nolan", secondName: "Phoebe", language: "en-UK", category: .enUK),
+
+    VoicePair(firstKey: "Daisy", secondKey: "Stella", firstName: "Daisy", secondName: "Stella", language: "en-UK", category: .enUK),
+
+    VoicePair(firstKey: "Maverick", secondKey: "Natalie", firstName: "Maverick", secondName: "Natalie", language: "en-US", category: .enUS),
+
+    VoicePair(firstKey: "Nova", secondKey: "Owen", firstName: "Nova", secondName: "Owen", language: "en-US", category: .enUS)
+
+    // VoicePair(firstKey: "Theodore", secondKey: "Theodore", firstName: "Theodore", secondName: "Theodore", language: "en-US", category: .enUS)
+  ]
 }
--- a/apple/OmnivoreKit/Sources/Utils/UserDefaultKeys.swift
+++ b/apple/OmnivoreKit/Sources/Utils/UserDefaultKeys.swift
@ -17,6 +17,7 @@ public enum UserDefaultKey: String {
  case textToSpeechPreferredVoice
  case textToSpeechDefaultLanguage
  case textToSpeechPreloadEnabled
+  case textToSpeechUseUltraRealisticVoices
  case recentSearchTerms
  case audioPlayerExpanded
  case themeName