Add prefetch queue dampening on audio, better handle segments with empty audio data

2024-04-30 14:53:22 +08:00
parent 44d3edc794
commit a67e4d57c3
4 changed files with 70 additions and 20 deletions
--- a/apple/OmnivoreKit/Sources/Services/AudioSession/AudioController.swift
+++ b/apple/OmnivoreKit/Sources/Services/AudioSession/AudioController.swift
@ -1,5 +1,4 @@
 // swiftlint:disable file_length type_body_length
-
 #if os(iOS)

  import AVFoundation
@ -39,23 +38,23 @@ public struct DigestAudioItem: AudioItemProperties {
  public var language: String?
  public var startIndex: Int = 0
  public var startOffset: Double = 0.0
-  
+
  public init(digest: DigestResult) {
    self.digest = digest
    self.itemID = digest.id
    self.title = digest.title
    self.startIndex = 0
    self.startOffset = 0
-    
+
    self.imageURL = nil
-    
+
    if let first = digest.speechFiles.first {
      self.language = first.language
      self.byline  = digest.byline
    }
  }
 }
-  
+
  // swiftlint:disable all
  @MainActor
  public class AudioController: NSObject, ObservableObject, AVAudioPlayerDelegate {
@ -108,7 +107,7 @@ public struct DigestAudioItem: AudioItemProperties {
      playbackError = false
      self.itemAudioProperties = itemAudioProperties
      startAudio(atIndex: itemAudioProperties.startIndex, andOffset: itemAudioProperties.startOffset)
-      
+
      EventTracker.track(
        .audioSessionStart(
          linkID: itemAudioProperties.itemID,
@ -319,7 +318,7 @@ public struct DigestAudioItem: AudioItemProperties {
    public func seek(toIdx: Int) {
      let before = durationBefore(playerIndex: toIdx)
      let remainder = 0.0
-      
+
      // if the foundIdx happens to be the current item, we just set the position
      if let playerItem = player?.currentItem as? SpeechPlayerItem {
        if playerItem.speechItem.audioIdx == toIdx {
@ -666,8 +665,10 @@ public struct DigestAudioItem: AudioItemProperties {
      player = AVQueuePlayer(items: [])
      if let player = player {
        observer = player.observe(\.currentItem, options: [.new]) { _, _ in
-          self.currentAudioIndex = (player.currentItem as? SpeechPlayerItem)?.speechItem.audioIdx ?? 0
-          self.updateReadText()
+          DispatchQueue.main.async {
+            self.currentAudioIndex = (player.currentItem as? SpeechPlayerItem)?.speechItem.audioIdx ?? 0
+            self.updateReadText()
+          }
        }
      }
      
@ -683,8 +684,8 @@ public struct DigestAudioItem: AudioItemProperties {
    func synthesizeFrom(start: Int, playWhenReady: Bool, atOffset: Double = 0.0) {
      if let synthesizer = self.synthesizer, let items = self.synthesizer?.createPlayerItems(from: start) {
        let prefetchQueue = OperationQueue()
-        prefetchQueue.maxConcurrentOperationCount = 5
-        
+        prefetchQueue.maxConcurrentOperationCount = 1
+
        for speechItem in items {
          let isLast = speechItem.audioIdx == synthesizer.document.utterances.count - 1
          let playerItem = SpeechPlayerItem(session: self, prefetchQueue: prefetchQueue, speechItem: speechItem) {
--- a/apple/OmnivoreKit/Sources/Services/AudioSession/SpeechPlayerItem.swift
+++ b/apple/OmnivoreKit/Sources/Services/AudioSession/SpeechPlayerItem.swift
@ -40,12 +40,11 @@ class SpeechPlayerItem: AVPlayerItem {
    resourceLoaderDelegate.owner = self

    self.observer = observe(\.status, options: [.new]) { item, _ in
-      if item.status == .readyToPlay {
-        let duration = CMTimeGetSeconds(item.duration)
-        item.session.updateDuration(forItem: item.speechItem, newDuration: duration)
-      }
-      if item.status == .failed {
-        item.session.stopWithError()
+      DispatchQueue.main.async {
+        if item.status == .readyToPlay {
+          let duration = CMTimeGetSeconds(item.duration)
+          item.session.updateDuration(forItem: item.speechItem, newDuration: duration)
+        }
      }
    }

@ -55,11 +54,29 @@ class SpeechPlayerItem: AVPlayerItem {
    ) { [weak self] _ in
      guard let self = self else { return }
      self.completed()
+      self.checkPrefetchQueue(prefetchQueue: prefetchQueue)
    }

    self.prefetchOperation = PrefetchSpeechItemOperation(speechItem: speechItem)
    if let prefetchOperation = self.prefetchOperation {
      prefetchQueue.addOperation(prefetchOperation)
+      prefetchOperation.completionBlock = {
+        self.checkPrefetchQueue(prefetchQueue: prefetchQueue)
+      }
+    }
+  }
+
+  /// Throttles the prefetch queue so it does not run far ahead of playback.
+  ///
+  /// The check is dispatched to the main queue. It compares this item's `audioIdx`
+  /// with the session's `currentAudioIndex` plus a fixed lookahead: when this item is
+  /// beyond that limit the queue is suspended, otherwise it is (re)enabled.
+  ///
+  /// - Parameter prefetchQueue: The operation queue whose `isSuspended` flag is updated.
+  func checkPrefetchQueue(prefetchQueue: OperationQueue) {
+    DispatchQueue.main.async {
+      // Number of utterances the prefetcher may run ahead of the one being played.
+      let lookahead = 5
+      let limit = self.session.currentAudioIndex + lookahead
+      let audioIdx = self.speechItem.audioIdx
+
+      if audioIdx > limit {
+        // prefetch has gotten too far ahead of the audio. Pause the prefetch queue
+        print("PAUSING PREFETCH QUEUE", audioIdx, limit, self.speechItem.text)
+        prefetchQueue.isSuspended = true
+      } else {
+        // At or below the limit (the boundary value included) prefetching may continue,
+        // so a previously suspended queue is never left stuck on the exact threshold.
+        print("RESUMING PREFETCH QUEUE", audioIdx, limit)
+        prefetchQueue.isSuspended = false
+      }
     }
   }

--- a/apple/OmnivoreKit/Sources/Services/AudioSession/SpeechSynthesizer.swift
+++ b/apple/OmnivoreKit/Sources/Services/AudioSession/SpeechSynthesizer.swift
@ -222,9 +222,13 @@ struct SpeechSynthesizer {

    do {
      let jsonData = try decoder.decode(SynthesizeResult.self, from: data) as SynthesizeResult
-      let audioData = Data(fromHexEncodedString: jsonData.audioData)!
+      var audioData = Data(fromHexEncodedString: jsonData.audioData)!
      if audioData.count < 1 {
-        throw BasicError.message(messageText: "Audio data is empty")
+        if let silence = generateSilentAudioBuffer() {
+          audioData = silence
+        } else {
+          throw BasicError.message(messageText: "Audio data is empty")
+        }
      }

      try audioData.write(to: tempPath)
@ -244,6 +248,34 @@ struct SpeechSynthesizer {
      throw BasicError.message(messageText: errorMessage)
    }
  }
+
+  /// Builds roughly one millisecond of silent mono audio at 44.1 kHz and returns its
+  /// samples as raw bytes.
+  ///
+  /// - Returns: The sample bytes produced by `bufferToData(buffer:)`, or `nil` when the
+  ///   audio format or the PCM buffer cannot be created.
+  ///
+  /// NOTE(review): the result is headerless sample data, not a containerized audio
+  /// file — confirm the consumer that reads the written file can play it.
+  static func generateSilentAudioBuffer() -> Data? {
+    // The format initializer is failable; report failure through the optional
+    // return value instead of crashing on a force unwrap.
+    guard let audioFormat = AVAudioFormat(standardFormatWithSampleRate: 44100, channels: 1) else {
+      return nil
+    }
+    let frameCount = UInt32(audioFormat.sampleRate * 0.001)  // 1 millisecond of frames
+    guard let buffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: frameCount) else {
+      return nil
+    }
+    buffer.frameLength = buffer.frameCapacity
+
+    // Clear the samples explicitly rather than relying on the allocation being zeroed.
+    if let channels = buffer.floatChannelData {
+      let frames = Int(buffer.frameLength)
+      for channel in 0..<Int(buffer.format.channelCount) {
+        channels[channel].initialize(repeating: 0, count: frames)
+      }
+    }
+
+    return bufferToData(buffer: buffer)
+  }
+
+  /// Copies the Float samples of a PCM buffer into a `Data` value.
+  ///
+  /// Samples are written frame by frame, and within each frame channel by channel,
+  /// using the in-memory (native) byte representation of each `Float`.
+  ///
+  /// - Parameter buffer: The buffer to read; only `frameLength` frames are copied.
+  /// - Returns: The sample bytes, or empty `Data` when the buffer exposes no float
+  ///   channel data.
+  static func bufferToData(buffer: AVAudioPCMBuffer) -> Data {
+    // `floatChannelData` is optional; bail out instead of wrapping a nil pointer.
+    guard let channels = buffer.floatChannelData else {
+      return Data()
+    }
+    let channelCount = Int(buffer.format.channelCount)
+    let frames = Int(buffer.frameLength)
+
+    var data = Data()
+    data.reserveCapacity(frames * channelCount * MemoryLayout<Float>.size)
+
+    for frame in 0..<frames {
+      for channel in 0..<channelCount {
+        // Append the sample's bytes inside a scoped closure so no pointer to a
+        // temporary outlives the call that produced it.
+        withUnsafeBytes(of: channels[channel][frame]) { data.append(contentsOf: $0) }
+      }
+    }
+
+    return data
+  }
 }

 struct SynthesizeResult: Decodable {
--- a/apple/OmnivoreKit/Sources/Services/DataService/AI/AITasks.swift
+++ b/apple/OmnivoreKit/Sources/Services/DataService/AI/AITasks.swift
@ -172,7 +172,7 @@ extension DataService {
      try? data.write(to: localPath)
    }
  }
-  
+
  public func explain(text: String, libraryItemId: String) async throws -> String {
    let encoder = JSONEncoder()
    let explainRequest = ExplainRequest(text: text, libraryItemId: libraryItemId)