Files
omnivore/apple/OmnivoreKit/Sources/Models/PageScrapePayload.swift
Jackson Harper c3096e5dfe Better PDF upload handling on iOS
Three main changes here:
- If a PDF isn't a local file, don't try to download and then
  upload it to the backend. Just treat it as a URL.

- If we are passed PDF data by an app like Files, upload it.

- Wrap the whole operation in a Background task so it's not
  killed if the user taps dismiss during the upload of larger
  documents.
2022-05-26 21:40:40 -07:00

290 lines
9.3 KiB
Swift

import Foundation
#if os(iOS)
import MobileCoreServices
#endif
import UniformTypeIdentifiers
// Heuristic pattern used to pull the first URL-like substring out of shared
// plain text (see the text-handling paths in PageScraper below).
// NOTE(review): the leading `[...]` is a character class, not a grouped
// scheme match, so this is a loose heuristic rather than a strict URL
// grammar — it can also match scheme-less hosts like "example.com".
// Confirm that is the intended behavior before tightening it.
let URLREGEX = #"[(http(s)?):\/\/(www\.)?a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"#
/// The result of scraping a shared item: the item's URL plus whatever
/// content (captured HTML or local PDF data) could be obtained on-device.
public struct PageScrapePayload {
  /// What was captured alongside the URL.
  public enum ContentType {
    /// Nothing captured locally; the backend fetches the URL itself.
    case none
    /// HTML captured by the extension, with an optional page title.
    case html(html: String, title: String?)
    /// Raw bytes of a locally readable PDF, ready for upload.
    case pdf(data: Data)
  }

  /// A captured HTML document together with its source URL and title.
  public struct HTMLPayload {
    let url: String
    let title: String?
    let html: String
  }

  public let url: String
  public let contentType: ContentType

  /// URL-only payload — no local content was captured.
  init(url: String) {
    self.url = url
    contentType = .none
  }

  /// Payload carrying locally read PDF bytes for upload.
  init(url: String, pdfData: Data) {
    self.url = url
    contentType = .pdf(data: pdfData)
  }

  /// Payload carrying HTML captured by the extension.
  init(url: String, title: String?, html: String) {
    self.url = url
    contentType = .html(html: html, title: title)
  }
}
/// Error produced when no payload could be extracted from a share request.
public struct PageScrapeError: Error {
  /// Human-readable description of what went wrong.
  public let message: String
}
public enum PageScraper {
  /// Builds a `PageScrapePayload` from a share/action extension request.
  ///
  /// Attachments are first checked for PDF content and for the property list
  /// produced by a JavaScript preprocessing script (which carries captured
  /// HTML). If neither yields a payload, falls back to plain URL extraction
  /// via `scrapeURLOnly`.
  ///
  /// - Parameters:
  ///   - extensionContext: The extension request context to inspect.
  ///   - completion: Called with the scraped payload or an error. Invoked on
  ///     the main queue, except for the synchronous "no attachments" failure.
  public static func scrape(
    extensionContext: NSExtensionContext?,
    completion: @escaping (Result<PageScrapePayload, PageScrapeError>) -> Void
  ) {
    let extensionItem = extensionContext?.inputItems.first as? NSExtensionItem
    guard let attachments = extensionItem?.attachments else {
      completion(.failure(PageScrapeError(message: "no attachments")))
      return
    }

    var pageScrapePayload: PageScrapePayload?
    let group = DispatchGroup()

    // Asynchronously load every attachment conforming to `typeIdentifier`,
    // keeping whichever load yields a payload.
    // NOTE(review): `loadItem` completions can arrive on arbitrary queues, so
    // writes to `pageScrapePayload` are unsynchronized; when several
    // attachments produce payloads, the last callback to finish wins —
    // confirm whether NSItemProvider serializes these callbacks.
    func loadAll(conformingTo typeIdentifier: String) {
      for attachment in attachments where attachment.hasItemConformingToTypeIdentifier(typeIdentifier) {
        group.enter()
        attachment.loadItem(forTypeIdentifier: typeIdentifier, options: nil) { item, _ in
          if let payload = PageScrapePayload.make(item: item) {
            pageScrapePayload = payload
          }
          group.leave()
        }
      }
    }

    loadAll(conformingTo: UTType.pdf.identifier)
    loadAll(conformingTo: UTType.propertyList.identifier)

    group.notify(queue: .main) {
      if let payload = pageScrapePayload {
        completion(.success(payload))
      } else {
        // Neither a PDF nor preprocessed HTML was found; try plain URLs.
        scrapeURLOnly(extensionContext: extensionContext, completion: completion)
      }
    }
  }

  /// Looks for a URL attachment first, then for shared text containing a
  /// URL, and completes with a URL-only payload or a failure.
  private static func scrapeURLOnly(
    extensionContext: NSExtensionContext?,
    completion: @escaping (Result<PageScrapePayload, PageScrapeError>) -> Void
  ) {
    let urlKey = UTType.url.identifier

    // First look for an attachment that carries a URL directly. `first(where:)`
    // stops at the first input item whose attachments include a URL provider.
    let urlFound = extensionContext?.inputItems.first { inputItem in
      let provider = (inputItem as? NSExtensionItem)?.attachments?.first {
        $0.hasItemConformingToTypeIdentifier(urlKey)
      }
      guard let provider = provider else { return false }
      provider.loadItem(forTypeIdentifier: urlKey, options: nil) { item, _ in
        completeWithURLItem(item, completion: completion)
      }
      return true
    }
    if urlFound != nil {
      return
    }

    // No URL attachment; check for a shared string that contains a URL.
    let textKey = UTType.text.identifier
    let textUrlFound = extensionContext?.inputItems.first { inputItem in
      let provider = (inputItem as? NSExtensionItem)?.attachments?.first {
        $0.hasItemConformingToTypeIdentifier(textKey)
      }
      guard let provider = provider else { return false }
      provider.loadItem(forTypeIdentifier: textKey, options: nil) { item, _ in
        completeWithTextItem(item, completion: completion)
      }
      return true
    }
    if textUrlFound != nil {
      return
    }

    completion(.failure(PageScrapeError(message: "could not find a link to save")))
  }

  /// Attempts to extract a URL from the first input item, preferring a URL
  /// attachment over UTF-16 plain text containing a URL.
  /// NOTE(review): the provider is selected by the URL type identifier, so
  /// whenever one is found `hasPublicURL` is true and the text branch is
  /// effectively unreachable — confirm whether the text lookup should select
  /// its own provider.
  private static func tryScrapeUrlFromText(
    extensionContext: NSExtensionContext?,
    completion: @escaping (Result<PageScrapePayload, PageScrapeError>) -> Void
  ) {
    let urlKey = UTType.url.identifier
    let textKey = UTType.utf16PlainText.identifier
    let inputItem = extensionContext?.inputItems.first as? NSExtensionItem
    let itemProvider = inputItem?.attachments?.first {
      $0.hasItemConformingToTypeIdentifier(urlKey)
    }
    let hasPublicURL = itemProvider?.hasItemConformingToTypeIdentifier(urlKey) == true
    let hasPublicText = itemProvider?.hasItemConformingToTypeIdentifier(textKey) == true
    guard hasPublicURL || hasPublicText else {
      completion(.failure(PageScrapeError(message: "no public url")))
      return
    }
    if hasPublicURL {
      itemProvider?.loadItem(forTypeIdentifier: urlKey, options: nil) { item, _ in
        completeWithURLItem(item, completion: completion)
      }
    } else {
      itemProvider?.loadItem(forTypeIdentifier: textKey, options: nil) { item, _ in
        completeWithTextItem(item, completion: completion)
      }
    }
  }

  /// Builds a payload from a URL-typed item — either a `URL` instance or
  /// UTF-8 `Data` holding a URL string — and completes on the main queue.
  private static func completeWithURLItem(
    _ item: NSSecureCoding?,
    completion: @escaping (Result<PageScrapePayload, PageScrapeError>) -> Void
  ) {
    let directURL = item as? URL
    let urlFromData = (item as? Data)
      .flatMap { String(data: $0, encoding: .utf8) }
      .flatMap { URL(string: $0) }
    let pageScrapePayload = PageScrapePayload.make(url: directURL ?? urlFromData)
    DispatchQueue.main.async {
      if let payload = pageScrapePayload {
        completion(.success(payload))
      } else {
        completion(.failure(PageScrapeError(message: "could not extract url")))
      }
    }
  }

  /// Builds a payload from the first URL-like substring of a shared string
  /// (matched with `URLREGEX`) and completes on the main queue.
  private static func completeWithTextItem(
    _ item: NSSecureCoding?,
    completion: @escaping (Result<PageScrapePayload, PageScrapeError>) -> Void
  ) {
    var url: URL?
    if let text = item as? String,
       let range = text.range(of: URLREGEX, options: .regularExpression) {
      url = URL(string: String(text[range]))
    }
    let pageScrapePayload = PageScrapePayload.make(url: url)
    DispatchQueue.main.async {
      if let payload = pageScrapePayload {
        completion(.success(payload))
      } else {
        completion(.failure(PageScrapeError(message: "could not extract url")))
      }
    }
  }
}
/// Factory helpers that turn share-extension items into payloads.
private extension PageScrapePayload {
  /// Wraps a URL in a content-less payload; returns nil when `url` is nil.
  static func make(url: URL?) -> PageScrapePayload? {
    guard let url = url else { return nil }
    return PageScrapePayload(url: url.absoluteString)
  }

  /// Dispatches on the item's runtime type: a dictionary is treated as
  /// JavaScript-preprocessing results, a URL as a link or local file.
  /// Anything else is unsupported and yields nil.
  static func make(item: NSSecureCoding?) -> PageScrapePayload? {
    if let dictionary = item as? NSDictionary {
      return makeFromDictionary(dictionary)
    }
    if let url = item as? NSURL {
      return makeFromURL(url as URL)
    }
    return nil
  }

  /// Builds a payload from a URL. Local PDF files have their bytes read so
  /// they can be uploaded; other local file types are unsupported. Remote
  /// URLs are passed through for the backend to fetch.
  static func makeFromURL(_ url: URL) -> PageScrapePayload? {
    if url.isFileURL {
      let type = try? url.resourceValues(forKeys: [.typeIdentifierKey]).typeIdentifier
      if type == UTType.pdf.identifier, let data = try? Data(contentsOf: url) {
        return PageScrapePayload(url: url.absoluteString, pdfData: data)
      }
      // Don't try to handle file URLs that are not PDFs.
      // In the future we can add image and other file type support here.
      return nil
    }
    return PageScrapePayload(url: url.absoluteString)
  }

  /// Builds a payload from the results dictionary produced by the
  /// extension's JavaScript preprocessing script (keys observed here:
  /// "url", "documentHTML", "title", "contentType").
  /// Returns nil when no URL was captured.
  static func makeFromDictionary(_ dictionary: NSDictionary) -> PageScrapePayload? {
    let results = dictionary[NSExtensionJavaScriptPreprocessingResultsKey] as? NSDictionary
    guard let url = results?["url"] as? String else { return nil }

    // If we were not able to capture any HTML, treat this as a URL and
    // see if the backend can do better.
    guard let html = results?["documentHTML"] as? String, !html.isEmpty else {
      return PageScrapePayload(url: url)
    }

    // If it's a PDF that we opened through Safari we don't have access to the
    // file content, so pass the URL to the backend and let it download it.
    if results?["contentType"] as? String == "application/pdf" {
      return PageScrapePayload(url: url)
    }

    return PageScrapePayload(url: url, title: results?["title"] as? String, html: html)
  }
}