// Files
// omnivore/apple/OmnivoreKit/Sources/Models/PageScrapePayload.swift
// 2022-02-11 09:24:33 -08:00
//
// 222 lines
// 7.3 KiB
// Swift

import Foundation
#if os(iOS)
import MobileCoreServices
#endif
import UniformTypeIdentifiers
/// Regular-expression pattern used to pull the first URL-looking substring out of free text.
///
/// Shape of a match:
/// 1. 2–256 characters drawn from letters, digits and `@ : % . _ + ~ # = ( ) ? / -`
///    (this run covers the scheme, `www.` and host, and may extend into the path),
/// 2. a literal `.` followed by 2–6 lowercase letters and a word boundary,
/// 3. an optional tail of path/query/fragment characters.
///
/// The leading character class is written out explicitly and includes `-`, so hyphenated
/// hosts such as `my-site.com` are matched from the scheme onwards instead of starting
/// after the hyphen.
let URLREGEX = #"[a-zA-Z0-9@:%._\+~#=()?\/\-]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"#
/// The data captured for a shared page: its URL plus, when available, its title and HTML.
public struct PageScrapePayload {
  /// The kind of document the payload's `url` is taken to point at.
  public enum ContentType {
    case html
    case pdf
  }

  /// Page title, or `nil` when none was captured.
  public let title: String?
  /// Page HTML, or `nil` when none was captured.
  public let html: String?
  /// Address of the page, kept as the string it was created with.
  public let url: String
  /// Content type inferred from `url` when the payload is created.
  public let contentType: ContentType

  /// Creates a payload and infers `contentType` from `url`.
  ///
  /// - Parameters:
  ///   - url: Address of the page.
  ///   - title: Page title, if known.
  ///   - html: Page HTML, if known.
  init(url: String, title: String?, html: String?) {
    self.url = url
    self.title = title
    self.html = html
    self.contentType = PageScrapePayload.inferContentType(from: url)
  }

  /// Returns `.pdf` when either the whole string or its URL path ends in `.pdf`
  /// (compared case-insensitively), `.html` otherwise.
  ///
  /// Checking the parsed path as well as the raw string means a trailing query or
  /// fragment (`paper.pdf?download=1`, `paper.pdf#page=2`) does not hide the extension.
  private static func inferContentType(from url: String) -> ContentType {
    // An unparseable string simply contributes no path; the raw-string check still applies.
    let path = URLComponents(string: url)?.path ?? ""
    let looksLikePDF = [url, path].contains { $0.lowercased().hasSuffix(".pdf") }
    return looksLikePDF ? .pdf : .html
  }
}
/// Error carrying a plain-text explanation of why a scrape produced no payload.
public struct PageScrapeError: Error, LocalizedError {
  /// Explanation of what went wrong.
  public let message: String

  /// Exposes `message` through `localizedDescription`, so callers that only see an
  /// `Error` still get the explanation rather than a generic system description.
  public var errorDescription: String? { message }
}
/// Namespace for building a `PageScrapePayload` from the input items of an extension context.
public enum PageScraper {
  private typealias Completion = (Result<PageScrapePayload, PageScrapeError>) -> Void

  /// Produces a payload from the first input item of `extensionContext`.
  ///
  /// Every attachment of the first input item that conforms to the property-list type is
  /// loaded and converted with `PageScrapePayload.make(item:)`. If any of them yields a
  /// payload it is reported as success; otherwise the work is handed to `scrapeURLOnly`.
  ///
  /// - Parameters:
  ///   - extensionContext: Context whose input items are inspected; may be `nil`.
  ///   - completion: Receives the outcome. Called synchronously with a "no attachments"
  ///     failure when the first input item has no attachments; in every other case it is
  ///     invoked from the main queue.
  public static func scrape(
    extensionContext: NSExtensionContext?,
    completion: @escaping (Result<PageScrapePayload, PageScrapeError>) -> Void
  ) {
    let extensionItem = extensionContext?.inputItems.first as? NSExtensionItem
    guard let attachments = extensionItem?.attachments else {
      completion(.failure(PageScrapeError(message: "no attachments")))
      return
    }

    var pageScrapePayload: PageScrapePayload?
    let propertyListKey = String(kUTTypePropertyList)
    let group = DispatchGroup()

    for attachment in attachments where attachment.hasItemConformingToTypeIdentifier(propertyListKey) {
      group.enter()
      attachment.loadItem(forTypeIdentifier: propertyListKey, options: nil) { item, _ in
        // Balance the `enter()` above no matter how this handler exits.
        defer { group.leave() }
        if let payload = PageScrapePayload.make(item: item) {
          pageScrapePayload = payload
        }
      }
    }

    // Runs once every started load has finished (immediately if none were started).
    group.notify(queue: .main) {
      guard let payload = pageScrapePayload else {
        scrapeURLOnly(extensionContext: extensionContext, completion: completion)
        return
      }
      completion(.success(payload))
    }
  }

  /// Fallback that builds a URL-only payload (no title, no HTML).
  ///
  /// Looks across all input items, first for an attachment conforming to the URL type and,
  /// failing that, for one conforming to the text type whose string contains something
  /// matching `URLREGEX`. Reports "could not find a link to save" when neither exists.
  ///
  /// - Parameters:
  ///   - extensionContext: Context whose input items are inspected; may be `nil`.
  ///   - completion: Receives the outcome. Load results are delivered on the main queue;
  ///     the "could not find a link to save" failure is delivered synchronously.
  private static func scrapeURLOnly(
    extensionContext: NSExtensionContext?,
    completion: @escaping (Result<PageScrapePayload, PageScrapeError>) -> Void
  ) {
    let inputItems = extensionContext?.inputItems

    // First look for a URL type.
    let urlKey = String(kUTTypeURL)
    if let urlProvider = firstProvider(in: inputItems, conformingTo: urlKey) {
      loadURL(from: urlProvider, typeIdentifier: urlKey, completion: completion)
      return
    }

    // No URL type was found, so check for a string that contains a URL.
    let textKey = String(kUTTypeText)
    if let textProvider = firstProvider(in: inputItems, conformingTo: textKey) {
      loadURLFromText(from: textProvider, typeIdentifier: textKey, completion: completion)
      return
    }

    completion(.failure(PageScrapeError(message: "could not find a link to save")))
  }

  /// Variant of the URL-only fallback that inspects just the first input item and uses the
  /// UTF-16 plain-text type for the text lookup. Reports "no public url" when the first
  /// item offers neither a URL nor such text.
  ///
  /// NOTE(review): nothing in the code visible here calls this method.
  ///
  /// - Parameters:
  ///   - extensionContext: Context whose first input item is inspected; may be `nil`.
  ///   - completion: Receives the outcome. Load results are delivered on the main queue;
  ///     the "no public url" failure is delivered synchronously.
  private static func tryScrapeUrlFromText(
    extensionContext: NSExtensionContext?,
    completion: @escaping (Result<PageScrapePayload, PageScrapeError>) -> Void
  ) {
    let firstItemOnly = Array((extensionContext?.inputItems ?? []).prefix(1))

    let urlKey = String(kUTTypeURL)
    if let urlProvider = firstProvider(in: firstItemOnly, conformingTo: urlKey) {
      loadURL(from: urlProvider, typeIdentifier: urlKey, completion: completion)
      return
    }

    // The text provider is looked up on its own. Reusing the provider chosen for URL
    // conformance would make this path unreachable, since that provider is nil whenever
    // no URL attachment exists.
    let textKey = String(kUTTypeUTF16PlainText)
    if let textProvider = firstProvider(in: firstItemOnly, conformingTo: textKey) {
      loadURLFromText(from: textProvider, typeIdentifier: textKey, completion: completion)
      return
    }

    completion(.failure(PageScrapeError(message: "no public url")))
  }

  // MARK: - Helpers

  /// Returns the first attachment conforming to `typeIdentifier`, scanning `inputItems` in
  /// order and skipping entries that are not `NSExtensionItem`s.
  private static func firstProvider(
    in inputItems: [Any]?,
    conformingTo typeIdentifier: String
  ) -> NSItemProvider? {
    for case let extensionItem as NSExtensionItem in inputItems ?? [] {
      let match = extensionItem.attachments?.first(where: { attachment in
        attachment.hasItemConformingToTypeIdentifier(typeIdentifier)
      })
      if let provider = match {
        return provider
      }
    }
    return nil
  }

  /// Loads `typeIdentifier` from `provider` and reports a payload for the resulting URL.
  /// The loaded item is accepted either as a `URL` or as UTF-8 `Data` holding a URL string.
  private static func loadURL(
    from provider: NSItemProvider,
    typeIdentifier: String,
    completion: @escaping Completion
  ) {
    provider.loadItem(forTypeIdentifier: typeIdentifier, options: nil) { item, _ in
      let shareURL = item as? URL
      let shareURLFromData = (item as? Data)
        .flatMap { String(data: $0, encoding: .utf8) }
        .flatMap { URL(string: $0) }
      finish(with: shareURL ?? shareURLFromData, completion: completion)
    }
  }

  /// Loads `typeIdentifier` from `provider` as a `String` and reports a payload for the
  /// first substring matching `URLREGEX`.
  private static func loadURLFromText(
    from provider: NSItemProvider,
    typeIdentifier: String,
    completion: @escaping Completion
  ) {
    provider.loadItem(forTypeIdentifier: typeIdentifier, options: nil) { item, _ in
      let url = (item as? String).flatMap { firstURL(in: $0) }
      finish(with: url, completion: completion)
    }
  }

  /// Returns a URL built from the first substring of `text` matching `URLREGEX`, if any.
  private static func firstURL(in text: String) -> URL? {
    guard let range = text.range(of: URLREGEX, options: .regularExpression) else { return nil }
    return URL(string: String(text[range]))
  }

  /// Builds a payload from `url` and delivers the outcome to `completion` on the main queue;
  /// a `nil` URL is reported as a "could not extract url" failure.
  private static func finish(with url: URL?, completion: @escaping Completion) {
    let pageScrapePayload = PageScrapePayload.make(url: url)
    DispatchQueue.main.async {
      if let payload = pageScrapePayload {
        completion(.success(payload))
      } else {
        completion(.failure(PageScrapeError(message: "could not extract url")))
      }
    }
  }
}
// MARK: - Factories

private extension PageScrapePayload {
  /// Wraps `url` in a payload that carries only the address (no title, no HTML).
  /// Returns `nil` when `url` is `nil`.
  static func make(url: URL?) -> PageScrapePayload? {
    url.map { PageScrapePayload(url: $0.absoluteString, title: nil, html: nil) }
  }

  /// Builds a payload from a loaded item that is expected to be a dictionary holding a
  /// nested dictionary under `NSExtensionJavaScriptPreprocessingResultsKey`.
  ///
  /// The nested dictionary must contain a string `"url"`; `"documentHTML"` and `"title"`
  /// are optional strings. Returns `nil` when the item does not have that shape or the
  /// URL string is missing.
  static func make(item: NSSecureCoding?) -> PageScrapePayload? {
    guard
      let container = item as? NSDictionary,
      let scraped = container[NSExtensionJavaScriptPreprocessingResultsKey] as? NSDictionary,
      let address = scraped["url"] as? String
    else {
      return nil
    }

    return PageScrapePayload(
      url: address,
      title: scraped["title"] as? String,
      html: scraped["documentHTML"] as? String
    )
  }
}
}