Skip to content

Commit

Permalink
Capture baseURL
Browse files Browse the repository at this point in the history
  • Loading branch information
LeonardoCardoso committed Sep 23, 2021
1 parent 5ade1de commit dc24b4e
Show file tree
Hide file tree
Showing 12 changed files with 166 additions and 38 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
- Updated regex limit [#148](https://github.com/LeonardoCardoso/SwiftLinkPreview/issues/148)
- Changed by [kinhvodoi92](https://github.com/kinhvodoi92)
- Annotated `Cancelable.cancel()` as `@objc` to make it compatible with Objective-C [#135](https://github.com/LeonardoCardoso/SwiftLinkPreview/issues/135)
- Capture base URL [#45](https://github.com/LeonardoCardoso/SwiftLinkPreview/issues/45)
- Changed by [LeonardoCardoso](https://github.com/LeonardoCardoso)


Expand Down
21 changes: 11 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,16 +121,17 @@ let preview = slp.preview("Text containing URL",

```swift
Response {
let url: URL // URL
let finalUrl: URL // unshortened URL
let canonicalUrl: String // canonical URL
let title: String // title
let description: String // page description or relevant text
let images: [String] // array of URLs of the images
let image: String // main image
let icon: String // favicon
let video: String // video
let price: String // price
let baseURL: String? // base
let url: URL? // URL
let finalUrl: URL? // unshortened URL
let canonicalUrl: String? // canonical URL
let title: String? // title
let description: String? // page description or relevant text
let images: [String]? // array of URLs of the images
let image: String? // main image
let icon: String? // favicon
let video: String? // video
let price: String? // price
}
```

Expand Down
5 changes: 3 additions & 2 deletions Sources/Regex.swift
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ class Regex {
static let imageTagPattern = "<img(.+?)src=\"([^\"](.+?))\"(.+?)[/]?>"
static let secondaryImageTagPattern = "og:image\"(.+?)content=\"([^\"](.+?))\"(.+?)[/]?>"
static let titlePattern = "<title(.*?)>(.*?)</title>"
static let metatagPattern = "<meta(.*?)>"
static let metatagContentPattern = "content=(\"(.*?)\")|('(.*?)')"
static let metaTagPattern = "<meta(.*?)>"
static let baseTagPattern = "<base(.+?)href=\"(.*?)\"(.+?)[/]?>"
static let metaTagContentPattern = "content=(\"(.*?)\")|('(.*?)')"
static let cannonicalUrlPattern = "([^\\+&#@%\\?=~_\\|!:,;]+)"
static let rawTagPattern = "<[^>]+>"
static let inlineStylePattern = "<style(.*?)>(.*?)</style>"
Expand Down
3 changes: 2 additions & 1 deletion Sources/Response.swift
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
import Foundation

public struct Response {


public internal(set) var baseURL: String?
public internal(set) var url: URL?
public internal(set) var finalUrl: URL?
public internal(set) var canonicalUrl: String?
Expand Down
6 changes: 6 additions & 0 deletions Sources/ResponseExtension.swift
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ internal extension Response {

var dictionary: [String: Any] {
var responseData:[String: Any] = [:]
responseData["baseURL"] = baseURL
responseData["url"] = url
responseData["finalUrl"] = finalUrl
responseData["canonicalUrl"] = canonicalUrl
Expand All @@ -35,11 +36,14 @@ internal extension Response {
case images
case icon
case video
case baseURL
case price
}

mutating func set(_ value: Any, for key: Key) {
switch key {
case Key.baseURL:
if let value = value as? String { self.baseURL = value }
case Key.url:
if let value = value as? URL { self.url = value }
case Key.finalUrl:
Expand All @@ -65,6 +69,8 @@ internal extension Response {

func value(for key: Key) -> Any? {
switch key {
case Key.baseURL:
return self.baseURL
case Key.url:
return self.url
case Key.finalUrl:
Expand Down
54 changes: 45 additions & 9 deletions Sources/SwiftLinkPreview.swift
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,11 @@ open class SwiftLinkPreview: NSObject {

result.title = $0.title
result.description = $0.description
result.image = $0.image
result.images = $0.images
result.icon = $0.icon
result.video = $0.video

result.image = self.formatImageURL($0.image, base: $0.baseURL)
result.images = self.formatImageURLs($0.images, base: $0.baseURL)
result.icon = self.formatImageURL($0.icon, base: $0.baseURL)
result.video = self.formatImageURL($0.video, base: $0.baseURL)
result.price = $0.price

self.cache.slp_setCachedResponse(url: unshortened.absoluteString, response: result)
Expand All @@ -154,6 +155,28 @@ open class SwiftLinkPreview: NSObject {
return cancellable
}

/// Resolves a possibly-relative resource reference against the page's base URL.
///
/// - Parameters:
///   - url: The reference to format (image, icon or video); may be `nil`.
///   - base: The base URL string to resolve against, if one is available.
/// - Returns: `nil` when `url` is `nil`; `url` unchanged when it is already absolute
///   or when no base is available; otherwise `url` resolved against `base`.
private func formatImageURL(_ url: String?, base: String?) -> String? {
    guard let url = url else { return nil }
    guard let base = base else { return url }

    // Keep the historical "http" shortcut, and additionally treat any reference that
    // carries its own scheme (e.g. `data:`) as absolute so the base is never prepended to it.
    let hasScheme = URL(string: url)?.scheme != nil
    if url.starts(with: "http") || hasScheme { return url }

    // Standard reference resolution copes with "/rooted", "//host/path" and "../up"
    // references, which plain string concatenation turns into invalid URLs.
    if let baseURL = URL(string: base), baseURL.scheme != nil,
       let resolved = URL(string: url, relativeTo: baseURL) {
        return resolved.absoluteString
    }

    // Either string could not be parsed as a URL: fall back to simple concatenation.
    return "\(base)\(url)"
}

/// Formats every entry of `array` with `formatImageURL(_:base:)` and removes duplicates.
///
/// - Parameters:
///   - array: The references to format; may be `nil`.
///   - base: The base URL string handed to `formatImageURL(_:base:)` for each entry.
/// - Returns: `nil` when `array` is `nil`; otherwise the formatted entries with
///   duplicates removed, in the order in which they first appeared.
func formatImageURLs(_ array: [String]?, base: String?) -> [String]? {
    guard let array = array else { return nil }

    // Tracks entries already emitted so duplicates are dropped without reshuffling the
    // result (round-tripping through `Set` alone yields an unspecified order).
    var seen = Set<String>()

    return array
        // Format each element itself; the previous loop always formatted the first one.
        .map { formatImageURL($0, base: base) ?? $0 }
        .filter { seen.insert($0).inserted }
}

/*
Extract url redirection inside the GET query.
Like https://www.dji.com/404?url=http%3A%2F%2Fwww.dji.com%2Fmatrice600-pro%2Finfo#specs -> http://www.dji.com/de/matrice600-pro/info#specs
Expand Down Expand Up @@ -287,9 +310,9 @@ extension SwiftLinkPreview {
CFStringConvertIANACharSetNameToEncoding( $0 as CFString ) ) )
} ?? .utf8
if let html = String( data: data, encoding: encoding ) {
for meta in Regex.pregMatchAll( html, regex: Regex.metatagPattern, index: 1 ) {
for meta in Regex.pregMatchAll( html, regex: Regex.metaTagPattern, index: 1 ) {
if (meta.contains( "http-equiv=\"refresh\"" ) || meta.contains( "http-equiv='refresh'" )),
let value = Regex.pregMatchFirst( meta, regex: Regex.metatagContentPattern, index: 2 )?.decoded.extendedTrim,
let value = Regex.pregMatchFirst( meta, regex: Regex.metaTagContentPattern, index: 2 )?.decoded.extendedTrim,
let redirectString = value.split( separator: ";" )
.first( where: { $0.lowercased().starts( with: "url=" ) } )?
.split( separator: "=", maxSplits: 1 ).last,
Expand Down Expand Up @@ -444,6 +467,8 @@ extension SwiftLinkPreview {

result = self.crawlMetaTags(sanitizedHtmlCode, result: result)

result = self.crawlMetaBase(sanitizedHtmlCode, result: result)

var otherResponse = self.crawlTitle(sanitizedHtmlCode, result: result)

otherResponse = self.crawlDescription(otherResponse.htmlCode, result: otherResponse.result)
Expand Down Expand Up @@ -534,10 +559,10 @@ extension SwiftLinkPreview {
Response.Key.title.rawValue,
Response.Key.description.rawValue,
Response.Key.image.rawValue,
Response.Key.video.rawValue,
Response.Key.video.rawValue
]

let metatags = Regex.pregMatchAll(htmlCode, regex: Regex.metatagPattern, index: 1)
let metatags = Regex.pregMatchAll(htmlCode, regex: Regex.metaTagPattern, index: 1)

for metatag in metatags {
for tag in possibleTags {
Expand All @@ -552,7 +577,7 @@ extension SwiftLinkPreview {

if let key = Response.Key(rawValue: tag),
result.value(for: key) == nil {
if let value = Regex.pregMatchFirst(metatag, regex: Regex.metatagContentPattern, index: 2) {
if let value = Regex.pregMatchFirst(metatag, regex: Regex.metaTagContentPattern, index: 2) {
let value = value.decoded.extendedTrim
if tag == "image" {
let value = addImagePrefixIfNeeded(value, result: result)
Expand All @@ -572,6 +597,17 @@ extension SwiftLinkPreview {
return result
}

/// Captures the page's base URL from its `<base>` tag, if one is present.
///
/// - Parameters:
///   - htmlCode: The HTML to search with `Regex.baseTagPattern`.
///   - result: The response collected so far.
/// - Returns: A copy of `result` with `baseURL` set from the first match, or `result`
///   itself when the pattern does not match.
internal func crawlMetaBase(_ htmlCode: String, result: Response) -> Response {

    // Index 2 is passed through to the matcher; in `baseTagPattern` the second group is the `href` value.
    let hrefs = Regex.pregMatchAll(htmlCode, regex: Regex.baseTagPattern, index: 2)

    guard let href = hrefs.first else { return result }

    var updated = result
    updated.set(href, for: .baseURL)

    return updated
}

// Crawl for title if needed
internal func crawlTitle(_ htmlCode: String, result: Response) -> (htmlCode: String, result: Response) {
var result = result
Expand Down
44 changes: 30 additions & 14 deletions SwiftLinkPreview.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@

/* Begin PBXBuildFile section */
1F8164ED26287866000F2905 /* VideoTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1F8164EC26287866000F2905 /* VideoTests.swift */; };
27BCC85826FCF22E00886BDA /* BaseURLTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 27BCC85726FCF22E00886BDA /* BaseURLTests.swift */; };
27BCC85D26FCF3BF00886BDA /* BaseURLTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 27BCC85726FCF22E00886BDA /* BaseURLTests.swift */; };
27BCC85E26FCF3C000886BDA /* BaseURLTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 27BCC85726FCF22E00886BDA /* BaseURLTests.swift */; };
27BCC86026FCF4C000886BDA /* head-meta-base.html in Resources */ = {isa = PBXBuildFile; fileRef = 27BCC85F26FCF4C000886BDA /* head-meta-base.html */; };
27BCC86126FCF4C000886BDA /* head-meta-base.html in Resources */ = {isa = PBXBuildFile; fileRef = 27BCC85F26FCF4C000886BDA /* head-meta-base.html */; };
27BCC86226FCF4C000886BDA /* head-meta-base.html in Resources */ = {isa = PBXBuildFile; fileRef = 27BCC85F26FCF4C000886BDA /* head-meta-base.html */; };
68074FFA1F23B6C900649DE6 /* head-meta-icon.html in Resources */ = {isa = PBXBuildFile; fileRef = 68074FF91F23B6C900649DE6 /* head-meta-icon.html */; };
68074FFB1F23BB1100649DE6 /* head-meta-icon.html in Resources */ = {isa = PBXBuildFile; fileRef = 68074FF91F23B6C900649DE6 /* head-meta-icon.html */; };
68074FFC1F23BB1400649DE6 /* head-meta-icon.html in Resources */ = {isa = PBXBuildFile; fileRef = 68074FF91F23B6C900649DE6 /* head-meta-icon.html */; };
Expand Down Expand Up @@ -150,6 +156,8 @@

/* Begin PBXFileReference section */
1F8164EC26287866000F2905 /* VideoTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = VideoTests.swift; sourceTree = "<group>"; };
27BCC85726FCF22E00886BDA /* BaseURLTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BaseURLTests.swift; sourceTree = "<group>"; };
27BCC85F26FCF4C000886BDA /* head-meta-base.html */ = {isa = PBXFileReference; lastKnownFileType = text.html; path = "head-meta-base.html"; sourceTree = "<group>"; };
68074FF91F23B6C900649DE6 /* head-meta-icon.html */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.html; path = "head-meta-icon.html"; sourceTree = "<group>"; };
686E58DE1F22416D000C2A33 /* IconTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = IconTests.swift; sourceTree = "<group>"; };
7A552DE121A460910019E8B1 /* Response.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Response.swift; sourceTree = "<group>"; };
Expand Down Expand Up @@ -257,17 +265,18 @@
985DCEB01D2BFD2700B40D76 /* Files */ = {
isa = PBXGroup;
children = (
98B5ED421D3E5F5C00AEBD54 /* head-meta-itemprop.html */,
986D5BE61D33E0FD0025555F /* head-title.html */,
985DCEBB1D2BFFAF00B40D76 /* body-text-span.html */,
985DCEBC1D2BFFAF00B40D76 /* body-text-p.html */,
985DCEBD1D2BFFAF00B40D76 /* body-text-div.html */,
985DCEB71D2BFE4100B40D76 /* body-image-single.html */,
985DCEB81D2BFE4100B40D76 /* body-image-gallery.html */,
985DCEB11D2BFD3400B40D76 /* head-meta-twitter.html */,
985DCEB21D2BFD3400B40D76 /* head-meta-meta.html */,
985DCEB71D2BFE4100B40D76 /* body-image-single.html */,
985DCEBD1D2BFFAF00B40D76 /* body-text-div.html */,
985DCEBC1D2BFFAF00B40D76 /* body-text-p.html */,
985DCEBB1D2BFFAF00B40D76 /* body-text-span.html */,
27BCC85F26FCF4C000886BDA /* head-meta-base.html */,
985DCEB31D2BFD3400B40D76 /* head-meta-facebook.html */,
68074FF91F23B6C900649DE6 /* head-meta-icon.html */,
98B5ED421D3E5F5C00AEBD54 /* head-meta-itemprop.html */,
985DCEB21D2BFD3400B40D76 /* head-meta-meta.html */,
985DCEB11D2BFD3400B40D76 /* head-meta-twitter.html */,
986D5BE61D33E0FD0025555F /* head-title.html */,
);
name = Files;
sourceTree = "<group>";
Expand Down Expand Up @@ -358,18 +367,19 @@
98DC53391D1D73DB001134E3 /* SwiftLinkPreviewTests */ = {
isa = PBXGroup;
children = (
98B5ED491D3E7DC600AEBD54 /* HugeTests.swift */,
27BCC85726FCF22E00886BDA /* BaseURLTests.swift */,
985DCEC81D2C029700B40D76 /* BodyTests.swift */,
988B48D61D2C3C2E0040A4AD /* Constants */,
985DCEB01D2BFD2700B40D76 /* Files */,
98B5ED491D3E7DC600AEBD54 /* HugeTests.swift */,
686E58DE1F22416D000C2A33 /* IconTests.swift */,
985DCEC61D2C026000B40D76 /* ImageTests.swift */,
1F8164EC26287866000F2905 /* VideoTests.swift */,
98E7C3121D3B23F5009E5F6D /* Info */,
985DCEC41D2C022E00B40D76 /* MetaTests.swift */,
982812911D3A9293000D3ABB /* RegexTests.swift */,
986D5BE41D33DFE50025555F /* TitleTests.swift */,
686E58DE1F22416D000C2A33 /* IconTests.swift */,
988B48D61D2C3C2E0040A4AD /* Constants */,
985DCEB01D2BFD2700B40D76 /* Files */,
98E7C3121D3B23F5009E5F6D /* Info */,
988B48D11D2C39790040A4AD /* Utils */,
1F8164EC26287866000F2905 /* VideoTests.swift */,
);
path = SwiftLinkPreviewTests;
sourceTree = "<group>";
Expand Down Expand Up @@ -641,6 +651,7 @@
985DCEBF1D2BFFAF00B40D76 /* body-text-span.html in Resources */,
985DCEC01D2BFFAF00B40D76 /* body-text-p.html in Resources */,
985DCEC11D2BFFAF00B40D76 /* body-text-div.html in Resources */,
27BCC86026FCF4C000886BDA /* head-meta-base.html in Resources */,
985DCEB91D2BFE4100B40D76 /* body-image-single.html in Resources */,
985DCEB61D2BFD3400B40D76 /* head-meta-facebook.html in Resources */,
986D5BE71D33E0FD0025555F /* head-title.html in Resources */,
Expand All @@ -666,6 +677,7 @@
98E7C32F1D3B24DA009E5F6D /* body-image-single.html in Resources */,
98B5ED461D3E62A200AEBD54 /* head-meta-itemprop.html in Resources */,
98E7C3301D3B24DA009E5F6D /* body-image-gallery.html in Resources */,
27BCC86226FCF4C000886BDA /* head-meta-base.html in Resources */,
98E7C3311D3B24DA009E5F6D /* head-meta-twitter.html in Resources */,
98E7C3321D3B24DA009E5F6D /* head-meta-meta.html in Resources */,
98E7C3331D3B24DA009E5F6D /* head-meta-facebook.html in Resources */,
Expand All @@ -691,6 +703,7 @@
98F76D1D1D3AF87100E9B10E /* body-image-single.html in Resources */,
98B5ED441D3E62A000AEBD54 /* head-meta-itemprop.html in Resources */,
98F76D1E1D3AF87100E9B10E /* body-image-gallery.html in Resources */,
27BCC86126FCF4C000886BDA /* head-meta-base.html in Resources */,
98F76D1F1D3AF87100E9B10E /* head-meta-twitter.html in Resources */,
98F76D201D3AF87100E9B10E /* head-meta-meta.html in Resources */,
98F76D211D3AF87100E9B10E /* head-meta-facebook.html in Resources */,
Expand Down Expand Up @@ -740,6 +753,7 @@
986D5BE51D33DFE60025555F /* TitleTests.swift in Sources */,
985DCEC71D2C026000B40D76 /* ImageTests.swift in Sources */,
988B48D81D2C3C3D0040A4AD /* Constants.swift in Sources */,
27BCC85826FCF22E00886BDA /* BaseURLTests.swift in Sources */,
9272A10D1E2EF0E600F9F17E /* Regex.swift in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
Expand Down Expand Up @@ -777,6 +791,7 @@
98E7C3281D3B24C6009E5F6D /* File.swift in Sources */,
98E7C3291D3B24C6009E5F6D /* IntExtension.swift in Sources */,
9272A10F1E2EF0E800F9F17E /* Regex.swift in Sources */,
27BCC85E26FCF3C000886BDA /* BaseURLTests.swift in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
Expand Down Expand Up @@ -813,6 +828,7 @@
98F76D121D3AF78600E9B10E /* File.swift in Sources */,
98F76D131D3AF78600E9B10E /* IntExtension.swift in Sources */,
9272A10E1E2EF0E700F9F17E /* Regex.swift in Sources */,
27BCC85D26FCF3BF00886BDA /* BaseURLTests.swift in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
Expand Down
55 changes: 55 additions & 0 deletions SwiftLinkPreviewTests/BaseURLTests.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
//
// BaseURLTests.swift
// SwiftLinkPreviewTests
//
// Created by Leonardo Cardoso on 23.09.21.
// Copyright © 2021 leocardz.com. All rights reserved.
//

import XCTest
@testable import SwiftLinkPreview

// This class tests capturing the base URL of a page and applying it to relative resource URLs
class BaseURLTests: XCTestCase {

    // MARK: - Vars
    var baseTemplate = ""
    let slp = SwiftLinkPreview()

    // MARK: - SetUps
    // Loads the HTML template named by Constants.headMetaBase before each test
    override func setUp() {
        super.setUp()

        baseTemplate = File.toString(Constants.headMetaBase)
    }

    // MARK: - Base
    // Fills the head and body placeholders with random tags, crawls the resulting
    // HTML and checks the base URL that was captured
    func setUpBaseAndRun() {
        let html = baseTemplate
            .replace(Constants.headRandom, with: String.randomTag())
            .replace(Constants.bodyRandom, with: String.randomTag())
            .extendedTrim

        let crawled = slp.crawlMetaBase(html, result: Response())

        XCTAssertEqual(crawled.baseURL, "https://host/resource/index/")
    }

    // Repeats the crawl 100 times, each run with freshly generated random tags
    func testBase() {
        (0 ..< 100).forEach { _ in setUpBaseAndRun() }
    }

    // A relative image path is expected to come back prefixed with the base URL
    func testResultBase() {
        let formatted = slp.formatImageURLs(["assets/test.png"], base: "https://host/resource/index/")

        XCTAssertEqual(formatted?.first, "https://host/resource/index/assets/test.png")
    }

}
1 change: 1 addition & 0 deletions SwiftLinkPreviewTests/Constants.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ struct Constants {
static let bodyIcon = "head-meta-icon"
static let headMetaTwitter = "head-meta-twitter"
static let headMetaMeta = "head-meta-meta"
static let headMetaBase = "head-meta-base"
static let headMetaItemprop = "head-meta-itemprop"
static let headMetaFacebook = "head-meta-facebook"
static let headTitle = "head-title"
Expand Down
10 changes: 10 additions & 0 deletions SwiftLinkPreviewTests/head-meta-base.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<html>
<head>
[:head-random]
<base href="https://host/resource/index/" target="_self">
</head>
<body>
[:body-random]
<img src="assets/test.png">
</body>
</html>
Loading

0 comments on commit dc24b4e

Please sign in to comment.