From 4c7a091bb578772f99f29dc65f5dbc3c91184b9f Mon Sep 17 00:00:00 2001 From: Ross Spencer Date: Sat, 16 Mar 2019 16:39:18 +0100 Subject: [PATCH] Enable the library to accept protocol extensions When a user initializes the library for use they can now supply an array of protocols to extend the number which can be identified by the linkscanner. Connected to httpreserve/tikalinkextract#6 --- linkscanner.go | 89 +++++++++++++++++++++------------------------ linkscanner_test.go | 42 ++++++++++++++++----- version.go | 2 +- 3 files changed, 75 insertions(+), 58 deletions(-) diff --git a/linkscanner.go b/linkscanner.go index be2ce8a..17fdd88 100644 --- a/linkscanner.go +++ b/linkscanner.go @@ -7,35 +7,33 @@ import ( "strings" ) -// Package linkscanner scans an abitrary piece of text and extracts a URL; currently -// one of HTTP:// HTTPS:// and FTP://. Because a length of text can contain more than -// one URL we return both a list of URLs, and a list of URL parsing errors -// encountered by way of ensuring the link returned to the calling code is -// as valid as possible before further use of it. +// Package linkscanner scans an arbitrary piece of text and extracts a URL; +// currently one of HTTP:// HTTPS:// and FTP://. Because a length of text can +// contain more than one URL we return both a list of URLs, and a list of URL +// parsing errors encountered by way of ensuring the link returned to the +// calling code is as valid as possible before further use of it. var fixProtocol = true // strings to look for that indicate a web resource -var ( - protoHTTPS = "https://" - protoHTTP = "http://" - protoWww = "www." // technically not a protocol - protoFtp = "ftp://" - protoMailto = "mailto:" -) +var protocol = []string{ + "https://", + "http://", + "ftp://", + "mailto:", +} +var protoWWW = "www." 
// technically not a protocol //common line endings that shouldn't be in URL var common = []string{"�", "\"", "'", ":", ";", ".", "`", ",", "*", ">", ")", "]"} func cleanLink(link string, www bool) string { if www && fixProtocol { - link = protoHTTP + link + link = "http://" + link } - //utf-8 replacement code character //https://codingrigour.wordpress.com/2011/02/17/the-case-of-the-mysterious-characters/ link = strings.Replace(link, "\xEF\xBF\xBD", "", 1) - // replace common invalid line-endings for _, x := range common { if x == link[len(link)-1:] { @@ -47,7 +45,7 @@ func cleanLink(link string, www bool) string { } // FixWWW enables the override of the default setting in this package to -// fix wwww links where there isn't a protocol specificed, e.g. http:// +// fix www links where there isn't a protocol specified, e.g. http:// func FixWWW(f bool) { fixProtocol = f } @@ -55,23 +53,17 @@ func FixWWW(f bool) { func retrieveLink(literal string) (string, error) { literal = strings.ToLower(literal) var link string - if strings.Contains(literal, protoHTTPS) { - literal = literal[strings.Index(literal, protoHTTPS):] - link = cleanLink(literal, false) - } else if strings.Contains(literal, protoHTTP) { - literal = literal[strings.Index(literal, protoHTTP):] - link = cleanLink(literal, false) - } else if strings.Contains(literal, protoFtp) { - literal = literal[strings.Index(literal, protoFtp):] - link = cleanLink(literal, false) - } else if strings.Contains(literal, protoWww) { - literal = literal[strings.Index(literal, protoWww):] + if strings.Contains(literal, protoWWW) { + literal = literal[strings.Index(literal, protoWWW):] link = cleanLink(literal, true) - } else if strings.Contains(literal, protoMailto) { - literal = literal[strings.Index(literal, protoMailto):] - link = cleanLink(literal, false) + } else { + for _, proto := range protocol { + if strings.Contains(literal, proto) { + literal = literal[strings.Index(literal, proto):] + link = cleanLink(literal, 
false) + } + } } - if link != "" { _, err := url.Parse(link) if err != nil { @@ -79,24 +71,31 @@ func retrieveLink(literal string) (string, error) { return "", err } } - return link, nil } -// HTTPScanner expects a length of text as input and returns -// two slices dependant on what it discovers. First a unique list of -// URLs parsed successfully by net/url. Second a list of errors -// that were encountered trying to parse the URL found in the text. -func HTTPScanner(content string) ([]string, []error) { +// LoadExtensions enables linkscanner to look beyond the default protocols that +// are specified in the library. +func LoadExtensions(extensions []string) { + protocol = append(protocol, extensions...) +} + +// ListProtocols returns to the caller a list of protocols that we're testing +// for when scanning text. +func ListProtocols() []string { + return protocol +} +// HTTPScanner expects a length of text as input and returns two slices +// dependent on what it discovers. First a unique list of URLs parsed +// successfully by net/url. Second a list of errors that were encountered +// trying to parse the URL found in the text. +func HTTPScanner(content string) ([]string, []error) { var hyperlinkList []string var errorsList []error - reader := bufio.NewReader(strings.NewReader(content)) scanner := bufio.NewScanner(reader) - scanner.Split(bufio.ScanWords) - for scanner.Scan() { link, err := retrieveLink(scanner.Text()) if err != nil { @@ -115,24 +114,19 @@ func HTTPScanner(content string) ([]string, []error) { } } } - return hyperlinkList, errorsList } -// HTTPScannerIndex prvides the same basic functionality of HTTPScanner. +// HTTPScannerIndex provides the same basic functionality of HTTPScanner. // The number of words scanned is monitored. This count becomes an position -integer providing an approximate index in the text where the hyperlink +integer providing an approximate index in the text where the hyperlink // was found. 
The returned value is not a zero-based index. func HTTPScannerIndex(content string) ([]map[int]string, []error) { - var hyperlinkList []map[int]string var errorsList []error - reader := bufio.NewReader(strings.NewReader(content)) scanner := bufio.NewScanner(reader) - scanner.Split(bufio.ScanWords) - var pos int for scanner.Scan() { pos++ @@ -146,6 +140,5 @@ func HTTPScannerIndex(content string) ([]map[int]string, []error) { hyperlinkList = append(hyperlinkList, tmp) } } - return hyperlinkList, errorsList } diff --git a/linkscanner_test.go b/linkscanner_test.go index df98ce0..45a38c6 100644 --- a/linkscanner_test.go +++ b/linkscanner_test.go @@ -1,30 +1,27 @@ package linkscanner -import "testing" +import ( + "strings" + "testing" +) func TestFixWWW(t *testing.T) { - var testlink = "www.example.com" var resultlink = "http://www.example.com" - linklist, errs := HTTPScanner(testlink) if len(errs) != 0 { t.Errorf("FAIL: Unexpected errors parsing WWW %v", errs) } - if len(linklist) > 0 && len(linklist) == 1 { if linklist[0] != resultlink { t.Errorf("FAIL: WWW not fixed %s became %s", testlink, linklist[0]) } } - FixWWW(false) - linklist, errs = HTTPScanner(testlink) if len(errs) != 0 { t.Errorf("FAIL: Unexpected errors parsing WWW %v", errs) } - if len(linklist) > 0 && len(linklist) == 1 { if linklist[0] != testlink { t.Errorf("FAIL: WWW incorrectly changed %s became %s", testlink, linklist[0]) @@ -35,12 +32,10 @@ func TestFixWWW(t *testing.T) { func TestIndexOutput(t *testing.T) { var testSentence = "this is a short www.example.com sentence." 
var pos = 5 - linklist, errs := HTTPScannerIndex(testSentence) if len(errs) != 0 { t.Errorf("FAIL: Unexpected errors parsing WWW %v", errs) } - if len(linklist) > 0 && len(linklist) == 1 { for x := range linklist { for k := range linklist[x] { @@ -51,3 +46,32 @@ func TestIndexOutput(t *testing.T) { } } } + +func TestLoadExtensions(t *testing.T) { + originalLen := len(ListProtocols()) + // Info is a good example, registry here: https://en.wikipedia.org/wiki/Info_URI_scheme + arr := []string{"pw://", "info:ark/", "go:", "info:pronom/", "info:hdl/"} + LoadExtensions(arr) + if len(ListProtocols()) != (originalLen + len(arr)) { + t.Errorf("FAIL: length of protocols (%d) following extension is not the correct length (%d)", originalLen, len(arr)) + } + template := "{{proto}}" + var testSentence = "this is a short {{proto}}example.com sentence." + var pos = 5 + for _, proto := range arr { + testString := strings.Replace(testSentence, template, proto, 1) + linklist, errs := HTTPScannerIndex(testString) + if len(errs) != 0 { + t.Errorf("FAIL: Unexpected errors parsing %s %v", proto, errs) + } + if len(linklist) > 0 && len(linklist) == 1 { + for x := range linklist { + for k := range linklist[x] { + if k != pos { + t.Errorf("FAIL: Index returned is different than expected %d received, expected %d", k, pos) + } + } + } + } + } +} diff --git a/version.go b/version.go index 42f4d10..b1369ae 100644 --- a/version.go +++ b/version.go @@ -1,6 +1,6 @@ package linkscanner -var version = "linkscanner-0.0.2" +var version = "linkscanner-0.0.3" // GetVersion returns the version of the package // to the calling code.