From fa4c2738924d96adadba503d06e9db5c898371ac Mon Sep 17 00:00:00 2001
From: Ross Spencer
Date: Sat, 16 Mar 2019 18:29:41 +0100
Subject: [PATCH] Add the ability for users to use custom protocols

This commit gives users the ability to specify custom protocols in an
extensions file so that uri-types that may not be as well-known can be
identified by the link scanner. Examples might include those used by
content management systems as internal mechanisms of accessing
information.

Connected to #6
---
 bulkfilehandler.go                       |  7 ----
 extensions-test/extensions-protocols.txt | 18 ++++++++++
 extensions.json                          | 10 ++++++
 scanner.go                               |  1 -
 tikahttpreserve.go => tikalinkextract.go | 43 ++++++++++++++----------
 utils.go                                 |  9 +++++
 version.go                               |  2 +-
 7 files changed, 63 insertions(+), 27 deletions(-)
 create mode 100644 extensions-test/extensions-protocols.txt
 create mode 100644 extensions.json
 rename tikahttpreserve.go => tikalinkextract.go (78%)
 create mode 100644 utils.go

diff --git a/bulkfilehandler.go b/bulkfilehandler.go
index 289a6a6..f52d163 100644
--- a/bulkfilehandler.go
+++ b/bulkfilehandler.go
@@ -12,7 +12,6 @@ type contenterror struct {
 }

 func extractAndAnalyse(filepool []filedata) (bool, error) {
-	//make channel run goroutine...
 	ch := make(chan contenterror)

 	for _, fi := range filepool {
@@ -26,7 +25,6 @@ func extractAndAnalyse(filepool []filedata) (bool, error) {
 			httpScanner(ce.fname, ce.content)
 		}
 	}
-
 	return false, nil
 }

@@ -42,10 +40,8 @@ func getFileContent(fi filedata, ch chan contenterror) {
 	//create empty struct to return...
 	var ce contenterror
 	ce.fname = fi.fname
-
 	//what are we doing..?
 	logFileMessage("INFO: '%s' being processed.", fi.fname)
-
 	//process...
 	fp, err := openFile(fi.fpath)
 	defer fp.Close()
@@ -54,20 +50,17 @@ func getFileContent(fi filedata, ch chan contenterror) {
 		ch <- ce
 		return
 	}
-
 	_, flRecursiveKeysValues, err := getTikaRecursive(fi.fname, fp, acceptJSON)
 	if err != nil {
 		ce.err = err
 		ch <- ce
 		return
 	}
-
 	if val, ok := flRecursiveKeysValues[tikaPlainText]; ok {
 		ce.content = val.(string)
 		ch <- ce
 		return
 	}
-
 	ce.err = fmt.Errorf("No plain text data to analyse.")
 	ch <- ce
 	return
diff --git a/extensions-test/extensions-protocols.txt b/extensions-test/extensions-protocols.txt
new file mode 100644
index 0000000..1ff61d1
--- /dev/null
+++ b/extensions-test/extensions-protocols.txt
@@ -0,0 +1,18 @@
+Jean-François Champollion rings of Uranus billions upon billions citizens of distant epochs two ghostly white figures in coveralls and helmets are soflty dancing trillion. Descended from astronomers gathered by gravity rich in heavy atoms something incredible is waiting to be known star stuff harvesting star light circumnavigated. Gathered by gravity take root and flourish hundreds of thousands Orion's sword vanquish the impossible are creatures of the cosmos. go:somedata.dat Courage of our questions gathered by gravity as a patch of light network of wormholes network of wormholes the ash of stellar alchemy?
+
+As a patch of light worldlets Apollonius of Perga ship of the imagination a mote of dust suspended in a sunbeam gathered by gravity? pw://somedata.dat Circumnavigated a mote of dust suspended in a sunbeam permanence of the stars invent the universe circumnavigated invent the universe. Bits of moving fluff star stuff harvesting star light made in the interiors of collapsing stars the ash of stellar alchemy the ash of stellar alchemy rich in heavy atoms?
+
+Light years globular star cluster radio telescope white dwarf brain is the seed of intelligence birth. Astonishment permanence of the stars laws of physics how far away made in the info:ark/somedata.dat interiors of collapsing stars rich in mystery? Star stuff harvesting star light vastness is bearable only through love two ghostly white figures in coveralls and helmets are soflty dancing stirred by starlight extraordinary claims require extraordinary evidence stirred by starlight.
+
+Realm of the galaxies laws of physics circumnavigated the carbon in our apple pies dispassionate extraterrestrial observer dream of the mind's eye. Descended from astronomers courage of our questions vastness is bearable only through love the only home we've ever known two ghostly white figures in coveralls and helmets are soflty dancing Orion's sword. Inconspicuous motes of rock and gas Sea of Tranquility not a sunrise but a galaxyrise not a sunrise but a galaxyrise Euclid bits of moving fluff?
+
+Bits of moving fluff consciousness quasar tendrils of gossamer clouds with pretty stories for which there's little good evidence invent the universe. info:pronom/somedata.dat Sea of Tranquility realm of the galaxies venture encyclopaedia galactica realm of the galaxies courage of our questions. A very small stage in a vast cosmic arena Sea of Tranquility permanence of the stars realm of the galaxies rings of Uranus realm of the galaxies. Two ghostly white figures in coveralls and helmets are soflty dancing another world something incredible is waiting to be known the sky calls to us hearts of the stars courage of our questions.
+
+Tunguska event astonishment vastness is bearable only through love extraplanetary Euclid venture? Billions upon billions with pretty stories for which there's little good evidence the only home we've ever known cosmic ocean citizens of distant epochs billions upon billions. Invent the universe shores of the cosmic ocean hearts of the stars the sky calls to us are creatures of the cosmos the sky calls to us. info:hdl/somedata.dat Paroxysm of global death two ghostly white figures in coveralls and helmets are soflty dancing bits of moving fluff Sea of Tranquility network of wormholes how far away and billions upon billions upon billions upon billions upon billions upon billions upon billions.
+
+
+
+
+
+
+
diff --git a/extensions.json b/extensions.json
new file mode 100644
index 0000000..24a81b5
--- /dev/null
+++ b/extensions.json
@@ -0,0 +1,10 @@
+{
+    "title": "Extensions data file for tikalinkextract",
+    "version": "0.0.1",
+    "Extensions": [
+        "pw://",
+        "info:ark/",
+        "info:pronom/",
+        "info:hdl/"
+    ]
+}
diff --git a/scanner.go b/scanner.go
index dd6fd2d..db9f79a 100644
--- a/scanner.go
+++ b/scanner.go
@@ -36,7 +36,6 @@ func httpScanner(fname string, content string) {
 			}
 		}
 	}
-
 	if len(errs) > 0 {
 		for _, e := range errs {
 			fmt.Fprintf(os.Stderr, "%s", e.Error())
diff --git a/tikahttpreserve.go b/tikalinkextract.go
similarity index 78%
rename from tikahttpreserve.go
rename to tikalinkextract.go
index a6e8584..69fb49b 100644
--- a/tikahttpreserve.go
+++ b/tikalinkextract.go
@@ -1,14 +1,21 @@
 package main

 import (
+	"encoding/json"
 	"flag"
 	"fmt"
+	"github.com/httpreserve/linkscanner"
+	"io/ioutil"
 	"os"
 	"path/filepath"
 	"sync"
 	"time"
 )

+type protocolExtensions struct {
+	Extensions []string
+}
+
 var (
 	noprotocol bool
 	file       string
@@ -17,14 +24,16 @@ var (
 	totalFiles int
 	quoteCells = false
 	seedList   = false
+	ext        string
 )

 func init() {
-	flag.StringVar(&file, "file", "false", "File to extract information from.")
+	flag.StringVar(&file, "file", "", "File to extract information from.")
 	flag.BoolVar(&vers, "version", false, "[Optional] Output version of the tool.")
 	flag.BoolVar(&noprotocol, "noprotocol", false, "[Optional] For www. links (without a protocol), don't prepend http://.")
 	flag.BoolVar(&quoteCells, "quote", false, "[Optional] Some URLs may contain commas, quote cells for your CSV parser.")
 	flag.BoolVar(&seedList, "seeds", false, "[Optional] Simply output a unique list of seeds for a web archiving tool like wget.")
+	flag.StringVar(&ext, "extensions", "", "JSON file listing additional protocols to extract links for.")
 }

 func outputList(linkpool []string) {
@@ -42,7 +51,6 @@ var wg sync.WaitGroup
 func processall(file string) {
 	//check the services are up and running
 	findOpenConnections()
-
 	//make a listing of all the files we're going to process
 	//efficient enough with memory?
 	err := filepath.Walk(file, readFile)
@@ -50,19 +58,14 @@ func processall(file string) {
 		logStringError("%v", err)
 		os.Exit(1)
 	}
-
 	//time how long it takes to process files and extract entities
 	start := time.Now()
-
 	//read each file into each server and collect results
-
 	if len(allfiles) <= 0 {
 		fmt.Fprintf(os.Stderr, "No files to process.\n")
 		os.Exit(1)
 	}
-
 	totalFiles = len(allfiles)
-
 	for x := 0; x < len(allfiles); x += fileThrottle {
 		remain := min(x+fileThrottle, len(allfiles))
 		filepool := allfiles[x:remain]
@@ -77,26 +80,37 @@ func processall(file string) {
 		linkpool := make([]string, len(linklist))
 		copy(linkpool, linklist)
 		linklist = linklist[:0]
-
 		// process output in background while we handle other files
 		wg.Add(1)
 		go outputList(linkpool)
-
 		//release waitgroup, exit... I believe this will prevent race
 		//conditions when working between the two lists in this loop.
 		wg.Wait()
 	}
-
 	//output that time...
 	elapsed := time.Since(start)
 	fmt.Fprintf(os.Stderr, "\nTika extract took %s\n", elapsed)
+}
+func loadExtensions(ext string) {
+	// Load extensions from an external file.
+	extensions, err := os.Open(ext)
+	if err != nil {
+		logStringError("INFO: %v", err)
+		return
+	}
+	defer extensions.Close()
+	bytes, _ := ioutil.ReadAll(extensions)
+	var exts protocolExtensions
+	json.Unmarshal(bytes, &exts)
+	linkscanner.LoadExtensions(exts.Extensions)
 }

 func main() {
 	flag.Parse()

 	if flag.NFlag() <= 0 { // can access args w/ len(os.Args[1:]) too
 		fmt.Fprintln(os.Stderr, "Usage: links [-file ...]")
+		fmt.Fprintln(os.Stderr, "Usage: [Optional -extensions]")
 		fmt.Fprintln(os.Stderr, "Usage: [Optional -noprotocol]")
 		fmt.Fprintln(os.Stderr, "Usage: [Optional -quote]")
 		fmt.Fprintln(os.Stderr, " [Optional -version]")
@@ -110,13 +124,6 @@ func main() {
 		os.Exit(1)
 	}

+	loadExtensions(ext)
 	processall(file)
 }
-
-//math.Min uses float64, so let's not cast
-func min(a, b int) int {
-	if a < b {
-		return a
-	}
-	return b
-}
diff --git a/utils.go b/utils.go
new file mode 100644
index 0000000..850ae1c
--- /dev/null
+++ b/utils.go
@@ -0,0 +1,9 @@
+package main
+
+//math.Min uses float64, so let's not cast
+func min(a, b int) int {
+	if a < b {
+		return a
+	}
+	return b
+}
diff --git a/version.go b/version.go
index dacd725..2f5d96e 100644
--- a/version.go
+++ b/version.go
@@ -2,7 +2,7 @@ package main

 import "github.com/httpreserve/linkscanner"

-var version = "tikalinkextract-0.0.2"
+var version = "tikalinkextract-0.0.3"

 func getVersion() string {
 	return version + "\n" + linkscanner.GetVersion()
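
Note (not part of the patch): the sketch below is a minimal illustration of how a
user-supplied extensions file in the same shape as extensions.json is expected to
unmarshal before its protocol prefixes are handed to the link scanner, mirroring the
protocolExtensions struct and the loadExtensions / linkscanner.LoadExtensions call
added above. The inline file contents and the "wiki://" and "info:doi/" prefixes are
made-up examples, not values shipped with the tool.

    package main

    // Illustrative only: parse an extensions file and report the protocols
    // that would be registered with the link scanner.

    import (
    	"encoding/json"
    	"fmt"
    	"log"
    )

    type protocolExtensions struct {
    	Extensions []string
    }

    func main() {
    	// A hypothetical user-supplied extensions file. Any scheme-like
    	// prefix a repository or CMS uses internally could be listed here.
    	raw := []byte(`{
    		"title": "My local protocols",
    		"version": "0.0.1",
    		"Extensions": ["wiki://", "info:doi/"]
    	}`)

    	var exts protocolExtensions
    	if err := json.Unmarshal(raw, &exts); err != nil {
    		log.Fatalf("cannot parse extensions file: %v", err)
    	}

    	// In the patch, loadExtensions passes this slice to
    	// linkscanner.LoadExtensions so these prefixes can also be
    	// identified in the text returned by Tika.
    	for _, protocol := range exts.Extensions {
    		fmt.Println("custom protocol:", protocol)
    	}
    }

Driven through the CLI, the same data would be supplied with the new flag, for example
links -file someFolder/ -extensions extensions.json (the binary and folder names here
follow the usage text above and are placeholders).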