Add the ability for users to use custom protocols
This commit gives users the ability to specify custom protocols in an
extensions file so that URI types that may not be as well known can
be identified by the link scanner. Examples include those used by
content management systems as internal mechanisms for accessing
information.

Connected to #6
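A hedged usage sketch (the binary name, documents/ path, and extensions.json filename are illustrative; the -file and -extensions flags themselves are defined in the diff below):

tikalinkextract -file documents/ -extensions extensions.json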
ross-spencer committed Mar 16, 2019
1 parent 5d839de commit fa4c273
Showing 7 changed files with 63 additions and 27 deletions.
7 changes: 0 additions & 7 deletions bulkfilehandler.go
@@ -12,7 +12,6 @@ type contenterror struct {
}

func extractAndAnalyse(filepool []filedata) (bool, error) {

//make a channel and run a goroutine per file...
ch := make(chan contenterror)
for _, fi := range filepool {
@@ -26,7 +25,6 @@ func extractAndAnalyse(filepool []filedata) (bool, error) {
httpScanner(ce.fname, ce.content)
}
}

return false, nil
}

@@ -42,10 +40,8 @@ func getFileContent(fi filedata, ch chan contenterror) {
//create empty struct to return...
var ce contenterror
ce.fname = fi.fname

//what are we doing..?
logFileMessage("INFO: '%s' being processed.", fi.fname)

//process...
fp, err := openFile(fi.fpath)
defer fp.Close()
@@ -54,20 +50,17 @@ func getFileContent(fi filedata, ch chan contenterror) {
ch <- ce
return
}

_, flRecursiveKeysValues, err := getTikaRecursive(fi.fname, fp, acceptJSON)
if err != nil {
ce.err = err
ch <- ce
return
}

if val, ok := flRecursiveKeysValues[tikaPlainText]; ok {
ce.content = val.(string)
ch <- ce
return
}

ce.err = fmt.Errorf("No plain text data to analyse.")
ch <- ce
return
18 changes: 18 additions & 0 deletions extensions-test/extensions-protocols.txt
@@ -0,0 +1,18 @@
Jean-François Champollion rings of Uranus billions upon billions citizens of distant epochs two ghostly white figures in coveralls and helmets are soflty dancing trillion. Descended from astronomers gathered by gravity rich in heavy atoms something incredible is waiting to be known star stuff harvesting star light circumnavigated. Gathered by gravity take root and flourish hundreds of thousands Orion's sword vanquish the impossible are creatures of the cosmos. go:somedata.dat Courage of our questions gathered by gravity as a patch of light network of wormholes network of wormholes the ash of stellar alchemy?

As a patch of light worldlets Apollonius of Perga ship of the imagination a mote of dust suspended in a sunbeam gathered by gravity? pw://somedata.dat Circumnavigated a mote of dust suspended in a sunbeam permanence of the stars invent the universe circumnavigated invent the universe. Bits of moving fluff star stuff harvesting star light made in the interiors of collapsing stars the ash of stellar alchemy the ash of stellar alchemy rich in heavy atoms?

Light years globular star cluster radio telescope white dwarf brain is the seed of intelligence birth. Astonishment permanence of the stars laws of physics how far away made in the info:ark/somedata.dat interiors of collapsing stars rich in mystery? Star stuff harvesting star light vastness is bearable only through love two ghostly white figures in coveralls and helmets are soflty dancing stirred by starlight extraordinary claims require extraordinary evidence stirred by starlight.

Realm of the galaxies laws of physics circumnavigated the carbon in our apple pies dispassionate extraterrestrial observer dream of the mind's eye. Descended from astronomers courage of our questions vastness is bearable only through love the only home we've ever known two ghostly white figures in coveralls and helmets are soflty dancing Orion's sword. Inconspicuous motes of rock and gas Sea of Tranquility not a sunrise but a galaxyrise not a sunrise but a galaxyrise Euclid bits of moving fluff?

Bits of moving fluff consciousness quasar tendrils of gossamer clouds with pretty stories for which there's little good evidence invent the universe. info:pronom/somedata.dat Sea of Tranquility realm of the galaxies venture encyclopaedia galactica realm of the galaxies courage of our questions. A very small stage in a vast cosmic arena Sea of Tranquility permanence of the stars realm of the galaxies rings of Uranus realm of the galaxies. Two ghostly white figures in coveralls and helmets are soflty dancing another world something incredible is waiting to be known the sky calls to us hearts of the stars courage of our questions.

Tunguska event astonishment vastness is bearable only through love extraplanetary Euclid venture? Billions upon billions with pretty stories for which there's little good evidence the only home we've ever known cosmic ocean citizens of distant epochs billions upon billions. Invent the universe shores of the cosmic ocean hearts of the stars the sky calls to us are creatures of the cosmos the sky calls to us. info:hdl/somedata.dat Paroxysm of global death two ghostly white figures in coveralls and helmets are soflty dancing bits of moving fluff Sea of Tranquility network of wormholes how far away and billions upon billions upon billions upon billions upon billions upon billions upon billions.







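With the protocols above registered, tokens such as pw://somedata.dat, info:ark/somedata.dat, info:pronom/somedata.dat, and info:hdl/somedata.dat in this fixture become identifiable to the scanner. A minimal sketch of prefix-based matching (an illustration only, not linkscanner's actual algorithm):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Prefixes mirror the entries in extensions.json below.
	prefixes := []string{"pw://", "info:ark/", "info:pronom/", "info:hdl/"}
	text := "gathered by gravity pw://somedata.dat network of wormholes info:ark/somedata.dat"
	// Split on whitespace and report any token carrying a registered prefix.
	for _, token := range strings.Fields(text) {
		for _, p := range prefixes {
			if strings.HasPrefix(token, p) {
				fmt.Println(token)
			}
		}
	}
}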
10 changes: 10 additions & 0 deletions extensions.json
@@ -0,0 +1,10 @@
{
"title": "Extensions data file for tikalinkextract",
"version": "0.0.1",
"Extensions": [
"pw://",
"info:ark/",
"info:pronom/",
"info:hdl/"
]
}
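As a minimal sketch, assuming only the standard library: this file decodes into the protocolExtensions struct introduced in this commit because json.Unmarshal matches the "Extensions" key to the exported field of the same name, while "title" and "version" are simply ignored.

package main

import (
	"encoding/json"
	"fmt"
)

type protocolExtensions struct {
	Extensions []string
}

func main() {
	data := []byte(`{"title": "Extensions data file for tikalinkextract",
		"version": "0.0.1",
		"Extensions": ["pw://", "info:ark/", "info:pronom/", "info:hdl/"]}`)
	var exts protocolExtensions
	if err := json.Unmarshal(data, &exts); err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(exts.Extensions) // [pw:// info:ark/ info:pronom/ info:hdl/]
}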
1 change: 0 additions & 1 deletion scanner.go
@@ -36,7 +36,6 @@ func httpScanner(fname string, content string) {
}
}
}

if len(errs) > 0 {
for _, e := range errs {
fmt.Fprintf(os.Stderr, "%s", e.Error())
43 changes: 25 additions & 18 deletions tikahttpreserve.go → tikalinkextract.go
@@ -1,14 +1,21 @@
package main

import (
"encoding/json"
"flag"
"fmt"
"github.com/httpreserve/linkscanner"
"io/ioutil"
"os"
"path/filepath"
"sync"
"time"
)

type protocolExtensions struct {
Extensions []string
}

var (
noprotocol bool
file string
@@ -17,14 +24,16 @@ var (
totalFiles int
quoteCells = false
seedList = false
ext string
)

func init() {
- flag.StringVar(&file, "file", "false", "File to extract information from.")
+ flag.StringVar(&file, "file", "", "File to extract information from.")
flag.BoolVar(&vers, "version", false, "[Optional] Output version of the tool.")
flag.BoolVar(&noprotocol, "noprotocol", false, "[Optional] For www. links (without a protocol), don't prepend http://.")
flag.BoolVar(&quoteCells, "quote", false, "[Optional] Some URLS may contain commas, quote cells for your CSV parser.")
flag.BoolVar(&seedList, "seeds", false, "[Optional] Simply output a unique list of seeds for a web archiving tool like wget.")
flag.StringVar(&ext, "extensions", "", "JSON file listing additional protocols to extract links for.")
}

func outputList(linkpool []string) {
@@ -42,27 +51,21 @@ var wg sync.WaitGroup
func processall(file string) {
//check the services are up and running
findOpenConnections()

//make a listing of all the files we're going to process
//efficient enough with memory?
err := filepath.Walk(file, readFile)
if err != nil {
logStringError("%v", err)
os.Exit(1)
}

//time how long it takes to process files and extract entities
start := time.Now()

//read each file into each server and collect results

if len(allfiles) <= 0 {
fmt.Fprintf(os.Stderr, "No files to process.\n")
os.Exit(1)
}

totalFiles = len(allfiles)

for x := 0; x < len(allfiles); x += fileThrottle {
remain := min(x+fileThrottle, len(allfiles))
filepool := allfiles[x:remain]
@@ -77,26 +80,37 @@ func processall(file string) {
linkpool := make([]string, len(linklist))
copy(linkpool, linklist)
linklist = linklist[:0]

// process output in background while we handle other files
wg.Add(1)
go outputList(linkpool)

//release waitgroup, exit... I believe this will prevent race
//conditions when working between the two lists in this loop.
wg.Wait()
}

//output that time...
elapsed := time.Since(start)
fmt.Fprintf(os.Stderr, "\nTika extract took %s\n", elapsed)
}

func loadExtensions(ext string) {
// Load extensions from an external file.
extensions, err := os.Open(ext)
if err != nil {
logStringError("INFO: %v", err)
return
}
defer extensions.Close()
data, err := ioutil.ReadAll(extensions)
if err != nil {
logStringError("INFO: %v", err)
return
}
var exts protocolExtensions
if err := json.Unmarshal(data, &exts); err != nil {
logStringError("INFO: %v", err)
return
}
linkscanner.LoadExtensions(exts.Extensions)
}

func main() {
flag.Parse()
if flag.NFlag() <= 0 { // can access args w/ len(os.Args[1:]) too
fmt.Fprintln(os.Stderr, "Usage: links [-file ...]")
fmt.Fprintln(os.Stderr, "Usage: [Optional -extensions]")
fmt.Fprintln(os.Stderr, "Usage: [Optional -noprotocol]")
fmt.Fprintln(os.Stderr, "Usage: [Optional -quote]")
fmt.Fprintln(os.Stderr, " [Optional -version]")
@@ -110,13 +124,6 @@ func main() {
os.Exit(1)
}

if ext != "" {
loadExtensions(ext)
}
processall(file)
}

//math.Min uses float64, so let's not cast
func min(a, b int) int {
if a < b {
return a
}
return b
}
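The throttling loop in processall above slices allfiles into fixed-size batches, with min clamping the bound of the final, possibly short, batch. A standalone sketch of the same pattern, assuming a fileThrottle value of 4 (the real constant is defined elsewhere in the repository):

package main

import "fmt"

// fileThrottle is assumed here for illustration; the repository defines the real value.
const fileThrottle = 4

func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}

func main() {
	allfiles := []string{"a.doc", "b.pdf", "c.txt", "d.odt", "e.pdf", "f.txt", "g.doc"}
	for x := 0; x < len(allfiles); x += fileThrottle {
		remain := min(x+fileThrottle, len(allfiles))
		filepool := allfiles[x:remain]
		fmt.Println(filepool) // [a.doc b.pdf c.txt d.odt] then [e.pdf f.txt g.doc]
	}
}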
9 changes: 9 additions & 0 deletions utils.go
@@ -0,0 +1,9 @@
package main

//math.Min uses float64, so let's not cast
func min(a, b int) int {
if a < b {
return a
}
return b
}
2 changes: 1 addition & 1 deletion version.go
@@ -2,7 +2,7 @@ package main

import "github.com/httpreserve/linkscanner"

- var version = "tikalinkextract-0.0.2"
+ var version = "tikalinkextract-0.0.3"

func getVersion() string {
return version + "\n" + linkscanner.GetVersion()
