Skip to content

Commit

Permalink
Merge pull request #37 from vmarkovtsev/master
Browse files Browse the repository at this point in the history
Scan for license files in README
  • Loading branch information
vmarkovtsev authored May 10, 2018
2 parents 06aabc2 + 4979d1f commit da552ec
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 13 deletions.
36 changes: 28 additions & 8 deletions licensedb/internal/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,32 @@ import (
"archive/tar"
"bytes"
"encoding/csv"
"fmt"
"index/suffixarray"
"io"
"log"
"os"
paths "path"
"regexp"
"sort"
"strings"

"github.com/ekzhu/minhash-lsh"
"github.com/sergi/go-diff/diffmatchpatch"

"gopkg.in/src-d/go-license-detector.v2/licensedb/filer"
"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/assets"
"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/fastlog"
"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/normalize"
"gopkg.in/src-d/go-license-detector.v2/licensedb/internal/wmh"
)

var (
// licenseReadmeMentionRe finds path-like mentions of license files in README text.
// It matches, case-insensitively: at least one non-whitespace character, a "/",
// then (with no further "/" or whitespace in between) one of the licenseFileNames
// alternatives, followed by any trailing non-whitespace characters.
// The licenseFileNames entries are joined with "|" and inserted into the pattern
// without escaping, so they are interpreted as regular expression syntax.
licenseReadmeMentionRe = regexp.MustCompile(
fmt.Sprintf("(?i)[^\\s]+/[^/\\s]*(%s)[^\\s]*",
strings.Join(licenseFileNames, "|")))
)

// database holds the license texts, their hashes and the hashtables to query for nearest
// neighbors.
type database struct {
Expand Down Expand Up @@ -411,18 +420,29 @@ func (db *database) scanForURLs(text string) map[string]bool {
}

// QueryReadmeText tries to detect licenses mentioned in the README.
func (db *database) QueryReadmeText(text string) map[string]float32 {
candidates1 := investigateReadmeFile(text, db.nameSubstrings, db.nameSubstringSizes)
candidates2 := investigateReadmeFile(text, db.nameShortSubstrings, db.nameShortSubstringSizes)
func (db *database) QueryReadmeText(text string, fs filer.Filer) map[string]float32 {
candidates := map[string]float32{}
for key, val := range candidates1 {
candidates[key] = val
append := func(others map[string]float32) {
for key, val := range others {
if candidates[key] < val {
candidates[key] = val
}
}
}
for key, val := range candidates2 {
if candidates[key] < val {
candidates[key] = val
for _, match := range licenseReadmeMentionRe.FindAllString(text, -1) {
match = strings.TrimRight(match, ".,:;-")
content, err := fs.ReadFile(match)
if err == nil {
if preprocessor, exists := filePreprocessors[paths.Ext(match)]; exists {
content = preprocessor(content)
}
append(db.QueryLicenseText(string(content)))
}
}
if len(candidates) == 0 {
append(investigateReadmeFile(text, db.nameSubstrings, db.nameSubstringSizes))
append(investigateReadmeFile(text, db.nameShortSubstrings, db.nameShortSubstringSizes))
}
if db.debug {
for key, val := range candidates {
println("NLP", key, val)
Expand Down
8 changes: 4 additions & 4 deletions licensedb/internal/investigation.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,10 @@ func ExtractReadmeFiles(files []string, fs filer.Filer) [][]byte {

// InvestigateReadmeTexts scans README files for licensing information and outputs the
// probable names using NER.
func InvestigateReadmeTexts(texts [][]byte) map[string]float32 {
func InvestigateReadmeTexts(texts [][]byte, fs filer.Filer) map[string]float32 {
maxLicenses := map[string]float32{}
for _, text := range texts {
candidates := InvestigateReadmeText(text)
candidates := InvestigateReadmeText(text, fs)
for name, sim := range candidates {
maxSim := maxLicenses[name]
if sim > maxSim {
Expand All @@ -139,8 +139,8 @@ func InvestigateReadmeTexts(texts [][]byte) map[string]float32 {

// InvestigateReadmeText scans the README file for licensing information and outputs probable
// names found with Named Entity Recognition from NLP.
// It converts text to a string and delegates to QueryReadmeText on the package-level
// globalLicenseDatabase; fs is passed through unchanged to that call.
func InvestigateReadmeText(text []byte) map[string]float32 {
return globalLicenseDatabase.QueryReadmeText(string(text))
func InvestigateReadmeText(text []byte, fs filer.Filer) map[string]float32 {
return globalLicenseDatabase.QueryReadmeText(string(text), fs)
}

// IsLicenseDirectory indicates whether the directory is likely to contain licenses.
Expand Down
2 changes: 1 addition & 1 deletion licensedb/licensedb.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ func Detect(fs filer.Filer) (map[string]float32, error) {
if len(candidates) == 0 {
return nil, ErrNoLicenseFound
}
licenses = internal.InvestigateReadmeTexts(candidates)
licenses = internal.InvestigateReadmeTexts(candidates, fs)
if len(licenses) == 0 {
return nil, ErrNoLicenseFound
}
Expand Down

0 comments on commit da552ec

Please sign in to comment.