Skip to content

Commit

Permalink
Integrate chroma customized matching into language detection
Browse files Browse the repository at this point in the history
  • Loading branch information
dron22 committed Nov 13, 2020
1 parent 83cd62c commit 9ee8495
Show file tree
Hide file tree
Showing 24 changed files with 401 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"dependencies": null,
"entity": "%s",
"is_write": true,
"language": null,
"language": "Go",
"lineno": 13,
"lines": 2,
"project": "wakatime-cli",
Expand All @@ -21,7 +21,7 @@
"dependencies": null,
"entity": "%s",
"is_write": true,
"language": null,
"language": "Go",
"lineno": 42,
"lines": 2,
"project": "wakatime-cli",
Expand All @@ -36,7 +36,7 @@
"dependencies": null,
"entity": "%s",
"is_write": null,
"language": null,
"language": "Go",
"lineno": null,
"lines": 2,
"project": "wakatime-cli",
Expand Down
2 changes: 1 addition & 1 deletion cmd/legacy/heartbeat/testdata/api_heartbeats_response.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"entity": "/tmp/main.go",
"id": "845a922e-9e65-4775-bd68-bb3196d2e06a",
"is_write": true,
"language": "golang",
"language": "Go",
"lineno": 42,
"lines": 100,
"machine_name_id": null,
Expand Down
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ go 1.15

require (
github.com/PuerkitoBio/goquery v1.6.0 // indirect
github.com/alecthomas/chroma v0.8.1
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6 // indirect
github.com/danwakefield/fnmatch v0.0.0-20160403171240-cbb64ac3d964
github.com/dlclark/regexp2 v1.4.0
github.com/matishsiao/goInfo v0.0.0-20200404012835-b5f882ee2288
github.com/mattn/go-sqlite3 v1.14.4
Expand Down
15 changes: 15 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAE
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/PuerkitoBio/goquery v1.6.0 h1:j7taAbelrdcsOlGeMenZxc2AWXD5fieT1/znArdnx94=
github.com/PuerkitoBio/goquery v1.6.0/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/alecthomas/assert v0.0.0-20170929043011-405dbfeb8e38/go.mod h1:r7bzyVFMNntcxPZXK3/+KdruV1H5KSlyVY0gc+NgInI=
github.com/alecthomas/chroma v0.8.1 h1:ym20sbvyC6RXz45u4qDglcgr8E313oPROshcuCHqiEE=
github.com/alecthomas/chroma v0.8.1/go.mod h1:sko8vR34/90zvl5QdcUdvzL3J8NKjAUx9va9jPuFNoM=
github.com/alecthomas/colour v0.0.0-20160524082231-60882d9e2721/go.mod h1:QO9JBoKquHd+jz9nshCh40fOfO+JzsoXy8qTHF68zU0=
github.com/alecthomas/kong v0.2.4/go.mod h1:kQOmtJgV+Lb4aj+I2LEn40cbtawdWJ9Y8QLq+lElKxE=
github.com/alecthomas/repr v0.0.0-20180818092828-117648cd9897/go.mod h1:xTS7Pm1pD1mvyM075QCDSRqH6qRLXylzS24ZTpRiSzQ=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
Expand All @@ -40,11 +46,14 @@ github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3Ee
github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA=
github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
github.com/danwakefield/fnmatch v0.0.0-20160403171240-cbb64ac3d964 h1:y5HC9v93H5EPKqaS1UYVg1uYah5Xf51mBfIoWehClUQ=
github.com/danwakefield/fnmatch v0.0.0-20160403171240-cbb64ac3d964/go.mod h1:Xd9hchkHSWYkEqJwUGisez3G1QY8Ryz0sdWrLPMGjLk=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no=
github.com/dlclark/regexp2 v1.2.0/go.mod h1:2pZnwuY/m+8K6iRw6wQdMtk+rH5tNGR1i55kozfMjCc=
github.com/dlclark/regexp2 v1.4.0 h1:F1rxgk7p4uKjwIQxBs9oAXe5CqrXlCduYEJvrF4u93E=
github.com/dlclark/regexp2 v1.4.0/go.mod h1:2pZnwuY/m+8K6iRw6wQdMtk+rH5tNGR1i55kozfMjCc=
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=
Expand Down Expand Up @@ -127,7 +136,9 @@ github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czP
github.com/matishsiao/goInfo v0.0.0-20200404012835-b5f882ee2288 h1:cdM7et8/VlNnSBpq3KbyQWsYLCY0WsB7tvV8Fr0DUNE=
github.com/matishsiao/goInfo v0.0.0-20200404012835-b5f882ee2288/go.mod h1:yLZrFIhv+Z20hxHvcZpEyKVQp9HMsOJkXAxx7yDqtvg=
github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU=
github.com/mattn/go-colorable v0.1.6/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc=
github.com/mattn/go-isatty v0.0.3/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4=
github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU=
github.com/mattn/go-sqlite3 v1.14.0 h1:mLyGNKR8+Vv9CAU7PphKa2hkEqxxhn8i32J6FPj1/QA=
github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus=
github.com/mattn/go-sqlite3 v1.14.4 h1:4rQjbDxdu9fSgI/r3KN72G3c2goxknAqHHgPWWs8UlI=
Expand All @@ -153,6 +164,7 @@ github.com/pelletier/go-toml v1.2.0 h1:T5zMGML61Wp+FlcbWjRDT7yAxhJNAiPPLOFECq181
github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic=
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI=
Expand All @@ -170,6 +182,7 @@ github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFR
github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts=
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc=
github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
github.com/slongfield/pyfmt v0.0.0-20180124071345-020a7cb18bca h1:fO9hIZRL+kteo13eh51GqkUdZf/NpMmZsi8ob6b1eOg=
Expand Down Expand Up @@ -291,6 +304,8 @@ golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0 h1:HyfiK1WMnHj5FXFXatD+Qs1A/xC2Run6RzeW1SyHxpc=
golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd h1:xhmwyvizuTgC2qz7ZlMluP20uW+C3Rm0FD/WLDX8884=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200413165638-669c56c373c4 h1:opSr2sbRXk5X5/givKrrKj9HXxFpW2sdCiP8MJSKLQY=
Expand Down
270 changes: 270 additions & 0 deletions pkg/language/chroma.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
package language

import (
"fmt"
"io"
"os"
fp "path/filepath"
"sort"
"strings"

"github.com/wakatime/wakatime-cli/pkg/heartbeat"

"github.com/alecthomas/chroma"
"github.com/alecthomas/chroma/lexers"
_ "github.com/alecthomas/chroma/lexers/a" // not used directly
_ "github.com/alecthomas/chroma/lexers/b" // not used directly
_ "github.com/alecthomas/chroma/lexers/c" // not used directly
_ "github.com/alecthomas/chroma/lexers/circular" // not used directly
_ "github.com/alecthomas/chroma/lexers/d" // not used directly
_ "github.com/alecthomas/chroma/lexers/e" // not used directly
_ "github.com/alecthomas/chroma/lexers/f" // not used directly
_ "github.com/alecthomas/chroma/lexers/g" // not used directly
_ "github.com/alecthomas/chroma/lexers/h" // not used directly
_ "github.com/alecthomas/chroma/lexers/i" // not used directly
_ "github.com/alecthomas/chroma/lexers/j" // not used directly
_ "github.com/alecthomas/chroma/lexers/k" // not used directly
_ "github.com/alecthomas/chroma/lexers/l" // not used directly
_ "github.com/alecthomas/chroma/lexers/m" // not used directly
_ "github.com/alecthomas/chroma/lexers/n" // not used directly
_ "github.com/alecthomas/chroma/lexers/o" // not used directly
_ "github.com/alecthomas/chroma/lexers/p" // not used directly
_ "github.com/alecthomas/chroma/lexers/q" // not used directly
_ "github.com/alecthomas/chroma/lexers/r" // not used directly
_ "github.com/alecthomas/chroma/lexers/s" // not used directly
_ "github.com/alecthomas/chroma/lexers/t" // not used directly
_ "github.com/alecthomas/chroma/lexers/v" // not used directly
_ "github.com/alecthomas/chroma/lexers/w" // not used directly
_ "github.com/alecthomas/chroma/lexers/x" // not used directly
_ "github.com/alecthomas/chroma/lexers/y" // not used directly
_ "github.com/alecthomas/chroma/lexers/z" // not used directly
"github.com/danwakefield/fnmatch"
jww "github.com/spf13/jwalterweatherman"
)

// chromaMatchCustomized detects the language of the file at filepath by
// matching its file name against the filename globs of all registered chroma
// lexers. When several lexers match, the winner is chosen by
// selectByCustomizedPriority.
//
// Matching is done in two passes: primary filename globs (Config.Filenames)
// are tried first, and alias filename globs (Config.AliasFilenames) are only
// consulted when no primary glob matched.
//
// It returns the detected language and true on success. It returns
// heartbeat.LanguageUnknown and false when no lexer matches, or when the name
// of the selected lexer cannot be parsed into a heartbeat.Language.
//
// This is a modified implementation of chroma.lexers.internal.api:Match().
func chromaMatchCustomized(filepath string) (heartbeat.Language, bool) {
	_, file := fp.Split(filepath)
	filename := fp.Base(file)

	// Pass 1 uses the primary filename globs, pass 2 the alias globs.
	for _, useAliases := range []bool{false, true} {
		matched := chroma.PrioritisedLexers{}

		for _, lexer := range lexers.Registry.Lexers {
			config := lexer.Config()

			globs := config.Filenames
			if useAliases {
				globs = config.AliasFilenames
			}

			for _, glob := range globs {
				if fnmatch.Match(glob, filename, 0) {
					matched = append(matched, lexer)
					// One matching glob is enough. Adding the same lexer once
					// per matching glob would only make it be ranked (and
					// analysed) several times without changing the winner.
					break
				}
			}
		}

		if len(matched) == 0 {
			continue
		}

		bestLexer := selectByCustomizedPriority(filepath, matched)

		language, ok := heartbeat.ParseLanguageFromChroma(bestLexer.Config().Name)
		if !ok {
			jww.WARN.Printf("failed to parse language from chroma lexer name %q", bestLexer.Config().Name)
			// A match that cannot be parsed ends detection; the alias pass is
			// intentionally not used as a fallback here.
			return heartbeat.LanguageUnknown, false
		}

		return language, true
	}

	return heartbeat.LanguageUnknown, false
}

// weightedLexer is a lexer with priority and weight. It is the element type
// that selectByCustomizedPriority sorts to pick the best candidate: entries
// are ordered by Weight first, then by Priority, then by lexer name.
type weightedLexer struct {
	// Lexer is the embedded candidate lexer.
	chroma.Lexer
	// Weight is the score returned by the lexer's text analysis (zero when the
	// lexer is not a chroma.Analyser), adjusted by matlabWeight or
	// objectiveCWeight for those two lexers.
	Weight float32
	// Priority is the value returned by priority() for the lexer's name when
	// it reports one, otherwise the priority from the lexer's own config.
	Priority float32
}

// selectByCustomizedPriority selects the best matching lexer by customized
// priority evaluation.
//
// filepath is the file whose language is being detected. lexers holds the
// candidate lexers and must not be empty; it is sorted in place.
//
// Candidates are first ordered by config priority (descending) and then by
// lower-cased name (descending). The head of that order is the fallback that
// is returned when either the sibling file extensions of filepath's folder or
// the head of the file cannot be loaded.
//
// Otherwise every candidate gets a weight from its text analyser (zero for
// lexers that do not implement chroma.Analyser) and a priority:
//   - when priority() reports a value for the lexer name, that value replaces
//     the config priority and the weight is left untouched;
//   - otherwise "Matlab" and "Objective-C" get their weight adjusted by the
//     file extensions found in the same folder;
//   - all other lexers keep config priority and analysed weight.
//
// The candidate with the highest weight wins. Ties are broken by priority and
// then by name, both descending.
func selectByCustomizedPriority(filepath string, lexers chroma.PrioritisedLexers) chroma.Lexer {
	sort.Slice(lexers, func(i, j int) bool {
		icfg, jcfg := lexers[i].Config(), lexers[j].Config()

		// 1. by priority
		if icfg.Priority != jcfg.Priority {
			return icfg.Priority > jcfg.Priority
		}

		// 2. by name
		return strings.ToLower(icfg.Name) > strings.ToLower(jcfg.Name)
	})

	dir, _ := fp.Split(filepath)

	extensions, err := loadFolderExtensions(dir)
	if err != nil {
		jww.WARN.Printf("failed to load folder extensions: %s", err)
		return lexers[0]
	}

	head, err := fileHead(filepath)
	if err != nil {
		jww.WARN.Printf("failed to load head from file %q: %s", filepath, err)
		return lexers[0]
	}

	// Convert once: the conversion copies the whole buffer, so doing it per
	// candidate lexer would repeat that copy for every analyser.
	text := string(head)

	weighted := make([]weightedLexer, 0, len(lexers))

	for _, lexer := range lexers {
		var weight float32

		if analyser, ok := lexer.(chroma.Analyser); ok {
			weight = analyser.AnalyseText(text)
		}

		cfg := lexer.Config()

		candidate := weightedLexer{
			Lexer:    lexer,
			Priority: cfg.Priority,
			Weight:   weight,
		}

		if p, ok := priority(cfg.Name); ok {
			// A customized priority takes precedence over the folder based
			// weight adjustments below.
			candidate.Priority = p
		} else {
			switch cfg.Name {
			case "Matlab":
				candidate.Weight = matlabWeight(weight, extensions)
			case "Objective-C":
				candidate.Weight = objectiveCWeight(weight, extensions)
			}
		}

		weighted = append(weighted, candidate)
	}

	sort.Slice(weighted, func(i, j int) bool {
		// 1. by weight
		if weighted[i].Weight != weighted[j].Weight {
			return weighted[i].Weight > weighted[j].Weight
		}

		// 2. by priority
		if weighted[i].Priority != weighted[j].Priority {
			return weighted[i].Priority > weighted[j].Priority
		}

		// 3. name
		return weighted[i].Lexer.Config().Name > weighted[j].Lexer.Config().Name
	})

	return weighted[0].Lexer
}

// fileHead returns at most the first 512000 bytes of the file's content.
//
// The returned slice is exactly as long as the number of bytes read, so files
// shorter than the limit are returned without trailing zero padding. An error
// is returned when the file cannot be opened or read; reaching the end of the
// file early is not an error.
func fileHead(filepath string) ([]byte, error) {
	const maxBytes = 512000

	f, err := os.Open(filepath)
	if err != nil {
		return nil, fmt.Errorf("failed to open file: %w", err)
	}

	defer f.Close()

	data := make([]byte, maxBytes)

	// ReadAt reports io.EOF whenever it reads fewer bytes than len(data),
	// which is the normal case for files smaller than the limit.
	n, err := f.ReadAt(data, 0)
	if err != nil && err != io.EOF {
		return nil, fmt.Errorf("failed to read bytes from file: %w", err)
	}

	// Only the first n bytes hold file content; the rest of the buffer is
	// untouched zero bytes and must not be handed to callers.
	return data[:n], nil
}

// objectiveCWeight adjusts the analysed weight of the Objective-C lexer using
// the file extensions found in the same folder as the inspected file.
//
// The weight is lowered by 0.01 when a ".mat" file is present and raised by
// 0.01 when none is. It is raised by a further 0.01 when a ".h" file is
// present.
func objectiveCWeight(weight float32, extensions []string) float32 {
	sawMat, sawHeader := false, false

	// A single pass records which of the two relevant extensions occur.
	for _, ext := range extensions {
		switch ext {
		case ".mat":
			sawMat = true
		case ".h":
			sawHeader = true
		}
	}

	// The ".mat" adjustment is applied before the ".h" bonus.
	if !sawMat {
		weight += 0.01
	} else {
		weight -= 0.01
	}

	if sawHeader {
		weight += 0.01
	}

	return weight
}

// matlabWeight adjusts the analysed weight of the Matlab lexer using the file
// extensions found in the same folder as the inspected file.
//
// The weight is raised by 0.01 when a ".mat" file is present, and by a
// further 0.01 when no ".h" file is present.
func matlabWeight(weight float32, extensions []string) float32 {
	sawMat, sawHeader := false, false

	// A single pass records which of the two relevant extensions occur.
	for _, ext := range extensions {
		switch ext {
		case ".mat":
			sawMat = true
		case ".h":
			sawHeader = true
		}
	}

	// The ".mat" bonus is applied before the missing-header bonus.
	if sawMat {
		weight += 0.01
	}

	if !sawHeader {
		weight += 0.01
	}

	return weight
}
4 changes: 4 additions & 0 deletions pkg/language/language.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ func Detect(fp string) (heartbeat.Language, error) {
return language, nil
}

if language, ok := chromaMatchCustomized(fp); ok {
return language, nil
}

return heartbeat.LanguageUnknown, fmt.Errorf("could not detect the language of file %q", fp)
}

Expand Down
Loading

0 comments on commit 9ee8495

Please sign in to comment.