Replace regexp with the faster re2
yhy0 committed Mar 19, 2024
1 parent 045e826 commit 25b7a9d
Showing 50 changed files with 452 additions and 441 deletions.
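The change is the same mechanical swap in every file: the standard-library "regexp" import is removed and github.com/wasilibs/go-re2 is imported under the alias regexp, so existing call sites such as regexp.MustCompile and FindAllString compile unchanged while matching runs on RE2. Below is a minimal, self-contained sketch of the pattern; the pattern literal and sample input are illustrative stand-ins, not the crawler's real config.SuspectURLRegex or response bodies.

package main

import (
	"fmt"

	// Alias the re2-backed package to "regexp": call sites keep the stdlib API.
	regexp "github.com/wasilibs/go-re2"
)

func main() {
	// Hypothetical, simplified pattern; the crawler uses config.SuspectURLRegex.
	urlRegex := regexp.MustCompile(`https?://[^\s"'<>]+`)
	body := `<script src="https://example.com/app.js"></script>`
	// Prints: [https://example.com/app.js]
	fmt.Println(urlRegex.FindAllString(body, -1))
}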
4 changes: 2 additions & 2 deletions crawler/crawlergo/engine/collect_links.go
@@ -3,10 +3,10 @@ package engine
import (
"context"
"fmt"
regexp "github.com/wasilibs/go-re2"
"github.com/yhy0/Jie/crawler/crawlergo/config"
"regexp"
"time"

"github.com/chromedp/cdproto/cdp"
"github.com/chromedp/chromedp"
)
Expand Down
30 changes: 15 additions & 15 deletions crawler/crawlergo/engine/intercept_request.go
@@ -4,6 +4,7 @@ import (
"bufio"
"context"
"encoding/base64"
regexp "github.com/wasilibs/go-re2"
"github.com/yhy0/Jie/crawler/crawlergo/config"
"github.com/yhy0/Jie/crawler/crawlergo/model"
"github.com/yhy0/Jie/crawler/crawlergo/tools"
@@ -12,11 +13,10 @@ import (
"github.com/yhy0/logging"
"io"
"net/textproto"
"regexp"
"strconv"
"strings"
"time"

"github.com/chromedp/cdproto/fetch"
"github.com/chromedp/cdproto/network"
)
@@ -38,16 +38,16 @@ func (tab *Tab) InterceptRequest(v *fetch.EventRequestPaused) {
PostData: _req.PostData,
}
req := model.GetRequest(_req.Method, url, _option)

if IsIgnoredByKeywordMatch(req, tab.config.IgnoreKeywords) {
_ = fetch.FailRequest(v.RequestID, network.ErrorReasonBlockedByClient).Do(ctx)
req.Source = config.FromXHR
tab.AddResultRequest(req)
return
}

tab.HandleHostBinding(&req)

// // Block all static resources
// // https://github.com/Qianlitp/crawlergo/issues/106
// if config.StaticSuffixSet.Contains(url.FileExt()) {
@@ -56,7 +56,7 @@ func (tab *Tab) InterceptRequest(v *fetch.EventRequestPaused) {
// tab.AddResultRequest(req)
// return
// }

// Handle navigation requests
if tab.IsNavigatorRequest(v.NetworkID.String()) {
tab.NavNetworkID = v.NetworkID.String()
@@ -65,7 +65,7 @@ func (tab *Tab) InterceptRequest(v *fetch.EventRequestPaused) {
tab.AddResultRequest(req)
return
}

req.Source = config.FromXHR
tab.AddResultRequest(req)
_ = fetch.ContinueRequest(v.RequestID).Do(ctx)
@@ -97,7 +97,7 @@ func (tab *Tab) HandleNavigationReq(req *model.Request, v *fetch.EventRequestPau
tCtx, cancel := context.WithTimeout(ctx, time.Second*5)
defer cancel()
overrideReq := fetch.ContinueRequest(v.RequestID).WithURL(req.URL.String())

// Handle backend redirect requests
if tab.FoundRedirection && tab.IsTopFrame(v.FrameID.String()) {
// logging.Logger.Debug("redirect navigation req: " + req.URL.String())
@@ -159,7 +159,7 @@ func (tab *Tab) HandleHostBinding(req *model.Request) {
urlObj, _ := model.GetUrl(strings.Replace(req.URL.String(), "://"+url.Hostname(), "://"+navUrl.Hostname(), -1), *navUrl)
req.URL = urlObj
req.Headers["Host"] = host

} else if navUrl.Hostname() != host && url.Host == navUrl.Host {
req.Headers["Host"] = host
}
@@ -184,18 +184,18 @@ func (tab *Tab) IsTopFrame(FrameID string) bool {
func (tab *Tab) ParseResponseURL(target string, v *network.EventResponseReceived) {
defer tab.WG.Done()
ctx := tab.GetExecutor()

res, err := network.GetResponseBody(v.RequestID).Do(ctx)
if err != nil {
// logging.Logger.Debug("ParseResponseURL ", err, target)
return
}

resStr := string(res)

// The response body is available here, so run sensitive-information detection on it
go sensitive.Detection(target, "", resStr)

urlRegex := regexp.MustCompile(config.SuspectURLRegex)
urlList := urlRegex.FindAllString(resStr, -1)
for _, url := range urlList {
@@ -204,7 +204,7 @@ func (tab *Tab) ParseResponseURL(target string, v *network.EventResponseReceived
if strings.HasPrefix(url_lower, "image/x-icon") || strings.HasPrefix(url_lower, "text/css") || strings.HasPrefix(url_lower, "text/javascript") {
continue
}

tab.AddResultUrl(config.GET, url, config.FromJSFile)
}
}
@@ -264,7 +264,7 @@ func MergeHeaders(navHeaders map[string]interface{}, headers map[string]interfac
mergedHeaders = append(mergedHeaders, &header)
}
}

for key, value := range headers {
var header fetch.HeaderEntry
header.Name = key
50 changes: 25 additions & 25 deletions crawler/crawlergo/engine/tab.go
@@ -5,17 +5,17 @@ import (
"encoding/json"
"errors"
"fmt"
regexp "github.com/wasilibs/go-re2"
cconfig "github.com/yhy0/Jie/crawler/crawlergo/config"
"github.com/yhy0/Jie/crawler/crawlergo/js"
"github.com/yhy0/Jie/crawler/crawlergo/model"
"github.com/yhy0/Jie/crawler/crawlergo/xss"
"github.com/yhy0/Jie/pkg/output"
"github.com/yhy0/logging"
"regexp"
"strings"
"sync"
"time"

"github.com/chromedp/cdproto/cdp"
"github.com/chromedp/cdproto/dom"
"github.com/chromedp/cdproto/fetch"
@@ -40,9 +40,9 @@ type Tab struct {
FoundRedirection bool
DocBodyNodeId cdp.NodeID
config TabConfig

lock sync.Mutex

WG sync.WaitGroup // wait/sync counter for the current tab page
collectLinkWG sync.WaitGroup
loadedWG sync.WaitGroup // wait counter after Loaded
@@ -85,7 +85,7 @@ func NewTab(browser *Browser, navigateReq model.Request, config TabConfig) *Tab
tab.NavigateReq = navigateReq
tab.config = config
tab.DocBodyNodeId = 0

// Set up the request-interception listener
chromedp.ListenTarget(tab.Ctx, func(e interface{}) {
switch v := e.(type) {
@@ -95,20 +95,20 @@ func NewTab(browser *Browser, navigateReq model.Request, config TabConfig) *Tab
tab.LoaderID = string(v.LoaderID)
tab.TopFrameId = string(v.FrameID)
}

// Pause when a request is issued, i.e. request interception
case *fetch.EventRequestPaused:
if v.ResponseStatusCode == 0 {
_ = fetch.ContinueRequest(v.RequestID).Do(tab.Ctx)
return
}

url, err := model.GetUrl(v.Request.URL, *tab.NavigateReq.URL)
if err != nil {
_ = fetch.ContinueRequest(v.RequestID).Do(tab.Ctx)
return
}

// https://xz.aliyun.com/t/7064#toc-14
// Return a fake image
if v.ResourceType == network.ResourceTypeImage || url.FileExt() == "ico" {
@@ -126,10 +126,10 @@ func NewTab(browser *Browser, navigateReq model.Request, config TabConfig) *Tab
_ = fetch.FailRequest(v.RequestID, network.ErrorReasonBlockedByClient).Do(tab.Ctx)
return
}

tab.WG.Add(1)
go tab.InterceptRequest(v)

// tab.WG.Add(1)
// go func() { // convert javascriptQ
// defer tab.WG.Done()
@@ -138,7 +138,7 @@ func NewTab(browser *Browser, navigateReq model.Request, config TabConfig) *Tab
// logging.Logger.Errorf("[dom-based] hook %s error: %s\n", v.Request.URL, err)
// }
// }()

// Parse URLs from all JS files and add them to the results
// Parse URLs in the HTML document
// Detect the current page's charset
@@ -163,7 +163,7 @@ func NewTab(browser *Browser, navigateReq model.Request, config TabConfig) *Tab
case *fetch.EventAuthRequired:
tab.WG.Add(1)
go tab.HandleAuthRequired(v)

// DOMContentLoaded
// Start form filling and run the DOM-node observer function
// Run only once
@@ -182,12 +182,12 @@ func NewTab(browser *Browser, navigateReq model.Request, config TabConfig) *Tab
DOMContentLoadedRun = true
tab.WG.Add(1)
go tab.AfterDOMRun()

// close Dialog
case *page.EventJavascriptDialogOpening:
tab.WG.Add(1)
go tab.dismissDialog()

// handle exposed functions: bind event listeners
case *runtime.EventBindingCalled:
switch v.Name {
@@ -212,12 +212,12 @@ func NewTab(browser *Browser, navigateReq model.Request, config TabConfig) *Tab
}
}
}

tab.WG.Add(1)
go tab.HandleBindingCalled(v)
}
})

return &tab
}

@@ -249,7 +249,7 @@ func (tab *Tab) Start() {
// Callbacks used by the XSS scan
runtime.AddBinding("addLink"),
runtime.AddBinding("Test"),

runtime.AddBinding(xss.EventPushVul),
chromedp.ActionFunc(func(ctx context.Context) error {
_, err := page.AddScriptToEvaluateOnNewDocument(xss.PreloadJS).Do(ctx)
@@ -277,34 +277,34 @@ func (tab *Tab) Start() {
}
logging.Logger.Warn("navigate timeout ", tab.NavigateReq.URL.String())
}

waitDone := func() <-chan struct{} {
tab.WG.Wait()
ch := make(chan struct{})
defer close(ch)
return ch
}

select {
case <-waitDone():
// logging.Logger.Debug("all navigation tasks done.")
case <-time.After(tab.config.DomContentLoadedTimeout + time.Second*10):
// logging.Logger.Warn("navigation tasks TIMEOUT.")
}

// Wait until all links are collected
// logging.Logger.Debug("collectLinks start.")
tab.collectLinkWG.Add(3)
go tab.collectLinks()
tab.collectLinkWG.Wait()
// logging.Logger.Debug("collectLinks end.")

// Detect the page charset and encode all URLs accordingly
if tab.config.EncodeURLWithCharset {
tab.DetectCharset()
tab.EncodeAllURLWithCharset()
}

// fmt.Println(tab.NavigateReq.URL.String(), len(tab.ResultList))
// for _, v := range tab.ResultList {
// v.SimplePrint()
@@ -332,7 +332,7 @@ func (tab *Tab) AddResultUrl(method string, _url string, source string) {
PostData: "",
}
referer := navUrl.String()

// Handle Host binding
if host, ok := tab.NavigateReq.Headers["Host"]; ok {
if host != navUrl.Hostname() && url.Hostname() == host {
@@ -345,15 +345,15 @@ func (tab *Tab) AddResultUrl(method string, _url string, source string) {
if cookie, ok := tab.NavigateReq.Headers["Cookie"]; ok {
option.Headers["Cookie"] = cookie
}

// Fix up the Referer
option.Headers["Referer"] = referer
for key, value := range tab.ExtraHeaders {
option.Headers[key] = value
}
req := model.GetRequest(method, url, option)
req.Source = source

tab.lock.Lock()
tab.ResultList = append(tab.ResultList, &req)
tab.lock.Unlock()
2 changes: 1 addition & 1 deletion crawler/crawlergo/filter/smart_filter.go
@@ -1,12 +1,12 @@
package filter

import (
regexp "github.com/wasilibs/go-re2"
"github.com/yhy0/Jie/crawler/crawlergo/config"
"github.com/yhy0/Jie/crawler/crawlergo/model"
"github.com/yhy0/Jie/crawler/crawlergo/tools"
"github.com/yhy0/logging"
"go/types"
"regexp"
"sort"
"strings"
"sync"
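The commit message's claim that re2 is faster can be checked with a standard Go benchmark. The sketch below assumes go-re2 keeps the stdlib-compatible API shown in the diffs above; the pattern and input are illustrative stand-ins rather than the crawler's real config.SuspectURLRegex and captured response bodies.

package engine_test

import (
	stdregexp "regexp"
	"strings"
	"testing"

	re2 "github.com/wasilibs/go-re2"
)

// Hypothetical stand-ins for config.SuspectURLRegex and a response body.
const suspectURL = `https?://[^\s"'<>]+`

var body = strings.Repeat(`<a href="https://example.com/app.js">x</a> filler text `, 2000)

func BenchmarkStdlibRegexp(b *testing.B) {
	re := stdregexp.MustCompile(suspectURL)
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = re.FindAllString(body, -1)
	}
}

func BenchmarkGoRE2(b *testing.B) {
	re := re2.MustCompile(suspectURL)
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = re.FindAllString(body, -1)
	}
}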