Add JSON scrape support (#717)
* Add support for scene fragment scrape in xpath
WithoutPants authored Aug 10, 2020
1 parent 470a2b5 commit 7158e83
Showing 28 changed files with 5,005 additions and 14 deletions.
1 change: 1 addition & 0 deletions go.mod
@@ -26,6 +26,7 @@ require (
github.com/spf13/pflag v1.0.3
github.com/spf13/viper v1.4.0
github.com/stretchr/testify v1.5.1
github.com/tidwall/gjson v1.6.0
github.com/vektah/gqlparser v1.1.2
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4
golang.org/x/image v0.0.0-20190118043309-183bebdce1b2
6 changes: 6 additions & 0 deletions go.sum
@@ -609,7 +609,13 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV
github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4=
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07/go.mod h1:kDXzergiv9cbyO7IOYJZWg1U88JhDg3PB6klq9Hg2pA=
github.com/tidwall/gjson v1.6.0 h1:9VEQWz6LLMUsUl6PueE49ir4Ka6CzLymOAZDxpFsTDc=
github.com/tidwall/gjson v1.6.0/go.mod h1:P256ACg0Mn+j1RXIDXoss50DeIABTYK1PULOJHhxOls=
github.com/tidwall/match v1.0.1 h1:PnKP62LPNxHKTwvHHZZzdOAOCtsJTjo6dZLCwpKm5xc=
github.com/tidwall/match v1.0.1/go.mod h1:LujAq0jyVjBy028G1WhWfIzbpQfMO8bBZ6Tyb0+pL9E=
github.com/tidwall/pretty v0.0.0-20180105212114-65a9db5fad51/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
github.com/tidwall/pretty v1.0.0 h1:HsD+QiTn7sK6flMKIvNmpqz1qrpP3Ps6jOKIKMooyg4=
github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc=
github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
6 changes: 5 additions & 1 deletion pkg/scraper/action.go
@@ -8,17 +8,19 @@ const (
scraperActionScript scraperAction = "script"
scraperActionStash scraperAction = "stash"
scraperActionXPath scraperAction = "scrapeXPath"
scraperActionJson scraperAction = "scrapeJson"
)

var allScraperAction = []scraperAction{
scraperActionScript,
scraperActionStash,
scraperActionXPath,
scraperActionJson,
}

func (e scraperAction) IsValid() bool {
switch e {
case scraperActionScript, scraperActionStash, scraperActionXPath:
case scraperActionScript, scraperActionStash, scraperActionXPath, scraperActionJson:
return true
}
return false
@@ -47,6 +49,8 @@ func getScraper(scraper scraperTypeConfig, config config, globalConfig GlobalCon
return newStashScraper(scraper, config, globalConfig)
case scraperActionXPath:
return newXpathScraper(scraper, config, globalConfig)
case scraperActionJson:
return newJsonScraper(scraper, config, globalConfig)
}

panic("unknown scraper action: " + scraper.Action)
3 changes: 3 additions & 0 deletions pkg/scraper/config.go
@@ -44,6 +44,9 @@ type config struct {
// Xpath scraping configurations
XPathScrapers mappedScrapers `yaml:"xPathScrapers"`

// Json scraping configurations
JsonScrapers mappedScrapers `yaml:"jsonScrapers"`

// Scraping driver options
DriverOptions *scraperDriverOptions `yaml:"driver"`
}
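Taken together, the action.go and config.go changes mean a scraper YAML file can now declare a jsonScrapers block and reference it with the scrapeJson action, with gjson paths in place of xpath selectors. A minimal sketch of such a definition, assuming the surrounding performerByURL / action / url / scraper keys keep the same shape as the existing xPathScrapers configuration (those keys are not part of this diff):

```yaml
name: Example JSON scraper
performerByURL:
  - action: scrapeJson
    url:
      - example.com
    scraper: performerScraper
jsonScrapers:
  performerScraper:
    performer:
      Name: data.name
      Gender: data.extras.gender
```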
191 changes: 191 additions & 0 deletions pkg/scraper/json.go
@@ -0,0 +1,191 @@
package scraper

import (
"errors"
"io/ioutil"
"net/url"
"strings"

"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/models"
"github.com/tidwall/gjson"
)

type jsonScraper struct {
scraper scraperTypeConfig
config config
globalConfig GlobalConfig
}

func newJsonScraper(scraper scraperTypeConfig, config config, globalConfig GlobalConfig) *jsonScraper {
return &jsonScraper{
scraper: scraper,
config: config,
globalConfig: globalConfig,
}
}

func (s *jsonScraper) getJsonScraper() *mappedScraper {
return s.config.JsonScrapers[s.scraper.Scraper]
}

func (s *jsonScraper) scrapeURL(url string) (string, *mappedScraper, error) {
scraper := s.getJsonScraper()

if scraper == nil {
return "", nil, errors.New("json scraper with name " + s.scraper.Scraper + " not found in config")
}

doc, err := s.loadURL(url)

if err != nil {
return "", nil, err
}

return doc, scraper, nil
}

func (s *jsonScraper) loadURL(url string) (string, error) {
r, err := loadURL(url, s.config, s.globalConfig)
if err != nil {
return "", err
}

doc, err := ioutil.ReadAll(r)
if err != nil {
return "", err
}

docStr := string(doc)
if !gjson.Valid(docStr) {
return "", errors.New("not valid json")
}

if err == nil && s.config.DebugOptions != nil && s.config.DebugOptions.PrintHTML {
logger.Infof("loadURL (%s) response: \n%s", url, docStr)
}

return docStr, err
}

func (s *jsonScraper) scrapePerformerByURL(url string) (*models.ScrapedPerformer, error) {
doc, scraper, err := s.scrapeURL(url)
if err != nil {
return nil, err
}

q := s.getJsonQuery(doc)
return scraper.scrapePerformer(q)
}

func (s *jsonScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) {
doc, scraper, err := s.scrapeURL(url)
if err != nil {
return nil, err
}

q := s.getJsonQuery(doc)
return scraper.scrapeScene(q)
}

func (s *jsonScraper) scrapePerformersByName(name string) ([]*models.ScrapedPerformer, error) {
scraper := s.getJsonScraper()

if scraper == nil {
return nil, errors.New("json scraper with name " + s.scraper.Scraper + " not found in config")
}

const placeholder = "{}"

// replace the placeholder string with the URL-escaped name
escapedName := url.QueryEscape(name)

url := s.scraper.QueryURL
url = strings.Replace(url, placeholder, escapedName, -1)

doc, err := s.loadURL(url)

if err != nil {
return nil, err
}

q := s.getJsonQuery(doc)
return scraper.scrapePerformers(q)
}

func (s *jsonScraper) scrapePerformerByFragment(scrapedPerformer models.ScrapedPerformerInput) (*models.ScrapedPerformer, error) {
return nil, errors.New("scrapePerformerByFragment not supported for json scraper")
}

func (s *jsonScraper) scrapeSceneByFragment(scene models.SceneUpdateInput) (*models.ScrapedScene, error) {
storedScene, err := sceneFromUpdateFragment(scene)
if err != nil {
return nil, err
}

if storedScene == nil {
return nil, errors.New("no scene found")
}

// construct the URL
url := constructSceneURL(s.scraper.QueryURL, storedScene)

scraper := s.getJsonScraper()

if scraper == nil {
return nil, errors.New("json scraper with name " + s.scraper.Scraper + " not found in config")
}

doc, err := s.loadURL(url)

if err != nil {
return nil, err
}

q := s.getJsonQuery(doc)
return scraper.scrapeScene(q)
}

func (s *jsonScraper) getJsonQuery(doc string) *jsonQuery {
return &jsonQuery{
doc: doc,
scraper: s,
}
}

type jsonQuery struct {
doc string
scraper *jsonScraper
}

func (q *jsonQuery) runQuery(selector string) []string {
value := gjson.Get(q.doc, selector)

if !value.Exists() {
logger.Warnf("Could not find json path '%s' in json object", selector)
return nil
}

var ret []string
if value.IsArray() {
value.ForEach(func(k, v gjson.Result) bool {
ret = append(ret, v.String())
return true
})
} else {
ret = append(ret, value.String())
}

return ret
}

func (q *jsonQuery) subScrape(value string) mappedQuery {
doc, err := q.scraper.loadURL(value)

if err != nil {
logger.Warnf("Error getting URL '%s' for sub-scraper: %s", value, err.Error())
return nil
}

return q.scraper.getJsonQuery(doc)
}
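The heart of jsonQuery.runQuery is gjson path evaluation: a scalar result becomes a single string, while an array result is flattened into one string per element. A small standalone sketch of that behaviour, using a made-up JSON document and paths:

```go
package main

import (
	"fmt"

	"github.com/tidwall/gjson"
)

func main() {
	// Illustrative document; the paths below mirror what runQuery does.
	const doc = `{"data": {"name": "Example", "aliases": ["One", "Two"]}}`

	// A scalar path yields a single value.
	name := gjson.Get(doc, "data.name")
	fmt.Println(name.Exists(), name.String()) // true Example

	// An array path is flattened element by element, as in the
	// IsArray/ForEach branch of runQuery.
	var ret []string
	aliases := gjson.Get(doc, "data.aliases")
	if aliases.IsArray() {
		aliases.ForEach(func(k, v gjson.Result) bool {
			ret = append(ret, v.String())
			return true
		})
	}
	fmt.Println(ret) // [One Two]
}
```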
93 changes: 93 additions & 0 deletions pkg/scraper/json_test.go
@@ -0,0 +1,93 @@
package scraper

import (
"testing"

"gopkg.in/yaml.v2"
)

func TestJsonPerformerScraper(t *testing.T) {
const yamlStr = `name: Test
jsonScrapers:
performerScraper:
common:
$extras: data.extras
performer:
Name: data.name
Gender: $extras.gender
Birthdate: $extras.birthday
Ethnicity: $extras.ethnicity
Height: $extras.height
Measurements: $extras.measurements
Tattoos: $extras.tattoos
Piercings: $extras.piercings
Aliases: data.aliases
Image: data.image
`

const json = `
{
"data": {
"id": "2cd4146b-637d-49b1-8ff9-19d4a06947bb",
"name": "Mia Malkova",
"bio": "Some girls are so damn hot that they can get you bent out of shape, and you will not even be mad at them for doing so. Well, tawny blonde Mia Malkova can bend her body into any shape she pleases, and that’s sure to satisfy all of the horny cocks and wet pussies out there. This girl has acrobatic and contortionist abilities that could even twist a pretzel into a new knot, which can be very helpful in the ... arrow_drop_down Some girls are so damn hot that they can get you bent out of shape, and you will not even be mad at them for doing so. Well, tawny blonde Mia Malkova can bend her body into any shape she pleases, and that’s sure to satisfy all of the horny cocks and wet pussies out there. This girl has acrobatic and contortionist abilities that could even twist a pretzel into a new knot, which can be very helpful in the VR Porn movies – trust us. Ankles behind her neck and feet over her back so she can kiss her toes, turned, twisted and gyrating, she can fuck any which way she wants (and that ass!), will surely make you fall in love with this hot Virtual Reality Porn slut, as she is one of the finest of them all. Talking about perfection, maybe it’s all the acrobatic work that keeps it in such gorgeous shape? Who cares really, because you just want to take a big bite out of it and never let go. But it’s not all about the body. Mia’s also got a great smile, which might not sound kinky, but believe us, it is a smile that will heat up your innards and drop your pants. Is it her golden skin, her innocent pink lips or that heart-shaped face? There is just too much good stuff going on with Mia Malkova, which is maybe why these past few years have heaped awards upon awards on this Southern California native. Mia came to VR Bangers for her first VR Porn video, so you know she’s only going for top-notch scenes with top-game performers, men, and women. Better hit up that yoga studio if you ever dream of being able to bang a flexible and talented chick like lady Malkova. arrow_drop_up",
"extras": {
"gender": "Female",
"birthday": "1992-07-01",
"birthday_timestamp": 709948800,
"birthplace": "Palm Springs, California, United States",
"active": 1,
"astrology": "Cancer (Jun 21 - Jul 22)",
"ethnicity": "Caucasian",
"nationality": "United States",
"hair_colour": "Blonde",
"weight": "126 lbs (or 57 kg)",
"height": "5'6\" (or 167 cm)",
"measurements": "34-26-36",
"cupsize": "34C (75C)",
"tattoos": "None",
"piercings": "Navel",
"first_seen": null
},
"aliases": [
"Mia Bliss",
"Madison Clover",
"Madison Swan",
"Mia Mountain",
"Mia M.",
"Mia Malvoka",
"Mia Molkova",
"Mia Thomas"
],
"image": "https:\/\/thumb.metadataapi.net\/unsafe\/1000x1500\/smart\/filters:sharpen():upscale()\/https%3A%2F%2Fcdn.metadataapi.net%2Fperformer%2F49%2F05%2F30%2Fade2255dc065032a89ebb23f0e038fa%2Fposter%2Fmia-malkova.jpg%3Fid1582610531"
}
}
`

c := &config{}
err := yaml.Unmarshal([]byte(yamlStr), &c)

if err != nil {
t.Fatalf("Error loading yaml: %s", err.Error())
}

// perform scrape using json string
performerScraper := c.JsonScrapers["performerScraper"]

q := &jsonQuery{
doc: json,
}

scrapedPerformer, err := performerScraper.scrapePerformer(q)
if err != nil {
t.Fatalf("Error scraping performer: %s", err.Error())
}

verifyField(t, "Mia Malkova", scrapedPerformer.Name, "Name")
verifyField(t, "Female", scrapedPerformer.Gender, "Gender")
verifyField(t, "1992-07-01", scrapedPerformer.Birthdate, "Birthdate")
verifyField(t, "Caucasian", scrapedPerformer.Ethnicity, "Ethnicity")
verifyField(t, "5'6\" (or 167 cm)", scrapedPerformer.Height, "Height")
verifyField(t, "None", scrapedPerformer.Tattoos, "Tattoos")
verifyField(t, "Navel", scrapedPerformer.Piercings, "Piercings")
}
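A note on the test fixture: the common block and the $extras references are handled by the existing mapped-scraper machinery, which is not part of this diff. The test relies on that substitution turning a selector such as $extras.gender into the gjson path data.extras.gender before it reaches jsonQuery.runQuery. A rough sketch of that idea, where expandCommon is a hypothetical stand-in rather than the real implementation:

```go
package main

import (
	"fmt"
	"strings"
)

// expandCommon is a hypothetical stand-in for the mapped-scraper common-field
// substitution; the real logic lives outside this diff.
func expandCommon(selector string, common map[string]string) string {
	for k, v := range common {
		selector = strings.Replace(selector, k, v, -1)
	}
	return selector
}

func main() {
	common := map[string]string{"$extras": "data.extras"}
	fmt.Println(expandCommon("$extras.gender", common)) // data.extras.gender
}
```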
11 changes: 11 additions & 0 deletions pkg/scraper/stash.go
@@ -191,3 +191,14 @@ func (s *stashScraper) scrapePerformerByURL(url string) (*models.ScrapedPerforme
func (s *stashScraper) scrapeSceneByURL(url string) (*models.ScrapedScene, error) {
return nil, errors.New("scrapeSceneByURL not supported for stash scraper")
}

func sceneFromUpdateFragment(scene models.SceneUpdateInput) (*models.Scene, error) {
qb := models.NewSceneQueryBuilder()
id, err := strconv.Atoi(scene.ID)
if err != nil {
return nil, err
}

// TODO - should we modify it with the input?
return qb.Find(id)
}
16 changes: 16 additions & 0 deletions pkg/scraper/url.go
@@ -10,6 +10,7 @@ import (
"net/http"
"net/http/cookiejar"
"os"
"path/filepath"
"strings"
"time"

@@ -18,10 +19,25 @@ import (
"github.com/chromedp/chromedp"
jsoniter "github.com/json-iterator/go"
"github.com/stashapp/stash/pkg/logger"
"github.com/stashapp/stash/pkg/models"
"golang.org/x/net/html/charset"
"golang.org/x/net/publicsuffix"
)

// Timeout for the scrape http request. Includes transfer time. May want to make this
// configurable at some point.
const scrapeGetTimeout = time.Second * 30

func constructSceneURL(url string, scene *models.Scene) string {
// support checksum, oshash, filename and title
ret := strings.Replace(url, "{checksum}", scene.Checksum.String, -1)
ret = strings.Replace(ret, "{oshash}", scene.OSHash.String, -1)
ret = strings.Replace(ret, "{filename}", filepath.Base(scene.Path), -1)
ret = strings.Replace(ret, "{title}", scene.Title.String, -1)

return ret
}

func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) {
driverOptions := scraperConfig.DriverOptions
if driverOptions != nil && driverOptions.UseCDP {
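constructSceneURL is what backs the new scene-fragment scraping: the only placeholders it substitutes are {checksum}, {oshash}, {filename} and {title}. A sketch of a fragment-scrape entry using them; the sceneByFragment / queryURL / scraper keys are assumed to follow the existing scraper configuration conventions rather than being introduced in this file:

```yaml
sceneByFragment:
  action: scrapeJson
  queryURL: https://api.example.com/scenes?hash={oshash}&name={filename}
  scraper: sceneScraper
```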