Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Further on to Himcc master #15

Merged
merged 3 commits into from
Jan 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,11 @@ Options:
-t, --text Text output for none-block selection mode
-R, --Raw Raw text output, no trimming of leading and trailing white space
-p, --piece sub CSS selectors within -css to split that block up into pieces
format: PieceName=[RAW:]selector_string
RAW: will return the selected as-is; else the text will be returned
format: PieceName=[OutputStyle:]selector_string
OutputStyle:
RAW : will return the selected as-is
attr[xxx] : will return the value of an attribute named xxx
else the text will be returned
-d, --delimiter delimiter for pieces csv output [= ]
-w, --wrap-html wrap up the output with html tags
-y, --style style component within the wrapped html head
Expand Down
4 changes: 2 additions & 2 deletions cascadia_cli.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ Options:
Usage: Raw text output, no trimming of leading and trailing white space

- Name: Piece
Type: MapStringString
Type: OutputStyleMap
Flag: 'p,piece'
Usage: 'sub CSS selectors within -css to split that block up into pieces\n\t\t\tformat: PieceName=[RAW:]selector_string\n\t\t\tRAW: will return the selected as-is; else the text will be returned'
Usage: 'sub CSS selectors within -css to split that block up into pieces\n\t\t\tformat: PieceName=[OutputStyle:]selector_string\n\t\t\tOutputStyle:\n\t\t\t\tRAW : will return the selected as-is\n\t\t\t\tattr[xxx] : will return the value of an attribute named xxx \n\t\t\telse the text will be returned'

- Name: Deli
Type: string
Expand Down
30 changes: 15 additions & 15 deletions cascadia_cliDef.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
////////////////////////////////////////////////////////////////////////////
// Program: cascadiaC
// Purpose: cascadia wrapper
// Authors: Tong Sun (c) 2021, All rights reserved
// Authors: Tong Sun (c) 2023, All rights reserved
////////////////////////////////////////////////////////////////////////////

package main
Expand All @@ -23,23 +23,23 @@ import (

type rootT struct {
cli.Helper
Filei *clix.Reader `cli:"*i,in" usage:"The html/xml file to read from (or stdin)"`
Fileo *clix.Writer `cli:"*o,out" usage:"The output file (or stdout)"`
CSS []string `cli:"*c,css" usage:"CSS selectors (can provide more if not using --piece)"`
TextOut bool `cli:"t,text" usage:"Text output for none-block selection mode"`
TextRaw bool `cli:"R,Raw" usage:"Raw text output, no trimming of leading and trailing white space"`
Piece MapStringString `cli:"p,piece" usage:"sub CSS selectors within -css to split that block up into pieces\n\t\t\tformat: PieceName=[RAW:]selector_string\n\t\t\tRAW: will return the selected as-is; else the text will be returned"`
Deli string `cli:"d,delimiter" usage:"delimiter for pieces csv output" dft:"\t"`
WrapHTML bool `cli:"w,wrap-html" usage:"wrap up the output with html tags"`
Style string `cli:"y,style" usage:"style component within the wrapped html head"`
Base string `cli:"b,base" usage:"base href tag used in the wrapped up html"`
Quiet bool `cli:"q,quiet" usage:"be quiet"`
Filei *clix.Reader `cli:"*i,in" usage:"The html/xml file to read from (or stdin)"`
Fileo *clix.Writer `cli:"*o,out" usage:"The output file (or stdout)"`
CSS []string `cli:"*c,css" usage:"CSS selectors (can provide more if not using --piece)"`
TextOut bool `cli:"t,text" usage:"Text output for none-block selection mode"`
TextRaw bool `cli:"R,Raw" usage:"Raw text output, no trimming of leading and trailing white space"`
Piece OutputStyleMap `cli:"p,piece" usage:"sub CSS selectors within -css to split that block up into pieces\n\t\t\tformat: PieceName=[OutputStyle:]selector_string\n\t\t\tOutputStyle:\n\t\t\t\tRAW : will return the selected as-is\n\t\t\t\tattr[xxx] : will return the value of an attribute named xxx \n\t\t\telse the text will be returned"`
Deli string `cli:"d,delimiter" usage:"delimiter for pieces csv output" dft:"\t"`
WrapHTML bool `cli:"w,wrap-html" usage:"wrap up the output with html tags"`
Style string `cli:"y,style" usage:"style component within the wrapped html head"`
Base string `cli:"b,base" usage:"base href tag used in the wrapped up html"`
Quiet bool `cli:"q,quiet" usage:"be quiet"`
}

var root = &cli.Command{
Name: "cascadiaC",
Desc: "cascadia wrapper\nVersion " + version + " built on " + date +
"\nCopyright (C) 2021, Tong Sun",
"\nCopyright (C) 2023, Tong Sun",
Text: "Command line interface to go cascadia CSS selectors package" +
"\n\nUsage:\n cascadia -i in -c css -o [Options...]",
Argv: func() interface{} { return new(rootT) },
Expand All @@ -59,7 +59,7 @@ var root = &cli.Command{
// CSS []string
// TextOut bool
// TextRaw bool
// Piece MapStringString
// Piece OutputStyleMap
// Deli string
// WrapHTML bool
// Style string
Expand All @@ -74,7 +74,7 @@ var root = &cli.Command{
// var (
// progname = "cascadiaC"
// version = "0.1.0"
// date = "2021-11-27"
// date = "2023-01-08"

// rootArgv *rootT
// // Opts store all the configurable options
Expand Down
53 changes: 39 additions & 14 deletions cascadia_main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,31 @@ import (
// Constant and data type/structure definitions

const (
IsRaw = "RAW:"
IsRaw = "RAW"
WrapHTMLEnd = `</body>`
)

type MapStringString struct {
Keys []string
Values map[string]string
Raw map[string]bool
// OutputStyle selects how the selection made by a --piece selector is
// written to the output.
type OutputStyle int

// Output styles accepted by the --piece option, whose format is
// "PieceName=[OutputStyle:]selector_string".
const (
	// OutputStyleRAW returns the selected node as-is. It is the zero value,
	// so a key missing from an OutputStyles map reads back as RAW.
	OutputStyleRAW OutputStyle = iota
	// OutputStyleATTR returns the value of a named attribute.
	OutputStyleATTR
	// OutputStyleTEXT returns the text of the selection.
	OutputStyleTEXT
)

// OutputStyleMap holds the --piece definitions given on the command line.
// The three maps are all keyed by piece name. The field order is relied on
// by positional composite literals and must not be changed.
type OutputStyleMap struct {
	// Keys lists the piece names.
	Keys []string
	// Values maps a piece name to its CSS selector string.
	Values map[string]string
	// OutputStyles maps a piece name to the output style used for it.
	OutputStyles map[string]OutputStyle
	// AttrName maps a piece name to the attribute whose value is returned
	// when the piece uses OutputStyleATTR.
	AttrName map[string]string
}

// The OptsT type defines all the configurable options from cli.
type OptsT struct {
CSS []string
TextOut bool
TextRaw bool
Piece MapStringString
Piece OutputStyleMap
Deli string
WrapHTML bool
Style string
Expand Down Expand Up @@ -174,10 +183,14 @@ func Cascadia(bi io.Reader, bw io.Writer, Opts OptsT) error {
//fmt.Printf("] #%d: %s\n", index, item.Text())
for _, key := range piece.Keys {
//fmt.Printf("] %s: %s\n", key, piece.Values[key])
if piece.Raw[key] {
switch piece.OutputStyles[key] {
case OutputStyleRAW:
html.Render(bw, item.Find(piece.Values[key]).Get(0))
fmt.Fprintf(bw, deli)
} else {
case OutputStyleATTR:
fmt.Fprintf(bw, "%s%s",
item.Find(piece.Values[key]).AttrOr(piece.AttrName[key], ""), deli)
case OutputStyleTEXT:
fmt.Fprintf(bw, "%s%s",
item.Find(piece.Values[key]).Contents().Text(), deli)
}
Expand All @@ -196,23 +209,35 @@ func Cascadia(bi io.Reader, bw io.Writer, Opts OptsT) error {

// DecodeSlice implements cli.SliceDecoder
// NOTE: if SliceDecoder not implemented, the Decode method would be only invoked once
func (MapStringString) DecodeSlice() {}
func (OutputStyleMap) DecodeSlice() {}

// Decode implements cli.Decoder interface
func (m *MapStringString) Decode(s string) error {
func (m *OutputStyleMap) Decode(s string) error {
if (m.Values) == nil {
m.Values = make(map[string]string)
m.Raw = make(map[string]bool)
m.OutputStyles = make(map[string]OutputStyle)
m.AttrName = make(map[string]string)
}
matches := regexp.MustCompile("(.*)=(.*)").FindStringSubmatch(s)
if len(matches) < 2 {
return errors.New("format error. To get help, run: " + progname)
}
key := matches[1]
val := matches[2]
if len(val) >= 4 && val[:4] == IsRaw {
m.Raw[key] = true
val = val[4:]
index := strings.Index(val, ":")
if index > 0 {
style := val[:index]
val = val[index+1:]
if style == IsRaw {
m.OutputStyles[key] = OutputStyleRAW
} else if strings.HasPrefix(style, "attr[") && strings.HasSuffix(style, "]") {
m.OutputStyles[key] = OutputStyleATTR
m.AttrName[key] = style[5 : len(style)-1]
} else {
m.OutputStyles[key] = OutputStyleTEXT
}
} else {
m.OutputStyles[key] = OutputStyleTEXT
}
m.Keys = append(m.Keys, key)
m.Values[key] = val
Expand Down
99 changes: 97 additions & 2 deletions cascadia_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ func TestSelectors(t *testing.T) {
buf := bytes.NewBufferString("")
Opts.CSS, Opts.Piece, Opts.Deli,
Opts.WrapHTML, Opts.TextOut, Opts.TextRaw, Opts.Quiet =
[]string{test.selector}, MapStringString{}, ",",
[]string{test.selector}, OutputStyleMap{}, ",",
false, false, false, false
Cascadia(strings.NewReader(test.HTML), buf, Opts)
got := buf.String()
Expand All @@ -26,7 +26,7 @@ func TestSelectors(t *testing.T) {
}
}

////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////
// The following is taken from
// https://github.com/andybalholm/cascadia/blob/c56252c33997d9b9991f4c1e3b5fbc82d6d656b3/selector_test.go
type selectorTest struct {
Expand Down Expand Up @@ -549,3 +549,98 @@ var selectorTests = []selectorTest{
},
},
}

// PieceAttrTest is one table entry for TestPieceAttr. The field order is
// relied on by positional composite literals and must not be changed.
type PieceAttrTest struct {
	// HTML is the document handed to Cascadia.
	HTML string
	// selector is the block-level CSS selector (the -css option).
	selector string
	// results holds the expected output, one entry per output line.
	results []string
	// piece holds the --piece definitions applied within each selected block.
	piece OutputStyleMap
}

// pieceAttrHTML is the document shared by every PieceAttrTests case: four
// list items, each wrapping an anchor that carries an id and an href.
const pieceAttrHTML = `<ul>
<li><a id="a1" href="http://www.google.com/finance"/>
<li><a id="a2" href="http://finance.yahoo.com/"/>
<li><a id="a3" href="https://www.google.com/news"></a>
<li><a id="a4" href="http://news.yahoo.com"/>
</ul>`

// PieceAttrTests exercises the attribute output style of --piece. Every case
// selects the `li` blocks and reads one attribute of the nested `a` element.
// The first expected line is the piece name followed by the delimiter
// (presumably the header row written by Cascadia), then one line per block.
var PieceAttrTests = []PieceAttrTest{
	// An attribute present on every anchor.
	{
		HTML:     pieceAttrHTML,
		selector: `li`,
		results: []string{
			`id,`,
			`a1,`,
			`a2,`,
			`a3,`,
			`a4,`,
		},
		piece: OutputStyleMap{
			Keys:         []string{"id"},
			Values:       map[string]string{"id": "a"},
			OutputStyles: map[string]OutputStyle{"id": OutputStyleATTR},
			AttrName:     map[string]string{"id": "id"},
		},
	},
	// An attribute that no anchor carries yields empty fields.
	{
		HTML:     pieceAttrHTML,
		selector: `li`,
		results: []string{
			`href2,`,
			`,`,
			`,`,
			`,`,
			`,`,
		},
		piece: OutputStyleMap{
			Keys:         []string{"href2"},
			Values:       map[string]string{"href2": "a"},
			OutputStyles: map[string]OutputStyle{"href2": OutputStyleATTR},
			AttrName:     map[string]string{"href2": "href2"},
		},
	},
	// The href attribute, returned verbatim.
	{
		HTML:     pieceAttrHTML,
		selector: `li`,
		results: []string{
			`href,`,
			`http://www.google.com/finance,`,
			`http://finance.yahoo.com/,`,
			`https://www.google.com/news,`,
			`http://news.yahoo.com,`,
		},
		piece: OutputStyleMap{
			Keys:         []string{"href"},
			Values:       map[string]string{"href": "a"},
			OutputStyles: map[string]OutputStyle{"href": OutputStyleATTR},
			AttrName:     map[string]string{"href": "href"},
		},
	},
}

// TestPieceAttr runs Cascadia once per PieceAttrTests entry, with "," as the
// delimiter and all boolean options off, and compares the produced output
// with the expected lines joined by newlines (plus a trailing newline).
func TestPieceAttr(t *testing.T) {
	for _, test := range PieceAttrTests {
		var buf bytes.Buffer
		// Opts is package-level state, so every field the run depends on
		// is reset for each case.
		Opts.CSS, Opts.Piece, Opts.Deli = []string{test.selector}, test.piece, ","
		Opts.WrapHTML, Opts.TextOut, Opts.TextRaw, Opts.Quiet = false, false, false, false

		// A failed run must be reported as such rather than surfacing as a
		// confusing output mismatch below.
		if err := Cascadia(strings.NewReader(test.HTML), &buf, Opts); err != nil {
			t.Errorf("selector %q, pieces %v: unexpected error: %v",
				test.selector, test.piece.Keys, err)
			continue
		}

		got := buf.String()
		if got == "" && len(test.results) == 0 {
			// No output expected and none produced.
			continue
		}
		want := strings.Join(test.results, "\n") + "\n"
		if got != want {
			// %q keeps empty fields and stray whitespace visible.
			t.Errorf("selector %q, pieces %v: wanted %q, got %q instead",
				test.selector, test.piece.Keys, want, got)
		}
	}
}