Improve output of config proposal #120

Merged · 1 commit · Sep 6, 2022
73 changes: 1 addition & 72 deletions README.md
@@ -7,7 +7,7 @@ This project's goal is to make it easier to scrape structured data from web pages
different venue websites. However, the code has been rewritten to handle a more general use case of extracting a list of items from any website.
This could be a list of books from an online book store, a list of plays in a public theater, a list of newspaper articles, etc. Currently, information can only be extracted from static websites.

Note that there are already similar projects that might do a better job in certain cases or are more generic tools. However, on the one hand this is a personal project to familiarize myself with web scraping and Go, and on the other hand goskyr supports certain features that I haven't found in any other project. For instance, the way dates can be extracted from websites and the notion of scraping information from subpages defined by URLs that were previously extracted at runtime.
Note that there are already similar projects that might do a better job in certain cases or are more generic tools. However, on the one hand this is a personal project to familiarize myself with web scraping and Go, and on the other hand goskyr supports certain features that I haven't found in any other project. For instance, the way dates can be extracted from websites and the notion of scraping information from subpages defined by URLs that were previously extracted at runtime. Be sure to check out the section on [auto configuration](#auto-configuration-experimental).

Similar projects:

@@ -50,77 +50,6 @@ goskyr -generate "https://www.goodreads.com/quotes/tag/life"
which will automatically find repeating fields, ask you to choose a subset of those fields, and then return the resulting config
snippet, which might look something like this:

```yaml
writer:
  type: ""
  uri: ""
  user: ""
  password: ""
  filepath: ""
scrapers:
  - name: ""
    url: https://www.goodreads.com/quotes/tag/life
    item: body > div.content > div.mainContentContainer > div.mainContent > div.mainContentFloat > div.leftContainer > div.quote.mediumText > div.quoteDetails
    exclude_with_selector: []
    fields:
      static: []
      dynamic:
        - name: field-0
          type: text
          location:
            selector: div.quoteText > span.authorOrTitle
            node_index: 0
            child_index: 0
            regex_extract:
              exp: ""
              index: 0
            attr: ""
            max_length: 0
            entire_subtree: false
          on_subpage: ""
          can_be_empty: false
          components: []
          date_location: ""
          date_language: ""
          hide: false
        - name: field-1
          type: text
          location:
            selector: div.quoteText
            node_index: 0
            child_index: 2
            regex_extract:
              exp: ""
              index: 0
            attr: ""
            max_length: 0
            entire_subtree: false
          on_subpage: ""
          can_be_empty: false
          components: []
          date_location: ""
          date_language: ""
          hide: false
    filters: []
    paginator:
      location:
        selector: ""
        node_index: 0
        child_index: 0
        regex_extract:
          exp: ""
          index: 0
        attr: ""
        max_length: 0
        entire_subtree: false
      max_pages: 0
global:
  user-agent: ""
```

Note that currently all fields are displayed, even though most of them contain default values that you normally wouldn't have to configure. As a
consequence, they can be removed manually for better readability, which in the above example results in:

```yaml
scrapers:
  - name: ""
    # …
```
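The diff view collapses the rest of that shortened snippet. Reconstructing it from the full proposal above by dropping every default or zero value, the cleaned-up config would plausibly read in full (a sketch, not the literal collapsed content):

```yaml
scrapers:
  - name: ""
    url: https://www.goodreads.com/quotes/tag/life
    item: body > div.content > div.mainContentContainer > div.mainContent > div.mainContentFloat > div.leftContainer > div.quote.mediumText > div.quoteDetails
    fields:
      dynamic:
        - name: field-0
          type: text
          location:
            selector: div.quoteText > span.authorOrTitle
        - name: field-1
          type: text
          location:
            selector: div.quoteText
            child_index: 2
```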
2 changes: 0 additions & 2 deletions config-gen.yml
@@ -2,9 +2,7 @@ scrapers:
- name: ""
url: https://www.goodreads.com/quotes/tag/life
item: body > div.content > div.mainContentContainer > div.mainContent > div.mainContentFloat > div.leftContainer > div.quote.mediumText > div.quoteDetails
exclude_with_selector: []
fields:
static: []
dynamic:
- name: field-0
type: text
6 changes: 0 additions & 6 deletions generate/config.go
@@ -180,12 +180,6 @@ parse:
				Selector:   p,
				ChildIndex: nrChildren[p],
			}
			// this check has to be updated
			// we cannot check the exact path because a repeating node might have both
			// repeating classes and non-repeating classes. So instead of checking whether
			// we have seen the exact location, we need to check whether there is a location
			// where for each node in the path there is at least one overlapping class
			// (if at least one of the two nodes has a class)
			locMan = update(locMan, l, strings.TrimSpace(text))
		}
		nrChildren[p] += 1
50 changes: 25 additions & 25 deletions scraper/scraper.go
@@ -31,9 +31,9 @@ type GlobalConfig struct {
// Values will be taken from a config yml file or environment variables
// or both.
type Config struct {
	Writer   output.WriterConfig `yaml:"writer"`
	Scrapers []Scraper           `yaml:"scrapers"`
	Global   GlobalConfig        `yaml:"global"`
	Writer   output.WriterConfig `yaml:"writer,omitempty"`
	Scrapers []Scraper           `yaml:"scrapers,omitempty"`
	Global   GlobalConfig        `yaml:"global,omitempty"`
}

func NewConfig(configPath string) (*Config, error) {
@@ -65,13 +65,13 @@ type RegexConfig struct {

// ElementLocation is used to find a specific string in an HTML document
type ElementLocation struct {
	Selector      string      `yaml:"selector"`
	NodeIndex     int         `yaml:"node_index"`
	ChildIndex    int         `yaml:"child_index"`
	RegexExtract  RegexConfig `yaml:"regex_extract"`
	Attr          string      `yaml:"attr"`
	MaxLength     int         `yaml:"max_length"`
	EntireSubtree bool        `yaml:"entire_subtree"`
	Selector      string      `yaml:"selector,omitempty"`
	NodeIndex     int         `yaml:"node_index,omitempty"`
	ChildIndex    int         `yaml:"child_index,omitempty"`
	RegexExtract  RegexConfig `yaml:"regex_extract,omitempty"`
	Attr          string      `yaml:"attr,omitempty"`
	MaxLength     int         `yaml:"max_length,omitempty"`
	EntireSubtree bool        `yaml:"entire_subtree,omitempty"`
}
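These `,omitempty` flags are what shrink the generated proposal: when the config is marshalled, any field still at its zero value is simply skipped. A minimal, self-contained sketch of that behavior (assuming gopkg.in/yaml.v3; goskyr's actual YAML dependency may differ, but yaml.v2 honors the flag the same way):

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v3" // assumed library; omitempty semantics match yaml.v2
)

// ElementLocation mirrors the struct above, with the PR's omitempty tags.
type ElementLocation struct {
	Selector   string `yaml:"selector,omitempty"`
	NodeIndex  int    `yaml:"node_index,omitempty"`
	ChildIndex int    `yaml:"child_index,omitempty"`
}

func main() {
	// Only Selector and ChildIndex are set; NodeIndex stays at its zero value.
	loc := ElementLocation{Selector: "div.quoteText", ChildIndex: 2}
	out, err := yaml.Marshal(&loc)
	if err != nil {
		panic(err)
	}
	fmt.Print(string(out))
	// Output:
	// selector: div.quoteText
	// child_index: 2
	// (node_index is zero and is therefore omitted)
}
```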

// CoveredDateParts is used to determine what parts of a date a
@@ -103,16 +103,16 @@ type StaticField struct {
// for each item
type DynamicField struct {
	Name string `yaml:"name"`
	Type string `yaml:"type"` // can currently be text, url or date
	Type string `yaml:"type,omitempty"` // can currently be text, url or date
	// If a field can be found on a subpage the following variable has to contain a field name of
	// a field of type 'url' that is located on the main page.
	ElementLocation ElementLocation `yaml:"location"`
	OnSubpage       string          `yaml:"on_subpage"`    // applies to text, url, date
	CanBeEmpty      bool            `yaml:"can_be_empty"`  // applies to text, url
	Components      []DateComponent `yaml:"components"`    // applies to date
	DateLocation    string          `yaml:"date_location"` // applies to date
	DateLanguage    string          `yaml:"date_language"` // applies to date
	Hide            bool            `yaml:"hide"`          // applies to text, url, date
	ElementLocation ElementLocation `yaml:"location,omitempty"`
	OnSubpage       string          `yaml:"on_subpage,omitempty"`    // applies to text, url, date
	CanBeEmpty      bool            `yaml:"can_be_empty,omitempty"`  // applies to text, url
	Components      []DateComponent `yaml:"components,omitempty"`    // applies to date
	DateLocation    string          `yaml:"date_location,omitempty"` // applies to date
	DateLanguage    string          `yaml:"date_language,omitempty"` // applies to date
	Hide            bool            `yaml:"hide,omitempty"`          // applies to text, url, date
}

// A Filter is used to filter certain items from the result list
@@ -128,16 +128,16 @@ type Scraper struct {
	Name                string   `yaml:"name"`
	URL                 string   `yaml:"url"`
	Item                string   `yaml:"item"`
	ExcludeWithSelector []string `yaml:"exclude_with_selector"`
	ExcludeWithSelector []string `yaml:"exclude_with_selector,omitempty"`
	Fields              struct {
		Static  []StaticField  `yaml:"static"`
		Dynamic []DynamicField `yaml:"dynamic"`
		Static  []StaticField  `yaml:"static,omitempty"`
		Dynamic []DynamicField `yaml:"dynamic,omitempty"`
	} `yaml:"fields"`
	Filters   []Filter `yaml:"filters"`
	Filters   []Filter `yaml:"filters,omitempty"`
	Paginator struct {
		Location ElementLocation `yaml:"location"`
		MaxPages int             `yaml:"max_pages"`
	}
		Location ElementLocation `yaml:"location,omitempty"`
		MaxPages int             `yaml:"max_pages,omitempty"`
	} `yaml:"paginator,omitempty"`
}
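The tag added to the closing brace of the anonymous Paginator struct matters as well: under `omitempty`, both yaml.v2 and yaml.v3 drop a struct whose exported fields are all zero-valued, so a scraper that doesn't paginate no longer emits an empty paginator block. A sketch of the effect on the generated output (assumed, based on that documented omitempty behavior):

```yaml
# Before this change, every generated scraper carried a block like:
paginator:
  location:
    selector: ""
    node_index: 0
    child_index: 0
  max_pages: 0
# After it, a zero-valued Paginator produces no output at all.
```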

// GetItems fetches and returns all items from a website according to the