kaizen: add -q option for quoted fields

addresses #28 and #27 Signed-off-by: Tim Bray <tbray@textuality.com>
timbray · May 11, 2024 · fa077fb · fa077fb
1 parent 8c59c94
commit fa077fb
Show file tree

Hide file tree

Showing 9 changed files with 481 additions and 48 deletions.
diff --git a/INSTALLING.md b/INSTALLING.md
@@ -0,0 +1,18 @@
+# Installing Topfew
+
+Each Topfew [release](https://github.com/timbray/topfew/releases) comes with binaries built for both the x86 and ARM
+flavors of Linux, MacOS, and Windows.
+
+Topfew comes with an uncomplicated Makefile. Typing `make` runs `go build` with no options
+and creates an executable named `tf` in the `./bin` directory.
+
+## Arch Linux
+
+Topfew [is available](https://aur.archlinux.org/packages/topfew) in the 
+[Arch User Repository](https://wiki.archlinux.org/title/Arch_User_Repository) (AUR).
+If you have an AUR pacman wrapper installed, you can use it to install Topfew directly. Otherwise, to install Topfew as an Arch package:
+```
+git clone https://aur.archlinux.org/topfew.git
+cd topfew
+makepkg -i
+```
diff --git a/README.md b/README.md
@@ -70,6 +70,13 @@ If no fieldlist is provided, **tf** treats the whole input record as a single fi
 Provides a regular expression that is used as a field separator instead of the default white space.
 This is likely to incur a significant performance cost.
 
+`-q, --quotedfields`
+
+Some files, for example Apache httpd logs, separate fields with spaces but also
+allow spaces inside fields that are delimited by `"`. The `-q`/`--quotedfields`
+argument allows **tf** to process these correctly. It is an error to specify both
+`-p` and `-q`.
+
 `-g regexp`, `--grep regexp`
 
 The  initial **g** suggests `grep`.
@@ -114,6 +121,36 @@ Records are separated by newlines, fields within records by white space, defined
 
 The field separator can be overridden with the --fieldseparator option.
 
+## Case study: Apache access_log
+
+Here is a line from an Apache httpd `access_log` file. For readability, the fields are
+separated by line breaks and numbered. Note that the fields are mostly space-separated, but that field 6,
+which holds the request (method, URL, and protocol), is delimited by quote characters `"` and contains spaces (fields 9 and 10 are quoted the same way).
+
+```
+1. 202.113.19.244 
+2. - 
+3. - 
+4. [12/Mar/2007:08:04:39 
+5. -0800] 
+6. "GET /ongoing/picInfo.xml?o=http://www.tbray.org/ongoing/When/200x/2007/03/10/Beautiful-Code HTTP/1.1" 
+7. 200 
+8. 137 
+9. "http://www.tbray.org/ongoing/When/200x/2007/03/10/Beautiful-Code" 
+10. "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2) Gecko/20070219 Firefox/2.0.0.2"
+```
+
+The fetch of `picInfo.xml` signals that this is an actual browser request, likely signifying that
+a human was involved; the URL following the `o=` is the resource the human looked at. Here is a
+**tf** invocation that yields a list of the top URLs (10 by default; add `-n 5` for the top 5) that were fetched by a human:
+
+```shell
+tf -g picInfo.xml -f 6 -q -s '\?utm.*' '' -s " HTTP/..." "" -s "GET .*\/ongoing" ""
+```
+
+Note the `-g` to select only lines with `picInfo.xml`, the `-q` to request correct processing
+of quote-delimited fields, and the sequence of `-s` patterns to clean up the results.
+
 ## Performance issues
 
 Since the effect of topfew can be exactly duplicated with a combination of `awk`, `grep`, `sed` and `sort`, you wouldn’t be using it if you didn’t care about performance. 

diff --git a/internal/config.go b/internal/config.go
@@ -17,6 +17,7 @@ type config struct {
 	filter         filters
 	width          int
 	sample         bool
+	quotedFields   bool
 }
 
 func Configure(args []string) (*config, error) {
@@ -75,6 +76,8 @@ func Configure(args []string) (*config, error) {
 			}
 		case arg == "--sample":
 			config.sample = true
+		case arg == "--quotedfields" || arg == "-q":
+			config.quotedFields = true
 		case arg == "-h" || arg == "-help" || arg == "--help":
 			fmt.Println(instructions)
 			os.Exit(0)
@@ -101,6 +104,9 @@ func Configure(args []string) (*config, error) {
 		}
 		i++
 	}
+	if (config.fieldSeparator != nil) && config.quotedFields {
+		err = errors.New("only one of -p/--fieldseparator and -q/--quotedfields may be specified")
+	}
 
 	return &config, err
 }
@@ -132,7 +138,8 @@ order of occurrences.
 Usage: tf
 	-n, --number (output line count) [default is 10]
 	-f, --fields (field list) [default is the whole record]
-	-p, --fieldseparator (field separator regex) [default is white space]
+    -p, --fieldseparator (field separator regex) [default is white space]
+	-q, --quotedfields [default is false]
 	-g, --grep (regexp) [may repeat, default is accept all]
 	-v, --vgrep (regexp) [may repeat, default is reject none]
 	-s, --sed (regexp) (replacement) [may repeat, default is no changes]
@@ -151,6 +158,11 @@ Fields are separated by white space (spaces or tabs) by default.
 This can be overridden with the --fieldseparator option, at some cost in
 performance.
 
+Some files, for example Apache httpd logs, use space-separation but also
+allow spaces within fields which are quoted with ("). The -q/--quotedfields
+allows tf to process these correctly. It is an error to specify both
+-p and -q.
+
 The regexp-valued fields work as follows:
 -g/--grep discards records that don't match the regexp (g for grep)
 -v/--vgrep discards records that do match the regexp (v for grep -v)

diff --git a/internal/config_test.go b/internal/config_test.go
@@ -15,10 +15,12 @@ func TestArgSyntax(t *testing.T) {
 		{"--sed"}, {"-s", "x"}, {"--sample", "--sed", "1"},
 		{"--width", "a"}, {"-w", "0"}, {"--sample", "-w"},
 		{"--sample", "-p"}, {"--fieldseparator", "a["},
+		{"--fieldseparator", "x", "-q"}, {"--quotedfields", "-f", "z"},
 	}
 
 	// not testing -h/--help because it'd be extra work to avoid printing out the usage
 	goods := [][]string{
+		{"-q", "fname"}, {"--quotedfields"},
 		{"--number", "1"}, {"-n", "5"},
 		{"--fields", "1"}, {"-f", "3,5"},
 		{"--grep", "re1"}, {"-g", "re2"},

diff --git a/internal/keyfinder.go b/internal/keyfinder.go
@@ -21,78 +21,127 @@ const NER = "not enough bytes in record"
 // does mean that the contents of the field are only valid until you call getKey again, and also that
 // the keyFinder type is not thread-safe
 type keyFinder struct {
-	fields    []uint
-	key       []byte
-	separator *regexp.Regexp
+	fields       []uint
+	key          []byte
+	separator    *regexp.Regexp
+	quotedFields bool
 }
 
 // newKeyFinder creates a new Key finder with the supplied field numbers, the input should be 1 based.
 // keyFinder is not thread-safe, you should clone it for each goroutine that uses it.
-func newKeyFinder(keys []uint, separator *regexp.Regexp) *keyFinder {
+func newKeyFinder(keys []uint, separator *regexp.Regexp, quotedFields bool) *keyFinder {
 	kf := keyFinder{
 		key: make([]byte, 0, 128),
 	}
 	for _, knum := range keys {
 		kf.fields = append(kf.fields, knum-1)
 	}
 	kf.separator = separator
+	kf.quotedFields = quotedFields
 	return &kf
 }
 
 // clone returns a new keyFinder with the same configuration. Each goroutine should use its own
 // keyFinder instance.
 func (kf *keyFinder) clone() *keyFinder {
 	return &keyFinder{
-		fields:    kf.fields,
-		key:       make([]byte, 0, 128),
-		separator: kf.separator,
+		fields:       kf.fields,
+		key:          make([]byte, 0, 128),
+		separator:    kf.separator,
+		quotedFields: kf.quotedFields,
 	}
 }
 
 // getKey extracts a key from the supplied record. This is applied to every record,
 // so efficiency matters.
 func (kf *keyFinder) getKey(record []byte) ([]byte, error) {
-	// if there are no Key-finders just return the record, minus any trailing newlines
+	// chomp
+	if record[len(record)-1] == '\n' {
+		record = record[:len(record)-1]
+	}
+	// if there are no Key-finders the key is the record
 	if len(kf.fields) == 0 {
-		if record[len(record)-1] == '\n' {
-			record = record[0 : len(record)-1]
-		}
 		return record, nil
 	}
 	var err error
 	kf.key = kf.key[:0]
 	if kf.separator == nil {
-		field := 0
-		index := 0
-		first := true
-
-		// for each field in the Key
-		for _, keyField := range kf.fields {
-			// bypass fields before the one we want
-			for field < int(keyField) {
-				index, err = pass(record, index)
+		// no regex provided, we're doing space-separation
+		if kf.quotedFields {
+			// if we're doing apache httpd style access_log files, with some "-quoted fields
+			field := 0
+			index := 0
+			first := true
+
+			// for each field in the key
+			for _, keyField := range kf.fields {
+				// bypass fields before the one we want
+				for field < int(keyField) {
+					index, err = passQuoted(record, index)
+					if err != nil {
+						return nil, err
+					}
+					// in the special case where we might have just passed a quoted field, we will
+					// advance index past the closing quote
+					if index < len(record) && record[index] == '"' {
+						index++
+					}
+					field++
+				}
+
+				// join(' ', kf)
+				if first {
+					first = false
+				} else {
+					kf.key = append(kf.key, ' ')
+				}
+
+				kf.key, index, err = gatherQuoted(kf.key, record, index)
 				if err != nil {
 					return nil, err
 				}
+				// in the special case where we might have just gathered a quoted field, we will
+				// advance index past the closing quote
+				if index < len(record) && record[index] == '"' {
+					index++
+				}
 				field++
 			}
+		} else {
+			// basic space-separation
+			field := 0
+			index := 0
+			first := true
 
-			// join(' ', kf)
-			if first {
-				first = false
-			} else {
-				kf.key = append(kf.key, ' ')
-			}
+			// for each field in the Key
+			for _, keyField := range kf.fields {
+				// bypass fields before the one we want
+				for field < int(keyField) {
+					index, err = pass(record, index)
+					if err != nil {
+						return nil, err
+					}
+					field++
+				}
 
-			// attach desired field to Key
-			kf.key, index, err = gather(kf.key, record, index)
-			if err != nil {
-				return nil, err
-			}
+				// join(' ', kf)
+				if first {
+					first = false
+				} else {
+					kf.key = append(kf.key, ' ')
+				}
 
-			field++
+				// attach desired field to Key
+				kf.key, index, err = gather(kf.key, record, index)
+				if err != nil {
+					return nil, err
+				}
+
+				field++
+			}
 		}
 	} else {
+		// regex separator provided, less code but probably slower
 		allFields := kf.separator.Split(string(record), -1)
 		for i, field := range kf.fields {
 			if int(field) >= len(allFields) {
@@ -107,9 +156,10 @@ func (kf *keyFinder) getKey(record []byte) ([]byte, error) {
 	return kf.key, err
 }
 
-// pull in the bytes from a desired field
+// gather pulls in the bytes from a desired field, and leaves index positioned at the first white-space
+// character following the field, or at the end of the record, i.e. len(record)
 func gather(key []byte, record []byte, index int) ([]byte, int, error) {
-	// eat leading space
+	// eat leading space - if we're already at the end of the record, the loop is a no-op
 	for index < len(record) && (record[index] == ' ' || record[index] == '\t') {
 		index++
 	}
@@ -118,13 +168,49 @@ func gather(key []byte, record []byte, index int) ([]byte, int, error) {
 	}
 
 	// copy Key bytes
-	for index < len(record) && record[index] != ' ' && record[index] != '\t' && record[index] != '\n' {
-		key = append(key, record[index])
+	startAt := index
+	for index < len(record) && record[index] != ' ' && record[index] != '\t' {
 		index++
 	}
+	key = append(key, record[startAt:index]...)
 	return key, index, nil
 }
 
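+// gatherQuoted has the same semantics as gather, but respects "-quoted fields that might contain spaces. After a
+// quoted field it leaves index pointing at the closing quote; otherwise at the first white-space character after the field or at len(record)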
+func gatherQuoted(key []byte, record []byte, index int) ([]byte, int, error) {
+	// eat leading space
+	for index < len(record) && (record[index] == ' ' || record[index] == '\t') {
+		index++
+	}
+	if index >= len(record) {
+		return nil, 0, errors.New(NER)
+	}
+
+	if record[index] == '"' {
+		index++
+		startAt := index
+		for index < len(record) && record[index] != '"' {
+			index++
+		}
+		key = append(key, record[startAt:index]...)
+		// if we hit end-of-record before the closing quote, that's an error
+		if index == len(record) {
+			return nil, 0, errors.New(NER)
+		}
+	} else {
+		startAt := index
+		for index < len(record) && record[index] != ' ' && record[index] != '\t' {
+			index++
+		}
+		key = append(key, record[startAt:index]...)
+	}
+	return key, index, nil
+}
+
+// pass moves the index variable past any white space and a space-separated field,
+// leaving index pointing at the first white-space character after the field or
+// at the end of record, i.e. == len(record)
 func pass(record []byte, index int) (int, error) {
 	// eat leading space
 	for index < len(record) && (record[index] == ' ' || record[index] == '\t') {
@@ -138,3 +224,30 @@ func pass(record []byte, index int) (int, error) {
 	}
 	return index, nil
 }
+
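+// passQuoted has the same semantics as pass, but respects "-quoted fields. After a quoted field it leaves
+// index pointing at the closing "; otherwise at the first white-space character after the field or at len(record)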
+func passQuoted(record []byte, index int) (int, error) {
+	// eat leading space
+	for index < len(record) && (record[index] == ' ' || record[index] == '\t') {
+		index++
+	}
+	if index == len(record) {
+		return 0, errors.New(NER)
+	}
+	if record[index] == '"' {
+		index++
+		for index < len(record) && record[index] != '"' {
+			index++
+		}
+		// if we hit end of record before the closing quote, that's an error
+		if index >= len(record) {
+			return 0, errors.New(NER)
+		}
+	} else {
+		for index < len(record) && record[index] != ' ' && record[index] != '\t' {
+			index++
+		}
+	}
+	return index, nil
+}