Use int8 for mvtype (memory reduction)

johnkerl · Nov 26, 2022 · 9b8d65c · 9b8d65c
1 parent a299ce2
commit 9b8d65c
Show file tree

Hide file tree

Showing 17 changed files with 368 additions and 347 deletions.
diff --git a/internal/pkg/bifs/cmp.go b/internal/pkg/bifs/cmp.go
@@ -9,7 +9,8 @@ import (
 	"github.com/johnkerl/miller/internal/pkg/mlrval"
 )
 
-//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+//   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+//
 // string_cmp implements the spaceship operator for strings.
 func string_cmp(a, b string) int64 {
 	if a < b {
@@ -43,7 +44,7 @@ func float_cmp(a, b float64) int64 {
 	return 0
 }
 
-//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 func eq_b_ss(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromBool(input1.AcquireStringValue() == input2.AcquireStringValue())
 }
@@ -66,7 +67,7 @@ func cmp_b_ss(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromInt(int64(string_cmp(input1.AcquireStringValue(), input2.AcquireStringValue())))
 }
 
-//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 func eq_b_xs(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromBool(input1.String() == input2.AcquireStringValue())
 }
@@ -89,7 +90,7 @@ func cmp_b_xs(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromInt(int64(string_cmp(input1.String(), input2.AcquireStringValue())))
 }
 
-//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 func eq_b_sx(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromBool(input1.AcquireStringValue() == input2.String())
 }
@@ -112,7 +113,7 @@ func cmp_b_sx(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromInt(string_cmp(input1.AcquireStringValue(), input2.String()))
 }
 
-//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 func eq_b_ii(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromBool(input1.AcquireIntValue() == input2.AcquireIntValue())
 }
@@ -135,7 +136,7 @@ func cmp_b_ii(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromInt(int_cmp(input1.AcquireIntValue(), input2.AcquireIntValue()))
 }
 
-//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 func eq_b_if(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromBool(float64(input1.AcquireIntValue()) == input2.AcquireFloatValue())
 }
@@ -158,7 +159,7 @@ func cmp_b_if(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromInt(float_cmp(float64(input1.AcquireIntValue()), input2.AcquireFloatValue()))
 }
 
-//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 func eq_b_fi(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromBool(input1.AcquireFloatValue() == float64(input2.AcquireIntValue()))
 }
@@ -181,7 +182,7 @@ func cmp_b_fi(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromInt(float_cmp(input1.AcquireFloatValue(), float64(input2.AcquireIntValue())))
 }
 
-//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 func eq_b_ff(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromBool(input1.AcquireFloatValue() == input2.AcquireFloatValue())
 }
@@ -204,7 +205,7 @@ func cmp_b_ff(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromInt(float_cmp(input1.AcquireFloatValue(), input2.AcquireFloatValue()))
 }
 
-//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 func eq_b_bb(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromBool(input1.AcquireBoolValue() == input2.AcquireBoolValue())
 }
@@ -231,7 +232,7 @@ func cmp_b_bb(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromInt(int_cmp(lib.BoolToInt(input1.AcquireBoolValue()), lib.BoolToInt(input2.AcquireBoolValue())))
 }
 
-//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 func eq_b_aa(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	a := input1.AcquireArrayValue()
 	b := input2.AcquireArrayValue()
@@ -257,7 +258,7 @@ func ne_b_aa(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromBool(!output.AcquireBoolValue())
 }
 
-//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 func eq_b_mm(input1, input2 *mlrval.Mlrval) *mlrval.Mlrval {
 	return mlrval.FromBool(input1.AcquireMapValue().Equals(input2.AcquireMapValue()))
 }

diff --git a/internal/pkg/bifs/collections.go b/internal/pkg/bifs/collections.go
@@ -867,17 +867,17 @@ func unaliasArrayLengthIndex(n int, mindex int) (int, bool) {
 }
 
 // MillerSliceAccess is code shared by the string-slicer and the array-slicer.
-// * Miller indices are 1-up, 1..n where n is the length of the array/string.
-//   They are also aliased -n..-1. These are called "mindex" (if int) or "index mlrval"
-//   (if mlrval).
-// * Go indices are 0-up, with no aliasing. These are called "zindex".
-// * The job of this routine is to map a pair of index-mlrval to a pair of zindex,
-//   with possible outcomes that the slice access should result in an empty array/string,
-//   or Mlrval of type absent, or Mlrval of type error.
-// * Callsites include the DSL array-slicer (e.g. [1,2,3,4,5][2:3]), the DSL string-slicer
-//   (e.g. "abcde"[2:3]), the substr1 function (e.g. substr1("abcde", 2, 3)), and the substr0
-//   function (e.g. substr0("abcde", 1, 2)).
-// * The isZeroUp argument is in support of substr0.
+//   - Miller indices are 1-up, 1..n where n is the length of the array/string.
+//     They are also aliased -n..-1. These are called "mindex" (if int) or "index mlrval"
+//     (if mlrval).
+//   - Go indices are 0-up, with no aliasing. These are called "zindex".
+//   - The job of this routine is to map a pair of index-mlrval to a pair of zindex,
+//     with possible outcomes that the slice access should result in an empty array/string,
+//     or Mlrval of type absent, or Mlrval of type error.
+//   - Callsites include the DSL array-slicer (e.g. [1,2,3,4,5][2:3]), the DSL string-slicer
+//     (e.g. "abcde"[2:3]), the substr1 function (e.g. substr1("abcde", 2, 3)), and the substr0
+//     function (e.g. substr0("abcde", 1, 2)).
+//   - The isZeroUp argument is in support of substr0.
 func MillerSliceAccess(
 	lowerIndexMlrval *mlrval.Mlrval,
 	upperIndexMlrval *mlrval.Mlrval,

diff --git a/internal/pkg/cli/flag_types.go b/internal/pkg/cli/flag_types.go
@@ -50,13 +50,13 @@ import (
 // Data types used within the flags table.
 
 // FlagParser is a function which takes a flag such as `--foo`.
-// * It should assume that a flag.Owns method has already been invoked to be
-//   sure that this function is indeed the right one to call for `--foo`.
-// * The FlagParser function is responsible for advancing *pargi by 1 (if
-//   `--foo`) or 2 (if `--foo bar`), checking to see if argc is long enough in
-//   the latter case, and mutating the options struct.
-// * Successful handling of the flag is indicated by this function making a
-//   non-zero increment of *pargi.
+//   - It should assume that a flag.Owns method has already been invoked to be
+//     sure that this function is indeed the right one to call for `--foo`.
+//   - The FlagParser function is responsible for advancing *pargi by 1 (if
+//     `--foo`) or 2 (if `--foo bar`), checking to see if argc is long enough in
+//     the latter case, and mutating the options struct.
+//   - Successful handling of the flag is indicated by this function making a
+//     non-zero increment of *pargi.
 type FlagParser func(
 	args []string,
 	argc int,

diff --git a/internal/pkg/cli/option_parse.go b/internal/pkg/cli/option_parse.go
@@ -19,14 +19,14 @@ import (
 )
 
 // FinalizeReaderOptions does a few things.
-// * If a file format was specified but one or more separators were not, a
-//   default specific to that file format is applied.
-// * Computing regexes for IPS and IFS, and unbackslashing IRS.  This is
-//   because the '\n' at the command line which is Go "\\n" (a backslash and an
-//   n) needs to become the single newline character, and likewise for "\t", etc.
-// * IFS/IPS can have escapes like "\x1f" which aren't valid regex literals
-//   so we unhex them. For example, from "\x1f" -- the four bytes '\', 'x', '1', 'f'
-//   -- to the single byte with hex code 0x1f.
+//   - If a file format was specified but one or more separators were not, a
+//     default specific to that file format is applied.
+//   - Computing regexes for IPS and IFS, and unbackslashing IRS.  This is
+//     because the '\n' at the command line which is Go "\\n" (a backslash and an
+//     n) needs to become the single newline character, and likewise for "\t", etc.
+//   - IFS/IPS can have escapes like "\x1f" which aren't valid regex literals
+//     so we unhex them. For example, from "\x1f" -- the four bytes '\', 'x', '1', 'f'
+//     -- to the single byte with hex code 0x1f.
 func FinalizeReaderOptions(readerOptions *TReaderOptions) {
 
 	readerOptions.IFS = lib.UnhexStringLiteral(readerOptions.IFS)

diff --git a/internal/pkg/climain/mlrcli_shebang.go b/internal/pkg/climain/mlrcli_shebang.go
@@ -10,12 +10,16 @@ import (
 )
 
 // maybeInterpolateDashS supports Miller scripts with shebang lines like
-//   #!/usr/bin/env mlr -s
-//   --csv tac then filter '
-//     NR % 2 == 1
-//   '
+//
+//	#!/usr/bin/env mlr -s
+//	--csv tac then filter '
+//	  NR % 2 == 1
+//	'
+//
 // invoked as
-//   scriptfile input1.csv input2.csv
+//
+//	scriptfile input1.csv input2.csv
+//
 // The "-s" flag must be the very first command-line argument after "mlr" for
 // two reasons:
 // * This is how shebang lines work

diff --git a/internal/pkg/dsl/cst/builtin_functions.go b/internal/pkg/dsl/cst/builtin_functions.go
@@ -385,21 +385,21 @@ func (node *TernaryFunctionWithStateCallsiteNode) Evaluate(
 //
 // Note the use of "capture" is ambiguous:
 //
-// * There is the regex-match part which captures submatches out
-//   of a full match expression, and saves them.
+//   - There is the regex-match part which captures submatches out
+//     of a full match expression, and saves them.
 //
 // * Then there is the part which inserts these captures into another string.
 //
-// * For sub/gsub, the former and latter are both within the sub/gsub routine.
-//   E.g. with
+//   - For sub/gsub, the former and latter are both within the sub/gsub routine.
+//     E.g. with
 //     $y = sub($x, "(..)_(...)", "\2:\1")
-//   and $x being "ab_cde", $y will be "cde:ab".
+//     and $x being "ab_cde", $y will be "cde:ab".
 //
-// * For =~ and !=~, the former are right there, but the latter can be several
-//   lines later. E.g.
+//   - For =~ and !=~, the former are right there, but the latter can be several
+//     lines later. E.g.
 //     if ($x =~ "(..)_(...)") {
-//       ... other lines of code ...
-//       $y = "\2:\1";
+//     ... other lines of code ...
+//     $y = "\2:\1";
 //     }
 //
 // So: this RegexCaptureBinaryFunctionCallsiteNode only refers to the =~ and

diff --git a/internal/pkg/dsl/cst/leaves.go b/internal/pkg/dsl/cst/leaves.go
@@ -244,10 +244,10 @@ type StringLiteralNode struct {
 // "\9" in it.  As of the original design of Miller, submatches are captured
 // in one place and interpolated in another. For example:
 //
-//   if ($x =~ "(..)_(...)") {
-//     ... other lines of code ...
-//     $y = "\2:\1";
-//   }
+//	if ($x =~ "(..)_(...)") {
+//	  ... other lines of code ...
+//	  $y = "\2:\1";
+//	}
 //
 // This node type is for things like "\2:\1". They can occur quite far from the
 // =~ callsite so we need to check all string literals to see if they have "\0"
@@ -287,10 +287,10 @@ func (node *StringLiteralNode) Evaluate(
 
 // As noted above, in things like
 //
-//   if ($x =~ "(..)_(...)") {
-//     ... other lines of code ...
-//     $y = "\2:\1";
-//   }
+//	if ($x =~ "(..)_(...)") {
+//	  ... other lines of code ...
+//	  $y = "\2:\1";
+//	}
 //
 // the captures can be set (by =~ or !=~) quite far from where they are used.
 // This is why we consult the state.RegexCaptures here, to see if they've been

diff --git a/internal/pkg/dsl/cst/types.go b/internal/pkg/dsl/cst/types.go
@@ -20,12 +20,12 @@ import (
 // Namely, for "bare booleans" which are non-assignment statements like 'NR >
 // 10' or 'true' or '$x =~ "(..)_(...)"' or even '1+2'.
 //
-// * For mlr put, bare booleans are no-ops; except side-effects (like
-//   regex-captures)
-// * For mlr filter, they set the filter condition only if they're the last
-//   statement in the main block.
-// * For mlr repl, similar to mlr filter: they are used to track the output to
-//   be printed for an expression entered at the REPL prompt.
+//   - For mlr put, bare booleans are no-ops; except side-effects (like
+//     regex-captures)
+//   - For mlr filter, they set the filter condition only if they're the last
+//     statement in the main block.
+//   - For mlr repl, similar to mlr filter: they are used to track the output to
+//     be printed for an expression entered at the REPL prompt.
 type DSLInstanceType int
 
 const (

diff --git a/internal/pkg/dsl/cst/validate.go b/internal/pkg/dsl/cst/validate.go
@@ -272,13 +272,13 @@ func validateForLoopTwoVariableUniqueNames(astNode *dsl.ASTNode) error {
 // Check against 'for ((a,a), b in $*)' or 'for ((a,b), a in $*)' -- repeated 'a'.
 // AST:
 // * statement block
-//   * multi-variable for-loop "for"
-//     * parameter list
-//       * local variable "a"
-//       * local variable "b"
-//     * local variable "a"
-//     * full record "$*"
-//     * statement block
+//   - multi-variable for-loop "for"
+//   - parameter list
+//   - local variable "a"
+//   - local variable "b"
+//   - local variable "a"
+//   - full record "$*"
+//   - statement block
 func validateForLoopMultivariableUniqueNames(astNode *dsl.ASTNode) error {
 	lib.InternalCodingErrorIf(astNode.Type != dsl.NodeTypeForLoopMultivariable)
 	keyVarsNode := astNode.Children[0]

diff --git a/internal/pkg/input/record_reader_xtab.go b/internal/pkg/input/record_reader_xtab.go
@@ -124,13 +124,13 @@ func (reader *RecordReaderXTAB) processHandle(
 
 // Given input like
 //
-//   a 1
-//   b 2
-//   c 3
+//	a 1
+//	b 2
+//	c 3
 //
-//   a 4
-//   b 5
-//   c 6
+//	a 4
+//	b 5
+//	c 6
 //
 // this function reads the input stream a line at a time, then produces
 // string-lists one per stanza where a stanza is delimited by blank line, or

diff --git a/internal/pkg/lib/regex.go b/internal/pkg/lib/regex.go
@@ -145,16 +145,16 @@ func RegexReplacementHasCaptures(
 // RegexMatches implements the =~ DSL operator. The captures are stored in DSL
 // state and may be used by a DSL statement after the =~. For example, in
 //
-//   sub($a, "(..)_(...)", "\1:\2")
+//	sub($a, "(..)_(...)", "\1:\2")
 //
 // the replacement string is an argument to sub and therefore the captures are
 // confined to the implementation of the sub function.  Similarly for gsub. But
 // for the match operator, people can do
 //
-//   if ($x =~ "(..)_(...)") {
-//     ... other lines of code ...
-//     $y = "\2:\1"
-//   }
+//	if ($x =~ "(..)_(...)") {
+//	  ... other lines of code ...
+//	  $y = "\2:\1"
+//	}
 //
 // and the =~ callsite doesn't know if captures will be used or not. So,
 // RegexMatches always returns the captures array. It is stored within the CST
@@ -229,18 +229,18 @@ func RegexMatchesCompiled(
 }
 
 // InterpolateCaptures example:
-// * Input $x is "ab_cde"
-// * DSL expression
+//   - Input $x is "ab_cde"
+//   - DSL expression
 //     if ($x =~ "(..)_(...)") {
-//       ... other lines of code ...
-//       $y = "\2:\1";
+//     ... other lines of code ...
+//     $y = "\2:\1";
 //     }
-// * InterpolateCaptures is used on the evaluation of "\2:\1"
-// * replacementString is "\2:\1"
-// * replacementMatrix contains precomputed/cached offsets for the "\2" and
-//   "\1" substrings within "\2:\1"
-// * captures has slot 0 being "ab_cde" (for "\0"), slot 1 being "ab" (for "\1"),
-//   slot 2 being "cde" (for "\2"), and slots 3-9 being "".
+//   - InterpolateCaptures is used on the evaluation of "\2:\1"
+//   - replacementString is "\2:\1"
+//   - replacementMatrix contains precomputed/cached offsets for the "\2" and
+//     "\1" substrings within "\2:\1"
+//   - captures has slot 0 being "ab_cde" (for "\0"), slot 1 being "ab" (for "\1"),
+//     slot 2 being "cde" (for "\2"), and slots 3-9 being "".
 func InterpolateCaptures(
 	replacementString string,
 	replacementMatrix [][]int,

diff --git a/internal/pkg/mlrval/mlrmap_accessors.go b/internal/pkg/mlrval/mlrmap_accessors.go
@@ -208,12 +208,12 @@ func (mlrmap *Mlrmap) findEntry(key string) *MlrmapEntry {
 // findEntryByPositionalIndex is for '$[1]' etc. in the DSL.
 //
 // Notes:
-// * This is a linear search.
-// * Indices are 1-up not 0-up
-// * Indices -n..-1 are aliases for 1..n. In particular, it will be faster to
-//   get the -1st field than the nth.
-// * Returns 0 on invalid index: 0, or < -n, or > n where n is the number of
-//   fields.
+//   - This is a linear search.
+//   - Indices are 1-up not 0-up
+//   - Indices -n..-1 are aliases for 1..n. In particular, it will be faster to
+//     get the -1st field than the nth.
+//   - Returns 0 on invalid index: 0, or < -n, or > n where n is the number of
+//     fields.
 func (mlrmap *Mlrmap) findEntryByPositionalIndex(position int64) *MlrmapEntry {
 	if position > mlrmap.FieldCount || position < -mlrmap.FieldCount || position == 0 {
 		return nil
diff --git a/internal/pkg/input/record_reader_xtab.go b/internal/pkg/input/record_reader_xtab.go
@@ -124,13 +124,13 @@ func (reader *RecordReaderXTAB) processHandle(
     // Given input like
     //
-    //   a 1
-    //   b 2
-    //   c 3
+    //	a 1
+    //	b 2
+    //	c 3
     //
-    //   a 4
-    //   b 5
-    //   c 6
+    //	a 4
+    //	b 5
+    //	c 6
     //
     // this function reads the input stream a line at a time, then produces
     // string-lists one per stanza where a stanza is delimited by blank line, or
@@ Expand Down @@