// envparse.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

// Package envparse is a minimal environment variable parser. It handles empty
// lines, comments, single quotes, double quotes, and JSON escape sequences.
//
// Lines that are neither empty nor comments should be of the form:
//
//	KEY=value
//
// While extraneous characters are discouraged, an "export" prefix, preceding
// whitespace, and trailing whitespace are all removed:
//
//	KEY = This is ok! # Parses to {"KEY": "This is ok!"}
//	KEY2= Also ok.    # Parses to {"KEY2": "Also ok."}
//	export FOO=bar    # Parses to {"FOO": "bar"}
package envparse

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"unicode/utf16"
	"unicode/utf8"
)

// Sentinel errors describing why a single line could not be parsed. Parse
// reports them as the Err field of a *ParseError.
//
//   - ErrMissingSeparator: a non-blank, non-comment line contains no '='.
//   - ErrEmptyKey: nothing but whitespace precedes the '='.
//   - ErrUnmatchedDouble: the line ends inside a double quoted section.
//   - ErrUnmatchedSingle: the line ends inside a single quoted section.
//   - ErrIncompleteEscape: the line ends right after a backslash inside
//     double quotes.
//   - ErrIncompleteHex: fewer than four characters follow a \u escape.
//   - ErrIncompleteSur: a \u escape holding a UTF-16 surrogate is not
//     followed by a second \u escape.
//   - ErrMultibyteEscape: a backslash inside double quotes is followed by a
//     non-ASCII byte.
var (
	ErrMissingSeparator = fmt.Errorf("missing =")
	ErrEmptyKey         = fmt.Errorf("empty key")
	ErrUnmatchedDouble  = fmt.Errorf(`unmatched "`)
	ErrUnmatchedSingle  = fmt.Errorf("unmatched '")
	ErrIncompleteEscape = fmt.Errorf("incomplete escape sequence")
	ErrIncompleteHex    = fmt.Errorf("incomplete hex sequence")
	ErrIncompleteSur    = fmt.Errorf("incomplete Unicode surrogate pair")
	ErrMultibyteEscape  = fmt.Errorf("multibyte characters disallowed in escape sequences")
)

// ParseError is returned whenever the Parse function encounters an error. It
// includes the line number and underlying error.
//
// As populated by Parse, Line is the 1-based number of the most recently
// scanned line, or 0 when reading failed before any line was scanned. Err is
// the underlying cause; it is exposed through Unwrap so callers can match it
// with errors.Is and errors.As.
type ParseError struct {
	Line int
	Err  error
}

// Error implements the error interface. The message names the line when Line
// is positive and falls back to a generic read error otherwise.
func (e *ParseError) Error() string {
	if e.Line > 0 {
		return fmt.Sprintf("error on line %d: %v", e.Line, e.Err)
	}
	return fmt.Sprintf("error reading: %v", e.Err)
}

// Unwrap returns the underlying error so that errors.Is and errors.As can see
// through a ParseError to the wrapped cause (for example one of the exported
// Err* sentinel values).
func (e *ParseError) Unwrap() error {
	return e.Err
}

// parseError wraps err in a *ParseError that records the given line number
// and returns it as a plain error value.
func parseError(line int, err error) error {
	wrapped := new(ParseError)
	wrapped.Line, wrapped.Err = line, err
	return wrapped
}

// Parse reads r line by line and returns the environment variables it
// defines as a map from key to value.
//
// Lines are split with a bufio.Scanner using its default settings. Blank
// lines and comment lines contribute nothing to the result. The first line
// that fails to parse, or any error reported by the scanner, aborts parsing:
// the returned map is nil and the error is a *ParseError carrying the number
// of the last line scanned.
func Parse(r io.Reader) (map[string]string, error) {
	var (
		vars   = map[string]string{}
		sc     = bufio.NewScanner(r)
		lineNo int
	)

	for sc.Scan() {
		lineNo++

		key, val, err := parseLine(sc.Bytes())
		switch {
		case err != nil:
			return nil, parseError(lineNo, err)
		case len(key) == 0:
			// Blank or comment line: nothing to record.
			continue
		}
		vars[string(key)] = string(val)
	}

	// Scan returning false may mean EOF or a read failure; only the latter
	// leaves a non-nil error behind.
	if err := sc.Err(); err != nil {
		return nil, parseError(lineNo, err)
	}
	return vars, nil
}

// Parser states used while scanning a value in parseLine.
const (
	// normalMode: outside of any quotes.
	normalMode = iota
	// doubleQuote: inside a "..." section.
	doubleQuote
	// singleQuote: inside a '...' section.
	singleQuote
	// escapeMode: inside double quotes, immediately after a backslash.
	escapeMode
	// unicodeMode: declared for completeness; parseLine never enters it.
	unicodeMode
)

var (
	// empty is the zero-length, non-nil slice returned for skipped lines.
	empty = make([]byte, 0)
	// separator splits a line into its key and value halves.
	separator = []byte("=")
	// exportPrefix is the optional shell-style prefix allowed before a key.
	exportPrefix = []byte("export ")
)

// parseLine parses the given line into a key and value or error.
//
// Leading and trailing whitespace is ignored. Empty lines and lines whose
// first character is '#' are returned as zero length slices with a nil error.
// Every other line must have the form KEY=value, optionally preceded by
// "export ".
//
// A value may mix unquoted, single quoted, and double quoted sections. Inside
// double quotes JSON style backslash escapes are decoded, including \uXXXX
// and UTF-16 surrogate pairs; single quoted text is taken literally. An
// unquoted '#' starts a trailing comment: it, everything after it, and any
// unquoted whitespace directly before it are dropped. Bytes below 0x20 are
// rejected anywhere in the value.
//
// On error both returned slices are nil.
func parseLine(ln []byte) ([]byte, []byte, error) {
	ln = bytes.TrimSpace(ln)
	if len(ln) == 0 || ln[0] == '#' {
		return empty, empty, nil
	}

	parts := bytes.SplitN(ln, separator, 2)
	if len(parts) != 2 {
		return nil, nil, ErrMissingSeparator
	}

	// Trim whitespace
	key, value := bytes.TrimSpace(parts[0]), bytes.TrimSpace(parts[1])

	// Strip an optional leading "export ", but only when the key is longer
	// than the prefix itself so there is another key name left over.
	if len(key) > len(exportPrefix) {
		key = bytes.TrimPrefix(key, exportPrefix)
	}
	if err := validateKey(key); err != nil {
		return nil, nil, err
	}

	// Evaluate the value
	if len(value) == 0 {
		// Empty values are ok! Shortcircuit
		return key, value, nil
	}

	// Scratch buffer for the unescaped value. Every escape sequence decodes
	// to no more bytes than it occupies in the input, so len(value) suffices.
	newv := make([]byte, len(value))
	newi := 0
	// Length of the output up to and including the last significant byte;
	// used to trim unquoted whitespace preceding a trailing comment.
	lastSig := 0

	// Parser State
	mode := normalMode

	for i := 0; i < len(value); i++ {
		v := value[i]

		// Control characters are always an error
		if v < 32 {
			return nil, nil, fmt.Errorf("0x%0.2x is an invalid value character", v)
		}

		// High bit set means it is part of a multibyte character, pass
		// it through as only ASCII characters have special meaning.
		if v > 127 {
			if mode == escapeMode {
				return nil, nil, ErrMultibyteEscape
			}
			newv[newi] = v
			newi++

			// All multibyte characters are significant. This must be
			// recorded after the increment, otherwise a trailing comment
			// would cut off the final byte of the character.
			lastSig = newi
			continue
		}

		switch mode {
		case normalMode:
			switch v {
			case '"':
				mode = doubleQuote
			case '\'':
				mode = singleQuote
			case '#':
				// Start of a comment, nothing left to parse
				return key, newv[:lastSig], nil
			case ' ', '\t':
				// Keep the whitespace but do not mark it significant, so it
				// is dropped if only a comment follows. (A tab never gets
				// here: it is rejected above as a control character.)
				newv[newi] = v
				newi++
			default:
				// Add the character to the new value
				newv[newi] = v
				newi++

				// Track last non-WS char for trimming on trailing comments
				lastSig = newi
			}
		case doubleQuote:
			switch v {
			case '"':
				mode = normalMode
			case '\\':
				mode = escapeMode
			default:
				// Add the character to the new value
				newv[newi] = v
				newi++

				// All quoted characters are significant
				lastSig = newi
			}
		case escapeMode:
			// We're in double quotes and the last character was a backslash
			switch v {
			case '"', '\\', '/':
				newv[newi] = v
			case 'b':
				newv[newi] = '\b'
			case 'f':
				newv[newi] = '\f'
			case 'r':
				newv[newi] = '\r'
			case 'n':
				newv[newi] = '\n'
			case 't':
				newv[newi] = '\t'
			case 'u':
				// Parse-ahead to capture unicode
				r, err := h2r(value[i+1:])
				if err != nil {
					return nil, nil, err
				}

				// Bump index by width of hex chars
				i += 4

				// A surrogate must be completed by a second \uXXXX escape
				if utf16.IsSurrogate(r) {
					if len(value) < i+6 {
						//TODO Use replacement character instead?
						return nil, nil, ErrIncompleteSur
					}
					if value[i+1] != '\\' || value[i+2] != 'u' {
						//TODO Use replacement character instead?
						return nil, nil, ErrIncompleteSur
					}

					r2, err := h2r(value[i+3:])
					if err != nil {
						return nil, nil, err
					}

					// Bump index by width of \uXXXX
					i += 6

					r = utf16.DecodeRune(r, r2)
				}
				n := utf8.EncodeRune(newv[newi:], r)
				newi += n - 1 // because it's incremented outside the switch
			default:
				return nil, nil, fmt.Errorf("invalid escape sequence: %q", string(v))
			}
			// Account for the (last) byte written above
			newi++

			// All escaped characters are significant
			lastSig = newi

			// Switch back to quote mode
			mode = doubleQuote
		case singleQuote:
			switch v {
			case '\'':
				mode = normalMode
			default:
				// Add all other characters to the new value
				newv[newi] = v
				newi++

				// All single quoted characters are significant
				lastSig = newi
			}
		default:
			panic(fmt.Errorf("BUG: invalid mode: %v", mode))
		}
	}

	switch mode {
	case normalMode:
		// All escape sequences are complete and all quotes are matched
		return key, newv[:newi], nil
	case doubleQuote:
		return nil, nil, ErrUnmatchedDouble
	case singleQuote:
		return nil, nil, ErrUnmatchedSingle
	case escapeMode:
		return nil, nil, ErrIncompleteEscape
	default:
		panic(fmt.Errorf("BUG: invalid mode: %v", mode))
	}
}

// validateKey reports whether key is non-empty, starts with [A-Za-z_], and
// continues with only [A-Za-z0-9/_.] characters.
func validateKey(key []byte) error {
	if len(key) == 0 {
		return ErrEmptyKey
	}

	switch first := key[0]; {
	case first == '_':
	case first >= 'A' && first <= 'Z':
	case first >= 'a' && first <= 'z':
	default:
		return fmt.Errorf("key must start with [A-Za-z_] but found %q", first)
	}

	for _, c := range key[1:] {
		switch {
		case c == '_', c == '.', c == '/':
		case c >= 'A' && c <= 'Z':
		case c >= 'a' && c <= 'z':
		case c >= '0' && c <= '9':
		default:
			return fmt.Errorf("key characters must be [A-Za-z0-9/_.] but found %q", c)
		}
	}
	return nil
}

// h2r decodes the first four bytes of buf as hexadecimal digits (either
// case) and returns the resulting value as a rune. Bytes beyond the fourth
// are ignored. It returns ErrIncompleteHex when buf holds fewer than four
// bytes and a descriptive error when one of them is not a hex digit.
func h2r(buf []byte) (rune, error) {
	const digits = 4
	if len(buf) < digits {
		return 0, ErrIncompleteHex
	}

	var out rune
	for _, c := range buf[:digits] {
		var nibble byte
		switch {
		case c >= '0' && c <= '9':
			nibble = c - '0'
		case c >= 'a' && c <= 'f':
			nibble = c - 'a' + 10
		case c >= 'A' && c <= 'F':
			nibble = c - 'A' + 10
		default:
			return 0, fmt.Errorf("invalid hex character: %q", string(c))
		}
		// Shift the accumulated value one hex digit left and append.
		out = out<<4 | rune(nibble)
	}
	return out, nil
}