simplify.jai

#scope_module

compute_simple :: (using re: *Regexp_Node) -> bool {
	if #complete op == {
		case .NoMatch;	#through;
		case .EmptyMatch;	#through;
		case .Literal;	#through;
		case .LiteralString;	#through;
		case .BeginLine;	#through;
		case .EndLine;	#through;
		case .BeginText;	#through;
		case .WordBoundary;	#through;
		case .NoWordBoundary;	#through;
		case .EndText;	#through;
		case .AnyChar;	#through;
		case .AnyByte;	#through;
		case .HaveMatch;
			return true;
		case .Concat;		#through;
		case .Alternate;
			// These are simple as long as the subpieces are simple.
			for subs {
				if !it.simple	return false;
			}
			return true;
		case .CharClass;
			// Simple as long as the char class is not empty, not full.
			return char_class.nrunes != 0 && char_class.nrunes != Runemax + 1;
		case .Capture;
			return subs[0].simple;
		case .Star;	#through;
		case .Plus;	#through;
		case .Quest;
			if !subs[0].simple		return false;
			if subs[0].op == {
				case .Star;	#through;
				case .Plus;	#through;
				case .Quest;	#through;
				case .EmptyMatch;	#through;
				case .NoMatch;
					return false;
				case;
					return true;
			}
		case .Repeat;
			return false;
		case .kVerticalBar; #through;
		case .kLeftParen;
			assert(false, "Found parser tokens in compute_simple: %", op);
			return false;
	}
}

// Simplifies a regular expression, returning a new regexp.
// The new regexp uses traditional Unix egrep features only,
// plus the Perl (?:) non-capturing parentheses.
// Otherwise, no POSIX or Perl additions.  The new regexp
// captures exactly the same subexpressions (with the same indices)
// as the original.
// Does not edit given Regexp_Node.
simplify :: (re: *Regexp_Node, pool: *Regexp_Pool) -> *Regexp_Node {
	cre := coalesce(re, pool);
	if !cre		return null;

	sre := simplify_coalesced(cre, pool);
	if !sre		return null;

	return sre;
}

// Coalesces runs of star/plus/quest/repeat of the same literal along with any
// occurrences of that literal into repeats of that literal. It also works for
// char classes, any char and any byte.
coalesce :: (using re: *Regexp_Node, pool: *Regexp_Pool) -> *Regexp_Node {
	coalesce_post :: (pool: *Regexp_Pool, re: *Regexp_Node, parent_arg: *Regexp_Node, pre_arg: *Regexp_Node, child_args: [..] *Regexp_Node) -> *Regexp_Node {
		if re.subs.count == 0	return re;
		if re.op != .Concat {
			if !child_args_changed(re, child_args) return re;

			// Something changed. Build a new op.
			nre := new_regexp(pool, re.op, re.parse_flags);
			// @ToDo: Should we copy here?
			nre.subs = child_args;
			// Repeats and Captures have additional data that must be copied.
			if (re.op == .Repeat) {
				nre.min = re.min;
				nre.max = re.max;
			} else if re.op == .Capture {
				nre.cap = re.cap;
			}
			return nre;
		}

		can_coalesce_any := false;
		for i: 0..child_args.count - 2 {
			if can_coalesce(child_args[i], child_args[i+1]) {
				can_coalesce_any = true;
				break;
			}
		}

		if !can_coalesce_any {
			if !child_args_changed(re, child_args)	return re;

			// Something changed. Build a new op.
			nre := new_regexp(pool, re.op, re.parse_flags);
			// @ToDo: Should we copy here?
			nre.subs = child_args;
			return nre;
		}

		// @Speed: Do this in one pass with the loop above?
		for i: 0..child_args.count - 2 {
			if can_coalesce(child_args[i], child_args[i+1]) {
				do_coalesce(pool, *child_args[i], *child_args[i+1]);
			}
		}

		// Determine how many empty matches were left by DoCoalesce.
		n := 0;
		for child_args {
			if (it.op == .EmptyMatch) {
				n += 1;
			}
		}

		// Build a new op.
		nre := new_regexp(pool, re.op, re.parse_flags);
		nre.subs = NewArray(re.subs.count - n, *Regexp_Node, false);
		j := 0;
		for child_args {
			if (it.op != .EmptyMatch) {
				nre.subs[j] = it;
				j += 1;
			}
		}
		return nre;
	}

	w := Walk(*Regexp_Pool, *Regexp_Node).{post_visit = coalesce_post};
	result, stopped := walk(pool, re, null, w);
	if stopped		return null;
	return result;
}

child_args_changed :: (re: *Regexp_Node, child_args: [] *Regexp_Node) -> bool {
	for sub, i: re.subs {
		if sub != child_args[i]		return true;
	}
	return false;
}

can_coalesce :: (r1: *Regexp_Node, r2: *Regexp_Node) -> bool {
	// r1 must be a star/plus/quest/repeat of a literal, char class, any char or
	// any byte.
	if ((r1.op == .Star ||
			r1.op == .Plus ||
			r1.op == .Quest ||
			r1.op == .Repeat) &&
			(r1.subs[0].op == .Literal ||
			r1.subs[0].op == .CharClass ||
			r1.subs[0].op == .AnyChar ||
			r1.subs[0].op == .AnyByte)) {
		// r2 must be a star/plus/quest/repeat of the same literal, char class,
		// any char or any byte.
		if ((r2.op == .Star ||
				r2.op == .Plus ||
				r2.op == .Quest ||
				r2.op == .Repeat) &&
				regexp_equal(r1.subs[0], r2.subs[0]) &&
				// The parse flags must be consistent.
				((r1.parse_flags & .NonGreedy) == (r2.parse_flags & .NonGreedy))) {
			return true;
		}
		// ... OR an occurrence of that literal, char class, any char or any byte
		if (regexp_equal(r1.subs[0], r2)) {
			return true;
		}
		// ... OR a literal string that begins with that literal.
		if (r1.subs[0].op == .Literal && r2.op == .LiteralString && r2.runes[0] == r1.subs[0].rune &&
				// The parse flags must be consistent.
				((r1.subs[0].parse_flags & .FoldCase) == (r2.parse_flags & .FoldCase))) {
			return true;
		}
	}
	return false;
}

do_coalesce :: (pool: *Regexp_Pool, r1ptr: **Regexp_Node, r2ptr: **Regexp_Node) {
	leave_empty :: (r1ptr: **Regexp_Node, r2ptr: **Regexp_Node, pool: *Regexp_Pool, nre: *Regexp_Node) {
		// @ToDo, @Speed: Why don’t we set this to null instead of allocing a temporary new regexp?
		<<r1ptr = new_regexp(pool, .EmptyMatch, .NoParseFlags);
		<<r2ptr = nre;
	}

	r1 := <<r1ptr;
	r2 := <<r2ptr;

	nre := new_repeat(pool, r1.subs[0], r1.parse_flags, 0, 0);

	if r1.op == {
		case .Star;
			nre.min = 0;
			nre.max = -1;
		case .Plus;
			nre.min = 1;
			nre.max = -1;
		case .Quest;
			nre.min = 0;
			nre.max = 1;
		case .Repeat;
			nre.min = r1.min;
			nre.max = r1.max;
		case;
			assert(false, "Unexpected r1.op: %", r1.op);
	}


	if r2.op == {
		case .Star;
			nre.max = -1;
			leave_empty(r1ptr, r2ptr, pool, nre);

		case .Plus;
			nre.min += 1;
			nre.max = -1;
			leave_empty(r1ptr, r2ptr, pool, nre);

		case .Quest;
			if nre.max != -1 {
				nre.max += 1;
			}
			leave_empty(r1ptr, r2ptr, pool, nre);

		case .Repeat;
			nre.min += r2.min;
			if (r2.max == -1) {
				nre.max = -1;
			} else if (nre.max != -1) {
				nre.max += r2.max;
			}
			leave_empty(r1ptr, r2ptr, pool, nre);

		case .Literal; #through;
		case .CharClass; #through;
		case .AnyChar; #through;
		case .AnyByte;
			nre.min += 1;
			if (nre.max != -1) {
				nre.max += 1;
			}
			leave_empty(r1ptr, r2ptr, pool, nre);

		case .LiteralString;
			r := r1.subs[0].rune;
			// Determine how much of the literal string is removed.
			// We know that we have at least one rune. :)
			n: s16 = 1;
			while (n < r2.runes.count && r2.runes[n] == r) {
				n += 1;
			}
			nre.min += n;
			if (nre.max != -1) {
				nre.max += n;
			}
			if n == r2.runes.count {
				leave_empty(r1ptr, r2ptr, pool, nre);
			} else {
				<<r1ptr = nre;
				<<r2ptr = new_literal_string(pool, slice(r2.runes, n, r2.runes.count - n), r2.parse_flags);
			}

		case;
			assert(false, "Unexpected r2.op: %", r2.op);
	}
}

simplify_coalesced :: (using re: *Regexp_Node, pool: *Regexp_Pool) -> *Regexp_Node {
	simplify_pre :: (pool: *Regexp_Pool, re: *Regexp_Node, parent_arg: *Regexp_Node) -> *Regexp_Node, stop: bool {
		if (re.simple) {
			return re, true;
		}

		return null, false;
	}
	simplify_post :: (pool: *Regexp_Pool, re: *Regexp_Node, parent_arg: *Regexp_Node, pre_arg: *Regexp_Node, child_args: [..] *Regexp_Node) -> *Regexp_Node {
		if re.op == {
			case .NoMatch; #through;
			case .EmptyMatch; #through;
			case .Literal; #through;
			case .LiteralString; #through;
			case .BeginLine; #through;
			case .EndLine; #through;
			case .BeginText; #through;
			case .WordBoundary; #through;
			case .NoWordBoundary; #through;
			case .EndText; #through;
			case .AnyChar; #through;
			case .AnyByte; #through;
			case .HaveMatch;
				// All these are always simple.
				re.simple = true;
				return re;

			case .Concat; #through;
			case .Alternate;
				// These are simple as long as the subpieces are simple.
				if !child_args_changed(re, child_args) {
					re.simple = true;
					return re;
				}
				nre := new_regexp(pool, re.op, re.parse_flags);
				// @ToDo: Should we copy here?
				nre.subs = child_args;
				nre.simple = true;
				return nre;


			case .Capture;
				newsub := child_args[0];
				if newsub == re.subs[0] {
					re.simple = true;
					return re;
				}
				nre := new_regexp(pool, .Capture, re.parse_flags);
				nre.subs = NewArray(1, *Regexp_Node, false);
				nre.subs[0] = newsub;
				nre.cap = re.cap;
				nre.simple = true;
				return nre;

			case .Star; #through;
			case .Plus; #through;
			case .Quest;
				newsub := child_args[0];
				// Special case: repeat the empty string as much as
				// you want, but it's still the empty string.
				if newsub.op == .EmptyMatch		return newsub;

				// These are simple as long as the subpiece is simple.
				if newsub == re.subs[0] {
					re.simple = true;
					return re;
				}

				// These are also idempotent if flags are constant.
				if re.op == newsub.op && re.parse_flags == newsub.parse_flags	return newsub;

				nre := new_regexp(pool, re.op, re.parse_flags);
				nre.subs = NewArray(1, *Regexp_Node, false);
				nre.subs[0] = newsub;
				nre.simple = true;
				return nre;

			case .Repeat;
				newsub := child_args[0];
				// Special case: repeat the empty string as much as
				// you want, but it's still the empty string.
				if newsub.op == .EmptyMatch		return newsub;

				nre := simplify_repeat(pool, newsub, re.min, re.max, re.parse_flags);
				nre.simple = true;
				return nre;

			case .CharClass;
				nre := simplify_char_class(pool, re);
				nre.simple = true;
				return nre;
			case;
				assert(false, "Simpilfy case not handled: %", re.op);
				return null;
		}
	}

	w := Walk(*Regexp_Pool, *Regexp_Node).{pre_visit = simplify_pre, post_visit = simplify_post};
	result, stopped := walk(pool, re, null, w);
	if stopped		return null;
	return result;
}

// Creates a concatenation of two Regexp_Node, consuming refs to re1 and re2.
// Returns a new Regexp_Node, handing the ref to the caller.
concat2 :: (pool: *Regexp_Pool, re1: *Regexp_Node, re2: *Regexp_Node, parse_flags: ParseFlags) -> *Regexp_Node {
	re := new_regexp(pool, .Concat, parse_flags);
	re.subs = NewArray(2, *Regexp_Node, false);
	re.subs[0] = re1;
	re.subs[1] = re2;
	return re;
}

// Simplifies the expression re{min,max} in terms of *, +, and ?.
// Returns a new regexp.  Does not edit re.  Does not consume reference to re.
// Caller must Decref return value when done with it.
// The result will *not* necessarily have the right capturing parens
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
// but in the Regexp_Node* representation, both (x) are marked as $1.
simplify_repeat :: (pool: *Regexp_Pool, re: *Regexp_Node, min: int, max: int, f: ParseFlags) -> *Regexp_Node {
	// x{n,} means at least n matches of x.
	if (max == -1) {
		// Special case: x{0,} is x*
		if (min == 0) return new_star(pool, re, f);

		// Special case: x{1,} is x+
		if (min == 1) return new_plus(pool, re, f);

		// General case: x{4,} is xxxx+
		nre_subs := NewArray(min, *Regexp_Node, false);
		for i: 0..min-2 {
			nre_subs[i] = re;
		}
		nre_subs[min-1] = new_plus(pool, re, f);
		return new_concat(pool, nre_subs, f);
	}

	// Special case: (x){0} matches only empty string.
	if (min == 0 && max == 0) return new_regexp(pool, .EmptyMatch, f);

	// Special case: x{1} is just x.
	if (min == 1 && max == 1) return re;

	// General case: x{n,m} means n copies of x and m copies of x?.
	// The machine will do less work if we nest the final m copies,
	// so that x{2,5} = xx(x(x(x)?)?)?

	// Build leading prefix: xx.  Capturing only on the last one.
	nre: *Regexp_Node;
	if (min > 0) {
		nre_subs := NewArray(min, *Regexp_Node, false);
		for i: 0..min-1 {
			nre_subs[i] = re;
		}
		nre = new_concat(pool, nre_subs, f);
	}

	// Build and attach suffix: (x(x(x)?)?)?
	if (max > min) {
		suf := new_quest(pool, re, f);
		for i: min+1..max-1 {
			suf = new_quest(pool, concat2(pool, re, suf, f), f);
		}
		if (nre == null) {
			nre = suf;
		} else {
			nre = concat2(pool, nre, suf, f);
		}
	}

	// Some degenerate case, like min > max, or min < max < 0.
	// This shouldn't happen, because the parser rejects such regexps.
	assert(nre != null, "Malformed repeat: % %", min, max);

	return nre;
}

// Simplifies a character class.
simplify_char_class :: (pool: *Regexp_Pool, re: *Regexp_Node) -> *Regexp_Node {
	// Special cases
	if (re.char_class.nrunes == 0)				return new_regexp(pool, .NoMatch, re.parse_flags);
	if (re.char_class.nrunes == Runemax + 1)	return new_regexp(pool, .AnyChar, re.parse_flags);

	return re;
}

#scope_file

// @ToDo: Move to a common utility module
slice :: inline (array: [] $T, index: int, count: int) -> [] T {
    assert(index >= 0, "index = %", index);             // @@ Should we also clamp in these cases?
    assert(count >= 0, "count = %", count);
	c: [] T = ---;
    if index >= array.count {
		c.data = null;
		c.count = 0;
		return c;
	}

    if index + count > array.count {
        count = array.count - index;
    }

    c.data = array.data + index;
    c.count = count;
    return c;
}