From fc796714e75d60816f816a3b5686c7ea1b70a1e8 Mon Sep 17 00:00:00 2001
From: Konstantin Romanov <konstantinsromanov@gmail.com>
Date: Thu, 3 Apr 2025 21:35:55 -0400
Subject: [PATCH] Draft version of OCaml indexer.

---
 .../indexer/analysis/AnalyzerGuru.java        |   2 +
 .../indexer/analysis/ocaml/Consts.java        | 101 ++++++++++
 .../indexer/analysis/ocaml/OCamlAnalyzer.java |  76 ++++++++
 .../analysis/ocaml/OCamlAnalyzerFactory.java  |  56 ++++++
 .../indexer/analysis/ocaml/OCamlLexer.java    |  59 ++++++
 .../src/main/jflex/analysis/ocaml/OCaml.lexh  |  95 +++++++++
 .../analysis/ocaml/OCamlSymbolTokenizer.lex   | 101 ++++++++++
 .../main/jflex/analysis/ocaml/OCamlXref.lex   | 181 ++++++++++++++++++
 8 files changed, 671 insertions(+)
 create mode 100644 opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/Consts.java
 create mode 100644 opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzer.java
 create mode 100644 opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzerFactory.java
 create mode 100644 opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlLexer.java
 create mode 100644 opengrok-indexer/src/main/jflex/analysis/ocaml/OCaml.lexh
 create mode 100644 opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlSymbolTokenizer.lex
 create mode 100644 opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlXref.lex

diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java
index a8396f7fe92..49457fd16ef 100644
--- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java
+++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java
@@ -89,6 +89,7 @@
 import org.opengrok.indexer.analysis.kotlin.KotlinAnalyzerFactory;
 import org.opengrok.indexer.analysis.lisp.LispAnalyzerFactory;
 import org.opengrok.indexer.analysis.lua.LuaAnalyzerFactory;
+import org.opengrok.indexer.analysis.ocaml.OCamlAnalyzerFactory;
 import org.opengrok.indexer.analysis.pascal.PascalAnalyzerFactory;
 import org.opengrok.indexer.analysis.perl.PerlAnalyzerFactory;
 import org.opengrok.indexer.analysis.php.PhpAnalyzerFactory;
@@ -298,6 +299,7 @@ public class AnalyzerGuru {
                 new HaskellAnalyzerFactory(),
                 new GolangAnalyzerFactory(),
                 new LuaAnalyzerFactory(),
+                new OCamlAnalyzerFactory(),
                 new PascalAnalyzerFactory(),
                 new AdaAnalyzerFactory(),
                 new RubyAnalyzerFactory(),
diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/Consts.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/Consts.java
new file mode 100644
index 00000000000..09f63e52038
--- /dev/null
+++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/Consts.java
@@ -0,0 +1,101 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * See LICENSE.txt included in this distribution for the specific
+ * language governing permissions and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at LICENSE.txt.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright (c) 2017, 2020, Chris Fraire <cfraire@me.com>.
+ */
+package org.opengrok.indexer.analysis.ocaml;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Holds the set of OCaml keywords, which OCamlSymbolTokenizer skips when reporting symbols.
+ */
+public class Consts {
+
+    static final Set<String> kwd = new HashSet<>(); // filled once by the static initializer below
+
+    /* From parsing/lexer.mll of OCaml 5.3.0. NOTE(review): confirm "parser" is listed there. */
+    static {
+        kwd.add("and");
+        kwd.add("as");
+        kwd.add("assert");
+        kwd.add("begin");
+        kwd.add("class");
+        kwd.add("constraint");
+        kwd.add("do");
+        kwd.add("done");
+        kwd.add("downto");
+        kwd.add("effect");
+        kwd.add("else");
+        kwd.add("end");
+        kwd.add("exception");
+        kwd.add("external");
+        kwd.add("false");
+        kwd.add("for");
+        kwd.add("fun");
+        kwd.add("function");
+        kwd.add("functor");
+        kwd.add("if");
+        kwd.add("in");
+        kwd.add("include");
+        kwd.add("inherit");
+        kwd.add("initializer");
+        kwd.add("lazy");
+        kwd.add("let");
+        kwd.add("match");
+        kwd.add("method");
+        kwd.add("module");
+        kwd.add("mutable");
+        kwd.add("new");
+        kwd.add("nonrec");
+        kwd.add("object");
+        kwd.add("of");
+        kwd.add("open");
+        kwd.add("or");
+        kwd.add("parser");
+        kwd.add("private");
+        kwd.add("rec");
+        kwd.add("sig");
+        kwd.add("struct");
+        kwd.add("then");
+        kwd.add("to");
+        kwd.add("true");
+        kwd.add("try");
+        kwd.add("type");
+        kwd.add("val");
+        kwd.add("virtual");
+        kwd.add("when");
+        kwd.add("while");
+        kwd.add("with");
+        kwd.add("lor");
+        kwd.add("lxor");
+        kwd.add("mod");
+        kwd.add("land");
+        kwd.add("lsl");
+        kwd.add("lsr");
+        kwd.add("asr");
+    }
+
+    /** Private to prevent instantiation; the class only exposes the static keyword set. */
+    private Consts() {
+    }
+}
diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzer.java
new file mode 100644
index 00000000000..fa14987c1a0
--- /dev/null
+++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzer.java
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * See LICENSE.txt included in this distribution for the specific
+ * language governing permissions and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at LICENSE.txt.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright (c) 2017, 2020, Chris Fraire <cfraire@me.com>.
+ */
+package org.opengrok.indexer.analysis.ocaml;
+
+import org.opengrok.indexer.analysis.AbstractAnalyzer;
+import org.opengrok.indexer.analysis.FileAnalyzerFactory;
+import org.opengrok.indexer.analysis.JFlexTokenizer;
+import org.opengrok.indexer.analysis.JFlexXref;
+import org.opengrok.indexer.analysis.plain.AbstractSourceCodeAnalyzer;
+
+import java.io.Reader;
+
+/**
+ * Represents an analyzer for the OCaml language, built on {@link OCamlSymbolTokenizer} and {@link OCamlXref}.
+ */
+@SuppressWarnings("java:S110")
+public class OCamlAnalyzer extends AbstractSourceCodeAnalyzer {
+
+    /**
+     * Creates a new instance of {@link OCamlAnalyzer} whose symbol tokenizer is an {@link OCamlSymbolTokenizer}.
+     * @param factory the factory instance, passed through to the superclass constructor
+     */
+    protected OCamlAnalyzer(FileAnalyzerFactory factory) {
+        super(factory, () -> new JFlexTokenizer(new OCamlSymbolTokenizer(
+                AbstractAnalyzer.DUMMY_READER)));
+    }
+
+    /**
+     * @return the ctags language name, {@code "ocaml"}
+     */
+    @Override
+    public String getCtagsLang() {
+        return "ocaml";
+    }
+
+    /**
+     * Gets a version number to be used to tag processed documents so that
+     * re-analysis can be re-done later if a stored version number is different
+     * from the current implementation.
+     * @return 20250403_00
+     */
+    @Override
+    protected int getSpecializedVersionNo() {
+        return 20250403_00; // Edit comment above too!
+    }
+
+    /**
+     * Creates a {@link JFlexXref} wrapping a new {@link OCamlXref} that reads from {@code reader}.
+     * @return a defined instance
+     */
+    @Override
+    protected JFlexXref newXref(Reader reader) {
+        return new JFlexXref(new OCamlXref(reader));
+    }
+}
diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzerFactory.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzerFactory.java
new file mode 100644
index 00000000000..4934eed483e
--- /dev/null
+++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlAnalyzerFactory.java
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * See LICENSE.txt included in this distribution for the specific
+ * language governing permissions and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at LICENSE.txt.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright (c) 2017, 2020, Chris Fraire <cfraire@me.com>.
+ */
+package org.opengrok.indexer.analysis.ocaml;
+
+import org.opengrok.indexer.analysis.AbstractAnalyzer.Genre;
+import org.opengrok.indexer.analysis.FileAnalyzer;
+import org.opengrok.indexer.analysis.FileAnalyzerFactory;
+
+/**
+ * Represents a factory to create {@link OCamlAnalyzer} instances.
+ */
+public class OCamlAnalyzerFactory extends FileAnalyzerFactory {
+
+    private static final String NAME = "OCaml";
+
+    private static final String[] SUFFIXES = {"ML", "MLI"};
+
+    /**
+     * Initializes a factory instance to associate the file extensions ".ml"
+     * and ".mli" with {@link OCamlAnalyzer}.
+     */
+    public OCamlAnalyzerFactory() {
+        super(null, null, SUFFIXES, null, null, "text/plain", Genre.PLAIN,
+                NAME, true);
+    }
+
+    /**
+     * Creates a new {@link OCamlAnalyzer} instance bound to this factory.
+     * @return a defined instance
+     */
+    @Override
+    protected FileAnalyzer newAnalyzer() {
+        return new OCamlAnalyzer(this);
+    }
+}
diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlLexer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlLexer.java
new file mode 100644
index 00000000000..3b04d06f215
--- /dev/null
+++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/ocaml/OCamlLexer.java
@@ -0,0 +1,59 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * See LICENSE.txt included in this distribution for the specific
+ * language governing permissions and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at LICENSE.txt.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright (c) 2017, 2020, Chris Fraire <cfraire@me.com>.
+ */
+package org.opengrok.indexer.analysis.ocaml;
+
+import org.opengrok.indexer.analysis.JFlexJointLexer;
+import org.opengrok.indexer.analysis.JFlexSymbolMatcher;
+import org.opengrok.indexer.analysis.Resettable;
+
+/**
+ * Represents an abstract base class for OCaml lexers. NOTE(review): no .lex file in this patch extends it.
+ */
+@SuppressWarnings("Duplicates")
+abstract class OCamlLexer extends JFlexSymbolMatcher
+        implements JFlexJointLexer, Resettable {
+
+    /**
+     * Calls {@link #phLOC()} unless the current yystate equals {@link #COMMENT()} or {@link #SCOMMENT()}.
+     */
+    public void chkLOC() {
+        if (yystate() != COMMENT() && yystate() != SCOMMENT()) {
+            phLOC();
+        }
+    }
+
+    /**
+     * Subclasses must override to get the constant value created by JFlex to
+     * represent COMMENT.
+     */
+    @SuppressWarnings("java:S100")
+    abstract int COMMENT();
+
+    /**
+     * Subclasses must override to get the constant value created by JFlex to
+     * represent SCOMMENT.
+     */
+    @SuppressWarnings("java:S100")
+    abstract int SCOMMENT();
+}
diff --git a/opengrok-indexer/src/main/jflex/analysis/ocaml/OCaml.lexh b/opengrok-indexer/src/main/jflex/analysis/ocaml/OCaml.lexh
new file mode 100644
index 00000000000..f276a68db02
--- /dev/null
+++ b/opengrok-indexer/src/main/jflex/analysis/ocaml/OCaml.lexh
@@ -0,0 +1,95 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").  
+ * You may not use this file except in compliance with the License.
+ *
+ * See LICENSE.txt included in this distribution for the specific
+ * language governing permissions and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at LICENSE.txt.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2017, Chris Fraire <cfraire@me.com>.
+ *
+ * Copyright (c) Simon Peyton Jones. 
+ * Copyright (c) Simon Marlow.
+ * The authors and publisher intend this Report to belong to the entire Haskell
+ * community, and grant permission to copy and distribute it for any purpose,
+ * provided that it is reproduced in its entirety, including this Notice.
+ * Modified versions of this Report may also be copied and distributed for any
+ * purpose, provided that the modified version is clearly presented as such,
+ * and that it does not claim to be a definition of the language Haskell 2010.
+ */
+
+Identifier = ({varid} | {conid} | {pvconid})
+/*
+ * varid	→	small {small | large | digit | ' }
+ * ; N.b. keywords are not excluded here; the lexers filter them with Consts.kwd
+ */
+varid = {small} ({small} | {large} | {digit} | [\'])*
+/*
+ * conid	→	large {small | large | digit | ' }
+ */
+conid = {large} ({small} | {large} | {digit} | [\'])*
+/*
+ * polymorphic variant
+ * pvconid	→	`large {small | large | digit | ' }
+ */
+pvconid = [\`] {large} ({small} | {large} | {digit} | [\'])*
+/*
+ * small	→	ascSmall | _
+ * ascSmall	→	a | b | … | z
+ */
+small = [a-z_]
+/*
+ * large	→	ascLarge
+ * ascLarge	→	A | B | … | Z
+ */
+large = [A-Z]
+/*
+ * digit	→	ascDigit
+ * ascDigit	→	0 | 1 | … | 9
+ * ; N.b. only ASCII letters and digits are matched by these macros
+ * octit	→	0 | 1 | … | 7
+ * hexit	→	digit | A | … | F | a | … | f
+ */
+digit = [0-9]
+octit = [0-7]
+hexit = [0-9A-Fa-f]
+
+Number = ({integer} | {float})
+/*
+ * decimal	→	digit{digit | _}
+ * octal	→	octit{octit | _}
+ * hexadecimal	→	hexit{hexit | _}
+ */
+decimal         = {digit}({digit} | _)*
+octal           = {octit}({octit} | _)*
+hexadecimal     = {hexit}({hexit} | _)*
+/*
+ * integer	→	(decimal | 0b binary | 0B binary
+ *		|	0o octal | 0O octal
+ *		|	0x hexadecimal | 0X hexadecimal) [l | L | n]
+ * ; N.b. the suffix is consumed so that e.g. 42L does not yield a symbol "L"
+ */
+integer = ({decimal} | [0][bB][01]([01] | _)* | [0][oO]{octal} | [0][xX]{hexadecimal}) [lLn]?
+/*
+ * float	→	decimal . {digit | _} [exponent]
+ *		|	decimal exponent
+ */
+float = ({decimal} [\.] ({digit} | _)* {exponent}? |
+    {decimal} {exponent})
+/*
+ * exponent	→	(e | E) [+ | -] decimal
+ */
+exponent = [eE] [\+\-]? {decimal}
diff --git a/opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlSymbolTokenizer.lex b/opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlSymbolTokenizer.lex
new file mode 100644
index 00000000000..1df48ef67af
--- /dev/null
+++ b/opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlSymbolTokenizer.lex
@@ -0,0 +1,101 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * See LICENSE.txt included in this distribution for the specific
+ * language governing permissions and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at LICENSE.txt.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015, 2021, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright (c) 2017, Chris Fraire <cfraire@me.com>.
+ */
+
+/*
+ * Get OCaml symbols
+ */
+
+package org.opengrok.indexer.analysis.ocaml;
+
+import java.io.IOException;
+import org.opengrok.indexer.analysis.JFlexSymbolMatcher;
+
+/**
+ * @author Harry Pan
+ */
+%%
+%public
+%class OCamlSymbolTokenizer
+%extends JFlexSymbolMatcher
+%unicode
+%int
+%include ../CommonLexer.lexh
+%char
+%{
+    private int nestedComment; // count of "(*" openers not yet closed by "*)"
+
+    public void reset() { // restores the superclass state and the comment nesting count
+        super.reset();
+        nestedComment = 0;
+    }
+%}
+
+%state STRING CHAR BCOMMENT
+
+%include ../Common.lexh
+%include OCaml.lexh
+%%
+
+<YYINITIAL> {    // N.b. an apostrophe opens CHAR only if a whole char literal follows; 'a is a type variable
+    {Identifier} {
+        String id = yytext();
+        if (!Consts.kwd.contains(id)) {
+            onSymbolMatched(id, yychar);
+            return yystate();
+        }
+    }
+    {Number}    {}
+    \"   { yybegin(STRING);   }
+    \' / ([^\\\'\r\n] | \\ [^\r\n] {hexit}? {hexit}? {hexit}?) \'   { yybegin(CHAR); }
+}
+
+<STRING> {    // an escaped quote or backslash is skipped so that only an unescaped quote ends STRING
+    \\[\"\\]    {}
+    \"   { yybegin(YYINITIAL); }
+}
+
+<CHAR> {    // a prime that is part of an identifier never gets here: {Identifier} in YYINITIAL consumes it
+    \\[\'\\]    {}
+    \'   { yybegin(YYINITIAL); }
+}
+
+<YYINITIAL, BCOMMENT> {    // comments nest: only the outermost "(*" switches to BCOMMENT
+    "(*"    {
+        if (nestedComment++ == 0) {
+            yybegin(BCOMMENT);
+        }
+    }
+}
+
+<BCOMMENT> {    // BCOMMENT is left only when the "*)" matching the outermost "(*" is seen
+    "*)"    {
+        if (--nestedComment == 0) {
+            yybegin(YYINITIAL);
+        }
+    }
+}
+
+// fallback: discard whitespace and any other character not matched above
+{WhspChar}+ |
+[^] {}
diff --git a/opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlXref.lex b/opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlXref.lex
new file mode 100644
index 00000000000..1b274d1963f
--- /dev/null
+++ b/opengrok-indexer/src/main/jflex/analysis/ocaml/OCamlXref.lex
@@ -0,0 +1,181 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * See LICENSE.txt included in this distribution for the specific
+ * language governing permissions and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at LICENSE.txt.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015, 2021, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright (c) 2017, Chris Fraire <cfraire@me.com>.
+ */
+
+/*
+ * Cross reference a OCaml file
+ */
+
+package org.opengrok.indexer.analysis.ocaml;
+
+import java.io.IOException;
+import org.opengrok.indexer.analysis.JFlexSymbolMatcher;
+import org.opengrok.indexer.web.HtmlConsts;
+
+/**
+ * @author Harry Pan
+ */
+%%
+%public
+%class OCamlXref
+%extends JFlexSymbolMatcher
+%unicode
+%int
+%char
+%include ../CommonLexer.lexh
+%include ../CommonXref.lexh
+%{
+    private int nestedComment; // count of "(*" openers not yet closed by "*)"
+
+    @Override
+    public void reset() { // restores the superclass state and the comment nesting count
+        super.reset();
+        nestedComment = 0;
+    }
+
+    @Override
+    public void yypop() throws IOException {
+        onDisjointSpanChanged(null, yychar); // end the current disjoint span before popping the state
+        super.yypop();
+    }
+
+    protected void chkLOC() { // calls phLOC() in every state except BCOMMENT
+        switch (yystate()) {
+            case BCOMMENT:
+                break;
+            default:
+                phLOC();
+                break;
+        }
+    }
+%}
+
+%state STRING CHAR BCOMMENT
+
+%include ../Common.lexh
+%include ../CommonURI.lexh
+%include ../CommonPath.lexh
+%include OCaml.lexh
+%%
+<YYINITIAL> {    // N.b. an apostrophe opens CHAR only if a whole char literal follows; 'a is a type variable
+    {Identifier} {
+        chkLOC();
+        String id = yytext();
+        onFilteredSymbolMatched(id, yychar, Consts.kwd);
+    }
+    {Number}     {
+        chkLOC();
+        onDisjointSpanChanged(HtmlConsts.NUMBER_CLASS, yychar);
+        onNonSymbolMatched(yytext(), yychar);
+        onDisjointSpanChanged(null, yychar);
+    }
+    \"           {
+        chkLOC();
+        yypush(STRING);
+        onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
+        onNonSymbolMatched(yytext(), yychar);
+    }
+    \' / ([^\\\'\r\n] | \\ [^\r\n] {hexit}? {hexit}? {hexit}?) \'    {
+        chkLOC();
+        yypush(CHAR);
+        onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar);
+        onNonSymbolMatched(yytext(), yychar);
+    }
+}
+
+<STRING> {
+    \\[\"\\]    { chkLOC(); onNonSymbolMatched(yytext(), yychar); }
+    \"          {
+        chkLOC();
+        onNonSymbolMatched(yytext(), yychar);
+        yypop();
+    }
+    /*
+     * "A string may include a 'gap'—two backslants enclosing white
+     * characters—which is ignored. This allows one to write long strings on
+     * more than one line by writing a backslant at the end of one line and at
+     * the start of the next." N.b. OpenGrok does not explicitly recognize the
+     * "gap" but since a STRING must end in a non-escaped quotation mark, just
+     * allow STRINGs to be multi-line regardless of syntax.
+     */
+}
+
+<CHAR> {    // a prime that is part of an identifier never gets here: {Identifier} in YYINITIAL consumes it
+    \\[\'\\]    { chkLOC(); onNonSymbolMatched(yytext(), yychar); }
+    \'          {
+        chkLOC();
+        onNonSymbolMatched(yytext(), yychar);
+        yypop();
+    }
+    /*
+     * N.b. though only a single char is valid OCaml syntax, OpenGrok just
+     * waits to end CHAR at a non-escaped apostrophe regardless of count.
+     */
+}
+
+<YYINITIAL, BCOMMENT> {    // comments nest: only the outermost "(*" pushes BCOMMENT and opens the span
+    "(*"    {
+        if (nestedComment++ == 0) {
+            yypush(BCOMMENT);
+            onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar);
+        }
+        onNonSymbolMatched(yytext(), yychar);
+    }
+}
+
+<BCOMMENT> {    // the "*)" text is written first; the state is popped only at nesting depth zero
+    "*)"    {
+        onNonSymbolMatched(yytext(), yychar);
+        if (--nestedComment == 0) {
+            yypop();
+        }
+    }
+}
+
+{WhspChar}*{EOL}    { onEndOfLineMatched(yytext(), yychar); }
+[[\s]--[\n]]        { onNonSymbolMatched(yytext(), yychar); }
+[^\n]               { chkLOC(); onNonSymbolMatched(yytext(), yychar); }
+
+<STRING, BCOMMENT> {
+    {FPath}    {
+        chkLOC();
+        onPathlikeMatched(yytext(), '/', false, yychar);
+    }
+    {FNameChar}+ "@" {FNameChar}+ "." {FNameChar}+    {
+        chkLOC();
+        onEmailAddressMatched(yytext(), yychar);
+    }
+}
+
+<STRING> {
+    {BrowseableURI}    {
+        chkLOC();
+        onUriMatched(yytext(), yychar);
+    }
+}
+
+<BCOMMENT> {
+    {BrowseableURI} \}?    {    // NOTE(review): the optional "}" is unrelated to the "*)" terminator; confirm intent
+        onUriMatched(yytext(), yychar);
+    }
+}