Skip to content

Commit

Permalink
Add GHC Cmm lexer (#1387)
Browse files Browse the repository at this point in the history
This commit adds a lexer for GHC Cmm.
  • Loading branch information
supersven authored Apr 14, 2020
1 parent 3fafcfa commit 4b001ac
Show file tree
Hide file tree
Showing 4 changed files with 2,117 additions and 0 deletions.
23 changes: 23 additions & 0 deletions lib/rouge/demos/ghc-cmm
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Demo input for the lexer: a GHC Cmm dump of one procedure, lvl_s4t3_entry.
[lvl_s4t3_entry() // [R1]
{ info_tbls: [(c4uB,
label: lvl_s4t3_info
rep: HeapRep 1 ptrs { Thunk }
srt: Nothing)]
stack_info: arg_space: 8 updfr_space: Just 8
}
{offset
c4uB: // global
// Branches to c4uC when Sp - 32 is below SpLim, otherwise to c4uD.
if ((Sp + -32) < SpLim) (likely: False) goto c4uC; else goto c4uD;
c4uC: // global
// This path hands R1 to stg_gc_enter_1.
R1 = R1;
call (stg_gc_enter_1)(R1) args: 8, res: 0, upd: 8;
c4uD: // global
// Writes four words below Sp, loads R2 from R1 + 16, lowers Sp by 32 and
// then calls GHC.Num.fromInteger_info with R2.
I64[Sp - 16] = stg_upd_frame_info;
P64[Sp - 8] = R1;
R2 = P64[R1 + 16];
I64[Sp - 32] = stg_ap_p_info;
P64[Sp - 24] = Main.fib3_closure+1;
Sp = Sp - 32;
call GHC.Num.fromInteger_info(R2) args: 40, res: 0, upd: 24;
}
}
340 changes: 340 additions & 0 deletions lib/rouge/lexers/ghc_cmm.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,340 @@
# -*- coding: utf-8 -*- #
# frozen_string_literal: true

# C minus minus (Cmm) is a pun on the name C++. It's an intermediate language
# of the Glasgow Haskell Compiler (GHC) that is very similar to C, but with
# many features missing and some special constructs.
#
# Cmm is a dialect of C--. The goal of this lexer is to use what GHC produces
# and parses (Cmm); C-- itself is not supported.
#
# https://gitlab.haskell.org/ghc/ghc/wikis/commentary/compiler/cmm-syntax
#
module Rouge
module Lexers
class GHCCmm < RegexLexer
# Human-readable name and one-line description of the lexer.
title 'GHC Cmm (C--)'
desc 'GHC Cmm is the intermediate representation of the GHC Haskell compiler'

# Primary tag, the file globs this lexer is offered for, and a shorter alias.
tag 'ghc-cmm'
filenames(
  '*.cmm',
  '*.dump-cmm',
  '*.dump-cmm-*'
)
aliases 'cmm'

# One unit of "whitespace": a single blank character, a `//` line comment
# together with its terminating newline, or a `/* ... */` block comment.
#
# The block-comment branch consumes either a non-star character or a run of
# stars followed by something that is neither `*` nor `/`, so it always stops
# at the FIRST `*/` -- including terminators written as `**/`.
ws = %r(\s|//.*?\n|/[*](?:[^*]|[*]+[^*/])*[*]+/)mx

# Make sure that this is not a preprocessor macro, e.g. `#if` or `#define`.
id = %r((?!#[a-zA-Z])[\w#\$%_']+)

# An identifier that may embed `()`, `(,)` or `[]` but has to end in a plain
# identifier character.
#
# Every alternative of the repeated group starts with a different character
# and the tail is a single character, so a failing look-ahead built on this
# pattern backtracks linearly instead of retrying the same text through
# overlapping alternatives.
complex_id = %r(
  (?:[\w\#\$%']|\(\)|\(,\)|\[\])*
  [\w\#\$%']
)mx

# Start state of the lexer. It handles whitespace and the dump-file
# decorations itself, pulls in the shared rule groups below, and ends with
# two fall-back rules so that every remaining character gets a Text token.
state :root do
# runs of whitespace, newlines included
rule %r/\s+/m, Text

# sections markers
rule %r/^=====.*=====$/, Generic::Heading

# timestamps
rule %r/^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+ UTC$/, Comment::Single

# `section` headers and preprocessor directives come before the general
# rule groups
mixin :detect_section
mixin :preprocessor_macros

# shared rule groups; each one is defined as its own state in this class
mixin :info_tbls
mixin :comments
mixin :literals
mixin :keywords
mixin :types
mixin :infos
mixin :names
mixin :operators

# escaped newline
rule %r/\\\n/, Text

# rest is Text
rule %r/./, Text
end

# Recognises the `section` keyword plus the blanks after it and hands the
# remainder of the header to the :section state.
state :detect_section do
  rule %r/(section)(\s+)/ do
    # one token per capture group, in order
    groups Keyword, Text
    push :section
  end
end

# Lexes what follows the `section` keyword (pushed by :detect_section) up to
# and including the opening `{`, which pops back to the previous state.
state :section do
# quoted section kind, e.g. `"data"` or `"rodata"`
rule %r/"(data|cstring|text|rodata|relrodata|bss)"/, Name::Builtin

# the opening brace ends the header
rule %r/{/, Punctuation, :pop!

mixin :names
mixin :operators
mixin :keywords

rule %r/\s+/, Text
end

# C preprocessor directives that may appear in hand-written Cmm.
state :preprocessor_macros do
  # Inclusion and conditional directives. The trailing `\b` makes sure the
  # whole directive name is taken, so `#ifdef` / `#ifndef` are a single
  # Comment::Preproc token instead of `#if` followed by a stray name, and
  # `#elif` / `#undef` are recognised as directives as well.
  rule %r/#(include|ifndef|ifdef|if|elif|else|endif|undef)\b/, Comment::Preproc

  # `#define NAME`: the directive, optional whitespace/comments (re-lexed via
  # `recurse`) and the name being defined.
  rule %r{
    (\#define)
    (#{ws}*)
    (#{id})
  }mx do |m|
    token Comment::Preproc, m[1]
    recurse m[2]
    token Name::Label, m[3]
  end
end

# Detects the `{ info_tbls:` opener and switches to :info_tbls_body for the
# contents of the info-table block.
state :info_tbls do
  rule %r/({ )(info_tbls)(:)/ do
    # brace-plus-space, the `info_tbls` word, the colon
    groups Punctuation, Name::Entity, Punctuation
    push :info_tbls_body
  end
end

# Contents of an `info_tbls` block. Braces nest: `{` pushes this state once
# more and `}` pops one level.
state :info_tbls_body do
  rule %r/}/, Punctuation, :pop!
  rule %r/{/, Punctuation, :info_tbls_body

  # `label:` ahead: consume nothing here, let the :label state lex the line
  rule(%r/(?=label:)/) { push :label }

  # `(name,` opening an entry: parenthesis, entry name, comma
  rule %r{(\()(#{complex_id})(,)}mx do
    groups Punctuation, Name::Label, Punctuation
  end

  mixin :literals
  mixin :infos
  mixin :keywords
  mixin :operators

  # remaining identifiers and blanks are plain text
  rule %r/#{id}/, Text
  rule %r/\s+/, Text
end

# Pushed by :info_tbls_body when `label:` is ahead. Lexes the rest of that
# line and pops again at the newline.
state :label do
mixin :infos
mixin :names
mixin :keywords
mixin :operators

rule %r/[^\S\n]+/, Text # Tab, space, etc. but not newline!
# the end of the line ends the label
rule %r/\n/, Text, :pop!
end

# Comment-like constructs.
state :comments do
# `//` comment running to the end of the line
rule %r/\/{2}.*/, Comment::Single
# parenthesised text starting with `likely`, e.g. `(likely: False)`
rule %r/\(likely.*?\)/, Comment
# `/* ... */` comment, may span lines; non-greedy, so it ends at the first `*/`
rule %r/\/\*.*?\*\//m, Comment::Multiline
end

# Numeric literals and the start of string literals.
state :literals do
  # Hexadecimal integers such as `0x1F`. This has to come before the decimal
  # rules, which would otherwise take the leading `0` on its own and leave
  # `x1F` to be lexed as a name.
  rule %r/-?0[xX]\h+/, Literal::Number::Hex
  # decimal number with a fractional part
  rule %r/-?[0-9]+\.[0-9]+/, Literal::Number::Float
  # decimal integer
  rule %r/-?[0-9]+/, Literal::Number::Integer
  # opening quote: the string body is lexed by :literal_string
  rule %r/"/, Literal::String::Delimiter, :literal_string
end

# Body of a double-quoted string; pushed by :literals on the opening quote.
state :literal_string do
# backslash followed by any one character
rule %r/\\./, Literal::String::Escape
# `%` followed by any one character (presumably a format directive -- confirm)
rule %r/%./, Literal::String::Symbol
# closing quote ends the string
rule %r/"/, Literal::String::Delimiter, :pop!
# any other single character; `.` has no /m flag here, so not a newline
rule %r/./, Literal::String
end

# Operators and punctuation.
state :operators do
# `..` is listed before the single-character rules so that it is one token
rule %r/\.\./, Operator
# single-character operators
rule %r/[+\-*\/<>=!&|~]/, Operator
# brackets, braces, parentheses and separators
rule %r/[\[\].{}:;,()]/, Punctuation
end

# Keywords and keyword-like phrases. Wherever whitespace or comments may sit
# between two parts of a phrase, that stretch is passed to `recurse` so it is
# lexed again rather than emitted as one flat token.
state :keywords do
  # `const` and the blanks after it
  rule %r/(const)(\s+)/ do
    groups Keyword::Constant, Text
  end

  # a lone double quote
  rule %r/"/, Literal::String::Double

  # `switch <anything but a brace> {`
  rule %r/(switch)([^{]*)({)/ do |match|
    token Keyword, match[1]
    recurse match[2]
    token Punctuation, match[3]
  end

  # `arg hints:` / `result hints:`
  rule %r/(arg|result)(#{ws}+)(hints)(:)/ do |match|
    token Name::Property, match[1]
    recurse match[2]
    token Name::Property, match[3]
    token Punctuation, match[4]
  end

  # `returns to`
  rule %r/(returns)(#{ws}*)(to)/ do |match|
    token Keyword, match[1]
    recurse match[2]
    token Keyword, match[3]
  end

  # `never returns`
  rule %r/(never)(#{ws}*)(returns)/ do |match|
    token Keyword, match[1]
    recurse match[2]
    token Keyword, match[3]
  end

  # `return (`
  rule %r{(return)(#{ws}*)(\()} do |match|
    token Keyword, match[1]
    recurse match[2]
    token Punctuation, match[3]
  end

  # single-word keywords; exactly one whitespace unit must follow
  rule %r{(if|else|goto|call|offset|import|jump|ccall|foreign|prim|case|unwind|export|reserve|push)(#{ws})} do |match|
    token Keyword, match[1]
    recurse match[2]
  end

  # `default:` label
  rule %r{(default)(#{ws}*)(:)} do |match|
    token Keyword, match[1]
    recurse match[2]
    token Punctuation, match[3]
  end
end

# Type names, recognised from the shape of what follows them.
state :types do
  # Memory access, `type[42]`: the look-ahead keeps the bracket out of the
  # match, so the whole match is the type name.
  rule %r/(#{id})(?=\[[^\]])/, Keyword::Type

  # Array type, `type[]`: the name and the empty brackets form one token.
  rule %r/(#{id}\[\])/, Keyword::Type

  # Two identifiers at the start of a line followed by `(`: the first one is
  # taken to be a macro substitution rather than a type, as in
  # `PREPROCESSOR_MACRO_VARIABLE someFun()`. This rule has to come before the
  # declaration rule below.
  rule %r{
    (^#{id})
    (#{ws}+)
    (#{id})
    (#{ws}*)
    (\()
  }mx do |match|
    token Name::Label, match[1]
    recurse match[2]
    token Name::Function, match[3]
    recurse match[4]
    token Punctuation, match[5]
  end

  # Declaration at the start of a line, `type /* optional */ var_name`:
  # emits the type, re-lexes the gap and emits the declared name.
  rule %r{
    (^#{id})
    (#{ws}+)
    (#{id})
  }mx do |match|
    token Keyword::Type, match[1]
    recurse match[2]
    token Name::Label, match[3]
  end
end

# `name:` fields as they appear in info tables and call annotations.
state :infos do
  # property-style fields: the field name, then the colon
  rule %r/(args|res|upd|label|rep|srt|arity|fun_type|arg_space|updfr_space)(:)/ do
    groups Name::Property, Punctuation
  end

  # `stack_info:` gets an entity token instead of a property token
  rule %r/(stack_info)(:)/ do
    groups Name::Entity, Punctuation
  end
end

# Names: type annotations, global registers, qualified names, function
# heads and, as the fall-back, plain labels.
state :names do
  # `:: Type` annotation; the gap after `::` is lexed again via `recurse`
  rule %r/(::)(#{ws}*)([A-Z]\w+)/ do |match|
    token Operator, match[1]
    recurse match[2]
    token Keyword::Type, match[3]
  end

  # `<name>` in angle brackets
  rule %r/<(#{id})>/, Name::Builtin

  # well-known registers; the look-ahead rejects longer identifiers that
  # merely start with one of these names
  rule %r/(Sp|SpLim|Hp|HpLim|HpAlloc|BaseReg|CurrentNursery|CurrentTSO|R\d{1,2}|gcptr)(?!#{id})/, Name::Variable::Global

  # `Capitalised.`: a namespace segment; what follows the dot is lexed by
  # :namespace_name
  rule %r/([A-Z]#{id})(\.)/ do
    groups Name::Namespace, Punctuation
    push :namespace_name
  end

  # Inline function calls:
  # ```
  # arg1 `lt` arg2
  # ```
  rule %r/(`)(#{id})(`)/ do
    groups Punctuation, Name::Function, Punctuation
  end

  # Function: `name /* optional whitespace */ (`
  # Function (arguments via explicit stack handling): `name /* optional whitespace */ {`
  # Nothing is consumed here; the :function state lexes the name itself.
  rule %r{(?=
    #{complex_id}
    #{ws}*
    [\{\(]
  )}mx do
    push :function
  end

  rule %r/CLOSURE/, Keyword::Type
  # any other identifier is a label
  rule %r/#{complex_id}/, Name::Label
end

# What follows a `Namespace.` prefix (pushed by :names): further namespace
# segments, then either a function name or a plain name. Every rule except
# the segment rule leaves this state.
state :namespace_name do
  # another `Segment.`; stay in this state
  rule %r/([A-Z]#{id})(\.)/ do
    groups Name::Namespace, Punctuation
  end

  # name followed by `(` or `{`: a function
  rule %r{(#{complex_id})(#{ws}*)([\{\(])}mx do |match|
    token Name::Function, match[1]
    recurse match[2]
    token Punctuation, match[3]
    pop!
  end

  # name without a bracket: a label
  rule %r/#{complex_id}/, Name::Label, :pop!

  # anything else: consume nothing and leave
  rule(%r/(?=.)/m) { pop! }
end

# Head of a function, pushed by the look-ahead rule in :names when an
# identifier followed by `(` or `{` is ahead. Pops at that bracket.
state :function do
# info-table macro names; the longer ones are listed before the bare
# INFO_TABLE so that they are matched in full
rule %r/INFO_TABLE_FUN|INFO_TABLE_CONSTR|INFO_TABLE_SELECTOR|INFO_TABLE_RET|INFO_TABLE/, Name::Builtin
# names starting with `%`
rule %r/%#{id}/, Name::Builtin
# any other name is the function name
rule %r/#{complex_id}/, Name::Function
rule %r/\s+/, Text
# the opening bracket ends the function head
rule %r/[({]/, Punctuation, :pop!
mixin :comments
end
end
end
end
Loading

0 comments on commit 4b001ac

Please sign in to comment.