Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add library pire2hyperscan #43

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
ACLOCAL_AMFLAGS = -I m4
SUBDIRS = pire tests pkg samples tools
SUBDIRS = pire tests pkg samples tools pire2hyperscan
CONFIG_CLEAN_FILES = pire/config.h
1 change: 1 addition & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ AM_CONDITIONAL([HAVE_VALGRIND], [test x"$pire_cv_have_valgrind" = xyes])
AC_CONFIG_FILES([
Makefile
pire/Makefile
pire2hyperscan/Makefile
tests/Makefile
pkg/Makefile
tools/Makefile
Expand Down
12 changes: 12 additions & 0 deletions pire2hyperscan/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Compiler flags for every C++ source in this directory: all warnings, C++11.
AM_CXXFLAGS = -Wall -std=c++11

# Libtool library built from the converter's header and implementation.
lib_LTLIBRARIES = libpire2hyperscan.la
libpire2hyperscan_la_SOURCES = \
pire2hyperscan.h \
pire2hyperscan.cpp

# Public header, placed under $(includedir)/pire2hyperscan.
libpire2hyperscan_hdrdir = $(includedir)/pire2hyperscan
libpire2hyperscan_hdr_HEADERS = \
pire2hyperscan.h

# Explicit dependency on re_parser.h.
# NOTE(review): pire2hyperscan.cpp includes it as <pire/re_parser.h>, and libtool
# objects are usually named *.lo -- confirm that this rule's target and
# prerequisite paths actually match what the build produces.
pire2hyperscan.o: re_parser.h
214 changes: 214 additions & 0 deletions pire2hyperscan/pire2hyperscan.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
/*
* pire2hyperscan.cpp -- convert Pire regex to Hyperscan regex
*
* Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
* Alexander Gololobov <agololobov@gmail.com>
*
* This file is part of Pire, the Perl Incompatible
* Regular Expressions library.
*
* Pire is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
* You should have received a copy of the GNU Lesser Public License
* along with Pire. If not, see <http://www.gnu.org/licenses>.
*/

#include "pire2hyperscan.h"

#include <pire/encoding.h> // ToUtf8
#include <pire/pire.h>
#include <pire/re_parser.h> // YRE_AND, YRE_NOT

#include <cctype> // std::isalnum
#include <sstream>
#include <stdexcept>


namespace Pire {

/// A lexer term together with the repetition bounds attached to it.
/// Both bounds start at 1 (the term occurs exactly once) and are overwritten
/// when a YRE_COUNT term follows in the lexer output.
struct TCountedTerm {
    Term MainTerm; // the term being repeated
    int MinCount;  // minimum number of repetitions
    int MaxCount;  // maximum number of repetitions; may be Consts::Inf

    // Intentionally implicit: PireLexer2Hyperscan pushes plain Term objects
    // into a vector of TCountedTerm and relies on this conversion.
    // Taken by const reference so the term is copied only once (into MainTerm).
    TCountedTerm(const Term& term)
        : MainTerm(term)
        , MinCount(1)
        , MaxCount(1)
    {
    }
};


// Value carried by a YRE_LETTERS term, as used in this file: `first` is the
// collection of member strings, `second` is true for a negated set ([^...]).
using TCharacterRange = Term::CharacterRange;


// Encodes a single wide character as a string in Pire's UTF-8 encoding.
static ystring ToUtf8(wchar32 letter32) {
    const auto& utf8 = Encodings::Utf8();
    ystring encoded = utf8.ToLocal(letter32);
    return encoded;
}


static bool NeedBrackets(const TCharacterRange& range) {
if (range.second) {
return true;
}
if (range.first.size() != 1) {
return true;
}
auto wideLetter = *range.first.begin();
if (wideLetter.size() != 1) {
return true; // will throw NHyperscan::TCompileException
}
ystring letter = ToUtf8(wideLetter[0]);
if (letter.size() != 1) {
return true;
}
return !isalnum(letter[0]);
}


// Returns true when `ch` is one of the strings that this converter prefixes
// with a backslash when writing character-set members: - [ ] \ ^
static bool NeedEscape(const ystring& ch) {
    static const char* const kSpecial[] = { "-", "[", "]", "\\", "^" };
    for (const char* special : kSpecial) {
        if (ch == special) {
            return true;
        }
    }
    return false;
}


/// Re-serialises the term stream of a Pire lexer as a Hyperscan regex string.
///
/// @param lexer  Lexer to read from; it is consumed until it yields a term of
///               type 0.
/// @return The regex text built from the terms.
/// @throws NHyperscan::TCompileException  when the pattern cannot be translated:
///         it contains YRE_AND or YRE_NOT, a YRE_LETTERS term whose value is not
///         a character range, a character-set member longer than one letter, or
///         a repetition count with no preceding term to apply to.
/// @throws std::logic_error  when a term of an unrecognised type is encountered.
///
/// NOTE(review): a repetition count is written out only after character sets,
/// dots and ')'. A count attached to any other term is silently dropped --
/// confirm whether the lexer can actually produce that.
ystring PireLexer2Hyperscan(Lexer& lexer) {

    // Step 1. Turn lexer into a vector of terms, folding every YRE_COUNT
    // into the term that precedes it.
    yvector<TCountedTerm> terms;
    for (Term term = lexer.Lex(); term.Type() != 0; term = lexer.Lex()) {
        if (term.Type() != YRE_COUNT) {
            terms.push_back(term);
            continue;
        }
        if (terms.empty()) {
            // A count with nothing to repeat. Reject it explicitly rather
            // than calling back() on an empty vector.
            throw NHyperscan::TCompileException();
        }
        const Term::RepetitionCount& count = term.Value().As<Term::RepetitionCount>();
        terms.back().MinCount = count.first;
        terms.back().MaxCount = count.second;
    }

    // Step 2. Turn the vector of terms back to regex string.
    std::stringstream result;
    for (size_t i = 0; i < terms.size(); ++i) {
        const TCountedTerm& term = terms[i];
        const bool isFirst = (i == 0);
        const bool isLast = (i + 1 == terms.size());

        // If first term is [^...], it matches text begin in Pire
        // Example: /[^4]submit/
        // The following conditions are required to match text begin/end:
        //  1. the term is first or last
        //  2. length can be 1, so it could be begin/end mark in Pire
        // terms.size() > 1; https://github.com/01org/hyperscan/issues/25
        const bool mayNeedMask = (term.MinCount == 1) && (isFirst || isLast);

        auto fixBefore = [&]() {
            if (mayNeedMask) {
                // A term that is both first and last is treated as first.
                result << (isFirst ? "(^|" : "($|");
            }
        };
        auto fixAfter = [&]() {
            if (mayNeedMask) {
                result << ")";
            }
        };

        // Writes {min,max}; the upper bound is left out when it is Consts::Inf.
        auto printCount = [&]() {
            if (term.MinCount != 1 || term.MaxCount != 1) {
                result << '{' << term.MinCount << ',';
                if (term.MaxCount != Consts::Inf) {
                    result << term.MaxCount;
                }
                result << '}';
            }
        };

        const int type = term.MainTerm.Type();
        if (type == YRE_LETTERS) {
            if (!term.MainTerm.Value().IsA<TCharacterRange>()) {
                throw NHyperscan::TCompileException();
            }
            const TCharacterRange& value = term.MainTerm.Value().As<TCharacterRange>();
            const bool negated = value.second;
            const bool bracketed = NeedBrackets(value);

            // Only negated sets get the begin/end alternative.
            if (negated) {
                fixBefore();
            }
            if (bracketed) {
                result << '[';
            }
            if (negated) {
                result << '^';
            }
            for (const auto& str : value.first) {
                if (str.size() != 1) {
                    // members of [...] must be 1-letter
                    throw NHyperscan::TCompileException();
                }
                const ystring utf8String = ToUtf8(str[0]);
                if (NeedEscape(utf8String)) {
                    result << '\\';
                }
                result << utf8String;
            }
            if (bracketed) {
                result << ']';
            }
            printCount();
            if (negated) {
                fixAfter();
            }
        } else if (type == YRE_DOT) {
            fixBefore();
            result << '.';
            printCount();
            fixAfter();
        } else if (type == YRE_AND) {
            throw NHyperscan::TCompileException();
        } else if (type == YRE_NOT) {
            throw NHyperscan::TCompileException();
        } else if (type == '(') {
            result << '(';
        } else if (type == ')') {
            result << ')';
            printCount();
        } else if (type == '|') {
            result << '|';
        } else if (type == '^') {
            result << '^';
        } else if (type == '$') {
            result << '$';
        } else {
            std::stringstream errorMessage;
            errorMessage << "Unknown term type: ";
            errorMessage << type;
            throw std::logic_error(errorMessage.str());
        }
    }
    return result.str();
}


// Converts a Pire regex given as a UTF-8 string: decodes it to wide
// characters, runs a Lexer with the AndNotSupport feature over them and
// hands that lexer to PireLexer2Hyperscan.
ystring PireRegex2Hyperscan(const ystring& regex) {
    const char* const first = regex.data();
    const char* const last = first + regex.size();

    yvector<wchar32> wide;
    Encodings::Utf8().FromLocal(first, last, std::back_inserter(wide));

    Lexer pireLexer(wide.begin(), wide.end());
    pireLexer.AddFeature(Features::AndNotSupport());
    return PireLexer2Hyperscan(pireLexer);
}

}
41 changes: 41 additions & 0 deletions pire2hyperscan/pire2hyperscan.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* pire2hyperscan.h -- convert Pire regex to Hyperscan regex
*
* Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
* Alexander Gololobov <agololobov@gmail.com>
*
* This file is part of Pire, the Perl Incompatible
* Regular Expressions library.
*
* Pire is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
* You should have received a copy of the GNU Lesser Public License
* along with Pire. If not, see <http://www.gnu.org/licenses>.
*/

#pragma once

#include <pire/stub/stl.h> // ystring, yvector, etc

#include <exception>


namespace NHyperscan {
    /// Thrown by the Pire-to-Hyperscan converters when a Pire regex uses a
    /// construct that cannot be written in Hyperscan syntax.
    class TCompileException : public std::exception {
    public:
        /// Fixed diagnostic text, so that handlers catching std::exception
        /// get something more useful than the implementation default.
        const char* what() const noexcept override {
            return "Pire regex cannot be converted to Hyperscan syntax";
        }
    };
}


namespace Pire {
class Lexer; // forward declaration; only a reference is needed in this header

// Reads all terms from `lexer` and returns them as a Hyperscan regex string.
// Throws NHyperscan::TCompileException for constructs it cannot translate.
ystring PireLexer2Hyperscan(Lexer& lexer);

// Same conversion for a Pire regex supplied as a UTF-8 string.
ystring PireRegex2Hyperscan(const ystring& regex);
}
9 changes: 5 additions & 4 deletions tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@ lib_LTLIBRARIES = libpire_unit.la
libpire_unit_la_SOURCES = \
stub/cppunit.cpp \
stub/cppunit.h
libpire_unit_la_CXXFLAGS = -I$(top_srcdir)/pire
libpire_unit_la_CXXFLAGS = -I$(top_srcdir)/pire -I$(top_srcdir)/pire2hyperscan

check_PROGRAMS = pire_test

pire_test_SOURCES = \
common.h \
pire_ut.cpp \
pire2hyperscan_ut.cpp \
easy_ut.cpp

if ENABLE_EXTRA
Expand All @@ -30,15 +31,15 @@ nodist_pire_test_SOURCES = inline_ut_2.cpp

EXTRA_DIST = inline_ut.cpp pire_test_valgrind.sh

pire_test_LDADD = ../pire/libpire.la libpire_unit.la
pire_test_CXXFLAGS = -I$(top_srcdir)/pire $(AM_CXXFLAGS)
pire_test_LDADD = ../pire/libpire.la ../pire2hyperscan/libpire2hyperscan.la libpire_unit.la
pire_test_CXXFLAGS = -I$(top_srcdir)/pire -I$(top_srcdir)/pire2hyperscan $(AM_CXXFLAGS)

TESTS = pire_test

check_PROGRAMS += pire_test_valgrind
pire_test_valgrind_SOURCES = valgrind_ut.cpp
pire_test_valgrind_LDADD = ../pire/libpire.la libpire_unit.la
pire_test_valgrind_CXXFLAGS = -I$(top_srcdir)/pire $(AM_CXXFLAGS)
pire_test_valgrind_CXXFLAGS = -I$(top_srcdir)/pire -I$(top_srcdir)/pire2hyperscan $(AM_CXXFLAGS)
TESTS += pire_test_valgrind

if HAVE_VALGRIND
Expand Down
53 changes: 53 additions & 0 deletions tests/pire2hyperscan_ut.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* pire2hyperscan_ut.cpp -- convert Pire regex to Hyperscan regex
*
* Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
* Alexander Gololobov <agololobov@gmail.com>
*
* This file is part of Pire, the Perl Incompatible
* Regular Expressions library.
*
* Pire is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Pire is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
* You should have received a copy of the GNU Lesser Public License
* along with Pire. If not, see <http://www.gnu.org/licenses>.
*/

#include <stub/hacks.h>
#include <stub/saveload.h>
#include <stub/utf8.h>
#include <stub/memstreams.h>
#include "stub/cppunit.h"

#include <pire.h>
#include <pire2hyperscan.h>

SIMPLE_UNIT_TEST_SUITE(TestPire2Hyperscan) {

    // Patterns that can be translated: plain text, negated sets at the
    // beginning and end of the pattern, and an escaped '&'.
    SIMPLE_UNIT_TEST(PireRegex2Hyperscan)
    {
        UNIT_ASSERT_EQUAL(PireRegex2Hyperscan("a.b"), "a.b");
        UNIT_ASSERT_EQUAL(PireRegex2Hyperscan("[^4]submit[^4]"), "(^|[^4])submit($|[^4])");
        UNIT_ASSERT_EQUAL(PireRegex2Hyperscan("a\\&b"), "a[&]b");
    }

    // An unescaped '&' cannot be translated and must be reported with
    // NHyperscan::TCompileException.
    SIMPLE_UNIT_TEST(PireRegex2HyperscanThrowsTCompileException)
    {
        try {
            PireRegex2Hyperscan("a&b");
            UNIT_ASSERT(false); // must throw
        } catch (const NHyperscan::TCompileException&) {
            // right exception was thrown (caught by const reference: no copy, no slicing)
        } catch (...) {
            UNIT_ASSERT(false); // wrong type of exception was thrown
        }
    }

}