diff --git a/Makefile.am b/Makefile.am
index c1bfe6cf..21e40be4 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,3 +1,3 @@
 ACLOCAL_AMFLAGS = -I m4
-SUBDIRS = pire tests pkg samples tools
+SUBDIRS = pire pire2hyperscan tests pkg samples tools
 CONFIG_CLEAN_FILES = pire/config.h
diff --git a/configure.ac b/configure.ac
index e185cb98..45e4a087 100644
--- a/configure.ac
+++ b/configure.ac
@@ -96,6 +96,7 @@ AM_CONDITIONAL([HAVE_VALGRIND], [test x"$pire_cv_have_valgrind" = xyes])
 AC_CONFIG_FILES([
 	Makefile
 	pire/Makefile
+	pire2hyperscan/Makefile
 	tests/Makefile
 	pkg/Makefile
 	tools/Makefile
diff --git a/pire2hyperscan/Makefile.am b/pire2hyperscan/Makefile.am
new file mode 100644
index 00000000..f598063e
--- /dev/null
+++ b/pire2hyperscan/Makefile.am
@@ -0,0 +1,12 @@
+AM_CXXFLAGS = -Wall -std=c++11
+
+lib_LTLIBRARIES = libpire2hyperscan.la
+libpire2hyperscan_la_SOURCES = \
+	pire2hyperscan.h \
+	pire2hyperscan.cpp
+
+libpire2hyperscan_hdrdir = $(includedir)/pire2hyperscan
+libpire2hyperscan_hdr_HEADERS = \
+	pire2hyperscan.h
+
+pire2hyperscan.o: re_parser.h
diff --git a/pire2hyperscan/pire2hyperscan.cpp b/pire2hyperscan/pire2hyperscan.cpp
new file mode 100644
index 00000000..89b93ee1
--- /dev/null
+++ b/pire2hyperscan/pire2hyperscan.cpp
@@ -0,0 +1,247 @@
+/*
+ * pire2hyperscan.cpp -- convert Pire regex to Hyperscan regex
+ *
+ * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
+ *                          Alexander Gololobov <agololobov@gmail.com>
+ *
+ * This file is part of Pire, the Perl Incompatible
+ * Regular Expressions library.
+ *
+ * Pire is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Pire is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Pire. If not, see <http://www.gnu.org/licenses>.
+ */
+
+#include "pire2hyperscan.h"
+
+// NOTE(review): the Pire header names below were reconstructed from the
+// symbols this file uses (Encodings, Lexer, YRE_*); confirm the paths.
+#include <encoding.h> // ToUtf8
+#include <re_lexer.h>
+#include <re_parser.h> // YRE_AND, YRE_NOT
+
+#include <cctype>
+#include <iterator>
+#include <sstream>
+#include <stdexcept>
+
+
+namespace Pire {
+
+    // A lexer term together with the repetition bounds that apply to it.
+    struct TCountedTerm {
+        Term MainTerm;
+        int MinCount;
+        int MaxCount;
+
+        // Deliberately implicit: PireLexer2Hyperscan pushes plain Term objects
+        // into a vector of TCountedTerm. A fresh term is matched exactly once.
+        TCountedTerm(const Term& term)
+            : MainTerm(term)
+            , MinCount(1)
+            , MaxCount(1)
+        {
+        }
+    };
+
+
+    using TCharacterRange = Term::CharacterRange;
+
+
+    // Encodes one wide character as a UTF-8 string.
+    static ystring ToUtf8(wchar32 letter32) {
+        return Encodings::Utf8().ToLocal(letter32);
+    }
+
+
+    // Tells whether a character range has to be printed inside [...].
+    // Only a non-negated range holding exactly one letter whose UTF-8 form is
+    // a single alphanumeric byte is printed bare.
+    static bool NeedBrackets(const TCharacterRange& range) {
+        if (range.second) {
+            return true; // negated range, printed as [^...]
+        }
+        if (range.first.size() != 1) {
+            return true;
+        }
+        const auto& wideLetter = *range.first.begin();
+        if (wideLetter.size() != 1) {
+            return true; // will throw NHyperscan::TCompileException
+        }
+        const ystring letter = ToUtf8(wideLetter[0]);
+        if (letter.size() != 1) {
+            return true;
+        }
+        // isalnum() is undefined for negative arguments, so go through
+        // unsigned char instead of passing a plain char.
+        return !std::isalnum(static_cast<unsigned char>(letter[0]));
+    }
+
+
+    // Tells whether a character must be backslash-escaped when it is printed
+    // as a member of a character range.
+    static bool NeedEscape(const ystring& ch) {
+        return ch == "-" || ch == "[" || ch == "]" || ch == "\\" || ch == "^";
+    }
+
+
+    // Reads all terms from the lexer and prints them as a Hyperscan regex.
+    // Throws NHyperscan::TCompileException for input that cannot be converted
+    // (AND and NOT terms, multi-letter members of a character range, a
+    // repetition count with no term before it) and std::logic_error for a
+    // term type this function does not know.
+    ystring PireLexer2Hyperscan(Lexer& lexer) {
+
+        // Step 1. Turn lexer into a vector of terms. A repetition count is
+        // not stored as a term of its own but attached to the previous term.
+        yvector<TCountedTerm> terms;
+        for (Term term = lexer.Lex(); term.Type() != 0; term = lexer.Lex()) {
+            if (term.Type() == YRE_COUNT) {
+                using TRepetitionCount = Term::RepetitionCount;
+                if (terms.empty()) {
+                    // Nothing to repeat: terms.back() would be undefined
+                    // behaviour, so reject the regex explicitly.
+                    throw NHyperscan::TCompileException();
+                }
+                const TRepetitionCount& value = term.Value().As<TRepetitionCount>();
+                terms.back().MinCount = value.first;
+                terms.back().MaxCount = value.second;
+            } else {
+                terms.push_back(term);
+            }
+        }
+
+        // Step 2. Turn the vector of terms back to regex string.
+        std::stringstream result;
+        for (size_t i = 0; i < terms.size(); i++) {
+            const TCountedTerm& term = terms[i];
+            const bool isFirst = (i == 0);
+            const bool isLast = (i == terms.size() - 1);
+
+            // If first term is [^...], it matches text begin in Pire
+            // Example: /[^4]submit/
+            // The following conditions are required to match text begin/end:
+            // 1. the term is first or last
+            // 2. length can be 1, so it could be begin/end mark in Pire
+            const bool mayNeedMask = (term.MinCount == 1);
+            // terms.size() > 1; https://github.com/01org/hyperscan/issues/25
+
+            // Opens the "(^|" or "($|" alternative in front of an edge term.
+            auto fixBefore = [&]() {
+                if (!mayNeedMask) {
+                    return;
+                }
+                if (isFirst) {
+                    result << "(^|";
+                } else if (isLast) {
+                    result << "($|";
+                }
+            };
+            // Closes the group opened by fixBefore.
+            auto fixAfter = [&]() {
+                if (mayNeedMask && (isFirst || isLast)) {
+                    result << ")";
+                }
+            };
+
+            // Prints {min,max} unless the term occurs exactly once; an upper
+            // bound equal to Consts::Inf is left out, giving {min,}.
+            auto printCount = [&]() {
+                if (term.MinCount != 1 || term.MaxCount != 1) {
+                    result << '{' << term.MinCount << ',';
+                    if (term.MaxCount != Consts::Inf) {
+                        result << term.MaxCount;
+                    }
+                    result << '}';
+                }
+            };
+
+            const int type = term.MainTerm.Type();
+            if (type == YRE_LETTERS) {
+                if (!term.MainTerm.Value().IsA<TCharacterRange>()) {
+                    throw NHyperscan::TCompileException();
+                }
+                const TCharacterRange& value = term.MainTerm.Value().As<TCharacterRange>();
+                const bool brackets = NeedBrackets(value);
+                if (value.second) {
+                    fixBefore();
+                }
+                if (brackets) {
+                    result << '[';
+                }
+                if (value.second) {
+                    result << '^';
+                }
+                for (const auto& str : value.first) {
+                    if (str.size() != 1) {
+                        // members of [...] must be 1-letter
+                        throw NHyperscan::TCompileException();
+                    }
+                    const ystring utf8String = ToUtf8(str[0]);
+                    if (NeedEscape(utf8String)) {
+                        result << '\\';
+                    }
+                    result << utf8String;
+                }
+                if (brackets) {
+                    result << ']';
+                }
+                printCount();
+                if (value.second) {
+                    fixAfter();
+                }
+            } else if (type == YRE_DOT) {
+                fixBefore();
+                result << '.';
+                printCount();
+                fixAfter();
+            } else if (type == YRE_AND) {
+                throw NHyperscan::TCompileException();
+            } else if (type == YRE_NOT) {
+                throw NHyperscan::TCompileException();
+            } else if (type == '(') {
+                result << '(';
+            } else if (type == ')') {
+                result << ')';
+                printCount();
+            } else if (type == '|') {
+                result << '|';
+            } else if (type == '^') {
+                result << '^';
+            } else if (type == '$') {
+                result << '$';
+            } else {
+                std::stringstream errorMessage;
+                errorMessage << "Unknown term type: ";
+                errorMessage << type;
+                throw std::logic_error(errorMessage.str());
+            }
+        }
+        return result.str();
+    }
+
+
+    // Converts a UTF-8 encoded Pire regex to Hyperscan syntax: decodes it to
+    // wide characters, lexes it with AND/NOT support enabled and passes the
+    // lexer to PireLexer2Hyperscan.
+    ystring PireRegex2Hyperscan(const ystring& regex) {
+        yvector<wchar32> ucs4;
+        Encodings::Utf8().FromLocal(
+            regex.data(),
+            regex.data() + regex.size(),
+            std::back_inserter(ucs4)
+        );
+        Lexer lexer(ucs4.begin(), ucs4.end());
+        lexer.AddFeature(Features::AndNotSupport());
+        return PireLexer2Hyperscan(lexer);
+    }
+
+}
diff --git a/pire2hyperscan/pire2hyperscan.h b/pire2hyperscan/pire2hyperscan.h
new file mode 100644
index 00000000..468f0a40
--- /dev/null
+++ b/pire2hyperscan/pire2hyperscan.h
@@ -0,0 +1,52 @@
+/*
+ * pire2hyperscan.h -- convert Pire regex to Hyperscan regex
+ *
+ * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
+ *                          Alexander Gololobov <agololobov@gmail.com>
+ *
+ * This file is part of Pire, the Perl Incompatible
+ * Regular Expressions library.
+ *
+ * Pire is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Pire is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Pire. If not, see <http://www.gnu.org/licenses>.
+ */
+
+#pragma once
+
+// NOTE(review): header name reconstructed from the comment next to it; confirm.
+#include <stub/stl.h> // ystring, yvector, etc
+
+#include <exception>
+
+
+namespace NHyperscan {
+    // Thrown when a Pire regex cannot be expressed in Hyperscan syntax.
+    class TCompileException : public std::exception {
+    public:
+        const char* what() const noexcept override {
+            return "Pire regex cannot be converted to Hyperscan syntax";
+        }
+    };
+}
+
+
+namespace Pire {
+    class Lexer;
+
+    // Prints the terms produced by the lexer as a Hyperscan regex.
+    // Throws NHyperscan::TCompileException if the regex uses a construct
+    // that cannot be converted.
+    ystring PireLexer2Hyperscan(Lexer& lexer);
+
+    // Same as PireLexer2Hyperscan, for a regex given as a UTF-8 string.
+    ystring PireRegex2Hyperscan(const ystring& regex);
+}
diff --git a/tests/Makefile.am b/tests/Makefile.am
index e502e0cd..a5e99aa7 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -10,13 +10,14 @@ lib_LTLIBRARIES = libpire_unit.la
 libpire_unit_la_SOURCES = \
 	stub/cppunit.cpp \
 	stub/cppunit.h
-libpire_unit_la_CXXFLAGS = -I$(top_srcdir)/pire
+libpire_unit_la_CXXFLAGS = -I$(top_srcdir)/pire -I$(top_srcdir)/pire2hyperscan
 
 check_PROGRAMS = pire_test
 
 pire_test_SOURCES = \
 	common.h \
 	pire_ut.cpp \
+	pire2hyperscan_ut.cpp \
 	easy_ut.cpp
 
 if ENABLE_EXTRA
@@ -30,15 +31,15 @@ nodist_pire_test_SOURCES = inline_ut_2.cpp
 
 EXTRA_DIST = inline_ut.cpp pire_test_valgrind.sh
 
-pire_test_LDADD = ../pire/libpire.la libpire_unit.la
-pire_test_CXXFLAGS = -I$(top_srcdir)/pire $(AM_CXXFLAGS)
+pire_test_LDADD = ../pire/libpire.la ../pire2hyperscan/libpire2hyperscan.la libpire_unit.la
+pire_test_CXXFLAGS = -I$(top_srcdir)/pire -I$(top_srcdir)/pire2hyperscan $(AM_CXXFLAGS)
 
 TESTS = pire_test
 
 check_PROGRAMS += pire_test_valgrind
 pire_test_valgrind_SOURCES = valgrind_ut.cpp
 pire_test_valgrind_LDADD = ../pire/libpire.la libpire_unit.la
-pire_test_valgrind_CXXFLAGS = -I$(top_srcdir)/pire $(AM_CXXFLAGS)
+pire_test_valgrind_CXXFLAGS = -I$(top_srcdir)/pire -I$(top_srcdir)/pire2hyperscan $(AM_CXXFLAGS)
 
 TESTS += pire_test_valgrind
 if HAVE_VALGRIND
diff --git a/tests/pire2hyperscan_ut.cpp b/tests/pire2hyperscan_ut.cpp
new file mode 100644
index 00000000..d9371f54
--- /dev/null
+++ b/tests/pire2hyperscan_ut.cpp
@@ -0,0 +1,58 @@
+/*
+ * pire2hyperscan_ut.cpp -- convert Pire regex to Hyperscan regex
+ *
+ * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
+ *                          Alexander Gololobov <agololobov@gmail.com>
+ *
+ * This file is part of Pire, the Perl Incompatible
+ * Regular Expressions library.
+ *
+ * Pire is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Pire is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Pire. If not, see <http://www.gnu.org/licenses>.
+ */
+
+// NOTE(review): the angle-bracket header names in this file were lost from
+// the patch and have been filled in by guesswork; confirm every one of them.
+#include <stub/hacks.h>
+#include <stub/saveload.h>
+#include <stub/utf8.h>
+#include <stub/memstreams.h>
+#include "stub/cppunit.h"
+
+#include <pire.h>
+#include <pire2hyperscan.h>
+
+SIMPLE_UNIT_TEST_SUITE(TestPire2Hyperscan) {
+
+    // Regexes that convert cleanly, checked against the exact output.
+    SIMPLE_UNIT_TEST(PireRegex2Hyperscan)
+    {
+        UNIT_ASSERT_EQUAL(Pire::PireRegex2Hyperscan("a.b"), "a.b");
+        UNIT_ASSERT_EQUAL(Pire::PireRegex2Hyperscan("[^4]submit[^4]"), "(^|[^4])submit($|[^4])");
+        UNIT_ASSERT_EQUAL(Pire::PireRegex2Hyperscan("a\\&b"), "a[&]b");
+    }
+
+    // An unescaped '&' is lexed as an AND term, which the converter rejects.
+    SIMPLE_UNIT_TEST(PireRegex2HyperscanThrowsTCompileException)
+    {
+        bool thrown = false;
+        try {
+            Pire::PireRegex2Hyperscan("a&b");
+        } catch (const NHyperscan::TCompileException&) {
+            thrown = true; // right exception was thrown
+        } catch (...) {
+            // wrong type of exception was thrown
+        }
+        UNIT_ASSERT(thrown);
+    }
+
+}