From a76b9176f88ce3248e5f97867088bec29ed521cc Mon Sep 17 00:00:00 2001 From: takuya kodama Date: Mon, 5 Aug 2024 08:46:32 +0800 Subject: [PATCH] wikipedia-kyoto-japanese-english: increase REXML entity expansion limit during XML parsing (#198) Calling `Datasets::WikipediaKyotoJapaneseEnglish#each` raised `entity expansion has grown too large (RuntimeError)`. This error occurs because REXML enforces an entity expansion text limit since https://github.com/ruby/rexml/pull/187, and the XML parsed by `Datasets::WikipediaKyotoJapaneseEnglish#each` exceeds that limit. In Red Datasets, increasing the entity expansion limit is not a problem because we want to handle large datasets. Therefore, we temporarily increase the limit while parsing and restore the previous value afterwards. ## How to reproduce ```console $ cd red-datasets && bundle $ bundle exec ruby example/wikipedia-kyoto-japanese-english.rb ... /home/otegami/.rbenv/versions/3.3.3/lib/ruby/gems/3.3.0/gems/rexml-3.3.4/lib/rexml/parsers/baseparser.rb:560:in `block in unnormalize': entity expansion has grown too large (RuntimeError) ... 
``` --- lib/datasets/wikipedia-kyoto-japanese-english.rb | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/lib/datasets/wikipedia-kyoto-japanese-english.rb b/lib/datasets/wikipedia-kyoto-japanese-english.rb index 227c6d0c..0b1f2c16 100644 --- a/lib/datasets/wikipedia-kyoto-japanese-english.rb +++ b/lib/datasets/wikipedia-kyoto-japanese-english.rb @@ -89,7 +89,9 @@ def each(&block) next unless base_name.end_with?(".xml") listener = ArticleListener.new(block) parser = REXML::Parsers::StreamParser.new(entry.read, listener) - parser.parse + with_increased_entity_expansion_text_limit do + parser.parse + end when :lexicon next unless base_name == "kyoto_lexicon.csv" is_header = true @@ -106,6 +108,9 @@ def each(&block) end private + + ENTITY_EXPANSION_TEXT_LIMIT = 163_840 + def download_tar_gz base_name = "wiki_corpus_2.01.tar.gz" data_path = cache_dir_path + base_name @@ -114,6 +119,14 @@ def download_tar_gz data_path end + def with_increased_entity_expansion_text_limit + default_limit = REXML::Security.entity_expansion_text_limit + REXML::Security.entity_expansion_text_limit = ENTITY_EXPANSION_TEXT_LIMIT + yield + ensure + REXML::Security.entity_expansion_text_limit = default_limit + end + class ArticleListener include REXML::StreamListener