From 5ce5e317fa66141416ec5d841f4fccd389a0f5d8 Mon Sep 17 00:00:00 2001 From: David Welguisz Date: Fri, 12 May 2017 10:49:56 -0500 Subject: [PATCH 1/4] Initial work to disable html injection by using options[:disable_html_inject] --- lib/roo/excelx.rb | 4 +- lib/roo/excelx/extractor.rb | 3 +- lib/roo/excelx/shared.rb | 5 +- lib/roo/excelx/shared_strings.rb | 1 + spec/lib/roo/excelx_spec.rb | 89 ++++++++++++++++++++++---------- 5 files changed, 70 insertions(+), 32 deletions(-) mode change 100644 => 100755 lib/roo/excelx.rb mode change 100644 => 100755 lib/roo/excelx/extractor.rb mode change 100644 => 100755 lib/roo/excelx/shared.rb diff --git a/lib/roo/excelx.rb b/lib/roo/excelx.rb old mode 100644 new mode 100755 index 82c14314..ea48f59d --- a/lib/roo/excelx.rb +++ b/lib/roo/excelx.rb @@ -39,6 +39,8 @@ def initialize(filename_or_stream, options = {}) sheet_options = {} sheet_options[:expand_merged_ranges] = (options[:expand_merged_ranges] || false) sheet_options[:no_hyperlinks] = (options[:no_hyperlinks] || false) + shared_options = {} + shared_options[:disable_html_wrapper] = (options[:disable_html_wrapper] || false) unless is_stream?(filename_or_stream) file_type_check(filename_or_stream, %w[.xlsx .xlsm], 'an Excel 2007', file_warning, packed) @@ -52,7 +54,7 @@ def initialize(filename_or_stream, options = {}) @tmpdir = self.class.make_tempdir(self, basename, options[:tmpdir_root]) ObjectSpace.define_finalizer(self, self.class.finalize(object_id)) - @shared = Shared.new(@tmpdir) + @shared = Shared.new(@tmpdir, shared_options) @filename = local_filename(filename_or_stream, @tmpdir, packed) process_zipfile(@filename || filename_or_stream) diff --git a/lib/roo/excelx/extractor.rb b/lib/roo/excelx/extractor.rb old mode 100644 new mode 100755 index 1cdd13ba..6c399eb6 --- a/lib/roo/excelx/extractor.rb +++ b/lib/roo/excelx/extractor.rb @@ -1,8 +1,9 @@ module Roo class Excelx class Extractor - def initialize(path) + def initialize(path, options) @path = path + @options = options end private diff --git a/lib/roo/excelx/shared.rb b/lib/roo/excelx/shared.rb old mode 100644 new mode 100755 index 3677fa25..b82db0e4 --- a/lib/roo/excelx/shared.rb +++ b/lib/roo/excelx/shared.rb @@ -5,11 +5,12 @@ class Excelx # to various inititializers. class Shared attr_accessor :comments_files, :sheet_files, :rels_files - def initialize(dir) + def initialize(dir, options = {}) @dir = dir @comments_files = [] @sheet_files = [] @rels_files = [] + @options = options end def styles @@ -17,7 +18,7 @@ def styles end def shared_strings - @shared_strings ||= SharedStrings.new(File.join(@dir, 'roo_sharedStrings.xml')) + @shared_strings ||= SharedStrings.new(File.join(@dir, 'roo_sharedStrings.xml'), @options) end def workbook diff --git a/lib/roo/excelx/shared_strings.rb b/lib/roo/excelx/shared_strings.rb index f7caf7ca..b908f356 100755 --- a/lib/roo/excelx/shared_strings.rb +++ b/lib/roo/excelx/shared_strings.rb @@ -26,6 +26,7 @@ def to_html # Use to_html or to_a for html returns # See what is happening with commit??? def use_html?(index) + return false if options[:disable_html_wrapper] to_html[index][/<([biu]|sup|sub)>/] end diff --git a/spec/lib/roo/excelx_spec.rb b/spec/lib/roo/excelx_spec.rb index 0ef0f19a..09e861e5 100755 --- a/spec/lib/roo/excelx_spec.rb +++ b/spec/lib/roo/excelx_spec.rb @@ -480,34 +480,67 @@ end describe '#html_strings' do - let(:path) { 'test/files/html_strings_formatting.xlsx' } - - it 'returns the expected result' do - expect(subject.excelx_value(1, 1, "Sheet1")).to eq "This has no formatting." - expect(subject.excelx_value(2, 1, "Sheet1")).to eq "This has bold formatting." - expect(subject.excelx_value(2, 2, "Sheet1")).to eq "This has italics formatting." - expect(subject.excelx_value(2, 3, "Sheet1")).to eq "This has underline format." - expect(subject.excelx_value(2, 4, "Sheet1")).to eq "Superscript. x123" - expect(subject.excelx_value(2, 5, "Sheet1")).to eq "SubScript. Tj" - - expect(subject.excelx_value(3, 1, "Sheet1")).to eq "Bold, italics together." - expect(subject.excelx_value(3, 2, "Sheet1")).to eq "Bold, Underline together." - expect(subject.excelx_value(3, 3, "Sheet1")).to eq "Bold, Superscript. xN" - expect(subject.excelx_value(3, 4, "Sheet1")).to eq "Bold, Subscript. Tabc" - expect(subject.excelx_value(3, 5, "Sheet1")).to eq "Italics, Underline together." - expect(subject.excelx_value(3, 6, "Sheet1")).to eq "Italics, Superscript. Xabc" - expect(subject.excelx_value(3, 7, "Sheet1")).to eq "Italics, Subscript. Befg" - expect(subject.excelx_value(4, 1, "Sheet1")).to eq "Bold, italics underline, together." - expect(subject.excelx_value(4, 2, "Sheet1")).to eq "Bold, italics, superscript. Xabc123" - expect(subject.excelx_value(4, 3, "Sheet1")).to eq "Bold, Italics, subscript. Mgha2" - expect(subject.excelx_value(4, 4, "Sheet1")).to eq "Bold, Underline, superscript. ABC123" - expect(subject.excelx_value(4, 5, "Sheet1")).to eq "Bold, Underline, subscript. GoodXYZ" - expect(subject.excelx_value(4, 6, "Sheet1")).to eq "Italics, Underline, superscript. Upswing" - expect(subject.excelx_value(4, 7, "Sheet1")).to eq "Italics, Underline, subscript. Tswing" - expect(subject.excelx_value(5, 1, "Sheet1")).to eq "Bold, italics, underline, superscript. GHJK1904" - expect(subject.excelx_value(5, 2, "Sheet1")).to eq "Bold, italics, underline, subscript. Mikedrop" - expect(subject.excelx_value(6, 1, "Sheet1")).to eq "See that regular html tags do not create html tags.\n
    \n
  1. Denver Broncos
  2. \n
  3. Carolina Panthers
  4. \n
  5. New England Patriots
  6. \n
  7. Arizona Panthers
  8. \n
" - expect(subject.excelx_value(7, 1, "Sheet1")).to eq "Does create html tags when formatting is used..\n
    \n
  1. Denver Broncos
  2. \n
  3. Carolina Panthers
  4. \n
  5. New England Patriots
  6. \n
  7. Arizona Panthers
  8. \n
" + describe "HTML Parsing Enabling" do + let(:path) { 'test/files/html_strings_formatting.xlsx' } + + it 'returns the expected result' do + expect(subject.excelx_value(1, 1, "Sheet1")).to eq("This has no formatting.") + expect(subject.excelx_value(2, 1, "Sheet1")).to eq("This has bold formatting.") + expect(subject.excelx_value(2, 2, "Sheet1")).to eq("This has italics formatting.") + expect(subject.excelx_value(2, 3, "Sheet1")).to eq("This has underline format.") + expect(subject.excelx_value(2, 4, "Sheet1")).to eq("Superscript. x123") + expect(subject.excelx_value(2, 5, "Sheet1")).to eq("SubScript. Tj") + + expect(subject.excelx_value(3, 1, "Sheet1")).to eq("Bold, italics together.") + expect(subject.excelx_value(3, 2, "Sheet1")).to eq("Bold, Underline together.") + expect(subject.excelx_value(3, 3, "Sheet1")).to eq("Bold, Superscript. xN") + expect(subject.excelx_value(3, 4, "Sheet1")).to eq("Bold, Subscript. Tabc") + expect(subject.excelx_value(3, 5, "Sheet1")).to eq("Italics, Underline together.") + expect(subject.excelx_value(3, 6, "Sheet1")).to eq("Italics, Superscript. Xabc") + expect(subject.excelx_value(3, 7, "Sheet1")).to eq("Italics, Subscript. Befg") + expect(subject.excelx_value(4, 1, "Sheet1")).to eq("Bold, italics underline, together.") + expect(subject.excelx_value(4, 2, "Sheet1")).to eq("Bold, italics, superscript. Xabc123") + expect(subject.excelx_value(4, 3, "Sheet1")).to eq("Bold, Italics, subscript. Mgha2") + expect(subject.excelx_value(4, 4, "Sheet1")).to eq("Bold, Underline, superscript. ABC123") + expect(subject.excelx_value(4, 5, "Sheet1")).to eq("Bold, Underline, subscript. GoodXYZ") + expect(subject.excelx_value(4, 6, "Sheet1")).to eq("Italics, Underline, superscript. Upswing") + expect(subject.excelx_value(4, 7, "Sheet1")).to eq("Italics, Underline, subscript. Tswing") + expect(subject.excelx_value(5, 1, "Sheet1")).to eq("Bold, italics, underline, superscript. GHJK1904") + expect(subject.excelx_value(5, 2, "Sheet1")).to eq("Bold, italics, underline, subscript. Mikedrop") + expect(subject.excelx_value(6, 1, "Sheet1")).to eq("See that regular html tags do not create html tags.\n
    \n
  1. Denver Broncos
  2. \n
  3. Carolina Panthers
  4. \n
  5. New England Patriots
  6. \n
  7. Arizona Panthers
  8. \n
") + expect(subject.excelx_value(7, 1, "Sheet1")).to eq("Does create html tags when formatting is used..\n
    \n
  1. Denver Broncos
  2. \n
  3. Carolina Panthers
  4. \n
  5. New England Patriots
  6. \n
  7. Arizona Panthers
  8. \n
") + end + end + describe "HTML Parsing Disabled" do + let(:path) { 'test/files/html_strings_formatting.xlsx' } + + it 'returns the expected result' do + expect(subject.excelx_value(1, 1, "Sheet1")).to eq("This has no formatting.") + expect(subject.excelx_value(2, 1, "Sheet1")).to eq("This has bold formatting.") + expect(subject.excelx_value(2, 2, "Sheet1")).to eq("This has italics formatting.") + expect(subject.excelx_value(2, 3, "Sheet1")).to eq("This has underline format.") + expect(subject.excelx_value(2, 4, "Sheet1")).to eq("Superscript. x123<") + expect(subject.excelx_value(2, 5, "Sheet1")).to eq("SubScript. Tj") + + expect(subject.excelx_value(3, 1, "Sheet1")).to eq("Bold, italics together.") + expect(subject.excelx_value(3, 2, "Sheet1")).to eq("Bold, Underline together.") + expect(subject.excelx_value(3, 3, "Sheet1")).to eq("Bold, Superscript. xN") + expect(subject.excelx_value(3, 4, "Sheet1")).to eq("Bold, Subscript. Tabc") + expect(subject.excelx_value(3, 5, "Sheet1")).to eq("Italics, Underline together.") + expect(subject.excelx_value(3, 6, "Sheet1")).to eq("Italics, Superscript. Xabc") + expect(subject.excelx_value(3, 7, "Sheet1")).to eq("Italics, Subscript. Befg") + expect(subject.excelx_value(4, 1, "Sheet1")).to eq("Bold, italics underline, together.") + expect(subject.excelx_value(4, 2, "Sheet1")).to eq("Bold, italics, superscript. Xabc123") + expect(subject.excelx_value(4, 3, "Sheet1")).to eq("Bold, Italics, subscript. Mgha2") + expect(subject.excelx_value(4, 4, "Sheet1")).to eq("Bold, Underline, superscript. ABC123") + expect(subject.excelx_value(4, 5, "Sheet1")).to eq("Bold, Underline, subscript. GoodXYZ") + expect(subject.excelx_value(4, 6, "Sheet1")).to eq("Italics, Underline, superscript. Upswing") + expect(subject.excelx_value(4, 7, "Sheet1")).to eq("Italics, Underline, subscript. Tswing") + expect(subject.excelx_value(5, 1, "Sheet1")).to eq("Bold, italics, underline, superscript. GHJK1904") + expect(subject.excelx_value(5, 2, "Sheet1")).to eq("Bold, italics, underline, subscript. Mikedrop") + expect(subject.excelx_value(6, 1, "Sheet1")).to eq("See that regular html tags do not create html tags.\n
    \n
  1. Denver Broncos
  2. \n
  3. Carolina Panthers
  4. \n
  5. New England Patriots
  6. \n
  7. Arizona Panthers
  8. \n
") + expect(subject.excelx_value(7, 1, "Sheet1")).to eq("Does create html tags when formatting is used..\n
    \n
  1. Denver Broncos
  2. \n
  3. Carolina Panthers
  4. \n
  5. New England Patriots
  6. \n
  7. Arizona Panthers
  8. \n
") + end end end From be9986a5a39259e97078e6ac6827fe73d63aa315 Mon Sep 17 00:00:00 2001 From: David Welguisz Date: Fri, 12 May 2017 11:04:22 -0500 Subject: [PATCH 2/4] Added default option to be empty hash. --- lib/roo/excelx/extractor.rb | 2 +- spec/lib/roo/excelx_spec.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/roo/excelx/extractor.rb b/lib/roo/excelx/extractor.rb index 6c399eb6..6a23fd55 100755 --- a/lib/roo/excelx/extractor.rb +++ b/lib/roo/excelx/extractor.rb @@ -1,7 +1,7 @@ module Roo class Excelx class Extractor - def initialize(path, options) + def initialize(path, options = {}) @path = path @options = options end diff --git a/spec/lib/roo/excelx_spec.rb b/spec/lib/roo/excelx_spec.rb index 09e861e5..c02b2bea 100755 --- a/spec/lib/roo/excelx_spec.rb +++ b/spec/lib/roo/excelx_spec.rb @@ -519,7 +519,7 @@ expect(subject.excelx_value(2, 1, "Sheet1")).to eq("This has bold formatting.") expect(subject.excelx_value(2, 2, "Sheet1")).to eq("This has italics formatting.") expect(subject.excelx_value(2, 3, "Sheet1")).to eq("This has underline format.") - expect(subject.excelx_value(2, 4, "Sheet1")).to eq("Superscript. x123<") + expect(subject.excelx_value(2, 4, "Sheet1")).to eq("Superscript. x123") expect(subject.excelx_value(2, 5, "Sheet1")).to eq("SubScript. Tj") expect(subject.excelx_value(3, 1, "Sheet1")).to eq("Bold, italics together.") From 65d4cdaf58ad3fb1bddf882622cf4a2835eacdd8 Mon Sep 17 00:00:00 2001 From: David Welguisz Date: Fri, 12 May 2017 11:11:12 -0500 Subject: [PATCH 3/4] Changed options to global options (@options) --- lib/roo/excelx/shared_strings.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/roo/excelx/shared_strings.rb b/lib/roo/excelx/shared_strings.rb index b908f356..9d3b9d6d 100755 --- a/lib/roo/excelx/shared_strings.rb +++ b/lib/roo/excelx/shared_strings.rb @@ -26,7 +26,7 @@ def to_html # Use to_html or to_a for html returns # See what is happening with commit??? def use_html?(index) - return false if options[:disable_html_wrapper] + return false if @options[:disable_html_wrapper] to_html[index][/<([biu]|sup|sub)>/] end From ae2cb134a75d56a06b17e4a1deb64a09d2281a87 Mon Sep 17 00:00:00 2001 From: David Welguisz Date: Fri, 12 May 2017 12:30:57 -0500 Subject: [PATCH 4/4] Final fixes to get disable_html_wrapper to work. --- lib/roo/excelx.rb | 2 +- spec/lib/roo/excelx_spec.rb | 59 +++++++++++++++++++++---------------- 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/lib/roo/excelx.rb b/lib/roo/excelx.rb index ea48f59d..0df6ade2 100755 --- a/lib/roo/excelx.rb +++ b/lib/roo/excelx.rb @@ -40,8 +40,8 @@ def initialize(filename_or_stream, options = {}) sheet_options[:expand_merged_ranges] = (options[:expand_merged_ranges] || false) sheet_options[:no_hyperlinks] = (options[:no_hyperlinks] || false) shared_options = {} + shared_options[:disable_html_wrapper] = (options[:disable_html_wrapper] || false) - unless is_stream?(filename_or_stream) file_type_check(filename_or_stream, %w[.xlsx .xlsm], 'an Excel 2007', file_warning, packed) basename = find_basename(filename_or_stream) diff --git a/spec/lib/roo/excelx_spec.rb b/spec/lib/roo/excelx_spec.rb index c02b2bea..6844a779 100755 --- a/spec/lib/roo/excelx_spec.rb +++ b/spec/lib/roo/excelx_spec.rb @@ -511,6 +511,39 @@ expect(subject.excelx_value(7, 1, "Sheet1")).to eq("Does create html tags when formatting is used..\n
    \n
  1. Denver Broncos
  2. \n
  3. Carolina Panthers
  4. \n
  5. New England Patriots
  6. \n
  7. Arizona Panthers
  8. \n
") end end + end + + describe '_x000D_' do + let(:path) { 'test/files/x000D.xlsx' } + it 'does not contain _x000D_' do + expect(subject.cell(2, 9)).not_to include('_x000D_') + end + end + + describe 'opening a file with a chart sheet' do + let(:path) { 'test/files/chart_sheet.xlsx' } + it 'should not raise' do + expect{ subject }.to_not raise_error + end + end + + describe 'opening a file with white space in the styles.xml' do + let(:path) { 'test/files/style_nodes_with_white_spaces.xlsx' } + subject(:xlsx) do + Roo::Spreadsheet.open(path, expand_merged_ranges: true, extension: :xlsx) + end + it 'should properly recognize formats' do + expect(subject.sheet(0).excelx_format(2,1)).to eq 'm/d/yyyy" "h:mm:ss" "AM/PM' + end + end +end + +describe 'Roo::Excelx with options set' do + subject(:xlsx) do + Roo::Excelx.new(path, disable_html_wrapper: true) + end + + describe '#html_strings' do describe "HTML Parsing Disabled" do let(:path) { 'test/files/html_strings_formatting.xlsx' } @@ -543,28 +576,4 @@ end end end - - describe '_x000D_' do - let(:path) { 'test/files/x000D.xlsx' } - it 'does not contain _x000D_' do - expect(subject.cell(2, 9)).not_to include('_x000D_') - end - end - - describe 'opening a file with a chart sheet' do - let(:path) { 'test/files/chart_sheet.xlsx' } - it 'should not raise' do - expect{ subject }.to_not raise_error - end - end - - describe 'opening a file with white space in the styles.xml' do - let(:path) { 'test/files/style_nodes_with_white_spaces.xlsx' } - subject(:xlsx) do - Roo::Spreadsheet.open(path, expand_merged_ranges: true, extension: :xlsx) - end - it 'should properly recognize formats' do - expect(subject.sheet(0).excelx_format(2,1)).to eq 'm/d/yyyy" "h:mm:ss" "AM/PM' - end - end -end +end \ No newline at end of file