From ce935262b1efcd212e73d6fd5e9bddde03f6baa0 Mon Sep 17 00:00:00 2001 From: Edi Weissmann Date: Tue, 12 May 2020 14:55:30 +0200 Subject: [PATCH] Data table improvements: isolation of RTL strings, merge of tables with similar headers (with blanks) --- .../sejda/core/support/util/StringUtils.java | 35 +++++++++ .../sambox/component/excel/DataTable.java | 72 ++++++++----------- .../component/excel/DataTableUtils.java | 31 +++++++- .../sambox/component/excel/DataTableTest.java | 37 +++++++--- .../component/excel/DataTableUtilsTest.java | 52 +++++++++++++- 5 files changed, 172 insertions(+), 55 deletions(-) diff --git a/sejda-core/src/main/java/org/sejda/core/support/util/StringUtils.java b/sejda-core/src/main/java/org/sejda/core/support/util/StringUtils.java index 5f95d6425..084fa2ce1 100755 --- a/sejda-core/src/main/java/org/sejda/core/support/util/StringUtils.java +++ b/sejda-core/src/main/java/org/sejda/core/support/util/StringUtils.java @@ -16,6 +16,8 @@ */ package org.sejda.core.support.util; +import static java.lang.Character.*; + public final class StringUtils { private StringUtils() { // hide @@ -39,4 +41,37 @@ public static String asUnicodes(String in) { public static String normalizeLineEndings(String in) { return in.replaceAll("\\r\\n", "\n"); } + + public static String isolateRTLIfRequired(String s) { + if(isRtl(s)) { + return '\u2068' + s + '\u2069'; + } else { + return s; + } + } + + public static boolean isRtl(String string) { + if (string == null) { + return false; + } + + for (int i = 0, n = string.length(); i < n; ++i) { + byte d = Character.getDirectionality(string.charAt(i)); + + switch (d) { + case DIRECTIONALITY_RIGHT_TO_LEFT: + case DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: + case DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: + case DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: + return true; + + case DIRECTIONALITY_LEFT_TO_RIGHT: + case DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: + case DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: + return false; + } + } + + return false; + } } diff --git a/sejda-sambox/src/main/java/org/sejda/impl/sambox/component/excel/DataTable.java b/sejda-sambox/src/main/java/org/sejda/impl/sambox/component/excel/DataTable.java index 878288f23..9f3fad9dc 100644 --- a/sejda-sambox/src/main/java/org/sejda/impl/sambox/component/excel/DataTable.java +++ b/sejda-sambox/src/main/java/org/sejda/impl/sambox/component/excel/DataTable.java @@ -19,17 +19,17 @@ package org.sejda.impl.sambox.component.excel; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.TreeSet; +import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static java.lang.Character.*; import static org.apache.commons.lang3.StringUtils.rightPad; +import static org.sejda.core.support.util.StringUtils.isolateRTLIfRequired; public class DataTable { @@ -46,8 +46,14 @@ public DataTable(Collection pageNumbers) { this.pageNumbers.addAll(pageNumbers); } - public void addRow(String... dataRow) { - addRow(Arrays.asList(dataRow)); + public DataTable addRow(String... dataRow) { + List row = new ArrayList<>(); + for (String item: dataRow) { + row.add(item); + } + addRow(row); + + return this; } public void addRow(List dataRow) { @@ -61,16 +67,24 @@ public void addRows(List> dataRows) { public List headerRow() { return data.get(0); } + + public List headerRowIgnoreBlanks() { + return data.get(0).stream().filter(s -> !s.trim().isEmpty()).collect(Collectors.toList()); + } public boolean hasSameHeaderAs(DataTable other) { - String thisHeader = String.join("", this.headerRow()).trim(); - String otherHeader = String.join("", other.headerRow()).trim(); + String thisHeader = String.join("", this.headerRowIgnoreBlanks()).trim(); + String otherHeader = String.join("", other.headerRowIgnoreBlanks()).trim(); LOG.debug("Comparing header columns: '{}' and '{}'", thisHeader, otherHeader); return thisHeader.equalsIgnoreCase(otherHeader); } + + public boolean hasSameHeaderBlanksIgnoredAs(DataTable other) { + return this.headerRowIgnoreBlanks().equals(other.headerRowIgnoreBlanks()); + } - public boolean hasSameColumnsAs(DataTable other) { + public boolean hasSameColumnCountAs(DataTable other) { LOG.debug("Comparing header columns size: {} and {}", this.headerRow().size(), other.headerRow().size()); return other.headerRow().size() == this.headerRow().size(); } @@ -82,7 +96,7 @@ public List> getData() { public TreeSet getPageNumbers() { return pageNumbers; } - + public DataTable mergeWith(DataTable other) { TreeSet resultPageNumbers = new TreeSet<>(); resultPageNumbers.addAll(this.pageNumbers); @@ -190,6 +204,12 @@ public DataTable mergeColumns(int c1, int c2) { } return result; } + + public void addBlankColumn(int index) { + for(List row : this.data) { + row.add(index, ""); + } + } private static String getOrEmpty(List list, int index) { if (list.size() <= index) { @@ -228,7 +248,7 @@ public String toString() { String cellPadded = rightPad("", colWidths.get(j)); if(j < row.size()) { - cellPadded = ensureLtr(rightPad(row.get(j), colWidths.get(j))); + cellPadded = isolateRTLIfRequired(rightPad(row.get(j), colWidths.get(j))); } sb.append("|").append(cellPadded); @@ -242,38 +262,4 @@ public String toString() { return sb.toString(); } - - // TODO: ensure strings with mixed arabic and latin are displayed properly - private String ensureLtr(String s) { - if(isRtl(s)) { - return '\u200e' + s; - } else { - return s; - } - } - - public static boolean isRtl(String string) { - if (string == null) { - return false; - } - - for (int i = 0, n = string.length(); i < n; ++i) { - byte d = Character.getDirectionality(string.charAt(i)); - - switch (d) { - case DIRECTIONALITY_RIGHT_TO_LEFT: - case DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC: - case DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING: - case DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE: - return true; - - case DIRECTIONALITY_LEFT_TO_RIGHT: - case DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING: - case DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE: - return false; - } - } - - return false; - } } diff --git a/sejda-sambox/src/main/java/org/sejda/impl/sambox/component/excel/DataTableUtils.java b/sejda-sambox/src/main/java/org/sejda/impl/sambox/component/excel/DataTableUtils.java index f7807944d..f3a4baa2f 100644 --- a/sejda-sambox/src/main/java/org/sejda/impl/sambox/component/excel/DataTableUtils.java +++ b/sejda-sambox/src/main/java/org/sejda/impl/sambox/component/excel/DataTableUtils.java @@ -37,7 +37,11 @@ public static List mergeTablesSpanningMultiplePages(List d for (DataTable dt : dataTables) { if (current != null) { - if (current.hasSameColumnsAs(dt)) { + if (current.hasSameHeaderBlanksIgnoredAs(dt)) { + addBlankColumnsToMatchHeaders(current, dt); + } + + if (current.hasSameColumnCountAs(dt)) { current = current.mergeWith(dt); } else { results.add(current); @@ -62,6 +66,31 @@ public static List mergeComplementaryColumns(List dataTabl } return results; } + + public static void addBlankColumnsToMatchHeaders(DataTable a, DataTable b) { + if (!a.hasSameHeaderBlanksIgnoredAs(b)) { + throw new RuntimeException("Only works when tables have same headers (blanks ignored)"); + } + + List aHeaderRow = a.headerRow(); + List bHeaderRow = b.headerRow(); + int aa = 0, bb = 0; + while(aa < aHeaderRow.size() && bb < bHeaderRow.size()) { + String aCol = aHeaderRow.get(aa).trim(); + String bCol = bHeaderRow.get(bb).trim(); + + if (aCol.equals(bCol)) { + aa++; + bb++; + } else if(aCol.isEmpty()) { + b.addBlankColumn(bb); + } else if(bCol.isEmpty()) { + a.addBlankColumn(aa); + } else { + throw new RuntimeException("Should not happen"); + } + } + } static DataTable mergeComplementaryColumns(DataTable dataTable) { DataTable result = dataTable; diff --git a/sejda-sambox/src/test/java/org/sejda/impl/sambox/component/excel/DataTableTest.java b/sejda-sambox/src/test/java/org/sejda/impl/sambox/component/excel/DataTableTest.java index 226f62ede..f0d5e5839 100644 --- a/sejda-sambox/src/test/java/org/sejda/impl/sambox/component/excel/DataTableTest.java +++ b/sejda-sambox/src/test/java/org/sejda/impl/sambox/component/excel/DataTableTest.java @@ -82,19 +82,24 @@ public void testToStringWithArabic() { DataTable data = new DataTable(1); data.addRow("Word one longer header", "Word two"); data.addRow("Hello", "Goodbye", "1"); - // TODO: ensure strings with mixed arabic and latin are displayed properly - //data.addRow("مرحبا", "مرحبًا ABC 123", "وداعا"); + data.addRow("مرحبا", "مرحبًا ABC 123", "وداعا"); data.addRow("مرحبا", "مرحبًا", "وداعا"); + data.addRow("مرحبا", "", "وداعا"); - System.out.println(data.toString()); +// System.out.println(data.toString()); - String expected = "+-------------------------------------+\n" + - "|Word one longer header|Word two| |\n" + - "+-------------------------------------+\n" + - "|Hello |Goodbye |1 |\n" + - "+-------------------------------------+\n" + - "|\u200Eمرحبا |\u200Eمرحبًا |\u200Eوداعا|\n" + - "+-------------------------------------+"; + String expected = + "+-------------------------------------------+\n" + + "|Word one longer header|Word two | |\n" + + "+-------------------------------------------+\n" + + "|Hello |Goodbye |1 |\n" + + "+-------------------------------------------+\n" + + "|\u2068مرحبا \u2069|\u2068مرحبًا ABC 123\u2069|\u2068وداعا\u2069|\n" + + "+-------------------------------------------+\n" + + "|\u2068مرحبا \u2069|\u2068مرحبًا \u2069|\u2068وداعا\u2069|\n" + + "+-------------------------------------------+\n" + + "|\u2068مرحبا \u2069| |\u2068وداعا\u2069|\n" + + "+-------------------------------------------+"; assertThat(data.toString().trim(), is(expected.trim())); } @@ -133,6 +138,18 @@ public void mergeColumnsWhenUnevenRowLength() { assertThat(merged.getColumn(1), is(Arrays.asList("HeadB HeadC", "B1 C1", "B2", "", "B4 C4"))); } + @Test + public void addBlankColumn() { + DataTable dt = new DataTable(1); + dt.addRow("H1", "H2", "H3"); + dt.addRow("A1", "A2", "A3"); + + dt.addBlankColumn(1); + + assertThat(dt.getRow(0), is(Arrays.asList("H1", "", "H2", "H3"))); + assertThat(dt.getRow(1), is(Arrays.asList("A1", "", "A2", "A3"))); + } + @Test public void testPagesAsString() { assertThat(new DataTable(1).getPagesAsString(), is("Page 1")); diff --git a/sejda-sambox/src/test/java/org/sejda/impl/sambox/component/excel/DataTableUtilsTest.java b/sejda-sambox/src/test/java/org/sejda/impl/sambox/component/excel/DataTableUtilsTest.java index c1ee0022a..31bf06bf3 100644 --- a/sejda-sambox/src/test/java/org/sejda/impl/sambox/component/excel/DataTableUtilsTest.java +++ b/sejda-sambox/src/test/java/org/sejda/impl/sambox/component/excel/DataTableUtilsTest.java @@ -22,6 +22,7 @@ import static org.junit.Assert.assertThat; import java.util.Arrays; +import java.util.List; import org.junit.Test; @@ -73,4 +74,53 @@ public void mergeComplementaryColumnsThatNeedsMultiplePasses() { assertThat(merged.getColumn(2), is(Arrays.asList("H4", "D1", "E2", ""))); } -} \ No newline at end of file + + @Test + public void mergeWithAccountingBlankHeaders_scenario1() { + DataTable dt = new DataTable(1) + .addRow("H1", "H2", "H3") + .addRow("A1", "A2", "A3"); + + DataTable dt2 = new DataTable(3) + .addRow("H1", " ", "", "H2", "H3") + .addRow("C1", "CX", "CY", "C2", "C3"); + + List mergedList = DataTableUtils.mergeTablesSpanningMultiplePages(Arrays.asList(dt, dt2)); + assertThat(mergedList.size(), is(1)); + DataTable merged = mergedList.get(0); + + assertThat(merged.toString(), is("\n" + + "+--------------+\n" + + "|H1| | |H2|H3|\n" + + "+--------------+\n" + + "|A1| | |A2|A3|\n" + + "+--------------+\n" + + "|C1|CX|CY|C2|C3|\n" + + "+--------------+\n")); + + } + + @Test + public void mergeWithAccountingBlankHeaders_scenario2() { + DataTable dt = new DataTable(1) + .addRow("H1", "", "H2", "H3") + .addRow("A1", "AX", "A2", "A3"); + + DataTable dt2 = new DataTable(3) + .addRow("H1", "H2", "", "H3") + .addRow("C1", "C2", "CX", "C3"); + + List mergedList = DataTableUtils.mergeTablesSpanningMultiplePages(Arrays.asList(dt, dt2)); + assertThat(mergedList.size(), is(1)); + DataTable merged = mergedList.get(0); + + assertThat(merged.toString(), is("\n" + + "+--------------+\n" + + "|H1| |H2| |H3|\n" + + "+--------------+\n" + + "|A1|AX|A2| |A3|\n" + + "+--------------+\n" + + "|C1| |C2|CX|C3|\n" + + "+--------------+\n")); + } +}