From 94bebf7436200804c923b839d90418b9a4b0fa56 Mon Sep 17 00:00:00 2001 From: "Rafael C. Carrasco" Date: Sat, 23 Nov 2013 01:26:54 +0100 Subject: [PATCH] Improved treatment of whitespace --- pom.xml | 2 +- src/main/resources/replacements.txt | 2 + .../eu/digitisation/io/CharFilterTest.java | 74 +++++++++++++++++++ .../eu/digitisation/io/TextContentTest.java | 41 +++++----- 4 files changed, 98 insertions(+), 21 deletions(-) create mode 100644 src/test/java/eu/digitisation/io/CharFilterTest.java diff --git a/pom.xml b/pom.xml index 171198b..ddd5338 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ eu.digitisation ocrevalUAtion ocrevalUAtion - 0.9-SNAPSHOT + 0.91 jar OCR Evaluation Tool diff --git a/src/main/resources/replacements.txt b/src/main/resources/replacements.txt index d35c2ad..034861f 100644 --- a/src/main/resources/replacements.txt +++ b/src/main/resources/replacements.txt @@ -1,3 +1,5 @@ +2028 0020 +2029 0020 F1AC 003B EFA1 00E6 EEC4 0063006B diff --git a/src/test/java/eu/digitisation/io/CharFilterTest.java b/src/test/java/eu/digitisation/io/CharFilterTest.java new file mode 100644 index 0000000..1eacabd --- /dev/null +++ b/src/test/java/eu/digitisation/io/CharFilterTest.java @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2013 IMPACT Centre of Competence + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +package eu.digitisation.io; + +import java.io.File; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.file.Paths; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import static org.junit.Assert.*; + +/** + * + * @author rafa + */ +public class CharFilterTest { + + public CharFilterTest() { + } + + @BeforeClass + public static void setUpClass() { + } + + @AfterClass + public static void tearDownClass() { + } + + @Before + public void setUp() { + } + + @After + public void tearDown() { + } + + /** + * Test of translate method, of class CharFilter. + * @throws java.net.URISyntaxException + */ + @Test + public void testTranslate_String() throws URISyntaxException { + System.out.println("translate"); + URL resourceUrl = getClass().getResource("/replacements.txt"); + File file = Paths.get(resourceUrl.toURI()).toFile(); + CharFilter filter = new CharFilter(file); + String s = "a\u2028"; + String expResult = "a "; + String result = filter.translate(s); + assertEquals(expResult.length(), result.length()); + assertEquals(expResult, result); + } + +} diff --git a/src/test/java/eu/digitisation/io/TextContentTest.java b/src/test/java/eu/digitisation/io/TextContentTest.java index d938a16..e1ee879 100644 --- a/src/test/java/eu/digitisation/io/TextContentTest.java +++ b/src/test/java/eu/digitisation/io/TextContentTest.java @@ -18,11 +18,12 @@ package eu.digitisation.io; import java.io.File; -import java.io.PrintWriter; +import java.net.URISyntaxException; import java.net.URL; import java.nio.file.Paths; import org.junit.After; import org.junit.AfterClass; +import static org.junit.Assert.assertEquals; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; @@ -32,41 +33,41 @@ * @author carrasco@ua.es */ public class TextContentTest { - + public TextContentTest() { } - + @BeforeClass public static void setUpClass() { } - + @AfterClass public static void tearDownClass() { } - + @Before public void setUp() { } - + @After public void tearDown() { } /** - * Test of getText method, of class Text. - * @throws java.lang.Exception + * Test of toString method, of class TextContent. + * @throws java.net.URISyntaxException */ @Test - public void testGetText() throws Exception { - System.out.println("getText"); - URL inURL = getClass().getResource("/00445310.xml"); - File ifile = Paths.get(inURL.toURI()).toFile(); - URL outURL = getClass().getResource("/00445310.txt"); - File ofile = Paths.get(outURL.toURI()).toFile(); - TextContent instance = new TextContent(ifile, "utf-8", null); - try (PrintWriter writer = new PrintWriter(ofile)) { - String result = instance.toString(); - writer.write(result); - } + public void testToString() throws URISyntaxException { + System.out.println("toString"); + URL resourceUrl = getClass().getResource("/replacements.txt"); + File file = Paths.get(resourceUrl.toURI()).toFile(); + CharFilter filter = new CharFilter(file); + String s = "hola " + "\n" + " y\u2028 de todo\n"; + TextContent instance = new TextContent(s, filter); + String expResult = "hola y de todo"; + String result = instance.toString(); + assertEquals(expResult.length(), result.length()); + assertEquals(expResult, result); } -} \ No newline at end of file +}