Skip to content

Commit 78a2a27

Browse files
d050150ilayaperumalg
authored andcommitted
GH-1689 Handle StringIndexOutOfBoundsException in PagePdfDocumentReader
- Add test coverage to TextLine - Use char[] instead of String for TextLine - Optimise index handling when reading text lines Resolves #1689
1 parent 865d429 commit 78a2a27

File tree

4 files changed

+176
-36
lines changed

4 files changed

+176
-36
lines changed

document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/layout/TextLine.java

Lines changed: 26 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -16,29 +16,38 @@
1616

1717
package org.springframework.ai.reader.pdf.layout;
1818

19+
import java.util.Arrays;
20+
21+
/*
22+
* @author Soby Chacko
23+
* @author Tibor Tarnai
24+
*/
25+
1926
class TextLine {
2027

2128
private static final char SPACE_CHARACTER = ' ';
2229

23-
private int lineLength;
30+
private final int lineLength;
2431

25-
private String line;
32+
private final char[] line;
2633

2734
private int lastIndex;
2835

2936
TextLine(int lineLength) {
30-
this.line = "";
37+
if (lineLength < 0) {
38+
throw new IllegalArgumentException("Line length cannot be negative");
39+
}
3140
this.lineLength = lineLength / ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT;
32-
this.completeLineWithSpaces();
41+
this.line = new char[this.lineLength];
42+
Arrays.fill(this.line, SPACE_CHARACTER);
3343
}
3444

3545
public void writeCharacterAtIndex(final Character character) {
3646
character.setIndex(this.computeIndexForCharacter(character));
3747
int index = character.getIndex();
3848
char characterValue = character.getCharacterValue();
39-
if (this.indexIsInBounds(index) && this.line.charAt(index) == SPACE_CHARACTER) {
40-
this.line = this.line.substring(0, index) + characterValue
41-
+ this.line.substring(index + 1, this.getLineLength());
49+
if (this.indexIsInBounds(index) && this.line[index] == SPACE_CHARACTER) {
50+
this.line[index] = characterValue;
4251
}
4352
}
4453

@@ -47,7 +56,7 @@ public int getLineLength() {
4756
}
4857

4958
public String getLine() {
50-
return this.line;
59+
return new String(this.line);
5160
}
5261

5362
private int computeIndexForCharacter(final Character character) {
@@ -64,7 +73,7 @@ private int computeIndexForCharacter(final Character character) {
6473
index = this.findMinimumIndexWithSpaceCharacterFromIndex(index);
6574
}
6675
else if (isCharacterCloseToPreviousWord) {
67-
if (this.line.charAt(index) != SPACE_CHARACTER) {
76+
if (this.line[index] != SPACE_CHARACTER) {
6877
index = index + 1;
6978
}
7079
else {
@@ -76,52 +85,36 @@ else if (isCharacterCloseToPreviousWord) {
7685
}
7786
}
7887

79-
private boolean isSpaceCharacterAtIndex(int index) {
80-
return this.line.charAt(index) != SPACE_CHARACTER;
88+
private boolean isNotSpaceCharacterAtIndex(int index) {
89+
return this.line[index] != SPACE_CHARACTER;
8190
}
8291

8392
private boolean isNewIndexGreaterThanLastIndex(int index) {
84-
int lastIndex = this.getLastIndex();
85-
return (index > lastIndex);
93+
return index > this.lastIndex;
8694
}
8795

8896
private int getNextValidIndex(int index, boolean isCharacterPartOfPreviousWord) {
8997
int nextValidIndex = index;
90-
int lastIndex = this.getLastIndex();
9198
if (!this.isNewIndexGreaterThanLastIndex(index)) {
92-
nextValidIndex = lastIndex + 1;
99+
nextValidIndex = this.lastIndex + 1;
93100
}
94-
if (!isCharacterPartOfPreviousWord && this.isSpaceCharacterAtIndex(index - 1)) {
101+
if (!isCharacterPartOfPreviousWord && index > 0 && this.isNotSpaceCharacterAtIndex(index - 1)) {
95102
nextValidIndex = nextValidIndex + 1;
96103
}
97-
this.setLastIndex(nextValidIndex);
104+
this.lastIndex = nextValidIndex;
98105
return nextValidIndex;
99106
}
100107

101108
private int findMinimumIndexWithSpaceCharacterFromIndex(int index) {
102109
int newIndex = index;
103-
while (newIndex >= 0 && this.line.charAt(newIndex) == SPACE_CHARACTER) {
110+
while (newIndex >= 0 && this.line[newIndex] == SPACE_CHARACTER) {
104111
newIndex = newIndex - 1;
105112
}
106113
return newIndex + 1;
107114
}
108115

109116
private boolean indexIsInBounds(int index) {
110-
return (index >= 0 && index < this.lineLength);
111-
}
112-
113-
private void completeLineWithSpaces() {
114-
for (int i = 0; i < this.getLineLength(); ++i) {
115-
this.line += SPACE_CHARACTER;
116-
}
117-
}
118-
119-
private int getLastIndex() {
120-
return this.lastIndex;
121-
}
122-
123-
private void setLastIndex(int lastIndex) {
124-
this.lastIndex = lastIndex;
117+
return index >= 0 && index < this.lineLength;
125118
}
126119

127120
}

document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/PagePdfDocumentReaderTests.java

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,12 @@
2929

3030
/**
3131
* @author Christian Tzolov
32+
* @author Tibor Tarnai
3233
*/
33-
public class PagePdfDocumentReaderTests {
34+
class PagePdfDocumentReaderTests {
3435

3536
@Test
36-
public void classpathRead() {
37+
void classpathRead() {
3738

3839
PagePdfDocumentReader pdfReader = new PagePdfDocumentReader("classpath:/sample1.pdf",
3940
PdfDocumentReaderConfig.builder()
@@ -51,10 +52,22 @@ public void classpathRead() {
5152

5253
assertThat(docs).hasSize(4);
5354

54-
String allText = docs.stream().map(d -> d.getContent()).collect(Collectors.joining(System.lineSeparator()));
55+
String allText = docs.stream().map(Document::getContent).collect(Collectors.joining(System.lineSeparator()));
5556

5657
assertThat(allText).doesNotContain(
5758
List.of("Page 1 of 4", "Page 2 of 4", "Page 3 of 4", "Page 4 of 4", "PDF Bookmark Sample"));
5859
}
5960

61+
@Test
62+
void testIndexOutOfBound() {
63+
var documents = new PagePdfDocumentReader("classpath:/sample2.pdf",
64+
PdfDocumentReaderConfig.builder()
65+
.withPageExtractedTextFormatter(ExtractedTextFormatter.builder().build())
66+
.withPagesPerDocument(1)
67+
.build())
68+
.get();
69+
70+
assertThat(documents).hasSize(64);
71+
}
72+
6073
}
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
/*
2+
* Copyright 2023-2024 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package org.springframework.ai.reader.pdf.layout;
18+
19+
import java.util.stream.Stream;
20+
21+
import org.junit.jupiter.api.Test;
22+
import org.junit.jupiter.params.ParameterizedTest;
23+
import org.junit.jupiter.params.provider.Arguments;
24+
import org.junit.jupiter.params.provider.MethodSource;
25+
26+
import static org.junit.jupiter.api.Assertions.assertEquals;
27+
import static org.junit.jupiter.api.Assertions.assertThrows;
28+
29+
/*
30+
* @author Tibor Tarnai
31+
*/
32+
33+
class TextLineTest {
34+
35+
public static Stream<Arguments> testWriteCharacterAtIndexValidIndex() {
36+
return Stream.of(Arguments.of(new Character('A', 0, false, false, false, false)),
37+
Arguments.of(new Character('A', 10, true, false, false, false)),
38+
Arguments.of(new Character('A', 0, false, true, false, false)));
39+
}
40+
41+
@ParameterizedTest
42+
@MethodSource
43+
void testWriteCharacterAtIndexValidIndex(Character character) {
44+
TextLine textLine = new TextLine(100);
45+
textLine.writeCharacterAtIndex(character);
46+
assertEquals(" A" + " ".repeat(23), textLine.getLine());
47+
}
48+
49+
@Test
50+
void testWriteCharacterAtIndex_PartOfPreviousWord() {
51+
TextLine textLine = new TextLine(100);
52+
Character character = new Character('A', 10, true, false, false, false);
53+
textLine.writeCharacterAtIndex(character);
54+
assertEquals(" A" + " ".repeat(23), textLine.getLine());
55+
}
56+
57+
@Test
58+
void testWriteCharacterAtIndex_BeginningOfNewLine() {
59+
TextLine textLine = new TextLine(100);
60+
Character character = new Character('A', 0, false, true, false, false);
61+
textLine.writeCharacterAtIndex(character);
62+
assertEquals(" A" + " ".repeat(23), textLine.getLine());
63+
}
64+
65+
@Test
66+
void testWriteCharacterAtIndex_InvalidIndex() {
67+
TextLine textLine = new TextLine(100);
68+
Character character = new Character('A', 150, false, false, false, false);
69+
textLine.writeCharacterAtIndex(character);
70+
assertEquals(" ".repeat(25), textLine.getLine());
71+
}
72+
73+
@Test
74+
void testWriteCharacterAtIndex_NegativeIndex() {
75+
TextLine textLine = new TextLine(100);
76+
Character character = new Character('A', -1, false, false, false, false);
77+
textLine.writeCharacterAtIndex(character);
78+
assertEquals(" ".repeat(25), textLine.getLine());
79+
}
80+
81+
@Test
82+
void testWriteCharacterAtIndex_SpaceCharacter() {
83+
TextLine textLine = new TextLine(100);
84+
Character character = new Character('A', 10, false, false, false, false);
85+
textLine.writeCharacterAtIndex(character);
86+
assertEquals(" ".repeat(10) + "A" + " ".repeat(14), textLine.getLine());
87+
}
88+
89+
@Test
90+
void testWriteCharacterAtIndex_CloseToPreviousWord() {
91+
TextLine textLine = new TextLine(100);
92+
Character character = new Character('A', 10, false, false, true, false);
93+
textLine.writeCharacterAtIndex(character);
94+
assertEquals(" ".repeat(10) + "A" + " ".repeat(14), textLine.getLine());
95+
}
96+
97+
@Test
98+
void testGetLineLength() {
99+
TextLine textLine = new TextLine(100);
100+
assertEquals(100 / ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT, textLine.getLineLength());
101+
}
102+
103+
@Test
104+
void testGetLine() {
105+
TextLine textLine = new TextLine(100);
106+
assertEquals(" ".repeat(100 / ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT),
107+
textLine.getLine());
108+
}
109+
110+
@Test
111+
void testNegativeLineLength() {
112+
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, () -> new TextLine(-100));
113+
assertEquals("Line length cannot be negative", exception.getMessage());
114+
}
115+
116+
@Test
117+
void testComputeIndexForCharacter_CloseToPreviousWord() {
118+
TextLine textLine = new TextLine(100);
119+
Character character = new Character('A', 10, true, false, true, true);
120+
textLine.writeCharacterAtIndex(character);
121+
assertEquals(" A" + " ".repeat(23), textLine.getLine());
122+
}
123+
124+
@Test
125+
void testComputeIndexForCharacter_CloseToPreviousWord_WriteTwoCharacters() {
126+
TextLine textLine = new TextLine(100);
127+
Character character = new Character('A', 10, true, false, true, true);
128+
Character anotherCharacter = new Character('B', 1, true, false, true, true);
129+
textLine.writeCharacterAtIndex(character);
130+
textLine.writeCharacterAtIndex(anotherCharacter);
131+
assertEquals(" AB" + " ".repeat(22), textLine.getLine());
132+
}
133+
134+
}
3.93 MB
Binary file not shown.

0 commit comments

Comments
 (0)