Skip to content

Commit

Permalink
[TIKA-1800] Decode the escape characters in front of special characters
Browse files Browse the repository at this point in the history
  • Loading branch information
longphan98 committed May 25, 2022
1 parent 13cf016 commit 9150753
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 3 deletions.
15 changes: 12 additions & 3 deletions tika-core/src/main/java/org/apache/tika/mime/MediaType.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,15 @@ public final class MediaType implements Comparable<MediaType>, Serializable {
*/
private static final long serialVersionUID = -3831000556189036392L;

private static final Pattern SPECIAL = Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=]");
private static final Pattern SPECIAL = Pattern.compile("[()<>@,;:\\\\\"/\\[\\]?=]");

private static final Pattern SPECIAL_OR_WHITESPACE =
Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]");
Pattern.compile("[()<>@,;:\\\\\"/\\[\\]?=\\s]");

/**
* See http://www.ietf.org/rfc/rfc2045.txt for valid mime-type characters.
*/
private static final String VALID_CHARS = "([^\\c\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]+)";
private static final String VALID_CHARS = "([^\\c\\()<>@,;:\\\\\"/\\[\\]?=\\s]+)";

private static final Pattern TYPE_PATTERN =
Pattern.compile("(?s)\\s*" + VALID_CHARS + "\\s*/\\s*" + VALID_CHARS + "\\s*($|;.*)");
Expand Down Expand Up @@ -331,6 +331,7 @@ private static Map<String, String> parseParameters(String string) {
/**
* Fuzzy unquoting mechanism that works also with somewhat malformed
* quotes.
* TIKA-1800: also removes the escape characters that precede special characters, so the result can be reused (e.g. passed on to a new MediaType as parameters).
*
* @param s string to unquote
* @return unquoted string
Expand All @@ -342,6 +343,14 @@ private static String unquote(String s) {
while (s.endsWith("\"") || s.endsWith("'")) {
s = s.substring(0, s.length() - 1);
}
for (int i = 0; i < s.length() - 1; i++) {
if (s.charAt(i) == '\\' && !('0' <= s.charAt(i + 1) && s.charAt(i + 1) <= '9') &&
!('a' <= s.charAt(i + 1) && s.charAt(i + 1) <= 'z') &&
s.charAt(i + 1) != '-' && s.charAt(i + 1) != '+' &&
s.charAt(i + 1) != '.' && s.charAt(i + 1) != '_') {
s = s.substring(0, i) + s.substring(i + 1);
}
}
return s;
}

Expand Down
13 changes: 13 additions & 0 deletions tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,26 @@
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.tika.exception.TikaException;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;

public class MediaTypeTest {

// TIKA-1800
/**
 * Round-trips a media type whose parameter value contains special characters
 * ('#' and '?') through {@code toString()} and {@code MediaType.parse(...)},
 * and checks that the parameter value read back equals the original.
 */
@Test
public void testEscapedSpecialChar() {
    final String expectedValue = "#report?";
    MediaType original = new MediaType(MediaType.APPLICATION_XML, "x-report", expectedValue);
    // Serialize, then parse the serialized form back into a MediaType.
    MediaType reparsed = MediaType.parse(original.toString());
    assertEquals(expectedValue, reparsed.getParameters().get("x-report"));
}

@Test
public void testBasics() {
assertEquals("application/octet-stream",
Expand Down

0 comments on commit 9150753

Please sign in to comment.