Skip to content

Commit e98079e

Browse files
committed
TIKA-1997 -- initial poc, derived from rob975's work on: #267
1 parent 388097b commit e98079e

File tree

26 files changed

+583
-19
lines changed

26 files changed

+583
-19
lines changed

tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -875,13 +875,24 @@
875875
<glob pattern="*.p10"/>
876876
</mime-type>
877877

878-
<mime-type type="application/pkcs7-mime">
879-
<glob pattern="*.p7m"/>
878+
<mime-type type="application/pkcs7-signature">
879+
<glob pattern="*.p7s"/>
880+
<sub-class-of type="application/pkcs7-mime"/>
881+
</mime-type>
882+
883+
<mime-type type="application/pkcs7-mime; smime-type=certs-only">
880884
<glob pattern="*.p7c"/>
885+
<sub-class-of type="application/pkcs7-mime"/>
881886
</mime-type>
882887

883-
<mime-type type="application/pkcs7-signature">
884-
<glob pattern="*.p7s"/>
888+
<mime-type type="application/pkcs7-mime; smime-type=compressed-data">
889+
<glob pattern="*.p7z"/>
890+
<!-- has the same magic as application/timestamped-data -->
891+
<sub-class-of type="application/x-tika-compressed-pkc7-base"/>
892+
</mime-type>
893+
894+
<mime-type type="application/pkcs7-mime">
895+
<glob pattern="*.p7m"/>
885896
<magic priority="50">
886897
<!-- PEM encoded -->
887898
<match value="-----BEGIN PKCS7" type="string" offset="0"/>
@@ -911,6 +922,15 @@
911922

912923
<mime-type type="application/timestamped-data">
913924
<glob pattern="*.tsd"/>
925+
<!-- magic conflicts with some p7z; for now leave this here
926+
<magic priority="50">
927+
<match value="0x3080060B2A864886F7" type="string" offset="0"/>
928+
</magic> -->
929+
<sub-class-of type="application/x-tika-compressed-pkc7-base"/>
930+
</mime-type>
931+
932+
<mime-type type="application/x-tika-compressed-pkc7-base">
933+
<!-- magic conflicts with some p7z; for now leave this here -->
914934
<magic priority="50">
915935
<match value="0x3080060B2A864886F7" type="string" offset="0"/>
916936
</magic>
@@ -4847,6 +4867,7 @@
48474867
<mime-type type="application/x-pkcs12">
48484868
<glob pattern="*.p12"/>
48494869
<glob pattern="*.pfx"/>
4870+
<sub-class-of type="application/x-pkcs7"/>
48504871
</mime-type>
48514872
<mime-type type="application/x-pkcs7-certificates">
48524873
<glob pattern="*.p7b"/>

tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ public void testHttpServerFileExtensions() {
105105
assertEquals("application/pics-rules", tika.detect("x.prf"));
106106
assertEquals("application/pkcs10", tika.detect("x.p10"));
107107
assertEquals("application/pkcs7-mime", tika.detect("x.p7m"));
108-
assertEquals("application/pkcs7-mime", tika.detect("x.p7c"));
108+
assertEquals("application/pkcs7-mime; smime-type=certs-only", tika.detect("x.p7c"));
109109
assertEquals("application/pkcs7-signature", tika.detect("x.p7s"));
110110
assertEquals("application/pkix-cert", tika.detect("x.cer"));
111111
assertEquals("application/pkix-crl", tika.detect("x.crl"));
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,313 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.tika.detect.crypto;
18+
19+
import java.io.IOException;
20+
import java.io.InputStream;
21+
import java.util.HashMap;
22+
import java.util.Map;
23+
24+
import org.bouncycastle.asn1.ASN1Encodable;
25+
import org.bouncycastle.asn1.ASN1InputStream;
26+
import org.bouncycastle.asn1.ASN1Integer;
27+
import org.bouncycastle.asn1.ASN1ObjectIdentifier;
28+
import org.bouncycastle.asn1.ASN1OctetString;
29+
import org.bouncycastle.asn1.ASN1Primitive;
30+
import org.bouncycastle.asn1.ASN1Sequence;
31+
import org.bouncycastle.asn1.ASN1Set;
32+
import org.bouncycastle.asn1.ASN1TaggedObject;
33+
import org.bouncycastle.asn1.DLTaggedObject;
34+
35+
import org.apache.tika.config.Field;
36+
import org.apache.tika.detect.Detector;
37+
import org.apache.tika.io.BoundedInputStream;
38+
import org.apache.tika.metadata.Metadata;
39+
import org.apache.tika.mime.MediaType;
40+
41+
/**
42+
* This is a very limited asn1 detector that focuses on pkcs and timestamped-data (so far)
43+
*/
44+
public class ASN1Detector implements Detector {
45+
46+
private static final String DATA_OID = "1.2.840.113549.1.7.1";
47+
48+
private static final Map<String, String> ENVELOPED = Map.of("smime-type", "enveloped-data");
49+
private static final Map<String, String> SIGNED = Map.of("smime-type", "signed-data");
50+
private static final Map<String, String> CERTS_ONLY = Map.of("smime-type", "certs-only");
51+
private static final Map<String, String> COMPRESSED = Map.of("smime-type", "compressed-data");
52+
53+
54+
private static final long serialVersionUID = -8414458255467101503L;
55+
private static final MediaType PKCS12_MEDIA_TYPE = MediaType.application("x-pkcs12");
56+
private static final MediaType PKCS7_ENVELOPED = new MediaType("application", "pkcs7-mime", ENVELOPED);
57+
private static final MediaType PKCS7_SIGNED = new MediaType("application", "pkcs7-mime", SIGNED);
58+
private static final MediaType PKCS7_CERTS_ONLY = new MediaType("application", "pkcs7-mime", CERTS_ONLY);
59+
private static final MediaType PKCS7_COMPRESSED = new MediaType("application", "pkcs7-mime", COMPRESSED);
60+
private static final MediaType PKCS7_SIGNATURE_ONLY = MediaType.application("pkcs7-signature");
61+
62+
//not pkcs7 at all, but shares magic with compressed pkcs7
63+
private static final MediaType TIME_STAMPED_DATA = MediaType.application("timestamped-data");
64+
65+
private int markLimit = 1000000;
66+
67+
@Override
68+
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
69+
if (input == null) {
70+
return null;
71+
}
72+
try {
73+
input.mark(2);
74+
int b = input.read();
75+
if (b != 0x30) {
76+
return null;
77+
}
78+
b = input.read();
79+
if (b < 0x7A || b > 0x84) {
80+
return null;
81+
}
82+
} finally {
83+
input.reset();
84+
}
85+
PKCSFeatures pkcsFeatures = new PKCSFeatures();
86+
BoundedInputStream bis = new BoundedInputStream(markLimit, input);
87+
bis.mark(markLimit);
88+
try {
89+
ASN1InputStream asn1InputStream = new ASN1InputStream(bis);
90+
ASN1Primitive root = null;
91+
if ((root = asn1InputStream.readObject()) != null) {
92+
handleRootNode(root, pkcsFeatures);
93+
if (pkcsFeatures.primaryType == PKCSFeatures.PRIMARY_TYPE.TIME_STAMPED_DATA) {
94+
return TIME_STAMPED_DATA;
95+
} else if (pkcsFeatures.looksLikePKCS12) {
96+
return PKCS12_MEDIA_TYPE;
97+
} else if (pkcsFeatures.primaryType == PKCSFeatures.PRIMARY_TYPE.ENVELOPED_DATA) {
98+
return PKCS7_ENVELOPED;
99+
} else if (pkcsFeatures.primaryType == PKCSFeatures.PRIMARY_TYPE.COMPRESSED) {
100+
return PKCS7_COMPRESSED;
101+
} else if (pkcsFeatures.primaryType == PKCSFeatures.PRIMARY_TYPE.SIGNED_DATA) {
102+
if (pkcsFeatures.hasData) {
103+
return PKCS7_SIGNED;
104+
} else if (pkcsFeatures.hasCerts) {
105+
return PKCS7_CERTS_ONLY;
106+
} else {
107+
return PKCS7_SIGNATURE_ONLY;
108+
}
109+
}
110+
}
111+
} catch (IOException e) {
112+
e.printStackTrace();
113+
//swallow
114+
} finally {
115+
bis.reset();
116+
}
117+
return null;
118+
}
119+
120+
private void handleRootNode(ASN1Primitive root, PKCSFeatures pkcsFeatures) throws IOException {
121+
String oid = null;
122+
ASN1TaggedObject taggedObject = null;
123+
if (!(root instanceof ASN1Sequence)) {
124+
return;
125+
}
126+
ASN1Sequence seq = (ASN1Sequence) root;
127+
//try for pkcs12
128+
if (seq.size() == 3) {
129+
tryPKCS12(seq, pkcsFeatures);
130+
if (pkcsFeatures.looksLikePKCS12) {
131+
return;
132+
}
133+
}
134+
for (ASN1Encodable c : ((ASN1Sequence) root)) {
135+
if (c instanceof ASN1ObjectIdentifier) {
136+
oid = ((ASN1ObjectIdentifier) c).toString();
137+
} else if (c instanceof ASN1TaggedObject) {
138+
taggedObject = (ASN1TaggedObject) c;
139+
}
140+
}
141+
PKCSFeatures.PRIMARY_TYPE type = PKCSFeatures.lookup(oid);
142+
pkcsFeatures.primaryType = type;
143+
if (type == PKCSFeatures.PRIMARY_TYPE.UNKNOWN) {
144+
return;
145+
} else if (type == PKCSFeatures.PRIMARY_TYPE.TIME_STAMPED_DATA) {
146+
return;
147+
}
148+
if (taggedObject != null) {
149+
handleNode(taggedObject, pkcsFeatures);
150+
}
151+
}
152+
153+
private void tryPKCS12(ASN1Sequence seq, ASN1Detector.PKCSFeatures pkcsFeatures) {
154+
//This could much more rigorous -- see TIKA-3784
155+
156+
//require version 3 as the first value
157+
ASN1Encodable obj0 = seq.getObjectAt(0);
158+
if (! (obj0 instanceof ASN1Integer)) {
159+
return;
160+
}
161+
if (((ASN1Integer)obj0).getValue().intValue() != 3) {
162+
return;
163+
}
164+
//require two sequences
165+
if (! (seq.getObjectAt(1) instanceof ASN1Sequence) ||
166+
! (seq.getObjectAt(2) instanceof ASN1Sequence)) {
167+
return;
168+
}
169+
//first sequence must have a data type oid as its first element
170+
ASN1Sequence seq1 = (ASN1Sequence) seq.getObjectAt(1);
171+
if (seq1.size() < 2) {
172+
return;
173+
}
174+
if (! (seq1.getObjectAt(0) instanceof ASN1ObjectIdentifier)) {
175+
return;
176+
}
177+
if (! DATA_OID.equals(((ASN1ObjectIdentifier)seq1.getObjectAt(0)).getId())) {
178+
return;
179+
}
180+
//and a tagged object as its second
181+
//if you parse the tagged object and iterate through its children
182+
//you should eventually find oids starting with "1.2.840.113549.1.12.*"
183+
if (! (seq1.getObjectAt(1) instanceof DLTaggedObject)) {
184+
return;
185+
}
186+
pkcsFeatures.looksLikePKCS12 = true;
187+
}
188+
189+
private void handleSequence(ASN1Sequence seq, PKCSFeatures pkcsFeatures) throws IOException {
190+
if (seq.size() == 0) {
191+
return;
192+
}
193+
if (isCert(seq)) {
194+
pkcsFeatures.hasCerts = true;
195+
return;
196+
}
197+
if (hasSignedData(seq)) {
198+
pkcsFeatures.hasData = true;
199+
return;
200+
}
201+
202+
203+
}
204+
205+
private boolean isCert(ASN1Sequence seq) {
206+
if (seq.size() != 6) {
207+
return false;
208+
}
209+
//do more
210+
//e.g. check for sequence in seq.get(2) and make sure there's a data oid there
211+
return true;
212+
}
213+
214+
private boolean hasSignedData(ASN1Sequence seq) {
215+
if (seq.size() != 5) {
216+
return false;
217+
}
218+
//data should be a sequence in position 2
219+
ASN1Encodable dataSequence = seq.getObjectAt(2);
220+
if (! (dataSequence instanceof ASN1Sequence)) {
221+
return false;
222+
}
223+
if (((ASN1Sequence) dataSequence).size() < 1) {
224+
return false;
225+
}
226+
ASN1Encodable obj0 = ((ASN1Sequence) dataSequence).getObjectAt(0);
227+
if (obj0 instanceof ASN1ObjectIdentifier) {
228+
if (DATA_OID.equals(((ASN1ObjectIdentifier) obj0).getId())) {
229+
//TODO -- check for null or actual data?
230+
if (((ASN1Sequence) dataSequence).size() > 1) {
231+
return true;
232+
}
233+
}
234+
}
235+
return false;
236+
}
237+
238+
private void handleNode(ASN1Primitive primitive, PKCSFeatures pkcsFeatures) throws IOException {
239+
if (primitive instanceof ASN1Sequence) {
240+
handleSequence((ASN1Sequence) primitive, pkcsFeatures);
241+
} else if (primitive instanceof ASN1TaggedObject) {
242+
handleTagged((ASN1TaggedObject) primitive, pkcsFeatures);
243+
} else if (primitive instanceof ASN1OctetString) {
244+
ASN1OctetString octetString = (ASN1OctetString) primitive;
245+
try {
246+
ASN1Primitive newP = ASN1Primitive.fromByteArray(octetString.getOctets());
247+
handleNode(newP, pkcsFeatures);
248+
} catch (IOException e) {
249+
//swallow
250+
251+
}
252+
} else if (primitive instanceof ASN1ObjectIdentifier) {
253+
ASN1ObjectIdentifier oid = (ASN1ObjectIdentifier) primitive;
254+
255+
} else if (primitive instanceof ASN1Set) {
256+
for (ASN1Encodable obj : ((ASN1Set)primitive)) {
257+
handleNode(obj.toASN1Primitive(), pkcsFeatures);
258+
}
259+
}
260+
}
261+
262+
private void handleTagged(ASN1TaggedObject tagged, PKCSFeatures pkcsFeatures) throws IOException {
263+
handleNode(tagged.getBaseObject().toASN1Primitive(), pkcsFeatures);
264+
}
265+
266+
@Field
267+
public void setMarkLimit(int markLimit) {
268+
this.markLimit = markLimit;
269+
}
270+
271+
private static class PKCSFeatures {
272+
enum PRIMARY_TYPE {
273+
SIGNED_DATA("1.2.840.113549.1.7.2"), ENVELOPED_DATA("1.2.840.113549.1.7.3"),
274+
SIGNED_AND_ENVELOPED_DATA("1.2.840.113549.1.7.4"),
275+
DIGESTED_DATA("1.2.840.113549.1.7.5"),
276+
ENCRYPTED_DATA("1.2.840.113549.1.7.6"), COMPRESSED("1.2.840.113549.1.9.16.1.9"),
277+
TIME_STAMPED_DATA("1.2.840.113549.1.9.16.1.31"), UNKNOWN("UNKNOWN");
278+
private final String oid;
279+
280+
PRIMARY_TYPE(String oid) {
281+
this.oid = oid;
282+
}
283+
}
284+
285+
private static Map<String, PRIMARY_TYPE> TYPE_LOOKUP = new HashMap<>();
286+
static {
287+
for (PRIMARY_TYPE t : PRIMARY_TYPE.values()) {
288+
if (t == PRIMARY_TYPE.UNKNOWN) {
289+
continue;
290+
}
291+
TYPE_LOOKUP.put(t.oid, t);
292+
}
293+
}
294+
private PRIMARY_TYPE primaryType = PRIMARY_TYPE.UNKNOWN;
295+
private boolean hasData;
296+
private boolean hasCerts;
297+
private boolean hasSignature;
298+
private boolean looksLikePKCS12;
299+
300+
static PRIMARY_TYPE lookup(String oid) {
301+
if (TYPE_LOOKUP.containsKey(oid)) {
302+
return TYPE_LOOKUP.get(oid);
303+
}
304+
return PRIMARY_TYPE.UNKNOWN;
305+
}
306+
307+
@Override
308+
public String toString() {
309+
return "PKCSFeatures{" + "primaryType=" + primaryType + ", hasData=" + hasData + ", hasCerts=" + hasCerts + ", hasSignature=" + hasSignature + ", hasPKCS12Oid=" +
310+
looksLikePKCS12 + '}';
311+
}
312+
}
313+
}

0 commit comments

Comments
 (0)