From e2b41ec9180584c8ba94ec68d9ca30987efdfa18 Mon Sep 17 00:00:00 2001 From: Tim Allison Date: Thu, 5 Dec 2024 12:09:14 -0500 Subject: [PATCH] TIKA-4362 -- expand coverage of message classes for MAPI (#2076) * TIKA-4362 -- expand coverage of message classes for MAPI --- .../java/org/apache/tika/metadata/MAPI.java | 20 +- .../parser/microsoft/OutlookExtractor.java | 297 ++++++++++-------- .../microsoft/pst/PSTMailItemParser.java | 4 +- .../resources/mapi_message_classes.properties | 37 +++ 4 files changed, 212 insertions(+), 146 deletions(-) create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/mapi_message_classes.properties diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java index 2cf41c7e0d..57b46307f6 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java @@ -17,13 +17,10 @@ package org.apache.tika.metadata; /** - * Office Document properties collection. These properties apply to - * Office / Productivity Documents of all forms, including (but not limited - * to) MS Office and OpenDocument formats. - * This is a logical collection of properties, which may be drawn from a - * few different external definitions. * - * @since Apache Tika 1.2 + * Properties that typically appear in MSG/PST message format files. + * + * @since Apache Tika 4.0 */ public interface MAPI { @@ -31,10 +28,17 @@ public interface MAPI { /** * MAPI message class. What type of .msg/MAPI file is it? + * This is normalized via "mapi_message_classes.properties" */ Property MESSAGE_CLASS = - Property.internalClosedChoise(PREFIX_MAPI_META + "message-class", "APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE", - "POST", "TASK", "UNKNOWN", "UNSPECIFIED"); + Property.internalText(PREFIX_MAPI_META + "message-class"); + + /** + * MAPI message class. 
What type of .msg/MAPI file is it? + * This is the raw value that is retrieved from the underlying chunk + */ + Property MESSAGE_CLASS_RAW = + Property.internalText(PREFIX_MAPI_META + "message-class-raw"); Property SENT_BY_SERVER_TYPE = Property.internalText(PREFIX_MAPI_META + "sent-by-server-type"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index 8f381e9238..b9c14c1153 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -18,7 +18,9 @@ import static java.nio.charset.StandardCharsets.UTF_8; +import java.io.BufferedReader; import java.io.IOException; +import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; @@ -106,6 +108,8 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { private static final Map LITERAL_TIME_PROPERTIES = new HashMap<>(); + private static final Map MESSAGE_CLASSES = new LinkedHashMap<>(); + static { for (MAPIProperty property : LITERAL_TIME_MAPI_PROPERTIES) { String name = property.mapiProperty.toLowerCase(Locale.ROOT); @@ -115,7 +119,30 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { Property tikaProp = Property.internalDate(name); LITERAL_TIME_PROPERTIES.put(property, tikaProp); } + + try (BufferedReader r = new BufferedReader( + new InputStreamReader( + 
OutlookExtractor.class.getResourceAsStream("/mapi_message_classes.properties"), UTF_8))) { + String line = r.readLine(); + while (line != null) { + if (line.isBlank() || line.startsWith("#")) { + line = r.readLine(); + continue; + } + String[] cols = line.split("\\s+"); + String lcKey = cols[0].toLowerCase(Locale.ROOT); + String value = cols[1]; + if (MESSAGE_CLASSES.containsKey(lcKey)) { + throw new IllegalArgumentException("Can't have duplicate keys: " + lcKey); + } + MESSAGE_CLASSES.put(lcKey, value); + line = r.readLine(); + } + } catch (IOException e) { + throw new IllegalStateException("can't find mapi_message_classes.properties?!"); + } } + //this according to the spec; in practice, it is probably more likely //that a "split field" fails to start with a space character than //that a real header contains anything but [-_A-Za-z0-9]. @@ -153,134 +180,115 @@ public static void addEvenIfNull(Property property, String value, Metadata metad } private static void setFirstChunk(List chunks, Property property, Metadata metadata) { - if (chunks == null || chunks.size() < 1 || chunks.get(0) == null) { + if (chunks == null || chunks.isEmpty() || chunks.get(0) == null) { return; } metadata.set(property, chunks.get(0).toString()); } - private static void addFirstChunk(List chunks, Property property, Metadata metadata) { - if (chunks == null || chunks.size() < 1 || chunks.get(0) == null) { - return; - } - metadata.add(property, chunks.get(0).toString()); - } - - //Still needed by PSTParser - public static String getMessageClass(String messageClass) { - if (messageClass == null || messageClass.trim().length() == 0) { + public static String getNormalizedMessageClass(String messageClass) { + if (messageClass == null || messageClass.isBlank()) { return "UNSPECIFIED"; - } else if (messageClass.equalsIgnoreCase("IPM.Note")) { - return "NOTE"; - } else if (messageClass.equalsIgnoreCase("IPM.Contact")) { - return "CONTACT"; - } else if 
(messageClass.equalsIgnoreCase("IPM.Appointment")) { - return "APPOINTMENT"; - } else if (messageClass.equalsIgnoreCase("IPM.StickyNote")) { - return "STICKY_NOTE"; - } else if (messageClass.equalsIgnoreCase("IPM.Task")) { - return "TASK"; - } else if (messageClass.equalsIgnoreCase("IPM.Post")) { - return "POST"; - } else { - return "UNKNOWN"; } + String lc = messageClass.toLowerCase(Locale.ROOT); + if (MESSAGE_CLASSES.containsKey(lc)) { + return MESSAGE_CLASSES.get(lc); + } + return "UNKNOWN"; } public void parse(XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException { try { - msg.setReturnNullOnMissingChunk(true); + _parse(xhtml); + } catch (ChunkNotFoundException e) { + throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", + e); + } /*finally { + //You'd think you'd want to call msg.close(). + //Don't do that. That closes down the file system. + //If an msg has multiple msg attachments, some of them + //can reside in the same file system. 
After the first + //child is read, the fs is closed, and the other children + //get a java.nio.channels.ClosedChannelException + }*/ + } - try { - parentMetadata.set(MAPI.MESSAGE_CLASS, msg.getMessageClassEnum().name()); - } catch (ChunkNotFoundException e) { - //swallow - } + private void _parse(XHTMLContentHandler xhtml) throws TikaException, SAXException, + IOException, ChunkNotFoundException { + msg.setReturnNullOnMissingChunk(true); - // If the message contains strings that aren't stored - // as Unicode, try to sort out an encoding for them - if (msg.has7BitEncodingStrings()) { - guess7BitEncoding(msg); - } + // If the message contains strings that aren't stored + // as Unicode, try to sort out an encoding for them + if (msg.has7BitEncodingStrings()) { + guess7BitEncoding(msg); + } - // Start with the metadata - Map headers = normalizeHeaders(msg.getHeaders()); + // Start with the metadata + Map headers = normalizeHeaders(msg.getHeaders()); - handleFromTo(headers, parentMetadata); - handleMessageInfo(msg, headers, parentMetadata); + handleFromTo(headers, parentMetadata); + handleMessageInfo(msg, headers, parentMetadata); - try { - for (String recipientAddress : msg.getRecipientEmailAddressList()) { - if (recipientAddress != null) { - parentMetadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress); - } + try { + for (String recipientAddress : msg.getRecipientEmailAddressList()) { + if (recipientAddress != null) { + parentMetadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress); } - } catch (ChunkNotFoundException he) { - // Will be fixed in POI 3.7 Final } + } catch (ChunkNotFoundException e) { + //you'd think we wouldn't need this. we do. 
+ } - for (Map.Entry e : headers.entrySet()) { - String headerKey = e.getKey(); - for (String headerValue : e.getValue()) { - parentMetadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue); - } + for (Map.Entry e : headers.entrySet()) { + String headerKey = e.getKey(); + for (String headerValue : e.getValue()) { + parentMetadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue); } + } - handleGeneralDates(msg, headers, parentMetadata); + handleGeneralDates(msg, headers, parentMetadata); - // Get the message body. Preference order is: html, rtf, text - Chunk htmlChunk = null; - Chunk rtfChunk = null; - Chunk textChunk = null; - for (Chunk chunk : msg.getMainChunks().getChunks()) { - if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) { - htmlChunk = chunk; - } - if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) { - rtfChunk = chunk; - } - if (chunk.getChunkId() == MAPIProperty.BODY.id) { - textChunk = chunk; - } + // Get the message body. Preference order is: html, rtf, text + Chunk htmlChunk = null; + Chunk rtfChunk = null; + Chunk textChunk = null; + for (Chunk chunk : msg.getMainChunks().getChunks()) { + if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) { + htmlChunk = chunk; } - handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml); + if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) { + rtfChunk = chunk; + } + if (chunk.getChunkId() == MAPIProperty.BODY.id) { + textChunk = chunk; + } + } + handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml); - // Process the attachments - for (AttachmentChunks attachment : msg.getAttachmentFiles()) { + // Process the attachments + for (AttachmentChunks attachment : msg.getAttachmentFiles()) { - String filename = null; - if (attachment.getAttachLongFileName() != null) { - filename = attachment.getAttachLongFileName().getValue(); - } else if (attachment.getAttachFileName() != null) { - filename = attachment.getAttachFileName().getValue(); - } + String filename = null; + if 
(attachment.getAttachLongFileName() != null) { + filename = attachment.getAttachLongFileName().getValue(); + } else if (attachment.getAttachFileName() != null) { + filename = attachment.getAttachFileName().getValue(); + } - if (attachment.getAttachData() != null) { - handleEmbeddedResource( - TikaInputStream.get(attachment.getAttachData().getValue()), filename, - null, null, xhtml, true); - } - if (attachment.getAttachmentDirectory() != null) { - handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), filename, - xhtml, true); - } + if (attachment.getAttachData() != null) { + handleEmbeddedResource( + TikaInputStream.get(attachment.getAttachData().getValue()), filename, + null, null, xhtml, true); + } + if (attachment.getAttachmentDirectory() != null) { + handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), filename, + xhtml, true); } - } catch (ChunkNotFoundException e) { - throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", - e); - } finally { - //You'd think you'd want to call msg.close(). - //Don't do that. That closes down the file system. - //If an msg has multiple msg attachments, some of them - //can reside in the same file system. 
After the first - //child is read, the fs is closed, and the other children - //get a java.nio.channels.ClosedChannelException } - } - private void handleMessageInfo(MAPIMessage msg, Map headers, Metadata metadata) - throws ChunkNotFoundException { + } + private void handleMessageInfo(MAPIMessage msg, Map headers, Metadata metadata) throws ChunkNotFoundException { //this is the literal subject including "re: " metadata.set(TikaCoreProperties.TITLE, msg.getSubject()); //this is the original topic for the thread without the "re: " @@ -289,51 +297,66 @@ private void handleMessageInfo(MAPIMessage msg, Map headers, M metadata.set(TikaCoreProperties.DESCRIPTION, topic); metadata.set(MAPI.CONVERSATION_TOPIC, topic); Chunks mainChunks = msg.getMainChunks(); - if (mainChunks != null) { - if (mainChunks.getMessageId() != null) { - metadata.set(MAPI.INTERNET_MESSAGE_ID, mainChunks - .getMessageId() - .getValue()); - } - - List conversationIndex = mainChunks.getAll().get(MAPIProperty.CONVERSATION_INDEX); - if (conversationIndex != null && ! conversationIndex.isEmpty()) { - Chunk chunk = conversationIndex.get(0); - if (chunk instanceof ByteChunk) { - byte[] bytes = ((ByteChunk)chunk).getValue(); - String hex = Hex.encodeHexString(bytes); - metadata.set(MAPI.CONVERSATION_INDEX, hex); - } - } - - List internetReferences = mainChunks.getAll().get(MAPIProperty.INTERNET_REFERENCES); - if (internetReferences != null) { - for (Chunk ref : internetReferences) { - if (ref instanceof StringChunk) { - metadata.add(MAPI.INTERNET_REFERENCES, ((StringChunk) ref).getValue()); - } - } - } - List inReplyToIds = mainChunks.getAll().get(MAPIProperty.IN_REPLY_TO_ID); - if (inReplyToIds != null && ! 
inReplyToIds.isEmpty()) { - metadata.add(MAPI.IN_REPLY_TO_ID, inReplyToIds.get(0).toString()); + if (mainChunks == null) { + return; + } + if (mainChunks.getMessageId() != null) { + metadata.set(MAPI.INTERNET_MESSAGE_ID, mainChunks + .getMessageId() + .getValue()); + } + + String mc = msg.getStringFromChunk(mainChunks.getMessageClass()); + if (mc != null) { + metadata.set(MAPI.MESSAGE_CLASS_RAW, mc); + } + metadata.set(MAPI.MESSAGE_CLASS, getNormalizedMessageClass(mc)); + List conversationIndex = mainChunks + .getAll() + .get(MAPIProperty.CONVERSATION_INDEX); + if (conversationIndex != null && !conversationIndex.isEmpty()) { + Chunk chunk = conversationIndex.get(0); + if (chunk instanceof ByteChunk) { + byte[] bytes = ((ByteChunk) chunk).getValue(); + String hex = Hex.encodeHexString(bytes); + metadata.set(MAPI.CONVERSATION_INDEX, hex); } + } - for (Map.Entry e : LITERAL_TIME_PROPERTIES.entrySet()) { - List timeProp = mainChunks.getProperties().get(e.getKey()); - if (timeProp != null && ! 
timeProp.isEmpty()) { - Calendar cal = ((PropertyValue.TimePropertyValue)timeProp.get(0)).getValue(); - metadata.set(e.getValue(), cal); + List internetReferences = mainChunks + .getAll() + .get(MAPIProperty.INTERNET_REFERENCES); + if (internetReferences != null) { + for (Chunk ref : internetReferences) { + if (ref instanceof StringChunk) { + metadata.add(MAPI.INTERNET_REFERENCES, ((StringChunk) ref).getValue()); } } - - MessageSubmissionChunk messageSubmissionChunk = mainChunks.getSubmissionChunk(); - if (messageSubmissionChunk != null) { - String submissionId = messageSubmissionChunk.getSubmissionId(); - metadata.set(MAPI.SUBMISSION_ID, submissionId); - metadata.set(MAPI.SUBMISSION_ACCEPTED_AT_TIME, messageSubmissionChunk.getAcceptedAtTime()); + } + List inReplyToIds = mainChunks + .getAll() + .get(MAPIProperty.IN_REPLY_TO_ID); + if (inReplyToIds != null && !inReplyToIds.isEmpty()) { + metadata.add(MAPI.IN_REPLY_TO_ID, inReplyToIds + .get(0) + .toString()); + } + + for (Map.Entry e : LITERAL_TIME_PROPERTIES.entrySet()) { + List timeProp = mainChunks + .getProperties() + .get(e.getKey()); + if (timeProp != null && !timeProp.isEmpty()) { + Calendar cal = ((PropertyValue.TimePropertyValue) timeProp.get(0)).getValue(); + metadata.set(e.getValue(), cal); } + } + MessageSubmissionChunk messageSubmissionChunk = mainChunks.getSubmissionChunk(); + if (messageSubmissionChunk != null) { + String submissionId = messageSubmissionChunk.getSubmissionId(); + metadata.set(MAPI.SUBMISSION_ID, submissionId); + metadata.set(MAPI.SUBMISSION_ACCEPTED_AT_TIME, messageSubmissionChunk.getAcceptedAtTime()); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java index f1c9f9e664..4b21e51419 
100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java @@ -155,7 +155,9 @@ private void extractMetadata(PSTMessage pstMail, Metadata metadata) { metadata.set(MAPI.PRIORTY, pstMail.getPriority()); metadata.set(MAPI.IS_FLAGGED, pstMail.isFlagged()); metadata.set(MAPI.MESSAGE_CLASS, - OutlookExtractor.getMessageClass(pstMail.getMessageClass())); + OutlookExtractor.getNormalizedMessageClass(pstMail.getMessageClass())); + metadata.set(MAPI.MESSAGE_CLASS_RAW, pstMail.getMessageClass()); + metadata.set(Message.MESSAGE_FROM_EMAIL, pstMail.getSenderEmailAddress()); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/mapi_message_classes.properties b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/mapi_message_classes.properties new file mode 100644 index 0000000000..f48fb5afb4 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/mapi_message_classes.properties @@ -0,0 +1,37 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +IPM.Note NOTE +IPM.Contact CONTACT +IPM.Appointment APPOINTMENT +IPM.StickyNote STICKY_NOTE +IPM.Task TASK +IPM.Post POST +IPM.Schedule.Meeting.Request MEETING_REQUEST +IPM.Schedule.Meeting.Canceled MEETING_CANCELED +IPM.Schedule.Meeting.Resp.Pos MEETING_RESPONSE_POSITIVE +IPM.Schedule.Meeting.Resp.Neg MEETING_RESPONSE_NEGATIVE +IPM.Schedule.Meeting.Resp.Tent MEETING_RESPONSE_TENTATIVE +IPM.Schedule.Meeting.Notification.Forward MEETING_NOTIFICATION_FORWARD +IPM.Schedule.Inquiry SCHEDULE_INQUIRY +IPM.Configuration.MRM CONFIGURATION_MRM +REPORT.IPM.Note.DR NOTE_DELIVERED +REPORT.IPM.Note.NDR NOTE_NOT_DELIVERED +REPORT.IPM.Note.IPNRN IPNRN READ_RECEIPT +REPORT.IPM.Note.IPNNRN IPNNRN NOT_READ_RECEIPT +RPM.Note.Rules.OofTemplate.Microsoft OUT_OF_OFFICE_TEMPLATE +IPM.Microsoft.FolderDesign.NamedView FOLDER_DESIGN_NAMED_VIEW +# see https://learn.microsoft.com/en-us/openspecs/exchange_server_protocols/ms-oxocal/e920fdbf-b561-4dc2-bee7-0c4fd36bd2ac +IPM.OLE.CLASS.{00061055-0000-0000-C000-000000000046} RECURRING_EVENT_MEETING_EXCEPTION