-
Notifications
You must be signed in to change notification settings - Fork 110
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds support for macro-aware transcoding from binary to text. #1000
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
// SPDX-License-Identifier: Apache-2.0 | ||
package com.amazon.ion | ||
|
||
import java.io.Closeable | ||
import java.io.IOException | ||
|
||
/** | ||
* An enhancement to an Ion reader that supports macro-aware transcoding. | ||
*/ | ||
interface MacroAwareIonReader : Closeable { | ||
|
||
/** | ||
* Performs a macro-aware transcode of the stream being read by this reader. | ||
* For Ion 1.0 streams, this functions similarly to providing a system-level | ||
* [IonReader] to [IonWriter.writeValues]. For Ion 1.1 streams, the transcoded | ||
* stream will include the same symbol tables, encoding directives, and | ||
* e-expression invocations as the source stream. In both cases, the | ||
* transcoded stream will be data-model equivalent to the source stream. | ||
* | ||
* The following limitations should be noted: | ||
* 1. Encoding directives with no effect on the encoding context may be | ||
* elided from the transcoded stream. An example would be an encoding | ||
* directive that re-exports the existing context but adds no new | ||
* macros or new symbols. | ||
* 2. When transcoding from text to text, comments will not be preserved. | ||
* 3. Open content in encoding directives (e.g. macro invocations that | ||
* expand to nothing) will not be preserved. | ||
* 4. Granular details of the binary encoding, like inlining vs. interning | ||
* for a particular symbol or length-prefixing vs. delimiting for a | ||
* particular container, may not be preserved. It is up to the user | ||
* to provide a writer configured to match these details if important. | ||
* | ||
* To get a [MacroAwareIonReader] use `_Private_IonReaderBuilder.buildMacroAware`. | ||
* To get a [MacroAwareIonWriter] use [IonEncodingVersion.textWriterBuilder] or | ||
* [IonEncodingVersion.binaryWriterBuilder]. | ||
* @param writer the writer to which the reader's stream will be transcoded. | ||
*/ | ||
@Throws(IOException::class) | ||
fun transcodeTo(writer: MacroAwareIonWriter) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,7 +7,10 @@ | |
import com.amazon.ion.IonBufferConfiguration; | ||
import com.amazon.ion.IonCursor; | ||
import com.amazon.ion.IonException; | ||
import com.amazon.ion.IonReader; | ||
import com.amazon.ion.IonType; | ||
import com.amazon.ion.MacroAwareIonReader; | ||
import com.amazon.ion.MacroAwareIonWriter; | ||
import com.amazon.ion.SymbolTable; | ||
import com.amazon.ion.SymbolToken; | ||
import com.amazon.ion.Timestamp; | ||
|
@@ -21,6 +24,7 @@ | |
import com.amazon.ion.impl.macro.EncodingContext; | ||
import com.amazon.ion.impl.macro.Expression; | ||
import com.amazon.ion.impl.macro.EExpressionArgsReader; | ||
import com.amazon.ion.impl.macro.IonReaderFromReaderAdapter; | ||
import com.amazon.ion.impl.macro.Macro; | ||
import com.amazon.ion.impl.macro.MacroCompiler; | ||
import com.amazon.ion.impl.macro.MacroTable; | ||
|
@@ -32,6 +36,7 @@ | |
import com.amazon.ion.impl.macro.MacroRef; | ||
import com.amazon.ion.impl.macro.SystemMacro; | ||
|
||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.math.BigDecimal; | ||
import java.math.BigInteger; | ||
|
@@ -41,8 +46,8 @@ | |
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.Date; | ||
import java.util.HashMap; | ||
import java.util.Iterator; | ||
import java.util.LinkedHashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.function.Consumer; | ||
|
@@ -55,7 +60,7 @@ | |
/** | ||
* An IonCursor capable of raw parsing of binary Ion streams. | ||
*/ | ||
class IonReaderContinuableCoreBinary extends IonCursorBinary implements IonReaderContinuableCore { | ||
class IonReaderContinuableCoreBinary extends IonCursorBinary implements IonReaderContinuableCore, MacroAwareIonReader { | ||
|
||
// The UTF-8 bytes that represent the text "$ion_encoding" for quick byte-by-byte comparisons. | ||
private static final byte[] ION_ENCODING_UTF8 = ION_ENCODING.getBytes(StandardCharsets.UTF_8); | ||
|
@@ -132,6 +137,9 @@ class IonReaderContinuableCoreBinary extends IonCursorBinary implements IonReade | |
// Adapts this reader for use in code that supports multiple reader types. | ||
private final ReaderAdapter readerAdapter = new ReaderAdapterContinuable(this); | ||
|
||
// Adapts this reader for use in code that supports IonReader. | ||
private final IonReader asIonReader = new IonReaderFromReaderAdapter(readerAdapter); | ||
|
||
// Reads encoding directives from the stream. | ||
private final EncodingDirectiveReader encodingDirectiveReader = new EncodingDirectiveReader(); | ||
|
||
|
@@ -1105,7 +1113,7 @@ boolean startsWithIonEncoding() { | |
public String getSymbol(int sid) { | ||
// Only symbol IDs declared in Ion 1.1 encoding directives (not Ion 1.0 symbol tables) are resolved by the | ||
// core reader. In Ion 1.0, 'symbols' is never populated by the core reader. | ||
if (sid - 1 <= localSymbolMaxOffset) { | ||
if (sid > 0 && sid - 1 <= localSymbolMaxOffset) { | ||
return symbols[sid - 1]; | ||
} | ||
return null; | ||
|
@@ -1217,7 +1225,7 @@ private class EncodingDirectiveReader { | |
boolean isSymbolTableAppend = false; | ||
boolean isMacroTableAppend = false; | ||
List<String> newSymbols = new ArrayList<>(8); | ||
Map<MacroRef, Macro> newMacros = new HashMap<>(); | ||
Map<MacroRef, Macro> newMacros = new LinkedHashMap<>(); | ||
MacroCompiler macroCompiler = new MacroCompiler(this::resolveMacro, readerAdapter); | ||
|
||
boolean isSymbolTableAlreadyClassified = false; | ||
|
@@ -1356,8 +1364,10 @@ private Event coreNextValue() { | |
/** | ||
* Read an encoding directive. If the stream ends before the encoding directive finishes, `event` will be | ||
* `NEEDS_DATA` and this method can be called again when more data is available. | ||
* @param preserveMacroNames true if new macros should be referred to by name. If false, new macros will be | ||
* referred to by integer ID. | ||
*/ | ||
void readEncodingDirective() { | ||
void readEncodingDirective(boolean preserveMacroNames) { | ||
Event event; | ||
while (true) { | ||
switch (state) { | ||
|
@@ -1490,6 +1500,10 @@ void readEncodingDirective() { | |
state = State.COMPILING_MACRO; | ||
Macro newMacro = macroCompiler.compileMacro(); | ||
newMacros.put(MacroRef.byId(++localMacroMaxOffset), newMacro); | ||
String macroName = macroCompiler.getMacroName(); | ||
if (preserveMacroNames && macroName != null) { | ||
newMacros.put(MacroRef.byName(macroName), newMacro); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've been thinking about this since #996 and working with the MacroTable abstraction there... I think we ought to support both lookups all the time. That would make What considerations (other than map/keyset size) am I missing here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's the map size and the general ugliness about the fact that there are always two keys that map to the same value. I'm actually going to make the change to support both lookups all the time though, which is how it's handled when read from the text format. |
||
state = State.IN_MACRO_TABLE_SEXP; | ||
break; | ||
default: | ||
|
@@ -1758,13 +1772,98 @@ private boolean evaluateNext() { | |
return false; | ||
} | ||
|
||
@Override | ||
public void transcodeTo(MacroAwareIonWriter writer) throws IOException { | ||
registerIvmNotificationConsumer((x, y) -> { | ||
resetEncodingContext(); | ||
writer.startEncodingSegmentWithIonVersionMarker(); | ||
}); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm assuming There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I renamed the parameters to make this obvious. Which IVM to write is inherent to the writer implementation -- we don't have a single implementation that writes both formats. |
||
while (transcodeNextTo(writer) != Event.NEEDS_DATA); | ||
} | ||
|
||
/** | ||
* Transcodes the next value, and any encoding directives that may precede it, | ||
* to the given writer. | ||
* @param writer the writer to which the value will be transcoded. | ||
* @return the result of the operation. | ||
* @throws IOException if thrown during writing. | ||
*/ | ||
Event transcodeNextTo(MacroAwareIonWriter writer) throws IOException { | ||
jobarr-amzn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// NOTE: this method is structured very similarly to nextValue(). During performance analysis, we should | ||
// see if the methods can be unified without sacrificing hot path performance. Performance of this method | ||
// is not considered critical. | ||
lobBytesRead = 0; | ||
while (true) { | ||
if (parent == null || state != State.READING_VALUE) { | ||
if (state != State.READING_VALUE && state != State.COMPILING_MACRO) { | ||
boolean isEncodingDirectiveFromEExpression = isEvaluatingEExpression; | ||
encodingDirectiveReader.readEncodingDirective(true); | ||
if (state != State.READING_VALUE) { | ||
throw new IonException("Unexpected EOF when writing encoding-level value."); | ||
} | ||
// If the encoding directive was expanded from an e-expression, that expression has already been | ||
// written. In that case, just make sure the writer is using the new context. Otherwise, also write | ||
// the encoding directive. | ||
writer.startEncodingSegmentWithEncodingDirective( | ||
encodingDirectiveReader.newMacros, | ||
encodingDirectiveReader.isMacroTableAppend, | ||
encodingDirectiveReader.newSymbols, | ||
encodingDirectiveReader.isSymbolTableAppend, | ||
isEncodingDirectiveFromEExpression | ||
); | ||
} | ||
if (isEvaluatingEExpression) { | ||
if (evaluateNext()) { | ||
continue; | ||
} | ||
} else { | ||
event = super.nextValue(); | ||
} | ||
if (minorVersion == 1 && parent == null && isPositionedOnEncodingDirective()) { | ||
encodingDirectiveReader.resetState(); | ||
state = State.ON_ION_ENCODING_SEXP; | ||
continue; | ||
} | ||
} else if (isEvaluatingEExpression) { | ||
if (evaluateNext()) { | ||
continue; | ||
} | ||
} else { | ||
event = super.nextValue(); | ||
} | ||
if (valueTid != null && valueTid.isMacroInvocation) { | ||
expressionArgsReader.beginEvaluatingMacroInvocation(macroEvaluator); | ||
macroEvaluatorIonReader.transcodeArgumentsTo(writer); | ||
Comment on lines
+1863
to
+1864
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a full materialization of the e-expression arguments followed by a re-write. If we change to lazy evaluation of e-expression arguments, this will need to change. |
||
isEvaluatingEExpression = true; | ||
if (evaluateNext()) { | ||
continue; | ||
} | ||
if (parent == null && isPositionedOnEvaluatedEncodingDirective()) { | ||
encodingDirectiveReader.resetState(); | ||
state = State.ON_ION_ENCODING_SEXP; | ||
continue; | ||
} | ||
} | ||
if (isEvaluatingEExpression) { | ||
// EExpressions are not expanded and provided to the writer; only the raw encoding is transferred. | ||
continue; | ||
} | ||
break; | ||
} | ||
if (event != Event.NEEDS_DATA) { | ||
// The reader is now positioned on an actual encoding value. Write the value. | ||
writer.writeValue(asIonReader); | ||
} | ||
return event; | ||
} | ||
|
||
@Override | ||
public Event nextValue() { | ||
lobBytesRead = 0; | ||
while (true) { | ||
if (parent == null || state != State.READING_VALUE) { | ||
if (state != State.READING_VALUE && state != State.COMPILING_MACRO) { | ||
encodingDirectiveReader.readEncodingDirective(); | ||
encodingDirectiveReader.readEncodingDirective(false); | ||
if (state != State.READING_VALUE) { | ||
event = Event.NEEDS_DATA; | ||
break; | ||
|
@@ -2511,7 +2610,7 @@ IntList getAnnotationSidList() { | |
* @return a SymbolToken. | ||
*/ | ||
protected SymbolToken getSymbolToken(int sid) { | ||
return new SymbolTokenImpl(sid); | ||
return new SymbolTokenImpl(getSymbol(sid), sid); | ||
} | ||
|
||
protected final SymbolToken getSystemSymbolToken(Marker marker) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,13 @@ | ||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
package com.amazon.ion.impl; | ||
|
||
import com.amazon.ion.IonCatalog; | ||
import com.amazon.ion.IonException; | ||
import com.amazon.ion.IonReader; | ||
import com.amazon.ion.IonTextReader; | ||
import com.amazon.ion.IonValue; | ||
import com.amazon.ion.MacroAwareIonReader; | ||
import com.amazon.ion.system.IonReaderBuilder; | ||
import com.amazon.ion.util.IonStreamUtils; | ||
|
||
|
@@ -352,4 +352,16 @@ public IonTextReader build(String ionText) { | |
return makeReaderText(validateCatalog(), ionText, lstFactory); | ||
} | ||
|
||
/** | ||
* Creates a new {@link MacroAwareIonReader} over the given data. | ||
* @param ionData the data to read. | ||
* @return a new MacroAwareIonReader instance. | ||
*/ | ||
public MacroAwareIonReader buildMacroAware(byte[] ionData) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note: this is "hidden" in |
||
// TODO make this work for text too. | ||
if (!IonStreamUtils.isIonBinary(ionData)) { | ||
throw new UnsupportedOperationException("MacroAwareIonReader is not yet implemented for text data."); | ||
} | ||
return new IonReaderContinuableCoreBinary(getBufferConfiguration(), ionData, 0, ionData.length); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These are technical limitations, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could code around most of these, but it's generally not worth the effort.