feat(glue): add support for more DataFormats #5246

Merged — 4 commits, Dec 17, 2019
191 changes: 184 additions & 7 deletions packages/@aws-cdk/aws-glue/lib/data-format.ts
@@ -2,14 +2,43 @@
* Absolute class name of the Hadoop `InputFormat` to use when reading table files.
*/
export class InputFormat {
/**
* InputFormat for Avro files.
*
* @see https://hive.apache.org/javadocs/r1.2.2/api/org/apache/hadoop/hive/ql/io/avro/AvroContainerInputFormat.html
*/
public static readonly AVRO = new InputFormat('org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat');

/**
* InputFormat for CloudTrail Logs.
*
* @see https://docs.aws.amazon.com/athena/latest/ug/cloudtrail.html
*/
public static readonly CLOUDTRAIL = new InputFormat('com.amazon.emr.cloudtrail.CloudTrailInputFormat');

/**
* InputFormat for Orc files.
*
* @see https://hive.apache.org/javadocs/r1.2.2/api/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.html
*/
public static readonly ORC = new InputFormat('org.apache.hadoop.hive.ql.io.orc.OrcInputFormat');

/**
* InputFormat for Parquet files.
*
* @see https://hive.apache.org/javadocs/r1.2.2/api/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.html
*/
public static readonly PARQUET = new InputFormat('org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat');

/**
* An InputFormat for plain text files. Files are broken into lines. Either linefeed or
* carriage-return are used to signal end of line. Keys are the position in the file, and
* values are the line of text.
* JSON & CSV files are examples of this InputFormat.
*
* @see https://hadoop.apache.org/docs/stable/api/org/apache/hadoop/mapred/TextInputFormat.html
*/
- public static readonly TEXT_INPUT_FORMAT = new InputFormat('org.apache.hadoop.mapred.TextInputFormat');
+ public static readonly TEXT = new InputFormat('org.apache.hadoop.mapred.TextInputFormat');

constructor(public readonly className: string) {}
}
@@ -23,7 +52,28 @@ export class OutputFormat {
*
* @see https://hive.apache.org/javadocs/r2.2.0/api/org/apache/hadoop/hive/ql/io/HiveIgnoreKeyTextOutputFormat.html
*/
- public static readonly HIVE_IGNORE_KEY_TEXT_OUTPUT_FORMAT = new OutputFormat('org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat');
+ public static readonly HIVE_IGNORE_KEY_TEXT = new OutputFormat('org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat');

/**
* OutputFormat for Avro files.
*
* @see https://hive.apache.org/javadocs/r1.2.2/api/org/apache/hadoop/hive/ql/io/avro/AvroContainerOutputFormat.html
*/
public static readonly AVRO = new OutputFormat('org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat');

/**
* OutputFormat for Orc files.
*
* @see https://hive.apache.org/javadocs/r1.2.2/api/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.html
*/
public static readonly ORC = new OutputFormat('org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat');

/**
* OutputFormat for Parquet files.
*
* @see https://hive.apache.org/javadocs/r1.2.2/api/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.html
*/
public static readonly PARQUET = new OutputFormat('org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat');

constructor(public readonly className: string) {}
}
@@ -35,15 +85,55 @@ export class OutputFormat {
*/
export class SerializationLibrary {
/**
- * @see https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-JSON
+ * @see https://hive.apache.org/javadocs/r1.2.2/api/org/apache/hadoop/hive/serde2/avro/AvroSerDe.html
*/
public static readonly AVRO = new SerializationLibrary('org.apache.hadoop.hive.serde2.avro.AvroSerDe');

/**
* @see https://docs.aws.amazon.com/athena/latest/ug/cloudtrail.html
*/
public static readonly CLOUDTRAIL = new SerializationLibrary('com.amazon.emr.hive.serde.CloudTrailSerde');

/**
* @see https://docs.aws.amazon.com/athena/latest/ug/grok.html
*/
public static readonly GROK = new SerializationLibrary('com.amazonaws.glue.serde.GrokSerDe');

/**
* @see https://hive.apache.org/javadocs/r1.2.2/api/org/apache/hive/hcatalog/data/JsonSerDe.html
*/
public static readonly HIVE_JSON = new SerializationLibrary('org.apache.hive.hcatalog.data.JsonSerDe');

/**
* @see https://hive.apache.org/javadocs/r1.2.2/api/org/apache/hadoop/hive/serde2/lazy/LazySimpleSerDe.html
*/
public static readonly LAZY_SIMPLE = new SerializationLibrary('org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe');

/**
* @see https://hive.apache.org/javadocs/r1.2.2/api/org/apache/hadoop/hive/serde2/OpenCSVSerde.html
*/
public static readonly OPEN_CSV = new SerializationLibrary('org.apache.hadoop.hive.serde2.OpenCSVSerde');

/**
* @see https://github.com/rcongiu/Hive-JSON-Serde
*/
public static readonly OPENX_JSON = new SerializationLibrary('org.openx.data.jsonserde.JsonSerDe');

/**
* @see https://hive.apache.org/javadocs/r1.2.2/api/org/apache/hadoop/hive/ql/io/orc/OrcSerde.html
*/
public static readonly ORC = new SerializationLibrary('org.apache.hadoop.hive.ql.io.orc.OrcSerde');

/**
* @see https://hive.apache.org/javadocs/r1.2.2/api/org/apache/hadoop/hive/ql/io/parquet/serde/ParquetHiveSerDe.html
*/
public static readonly PARQUET = new SerializationLibrary('org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe');

/**
* @see https://hive.apache.org/javadocs/r1.2.2/api/org/apache/hadoop/hive/serde2/RegexSerDe.html
*/
public static readonly REGEXP = new SerializationLibrary('org.apache.hadoop.hive.serde2.RegexSerDe');

constructor(public readonly className: string) {}
}

@@ -69,15 +159,102 @@ export interface DataFormat {

export namespace DataFormat {
/**
- * Stored as plain text files in JSON format.
+ * DataFormat for Apache Web Server Logs. Also works for CloudFront logs
*
* @see https://docs.aws.amazon.com/athena/latest/ug/apache.html
*/
export const ApacheLogs: DataFormat = {
inputFormat: InputFormat.TEXT,
outputFormat: OutputFormat.HIVE_IGNORE_KEY_TEXT,
serializationLibrary: SerializationLibrary.REGEXP
};

/**
* DataFormat for Apache Avro
*
* @see https://docs.aws.amazon.com/athena/latest/ug/avro.html
*/
export const Avro: DataFormat = {
inputFormat: InputFormat.AVRO,
outputFormat: OutputFormat.AVRO,
serializationLibrary: SerializationLibrary.AVRO
};

/**
* DataFormat for CloudTrail logs stored on S3
*
* @see https://docs.aws.amazon.com/athena/latest/ug/cloudtrail.html
*/
export const CloudTrailLogs: DataFormat = {
inputFormat: InputFormat.CLOUDTRAIL,
outputFormat: OutputFormat.HIVE_IGNORE_KEY_TEXT,
serializationLibrary: SerializationLibrary.CLOUDTRAIL
};

/**
* DataFormat for CSV Files
*
* @see https://docs.aws.amazon.com/athena/latest/ug/csv.html
*/
export const CSV: DataFormat = {
inputFormat: InputFormat.TEXT,
outputFormat: OutputFormat.HIVE_IGNORE_KEY_TEXT,
serializationLibrary: SerializationLibrary.OPEN_CSV
};

/**
* Stored as plain text files in JSON format.
* Uses OpenX Json SerDe for serialization and deserialization.
*
* @see https://docs.aws.amazon.com/athena/latest/ug/json.html
*/
export const Json: DataFormat = {
- inputFormat: InputFormat.TEXT_INPUT_FORMAT,
- outputFormat: OutputFormat.HIVE_IGNORE_KEY_TEXT_OUTPUT_FORMAT,
+ inputFormat: InputFormat.TEXT,
+ outputFormat: OutputFormat.HIVE_IGNORE_KEY_TEXT,
serializationLibrary: SerializationLibrary.OPENX_JSON
};
-}

/**
* DataFormat for Logstash Logs, using the GROK SerDe
*
* @see https://docs.aws.amazon.com/athena/latest/ug/grok.html
*/
export const Logstash: DataFormat = {
inputFormat: InputFormat.TEXT,
outputFormat: OutputFormat.HIVE_IGNORE_KEY_TEXT,
serializationLibrary: SerializationLibrary.GROK
};

/**
* DataFormat for Apache ORC (Optimized Row Columnar)
*
* @see https://docs.aws.amazon.com/athena/latest/ug/orc.html
*/
export const Orc: DataFormat = {
inputFormat: InputFormat.ORC,
outputFormat: OutputFormat.ORC,
serializationLibrary: SerializationLibrary.ORC
};

/**
* DataFormat for Apache Parquet
*
* @see https://docs.aws.amazon.com/athena/latest/ug/parquet.html
*/
export const Parquet: DataFormat = {
inputFormat: InputFormat.PARQUET,
outputFormat: OutputFormat.PARQUET,
serializationLibrary: SerializationLibrary.PARQUET
};

/**
* DataFormat for TSV (Tab-Separated Values)
*
* @see https://docs.aws.amazon.com/athena/latest/ug/lazy-simple-serde.html
*/
export const TSV: DataFormat = {
inputFormat: InputFormat.TEXT,
outputFormat: OutputFormat.HIVE_IGNORE_KEY_TEXT,
serializationLibrary: SerializationLibrary.LAZY_SIMPLE
};
}
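
For reviewers who want to see the new constants in context, here is a minimal usage sketch. It assumes the `glue.Database`, `glue.Table`, and `glue.Schema` APIs of the `@aws-cdk/aws-glue` module as they existed around this PR; the stack, database, table, and column names are illustrative placeholders and are not part of this diff.

```ts
import * as cdk from '@aws-cdk/core';
import * as glue from '@aws-cdk/aws-glue';

const app = new cdk.App();
const stack = new cdk.Stack(app, 'GlueDataFormatsStack');

const database = new glue.Database(stack, 'MyDatabase', {
  databaseName: 'my_database',
});

// Use one of the newly added presets directly, e.g. Parquet.
new glue.Table(stack, 'ParquetTable', {
  database,
  tableName: 'my_parquet_table',
  columns: [{ name: 'id', type: glue.Schema.STRING }],
  dataFormat: glue.DataFormat.Parquet,
});

// Or compose a custom DataFormat from the building blocks, mirroring how the
// presets in the DataFormat namespace above are defined.
const hiveJson: glue.DataFormat = {
  inputFormat: glue.InputFormat.TEXT,
  outputFormat: glue.OutputFormat.HIVE_IGNORE_KEY_TEXT,
  serializationLibrary: glue.SerializationLibrary.HIVE_JSON,
};

new glue.Table(stack, 'JsonTable', {
  database,
  tableName: 'my_json_table',
  columns: [{ name: 'id', type: glue.Schema.STRING }],
  dataFormat: hiveJson,
});
```

Note that the preset constants are plain object literals satisfying the `DataFormat` interface, so swapping in a custom SerDe (as in `hiveJson` above) requires no changes to the `Table` construct itself.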
8 changes: 8 additions & 0 deletions packages/@aws-cdk/aws-glue/package.json
@@ -108,8 +108,16 @@
"docs-public-apis:@aws-cdk/aws-glue.Table.fromTableArn",
"docs-public-apis:@aws-cdk/aws-glue.Schema.DOUBLE",
"docs-public-apis:@aws-cdk/aws-glue.Schema.FLOAT",
"docs-public-apis:@aws-cdk/aws-glue.SerializationLibrary.AVRO",
"docs-public-apis:@aws-cdk/aws-glue.SerializationLibrary.CLOUDTRAIL",
"docs-public-apis:@aws-cdk/aws-glue.SerializationLibrary.GROK",
"docs-public-apis:@aws-cdk/aws-glue.SerializationLibrary.HIVE_JSON",
"docs-public-apis:@aws-cdk/aws-glue.SerializationLibrary.LAZY_SIMPLE",
"docs-public-apis:@aws-cdk/aws-glue.SerializationLibrary.OPEN_CSV",
"docs-public-apis:@aws-cdk/aws-glue.SerializationLibrary.OPENX_JSON",
"docs-public-apis:@aws-cdk/aws-glue.SerializationLibrary.ORC",
"docs-public-apis:@aws-cdk/aws-glue.SerializationLibrary.PARQUET",
"docs-public-apis:@aws-cdk/aws-glue.SerializationLibrary.REGEXP",
"docs-public-apis:@aws-cdk/aws-glue.SerializationLibrary.className"
]
}