From 6839307c158d2e0a6b7db7f850033028b4c97b27 Mon Sep 17 00:00:00 2001 From: Marius Iversen Date: Mon, 15 Feb 2021 21:35:06 +0100 Subject: [PATCH] [Libbeat][New Processor] XML Decode (#23678) * stashing before initial commit * Initial commit * updating go.sum * updating it again * adding feedback from PR comments and removing expandkeys config entry * Updating changelog * removing expanded_keys from allowed fields * adding new changes based on PR comments, a few more changes remains * moving the xml decoder to its own subpackage based on PR comments * reverting back to Target being a string pointer, to be able to differentiate between null and empty string * Updating certain tests to fit the new ignore_failure and ignore_missing options * Updating unit test to test with missing field * updating license headers * adding benchmark test * benchmark, now also with allocation results * updating changelog entry * removing duplicate Changelog entry * changing changelog entry name to new name * Simplify error handling and fix race $ benchcmp old.txt new.txt benchmark old ns/op new ns/op delta BenchmarkProcessor_Run/single_object-12 15691 15686 -0.03% BenchmarkProcessor_Run/nested_and_array_object-12 39673 39098 -1.45% benchmark old allocs new allocs delta BenchmarkProcessor_Run/single_object-12 158 158 +0.00% BenchmarkProcessor_Run/nested_and_array_object-12 376 374 -0.53% benchmark old bytes new bytes delta BenchmarkProcessor_Run/single_object-12 8597 8597 +0.00% BenchmarkProcessor_Run/nested_and_array_object-12 20310 19798 -2.52% * internal xml to json implementation * Use internal xml to json decoder benchmark old ns/op new ns/op delta BenchmarkProcessor_Run/single_object-12 15686 8051 -48.67% BenchmarkProcessor_Run/nested_and_array_object-12 39098 20540 -47.47% benchmark old allocs new allocs delta BenchmarkProcessor_Run/single_object-12 158 75 -52.53% BenchmarkProcessor_Run/nested_and_array_object-12 374 184 -50.80% benchmark old bytes new bytes delta 
BenchmarkProcessor_Run/single_object-12 8597 3520 -59.06% BenchmarkProcessor_Run/nested_and_array_object-12 19798 7824 -60.48% benchmark old ns/op new ns/op delta BenchmarkProcessor_Run/single_object-12 15686 8051 -48.67% BenchmarkProcessor_Run/nested_and_array_object-12 39098 20540 -47.47% benchmark old allocs new allocs delta BenchmarkProcessor_Run/single_object-12 158 75 -52.53% BenchmarkProcessor_Run/nested_and_array_object-12 374 184 -50.80% benchmark old bytes new bytes delta BenchmarkProcessor_Run/single_object-12 8597 3520 -59.06% BenchmarkProcessor_Run/nested_and_array_object-12 19798 7824 -60.48% * changelog fix * Update docs * Add godoc example of xml to json * updating test name to fit Example naming convention Co-authored-by: Andrew Kroh --- CHANGELOG.next.asciidoc | 1 + libbeat/cmd/instance/imports_common.go | 1 + libbeat/common/encoding/xml/decode.go | 120 +++++ libbeat/common/encoding/xml/decode_test.go | 431 ++++++++++++++++ libbeat/docs/processors-list.asciidoc | 6 + libbeat/processors/decode_xml/config.go | 36 ++ libbeat/processors/decode_xml/decode_xml.go | 150 ++++++ .../processors/decode_xml/decode_xml_test.go | 467 ++++++++++++++++++ .../decode_xml/docs/decode_xml.asciidoc | 115 +++++ 9 files changed, 1327 insertions(+) create mode 100644 libbeat/common/encoding/xml/decode.go create mode 100644 libbeat/common/encoding/xml/decode_test.go create mode 100644 libbeat/processors/decode_xml/config.go create mode 100644 libbeat/processors/decode_xml/decode_xml.go create mode 100644 libbeat/processors/decode_xml/decode_xml_test.go create mode 100644 libbeat/processors/decode_xml/docs/decode_xml.asciidoc diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index 1759d22c028..22d95586bc5 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -598,6 +598,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d - Update the baseline version of Sarama (Kafka support library) to 1.27.2. 
{pull}23595[23595] - Add kubernetes.volume.fs.used.pct field. {pull}23564[23564] - Add the `enable_krb5_fast` flag to the Kafka output to explicitly opt-in to FAST authentication. {pull}23629[23629] +- Added new decode_xml processor to libbeat that is available to all beat types. {pull}23678[23678] - Add deployment name in pod's meta. {pull}23610[23610] - Add `selector` information in kubernetes services' metadata. {pull}23730[23730] diff --git a/libbeat/cmd/instance/imports_common.go b/libbeat/cmd/instance/imports_common.go index e47dbf93799..ac767a55964 100644 --- a/libbeat/cmd/instance/imports_common.go +++ b/libbeat/cmd/instance/imports_common.go @@ -30,6 +30,7 @@ import ( _ "github.com/elastic/beats/v7/libbeat/processors/add_process_metadata" _ "github.com/elastic/beats/v7/libbeat/processors/communityid" _ "github.com/elastic/beats/v7/libbeat/processors/convert" + _ "github.com/elastic/beats/v7/libbeat/processors/decode_xml" _ "github.com/elastic/beats/v7/libbeat/processors/dissect" _ "github.com/elastic/beats/v7/libbeat/processors/dns" _ "github.com/elastic/beats/v7/libbeat/processors/extract_array" diff --git a/libbeat/common/encoding/xml/decode.go b/libbeat/common/encoding/xml/decode.go new file mode 100644 index 00000000000..665c0608f67 --- /dev/null +++ b/libbeat/common/encoding/xml/decode.go @@ -0,0 +1,120 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package xml + +import ( + "bytes" + "encoding/xml" + "io" + "strings" +) + +// A Decoder reads and decodes XML from an input stream. +type Decoder struct { + prependHyphenToAttr bool + lowercaseKeys bool + xmlDec *xml.Decoder +} + +// NewDecoder returns a new decoder that reads from r. +func NewDecoder(r io.Reader) *Decoder { + return &Decoder{xmlDec: xml.NewDecoder(r)} +} + +// PrependHyphenToAttr causes the Decoder to prepend a hyphen ('-') to all +// XML attribute names. +func (d *Decoder) PrependHyphenToAttr() { d.prependHyphenToAttr = true } + +// LowercaseKeys causes the Decoder to transform all key names to lowercase. +func (d *Decoder) LowercaseKeys() { d.lowercaseKeys = true } + +// Decode reads XML from the input stream and returns a map containing the data. +func (d *Decoder) Decode() (map[string]interface{}, error) { + _, m, err := d.decode(nil) + return m, err +} + +func (d *Decoder) decode(attrs []xml.Attr) (string, map[string]interface{}, error) { + elements := map[string]interface{}{} + var cdata string + + for { + t, err := d.xmlDec.Token() + if err != nil { + if err == io.EOF { + return "", elements, nil + } + return "", nil, err + } + + switch elem := t.(type) { + case xml.StartElement: + cdata, subElements, err := d.decode(elem.Attr) + if err != nil { + return "", nil, err + } + + // Combine sub-elements and cdata. 
+ var add interface{} = subElements + if len(subElements) == 0 { + add = cdata + } else if len(cdata) > 0 { + subElements["#text"] = cdata + } + + // Add the data to the current object while taking into account + // if the normalized key already exists (in the case of lists). + key := d.key(elem.Name.Local) + value := elements[key] + switch v := value.(type) { + case nil: + elements[key] = add + case []interface{}: + elements[key] = append(v, add) + default: + elements[key] = []interface{}{v, add} + } + case xml.CharData: + cdata = string(bytes.TrimSpace(elem.Copy())) + case xml.EndElement: + d.addAttributes(attrs, elements) + return cdata, elements, nil + } + } +} + +func (d *Decoder) addAttributes(attrs []xml.Attr, m map[string]interface{}) { + for _, attr := range attrs { + key := d.attrKey(attr.Name.Local) + m[key] = attr.Value + } +} + +func (d *Decoder) key(in string) string { + if d.lowercaseKeys { + return strings.ToLower(in) + } + return in +} + +func (d *Decoder) attrKey(in string) string { + if d.prependHyphenToAttr { + return d.key("-" + in) + } + return d.key(in) +} diff --git a/libbeat/common/encoding/xml/decode_test.go b/libbeat/common/encoding/xml/decode_test.go new file mode 100644 index 00000000000..277972e56da --- /dev/null +++ b/libbeat/common/encoding/xml/decode_test.go @@ -0,0 +1,431 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// +build !integration + +package xml + +import ( + "encoding/json" + "fmt" + "os" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestIncompleteXML(t *testing.T) { + const xml = ` + + John +` + + d := NewDecoder(strings.NewReader(xml)) + out, err := d.Decode() + assert.Nil(t, out) + require.Error(t, err) + assert.Contains(t, err.Error(), "unexpected EOF") +} + +func TestLowercaseKeys(t *testing.T) { + const xml = ` + + John + +` + + expected := map[string]interface{}{ + "person": map[string]interface{}{ + "name": map[string]interface{}{ + "#text": "John", + "id": "123", + }, + }, + } + + d := NewDecoder(strings.NewReader(xml)) + d.LowercaseKeys() + out, err := d.Decode() + require.NoError(t, err) + assert.Equal(t, expected, out) +} + +func TestPrependHyphenToAttr(t *testing.T) { + const xml = ` + + John + +` + + expected := map[string]interface{}{ + "person": map[string]interface{}{ + "Name": map[string]interface{}{ + "#text": "John", + "-ID": "123", + }, + }, + } + + d := NewDecoder(strings.NewReader(xml)) + d.PrependHyphenToAttr() + out, err := d.Decode() + require.NoError(t, err) + assert.Equal(t, expected, out) +} + +func TestDecodeList(t *testing.T) { + const xml = ` + + + John + + + Jane + + Foo + +` + + expected := map[string]interface{}{ + "people": map[string]interface{}{ + "person": []interface{}{ + map[string]interface{}{ + "Name": map[string]interface{}{ + "#text": "John", + "ID": "123", + }, + }, + map[string]interface{}{ + "Name": map[string]interface{}{ + 
"#text": "Jane", + "ID": "456", + }, + }, + "Foo", + }, + }, + } + + d := NewDecoder(strings.NewReader(xml)) + out, err := d.Decode() + require.NoError(t, err) + assert.Equal(t, expected, out) +} + +func TestEmptyElement(t *testing.T) { + const xml = ` + + +` + + expected := map[string]interface{}{ + "people": "", + } + + d := NewDecoder(strings.NewReader(xml)) + out, err := d.Decode() + require.NoError(t, err) + assert.Equal(t, expected, out) +} + +func TestDecode(t *testing.T) { + type testCase struct { + XML string + Output map[string]interface{} + } + + tests := []testCase{ + { + XML: ` + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + `, + Output: map[string]interface{}{ + "catalog": map[string]interface{}{ + "book": map[string]interface{}{ + "author": "William H. Gaddis", + "review": "One of the great seminal American novels of the 20th century.", + "seq": "1", + "title": "The Recognitions"}}}, + }, + { + XML: ` + + + Gambardella, Matthew + XML Developer's Guide + Computer + 44.95 + 2000-10-01 + An in-depth look at creating applications with XML. + + + Ralls, Kim + Midnight Rain + Fantasy + 5.95 + 2000-12-16 + A former architect battles corporate zombies, an evil sorceress, and her own childhood to become queen of the world. 
+ + `, + Output: map[string]interface{}{ + "catalog": map[string]interface{}{ + "book": []interface{}{ + map[string]interface{}{ + "author": "Gambardella, Matthew", + "description": "An in-depth look at creating applications with XML.", + "genre": "Computer", + "id": "bk101", + "price": "44.95", + "publish_date": "2000-10-01", + "title": "XML Developer's Guide", + }, + map[string]interface{}{ + "author": "Ralls, Kim", + "description": "A former architect battles corporate zombies, an evil sorceress, and her own childhood to become queen of the world.", + "genre": "Fantasy", + "id": "bk102", + "price": "5.95", + "publish_date": "2000-12-16", + "title": "Midnight Rain"}}}}, + }, + { + XML: ` + + + + Gambardella, Matthew + XML Developer's Guide + Computer + 44.95 + 2000-10-01 + An in-depth look at creating applications with XML. + + + Ralls, Kim + Midnight Rain + Fantasy + 5.95 + 2000-12-16 + A former architect battles corporate zombies, an evil sorceress, and her own childhood to become queen of the world. + + `, + Output: map[string]interface{}{ + "catalog": map[string]interface{}{ + "book": []interface{}{ + map[string]interface{}{ + "author": "Gambardella, Matthew", + "description": "An in-depth look at creating applications with XML.", + "genre": "Computer", + "id": "bk101", + "price": "44.95", + "publish_date": "2000-10-01", + "title": "XML Developer's Guide"}, + map[string]interface{}{ + "author": "Ralls, Kim", + "description": "A former architect battles corporate zombies, an evil sorceress, and her own childhood to become queen of the world.", + "genre": "Fantasy", + "id": "bk102", + "price": "5.95", + "publish_date": "2000-12-16", + "title": "Midnight Rain"}}}}, + }, + { + XML: ` + + + + Gambardella, Matthew + XML Developer's Guide + Computer + 44.95 + 2000-10-01 + An in-depth look at creating applications with XML. + + + + Ralls, Kim + A former architect battles corporate zombies, an evil sorceress, and her own childhood to become queen of the world. 
+ + + `, + Output: map[string]interface{}{ + "catalog": map[string]interface{}{ + "book": map[string]interface{}{ + "author": "Gambardella, Matthew", + "description": "An in-depth look at creating applications with XML.", + "genre": "Computer", + "id": "bk101", + "price": "44.95", + "publish_date": "2000-10-01", + "title": "XML Developer's Guide"}, + "secondcategory": map[string]interface{}{ + "paper": map[string]interface{}{ + "description": "A former architect battles corporate zombies, an evil sorceress, and her own childhood to become queen of the world.", + "id": "bk102", + "test2": "Ralls, Kim"}}}}, + }, + } + + for i, test := range tests { + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { + d := NewDecoder(strings.NewReader(test.XML)) + d.LowercaseKeys() + + out, err := d.Decode() + require.NoError(t, err) + assert.EqualValues(t, test.Output, out) + }) + } +} + +func ExampleDecoder_Decode() { + const xml = ` + + + + 91 + 1 + 4 + 9 + 0 + 0x8020000000000000 + + 100 + + + Microsoft-Windows-WinRM/Operational + vagrant-2012-r2 + + + + winlogbeat + running + 770069006E006C006F00670062006500610074002F0034000000 + + + + \\VAGRANT-2012-R2 + vagrant + + + + 15005 + shellId + 68007400740070003A002F002F0073006300680065006D00610073002E006D006900630072006F0073006F00660074002E0063006F006D002F007700620065006D002F00770073006D0061006E002F0031002F00770069006E0064006F00770073002F007300680065006C006C002F0063006D0064000000 + + + Creating WSMan shell on server with ResourceUri: %1 + Information + Request handling + Info + Microsoft-Windows-WinRM/Operational + Microsoft-Windows-Windows Remote Management + + Server + + + +} +` + dec := NewDecoder(strings.NewReader(xml)) + dec.LowercaseKeys() + m, err := dec.Decode() + if err != nil { + return + } + + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + if err = enc.Encode(m); err != nil { + return + } + + // Output: + // { + // "event": { + // "eventdata": { + // "binary": 
"770069006E006C006F00670062006500610074002F0034000000", + // "data": { + // "#text": "running", + // "name": "param2" + // } + // }, + // "processingerrordata": { + // "dataitemname": "shellId", + // "errorcode": "15005", + // "eventpayload": "68007400740070003A002F002F0073006300680065006D00610073002E006D006900630072006F0073006F00660074002E0063006F006D002F007700620065006D002F00770073006D0061006E002F0031002F00770069006E0064006F00770073002F007300680065006C006C002F0063006D0064000000" + // }, + // "renderinginfo": { + // "channel": "Microsoft-Windows-WinRM/Operational", + // "culture": "en-US", + // "keywords": { + // "keyword": "Server" + // }, + // "level": "Information", + // "message": "Creating WSMan shell on server with ResourceUri: %1", + // "opcode": "Info", + // "provider": "Microsoft-Windows-Windows Remote Management", + // "task": "Request handling" + // }, + // "system": { + // "channel": "Microsoft-Windows-WinRM/Operational", + // "computer": "vagrant-2012-r2", + // "correlation": { + // "activityid": "{A066CCF1-8AB3-459B-B62F-F79F957A5036}", + // "relatedactivityid": "{85FC0930-9C49-42DA-804B-A7368104BD1B}" + // }, + // "eventid": "91", + // "eventrecordid": "100", + // "execution": { + // "processid": "920", + // "threadid": "1152" + // }, + // "keywords": "0x8020000000000000", + // "level": "4", + // "opcode": "0", + // "provider": { + // "eventsourcename": "Service Control Manager", + // "guid": "{a7975c8f-ac13-49f1-87da-5a984a4ab417}", + // "name": "Microsoft-Windows-WinRM" + // }, + // "security": { + // "userid": "S-1-5-21-3541430928-2051711210-1391384369-1001" + // }, + // "task": "9", + // "timecreated": { + // "systemtime": "2016-01-28T20:33:27.990735300Z" + // }, + // "version": "1" + // }, + // "userdata": { + // "eventxml": { + // "servername": "\\\\VAGRANT-2012-R2", + // "username": "vagrant", + // "xmlns": "Event_NS" + // } + // }, + // "xmlns": "http://schemas.microsoft.com/win/2004/08/events/event" + // } + // } +} diff --git 
a/libbeat/docs/processors-list.asciidoc b/libbeat/docs/processors-list.asciidoc index 367bce4ae59..e2670ebc39e 100644 --- a/libbeat/docs/processors-list.asciidoc +++ b/libbeat/docs/processors-list.asciidoc @@ -110,6 +110,9 @@ endif::[] ifndef::no_urldecode_processor[] * <<urldecode,`urldecode`>> endif::[] +ifndef::no_decode_xml_processor[] +* <<decode_xml,`decode_xml`>> +endif::[] //# end::processors-list[] //# tag::processors-include[] @@ -225,5 +228,8 @@ endif::[] ifndef::no_urldecode_processor[] include::{libbeat-processors-dir}/urldecode/docs/urldecode.asciidoc[] endif::[] +ifndef::no_decode_xml_processor[] +include::{libbeat-processors-dir}/decode_xml/docs/decode_xml.asciidoc[] +endif::[] //# end::processors-include[] diff --git a/libbeat/processors/decode_xml/config.go b/libbeat/processors/decode_xml/config.go new file mode 100644 index 00000000000..289b2eaa0e9 --- /dev/null +++ b/libbeat/processors/decode_xml/config.go @@ -0,0 +1,36 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package decode_xml + +type decodeXMLConfig struct { + Field string `config:"field" validate:"required"` + Target *string `config:"target_field"` + OverwriteKeys bool `config:"overwrite_keys"` + DocumentID string `config:"document_id"` + ToLower bool `config:"to_lower"` + IgnoreMissing bool `config:"ignore_missing"` + IgnoreFailure bool `config:"ignore_failure"` +} + +func defaultConfig() decodeXMLConfig { + return decodeXMLConfig{ + Field: "message", + OverwriteKeys: true, + ToLower: true, + } +} diff --git a/libbeat/processors/decode_xml/decode_xml.go b/libbeat/processors/decode_xml/decode_xml.go new file mode 100644 index 00000000000..0b229cff3d2 --- /dev/null +++ b/libbeat/processors/decode_xml/decode_xml.go @@ -0,0 +1,150 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package decode_xml + +import ( + "encoding/json" + "errors" + "fmt" + "strings" + + "github.com/elastic/beats/v7/libbeat/beat" + "github.com/elastic/beats/v7/libbeat/common" + "github.com/elastic/beats/v7/libbeat/common/cfgwarn" + "github.com/elastic/beats/v7/libbeat/common/encoding/xml" + "github.com/elastic/beats/v7/libbeat/common/jsontransform" + "github.com/elastic/beats/v7/libbeat/logp" + "github.com/elastic/beats/v7/libbeat/processors" + "github.com/elastic/beats/v7/libbeat/processors/checks" + jsprocessor "github.com/elastic/beats/v7/libbeat/processors/script/javascript/module/processor" +) + +type decodeXML struct { + decodeXMLConfig + log *logp.Logger +} + +var ( + errFieldIsNotString = errors.New("field value is not a string") +) + +const ( + procName = "decode_xml" + logName = "processor." + procName +) + +func init() { + processors.RegisterPlugin(procName, + checks.ConfigChecked(New, + checks.RequireFields("field"), + checks.AllowedFields("field", "target_field", "overwrite_keys", "document_id", "to_lower", "ignore_missing", "ignore_failure"))) + jsprocessor.RegisterPlugin(procName, New) +} + +// New constructs a new decode_xml processor. +func New(c *common.Config) (processors.Processor, error) { + config := defaultConfig() + + if err := c.Unpack(&config); err != nil { + return nil, fmt.Errorf("fail to unpack the "+procName+" processor configuration: %w", err) + } + + return newDecodeXML(config) +} + +func newDecodeXML(config decodeXMLConfig) (processors.Processor, error) { + cfgwarn.Experimental("The " + procName + " processor is experimental.") + + // Default target to overwriting field. 
+ if config.Target == nil { + config.Target = &config.Field + } + + return &decodeXML{ + decodeXMLConfig: config, + log: logp.NewLogger(logName), + }, nil +} + +func (x *decodeXML) Run(event *beat.Event) (*beat.Event, error) { + if err := x.run(event); err != nil && !x.IgnoreFailure { + err = fmt.Errorf("failed in decode_xml on the %q field: %w", x.Field, err) + event.PutValue("error.message", err.Error()) + return event, err + } + return event, nil +} + +func (x *decodeXML) run(event *beat.Event) error { + data, err := event.GetValue(x.Field) + if err != nil { + if x.IgnoreMissing && errors.Is(err, common.ErrKeyNotFound) { + return nil + } + return err + } + + text, ok := data.(string) + if !ok { + return errFieldIsNotString + } + + xmlOutput, err := x.decodeField(text) + if err != nil { + return err + } + + var id string + if tmp, err := common.MapStr(xmlOutput).GetValue(x.DocumentID); err == nil { + if v, ok := tmp.(string); ok { + id = v + common.MapStr(xmlOutput).Delete(x.DocumentID) + } + } + + if *x.Target != "" { + if _, err = event.PutValue(*x.Target, xmlOutput); err != nil { + return fmt.Errorf("failed to put value %v into field %q: %w", xmlOutput, *x.Target, err) + } + } else { + jsontransform.WriteJSONKeys(event, xmlOutput, false, x.OverwriteKeys, !x.IgnoreFailure) + } + + if id != "" { + event.SetID(id) + } + return nil +} + +func (x *decodeXML) decodeField(data string) (decodedData map[string]interface{}, err error) { + dec := xml.NewDecoder(strings.NewReader(data)) + if x.ToLower { + dec.LowercaseKeys() + } + + out, err := dec.Decode() + if err != nil { + return nil, fmt.Errorf("error decoding XML field: %w", err) + } + return out, nil +} + +func (x *decodeXML) String() string { + json, _ := json.Marshal(x.decodeXMLConfig) + return procName + "=" + string(json) +} diff --git a/libbeat/processors/decode_xml/decode_xml_test.go b/libbeat/processors/decode_xml/decode_xml_test.go new file mode 100644 index 00000000000..26d075bf3a4 --- /dev/null +++ 
b/libbeat/processors/decode_xml/decode_xml_test.go @@ -0,0 +1,467 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package decode_xml + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/elastic/beats/v7/libbeat/beat" + "github.com/elastic/beats/v7/libbeat/common" +) + +var ( + testXMLTargetField = "xml" + testRootTargetField = "" +) + +func TestDecodeXML(t *testing.T) { + var testCases = []struct { + description string + config decodeXMLConfig + Input common.MapStr + Output common.MapStr + error bool + errorMessage string + }{ + { + description: "Simple xml decode with target field set", + config: decodeXMLConfig{ + Field: "message", + Target: &testXMLTargetField, + }, + Input: common.MapStr{ + "message": ` + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + `, + }, + Output: common.MapStr{ + "xml": map[string]interface{}{ + "catalog": map[string]interface{}{ + "book": map[string]interface{}{ + "author": "William H. Gaddis", + "review": "One of the great seminal American novels of the 20th century.", + "seq": "1", + "title": "The Recognitions", + }, + }, + }, + "message": ` + + William H. 
Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + `, + }, + }, + { + description: "Test with target set to root", + config: decodeXMLConfig{ + Field: "message", + Target: &testRootTargetField, + }, + Input: common.MapStr{ + "message": ` + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + `, + }, + Output: common.MapStr{ + "catalog": common.MapStr{ + "book": map[string]interface{}{ + "author": "William H. Gaddis", + "review": "One of the great seminal American novels of the 20th century.", + "seq": "1", + "title": "The Recognitions", + }, + }, + "message": ` + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + `, + }, + }, + { + description: "Simple xml decode with xml string to same field name when Target is null", + config: decodeXMLConfig{ + Field: "message", + }, + Input: common.MapStr{ + "message": ` + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + `, + }, + Output: common.MapStr{ + "message": map[string]interface{}{ + "catalog": map[string]interface{}{ + "book": map[string]interface{}{ + "author": "William H. Gaddis", + "review": "One of the great seminal American novels of the 20th century.", + "seq": "1", + "title": "The Recognitions", + }, + }, + }, + }, + }, + { + description: "Decoding with array input", + config: decodeXMLConfig{ + Field: "message", + }, + Input: common.MapStr{ + "message": ` + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + + Ralls, Kim + Midnight Rain + Some review. + + `, + }, + Output: common.MapStr{ + "message": map[string]interface{}{ + "catalog": map[string]interface{}{ + "book": []interface{}{ + map[string]interface{}{ + "author": "William H. 
Gaddis", + "review": "One of the great seminal American novels of the 20th century.", + "title": "The Recognitions", + }, + map[string]interface{}{ + "author": "Ralls, Kim", + "review": "Some review.", + "title": "Midnight Rain", + }, + }, + }, + }, + }, + }, + { + description: "Decoding with multiple xml objects", + config: decodeXMLConfig{ + Field: "message", + }, + Input: common.MapStr{ + "message": ` + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + + Ralls, Kim + Midnight Rain + Some review. + + + + Ralls, Kim + A former architect battles corporate zombies, an evil sorceress, and her own childhood to become queen of the world. + + + `, + }, + Output: common.MapStr{ + "message": map[string]interface{}{ + "catalog": map[string]interface{}{ + "book": []interface{}{ + map[string]interface{}{ + "author": "William H. Gaddis", + "review": "One of the great seminal American novels of the 20th century.", + "title": "The Recognitions", + }, + map[string]interface{}{ + "author": "Ralls, Kim", + "review": "Some review.", + "title": "Midnight Rain", + }, + }, + "secondcategory": map[string]interface{}{ + "paper": map[string]interface{}{ + "description": "A former architect battles corporate zombies, an evil sorceress, and her own childhood to become queen of the world.", + "id": "bk102", + "test2": "Ralls, Kim", + }, + }, + }, + }, + }, + }, + { + description: "Decoding with broken XML format, with IgnoreFailure false", + config: decodeXMLConfig{ + Field: "message", + IgnoreFailure: false, + }, + Input: common.MapStr{ + "message": ` + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + catalog>`, + }, + Output: common.MapStr{ + "message": ` + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. 
+ + catalog>`, + "error": common.MapStr{"message": "failed in decode_xml on the \"message\" field: error decoding XML field: XML syntax error on line 7: element closed by "}, + }, + error: true, + errorMessage: "error decoding XML field:", + }, + { + description: "Decoding with broken XML format, with IgnoreFailure true", + config: decodeXMLConfig{ + Field: "message", + IgnoreFailure: true, + }, + Input: common.MapStr{ + "message": ` + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + catalog>`, + }, + Output: common.MapStr{ + "message": ` + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + catalog>`, + }, + }, + { + description: "Test when the XML field is empty, IgnoreMissing false", + config: decodeXMLConfig{ + Field: "message2", + IgnoreMissing: false, + }, + Input: common.MapStr{ + "message": "testing message", + }, + Output: common.MapStr{ + "message": "testing message", + "error": common.MapStr{"message": "failed in decode_xml on the \"message2\" field: key not found"}, + }, + error: true, + errorMessage: "key not found", + }, + { + description: "Test when the XML field is empty IgnoreMissing true", + config: decodeXMLConfig{ + Field: "message2", + IgnoreMissing: true, + }, + Input: common.MapStr{ + "message": "testing message", + }, + Output: common.MapStr{ + "message": "testing message", + }, + }, + { + description: "Test when the XML field not a string, IgnoreFailure false", + config: decodeXMLConfig{ + Field: "message", + IgnoreFailure: false, + }, + Input: common.MapStr{ + "message": 1, + }, + Output: common.MapStr{ + "message": 1, + "error": common.MapStr{"message": "failed in decode_xml on the \"message\" field: field value is not a string"}, + }, + error: true, + errorMessage: "field value is not a string", + }, + { + description: "Test when the XML field not a string, IgnoreFailure true", + config: decodeXMLConfig{ + Field: 
"message", + IgnoreFailure: true, + }, + Input: common.MapStr{ + "message": 1, + }, + Output: common.MapStr{ + "message": 1, + }, + }, + } + + for _, test := range testCases { + test := test + t.Run(test.description, func(t *testing.T) { + t.Parallel() + + f, err := newDecodeXML(test.config) + require.NoError(t, err) + + event := &beat.Event{ + Fields: test.Input, + } + newEvent, err := f.Run(event) + if !test.error { + assert.NoError(t, err) + } else { + if assert.Error(t, err) { + assert.Contains(t, err.Error(), test.errorMessage) + } + } + assert.Equal(t, test.Output, newEvent.Fields) + }) + } +} + +/* BenchmarkProcessor_Run builds a single processor from defaultConfig with Target pointing at xml, then times p.Run in two sub-benchmarks (single_object and nested_and_array_object). Each sub-benchmark reuses one event across all b.N iterations and aborts through b.Fatal on the first error returned by Run. */ func BenchmarkProcessor_Run(b *testing.B) { + c := defaultConfig() + target := "xml" + c.Target = &target + p, err := newDecodeXML(c) + require.NoError(b, err) + + b.Run("single_object", func(b *testing.B) { + evt := &beat.Event{Fields: map[string]interface{}{ + "message": ` + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + `, + }} + + for i := 0; i < b.N; i++ { + _, err = p.Run(evt) + if err != nil { + b.Fatal(err) + } + } + }) + + b.Run("nested_and_array_object", func(b *testing.B) { + evt := &beat.Event{Fields: map[string]interface{}{ + "message": ` + + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century. + + + Ralls, Kim + Midnight Rain + Some review. + + + + Ralls, Kim + A former architect battles corporate zombies, an evil sorceress, and her own childhood to become queen of the world. + + + `, + }} + + for i := 0; i < b.N; i++ { + _, err = p.Run(evt) + if err != nil { + b.Fatal(err) + } + } + }) +} + +/* TestXMLToDocumentID sets DocumentID to catalog.book.seq, runs the processor once, and asserts that the decoded book map no longer contains a seq key while the event Meta holds _id equal to 10. */ func TestXMLToDocumentID(t *testing.T) { + p, err := newDecodeXML(decodeXMLConfig{ + Field: "message", + DocumentID: "catalog.book.seq", + }) + require.NoError(t, err) + + input := common.MapStr{ + "message": ` + + William H. Gaddis + The Recognitions + One of the great seminal American novels of the 20th century.
+ + `, + } + actual, err := p.Run(&beat.Event{Fields: input}) + require.NoError(t, err) + + wantFields := common.MapStr{ + "message": map[string]interface{}{ + "catalog": map[string]interface{}{ + "book": map[string]interface{}{ + "author": "William H. Gaddis", + "review": "One of the great seminal American novels of the 20th century.", + "title": "The Recognitions", + }, + }, + }, + } + wantMeta := common.MapStr{ + "_id": "10", + } + + assert.Equal(t, wantFields, actual.Fields) + assert.Equal(t, wantMeta, actual.Meta) +} diff --git a/libbeat/processors/decode_xml/docs/decode_xml.asciidoc b/libbeat/processors/decode_xml/docs/decode_xml.asciidoc new file mode 100644 index 00000000000..ded0543514a --- /dev/null +++ b/libbeat/processors/decode_xml/docs/decode_xml.asciidoc @@ -0,0 +1,115 @@ +[[decode_xml]] +=== Decode XML + +++++ +<titleabbrev>decode_xml</titleabbrev> +++++ + +experimental[] + +The `decode_xml` processor decodes XML data that is stored under the `field` +key. It outputs the result into the `target_field`. + +This example demonstrates how to decode an XML string contained in the `message` +field and write the resulting fields into the root of the document. Any fields +that already exist will be overwritten. + +[source,yaml] +------- +processors: + - decode_xml: + field: message + target_field: "" + overwrite_keys: true +------- + +By default any decoding errors that occur will stop the processing chain and the +error will be added to the `error.message` field. To ignore all errors and continue +to the next processor you can set `ignore_failure: true`. To specifically +ignore failures caused by `field` not existing use `ignore_missing`. + +[source,yaml] +------- +processors: + - decode_xml: + field: example + target_field: xml + ignore_missing: true + ignore_failure: true +------- + +By default all keys converted from XML will have the names converted to +lowercase.
If there is a need to disable this behavior it is possible to use the +below example: + +[source,yaml] +------- +processors: + - decode_xml: + field: message + target_field: xml + to_lower: false +------- + +Example XML input: + +[source,xml] +------------------------------------------------------------------------------- +{ +<catalog> + <book seq="1"> + <author>William H. Gaddis</author> + <title>The Recognitions</title> + <review>One of the great seminal American novels of the 20th century.</review> + </book> +</catalog> +} +------------------------------------------------------------------------------- + +Will produce the following output: + +[source,json] +------------------------------------------------------------------------------- +{ + "xml": { + "catalog": { + "book": { + "author": "William H. Gaddis", + "review": "One of the great seminal American novels of the 20th century.", + "seq": "1", + "title": "The Recognitions" + } + } + } +} +------------------------------------------------------------------------------- + + +The supported configuration options are: + +`field`:: (Required) Source field containing the XML. Defaults to `message`. + +`target_field`:: (Optional) The field under which the decoded XML will be +written. By default the decoded XML object replaces the field from which it was +read. To merge the decoded XML fields into the root of the event specify +`target_field` with an empty string (`target_field: ""`). Note that the `null` +value (`target_field:`) is treated as if the field was not set at all. + +`overwrite_keys`:: (Optional) A boolean that specifies whether keys that already +exist in the event are overwritten by keys from the decoded XML object. The +default value is false. + +`to_lower`:: (Optional) Converts all keys to lowercase. Accepts either true or +false. The default value is true. + +`document_id`:: (Optional) XML key to use as the document ID. If configured, the +field will be removed from the original XML document and stored in +`@metadata._id`.
+ +`ignore_missing`:: (Optional) If `true` the processor will not return an error +when a specified field does not exist. Defaults to `false`. + +`ignore_failure`:: (Optional) Ignore all errors produced by the processor. +Defaults to `false`. + +See <<conditions>> for a list of supported conditions.