From 51bbceff4983abe421a86ae90fd6b58f8bf85de9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20S=C5=82ota?= Date: Mon, 17 Jun 2019 17:54:56 +0200 Subject: [PATCH 1/2] Add migration instructions for Riak @ MIM 3.3.0+ * `3.3.0_3.3.0plus.md` has been modified by adding migration guide for Riak @ MIM 3.3.0+ * `jid-from-mam-muc-script.md` has been modified reflect the changes made to `sender-jid-from-mam-message.escript` script * `sender-jid-from-mam-message.escript` script has been changed so that it returns `-2\n` when the given stanza is not a correct MUC message. --- doc/migrations/3.3.0_3.3.0plus.md | 39 +++++++++++++++- doc/migrations/jid-from-mam-muc-script.md | 7 ++- test/migration_scripts_SUITE.erl | 46 +++++++++++++++++-- .../sender-jid-from-mam-message.escript | 42 ++++++++++++++--- 4 files changed, 119 insertions(+), 15 deletions(-) diff --git a/doc/migrations/3.3.0_3.3.0plus.md b/doc/migrations/3.3.0_3.3.0plus.md index b064b9a46c7..96522bdb56d 100644 --- a/doc/migrations/3.3.0_3.3.0plus.md +++ b/doc/migrations/3.3.0_3.3.0plus.md @@ -51,7 +51,44 @@ TODO ### Riak -TODO +Changes to Riak schema are backwards compatible with the current MongooseIM release. +This means that in case of skipping of the migrations, only some new features (namely GDPR data retrival) won't work correctly. + +#### Step 1 + +Please update the Riak schema: +```bash +# Set the RIAK_HOST to your Riak HTTP endpoint +# Set the RIAK_MAM_SCHEMA_PATH to point to new schema path, which +# by default is: RIAK_MAM_SCHEMA_PATH=tools/mam_search_schema.xml +curl -v -XPUT $RIAK_HOST/search/schema/mam \ + -H 'Content-Type:application/xml' \ + --data-binary @${RIAK_MAM_SCHEMA_PATH} +``` + +After that we need to either reload all Riak nodes (restart them) or manually reload the schema on live nodes. +Reloading of schema of live nodes requires access to Erlang Shell of one of the Riak nodes (any of them). +The instruction on how to get to Riak's Erlang shell is beyond this guide, but if you manage to get to it, just call: + +```erlang +yz_index:reload(<<"mam">>). +``` + +#### Step 2 + +After the schema is posted and reloaded, all "new" objects will be indexed properly as long they contain 2 new fields: `msg_owner_jid` and `mam_type`. +The new MongooseIM code will insert both of them for all new MAM entires, but for all existing ones, we need to "update" by adding those fields. +In order to do that, we need to create a migration script (just pick your favourite scripting/programming language) that will do the following for *each* object in *each* bucket of type `mam_yz` (the object will be referred as `obj`): + +* Use [this dedicated script](jid-from-mam-muc-script.md) to convert `obj.packet_register` field value into so called `$SENDER_JID`. +* If the script returns `$SENDER_JID` correctly: + * set `obj.mam_type = 'muc'` + * set `obj.msg_owner_jid = $SENDER_JID` +* If the script returns error code `-2` + * set `obj.mam_type = 'pm'` + * based on `obj_yz_rk` which is in format `$LOCAL_JID/$REMOTE_JID/$MSG_ID`, set `obj.msg_owner_jid = $LOCAL_JID` +* Save the modified `obj` + ### ElasticSearch diff --git a/doc/migrations/jid-from-mam-muc-script.md b/doc/migrations/jid-from-mam-muc-script.md index 6dbd7f65067..6c85850c06f 100644 --- a/doc/migrations/jid-from-mam-muc-script.md +++ b/doc/migrations/jid-from-mam-muc-script.md @@ -57,8 +57,11 @@ LENGTH\nJID * `JID` is a sequence of bytes, which encodes a Unicode string * `LENGTH` and `PAYLOAD` are separated with a newline character (ASCII code 10 / 0x0a) -If JID couldn't be extracted for some reason (and it's not a critical error, like I/O failure), the script will continue to work and will return `-1\n`. -It's `-1` for `LENGTH`, followed by a newline character and no `PAYLOAD` part (or 0-length `PAYLOAD` if you like). +In case of an error (that is not a critical error, like I/O failure), script will print `-N\n` (where `N` is an error code) and will continue to work. +Technically it's `-N` for `LENGTH`, followed by a newline character and no `PAYLOAD` part (or 0-length `PAYLOAD` if you like). +The following error codes are supported: +* `-1\n` - Unknown error. Something went wrong with the JID extraction (most likely malformed input). +* `-2\n` - Invalid message type. The message / stanza has been decoded successfully, but it's not a groupchat message. ## Examples diff --git a/test/migration_scripts_SUITE.erl b/test/migration_scripts_SUITE.erl index b8af9b80832..6027df3fa8c 100644 --- a/test/migration_scripts_SUITE.erl +++ b/test/migration_scripts_SUITE.erl @@ -7,7 +7,9 @@ sender_jid_from_mam_muc_eterm_stream/1, sender_jid_from_mam_muc_xml_stream/1, sender_jid_from_mam_muc_doesnt_crash_on_unsupported_eterm_input/1, - sender_jid_from_mam_muc_doesnt_crash_on_unsupported_xml_input/1 + sender_jid_from_mam_muc_doesnt_crash_on_unsupported_xml_input/1, + sender_jid_from_mam_muc_doesnt_crash_on_malformed_eterm_input/1, + sender_jid_from_mam_muc_doesnt_crash_on_malformed_xml_input/1 ]). %% ---------------------------------------------------------- @@ -25,7 +27,9 @@ groups() -> sender_jid_from_mam_muc_eterm_stream, sender_jid_from_mam_muc_xml_stream, sender_jid_from_mam_muc_doesnt_crash_on_unsupported_eterm_input, - sender_jid_from_mam_muc_doesnt_crash_on_unsupported_xml_input + sender_jid_from_mam_muc_doesnt_crash_on_unsupported_xml_input, + sender_jid_from_mam_muc_doesnt_crash_on_malformed_eterm_input, + sender_jid_from_mam_muc_doesnt_crash_on_malformed_xml_input ]} ]. @@ -50,11 +54,11 @@ sender_jid_from_mam_muc_eterm_stream(_Config) -> sender_jid_from_mam_muc_xml_stream(_Config) -> Port = script_helper:start("tools/migration/sender-jid-from-mam-message.escript", ["xml"]), - sender_jid_from_mam_muc_data_stream(Port, fun(B) -> B end). + sender_jid_from_mam_muc_data_stream(Port, fun(B) -> B end). sender_jid_from_mam_muc_data_stream(Port, PayloadConverterFun) -> lists:foreach(fun(JID) -> - MsgBin = sample_archived_muc_message(JID), + MsgBin = sample_archived_muc_message(JID), script_helper:write(Port, PayloadConverterFun(MsgBin)), JID = script_helper:read(Port) end, [<<"alice@localhost">>, <<"zażółćgęśląjaźń@localhost2"/utf8>>, @@ -68,10 +72,28 @@ sender_jid_from_mam_muc_doesnt_crash_on_unsupported_xml_input(_Config) -> Port = script_helper:start("tools/migration/sender-jid-from-mam-message.escript", ["xml"]), sender_jid_from_mam_muc_doesnt_crash_on_unsupported_input(Port, fun(B) -> B end). + +sender_jid_from_mam_muc_doesnt_crash_on_malformed_eterm_input(_Config) -> + Port = script_helper:start("tools/migration/sender-jid-from-mam-message.escript", ["eterm"]), + sender_jid_from_mam_muc_doesnt_crash_on_malformed_input(Port, fun binary_string_to_eterm/1). + +sender_jid_from_mam_muc_doesnt_crash_on_malformed_xml_input(_Config) -> + Port = script_helper:start("tools/migration/sender-jid-from-mam-message.escript", ["xml"]), + sender_jid_from_mam_muc_doesnt_crash_on_malformed_input(Port, fun(B) -> B end). + sender_jid_from_mam_muc_doesnt_crash_on_unsupported_input(Port, PayloadConverterFun) -> - %% First we expect that the script replies with -1 length.... + %% First we expect that the script replies with -2 length (non MUC message).... InvalidPayload = PayloadConverterFun(sample_archived_1_to_1_message()), script_helper:write(Port, InvalidPayload), + {error, -2} = script_helper:read(Port), + + %% Then we confirm with valid payload that the script actually still works + sender_jid_from_mam_muc_data_stream(Port, PayloadConverterFun). + +sender_jid_from_mam_muc_doesnt_crash_on_malformed_input(Port, PayloadConverterFun) -> + %% First we expect that the script replies with -1 length (malformed message).... + InvalidPayload = PayloadConverterFun(sample_malformed_muc_message()), + script_helper:write(Port, InvalidPayload), {error, -1} = script_helper:read(Port), %% Then we confirm with valid payload that the script actually still works @@ -98,6 +120,20 @@ sample_archived_1_to_1_message() -> <<"" "Zażółć gęślą jaźń"/utf8>>. +sample_malformed_muc_message() -> + <<" + Zażółć gęślą jaźń + + + + "/utf8>>. + + binary_string_to_eterm(Bin) -> {ok, XmlEl} = exml:parse(Bin), term_to_binary(XmlEl). diff --git a/tools/migration/sender-jid-from-mam-message.escript b/tools/migration/sender-jid-from-mam-message.escript index 46bcb9b8025..3976984eaff 100755 --- a/tools/migration/sender-jid-from-mam-message.escript +++ b/tools/migration/sender-jid-from-mam-message.escript @@ -48,21 +48,45 @@ common_loop(ExtractionFun) -> % We skip trailing \n InLen = binary_to_integer(binary:part(InLenBin, 0, byte_size(InLenBin) - 1)), {ok, Data} = file:read(standard_io, InLen), - safe_jid_extraction(ExtractionFun, Data), + safe_jid_extraction(ExtractionFun, Data), common_loop(ExtractionFun) end. jid_from_eterm(ETerm) -> - {xmlel, <<"message">>, _, MsgChildren} = binary_to_term(ETerm), - {xmlel, <<"x">>, _, XChildren} = - lists:keyfind([{<<"xmlns">>, <<"http://jabber.org/protocol/muc#user">>}], 3, MsgChildren), - {xmlel, _, ItemAttrs, _} = lists:keyfind(<<"item">>, 2, XChildren), - {_, JID} = lists:keyfind(<<"jid">>, 1, ItemAttrs), - JID. + {xmlel, <<"message">>, MsgAttrs, MsgChildren} = binary_to_term(ETerm), + case lists:keyfind(<<"type">>, 1, MsgAttrs) of + {_, <<"groupchat">>} -> ok; + _ -> throw(not_muc_message) + end, + case lists:keyfind([{<<"xmlns">>, <<"http://jabber.org/protocol/muc#user">>}], 3, MsgChildren) of + {xmlel, <<"x">>, _, XChildren} -> + {xmlel, _, ItemAttrs, _} = lists:keyfind(<<"item">>, 2, XChildren), + {_, JID} = lists:keyfind(<<"jid">>, 1, ItemAttrs), + JID; + _ -> + throw(not_muc_message) + end. + jid_from_xml(XML) -> XmerlFriendlyXML = "" ++ binary_to_list(XML), {Doc, _} = xmerl_scan:string(XmerlFriendlyXML), + case xmerl_xpath:string("/message/@type", Doc) of + [#xmlAttribute{ value = "groupchat" }] -> + ok; + _ -> + throw(not_muc_message) + end, + Xs = xmerl_xpath:string("/message/x", Doc), + IsMUC = + lists:any(fun + (#xmlElement{ namespace = #xmlNamespace{ default = 'http://jabber.org/protocol/muc#user' }}) -> + true; + (_Elem) -> + false + end, Xs), + IsMUC orelse throw(not_muc_message), + [#xmlAttribute{ value = JID }] = xmerl_xpath:string("/message/x/item/@jid", Doc), unicode:characters_to_binary(JID). @@ -73,6 +97,10 @@ safe_jid_extraction(JIDExtractorFun, Data) -> OutLenBin = integer_to_binary(OutLen), ok = file:write(standard_io, <>) catch + throw:R -> + Extra = #{ type => invalid_message_type, data => Data }, + debug(throw, R, erlang:get_stacktrace(), Extra), + ok = io:put_chars("-2\n"); C:R -> Extra = #{ type => cannot_extract_jid, data => Data }, debug(C, R, erlang:get_stacktrace(), Extra), From 064873ee13144bde7cf3762d572a29e8c037acb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20S=C5=82ota?= Date: Wed, 19 Jun 2019 03:35:25 -0700 Subject: [PATCH 2/2] fixup! Add migration instructions for Riak @ MIM 3.3.0+ --- doc/migrations/3.3.0_3.3.0plus.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/migrations/3.3.0_3.3.0plus.md b/doc/migrations/3.3.0_3.3.0plus.md index 96522bdb56d..b830c3b9544 100644 --- a/doc/migrations/3.3.0_3.3.0plus.md +++ b/doc/migrations/3.3.0_3.3.0plus.md @@ -51,8 +51,8 @@ TODO ### Riak -Changes to Riak schema are backwards compatible with the current MongooseIM release. -This means that in case of skipping of the migrations, only some new features (namely GDPR data retrival) won't work correctly. +Changes to Riak schema are backward compatible with the current MongooseIM release. +This means that skipping the migration will cause only some of the new features (namely GDPR data retrival) to not work correctly. #### Step 1 @@ -67,7 +67,7 @@ curl -v -XPUT $RIAK_HOST/search/schema/mam \ ``` After that we need to either reload all Riak nodes (restart them) or manually reload the schema on live nodes. -Reloading of schema of live nodes requires access to Erlang Shell of one of the Riak nodes (any of them). +Reloading the schema on live nodes requires access to Erlang Shell of one of the Riak nodes (any of them). The instruction on how to get to Riak's Erlang shell is beyond this guide, but if you manage to get to it, just call: ```erlang @@ -77,16 +77,16 @@ yz_index:reload(<<"mam">>). #### Step 2 After the schema is posted and reloaded, all "new" objects will be indexed properly as long they contain 2 new fields: `msg_owner_jid` and `mam_type`. -The new MongooseIM code will insert both of them for all new MAM entires, but for all existing ones, we need to "update" by adding those fields. +The new MongooseIM code will insert both of them for all new MAM entires, but for all existing ones need to have the fields added. In order to do that, we need to create a migration script (just pick your favourite scripting/programming language) that will do the following for *each* object in *each* bucket of type `mam_yz` (the object will be referred as `obj`): -* Use [this dedicated script](jid-from-mam-muc-script.md) to convert `obj.packet_register` field value into so called `$SENDER_JID`. +* Use [this dedicated script](jid-from-mam-muc-script.md) to convert the `obj.packet_register` field value into a so called `$SENDER_JID`. * If the script returns `$SENDER_JID` correctly: * set `obj.mam_type = 'muc'` * set `obj.msg_owner_jid = $SENDER_JID` * If the script returns error code `-2` * set `obj.mam_type = 'pm'` - * based on `obj_yz_rk` which is in format `$LOCAL_JID/$REMOTE_JID/$MSG_ID`, set `obj.msg_owner_jid = $LOCAL_JID` + * based on `obj_yz_rk` formatted as `$LOCAL_JID/$REMOTE_JID/$MSG_ID`, set `obj.msg_owner_jid = $LOCAL_JID` * Save the modified `obj`