Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GDPR data retrival for mod_mam_muc @ Riak DBMS #2336

Merged
merged 2 commits into from
Jun 17, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion big_tests/tests/gdpr_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ groups() ->
{group, retrieve_personal_data_mam_elasticsearch}
]},
{retrieve_personal_data_mam_rdbms, [], all_mam_testcases()},
{retrieve_personal_data_mam_riak, [], mam_testcases()},
{retrieve_personal_data_mam_riak, [], all_mam_testcases()},
{retrieve_personal_data_mam_cassandra, [], all_mam_testcases()},
{retrieve_personal_data_mam_elasticsearch, [], all_mam_testcases()},
{remove_personal_data, [], removal_testcases()},
Expand Down
67 changes: 42 additions & 25 deletions src/mam/mod_mam_riak_timed_arch_yz.erl
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
-export([key/3]).

%% For tests only
-export([create_obj/5, read_archive/7, bucket/1,
-export([create_obj/6, read_archive/7, bucket/1,
list_mam_buckets/0, remove_bucket/1]).

-export([get_mam_muc_gdpr_data/2, get_mam_pm_gdpr_data/2]).
Expand Down Expand Up @@ -122,20 +122,26 @@ stop_muc_archive(Host) ->
ejabberd_hooks:delete(mam_muc_remove_archive, Host, ?MODULE, remove_archive, 50),
ok.

%% LocJID - archive owner's JID
%% RemJID - interlocutor's JID
%% SrcJID - "Real" sender JID
archive_message(_Result, Host, MessId, _UserID, LocJID, RemJID, SrcJID, _Dir, Packet) ->
try
archive_message(Host, MessId, LocJID, RemJID, SrcJID, Packet, pm)
archive_message(Host, MessId, LocJID, RemJID, SrcJID, LocJID, Packet, pm)
catch _Type:Reason ->
?WARNING_MSG("Could not write message to archive, reason: ~p",
[{Reason, erlang:get_stacktrace()}]),
ejabberd_hooks:run(mam_drop_message, Host, [Host]),
{error, Reason}
end.

archive_message_muc(_Result, Host, MessId, _UserID, LocJID, _FromJID, SrcJID, _Dir, Packet) ->
%% LocJID - MUC/MUC Light room's JID
%% FromJID - "Real" sender JID
%% SrcJID - Full JID of user within room (room@domain/user)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

room@domain/nickname, because user may be confused with username

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually didn't want to use nickname as this may also be JID (I think) in case of MUC Light. I agree with the fact thatuser is not the best choice, but nickname doesn't feel generic enough.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, fair enough. :)

archive_message_muc(_Result, Host, MessId, _UserID, LocJID, FromJID, SrcJID, _Dir, Packet) ->
RemJIDMuc = maybe_muc_jid(SrcJID),
try
archive_message(Host, MessId, LocJID, RemJIDMuc, SrcJID, Packet, muc)
archive_message(Host, MessId, LocJID, RemJIDMuc, SrcJID, FromJID, Packet, muc)
catch _Type:Reason ->
?WARNING_MSG("Could not write MUC message to archive, reason: ~p",
[{Reason, erlang:get_stacktrace()}]),
Expand Down Expand Up @@ -203,22 +209,34 @@ remove_bucket(Bucket) ->
{ok, Keys} = mongoose_riak:list_keys(Bucket),
[mongoose_riak:delete(Bucket, Key) || Key <- Keys].

archive_message(Host, MessID, LocJID, RemJID, SrcJID, Packet, Type) ->

%% PM:
%% * LocJID - archive owner's JID
%% * RemJID - interlocutor's JID
%% * SrcJID - "Real" sender JID
%% * OwnerJID - Same as LocJID
%% MUC / MUC Light:
%% * LocJID - MUC/MUC Light room's JID
%% * RemJID - Nickname of JID of destination
%% * SrcJID - Full JID of user within room (room@domain/user)
%% * OwnerJID - "Real" sender JID (not room specific)
archive_message(Host, MessID, LocJID, RemJID, SrcJID, OwnerJID, Packet, Type) ->
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have a feeling that if we need to explain what variable means, depending on some context, we could decompose the logic. However, I understand thats implication of working with legacy code

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would definitely be solved by a refactor (in the future), where we could replace these arguments with parameter map. Indeed now it's confusing as ***.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, indeed this is very confusing. But changing this is this module only will only make things worse, as other MAM backends use exactly the same wording.
@ludwikbukowski I actually tried to provide descriptions without context, because those arguments seem to be abstract enough (in the end, whole logic of this function doesn't do anything specific depending on context), but I simply failed to figure out the proper wording. Still, I believe that having those definitions here even context-specific is better then not having it and guessing each time we develop an extension.

LocalJID = mod_mam_utils:bare_jid(LocJID),
RemoteJID = mod_mam_utils:bare_jid(RemJID),
SourceJID = mod_mam_utils:full_jid(SrcJID),
BareOwnerJID = mod_mam_utils:bare_jid(OwnerJID),
MsgId = integer_to_binary(MessID),
Key = key(LocalJID, RemoteJID, MsgId),

Bucket = bucket(MessID),

RiakMap = create_obj(Host, MsgId, SourceJID, Packet, Type),
RiakMap = create_obj(Host, MsgId, SourceJID, BareOwnerJID, Packet, Type),
case mongoose_riak:update_type(Bucket, Key, riakc_map:to_op(RiakMap)) of
ok -> ok;
Other -> throw(Other)
end.

create_obj(Host, MsgId, SourceJID, Packet, Type) ->
create_obj(Host, MsgId, SourceJID, BareOwnerJID, Packet, Type) ->
ModMAM =
case Type of
pm -> mod_mam;
Expand All @@ -231,6 +249,10 @@ create_obj(Host, MsgId, SourceJID, Packet, Type) ->
fun(R) -> riakc_register:set(MsgId, R) end},
{{<<"source_jid">>, register},
fun(R) -> riakc_register:set(SourceJID, R) end},
{{<<"msg_owner_jid">>, register},
fun(R) -> riakc_register:set(BareOwnerJID, R) end},
{{<<"mam_type">>, register},
fun(R) -> riakc_register:set(atom_to_binary(Type, latin1), R) end},
{{<<"packet">>, register},
fun(R) -> riakc_register:set(packet_to_stored_binary(Host, Packet), R) end},
{{<<"search_text">>, register},
Expand Down Expand Up @@ -332,26 +354,24 @@ get_message2(Host, MsgId, Bucket, Key) ->
-spec get_mam_pm_gdpr_data(jid:username(), jid:server()) ->
{ok, ejabberd_gen_mam_archive:mam_pm_gdpr_data()}.
get_mam_pm_gdpr_data(Username, Host) ->
LUser = jid:nodeprep(Username),
LServer = jid:nodeprep(Host),
Jid = jid:make({LUser, LServer, <<>>}),
LookupParams = ?DUMMY_LOOKUP_PARAMETERS#{owner_jid := Jid},
{ok, {_, _, Messages}} = lookup_messages([], Host, LookupParams),
Messages = get_mam_gdpr_data(Username, Host, <<"pm">>),
{ok, [{Id, jid:to_binary(Jid), exml:to_binary(Packet)} || {Id, Jid, Packet} <- Messages]}.

-spec get_mam_muc_gdpr_data(jid:username(), jid:server()) ->
{ok, ejabberd_gen_mam_archive:mam_muc_gdpr_data()}.
get_mam_muc_gdpr_data(Username, Host) ->
Messages = get_mam_gdpr_data(Username, Host, <<"muc">>),
{ok, [{MsgId, exml:to_binary(Packet)} || {MsgId, _, Packet} <- Messages]}.

get_mam_gdpr_data(Username, Host, Type) ->
LUser = jid:nodeprep(Username),
LServer = jid:nodeprep(Host),
Jid = jid:make({LUser, LServer, <<>>}),
LookupParams = ?DUMMY_LOOKUP_PARAMETERS#{with_jid := Jid},
{ok, {_, _, Messages}} = lookup_messages([], Host, LookupParams),
Filtered = lists:filter(fun(El) -> is_muclight_message(Jid, El) end, Messages),
{ok, [{MsgId, exml:to_binary(Packet)} || {MsgId, _, Packet} <- Filtered]}.

is_muclight_message(_BareJid, {_MsgId, #jid{lresource = <<>>}, _Packet}) -> false;
is_muclight_message(BareJid, {_MsgId, #jid{lresource = Resource}, _Packet}) -> jid:to_binary(BareJid) == Resource.
BareJid = jid:make({LUser, LServer, <<>>}),
BareJidBin = jid:to_binary(BareJid),
Query = <<"msg_owner_jid_register:", BareJidBin/binary, " AND mam_type_register:", Type/binary>>,
SearchOpts = [],
{ok, _Cnt, _, MsgIds} = fold_archive(fun get_msg_id_key/3, Query, SearchOpts, []),
get_messages(Host, MsgIds).

remove_archive(Acc, Host, ArchiveID, ArchiveJID) ->
remove_archive(Host, ArchiveID, ArchiveJID),
Expand Down Expand Up @@ -461,12 +481,9 @@ search_text_filter(SearchText) ->
<<"search_text_register:", NormText/binary, "~1">>.

jid_filters(LocalJid, undefined) ->
<<"_yz_rk:", LocalJid/binary, "*">>;
jid_filters(undefined, RemoteJid) ->
%%added only for gdpr data retrieval, don't use for other purposes
<<"_yz_rk:*/", RemoteJid/binary, "*">>;
<<"_yz_rk:", LocalJid/binary, "/*/*">>;
jid_filters(LocalJid, RemoteJid) ->
<<"_yz_rk:", LocalJid/binary, "/", RemoteJid/binary, "*">>.
<<"_yz_rk:", LocalJid/binary, "/", RemoteJid/binary, "/*">>.

id_filters(undefined, undefined) ->
undefined;
Expand Down
2 changes: 2 additions & 0 deletions tools/mam_search_schema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
<schema name="mam_search_schema" version="1.5">
<fields>
<field name="msg_id_register" type="tlong" indexed="true" stored="false" multiValued="false"/>
<field name="msg_owner_jid_register" type="_yz_str" indexed="true" stored="false" multiValued="false"/>
<field name="mam_type_register" type="_yz_str" indexed="true" stored="false" multiValued="false"/>
<field name="search_text_register" type="text_general" indexed="true" stored="false" multiValued="false"/>
<field name="packet_register" type="ignored" indexed="false" stored="false" multiValued="false"/>

Expand Down