Skip to content

Commit

Permalink
moderation:added match_query_rule and percolator
Browse files Browse the repository at this point in the history
  • Loading branch information
0einstein0 committed Nov 8, 2024
1 parent 193270b commit a39f1e1
Show file tree
Hide file tree
Showing 7 changed files with 271 additions and 1 deletion.
1 change: 1 addition & 0 deletions site/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ tests =
[options.entry_points]
flask.commands =
zenodo-admin = zenodo_rdm.cli:zenodo_admin
moderation = zenodo_rdm.cli:moderation
invenio_base.blueprints =
zenodo_rdm_legacy = zenodo_rdm.legacy.views:blueprint
zenodo_rdm_support = zenodo_rdm.views:create_blueprint
Expand Down
43 changes: 43 additions & 0 deletions site/tests/moderation/test_moderation_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Test ModerationQuery model class."""


from invenio_db import db
from invenio_search import current_search_client

from zenodo_rdm.moderation.models import ModerationQuery


def test_moderation_query_creation(app):
    """Test to create and index a ModerationQuery."""
    # Imported here so the lookup uses the exact index name that
    # ``ModerationQuery.create`` writes to (including any configured search
    # prefix) instead of a hard-coded string.
    from zenodo_rdm.api import ZenodoRDMRecord
    from zenodo_rdm.moderation.percolator import get_percolator_index

    with app.app_context():
        query_string = "metadata.title:SimpleTest"
        notes = "test query"
        score = 5
        active = True

        query = ModerationQuery.create(
            query_string, notes=notes, score=score, active=active
        )
        db.session.commit()

        # Separate asserts so that a failure names the offending field.
        assert query.query_string == query_string
        assert query.notes == notes
        assert query.score == score
        assert query.active == active

        index = get_percolator_index(ZenodoRDMRecord)
        # Newly indexed documents only become searchable after a refresh;
        # without it the search below can legitimately return zero hits.
        current_search_client.indices.refresh(index=index)

        # Check if query indexed
        # NOTE(review): matching on ``query.query_string.query`` presumably
        # relies on the stored query being searchable as text -- confirm this
        # still holds when the index is created with the percolator mapping.
        res = current_search_client.search(
            index=index,
            body={"query": {"match": {"query.query_string.query": query_string}}},
        )
        assert res["hits"]["total"]["value"] == 1

Check failure on line 43 in site/tests/moderation/test_moderation_queries.py

View workflow job for this annotation

GitHub Actions / Python (site, 3.9, postgresql14, opensearch2)

test_moderation_query_creation assert 0 == 1
33 changes: 33 additions & 0 deletions site/zenodo_rdm/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import click
from flask.cli import with_appcontext
from invenio_access.permissions import system_identity
from invenio_communities.communities.records.api import Community
from invenio_db import db
from invenio_pidstore.models import PersistentIdentifier
from invenio_rdm_records.proxies import current_rdm_records_service
Expand All @@ -25,6 +26,12 @@
from invenio_requests.records.api import Request
from invenio_requests.records.models import RequestMetadata

from zenodo_rdm.api import ZenodoRDMRecord
from zenodo_rdm.moderation.percolator import (
create_percolator_index,
get_percolator_index,
)


def _get_parent(record_model):
parent_model = record_model.parent
Expand Down Expand Up @@ -246,3 +253,29 @@ def delete_record(recid):

for req in requests:
current_requests_service.indexer.delete(req)


# Root of the ``moderation`` CLI command group. Subcommands attach to it with
# ``@moderation.command(...)``; the group itself is exposed through the
# ``flask.commands`` entry point ``moderation = zenodo_rdm.cli:moderation``.
@click.group()
def moderation():
    """Moderation commands."""


@moderation.command("create-queries-index")
@click.option(
    "-r",
    "--record-cls",
    type=click.Choice(["records", "communities"], case_sensitive=False),
    default="records",
    help="Record class to base the index on (default: records).",
)
@with_appcontext
def create_index(record_cls):
    """Command to create a percolator index for moderation queries."""
    # The option arrives as a string; translate it to the record API class
    # whose index the percolator index is derived from. The choice is declared
    # case-insensitive, so compare case-insensitively as well.
    record_cls = ZenodoRDMRecord if record_cls.lower() == "records" else Community

    try:
        create_percolator_index(record_cls)
        index_name = get_percolator_index(record_cls)
    except Exception as e:
        # Report on stderr and exit non-zero so that scripts and CI jobs can
        # detect the failure instead of seeing a successful exit status.
        click.secho(f"Error creating percolator index: {e}", fg="red", err=True)
        raise click.exceptions.Exit(1) from e

    click.secho(f"Percolator index '{index_name}' created successfully.")
23 changes: 22 additions & 1 deletion site/zenodo_rdm/moderation/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,13 @@

"""Moderation config."""

from .rules import files_rule, links_rule, text_sanitization_rule, verified_user_rule
from .rules import (
files_rule,
links_rule,
match_query_rule,
text_sanitization_rule,
verified_user_rule,
)

MODERATION_SCORES = {
"spam_link": 8,
Expand Down Expand Up @@ -40,12 +46,27 @@
links_rule,
files_rule,
text_sanitization_rule,
match_query_rule,
]
"""Scoring rules for record moderation."""

# Rule callables used to score communities. ``match_query_rule`` adds the
# scores of the stored percolator queries that match the document.
MODERATION_COMMUNITY_SCORE_RULES = [
    links_rule,
    text_sanitization_rule,
    verified_user_rule,
    match_query_rule,
]
"""Scoring rules for community moderation."""

# Joined with "-" to a record class' index name to form the name of the
# percolator index that stores the moderation queries for that class.
MODERATION_PERCOLATOR_INDEX_PREFIX = "moderation-queries"
"""Index Prefix for percolator index."""

# Extra field mappings merged into the properties copied from the record
# index when the percolator index is created:
#   query  - the stored query, using the search engine's "percolator" type
#   score  - integer score contributed when the query matches
#   notes  - free-text notes about the query
#   active - flag used to filter which queries take part in matching
MODERATION_PERCOLATOR_MAPPING = {
    "properties": {
        "query": {"type": "percolator"},
        "score": {"type": "integer"},
        "notes": {"type": "text"},
        "active": {"type": "boolean"},
    }
}
"""Properties for moderation percolator index."""
52 changes: 52 additions & 0 deletions site/zenodo_rdm/moderation/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,15 @@
import enum
from urllib.parse import urlparse

from flask import current_app
from invenio_db import db
from invenio_search import current_search_client
from sqlalchemy_utils import ChoiceType, Timestamp

from zenodo_rdm.api import ZenodoRDMRecord

from .percolator import index_percolate_query


class LinkDomainStatus(enum.Enum):
"""Link domain status."""
Expand Down Expand Up @@ -73,3 +79,49 @@ def lookup_domain(cls, url):
.limit(1)
.scalar()
)


class ModerationQuery(db.Model):
    """Database model holding the stored moderation queries."""

    __tablename__ = "moderation_queries"

    id = db.Column(db.Integer, primary_key=True, autoincrement=True)
    """Auto-incremented primary key."""

    score = db.Column(db.Integer, default=0)
    """Score attached to the query."""

    query_string = db.Column(db.Text, nullable=False)
    """The query string itself (required)."""

    notes = db.Column(db.Text, nullable=True)
    """Optional free-text notes about the query."""

    active = db.Column(db.Boolean, default=True)
    """Whether the query is currently active."""

    @classmethod
    def create(
        cls, query_string, record_cls=ZenodoRDMRecord, notes=None, score=0, active=True
    ):
        """Add a new query to the session and index it as a percolator query.

        The row is only added to the session; committing is left to the
        caller. The query is indexed in the percolator index derived from
        ``record_cls``.
        """
        instance = cls(
            query_string=query_string,
            notes=notes,
            score=score,
            active=active,
        )
        db.session.add(instance)

        index_percolate_query(
            record_cls, query_string, active=active, score=score, notes=notes
        )

        return instance

    @classmethod
    def get(cls, query_id=None):
        """Return the query with the given ID, or all queries when no ID is given.

        With an ID, the result is a single instance or ``None``; without one,
        it is a list of all stored queries.
        """
        if query_id is None:
            return cls.query.all()
        return cls.query.filter_by(id=query_id).one_or_none()

    def __repr__(self):
        """Return a short, debug-friendly representation."""
        details = f"id={self.id}, score={self.score}, active={self.active}"
        return f"<ModerationQuery {details}>"
90 changes: 90 additions & 0 deletions site/zenodo_rdm/moderation/percolator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2017-2024 CERN.
# Copyright (C) 2022 Graz University of Technology.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Percolator."""


from flask import current_app
from invenio_search import current_search_client
from invenio_search.utils import build_alias_name, build_index_name


def get_percolator_index(record_cls):
    """Return the alias name of the percolator index for a record class.

    The name is the configured ``MODERATION_PERCOLATOR_INDEX_PREFIX`` joined
    with the record's index name, passed through ``build_alias_name``.
    """
    index_prefix = current_app.config.get("MODERATION_PERCOLATOR_INDEX_PREFIX")
    record_index_name = record_cls.index._name
    return build_alias_name(f"{index_prefix}-{record_index_name}", app=current_app)


def create_percolator_index(record_cls):
    """Create the percolator index for moderation queries of a record class.

    The new index reuses the mappings of the record index (so stored queries
    can reference the same fields), adds the properties configured in
    ``MODERATION_PERCOLATOR_MAPPING``, and copies the record index's analysis
    and ``query.default_field`` settings. Nothing is done if the percolator
    index already exists.

    :param record_cls: Record API class whose index is used as the template.
    :raises RuntimeError: If the record alias does not resolve to exactly one
        index, since the mapping to copy would then be ambiguous.
    :raises Exception: Errors raised by the search client while creating the
        index are logged and re-raised so callers can report the failure.
    """
    prefix = current_app.config.get("MODERATION_PERCOLATOR_INDEX_PREFIX")
    percolator_index = build_index_name(
        f"{prefix}-{record_cls.index._name}", app=current_app
    )

    # Bail out before querying mappings/settings when there is nothing to do.
    if current_search_client.indices.exists(percolator_index):
        return

    # Get the current mapping for the record index to copy its structure.
    record_index = build_alias_name(record_cls.index._name, app=current_app)
    record_mapping = current_search_client.indices.get_mapping(index=record_index)
    # Explicit check instead of ``assert``: assertions are stripped under -O.
    if len(record_mapping) != 1:
        raise RuntimeError(
            f"Expected exactly one index behind '{record_index}', "
            f"found {len(record_mapping)}."
        )
    percolator_mappings = next(iter(record_mapping.values()))["mappings"]

    # Add the percolator-specific fields from the app configuration.
    percolator_mappings["properties"].update(
        current_app.config.get("MODERATION_PERCOLATOR_MAPPING")["properties"]
    )

    # Copy the relevant settings of the record index.
    settings_response = current_search_client.indices.get_settings(index=record_index)
    record_settings = next(iter(settings_response.values()))["settings"]["index"]

    percolator_settings = {"analysis": record_settings.get("analysis", {})}
    default_field = record_settings.get("query", {}).get("default_field")
    if default_field:
        # Only forward the setting when the record index defines it; writing
        # an empty list would override the search engine's own default.
        percolator_settings["index"] = {"query": {"default_field": default_field}}

    try:
        current_search_client.indices.create(
            index=percolator_index,
            body={
                "settings": percolator_settings,
                "mappings": percolator_mappings,
            },
        )
    except Exception as e:
        current_app.logger.exception(e)
        # Re-raise: swallowing here would make callers report success even
        # though no index was created.
        raise


def index_percolate_query(record_cls, query_string, active=True, score=1, notes=None):
    """Store a query-string query in the percolator index of ``record_cls``.

    Any exception raised while resolving the index or indexing the document
    is logged through the application logger and not propagated.
    """
    try:
        target_index = get_percolator_index(record_cls)
        document = {
            "query": {"query_string": {"query": query_string}},
            "active": active,
            "score": score,
            "notes": notes,
        }
        current_search_client.index(index=target_index, body=document)
    except Exception as err:
        current_app.logger.exception(err)
30 changes: 30 additions & 0 deletions site/zenodo_rdm/moderation/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,11 @@
import re

from flask import current_app
from invenio_search import current_search_client
from invenio_search.utils import build_alias_name

from .models import LinkDomain, LinkDomainStatus
from .percolator import get_percolator_index
from .proxies import current_scores

#
Expand Down Expand Up @@ -130,3 +133,30 @@ def files_rule(identity, draft=None, record=None):
score += current_scores.ham_files

return score


def match_query_rule(identity, draft=None, record=None):
    """Calculate a score based on matched percolate queries against the given document.

    The record is dumped and percolated against the active queries stored in
    the percolator index derived from it; the scores of all returned hits are
    summed.

    :param identity: Identity of the caller (unused by this rule).
    :param draft: Draft being moderated (unused by this rule).
    :param record: Record to percolate. When ``None`` there is no document to
        match, so the rule contributes nothing.
    :returns: Sum of the ``score`` values of the matching queries, ``0`` when
        nothing matches or no record is given.
    """
    # ``record`` defaults to None in the signature; without a record there is
    # nothing to percolate.
    if record is None:
        return 0

    document = record.dumps()
    percolator_index = get_percolator_index(record)
    if not percolator_index:
        return 0

    # NOTE(review): no "size" is passed, so only the search engine's default
    # page of hits is summed -- confirm this is enough for the expected
    # number of matching queries.
    matched_queries = current_search_client.search(
        index=percolator_index,
        body={
            "query": {
                "bool": {
                    "must": [
                        {"term": {"active": True}},
                        {"percolate": {"field": "query", "document": document}},
                    ]
                }
            }
        },
    )

    # ``or 0`` guards against a stored query whose score is null.
    return sum(
        hit["_source"].get("score") or 0 for hit in matched_queries["hits"]["hits"]
    )

0 comments on commit a39f1e1

Please sign in to comment.