Skip to content

Commit

Permalink
Common: Update probe to use prometheus pusher, sqla 2.0 syntax and da…
Browse files Browse the repository at this point in the history
…ta model. rucio#127

* Changes:
	- Change text-only queries to poll the data model (rucio.db.sqla.models)
	- Push results to a remote (see the probe documentation for descriptions)
		Names: locked_expired_rules.(rse), locked_expired_rules.dids.(rse), locked_expired_rules.rules_for_dids.(rse)
  • Loading branch information
voetberg committed Mar 6, 2024
1 parent 86f8be2 commit 648b262
Showing 1 changed file with 94 additions and 37 deletions.
131 changes: 94 additions & 37 deletions common/check_expired_locked_rules
Original file line number Diff line number Diff line change
@@ -1,56 +1,113 @@
#!/usr/bin/env python
# Copyright European Organization for Nuclear Research (CERN) 2013
#!/usr/bin/env python3
# Copyright 2012-2024 CERN
#
# Licensed under the Apache License, Version 2.0 (the "License");
# You may not use this file except in compliance with the License.
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Authors:
# - Cedric Serfon, <cedric.serfon@cern.ch>, 2015
# - Donata Mielaikaite, <donata.mielaikaite@cern.ch>, 2020
# - Eric Vaandering, <ewv@fnal.gov>, 2020
# - Maggie Voetberg <maggiev@fnal.gov>, 2024


'''
Probe to check the locked expired rules or datasets with locked rules
'''

import sys
from rucio.db.sqla.session import get_session
import traceback

from rucio.db.sqla import models, session
from sqlalchemy import func, select, functions, and_

from utils.common import PrometheusPusher

# Exit statuses handed to sys.exit() by the probe
OK = 0
WARNING = 1
CRITICAL = 2
UNKNOWN = 3

def main():
    '''
    Probe to check the locked expired rules or datasets with locked rules.

    Two grouped queries are run against the Rucio data model and their
    results are published as gauges through PrometheusPusher:

    - locked_expired_rules.{rse_expression}: locked rules whose
      ``expires_at`` is in the past.
    - locked_expired_rules.dids.{rse_expression} and
      locked_expired_rules.rules_for_dids.{rse_expression}: locked rules
      joined to DIDs whose ``expired_at`` is set and in the past.

    When a query returns no rows a single zero-valued gauge labelled
    'null' is still published so the dashboard has a data point.

    The function never returns: it calls ``sys.exit`` with OK when every
    count is zero, CRITICAL when any count is positive, and UNKNOWN when
    a query or push raises.
    '''
    status = OK
    # Use a distinct local name: assigning to ``session`` inside this
    # function would make it a local variable, and the lookup of the
    # imported ``session`` module on the right-hand side would then fail
    # with UnboundLocalError.
    db_session = session.get_session()

    with PrometheusPusher() as manager:
        # --- Locked rules that have already expired -------------------
        try:
            statement = select(
                func.count(models.ReplicationRule.id),
                models.ReplicationRule.rse_expression
            ).where(
                and_(
                    # Compare the Boolean column with a real boolean, as
                    # the second query does, instead of the string '1'.
                    models.ReplicationRule.locked == True,  # noqa: E712
                    # NOTE(review): the replaced text query compared
                    # against sys_extract_utc(localtimestamp); confirm the
                    # database session time zone makes this equivalent.
                    models.ReplicationRule.expires_at < func.current_timestamp()
                )
            ).group_by(models.ReplicationRule.rse_expression)

            # .all() keeps every selected column and materialises the rows
            # once; .scalars() would keep only the first column and a
            # prior .all() call would exhaust the result before the loop.
            rows = db_session.execute(statement).all()
            if not rows:
                # Still want the gauge to pick up something for dashboard purposes
                rows = [(0, 'null')]

            for count, rse_expression in rows:
                manager.gauge(
                    'locked_expired_rules.{rse_expression}',
                    documentation='Number of rules that are locked and expired, by RSE.'
                ).labels(
                    rse_expression=rse_expression
                ).set(count)

                if count > 0:
                    status = CRITICAL

        except Exception:
            print(traceback.format_exc())
            sys.exit(UNKNOWN)

        # --- Expired DIDs that still carry locked rules ---------------
        try:
            statement = select(
                func.count(models.ReplicationRule.id),
                # NOTE(review): on an inner join this counts joined rows,
                # so it presumably equals the rule count above rather than
                # the number of distinct DIDs - confirm the intent.
                func.count(models.DataIdentifier.name),
                models.ReplicationRule.rse_expression
            ).join(
                models.DataIdentifier,
                and_(
                    models.ReplicationRule.scope == models.DataIdentifier.scope,
                    models.ReplicationRule.name == models.DataIdentifier.name
                )
            ).where(
                and_(
                    models.ReplicationRule.locked == True,  # noqa: E712
                    models.DataIdentifier.expired_at != None,  # noqa: E711
                    models.DataIdentifier.expired_at < func.current_timestamp()
                )
            ).group_by(
                models.ReplicationRule.rse_expression
            )

            rows = db_session.execute(statement).all()
            if not rows:
                rows = [(0, 0, 'null')]

            for rules, dids, rse_expression in rows:
                manager.gauge(
                    'locked_expired_rules.dids.{rse_expression}',
                    documentation='Number of expired DIDs with locked rules, by RSE'
                ).labels(
                    rse_expression=rse_expression
                ).set(dids)
                manager.gauge(
                    'locked_expired_rules.rules_for_dids.{rse_expression}',
                    documentation='Number of locked rules for expired DIDs, by RSE'
                ).labels(
                    rse_expression=rse_expression
                ).set(rules)

                if rules > 0 or dids > 0:
                    status = CRITICAL

        except Exception:
            print(traceback.format_exc())
            sys.exit(UNKNOWN)

    # Exit only after the pusher context has closed.
    sys.exit(status)


if __name__ == "__main__":
    main()

0 comments on commit 648b262

Please sign in to comment.