11"""A command to perform fuzzy and exact matching of leaders/slack members with User model."""
22
3+ from django .contrib .contenttypes .models import ContentType
34from django .core .management .base import BaseCommand
5+ from django .db import transaction
46from thefuzz import fuzz
57
68from apps .github .models .user import User
79from apps .owasp .models .chapter import Chapter
810from apps .owasp .models .committee import Committee
11+ from apps .owasp .models .entity_member import EntityMember
912from apps .owasp .models .project import Project
10- from apps .slack .models import Member
1113
1214ID_MIN_LENGTH = 2
1315
1416
1517class Command (BaseCommand ):
16- help = "Match leaders or Slack members with GitHub users using exact and fuzzy matching ."
18+ help = "Matches entity leader names with GitHub Users and creates EntityMember records ."
1719
1820 def add_arguments (self , parser ):
1921 parser .add_argument (
2022 "model_name" ,
2123 type = str ,
22- choices = ("chapter" , "committee" , "member " , "project " ),
23- help = "Model name to process: chapter, committee, project, or member " ,
24+ choices = ("chapter" , "committee" , "project " , "all " ),
25+ help = "Model to process: chapter, committee, project, or all. " ,
2426 )
2527 parser .add_argument (
2628 "--threshold" ,
@@ -29,103 +31,114 @@ def add_arguments(self, parser):
2931 help = "Threshold for fuzzy matching (0-100)" ,
3032 )
3133
@transaction.atomic
def handle(self, *_args, **kwargs):
    """Match entity leaders against GitHub users for the requested model(s).

    Reads ``model_name`` and ``threshold`` from ``kwargs``, clamps the
    threshold to the 0-100 range, loads the GitHub users accepted by
    ``_is_valid_user`` once, and passes every selected model class to
    ``_process_entities``. The decorator wraps the whole run in one
    database transaction.
    """
    requested = kwargs["model_name"].lower()
    # Keep the fuzzy threshold inside 0-100 regardless of what was passed.
    threshold = max(0, min(kwargs["threshold"], 100))

    registry = {"chapter": Chapter, "committee": Committee, "project": Project}
    if requested == "all":
        selected_models = registry.values()
    else:
        selected_models = [registry[requested]]

    self.stdout.write("Loading GitHub users into memory...")
    candidates = []
    for row in User.objects.values("id", "login", "name"):
        if self._is_valid_user(row["login"], row["name"]):
            candidates.append(row)
    self.stdout.write(f"Found {len(candidates)} valid users for matching.")

    # The same in-memory user list is reused for every model class.
    for model_class in selected_models:
        self._process_entities(model_class, candidates, threshold)

    self.stdout.write(self.style.SUCCESS("\n Command finished successfully."))
56+
def _process_entities(self, model_class, users_list, threshold):
    """Create unreviewed leader ``EntityMember`` rows for one model class.

    Every instance of ``model_class`` with a non-empty ``leaders_raw`` is run
    through ``_find_user_matches``; each matched user becomes an
    ``EntityMember`` of kind ``LEADER`` with ``is_reviewed=False``. All rows
    are written with a single ``bulk_create(ignore_conflicts=True)`` call.

    Args:
        model_class: The model class whose instances are scanned.
        users_list: User dicts (``id``, ``login``, ``name``) to match against.
        threshold: Fuzzy-match threshold forwarded to ``_find_user_matches``.
    """
    # ``model_class`` is itself a class, so its own ``__name__`` is the label.
    # Going through ``__class__`` yields the metaclass name for every model.
    model_label = model_class.__name__
    self.stdout.write(f"\n --- Processing {model_label} ---")

    content_type = ContentType.objects.get_for_model(model_class)
    new_members_to_create = []

    for entity in model_class.objects.all():
        if not entity.leaders_raw:
            continue

        matched_users = self._find_user_matches(entity.leaders_raw, users_list, threshold)
        if not matched_users:
            continue

        self.stdout.write(f" - Found {len(matched_users)} leader matches for '{entity} '")

        new_members_to_create.extend(
            EntityMember(
                content_type=content_type,
                object_id=entity.pk,
                member_id=user["id"],
                kind=EntityMember.MemberKind.LEADER,
                is_reviewed=False,
            )
            for user in matched_users
        )

    if new_members_to_create:
        # NOTE(review): with ignore_conflicts=True the returned list appears to
        # contain every object passed in, so this count may include rows that
        # already existed and were skipped -- confirm against the Django docs.
        created_records = EntityMember.objects.bulk_create(
            new_members_to_create,
            ignore_conflicts=True,
        )
        self.stdout.write(
            self.style.SUCCESS(
                f" -> Created {len(created_records)} new leader records for {model_label} ."
            )
        )
    else:
        self.stdout.write(
            self.style.NOTICE(f" -> No new leader records to create for {model_label} .")
        )
75104
def _is_valid_user(self, login, name):
    """Return True when both login and name have at least ID_MIN_LENGTH characters.

    A falsy ``name`` (for example ``None``) is treated as an empty string.
    """
    if len(login) < ID_MIN_LENGTH:
        return False
    display_name = name or ""
    return len(display_name) >= ID_MIN_LENGTH
79108
80- def process_leaders (self , leaders_raw , threshold , filtered_users ):
81- """Process leaders with optimized matching, capturing all exact matches."""
82- if not leaders_raw :
83- return [], [], []
84-
85- exact_matches = []
86- fuzzy_matches = []
87- unmatched_leaders = []
88- processed_leaders = set ()
109+ def _find_user_matches (self , leaders_raw , users_list , threshold ):
110+ """Find user matches for a list of raw leader names."""
111+ matched_users = []
89112
90- user_list = list (filtered_users .values ())
91- for leader in leaders_raw :
92- if not leader or leader in processed_leaders :
113+ for leader_name in set (leaders_raw ):
114+ if not leader_name :
93115 continue
94116
95- processed_leaders .add (leader )
96- leader_lower = leader .lower ()
97-
98- # Find all exact matches
99- exact_matches_for_leader = [
100- u
101- for u in user_list
102- if u ["login" ].lower () == leader_lower
103- or (u ["name" ] and u ["name" ].lower () == leader_lower )
104- ]
105-
106- if exact_matches_for_leader :
107- exact_matches .extend (exact_matches_for_leader )
108- for match in exact_matches_for_leader :
109- self .stdout .write (f"Exact match found for { leader } : { match ['login' ]} " )
117+ leader_lower = leader_name .lower ()
118+ best_fuzzy_match = None
119+ highest_score = 0
120+
121+ exact_match_found = False
122+ for user in users_list :
123+ if user ["login" ].lower () == leader_lower or (
124+ user ["name" ] and user ["name" ].lower () == leader_lower
125+ ):
126+ matched_users .append (user )
127+ exact_match_found = True
128+
129+ if exact_match_found :
110130 continue
111131
112- # Fuzzy matching with token_sort_ratio
113- matches = [
114- u
115- for u in user_list
116- if (fuzz .token_sort_ratio (leader_lower , u ["login" ].lower ()) >= threshold )
117- or (
118- u ["name" ]
119- and fuzz .token_sort_ratio (leader_lower , u ["name" ].lower ()) >= threshold
120- )
121- ]
132+ for user in users_list :
133+ score = fuzz .token_sort_ratio (leader_lower , user ["login" ].lower ())
134+ if user ["name" ]:
135+ score = max (score , fuzz .token_sort_ratio (leader_lower , user ["name" ].lower ()))
136+
137+ if score > highest_score :
138+ highest_score = score
139+ best_fuzzy_match = user
122140
123- new_fuzzy_matches = [m for m in matches if m not in exact_matches ]
124- if new_fuzzy_matches :
125- fuzzy_matches .extend (new_fuzzy_matches )
126- for match in new_fuzzy_matches :
127- self .stdout .write (f"Fuzzy match found for { leader } : { match ['login' ]} " )
128- else :
129- unmatched_leaders .append (leader )
141+ if highest_score >= threshold :
142+ matched_users .append (best_fuzzy_match )
130143
131- return exact_matches , fuzzy_matches , unmatched_leaders
144+ return list ({ user [ "id" ]: user for user in matched_users }. values ())
0 commit comments