Merge pull request #220 from twitter/jbaxter/2024_04_26

Freeze rater parameters in final scoring, turn on status locking, parquet output + more column output
twitter · Apr 26, 2024 · adbc126 · adbc126
2 parents 998fa4b + e02a7ec
commit adbc126
Show file tree

Hide file tree

Showing 12 changed files with 102 additions and 24 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -3,3 +3,4 @@ pandas==2.1.4
 torch==2.1.2
 scipy==1.11.4
 scikit-learn>=1.3.0
+pyarrow
diff --git a/sourcecode/scoring/constants.py b/sourcecode/scoring/constants.py
@@ -134,10 +134,12 @@ def rater_factor_key(i):
 expansionRatingStatusKey = "expansionRatingStatus"
 expansionNoteInterceptMaxKey = "expansionNoteInterceptMax"
 expansionNoteInterceptMinKey = "expansionNoteInterceptMin"
+expansionInternalActiveRulesKey = "expansionActiveRules"
 # ExpansionPlus Model
 expansionPlusNoteInterceptKey = "expansionPlusNoteIntercept"
 expansionPlusNoteFactor1Key = "expansionPlusNoteFactor1"
 expansionPlusRatingStatusKey = "expansionPlusRatingStatus"
+expansionPlusInternalActiveRulesKey = "expansionPlusActiveRules"
 # Coverage / Helpfulness Reputation Model
 coverageNoteInterceptKey = "coverageNoteIntercept"
 coverageNoteFactor1Key = "coverageNoteFactor1"
@@ -153,11 +155,13 @@ def rater_factor_key(i):
 groupNoteInterceptMinKey = "groupNoteInterceptMin"
 groupRaterInterceptKey = "groupRaterIntercept"
 groupRaterFactor1Key = "groupRaterFactor1"
+groupInternalActiveRulesKey = "groupActiveRules"
 # Topic Model
 topicNoteInterceptKey = "topicNoteIntercept"
 topicNoteFactor1Key = "topicNoteFactor1"
 topicRatingStatusKey = "topicRatingStatus"
 topicNoteConfidentKey = "topicNoteConfident"
+topicInternalActiveRulesKey = "topicActiveRules"
 # Harassment/Abuse Tag
 harassmentNoteInterceptKey = "harassmentNoteIntercept"
 harassmentNoteFactor1Key = "harassmentNoteFactor1"
@@ -558,6 +562,10 @@ def rater_factor_key(i):
   (topicRatingStatusKey, str),
   (noteTopicKey, str),
   (topicNoteConfidentKey, str),
+  (expansionInternalActiveRulesKey, str),
+  (expansionPlusInternalActiveRulesKey, str),
+  (groupInternalActiveRulesKey, str),
+  (topicInternalActiveRulesKey, str),
 ]
 noteModelOutputTSVColumns = [col for (col, dtype) in noteModelOutputTSVColumnsAndTypes]
 noteModelOutputTSVTypeMapping = {col: dtype for (col, dtype) in noteModelOutputTSVColumnsAndTypes}

diff --git a/sourcecode/scoring/matrix_factorization/matrix_factorization.py b/sourcecode/scoring/matrix_factorization/matrix_factorization.py
@@ -443,6 +443,7 @@ def run_mf(
     globalInterceptInit: Optional[float] = None,
     specificNoteId: Optional[int] = None,
     validatePercent: Optional[float] = None,
+    freezeRaterParameters: bool = False,
   ):
     """Train matrix factorization model.
 
@@ -466,6 +467,8 @@ def run_mf(
     self._create_mf_model(noteInit, userInit, globalInterceptInit)
     assert self.mf_model is not None
 
+    if freezeRaterParameters:
+      self.mf_model._freeze_parameters(set({"user"}))
     if specificNoteId is not None:
       self.mf_model.freeze_rater_and_global_parameters()
     self.prepare_features_and_labels(specificNoteId)

diff --git a/sourcecode/scoring/mf_base_scorer.py b/sourcecode/scoring/mf_base_scorer.py
@@ -570,6 +570,18 @@ def _prescore_notes_and_users(
       if self._saveIntermediateState:
         self.helpfulnessScores = helpfulnessScores
 
+      ## One extra final round!
+      # Filter ratings based on prev helpfulness scores
+      finalRoundRatings = helpfulness_scores.filter_ratings_by_helpfulness_scores(
+        ratingsForTraining, helpfulnessScores
+      )
+      # Run MF
+      noteParamsUnfiltered, raterParamsUnfiltered, globalBias = self._mfRanker.run_mf(
+        ratings=finalRoundRatings,
+        noteInit=noteParamsUnfiltered,
+        userInit=raterParamsUnfiltered,
+      )
+
     raterModelOutput = raterParamsUnfiltered.merge(
       helpfulnessScores[
         [
@@ -644,6 +656,8 @@ def _score_notes_and_users(
         ratings=finalRoundRatings,
         noteInit=prescoringNoteModelOutput,
         userInit=prescoringRaterModelOutput,
+        globalInterceptInit=0.17,
+        freezeRaterParameters=True,
       )
 
     if self._saveIntermediateState:

diff --git a/sourcecode/scoring/mf_expansion_plus_scorer.py b/sourcecode/scoring/mf_expansion_plus_scorer.py
@@ -37,6 +37,7 @@ def _get_note_col_mapping(self) -> Dict[str, str]:
       c.internalNoteInterceptKey: c.expansionPlusNoteInterceptKey,
       c.internalNoteFactor1Key: c.expansionPlusNoteFactor1Key,
       c.internalRatingStatusKey: c.expansionPlusRatingStatusKey,
+      c.internalActiveRulesKey: c.expansionPlusInternalActiveRulesKey,
     }
 
   def get_scored_notes_cols(self) -> List[str]:
@@ -46,6 +47,7 @@ def get_scored_notes_cols(self) -> List[str]:
       c.expansionPlusNoteInterceptKey,
       c.expansionPlusNoteFactor1Key,
       c.expansionPlusRatingStatusKey,
+      c.expansionPlusInternalActiveRulesKey,
     ]
 
   def get_helpfulness_scores_cols(self) -> List[str]:
@@ -60,7 +62,6 @@ def _get_dropped_note_cols(self) -> List[str]:
     """Returns a list of columns which should be excluded from scoredNotes and auxiliaryNoteInfo."""
     return super()._get_dropped_note_cols() + (
       [
-        c.internalActiveRulesKey,
         c.activeFilterTagsKey,
         c.ratingWeightKey,
         c.noteInterceptMinKey,

diff --git a/sourcecode/scoring/mf_expansion_scorer.py b/sourcecode/scoring/mf_expansion_scorer.py
@@ -43,6 +43,7 @@ def _get_note_col_mapping(self) -> Dict[str, str]:
       c.internalRatingStatusKey: c.expansionRatingStatusKey,
       c.noteInterceptMinKey: c.expansionNoteInterceptMinKey,
       c.noteInterceptMaxKey: c.expansionNoteInterceptMaxKey,
+      c.internalActiveRulesKey: c.expansionInternalActiveRulesKey,
     }
 
   def get_scored_notes_cols(self) -> List[str]:
@@ -54,6 +55,7 @@ def get_scored_notes_cols(self) -> List[str]:
       c.expansionRatingStatusKey,
       c.expansionNoteInterceptMinKey,
       c.expansionNoteInterceptMaxKey,
+      c.expansionInternalActiveRulesKey,
     ]
 
   def get_helpfulness_scores_cols(self) -> List[str]:
@@ -68,7 +70,6 @@ def _get_dropped_note_cols(self) -> List[str]:
     """Returns a list of columns which should be excluded from scoredNotes and auxiliaryNoteInfo."""
     return super()._get_dropped_note_cols() + (
       [
-        c.internalActiveRulesKey,
         c.activeFilterTagsKey,
         c.ratingWeightKey,
       ]

diff --git a/sourcecode/scoring/mf_group_scorer.py b/sourcecode/scoring/mf_group_scorer.py
@@ -44,6 +44,7 @@ def coalesce_group_models(
     c.groupNoteInterceptMaxKey,
     c.groupNoteInterceptMinKey,
     c.modelingGroupKey,
+    c.groupInternalActiveRulesKey,
   ]:
     scoredNotes = coalesce_columns(scoredNotes, col)
 
@@ -135,6 +136,7 @@ def __init__(
     self._groupRatingStatusKey = f"{c.groupRatingStatusKey}_{self._groupNumber}"
     self._groupNoteInterceptMaxKey = f"{c.groupNoteInterceptMaxKey}_{self._groupNumber}"
     self._groupNoteInterceptMinKey = f"{c.groupNoteInterceptMinKey}_{self._groupNumber}"
+    self._groupInternalActiveRulesKey = f"{c.groupInternalActiveRulesKey}_{self._groupNumber}"
     self._groupRaterInterceptKey = f"{c.groupRaterInterceptKey}_{self._groupNumber}"
     self._groupRaterFactor1Key = f"{c.groupRaterFactor1Key}_{self._groupNumber}"
     self._modelingGroupKey = f"{c.modelingGroupKey}_{self._groupNumber}"
@@ -151,6 +153,7 @@ def _get_note_col_mapping(self) -> Dict[str, str]:
       c.internalRatingStatusKey: self._groupRatingStatusKey,
       c.noteInterceptMinKey: self._groupNoteInterceptMinKey,
       c.noteInterceptMaxKey: self._groupNoteInterceptMaxKey,
+      c.internalActiveRulesKey: self._groupInternalActiveRulesKey,
     }
 
   def _get_user_col_mapping(self) -> Dict[str, str]:
@@ -169,6 +172,7 @@ def get_scored_notes_cols(self) -> List[str]:
       self._groupRatingStatusKey,
       self._groupNoteInterceptMaxKey,
       self._groupNoteInterceptMinKey,
+      self._groupInternalActiveRulesKey,
       self._modelingGroupKey,
     ]
 
@@ -189,7 +193,6 @@ def _get_dropped_note_cols(self) -> List[str]:
     """Returns a list of columns which should be excluded from scoredNotes and auxiliaryNoteInfo."""
     return super()._get_dropped_note_cols() + (
       [
-        c.internalActiveRulesKey,
         c.activeFilterTagsKey,
         c.ratingWeightKey,
       ]

diff --git a/sourcecode/scoring/mf_topic_scorer.py b/sourcecode/scoring/mf_topic_scorer.py
@@ -26,6 +26,7 @@ def coalesce_topic_models(scoredNotes: pd.DataFrame) -> pd.DataFrame:
     c.topicRatingStatusKey,
     c.topicNoteConfidentKey,
     c.noteTopicKey,
+    c.topicInternalActiveRulesKey,
   ]:
     scoredNotes = coalesce_columns(scoredNotes, col)
 
@@ -106,6 +107,7 @@ def __init__(
     self._topicNoteInterceptKey = f"{c.topicNoteInterceptKey}_{self._topicName}"
     self._topicNoteFactor1Key = f"{c.topicNoteFactor1Key}_{self._topicName}"
     self._topicRatingStatusKey = f"{c.topicRatingStatusKey}_{self._topicName}"
+    self._topicInternalActiveRulesKey = f"{c.topicInternalActiveRulesKey}_{self._topicName}"
     self._noteTopicKey = f"{c.noteTopicKey}_{self._topicName}"
     self._noteTopicConfidentKey = f"{c.topicNoteConfidentKey}_{self._topicName}"
 
@@ -118,6 +120,7 @@ def _get_note_col_mapping(self) -> Dict[str, str]:
       c.internalNoteInterceptKey: self._topicNoteInterceptKey,
       c.internalNoteFactor1Key: self._topicNoteFactor1Key,
       c.internalRatingStatusKey: self._topicRatingStatusKey,
+      c.internalActiveRulesKey: self._topicInternalActiveRulesKey,
     }
 
   def get_scored_notes_cols(self) -> List[str]:
@@ -129,6 +132,7 @@ def get_scored_notes_cols(self) -> List[str]:
       self._topicRatingStatusKey,
       self._noteTopicKey,
       self._noteTopicConfidentKey,
+      self._topicInternalActiveRulesKey,
     ]
 
   def get_helpfulness_scores_cols(self) -> List[str]:
@@ -143,7 +147,6 @@ def _get_dropped_note_cols(self) -> List[str]:
     """Returns a list of columns which should be excluded from scoredNotes and auxiliaryNoteInfo."""
     return super()._get_dropped_note_cols() + (
       [
-        c.internalActiveRulesKey,
         c.activeFilterTagsKey,
         c.ratingWeightKey,
         c.noteInterceptMinKey,

diff --git a/sourcecode/scoring/process_data.py b/sourcecode/scoring/process_data.py
@@ -421,15 +421,29 @@ def write_tsv_local(df: pd.DataFrame, path: str) -> None:
   Args:
     df: pd.DataFrame to write to disk.
     path: location of file on disk.
-
-  Returns:
-    None, because path is always None.
   """
 
   assert path is not None
   assert df.to_csv(path, index=False, header=True, sep="\t") is None
 
 
+def write_parquet_local(
+  df: pd.DataFrame, path: str, compression: str = "snappy", engine: str = "pyarrow"
+) -> None:
+  """Write DF as a parquet file stored to local disk. Compress with snappy
+  and use the pyarrow engine by default; both are configurable via arguments.
+
+  Args:
+    df: pd.DataFrame to write to disk.
+    path: location of file on disk.
+    compression: compression algorithm to use. Defaults to 'snappy'.
+    engine: engine to use. Defaults to 'pyarrow'.
+  """
+
+  assert path is not None
+  df.to_parquet(path, compression=compression, engine=engine)
+
+
 class CommunityNotesDataLoader(ABC):
   """Base class which local and prod data loaders extend.
 

diff --git a/sourcecode/scoring/run_scoring.py b/sourcecode/scoring/run_scoring.py
@@ -254,7 +254,9 @@ def _run_scorer_parallelizable(
         scoringArgs = _load_data_from_shared_memory_parallelizable(
           scoringArgsSharedMemory, scoringArgs
         )
-        print(f"{scorer.get_name()} run_scorer_parallelizable just finished loading data from shared memory.")
+        print(
+          f"{scorer.get_name()} run_scorer_parallelizable just finished loading data from shared memory."
+        )
       elif dataLoader is not None:
         print(
           f"{scorer.get_name()} run_scorer_parallelizable just started in parallel: loading data with dataLoader."
@@ -522,19 +524,25 @@ def meta_score(
       # MFExpansionPlusScorer will have the lowest priority.
       rules.append(
         scoring_rules.ApplyModelResult(
-          RuleID.EXPANSION_PLUS_MODEL, {RuleID.META_INITIAL_NMR}, c.expansionPlusRatingStatusKey
+          RuleID.EXPANSION_PLUS_MODEL,
+          {RuleID.META_INITIAL_NMR},
+          c.expansionPlusRatingStatusKey,
         )
       )
     if enabledScorers is None or Scorers.MFExpansionScorer in enabledScorers:
       rules.append(
         scoring_rules.ApplyModelResult(
-          RuleID.EXPANSION_MODEL, {RuleID.META_INITIAL_NMR}, c.expansionRatingStatusKey
+          RuleID.EXPANSION_MODEL,
+          {RuleID.META_INITIAL_NMR},
+          c.expansionRatingStatusKey,
         )
       )
     if enabledScorers is None or Scorers.MFCoreScorer in enabledScorers:
       rules.append(
         scoring_rules.ApplyModelResult(
-          RuleID.CORE_MODEL, {RuleID.META_INITIAL_NMR}, c.coreRatingStatusKey
+          RuleID.CORE_MODEL,
+          {RuleID.META_INITIAL_NMR},
+          c.coreRatingStatusKey,
         )
       )
     if enabledScorers is None or Scorers.MFGroupScorer in enabledScorers:

diff --git a/sourcecode/scoring/runner.py b/sourcecode/scoring/runner.py
@@ -3,7 +3,12 @@
 
 from . import constants as c
 from .enums import scorers_from_csv
-from .process_data import LocalDataLoader, write_prescoring_output, write_tsv_local
+from .process_data import (
+  LocalDataLoader,
+  write_parquet_local,
+  write_prescoring_output,
+  write_tsv_local,
+)
 from .run_scoring import run_scoring
 
 
@@ -84,6 +89,13 @@ def parse_args():
     dest="prescoring_delay_hours",
     help="Filter prescoring input to simulate delay in hours",
   )
+  parser.add_argument(
+    "--no-parquet",
+    help="Disable writing parquet files.",
+    default=False,
+    action="store_true",
+    dest="no_parquet",
+  )
 
   return parser.parse_args()
 
@@ -138,6 +150,12 @@ def prescoring_write_fn(notePath, raterPath):
   write_tsv_local(newStatus, os.path.join(args.outdir, "note_status_history.tsv"))
   write_tsv_local(auxNoteInfo, os.path.join(args.outdir, "aux_note_info.tsv"))
 
+  if not args.no_parquet:
+    write_parquet_local(scoredNotes, os.path.join(args.outdir, "scored_notes.parquet"))
+    write_parquet_local(helpfulnessScores, os.path.join(args.outdir, "helpfulness_scores.parquet"))
+    write_parquet_local(newStatus, os.path.join(args.outdir, "note_status_history.parquet"))
+    write_parquet_local(auxNoteInfo, os.path.join(args.outdir, "aux_note_info.parquet"))
+
 
 if __name__ == "__main__":
   main()