Merge pull request #234 from twitter/jbaxter/2024_06_27

Multiple behind-the-scenes changes (split scorer into smaller/faster chunks; versions++ & typing improvements).
twitter · Jun 27, 2024 · 1b9281a
2 parents a12d407 + 8b41a59
commit 1b9281a
Show file tree

Hide file tree

Showing 23 changed files with 1,408 additions and 216 deletions.
diff --git a/README.md b/README.md
@@ -65,7 +65,7 @@ $ cd sourcecode
 $ python main.py
 ```
 
-Most versions of Python3 should work, but we have tested the code with Python 3.8.
+Multiple versions of Python3 should work, but we have tested the code with Python 3.10.
 
 ### Community Notes data
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
-numpy==1.26.2
-pandas==2.1.4
+numpy==1.26.4
+pandas==2.2.2
 torch==2.1.2
 scipy==1.11.4
-scikit-learn>=1.3.0
+scikit-learn==1.3.0
 pyarrow
diff --git a/sourcecode/scoring/constants.py b/sourcecode/scoring/constants.py
@@ -32,6 +32,11 @@
 tagPercentileForNormalization = 40
 intervalHalfWidth = 0.3
 
+# Max flip rates
+prescoringAllUnlockedNotesMaxCrhChurn = 0.04
+finalUnlockedNotesWithNoNewRatingsMaxCrhChurn = 0.03
+finalNotesWithNewRatingsMaxCrhChurn = 0.40
+
 # Data Filenames
 scoredNotesOutputPath = "scoredNotes.tsv"
 enrollmentInputPath = "userEnrollment-00000.tsv"
@@ -51,6 +56,7 @@
 modelingPopulationKey = "modelingPopulation"
 modelingGroupKey = "modelingGroup"
 numberOfTimesEarnedOutKey = "numberOfTimesEarnedOut"
+defaultIndexKey = "index"
 
 # TSV Values
 notHelpfulValueTsv = "NOT_HELPFUL"
@@ -237,7 +243,7 @@ def rater_factor_key(i):
   (1, "helpfulUnbiasedLanguage"),
 ]
 helpfulTagsTSVOrder = [tag for (tiebreakOrder, tag) in helpfulTagsAndTieBreakOrder]
-helpfulTagsAndTypesTSVOrder = [(tag, np.int64) for tag in helpfulTagsTSVOrder]
+helpfulTagsAndTypesTSVOrder = [(tag, np.int8) for tag in helpfulTagsTSVOrder]
 helpfulTagsTiebreakOrder = [tag for (tiebreakOrder, tag) in sorted(helpfulTagsAndTieBreakOrder)]
 
 # NOTE: Always add new tags to the end of this list, and *never* change the order of
@@ -275,7 +281,7 @@ def rater_factor_key(i):
   (6, notHelpfulNoteNotNeededKey),
 ]
 notHelpfulTagsTSVOrder = [tag for (tiebreakOrder, tag) in notHelpfulTagsAndTieBreakOrder]
-notHelpfulTagsAndTypesTSVOrder = [(tag, np.int64) for tag in notHelpfulTagsTSVOrder]
+notHelpfulTagsAndTypesTSVOrder = [(tag, np.int8) for tag in notHelpfulTagsTSVOrder]
 notHelpfulTagsTiebreakOrder = [
   tag for (tiebreakOrder, tag) in sorted(notHelpfulTagsAndTieBreakOrder)
 ]
@@ -287,10 +293,16 @@ def rater_factor_key(i):
 }
 adjustedSuffix = "Adjusted"
 notHelpfulTagsAdjustedColumns = [f"{column}{adjustedSuffix}" for column in notHelpfulTagsTSVOrder]
+notHelpfulTagsAdjustedTSVColumnsAndTypes = [
+  (tag, np.double) for tag in notHelpfulTagsAdjustedColumns
+]
 ratioSuffix = "Ratio"
 notHelpfulTagsAdjustedRatioColumns = [
   f"{column}{ratioSuffix}" for column in notHelpfulTagsAdjustedColumns
 ]
+notHelpfulTagsAdjustedRatioTSVColumnsAndTypes = [
+  (tag, np.double) for tag in notHelpfulTagsAdjustedRatioColumns
+]
 ratingWeightKey = "ratingWeight"
 
 incorrectTagRatingsMadeByRaterKey = "incorrectTagRatingsMadeByRater"
@@ -325,13 +337,14 @@ def rater_factor_key(i):
 lowDiligenceRaterInterceptRound2Key = "lowDiligenceRaterInterceptRound2"
 internalRaterInterceptRound2Key = "internalRaterInterceptRound2"
 
-incorrectFilterColumns = [
-  notHelpfulIncorrectIntervalKey,
-  sumOfIncorrectTagRateByRaterIntervalKey,
-  numVotersIntervalKey,
-  noteTfIdfIncorrectScoreIntervalKey,
-  lowDiligenceLegacyNoteInterceptKey,
+incorrectFilterColumnsAndTypes = [
+  (notHelpfulIncorrectIntervalKey, np.double),
+  (sumOfIncorrectTagRateByRaterIntervalKey, np.double),
+  (numVotersIntervalKey, np.double),
+  (noteTfIdfIncorrectScoreIntervalKey, np.double),
+  (lowDiligenceLegacyNoteInterceptKey, np.double),
 ]
+incorrectFilterColumns = [col for (col, _) in incorrectFilterColumnsAndTypes]
 
 misleadingTags = [
   "misleadingOther",
@@ -386,7 +399,7 @@ def rater_factor_key(i):
     (disagreeKey, np.int64),
     (helpfulKey, np.int64),
     (notHelpfulKey, np.int64),
-    (helpfulnessLevelKey, object),
+    (helpfulnessLevelKey, "category"),
   ]
   + helpfulTagsAndTypesTSVOrder
   + notHelpfulTagsAndTypesTSVOrder
@@ -429,7 +442,7 @@ def rater_factor_key(i):
   (currentExpansionStatusKey, object),
   (currentGroupStatusKey, object),
   (currentDecidedByKey, object),
-  (currentModelingGroupKey, object),
+  (currentModelingGroupKey, np.double),  # TODO: int
 ]
 noteStatusHistoryTSVColumns = [col for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
 noteStatusHistoryTSVTypes = [dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
@@ -450,6 +463,7 @@ def rater_factor_key(i):
 earnedOutNoAcknowledge = "earnedOutNoAcknowledge"
 earnedOutAcknowledged = "earnedOutAcknowledged"
 newUser = "newUser"
+removed = "removed"
 isAtRiskCRNHCount = 2
 ratingImpactForEarnIn = 5
 ratingImpact = "ratingImpact"
@@ -459,6 +473,7 @@ def rater_factor_key(i):
   earnedOutNoAcknowledge: 2,
   earnedOutAcknowledged: 3,
   newUser: 4,
+  removed: 5,
 }
 emergingWriterDays = 28
 isEmergingWriterKey = "isEmergingWriter"
@@ -522,25 +537,29 @@ def rater_factor_key(i):
   col: dtype for (col, dtype) in noteParameterUncertaintyTSVColumnsAndTypes
 }
 
-auxiliaryScoredNotesTSVColumns = (
+auxiliaryScoredNotesTSVColumnsAndTypes = (
   [
-    noteIdKey,
-    ratingWeightKey,
-    createdAtMillisKey,
-    noteAuthorParticipantIdKey,
-    awaitingMoreRatingsBoolKey,
-    numRatingsLast28DaysKey,
-    currentLabelKey,
-    currentlyRatedHelpfulBoolKey,
-    currentlyRatedNotHelpfulBoolKey,
-    unlockedRatingStatusKey,
+    (noteIdKey, np.int64),
+    (ratingWeightKey, np.double),
+    (createdAtMillisKey, np.int64),
+    (noteAuthorParticipantIdKey, object),
+    (awaitingMoreRatingsBoolKey, np.int8),
+    (numRatingsLast28DaysKey, np.int64),
+    (currentLabelKey, str),
+    (currentlyRatedHelpfulBoolKey, np.int8),
+    (currentlyRatedNotHelpfulBoolKey, np.int8),
+    (unlockedRatingStatusKey, str),
   ]
-  + helpfulTagsTSVOrder
-  + notHelpfulTagsTSVOrder
-  + notHelpfulTagsAdjustedColumns
-  + notHelpfulTagsAdjustedRatioColumns
-  + incorrectFilterColumns
+  + helpfulTagsAndTypesTSVOrder
+  + notHelpfulTagsAndTypesTSVOrder
+  + notHelpfulTagsAdjustedTSVColumnsAndTypes
+  + notHelpfulTagsAdjustedRatioTSVColumnsAndTypes
+  + incorrectFilterColumnsAndTypes
 )
+auxiliaryScoredNotesTSVColumns = [col for (col, dtype) in auxiliaryScoredNotesTSVColumnsAndTypes]
+auxiliaryScoredNotesTSVTypeMapping = {
+  col: dtype for (col, dtype) in auxiliaryScoredNotesTSVColumnsAndTypes
+}
 
 deprecatedNoteModelOutputColumns = frozenset(
   {
@@ -610,7 +629,7 @@ def rater_factor_key(i):
   (topicNoteFactor1Key, np.double),
   (topicRatingStatusKey, str),
   (noteTopicKey, str),
-  (topicNoteConfidentKey, str),
+  (topicNoteConfidentKey, pd.BooleanDtype()),
   (expansionInternalActiveRulesKey, str),
   (expansionPlusInternalActiveRulesKey, str),
   (groupInternalActiveRulesKey, str),
@@ -638,10 +657,7 @@ def rater_factor_key(i):
   (crhCrnhRatioDifferenceKey, np.double),
   (meanNoteScoreKey, np.double),
   (raterAgreeRatioKey, np.double),
-  (
-    aboveHelpfulnessThresholdKey,
-    "boolean",
-  ),  # nullable bool https://pandas.pydata.org/docs/user_guide/boolean.html
+  (aboveHelpfulnessThresholdKey, pd.BooleanDtype()),
   (scorerNameKey, str),
   (internalRaterReputationKey, np.double),
   (lowDiligenceRaterInterceptKey, np.double),
@@ -681,7 +697,7 @@ def rater_factor_key(i):
   (successfulRatingNeededToEarnIn, pd.Int64Dtype()),
   (authorTopNotHelpfulTagValues, str),
   (timestampOfLastStateChange, np.double),
-  (aboveHelpfulnessThresholdKey, np.float64),  # nullable bool
+  (aboveHelpfulnessThresholdKey, np.float64),  # nullable bool.
   (isEmergingWriterKey, pd.BooleanDtype()),
   (aggregateRatingReceivedTotal, pd.Int64Dtype()),
   (timestampOfLastEarnOut, np.double),
@@ -731,6 +747,17 @@ def rater_factor_key(i):
   col: dtype for (col, dtype) in noteStatusChangeTSVColumnsAndTypes
 }
 
+datasetKeyKey = "datasetKey"
+partitionToReadKey = "partitionToRead"
+fileNameToReadKey = "fileNameToRead"
+inputPathsTSVColumnsAndTypes = [
+  (datasetKeyKey, str),
+  (partitionToReadKey, str),
+  (fileNameToReadKey, str),
+]
+inputPathsTSVColumns = [col for (col, _) in inputPathsTSVColumnsAndTypes]
+inputPathsTSVTypeMapping = {col: dtype for (col, dtype) in inputPathsTSVColumnsAndTypes}
+
 
 @contextmanager
 def time_block(label):
@@ -830,3 +857,10 @@ class ModelResult:
   auxiliaryNoteInfo: pd.DataFrame
   scorerName: Optional[str]
   metaScores: Optional[PrescoringMetaScorerOutput]
+
+
+@dataclass
+class NoteSubset:
+  noteSet: Optional[set]
+  maxCrhChurnRate: float
+  description: str