Skip to content

Commit

Permalink
Merge pull request #234 from twitter/jbaxter/2024_06_27
Browse files Browse the repository at this point in the history
Multiple behind-the-scenes changes (split scorer into smaller/faster chunks; versions++ & typing improvements).
  • Loading branch information
jbaxter authored Jun 27, 2024
2 parents a12d407 + 8b41a59 commit 1b9281a
Show file tree
Hide file tree
Showing 23 changed files with 1,408 additions and 216 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ $ cd sourcecode
$ python main.py
```

Most versions of Python3 should work, but we have tested the code with Python 3.8.
Multiple versions of Python3 should work, but we have tested the code with Python 3.10.

### Community Notes data

Expand Down
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
numpy==1.26.2
pandas==2.1.4
numpy==1.26.4
pandas==2.2.2
torch==2.1.2
scipy==1.11.4
scikit-learn>=1.3.0
scikit-learn==1.3.0
pyarrow
98 changes: 66 additions & 32 deletions sourcecode/scoring/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@
tagPercentileForNormalization = 40
intervalHalfWidth = 0.3

# Max flip rates
prescoringAllUnlockedNotesMaxCrhChurn = 0.04
finalUnlockedNotesWithNoNewRatingsMaxCrhChurn = 0.03
finalNotesWithNewRatingsMaxCrhChurn = 0.40

# Data Filenames
scoredNotesOutputPath = "scoredNotes.tsv"
enrollmentInputPath = "userEnrollment-00000.tsv"
Expand All @@ -51,6 +56,7 @@
modelingPopulationKey = "modelingPopulation"
modelingGroupKey = "modelingGroup"
numberOfTimesEarnedOutKey = "numberOfTimesEarnedOut"
defaultIndexKey = "index"

# TSV Values
notHelpfulValueTsv = "NOT_HELPFUL"
Expand Down Expand Up @@ -237,7 +243,7 @@ def rater_factor_key(i):
(1, "helpfulUnbiasedLanguage"),
]
helpfulTagsTSVOrder = [tag for (tiebreakOrder, tag) in helpfulTagsAndTieBreakOrder]
helpfulTagsAndTypesTSVOrder = [(tag, np.int64) for tag in helpfulTagsTSVOrder]
helpfulTagsAndTypesTSVOrder = [(tag, np.int8) for tag in helpfulTagsTSVOrder]
helpfulTagsTiebreakOrder = [tag for (tiebreakOrder, tag) in sorted(helpfulTagsAndTieBreakOrder)]

# NOTE: Always add new tags to the end of this list, and *never* change the order of
Expand Down Expand Up @@ -275,7 +281,7 @@ def rater_factor_key(i):
(6, notHelpfulNoteNotNeededKey),
]
notHelpfulTagsTSVOrder = [tag for (tiebreakOrder, tag) in notHelpfulTagsAndTieBreakOrder]
notHelpfulTagsAndTypesTSVOrder = [(tag, np.int64) for tag in notHelpfulTagsTSVOrder]
notHelpfulTagsAndTypesTSVOrder = [(tag, np.int8) for tag in notHelpfulTagsTSVOrder]
notHelpfulTagsTiebreakOrder = [
tag for (tiebreakOrder, tag) in sorted(notHelpfulTagsAndTieBreakOrder)
]
Expand All @@ -287,10 +293,16 @@ def rater_factor_key(i):
}
adjustedSuffix = "Adjusted"
notHelpfulTagsAdjustedColumns = [f"{column}{adjustedSuffix}" for column in notHelpfulTagsTSVOrder]
notHelpfulTagsAdjustedTSVColumnsAndTypes = [
(tag, np.double) for tag in notHelpfulTagsAdjustedColumns
]
ratioSuffix = "Ratio"
notHelpfulTagsAdjustedRatioColumns = [
f"{column}{ratioSuffix}" for column in notHelpfulTagsAdjustedColumns
]
notHelpfulTagsAdjustedRatioTSVColumnsAndTypes = [
(tag, np.double) for tag in notHelpfulTagsAdjustedRatioColumns
]
ratingWeightKey = "ratingWeight"

incorrectTagRatingsMadeByRaterKey = "incorrectTagRatingsMadeByRater"
Expand Down Expand Up @@ -325,13 +337,14 @@ def rater_factor_key(i):
lowDiligenceRaterInterceptRound2Key = "lowDiligenceRaterInterceptRound2"
internalRaterInterceptRound2Key = "internalRaterInterceptRound2"

incorrectFilterColumns = [
notHelpfulIncorrectIntervalKey,
sumOfIncorrectTagRateByRaterIntervalKey,
numVotersIntervalKey,
noteTfIdfIncorrectScoreIntervalKey,
lowDiligenceLegacyNoteInterceptKey,
incorrectFilterColumnsAndTypes = [
(notHelpfulIncorrectIntervalKey, np.double),
(sumOfIncorrectTagRateByRaterIntervalKey, np.double),
(numVotersIntervalKey, np.double),
(noteTfIdfIncorrectScoreIntervalKey, np.double),
(lowDiligenceLegacyNoteInterceptKey, np.double),
]
incorrectFilterColumns = [col for (col, _) in incorrectFilterColumnsAndTypes]

misleadingTags = [
"misleadingOther",
Expand Down Expand Up @@ -386,7 +399,7 @@ def rater_factor_key(i):
(disagreeKey, np.int64),
(helpfulKey, np.int64),
(notHelpfulKey, np.int64),
(helpfulnessLevelKey, object),
(helpfulnessLevelKey, "category"),
]
+ helpfulTagsAndTypesTSVOrder
+ notHelpfulTagsAndTypesTSVOrder
Expand Down Expand Up @@ -429,7 +442,7 @@ def rater_factor_key(i):
(currentExpansionStatusKey, object),
(currentGroupStatusKey, object),
(currentDecidedByKey, object),
(currentModelingGroupKey, object),
(currentModelingGroupKey, np.double), # TODO: int
]
noteStatusHistoryTSVColumns = [col for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
noteStatusHistoryTSVTypes = [dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
Expand All @@ -450,6 +463,7 @@ def rater_factor_key(i):
earnedOutNoAcknowledge = "earnedOutNoAcknowledge"
earnedOutAcknowledged = "earnedOutAcknowledged"
newUser = "newUser"
removed = "removed"
isAtRiskCRNHCount = 2
ratingImpactForEarnIn = 5
ratingImpact = "ratingImpact"
Expand All @@ -459,6 +473,7 @@ def rater_factor_key(i):
earnedOutNoAcknowledge: 2,
earnedOutAcknowledged: 3,
newUser: 4,
removed: 5,
}
emergingWriterDays = 28
isEmergingWriterKey = "isEmergingWriter"
Expand Down Expand Up @@ -522,25 +537,29 @@ def rater_factor_key(i):
col: dtype for (col, dtype) in noteParameterUncertaintyTSVColumnsAndTypes
}

auxiliaryScoredNotesTSVColumns = (
auxiliaryScoredNotesTSVColumnsAndTypes = (
[
noteIdKey,
ratingWeightKey,
createdAtMillisKey,
noteAuthorParticipantIdKey,
awaitingMoreRatingsBoolKey,
numRatingsLast28DaysKey,
currentLabelKey,
currentlyRatedHelpfulBoolKey,
currentlyRatedNotHelpfulBoolKey,
unlockedRatingStatusKey,
(noteIdKey, np.int64),
(ratingWeightKey, np.double),
(createdAtMillisKey, np.int64),
(noteAuthorParticipantIdKey, object),
(awaitingMoreRatingsBoolKey, np.int8),
(numRatingsLast28DaysKey, np.int64),
(currentLabelKey, str),
(currentlyRatedHelpfulBoolKey, np.int8),
(currentlyRatedNotHelpfulBoolKey, np.int8),
(unlockedRatingStatusKey, str),
]
+ helpfulTagsTSVOrder
+ notHelpfulTagsTSVOrder
+ notHelpfulTagsAdjustedColumns
+ notHelpfulTagsAdjustedRatioColumns
+ incorrectFilterColumns
+ helpfulTagsAndTypesTSVOrder
+ notHelpfulTagsAndTypesTSVOrder
+ notHelpfulTagsAdjustedTSVColumnsAndTypes
+ notHelpfulTagsAdjustedRatioTSVColumnsAndTypes
+ incorrectFilterColumnsAndTypes
)
auxiliaryScoredNotesTSVColumns = [col for (col, dtype) in auxiliaryScoredNotesTSVColumnsAndTypes]
auxiliaryScoredNotesTSVTypeMapping = {
col: dtype for (col, dtype) in auxiliaryScoredNotesTSVColumnsAndTypes
}

deprecatedNoteModelOutputColumns = frozenset(
{
Expand Down Expand Up @@ -610,7 +629,7 @@ def rater_factor_key(i):
(topicNoteFactor1Key, np.double),
(topicRatingStatusKey, str),
(noteTopicKey, str),
(topicNoteConfidentKey, str),
(topicNoteConfidentKey, pd.BooleanDtype()),
(expansionInternalActiveRulesKey, str),
(expansionPlusInternalActiveRulesKey, str),
(groupInternalActiveRulesKey, str),
Expand Down Expand Up @@ -638,10 +657,7 @@ def rater_factor_key(i):
(crhCrnhRatioDifferenceKey, np.double),
(meanNoteScoreKey, np.double),
(raterAgreeRatioKey, np.double),
(
aboveHelpfulnessThresholdKey,
"boolean",
), # nullable bool https://pandas.pydata.org/docs/user_guide/boolean.html
(aboveHelpfulnessThresholdKey, pd.BooleanDtype()),
(scorerNameKey, str),
(internalRaterReputationKey, np.double),
(lowDiligenceRaterInterceptKey, np.double),
Expand Down Expand Up @@ -681,7 +697,7 @@ def rater_factor_key(i):
(successfulRatingNeededToEarnIn, pd.Int64Dtype()),
(authorTopNotHelpfulTagValues, str),
(timestampOfLastStateChange, np.double),
(aboveHelpfulnessThresholdKey, np.float64), # nullable bool
(aboveHelpfulnessThresholdKey, np.float64), # nullable bool.
(isEmergingWriterKey, pd.BooleanDtype()),
(aggregateRatingReceivedTotal, pd.Int64Dtype()),
(timestampOfLastEarnOut, np.double),
Expand Down Expand Up @@ -731,6 +747,17 @@ def rater_factor_key(i):
col: dtype for (col, dtype) in noteStatusChangeTSVColumnsAndTypes
}

datasetKeyKey = "datasetKey"
partitionToReadKey = "partitionToRead"
fileNameToReadKey = "fileNameToRead"
inputPathsTSVColumnsAndTypes = [
(datasetKeyKey, str),
(partitionToReadKey, str),
(fileNameToReadKey, str),
]
inputPathsTSVColumns = [col for (col, _) in inputPathsTSVColumnsAndTypes]
inputPathsTSVTypeMapping = {col: dtype for (col, dtype) in inputPathsTSVColumnsAndTypes}


@contextmanager
def time_block(label):
Expand Down Expand Up @@ -830,3 +857,10 @@ class ModelResult:
auxiliaryNoteInfo: pd.DataFrame
scorerName: Optional[str]
metaScores: Optional[PrescoringMetaScorerOutput]


@dataclass
class NoteSubset:
  """Plain data record pairing a subset of notes with a churn limit and a label.

  The class defines no behavior of its own; it only groups three values that
  travel together.
  """

  # Members of the subset, or None.
  # NOTE(review): presumably a set of note IDs, with None meaning "no explicit
  # restriction" -- confirm against the code that constructs NoteSubset.
  noteSet: Optional[set]
  # Maximum CRH churn rate associated with this subset.
  # NOTE(review): presumably populated from the "Max flip rates" constants
  # (e.g. prescoringAllUnlockedNotesMaxCrhChurn) -- confirm against callers.
  maxCrhChurnRate: float
  # Free-form text describing the subset.
  description: str
Loading

0 comments on commit 1b9281a

Please sign in to comment.