Api development #25

Merged (5 commits, Jul 28, 2024)
10 changes: 8 additions & 2 deletions api/configuration.py
@@ -10,15 +10,16 @@

# Set mode to testing to mock scoring function with random output
# between 0.0 and 1.0 and not load any LLMs. Set to production
-# to run real scoring function
-MODE = 'testing'
+# to run real scoring function.
+MODE = 'production'

# Get path to this config file so that we can define
# other paths relative to it
PROJECT_ROOT_PATH=os.path.dirname(os.path.realpath(__file__))

# Other project paths
LOG_PATH=f'{PROJECT_ROOT_PATH}/logs'
DATA_PATH=f'{PROJECT_ROOT_PATH}/data'

# Logging stuff
LOG_LEVEL='DEBUG'
@@ -42,6 +43,11 @@

CALCULATION_DEVICE='cuda:0'

PERPLEXITY_RATIO_KLD_KDE = f'{DATA_PATH}/perplexity_ratio_KLD_KDE.pkl'
TFIDF_LUT = f'{DATA_PATH}/TFIDF_lut.pkl'
TFIDF_SCORE_KLD_KDE = f'{DATA_PATH}/TFIDF_score_KLD_KDE.pkl'
XGBOOST_CLASSIFIER = f'{DATA_PATH}/XGBoost_classifier.pkl'

######################################################################
# NON-HF default model parameters ####################################
######################################################################
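The four new constants point at pickled artifacts that the scoring function loads at request time. For orientation, a caller could pull them all in along these lines (a sketch, not code from this PR; it assumes the classifier pipeline has already written the files under DATA_PATH):

import pickle

import api.configuration as config

def load_artifacts() -> dict:
    '''Loads the four pickled scoring artifacts defined in the config.
    Sketch only: assumes the files exist under DATA_PATH.'''

    paths = {
        'perplexity_ratio_kld_kde': config.PERPLEXITY_RATIO_KLD_KDE,
        'tfidf_lut': config.TFIDF_LUT,
        'tfidf_score_kld_kde': config.TFIDF_SCORE_KLD_KDE,
        'xgboost_classifier': config.XGBOOST_CLASSIFIER
    }

    artifacts = {}

    for name, path in paths.items():
        with open(path, 'rb') as input_file:
            artifacts[name] = pickle.load(input_file)

    return artifacts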
36 changes: 22 additions & 14 deletions api/functions/flask_app.py
@@ -21,10 +21,10 @@ def __call__(self, *args: object, **kwargs: object) -> object:
return self.run(*args, **kwargs)

# Create Celery app
-celery_app = Celery(app.name, task_cls=FlaskTask)
+celery_app = Celery(app.name, task_cls = FlaskTask)

# Add configuration from Flask app's Celery config. dict
celery_app.config_from_object(app.config["CELERY"])
celery_app.config_from_object(app.config['CELERY'])

# Configure logging
celery_app.log.setup(
@@ -35,7 +35,7 @@ def __call__(self, *args: object, **kwargs: object) -> object:

# Set as default and add to extensions
celery_app.set_default()
app.extensions["celery"] = celery_app
app.extensions['celery'] = celery_app

return celery_app

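Note that config_from_object(app.config['CELERY']) expects the Flask app to carry its Celery settings under a 'CELERY' key. That setup happens outside this diff; a plausible shape, with the Redis URLs as stand-in assumptions, is:

from flask import Flask

app = Flask(__name__)

# Hypothetical broker/backend; any Celery-supported URLs work here
app.config.from_mapping(
    CELERY = dict(
        broker_url = 'redis://localhost:6379',
        result_backend = 'redis://localhost:6379',
        task_ignore_result = False
    )
)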
@@ -80,6 +80,13 @@ def score_text(suspect_string: str) -> str:
        # Mock the score with a random float
        score = [random.uniform(0, 1)]

        # Threshold the score
        if score[0] >= 0.5:
            call = 'human'

        elif score[0] < 0.5:
            call = 'synthetic'

    elif config.MODE == 'production':

        # Call the scoring function
@@ -89,15 +96,16 @@
            suspect_string
        )

-    # Threshold the score
-    if score[0] >= 0.5:
-        call = 'human'
-
-    elif score[0] < 0.5:
-        call = 'synthetic'
+        if score[0] == 0:
+            call = 'human'
+
+        elif score[0] == 1:
+            call = 'synthetic'

+    reply = f'Text is likely {call}.'

    # Return the result from the output queue
-    return {'author_call': call, 'text': suspect_string}
+    return {'author_call': reply, 'text': suspect_string}

# Set listener for text strings via POST
@app.post('/submit_text')
@@ -112,9 +120,9 @@ def submit_text() -> dict:
# Submit the text for scoring
result = score_text.delay(text_string)

return {"result_id": result.id}
return {'result_id': result.id}

@app.get("/result/<result_id>")
@app.get('/result/<result_id>')
def task_result(result_id: str) -> dict:
'''Gets result by result id. Returns dictionary
with task status'''
@@ -124,9 +132,9 @@ def task_result(result_id: str) -> dict:

# Return status and result if ready
return {
"ready": result.ready(),
"successful": result.successful(),
"value": result.result if result.ready() else None,
'ready': result.ready(),
'successful': result.successful(),
'value': result.result if result.ready() else None,
}

return app
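End to end, a client POSTs text to /submit_text, gets back a result_id, and polls /result/<result_id> until ready is true. A minimal sketch with requests; the host, port, and the JSON field name for the submitted text are assumptions, since the request-parsing lines are folded out of this diff:

import time
import requests

API_URL = 'http://localhost:5000'  # Assumed host and port

# Submit a string for scoring ('text' is a guessed field name)
response = requests.post(f'{API_URL}/submit_text', json = {'text': 'Some text to score.'})
result_id = response.json()['result_id']

# Poll until the Celery task finishes
while True:
    result = requests.get(f'{API_URL}/result/{result_id}').json()

    if result['ready']:
        print(result['value'])
        break

    time.sleep(1)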
43 changes: 43 additions & 0 deletions api/functions/helper.py
@@ -2,8 +2,10 @@

from __future__ import annotations
from typing import Callable

import os
import glob
import re
import logging
from threading import Thread
from logging.handlers import RotatingFileHandler
Expand Down Expand Up @@ -100,3 +102,44 @@ def start_flask(flask_app: Callable, logger: Callable):

# Start the flask app thread
flask_app_thread.start()

def clean_text(text: str = None, sw = None, lemmatizer = None) -> str:
    '''Cleans up text string for TF-IDF'''

    # Lowercase everything
    text = text.lower()

    # Remove URLs and html tags first, before the character filter
    # below strips the slashes and angle brackets they match on
    text = re.sub(r'http\S+', '', text)

    html = re.compile(r'<.*?>')
    text = html.sub(r'', text)

    # Replace everything with space except (a-z, A-Z, ".", "?", "!", ",")
    text = re.sub(r'[^a-zA-Z?.!,¿]+', ' ', text)

    # Remove punctuation
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'

    for p in punctuations:
        text = text.replace(p, '')

    # Remove stopwords, then lemmatize what is left
    text = [word for word in text.split() if word not in sw]
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(text)

    # Remove emojis
    emoji_pattern = re.compile('['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
    ']+', flags = re.UNICODE)

    text = emoji_pattern.sub(r'', text)

    return text
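clean_text leaves the stopword list and lemmatizer to the caller, as scoring.py does below. A usage sketch; the nltk.download calls are one-time setup and an assumption about the environment:

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import api.functions.helper as helper_funcs

# One-time downloads of the required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

sw = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

cleaned = helper_funcs.clean_text(
    text = 'The <b>quick</b> brown foxes were running! https://example.com',
    sw = sw,
    lemmatizer = lemmatizer
)

# Stopwords drop 'the' and 'were'; the lemmatizer maps 'foxes' to 'fox'
print(cleaned)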
126 changes: 123 additions & 3 deletions api/functions/scoring.py
@@ -1,9 +1,14 @@
'''Collection of functions to score strings'''

from typing import Callable
import pickle
import numpy as np
import torch
import transformers
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import api.functions.helper as helper_funcs
import api.configuration as config

def score_string(
@@ -14,32 +19,147 @@ def score_string(

'''Takes a string, computes and returns llm detector score'''

# To run the XGBoost classifier, we need the following 9 features for
# this fragment:
feature_names = [
'Fragment length (tokens)',
'Perplexity',
'Cross-perplexity',
'Perplexity ratio score',
'Perplexity ratio Kullback-Leibler score',
'Human TF-IDF',
'Synthetic TF-IDF',
'TF-IDF score',
'TF-IDF Kullback-Leibler score'
]

# Empty holder for features
features = []

###############################################################
# Get perplexity, cross-perplexity and perplexity ratio score #
###############################################################

# Encode the string using the reader's tokenizer
encodings = reader_model.tokenizer(
string,
return_tensors = 'pt',
return_token_type_ids = False
).to(reader_model.device_map)

# Get the string length in tokens and add to features
fragment_length = encodings['input_ids'].shape[1]
features.append(fragment_length)

# Calculate logits
reader_logits = reader_model.model(**encodings).logits
writer_logits = writer_model.model(**encodings).logits

-# Calculate perplexity
+# Calculate perplexity and add to features
ppl = perplexity(encodings, writer_logits)
features.append(ppl[0])

-# Calculate cross perplexity
+# Calculate cross perplexity and add to features
x_ppl = entropy(
reader_logits.to(config.CALCULATION_DEVICE),
writer_logits.to(config.CALCULATION_DEVICE),
encodings.to(config.CALCULATION_DEVICE),
reader_model.tokenizer.pad_token_id
)

features.append(x_ppl[0])

# Calculate perplexity ratio and add to features
scores = ppl / x_ppl
scores = scores.tolist()
perplexity_ratio_score = scores[0]
features.append(perplexity_ratio_score)

###############################################################
# Get perplexity ratio Kullback-Leibler score #################
###############################################################

# Load the perplexity ratio Kullback-Leibler kernel density estimate
with open(config.PERPLEXITY_RATIO_KLD_KDE, 'rb') as input_file:
perplexity_ratio_kld_kde = pickle.load(input_file)

# Calculate perplexity ratio KLD score and add to features
perplexity_ratio_kld_score = perplexity_ratio_kld_kde.pdf(perplexity_ratio_score)
features.append(perplexity_ratio_kld_score[0])

###############################################################
# Get human and synthetic TF-IDFs and TF-IDF score ############
###############################################################

# Load the TF-IDF luts
with open(config.TFIDF_LUT, 'rb') as input_file:
tfidf_luts = pickle.load(input_file)

# Clean the text for TF-IDF scoring
sw = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

cleaned_string = helper_funcs.clean_text(
text = string,
sw = sw,
lemmatizer = lemmatizer
)

# Split cleaned string into words
words = cleaned_string.split(' ')

# Initialize TF-IDF sums
human_tfidf_sum = 0
synthetic_tfidf_sum = 0

# Score the words using the human and synthetic luts
for word in words:

if word in tfidf_luts['human'].keys():
human_tfidf_sum += tfidf_luts['human'][word]

if word in tfidf_luts['synthetic'].keys():
synthetic_tfidf_sum += tfidf_luts['synthetic'][word]

# Get the means and add to features
human_tfidf_mean = human_tfidf_sum / len(words)
synthetic_tfidf_mean = synthetic_tfidf_sum / len(words)
dmean_tfidf = human_tfidf_mean - synthetic_tfidf_mean
product_normalized_dmean_tfidf = dmean_tfidf * (human_tfidf_mean + synthetic_tfidf_mean)

features.append(human_tfidf_mean)
features.append(synthetic_tfidf_mean)
features.append(product_normalized_dmean_tfidf)

###############################################################
# Get TF-IDF Kullback-Leibler score ###########################
###############################################################

# Load the TF_IDF Kullback-Leibler kernel density estimate
with open(config.TFIDF_SCORE_KLD_KDE, 'rb') as input_file:
tfidf_kld_kde = pickle.load(input_file)

# Calculate TF-IDF KLD score and add to features
tfidf_kld_score = tfidf_kld_kde.pdf(product_normalized_dmean_tfidf)
features.append(tfidf_kld_score[0])

print('Features complete:')

for feature_name, feature_value in zip(feature_names, features):
print(f'{feature_name}: {feature_value}')

###############################################################
# Run inference with the classifier ###########################
###############################################################

# Load the model
with open(config.XGBOOST_CLASSIFIER, 'rb') as input_file:
model = pickle.load(input_file)

# Make prediction
prediction = model.predict([features])

-return scores
+return prediction

# Take some care with '.sum(1)).detach().cpu().float().numpy()'. Had
# errors as cribbed from the above repo. Order matters? I don't know,
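Two reading notes on the above. The .pdf() calls imply the pickled KDEs expose a scipy gaussian_kde-style interface (an assumption; the training pipeline is not in this diff), and XGBoost's predict() takes a 2-D array, which is why the features list is wrapped in another list. A sketch of how such a KDE artifact could be produced offline; the training data here is invented for illustration:

import pickle

import numpy as np
from scipy.stats import gaussian_kde

# Hypothetical 1-D sample of perplexity ratio scores from a training set
perplexity_ratio_scores = np.random.normal(loc = 1.0, scale = 0.1, size = 1000)

# Fit the kernel density estimate; .pdf() is what score_string() calls
kde = gaussian_kde(perplexity_ratio_scores)

with open('perplexity_ratio_KLD_KDE.pkl', 'wb') as output_file:
    pickle.dump(kde, output_file)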
2 changes: 1 addition & 1 deletion classifier/__main__.py
@@ -111,7 +111,7 @@ def run(self):

if __name__ == '__main__':

-helper_funcs.force_after('AddPerplexityRatioKLDScore')
+helper_funcs.force_after('AddTFIDFScore')

luigi.build(
[
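The task list passed to luigi.build() is folded out of the diff, and force_after presumably invalidates tasks downstream of the named one so they re-run. For orientation only, the call likely resembles this sketch, with a hypothetical terminal task standing in for the real pipeline:

import luigi

class SomeTerminalTask(luigi.Task):
    '''Hypothetical stand-in for the pipeline's real terminal task'''

    def run(self):
        pass

luigi.build(
    [SomeTerminalTask()],
    local_scheduler = True  # Run without a central scheduler daemon
)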
1 change: 0 additions & 1 deletion classifier/functions/helper.py
@@ -225,7 +225,6 @@ def tfidf_score_text_fragments(data_chunk: pd.DataFrame, tfidf_luts: dict = None

human_tfidf_means.append(human_tfidf_mean)
synthetic_tfidf_means.append(synthetic_tfidf_mean)
-dmean_tfidfs.append(dmean_tfidf)
product_normalized_dmean_tfidfs.append(product_normalized_dmean_tfidf)

data_chunk['Human TF-IDF'] = human_tfidf_means
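The feature that survives this cleanup, product_normalized_dmean_tfidf, is the mean TF-IDF difference scaled by the mean TF-IDF sum, which reduces to a difference of squares. A worked check with made-up means:

human_tfidf_mean = 0.30
synthetic_tfidf_mean = 0.10

dmean_tfidf = human_tfidf_mean - synthetic_tfidf_mean
product_normalized_dmean_tfidf = dmean_tfidf * (human_tfidf_mean + synthetic_tfidf_mean)

# Same as human_tfidf_mean**2 - synthetic_tfidf_mean**2: 0.09 - 0.01 = 0.08
print(round(product_normalized_dmean_tfidf, 10))  # 0.08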
8 changes: 4 additions & 4 deletions telegram_bot/__main__.py
@@ -17,7 +17,8 @@ async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):


async def score_text(update: Update, context: ContextTypes.DEFAULT_TYPE):
-'''Sends user provided text to scoring function'''
+'''Sends user-provided text to the scoring function and
+sends the result back to the user.'''

# Get the logger
logger = logging.getLogger(f'telegram_bot.score_text')
@@ -31,15 +32,14 @@ async def score_text(update: Update, context: ContextTypes.DEFAULT_TYPE):

# Get the result, when ready
result = api_funcs.retreive_result(result_id = result_id)
-author_call = await result
-reply = f'Author is likley {author_call}'
+reply = await result

logger.info(f'Got user text: {text}')
logger.info(f'Result ID: {result_id}')
logger.info(f'Reply: {reply}')

await context.bot.send_message(
-chat_id=update.effective_chat.id, text=reply)
+chat_id = update.effective_chat.id, text=reply)


if __name__ == '__main__':
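The start() and score_text() handlers above get wired into the bot in the elided part of __main__. A minimal sketch of that wiring with python-telegram-bot's v20-style API; the token placeholder and the exact handler routing are assumptions:

from telegram.ext import ApplicationBuilder, CommandHandler, MessageHandler, filters

application = ApplicationBuilder().token('YOUR_BOT_TOKEN').build()

# Route /start to start() and any plain text message to score_text()
application.add_handler(CommandHandler('start', start))
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, score_text))

application.run_polling()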