From 3e84d345bcc1e664047ad659026c0c0b037a10a8 Mon Sep 17 00:00:00 2001
From: RachanaVenati <168229772+RachanaVenati@users.noreply.github.com>
Date: Wed, 12 Jun 2024 13:36:06 +0000
Subject: [PATCH] tokenize words instead of sentences

---
 summarization/run copy 2.py | 176 ++++++++++++++++++++++++++++++++++++
 summarization/run.py        |  54 +++++------
 2 files changed, 199 insertions(+), 31 deletions(-)
 create mode 100644 summarization/run copy 2.py

diff --git a/summarization/run copy 2.py b/summarization/run copy 2.py
new file mode 100644
index 0000000..867e92c
--- /dev/null
+++ b/summarization/run copy 2.py
@@ -0,0 +1,176 @@
+from pathlib import Path
+from tira.rest_api_client import Client
+from tira.third_party_integrations import get_output_directory
+import pandas as pd
+from sklearn.metrics.pairwise import cosine_similarity
+import nltk
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.corpus import stopwords
+import string
+from transformers import BertTokenizer, BertModel
+import torch
+
+nltk.download('punkt')
+nltk.download('stopwords')
+
+# Initialize BERT model and tokenizer
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+model = BertModel.from_pretrained('bert-base-uncased')
+
+def preprocess_text(text):
+    # Convert text to lowercase
+    text = text.lower()
+
+    # Remove punctuation
+    text = text.translate(str.maketrans('', '', string.punctuation))
+
+    # Tokenize text into words
+    words = word_tokenize(text)
+
+    # Remove stop words
+    stop_words = set(stopwords.words('english'))
+    filtered_words = [word for word in words if word not in stop_words]
+
+    # Join filtered words back into text
+    processed_text = " ".join(filtered_words)
+
+    return processed_text
+
+def get_sentence_embeddings(sentences):
+    # Encode sentences
+    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
+
+    # Get BERT embeddings
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Use the embeddings from the last hidden layer
+    embeddings = outputs.last_hidden_state.mean(dim=1)
+
+    return embeddings
+
+def extractive_summarization(text, num_sentences=2):
+    # Preprocess the text
+    processed_text = preprocess_text(text)
+
+    # Check if the preprocessed text is empty
+    if not processed_text:
+        return ""
+
+    # Tokenize the preprocessed text into sentences
+    sentences = sent_tokenize(processed_text)
+
+    # Ensure there are enough sentences to summarize
+    if len(sentences) <= num_sentences:
+        return " ".join(sentences)
+
+    # Get embeddings for each sentence
+    sentence_embeddings = get_sentence_embeddings(sentences)
+
+    # Get embeddings for the entire document
+    document_embedding = get_sentence_embeddings([processed_text])
+
+    # Calculate cosine similarity between each sentence and the document
+    similarities = cosine_similarity(sentence_embeddings, document_embedding).flatten()
+
+    # Rank sentences based on their similarity scores
+    ranked_sentences = [sentence for sentence, similarity in sorted(zip(sentences, similarities), key=lambda x: x[1], reverse=True)]
+
+    # Select the top-ranked sentences for the summary
+    summary = " ".join(ranked_sentences[:num_sentences])
+
+    return summary
+
+if __name__ == "__main__":
+    # Load the data
+    tira = Client()
+    df = tira.pd.inputs(
+        "nlpbuw-fsu-sose-24", "summarization-validation-20240530-training"
+    ).set_index("id")
+    print("df size is : ", df.size)
+
+    # Apply extractive summarization with BERT embeddings to the records
+    df["summary"] = df["story"].apply(lambda x: extractive_summarization(x, num_sentences=2))
+    df = df.drop(columns=["story"]).reset_index()
+    print(df)
+
+    # Save the summarized predictions
+    output_directory = get_output_directory(str(Path(__file__).parent))
+    df.to_json(
+        Path(output_directory) / "predictions.jsonl", orient="records", lines=True
+    )
+from pathlib import Path
+from tira.rest_api_client import Client
+from tira.third_party_integrations import get_output_directory
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+import nltk
+nltk.download('punkt')
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.corpus import stopwords
+import string
+
+def preprocess_text(text):
+    # Convert text to lowercase
+    text = text.lower()
+
+    # Remove punctuation
+    # text = text.translate(str.maketrans('', '', string.punctuation))
+
+    # Tokenize text into words
+    # words = word_tokenize(text)
+
+    # Remove stop words
+    # stop_words = set(stopwords.words('english'))
+    # filtered_words = [word for word in words if word not in stop_words]
+
+    # Join filtered words back into text
+    processed_text = text
+
+    return processed_text
+
+def extractive_summarization(text, num_sentences=2):
+    # Preprocess the text
+    processed_text = preprocess_text(text)
+
+    # Check if the preprocessed text is empty
+    if not processed_text:
+        return ""
+
+    # Tokenize the preprocessed text into sentences
+    sentences = sent_tokenize(processed_text)
+
+    # Calculate TF-IDF scores for each sentence
+    vectorizer = TfidfVectorizer()
+    tfidf_matrix = vectorizer.fit_transform(sentences)
+
+    # Sum the TF-IDF scores for each sentence
+    sentence_scores = tfidf_matrix.sum(axis=1).flatten().tolist()[0]
+
+    # Rank sentences based on their scores
+    ranked_sentences = [sentence for sentence, score in sorted(zip(sentences, sentence_scores), key=lambda x: x[1], reverse=True)]
+
+    # Select the top-ranked sentences for the summary
+    summary = " ".join(ranked_sentences[:num_sentences])
+
+    return summary
+
+if __name__ == "__main__":
+    # Load the data
+    tira = Client()
+    df = tira.pd.inputs(
+        "nlpbuw-fsu-sose-24", "summarization-validation-20240530-training"
+    ).set_index("id")
+    print("df size is : ",df.size)
+    # Select only the first 30 records
+    # df = df.head(30)
+    # print(df)
+    # Apply extractive summarization with text preprocessing to the first 30 records
+    df["summary"] = df["story"].apply(lambda x: extractive_summarization(x, num_sentences=2))
+    df = df.drop(columns=["story"]).reset_index()
+    print(df)
+    # Save the summarized predictions for the first 30 records
+    output_directory = get_output_directory(str(Path(__file__).parent))
+    df.to_json(
+        Path(output_directory) / "predictions.jsonl", orient="records", lines=True
+    )
diff --git a/summarization/run.py b/summarization/run.py
index afcbadc..2693f41 100644
--- a/summarization/run.py
+++ b/summarization/run.py
@@ -2,20 +2,13 @@
 from tira.rest_api_client import Client
 from tira.third_party_integrations import get_output_directory
 import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import nltk
+nltk.download('punkt')
 from nltk.tokenize import sent_tokenize, word_tokenize
 from nltk.corpus import stopwords
 import string
-from transformers import BertTokenizer, BertModel
-import torch
-
-nltk.download('punkt')
-nltk.download('stopwords')
-
-# Initialize BERT model and tokenizer
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-model = BertModel.from_pretrained('bert-base-uncased')
 
 def preprocess_text(text):
     # Convert text to lowercase
@@ -36,19 +29,6 @@ def preprocess_text(text):
 
     return processed_text
 
-def get_sentence_embeddings(sentences):
-    # Encode sentences
-    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
-
-    # Get BERT embeddings
-    with torch.no_grad():
-        outputs = model(**inputs)
-
-    # Use the embeddings from the last hidden layer
-    embeddings = outputs.last_hidden_state.mean(dim=1)
-
-    return embeddings
-
 def extractive_summarization(text, num_sentences=2):
     # Preprocess the text
     processed_text = preprocess_text(text)
@@ -64,17 +44,29 @@ def extractive_summarization(text, num_sentences=2):
     if len(sentences) <= num_sentences:
         return " ".join(sentences)
 
-    # Get embeddings for each sentence
-    sentence_embeddings = get_sentence_embeddings(sentences)
+    # Tokenize sentences into words
+    words_in_sentences = [word_tokenize(sentence) for sentence in sentences]
+
+    # Calculate TF-IDF scores for words
+    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
+    tfidf_matrix = vectorizer.fit_transform(sentences)
+    feature_names = vectorizer.get_feature_names_out()
 
-    # Get embeddings for the entire document
-    document_embedding = get_sentence_embeddings([processed_text])
+    # Calculate word scores: take each term's highest TF-IDF weight across all sentences
+    word_scores = {}
+    for col in tfidf_matrix.nonzero()[1]:
+        word = feature_names[col]
+        score = tfidf_matrix[:, col].max()
+        word_scores[word] = score
 
-    # Calculate cosine similarity between each sentence and the document
-    similarities = cosine_similarity(sentence_embeddings, document_embedding).flatten()
+    # Score sentences based on word scores
+    sentence_scores = []
+    for sentence in words_in_sentences:
+        sentence_score = sum(word_scores.get(word, 0) for word in sentence)
+        sentence_scores.append(sentence_score)
 
-    # Rank sentences based on their similarity scores
-    ranked_sentences = [sentence for sentence, similarity in sorted(zip(sentences, similarities), key=lambda x: x[1], reverse=True)]
+    # Rank sentences based on their scores
+    ranked_sentences = [sentence for sentence, score in sorted(zip(sentences, sentence_scores), key=lambda x: x[1], reverse=True)]
 
     # Select the top-ranked sentences for the summary
     summary = " ".join(ranked_sentences[:num_sentences])
@@ -89,7 +81,7 @@ def extractive_summarization(text, num_sentences=2):
     ).set_index("id")
     print("df size is : ", df.size)
 
-    # Apply extractive summarization with BERT embeddings to the records
+    # Apply extractive summarization with text preprocessing to the records
     df["summary"] = df["story"].apply(lambda x: extractive_summarization(x, num_sentences=2))
     df = df.drop(columns=["story"]).reset_index()
     print(df)