"""Turn news headlines into a bag-of-words matrix for sarcasm detection.

Provenance: recovered from the git patch "sarcasm_detection updated file"
(commit 4283479bcd356f0b97d7ba5476690bec16ae20de, author Hafsa25,
Thu, 14 Mar 2019), which adds this script as the new file
``sarcasm_detection``.

When executed as a script the module:

1. reads the JSON-lines dataset at ``path`` and drops its ``article_link``
   column;
2. normalises every headline with :func:`clean_headline`, collecting the
   results in ``corpus``;
3. fits a ``CountVectorizer`` on ``corpus``.

After a run, ``dataset``, ``X`` (headlines), ``y`` (``is_sarcastic`` labels),
``corpus``, ``cv``, ``vector`` (the matrix returned by ``fit_transform``) and
``features`` (its ``toarray()`` form) are available as module globals.
"""

import re

path = "C:/News Headline/Sarcasm_Headlines_Dataset.json"

# Compiled once; matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile(r'[^a-zA-Z]')


def clean_headline(headline, stem, stop_words):
    """Return *headline* as a space-joined string of stemmed, kept words.

    Every character that is not an ASCII letter is replaced by a space (so
    digits and punctuation act as word separators), the text is lower-cased
    and split on whitespace, words found in *stop_words* are discarded, and
    each remaining word is passed through *stem*.

    Args:
        headline: Raw headline text.
        stem: Callable taking one lower-cased word and returning its stem,
            e.g. ``PorterStemmer().stem``.
        stop_words: Container of lower-cased words to drop. Pass a ``set`` or
            ``frozenset`` built once by the caller so each membership test
            is O(1).

    Returns:
        The cleaned headline; an empty string when no word survives.
    """
    words = _NON_LETTERS.sub(' ', headline).lower().split()
    # Stop words are filtered *before* stemming: the stop list holds
    # unstemmed forms, so the comparison must use the unstemmed word.
    return ' '.join(stem(word) for word in words if word not in stop_words)


if __name__ == "__main__":
    # Third-party imports live under the guard so that importing this module
    # (for instance to reuse or test ``clean_headline``) neither requires
    # these packages nor triggers the NLTK download below.
    import nltk
    import pandas as pd
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    from sklearn.feature_extraction.text import CountVectorizer

    # Make sure the stop-word corpus is available before it is read below.
    nltk.download('stopwords')

    # Only the read itself is guarded: the message is about the path, so it
    # must not be shown for unrelated failures further down the pipeline.
    try:
        dataset = pd.read_json(path, lines=True)
    except (OSError, ValueError) as err:
        print("the file path i does'nt contain a json file")
        print("check the path.")
        # Stop here: without a dataset there is no corpus to vectorise, and
        # continuing would only fail later with an unrelated NameError.
        raise SystemExit("could not read {!r}: {}".format(path, err))

    dataset = dataset.drop("article_link", axis=1)

    X = dataset.headline
    y = dataset.is_sarcastic

    # Loop invariants: build the stemmer and the stop-word set exactly once
    # instead of once per headline / once per word.
    ps = PorterStemmer()
    english_stop_words = frozenset(stopwords.words('english'))

    corpus = []
    print("The process will take time be patient!")
    # Iterate over the values directly; this does not depend on the Series
    # having a 0..n-1 integer index the way ``X[i]`` does.
    for i, headline in enumerate(X):
        corpus.append(clean_headline(headline, ps.stem, english_stop_words))
        print(i)

    cv = CountVectorizer()
    vector = cv.fit_transform(corpus)
    # Bound to a name so the dense matrix is actually usable afterwards
    # (the bare expression discarded it). NOTE(review): the dense form can
    # be very large for a big vocabulary; prefer ``vector`` where possible.
    features = vector.toarray()