Dev => MAIN (FG subsystem and fgen main) #4

Merged: 19 commits, Apr 1, 2023
12 changes: 11 additions & 1 deletion email2faq/data/asset/sample_email_dataset_raw.csv
@@ -317,7 +317,17 @@ X-Folder: \Steven_Kean_Nov2001_1\Notes Folders\All documents
X-Origin: KEAN-S
X-FileName: skean.nsf

FYI. what are you upto? is everythimg ok? This is the bill to de-link the bonds from the DWR contracts (recall

FYI. what are you upto? is everythimg ok? What are the different types of laptops available?
What are the specifications of each type of laptop?
What is the battery life of each laptop model?
What is the warranty period of the laptops?
What is the cost of each laptop model?
Can you tell me the battery life for laptops?
What are all the different types of laptop available?
What is average battery life of laptops?
What will be good specs for a gaming latop?
This is the bill to de-link the bonds from the DWR contracts (recall
that the argument to end DA by the Gov's finance folks, the bond folks, and
Angelides, the Treasurer stems directly from this linkage).

22 changes: 20 additions & 2 deletions email2faq/data/dataloader.py
@@ -28,7 +28,6 @@ def __init__(self, dataset, batch_size=8):
self.dataset = dataset
self.batch_size = batch_size
self.X = self.dataset.preprocess()
print(len(self.X))
self.create_loaders()

def create_loaders(self):
@@ -42,4 +41,23 @@ def create_loaders(self):

data = TensorDataset(left, right)
sampler = SequentialSampler(data)
self.dataloader = DataLoader(data, sampler=sampler, batch_size=self.batch_size)
# %%
class FGDataLoader:
''' This class assumes that the input is a list of lists, where each list contains a cluster of similar sentences. '''

'''Not sure how to deal with batches here. '''

def __init__(self,dataset):
self.dataset = dataset
self.inputs = self.dataset.preprocess()
# self.create_loader()

def create_loader(self):
# TODO implement batching
pass
# sampler = SequentialSampler(self.inputs)
# self.dataloader = DataLoader(self.inputs, sampler=sampler)

#%%
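The `create_loader` TODO above is still open. Below is a minimal sketch of one way it could batch the variable-length tokenized clusters, assuming `self.inputs` is the list of `BatchEncoding` objects returned by `ClusterDataset.preprocess()`; the batch size, padding value, and collate helper are illustrative and not part of this PR.

import torch
from torch.utils.data import DataLoader

def create_loader(self, batch_size=4):
    """Sketch only: pad each batch of tokenized clusters to a common length."""
    def collate(batch):
        # Each item is a tokenizer output holding (1, seq_len) tensors; squeeze and pad.
        ids = torch.nn.utils.rnn.pad_sequence(
            [item["input_ids"].squeeze(0) for item in batch],
            batch_first=True, padding_value=0)  # 0 assumed as the pad id
        mask = torch.nn.utils.rnn.pad_sequence(
            [item["attention_mask"].squeeze(0) for item in batch],
            batch_first=True, padding_value=0)
        return {"input_ids": ids, "attention_mask": mask}

    self.dataloader = DataLoader(self.inputs, batch_size=batch_size, collate_fn=collate)

Iterating the resulting dataloader would then yield padded `input_ids`/`attention_mask` batches ready for a downstream summarization model.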
71 changes: 67 additions & 4 deletions email2faq/data/dataset.py
@@ -5,14 +5,17 @@
from data.utils.preprocess_utils_fgg import make_w2v_embeddings, split_and_zero_padding
from keras_preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer
from transformers import PegasusTokenizer
from itertools import product

#%%
# tokenizer = AutoTokenizer.from_pretrained("shahrukhx01/question-vs-statement-classifier")
# tokenizer.save_pretrained("./utils/tokenizers/")
#%%
TOKENIZER = AutoTokenizer.from_pretrained("./data/utils/tokenizers/")

#%%
# TOKENIZER = AutoTokenizer.from_pretrained("./data/utils/tokenizers/")
# FG_TOKENIZER = PegasusTokenizer.from_pretrained("./data/utils/summ_tokenizers/")
#%%
class EmailsDataset(Dataset):
"""
"""
@@ -22,7 +25,7 @@ def __init__(self, emails_file, email_field = "mails"):
self._preprocess_emails()

# transformer preprocess parameters
self.tokenizer = TOKENIZER
self.tokenizer = AutoTokenizer.from_pretrained("./data/utils/tokenizers/")
self.max_sequence_length = 512

def __len__(self):
@@ -95,6 +98,7 @@ def process_texts(self):

## create masks for train, val, test texts
masks = self._create_attention_mask(ids)
del self.tokenizer
return ids, masks

#%%
@@ -151,4 +155,63 @@ def __getitem__(self, idx):
# return len(self.embeded_pairwise_df)

# def __getitem__(self, idx):
# return [self.embeded_pairwise_df['left'][idx], self.embeded_pairwise_df['right'][idx]]
#%%

class ClusterDataset(Dataset):
"""
Assuming that the data is in the form of a list of lists
where each list contains a set of strings of similar sentences
"""

def __init__(self, query_clusters):
self.query_clusters = query_clusters
self.sentences = [sen for cluster in self.query_clusters for sen in cluster]
self.query_para_clusters = [("? ".join(cluster) + "?") for cluster in self.query_clusters]

self.tokenizer = PegasusTokenizer.from_pretrained("./data/utils/summ_tokenizers/")

def __len__(self):
"""
Total number of paragraphs present in the query_clusters
"""
return len(self.query_para_clusters)

def __getitem__(self, idx):
"""
This is for accessing a paragraph of similar questions in a cluster
"""
return self.query_para_clusters[idx]

def sen_item(self,idx):
"""
Returns the idx-th sentence in the dataset
"""
return self.sentences[idx]


def sen_len(self):
"""
Total number of sentences present in the query_clusters
"""
return len(self.sentences)

def preprocess(self):
inputs = [self.tokenizer(text, max_length=1024, padding=True, truncation=True, return_tensors="pt") for text in self.query_para_clusters]
return inputs

# #%%
# query_clusters = [ ["What are the different types of laptops available?",
# "What are the specifications of each type of laptop?",
# "What is the battery life of each laptop model?",
# "What is the warranty period of the laptops?",
# "What is the cost of each laptop model?"],
# [
# "What destinations are available for travel?",
# "What is the duration of the trips?",
# "What is the cost of the trips?",
# "What are the payment options available?",
# "Is there an option for installment payment?"]]
# dataset = ClusterDataset(query_clusters)
# inputs_t = dataset.preprocess()
# # %%
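The commented-out block above shows how a `ClusterDataset` is built from hand-written query clusters. The following is a hedged sketch of what a downstream consumer of `preprocess()` might look like; the Pegasus checkpoint name and generation settings are assumptions, since the summarization model itself is not part of this diff.

from transformers import PegasusForConditionalGeneration

query_clusters = [[
    "What are the different types of laptops available?",
    "What is the battery life of each laptop model?",
]]
dataset = ClusterDataset(query_clusters)
inputs_t = dataset.preprocess()

# Assumed checkpoint for illustration only; the PR does not pin a model.
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
for enc in inputs_t:
    summary_ids = model.generate(enc["input_ids"],
                                 attention_mask=enc["attention_mask"],
                                 max_length=64)
    # One candidate FAQ question per cluster of similar queries.
    print(dataset.tokenizer.decode(summary_ids[0], skip_special_tokens=True))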
3 changes: 0 additions & 3 deletions email2faq/data/utils/preprocess_utils_fgg.py
@@ -52,9 +52,6 @@ def split_and_zero_padding(df, max_seq_length):
# Zero padding
for dataset, side in itertools.product([X], ['left', 'right']):
dataset[side] = pad_sequences(dataset[side], padding='pre', truncating='post', maxlen=max_seq_length)

print(dataset['left'], dataset['right'])

return dataset

def make_w2v_embeddings(df, embedding_dim=300):
110 changes: 110 additions & 0 deletions email2faq/data/utils/summ_tokenizers/special_tokens_map.json
@@ -0,0 +1,110 @@
{
"additional_special_tokens": [
"<mask_1>",
"<unk_2>",
"<unk_3>",
"<unk_4>",
"<unk_5>",
"<unk_6>",
"<unk_7>",
"<unk_8>",
"<unk_9>",
"<unk_10>",
"<unk_11>",
"<unk_12>",
"<unk_13>",
"<unk_14>",
"<unk_15>",
"<unk_16>",
"<unk_17>",
"<unk_18>",
"<unk_19>",
"<unk_20>",
"<unk_21>",
"<unk_22>",
"<unk_23>",
"<unk_24>",
"<unk_25>",
"<unk_26>",
"<unk_27>",
"<unk_28>",
"<unk_29>",
"<unk_30>",
"<unk_31>",
"<unk_32>",
"<unk_33>",
"<unk_34>",
"<unk_35>",
"<unk_36>",
"<unk_37>",
"<unk_38>",
"<unk_39>",
"<unk_40>",
"<unk_41>",
"<unk_42>",
"<unk_43>",
"<unk_44>",
"<unk_45>",
"<unk_46>",
"<unk_47>",
"<unk_48>",
"<unk_49>",
"<unk_50>",
"<unk_51>",
"<unk_52>",
"<unk_53>",
"<unk_54>",
"<unk_55>",
"<unk_56>",
"<unk_57>",
"<unk_58>",
"<unk_59>",
"<unk_60>",
"<unk_61>",
"<unk_62>",
"<unk_63>",
"<unk_64>",
"<unk_65>",
"<unk_66>",
"<unk_67>",
"<unk_68>",
"<unk_69>",
"<unk_70>",
"<unk_71>",
"<unk_72>",
"<unk_73>",
"<unk_74>",
"<unk_75>",
"<unk_76>",
"<unk_77>",
"<unk_78>",
"<unk_79>",
"<unk_80>",
"<unk_81>",
"<unk_82>",
"<unk_83>",
"<unk_84>",
"<unk_85>",
"<unk_86>",
"<unk_87>",
"<unk_88>",
"<unk_89>",
"<unk_90>",
"<unk_91>",
"<unk_92>",
"<unk_93>",
"<unk_94>",
"<unk_95>",
"<unk_96>",
"<unk_97>",
"<unk_98>",
"<unk_99>",
"<unk_100>",
"<unk_101>",
"<unk_102>"
],
"eos_token": "</s>",
"mask_token": "<mask_2>",
"pad_token": "<pad>",
"unk_token": "<unk>"
}
Binary file added email2faq/data/utils/summ_tokenizers/spiece.model
Binary file not shown.
117 changes: 117 additions & 0 deletions email2faq/data/utils/summ_tokenizers/tokenizer_config.json
@@ -0,0 +1,117 @@
{
"additional_special_tokens": [
"<mask_1>",
"<unk_2>",
"<unk_3>",
"<unk_4>",
"<unk_5>",
"<unk_6>",
"<unk_7>",
"<unk_8>",
"<unk_9>",
"<unk_10>",
"<unk_11>",
"<unk_12>",
"<unk_13>",
"<unk_14>",
"<unk_15>",
"<unk_16>",
"<unk_17>",
"<unk_18>",
"<unk_19>",
"<unk_20>",
"<unk_21>",
"<unk_22>",
"<unk_23>",
"<unk_24>",
"<unk_25>",
"<unk_26>",
"<unk_27>",
"<unk_28>",
"<unk_29>",
"<unk_30>",
"<unk_31>",
"<unk_32>",
"<unk_33>",
"<unk_34>",
"<unk_35>",
"<unk_36>",
"<unk_37>",
"<unk_38>",
"<unk_39>",
"<unk_40>",
"<unk_41>",
"<unk_42>",
"<unk_43>",
"<unk_44>",
"<unk_45>",
"<unk_46>",
"<unk_47>",
"<unk_48>",
"<unk_49>",
"<unk_50>",
"<unk_51>",
"<unk_52>",
"<unk_53>",
"<unk_54>",
"<unk_55>",
"<unk_56>",
"<unk_57>",
"<unk_58>",
"<unk_59>",
"<unk_60>",
"<unk_61>",
"<unk_62>",
"<unk_63>",
"<unk_64>",
"<unk_65>",
"<unk_66>",
"<unk_67>",
"<unk_68>",
"<unk_69>",
"<unk_70>",
"<unk_71>",
"<unk_72>",
"<unk_73>",
"<unk_74>",
"<unk_75>",
"<unk_76>",
"<unk_77>",
"<unk_78>",
"<unk_79>",
"<unk_80>",
"<unk_81>",
"<unk_82>",
"<unk_83>",
"<unk_84>",
"<unk_85>",
"<unk_86>",
"<unk_87>",
"<unk_88>",
"<unk_89>",
"<unk_90>",
"<unk_91>",
"<unk_92>",
"<unk_93>",
"<unk_94>",
"<unk_95>",
"<unk_96>",
"<unk_97>",
"<unk_98>",
"<unk_99>",
"<unk_100>",
"<unk_101>",
"<unk_102>"
],
"eos_token": "</s>",
"full_tokenizer_file": null,
"mask_token": "<mask_2>",
"mask_token_sent": "<mask_1>",
"model_max_length": 512,
"offset": 103,
"pad_token": "<pad>",
"sp_model_kwargs": {},
"special_tokens_map_file": null,
"tokenizer_class": "PegasusTokenizer",
"unk_token": "<unk>"
}
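The two JSON files above, together with `spiece.model`, look like the artifacts written by `PegasusTokenizer.save_pretrained`. A quick, self-contained check that the directory loads and tokenizes as expected might look like the following; the sample question is illustrative only.

from transformers import PegasusTokenizer

tok = PegasusTokenizer.from_pretrained("./data/utils/summ_tokenizers/")
enc = tok("What is the warranty period of the laptops?", return_tensors="pt")
print(enc["input_ids"].shape)            # (1, sequence_length)
print(tok.decode(enc["input_ids"][0]))   # round-trips the question plus </s>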