Dev => MAIN (FG subsystem and fgen main) #4

Merged: 19 commits, Apr 1, 2023
12 changes: 11 additions & 1 deletion email2faq/data/asset/sample_email_dataset_raw.csv
@@ -317,7 +317,17 @@ X-Folder: \Steven_Kean_Nov2001_1\Notes Folders\All documents
X-Origin: KEAN-S
X-FileName: skean.nsf

FYI. what are you upto? is everythimg ok? This is the bill to de-link the bonds from the DWR contracts (recall

FYI. what are you upto? is everythimg ok? What are the different types of laptops available?
What are the specifications of each type of laptop?
What is the battery life of each laptop model?
What is the warranty period of the laptops?
What is the cost of each laptop model?
Can you tell me the battery life for laptops?
What are all the different types of laptop available?
What is average battery life of laptops?
What will be good specs for a gaming latop?
This is the bill to de-link the bonds from the DWR contracts (recall
that the argument to end DA by the Gov's finance folks, the bond folks, and
Angelides, the Treasurer stems directly from this linkage).

22 changes: 20 additions & 2 deletions email2faq/data/dataloader.py
@@ -28,7 +28,6 @@ def __init__(self, dataset, batch_size=8):
self.dataset = dataset
self.batch_size = batch_size
self.X = self.dataset.preprocess()
print(len(self.X))
self.create_loaders()

def create_loaders(self):
@@ -42,4 +41,23 @@ def create_loaders(self):

data = TensorDataset(left, right)
sampler = SequentialSampler(data)
self.dataloader = DataLoader(data, sampler=sampler, batch_size=self.batch_size)
# %%
class FGDataLoader:
''' This class assumes that the input is a list of lists, where each list contains a cluster of similar sentences. '''

'''Not sure how to deal with batches here. '''

def __init__(self,dataset):
self.dataset = dataset
self.inputs = self.dataset.preprocess()
# self.create_loader()

def create_loader(self):
# TODO implement batching
pass
# sampler = SequentialSampler(self.inputs)
# self.dataloader = DataLoader(self.inputs, sampler=sampler)

#%%
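The `create_loader` TODO above is still open. Below is a minimal sketch of one way it could batch the variable-length tokenized clusters, assuming `self.inputs` is the list of `BatchEncoding` objects returned by `ClusterDataset.preprocess()`; the batch size, padding value, and collate helper are illustrative and not part of this PR.

import torch
from torch.utils.data import DataLoader

def create_loader(self, batch_size=4):
    """Sketch only: pad each batch of tokenized clusters to a common length."""
    def collate(batch):
        # Each item is a tokenizer output holding (1, seq_len) tensors; squeeze and pad.
        ids = torch.nn.utils.rnn.pad_sequence(
            [item["input_ids"].squeeze(0) for item in batch],
            batch_first=True, padding_value=0)  # 0 assumed as the pad id
        mask = torch.nn.utils.rnn.pad_sequence(
            [item["attention_mask"].squeeze(0) for item in batch],
            batch_first=True, padding_value=0)
        return {"input_ids": ids, "attention_mask": mask}

    self.dataloader = DataLoader(self.inputs, batch_size=batch_size, collate_fn=collate)

Iterating the resulting dataloader would then yield padded `input_ids`/`attention_mask` batches ready for a downstream summarization model.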
71 changes: 67 additions & 4 deletions email2faq/data/dataset.py
@@ -5,14 +5,17 @@
from data.utils.preprocess_utils_fgg import make_w2v_embeddings, split_and_zero_padding
from keras_preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer
from transformers import PegasusTokenizer
from itertools import product

#%%
# tokenizer = AutoTokenizer.from_pretrained("shahrukhx01/question-vs-statement-classifier")
# tokenizer.save_pretrained("./utils/tokenizers/")
#%%
TOKENIZER = AutoTokenizer.from_pretrained("./data/utils/tokenizers/")

#%%
# TOKENIZER = AutoTokenizer.from_pretrained("./data/utils/tokenizers/")
# FG_TOKENIZER = PegasusTokenizer.from_pretrained("./data/utils/summ_tokenizers/")
#%%
class EmailsDataset(Dataset):
"""
"""
@@ -22,7 +25,7 @@ def __init__(self, emails_file, email_field = "mails"):
self._preprocess_emails()

# transformer preprocess parameters
self.tokenizer = TOKENIZER
self.tokenizer = AutoTokenizer.from_pretrained("./data/utils/tokenizers/")
self.max_sequence_length = 512

def __len__(self):
@@ -95,6 +98,7 @@ def process_texts(self):

## create masks for train, val, test texts
masks = self._create_attention_mask(ids)
del self.tokenizer
return ids, masks

#%%
@@ -151,4 +155,63 @@ def __getitem__(self, idx):
# return len(self.embeded_pairwise_df)

# def __getitem__(self, idx):
# return [self.embeded_pairwise_df['left'][idx], self.embeded_pairwise_df['right'][idx]]
#%%

class ClusterDataset(Dataset):
"""
Assuming that the data is in the form of a list of lists
where each list contains a set of strings of similar sentences
"""

def __init__(self, query_clusters):
self.query_clusters = query_clusters
self.sentences = [sen for cluster in self.query_clusters for sen in cluster]
self.query_para_clusters = [("? ".join(cluster) + "?") for cluster in self.query_clusters]

self.tokenizer = PegasusTokenizer.from_pretrained("./data/utils/summ_tokenizers/")

def __len__(self):
"""
Total number of paragraphs present in the query_clusters
"""
return len(self.query_para_clusters)

def __getitem__(self, idx):
"""
This is for accessing a paragraph of similar questions in a cluster
"""
return self.query_para_clusters[idx]

def sen_item(self,idx):
"""
Returns the idx-th sentence in the dataset
"""
return self.sentences[idx]


def sen_len(self):
"""
Total number of sentences present in the query_clusters
"""
return len(self.sentences)

def preprocess(self):
inputs = [self.tokenizer(text, max_length=1024, padding=True, truncation=True, return_tensors="pt") for text in self.query_para_clusters]
return inputs

# #%%
# query_clusters = [ ["What are the different types of laptops available?",
# "What are the specifications of each type of laptop?",
# "What is the battery life of each laptop model?",
# "What is the warranty period of the laptops?",
# "What is the cost of each laptop model?"],
# [
# "What destinations are available for travel?",
# "What is the duration of the trips?",
# "What is the cost of the trips?",
# "What are the payment options available?",
# "Is there an option for installment payment?"]]
# dataset = ClusterDataset(query_clusters)
# inputs_t = dataset.preprocess()
# # %%
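The commented-out block above shows how a `ClusterDataset` is built from hand-written query clusters. The following is a hedged sketch of what a downstream consumer of `preprocess()` might look like; the Pegasus checkpoint name and generation settings are assumptions, since the summarization model itself is not part of this diff.

from transformers import PegasusForConditionalGeneration

query_clusters = [[
    "What are the different types of laptops available?",
    "What is the battery life of each laptop model?",
]]
dataset = ClusterDataset(query_clusters)
inputs_t = dataset.preprocess()

# Assumed checkpoint for illustration only; the PR does not pin a model.
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
for enc in inputs_t:
    summary_ids = model.generate(enc["input_ids"],
                                 attention_mask=enc["attention_mask"],
                                 max_length=64)
    # One candidate FAQ question per cluster of similar queries.
    print(dataset.tokenizer.decode(summary_ids[0], skip_special_tokens=True))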
3 changes: 0 additions & 3 deletions email2faq/data/utils/preprocess_utils_fgg.py
@@ -52,9 +52,6 @@ def split_and_zero_padding(df, max_seq_length):
# Zero padding
for dataset, side in itertools.product([X], ['left', 'right']):
dataset[side] = pad_sequences(dataset[side], padding='pre', truncating='post', maxlen=max_seq_length)

print(dataset['left'], dataset['right'])

return dataset

def make_w2v_embeddings(df, embedding_dim=300):
110 changes: 110 additions & 0 deletions email2faq/data/utils/summ_tokenizers/special_tokens_map.json
@@ -0,0 +1,110 @@
{
"additional_special_tokens": [
"<mask_1>",
"<unk_2>",
"<unk_3>",
"<unk_4>",
"<unk_5>",
"<unk_6>",
"<unk_7>",
"<unk_8>",
"<unk_9>",
"<unk_10>",
"<unk_11>",
"<unk_12>",
"<unk_13>",
"<unk_14>",
"<unk_15>",
"<unk_16>",
"<unk_17>",
"<unk_18>",
"<unk_19>",
"<unk_20>",
"<unk_21>",
"<unk_22>",
"<unk_23>",
"<unk_24>",
"<unk_25>",
"<unk_26>",
"<unk_27>",
"<unk_28>",
"<unk_29>",
"<unk_30>",
"<unk_31>",
"<unk_32>",
"<unk_33>",
"<unk_34>",
"<unk_35>",
"<unk_36>",
"<unk_37>",
"<unk_38>",
"<unk_39>",
"<unk_40>",
"<unk_41>",
"<unk_42>",
"<unk_43>",
"<unk_44>",
"<unk_45>",
"<unk_46>",
"<unk_47>",
"<unk_48>",
"<unk_49>",
"<unk_50>",
"<unk_51>",
"<unk_52>",
"<unk_53>",
"<unk_54>",
"<unk_55>",
"<unk_56>",
"<unk_57>",
"<unk_58>",
"<unk_59>",
"<unk_60>",
"<unk_61>",
"<unk_62>",
"<unk_63>",
"<unk_64>",
"<unk_65>",
"<unk_66>",
"<unk_67>",
"<unk_68>",
"<unk_69>",
"<unk_70>",
"<unk_71>",
"<unk_72>",
"<unk_73>",
"<unk_74>",
"<unk_75>",
"<unk_76>",
"<unk_77>",
"<unk_78>",
"<unk_79>",
"<unk_80>",
"<unk_81>",
"<unk_82>",
"<unk_83>",
"<unk_84>",
"<unk_85>",
"<unk_86>",
"<unk_87>",
"<unk_88>",
"<unk_89>",
"<unk_90>",
"<unk_91>",
"<unk_92>",
"<unk_93>",
"<unk_94>",
"<unk_95>",
"<unk_96>",
"<unk_97>",
"<unk_98>",
"<unk_99>",
"<unk_100>",
"<unk_101>",
"<unk_102>"
],
"eos_token": "</s>",
"mask_token": "<mask_2>",
"pad_token": "<pad>",
"unk_token": "<unk>"
}
Binary file added email2faq/data/utils/summ_tokenizers/spiece.model
Binary file not shown.
117 changes: 117 additions & 0 deletions email2faq/data/utils/summ_tokenizers/tokenizer_config.json
@@ -0,0 +1,117 @@
{
"additional_special_tokens": [
"<mask_1>",
"<unk_2>",
"<unk_3>",
"<unk_4>",
"<unk_5>",
"<unk_6>",
"<unk_7>",
"<unk_8>",
"<unk_9>",
"<unk_10>",
"<unk_11>",
"<unk_12>",
"<unk_13>",
"<unk_14>",
"<unk_15>",
"<unk_16>",
"<unk_17>",
"<unk_18>",
"<unk_19>",
"<unk_20>",
"<unk_21>",
"<unk_22>",
"<unk_23>",
"<unk_24>",
"<unk_25>",
"<unk_26>",
"<unk_27>",
"<unk_28>",
"<unk_29>",
"<unk_30>",
"<unk_31>",
"<unk_32>",
"<unk_33>",
"<unk_34>",
"<unk_35>",
"<unk_36>",
"<unk_37>",
"<unk_38>",
"<unk_39>",
"<unk_40>",
"<unk_41>",
"<unk_42>",
"<unk_43>",
"<unk_44>",
"<unk_45>",
"<unk_46>",
"<unk_47>",
"<unk_48>",
"<unk_49>",
"<unk_50>",
"<unk_51>",
"<unk_52>",
"<unk_53>",
"<unk_54>",
"<unk_55>",
"<unk_56>",
"<unk_57>",
"<unk_58>",
"<unk_59>",
"<unk_60>",
"<unk_61>",
"<unk_62>",
"<unk_63>",
"<unk_64>",
"<unk_65>",
"<unk_66>",
"<unk_67>",
"<unk_68>",
"<unk_69>",
"<unk_70>",
"<unk_71>",
"<unk_72>",
"<unk_73>",
"<unk_74>",
"<unk_75>",
"<unk_76>",
"<unk_77>",
"<unk_78>",
"<unk_79>",
"<unk_80>",
"<unk_81>",
"<unk_82>",
"<unk_83>",
"<unk_84>",
"<unk_85>",
"<unk_86>",
"<unk_87>",
"<unk_88>",
"<unk_89>",
"<unk_90>",
"<unk_91>",
"<unk_92>",
"<unk_93>",
"<unk_94>",
"<unk_95>",
"<unk_96>",
"<unk_97>",
"<unk_98>",
"<unk_99>",
"<unk_100>",
"<unk_101>",
"<unk_102>"
],
"eos_token": "</s>",
"full_tokenizer_file": null,
"mask_token": "<mask_2>",
"mask_token_sent": "<mask_1>",
"model_max_length": 512,
"offset": 103,
"pad_token": "<pad>",
"sp_model_kwargs": {},
"special_tokens_map_file": null,
"tokenizer_class": "PegasusTokenizer",
"unk_token": "<unk>"
}
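The two JSON files above, together with `spiece.model`, look like the artifacts written by `PegasusTokenizer.save_pretrained`. A quick, self-contained check that the directory loads and tokenizes as expected might look like the following; the sample question is illustrative only.

from transformers import PegasusTokenizer

tok = PegasusTokenizer.from_pretrained("./data/utils/summ_tokenizers/")
enc = tok("What is the warranty period of the laptops?", return_tensors="pt")
print(enc["input_ids"].shape)            # (1, sequence_length)
print(tok.decode(enc["input_ids"][0]))   # round-trips the question plus </s>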