-
Notifications
You must be signed in to change notification settings - Fork 0
/
english_generator.py
executable file
·94 lines (75 loc) · 3.8 KB
/
english_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# -*- coding: utf-8 -*-
"""PretrainQA.ipynb

Question and question/answer generation over a text passage, using the
Haystack framework with an Elasticsearch document store.

Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1kxrCUUv4RlJ910SICMQh1oDjygAEbN-v
"""
# Sample passage used by the test-drive calls further down the file.
text = "Nikola Tesla (Serbian Cyrillic: Никола Тесла; 10 July 1856 – 7 January 1943) was a Serbian American inventor, electrical engineer, mechanical engineer, physicist, and futurist best known for his contributions to the design of the modern alternating current (AC) electricity supply system."
import logging
# Silence most libraries (WARNING and up), but keep Haystack's own INFO logs visible.
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)
from pprint import pprint
from tqdm.auto import tqdm
# Importing Haystack - an open-source framework for creating, in our specific case, retrieval-augmented generative pipelines
from haystack.nodes import QuestionGenerator, BM25Retriever, FARMReader
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.pipelines import (
    QuestionGenerationPipeline,
    RetrieverQuestionGenerationPipeline,
    QuestionAnswerGenerationPipeline,
)
from haystack.utils import launch_es, print_questions
# Launching an Elasticsearch server (a distributed, RESTful search and analytics engine) using Docker (a virtualization and containerization utility)
from haystack.utils import launch_es
launch_es()
def generate_questions(text, host="localhost", port=9200):
    """Generate questions from a passage of text with Haystack's QuestionGenerator.

    The passage is written as the sole document into an Elasticsearch-backed
    document store (the index is cleared first), then the question-generation
    pipeline is run over it.

    Parameters
    ----------
    text : str
        The passage to generate questions from.
    host : str, optional
        Elasticsearch host (default "localhost"; previously hard-coded).
    port : int, optional
        Elasticsearch port (default 9200; previously hard-coded).

    Returns
    -------
    list
        The questions generated for the stored document, or an empty list if
        the store ended up holding no documents.
    """
    docs = [{"content": text}]
    # Initializing the database for storing texts and metadata.
    document_store = ElasticsearchDocumentStore(host=host, port=port)
    # Start from a clean index so only `text` is present.
    document_store.delete_documents()
    # Writing the text into the database.
    document_store.write_documents(docs)
    # Initializing the question generation pipeline.
    question_generator = QuestionGenerator()
    question_generation_pipeline = QuestionGenerationPipeline(question_generator)
    # Iterate the store (we expect exactly one document) and return the
    # questions for the first one. Returning inside the loop also avoids a
    # NameError on `result` if the store is unexpectedly empty.
    for document in document_store:
        result = question_generation_pipeline.run(documents=[document])
        return result["generated_questions"][0]["questions"]
    # Store was empty — nothing to generate.
    return []
# Test drive: print each question generated for the sample passage.
# NOTE(review): requires the Elasticsearch instance launched above.
for i in generate_questions(text):
    print(f" * {i}")
def generate_QA(text, host="localhost", port=9200):
    """Generate question/answer pairs from a passage of text.

    Questions are proposed by Haystack's QuestionGenerator and answered by a
    pretrained extractive reader (deepset/roberta-base-squad2). The passage is
    written as the sole document into an Elasticsearch-backed document store
    (the index is cleared first).

    Parameters
    ----------
    text : str
        The passage to generate question/answer pairs from.
    host : str, optional
        Elasticsearch host (default "localhost"), matching generate_questions.
    port : int, optional
        Elasticsearch port (default 9200), matching generate_questions.

    Returns
    -------
    dict
        {"questions": [...], "answers": [...]} — parallel lists where the
        answer at index i is the top-ranked answer to the question at index i.
        Both lists are empty if the store ended up holding no documents.
    """
    docs = [{"content": text}]
    # Initializing the database for storing texts and metadata.
    document_store = ElasticsearchDocumentStore(host=host, port=port)
    # Start from a clean index so only `text` is present.
    document_store.delete_documents()
    # Writing the text into the database.
    document_store.write_documents(docs)
    # Question-generation node feeding the QA pipeline below.
    question_generator = QuestionGenerator()
    # Loading the pretrained RoBERTa model.
    # TODO: load the reader in a separate function so repeated calls to
    # generate_QA do not reload the model and overuse memory.
    reader = FARMReader("deepset/roberta-base-squad2")
    # Initializing the QnA generation pipeline.
    qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)
    # Iterate the store (we expect exactly one document) and return the QnA
    # for the first one; returning inside the loop avoids referencing an
    # undefined `result` if the store is unexpectedly empty.
    for idx, document in enumerate(tqdm(document_store)):
        # Logging
        print(f"\n * Generating questions and answers for document {idx}: {document.content[:100]}...\n")
        # Generating the QnA
        result = qag_pipeline.run(documents=[document])
        return {
            "questions": result["queries"],
            # Keep only the top-ranked answer for each question.
            "answers": [candidates[0].answer for candidates in result["answers"]],
        }
    # Store was empty — nothing to generate.
    return {"questions": [], "answers": []}
# Test drive: generate QA pairs for the sample passage and show the questions.
# NOTE(review): requires the Elasticsearch instance launched above.
result = generate_QA(text)
# NOTE(review): the bare expression below is Colab notebook residue (it would
# display `result` in a notebook cell); it has no effect when run as a script.
result
print(result["questions"])