-
Notifications
You must be signed in to change notification settings - Fork 0
/
Markov_chain_program.py
51 lines (50 loc) · 2.56 KB
/
Markov_chain_program.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from markov_chain import markovChainModel,predict
from scipy.sparse import load_npz,hstack
print("Loading data")
import json
def readFile(filename):
questions = []
articleTitles = []
articleTexts = []
answers = [] # Stores questions and article titles and article contents and their associated answers, which are stored as strings.
# I can access the questions by using [:,0]
#TODO: Find a way to store questions and article content as keys.
# TODO: Convert unicode to string.
#NOTE: I use questions_answers rather than articleTitles_answers because articles can have multiple answers.
with open(filename) as file:
data = json.load(file)
articles = data["data"]
# Iterate through articles, looking for question/answer pairs.
for article in articles:
article_title = article["title"].encode('utf-8','replace') # Converts Unicode object to string.
article_paragraphs = article["paragraphs"]
article_text = "".join([str(paragraph["context"].encode('ascii','replace')) for paragraph in article_paragraphs])
if (len(article_paragraphs) == 0):
print("O")
for paragraph in article_paragraphs:
qas_pairs = paragraph["qas"]
# Check if this paragraph has questions.
if (len(qas_pairs) == 0):
print("O")
for qas_pair in qas_pairs:
# Note: There's another attribute called "context", which may come in handy.
answer = qas_pair["answers"][0]
answer_text = answer["text"].encode('ascii','replace') # Converts Unicode object to string.
# Get where to find the answers.
#answer_start = answer["answer_start"]
#answer_end = answer_start + len(answer_text) - 1
question = str(qas_pair["question"].encode('ascii','replace'))
questions.append(question)
articleTitles.append(article_title)
articleTexts.append(article_text)
answers.append(answer_text)
return questions,articleTitles,articleTexts,answers
print("Data loaded; preprocessing to make compatible with Markov chains")
dataset_2 = hstack((X_2_dev,Y_2_dev))
# Turn the dataset into an iterator.
def datasetGenerator(dataset):
dataset = dataset.toarray()
while True:
yield iter(str(dataset[0]).split())
print("Preprocessing complete; training Markov models")
model_2 = markovChainModel(datasetGenerator(Y_2_dev))