Skip to content

Commit 7fbd437

Browse files
committed
init code
1 parent fdd5ec1 commit 7fbd437

25 files changed

+2622
-1
lines changed

.gitignore

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
share/python-wheels/
24+
*.egg-info/
25+
.installed.cfg
26+
*.egg
27+
MANIFEST
28+
29+
# PyInstaller
30+
# Usually these files are written by a python script from a template
31+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
32+
*.manifest
33+
*.spec
34+
35+
# Installer logs
36+
pip-log.txt
37+
pip-delete-this-directory.txt
38+
39+
# Unit test / coverage reports
40+
htmlcov/
41+
.tox/
42+
.nox/
43+
.coverage
44+
.coverage.*
45+
.cache
46+
nosetests.xml
47+
coverage.xml
48+
*.cover
49+
*.py,cover
50+
.hypothesis/
51+
.pytest_cache/
52+
cover/
53+
54+
# Translations
55+
*.mo
56+
*.pot
57+
58+
# Django stuff:
59+
*.log
60+
local_settings.py
61+
db.sqlite3
62+
db.sqlite3-journal
63+
64+
# Flask stuff:
65+
instance/
66+
.webassets-cache
67+
68+
# Scrapy stuff:
69+
.scrapy
70+
71+
# Sphinx documentation
72+
docs/_build/
73+
74+
# PyBuilder
75+
.pybuilder/
76+
target/
77+
78+
# Jupyter Notebook
79+
.ipynb_checkpoints
80+
81+
# IPython
82+
profile_default/
83+
ipython_config.py
84+
85+
# pyenv
86+
# For a library or package, you might want to ignore these files since the code is
87+
# intended to run in multiple environments; otherwise, check them in:
88+
# .python-version
89+
90+
# pipenv
91+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
93+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
94+
# install all needed dependencies.
95+
#Pipfile.lock
96+
97+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
98+
__pypackages__/
99+
100+
# Celery stuff
101+
celerybeat-schedule
102+
celerybeat.pid
103+
104+
# SageMath parsed files
105+
*.sage.py
106+
107+
# Environments
108+
.env
109+
.venv
110+
env/
111+
venv/
112+
ENV/
113+
env.bak/
114+
venv.bak/
115+
116+
# Spyder project settings
117+
.spyderproject
118+
.spyproject
119+
120+
# Rope project settings
121+
.ropeproject
122+
123+
# mkdocs documentation
124+
/site
125+
126+
# mypy
127+
.mypy_cache/
128+
.dmypy.json
129+
dmypy.json
130+
131+
# Pyre type checker
132+
.pyre/
133+
134+
# pytype static type analyzer
135+
.pytype/
136+
137+
# Cython debug symbols
138+
cython_debug/

README.md

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,36 @@ unsupervised model in F1 score on the SQuAD dataset by about 14%, and 20% when
1818
the answer is a named entity, achieving state-of-the-art performance on SQuAD
1919
for unsupervised QA.
2020

21+
## Synthetic data
22+
23+
Generated synthetic data for the publication is located under `enwiki_synthetic/`.
24+
2125
## Usage
2226

23-
Code and instructions to appear soon
27+
Instructions to appear soon
2428

2529
## Citation
2630

31+
<https://www.aclweb.org/anthology/2020.acl-main.413/>
32+
2733
```
34+
@inproceedings{fabbri-etal-2020-template,
35+
title = "Template-Based Question Generation from Retrieved Sentences for Improved Unsupervised Question Answering",
36+
author = "Fabbri, Alexander and
37+
Ng, Patrick and
38+
Wang, Zhiguo and
39+
Nallapati, Ramesh and
40+
Xiang, Bing",
41+
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
42+
month = jul,
43+
year = "2020",
44+
address = "Online",
45+
publisher = "Association for Computational Linguistics",
46+
url = "https://www.aclweb.org/anthology/2020.acl-main.413",
47+
doi = "10.18653/v1/2020.acl-main.413",
48+
pages = "4508--4513",
49+
abstract = "Question Answering (QA) is in increasing demand as the amount of information available online and the desire for quick access to this content grows. A common approach to QA has been to fine-tune a pretrained language model on a task-specific labeled dataset. This paradigm, however, relies on scarce, and costly to obtain, large-scale human-labeled data. We propose an unsupervised approach to training QA models with generated pseudo-training data. We show that generating questions for QA training by applying a simple template on a related, retrieved sentence rather than the original context sentence improves downstream QA performance by allowing the model to learn more complex context-question relationships. Training a QA model on this data gives a relative improvement over a previous unsupervised model in F1 score on the SQuAD dataset by about 14{\%}, and 20{\%} when the answer is a named entity, achieving state-of-the-art performance on SQuAD for unsupervised QA.",
50+
}
2851
```
2952

3053

distant_supervision/constants.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Hyperparameters for distant-supervision data generation.
# Suffix conventions: ULIM = upper limit (cap), LLIM = lower limit.

NUM_OF_SENTENCES_ULIM = 5

NUM_ENTITIES_PER_ARTICLE_TO_CONSIDER = 30  # only in v2
NUM_ENTITIES_PER_ARTICLE_TO_KEEP = 5

NUM_WORDS_IN_QUERY_SENTENCE_ULIM = 100

NUM_ARTICLES_PER_ENTITY_LLIM = 2
NUM_ARTICLES_PER_ENTITY_ULIM = 30  # limit number of backfill articles

# NOTE: word counts below use a naive split() rather than the BPE vocab count in BERT.
# Earlier settings (from zgw-exps), kept for reference:
#   QUESTION_NUM_WORDS_ULIM = 100  # still keep fairly long sentences
#   ANSWER_NUM_CHARS_ULIM = 100    # not using the parameters from zgw-exps max_answer_size
#   (QUESTION_NUM_WORDS_ULIM had 50 in zgw-exps)

QUESTION_NUM_WORDS_ULIM = 64  # same as RC setting in pytorch-bert (--max_query_length)
ANSWER_NUM_CHARS_ULIM = 30  # same as RC setting in pytorch-bert (--max_answer_length)

# https://github.com/google-research/bert/issues/66 (384 words)
# pytorch-bert uses 384 tokens, with stride
CONTEXT_NUM_WORDS_ULIM = 400

distant_supervision/data_models.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
import json
2+
import copy
3+
from enum import Enum
4+
from .text_preprocessor import TextPreprocessor
5+
6+
7+
class PhraseMode(Enum):
    """Which candidate answer phrases to extract from a sentence.

    NER_ONLY restricts candidates to named entities; ALL additionally
    includes noun chunks.
    """

    NER_ONLY = 'ner_only'
    ALL = 'all'
10+
11+
12+
class QuestionStyle(Enum):
    """Supported question formats.

    If cloze-style is "[FragmentA] [PERSON] [FragmentB]", then:

    - "Who [FragmentB] [FragmentA]?" - WBA
    - "[FragmentA], who [FragmentB]?" - AWB
    """

    CLOZE_CATEGORY = 'cloze_category_style'
    CLOZE_GENERIC = 'cloze_generic_style'
    TEMPLATE_WBA = 'template_wba_style'
    TEMPLATE_AWB = 'template_awb_style'
23+
24+
25+
class RcQuestion:
    """
    RC Question data, e.g. squad_rc_train.jsonl

    {
        "qid": "57277c965951b619008f8b2b",
        "question": "What do people engage in after they've disguised themselves?",
        "context": "In Greece Carnival is also ...",
        "answers": [
            {
                "ner_category": "SOME_CATEGORY", # optional, depending on whether it's squad-ner or original
                "answer_start": 677,
                "text": "pranks and revelry"
            }
        ],
        "article_title": "Carnival"
    }
    """

    def __init__(self, *, qid=None, question=None, context=None, article_title=None, answers=None):
        self.qid = qid
        self.question = question
        self.context = context
        self.answers = answers  # list of {'text': "foobar", "answer_start": 48}

        self.article_title = article_title

    def jsonify(self):
        """Serialize all instance attributes to a JSON string."""
        return json.dumps(self.__dict__)

    @classmethod
    def deserialize_json(cls, json_str):
        """Rebuild a question from a jsonify() string.

        Uses cls() rather than a hard-coded class name so subclasses
        deserialize to their own type; any extra keys in the JSON become
        plain attributes.
        """
        question = cls()
        for key, value in json.loads(json_str).items():
            setattr(question, key, value)
        return question
62+
63+
64+
class Article:
    """A wiki article plus (optionally) its preprocessed sentences."""

    def __init__(self):
        # Attributes are filled in by import_from() / deserialize_json();
        # default them so a bare Article is safe to inspect or serialize.
        self.text = None
        self.id = None
        self.title = None
        self.sents = None  # becomes a list of Sentence once preprocessed

    def import_from(self, raw_row):
        """Populate from a raw dump row with 'text', 'id', 'title' keys."""
        self.text = raw_row['text']
        self.id = int(raw_row['id'])  # ids arrive as strings in the dump
        self.title = raw_row['title']

        self.sents = None

    @classmethod
    def deserialize_json(cls, json_str):
        """Rebuild an Article (incl. Sentence objects) from jsonify() output.

        Uses cls() rather than a hard-coded class name so subclasses
        deserialize to their own type.
        """
        article = cls()
        for k, v in json.loads(json_str).items():
            setattr(article, k, v)

        # Sentences were serialized as plain dicts; re-wrap them as Sentence.
        article.sents = [
            Sentence(sent['id'], sent['text'], sent['ents'], sent['noun_chunks'])
            for sent in article.sents
        ]

        return article

    def __repr__(self):
        return str(self.__dict__)

    def jsonify(self):
        """Serialize all instance attributes to a JSON string."""
        return json.dumps(self.__dict__)
94+
95+
class Sentence:
    """One preprocessed sentence: text plus extracted entity/chunk spans.

    noun chunks: https://nlp.stanford.edu/software/dependencies_manual.pdf
    """

    def __init__(self, id, text, ents, noun_chunks):
        # note that this is sentence ID, and not article_id
        self.id = id
        self.text = text
        # Normalize spans to 2-tuples (they may arrive as lists from JSON).
        self.ents = [(span[0], span[1]) for span in ents]
        self.noun_chunks = [(span[0], span[1]) for span in noun_chunks]

    def get_phrases(self, phrase_mode):
        """Extract candidate phrases; NER_ONLY mode ignores noun chunks."""
        chunks = [] if phrase_mode is PhraseMode.NER_ONLY else self.noun_chunks
        return TextPreprocessor.get_phrases(entities=self.ents, noun_chunks=chunks)

    def __repr__(self):
        return str(self.__dict__)
114+
115+
116+
class PhraseObj:
    """An answer phrase paired with its NER category."""

    def __init__(self, phrase_str, phrase_category):
        self.phrase_str = phrase_str
        self.phrase_category = phrase_category

    @classmethod
    def import_from(cls, row):
        """
        Example format: [["0 to 6 years", "DATE"], [5809465, 53318614, 49544471, 27237145, 54568155]]

        Only the leading (phrase, category) pair is used; the trailing
        list of article ids is ignored here.
        """
        phrase_str, phrase_category = row[0]
        return cls(phrase_str, phrase_category)

    def __repr__(self):
        return str(self.__dict__)
133+
134+
135+
class DsDatum:
    """One distantly-supervised training example.

    styled_questions maps a QuestionStyle value (str) to the generated
    question text for that style.
    """

    def __init__(self, qid, styled_questions, context, answers):
        self.qid = qid
        self.styled_questions = styled_questions
        self.context = context
        self.answers = answers
        self.meta = None

    def __repr__(self):
        return str(self.__dict__)

    def jsonify(self):
        """Serialize every attribute, keeping all question styles."""
        return json.dumps(self.__dict__)

    def jsonify_single_style(self, question_style):
        """Serialize with a single 'question' field for the given style.

        Works on a deep copy so flattening never mutates self.
        """
        flat = copy.deepcopy(self.__dict__)
        flat['question'] = self.styled_questions[question_style.value]
        flat.pop('styled_questions')

        return json.dumps(flat)
155+
156+
157+
class QueriesPerArticleObj:
    """Retrieval results for one article, tied to one answer phrase."""

    def __init__(self, *, article_id, article_title, article_raw, article_phrases, filtered_sents, phrase):
        # Coerce to int: ids may arrive as strings from upstream dumps.
        self.article_id = int(article_id)
        self.article_title = article_title
        self.article_raw = article_raw
        self.article_phrases = article_phrases
        self.filtered_sents = filtered_sents
        self.phrase = phrase  # answer phrase

    def __repr__(self):
        return str(self.__dict__)

0 commit comments

Comments
 (0)