Skip to content

Commit 7fbd437

Browse files
committed
init code
1 parent fdd5ec1 commit 7fbd437

25 files changed

+2622
-1
lines changed

.gitignore

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
share/python-wheels/
24+
*.egg-info/
25+
.installed.cfg
26+
*.egg
27+
MANIFEST
28+
29+
# PyInstaller
30+
# Usually these files are written by a python script from a template
31+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
32+
*.manifest
33+
*.spec
34+
35+
# Installer logs
36+
pip-log.txt
37+
pip-delete-this-directory.txt
38+
39+
# Unit test / coverage reports
40+
htmlcov/
41+
.tox/
42+
.nox/
43+
.coverage
44+
.coverage.*
45+
.cache
46+
nosetests.xml
47+
coverage.xml
48+
*.cover
49+
*.py,cover
50+
.hypothesis/
51+
.pytest_cache/
52+
cover/
53+
54+
# Translations
55+
*.mo
56+
*.pot
57+
58+
# Django stuff:
59+
*.log
60+
local_settings.py
61+
db.sqlite3
62+
db.sqlite3-journal
63+
64+
# Flask stuff:
65+
instance/
66+
.webassets-cache
67+
68+
# Scrapy stuff:
69+
.scrapy
70+
71+
# Sphinx documentation
72+
docs/_build/
73+
74+
# PyBuilder
75+
.pybuilder/
76+
target/
77+
78+
# Jupyter Notebook
79+
.ipynb_checkpoints
80+
81+
# IPython
82+
profile_default/
83+
ipython_config.py
84+
85+
# pyenv
86+
# For a library or package, you might want to ignore these files since the code is
87+
# intended to run in multiple environments; otherwise, check them in:
88+
# .python-version
89+
90+
# pipenv
91+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
93+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
94+
# install all needed dependencies.
95+
#Pipfile.lock
96+
97+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
98+
__pypackages__/
99+
100+
# Celery stuff
101+
celerybeat-schedule
102+
celerybeat.pid
103+
104+
# SageMath parsed files
105+
*.sage.py
106+
107+
# Environments
108+
.env
109+
.venv
110+
env/
111+
venv/
112+
ENV/
113+
env.bak/
114+
venv.bak/
115+
116+
# Spyder project settings
117+
.spyderproject
118+
.spyproject
119+
120+
# Rope project settings
121+
.ropeproject
122+
123+
# mkdocs documentation
124+
/site
125+
126+
# mypy
127+
.mypy_cache/
128+
.dmypy.json
129+
dmypy.json
130+
131+
# Pyre type checker
132+
.pyre/
133+
134+
# pytype static type analyzer
135+
.pytype/
136+
137+
# Cython debug symbols
138+
cython_debug/

README.md

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,36 @@ unsupervised model in F1 score on the SQuAD dataset by about 14%, and 20% when
1818
the answer is a named entity, achieving state-of-the-art performance on SQuAD
1919
for unsupervised QA.
2020

21+
## Synthetic data
22+
23+
Generated synthetic data for the publication is located under `enwiki_synthetic/`.
24+
2125
## Usage
2226

23-
Code and instructions to appear soon
27+
Instructions to appear soon
2428

2529
## Citation
2630

31+
<https://www.aclweb.org/anthology/2020.acl-main.413/>
32+
2733
```
34+
@inproceedings{fabbri-etal-2020-template,
35+
title = "Template-Based Question Generation from Retrieved Sentences for Improved Unsupervised Question Answering",
36+
author = "Fabbri, Alexander and
37+
Ng, Patrick and
38+
Wang, Zhiguo and
39+
Nallapati, Ramesh and
40+
Xiang, Bing",
41+
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
42+
month = jul,
43+
year = "2020",
44+
address = "Online",
45+
publisher = "Association for Computational Linguistics",
46+
url = "https://www.aclweb.org/anthology/2020.acl-main.413",
47+
doi = "10.18653/v1/2020.acl-main.413",
48+
pages = "4508--4513",
49+
abstract = "Question Answering (QA) is in increasing demand as the amount of information available online and the desire for quick access to this content grows. A common approach to QA has been to fine-tune a pretrained language model on a task-specific labeled dataset. This paradigm, however, relies on scarce, and costly to obtain, large-scale human-labeled data. We propose an unsupervised approach to training QA models with generated pseudo-training data. We show that generating questions for QA training by applying a simple template on a related, retrieved sentence rather than the original context sentence improves downstream QA performance by allowing the model to learn more complex context-question relationships. Training a QA model on this data gives a relative improvement over a previous unsupervised model in F1 score on the SQuAD dataset by about 14{\%}, and 20{\%} when the answer is a named entity, achieving state-of-the-art performance on SQuAD for unsupervised QA.",
50+
}
2851
```
2952

3053

distant_supervision/constants.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Hyperparameters for distant-supervision data generation.
# Suffix conventions: ULIM = upper limit (cap), LLIM = lower limit.

NUM_OF_SENTENCES_ULIM = 5

NUM_ENTITIES_PER_ARTICLE_TO_CONSIDER = 30  # only in v2
NUM_ENTITIES_PER_ARTICLE_TO_KEEP = 5

NUM_WORDS_IN_QUERY_SENTENCE_ULIM = 100

NUM_ARTICLES_PER_ENTITY_LLIM = 2
NUM_ARTICLES_PER_ENTITY_ULIM = 30  # limit number of backfill articles

# NOTE: word counts below use a naive split() rather than the BPE vocab count in BERT.
# Earlier settings (from zgw-exps), kept for reference:
#   QUESTION_NUM_WORDS_ULIM = 100  # still keep fairly long sentences
#   ANSWER_NUM_CHARS_ULIM = 100    # not using the parameters from zgw-exps max_answer_size
#   (QUESTION_NUM_WORDS_ULIM had 50 in zgw-exps)

QUESTION_NUM_WORDS_ULIM = 64  # same as RC setting in pytorch-bert (--max_query_length)
ANSWER_NUM_CHARS_ULIM = 30  # same as RC setting in pytorch-bert (--max_answer_length)

# https://github.com/google-research/bert/issues/66 (384 words)
# pytorch-bert uses 384 tokens, with stride
CONTEXT_NUM_WORDS_ULIM = 400

distant_supervision/data_models.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
import json
2+
import copy
3+
from enum import Enum
4+
from .text_preprocessor import TextPreprocessor
5+
6+
7+
class PhraseMode(Enum):
    """Which candidate answer phrases to extract from a sentence.

    NER_ONLY restricts candidates to named entities; ALL additionally
    includes noun chunks.
    """

    NER_ONLY = 'ner_only'
    ALL = 'all'
10+
11+
12+
class QuestionStyle(Enum):
    """Supported question formats.

    If cloze-style is "[FragmentA] [PERSON] [FragmentB]", then:

    - "Who [FragmentB] [FragmentA]?" - WBA
    - "[FragmentA], who [FragmentB]?" - AWB
    """

    CLOZE_CATEGORY = 'cloze_category_style'
    CLOZE_GENERIC = 'cloze_generic_style'
    TEMPLATE_WBA = 'template_wba_style'
    TEMPLATE_AWB = 'template_awb_style'
23+
24+
25+
class RcQuestion:
    """
    RC Question data, e.g. squad_rc_train.jsonl

    {
        "qid": "57277c965951b619008f8b2b",
        "question": "What do people engage in after they've disguised themselves?",
        "context": "In Greece Carnival is also ...",
        "answers": [
            {
                "ner_category": "SOME_CATEGORY", # optional, depending on whether it's squad-ner or original
                "answer_start": 677,
                "text": "pranks and revelry"
            }
        ],
        "article_title": "Carnival"
    }
    """

    def __init__(self, *, qid=None, question=None, context=None, article_title=None, answers=None):
        self.qid = qid
        self.question = question
        self.context = context
        self.answers = answers  # list of {'text': "foobar", "answer_start": 48}

        self.article_title = article_title

    def jsonify(self):
        """Serialize all instance attributes to a JSON string."""
        return json.dumps(self.__dict__)

    @classmethod
    def deserialize_json(cls, json_str):
        """Rebuild a question from a jsonify() string.

        Uses cls() rather than a hard-coded class name so subclasses
        deserialize to their own type; any extra keys in the JSON become
        plain attributes.
        """
        question = cls()
        for key, value in json.loads(json_str).items():
            setattr(question, key, value)
        return question
62+
63+
64+
class Article:
    """A wiki article plus (optionally) its preprocessed sentences."""

    def __init__(self):
        # Attributes are filled in by import_from() / deserialize_json();
        # default them so a bare Article is safe to inspect or serialize.
        self.text = None
        self.id = None
        self.title = None
        self.sents = None  # becomes a list of Sentence once preprocessed

    def import_from(self, raw_row):
        """Populate from a raw dump row with 'text', 'id', 'title' keys."""
        self.text = raw_row['text']
        self.id = int(raw_row['id'])  # ids arrive as strings in the dump
        self.title = raw_row['title']

        self.sents = None

    @classmethod
    def deserialize_json(cls, json_str):
        """Rebuild an Article (incl. Sentence objects) from jsonify() output.

        Uses cls() rather than a hard-coded class name so subclasses
        deserialize to their own type.
        """
        article = cls()
        for k, v in json.loads(json_str).items():
            setattr(article, k, v)

        # Sentences were serialized as plain dicts; re-wrap them as Sentence.
        article.sents = [
            Sentence(sent['id'], sent['text'], sent['ents'], sent['noun_chunks'])
            for sent in article.sents
        ]

        return article

    def __repr__(self):
        return str(self.__dict__)

    def jsonify(self):
        """Serialize all instance attributes to a JSON string."""
        return json.dumps(self.__dict__)
94+
95+
class Sentence:
    """One preprocessed sentence: text plus extracted entity/chunk spans.

    noun chunks: https://nlp.stanford.edu/software/dependencies_manual.pdf
    """

    def __init__(self, id, text, ents, noun_chunks):
        # note that this is sentence ID, and not article_id
        self.id = id
        self.text = text
        # Normalize spans to 2-tuples (they may arrive as lists from JSON).
        self.ents = [(span[0], span[1]) for span in ents]
        self.noun_chunks = [(span[0], span[1]) for span in noun_chunks]

    def get_phrases(self, phrase_mode):
        """Extract candidate phrases; NER_ONLY mode ignores noun chunks."""
        chunks = [] if phrase_mode is PhraseMode.NER_ONLY else self.noun_chunks
        return TextPreprocessor.get_phrases(entities=self.ents, noun_chunks=chunks)

    def __repr__(self):
        return str(self.__dict__)
114+
115+
116+
class PhraseObj:
    """An answer phrase paired with its NER category."""

    def __init__(self, phrase_str, phrase_category):
        self.phrase_str = phrase_str
        self.phrase_category = phrase_category

    @classmethod
    def import_from(cls, row):
        """
        Example format: [["0 to 6 years", "DATE"], [5809465, 53318614, 49544471, 27237145, 54568155]]

        Only the leading (phrase, category) pair is used; the trailing
        list of article ids is ignored here.
        """
        phrase_str, phrase_category = row[0]
        return cls(phrase_str, phrase_category)

    def __repr__(self):
        return str(self.__dict__)
133+
134+
135+
class DsDatum:
    """One distantly-supervised training example.

    styled_questions maps a QuestionStyle value (str) to the generated
    question text for that style.
    """

    def __init__(self, qid, styled_questions, context, answers):
        self.qid = qid
        self.styled_questions = styled_questions
        self.context = context
        self.answers = answers
        self.meta = None

    def __repr__(self):
        return str(self.__dict__)

    def jsonify(self):
        """Serialize every attribute, keeping all question styles."""
        return json.dumps(self.__dict__)

    def jsonify_single_style(self, question_style):
        """Serialize with a single 'question' field for the given style.

        Works on a deep copy so flattening never mutates self.
        """
        flat = copy.deepcopy(self.__dict__)
        flat['question'] = self.styled_questions[question_style.value]
        flat.pop('styled_questions')

        return json.dumps(flat)
155+
156+
157+
class QueriesPerArticleObj:
    """Retrieval results for one article, tied to one answer phrase."""

    def __init__(self, *, article_id, article_title, article_raw, article_phrases, filtered_sents, phrase):
        # Coerce to int: ids may arrive as strings from upstream dumps.
        self.article_id = int(article_id)
        self.article_title = article_title
        self.article_raw = article_raw
        self.article_phrases = article_phrases
        self.filtered_sents = filtered_sents
        self.phrase = phrase  # answer phrase

    def __repr__(self):
        return str(self.__dict__)

0 commit comments

Comments
 (0)