From 6b4f2c71093dfc5de02e0381ab78b3d6e5159e99 Mon Sep 17 00:00:00 2001
From: Daniel Silva
Date: Mon, 23 Jan 2023 18:29:18 -0500
Subject: [PATCH] feat: get details about a paper's references

---
 semanticscholar/BaseReference.py          |   59 ++
 semanticscholar/PaginatedResults.py       |   13 +-
 semanticscholar/Reference.py              |   13 +
 semanticscholar/SemanticScholar.py        |   47 +
 tests/data/Reference.json                 |    1 +
 tests/data/test_get_paper_references.yaml | 1108 +++++++++++++++++++++
 tests/test_semanticscholar.py             |   27 +
 7 files changed, 1261 insertions(+), 7 deletions(-)
 create mode 100644 semanticscholar/BaseReference.py
 create mode 100644 semanticscholar/Reference.py
 create mode 100644 tests/data/Reference.json
 create mode 100644 tests/data/test_get_paper_references.yaml

diff --git a/semanticscholar/BaseReference.py b/semanticscholar/BaseReference.py
new file mode 100644
index 0000000..9d5ca2b
--- /dev/null
+++ b/semanticscholar/BaseReference.py
@@ -0,0 +1,59 @@
+from semanticscholar.Paper import Paper
+from semanticscholar.SemanticScholarObject import SemanticScholarObject
+
+
+class BaseReference(SemanticScholarObject):
+    '''
+    Base class for both Citation and Reference classes.
+    '''
+
+    FIELDS = [
+        'contexts',
+        'intents',
+        'isInfluential'
+    ]
+
+    def __init__(self, data: dict) -> None:
+        super().__init__()
+        self._contexts = None
+        self._intents = None
+        self._isInfluential = None
+        self._paper = None
+        self._init_attributes(data)
+
+    @property
+    def contexts(self) -> list:
+        '''
+        :type: :class:`list`
+        '''
+        return self._contexts
+
+    @property
+    def intents(self) -> list:
+        '''
+        :type: :class:`list`
+        '''
+        return self._intents
+
+    @property
+    def isInfluential(self) -> bool:
+        '''
+        :type: :class:`bool`
+        '''
+        return self._isInfluential
+
+    @property
+    def paper(self) -> Paper:
+        '''
+        :type: :class:`semanticscholar.Paper.Paper`
+        '''
+        return self._paper
+
+    def _init_attributes(self, data: dict) -> None:
+        self._data = data
+        if 'contexts' in data:
+            self._contexts = data['contexts']
+        if 'intents' in data:
+            self._intents = data['intents']
+        if 'isInfluential' in data:
+            self._isInfluential = data['isInfluential']
diff --git a/semanticscholar/PaginatedResults.py b/semanticscholar/PaginatedResults.py
index 2949b0f..979db4d 100644
--- a/semanticscholar/PaginatedResults.py
+++ b/semanticscholar/PaginatedResults.py
@@ -14,9 +14,9 @@ def __init__(
         requester: ApiRequester,
         data_type: Any,
         url: str,
-        query: str,
-        fields: str,
-        limit: int,
+        query: str = None,
+        fields: str = None,
+        limit: int = None,
         headers: dict = None
     ) -> None:

@@ -84,10 +84,9 @@ def __getitem__(self, key: int) -> Any:
         return self._items[key]

     def __has_next_page(self) -> bool:
-        has_any_result = self._total > 0
         has_more_results = (self._offset + self._limit) == self._next
         under_limit = (self._offset + self._limit) < 9999
-        return has_any_result and has_more_results and under_limit
+        return has_more_results and under_limit

     def __get_next_page(self) -> list:
@@ -100,7 +99,7 @@ def __get_next_page(self) -> list:
         )

         self._data = results['data']
-        self._total = results['total']
+        self._total = results['total'] if 'total' in results else 0
         self._offset = results['offset']
         self._next = results['next'] if 'next' in results else 0

@@ -114,7 +113,7 @@ def __build_params(self) -> None:

-        self._parameters = f'query={self._query}'
+        self._parameters = f'query={self._query}' if self._query else ''

         fields = ','.join(self._fields)
         self._parameters += f'&fields={fields}'
diff --git a/semanticscholar/Reference.py
b/semanticscholar/Reference.py new file mode 100644 index 0000000..14843ff --- /dev/null +++ b/semanticscholar/Reference.py @@ -0,0 +1,13 @@ +from semanticscholar.Paper import Paper +from semanticscholar.BaseReference import BaseReference + + +class Reference(BaseReference): + ''' + This class abstracts a reference. + ''' + + def __init__(self, data: dict) -> None: + super().__init__(data) + if 'citedPaper' in data: + self._paper = Paper(data['citedPaper']) diff --git a/semanticscholar/SemanticScholar.py b/semanticscholar/SemanticScholar.py index 9c676d0..a0fab55 100644 --- a/semanticscholar/SemanticScholar.py +++ b/semanticscholar/SemanticScholar.py @@ -3,8 +3,10 @@ from semanticscholar.ApiRequester import ApiRequester from semanticscholar.Author import Author +from semanticscholar.BaseReference import BaseReference from semanticscholar.PaginatedResults import PaginatedResults from semanticscholar.Paper import Paper +from semanticscholar.Reference import Reference class SemanticScholar: @@ -153,6 +155,51 @@ def get_papers( return papers + def get_paper_references( + self, + paper_id: str, + fields: list = None, + limit: int = 1000 + ) -> PaginatedResults: + '''Get details about a paper's references + + :calls: `POST /paper/{paper_id}/references \ + `_ + + :param str paper_id: S2PaperId, CorpusId, DOI, ArXivId, MAG, ACL,\ + PMID, PMCID, or URL from: + + - semanticscholar.org + - arxiv.org + - aclweb.org + - acm.org + - biorxiv.org + + :param list fields: (optional) list of the fields to be returned. + :param int limit: (optional) maximum number of results to return\ + (must be <= 1000). + ''' + + if limit < 1 or limit > 1000: + raise ValueError( + 'The limit parameter must be between 1 and 1000 inclusive.') + + if not fields: + fields = BaseReference.FIELDS + Paper.SEARCH_FIELDS + + url = f'{self.api_url}/paper/{paper_id}/references' + + results = PaginatedResults( + requester=self._requester, + data_type=Reference, + url=url, + fields=fields, + limit=limit + ) + + return results + def search_paper( self, query: str, diff --git a/tests/data/Reference.json b/tests/data/Reference.json new file mode 100644 index 0000000..92a97ed --- /dev/null +++ b/tests/data/Reference.json @@ -0,0 +1 @@ +{"contexts": ["Other VAElike approaches exist [12, 22] but are less closely related to our method."], "intents": ["result"], "isInfluential": false, "citedPaper": {"paperId": "018300f5f0e679cee5241d9c69c8d88e00e8bf31", "externalIds": {"MAG": "2122262818", "ArXiv": "1402.0030", "DBLP": "conf/icml/MnihG14", "CorpusId": 1981188}, "corpusId": 1981188, "publicationVenue": {"id": "fc0a208c-acb7-47dc-a0d4-af8190e21d29", "name": "International Conference on Machine Learning", "type": "conference", "alternate_names": ["ICML", "Int Conf Mach Learn"], "url": "https://icml.cc/"}, "url": "https://www.semanticscholar.org/paper/018300f5f0e679cee5241d9c69c8d88e00e8bf31", "title": "Neural Variational Inference and Learning in Belief Networks", "abstract": "Highly expressive directed latent variable models, such as sigmoid belief networks, are difficult to train on large datasets because exact inference in them is intractable and none of the approximate inference methods that have been applied to them scale well. We propose a fast non-iterative approximate inference method that uses a feedforward network to implement efficient exact sampling from the variational posterior. The model and this inference network are trained jointly by maximizing a variational lower bound on the log-likelihood. 
Although the naive estimator of the inference network gradient is too high-variance to be useful, we make it practical by applying several straightforward model-independent variance reduction techniques. Applying our approach to training sigmoid belief networks and deep autoregressive networks, we show that it outperforms the wake-sleep algorithm on MNIST and achieves state-of-the-art results on the Reuters RCV1 document dataset.", "venue": "International Conference on Machine Learning", "year": 2014, "referenceCount": 34, "citationCount": 662, "influentialCitationCount": 89, "isOpenAccess": false, "openAccessPdf": null, "fieldsOfStudy": ["Computer Science", "Mathematics"], "s2FieldsOfStudy": [{"category": "Computer Science", "source": "external"}, {"category": "Mathematics", "source": "external"}, {"category": "Computer Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle", "Conference"], "publicationDate": "2014-01-31", "journal": {"volume": "abs/1402.0030", "name": "ArXiv"}, "authors": [{"authorId": "1714004", "name": "A. Mnih"}, {"authorId": "144717963", "name": "Karol Gregor"}]}} diff --git a/tests/data/test_get_paper_references.yaml b/tests/data/test_get_paper_references.yaml new file mode 100644 index 0000000..bd4a023 --- /dev/null +++ b/tests/data/test_get_paper_references.yaml @@ -0,0 +1,1108 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.28.1 + method: GET + uri: https://api.semanticscholar.org/graph/v1/paper/CorpusID:1033682/references?&fields=contexts,intents,isInfluential,abstract,authors,citationCount,corpusId,externalIds,fieldsOfStudy,influentialCitationCount,isOpenAccess,journal,openAccessPdf,paperId,publicationDate,publicationTypes,publicationVenue,referenceCount,s2FieldsOfStudy,title,url,venue,year&offset=0&limit=1000 + response: + body: + string: '{"offset": 0, "data": [{"contexts": ["Other VAElike approaches exist + [12, 22] but are less closely related to our method."], "isInfluential": false, + "intents": ["result"], "citedPaper": {"paperId": "018300f5f0e679cee5241d9c69c8d88e00e8bf31", + "externalIds": {"MAG": "2122262818", "ArXiv": "1402.0030", "DBLP": "conf/icml/MnihG14", + "CorpusId": 1981188}, "corpusId": 1981188, "publicationVenue": {"id": "fc0a208c-acb7-47dc-a0d4-af8190e21d29", + "name": "International Conference on Machine Learning", "type": "conference", + "alternate_names": ["ICML", "Int Conf Mach Learn"], "url": "https://icml.cc/"}, + "url": "https://www.semanticscholar.org/paper/018300f5f0e679cee5241d9c69c8d88e00e8bf31", + "title": "Neural Variational Inference and Learning in Belief Networks", "abstract": + "Highly expressive directed latent variable models, such as sigmoid belief + networks, are difficult to train on large datasets because exact inference + in them is intractable and none of the approximate inference methods that + have been applied to them scale well. We propose a fast non-iterative approximate + inference method that uses a feedforward network to implement efficient exact + sampling from the variational posterior. The model and this inference network + are trained jointly by maximizing a variational lower bound on the log-likelihood. + Although the naive estimator of the inference network gradient is too high-variance + to be useful, we make it practical by applying several straightforward model-independent + variance reduction techniques. 
Applying our approach to training sigmoid belief + networks and deep autoregressive networks, we show that it outperforms the + wake-sleep algorithm on MNIST and achieves state-of-the-art results on the + Reuters RCV1 document dataset.", "venue": "International Conference on Machine + Learning", "year": 2014, "referenceCount": 34, "citationCount": 662, "influentialCitationCount": + 89, "isOpenAccess": false, "openAccessPdf": null, "fieldsOfStudy": ["Computer + Science", "Mathematics"], "s2FieldsOfStudy": [{"category": "Computer Science", + "source": "external"}, {"category": "Mathematics", "source": "external"}, + {"category": "Computer Science", "source": "s2-fos-model"}], "publicationTypes": + ["JournalArticle", "Conference"], "publicationDate": "2014-01-31", "journal": + {"volume": "abs/1402.0030", "name": "ArXiv"}, "authors": [{"authorId": "1714004", + "name": "A. Mnih"}, {"authorId": "144717963", "name": "Karol Gregor"}]}}, + {"contexts": ["[23] had developed more general stochastic backpropagation + rules, allowing one to backpropagate through Gaussian distributions with finite + variance, and to backpropagate to the covariance parameter as well as the + mean.", "[23] use stochastic backpropagation to train variational autoencoders + (VAEs)."], "isInfluential": false, "intents": ["background"], "citedPaper": + {"paperId": "484ad17c926292fbe0d5211540832a8c8a8e958b", "externalIds": {"MAG": + "2951275616", "DBLP": "conf/icml/RezendeMW14", "CorpusId": 16895865}, "corpusId": + 16895865, "publicationVenue": {"id": "fc0a208c-acb7-47dc-a0d4-af8190e21d29", + "name": "International Conference on Machine Learning", "type": "conference", + "alternate_names": ["ICML", "Int Conf Mach Learn"], "url": "https://icml.cc/"}, + "url": "https://www.semanticscholar.org/paper/484ad17c926292fbe0d5211540832a8c8a8e958b", + "title": "Stochastic Backpropagation and Approximate Inference in Deep Generative + Models", "abstract": "We marry ideas from deep neural networks and approximate + Bayesian inference to derive a generalised class of deep, directed generative + models, endowed with a new algorithm for scalable inference and learning. + Our algorithm introduces a recognition model to represent approximate posterior + distributions, and that acts as a stochastic encoder of the data. We develop + stochastic back-propagation -- rules for back-propagation through stochastic + variables -- and use this to develop an algorithm that allows for joint optimisation + of the parameters of both the generative and recognition model. We demonstrate + on several real-world data sets that the model generates realistic samples, + provides accurate imputations of missing data and is a useful tool for high-dimensional + data visualisation.", "venue": "International Conference on Machine Learning", + "year": 2014, "referenceCount": 39, "citationCount": 4253, "influentialCitationCount": + 723, "isOpenAccess": false, "openAccessPdf": null, "fieldsOfStudy": ["Computer + Science", "Mathematics"], "s2FieldsOfStudy": [{"category": "Computer Science", + "source": "external"}, {"category": "Mathematics", "source": "external"}, + {"category": "Computer Science", "source": "s2-fos-model"}], "publicationTypes": + ["JournalArticle", "Conference"], "publicationDate": "2014-01-16", "journal": + {"pages": "1278-1286"}, "authors": [{"authorId": "1748523", "name": "Danilo + Jimenez Rezende"}, {"authorId": "14594344", "name": "S. 
Mohamed"}, {"authorId": + "1688276", "name": "Daan Wierstra"}]}}, {"contexts": ["Kingma and Welling + [18] and Rezende et al. [23] use stochastic backpropagation to train variational + autoencoders (VAEs).", "We were unaware at the time we developed this work + that Kingma and Welling [18] and Rezende et al. [23] had developed more general + stochastic backpropagation rules, allowing one to backpropagate through Gaussian + distributions with finite variance, and to backpropagate to the covariance + parameter as well as the mean.", "We were unaware at the time we developed + this work that Kingma and Welling [18] and Rezende et al.", "Kingma and Welling + [18] and Rezende et al."], "isInfluential": true, "intents": ["methodology"], + "citedPaper": {"paperId": "5f5dc5b9a2ba710937e2c413b37b053cd673df02", "externalIds": + {"ArXiv": "1312.6114", "MAG": "2951004968", "DBLP": "journals/corr/KingmaW13", + "CorpusId": 216078090}, "corpusId": 216078090, "publicationVenue": {"id": + "939c6e1d-0d17-4d6e-8a82-66d960df0e40", "name": "International Conference + on Learning Representations", "type": "conference", "alternate_names": ["Int + Conf Learn Represent", "ICLR"], "url": "https://iclr.cc/"}, "url": "https://www.semanticscholar.org/paper/5f5dc5b9a2ba710937e2c413b37b053cd673df02", + "title": "Auto-Encoding Variational Bayes", "abstract": "Abstract: How can + we perform efficient inference and learning in directed probabilistic models, + in the presence of continuous latent variables with intractable posterior + distributions, and large datasets? We introduce a stochastic variational inference + and learning algorithm that scales to large datasets and, under some mild + differentiability conditions, even works in the intractable case. Our contributions + is two-fold. First, we show that a reparameterization of the variational lower + bound yields a lower bound estimator that can be straightforwardly optimized + using standard stochastic gradient methods. Second, we show that for i.i.d. + datasets with continuous latent variables per datapoint, posterior inference + can be made especially efficient by fitting an approximate inference model + (also called a recognition model) to the intractable posterior using the proposed + lower bound estimator. Theoretical advantages are reflected in experimental + results.", "venue": "International Conference on Learning Representations", + "year": 2013, "referenceCount": 24, "citationCount": 20019, "influentialCitationCount": + 4449, "isOpenAccess": false, "openAccessPdf": null, "fieldsOfStudy": ["Mathematics", + "Computer Science"], "s2FieldsOfStudy": [{"category": "Mathematics", "source": + "external"}, {"category": "Computer Science", "source": "external"}, {"category": + "Computer Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle"], + "publicationDate": "2013-12-20", "journal": {"volume": "abs/1312.6114", "name": + "CoRR"}, "authors": [{"authorId": "1726807", "name": "Diederik P. Kingma"}, + {"authorId": "1678311", "name": "M. 
Welling"}]}}, {"contexts": [], "isInfluential": + false, "intents": [], "citedPaper": {"paperId": "d891dc72cbd40ffaeefdc79f2e7afe1e530a23ad", + "externalIds": {"MAG": "2952712156", "DBLP": "journals/corr/SzegedyZSBEGF13", + "ArXiv": "1312.6199", "CorpusId": 604334}, "corpusId": 604334, "publicationVenue": + {"id": "939c6e1d-0d17-4d6e-8a82-66d960df0e40", "name": "International Conference + on Learning Representations", "type": "conference", "alternate_names": ["Int + Conf Learn Represent", "ICLR"], "url": "https://iclr.cc/"}, "url": "https://www.semanticscholar.org/paper/d891dc72cbd40ffaeefdc79f2e7afe1e530a23ad", + "title": "Intriguing properties of neural networks", "abstract": "Deep neural + networks are highly expressive models that have recently achieved state of + the art performance on speech and visual recognition tasks. While their expressiveness + is the reason they succeed, it also causes them to learn uninterpretable solutions + that could have counter-intuitive properties. In this paper we report two + such properties. \nFirst, we find that there is no distinction between individual + high level units and random linear combinations of high level units, according + to various methods of unit analysis. It suggests that it is the space, rather + than the individual units, that contains of the semantic information in the + high layers of neural networks. \nSecond, we find that deep neural networks + learn input-output mappings that are fairly discontinuous to a significant + extend. We can cause the network to misclassify an image by applying a certain + imperceptible perturbation, which is found by maximizing the network''s prediction + error. In addition, the specific nature of these perturbations is not a random + artifact of learning: the same perturbation can cause a different network, + that was trained on a different subset of the dataset, to misclassify the + same input.", "venue": "International Conference on Learning Representations", + "year": 2013, "referenceCount": 15, "citationCount": 10329, "influentialCitationCount": + 1130, "isOpenAccess": false, "openAccessPdf": null, "fieldsOfStudy": ["Computer + Science"], "s2FieldsOfStudy": [{"category": "Computer Science", "source": + "external"}, {"category": "Computer Science", "source": "s2-fos-model"}], + "publicationTypes": ["JournalArticle"], "publicationDate": "2013-12-20", "journal": + {"volume": "abs/1312.6199", "name": "CoRR"}, "authors": [{"authorId": "2574060", + "name": "Christian Szegedy"}, {"authorId": "2563432", "name": "Wojciech Zaremba"}, + {"authorId": "1701686", "name": "Ilya Sutskever"}, {"authorId": "143627859", + "name": "Joan Bruna"}, {"authorId": "1761978", "name": "D. Erhan"}, {"authorId": + "153440022", "name": "Ian J. Goodfellow"}, {"authorId": "2276554", "name": + "R. 
Fergus"}]}}, {"contexts": ["This objective function results in the same + fixed point of the dynamics ofG andD but provides much stronger gradients + early in learning."], "isInfluential": false, "intents": ["background"], "citedPaper": + {"paperId": "695a2c95eacdbccb7a73d2f1e90e7b35b4b3d864", "externalIds": {"DBLP": + "conf/icml/GregorDMBW14", "ArXiv": "1310.8499", "MAG": "2097268041", "CorpusId": + 14576846}, "corpusId": 14576846, "publicationVenue": {"id": "fc0a208c-acb7-47dc-a0d4-af8190e21d29", + "name": "International Conference on Machine Learning", "type": "conference", + "alternate_names": ["ICML", "Int Conf Mach Learn"], "url": "https://icml.cc/"}, + "url": "https://www.semanticscholar.org/paper/695a2c95eacdbccb7a73d2f1e90e7b35b4b3d864", + "title": "Deep AutoRegressive Networks", "abstract": "We introduce a deep, + generative autoencoder capable of learning hierarchies of distributed representations + from data. Successive deep stochastic hidden layers are equipped with autoregressive + connections, which enable the model to be sampled from quickly and exactly + via ancestral sampling. We derive an efficient approximate parameter estimation + method based on the minimum description length (MDL) principle, which can + be seen as maximising a variational lower bound on the log-likelihood, with + a feedforward neural network implementing approximate inference. We demonstrate + state-of-the-art generative performance on a number of classic data sets, + including several UCI data sets, MNIST and Atari 2600 games.", "venue": "International + Conference on Machine Learning", "year": 2013, "referenceCount": 28, "citationCount": + 223, "influentialCitationCount": 20, "isOpenAccess": false, "openAccessPdf": + null, "fieldsOfStudy": ["Computer Science", "Mathematics"], "s2FieldsOfStudy": + [{"category": "Computer Science", "source": "external"}, {"category": "Mathematics", + "source": "external"}, {"category": "Computer Science", "source": "s2-fos-model"}], + "publicationTypes": ["JournalArticle", "Conference"], "publicationDate": "2013-10-31", + "journal": {"volume": "abs/1310.8499", "name": "ArXiv"}, "authors": [{"authorId": + "144717963", "name": "Karol Gregor"}, {"authorId": "1841008", "name": "Ivo + Danihelka"}, {"authorId": "1714004", "name": "A. Mnih"}, {"authorId": "1723876", + "name": "C. Blundell"}, {"authorId": "1688276", "name": "Daan Wierstra"}]}}, + {"contexts": ["Finally, we would like to thank Les Trois Brasseurs for stimulating + our creativity.", "We simultaneously trainG to minimize log(1 \u2212 D(G(z)))."], + "isInfluential": false, "intents": ["methodology"], "citedPaper": {"paperId": + "5ffa8bf1bf3e39227be28de4ff6915d3b21eb52d", "externalIds": {"MAG": "2951446714", + "ArXiv": "1306.1091", "DBLP": "journals/corr/BengioT13", "CorpusId": 9494295}, + "corpusId": 9494295, "publicationVenue": {"id": "fc0a208c-acb7-47dc-a0d4-af8190e21d29", + "name": "International Conference on Machine Learning", "type": "conference", + "alternate_names": ["ICML", "Int Conf Mach Learn"], "url": "https://icml.cc/"}, + "url": "https://www.semanticscholar.org/paper/5ffa8bf1bf3e39227be28de4ff6915d3b21eb52d", + "title": "Deep Generative Stochastic Networks Trainable by Backprop", "abstract": + "We introduce a novel training principle for probabilistic models that is + an alternative to maximum likelihood. The proposed Generative Stochastic Networks + (GSN) framework is based on learning the transition operator of a Markov chain + whose stationary distribution estimates the data distribution. 
The transition + distribution of the Markov chain is conditional on the previous state, generally + involving a small move, so this conditional distribution has fewer dominant + modes, being unimodal in the limit of small moves. Thus, it is easier to learn + because it is easier to approximate its partition function, more like learning + to perform supervised function approximation, with gradients that can be obtained + by backprop. We provide theorems that generalize recent work on the probabilistic + interpretation of denoising autoencoders and obtain along the way an interesting + justification for dependency networks and generalized pseudolikelihood, along + with a definition of an appropriate joint distribution and sampling mechanism + even when the conditionals are not consistent. GSNs can be used with missing + inputs and can be used to sample subsets of variables given the rest. We validate + these theoretical results with experiments on two image datasets using an + architecture that mimics the Deep Boltzmann Machine Gibbs sampler but allows + training to proceed with simple backprop, without the need for layerwise pretraining.", + "venue": "International Conference on Machine Learning", "year": 2013, "referenceCount": + 45, "citationCount": 374, "influentialCitationCount": 28, "isOpenAccess": + false, "openAccessPdf": null, "fieldsOfStudy": ["Computer Science", "Mathematics"], + "s2FieldsOfStudy": [{"category": "Computer Science", "source": "external"}, + {"category": "Mathematics", "source": "external"}, {"category": "Computer + Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle", + "Conference"], "publicationDate": "2013-06-05", "journal": {"pages": "226-234"}, + "authors": [{"authorId": "1751762", "name": "Yoshua Bengio"}, {"authorId": + "1398746441", "name": "Eric Thibodeau-Laufer"}, {"authorId": "1815021", "name": + "Guillaume Alain"}, {"authorId": "2965424", "name": "J. Yosinski"}]}}, {"contexts": + ["Essentially, one can use adversarial nets to implement a stochastic extension + of the deterministic MP-DBM [10]."], "isInfluential": false, "intents": ["background"], + "citedPaper": {"paperId": "5656fa5aa6e1beeb98703fc53ec112ad227c49ca", "externalIds": + {"DBLP": "conf/nips/GoodfellowMCB13", "MAG": "2098617596", "CorpusId": 6442575}, + "corpusId": 6442575, "publicationVenue": null, "url": "https://www.semanticscholar.org/paper/5656fa5aa6e1beeb98703fc53ec112ad227c49ca", + "title": "Multi-Prediction Deep Boltzmann Machines", "abstract": "We introduce + the multi-prediction deep Boltzmann machine (MP-DBM). The MP-DBM can be seen + as a single probabilistic model trained to maximize a variational approximation + to the generalized pseudolikelihood, or as a family of recurrent nets that + share parameters and approximately solve different inference problems. Prior + methods of training DBMs either do not perform well on classification tasks + or require an initial learning pass that trains the DBM greedily, one layer + at a time. 
The MP-DBM does not require greedy layerwise pretraining, and outperforms + the standard DBM at classification, classification with missing inputs, and + mean field prediction tasks.1", "venue": "NIPS", "year": 2013, "referenceCount": + 27, "citationCount": 133, "influentialCitationCount": 17, "isOpenAccess": + false, "openAccessPdf": null, "fieldsOfStudy": ["Computer Science"], "s2FieldsOfStudy": + [{"category": "Computer Science", "source": "external"}, {"category": "Computer + Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle", + "Conference"], "publicationDate": "2013-12-05", "journal": {"pages": "548-556"}, + "authors": [{"authorId": "153440022", "name": "Ian J. Goodfellow"}, {"authorId": + "153583218", "name": "Mehdi Mirza"}, {"authorId": "1760871", "name": "Aaron + C. Courville"}, {"authorId": "1751762", "name": "Yoshua Bengio"}]}}, {"contexts": + [], "isInfluential": false, "intents": [], "citedPaper": {"paperId": "836acf6fc99ebf81d219e2b67f7ab25efc29a6a4", + "externalIds": {"MAG": "1872489089", "ArXiv": "1308.4214", "DBLP": "journals/corr/GoodfellowWLDMPBBB13", + "CorpusId": 2172854}, "corpusId": 2172854, "publicationVenue": null, "url": + "https://www.semanticscholar.org/paper/836acf6fc99ebf81d219e2b67f7ab25efc29a6a4", + "title": "Pylearn2: a machine learning research library", "abstract": "Pylearn2 + is a machine learning research library. This does not just mean that it is + a collection of machine learning algorithms that share a common API; it means + that it has been designed for flexibility and extensibility in order to facilitate + research projects that involve new or unusual use cases. In this paper we + give a brief history of the library, an overview of its basic philosophy, + a summary of the library''s architecture, and a description of how the Pylearn2 + community functions socially.", "venue": "ArXiv", "year": 2013, "referenceCount": + 60, "citationCount": 305, "influentialCitationCount": 18, "isOpenAccess": + false, "openAccessPdf": null, "fieldsOfStudy": ["Mathematics", "Computer Science"], + "s2FieldsOfStudy": [{"category": "Mathematics", "source": "external"}, {"category": + "Computer Science", "source": "external"}, {"category": "Computer Science", + "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle", "Review"], + "publicationDate": "2013-08-19", "journal": {"volume": "abs/1308.4214", "name": + "ArXiv"}, "authors": [{"authorId": "153440022", "name": "Ian J. Goodfellow"}, + {"authorId": "1393680089", "name": "David Warde-Farley"}, {"authorId": "3087941", + "name": "Pascal Lamblin"}, {"authorId": "3074927", "name": "Vincent Dumoulin"}, + {"authorId": "153583218", "name": "Mehdi Mirza"}, {"authorId": "1996134", + "name": "Razvan Pascanu"}, {"authorId": "32837403", "name": "J. 
Bergstra"}, + {"authorId": "3227028", "name": "Fr\u00e9d\u00e9ric Bastien"}, {"authorId": + "1751762", "name": "Yoshua Bengio"}]}}, {"contexts": ["Prominent recent work + in this area includes the generative stochastic network (GSN) framework [5], + which extends generalized denoising auto-encoders [4]: both can be seen as + defining a parameterized Markov chain, i."], "isInfluential": false, "intents": + ["background"], "citedPaper": {"paperId": "d9704f8119d6ba748230b4f2ad59f0e8c64fdfb0", + "externalIds": {"DBLP": "journals/corr/abs-1305-6663", "MAG": "2134842679", + "ArXiv": "1305.6663", "CorpusId": 5554756}, "corpusId": 5554756, "publicationVenue": + null, "url": "https://www.semanticscholar.org/paper/d9704f8119d6ba748230b4f2ad59f0e8c64fdfb0", + "title": "Generalized Denoising Auto-Encoders as Generative Models", "abstract": + "Recent work has shown how denoising and contractive autoencoders implicitly + capture the structure of the data-generating density, in the case where the + corruption noise is Gaussian, the reconstruction error is the squared error, + and the data is continuous-valued. This has led to various proposals for sampling + from this implicitly learned density function, using Langevin and Metropolis-Hastings + MCMC. However, it remained unclear how to connect the training procedure of + regularized auto-encoders to the implicit estimation of the underlying data-generating + distribution when the data are discrete, or using other forms of corruption + process and reconstruction errors. Another issue is the mathematical justification + which is only valid in the limit of small corruption noise. We propose here + a different attack on the problem, which deals with all these issues: arbitrary + (but noisy enough) corruption, arbitrary reconstruction loss (seen as a log-likelihood), + handling both discrete and continuous-valued variables, and removing the bias + due to non-infinitesimal corruption noise (or non-infinitesimal contractive + penalty).", "venue": "NIPS", "year": 2013, "referenceCount": 19, "citationCount": + 435, "influentialCitationCount": 33, "isOpenAccess": false, "openAccessPdf": + null, "fieldsOfStudy": ["Computer Science", "Mathematics"], "s2FieldsOfStudy": + [{"category": "Computer Science", "source": "external"}, {"category": "Mathematics", + "source": "external"}, {"category": "Computer Science", "source": "s2-fos-model"}], + "publicationTypes": ["JournalArticle", "Conference"], "publicationDate": "2013-05-28", + "journal": {"volume": "abs/1305.6663", "name": "ArXiv"}, "authors": [{"authorId": + "1751762", "name": "Yoshua Bengio"}, {"authorId": "145095579", "name": "L. + Yao"}, {"authorId": "1815021", "name": "Guillaume Alain"}, {"authorId": "145467703", + "name": "Pascal Vincent"}]}}, {"contexts": [], "isInfluential": false, "intents": + [], "citedPaper": {"paperId": "b7b915d508987b73b61eccd2b237e7ed099a2d29", + "externalIds": {"DBLP": "conf/icml/GoodfellowWMCB13", "MAG": "2294059674", + "ArXiv": "1302.4389", "CorpusId": 10600578}, "corpusId": 10600578, "publicationVenue": + {"id": "fc0a208c-acb7-47dc-a0d4-af8190e21d29", "name": "International Conference + on Machine Learning", "type": "conference", "alternate_names": ["ICML", "Int + Conf Mach Learn"], "url": "https://icml.cc/"}, "url": "https://www.semanticscholar.org/paper/b7b915d508987b73b61eccd2b237e7ed099a2d29", + "title": "Maxout Networks", "abstract": "We consider the problem of designing + models to leverage a recently introduced approximate model averaging technique + called dropout. 
We define a simple new model called maxout (so named because + its output is the max of a set of inputs, and because it is a natural companion + to dropout) designed to both facilitate optimization by dropout and improve + the accuracy of dropout''s fast approximate model averaging technique. We + empirically verify that the model successfully accomplishes both of these + tasks. We use maxout and dropout to demonstrate state of the art classification + performance on four benchmark datasets: MNIST, CIFAR-10, CIFAR-100, and SVHN.", + "venue": "International Conference on Machine Learning", "year": 2013, "referenceCount": + 25, "citationCount": 1946, "influentialCitationCount": 200, "isOpenAccess": + false, "openAccessPdf": null, "fieldsOfStudy": ["Computer Science", "Mathematics"], + "s2FieldsOfStudy": [{"category": "Computer Science", "source": "external"}, + {"category": "Mathematics", "source": "external"}, {"category": "Computer + Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle", + "Conference"], "publicationDate": "2013-02-18", "journal": {"pages": "1319-1327"}, + "authors": [{"authorId": "153440022", "name": "Ian J. Goodfellow"}, {"authorId": + "1393680089", "name": "David Warde-Farley"}, {"authorId": "153583218", "name": + "Mehdi Mirza"}, {"authorId": "1760871", "name": "Aaron C. Courville"}, {"authorId": + "1751762", "name": "Yoshua Bengio"}]}}, {"contexts": ["[7] and used for various + generative models for which the exact likelihood is not tractable [24, 3, + 4].", "Model MNIST TFD DBN [3] 138\u00b1 2 1909\u00b1 66 Stacked CAE [3] 121\u00b1 + 1."], "isInfluential": false, "intents": ["background"], "citedPaper": {"paperId": + "d0965d8f9842f2db960b36b528107ca362c00d1a", "externalIds": {"ArXiv": "1207.4404", + "MAG": "1496559305", "DBLP": "journals/corr/abs-1207-4404", "CorpusId": 1334653}, + "corpusId": 1334653, "publicationVenue": {"id": "fc0a208c-acb7-47dc-a0d4-af8190e21d29", + "name": "International Conference on Machine Learning", "type": "conference", + "alternate_names": ["ICML", "Int Conf Mach Learn"], "url": "https://icml.cc/"}, + "url": "https://www.semanticscholar.org/paper/d0965d8f9842f2db960b36b528107ca362c00d1a", + "title": "Better Mixing via Deep Representations", "abstract": "It has been + hypothesized, and supported with experimental evidence, that deeper representations, + when well trained, tend to do a better job at disentangling the underlying + factors of variation. We study the following related conjecture: better representations, + in the sense of better disentangling, can be exploited to produce Markov chains + that mix faster between modes. Consequently, mixing between modes would be + more efficient at higher levels of representation. To better understand this, + we propose a secondary conjecture: the higher-level samples fill more uniformly + the space they occupy and the high-density manifolds tend to unfold when represented + at higher levels. 
The paper discusses these hypotheses and tests them experimentally + through visualization and measurements of mixing between modes and interpolating + between samples.", "venue": "International Conference on Machine Learning", + "year": 2012, "referenceCount": 43, "citationCount": 288, "influentialCitationCount": + 19, "isOpenAccess": false, "openAccessPdf": null, "fieldsOfStudy": ["Computer + Science", "Mathematics"], "s2FieldsOfStudy": [{"category": "Computer Science", + "source": "external"}, {"category": "Mathematics", "source": "external"}, + {"category": "Computer Science", "source": "s2-fos-model"}], "publicationTypes": + ["JournalArticle", "Conference"], "publicationDate": "2012-07-18", "journal": + {"pages": "552-560"}, "authors": [{"authorId": "1751762", "name": "Yoshua + Bengio"}, {"authorId": "1935910", "name": "Gr\u00e9goire Mesnil"}, {"authorId": + "2921469", "name": "Y. Dauphin"}, {"authorId": "2425018", "name": "S. Rifai"}]}}, + {"contexts": ["So far, the most striking successes in deep learning have involved + discriminative models, usually those that map a high-dimensional, rich sensory + input to a class label [14, 20]."], "isInfluential": false, "intents": ["background"], + "citedPaper": {"paperId": "abd1c342495432171beb7ca8fd9551ef13cbd0ff", "externalIds": + {"MAG": "2997031122", "DBLP": "conf/nips/KrizhevskySH12", "DOI": "10.1145/3065386", + "CorpusId": 195908774}, "corpusId": 195908774, "publicationVenue": {"id": + "4d9ce1c4-dc84-46b9-903e-e3751c00c7dd", "name": "Communications of the ACM", + "type": "journal", "alternate_names": ["Commun ACM", "Communications of The + ACM"], "issn": "0001-0782", "url": "http://www.acm.org/pubs/cacm/", "alternate_urls": + ["http://portal.acm.org/cacm", "http://www.acm.org/pubs/contents/journals/cacm/", + "https://cacm.acm.org/"]}, "url": "https://www.semanticscholar.org/paper/abd1c342495432171beb7ca8fd9551ef13cbd0ff", + "title": "ImageNet classification with deep convolutional neural networks", + "abstract": "We trained a large, deep convolutional neural network to classify + the 1.2 million high-resolution images in the ImageNet LSVRC-2010 contest + into the 1000 different classes. On the test data, we achieved top-1 and top-5 + error rates of 37.5% and 17.0%, respectively, which is considerably better + than the previous state-of-the-art. The neural network, which has 60 million + parameters and 650,000 neurons, consists of five convolutional layers, some + of which are followed by max-pooling layers, and three fully connected layers + with a final 1000-way softmax. To make training faster, we used non-saturating + neurons and a very efficient GPU implementation of the convolution operation. + To reduce overfitting in the fully connected layers we employed a recently + developed regularization method called \"dropout\" that proved to be very + effective. 
We also entered a variant of this model in the ILSVRC-2012 competition + and achieved a winning top-5 test error rate of 15.3%, compared to 26.2% achieved + by the second-best entry.", "venue": "Communications of the ACM", "year": + 2012, "referenceCount": 44, "citationCount": 92681, "influentialCitationCount": + 12331, "isOpenAccess": true, "openAccessPdf": {"url": "http://dl.acm.org/ft_gateway.cfm?id=3065386&type=pdf", + "status": null}, "fieldsOfStudy": ["Computer Science"], "s2FieldsOfStudy": + [{"category": "Computer Science", "source": "external"}, {"category": "Computer + Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle"], + "publicationDate": "2012-12-03", "journal": {"volume": "60", "pages": "84 + - 90", "name": "Communications of the ACM"}, "authors": [{"authorId": "2064160", + "name": "A. Krizhevsky"}, {"authorId": "1701686", "name": "Ilya Sutskever"}, + {"authorId": "1695689", "name": "Geoffrey E. Hinton"}]}}, {"contexts": ["\u2026G + must not be trained too much without updatingD, in order to avoid \u201cthe + Helvetica scenario\u201d in whichG collapses too many values of z to the same + value of x to have enough diversity to model pdata), much as the negative + chains of a Boltzmann machine must be kept up to date between learning steps."], + "isInfluential": false, "intents": ["background"], "citedPaper": {"paperId": + "855d0f722d75cc56a66a00ede18ace96bafee6bd", "externalIds": {"ArXiv": "1211.5590", + "MAG": "1606347560", "DBLP": "journals/corr/abs-1211-5590", "CorpusId": 8180128}, + "corpusId": 8180128, "publicationVenue": null, "url": "https://www.semanticscholar.org/paper/855d0f722d75cc56a66a00ede18ace96bafee6bd", + "title": "Theano: new features and speed improvements", "abstract": "Theano + is a linear algebra compiler that optimizes a user''s symbolically-specified + mathematical computations to produce efficient low-level implementations. + In this paper, we present new features and efficiency improvements to Theano, + and benchmarks demonstrating Theano''s performance relative to Torch7, a recently + introduced machine learning library, and to RNNLM, a C++ library targeted + at recurrent neural networks.", "venue": "ArXiv", "year": 2012, "referenceCount": + 14, "citationCount": 1399, "influentialCitationCount": 101, "isOpenAccess": + false, "openAccessPdf": null, "fieldsOfStudy": ["Mathematics", "Computer Science"], + "s2FieldsOfStudy": [{"category": "Mathematics", "source": "external"}, {"category": + "Computer Science", "source": "external"}, {"category": "Computer Science", + "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle"], "publicationDate": + "2012-11-23", "journal": {"volume": "abs/1211.5590", "name": "ArXiv"}, "authors": + [{"authorId": "3227028", "name": "Fr\u00e9d\u00e9ric Bastien"}, {"authorId": + "3087941", "name": "Pascal Lamblin"}, {"authorId": "1996134", "name": "Razvan + Pascanu"}, {"authorId": "32837403", "name": "J. Bergstra"}, {"authorId": "153440022", + "name": "Ian J. 
Goodfellow"}, {"authorId": "47944877", "name": "Arnaud Bergeron"}, + {"authorId": "2065828537", "name": "Nicolas Bouchard"}, {"authorId": "1393680089", + "name": "David Warde-Farley"}, {"authorId": "1751762", "name": "Yoshua Bengio"}]}}, + {"contexts": ["So far, the most striking successes in deep learning have involved + discriminative models, usually those that map a high-dimensional, rich sensory + input to a class label [14, 20]."], "isInfluential": false, "intents": ["background"], + "citedPaper": {"paperId": "e33cbb25a8c7390aec6a398e36381f4f7770c283", "externalIds": + {"MAG": "2184045248", "CorpusId": 7230302}, "corpusId": 7230302, "publicationVenue": + null, "url": "https://www.semanticscholar.org/paper/e33cbb25a8c7390aec6a398e36381f4f7770c283", + "title": "Deep Neural Networks for Acoustic Modeling in Speech Recognition", + "abstract": "Most current speech recognition systems use hidden Markov models + ( HMMs) to deal with the temporal variability of speech and Gaussian mixture + models to determine how well each state of each HMM fits a frame or a short + window of frames of coefficients that represents the acoustic input. An alternati + ve way to evaluate the fit is to use a feedforward neural network that takes + several frames of coefficients a s input and produces posterior probabilities + over HMM states as output. Deep neural networks with many hidden layers, that + are trained using new methods have been shown to outperform Gaussian mixture + models on a variety of speech rec ognition benchmarks, sometimes by a large + margin. This paper provides an overview of this progress and repres nts the + shared views of four research groups who have had recent successes in using + deep neural networks for a coustic modeling in speech recognition.", "venue": + "", "year": 2012, "referenceCount": 79, "citationCount": 2343, "influentialCitationCount": + 168, "isOpenAccess": false, "openAccessPdf": null, "fieldsOfStudy": ["Computer + Science"], "s2FieldsOfStudy": [{"category": "Computer Science", "source": + "external"}, {"category": "Computer Science", "source": "s2-fos-model"}], + "publicationTypes": ["Review"], "publicationDate": "2012-11-01", "journal": + {"volume": "29", "pages": "82", "name": "IEEE Signal Processing Magazine"}, + "authors": [{"authorId": "1695689", "name": "Geoffrey E. Hinton"}, {"authorId": + "144718788", "name": "L. Deng"}, {"authorId": "144580027", "name": "Dong Yu"}, + {"authorId": "35188630", "name": "George E. Dahl"}, {"authorId": "40360972", + "name": "Abdel-rahman Mohamed"}, {"authorId": "3111912", "name": "N. Jaitly"}, + {"authorId": "33666044", "name": "A. Senior"}, {"authorId": "2657155", "name": + "Vincent Vanhoucke"}, {"authorId": "14902530", "name": "P. Nguyen"}, {"authorId": + "1784851", "name": "Tara N. 
Sainath"}, {"authorId": "144707379", "name": "Brian + Kingsbury"}]}}, {"contexts": ["Dropout [16] was applied in training the discriminator + net.", "In this case, we can train both models using only the highly successful + backpropagation and dropout algorithms [16] and sample from the generative + model using only forward propagation."], "isInfluential": false, "intents": + ["methodology"], "citedPaper": {"paperId": "1366de5bb112746a555e9c0cd00de3ad8628aea8", + "externalIds": {"DBLP": "journals/corr/abs-1207-0580", "MAG": "1904365287", + "ArXiv": "1207.0580", "CorpusId": 14832074}, "corpusId": 14832074, "publicationVenue": + null, "url": "https://www.semanticscholar.org/paper/1366de5bb112746a555e9c0cd00de3ad8628aea8", + "title": "Improving neural networks by preventing co-adaptation of feature + detectors", "abstract": "When a large feedforward neural network is trained + on a small training set, it typically performs poorly on held-out test data. + This \"overfitting\" is greatly reduced by randomly omitting half of the feature + detectors on each training case. This prevents complex co-adaptations in which + a feature detector is only helpful in the context of several other specific + feature detectors. Instead, each neuron learns to detect a feature that is + generally helpful for producing the correct answer given the combinatorially + large variety of internal contexts in which it must operate. Random \"dropout\" + gives big improvements on many benchmark tasks and sets new records for speech + and object recognition.", "venue": "ArXiv", "year": 2012, "referenceCount": + 27, "citationCount": 6708, "influentialCitationCount": 549, "isOpenAccess": + false, "openAccessPdf": null, "fieldsOfStudy": ["Computer Science"], "s2FieldsOfStudy": + [{"category": "Computer Science", "source": "external"}, {"category": "Computer + Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle"], + "publicationDate": "2012-07-02", "journal": {"volume": "abs/1207.0580", "name": + "ArXiv"}, "authors": [{"authorId": "1695689", "name": "Geoffrey E. Hinton"}, + {"authorId": "2897313", "name": "Nitish Srivastava"}, {"authorId": "2064160", + "name": "A. Krizhevsky"}, {"authorId": "1701686", "name": "Ilya Sutskever"}, + {"authorId": "145124475", "name": "R. Salakhutdinov"}]}}, {"contexts": ["[7] + and used for various generative models for which the exact likelihood is not + tractable [24, 3, 4]."], "isInfluential": false, "intents": ["background"], + "citedPaper": {"paperId": "aaaea06da21f22221d5fbfd61bb3a02439f0fe02", "externalIds": + {"MAG": "2950320139", "CorpusId": 122643575}, "corpusId": 122643575, "publicationVenue": + {"id": "fc0a208c-acb7-47dc-a0d4-af8190e21d29", "name": "International Conference + on Machine Learning", "type": "conference", "alternate_names": ["ICML", "Int + Conf Mach Learn"], "url": "https://icml.cc/"}, "url": "https://www.semanticscholar.org/paper/aaaea06da21f22221d5fbfd61bb3a02439f0fe02", + "title": "A Generative Process for sampling Contractive Auto-Encoders", "abstract": + "The contractive auto-encoder learns a representation of the input data that + captures the local manifold structure around each data point, through the + leading singular vectors of the Jacobian of the transformation from input + to representation. The corresponding singular values specify how much local + variation is plausible in directions associated with the corresponding singular + vectors, while remaining in a high-density region of the input space. 
This + paper proposes a procedure for generating samples that are consistent with + the local structure captured by a contractive auto-encoder. The associated + stochastic process defines a distribution from which one can sample, and which + experimentally appears to converge quickly and mix well between modes, compared + to Restricted Boltzmann Machines and Deep Belief Networks. The intuitions + behind this procedure can also be used to train the second layer of contraction + that pools lower-level features and learns to be invariant to the local directions + of variation discovered in the first layer. We show that this can help learn + and represent invariances present in the data and improve classification error.", + "venue": "International Conference on Machine Learning", "year": 2012, "referenceCount": + 18, "citationCount": 78, "influentialCitationCount": 3, "isOpenAccess": false, + "openAccessPdf": null, "fieldsOfStudy": ["Mathematics", "Computer Science"], + "s2FieldsOfStudy": [{"category": "Mathematics", "source": "external"}, {"category": + "Computer Science", "source": "external"}, {"category": "Computer Science", + "source": "s2-fos-model"}], "publicationTypes": null, "publicationDate": "2012-06-26", + "journal": {"volume": "", "pages": "1811-1818", "name": ""}, "authors": [{"authorId": + "2425018", "name": "S. Rifai"}, {"authorId": "1751762", "name": "Yoshua Bengio"}, + {"authorId": "2921469", "name": "Y. Dauphin"}, {"authorId": "145467703", "name": + "Pascal Vincent"}]}}, {"contexts": ["[7] and used for various generative models + for which the exact likelihood is not tractable [24, 3, 4]."], "isInfluential": + false, "intents": ["background"], "citedPaper": {"paperId": "d1c67346e46b4d0067b3c2e5d3b959a8bc24b28c", + "externalIds": {"DBLP": "journals/neco/BreuleuxBV11", "MAG": "2106439909", + "DOI": "10.1162/NECO_a_00158", "CorpusId": 907908}, "corpusId": 907908, "publicationVenue": + {"id": "69b9bcdd-8229-4a00-a6e0-00f0e99a2bf3", "name": "Neural Computation", + "type": "journal", "alternate_names": ["Neural Comput"], "issn": "0899-7667", + "url": "http://cognet.mit.edu/library/journals/journal?issn=08997667", "alternate_urls": + ["http://ieeexplore.ieee.org/servlet/opac?punumber=6720226", "http://www.mitpressjournals.org/loi/neco", + "https://www.mitpressjournals.org/loi/neco"]}, "url": "https://www.semanticscholar.org/paper/d1c67346e46b4d0067b3c2e5d3b959a8bc24b28c", + "title": "Quickly Generating Representative Samples from an RBM-Derived Process", + "abstract": "Two recently proposed learning algorithms, herding and fast persistent + contrastive divergence (FPCD), share the following interesting characteristic: + they exploit changes in the model parameters while sampling in order to escape + modes and mix better during the sampling process that is part of the learning + algorithm. We justify such approaches as ways to escape modes while keeping + approximately the same asymptotic distribution of the Markov chain. In that + spirit, we extend FPCD using an idea borrowed from Herding in order to obtain + a pure sampling algorithm, which we call the rates-FPCD sampler. Interestingly, + this sampler can improve the model as we collect more samples, since it optimizes + a lower bound on the log likelihood of the training data. 
We provide empirical + evidence that this new algorithm displays substantially better and more robust + mixing than Gibbs sampling.", "venue": "Neural Computation", "year": 2011, + "referenceCount": 18, "citationCount": 84, "influentialCitationCount": 7, + "isOpenAccess": false, "openAccessPdf": null, "fieldsOfStudy": ["Computer + Science", "Mathematics"], "s2FieldsOfStudy": [{"category": "Computer Science", + "source": "external"}, {"category": "Mathematics", "source": "external"}, + {"category": "Computer Science", "source": "s2-fos-model"}], "publicationTypes": + ["JournalArticle"], "publicationDate": "2011-08-01", "journal": {"volume": + "23", "pages": "2058-2073", "name": "Neural Computation"}, "authors": [{"authorId": + "1967465", "name": "Olivier Breuleux"}, {"authorId": "1751762", "name": "Yoshua + Bengio"}, {"authorId": "145467703", "name": "Pascal Vincent"}]}}, {"contexts": + ["These striking successes have primarily been based on the backpropagation + and dropout algorithms, using piecewise linear units [17, 8, 9] which have + a particularly well-behaved gradient .", "The generator nets used a mixture + of rectifier linear activations [17, 8] and sigmoid activations, while the + discriminator net used maxout [9] activations."], "isInfluential": false, + "intents": ["methodology", "background"], "citedPaper": {"paperId": "67107f78a84bdb2411053cb54e94fa226eea6d8e", + "externalIds": {"DBLP": "journals/jmlr/GlorotBB11", "MAG": "2156387975", "CorpusId": + 2239473}, "corpusId": 2239473, "publicationVenue": {"id": "2d136b11-c2b5-484b-b008-7f4a852fd61e", + "name": "International Conference on Artificial Intelligence and Statistics", + "type": "conference", "alternate_names": ["AISTATS", "Int Conf Artif Intell + Stat"]}, "url": "https://www.semanticscholar.org/paper/67107f78a84bdb2411053cb54e94fa226eea6d8e", + "title": "Deep Sparse Rectifier Neural Networks", "abstract": "While logistic + sigmoid neurons are more biologically plausible than hyperbolic tangent neurons, + the latter work better for training multi-layer neural networks. 
This paper + shows that rectifying neurons are an even better model of biological neurons + and yield equal or better performance than hyperbolic tangent networks in + spite of the hard non-linearity and non-dierentiabil ity", "venue": "International + Conference on Artificial Intelligence and Statistics", "year": 2011, "referenceCount": + 37, "citationCount": 6700, "influentialCitationCount": 421, "isOpenAccess": + false, "openAccessPdf": null, "fieldsOfStudy": ["Computer Science"], "s2FieldsOfStudy": + [{"category": "Computer Science", "source": "external"}, {"category": "Computer + Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle"], + "publicationDate": "2011-06-14", "journal": {"pages": "315-323"}, "authors": + [{"authorId": "3119801", "name": "Xavier Glorot"}, {"authorId": "1713934", + "name": "Antoine Bordes"}, {"authorId": "1751762", "name": "Yoshua Bengio"}]}}, + {"contexts": ["Previous work has also taken the approach of using a discriminative + criterion to train a generative model [29, 13].", "The key limitation of NCE + is that its \u201cdiscriminator\u201d is defined by the ratio of the probability + densities of the noise distribution and the model distribution, and thus requires + the ability to evaluate and backpropagate through both densities.", "Noise-contrastive + estimation (NCE) [13] involves training a generative model by learning the + weights that make the model useful for discriminating data from a fixed noise + distribution."], "isInfluential": false, "intents": ["methodology", "background"], + "citedPaper": {"paperId": "e3ce36b9deb47aa6bb2aa19c4bfa71283b505025", "externalIds": + {"DBLP": "journals/jmlr/GutmannH10", "MAG": "2152790380", "CorpusId": 15816723}, + "corpusId": 15816723, "publicationVenue": {"id": "2d136b11-c2b5-484b-b008-7f4a852fd61e", + "name": "International Conference on Artificial Intelligence and Statistics", + "type": "conference", "alternate_names": ["AISTATS", "Int Conf Artif Intell + Stat"]}, "url": "https://www.semanticscholar.org/paper/e3ce36b9deb47aa6bb2aa19c4bfa71283b505025", + "title": "Noise-contrastive estimation: A new estimation principle for unnormalized + statistical models", "abstract": "We present a new estimation principle for + parameterized statistical models. The idea is to perform nonlinear logistic + regression to discriminate between the observed data and some artificially + generated noise, using the model log-density function in the regression nonlinearity. + We show that this leads to a consistent (convergent) estimator of the parameters, + and analyze the asymptotic variance. In particular, the method is shown to + directly work for unnormalized models, i.e. models where the density function + does not integrate to one. The normalization constant can be estimated just + like any other parameter. For a tractable ICA model, we compare the method + with other estimation methods that can be used to learn unnormalized models, + including score matching, contrastive divergence, and maximum-likelihood where + the normalization constant is estimated with importance sampling. Simulations + show that noise-contrastive estimation offers the best trade-off between computational + and statistical efficiency. 
The method is then applied to the modeling of + natural images: We show that the method can successfully estimate a large-scale + two-layer model and a Markov random field.", "venue": "International Conference + on Artificial Intelligence and Statistics", "year": 2010, "referenceCount": + 16, "citationCount": 1464, "influentialCitationCount": 255, "isOpenAccess": + false, "openAccessPdf": null, "fieldsOfStudy": ["Computer Science", "Mathematics"], + "s2FieldsOfStudy": [{"category": "Computer Science", "source": "external"}, + {"category": "Mathematics", "source": "external"}, {"category": "Computer + Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle"], + "publicationDate": "2010-03-31", "journal": {"pages": "297-304"}, "authors": + [{"authorId": "145992652", "name": "Michael U Gutmann"}, {"authorId": "1791548", + "name": "Aapo Hyv\u00e4rinen"}]}}, {"contexts": ["We would like to thank the + developers of Pylearn2 [11] and Theano [6, 1], particularly Fre\u0301de\u0301ric + Bastien who rushed a Theano feature specifically to benefit this project.", + "We would like to thank the developers of Pylearn2 [11] and Theano [6, 1], + particularly Fr\u00e9d\u00e9ric Bastien who rushed a Theano feature specifically + to benefit this project."], "isInfluential": false, "intents": ["methodology"], + "citedPaper": {"paperId": null, "externalIds": null, "corpusId": null, "publicationVenue": + null, "url": null, "title": "Theano: a CPU and GPU math expression compiler", + "abstract": null, "venue": "In Proceedings of the Python for Scientific Computing + Conference (SciPy). Oral Presentation", "year": 2010, "referenceCount": null, + "citationCount": null, "influentialCitationCount": null, "isOpenAccess": null, + "openAccessPdf": null, "fieldsOfStudy": null, "s2FieldsOfStudy": null, "publicationTypes": + null, "publicationDate": null, "journal": null, "authors": []}}, {"contexts": + ["We trained adversarial nets an a range of datasets including MNIST[21], + the Toronto Face Database (TFD) [27], and CIFAR-10 [19]."], "isInfluential": + false, "intents": ["methodology"], "citedPaper": {"paperId": null, "externalIds": + null, "corpusId": null, "publicationVenue": null, "url": null, "title": "The + Toronto face dataset", "abstract": null, "venue": "Technical Report UTML TR + 2010-001,", "year": 2010, "referenceCount": null, "citationCount": null, "influentialCitationCount": + null, "isOpenAccess": null, "openAccessPdf": null, "fieldsOfStudy": null, + "s2FieldsOfStudy": null, "publicationTypes": null, "publicationDate": null, + "journal": null, "authors": []}}, {"contexts": ["These striking successes + have primarily been based on the backpropagation and dropout algorithms, using + piecewise linear units [17, 8, 9] which have a particularly well-behaved gradient + .", "The generator nets used a mixture of rectifier linear activations [17, + 8] and sigmoid activations, while the discriminator net used maxout [9] activations."], + "isInfluential": false, "intents": ["methodology", "background"], "citedPaper": + {"paperId": "1f88427d7aa8225e47f946ac41a0667d7b69ac52", "externalIds": {"DBLP": + "conf/iccv/JarrettKRL09", "MAG": "2546302380", "DOI": "10.1109/ICCV.2009.5459469", + "CorpusId": 206769720}, "corpusId": 206769720, "publicationVenue": {"id": + "7654260e-79f9-45c5-9663-d72027cf88f3", "name": "IEEE International Conference + on Computer Vision", "type": "conference", "alternate_names": ["ICCV", "IEEE + Int Conf Comput Vis", "ICCV Workshops", "ICCV Work"], "url": 
"https://ieeexplore.ieee.org/xpl/conhome/1000149/all-proceedings"}, + "url": "https://www.semanticscholar.org/paper/1f88427d7aa8225e47f946ac41a0667d7b69ac52", + "title": "What is the best multi-stage architecture for object recognition?", + "abstract": "In many recent object recognition systems, feature extraction + stages are generally composed of a filter bank, a non-linear transformation, + and some sort of feature pooling layer. Most systems use only one stage of + feature extraction in which the filters are hard-wired, or two stages where + the filters in one or both stages are learned in supervised or unsupervised + mode. This paper addresses three questions: 1. How does the non-linearities + that follow the filter banks influence the recognition accuracy? 2. does learning + the filter banks in an unsupervised or supervised manner improve the performance + over random filters or hardwired filters? 3. Is there any advantage to using + an architecture with two stages of feature extraction, rather than one? We + show that using non-linearities that include rectification and local contrast + normalization is the single most important ingredient for good accuracy on + object recognition benchmarks. We show that two stages of feature extraction + yield better accuracy than one. Most surprisingly, we show that a two-stage + system with random filters can yield almost 63% recognition rate on Caltech-101, + provided that the proper non-linearities and pooling layers are used. Finally, + we show that with supervised refinement, the system achieves state-of-the-art + performance on NORB dataset (5.6%) and unsupervised pre-training followed + by supervised refinement produces good accuracy on Caltech-101 (\u226b 65%), + and the lowest known error rate on the undistorted, unprocessed MNIST dataset + (0.53%).", "venue": "IEEE International Conference on Computer Vision", "year": + 2009, "referenceCount": 54, "citationCount": 2205, "influentialCitationCount": + 140, "isOpenAccess": true, "openAccessPdf": {"url": "http://yann.lecun.com/exdb/publis/pdf/jarrett-iccv-09.pdf", + "status": null}, "fieldsOfStudy": ["Computer Science"], "s2FieldsOfStudy": + [{"category": "Computer Science", "source": "external"}, {"category": "Computer + Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle", + "Conference"], "publicationDate": "2009-09-01", "journal": {"pages": "2146-2153", + "name": "2009 IEEE 12th International Conference on Computer Vision"}, "authors": + [{"authorId": "2077257730", "name": "Kevin Jarrett"}, {"authorId": "2645384", + "name": "K. Kavukcuoglu"}, {"authorId": "1706809", "name": "M. Ranzato"}, + {"authorId": "1688882", "name": "Yann LeCun"}]}}, {"contexts": ["In this family + of model, perhaps the most succesful is the deep Boltzmann machine [25]."], + "isInfluential": false, "intents": ["background"], "citedPaper": {"paperId": + "85021c84383d18a7a4434d76dc8135fc6bdc0aa6", "externalIds": {"DBLP": "journals/jmlr/SalakhutdinovH09", + "MAG": "189596042", "CorpusId": 877639}, "corpusId": 877639, "publicationVenue": + {"id": "2d136b11-c2b5-484b-b008-7f4a852fd61e", "name": "International Conference + on Artificial Intelligence and Statistics", "type": "conference", "alternate_names": + ["AISTATS", "Int Conf Artif Intell Stat"]}, "url": "https://www.semanticscholar.org/paper/85021c84383d18a7a4434d76dc8135fc6bdc0aa6", + "title": "Deep Boltzmann Machines", "abstract": "We present a new learning + algorithm for Boltzmann machines that contain many layers of hidden variables. 
+ Data-dependent expectations are estimated using a variational approximation + that tends to focus on a single mode, and dataindependent expectations are + approximated using persistent Markov chains. The use of two quite different + techniques for estimating the two types of expectation that enter into the + gradient of the log-likelihood makes it practical to learn Boltzmann machines + with multiple hidden layers and millions of parameters. The learning can be + made more efficient by using a layer-by-layer \u201cpre-training\u201d phase + that allows variational inference to be initialized with a single bottomup + pass. We present results on the MNIST and NORB datasets showing that deep + Boltzmann machines learn good generative models and perform well on handwritten + digit and visual object recognition tasks.", "venue": "International Conference + on Artificial Intelligence and Statistics", "year": 2009, "referenceCount": + 22, "citationCount": 2152, "influentialCitationCount": 263, "isOpenAccess": + false, "openAccessPdf": null, "fieldsOfStudy": ["Computer Science"], "s2FieldsOfStudy": + [{"category": "Computer Science", "source": "external"}, {"category": "Computer + Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle"], + "publicationDate": "2009-04-15", "journal": {"pages": "448-455"}, "authors": + [{"authorId": "145124475", "name": "R. Salakhutdinov"}, {"authorId": "1695689", + "name": "Geoffrey E. Hinton"}]}}, {"contexts": ["We trained adversarial nets + an a range of datasets including MNIST[21], the Toronto Face Database (TFD) + [27], and CIFAR-10 [19]."], "isInfluential": false, "intents": ["methodology"], + "citedPaper": {"paperId": "5d90f06bb70a0a3dced62413346235c02b1aa086", "externalIds": + {"MAG": "2945315962", "CorpusId": 18268744}, "corpusId": 18268744, "publicationVenue": + null, "url": "https://www.semanticscholar.org/paper/5d90f06bb70a0a3dced62413346235c02b1aa086", + "title": "Learning Multiple Layers of Features from Tiny Images", "abstract": + "Groups at MIT and NYU have collected a dataset of millions of tiny colour + images from the web. It is, in principle, an excellent dataset for unsupervised + training of deep generative models, but previous researchers who have tried + this have found it dicult to learn a good set of lters from the images. We + show how to train a multi-layer generative model that learns to extract meaningful + features which resemble those found in the human visual cortex. Using a novel + parallelization algorithm to distribute the work among multiple machines connected + on a network, we show how training such a model can be done in reasonable + time. A second problematic aspect of the tiny images dataset is that there + are no reliable class labels which makes it hard to use for object recognition + experiments. We created two sets of reliable labels. The CIFAR-10 set has + 6000 examples of each of 10 classes and the CIFAR-100 set has 600 examples + of each of 100 non-overlapping classes. 
Using these labels, we show that object + recognition is signicantly improved by pre-training a layer of features on + a large set of unlabeled tiny images.", "venue": "", "year": 2009, "referenceCount": + 15, "citationCount": 20893, "influentialCitationCount": 6864, "isOpenAccess": + false, "openAccessPdf": null, "fieldsOfStudy": ["Computer Science"], "s2FieldsOfStudy": + [{"category": "Computer Science", "source": "external"}, {"category": "Computer + Science", "source": "s2-fos-model"}], "publicationTypes": null, "publicationDate": + null, "journal": {"volume": "", "name": ""}, "authors": [{"authorId": "2064160", + "name": "A. Krizhevsky"}]}}, {"contexts": ["This strategy is analogous to + the way that SML/PCD [31, 29] training maintains samples from a Markov chain + from one learning step to the next in order to avoid burning in a Markov chain + as part of the inner loop of learning."], "isInfluential": false, "intents": + ["methodology"], "citedPaper": {"paperId": "73d6a26f407db77506959fdf3f7b853e44f3844a", + "externalIds": {"DBLP": "conf/icml/Tieleman08", "MAG": "2116825644", "DOI": + "10.1145/1390156.1390290", "CorpusId": 7330145}, "corpusId": 7330145, "publicationVenue": + {"id": "fc0a208c-acb7-47dc-a0d4-af8190e21d29", "name": "International Conference + on Machine Learning", "type": "conference", "alternate_names": ["ICML", "Int + Conf Mach Learn"], "url": "https://icml.cc/"}, "url": "https://www.semanticscholar.org/paper/73d6a26f407db77506959fdf3f7b853e44f3844a", + "title": "Training restricted Boltzmann machines using approximations to the + likelihood gradient", "abstract": "A new algorithm for training Restricted + Boltzmann Machines is introduced. The algorithm, named Persistent Contrastive + Divergence, is different from the standard Contrastive Divergence algorithms + in that it aims to draw samples from almost exactly the model distribution. + It is compared to some standard Contrastive Divergence and Pseudo-Likelihood + algorithms on the tasks of modeling and classifying various types of data. + The Persistent Contrastive Divergence algorithm outperforms the other algorithms, + and is equally fast and simple.", "venue": "International Conference on Machine + Learning", "year": 2008, "referenceCount": 22, "citationCount": 948, "influentialCitationCount": + 133, "isOpenAccess": true, "openAccessPdf": {"url": "http://icml2008.cs.helsinki.fi/papers/638.pdf", + "status": null}, "fieldsOfStudy": ["Mathematics", "Computer Science"], "s2FieldsOfStudy": + [{"category": "Mathematics", "source": "external"}, {"category": "Computer + Science", "source": "external"}, {"category": "Computer Science", "source": + "s2-fos-model"}], "publicationTypes": ["JournalArticle"], "publicationDate": + "2008-07-05", "journal": {"pages": "1064-1071"}, "authors": [{"authorId": + "2957517", "name": "T. 
Tieleman"}]}}, {"contexts": ["Some models such as denoising + auto-encoders [30] and contractive autoencoders have learning rules very similar + to score matching applied to RBMs."], "isInfluential": false, "intents": ["methodology"], + "citedPaper": {"paperId": "843959ffdccf31c6694d135fad07425924f785b1", "externalIds": + {"DBLP": "conf/icml/VincentLBM08", "MAG": "2025768430", "DOI": "10.1145/1390156.1390294", + "CorpusId": 207168299}, "corpusId": 207168299, "publicationVenue": {"id": + "fc0a208c-acb7-47dc-a0d4-af8190e21d29", "name": "International Conference + on Machine Learning", "type": "conference", "alternate_names": ["ICML", "Int + Conf Mach Learn"], "url": "https://icml.cc/"}, "url": "https://www.semanticscholar.org/paper/843959ffdccf31c6694d135fad07425924f785b1", + "title": "Extracting and composing robust features with denoising autoencoders", + "abstract": "Previous work has shown that the difficulties in learning deep + generative or discriminative models can be overcome by an initial unsupervised + learning step that maps inputs to useful intermediate representations. We + introduce and motivate a new training principle for unsupervised learning + of a representation based on the idea of making the learned representations + robust to partial corruption of the input pattern. This approach can be used + to train autoencoders, and these denoising autoencoders can be stacked to + initialize deep architectures. The algorithm can be motivated from a manifold + learning and information theoretic perspective or from a generative model + perspective. Comparative experiments clearly show the surprising advantage + of corrupting the input of autoencoders on a pattern classification benchmark + suite.", "venue": "International Conference on Machine Learning", "year": + 2008, "referenceCount": 27, "citationCount": 6057, "influentialCitationCount": + 519, "isOpenAccess": true, "openAccessPdf": {"url": "http://www.iro.umontreal.ca/~vincentp/Publications/denoising_autoencoders_tr1316.pdf", + "status": null}, "fieldsOfStudy": ["Computer Science", "Mathematics"], "s2FieldsOfStudy": + [{"category": "Computer Science", "source": "external"}, {"category": "Mathematics", + "source": "external"}, {"category": "Computer Science", "source": "s2-fos-model"}], + "publicationTypes": ["JournalArticle"], "publicationDate": "2008-07-05", "journal": + {"pages": "1096-1103"}, "authors": [{"authorId": "120247189", "name": "Pascal + Vincent"}, {"authorId": "1777528", "name": "H. Larochelle"}, {"authorId": + "1751762", "name": "Yoshua Bengio"}, {"authorId": "1798462", "name": "Pierre-Antoine + Manzagol"}]}}, {"contexts": ["Previous work has also taken the approach of + using a discriminative criterion to train a generative model [29, 13]."], + "isInfluential": false, "intents": ["methodology"], "citedPaper": {"paperId": + "23b80dc704e25cf52b5a14935002fc083ce9c317", "externalIds": {"MAG": "2163176424", + "DBLP": "conf/cvpr/Tu07", "DOI": "10.1109/CVPR.2007.383035", "CorpusId": 226145}, + "corpusId": 226145, "publicationVenue": null, "url": "https://www.semanticscholar.org/paper/23b80dc704e25cf52b5a14935002fc083ce9c317", + "title": "Learning Generative Models via Discriminative Approaches", "abstract": + "Generative model learning is one of the key problems in machine learning + and computer vision. Currently the use of generative models is limited due + to the difficulty in effectively learning them. 
A new learning framework is + proposed in this paper which progressively learns a target generative distribution + through discriminative approaches. This framework provides many interesting + aspects to the literature. From the generative model side: (1) A reference + distribution is used to assist the learning process, which removes the need + for a sampling processes in the early stages. (2) The classification power + of discriminative approaches, e.g. boosting, is directly utilized. (3) The + ability to select/explore features from a large candidate pool allows us to + make nearly no assumptions about the training data. From the discriminative + model side: (1) This framework improves the modeling capability of discriminative + models. (2) It can start with source training data only and gradually \"invent\" + negative samples. (3) We show how sampling schemes can be introduced to discriminative + models. (4) The learning procedure helps to tighten the decision boundaries + for classification, and therefore, improves robustness. In this paper, we + show a variety of applications including texture modeling and classification, + non-photorealistic rendering, learning image statistics/denoising, and face + modeling. The framework handles both homogeneous patterns, e.g. textures, + and inhomogeneous patterns, e.g. faces, with nearly an identical parameter + setting for all the tasks in the learning stage.", "venue": "2007 IEEE Conference + on Computer Vision and Pattern Recognition", "year": 2007, "referenceCount": + 28, "citationCount": 106, "influentialCitationCount": 10, "isOpenAccess": + false, "openAccessPdf": null, "fieldsOfStudy": ["Computer Science"], "s2FieldsOfStudy": + [{"category": "Computer Science", "source": "external"}, {"category": "Computer + Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle", + "Conference"], "publicationDate": "2007-06-17", "journal": {"pages": "1-8", + "name": "2007 IEEE Conference on Computer Vision and Pattern Recognition"}, + "authors": [{"authorId": "144035504", "name": "Z. Tu"}]}}, {"contexts": ["The + promise of deep learning is to discover rich, hierarchical models [2] that + represent probability distributions over the kinds of data encountered in + artificial intelligence applications, such as natural images, audio waveforms + containing speech, and symbols in natural language corpora."], "isInfluential": + false, "intents": ["background"], "citedPaper": {"paperId": "e60ff004dde5c13ec53087872cfcdd12e85beb57", + "externalIds": {"DBLP": "journals/ftml/Bengio09", "MAG": "2072128103", "DOI": + "10.1561/2200000006", "CorpusId": 207178999}, "corpusId": 207178999, "publicationVenue": + null, "url": "https://www.semanticscholar.org/paper/e60ff004dde5c13ec53087872cfcdd12e85beb57", + "title": "Learning Deep Architectures for AI", "abstract": "Theoretical results + strongly suggest that in order to learn the kind of complicated functions + that can represent high-level abstractions (e.g. in vision, language, and + other AI-level tasks), one needs deep architectures. Deep architectures are + composed of multiple levels of non-linear operations, such as in neural nets + with many hidden layers or in complicated propositional formulae re-using + many sub-formulae. Searching the parameter space of deep architectures is + a difficult optimization task, but learning algorithms such as those for Deep + Belief Networks have recently been proposed to tackle this problem with notable + success, beating the state-of-the-art in certain areas. 
This paper discusses + the motivations and principles regarding learning algorithms for deep architectures, + in particular those exploiting as building blocks unsupervised learning of + single-layer models such as Restricted Boltzmann Machines, used to construct + deeper models such as Deep Belief Networks.", "venue": "Found. Trends Mach. + Learn.", "year": 2007, "referenceCount": 242, "citationCount": 7991, "influentialCitationCount": + 511, "isOpenAccess": true, "openAccessPdf": {"url": "http://www.iro.umontreal.ca/~bengioy/papers/ftml.pdf", + "status": null}, "fieldsOfStudy": ["Computer Science"], "s2FieldsOfStudy": + [{"category": "Computer Science", "source": "external"}, {"category": "Computer + Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle"], + "publicationDate": null, "journal": {"volume": "2", "pages": "1-127", "name": + "Found. Trends Mach. Learn."}, "authors": [{"authorId": "1751762", "name": + "Yoshua Bengio"}]}}, {"contexts": ["Deep belief networks (DBNs) [16] are hybrid + models containing a single undirected layer and several directed layers.", + "An alternative to directed graphical models with latent variables are undirected + graphical models with latent variables, such as restricted Boltzmann machines + (RBMs) [27, 16], deep Boltzmann machines (DBMs) [26] and their numerous variants."], + "isInfluential": false, "intents": ["background"], "citedPaper": {"paperId": + "8978cf7574ceb35f4c3096be768c7547b28a35d0", "externalIds": {"MAG": "2136922672", + "DBLP": "journals/neco/HintonOT06", "DOI": "10.1162/neco.2006.18.7.1527", + "CorpusId": 2309950, "PubMed": "16764513"}, "corpusId": 2309950, "publicationVenue": + {"id": "69b9bcdd-8229-4a00-a6e0-00f0e99a2bf3", "name": "Neural Computation", + "type": "journal", "alternate_names": ["Neural Comput"], "issn": "0899-7667", + "url": "http://cognet.mit.edu/library/journals/journal?issn=08997667", "alternate_urls": + ["http://ieeexplore.ieee.org/servlet/opac?punumber=6720226", "http://www.mitpressjournals.org/loi/neco", + "https://www.mitpressjournals.org/loi/neco"]}, "url": "https://www.semanticscholar.org/paper/8978cf7574ceb35f4c3096be768c7547b28a35d0", + "title": "A Fast Learning Algorithm for Deep Belief Nets", "abstract": "We + show how to use complementary priors to eliminate the explaining-away effects + that make inference difficult in densely connected belief nets that have many + hidden layers. Using complementary priors, we derive a fast, greedy algorithm + that can learn deep, directed belief networks one layer at a time, provided + the top two layers form an undirected associative memory. The fast, greedy + algorithm is used to initialize a slower learning procedure that fine-tunes + the weights using a contrastive version of the wake-sleep algorithm. After + fine-tuning, a network with three hidden layers forms a very good generative + model of the joint distribution of handwritten digit images and their labels. + This generative model gives better digit classification than the best discriminative + learning algorithms. 
The low-dimensional manifolds on which the digits lie + are modeled by long ravines in the free-energy landscape of the top-level + associative memory, and it is easy to explore these ravines by using the directed + connections to display what the associative memory has in mind.", "venue": + "Neural Computation", "year": 2006, "referenceCount": 30, "citationCount": + 14398, "influentialCitationCount": 1289, "isOpenAccess": false, "openAccessPdf": + null, "fieldsOfStudy": ["Mathematics", "Computer Science", "Medicine"], "s2FieldsOfStudy": + [{"category": "Mathematics", "source": "external"}, {"category": "Computer + Science", "source": "external"}, {"category": "Medicine", "source": "external"}, + {"category": "Computer Science", "source": "s2-fos-model"}], "publicationTypes": + ["JournalArticle"], "publicationDate": "2006-07-01", "journal": {"volume": + "18", "pages": "1527-1554", "name": "Neural Computation"}, "authors": [{"authorId": + "1695689", "name": "Geoffrey E. Hinton"}, {"authorId": "2217144", "name": + "Simon Osindero"}, {"authorId": "1725303", "name": "Y. Teh"}]}}, {"contexts": + ["(1)\nIn the next section, we present a theoretical analysis of adversarial + nets, essentially showing that the training criterion allows one to recover + the data generating distribution as G and D are given enough capacity, i.e., + in the non-parametric limit."], "isInfluential": false, "intents": ["background"], + "citedPaper": {"paperId": "9966e890f2eedb4577e11b9d5a66380a4d9341fe", "externalIds": + {"DBLP": "journals/jmlr/Hyvarinen05", "MAG": "1505878979", "CorpusId": 1152227}, + "corpusId": 1152227, "publicationVenue": {"id": "c22e7c36-3bfa-43e1-bb7b-edccdea2a780", + "name": "Journal of machine learning research", "type": "journal", "alternate_names": + ["Journal of Machine Learning Research", "J mach learn res", "J Mach Learn + Res"], "issn": "1532-4435", "alternate_issns": ["1533-7928"], "url": "http://www.ai.mit.edu/projects/jmlr/", + "alternate_urls": ["http://jmlr.csail.mit.edu/", "http://www.jmlr.org/", "http://portal.acm.org/affiliated/jmlr"]}, + "url": "https://www.semanticscholar.org/paper/9966e890f2eedb4577e11b9d5a66380a4d9341fe", + "title": "Estimation of Non-Normalized Statistical Models by Score Matching", + "abstract": "One often wants to estimate statistical models where the probability + density function is known only up to a multiplicative normalization constant. + Typically, one then has to resort to Markov Chain Monte Carlo methods, or + approximations of the normalization constant. Here, we propose that such models + can be estimated by minimizing the expected squared distance between the gradient + of the log-density given by the model and the gradient of the log-density + of the observed data. While the estimation of the gradient of log-density + function is, in principle, a very difficult non-parametric problem, we prove + a surprising result that gives a simple formula for this objective function. + The density function of the observed data does not appear in this formula, + which simplifies to a sample average of a sum of some derivatives of the log-density + given by the model. 
The validity of the method is demonstrated on multivariate + Gaussian and independent component analysis models, and by estimating an overcomplete + filter set for natural image data.", "venue": "Journal of machine learning + research", "year": 2005, "referenceCount": 16, "citationCount": 769, "influentialCitationCount": + 142, "isOpenAccess": false, "openAccessPdf": null, "fieldsOfStudy": ["Computer + Science", "Mathematics"], "s2FieldsOfStudy": [{"category": "Computer Science", + "source": "external"}, {"category": "Mathematics", "source": "external"}, + {"category": "Computer Science", "source": "s2-fos-model"}], "publicationTypes": + ["JournalArticle"], "publicationDate": "2005-12-01", "journal": {"volume": + "6", "pages": "695-709", "name": "J. Mach. Learn. Res."}, "authors": [{"authorId": + "1791548", "name": "Aapo Hyv\u00e4rinen"}]}}, {"contexts": ["(c) After an + update to G, gradient of D has guided G(z) to flow to regions that are more + likely to be classified as data."], "isInfluential": false, "intents": ["background"], + "citedPaper": {"paperId": "ca9b21e84ffc7e193d1b3bb45fb7c4e48226b59e", "externalIds": + {"MAG": "1990838964", "DOI": "10.1080/17442509908834179", "CorpusId": 15419929}, + "corpusId": 15419929, "publicationVenue": null, "url": "https://www.semanticscholar.org/paper/ca9b21e84ffc7e193d1b3bb45fb7c4e48226b59e", + "title": "On the convergence of markovian stochastic algorithms with rapidly + decreasing ergodicity rates", "abstract": "We analyse the convergence of stochastic + algorithms with Markovian noise when the ergodicity of the Markov chain governing + the noise rapidly decreases as the control parameter tends to infinity. In + such a case, there may be a positive probability of divergence of the algorithm + in the classic Robbins-Monro form. We provide sufficient condition which ensure + convergence. Moreover, we analyse the asymptotic behaviour of these algorithms + and state a diffusion approximation theorem", "venue": "", "year": 1999, "referenceCount": + 26, "citationCount": 161, "influentialCitationCount": 20, "isOpenAccess": + false, "openAccessPdf": null, "fieldsOfStudy": ["Mathematics"], "s2FieldsOfStudy": + [{"category": "Mathematics", "source": "external"}, {"category": "Mathematics", + "source": "s2-fos-model"}, {"category": "Computer Science", "source": "s2-fos-model"}], + "publicationTypes": null, "publicationDate": "1999-02-01", "journal": {"volume": + "65", "pages": "177-228", "name": "Stochastics and Stochastics Reports"}, + "authors": [{"authorId": "1721284", "name": "L. 
Younes"}]}}, {"contexts": + ["We trained adversarial nets an a range of datasets including MNIST[21], + the Toronto Face Database (TFD) [27], and CIFAR-10 [19]."], "isInfluential": + false, "intents": ["methodology"], "citedPaper": {"paperId": "162d958ff885f1462aeda91cd72582323fd6a1f4", + "externalIds": {"MAG": "2112796928", "DBLP": "journals/pieee/LeCunBBH98", + "DOI": "10.1109/5.726791", "CorpusId": 14542261}, "corpusId": 14542261, "publicationVenue": + {"id": "6faaccca-1cc4-45a9-aeb6-96a4901d2606", "name": "Proceedings of the + IEEE", "type": "journal", "alternate_names": ["Proc IEEE"], "issn": "0018-9219", + "alternate_issns": ["1558-2256"], "url": "http://www.ieee.org/portal/pages/pubs/proceedings/", + "alternate_urls": ["http://www.ieee.org/products/onlinepubs/pub/about_conference.html", + "https://ieeexplore.ieee.org/servlet/opac?punumber=5", "http://proceedingsoftheieee.ieee.org/"]}, + "url": "https://www.semanticscholar.org/paper/162d958ff885f1462aeda91cd72582323fd6a1f4", + "title": "Gradient-based learning applied to document recognition", "abstract": + "Multilayer neural networks trained with the back-propagation algorithm constitute + the best example of a successful gradient based learning technique. Given + an appropriate network architecture, gradient-based learning algorithms can + be used to synthesize a complex decision surface that can classify high-dimensional + patterns, such as handwritten characters, with minimal preprocessing. This + paper reviews various methods applied to handwritten character recognition + and compares them on a standard handwritten digit recognition task. Convolutional + neural networks, which are specifically designed to deal with the variability + of 2D shapes, are shown to outperform all other techniques. Real-life document + recognition systems are composed of multiple modules including field extraction, + segmentation recognition, and language modeling. A new learning paradigm, + called graph transformer networks (GTN), allows such multimodule systems to + be trained globally using gradient-based methods so as to minimize an overall + performance measure. Two systems for online handwriting recognition are described. + Experiments demonstrate the advantage of global training, and the flexibility + of graph transformer networks. A graph transformer network for reading a bank + cheque is also described. It uses convolutional neural network character recognizers + combined with global training techniques to provide record accuracy on business + and personal cheques. It is deployed commercially and reads several million + cheques per day.", "venue": "Proceedings of the IEEE", "year": 1998, "referenceCount": + 138, "citationCount": 39453, "influentialCitationCount": 6327, "isOpenAccess": + false, "openAccessPdf": null, "fieldsOfStudy": ["Computer Science"], "s2FieldsOfStudy": + [{"category": "Computer Science", "source": "external"}, {"category": "Computer + Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle", + "Review"], "publicationDate": null, "journal": {"volume": "86", "pages": "2278-2324", + "name": "Proc. IEEE"}, "authors": [{"authorId": "1688882", "name": "Yann LeCun"}, + {"authorId": "52184096", "name": "L. Bottou"}, {"authorId": "1751762", "name": + "Yoshua Bengio"}, {"authorId": "1721248", "name": "P. 
Haffner"}]}}, {"contexts": + ["This is similar to the inference net trained by the wake-sleep algorithm + [15] but with the advantage that the inference net may be trained for a fixed + generator net after the generator net has finished training."], "isInfluential": + false, "intents": ["methodology"], "citedPaper": {"paperId": "6dd01cd9c17d1491ead8c9f97597fbc61dead8ea", + "externalIds": {"MAG": "1993845689", "DOI": "10.1126/SCIENCE.7761831", "CorpusId": + 871473, "PubMed": "7761831"}, "corpusId": 871473, "publicationVenue": {"id": + "f59506a8-d8bb-4101-b3d4-c4ac3ed03dad", "name": "Science", "type": "journal", + "issn": "0193-4511", "alternate_issns": ["0036-8075"], "url": "https://www.jstor.org/journal/science", + "alternate_urls": ["https://www.sciencemag.org/", "http://www.sciencemag.org/", + "http://www.jstor.org/journals/00368075.html", "http://www.sciencemag.org/archive/"]}, + "url": "https://www.semanticscholar.org/paper/6dd01cd9c17d1491ead8c9f97597fbc61dead8ea", + "title": "The \"wake-sleep\" algorithm for unsupervised neural networks.", + "abstract": "An unsupervised learning algorithm for a multilayer network of + stochastic neurons is described. Bottom-up \"recognition\" connections convert + the input into representations in successive hidden layers, and top-down \"generative\" + connections reconstruct the representation in one layer from the representation + in the layer above. In the \"wake\" phase, neurons are driven by recognition + connections, and generative connections are adapted to increase the probability + that they would reconstruct the correct activity vector in the layer below. + In the \"sleep\" phase, neurons are driven by generative connections, and + recognition connections are adapted to increase the probability that they + would produce the correct activity vector in the layer above.", "venue": "Science", + "year": 1995, "referenceCount": 20, "citationCount": 1040, "influentialCitationCount": + 47, "isOpenAccess": false, "openAccessPdf": null, "fieldsOfStudy": ["Computer + Science", "Medicine"], "s2FieldsOfStudy": [{"category": "Computer Science", + "source": "external"}, {"category": "Medicine", "source": "external"}, {"category": + "Computer Science", "source": "s2-fos-model"}], "publicationTypes": ["JournalArticle"], + "publicationDate": "1995-05-26", "journal": {"volume": "268 5214", "pages": + "\n 1158-61\n ", "name": "Science"}, "authors": [{"authorId": + "46831169", "name": "G. Hinton"}, {"authorId": "1790646", "name": "P. Dayan"}, + {"authorId": "1749650", "name": "B. Frey"}, {"authorId": "145572884", "name": + "R. Neal"}]}}, {"contexts": ["The most relevant work is predictability minimization + [26]."], "isInfluential": false, "intents": ["background"], "citedPaper": + {"paperId": "675d381653da0d2825ae37ab06069a1525fafb79", "externalIds": {"CorpusId": + 2142508}, "corpusId": 2142508, "publicationVenue": null, "url": "https://www.semanticscholar.org/paper/675d381653da0d2825ae37ab06069a1525fafb79", + "title": "Learning Factorial Codes by Predictability Minimization", "abstract": + "I propose a novel general principle for unsupervised learning of distributed + non-redundant internal representations of input patterns. The principle is + based on two opposing forces. For each represen-tational unit there is an + adaptive predictor which tries to predict the unit from the remaining units. + In turn, each unit tries to react to the environment such that it minimizes + its predictability. 
This encourages each unit to lter\u00e0bstract concepts'' + out of the environmental input such that these concepts are statistically + independent of those upon which the other units focus. I discuss various simple + yet potentially powerful implementations of the principle which aim at nding + binary factorial codes (Bar-low et al., 1989), i.e. codes where the probability + of the occurrence of a particular input is simply the product of the probabilities + of the corresponding code symbols. Such codes are potentially relevant for + (1) segmentation tasks, (2) speeding up supervised learning, (3) novelty detection. + Methods for nding factorial codes automatically implement Occam''s razor for + nding codes using a minimal number of units. Unlike previous methods the novel + principle has a potential for removing not only linear but also non-linear + output redundancy. Illustrative experiments show that algorithms based on + the principle of predictability minimization are practically feasible. The + nal part of this paper describes an entirely local algorithm that has a potential + for learning unique representations of extended input sequences.", "venue": + "", "year": 1992, "referenceCount": 15, "citationCount": 208, "influentialCitationCount": + 12, "isOpenAccess": false, "openAccessPdf": null, "fieldsOfStudy": null, "s2FieldsOfStudy": + [{"category": "Computer Science", "source": "s2-fos-model"}], "publicationTypes": + null, "publicationDate": null, "journal": null, "authors": [{"authorId": "150165387", + "name": "J. Urgen Schmidhuber"}]}}, {"contexts": ["An alternative to directed + graphical models with latent variables are undirected graphical models with + latent variables, such as restricted Boltzmann machines (RBMs) [27, 16], deep + Boltzmann machines (DBMs) [26] and their numerous variants."], "isInfluential": + false, "intents": ["background"], "citedPaper": {"paperId": "4f7476037408ac3d993f5088544aab427bc319c1", + "externalIds": {"MAG": "1820494964", "CorpusId": 533055}, "corpusId": 533055, + "publicationVenue": null, "url": "https://www.semanticscholar.org/paper/4f7476037408ac3d993f5088544aab427bc319c1", + "title": "Information processing in dynamical systems: foundations of harmony + theory", "abstract": "Abstract : At this early stage in the development of + cognitive science, methodological issues are both open and central. There + may have been times when developments in neuroscience, artificial intelligence, + or cognitive psychology seduced researchers into believing that their discipline + was on the verge of discovering the secret of intelligence. But a humbling + history of hopes disappointed has produced the realization that understanding + the mind will challenge the power of all these methodologies combined. The + work reported in this chapter rests on the conviction that a methodology that + has a crucial role to play in the development of cognitive science is mathematical + analysis. The success of cognitive science, like that of many other sciences, + will, I believe, depend upon the construction of a solid body of theoretical + results: results that express in a mathematical language the conceptual insights + of the field; results that squeeze all possible implications out of those + insights by exploiting powerful mathematical techniques. This body of results, + which I will call the theory of information processing, exists because information + is a concept that lends itself to mathematical formalization. 
One part of + the theory of information processing is already well-developed. The classical + theory of computation provides powerful and elegant results about the notion + of effective procedure, including languages for precisely expressing them + and theoretical machines for realizing them.", "venue": "", "year": 1986, + "referenceCount": 18, "citationCount": 2042, "influentialCitationCount": 204, + "isOpenAccess": false, "openAccessPdf": null, "fieldsOfStudy": ["Mathematics", + "Computer Science"], "s2FieldsOfStudy": [{"category": "Mathematics", "source": + "external"}, {"category": "Computer Science", "source": "external"}, {"category": + "Computer Science", "source": "s2-fos-model"}], "publicationTypes": null, + "publicationDate": "1986-01-03", "journal": {"volume": "", "pages": "194-281", + "name": ""}, "authors": [{"authorId": "1748557", "name": "P. Smolensky"}]}}]} + + ' + headers: + Access-Control-Allow-Origin: + - '*' + Connection: + - keep-alive + Content-Length: + - '85352' + Content-Type: + - application/json + Date: + - Sun, 22 Jan 2023 23:41:03 GMT + Via: + - 1.1 556546966a883b579a433c9e90aa37f8.cloudfront.net (CloudFront) + X-Amz-Cf-Id: + - DKwZzVzrksbmo8lvxgCwkvKS3vrsVOgoT1VSTNk2Az7U0A4tcrj-cA== + X-Amz-Cf-Pop: + - GRU3-P1 + X-Cache: + - Miss from cloudfront + x-amz-apigw-id: + - fKxuYED1PHcF_3Q= + x-amzn-Remapped-Connection: + - keep-alive + x-amzn-Remapped-Content-Length: + - '85352' + x-amzn-Remapped-Date: + - Sun, 22 Jan 2023 23:41:03 GMT + x-amzn-Remapped-Server: + - gunicorn + x-amzn-RequestId: + - 557f7a81-eb51-44b6-b79f-fe576b24fc32 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/test_semanticscholar.py b/tests/test_semanticscholar.py index 40168db..ba0868d 100644 --- a/tests/test_semanticscholar.py +++ b/tests/test_semanticscholar.py @@ -8,6 +8,7 @@ from semanticscholar.Author import Author from semanticscholar.Journal import Journal from semanticscholar.Paper import Paper +from semanticscholar.Reference import Reference from semanticscholar.SemanticScholar import SemanticScholar from semanticscholar.SemanticScholarException import ( BadQueryParametersException, ObjectNotFoundExeception) @@ -91,6 +92,20 @@ def test_paper(self) -> None: self.assertEqual(item.keys(), data.keys()) file.close() + def test_reference(self): + file = open('tests/data/Reference.json', encoding='utf-8') + data = json.loads(file.read()) + item = Reference(data) + self.assertEqual(item.contexts, data['contexts']) + self.assertEqual(item.intents, data['intents']) + self.assertEqual(item.isInfluential, data['isInfluential']) + self.assertEqual(str(item.paper), str(data['citedPaper'])) + self.assertEqual(item.raw_data, data) + self.assertEqual(str(item), str(data)) + self.assertEqual(item['contexts'], data['contexts']) + self.assertEqual(item.keys(), data.keys()) + file.close() + def test_tldr(self) -> None: file = open('tests/data/Paper.json', encoding='utf-8') data = json.loads(file.read())['tldr'] @@ -121,6 +136,16 @@ def test_get_papers(self): self.assertIn( 'E. 
Duflo', [author.name for author in item.authors]) + @test_vcr.use_cassette + def test_get_paper_references(self): + data = self.sch.get_paper_references('CorpusID:1033682') + self.assertEqual(data.offset, 0) + self.assertEqual(data.next, 0) + self.assertEqual(len(data), 35) + self.assertEqual( + data[0].paper.title, 'Neural Variational Inference and Learning ' + 'in Belief Networks') + @test_vcr.use_cassette def test_timeout(self): self.sch.timeout = 0.01 @@ -195,6 +220,8 @@ def test_search_author(self): @test_vcr.use_cassette def test_limit_value_exceeded(self): test_cases = [ + (self.sch.get_paper_references, '10.1093/mind/lix.236.433', 1001, + 'The limit parameter must be between 1 and 1000 inclusive.'), (self.sch.search_author, 'turing', 1001, 'The limit parameter must be between 1 and 1000 inclusive.'), (self.sch.search_paper, 'turing', 101,