Commit 09d8090

2 parents f29b29a + 1aa10ea commit 09d8090

54 files changed: +3681, -2531 lines

AUTHORS.rst (+1)
@@ -97,5 +97,6 @@ People
 
 * `Gilles Louppe <http://www.montefiore.ulg.ac.be/~glouppe>`_
 
+
 If I forgot anyone, do not hesitate to send me an email to
 fabian.pedregosa@inria.fr and I'll include you in the list.

doc/datasets/index.rst (+1)
@@ -116,6 +116,7 @@ can be used to build artifical datasets of controled size and complexity.
    :template: function.rst
 
    make_classification
+   make_multilabel_classification
    make_regression
    make_blobs
    make_friedman1
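
For context, the generator being added to the docs can be exercised directly. A minimal sketch, assuming the tuple-of-labels return format that this commit's documentation describes; the keyword arguments and output shown are assumptions about this era of the API, not part of the diff:

    from sklearn.datasets import make_multilabel_classification

    # Small synthetic multilabel problem: each sample gets a feature vector
    # X[i] and a tuple of class labels Y[i], possibly several labels at once.
    X, Y = make_multilabel_classification(n_samples=5, n_features=10,
                                          n_classes=3, random_state=0)
    print Y   # e.g. [(0, 2), (1,), (0, 1, 2), ...]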

doc/modules/classes.rst (+35)
@@ -145,6 +145,7 @@ Samples generator
    :template: function.rst
 
    datasets.make_classification
+   datasets.make_multilabel_classification
    datasets.make_regression
    datasets.make_blobs
    datasets.make_friedman1
@@ -588,6 +589,7 @@ See the :ref:`clustering` section of the user guide for further details.
    :template: function.rst
 
    metrics.adjusted_rand_score
+   metrics.adjusted_mutual_info_score
    metrics.homogeneity_completeness_v_measure
    metrics.homogeneity_score
    metrics.completeness_score
@@ -640,6 +642,39 @@ Pairwise metrics
    mixture.VBGMM
 
 
+.. _multiclass_ref:
+
+:mod:`sklearn.multiclass`: Multiclass and multilabel classification
+====================================================================
+
+.. automodule:: sklearn.multiclass
+   :no-members:
+   :no-inherited-members:
+
+**User guide:** See the :ref:`multiclass` section for further details.
+
+.. currentmodule:: sklearn
+
+.. autosummary::
+   :toctree: generated
+   :template: class.rst
+
+   multiclass.OneVsRestClassifier
+   multiclass.OneVsOneClassifier
+   multiclass.OutputCodeClassifier
+
+.. autosummary::
+   :toctree: generated
+   :template: function.rst
+
+   multiclass.fit_ovr
+   multiclass.predict_ovr
+   multiclass.fit_ovo
+   multiclass.predict_ovo
+   multiclass.fit_ecoc
+   multiclass.predict_ecoc
+
+
 .. _naive_bayes_ref:
 
 :mod:`sklearn.naive_bayes`: Naive Bayes
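
The new entries in the reference above can be tried in a couple of lines. A minimal sketch for the newly listed clustering metric; the example values are illustrative only and are not taken from the commit:

    from sklearn import metrics

    # Agreement between two label assignments, corrected for chance.
    # Identical partitions (up to renaming of the labels) score 1.0.
    labels_true = [0, 0, 1, 1, 2, 2]
    labels_pred = [1, 1, 0, 0, 2, 2]
    print metrics.adjusted_mutual_info_score(labels_true, labels_pred)  # 1.0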

doc/modules/multiclass.rst (+37, -7)
@@ -1,17 +1,23 @@
 
 .. _multiclass:
 
-=====================
-Multiclass algorithms
-=====================
+====================================
+Multiclass and multilabel algorithms
+====================================
 
 .. currentmodule:: sklearn.multiclass
 
-This module implements multiclass learning algorithms:
+This module implements multiclass and multilabel learning algorithms:
     - one-vs-the-rest / one-vs-all
     - one-vs-one
     - error correcting output codes
 
+Multiclass classification means classification with more than two classes.
+Multilabel classification is a different task, where a classifier is used to
+predict a set of target labels for each instance; i.e., the set of target
+classes is not assumed to be disjoint as in ordinary (binary or multiclass)
+classification. This is also called any-of classification.
+
 The estimators provided in this module are meta-estimators: they require a base
 estimator to be provided in their constructor. For example, it is possible to
 use these estimators to turn a binary classifier or a regressor into a
@@ -26,9 +32,15 @@ improves.
 multiclass classification out-of-the-box. Below is a summary of the
 classifiers supported in scikit-learn grouped by the strategy used.
 
-- Inherently multiclass: Naive Bayes, LDA.
-- One-Vs-One: SVC.
-- One-Vs-All: LinearSVC, LogisticRegression, SGDClassifier, RidgeClassifier.
+- Inherently multiclass: Naive Bayes, :class:`LDA`.
+- One-Vs-One: :class:`SVC`.
+- One-Vs-All: :class:`LinearSVC`, :class:`LogisticRegression`,
+  :class:`SGDClassifier`, :class:`RidgeClassifier`.
+
+.. note::
+
+    At the moment there are no evaluation metrics implemented for multilabel
+    learnings.
 
 
 One-Vs-The-Rest
@@ -57,6 +69,24 @@ fair default choice. Below is an example::
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
 
+Multilabel learning with OvR
+----------------------------
+
+``OneVsRestClassifier`` also supports multilabel classification.
+To use this feature, feed the classifier a list of tuples containing
+target labels, like in the example below.
+
+
+.. figure:: ../auto_examples/images/plot_multilabel_1.png
+    :target: ../auto_examples/plot_multilabel.html
+    :align: center
+    :scale: 75%
+
+
+.. topic:: Examples:
+
+    * :ref:`example_plot_multilabel.py`
+
 
 One-Vs-One
 ==========
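
The multilabel usage described in the new "Multilabel learning with OvR" section can be sketched as follows; the tuple-of-labels target format follows the documentation added in this commit and is an assumption about this era of the API rather than a verbatim part of the diff:

    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC

    # Four samples with two features each; each target is a tuple of labels,
    # so a sample may belong to several classes at once.
    X = [[0., 1.], [1., 0.], [1., 1.], [0., 0.]]
    Y = [(0,), (1,), (0, 1), (2,)]

    clf = OneVsRestClassifier(LinearSVC()).fit(X, Y)
    print clf.predict([[1., 1.]])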

doc/sphinxext/gen_rst.py (+22)
@@ -7,6 +7,7 @@
 Files that generate images should start with 'plot'
 
 """
+from time import time
 import os
 import shutil
 import traceback
@@ -256,6 +257,7 @@ def generate_file_rst(fname, target_dir, src_dir, plot_gallery):
          os.stat(src_file).st_mtime):
         # We need to execute the code
         print 'plotting %s' % fname
+        t0 = time()
         import matplotlib.pyplot as plt
         plt.close('all')
         cwd = os.getcwd()
@@ -304,6 +306,8 @@ def generate_file_rst(fname, target_dir, src_dir, plot_gallery):
         finally:
             os.chdir(cwd)
             sys.stdout = orig_stdout
+
+            print " - time elapsed : %.2g sec" % (time() - t0)
     else:
         figure_list = [f[len(image_dir):]
                        for f in glob.glob(image_path % '[1-9]')]
@@ -339,3 +343,21 @@ def generate_file_rst(fname, target_dir, src_dir, plot_gallery):
 def setup(app):
     app.connect('builder-inited', generate_example_rst)
     app.add_config_value('plot_gallery', True, 'html')
+
+    # Sphinx hack: sphinx copies generated images to the build directory
+    # each time the docs are made. If the desired image name already
+    # exists, it appends a digit to prevent overwrites. The problem is,
+    # the directory is never cleared. This means that each time you build
+    # the docs, the number of images in the directory grows.
+    #
+    # This question has been asked on the sphinx development list, but there
+    # was no response: http://osdir.com/ml/sphinx-dev/2011-02/msg00123.html
+    #
+    # The following is a hack that prevents this behavior by clearing the
+    # image build directory each time the docs are built. If sphinx
+    # changes their layout between versions, this will not work (though
+    # it should probably not cause a crash). Tested successfully
+    # on Sphinx 1.0.7
+    build_image_dir = '_build/html/_images'
+    if os.path.exists(build_image_dir):
+        shutil.rmtree(build_image_dir)

doc/whats_new.rst (+33, -13)
@@ -25,17 +25,17 @@ Changelog
   - Faster tests by `Fabian Pedregosa`_.
 
   - Silhouette Coefficient cluster analysis evaluation metric added as
-    ``sklearn.metrics.silhouette_score`` by Robert Layton.
+    :func:`sklearn.metrics.silhouette_score` by Robert Layton.
 
-  - Fixed a bug in `KMeans` in the handling of the `n_init` parameter:
-    the clustering algorithm used to be run `n_init` times but the last
+  - Fixed a bug in :ref:`k_means` in the handling of the ``n_init`` parameter:
+    the clustering algorithm used to be run ``n_init`` times but the last
     solution was retained instead of the best solution.
 
   - Minor refactoring in :ref:`sgd` module; consolidated dense and sparse
     predict methods.
 
   - Adjusted Mutual Information metric added as
-    ``sklearn.metrics.adjusted_mutual_info_score`` by Robert Layton.
+    :func:`sklearn.metrics.adjusted_mutual_info_score` by Robert Layton.
 
   - Models like SVC/SVR/LinearSVC/LogisticRegression from libsvm/liblinear
     now support scaling of C regularization parameter by the number of
@@ -54,7 +54,24 @@ Changelog
 
   - Fix a bug due to atom swapping in :ref:`OMP` by `Vlad Niculae`_.
 
-  - :ref:`SparseCoder` by `Vlad Niculae`_.
+  - :ref:`SparseCoder` by `Vlad Niculae`_.
+
+  - :ref:`mini_batch_kmeans` performance improvements by `Olivier Grisel`_.
+
+  - :ref:`k_means` support for sparse matrices by `Mathieu Blondel`_.
+
+  - Improved documentation for developers and for the :mod:`sklearn.utils`
+    module, by `Jake VanderPlas`_.
+
+  - Vectorized 20newsgroups dataset loader
+    (:func:`sklearn.datasets.fetch_20newsgroups_vectorized`) by
+    `Mathieu Blondel`_.
+
+  - :ref:`multiclass` by `Lars Buitinck`_.
+
+  - Utilities for fast computation of mean and variance for sparse matrices
+    by `Mathieu Blondel`_.
+
 
 API changes summary
 -------------------
@@ -66,10 +83,10 @@ version 0.9:
     had ``overwrite_`` parameters; these have been replaced with ``copy_``
     parameters with exactly the opposite meaning.
 
-    This particularly affects some of the estimators in ``linear_models``.
+    This particularly affects some of the estimators in :mod:`linear_model`.
     The default behavior is still to copy everything passed in.
 
-  - The SVMlight dataset loader ``sklearn.datasets.load_svmlight_file`` no
+  - The SVMlight dataset loader :func:`sklearn.datasets.load_svmlight_file` no
     longer supports loading two files at once; use ``load_svmlight_files``
     instead. Also, the (unused) ``buffer_mb`` parameter is gone.
 
@@ -80,13 +97,14 @@ version 0.9:
   - The :ref:`covariance` module now has a robust estimator of
     covariance, the Minimum Covariance Determinant estimator.
 
-  - Cluster evaluation metrics in ``metrics.cluster.py`` have been refactored
+  - Cluster evaluation metrics in :mod:`metrics.cluster` have been refactored
     but the changes are backwards compatible. They have been moved to the
-    ``metrics.cluster.supervised``, along with ``metrics.cluster.unsupervised``
-    which contains the Silhouette Coefficient.
+    :mod:`metrics.cluster.supervised`, along with
+    :mod:`metrics.cluster.unsupervised` which contains the Silhouette
+    Coefficient.
 
-  - The permutation_test_score function now behaves the same way as
-    cross_val_score (i.e. uses the mean score across the folds.)
+  - The ``permutation_test_score`` function now behaves the same way as
+    ``cross_val_score`` (i.e. uses the mean score across the folds.)
 
   - Cross Validation generators now use integer indices (``indices=True``)
     by default instead of boolean masks. This make it more intuitive to
@@ -99,10 +117,12 @@ version 0.9:
     as opposed to the regression setting.
 
   - Fixed an off-by-one error in the SVMlight/LibSVM file format handling;
-    files generated using ``sklearn.datasets.dump_svmlight_file`` should be
+    files generated using :func:`sklearn.datasets.dump_svmlight_file` should be
     re-generated. (They should continue to work, but accidentally had one
     extra column of zeros prepended.)
 
+  - ``BaseDictionaryLearning`` class replaced by ``SparseCodingMixin``.
+
 
 .. _changes_0_9:
 

examples/document_clustering.py (+45, -19)
@@ -1,33 +1,54 @@
 """
-===============================================
-Clustering text documents using MiniBatchKmeans
-===============================================
+=======================================
+Clustering text documents using k-means
+=======================================
 
 This is an example showing how the scikit-learn can be used to cluster
 documents by topics using a bag-of-words approach. This example uses
 a scipy.sparse matrix to store the features instead of standard numpy arrays.
 
+Two algorithms are demoed: ordinary k-means and its faster cousin minibatch
+k-means.
+
 """
-print __doc__
 
 # Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
+#         Lars Buitinck <L.J.Buitinck@uva.nl>
 # License: Simplified BSD
 
-from time import time
-import logging
-import numpy as np
-
 from sklearn.datasets import fetch_20newsgroups
 from sklearn.feature_extraction.text import Vectorizer
 from sklearn import metrics
 
-from sklearn.cluster import MiniBatchKMeans
+from sklearn.cluster import KMeans, MiniBatchKMeans
+
+import logging
+from optparse import OptionParser
+import sys
+from time import time
+
+import numpy as np
 
 
 # Display progress logs on stdout
 logging.basicConfig(level=logging.INFO,
                     format='%(asctime)s %(levelname)s %(message)s')
 
+# parse commandline arguments
+op = OptionParser()
+op.add_option("--no-minibatch",
+              action="store_false", dest="minibatch", default=True,
+              help="Use ordinary k-means algorithm.")
+
+print __doc__
+op.print_help()
+
+(opts, args) = op.parse_args()
+if len(args) > 0:
+    op.error("this script takes no arguments.")
+    sys.exit(1)
+
+
 ###############################################################################
 # Load some categories from the training set
 categories = [
@@ -61,23 +82,28 @@
 print "n_samples: %d, n_features: %d" % X.shape
 print
 
+
 ###############################################################################
-# Sparse MiniBatchKmeans
+# Do the actual clustering
+
+if opts.minibatch:
+    km = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1,
+                         init_size=1000,
+                         batch_size=1000, verbose=1)
+else:
+    km = KMeans(k=true_k, init='random', max_iter=100, n_init=1, verbose=1)
 
-mbkm = MiniBatchKMeans(k=true_k, init='k-means++', n_init=1,
-                       init_size=1000,
-                       batch_size=1000, verbose=1)
-print "Clustering sparse data with %s" % mbkm
+print "Clustering sparse data with %s" % km
 t0 = time()
-mbkm.fit(X)
+km.fit(X)
 print "done in %0.3fs" % (time() - t0)
 print
 
-print "Homogeneity: %0.3f" % metrics.homogeneity_score(labels, mbkm.labels_)
-print "Completeness: %0.3f" % metrics.completeness_score(labels, mbkm.labels_)
-print "V-measure: %0.3f" % metrics.v_measure_score(labels, mbkm.labels_)
+print "Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)
+print "Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)
+print "V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)
 print "Adjusted Rand-Index: %.3f" % \
-    metrics.adjusted_rand_score(labels, mbkm.labels_)
+    metrics.adjusted_rand_score(labels, km.labels_)
 print "Silhouette Coefficient: %0.3f" % metrics.silhouette_score(
     X, labels, sample_size=1000)
 
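
With the new ``--no-minibatch`` flag, the reworked example can presumably be run in both modes for comparison, e.g. ``python examples/document_clustering.py`` for minibatch k-means (the default) and ``python examples/document_clustering.py --no-minibatch`` for ordinary k-means; the exact invocation path is an assumption, not part of the commit.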