diff --git a/NEWS.md b/NEWS.md
index bdbfb44546..4cf755e7c3 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -9,6 +9,7 @@
 * New "performance tips" [section of the docs](https://github.com/FluxML/Flux.jl/pull/615).
 * The training loop is [now more readable](https://github.com/FluxML/Flux.jl/pull/651) and better shows how to use the lower-level APIs.
 * New [AlphaDropout](https://github.com/FluxML/Flux.jl/pull/656).
+* [Data.Iris](https://github.com/FluxML/Flux.jl/pull/652) makes Fisher's Iris dataset available with `Iris.labels` and `Iris.features`.
 * New [InstanceNorm](https://github.com/FluxML/Flux.jl/pull/634), as popularized by [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022).
 
 AD Changes:
diff --git a/Project.toml b/Project.toml
index ebb2670108..08b15332a7 100644
--- a/Project.toml
+++ b/Project.toml
@@ -6,6 +6,7 @@ AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
+DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 Juno = "e5e0dc1b-0480-54bc-9374-aad01c23163d"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
diff --git a/src/data/Data.jl b/src/data/Data.jl
index ab78f4163c..d7cd0303ba 100644
--- a/src/data/Data.jl
+++ b/src/data/Data.jl
@@ -39,4 +39,7 @@ include("tree.jl")
 include("sentiment.jl")
 using .Sentiment
 
+include("iris.jl")
+export Iris
+
 end
diff --git a/src/data/iris.jl b/src/data/iris.jl
new file mode 100644
index 0000000000..c432f8473f
--- /dev/null
+++ b/src/data/iris.jl
@@ -0,0 +1,102 @@
+"""
+    Iris
+
+Fisher's classic iris dataset.
+
+Measurements from 3 different species of iris: setosa, versicolor and
+virginica. There are 50 examples of each species.
+
+There are 4 measurements for each example: sepal length, sepal width, petal
+length and petal width. The measurements are in centimeters.
+
+The module retrieves the data from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/iris).
+"""
+module Iris
+
+using DelimitedFiles
+using ..Data: deps, download_and_verify
+
+# Prepended verbatim to the download URL in `load`; the empty string leaves the
+# URL unchanged.
+const cache_prefix = ""
+
+# Uncomment if the iris.data file is cached to cache.julialang.org.
+# const cache_prefix = "https://cache.julialang.org/"
+
+"""
+    load()
+
+Make sure the file at `deps("iris.data")` exists. If it is already there,
+return `nothing` right away; otherwise log a message and fetch it with
+`download_and_verify`.
+"""
+function load()
+    isfile(deps("iris.data")) && return
+
+    @info "Downloading iris dataset."
+    # NOTE(review): the last argument looks like the expected checksum of the
+    # download -- confirm against `download_and_verify`.
+    download_and_verify("$(cache_prefix)https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
+                        deps("iris.data"),
+                        "6f608b71a7317216319b4d27b4d9bc84e6abd734eda7872b71a458569e2656c0")
+end
+
+# Shared by `labels` and `features`: make sure the data file is present, then
+# parse it as a comma-delimited table.
+function readiris()
+    load()
+    return readdlm(deps("iris.data"), ',')
+end
+
+"""
+    labels()
+
+Get the labels of the iris dataset, a 150 element array of strings listing the
+species of each example.
+
+```jldoctest
+julia> labels = Flux.Data.Iris.labels();
+
+julia> summary(labels)
+"150-element Array{String,1}"
+
+julia> labels[1]
+"Iris-setosa"
+```
+"""
+function labels()
+    table = readiris()
+    # The species name is the last field of every record.
+    return Vector{String}(table[:, end])
+end
+
+"""
+    features()
+
+Get the features of the iris dataset. This is a 4x150 matrix of Float64
+elements. It has a row for each feature (sepal length, sepal width,
+petal length, petal width) and a column for each example.
+
+```jldoctest
+julia> features = Flux.Data.Iris.features();
+
+julia> summary(features)
+"4×150 Array{Float64,2}"
+
+julia> features[:, 1]
+4-element Array{Float64,1}:
+ 5.1
+ 3.5
+ 1.4
+ 0.2
+```
+"""
+function features()
+    table = readiris()
+    # The file has one example per row, so swap the axes to get one example
+    # per column. `permutedims` rather than `'`: this is plain data
+    # rearrangement on a table that also holds strings, not linear algebra.
+    return Matrix{Float64}(permutedims(table[:, 1:4]))
+end
+
+end
diff --git a/test/data.jl b/test/data.jl
index a73d1ec3e0..6b777873a6 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -14,3 +14,9 @@ using Test
 @test FashionMNIST.labels() isa Vector{Int64}
 
 @test Data.Sentiment.train() isa Vector{Data.Tree{Any}}
+
+@test Iris.features() isa Matrix
+@test size(Iris.features()) == (4,150)
+
+@test Iris.labels() isa Vector{String}
+@test size(Iris.labels()) == (150,)