
5x decrease in performance with new Julia build #7000

Closed
@jfrolich

Description


I am currently working on a high-performance text analytics package for Julia.

Today I installed the new release and got a 5x decrease in performance with the current Julia build (Version 0.3.0-prerelease+3225 (2014-05-27 03:45 UTC)) compared to an older build (Version 0.3.0-prerelease+2727 (2014-04-23 18:25 UTC)). It is not clear why this is happening; when I reinstall the old version, performance is back to normal. I used the Windows x64 builds.

The code is quite long, but perhaps the bottleneck is obvious. Otherwise I can try to make a simpler showcase of the performance problem; a first attempt at such a reduction is sketched below. I am still learning Julia, so it may well be an obvious mistake on my part.
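A minimal sketch of that reduction, assuming the hot path is the tokenize/get! loop inside dtm further down (the document text is made up):

# stripped-down version of the dtm inner loop over a synthetic corpus;
# exercises regex matching, Dict lookups via get!, and push! on Uint32 vectors
function mini_benchmark(n_docs::Int)
    vocabulary = Dict{UTF8String, Uint32}()
    is = Uint32[]
    js = Uint32[]
    vs = Uint32[]
    doc = utf8("the quick brown fox jumps over the lazy dog")
    for i in 1:n_docs
        for token in [utf8(t) for t in matchall(r"\b\w\w+\b", doc)]
            push!(is, i)
            push!(js, get!(vocabulary, token, length(vocabulary)+1))
            push!(vs, 1)
        end
    end
    sparse(is, js, vs, n_docs, length(vocabulary))
end

mini_benchmark(10)          # warm-up
@time mini_benchmark(100000)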

using DataFrames

import Base.length

PATH = "C:\\Users\\JFrolich\\Documents\\Data sets\\Reuters\\fetch\\full_dataset"
metadata = readtable(joinpath(PATH, "metadata.csv"))
y = array(metadata[:acq])

typealias Vocabulary Dict{UTF8String, Uint32}
typealias UnweightedDTM SparseMatrixCSC{Uint32}
typealias WeightedDTM SparseMatrixCSC{Float64}
typealias DTM Union(WeightedDTM, UnweightedDTM)
typealias Document UTF8String

# normalizes a document before tokenization (currently just lowercases)
function preprocess(d::Document)
    lowercase(d)
end

# splits a document into word tokens of two or more characters
function tokenize(s::Document)
    [utf8(t) for t in matchall(r"\b\w\w+\b", preprocess(s))]
end
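For example, on an illustrative input:

tokenize(utf8("The quick brown Fox!"))
# => UTF8String["the", "quick", "brown", "fox"]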

# A document collection is a collection of documents.
# Concrete implementations need to provide the following interface:
#
# text:   returns the plain text of the document at a given index
# length: returns the number of documents in the collection
#
abstract DocumentCollection

# Text files implementation
# This implementation of DocumentCollection iterates through
# text files to create the DTM
type TextFiles <: DocumentCollection
    filenames::Vector{String}
    buffer_ptr::Int
    buffer_size::Int
    buffer::Vector{Document}
end

# non-const globals are slow in Julia, so mark the buffer size const
const TEXT_BUFFER_SIZE = 30000

function text_files(files)
    TextFiles(files, -1, TEXT_BUFFER_SIZE, Document[])
end

# returns the text for index 'i'; it pre-fills a buffer of text
# for the files in the collection when necessary, because
# loading file by file is much slower (factor 3)
function text(dc::TextFiles, i::Int)
    if dc.buffer_ptr < 0 || i >= (dc.buffer_ptr + dc.buffer_size) || i < dc.buffer_ptr
        fill_buffer!(dc, i)
    end

    dc.buffer[i-dc.buffer_ptr+1]
end

function fill_buffer!(dc::TextFiles, i::Int)
    dc.buffer_ptr = i
    dc.buffer = Document[]
    # read buffer_size files starting at i (the loop variable must not
    # shadow the argument i, which marks the start of the buffer)
    for j in dc.buffer_ptr:(dc.buffer_ptr + dc.buffer_size - 1)
        if j > length(dc)
            break
        end
        push!(dc.buffer, open(dc.filenames[j]) do f
            Document(readbytes(f))
        end)
    end
end

length(dc::TextFiles) = Base.length(dc.filenames)
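Usage looks like this (the file names are hypothetical):

dc = text_files(["doc1.txt", "doc2.txt"])
text(dc, 1)   # first access fills the buffer starting at index 1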

# TextArray: this implementation of DocumentCollection is simply
# backed by an array of strings
type TextArray <: DocumentCollection
    texts::Vector{Document}
end

function text(dc::TextArray, i::Int)
    dc.texts[i]
end

function text_array(texts)
    TextArray(texts)
end

length(dc::TextArray) = Base.length(dc.texts)

Base.start(dc::DocumentCollection) = 1
Base.next(dc::DocumentCollection, i) = (text(dc, i), i+1)
Base.done(dc::DocumentCollection, i) = i > length(dc)
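With these three methods any DocumentCollection iterates like a regular collection:

for t in text_array([utf8("first document"), utf8("second document")])
    println(t)
end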

# calculates chi-squared scores of a sparse matrix against a binary variable (y)
function chi2(X::DTM, y::Vector{Int})
    f_obs = X' * y                    # observed feature counts in the positive class
    feature_count = vec(sum(X, 1))    # total count of each feature over all documents
    class_prob    = mean(y)
    f_exp = class_prob * feature_count
    chisq = Array(Float64, length(f_obs))
    for i in 1:length(f_obs)
        chisq[i] = (f_obs[i] - f_exp[i])^2 / f_exp[i]
    end
    chisq
end

# Feature selection
function select_k_best(X::DTM, y::Vector{Int}, score_fnc, k::Int)
    scores = score_fnc(X,y)
    sortperm(scores::Array{Float64,1}, rev=true, alg=MergeSort)[1:k]
end
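A toy run of the selection (the data is made up):

Xt = sparse([1, 1, 2, 3], [1, 2, 2, 3], uint32([2, 1, 3, 1]))   # 3 documents x 3 features
yt = [1, 0, 1]
select_k_best(Xt, yt, chi2, 2)   # indices of the two highest-scoring features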

# appends all n-grams of up to n_grams tokens to the token list;
# grams are collected separately so the loop never reads the
# grams it has just appended
function ngramize(tokens::Array{UTF8String,1}, n_grams::Int64=2)
    n_tokens = length(tokens)
    grams = UTF8String[]
    for i in 1:n_tokens
        gram = tokens[i]
        for j in 2:n_grams
            if i + j - 1 > n_tokens
                break
            end
            gram = string(gram, " ", tokens[i+j-1])
            push!(grams, gram)
        end
    end
    append!(tokens, grams)
end
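For instance, with bigrams:

ngramize([utf8("new"), utf8("york"), utf8("city")], 2)
# => ["new", "york", "city", "new york", "york city"]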



# reads all files into memory up front
function text_from_files(files)
    texts = UTF8String[]
    for file in files
        push!(texts, open(file) do f
            UTF8String(readbytes(f))
        end)
    end
    texts
end

# TODO: Add ngram range feature
function dtm(dc::DocumentCollection, n_grams::Int)
    vocabulary = Vocabulary()
    is = Uint32[]
    js = Uint32[]
    vs = Uint32[]
    for (i, doc) in enumerate(dc)
        tokens::Array{UTF8String,1} = ngramize(tokenize(doc), n_grams)
        for token in tokens
            push!(is, i)
            # assign the next vocabulary id on first sight of a token
            push!(js, get!(vocabulary, token, length(vocabulary)+1))
            push!(vs, 1)
        end
    end
    (sparse(is, js, vs, length(dc), length(vocabulary)),
     vocabulary_to_feature_names(vocabulary))
end
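On a tiny in-memory collection this produces, for example:

dc_demo = text_array([utf8("apples and oranges"), utf8("apples and pears")])
X_demo, feature_names_demo = dtm(dc_demo, 2)
size(X_demo)   # => (2, 7): 4 distinct tokens plus 3 distinct bigrams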

# inverts the token => id vocabulary into an id-ordered list of feature names
function vocabulary_to_feature_names(vocabulary::Vocabulary)
    inv_voc = Dict{Uint32, UTF8String}()
    for (k,v) in vocabulary
        inv_voc[v] = k
    end
    [inv_voc[convert(Uint32, i)] for i in 1:length(inv_voc)]
end

files = [string(PATH, "/text/", id) for id in metadata[:id]]

dc = text_array(text_from_files(files))

@time X, feature_names = dtm(dc,3)
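For what it's worth, a warm-up call would keep JIT compilation out of the measurement:

dtm(dc, 3)   # warm-up: the first call includes compilation time
@time X, feature_names = dtm(dc, 3)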

Labels: performance, regression