This repository has been archived by the owner on Nov 16, 2019. It is now read-only.

Merge pull request #176 from yahoo/lstm_inference
LSTM support in CaffeOnSpark
anfeng authored Nov 4, 2016
2 parents 081407c + b5f0a87 commit 054aa08
Showing 32 changed files with 2,854 additions and 19 deletions.
31 changes: 16 additions & 15 deletions Makefile
@@ -1,14 +1,14 @@
HOME ?=/home/${USER}
ifeq ($(shell which spark-submit),)
-SPARK_HOME ?=/home/y/share/spark
+SPARK_HOME ?= /home/y/share/spark
else
SPARK_HOME ?=$(shell which spark-submit 2>&1 | sed 's/\/bin\/spark-submit//g')
endif
CAFFE_ON_SPARK ?=$(shell pwd)
-LD_LIBRARY_PATH ?=/home/y/lib64:/home/y/lib64/mkl/intel64
+LD_LIBRARY_PATH ?=/home/y/lib64:/home/y/lib64/mkl/intel64:/usr/local/cuda/
LD_LIBRARY_PATH2=${LD_LIBRARY_PATH}:${CAFFE_ON_SPARK}/caffe-public/distribute/lib:${CAFFE_ON_SPARK}/caffe-distri/distribute/lib:/usr/lib64:/lib64
-DYLD_LIBRARY_PATH ?=/home/y/lib64:/home/y/lib64/mkl/intel64
-DYLD_LIBRARY_PATH2=${DYLD_LIBRARY_PATH}:${CAFFE_ON_SPARK}/caffe-public/distribute/lib:${CAFFE_ON_SPARK}/caffe-distri/distribute/lib:/usr/lib64:/lib64
+DYLD_LIBRARY_PATH ?=/home/y/lib64:/home/y/lib64/mkl/intel64:/usr/local/cuda/lib
+DYLD_LIBRARY_PATH2=${DYLD_LIBRARY_PATH}:${CAFFE_ON_SPARK}/caffe-public/distribute/lib:${CAFFE_ON_SPARK}/caffe-distri/distribute/lib:/usr/lib64:/lib64

export SPARK_VERSION=$(shell ${SPARK_HOME}/bin/spark-submit --version 2>&1 | grep version | awk '{print $$5}' | cut -d'.' -f1)
ifeq (${SPARK_VERSION}, 2)
@@ -17,24 +17,25 @@ endif

build:
cd caffe-public; make proto; make -j4 -e distribute; cd ..
-export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}"; mvn ${MVN_SPARK_FLAG} -B package -DskipTests
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}"; GLOG_minloglevel=1 mvn ${MVN_SPARK_FLAG} -B package -DskipTests
jar -xvf caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar META-INF/native/linux64/liblmdbjni.so
mv META-INF/native/linux64/liblmdbjni.so ${CAFFE_ON_SPARK}/caffe-distri/distribute/lib
${CAFFE_ON_SPARK}/scripts/setup-mnist.sh
-export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}"; mvn ${MVN_SPARK_FLAG} -B package
-cp -r ${CAFFE_ON_SPARK}/caffe-public/python/caffe ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/
-cd ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/; zip -r caffeonsparkpythonapi *; mv caffeonsparkpythonapi.zip ${CAFFE_ON_SPARK}/caffe-grid/target/;cd ${CAFFE_ON_SPARK}
-export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}"; export SPARK_HOME="${SPARK_HOME}"; ${CAFFE_ON_SPARK}/caffe-grid/src/test/python/PythonTest.sh
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}"; GLOG_minloglevel=1 mvn ${MVN_SPARK_FLAG} -B test
+cd ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/; zip -r caffeonsparkpythonapi *; cd ${CAFFE_ON_SPARK}/caffe-public/python/; zip -ur ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/caffeonsparkpythonapi.zip *; cd - ; mv caffeonsparkpythonapi.zip ${CAFFE_ON_SPARK}/caffe-grid/target/; cd ${CAFFE_ON_SPARK}
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}; export SPARK_HOME=${SPARK_HOME};GLOG_minloglevel=1 ${CAFFE_ON_SPARK}/caffe-grid/src/test/python/PythonTest.sh

buildosx:
cd caffe-public; make proto; make -j4 -e distribute; cd ..
-export DYLD_LIBRARY_PATH="${DYLD_LIBRARY_PATH2}"; mvn ${MVN_SPARK_FLAG} -B package -DskipTests
+export DYLD_LIBRARY_PATH="${DYLD_LIBRARY_PATH2}"; GLOG_minloglevel=1 mvn ${MVN_SPARK_FLAG} -B package -DskipTests
jar -xvf caffe-grid/target/caffe-grid-0.1-SNAPSHOT-jar-with-dependencies.jar META-INF/native/osx64/liblmdbjni.jnilib
mv META-INF/native/osx64/liblmdbjni.jnilib ${CAFFE_ON_SPARK}/caffe-distri/distribute/lib
${CAFFE_ON_SPARK}/scripts/setup-mnist.sh
-export DYLD_LIBRARY_PATH="${DYLD_LIBRARY_PATH2}"; mvn ${MVN_SPARK_FLAG} -B package
-cp -r ${CAFFE_ON_SPARK}/caffe-public/python/caffe ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/
-cd ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/; zip -r caffeonsparkpythonapi *; mv caffeonsparkpythonapi.zip ${CAFFE_ON_SPARK}/caffe-grid/target/; cd ${CAFFE_ON_SPARK}
-export LD_LIBRARY_PATH="${LD_LIBRARY_PATH2}"; export SPARK_HOME="${SPARK_HOME}"; ${CAFFE_ON_SPARK}/caffe-grid/src/test/python/PythonTest.sh
+export LD_LIBRARY_PATH="${DYLD_LIBRARY_PATH2}"; GLOG_minloglevel=1 mvn ${MVN_SPARK_FLAG} -B test
+cd ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/; zip -r caffeonsparkpythonapi *; cd ${CAFFE_ON_SPARK}/caffe-public/python/; zip -ur ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/caffeonsparkpythonapi.zip *; cd -; mv caffeonsparkpythonapi.zip ${CAFFE_ON_SPARK}/caffe-grid/target/; cd ${CAFFE_ON_SPARK}
+cd ${CAFFE_ON_SPARK}/caffe-grid/src/main/python/; zip -r caffeonsparkpythonapi *; mv caffeonsparkpythonapi.zip ${CAFFE_ON_SPARK}/caffe-grid/target/; cd ${CAFFE_ON_SPARK}
+export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}; export SPARK_HOME=${SPARK_HOME};GLOG_minloglevel=1 ${CAFFE_ON_SPARK}/caffe-grid/src/test/python/PythonTest.sh

update:
git submodule init
git submodule update --force
@@ -48,7 +49,7 @@ gh-pages:
rm -rf scala_doc
git checkout gh-pages scala_doc

-clean:
+clean:
cd caffe-public; make clean; cd ..
cd caffe-distri; make clean; cd ..
mvn ${MVN_SPARK_FLAG} clean
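For reference, a hypothetical invocation of these targets on Linux (the Spark location is an assumption; SPARK_HOME autodetection, GLOG quieting, and the library paths are handled inside the Makefile itself):

    export SPARK_HOME=/opt/spark   # assumed install path; auto-detected when spark-submit is on PATH
    make update                    # sync the caffe-public and caffe-distri submodules
    make build                     # build Caffe, package the jars and Python API zip, run tests

On OS X, make buildosx runs the equivalent sequence using DYLD_LIBRARY_PATH.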
6 changes: 6 additions & 0 deletions caffe-grid/src/main/python/com/yahoo/ml/caffe/Config.py
@@ -37,6 +37,12 @@ class Config:
:ivar int test_data_layer_id: Get layer ID of test data source
:ivar int train_data_layer_id: Get layer ID of training data source
:ivar int transform_thread_per_device: Get/Set # of transformer threads per device
+:ivar String imageCaptionDFDir: Path to generate the image caption dataframe
+:ivar String vocabDir: Path to generate the vocabulary
+:ivar String embeddingDFDir: Path to generate the embedded dataframe
+:ivar String captionFile: Path to the caption file
+:ivar int captionLength: Length of the caption embedding
+:ivar int vocabSize: Size of the vocabulary to consider
"""
def __init__(self,sc,args=None):
registerContext(sc)
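As a hedged sketch of how these new image-caption fields might be populated (attribute names follow the ivar list above; the SparkContext setup and every path and size below are assumptions, not values from this commit):

    from pyspark import SparkConf, SparkContext
    from com.yahoo.ml.caffe.Config import Config

    sc = SparkContext(conf=SparkConf())
    cfg = Config(sc)
    cfg.captionFile = 'hdfs:///coco/captions_train2014.json'  # hypothetical COCO caption file
    cfg.imageCaptionDFDir = 'hdfs:///coco/image_caption_df'   # where the caption dataframe is generated
    cfg.vocabDir = 'hdfs:///coco/vocab'                       # where the vocabulary is generated
    cfg.embeddingDFDir = 'hdfs:///coco/embedding_df'          # where the embedded dataframe is generated
    cfg.captionLength = 20                                    # embedded caption length
    cfg.vocabSize = 8800                                      # number of vocabulary entries to consider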
caffe-grid/src/main/python/com/yahoo/ml/caffe/ConversionUtil.py
@@ -388,7 +388,7 @@ def toJavaSC(pySc):
Converts a Python SQLContext to a Scala SQLContext.
'''
def toScalaSQLC(pySQLc):
-    return pySQLc._scala_SQLContext
+    return jvm.org.apache.spark.sql.SQLContext(pySQLc._jsc.sc())

'''
Converts a Python SQLContext to a Java SQLContext.
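The fix above stops reading the private _scala_SQLContext field (absent in newer PySpark builds) and instead constructs a fresh Scala SQLContext over the live SparkContext. A hedged restatement of the call chain, where jvm is the py4j JVM gateway already used by this module:

    # pySQLc._jsc wraps the JavaSparkContext; .sc() unwraps the Scala SparkContext,
    # and a new org.apache.spark.sql.SQLContext is built around it on the JVM side.
    scala_sqlc = jvm.org.apache.spark.sql.SQLContext(pySQLc._jsc.sc())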
caffe-grid/src/main/python/com/yahoo/ml/caffe/DisplayUtils.py
@@ -20,7 +20,6 @@ def image_tag(np_array):

def show_df(df, nrows=10):
"""Displays a table of labels with their images, inline in html
-    :param DataFrame df: A python dataframe
:param int nrows: First n rows to display from the dataframe
"""
3 changes: 3 additions & 0 deletions caffe-grid/src/main/python/com/yahoo/ml/caffe/__init__.py
@@ -0,0 +1,3 @@
from tools import *

__all__=["tools"]
caffe-grid/src/main/python/com/yahoo/ml/caffe/tools/DFConversions.py
@@ -0,0 +1,92 @@
'''
Copyright 2016 Yahoo Inc.
Licensed under the terms of the Apache 2.0 license.
Please see LICENSE file in the project root for terms.
'''
from PIL import Image
from io import BytesIO
from IPython.display import HTML
import numpy as np
from base64 import b64encode
from google.protobuf import text_format
import array
from com.yahoo.ml.caffe.ConversionUtil import wrapClass, getScalaSingleton, toPython
from com.yahoo.ml.caffe.RegisterContext import registerContext
from pyspark.sql import DataFrame,SQLContext

class DFConversions:
"""
:ivar SparkContext: The spark context of the current spark session
"""

def __init__(self,sc):
registerContext(sc)
wrapClass("com.yahoo.ml.caffe.tools.Conversions$")
self.__dict__['conversions']=toPython(getScalaSingleton("com.yahoo.ml.caffe.tools.Conversions"))
self.__dict__['sqlContext']=SQLContext(sc)

def Coco2ImageCaptionFile(self,src,clusterSize):
"""Convert Cocodataset to Image Caption Dataframe
:param src: the source for coco dataset i.e the caption file
:param clusterSize: No. of executors
"""
df = self.__dict__.get('conversions').Coco2ImageCaptionFile(self.__dict__.get('sqlContext'), src, clusterSize)
pydf = DataFrame(df,self.__dict__.get('sqlContext'))
return pydf


def Image2Embedding(self, imageRootFolder, imageCaptionDF):
"""Get the embedding for the image as a dataframe
:param imageRootFolder: the src folder of the images
:param imageCaptionDF: the dataframe with the image file and image attributes
"""
df = self.__dict__.get('conversions').Image2Embedding(imageRootFolder, imageCaptionDF._jdf)
pydf = DataFrame(df,self.__dict__.get('sqlContext'))
return pydf

def ImageCaption2Embedding(self, imageRootFolder, imageCaptionDF, vocab, captionLength):
"""Get the embedding for the images as well as the caption as a dataframe
:param imageRootFolder: the src folder of the images
:param imageCaptionDF: the dataframe with the images as well as captions
:param vocab: the vocab object
:param captionLength: Length of the embedding to generate for the caption
"""
df = self.__dict__.get('conversions').ImageCaption2Embedding(imageRootFolder, imageCaptionDF._jdf, vocab.vocabObject, captionLength)
pydf = DataFrame(df,self.__dict__.get('sqlContext'))
return pydf


def Embedding2Caption(self, embeddingDF, vocab, embeddingColumn, captionColumn):
"""Get the captions from the embeddings
:param embeddingDF: the dataframe which contains the embedding
:param vocab: the vocab object
:param embeddingColumn: the embedding column name in embeddingDF which contains the caption embedding
"""
df = self.__dict__.get('conversions').Embedding2Caption(embeddingDF._jdf, vocab.vocabObject, embeddingColumn, captionColumn)
pydf = DataFrame(df,self.__dict__.get('sqlContext'))
return pydf


def get_image(image):
bytes = array.array('b', image)
return "<img src='data:image/png;base64," + b64encode(bytes) + "' />"


def show_captions(df, nrows=10):
"""Displays a table of captions(both original as well as predictions) with their images, inline in html
:param DataFrame df: A python dataframe
:param int nrows: First n rows to display from the dataframe
"""
data = df.take(nrows)
html = "<table><tr><th>Image Id</th><th>Image</th><th>Prediction</th>"
for i in range(nrows):
row = data[i]
html += "<tr>"
html += "<td>%s</td>" % row.id
html += "<td>%s</td>" % get_image(row.data.image)
html += "<td>%s</td>" % row.prediction
html += "</tr>"
html += "</table>"
return HTML(html)
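A hedged end-to-end sketch of how these conversion tools might be chained (the module path, column names, paths, and executor count are assumptions; Vocab is defined in the next file):

    from pyspark import SparkConf, SparkContext
    from com.yahoo.ml.caffe.tools.DFConversions import DFConversions, show_captions
    from com.yahoo.ml.caffe.tools.Vocab import Vocab

    sc = SparkContext(conf=SparkConf())
    conv = DFConversions(sc)

    # COCO caption JSON -> image/caption dataframe, spread over 10 executors
    caption_df = conv.Coco2ImageCaptionFile('hdfs:///coco/captions_train2014.json', 10)

    # embed images and captions for LSTM training, given a previously built vocabulary
    vocab = Vocab(sc)
    vocab.load('hdfs:///coco/vocab.txt')
    embedding_df = conv.ImageCaption2Embedding('hdfs:///coco/images', caption_df, vocab, 20)

    # after inference, decode a predicted embedding column back into text and display it
    caption_out = conv.Embedding2Caption(embedding_df, vocab, 'prediction', 'caption')
    show_captions(caption_out, nrows=5)   # assumes id and data.image columns are preserved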
43 changes: 43 additions & 0 deletions caffe-grid/src/main/python/com/yahoo/ml/caffe/tools/Vocab.py
@@ -0,0 +1,43 @@
'''
Copyright 2016 Yahoo Inc.
Licensed under the terms of the Apache 2.0 license.
Please see LICENSE file in the project root for terms.
'''

from com.yahoo.ml.caffe.ConversionUtil import wrapClass
from com.yahoo.ml.caffe.RegisterContext import registerContext
from pyspark.sql import DataFrame,SQLContext

class Vocab:
"""
:ivar SparkContext: The spark context of the current spark session
"""

def __init__(self,sc):
registerContext(sc)
self.vocab=wrapClass("com.yahoo.ml.caffe.tools.Vocab")
self.sqlContext=SQLContext(sc)
self.vocabObject=self.vocab(self.sqlContext)

def genFromData(self,dataset,columnName,vocabSize):
"""Convert generate the vocabulary from dataset
:param dataset: dataframe containing the captions
:param columnName: column in the dataset which has the caption
:param vocabSize: Size of the vocabulary to generate (with vocab in descending order)
"""
self.vocabObject.genFromData(dataset._jdf,columnName,vocabSize)

def save(self, vocabFilePath):
"""Save the generated vocabulary
:param vocabFilePath: the name of the file to save the vocabulary to
"""
self.vocabObject.save(vocabFilePath)

def load(self, vocabFilePath):
"""Load the vocabulary from a file
:param vocabFilePath: the name of the file to load the vocabulary from
"""
self.vocabObject.load(vocabFilePath)
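A minimal hedged sketch of the vocabulary life cycle (caption_df, the column name, and the paths are assumptions):

    from com.yahoo.ml.caffe.tools.Vocab import Vocab

    vocab = Vocab(sc)                                # sc: an existing SparkContext
    vocab.genFromData(caption_df, 'caption', 8800)   # keep the 8800 most frequent words
    vocab.save('hdfs:///coco/vocab.txt')

    vocab2 = Vocab(sc)                               # e.g. in a later inference job
    vocab2.load('hdfs:///coco/vocab.txt')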


50 changes: 50 additions & 0 deletions caffe-grid/src/main/python/examples/ImageCaption.py
@@ -0,0 +1,50 @@
# Copyright 2016 Yahoo Inc.
# Licensed under the terms of the Apache 2.0 license.
# Please see LICENSE file in the project root for terms.
import caffe
from examples.coco.retrieval_experiment import *
from pyspark.sql import SQLContext
from pyspark import SparkConf,SparkContext
from pyspark.sql.types import *
from itertools import izip_longest
import json
import argparse

def predict_caption(list_of_images, model, imagenet, lstmnet, vocab):
out_iterator = []
ce = CaptionExperiment(str(model),str(imagenet),str(lstmnet),str(vocab))
for image in list_of_images:
out_iterator.append(ce.getCaption(image))
return iter(out_iterator)

def get_predictions(sqlContext, images, model, imagenet, lstmnet, vocab):
rdd = images.mapPartitions(lambda im: predict_caption(im, model, imagenet, lstmnet, vocab))
INNERSCHEMA = StructType([StructField("id", StringType(), True),StructField("prediction", StringType(), True)])
schema = StructType([StructField("result", INNERSCHEMA, True)])
return sqlContext.createDataFrame(rdd, schema).select("result.id", "result.prediction")

def main():
conf = SparkConf()
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
cmdargs = conf.get('spark.pythonargs')
parser = argparse.ArgumentParser(description="Image to Caption Util")
parser.add_argument('-input', action="store", dest="input")
parser.add_argument('-model', action="store", dest="model")
parser.add_argument('-imagenet', action="store", dest="imagenet")
parser.add_argument('-lstmnet', action="store", dest="lstmnet")
parser.add_argument('-vocab', action="store", dest="vocab")
parser.add_argument('-output', action="store", dest="output")

args=parser.parse_args(cmdargs.split(" "))

df_input = sqlContext.read.parquet(str(args.input))
images = df_input.select("data.image","data.height", "data.width", "id")
df=get_predictions(sqlContext, images, str(args.model), str(args.imagenet), str(args.lstmnet), str(args.vocab))
df.write.json(str(args.output))


if __name__ == "__main__":
main()
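Because this script reads its arguments from the spark.pythonargs configuration value rather than sys.argv, a launch might look like the following (cluster settings and all paths are hypothetical; the zip is the Python API archive produced by the Makefile above):

    spark-submit --master yarn \
        --py-files caffe-grid/target/caffeonsparkpythonapi.zip \
        --conf spark.pythonargs="-input hdfs:///coco/embedding_df -model hdfs:///coco/lstm.caffemodel -imagenet imagenet.prototxt -lstmnet lstm.prototxt -vocab vocab.txt -output hdfs:///coco/predictions" \
        caffe-grid/src/main/python/examples/ImageCaption.py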



