From aa5a628e7ac2154b391f92bdf94bcbce9fb2f65a Mon Sep 17 00:00:00 2001
From: Bertrand Dechoux
Date: Thu, 5 Jul 2012 17:42:58 +0200
Subject: [PATCH 1/2] add inverted index example for cascalog

---
 inverted-index/inverted-index.clj | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 inverted-index/inverted-index.clj

diff --git a/inverted-index/inverted-index.clj b/inverted-index/inverted-index.clj
new file mode 100644
index 0000000..b0c9354
--- /dev/null
+++ b/inverted-index/inverted-index.clj
@@ -0,0 +1,25 @@
+;;;; Inverted index in Cascalog.
+
+;; bootstrap
+(use 'cascalog.playground)(bootstrap)
+
+;; define the data: an in-memory source of [document-id text] tuples
+(def index [
+  [0 "Hello World"]
+  [101 "The quick brown fox jumps over the lazy dog"]
+  [42 "Answer to the Ultimate Question of Life, the Universe, and Everything"]
+])
+
+;; emit one word per whitespace-separated token; re-seq never yields empty tokens
+(defmapcatop tokenize [text]
+  (re-seq #"\S+" text))
+
+;; ensure inverted index is distinct per word: emit the set of ids in each group
+(defbufferop distinct-vals [tuples]
+  (list (set (map first tuples))))
+
+;; run the query on data: print each word with the set of ids containing it
+(?<- (stdout) [?word ?ids]
+  (index ?id ?text)
+  (tokenize ?text :> ?word)
+  (distinct-vals ?id :> ?ids))
\ No newline at end of file

From 41ebe0c1d095152e4d443123802500b8f68615e6 Mon Sep 17 00:00:00 2001
From: Bertrand Dechoux
Date: Thu, 5 Jul 2012 17:54:33 +0200
Subject: [PATCH 2/2] update README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index be30a46..233a257 100644
--- a/README.md
+++ b/README.md
@@ -13,5 +13,6 @@ A collection of MapReduce tasks translated (from Pig, Hive, MapReduce streaming,
   * [Pig](https://github.com/echen/rosetta-scone/blob/master/distributed-grep/distributed-grep.pig)
 * [Inverted Index](https://github.com/echen/rosetta-scone/tree/master/inverted-index)
   * [Scalding](https://github.com/echen/rosetta-scone/blob/master/inverted-index/InvertedIndex.scala)
+  * [Cascalog](https://github.com/echen/rosetta-scone/blob/master/inverted-index/inverted-index.clj)
   * [Hadoop Streaming](https://github.com/echen/rosetta-scone/blob/master/inverted-index/inverted_index.rb)
   * [Pig](https://github.com/echen/rosetta-scone/blob/master/inverted-index/inverted-index.pig)
\ No newline at end of file