locationtech · lossyrob · Jul 29, 2016 · Jun 29, 2016 · Jul 12, 2016 · Jul 14, 2016
diff --git a/.travis.yml b/.travis.yml
@@ -23,6 +23,7 @@ before_script:
   - psql -c 'CREATE EXTENSION postgis;' -U postgres -d slick_tests
   - psql -c 'CREATE EXTENSION postgis_topology;' -U postgres -d slick_tests
   - docker run -d --restart=always --net=host -m 1G --memory-swap -1 --env="MAX_HEAP_SIZE=500M" --env="HEAP_NEWSIZE=100M" --env="CASSANDRA_LISTEN_ADDRESS=127.0.0.1" cassandra:latest
+  - .travis/hbase-install.sh
 
 jdk:
   - openjdk7

diff --git a/.travis/build-and-test-set-1.sh b/.travis/build-and-test-set-1.sh
@@ -3,6 +3,7 @@
 ./sbt -J-Xmx2G "++$TRAVIS_SCALA_VERSION" "project doc-examples" compile  || { exit 1; }
 ./sbt -J-Xmx2G "++$TRAVIS_SCALA_VERSION" "project spark" test  || { exit 1; }
 ./sbt -J-Xmx2G "++$TRAVIS_SCALA_VERSION" "project accumulo" test  || { exit 1; }
+./sbt -J-Xmx2G "++$TRAVIS_SCALA_VERSION" "project hbase" test  || { exit 1; }
 ./sbt -J-Xmx2G "++$TRAVIS_SCALA_VERSION" "project proj4" test || { exit 1; }
 ./sbt -J-Xmx2G "++$TRAVIS_SCALA_VERSION" "project geotools" test || { exit 1; }
 ./sbt -J-Xmx2G "++$TRAVIS_SCALA_VERSION" "project shapefile" test || { exit 1; }
diff --git a/.travis/deploy.sh b/.travis/deploy.sh
@@ -20,6 +20,7 @@ EOF
   && ./sbt "++$TRAVIS_SCALA_VERSION" "project s3" publish \
   && ./sbt "++$TRAVIS_SCALA_VERSION" "project accumulo" publish \
   && ./sbt "++$TRAVIS_SCALA_VERSION" "project cassandra" publish \
+  && ./sbt "++$TRAVIS_SCALA_VERSION" "project hbase" publish \
   && ./sbt "++$TRAVIS_SCALA_VERSION" "project spark-etl" publish \
   && ./sbt "++$TRAVIS_SCALA_VERSION" "project shapefile" publish \
   && ./sbt "++$TRAVIS_SCALA_VERSION" "project slick" publish \

diff --git a/.travis/hbase-install.sh b/.travis/hbase-install.sh
@@ -0,0 +1,6 @@
+#! /bin/bash
+
+# Downloads an HBase binary distribution, installs the Travis hbase-site.xml into it
+# and starts HBase.
+
+# Stop at the first failing step instead of continuing with a missing or
+# half-unpacked distribution.
+set -e
+
+# HBase release to install; can be overridden through the environment.
+HBASE_VERSION="${HBASE_VERSION:-1.2.2}"
+HBASE_DIR="hbase-${HBASE_VERSION}"
+
+sudo wget "http://apache-mirror.rbc.ru/pub/apache/hbase/${HBASE_VERSION}/${HBASE_DIR}-bin.tar.gz"
+sudo tar xzf "${HBASE_DIR}-bin.tar.gz"
+
+# Replace the bundled configuration with the one kept in the repository.
+sudo rm -f "${HBASE_DIR}/conf/hbase-site.xml"
+sudo mv .travis/hbase/hbase-site.xml "${HBASE_DIR}/conf"
+
+sudo "${HBASE_DIR}/bin/start-hbase.sh"
diff --git a/.travis/hbase/hbase-site.xml b/.travis/hbase/hbase-site.xml
@@ -0,0 +1,44 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+-->
+<configuration>
+    <property>
+        <name>hbase.rootdir</name>
+        <value>/tmp/hbase</value>
+    </property>
+    <property>
+        <name>hbase.zookeeper.property.dataDir</name>
+        <value>/tmp/zookeeper</value>
+    </property>
+    <property>
+        <name>hbase.zookeeper.property.maxClientCnxns</name>
+        <value>0</value>
+    </property>
+    <property>
+        <name>zookeeper.session.timeout</name>
+        <value>1200000</value>
+    </property>
+    <property>
+        <name>hbase.zookeeper.property.tickTime</name>
+        <value>6000</value>
+    </property>
+</configuration>
diff --git a/README.md b/README.md
@@ -161,7 +161,11 @@ This is a list of features contained in the GeoTrellis library. It is broken up
 
 #### geotrellis-cassandra
 
-Save and load layers to and from Casandra. Query large layers efficiently using the layer query API.
+- Save and load layers to and from Cassandra. Query large layers efficiently using the layer query API.
+
+#### geotrellis-hbase
+
+- Save and load layers to and from HBase. Query large layers efficiently using the layer query API.
 
 #### geotrellis-s3
 

diff --git a/build.sbt b/build.sbt
@@ -71,6 +71,7 @@ lazy val root = Project("geotrellis", file(".")).
     s3,
     accumulo,
     cassandra,
+    hbase,
     geotools,
     slick
   ).
@@ -141,8 +142,12 @@ lazy val cassandra = Project("cassandra", file("cassandra")).
   dependsOn(sparkTestkit % "test->test", spark % "provided;test->test").
   settings(commonSettings: _*)
 
+lazy val hbase = Project("hbase", file("hbase")).
+  dependsOn(sparkTestkit % "test->test", spark % "provided;test->test").
+  settings(commonSettings: _*)
+
 lazy val sparkEtl = Project(id = "spark-etl", base = file("spark-etl")).
-  dependsOn(spark, s3, accumulo, cassandra).
+  dependsOn(spark, s3, accumulo, cassandra, hbase).
   settings(commonSettings: _*)
 
 lazy val geotools = Project("geotools", file("geotools")).
@@ -157,5 +162,5 @@ lazy val util = Project("util", file("util")).
   settings(commonSettings: _*)
 
 lazy val docExamples = Project("doc-examples", file("doc-examples")).
-  dependsOn(spark, s3, accumulo, cassandra, spark % "test->test", sparkTestkit % "test->test").
+  dependsOn(spark, s3, accumulo, cassandra, hbase, spark % "test->test", sparkTestkit % "test->test").
   settings(commonSettings: _*)
diff --git a/cassandra/build.sbt b/cassandra/build.sbt
@@ -2,9 +2,11 @@ import Dependencies._
 
 name := "geotrellis-cassandra"
 libraryDependencies ++= Seq(
-  "com.datastax.cassandra" % "cassandra-driver-core" % "3.0.0"
-    excludeAll (ExclusionRule("org.jboss.netty"), ExclusionRule("io.netty"), ExclusionRule("org.slf4j"), ExclusionRule("io.spray"), ExclusionRule("com.typesafe.akka"))
-    exclude("org.apache.hadoop", "hadoop-client"),
+  "com.datastax.cassandra" % "cassandra-driver-core" % Version.cassandra
+    excludeAll (
+      ExclusionRule("org.jboss.netty"), ExclusionRule("io.netty"),
+      ExclusionRule("org.slf4j"), ExclusionRule("io.spray"), ExclusionRule("com.typesafe.akka")
+    ) exclude("org.apache.hadoop", "hadoop-client"),
   "org.apache.spark" %% "spark-core" % Version.spark % "provided",
   spire,
   scalatest % "test")

diff --git a/cassandra/src/test/scala/geotrellis/spark/CassandraTestEnvironment.scala b/cassandra/src/test/scala/geotrellis/spark/CassandraTestEnvironment.scala
@@ -16,7 +16,6 @@
 
 package geotrellis.spark
 
-import com.esotericsoftware.kryo.Kryo
 import geotrellis.spark.io.cassandra.BaseCassandraInstance
 import org.apache.spark.SparkConf
 import org.scalatest._

diff --git a/docs/README.md b/docs/README.md
@@ -33,6 +33,8 @@ GeoTrellis.
   - [spark input/output](spark/spark-io.md) // planned
 - [geotrellis.cassandra](cassandra/)
   - [geotrellis.cassandra.test](cassandra/cassandra-test.md)
+- [geotrellis.hbase](hbase/)
+  - [geotrellis.hbase.test](hbase/hbase-test.md)
 - [geotrellis.spark-etl](spark-etl/spark-etl-intro.md)
   - [geotrellis.spark-etl.run-examples](spark-etl/spark-etl-run-examples.md)
 - [geotrellis.util](util/util-intro.md)

diff --git a/docs/hbase/hbase-test.md b/docs/hbase/hbase-test.md
@@ -0,0 +1,12 @@
+# geotrellis.hbase.test
+
+HBase provides mock instances for running tests, but this approach is expensive in both machine resources and dependencies.
+Instead, the tests can be run against any HBase instance. By default, HBase runs in standalone mode with minimal resource settings and provides its own standalone ZooKeeper.
+However, it may be necessary to increase the ZooKeeper connection timeouts and limits (see the [config example](/.travis/hbase/hbase-site.xml)).
+There is an [example script](/scripts/hbaseTestDB.sh) to start HBase in a Docker container. HBase is
+extremely sensitive to host names, which is why there can be problems starting HBase in a Docker container, similar to the problems of accessing HBase from a separate machine.
+
+### Mac OS X / Windows users
+
+Docker is not supported natively on Mac OS X / Windows; it is possible to use [Docker Beta](https://beta.docker.com/), [Docker Machine](https://docs.docker.com/machine/), or another solution.
+Experiments with these are appreciated, though the easiest way to run HBase on an OS X machine is simply to [download](http://apache-mirror.rbc.ru/pub/apache/hbase/1.2.2/hbase-1.2.2-bin.tar.gz) the HBase distribution and launch it in standalone mode.
diff --git a/hbase/build.sbt b/hbase/build.sbt
@@ -0,0 +1,25 @@
+import Dependencies._
+
+name := "geotrellis-hbase"
+libraryDependencies ++= Seq( // HBase artifacts below are declared with servlet-api exclusions
+  "org.apache.hbase" % "hbase-common" % Version.hbase exclude("javax.servlet", "servlet-api"),
+  "org.apache.hbase" % "hbase-client" % Version.hbase exclude("javax.servlet", "servlet-api"),
+  "org.apache.hbase" % "hbase-server" % Version.hbase exclude ("org.mortbay.jetty", "servlet-api-2.5"),
+  "org.apache.hbase" % "hbase-hadoop-compat" % Version.hbase exclude("javax.servlet", "servlet-api"),
+  "org.apache.spark" %% "spark-core" % Version.spark % "provided", // "provided" scope: not bundled with this artifact
+  spire,
+  scalatest % "test")
+
+fork in Test := false // run tests inside the sbt JVM instead of a forked one
+parallelExecution in Test := false // do not run test suites in parallel
+
+initialCommands in console := // imports evaluated when the sbt console starts
+  """
+  import geotrellis.raster._
+  import geotrellis.vector._
+  import geotrellis.proj4._
+  import geotrellis.spark._
+  import geotrellis.spark.util._
+  import geotrellis.spark.tiling._
+  import geotrellis.spark.io.hbase._
+  """
diff --git a/hbase/src/main/resources/reference.conf b/hbase/src/main/resources/reference.conf
@@ -0,0 +1 @@
+geotrellis.hbase.catalog = "metadata"
diff --git a/hbase/src/main/scala/geotrellis/spark/io/hbase/HBaseAttributeStore.scala b/hbase/src/main/scala/geotrellis/spark/io/hbase/HBaseAttributeStore.scala
@@ -0,0 +1,114 @@
+package geotrellis.spark.io.hbase
+
+import com.typesafe.config.ConfigFactory
+import geotrellis.spark._
+import geotrellis.spark.io._
+
+import org.apache.hadoop.hbase._
+import org.apache.hadoop.hbase.client._
+import org.apache.hadoop.hbase.util.Bytes
+import org.apache.spark.Logging
+import spray.json._
+import spray.json.DefaultJsonProtocol._
+
+import scala.collection.JavaConversions._
+
+/** Factory methods for [[HBaseAttributeStore]]. */
+object HBaseAttributeStore {
+  /** Builds a store whose table name is read from the `geotrellis.hbase.catalog` config entry. */
+  def apply(instance: HBaseInstance): HBaseAttributeStore = {
+    val catalogTable = ConfigFactory.load().getString("geotrellis.hbase.catalog")
+    new HBaseAttributeStore(instance, catalogTable)
+  }
+
+  /** Builds a store backed by the explicitly named attribute table. */
+  def apply(instance: HBaseInstance, attributeTable: String): HBaseAttributeStore =
+    new HBaseAttributeStore(instance, attributeTable)
+}
+
+/**
+  * Attribute store that keeps layer attributes in a single HBase table.
+  *
+  * Each layer occupies one row keyed by `layerIdString(layerId)`. Each attribute name is a
+  * column family; the attribute value is stored under the empty qualifier as the JSON of
+  * the `(LayerId, value)` pair.
+  *
+  * @param instance       HBase instance providing the admin handle and the table connection
+  * @param attributeTable name of the attribute table; created on construction when missing
+  */
+class HBaseAttributeStore(val instance: HBaseInstance, val attributeTable: String) extends DiscreteLayerAttributeStore with Logging {
+
+  // Create the attribute table, with the header column family, if it does not exist yet.
+  if (!instance.getAdmin.tableExists(attributeTable)) {
+    val tableDesc = new HTableDescriptor(attributeTable: TableName)
+    val headerColumnFamilyDesc = new HColumnDescriptor(AttributeStore.Fields.header)
+    tableDesc.addFamily(headerColumnFamilyDesc)
+    instance.getAdmin.createTable(tableDesc)
+  }
+
+  val table = instance.getAdmin.getConnection.getTable(attributeTable)
+
+  val SEP = HBaseRDDWriter.SEP
+
+  /** Row key of a layer: its name and zoom joined by `SEP`. */
+  def layerIdString(layerId: LayerId): String = s"${layerId.name}${SEP}${layerId.zoom}"
+
+  /** Adds the column family `cf` to the attribute table unless it is already present. */
+  def addColumn(cf: String): Unit = if(!table.getTableDescriptor.hasFamily(cf))
+    instance.getAdmin.addColumn(attributeTable, new HColumnDescriptor(cf))
+
+  /** Runs `scan`, materializes every returned row and always closes the scanner. */
+  private def collectRows(scan: Scan): Vector[Result] = {
+    val scanner = table.getScanner(scan)
+    try {
+      val rows: Iterator[Result] = scanner.iterator()
+      rows.toVector
+    } finally scanner.close()
+  }
+
+  /** Decodes the `(LayerId, value)` JSON pair stored for `attributeName` in `row`. */
+  private def decode[T: JsonFormat](row: Result, attributeName: String): (LayerId, T) =
+    Bytes.toString(row.getValue(attributeName, "")).parseJson.convertTo[(LayerId, T)]
+
+  /**
+    * Returns the rows holding `attributeName`, restricted to the row of `layerId` when given.
+    * An attribute whose column family was never created yields no rows, so callers can
+    * report it as a missing attribute.
+    */
+  private def fetch(layerId: Option[LayerId], attributeName: String): Vector[Result] =
+    if(!table.getTableDescriptor.hasFamily(attributeName)) Vector.empty
+    else {
+      val scan = new Scan()
+      layerId.foreach { id =>
+        scan.setStartRow(layerIdString(id))
+        scan.setStopRow(stringToBytes(layerIdString(id)) :+ 0.toByte) // add trailing byte, to include stop row
+      }
+      scan.addFamily(attributeName)
+      collectRows(scan)
+    }
+
+  /**
+    * Deletes the whole row of `layerId`, or only the given attribute of that row.
+    *
+    * Only the cells of this layer are removed. Column families are shared by every layer
+    * in the table, so the table schema is left untouched.
+    *
+    * @throws LayerNotFoundError when the layer has no row in the attribute table
+    */
+  private def delete(layerId: LayerId, attributeName: Option[String]): Unit = {
+    if(!layerExists(layerId)) throw new LayerNotFoundError(layerId)
+
+    val delete = new Delete(layerIdString(layerId))
+    attributeName.foreach(delete.addFamily(_))
+    table.delete(delete)
+
+    attributeName match {
+      case Some(attribute) => clearCache(layerId, attribute)
+      case None => clearCache(layerId)
+    }
+  }
+
+  /**
+    * Reads one attribute of one layer.
+    *
+    * @throws AttributeNotFoundError when the layer has no such attribute
+    * @throws LayerIOError           when more than one row matches
+    */
+  def read[T: JsonFormat](layerId: LayerId, attributeName: String): T =
+    fetch(Some(layerId), attributeName) match {
+      case Vector() =>
+        throw new AttributeNotFoundError(attributeName, layerId)
+      case Vector(row) =>
+        decode[T](row, attributeName)._2
+      case _ =>
+        throw new LayerIOError(s"Multiple attributes found for $attributeName for layer $layerId")
+    }
+
+  /** Reads `attributeName` for every layer that has it, keyed by layer id. */
+  def readAll[T: JsonFormat](attributeName: String): Map[LayerId, T] =
+    fetch(None, attributeName)
+      .map { row => decode[T](row, attributeName) }
+      .toMap
+
+  /** Stores `value` as the attribute `attributeName` of `layerId`, creating the column family if needed. */
+  def write[T: JsonFormat](layerId: LayerId, attributeName: String, value: T): Unit = {
+    addColumn(attributeName)
+    val put = new Put(layerIdString(layerId))
+    put.addColumn(
+      attributeName, "", System.currentTimeMillis(),
+      // Encode as UTF-8 to match Bytes.toString on the read side, independent of the platform charset.
+      Bytes.toBytes((layerId, value).toJson.compactPrint)
+    )
+
+    table.put(put)
+  }
+
+  /** True when the attribute table holds a non-empty row for `layerId`. */
+  def layerExists(layerId: LayerId): Boolean = !table.get(new Get(layerIdString(layerId))).isEmpty
+
+  /** Deletes every attribute of `layerId`. */
+  def delete(layerId: LayerId): Unit = delete(layerId, None)
+
+  /** Deletes a single attribute of `layerId`. */
+  def delete(layerId: LayerId, attributeName: String): Unit = delete(layerId, Some(attributeName))
+
+  /**
+    * Lists the ids of all layers that have a row in the attribute table.
+    *
+    * NOTE(review): `String.split` treats `SEP` as a regular expression, and the pattern
+    * match expects exactly two parts; a layer name containing `SEP` would fail here.
+    * Confirm against the definition of `HBaseRDDWriter.SEP`.
+    */
+  def layerIds: Seq[LayerId] =
+    collectRows(new Scan())
+      .map { kv: Result =>
+        val List(name, zoomStr) = Bytes.toString(kv.getRow).split(SEP).toList
+        LayerId(name, zoomStr.toInt)
+      }
+      .toList
+      .distinct
+
+  /**
+    * Lists the attributes stored for `layerId`, i.e. the column families that hold data
+    * in the row of that layer. Returns an empty sequence when the layer has no row.
+    */
+  def availableAttributes(layerId: LayerId): Seq[String] =
+    // getNoVersionMap is null for an empty result, hence the Option wrapper.
+    Option(table.get(new Get(layerIdString(layerId))).getNoVersionMap)
+      .map { families => families.keySet.map(Bytes.toString).toSeq }
+      .getOrElse(Seq.empty)
+}
diff --git a/hbase/src/main/scala/geotrellis/spark/io/hbase/HBaseInstance.scala b/hbase/src/main/scala/geotrellis/spark/io/hbase/HBaseInstance.scala
@@ -0,0 +1,17 @@
+package geotrellis.spark.io.hbase
+
+import org.apache.hadoop.hbase.HBaseConfiguration
+import org.apache.hadoop.hbase.client._
+
+/**
+  * Connection settings for an HBase cluster.
+  *
+  * @param zookeepers hosts joined with commas into `hbase.zookeeper.quorum`
+  * @param master     value stored under `hbase.master`
+  * @param clientPort value stored under `hbase.zookeeper.property.clientPort`
+  */
+case class HBaseInstance(zookeepers: Seq[String], master: String, clientPort: String = "2181") extends Serializable {
+  /** HBase configuration built from the constructor arguments on first access; not serialized. */
+  @transient lazy val conf = {
+    val hbaseConf = HBaseConfiguration.create
+    Seq(
+      "hbase.zookeeper.quorum" -> zookeepers.mkString(","),
+      "hbase.zookeeper.property.clientPort" -> clientPort,
+      "hbase.master" -> master
+    ).foreach { case (key, value) => hbaseConf.set(key, value) }
+    hbaseConf
+  }
+
+  /** Creates a connection from `conf`; every call goes through `ConnectionFactory.createConnection`. */
+  def getConnection: Connection = ConnectionFactory.createConnection(conf)
+
+  /** Admin handle taken from a connection created on first access; not serialized. */
+  @transient lazy val getAdmin: Admin = getConnection.getAdmin
+}
diff --git a/hbase/src/main/scala/geotrellis/spark/io/hbase/HBaseKeyEncoder.scala b/hbase/src/main/scala/geotrellis/spark/io/hbase/HBaseKeyEncoder.scala
@@ -0,0 +1,7 @@
+package geotrellis.spark.io.hbase
+
+import geotrellis.spark._
+
+/** Builds HBase row keys for layer tiles. */
+object HBaseKeyEncoder {
+  /**
+    * Concatenates the bytes of the layer id string with the bytes of `index`.
+    * Both byte conversions rely on the implicit conversions in scope for this package.
+    */
+  def encode(id: LayerId, index: Long): Array[Byte] = {
+    val layerPrefix: Array[Byte] = s"${HBaseRDDWriter.layerIdString(id)}"
+    val indexSuffix: Array[Byte] = index
+    layerPrefix ++ indexSuffix
+  }
+}