Skip to content

Commit

Permalink
Merge branch 'release/0.8.4'
Browse files Browse the repository at this point in the history
  • Loading branch information
metasim committed Nov 11, 2019
2 parents a8a8bb7 + 0e42a35 commit 71cead3
Show file tree
Hide file tree
Showing 74 changed files with 1,466 additions and 2,220 deletions.
14 changes: 6 additions & 8 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
sudo: false
dist: xenial
language: python

Expand Down Expand Up @@ -28,11 +27,10 @@ install:
- pip install rasterio shapely pandas numpy pweave
- wget -O - https://piccolo.link/sbt-1.2.8.tgz | tar xzf -

script:
- sbt/bin/sbt -java-home $JAVA_HOME -batch test
- sbt/bin/sbt -java-home $JAVA_HOME -batch it:test
# - sbt -Dfile.encoding=UTF8 clean coverage test coverageReport
# Tricks to avoid unnecessary cache updates
- find $HOME/.sbt -name "*.lock" | xargs rm
- find $HOME/.ivy2 -name "ivydata-*.properties" | xargs rm

jobs:
include:
- stage: "Unit Tests"
script: sbt/bin/sbt -java-home $JAVA_HOME -batch test
- stage: "Integration Tests"
script: sbt/bin/sbt -java-home $JAVA_HOME -batch it:test
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
<img src="docs/src/main/paradox/_template/images/RasterFramesLogo.png" width="300px"/><sup style="vertical-align: top;">&reg;</sup>
<img src="docs/src/main/paradox/_template/assets/images/RasterFramesLogo.png" width="300px"/><sup style="vertical-align: top;">&reg;</sup>

[![Join the chat at https://gitter.im/locationtech/rasterframes](https://badges.gitter.im/locationtech/rasterframes.svg)](https://gitter.im/locationtech/rasterframes?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

RasterFrames® brings together Earth-observation (EO) data access, cloud computing, and DataFrame-based data science. The recent explosion of EO data from public and private satellite operators presents both a huge opportunity as well as a challenge to the data analysis community. It is _Big Data_ in the truest sense, and its footprint is rapidly getting bigger.

RasterFrames provides a DataFrame-centric view over arbitrary raster data, enabling spatiotemporal queries, map algebra raster operations, and compatibility with the ecosystem of Spark ML algorithms. By using DataFrames as the core cognitive and compute data model, it is able to deliver these features in a form that is both accessible to general analysts and scalable along with the rapidly growing data footprint.

<img src="docs/src/main/paradox/RasterFramePipeline.png" width="600px"/>
<img src="pyrasterframes/src/main/python/docs/static/rasterframes-pipeline-nologo.png" width="600px"/>

Please see the [Getting Started](http://rasterframes.io/getting-started.html) section of the Users' Manual to start using RasterFrames.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,11 @@ package org.locationtech.rasterframes.bench

import java.util.concurrent.TimeUnit

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.BoundReference
import org.apache.spark.sql.rf.TileUDT
import org.locationtech.rasterframes._
import org.locationtech.rasterframes.expressions.generators.ExplodeTiles
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.openjdk.jmh.annotations._

/**
*
* @author sfitch
Expand All @@ -37,32 +36,33 @@ import org.openjdk.jmh.annotations._
@State(Scope.Benchmark)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
class TileExplodeBench extends SparkEnv {
import spark.implicits._

//@Param(Array("uint8", "uint16ud255", "float32", "float64"))
@Param(Array("uint16ud255"))
@Param(Array("uint8", "uint16ud255", "float32", "float64"))
var cellTypeName: String = _

@Param(Array("256"))
var tileSize: Int = _

@Param(Array("2000"))
@Param(Array("100"))
var numTiles: Int = _

@transient
var tiles: Array[InternalRow] = _

var exploder: ExplodeTiles = _
var tiles: DataFrame = _

@Setup(Level.Trial)
def setupData(): Unit = {
tiles = Array.fill(numTiles)(randomTile(tileSize, tileSize, cellTypeName))
.map(t => InternalRow(TileUDT.tileSerializer.toInternalRow(t)))
val expr = BoundReference(0, TileType, true)
exploder = new ExplodeTiles(1.0, None, Seq(expr))
tiles = Seq.fill(numTiles)(randomTile(tileSize, tileSize, cellTypeName))
.toDF("tile").repartition(10)
}

/** Benchmark: explode tiles by first converting each tile to an array-of-doubles column,
  * then applying Spark's built-in `posexplode`. Serves as the baseline to compare against
  * the native `rf_explode_tiles` benchmark. */
@Benchmark
def arrayExplode() = {
// count() forces full evaluation of the exploded rows so the whole pipeline is measured.
tiles.select(posexplode(rf_tile_to_array_double($"tile"))).count()
}

@Benchmark
def tileExplode() = {
for(t <- tiles)
exploder.eval(t)
tiles.select(rf_explode_tiles($"tile")).count()
}
}
5 changes: 4 additions & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,10 @@ lazy val root = project
.withId("RasterFrames")
.aggregate(core, datasource, pyrasterframes, experimental)
.enablePlugins(RFReleasePlugin)
.settings(publish / skip := true)
.settings(
publish / skip := true,
clean := clean.dependsOn(`rf-notebook`/clean).value
)

lazy val `rf-notebook` = project
.dependsOn(pyrasterframes)
Expand Down
63 changes: 29 additions & 34 deletions build/circleci/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,45 +6,40 @@ ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/

# most of these libraries required for
# python-pip pandoc && pip install setuptools => required for pyrasterframes testing
RUN sudo apt-get update && \
RUN \
sudo apt-get update && \
sudo apt remove \
python python-minimal python2.7 python2.7-minimal \
libpython-stdlib libpython2.7 libpython2.7-minimal libpython2.7-stdlib \
&& sudo apt-get install -y \
pandoc \
wget \
gcc g++ build-essential \
&& \
sudo apt-get install -y \
pandoc wget \
gcc g++ build-essential bash-completion cmake imagemagick \
libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \
libcurl4-gnutls-dev \
libproj-dev \
libgeos-dev \
libhdf4-alt-dev \
bash-completion \
cmake \
imagemagick \
libpng-dev \
libffi-dev \
&& sudo apt autoremove \
&& sudo apt-get clean all
# && sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 1
# todo s
liblzma-dev libcurl4-gnutls-dev libproj-dev libgeos-dev libhdf4-alt-dev libpng-dev libffi-dev \
&& \
sudo apt autoremove && \
sudo apt-get clean all

RUN cd /tmp && \
wget https://www.python.org/ftp/python/3.7.4/Python-3.7.4.tgz && \
tar xzf Python-3.7.4.tgz && \
cd Python-3.7.4 && \
./configure --with-ensurepip=install --prefix=/usr/local --enable-optimization && \
make && \
sudo make altinstall && \
rm -rf Python-3.7.4*
RUN \
cd /tmp && \
wget https://www.python.org/ftp/python/3.7.4/Python-3.7.4.tgz && \
tar xzf Python-3.7.4.tgz && \
cd Python-3.7.4 && \
./configure --with-ensurepip=install --prefix=/usr/local --enable-optimization && \
make && \
sudo make altinstall && \
rm -rf Python-3.7.4*

RUN sudo ln -s /usr/local/bin/python3.7 /usr/local/bin/python && \
sudo curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
sudo python get-pip.py && \
sudo pip3 install setuptools ipython==6.2.1
RUN \
sudo ln -s /usr/local/bin/python3.7 /usr/local/bin/python && \
sudo curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
sudo python get-pip.py && \
sudo pip3 install setuptools ipython==6.2.1

# install OpenJPEG
RUN cd /tmp && \
RUN \
cd /tmp && \
wget https://github.com/uclouvain/openjpeg/archive/v${OPENJPEG_VERSION}.tar.gz && \
tar -xf v${OPENJPEG_VERSION}.tar.gz && \
cd openjpeg-${OPENJPEG_VERSION}/ && \
Expand All @@ -56,7 +51,8 @@ RUN cd /tmp && \
cd /tmp && rm -Rf v${OPENJPEG_VERSION}.tar.gz openjpeg*

# Compile and install GDAL with Java bindings
RUN cd /tmp && \
RUN \
cd /tmp && \
wget http://download.osgeo.org/gdal/${GDAL_VERSION}/gdal-${GDAL_VERSION}.tar.gz && \
tar -xf gdal-${GDAL_VERSION}.tar.gz && \
cd gdal-${GDAL_VERSION} && \
Expand All @@ -73,8 +69,7 @@ RUN cd /tmp && \
--with-threads \
--without-jp2mrsid \
--without-netcdf \
--without-ecw \
&& \
--without-ecw && \
make -j 8 && \
sudo make install && \
sudo ldconfig && \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,22 @@ trait RasterFunctions {
/** Extracts the bounding box from a RasterSource or ProjectedRasterTile */
def rf_extent(col: Column): TypedColumn[Any, Extent] = GetExtent(col)

/** Constructs a XZ2 index in WGS84 from either a Geometry, Extent, ProjectedRasterTile, or RasterSource
  * (`targetExtent`) and its CRS (`targetCRS`), at the caller-supplied `indexResolution`.
  * For details: https://www.geomesa.org/documentation/user/datastores/index_overview.html */
def rf_spatial_index(targetExtent: Column, targetCRS: Column, indexResolution: Short) = XZ2Indexer(targetExtent, targetCRS, indexResolution)

/** Constructs a XZ2 index in WGS84 from either a Geometry, Extent, ProjectedRasterTile, or RasterSource and its CRS,
  * using the default index resolution of 18.
  * For details: https://www.geomesa.org/documentation/user/datastores/index_overview.html */
def rf_spatial_index(targetExtent: Column, targetCRS: Column) = XZ2Indexer(targetExtent, targetCRS, 18: Short)

/** Constructs a XZ2 index in WGS84 from either a ProjectedRasterTile or RasterSource (which carry their own CRS),
  * at the caller-supplied `indexResolution`.
  * For details: https://www.geomesa.org/documentation/user/datastores/index_overview.html */
def rf_spatial_index(targetExtent: Column, indexResolution: Short) = XZ2Indexer(targetExtent, indexResolution)

/** Constructs a XZ2 index with level 18 resolution in WGS84 from either a ProjectedRasterTile or RasterSource.
  * For details: https://www.geomesa.org/documentation/user/datastores/index_overview.html */
def rf_spatial_index(targetExtent: Column) = XZ2Indexer(targetExtent, 18: Short)

/** Extracts the CRS from a RasterSource or ProjectedRasterTile */
def rf_crs(col: Column): TypedColumn[Any, CRS] = GetCRS(col)

Expand Down Expand Up @@ -276,12 +292,38 @@ trait RasterFunctions {
}

/** Where the rf_mask tile contains NODATA, replace values in the source tile with NODATA */
def rf_mask(sourceTile: Column, maskTile: Column): TypedColumn[Any, Tile] =
Mask.MaskByDefined(sourceTile, maskTile)
def rf_mask(sourceTile: Column, maskTile: Column): TypedColumn[Any, Tile] = rf_mask(sourceTile, maskTile, false)

/** Where the rf_mask tile contains NODATA, replace values in the source tile with NODATA */
def rf_mask(sourceTile: Column, maskTile: Column, inverse: Boolean=false): TypedColumn[Any, Tile] =
if(!inverse) Mask.MaskByDefined(sourceTile, maskTile)
else Mask.InverseMaskByDefined(sourceTile, maskTile)

/** Where the `maskTile` equals `maskValue`, replace values in the source tile with `NoData` */
def rf_mask_by_value(sourceTile: Column, maskTile: Column, maskValue: Column, inverse: Boolean=false): TypedColumn[Any, Tile] =
if (!inverse) Mask.MaskByValue(sourceTile, maskTile, maskValue)
else Mask.InverseMaskByValue(sourceTile, maskTile, maskValue)

/** Where the `maskTile` equals `maskValue`, replace values in the source tile with `NoData` */
def rf_mask_by_value(sourceTile: Column, maskTile: Column, maskValue: Column): TypedColumn[Any, Tile] =
Mask.MaskByValue(sourceTile, maskTile, maskValue)
def rf_mask_by_value(sourceTile: Column, maskTile: Column, maskValue: Int, inverse: Boolean): TypedColumn[Any, Tile] =
rf_mask_by_value(sourceTile, maskTile, lit(maskValue), inverse)

/** Where the `maskTile` equals `maskValue`, replace values in the source tile with `NoData` */
def rf_mask_by_value(sourceTile: Column, maskTile: Column, maskValue: Int): TypedColumn[Any, Tile] =
rf_mask_by_value(sourceTile, maskTile, maskValue, false)

/** Generate a tile with the values from `data_tile`, but where cells in the `mask_tile` are in the `mask_values`
list, replace the value with NODATA. */
def rf_mask_by_values(sourceTile: Column, maskTile: Column, maskValues: Column): TypedColumn[Any, Tile] =
Mask.MaskByValues(sourceTile, maskTile, maskValues)

/** Generate a tile with the values from the source tile, but where cells in the mask tile match any
  * entry in `maskValues`, replace the value with NODATA. Convenience overload that lifts the literal
  * integers into an array column and delegates to the Column-based variant. */
def rf_mask_by_values(sourceTile: Column, maskTile: Column, maskValues: Seq[Int]): TypedColumn[Any, Tile] = {
  import org.apache.spark.sql.functions.array
  val literalValues = maskValues.map(v => lit(v))
  rf_mask_by_values(sourceTile, maskTile, array(literalValues: _*))
}

/** Where the `maskTile` does **not** contain `NoData`, replace values in the source tile with `NoData` */
def rf_inverse_mask(sourceTile: Column, maskTile: Column): TypedColumn[Any, Tile] =
Expand All @@ -291,6 +333,10 @@ trait RasterFunctions {
def rf_inverse_mask_by_value(sourceTile: Column, maskTile: Column, maskValue: Column): TypedColumn[Any, Tile] =
Mask.InverseMaskByValue(sourceTile, maskTile, maskValue)

/** Where the `maskTile` does **not** equal `maskValue`, replace values in the source tile with `NoData`.
  * Convenience overload taking a literal integer mask value; delegates to the Column-based variant. */
def rf_inverse_mask_by_value(sourceTile: Column, maskTile: Column, maskValue: Int): TypedColumn[Any, Tile] =
  rf_inverse_mask_by_value(sourceTile, maskTile, lit(maskValue))

/** Create a tile where cells in the grid defined by cols, rows, and bounds are filled with the given value. */
def rf_rasterize(geometry: Column, bounds: Column, value: Column, cols: Int, rows: Int): TypedColumn[Any, Tile] =
withTypedAlias("rf_rasterize", geometry)(
Expand Down Expand Up @@ -389,6 +435,12 @@ trait RasterFunctions {
/** Cellwise inequality comparison between a tile and a scalar. */
def rf_local_unequal[T: Numeric](tileCol: Column, value: T): Column = Unequal(tileCol, value)

/** Test if each cell value is in the provided array column (delegates to the `IsIn` expression). */
def rf_local_is_in(tileCol: Column, arrayCol: Column) = IsIn(tileCol, arrayCol)

/** Test if each cell value is in the provided literal array of ints (delegates to the `IsIn` expression). */
def rf_local_is_in(tileCol: Column, array: Array[Int]) = IsIn(tileCol, array)

/** Return a tile with ones where the input is NoData, otherwise zero */
def rf_local_no_data(tileCol: Column): Column = Undefined(tileCol)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,14 @@ package org.locationtech.rasterframes.expressions

import geotrellis.proj4.CRS
import geotrellis.raster.{CellGrid, Tile}
import geotrellis.vector.Extent
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.jts.JTSTypes
import org.apache.spark.sql.rf.{RasterSourceUDT, TileUDT}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
import org.locationtech.jts.geom.Envelope
import org.locationtech.rasterframes.encoders.CatalystSerializer._
import org.locationtech.rasterframes.model.{LazyCRS, TileContext}
import org.locationtech.rasterframes.ref.{ProjectedRasterLike, RasterRef, RasterSource}
Expand Down Expand Up @@ -94,6 +97,15 @@ object DynamicExtractors {
(v: Any) => v.asInstanceOf[InternalRow].to[CRS]
}

/** Extractor producing an `Extent` from a column value that is either a JTS Geometry
  * (via its internal envelope), a serialized `Extent` row, or a serialized JTS `Envelope` row.
  * NOTE(review): the Geometry case yields a JTS `Envelope`; it relies on an implicit
  * Envelope-to-Extent conversion being in scope (presumably from geotrellis.vector) — confirm. */
lazy val extentLikeExtractor: PartialFunction[DataType, Any => Extent] = {
  case t if org.apache.spark.sql.rf.WithTypeConformity(t).conformsTo(JTSTypes.GeometryTypeInstance) =>
    (input: Any) => JTSTypes.GeometryTypeInstance.deserialize(input).getEnvelopeInternal
  case t if t.conformsTo[Extent] =>
    (input: Any) => input.asInstanceOf[InternalRow].to[Extent]
  case t if t.conformsTo[Envelope] =>
    (input: Any) => Extent(input.asInstanceOf[InternalRow].to[Envelope])
}

sealed trait TileOrNumberArg
sealed trait NumberArg extends TileOrNumberArg
case class TileArg(tile: Tile, ctx: Option[TileContext]) extends TileOrNumberArg
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@ object CellStatsAggregate {
import org.locationtech.rasterframes.encoders.StandardEncoders.cellStatsEncoder

def apply(col: Column): TypedColumn[Any, CellStatistics] =
new Column(new CellStatsAggregateUDAF(col.expr))
.as(s"rf_agg_stats($col)") // node renaming in class doesn't seem to propogate
new CellStatsAggregate()(ExtractTile(col))
.as(s"rf_agg_stats($col)")
.as[CellStatistics]

/** Adapter hack to allow UserDefinedAggregateFunction to be referenced as an expression. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ object HistogramAggregate {
import org.locationtech.rasterframes.encoders.StandardEncoders.cellHistEncoder

def apply(col: Column): TypedColumn[Any, CellHistogram] =
new Column(new HistogramAggregateUDAF(col.expr))
.as(s"rf_agg_approx_histogram($col)") // node renaming in class doesn't seem to propogate
new HistogramAggregate()(ExtractTile(col))
.as(s"rf_agg_approx_histogram($col)")
.as[CellHistogram]

/** Adapter hack to allow UserDefinedAggregateFunction to be referenced as an expression. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ object LocalCountAggregate {
object LocalDataCellsUDAF {
def apply(child: Expression): LocalDataCellsUDAF = new LocalDataCellsUDAF(child)
def apply(tile: Column): TypedColumn[Any, Tile] =
new Column(new LocalDataCellsUDAF(tile.expr))
new LocalCountAggregate(true)(ExtractTile(tile))
.as(s"rf_agg_local_data_cells($tile)")
.as[Tile]
}
Expand All @@ -107,7 +107,7 @@ object LocalCountAggregate {
object LocalNoDataCellsUDAF {
def apply(child: Expression): LocalNoDataCellsUDAF = new LocalNoDataCellsUDAF(child)
def apply(tile: Column): TypedColumn[Any, Tile] =
new Column(new LocalNoDataCellsUDAF(tile.expr))
new LocalCountAggregate(false)(ExtractTile(tile))
.as(s"rf_agg_local_no_data_cells($tile)")
.as[Tile]
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ class LocalStatsAggregate() extends UserDefinedAggregateFunction {
object LocalStatsAggregate {

def apply(col: Column): TypedColumn[Any, LocalCellStatistics] =
new Column(LocalStatsAggregateUDAF(col.expr))
new LocalStatsAggregate()(ExtractTile(col))
.as(s"rf_agg_local_stats($col)")
.as[LocalCellStatistics]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,10 @@ object LocalTileOpAggregate {
}
object LocalMinUDAF {
def apply(child: Expression): LocalMinUDAF = new LocalMinUDAF(child)
def apply(tile: Column): TypedColumn[Any, Tile] = new Column(new LocalMinUDAF(tile.expr)).as[Tile]
def apply(tile: Column): TypedColumn[Any, Tile] =
new LocalTileOpAggregate(BiasedMin)(ExtractTile(tile))
.as(s"rf_agg_local_min($tile)")
.as[Tile]
}

@ExpressionDescription(
Expand All @@ -95,6 +98,9 @@ object LocalTileOpAggregate {
}
object LocalMaxUDAF {
def apply(child: Expression): LocalMaxUDAF = new LocalMaxUDAF(child)
def apply(tile: Column): TypedColumn[Any, Tile] = new Column(new LocalMaxUDAF(tile.expr)).as[Tile]
def apply(tile: Column): TypedColumn[Any, Tile] =
new LocalTileOpAggregate(BiasedMax)(ExtractTile(tile))
.as(s"rf_agg_local_max($tile)")
.as[Tile]
}
}
Loading

0 comments on commit 71cead3

Please sign in to comment.