diff --git a/.travis.yml b/.travis.yml index 12cad75b7..9b6f44ea2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,3 @@ -sudo: false dist: xenial language: python @@ -28,11 +27,10 @@ install: - pip install rasterio shapely pandas numpy pweave - wget -O - https://piccolo.link/sbt-1.2.8.tgz | tar xzf - -script: - - sbt/bin/sbt -java-home $JAVA_HOME -batch test - - sbt/bin/sbt -java-home $JAVA_HOME -batch it:test - # - sbt -Dfile.encoding=UTF8 clean coverage test coverageReport - # Tricks to avoid unnecessary cache updates - - find $HOME/.sbt -name "*.lock" | xargs rm - - find $HOME/.ivy2 -name "ivydata-*.properties" | xargs rm +jobs: + include: + - stage: "Unit Tests" + script: sbt/bin/sbt -java-home $JAVA_HOME -batch test + - stage: "Integration Tests" + script: sbt/bin/sbt -java-home $JAVA_HOME -batch it:test diff --git a/README.md b/README.md index 2b3bcb43f..ac1cc786b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -® +® [![Join the chat at https://gitter.im/locationtech/rasterframes](https://badges.gitter.im/locationtech/rasterframes.svg)](https://gitter.im/locationtech/rasterframes?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) @@ -6,7 +6,7 @@ RasterFrames® brings together Earth-observation (EO) data access, cloud computi RasterFrames provides a DataFrame-centric view over arbitrary raster data, enabling spatiotemporal queries, map algebra raster operations, and compatibility with the ecosystem of Spark ML algorithms. By using DataFrames as the core cognitive and compute data model, it is able to deliver these features in a form that is both accessible to general analysts and scalable along with the rapidly growing data footprint. - + Please see the [Getting Started](http://rasterframes.io/getting-started.html) section of the Users' Manual to start using RasterFrames. 
diff --git a/bench/src/main/scala/org/locationtech/rasterframes/bench/TileExplodeBench.scala b/bench/src/main/scala/org/locationtech/rasterframes/bench/TileExplodeBench.scala index 4ece4cc98..7f3352f69 100644 --- a/bench/src/main/scala/org/locationtech/rasterframes/bench/TileExplodeBench.scala +++ b/bench/src/main/scala/org/locationtech/rasterframes/bench/TileExplodeBench.scala @@ -22,12 +22,11 @@ package org.locationtech.rasterframes.bench import java.util.concurrent.TimeUnit -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.BoundReference -import org.apache.spark.sql.rf.TileUDT import org.locationtech.rasterframes._ -import org.locationtech.rasterframes.expressions.generators.ExplodeTiles +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ import org.openjdk.jmh.annotations._ + /** * * @author sfitch @@ -37,32 +36,33 @@ import org.openjdk.jmh.annotations._ @State(Scope.Benchmark) @OutputTimeUnit(TimeUnit.MILLISECONDS) class TileExplodeBench extends SparkEnv { + import spark.implicits._ - //@Param(Array("uint8", "uint16ud255", "float32", "float64")) - @Param(Array("uint16ud255")) + @Param(Array("uint8", "uint16ud255", "float32", "float64")) var cellTypeName: String = _ @Param(Array("256")) var tileSize: Int = _ - @Param(Array("2000")) + @Param(Array("100")) var numTiles: Int = _ @transient - var tiles: Array[InternalRow] = _ - - var exploder: ExplodeTiles = _ + var tiles: DataFrame = _ @Setup(Level.Trial) def setupData(): Unit = { - tiles = Array.fill(numTiles)(randomTile(tileSize, tileSize, cellTypeName)) - .map(t => InternalRow(TileUDT.tileSerializer.toInternalRow(t))) - val expr = BoundReference(0, TileType, true) - exploder = new ExplodeTiles(1.0, None, Seq(expr)) + tiles = Seq.fill(numTiles)(randomTile(tileSize, tileSize, cellTypeName)) + .toDF("tile").repartition(10) + } + + @Benchmark + def arrayExplode() = { + tiles.select(posexplode(rf_tile_to_array_double($"tile"))).count() } + 
@Benchmark def tileExplode() = { - for(t <- tiles) - exploder.eval(t) + tiles.select(rf_explode_tiles($"tile")).count() } } diff --git a/build.sbt b/build.sbt index fa42d1192..f941ea060 100644 --- a/build.sbt +++ b/build.sbt @@ -32,7 +32,10 @@ lazy val root = project .withId("RasterFrames") .aggregate(core, datasource, pyrasterframes, experimental) .enablePlugins(RFReleasePlugin) - .settings(publish / skip := true) + .settings( + publish / skip := true, + clean := clean.dependsOn(`rf-notebook`/clean).value + ) lazy val `rf-notebook` = project .dependsOn(pyrasterframes) diff --git a/build/circleci/Dockerfile b/build/circleci/Dockerfile index a2356f7b6..4ea664a52 100644 --- a/build/circleci/Dockerfile +++ b/build/circleci/Dockerfile @@ -6,45 +6,40 @@ ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/ # most of these libraries required for # python-pip pandoc && pip install setuptools => required for pyrasterframes testing -RUN sudo apt-get update && \ +RUN \ + sudo apt-get update && \ sudo apt remove \ python python-minimal python2.7 python2.7-minimal \ libpython-stdlib libpython2.7 libpython2.7-minimal libpython2.7-stdlib \ - && sudo apt-get install -y \ - pandoc \ - wget \ - gcc g++ build-essential \ + && \ + sudo apt-get install -y \ + pandoc wget \ + gcc g++ build-essential bash-completion cmake imagemagick \ libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ - libcurl4-gnutls-dev \ - libproj-dev \ - libgeos-dev \ - libhdf4-alt-dev \ - bash-completion \ - cmake \ - imagemagick \ - libpng-dev \ - libffi-dev \ - && sudo apt autoremove \ - && sudo apt-get clean all -# && sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 1 -# todo s + liblzma-dev libcurl4-gnutls-dev libproj-dev libgeos-dev libhdf4-alt-dev libpng-dev libffi-dev \ + && \ + sudo apt autoremove && \ + sudo apt-get clean all -RUN cd /tmp && \ - wget https://www.python.org/ftp/python/3.7.4/Python-3.7.4.tgz && \ - tar xzf 
Python-3.7.4.tgz && \ - cd Python-3.7.4 && \ - ./configure --with-ensurepip=install --prefix=/usr/local --enable-optimization && \ - make && \ - sudo make altinstall && \ - rm -rf Python-3.7.4* +RUN \ + cd /tmp && \ + wget https://www.python.org/ftp/python/3.7.4/Python-3.7.4.tgz && \ + tar xzf Python-3.7.4.tgz && \ + cd Python-3.7.4 && \ + ./configure --with-ensurepip=install --prefix=/usr/local --enable-optimization && \ + make && \ + sudo make altinstall && \ + rm -rf Python-3.7.4* -RUN sudo ln -s /usr/local/bin/python3.7 /usr/local/bin/python && \ - sudo curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ - sudo python get-pip.py && \ - sudo pip3 install setuptools ipython==6.2.1 +RUN \ + sudo ln -s /usr/local/bin/python3.7 /usr/local/bin/python && \ + sudo curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ + sudo python get-pip.py && \ + sudo pip3 install setuptools ipython==6.2.1 # install OpenJPEG -RUN cd /tmp && \ +RUN \ + cd /tmp && \ wget https://github.com/uclouvain/openjpeg/archive/v${OPENJPEG_VERSION}.tar.gz && \ tar -xf v${OPENJPEG_VERSION}.tar.gz && \ cd openjpeg-${OPENJPEG_VERSION}/ && \ @@ -56,7 +51,8 @@ RUN cd /tmp && \ cd /tmp && rm -Rf v${OPENJPEG_VERSION}.tar.gz openjpeg* # Compile and install GDAL with Java bindings -RUN cd /tmp && \ +RUN \ + cd /tmp && \ wget http://download.osgeo.org/gdal/${GDAL_VERSION}/gdal-${GDAL_VERSION}.tar.gz && \ tar -xf gdal-${GDAL_VERSION}.tar.gz && \ cd gdal-${GDAL_VERSION} && \ @@ -73,8 +69,7 @@ RUN cd /tmp && \ --with-threads \ --without-jp2mrsid \ --without-netcdf \ - --without-ecw \ - && \ + --without-ecw && \ make -j 8 && \ sudo make install && \ sudo ldconfig && \ diff --git a/core/src/main/scala/org/locationtech/rasterframes/RasterFunctions.scala b/core/src/main/scala/org/locationtech/rasterframes/RasterFunctions.scala index 213f0f77d..94dcef333 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/RasterFunctions.scala +++ 
b/core/src/main/scala/org/locationtech/rasterframes/RasterFunctions.scala @@ -59,6 +59,22 @@ trait RasterFunctions { /** Extracts the bounding box from a RasterSource or ProjectedRasterTile */ def rf_extent(col: Column): TypedColumn[Any, Extent] = GetExtent(col) + /** Constructs a XZ2 index in WGS84 from either a Geometry, Extent, ProjectedRasterTile, or RasterSource and its CRS + * For details: https://www.geomesa.org/documentation/user/datastores/index_overview.html */ + def rf_spatial_index(targetExtent: Column, targetCRS: Column, indexResolution: Short) = XZ2Indexer(targetExtent, targetCRS, indexResolution) + + /** Constructs a XZ2 index in WGS84 from either a Geometry, Extent, ProjectedRasterTile, or RasterSource and its CRS + * For details: https://www.geomesa.org/documentation/user/datastores/index_overview.html */ + def rf_spatial_index(targetExtent: Column, targetCRS: Column) = XZ2Indexer(targetExtent, targetCRS, 18: Short) + + /** Constructs a XZ2 index with level 18 resolution in WGS84 from either a ProjectedRasterTile or RasterSource + * For details: https://www.geomesa.org/documentation/user/datastores/index_overview.html */ + def rf_spatial_index(targetExtent: Column, indexResolution: Short) = XZ2Indexer(targetExtent, indexResolution) + + /** Constructs a XZ2 index with level 18 resolution in WGS84 from either a ProjectedRasterTile or RasterSource + * For details: https://www.geomesa.org/documentation/user/datastores/index_overview.html */ + def rf_spatial_index(targetExtent: Column) = XZ2Indexer(targetExtent, 18: Short) + /** Extracts the CRS from a RasterSource or ProjectedRasterTile */ def rf_crs(col: Column): TypedColumn[Any, CRS] = GetCRS(col) @@ -276,12 +292,38 @@ trait RasterFunctions { } /** Where the rf_mask tile contains NODATA, replace values in the source tile with NODATA */ - def rf_mask(sourceTile: Column, maskTile: Column): TypedColumn[Any, Tile] = - Mask.MaskByDefined(sourceTile, maskTile) + def rf_mask(sourceTile: Column, maskTile: 
Column): TypedColumn[Any, Tile] = rf_mask(sourceTile, maskTile, false) + + /** Where the rf_mask tile contains NODATA, replace values in the source tile with NODATA */ + def rf_mask(sourceTile: Column, maskTile: Column, inverse: Boolean=false): TypedColumn[Any, Tile] = + if(!inverse) Mask.MaskByDefined(sourceTile, maskTile) + else Mask.InverseMaskByDefined(sourceTile, maskTile) + + /** Where the `maskTile` equals `maskValue`, replace values in the source tile with `NoData` */ + def rf_mask_by_value(sourceTile: Column, maskTile: Column, maskValue: Column, inverse: Boolean=false): TypedColumn[Any, Tile] = + if (!inverse) Mask.MaskByValue(sourceTile, maskTile, maskValue) + else Mask.InverseMaskByValue(sourceTile, maskTile, maskValue) /** Where the `maskTile` equals `maskValue`, replace values in the source tile with `NoData` */ - def rf_mask_by_value(sourceTile: Column, maskTile: Column, maskValue: Column): TypedColumn[Any, Tile] = - Mask.MaskByValue(sourceTile, maskTile, maskValue) + def rf_mask_by_value(sourceTile: Column, maskTile: Column, maskValue: Int, inverse: Boolean): TypedColumn[Any, Tile] = + rf_mask_by_value(sourceTile, maskTile, lit(maskValue), inverse) + + /** Where the `maskTile` equals `maskValue`, replace values in the source tile with `NoData` */ + def rf_mask_by_value(sourceTile: Column, maskTile: Column, maskValue: Int): TypedColumn[Any, Tile] = + rf_mask_by_value(sourceTile, maskTile, maskValue, false) + + /** Generate a tile with the values from `data_tile`, but where cells in the `mask_tile` are in the `mask_values` + list, replace the value with NODATA. */ + def rf_mask_by_values(sourceTile: Column, maskTile: Column, maskValues: Column): TypedColumn[Any, Tile] = + Mask.MaskByValues(sourceTile, maskTile, maskValues) + + /** Generate a tile with the values from `data_tile`, but where cells in the `mask_tile` are in the `mask_values` + list, replace the value with NODATA. 
*/ + def rf_mask_by_values(sourceTile: Column, maskTile: Column, maskValues: Seq[Int]): TypedColumn[Any, Tile] = { + import org.apache.spark.sql.functions.array + val valuesCol: Column = array(maskValues.map(lit).toSeq: _*) + rf_mask_by_values(sourceTile, maskTile, valuesCol) + } /** Where the `maskTile` does **not** contain `NoData`, replace values in the source tile with `NoData` */ def rf_inverse_mask(sourceTile: Column, maskTile: Column): TypedColumn[Any, Tile] = @@ -291,6 +333,10 @@ trait RasterFunctions { def rf_inverse_mask_by_value(sourceTile: Column, maskTile: Column, maskValue: Column): TypedColumn[Any, Tile] = Mask.InverseMaskByValue(sourceTile, maskTile, maskValue) + /** Where the `maskTile` does **not** equal `maskValue`, replace values in the source tile with `NoData` */ + def rf_inverse_mask_by_value(sourceTile: Column, maskTile: Column, maskValue: Int): TypedColumn[Any, Tile] = + Mask.InverseMaskByValue(sourceTile, maskTile, lit(maskValue)) + /** Create a tile where cells in the grid defined by cols, rows, and bounds are filled with the given value. */ def rf_rasterize(geometry: Column, bounds: Column, value: Column, cols: Int, rows: Int): TypedColumn[Any, Tile] = withTypedAlias("rf_rasterize", geometry)( @@ -389,6 +435,12 @@ trait RasterFunctions { /** Cellwise inequality comparison between a tile and a scalar. 
*/ def rf_local_unequal[T: Numeric](tileCol: Column, value: T): Column = Unequal(tileCol, value) + /** Test if each cell value is in provided array */ + def rf_local_is_in(tileCol: Column, arrayCol: Column) = IsIn(tileCol, arrayCol) + + /** Test if each cell value is in provided array */ + def rf_local_is_in(tileCol: Column, array: Array[Int]) = IsIn(tileCol, array) + /** Return a tile with ones where the input is NoData, otherwise zero */ def rf_local_no_data(tileCol: Column): Column = Undefined(tileCol) diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/DynamicExtractors.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/DynamicExtractors.scala index 834c3aac1..e72f158aa 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/DynamicExtractors.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/DynamicExtractors.scala @@ -23,11 +23,14 @@ package org.locationtech.rasterframes.expressions import geotrellis.proj4.CRS import geotrellis.raster.{CellGrid, Tile} +import geotrellis.vector.Extent import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.jts.JTSTypes import org.apache.spark.sql.rf.{RasterSourceUDT, TileUDT} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String +import org.locationtech.jts.geom.Envelope import org.locationtech.rasterframes.encoders.CatalystSerializer._ import org.locationtech.rasterframes.model.{LazyCRS, TileContext} import org.locationtech.rasterframes.ref.{ProjectedRasterLike, RasterRef, RasterSource} @@ -94,6 +97,15 @@ object DynamicExtractors { (v: Any) => v.asInstanceOf[InternalRow].to[CRS] } + lazy val extentLikeExtractor: PartialFunction[DataType, Any ⇒ Extent] = { + case t if org.apache.spark.sql.rf.WithTypeConformity(t).conformsTo(JTSTypes.GeometryTypeInstance) => + (input: Any) => JTSTypes.GeometryTypeInstance.deserialize(input).getEnvelopeInternal + case t if 
t.conformsTo[Extent] => + (input: Any) => input.asInstanceOf[InternalRow].to[Extent] + case t if t.conformsTo[Envelope] => + (input: Any) => Extent(input.asInstanceOf[InternalRow].to[Envelope]) + } + sealed trait TileOrNumberArg sealed trait NumberArg extends TileOrNumberArg case class TileArg(tile: Tile, ctx: Option[TileContext]) extends TileOrNumberArg diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/CellStatsAggregate.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/CellStatsAggregate.scala index 95c0bd837..c9acf4ed4 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/CellStatsAggregate.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/CellStatsAggregate.scala @@ -123,8 +123,8 @@ object CellStatsAggregate { import org.locationtech.rasterframes.encoders.StandardEncoders.cellStatsEncoder def apply(col: Column): TypedColumn[Any, CellStatistics] = - new Column(new CellStatsAggregateUDAF(col.expr)) - .as(s"rf_agg_stats($col)") // node renaming in class doesn't seem to propogate + new CellStatsAggregate()(ExtractTile(col)) + .as(s"rf_agg_stats($col)") .as[CellStatistics] /** Adapter hack to allow UserDefinedAggregateFunction to be referenced as an expression. 
*/ diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/HistogramAggregate.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/HistogramAggregate.scala index 44cc1324b..5f7483b0c 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/HistogramAggregate.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/HistogramAggregate.scala @@ -98,8 +98,8 @@ object HistogramAggregate { import org.locationtech.rasterframes.encoders.StandardEncoders.cellHistEncoder def apply(col: Column): TypedColumn[Any, CellHistogram] = - new Column(new HistogramAggregateUDAF(col.expr)) - .as(s"rf_agg_approx_histogram($col)") // node renaming in class doesn't seem to propogate + new HistogramAggregate()(ExtractTile(col)) + .as(s"rf_agg_approx_histogram($col)") .as[CellHistogram] /** Adapter hack to allow UserDefinedAggregateFunction to be referenced as an expression. */ diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalCountAggregate.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalCountAggregate.scala index 256cd63dd..2fd65700d 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalCountAggregate.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalCountAggregate.scala @@ -92,7 +92,7 @@ object LocalCountAggregate { object LocalDataCellsUDAF { def apply(child: Expression): LocalDataCellsUDAF = new LocalDataCellsUDAF(child) def apply(tile: Column): TypedColumn[Any, Tile] = - new Column(new LocalDataCellsUDAF(tile.expr)) + new LocalCountAggregate(true)(ExtractTile(tile)) .as(s"rf_agg_local_data_cells($tile)") .as[Tile] } @@ -107,7 +107,7 @@ object LocalCountAggregate { object LocalNoDataCellsUDAF { def apply(child: Expression): LocalNoDataCellsUDAF = new LocalNoDataCellsUDAF(child) def apply(tile: Column): TypedColumn[Any, 
Tile] = - new Column(new LocalNoDataCellsUDAF(tile.expr)) + new LocalCountAggregate(false)(ExtractTile(tile)) .as(s"rf_agg_local_no_data_cells($tile)") .as[Tile] } diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalStatsAggregate.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalStatsAggregate.scala index 86b360dea..080579633 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalStatsAggregate.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalStatsAggregate.scala @@ -146,7 +146,7 @@ class LocalStatsAggregate() extends UserDefinedAggregateFunction { object LocalStatsAggregate { def apply(col: Column): TypedColumn[Any, LocalCellStatistics] = - new Column(LocalStatsAggregateUDAF(col.expr)) + new LocalStatsAggregate()(ExtractTile(col)) .as(s"rf_agg_local_stats($col)") .as[LocalCellStatistics] diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalTileOpAggregate.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalTileOpAggregate.scala index b739961c1..bd48f3981 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalTileOpAggregate.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalTileOpAggregate.scala @@ -83,7 +83,10 @@ object LocalTileOpAggregate { } object LocalMinUDAF { def apply(child: Expression): LocalMinUDAF = new LocalMinUDAF(child) - def apply(tile: Column): TypedColumn[Any, Tile] = new Column(new LocalMinUDAF(tile.expr)).as[Tile] + def apply(tile: Column): TypedColumn[Any, Tile] = + new LocalTileOpAggregate(BiasedMin)(ExtractTile(tile)) + .as(s"rf_agg_local_min($tile)") + .as[Tile] } @ExpressionDescription( @@ -95,6 +98,9 @@ object LocalTileOpAggregate { } object LocalMaxUDAF { def apply(child: Expression): LocalMaxUDAF = new LocalMaxUDAF(child) - def apply(tile: 
Column): TypedColumn[Any, Tile] = new Column(new LocalMaxUDAF(tile.expr)).as[Tile] + def apply(tile: Column): TypedColumn[Any, Tile] = + new LocalTileOpAggregate(BiasedMax)(ExtractTile(tile)) + .as(s"rf_agg_local_max($tile)") + .as[Tile] } } diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala index 360ef93dd..6647f4258 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala @@ -138,7 +138,7 @@ object TileRasterizerAggregate { } } - // Scan table and constuct what the TileLayerMetadata would be in the specified destination CRS. + // Scan table and construct what the TileLayerMetadata would be in the specified destination CRS. val tlm: TileLayerMetadata[SpatialKey] = df .select( ProjectedLayerMetadataAggregate( diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala index 06c0c033e..2a70be585 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala @@ -24,8 +24,8 @@ package org.locationtech.rasterframes.expressions.generators import geotrellis.raster._ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, CodegenFallback, UnsafeRowWriter} -import org.apache.spark.sql.catalyst.expressions.{Expression, Generator, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, 
Generator, GenericInternalRow} import org.apache.spark.sql.types._ import org.locationtech.rasterframes._ import org.locationtech.rasterframes.expressions.DynamicExtractors @@ -87,17 +87,14 @@ case class ExplodeTiles( cfor(0)(_ < rows, _ + 1) { row => cfor(0)(_ < cols, _ + 1) { col => val rowIndex = row * cols + col - val outRow = new UnsafeRow(numOutCols) - val buffer = new BufferHolder(outRow) - val writer = new UnsafeRowWriter(buffer, numOutCols) - writer.write(0, col) - writer.write(1, row) + val outCols = Array.ofDim[Any](numOutCols) + outCols(0) = col + outCols(1) = row cfor(0)(_ < tiles.length, _ + 1) { index => val tile = tiles(index) - val cell: Double = if (tile == null) doubleNODATA else tile.getDouble(col, row) - writer.write(index + 2, cell) + outCols(index + 2) = if(tile == null) doubleNODATA else tile.getDouble(col, row) } - retval(rowIndex) = outRow + retval(rowIndex) = new GenericInternalRow(outCols) } } if(sampleFraction > 0.0 && sampleFraction < 1.0) sample(retval) diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/localops/IsIn.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/localops/IsIn.scala new file mode 100644 index 000000000..1707aff60 --- /dev/null +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/localops/IsIn.scala @@ -0,0 +1,96 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package org.locationtech.rasterframes.expressions.localops + +import geotrellis.raster.Tile +import geotrellis.raster.mapalgebra.local.IfCell +import org.apache.spark.sql.Column +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} +import org.apache.spark.sql.types.{ArrayType, DataType} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, Expression, ExpressionDescription} +import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.rf.TileUDT +import org.locationtech.rasterframes.encoders.CatalystSerializer._ +import org.locationtech.rasterframes.expressions.DynamicExtractors._ +import org.locationtech.rasterframes.expressions._ + +@ExpressionDescription( + usage = "_FUNC_(tile, rhs) - In each cell of `tile`, return true if the value is in rhs.", + arguments = """ + Arguments: + * tile - tile column to apply abs + * rhs - array to test against + """, + examples = """ + Examples: + > SELECT _FUNC_(tile, array(lit(33), lit(66), lit(99))); + ...""" +) +case class IsIn(left: Expression, right: Expression) extends BinaryExpression with CodegenFallback { + override val nodeName: String = "rf_local_is_in" + + override def dataType: DataType = left.dataType + + @transient private lazy val elementType: DataType = right.dataType.asInstanceOf[ArrayType].elementType + + override def checkInputDataTypes(): TypeCheckResult = + if(!tileExtractor.isDefinedAt(left.dataType)) { + TypeCheckFailure(s"Input type '${left.dataType}' does not conform to a raster type.") + } else right.dataType match { + case _: ArrayType ⇒ TypeCheckSuccess + case _ ⇒ TypeCheckFailure(s"Input type '${right.dataType}' does not 
conform to ArrayType.") + } + + override protected def nullSafeEval(input1: Any, input2: Any): Any = { + implicit val tileSer = TileUDT.tileSerializer + val (childTile, childCtx) = tileExtractor(left.dataType)(row(input1)) + + val arr = input2.asInstanceOf[ArrayData].toArray[AnyRef](elementType) + + childCtx match { + case Some(ctx) => ctx.toProjectRasterTile(op(childTile, arr)).toInternalRow + case None => op(childTile, arr).toInternalRow + } + + } + + protected def op(left: Tile, right: IndexedSeq[AnyRef]): Tile = { + def fn(i: Int): Boolean = right.contains(i) + IfCell(left, fn(_), 1, 0) + } + +} + +object IsIn { + def apply(left: Column, right: Column): Column = + new Column(IsIn(left.expr, right.expr)) + + def apply(left: Column, right: Array[Int]): Column = { + import org.apache.spark.sql.functions.lit + import org.apache.spark.sql.functions.array + val arrayExpr = array(right.map(lit):_*).expr + new Column(IsIn(left.expr, arrayExpr)) + } + +} diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/package.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/package.scala index ef614a9a3..d2163f72b 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/package.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/package.scala @@ -36,7 +36,7 @@ import org.locationtech.rasterframes.expressions.tilestats._ import org.locationtech.rasterframes.expressions.transformers._ import scala.reflect.runtime.universe._ -import scala.util.Try + /** * Module of Catalyst expressions for efficiently working with tiles. 
* @@ -53,8 +53,7 @@ package object expressions { private[expressions] def udfexpr[RT: TypeTag, A1: TypeTag](name: String, f: A1 => RT): Expression => ScalaUDF = (child: Expression) => { val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputTypes = Try(ScalaReflection.schemaFor(typeTag[A1]).dataType :: Nil).toOption - ScalaUDF(f, dataType, Seq(child), inputTypes.getOrElse(Nil), nullable = nullable, udfName = Some(name)) + ScalaUDF(f, dataType, Seq(child), Seq(true), nullable = nullable, udfName = Some(name)) } def register(sqlContext: SQLContext): Unit = { @@ -87,6 +86,7 @@ package object expressions { registry.registerExpression[GreaterEqual]("rf_local_greater_equal") registry.registerExpression[Equal]("rf_local_equal") registry.registerExpression[Unequal]("rf_local_unequal") + registry.registerExpression[IsIn]("rf_local_is_in") registry.registerExpression[Undefined]("rf_local_no_data") registry.registerExpression[Defined]("rf_local_data") registry.registerExpression[Sum]("rf_tile_sum") @@ -125,15 +125,18 @@ package object expressions { registry.registerExpression[LocalMeanAggregate]("rf_agg_local_mean") registry.registerExpression[Mask.MaskByDefined]("rf_mask") + registry.registerExpression[Mask.InverseMaskByDefined]("rf_inverse_mask") registry.registerExpression[Mask.MaskByValue]("rf_mask_by_value") registry.registerExpression[Mask.InverseMaskByValue]("rf_inverse_mask_by_value") - registry.registerExpression[Mask.InverseMaskByDefined]("rf_inverse_mask") + registry.registerExpression[Mask.MaskByValues]("rf_mask_by_values") registry.registerExpression[DebugRender.RenderAscii]("rf_render_ascii") registry.registerExpression[DebugRender.RenderMatrix]("rf_render_matrix") registry.registerExpression[RenderPNG.RenderCompositePNG]("rf_render_png") registry.registerExpression[RGBComposite]("rf_rgb_composite") + registry.registerExpression[XZ2Indexer]("rf_spatial_index") + 
registry.registerExpression[transformers.ReprojectGeometry]("st_reproject") } } diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/transformers/Mask.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/transformers/Mask.scala index 69dac94c7..c6b9b75ec 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/transformers/Mask.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/transformers/Mask.scala @@ -34,52 +34,58 @@ import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Column, TypedColumn} import org.locationtech.rasterframes.encoders.CatalystSerializer._ import org.locationtech.rasterframes.expressions.DynamicExtractors._ +import org.locationtech.rasterframes.expressions.localops.IsIn import org.locationtech.rasterframes.expressions.row import org.slf4j.LoggerFactory abstract class Mask(val left: Expression, val middle: Expression, val right: Expression, inverse: Boolean) extends TernaryExpression with CodegenFallback with Serializable { + // aliases. 
+ def targetExp = left + def maskExp = middle + def maskValueExp = right @transient protected lazy val logger = Logger(LoggerFactory.getLogger(getClass.getName)) - override def children: Seq[Expression] = Seq(left, middle, right) override def checkInputDataTypes(): TypeCheckResult = { - if (!tileExtractor.isDefinedAt(left.dataType)) { - TypeCheckFailure(s"Input type '${left.dataType}' does not conform to a raster type.") - } else if (!tileExtractor.isDefinedAt(middle.dataType)) { - TypeCheckFailure(s"Input type '${middle.dataType}' does not conform to a raster type.") - } else if (!intArgExtractor.isDefinedAt(right.dataType)) { - TypeCheckFailure(s"Input type '${right.dataType}' isn't an integral type.") + if (!tileExtractor.isDefinedAt(targetExp.dataType)) { + TypeCheckFailure(s"Input type '${targetExp.dataType}' does not conform to a raster type.") + } else if (!tileExtractor.isDefinedAt(maskExp.dataType)) { + TypeCheckFailure(s"Input type '${maskExp.dataType}' does not conform to a raster type.") + } else if (!intArgExtractor.isDefinedAt(maskValueExp.dataType)) { + TypeCheckFailure(s"Input type '${maskValueExp.dataType}' isn't an integral type.") } else TypeCheckSuccess } override def dataType: DataType = left.dataType - override protected def nullSafeEval(leftInput: Any, middleInput: Any, rightInput: Any): Any = { + override def makeCopy(newArgs: Array[AnyRef]): Expression = super.makeCopy(newArgs) + + override protected def nullSafeEval(targetInput: Any, maskInput: Any, maskValueInput: Any): Any = { implicit val tileSer = TileUDT.tileSerializer - val (leftTile, leftCtx) = tileExtractor(left.dataType)(row(leftInput)) - val (rightTile, rightCtx) = tileExtractor(middle.dataType)(row(middleInput)) + val (targetTile, targetCtx) = tileExtractor(targetExp.dataType)(row(targetInput)) + val (maskTile, maskCtx) = tileExtractor(maskExp.dataType)(row(maskInput)) - if (leftCtx.isEmpty && rightCtx.isDefined) + if (targetCtx.isEmpty && maskCtx.isDefined) logger.warn( 
s"Right-hand parameter '${middle}' provided an extent and CRS, but the left-hand parameter " + s"'${left}' didn't have any. Because the left-hand side defines output type, the right-hand context will be lost.") - if (leftCtx.isDefined && rightCtx.isDefined && leftCtx != rightCtx) + if (targetCtx.isDefined && maskCtx.isDefined && targetCtx != maskCtx) logger.warn(s"Both '${left}' and '${middle}' provided an extent and CRS, but they are different. Left-hand side will be used.") - val maskValue = intArgExtractor(right.dataType)(rightInput) + val maskValue = intArgExtractor(maskValueExp.dataType)(maskValueInput) - val masking = if (maskValue.value == 0) Defined(rightTile) - else rightTile + val masking = if (maskValue.value == 0) Defined(maskTile) + else maskTile val result = if (inverse) - gtInverseMask(leftTile, masking, maskValue.value, raster.NODATA) + gtInverseMask(targetTile, masking, maskValue.value, raster.NODATA) else - gtMask(leftTile, masking, maskValue.value, raster.NODATA) + gtMask(targetTile, masking, maskValue.value, raster.NODATA) - leftCtx match { + targetCtx match { case Some(ctx) => ctx.toProjectRasterTile(result).toInternalRow case None => result.toInternalRow } @@ -169,4 +175,28 @@ object Mask { def apply(srcTile: Column, maskingTile: Column, maskValue: Column): TypedColumn[Any, Tile] = new Column(InverseMaskByValue(srcTile.expr, maskingTile.expr, maskValue.expr)).as[Tile] } + + @ExpressionDescription( + usage = "_FUNC_(data, mask, maskValues) - Generate a tile with the values from `data` tile but where cells in the `mask` tile are in the `maskValues` list, replace the value with NODATA.", + arguments = """ + Arguments: + * target - tile to mask + * mask - masking definition + * maskValues - sequence of values to consider as masks candidates + """, + examples = """ + Examples: + > SELECT _FUNC_(data, mask, array(1, 2, 3)) + ...""" + ) + case class MaskByValues(dataTile: Expression, maskTile: Expression) + extends Mask(dataTile, maskTile, 
Literal(1), inverse = false) { + def this(dataTile: Expression, maskTile: Expression, maskValues: Expression) = + this(dataTile, IsIn(maskTile, maskValues)) + override def nodeName: String = "rf_mask_by_values" + } + object MaskByValues { + def apply(dataTile: Column, maskTile: Column, maskValues: Column): TypedColumn[Any, Tile] = + new Column(MaskByValues(dataTile.expr, IsIn(maskTile, maskValues).expr)).as[Tile] + } } diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/transformers/XZ2Indexer.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/transformers/XZ2Indexer.scala new file mode 100644 index 000000000..7acbb3277 --- /dev/null +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/transformers/XZ2Indexer.scala @@ -0,0 +1,130 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package org.locationtech.rasterframes.expressions.transformers + +import geotrellis.proj4.LatLng +import geotrellis.vector.Extent +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, Expression, ExpressionDescription} +import org.apache.spark.sql.jts.JTSTypes +import org.apache.spark.sql.rf.RasterSourceUDT +import org.apache.spark.sql.types.{DataType, LongType} +import org.apache.spark.sql.{Column, TypedColumn, rf} +import org.locationtech.geomesa.curve.XZ2SFC +import org.locationtech.jts.geom.{Envelope, Geometry} +import org.locationtech.rasterframes.encoders.CatalystSerializer._ +import org.locationtech.rasterframes.expressions.DynamicExtractors._ +import org.locationtech.rasterframes.expressions.accessors.GetCRS +import org.locationtech.rasterframes.expressions.row +import org.locationtech.rasterframes.jts.ReprojectionTransformer +import org.locationtech.rasterframes.ref.{RasterRef, RasterSource} +import org.locationtech.rasterframes.tiles.ProjectedRasterTile + +/** + * Constructs a XZ2 index in WGS84 from either a Geometry, Extent, ProjectedRasterTile, or RasterSource + * This function is useful for [range partitioning](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=registerjava#pyspark.sql.DataFrame.repartitionByRange). + * Also see: https://www.geomesa.org/documentation/user/datastores/index_overview.html + * + * @param left geometry-like column + * @param right CRS column + * @param indexResolution resolution level of the space filling curve - + * i.e. how many times the space will be recursively quartered + * 1-18 is typical. 
+ */ +@ExpressionDescription( + usage = "_FUNC_(geom, crs) - Constructs a XZ2 index in WGS84/EPSG:4326", + arguments = """ + Arguments: + * geom - Geometry or item with Geometry: Extent, ProjectedRasterTile, or RasterSource + * crs - the native CRS of the `geom` column +""" +) +case class XZ2Indexer(left: Expression, right: Expression, indexResolution: Short) + extends BinaryExpression with CodegenFallback { + + override def nodeName: String = "rf_spatial_index" + + override def dataType: DataType = LongType + + override def checkInputDataTypes(): TypeCheckResult = { + if (!extentLikeExtractor.orElse(projectedRasterLikeExtractor).isDefinedAt(left.dataType)) + TypeCheckFailure(s"Input type '${left.dataType}' does not look like something with an Extent or something with one.") + else if(!crsExtractor.isDefinedAt(right.dataType)) + TypeCheckFailure(s"Input type '${right.dataType}' does not look like something with a CRS.") + else TypeCheckSuccess + } + + private lazy val indexer = XZ2SFC(indexResolution) + + override protected def nullSafeEval(leftInput: Any, rightInput: Any): Any = { + val crs = crsExtractor(right.dataType)(rightInput) + + val coords = left.dataType match { + case t if rf.WithTypeConformity(t).conformsTo(JTSTypes.GeometryTypeInstance) => + JTSTypes.GeometryTypeInstance.deserialize(leftInput) + case t if t.conformsTo[Extent] => + row(leftInput).to[Extent] + case t if t.conformsTo[Envelope] => + row(leftInput).to[Envelope] + case _: RasterSourceUDT ⇒ + row(leftInput).to[RasterSource](RasterSourceUDT.rasterSourceSerializer).extent + case t if t.conformsTo[ProjectedRasterTile] => + row(leftInput).to[ProjectedRasterTile].extent + case t if t.conformsTo[RasterRef] => + row(leftInput).to[RasterRef].extent + } + + // If no transformation is needed then just normalize to an Envelope + val env = if(crs == LatLng) coords match { + case e: Extent => e.jtsEnvelope + case g: Geometry => g.getEnvelopeInternal + case e: Envelope => e + } + // Otherwise convert to 
geometry, transform, and get envelope + else { + val trans = new ReprojectionTransformer(crs, LatLng) + coords match { + case e: Extent => trans(e).getEnvelopeInternal + case g: Geometry => trans(g).getEnvelopeInternal + case e: Envelope => trans(e).getEnvelopeInternal + } + } + + val index = indexer.index( + env.getMinX, env.getMinY, env.getMaxX, env.getMaxY, + lenient = false + ) + index + } +} + +object XZ2Indexer { + import org.locationtech.rasterframes.encoders.SparkBasicEncoders.longEnc + def apply(targetExtent: Column, targetCRS: Column, indexResolution: Short): TypedColumn[Any, Long] = + new Column(new XZ2Indexer(targetExtent.expr, targetCRS.expr, indexResolution)).as[Long] + def apply(targetExtent: Column, targetCRS: Column): TypedColumn[Any, Long] = + new Column(new XZ2Indexer(targetExtent.expr, targetCRS.expr, 18)).as[Long] + def apply(targetExtent: Column, indexResolution: Short = 18): TypedColumn[Any, Long] = + new Column(new XZ2Indexer(targetExtent.expr, GetCRS(targetExtent.expr), indexResolution)).as[Long] +} diff --git a/core/src/main/scala/org/locationtech/rasterframes/jts/ReprojectionTransformer.scala b/core/src/main/scala/org/locationtech/rasterframes/jts/ReprojectionTransformer.scala index c4751cb3c..54b45c034 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/jts/ReprojectionTransformer.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/jts/ReprojectionTransformer.scala @@ -21,9 +21,10 @@ package org.locationtech.rasterframes.jts -import org.locationtech.jts.geom.{CoordinateSequence, Geometry} +import org.locationtech.jts.geom.{CoordinateSequence, Envelope, Geometry, GeometryFactory} import org.locationtech.jts.geom.util.GeometryTransformer import geotrellis.proj4.CRS +import geotrellis.vector.Extent /** * JTS Geometry reprojection transformation routine. 
@@ -32,6 +33,12 @@ import geotrellis.proj4.CRS */ class ReprojectionTransformer(src: CRS, dst: CRS) extends GeometryTransformer { lazy val transform = geotrellis.proj4.Transform(src, dst) + @transient + private lazy val gf = new GeometryFactory() + def apply(geometry: Geometry): Geometry = transform(geometry) + def apply(extent: Extent): Geometry = transform(extent.jtsGeom) + def apply(env: Envelope): Geometry = transform(gf.toGeometry(env)) + override def transformCoordinates(coords: CoordinateSequence, parent: Geometry): CoordinateSequence = { val fact = parent.getFactory val retval = fact.getCoordinateSequenceFactory.create(coords) diff --git a/core/src/main/scala/org/locationtech/rasterframes/rasterframes.scala b/core/src/main/scala/org/locationtech/rasterframes/rasterframes.scala index f22753c1e..b1958d36b 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/rasterframes.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/rasterframes.scala @@ -31,7 +31,6 @@ import org.locationtech.geomesa.spark.jts.DataFrameFunctions import org.locationtech.rasterframes.encoders.StandardEncoders import org.locationtech.rasterframes.extensions.Implicits import org.locationtech.rasterframes.model.TileDimensions -import org.locationtech.rasterframes.util.ZeroSevenCompatibilityKit import org.slf4j.LoggerFactory import shapeless.tag.@@ @@ -39,7 +38,6 @@ import scala.reflect.runtime.universe._ package object rasterframes extends StandardColumns with RasterFunctions - with ZeroSevenCompatibilityKit.RasterFunctions with Implicits with rasterframes.jts.Implicits with StandardEncoders @@ -48,9 +46,8 @@ package object rasterframes extends StandardColumns // Don't make this a `lazy val`... breaks Spark assemblies for some reason. 
protected def logger: Logger = Logger(LoggerFactory.getLogger(getClass.getName)) - @transient private[rasterframes] - val rfConfig = ConfigFactory.load().getConfig("rasterframes") + def rfConfig = ConfigFactory.load().getConfig("rasterframes") /** The generally expected tile size, as defined by configuration property `rasterframes.nominal-tile-size`.*/ @transient @@ -81,7 +78,6 @@ package object rasterframes extends StandardColumns } rf.register(sqlContext) - ZeroSevenCompatibilityKit.register(sqlContext) rasterframes.functions.register(sqlContext) rasterframes.expressions.register(sqlContext) rasterframes.rules.register(sqlContext) diff --git a/core/src/main/scala/org/locationtech/rasterframes/util/DataFrameRenderers.scala b/core/src/main/scala/org/locationtech/rasterframes/util/DataFrameRenderers.scala index ae57edcf3..36872332f 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/util/DataFrameRenderers.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/util/DataFrameRenderers.scala @@ -24,12 +24,14 @@ package org.locationtech.rasterframes.util import geotrellis.raster.render.ColorRamps import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions.{base64, concat, concat_ws, length, lit, substring, when} +import org.apache.spark.sql.jts.JTSTypes import org.apache.spark.sql.types.{StringType, StructField} import org.locationtech.rasterframes.expressions.DynamicExtractors import org.locationtech.rasterframes.{rfConfig, rf_render_png, rf_resample} +import org.apache.spark.sql.rf.WithTypeConformity /** - * DataFrame extensiosn for rendering sample content in a number of ways + * DataFrame extension for rendering sample content in a number of ways */ trait DataFrameRenderers { private val truncateWidth = rfConfig.getInt("max-truncate-row-element-length") @@ -47,8 +49,9 @@ trait DataFrameRenderers { lit("\">") ) else { + val isGeom = WithTypeConformity(c.dataType).conformsTo(JTSTypes.GeometryTypeInstance) val str = 
resolved.cast(StringType) - if (truncate) + if (truncate || isGeom) when(length(str) > lit(truncateWidth), concat(substring(str, 1, truncateWidth), lit("...")) ) diff --git a/core/src/main/scala/org/locationtech/rasterframes/util/ZeroSevenCompatibilityKit.scala b/core/src/main/scala/org/locationtech/rasterframes/util/ZeroSevenCompatibilityKit.scala deleted file mode 100644 index 3a78bcbaa..000000000 --- a/core/src/main/scala/org/locationtech/rasterframes/util/ZeroSevenCompatibilityKit.scala +++ /dev/null @@ -1,371 +0,0 @@ -/* - * This software is licensed under the Apache 2 license, quoted below. - * - * Copyright 2019 Astraea, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * [http://www.apache.org/licenses/LICENSE-2.0] - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - * - */ - -package org.locationtech.rasterframes.util - -import org.locationtech.rasterframes.expressions.TileAssembler -import org.locationtech.rasterframes.expressions.accessors._ -import org.locationtech.rasterframes.expressions.aggregates._ -import org.locationtech.rasterframes.expressions.generators._ -import org.locationtech.rasterframes.expressions.localops._ -import org.locationtech.rasterframes.expressions.tilestats._ -import org.locationtech.rasterframes.expressions.transformers._ -import org.locationtech.rasterframes.stats._ -import org.locationtech.rasterframes.{functions => F} -import org.locationtech.jts.geom.Geometry -import geotrellis.proj4.CRS -import geotrellis.raster.mapalgebra.local.LocalTileBinaryOp -import geotrellis.raster.{CellType, Tile} -import org.apache.spark.annotation.Experimental -import org.apache.spark.sql.catalyst.analysis.FunctionRegistry -import org.apache.spark.sql.functions.{lit, udf} -import org.apache.spark.sql.rf.VersionShims._ -import org.apache.spark.sql.{Column, SQLContext, TypedColumn, rf} - -/** - * UDFs for working with Tiles in Spark DataFrames. - * - * @since 4/3/17 - */ -object ZeroSevenCompatibilityKit { - import org.locationtech.rasterframes.encoders.StandardEncoders._ - - trait RasterFunctions { - private val delegate = new org.locationtech.rasterframes.RasterFunctions {} - // format: off - /** Create a row for each cell in Tile. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def explodeTiles(cols: Column*): Column = delegate.rf_explode_tiles(cols: _*) - - /** Create a row for each cell in Tile with random sampling and optional seed. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") - def explodeTilesSample(sampleFraction: Double, seed: Option[Long], cols: Column*): Column = - ExplodeTiles(sampleFraction, seed, cols) - - /** Create a row for each cell in Tile with random sampling (no seed). */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def explodeTilesSample(sampleFraction: Double, cols: Column*): Column = - ExplodeTiles(sampleFraction, None, cols) - - /** Query the number of (cols, rows) in a Tile. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileDimensions(col: Column): Column = GetDimensions(col) - - @Experimental - /** Convert array in `arrayCol` into a Tile of dimensions `cols` and `rows`*/ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def arrayToTile(arrayCol: Column, cols: Int, rows: Int) = withAlias("rf_array_to_tile", arrayCol)( - udf[Tile, AnyRef](F.arrayToTile(cols, rows)).apply(arrayCol) - ) - - /** Create a Tile from a column of cell data with location indexes and preform cell conversion. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def assembleTile(columnIndex: Column, rowIndex: Column, cellData: Column, tileCols: Int, tileRows: Int, ct: CellType): TypedColumn[Any, Tile] = - convertCellType(TileAssembler(columnIndex, rowIndex, cellData, lit(tileCols), lit(tileRows)), ct).as(cellData.columnName).as[Tile] - - /** Create a Tile from a column of cell data with location indexes. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") - def assembleTile(columnIndex: Column, rowIndex: Column, cellData: Column, tileCols: Column, tileRows: Column): TypedColumn[Any, Tile] = - TileAssembler(columnIndex, rowIndex, cellData, tileCols, tileRows) - - /** Extract the Tile's cell type */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def cellType(col: Column): TypedColumn[Any, CellType] = GetCellType(col) - - /** Change the Tile's cell type */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def convertCellType(col: Column, cellType: CellType): Column = - SetCellType(col, cellType) - - /** Change the Tile's cell type */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def convertCellType(col: Column, cellTypeName: String): Column = - SetCellType(col, cellTypeName) - - /** Convert a bounding box structure to a Geometry type. Intented to support multiple schemas. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def boundsGeometry(bounds: Column): TypedColumn[Any, Geometry] = ExtentToGeometry(bounds) - - /** Assign a `NoData` value to the Tiles. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def withNoData(col: Column, nodata: Double) = delegate.rf_with_no_data(col, nodata) - - /** Compute the full column aggregate floating point histogram. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def aggHistogram(col: Column): TypedColumn[Any, CellHistogram] = delegate.rf_agg_approx_histogram(col) - - /** Compute the full column aggregate floating point statistics. 
*/ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def aggStats(col: Column): TypedColumn[Any, CellStatistics] = delegate.rf_agg_stats(col) - - /** Computes the column aggregate mean. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def aggMean(col: Column) = CellMeanAggregate(col) - - /** Computes the number of non-NoData cells in a column. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def aggDataCells(col: Column): TypedColumn[Any, Long] = delegate.rf_agg_data_cells(col) - - /** Computes the number of NoData cells in a column. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def aggNoDataCells(col: Column): TypedColumn[Any, Long] = delegate.rf_agg_no_data_cells(col) - - /** Compute the Tile-wise mean */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileMean(col: Column): TypedColumn[Any, Double] = delegate.rf_tile_mean(col) - - /** Compute the Tile-wise sum */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileSum(col: Column): TypedColumn[Any, Double] = delegate.rf_tile_sum(col) - - /** Compute the minimum cell value in tile. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileMin(col: Column): TypedColumn[Any, Double] = delegate.rf_tile_min(col) - - /** Compute the maximum cell value in tile. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") - def tileMax(col: Column): TypedColumn[Any, Double] = delegate.rf_tile_max(col) - - /** Compute TileHistogram of Tile values. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileHistogram(col: Column): TypedColumn[Any, CellHistogram] = delegate.rf_tile_histogram(col) - - /** Compute statistics of Tile values. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileStats(col: Column): TypedColumn[Any, CellStatistics] = delegate.rf_tile_stats(col) - - /** Counts the number of non-NoData cells per Tile. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def dataCells(tile: Column): TypedColumn[Any, Long] = delegate.rf_data_cells(tile) - - /** Counts the number of NoData cells per Tile. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def noDataCells(tile: Column): TypedColumn[Any, Long] = delegate.rf_no_data_cells(tile) - - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def isNoDataTile(tile: Column): TypedColumn[Any, Boolean] = delegate.rf_is_no_data_tile(tile) - - /** Compute cell-local aggregate descriptive statistics for a column of Tiles. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localAggStats(col: Column): Column = delegate.rf_agg_local_stats(col) - - /** Compute the cell-wise/local max operation between Tiles in a column. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") - def localAggMax(col: Column): TypedColumn[Any, Tile] = delegate.rf_agg_local_max(col) - - /** Compute the cellwise/local min operation between Tiles in a column. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localAggMin(col: Column): TypedColumn[Any, Tile] = delegate.rf_agg_local_min(col) - - /** Compute the cellwise/local mean operation between Tiles in a column. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localAggMean(col: Column): TypedColumn[Any, Tile] = delegate.rf_agg_local_mean(col) - - /** Compute the cellwise/local count of non-NoData cells for all Tiles in a column. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localAggDataCells(col: Column): TypedColumn[Any, Tile] = delegate.rf_agg_local_data_cells(col) - - /** Compute the cellwise/local count of NoData cells for all Tiles in a column. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localAggNoDataCells(col: Column): TypedColumn[Any, Tile] = delegate.rf_agg_local_no_data_cells(col) - - /** Cellwise addition between two Tiles. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localAdd(left: Column, right: Column): Column = delegate.rf_local_add(left, right) - - /** Cellwise addition of a scalar to a tile. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localAddScalar[T: Numeric](tileCol: Column, value: T): Column = delegate.rf_local_add(tileCol, value) - - /** Cellwise subtraction between two Tiles. 
*/ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localSubtract(left: Column, right: Column): Column = delegate.rf_local_subtract(left, right) - - /** Cellwise subtraction of a scalar from a tile. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localSubtractScalar[T: Numeric](tileCol: Column, value: T): Column = delegate.rf_local_subtract(tileCol, value) - /** Cellwise multiplication between two Tiles. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localMultiply(left: Column, right: Column): Column = delegate.rf_local_multiply(left, right) - - /** Cellwise multiplication of a tile by a scalar. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localMultiplyScalar[T: Numeric](tileCol: Column, value: T): Column = delegate.rf_local_multiply(tileCol, value) - - /** Cellwise division between two Tiles. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localDivide(left: Column, right: Column): Column = delegate.rf_local_divide(left, right) - - /** Cellwise division of a tile by a scalar. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localDivideScalar[T: Numeric](tileCol: Column, value: T): Column = delegate.rf_local_divide(tileCol, value) - /** Perform an arbitrary GeoTrellis `LocalTileBinaryOp` between two Tile columns. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") - def localAlgebra(op: LocalTileBinaryOp, left: Column, right: Column): - TypedColumn[Any, Tile] = - withAlias(opName(op), left, right)( - udf[Tile, Tile, Tile](op.apply).apply(left, right) - ).as[Tile] - - /** Compute the normalized difference of two tile columns */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def normalizedDifference(left: Column, right: Column): TypedColumn[Any, Tile] = delegate.rf_normalized_difference(left, right) - - /** Constructor for constant tile column */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def makeConstantTile(value: Number, cols: Int, rows: Int, cellType: String): TypedColumn[Any, Tile] = - udf(() => F.makeConstantTile(value, cols, rows, cellType)).apply().as(s"constant_$cellType").as[Tile] - - /** Alias for column of constant tiles of zero */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileZeros(cols: Int, rows: Int, cellType: String = "float64"): TypedColumn[Any, Tile] = - udf(() => F.tileZeros(cols, rows, cellType)).apply().as(s"zeros_$cellType").as[Tile] - - /** Alias for column of constant tiles of one */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def tileOnes(cols: Int, rows: Int, cellType: String = "float64"): TypedColumn[Any, Tile] = - udf(() => F.tileOnes(cols, rows, cellType)).apply().as(s"ones_$cellType").as[Tile] - - /** Where the mask tile equals the mask value, replace values in the source tile with NODATA */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") - def maskByValue(sourceTile: Column, maskTile: Column, maskValue: Column): TypedColumn[Any, Tile] = - delegate.rf_mask_by_value(sourceTile, maskTile, maskValue) - - /** Where the mask tile DOES NOT contain NODATA, replace values in the source tile with NODATA */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def inverseMask(sourceTile: Column, maskTile: Column): TypedColumn[Any, Tile] = - delegate.rf_inverse_mask(sourceTile, maskTile) - - /** Reproject a column of geometry from one CRS to another. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def reprojectGeometry(sourceGeom: Column, srcCRS: CRS, dstCRS: CRS): TypedColumn[Any, Geometry] = - delegate.st_reproject(sourceGeom, srcCRS, dstCRS) - - /** Render Tile as ASCII string for debugging purposes. */ - @Experimental - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def renderAscii(col: Column): TypedColumn[Any, String] = delegate.rf_render_ascii(col) - - /** Cellwise less than value comparison between two tiles. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localLess(left: Column, right: Column): Column = - delegate.rf_local_less(left, right) - - - /** Cellwise less than value comparison between a tile and a scalar. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localLessScalar[T: Numeric](tileCol: Column, value: T): Column = delegate.rf_local_less(tileCol, value) - - /** Cellwise less than or equal to value comparison between a tile and a scalar. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") - def localLessEqual(left: Column, right: Column): Column = delegate.rf_local_less_equal(left, right) - - /** Cellwise less than or equal to value comparison between a tile and a scalar. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localLessEqualScalar[T: Numeric](tileCol: Column, value: T): Column = delegate.rf_local_less_equal(tileCol, value) - - /** Cellwise greater than value comparison between two tiles. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localGreater(left: Column, right: Column): Column = - delegate.rf_local_greater(left, right) - - /** Cellwise greater than value comparison between a tile and a scalar. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localGreaterScalar[T: Numeric](tileCol: Column, value: T): Column = delegate.rf_local_greater(tileCol, value) - - /** Cellwise greater than or equal to value comparison between two tiles. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localGreaterEqual(left: Column, right: Column): Column = delegate.rf_local_greater_equal(left, right) - - /** Cellwise greater than or equal to value comparison between a tile and a scalar. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localGreaterEqualScalar[T: Numeric](tileCol: Column, value: T): Column = delegate.rf_local_greater_equal(tileCol, value) - - /** Cellwise equal to value comparison between two tiles. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. 
Please use \"snake_case\" variant instead.", "0.8.0") - def localEqual(left: Column, right: Column): Column = delegate.rf_local_equal(left, right) - - /** Cellwise equal to value comparison between a tile and a scalar. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localEqualScalar[T: Numeric](tileCol: Column, value: T): Column = delegate.rf_local_equal(tileCol, value) - - /** Cellwise inequality comparison between two tiles. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localUnequal(left: Column, right: Column): Column = delegate.rf_local_unequal(left, right) - - /** Cellwise inequality comparison between a tile and a scalar. */ - @deprecated("Part of 0.7.x compatibility kit, to be removed after 0.8.x. Please use \"snake_case\" variant instead.", "0.8.0") - def localUnequalScalar[T: Numeric](tileCol: Column, value: T): Column = delegate.rf_local_unequal(tileCol, value) - } - - def register(sqlContext: SQLContext): Unit = { - - /** Unary expression builder builder. */ - def ub[A, B](f: A => B)(a: Seq[A]): B = f(a.head) - /** Binary expression builder builder. */ - def bb[A, B](f: (A, A) => B)(a: Seq[A]): B = f(a.head, a.last) - /** Trinary expression builder builder. */ - def tb[A, B](f: (A, A, A) => B)(a: Seq[A]): B = f(a.head, a.tail.head, a.last) - - // Expression-oriented functions have a different registration scheme - // Currently have to register with the `builtin` registry due to Spark data hiding. 
- val registry: FunctionRegistry = rf.registry(sqlContext) - registry.registerFunc("rf_explodeTiles", ExplodeTiles.apply(1.0, None, _)) - registry.registerFunc("rf_cellType", ub(GetCellType.apply)) - registry.registerFunc("rf_convertCellType", bb(SetCellType.apply)) - registry.registerFunc("rf_tileDimensions", ub(GetDimensions.apply)) - registry.registerFunc("rf_boundsGeometry", ub(ExtentToGeometry.apply)) - registry.registerFunc("rf_localAdd", bb(Add.apply)) - registry.registerFunc("rf_localSubtract", bb(Subtract.apply)) - registry.registerFunc("rf_localMultiply", bb(Multiply.apply)) - registry.registerFunc("rf_localDivide", bb(Divide.apply)) - registry.registerFunc("rf_normalizedDifference", bb(NormalizedDifference.apply)) - registry.registerFunc("rf_localLess", bb(Less.apply)) - registry.registerFunc("rf_localLessEqual", bb(LessEqual.apply)) - registry.registerFunc("rf_localGreater", bb(Greater.apply)) - registry.registerFunc("rf_localGreaterEqual", bb(GreaterEqual.apply)) - registry.registerFunc("rf_localEqual", bb(Equal.apply)) - registry.registerFunc("rf_localUnequal", bb(Unequal.apply)) - registry.registerFunc("rf_tileSum", ub(Sum.apply)) - registry.registerFunc("rf_dataCells", ub(DataCells.apply)) - registry.registerFunc("rf_noDataCells", ub(NoDataCells.apply)) - registry.registerFunc("rf_isNoDataTile", ub(IsNoDataTile.apply)) - registry.registerFunc("rf_tileMin", ub(TileMin.apply)) - registry.registerFunc("rf_tileMax", ub(TileMax.apply)) - registry.registerFunc("rf_tileMean", ub(TileMean.apply)) - registry.registerFunc("rf_tileStats", ub(TileStats.apply)) - registry.registerFunc("rf_tileHistogram", ub(TileHistogram.apply)) - registry.registerFunc("rf_aggStats", ub(CellStatsAggregate.CellStatsAggregateUDAF.apply)) - registry.registerFunc("rf_aggHistogram", ub(HistogramAggregate.HistogramAggregateUDAF.apply)) - registry.registerFunc("rf_localAggStats", ub(LocalStatsAggregate.LocalStatsAggregateUDAF.apply)) - registry.registerFunc("rf_renderAscii", 
ub(DebugRender.RenderMatrix.apply)) - registry.registerFunc("rf_localAggMax", ub(LocalTileOpAggregate.LocalMaxUDAF.apply)) - registry.registerFunc("rf_localAggMin", ub(LocalTileOpAggregate.LocalMinUDAF.apply)) - registry.registerFunc("rf_localAggCount", ub(LocalCountAggregate.LocalDataCellsUDAF.apply)) - registry.registerFunc("rf_localAggMean", ub(LocalMeanAggregate.apply)) - registry.registerFunc("rf_reprojectGeometry", tb(ReprojectGeometry.apply)) - - sqlContext.udf.register("rf_makeConstantTile", F.makeConstantTile) - sqlContext.udf.register("rf_tileZeros", F.tileZeros) - sqlContext.udf.register("rf_tileOnes", F.tileOnes) - sqlContext.udf.register("rf_cellTypes", F.cellTypes) - } -} diff --git a/core/src/test/resources/MCD43A4.A2019111.h30v06.006.2019120033434_01.mrf.aux.xml b/core/src/test/resources/MCD43A4.A2019111.h30v06.006.2019120033434_01.mrf.aux.xml deleted file mode 100644 index 5a18f6944..000000000 --- a/core/src/test/resources/MCD43A4.A2019111.h30v06.006.2019120033434_01.mrf.aux.xml +++ /dev/null @@ -1,92 +0,0 @@ - - - LERC - PIXEL - - - 06121997 - MODIS - MODIS - Terra - Aqua - MODIS - MODIS - Passed - Passed was set as a default value. 
More algorithm will be developed - 0 - AMBRALS_V4.0R1 - v1.0500m - 15.0 - 463.312716527778 - volume - 2400 - 2400 - Day - Mandatory QA: - 0 = processed, good quality (full BRDF inversions) - 1 = processed, see other QA (magnitude BRDF inversions) - - 6.1 - 150.120692476232 - N - False - 75.0 - 86400 - 43200 - 19.9448109058663, 30.0666177912155, 29.9990071837477, 19.8789125843729 - 127.31379517564, 138.161359988435, 150.130532080915, 138.321766284772 - 1, 2, 3, 4 - HDFEOS_V2.19 - 30 - 10.5067/MODIS/MCD43A4.006 - 10.5067/MODIS/MCD43A4.006 - http://dx.doi.org - http://dx.doi.org - MYD09GA.A2019113.h30v06.006.2019115025936.hdf, MYD09GA.A2019114.h30v06.006.2019117021858.hdf, MYD09GA.A2019115.h30v06.006.2019117044251.hdf, MYD09GA.A2019116.h30v06.006.2019118031111.hdf, MYD09GA.A2019117.h30v06.006.2019119025916.hdf, MYD09GA.A2019118.h30v06.006.2019120030848.hdf, MOD09GA.A2019113.h30v06.006.2019115032521.hdf, MOD09GA.A2019114.h30v06.006.2019116030646.hdf, MOD09GA.A2019115.h30v06.006.2019117050730.hdf, MOD09GA.A2019116.h30v06.006.2019118032616.hdf, MOD09GA.A2019117.h30v06.006.2019119032020.hdf, MOD09GA.A2019118.h30v06.006.2019120032257.hdf, MCD43DB.A2019110.6.h30v06.hdf - MCD43A4.A2019111.h30v06.006.2019120033434.hdf - 6.1.34 - MODIS/Terra+Aqua BRDF/Albedo Nadir BRDF-Adjusted Ref Daily L3 Global - 500m - BRDF_Albedo_Band_Mandatory_Quality_Band1 - 0 - 500m - 29.9999999973059 - 1 - NOT SET - 0 - 0 - 0 - 100 - 0 - 6.0.42 - MODAPS - Linux minion7043 3.10.0-957.5.1.el7.x86_64 #1 SMP Fri Feb 1 14:54:57 UTC 2019 x86_64 x86_64 x86_64 GNU/Linux - 2019-04-30T03:34:48.000Z - 0 - 0 - 99 - 0 - 2019-04-13 - 00:00:00.000000 - 2019-04-28 - 23:59:59.999999 - processed once - further update is anticipated - Not Investigated - See http://landweb.nascom/nasa.gov/cgi-bin/QA_WWW/qaFlagPage.cgi?sat=aqua the product Science Quality status. 
- 06121997 - MCD43A4 - 19.9999999982039 - 2015 - 51030006 - concatenated flags - 0, 254 - 6 - 6 - 127.701332684185 - 255 - - - BRDF_Albedo_Band_Mandatory_Quality_Band1 - concatenated flags - - diff --git a/core/src/test/scala/org/locationtech/rasterframes/ExtensionMethodSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/ExtensionMethodSpec.scala index 4f5fe3591..bb3894162 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/ExtensionMethodSpec.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/ExtensionMethodSpec.scala @@ -39,7 +39,7 @@ import scala.xml.parsing.XhtmlParser class ExtensionMethodSpec extends TestEnvironment with TestData with SubdivideSupport { lazy val rf = sampleTileLayerRDD.toLayer - describe("DataFrame exention methods") { + describe("DataFrame extension methods") { it("should maintain original type") { val df = rf.withPrefixedColumnNames("_foo_") "val rf2: RasterFrameLayer = df" should compile @@ -49,7 +49,7 @@ class ExtensionMethodSpec extends TestEnvironment with TestData with SubdivideSu "val Some(col) = df.spatialKeyColumn" should compile } } - describe("RasterFrameLayer exention methods") { + describe("RasterFrameLayer extension methods") { it("should provide spatial key column") { noException should be thrownBy { rf.spatialKeyColumn @@ -124,6 +124,10 @@ class ExtensionMethodSpec extends TestEnvironment with TestData with SubdivideSu val md3 = rf.toMarkdown(truncate=true, renderTiles = false) md3 shouldNot include(" sfc.index(e.xmin, e.ymin, e.xmax, e.ymax)) + + def reproject(dst: CRS)(e: Extent): Extent = e.reproject(LatLng, dst) + + describe("Spatial index generation") { + import spark.implicits._ + it("should be SQL registered with docs") { + checkDocs("rf_spatial_index") + } + it("should create index from Extent") { + val crs: CRS = WebMercator + val df = testExtents.map(reproject(crs)).map(Tuple1.apply).toDF("extent") + val indexes = df.select(rf_spatial_index($"extent", 
serialized_literal(crs))).collect() + + forEvery(indexes.zip(expected)) { case (i, e) => + i should be (e) + } + } + it("should create index from Geometry") { + val crs: CRS = LatLng + val df = testExtents.map(_.jtsGeom).map(Tuple1.apply).toDF("extent") + val indexes = df.select(rf_spatial_index($"extent", serialized_literal(crs))).collect() + + forEvery(indexes.zip(expected)) { case (i, e) => + i should be (e) + } + } + it("should create index from ProjectedRasterTile") { + val crs: CRS = WebMercator + val tile = TestData.randomTile(2, 2, CellType.fromName("uint8")) + val prts = testExtents.map(reproject(crs)).map(ProjectedRasterTile(tile, _, crs)) + + implicit val enc = Encoders.tuple(ProjectedRasterTile.prtEncoder, Encoders.scalaInt) + // The `id` here is to deal with Spark auto projecting single columns dataframes and needing to provide an encoder + val df = prts.zipWithIndex.toDF("proj_raster", "id") + val indexes = df.select(rf_spatial_index($"proj_raster")).collect() + + forEvery(indexes.zip(expected)) { case (i, e) => + i should be (e) + } + } + it("should create index from RasterSource") { + val crs: CRS = WebMercator + val tile = TestData.randomTile(2, 2, CellType.fromName("uint8")) + val srcs = testExtents.map(reproject(crs)).map(InMemoryRasterSource(tile, _, crs): RasterSource).toDF("src") + val indexes = srcs.select(rf_spatial_index($"src")).collect() + + forEvery(indexes.zip(expected)) { case (i, e) => + i should be (e) + } + + } + it("should work when CRS is LatLng") { + val df = testExtents.map(Tuple1.apply).toDF("extent") + val crs: CRS = LatLng + val indexes = df.select(rf_spatial_index($"extent", serialized_literal(crs))).collect() + + forEvery(indexes.zip(expected)) { case (i, e) => + i should be (e) + } + } + it("should support custom resolution") { + val sfc = XZ2SFC(3) + val expected = testExtents.map(e => sfc.index(e.xmin, e.ymin, e.xmax, e.ymax)) + val df = testExtents.map(Tuple1.apply).toDF("extent") + val crs: CRS = LatLng + val indexes = 
df.select(rf_spatial_index($"extent", serialized_literal(crs), 3)).collect() + + forEvery(indexes.zip(expected)) { case (i, e) => + i should be (e) + } + } + } +} diff --git a/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala index 80f0a7082..51e3338d2 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala @@ -253,14 +253,18 @@ class RasterRefSpec extends TestEnvironment with TestData { } } - it("should construct a RasterRefTile without I/O") { + it("should construct and inspect a RasterRefTile without I/O") { new Fixture { // SimpleRasterInfo is a proxy for header data requests. - val start = SimpleRasterInfo.cacheStats.hitCount() + val startStats = SimpleRasterInfo.cacheStats val t: ProjectedRasterTile = RasterRefTile(subRaster) - val result = Seq(t, subRaster.tile).toDF("tile").first() - val end = SimpleRasterInfo.cacheStats.hitCount() - end should be(start) + val df = Seq(t, subRaster.tile).toDF("tile") + val result = df.first() + SimpleRasterInfo.cacheStats.hitCount() should be(startStats.hitCount()) + SimpleRasterInfo.cacheStats.missCount() should be(startStats.missCount()) + val info = df.select(rf_dimensions($"tile"), rf_extent($"tile")).first() + SimpleRasterInfo.cacheStats.hitCount() should be(startStats.hitCount() + 2) + SimpleRasterInfo.cacheStats.missCount() should be(startStats.missCount()) } } } diff --git a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffCollectionRelation.scala b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffCollectionRelation.scala deleted file mode 100644 index 3148a67d0..000000000 --- a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffCollectionRelation.scala +++ /dev/null @@ -1,82 +0,0 @@ -/* - * This software is 
licensed under the Apache 2 license, quoted below. - * - * Copyright 2018 Astraea, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * [http://www.apache.org/licenses/LICENSE-2.0] - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - * - * SPDX-License-Identifier: Apache-2.0 - * - */ - -package org.locationtech.rasterframes.datasource.geotiff - -import java.net.URI - -import geotrellis.proj4.CRS -import geotrellis.spark.io.hadoop.HadoopGeoTiffRDD -import geotrellis.vector.{Extent, ProjectedExtent} -import org.apache.hadoop.fs.Path -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.rf.TileUDT -import org.apache.spark.sql.sources.{BaseRelation, PrunedScan} -import org.apache.spark.sql.types.{StringType, StructField, StructType} -import org.apache.spark.sql.{Row, SQLContext} -import org.locationtech.rasterframes._ -import org.locationtech.rasterframes.datasource.geotiff.GeoTiffCollectionRelation.Cols -import org.locationtech.rasterframes.encoders.CatalystSerializer._ -import org.locationtech.rasterframes.util._ - -private[geotiff] -case class GeoTiffCollectionRelation(sqlContext: SQLContext, uri: URI, bandCount: Int) extends BaseRelation with PrunedScan { - - override def schema: StructType = StructType(Seq( - StructField(Cols.PATH, StringType, false), - StructField(EXTENT_COLUMN.columnName, schemaOf[Extent], nullable = true), - StructField(CRS_COLUMN.columnName, schemaOf[CRS], false) - ) ++ ( - if(bandCount == 1) Seq(StructField(Cols.TL, new TileUDT, false)) - else for(b ← 1 to bandCount) yield StructField(Cols.TL + "_" + b, new TileUDT, 
nullable = true) - )) - - val keyer = (u: URI, e: ProjectedExtent) ⇒ (u.getPath, e) - - override def buildScan(requiredColumns: Array[String]): RDD[Row] = { - implicit val sc = sqlContext.sparkContext - - val columnIndexes = requiredColumns.map(schema.fieldIndex) - - HadoopGeoTiffRDD.multiband(new Path(uri.toASCIIString), keyer, HadoopGeoTiffRDD.Options.DEFAULT) - .map { case ((path, pe), mbt) ⇒ - val entries = columnIndexes.map { - case 0 ⇒ path - case 1 ⇒ pe.extent.toRow - case 2 ⇒ pe.crs.toRow - case i if i > 2 ⇒ { - if(bandCount == 1 && mbt.bandCount > 2) mbt.color() - else mbt.band(i - 3) - } - } - Row(entries: _*) - } - } -} - -object GeoTiffCollectionRelation { - object Cols { - lazy val PATH = "path" - lazy val CRS = "crs" - lazy val EX = GEOMETRY_COLUMN.columnName - lazy val TL = TILE_COLUMN.columnName - } -} diff --git a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala index 9e2d8dcb3..d236449ed 100644 --- a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala +++ b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala @@ -49,18 +49,16 @@ class GeoTiffDataSource def shortName() = GeoTiffDataSource.SHORT_NAME + /** Read single geotiff as a relation. */ def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) = { require(parameters.path.isDefined, "Valid URI 'path' parameter required.") sqlContext.withRasterFrames val p = parameters.path.get - - if (p.getPath.contains("*")) { - val bandCount = parameters.get(GeoTiffDataSource.BAND_COUNT_PARAM).map(_.toInt).getOrElse(1) - GeoTiffCollectionRelation(sqlContext, p, bandCount) - } else GeoTiffRelation(sqlContext, p) + GeoTiffRelation(sqlContext, p) } + /** Write dataframe containing bands into a single geotiff. 
Note: performs a driver collect, and is not "big data" friendly. */ override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { require(parameters.path.isDefined, "Valid URI 'path' parameter required.") val path = parameters.path.get @@ -71,8 +69,6 @@ class GeoTiffDataSource require(tileCols.nonEmpty, "Could not find any tile columns.") - - val destCRS = parameters.crs.orElse(df.asLayerSafely.map(_.crs)).getOrElse( throw new IllegalArgumentException("A destination CRS must be provided") ) diff --git a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala index 061e9fb56..03b2fd0da 100644 --- a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala +++ b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala @@ -37,7 +37,7 @@ class RasterSourceDataSource extends DataSourceRegister with RelationProvider { override def shortName(): String = SHORT_NAME override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val bands = parameters.bandIndexes - val tiling = parameters.tileDims + val tiling = parameters.tileDims.orElse(Some(NOMINAL_TILE_DIMS)) val lazyTiles = parameters.lazyTiles val spec = parameters.pathSpec val catRef = spec.fold(_.registerAsTable(sqlContext), identity) diff --git a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceRelation.scala b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceRelation.scala index 6af519f56..9b381d3a6 100644 --- a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceRelation.scala +++ 
b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceRelation.scala @@ -69,6 +69,9 @@ case class RasterSourceRelation( catalog.schema.fields.filter(f => !catalogTable.bandColumnNames.contains(f.name)) } + protected def defaultNumPartitions: Int = + sqlContext.sparkSession.sessionState.conf.numShufflePartitions + override def schema: StructType = { val tileSchema = schemaOf[ProjectedRasterTile] val paths = for { @@ -84,10 +87,11 @@ case class RasterSourceRelation( override def buildScan(): RDD[Row] = { import sqlContext.implicits._ - // The general transformaion is: + // The general transformation is: // input -> path -> src -> ref -> tile // Each step is broken down for readability val inputs: DataFrame = sqlContext.table(catalogTable.tableName) + .repartition(defaultNumPartitions) // Basically renames the input columns to have the '_path' suffix val pathsAliasing = for { @@ -112,7 +116,7 @@ case class RasterSourceRelation( val df = if (lazyTiles) { // Expand RasterSource into multiple columns per band, and multiple rows per tile - // There's some unintentional fragililty here in that the structure of the expression + // There's some unintentional fragility here in that the structure of the expression // is expected to line up with our column structure here. val refs = RasterSourceToRasterRefs(subtileDims, bandIndexes, srcs: _*) as refColNames diff --git a/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffCollectionDataSourceSpec.scala b/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffCollectionDataSourceSpec.scala deleted file mode 100644 index 9b69fd89e..000000000 --- a/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffCollectionDataSourceSpec.scala +++ /dev/null @@ -1,49 +0,0 @@ -/* - * This software is licensed under the Apache 2 license, quoted below. - * - * Copyright 2018 Astraea, Inc. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * [http://www.apache.org/licenses/LICENSE-2.0] - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - * - * SPDX-License-Identifier: Apache-2.0 - * - */ -package org.locationtech.rasterframes.datasource.geotiff - -import java.io.{File, FilenameFilter} - -import org.locationtech.rasterframes._ -import org.locationtech.rasterframes.TestEnvironment - -/** - * @since 1/14/18 - */ -class GeoTiffCollectionDataSourceSpec - extends TestEnvironment with TestData { - - describe("GeoTiff directory reading") { - it("shiould read a directory of files") { - - val df = spark.read - .format("geotiff") - .load(geotiffDir.resolve("*.tiff").toString) - val expected = geotiffDir.toFile.list(new FilenameFilter { - override def accept(dir: File, name: String): Boolean = name.endsWith("tiff") - }).length - - assert(df.select("path").distinct().count() === expected) - - // df.show(false) - } - } -} diff --git a/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala b/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala index 817d7d5bf..c57737118 100644 --- a/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala +++ b/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala @@ -192,29 +192,36 @@ class GeoTiffDataSourceSpec } it("should write GeoTIFF without layer") { - val pr = col("proj_raster_b0") - val rf = 
spark.read.raster.withBandIndexes(0, 1, 2).load(rgbCogSamplePath.toASCIIString) - val out = Paths.get("target", "example2-geotiff.tif") - logger.info(s"Writing to $out") + val sample = rgbCogSample + val expectedExtent = sample.extent + val (expCols, expRows) = sample.tile.dimensions - withClue("explicit extent/crs") { + val rf = spark.read.raster.withBandIndexes(0, 1, 2).load(rgbCogSamplePath.toASCIIString) + + withClue("extent/crs columns provided") { + val out = Paths.get("target", "example2a-geotiff.tif") noException shouldBe thrownBy { rf .withColumn("extent", rf_extent(pr)) .withColumn("crs", rf_crs(pr)) - .write.geotiff.withCRS(LatLng).save(out.toString) + .write.geotiff.withCRS(sample.crs).save(out.toString) + checkTiff(out, expCols, expRows, expectedExtent, Some(sample.cellType)) } } - withClue("without explicit extent/crs") { + withClue("without extent/crs columns") { + val out = Paths.get("target", "example2b-geotiff.tif") noException shouldBe thrownBy { rf - .write.geotiff.withCRS(LatLng).save(out.toString) + .write.geotiff.withCRS(sample.crs).save(out.toString) + checkTiff(out, expCols, expRows, expectedExtent, Some(sample.cellType)) } } + withClue("with downsampling") { + val out = Paths.get("target", "example2c-geotiff.tif") noException shouldBe thrownBy { rf .write.geotiff @@ -223,9 +230,6 @@ class GeoTiffDataSourceSpec .save(out.toString) } } - - checkTiff(out, 128, 128, - Extent(-76.52586750038186, 36.85907177863949, -76.17461216980891, 37.1303690755922)) } it("should produce the correct subregion from layer") { diff --git a/docs/src/main/paradox/RasterFramePipeline.png b/docs/src/main/paradox/RasterFramePipeline.png deleted file mode 100644 index 26900b8cf..000000000 Binary files a/docs/src/main/paradox/RasterFramePipeline.png and /dev/null differ diff --git a/docs/src/main/paradox/RasterFramePipeline.svg b/docs/src/main/paradox/RasterFramePipeline.svg deleted file mode 100644 index e9c08f831..000000000 --- 
a/docs/src/main/paradox/RasterFramePipeline.svg +++ /dev/null @@ -1,920 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Produced by OmniGraffle 7.7 - 2018-02-16 20:16:42 +0000 - - - Canvas 7 - - Layer 1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - GeoTrellis - Layers - - - - - - - - - - Map Algebra - - - - - - - Layer - Operations - - - - - - - - - - - - - - - - Statistical - Analysis - - - - - TileLayerRDD - - - - - - - - - - - - - Machine - Learning - - - - - - - - - - Visualization - - - - - - - - - - - - - - Your ApplicationeoTIFF - - - - - - - - - - RasterFrame - - - - - - - Spark - DataSource - - - - - - - - - - Spark - DataFrame - - - - - - - - join - - - - - diff --git a/pyrasterframes/src/main/python/docs/reference.pymd b/docs/src/main/paradox/reference.md similarity index 90% rename from pyrasterframes/src/main/python/docs/reference.pymd rename to docs/src/main/paradox/reference.md index 195b7e5e0..1121bbd36 100644 --- a/pyrasterframes/src/main/python/docs/reference.pymd +++ b/docs/src/main/paradox/reference.md @@ -66,6 +66,15 @@ See also GeoMesa [st_envelope](https://www.geomesa.org/documentation/user/spark/ Convert an extent to a Geometry. The extent likely comes from @ref:[`st_extent`](reference.md#st-extent) or @ref:[`rf_extent`](reference.md#rf-extent). + +### rf_spatial_index + + Long rf_spatial_index(Geometry geom, CRS crs) + Long rf_spatial_index(Extent extent, CRS crs) + Long rf_spatial_index(ProjectedRasterTile proj_raster, CRS crs) + +Constructs a XZ2 index in WGS84/EPSG:4326 from either a Geometry, Extent, ProjectedRasterTile and its CRS. This function is useful for [range partitioning](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=registerjava#pyspark.sql.DataFrame.repartitionByRange). 
+ ## Tile Metadata and Mutation Functions to access and change the particulars of a `tile`: its shape and the data type of its cells. See section on @ref:["NoData" handling](nodata-handling.md) for additional discussion of cell types. @@ -183,7 +192,7 @@ Parameters `tile_columns` and `tile_rows` are literals, not column expressions. Tile rf_array_to_tile(Array arrayCol, Int numCols, Int numRows) -Python only. Create a `tile` from a Spark SQL [Array](http://spark.apache.org/docs/2.3.2/api/python/pyspark.sql.html#pyspark.sql.types.ArrayType), filling values in row-major order. +Python only. Create a `tile` from a Spark SQL [Array][Array], filling values in row-major order. ### rf_assemble_tile @@ -198,7 +207,7 @@ SQL implementation does not accept a cell_type argument. It returns a float64 ce ## Masking and NoData -See @ref:[NoData handling](nodata-handling.md) for conceptual discussion of cell types and NoData. +See the @ref:[masking](masking.md) page for conceptual discussion of masking operations. There are statistical functions of the count of data and NoData values per `tile` and aggregate over a `tile` column: @ref:[`rf_data_cells`](reference.md#rf-data-cells), @ref:[`rf_no_data_cells`](reference.md#rf-no-data-cells), @ref:[`rf_agg_data_cells`](reference.md#rf-agg-data-cells), and @ref:[`rf_agg_no_data_cells`](reference.md#rf-agg-no-data-cells). @@ -206,14 +215,30 @@ Masking is a raster operation that sets specific cells to NoData based on the va ### rf_mask - Tile rf_mask(Tile tile, Tile mask) + Tile rf_mask(Tile tile, Tile mask, bool inverse) Where the `mask` contains NoData, replace values in the `tile` with NoData. Returned `tile` cell type will be coerced to one supporting NoData if it does not already. +`inverse` is a literal not a Column. If `inverse` is true, return the `tile` with NoData in locations where the `mask` _does not_ contain NoData. Equivalent to @ref:[`rf_inverse_mask`](reference.md#rf-inverse-mask). 
+ See also @ref:[`rf_rasterize`](reference.md#rf-rasterize). +### rf_mask_by_value + + Tile rf_mask_by_value(Tile data_tile, Tile mask_tile, Int mask_value, bool inverse) + +Generate a `tile` with the values from `data_tile`, with NoData in cells where the `mask_tile` is equal to `mask_value`. + +`inverse` is a literal not a Column. If `inverse` is true, return the `data_tile` with NoData in locations where the `mask_tile` value is _not equal_ to `mask_value`. Equivalent to @ref:[`rf_inverse_mask_by_value`](reference.md#rf-inverse-mask-by-value). + +### rf_mask_by_values + + Tile rf_mask_by_values(Tile data_tile, Tile mask_tile, Array mask_values) + Tile rf_mask_by_values(Tile data_tile, Tile mask_tile, seq mask_values) + +Generate a `tile` with the values from `data_tile`, with NoData in cells where the `mask_tile` is in the `mask_values` Array or list. `mask_values` can be a [`pyspark.sql.ArrayType`][Array] or a `list`. ### rf_inverse_mask @@ -221,12 +246,12 @@ See also @ref:[`rf_rasterize`](reference.md#rf-rasterize). Where the `mask` _does not_ contain NoData, replace values in `tile` with NoData. -### rf_mask_by_value - Tile rf_mask_by_value(Tile data_tile, Tile mask_tile, Int mask_value) +### rf_inverse_mask_by_value -Generate a `tile` with the values from `data_tile`, with NoData in cells where the `mask_tile` is equal to `mask_value`. + Tile rf_inverse_mask_by_value(Tile data_tile, Tile mask_tile, Int mask_value) +Generate a `tile` with the values from `data_tile`, with NoData in cells where the `mask_tile` is not equal to `mask_value`. In other words, only keep `data_tile` cells in locations where the `mask_tile` is equal to `mask_value`. ### rf_is_no_data_tile @@ -374,6 +399,13 @@ Returns a `tile` column containing the element-wise equality of `tile1` and `rhs Returns a `tile` column containing the element-wise inequality of `tile1` and `rhs`. 
+### rf_local_is_in + + Tile rf_local_is_in(Tile tile, Array array) + Tile rf_local_is_in(Tile tile, list l) + +Returns a `tile` column with cell values of 1 where the `tile` cell value is in the provided array or list. The `array` is a Spark SQL [Array][Array]. A Python `list` of numeric values can also be passed. + ### rf_round Tile rf_round(Tile tile) @@ -621,13 +653,13 @@ Python only. As with @ref:[`rf_explode_tiles`](reference.md#rf-explode-tiles), b Array rf_tile_to_array_int(Tile tile) -Convert Tile column to Spark SQL [Array](http://spark.apache.org/docs/2.3.2/api/python/pyspark.sql.html#pyspark.sql.types.ArrayType), in row-major order. Float cell types will be coerced to integral type by flooring. +Convert Tile column to Spark SQL [Array][Array], in row-major order. Float cell types will be coerced to integral type by flooring. ### rf_tile_to_array_double Array rf_tile_to_arry_double(Tile tile) -Convert tile column to Spark [Array](http://spark.apache.org/docs/2.3.2/api/python/pyspark.sql.html#pyspark.sql.types.ArrayType), in row-major order. Integral cell types will be coerced to floats. +Convert tile column to Spark [Array][Array], in row-major order. Integral cell types will be coerced to floats. ### rf_render_ascii @@ -657,3 +689,4 @@ Runs [`rf_rgb_composite`](reference.md#rf-rgb-composite) on the given tile colum [RasterFunctions]: org.locationtech.rasterframes.RasterFunctions [scaladoc]: latest/api/index.html +[Array]: http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.types.ArrayType diff --git a/docs/src/main/paradox/release-notes.md b/docs/src/main/paradox/release-notes.md index 0d7ed4c9c..c181d55da 100644 --- a/docs/src/main/paradox/release-notes.md +++ b/docs/src/main/paradox/release-notes.md @@ -2,11 +2,22 @@ ## 0.8.x +### 0.8.4 + +* Upgraded to Spark 2.4.4 +* Added `rf_mask_by_values` and `rf_local_is_in` raster functions; added optional `inverse` argument to `rf_mask` functions. 
([#403](https://github.com/locationtech/rasterframes/pull/403), [#384](https://github.com/locationtech/rasterframes/issues/384)) +* Added forced truncation of WKT types in Markdown/HTML rendering. ([#408](https://github.com/locationtech/rasterframes/pull/408)) +* Add `rf_local_is_in` raster function. ([#400](https://github.com/locationtech/rasterframes/pull/400)) +* Added partitioning to catalogs before processing in RasterSourceDataSource ([#397](https://github.com/locationtech/rasterframes/pull/397)) +* Fixed bug where `rf_tile_dimensions` would cause unnecessary reading of tiles. ([#394](https://github.com/locationtech/rasterframes/pull/394)) +* _Breaking_ (potentially): removed `GeoTiffCollectionRelation` due to usage limitation and overlap with `RasterSourceDataSource` functionality. + ### 0.8.3 * Updated to GeoTrellis 2.3.3 and Proj4j 1.1.0. * Fixed issues with `LazyLogger` and shading assemblies ([#293](https://github.com/locationtech/rasterframes/issues/293)) * Updated `rf_crs` to accept string columns containing CRS specifications. ([#366](https://github.com/locationtech/rasterframes/issues/366)) +* Added `rf_spatial_index` function. 
([#368](https://github.com/locationtech/rasterframes/issues/368)) * _Breaking_ (potentially): removed `pyrasterframes.create_spark_session` in lieu of `pyrasterframes.utils.create_rf_spark_session` ### 0.8.2 diff --git a/experimental/src/it/resources/log4j.properties b/experimental/src/it/resources/log4j.properties index 5cc16f4db..4a81f524a 100644 --- a/experimental/src/it/resources/log4j.properties +++ b/experimental/src/it/resources/log4j.properties @@ -35,7 +35,7 @@ log4j.logger.org.spark_project.jetty=WARN log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO -log4j.logger.org.locationtech.rasterframes=DEBUG +log4j.logger.org.locationtech.rasterframes=INFO log4j.logger.org.apache.parquet.hadoop.ParquetRecordReader=OFF # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support diff --git a/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogRelationTest.scala b/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogRelationTest.scala index 3f777d322..8499bbe44 100644 --- a/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogRelationTest.scala +++ b/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogRelationTest.scala @@ -21,6 +21,7 @@ package org.locationtech.rasterframes.experimental.datasource.awspds import java.sql.Timestamp +import geotrellis.proj4.LatLng import org.apache.spark.sql.functions._ import org.locationtech.rasterframes._ import org.locationtech.rasterframes.datasource.raster._ @@ -75,5 +76,32 @@ class MODISCatalogRelationTest extends TestEnvironment { stats.data_cells shouldBe > (128L) stats.mean shouldBe > (1000.0) } + it("should compute aggregate 
statistics") { + // This is copied from the docs. + import spark.implicits._ + + val modis = spark.read.format("aws-pds-modis-catalog").load() + + val red_nir_monthly_2017 = modis + .select($"granule_id", month($"acquisition_date") as "month", $"B01" as "red", $"B02" as "nir") + .where(year($"acquisition_date") === 2017 && (dayofmonth($"acquisition_date") === 15) && $"granule_id" === "h21v09") + + val red_nir_tiles_monthly_2017 = spark.read.raster + .fromCatalog(red_nir_monthly_2017, "red", "nir") + .load() + .cache() + + val result = red_nir_tiles_monthly_2017 + .where(st_intersects( + st_reproject(rf_geometry($"red"), rf_crs($"red"), LatLng), + st_makePoint(34.870605, -4.729727) + )) + .groupBy("month") + .agg(rf_agg_stats(rf_normalized_difference($"nir", $"red")) as "ndvi_stats") + .orderBy("month") + .select("month", "ndvi_stats.*") + + result.show() + } } } diff --git a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/CachedDatasetRelation.scala b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/CachedDatasetRelation.scala index 1fac7699a..06947080d 100644 --- a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/CachedDatasetRelation.scala +++ b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/CachedDatasetRelation.scala @@ -33,6 +33,8 @@ import org.locationtech.rasterframes.util._ * @since 8/24/18 */ trait CachedDatasetRelation extends ResourceCacheSupport { self: BaseRelation ⇒ + protected def defaultNumPartitions: Int = + sqlContext.sparkSession.sessionState.conf.numShufflePartitions protected def cacheFile: HadoopPath protected def constructDataset: Dataset[Row] diff --git a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelation.scala b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelation.scala index 9a14c86f3..049617de6 
100644 --- a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelation.scala +++ b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelation.scala @@ -68,7 +68,9 @@ case class L8CatalogRelation(sqlContext: SQLContext, sceneListPath: HadoopPath) .select(schema.map(f ⇒ col(f.name)): _*) .orderBy(ACQUISITION_DATE.name, PATH.name, ROW.name) .distinct() // The scene file contains duplicates. - .repartition(8, col(PATH.name), col(ROW.name)) + .repartition(defaultNumPartitions, col(PATH.name), col(ROW.name)) + + } } diff --git a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogRelation.scala b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogRelation.scala index 30b3ba234..6e76acc36 100644 --- a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogRelation.scala +++ b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogRelation.scala @@ -64,7 +64,7 @@ case class MODISCatalogRelation(sqlContext: SQLContext, sceneList: HadoopPath) $"${GID.name}") ++ bandCols: _* ) .orderBy(ACQUISITION_DATE.name, GID.name) - .repartition(8, col(GRANULE_ID.name)) + .repartition(defaultNumPartitions, col(GRANULE_ID.name)) } } diff --git a/project/PythonBuildPlugin.scala b/project/PythonBuildPlugin.scala index 7cc33fda3..37404ddae 100644 --- a/project/PythonBuildPlugin.scala +++ b/project/PythonBuildPlugin.scala @@ -133,7 +133,7 @@ object PythonBuildPlugin extends AutoPlugin { val dest = (Compile / packageBin / artifactPath).value.getParentFile val art = (Python / packageBin / artifact).value val ver = version.value - dest / s"${art.name}-$ver-py2.py3-none-any.whl" + dest / s"${art.name}-$ver-py3-none-any.whl" }, testQuick := pySetup.toTask(" test").value, executeTests := Def.task { diff --git 
a/project/RFAssemblyPlugin.scala b/project/RFAssemblyPlugin.scala index 60a7badfe..cbde26437 100644 --- a/project/RFAssemblyPlugin.scala +++ b/project/RFAssemblyPlugin.scala @@ -55,7 +55,8 @@ object RFAssemblyPlugin extends AutoPlugin { "org.apache.avro", "org.apache.http", "com.google.guava", - "com.typesafe.scalalogging" + "com.typesafe.scalalogging", + "com.typesafe.config" ) shadePrefixes.map(p ⇒ ShadeRule.rename(s"$p.**" -> s"rf.shaded.$p.@1").inAll) }, diff --git a/project/RFDependenciesPlugin.scala b/project/RFDependenciesPlugin.scala index 5c161eadb..20cca567f 100644 --- a/project/RFDependenciesPlugin.scala +++ b/project/RFDependenciesPlugin.scala @@ -59,7 +59,7 @@ object RFDependenciesPlugin extends AutoPlugin { ), // NB: Make sure to update the Spark version in pyrasterframes/python/setup.py - rfSparkVersion := "2.3.4", + rfSparkVersion := "2.4.4", rfGeoTrellisVersion := "2.3.3", rfGeoMesaVersion := "2.2.1", ) diff --git a/project/plugins.sbt b/project/plugins.sbt index 73943d4cb..f51b70fec 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -8,7 +8,7 @@ addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.2") addSbtPlugin("com.typesafe.sbt" % "sbt-site" % "1.3.2") addSbtPlugin("com.lightbend.paradox" % "sbt-paradox" % "0.5.5") addSbtPlugin("io.github.jonas" % "sbt-paradox-material-theme" % "0.6.0") -addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.3.3") +addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.3.6") addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.1") addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1") addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.1") @@ -16,5 +16,6 @@ addSbtPlugin("net.vonbuchholtz" % "sbt-dependency-check" % "0.2.10") addSbtPlugin("com.github.gseitz" %% "sbt-release" % "1.0.9") addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.3.19") addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0") +addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "1.0.0") diff --git 
a/pyrasterframes/src/main/python/README.md b/pyrasterframes/src/main/python/README.md index 22a6f30c7..00a915387 100644 --- a/pyrasterframes/src/main/python/README.md +++ b/pyrasterframes/src/main/python/README.md @@ -13,8 +13,8 @@ pip install pyrasterframes You can then access a [`pyspark SparkSession`](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.SparkSession) using the [`local[*]` master](https://spark.apache.org/docs/latest/submitting-applications.html#master-urls) in your python interpreter as follows. ```python -import pyrasterframes -spark = pyrasterframes.get_spark_session() +from pyrasterframes.utils import create_rf_spark_session +spark = create_rf_spark_session() ``` Then you can read a raster and do some work with it. diff --git a/pyrasterframes/src/main/python/docs/index.md b/pyrasterframes/src/main/python/docs/index.md index e3a37274b..f3be57721 100644 --- a/pyrasterframes/src/main/python/docs/index.md +++ b/pyrasterframes/src/main/python/docs/index.md @@ -2,15 +2,21 @@ RasterFrames® brings together Earth-observation (EO) data access, cloud computing, and DataFrame-based data science. The recent explosion of EO data from public and private satellite operators presents both a huge opportunity and a huge challenge to the data analysis community. It is _Big Data_ in the truest sense, and its footprint is rapidly getting bigger. -RasterFrames provides a DataFrame-centric view over arbitrary raster data, enabling spatiotemporal queries, map algebra raster operations, and compatibility with the ecosystem of Spark ML algorithms. By using DataFrames as the core cognitive and compute data model, it is able to deliver these features in a form that is both accessible to general analysts and scalable along with the rapidly growing data footprint. +RasterFrames provides a DataFrame-centric view over arbitrary geospatial raster data, enabling spatiotemporal queries, map algebra raster operations, and interoperability with Spark ML. 
By using the DataFrame as the core cognitive and compute data model, RasterFrames is able to deliver an extensive set of functionality in a form that is both horizontally scalable and familiar to general analysts and data scientists. It provides APIs for Python, SQL, and Scala. -To learn more, please see the @ref:[Getting Started](getting-started.md) section of this manual. +![RasterFrames](static/rasterframes-pipeline-nologo.png) -The source code can be found on GitHub at [locationtech/rasterframes](https://github.com/locationtech/rasterframes). +Through its custom [Spark DataSource](https://rasterframes.io/raster-read.html), RasterFrames can read various raster formats -- including GeoTIFF, JPEG 2000, MRF, and HDF -- and from an [array of services](https://rasterframes.io/raster-read.html#uri-formats), such as HTTP, FTP, HDFS, S3 and WASB. It also supports reading the vector formats GeoJSON and WKT/WKB. RasterFrame contents can be filtered, transformed, summarized, resampled, and rasterized through [200+ raster and vector functions](https://rasterframes.io/reference.html). + +As part of the LocationTech family of projects, RasterFrames builds upon the strong foundations provided by GeoMesa (spatial operations), GeoTrellis (raster operations), JTS (geometry modeling) and SFCurve (spatiotemporal indexing), integrating various aspects of these projects into a unified, DataFrame-centric analytics package. + +![](static/rasterframes-locationtech-stack.png) -RasterFrames is released under the [Apache 2.0 License](https://github.com/locationtech/rasterframes/blob/develop/LICENSE). +RasterFrames is released under the commercial-friendly [Apache 2.0](https://github.com/locationtech/rasterframes/blob/develop/LICENSE) open source license. -![RasterFrames](static/rasterframes-pipeline.png) +To learn more, please see the @ref:[Getting Started](getting-started.md) section of this manual. 
+ +The source code can be found on GitHub at [locationtech/rasterframes](https://github.com/locationtech/rasterframes).
diff --git a/pyrasterframes/src/main/python/docs/masking.pymd b/pyrasterframes/src/main/python/docs/masking.pymd new file mode 100644 index 000000000..f3ab08826 --- /dev/null +++ b/pyrasterframes/src/main/python/docs/masking.pymd @@ -0,0 +1,135 @@ +# Masking + +```python setup, echo=False +import pyrasterframes +from pyrasterframes.utils import create_rf_spark_session +from pyrasterframes.rasterfunctions import * +import pyrasterframes.rf_ipython +from IPython.display import display +import pandas as pd +import numpy as np +from pyrasterframes.rf_types import Tile + +spark = create_rf_spark_session() +``` + +Masking is a common operation in raster processing. It is setting certain cells to the @ref:[NoData value](nodata-handling.md). This is usually done to remove low-quality observations from the raster processing. Another related use case is to @ref:["clip"](masking.md#clipping) a raster to a given polygon. + +## Masking Example + +Let's demonstrate masking with a pair of bands of Sentinel-2 data. The measurement bands we will use, blue and green, have no defined NoData. They share quality information from a separate file called the scene classification (SCL), which delineates areas of missing data and probable clouds. For more information on this, see the [Sentinel-2 algorithm overview](https://earth.esa.int/web/sentinel/technical-guides/sentinel-2-msi/level-2a/algorithm). Figure 3 tells us how to interpret the scene classification. For this example, we will exclude NoData, defective pixels, probable clouds, and cirrus clouds: values 0, 1, 8, 9, and 10. + +![Sentinel-2 Scene Classification Values](static/sentinel-2-scene-classification-labels.png) + +Credit: [Sentinel-2 algorithm overview](https://earth.esa.int/web/sentinel/technical-guides/sentinel-2-msi/level-2a/algorithm) + +The first step is to create a catalog with our band of interest and the SCL band. We read the data from the catalog, so all _tiles_ are aligned across rows. 
+ +```python, blue_scl_cat +from pyspark.sql import Row + +blue_uri = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B02.tif' +green_uri = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B03.tif' +scl_uri = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/SCL.tif' +cat = spark.createDataFrame([Row(blue=blue_uri, green=green_uri, scl=scl_uri),]) +unmasked = spark.read.raster(cat, catalog_col_names=['blue', 'green', 'scl']) +unmasked.printSchema() +``` + +```python, show_cell_types +unmasked.select(rf_cell_type('blue'), rf_cell_type('scl')).distinct() +``` + +## Define CellType for Masked Tile + +Because no NoData value is already defined for the blue band, we must choose one. In this particular example, the minimum value is greater than zero, so we can use 0 as the NoData value. We will construct a new `CellType` object to represent this. + +```python, pick_nd +blue_min = unmasked.agg(rf_agg_stats('blue').min.alias('blue_min')) +print('Nonzero minimum value in the blue band:', blue_min.first()) + +blue_ct = unmasked.select(rf_cell_type('blue')).distinct().first()[0][0] +masked_blue_ct = CellType(blue_ct).with_no_data_value(0) +masked_blue_ct.cell_type_name +``` + +We next convert the blue band to this cell type. + +```python, convert_blue +converted = unmasked.select('scl', 'green', rf_convert_cell_type('blue', masked_blue_ct).alias('blue')) +``` + +## Apply Mask from Quality Band + +Now we set cells of our `blue` column to NoData for all locations where the `scl` tile is in our set of undesirable values. This is the actual _masking_ operation. + +```python, apply_mask_blue +from pyspark.sql.functions import lit + +masked = converted.withColumn('blue_masked', rf_mask_by_values('blue', 'scl', [0, 1, 8, 9, 10])) +masked +``` + +We can verify that the number of NoData cells in the resulting `blue_masked` column matches the number of cells in the `scl` _tile_ whose value is in our set of undesirable values, to ensure our logic is correct. 
+ +```python, show_masked_counts +masked.select(rf_no_data_cells('blue_masked'), rf_tile_sum(rf_local_is_in('scl', [0, 1, 8, 9, 10]))) +``` + +It's also nice to view a sample. The white regions are areas of NoData. + +```python, display_blu, caption='Blue band masked against selected SCL values' +sample = masked.orderBy(-rf_no_data_cells('blue_masked')).select(rf_tile('blue_masked'), rf_tile('scl')).first() +display(sample[0]) +``` + +And the original SCL data. The bright yellow is a cloudy region in the original image. + +```python, display_scl, caption='SCL tile for above' +display(sample[1]) +``` + +## Transferring Mask + +We can now apply the same mask from the blue column to the green column. Note here we have skipped the step of explicitly checking what a "safe" NoData value for the green band should be. + +```python, mask_green +masked.withColumn('green_masked', rf_mask(rf_convert_cell_type('green', masked_blue_ct), 'blue_masked')) \ + .orderBy(-rf_no_data_cells('blue_masked')) +``` + +## Clipping + +Clipping is the use of a polygon to determine the areas to mask in a raster. Typically the areas inside a polygon are retained and the cells outside are set to NoData. Given a geometry column on our DataFrame, we have to carry out three basic steps. First we have to ensure the vector geometry is correctly projected to the same @ref:[CRS](concepts.md#coordinate-reference-system-crs) as the raster. We'll continue with our example creating a simple polygon. Buffering a point will create an approximate circle. + + +```python, reproject_geom +to_rasterize = masked.withColumn('geom_4326', + st_bufferPoint( + st_point(lit(-78.0783132), lit(38.3184340)), + lit(15000))) \ + .withColumn('geom_native', st_reproject('geom_4326', rf_mk_crs('epsg:4326'), rf_crs('blue_masked'))) +``` + +Second, we will rasterize the geometry, or burn-in the geometry into the same grid as the raster. 
+ +```python, rasterize +to_clip = to_rasterize.withColumn('clip_raster', + rf_rasterize('geom_native', rf_geometry('blue_masked'), lit(1), rf_dimensions('blue_masked').cols, rf_dimensions('blue_masked').rows)) + +# visualize some of the edges of our circle +to_clip.select('clip_raster', 'blue_masked') \ + .filter(rf_data_cells('clip_raster') > 20) \ + .orderBy(rf_data_cells('clip_raster')) +``` + +Finally, we create a new _tile_ column with the blue band clipped to our circle. Again we will use the `rf_mask` function to pass the NoData regions along from the rasterized geometry. + +clipped = to_clip.select('blue_masked', + 'clip_raster', + rf_mask('blue_masked', 'clip_raster').alias('blue_clipped')) \ + .filter(rf_data_cells('clip_raster') > 20) \ + .orderBy(rf_data_cells('clip_raster')) + + +This kind of clipping technique is further used in @ref:[zonal statistics](zonal-algebra.md). \ No newline at end of file diff --git a/pyrasterframes/src/main/python/docs/nodata-handling.pymd b/pyrasterframes/src/main/python/docs/nodata-handling.pymd index c9fffe390..d9beea951 100644 --- a/pyrasterframes/src/main/python/docs/nodata-handling.pymd +++ b/pyrasterframes/src/main/python/docs/nodata-handling.pymd @@ -2,7 +2,7 @@ ## What is NoData? -In raster operations, the preservation and correct processing of missing observations is very important. In [most DataFrames and in scientific computing](https://www.oreilly.com/learning/handling-missing-data), the idea of missing data is expressed as a `null` or `NaN` value. However, a great deal of raster data is stored for space efficiency, which typically leads to use of integral values with a ["sentinel" value](https://en.wikipedia.org/wiki/Sentinel_value) designated to represent missing observations. This sentinel value varies across data products and is usually called the "NoData" value. +In raster operations, the preservation and correct processing of missing observations is very important. 
In [most DataFrames and in scientific computing](https://www.oreilly.com/learning/handling-missing-data), the idea of missing data is expressed as a `null` or `NaN` value. However, a great deal of raster data is stored for space efficiency, which typically leads to use of integral values with a ["sentinel" value](https://en.wikipedia.org/wiki/Sentinel_value) designated to represent missing observations. This sentinel value varies across data products and bands. In a generic sense, it is usually called the "NoData" value. RasterFrames provides a variety of functions to inspect and manage NoData within _tiles_. @@ -75,95 +75,6 @@ print(CellType.float32().no_data_value()) print(CellType.float32().with_no_data_value(-99.9).no_data_value()) ``` -## Masking - -Let's continue the example above with Sentinel-2 data. Band 2 is blue and has no defined NoData. The quality information is in a separate file called the scene classification (SCL), which delineates areas of missing data and probable clouds. For more information on this, see the [Sentinel-2 algorithm overview](https://earth.esa.int/web/sentinel/technical-guides/sentinel-2-msi/level-2a/algorithm). Figure 3 tells us how to interpret the scene classification. For this example, we will exclude NoData, defective pixels, probable clouds, and cirrus clouds: values 0, 1, 8, 9, and 10. - -![Sentinel-2 Scene Classification Values](static/sentinel-2-scene-classification-labels.png) - -Credit: [Sentinel-2 algorithm overview](https://earth.esa.int/web/sentinel/technical-guides/sentinel-2-msi/level-2a/algorithm) - -The first step is to create a catalog with our band of interest and the SCL band. We read the data from the catalog, so the blue band and SCL _tiles_ are aligned across rows. 
- -```python, blue_scl_cat -from pyspark.sql import Row - -blue_uri = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B02.tif' -scl_uri = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/SCL.tif' -cat = spark.createDataFrame([Row(blue=blue_uri, scl=scl_uri),]) -unmasked = spark.read.raster(cat, catalog_col_names=['blue', 'scl']) -unmasked.printSchema() -``` - -```python, show_cell_types -cell_types = unmasked.select(rf_cell_type('blue'), rf_cell_type('scl')).distinct() -cell_types -``` - -Drawing on @ref:[local map algebra](local-algebra.md) techniques, we will create new _tile_ columns that are indicators of unwanted pixels, as defined above. Since the mask column is an integer type, the addition is equivalent to a logical or, so the boolean true values are 1. - -```python, def_mask -from pyspark.sql.functions import lit - -mask_part = unmasked.withColumn('nodata', rf_local_equal('scl', lit(0))) \ - .withColumn('defect', rf_local_equal('scl', lit(1))) \ - .withColumn('cloud8', rf_local_equal('scl', lit(8))) \ - .withColumn('cloud9', rf_local_equal('scl', lit(9))) \ - .withColumn('cirrus', rf_local_equal('scl', lit(10))) - -one_mask = mask_part.withColumn('mask', rf_local_add('nodata', 'defect')) \ - .withColumn('mask', rf_local_add('mask', 'cloud8')) \ - .withColumn('mask', rf_local_add('mask', 'cloud9')) \ - .withColumn('mask', rf_local_add('mask', 'cirrus')) - -cell_types = one_mask.select(rf_cell_type('mask')).distinct() -cell_types -``` - -Because there is not a NoData already defined, we will choose one. In this particular example, the minimum value is greater than zero, so we can use 0 as the NoData value. - -```python, pick_nd -blue_min = one_mask.agg(rf_agg_stats('blue').min.alias('blue_min')) -blue_min -``` - -We can now construct the cell type string for our blue band's cell type, designating 0 as NoData. 
- -```python, get_ct_string -blue_ct = one_mask.select(rf_cell_type('blue')).distinct().first()[0][0] -masked_blue_ct = CellType(blue_ct).with_no_data_value(0) -masked_blue_ct.cell_type_name -``` - -Now we will use the @ref:[`rf_mask_by_value`](reference.md#rf-mask-by-value) to designate the cloudy and other unwanted pixels as NoData in the blue column by converting the cell type and applying the mask. - -```python, mask_blu -with_nd = rf_convert_cell_type('blue', masked_blue_ct) -masked = one_mask.withColumn('blue_masked', - rf_mask_by_value(with_nd, 'mask', lit(1))) \ - .drop('nodata', 'defect', 'cloud8', 'cloud9', 'cirrus', 'blue') -``` - -We can verify that the number of NoData cells in the resulting `blue_masked` column matches the total of the boolean `mask` _tile_ to ensure our logic is correct. - -```python, show_masked -counts = masked.select(rf_no_data_cells('blue_masked'), rf_tile_sum('mask')) -counts -``` - -It's also nice to view a sample. The white regions are areas of NoData. - -```python, display_blu, caption='Blue band masked against selected SCL values' -sample = masked.orderBy(-rf_no_data_cells('blue_masked')).select(rf_tile('blue_masked'), rf_tile('scl')).first() -display(sample[0]) -``` - -And the original SCL data. The bright yellow is a cloudy region in the original image. - -```python, display_scl, caption='SCL tile for above' -display(sample[1]) -``` - ## NoData and Local Arithmetic Let's now explore how the presence of NoData affects @ref:[local map algebra](local-algebra.md) operations. To demonstrate the behavior, lets create two _tiles_. One _tile_ will have values of 0 and 1, and the other will have values of just 0. 
diff --git a/pyrasterframes/src/main/python/docs/raster-processing.md b/pyrasterframes/src/main/python/docs/raster-processing.md index fc6353e37..e112b2287 100644 --- a/pyrasterframes/src/main/python/docs/raster-processing.md +++ b/pyrasterframes/src/main/python/docs/raster-processing.md @@ -4,6 +4,7 @@ * @ref:[Local Map Algebra](local-algebra.md) * @ref:["NoData" Handling](nodata-handling.md) +* @ref:[Masking](masking.md) * @ref:[Zonal Map Algebra](zonal-algebra.md) * @ref:[Aggregation](aggregation.md) * @ref:[Time Series](time-series.md) diff --git a/pyrasterframes/src/main/python/docs/raster-read.pymd b/pyrasterframes/src/main/python/docs/raster-read.pymd index 53f3a96e6..443ee0d96 100644 --- a/pyrasterframes/src/main/python/docs/raster-read.pymd +++ b/pyrasterframes/src/main/python/docs/raster-read.pymd @@ -14,7 +14,7 @@ RasterFrames registers a DataSource named `raster` that enables reading of GeoTI RasterFrames can also read from @ref:[GeoTrellis catalogs and layers](raster-read.md#geotrellis). -## Single Raster +## Single Rasters The simplest way to use the `raster` reader is with a single raster from a single URI or file. In the examples that follow we'll be reading from a Sentinel-2 scene stored in an AWS S3 bucket. @@ -33,14 +33,12 @@ print("CRS", crs.value.crsProj4) ``` ```python, raster_parts -parts = rf.select( +rf.select( rf_extent("proj_raster").alias("extent"), rf_tile("proj_raster").alias("tile") ) -parts ``` - You can also see that the single raster has been broken out into many arbitrary non-overlapping regions. Doing so takes advantage of parallel in-memory reads from the cloud hosted data source and allows Spark to work on manageable amounts of data per task. The following code fragment shows us how many subtiles were created from a single source image. 
```python, count_by_uri @@ -55,6 +53,69 @@ tile = rf.select(rf_tile("proj_raster")).first()[0] display(tile) ``` +## Multiple Singleband Rasters + +In this example, we show how to read @ref:[two bands](concepts.md#band) of [Landsat 8](https://landsat.gsfc.nasa.gov/landsat-8/) imagery (red and near-infrared), combining them with `rf_normalized_difference` to compute [NDVI](https://en.wikipedia.org/wiki/Normalized_difference_vegetation_index), a common measure of vegetation health. As described in the section on @ref:[catalogs](raster-catalogs.md), image URIs in a single row are assumed to be from the same scene/granule, and therefore compatible. This pattern is commonly used when multiple bands are stored in separate files. + +```python, multi_singleband +bands = [f'B{b}' for b in [4, 5]] +uris = [f'https://landsat-pds.s3.us-west-2.amazonaws.com/c1/L8/014/032/LC08_L1TP_014032_20190720_20190731_01_T1/LC08_L1TP_014032_20190720_20190731_01_T1_{b}.TIF' for b in bands] +catalog = ','.join(bands) + '\n' + ','.join(uris) + +rf = (spark.read.raster(catalog, bands) + # Adding semantic names + .withColumnRenamed('B4', 'red').withColumnRenamed('B5', 'NIR') + # Adding tile center point for reference + .withColumn('longitude_latitude', st_reproject(st_centroid(rf_geometry('red')), rf_crs('red'), lit('EPSG:4326'))) + # Compute NDVI + .withColumn('NDVI', rf_normalized_difference('NIR', 'red')) + # For the purposes of inspection, filter out rows where there's not much vegetation + .where(rf_tile_sum('NDVI') > 10000) + # Order output + .select('longitude_latitude', 'red', 'NIR', 'NDVI')) +display(rf) +``` + +## Multiband Rasters + +A multiband raster is represented by a three dimensional numeric array stored in a single file. The first two dimensions are spatial, and the third dimension is typically designated for different spectral @ref:[bands](concepts.md#band). 
The bands could represent intensity of different wavelengths of light (or other electromagnetic radiation), or they could measure other phenomena such as time, quality indications, or additional gas concentrations, etc. + +Multiband raster files have a strictly ordered set of bands, which are typically indexed from 1. Some files have metadata tags associated with each band. Some files have a color interpretation metadata tag indicating how to interpret the bands. + +When reading a multiband raster or a @ref:[_catalog_](#raster-catalogs) describing multiband rasters, you will need to know ahead of time which bands you want to read. You will specify the bands to read, **indexed from zero**, as a list of integers into the `band_indexes` parameter of the `raster` reader. + +For example, we can read a four-band (red, green, blue, and near-infrared) image as follows. The individual rows of the resulting DataFrame still represent distinct spatial extents, with a projected raster column for each band specified by `band_indexes`. + +```python, multiband +mb = spark.read.raster( + 's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif', + band_indexes=[0, 1, 2, 3], +) +display(mb) +``` + +If a band is passed into `band_indexes` that exceeds the number of bands in the raster, a projected raster column will still be generated in the schema but the column will be full of `null` values. + +You can also pass a _catalog_ and `band_indexes` together into the `raster` reader. This will create a projected raster column for the combination of all items in `catalog_col_names` and `band_indexes`. Again if a band in `band_indexes` exceeds the number of bands in a raster, it will have a `null` value for the corresponding column. + +Here is a trivial example with a _catalog_ over multiband rasters. We specify two columns containing URIs and two bands, resulting in four projected raster columns. 
+ +```python, multiband_catalog +import pandas as pd +mb_cat = pd.DataFrame([ + {'foo': 's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif', + 'bar': 's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif' + }, +]) +mb2 = spark.read.raster( + spark.createDataFrame(mb_cat), + catalog_col_names=['foo', 'bar'], + band_indexes=[0, 1], + tile_dimensions=(64,64) +) +mb2.printSchema() +``` + ## URI Formats RasterFrames relies on three different I/O drivers, selected based on a combination of scheme, file extentions, and library availability. GDAL is used by default if a compatible version of GDAL (>= 2.4) is installed, and if GDAL supports the specified scheme. If GDAL is not available, either the _Java I/O_ or _Hadoop_ driver will be selected, depending on scheme. @@ -154,45 +215,6 @@ non_lazy In the initial examples on this page, you may have noticed that the realized (non-lazy) _tiles_ are shown, but we did not change `lazy_tiles`. Instead, we used @ref:[`rf_tile`](reference.md#rf-tile) to explicitly request the realized _tile_ from the lazy representation. -## Multiband Rasters - -A multiband raster represents a three dimensional numeric array. The first two dimensions are spatial, and the third dimension is typically designated for different spectral @ref:[bands](concepts.md#band). The bands could represent intensity of different wavelengths of light (or other electromagnetic radiation), or they could measure other phenomena such as time, quality indications, or additional gas concentrations, etc. - -Multiband rasters files have a strictly ordered set of bands, which are typically indexed from 1. Some files have metadata tags associated with each band. Some files have a color interpetation metadata tag indicating how to interpret the bands. - -When reading a multiband raster or a _catalog_ describing multiband rasters, you will need to know ahead of time which bands you want to read. 
You will specify the bands to read, **indexed from zero**, as a list of integers into the `band_indexes` parameter of the `raster` reader. - -For example, we can read a four-band (red, green, blue, and near-infrared) image as follows. The individual rows of the resulting DataFrame still represent distinct spatial extents, with a projected raster column for each band specified by `band_indexes`. - -```python, multiband -mb = spark.read.raster( - 's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif', - band_indexes=[0, 1, 2, 3], -) -mb.printSchema() -``` - -If a band is passed into `band_indexes` that exceeds the number of bands in the raster, a projected raster column will still be generated in the schema but the column will be full of `null` values. - -You can also pass a _catalog_ and `band_indexes` together into the `raster` reader. This will create a projected raster column for the combination of all items in `catalog_col_names` and `band_indexes`. Again if a band in `band_indexes` exceeds the number of bands in a raster, it will have a `null` value for the corresponding column. - -Here is a trivial example with a _catalog_ over multiband rasters. We specify two columns containing URIs and two bands, resulting in four projected raster columns. 
- -```python, multiband_catalog -import pandas as pd -mb_cat = pd.DataFrame([ - {'foo': 's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif', - 'bar': 's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif' - }, -]) -mb2 = spark.read.raster( - spark.createDataFrame(mb_cat), - catalog_col_names=['foo', 'bar'], - band_indexes=[0, 1], - tile_dimensions=(64,64) -) -mb2.printSchema() -``` ## GeoTrellis diff --git a/pyrasterframes/src/main/python/docs/static/rasterframes-pipeline-nologo.png b/pyrasterframes/src/main/python/docs/static/rasterframes-pipeline-nologo.png new file mode 100644 index 000000000..caa011d70 Binary files /dev/null and b/pyrasterframes/src/main/python/docs/static/rasterframes-pipeline-nologo.png differ diff --git a/pyrasterframes/src/main/python/docs/supervised-learning.pymd b/pyrasterframes/src/main/python/docs/supervised-learning.pymd index c66697032..81a81f634 100644 --- a/pyrasterframes/src/main/python/docs/supervised-learning.pymd +++ b/pyrasterframes/src/main/python/docs/supervised-learning.pymd @@ -32,7 +32,8 @@ catalog_df = pd.DataFrame([ {b: uri_base.format(b) for b in cols} ]) -df = spark.read.raster(catalog_df, catalog_col_names=cols, tile_dimensions=(128, 128)) \ +tile_size = 256 +df = spark.read.raster(catalog_df, catalog_col_names=cols, tile_dimensions=(tile_size, tile_size)) \ .repartition(100) df = df.select( @@ -91,23 +92,12 @@ To filter only for good quality pixels, we follow roughly the same procedure as ```python, make_mask from pyspark.sql.functions import lit -mask_part = df_labeled \ - .withColumn('nodata', rf_local_equal('scl', lit(0))) \ - .withColumn('defect', rf_local_equal('scl', lit(1))) \ - .withColumn('cloud8', rf_local_equal('scl', lit(8))) \ - .withColumn('cloud9', rf_local_equal('scl', lit(9))) \ - .withColumn('cirrus', rf_local_equal('scl', lit(10))) - -df_mask_inv = mask_part \ - .withColumn('mask', rf_local_add('nodata', 'defect')) \ - .withColumn('mask', rf_local_add('mask', 'cloud8')) \ - 
.withColumn('mask', rf_local_add('mask', 'cloud9')) \ - .withColumn('mask', rf_local_add('mask', 'cirrus')) \ - .drop('nodata', 'defect', 'cloud8', 'cloud9', 'cirrus') - +df_labeled = df_labeled \ + .withColumn('mask', rf_local_is_in('scl', [0, 1, 8, 9, 10])) + # at this point the mask contains 0 for good cells and 1 for defect, etc # convert cell type and set value 1 to NoData -df_mask = df_mask_inv.withColumn('mask', +df_mask = df_labeled.withColumn('mask', rf_with_no_data(rf_convert_cell_type('mask', 'uint8'), 1.0) ) @@ -204,29 +194,35 @@ scored = model.transform(df_mask.drop('label')) retiled = scored \ .groupBy('extent', 'crs') \ .agg( - rf_assemble_tile('column_index', 'row_index', 'prediction', 128, 128).alias('prediction'), - rf_assemble_tile('column_index', 'row_index', 'B04', 128, 128).alias('red'), - rf_assemble_tile('column_index', 'row_index', 'B03', 128, 128).alias('grn'), - rf_assemble_tile('column_index', 'row_index', 'B02', 128, 128).alias('blu') + rf_assemble_tile('column_index', 'row_index', 'prediction', tile_size, tile_size).alias('prediction'), + rf_assemble_tile('column_index', 'row_index', 'B04', tile_size, tile_size).alias('red'), + rf_assemble_tile('column_index', 'row_index', 'B03', tile_size, tile_size).alias('grn'), + rf_assemble_tile('column_index', 'row_index', 'B02', tile_size, tile_size).alias('blu') ) retiled.printSchema() ``` Take a look at a sample of the resulting output and the corresponding area's red-green-blue composite image. +Recall the label coding: 1 is forest (purple), 2 is cropland (green) and 3 is developed areas (yellow). 
```python, display_rgb sample = retiled \ - .select('prediction', rf_rgb_composite('red', 'grn', 'blu').alias('rgb')) \ + .select('prediction', 'red', 'grn', 'blu') \ .sort(-rf_tile_sum(rf_local_equal('prediction', lit(3.0)))) \ .first() -sample_rgb = sample['rgb'] -mins = np.nanmin(sample_rgb.cells, axis=(0,1)) -plt.imshow((sample_rgb.cells - mins) / (np.nanmax(sample_rgb.cells, axis=(0,1)) - mins)) -``` +sample_rgb = np.concatenate([sample['red'].cells[:, :, None], + sample['grn'].cells[ :, :, None], + sample['blu'].cells[ :, :, None]], axis=2) +# plot scaled RGB +scaling_quantiles = np.nanpercentile(sample_rgb, [3.00, 97.00], axis=(0,1)) +scaled = np.clip(sample_rgb, scaling_quantiles[0, :], scaling_quantiles[1, :]) +scaled -= scaling_quantiles[0, :] +scaled /= (scaling_quantiles[1, : ] - scaling_quantiles[0, :]) -Recall the label coding: 1 is forest (purple), 2 is cropland (green) and 3 is developed areas(yellow). +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5)) +ax1.imshow(scaled) -```python, display_prediction -display(sample['prediction']) +# display prediction +ax2.imshow(sample['prediction'].cells) ``` diff --git a/pyrasterframes/src/main/python/docs/zonal-algebra.pymd b/pyrasterframes/src/main/python/docs/zonal-algebra.pymd index 9869e6b36..b3f4951eb 100644 --- a/pyrasterframes/src/main/python/docs/zonal-algebra.pymd +++ b/pyrasterframes/src/main/python/docs/zonal-algebra.pymd @@ -96,7 +96,7 @@ park_rf.printSchema() ## Define Zone Tiles -Now we have the vector representation of the park boundary alongside the _tiles_ of red and near infrared bands. Next, we need to create a _tile_ representation of the park to allow us to limit the raster analysis to pixels within the park _zone_. This is similar to the masking operation demonstrated in @ref:[NoData handling](nodata-handling.md#masking). 
We rasterize the geometries using @ref:[`rf_rasterize`](reference.md#rf-rasterize): this creates a new _tile_ column aligned with the imagery, and containing the park's OBJECTID attribute for cells intersecting the _zone_. Cells outside the park _zones_ have a NoData value. +Now we have the vector representation of the park boundary alongside the _tiles_ of red and near infrared bands. Next, we need to create a _tile_ representation of the park to allow us to limit the raster analysis to pixels within the park _zone_. This is similar to the masking operation demonstrated in @ref:[Masking](masking.md#masking). We rasterize the geometries using @ref:[`rf_rasterize`](reference.md#rf-rasterize): this creates a new _tile_ column aligned with the imagery, and containing the park's OBJECTID attribute for cells intersecting the _zone_. Cells outside the park _zones_ have a NoData value. ```python burn_in rf_park_tile = park_rf \ diff --git a/pyrasterframes/src/main/python/pyrasterframes/rasterfunctions.py b/pyrasterframes/src/main/python/pyrasterframes/rasterfunctions.py index 9c0e52f09..ae5977f51 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/rasterfunctions.py +++ b/pyrasterframes/src/main/python/pyrasterframes/rasterfunctions.py @@ -25,6 +25,7 @@ """ from __future__ import absolute_import from pyspark.sql.column import Column, _to_java_column +from pyspark.sql.functions import lit from .rf_context import RFContext from .rf_types import CellType @@ -137,20 +138,6 @@ def rf_explode_tiles_sample(sample_frac, seed, *tile_cols): return Column(jfcn(sample_frac, seed, RFContext.active().list_to_seq(jcols))) -def rf_mask_by_value(data_tile, mask_tile, mask_value): - """Generate a tile with the values from the data tile, but where cells in the masking tile contain the masking - value, replace the data value with NODATA. 
""" - jfcn = RFContext.active().lookup('rf_mask_by_value') - return Column(jfcn(_to_java_column(data_tile), _to_java_column(mask_tile), _to_java_column(mask_value))) - - -def rf_inverse_mask_by_value(data_tile, mask_tile, mask_value): - """Generate a tile with the values from the data tile, but where cells in the masking tile do not contain the - masking value, replace the data value with NODATA. """ - jfcn = RFContext.active().lookup('rf_inverse_mask_by_value') - return Column(jfcn(_to_java_column(data_tile), _to_java_column(mask_tile), _to_java_column(mask_value))) - - def _apply_scalar_to_tile(name, tile_col, scalar): jfcn = RFContext.active().lookup(name) return Column(jfcn(_to_java_column(tile_col), scalar)) @@ -260,14 +247,26 @@ def rf_local_unequal_int(tile_col, scalar): """Return a Tile with values equal 1 if the cell is not equal to a scalar, otherwise 0""" return _apply_scalar_to_tile('rf_local_unequal_int', tile_col, scalar) + def rf_local_no_data(tile_col): """Return a tile with ones where the input is NoData, otherwise zero.""" return _apply_column_function('rf_local_no_data', tile_col) + def rf_local_data(tile_col): """Return a tile with zeros where the input is NoData, otherwise one.""" return _apply_column_function('rf_local_data', tile_col) + +def rf_local_is_in(tile_col, array): + """Return a tile with cell values of 1 where the `tile_col` cell is in the provided array.""" + from pyspark.sql.functions import array as sql_array + if isinstance(array, list): + array = sql_array([lit(v) for v in array]) + + return _apply_column_function('rf_local_is_in', tile_col, array) + + def _apply_column_function(name, *args): jfcn = RFContext.active().lookup(name) jcols = [_to_java_column(arg) for arg in args] @@ -449,16 +448,53 @@ def rf_agg_local_stats(tile_col): return _apply_column_function('rf_agg_local_stats', tile_col) -def rf_mask(src_tile_col, mask_tile_col): - """Where the rf_mask (second) tile contains NODATA, replace values in the source (first) 
tile with NODATA.""" - return _apply_column_function('rf_mask', src_tile_col, mask_tile_col) +def rf_mask(src_tile_col, mask_tile_col, inverse=False): + """Where the rf_mask (second) tile contains NODATA, replace values in the source (first) tile with NODATA. + If `inverse` is true, replaces values in the source tile with NODATA where the mask tile contains valid data. + """ + if not inverse: + return _apply_column_function('rf_mask', src_tile_col, mask_tile_col) + else: + return rf_inverse_mask(src_tile_col, mask_tile_col) def rf_inverse_mask(src_tile_col, mask_tile_col): - """Where the rf_mask (second) tile DOES NOT contain NODATA, replace values in the source (first) tile with NODATA.""" + """Where the rf_mask (second) tile DOES NOT contain NODATA, replace values in the source + (first) tile with NODATA.""" return _apply_column_function('rf_inverse_mask', src_tile_col, mask_tile_col) +def rf_mask_by_value(data_tile, mask_tile, mask_value, inverse=False): + """Generate a tile with the values from the data tile, but where cells in the masking tile contain the masking + value, replace the data value with NODATA. If `inverse` is true, cells that do NOT contain the masking value are replaced instead. """ + if isinstance(mask_value, (int, float)): + mask_value = lit(mask_value) + jfcn = RFContext.active().lookup('rf_mask_by_value') + + return Column(jfcn(_to_java_column(data_tile), _to_java_column(mask_tile), _to_java_column(mask_value), inverse)) + + +def rf_mask_by_values(data_tile, mask_tile, mask_values): + """Generate a tile with the values from `data_tile`, but where cells in the `mask_tile` are in the `mask_values` + list, replace the value with NODATA. 
+ """ + from pyspark.sql.functions import array as sql_array + if isinstance(mask_values, list): + mask_values = sql_array([lit(v) for v in mask_values]) + + jfcn = RFContext.active().lookup('rf_mask_by_values') + col_args = [_to_java_column(c) for c in [data_tile, mask_tile, mask_values]] + return Column(jfcn(*col_args)) + + +def rf_inverse_mask_by_value(data_tile, mask_tile, mask_value): + """Generate a tile with the values from the data tile, but where cells in the masking tile do not contain the + masking value, replace the data value with NODATA. """ + if isinstance(mask_value, (int, float)): + mask_value = lit(mask_value) + return _apply_column_function('rf_inverse_mask_by_value', data_tile, mask_tile, mask_value) + + def rf_local_less(left_tile_col, right_tile_col): """Cellwise less than comparison between two tiles""" return _apply_column_function('rf_local_less', left_tile_col, right_tile_col) @@ -585,6 +621,18 @@ def rf_geometry(proj_raster_col): """Get the extent of a RasterSource or ProjectdRasterTile as a Geometry""" return _apply_column_function('rf_geometry', proj_raster_col) + +def rf_spatial_index(geom_col, crs_col=None, index_resolution = 18): + """Constructs a XZ2 index in WGS84 from either a Geometry, Extent, ProjectedRasterTile, or RasterSource and its CRS. 
+ For details: https://www.geomesa.org/documentation/user/datastores/index_overview.html """ + + jfcn = RFContext.active().lookup('rf_spatial_index') + + if crs_col is not None: + return Column(jfcn(_to_java_column(geom_col), _to_java_column(crs_col), index_resolution)) + else: + return Column(jfcn(_to_java_column(geom_col), index_resolution)) + # ------ GeoMesa Functions ------ def st_geomFromGeoHash(*args): diff --git a/pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py b/pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py index 0066e7dd7..0ae23d4ab 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py +++ b/pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py @@ -19,6 +19,8 @@ # import pyrasterframes.rf_types +from shapely.geometry.base import BaseGeometry + import numpy as np @@ -120,13 +122,18 @@ def pandas_df_to_html(df): if not pd.get_option("display.notebook_repr_html"): return None + default_max_colwidth = pd.get_option('display.max_colwidth') # we'll try to politely put it back + if len(df) == 0: return df._repr_html_() tile_cols = [] + geom_cols = [] for c in df.columns: if isinstance(df.iloc[0][c], pyrasterframes.rf_types.Tile): # if the first is a Tile try formatting tile_cols.append(c) + elif isinstance(df.iloc[0][c], BaseGeometry): # if the first is a Geometry try formatting + geom_cols.append(c) def _safe_tile_to_html(t): if isinstance(t, pyrasterframes.rf_types.Tile): @@ -135,11 +142,21 @@ def _safe_tile_to_html(t): # handles case where objects in a column are not all Tile type return t.__repr__() + def _safe_geom_to_html(g): + if isinstance(g, BaseGeometry): + wkt = g.wkt + if len(wkt) > default_max_colwidth: + return wkt[:default_max_colwidth-3] + '...' 
+ else: + return wkt + else: + return g.__repr__() + # dict keyed by column with custom rendering function formatter = {c: _safe_tile_to_html for c in tile_cols} + formatter.update({c: _safe_geom_to_html for c in geom_cols}) # This is needed to avoid our tile being rendered as `=1.6.0 -pyspark==2.3.4 # See issue # 154 +pyspark==2.4.4 numpy>=1.7 -pandas>=0.25.0 +pandas>=0.24.2 matplotlib<3.0.0 # no python 2.7 support after v2.x.x ipython==6.2.1 rasterio>=1.0.0 diff --git a/pyrasterframes/src/main/python/setup.cfg b/pyrasterframes/src/main/python/setup.cfg index 8088676c6..4d9369ec4 100644 --- a/pyrasterframes/src/main/python/setup.cfg +++ b/pyrasterframes/src/main/python/setup.cfg @@ -2,10 +2,10 @@ license_files = LICENSE.txt [bdist_wheel] -universal=1 +universal = 0 [aliases] -test=pytest +test = pytest [tool:pytest] addopts = --verbose diff --git a/pyrasterframes/src/main/python/setup.py b/pyrasterframes/src/main/python/setup.py index 611950c9c..70f4b2dcc 100644 --- a/pyrasterframes/src/main/python/setup.py +++ b/pyrasterframes/src/main/python/setup.py @@ -130,6 +130,26 @@ def initialize_options(self): def dest_file(self, src_file): return path.splitext(src_file)[0] + '.ipynb' +pytz = 'pytz' +shapely = 'Shapely>=1.6.0' +pyspark ='pyspark==2.4.4' +numpy = 'numpy>=1.12.0' +matplotlib ='matplotlib' +pandas = 'pandas>=0.24.2' +geopandas = 'geopandas' +requests = 'requests' +pytest_runner = 'pytest-runner' +setuptools = 'setuptools>=0.8' +ipython = 'ipython==6.2.1' +ipykernel = 'ipykernel==4.8.0' +pweave = 'Pweave==0.30.3' +fiona = 'fiona==1.8.6' +rasterio = 'rasterio>=1.0.0' +folium = 'folium' +pytest = 'pytest>=4.0.0,<5.0.0' +pypandoc = 'pypandoc' +boto3 = 'boto3' + setup( name='pyrasterframes', description='Access and process geospatial raster data in PySpark DataFrames', @@ -144,40 +164,41 @@ def dest_file(self, src_file): 'Bug Reports': 'https://github.com/locationtech/rasterframes/issues', 'Source': 'https://github.com/locationtech/rasterframes', }, + 
python_requires=">=3.5", install_requires=[ - 'pytz', - 'Shapely>=1.6.0', - 'pyspark<2.4', - 'numpy>=1.7', - 'pandas>=0.25.0', + pytz, + shapely, + pyspark, + numpy, + pandas ], setup_requires=[ - 'pytz', - 'Shapely>=1.6.0', - 'pyspark<2.4', - 'numpy>=1.7', - 'matplotlib<3.0.0', - 'pandas>=0.25.0', - 'geopandas', - 'requests', - 'pytest-runner', - 'setuptools>=0.8', - 'ipython==6.2.1', - 'ipykernel==4.8.0', - 'Pweave==0.30.3', - 'fiona==1.8.6', - 'rasterio>=1.0.0', # for docs - 'folium', + pytz, + shapely, + pyspark, + numpy, + matplotlib, + pandas, + geopandas, + requests, + pytest_runner, + setuptools, + ipython, + ipykernel, + pweave, + fiona, + rasterio, + folium ], tests_require=[ - 'pytest==3.4.2', - 'pypandoc', - 'numpy>=1.7', - 'Shapely>=1.6.0', - 'pandas>=0.25.0', - 'rasterio>=1.0.0', - 'boto3', - 'Pweave==0.30.3', + pytest, + pypandoc, + numpy, + shapely, + pandas, + rasterio, + boto3, + pweave ], packages=[ 'pyrasterframes', diff --git a/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py b/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py index 7cda3b997..3bb2ce491 100644 --- a/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py +++ b/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py @@ -131,7 +131,7 @@ def test_tile_udt_serialization(self): cells[1][1] = nd a_tile = Tile(cells, ct.with_no_data_value(nd)) round_trip = udt.fromInternal(udt.toInternal(a_tile)) - self.assertEquals(a_tile, round_trip, "round-trip serialization for " + str(ct)) + self.assertEqual(a_tile, round_trip, "round-trip serialization for " + str(ct)) schema = StructType([StructField("tile", TileUDT(), False)]) df = self.spark.createDataFrame([{"tile": a_tile}], schema) diff --git a/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py b/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py index ca17dc325..6c82f867c 100644 --- a/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py +++ 
b/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py @@ -24,6 +24,8 @@ from pyspark import Row from pyspark.sql.functions import * +import numpy as np +from numpy.testing import assert_equal from . import TestEnvironment @@ -103,7 +105,6 @@ def test_agg_mean(self): def test_agg_local_mean(self): from pyspark.sql import Row from pyrasterframes.rf_types import Tile - import numpy as np # this is really testing the nodata propagation in the agg local summation ct = CellType.int8().with_no_data_value(4) @@ -221,20 +222,43 @@ def test_mask_by_value(self): rf_local_greater_int(self.rf.tile, 25000), "uint8"), lit(mask_value)).alias('mask')) - rf2 = rf1.select(rf1.tile, rf_mask_by_value(rf1.tile, rf1.mask, lit(mask_value)).alias('masked')) + rf2 = rf1.select(rf1.tile, rf_mask_by_value(rf1.tile, rf1.mask, lit(mask_value), False).alias('masked')) result = rf2.agg(rf_agg_no_data_cells(rf2.tile) < rf_agg_no_data_cells(rf2.masked)) \ .collect()[0][0] self.assertTrue(result) - rf3 = rf1.select(rf1.tile, rf_inverse_mask_by_value(rf1.tile, rf1.mask, lit(mask_value)).alias('masked')) - result = rf3.agg(rf_agg_no_data_cells(rf3.tile) < rf_agg_no_data_cells(rf3.masked)) \ - .collect()[0][0] - self.assertTrue(result) + # note supplying a `int` here, not a column to mask value + rf3 = rf1.select( + rf1.tile, + rf_inverse_mask_by_value(rf1.tile, rf1.mask, mask_value).alias('masked'), + rf_mask_by_value(rf1.tile, rf1.mask, mask_value, True).alias('masked2'), + ) + result = rf3.agg( + rf_agg_no_data_cells(rf3.tile) < rf_agg_no_data_cells(rf3.masked), + rf_agg_no_data_cells(rf3.tile) < rf_agg_no_data_cells(rf3.masked2), + ) \ + .first() + self.assertTrue(result[0]) + self.assertTrue(result[1]) # inverse mask arg gives equivalent result + + result_equiv_tiles = rf3.select(rf_for_all(rf_local_equal(rf3.masked, rf3.masked2))).first()[0] + self.assertTrue(result_equiv_tiles) # inverse fn and inverse arg produce same Tile + + def test_mask_by_values(self): + + tile = 
Tile(np.random.randint(1, 100, (5, 5)), CellType.uint8()) + mask_tile = Tile(np.array(range(1, 26), 'uint8').reshape(5, 5)) + expected_diag_nd = Tile(np.ma.masked_array(tile.cells, mask=np.eye(5))) + + df = self.spark.createDataFrame([Row(t=tile, m=mask_tile)]) \ + .select(rf_mask_by_values('t', 'm', [1, 7, 13, 19, 25])) # values on the diagonal + result0 = df.first() + # assert_equal(result0[0].cells, expected_diag_nd) + self.assertTrue(result0[0] == expected_diag_nd) def test_mask(self): from pyspark.sql import Row from pyrasterframes.rf_types import Tile, CellType - import numpy as np np.random.seed(999) ma = np.ma.array(np.random.randint(0, 10, (5, 5), dtype='int8'), mask=np.random.rand(5, 5) > 0.7) @@ -323,13 +347,9 @@ def test_render_composite(self): # Look for the PNG magic cookie self.assertEqual(png_bytes[0:8], bytearray([0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A])) - - - def test_rf_interpret_cell_type_as(self): from pyspark.sql import Row from pyrasterframes.rf_types import Tile - import numpy as np df = self.spark.createDataFrame([ Row(t=Tile(np.array([[1, 3, 4], [5, 0, 3]]), CellType.uint8().with_no_data_value(5))) @@ -344,11 +364,12 @@ def test_rf_interpret_cell_type_as(self): def test_rf_local_data_and_no_data(self): from pyspark.sql import Row from pyrasterframes.rf_types import Tile - import numpy as np - from numpy.testing import assert_equal - t = Tile(np.array([[1, 3, 4], [5, 0, 3]]), CellType.uint8().with_no_data_value(5)) - #note the convert is due to issue #188 + nd = 5 + t = Tile( + np.array([[1, 3, 4], [nd, 0, 3]]), + CellType.uint8().with_no_data_value(nd)) + # note the convert is due to issue #188 df = self.spark.createDataFrame([Row(t=t)])\ .withColumn('lnd', rf_convert_cell_type(rf_local_no_data('t'), 'uint8')) \ .withColumn('ld', rf_convert_cell_type(rf_local_data('t'), 'uint8')) @@ -359,3 +380,44 @@ def test_rf_local_data_and_no_data(self): result_d = result['ld'] assert_equal(result_d.cells, np.invert(t.cells.mask)) + + def 
test_rf_local_is_in(self): + from pyspark.sql.functions import lit, array, col + from pyspark.sql import Row + + nd = 5 + t = Tile( + np.array([[1, 3, 4], [nd, 0, 3]]), + CellType.uint8().with_no_data_value(nd)) + # note the convert is due to issue #188 + df = self.spark.createDataFrame([Row(t=t)]) \ + .withColumn('a', array(lit(3), lit(4))) \ + .withColumn('in2', rf_convert_cell_type( + rf_local_is_in(col('t'), array(lit(0), lit(4))), + 'uint8')) \ + .withColumn('in3', rf_convert_cell_type(rf_local_is_in('t', 'a'), 'uint8')) \ + .withColumn('in4', rf_convert_cell_type( + rf_local_is_in('t', array(lit(0), lit(4), lit(3))), + 'uint8')) \ + .withColumn('in_list', rf_convert_cell_type(rf_local_is_in(col('t'), [4, 1]), 'uint8')) + + result = df.first() + self.assertEqual(result['in2'].cells.sum(), 2) + assert_equal(result['in2'].cells, np.isin(t.cells, np.array([0, 4]))) + self.assertEqual(result['in3'].cells.sum(), 3) + self.assertEqual(result['in4'].cells.sum(), 4) + self.assertEqual(result['in_list'].cells.sum(), 2, + "Tile value {} should contain two 1s as: [[1, 0, 1],[0, 0, 0]]" + .format(result['in_list'].cells)) + + def test_rf_spatial_index(self): + from pyspark.sql.functions import min as F_min + result_one_arg = self.df.select(rf_spatial_index('tile').alias('ix')) \ + .agg(F_min('ix')).first()[0] + print(result_one_arg) + + result_two_arg = self.df.select(rf_spatial_index(rf_extent('tile'), rf_crs('tile')).alias('ix')) \ + .agg(F_min('ix')).first()[0] + + self.assertEqual(result_two_arg, result_one_arg) + self.assertEqual(result_one_arg, 55179438768) # this is a bit more fragile but less important diff --git a/pyrasterframes/src/main/python/tests/VectorTypesTests.py b/pyrasterframes/src/main/python/tests/VectorTypesTests.py index e31f26b43..70e94af72 100644 --- a/pyrasterframes/src/main/python/tests/VectorTypesTests.py +++ b/pyrasterframes/src/main/python/tests/VectorTypesTests.py @@ -156,3 +156,16 @@ def test_geojson(self): geo = 
self.spark.read.geojson(sample) geo.show() self.assertEqual(geo.select('geometry').count(), 8) + + def test_spatial_index(self): + df = self.df.select(rf_spatial_index(self.df.poly_geom, rf_crs(lit("EPSG:4326"))).alias('index')) + expected = {22858201775, 38132946267, 38166922588, 38180072113} + indexes = {x[0] for x in df.collect()} + self.assertSetEqual(indexes, expected) + + # Custom resolution + df = self.df.select(rf_spatial_index(self.df.poly_geom, rf_crs(lit("EPSG:4326")), 3).alias('index')) + expected = {21, 36} + indexes = {x[0] for x in df.collect()} + self.assertSetEqual(indexes, expected) + diff --git a/pyrasterframes/src/main/python/tests/__init__.py b/pyrasterframes/src/main/python/tests/__init__.py index bea51f58b..b09b5f6f3 100644 --- a/pyrasterframes/src/main/python/tests/__init__.py +++ b/pyrasterframes/src/main/python/tests/__init__.py @@ -94,3 +94,7 @@ def create_layer(self): self.rf = rf.withColumn('tile2', rf_convert_cell_type('tile', 'float32')) \ .drop('tile') \ .withColumnRenamed('tile2', 'tile').as_layer() + + df = self.spark.read.raster(self.img_uri) + self.df = df.withColumn('tile', rf_convert_cell_type('proj_raster', 'float32')) \ + .drop('proj_raster') diff --git a/rf-notebook/build.sbt b/rf-notebook/build.sbt index b7e7b6213..7e406b411 100644 --- a/rf-notebook/build.sbt +++ b/rf-notebook/build.sbt @@ -1,5 +1,6 @@ import scala.sys.process.Process import PythonBuildPlugin.autoImport.pyWhl +import com.typesafe.sbt.git.DefaultReadableGit lazy val includeNotebooks = settingKey[Boolean]("Whether to build documentation into notebooks and include them") includeNotebooks := true @@ -8,6 +9,11 @@ Docker / packageName := "s22s/rasterframes-notebook" Docker / version := version.value +dockerAliases += dockerAlias.value.withTag({ + val sha = new DefaultReadableGit(file(".")).withGit(_.headCommitSha) + sha.map(_.take(7)) +}) + Docker / maintainer := organization.value Docker / sourceDirectory := baseDirectory.value / "src"/ "main" / "docker" @@ 
-44,6 +50,7 @@ Docker / dockerGenerateConfig := (Docker / sourceDirectory).value / "Dockerfile" // Save a bit of typing... publishLocal := (Docker / publishLocal).value +publish := (Docker / publish).value // -----== Conveniences ==----- diff --git a/rf-notebook/src/main/docker/Dockerfile b/rf-notebook/src/main/docker/Dockerfile index 6c7e514dd..d4271e370 100644 --- a/rf-notebook/src/main/docker/Dockerfile +++ b/rf-notebook/src/main/docker/Dockerfile @@ -1,30 +1,58 @@ -FROM s22s/pyspark-notebook:spark-2.3.4-hadoop-2.7 +# jupyter/scipy-notebook isn't semantically versioned. +# We pick this arbitrary one from Sept 2019 because it's what latest was on Oct 17 2019. +FROM jupyter/scipy-notebook:1386e2046833 -MAINTAINER Astraea, Inc. +LABEL maintainer="Astraea, Inc. " -ENV RF_LIB_LOC=/usr/local/rasterframes \ - LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" +EXPOSE 4040 4041 4042 4043 4044 USER root +RUN \ + apt-get -y update && \ + apt-get install --no-install-recommends -y openjdk-8-jre-headless ca-certificates-java && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Spark dependencies +ENV APACHE_SPARK_VERSION 2.4.4 +ENV HADOOP_VERSION 2.7 +ENV APACHE_SPARK_CHECKSUM 2E3A5C853B9F28C7D4525C0ADCB0D971B73AD47D5CCE138C85335B9F53A6519540D3923CB0B5CEE41E386E49AE8A409A51AB7194BA11A254E037A848D0C4A9E5 +ENV APACHE_SPARK_FILENAME spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz +ENV APACHE_SPARK_REMOTE_PATH spark-${APACHE_SPARK_VERSION}/${APACHE_SPARK_FILENAME} + +RUN \ + cd /tmp && \ + wget --quiet http://apache.mirrors.pair.com/spark/${APACHE_SPARK_REMOTE_PATH} && \ + echo "${APACHE_SPARK_CHECKSUM} *${APACHE_SPARK_FILENAME}" | sha512sum -c - && \ + tar xzf ${APACHE_SPARK_FILENAME} -C /usr/local --owner root --group root --no-same-owner && \ + rm ${APACHE_SPARK_FILENAME} + +RUN cd /usr/local && ln -s spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark + +# Spark config +ENV SPARK_HOME /usr/local/spark +ENV PYTHONPATH 
$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip +ENV SPARK_OPTS --driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info + +COPY conda_cleanup.sh . +RUN chmod u+x conda_cleanup.sh + +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib" +# Sphinx (for Notebook->html) and pyarrow (from pyspark build) +RUN \ + conda install --quiet --yes pyarrow \ + anaconda sphinx nbsphinx shapely numpy folium geopandas geojsonio rasterio descartes && \ + ./conda_cleanup.sh $NB_USER $CONDA_DIR + +ENV RF_LIB_LOC=/usr/local/rasterframes RUN mkdir $RF_LIB_LOC -EXPOSE 4040 4041 4042 4043 4044 - -# Sphinx (for Notebook->html) -RUN conda install --quiet --yes \ - anaconda sphinx nbsphinx shapely numpy folium geopandas geojsonio rasterio descartes - -# Cleanup pip residuals -RUN rm -rf /home/$NB_USER/.local && \ - fix-permissions /home/$NB_USER && \ - fix-permissions $CONDA_DIR - COPY *.whl $RF_LIB_LOC COPY jupyter_notebook_config.py $HOME/.jupyter COPY examples $HOME/examples -RUN ls -1 $RF_LIB_LOC/*.whl | xargs pip install +RUN ls -1 $RF_LIB_LOC/*.whl | xargs pip install --no-cache-dir RUN chmod -R +w $HOME/examples && chown -R $NB_UID:$NB_GID $HOME -USER $NB_UID \ No newline at end of file +USER $NB_UID diff --git a/rf-notebook/src/main/docker/conda_cleanup.sh b/rf-notebook/src/main/docker/conda_cleanup.sh new file mode 100644 index 000000000..a48622d6d --- /dev/null +++ b/rf-notebook/src/main/docker/conda_cleanup.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +NB_USER=$1 +CONDA_DIR=$2 +conda clean --all --force-pkgs-dirs --yes && \ + rm -rf /home/$NB_USER/.local && \ + find /opt/conda/ -type f,l -name '*.a' -delete && \ + find /opt/conda/ -type f,l -name '*.pyc' -delete && \ + find /opt/conda/ -type f,l -name '*.js.map' -delete && \ + find /opt/conda/lib/python*/site-packages/bokeh/server/static -type f,l -name '*.js' -not -name '*.min.js' -delete && \ + rm -rf /opt/conda/pkgs && \ + fix-permissions $CONDA_DIR && \ + fix-permissions 
/home/$NB_USER \ No newline at end of file diff --git a/rf-notebook/src/main/notebooks/Getting Started.ipynb b/rf-notebook/src/main/notebooks/Getting Started.ipynb index 1c0355774..2bcc5b3ec 100644 --- a/rf-notebook/src/main/notebooks/Getting Started.ipynb +++ b/rf-notebook/src/main/notebooks/Getting Started.ipynb @@ -21,9 +21,9 @@ "outputs": [], "source": [ "import pyrasterframes\n", + "from pyrasterframes.utils import create_rf_spark_session\n", "import pyrasterframes.rf_ipython # enables nicer visualizations of pandas DF\n", - "from pyrasterframes.rasterfunctions import (rf_local_add, rf_dimensions, rf_extent, rf_crs, rf_mk_crs,\n", - " st_geometry, st_reproject, rf_tile)\n", + "from pyrasterframes.rasterfunctions import *\n", "import pyspark.sql.functions as F" ] }, @@ -35,7 +35,7 @@ }, "outputs": [], "source": [ - "spark = pyrasterframes.get_spark_session()" + "spark = create_rf_spark_session()" ] }, { @@ -100,25 +100,45 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|rf_local_add(proj_raster, 3) |\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|[[[-7783653.637667, 993342.4642358534, -7665045.582235852, 1111950.519667], [+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ]], [int16ud32767, 
(256,255), [3408,3471,3110,2875,2798,2973,3255,3169,-2147483648,3217,...,-2147483648,-2147483648,-2147483648,-2147483648,-2147483648,-2147483648,-2147483648,2841,3226,-2147483648]]]|\n", - "|[[[-7665045.582235853, 993342.4642358534, -7546437.526804706, 1111950.519667], [+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ]], [int16ud32767, (256,255), [2337,2346,2581,2751,2575,2364,2223,2384,2618,2296,...,-2147483648,-2147483648,2608,2701,2713,3050,2983,2953,3252,2682]]] |\n", - "|[[[-7546437.526804707, 993342.4642358534, -7427829.471373559, 1111950.519667], [+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ]], [int16ud32767, (256,255), [2728,2784,2781,2567,2539,2254,2327,2436,2888,2589,...,2741,2515,2843,2934,2801,3044,2899,2430,2471,2645]]] |\n", - "|[[[-7427829.47137356, 993342.4642358534, -7309221.415942413, 1111950.519667], [+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ]], [int16ud32767, (256,255), [3058,3163,3036,3228,2877,3310,2885,2932,2931,2940,...,2634,2531,2122,1911,2229,2507,2239,2272,2499,2966]]] |\n", - "|[[[-7309221.415942414, 993342.4642358534, -7190613.360511266, 1111950.519667], [+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ]], [int16ud32767, (256,255), [3355,3502,3055,3343,3334,-2147483648,-2147483648,-2147483648,-2147483648,3058,...,2537,2851,2905,2449,2605,3025,2719,3054,3226,3052]]] |\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "only showing top 5 rows\n", - "\n" - ] + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + 
"\n", + "
Showing only top 5 rows
rf_local_add(proj_raster, 3)
" + ], + "text/markdown": [ + "\n", + "_Showing only top 5 rows_.\n", + "\n", + "| rf_local_add(proj_raster, 3) |\n", + "|---|\n", + "| |\n", + "| |\n", + "| |\n", + "| |\n", + "| |" + ], + "text/plain": [ + "DataFrame[rf_local_add(proj_raster, 3): struct,crs:struct>,tile:udt>]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "df.select(rf_local_add(df.proj_raster, F.lit(3))).show(5, False)" + "df.select(rf_local_add(df.proj_raster, F.lit(3)))" ] }, { @@ -166,24 +186,45 @@ "name": "stdout", "output_type": "stream", "text": [ - "+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs \n", - "+--------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|proj_raster_path |footprint |\n", - "+--------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF|POLYGON ((-70.85954815687087 8.933333332533772, -71.07986282542622 9.999999999104968, -69.99674110618135 9.999999999104968, -69.7797836135278 8.933333332533772, -70.85954815687087 8.933333332533772)) |\n", - "|https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF|POLYGON ((-69.77978361352781 8.933333332533772, -69.99674110618135 9.999999999104968, -68.91361938693649 9.999999999104968, -68.70001907018472 8.933333332533772, -69.77978361352781 8.933333332533772)) 
|\n", - "|https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF|POLYGON ((-68.70001907018474 8.933333332533772, -68.9136193869365 9.999999999104968, -67.8304976676916 9.999999999104968, -67.62025452684163 8.933333332533772, -68.70001907018474 8.933333332533772)) |\n", - "|https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF|POLYGON ((-67.62025452684165 8.933333332533772, -67.83049766769162 9.999999999104968, -66.74737594844675 9.999999999104968, -66.54048998349857 8.933333332533772, -67.62025452684165 8.933333332533772)) |\n", - "|https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF|POLYGON ((-66.54048998349859 8.933333332533772, -66.74737594844676 9.999999999104968, -65.66425422920187 9.999999999104968, -65.4607254401555 8.933333332533772, -66.54048998349859 8.933333332533772)) |\n", - "|https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF|POLYGON ((-65.4607254401555 8.933333332533772, -65.66425422920187 9.999999999104968, -64.58113250995702 9.999999999104968, -64.38096089681244 8.933333332533772, -65.4607254401555 8.933333332533772)) |\n", - "|https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF|POLYGON ((-64.38096089681244 8.933333332533772, -64.58113250995702 9.999999999104968, -63.498010790712144 9.999999999104968, -63.30119635346936 8.933333332533772, -64.38096089681244 8.933333332533772))|\n", - "|https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF|POLYGON ((-63.30119635346937 8.933333332533772, -63.49801079071215 9.999999999104968, -62.41488907146726 9.999999999104968, -62.221431810126276 8.933333332533772, -63.30119635346937 8.933333332533772))|\n", - 
"|https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF|POLYGON ((-62.22143181012629 8.933333332533772, -62.41488907146727 9.999999999104968, -61.33176735222239 9.999999999104968, -61.14166726678321 8.933333332533772, -62.22143181012629 8.933333332533772)) |\n", - "|https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF|POLYGON ((-61.14166726678322 8.933333332533772, -61.3317673522224 9.999999999104968, -60.92559670750556 9.999999999104968, -60.736755563029554 8.933333332533772, -61.14166726678322 8.933333332533772)) |\n", - "+--------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "only showing top 10 rows\n", - "\n" + "+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs \n" ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Showing only top 5 rows
proj_raster_pathfootprint
https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIFPOLYGON ((-70.85954815687087 8.933333332533772, -71.07986282542622 9.999999999104968, -69.99674110618135 9.999999999104968, -69.77978361352781 8.933333332533772, -70.85954815687087 8.933333332533772))
https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIFPOLYGON ((-69.77978361352781 8.933333332533772, -69.99674110618135 9.999999999104968, -68.91361938693649 9.999999999104968, -68.70001907018472 8.933333332533772, -69.77978361352781 8.933333332533772))
https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIFPOLYGON ((-68.70001907018474 8.933333332533772, -68.9136193869365 9.999999999104968, -67.83049766769162 9.999999999104968, -67.62025452684165 8.933333332533772, -68.70001907018474 8.933333332533772))
https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIFPOLYGON ((-67.62025452684165 8.933333332533772, -67.83049766769162 9.999999999104968, -66.74737594844675 9.999999999104968, -66.54048998349857 8.933333332533772, -67.62025452684165 8.933333332533772))
https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIFPOLYGON ((-66.54048998349859 8.933333332533772, -66.74737594844676 9.999999999104968, -65.66425422920187 9.999999999104968, -65.4607254401555 8.933333332533772, -66.54048998349859 8.933333332533772))
" + ], + "text/markdown": [ + "\n", + "_Showing only top 5 rows_.\n", + "\n", + "| proj_raster_path | footprint |\n", + "|---|---|\n", + "| https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF | POLYGON ((-70.85954815687087 8.933333332533772, -71.07986282542622 9.999999999104968, -69.99674110618135 9.999999999104968, -69.77978361352781 8.933333332533772, -70.85954815687087 8.933333332533772)) |\n", + "| https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF | POLYGON ((-69.77978361352781 8.933333332533772, -69.99674110618135 9.999999999104968, -68.91361938693649 9.999999999104968, -68.70001907018472 8.933333332533772, -69.77978361352781 8.933333332533772)) |\n", + "| https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF | POLYGON ((-68.70001907018474 8.933333332533772, -68.9136193869365 9.999999999104968, -67.83049766769162 9.999999999104968, -67.62025452684165 8.933333332533772, -68.70001907018474 8.933333332533772)) |\n", + "| https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF | POLYGON ((-67.62025452684165 8.933333332533772, -67.83049766769162 9.999999999104968, -66.74737594844675 9.999999999104968, -66.54048998349857 8.933333332533772, -67.62025452684165 8.933333332533772)) |\n", + "| https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF | POLYGON ((-66.54048998349859 8.933333332533772, -66.74737594844676 9.999999999104968, -65.66425422920187 9.999999999104968, -65.4607254401555 8.933333332533772, -66.54048998349859 8.933333332533772)) |" + ], + "text/plain": [ + "DataFrame[proj_raster_path: string, footprint: udt]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -196,7 +237,7 @@ " rf_mk_crs(crs), \n", " 
rf_mk_crs('EPSG:4326')).alias('footprint')\n", " )\n", - "coverage_area.show(10, False)" + "coverage_area" ] }, { @@ -231,23 +272,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "folium.Map((5, -65), zoom_start=6) \\\n", " .add_child(folium.GeoJson(gdf.__geo_interface__))" @@ -290,6 +317,7 @@ " \n", " proj_raster_path\n", " extent\n", + " geo\n", " tile\n", " \n", " \n", @@ -297,32 +325,37 @@ " \n", " 0\n", " https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF\n", - " (-7783653.637667, 993342.4642358534, -7665045.582235852, 1111950.519667)\n", - " \n", + " (-7783653.637667, 993342.4642358534, -7665045.582235853, 1111950.519667)\n", + " POLYGON ((-7783653.637667 993342.4642358534, -7...\n", + " \n", " \n", " \n", " 1\n", " https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF\n", " (-7665045.582235853, 993342.4642358534, -7546437.526804706, 1111950.519667)\n", - " \n", + " POLYGON ((-7665045.582235853 993342.4642358534,...\n", + " \n", " \n", " \n", " 2\n", " https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF\n", - " (-7546437.526804707, 993342.4642358534, -7427829.471373559, 1111950.519667)\n", - " \n", + " (-7546437.526804707, 993342.4642358534, -7427829.47137356, 1111950.519667)\n", + " POLYGON ((-7546437.526804707 993342.4642358534,...\n", + " \n", " \n", " \n", " 3\n", " https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF\n", " (-7427829.47137356, 993342.4642358534, -7309221.415942413, 1111950.519667)\n", - " \n", + " POLYGON ((-7427829.47137356 993342.4642358534, ...\n", + " \n", " \n", " \n", " 4\n", " https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF\n", - " (-7309221.415942414, 993342.4642358534, -7190613.360511266, 1111950.519667)\n", - " \n", + " (-7309221.415942414, 993342.4642358534, 
-7190613.360511267, 1111950.519667)\n", + " POLYGON ((-7309221.415942414 993342.4642358534,...\n", + " \n", " \n", " \n", "\n", @@ -343,12 +376,19 @@ "3 (-7427829.47137356, 993342.4642358534, -730922... \n", "4 (-7309221.415942414, 993342.4642358534, -71906... \n", "\n", + " geo \\\n", + "0 POLYGON ((-7783653.637667 993342.4642358534, -... \n", + "1 POLYGON ((-7665045.582235853 993342.4642358534... \n", + "2 POLYGON ((-7546437.526804707 993342.4642358534... \n", + "3 POLYGON ((-7427829.47137356 993342.4642358534,... \n", + "4 POLYGON ((-7309221.415942414 993342.4642358534... \n", + "\n", " tile \n", - "0 Tile(dimensions=[256, 255], cell_type=CellType... \n", - "1 Tile(dimensions=[256, 255], cell_type=CellType... \n", - "2 Tile(dimensions=[256, 255], cell_type=CellType... \n", - "3 Tile(dimensions=[256, 255], cell_type=CellType... \n", - "4 Tile(dimensions=[256, 255], cell_type=CellType... " + "0 Tile(dimensions=[256, 256], cell_type=CellType... \n", + "1 Tile(dimensions=[256, 256], cell_type=CellType... \n", + "2 Tile(dimensions=[256, 256], cell_type=CellType... \n", + "3 Tile(dimensions=[256, 256], cell_type=CellType... \n", + "4 Tile(dimensions=[256, 256], cell_type=CellType... 
" ] }, "execution_count": 11, @@ -361,6 +401,7 @@ "pandas_df = df.select(\n", " df.proj_raster_path,\n", " rf_extent(df.proj_raster).alias('extent'),\n", + " rf_geometry(df.proj_raster).alias('geo'),\n", " rf_tile(df.proj_raster).alias('tile'),\n", ").limit(5).toPandas()\n", "pandas_df" @@ -390,7 +431,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.6.5" } }, "nbformat": 4, diff --git a/rf-notebook/src/main/notebooks/pretty_rendering_in_rf.ipynb b/rf-notebook/src/main/notebooks/pretty_rendering_in_rf.ipynb index 722a12c76..fe0d373ec 100644 --- a/rf-notebook/src/main/notebooks/pretty_rendering_in_rf.ipynb +++ b/rf-notebook/src/main/notebooks/pretty_rendering_in_rf.ipynb @@ -24,10 +24,18 @@ "source": [ "import pyrasterframes\n", "import pyrasterframes.rf_ipython\n", + "from pyrasterframes.utils import create_rf_spark_session\n", "from pyrasterframes.rasterfunctions import rf_crs, rf_extent, rf_tile\n", - "from pyspark.sql.functions import col\n", - "\n", - "spark = pyrasterframes.get_spark_session()" + "from pyspark.sql.functions import col" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "spark = create_rf_spark_session()" ] }, { @@ -39,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -56,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -94,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -103,12 +111,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "Tile(masked_array(\n", " data=[[1225, 1244, 1247, ..., 1305, 1245, 1206],\n", @@ -129,7 +137,7 @@ " dtype=int16), 
int16ud32767)" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -147,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -156,7 +164,7 @@ "'Tile(dimensions=[256, 256], cell_type=CellType(int16ud32767, 32767), cells=\\n[[1225 1244 1247 ... 1305 1245 1206]\\n [1166 1188 1190 ... 1381 1251 1193]\\n [1156 1110 1122 ... 1248 1245 1270]\\n ...\\n [1485 1749 1761 ... 1034 996 998]\\n [1780 1777 1663 ... 1008 1027 1174]\\n [1728 1647 1562 ... 1189 1297 1382]])'" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -174,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -199,7 +207,7 @@ " dtype=int16)" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -219,39 +227,43 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", + "\n", "\n", "\n", "\n", "\n", - "\n", - "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", + "\n", "\n", "
Showing only top 5 rows
proj_raster_pathtilecrsext
https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF[int16ud32767, (256,256), [1225,1244,1247,1222,1189,1216,1206,1185,1132,1040,...,1575,1489,1281,1189,1202,1145,1171,1189,1297,1382]][+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ][1.4455356755667E7, -2342509.0947640934, 1.4573964811098093E7, -2223901.039333]
https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF[int16ud32767, (256,256), [1140,1227,1147,1106,1026,994,1047,1020,1174,1348,...,1793,1743,1685,1688,1706,1727,1766,1689,1561,1515]][+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ][1.4573964811098093E7, -2342509.0947640934, 1.4692572866529187E7, -2223901.039333]
https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF[int16ud32767, (256,256), [1546,1445,1329,1539,1653,1576,1533,1603,1610,1584,...,1399,1434,1330,1429,1470,1451,1422,1407,1369,1310]][+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ][1.4692572866529185E7, -2342509.0947640934, 1.4811180921960281E7, -2223901.039333]
https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF[int16ud32767, (256,256), [1765,1675,1704,1674,1665,1685,1551,1556,1576,1626,...,1814,1768,1771,1812,1825,1773,1737,1728,1734,1684]][+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ][1.481118092196028E7, -2342509.0947640934, 1.4929788977391373E7, -2223901.039333]
https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF[int16ud32767, (256,256), [1171,1272,1306,1294,1202,1065,998,971,976,1188,...,1455,1481,1458,1469,1449,1392,1227,1085,1102,1091]][+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ][1.4929788977391373E7, -2342509.0947640934, 1.5048397032822467E7, -2223901.039333]
https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF[+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ][1.4455356755667E7, -2342509.0947640934, 1.4573964811098093E7, -2223901.039333]
https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF[+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ][1.4573964811098093E7, -2342509.0947640934, 1.4692572866529187E7, -2223901.039333]
https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF[+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ][1.4692572866529185E7, -2342509.0947640934, 1.481118092196028E7, -2223901.039333]
https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF[+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ][1.481118092196028E7, -2342509.0947640934, 1.4929788977391373E7, -2223901.039333]
https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF[+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ][1.4929788977391373E7, -2342509.0947640934, 1.5048397032822467E7, -2223901.039333]
" ], "text/markdown": [ + "\n", + "_Showing only top 5 rows_.\n", + "\n", "| proj_raster_path | tile | crs | ext |\n", "|---|---|---|---|\n", - "| https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF | \\[int16ud32767, (256,256), \\[1225,1244,1247,1222,1189,1216,1206,1185,1132,1040,...,1575,1489,1281,1189,1202,1145,1171,1189,1297,1382]] | \\[+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ] | \\[1.4455356755667E7, -2342509.0947640934, 1.4573964811098093E7, -2223901.039333] |\n", - "| https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF | \\[int16ud32767, (256,256), \\[1140,1227,1147,1106,1026,994,1047,1020,1174,1348,...,1793,1743,1685,1688,1706,1727,1766,1689,1561,1515]] | \\[+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ] | \\[1.4573964811098093E7, -2342509.0947640934, 1.4692572866529187E7, -2223901.039333] |\n", - "| https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF | \\[int16ud32767, (256,256), \\[1546,1445,1329,1539,1653,1576,1533,1603,1610,1584,...,1399,1434,1330,1429,1470,1451,1422,1407,1369,1310]] | \\[+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ] | \\[1.4692572866529185E7, -2342509.0947640934, 1.4811180921960281E7, -2223901.039333] |\n", - "| https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF | \\[int16ud32767, (256,256), \\[1765,1675,1704,1674,1665,1685,1551,1556,1576,1626,...,1814,1768,1771,1812,1825,1773,1737,1728,1734,1684]] | \\[+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ] | \\[1.481118092196028E7, -2342509.0947640934, 1.4929788977391373E7, -2223901.039333] |\n", - "| 
https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF | \\[int16ud32767, (256,256), \\[1171,1272,1306,1294,1202,1065,998,971,976,1188,...,1455,1481,1458,1469,1449,1392,1227,1085,1102,1091]] | \\[+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ] | \\[1.4929788977391373E7, -2342509.0947640934, 1.5048397032822467E7, -2223901.039333] |" + "| https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF | | \\[+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ] | \\[1.4455356755667E7, -2342509.0947640934, 1.4573964811098093E7, -2223901.039333] |\n", + "| https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF | | \\[+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ] | \\[1.4573964811098093E7, -2342509.0947640934, 1.4692572866529187E7, -2223901.039333] |\n", + "| https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF | | \\[+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ] | \\[1.4692572866529185E7, -2342509.0947640934, 1.481118092196028E7, -2223901.039333] |\n", + "| https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF | | \\[+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ] | \\[1.481118092196028E7, -2342509.0947640934, 1.4929788977391373E7, -2223901.039333] |\n", + "| https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF | | \\[+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ] | \\[1.4929788977391373E7, -2342509.0947640934, 1.5048397032822467E7, -2223901.039333] |" ], "text/plain": [ "DataFrame[proj_raster_path: string, tile: udt, crs: struct, 
ext: struct]" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -278,7 +290,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -312,28 +324,28 @@ " \n", " 0\n", " https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF\n", - " \n", + " \n", " (+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ,)\n", " (14455356.755667, -2342509.0947640934, 14573964.811098093, -2223901.039333)\n", " \n", " \n", " 1\n", " https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF\n", - " \n", + " \n", " (+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ,)\n", " (14573964.811098093, -2342509.0947640934, 14692572.866529187, -2223901.039333)\n", " \n", " \n", " 2\n", " https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF\n", - " \n", + " \n", " (+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ,)\n", - " (14692572.866529185, -2342509.0947640934, 14811180.921960281, -2223901.039333)\n", + " (14692572.866529185, -2342509.0947640934, 14811180.92196028, -2223901.039333)\n", " \n", " \n", " 3\n", " https://modis-pds.s3.amazonaws.com/MCD43A4.006/31/11/2017158/MCD43A4.A2017158.h31v11.006.2017171203421_B01.TIF\n", - " \n", + " \n", " (+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs ,)\n", " (14811180.92196028, -2342509.0947640934, 14929788.977391373, -2223901.039333)\n", " \n", @@ -367,7 +379,7 @@ "3 (14811180.92196028, -2342509.0947640934, 14929... 
" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -386,7 +398,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": { "scrolled": true }, @@ -401,7 +413,7 @@ "Name: 8, dtype: object" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -412,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": { "scrolled": true }, @@ -433,7 +445,7 @@ "Name: tile, dtype: object" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -451,186 +463,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iataairportcitystatecountrylatlongcnt
0ORDChicago O'Hare InternationalChicagoILUSA41.979595-87.90446425129
1ATLWilliam B Hartsfield-Atlanta IntlAtlantaGAUSA33.640444-84.42694421925
2DFWDallas-Fort Worth InternationalDallas-Fort WorthTXUSA32.895951-97.03720020662
3PHXPhoenix Sky Harbor InternationalPhoenixAZUSA33.434167-112.00805617290
4DENDenver IntlDenverCOUSA39.858408-104.66700213781
5IAHGeorge Bush IntercontinentalHoustonTXUSA29.980472-95.33972213223
6SFOSan Francisco InternationalSan FranciscoCAUSA37.619002-122.37484312016
7LAXLos Angeles InternationalLos AngelesCAUSA33.942536-118.40807411797
8MCOOrlando InternationalOrlandoFLUSA28.428889-81.31602810536
9CLTCharlotte/Douglas InternationalCharlotteNCUSA35.214011-80.94312610490
\n", - "
" - ], - "text/plain": [ - " iata airport city state country \\\n", - "0 ORD Chicago O'Hare International Chicago IL USA \n", - "1 ATL William B Hartsfield-Atlanta Intl Atlanta GA USA \n", - "2 DFW Dallas-Fort Worth International Dallas-Fort Worth TX USA \n", - "3 PHX Phoenix Sky Harbor International Phoenix AZ USA \n", - "4 DEN Denver Intl Denver CO USA \n", - "5 IAH George Bush Intercontinental Houston TX USA \n", - "6 SFO San Francisco International San Francisco CA USA \n", - "7 LAX Los Angeles International Los Angeles CA USA \n", - "8 MCO Orlando International Orlando FL USA \n", - "9 CLT Charlotte/Douglas International Charlotte NC USA \n", - "\n", - " lat long cnt \n", - "0 41.979595 -87.904464 25129 \n", - "1 33.640444 -84.426944 21925 \n", - "2 32.895951 -97.037200 20662 \n", - "3 33.434167 -112.008056 17290 \n", - "4 39.858408 -104.667002 13781 \n", - "5 29.980472 -95.339722 13223 \n", - "6 37.619002 -122.374843 12016 \n", - "7 33.942536 -118.408074 11797 \n", - "8 28.428889 -81.316028 10536 \n", - "9 35.214011 -80.943126 10490 " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import pandas\n", "pandas.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_february_us_airport_traffic.csv').head(10)" @@ -660,7 +495,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.6.5" } }, "nbformat": 4, diff --git a/version.sbt b/version.sbt index 8e8a39d95..ca68fcbd5 100644 --- a/version.sbt +++ b/version.sbt @@ -1 +1 @@ -version in ThisBuild := "0.8.3" +version in ThisBuild := "0.8.4"