Ability to write RasterFrame tile columns directly as raster files. #571
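As a quick orientation before the diff: a minimal usage sketch of the writer this PR adds. The RasterFrame `df` and the `scene_id`/`acquired` column names are hypothetical; only `file://` destinations are supported.

    import org.locationtech.rasterframes.datasource.tiles._

    df.write.tiles                      // selects the "tiles" data source
      .withFilenameColumn("scene_id")   // name each output file from this column's value
      .withMetadataColumns("acquired")  // embedded in GeoTIFF tags and the catalog
      .withCatalog                      // also emit a merged catalog.csv
      .save("file:///tmp/chips")        // one GeoTIFF (or PNG with .asPNG) per row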
@@ -0,0 +1,269 @@
/*
 * This software is licensed under the Apache 2 license, quoted below.
 *
 * Copyright 2021 Astraea, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * [http://www.apache.org/licenses/LICENSE-2.0]
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 */
package org.locationtech.rasterframes.datasource.tiles

import com.typesafe.scalalogging.Logger
import geotrellis.proj4.CRS
import geotrellis.raster.io.geotiff.compression.DeflateCompression
import geotrellis.raster.io.geotiff.tags.codes.ColorSpace
import geotrellis.raster.io.geotiff.{GeoTiffOptions, MultibandGeoTiff, Tags, Tiled}
import geotrellis.raster.render.ColorRamps
import geotrellis.raster.{MultibandTile, Tile}
import geotrellis.store.hadoop.{SerializableConfiguration, _}
import geotrellis.vector.Extent
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.IOUtils
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Column, DataFrame, Dataset, Encoders, Row, SQLContext, SaveMode, functions => F}
import org.locationtech.rasterframes._
import org.locationtech.rasterframes.encoders.SparkBasicEncoders
import org.locationtech.rasterframes.util._
import org.slf4j.LoggerFactory

import java.io.IOException
import java.net.URI
import scala.util.Try

class TilesDataSource extends DataSourceRegister with CreatableRelationProvider {
  import TilesDataSource._
  @transient protected lazy val logger = Logger(LoggerFactory.getLogger(getClass.getName))
  override def shortName(): String = SHORT_NAME
  /**
   * Merges the Spark-generated part-files under `srcDir` into the single file `dstFile`.
   * Credit: https://stackoverflow.com/a/50545815/296509
   */
  def copyMerge(
    srcFS: FileSystem, srcDir: Path,
    dstFS: FileSystem, dstFile: Path,
    deleteSource: Boolean, conf: Configuration
  ): Boolean = {

    if (dstFS.exists(dstFile))
      throw new IOException(s"Target $dstFile already exists")

    // Source path is expected to be a directory:
    if (srcFS.getFileStatus(srcDir).isDirectory()) {

      val outputFile = dstFS.create(dstFile)
      Try {
        srcFS
          .listStatus(srcDir)
          .sortBy(_.getPath.getName)
          .collect {
            case status if status.isFile() =>
              val inputFile = srcFS.open(status.getPath())
              Try(IOUtils.copyBytes(inputFile, outputFile, conf, false))
              inputFile.close()
          }
      }
      outputFile.close()

      if (deleteSource) srcFS.delete(srcDir, true) else true
    }
    else false
  }

  private def writeCatalog(pipeline: Dataset[Row], pathURI: URI, conf: SerializableConfiguration) = {
    // A bit of a hack here. First we write the CSV using Spark's CSV writer, then we clean up all the Hadoop noise.
    val fName = "catalog.csv"
    val hPath = new Path(new Path(pathURI), "_" + fName)
    pipeline
      .coalesce(1)
      .write
      .option("header", "true")
      .csv(hPath.toString)

    val fs = FileSystem.get(pathURI, conf.value)
    val localPath = new Path(new Path(pathURI), fName)
    copyMerge(fs, hPath, fs, localPath, true, conf.value)
  }
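
  // For illustration only: with `withMetadataColumns("acquired")` (a hypothetical
  // metadata column), the merged catalog.csv would look something like:
  //
  //   filename,bbox,crs,acquired
  //   chip-001.tif,[0.489|51.28|0.236|51.686],+proj=longlat +datum=WGS84 +no_defs,2021-05-01
  //
  // The bbox uses '|' separators (see the comment in `createRelation` below) so the
  // extent values don't collide with the CSV comma delimiter.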

  override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = {
    val pathURI = parameters.path.getOrElse(throw new IllegalArgumentException("Valid URI 'path' parameter required."))
    require(pathURI.getScheme == "file" || pathURI.getScheme == null, "Currently only 'file://' destinations are supported")

    val tileCols = data.tileColumns
    require(tileCols.nonEmpty, "Could not find any tile columns.")

    val filenameCol = parameters.filenameColumn
      .map(F.col)
      .getOrElse(F.monotonically_increasing_id().cast(StringType))

    val SpatialComponents(crsCol, extentCol, _, _) = projectSpatialComponents(data) match {
      case Some(parts) => parts
      case _ => throw new IllegalArgumentException("Could not find extent and/or CRS data.")
    }

    val tags = Tags(Map.empty,
      tileCols.map(c => Map("source_column" -> c.columnName)).toList
    )

    // We make some assumptions here... eventually column metadata should encode this.
    val colorSpace = tileCols.size match {
      case 3 | 4 => ColorSpace.RGB
      case _ => ColorSpace.BlackIsZero
    }

    val metadataCols = parameters.metadataColumns

    // Default format options.
    val tiffOptions = GeoTiffOptions(Tiled, DeflateCompression, colorSpace)

    val outRowEnc = RowEncoder(StructType(
      StructField("filename", StringType) +:
        StructField("bbox", StringType) +:
        StructField("crs", StringType) +:
        metadataCols.map(n =>
          StructField(n, StringType)
        )
    ))

    val hconf = SerializableConfiguration(sqlContext.sparkContext.hadoopConfiguration)

    // Spark ceremony for reifying row contents.
    import SparkBasicEncoders._
    val inRowEnc = Encoders.tuple(
      stringEnc, crsExpressionEncoder, extentEncoder, arrayEnc[Tile], arrayEnc[String])
    type RowStuff = (String, CRS, Extent, Array[Tile], Array[String])
    val pipeline = data
      .select(filenameCol, crsCol, extentCol, F.array(tileCols.map(rf_tile): _*),
        F.array(metadataCols.map(data.apply).map(_.cast(StringType)): _*))
      .na.drop()
      .as[RowStuff](inRowEnc)
      .mapPartitions { rows =>
        for ((filename, crs, extent, tiles, metadata) <- rows) yield {
          val md = metadataCols.zip(metadata).toMap

          val finalFilename = if (parameters.asPNG) {
            val fnl = filename.toLowerCase()
            if (!fnl.endsWith("png")) filename + ".png" else filename
          }
          else {
            val fnl = filename.toLowerCase()
            if (!(fnl.endsWith("tiff") || fnl.endsWith("tif"))) filename + ".tif" else filename
          }

          val finalPath = new Path(new Path(pathURI), finalFilename)

          if (parameters.asPNG) {
            // `Try` below is due to https://github.com/locationtech/geotrellis/issues/2621
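            // Note: `rescale(0, 255)` stretches cell values into the 8-bit range PNG
            // encoding expects; if rescaling fails, the original tile is passed through.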
            val scaled = tiles.map(t => Try(t.rescale(0, 255)).getOrElse(t))
            if (scaled.length > 1)
              MultibandTile(scaled).renderPng().write(finalPath, hconf.value)
            else
              scaled.head.renderPng(ColorRamps.greyscale(255)).write(finalPath, hconf.value)
          }
          else {
            val chipTags = tags.copy(headTags = md.updated("base_filename", filename))
            val geotiff = new MultibandGeoTiff(MultibandTile(tiles), extent, crs, chipTags, tiffOptions)
            geotiff.write(finalPath, hconf.value)
          }
          // Review comment: Not sure if this is bad form or not, but we're basically
          // performing side effects in a mapPartitions. Check other data sources on
          // how to do this?

          // Ordering:
          // bbox = left,bottom,right,top
          // bbox = min Longitude , min Latitude , max Longitude , max Latitude
          // Avoiding commas with this format:
          // [0.489|51.28|0.236|51.686]
          val bbox = s"[${extent.xmin}|${extent.ymin}|${extent.xmax}|${extent.ymax}]"
          Row(finalFilename +: bbox +: crs.toProj4String +: metadata: _*)
          // Review comment: This effectively drops the raster columns so the …
        }
      }(outRowEnc)
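
    // Note: the no-op `foreach` below forces evaluation of the otherwise-lazy
    // `pipeline`, so the per-row file writes happen even when no catalog is requested.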
    if (parameters.withCatalog)
      writeCatalog(pipeline, pathURI, hconf)
    else
      pipeline.foreach(_ => ())

    // The `createRelation` function here is called by
    // `org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand`, which
    // ignores the return value. It in turn returns `Seq.empty[Row]` (which is then also ignored)...
    // ¯\_(ツ)_/¯
    null
  }
}

object TilesDataSource {
  final val SHORT_NAME = "tiles"
  // writing
  final val PATH_PARAM = "path"
  final val FILENAME_COL_PARAM = "filename"
  final val CATALOG_PARAM = "catalog"
  final val METADATA_PARAM = "metadata"
  final val AS_PNG_PARAM = "png"

  case class SpatialComponents(crsColumn: Column,
                               extentColumn: Column,
                               dimensionColumn: Column,
                               cellTypeColumn: Column)

  object SpatialComponents {
    def apply(tileColumn: Column, crsColumn: Column, extentColumn: Column): SpatialComponents = {
      val dim = rf_dimensions(tileColumn) as "dims"
      val ct = rf_cell_type(tileColumn) as "cellType"
      SpatialComponents(crsColumn, extentColumn, dim, ct)
    }
    def apply(prColumn: Column): SpatialComponents = {
      SpatialComponents(
        rf_crs(prColumn) as "crs",
        rf_extent(prColumn) as "extent",
        rf_dimensions(prColumn) as "dims",
        rf_cell_type(prColumn) as "cellType"
      )
    }
  }

  protected[rasterframes]
  implicit class TilesDictAccessors(val parameters: Map[String, String]) extends AnyVal {
    def filenameColumn: Option[String] =
      parameters.get(FILENAME_COL_PARAM)

    def path: Option[URI] =
      datasource.uriParam(PATH_PARAM, parameters)

    def withCatalog: Boolean =
      parameters.get(CATALOG_PARAM).exists(_.toBoolean)

    def metadataColumns: Seq[String] =
      parameters.get(METADATA_PARAM).toSeq.flatMap(_.split(','))

    def asPNG: Boolean =
      parameters.get(AS_PNG_PARAM).exists(_.toBoolean)
  }
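
  // For illustration only: the accessors above map the writer's string options onto the
  // parameter map, so the fluent helpers in the `tiles` package object (second file in
  // this diff) are equivalent to setting raw options. Hypothetical DataFrame `df` and
  // column names:
  //
  //   df.write.format("tiles")
  //     .option("filename", "scene_id")
  //     .option("catalog", "true")
  //     .option("metadata", "acquired,cloud_cover")
  //     .save("file:///tmp/chips")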

  /**
   * If the given DataFrame has tile, CRS, and extent columns, build a `SpatialComponents`
   * from the first of each. Otherwise, see if there's a `ProjectedRaster` column and
   * derive the `crs` and `extent` columns from it.
   *
   * @param d DataFrame to process.
   * @return the CRS, extent, dimension, and cell type columns, if they could be determined
   */
  def projectSpatialComponents(d: DataFrame): Option[SpatialComponents] =
    d.tileColumns.headOption.zip(d.crsColumns.headOption.zip(d.extentColumns.headOption)).headOption
      .map { case (tile, (crs, extent)) => SpatialComponents(tile, crs, extent) }
      .orElse(
        d.projRasterColumns.headOption
          .map(pr => SpatialComponents(pr))
      )
}
@@ -0,0 +1,85 @@
/*
 * This software is licensed under the Apache 2 license, quoted below.
 *
 * Copyright (c) 2021. Astraea, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * [http://www.apache.org/licenses/LICENSE-2.0]
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 */

package org.locationtech.rasterframes.datasource

import org.apache.spark.sql.DataFrameWriter
import shapeless.tag.@@

package object tiles {
  trait TilesDataFrameReaderTag
  trait TilesDataFrameWriterTag

  type TilesDataFrameWriter[T] = DataFrameWriter[T] @@ TilesDataFrameWriterTag

  /** Adds `tiles` format specifier to `DataFrameWriter` */
  implicit class DataFrameWriterHasTilesWriter[T](val writer: DataFrameWriter[T]) {
    def tiles: TilesDataFrameWriter[T] =
      shapeless.tag[TilesDataFrameWriterTag][DataFrameWriter[T]](
        writer.format(TilesDataSource.SHORT_NAME))
  }

  /** Options for `tiles` format writer. */
  implicit class TilesWriterOps[T](val writer: TilesDataFrameWriter[T]) extends TilesWriterOptionsSupport[T]

  trait TilesWriterOptionsSupport[T] {
    val writer: TilesDataFrameWriter[T]

    /**
     * Provide the name of a column whose row value will be used as the output filename.
     * The generated value may contain path components; the appropriate filename extension
     * is added automatically.
     *
     * @param colName name of column to use.
     */
    def withFilenameColumn(colName: String): TilesDataFrameWriter[T] = {
      shapeless.tag[TilesDataFrameWriterTag][DataFrameWriter[T]](
        writer.option(TilesDataSource.FILENAME_COL_PARAM, colName)
      )
    }

    /**
     * Enable generation of a `catalog.csv` file along with the tile files, listing the
     * file paths relative to the base directory along with any metadata values identified
     * via `withMetadataColumns`.
     */
    def withCatalog: TilesDataFrameWriter[T] = {
      shapeless.tag[TilesDataFrameWriterTag][DataFrameWriter[T]](
        writer.option(TilesDataSource.CATALOG_PARAM, true.toString)
      )
    }

    /**
     * Specify column values to add to chip metadata and catalog (when written).
     *
     * @param colNames names of columns to add. Values are automatically cast to `String`.
     */
    def withMetadataColumns(colNames: String*): TilesDataFrameWriter[T] = {
      shapeless.tag[TilesDataFrameWriterTag][DataFrameWriter[T]](
        writer.option(TilesDataSource.METADATA_PARAM, colNames.mkString(","))
      )
    }

    /** Request tiles be written out in PNG format. GeoTIFF is the default. */
    def asPNG: TilesDataFrameWriter[T] = {
      shapeless.tag[TilesDataFrameWriterTag][DataFrameWriter[T]](
        writer.option(TilesDataSource.AS_PNG_PARAM, true.toString)
      )
    }
  }
}
Review comment: @echeipesh @pomadchin Maybe this could be replaced with something that's more STAC oriented?