From 73e3651fe312d21debf4de1b4d00773bad7910b2 Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Thu, 22 Aug 2019 08:39:21 -0400 Subject: [PATCH 01/48] Initial refactor of raster reader args in Python API Signed-off-by: Jason T. Brown --- .../main/python/pyrasterframes/__init__.py | 45 ++++- .../main/python/tests/PyRasterFramesTests.py | 152 --------------- .../src/main/python/tests/RasterSourceTest.py | 179 ++++++++++++++++++ .../src/main/python/tests/__init__.py | 5 +- 4 files changed, 225 insertions(+), 156 deletions(-) create mode 100644 pyrasterframes/src/main/python/tests/RasterSourceTest.py diff --git a/pyrasterframes/src/main/python/pyrasterframes/__init__.py b/pyrasterframes/src/main/python/pyrasterframes/__init__.py index 1fa5e91cf..e2702fb8c 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/__init__.py +++ b/pyrasterframes/src/main/python/pyrasterframes/__init__.py @@ -110,13 +110,28 @@ def _aliased_writer(df_writer, format_key, path, **options): def _raster_reader( df_reader, - path=None, - catalog=None, + source=None, catalog_col_names=None, band_indexes=None, tile_dimensions=(256, 256), lazy_tiles=True, **options): + """ + Returns a Spark DataFrame from a raster data files specified by URI pointers + The returned DataFrame will have a column of (CRS, Extent, Tile) for each URI read + Multiple bands from the same raster file are spread across rows of the DataFrame. See band_indexes param. + If bands from a scene are stored in separate files, provide a DataFrame to the `source` parameter. Each row in the returned DataFrame will contain one (CRS, Extent, Tile) for each item in `catalog_col_names` + + For more details and example usage, consult https://rasterframes.io/raster-read.html + + :param source: a string, list of strings, a pandas DataFrame or a Spark DataFrame giving URIs to the raster data to read + :param catalog_col_names: required if source is a DataFrame or CSV string. 
It is a list of strings giving the names of columns containing URIs to read + :param band_indexes: list of integers indicating which bands, zero-based, to read from the raster files specified; default is to read only the first band + :param tile_dimensions: tuple or list of two indicating the default tile dimension as (columns, rows) + :param lazy_tiles: If true (default) only generate minimal references to tile contents; if false, fetch tile cell values + :param options: Additional keyword arguments to pass to the spark DataSource + :return: + """ from pandas import DataFrame as PdDataFrame @@ -140,6 +155,25 @@ def temp_name(): "lazyTiles": lazy_tiles }) + # Parse the `source` argument + path = None # to pass into `path` param + if isinstance(source, list): + path = None + catalog = None + options.update(dict(paths='\n'.join(str(source)))) + elif isinstance(source, str): + if '\n' in source or '\r' in source: + # then the `source` string is a catalog as a CSV (header is required) + path = None + catalog = source + else: + # interpret source as a single URI string + path = source + catalog = None + else: + # user has passed in some other type, we will interpret as a catalog + catalog = source + if catalog is not None: if catalog_col_names is None: raise Exception("'catalog_col_names' required when DataFrame 'catalog' specified") @@ -149,6 +183,9 @@ def temp_name(): "catalogColumns": to_csv(catalog_col_names) }) elif isinstance(catalog, DataFrame): + # check catalog_col_names + assert all([c in catalog.columns for c in catalog_col_names]), \ + "All items in catalog_col_names must be the name of a column in the catalog DataFrame." # Create a random view name tmp_name = temp_name() catalog.createOrReplaceTempView(tmp_name) @@ -157,6 +194,10 @@ def temp_name(): "catalogColumns": to_csv(catalog_col_names) }) elif isinstance(catalog, PdDataFrame): + # check catalog_col_names + assert all([c in catalog.columns for c in catalog_col_names]), \ + "All items in catalog_col_names must be the name of a column in the catalog DataFrame." + # Handle to active spark session session = SparkContext._active_spark_context._rf_context._spark_session # Create a random view name diff --git a/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py b/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py index f682e2609..6092410bb 100644 --- a/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py +++ b/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py @@ -410,158 +410,6 @@ def test_raster_join(self): self.rf.raster_join(rf_prime, join_exprs=self.rf.extent) -class RasterSource(TestEnvironment): - - def test_handle_lazy_eval(self): - df = self.spark.read.raster(self.img_uri) - ltdf = df.select('proj_raster') - self.assertGreater(ltdf.count(), 0) - self.assertIsNotNone(ltdf.first()) - - tdf = df.select(rf_tile('proj_raster')) - self.assertGreater(tdf.count(), 0) - self.assertIsNotNone(tdf.first()) - - def test_strict_eval(self): - df_lazy = self.spark.read.raster(self.img_uri, lazy_tiles=True) - # when doing Show on a lazy tile we will see something like RasterRefTile(RasterRef(JVMGeoTiffRasterSource(... 
- # use this trick to get the `show` string - show_str_lazy = df_lazy.select('proj_raster')._jdf.showString(1, -1, False) - self.assertTrue('RasterRef' in show_str_lazy) - - # again for strict - df_strict = self.spark.read.raster(self.img_uri, lazy_tiles=False) - show_str_strict = df_strict.select('proj_raster')._jdf.showString(1, -1, False) - self.assertTrue('RasterRef' not in show_str_strict) - - - def test_prt_functions(self): - df = self.spark.read.raster(self.img_uri) \ - .withColumn('crs', rf_crs('proj_raster')) \ - .withColumn('ext', rf_extent('proj_raster')) \ - .withColumn('geom', rf_geometry('proj_raster')) - df.select('crs', 'ext', 'geom').first() - - def test_raster_source_reader(self): - # much the same as RasterSourceDataSourceSpec here; but using https PDS. Takes about 30s to run - - def l8path(b): - assert b in range(1, 12) - base = "https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/199/026/LC08_L1TP_199026_20180919_20180928_01_T1/LC08_L1TP_199026_20180919_20180928_01_T1_B{}.TIF" - return base.format(b) - - path_param = '\n'.join([l8path(b) for b in [1, 2, 3]]) # "http://foo.com/file1.tif,http://foo.com/file2.tif" - tile_size = 512 - - df = self.spark.read.raster( - tile_dimensions=(tile_size, tile_size), - paths=path_param, - lazy_tiles=True, - ).cache() - - # schema is tile_path and tile - # df.printSchema() - self.assertTrue(len(df.columns) == 2 and 'proj_raster_path' in df.columns and 'proj_raster' in df.columns) - - # the most common tile dimensions should be as passed to `options`, showing that options are correctly applied - tile_size_df = df.select(rf_dimensions(df.proj_raster).rows.alias('r'), rf_dimensions(df.proj_raster).cols.alias('c')) \ - .groupby(['r', 'c']).count().toPandas() - most_common_size = tile_size_df.loc[tile_size_df['count'].idxmax()] - self.assertTrue(most_common_size.r == tile_size and most_common_size.c == tile_size) - - # all rows are from a single source URI - path_count = df.groupby(df.proj_raster_path).count() - print(path_count.toPandas()) - self.assertTrue(path_count.count() == 3) - - def test_raster_source_reader_schemeless(self): - import os.path - path = os.path.join(self.resource_dir, "L8-B8-Robinson-IL.tiff") - self.assertTrue(not path.startswith('file://')) - df = self.spark.read.raster(path) - self.assertTrue(df.count() > 0) - - def test_raster_source_catalog_reader(self): - import pandas as pd - - scene_dict = { - 1: 'http://landsat-pds.s3.amazonaws.com/c1/L8/015/041/LC08_L1TP_015041_20190305_20190309_01_T1/LC08_L1TP_015041_20190305_20190309_01_T1_B{}.TIF', - 2: 'http://landsat-pds.s3.amazonaws.com/c1/L8/015/042/LC08_L1TP_015042_20190305_20190309_01_T1/LC08_L1TP_015042_20190305_20190309_01_T1_B{}.TIF', - 3: 'http://landsat-pds.s3.amazonaws.com/c1/L8/016/041/LC08_L1TP_016041_20190224_20190309_01_T1/LC08_L1TP_016041_20190224_20190309_01_T1_B{}.TIF', - } - - def path(scene, band): - assert band in range(1, 12) - p = scene_dict[scene] - return p.format(band) - - # Create a pandas dataframe (makes it easy to create spark df) - path_pandas = pd.DataFrame([ - {'b1': path(1, 1), 'b2': path(1, 2), 'b3': path(1, 3)}, - {'b1': path(2, 1), 'b2': path(2, 2), 'b3': path(2, 3)}, - {'b1': path(3, 1), 'b2': path(3, 2), 'b3': path(3, 3)}, - ]) - # comma separated list of column names containing URI's to read. 
- catalog_columns = ','.join(path_pandas.columns.tolist()) # 'b1,b2,b3' - path_table = self.spark.createDataFrame(path_pandas) - - path_df = self.spark.read.raster( - tile_dimensions=(512, 512), - catalog=path_table, - catalog_col_names=catalog_columns, - lazy_tiles=True # We'll get an OOM error if we try to read 9 scenes all at once! - ) - - self.assertTrue(len(path_df.columns) == 6) # three bands times {path, tile} - self.assertTrue(path_df.select('b1_path').distinct().count() == 3) # as per scene_dict - b1_paths_maybe = path_df.select('b1_path').distinct().collect() - b1_paths = [s.format('1') for s in scene_dict.values()] - self.assertTrue(all([row.b1_path in b1_paths for row in b1_paths_maybe])) - - def test_raster_source_catalog_reader_with_pandas(self): - import pandas as pd - import geopandas - from shapely.geometry import Point - - scene_dict = { - 1: 'http://landsat-pds.s3.amazonaws.com/c1/L8/015/041/LC08_L1TP_015041_20190305_20190309_01_T1/LC08_L1TP_015041_20190305_20190309_01_T1_B{}.TIF', - 2: 'http://landsat-pds.s3.amazonaws.com/c1/L8/015/042/LC08_L1TP_015042_20190305_20190309_01_T1/LC08_L1TP_015042_20190305_20190309_01_T1_B{}.TIF', - 3: 'http://landsat-pds.s3.amazonaws.com/c1/L8/016/041/LC08_L1TP_016041_20190224_20190309_01_T1/LC08_L1TP_016041_20190224_20190309_01_T1_B{}.TIF', - } - - def path(scene, band): - assert band in range(1, 12) - p = scene_dict[scene] - return p.format(band) - - # Create a pandas dataframe (makes it easy to create spark df) - path_pandas = pd.DataFrame([ - {'b1': path(1, 1), 'b2': path(1, 2), 'b3': path(1, 3), 'geo': Point(1, 1)}, - {'b1': path(2, 1), 'b2': path(2, 2), 'b3': path(2, 3), 'geo': Point(2, 2)}, - {'b1': path(3, 1), 'b2': path(3, 2), 'b3': path(3, 3), 'geo': Point(3, 3)}, - ]) - - # here a subtle difference with the test_raster_source_catalog_reader test, feed the DataFrame not a CSV and not an already created spark DF. - df = self.spark.read.raster( - catalog=path_pandas, - catalog_col_names=['b1', 'b2', 'b3'] - ) - self.assertEqual(len(df.columns), 7) # three path cols, three tile cols, and geo - self.assertTrue('geo' in df.columns) - self.assertTrue(df.select('b1_path').distinct().count() == 3) - - - # Same test with geopandas - geo_df = geopandas.GeoDataFrame(path_pandas, crs={'init': 'EPSG:4326'}, geometry='geo') - df2 = self.spark.read.raster( - catalog=geo_df, - catalog_col_names=['b1', 'b2', 'b3'] - ) - self.assertEqual(len(df2.columns), 7) # three path cols, three tile cols, and geo - self.assertTrue('geo' in df2.columns) - self.assertTrue(df2.select('b1_path').distinct().count() == 3) - - def suite(): function_tests = unittest.TestSuite() return function_tests diff --git a/pyrasterframes/src/main/python/tests/RasterSourceTest.py b/pyrasterframes/src/main/python/tests/RasterSourceTest.py new file mode 100644 index 000000000..f169c522c --- /dev/null +++ b/pyrasterframes/src/main/python/tests/RasterSourceTest.py @@ -0,0 +1,179 @@ +# +# This software is licensed under the Apache 2 license, quoted below. +# +# Copyright 2019 Astraea, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# [http://www.apache.org/licenses/LICENSE-2.0] +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +# License for the specific language governing permissions and limitations under +# the License. +# +# SPDX-License-Identifier: Apache-2.0 +# + +from pyrasterframes.rasterfunctions import * +from pyrasterframes.rf_types import * +from pyspark.sql.functions import * +import os.path + +from . import TestEnvironment + + +class RasterSourceTest(TestEnvironment): + + @staticmethod + def path(scene, band): + scene_dict = { + 1: 'https://landsat-pds.s3.amazonaws.com/c1/L8/015/041/LC08_L1TP_015041_20190305_20190309_01_T1/LC08_L1TP_015041_20190305_20190309_01_T1_B{}.TIF', + 2: 'https://landsat-pds.s3.amazonaws.com/c1/L8/015/042/LC08_L1TP_015042_20190305_20190309_01_T1/LC08_L1TP_015042_20190305_20190309_01_T1_B{}.TIF', + 3: 'https://landsat-pds.s3.amazonaws.com/c1/L8/016/041/LC08_L1TP_016041_20190224_20190309_01_T1/LC08_L1TP_016041_20190224_20190309_01_T1_B{}.TIF', + } + + assert band in range(1, 12) + assert scene in scene_dict.keys() + p = scene_dict[scene] + return p.format(band) + + def test_handle_lazy_eval(self): + df = self.spark.read.raster(self.path(1, 1)) + ltdf = df.select('proj_raster') + self.assertGreater(ltdf.count(), 0) + self.assertIsNotNone(ltdf.first().proj_raster) + + tdf = df.select(rf_tile('proj_raster').alias('pr')) + self.assertGreater(tdf.count(), 0) + self.assertIsNotNone(tdf.first().pr) + + def test_strict_eval(self): + df_lazy = self.spark.read.raster(self.img_uri, lazy_tiles=True) + # when doing Show on a lazy tile we will see something like RasterRefTile(RasterRef(JVMGeoTiffRasterSource(... + # use this trick to get the `show` string + show_str_lazy = df_lazy.select('proj_raster')._jdf.showString(1, -1, False) + self.assertTrue('RasterRef' in show_str_lazy) + + # again for strict + df_strict = self.spark.read.raster(self.img_uri, lazy_tiles=False) + show_str_strict = df_strict.select('proj_raster')._jdf.showString(1, -1, False) + self.assertTrue('RasterRef' not in show_str_strict) + + def test_prt_functions(self): + df = self.spark.read.raster(self.img_uri) \ + .withColumn('crs', rf_crs('proj_raster')) \ + .withColumn('ext', rf_extent('proj_raster')) \ + .withColumn('geom', rf_geometry('proj_raster')) + df.select('crs', 'ext', 'geom').first() + + def test_list_of_str(self): + # much the same as RasterSourceDataSourceSpec here; but using https PDS. 
Takes about 30s to run + + def l8path(b): + assert b in range(1, 12) + base = "https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/199/026/LC08_L1TP_199026_20180919_20180928_01_T1/LC08_L1TP_199026_20180919_20180928_01_T1_B{}.TIF" + return base.format(b) + + path_param = [l8path(b) for b in [1, 2, 3]] + tile_size = 512 + + df = self.spark.read.raster( + path_param, + tile_dimensions=(tile_size, tile_size), + lazy_tiles=True, + ).cache() + + print(df.take(3)) + + # schema is tile_path and tile + # df.printSchema() + self.assertTrue(len(df.columns) == 2 and 'proj_raster_path' in df.columns and 'proj_raster' in df.columns) + + # the most common tile dimensions should be as passed to `options`, showing that options are correctly applied + tile_size_df = df.select(rf_dimensions(df.proj_raster).rows.alias('r'), rf_dimensions(df.proj_raster).cols.alias('c')) \ + .groupby(['r', 'c']).count().toPandas() + most_common_size = tile_size_df.loc[tile_size_df['count'].idxmax()] + self.assertTrue(most_common_size.r == tile_size and most_common_size.c == tile_size) + + # all rows are from a single source URI + path_count = df.groupby(df.proj_raster_path).count() + print(path_count.collect()) + self.assertTrue(path_count.count() == 3) + + def test_schemeless_string(self): + import os.path + path = os.path.join(self.resource_dir, "L8-B8-Robinson-IL.tiff") + self.assertTrue(not path.startswith('file://')) + self.assertTrue(os.path.exists(path)) + df = self.spark.read.raster(path) + self.assertTrue(df.count() > 0) + + def test_spark_df_source(self): + import pandas as pd + + # Create a pandas dataframe (makes it easy to create spark df) + path_pandas = pd.DataFrame([ + {'b1': self.path(1, 1), 'b2': self.path(1, 2), 'b3': self.path(1, 3)}, + {'b1': self.path(2, 1), 'b2': self.path(2, 2), 'b3': self.path(2, 3)}, + {'b1': self.path(3, 1), 'b2': self.path(3, 2), 'b3': self.path(3, 3)}, + ]) + # comma separated list of column names containing URI's to read. + catalog_columns = path_pandas.columns.tolist() + path_table = self.spark.createDataFrame(path_pandas) + + path_df = self.spark.read.raster( + path_table, + tile_dimensions=(512, 512), + catalog_col_names=catalog_columns, + lazy_tiles=True # We'll get an OOM error if we try to read 9 scenes all at once! + ) + + self.assertTrue(len(path_df.columns) == 6) # three bands times {path, tile} + self.assertTrue(path_df.select('b1_path').distinct().count() == 3) # as per scene_dict + b1_paths_maybe = path_df.select('b1_path').distinct().collect() + b1_paths = [s.format('1') for s in scene_dict.values()] + self.assertTrue(all([row.b1_path in b1_paths for row in b1_paths_maybe])) + + def test_pandas_source(self): + import pandas as pd + import geopandas + from shapely.geometry import Point + + # Create a pandas dataframe (makes it easy to create spark df) + path_pandas = pd.DataFrame([ + {'b1': self.path(1, 1), 'b2': self.path(1, 2), 'b3': self.path(1, 3), 'geo': Point(1, 1)}, + {'b1': self.path(2, 1), 'b2': self.path(2, 2), 'b3': self.path(2, 3), 'geo': Point(2, 2)}, + {'b1': self.path(3, 1), 'b2': self.path(3, 2), 'b3': self.path(3, 3), 'geo': Point(3, 3)}, + ]) + + # here a subtle difference with the test_raster_source_catalog_reader test, feed the DataFrame + # not a CSV and not an already created spark DF. 
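+        # (A pandas DataFrame is not handed to the JVM directly: per the reader
+        # changes above, it is converted to a Spark DataFrame and registered as a
+        # temporary view whose name is passed along in the DataSource options.)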
+ df = self.spark.read.raster( + path_pandas, + catalog_col_names=['b1', 'b2', 'b3'] + ) + self.assertEqual(len(df.columns), 7) # three path cols, three tile cols, and geo + self.assertTrue('geo' in df.columns) + self.assertTrue(df.select('b1_path').distinct().count() == 3) + + # Same test with geopandas + geo_df = geopandas.GeoDataFrame(path_pandas, crs={'init': 'EPSG:4326'}, geometry='geo') + df2 = self.spark.read.raster(geo_df, ['b1', 'b2', 'b3']) + self.assertEqual(len(df2.columns), 7) # three path cols, three tile cols, and geo + self.assertTrue('geo' in df2.columns) + self.assertTrue(df2.select('b1_path').distinct().count() == 3) + + def test_csv_string(self): + + s = f"""metadata,b1,b2 + a,{self.path(1,1)},{self.path(1,2)} + b,{self.path(2,1)},{self.path(2,2)} + c,{self.path(3,1)},{self.path(3,2)} + """ + df = self.spark.read.raster(s, ['b1', 'b2']) + self.assertEqual(len(df.columns), 3 + 2) # number of columns in original DF plus cardinality of catalog_col_names + self.assertTrue(len(df.take(1))) diff --git a/pyrasterframes/src/main/python/tests/__init__.py b/pyrasterframes/src/main/python/tests/__init__.py index 177c9a8c7..722c6fff1 100644 --- a/pyrasterframes/src/main/python/tests/__init__.py +++ b/pyrasterframes/src/main/python/tests/__init__.py @@ -71,7 +71,9 @@ def setUpClass(cls): cls.spark = spark_test_session() - cls.img_uri = 'file://' + os.path.join(cls.resource_dir, 'L8-B8-Robinson-IL.tiff') + cls.img_path = os.path.join(cls.resource_dir, 'L8-B8-Robinson-IL.tiff') + + cls.img_uri = 'file://' + cls.img_path def create_layer(self): from pyrasterframes.rasterfunctions import rf_convert_cell_type @@ -84,4 +86,3 @@ def create_layer(self): self.rf = rf.withColumn('tile2', rf_convert_cell_type('tile', 'float32')) \ .drop('tile') \ .withColumnRenamed('tile2', 'tile').as_layer() - # cls.rf.show() From 087d349aa446b3d44100dc5ef8e0b5adb3b310bc Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Thu, 22 Aug 2019 11:30:26 -0400 Subject: [PATCH 02/48] Fix refactoring in raster source test Signed-off-by: Jason T. Brown --- pyrasterframes/src/main/python/tests/RasterSourceTest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyrasterframes/src/main/python/tests/RasterSourceTest.py b/pyrasterframes/src/main/python/tests/RasterSourceTest.py index f169c522c..ad25873ce 100644 --- a/pyrasterframes/src/main/python/tests/RasterSourceTest.py +++ b/pyrasterframes/src/main/python/tests/RasterSourceTest.py @@ -28,6 +28,7 @@ class RasterSourceTest(TestEnvironment): + @staticmethod def path(scene, band): scene_dict = { @@ -135,7 +136,7 @@ def test_spark_df_source(self): self.assertTrue(len(path_df.columns) == 6) # three bands times {path, tile} self.assertTrue(path_df.select('b1_path').distinct().count() == 3) # as per scene_dict b1_paths_maybe = path_df.select('b1_path').distinct().collect() - b1_paths = [s.format('1') for s in scene_dict.values()] + b1_paths = [self.path(s, 1) for s in [1, 2, 3]] self.assertTrue(all([row.b1_path in b1_paths for row in b1_paths_maybe])) def test_pandas_source(self): From 82fd90f1381d0aa18c4e54a64f2d72220631ae02 Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Thu, 22 Aug 2019 11:41:20 -0400 Subject: [PATCH 03/48] RasterSourceTest not use format string in CI Signed-off-by: Jason T. 
Brown --- .../src/main/python/tests/RasterSourceTest.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pyrasterframes/src/main/python/tests/RasterSourceTest.py b/pyrasterframes/src/main/python/tests/RasterSourceTest.py index ad25873ce..238ca279f 100644 --- a/pyrasterframes/src/main/python/tests/RasterSourceTest.py +++ b/pyrasterframes/src/main/python/tests/RasterSourceTest.py @@ -130,7 +130,7 @@ def test_spark_df_source(self): path_table, tile_dimensions=(512, 512), catalog_col_names=catalog_columns, - lazy_tiles=True # We'll get an OOM error if we try to read 9 scenes all at once! + lazy_tiles=True # We'll get an OOM error if we try to read 9 scenes all at once! ) self.assertTrue(len(path_df.columns) == 6) # three bands times {path, tile} @@ -170,11 +170,16 @@ def test_pandas_source(self): def test_csv_string(self): - s = f"""metadata,b1,b2 - a,{self.path(1,1)},{self.path(1,2)} - b,{self.path(2,1)},{self.path(2,2)} - c,{self.path(3,1)},{self.path(3,2)} - """ + s = """metadata,b1,b2 + a,{},{} + b,{},{} + c,{},{} + """.format( + self.path(1, 1), self.path(1, 2), + self.path(2, 1), self.path(2, 2), + self.path(3, 1), self.path(3, 2), + ) + df = self.spark.read.raster(s, ['b1', 'b2']) self.assertEqual(len(df.columns), 3 + 2) # number of columns in original DF plus cardinality of catalog_col_names self.assertTrue(len(df.take(1))) From 2ebd85a6f99638d3a7d5f1aaccdbf7dc2b1d15c7 Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Thu, 22 Aug 2019 13:35:19 -0400 Subject: [PATCH 04/48] Fix list of str interpretation in python raster reader Signed-off-by: Jason T. Brown --- pyrasterframes/src/main/python/pyrasterframes/__init__.py | 7 ++++--- pyrasterframes/src/main/python/tests/RasterSourceTest.py | 6 +++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pyrasterframes/src/main/python/pyrasterframes/__init__.py b/pyrasterframes/src/main/python/pyrasterframes/__init__.py index e2702fb8c..a36ec59e9 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/__init__.py +++ b/pyrasterframes/src/main/python/pyrasterframes/__init__.py @@ -158,9 +158,10 @@ def temp_name(): # Parse the `source` argument path = None # to pass into `path` param if isinstance(source, list): - path = None - catalog = None - options.update(dict(paths='\n'.join(str(source)))) + if all([isinstance(i, str) for i in source]): + path = None + catalog = None + options.update(dict(paths='\n'.join([str(i) for i in source]))) # pass in "uri1\nuri2\nuri3\n..." elif isinstance(source, str): if '\n' in source or '\r' in source: # then the `source` string is a catalog as a CSV (header is required) diff --git a/pyrasterframes/src/main/python/tests/RasterSourceTest.py b/pyrasterframes/src/main/python/tests/RasterSourceTest.py index 238ca279f..c8757042a 100644 --- a/pyrasterframes/src/main/python/tests/RasterSourceTest.py +++ b/pyrasterframes/src/main/python/tests/RasterSourceTest.py @@ -22,7 +22,7 @@ from pyrasterframes.rf_types import * from pyspark.sql.functions import * import os.path - +from unittest import skip from . import TestEnvironment @@ -105,6 +105,10 @@ def l8path(b): print(path_count.collect()) self.assertTrue(path_count.count() == 3) + @skip('not implemented yet') + def test_list_of_list_of_str(self): + 0 + def test_schemeless_string(self): import os.path path = os.path.join(self.resource_dir, "L8-B8-Robinson-IL.tiff") From 5f97322c052b995639c834e05e735c56ee79e2f4 Mon Sep 17 00:00:00 2001 From: "Jason T. 
Brown" Date: Thu, 22 Aug 2019 14:05:12 -0400 Subject: [PATCH 05/48] Implement py raster read list of lists Signed-off-by: Jason T. Brown --- .../src/main/python/pyrasterframes/__init__.py | 13 ++++++++++++- .../src/main/python/tests/RasterSourceTest.py | 16 +++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/pyrasterframes/src/main/python/pyrasterframes/__init__.py b/pyrasterframes/src/main/python/pyrasterframes/__init__.py index a36ec59e9..fe8bc8ff5 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/__init__.py +++ b/pyrasterframes/src/main/python/pyrasterframes/__init__.py @@ -162,6 +162,16 @@ def temp_name(): path = None catalog = None options.update(dict(paths='\n'.join([str(i) for i in source]))) # pass in "uri1\nuri2\nuri3\n..." + if all([isinstance(i, list) for i in source]): + # list of lists; we will rely on pandas to + # - coerce all data to str (possibly using objects' __str__ or __repr__\ + # - ensure data is not "ragged": all sublists are same len + path = None + catalog_col_names = ['proj_raster_{}'.format(i) for i in range(len(source[0]))] + catalog = PdDataFrame(source, + columns=catalog_col_names, + dtype=str, + ) elif isinstance(source, str): if '\n' in source or '\r' in source: # then the `source` string is a catalog as a CSV (header is required) @@ -172,12 +182,13 @@ def temp_name(): path = source catalog = None else: - # user has passed in some other type, we will interpret as a catalog + # user has passed in some other type, we will try to interpret as a catalog catalog = source if catalog is not None: if catalog_col_names is None: raise Exception("'catalog_col_names' required when DataFrame 'catalog' specified") + if isinstance(catalog, str): options.update({ "catalogCSV": catalog, diff --git a/pyrasterframes/src/main/python/tests/RasterSourceTest.py b/pyrasterframes/src/main/python/tests/RasterSourceTest.py index c8757042a..eb1b0f986 100644 --- a/pyrasterframes/src/main/python/tests/RasterSourceTest.py +++ b/pyrasterframes/src/main/python/tests/RasterSourceTest.py @@ -105,9 +105,19 @@ def l8path(b): print(path_count.collect()) self.assertTrue(path_count.count() == 3) - @skip('not implemented yet') def test_list_of_list_of_str(self): - 0 + lol = [ + [self.path(1, 1), self.path(1, 2), ], + [self.path(2, 1), self.path(2, 2), ], + [self.path(3, 1), self.path(3, 2), ] + ] + df = self.spark.read.raster(lol) + self.assertTrue(len(df.columns) == 4) # 2 cols of uris plus 2 cols of proj_rasters + self.assertEqual(sorted(df.columns), sorted(['proj_raster_0_path', 'proj_raster_1_path', + 'proj_raster_0', 'proj_raster_1'])) + uri_df = df.select('proj_raster_0_path', 'proj_raster_1_path').distinct().collect() + uri_list = [list(r.asDict().values()) for r in uri_df] + self.assertEqual(sorted(uri_list), sorted(lol)) def test_schemeless_string(self): import os.path @@ -186,4 +196,4 @@ def test_csv_string(self): df = self.spark.read.raster(s, ['b1', 'b2']) self.assertEqual(len(df.columns), 3 + 2) # number of columns in original DF plus cardinality of catalog_col_names - self.assertTrue(len(df.take(1))) + self.assertTrue(len(df.take(1))) # non-empty check From 7fb7bcc75968a9ec4fd89ffc75626105482a5490 Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Thu, 22 Aug 2019 14:07:07 -0400 Subject: [PATCH 06/48] Docstring for raster reader about list of lists Signed-off-by: Jason T. 
Brown --- pyrasterframes/src/main/python/pyrasterframes/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyrasterframes/src/main/python/pyrasterframes/__init__.py b/pyrasterframes/src/main/python/pyrasterframes/__init__.py index fe8bc8ff5..5f08b0ec3 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/__init__.py +++ b/pyrasterframes/src/main/python/pyrasterframes/__init__.py @@ -124,7 +124,7 @@ def _raster_reader( For more details and example usage, consult https://rasterframes.io/raster-read.html - :param source: a string, list of strings, a pandas DataFrame or a Spark DataFrame giving URIs to the raster data to read + :param source: a string, list of strings, list of lists of strings, a pandas DataFrame or a Spark DataFrame giving URIs to the raster data to read :param catalog_col_names: required if source is a DataFrame or CSV string. It is a list of strings giving the names of columns containing URIs to read :param band_indexes: list of integers indicating which bands, zero-based, to read from the raster files specified; default is to read only the first band :param tile_dimensions: tuple or list of two indicating the default tile dimension as (columns, rows) From 902e7946a6c5d10ba3e54515a1315212e3f67d82 Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Fri, 23 Aug 2019 13:06:05 -0400 Subject: [PATCH 07/48] Tweaks to docstring for raster reader Signed-off-by: Jason T. Brown --- pyrasterframes/src/main/python/pyrasterframes/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyrasterframes/src/main/python/pyrasterframes/__init__.py b/pyrasterframes/src/main/python/pyrasterframes/__init__.py index 5f08b0ec3..730af5866 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/__init__.py +++ b/pyrasterframes/src/main/python/pyrasterframes/__init__.py @@ -117,7 +117,7 @@ def _raster_reader( lazy_tiles=True, **options): """ - Returns a Spark DataFrame from a raster data files specified by URI pointers + Returns a Spark DataFrame from raster data files specified by URI pointers The returned DataFrame will have a column of (CRS, Extent, Tile) for each URI read Multiple bands from the same raster file are spread across rows of the DataFrame. See band_indexes param. If bands from a scene are stored in separate files, provide a DataFrame to the `source` parameter. Each row in the returned DataFrame will contain one (CRS, Extent, Tile) for each item in `catalog_col_names` @@ -130,7 +130,6 @@ def _raster_reader( :param tile_dimensions: tuple or list of two indicating the default tile dimension as (columns, rows) :param lazy_tiles: If true (default) only generate minimal references to tile contents; if false, fetch tile cell values :param options: Additional keyword arguments to pass to the spark DataSource - :return: """ from pandas import DataFrame as PdDataFrame From f1e464611f09b29dd95ad7f41a582c208c5c8a19 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Fri, 23 Aug 2019 14:36:34 -0400 Subject: [PATCH 08/48] Bumped dev version. 
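Taken together, the reader patches above make `spark.read.raster` dispatch on the type of its single `source` argument. A minimal usage sketch of the accepted forms, per the docstring and tests in this series (the URI is a placeholder, and an initialized RasterFrames Spark session is assumed):

```python
import pandas as pd
import pyrasterframes

spark = pyrasterframes.get_spark_session()
uri = 'https://example.com/scene_B1.TIF'  # placeholder URI

df1 = spark.read.raster(uri)                  # single URI string
df2 = spark.read.raster([uri, uri])           # list of URI strings
df3 = spark.read.raster([[uri], [uri]])       # list of lists: auto-named catalog columns
csv = 'b1,b2\n{},{}'.format(uri, uri)         # CSV catalog string (header required)
df4 = spark.read.raster(csv, catalog_col_names=['b1', 'b2'])
pdf = pd.DataFrame([{'b1': uri, 'b2': uri}])  # pandas (or Spark) DataFrame catalog
df5 = spark.read.raster(pdf, catalog_col_names=['b1', 'b2'])
```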
--- pyrasterframes/src/main/python/pyrasterframes/version.py | 2 +- version.sbt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyrasterframes/src/main/python/pyrasterframes/version.py b/pyrasterframes/src/main/python/pyrasterframes/version.py index 2e80e1a85..55b8dfe06 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/version.py +++ b/pyrasterframes/src/main/python/pyrasterframes/version.py @@ -20,4 +20,4 @@ # # Translating Java version from version.sbt to PEP440 norms -__version__ = '0.8.1' +__version__ = '0.8.2.dev0' diff --git a/version.sbt b/version.sbt index ccb5c052c..05d6fc967 100644 --- a/version.sbt +++ b/version.sbt @@ -1 +1 @@ -version in ThisBuild := "0.8.1" +version in ThisBuild := "0.8.2-SNAPSHOT" From 6aa3c532fa9ba46a29925f950722d8ae99980e1c Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Fri, 23 Aug 2019 16:13:31 -0400 Subject: [PATCH 09/48] Display-oriented tweaks to docs Signed-off-by: Jason T. Brown --- .../src/main/python/docs/aggregation.pymd | 59 +++++++++++-------- .../docs/{description.md => description.pymd} | 27 ++++++++- .../python/docs/static/rasterframe-sample.md | 48 --------------- .../src/main/python/docs/vector-data.pymd | 19 ++---- 4 files changed, 65 insertions(+), 88 deletions(-) rename pyrasterframes/src/main/python/docs/{description.md => description.pymd} (88%) delete mode 100644 pyrasterframes/src/main/python/docs/static/rasterframe-sample.md diff --git a/pyrasterframes/src/main/python/docs/aggregation.pymd b/pyrasterframes/src/main/python/docs/aggregation.pymd index 875bf0027..554d487a9 100644 --- a/pyrasterframes/src/main/python/docs/aggregation.pymd +++ b/pyrasterframes/src/main/python/docs/aggregation.pymd @@ -8,6 +8,9 @@ from pyrasterframes.rasterfunctions import * from pyspark.sql import * import os +import numpy as np +np.set_printoptions(precision=3, floatmode='maxprec') + spark = create_rf_spark_session() ``` @@ -15,33 +18,43 @@ There are three types of aggregate functions: _tile_ aggregate, DataFrame aggreg ## Tile Mean Example -We can illustrate aggregate differences by computing an aggregate mean. First, we create a sample DataFrame of 2 _tiles_ where the first _tile_ is composed of 25 values of 1.0 and the second _tile_ is composed of 25 values of 3.0. +We can illustrate aggregate differences by computing an aggregate mean. First, we create a sample DataFrame of 2 _tiles_. The _tiles_ will contain normally distributed cell values with the first row's mean at 1.0 and the second row's mean at 3.0. For details on use of the `Tile` class see @ref:[the page on numpy interoperability](numpy-pandas.md). -```python, sql_dataframe -import pyspark.sql.functions as F +```python, create_tile1 +from pyrasterframes.rf_types import Tile, CellType -df1 = spark.range(1).select('id', rf_make_ones_tile(5, 5, 'float32').alias('tile')) -df2 = spark.range(1).select('id', rf_local_multiply(rf_make_ones_tile(5, 5, 'float32'), F.lit(3)).alias('tile')) +t1 = Tile(1 + 0.1 * np.random.randn(5,5), CellType('float64raw')) -rf = df1.union(df2) +t1.cells # display the array in the Tile +``` -tiles = rf.select("tile").collect() -print(tiles[0]['tile'].cells) -print(tiles[1]['tile'].cells) +```python, showt5 +t5 = Tile(5 + 0.1 * np.random.randn(5,5), CellType('float64raw')) +t5.cells ``` -We use the @ref:[`rf_tile_mean`](reference.md#rf-tile-mean) function to compute the _tile_ aggregate mean of cells in each row of column `tile`. The mean of each _tile_ is computed separately, so the first mean is 1.0 and the second mean is 3.0. 
Notice that the number of rows in the DataFrame is the same before and after the aggregation. +Create a Spark DataFrame from the Tile objects. + +```python, create_dataframe +import pyspark.sql.functions as F +from pyspark.sql import Row + +rf = spark.createDataFrame([ + Row(id=1, tile=t1), + Row(id=2, tile=t5) +]).orderBy('id') +``` + +We use the @ref:[`rf_tile_mean`](reference.md#rf-tile-mean) function to compute the _tile_ aggregate mean of cells in each row of column `tile`. The mean of each _tile_ is computed separately, so the first mean is about 1.0 and the second mean is about 3.0. Notice that the number of rows in the DataFrame is the same before and after the aggregation. ```python, tile_mean -means = rf.select(F.col('id'), rf_tile_mean(F.col('tile'))) -means +rf.select(F.col('id'), rf_tile_mean(F.col('tile'))) ``` -We use the @ref:[`rf_agg_mean`](reference.md#rf-agg-mean) function to compute the DataFrame aggregate, which averages 25 values of 1.0 and 25 values of 3.0, across the fifty cells in two rows. Note that only a single row is returned since the average is computed over the full DataFrame. +We use the @ref:[`rf_agg_mean`](reference.md#rf-agg-mean) function to compute the DataFrame aggregate, which averages values across the fifty cells in two rows. Note that only a single row is returned since the average is computed over the full DataFrame. ```python, agg_mean -mean = rf.agg(rf_agg_mean(F.col('tile'))) -mean +rf.agg(rf_agg_mean(F.col('tile'))) ``` We use the @ref:[`rf_agg_local_mean`](reference.md#rf-agg-local-mean) function to compute the element-wise local aggregate mean across the two rows. For this aggregation, we are computing the mean of one value of 1.0 and one value of 3.0 to arrive at the element-wise mean, but doing so twenty-five times, one for each position in the _tile_. @@ -49,9 +62,8 @@ We use the @ref:[`rf_agg_local_mean`](reference.md#rf-agg-local-mean) function t To compute an element-wise local aggregate, _tiles_ need to have the same dimensions. In this case, both _tiles_ have 5 rows and 5 columns. If we tried to compute an element-wise local aggregate over the DataFrame without equal _tile_ dimensions, we would get a runtime error. ```python, local_mean -t = rf.agg(rf_agg_local_mean(F.col('tile')).alias('local_mean')) \ - .collect()[0]['local_mean'] -print(t.cells) +rf.agg(rf_agg_local_mean('tile')) \ + .first()[0].cells.data # display the contents of the Tile array ``` ## Cell Counts Example @@ -92,12 +104,11 @@ stats The @ref:[`rf_agg_local_stats`](reference.md#rf-agg-local-stats) function computes the element-wise local aggregate statistical summary as shown below. The DataFrame used in the previous two code blocks has unequal _tile_ dimensions, so a different DataFrame is used in this code block to avoid a runtime error. 
```python, agg_local_stats -df1 = spark.range(1).select('id', rf_make_ones_tile(5, 5, 'float32').alias('tile')) -df2 = spark.range(1).select('id', rf_make_constant_tile(3, 5, 5, 'float32').alias('tile')) -df3 = spark.range(1).select('id', rf_make_constant_tile(5, 5, 5, 'float32').alias('tile')) - -rf = df1.union(df2).union(df3) \ - .agg(rf_agg_local_stats('tile').alias('stats')) +rf = spark.createDataFrame([ + Row(id=1, tile=t1), + Row(id=3, tile=t1 * 3), + Row(id=5, tile=t1 * 5) +]).agg(rf_agg_local_stats('tile').alias('stats')) agg_local_stats = rf.select('stats.min', 'stats.max', 'stats.mean', 'stats.variance').collect() diff --git a/pyrasterframes/src/main/python/docs/description.md b/pyrasterframes/src/main/python/docs/description.pymd similarity index 88% rename from pyrasterframes/src/main/python/docs/description.md rename to pyrasterframes/src/main/python/docs/description.pymd index 8db6347af..e406f1657 100644 --- a/pyrasterframes/src/main/python/docs/description.md +++ b/pyrasterframes/src/main/python/docs/description.pymd @@ -1,5 +1,28 @@ # Overview +```python, setup, echo=False +import pyrasterframes +import pyrasterframes.rf_ipython +from pyrasterframes.rasterfunctions import rf_crs, rf_extent, rf_tile, rf_data_cells +from pyspark.sql.functions import col, lit +spark = pyrasterframes.get_spark_session() + +# Note that this is the same URI as in the getting started page... +df = spark.read.raster('https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/MCD43A4.A2019059.h11v08.006.2019072203257_B02.TIF') + + +df = df.select( + lit('2019-02-28').alias('timestamp'), + rf_crs('proj_raster').alias('crs'), + rf_extent('proj_raster').alias('extent'), + col('proj_raster').alias('tile') +) \ + .orderBy(-rf_data_cells('tile')) \ + .limit(4) + +assert df.select('crs').first() is not None, "example dataframe is going to be empty" +``` + RasterFrames® provides a DataFrame-centric view over arbitrary Earth-observation (EO) data, enabling spatiotemporal queries, map algebra raster operations, and compatibility with the ecosystem of [Apache Spark](https://spark.apache.org/docs/latest/) [ML](https://spark.apache.org/docs/latest/ml-guide.html) algorithms. It provides APIs in @ref:[Python, SQL, and Scala](languages.md), and can scale from a laptop computer to a large distributed cluster, enabling _global_ analysis with satellite imagery in a wholly new, flexible, and convenient way. ## Context @@ -29,7 +52,9 @@ RasterFrames introduces georectified raster imagery to Spark SQL. It quantizes s As shown in the figure below, a "RasterFrame" is a Spark DataFrame with one or more columns of type @ref:[_tile_](concepts.md#tile). A _tile_ column typically represents a single frequency band of sensor data, such as "blue" or "near infrared", but can also be quality assurance information, land classification assignments, or any other raster spatial data. Along with _tile_ columns there is typically an @ref:[`extent`](concepts.md#extent) specifying the geographic location of the data, the map projection of that geometry (@ref:[`crs`](concepts.md#coordinate-reference-system--crs-)), and a `timestamp` column representing the acquisition time. These columns can all be used in the `WHERE` clause when filtering. -@@include[RasterFrame Example](static/rasterframe-sample.md) +```python show_example_df, echo=False +df +``` RasterFrames also includes support for working with vector data, such as [GeoJSON][GeoJSON]. 
RasterFrames vector data operations let you filter with geospatial relationships like contains or intersects, mask cells, convert vectors to rasters, and more. diff --git a/pyrasterframes/src/main/python/docs/static/rasterframe-sample.md b/pyrasterframes/src/main/python/docs/static/rasterframe-sample.md deleted file mode 100644 index 2d850a31f..000000000 --- a/pyrasterframes/src/main/python/docs/static/rasterframe-sample.md +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-[static HTML table deleted here: a sample RasterFrame rendering with columns
- timestamp, crs, extent, and tile; five rows dated 2019-02-28, all in the MODIS
- sinusoidal CRS (+proj=sinu +lon_0=0.0 +x_0=0.0 +y_0=0.0 +a=6371007.181
- +b=6371007.181 +units=m), with extents stepping west to east from
- x = -7783653.64 to x = -7190613.36 between y = 993342.46 and y = 1111950.52;
- the tile thumbnails are not recoverable]
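The overview in `description.pymd` above notes that the `extent`, `crs`, and `timestamp` columns can all be used in the `WHERE` clause; a short sketch against the `df` assembled in its setup block (the extent threshold is an arbitrary illustration):

```python
from pyspark.sql.functions import col

filtered = (df
    .where(col('timestamp') == '2019-02-28')  # temporal predicate
    .where(col('extent.xmin') > -7.7e6))      # spatial predicate on the extent struct
filtered.count()
```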
diff --git a/pyrasterframes/src/main/python/docs/vector-data.pymd b/pyrasterframes/src/main/python/docs/vector-data.pymd index 2c66b1562..99537d1e5 100644 --- a/pyrasterframes/src/main/python/docs/vector-data.pymd +++ b/pyrasterframes/src/main/python/docs/vector-data.pymd @@ -57,37 +57,26 @@ Since it is a geometry we can do things like this: the_first['geometry'].wkt ``` -You can also write user-defined functions that take geometries as input, output, or both, via user defined types in the [geomesa_pyspark.types](https://github.com/locationtech/rasterframes/blob/develop/pyrasterframes/src/main/python/geomesa_pyspark/types.py) module. Here is a simple example of a user-defined function that uses both a geometry input and output to compute the centroid of a geometry. +You can also write user-defined functions that take geometries as input, output, or both, via user defined types in the [geomesa_pyspark.types](https://github.com/locationtech/rasterframes/blob/develop/pyrasterframes/src/main/python/geomesa_pyspark/types.py) module. Here is a simple **but inefficient** example of a user-defined function that uses both a geometry input and output to compute the centroid of a geometry. Observe in a sample of the data the geometry columns print as well known text (wkt). ```python, add_centroid from pyspark.sql.functions import udf from geomesa_pyspark.types import PointUDT @udf(PointUDT()) -def get_centroid(g): +def inefficient_centroid(g): return g.centroid -df = df.withColumn('naive_centroid', get_centroid(df.geometry)) -df.printSchema() -``` - -We can take a look at a sample of the data. Notice the geometry columns print as well known text (wkt). - -```python, show_centroid -df.limit(3) +df.select(df.state_code, inefficient_centroid(df.geometry)) ``` - ## GeoMesa Functions and Spatial Relations As documented in the @ref:[function reference](reference.md), various user-defined functions implemented by GeoMesa are also available for use. The example below uses a GeoMesa user-defined function to compute the centroid of a geometry. It is logically equivalent to the example above, but more efficient. - ```python, native_centroid from pyrasterframes.rasterfunctions import st_centroid -df = df.withColumn('centroid', st_centroid(df.geometry)) -centroids = df.select('geometry', 'name', 'naive_centroid', 'centroid') -centroids.limit(3) +df.select(df.state_code, inefficient_centroid(df.geometry), st_centroid(df.geometry)) ``` The RasterFrames vector functions and GeoMesa functions also provide a variety of spatial relations that are useful in combination with the geometric properties of projected rasters. In this example, we use the @ref:[built-in Landsat catalog](raster-catalogs.md#using-built-in-experimental-catalogs) which provides an extent. We will convert the extent to a polygon and filter to those within approximately 500 km of a selected point. From b063698f07803b25c80b953391159cbac23cb4fb Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Wed, 28 Aug 2019 20:32:29 -0400 Subject: [PATCH 10/48] Reworked rf_tile to accept RasterRef as argument. Wrote unit tests for rf_tile. 
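The user-visible effect of this change, sketched in Python with the reader from the earlier patches (`img_uri` stands in for any readable GeoTIFF URI):

```python
from pyrasterframes.rasterfunctions import rf_tile

# With lazy_tiles=True the tile column carries RasterRef placeholders;
# rf_tile forces the referenced cells to be fetched as a concrete Tile.
df = spark.read.raster(img_uri, lazy_tiles=True)
realized = df.select(rf_tile('proj_raster').alias('tile'))
realized.first()
```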
--- .../expressions/DynamicExtractors.scala | 10 +++- .../expressions/accessors/RealizeTile.scala | 22 ++++++--- .../rasterframes/RasterFunctionsSpec.scala | 33 +------------ .../locationtech/rasterframes/TestData.scala | 34 +++++++++++++- .../encoders/CatalystSerializerSpec.scala | 4 +- .../rasterframes/ref/RasterRefSpec.scala | 47 ++++++++++++++++--- .../geotrellis/GeoTrellisDataSourceSpec.scala | 3 +- 7 files changed, 103 insertions(+), 50 deletions(-) diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/DynamicExtractors.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/DynamicExtractors.scala index 6a7e6e421..834c3aac1 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/DynamicExtractors.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/DynamicExtractors.scala @@ -35,7 +35,7 @@ import org.locationtech.rasterframes.tiles.ProjectedRasterTile private[rasterframes] object DynamicExtractors { - /** Partial function for pulling a tile and its contesxt from an input row. */ + /** Partial function for pulling a tile and its context from an input row. */ lazy val tileExtractor: PartialFunction[DataType, InternalRow => (Tile, Option[TileContext])] = { case _: TileUDT => (row: InternalRow) => @@ -47,6 +47,14 @@ object DynamicExtractors { } } + lazy val rasterRefExtractor: PartialFunction[DataType, InternalRow => RasterRef] = { + case t if t.conformsTo[RasterRef] => + (row: InternalRow) => row.to[RasterRef] + } + + lazy val tileableExtractor: PartialFunction[DataType, InternalRow => Tile] = + tileExtractor.andThen(_.andThen(_._1)).orElse(rasterRefExtractor.andThen(_.andThen(_.tile))) + lazy val rowTileExtractor: PartialFunction[DataType, Row => (Tile, Option[TileContext])] = { case _: TileUDT => (row: Row) => (row.to[Tile](TileUDT.tileSerializer), None) diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/accessors/RealizeTile.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/accessors/RealizeTile.scala index d8c9f0ba6..34c794d92 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/accessors/RealizeTile.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/accessors/RealizeTile.scala @@ -22,15 +22,17 @@ package org.locationtech.rasterframes.expressions.accessors import geotrellis.raster.Tile +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult +import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback -import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription} +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionDescription, UnaryExpression} import org.apache.spark.sql.rf.TileUDT import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Column, TypedColumn} import org.locationtech.rasterframes._ import org.locationtech.rasterframes.encoders.CatalystSerializer._ -import org.locationtech.rasterframes.expressions.UnaryRasterOp -import org.locationtech.rasterframes.model.TileContext +import org.locationtech.rasterframes.expressions.DynamicExtractors._ +import org.locationtech.rasterframes.expressions._ @ExpressionDescription( usage = "_FUNC_(raster) - Extracts the Tile component of a RasterSource, ProjectedRasterTile (or Tile) and ensures the cells are fully fetched.", @@ -39,14 +41,22 @@ import org.locationtech.rasterframes.model.TileContext > 
SELECT _FUNC_(raster); .... """) -case class RealizeTile(child: Expression) extends UnaryRasterOp with CodegenFallback { +case class RealizeTile(child: Expression) extends UnaryExpression with CodegenFallback { override def dataType: DataType = TileType override def nodeName: String = "rf_tile" - implicit val tileSer = TileUDT.tileSerializer - override protected def eval(tile: Tile, ctx: Option[TileContext]): Any = + override def checkInputDataTypes(): TypeCheckResult = { + if (!tileableExtractor.isDefinedAt(child.dataType)) { + TypeCheckFailure(s"Input type '${child.dataType}' does not conform to a tiled raster type.") + } else TypeCheckSuccess + } + implicit val tileSer = TileUDT.tileSerializer + override protected def nullSafeEval(input: Any): Any = { + val in = row(input) + val tile = tileableExtractor(child.dataType)(in) (tile.toArrayTile(): Tile).toInternalRow + } } object RealizeTile { diff --git a/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala index cab2b1f17..ed7fc1dcc 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala @@ -23,53 +23,22 @@ package org.locationtech.rasterframes import java.io.ByteArrayInputStream -import geotrellis.proj4.LatLng import geotrellis.raster import geotrellis.raster._ import geotrellis.raster.render.ColorRamps import geotrellis.raster.testkit.RasterMatchers -import geotrellis.vector.Extent import javax.imageio.ImageIO import org.apache.spark.sql.Encoders import org.apache.spark.sql.functions._ import org.locationtech.rasterframes.expressions.accessors.ExtractTile import org.locationtech.rasterframes.model.TileDimensions -import org.locationtech.rasterframes.ref.{RasterRef, RasterSource} import org.locationtech.rasterframes.stats._ import org.locationtech.rasterframes.tiles.ProjectedRasterTile class RasterFunctionsSpec extends TestEnvironment with RasterMatchers { + import TestData._ import spark.implicits._ - val extent = Extent(10, 20, 30, 40) - val crs = LatLng - val ct = ByteUserDefinedNoDataCellType(-2) - val cols = 10 - val rows = cols - val tileSize = cols * rows - val tileCount = 10 - val numND = 4 - lazy val zero = TestData.projectedRasterTile(cols, rows, 0, extent, crs, ct) - lazy val one = TestData.projectedRasterTile(cols, rows, 1, extent, crs, ct) - lazy val two = TestData.projectedRasterTile(cols, rows, 2, extent, crs, ct) - lazy val three = TestData.projectedRasterTile(cols, rows, 3, extent, crs, ct) - lazy val six = ProjectedRasterTile(three * two, three.extent, three.crs) - lazy val nd = TestData.projectedRasterTile(cols, rows, -2, extent, crs, ct) - lazy val randPRT = TestData.projectedRasterTile(cols, rows, scala.util.Random.nextInt(), extent, crs, ct) - lazy val randNDPRT: Tile = TestData.injectND(numND)(randPRT) - - lazy val randDoubleTile = TestData.projectedRasterTile(cols, rows, scala.util.Random.nextGaussian(), extent, crs, DoubleConstantNoDataCellType) - lazy val randDoubleNDTile = TestData.injectND(numND)(randDoubleTile) - lazy val randPositiveDoubleTile = TestData.projectedRasterTile(cols, rows, scala.util.Random.nextDouble() + 1e-6, extent, crs, DoubleConstantNoDataCellType) - - val expectedRandNoData: Long = numND * tileCount.toLong - val expectedRandData: Long = cols * rows * tileCount - expectedRandNoData - lazy val randNDTilesWithNull = Seq.fill[Tile](tileCount)(TestData.injectND(numND)( - 
TestData.randomTile(cols, rows, UByteConstantNoDataCellType) - )).map(ProjectedRasterTile(_, extent, crs)) :+ null - - def lazyPRT = RasterRef(RasterSource(TestData.l8samplePath), 0, None, None).tile - implicit val pairEnc = Encoders.tuple(ProjectedRasterTile.prtEncoder, ProjectedRasterTile.prtEncoder) implicit val tripEnc = Encoders.tuple(ProjectedRasterTile.prtEncoder, ProjectedRasterTile.prtEncoder, ProjectedRasterTile.prtEncoder) diff --git a/core/src/test/scala/org/locationtech/rasterframes/TestData.scala b/core/src/test/scala/org/locationtech/rasterframes/TestData.scala index 1b6b373e9..00862d629 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/TestData.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/TestData.scala @@ -38,8 +38,8 @@ import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.locationtech.jts.geom.{Coordinate, GeometryFactory} import org.locationtech.rasterframes.expressions.tilestats.NoDataCells +import org.locationtech.rasterframes.ref.{RasterRef, RasterSource} import org.locationtech.rasterframes.tiles.ProjectedRasterTile -import spray.json.JsObject import scala.reflect.ClassTag @@ -49,8 +49,15 @@ import scala.reflect.ClassTag * @since 4/3/17 */ trait TestData { + val extent = Extent(10, 20, 30, 40) + val crs = LatLng + val ct = ByteUserDefinedNoDataCellType(-2) + val cols = 10 + val rows = cols + val tileSize = cols * rows + val tileCount = 10 + val numND = 4 val instant = ZonedDateTime.now() - val extent = Extent(1, 2, 3, 4) val sk = SpatialKey(37, 41) val stk = SpaceTimeKey(sk, instant) val pe = ProjectedExtent(extent, LatLng) @@ -153,6 +160,29 @@ trait TestData { lazy val l8samplePath: URI = getClass.getResource("/L8-B1-Elkton-VA.tiff").toURI lazy val modisConvertedMrfPath: URI = getClass.getResource("/MCD43A4.A2019111.h30v06.006.2019120033434_01.mrf").toURI + + + lazy val zero = TestData.projectedRasterTile(cols, rows, 0, extent, crs, ct) + lazy val one = TestData.projectedRasterTile(cols, rows, 1, extent, crs, ct) + lazy val two = TestData.projectedRasterTile(cols, rows, 2, extent, crs, ct) + lazy val three = TestData.projectedRasterTile(cols, rows, 3, extent, crs, ct) + lazy val six = ProjectedRasterTile(three * two, three.extent, three.crs) + lazy val nd = TestData.projectedRasterTile(cols, rows, -2, extent, crs, ct) + lazy val randPRT = TestData.projectedRasterTile(cols, rows, scala.util.Random.nextInt(), extent, crs, ct) + lazy val randNDPRT: Tile = TestData.injectND(numND)(randPRT) + + lazy val randDoubleTile = TestData.projectedRasterTile(cols, rows, scala.util.Random.nextGaussian(), extent, crs, DoubleConstantNoDataCellType) + lazy val randDoubleNDTile = TestData.injectND(numND)(randDoubleTile) + lazy val randPositiveDoubleTile = TestData.projectedRasterTile(cols, rows, scala.util.Random.nextDouble() + 1e-6, extent, crs, DoubleConstantNoDataCellType) + + val expectedRandNoData: Long = numND * tileCount.toLong + val expectedRandData: Long = cols * rows * tileCount - expectedRandNoData + lazy val randNDTilesWithNull = Seq.fill[Tile](tileCount)(TestData.injectND(numND)( + TestData.randomTile(cols, rows, UByteConstantNoDataCellType) + )).map(ProjectedRasterTile(_, extent, crs)) :+ null + + def lazyPRT = RasterRef(RasterSource(TestData.l8samplePath), 0, None, None).tile + object GeomData { val fact = new GeometryFactory() val c1 = new Coordinate(1, 2) diff --git a/core/src/test/scala/org/locationtech/rasterframes/encoders/CatalystSerializerSpec.scala 
b/core/src/test/scala/org/locationtech/rasterframes/encoders/CatalystSerializerSpec.scala index 34839ab2d..a3f50693b 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/encoders/CatalystSerializerSpec.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/encoders/CatalystSerializerSpec.scala @@ -35,7 +35,9 @@ import org.locationtech.rasterframes.model.{CellContext, TileContext, TileDataCo import org.locationtech.rasterframes.ref.{RasterRef, RasterSource} import org.scalatest.Assertion -class CatalystSerializerSpec extends TestEnvironment with TestData { +class CatalystSerializerSpec extends TestEnvironment { + import TestData._ + val dc = TileDataContext(UShortUserDefinedNoDataCellType(3), TileDimensions(12, 23)) val tc = TileContext(Extent(1, 2, 3, 4), WebMercator) val cc = CellContext(tc, dc, 34, 45) diff --git a/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala index 5765c2a49..23f00268e 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala @@ -21,14 +21,14 @@ package org.locationtech.rasterframes.ref -import org.locationtech.rasterframes._ -import org.locationtech.rasterframes.expressions.accessors._ -import org.locationtech.rasterframes.expressions.generators._ -import RasterRef.RasterRefTile -import geotrellis.raster.Tile +import geotrellis.raster.{ByteConstantNoDataCellType, Tile} import geotrellis.vector.Extent import org.apache.spark.sql.Encoders -import org.locationtech.rasterframes.TestEnvironment +import org.locationtech.rasterframes.{TestEnvironment, _} +import org.locationtech.rasterframes.expressions.accessors._ +import org.locationtech.rasterframes.expressions.generators._ +import org.locationtech.rasterframes.ref.RasterRef.RasterRefTile +import org.locationtech.rasterframes.tiles.ProjectedRasterTile /** * @@ -199,7 +199,6 @@ class RasterRefSpec extends TestEnvironment with TestData { refs.count() shouldBe > (1L) - val dims = refs.select(rf_dimensions($"proj_raster")).distinct().collect() forEvery(dims) { r => r.cols should be <= NOMINAL_TILE_SIZE @@ -207,4 +206,38 @@ class RasterRefSpec extends TestEnvironment with TestData { } } } + + describe("RealizeTile") { + it("should pass through basic Tile") { + val t = TestData.randomTile(5, 5, ByteConstantNoDataCellType) + val result = Seq(t).toDF("tile").select(rf_tile($"tile")).first() + assertEqual(result, t) + } + + it("should simplify ProjectedRasterTile") { + val t = TestData.randNDPRT + val result = Seq(t).toDF("tile").select(rf_tile($"tile")).first() + result.isInstanceOf[ProjectedRasterLike] should be (false) + assertEqual(result, t.toArrayTile()) + } + + it("should resolve a RasterRef") { + new Fixture { + import RasterRef.rrEncoder // This shouldn't be required, but product encoder gets choosen. 
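+        // rf_tile should strip the RasterRef indirection: the collected value
+        // must be a realized Tile whose cells match the referenced raster.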
+ val r: RasterRef = subRaster + val result = Seq(r).toDF("ref").select(rf_tile($"ref")).first() + result.isInstanceOf[RasterRefTile] should be(false) + assertEqual(r.tile.toArrayTile(), result) + } + } + + it("should resolve a RasterRefTile") { + new Fixture { + val t: ProjectedRasterTile = RasterRefTile(subRaster) + val result = Seq(t).toDF("tile").select(rf_tile($"tile")).first() + result.isInstanceOf[RasterRefTile] should be(false) + assertEqual(t.toArrayTile(), result) + } + } + } } diff --git a/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotrellis/GeoTrellisDataSourceSpec.scala b/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotrellis/GeoTrellisDataSourceSpec.scala index ecd3351df..52d0f0d78 100644 --- a/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotrellis/GeoTrellisDataSourceSpec.scala +++ b/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotrellis/GeoTrellisDataSourceSpec.scala @@ -51,8 +51,9 @@ import org.scalatest.{BeforeAndAfterAll, Inspectors} import scala.math.{max, min} class GeoTrellisDataSourceSpec - extends TestEnvironment with TestData with BeforeAndAfterAll with Inspectors + extends TestEnvironment with BeforeAndAfterAll with Inspectors with RasterMatchers with DataSourceOptions { + import TestData._ val tileSize = 12 lazy val layer = Layer(new File(outputLocalPath).toURI, LayerId("test-layer", 4)) From 6f97fe548b56a5eb32cf10f985fb42824693f572 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Wed, 28 Aug 2019 20:44:49 -0400 Subject: [PATCH 11/48] Fixed naming/spelling bug in GridBounds serializer. --- .../rasterframes/encoders/StandardSerializers.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/locationtech/rasterframes/encoders/StandardSerializers.scala b/core/src/main/scala/org/locationtech/rasterframes/encoders/StandardSerializers.scala index 9ab06f52f..caf1965cc 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/encoders/StandardSerializers.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/encoders/StandardSerializers.scala @@ -73,7 +73,7 @@ trait StandardSerializers { implicit val gridBoundsSerializer: CatalystSerializer[GridBounds] = new CatalystSerializer[GridBounds] { override def schema: StructType = StructType(Seq( StructField("colMin", IntegerType, false), - StructField("rowlMin", IntegerType, false), + StructField("rowMin", IntegerType, false), StructField("colMax", IntegerType, false), StructField("rowMax", IntegerType, false) )) From 68068c7391ed0d671c5c589f5cb134b9bf0b6f5a Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Thu, 29 Aug 2019 11:54:04 -0400 Subject: [PATCH 12/48] Fixed bugs in PDS catalog caching. 
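The caching path had three related defects, visible in the hunks below: the
cached catalog parquet was reused whenever the file existed, without any
expiry check; the expiry predicate compared the file's modification time
against now-plus-max-age, which a past timestamp can never satisfy, so
entries effectively never expired; and refreshing the cache failed because
the parquet write did not overwrite the existing file. The corrected expiry
rule, restated as a minimal self-contained sketch (illustrative only; the
real predicate in ResourceCacheSupport takes a Hadoop Path and logs):

    import java.time.{Duration, Instant}

    // A cache entry is expired once its modification time plus the maximum
    // allowed age falls before the current instant.
    def isExpired(modTimeMillis: Long, maxAgeHours: Long): Boolean =
      Instant.ofEpochMilli(modTimeMillis)
        .plus(Duration.ofHours(maxAgeHours))
        .isBefore(Instant.now())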
--- .../awspds/L8CatalogRelationTest.scala | 25 ++++++++++++++++++- .../datasource/CachedDatasetRelation.scala | 6 ++--- .../datasource/ResourceCacheSupport.scala | 2 +- .../awspds/L8CatalogDataSource.scala | 2 +- 4 files changed, 29 insertions(+), 6 deletions(-) diff --git a/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala b/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala index 9b6d8480f..5b757c9cc 100644 --- a/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala +++ b/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala @@ -32,7 +32,7 @@ import org.locationtech.rasterframes.datasource.raster._ class L8CatalogRelationTest extends TestEnvironment { import spark.implicits._ - val catalog = spark.read.l8Catalog.load() + val catalog = spark.read.l8Catalog.load().cache() val scenes = catalog .where($"acquisition_date" === to_timestamp(lit("2017-04-04 15:12:55.394"))) @@ -104,5 +104,28 @@ class L8CatalogRelationTest extends TestEnvironment { stats.data_cells should be (512L * 512L) stats.mean shouldBe > (10000.0) } + + it("should construct an RGB composite") { + val aoi = "LINESTRING (31.115 29.963, 31.148 29.99)" + val sceneCat = catalog + .where( + to_date($"acquisition_date") === to_date(lit("2019-07-03")) && + st_intersects(st_geometry($"bounds_wgs84"), st_geomFromWKT(aoi)) + ) + + catalog.orderBy(desc("acquisition_date")).select($"acquisition_date").show(false) + catalog.where(to_date($"acquisition_date") === to_date(lit("2019-03-07"))).show(false) + + //sceneCat.show(false) + + +// val df = spark.read.raster +// .fromCatalog(scenes, "B4", "B3", "B2") +// .withTileDimensions(128, 128) +// .load() +// .where + + + } } } diff --git a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/CachedDatasetRelation.scala b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/CachedDatasetRelation.scala index 231b2411e..5b162b940 100644 --- a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/CachedDatasetRelation.scala +++ b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/CachedDatasetRelation.scala @@ -25,7 +25,7 @@ import com.typesafe.scalalogging.LazyLogging import org.apache.hadoop.fs.{FileSystem, Path => HadoopPath} import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.BaseRelation -import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.{Dataset, Row, SaveMode} import org.locationtech.rasterframes.util._ /** @@ -40,12 +40,12 @@ trait CachedDatasetRelation extends ResourceCacheSupport { self: BaseRelation wi def buildScan(): RDD[Row] = { val conf = sqlContext.sparkContext.hadoopConfiguration implicit val fs: FileSystem = FileSystem.get(conf) - val catalog = cacheFile.when(fs.exists) + val catalog = cacheFile.when(p => fs.exists(p) && !expired(p)) .map(p ⇒ {logger.debug("Reading " + p); p}) .map(p ⇒ sqlContext.read.parquet(p.toString)) .getOrElse { val scenes = constructDataset - scenes.write.parquet(cacheFile.toString) + scenes.write.mode(SaveMode.Overwrite).parquet(cacheFile.toString) scenes } diff --git a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/ResourceCacheSupport.scala 
b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/ResourceCacheSupport.scala index 45173572f..0a99f6017 100644 --- a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/ResourceCacheSupport.scala +++ b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/ResourceCacheSupport.scala @@ -51,7 +51,7 @@ trait ResourceCacheSupport extends DownloadSupport { self: LazyLogging ⇒ else { val time = fs.getFileStatus(p).getModificationTime - val exp = Instant.ofEpochMilli(time).isAfter(Instant.now().plus(Duration.ofHours(maxCacheFileAgeHours))) + val exp = Instant.ofEpochMilli(time).plus(Duration.ofHours(maxCacheFileAgeHours)).isBefore(Instant.now()) if(exp) logger.debug(s"'$p' is expired with mod time of '$time'") exp } diff --git a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogDataSource.scala b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogDataSource.scala index aad1cd2fc..bdc35d650 100644 --- a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogDataSource.scala +++ b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogDataSource.scala @@ -39,7 +39,7 @@ class L8CatalogDataSource extends DataSourceRegister with RelationProvider { def shortName = L8CatalogDataSource.SHORT_NAME def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { - require(parameters.get("path").isEmpty, "MODISCatalogDataSource doesn't support specifying a path. Please use `load()`.") + require(parameters.get("path").isEmpty, "L8CatalogDataSource doesn't support specifying a path. Please use `load()`.") val conf = sqlContext.sparkContext.hadoopConfiguration implicit val fs = FileSystem.get(conf) From c2f5f5027bb6e2bf2df3a71a5086b72a4cedead6 Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Mon, 2 Sep 2019 11:36:07 -0400 Subject: [PATCH 13/48] Reduce job size and name chunk on vector data page Signed-off-by: Jason T. Brown --- .../src/main/python/docs/vector-data.pymd | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pyrasterframes/src/main/python/docs/vector-data.pymd b/pyrasterframes/src/main/python/docs/vector-data.pymd index 99537d1e5..11b4c8e55 100644 --- a/pyrasterframes/src/main/python/docs/vector-data.pymd +++ b/pyrasterframes/src/main/python/docs/vector-data.pymd @@ -79,18 +79,21 @@ from pyrasterframes.rasterfunctions import st_centroid df.select(df.state_code, inefficient_centroid(df.geometry), st_centroid(df.geometry)) ``` -The RasterFrames vector functions and GeoMesa functions also provide a variety of spatial relations that are useful in combination with the geometric properties of projected rasters. In this example, we use the @ref:[built-in Landsat catalog](raster-catalogs.md#using-built-in-experimental-catalogs) which provides an extent. We will convert the extent to a polygon and filter to those within approximately 500 km of a selected point. +The RasterFrames vector functions and GeoMesa functions also provide a variety of spatial relations that are useful in combination with the geometric properties of projected rasters. In this example, we use the @ref:[built-in Landsat catalog](raster-catalogs.md#using-built-in-experimental-catalogs) which provides an extent. 
We will convert the extent to a polygon and filter to those within approximately 50 km of a selected point. -```python, evaluate=True +```python, spatial_relation, evaluate=True from pyrasterframes.rasterfunctions import st_geometry, st_bufferPoint, st_intersects, st_point from pyspark.sql.functions import lit l8 = spark.read.format('aws-pds-l8-catalog').load() -l8 = l8.withColumn('geom', st_geometry(l8.bounds_wgs84)) -l8 = l8.withColumn('paducah', st_point(lit(-88.6275), lit(37.072222))) +l8 = l8.withColumn('geom', st_geometry(l8.bounds_wgs84)) # extent to polygon +l8 = l8.withColumn('paducah', st_point(lit(-88.628), lit(37.072))) # col of points -l8_filtered = l8.filter(st_intersects(l8.geom, st_bufferPoint(l8.paducah, lit(500000.0)))) -l8_filtered.select('product_id', 'entity_id', 'acquisition_date', 'cloud_cover_pct') +l8_filtered = l8 \ + .filter(st_intersects(l8.geom, st_bufferPoint(l8.paducah, lit(50000.0)))) + .filter(l8.acquisition_date > '2018-02-01') \ + .filter(l8.acquisition_date < '2018-04-01') +l8_filtered.select('product_id', 'entity_id', 'acquisition_date', 'cloud_cover_pct').toPandas() ``` [GeoPandas]: http://geopandas.org From 5bcbb794d82ccf63882c41abd8f384447232a801 Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Mon, 2 Sep 2019 11:45:40 -0400 Subject: [PATCH 14/48] Line continuation in doc chunk Signed-off-by: Jason T. Brown --- pyrasterframes/src/main/python/docs/vector-data.pymd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyrasterframes/src/main/python/docs/vector-data.pymd b/pyrasterframes/src/main/python/docs/vector-data.pymd index 11b4c8e55..8d50f20db 100644 --- a/pyrasterframes/src/main/python/docs/vector-data.pymd +++ b/pyrasterframes/src/main/python/docs/vector-data.pymd @@ -90,7 +90,7 @@ l8 = l8.withColumn('geom', st_geometry(l8.bounds_wgs84)) # extent to polygon l8 = l8.withColumn('paducah', st_point(lit(-88.628), lit(37.072))) # col of points l8_filtered = l8 \ - .filter(st_intersects(l8.geom, st_bufferPoint(l8.paducah, lit(50000.0)))) + .filter(st_intersects(l8.geom, st_bufferPoint(l8.paducah, lit(50000.0)))) \ .filter(l8.acquisition_date > '2018-02-01') \ .filter(l8.acquisition_date < '2018-04-01') l8_filtered.select('product_id', 'entity_id', 'acquisition_date', 'cloud_cover_pct').toPandas() From b64e80430c1a19c0e4952fe00086aed739e8924a Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Mon, 2 Sep 2019 12:14:22 -0400 Subject: [PATCH 15/48] Remove unneeded catalog param keywords from docs Signed-off-by: Jason T. 
Brown --- pyrasterframes/src/main/python/docs/languages.pymd | 2 +- .../src/main/python/docs/local-algebra.pymd | 2 +- .../src/main/python/docs/nodata-handling.pymd | 2 +- pyrasterframes/src/main/python/docs/numpy-pandas.pymd | 2 +- pyrasterframes/src/main/python/docs/raster-read.pymd | 11 +++-------- .../src/main/python/docs/supervised-learning.pymd | 6 ++---- pyrasterframes/src/main/python/docs/time-series.pymd | 2 +- .../src/main/python/docs/unsupervised-learning.pymd | 2 +- 8 files changed, 11 insertions(+), 18 deletions(-) diff --git a/pyrasterframes/src/main/python/docs/languages.pymd b/pyrasterframes/src/main/python/docs/languages.pymd index 1d4895e2e..2a54c5124 100644 --- a/pyrasterframes/src/main/python/docs/languages.pymd +++ b/pyrasterframes/src/main/python/docs/languages.pymd @@ -42,7 +42,7 @@ red_nir_monthly_2017.printSchema() ```python, step_3_python red_nir_tiles_monthly_2017 = spark.read.raster( - catalog=red_nir_monthly_2017, + red_nir_monthly_2017, catalog_col_names=['red', 'nir'], tile_dimensions=(256, 256) ) diff --git a/pyrasterframes/src/main/python/docs/local-algebra.pymd b/pyrasterframes/src/main/python/docs/local-algebra.pymd index 696186313..fc83ae2d2 100644 --- a/pyrasterframes/src/main/python/docs/local-algebra.pymd +++ b/pyrasterframes/src/main/python/docs/local-algebra.pymd @@ -40,7 +40,7 @@ catalog_df = spark.createDataFrame([ Row(red=uri_pattern.format(4), nir=uri_pattern.format(8)) ]) df = spark.read.raster( - catalog=catalog_df, + catalog_df, catalog_col_names=['red', 'nir'] ) df.printSchema() diff --git a/pyrasterframes/src/main/python/docs/nodata-handling.pymd b/pyrasterframes/src/main/python/docs/nodata-handling.pymd index a4534c8b1..90eeacbb5 100644 --- a/pyrasterframes/src/main/python/docs/nodata-handling.pymd +++ b/pyrasterframes/src/main/python/docs/nodata-handling.pymd @@ -90,7 +90,7 @@ from pyspark.sql import Row blue_uri = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/B02.tif' scl_uri = 'https://s22s-test-geotiffs.s3.amazonaws.com/luray_snp/SCL.tif' cat = spark.createDataFrame([Row(blue=blue_uri, scl=scl_uri),]) -unmasked = spark.read.raster(catalog=cat, catalog_col_names=['blue', 'scl']) +unmasked = spark.read.raster(cat, catalog_col_names=['blue', 'scl']) unmasked.printSchema() ``` diff --git a/pyrasterframes/src/main/python/docs/numpy-pandas.pymd b/pyrasterframes/src/main/python/docs/numpy-pandas.pymd index 86f5bad3f..5622af7b3 100644 --- a/pyrasterframes/src/main/python/docs/numpy-pandas.pymd +++ b/pyrasterframes/src/main/python/docs/numpy-pandas.pymd @@ -51,7 +51,7 @@ cat = spark.read.format('aws-pds-modis-catalog').load() \ (col('acquisition_date') < lit('2018-02-22')) ) -spark_df = spark.read.raster(catalog=cat, catalog_col_names=['B01']) \ +spark_df = spark.read.raster(cat, catalog_col_names=['B01']) \ .select( 'acquisition_date', 'granule_id', diff --git a/pyrasterframes/src/main/python/docs/raster-read.pymd b/pyrasterframes/src/main/python/docs/raster-read.pymd index f9a1170b2..53f3a96e6 100644 --- a/pyrasterframes/src/main/python/docs/raster-read.pymd +++ b/pyrasterframes/src/main/python/docs/raster-read.pymd @@ -101,8 +101,6 @@ modis_catalog = spark.read \ .withColumn('red' , F.concat('base_url', F.lit("_B01.TIF"))) \ .withColumn('nir' , F.concat('base_url', F.lit("_B02.TIF"))) -modis_catalog.printSchema() - print("Available scenes: ", modis_catalog.count()) ``` @@ -124,10 +122,7 @@ equator.select('date', 'gid') Now that we have prepared our catalog, we simply pass the DataFrame or CSV string to the `raster` DataSource to 
load the imagery. The `catalog_col_names` parameter gives the columns that contain the URI's to be read. ```python, read_catalog -rf = spark.read.raster( - catalog=equator, - catalog_col_names=['red', 'nir'] -) +rf = spark.read.raster(equator, catalog_col_names=['red', 'nir']) rf.printSchema() ``` @@ -179,7 +174,7 @@ mb.printSchema() If a band is passed into `band_indexes` that exceeds the number of bands in the raster, a projected raster column will still be generated in the schema but the column will be full of `null` values. -You can also pass a `catalog` and `band_indexes` together into the `raster` reader. This will create a projected raster column for the combination of all items passed into `catalog_col_names` and `band_indexes`. Again if a band in `band_indexes` exceeds the number of bands in a raster, it will have a `null` value for the corresponding column. +You can also pass a _catalog_ and `band_indexes` together into the `raster` reader. This will create a projected raster column for the combination of all items in `catalog_col_names` and `band_indexes`. Again if a band in `band_indexes` exceeds the number of bands in a raster, it will have a `null` value for the corresponding column. Here is a trivial example with a _catalog_ over multiband rasters. We specify two columns containing URIs and two bands, resulting in four projected raster columns. @@ -191,7 +186,7 @@ mb_cat = pd.DataFrame([ }, ]) mb2 = spark.read.raster( - catalog=spark.createDataFrame(mb_cat), + spark.createDataFrame(mb_cat), catalog_col_names=['foo', 'bar'], band_indexes=[0, 1], tile_dimensions=(64,64) diff --git a/pyrasterframes/src/main/python/docs/supervised-learning.pymd b/pyrasterframes/src/main/python/docs/supervised-learning.pymd index 0a3f8c0ef..9f2cd968f 100644 --- a/pyrasterframes/src/main/python/docs/supervised-learning.pymd +++ b/pyrasterframes/src/main/python/docs/supervised-learning.pymd @@ -33,10 +33,8 @@ catalog_df = pd.DataFrame([ {b: uri_base.format(b) for b in cols} ]) -df = spark.read.raster(catalog=catalog_df, - catalog_col_names=cols, - tile_dimensions=(128, 128) - ).repartition(100) +df = spark.read.raster(catalog_df, catalog_col_names=cols, tile_dimensions=(128, 128)) \ + .repartition(100) df = df.select( rf_crs(df.B01).alias('crs'), diff --git a/pyrasterframes/src/main/python/docs/time-series.pymd b/pyrasterframes/src/main/python/docs/time-series.pymd index eadcb3ffd..0e0cbed00 100644 --- a/pyrasterframes/src/main/python/docs/time-series.pymd +++ b/pyrasterframes/src/main/python/docs/time-series.pymd @@ -97,7 +97,7 @@ We then [reproject](https://gis.stackexchange.com/questions/247770/understanding ```python read_catalog raster_cols = ['B01', 'B02',] # red and near-infrared respectively park_rf = spark.read.raster( - catalog=park_cat.select(['acquisition_date', 'granule_id', 'geo_simp'] + raster_cols), + park_cat.select(['acquisition_date', 'granule_id', 'geo_simp'] + raster_cols), catalog_col_names=raster_cols) \ .withColumn('park_native', st_reproject('geo_simp', lit('EPSG:4326'), rf_crs('B01'))) \ .filter(st_intersects('park_native', rf_geometry('B01'))) diff --git a/pyrasterframes/src/main/python/docs/unsupervised-learning.pymd b/pyrasterframes/src/main/python/docs/unsupervised-learning.pymd index 800f7e749..f2158d807 100644 --- a/pyrasterframes/src/main/python/docs/unsupervised-learning.pymd +++ b/pyrasterframes/src/main/python/docs/unsupervised-learning.pymd @@ -37,7 +37,7 @@ filenamePattern = "L8-B{}-Elkton-VA.tiff" catalog_df = pd.DataFrame([ {'b' + str(b): 
os.path.join(resource_dir_uri(), filenamePattern.format(b)) for b in range(1, 8)} ]) -df = spark.read.raster(catalog=catalog_df, catalog_col_names=catalog_df.columns) +df = spark.read.raster(catalog_df, catalog_col_names=catalog_df.columns) df = df.select( rf_crs(df.b1).alias('crs'), rf_extent(df.b1).alias('extent'), From bcb233bf4c2234d54cbe1a7bf983c6fd7a8f75cb Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Mon, 2 Sep 2019 12:49:08 -0400 Subject: [PATCH 16/48] Tweak raster read for back compatibility; update some unit tests Signed-off-by: Jason T. Brown --- pyrasterframes/src/main/python/pyrasterframes/__init__.py | 3 +++ pyrasterframes/src/main/python/tests/RasterFunctionsTests.py | 2 +- pyrasterframes/src/main/python/tests/RasterSourceTest.py | 1 - pyrasterframes/src/main/python/tests/RasterSourceTests.py | 4 +--- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyrasterframes/src/main/python/pyrasterframes/__init__.py b/pyrasterframes/src/main/python/pyrasterframes/__init__.py index 730af5866..da8ca81a6 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/__init__.py +++ b/pyrasterframes/src/main/python/pyrasterframes/__init__.py @@ -134,6 +134,9 @@ def _raster_reader( from pandas import DataFrame as PdDataFrame + if 'catalog' in options: + source = options['catalog'] # maintain back compatibility with 0.8.0 + def to_csv(comp): if isinstance(comp, str): return comp diff --git a/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py b/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py index 2a57cf356..0d8528418 100644 --- a/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py +++ b/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py @@ -286,7 +286,7 @@ def test_render_composite(self): cat = self.spark.createDataFrame([ Row(red=self.l8band_uri(4), green=self.l8band_uri(3), blue=self.l8band_uri(2)) ]) - rf = self.spark.read.raster(catalog = cat, catalog_col_names=['red', 'green', 'blue']) + rf = self.spark.read.raster(catalog=cat, catalog_col_names=cat.columns) # Test composite construction rgb = rf.select(rf_tile(rf_rgb_composite('red', 'green', 'blue')).alias('rgb')).first()['rgb'] diff --git a/pyrasterframes/src/main/python/tests/RasterSourceTest.py b/pyrasterframes/src/main/python/tests/RasterSourceTest.py index eb1b0f986..547726a6d 100644 --- a/pyrasterframes/src/main/python/tests/RasterSourceTest.py +++ b/pyrasterframes/src/main/python/tests/RasterSourceTest.py @@ -28,7 +28,6 @@ class RasterSourceTest(TestEnvironment): - @staticmethod def path(scene, band): scene_dict = { diff --git a/pyrasterframes/src/main/python/tests/RasterSourceTests.py b/pyrasterframes/src/main/python/tests/RasterSourceTests.py index 08ebe078c..1c3dcf7e0 100644 --- a/pyrasterframes/src/main/python/tests/RasterSourceTests.py +++ b/pyrasterframes/src/main/python/tests/RasterSourceTests.py @@ -113,14 +113,12 @@ def path(scene, band): {'b1': path(2, 1), 'b2': path(2, 2), 'b3': path(2, 3)}, {'b1': path(3, 1), 'b2': path(3, 2), 'b3': path(3, 3)}, ]) - # comma separated list of column names containing URI's to read. - catalog_columns = ','.join(path_pandas.columns.tolist()) # 'b1,b2,b3' path_table = self.spark.createDataFrame(path_pandas) path_df = self.spark.read.raster( tile_dimensions=(512, 512), catalog=path_table, - catalog_col_names=catalog_columns, + catalog_col_names=path_table.columns, lazy_tiles=True # We'll get an OOM error if we try to read 9 scenes all at once! 
) From ebe79f0885a55f1c611ea440ba911678e9ef679d Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Mon, 2 Sep 2019 13:15:51 -0400 Subject: [PATCH 17/48] Remove duplicated test from merge; fix list of list test Signed-off-by: Jason T. Brown --- .../src/main/python/tests/RasterSourceTest.py | 10 +- .../main/python/tests/RasterSourceTests.py | 172 ------------------ 2 files changed, 6 insertions(+), 176 deletions(-) delete mode 100644 pyrasterframes/src/main/python/tests/RasterSourceTests.py diff --git a/pyrasterframes/src/main/python/tests/RasterSourceTest.py b/pyrasterframes/src/main/python/tests/RasterSourceTest.py index 547726a6d..509ed6293 100644 --- a/pyrasterframes/src/main/python/tests/RasterSourceTest.py +++ b/pyrasterframes/src/main/python/tests/RasterSourceTest.py @@ -106,9 +106,9 @@ def l8path(b): def test_list_of_list_of_str(self): lol = [ - [self.path(1, 1), self.path(1, 2), ], - [self.path(2, 1), self.path(2, 2), ], - [self.path(3, 1), self.path(3, 2), ] + [self.path(1, 1), self.path(1, 2)], + [self.path(2, 1), self.path(2, 2)], + [self.path(3, 1), self.path(3, 2)] ] df = self.spark.read.raster(lol) self.assertTrue(len(df.columns) == 4) # 2 cols of uris plus 2 cols of proj_rasters @@ -116,7 +116,9 @@ def test_list_of_list_of_str(self): 'proj_raster_0', 'proj_raster_1'])) uri_df = df.select('proj_raster_0_path', 'proj_raster_1_path').distinct().collect() uri_list = [list(r.asDict().values()) for r in uri_df] - self.assertEqual(sorted(uri_list), sorted(lol)) + self.assertTrue(lol[0] in uri_list) + self.assertTrue(lol[1] in uri_list) + self.assertTrue(lol[2] in uri_list) def test_schemeless_string(self): import os.path diff --git a/pyrasterframes/src/main/python/tests/RasterSourceTests.py b/pyrasterframes/src/main/python/tests/RasterSourceTests.py deleted file mode 100644 index 1c3dcf7e0..000000000 --- a/pyrasterframes/src/main/python/tests/RasterSourceTests.py +++ /dev/null @@ -1,172 +0,0 @@ -# -# This software is licensed under the Apache 2 license, quoted below. -# -# Copyright 2019 Astraea, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. You may obtain a copy of -# the License at -# -# [http://www.apache.org/licenses/LICENSE-2.0] -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. -# -# SPDX-License-Identifier: Apache-2.0 -# - - -from pyrasterframes.rasterfunctions import * -from . import TestEnvironment - -class RasterSource(TestEnvironment): - - def test_handle_lazy_eval(self): - df = self.spark.read.raster(self.img_uri) - ltdf = df.select('proj_raster') - self.assertGreater(ltdf.count(), 0) - self.assertIsNotNone(ltdf.first()) - - tdf = df.select(rf_tile('proj_raster')) - self.assertGreater(tdf.count(), 0) - self.assertIsNotNone(tdf.first()) - - def test_strict_eval(self): - df_lazy = self.spark.read.raster(self.img_uri, lazy_tiles=True) - # when doing Show on a lazy tile we will see something like RasterRefTile(RasterRef(JVMGeoTiffRasterSource(... 
- # use this trick to get the `show` string - show_str_lazy = df_lazy.select('proj_raster')._jdf.showString(1, -1, False) - self.assertTrue('RasterRef' in show_str_lazy) - - # again for strict - df_strict = self.spark.read.raster(self.img_uri, lazy_tiles=False) - show_str_strict = df_strict.select('proj_raster')._jdf.showString(1, -1, False) - self.assertTrue('RasterRef' not in show_str_strict) - - - def test_prt_functions(self): - df = self.spark.read.raster(self.img_uri) \ - .withColumn('crs', rf_crs('proj_raster')) \ - .withColumn('ext', rf_extent('proj_raster')) \ - .withColumn('geom', rf_geometry('proj_raster')) - df.select('crs', 'ext', 'geom').first() - - def test_raster_source_reader(self): - # much the same as RasterSourceDataSourceSpec here; but using https PDS. Takes about 30s to run - - def l8path(b): - assert b in range(1, 12) - base = "https://s3-us-west-2.amazonaws.com/landsat-pds/c1/L8/199/026/LC08_L1TP_199026_20180919_20180928_01_T1/LC08_L1TP_199026_20180919_20180928_01_T1_B{}.TIF" - return base.format(b) - - path_param = '\n'.join([l8path(b) for b in [1, 2, 3]]) # "http://foo.com/file1.tif,http://foo.com/file2.tif" - tile_size = 512 - - df = self.spark.read.raster( - tile_dimensions=(tile_size, tile_size), - paths=path_param, - lazy_tiles=True, - ).cache() - - # schema is tile_path and tile - # df.printSchema() - self.assertTrue(len(df.columns) == 2 and 'proj_raster_path' in df.columns and 'proj_raster' in df.columns) - - # the most common tile dimensions should be as passed to `options`, showing that options are correctly applied - tile_size_df = df.select(rf_dimensions(df.proj_raster).rows.alias('r'), rf_dimensions(df.proj_raster).cols.alias('c')) \ - .groupby(['r', 'c']).count().toPandas() - most_common_size = tile_size_df.loc[tile_size_df['count'].idxmax()] - self.assertTrue(most_common_size.r == tile_size and most_common_size.c == tile_size) - - # all rows are from a single source URI - path_count = df.groupby(df.proj_raster_path).count() - print(path_count.toPandas()) - self.assertTrue(path_count.count() == 3) - - def test_raster_source_reader_schemeless(self): - import os.path - path = os.path.join(self.resource_dir, "L8-B8-Robinson-IL.tiff") - self.assertTrue(not path.startswith('file://')) - df = self.spark.read.raster(path) - self.assertTrue(df.count() > 0) - - def test_raster_source_catalog_reader(self): - import pandas as pd - - scene_dict = { - 1: 'http://landsat-pds.s3.amazonaws.com/c1/L8/015/041/LC08_L1TP_015041_20190305_20190309_01_T1/LC08_L1TP_015041_20190305_20190309_01_T1_B{}.TIF', - 2: 'http://landsat-pds.s3.amazonaws.com/c1/L8/015/042/LC08_L1TP_015042_20190305_20190309_01_T1/LC08_L1TP_015042_20190305_20190309_01_T1_B{}.TIF', - 3: 'http://landsat-pds.s3.amazonaws.com/c1/L8/016/041/LC08_L1TP_016041_20190224_20190309_01_T1/LC08_L1TP_016041_20190224_20190309_01_T1_B{}.TIF', - } - - def path(scene, band): - assert band in range(1, 12) - p = scene_dict[scene] - return p.format(band) - - # Create a pandas dataframe (makes it easy to create spark df) - path_pandas = pd.DataFrame([ - {'b1': path(1, 1), 'b2': path(1, 2), 'b3': path(1, 3)}, - {'b1': path(2, 1), 'b2': path(2, 2), 'b3': path(2, 3)}, - {'b1': path(3, 1), 'b2': path(3, 2), 'b3': path(3, 3)}, - ]) - path_table = self.spark.createDataFrame(path_pandas) - - path_df = self.spark.read.raster( - tile_dimensions=(512, 512), - catalog=path_table, - catalog_col_names=path_table.columns, - lazy_tiles=True # We'll get an OOM error if we try to read 9 scenes all at once! 
- ) - - self.assertTrue(len(path_df.columns) == 6) # three bands times {path, tile} - self.assertTrue(path_df.select('b1_path').distinct().count() == 3) # as per scene_dict - b1_paths_maybe = path_df.select('b1_path').distinct().collect() - b1_paths = [s.format('1') for s in scene_dict.values()] - self.assertTrue(all([row.b1_path in b1_paths for row in b1_paths_maybe])) - - def test_raster_source_catalog_reader_with_pandas(self): - import pandas as pd - import geopandas - from shapely.geometry import Point - - scene_dict = { - 1: 'http://landsat-pds.s3.amazonaws.com/c1/L8/015/041/LC08_L1TP_015041_20190305_20190309_01_T1/LC08_L1TP_015041_20190305_20190309_01_T1_B{}.TIF', - 2: 'http://landsat-pds.s3.amazonaws.com/c1/L8/015/042/LC08_L1TP_015042_20190305_20190309_01_T1/LC08_L1TP_015042_20190305_20190309_01_T1_B{}.TIF', - 3: 'http://landsat-pds.s3.amazonaws.com/c1/L8/016/041/LC08_L1TP_016041_20190224_20190309_01_T1/LC08_L1TP_016041_20190224_20190309_01_T1_B{}.TIF', - } - - def path(scene, band): - assert band in range(1, 12) - p = scene_dict[scene] - return p.format(band) - - # Create a pandas dataframe (makes it easy to create spark df) - path_pandas = pd.DataFrame([ - {'b1': path(1, 1), 'b2': path(1, 2), 'b3': path(1, 3), 'geo': Point(1, 1)}, - {'b1': path(2, 1), 'b2': path(2, 2), 'b3': path(2, 3), 'geo': Point(2, 2)}, - {'b1': path(3, 1), 'b2': path(3, 2), 'b3': path(3, 3), 'geo': Point(3, 3)}, - ]) - - # here a subtle difference with the test_raster_source_catalog_reader test, feed the DataFrame not a CSV and not an already created spark DF. - df = self.spark.read.raster( - catalog=path_pandas, - catalog_col_names=['b1', 'b2', 'b3'] - ) - self.assertEqual(len(df.columns), 7) # three path cols, three tile cols, and geo - self.assertTrue('geo' in df.columns) - self.assertTrue(df.select('b1_path').distinct().count() == 3) - - - # Same test with geopandas - geo_df = geopandas.GeoDataFrame(path_pandas, crs={'init': 'EPSG:4326'}, geometry='geo') - df2 = self.spark.read.raster( - catalog=geo_df, - catalog_col_names=['b1', 'b2', 'b3'] - ) - self.assertEqual(len(df2.columns), 7) # three path cols, three tile cols, and geo - self.assertTrue('geo' in df2.columns) - self.assertTrue(df2.select('b1_path').distinct().count() == 3) From 9fe8278550397b13c09ede3c9d6d65cb4b80fcb5 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Tue, 3 Sep 2019 11:11:17 -0400 Subject: [PATCH 18/48] Incremental work on refactoring aggregate raster creation. --- .../rasterframes/ref/RasterRefIT.scala | 62 ++++++++++++++ .../aggregates/TileRasterizerAggregate.scala | 80 +++++++++++++++++-- .../transformers/RGBComposite.scala | 6 +- .../extensions/DataFrameMethods.scala | 9 ++- .../rasterframes/util/package.scala | 2 +- .../geotiff/GeoTiffDataSource.scala | 58 +------------- .../awspds/L8CatalogRelationTest.scala | 34 ++++---- 7 files changed, 168 insertions(+), 83 deletions(-) create mode 100644 core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala diff --git a/core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala b/core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala new file mode 100644 index 000000000..fee8e9e08 --- /dev/null +++ b/core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala @@ -0,0 +1,62 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package org.locationtech.rasterframes.ref + +import java.net.URI + +import geotrellis.proj4.LatLng + +import geotrellis.vector.Extent +import org.locationtech.rasterframes._ + +class RasterRefIT extends TestEnvironment { + describe("practical subregion reads") { + it("should construct a natural color composite") { + import spark.implicits._ + def scene(idx: Int) = URI.create(s"https://landsat-pds.s3.us-west-2.amazonaws.com" + + s"/c1/L8/176/039/LC08_L1TP_176039_20190703_20190718_01_T1/LC08_L1TP_176039_20190703_20190718_01_T1_B$idx.TIF") + + val redScene = RasterSource(scene(4)) + // [west, south, east, north] + val area = Extent(31.115, 29.963, 31.148, 29.99).reproject(LatLng, redScene.crs) + + val red = RasterRef(redScene, 0, Some(area), None) + val green = RasterRef(RasterSource(scene(3)), 0, Some(area), None) + val blue = RasterRef(RasterSource(scene(2)), 0, Some(area), None) + + val rf = Seq((red, green, blue)).toDF("red", "green", "blue") + val raster = rf.select( + rf_crs($"red"), rf_extent($"red"), rf_tile($"red"), rf_tile($"green"), rf_tile($"blue")) + .toDF.aggregateRaster(redScene.crs, None) + + forEvery(raster.tile.statisticsDouble) { stats => + stats should be ('defined) + stats.get.dataCells shouldBe > (1000L) + } + //import geotrellis.raster.io.geotiff.{GeoTiffOptions, MultibandGeoTiff, Tiled} + //import geotrellis.raster.io.geotiff.compression.{DeflateCompression, NoCompression} + //import geotrellis.raster.io.geotiff.tags.codes.ColorSpace + //val tiffOptions = GeoTiffOptions(Tiled, DeflateCompression, ColorSpace.RGB) + //MultibandGeoTiff(raster, raster.crs, tiffOptions).write("target/composite.tif") + } + } +} \ No newline at end of file diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala index e1b11ae3b..f8e102eae 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala @@ -24,15 +24,18 @@ package org.locationtech.rasterframes.expressions.aggregates import geotrellis.proj4.CRS import geotrellis.raster.reproject.Reproject import geotrellis.raster.resample.ResampleMethod -import geotrellis.raster.{ArrayTile, CellType, Raster, Tile} -import geotrellis.spark.TileLayerMetadata +import geotrellis.raster.{ArrayTile, CellType, MultibandTile, ProjectedRaster, Raster, Tile} +import geotrellis.spark.{SpatialKey, TileLayerMetadata} import geotrellis.vector.Extent import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} import org.apache.spark.sql.types.{DataType, StructField, StructType} -import org.apache.spark.sql.{Column, Row, TypedColumn} +import org.apache.spark.sql.{Column, DataFrame, Row, TypedColumn} import org.locationtech.rasterframes._ +import org.locationtech.rasterframes.util._ import 
org.locationtech.rasterframes.encoders.CatalystSerializer._ import org.locationtech.rasterframes.expressions.aggregates.TileRasterizerAggregate.ProjectedRasterDefinition +import org.locationtech.rasterframes.model.TileDimensions +import org.slf4j.LoggerFactory /** * Aggregation function for creating a single `geotrellis.raster.Raster[Tile]` from @@ -88,7 +91,7 @@ class TileRasterizerAggregate(prd: ProjectedRasterDefinition) extends UserDefine } object TileRasterizerAggregate { - val nodeName = "rf_tile_rasterizer_aggregate" + val nodeName = "rf_agg_raster" /** Convenience grouping of parameters needed for running aggregate. */ case class ProjectedRasterDefinition(totalCols: Int, totalRows: Int, cellType: CellType, crs: CRS, extent: Extent, sampler: ResampleMethod = ResampleMethod.DEFAULT) @@ -102,8 +105,73 @@ object TileRasterizerAggregate { val rows = actualSize.height new ProjectedRasterDefinition(cols, rows, tlm.cellType, tlm.crs, tlm.extent, sampler) } -} + } + + @transient + private lazy val logger = LoggerFactory.getLogger(getClass) + + def apply(prd: ProjectedRasterDefinition, crsCol: Column, extentCol: Column, tileCol: Column): TypedColumn[Any, Raster[Tile]] = { + + if (prd.totalCols.toDouble * prd.totalRows * 64.0 > Runtime.getRuntime.totalMemory() * 0.5) + logger.warn( + s"You've asked for the construction of a very large image (${prd.totalCols} x ${prd.totalRows}). Out of memory error likely.") - def apply(prd: ProjectedRasterDefinition, crsCol: Column, extentCol: Column, tileCol: Column): TypedColumn[Any, Raster[Tile]] = new TileRasterizerAggregate(prd)(crsCol, extentCol, tileCol).as(nodeName).as[Raster[Tile]] + } + + def apply(df: DataFrame, destCRS: CRS, destExtent: Option[Extent], rasterDims: Option[TileDimensions]): ProjectedRaster[MultibandTile] = { + val tileCols = WithDataFrameMethods(df).tileColumns + require(tileCols.nonEmpty, "need at least one tile column") + // Select the anchoring Tile, Extent and CRS columns + val (extCol, crsCol, tileCol) = { + // Favor "ProjectedRaster" columns + val prCols = df.projRasterColumns + if (prCols.nonEmpty) { + (rf_extent(prCols.head), rf_crs(prCols.head), rf_tile(prCols.head)) + } else { + // If no "ProjectedRaster" column, look for single Extent and CRS columns. + val crsCols = df.crsColumns + require(crsCols.size == 1, "Exactly one CRS column must be in DataFrame") + val extentCols = df.extentColumns + require(extentCols.size == 1, "Exactly one Extent column must be in DataFrame") + (extentCols.head, crsCols.head, tileCols.head) + } + } + + // Scan table and construct what the TileLayerMetadata would be in the specified destination CRS.
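+ // (ProjectedLayerMetadataAggregate below folds every row's extent, CRS, + // cell type and tile dimensions into one TileLayerMetadata in destCRS.)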
+ val tlm: TileLayerMetadata[SpatialKey] = df + .select( + ProjectedLayerMetadataAggregate( + destCRS, + extCol, + crsCol, + rf_cell_type(tileCol), + rf_dimensions(tileCol) + )) + .first() + logger.debug(s"Collected TileLayerMetadata: ${tlm.toString}") + + val c = ProjectedRasterDefinition(tlm) + + val sized = rasterDims + .map { dims => + c.copy(totalCols = dims.cols, totalRows = dims.rows) + } + .getOrElse(c) + + // Apply the requested output extent when given; `copy` returns a new value, + // so keep the result (previously it was discarded, making destExtent a no-op). + val config = destExtent + .map { ext => sized.copy(extent = ext) } + .getOrElse(sized) + + val aggs = tileCols + .map(t => TileRasterizerAggregate(config, crsCol, extCol, rf_tile(t))("tile").as(t.columnName)) + + val agg = df.select(aggs: _*) + + val row = agg.first() + + val bands = for (i <- 0 until row.size) yield row.getAs[Tile](i) + + ProjectedRaster(MultibandTile(bands), tlm.extent, tlm.crs) + } } \ No newline at end of file diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/transformers/RGBComposite.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/transformers/RGBComposite.scala index 9f0a9c808..c214b8ef9 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/transformers/RGBComposite.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/transformers/RGBComposite.scala @@ -56,9 +56,9 @@ case class RGBComposite(red: Expression, green: Expression, blue: Expression) ex override def nodeName: String = "rf_rgb_composite" override def dataType: DataType = if( - red.dataType.conformsTo[ProjectedRasterTile] || - blue.dataType.conformsTo[ProjectedRasterTile] || - green.dataType.conformsTo[ProjectedRasterTile] + tileExtractor.isDefinedAt(red.dataType) || + tileExtractor.isDefinedAt(green.dataType) || + tileExtractor.isDefinedAt(blue.dataType) ) red.dataType else TileType diff --git a/core/src/main/scala/org/locationtech/rasterframes/extensions/DataFrameMethods.scala b/core/src/main/scala/org/locationtech/rasterframes/extensions/DataFrameMethods.scala index 1e94ff3ca..1ee3a21fa 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/extensions/DataFrameMethods.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/extensions/DataFrameMethods.scala @@ -22,6 +22,7 @@ package org.locationtech.rasterframes.extensions import geotrellis.proj4.CRS +import geotrellis.raster.{MultibandTile, ProjectedRaster} import geotrellis.spark.io._ import geotrellis.spark.{SpaceTimeKey, SpatialComponent, SpatialKey, TemporalKey, TileLayerMetadata} import geotrellis.util.MethodExtensions @@ -32,7 +33,9 @@ import org.apache.spark.sql.{Column, DataFrame, TypedColumn} import org.locationtech.rasterframes.StandardColumns._ import org.locationtech.rasterframes.encoders.CatalystSerializer._ import org.locationtech.rasterframes.encoders.StandardEncoders._ -import org.locationtech.rasterframes.expressions.DynamicExtractors +import org.locationtech.rasterframes.expressions.{DynamicExtractors, aggregates} +import org.locationtech.rasterframes.expressions.aggregates.TileRasterizerAggregate +import org.locationtech.rasterframes.model.TileDimensions import org.locationtech.rasterframes.tiles.ProjectedRasterTile import org.locationtech.rasterframes.util._ import org.locationtech.rasterframes.{MetadataKeys, RasterFrameLayer} @@ -225,7 +228,7 @@ trait DataFrameMethods[DF <: DataFrame] extends MethodExtensions[DF] with Metada */ @throws[IllegalArgumentException] def asLayer: RasterFrameLayer = { - val potentialRF = certifyLayer(self) + val potentialRF = certifyLayer(self) require( potentialRF.findSpatialKeyField.nonEmpty, @@ -301,5 +304,5 @@ trait
DataFrameMethods[DF <: DataFrame] extends MethodExtensions[DF] with Metada /** Internal method for slapping the RasterFreameLayer seal of approval on a DataFrame. * Only call if if you are sure it has a spatial key and tile columns and TileLayerMetadata. */ - private[rasterframes] def certify = certifyRasterframe(self) + private[rasterframes] def certify = certifyLayer(self) } diff --git a/core/src/main/scala/org/locationtech/rasterframes/util/package.scala b/core/src/main/scala/org/locationtech/rasterframes/util/package.scala index f4c6854ab..213596424 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/util/package.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/util/package.scala @@ -77,7 +77,7 @@ package object util extends DataFrameRenderers { type KeyMethodsProvider[K1, K2] = K1 ⇒ TilerKeyMethods[K1, K2] /** Internal method for slapping the RasterFrameLayer seal of approval on a DataFrame. */ - private[rasterframes] def certifyRasterframe(df: DataFrame): RasterFrameLayer = + private[rasterframes] def certifyLayer(df: DataFrame): RasterFrameLayer = shapeless.tag[RasterFrameTag][DataFrame](df) diff --git a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala index 77781a781..99923f526 100644 --- a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala +++ b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala @@ -24,18 +24,15 @@ package org.locationtech.rasterframes.datasource.geotiff import java.net.URI import _root_.geotrellis.proj4.CRS -import _root_.geotrellis.raster._ import _root_.geotrellis.raster.io.geotiff.compression._ import _root_.geotrellis.raster.io.geotiff.tags.codes.ColorSpace import _root_.geotrellis.raster.io.geotiff.{GeoTiffOptions, MultibandGeoTiff, Tags, Tiled} -import _root_.geotrellis.spark._ import com.typesafe.scalalogging.LazyLogging import org.apache.spark.sql._ import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider} import org.locationtech.rasterframes._ import org.locationtech.rasterframes.datasource._ -import org.locationtech.rasterframes.expressions.aggregates.TileRasterizerAggregate.ProjectedRasterDefinition -import org.locationtech.rasterframes.expressions.aggregates.{ProjectedLayerMetadataAggregate, TileRasterizerAggregate} +import org.locationtech.rasterframes.expressions.aggregates.TileRasterizerAggregate import org.locationtech.rasterframes.model.{LazyCRS, TileDimensions} import org.locationtech.rasterframes.util._ @@ -89,58 +86,7 @@ class GeoTiffDataSource layer.toMultibandRaster(tileCols, cols.toInt, rows.toInt) } else { require(parameters.crs.nonEmpty, "A destination CRS must be provided") - require(tileCols.nonEmpty, "need at least one tile column") - - // Grab CRS to project into - val destCRS = parameters.crs.get - - // Select the anchoring Tile, Extent and CRS columns - val (extCol, crsCol, tileCol) = { - // Favor "ProjectedRaster" columns - val prCols = df.projRasterColumns - if (prCols.nonEmpty) { - (rf_extent(prCols.head), rf_crs(prCols.head), rf_tile(prCols.head)) - } else { - // If no "ProjectedRaster" column, look for single Extent and CRS columns. 
- val crsCols = df.crsColumns - require(crsCols.size == 1, "Exactly one CRS column must be in DataFrame") - val extentCols = df.extentColumns - require(extentCols.size == 1, "Exactly one Extent column must be in DataFrame") - (extentCols.head, crsCols.head, tileCols.head) - } - } - - // Scan table and constuct what the TileLayerMetadata would be in the specified destination CRS. - val tlm: TileLayerMetadata[SpatialKey] = df - .select( - ProjectedLayerMetadataAggregate( - destCRS, - extCol, - crsCol, - rf_cell_type(tileCol), - rf_dimensions(tileCol) - )) - .first() - logger.debug(s"Contructed TileLayerMetadata: ${tlm.toString}") - - val c = ProjectedRasterDefinition(tlm) - - val config = parameters.rasterDimensions - .map { dims => - c.copy(totalCols = dims.cols, totalRows = dims.rows) - } - .getOrElse(c) - - val aggs = tileCols - .map(t => TileRasterizerAggregate(config, crsCol, extCol, rf_tile(t))("tile").as(t.columnName)) - - val agg = df.select(aggs: _*) - - val row = agg.first() - - val bands = for (i <- 0 until row.size) yield row.getAs[Tile](i) - - ProjectedRaster(MultibandTile(bands), tlm.extent, tlm.crs) + TileRasterizerAggregate(df, parameters.crs.get, None, parameters.rasterDimensions) } val tags = Tags( diff --git a/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala b/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala index 5b757c9cc..bddb2657b 100644 --- a/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala +++ b/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala @@ -20,6 +20,8 @@ package org.locationtech.rasterframes.experimental.datasource.awspds +import geotrellis.proj4.LatLng +import geotrellis.vector.Extent import org.apache.spark.sql.functions._ import org.locationtech.rasterframes._ import org.locationtech.rasterframes.datasource.raster._ @@ -32,7 +34,7 @@ import org.locationtech.rasterframes.datasource.raster._ class L8CatalogRelationTest extends TestEnvironment { import spark.implicits._ - val catalog = spark.read.l8Catalog.load().cache() + val catalog = spark.read.l8Catalog.load() val scenes = catalog .where($"acquisition_date" === to_timestamp(lit("2017-04-04 15:12:55.394"))) @@ -106,26 +108,30 @@ class L8CatalogRelationTest extends TestEnvironment { } it("should construct an RGB composite") { - val aoi = "LINESTRING (31.115 29.963, 31.148 29.99)" - val sceneCat = catalog + val aoi = Extent(31.115, 29.963, 31.148, 29.99) + val scene = catalog .where( to_date($"acquisition_date") === to_date(lit("2019-07-03")) && - st_intersects(st_geometry($"bounds_wgs84"), st_geomFromWKT(aoi)) + st_intersects(st_geometry($"bounds_wgs84"), geomLit(aoi.jtsGeom)) ) + .orderBy("cloud_cover_pct") + .limit(1) - catalog.orderBy(desc("acquisition_date")).select($"acquisition_date").show(false) - catalog.where(to_date($"acquisition_date") === to_date(lit("2019-03-07"))).show(false) - - //sceneCat.show(false) - + val df = spark.read.raster + .fromCatalog(scene, "B4", "B3", "B2") + .withTileDimensions(256, 256) + .load() + .where(st_contains(rf_geometry($"B4"), st_reproject(geomLit(aoi.jtsGeom), lit("EPSG:4326"), rf_crs($"B4")))) -// val df = spark.read.raster -// .fromCatalog(scenes, "B4", "B3", "B2") -// .withTileDimensions(128, 128) -// .load() -// .where + val raster = df.aggregateRaster(LatLng, aoi, None) + println(raster) +// import 
geotrellis.raster.io.geotiff.{GeoTiffOptions, MultibandGeoTiff, Tiled} +// import geotrellis.raster.io.geotiff.compression.{DeflateCompression} +// import geotrellis.raster.io.geotiff.tags.codes.ColorSpace +// val tiffOptions = GeoTiffOptions(Tiled, DeflateCompression, ColorSpace.RGB) +// MultibandGeoTiff(raster, raster.crs, tiffOptions).write("target/composite.tif") } } } From 87d7c9db7d3ca666873f9a7b12306e07924f70f9 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Tue, 3 Sep 2019 12:26:54 -0400 Subject: [PATCH 19/48] IT test build fix. --- .../org/locationtech/rasterframes/ref/RasterRefIT.scala | 9 ++++++--- .../datasource/awspds/L8CatalogRelationTest.scala | 7 +++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala b/core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala index fee8e9e08..e9a9f3474 100644 --- a/core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala +++ b/core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala @@ -24,9 +24,9 @@ package org.locationtech.rasterframes.ref import java.net.URI import geotrellis.proj4.LatLng - import geotrellis.vector.Extent import org.locationtech.rasterframes._ +import org.locationtech.rasterframes.expressions.aggregates.TileRasterizerAggregate class RasterRefIT extends TestEnvironment { describe("practical subregion reads") { @@ -44,14 +44,17 @@ class RasterRefIT extends TestEnvironment { val blue = RasterRef(RasterSource(scene(2)), 0, Some(area), None) val rf = Seq((red, green, blue)).toDF("red", "green", "blue") - val raster = rf.select( + val df = rf.select( rf_crs($"red"), rf_extent($"red"), rf_tile($"red"), rf_tile($"green"), rf_tile($"blue")) - .toDF.aggregateRaster(redScene.crs, None) + .toDF + + val raster = TileRasterizerAggregate(df, redScene.crs, None, None) forEvery(raster.tile.statisticsDouble) { stats => stats should be ('defined) stats.get.dataCells shouldBe > (1000L) } + //import geotrellis.raster.io.geotiff.{GeoTiffOptions, MultibandGeoTiff, Tiled} //import geotrellis.raster.io.geotiff.compression.{DeflateCompression, NoCompression} //import geotrellis.raster.io.geotiff.tags.codes.ColorSpace diff --git a/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala b/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala index bddb2657b..c41a6a223 100644 --- a/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala +++ b/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala @@ -25,6 +25,7 @@ import geotrellis.vector.Extent import org.apache.spark.sql.functions._ import org.locationtech.rasterframes._ import org.locationtech.rasterframes.datasource.raster._ +import org.locationtech.rasterframes.expressions.aggregates.TileRasterizerAggregate /** * Test rig for L8 catalog stuff. 
@@ -124,8 +125,10 @@ class L8CatalogRelationTest extends TestEnvironment { .where(st_contains(rf_geometry($"B4"), st_reproject(geomLit(aoi.jtsGeom), lit("EPSG:4326"), rf_crs($"B4")))) - val raster = df.aggregateRaster(LatLng, aoi, None) - println(raster) + noException should be thrownBy { + val raster = TileRasterizerAggregate(df, LatLng, Some(aoi), None) + println(raster) + } // import geotrellis.raster.io.geotiff.{GeoTiffOptions, MultibandGeoTiff, Tiled} // import geotrellis.raster.io.geotiff.compression.{DeflateCompression} From 98cbdfda25c7995bf5ca8560845b5c6c1e3ac230 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Tue, 3 Sep 2019 15:50:37 -0400 Subject: [PATCH 20/48] Ignoring RGB composite tests until next round of improvements. --- .../scala/org/locationtech/rasterframes/ref/RasterRefIT.scala | 2 +- .../experimental/datasource/awspds/L8CatalogRelationTest.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala b/core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala index e9a9f3474..713285717 100644 --- a/core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala +++ b/core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala @@ -30,7 +30,7 @@ import org.locationtech.rasterframes.expressions.aggregates.TileRasterizerAggreg class RasterRefIT extends TestEnvironment { describe("practical subregion reads") { - it("should construct a natural color composite") { + ignore("should construct a natural color composite") { import spark.implicits._ def scene(idx: Int) = URI.create(s"https://landsat-pds.s3.us-west-2.amazonaws.com" + s"/c1/L8/176/039/LC08_L1TP_176039_20190703_20190718_01_T1/LC08_L1TP_176039_20190703_20190718_01_T1_B$idx.TIF") diff --git a/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala b/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala index c41a6a223..863410123 100644 --- a/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala +++ b/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala @@ -108,7 +108,7 @@ class L8CatalogRelationTest extends TestEnvironment { stats.mean shouldBe > (10000.0) } - it("should construct an RGB composite") { + ignore("should construct an RGB composite") { val aoi = Extent(31.115, 29.963, 31.148, 29.99) val scene = catalog .where( From 9b9f99351bc54b1553473a6585faa30e1129b4c9 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Tue, 3 Sep 2019 16:46:39 -0400 Subject: [PATCH 21/48] Updated ExplodeTiles to work with proj_raster type. 
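ExplodeTiles previously deserialized cells via TileUDT's serializer, so only
plain `tile` columns exploded; `proj_raster` struct columns failed. The
change routes both through DynamicExtractors.tileExtractor, a
PartialFunction keyed on the column's DataType, which also lets
TileColumnSupport detect tile-like columns with isDefinedAt. A minimal
sketch of that dispatch idea (hypothetical helper, not the project's API):

    import org.apache.spark.sql.types.{DataType, StructField, StructType}

    // Partition a schema into tile-like and other fields. `tileLike` stands
    // in for DynamicExtractors.tileExtractor, which additionally decodes
    // the underlying value for each supported DataType.
    def splitTileFields(schema: StructType, tileLike: PartialFunction[DataType, Unit])
        : (Array[StructField], Array[StructField]) =
      schema.fields.partition(f => tileLike.isDefinedAt(f.dataType))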
--- .../expressions/generators/ExplodeTiles.scala | 14 ++++---- .../rasterframes/ml/TileColumnSupport.scala | 8 ++--- .../rasterframes/ml/TileExploderSpec.scala | 32 +++++++++++++++---- docs/src/main/paradox/release-notes.md | 4 +++ .../src/main/python/docs/vector-data.pymd | 2 +- 5 files changed, 42 insertions(+), 18 deletions(-) diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala index bd2a4689a..2a70be585 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala @@ -21,16 +21,15 @@ package org.locationtech.rasterframes.expressions.generators -import org.locationtech.rasterframes._ -import org.locationtech.rasterframes.encoders.CatalystSerializer._ -import org.locationtech.rasterframes.util._ import geotrellis.raster._ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{Expression, Generator, GenericInternalRow} -import org.apache.spark.sql.rf.TileUDT import org.apache.spark.sql.types._ +import org.locationtech.rasterframes._ +import org.locationtech.rasterframes.expressions.DynamicExtractors +import org.locationtech.rasterframes.util._ import spire.syntax.cfor.cfor /** @@ -67,8 +66,11 @@ case class ExplodeTiles( override def eval(input: InternalRow): TraversableOnce[InternalRow] = { val tiles = Array.ofDim[Tile](children.length) cfor(0)(_ < tiles.length, _ + 1) { index => - val row = children(index).eval(input).asInstanceOf[InternalRow] - tiles(index) = if(row != null) row.to[Tile](TileUDT.tileSerializer) else null + val c = children(index) + val row = c.eval(input).asInstanceOf[InternalRow] + tiles(index) = if(row != null) + DynamicExtractors.tileExtractor(c.dataType)(row)._1 + else null } val dims = tiles.filter(_ != null).map(_.dimensions) if(dims.isEmpty) Seq.empty[InternalRow] diff --git a/core/src/main/scala/org/locationtech/rasterframes/ml/TileColumnSupport.scala b/core/src/main/scala/org/locationtech/rasterframes/ml/TileColumnSupport.scala index d261f7e91..5e8a2537f 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/ml/TileColumnSupport.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/ml/TileColumnSupport.scala @@ -21,8 +21,8 @@ package org.locationtech.rasterframes.ml -import org.apache.spark.sql.rf.TileUDT import org.apache.spark.sql.types.{StructField, StructType} +import org.locationtech.rasterframes.expressions.DynamicExtractors /** * Utility mix-in for separating out tile columns from non-tile columns. 
@@ -31,13 +31,11 @@ import org.apache.spark.sql.types.{StructField, StructType} */ trait TileColumnSupport { protected def isTile(field: StructField) = - field.dataType.typeName.equalsIgnoreCase(TileUDT.typeName) + DynamicExtractors.tileExtractor.isDefinedAt(field.dataType) type TileFields = Array[StructField] type NonTileFields = Array[StructField] protected def selectTileAndNonTileFields(schema: StructType): (TileFields, NonTileFields) = { - val tiles = schema.fields.filter(isTile) - val nonTiles = schema.fields.filterNot(isTile) - (tiles, nonTiles) + schema.fields.partition(f => DynamicExtractors.tileExtractor.isDefinedAt(f.dataType)) } } diff --git a/core/src/test/scala/org/locationtech/rasterframes/ml/TileExploderSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/ml/TileExploderSpec.scala index 2d9e2d04c..b79f1bdf8 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/ml/TileExploderSpec.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/ml/TileExploderSpec.scala @@ -21,28 +21,48 @@ package org.locationtech.rasterframes.ml -import org.locationtech.rasterframes.TestData -import geotrellis.raster.Tile -import org.apache.spark.sql.functions.lit -import org.locationtech.rasterframes.TestEnvironment +import geotrellis.proj4.LatLng +import geotrellis.raster.{IntCellType, Tile} +import org.apache.spark.sql.functions.{avg, lit} +import org.locationtech.rasterframes.{TestData, TestEnvironment} /** * * @since 2/16/18 */ class TileExploderSpec extends TestEnvironment with TestData { describe("Tile explode transformer") { - it("should explode tiles") { - import spark.implicits._ + import spark.implicits._ + it("should explode tile") { val df = Seq[(Tile, Tile)]((byteArrayTile, byteArrayTile)).toDF("tile1", "tile2").withColumn("other", lit("stuff")) val exploder = new TileExploder() val newSchema = exploder.transformSchema(df.schema) val exploded = exploder.transform(df) + assert(newSchema === exploded.schema) assert(exploded.columns.length === 5) assert(exploded.count() === 9) write(exploded) + exploded.agg(avg($"tile1")).as[Double].first() should be (byteArrayTile.statisticsDouble.get.mean) + } + + it("should explode proj_raster") { + val randPRT = TestData.projectedRasterTile(10, 10, scala.util.Random.nextInt(), extent, LatLng, IntCellType) + + val df = Seq(randPRT).toDF("proj_raster").withColumn("other", lit("stuff")) + + val exploder = new TileExploder() + val newSchema = exploder.transformSchema(df.schema) + + val exploded = exploder.transform(df) + + assert(newSchema === exploded.schema) + assert(exploded.columns.length === 4) + assert(exploded.count() === randPRT.size) + write(exploded) + + exploded.agg(avg($"proj_raster")).as[Double].first() should be (randPRT.statisticsDouble.get.mean) } } } diff --git a/docs/src/main/paradox/release-notes.md b/docs/src/main/paradox/release-notes.md index 2a108de99..e750c6d76 100644 --- a/docs/src/main/paradox/release-notes.md +++ b/docs/src/main/paradox/release-notes.md @@ -2,6 +2,10 @@ ## 0.8.x +### 0.8.2 + +* Fixed `TileExploder` to support `proj_raster` struct [(#287)](https://github.com/locationtech/rasterframes/issues/287). + ### 0.8.1 * Added `rf_local_no_data`, `rf_local_data` and `rf_interpret_cell_type_as` raster functions. 
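The same fix is exercised from Python via the ML transformer. Below is a hedged sketch, assuming `TileExploder` is importable from the top-level `pyrasterframes` package (see `tests/ExploderTests.py` for the exact import) and using a placeholder raster path.

```python
# Sketch under the assumptions noted above.
from pyrasterframes import TileExploder  # assumed import location

df = spark.read.raster('some_raster.tif')  # yields a proj_raster struct column
exploded = TileExploder().transform(df)    # one row per cell, scalar value columns
exploded.printSchema()
```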
diff --git a/pyrasterframes/src/main/python/docs/vector-data.pymd b/pyrasterframes/src/main/python/docs/vector-data.pymd index 8d50f20db..1762565ad 100644 --- a/pyrasterframes/src/main/python/docs/vector-data.pymd +++ b/pyrasterframes/src/main/python/docs/vector-data.pymd @@ -93,7 +93,7 @@ l8_filtered = l8 \ .filter(st_intersects(l8.geom, st_bufferPoint(l8.paducah, lit(50000.0)))) \ .filter(l8.acquisition_date > '2018-02-01') \ .filter(l8.acquisition_date < '2018-04-01') -l8_filtered.select('product_id', 'entity_id', 'acquisition_date', 'cloud_cover_pct').toPandas() +l8_filtered.select('product_id', 'entity_id', 'acquisition_date', 'cloud_cover_pct') ``` [GeoPandas]: http://geopandas.org From ddd8f0efcc1520c0b3285d67e44a767ffa58ed0a Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Wed, 4 Sep 2019 21:16:58 -0400 Subject: [PATCH 22/48] Add failing unit test for issue 333, error in rf_agg_local_mean Signed-off-by: Jason T. Brown --- .../rasterframes/RasterFunctionsSpec.scala | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala index ed7fc1dcc..473c1370b 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala @@ -525,6 +525,23 @@ class RasterFunctionsSpec extends TestEnvironment with RasterMatchers { checkDocs("rf_agg_local_max") } + ignore("should compute local mean") { + checkDocs("rf_agg_local_mean") + // https://github.com/locationtech/rasterframes/issues/333 + val df = Seq(two, three, one, six).toDF("tile") + .withColumn("id", monotonically_increasing_id()) + df.select(rf_agg_local_mean($"tile")).first() should be(three.toArrayTile()) + + df.selectExpr("rf_agg_local_mean(tile)").as[Tile].first() should be(three.toArrayTile()) + + noException should be thrownBy { + df.groupBy($"id") + .agg(rf_agg_local_mean($"tile")) + .collect() + } + + } + it("should compute local data cell counts") { val df = Seq(two, randNDPRT, nd).toDF("tile") val t1 = df.select(rf_agg_local_data_cells($"tile")).first() From b92012e4ac9924458151907d05752187adb5a31d Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Fri, 6 Sep 2019 11:15:45 -0400 Subject: [PATCH 23/48] Fix for #333 and additional tests in that vein. 
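A hedged Python sketch of the failure mode behind #333 follows; `df` stands in for any DataFrame with a `tile` column, such as one produced by `spark.read.raster`.

```python
# Illustrative sketch, not code from the patch.
from pyspark.sql.functions import monotonically_increasing_id
from pyrasterframes.rasterfunctions import rf_agg_local_mean

# Whole-column local mean: a single tile whose cells are the per-cell means.
means = df.agg(rf_agg_local_mean('tile'))

# The grouped form is what issue #333 reported as failing before this fix.
per_group = df.withColumn('id', monotonically_increasing_id()) \
    .groupBy('id') \
    .agg(rf_agg_local_mean('tile'))
per_group.collect()
```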
--- .../aggregates/LocalMeanAggregate.scala | 7 +++-- .../transformers/RGBComposite.scala | 1 - .../extensions/DataFrameMethods.scala | 5 +-- .../rasterframes/RasterFunctionsSpec.scala | 5 ++- .../rasterframes/TileStatsSpec.scala | 31 +++++++++++++++++++ 5 files changed, 38 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalMeanAggregate.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalMeanAggregate.scala index 06741a98c..0bb23cb9e 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalMeanAggregate.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/LocalMeanAggregate.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Column, TypedColumn} import org.locationtech.rasterframes.TileType +import org.locationtech.rasterframes.expressions.accessors.RealizeTile @ExpressionDescription( usage = "_FUNC_(tile) - Computes a new tile contining the mean cell values across all tiles in column.", @@ -58,11 +59,11 @@ case class LocalMeanAggregate(child: Expression) extends UnaryRasterAggregate { ) override lazy val updateExpressions: Seq[Expression] = Seq( If(IsNull(count), - SetCellType(Defined(child), Literal("int32")), - If(IsNull(child), count, BiasedAdd(count, Defined(child))) + SetCellType(RealizeTile(Defined(child)), Literal("int32")), + If(IsNull(child), count, BiasedAdd(count, Defined(RealizeTile(child)))) ), If(IsNull(sum), - SetCellType(child, Literal("float64")), + SetCellType(RealizeTile(child), Literal("float64")), If(IsNull(child), sum, BiasedAdd(sum, child)) ) ) diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/transformers/RGBComposite.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/transformers/RGBComposite.scala index c214b8ef9..5b266dd06 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/transformers/RGBComposite.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/transformers/RGBComposite.scala @@ -33,7 +33,6 @@ import org.locationtech.rasterframes._ import org.locationtech.rasterframes.encoders.CatalystSerializer._ import org.locationtech.rasterframes.expressions.DynamicExtractors.tileExtractor import org.locationtech.rasterframes.expressions.row -import org.locationtech.rasterframes.tiles.ProjectedRasterTile /** * Expression to combine the given tile columns into an 32-bit RGB composite. 
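`RGBComposite` resolves its inputs through `DynamicExtractors.tileExtractor`, so it accepts plain `tile` columns as well as `proj_raster` structs. For reference, a hedged Python sketch of building a composite from a catalog, assuming `cat` is a catalog DataFrame with `red`/`green`/`blue` URI columns (mirroring the `test_render_composite` test later in this series):

```python
# Sketch under the stated assumptions.
from pyrasterframes.rasterfunctions import rf_tile, rf_rgb_composite

rf = spark.read.raster(cat, catalog_col_names=['red', 'green', 'blue'])
rgb = rf.select(rf_tile(rf_rgb_composite('red', 'green', 'blue')).alias('rgb'))
```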
diff --git a/core/src/main/scala/org/locationtech/rasterframes/extensions/DataFrameMethods.scala b/core/src/main/scala/org/locationtech/rasterframes/extensions/DataFrameMethods.scala index 1ee3a21fa..9a57b9dd8 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/extensions/DataFrameMethods.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/extensions/DataFrameMethods.scala @@ -22,7 +22,6 @@ package org.locationtech.rasterframes.extensions import geotrellis.proj4.CRS -import geotrellis.raster.{MultibandTile, ProjectedRaster} import geotrellis.spark.io._ import geotrellis.spark.{SpaceTimeKey, SpatialComponent, SpatialKey, TemporalKey, TileLayerMetadata} import geotrellis.util.MethodExtensions @@ -33,9 +32,7 @@ import org.apache.spark.sql.{Column, DataFrame, TypedColumn} import org.locationtech.rasterframes.StandardColumns._ import org.locationtech.rasterframes.encoders.CatalystSerializer._ import org.locationtech.rasterframes.encoders.StandardEncoders._ -import org.locationtech.rasterframes.expressions.{DynamicExtractors, aggregates} -import org.locationtech.rasterframes.expressions.aggregates.TileRasterizerAggregate -import org.locationtech.rasterframes.model.TileDimensions +import org.locationtech.rasterframes.expressions.DynamicExtractors import org.locationtech.rasterframes.tiles.ProjectedRasterTile import org.locationtech.rasterframes.util._ import org.locationtech.rasterframes.{MetadataKeys, RasterFrameLayer} diff --git a/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala index 473c1370b..0d4e02e83 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala @@ -525,11 +525,11 @@ class RasterFunctionsSpec extends TestEnvironment with RasterMatchers { checkDocs("rf_agg_local_max") } - ignore("should compute local mean") { + it("should compute local mean") { checkDocs("rf_agg_local_mean") - // https://github.com/locationtech/rasterframes/issues/333 val df = Seq(two, three, one, six).toDF("tile") .withColumn("id", monotonically_increasing_id()) + df.select(rf_agg_local_mean($"tile")).first() should be(three.toArrayTile()) df.selectExpr("rf_agg_local_mean(tile)").as[Tile].first() should be(three.toArrayTile()) @@ -539,7 +539,6 @@ class RasterFunctionsSpec extends TestEnvironment with RasterMatchers { .agg(rf_agg_local_mean($"tile")) .collect() } - } it("should compute local data cell counts") { diff --git a/core/src/test/scala/org/locationtech/rasterframes/TileStatsSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/TileStatsSpec.scala index 50920ab1c..0c4e36513 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/TileStatsSpec.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/TileStatsSpec.scala @@ -24,6 +24,7 @@ package org.locationtech.rasterframes import geotrellis.raster._ import geotrellis.raster.mapalgebra.local.{Max, Min} import geotrellis.spark._ +import org.apache.spark.sql.Column import org.apache.spark.sql.functions._ import org.locationtech.rasterframes.TestData.randomTile import org.locationtech.rasterframes.stats.CellHistogram @@ -317,4 +318,34 @@ class TileStatsSpec extends TestEnvironment with TestData { ndCount2 should be(count + 1) } } + + describe("proj_raster handling") { + it("should handle proj_raster structures") { + val df = Seq(lazyPRT, lazyPRT).toDF("tile") + + val targets = Seq[Column => Column]( + 
rf_is_no_data_tile, + rf_data_cells, + rf_no_data_cells, + rf_agg_local_max, + rf_agg_local_min, + rf_agg_local_mean, + rf_agg_local_data_cells, + rf_agg_local_no_data_cells, + rf_agg_local_stats, + rf_agg_approx_histogram, + rf_tile_histogram, + rf_tile_stats, + rf_tile_mean, + rf_tile_max, + rf_tile_min + ) + + forEvery(targets) { f => + noException shouldBe thrownBy { + df.select(f($"tile")).collect() + } + } + } + } } From 3614338188d1d400afa5101fef650a5c9b68d6ad Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Fri, 6 Sep 2019 11:41:11 -0400 Subject: [PATCH 24/48] Run python tile exploder test for projected raster Signed-off-by: Jason T. Brown --- pyrasterframes/src/main/python/tests/ExploderTests.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyrasterframes/src/main/python/tests/ExploderTests.py b/pyrasterframes/src/main/python/tests/ExploderTests.py index 05ff958ae..f7635ad7a 100644 --- a/pyrasterframes/src/main/python/tests/ExploderTests.py +++ b/pyrasterframes/src/main/python/tests/ExploderTests.py @@ -33,7 +33,6 @@ class ExploderTests(TestEnvironment): - @unittest.skip("See issue https://github.com/locationtech/rasterframes/issues/163") def test_tile_exploder_pipeline_for_prt(self): # NB the tile is a Projected Raster Tile df = self.spark.read.raster(self.img_uri) From e34258059b81730f8f5cdd05ec91b6668774a38f Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Fri, 6 Sep 2019 13:03:53 -0400 Subject: [PATCH 25/48] PR feedback Signed-off-by: Jason T. Brown --- .../main/python/pyrasterframes/__init__.py | 21 +++--- .../main/python/tests/RasterFunctionsTests.py | 2 +- .../src/main/python/tests/RasterSourceTest.py | 68 +++++++++---------- 3 files changed, 45 insertions(+), 46 deletions(-) diff --git a/pyrasterframes/src/main/python/pyrasterframes/__init__.py b/pyrasterframes/src/main/python/pyrasterframes/__init__.py index da8ca81a6..be064bf4b 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/__init__.py +++ b/pyrasterframes/src/main/python/pyrasterframes/__init__.py @@ -117,19 +117,20 @@ def _raster_reader( lazy_tiles=True, **options): """ - Returns a Spark DataFrame from raster data files specified by URI pointers - The returned DataFrame will have a column of (CRS, Extent, Tile) for each URI read - Multiple bands from the same raster file are spread across rows of the DataFrame. See band_indexes param. - If bands from a scene are stored in separate files, provide a DataFrame to the `source` parameter. Each row in the returned DataFrame will contain one (CRS, Extent, Tile) for each item in `catalog_col_names` + Returns a Spark DataFrame from raster data files specified by URIs. + Each row in the returned DataFrame will contain a column with struct of (CRS, Extent, Tile) for each item in + `catalog_col_names`. + Multiple bands from the same raster file are spread across rows of the DataFrame. See `band_indexes` param. + If bands from a scene are stored in separate files, provide a DataFrame to the `source` parameter. For more details and example usage, consult https://rasterframes.io/raster-read.html - :param source: a string, list of strings, list of lists of strings, a pandas DataFrame or a Spark DataFrame giving URIs to the raster data to read - :param catalog_col_names: required if source is a DataFrame or CSV string. 
It is a list of strings giving the names of columns containing URIs to read - :param band_indexes: list of integers indicating which bands, zero-based, to read from the raster files specified; default is to read only the first band - :param tile_dimensions: tuple or list of two indicating the default tile dimension as (columns, rows) - :param lazy_tiles: If true (default) only generate minimal references to tile contents; if false, fetch tile cell values - :param options: Additional keyword arguments to pass to the spark DataSource + :param source: a string, list of strings, list of lists of strings, a Pandas DataFrame or a Spark DataFrame giving URIs to the raster data to read. + :param catalog_col_names: required if `source` is a DataFrame or CSV string. It is a list of strings giving the names of columns containing URIs to read. + :param band_indexes: list of integers indicating which bands, zero-based, to read from the raster files specified; default is to read only the first band. + :param tile_dimensions: tuple or list of two indicating the default tile dimension as (columns, rows). + :param lazy_tiles: If true (default) only generate minimal references to tile contents; if false, fetch tile cell values. + :param options: Additional keyword arguments to pass to the Spark DataSource. """ from pandas import DataFrame as PdDataFrame diff --git a/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py b/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py index 0d8528418..ac89c2448 100644 --- a/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py +++ b/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py @@ -286,7 +286,7 @@ def test_render_composite(self): cat = self.spark.createDataFrame([ Row(red=self.l8band_uri(4), green=self.l8band_uri(3), blue=self.l8band_uri(2)) ]) - rf = self.spark.read.raster(catalog=cat, catalog_col_names=cat.columns) + rf = self.spark.read.raster(cat, catalog_col_names=cat.columns) # Test composite construction rgb = rf.select(rf_tile(rf_rgb_composite('red', 'green', 'blue')).alias('rgb')).first()['rgb'] diff --git a/pyrasterframes/src/main/python/tests/RasterSourceTest.py b/pyrasterframes/src/main/python/tests/RasterSourceTest.py index 509ed6293..5f7967a49 100644 --- a/pyrasterframes/src/main/python/tests/RasterSourceTest.py +++ b/pyrasterframes/src/main/python/tests/RasterSourceTest.py @@ -21,6 +21,8 @@ from pyrasterframes.rasterfunctions import * from pyrasterframes.rf_types import * from pyspark.sql.functions import * +import pandas as pd +from shapely.geometry import Point import os.path from unittest import skip from . 
import TestEnvironment @@ -41,6 +43,14 @@ def path(scene, band): p = scene_dict[scene] return p.format(band) + def path_pandas_df(self): + return pd.DataFrame([ + {'b1': self.path(1, 1), 'b2': self.path(1, 2), 'b3': self.path(1, 3), 'geo': Point(1, 1)}, + {'b1': self.path(2, 1), 'b2': self.path(2, 2), 'b3': self.path(2, 3), 'geo': Point(2, 2)}, + {'b1': self.path(3, 1), 'b2': self.path(3, 2), 'b3': self.path(3, 3), 'geo': Point(3, 3)}, + ]) + + def test_handle_lazy_eval(self): df = self.spark.read.raster(self.path(1, 1)) ltdf = df.select('proj_raster') @@ -129,59 +139,41 @@ def test_schemeless_string(self): self.assertTrue(df.count() > 0) def test_spark_df_source(self): - import pandas as pd + catalog_columns = ['b1', 'b2', 'b3'] + catalog = self.spark.createDataFrame(self.path_pandas_df()) - # Create a pandas dataframe (makes it easy to create spark df) - path_pandas = pd.DataFrame([ - {'b1': self.path(1, 1), 'b2': self.path(1, 2), 'b3': self.path(1, 3)}, - {'b1': self.path(2, 1), 'b2': self.path(2, 2), 'b3': self.path(2, 3)}, - {'b1': self.path(3, 1), 'b2': self.path(3, 2), 'b3': self.path(3, 3)}, - ]) - # comma separated list of column names containing URI's to read. - catalog_columns = path_pandas.columns.tolist() - path_table = self.spark.createDataFrame(path_pandas) - - path_df = self.spark.read.raster( - path_table, + df = self.spark.read.raster( + catalog, tile_dimensions=(512, 512), catalog_col_names=catalog_columns, lazy_tiles=True # We'll get an OOM error if we try to read 9 scenes all at once! ) - self.assertTrue(len(path_df.columns) == 6) # three bands times {path, tile} - self.assertTrue(path_df.select('b1_path').distinct().count() == 3) # as per scene_dict - b1_paths_maybe = path_df.select('b1_path').distinct().collect() + self.assertTrue(len(df.columns) == 7) # three bands times {path, tile} plus geo + self.assertTrue(df.select('b1_path').distinct().count() == 3) # as per scene_dict + b1_paths_maybe = df.select('b1_path').distinct().collect() b1_paths = [self.path(s, 1) for s in [1, 2, 3]] self.assertTrue(all([row.b1_path in b1_paths for row in b1_paths_maybe])) def test_pandas_source(self): - import pandas as pd - import geopandas - from shapely.geometry import Point - # Create a pandas dataframe (makes it easy to create spark df) - path_pandas = pd.DataFrame([ - {'b1': self.path(1, 1), 'b2': self.path(1, 2), 'b3': self.path(1, 3), 'geo': Point(1, 1)}, - {'b1': self.path(2, 1), 'b2': self.path(2, 2), 'b3': self.path(2, 3), 'geo': Point(2, 2)}, - {'b1': self.path(3, 1), 'b2': self.path(3, 2), 'b3': self.path(3, 3), 'geo': Point(3, 3)}, - ]) - - # here a subtle difference with the test_raster_source_catalog_reader test, feed the DataFrame - # not a CSV and not an already created spark DF. 
df = self.spark.read.raster( - path_pandas, + self.path_pandas_df(), catalog_col_names=['b1', 'b2', 'b3'] ) self.assertEqual(len(df.columns), 7) # three path cols, three tile cols, and geo self.assertTrue('geo' in df.columns) self.assertTrue(df.select('b1_path').distinct().count() == 3) - # Same test with geopandas - geo_df = geopandas.GeoDataFrame(path_pandas, crs={'init': 'EPSG:4326'}, geometry='geo') - df2 = self.spark.read.raster(geo_df, ['b1', 'b2', 'b3']) - self.assertEqual(len(df2.columns), 7) # three path cols, three tile cols, and geo - self.assertTrue('geo' in df2.columns) - self.assertTrue(df2.select('b1_path').distinct().count() == 3) + def test_geopandas_source(self): + from geopandas import GeoDataFrame + # Same test as test_pandas_source with geopandas + geo_df = GeoDataFrame(self.path_pandas_df(), crs={'init': 'EPSG:4326'}, geometry='geo') + df = self.spark.read.raster(geo_df, ['b1', 'b2', 'b3']) + + self.assertEqual(len(df.columns), 7) # three path cols, three tile cols, and geo + self.assertTrue('geo' in df.columns) + self.assertTrue(df.select('b1_path').distinct().count() == 3) def test_csv_string(self): @@ -198,3 +190,9 @@ def test_csv_string(self): df = self.spark.read.raster(s, ['b1', 'b2']) self.assertEqual(len(df.columns), 3 + 2) # number of columns in original DF plus cardinality of catalog_col_names self.assertTrue(len(df.take(1))) # non-empty check + + def test_catalog_named_arg(self): + # through version 0.8.1 reading a catalog was via named argument only. + df = self.spark.read.raster(catalog=self.path_pandas_df(), catalog_col_names=['b1', 'b2', 'b3']) + self.assertEqual(len(df.columns), 7) # three path cols, three tile cols, and geo + self.assertTrue(df.select('b1_path').distinct().count() == 3) From 74f09dfb9666385867e0745b761072de03667a36 Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Fri, 6 Sep 2019 13:27:13 -0400 Subject: [PATCH 26/48] Normalize RasterSourceDataSource param names between python and SQL Signed-off-by: Jason T. 
Brown --- .../raster/RasterSourceDataSource.scala | 12 +++++----- .../src/main/python/docs/languages.pymd | 6 ++--- .../main/python/pyrasterframes/__init__.py | 24 +++++++++---------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala index 6cea717ec..5aec1a065 100644 --- a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala +++ b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala @@ -44,12 +44,12 @@ object RasterSourceDataSource { final val SHORT_NAME = "raster" final val PATH_PARAM = "path" final val PATHS_PARAM = "paths" - final val BAND_INDEXES_PARAM = "bandIndexes" - final val TILE_DIMS_PARAM = "tileDimensions" - final val CATALOG_TABLE_PARAM = "catalogTable" - final val CATALOG_TABLE_COLS_PARAM = "catalogColumns" - final val CATALOG_CSV_PARAM = "catalogCSV" - final val LAZY_TILES_PARAM = "lazyTiles" + final val BAND_INDEXES_PARAM = "band_indexes" + final val TILE_DIMS_PARAM = "tile_dimensions" + final val CATALOG_TABLE_PARAM = "catalog_table" + final val CATALOG_TABLE_COLS_PARAM = "catalog_col_names" + final val CATALOG_CSV_PARAM = "catalog_csv" + final val LAZY_TILES_PARAM = "lazy_tiles" final val DEFAULT_COLUMN_NAME = PROJECTED_RASTER_COLUMN.columnName diff --git a/pyrasterframes/src/main/python/docs/languages.pymd b/pyrasterframes/src/main/python/docs/languages.pymd index 2a54c5124..ead79337f 100644 --- a/pyrasterframes/src/main/python/docs/languages.pymd +++ b/pyrasterframes/src/main/python/docs/languages.pymd @@ -97,9 +97,9 @@ sql(""" CREATE OR REPLACE TEMPORARY VIEW red_nir_tiles_monthly_2017 USING raster OPTIONS ( - catalogTable='red_nir_monthly_2017', - catalogColumns='red,nir', - tileDimensions='256,256' + catalog_table='red_nir_monthly_2017', + catalog_col_names='red,nir', + tile_dimensions='256,256' ) """) ``` diff --git a/pyrasterframes/src/main/python/pyrasterframes/__init__.py b/pyrasterframes/src/main/python/pyrasterframes/__init__.py index be064bf4b..905323d60 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/__init__.py +++ b/pyrasterframes/src/main/python/pyrasterframes/__init__.py @@ -153,9 +153,9 @@ def temp_name(): band_indexes = [0] options.update({ - "bandIndexes": to_csv(band_indexes), - "tileDimensions": to_csv(tile_dimensions), - "lazyTiles": lazy_tiles + "band_indexes": to_csv(band_indexes), + "tile_dimensions": to_csv(tile_dimensions), + "lazy_tiles": lazy_tiles }) # Parse the `source` argument @@ -166,11 +166,11 @@ def temp_name(): catalog = None options.update(dict(paths='\n'.join([str(i) for i in source]))) # pass in "uri1\nuri2\nuri3\n..." 
if all([isinstance(i, list) for i in source]): - # list of lists; we will rely on pandas to - # - coerce all data to str (possibly using objects' __str__ or __repr__\ + # list of lists; we will rely on pandas to: + # - coerce all data to str (possibly using objects' __str__ or __repr__) # - ensure data is not "ragged": all sublists are same len path = None - catalog_col_names = ['proj_raster_{}'.format(i) for i in range(len(source[0]))] + catalog_col_names = ['proj_raster_{}'.format(i) for i in range(len(source[0]))] # assign these names catalog = PdDataFrame(source, columns=catalog_col_names, dtype=str, @@ -194,8 +194,8 @@ def temp_name(): if isinstance(catalog, str): options.update({ - "catalogCSV": catalog, - "catalogColumns": to_csv(catalog_col_names) + "catalog_csv": catalog, + "catalog_col_names": to_csv(catalog_col_names) }) elif isinstance(catalog, DataFrame): # check catalog_col_names @@ -205,8 +205,8 @@ def temp_name(): tmp_name = temp_name() catalog.createOrReplaceTempView(tmp_name) options.update({ - "catalogTable": tmp_name, - "catalogColumns": to_csv(catalog_col_names) + "catalog_table": tmp_name, + "catalog_col_names": to_csv(catalog_col_names) }) elif isinstance(catalog, PdDataFrame): # check catalog_col_names @@ -220,8 +220,8 @@ def temp_name(): spark_catalog = session.createDataFrame(catalog) spark_catalog.createOrReplaceTempView(tmp_name) options.update({ - "catalogTable": tmp_name, - "catalogColumns": to_csv(catalog_col_names) + "catalog_table": tmp_name, + "catalog_col_names": to_csv(catalog_col_names) }) return df_reader \ From e713b24a7ea9821185c499e2255ba177b40cad76 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Fri, 6 Sep 2019 15:11:43 -0400 Subject: [PATCH 27/48] Tweaked parquet I/O tests to trigger UDT issue. --- .../scala/org/locationtech/rasterframes/TestEnvironment.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/org/locationtech/rasterframes/TestEnvironment.scala b/core/src/test/scala/org/locationtech/rasterframes/TestEnvironment.scala index 87ab2559d..9dbe7574b 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/TestEnvironment.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/TestEnvironment.scala @@ -62,9 +62,9 @@ trait TestEnvironment extends FunSpec with GeoTrellisTestEnvironment val dest = Files.createTempFile(Paths.get(outputLocalPath), "rf", ".parquet") logger.trace(s"Writing '${sanitized.columns.mkString(", ")}' to '$dest'...") sanitized.write.mode(SaveMode.Overwrite).parquet(dest.toString) - val rows = df.sparkSession.read.parquet(dest.toString).count() + val rows = df.sparkSession.read.parquet(dest.toString).collect() logger.trace(s" read back $rows row(s)") - rows == inRows + rows.length == inRows } /** From 593e8634263c139cf0fd502e3aff7f06b59b0bfd Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Mon, 9 Sep 2019 09:40:30 -0400 Subject: [PATCH 28/48] Fixes #338. 
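Issue #338 concerned Parquet round-trips of lazily read rasters (the `RasterRef` struct embedding a `RasterSource`). A rough sketch of the kind of round-trip the updated test helper verifies; both paths are placeholders and this is not code from the patch.

```python
# Illustrative round-trip, assuming a readable raster at the placeholder path.
df = spark.read.raster('some_raster.tif', lazy_tiles=True)
df.write.mode('overwrite').parquet('/tmp/rf_roundtrip.parquet')

read_back = spark.read.parquet('/tmp/rf_roundtrip.parquet')
# collect() forces full reification of the lazy tile references.
assert len(read_back.collect()) == df.count()
```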
--- .../scala/org/locationtech/rasterframes/ref/RasterRef.scala | 2 +- .../org/locationtech/rasterframes/TestEnvironment.scala | 6 ++++-- docs/src/main/paradox/release-notes.md | 6 ++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/locationtech/rasterframes/ref/RasterRef.scala b/core/src/main/scala/org/locationtech/rasterframes/ref/RasterRef.scala index ce91c0e00..9cd259456 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/ref/RasterRef.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/ref/RasterRef.scala @@ -77,7 +77,7 @@ object RasterRef extends LazyLogging { implicit val rasterRefSerializer: CatalystSerializer[RasterRef] = new CatalystSerializer[RasterRef] { val rsType = new RasterSourceUDT() override def schema: StructType = StructType(Seq( - StructField("source", rsType, false), + StructField("source", rsType.sqlType, false), StructField("bandIndex", IntegerType, false), StructField("subextent", schemaOf[Extent], true), StructField("subgrid", schemaOf[GridBounds], true) diff --git a/core/src/test/scala/org/locationtech/rasterframes/TestEnvironment.scala b/core/src/test/scala/org/locationtech/rasterframes/TestEnvironment.scala index 9dbe7574b..4a2d5c1cc 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/TestEnvironment.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/TestEnvironment.scala @@ -62,8 +62,10 @@ trait TestEnvironment extends FunSpec with GeoTrellisTestEnvironment val dest = Files.createTempFile(Paths.get(outputLocalPath), "rf", ".parquet") logger.trace(s"Writing '${sanitized.columns.mkString(", ")}' to '$dest'...") sanitized.write.mode(SaveMode.Overwrite).parquet(dest.toString) - val rows = df.sparkSession.read.parquet(dest.toString).collect() - logger.trace(s" read back $rows row(s)") + val in = df.sparkSession.read.parquet(dest.toString) + // NB: The `collect` ensures values get fully reified. + val rows = in.collect() + logger.trace(s" read back ${rows.length} row(s)") rows.length == inRows } diff --git a/docs/src/main/paradox/release-notes.md b/docs/src/main/paradox/release-notes.md index e750c6d76..9f500b5eb 100644 --- a/docs/src/main/paradox/release-notes.md +++ b/docs/src/main/paradox/release-notes.md @@ -1,10 +1,12 @@ -# Release Notes +# Release Notes ## 0.8.x ### 0.8.2 -* Fixed `TileExploder` to support `proj_raster` struct [(#287)](https://github.com/locationtech/rasterframes/issues/287). +* Fixed Parquet serialization issue with `RasterRef`s ([#338](https://github.com/locationtech/rasterframes/issues/338)) +* Fixed `TileExploder`, `rf_agg_local_mean` and `TileColumnSupport` to support `proj_raster` struct ([#287](https://github.com/locationtech/rasterframes/issues/287), [#163](https://github.com/locationtech/rasterframes/issues/163), [#333](https://github.com/locationtech/rasterframes/issues/333)). +* Various documentation improvements. ### 0.8.1 From d1d5f12f9935cdb7babaf0a665e2e3eb14b0e405 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Wed, 11 Sep 2019 10:09:30 -0400 Subject: [PATCH 29/48] Fleshed out details on using Scala. 
Closes #324 --- .../src/main/python/docs/getting-started.pymd | 23 +++++++++++++++++-- .../src/main/python/docs/languages.pymd | 6 +++-- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/pyrasterframes/src/main/python/docs/getting-started.pymd b/pyrasterframes/src/main/python/docs/getting-started.pymd index eea5169a7..3ed254413 100644 --- a/pyrasterframes/src/main/python/docs/getting-started.pymd +++ b/pyrasterframes/src/main/python/docs/getting-started.pymd @@ -6,7 +6,7 @@ If you are new to Earth-observing imagery, you might consider looking at the [Co @@@ -There are @ref:[several ways](getting-started.md#other-options) to use RasterFrames, and @ref:[several languages](languages.md) with which you can use it. +There are @ref:[several ways](getting-started.md#other-options) to use RasterFrames, and @ref:[several languages (Scala, SQL)](languages.md) with which you can use it. The simplest way to get started with RasterFrames is via the [Docker image](#using-jupyter-notebook), or from the Python shell. To get started with the Python shell you will need: @@ -107,7 +107,26 @@ SparkSession available as 'spark'. Now you have the configured SparkSession with RasterFrames enabled. -## Installing GDAL +### Scala Development + +There is first-class support for Scala in RasterFrames. See the @ref:[Scala and SQL](languages.md) page for an example application, and the [Scala API Documentation](latest/api/index.html) for function details. + +If you would like to use RasterFrames in Scala, you'll need to add the following resolvers and depednencies to your sbt project: + +```scala +resolvers ++= Seq( + "locationtech-releases" at "https://repo.locationtech.org/content/groups/releases", + "Azavea Public Builds" at "https://dl.bintray.com/azavea/geotrellis" +) +libraryDependencies ++= Seq( + "org.locationtech.rasterframes" %% "rasterframes" % ${VERSION}, + "org.locationtech.rasterframes" %% "rasterframes-datasource" % ${VERSION}, + // This is optional. Provides access to AWS PDS catalogs. + "org.locationtech.rasterframes" %% "rasterframes-experimental" % ${VERSION} +) +``` + +## Installing GDAL Support GDAL provides a wide variety of drivers to read data from many different raster formats. If GDAL is installed in the environment, RasterFrames will be able to @ref:[read](raster-read.md) those formats. If you are using the @ref:[Jupyter Notebook image](getting-started.md#jupyter-notebook), GDAL is already installed for you. Otherwise follow the instructions below. Version 2.4.1 or greater is required. diff --git a/pyrasterframes/src/main/python/docs/languages.pymd b/pyrasterframes/src/main/python/docs/languages.pymd index 1d4895e2e..743e02e45 100644 --- a/pyrasterframes/src/main/python/docs/languages.pymd +++ b/pyrasterframes/src/main/python/docs/languages.pymd @@ -1,6 +1,8 @@ -# API Languages +# Scala and SQL -One of the great powers of RasterFrames, afforded by Spark SQL, is the ability to express computation in multiple programming languages. This documentation focuses on Python because it is the most commonly used language in data science and GIS analytics. However, Scala (the implementation language of RasterFrames) and SQL are also fully supported. Examples in Python can be mechanically translated into the other two languages without much difficulty once the naming conventions are understood. In the sections below we will show the same example program in each language. We will compute the average NDVI per month for a single _tile_ in Tanzania. 
+One of the great powers of RasterFrames is the ability to express computation in multiple programming languages. The content in this manual focuses on Python because it is the most commonly used language in data science and GIS analytics. However, Scala (the implementation language of RasterFrames) and SQL (commonly used in many domains) are also fully supported. Examples in Python can be mechanically translated into the other two languages without much difficulty once the naming conventions are understood. + +In the sections below we will show the same example program in each language. To do so we will compute the average NDVI per month for a single _tile_ in Tanzania. ```python, imports, echo=False from pyspark.sql.functions import * From a019e36cd5870112a653936e2c9b5b3b578e68e8 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Thu, 12 Sep 2019 10:46:45 -0400 Subject: [PATCH 30/48] PR feedback edits. --- pyrasterframes/src/main/python/docs/getting-started.pymd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyrasterframes/src/main/python/docs/getting-started.pymd b/pyrasterframes/src/main/python/docs/getting-started.pymd index 3ed254413..27ec587c6 100644 --- a/pyrasterframes/src/main/python/docs/getting-started.pymd +++ b/pyrasterframes/src/main/python/docs/getting-started.pymd @@ -6,7 +6,7 @@ If you are new to Earth-observing imagery, you might consider looking at the [Co @@@ -There are @ref:[several ways](getting-started.md#other-options) to use RasterFrames, and @ref:[several languages (Scala, SQL)](languages.md) with which you can use it. +RasterFrames® is a geospatial raster processing library for @ref:[Python, Scala and SQL](languages.md), available through @ref:[several mechanisms](#other-options). The simplest way to get started with RasterFrames is via the [Docker image](#using-jupyter-notebook), or from the Python shell. To get started with the Python shell you will need: @@ -111,7 +111,7 @@ Now you have the configured SparkSession with RasterFrames enabled. There is first-class support for Scala in RasterFrames. See the @ref:[Scala and SQL](languages.md) page for an example application, and the [Scala API Documentation](latest/api/index.html) for function details. -If you would like to use RasterFrames in Scala, you'll need to add the following resolvers and depednencies to your sbt project: +If you would like to use RasterFrames in Scala, you'll need to add the following resolvers and dependencies to your sbt project: ```scala resolvers ++= Seq( From 1c1db3c61567c8cd8f9311421671656a073bebf6 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Thu, 12 Sep 2019 13:48:03 -0400 Subject: [PATCH 31/48] Benchmark and fix for CellType reification issue. 
Closes #343 --- bench/archive/jmh-results-20190912131522.json | 108 +++ bench/archive/jmh-results-20190912133231.json | 108 +++ .../rasterframes/bench/CellTypeBench.scala | 52 ++ .../encoders/StandardSerializers.scala | 16 +- .../src/main/python/scene_30_27_model.ipynb | 655 ++++++++++++++++++ 5 files changed, 937 insertions(+), 2 deletions(-) create mode 100644 bench/archive/jmh-results-20190912131522.json create mode 100644 bench/archive/jmh-results-20190912133231.json create mode 100644 bench/src/main/scala/org/locationtech/rasterframes/bench/CellTypeBench.scala create mode 100644 pyrasterframes/src/main/python/scene_30_27_model.ipynb diff --git a/bench/archive/jmh-results-20190912131522.json b/bench/archive/jmh-results-20190912131522.json new file mode 100644 index 000000000..c92a18ba3 --- /dev/null +++ b/bench/archive/jmh-results-20190912131522.json @@ -0,0 +1,108 @@ +[ + { + "jmhVersion" : "1.21", + "benchmark" : "org.locationtech.rasterframes.bench.CellTypeBench.fromRow", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "primaryMetric" : { + "score" : 2.835836726694286, + "scoreError" : 0.19270422040266105, + "scoreConfidence" : [ + 2.643132506291625, + 3.028540947096947 + ], + "scorePercentiles" : { + "0.0" : 2.795428349144724, + "50.0" : 2.833201108807787, + "90.0" : 2.918007291796378, + "95.0" : 2.918007291796378, + "99.0" : 2.918007291796378, + "99.9" : 2.918007291796378, + "99.99" : 2.918007291796378, + "99.999" : 2.918007291796378, + "99.9999" : 2.918007291796378, + "100.0" : 2.918007291796378 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 2.795428349144724, + 2.79552918770274, + 2.837017696019802, + 2.918007291796378, + 2.833201108807787 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "org.locationtech.rasterframes.bench.CellTypeBench.intoRow", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "primaryMetric" : { + "score" : 0.35239272354175755, + "scoreError" : 0.007379508933042929, + "scoreConfidence" : [ + 0.3450132146087146, + 0.3597722324748005 + ], + "scorePercentiles" : { + "0.0" : 0.34961161284617265, + "50.0" : 0.3525031701325926, + "90.0" : 0.3550145438728285, + "95.0" : 0.3550145438728285, + "99.0" : 0.3550145438728285, + "99.9" : 0.3550145438728285, + "99.99" : 0.3550145438728285, + "99.999" : 0.3550145438728285, + "99.9999" : 0.3550145438728285, + "100.0" : 0.3550145438728285 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 0.3550145438728285, + 0.3522314263912252, + 0.3525031701325926, + 0.34961161284617265, + 0.3526028644659687 + ] + ] + }, + "secondaryMetrics" : { + } + } +] + + diff --git a/bench/archive/jmh-results-20190912133231.json b/bench/archive/jmh-results-20190912133231.json new file mode 100644 index 
000000000..e73e25e85 --- /dev/null +++ b/bench/archive/jmh-results-20190912133231.json @@ -0,0 +1,108 @@ +[ + { + "jmhVersion" : "1.21", + "benchmark" : "org.locationtech.rasterframes.bench.CellTypeBench.fromRow", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "primaryMetric" : { + "score" : 0.09014145135858256, + "scoreError" : 0.002394012003996672, + "scoreConfidence" : [ + 0.0877474393545859, + 0.09253546336257923 + ], + "scorePercentiles" : { + "0.0" : 0.08936698388038906, + "50.0" : 0.08989743516282808, + "90.0" : 0.09083590280173164, + "95.0" : 0.09083590280173164, + "99.0" : 0.09083590280173164, + "99.9" : 0.09083590280173164, + "99.99" : 0.09083590280173164, + "99.999" : 0.09083590280173164, + "99.9999" : 0.09083590280173164, + "100.0" : 0.09083590280173164 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 0.08989743516282808, + 0.09072300570225225, + 0.08988392924571173, + 0.09083590280173164, + 0.08936698388038906 + ] + ] + }, + "secondaryMetrics" : { + } + }, + { + "jmhVersion" : "1.21", + "benchmark" : "org.locationtech.rasterframes.bench.CellTypeBench.intoRow", + "mode" : "avgt", + "threads" : 1, + "forks" : 1, + "jvm" : "/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home/jre/bin/java", + "jvmArgs" : [ + "-Xmx4g" + ], + "jdkVersion" : "1.8.0_171", + "vmName" : "Java HotSpot(TM) 64-Bit Server VM", + "vmVersion" : "25.171-b11", + "warmupIterations" : 8, + "warmupTime" : "10 s", + "warmupBatchSize" : 1, + "measurementIterations" : 5, + "measurementTime" : "10 s", + "measurementBatchSize" : 1, + "primaryMetric" : { + "score" : 0.0804223243178353, + "scoreError" : 0.003232629425106684, + "scoreConfidence" : [ + 0.07718969489272862, + 0.08365495374294199 + ], + "scorePercentiles" : { + "0.0" : 0.07979611499160112, + "50.0" : 0.08015109899040546, + "90.0" : 0.08189719727590557, + "95.0" : 0.08189719727590557, + "99.0" : 0.08189719727590557, + "99.9" : 0.08189719727590557, + "99.99" : 0.08189719727590557, + "99.999" : 0.08189719727590557, + "99.9999" : 0.08189719727590557, + "100.0" : 0.08189719727590557 + }, + "scoreUnit" : "us/op", + "rawData" : [ + [ + 0.08189719727590557, + 0.08020909063591224, + 0.07979611499160112, + 0.08005811969535213, + 0.08015109899040546 + ] + ] + }, + "secondaryMetrics" : { + } + } +] + + diff --git a/bench/src/main/scala/org/locationtech/rasterframes/bench/CellTypeBench.scala b/bench/src/main/scala/org/locationtech/rasterframes/bench/CellTypeBench.scala new file mode 100644 index 000000000..dfc88f855 --- /dev/null +++ b/bench/src/main/scala/org/locationtech/rasterframes/bench/CellTypeBench.scala @@ -0,0 +1,52 @@ +/* + * This software is licensed under the Apache 2 license, quoted below. + * + * Copyright 2019 Astraea, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * [http://www.apache.org/licenses/LICENSE-2.0] + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + * + * SPDX-License-Identifier: Apache-2.0 + * + */ + +package org.locationtech.rasterframes.bench +import java.util.concurrent.TimeUnit + +import geotrellis.raster.{CellType, DoubleUserDefinedNoDataCellType, IntUserDefinedNoDataCellType} +import org.apache.spark.sql.catalyst.InternalRow +import org.locationtech.rasterframes.encoders.CatalystSerializer._ +import org.openjdk.jmh.annotations._ + +@BenchmarkMode(Array(Mode.AverageTime)) +@State(Scope.Benchmark) +@OutputTimeUnit(TimeUnit.MICROSECONDS) +class CellTypeBench { + var row: InternalRow = _ + var ct: CellType = _ + @Setup(Level.Trial) + def setupData(): Unit = { + ct = IntUserDefinedNoDataCellType(scala.util.Random.nextInt()) + val o: CellType = DoubleUserDefinedNoDataCellType(scala.util.Random.nextDouble()) + row = o.toInternalRow + } + + @Benchmark + def fromRow(): CellType = { + row.to[CellType] + } + + @Benchmark + def intoRow(): InternalRow = { + ct.toInternalRow + } +} diff --git a/core/src/main/scala/org/locationtech/rasterframes/encoders/StandardSerializers.scala b/core/src/main/scala/org/locationtech/rasterframes/encoders/StandardSerializers.scala index caf1965cc..5494ff98c 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/encoders/StandardSerializers.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/encoders/StandardSerializers.scala @@ -21,6 +21,7 @@ package org.locationtech.rasterframes.encoders +import com.github.blemale.scaffeine.Scaffeine import geotrellis.proj4.CRS import geotrellis.raster._ import geotrellis.spark._ @@ -106,14 +107,16 @@ trait StandardSerializers { } implicit val cellTypeSerializer: CatalystSerializer[CellType] = new CatalystSerializer[CellType] { + + override def schema: StructType = StructType(Seq( StructField("cellTypeName", StringType, false) )) override def to[R](t: CellType, io: CatalystIO[R]): R = io.create( - io.encode(t.toString()) + io.encode(StandardSerializers.ct2sCache.get(t)) ) override def from[R](row: R, io: CatalystIO[R]): CellType = - CellType.fromName(io.getString(row, 0)) + StandardSerializers.s2ctCache.get(io.getString(row, 0)) } implicit val projectedExtentSerializer: CatalystSerializer[ProjectedExtent] = new CatalystSerializer[ProjectedExtent] { @@ -293,3 +296,12 @@ trait StandardSerializers { implicit val spaceTimeKeyTLMSerializer = tileLayerMetadataSerializer[SpaceTimeKey] } + +object StandardSerializers { + private val s2ctCache = Scaffeine().build[String, CellType]( + (s: String) => CellType.fromName(s) + ) + private val ct2sCache = Scaffeine().build[CellType, String]( + (ct: CellType) => ct.toString() + ) +} diff --git a/pyrasterframes/src/main/python/scene_30_27_model.ipynb b/pyrasterframes/src/main/python/scene_30_27_model.ipynb new file mode 100644 index 000000000..2281c61c6 --- /dev/null +++ b/pyrasterframes/src/main/python/scene_30_27_model.ipynb @@ -0,0 +1,655 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install requests tqdm geopandas rasterio" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], 
+ "source": [ + "# local dev env cruft\n", + "import sys\n", + "sys.path.insert(0, '/Users/sfitch/Coding/earthai/src/main/python/')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": {} + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "

SparkSession - in-memory

\n", + " \n", + "
\n", + "

SparkContext

\n", + "\n", + "

Spark UI

\n", + "\n", + "
\n", + "
Version
\n", + "
v2.3.4
\n", + "
Master
\n", + "
local[*]
\n", + "
AppName
\n", + "
pyspark-shell
\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from earthai import *\n", + "from pyrasterframes.rasterfunctions import *\n", + "import geomesa_pyspark.types\n", + "from earthai import earth_ondemand\n", + "\n", + "import pyrasterframes\n", + "# spark = pyrasterframes.get_spark_session()\n", + "from pyspark.sql.functions import lit, rand, when, col, array\n", + "from pyspark.sql import SparkSession\n", + "from pyrasterframes import utils\n", + "\n", + "spark = SparkSession.builder \\\n", + " .master('local[*]') \\\n", + " .config('spark.driver.memory', '12g') \\\n", + " .config('spark.jars', pyrasterframes.utils.find_pyrasterframes_assembly()) \\\n", + " .config('spark.serializer',\t'org.apache.spark.serializer.KryoSerializer') \\\n", + " .config('spark.kryoserializer.buffer.max', '2047m') \\\n", + " .getOrCreate() \n", + "spark.withRasterFrames()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LandSat Crop Classification Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Pull Landsat8 from EOD" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 3/3 [00:00<00:00, 16.33it/s]\n" + ] + } + ], + "source": [ + "eod = earth_ondemand.read_catalog(\n", + " geo=[-97.1, 47.4, -97.08, 47.5],\n", + " max_cloud_cover=10,\n", + " collections='landsat8_l1tp',\n", + " start_datetime='2018-07-01T00:00:00',\n", + " end_datetime='2018-08-31T23:59:59'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "scene_df = eod[eod.eod_grid_id == \"WRS2-030027\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n" + ] + }, + { + "data": { + "text/plain": [ + "'https://landsat-pds.s3.us-west-2.amazonaws.com/c1/L8/030/027/LC08_L1TP_030027_20180717_20180730_01_T1/LC08_L1TP_030027_20180717_20180730_01_T1_B4.TIF'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(len(scene_df))\n", + "teh_scene = scene_df.iloc[0].B4\n", + "teh_scene" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Munge crop target" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```bash\n", + "\n", + "\n", + "aws s3 cp s3://s22s-sanda/sar-crop/target/scene_30_27_target.tif /tmp\n", + "\n", + "gdalinfo /vsicurl/https://landsat-pds.s3.us-west-2.amazonaws.com/c1/L8/030/027/LC08_L1TP_030027_20180717_20180730_01_T1/LC08_L1TP_030027_20180717_20180730_01_T1_B4.TIF\n", + "\n", + "gdalwarp -t_srs \"+proj=utm +zone=14 +datum=WGS84 +units=m +no_defs \" \\\n", + " -te 528885.000 5138685.000 760815.000 5373915.000 \\\n", + " -te_srs \"+proj=utm +zone=14 +datum=WGS84 +units=m +no_defs \" \\\n", + " -tr 30.0 30.0 \\\n", + " -co TILED=YES -co COPY_SRC_OVERVIEWS=YES -co COMPRESS=DEFLATE \\\n", + " scene_30_27_target.tif scene_30_27_target_utm.tif\n", + " \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create and Read Raster Catalogue" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/anaconda3/envs/jupyter-env/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
eod_collection_display_nameeod_collection_familyeod_collection_family_display_nameeod_grid_idcreateddatetimeeo_cloud_covereo_constellationeo_epsgeo_gsd...B2BQAB4B1B8B11collectiongeometryidtarget
0Landsat 8landsat8Landsat 8WRS2-0300272019-08-19T20:54:33.413548Z2018-07-17T17:15:57.1536740Z1.49landsat-83261430.0...https://landsat-pds.s3.us-west-2.amazonaws.com...https://landsat-pds.s3.us-west-2.amazonaws.com...https://landsat-pds.s3.us-west-2.amazonaws.com...https://landsat-pds.s3.us-west-2.amazonaws.com...https://landsat-pds.s3.us-west-2.amazonaws.com...https://landsat-pds.s3.us-west-2.amazonaws.com...landsat8_l1tp(POLYGON ((-98.62404379679178 46.4012557977134...LC08_L1TP_030027_20180717_20180730_01_T1_L1TPfile:///tmp/scene_30_27_target_utm.tif
\n", + "

1 rows × 33 columns

\n", + "
" + ], + "text/plain": [ + " eod_collection_display_name eod_collection_family \\\n", + "0 Landsat 8 landsat8 \n", + "\n", + " eod_collection_family_display_name eod_grid_id \\\n", + "0 Landsat 8 WRS2-030027 \n", + "\n", + " created datetime eo_cloud_cover \\\n", + "0 2019-08-19T20:54:33.413548Z 2018-07-17T17:15:57.1536740Z 1.49 \n", + "\n", + " eo_constellation eo_epsg eo_gsd ... \\\n", + "0 landsat-8 32614 30.0 ... \n", + "\n", + " B2 \\\n", + "0 https://landsat-pds.s3.us-west-2.amazonaws.com... \n", + "\n", + " BQA \\\n", + "0 https://landsat-pds.s3.us-west-2.amazonaws.com... \n", + "\n", + " B4 \\\n", + "0 https://landsat-pds.s3.us-west-2.amazonaws.com... \n", + "\n", + " B1 \\\n", + "0 https://landsat-pds.s3.us-west-2.amazonaws.com... \n", + "\n", + " B8 \\\n", + "0 https://landsat-pds.s3.us-west-2.amazonaws.com... \n", + "\n", + " B11 collection \\\n", + "0 https://landsat-pds.s3.us-west-2.amazonaws.com... landsat8_l1tp \n", + "\n", + " geometry \\\n", + "0 (POLYGON ((-98.62404379679178 46.4012557977134... \n", + "\n", + " id \\\n", + "0 LC08_L1TP_030027_20180717_20180730_01_T1_L1TP \n", + "\n", + " target \n", + "0 file:///tmp/scene_30_27_target_utm.tif \n", + "\n", + "[1 rows x 33 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scene_df['target'] = 'file:///tmp/scene_30_27_target_utm.tif'\n", + "scene_df" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "features_rf = spark.read.raster( \n", + " catalog=scene_df,\n", + " catalog_col_names=['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'BQA', 'target'],\n", + " tile_dimensions=(256, 256)\n", + ").repartition(200)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Feature creation" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "features_rf = features_rf.withColumn('ndvi', rf_normalized_difference(features_rf.B5, features_rf.B4)) \\\n", + " .withColumn('ndwi1', rf_normalized_difference(features_rf.B5, features_rf.B6)) \\\n", + " .withColumn('ndwi2', rf_normalized_difference(features_rf.B5, features_rf.B7)) \\\n", + " .withColumn('ndwi3', rf_normalized_difference(features_rf.B3, features_rf.B5)) \\\n", + " .withColumn('evi', rf_local_multiply(rf_local_divide(rf_local_subtract(features_rf.B5, features_rf.B4), rf_local_add(rf_local_subtract(rf_local_add(features_rf.B5, rf_local_multiply(features_rf.B4, lit(6.0))), rf_local_multiply(features_rf.B2, lit(7.5))), lit(1.0))), lit(2.5))) \\\n", + " .withColumn('savi', rf_local_multiply(rf_local_divide(rf_local_subtract(features_rf.B5, features_rf.B4), rf_local_add(rf_local_add(features_rf.B5, features_rf.B4), lit(0.5))), lit(1.5))) \\\n", + " .withColumn('osavi', rf_local_divide(rf_local_subtract(features_rf.B5, features_rf.B4), rf_local_add(rf_local_add(features_rf.B5, features_rf.B4), lit(0.16)))) \\\n", + " .withColumn('satvi', rf_local_subtract(rf_local_multiply(rf_local_divide(rf_local_subtract(features_rf.B6, features_rf.B4),rf_local_add(rf_local_add(features_rf.B6, features_rf.B4), lit(0.5))), lit(1.5)), rf_local_divide(features_rf.B7, lit(2.0)))) \\\n", + " .withColumn('mean_swir', rf_local_divide(rf_local_add(features_rf.B6, features_rf.B7), lit(2.0))) \\\n", + " .withColumn('vli', rf_local_divide(rf_local_add(rf_local_add(rf_local_add(features_rf.B1, features_rf.B2), features_rf.B3), features_rf.B4), lit(4.0))) \\\n", + " .withColumn('dbsi', 
rf_local_subtract(rf_normalized_difference(features_rf.B6, features_rf.B3), rf_normalized_difference(features_rf.B5, features_rf.B4)))\n", + "\n", + "features_rf = features_rf.select(\n", + " features_rf.target,\n", + " rf_crs(features_rf.B1).alias('crs'),\n", + " rf_extent(features_rf.B1).alias('extent'),\n", + " rf_tile(features_rf.B1).alias('coastal'),\n", + " rf_tile(features_rf.B2).alias('blue'),\n", + " rf_tile(features_rf.B3).alias('green'),\n", + " rf_tile(features_rf.B4).alias('red'),\n", + " rf_tile(features_rf.B5).alias('nir'),\n", + " rf_tile(features_rf.B6).alias('swir1'),\n", + " rf_tile(features_rf.B7).alias('swir2'),\n", + " rf_tile(features_rf.ndvi).alias('ndvi'),\n", + " rf_tile(features_rf.ndwi1).alias('ndwi1'),\n", + " rf_tile(features_rf.ndwi2).alias('ndwi2'),\n", + " rf_tile(features_rf.ndwi3).alias('ndwi3'),\n", + " rf_tile(features_rf.evi).alias('evi'),\n", + " rf_tile(features_rf.savi).alias('savi'),\n", + " rf_tile(features_rf.osavi).alias('osavi'),\n", + " rf_tile(features_rf.satvi).alias('satvi'),\n", + " rf_tile(features_rf.mean_swir).alias('mean_swir'),\n", + " rf_tile(features_rf.vli).alias('vli'),\n", + " rf_tile(features_rf.dbsi).alias('dbsi'),\n", + " rf_tile(features_rf.BQA).alias('qa')\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- target: struct (nullable = true)\n", + " | |-- tile_context: struct (nullable = false)\n", + " | | |-- extent: struct (nullable = false)\n", + " | | | |-- xmin: double (nullable = false)\n", + " | | | |-- ymin: double (nullable = false)\n", + " | | | |-- xmax: double (nullable = false)\n", + " | | | |-- ymax: double (nullable = false)\n", + " | | |-- crs: struct (nullable = false)\n", + " | | | |-- crsProj4: string (nullable = false)\n", + " | |-- tile: tile (nullable = false)\n", + " |-- crs: struct (nullable = true)\n", + " | |-- crsProj4: string (nullable = false)\n", + " |-- extent: struct (nullable = true)\n", + " | |-- xmin: double (nullable = false)\n", + " | |-- ymin: double (nullable = false)\n", + " | |-- xmax: double (nullable = false)\n", + " | |-- ymax: double (nullable = false)\n", + " |-- coastal: tile (nullable = true)\n", + " |-- blue: tile (nullable = true)\n", + " |-- green: tile (nullable = true)\n", + " |-- red: tile (nullable = true)\n", + " |-- nir: tile (nullable = true)\n", + " |-- swir1: tile (nullable = true)\n", + " |-- swir2: tile (nullable = true)\n", + " |-- ndvi: tile (nullable = true)\n", + " |-- ndwi1: tile (nullable = true)\n", + " |-- ndwi2: tile (nullable = true)\n", + " |-- ndwi3: tile (nullable = true)\n", + " |-- evi: tile (nullable = true)\n", + " |-- savi: tile (nullable = true)\n", + " |-- osavi: tile (nullable = true)\n", + " |-- satvi: tile (nullable = true)\n", + " |-- mean_swir: tile (nullable = true)\n", + " |-- vli: tile (nullable = true)\n", + " |-- dbsi: tile (nullable = true)\n", + " |-- mask: tile (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "# Values of qa band indicating cloudy conditions\n", + "cloud = [2800, 2804, 2808, 2812, 6896, 6900, 6904, 6908]\n", + "\n", + "mask_part = features_rf \\\n", + " .withColumn('cloud1', rf_local_equal('qa', lit(2800))) \\\n", + " .withColumn('cloud2', rf_local_equal('qa', lit(2804))) \\\n", + " .withColumn('cloud3', rf_local_equal('qa', lit(2808))) \\\n", + " .withColumn('cloud4', rf_local_equal('qa', lit(2812))) \\\n", + " .withColumn('cloud5', rf_local_equal('qa', lit(6896))) \\\n", + 
" .withColumn('cloud6', rf_local_equal('qa', lit(6900))) \\\n", + " .withColumn('cloud7', rf_local_equal('qa', lit(6904))) \\\n", + " .withColumn('cloud8', rf_local_equal('qa', lit(6908))) \n", + "\n", + "df_mask_inv = mask_part \\\n", + " .withColumn('mask', rf_local_add('cloud1', 'cloud2')) \\\n", + " .withColumn('mask', rf_local_add('mask', 'cloud3')) \\\n", + " .withColumn('mask', rf_local_add('mask', 'cloud4')) \\\n", + " .withColumn('mask', rf_local_add('mask', 'cloud5')) \\\n", + " .withColumn('mask', rf_local_add('mask', 'cloud6')) \\\n", + " .withColumn('mask', rf_local_add('mask', 'cloud7')) \\\n", + " .withColumn('mask', rf_local_add('mask', 'cloud8')) \\\n", + " .drop('cloud1', 'cloud2', 'cloud3', 'cloud4', 'cloud5', 'cloud6', 'cloud7', 'cloud8', 'qa')\n", + " \n", + "# at this point the mask contains 0 for good cells and 1 for defect, etc\n", + "# convert cell type and set value 1 to NoData\n", + "# also set the value of 100 to nodata in the target. #darkarts\n", + "mask_rf = df_mask_inv.withColumn('mask', rf_with_no_data(rf_convert_cell_type('mask', 'uint8'), 1.0)) \\\n", + " .withColumn('target', rf_with_no_data('target', 100))\n", + "\n", + "mask_rf.printSchema()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train/test split" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "rf = mask_rf.withColumn('train_set', when(rand(seed=1234) > 0.3, 1).otherwise(0))\n", + "train_df = rf.filter(rf.train_set == 1)\n", + "test_df = rf.filter(rf.train_set == 0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create ML Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# exploded_df = train_df.select(rf_explode_tiles(array('coastal','blue','green','red','nir','swir1','swir2','ndvi','ndwi1','ndwi2','ndwi3','evi','savi','osavi','satvi','mean_swir','vli','dbsi','target', 'mask')))\n", + "# exploded_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from pyrasterframes import TileExploder\n", + "from pyrasterframes.rf_types import NoDataFilter\n", + "\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml.classification import DecisionTreeClassifier\n", + "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", + "from pyspark.ml import Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "exploder = TileExploder()\n", + "\n", + "noDataFilter = NoDataFilter() \\\n", + " .setInputCols(['target', 'mask'])\n", + "\n", + "assembler = VectorAssembler() \\\n", + " .setInputCols(['coastal','blue','green','red','nir','swir1','swir2','ndvi','ndwi1','ndwi2','ndwi3','evi','savi','osavi','satvi','mean_swir','vli','dbsi']) \\\n", + " .setOutputCol(\"features\")\n", + "\n", + "classifier = DecisionTreeClassifier() \\\n", + " .setLabelCol('target') \\\n", + " .setMaxDepth(10) \\\n", + " .setFeaturesCol(assembler.getOutputCol())\n", + "\n", + "pipeline = Pipeline() \\\n", + " .setStages([exploder, noDataFilter, assembler, classifier])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train the model" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "model = pipeline.fit(train_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "#model.transform(train_df).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prediction_df = model.transform(test_df) \\\n", + " .drop(assembler.getOutputCol()).cache()\n", + "prediction_df.printSchema()\n", + "\n", + "eval = MulticlassClassificationEvaluator(\n", + " predictionCol=classifier.getPredictionCol(),\n", + " labelCol=classifier.getLabelCol(),\n", + " metricName='fMeasureByThreshold'\n", + ")\n", + "\n", + "f1score = eval.evaluate(prediction_df)\n", + "print(\"\\nF1 Score:\", f1score)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cnf_mtrx = prediction_df.groupBy(classifier.getPredictionCol()) \\\n", + " .pivot(classifier.getLabelCol()) \\\n", + " .count() \\\n", + " .sort(classifier.getPredictionCol())\n", + "cnf_mtrx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From c19ad68dfa4bc0c17e3ebd3062c35e0cc7b7ff11 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Thu, 12 Sep 2019 15:00:37 -0400 Subject: [PATCH 32/48] Changed CatalystSerialize implementations to store scheams as fields rather than methods. 
--- .../apache/spark/sql/rf/RasterSourceUDT.scala | 2 +- .../org/apache/spark/sql/rf/TileUDT.scala | 2 +- .../encoders/StandardSerializers.scala | 35 +++++++++---------- .../ProjectedLayerMetadataAggregate.scala | 6 ++-- .../rasterframes/model/CellContext.scala | 2 +- .../rasterframes/model/Cells.scala | 2 +- .../rasterframes/model/TileContext.scala | 2 +- .../rasterframes/model/TileDataContext.scala | 2 +- .../rasterframes/model/TileDimensions.scala | 2 +- .../rasterframes/ref/RasterRef.scala | 2 +- .../tiles/ProjectedRasterTile.scala | 2 +- 11 files changed, 29 insertions(+), 30 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/sql/rf/RasterSourceUDT.scala b/core/src/main/scala/org/apache/spark/sql/rf/RasterSourceUDT.scala index 772bde6fe..51d204b58 100644 --- a/core/src/main/scala/org/apache/spark/sql/rf/RasterSourceUDT.scala +++ b/core/src/main/scala/org/apache/spark/sql/rf/RasterSourceUDT.scala @@ -73,7 +73,7 @@ object RasterSourceUDT { implicit val rasterSourceSerializer: CatalystSerializer[RasterSource] = new CatalystSerializer[RasterSource] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("raster_source_kryo", BinaryType, false) )) diff --git a/core/src/main/scala/org/apache/spark/sql/rf/TileUDT.scala b/core/src/main/scala/org/apache/spark/sql/rf/TileUDT.scala index 66c0d98a1..094a07cf0 100644 --- a/core/src/main/scala/org/apache/spark/sql/rf/TileUDT.scala +++ b/core/src/main/scala/org/apache/spark/sql/rf/TileUDT.scala @@ -74,7 +74,7 @@ case object TileUDT { implicit def tileSerializer: CatalystSerializer[Tile] = new CatalystSerializer[Tile] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("cell_context", schemaOf[TileDataContext], false), StructField("cell_data", schemaOf[Cells], false) )) diff --git a/core/src/main/scala/org/locationtech/rasterframes/encoders/StandardSerializers.scala b/core/src/main/scala/org/locationtech/rasterframes/encoders/StandardSerializers.scala index 5494ff98c..1983f8bb9 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/encoders/StandardSerializers.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/encoders/StandardSerializers.scala @@ -37,7 +37,7 @@ import org.locationtech.rasterframes.model.LazyCRS trait StandardSerializers { implicit val envelopeSerializer: CatalystSerializer[Envelope] = new CatalystSerializer[Envelope] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("minX", DoubleType, false), StructField("maxX", DoubleType, false), StructField("minY", DoubleType, false), @@ -54,7 +54,7 @@ trait StandardSerializers { } implicit val extentSerializer: CatalystSerializer[Extent] = new CatalystSerializer[Extent] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("xmin", DoubleType, false), StructField("ymin", DoubleType, false), StructField("xmax", DoubleType, false), @@ -72,7 +72,7 @@ trait StandardSerializers { } implicit val gridBoundsSerializer: CatalystSerializer[GridBounds] = new CatalystSerializer[GridBounds] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("colMin", IntegerType, false), StructField("rowMin", IntegerType, false), StructField("colMax", IntegerType, false), @@ -92,7 +92,7 @@ trait StandardSerializers { } implicit val crsSerializer: CatalystSerializer[CRS] = 
new CatalystSerializer[CRS] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("crsProj4", StringType, false) )) override def to[R](t: CRS, io: CatalystIO[R]): R = io.create( @@ -107,20 +107,19 @@ trait StandardSerializers { } implicit val cellTypeSerializer: CatalystSerializer[CellType] = new CatalystSerializer[CellType] { - - - override def schema: StructType = StructType(Seq( + import StandardSerializers._ + override val schema: StructType = StructType(Seq( StructField("cellTypeName", StringType, false) )) override def to[R](t: CellType, io: CatalystIO[R]): R = io.create( - io.encode(StandardSerializers.ct2sCache.get(t)) + io.encode(ct2sCache.get(t)) ) override def from[R](row: R, io: CatalystIO[R]): CellType = - StandardSerializers.s2ctCache.get(io.getString(row, 0)) + s2ctCache.get(io.getString(row, 0)) } implicit val projectedExtentSerializer: CatalystSerializer[ProjectedExtent] = new CatalystSerializer[ProjectedExtent] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("extent", schemaOf[Extent], false), StructField("crs", schemaOf[CRS], false) )) @@ -137,7 +136,7 @@ trait StandardSerializers { } implicit val spatialKeySerializer: CatalystSerializer[SpatialKey] = new CatalystSerializer[SpatialKey] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("col", IntegerType, false), StructField("row", IntegerType, false) )) @@ -154,7 +153,7 @@ trait StandardSerializers { } implicit val spacetimeKeySerializer: CatalystSerializer[SpaceTimeKey] = new CatalystSerializer[SpaceTimeKey] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("col", IntegerType, false), StructField("row", IntegerType, false), StructField("instant", LongType, false) @@ -174,7 +173,7 @@ trait StandardSerializers { } implicit val cellSizeSerializer: CatalystSerializer[CellSize] = new CatalystSerializer[CellSize] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("width", DoubleType, false), StructField("height", DoubleType, false) )) @@ -191,7 +190,7 @@ trait StandardSerializers { } implicit val tileLayoutSerializer: CatalystSerializer[TileLayout] = new CatalystSerializer[TileLayout] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("layoutCols", IntegerType, false), StructField("layoutRows", IntegerType, false), StructField("tileCols", IntegerType, false), @@ -214,7 +213,7 @@ trait StandardSerializers { } implicit val layoutDefinitionSerializer = new CatalystSerializer[LayoutDefinition] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("extent", schemaOf[Extent], true), StructField("tileLayout", schemaOf[TileLayout], true) )) @@ -231,7 +230,7 @@ trait StandardSerializers { } implicit def boundsSerializer[T >: Null: CatalystSerializer]: CatalystSerializer[KeyBounds[T]] = new CatalystSerializer[KeyBounds[T]] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("minKey", schemaOf[T], true), StructField("maxKey", schemaOf[T], true) )) @@ -248,7 +247,7 @@ trait StandardSerializers { } def tileLayerMetadataSerializer[T >: Null: CatalystSerializer]: CatalystSerializer[TileLayerMetadata[T]] = new 
CatalystSerializer[TileLayerMetadata[T]] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("cellType", schemaOf[CellType], false), StructField("layout", schemaOf[LayoutDefinition], false), StructField("extent", schemaOf[Extent], false), @@ -276,7 +275,7 @@ trait StandardSerializers { implicit def rasterSerializer: CatalystSerializer[Raster[Tile]] = new CatalystSerializer[Raster[Tile]] { import org.apache.spark.sql.rf.TileUDT.tileSerializer - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("tile", TileType, false), StructField("extent", schemaOf[Extent], false) )) diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/ProjectedLayerMetadataAggregate.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/ProjectedLayerMetadataAggregate.scala index 0f1b4727a..f181d0eef 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/ProjectedLayerMetadataAggregate.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/ProjectedLayerMetadataAggregate.scala @@ -118,7 +118,7 @@ object ProjectedLayerMetadataAggregate { private[expressions] object InputRecord { implicit val serializer: CatalystSerializer[InputRecord] = new CatalystSerializer[InputRecord]{ - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("extent", CatalystSerializer[Extent].schema, false), StructField("crs", CatalystSerializer[CRS].schema, false), StructField("cellType", CatalystSerializer[CellType].schema, false), @@ -147,7 +147,7 @@ object ProjectedLayerMetadataAggregate { } def write(buffer: MutableAggregationBuffer): Unit = { - val encoded = (this).toRow + val encoded = this.toRow for(i <- 0 until encoded.size) { buffer(i) = encoded(i) } @@ -157,7 +157,7 @@ object ProjectedLayerMetadataAggregate { private[expressions] object BufferRecord { implicit val serializer: CatalystSerializer[BufferRecord] = new CatalystSerializer[BufferRecord] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("extent", CatalystSerializer[Extent].schema, true), StructField("cellType", CatalystSerializer[CellType].schema, true), StructField("cellSize", CatalystSerializer[CellSize].schema, true) diff --git a/core/src/main/scala/org/locationtech/rasterframes/model/CellContext.scala b/core/src/main/scala/org/locationtech/rasterframes/model/CellContext.scala index 95a2e1bf0..dfc083774 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/model/CellContext.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/model/CellContext.scala @@ -29,7 +29,7 @@ import CatalystSerializer._ case class CellContext(tileContext: TileContext, tileDataContext: TileDataContext, colIndex: Short, rowIndex: Short) object CellContext { implicit val serializer: CatalystSerializer[CellContext] = new CatalystSerializer[CellContext] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("tileContext", schemaOf[TileContext], false), StructField("tileDataContext", schemaOf[TileDataContext], false), StructField("colIndex", ShortType, false), diff --git a/core/src/main/scala/org/locationtech/rasterframes/model/Cells.scala b/core/src/main/scala/org/locationtech/rasterframes/model/Cells.scala index 1f7ae4d75..3a54446e1 100644 --- 
a/core/src/main/scala/org/locationtech/rasterframes/model/Cells.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/model/Cells.scala @@ -68,7 +68,7 @@ object Cells { } implicit def cellsSerializer: CatalystSerializer[Cells] = new CatalystSerializer[Cells] { - override def schema: StructType = + override val schema: StructType = StructType( Seq( StructField("cells", BinaryType, true), diff --git a/core/src/main/scala/org/locationtech/rasterframes/model/TileContext.scala b/core/src/main/scala/org/locationtech/rasterframes/model/TileContext.scala index c848e9121..436b46982 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/model/TileContext.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/model/TileContext.scala @@ -41,7 +41,7 @@ object TileContext { case _ => None } implicit val serializer: CatalystSerializer[TileContext] = new CatalystSerializer[TileContext] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("extent", schemaOf[Extent], false), StructField("crs", schemaOf[CRS], false) )) diff --git a/core/src/main/scala/org/locationtech/rasterframes/model/TileDataContext.scala b/core/src/main/scala/org/locationtech/rasterframes/model/TileDataContext.scala index 9f6bd358f..addc4aee5 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/model/TileDataContext.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/model/TileDataContext.scala @@ -41,7 +41,7 @@ object TileDataContext { } implicit val serializer: CatalystSerializer[TileDataContext] = new CatalystSerializer[TileDataContext] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("cellType", schemaOf[CellType], false), StructField("dimensions", schemaOf[TileDimensions], false) )) diff --git a/core/src/main/scala/org/locationtech/rasterframes/model/TileDimensions.scala b/core/src/main/scala/org/locationtech/rasterframes/model/TileDimensions.scala index e419ac668..fbbdfebf1 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/model/TileDimensions.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/model/TileDimensions.scala @@ -38,7 +38,7 @@ object TileDimensions { def apply(colsRows: (Int, Int)): TileDimensions = new TileDimensions(colsRows._1, colsRows._2) implicit val serializer: CatalystSerializer[TileDimensions] = new CatalystSerializer[TileDimensions] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("cols", ShortType, false), StructField("rows", ShortType, false) )) diff --git a/core/src/main/scala/org/locationtech/rasterframes/ref/RasterRef.scala b/core/src/main/scala/org/locationtech/rasterframes/ref/RasterRef.scala index 9cd259456..a5255e663 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/ref/RasterRef.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/ref/RasterRef.scala @@ -76,7 +76,7 @@ object RasterRef extends LazyLogging { implicit val rasterRefSerializer: CatalystSerializer[RasterRef] = new CatalystSerializer[RasterRef] { val rsType = new RasterSourceUDT() - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("source", rsType.sqlType, false), StructField("bandIndex", IntegerType, false), StructField("subextent", schemaOf[Extent], true), diff --git a/core/src/main/scala/org/locationtech/rasterframes/tiles/ProjectedRasterTile.scala 
b/core/src/main/scala/org/locationtech/rasterframes/tiles/ProjectedRasterTile.scala index 6007ce3f9..235e1b851 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/tiles/ProjectedRasterTile.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/tiles/ProjectedRasterTile.scala @@ -71,7 +71,7 @@ object ProjectedRasterTile { } } implicit val serializer: CatalystSerializer[ProjectedRasterTile] = new CatalystSerializer[ProjectedRasterTile] { - override def schema: StructType = StructType(Seq( + override val schema: StructType = StructType(Seq( StructField("tile_context", schemaOf[TileContext], false), StructField("tile", TileType, false)) ) From b671207b0f922edab2615434da13587a1b9ba20a Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Fri, 13 Sep 2019 08:47:50 -0400 Subject: [PATCH 33/48] Switched Explode tiles to use UnsafeRow for slight improvement on memory pressure. Reworked TileExplodeBench --- .../rasterframes/bench/TileExplodeBench.scala | 32 +- .../expressions/generators/ExplodeTiles.scala | 17 +- project/plugins.sbt | 2 +- .../src/main/python/scene_30_27_model.ipynb | 655 ------------------ 4 files changed, 27 insertions(+), 679 deletions(-) delete mode 100644 pyrasterframes/src/main/python/scene_30_27_model.ipynb diff --git a/bench/src/main/scala/org/locationtech/rasterframes/bench/TileExplodeBench.scala b/bench/src/main/scala/org/locationtech/rasterframes/bench/TileExplodeBench.scala index 7f3352f69..4ece4cc98 100644 --- a/bench/src/main/scala/org/locationtech/rasterframes/bench/TileExplodeBench.scala +++ b/bench/src/main/scala/org/locationtech/rasterframes/bench/TileExplodeBench.scala @@ -22,11 +22,12 @@ package org.locationtech.rasterframes.bench import java.util.concurrent.TimeUnit +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.BoundReference +import org.apache.spark.sql.rf.TileUDT import org.locationtech.rasterframes._ -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ +import org.locationtech.rasterframes.expressions.generators.ExplodeTiles import org.openjdk.jmh.annotations._ - /** * * @author sfitch @@ -36,33 +37,32 @@ import org.openjdk.jmh.annotations._ @State(Scope.Benchmark) @OutputTimeUnit(TimeUnit.MILLISECONDS) class TileExplodeBench extends SparkEnv { - import spark.implicits._ - @Param(Array("uint8", "uint16ud255", "float32", "float64")) + //@Param(Array("uint8", "uint16ud255", "float32", "float64")) + @Param(Array("uint16ud255")) var cellTypeName: String = _ @Param(Array("256")) var tileSize: Int = _ - @Param(Array("100")) + @Param(Array("2000")) var numTiles: Int = _ @transient - var tiles: DataFrame = _ + var tiles: Array[InternalRow] = _ + + var exploder: ExplodeTiles = _ @Setup(Level.Trial) def setupData(): Unit = { - tiles = Seq.fill(numTiles)(randomTile(tileSize, tileSize, cellTypeName)) - .toDF("tile").repartition(10) - } - - @Benchmark - def arrayExplode() = { - tiles.select(posexplode(rf_tile_to_array_double($"tile"))).count() + tiles = Array.fill(numTiles)(randomTile(tileSize, tileSize, cellTypeName)) + .map(t => InternalRow(TileUDT.tileSerializer.toInternalRow(t))) + val expr = BoundReference(0, TileType, true) + exploder = new ExplodeTiles(1.0, None, Seq(expr)) } - @Benchmark def tileExplode() = { - tiles.select(rf_explode_tiles($"tile")).count() + for(t <- tiles) + exploder.eval(t) } } diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala 
b/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala index 2a70be585..74589c0ae 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala @@ -24,8 +24,8 @@ package org.locationtech.rasterframes.expressions.generators import geotrellis.raster._ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback -import org.apache.spark.sql.catalyst.expressions.{Expression, Generator, GenericInternalRow} +import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, CodegenFallback, UnsafeRowWriter} +import org.apache.spark.sql.catalyst.expressions.{Expression, Generator, GenericInternalRow, UnsafeRow} import org.apache.spark.sql.types._ import org.locationtech.rasterframes._ import org.locationtech.rasterframes.expressions.DynamicExtractors @@ -87,14 +87,17 @@ case class ExplodeTiles( cfor(0)(_ < rows, _ + 1) { row => cfor(0)(_ < cols, _ + 1) { col => val rowIndex = row * cols + col - val outCols = Array.ofDim[Any](numOutCols) - outCols(0) = col - outCols(1) = row + val outRow = new UnsafeRow(numOutCols) + val buffer = new BufferHolder(outRow) + val writer = new UnsafeRowWriter(buffer, numOutCols) + writer.write(0, col) + writer.write(1, row) cfor(0)(_ < tiles.length, _ + 1) { index => val tile = tiles(index) - outCols(index + 2) = if(tile == null) doubleNODATA else tile.getDouble(col, row) + val cell: Double = if (tile == null) doubleNODATA else tile.getDouble(col, row) + writer.write(index + 2, cell) } - retval(rowIndex) = new GenericInternalRow(outCols) + retval(rowIndex) = outRow } } if(sampleFraction > 0.0 && sampleFraction < 1.0) sample(retval) diff --git a/project/plugins.sbt b/project/plugins.sbt index 502ede0e3..a6d8dcdc5 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -8,7 +8,7 @@ addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.2") addSbtPlugin("com.typesafe.sbt" % "sbt-site" % "1.3.2") addSbtPlugin("com.lightbend.paradox" % "sbt-paradox" % "0.5.5") addSbtPlugin("io.github.jonas" % "sbt-paradox-material-theme" % "0.6.0") -addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.3.6") +addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.3.3") addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.1") addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.1") addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.1") diff --git a/pyrasterframes/src/main/python/scene_30_27_model.ipynb b/pyrasterframes/src/main/python/scene_30_27_model.ipynb deleted file mode 100644 index 2281c61c6..000000000 --- a/pyrasterframes/src/main/python/scene_30_27_model.ipynb +++ /dev/null @@ -1,655 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install requests tqdm geopandas rasterio" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# local dev env cruft\n", - "import sys\n", - "sys.path.insert(0, '/Users/sfitch/Coding/earthai/src/main/python/')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "pycharm": {} - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "

SparkSession - in-memory

\n", - " \n", - "
\n", - "

SparkContext

\n", - "\n", - "

Spark UI

\n", - "\n", - "
\n", - "
Version
\n", - "
v2.3.4
\n", - "
Master
\n", - "
local[*]
\n", - "
AppName
\n", - "
pyspark-shell
\n", - "
\n", - "
\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from earthai import *\n", - "from pyrasterframes.rasterfunctions import *\n", - "import geomesa_pyspark.types\n", - "from earthai import earth_ondemand\n", - "\n", - "import pyrasterframes\n", - "# spark = pyrasterframes.get_spark_session()\n", - "from pyspark.sql.functions import lit, rand, when, col, array\n", - "from pyspark.sql import SparkSession\n", - "from pyrasterframes import utils\n", - "\n", - "spark = SparkSession.builder \\\n", - " .master('local[*]') \\\n", - " .config('spark.driver.memory', '12g') \\\n", - " .config('spark.jars', pyrasterframes.utils.find_pyrasterframes_assembly()) \\\n", - " .config('spark.serializer',\t'org.apache.spark.serializer.KryoSerializer') \\\n", - " .config('spark.kryoserializer.buffer.max', '2047m') \\\n", - " .getOrCreate() \n", - "spark.withRasterFrames()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# LandSat Crop Classification Model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Pull Landsat8 from EOD" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 3/3 [00:00<00:00, 16.33it/s]\n" - ] - } - ], - "source": [ - "eod = earth_ondemand.read_catalog(\n", - " geo=[-97.1, 47.4, -97.08, 47.5],\n", - " max_cloud_cover=10,\n", - " collections='landsat8_l1tp',\n", - " start_datetime='2018-07-01T00:00:00',\n", - " end_datetime='2018-08-31T23:59:59'\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "scene_df = eod[eod.eod_grid_id == \"WRS2-030027\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1\n" - ] - }, - { - "data": { - "text/plain": [ - "'https://landsat-pds.s3.us-west-2.amazonaws.com/c1/L8/030/027/LC08_L1TP_030027_20180717_20180730_01_T1/LC08_L1TP_030027_20180717_20180730_01_T1_B4.TIF'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(len(scene_df))\n", - "teh_scene = scene_df.iloc[0].B4\n", - "teh_scene" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Munge crop target" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```bash\n", - "\n", - "\n", - "aws s3 cp s3://s22s-sanda/sar-crop/target/scene_30_27_target.tif /tmp\n", - "\n", - "gdalinfo /vsicurl/https://landsat-pds.s3.us-west-2.amazonaws.com/c1/L8/030/027/LC08_L1TP_030027_20180717_20180730_01_T1/LC08_L1TP_030027_20180717_20180730_01_T1_B4.TIF\n", - "\n", - "gdalwarp -t_srs \"+proj=utm +zone=14 +datum=WGS84 +units=m +no_defs \" \\\n", - " -te 528885.000 5138685.000 760815.000 5373915.000 \\\n", - " -te_srs \"+proj=utm +zone=14 +datum=WGS84 +units=m +no_defs \" \\\n", - " -tr 30.0 30.0 \\\n", - " -co TILED=YES -co COPY_SRC_OVERVIEWS=YES -co COMPRESS=DEFLATE \\\n", - " scene_30_27_target.tif scene_30_27_target_utm.tif\n", - " \n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create and Read Raster Catalogue" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - 
"/anaconda3/envs/jupyter-env/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " \"\"\"Entry point for launching an IPython kernel.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
eod_collection_display_nameeod_collection_familyeod_collection_family_display_nameeod_grid_idcreateddatetimeeo_cloud_covereo_constellationeo_epsgeo_gsd...B2BQAB4B1B8B11collectiongeometryidtarget
0Landsat 8landsat8Landsat 8WRS2-0300272019-08-19T20:54:33.413548Z2018-07-17T17:15:57.1536740Z1.49landsat-83261430.0...https://landsat-pds.s3.us-west-2.amazonaws.com...https://landsat-pds.s3.us-west-2.amazonaws.com...https://landsat-pds.s3.us-west-2.amazonaws.com...https://landsat-pds.s3.us-west-2.amazonaws.com...https://landsat-pds.s3.us-west-2.amazonaws.com...https://landsat-pds.s3.us-west-2.amazonaws.com...landsat8_l1tp(POLYGON ((-98.62404379679178 46.4012557977134...LC08_L1TP_030027_20180717_20180730_01_T1_L1TPfile:///tmp/scene_30_27_target_utm.tif
\n", - "

1 rows × 33 columns

\n", - "
" - ], - "text/plain": [ - " eod_collection_display_name eod_collection_family \\\n", - "0 Landsat 8 landsat8 \n", - "\n", - " eod_collection_family_display_name eod_grid_id \\\n", - "0 Landsat 8 WRS2-030027 \n", - "\n", - " created datetime eo_cloud_cover \\\n", - "0 2019-08-19T20:54:33.413548Z 2018-07-17T17:15:57.1536740Z 1.49 \n", - "\n", - " eo_constellation eo_epsg eo_gsd ... \\\n", - "0 landsat-8 32614 30.0 ... \n", - "\n", - " B2 \\\n", - "0 https://landsat-pds.s3.us-west-2.amazonaws.com... \n", - "\n", - " BQA \\\n", - "0 https://landsat-pds.s3.us-west-2.amazonaws.com... \n", - "\n", - " B4 \\\n", - "0 https://landsat-pds.s3.us-west-2.amazonaws.com... \n", - "\n", - " B1 \\\n", - "0 https://landsat-pds.s3.us-west-2.amazonaws.com... \n", - "\n", - " B8 \\\n", - "0 https://landsat-pds.s3.us-west-2.amazonaws.com... \n", - "\n", - " B11 collection \\\n", - "0 https://landsat-pds.s3.us-west-2.amazonaws.com... landsat8_l1tp \n", - "\n", - " geometry \\\n", - "0 (POLYGON ((-98.62404379679178 46.4012557977134... \n", - "\n", - " id \\\n", - "0 LC08_L1TP_030027_20180717_20180730_01_T1_L1TP \n", - "\n", - " target \n", - "0 file:///tmp/scene_30_27_target_utm.tif \n", - "\n", - "[1 rows x 33 columns]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "scene_df['target'] = 'file:///tmp/scene_30_27_target_utm.tif'\n", - "scene_df" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "features_rf = spark.read.raster( \n", - " catalog=scene_df,\n", - " catalog_col_names=['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'BQA', 'target'],\n", - " tile_dimensions=(256, 256)\n", - ").repartition(200)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Feature creation" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "features_rf = features_rf.withColumn('ndvi', rf_normalized_difference(features_rf.B5, features_rf.B4)) \\\n", - " .withColumn('ndwi1', rf_normalized_difference(features_rf.B5, features_rf.B6)) \\\n", - " .withColumn('ndwi2', rf_normalized_difference(features_rf.B5, features_rf.B7)) \\\n", - " .withColumn('ndwi3', rf_normalized_difference(features_rf.B3, features_rf.B5)) \\\n", - " .withColumn('evi', rf_local_multiply(rf_local_divide(rf_local_subtract(features_rf.B5, features_rf.B4), rf_local_add(rf_local_subtract(rf_local_add(features_rf.B5, rf_local_multiply(features_rf.B4, lit(6.0))), rf_local_multiply(features_rf.B2, lit(7.5))), lit(1.0))), lit(2.5))) \\\n", - " .withColumn('savi', rf_local_multiply(rf_local_divide(rf_local_subtract(features_rf.B5, features_rf.B4), rf_local_add(rf_local_add(features_rf.B5, features_rf.B4), lit(0.5))), lit(1.5))) \\\n", - " .withColumn('osavi', rf_local_divide(rf_local_subtract(features_rf.B5, features_rf.B4), rf_local_add(rf_local_add(features_rf.B5, features_rf.B4), lit(0.16)))) \\\n", - " .withColumn('satvi', rf_local_subtract(rf_local_multiply(rf_local_divide(rf_local_subtract(features_rf.B6, features_rf.B4),rf_local_add(rf_local_add(features_rf.B6, features_rf.B4), lit(0.5))), lit(1.5)), rf_local_divide(features_rf.B7, lit(2.0)))) \\\n", - " .withColumn('mean_swir', rf_local_divide(rf_local_add(features_rf.B6, features_rf.B7), lit(2.0))) \\\n", - " .withColumn('vli', rf_local_divide(rf_local_add(rf_local_add(rf_local_add(features_rf.B1, features_rf.B2), features_rf.B3), features_rf.B4), lit(4.0))) \\\n", - " .withColumn('dbsi', 
rf_local_subtract(rf_normalized_difference(features_rf.B6, features_rf.B3), rf_normalized_difference(features_rf.B5, features_rf.B4)))\n", - "\n", - "features_rf = features_rf.select(\n", - " features_rf.target,\n", - " rf_crs(features_rf.B1).alias('crs'),\n", - " rf_extent(features_rf.B1).alias('extent'),\n", - " rf_tile(features_rf.B1).alias('coastal'),\n", - " rf_tile(features_rf.B2).alias('blue'),\n", - " rf_tile(features_rf.B3).alias('green'),\n", - " rf_tile(features_rf.B4).alias('red'),\n", - " rf_tile(features_rf.B5).alias('nir'),\n", - " rf_tile(features_rf.B6).alias('swir1'),\n", - " rf_tile(features_rf.B7).alias('swir2'),\n", - " rf_tile(features_rf.ndvi).alias('ndvi'),\n", - " rf_tile(features_rf.ndwi1).alias('ndwi1'),\n", - " rf_tile(features_rf.ndwi2).alias('ndwi2'),\n", - " rf_tile(features_rf.ndwi3).alias('ndwi3'),\n", - " rf_tile(features_rf.evi).alias('evi'),\n", - " rf_tile(features_rf.savi).alias('savi'),\n", - " rf_tile(features_rf.osavi).alias('osavi'),\n", - " rf_tile(features_rf.satvi).alias('satvi'),\n", - " rf_tile(features_rf.mean_swir).alias('mean_swir'),\n", - " rf_tile(features_rf.vli).alias('vli'),\n", - " rf_tile(features_rf.dbsi).alias('dbsi'),\n", - " rf_tile(features_rf.BQA).alias('qa')\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "root\n", - " |-- target: struct (nullable = true)\n", - " | |-- tile_context: struct (nullable = false)\n", - " | | |-- extent: struct (nullable = false)\n", - " | | | |-- xmin: double (nullable = false)\n", - " | | | |-- ymin: double (nullable = false)\n", - " | | | |-- xmax: double (nullable = false)\n", - " | | | |-- ymax: double (nullable = false)\n", - " | | |-- crs: struct (nullable = false)\n", - " | | | |-- crsProj4: string (nullable = false)\n", - " | |-- tile: tile (nullable = false)\n", - " |-- crs: struct (nullable = true)\n", - " | |-- crsProj4: string (nullable = false)\n", - " |-- extent: struct (nullable = true)\n", - " | |-- xmin: double (nullable = false)\n", - " | |-- ymin: double (nullable = false)\n", - " | |-- xmax: double (nullable = false)\n", - " | |-- ymax: double (nullable = false)\n", - " |-- coastal: tile (nullable = true)\n", - " |-- blue: tile (nullable = true)\n", - " |-- green: tile (nullable = true)\n", - " |-- red: tile (nullable = true)\n", - " |-- nir: tile (nullable = true)\n", - " |-- swir1: tile (nullable = true)\n", - " |-- swir2: tile (nullable = true)\n", - " |-- ndvi: tile (nullable = true)\n", - " |-- ndwi1: tile (nullable = true)\n", - " |-- ndwi2: tile (nullable = true)\n", - " |-- ndwi3: tile (nullable = true)\n", - " |-- evi: tile (nullable = true)\n", - " |-- savi: tile (nullable = true)\n", - " |-- osavi: tile (nullable = true)\n", - " |-- satvi: tile (nullable = true)\n", - " |-- mean_swir: tile (nullable = true)\n", - " |-- vli: tile (nullable = true)\n", - " |-- dbsi: tile (nullable = true)\n", - " |-- mask: tile (nullable = true)\n", - "\n" - ] - } - ], - "source": [ - "# Values of qa band indicating cloudy conditions\n", - "cloud = [2800, 2804, 2808, 2812, 6896, 6900, 6904, 6908]\n", - "\n", - "mask_part = features_rf \\\n", - " .withColumn('cloud1', rf_local_equal('qa', lit(2800))) \\\n", - " .withColumn('cloud2', rf_local_equal('qa', lit(2804))) \\\n", - " .withColumn('cloud3', rf_local_equal('qa', lit(2808))) \\\n", - " .withColumn('cloud4', rf_local_equal('qa', lit(2812))) \\\n", - " .withColumn('cloud5', rf_local_equal('qa', lit(6896))) \\\n", - 
" .withColumn('cloud6', rf_local_equal('qa', lit(6900))) \\\n", - " .withColumn('cloud7', rf_local_equal('qa', lit(6904))) \\\n", - " .withColumn('cloud8', rf_local_equal('qa', lit(6908))) \n", - "\n", - "df_mask_inv = mask_part \\\n", - " .withColumn('mask', rf_local_add('cloud1', 'cloud2')) \\\n", - " .withColumn('mask', rf_local_add('mask', 'cloud3')) \\\n", - " .withColumn('mask', rf_local_add('mask', 'cloud4')) \\\n", - " .withColumn('mask', rf_local_add('mask', 'cloud5')) \\\n", - " .withColumn('mask', rf_local_add('mask', 'cloud6')) \\\n", - " .withColumn('mask', rf_local_add('mask', 'cloud7')) \\\n", - " .withColumn('mask', rf_local_add('mask', 'cloud8')) \\\n", - " .drop('cloud1', 'cloud2', 'cloud3', 'cloud4', 'cloud5', 'cloud6', 'cloud7', 'cloud8', 'qa')\n", - " \n", - "# at this point the mask contains 0 for good cells and 1 for defect, etc\n", - "# convert cell type and set value 1 to NoData\n", - "# also set the value of 100 to nodata in the target. #darkarts\n", - "mask_rf = df_mask_inv.withColumn('mask', rf_with_no_data(rf_convert_cell_type('mask', 'uint8'), 1.0)) \\\n", - " .withColumn('target', rf_with_no_data('target', 100))\n", - "\n", - "mask_rf.printSchema()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Train/test split" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "rf = mask_rf.withColumn('train_set', when(rand(seed=1234) > 0.3, 1).otherwise(0))\n", - "train_df = rf.filter(rf.train_set == 1)\n", - "test_df = rf.filter(rf.train_set == 0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create ML Pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "# exploded_df = train_df.select(rf_explode_tiles(array('coastal','blue','green','red','nir','swir1','swir2','ndvi','ndwi1','ndwi2','ndwi3','evi','savi','osavi','satvi','mean_swir','vli','dbsi','target', 'mask')))\n", - "# exploded_df.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "from pyrasterframes import TileExploder\n", - "from pyrasterframes.rf_types import NoDataFilter\n", - "\n", - "from pyspark.ml.feature import VectorAssembler\n", - "from pyspark.ml.classification import DecisionTreeClassifier\n", - "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", - "from pyspark.ml import Pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "exploder = TileExploder()\n", - "\n", - "noDataFilter = NoDataFilter() \\\n", - " .setInputCols(['target', 'mask'])\n", - "\n", - "assembler = VectorAssembler() \\\n", - " .setInputCols(['coastal','blue','green','red','nir','swir1','swir2','ndvi','ndwi1','ndwi2','ndwi3','evi','savi','osavi','satvi','mean_swir','vli','dbsi']) \\\n", - " .setOutputCol(\"features\")\n", - "\n", - "classifier = DecisionTreeClassifier() \\\n", - " .setLabelCol('target') \\\n", - " .setMaxDepth(10) \\\n", - " .setFeaturesCol(assembler.getOutputCol())\n", - "\n", - "pipeline = Pipeline() \\\n", - " .setStages([exploder, noDataFilter, assembler, classifier])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Train the model" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "model = pipeline.fit(train_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [ - "#model.transform(train_df).show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "prediction_df = model.transform(test_df) \\\n", - " .drop(assembler.getOutputCol()).cache()\n", - "prediction_df.printSchema()\n", - "\n", - "eval = MulticlassClassificationEvaluator(\n", - " predictionCol=classifier.getPredictionCol(),\n", - " labelCol=classifier.getLabelCol(),\n", - " metricName='fMeasureByThreshold'\n", - ")\n", - "\n", - "f1score = eval.evaluate(prediction_df)\n", - "print(\"\\nF1 Score:\", f1score)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cnf_mtrx = prediction_df.groupBy(classifier.getPredictionCol()) \\\n", - " .pivot(classifier.getLabelCol()) \\\n", - " .count() \\\n", - " .sort(classifier.getPredictionCol())\n", - "cnf_mtrx" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From dba27d1cdeadb7ae4f5260f7ade16209ef2e8e65 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Fri, 13 Sep 2019 09:18:44 -0400 Subject: [PATCH 34/48] Updated release notes. --- docs/src/main/paradox/release-notes.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/main/paradox/release-notes.md b/docs/src/main/paradox/release-notes.md index 9f500b5eb..d9136d5ef 100644 --- a/docs/src/main/paradox/release-notes.md +++ b/docs/src/main/paradox/release-notes.md @@ -4,6 +4,7 @@ ### 0.8.2 +* Fixed SparkML memory pressure issue caused by unnecessary reevaluation, overallocation, and primitive boxing. ([#343](https://github.com/locationtech/rasterframes/issues/343)) * Fixed Parquet serialization issue with `RasterRef`s ([#338](https://github.com/locationtech/rasterframes/issues/338)) * Fixed `TileExploder`, `rf_agg_local_mean` and `TileColumnSupport` to support `proj_raster` struct ([#287](https://github.com/locationtech/rasterframes/issues/287), [#163](https://github.com/locationtech/rasterframes/issues/163), [#333](https://github.com/locationtech/rasterframes/issues/333)). * Various documentation improvements. From 4de6839bf5156a9de66c3ccb6d6bf4c31a20a39b Mon Sep 17 00:00:00 2001 From: "Simeon H.K. 
Fitch" Date: Fri, 13 Sep 2019 10:19:07 -0400 Subject: [PATCH 35/48] =?UTF-8?q?Propagate=20errors=20encountered=20in=20R?= =?UTF-8?q?asterSourceToRasterRefs.=20Closes=20#267.=E2=80=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../generators/RasterSourceToRasterRefs.scala | 9 ++++----- .../rasterframes/ref/RasterRefSpec.scala | 13 +++++++++++++ docs/src/main/paradox/release-notes.md | 3 +++ 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/RasterSourceToRasterRefs.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/RasterSourceToRasterRefs.scala index 3170dfbd3..d4e7663b2 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/RasterSourceToRasterRefs.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/RasterSourceToRasterRefs.scala @@ -21,7 +21,6 @@ package org.locationtech.rasterframes.expressions.generators -import com.typesafe.scalalogging.LazyLogging import geotrellis.raster.GridBounds import geotrellis.vector.Extent import org.apache.spark.sql.catalyst.InternalRow @@ -45,7 +44,7 @@ import scala.util.control.NonFatal * @since 9/6/18 */ case class RasterSourceToRasterRefs(children: Seq[Expression], bandIndexes: Seq[Int], subtileDims: Option[TileDimensions] = None) extends Expression - with Generator with CodegenFallback with ExpectsInputTypes with LazyLogging { + with Generator with CodegenFallback with ExpectsInputTypes { override def inputTypes: Seq[DataType] = Seq.fill(children.size)(RasterSourceType) override def nodeName: String = "rf_raster_source_to_raster_ref" @@ -77,9 +76,9 @@ case class RasterSourceToRasterRefs(children: Seq[Expression], bandIndexes: Seq[ } catch { case NonFatal(ex) ⇒ - val payload = Try(children.map(c => RasterSourceType.deserialize(c.eval(input)))).toOption.toSeq.flatten - logger.error("Error fetching data for one of: " + payload.mkString(", "), ex) - Traversable.empty + val description = Try(children.map(c => RasterSourceType.deserialize(c.eval(input)))) + .toOption.toSeq.flatten.mkString(", ") + throw new java.lang.IllegalArgumentException(description, ex) } } } diff --git a/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala index 23f00268e..9f14e7ded 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala @@ -21,8 +21,11 @@ package org.locationtech.rasterframes.ref +import java.net.URI + import geotrellis.raster.{ByteConstantNoDataCellType, Tile} import geotrellis.vector.Extent +import org.apache.spark.SparkException import org.apache.spark.sql.Encoders import org.locationtech.rasterframes.{TestEnvironment, _} import org.locationtech.rasterframes.expressions.accessors._ @@ -205,6 +208,16 @@ class RasterRefSpec extends TestEnvironment with TestData { r.rows should be <= NOMINAL_TILE_SIZE } } + it("should throw exception on invalid URI") { + val src = RasterSource(URI.create("http://foo/bar")) + import spark.implicits._ + val df = Seq(src).toDF("src") + val refs = df.select(RasterSourceToRasterRefs($"src") as "proj_raster") + logger.warn(Console.REVERSED + "Upcoming 'java.lang.IllegalArgumentException' expected in logs." 
+ Console.RESET) + assertThrows[SparkException] { + refs.first() + } + } } describe("RealizeTile") { diff --git a/docs/src/main/paradox/release-notes.md b/docs/src/main/paradox/release-notes.md index d9136d5ef..48dc92a93 100644 --- a/docs/src/main/paradox/release-notes.md +++ b/docs/src/main/paradox/release-notes.md @@ -4,10 +4,13 @@ ### 0.8.2 +* Fixed issue with `RasterSourceDataSource` swallowing exceptions. ([#267](https://github.com/locationtech/rasterframes/issues/267)) * Fixed SparkML memory pressure issue caused by unnecessary reevaluation, overallocation, and primitive boxing. ([#343](https://github.com/locationtech/rasterframes/issues/343)) * Fixed Parquet serialization issue with `RasterRef`s ([#338](https://github.com/locationtech/rasterframes/issues/338)) * Fixed `TileExploder`, `rf_agg_local_mean` and `TileColumnSupport` to support `proj_raster` struct ([#287](https://github.com/locationtech/rasterframes/issues/287), [#163](https://github.com/locationtech/rasterframes/issues/163), [#333](https://github.com/locationtech/rasterframes/issues/333)). * Various documentation improvements. +* _Breaking_ (potentially): Synchronized parameter naming in Python and Scala for `spark.read.raster` ([#329](https://github.com/locationtech/rasterframes/pull/329)). + ### 0.8.1 From 6f6e8356a50e0914b8e923cbb5e181954ef87abf Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Fri, 13 Sep 2019 12:31:32 -0400 Subject: [PATCH 36/48] Make python RasterSourceTest.test_list_of_list_of_str clearer, more stable Signed-off-by: Jason T. Brown --- .../src/main/python/tests/RasterSourceTest.py | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pyrasterframes/src/main/python/tests/RasterSourceTest.py b/pyrasterframes/src/main/python/tests/RasterSourceTest.py index 5f7967a49..c4c8e64f7 100644 --- a/pyrasterframes/src/main/python/tests/RasterSourceTest.py +++ b/pyrasterframes/src/main/python/tests/RasterSourceTest.py @@ -124,11 +124,26 @@ def test_list_of_list_of_str(self): self.assertTrue(len(df.columns) == 4) # 2 cols of uris plus 2 cols of proj_rasters self.assertEqual(sorted(df.columns), sorted(['proj_raster_0_path', 'proj_raster_1_path', 'proj_raster_0', 'proj_raster_1'])) - uri_df = df.select('proj_raster_0_path', 'proj_raster_1_path').distinct().collect() - uri_list = [list(r.asDict().values()) for r in uri_df] - self.assertTrue(lol[0] in uri_list) - self.assertTrue(lol[1] in uri_list) - self.assertTrue(lol[2] in uri_list) + uri_df = df.select('proj_raster_0_path', 'proj_raster_1_path').distinct() + + # check that various uri's are in the dataframe + self.assertEqual( + uri_df.filter(col('proj_raster_0_path') == lit(self.path(1, 1))).count(), + 1) + + self.assertEqual( + uri_df \ + .filter(col('proj_raster_0_path') == lit(self.path(1, 1))) \ + .filter(col('proj_raster_1_path') == lit(self.path(1, 2))) \ + .count(), + 1) + + self.assertEqual( + uri_df \ + .filter(col('proj_raster_0_path') == lit(self.path(3, 1))) \ + .filter(col('proj_raster_1_path') == lit(self.path(3, 2))) \ + .count(), + 1) def test_schemeless_string(self): import os.path From 19b95732dd2f13fb42027ba32369b5ef7ffb82e1 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Fri, 13 Sep 2019 13:53:05 -0400 Subject: [PATCH 37/48] Incorporated PR feedback. 
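As a sketch of the user-visible effect, condensed from the `RasterRefSpec` test added in PATCH 35 (an active `SparkSession` named `spark` is assumed): a read against an unreachable URI now fails the query with a descriptive cause, rather than being logged and silently dropped.

```scala
import java.net.URI
import org.locationtech.rasterframes.expressions.generators.RasterSourceToRasterRefs
import org.locationtech.rasterframes.ref.RasterSource
import spark.implicits._

// An unreachable source, as in the spec
val df = Seq(RasterSource(URI.create("http://foo/bar"))).toDF("src")
val refs = df.select(RasterSourceToRasterRefs($"src") as "proj_raster")

// Now throws SparkException, caused by:
//   java.lang.IllegalArgumentException: Error fetching data for one of: ...
refs.first()
```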
---
 .../expressions/generators/RasterSourceToRasterRefs.scala | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/RasterSourceToRasterRefs.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/RasterSourceToRasterRefs.scala
index d4e7663b2..d90d790b5 100644
--- a/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/RasterSourceToRasterRefs.scala
+++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/RasterSourceToRasterRefs.scala
@@ -76,8 +76,9 @@ case class RasterSourceToRasterRefs(children: Seq[Expression], bandIndexes: Seq[
     } catch {
       case NonFatal(ex) ⇒
-        val description = Try(children.map(c => RasterSourceType.deserialize(c.eval(input))))
-          .toOption.toSeq.flatten.mkString(", ")
+        val description = "Error fetching data for one of: " +
+          Try(children.map(c => RasterSourceType.deserialize(c.eval(input))))
+            .toOption.toSeq.flatten.mkString(", ")
         throw new java.lang.IllegalArgumentException(description, ex)
     }
   }

From 1db4176f9482f5596104db9daaf1c45b6d300e59 Mon Sep 17 00:00:00 2001
From: "Simeon H.K. Fitch"
Date: Fri, 13 Sep 2019 15:43:30 -0400
Subject: [PATCH 38/48] Added rasterio and descartes.

---
 rf-notebook/build.sbt                  |  2 +-
 rf-notebook/src/main/docker/Dockerfile | 13 +++++--------
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/rf-notebook/build.sbt b/rf-notebook/build.sbt
index cac2f88ce..b7e7b6213 100644
--- a/rf-notebook/build.sbt
+++ b/rf-notebook/build.sbt
@@ -4,7 +4,7 @@ import PythonBuildPlugin.autoImport.pyWhl
 lazy val includeNotebooks = settingKey[Boolean]("Whether to build documentation into notebooks and include them")
 includeNotebooks := true

-Docker / packageName := "rasterframes-notebook"
+Docker / packageName := "s22s/rasterframes-notebook"

 Docker / version := version.value

diff --git a/rf-notebook/src/main/docker/Dockerfile b/rf-notebook/src/main/docker/Dockerfile
index ad430a9f7..d023db869 100644
--- a/rf-notebook/src/main/docker/Dockerfile
+++ b/rf-notebook/src/main/docker/Dockerfile
@@ -13,21 +13,18 @@ EXPOSE 4040 4041 4042 4043 4044

 # Sphinx (for Notebook->html)
 RUN conda install --quiet --yes \
-    anaconda sphinx nbsphinx shapely numpy folium geopandas geojsonio
+    anaconda sphinx nbsphinx shapely numpy folium geopandas geojsonio rasterio descartes

 # Cleanup pip residuals
 RUN rm -rf /home/$NB_USER/.local && \
-    fix-permissions /home/$NB_USER
-
-# Note: The above step takes an insanely long time in the CONDA_DIR, so commenting it out until we have perm issues.
-# fix-permissions $CONDA_DIR
+    fix-permissions /home/$NB_USER && \
+    fix-permissions $CONDA_DIR

 COPY *.whl $RF_LIB_LOC
-RUN ls -1 $RF_LIB_LOC/*.whl | xargs pip install

 COPY jupyter_notebook_config.py $HOME/.jupyter
 COPY examples $HOME/examples
-RUN chmod -R +w $HOME/examples
-RUN chown -R $NB_UID:$NB_GID $HOME
+RUN ls -1 $RF_LIB_LOC/*.whl | xargs pip install
+RUN chmod -R +w $HOME/examples && chown -R $NB_UID:$NB_GID $HOME

 USER $NB_UID
\ No newline at end of file

From 4b3237cd72adbcf48faaf0f0e2f76e3dc1210ff9 Mon Sep 17 00:00:00 2001
From: "Simeon H.K.
Fitch" Date: Mon, 16 Sep 2019 15:06:20 -0400 Subject: [PATCH 39/48] Bumped Spark version to 2.3.4 --- rf-notebook/src/main/docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rf-notebook/src/main/docker/Dockerfile b/rf-notebook/src/main/docker/Dockerfile index d023db869..6c7e514dd 100644 --- a/rf-notebook/src/main/docker/Dockerfile +++ b/rf-notebook/src/main/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM s22s/pyspark-notebook:spark-2.3.3-hadoop-2.7 +FROM s22s/pyspark-notebook:spark-2.3.4-hadoop-2.7 MAINTAINER Astraea, Inc. From 3ffc0ca7acd243103e844f3fa2bcbb8c0fd29e99 Mon Sep 17 00:00:00 2001 From: Jason T Brown Date: Mon, 16 Sep 2019 15:25:51 -0400 Subject: [PATCH 40/48] Zonal stats page and other refactoring (#342) * zonal map algebra doc page * proper rendering of folium.Map in docs * Simplify time series doc by ref to zonal map algebra * Remove local path in docs, favoring https urls to improve portability in rf-notebook Signed-off-by: Jason T. Brown --- .../src/main/python/docs/__init__.py | 5 +- .../src/main/python/docs/aggregation.pymd | 2 +- .../src/main/python/docs/raster-processing.md | 3 +- .../main/python/docs/supervised-learning.pymd | 8 +- .../src/main/python/docs/time-series.pymd | 103 ++++---------- .../python/docs/unsupervised-learning.pymd | 22 +-- .../src/main/python/docs/vector-data.pymd | 38 ++++-- .../src/main/python/docs/zonal-algebra.pymd | 126 ++++++++++++++++++ .../main/python/pyrasterframes/rf_ipython.py | 21 +++ 9 files changed, 216 insertions(+), 112 deletions(-) create mode 100644 pyrasterframes/src/main/python/docs/zonal-algebra.pymd diff --git a/pyrasterframes/src/main/python/docs/__init__.py b/pyrasterframes/src/main/python/docs/__init__.py index 428a68d65..1778c5590 100644 --- a/pyrasterframes/src/main/python/docs/__init__.py +++ b/pyrasterframes/src/main/python/docs/__init__.py @@ -19,6 +19,7 @@ # import os + from pweave import PwebPandocFormatter @@ -36,10 +37,6 @@ def resource_dir(): return test_resource -def resource_dir_uri(): - return 'file://' + resource_dir() - - class PegdownMarkdownFormatter(PwebPandocFormatter): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/pyrasterframes/src/main/python/docs/aggregation.pymd b/pyrasterframes/src/main/python/docs/aggregation.pymd index 554d487a9..2243e5b37 100644 --- a/pyrasterframes/src/main/python/docs/aggregation.pymd +++ b/pyrasterframes/src/main/python/docs/aggregation.pymd @@ -14,7 +14,7 @@ np.set_printoptions(precision=3, floatmode='maxprec') spark = create_rf_spark_session() ``` -There are three types of aggregate functions: _tile_ aggregate, DataFrame aggregate, and element-wise local aggregate. In the @ref:[tile aggregate functions](reference.md#tile-statistics), we are computing a statistical summary per row of a _tile_ column in a DataFrame. In the @ref:[DataFrame aggregate functions](reference.md#aggregate-tile-statistics), we are computing statistical summaries over all of the cell values *and* across all of the rows in the DataFrame or group. In the @ref:[element-wise local aggregate functions](reference.md#tile-local-aggregate-statistics), we are computing the element-wise statistical summary across a DataFrame or group of _tiles_. +There are three types of aggregate functions available in RasterFrames: _tile_ aggregate, DataFrame aggregate, and element-wise local aggregate. In the @ref:[tile aggregate functions](reference.md#tile-statistics), we are computing a statistical summary per row of a _tile_ column in a DataFrame. 
In the @ref:[DataFrame aggregate functions](reference.md#aggregate-tile-statistics), we are computing statistical summaries over all of the cell values *and* across all of the rows in the DataFrame or group. In the @ref:[element-wise local aggregate functions](reference.md#tile-local-aggregate-statistics), we are computing the element-wise statistical summary across a DataFrame or group of _tiles_. In the latter two cases, when @ref:[vector data](vector-data.md) is the grouping column, the results are @ref:[zonal statistics](zonal-algebra.md). ## Tile Mean Example diff --git a/pyrasterframes/src/main/python/docs/raster-processing.md b/pyrasterframes/src/main/python/docs/raster-processing.md index a16b2bb7d..fc6353e37 100644 --- a/pyrasterframes/src/main/python/docs/raster-processing.md +++ b/pyrasterframes/src/main/python/docs/raster-processing.md @@ -2,8 +2,9 @@ @@@ index -* @ref:[Local Algebra](local-algebra.md) +* @ref:[Local Map Algebra](local-algebra.md) * @ref:["NoData" Handling](nodata-handling.md) +* @ref:[Zonal Map Algebra](zonal-algebra.md) * @ref:[Aggregation](aggregation.md) * @ref:[Time Series](time-series.md) * @ref:[Machine Learning](machine-learning.md) diff --git a/pyrasterframes/src/main/python/docs/supervised-learning.pymd b/pyrasterframes/src/main/python/docs/supervised-learning.pymd index 9f2cd968f..c66697032 100644 --- a/pyrasterframes/src/main/python/docs/supervised-learning.pymd +++ b/pyrasterframes/src/main/python/docs/supervised-learning.pymd @@ -11,7 +11,6 @@ from pyspark.sql.functions import lit import pandas as pd import numpy as np import matplotlib.pyplot as plt -from docs import resource_dir_uri import os @@ -70,8 +69,10 @@ crses = df.select('crs.crsProj4').distinct().collect() print('Found ', len(crses), 'distinct CRS.') crs = crses[0][0] -label_df = spark.read.geojson( - os.path.join(resource_dir_uri(), 'luray-labels.geojson')) \ +from pyspark import SparkFiles +spark.sparkContext.addFile('https://github.com/locationtech/rasterframes/raw/develop/pyrasterframes/src/test/resources/luray-labels.geojson') + +label_df = spark.read.geojson(SparkFiles.get('luray-labels.geojson')) \ .select('id', st_reproject('geometry', lit('EPSG:4326'), lit(crs)).alias('geometry')) \ .hint('broadcast') @@ -81,7 +82,6 @@ df_joined = df.join(label_df, st_intersects(st_geometry('extent'), 'geometry')) df_labeled = df_joined.withColumn('label', rf_rasterize('geometry', st_geometry('extent'), 'id', 'dims.cols', 'dims.rows') ) - ``` ## Masking Poor Quality Cells diff --git a/pyrasterframes/src/main/python/docs/time-series.pymd b/pyrasterframes/src/main/python/docs/time-series.pymd index 0e0cbed00..8417cc9f0 100644 --- a/pyrasterframes/src/main/python/docs/time-series.pymd +++ b/pyrasterframes/src/main/python/docs/time-series.pymd @@ -9,8 +9,6 @@ import pyrasterframes from pyrasterframes.rasterfunctions import * import pyrasterframes.rf_ipython -import folium - from pyspark.sql.functions import udf, lit from geomesa_pyspark.types import MultiPolygonUDT @@ -18,51 +16,21 @@ from geomesa_pyspark.types import MultiPolygonUDT spark = pyrasterframes.get_spark_session("local[4]") ``` -In this example, we will show how the flexibility of the DataFrame concept for raster data allows a simple and intuitive way to extract a time series from Earth observation data. We will start with our @ref:[built-in MODIS data catalog](raster-catalogs.md#using-built-in-experimental-catalogs). 
- -```python catalog -cat = spark.read.format('aws-pds-modis-catalog').load().repartition(200) -cat.printSchema() -``` - -We will summarize the change in NDVI over 2018 in the Cuyahoga Valley National Park in Ohio, USA. First, we will retrieve open vector data delineating the park boundary from the US National Park Service's LandsNet. +In this example, we will show how the flexibility of the DataFrame concept for raster data allows a simple and intuitive way to extract a time series from Earth observation data. We will continue our example from the @ref:[Zonal Map Algebra page](zonal-algebra.md). -## Vector Data +We will summarize the change in @ref:[NDVI](local-algebra.md#computing-ndvi) over the spring and early summer of 2018 in the Cuyahoga Valley National Park in Ohio, USA. -First we will get the vector data from LandsNet service by a REST query. The data is saved to a geojson file. - -```python get_park_boundary +```python vector, echo=False, results='hidden' +cat = spark.read.format('aws-pds-modis-catalog').load().repartition(200) import requests nps_filepath = '/tmp/parks.geojson' nps_data_query_url = 'https://services1.arcgis.com/fBc8EJBxQRMcHlei/arcgis/rest/services/' \ 'NPS_Park_Boundaries/FeatureServer/0/query' \ - '?geometry=-82.451,41.075,-80.682,41.436&inSR=4326&outSR=4326&f=geojson' + '?geometry=-82.451,41.075,-80.682,41.436&inSR=4326&outSR=4326&outFields=*&f=geojson' r = requests.get(nps_data_query_url) with open(nps_filepath,'wb') as f: f.write(r.content) -``` -```python, folium_map, -m = folium.Map((41.25,-81.6), zoom_start=10).add_child(folium.GeoJson(nps_filepath)) -``` - -```python, folium_persist, echo=False -# this is the work around for ability to render the folium map in the docs build -import base64 -temp_folium = 'docs/static/__cuya__.html' -m.save(temp_folium) -with open(temp_folium, 'rb') as f: - b64 = base64.b64encode(f.read()) -with open('docs/static/cuya.md', 'w') as md: - md.write(''.format(b64.decode('utf-8'))) - # seems that the height is not correct? -``` - -@@include[folium_static](static/cuya.md) - -Now we read the park boundary vector data as a Spark DataFrame using the built-in @ref:[geojson DataSource](vector-data.md#geojson-datasource). The geometry is very detailed, and the EO cells are relatively coarse. To speed up the processing, the geometry is "simplified" by combining vertices within about 100 meters of each other. For more on this see the section on Shapely support in @ref:[user defined functions](vector-data.md#shapely-geometry-support). - -```python read_cuya_vector park_vector = spark.read.geojson(nps_filepath) @udf(MultiPolygonUDT()) @@ -72,11 +40,13 @@ def simplify(g, tol): park_vector = park_vector.withColumn('geo_simp', simplify('geometry', lit(0.001))) \ .select('geo_simp') \ .hint('broadcast') + + ``` ## Catalog Read -The entire park boundary is contained in MODIS granule h11 v04. We will simply filter on this granule, rather than using a @ref:[spatial relation](vector-data.md#geomesa-functions-and-spatial-relations). The time period selected should show the change in plant vigor as leaves emerge over the spring and into early summer. +As in our other example, we will query for a single known MODIS granule directly. We limit the vector data to the single park of interest. The longer time period selected should show the change in plant vigor as leaves emerge over the spring and into early summer. The definitions of `cat` and `park_vector` are as in the @ref:[Zonal Map Algebra page](zonal-algebra.md). 
```python query_catalog park_cat = cat \ @@ -85,65 +55,40 @@ park_cat = cat \ (cat.acquisition_date > lit('2018-02-19')) & (cat.acquisition_date < lit('2018-07-01')) ) \ - .crossJoin(park_vector) + .crossJoin(park_vector.filter('OBJECTID == 380')) # only Cuyahoga -park_cat.printSchema() ``` -Now we have a catalog with several months of MODIS data for a single granule. However, the granule is larger than our park boundary. We will combine the park geometry with the catalog, and read only the bands of interest to compute NDVI, which we discussed in a @ref:[previous section](local-algebra.md#computing-ndvi). +## Vector and Raster Data Interaction -We then [reproject](https://gis.stackexchange.com/questions/247770/understanding-reprojection) the park geometry to the same @ref:[CRS](concepts.md#coordinate-reference-system--crs-) as the imagery. Then we will filter to only the _tiles_ intersecting the park. +We follow the same steps as the Zonal Map Algebra analysis: reprojecting the park geometry, filtering for intersection, rasterizing the geometry, and masking the NDVI by the _zone_ _tiles_. The code from that analysis is condensed here for reference. -```python read_catalog +```python raster_prep raster_cols = ['B01', 'B02',] # red and near-infrared respectively -park_rf = spark.read.raster( + +rf_park_tile = spark.read.raster( park_cat.select(['acquisition_date', 'granule_id', 'geo_simp'] + raster_cols), catalog_col_names=raster_cols) \ .withColumn('park_native', st_reproject('geo_simp', lit('EPSG:4326'), rf_crs('B01'))) \ - .filter(st_intersects('park_native', rf_geometry('B01'))) - -park_rf.printSchema() -``` - -```python persist_catalog, echo=False -# park_rf.persist() -``` - -## Vector and Raster Data Interaction - -Now we have the vector representation of the park boundary alongside the _tiles_ of red and near infrared bands. Next, we need to create a _tile_ representation of the park to allow us to limit the time series analysis to pixels within the park. This is similar to the masking operation demonstrated in @ref:[NoData handling](nodata-handling.md#masking). -We do this using two transformations. The first one will reproject the park boundary from coordinates to the MODIS sinusoidal projection. The second one will create a new _tile_ aligned with the imagery containing a value of 1 where the pixels are contained within the park and NoData elsewhere. -```python burn_in -rf_park_tile = park_rf \ .filter(st_intersects('park_native', rf_geometry('B01'))) \ .withColumn('dims', rf_dimensions('B01')) \ .withColumn('park_tile', rf_rasterize('park_native', rf_geometry('B01'), lit(1), 'dims.cols', 'dims.rows')) \ - .persist() - -rf_park_tile.printSchema() + .withColumn('ndvi', rf_normalized_difference('B02', 'B01')) \ + .withColumn('ndvi_masked', rf_mask('ndvi', 'park_tile')) ``` ## Create Time Series -Next, we will compute NDVI as the normalized difference of near infrared (band 2) and red (band 1). The _tiles_ are masked by the `park_tile`. We will then aggregate across the remaining values to arrive at an average NDVI for each week of the year. Note that the computation is creating a weighted average, which is weighted by the number of valid observations per week. +We next aggregate across the cell values to arrive at an average NDVI for each week of the year. We use `pyspark`'s built-in `groupby` and time functions with a RasterFrames @ref:[aggregate function](aggregation.md) to do this.
Note that the computation is creating a weighted average, which is weighted by the number of valid observations per week. ```python ndvi_time_series from pyspark.sql.functions import col, year, weekofyear, month -from pyspark.sql.functions import sum as sql_sum - -rf_ndvi = rf_park_tile \ - .withColumn('ndvi', rf_normalized_difference('B02', 'B01')) \ - .withColumn('ndvi_masked', rf_mask('ndvi', 'park_tile')) -time_series = rf_ndvi \ - .withColumn('ndvi_wt', rf_tile_sum('ndvi_masked')) \ - .withColumn('wt', rf_data_cells('ndvi_masked')) \ - .groupby(year('acquisition_date').alias('year'), weekofyear('acquisition_date').alias('week')) \ - .agg(sql_sum('ndvi_wt').alias('ndvi_wt_wk'), sql_sum('wt').alias('wt_wk')) \ - .withColumn('ndvi', col('ndvi_wt_wk') / col('wt_wk')) - -time_series.printSchema() +time_series = rf_park_tile \ + .groupby( + year('acquisition_date').alias('year'), + weekofyear('acquisition_date').alias('week')) \ + .agg(rf_agg_mean('ndvi_masked').alias('ndvi')) ``` Finally, we will take a look at the NDVI over time. @@ -152,7 +97,7 @@ Finally, we will take a look at the NDVI over time. import matplotlib.pyplot as plt time_series_pdf = time_series.toPandas() -time_series_pdf = time_series_pdf.sort_values('week') +time_series_pdf.sort_values('week', inplace=True) plt.plot(time_series_pdf['week'], time_series_pdf['ndvi'], 'go-') plt.xlabel('Week of year, 2018') plt.ylabel('NDVI') diff --git a/pyrasterframes/src/main/python/docs/unsupervised-learning.pymd b/pyrasterframes/src/main/python/docs/unsupervised-learning.pymd index f2158d807..494eb9bea 100644 --- a/pyrasterframes/src/main/python/docs/unsupervised-learning.pymd +++ b/pyrasterframes/src/main/python/docs/unsupervised-learning.pymd @@ -6,7 +6,6 @@ In this example, we will demonstrate how to fit and score an unsupervised learni ```python, setup, echo=False from IPython.core.display import display -from docs import resource_dir_uri import pyrasterframes.rf_ipython from pyrasterframes.utils import create_rf_spark_session @@ -29,26 +28,19 @@ from pyspark.ml import Pipeline ``` -The first step is to create a Spark DataFrame of our imagery data. To achieve that we will create a catalog DataFrame using the pattern from [the I/O page](raster-io.html#Single-Scene--Multiple-Bands). In the catalog, each row represents a distinct area and time, and each column is the URI to a band's image product. The function `resource_dir_uri` gives a local file system path to the sample Landsat data. The resulting Spark DataFrame may have many rows per URI, with a column corresponding to each band. +The first step is to create a Spark DataFrame of our imagery data. To achieve that we will create a catalog DataFrame using the pattern from [the I/O page](raster-io.html#Single-Scene--Multiple-Bands). In the catalog, each row represents a distinct area and time, and each column is the URI to a band's image product. The resulting Spark DataFrame may have many rows per URI, with a column corresponding to each band. 
```python, catalog -filenamePattern = "L8-B{}-Elkton-VA.tiff" +filenamePattern = "https://github.com/locationtech/rasterframes/" \ + "raw/develop/core/src/test/resources/L8-B{}-Elkton-VA.tiff" catalog_df = pd.DataFrame([ - {'b' + str(b): os.path.join(resource_dir_uri(), filenamePattern.format(b)) for b in range(1, 8)} + {'b' + str(b): filenamePattern.format(b) for b in range(1, 8)} ]) + df = spark.read.raster(catalog_df, catalog_col_names=catalog_df.columns) -df = df.select( - rf_crs(df.b1).alias('crs'), - rf_extent(df.b1).alias('extent'), - rf_tile(df.b1).alias('b1'), - rf_tile(df.b2).alias('b2'), - rf_tile(df.b3).alias('b3'), - rf_tile(df.b4).alias('b4'), - rf_tile(df.b5).alias('b5'), - rf_tile(df.b6).alias('b6'), - rf_tile(df.b7).alias('b7'), -) +df = df.withColumn('crs', rf_crs(df.b1)) \ + .withColumn('extent', rf_extent(df.b1)) df.printSchema() ``` diff --git a/pyrasterframes/src/main/python/docs/vector-data.pymd b/pyrasterframes/src/main/python/docs/vector-data.pymd index 1762565ad..1934f096f 100644 --- a/pyrasterframes/src/main/python/docs/vector-data.pymd +++ b/pyrasterframes/src/main/python/docs/vector-data.pymd @@ -1,19 +1,28 @@ # Vector Data -RasterFrames provides a variety of ways to work with spatial vector data (points, lines, and polygons) alongside raster data. There is a convenience DataSource for the GeoJSON format, as well as the ability to convert from [GeoPandas][GeoPandas] to Spark. Representation of vector geometries in PySpark is through [Shapely][Shapely], providing a great deal of interoperability. RasterFrames also provides access to Spark functions for working with geometries. +RasterFrames provides a variety of ways to work with spatial vector data (points, lines, and polygons) alongside raster data. -## GeoJSON DataSource + * DataSource for GeoJSON format + * Ability to convert between [GeoPandas][GeoPandas] and Spark DataFrames + * In PySpark, geometries are [Shapely][Shapely] objects, providing a great deal of interoperability (see the sketch below) + * Many Spark functions for working with columns of geometries + * Vector data is also the basis for @ref:[zonal map algebra](zonal-algebra.md) operations.
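Because geometries cross the Python boundary as Shapely objects, they can be created, collected, and manipulated without any conversion code. A minimal sketch under the assumptions noted in the comments (the place names and coordinates are illustrative only):

```python
# Minimal sketch: geometry columns round-trip as Shapely objects.
# Assumes a RasterFrames-enabled SparkSession and that geomesa_pyspark.types
# has been imported (it registers Shapely <-> Spark type conversions),
# as in the setup chunk below. The rows here are hypothetical data.
from shapely.geometry import Point
from pyspark.sql import Row

cities = spark.createDataFrame([
    Row(name='Paducah', geom=Point(-88.628, 37.072)),
    Row(name='Luray', geom=Point(-78.459, 38.665)),
])

pt = cities.first()['geom']    # returned as a shapely.geometry.Point
print(pt.buffer(0.25).bounds)  # so ordinary Shapely methods apply directly
```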
```python, setup, echo=False import pyrasterframes import pyrasterframes.rf_ipython -from pyrasterframes.utils import create_rf_spark_session -spark = create_rf_spark_session() +import geomesa_pyspark.types +import geopandas +import folium +spark = pyrasterframes.get_spark_session('local[2]') ``` +## GeoJSON DataSource + ```python, read_geojson from pyspark import SparkFiles -spark.sparkContext.addFile('https://raw.githubusercontent.com/datasets/geo-admin1-us/master/data/admin1-us.geojson') +admin1_us_url = 'https://raw.githubusercontent.com/datasets/geo-admin1-us/master/data/admin1-us.geojson' +spark.sparkContext.addFile(admin1_us_url) # this lets us read http scheme URIs in spark df = spark.read.geojson(SparkFiles.get('admin1-us.geojson')) df.printSchema() @@ -36,7 +45,7 @@ def poly_or_mp_to_mp(g): else: return MultiPolygon([g]) -gdf = geopandas.read_file('https://raw.githubusercontent.com/datasets/geo-admin1-us/master/data/admin1-us.geojson') +gdf = geopandas.read_file(admin1_us_url) gdf.geometry = gdf.geometry.apply(poly_or_mp_to_mp) df2 = spark.createDataFrame(gdf) df2.printSchema() @@ -92,8 +101,21 @@ l8 = l8.withColumn('paducah', st_point(lit(-88.628), lit(37.072))) # col of poi l8_filtered = l8 \ .filter(st_intersects(l8.geom, st_bufferPoint(l8.paducah, lit(50000.0)))) \ .filter(l8.acquisition_date > '2018-02-01') \ - .filter(l8.acquisition_date < '2018-04-01') -l8_filtered.select('product_id', 'entity_id', 'acquisition_date', 'cloud_cover_pct') + .filter(l8.acquisition_date < '2018-03-11') +``` + +```python, folium, echo=False +geo_df = geopandas.GeoDataFrame( + l8_filtered.select('geom', 'bounds_wgs84').toPandas(), + crs='EPSG:4326', + geometry='geom') + +# display as folium / leaflet map +m = folium.Map() +layer = folium.GeoJson(geo_df.to_json()) +m.fit_bounds(layer.get_bounds()) +m.add_child(layer) +m ``` [GeoPandas]: http://geopandas.org diff --git a/pyrasterframes/src/main/python/docs/zonal-algebra.pymd b/pyrasterframes/src/main/python/docs/zonal-algebra.pymd new file mode 100644 index 000000000..3951a1c5b --- /dev/null +++ b/pyrasterframes/src/main/python/docs/zonal-algebra.pymd @@ -0,0 +1,126 @@ +# Zonal Map Algebra + +```python setup, echo=False +from IPython.display import display + +import pyrasterframes +from pyrasterframes.rasterfunctions import * +import pyrasterframes.rf_ipython + +from pyspark.sql.functions import udf, lit +from geomesa_pyspark.types import MultiPolygonUDT + +# This seemed to work for time-series! +spark = pyrasterframes.get_spark_session("local[4]") +``` + +## Definition + +Zonal map algebra refers to operations over raster cells based on the definition of a _zone_. In concept, a _zone_ is like a mask: a raster with a special value designating membership of the cell in the zone. In general, we assume that _zones_ are defined by @ref:[vector geometries](vector-data.md). + +## Analysis Plan + +We will compute average @ref:[NDVI](local-algebra.md#computing-ndvi) over the month of May 2018 for two US national parks: Cuyahoga Valley and Indiana Dunes. We will select data from the @ref:[built-in catalog](raster-catalogs.md#using-built-in-experimental-catalogs), join it with park geometries, read _tiles_ for the bands needed, burn-in or rasterize the geometries to _tiles_, and compute the aggregate. + +## Get Vector Data + +First we download vector data from the US National Park Service open data portal, and take a look at the data.
+ +```python get_park_boundary +import requests +import folium + +nps_filepath = '/tmp/2parks.geojson' +nps_data_query_url = 'https://services1.arcgis.com/fBc8EJBxQRMcHlei/arcgis/rest/services/' \ + 'NPS_Park_Boundaries/FeatureServer/0/query' \ + '?geometry=-87.601,40.923,-81.206,41.912&inSR=4326&outSR=4326' \ + "&where=UNIT_TYPE='National Park'&outFields=*&f=geojson" +r = requests.get(nps_data_query_url) +with open(nps_filepath,'wb') as f: + f.write(r.content) + +m = folium.Map() +layer = folium.GeoJson(nps_filepath) +m.fit_bounds(layer.get_bounds()) +m.add_child(layer) +m +``` + + +Now we read the park boundary vector data as a Spark DataFrame using the built-in @ref:[geojson DataSource](vector-data.md#geojson-datasource). The geometry is very detailed, and the EO cells are relatively coarse. To speed up the processing, we "simplify" the geometry by combining vertices within about 500 meters of each other. For more on this see the section on Shapely support in @ref:[user defined functions](vector-data.md#shapely-geometry-support). + +```python read_park_vector +park_vector = spark.read.geojson(nps_filepath) + +@udf(MultiPolygonUDT()) +def simplify(g, tol): + return g.simplify(tol) + +park_vector = park_vector.withColumn('geo_simp', simplify('geometry', lit(0.005))) \ + .select('geo_simp', 'OBJECTID', 'UNIT_NAME') \ + .hint('broadcast') +``` + +## Catalog Read + +Both parks are entirely contained in MODIS granule h11 v04. We will simply filter on this granule, rather than using a @ref:[spatial relation](vector-data.md#geomesa-functions-and-spatial-relations). + +```python query_catalog +cat = spark.read.format('aws-pds-modis-catalog').load().repartition(50) +park_cat = cat \ + .filter( + (cat.granule_id == 'h11v04') & + (cat.acquisition_date >= lit('2018-05-01')) & + (cat.acquisition_date < lit('2018-06-01')) + ) \ + .crossJoin(park_vector) + +park_cat.printSchema() +``` + +We will combine the park geometry with the catalog, and read only the bands of interest to compute NDVI, which we discussed in a @ref:[previous section](local-algebra.md#computing-ndvi). + +Now we have a DataFrame with a month of MODIS data for a single granule. However, the granule covers a great deal of area outside our park boundary _zones_. To deal with this, we will first [reproject](https://gis.stackexchange.com/questions/247770/understanding-reprojection) the park geometry to the same @ref:[CRS](concepts.md#coordinate-reference-system--crs-) as the imagery. Then we will filter to only the _tiles_ intersecting the park _zones_. + +```python read_catalog +raster_cols = ['B01', 'B02',] # red and near-infrared respectively +park_rf = spark.read.raster( + park_cat.select(['acquisition_date', 'granule_id'] + raster_cols + park_vector.columns), + catalog_col_names=raster_cols) \ + .withColumn('park_native', st_reproject('geo_simp', lit('EPSG:4326'), rf_crs('B01'))) \ + .filter(st_intersects('park_native', rf_geometry('B01'))) + +park_rf.printSchema() +``` + +## Define Zone Tiles + +Now we have the vector representation of the park boundary alongside the _tiles_ of red and near infrared bands. Next, we need to create a _tile_ representation of the park to allow us to limit the raster analysis to pixels within the park _zone_. This is similar to the masking operation demonstrated in @ref:[NoData handling](nodata-handling.md#masking).
We rasterize the geometries using @ref:[`rf_rasterize`](reference.md#rf-rasterize): this creates a new _tile_ column aligned with the imagery, containing the park's OBJECTID attribute for cells intersecting the _zone_. Cells outside the park _zones_ have a NoData value. + +```python burn_in +rf_park_tile = park_rf \ + .withColumn('dims', rf_dimensions('B01')) \ + .withColumn('park_zone_tile', rf_rasterize('park_native', rf_geometry('B01'), 'OBJECTID', 'dims.cols', 'dims.rows')) \ + .persist() + +rf_park_tile.printSchema() +``` + +## Compute Zonal Statistics + +We compute NDVI as the normalized difference of near infrared (band 2) and red (band 1). The _tiles_ are masked by the `park_zone_tile`, limiting the cells to those in the _zone_. To finish, we compute our desired statistics over the NDVI _tiles_ that are limited by the _zone_. + +```python ndvi_zonal, evaluate=True +from pyspark.sql.functions import col +from pyspark.sql import functions as F + +rf_ndvi = rf_park_tile \ + .withColumn('ndvi', rf_normalized_difference('B02', 'B01')) \ + .withColumn('ndvi_masked', rf_mask('ndvi', 'park_zone_tile')) + +zonal_mean = rf_ndvi \ + .groupby('OBJECTID', 'UNIT_NAME') \ + .agg(rf_agg_mean('ndvi_masked')) + +zonal_mean +``` diff --git a/pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py b/pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py index e65008973..0066e7dd7 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py +++ b/pyrasterframes/src/main/python/pyrasterframes/rf_ipython.py @@ -163,6 +163,19 @@ def spark_df_to_html(df, num_rows=5, truncate=False): return RFContext.active().call("_dfToHTML", df._jdf, num_rows, truncate) +def _folium_map_formatter(map): + """ inputs a folium.Map object and returns html of rendered map """ + + import base64 + html_source = map.get_root().render() + b64_source = base64.b64encode( + bytes(html_source.encode('utf-8')) + ).decode('utf-8') + + source_blob = '<iframe src="data:text/html;base64,{}" width="100%" height="500px" style="border:none;"></iframe>' + return source_blob.format(b64_source) + + try: from IPython import get_ipython from IPython.display import display_png, display_markdown, display @@ -178,9 +191,17 @@ def spark_df_to_html(df, num_rows=5, truncate=False): html_formatter.for_type(pandas.DataFrame, pandas_df_to_html) html_formatter.for_type(pyspark.sql.DataFrame, spark_df_to_html) + # these will likely only affect the docs build markdown_formatter = ip.display_formatter.formatters['text/markdown'] markdown_formatter.for_type(pyspark.sql.DataFrame, spark_df_to_markdown) + try: + # avoid a hard install dependency on folium, but support it if present in the environment + import folium + markdown_formatter.for_type(folium.Map, _folium_map_formatter) + except ImportError as e: + pass + png_formatter = ip.display_formatter.formatters['image/png'] png_formatter.for_type(Tile, tile_to_png) From 0c82bcdadb6180ef6ee186ef747252c42a235fd4 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Wed, 18 Sep 2019 09:27:09 -0400 Subject: [PATCH 41/48] Doc fixes missing @ref: prefixes.
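The `for_type` registrations in `rf_ipython` above follow IPython's generic display-formatter protocol, which works for any type, not just Spark or folium objects. A minimal, self-contained sketch of the same mechanism, using a hypothetical `Extent` class:

```python
# Minimal sketch of the IPython display-formatter protocol used by
# rf_ipython above. The Extent class here is hypothetical.
from IPython import get_ipython

class Extent(object):
    def __init__(self, xmin, ymin, xmax, ymax):
        self.bounds = (xmin, ymin, xmax, ymax)

def extent_to_html(extent):
    return '<code>Extent{}</code>'.format(extent.bounds)

ip = get_ipython()
if ip is not None:  # only register when running under IPython/Jupyter
    html_formatter = ip.display_formatter.formatters['text/html']
    html_formatter.for_type(Extent, extent_to_html)
    # From here on, displaying an Extent in a notebook cell renders the HTML.
```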
--- .../test/scala/org/locationtech/rasterframes/TestData.scala | 5 ----- pyrasterframes/src/main/python/docs/concepts.md | 2 +- pyrasterframes/src/main/python/docs/getting-started.pymd | 4 ++-- pyrasterframes/src/main/python/docs/reference.pymd | 2 +- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/core/src/test/scala/org/locationtech/rasterframes/TestData.scala b/core/src/test/scala/org/locationtech/rasterframes/TestData.scala index 00862d629..1b1fd4022 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/TestData.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/TestData.scala @@ -311,11 +311,6 @@ object TestData extends TestData { t.map((c, r, v) ⇒ if(filter(c, r)) raster.NODATA else v) } -// t match { -// case TileContext(ext, crs) => ProjectedRasterTile(injected, ext, crs) -// case _ => injected -// } - injected } } diff --git a/pyrasterframes/src/main/python/docs/concepts.md b/pyrasterframes/src/main/python/docs/concepts.md index dda457055..b1292299d 100644 --- a/pyrasterframes/src/main/python/docs/concepts.md +++ b/pyrasterframes/src/main/python/docs/concepts.md @@ -36,7 +36,7 @@ The most frequently encountered cell types in RasterFrames are below. | Float | `float32` | 32-bit floating-point | -3.4028235E38 to 3.4028235E38 | | Double | `float64` | 64-bit floating-point | -1.7976931348623157E308 to 1.7976931348623157E308 | -See the section on [“NoData” Handling](nodata-handling.md) for additional discussion on cell types and more exhaustive coverage of available cell types. +See the section on @ref:[“NoData” Handling](nodata-handling.md) for additional discussion on cell types and more exhaustive coverage of available cell types. ## NoData diff --git a/pyrasterframes/src/main/python/docs/getting-started.pymd b/pyrasterframes/src/main/python/docs/getting-started.pymd index 27ec587c6..91b1a597d 100644 --- a/pyrasterframes/src/main/python/docs/getting-started.pymd +++ b/pyrasterframes/src/main/python/docs/getting-started.pymd @@ -2,7 +2,7 @@ @@@ note -If you are new to Earth-observing imagery, you might consider looking at the [Concepts](concepts.md) section first. +If you are new to Earth-observing imagery, you might consider looking at the @ref:[Concepts](concepts.md) section first. @@@ @@ -27,7 +27,7 @@ Then in a python interpreter of your choice, you can get a [`pyspark` `SparkSess from pyrasterframes import rf_ipython ``` -```python, version +```python, version import pyrasterframes spark = pyrasterframes.get_spark_session() ``` diff --git a/pyrasterframes/src/main/python/docs/reference.pymd b/pyrasterframes/src/main/python/docs/reference.pymd index 22ac339f3..30a724d37 100644 --- a/pyrasterframes/src/main/python/docs/reference.pymd +++ b/pyrasterframes/src/main/python/docs/reference.pymd @@ -210,7 +210,7 @@ Where the `mask` contains NoData, replace values in the `tile` with NoData. Returned `tile` cell type will be coerced to one supporting NoData if it does not already. -See also @[`rf_rasterize`](reference.md#rf-rasterize). +See also @ref:[`rf_rasterize`](reference.md#rf-rasterize). ### rf_inverse_mask From 50c69a9e14d134d40fee80a8cd7918efcd5c2523 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Thu, 19 Sep 2019 16:10:00 -0400 Subject: [PATCH 42/48] Tweaks to RasterSource-related code for easier extendability. 
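The reader mixin introduced in the patch below (`withBandIndexes`, `withTileDimensions`, `withLazyTiles`, `fromCatalog`) consolidates the Scala-side options; the Python API exposes the same knobs as keyword arguments to `spark.read.raster`. A hedged sketch of the equivalent Python catalog read, with hypothetical band URIs:

```python
# Sketch of the Python counterpart to the Scala reader options consolidated
# below. The catalog URIs are hypothetical placeholders.
import pandas as pd

catalog = pd.DataFrame([
    {'red': 'https://example.com/scene1_B04.tif',
     'nir': 'https://example.com/scene1_B08.tif'},
])

df = spark.read.raster(
    catalog,
    catalog_col_names=['red', 'nir'],
    band_indexes=[0],            # zero-based; the default reads only band 0
    tile_dimensions=(256, 256),  # mirrors withTileDimensions(256, 256)
    lazy_tiles=True)             # mirrors withLazyTiles(true)
```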
--- .../expressions/generators/ExplodeTiles.scala | 2 +- .../extensions/ReprojectToLayer.scala | 8 ++- .../geotiff/GeoTiffDataSource.scala | 2 +- .../raster/RasterSourceDataSource.scala | 66 ++++++++++++++++++- .../datasource/raster/package.scala | 57 +--------------- pyrasterframes/src/main/python/docs/index.md | 2 +- 6 files changed, 75 insertions(+), 62 deletions(-) diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala index 74589c0ae..06c0c033e 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala @@ -25,7 +25,7 @@ import geotrellis.raster._ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, CodegenFallback, UnsafeRowWriter} -import org.apache.spark.sql.catalyst.expressions.{Expression, Generator, GenericInternalRow, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.{Expression, Generator, UnsafeRow} import org.apache.spark.sql.types._ import org.locationtech.rasterframes._ import org.locationtech.rasterframes.expressions.DynamicExtractors diff --git a/core/src/main/scala/org/locationtech/rasterframes/extensions/ReprojectToLayer.scala b/core/src/main/scala/org/locationtech/rasterframes/extensions/ReprojectToLayer.scala index c396deaee..d5e6f5e31 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/extensions/ReprojectToLayer.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/extensions/ReprojectToLayer.scala @@ -27,21 +27,23 @@ import org.apache.spark.sql.functions.broadcast import org.locationtech.rasterframes._ import org.locationtech.rasterframes.util._ object ReprojectToLayer { - def apply(df: DataFrame, tlm: TileLayerMetadata[SpatialKey]): RasterFrameLayer = { // create a destination dataframe with crs and extent columns // use RasterJoin to do the rest.
val gb = tlm.gridBounds val crs = tlm.crs + import df.sparkSession.implicits._ + implicit val enc = Encoders.tuple(spatialKeyEncoder, extentEncoder, crsEncoder) + val gridItems = for { (col, row) <- gb.coordsIter sk = SpatialKey(col, row) e = tlm.mapTransform(sk) } yield (sk, e, crs) - val dest = df.sparkSession.createDataFrame(gridItems.toSeq) - .toDF(SPATIAL_KEY_COLUMN.columnName, EXTENT_COLUMN.columnName, CRS_COLUMN.columnName) + val dest = gridItems.toSeq.toDF(SPATIAL_KEY_COLUMN.columnName, EXTENT_COLUMN.columnName, CRS_COLUMN.columnName) + dest.show(false) val joined = RasterJoin(broadcast(dest), df) joined.asLayer(SPATIAL_KEY_COLUMN, tlm) diff --git a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala index 99923f526..112d3fd22 100644 --- a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala +++ b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala @@ -90,7 +90,7 @@ class GeoTiffDataSource } val tags = Tags( - RFBuildInfo.toMap.filter(_._1.toLowerCase().contains("version")).mapValues(_.toString), + RFBuildInfo.toMap.filter(_._1.toLowerCase() == "version").mapValues(_.toString), tileCols.map(c => Map("RF_COL" -> c.columnName)).toList ) diff --git a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala index 5aec1a065..061e9fb56 100644 --- a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala +++ b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala @@ -21,11 +21,16 @@ package org.locationtech.rasterframes.datasource.raster +import java.net.URI +import java.util.UUID + import org.locationtech.rasterframes._ import org.locationtech.rasterframes.util._ -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.{DataFrame, DataFrameReader, SQLContext} import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, RelationProvider} import org.locationtech.rasterframes.model.TileDimensions +import shapeless.tag +import shapeless.tag.@@ class RasterSourceDataSource extends DataSourceRegister with RelationProvider { import RasterSourceDataSource._ @@ -58,6 +63,8 @@ object RasterSourceDataSource { } /** Container for specifying raster paths. */ case class RasterSourceCatalog(csv: String, bandColumnNames: String*) extends WithBandColumns { + protected def tmpTableName() = UUID.randomUUID().toString.replace("-", "") + def registerAsTable(sqlContext: SQLContext): RasterSourceCatalogRef = { import sqlContext.implicits._ val lines = csv @@ -95,7 +102,6 @@ object RasterSourceDataSource { /** Container for specifying where to select raster paths from. */ case class RasterSourceCatalogRef(tableName: String, bandColumnNames: String*) extends WithBandColumns - private[raster] implicit class ParamsDictAccessors(val parameters: Map[String, String]) extends AnyVal { def tokenize(csv: String): Seq[String] = csv.split(',').map(_.trim) @@ -151,4 +157,60 @@ object RasterSourceDataSource { } } } + + /** Mixin for adding extension methods on DataFrameReader for RasterSourceDataSource-like readers. 
*/ + trait CatalogReaderOptionsSupport[ReaderTag] { + type TaggedReader = DataFrameReader @@ ReaderTag + val reader: TaggedReader + + protected def tmpTableName() = UUID.randomUUID().toString.replace("-", "") + + /** Set the zero-based band indexes to read. Defaults to Seq(0). */ + def withBandIndexes(bandIndexes: Int*): TaggedReader = + tag[ReaderTag][DataFrameReader]( + reader.option(RasterSourceDataSource.BAND_INDEXES_PARAM, bandIndexes.mkString(",")) + ) + + def withTileDimensions(cols: Int, rows: Int): TaggedReader = + tag[ReaderTag][DataFrameReader]( + reader.option(RasterSourceDataSource.TILE_DIMS_PARAM, s"$cols,$rows") + ) + + /** Indicate if tile reading should be delayed until cells are fetched. Defaults to `true`. */ + def withLazyTiles(state: Boolean): TaggedReader = + tag[ReaderTag][DataFrameReader]( + reader.option(RasterSourceDataSource.LAZY_TILES_PARAM, state)) + + def fromCatalog(catalog: DataFrame, bandColumnNames: String*): TaggedReader = + tag[ReaderTag][DataFrameReader] { + val tmpName = tmpTableName() + catalog.createOrReplaceTempView(tmpName) + reader + .option(RasterSourceDataSource.CATALOG_TABLE_PARAM, tmpName) + .option(RasterSourceDataSource.CATALOG_TABLE_COLS_PARAM, bandColumnNames.mkString(",")): DataFrameReader + } + + def fromCatalog(tableName: String, bandColumnNames: String*): TaggedReader = + tag[ReaderTag][DataFrameReader]( + reader.option(RasterSourceDataSource.CATALOG_TABLE_PARAM, tableName) + .option(RasterSourceDataSource.CATALOG_TABLE_COLS_PARAM, bandColumnNames.mkString(",")) + ) + + def fromCSV(catalogCSV: String, bandColumnNames: String*): TaggedReader = + tag[ReaderTag][DataFrameReader]( + reader.option(RasterSourceDataSource.CATALOG_CSV_PARAM, catalogCSV) + .option(RasterSourceDataSource.CATALOG_TABLE_COLS_PARAM, bandColumnNames.mkString(",")) + ) + + def from(newlineDelimPaths: String): TaggedReader = + tag[ReaderTag][DataFrameReader]( + reader.option(RasterSourceDataSource.PATHS_PARAM, newlineDelimPaths) + ) + + def from(paths: Seq[String]): TaggedReader = + from(paths.mkString("\n")) + + def from(uris: Seq[URI])(implicit d: DummyImplicit): TaggedReader = + from(uris.map(_.toASCIIString)) + } } diff --git a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/package.scala b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/package.scala index d85f435d2..48c0e9642 100644 --- a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/package.scala +++ b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/package.scala @@ -21,16 +21,11 @@ package org.locationtech.rasterframes.datasource -import java.net.URI -import java.util.UUID - -import org.apache.spark.sql.{DataFrame, DataFrameReader} +import org.apache.spark.sql.DataFrameReader import shapeless.tag import shapeless.tag.@@ package object raster { - private[raster] def tmpTableName() = UUID.randomUUID().toString.replace("-", "") - trait RasterSourceDataFrameReaderTag type RasterSourceDataFrameReader = DataFrameReader @@ RasterSourceDataFrameReaderTag @@ -42,52 +37,6 @@ package object raster { } /** Adds option methods relevant to RasterSourceDataSource. */ - implicit class RasterSourceDataFrameReaderHasOptions(val reader: RasterSourceDataFrameReader) { - /** Set the zero-based band indexes to read. Defaults to Seq(0). 
*/ - def withBandIndexes(bandIndexes: Int*): RasterSourceDataFrameReader = - tag[RasterSourceDataFrameReaderTag][DataFrameReader]( - reader.option(RasterSourceDataSource.BAND_INDEXES_PARAM, bandIndexes.mkString(","))) - - def withTileDimensions(cols: Int, rows: Int): RasterSourceDataFrameReader = - tag[RasterSourceDataFrameReaderTag][DataFrameReader]( - reader.option(RasterSourceDataSource.TILE_DIMS_PARAM, s"$cols,$rows") - ) - - /** Indicate if tile reading should be delayed until cells are fetched. Defaults to `true`. */ - def withLazyTiles(state: Boolean): RasterSourceDataFrameReader = - tag[RasterSourceDataFrameReaderTag][DataFrameReader]( - reader.option(RasterSourceDataSource.LAZY_TILES_PARAM, state)) - - def fromCatalog(catalog: DataFrame, bandColumnNames: String*): RasterSourceDataFrameReader = - tag[RasterSourceDataFrameReaderTag][DataFrameReader] { - val tmpName = tmpTableName() - catalog.createOrReplaceTempView(tmpName) - reader - .option(RasterSourceDataSource.CATALOG_TABLE_PARAM, tmpName) - .option(RasterSourceDataSource.CATALOG_TABLE_COLS_PARAM, bandColumnNames.mkString(",")): DataFrameReader - } - - def fromCatalog(tableName: String, bandColumnNames: String*): RasterSourceDataFrameReader = - tag[RasterSourceDataFrameReaderTag][DataFrameReader]( - reader.option(RasterSourceDataSource.CATALOG_TABLE_PARAM, tableName) - .option(RasterSourceDataSource.CATALOG_TABLE_COLS_PARAM, bandColumnNames.mkString(",")) - ) - - def fromCSV(catalogCSV: String, bandColumnNames: String*): RasterSourceDataFrameReader = - tag[RasterSourceDataFrameReaderTag][DataFrameReader]( - reader.option(RasterSourceDataSource.CATALOG_CSV_PARAM, catalogCSV) - .option(RasterSourceDataSource.CATALOG_TABLE_COLS_PARAM, bandColumnNames.mkString(",")) - ) - - def from(newlineDelimPaths: String): RasterSourceDataFrameReader = - tag[RasterSourceDataFrameReaderTag][DataFrameReader]( - reader.option(RasterSourceDataSource.PATHS_PARAM, newlineDelimPaths) - ) - - def from(paths: Seq[String]): RasterSourceDataFrameReader = - from(paths.mkString("\n")) - - def from(uris: Seq[URI])(implicit d: DummyImplicit): RasterSourceDataFrameReader = - from(uris.map(_.toASCIIString)) - } + implicit class RasterSourceDataFrameReaderHasOptions(val reader: RasterSourceDataFrameReader) + extends RasterSourceDataSource.CatalogReaderOptionsSupport[RasterSourceDataFrameReaderTag] } diff --git a/pyrasterframes/src/main/python/docs/index.md b/pyrasterframes/src/main/python/docs/index.md index c6da7fd13..3cc1a54e1 100644 --- a/pyrasterframes/src/main/python/docs/index.md +++ b/pyrasterframes/src/main/python/docs/index.md @@ -31,7 +31,7 @@ RasterFrames is released under the [Apache 2.0 License](https://github.com/locat * [Vector Data](vector-data.md) * [Raster Processing](raster-processing.md) * [Numpy and Pandas](numpy-pandas.md) -* [API Languages](languages.md) +* [Scala and SQL](languages.md) * [Function Reference](reference.md) * [Release Notes](release-notes.md) @@@ From ee91e6d02a028b1a17fa9450c35ceb3c6dbd2f47 Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Fri, 20 Sep 2019 15:01:50 -0400 Subject: [PATCH 43/48] Pass kwargs in python create_rf_spark_session to spark conf Signed-off-by: Jason T. 
Brown --- .../src/main/python/pyrasterframes/__init__.py | 4 ++-- pyrasterframes/src/main/python/pyrasterframes/utils.py | 10 +++++++++- .../src/main/python/tests/PyRasterFramesTests.py | 8 ++++++++ pyrasterframes/src/main/python/tests/__init__.py | 6 +++++- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/pyrasterframes/src/main/python/pyrasterframes/__init__.py b/pyrasterframes/src/main/python/pyrasterframes/__init__.py index 905323d60..82e27ed4a 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/__init__.py +++ b/pyrasterframes/src/main/python/pyrasterframes/__init__.py @@ -55,11 +55,11 @@ def _kryo_init(builder): return builder -def get_spark_session(master="local[*]"): +def get_spark_session(master="local[*]", **kwargs): """ Create a SparkSession with pyrasterframes enabled and configured. """ from pyrasterframes.utils import create_rf_spark_session - return create_rf_spark_session(master) + return create_rf_spark_session(master, **kwargs) def _convert_df(df, sp_key=None, metadata=None): diff --git a/pyrasterframes/src/main/python/pyrasterframes/utils.py b/pyrasterframes/src/main/python/pyrasterframes/utils.py index e1ffd4074..14d55f6ed 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/utils.py +++ b/pyrasterframes/src/main/python/pyrasterframes/utils.py @@ -20,6 +20,7 @@ import glob from pyspark.sql import SparkSession +from pyspark import SparkConf import os import sys from . import RFContext @@ -76,15 +77,22 @@ def find_pyrasterframes_assembly(): return jarpath[0] -def create_rf_spark_session(master="local[*]"): +def create_rf_spark_session(master="local[*]", **kwargs): """ Create a SparkSession with pyrasterframes enabled and configured. """ jar_path = find_pyrasterframes_assembly() + if 'spark.jars' in kwargs.keys(): + if 'pyrasterframes' not in kwargs['spark.jars']: + raise UserWarning("spark.jars config is set, but it seems to be missing the pyrasterframes assembly jar.") + + conf = SparkConf().setAll([(k, kwargs[k]) for k in kwargs]) + spark = (SparkSession.builder .master(master) .appName("RasterFrames") .config('spark.jars', jar_path) .withKryoSerialization() + .config(conf=conf) # user can override the defaults .getOrCreate()) try: diff --git a/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py b/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py index 6092410bb..7cda3b997 100644 --- a/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py +++ b/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py @@ -28,6 +28,14 @@ from . import TestEnvironment +class UtilTest(TestEnvironment): + + def test_spark_confs(self): + from . 
import app_name + self.assertEqual(self.spark.conf.get('spark.app.name'), app_name) + self.assertEqual(self.spark.conf.get('spark.ui.enabled'), 'false') + + class CellTypeHandling(unittest.TestCase): def test_is_raw(self): diff --git a/pyrasterframes/src/main/python/tests/__init__.py b/pyrasterframes/src/main/python/tests/__init__.py index 2fe44a1dd..bea51f58b 100644 --- a/pyrasterframes/src/main/python/tests/__init__.py +++ b/pyrasterframes/src/main/python/tests/__init__.py @@ -31,6 +31,7 @@ else: import __builtin__ as builtins +app_name = 'pyrasterframes test suite' def resource_dir(): def pdir(curr): @@ -46,7 +47,10 @@ def pdir(curr): def spark_test_session(): - spark = create_rf_spark_session() + spark = create_rf_spark_session(**{ + 'spark.ui.enabled': 'false', + 'spark.app.name': app_name + }) spark.sparkContext.setLogLevel('ERROR') print("Spark Version: " + spark.version) From 0b0437249eee93cfb9e82db04a3079fd73997116 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Sat, 21 Sep 2019 11:05:50 -0400 Subject: [PATCH 44/48] Unit test reproducing #360. --- .../geotiff/GeoTiffDataSourceSpec.scala | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala b/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala index eb5e55b0c..ccac0a586 100644 --- a/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala +++ b/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala @@ -28,6 +28,7 @@ import geotrellis.vector.Extent import org.locationtech.rasterframes._ import org.apache.spark.sql.functions._ import org.locationtech.rasterframes.TestEnvironment +import org.locationtech.rasterframes.datasource.raster._ /** * @since 1/14/18 @@ -186,7 +187,7 @@ class GeoTiffDataSourceSpec } it("should write GeoTIFF without layer") { - import org.locationtech.rasterframes.datasource.raster._ + val pr = col("proj_raster_b0") val rf = spark.read.raster.withBandIndexes(0, 1, 2).load(rgbCogSamplePath.toASCIIString) @@ -219,6 +220,23 @@ class GeoTiffDataSourceSpec } } + it("should produce the correct subregion") { + import spark.implicits._ + val rf = SinglebandGeoTiff(TestData.singlebandCogPath.getPath) + .projectedRaster.toLayer(128, 128).withExtent() + + val out = Paths.get("target", "example3-geotiff.tif") + logger.info(s"Writing to $out") + + val bitOfLayer = rf.filter($"spatial_key.col" === 0 && $"spatial_key.row" === 0) + val expectedExtent = bitOfLayer.select($"extent".as[Extent]).first() + bitOfLayer.write.geotiff.save(out.toString) + + val result = SinglebandGeoTiff(out.toString) + result.tile.dimensions should be (128, 128) + result.extent should be (expectedExtent) + } + def s(band: Int): String = s"https://modis-pds.s3.amazonaws.com/MCD43A4.006/11/08/2019059/" + s"MCD43A4.A2019059.h11v08.006.2019072203257_B0${band}.TIF" From 5953f0fb7b18fe7528c5d8c1a2d648987b561882 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Sun, 22 Sep 2019 16:25:36 -0400 Subject: [PATCH 45/48] Fixed handling of aggregate extent and image size on geotiff write. 
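The subregion test added above selects a single 128×128 tile by intersecting a point with each tile's footprint before writing it out. A hedged sketch of the same selection step via the Python API (the URI and the point coordinates, which must be in the raster's CRS, are placeholders):

```python
# Sketch of the tile-selection step from the subregion test above, in Python.
from pyspark.sql.functions import lit
from pyrasterframes.rasterfunctions import *

df = spark.read.raster('https://example.com/scene.tif',  # hypothetical URI
                       tile_dimensions=(128, 128))

one_tile = df.filter(st_intersects(st_point(lit(754245.0), lit(3893385.0)),
                                   rf_geometry(df.proj_raster)))

print(one_tile.select(rf_extent('proj_raster'), rf_crs('proj_raster')).first())
```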
--- .../rasterframes/ref/RasterRefIT.scala | 2 +- .../aggregates/TileRasterizerAggregate.scala | 2 +- .../extensions/RFSpatialColumnMethods.scala | 10 +++ .../geotiff/GeoTiffDataSource.scala | 32 ++++----- .../geotiff/GeoTiffDataSourceSpec.scala | 66 +++++++++++++------ 5 files changed, 70 insertions(+), 42 deletions(-) diff --git a/core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala b/core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala index 713285717..88b5b8617 100644 --- a/core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala +++ b/core/src/it/scala/org/locationtech/rasterframes/ref/RasterRefIT.scala @@ -48,7 +48,7 @@ class RasterRefIT extends TestEnvironment { rf_crs($"red"), rf_extent($"red"), rf_tile($"red"), rf_tile($"green"), rf_tile($"blue")) .toDF - val raster = TileRasterizerAggregate(df, redScene.crs, None, None) + val raster = TileRasterizerAggregate.collect(df, redScene.crs, None, None) forEvery(raster.tile.statisticsDouble) { stats => stats should be ('defined) diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala index f8e102eae..360ef93dd 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala @@ -119,7 +119,7 @@ object TileRasterizerAggregate { new TileRasterizerAggregate(prd)(crsCol, extentCol, tileCol).as(nodeName).as[Raster[Tile]] } - def apply(df: DataFrame, destCRS: CRS, destExtent: Option[Extent], rasterDims: Option[TileDimensions]): ProjectedRaster[MultibandTile] = { + def collect(df: DataFrame, destCRS: CRS, destExtent: Option[Extent], rasterDims: Option[TileDimensions]): ProjectedRaster[MultibandTile] = { val tileCols = WithDataFrameMethods(df).tileColumns require(tileCols.nonEmpty, "need at least one tile column") // Select the anchoring Tile, Extent and CRS columns diff --git a/core/src/main/scala/org/locationtech/rasterframes/extensions/RFSpatialColumnMethods.scala b/core/src/main/scala/org/locationtech/rasterframes/extensions/RFSpatialColumnMethods.scala index 4eade42ad..af79c1c05 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/extensions/RFSpatialColumnMethods.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/extensions/RFSpatialColumnMethods.scala @@ -34,6 +34,7 @@ import org.apache.spark.sql.functions.{asc, udf => sparkUdf} import org.apache.spark.sql.types.{DoubleType, StructField, StructType} import org.locationtech.geomesa.curve.Z2SFC import org.locationtech.rasterframes.StandardColumns +import org.locationtech.rasterframes.encoders.serialized_literal /** * RasterFrameLayer extension methods associated with adding spatially descriptive columns. @@ -71,6 +72,15 @@ trait RFSpatialColumnMethods extends MethodExtensions[RasterFrameLayer] with Sta val key2Extent = sparkUdf(keyCol2Extent) self.withColumn(colName, key2Extent(self.spatialKeyColumn)).certify } + /** + * Append a column containing the CRS of the layer. + * + * @param colName name of column to append. Defaults to "crs" + * @return updated RasterFrameLayer + */ + def withCRS(colName: String = CRS_COLUMN.columnName): RasterFrameLayer = { + self.withColumn(colName, serialized_literal(self.crs)).certify + } /** * Append a column containing the bounds of the row's spatial key. 
diff --git a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala index 99923f526..91bd71077 100644 --- a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala +++ b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala @@ -67,27 +67,21 @@ class GeoTiffDataSource require(tileCols.nonEmpty, "Could not find any tile columns.") - val raster = if (df.isAlreadyLayer) { - val layer = df.certify - val tlm = layer.tileLayerMetadata.merge - - // If no desired image size is given, write at full size. - val TileDimensions(cols, rows) = parameters.rasterDimensions - .getOrElse { - val actualSize = tlm.layout.toRasterExtent().gridBoundsFor(tlm.extent) - TileDimensions(actualSize.width, actualSize.height) - } - // Should we really play traffic cop here? - if (cols.toDouble * rows * 64.0 > Runtime.getRuntime.totalMemory() * 0.5) - logger.warn( - s"You've asked for the construction of a very large image ($cols x $rows), destined for ${path}. Out of memory error likely.") - layer.toMultibandRaster(tileCols, cols.toInt, rows.toInt) - } else { - require(parameters.crs.nonEmpty, "A destination CRS must be provided") - TileRasterizerAggregate(df, parameters.crs.get, None, parameters.rasterDimensions) - } + val destCRS = parameters.crs.orElse(df.asLayerSafely.map(_.crs)).getOrElse( + throw new IllegalArgumentException("A destination CRS must be provided") + ) + + val input = df.asLayerSafely.map(layer => + (layer.crsColumns.isEmpty, layer.extentColumns.isEmpty) match { + case (true, true) => layer.withExtent().withCRS() + case (true, false) => layer.withCRS() + case (false, true) => layer.withExtent() + case _ => layer + }).getOrElse(df) + + val raster = TileRasterizerAggregate.collect(input, destCRS, None, parameters.rasterDimensions) val tags = Tags( RFBuildInfo.toMap.filter(_._1.toLowerCase().contains("version")).mapValues(_.toString), diff --git a/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala b/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala index ccac0a586..817d7d5bf 100644 --- a/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala +++ b/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala @@ -20,9 +20,10 @@ */ package org.locationtech.rasterframes.datasource.geotiff -import java.nio.file.Paths +import java.nio.file.{Path, Paths} import geotrellis.proj4._ +import geotrellis.raster.CellType import geotrellis.raster.io.geotiff.{MultibandGeoTiff, SinglebandGeoTiff} import geotrellis.vector.Extent import org.locationtech.rasterframes._ @@ -90,6 +91,15 @@ class GeoTiffDataSourceSpec describe("GeoTiff writing") { + def checkTiff(file: Path, cols: Int, rows: Int, extent: Extent, cellType: Option[CellType] = None) = { + val outputTif = SinglebandGeoTiff(file.toString) + outputTif.tile.dimensions should be ((cols, rows)) + outputTif.extent should be (extent) + cellType.foreach(ct => + outputTif.cellType should be (ct) + ) + } + it("should write GeoTIFF RF to parquet") { val rf = spark.read.format("geotiff").load(cogPath.toASCIIString).asLayer assert(write(rf)) @@ -105,6 +115,9 @@ class GeoTiffDataSourceSpec noException shouldBe thrownBy { 
rf.write.format("geotiff").save(out.toString) } + val extent = rf.tileLayerMetadata.merge.extent + + checkTiff(out, 1028, 989, extent) } it("should write unstructured raster") { @@ -117,10 +130,10 @@ class GeoTiffDataSourceSpec val crs = df.select(rf_crs($"proj_raster")).first() - val out = Paths.get("target", "unstructured.tif").toString + val out = Paths.get("target", "unstructured.tif") noException shouldBe thrownBy { - df.write.geotiff.withCRS(crs).save(out) + df.write.geotiff.withCRS(crs).save(out.toString) } val (inCols, inRows) = { @@ -130,11 +143,7 @@ class GeoTiffDataSourceSpec inCols should be (774) inRows should be (500) //from gdalinfo - val outputTif = SinglebandGeoTiff(out) - outputTif.imageData.cols should be (inCols) - outputTif.imageData.rows should be (inRows) - - // TODO check datatype, extent. + checkTiff(out, inCols, inRows, Extent(431902.5, 4313647.5, 443512.5, 4321147.5)) } it("should round trip unstructured raster from COG"){ @@ -164,10 +173,10 @@ class GeoTiffDataSourceSpec dfExtent shouldBe resourceExtent - val out = Paths.get("target", "unstructured_cog.tif").toString + val out = Paths.get("target", "unstructured_cog.tif") noException shouldBe thrownBy { - df.write.geotiff.withCRS(crs).save(out) + df.write.geotiff.withCRS(crs).save(out.toString) } val (inCols, inRows, inExtent, inCellType) = { @@ -175,15 +184,11 @@ class GeoTiffDataSourceSpec val id = tif.imageData (id.cols, id.rows, tif.extent, tif.cellType) } - inCols should be (963) - inRows should be (754) //from gdalinfo + inCols should be (resourceCols) + inRows should be (resourceRows) //from gdalinfo inExtent should be (resourceExtent) - val outputTif = SinglebandGeoTiff(out) - outputTif.imageData.cols should be (inCols) - outputTif.imageData.rows should be (inRows) - outputTif.extent should be (resourceExtent) - outputTif.cellType should be (inCellType) + checkTiff(out, inCols, inRows, resourceExtent, Some(inCellType)) } it("should write GeoTIFF without layer") { @@ -218,9 +223,12 @@ class GeoTiffDataSourceSpec .save(out.toString) } } + + checkTiff(out, 128, 128, + Extent(-76.52586750038186, 36.85907177863949, -76.17461216980891, 37.1303690755922)) } - it("should produce the correct subregion") { + it("should produce the correct subregion from layer") { import spark.implicits._ val rf = SinglebandGeoTiff(TestData.singlebandCogPath.getPath) .projectedRaster.toLayer(128, 128).withExtent() @@ -232,9 +240,25 @@ class GeoTiffDataSourceSpec val expectedExtent = bitOfLayer.select($"extent".as[Extent]).first() bitOfLayer.write.geotiff.save(out.toString) - val result = SinglebandGeoTiff(out.toString) - result.tile.dimensions should be (128, 128) - result.extent should be (expectedExtent) + checkTiff(out, 128, 128, expectedExtent) + } + + it("should produce the correct subregion without layer") { + import spark.implicits._ + + val rf = spark.read.raster + .withTileDimensions(128, 128) + .load(TestData.singlebandCogPath.toASCIIString) + + val out = Paths.get("target", "example3-geotiff.tif") + logger.info(s"Writing to $out") + + val bitOfLayer = rf.filter(st_intersects(st_makePoint(754245, 3893385), rf_geometry($"proj_raster"))) + val expectedExtent = bitOfLayer.select(rf_extent($"proj_raster")).first() + val crs = bitOfLayer.select(rf_crs($"proj_raster")).first() + bitOfLayer.write.geotiff.withCRS(crs).save(out.toString) + + checkTiff(out, 128, 128, expectedExtent) } def s(band: Int): String = From 3444453af1082cb759c00b8e7a5c40fc530e7e12 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. 
Fitch" Date: Sun, 22 Sep 2019 20:18:59 -0400 Subject: [PATCH 46/48] IT regression fixes. --- .../ProjectedLayerMetadataAggregate.scala | 6 ++++++ .../expressions/generators/ExplodeTiles.scala | 2 +- .../awspds/L8CatalogRelationTest.scala | 21 +++++++------------ 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/ProjectedLayerMetadataAggregate.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/ProjectedLayerMetadataAggregate.scala index f181d0eef..267393f79 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/ProjectedLayerMetadataAggregate.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/ProjectedLayerMetadataAggregate.scala @@ -77,6 +77,10 @@ class ProjectedLayerMetadataAggregate(destCRS: CRS, destDims: TileDimensions) ex import org.locationtech.rasterframes.encoders.CatalystSerializer._ val buf = buffer.to[BufferRecord] + if (buf.isEmpty) { + throw new IllegalArgumentException("Can not collect metadata from empty data frame.") + } + val re = RasterExtent(buf.extent, buf.cellSize) val layout = LayoutDefinition(re, destDims.cols, destDims.rows) @@ -152,6 +156,8 @@ object ProjectedLayerMetadataAggregate { buffer(i) = encoded(i) } } + + def isEmpty: Boolean = extent == null || cellType == null || cellSize == null } private[expressions] diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala index 74589c0ae..06c0c033e 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/generators/ExplodeTiles.scala @@ -25,7 +25,7 @@ import geotrellis.raster._ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, CodegenFallback, UnsafeRowWriter} -import org.apache.spark.sql.catalyst.expressions.{Expression, Generator, GenericInternalRow, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.{Expression, Generator, UnsafeRow} import org.apache.spark.sql.types._ import org.locationtech.rasterframes._ import org.locationtech.rasterframes.expressions.DynamicExtractors diff --git a/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala b/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala index 863410123..ea202b726 100644 --- a/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala +++ b/experimental/src/it/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelationTest.scala @@ -108,12 +108,13 @@ class L8CatalogRelationTest extends TestEnvironment { stats.mean shouldBe > (10000.0) } - ignore("should construct an RGB composite") { - val aoi = Extent(31.115, 29.963, 31.148, 29.99) + it("should construct an RGB composite") { + val aoiLL = Extent(31.115, 29.963, 31.148, 29.99) + val scene = catalog .where( to_date($"acquisition_date") === to_date(lit("2019-07-03")) && - st_intersects(st_geometry($"bounds_wgs84"), geomLit(aoi.jtsGeom)) + st_intersects(st_geometry($"bounds_wgs84"), geomLit(aoiLL.jtsGeom)) ) .orderBy("cloud_cover_pct") .limit(1) @@ -122,19 +123,13 @@ class 
L8CatalogRelationTest extends TestEnvironment { .fromCatalog(scene, "B4", "B3", "B2") .withTileDimensions(256, 256) .load() - .where(st_contains(rf_geometry($"B4"), st_reproject(geomLit(aoi.jtsGeom), lit("EPSG:4326"), rf_crs($"B4")))) - + .limit(1) noException should be thrownBy { - val raster = TileRasterizerAggregate(df, LatLng, Some(aoi), None) - println(raster) + val raster = TileRasterizerAggregate.collect(df, LatLng, Some(aoiLL), None) + raster.tile.bandCount should be (3) + raster.extent.area should be > (0.0) } - -// import geotrellis.raster.io.geotiff.{GeoTiffOptions, MultibandGeoTiff, Tiled} -// import geotrellis.raster.io.geotiff.compression.{DeflateCompression} -// import geotrellis.raster.io.geotiff.tags.codes.ColorSpace -// val tiffOptions = GeoTiffOptions(Tiled, DeflateCompression, ColorSpace.RGB) -// MultibandGeoTiff(raster, raster.crs, tiffOptions).write("target/composite.tif") } } }
From 1b7d35f1017828f43b4c15516be14dbc13270561 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Mon, 23 Sep 2019 08:54:54 -0400 Subject: [PATCH 47/48] Updated release notes. --- docs/src/main/paradox/release-notes.md | 1 + 1 file changed, 1 insertion(+)
diff --git a/docs/src/main/paradox/release-notes.md b/docs/src/main/paradox/release-notes.md index 48dc92a93..0184279ed 100644 --- a/docs/src/main/paradox/release-notes.md +++ b/docs/src/main/paradox/release-notes.md @@ -4,6 +4,7 @@ ### 0.8.2 +* Fixed handling of aggregate extent and image size on GeoTIFF writing. ([#362](https://github.com/locationtech/rasterframes/issues/362)) * Fixed issue with `RasterSourceDataSource` swallowing exceptions. ([#267](https://github.com/locationtech/rasterframes/issues/267)) * Fixed SparkML memory pressure issue caused by unnecessary reevaluation, overallocation, and primitive boxing. ([#343](https://github.com/locationtech/rasterframes/issues/343)) * Fixed Parquet serialization issue with `RasterRef`s ([#338](https://github.com/locationtech/rasterframes/issues/338))
From 315b50dc608adedefada2a876e0f38f74e52b4bb Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Mon, 23 Sep 2019 11:13:13 -0400 Subject: [PATCH 48/48] 0.8.2 release prep. --- docs/src/main/paradox/release-notes.md | 2 ++ project/RFDependenciesPlugin.scala | 2 +- pyrasterframes/src/main/python/docs/__init__.py | 16 ---------------- .../src/main/python/pyrasterframes/version.py | 2 +- pyrasterframes/src/main/python/requirements.txt | 2 +- version.sbt | 2 +- 6 files changed, 6 insertions(+), 20 deletions(-)
diff --git a/docs/src/main/paradox/release-notes.md b/docs/src/main/paradox/release-notes.md index 0184279ed..546168efb 100644 --- a/docs/src/main/paradox/release-notes.md +++ b/docs/src/main/paradox/release-notes.md @@ -4,6 +4,8 @@ ### 0.8.2 +* Added ability to pass config options to convenience PySpark session constructor. ([#361](https://github.com/locationtech/rasterframes/issues/361)) +* Bumped Spark dependency to version 2.3.4. ([#350](https://github.com/locationtech/rasterframes/issues/350)) * Fixed handling of aggregate extent and image size on GeoTIFF writing. ([#362](https://github.com/locationtech/rasterframes/issues/362)) * Fixed issue with `RasterSourceDataSource` swallowing exceptions. ([#267](https://github.com/locationtech/rasterframes/issues/267)) * Fixed SparkML memory pressure issue caused by unnecessary reevaluation, overallocation, and primitive boxing.
([#343](https://github.com/locationtech/rasterframes/issues/343)) diff --git a/project/RFDependenciesPlugin.scala b/project/RFDependenciesPlugin.scala index 85fa87923..f8e21c243 100644 --- a/project/RFDependenciesPlugin.scala +++ b/project/RFDependenciesPlugin.scala @@ -59,7 +59,7 @@ object RFDependenciesPlugin extends AutoPlugin { ), // NB: Make sure to update the Spark version in pyrasterframes/python/setup.py - rfSparkVersion := "2.3.3", + rfSparkVersion := "2.3.4", rfGeoTrellisVersion := "2.3.1", rfGeoMesaVersion := "2.2.1", //dependencyOverrides += "com.azavea.gdal" % "gdal-warp-bindings" % "33.58d4965" diff --git a/pyrasterframes/src/main/python/docs/__init__.py b/pyrasterframes/src/main/python/docs/__init__.py index 1778c5590..0f728b435 100644 --- a/pyrasterframes/src/main/python/docs/__init__.py +++ b/pyrasterframes/src/main/python/docs/__init__.py @@ -18,25 +18,9 @@ # SPDX-License-Identifier: Apache-2.0 # -import os - from pweave import PwebPandocFormatter -def docs_dir(): - return os.path.dirname(os.path.realpath(__file__)) - - -# This is temporary until we port to run on web assets. -def resource_dir(): - # we may consider using gitpython which I think would be appropriate in the context of building docs - # see https://stackoverflow.com/a/41920796/2787937 - here = docs_dir() - test_resource = os.path.realpath(os.path.join(here, '..', '..', '..', 'src', 'test', 'resources')) - - return test_resource - - class PegdownMarkdownFormatter(PwebPandocFormatter): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/pyrasterframes/src/main/python/pyrasterframes/version.py b/pyrasterframes/src/main/python/pyrasterframes/version.py index 55b8dfe06..5f2706e94 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/version.py +++ b/pyrasterframes/src/main/python/pyrasterframes/version.py @@ -20,4 +20,4 @@ # # Translating Java version from version.sbt to PEP440 norms -__version__ = '0.8.2.dev0' +__version__ = '0.8.2' diff --git a/pyrasterframes/src/main/python/requirements.txt b/pyrasterframes/src/main/python/requirements.txt index 945404eaf..20156c3c5 100644 --- a/pyrasterframes/src/main/python/requirements.txt +++ b/pyrasterframes/src/main/python/requirements.txt @@ -1,6 +1,6 @@ pytz Shapely>=1.6.0 -pyspark==2.3.3 # See issue # 154 +pyspark==2.3.4 # See issue # 154 numpy>=1.7 pandas>=0.25.0 matplotlib<3.0.0 # no python 2.7 support after v2.x.x diff --git a/version.sbt b/version.sbt index 05d6fc967..3f0ac0890 100644 --- a/version.sbt +++ b/version.sbt @@ -1 +1 @@ -version in ThisBuild := "0.8.2-SNAPSHOT" +version in ThisBuild := "0.8.2"
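Postscript for release verification: the snippet below is a minimal sketch exercising the repaired unstructured-raster GeoTIFF write path ([#362]). The session setup, COG URI, and output path are hypothetical; the reader and writer calls mirror the new "without layer" test added earlier in this series.

    import org.apache.spark.sql.SparkSession
    import org.locationtech.rasterframes._
    import org.locationtech.rasterframes.datasource.raster._
    import org.locationtech.rasterframes.datasource.geotiff._

    val spark = SparkSession.builder().master("local[*]").getOrCreate().withRasterFrames
    import spark.implicits._

    val df = spark.read.raster
      .withTileDimensions(128, 128)
      .load("https://example.com/scene.tif") // hypothetical COG URI

    // Unstructured rasters carry no layer metadata, so the destination CRS
    // must be supplied to the GeoTIFF writer explicitly.
    val crs = df.select(rf_crs($"proj_raster")).first()
    df.write.geotiff.withCRS(crs).save("target/verify-362.tif")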