Raster data in GeoTiff and ArcInfoAsciiGrid formats can be read into Spark.
Using the RasterUDT
Raster data in GeoTiff and ArcInfo ASCII Grid formats can be loaded directly into Spark using sparklyr::spark_read_binary and the Sedona constructors RS_FromGeoTiff and RS_FromArcInfoAsciiGrid.
library(dplyr)
library(sparklyr)
library(apache.sedona)
sc <- spark_connect(master = "local")
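# read the raw files into a binary column; RS_FromGeoTiff below turns the content column into rasters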
data_tbl <- spark_read_binary(sc, dir = "../../core/src/test/resources/raster/", name = "data")
raster <-
data_tbl %>%
mutate(raster = RS_FromGeoTiff(content))
raster
#> # Source: spark<?> [?? x 5]
#> path modificationTime length content
#> <chr> <dttm> <dbl> <list>
#> 1 file:/Users/gregoireleleu/WORK/MISC_CODE/s… 2023-02-14 16:51:43 174803 <raw>
#> 2 file:/Users/gregoireleleu/WORK/MISC_CODE/s… 2023-02-14 16:51:43 174803 <raw>
#> 3 file:/Users/gregoireleleu/WORK/MISC_CODE/s… 2023-02-14 16:51:43 6619 <raw>
#> # … with 1 more variable: raster <list>
raster %>% sdf_schema()
#> $path
#> $path$name
#> [1] "path"
#>
#> $path$type
#> [1] "StringType"
#>
#>
#> $modificationTime
#> $modificationTime$name
#> [1] "modificationTime"
#>
#> $modificationTime$type
#> [1] "TimestampType"
#>
#>
#> $length
#> $length$name
#> [1] "length"
#>
#> $length$type
#> [1] "LongType"
#>
#>
#> $content
#> $content$name
#> [1] "content"
#>
#> $content$type
#> [1] "BinaryType"
#>
#>
#> $raster
#> $raster$name
#> [1] "raster"
#>
#> $raster$type
#> [1] "RasterUDT"
Once the data is loaded, raster functions are available in dplyr workflows:
Functions taking raster: Raster arguments, such as RS_Value, RS_Values, and RS_Envelope, are meant to be used with data loaded with this reader.
Functions taking Band: Array[Double] arguments work with data loaded using the Sedona Geotiff DataFrame loader (see below).
For example, getting the number of bands:
raster %>%
mutate(
nbands = RS_NumBands(raster)
) %>%
select(path, nbands) %>%
collect() %>%
mutate(path = path %>% basename())
#> # A tibble: 3 × 2
#> path nbands
#> <chr> <int>
#> 1 test1.tiff 1
#> 2 test2.tiff 1
#> 3 test3.tif 4
Or getting the envelope:
raster %>%
mutate(
env = RS_Envelope(raster) %>% st_astext()
) %>%
select(path, env) %>%
collect() %>%
mutate(path = path %>% basename())
#> # A tibble: 3 × 2
#> path env
#> <chr> <chr>
#> 1 test1.tiff POLYGON ((-13095817.809482181 3983868.8560156375, -13095817.809482…
#> 2 test2.tiff POLYGON ((-13095817.809482181 3983868.8560156375, -13095817.809482…
#> 3 test3.tif POLYGON ((382240 6152660, 382240 6152980, 382560 6152980, 382560 6…
Or getting values at specific points:
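A minimal sketch of what this can look like, assuming RS_Value returns the value of the first band at a point given in the raster's CRS; the centroid of each raster's envelope is used here so the queried point is guaranteed to fall inside each raster (no hard-coded coordinates):
raster %>%
  mutate(
    # value of band 1 at the centre of each raster's extent
    val = RS_Value(raster, ST_Centroid(RS_Envelope(raster)))
  ) %>%
  select(path, val) %>%
  collect() %>%
  mutate(path = path %>% basename())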
Using the Sedona Geotiff Dataframe Loader
The Sedona Geotiff Dataframe Loader reads data from a GeoTiff file (or a folder containing multiple files) into a Spark DataFrame. The resulting data is a nested column. It can be unnested using SQL (results are collected directly)…
data_tbl <- spark_read_geotiff(sc, path = "../../core/src/test/resources/raster/", name = "data", options = list(dropInvalid = TRUE))
data_tbl
#> # Source: spark<data> [?? x 1]
#> image
#> <list>
#> 1 <named list [6]>
#> 2 <named list [6]>
#> 3 <named list [6]>
## Using a direct SQL query: results are collected directly
sc %>%
DBI::dbGetQuery("SELECT
image.geometry as Geom,
image.height as height,
image.width as width,
image.nBands as bands
FROM data")
#> Geom
#> 1 POLYGON ((-13095782 4021226.5, -13095782 3983905, -13058822 3983905, -13058822 4021226.5, -13095782 4021226.5))
#> 2 POLYGON ((-13095782 4021226.5, -13095782 3983905, -13058822 3983905, -13058822 4021226.5, -13095782 4021226.5))
#> 3 POLYGON ((382245 6152975, 382245 6152665, 382555 6152665, 382555 6152975, 382245 6152975))
#> height width bands
#> 1 517 512 1
#> 2 517 512 1
#> 3 32 32 4
… or using {sparklyr.nested} (results stay in Spark until collection):
library(sparklyr.nested)
data_tbl %>% sdf_schema_json(parse_json = FALSE) %>% jsonlite::prettify()
#> {
#> "type": "struct",
#> "fields": [
#> {
#> "name": "image",
#> "type": {
#> "type": "struct",
#> "fields": [
#> {
#> "name": "origin",
#> "type": "string",
#> "nullable": true,
#> "metadata": {
#>
#> }
#> },
#> {
#> "name": "geometry",
#> "type": "string",
#> "nullable": true,
#> "metadata": {
#>
#> }
#> },
#> {
#> "name": "height",
#> "type": "integer",
#> "nullable": true,
#> "metadata": {
#>
#> }
#> },
#> {
#> "name": "width",
#> "type": "integer",
#> "nullable": true,
#> "metadata": {
#>
#> }
#> },
#> {
#> "name": "nBands",
#> "type": "integer",
#> "nullable": true,
#> "metadata": {
#>
#> }
#> },
#> {
#> "name": "data",
#> "type": {
#> "type": "array",
#> "elementType": "double",
#> "containsNull": true
#> },
#> "nullable": true,
#> "metadata": {
#>
#> }
#> }
#> ]
#> },
#> "nullable": true,
#> "metadata": {
#>
#> }
#> }
#> ]
#> }
#>
data_tbl %>%
sdf_unnest(image) %>%
glimpse()
#> Rows: ??
#> Columns: 6
#> Database: spark_connection
#> $ origin <chr> "file:///Users/gregoireleleu/WORK/MISC_CODE/sedona/core/src/t…
#> $ geometry <chr> "POLYGON ((-13095782 4021226.5, -13095782 3983905, -13058822 …
#> $ height <int> 517, 517, 32
#> $ width <int> 512, 512, 32
#> $ nBands <int> 1, 1, 4
#> $ data <list> <0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
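Coming back to the Band: Array[Double] functions mentioned above, here is a minimal sketch, assuming RS_GetBand(data, bandIndex, totalBands) is available in your Sedona version for extracting a single band from the flattened data array produced by this loader:
data_tbl %>%
  sdf_unnest(image) %>%
  mutate(
    # pull band 1 out of the flat data array using the per-image band count
    band1 = RS_GetBand(data, 1, nBands)
  ) %>%
  select(origin, nBands, band1) %>%
  collect()
Collecting brings the band values back into R as list columns of doubles; for large rasters, keep the data in Spark and apply further RS_* functions before collecting.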