
Raster data in GeoTiff and ArcInfo ASCII Grid formats can be read into Spark.

Using the RasterUDT

Raster data in GeoTiff and ArcInfo ASCII Grid formats can be loaded directly into Spark using sparklyr::spark_read_binary together with the Sedona constructors RS_FromGeoTiff and RS_FromArcInfoAsciiGrid.

library(dplyr)
library(sparklyr)
library(apache.sedona)

sc <- spark_connect(master = "local")

data_tbl <- spark_read_binary(sc, dir = "../../core/src/test/resources/raster/", name = "data") 

raster <- 
  data_tbl %>% 
  mutate(raster = RS_FromGeoTiff(content))

raster 
#> # Source: spark<?> [?? x 5]
#>   path                                        modificationTime    length content
#>   <chr>                                       <dttm>               <dbl> <list> 
#> 1 file:/Users/gregoireleleu/WORK/MISC_CODE/s… 2023-02-14 16:51:43 174803 <raw>  
#> 2 file:/Users/gregoireleleu/WORK/MISC_CODE/s… 2023-02-14 16:51:43 174803 <raw>  
#> 3 file:/Users/gregoireleleu/WORK/MISC_CODE/s… 2023-02-14 16:51:43   6619 <raw>  
#> # … with 1 more variable: raster <list>

raster %>% sdf_schema()
#> $path
#> $path$name
#> [1] "path"
#> 
#> $path$type
#> [1] "StringType"
#> 
#> 
#> $modificationTime
#> $modificationTime$name
#> [1] "modificationTime"
#> 
#> $modificationTime$type
#> [1] "TimestampType"
#> 
#> 
#> $length
#> $length$name
#> [1] "length"
#> 
#> $length$type
#> [1] "LongType"
#> 
#> 
#> $content
#> $content$name
#> [1] "content"
#> 
#> $content$type
#> [1] "BinaryType"
#> 
#> 
#> $raster
#> $raster$name
#> [1] "raster"
#> 
#> $raster$type
#> [1] "RasterUDT"
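
The same reader handles ArcInfo ASCII Grid rasters: read the files with spark_read_binary and build the raster column with RS_FromArcInfoAsciiGrid. A minimal sketch, assuming a hypothetical folder of .asc files:

asc_tbl <- spark_read_binary(sc, dir = "/path/to/ascii_grids/", name = "asc_data")

asc_raster <- 
  asc_tbl %>% 
  mutate(raster = RS_FromArcInfoAsciiGrid(content))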

Once the data is loaded, raster functions are available in dplyr workflows:

Functions taking raster: Raster arguments (such as RS_Value, RS_Values and RS_Envelope) are meant to be used with data loaded with this reader. Functions taking Band: Array[Double] arguments work with data loaded using the Sedona GeoTiff DataFrame loader (see below).

For example, getting the number of bands:

raster %>% 
  mutate(
    nbands = RS_NumBands(raster)
  ) %>% 
  select(path, nbands) %>% 
  collect() %>% 
  mutate(path = path %>% basename())
#> # A tibble: 3 × 2
#>   path       nbands
#>   <chr>       <int>
#> 1 test1.tiff      1
#> 2 test2.tiff      1
#> 3 test3.tif       4

Or getting the envelope of each raster:

raster %>% 
  mutate(
    env = RS_Envelope(raster) %>% st_astext()
  ) %>% 
  select(path, env) %>% 
  collect() %>% 
  mutate(path = path %>% basename())
#> # A tibble: 3 × 2
#>   path       env                                                                
#>   <chr>      <chr>                                                              
#> 1 test1.tiff POLYGON ((-13095817.809482181 3983868.8560156375, -13095817.809482…
#> 2 test2.tiff POLYGON ((-13095817.809482181 3983868.8560156375, -13095817.809482…
#> 3 test3.tif  POLYGON ((382240 6152660, 382240 6152980, 382560 6152980, 382560 6…
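
Since the envelopes are collected as WKT strings, they can be converted to sf geometries on the R side; a sketch, assuming the sf package is installed:

library(sf)

raster %>% 
  mutate(env = RS_Envelope(raster) %>% st_astext()) %>% 
  select(path, env) %>% 
  collect() %>% 
  mutate(env = st_as_sfc(env))   ## parse the collected WKT strings into an sfc column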

Or getting the value at a specific point:

raster %>% 
  mutate(
    val = RS_Value(raster, ST_Point(-13077301.685, 4002565.802))
  ) %>% 
  select(path, val) %>% 
  collect() %>% 
  mutate(path = path %>% basename())
#> # A tibble: 3 × 2
#>   path         val
#>   <chr>      <dbl>
#> 1 test1.tiff   255
#> 2 test2.tiff   255
#> 3 test3.tif     NA
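
RS_Values, mentioned above, retrieves values for several points in a single call; a hedged sketch, assuming it accepts an array of point geometries (the coordinates below are arbitrary example points):

raster %>% 
  mutate(
    ## both the array() construction and the RS_Values signature are assumptions
    vals = RS_Values(
      raster,
      array(ST_Point(-13077301.685, 4002565.802),
            ST_Point(-13095817.809, 3983868.856))
    )
  ) %>% 
  select(path, vals) %>% 
  collect()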

Using the Sedona GeoTiff DataFrame Loader

The Sedona GeoTiff DataFrame loader reads data from a GeoTiff file (or a folder containing multiple files) into a Spark DataFrame. The resulting data is a nested column. It can be unnested either with a direct SQL query (results are collected immediately)…:

data_tbl <- spark_read_geotiff(
  sc,
  path = "../../core/src/test/resources/raster/",
  name = "data",
  options = list(dropInvalid = TRUE)
)
data_tbl
#> # Source: spark<data> [?? x 1]
#>   image           
#>   <list>          
#> 1 <named list [6]>
#> 2 <named list [6]>
#> 3 <named list [6]>

## Using a direct SQL query: results are collected directly
sc %>% 
    DBI::dbGetQuery("SELECT 
             image.geometry as Geom, 
             image.height as height, 
             image.width as width, 
             image.nBands as bands 
             FROM data")
#>                                                                                                              Geom
#> 1 POLYGON ((-13095782 4021226.5, -13095782 3983905, -13058822 3983905, -13058822 4021226.5, -13095782 4021226.5))
#> 2 POLYGON ((-13095782 4021226.5, -13095782 3983905, -13058822 3983905, -13058822 4021226.5, -13095782 4021226.5))
#> 3                      POLYGON ((382245 6152975, 382245 6152665, 382555 6152665, 382555 6152975, 382245 6152975))
#>   height width bands
#> 1    517   512     1
#> 2    517   512     1
#> 3     32    32     4

… or using {sparklyr.nested} (results stay in Spark until collection):

library(sparklyr.nested)

data_tbl %>% sdf_schema_json(parse_json = FALSE) %>% jsonlite::prettify()
#> {
#>     "type": "struct",
#>     "fields": [
#>         {
#>             "name": "image",
#>             "type": {
#>                 "type": "struct",
#>                 "fields": [
#>                     {
#>                         "name": "origin",
#>                         "type": "string",
#>                         "nullable": true,
#>                         "metadata": {
#> 
#>                         }
#>                     },
#>                     {
#>                         "name": "geometry",
#>                         "type": "string",
#>                         "nullable": true,
#>                         "metadata": {
#> 
#>                         }
#>                     },
#>                     {
#>                         "name": "height",
#>                         "type": "integer",
#>                         "nullable": true,
#>                         "metadata": {
#> 
#>                         }
#>                     },
#>                     {
#>                         "name": "width",
#>                         "type": "integer",
#>                         "nullable": true,
#>                         "metadata": {
#> 
#>                         }
#>                     },
#>                     {
#>                         "name": "nBands",
#>                         "type": "integer",
#>                         "nullable": true,
#>                         "metadata": {
#> 
#>                         }
#>                     },
#>                     {
#>                         "name": "data",
#>                         "type": {
#>                             "type": "array",
#>                             "elementType": "double",
#>                             "containsNull": true
#>                         },
#>                         "nullable": true,
#>                         "metadata": {
#> 
#>                         }
#>                     }
#>                 ]
#>             },
#>             "nullable": true,
#>             "metadata": {
#> 
#>             }
#>         }
#>     ]
#> }
#> 

data_tbl %>% 
  sdf_unnest(image) %>% 
  glimpse()
#> Rows: ??
#> Columns: 6
#> Database: spark_connection
#> $ origin   <chr> "file:///Users/gregoireleleu/WORK/MISC_CODE/sedona/core/src/t…
#> $ geometry <chr> "POLYGON ((-13095782 4021226.5, -13095782 3983905, -13058822 …
#> $ height   <int> 517, 517, 32
#> $ width    <int> 512, 512, 32
#> $ nBands   <int> 1, 1, 4
#> $ data     <list> <0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
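
Individual nested fields can also be picked out without unnesting the whole struct, using sparklyr.nested::sdf_select; a sketch, assuming its dot syntax for nested fields:

data_tbl %>% 
  sdf_select(geom = image.geometry, n_bands = image.nBands)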

The flattened data column holds the band values as an array of doubles, which the band functions operate on, for example RS_MultiplyFactor:

res <- 
  data_tbl %>% 
  sdf_unnest(image) %>% 
  mutate(
    mult = RS_MultiplyFactor(data, 2L)
  ) %>% 
  select(data, mult) %>% 
  collect()

res$data[[1]][750:760]
#>  [1] 197 197 214 189 181 189 181 214 197 173 239
res$mult[[1]][750:760]
#>  [1] 394 394 428 378 362 378 362 428 394 346 478
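
Other Array[Double] functions follow the same pattern. For example, a single band can be extracted from the flattened data column; a hedged sketch, assuming RS_GetBand(data, band_index, n_bands) is available in this Sedona version:

data_tbl %>% 
  sdf_unnest(image) %>% 
  mutate(
    band1 = RS_GetBand(data, 1L, nBands)   ## signature assumed; band indices start at 1
  ) %>% 
  select(origin, nBands, band1)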