Skip to contents

[Experimental] Open the oracle-output target data file(s) in a hub as an arrow dataset.

Usage

connect_target_oracle_output(
  hub_path = ".",
  na = c("NA", ""),
  ignore_files = NULL
)

Arguments

hub_path

Either a character string path to a local Modeling Hub directory or an object of class <SubTreeFileSystem> created using the s3_bucket() or gs_bucket() functions by providing a string S3 or GCS bucket name or path to a Modeling Hub directory stored in the cloud. For more details, consult the "Using cloud storage (S3, GCS)" article in the arrow package documentation. The hub must be fully configured with valid admin.json and tasks.json files within the hub-config directory.

na

A character vector of strings to interpret as missing values. Only applies to CSV files. The default is c("NA", ""). Useful when actual character string "NA" values are used in the data. In such a case, use empty cells to indicate missing values in your files and set na = "".

ignore_files

A character vector of file names (not paths) or file prefixes to ignore when discovering oracle-output target data files to include in dataset connections. Parent directory names should not be included. Common non-data files such as "README" and ".DS_Store" are ignored automatically, but additional files can be excluded by specifying them here.

Value

An arrow dataset object of subclass <target_oracle_output>.

Details

If the target data is split across multiple files in an oracle-output directory, all files must share the same file format, either csv or parquet. No other types of files are currently allowed in an oracle-output directory.

Examples

# Clone example hub
tmp_hub_path <- withr::local_tempdir()
example_hub <- "https://github.com/hubverse-org/example-complex-forecast-hub.git"
gert::git_clone(url = example_hub, path = tmp_hub_path)
# Connect to oracle-output data
oo_con <- connect_target_oracle_output(tmp_hub_path)
oo_con
#> target_oracle_output with 1 csv file
#> 6 columns
#> location: string
#> target_end_date: date32[day]
#> target: string
#> output_type: string
#> output_type_id: string
#> oracle_value: double
# Collect all oracle-output data
oo_con |> dplyr::collect()
#> # A tibble: 200,340 × 6
#>    location target_end_date target       output_type output_type_id oracle_value
#>    <chr>    <date>          <chr>        <chr>       <chr>                 <dbl>
#>  1 US       2022-10-22      wk inc flu … quantile    NA                     2380
#>  2 01       2022-10-22      wk inc flu … quantile    NA                      141
#>  3 02       2022-10-22      wk inc flu … quantile    NA                        3
#>  4 04       2022-10-22      wk inc flu … quantile    NA                       22
#>  5 05       2022-10-22      wk inc flu … quantile    NA                       50
#>  6 06       2022-10-22      wk inc flu … quantile    NA                      124
#>  7 08       2022-10-22      wk inc flu … quantile    NA                       15
#>  8 09       2022-10-22      wk inc flu … quantile    NA                        9
#>  9 10       2022-10-22      wk inc flu … quantile    NA                        1
#> 10 11       2022-10-22      wk inc flu … quantile    NA                        8
#> # ℹ 200,330 more rows
# Filter for a specific date before collecting
oo_con |>
  dplyr::filter(target_end_date == "2022-11-12") |>
  dplyr::collect()
#> # A tibble: 5,724 × 6
#>    location target_end_date target       output_type output_type_id oracle_value
#>    <chr>    <date>          <chr>        <chr>       <chr>                 <dbl>
#>  1 US       2022-11-12      wk inc flu … quantile    NA                     8848
#>  2 01       2022-11-12      wk inc flu … quantile    NA                      303
#>  3 02       2022-11-12      wk inc flu … quantile    NA                       20
#>  4 04       2022-11-12      wk inc flu … quantile    NA                      153
#>  5 05       2022-11-12      wk inc flu … quantile    NA                      212
#>  6 06       2022-11-12      wk inc flu … quantile    NA                      928
#>  7 08       2022-11-12      wk inc flu … quantile    NA                       61
#>  8 09       2022-11-12      wk inc flu … quantile    NA                       58
#>  9 10       2022-11-12      wk inc flu … quantile    NA                       15
#> 10 11       2022-11-12      wk inc flu … quantile    NA                       44
#> # ℹ 5,714 more rows
# Filter for a specific location before collecting
oo_con |>
  dplyr::filter(location == "US") |>
  dplyr::collect()
#> # A tibble: 3,780 × 6
#>    location target_end_date target       output_type output_type_id oracle_value
#>    <chr>    <date>          <chr>        <chr>       <chr>                 <dbl>
#>  1 US       2022-10-22      wk inc flu … quantile    NA                     2380
#>  2 US       2022-10-29      wk inc flu … quantile    NA                     4353
#>  3 US       2022-11-05      wk inc flu … quantile    NA                     6571
#>  4 US       2022-11-12      wk inc flu … quantile    NA                     8848
#>  5 US       2022-11-19      wk inc flu … quantile    NA                    11427
#>  6 US       2022-11-26      wk inc flu … quantile    NA                    19846
#>  7 US       2022-12-03      wk inc flu … quantile    NA                    26333
#>  8 US       2022-12-10      wk inc flu … quantile    NA                    23851
#>  9 US       2022-12-17      wk inc flu … quantile    NA                    21435
#> 10 US       2022-12-24      wk inc flu … quantile    NA                    19286
#> # ℹ 3,770 more rows
# Get distinct target_end_date values
oo_con |>
  dplyr::distinct(target_end_date) |>
  dplyr::pull(as_vector = TRUE)
#>  [1] "2022-10-22" "2022-10-29" "2022-11-05" "2022-11-12" "2022-11-19"
#>  [6] "2022-11-26" "2022-12-03" "2022-12-10" "2022-12-17" "2022-12-24"
#> [11] "2022-12-31" "2023-01-07" "2023-01-14" "2023-01-21" "2023-01-28"
#> [16] "2023-02-04" "2023-02-11" "2023-02-18" "2023-02-25" "2023-03-04"
#> [21] "2023-03-11" "2023-03-18" "2023-03-25" "2023-04-01" "2023-04-08"
#> [26] "2023-04-15" "2023-04-22" "2023-04-29" "2023-05-06" "2023-05-13"
#> [31] "2023-05-20" "2023-05-27" "2023-06-03" "2023-06-10" "2023-06-17"
# Access target oracle-output data from a cloud hub
s3_hub_path <- s3_bucket("example-complex-forecast-hub")
s3_con <- connect_target_oracle_output(s3_hub_path)
s3_con
#> target_oracle_output with 1 csv file
#> 6 columns
#> location: string
#> target_end_date: date32[day]
#> target: string
#> output_type: string
#> output_type_id: string
#> oracle_value: double
s3_con |> dplyr::collect()
#> # A tibble: 200,340 × 6
#>    location target_end_date target       output_type output_type_id oracle_value
#>    <chr>    <date>          <chr>        <chr>       <chr>                 <dbl>
#>  1 US       2022-10-22      wk inc flu … quantile    NA                     2380
#>  2 01       2022-10-22      wk inc flu … quantile    NA                      141
#>  3 02       2022-10-22      wk inc flu … quantile    NA                        3
#>  4 04       2022-10-22      wk inc flu … quantile    NA                       22
#>  5 05       2022-10-22      wk inc flu … quantile    NA                       50
#>  6 06       2022-10-22      wk inc flu … quantile    NA                      124
#>  7 08       2022-10-22      wk inc flu … quantile    NA                       15
#>  8 09       2022-10-22      wk inc flu … quantile    NA                        9
#>  9 10       2022-10-22      wk inc flu … quantile    NA                        1
#> 10 11       2022-10-22      wk inc flu … quantile    NA                        8
#> # ℹ 200,330 more rows