Simple prediction methods • augury

The simple prediction methods available in augury are easy to use, yet they provide the same functionality as the more complex modeling functions in the package: you can supply a test column and get error metrics back. Let’s use data on alcohol from the GHO to demonstrate this functionality.

library(augury)

# Download the GHO alcohol indicator (restricted by the Dim1 filter), wrangle
# it with billionaiRe, and sort the rows by country and year.
df <- ghost::gho_data(
  "SA_0000001688",
  query = "$filter=Dim1 eq 'BTSX'"
) %>%
  billionaiRe::wrangle_gho_data(
    source = "WHO GHO",
    type = "estimated"
  ) %>%
  dplyr::arrange(iso3, year)

# Preview the first rows of the wrangled data.
head(df)
#> # A tibble: 6 × 13
#>   iso3   year ind      value  lower  upper use_dash use_calc source  type     
#>   <chr> <int> <chr>    <dbl>  <dbl>  <dbl> <lgl>    <lgl>    <chr>   <chr>    
#> 1 AFG    2000 alcohol 0.0354 0.0286 0.0959 TRUE     TRUE     WHO GHO estimated
#> 2 AFG    2001 alcohol 0.0354 0.0286 0.0959 TRUE     TRUE     WHO GHO estimated
#> 3 AFG    2002 alcohol 0.0354 0.0284 0.0954 TRUE     TRUE     WHO GHO estimated
#> 4 AFG    2003 alcohol 0.0354 0.0287 0.0969 TRUE     TRUE     WHO GHO estimated
#> 5 AFG    2004 alcohol 0.0354 0.0283 0.0956 TRUE     TRUE     WHO GHO estimated
#> 6 AFG    2005 alcohol 0.0397 0.0313 0.104  TRUE     TRUE     WHO GHO estimated
#> # … with 3 more variables: type_detail <chr>, other_detail <chr>,
#> #   upload_detail <chr>

Here we can see that the data is a time series, with gaps in some years. We can use linear interpolation and flat extrapolation to fill those gaps and extend the data out to 2023.

# Build the full country-by-year grid for 2000-2023 and attach the observed
# rows to it; country-years without a match keep NA in the joined columns.
df <- dplyr::left_join(
  tidyr::expand_grid(
    iso3 = unique(df$iso3),
    year = 2000:2023
  ),
  df,
  by = c("iso3", "year")
)

# Show Afghanistan's values for 2010-2018 (bounds inclusive).
df %>%
  dplyr::filter(iso3 == "AFG", dplyr::between(year, 2010, 2018)) %>%
  dplyr::select(iso3, year, value)
#> # A tibble: 9 × 3
#>   iso3   year   value
#>   <chr> <int>   <dbl>
#> 1 AFG    2010 0.0245 
#> 2 AFG    2011 0.0170 
#> 3 AFG    2012 0.0127 
#> 4 AFG    2013 0.0127 
#> 5 AFG    2014 0.00848
#> 6 AFG    2015 0.00848
#> 7 AFG    2016 0.00848
#> 8 AFG    2017 0.0127 
#> 9 AFG    2018 0.0127

Let’s now use our linear interpolation and flat extrapolation on this data.

# Run the simple prediction separately within each country, ordered by year.
pred_df <- df %>%
  predict_simple(group_col = "iso3", sort_col = "year")

# Same Afghanistan 2010-2018 window as before, now taken from the predictions.
pred_df %>%
  dplyr::filter(iso3 == "AFG", dplyr::between(year, 2010, 2018)) %>%
  dplyr::select(iso3, year, value)
#> # A tibble: 9 × 3
#>   iso3   year   value
#>   <chr> <int>   <dbl>
#> 1 AFG    2010 0.0245 
#> 2 AFG    2011 0.0170 
#> 3 AFG    2012 0.0127 
#> 4 AFG    2013 0.0127 
#> 5 AFG    2014 0.00848
#> 6 AFG    2015 0.00848
#> 7 AFG    2016 0.00848
#> 8 AFG    2017 0.0127 
#> 9 AFG    2018 0.0127

And we can see our linear interpolation there. We can also see the flat extrapolation, which carries the final observed value forward through 2023.

# Afghanistan's rows after 2016, to inspect the extrapolated tail.
pred_df %>%
  dplyr::filter(iso3 == "AFG" & year > 2016) %>%
  dplyr::select(iso3, year, value)
#> # A tibble: 7 × 3
#>   iso3   year  value
#>   <chr> <int>  <dbl>
#> 1 AFG    2017 0.0127
#> 2 AFG    2018 0.0127
#> 3 AFG    2019 0.0127
#> 4 AFG    2020 0.0127
#> 5 AFG    2021 0.0127
#> 6 AFG    2022 0.0127
#> 7 AFG    2023 0.0127

The predict_average() function works in much the same way, but it is most useful when we have robust series for some countries and no data at all for others. We can then use something like the regional average to infill data for the missing countries.

# Download the clean fuels indicator from the GHO, wrangle it with
# billionaiRe, and keep only rows whose iso3 code is a WHO member state.
df <- ghost::gho_data("PHE_HHAIR_PROP_POP_CLEAN_FUELS") %>%
  billionaiRe::wrangle_gho_data(
    source = "WHO GHO",
    type = "estimated"
  ) %>%
  dplyr::filter(whoville::is_who_member(iso3))
#> Warning: Some of the rows are missing a ind value.

# List the WHO member states that have no rows in the downloaded data:
# match() returns NA for every code that is absent from df$iso3.
x <- whoville::who_member_states()
x[is.na(match(x, df$iso3))]
#> [1] "LBN" "CUB" "BGR" "LBY"

Above, we can see that 4 WHO member states are missing from the data: Lebanon, Cuba, Bulgaria, and Libya. Let’s use regional averaging to fill in this data. We can use the most recent World Bank income groups from the whoville package as our relevant group.

# Expand to every member state and every year in 2000-2018, attach the
# observed rows, and tag each country with its "wb_ig" region from whoville.
df <- dplyr::left_join(
  tidyr::expand_grid(iso3 = x, year = 2000:2018),
  df,
  by = c("iso3", "year")
) %>%
  dplyr::mutate(
    region = whoville::iso3_to_regions(iso3, region = "wb_ig")
  )

# Average over region and year, label the source of the result, and show
# Lebanon, one of the countries with no data of its own.
df %>%
  predict_average(
    average_cols = c("region", "year"),
    group_col = "iso3",
    sort_col = "year",
    type_col = "type",
    source_col = "source",
    source = "WB IG regional averages"
  ) %>%
  dplyr::filter(iso3 == "LBN")
#> # A tibble: 19 × 15
#>    iso3   year ind   value lower upper use_dash use_calc source            type 
#>    <chr> <int> <chr> <dbl> <dbl> <dbl> <lgl>    <lgl>    <chr>             <chr>
#>  1 LBN    2000 NA     67.2    NA    NA NA       NA       WB IG regional a… impu…
#>  2 LBN    2001 NA     68.2    NA    NA NA       NA       WB IG regional a… impu…
#>  3 LBN    2002 NA     69.2    NA    NA NA       NA       WB IG regional a… impu…
#>  4 LBN    2003 NA     70.2    NA    NA NA       NA       WB IG regional a… impu…
#>  5 LBN    2004 NA     71.2    NA    NA NA       NA       WB IG regional a… impu…
#>  6 LBN    2005 NA     72.2    NA    NA NA       NA       WB IG regional a… impu…
#>  7 LBN    2006 NA     73.1    NA    NA NA       NA       WB IG regional a… impu…
#>  8 LBN    2007 NA     74.0    NA    NA NA       NA       WB IG regional a… impu…
#>  9 LBN    2008 NA     74.8    NA    NA NA       NA       WB IG regional a… impu…
#> 10 LBN    2009 NA     75.6    NA    NA NA       NA       WB IG regional a… impu…
#> 11 LBN    2010 NA     76.3    NA    NA NA       NA       WB IG regional a… impu…
#> 12 LBN    2011 NA     77.0    NA    NA NA       NA       WB IG regional a… impu…
#> 13 LBN    2012 NA     77.6    NA    NA NA       NA       WB IG regional a… impu…
#> 14 LBN    2013 NA     78.2    NA    NA NA       NA       WB IG regional a… impu…
#> 15 LBN    2014 NA     78.7    NA    NA NA       NA       WB IG regional a… impu…
#> 16 LBN    2015 NA     79.2    NA    NA NA       NA       WB IG regional a… impu…
#> 17 LBN    2016 NA     79.7    NA    NA NA       NA       WB IG regional a… impu…
#> 18 LBN    2017 NA     80.1    NA    NA NA       NA       WB IG regional a… impu…
#> 19 LBN    2018 NA     80.5    NA    NA NA       NA       WB IG regional a… impu…
#> # … with 5 more variables: type_detail <chr>, other_detail <chr>,
#> #   upload_detail <chr>, region <chr>, pred <dbl>

We hope these examples have been clear and that they highlight some of the usefulness of these simple modeling functions.