The simple methods available in augury are easy to use, yet they offer the same functionality as the more complex modeling functions available in the package: you can supply a test column and have error metrics returned. Let’s use data on alcohol from the GHO to demonstrate this functionality.
library(augury)
# Download GHO indicator SA_0000001688 (filtered to Dim1 == 'BTSX'), pass it
# through billionaiRe::wrangle_gho_data(), and sort by country and year.
df <- ghost::gho_data(
  "SA_0000001688",
  query = "$filter=Dim1 eq 'BTSX'"
) %>%
  billionaiRe::wrangle_gho_data(
    source = "WHO GHO",
    type = "estimated"
  ) %>%
  dplyr::arrange(iso3, year)

# Preview the first rows of the wrangled data.
head(df, n = 6L)
#> # A tibble: 6 × 13
#> iso3 year ind value lower upper use_dash use_calc source type
#> <chr> <int> <chr> <dbl> <dbl> <dbl> <lgl> <lgl> <chr> <chr>
#> 1 AFG 2000 alcohol 0.0354 0.0286 0.0959 TRUE TRUE WHO GHO estimated
#> 2 AFG 2001 alcohol 0.0354 0.0286 0.0959 TRUE TRUE WHO GHO estimated
#> 3 AFG 2002 alcohol 0.0354 0.0284 0.0954 TRUE TRUE WHO GHO estimated
#> 4 AFG 2003 alcohol 0.0354 0.0287 0.0969 TRUE TRUE WHO GHO estimated
#> 5 AFG 2004 alcohol 0.0354 0.0283 0.0956 TRUE TRUE WHO GHO estimated
#> 6 AFG 2005 alcohol 0.0397 0.0313 0.104 TRUE TRUE WHO GHO estimated
#> # … with 3 more variables: type_detail <chr>, other_detail <chr>,
#> # upload_detail <chr>
Here we can see that the data is a time series for each country, with gaps in some years. We can use linear interpolation and flat extrapolation to fill those gaps and extend the data out to 2023.
# Build the full country-by-year grid for 2000-2023 and attach the observed
# rows to it; grid cells without a matching row get NA in the joined columns.
df <- tidyr::expand_grid(
  iso3 = unique(df[["iso3"]]),
  year = 2000:2023
) %>%
  dplyr::left_join(df, by = c("iso3", "year"))

# Inspect the Afghanistan values for 2010 through 2018.
df %>%
  dplyr::filter(iso3 == "AFG" & year >= 2010 & year <= 2018) %>%
  dplyr::select(iso3, year, value)
#> # A tibble: 9 × 3
#> iso3 year value
#> <chr> <int> <dbl>
#> 1 AFG 2010 0.0245
#> 2 AFG 2011 0.0170
#> 3 AFG 2012 0.0127
#> 4 AFG 2013 0.0127
#> 5 AFG 2014 0.00848
#> 6 AFG 2015 0.00848
#> 7 AFG 2016 0.00848
#> 8 AFG 2017 0.0127
#> 9 AFG 2018 0.0127
Let’s now use our linear interpolation and flat extrapolation on this data.
# Run predict_simple() with its default settings, treating each iso3 as a
# separate series ordered by year.
pred_df <- predict_simple(
  df,
  group_col = "iso3",
  sort_col = "year"
)

# Same Afghanistan window as before (2010 through 2018), now on the
# predicted data frame.
pred_df %>%
  dplyr::filter(iso3 == "AFG", dplyr::between(year, 2010, 2018)) %>%
  dplyr::select(iso3, year, value)
#> # A tibble: 9 × 3
#> iso3 year value
#> <chr> <int> <dbl>
#> 1 AFG 2010 0.0245
#> 2 AFG 2011 0.0170
#> 3 AFG 2012 0.0127
#> 4 AFG 2013 0.0127
#> 5 AFG 2014 0.00848
#> 6 AFG 2015 0.00848
#> 7 AFG 2016 0.00848
#> 8 AFG 2017 0.0127
#> 9 AFG 2018 0.0127
And we can see our linear interpolation there. We can also see the flat extrapolation.
# Show the tail of the Afghanistan series (2017 onwards) to inspect the
# extrapolated years.
pred_df %>%
  dplyr::filter(iso3 == "AFG" & year >= 2017) %>%
  dplyr::select(iso3, year, value)
#> # A tibble: 7 × 3
#> iso3 year value
#> <chr> <int> <dbl>
#> 1 AFG 2017 0.0127
#> 2 AFG 2018 0.0127
#> 3 AFG 2019 0.0127
#> 4 AFG 2020 0.0127
#> 5 AFG 2021 0.0127
#> 6 AFG 2022 0.0127
#> 7 AFG 2023 0.0127
We can use the predict_average() function in much the same way, except it is most useful when we have robust series for a set of countries, and no data for others. We can then use something like the regional average to infill data for missing countries.
# Download GHO indicator PHE_HHAIR_PROP_POP_CLEAN_FUELS, wrangle it, and keep
# only rows whose iso3 passes whoville::is_who_member().
df <- ghost::gho_data("PHE_HHAIR_PROP_POP_CLEAN_FUELS") %>%
  billionaiRe::wrangle_gho_data(
    source = "WHO GHO",
    type = "estimated"
  ) %>%
  dplyr::filter(whoville::is_who_member(iso3))
#> Warning: Some of the rows are missing a ind value.
# All WHO member state codes, as returned by whoville.
x <- whoville::who_member_states()

# Member states with no match among the iso3 codes present in the data.
x[is.na(match(x, df$iso3))]
#> [1] "LBN" "CUB" "BGR" "LBY"
Above, we can see that 4 WHO member states are missing from the data: Lebanon, Cuba, Bulgaria, and Libya. Let’s use regional averaging to fill in this data, with the most recent World Bank income groups from the whoville package serving as our regions.
# Build a member-state-by-year grid for 2000-2018, attach the observed rows,
# and label every country with its "wb_ig" region from whoville.
df <- tidyr::expand_grid(
  iso3 = x,
  year = 2000:2018
) %>%
  dplyr::left_join(df, by = c("iso3", "year")) %>%
  dplyr::mutate(
    region = whoville::iso3_to_regions(iso3, region = "wb_ig")
  )

# Average over region and year, then show the result for Lebanon, one of the
# countries with no rows in the downloaded data.
predict_average(
  df,
  average_cols = c("region", "year"),
  group_col = "iso3",
  sort_col = "year",
  type_col = "type",
  source_col = "source",
  source = "WB IG regional averages"
) %>%
  dplyr::filter(iso3 == "LBN")
#> # A tibble: 19 × 15
#> iso3 year ind value lower upper use_dash use_calc source type
#> <chr> <int> <chr> <dbl> <dbl> <dbl> <lgl> <lgl> <chr> <chr>
#> 1 LBN 2000 NA 67.2 NA NA NA NA WB IG regional a… impu…
#> 2 LBN 2001 NA 68.2 NA NA NA NA WB IG regional a… impu…
#> 3 LBN 2002 NA 69.2 NA NA NA NA WB IG regional a… impu…
#> 4 LBN 2003 NA 70.2 NA NA NA NA WB IG regional a… impu…
#> 5 LBN 2004 NA 71.2 NA NA NA NA WB IG regional a… impu…
#> 6 LBN 2005 NA 72.2 NA NA NA NA WB IG regional a… impu…
#> 7 LBN 2006 NA 73.1 NA NA NA NA WB IG regional a… impu…
#> 8 LBN 2007 NA 74.0 NA NA NA NA WB IG regional a… impu…
#> 9 LBN 2008 NA 74.8 NA NA NA NA WB IG regional a… impu…
#> 10 LBN 2009 NA 75.6 NA NA NA NA WB IG regional a… impu…
#> 11 LBN 2010 NA 76.3 NA NA NA NA WB IG regional a… impu…
#> 12 LBN 2011 NA 77.0 NA NA NA NA WB IG regional a… impu…
#> 13 LBN 2012 NA 77.6 NA NA NA NA WB IG regional a… impu…
#> 14 LBN 2013 NA 78.2 NA NA NA NA WB IG regional a… impu…
#> 15 LBN 2014 NA 78.7 NA NA NA NA WB IG regional a… impu…
#> 16 LBN 2015 NA 79.2 NA NA NA NA WB IG regional a… impu…
#> 17 LBN 2016 NA 79.7 NA NA NA NA WB IG regional a… impu…
#> 18 LBN 2017 NA 80.1 NA NA NA NA WB IG regional a… impu…
#> 19 LBN 2018 NA 80.5 NA NA NA NA WB IG regional a… impu…
#> # … with 5 more variables: type_detail <chr>, other_detail <chr>,
#> # upload_detail <chr>, region <chr>, pred <dbl>
We hope these examples have been clear and highlight some of the usefulness of these simple modelling functions.