У меня есть данные с переменными industry_sales, year и industry.
Я хочу провести регрессии временных рядов для каждой отрасли в скользящих окнах в 5 лет, с зависимой переменной industry_sales и независимой переменной year. Например, если данные охватывают период с 2000 по 2010 годы, то скользящие окна для каждой регрессионной и отраслевой группы будут 2000–2004, 2001–2005 и т. д.
У меня была попытка, но код не работает:
library(tidyverse)
library(zoo)
# Group the data by industry
data_grouped <- data %>%
group_by(industry)
glimpse(data_grouped)
# Define a function to run a regression
regression_func <- function(x) {
lm(industry_sales ~ year, data = x)
}
# Perform rolling linear regression on each group, with a window of 5 years
results_list <- data_grouped %>%
do({
rollapply(data_grouped$year, width = 5, FUN = regression_func, by = 1)
})
Вывод данных dput() показан ниже, любая помощь будет оценена по достоинству:
structure(list(year = c(2010, 2011, 2012, 2013, 2014, 2015, 2016,
2017, 2018, 2019, 2020, 2021, 2022, 2010, 2011, 2012, 2013, 2014,
2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2010, 2011, 2012,
2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2010,
2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021,
2022, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019,
2020, 2021, 2022, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,
2018, 2019, 2020, 2021, 2022, 2010, 2011, 2012, 2013, 2014, 2015,
2016, 2017, 2018, 2019, 2020, 2021, 2022, 2010, 2011, 2012, 2013,
2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022), industry_sales = c(853.218,
1001.856, 987.741, 990.679, 1061.804, 985.904, 1073.43, 1305.779,
1314.722, 1188.122, 1327.245, 1711.143, 577.358, 25731.969, 26484.263,
23085.259, 24842.509, 30563.672, 34947.361, 40066.425, 38293.972,
40484.096, 37160.342, 29109.755, 35227.521, 16522.915, 1072310.255,
1314543.982, 1288932.701, 1245467.184, 1169743.543, 882325.562,
804108.99, 966832.252, 1099655.803, 999685.99, 725094.688, 797072.967,
59906.738, 131274.107, 167635.266, 170003.001, 161892.01, 149722.289,
108169.702, 92950.89, 120837.727, 132130.143, 121670.84, 118206.596,
147732.623, 542.17, 201842.321, 209721.793, 215722.823, 223158.977,
221191.213, 223076.061, 229199.539, 242995.929, 256145.995, 257282.085,
234455.443, 214560.191, 13964.787, 88190.421, 90191.422, 80848.451,
93999.434, 92325.688, 99138.466, 98682.173, 95106.591, 97622.097,
94254.621, 81145.338, 87421.329, 12116.901, 247825.263, 262355,
235423.37, 236697.987, 245287.766, 222221.931, 219657.82, 222990.939,
224672.689, 162633.722, 139853.429, 148534.588, 4463.853, 28177.129,
29298.618, 29251.077, 31696.968, 31636.033, 33064.57, 35745.408,
32034.079, 31978.646, 31661.88, 28572.857, 34190.976, 3400.497
), industry = c("Agriculture, Forestry and Fishing", "Agriculture, Forestry and Fishing",
"Agriculture, Forestry and Fishing", "Agriculture, Forestry and Fishing",
"Agriculture, Forestry and Fishing", "Agriculture, Forestry and Fishing",
"Agriculture, Forestry and Fishing", "Agriculture, Forestry and Fishing",
"Agriculture, Forestry and Fishing", "Agriculture, Forestry and Fishing",
"Agriculture, Forestry and Fishing", "Agriculture, Forestry and Fishing",
"Agriculture, Forestry and Fishing", "Construction", "Construction",
"Construction", "Construction", "Construction", "Construction",
"Construction", "Construction", "Construction", "Construction",
"Construction", "Construction", "Construction", "Manufacturing",
"Manufacturing", "Manufacturing", "Manufacturing", "Manufacturing",
"Manufacturing", "Manufacturing", "Manufacturing", "Manufacturing",
"Manufacturing", "Manufacturing", "Manufacturing", "Manufacturing",
"Mining", "Mining", "Mining", "Mining", "Mining", "Mining", "Mining",
"Mining", "Mining", "Mining", "Mining", "Mining", "Mining", "Retail Trade",
"Retail Trade", "Retail Trade", "Retail Trade", "Retail Trade",
"Retail Trade", "Retail Trade", "Retail Trade", "Retail Trade",
"Retail Trade", "Retail Trade", "Retail Trade", "Retail Trade",
"Services", "Services", "Services", "Services", "Services", "Services",
"Services", "Services", "Services", "Services", "Services", "Services",
"Services", "Transportation, Communications, Electric, Gas and Sanitary service",
"Transportation, Communications, Electric, Gas and Sanitary service",
"Transportation, Communications, Electric, Gas and Sanitary service",
"Transportation, Communications, Electric, Gas and Sanitary service",
"Transportation, Communications, Electric, Gas and Sanitary service",
"Transportation, Communications, Electric, Gas and Sanitary service",
"Transportation, Communications, Electric, Gas and Sanitary service",
"Transportation, Communications, Electric, Gas and Sanitary service",
"Transportation, Communications, Electric, Gas and Sanitary service",
"Transportation, Communications, Electric, Gas and Sanitary service",
"Transportation, Communications, Electric, Gas and Sanitary service",
"Transportation, Communications, Electric, Gas and Sanitary service",
"Transportation, Communications, Electric, Gas and Sanitary service",
"Wholesale Trade", "Wholesale Trade", "Wholesale Trade", "Wholesale Trade",
"Wholesale Trade", "Wholesale Trade", "Wholesale Trade", "Wholesale Trade",
"Wholesale Trade", "Wholesale Trade", "Wholesale Trade", "Wholesale Trade",
"Wholesale Trade")), row.names = c(NA, -104L), class = c("tbl_df",
"tbl", "data.frame"))
Вот решение с использованием slider::slide2() вместо zoo::rollapply(). Это создает именованный вложенный список с результатами за 5-летние периоды, вложенными в отрасли.
library(tidyverse)
library(slider)
rolling_lm <- function(x, y) {
res <- slide2(x, y, \(x, y) lm(y ~ x), .after = 5, .complete = TRUE)
names(res) <- slide_chr(x, \(x) paste(min(x), "-", max(x)), .after = 5, .complete = TRUE)
res[!is.null(res)]
}
results_list <- data %>%
split(.$industry) %>%
map(\(grp) rolling_lm(grp$year, grp$industry_sales))
# preview first 3 results
results_list[1:3]
$`Agriculture, Forestry and Fishing`
$`Agriculture, Forestry and Fishing`$`2010 - 2015`
Call:
lm(formula = y ~ x)
Coefficients:
(Intercept) x
-47676.99 24.18
$`Agriculture, Forestry and Fishing`$`2011 - 2016`
Call:
lm(formula = y ~ x)
Coefficients:
(Intercept) x
-23345.5 12.1
$`Agriculture, Forestry and Fishing`$`2012 - 2017`
Call:
lm(formula = y ~ x)
Coefficients:
(Intercept) x
-100379.38 50.36
Обратите внимание, что предполагается, что в год есть одна строка без пропущенных или повторяющихся лет. Если это не так, slide2_index() или slide2_period() могут быть более полезными; см. вступительную виньетку.
Есть несколько проблем:
1) Предположим, что для каждой регрессии нам нужна одна строка с отраслью, последним годом окна, точкой пересечения, наклоном и R^2. Измените c(...), чтобы изменить желаемую статистику. Например, c(year = tail(yr, 1), unlist(broom::glance(fm))), если вам нужно большое количество статистики.
Это группирует по industry, а затем использует group_modify для создания строк для каждой отрасли.
library(dplyr, exclude = c("filter", "lag"))
library(zoo)
out <- data %>%
group_by(industry) %>%
group_modify(~ {
.x$year %>%
rollapplyr(5, function(yr) {
fm <- lm(industry_sales ~ year, .x, subset = year %in% yr)
c(year = tail(yr, 1), intercept = coef(fm)[[1]], slope = coef(fm)[[2]],
r.squared = summary(fm)$r.squared)
}) %>%
as.data.frame
}) %>%
ungroup
Давая:
> out
# A tibble: 72 × 5
industry year intercept slope r.squared
<chr> <dbl> <dbl> <dbl> <dbl>
1 Agriculture, Forestry and Fishing 2014 -80707. 40.6 0.704
2 Agriculture, Forestry and Fishing 2015 -7481. 4.22 0.0433
3 Agriculture, Forestry and Fishing 2016 -32534. 16.7 0.362
4 Agriculture, Forestry and Fishing 2017 -128244. 64.2 0.605
5 Agriculture, Forestry and Fishing 2018 -165315. 82.6 0.741
6 Agriculture, Forestry and Fishing 2019 -129070. 64.6 0.503
7 Agriculture, Forestry and Fishing 2020 -77455. 39.0 0.317
8 Agriculture, Forestry and Fishing 2021 -164845. 82.3 0.428
9 Agriculture, Forestry and Fishing 2022 193469. -95.2 0.134
10 Construction 2014 -1587815. 802. 0.208
# … with 62 more rows
# ℹ Use `print(n = ...)` to see more rows
2) Если вам просто нужна единая статистика - скажем, наклон - тогда может быть удобнее иметь двумерный ряд зоопарков с одним столбцом для каждой отрасли и одной строкой за год. Обратите внимание, что наклон является инвариантным к сдвигу, поэтому мы можем просто использовать 1: 5 в качестве предиктора в каждом случае, и мы получим тот же наклон, как если бы мы использовали год.
library(zoo)
z <- read.zoo(data, split = "industry")
out2 <- rollapplyr(z, 5, function(x) coef(lm(x ~ seq(5)))[[2]])
Давая:
> out2[1:3, 1:3]
Agriculture, Forestry and Fishing Construction Manufacturing
2014 40.5995 802.1652 12578.98
2015 4.2159 2440.4609 -98362.60
2016 16.6603 4406.7184 -133278.90
Если вы хотите, чтобы это было в качестве фрейма данных, используйте fortify.zoo(out2) или в длинной форме fortify.zoo(out2, melt = TRUE).