一、数据来源
数据来自四姑娘山景区首页新闻的每日客流量发布处,利用python爬虫读取2015年9月29号到2020年6月8日的每日客流量和对应的日期。
import urllib.request
from bs4 import BeautifulSoup
response = urllib.request.urlopen('https://www.sgns.cn/news/number')
soup = BeautifulSoup(response,'html.parser')
numbers0 = soup.find_all(attrs={'headers' : 'categorylist_header_title'})
times0 = soup.find_all(attrs={'headers' : 'categorylist_header_date'})
numbers1=[]
times1=[]
for i in numbers0:n=str(i.text)[21:-10]numbers1.append(n)
for i in times0:t=str(i.text)[8:-6]times1.append(t)page=166for i in range(1,page-1):response = urllib.request.urlopen('https://www.sgns.cn/news/number?start='+str(10*i))soup = BeautifulSoup(response,'html.parser')numbers0 = soup.find_all(attrs={'headers' : 'categorylist_header_title'})times0 = soup.find_all(attrs={'headers' : 'categorylist_header_date'})for i in numbers0:n=str(i.text)numbers1.append(n)for i in times0:t=str(i.text)times1.append(t)
n = len(numbers1)
with open('sgns.txt','w') as f: for i in range(n):f.write(numbers1[i]+','+times1[i]+'\n')
接下来分析用R语言
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
data0 = read.csv(file = "C:/Users/91333/Documents/semester6/VS code/VScode Python/dgns1.txt",header = FALSE,sep = ",")#接下来进行数据清洗,把数据整理成可用形态
data <- data0[1:10,]
data0<-data0[11:nrow(data0),]
for(i in 1:1640){data[10+i,1]<-data0[(i-1)*3+1,1]data[10+i,2]<-data0[(i-1)*3+3,1]}
data$V1 <- as.numeric(apply(as.data.frame(data$V1), MARGIN = 1,FUN = function(x) {gsub("[^[:digit:]]", "", x)})) # 删掉str中的非数字成分
data$V2 <- ymd(data$V2)
data <- na.omit(data)
colnames(data)<-c("游客数","日期")
二、初步探索与数据准备
(一)、缺失值补齐
检查数据时发现数据偶有缺失,1722条观测中共有73个缺失值,为了时间序列预测的完整性,我们直接对73个缺失值进行均值填补。
data1 <- merge(data,data.frame(日期 = seq.Date(from = as.Date("2015/09/29",format = "%Y/%m/%d"),to = as.Date("2020/06/08",format = "%Y/%m/%d"), by = "day")),all.y=T)
sum(is.na(data1$游客数))
## [1] 73
data1[is.na(data1$游客数),2] <- mean(data$游客数,na.omit=T)
(二)、绘图与平稳性检验
ggplot(data)+geom_line(aes(x=日期,y=游客数))
由上图,我们大致可以看出序列平稳,是否真的平稳需要进一步的检验。
下面我们正式进入建模,进行ADF检验,在此之前,我们先删去了2020年之后的数据,从上图也可以看出,疫情对四姑娘山客流量产生了极大的影响,属于非正常情况。
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
adf.test(data1$游客数)
## Warning in adf.test(data1$游客数): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: data1$游客数
## Dickey-Fuller = -6.3952, Lag order = 11, p-value = 0.01
## alternative hypothesis: stationary
p<0.01,则接受备择假设序列平稳,可以进行时间序列建模。 ### 三、拟合ARIMA模型 #### (一)、定阶与拟合
我们考虑带有周期性和均值项的ARIMA模型,由于数据为从2015年9月到2019年12月,当考虑周期性,我们可以考虑以7天一周为周期,也可以考虑30天一个月为周期,还可以考虑365天为周期,所以我们都尝试了一遍,并且使用AICc作为最优模型的判断标准。
library(forecast)
data7 <- ts(data1$游客数, frequency = 7)
data30 <- ts(data1$游客数, frequency = 30)
data365 <- ts(data1$游客数, frequency = 365)
fitarima1 <- auto.arima(data7,seasonal = T)
fitarima1#ARMA(1,3)
## Series: data7
## ARIMA(2,1,2)(2,0,0)[7]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sar1 sar2
## 0.8887 -0.1732 -0.7455 -0.2299 0.1897 0.1129
## s.e. 0.0726 0.0650 0.0728 0.0731 0.0253 0.0247
##
## sigma^2 estimated as 722940: log likelihood=-14048.91
## AIC=28111.82 AICc=28111.88 BIC=28149.97
fitarima2 <- auto.arima(data30,seasonal = T)
fitarima2#ARMA(1,3)
## Series: data30
## ARIMA(1,1,3)
##
## Coefficients:
## ar1 ma1 ma2 ma3
## 0.6213 -0.4937 -0.3762 -0.0584
## s.e. 0.0470 0.0519 0.0253 0.0364
##
## sigma^2 estimated as 760396: log likelihood=-14093.05
## AIC=28196.1 AICc=28196.13 BIC=28223.35
fitarima3 <- auto.arima(data365,seasonal = T)
fitarima3#ARMA(1,3)
## Series: data365
## ARIMA(0,1,3)(0,1,0)[365]
##
## Coefficients:
## ma1 ma2 ma3
## -0.1455 -0.4752 -0.2612
## s.e. 0.0266 0.0248 0.0317
##
## sigma^2 estimated as 1146789: log likelihood=-11386.54
## AIC=22781.09 AICc=22781.11 BIC=22801.93
(二)、白噪声检验
Box.test(fitarima1$residuals,lag=10,type='Ljung')
##
## Box-Ljung test
##
## data: fitarima1$residuals
## X-squared = 16.004, df = 10, p-value = 0.09951
Box.test(fitarima2$residuals,lag=10,type='Ljung')
##
## Box-Ljung test
##
## data: fitarima2$residuals
## X-squared = 82.697, df = 10, p-value = 1.483e-13
Box.test(fitarima3$residuals,lag=10,type='Ljung')
##
## Box-Ljung test
##
## data: fitarima3$residuals
## X-squared = 56.383, df = 10, p-value = 1.74e-08
如果p<0.05,认为不是白噪声
(三)预测绘图
ggdata <- rbind(data[100:1649,],data.frame(日期=seq.Date(from = as.Date("2020/01/01",format = "%Y/%m/%d"), by = "day", length.out = 400),游客数=rep(NA,400)))
ggdata <- ggdata[order(ggdata[,2]),]
forecast_arima1 <- forecast(fitarima1, h = 50)
ggplot() + geom_line(aes(y = ggdata[1500:1600,1], x = ggdata[1500:1600,2]), size = 0.1) + geom_line(aes(y = forecast_arima1$mean, x = ggdata[1551 : 1600, 2]), col = "red", size = 0.1) + xlab("日期") + ylab("游客数") + ggtitle("ARIMA(2,1,2)(2,0,0)[7]的向后50天预测")
## Warning: Removed 50 row(s) containing missing values (geom_path).
forecast_arima2 <- forecast(fitarima2, h = 50)
ggplot() + geom_line(aes(y = ggdata[1500:1600,1], x = ggdata[1500:1600,2]), size = 0.1) + geom_line(aes(y = forecast_arima2$mean, x = ggdata[1551 : 1600, 2]), col = "red", size = 0.1) + xlab("日期") + ylab("游客数")+ggtitle("ARIMA(1,1,3)的向后50天预测")
## Warning: Removed 50 row(s) containing missing values (geom_path).
forecast_arima3 <- forecast(fitarima3, h = 400)
ggplot() + geom_point(aes(y = ggdata$游客数, x = ggdata$日期), size = 0.1) + geom_point(aes(y = forecast_arima3$mean, x = ggdata[1551 : 1950, 2]), col = "red", size = 0.1) + xlab("日期") + ylab("游客数")+ggtitle('ARIMA(0,1,3)(0,1,0)[365]的向后400天预测')
## Warning: Removed 400 rows containing missing values (geom_point).
四、指数平滑拟合
指数平滑也是时间序列建模的一种方法,同ARIMA拟合一样,也同时考虑以7天为周期、30天为周期和以365天为周期。
(一)拟合效果
1.整体观察
fithot1<-HoltWinters(data7, beta = F)
par(mfrow=c(1,1))
plot(fithot1)
fithot2<-HoltWinters(data30, beta = F)
par(mfrow=c(1,1))
plot(fithot2)
fithot3<-HoltWinters(data365, beta = F)
par(mfrow=c(1,1))
plot(fithot3)
##### 2.局部观察:以周期为365的指数平滑拟合为例
其实从上面的整体观察也可以看出,只有第一幅图中,以7天为周期拟合时,存在拟合数据头部波动太大的问题,其他两幅拟合图情况都表现良好,所以在这里用365天周期为例子。
fithot3_plot <- data.frame(游客数=c(data1$游客数[1031:1100],fithot3$fitted[1031:1100]),difference=rep(c("actual","fitted"),c(70:70)),日期=c(data1$日期[1031:1100],data1$日期[1031:1100]))
ggplot(fithot3_plot)+geom_line(aes(y=游客数,x=日期,group=difference,linetype=difference,color=difference))+labs(title = "四姑娘山游客数指数平滑拟合",subtitle="seasonal = 365") +xlab(label = "日期") + ylab(label = "游客数")+theme(plot.title = element_text(hjust = 0.5))
(二)、预测效果
forecast_fithot1<-forecast(fithot1,h=50)
ggplot() + geom_line(aes(y = ggdata[1500:1600,1], x = ggdata[1500:1600,2]), size = 0.1) + geom_line(aes(y = forecast_fithot1$mean, x = ggdata[1551 : 1600, 2]), col = "red", size = 0.1)+labs(title = "指数平滑模型的向后50天预测",subtitle="周期:7") +xlab(label = "日期") + ylab(label = "游客数")+theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 50 row(s) containing missing values (geom_path).
forecast_fithot2<-forecast(fithot2,h=100)
ggplot() + geom_line(aes(y = ggdata[1470:1650,1], x = ggdata[1470:1650,2]), size = 0.1) + geom_line(aes(y = forecast_fithot2$mean, x = ggdata[1551 : 1650, 2]), col = "red", size = 0.1) +labs(title = "指数平滑模型的向后100天预测",subtitle="周期:30") +xlab(label = "日期") + ylab(label = "游客数")+theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 100 row(s) containing missing values (geom_path).
forecast_fithot3 <- forecast(fithot3, h = 400)
ggplot() + geom_point(aes(y = ggdata$游客数, x = ggdata$日期), size = 0.1) + geom_point(aes(y = forecast_fithot3$mean, x = ggdata[1551 : 1950, 2]), col = "red", size = 0.1) +labs(title = "指数平滑模型的向后400天预测",subtitle="周期:365") +xlab(label = "日期") + ylab(label = "游客数")+theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 400 rows containing missing values (geom_point).
指数平滑模型的预测效果要比ARIMA模型好的多。
(三)、指数平滑法改进的尝试
事实上,四姑娘山的客流量变化的季节性,有两个方面。首先,在一周内具有“季节性变化”,周末的客流量要比工作日高很多;其次,一年内也有明显的季节性变化,浏览四姑娘山有旺季也有淡季,因此,我们还可以同时考虑这两个季节性,尝试多季节性时间序列建模。
1.TBATS模型
TBATS 模型(Exponential smoothing state space model with Box-Cox transformation, ARMA errors, Trend and Seasonal components) 是由De Livera、Hyndman和Snyder(2011)开发的另一种方法使用傅立叶项与指数平滑状态空间模型和Box-Cox变换的组合,完全自动化,在R语言中有现成的包。
data7365 <- msts(data1$游客数,seasonal.periods = c(7,365))
fittbats <- tbats(data7365)
fittbats
## TBATS(0.103, {2,2}, -, {<7,2>, <365,5>})
##
## Call: tbats(y = data7365)
##
## Parameters
## Lambda: 0.102628
## Alpha: 0.0107387
## Gamma-1 Values: 0.001370681 -0.001136551
## Gamma-2 Values: -0.001174851 -0.0004119755
## AR coefficients: 1.498767 -0.52432
## MA coefficients: -0.737299 -0.105838
##
## Seed States:
## [,1]
## [1,] 8.56324258
## [2,] -0.21587970
## [3,] 0.01142647
## [4,] -0.38826913
## [5,] 0.22332276
## [6,] 1.43166137
## [7,] 0.91840331
## [8,] 0.03670414
## [9,] -0.32329308
## [10,] -0.05760824
## [11,] -1.45944304
## [12,] 0.28193903
## [13,] 0.49921333
## [14,] 0.65055250
## [15,] 0.53529261
## [16,] 0.00000000
## [17,] 0.00000000
## [18,] 0.00000000
## [19,] 0.00000000
## attr(,"lambda")
## [1] 0.1026285
##
## Sigma: 0.9710305
## AIC: 32844.44
forecast_tbats <- forecast(fittbats, h = 400)
ggplot() + geom_point(aes(y = ggdata$游客数, x = ggdata$日期), size = 0.1) + geom_point(aes(y = forecast_tbats$mean, x = ggdata[1551 : 1950, 2]), col = "red", size = 0.1) + xlab("日期") + ylab("游客数")+ggtitle('TBATS的向后400天预测')
## Warning: Removed 400 rows containing missing values (geom_point).
从图中可以看出,相比只考虑周期365的指数平滑模型,TBATS的估计更为保守和折中,但没有显示出对只考虑周期365的指数平滑模型的改进。
五、总结
根据预测效果我们最终选择周期为365的指数平滑模型作为最终的预测模型。 一下为其估计出的参数
fithot3$coefficients
## a s1 s2 s3 s4
## 1209.8851284 -50.4150837 -84.3434729 -671.7830592 -590.3388754
## s5 s6 s7 s8 s9
## -408.2040871 -364.5343108 -437.9689257 -269.7730554 -409.6871415
## s10 s11 s12 s13 s14
## -592.8877642 -524.3309210 -515.3653688 -416.4906935 -523.3894594
## s15 s16 s17 s18 s19
## -367.7078204 -242.2049726 -473.8056172 -333.2752113 -429.6396850
## s20 s21 s22 s23 s24
## -236.1591365 -316.6312727 -379.7594429 -113.9704958 -615.7839551
## s25 s26 s27 s28 s29
## -161.4264737 -276.9469075 -25.9339286 -385.5415291 -320.7143098
## s30 s31 s32 s33 s34
## -93.1407335 -147.3379641 -141.6788048 -59.7540439 123.5069157
## s35 s36 s37 s38 s39
## 226.7596388 128.9651490 74.9807465 -33.3800978 31.2402231
## s40 s41 s42 s43 s44
## 50.5307364 249.2589075 -101.7379835 530.1637238 156.9130994
## s45 s46 s47 s48 s49
## 117.8400352 325.7256170 311.7154213 653.8303905 1087.8359727
## s50 s51 s52 s53 s54
## 995.6859103 751.0290436 153.1190280 429.9669959 458.5215332
## s55 s56 s57 s58 s59
## 474.8859940 695.3700504 997.0413083 519.6134121 -9.1122513
## s60 s61 s62 s63 s64
## 296.6552182 418.5837378 746.8574473 989.2619534 1043.1781565
## s65 s66 s67 s68 s69
## 507.5197316 530.0733479 555.6238675 460.9317597 583.7106494
## s70 s71 s72 s73 s74
## 385.3482394 538.5297681 390.2863420 214.8751608 -321.2621966
## s75 s76 s77 s78 s79
## -440.3923662 -541.0744158 -499.2890962 -119.9392787 -196.8503142
## s80 s81 s82 s83 s84
## -444.2498386 -433.6367035 -513.4281219 -405.5792610 -259.4161527
## s85 s86 s87 s88 s89
## 104.0283011 170.6465397 -161.3859810 -245.8992175 -192.4138175
## s90 s91 s92 s93 s94
## -196.5888376 -13.4392490 2489.9996114 1331.9699186 273.5968956
## s95 s96 s97 s98 s99
## 2.0886303 -88.8094847 -91.5522247 -37.2001898 670.8981604
## s100 s101 s102 s103 s104
## 659.6314026 -280.3228799 -190.2025039 -7.5667220 4.1628672
## s105 s106 s107 s108 s109
## 118.8317579 367.8586004 2362.7539796 7040.3047868 10363.2547165
## s110 s111 s112 s113 s114
## 10597.5528745 8508.4921314 4869.0815724 671.6288507 -767.5710608
## s115 s116 s117 s118 s119
## -108.8627600 564.8356554 982.7958280 1470.8019593 1736.3733053
## s120 s121 s122 s123 s124
## 2315.1064850 3120.1798063 2583.3951432 2686.9125707 2654.4253383
## s125 s126 s127 s128 s129
## 2550.2141189 2543.7709802 3346.6813248 5083.3120035 4187.5969715
## s130 s131 s132 s133 s134
## 3083.7461780 2655.0868995 3124.3195698 2794.7226404 2981.8802564
## s135 s136 s137 s138 s139
## 3668.7789673 3144.4161984 1721.9915919 1188.3175341 1059.6519398
## s140 s141 s142 s143 s144
## 1168.6670227 2252.9212306 3998.0961579 2526.4828850 801.4481899
## s145 s146 s147 s148 s149
## 214.7644404 47.5183886 153.0764486 676.4372308 1109.4849259
## s150 s151 s152 s153 s154
## 548.2605204 44.1542375 -239.2333609 -383.1907927 -347.1596869
## s155 s156 s157 s158 s159
## 31.0220249 304.4568409 -19.2564461 -483.2990303 -278.2134978
## s160 s161 s162 s163 s164
## -830.4053170 -718.0297268 -413.7738312 -208.0903835 -664.3410978
## s165 s166 s167 s168 s169
## -857.7736155 -805.2558593 -843.9487166 -838.9610718 -680.5885578
## s170 s171 s172 s173 s174
## -394.4042548 -497.3195446 -903.3239042 -943.0095755 -896.0479458
## s175 s176 s177 s178 s179
## -905.4579324 -733.3674327 -586.1165980 -677.3107671 -1016.9633353
## s180 s181 s182 s183 s184
## -962.4933534 -893.5744835 -880.8306153 -907.5817265 -646.6494861
## s185 s186 s187 s188 s189
## -705.5741916 -1015.6163422 -907.8912286 -956.3901090 -937.2663694
## s190 s191 s192 s193 s194
## -732.5065180 -930.7902854 -860.3632375 -973.2386728 -1000.2721730
## s195 s196 s197 s198 s199
## -898.2964326 -939.1227316 -846.9900499 214.4981847 664.5460716
## s200 s201 s202 s203 s204
## -246.3961442 -712.7869967 -924.0647139 -924.2542552 -951.0099368
## s205 s206 s207 s208 s209
## -795.1967314 -891.0925587 -1027.2436036 -1007.7671292 -988.8411683
## s210 s211 s212 s213 s214
## -996.6426663 -959.0102600 -864.1382403 -881.1752496 -983.9105447
## s215 s216 s217 s218 s219
## -1006.9784739 -970.5221487 -1088.2178946 -906.6899943 -993.6454722
## s220 s221 s222 s223 s224
## -964.4285380 -982.5571898 -1016.2883339 -1045.5866390 -1056.0586777
## s225 s226 s227 s228 s229
## -1048.8846934 -855.3155744 154.8430065 1344.7473191 2119.4703471
## s230 s231 s232 s233 s234
## 1614.1095889 928.7062342 242.1628872 -371.6839058 -666.2285936
## s235 s236 s237 s238 s239
## -721.4206881 -634.4595882 -798.0644789 -1032.5367307 -1280.3381997
## s240 s241 s242 s243 s244
## -1262.8268356 -1193.0706275 -1272.8411310 -1320.6840779 -1285.1855061
## s245 s246 s247 s248 s249
## -1235.6934907 -1050.8392780 -1000.7488775 -1030.6851946 -1294.0403103
## s250 s251 s252 s253 s254
## -1341.2816341 -1272.6799137 -1315.5267294 -1370.0354487 -1291.5867959
## s255 s256 s257 s258 s259
## -1227.2620868 -1280.5050484 -1293.6726006 -1308.2073016 -1311.0584262
## s260 s261 s262 s263 s264
## -1208.1308043 -995.5378649 -1082.6423367 -1247.4852318 -1243.4184035
## s265 s266 s267 s268 s269
## -52.7375454 -543.2101986 -912.4392657 -899.2148031 -899.1757710
## s270 s271 s272 s273 s274
## -1185.6429558 -1211.3595638 -1203.6636662 -1146.5637730 -1096.8201825
## s275 s276 s277 s278 s279
## -687.8352847 -619.1653264 -835.3783922 -959.6180174 -935.6950056
## s280 s281 s282 s283 s284
## -892.0725243 -843.1881163 -552.5635952 -537.9623281 -709.0866895
## s285 s286 s287 s288 s289
## -843.5568153 -876.6116634 -481.6038990 -351.3177010 -281.3309977
## s290 s291 s292 s293 s294
## -204.8171513 1048.5381738 44.9898119 380.6568470 -273.2726212
## s295 s296 s297 s298 s299
## -635.9238650 -394.0600088 -239.0114485 -523.6490313 -372.9122763
## s300 s301 s302 s303 s304
## -276.4677349 -43.8006526 -171.6756478 -294.6628938 -391.0980635
## s305 s306 s307 s308 s309
## -440.2622049 -248.5693857 -317.3721323 -324.2112467 -243.5059803
## s310 s311 s312 s313 s314
## -26.5467049 -330.7010596 -443.0281077 -499.2887064 -374.5983875
## s315 s316 s317 s318 s319
## -367.1066497 -289.0542814 705.2346729 574.9174293 927.7046672
## s320 s321 s322 s323 s324
## 971.8044924 -565.4492201 -1284.6519290 -848.5833652 -395.2998677
## s325 s326 s327 s328 s329
## -387.2330364 -56.3958980 -206.6351379 -396.2946054 -271.0539436
## s330 s331 s332 s333 s334
## -765.7887967 -631.8094450 -433.0553472 -504.4797061 -470.6858409
## s335 s336 s337 s338 s339
## -381.2781937 -354.0468112 -30.5796797 -246.6573937 -348.2941724
## s340 s341 s342 s343 s344
## -523.2659707 -407.6650977 -319.0171870 -350.1990511 0.7611104
## s345 s346 s347 s348 s349
## -430.1052047 -146.6708942 -213.3778014 -610.3461904 -535.1584913
## s350 s351 s352 s353 s354
## -494.7072286 -508.0804042 -432.2283021 -355.3212032 -325.1013548
## s355 s356 s357 s358 s359
## -218.4327787 65.9865537 0.2003378 -695.4455823 361.3555908
## s360 s361 s362 s363 s364
## 356.8847066 -356.5012459 -312.8628552 -327.1284213 -453.9605642
## s365
## -503.8851284