# !!!重点!!!: data.frame、绘图、矩阵
# 第一章
# R语言是区分大小写的解释型语言
# rm()函数用于删除数据区中的数据
# 第二章
# R语言下标从1开始
# Vector (renamed from `vector` — that name shadows base::vector())
v <- c(20173561, 1709, 20173562, 1707)
# Matrix; byrow = TRUE fills the matrix row by row
# (renamed from `matrix` — that name shadows base::matrix())
m <- matrix(v, nrow = 2, ncol = 2, byrow = TRUE)
# Array: dimension names must be supplied as a list
# (renamed `data`/`array` — those names shadow base functions)
vals <- 1:24
dim1 <- c("A1", "A2")
dim2 <- c("B1", "B2", "B3")
dim3 <- c("C1", "C2", "C3", "C4")
arr <- array(vals, c(2, 3, 4), list(dim1, dim2, dim3))
# Data frame
patientID <- c(1, 2, 3, 4)  # numeric vector
age <- c(25, 34, 28, 52)    # numeric vector
# NOTE: historically data.frame() converted character vectors to factors,
# with one factor level per distinct value (changed in R 4.0)
diabetes <- c("Type1", "Type2", "Type1", "Type1")
status <- c("Poor", "Improved", "Excellent", "Poor")
frame <- data.frame(patientID, age, diabetes, status)
# Specify stringsAsFactors = FALSE so character columns stay character
# (the original lines had un-commented trailing Chinese text, which is a
# syntax error in R; the explanations are now comments)
frame <- data.frame(patientID, age, diabetes, status, stringsAsFactors = FALSE)
# Selecting elements of a data frame
frame[1, ]          # row 1
frame[1:2, ]        # rows 1 and 2
frame[1]            # column 1 (still a data frame)
frame[1:2]          # columns 1 and 2
frame[1, 3]         # row 1, column 3
frame$patientID     # the patientID column
frame$patientID[2]  # 2nd element of the patientID column
# A list may hold vectors, matrices, data frames, and even other lists
g <- "My First List"
h <- c(25, 26, 18, 39)
j <- matrix(1:10, nrow = 5)
k <- c("one", "two", "three")
# Renamed from `list` — binding that name shadows base::list()
mylist <- list(first = g, second = h, third = j, k)
# Assigning to a non-existent element of a vector/matrix/array/list makes R
# grow the structure automatically; the gap positions are filled with NA.
x <- c(1,2)
x
## result: [1] 1 2
x[5] <- 5
x
## result: [1] 1 2 NA NA 5
# Read a CSV file (replace the placeholder with a real file path or URL)
read.table("your file url*",header = TRUE,sep = ",")
# By default character variables are converted to factors
# (true pre-R 4.0; since R 4.0 stringsAsFactors defaults to FALSE)
#查看数据的结构
str(frame)## 'data.frame': 4 obs. of 4 variables:
$ patientID: num 1 2 3 4
$ age : num 25 34 28 52
$ diabetes : Factor w/ 2 levels "Type1","Type2": 1 2 1 1
$ status : Factor w/ 3 levels "Excellent","Improved",..: 3 2 1 3#查看数据维度
dim(frame)## [1] 4 4#查看对象类型
class(frame)## [1] "data.frame"#查看对象属性(字段)(变量)名称
names(frame)## [1] "patientID" "age" "diabetes" "status"#查看某对象前几列,以2为例
head(frame,2)## patientID age diabetes status
1 1 25 Type1 Poor
2 2 34 Type2 Improved第三章#创建新变量
mydata$sumx <- mydata$x1 + mydata$x2
mydata$meanx <- (mydata$x1 + mydata$x2)/2
#变量重编码
leadership$agecat[leadership$age >= 55 & leadership$age <= 75] <- "Middle Aged"
#变量重命名
# 法一
install.packages("reshape") # 先安装包
library(reshape)
your_dataframe <- rename(your_dataframe, c(oldName1="NewName1", oldName2="NewName2"))
# 法二
names(your_dataframe)[2] <- "newName"
#NA是缺失值,NaN是非数值
#检测缺失值is.na()
#删除带有缺失值数据的行na.omit()
#计算日期差,以日、周等为单位
difftime(time1, time2, units = c(“auto”, “secs”, “mins”, “hours”, “days”, “weeks”))
#数据类型转换
as.numeric()
as.character()
as.vector()
as.matrix()
as.data.frame()
#数据排序
your_dataframe[order(ydn$attribute1),]  # 升序,别忘了逗号
your_dataframe[order(-ydn$attribute1),] # 降序,别忘了逗号
#多变量排序
your_dataframe[order(ydn$attribute1, -ydn$attribute2),]
#数据集合并
#按照某些变量合并 merge(dataframeA, dataframeB, by=c(“attribute name1”,“attribute name2”))
#直接横向合并(添加列)必须有相同的行数并且要以相同顺序进行排序 cbind(A, B)
#直接纵向合并(添加行)必须拥有相同的变量,但顺序不必相同 rbind(A, B)
#数据集取子集
#选入变量your dataframe name[c(“attribute name1”,“attribute name2”,…)]
#剔除变量使用布尔型变量的方式
#1
myvars <- names(leadership) %in% c(“q3”, “q4”)
newdata <- leadership[!myvars]
#2
newdata <- leadership[c(-7,-8)] 如果知道q3和q4是第7个和第8个变量
#3
leadership$q3 <- leadership$q4 <- NULL
#可以按照一定规则进行列的选取
leadership[c(TRUE, FALSE, FALSE),] 如果行数大于3:会得到1、4、7、10、13、16……行
leadership[c(TRUE, FALSE),] 如果行数大于2:会得到1、3、5、7……行
#选取行的时候要注意加逗号
newdata <- leadership[1:3,]
newdata <- leadership[leadership$gender=="M" & leadership$age > 30,]
#subset函数
#newdata <- subset(leadership, age >= 35 | age < 24, select=c(q1, q2, q3, q4))第一个参数操作的dataframe,第二个参数条件,第三个参数所要选择的变量(列)
#随机抽样,别忘了逗号
mysample <- leadership[sample(1:nrow(leadership), 3, replace=FALSE),]
第一个参数是抽样范围
第二个参数是抽样个数
第三个参数是是否放回
#数据转置t()
#数据分类汇总
aggregate(x,by,Fun)
第一个参数是操作的dataframe
第二个参数是按照那些变量(列)进行分类
第三个参数是对分类后的数据进行怎样的操作,这个函数会对该dataframe所有变量进行该函数操作,所以当该dataframe中有非数值型变量时,该函数无法使用
aggdata <-aggregate(mtcars, by=list(cyl,gear), FUN=mean, na.rm=TRUE) by中变量必须在一个列表中,即使只有一个变量
#reshape包
library(reshape)
#融化melt
md <- melt(mydata, id=c(“ID”, “Time”))
该函数会将mydata数据集中除了"ID"“Time"的其余变量(不管多少列)融为两列"Variable”“Value”
#整型cast
newdata <- cast(md, formula, FUN)
第一个参数是操作的数据集
第二个参数是整型的方式,形式为A~B,其中A与B都可以是多个变量,多个变量之间用+连接,A代表整型是按照A进行分类,B代表按照A分组后在按照B中变量取值再进行分组形成新的变量,再对其他的变量按照A分组与B分组进行FUN运算,B的每个不同的取值组合都会成为新数据集的一个新变量
###可以将A理解为行分组,B理解为列分组###
第三个参数是整型使用的函数
#条件控制
#else不能出现在行首
# 第四章
#条形图
library(vcd) ## Loading required package: grid
counts <- table(Arthritis$Improved) #准备数据
#简单条形图
barplot(counts)
#点图
dotchart(mtcars$mpg, labels=row.names(mtcars), cex=.7,
         main="Gas Mileage for Car Models", xlab="Miles Per Gallon")
#散点图
plot(mtcars$wt, mtcars$mpg)
abline(lm(mtcars$mpg ~ mtcars$wt)) #为散点图添加最佳拟合线性直线
#折线图
t1 <- subset(Orange, Tree==1)
plot(t1$age, t1$circumference, type="b")
# 第五章
#平均数mean()
#中位数median()
#方差var()
#标准差sd()
#值域range()
#求和sum()
#最小值min()
#最大值max()
#计算常用统计量的函数summary(),计算最小值、最大值、上下四分位数、均值(或因子向量与逻辑向量的频数)
mt <- mtcars[c(“mpg”, “hp”, “wt”, “am”)]
summary(mt)## mpg hp wt am
Min. :10.40 Min. : 52.0 Min. :1.513 Min. :0.0000
1st Qu.:15.43 1st Qu.: 96.5 1st Qu.:2.581 1st Qu.:0.0000
Median :19.20 Median :123.0 Median :3.325 Median :0.0000
Mean :20.09 Mean :146.7 Mean :3.217 Mean :0.4062
3rd Qu.:22.80 3rd Qu.:180.0 3rd Qu.:3.610 3rd Qu.:1.0000
Max. :33.90 Max. :335.0 Max. :5.424 Max. :1.0000#统计量的计算函数
sapply(x, FUN, options)
#第一个参数是数据框(或矩阵)
#第二个参数是任意函数(可以是用户自定义函数)
#第三个参数是FUN函数的参数(如果有的话)
#sapply对每列应用函数FUN
#分组计算统计量
aggregate()只允许使用单返回值函数
by()允许使用多返回值函数
myvars <- c(“mpg”, “hp”, “wt”)
aggregate(mtcars[myvars], by=list(am=mtcars$am), mean) #注意list(am=mtcars$am)的使用。如果使用的是list(mtcars$am),则am列将被标注为Group.1而不是am## am mpg hp wt
1 0 17.14737 160.2632 3.768895
# 2 1 24.39231 126.8462 2.411000   (transcript remnant from the aggregate output above)
# Compute summary statistics of a numeric vector: count, mean, standard
# deviation, sample skewness, and excess kurtosis.
#   x       numeric vector
#   na.omit if TRUE, drop NA values before computing
# Returns a named numeric vector: n, mean, stdev, skew, kurtosis.
# BUGFIX: the pasted original read `(x-m)3/s3` and `(x-m)4/s4` — the `^`
# exponent operators were lost during copy/paste; restored below.
mystats <- function(x, na.omit=FALSE){
  if (na.omit)
    x <- x[!is.na(x)] # drop missing values
  m <- mean(x)
  n <- length(x)
  s <- sd(x)
  skew <- sum((x-m)^3/s^3)/n     # sample skewness
  kurt <- sum((x-m)^4/s^4)/n - 3 # excess kurtosis (normal distribution -> 0)
  return(c(n=n, mean=m, stdev=s, skew=skew, kurtosis=kurt))
}
dstats <- function(x)sapply(x, mystats)
by(mtcars[myvars], mtcars$am, dstats)## mtcars$am: 0
mpg hp wt
n 19.00000000 19.00000000 19.0000000
mean 17.14736842 160.26315789 3.7688947
stdev 3.83396639 53.90819573 0.7774001
skew 0.01395038 -0.01422519 0.9759294
kurtosis -0.80317826 -1.20969733 0.1415676
--------------------------------------------------------
mtcars$am: 1
mpg hp wt
n 13.00000000 13.0000000 13.0000000
mean 24.39230769 126.8461538 2.4110000
stdev 6.16650381 84.0623243 0.6169816
skew 0.05256118 1.3598859 0.2103128
kurtosis -1.45535200 0.5634635 -1.1737358#频数表
library(vcd)
#一维频数表
with(Arthritis, table(Improved))## Improved
None Some Marked
42 14 28#一维频率表
prop.table(with(Arthritis,table(Improved)))## Improved
None Some Marked
0.5000000 0.1666667 0.3333333#二维列联表
xtabs(~ A + B, data=mydata),A为行分组,B为列分组
xtabs(~ Treatment+Improved, data=Arthritis)## Improved
Treatment None Some Marked
Placebo 29 7 7
Treated 13 7 21#为二维列联表生成边际和
addmargins(xtabs(~ Treatment+Improved, data=Arthritis))## Improved
Treatment None Some Marked Sum
Placebo 29 7 7 43
Treated 13 7 21 41
Sum 42 14 28 84#行边际和
addmargins(xtabs(~ Treatment+Improved, data=Arthritis),1)## Improved
Treatment None Some Marked
Placebo 29 7 7
Treated 13 7 21
Sum 42 14 28#列边际和
addmargins(xtabs(~ Treatment+Improved, data=Arthritis),2)## Improved
Treatment None Some Marked Sum
Placebo 29 7 7 43
Treated 13 7 21 41#创建二位列联表的简单函数
library(gmodels)
CrossTable(Arthritis$Treatment, Arthritis$Improved)##
Cell Contents
|-------------------------|
| N |
| Chi-square contribution |
| N / Row Total |
| N / Col Total |
| N / Table Total |
|-------------------------|
Total Observations in Table: 84
| Arthritis$Improved
Arthritis$Treatment | None | Some | Marked | Row Total |
--------------------|-----------|-----------|-----------|-----------|
Placebo | 29 | 7 | 7 | 43 |
| 2.616 | 0.004 | 3.752 | |
| 0.674 | 0.163 | 0.163 | 0.512 |
| 0.690 | 0.500 | 0.250 | |
| 0.345 | 0.083 | 0.083 | |
--------------------|-----------|-----------|-----------|-----------|
Treated | 13 | 7 | 21 | 41 |
| 2.744 | 0.004 | 3.935 | |
| 0.317 | 0.171 | 0.512 | 0.488 |
| 0.310 | 0.500 | 0.750 | |
| 0.155 | 0.083 | 0.250 | |
--------------------|-----------|-----------|-----------|-----------|
Column Total | 42 | 14 | 28 | 84 |
| 0.500 | 0.167 | 0.333 | |
--------------------|-----------|-----------|-----------|-----------|
#超二维列联表
xtabs(~ Treatment+Sex+Improved, data=Arthritis)## , , Improved = None
Sex
Treatment Female Male
Placebo 19 10
Treated 6 7
, , Improved = Some
Sex
Treatment Female Male
Placebo 7 0
Treated 5 2
, , Improved = Marked
Sex
Treatment Female Male
Placebo 6 1
Treated 16 5ftable(xtabs(~ Treatment+Sex+Improved, data=Arthritis))#ftable()函数可以以一种紧凑而吸引人的方式输出多维列联表## Improved None Some Marked
Treatment Sex
Placebo Female 19 7 6
Male 10 0 1
Treated Female 6 5 16
Male 7 2 5#参数估计
指的是用样本中的数据估计总体分布的某个或某几个参数,比如给定一定样本容量的样本,要求估计总体的均值、方差等
#点估计
从总体中抽取一个样本,根据该样本的统计量对总体的未知参数做出一个数值点的估计
#区间估计
在点估计的基础上,给出总体参数落在某一区间的概率
两个指标:置信区间是指由样本统计量所构造的总体参数的估计区间,置信水平是指总体未知参数落在区间内的概率,表示为1–α,α为显著性水平,即总体参数未在区间内的概率
#假设检验
通过样本分布,检验某个参数的属于某个区间范围的概率
α错误:在进行假设检验时提出原假设和备择假设,原假设实际上是正确的,但我们做出的决定是拒绝原假设,此类错误称为第一类错误
β错误:原假设实际上是不正确的,但是我们却做出了接受原假设的决定,此类错误称为第二类错误
若总体分布规律已知,则使用参数假设检验;若总体分布规律未知,则使用非参数假设检验
#独立性检验
卡方检测:原假设H0是假设各属性之间相互独立。Χ2越小说明变量之间越独立,Χ2越大说明变量之间越相关,计算得到的p值若远小于0.05,则否定H0
chisq.test(xtabs(~Treatment+Improved, data=Arthritis))##
Pearson’s Chi-squared test
data: xtabs(~Treatment + Improved, data = Arthritis)
X-squared = 13.055, df = 2, p-value = 0.001463# Fisher精确检验:原假H0设是:边界固定的列联表中行和列是相互独立的,不能用于2X2列联表,计算得到的p值若远小于0.05,则否定H0
fisher.test(xtabs(~Treatment+Improved, data=Arthritis))##
Fisher’s Exact Test for Count Data
data: xtabs(~Treatment + Improved, data = Arthritis)
p-value = 0.001393
alternative hypothesis: two.sided# Cochran-Mantel-Haenszel检验:原假设是:两个名义变量在第三个变量的每一层中都是条件独立的,计算得到的p值若远小于0.05,则否定H0
H0就是分性别来看,治疗效果与治疗方式独立
mantelhaen.test(xtabs(~Treatment+Improved+Sex, data=Arthritis))##
Cochran-Mantel-Haenszel test
data: xtabs(~Treatment + Improved + Sex, data = Arthritis)
Cochran-Mantel-Haenszel M^2 = 14.632, df = 2, p-value = 0.0006647#相关性度量
assocstats()函数可以用来计算二维列联表的phi系数、列联系数、Cramer’s V系数,较大的值意味着较强的相关性
assocstats(xtabs(~Treatment+Improved, data=Arthritis))## X^2 df P(> X^2)
Likelihood Ratio 13.530 2 0.0011536
Pearson 13.055 2 0.0014626
Phi-Coefficient : NA
Contingency Coeff.: 0.367
Cramer’s V : 0.394# 协方差,协方差的数值越大,两个变量同向程度也就越大
cov(state.x77[,1:6])## Population Income Illiteracy Life Exp Murder
Population 19931683.7588 571229.7796 292.8679592 -407.8424612 5663.523714
Income 571229.7796 377573.3061 -163.7020408 280.6631837 -521.894286
Illiteracy 292.8680 -163.7020 0.3715306 -0.4815122 1.581776
Life Exp -407.8425 280.6632 -0.4815122 1.8020204 -3.869480
Murder 5663.5237 -521.8943 1.5817755 -3.8694804 13.627465
HS Grad -3551.5096 3076.7690 -3.2354694 6.3126849 -14.549616
HS Grad
Population -3551.509551
Income 3076.768980
Illiteracy -3.235469
Life Exp 6.312685
Murder -14.549616
HS Grad 65.237894# person积差:Pearson积差相关系数衡量了两个定量变量之间的线性相关程度
cor(mtcars)## mpg cyl disp hp drat wt
mpg 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.68117191 -0.8676594
cyl -0.8521620 1.0000000 0.9020329 0.8324475 -0.69993811 0.7824958
disp -0.8475514 0.9020329 1.0000000 0.7909486 -0.71021393 0.8879799
hp -0.7761684 0.8324475 0.7909486 1.0000000 -0.44875912 0.6587479
drat 0.6811719 -0.6999381 -0.7102139 -0.4487591 1.00000000 -0.7124406
wt -0.8676594 0.7824958 0.8879799 0.6587479 -0.71244065 1.0000000
qsec 0.4186840 -0.5912421 -0.4336979 -0.7082234 0.09120476 -0.1747159
vs 0.6640389 -0.8108118 -0.7104159 -0.7230967 0.44027846 -0.5549157
am 0.5998324 -0.5226070 -0.5912270 -0.2432043 0.71271113 -0.6924953
gear 0.4802848 -0.4926866 -0.5555692 -0.1257043 0.69961013 -0.5832870
carb -0.5509251 0.5269883 0.3949769 0.7498125 -0.09078980 0.4276059
qsec vs am gear carb
mpg 0.41868403 0.6640389 0.59983243 0.4802848 -0.55092507
cyl -0.59124207 -0.8108118 -0.52260705 -0.4926866 0.52698829
disp -0.43369788 -0.7104159 -0.59122704 -0.5555692 0.39497686
hp -0.70822339 -0.7230967 -0.24320426 -0.1257043 0.74981247
drat 0.09120476 0.4402785 0.71271113 0.6996101 -0.09078980
wt -0.17471588 -0.5549157 -0.69249526 -0.5832870 0.42760594
qsec 1.00000000 0.7445354 -0.22986086 -0.2126822 -0.65624923
vs 0.74453544 1.0000000 0.16834512 0.2060233 -0.56960714
am -0.22986086 0.1683451 1.00000000 0.7940588 0.05753435
gear -0.21268223 0.2060233 0.79405876 1.0000000 0.27407284
carb -0.65624923 -0.5696071 0.05753435 0.2740728 1.00000000# Spearman等级相关系数:则衡量分级定序变量之间的相关程度
cor(mtcars, method=“spearman”)## mpg cyl disp hp drat wt
mpg 1.0000000 -0.9108013 -0.9088824 -0.8946646 0.65145546 -0.8864220
cyl -0.9108013 1.0000000 0.9276516 0.9017909 -0.67888119 0.8577282
disp -0.9088824 0.9276516 1.0000000 0.8510426 -0.68359210 0.8977064
hp -0.8946646 0.9017909 0.8510426 1.0000000 -0.52012499 0.7746767
drat 0.6514555 -0.6788812 -0.6835921 -0.5201250 1.00000000 -0.7503904
wt -0.8864220 0.8577282 0.8977064 0.7746767 -0.75039041 1.0000000
qsec 0.4669358 -0.5723509 -0.4597818 -0.6666060 0.09186863 -0.2254012
vs 0.7065968 -0.8137890 -0.7236643 -0.7515934 0.44745745 -0.5870162
am 0.5620057 -0.5220712 -0.6240677 -0.3623276 0.68657079 -0.7377126
gear 0.5427816 -0.5643105 -0.5944703 -0.3314016 0.74481617 -0.6761284
carb -0.6574976 0.5800680 0.5397781 0.7333794 -0.12522294 0.4998120
qsec vs am gear carb
mpg 0.46693575 0.7065968 0.56200569 0.5427816 -0.65749764
cyl -0.57235095 -0.8137890 -0.52207118 -0.5643105 0.58006798
disp -0.45978176 -0.7236643 -0.62406767 -0.5944703 0.53977806
hp -0.66660602 -0.7515934 -0.36232756 -0.3314016 0.73337937
drat 0.09186863 0.4474575 0.68657079 0.7448162 -0.12522294
wt -0.22540120 -0.5870162 -0.73771259 -0.6761284 0.49981205
qsec 1.00000000 0.7915715 -0.20333211 -0.1481997 -0.65871814
vs 0.79157148 1.0000000 0.16834512 0.2826617 -0.63369482
am -0.20333211 0.1683451 1.00000000 0.8076880 -0.06436525
gear -0.14819967 0.2826617 0.80768800 1.0000000 0.11488698
carb -0.65871814 -0.6336948 -0.06436525 0.1148870 1.00000000#相关性检验
states<- state.x77[,1:6]
#常用的原假设H0为:变量间不相关(即总体的相关系数为0)也可以是相关系数大于0或小于0。若p值远小于0.05,拒绝原假设。
cor.test(x, y, alternative = , method = ) 一次只能检验一种关系
#第一、二个参数的x和y为要检验相关性的变量
#第三个参数alternative用来指定进行双侧检验或单侧检验(取值为"two.side"、“less"或"greater”)当研究的假设为总体的相关系数小于0时,请使用alternative=“less”。在研究的假设为总体的相关系数大于0时,应使用alternative=“greater”。在默认情况下,假设为alternative=“two.side”(总体相关系数不等于0)。
#第四个参数method用以指定要计算的相关类型(“pearson”、“kendall"或"spearman”)
cor.test(states[,3], states[,5])##
Pearson’s product-moment correlation
data: states[, 3] and states[, 5]
t = 6.8479, df = 48, p-value = 1.258e-08
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5279280 0.8207295
sample estimates:
cor
0.7029752#corr.test(data, use= , method = ) 一次可以检验多个关系
第一个参数代表操作的数据集
第二个参数use=的取值可为"pairwise"或"complete"(分别表示对缺失值执行成对删除或行删除)
第三个参数method=的取值可为"pearson"(默认值)、“spearman"或"kendall”
library(psych)##
Attaching package: ‘psych’## The following object is masked from ‘package:plotrix’:
rescalecorr.test(states, use=“complete”)## Call:corr.test(x = states, use = “complete”)
Correlation matrix
Population Income Illiteracy Life Exp Murder HS Grad
Population 1.00 0.21 0.11 -0.07 0.34 -0.10
Income 0.21 1.00 -0.44 0.34 -0.23 0.62
Illiteracy 0.11 -0.44 1.00 -0.59 0.70 -0.66
Life Exp -0.07 0.34 -0.59 1.00 -0.78 0.58
Murder 0.34 -0.23 0.70 -0.78 1.00 -0.49
HS Grad -0.10 0.62 -0.66 0.58 -0.49 1.00
Sample Size
[1] 50
Probability values (Entries above the diagonal are adjusted for multiple tests.)
Population Income Illiteracy Life Exp Murder HS Grad
Population 0.00 0.59 1.00 1.0 0.10 1
Income 0.15 0.00 0.01 0.1 0.54 0
Illiteracy 0.46 0.00 0.00 0.0 0.00 0
Life Exp 0.64 0.02 0.00 0.0 0.00 0
Murder 0.01 0.11 0.00 0.0 0.00 0
HS Grad 0.50 0.00 0.00 0.0 0.00 0
To see confidence intervals of the correlations, print with the short=FALSE option#t检验
#用于对两组进行比较
#原假设H0:针对两组独立样本(并且是从正态总体中抽得),两个总体的均值相等。若p值远小于0.05,拒绝原假设。
#第一种调用格式
t.test(y ~ x, data) 其中的y是一个数值型变量,x是一个二分变量
library(MASS)
t.test(Prob ~ So, data=UScrime)##
Welch Two Sample t-test
data: Prob by So
t = -3.8954, df = 24.925, p-value = 0.0006506
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.03852569 -0.01187439
sample estimates:
mean in group 0 mean in group 1
0.03851265 0.06371269#第二种调用格式
t.test(y1, y2) 其中的y1和y2为数值型向量
with(UScrime, t.test(U1, U2, paired=TRUE))##
Paired t-test
data: U1 and U2
t = 32.407, df = 46, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
57.67003 65.30870
sample estimates:
mean of the differences
61.48936第六章library(data.table)
#data.table
#DT[i, j, by]
#读取数据
flights <- fread("F:/0课件/R语言/flights14.csv") # R字符串中用"/"或"\\"作路径分隔符
#行筛选
flights[origin == “JFK” & month == 6L]## year month day dep_time dep_delay arr_time arr_delay cancelled
1: 2014 6 1 851 -9 1205 -5 0
2: 2014 6 1 1220 -10 1522 -13 0
3: 2014 6 1 718 18 1014 -1 0
4: 2014 6 1 1024 -6 1314 -16 0
5: 2014 6 1 1841 -4 2125 -45 0
—
8418: 2014 6 30 1457 -3 1639 -6 0
8419: 2014 6 30 1454 -5 1622 -32 0
8420: 2014 6 30 1717 -3 1834 -16 0
8421: 2014 6 30 758 -2 927 7 0
8422: 2014 6 30 803 -7 932 -18 0
carrier tailnum flight origin dest air_time distance hour min
1: AA N787AA 1 JFK LAX 324 2475 8 51
2: AA N795AA 3 JFK LAX 329 2475 12 20
3: AA N784AA 9 JFK LAX 326 2475 7 18
4: AA N791AA 19 JFK LAX 320 2475 10 24
5: AA N790AA 21 JFK LAX 326 2475 18 41
—
8418: MQ N931MQ 3231 JFK PIT 62 340 14 57
8419: MQ N802MQ 3295 JFK RDU 65 427 14 54
8420: MQ N530MQ 3365 JFK DCA 39 213 17 17
8421: MQ N502MQ 3370 JFK DCA 52 213 7 58
8422: MQ N811MQ 3465 JFK RDU 67 427 8 3#列筛选(别忘了逗号)可以同时进行列的重命名
flights[,.(newname=tailnum,flight)]#也可以写成 flights[,list(newname=tailnum,flight)]## newname flight
1: N338AA 1
2: N335AA 3
3: N327AA 21
4: N3EHAA 29
5: N319AA 117
—
253312: N23708 1744
253313: N33132 1758
253314: N827MQ 3591
253315: N511MQ 3592
253316: N813MQ 3599#排除列
flights[, -c(“arr_delay”, “dep_delay”), with=FALSE]#"-“可以用”!"代替## year month day dep_time arr_time cancelled carrier tailnum flight
1: 2014 1 1 914 1238 0 AA N338AA 1
2: 2014 1 1 1157 1523 0 AA N335AA 3
3: 2014 1 1 1902 2224 0 AA N327AA 21
4: 2014 1 1 722 1014 0 AA N3EHAA 29
5: 2014 1 1 1347 1706 0 AA N319AA 117
—
253312: 2014 10 31 1459 1747 0 UA N23708 1744
253313: 2014 10 31 854 1147 0 UA N33132 1758
253314: 2014 10 31 1102 1311 0 MQ N827MQ 3591
253315: 2014 10 31 1106 1325 0 MQ N511MQ 3592
253316: 2014 10 31 824 1045 0 MQ N813MQ 3599
origin dest air_time distance hour min
1: JFK LAX 359 2475 9 14
2: JFK LAX 363 2475 11 57
3: JFK LAX 351 2475 19 2
4: LGA PBI 157 1035 7 22
5: JFK LAX 350 2475 13 47
—
253312: LGA IAH 201 1416 14 59
253313: EWR IAH 189 1400 8 54
253314: LGA RDU 83 431 11 2
253315: LGA DTW 75 502 11 6
253316: LGA SDF 110 659 8 24#排序
flights[order(origin,-dest)]#"-"代表降序排序## year month day dep_time dep_delay arr_time arr_delay cancelled
1: 2014 1 5 836 6 1151 49 0
2: 2014 1 6 833 7 1111 13 0
3: 2014 1 7 811 -6 1035 -13 0
4: 2014 1 8 810 -7 1036 -12 0
5: 2014 1 9 833 16 1055 7 0
—
253312: 2014 10 31 929 -1 1158 -22 0
253313: 2014 10 31 2025 -5 2252 -23 0
253314: 2014 4 6 1059 -6 1332 -1 0
253315: 2014 4 7 1122 2 1352 1 0
253316: 2014 4 11 1033 0 1245 -19 0
carrier tailnum flight origin dest air_time distance hour min
1: EV N12175 4419 EWR XNA 195 1131 8 36
2: EV N24128 4419 EWR XNA 190 1131 8 33
3: EV N12142 4419 EWR XNA 179 1131 8 11
4: EV N11193 4419 EWR XNA 184 1131 8 10
5: EV N14198 4419 EWR XNA 181 1131 8 33
—
253312: WN N243WN 706 LGA ATL 112 762 9 29
253313: WN N913WN 2969 LGA ATL 112 762 20 25
253314: EV N760EV 5624 LGA AGS 110 678 10 59
253315: EV N197PQ 5625 LGA AGS 111 678 11 22
253316: EV N391CA 5632 LGA AGS 102 678 10 33#在参数j中进行运算
flights[, sum((arr_delay + dep_delay)<0)]## [1] 141814#.N
#如果我们只想知道满足条件的有多少行的话可以使用.N
flights[origin == “JFK” & month == 6L, .N]## [1] 8422#by分组,by()的参数要是list形式,所以要加上list()或.()
flights[, .(.N), by=.(origin,dest)]## origin dest N
1: JFK LAX 10208
2: LGA PBI 2307
3: EWR LAX 4226
4: JFK MIA 2750
5: JFK SEA 1815
—
217: LGA AVL 2
218: LGA GSP 3
219: LGA SBN 2
220: EWR SBN 6
221: LGA DAL 15#by函数同时可以用boolean进行分组
flights[, .N, .(dep_delay>0, arr_delay>0)]## dep_delay arr_delay N
1: TRUE TRUE 72836
2: FALSE TRUE 34583
3: FALSE FALSE 119304
4: TRUE FALSE 26593#添加列并进行命名
flights[carrier == “AA”, .(arrdelay_mean = mean(arr_delay), depdelay_mean = mean(dep_delay)), by = .(origin, dest, month)]## origin dest month arrdelay_mean depdelay_mean
1: JFK LAX 1 6.590361 14.2289157
2: LGA PBI 1 -7.758621 0.3103448
3: EWR LAX 1 1.366667 7.5000000
4: JFK MIA 1 15.720670 18.7430168
5: JFK SEA 1 14.357143 30.7500000
—
196: LGA MIA 10 -6.251799 -1.4208633
197: JFK MIA 10 -1.880184 6.6774194
198: EWR PHX 10 -3.032258 -4.2903226
199: JFK MCO 10 -10.048387 -1.6129032
200: JFK DCA 10 16.483871 15.5161290#参数j指定多个列
#函数lapply(.SD, FUN)对SD中每列进行FUN函数计算,.SDcols(指定SD中有哪些列)
#上述结果也可以写作:
flights[carrier == “AA”, lapply(.SD, mean), by=.(origin, dest, month), .SDcols=c(“arr_delay”, “dep_delay”)]## origin dest month arr_delay dep_delay
1: JFK LAX 1 6.590361 14.2289157
2: LGA PBI 1 -7.758621 0.3103448
3: EWR LAX 1 1.366667 7.5000000
4: JFK MIA 1 15.720670 18.7430168
5: JFK SEA 1 14.357143 30.7500000
—
196: LGA MIA 10 -6.251799 -1.4208633
197: JFK MIA 10 -1.880184 6.6774194
198: EWR PHX 10 -3.032258 -4.2903226
199: JFK MCO 10 -10.048387 -1.6129032
200: JFK DCA 10 16.483871 15.5161290#chaining表达式
#将前者表达式结果作为后者表达式的参数
flights[carrier == “AA”, .(arrdelay_mean = mean(arr_delay), depdelay_mean = mean(dep_delay)), by = .(origin, dest, month)][order(origin,dest)]## origin dest month arrdelay_mean depdelay_mean
1: EWR DFW 1 6.427673 10.0125786
2: EWR DFW 2 10.536765 11.3455882
3: EWR DFW 3 12.865031 8.0797546
4: EWR DFW 4 17.792683 12.9207317
5: EWR DFW 5 18.487805 18.6829268
—
196: LGA PBI 1 -7.758621 0.3103448
197: LGA PBI 2 -7.865385 2.4038462
198: LGA PBI 3 -5.754098 3.0327869
199: LGA PBI 4 -13.966667 -4.7333333
200: LGA PBI 5 -10.357143 -6.8571429#在原data.table基础上添加列,使用":=",别忘了逗号
#写法一
flights[, `:=`(speed = distance / (air_time/60), delay = arr_delay + dep_delay)]
#写法二
flights[, c(“speed”, “delay”) := list(distance/(air_time/60), arr_delay + dep_delay)]
#可以连通by函数一起使用
flights[, max_speed := max(speed), by=.(origin, dest)]
#也可以使用lapply函数对多列施加函数运算
flights[, c(“max_dep_delay”, “max_arr_delay”) := lapply(.SD, max), by = month, .SDcols = c(“dep_delay”, “arr_delay”)]
#更新列,DT[,colname := colnewvalue]
flights[hour == 24L, hour := 0L]
#删除列, DT[, c(colname) := NULL] 或 DT[, `:=`(colname = NULL)]
flights[, c("delay") := NULL] # 等价于 flights[, `:=`(delay = NULL)]
#设置主键,值得注意,当主键列的数据改变时,主键将自动删除设为NULL
setkey(flights, origin) 仅设置一列为主键
setkey(flights, origin, dest) 设置两列为主键
setkey(flights, origin, dest, …) 设置多列为主键
#使用主键
flights[.(“JFK”)] subset所有满足条件origin是“JFK”的行
flights[.(“JFK”, “MIA”)] subset所有满足条件origin是“JFK”、dest是“MIA”的行
flights[.(unique(origin), “MIA”)] subset所有仅仅满足条件dest是“MIA”的行第七章library(ggplot2)##
Attaching package: ‘ggplot2’## The following objects are masked from ‘package:psych’:
%+%, alpha#qplot()
#语法
qplot(自变量, 因变量, data = 数据源, geom = 几何图像名称, color = , shape = , alpha = , xlim = , ylim = , xlab = “x_name”, ylab = “y_name”, group = 分组依据变量名, weight = 权重变量)
若geom = “point”,散点图
若geom = “smooth”,拟合平滑曲线
若geom = “boxplot”,箱型图
若geom = “line”,折线图
若geom = “histogram”,直方图(连续变量),可以继续指定binwidth = ,代表组间距
若geom = “density”,密度曲线,可以继续指定adjust = ,代表曲线平滑程度,越大越平滑
若geom = “bar”,条形图(离散变量),可以在因变量位置使用"…density…",代表使用密度而不是频数作为y轴
#geom内可以使用c()指定多个参数进行多种作图
#alpha指定透明度
在指定color变量时,如果参数是连续的最好像将其转成离散型factor,不然的话直接使用该参数会使color是连续变化的
qplot(displ, hwy, data = mpg, colour = factor(cyl))qplot(displ, hwy, data = mpg, colour = cyl)#分面,qplot()默认的分面方法是拆分成若干个窗格
分面使用facets,该函数前参数代表行分组变量,后参数代表列分组变量,若为.则代表不分组
qplot(carat, data = diamonds, facets = color ~ .,
geom = “histogram”, binwidth = 0.1, xlim = c(0, 3))## Warning: Removed 32 rows containing non-finite values (stat_bin).## Warning: Removed 14 rows containing missing values (geom_bar).qplot(carat, data = diamonds, facets = . ~ color,
geom = “histogram”, binwidth = 0.1, xlim = c(0, 3))## Warning: Removed 32 rows containing non-finite values (stat_bin).
Warning: Removed 14 rows containing missing values (geom_bar).#qplot()函数允许使用后缀+函数表达式的方式进行图像的添加,这样每一个后缀函数表达式都单独构成了一个图层
qplot(displ, hwy, data=mpg, facets = . ~ year) + geom_smooth()## geom_smooth()
using method = ‘loess’ and formula ‘y ~ x’#查看图形数据结构
summary(qplot(displ, hwy, data=mpg, facets = . ~ year) + geom_smooth())## data: manufacturer, model, displ, year, cyl, trans, drv, cty, hwy,
fl, class [234x11]
mapping: x = ~displ, y = ~hwy
faceting: <ggproto object: Class FacetGrid, Facet, gg>
compute_layout: function
draw_back: function
draw_front: function
draw_labels: function
draw_panels: function
finish_data: function
init_scales: function
map_data: function
params: list
setup_data: function
setup_params: function
shrink: TRUE
train_scales: function
vars: function
super: <ggproto object: Class FacetGrid, Facet, gg>
-----------------------------------
geom_point: na.rm = FALSE
stat_identity: na.rm = FALSE
position_identity
geom_smooth: na.rm = FALSE, se = TRUE
stat_smooth: na.rm = FALSE, se = TRUE, method = auto, formula = y ~ x
position_identity#ggplot()
#我们看到的图像 = 图形对象 + 图层
#图形对象:ggplot(数据源, aes(自变量, 因变量, colour = 色彩变量))
#图层:layer(geom = “几何图案命称”,params = list(name1 = value1, name2 = value2)stat=“identity”,position=“identity”)
#图层属性在params中指定,color、binwidth等
#图形图像在加上图层之前是看不到图的
p <- ggplot(diamonds, aes(carat, price, colour = cut))
p + layer(geom = "point", stat = "identity", position = "identity")#ggplot()快捷函数
#上图用快捷函数可以表示为
p+geom_point()#分组应用
library(nlme)
p <- ggplot(Oxboys, aes(age, height, group = Subject)) + geom_line()
p#绘制所有组数据汇总的拟合曲线
p + geom_smooth(aes(group = 1), method=“lm”, size = 2, se = F)#ggplot()分组
ggplot(diamonds, aes(depth)) + xlim(58, 68) + geom_histogram(aes(y = …density…), binwidth = 0.1) + facet_grid(cut ~ .)## Warning: Removed 669 rows containing non-finite values (stat_bin).## Warning: Removed 10 rows containing missing values (geom_bar).#指定分片数据标度自由
library(reshape2)##
Attaching package: ‘reshape2’## The following objects are masked from ‘package:data.table’:
dcast, meltem <- melt(economics, id = “date”)
qplot(date, value, data = em, geom = “line”, group = variable) + facet_grid(variable ~ ., scale = “free_y”)