R语言复习总结


!!!Key topics!!!: data.frame, plotting, matrices

Chapter 1

#R is a case-sensitive, interpreted language

#The rm() function removes objects from the workspace

Chapter 2

#R indexing starts at 1

#Vector

vector <- c(20173561,1709,20173562,1707)

#Matrix

matrix <- matrix(vector,nrow = 2,ncol = 2,byrow = TRUE)

#Array; the dimension names supplied when creating an array must be passed as a list

data <- 1:24
dim1 <- c("A1", "A2")
dim2 <- c("B1", "B2", "B3")
dim3 <- c("C1", "C2", "C3", "C4")
array <- array(data,c(2,3,4),list(dim1, dim2, dim3))

#Data frame

patientID <- c(1, 2, 3, 4) #numeric vector
age <- c(25, 34, 28, 52) #numeric vector
diabetes <- c("Type1", "Type2", "Type1", "Type1") #data.frame converts character vectors to factors by default; the number of levels equals the number of distinct values
status <- c("Poor", "Improved", "Excellent", "Poor")
frame <- data.frame(patientID, age, diabetes, status)
frame <- data.frame(patientID, age, diabetes, status, stringsAsFactors=FALSE) #stringsAsFactors=FALSE keeps character columns from being converted to factors

#Selecting elements of a data frame

frame[1,]            #select row 1
frame[1:2,]          #select rows 1 and 2
frame[1]             #select column 1
frame[1:2]           #select columns 1 and 2
frame[1,3]           #select row 1, column 3
frame$patientID      #select the patientID column
frame$patientID[2]   #select the 2nd element of the patientID column

#List; a list can hold any combination of vectors, matrices, data frames, and even other lists

g <- "My First List"
h <- c(25, 26, 18, 39)
j <- matrix(1:10, nrow=5)
k <- c("one", "two", "three")
list <- list(first = g,second = h,third = j,k)

#Assigning a value to a nonexistent element of a vector, matrix, array, or list makes R extend the structure automatically; the gap is filled with NA

x <- c(1,2)
x

##Result: [1] 1 2

x[5] <- 5
x

##Result: [1] 1 2 NA NA 5

#Reading a CSV file

read.table("your_file.csv", header = TRUE, sep = ",")

#By default, character variables are converted to factors

#Inspecting the structure of the data

str(frame)
## 'data.frame':    4 obs. of  4 variables:
##  $ patientID: num  1 2 3 4
##  $ age      : num  25 34 28 52
##  $ diabetes : Factor w/ 2 levels "Type1","Type2": 1 2 1 1
##  $ status   : Factor w/ 3 levels "Excellent","Improved",..: 3 2 1 3

#Checking the data dimensions
dim(frame)
## [1] 4 4

#Checking the object type
class(frame)
## [1] "data.frame"

#Checking the object's attribute (variable) names
names(frame)
## [1] "patientID" "age"       "diabetes"  "status"

#Viewing the first few rows of an object, here 2
head(frame, 2)
##   patientID age diabetes   status
## 1         1  25    Type1     Poor
## 2         2  34    Type2 Improved

Chapter 3

#Creating new variables

mydata$sumx <- mydata$x1 + mydata$x2

mydata$meanx <- (mydata$x1 + mydata$x2)/2

#Recoding variables

leadership$agecat[leadership$age >= 55 & leadership$age <= 75] <- "Middle Aged"

#Renaming variables

Method 1

install.packages("reshape") #install the package first

library(reshape)

yourdataframe <- rename(yourdataframe, c(oldName1="NewName1", oldName2="NewName2"))

Method 2

names(yourdataframe)[2] <- "newName"

#NA is a missing value; NaN means "not a number"
#Detect missing values with is.na()
#Drop rows that contain missing values with na.omit()
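For example, a quick sketch with made-up values:

x <- c(1, NA, 3)
is.na(x)        #FALSE TRUE FALSE
df <- data.frame(a = c(1, NA, 3), b = c(4, 5, 6))
na.omit(df)     #drops the second row, the one containing the NA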

#Computing date differences, in days, weeks, etc.

difftime(time1, time2, units = c("auto", "secs", "mins", "hours", "days", "weeks"))
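A small sketch with two arbitrary dates:

d1 <- as.Date("2024-01-01")
d2 <- as.Date("2024-03-01")
difftime(d2, d1, units = "days")   #Time difference of 60 days
difftime(d2, d1, units = "weeks")  #the same gap expressed in weeks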

#Type conversion

as.numeric()

as.character()

as.vector()

as.matrix()

as.data.frame()
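Quick examples of the conversion functions above (toy values):

as.numeric("3.14")                       #character to numeric: 3.14
as.character(42)                         #numeric to character: "42"
as.vector(matrix(1:4, nrow = 2))         #matrix to plain vector: 1 2 3 4
as.matrix(data.frame(a = 1:2, b = 3:4))  #data frame to matrix
as.data.frame(matrix(1:4, nrow = 2))     #matrix to data frame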

#Sorting data

yourdataframe[order(yourdataframe$varname),]   #ascending; don't forget the comma

yourdataframe[order(-yourdataframe$varname),]  #descending; don't forget the comma

#Sorting by multiple variables

yourdataframe[order(yourdataframe$varname1, -yourdataframe$varname2),]
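A concrete version of the placeholder syntax above, using the built-in mtcars data set:

mtcars[order(mtcars$mpg), ]               #ascending by mpg
mtcars[order(-mtcars$mpg), ]              #descending by mpg
mtcars[order(mtcars$cyl, -mtcars$mpg), ]  #ascending by cyl, then descending by mpg within each cyl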

#Merging data sets (see the sketch below)
#Merge by one or more shared variables: merge(dataframeA, dataframeB, by=c("varname1","varname2"))
#Column-wise (horizontal) merge: cbind(A, B); both objects must have the same number of rows, in the same order
#Row-wise (vertical) merge: rbind(A, B); both objects must have the same variables, though not necessarily in the same order
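A minimal sketch with two made-up toy frames A and B:

A <- data.frame(id = c(1, 2, 3), x = c(10, 20, 30))
B <- data.frame(id = c(2, 3, 4), y = c("b", "c", "d"))
merge(A, B, by = "id")                 #keeps ids 2 and 3, the ones present in both
cbind(A, flag = c(TRUE, FALSE, TRUE))  #adds a column; the new column has the same number of rows as A
rbind(A, data.frame(id = 4, x = 40))   #adds a row; the new row carries the same variables as A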

#Subsetting a data set
#Keeping variables: yourdataframe[c("varname1","varname2",...)]

#Dropping variables with a logical vector
#1

myvars <- names(leadership) %in% c("q3", "q4")

newdata <- leadership[!myvars]

#2

newdata <- leadership[c(-7,-8)] #works if q3 and q4 are known to be the 7th and 8th variables

#3

leadership$q3 <- leadership$q4 <- NULL

#Rows can also be selected by a repeating rule, because logical vectors are recycled

leadership[c(TRUE, FALSE, FALSE),] #with more than 3 rows this returns rows 1, 4, 7, 10, 13, 16, ...

leadership[c(TRUE, FALSE),] #with more than 2 rows this returns rows 1, 3, 5, 7, ...

#Remember the trailing comma when selecting rows

newdata <- leadership[1:3,]

newdata <- leadership[leadership$gender=="M" & leadership$age > 30,]

#The subset() function
#newdata <- subset(leadership, age >= 35 | age < 24, select=c(q1, q2, q3, q4)) #first argument: the data frame; second: the row condition; third: the variables (columns) to keep

#Random sampling; don't forget the comma

mysample <- leadership[sample(1:nrow(leadership), 3, replace=FALSE),]

The first argument of sample() is the range to draw from

The second argument is the number of samples to draw

The third argument is whether to sample with replacement

#Transposing data: t()
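For example:

m <- matrix(1:6, nrow = 2)  #a 2 x 3 matrix
t(m)                        #its transpose, a 3 x 2 matrix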

#Aggregating data by group

aggregate(x, by, FUN)

The first argument is the data frame to operate on

The second argument gives the variables (columns) to group by

The third argument is the function applied to each group; it is applied to every variable in the data frame, so aggregate() fails when the data frame contains non-numeric variables

aggdata <- aggregate(mtcars, by=list(mtcars$cyl, mtcars$gear), FUN=mean, na.rm=TRUE) #the by variables must be inside a list, even when there is only one

#The reshape package

library(reshape)

#Melting: melt()

md <- melt(mydata, id=c("ID", "Time"))

melt() takes every variable in mydata other than "ID" and "Time" (however many columns that is) and stacks them into two columns, "variable" and "value"

#Casting: cast()

newdata <- cast(md, formula, FUN)

The first argument is the data set to operate on

The second argument is the casting formula, of the form A ~ B; both A and B may list several variables joined with +. The A variables define the row grouping of the new data set; after grouping by A, each distinct combination of values of the B variables becomes a new column, and the remaining values are summarised with FUN within each A-by-B cell

###Think of A as the row grouping and B as the column grouping###

The third argument is the function used when casting (see the sketch below)
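A small melt/cast round trip, with a made-up mydata frame mirroring the melt call above:

library(reshape)
mydata <- data.frame(ID = c(1, 1, 2, 2), Time = c(1, 2, 1, 2),
                     X1 = c(5, 3, 6, 2), X2 = c(6, 5, 1, 4))
md <- melt(mydata, id = c("ID", "Time"))  #long format: ID, Time, variable, value
cast(md, ID ~ variable, mean)             #one row per ID, one column per variable, cells are means over Time
cast(md, ID + Time ~ variable)            #back to the original wide layout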

#Conditional control
#else must not appear at the start of a line (see the sketch below)
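A minimal sketch of the else placement rule at the interactive prompt:

x <- 5
if (x > 3) {
  y <- "big"
} else {        #else stays on the same line as the closing brace
  y <- "small"
}
y               #"big"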
Chapter 4

#Bar chart
library(vcd)
## Loading required package: grid
counts <- table(Arthritis$Improved) #prepare the data
#simple bar chart
barplot(counts)
#dot chart
dotchart(mtcars$mpg, labels=row.names(mtcars), cex=.7, main="Gas Mileage for Car Models", xlab="Miles Per Gallon")

#Scatter plot
plot(mtcars$wt, mtcars$mpg)
abline(lm(mtcars$mpg ~ mtcars$wt)) #add the best-fitting straight line to the scatter plot

#Line chart
t1 <- subset(Orange, Tree==1)
plot(t1$age, t1$circumference, type="b")

Chapter 5

#Mean: mean()

#Median: median()

#Variance: var()

#Standard deviation: sd()

#Range: range()

#Sum: sum()

#Minimum: min()

#Maximum: max()

#summary() computes common descriptive statistics: minimum, maximum, quartiles, and mean (or frequencies for factor and logical vectors)
mt <- mtcars[c("mpg", "hp", "wt", "am")]
summary(mt)
##      mpg             hp             wt             am

Min. :10.40 Min. : 52.0 Min. :1.513 Min. :0.0000

1st Qu.:15.43 1st Qu.: 96.5 1st Qu.:2.581 1st Qu.:0.0000

Median :19.20 Median :123.0 Median :3.325 Median :0.0000

Mean :20.09 Mean :146.7 Mean :3.217 Mean :0.4062

3rd Qu.:22.80 3rd Qu.:180.0 3rd Qu.:3.610 3rd Qu.:1.0000

Max. :33.90 Max. :335.0 Max. :5.424 Max. :1.0000

#Functions for computing statistics

sapply(x, FUN, options)

#The first argument is a data frame (or matrix)
#The second argument is any function (including user-defined functions)
#The third argument passes additional arguments to FUN (if any)
#sapply applies FUN to every column
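For example:

sapply(mtcars[c("mpg", "hp", "wt")], mean)                   #mean of every column
sapply(mtcars[c("mpg", "hp", "wt")], quantile, probs = 0.9)  #extra arguments are passed on to FUN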

#Computing statistics by group

aggregate() only allows functions that return a single value

by() allows functions that return several values

myvars <- c("mpg", "hp", "wt")
aggregate(mtcars[myvars], by=list(am=mtcars$am), mean) #note the use of list(am=mtcars$am); with list(mtcars$am) the grouping column would be labelled Group.1 instead of am
##   am      mpg       hp       wt

1 0 17.14737 160.2632 3.768895

2 1 24.39231 126.8462 2.411000

mystats <- function(x, na.omit=FALSE){

if (na.omit)
x <- x[!is.na(x)] #drop missing values
m <- mean(x)
n <- length(x)
s <- sd(x)
skew <- sum((x-m)^3/s^3)/n
kurt <- sum((x-m)^4/s^4)/n - 3
return(c(n=n, mean=m, stdev=s, skew=skew, kurtosis=kurt))
}
dstats <- function(x)sapply(x, mystats)
by(mtcars[myvars], mtcars$am, dstats)
## mtcars$am: 0

mpg hp wt

n 19.00000000 19.00000000 19.0000000

mean 17.14736842 160.26315789 3.7688947

stdev 3.83396639 53.90819573 0.7774001

skew 0.01395038 -0.01422519 0.9759294

kurtosis -0.80317826 -1.20969733 0.1415676

--------------------------------------------------------

mtcars$am: 1

mpg hp wt

n 13.00000000 13.0000000 13.0000000

mean 24.39230769 126.8461538 2.4110000

stdev 6.16650381 84.0623243 0.6169816

skew 0.05256118 1.3598859 0.2103128

kurtosis -1.45535200 0.5634635 -1.1737358

#Frequency tables

library(vcd)
#One-way frequency table
with(Arthritis, table(Improved))
## Improved

None Some Marked

42 14 28

#One-way proportion table

prop.table(with(Arthritis, table(Improved)))
## Improved

None Some Marked

0.5000000 0.1666667 0.3333333

#Two-way contingency table

xtabs(~ A + B, data=mydata) #A defines the rows, B the columns

xtabs(~ Treatment+Improved, data=Arthritis)
## Improved

Treatment None Some Marked

Placebo 29 7 7

Treated 13 7 21

#Adding marginal sums to a two-way table

addmargins(xtabs(~ Treatment+Improved, data=Arthritis))
## Improved

Treatment None Some Marked Sum

Placebo 29 7 7 43

Treated 13 7 21 41

Sum 42 14 28 84

#Row marginal sums only

addmargins(xtabs(~ Treatment+Improved, data=Arthritis), 1)
## Improved

Treatment None Some Marked

Placebo 29 7 7

Treated 13 7 21

Sum 42 14 28

#Column marginal sums only

addmargins(xtabs(~ Treatment+Improved, data=Arthritis), 2)
## Improved

Treatment None Some Marked Sum

Placebo 29 7 7 43

Treated 13 7 21 41

#A convenient function for building two-way contingency tables

library(gmodels)
CrossTable(Arthritis$Treatment, Arthritis$Improved)
##

Cell Contents

|-------------------------|

| N |

| Chi-square contribution |

| N / Row Total |

| N / Col Total |

| N / Table Total |

|-------------------------|

Total Observations in Table: 84

| Arthritis$Improved

Arthritis$Treatment | None | Some | Marked | Row Total |

--------------------|-----------|-----------|-----------|-----------|

Placebo | 29 | 7 | 7 | 43 |

| 2.616 | 0.004 | 3.752 | |

| 0.674 | 0.163 | 0.163 | 0.512 |

| 0.690 | 0.500 | 0.250 | |

| 0.345 | 0.083 | 0.083 | |

--------------------|-----------|-----------|-----------|-----------|

Treated | 13 | 7 | 21 | 41 |

| 2.744 | 0.004 | 3.935 | |

| 0.317 | 0.171 | 0.512 | 0.488 |

| 0.310 | 0.500 | 0.750 | |

| 0.155 | 0.083 | 0.250 | |

--------------------|-----------|-----------|-----------|-----------|

Column Total | 42 | 14 | 28 | 84 |

| 0.500 | 0.167 | 0.333 | |

--------------------|-----------|-----------|-----------|-----------|

#Contingency tables with more than two dimensions

xtabs(~ Treatment+Sex+Improved, data=Arthritis)
## , , Improved = None

Sex

Treatment Female Male

Placebo 19 10

Treated 6 7

, , Improved = Some

Sex

Treatment Female Male

Placebo 7 0

Treated 5 2

, , Improved = Marked

Sex

Treatment Female Male

Placebo 6 1

Treated 16 5

ftable(xtabs(~ Treatment+Sex+Improved, data=Arthritis)) #ftable() prints a multidimensional contingency table in a compact, readable form
## Improved None Some Marked

Treatment Sex

Placebo Female 19 7 6

Male 10 0 1

Treated Female 6 5 16

Male 7 2 5

#Parameter estimation

Parameter estimation uses sample data to estimate one or more parameters of the population distribution, e.g. estimating the population mean or variance from a sample of a given size

#Point estimation

Draw a sample from the population and use a sample statistic to give a single-value estimate of the unknown population parameter

#Interval estimation

On top of a point estimate, give the probability that the population parameter falls within some interval

Two quantities: the confidence interval is the interval estimate of the population parameter constructed from the sample statistic; the confidence level is the probability that the unknown population parameter lies inside the interval, written 1 − α, where α is the significance level, i.e. the probability that the parameter is not in the interval
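A quick sketch of a confidence interval for a mean, using t.test() on a built-in variable:

t.test(mtcars$mpg)$conf.int                     #95% confidence interval for the mean of mpg
t.test(mtcars$mpg, conf.level = 0.99)$conf.int  #99% interval, i.e. 1 - alpha = 0.99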

#Hypothesis testing

Uses the sampling distribution to test the probability that a parameter lies in some range

α error: a null and an alternative hypothesis are set up; the null hypothesis is actually true, but we decide to reject it. This is called a Type I error

β error: the null hypothesis is actually false, but we decide to accept it. This is called a Type II error

If the form of the population distribution is known, use a parametric test; if it is unknown, use a non-parametric test

#Tests of independence

Chi-square test: the null hypothesis H0 is that the attributes are mutually independent. A small Χ² suggests the variables are independent, a large Χ² suggests they are related; if the computed p-value is well below 0.05, reject H0

chisq.test(xtabs(~Treatment+Improved, data=Arthritis))
##

Pearson’s Chi-squared test

data: xtabs(~Treatment + Improved, data = Arthritis)

X-squared = 13.055, df = 2, p-value = 0.001463

#Fisher's exact test: the null hypothesis H0 is that the rows and columns of a contingency table with fixed margins are independent; it cannot be applied to a 2x2 table. If the computed p-value is well below 0.05, reject H0

fisher.test(xtabs(~Treatment+Improved, data=Arthritis))
##

Fisher’s Exact Test for Count Data

data: xtabs(~Treatment + Improved, data = Arthritis)

p-value = 0.001393

alternative hypothesis: two.sided

#Cochran-Mantel-Haenszel test: the null hypothesis is that the two nominal variables are conditionally independent within every level of a third variable. If the computed p-value is well below 0.05, reject H0

Here H0 says that, within each sex, the treatment outcome is independent of the treatment received

mantelhaen.test(xtabs(~Treatment+Improved+Sex, data=Arthritis))
##

Cochran-Mantel-Haenszel test

data: xtabs(~Treatment + Improved + Sex, data = Arthritis)

Cochran-Mantel-Haenszel M^2 = 14.632, df = 2, p-value = 0.0006647

#Measures of association

The assocstats() function computes the phi coefficient, contingency coefficient, and Cramer's V for a two-way table; larger values indicate stronger association

assocstats(xtabs(~Treatment+Improved, data=Arthritis))
## X^2 df P(> X^2)

Likelihood Ratio 13.530 2 0.0011536

Pearson 13.055 2 0.0014626

Phi-Coefficient : NA

Contingency Coeff.: 0.367

Cramer's V : 0.394

#Covariance: the larger the covariance, the more strongly the two variables move in the same direction

cov(state.x77[,1:6])
## Population Income Illiteracy Life Exp Murder

Population 19931683.7588 571229.7796 292.8679592 -407.8424612 5663.523714

Income 571229.7796 377573.3061 -163.7020408 280.6631837 -521.894286

Illiteracy 292.8680 -163.7020 0.3715306 -0.4815122 1.581776

Life Exp -407.8425 280.6632 -0.4815122 1.8020204 -3.869480

Murder 5663.5237 -521.8943 1.5817755 -3.8694804 13.627465

HS Grad -3551.5096 3076.7690 -3.2354694 6.3126849 -14.549616

HS Grad

Population -3551.509551

Income 3076.768980

Illiteracy -3.235469

Life Exp 6.312685

Murder -14.549616

HS Grad 65.237894

#Pearson product-moment correlation: measures the degree of linear association between two quantitative variables

cor(mtcars)
## mpg cyl disp hp drat wt

mpg 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.68117191 -0.8676594

cyl -0.8521620 1.0000000 0.9020329 0.8324475 -0.69993811 0.7824958

disp -0.8475514 0.9020329 1.0000000 0.7909486 -0.71021393 0.8879799

hp -0.7761684 0.8324475 0.7909486 1.0000000 -0.44875912 0.6587479

drat 0.6811719 -0.6999381 -0.7102139 -0.4487591 1.00000000 -0.7124406

wt -0.8676594 0.7824958 0.8879799 0.6587479 -0.71244065 1.0000000

qsec 0.4186840 -0.5912421 -0.4336979 -0.7082234 0.09120476 -0.1747159

vs 0.6640389 -0.8108118 -0.7104159 -0.7230967 0.44027846 -0.5549157

am 0.5998324 -0.5226070 -0.5912270 -0.2432043 0.71271113 -0.6924953

gear 0.4802848 -0.4926866 -0.5555692 -0.1257043 0.69961013 -0.5832870

carb -0.5509251 0.5269883 0.3949769 0.7498125 -0.09078980 0.4276059

qsec vs am gear carb

mpg 0.41868403 0.6640389 0.59983243 0.4802848 -0.55092507

cyl -0.59124207 -0.8108118 -0.52260705 -0.4926866 0.52698829

disp -0.43369788 -0.7104159 -0.59122704 -0.5555692 0.39497686

hp -0.70822339 -0.7230967 -0.24320426 -0.1257043 0.74981247

drat 0.09120476 0.4402785 0.71271113 0.6996101 -0.09078980

wt -0.17471588 -0.5549157 -0.69249526 -0.5832870 0.42760594

qsec 1.00000000 0.7445354 -0.22986086 -0.2126822 -0.65624923

vs 0.74453544 1.0000000 0.16834512 0.2060233 -0.56960714

am -0.22986086 0.1683451 1.00000000 0.7940588 0.05753435

gear -0.21268223 0.2060233 0.79405876 1.0000000 0.27407284

carb -0.65624923 -0.5696071 0.05753435 0.2740728 1.00000000

#Spearman rank correlation: measures the association between rank-ordered (ordinal) variables

cor(mtcars, method="spearman")
## mpg cyl disp hp drat wt

mpg 1.0000000 -0.9108013 -0.9088824 -0.8946646 0.65145546 -0.8864220

cyl -0.9108013 1.0000000 0.9276516 0.9017909 -0.67888119 0.8577282

disp -0.9088824 0.9276516 1.0000000 0.8510426 -0.68359210 0.8977064

hp -0.8946646 0.9017909 0.8510426 1.0000000 -0.52012499 0.7746767

drat 0.6514555 -0.6788812 -0.6835921 -0.5201250 1.00000000 -0.7503904

wt -0.8864220 0.8577282 0.8977064 0.7746767 -0.75039041 1.0000000

qsec 0.4669358 -0.5723509 -0.4597818 -0.6666060 0.09186863 -0.2254012

vs 0.7065968 -0.8137890 -0.7236643 -0.7515934 0.44745745 -0.5870162

am 0.5620057 -0.5220712 -0.6240677 -0.3623276 0.68657079 -0.7377126

gear 0.5427816 -0.5643105 -0.5944703 -0.3314016 0.74481617 -0.6761284

carb -0.6574976 0.5800680 0.5397781 0.7333794 -0.12522294 0.4998120

qsec vs am gear carb

mpg 0.46693575 0.7065968 0.56200569 0.5427816 -0.65749764

cyl -0.57235095 -0.8137890 -0.52207118 -0.5643105 0.58006798

disp -0.45978176 -0.7236643 -0.62406767 -0.5944703 0.53977806

hp -0.66660602 -0.7515934 -0.36232756 -0.3314016 0.73337937

drat 0.09186863 0.4474575 0.68657079 0.7448162 -0.12522294

wt -0.22540120 -0.5870162 -0.73771259 -0.6761284 0.49981205

qsec 1.00000000 0.7915715 -0.20333211 -0.1481997 -0.65871814

vs 0.79157148 1.0000000 0.16834512 0.2826617 -0.63369482

am -0.20333211 0.1683451 1.00000000 0.8076880 -0.06436525

gear -0.14819967 0.2826617 0.80768800 1.0000000 0.11488698

carb -0.65871814 -0.6336948 -0.06436525 0.1148870 1.00000000

#Testing correlations for significance

states <- state.x77[,1:6]
#The usual null hypothesis H0 is that the variables are uncorrelated (the population correlation equals 0); it can also be that the correlation is greater than or less than 0. If the p-value is well below 0.05, reject H0.

cor.test(x, y, alternative = , method = ) #tests only one pair of variables at a time

#The first two arguments, x and y, are the variables whose correlation is tested
#The third argument, alternative, chooses a two-sided or one-sided test ("two.side", "less", or "greater"). Use alternative="less" when the research hypothesis is that the population correlation is less than 0, and alternative="greater" when it is greater than 0. The default is alternative="two.side" (population correlation not equal to 0).
#The fourth argument, method, chooses the type of correlation to compute ("pearson", "kendall", or "spearman")
cor.test(states[,3], states[,5])
##

Pearson’s product-moment correlation

data: states[, 3] and states[, 5]

t = 6.8479, df = 48, p-value = 1.258e-08

alternative hypothesis: true correlation is not equal to 0

95 percent confidence interval:

0.5279280 0.8207295

sample estimates:

cor

0.7029752

#corr.test(data, use= , method= ) can test several relationships at once

The first argument is the data set to operate on

The second argument, use=, takes "pairwise" or "complete" (pairwise or listwise deletion of missing values, respectively)

The third argument, method=, takes "pearson" (the default), "spearman", or "kendall"

library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:plotrix':
##
##     rescale

corr.test(states, use="complete")
## Call:corr.test(x = states, use = "complete")

Correlation matrix

Population Income Illiteracy Life Exp Murder HS Grad

Population 1.00 0.21 0.11 -0.07 0.34 -0.10

Income 0.21 1.00 -0.44 0.34 -0.23 0.62

Illiteracy 0.11 -0.44 1.00 -0.59 0.70 -0.66

Life Exp -0.07 0.34 -0.59 1.00 -0.78 0.58

Murder 0.34 -0.23 0.70 -0.78 1.00 -0.49

HS Grad -0.10 0.62 -0.66 0.58 -0.49 1.00

Sample Size

[1] 50

Probability values (Entries above the diagonal are adjusted for multiple tests.)

Population Income Illiteracy Life Exp Murder HS Grad

Population 0.00 0.59 1.00 1.0 0.10 1

Income 0.15 0.00 0.01 0.1 0.54 0

Illiteracy 0.46 0.00 0.00 0.0 0.00 0

Life Exp 0.64 0.02 0.00 0.0 0.00 0

Murder 0.01 0.11 0.00 0.0 0.00 0

HS Grad 0.50 0.00 0.00 0.0 0.00 0

To see confidence intervals of the correlations, print with the short=FALSE option

#t-tests

#Used to compare two groups
#Null hypothesis H0: for two independent samples (drawn from normal populations), the two population means are equal. If the p-value is well below 0.05, reject H0.
#First calling format

t.test(y ~ x, data) #y is a numeric variable, x is a dichotomous (two-level) variable

library(MASS)
t.test(Prob ~ So, data=UScrime)
##

Welch Two Sample t-test

data: Prob by So

t = -3.8954, df = 24.925, p-value = 0.0006506

alternative hypothesis: true difference in means is not equal to 0

95 percent confidence interval:

-0.03852569 -0.01187439

sample estimates:

mean in group 0 mean in group 1

0.03851265 0.06371269

#Second calling format

t.test(y1, y2) #y1 and y2 are numeric vectors

with(UScrime, t.test(U1, U2, paired=TRUE))
##

Paired t-test

data: U1 and U2

t = 32.407, df = 46, p-value < 2.2e-16

alternative hypothesis: true difference in means is not equal to 0

95 percent confidence interval:

57.67003 65.30870

sample estimates:

mean of the differences

61.48936

Chapter 6

library(data.table)

#data.table
#DT[i, j, by]

#Reading the data
flights <- fread("F:\\0课件\\R语言\\flights14.csv")

#行筛选
flights[origin == "JFK" & month == 6L]
## year month day dep_time dep_delay arr_time arr_delay cancelled

1: 2014 6 1 851 -9 1205 -5 0

2: 2014 6 1 1220 -10 1522 -13 0

3: 2014 6 1 718 18 1014 -1 0

4: 2014 6 1 1024 -6 1314 -16 0

5: 2014 6 1 1841 -4 2125 -45 0

8418: 2014 6 30 1457 -3 1639 -6 0

8419: 2014 6 30 1454 -5 1622 -32 0

8420: 2014 6 30 1717 -3 1834 -16 0

8421: 2014 6 30 758 -2 927 7 0

8422: 2014 6 30 803 -7 932 -18 0

carrier tailnum flight origin dest air_time distance hour min

1: AA N787AA 1 JFK LAX 324 2475 8 51

2: AA N795AA 3 JFK LAX 329 2475 12 20

3: AA N784AA 9 JFK LAX 326 2475 7 18

4: AA N791AA 19 JFK LAX 320 2475 10 24

5: AA N790AA 21 JFK LAX 326 2475 18 41

8418: MQ N931MQ 3231 JFK PIT 62 340 14 57

8419: MQ N802MQ 3295 JFK RDU 65 427 14 54

8420: MQ N530MQ 3365 JFK DCA 39 213 17 17

8421: MQ N502MQ 3370 JFK DCA 52 213 7 58

8422: MQ N811MQ 3465 JFK RDU 67 427 8 3

#Selecting columns (don't forget the comma); columns can be renamed at the same time

flights[,.(newname=tailnum,flight)] #can also be written flights[,list(newname=tailnum,flight)]
## newname flight

1: N338AA 1

2: N335AA 3

3: N327AA 21

4: N3EHAA 29

5: N319AA 117

253312: N23708 1744

253313: N33132 1758

253314: N827MQ 3591

253315: N511MQ 3592

253316: N813MQ 3599

#Excluding columns

flights[, -c("arr_delay", "dep_delay"), with=FALSE] #"-" can be replaced with "!"
## year month day dep_time arr_time cancelled carrier tailnum flight

1: 2014 1 1 914 1238 0 AA N338AA 1

2: 2014 1 1 1157 1523 0 AA N335AA 3

3: 2014 1 1 1902 2224 0 AA N327AA 21

4: 2014 1 1 722 1014 0 AA N3EHAA 29

5: 2014 1 1 1347 1706 0 AA N319AA 117

253312: 2014 10 31 1459 1747 0 UA N23708 1744

253313: 2014 10 31 854 1147 0 UA N33132 1758

253314: 2014 10 31 1102 1311 0 MQ N827MQ 3591

253315: 2014 10 31 1106 1325 0 MQ N511MQ 3592

253316: 2014 10 31 824 1045 0 MQ N813MQ 3599

origin dest air_time distance hour min

1: JFK LAX 359 2475 9 14

2: JFK LAX 363 2475 11 57

3: JFK LAX 351 2475 19 2

4: LGA PBI 157 1035 7 22

5: JFK LAX 350 2475 13 47

253312: LGA IAH 201 1416 14 59

253313: EWR IAH 189 1400 8 54

253314: LGA RDU 83 431 11 2

253315: LGA DTW 75 502 11 6

253316: LGA SDF 110 659 8 24

#Sorting

flights[order(origin,-dest)] #"-" means descending order
## year month day dep_time dep_delay arr_time arr_delay cancelled

1: 2014 1 5 836 6 1151 49 0

2: 2014 1 6 833 7 1111 13 0

3: 2014 1 7 811 -6 1035 -13 0

4: 2014 1 8 810 -7 1036 -12 0

5: 2014 1 9 833 16 1055 7 0

253312: 2014 10 31 929 -1 1158 -22 0

253313: 2014 10 31 2025 -5 2252 -23 0

253314: 2014 4 6 1059 -6 1332 -1 0

253315: 2014 4 7 1122 2 1352 1 0

253316: 2014 4 11 1033 0 1245 -19 0

carrier tailnum flight origin dest air_time distance hour min

1: EV N12175 4419 EWR XNA 195 1131 8 36

2: EV N24128 4419 EWR XNA 190 1131 8 33

3: EV N12142 4419 EWR XNA 179 1131 8 11

4: EV N11193 4419 EWR XNA 184 1131 8 10

5: EV N14198 4419 EWR XNA 181 1131 8 33

253312: WN N243WN 706 LGA ATL 112 762 9 29

253313: WN N913WN 2969 LGA ATL 112 762 20 25

253314: EV N760EV 5624 LGA AGS 110 678 10 59

253315: EV N197PQ 5625 LGA AGS 111 678 11 22

253316: EV N391CA 5632 LGA AGS 102 678 10 33

#Computing inside the j argument

flights[, sum((arr_delay + dep_delay)<0)]
## [1] 141814

#.N
#If we only want to know how many rows satisfy a condition, use .N
flights[origin == "JFK" & month == 6L, .N]
## [1] 8422

#Grouping with by; the by argument must be a list, so wrap it in list() or .()
flights[, .(.N), by=.(origin,dest)]
## origin dest N

1: JFK LAX 10208

2: LGA PBI 2307

3: EWR LAX 4226

4: JFK MIA 2750

5: JFK SEA 1815

217: LGA AVL 2

218: LGA GSP 3

219: LGA SBN 2

220: EWR SBN 6

221: LGA DAL 15

#by can also group on boolean expressions

flights[, .N, .(dep_delay>0, arr_delay>0)]
## dep_delay arr_delay N

1: TRUE TRUE 72836

2: FALSE TRUE 34583

3: FALSE FALSE 119304

4: TRUE FALSE 26593

#Creating and naming new columns in j

flights[carrier == "AA", .(arrdelay_mean = mean(arr_delay), depdelay_mean = mean(dep_delay)), by = .(origin, dest, month)]
## origin dest month arrdelay_mean depdelay_mean

1: JFK LAX 1 6.590361 14.2289157

2: LGA PBI 1 -7.758621 0.3103448

3: EWR LAX 1 1.366667 7.5000000

4: JFK MIA 1 15.720670 18.7430168

5: JFK SEA 1 14.357143 30.7500000

196: LGA MIA 10 -6.251799 -1.4208633

197: JFK MIA 10 -1.880184 6.6774194

198: EWR PHX 10 -3.032258 -4.2903226

199: JFK MCO 10 -10.048387 -1.6129032

200: JFK DCA 10 16.483871 15.5161290

#Specifying several columns in j

#lapply(.SD, FUN) applies FUN to every column of .SD; .SDcols specifies which columns .SD contains
#The result above can also be written as:
flights[carrier == "AA", lapply(.SD, mean), by=.(origin, dest, month), .SDcols=c("arr_delay", "dep_delay")]
## origin dest month arr_delay dep_delay

1: JFK LAX 1 6.590361 14.2289157

2: LGA PBI 1 -7.758621 0.3103448

3: EWR LAX 1 1.366667 7.5000000

4: JFK MIA 1 15.720670 18.7430168

5: JFK SEA 1 14.357143 30.7500000

196: LGA MIA 10 -6.251799 -1.4208633

197: JFK MIA 10 -1.880184 6.6774194

198: EWR PHX 10 -3.032258 -4.2903226

199: JFK MCO 10 -10.048387 -1.6129032

200: JFK DCA 10 16.483871 15.5161290

#Chaining expressions

#The result of the first expression becomes the input of the next
flights[carrier == "AA", .(arrdelay_mean = mean(arr_delay), depdelay_mean = mean(dep_delay)), by = .(origin, dest, month)][order(origin,dest)]
## origin dest month arrdelay_mean depdelay_mean

1: EWR DFW 1 6.427673 10.0125786

2: EWR DFW 2 10.536765 11.3455882

3: EWR DFW 3 12.865031 8.0797546

4: EWR DFW 4 17.792683 12.9207317

5: EWR DFW 5 18.487805 18.6829268

196: LGA PBI 1 -7.758621 0.3103448

197: LGA PBI 2 -7.865385 2.4038462

198: LGA PBI 3 -5.754098 3.0327869

199: LGA PBI 4 -13.966667 -4.7333333

200: LGA PBI 5 -10.357143 -6.8571429

#Adding columns to the data.table in place with ":="; don't forget the comma

#Form 1
flights[, `:=`(speed = distance / (air_time/60), delay = arr_delay + dep_delay)]
#Form 2
flights[, c("speed", "delay") := list(distance/(air_time/60), arr_delay + dep_delay)]
#Can be combined with by
flights[, max_speed := max(speed), by=.(origin, dest)]
#lapply can also be used to apply a function over several columns
flights[, c("max_dep_delay", "max_arr_delay") := lapply(.SD, max), by = month, .SDcols = c("dep_delay", "arr_delay")]

#Updating a column: DT[, colname := newvalue]
flights[hour == 24L, hour := 0L]

#Deleting columns: DT[, c("colname") := NULL] or DT[, `:=`(colname = NULL)]
flights[, c("delay") := NULL] # flights[, `:=`(delay = NULL)]

#Setting keys; note that when the data in a key column changes, the key is automatically removed (set to NULL)

setkey(flights, origin) #key on a single column

setkey(flights, origin, dest) #key on two columns

setkey(flights, origin, dest, …) #key on several columns

#Using keys

flights[.("JFK")] #subset of all rows where origin is "JFK"

flights[.("JFK", "MIA")] #subset of all rows where origin is "JFK" and dest is "MIA"

flights[.(unique(origin), "MIA")] #subset of all rows where only the condition dest == "MIA" matters

Chapter 7

library(ggplot2)
##

## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
##     %+%, alpha

#qplot()

#Syntax

qplot(x, y, data = data source, geom = geometry name, color = , shape = , alpha = , xlim = , ylim = , xlab = "x_name", ylab = "y_name", group = grouping variable, weight = weight variable)

geom = "point": scatter plot

geom = "smooth": fitted smooth curve

geom = "boxplot": box plot

geom = "line": line chart

geom = "histogram": histogram (continuous variable); binwidth = sets the bin width

geom = "density": density curve; adjust = controls how smooth the curve is, larger being smoother

geom = "bar": bar chart (discrete variable); "..density.." can be used in the y position to plot density rather than counts

#geom can take several values via c() to draw several geometries at once
#alpha sets transparency

When mapping a variable to color, a continuous variable is best converted to a discrete factor; otherwise the color scale varies continuously

qplot(displ, hwy, data = mpg, colour = factor(cyl))
qplot(displ, hwy, data = mpg, colour = cyl)

#Faceting; by default qplot() splits the plot into separate panels

Faceting uses the facets argument: the variable before ~ gives the row grouping and the variable after gives the column grouping; a . means no grouping in that direction

qplot(carat, data = diamonds, facets = color ~ .,
geom = "histogram", binwidth = 0.1, xlim = c(0, 3))
## Warning: Removed 32 rows containing non-finite values (stat_bin).
## Warning: Removed 14 rows containing missing values (geom_bar).

qplot(carat, data = diamonds, facets = . ~ color,
geom = "histogram", binwidth = 0.1, xlim = c(0, 3))
## Warning: Removed 32 rows containing non-finite values (stat_bin).

## Warning: Removed 14 rows containing missing values (geom_bar).

#qplot() lets you add to a plot with "+ function expression"; each appended function expression forms its own layer

qplot(displ, hwy, data=mpg, facets = . ~ year) + geom_smooth()
## geom_smooth() using method = 'loess' and formula 'y ~ x'

#Inspecting the plot's data structure
summary(qplot(displ, hwy, data=mpg, facets = . ~ year) + geom_smooth())
## data: manufacturer, model, displ, year, cyl, trans, drv, cty, hwy,

fl, class [234x11]

mapping: x = ~displ, y = ~hwy

faceting: <ggproto object: Class FacetGrid, Facet, gg>

compute_layout: function

draw_back: function

draw_front: function

draw_labels: function

draw_panels: function

finish_data: function

init_scales: function

map_data: function

params: list

setup_data: function

setup_params: function

shrink: TRUE

train_scales: function

vars: function

super: <ggproto object: Class FacetGrid, Facet, gg>

-----------------------------------

geom_point: na.rm = FALSE

stat_identity: na.rm = FALSE

position_identity

geom_smooth: na.rm = FALSE, se = TRUE

stat_smooth: na.rm = FALSE, se = TRUE, method = auto, formula = y ~ x

position_identity

#ggplot()

#The plot we see = plot object + layers
#Plot object: ggplot(data source, aes(x, y, colour = colour variable))
#Layer: layer(geom = "geometry name", params = list(name1 = value1, name2 = value2), stat="identity", position="identity")
#Layer attributes (color, binwidth, etc.) are given in params
#The plot object shows nothing until at least one layer is added
p <- ggplot(diamonds, aes(carat, price, colour = cut))
p + layer(geom = "point", stat="identity", position="identity")

#ggplot() shortcut functions
#The plot above can also be written as
p + geom_point()

#Grouped plots
library(nlme)
p <- ggplot(Oxboys, aes(age, height, group = Subject)) + geom_line()
p

#Fitting a single curve to the data from all groups combined
p + geom_smooth(aes(group = 1), method="lm", size = 2, se = F)

#Grouping (faceting) with ggplot()
ggplot(diamonds, aes(depth)) + xlim(58, 68) + geom_histogram(aes(y = ..density..), binwidth = 0.1) + facet_grid(cut ~ .)
## Warning: Removed 669 rows containing non-finite values (stat_bin).
## Warning: Removed 10 rows containing missing values (geom_bar).

#Letting each facet use its own (free) scale
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
##     dcast, melt

em <- melt(economics, id = "date")

qplot(date, value, data = em, geom = "line", group = variable) + facet_grid(variable ~ ., scale = "free_y")

