# One-time setup: install the packages used throughout these notes.
# install.packages() expects package names as character strings.
install.packages(c("magrittr", "tidyr", "dplyr"))
install.packages("tidyverse") # the three packages above are all part of tidyverse
library(tidyverse)

# Pipe `cars` into summary(); equivalent to summary(cars).
cars %>% summary()
# 應用運算符號 %>%
# Derive an age from today's date by chaining calls with %>%.
birth <- 1995
age <- Sys.Date() %>%
  format(format = "%Y") %>% # the format specification must be a quoted string
  as.numeric() %>%
  `-`(birth) # backtick-quoted minus: subtracts `birth` from the piped-in year
# 調整輸入位置
# Traditional form: the data frame is supplied through the `data` argument.
cars_lm <- lm(formula = dist ~ speed, data = cars)

# Piped form: `.` marks the position where the piped-in data frame is placed.
cars_lm <- cars %>% lm(formula = dist ~ speed, data = .)
# gather()
# 將多個數值變數堆積在同一個數值變數中 (value),再用一個類別變數 (key) 紀錄數值變數的來源。
# Build a small wide-format data frame with one row per team.
team_name <- c("Bull", "Warrior") # team names are string literals
wins <- c(72, 73)
losses <- c(10, 9)
team <- data.frame(team_name, wins, losses)
team

# Stack the `wins` and `losses` columns into a single `values` column and
# record which column each value came from in `variable`.
gather(team, key = variable, value = values, wins, losses)
# filter()
# 篩選資料
# Keep only the rows whose team_name is "Bull".
filter(team, team_name == "Bull")
# Base R equivalent; the condition reads the column from `team` itself rather
# than from a same-named global vector.
team[team$team_name == "Bull", ]
# select()
# 篩選特定變數
# Keep only the `wins` column.
select(team, wins)
# Base R equivalent: the column is named by a string (a bare `wins` would be
# the numeric vector and be used as column positions); drop = FALSE keeps the
# result a data frame instead of collapsing it to a vector.
team[, "wins", drop = FALSE]
# mutate()
# 新增衍生變數或非衍生變數
# Season labels are strings; unquoted, 1995-96 would be evaluated as the
# subtraction 1995 - 96.
season <- c("1995-96", "2015-16")

# Add one derived column (winning_percentage) and one non-derived column
# (season, taken from the vector above).
mutate(team,
  winning_percentage = wins / (wins + losses),
  season = season
)
# arrange()
# 利用指定的變數排序觀測值
# Sort the rows of `team` by losses.
team %>% arrange(losses) # ascending
team %>% arrange(desc(losses)) # descending
# summarise()
# 聚合某項變數觀測值進行運算
# Aggregate a column into a single value: the variance of losses.
team %>% summarise(var(losses))

# group_by() combined with summarise() and %>%:
# reshape to long format, then average `values` within each team.
team_gather <- team %>%
  gather(key = variable, value = values, wins, losses)
team_gather %>%
  group_by(team_name) %>%
  summarise(mean(values)) %>%
  as.data.frame()
# 運用函數於資料框 (加速資料運算)
# Simulate 500,000 observations: each value is a random whole number offset
# by 40 (weight) or 140 (height).
weight <- 40 + ceiling(50 * runif(5e5))
height <- 140 + ceiling(50 * runif(5e5))

# Combine both vectors into a height/weight data frame.
h_w <- data.frame(height, weight)
# Preallocate an empty result vector, one slot per row of h_w.
bmi <- rep(NA, times = nrow(h_w))

# Row-by-row BMI: weight divided by the square of height / 100.
# Columns are addressed by their quoted names; seq_len() is safe even when
# the data frame has zero rows.
for (i in seq_len(nrow(h_w))) {
  bmi[i] <- h_w[i, "weight"] / (h_w[i, "height"] / 100)^2
}

# The same loop wrapped in system.time() to measure how long it takes
# (about 13 seconds according to the original notes).
system.time(for (i in seq_len(nrow(h_w))) {
  bmi[i] <- h_w[i, "weight"] / (h_w[i, "height"] / 100)^2
})
# `digits` controls how many significant digits are shown when printing.
options(digits = 7)

# Vectorised BMI over whole columns; the original notes report roughly
# 0.02 seconds for this version.
system.time(
  bmi <- h_w[["weight"]] / (h_w[["height"]] / 100)^2
)
# Count how many distinct values `x` contains.
#
# x: a vector (or factor).
# Returns the number of unique elements as a single integer.
distinct_counts <- function(x) {
  length(unique(x))
}
# MARGIN = 2 applies distinct_counts to each column (variable);
# MARGIN = 1 would apply it to each row (observation).
# Note: apply() coerces a data frame to a matrix before iterating.
apply(X = iris, MARGIN = 2, FUN = distinct_counts)

# Same per-column computation, returned as a list.
lapply(X = iris, FUN = distinct_counts)

# Same per-column computation, simplified to a vector.
sapply(X = iris, FUN = distinct_counts)

# Number of distinct Sepal.Length values within each Species.
tapply(X = iris$Sepal.Length, INDEX = iris$Species, FUN = distinct_counts)
# 練習題: 以上面的身高體重例子,用 mapply 計算 bmi
# Exercise solution: mapply() walks several vectors in parallel, so it fits
# a function that takes more than one input (here height and weight).
bmi <- mapply(
  function(height, weight) weight / (height / 100)^2,
  h_w$height,
  h_w$weight
)