資料處理技巧(2)

Publish Date: 2018-09-06

Update Date: 2018-09-06

# Install the packages used in this post. Package names must be quoted
# strings; bare names would be looked up as (non-existent) objects.
install.packages(c("magrittr", "tidyr", "dplyr"))
install.packages("tidyverse") # the three packages above are all part of tidyverse
library(tidyverse)
cars %>% summary() # pipes cars into summary(); equivalent to summary(cars)

應用運算符號%>%

# Compute an age from a birth year by piping today's date through a chain:
# Date -> four-digit year string -> number -> minus the birth year.
birth <- 1995
age <- Sys.Date() %>%
  format(format = "%Y") %>% # "%Y" must be a quoted format string
  as.numeric() %>%
  `-`(birth) # backticks let the subtraction operator be called as a function

調整輸入位置

# Traditional call: the data frame is passed directly as `data`.
cars_lm <- lm(formula = dist ~ speed, data = cars)
# Piped call: `.` marks the position where the piped-in data is inserted.
cars_lm <- cars %>%
  lm(formula = dist ~ speed, data = .)

gather() 將多個數值變數堆積在同一個數值變數中(value)，再用一個類別變數(key)紀錄數值變數的來源。

# Build a small example data frame: one row per team, with wins and losses.
team_name <- c("Bull", "Warrior") # team names are character strings
wins <- c(72, 73)
losses <- c(10, 9)
team <- data.frame(team_name, wins, losses)
team
# Stack the wins and losses columns into a single `values` column, with
# `variable` recording which column each value came from.
# NOTE: gather() is superseded in current tidyr; pivot_longer() is the
# recommended replacement for new code.
gather(team, key = variable, value = values, wins, losses)

filter() 篩選資料

# Keep only the rows whose team_name is "Bull".
filter(team, team_name == "Bull")
# Base R equivalent; the column is referenced through the data frame so the
# subset does not depend on a separate global vector of the same name.
team[team$team_name == "Bull", ]

select() 篩選特定變數

# Keep only the wins column.
select(team, wins)
# Base R equivalent: the column name must be a quoted string, and
# drop = FALSE keeps the result a data frame instead of a vector.
team[, "wins", drop = FALSE]

mutate() 新增衍生變數或非衍生變數

# Season labels are strings; unquoted, 1995-96 would be evaluated as the
# subtraction 1995 - 96.
season <- c("1995-96", "2015-16")
# Add a derived column (winning_percentage) and a non-derived one (season).
mutate(team,
  winning_percentage = wins / (wins + losses),
  season = season
)

arrange() 利用指定的變數排序觀測值

team %>% arrange(losses) # ascending order of losses
team %>% arrange(desc(losses)) # descending order of losses

summarise() 聚合某項變數觀測值進行運算

# Aggregate a single column: variance of losses across all teams.
summarise(team, var(losses))
# Combine group_by() with summarise() and %>%: stack wins and losses into
# one column, then take the mean of the stacked values for each team.
team_gather <- gather(team, key = variable, value = values, wins, losses)
team_gather %>%
  group_by(team_name) %>%
  summarise(mean(values)) %>%
  as.data.frame()

運用函數於資料框 (加速資料運算)

# Simulate 500,000 integer-valued weights (41-90) and heights (141-190);
# the BMI formula below divides height by 100, so presumably kg and cm.
weight <- ceiling(runif(500000) * 50) + 40
height <- ceiling(runif(500000) * 50) + 140
h_w <- data.frame(height, weight) # data frame of height and weight
bmi <- rep(NA, times = nrow(h_w)) # preallocate an empty vector for the results
# Row-by-row loop: columns are selected by their quoted names.
for (i in seq_len(nrow(h_w))) {
  bmi[i] <- h_w[i, "weight"] / (h_w[i, "height"] / 100)^2
}
system.time(for (i in seq_len(nrow(h_w))) {
  bmi[i] <- h_w[i, "weight"] / (h_w[i, "height"] / 100)^2
}) # system.time() reports the run time; the author measured about 13 seconds
options(digits = 7) # number of significant digits to print
system.time(
  bmi <- h_w$weight / (h_w$height / 100)^2
) # the vectorized computation brought this down to about 0.02 seconds

#' Count the distinct values in a vector
#'
#' @param x A vector (for example, one column of a data frame).
#' @return The number of unique values in `x`, as an integer.
distinct_counts <- function(x) {
  length(unique(x))
}
# MARGIN = 2 applies distinct_counts to each column (variable); MARGIN = 1
# would apply it to each row (observation). Note that apply() coerces the
# data frame to a matrix before applying the function.
apply(iris, MARGIN = 2, FUN = distinct_counts)
# lapply() returns the per-column results as a list.
lapply(iris, FUN = distinct_counts)
# sapply() simplifies the per-column results into a vector.
sapply(iris, FUN = distinct_counts)
# tapply() groups Sepal.Length by Species and counts the distinct lengths
# within each species.
tapply(iris$Sepal.Length, INDEX = iris$Species, FUN = distinct_counts)

練習題: 以上面的身高體重例子，用mapply計算bmi

# mapply() suits functions that take several inputs (here height and
# weight): it walks the supplied vectors in parallel, element by element.
bmi <- mapply(
  FUN = function(height, weight) weight / (height / 100)^2,
  h_w$height,
  h_w$weight
)

參考書籍: 輕鬆學習R語言：從基礎到應用，掌握資料科學的關鍵能力

Hung-Lin, Chen

https://blog.hlin.tw/09c60050b09b/

Unless otherwise stated, all articles in this blog are licensed under CC BY 4.0. If you reproduce them, please credit the source: Hung-Lin, Chen.

在R中執行 SAS的glm lsmeans

在 R 執行 SAS 的 glm lsmeans。使用的資料為 R 內建的 dataset：airquality，可以先看一下該資料的描述（紐約某一年的空氣品質指標數據）。

2018-09-06 Biostatistics

R biostatistics

基因富集分析 (gene set enrichment analysis)

數據庫下載：與自己的差異基因進行搜尋及比對。AnnotationHub 是生物數據庫的中轉站，方便搜尋目標數據；另一個相似套件為 biomaRt。參考網址: https://www.jianshu.com/p/ae94178918bc

2018-08-30 Bioinformatics

R bioinformatics