群組操作

apply

apply(thematrix, 1, sum) # 對每一列套用函數

apply(thematrix, 2, sum) # 對每一行套用函數

apply(thematrix, 1, sum , na.rm=TRUE) # 忽略缺失值

lapply和sapply

lapply對list中每個元素套用函數，回傳結果同樣以list呈現；sapply則會盡可能將結果簡化為vector（或matrix）形式回傳。lapply和sapply也可使用vector作為引數，此時vector的每個元素會被逐一當成list的元素來處理。

mapply可對好幾個list中的每個元素套用所指定的函數。 mapply(identical, firstlist, secondlist)

aggregate聚合資料

aggregate(price ~ cut, diamonds, mean) # 公式由波浪號區分左右邊，左邊變數用來計算，右邊變數則是對左邊變數作為分群依據。使用diamonds資料集，將price依據cut分組，套用mean函數

aggregate(price ~ cut + color, diamonds, mean, na.rm=TRUE)

aggregate(cbind(price, carat) ~ cut, diamonds, mean) # 同時對price和carat兩個變數進行分組計算

aggregate(cbind(price, carat) ~ cut + color, diamonds, mean)

aggregate(price ~ cut, diamonds, each(mean, median)) # 利用plyr套件中的each函數，將多個函數合併成單一函數，一次套用並回傳各函數的結果

plyr套件

篩選

wineTrain <- wine[, which(names(wine) != "Cultivar")]

##\ 刪除特定欄位（可能不確定是第幾欄）

wbInfo <- wbInfo[wbInfo$region != "Aggregate", ] ##\ 刪除特定列

wbDaisy <- wbInfo[, which(!names(wbInfo) %in% c("iso2c", "country", "capital"))]

##\ 刪除列出來特定幾個名稱的欄位（利用which與!names(...) %in% 留下不在清單中的欄位）

wbInfo <- wbInfo[which(rowSums(!is.na(wbInfo[,indicators]))>0),]

##\ 若有資料，!is.na()為真，TRUE=1；用which找出rowSums()>0的列並保留，即可移除所有變數皆為NA的列

建立資料（案例）

例：generate sales data for RFM analysis

sales <- data.frame(
  sample(1000:1999, replace = TRUE, size = 10000), ##\ 自1000-1999中隨機取10000次，可重複
  abs(round(rnorm(10000, 28, 13))), ##\ 常態分配設定平均值28且標準差13，取10000次；再四捨五入；再取絕對值
  as.Date("2014/1/1") + 700 * sort(stats::runif(10000)), ##\ runif 自均勻分配(0, 1)隨機取10000次（stats:: 表示明確使用stats套件中的函數）；排序後乘上700，換算成自2014/1/1起700天內的日期
  data.frame(sample(1:4, replace = TRUE, size = 10000)) ##\ 1-4重複隨機取10000次
)

names(sales) <- c("CustomerID", "Sales Value", "Date", "Category")

Data Munging