我要如何像 R 中的 for 回圈那樣,對作業目錄中的所有資料檔案從頭到尾套用同一個作業流程?
我的作業流程具有以下組件
# Load libraries
library(tidyr)
library(ggplot2)
library(dplyr)
# Import data
# The file is read as tab-separated text without a header row; quoting is
# disabled so quote characters inside fields are kept literally.
File_1 <- as.data.frame(read.table("file_1.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = ""))
# Separate one of the columns into 2 new columns
filtered_File_1 <- separate(data = File_1, col = V3, into = c("end position", "Methylation"), sep = "\\|")
# Rows whose Methylation value does not contain "NAN" (not used further below).
# grepl() is used rather than -grep(): a negative index built from zero
# matches would select no rows at all.
new_File_1 <- filtered_File_1[!grepl("NAN", filtered_File_1$Methylation), ]
# Change NAN values to zero
filtered_File_1$Methylation[filtered_File_1$Methylation == "NAN"] <- "0"
# Change values into numeric
filtered_File_1$Methylation <- as.numeric(as.character(filtered_File_1$Methylation))
# Add New column
filtered_File_1$ID <- "1"
# Take the average of numerical values
# mean() is applied to every non-grouping column within each V1 group.
Average <- filtered_File_1 %>%
  group_by(V1) %>%
  summarise(across(everything(), list(mean)))
基本上,我將如何為所有 100 個資料檔案一遍又一遍地迭代上述程序?原因是單獨復制和粘貼代碼行有點乏味,而且還會使 R 腳本變得很長。我知道如何將 lapply 用于串列,但是什么會允許涉及檔案匯入和處理的更復雜的事情?必須有更簡單的方法。
謝謝
uj5u.com熱心網友回復:
如果檔案適合作業記憶體,我們可以使用 lapply 來實作與 for 回圈相同的效果,或者因為帖子使用 tidyverse,也可以使用 purrr::map。這會生成一個資料框串列,其中包含函式 get_means 回傳的匯總統計資訊。
library(tidyverse)
# Collect every file in the working directory whose name ends in ".bed" and
# name each list element after the file without its extension.
# The dot is escaped and the pattern anchored: an unescaped "." matches any
# character, so ".bed" would also pick up names that merely contain "bed".
files <- as.list(list.files(pattern = "\\.bed$")) %>%
  set_names(nm = sub(pattern = "\\.bed$", "", .))
#' Summarise one BED file by the groups in its first column
#'
#' Reads the file, splits column `V3` on "|" into `end position` and
#' `Methylation`, recodes "NAN" methylation values as 0, converts that column
#' to numeric, and then applies `mean()` to every non-grouping column within
#' each `V1` group.
#'
#' @param x Path to a tab-separated file without a header row.
#' @return A data frame with one row per distinct value of `V1`.
get_means <- function(x) {
  # Read as tab-separated text with quoting disabled, so fields containing
  # spaces or quote characters are not split or merged by read.table().
  read.table(x, header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "") %>%
    separate(col = V3, into = c("end position", "Methylation"), sep = "\\|") %>%
    mutate(
      Methylation = ifelse(Methylation == "NAN", "0", Methylation),
      Methylation = as.numeric(Methylation)
    ) %>%
    group_by(V1) %>%
    summarise(across(everything(), list(mean)))
}
# Run get_means on every entry of files; the result is a list with one
# summary per file (purrr equivalent: map(files, get_means)).
files %>%
  lapply(get_means)
或在 for 回圈中:
# Named character vector of ".bed" files; names are the file names without
# the extension. The dot is escaped and the pattern anchored so that only a
# literal ".bed" suffix matches.
files <- list.files(pattern = "\\.bed$") %>%
  set_names(sub(pattern = "\\.bed$", "", .))
# Preallocate one slot per file, carrying over the same names.
result <- vector(mode = "list", length(files)) %>% # initialize list
  set_names(names(files))
for (i in seq_along(files)) {
  # [[ extracts the bare path string for the i-th file.
  result[[i]] <- get_means(files[[i]])
}
要將匯總統計直接寫入磁盤上的新檔案,並以 filename_stats.csv 的形式命名:
# Named character vector of ".bed" files; the dot is escaped and the pattern
# anchored so that only a literal ".bed" suffix matches.
files <- list.files(pattern = "\\.bed$") %>%
  set_names(sub(pattern = "\\.bed$", "", .))
# One output name per input file: <name without extension>_stats.csv
new_file_names <- paste0(names(files), "_stats.csv")
for (i in seq_along(files)) {
  # Summarise the i-th file and write the result straight to its CSV file.
  write_csv(get_means(files[[i]]), file = new_file_names[[i]])
}
示例:
考慮以下可重現的示例:使用 mtcars 和 iris,我們可以創建帶有匯總統計資訊的命名串列:
# Write two built-in data sets to disk to serve as example input files.
write.csv(mtcars, file = "mtcars.csv")
write.csv(iris, file = "iris.csv")
# List the CSV files and name each element after the file without extension.
# The dot is escaped and the pattern anchored so only names ending in ".csv"
# match (an unescaped "." matches any character).
files <- as.list(list.files(pattern = "\\.csv$")) %>%
  set_names(nm = sub(pattern = "\\.csv$", "", .))
# For each file, compute the mean of every numeric column.
lapply(files, \(x) {
  read.csv(x) %>%
    summarise(across(where(is.numeric), mean))
})
$iris
X Sepal.Length Sepal.Width Petal.Length Petal.Width
1 75.5 5.843333 3.057333 3.758 1.199333
$mtcars
mpg cyl disp hp drat wt qsec vs am gear carb
1 20.09062 6.1875 230.7219 146.6875 3.596563 3.21725 17.84875 0.4375 0.40625 3.6875 2.8125
我不確定原始代碼中的 new_File_1 應該做什么,因為它沒有被使用。
uj5u.com熱心網友回復:
我之前在類似情況下使用過的一段代碼:
library(readr)
library(dplyr)
library(tidyr) # separate() comes from tidyr
infolder <- "C:\\Users\\name\\in"
setwd(infolder)
# File names (without directory) of every ".bed" file in the input folder.
csvfiles <- dir(path = infolder, pattern = "\\.bed$")
for (i in csvfiles) {
  # Progress message: full path of the file being processed. The input folder
  # is used here because no output folder is defined in this script.
  print(file.path(infolder, i))
  #your code...#
  File_i <- as.data.frame(read.table(i, header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = ""))
  filtered_File_i <- separate(data = File_i, col = V3, into = c("end position", "Methylation"), sep = "\\|")
  # Rows whose Methylation value does not contain "NAN" (not used further
  # below). grepl() is used rather than -grep(): a negative index built from
  # zero matches would select no rows at all.
  new_File_i <- filtered_File_i[!grepl("NAN", filtered_File_i$Methylation), ]
  filtered_File_i$Methylation[filtered_File_i$Methylation == "NAN"] <- "0"
  filtered_File_i$Methylation <- as.numeric(as.character(filtered_File_i$Methylation))
  # Tag every row with the name of the file it came from.
  filtered_File_i$ID <- i
  Average <- filtered_File_i %>%
    group_by(V1) %>%
    summarise(across(everything(), list(mean)))
  ### Assign the final value to a new variable named "X_i"
  ### see ?assign for more information
  assign(x = paste0("X_", i), value = Average)
}
轉載請註明出處,本文鏈接:https://www.uj5u.com/qiye/375148.html
