我有一個資料框,其中包含某學院學生論文的導師(supervisor)和顧問(advisor)的姓名,例如:
# Sample input: three identical records, each listing one supervisor followed
# by four advisors in a "Name : .. , Family : .. , Type : .." layout.
DF <- data.frame(
  Names = rep(
    "Name : Ali , Family : Ahmadi , Type : First supervisor Name : Aram , Family : Rezaeei , Type : Advisor Name : Omid , Family : Saeedi , Type : Advisor 1 Name : Nima , Family : Shaki , Type : Advisor 2 Name : Sohrab , Family : Karimi , Type : Advisor 3",
    3L
  )
)
我想將導師和顧問分成兩個不同的欄位(也就是我期望的結果),如下所示:
# Desired result: one row per record, with the supervisor's full name in one
# column and the comma-separated advisor full names in the other.
DF1 <- data.frame(
  Supervisor = rep("Ali Ahmadi", 3L),
  Advisors = rep("Aram Rezaeei, Omid Saeedi, Nima Shaki, Sohrab Karimi", 3L)
)
DF1
Supervisor Advisors
1 Ali Ahmadi Aram Rezaeei, Omid Saeedi, Nima Shaki, Sohrab Karimi
2 Ali Ahmadi Aram Rezaeei, Omid Saeedi, Nima Shaki, Sohrab Karimi
3 Ali Ahmadi Aram Rezaeei, Omid Saeedi, Nima Shaki, Sohrab Karimi
我嘗試了以下代碼:
# Asker's attempt: cut each record at every "Name :" marker, tokenise the
# pieces on single spaces, drop the label tokens, and glue the rest back
# together into one string per record.
DF1 <- strsplit(DF$Names, "Name :")
stopwords <- c(":","Type","Family","Name","1","2", "3", "Advisor", "Family")
DF2 <- lapply(DF1, function(pieces) unlist(strsplit(pieces, " ")))
# Keep only the tokens that have no match in the stop-word list.
DF3 <- lapply(DF2, function(tokens) tokens[is.na(match(tokens, stopwords))])
DF4 <- lapply(DF3, paste, collapse = " ")
但最終結果如下所示,並不符合我的預期,而且顯然還需要進一步處理才能轉換為資料框:
DF4
[[1]]
[1] " Ali , Ahmadi , First supervisor Aram , Rezaeei , Omid , Saeedi , Nima , Shaki , Sohrab , Karimi ,"
[[2]]
[1] " Ali , Ahmadi , First supervisor Aram , Rezaeei , Omid , Saeedi , Nima , Shaki , Sohrab , Karimi ,"
[[3]]
[1] " Ali , Ahmadi , First supervisor Aram , Rezaeei , Omid , Saeedi , Nima , Shaki , Sohrab , Karimi ,"
有沒有簡化的方法來解決這個問題?我發現 regexp 可能會有所幫助,但至少在我的示例中我不知道如何使用它。提前感謝您的任何回答...
uj5u.com熱心網友回復:
以下是使用 tidyr 的 `extract()` 的一種做法:
# Split `Names` into a `Supervisor` column and a comma-separated `Advisor`
# column. Assumes every person is exactly "<first> <family>" and that the
# supervisor is listed first in each record.
library(dplyr) # mutate()
library(tidyr) # extract()
DF %>%
  # clean strings: drop the field labels, role words, digits and the
  # " ," / " :" separators, leaving only space-separated name words
  mutate(Names = gsub("\\s?(Name|Family|First supervisor|Advisor|Type|\\d|\\s[,:])", "", Names, perl = TRUE)) %>%
  # extract data into columns: the first two words form the supervisor,
  # everything after the following whitespace belongs to the advisors
  extract(Names,
          into = c("Supervisor", "Advisor"),
          regex = "(\\w+\\s\\w+)\\s(.*)") %>%
  # insert commas into `Advisor`: append "," after every two-word name
  # except the one that ends the string (negative lookahead needs perl = TRUE)
  mutate(Advisor = gsub("(\\w+\\s\\w+\\b)(?!$)", "\\1,", Advisor, perl = TRUE))
Supervisor Advisor
1 Ali Ahmadi Aram Rezaeei, Omid Saeedi, Nima Shaki, Sohrab Karimi
2 Ali Ahmadi Aram Rezaeei, Omid Saeedi, Nima Shaki, Sohrab Karimi
3 Ali Ahmadi Aram Rezaeei, Omid Saeedi, Nima Shaki, Sohrab Karimi
資料:
# Input data used by the answer above: the same record repeated three times.
DF <- data.frame(
  Names = rep(
    "Name : Ali , Family : Ahmadi , Type : First supervisor Name : Aram , Family : Rezaeei , Type : Advisor Name : Omid , Family : Saeedi , Type : Advisor 1 Name : Nima , Family : Shaki , Type : Advisor 2 Name : Sohrab , Family : Karimi , Type : Advisor 3",
    times = 3L
  )
)
uj5u.com熱心網友回復:
這是一個基本的 R 解決方案。
# Base R solution: split every record into persons, strip the field labels,
# and build one Supervisor / Advisors row per record.
DF <- data.frame(
  Names = rep(
    "Name : Ali , Family : Ahmadi , Type : First supervisor Name : Aram , Family : Rezaeei , Type : Advisor Name : Omid , Family : Saeedi , Type : Advisor 1 Name : Nima , Family : Shaki , Type : Advisor 2 Name : Sohrab , Family : Karimi , Type : Advisor 3",
    3L
  )
)

# Labels, role words and numbering to delete. They are combined into one
# regex alternation, so they are removed as substrings wherever they occur.
stopwords <- c(":","Type","Family","Name","1","2", "3", "Advisor", "Family")
stoppattern <- paste(unique(stopwords), collapse = "|")

# One character vector per record, one element per person. The empty piece
# in front of the first "Name :" marker is dropped.
DF1 <- strsplit(DF$Names, "Name :")
DF1 <- lapply(DF1, \(x) trimws(x[nzchar(x)]))

# Remove the labels; what remains is "<first> , <family> , <leftover role>".
DF2 <- lapply(DF1, \(x) gsub(stoppattern, "", x))

# Per person: split on commas, trim, and discard fields left empty after the
# label removal.
DF3 <- lapply(DF2, \(x) {
  fields <- lapply(strsplit(x, ","), trimws)
  lapply(fields, \(f) f[nzchar(f)])
})

DF4 <- lapply(DF3, \(x) {
  # The first person is the supervisor; the third field is the leftover
  # "First supervisor" text, so only the first two fields are kept.
  Supervisor <- paste(x[[1]][1:2], collapse = " ")
  # Join each advisor's own name parts with a space *before* joining the
  # advisors with ", "; flattening first would give "Aram, Rezaeei, ...".
  Advisors <- vapply(x[-1], paste, character(1), collapse = " ")
  Advisors <- paste(Advisors, collapse = ", ")
  data.frame(Supervisor, Advisors)
})
Final <- do.call(rbind, DF4)
Final
#> Supervisor Advisors
#> 1 Ali Ahmadi Aram Rezaeei, Omid Saeedi, Nima Shaki, Sohrab Karimi
#> 2 Ali Ahmadi Aram Rezaeei, Omid Saeedi, Nima Shaki, Sohrab Karimi
#> 3 Ali Ahmadi Aram Rezaeei, Omid Saeedi, Nima Shaki, Sohrab Karimi
由reprex 包(v2.0.1)創建于 2022-06-05
轉載請註明出處,本文鏈接:https://www.uj5u.com/shujuku/485663.html