我想刮一下這個節點。在一頁中運行良好,但我需要其他 342 頁。關于頁面僅更改最終數字,例如 1、2、3 到 342。
library(rvest)
library(xml2)
library(httr)
# For page 1
website<-'https://cgspace.cgiar.org/discover?
rpp=10&etal=0&query=cassava&scope=10568/35697&group_by=none&page=1'
link <- vector()
#loop through nodes
for (i in 1:10){
link[i] <-website %>%
read_html() %>%
html_nodes(xpath=paste0('//*[@id="aspect_discovery_SimpleSearch_div_search-
results"]/div[',i,']/div[2]/div/div[1]/a')) %>%
html_attr('href')
}
pag <- data.frame(link)
pag$link2 <- paste0('https://cgspace.cgiar.org', pag$link)
pag
# link link2
# 1 /handle/10568/71370 https://cgspace.cgiar.org/handle/10568/71370
# 2 /handle/10568/43831 https://cgspace.cgiar.org/handle/10568/43831
# 3 /handle/10568/56285 https://cgspace.cgiar.org/handle/10568/56285
# For page 2
website<-'https://cgspace.cgiar.org/discover?
rpp=10&etal=0&query=cassava&scope=10568/35697&group_by=none&page=2'
link <- vector()
#loop through nodes
for (i in 1:10){
link[i] <-website %>%
read_html() %>%
html_nodes(xpath=paste0('//*[@id="aspect_discovery_SimpleSearch_div_search-
results"]/div[',i,']/div[2]/div/div[1]/a')) %>%
html_attr('href')
}
pag2 <- data.frame(link)
pag2$link2 <- paste0('https://cgspace.cgiar.org', pag2$link)
pag2
# link link2
# 1 /handle/10568/90626 https://cgspace.cgiar.org/handle/10568/90626
# 2 /handle/10568/71796 https://cgspace.cgiar.org/handle/10568/71796
# 3 /handle/10568/68788 https://cgspace.cgiar.org/handle/10568/68788
這個想法是在一個回圈中完成它并擁有一個資料框。
更新問題:我在第二個 for 回圈中添加了這個:但顯示錯誤
all_pags <- data.frame()
startTime <- Sys.time()
for( i in 1:1){
website<-paste0('https://cgspace.cgiar.org/discover?
rpp=10&etal=0&query=cassava&scope=10568/35697&group_by=none&page=',i)
link <- vector()
Title <- vector()
#loop through nodes
for (i in 1:10){
link[i] <-website %>%
read_html() %>%
html_nodes(xpath=paste0('//*
[@id="aspect_discovery_SimpleSearch_div_search-
results"]/div[',i,']/div[2]/div/div[1]/a')) %>%
html_attr('href')
Title[i]<-website %>%
read_html() %>%
html_nodes(xpath=paste0('//*
[@id="resultsTable"]/tbody/tr[',i,']/td/div/div[1]/a/span')) %>%
html_text(trim = T)
}
pag <- data.frame(link,Title)
pag$link2 <- paste0('https://cgspace.cgiar.org', pag$link)
all_pags <- rbind(all_pags, pag, Title)
}
endTime <- Sys.time()
print(endTime - startTime)
all_pags
# Error in Title[i] <- website %>% read_html() %>% html_nodes(xpath = #paste0("//*[@id=\"resultsTable\"]/tbody/tr[", :
# replacement has length zero
我嘗試在同一個回圈中獲取每個檔案的名稱和鏈接
uj5u.com熱心網友回復:
把它放在一個回圈中
library(rvest)
library(xml2)
library(httr)
all_pags <- data.frame()
for( i in 1:342){
website<-paste0('https://cgspace.cgiar.org/discover?rpp=10&etal=0&query=cassava&scope=10568/35697&group_by=none&page=',i)
link <- vector()
#loop through nodes
for (i in 1:10){
link[i] <-website %>%
read_html() %>%
html_nodes(xpath=paste0('//*[@id="aspect_discovery_SimpleSearch_div_search-results"]/div[',i,']/div[2]/div/div[1]/a')) %>% html_attr('href')
}
pag <- data.frame(link)
pag$link2 <- paste0('https://cgspace.cgiar.org', pag$link)
all_pags <- rbind(all_pags, pag)
}
all_pags
轉載請註明出處,本文鏈接:https://www.uj5u.com/gongcheng/520923.html
上一篇:查找組內上一個冬天的事件數
下一篇:計算兩列之間的差異并乘以總和