根據這里的解釋,我設法從報紙檔案中抓取了一頁。
現在我試圖通過運行一個代碼來自動化訪問頁面串列的程序。制作 URL 串列很容易,因為報紙的存檔具有類似的鏈接模式:
https://en.trend.az/archive/2021-XX-XX
問題在于撰寫一個回圈來抓取諸如title, date, time, category 之類的資料。為簡單起見,我嘗試僅使用 2021-09-30 至 2021-10-02 的文章標題。
## Setting data frames
d1 <- as.Date("2021-09-30")
d2 <- as.Date("2021-10-02")
list_of_url <- character() # or str_c()
## Generating subpage list
for (i in format(seq(d1, d2, by="days"), format="%Y-%m-%d")) {
list_of_url[i] <- str_c ("https://en.trend.az", "/archive/", i)
# Launching browser
driver <- rsDriver(browser = c("firefox")) #Version 93.0 (64-bit)
remDr <- driver[["client"]]
remDr$errorDetails
remDr$navigate(list_of_url[i])
remDr0$findElement(using = "xpath", value = '/html/body/div[1]/div/div[1]/h1')$clickElement()
webElem <- remDr$findElement("css", "body")
#scrolling to the end of webpage, to load all articles
for (i in 1:25){
Sys.sleep(2)
webElem$sendKeysToElement(list(key = "end"))
}
page <- read_html(remDr$getPageSource()[[1]])
# Scraping article headlines
get_headline <- page %>%
html_nodes('.category-article') %>% html_nodes('.article-title') %>%
html_text()
get_time <- str_sub(get_time, start= -5)
length(get_time)
}
}
總長度應該是 157 166 140=463。事實上,我什至無法從一頁中收集所有資料(長度(get_time)= 126)
我認為在回圈中的第一組命令之后,我remDr為指定的 3 個日期獲得了三個,但后來無法獨立識別它們。
因為,我試圖之前或之后,以啟動初始一個內部的第二環的page <-由
for (remDr0 in remDr) {
page <- read_html(remDr0$getPageSource()[[1]])
# substituted all remDr-s below with remDr0
或者
page <- read_html(remDr$getPageSource()[[1]])
for (page0 in page)
# substituted all page-s below with page0
然而,這些嘗試以不同的錯誤告終。
我很感激專家的幫助,因為這是我第一次將 R 用于此類目的。
希望有可能通過function例如制作一個.
uj5u.com熱心網友回復:
稍微拓寬以抓取多個類別
library(RSelenium)
library(dplyr)
library(rvest)
提及日期期間
d1 <- as.Date("2021-09-30")
d2 <- as.Date("2021-10-02")
dt = seq(d1, d2, by="days")#contains the date sequence
#launch browser
driver <- rsDriver(browser = c("firefox"))
remDr <- driver[["client"]]
### `get_headline` Function for newspaper headlines
get_headline = function(x){
link = paste0( 'https://en.trend.az/archive/', x)
remDr$navigate(link)
remDr$findElement(using = "xpath", value = '/html/body/div[1]/div/div[1]/h1')$clickElement()
webElem <- remDr$findElement("css", "body")
#scrolling to the end of webpage, to load all articles
for (i in 1:25){
Sys.sleep(1)
webElem$sendKeysToElement(list(key = "end"))
}
headlines = remDr$getPageSource()[[1]] %>%
read_html() %>%
html_nodes('.category-article') %>% html_nodes('.article-title') %>%
html_text()
headlines
return(headlines)
}
get_time 發布時間函式
get_time <- function(x){
link = paste0( 'https://en.trend.az/archive/', x)
remDr$navigate(link)
remDr$findElement(using = "xpath", value = '/html/body/div[1]/div/div[1]/h1')$clickElement()
webElem <- remDr$findElement("css", "body")
#scrolling to the end of webpage, to load all articles
for (i in 1:25){
Sys.sleep(1)
webElem$sendKeysToElement(list(key = "end"))
}
# Addressing selector of time on the website
time <- remDr$getPageSource()[[1]] %>%
read_html() %>%
html_nodes('.category-article') %>% html_nodes('.article-date') %>%
html_text() %>%
str_sub(start= -5)
time
return(time)
}
從一頁/天開始對所有文章進行編號
get_number <- function(x){
link = paste0( 'https://en.trend.az/archive/', x)
remDr$navigate(link)
remDr$findElement(using = "xpath", value = '/html/body/div[1]/div/div[1]/h1')$clickElement()
webElem <- remDr$findElement("css", "body")
#scrolling to the end of webpage, to load all articles
for (i in 1:25){
Sys.sleep(1)
webElem$sendKeysToElement(list(key = "end"))
}
# Addressing selectors of headlines on the website
headline <- remDr$getPageSource()[[1]] %>%
read_html() %>%
html_nodes('.category-article') %>% html_nodes('.article-title') %>%
html_text()
number <- seq(1:length(headline))
return(number)
}
集合所有函式到 tibble
get_data_table <- function(x){
# Extract the Basic information from the HTML
headline <- get_headline(x)
time <- get_time(x)
headline_number <- get_number(x)
# Combine into a tibble
combined_data <- tibble(Num = headline_number,
Article = headline,
Time = time)
}
用于lapply遍歷所有日期dt
df = lapply(dt, get_data_table)
轉載請註明出處,本文鏈接:https://www.uj5u.com/qiye/377643.html
下一篇:在亞馬遜網頁產品上抓取ASIN
