我按照 YouTube 上的網頁爬取教程爬取了這個網站 https://books.toscrape.com/,但我得到的是一個空的結果。
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
# Accumulates one {"title": ..., "price": ...} dict per scraped book.
all_books = []

# First catalogue page of the site being scraped.
url = "http://books.toscrape.com/catalogue/page-1.html"
def get_page(url):
    """Fetch *url* and parse the response body with the lxml parser.

    Returns a two-item list ``[soup, status]``: the parsed BeautifulSoup
    document and the HTTP status code of the response.
    """
    page = requests.get(url)
    status = page.status_code
    soup = bs(page.text, "lxml")
    return [soup, status]
def get_links(soup):
    """Intended to build the detail-page URL of every book listed in *soup*.

    NOTE(review): kept as posted in the question. The nested helper below is
    defined but never called, and this function has no ``return`` statement,
    so callers receive ``None`` instead of a list of links.
    """
    links = []
    listings = soup.find_all(class_="product_pod")

    def extract_info(links):
        for listing in listings:
            bk_lnk = listing.find("h5").a.get("href")
            base_url = "http://books.toscrape.com/catalogue"
            cmplt_lnk = base_url + bk_lnk
            links.append(cmplt_lnk)
        return links
def extract_info(links):
    """Fetch each book page in *links* and record its title and price.

    Appends one ``{"title": ..., "price": ...}`` dict per link to the
    module-level ``all_books`` list. Returns nothing.
    """
    for link in links:
        res = requests.get(link).text
        book_soup = bs(res, "lxml")
        # Title (<h1>) and price (first <p>) live in the same container,
        # so look it up once.
        product_main = book_soup.find(class_="col-sm-6 product_main")
        title = product_main.h1.text.strip()
        price = product_main.p.text.strip()
        book = {"title": title, "price": price}
        all_books.append(book)
# Walk the catalogue page by page until a page stops answering with HTTP 200.
pg = 1
while True:
    url = f"http://books.toscrape.com/catalogue/page-{pg}.html"
    soup_status = get_page(url)  # [soup, status]
    if soup_status[1] == 200:
        print(f"scraping page {pg}")
        extract_info(get_links(soup_status[0]))
        pg += 1
    else:
        # A non-200 status is treated as "no more pages".
        print("The End")
        break

df = pd.DataFrame(all_books)
print(df)
這裡是我得到的結果:
Empty DataFrame
Columns: []
Index: []
我的 Colab 筆記本鏈接:
https://colab.research.google.com/drive/1Lyvwt_WLpE9tqy1qheZg80N70CFSsk-E?usp=sharing
uj5u.com熱心網友回復:
你的串列是空的。你需要呼叫你的函式,例如 get_page(url),
它應該回傳一個串列,你可以在後續的函式中使用其中的 soup。
uj5u.com熱心網友回復:
def get_links(soup):
    """Return the detail-page URL of every book listed in *soup*.

    Finds each element with class ``product_pod``, reads the ``href`` of the
    link inside its ``<h3>``, and prefixes it with the catalogue base URL.
    """
    links = []
    listings = soup.find_all(class_="product_pod")

    def extract_links():
        for listing in listings:
            bk_lnk = listing.find("h3").a.get("href")
            base_url = "https://books.toscrape.com/catalogue/"
            cmplt_lnk = base_url + bk_lnk
            links.append(cmplt_lnk)
        return links

    # Call the helper and hand its result back to the caller.
    return extract_links()
def extract_info(links):
    """Fetch each book page in *links* and record its title and price.

    Appends one ``{"title": ..., "price": ...}`` dict per link to the
    module-level ``all_books`` list. Returns nothing.
    """
    for link in links:
        res = requests.get(link).text
        book_soup = bs(res, "lxml")
        # Title (<h1>) and price (first <p>) live in the same container,
        # so look it up once.
        product_main = book_soup.find(class_="col-sm-6 product_main")
        title = product_main.h1.text.strip()
        price = product_main.p.text.strip()
        book = {"title": title, "price": price}
        all_books.append(book)
# Starting page as given in the answer; pages are walked until one stops
# answering with HTTP 200.
pg = 45
while True:
    url = f"https://books.toscrape.com/catalogue/page-{pg}.html"
    soup_status = get_page(url)  # [soup, status]
    if soup_status[1] == 200:
        print(f"scraping page {pg}")
        extract_info(get_links(soup_status[0]))
        pg += 1
    else:
        # A non-200 status is treated as "no more pages".
        print("The End")
        break
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/331294.html
標籤:
