在標簽內查找值時使用bs4得到亂碼-有解無憂

def getAllBooksPagesURLs():
    """Return the URLs of all catalogue listing pages.

    The first entry is the site root; the remaining entries are the
    numbered listing pages 2 through 50.
    """
    front_page = r"http://books.toscrape.com/"
    numbered_page = r"http://books.toscrape.com/catalogue/page-%d.html"
    return [front_page] + [numbered_page % page_no for page_no in range(2, 51)]

def getAndParseURL(url):
    """Download *url* and return its body parsed as a BeautifulSoup tree."""
    # The decoded response text is fed to the built-in 'html.parser' backend.
    return BeautifulSoup(requests.get(url).text, 'html.parser')

def getBooksURLs(url,z):
    """Return the link of every book shown on a listing page.

    Args:
        url: Address of the listing page to download and parse.
        z: Prefix prepended to each book's ``href`` value. The hrefs are
            treated as relative paths, so this is presumably the base URL
            they resolve against (verify for pages other than the root).

    Returns:
        A list of strings, one per ``div`` with class ``image_container``.
    """
    soup = getAndParseURL(url)
    # The "+" joining prefix and href was missing, which made this line a
    # syntax error. find_all is the current name of the legacy findAll.
    return [z + x.a.get('href') for x in soup.find_all("div", class_="image_container")]

# Result containers for the scrape below.
# NOTE(review): title_list, main_page_list, list_of_rewiew_num (typo of
# list_of_review_num), list_of_bookpage and books_done_page are never used
# in the visible code.
books_url = []
title_list = []
main_page_list = []
list_of_rewiew_num = []
list_of_bookpage = []
list_of_resultitle = []
books_done_page = []
list_of_review_num=[]

# The [0:1] slice limits the run to the first listing page only.
for y in getAllBooksPagesURLs()[0:1]:
    main_page=getAndParseURL(y)
    result_of_title = main_page.findAll("h3")
    for x in  result_of_title:
        # Each <h3> wraps an <a> whose "title" attribute is read as the book title.
        list_of_resultitle.append(x.find("a").get("title"))
        # NOTE(review): this call and the loop below sit inside the per-title
        # loop, so the listing page and every book page are fetched again for
        # each <h3>, and list_of_review_num grows on every pass.
        books_url = getBooksURLs(y,y)

        for b in books_url:
    
             print(b)
             books_page = getAndParseURL(b)
             # A page without any <td> is recorded as 0.
             if books_page.find("td") is None:
                 list_of_review_num.append(0)
             else:
                 # find() returns only the FIRST <td> on the page.
                 # NOTE(review): judging by the sample output, that first cell
                 # looks like the UPC rather than the review count - confirm.
                 review_num =books_page.find("td").contents[0]

                 list_of_review_num.append(review_num)
# Bare expressions: they only display a value in an interactive session/notebook.
books_url
list_of_resultitle
list_of_review_num

以上是我的代碼，結果是

['a897fe39b1053632', '90fa61229261140a', '6957f44c3847a760', 'e00eb4fd7b871a48', '4165285e1663650f', 'f77dbf2323deb740', '2597b5a345f45e1b', 'e72a5dfc7e9267b2', 'e10e1e165dc8be4a', '1dfe412b8ac00530', '0312262ecafa5a40', '30a7f60cd76ca58c', ' ce6396b0f23f6ecc', '3b1c02bac2a429e6', 'a34ba96d4081e6a4', 'deda3e61b9514b83', 'feb7cc7701ecf901', 'e30f54cea9b38190', 'a18a4f5274854aced],

亂碼就像'a22124811bfa8350'，是關于動態html嗎？我不知道。我想要的 list_of_review_num 輸出應該是

[0,1,2,3]

如何獲得正確的輸出？你能幫我嗎？提前謝謝你

uj5u.com熱心網友回復：

您的代碼輸出這個結果的原因是您使用了 .find()，它只會找到第一個出現的 td 標簽。您抓取的頁面上有許多 td 標簽，而評論數位于最后一個 td 標簽中，所以您應該像下面這樣做。

# Look up every <td> once; an empty result means the page has no td tags at all.
td_cells = books_page.find_all("td")
if td_cells:
    # The value wanted here is in the last td, so index from the end.
    review_num = td_cells[-1].contents[0]

    list_of_review_num.append(review_num)
else:
    list_of_review_num.append(0)

uj5u.com熱心網友回復：

如上所述，您選取的是 UPC 資訊而不是評論數。我建議不要用這么多串列來存放結果，最好改用 dict：

# One dict per book: its title, its URL and every row of its product table.
data = []

# The [0:1] slice limits the run to the first listing page only.
for y in getAllBooksPagesURLs()[0:1]:
    # No separate fetch of the listing page is needed here: nothing in this
    # loop used it, and getBooksURLs downloads the page itself.
    books_url = getBooksURLs(y,y)

    for b in books_url:
        books_page = getAndParseURL(b)
        d = {
            'title': books_page.h1.text,
            'url':b
        }
        # Each table row's stripped strings are consumed as one key/value pair.
        d.update(dict(x.stripped_strings for x in books_page.select('table tr')))
        data.append(d)
# Bare expression: displays the result in an interactive session/notebook.
data

例子

import requests
from bs4 import BeautifulSoup

def getAllBooksPagesURLs():
    """Return the URLs of all catalogue listing pages.

    The first entry is the site root; the remaining entries are the
    numbered listing pages 2 through 50.
    """
    front_page = r"http://books.toscrape.com/"
    numbered_page = r"http://books.toscrape.com/catalogue/page-%d.html"
    return [front_page] + [numbered_page % page_no for page_no in range(2, 51)]

def getAndParseURL(url):
    """Download *url* and return its body parsed as a BeautifulSoup tree."""
    # The decoded response text is fed to the built-in 'html.parser' backend.
    return BeautifulSoup(requests.get(url).text, 'html.parser')

def getBooksURLs(url,z):
    """Return the link of every book shown on a listing page.

    Args:
        url: Address of the listing page to download and parse.
        z: Prefix prepended to each book's ``href`` value. The hrefs are
            treated as relative paths, so this is presumably the base URL
            they resolve against (verify for pages other than the root).

    Returns:
        A list of strings, one per ``div`` with class ``image_container``.
    """
    soup = getAndParseURL(url)
    # The "+" joining prefix and href was missing, which made this line a
    # syntax error.
    return [z + x.a.get('href') for x in soup.find_all("div", class_="image_container")]

# One dict per book: its title, its URL and every row of its product table.
data = []

# The [0:1] slice limits the run to the first listing page only.
for y in getAllBooksPagesURLs()[0:1]:
    books_url = getBooksURLs(y, y)

    for b in books_url:
        books_page = getAndParseURL(b)
        d = {
            'title': books_page.h1.text,
            'url': b,
            # Each table row's stripped strings are consumed as one key/value pair.
            **dict(row.stripped_strings for row in books_page.select('table tr')),
        }
        data.append(d)
# Bare expression: displays the result in an interactive session/notebook.
data

輸出：

[{'title': 'A Light in the Attic',
  'url': 'http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
  'UPC': 'a897fe39b1053632',
  'Product Type': 'Books',
  'Price (excl. tax)': '￡51.77',
  'Price (incl. tax)': '￡51.77',
  'Tax': '￡0.00',
  'Availability': 'In stock (22 available)',
  'Number of reviews': '0'},
 {'title': 'Tipping the Velvet',
  'url': 'http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
  'UPC': '90fa61229261140a',
  'Product Type': 'Books',
  'Price (excl. tax)': '￡53.74',
  'Price (incl. tax)': '￡53.74',
  'Tax': '￡0.00',
  'Availability': 'In stock (20 available)',
  'Number of reviews': '0'},...]

注意：在較新的代碼中，請避免使用舊語法 findAll()，而應改用 find_all()，或搭配 CSS 選擇器使用 select()。更多資訊請花一分鐘時間查看官方文件。

轉載請註明出處，本文鏈接：https://www.uj5u.com/gongcheng/519399.html

標籤：Python、html、網頁抓取、BeautifulSoup

上一篇：如何使用python獲取url中的一些表資料

下一篇：使用selenium關閉網站中的廣告按鈕