為什麼我的代碼一直在第一項上重複循環 - 有解無憂

我正在編寫一段代碼，從網站上抓取多位藝術家的歌詞，並將歌詞以 .txt 檔案的形式保存在以各自專輯命名的目錄中。

但是在我的程式處理完第一個藝術家之後，它會繼續對同一個藝術家重複循環。為什麼？

代碼：

import os
from bs4 import BeautifulSoup
import ssl
import time
os.chdir("D:/Folder")

import urllib.request

# Load the history file: it holds one link per song already scraped, so that
# a song saved on a previous run is not scraped again.
if os.path.isfile('hist'):
    with open('hist', 'r', encoding='utf-8') as file:
        history = file.read().split()
else:
    history=[]

artists=["lil wayne","bob dylan","beyonce"]
ssl._create_default_https_context = ssl._create_unverified_context
urlhome = "https://www.lyricsfreak.com/"
frontpage = urllib.request.urlretrieve(urlhome,"data/frontpage")
front = open("data/frontpage", encoding="utf-8").read()
soupfront = BeautifulSoup(front,features="lxml")

for artist in artists:
    if not os.path.exists("D:/Folder/" + str(artist)):
        os.mkdir("D:/Folder/" + str(artist))
    # The artist URL is the first letter, a slash, then the name with spaces
    # replaced by "+".
    link = urlhome + str(artist[0]) + "/" + artist.replace(" ", "+")
    getartist=urllib.request.urlopen(link)
    artistpage = BeautifulSoup(getartist,features="lxml")
    albums=artistpage.findAll("a", attrs={"class":"lf-link lf-link--secondary"})

    for album in albums:
        if str(artist[0]) + "/" + artist.replace(" ", "+") in album["href"]:
            albumurl = "https://www.lyricsfreak.com" + album["href"]
            albumpage = urllib.request.urlopen(albumurl)
            albumsoup = BeautifulSoup(albumpage,features="lxml")
            albumyear = albumsoup.find("div",attrs={"class":"lf-album__meta-item"}).text.strip()[-6:]

            albumname = album.text.strip() + " " + albumyear
        # NOTE: everything from here on sits outside the `if` above, so it also
        # runs for links that did not match and reuses the albumsoup/albumname
        # left over from the last matching link. This is the looping behaviour
        # the question asks about; it is kept here exactly as posted.
        if not os.path.exists("D:/Folder/" + str(artist) + "/" + albumname):
            os.mkdir("D:/Folder/" + str(artist) + "/" + albumname)
        songs = albumsoup.findAll("a",href=True,attrs={"class":"lf-link lf-link--secondary"})

        for song in songs:
            if song['href'] in history:
                print('Skipping', song['href'], '-already on drive')
                continue #if it's already scraped, it continues to the next song

            time.sleep(3)
            if "/album/" not in song["href"]:
                songurl = "https://www.lyricsfreak.com" + song["href"]
                songpage = urllib.request.urlopen(songurl)
                songsoup = BeautifulSoup(songpage,features="lxml")
                songname = songsoup.find("span",attrs={"class":"item-header-color"}).text[:-7]
                lyrics = songsoup.find("div",attrs={"id":"content"})
                fixedlyrics = lyrics.text.strip()
                lyricfile = open(artist + "/" + albumname + "/" + (songname) + ".txt","w")
                lyricfile.write(fixedlyrics)
                with open('hist', 'a', encoding='utf-8') as file: #a for append
                    file.write(song['href'] + '\n')
                print("parsing " + str(songname))

uj5u.com熱心網友回復：

位於下面這個 if 區塊之後的代碼：

# Only links whose href contains this artist's path are fetched as album pages.
if str(artist[0]) + "/" + artist.replace(" ", "+") in album["href"]:
            albumurl = "https://www.lyricsfreak.com" + album["href"]
            albumpage = urllib.request.urlopen(albumurl)
            albumsoup = BeautifulSoup(albumpage,features="lxml")
            albumyear = albumsoup.find("div",attrs={"class":"lf-album__meta-item"}).text.strip()[-6:]

            albumname = album.text.strip() + " " + albumyear

需要縮進，使其包含在該 if 條件語句之內。否則，當條件不成立時，程式只會跳過 if 裡的那一小塊，然後用上一次儲存在 albumurl 中的字串，把後面的所有操作再重複一遍。

完整代碼：

import os
from bs4 import BeautifulSoup
import ssl
import time
os.chdir("D:/Folder")

import urllib.request

# Links of songs scraped on earlier runs. A song listed here is skipped so
# that each one is downloaded only once. A set keeps the lookup cheap.
if os.path.isfile('hist'):
    with open('hist', 'r', encoding='utf-8') as file:
        history = set(file.read().split())
else:
    history = set()

artists=["lil wayne","bob dylan","beyonce"]
# Make urllib build HTTPS contexts that do not verify certificates.
ssl._create_default_https_context = ssl._create_unverified_context
urlhome = "https://www.lyricsfreak.com/"
frontpage = urllib.request.urlretrieve(urlhome,"data/frontpage")
with open("data/frontpage", encoding="utf-8") as frontfile:
    front = frontfile.read()
soupfront = BeautifulSoup(front,features="lxml")

for artist in artists:
    os.makedirs("D:/Folder/" + str(artist), exist_ok=True)
    # The artist path is the first letter, a slash, then the name with spaces
    # replaced by "+". It is used both to build the artist URL and to filter
    # the links found on the artist page.
    artistpath = str(artist[0]) + "/" + artist.replace(" ", "+")
    link = urlhome + artistpath
    getartist=urllib.request.urlopen(link)
    artistpage = BeautifulSoup(getartist,features="lxml")
    albums=artistpage.findAll("a", attrs={"class":"lf-link lf-link--secondary"})

    for album in albums:
        if artistpath in album["href"]:
            albumurl = "https://www.lyricsfreak.com" + album["href"]
            albumpage = urllib.request.urlopen(albumurl)
            albumsoup = BeautifulSoup(albumpage,features="lxml")
            albumyear = albumsoup.find("div",attrs={"class":"lf-album__meta-item"}).text.strip()[-6:]

            albumname = album.text.strip() + " " + albumyear

            os.makedirs("D:/Folder/" + str(artist) + "/" + albumname, exist_ok=True)              #<-- INDENT REST OF CODE
            songs = albumsoup.findAll("a",href=True,attrs={"class":"lf-link lf-link--secondary"})

            for song in songs:
                if song['href'] in history:
                    print('Skipping', song['href'], '-already on drive')
                    continue #if it's already scraped, it continues to the next song

                time.sleep(3)
                if "/album/" not in song["href"]:
                    songurl = "https://www.lyricsfreak.com" + song["href"]
                    songpage = urllib.request.urlopen(songurl)
                    songsoup = BeautifulSoup(songpage,features="lxml")
                    songname = songsoup.find("span",attrs={"class":"item-header-color"}).text[:-7]
                    lyrics = songsoup.find("div",attrs={"id":"content"})
                    fixedlyrics = lyrics.text.strip()
                    # The `with` block closes the lyric file; the explicit
                    # encoding avoids depending on the platform default.
                    with open(artist + "/" + albumname + "/" + songname + ".txt", "w", encoding="utf-8") as lyricfile:
                        lyricfile.write(fixedlyrics)
                    with open('hist', 'a', encoding='utf-8') as file: #a for append
                        file.write(song['href'] + '\n')
                    # Record the link in memory too, so it is skipped if it
                    # shows up again later in this same run.
                    history.add(song['href'])
                    print("parsing " + str(songname))

轉載請註明出處，本文鏈接：https://www.uj5u.com/shujuku/361652.html

標籤：Python html 循環 網頁抓取

上一篇：如何從Coinmarketcap中提取所有加密符號

下一篇：如何使用硒從網站上抓取資料