我正在撰寫一段程式碼,用來從網站上抓取多位藝術家的歌詞,並將歌詞以 .txt 檔案的形式保存在以各自專輯命名的目錄中。
但是程式處理完第一位藝術家之後,卻一直在同一位藝術家上重複回圈。為什麼?
代碼:
# Lyrics scraper as posted in the question: for each artist, walk the album
# links on the artist page and save every song's lyrics as a .txt file under
# <artist>/<album>/. The '+' operators and the indentation are restored here;
# the logic error the question asks about is intentionally left in place.
import os
from bs4 import BeautifulSoup
import ssl
import time
os.chdir("D:/Folder")
import urllib.request

# The 'hist' file stores the link of every song already scraped (read mode),
# so that each song is downloaded only once across runs.
if os.path.isfile('hist'):
    with open('hist', 'r', encoding='utf-8') as file:
        history = file.read().split()
else:
    history = []

artists = ["lil wayne", "bob dylan", "beyonce"]
# Turns off HTTPS certificate verification for the urllib calls below.
ssl._create_default_https_context = ssl._create_unverified_context
urlhome = "https://www.lyricsfreak.com/"
frontpage = urllib.request.urlretrieve(urlhome, "data/frontpage")
front = open("data/frontpage", encoding="utf-8").read()
soupfront = BeautifulSoup(front, features="lxml")

for artist in artists:
    if not os.path.exists("D:/Folder/" + str(artist)):
        os.mkdir("D:/Folder/" + str(artist))
    # Artist URL is built as <home>/<first letter>/<name with '+' for spaces>.
    link = urlhome + str(artist[0]) + "/" + artist.replace(" ", "+")
    getartist = urllib.request.urlopen(link)
    artistpage = BeautifulSoup(getartist, features="lxml")
    albums = artistpage.findAll("a", attrs={"class": "lf-link lf-link--secondary"})
    for album in albums:
        if str(artist[0]) + "/" + artist.replace(" ", "+") in album["href"]:
            albumurl = "https://www.lyricsfreak.com" + album["href"]
            albumpage = urllib.request.urlopen(albumurl)
            albumsoup = BeautifulSoup(albumpage, features="lxml")
            # Keeps the last 6 characters of the meta item (presumably the year).
            albumyear = albumsoup.find("div", attrs={"class": "lf-album__meta-item"}).text.strip()[-6:]
            albumname = album.text.strip() + " " + albumyear
        # NOTE: everything below is NOT nested inside the `if` above, so it
        # also runs for links that fail the test and reuses the previous
        # albumsoup/albumname. This is the behaviour the question is about.
        if not os.path.exists("D:/Folder/" + str(artist) + "/" + albumname):
            os.mkdir("D:/Folder/" + str(artist) + "/" + albumname)
        songs = albumsoup.findAll("a", href=True, attrs={"class": "lf-link lf-link--secondary"})
        for song in songs:
            if song['href'] in history:
                print('Skipping', song['href'], '-already on drive')
                continue  # already scraped, move on to the next song
            time.sleep(3)
            if "/album/" not in song["href"]:
                songurl = "https://www.lyricsfreak.com" + song["href"]
                songpage = urllib.request.urlopen(songurl)
                songsoup = BeautifulSoup(songpage, features="lxml")
                # Drops the last 7 characters of the header text.
                songname = songsoup.find("span", attrs={"class": "item-header-color"}).text[:-7]
                lyrics = songsoup.find("div", attrs={"id": "content"})
                fixedlyrics = lyrics.text.strip()
                lyricfile = open(artist + "/" + albumname + "/" + (songname) + ".txt", "w")
                lyricfile.write(fixedlyrics)
                with open('hist', 'a', encoding='utf-8') as file:  # a for append
                    file.write(song['href'] + '\n')
                print("parsing " + str(songname))
uj5u.com熱心網友回復:
之后的代碼塊:
# Only links whose href contains "<first letter>/<artist with '+' for spaces>"
# are treated as this artist's albums; the album page is then fetched and
# the album folder name is derived from it.
if str(artist[0]) + "/" + artist.replace(" ", "+") in album["href"]:
    albumurl = "https://www.lyricsfreak.com" + album["href"]
    albumpage = urllib.request.urlopen(albumurl)
    albumsoup = BeautifulSoup(albumpage, features="lxml")
    # Keeps the last 6 characters of the meta item (presumably the year).
    albumyear = albumsoup.find("div", attrs={"class": "lf-album__meta-item"}).text.strip()[-6:]
    albumname = album.text.strip() + " " + albumyear
需要縮進,才能包含在該條件陳述句(if)之內。否則,當條件不成立時,程式只會跳過那一小段,然後用最後一次存入 albumurl 的字串(以及對應的 albumsoup、albumname)把後面的所有內容重複執行一遍。
完整代碼:
# Lyrics scraper (corrected version): for each artist, visit every album
# linked from the artist page and save each song's lyrics as a .txt file
# under D:/Folder/<artist>/<album>/. Links of scraped songs are appended to
# the 'hist' file so a later run can skip them.
import os
import ssl
import time
import urllib.request

from bs4 import BeautifulSoup

os.chdir("D:/Folder")

# The 'hist' file stores the link of every song already scraped, so that
# each song is downloaded only once across runs.
if os.path.isfile('hist'):
    with open('hist', 'r', encoding='utf-8') as file:
        history = file.read().split()
else:
    history = []

artists = ["lil wayne", "bob dylan", "beyonce"]
# Turns off HTTPS certificate verification for the urllib calls below.
ssl._create_default_https_context = ssl._create_unverified_context
urlhome = "https://www.lyricsfreak.com/"
frontpage = urllib.request.urlretrieve(urlhome, "data/frontpage")
with open("data/frontpage", encoding="utf-8") as frontfile:
    front = frontfile.read()
soupfront = BeautifulSoup(front, features="lxml")

for artist in artists:
    if not os.path.exists("D:/Folder/" + str(artist)):
        os.mkdir("D:/Folder/" + str(artist))
    # Path fragment "<first letter>/<name with '+' for spaces>"; used both to
    # build the artist URL and to recognise this artist's album links.
    artistpath = str(artist[0]) + "/" + artist.replace(" ", "+")
    link = urlhome + artistpath
    getartist = urllib.request.urlopen(link)
    artistpage = BeautifulSoup(getartist, features="lxml")
    albums = artistpage.findAll("a", attrs={"class": "lf-link lf-link--secondary"})
    for album in albums:
        if artistpath in album["href"]:
            albumurl = "https://www.lyricsfreak.com" + album["href"]
            albumpage = urllib.request.urlopen(albumurl)
            albumsoup = BeautifulSoup(albumpage, features="lxml")
            # Keeps the last 6 characters of the meta item (presumably the year).
            albumyear = albumsoup.find("div", attrs={"class": "lf-album__meta-item"}).text.strip()[-6:]
            albumname = album.text.strip() + " " + albumyear
            # <-- The rest of the code is indented so it belongs to the `if`:
            # it must only run for links that really are this artist's albums,
            # otherwise the previous albumsoup/albumname would be reused.
            if not os.path.exists("D:/Folder/" + str(artist) + "/" + albumname):
                os.mkdir("D:/Folder/" + str(artist) + "/" + albumname)
            songs = albumsoup.findAll("a", href=True, attrs={"class": "lf-link lf-link--secondary"})
            for song in songs:
                if song['href'] in history:
                    print('Skipping', song['href'], '-already on drive')
                    continue  # already scraped, move on to the next song
                time.sleep(3)
                if "/album/" not in song["href"]:
                    songurl = "https://www.lyricsfreak.com" + song["href"]
                    songpage = urllib.request.urlopen(songurl)
                    songsoup = BeautifulSoup(songpage, features="lxml")
                    # Drops the last 7 characters of the header text.
                    songname = songsoup.find("span", attrs={"class": "item-header-color"}).text[:-7]
                    lyrics = songsoup.find("div", attrs={"id": "content"})
                    fixedlyrics = lyrics.text.strip()
                    # `with` closes the file so the lyrics are flushed to disk;
                    # explicit UTF-8 avoids encode errors from the platform
                    # default codec on non-ASCII lyrics.
                    lyricpath = artist + "/" + albumname + "/" + songname + ".txt"
                    with open(lyricpath, "w", encoding="utf-8") as lyricfile:
                        lyricfile.write(fixedlyrics)
                    with open('hist', 'a', encoding='utf-8') as file:  # a for append
                        file.write(song['href'] + '\n')
                    print("parsing " + str(songname))
轉載請註明出處,本文鏈接:https://www.uj5u.com/shujuku/361652.html
下一篇:如何使用硒從網站上抓取資料
