我正試圖從 Transfermarkt(transfermarkt.com)網站上抓取一些足球資料。我想提取:
- 聯賽名稱
- 聯盟內的俱樂部
- 每位球員的資訊
我的代碼運行時沒有報錯,但沒有抓取到任何球員資訊(輸出只有球隊名稱和表頭)。我是資料抓取的新手,不確定為什麼它不起作用。請幫忙。
'league.py'
from bs4 import BeautifulSoup
import csv
from team import team
import requests
# Scrape the league overview page, follow each club link found in a centered
# table cell, and dump one CSV section per club into database.csv.
headers = {'user-agent': '>> put my user agent<< '}
url = "https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1/saison_id/2019"
numTeams = 20
teamcount = 0
result = requests.get(url, headers=headers)
src = result.content
soup = BeautifulSoup(src, 'lxml')

# newline='' is what the csv module expects for files it writes to; utf-8
# keeps non-ASCII player names from failing on platforms whose default
# encoding is narrower. The context manager closes the file even if a
# request or a parse step raises half-way through.
with open('database.csv', 'w', newline='', encoding='utf-8') as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    for td_tag in soup.find_all("td"):
        # Stop scanning once the requested number of clubs has been written.
        if teamcount >= numTeams:
            break
        # Only cells whose class attribute is exactly ['zentriert'] are
        # treated as club cells.
        if td_tag.get('class') != ['zentriert']:
            continue
        a_tag = td_tag.find('a')
        if a_tag is None:
            continue

        # Count the club so that the numTeams limit actually takes effect.
        teamcount += 1
        url = 'https://www.transfermarkt.com' + a_tag.get('href') + '/plus/1'
        print(url)
        t1 = team(url)

        club = a_tag.get('title')
        wr.writerow([club])
        wr.writerow(['Name', 'Club', 'Position', 'Nationality', 'DOB (Age)', 'Height', 'Foot', 'Date Joined', 'Contract Expires'])

        names = t1.getNames()
        bdays = t1.getBirth()
        pos = t1.getPos()
        # Nationality is not populated yet: the column is written as "N/A".

        for x, name in enumerate(names):
            playerdata = [name, club, pos[x], "N/A"]
            # getBirth() returns one flat list; five consecutive entries are
            # consumed per player to fill the last five columns.
            playerdata.extend(bdays[5 * x + offset] for offset in range(5))
            wr.writerow(playerdata)
'team.py'
import requests
from bs4 import BeautifulSoup
class team:
    """Download one page and expose simple extractors over its parsed HTML.

    The page is presumably a Transfermarkt club squad page (the calling
    script builds the URL from a club link); verify against the caller.

    NOTE: every class check below compares the whole class list with ``==``,
    so an element only matches when it carries exactly that one class and
    no others.
    """

    # Position labels recognised by getPos().
    _POSITIONS = (
        'Second Striker', 'Right Midfield', 'Left Midfield', 'Goalkeeper',
        'Left-Back', 'Centre-Back', 'Right-Back', 'Defensive Midfield',
        'Central Midfield', 'Attacking Midfield', 'Left Winger',
        'Right Winger', 'Centre-Forward',
    )

    def __init__(self, url):
        """Fetch ``url`` with a browser-like user agent and parse it with lxml."""
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
        result = requests.get(url, headers=headers)
        src = result.content
        self.soup = BeautifulSoup(src, 'lxml')

    def getNames(self):
        """Return the ``title`` of the first link in each matching span.

        A span matches when its class list is exactly ``['hide-for-small']``
        and it contains an ``<a>`` element.
        """
        names = []
        for span_tag in self.soup.find_all("span"):
            a_tag = span_tag.find('a')
            if a_tag is not None and span_tag.get('class') == ['hide-for-small']:
                names.append(a_tag.get('title'))
        return names

    def getID(self):
        """Return the ``id`` attribute of the first link in each matching span.

        Uses the same span selection as getNames().
        """
        ids = []
        for span_tag in self.soup.find_all("span"):
            a_tag = span_tag.find('a')
            if a_tag is not None and span_tag.get('class') == ['hide-for-small']:
                ids.append(a_tag.get('id'))
        return ids

    def getBirth(self):
        """Return the ``.string`` of every cell whose class list is exactly ``['zentriert']``.

        Cells whose ``.string`` is None are skipped. The result is one flat
        list covering all matching cells, not only birth dates.
        """
        bday = []
        for td_tag in self.soup.find_all('td'):
            if td_tag.get('class') == ['zentriert'] and td_tag.string is not None:
                bday.append(td_tag.string)
        return bday

    def getPos(self):
        """Return the ``.string`` of every cell that equals a known position label."""
        pos = []
        for td_tag in self.soup.find_all('td'):
            # Whole-string equality against the known labels.
            if td_tag.string in self._POSITIONS:
                pos.append(td_tag.string)
        return pos

    def getNat(self):
        """Return the ``title`` of every image whose class list is exactly ``['flaggenrahmen']``."""
        nat = []
        for img_tag in self.soup.find_all('img'):
            if img_tag.get('class') == ['flaggenrahmen'] and img_tag.string is None:
                nat.append(img_tag.get('title'))
        return nat
'輸出'
"Manchester City"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Liverpool FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Tottenham Hotspur"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Chelsea FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Manchester United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Arsenal FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Everton FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Leicester City"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Wolverhampton Wanderers"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"West Ham United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"AFC Bournemouth"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Newcastle United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Aston Villa"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Southampton FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Brighton & Hove Albion"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Watford FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Crystal Palace"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Burnley FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Norwich City"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Sheffield United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
uj5u.com熱心網友回復:
您的抓取代碼存在許多問題。您確實需要列印出 HTML 并確保您所要求的就是您想要的。中間除錯列印總是有幫助的。
首先,td_tag.string 回傳的是一個長字串,其中包含該標籤的組合內容。它可能包含字串 "Second Striker",但同時還夾雜了許多其他無關內容。此外,span_tag.get('class') == ['hide-for-small'] 只有在該 span 標籤僅有這一個 class 時才會匹配,而大多數 span 標籤還帶有其他 class。
這似乎有效。這是team.py:
import requests
from bs4 import BeautifulSoup
class team:
    """Download one page and expose simple extractors over its parsed HTML.

    Class checks use membership (``name in classes``), so an element matches
    even when it carries additional classes besides the one looked for.
    """

    # Position labels searched for by getPos().
    positions = ('Second Striker', 'Right Midfield', 'Left Midfield', 'Goalkeeper', 'Left-Back', 'Centre-Back', 'Right-Back', 'Defensive Midfield', 'Central Midfield', 'Attacking Midfield', 'Left Winger', 'Right Winger', 'Centre-Forward')

    def __init__(self, url):
        """Fetch ``url`` with a browser-like user agent and parse it with lxml."""
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
        result = requests.get(url, headers=headers)
        src = result.content
        self.soup = BeautifulSoup(src, 'lxml')

    @staticmethod
    def _has_class(tag, name):
        """Return True when ``name`` is among the classes of ``tag``."""
        classes = tag.get('class')
        return bool(classes) and name in classes

    def _player_links(self):
        """Yield the first ``<a>`` of every span carrying 'hide-for-small'."""
        for span_tag in self.soup.find_all("span"):
            if not self._has_class(span_tag, 'hide-for-small'):
                continue
            a_tag = span_tag.find('a')
            if a_tag is not None:
                yield a_tag

    def getNames(self):
        """Return the ``title`` attribute of each link found by _player_links()."""
        return [a_tag.get('title') for a_tag in self._player_links()]

    def getID(self):
        """Return the ``id`` attribute of each link found by _player_links()."""
        return [a_tag.get('id') for a_tag in self._player_links()]

    def getBirth(self):
        """Return the ``.string`` of every cell carrying the 'zentriert' class.

        Cells whose ``.string`` is None are skipped. The result is one flat
        list covering all matching cells, not only birth dates.
        """
        bday = []
        for td_tag in self.soup.find_all('td'):
            if self._has_class(td_tag, 'zentriert') and td_tag.string is not None:
                bday.append(td_tag.string)
        return bday

    def getPos(self):
        """Return the known position labels found inside table cell strings.

        A label is recorded once for each cell whose ``.string`` contains it
        as a substring; cells with an empty or missing string are skipped.
        """
        pos = []
        for td_tag in self.soup.find_all('td'):
            text = td_tag.string
            if not text:
                continue
            pos.extend(p for p in self.positions if p in text)
        return pos

    def getNat(self):
        """Return the ``title`` of every image carrying the 'flaggenrahmen' class."""
        nat = []
        for img_tag in self.soup.find_all('img'):
            if self._has_class(img_tag, 'flaggenrahmen') and not img_tag.string:
                nat.append(img_tag.get('title'))
        return nat
轉載請註明出處,本文鏈接:https://www.uj5u.com/caozuo/483242.html
