提取到的關鍵網頁代碼:
<tr onclick="location.href='https://bbs.csdn.net/city/sz.html';" style="cursor: pointer;">
<th>1</th>
<th>
<a href="https://bbs.csdn.net/city/sz.html" title="深圳房價行情,房價概況走勢,資料分析"> 深圳</a>
</th>
<th>74,929</th>
<th class="red">+18.96%</th>
<th class="red">+2.86%</th>
</tr>, <tr onclick="location.href='https://bbs.csdn.net/city/bj.html';" style="cursor: pointer;">
<th>2</th>
<th>
<a href="https://bbs.csdn.net/city/bj.html" title="北京房價行情,房價概況走勢,資料分析"> 北京</a>
</th>
<th>62,567</th>
<th class="green">-2.09%</th>
<th class="green">-4.76%</th>
</tr>, <tr onclick="location.href='https://bbs.csdn.net/city/sh.html';" style="cursor: pointer;">
......(後面各列的結構相同)
我的代碼:
import requests
from tool import useragenttool
import bs4
import re
import openpyxl
def open_url(url, timeout=10):
    """Download ``url`` and return the HTTP response.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``requests`` response object for the GET request.

    Raises:
        requests.RequestException: If the connection fails, the request
            times out, or the server answers with a 4xx/5xx status.
    """
    # NOTE(review): useragenttool is a project-local helper; presumably
    # get_headers() returns a dict of request headers -- confirm.
    res = requests.get(
        url,
        headers=useragenttool.get_headers(),
        timeout=timeout,
    )
    # Fail loudly on an error status instead of handing an error page to the
    # HTML parser, where it would surface as a confusing "element not found".
    res.raise_for_status()
    return res
def find_data(res):
    """Extract city, price and the two percentage columns from the page.

    The previous version iterated over whole ``<tr>`` rows and tested
    ``each.text.isnumeric()``. A row's text is the concatenation of all of
    its cells (rank, city, price, percentages), so that test was never true
    and the result list stayed empty. The cell-by-cell ``next()`` logic only
    makes sense when iterating over cells, so each row is now split into its
    cells explicitly.

    Args:
        res: Response-like object whose ``text`` attribute holds the HTML.

    Returns:
        A list with one ``[city, price, percent, percent]`` entry (all
        strings, percentages keep their sign) per data row. The list is
        empty when the ``gb-dataListBox`` container is not present.
    """
    datas = []
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    content = soup.find(class_="gb-dataListBox")
    if content is None:
        # Container missing (layout change, blocked request, ...): there is
        # nothing to parse, so report an empty result instead of crashing.
        print(datas)
        return datas

    for row in content.find_all("tr", style="cursor: pointer;"):
        # The sample markup uses <th> for every cell; <td> is accepted as
        # well in case some rows are rendered with regular data cells.
        cells = [cell.get_text(strip=True) for cell in row.find_all(["th", "td"])]
        # Expected layout: rank, city, price, percentage, percentage.
        if len(cells) < 5 or not cells[0].isnumeric():
            continue
        datas.append(cells[1:5])

    print(datas)
    return datas
def main():
    """Fetch the city ranking page and run the extraction on it."""
    page = open_url("https://www.creprice.cn/rank/cityforsale.html")
    # find_data prints the rows it extracted; its return value is not needed.
    find_data(page)


if __name__ == "__main__":
    main()
為什麼 print(datas) 輸出的 datas 串列是空的?我想爬取城市、房價以及後面的兩個百分比。新手百思不得其解,求大神解答。
uj5u.com熱心網友回復:
import requests
import bs4
import re
def find_data():
    """Scrape the creprice.cn city ranking and print one record per city.

    Each table row is reduced to its text, from which the city name, the
    price and the two percentage figures are pulled out in document order.
    Rows that do not contain all of these fields are skipped.

    Returns:
        A list of single-key dicts ``{city: [price, percent, percent]}``
        (all values are strings). The same list is printed. It is empty
        when the ``gb-dataListBox`` container is not found.

    Raises:
        requests.RequestException: If the page cannot be downloaded within
            the timeout.
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Connection': 'keep-alive'}
    # Compiled once, outside the loop; raw strings avoid invalid-escape
    # warnings for "\d".
    city_re = re.compile(r'[^\x00-\xff]+')
    # Also matches prices without a thousands separator.
    price_re = re.compile(r'\d[\d,]*')
    # Sign is optional so a plain "0%" is still recognised.
    rate_re = re.compile(r'[+-]?\d+(?:\.\d+)?%')

    res = requests.get(
        'https://www.creprice.cn/rank/cityforsale.html',
        headers=head,
        timeout=10,
    )
    content = bs4.BeautifulSoup(res.text, "html.parser").find(class_="gb-dataListBox")
    info_list = []
    if content is None:
        # Container missing: nothing to parse.
        print(info_list)
        return info_list

    for each in content.find_all("tr", style="cursor: pointer;"):
        text = each.text
        city = city_re.search(text)
        if city is None:
            continue
        # Start after the city name so the leading rank number is not
        # mistaken for the price.
        price = price_re.search(text, city.end())
        if price is None:
            continue
        rate = rate_re.findall(text, price.end())
        if len(rate) < 2:
            continue
        # Keep the percentages in document order, matching the output
        # published with this snippet (the earlier code swapped them).
        info_list.append({city.group(): [price.group(), rate[0], rate[1]]})

    print(info_list)
    return info_list


if __name__ == '__main__':
    find_data()
[{'深圳': ['74,929', '+18.96%', '+2.86%']}, {'北京': ['62,567', '-2.09%', '-4.76%']}, {'上海': ['54,911', '+5.85%', '-0.25%']}, {'廈門': ['47,817', '+5.66%', '+0.27%']}, {'三亞': ['38,291', '+12.01%', '+3.72%']}, {'廣州': ['35,934', '+6.13%', '+5.43%']}, {'杭州': ['31,487', '+4.1%', '+3.1%']}, {'南京': ['31,416', '+2.87%', '-0.24%']}, {'福州': ['26,288', '+0.55%', '+1.78%']}, {'天津': ['25,751', '+0.14%', '+1.4%']}, {'寧波': ['23,544', '+15.65%', '+0.5%']}, {'珠海': ['23,473', '+1.43%', '-0.37%']}, {'蘇州': ['23,294', '+6.32%', '-1.96%']}, {'青島': ['21,890', '+1.65%', '+0.76%']}, {'溫州': ['21,777', '+7.11%', '-1.31%']}, {'麗水': ['19,428', '+7.9%', '-2.74%']}, {'武漢': ['18,942', '+4.89%', '+0.3%']}, {'東莞': ['17,921', '+11.79%', '+0.86%']}, {'金華': ['17,279', '+5.54%', '-0.69%']}, {'成都': ['16,726', '+7.34%', '+3.11%']}, {'無錫': ['16,675', '+12.46%', '+0.13%']}, {'合肥': ['16,500', '+4.93%', '-0.73%']},...., {'鶴崗': ['2,307', '-2.19%', '-2.92%']}]
uj5u.com熱心網友回復:
謝謝大佬!!
uj5u.com熱心網友回復:
帖子是不是可以結一下?
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/19784.html
