import urllib,re,xlwt;
def get_content(url=None, timeout=10):
    """Download a JD.com search-results page and return it as text.

    Args:
        url: Page to fetch. Defaults to the hard-coded JD search URL this
            script was written for.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: The request failed or timed out.
        UnicodeDecodeError: The response body is not valid UTF-8.
    """
    # ``import urllib`` by itself does not load the ``urllib.request``
    # submodule, so it is imported explicitly where it is used.
    import urllib.request

    if url is None:
        url = "https://search.jd.com/Search?keyword=%E5%AE%9D%E9%A9%AC&enc=utf-8&wq=%E5%AE%9D%E9%A9%AC&pvid=a56e3239e2874140a0ad91f8435d91ee"

    # Send a browser-like User-Agent instead of urllib's default one;
    # presumably the site withholds the real page from the default agent.
    user_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.122 Safari/537.36"
    )
    request = urllib.request.Request(url, headers={"User-Agent": user_agent})

    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        html = response.read().decode('utf-8')

    # Echo the page so the caller can check what the server actually sent.
    print(html)
    return html
def get():
    """Extract product rows from the page returned by ``get_content()``.

    Returns:
        A list of 3-tuples of strings, one per regex match, holding in order:
        the link's ``title`` attribute, the text of the ``<i>`` inside the
        price ``<strong>``, and the text of the ``<em>`` inside the
        ``price-plus-1`` span. The list is empty when nothing matches.
    """
    page = get_content()

    # re.S lets ``.`` cross newlines, so a single match can span the several
    # lines of markup that make up one product entry.
    # NOTE(review): the class "J_1342717" looks specific to one product, so
    # this pattern may match at most that single item - confirm against the
    # real page markup.
    pattern = re.compile(r'class="gl-i-wrap" >.*?<a target="_blank" title="(.*?)".*?<strong class="J_1342717" data-done="1"><em>¥</em><i>(.*?)</i></strong>.*?<span class="price-plus-1" title="PLUS會員專享價"><em>(.*?)</em><i></i></span>', re.S)

    matches = pattern.findall(page)
    # Echo the extracted rows for inspection.
    print(matches)
    return matches
def write_excel(items):
    """Write the scraped rows to the spreadsheet file ``text4.xls``.

    Row 0 of the sheet ``text1`` holds the three column headings; each entry
    of ``items`` then fills one following row, starting at row 1.

    Args:
        items: Iterable of indexable records; elements 0, 1 and 2 of each
            record are written to columns 0, 1 and 2.
    """
    output_path = 'text4.xls'
    headings = ['名稱', '價格', '會員價']
    column_count = 3

    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('text1')

    # Heading row.
    for column, heading in enumerate(headings):
        sheet.write(0, column, heading)

    # Data rows begin directly under the headings.
    for row, record in enumerate(items, start=1):
        for column in range(column_count):
            sheet.write(row, column, record[column])

    workbook.save(output_path)
if __name__ == '__main__':
    # Script entry point: scrape the search results, then export them.
    write_excel(get())
uj5u.com熱心網友回復:
> 為什麼 Python 爬蟲爬京東商品資訊,連網頁原始碼都沒有?你指的是:
url = "https://search.jd.com/Search?keyword=%E5%AE%9D%E9%A9%AC&enc=utf-8&wq=%E5%AE%9D%E9%A9%AC&pvid=a56e3239e2874140a0ad91f8435d91ee";
a = urllib.request.urlopen(url);
html = a.read();
html = html.decode('utf-8');
print(html);
沒有輸出你要的html原始碼?
那十有八九是:沒有設定合適的headers,尤其是 User-Agent(以及其他header)
另外:
Python網路庫,建議把urllib換成 requests
再加上合適的 User-Agent 等 header
import requests

# Browser-like User-Agent sent in place of the library's default one;
# presumably the site withholds the real page from the default agent.
UserAgent_Mac_Chrome = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
headers = {'User-Agent': UserAgent_Mac_Chrome}
url = "https://search.jd.com/Search?keyword=%E5%AE%9D%E9%A9%AC&enc=utf-8&wq=%E5%AE%9D%E9%A9%AC&pvid=a56e3239e2874140a0ad91f8435d91ee"
# Always pass a timeout: without one, requests may wait on an unresponsive
# server indefinitely.
resp = requests.get(url, headers=headers, timeout=10)
print(resp.text)
估計就可以得到你要的html了。
官方文件:
快速上手 — Requests 2.18.1 文件
更多內容詳見(之後會發布)
Python心得:http網路庫
uj5u.com熱心網友回復:
好的,謝謝您。
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/87894.html
標籤:其他技術專區
