python 爬蟲出現錯誤import re
from lxml import etree
import requests
import pymongo
import math
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"}
def get_districts():
url = "https://sh.lianjia.com/ershoufang/beicai/"
r = requests.get(url, headers=headers)
content = r.content.decode("utf-8")
root = etree.HTML(content)
div_nodes = root.xpath('//div[@data-role="ershoufang"]')
div_node = div_nodes[0]
a_nodes = div_node.xpath('./div/a')
result = []
for a_node in a_nodes:
district_name = a_node.text
district_url = "https://sh.lianjia.com" + a_node.attrib["href"]
result.append([district_name, district_url])
print(district_name)
return result
def get_sub_districts():
districts = get_districts()
client = pymongo.MongoClient()
db = client["house"]
for district in districts:
district_name = district[0]
district_url = district[1]
r = requests.get(district_url, headers=headers)
content = r.content.decode("utf-8")
root = etree.HTML(content)
a_nodes = root.xpath('//div[@data-role="ershoufang"]/div[2]/a')
for a_node in a_nodes:
sub_district_name = a_node.text
sub_district_url = "https://sh.lianjia.com" + a_node.attrib["href"]
db.subdistricts.insert({"district_name": district_name, "sub_district_name": sub_district_name,
"sub_district_url": sub_district_url})
r = requests.get(sub_district_url, headers=headers)
content = r.content.decode("utf-8")
root = etree.HTML(content)
span_node = root.xpath('//h2[contains(@class, "total")]/span')[0]
num = int(span_node.text)
return num
def get_page_num(sub_district_url):
r = requests.get(sub_district_url, headers = headers)
content = r.content.decode("utf-8")
root = etree.HTML(content)
span_node = root.xpath('//h2[contains(@class, "total")]/span')[0]
num = int(span_node.text)
return num
def get_houses_by_sub_district(district_name, sub_district_name, sub_district_url):
house_num = get_page_num(sub_district_url)
page_num = math.ceil(house_num/30)
client = pymongo.MongoClient()
db = client["house"]
for i in range(1, page_num+1, 1):
url_patt = sub_district_url + "pg{}"
url = url_patt.format(i)
r = requests.get(url, headers=headers)
content = r.content.decode("utf-8")
root = etree.HTML(content)
li_nodes = root.xpath('//ul[@class="sellListContent"]/li')
for li_node in li_nodes:
title = li_node.xpath('.//div[@class="title"]/a')[0].text
info_nodes = li_node.xpath('.//div[@class="address"]/div[@class="houseInfo"]/span')
xiaoqu_nodes = li_node.xpath('.//div[@class="flood"]/div[@class="positionInfo"]/a')
price_nodes = li_node.xpath('.//div[@class="priceInfo"]/div[@class="totalPrice"]/span')
up_nodes = li_node.xpath('.//div[@class="priceInfo"]/div[@class="unitPrice"]/span')
if len (price_nodes) > 0:
price = float(price_nodes[0].text)
if len(up_nodes) > 0:
up_text = up_nodes[0].text
matched = re.search(r'單價(.*)元/平米', up_text)
if matched:
up_price = float(matched.group(1))
if len(xiaoqu_nodes) > 0:
xiaoqu_node = xiaoqu_nodes[0]
xiaoqu_name = xiaoqu_node.text
if len(info_nodes) > 0:
info_text = info_nodes[0].tail
parts = info_text.split("|")
size_text = parts[1]
buildyear_text = parts[5]
matched = re.search(r'([\d\.]+)平米', size_text)
if matched:
size = float(matched.group(1))
matched = re.search(r'([\d\.]+)年建', buildyear_text)
if matched:
buildyear = int(matched.group(1))
huxing = parts[0]
chaoxiang = parts[2]
zhuangxiu = parts[3]
cenggao = parts[4]
louxing = parts[6]
house = {
"title": title,
"price": price,
"up_price": up_price,
"xiaoqu_name": xiaoqu_name,
"size": size,
"buildyear": buildyear,
"huxing": huxing,
"chaoxiang": chaoxiang,
"zhuangxiu": zhuangxiu,
"cenggao": cenggao,
"louxing": louxing,
"district_name": district_name,
"sub_district_name": sub_district_name,
}
db.house.insert(house)
def get_all_house():
client = pymongo.MongoClient()
db = client["house"]
cursor = db.subdistricts.find()
for item in cursor:
district_name = item["district_name"]
sub_district_name = item["sub_district_name"]
sub_district_url = item["sub_district_url"]
print(district_name, sub_district_name)
get_houses_by_sub_district(district_name, sub_district_name, sub_district_url)
if __name__ == "__main__":
get_all_house()
浦東 北蔡
Traceback (most recent call last):
File "C:/Users/微星/PycharmProjects/untitled1/clean.py", line 138, in <module>
get_all_house()
File "C:/Users/微星/PycharmProjects/untitled1/clean.py", line 133, in get_all_house
get_houses_by_sub_district(district_name, sub_district_name, sub_district_url)
File "C:/Users/微星/PycharmProjects/untitled1/clean.py", line 62, in get_houses_by_sub_district
house_num = get_page_num(sub_district_url)
File "C:/Users/微星/PycharmProjects/untitled1/clean.py", line 53, in get_page_num
r = requests.get(sub_district_url, headers = headers)
File "E:\Anaconda\envs\untitled1\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "E:\Anaconda\envs\untitled1\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "E:\Anaconda\envs\untitled1\lib\site-packages\requests\sessions.py", line 519, in request
prep = self.prepare_request(req)
File "E:\Anaconda\envs\untitled1\lib\site-packages\requests\sessions.py", line 462, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "E:\Anaconda\envs\untitled1\lib\site-packages\requests\models.py", line 313, in prepare
self.prepare_url(url, params)
File "E:\Anaconda\envs\untitled1\lib\site-packages\requests\models.py", line 387, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '北蔡': No schema supplied. Perhaps you meant http://北蔡?
uj5u.com熱心網友回復:
File "C:/Users/微星/PycharmProjects/untitled1/clean.py", line 53, in get_page_numr = requests.get(sub_district_url, headers = headers)
你列印一下sub_district_url是什么東西
uj5u.com熱心網友回復:
樓主解決了沒?uj5u.com熱心網友回復:
還有大神能幫幫我嗎uj5u.com熱心網友回復:
from urllib.parse import quotequote 一下 sub_district_url 的中文部分,再拼接回來,試試
uj5u.com熱心網友回復:
有大神能解市一下上面那位說的quote具體怎么操作嗎uj5u.com熱心網友回復:
百度
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/93730.html
