這裡僅展示單執行緒的代碼,多執行緒的寫法可以自行探索,不再過多贅述
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/12/31 12:02
# @Author : huni
# @File : xxx單函式.py
# @Software: PyCharm
import requests
from lxml import etree
from urllib import parse
import os
# Site root used to resolve the (possibly relative) links scraped from pages.
BASE_URL = 'https://xchina.co/'


def safe_dir_name(title):
    """Return *title* made safe for use as a single directory name.

    Characters that are path separators or are rejected by common
    filesystems (``\\ / : * ? " < > |`` and ASCII control characters) are
    replaced with ``_``; surrounding whitespace and trailing dots are
    removed.  Falls back to ``'untitled'`` when nothing usable remains.
    """
    forbidden = '\\/:*?"<>|'
    cleaned = ''.join(
        '_' if ch in forbidden or ord(ch) < 32 else ch for ch in title
    )
    cleaned = cleaned.strip().rstrip('. ')
    return cleaned or 'untitled'


def fetch(url, headers, timeout=30):
    """GET *url* and return the response.

    Raises ``requests.RequestException`` on connection problems, on timeout
    (without ``timeout`` a stalled server would block the script forever),
    and on non-2xx status codes so that error pages are never mistaken for
    real content.
    """
    resp = requests.get(url=url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    return resp


def download_gallery(gallery_url, root_dir, headers):
    """Download every image of one gallery into ``root_dir/<page title>``.

    The gallery's first page supplies the title (used as the folder name)
    and the page count; each numbered page is then fetched and every linked
    image is written to disk.  A failure on a single page or image is
    reported and skipped; a failure fetching the gallery's first page
    propagates as ``requests.RequestException``.
    """
    tree = etree.HTML(fetch(gallery_url, headers).text)

    titles = tree.xpath('/html/head/title/text()')
    title_path = os.path.join(root_dir, safe_dir_name(titles[0] if titles else ''))
    os.makedirs(title_path, exist_ok=True)

    # The XPath is absolute, i.e. tied to the page layout at the time of
    # writing.  The 4th-from-last text node is presumably the last page
    # number of the pager -- TODO confirm.  When it cannot be read as an
    # integer, fall back to fetching only the first page instead of crashing.
    try:
        page_num = int(tree.xpath('/html/body/div[5]/div[1]/div/div[2]/div[3]/div[1]//text()')[-4])
    except (IndexError, ValueError):
        page_num = 1

    for j in range(1, page_num + 1):
        page_url = gallery_url.replace('.html', f'/{j}.html')
        try:
            page_tree = etree.HTML(fetch(page_url, headers).text)
        except requests.RequestException as exc:
            print(page_url, '頁面獲取失敗:', exc)
            continue

        for a in page_tree.xpath('/html/body/div[5]/div[1]/div/div[2]/div[3]/div[2]/a'):
            hrefs = a.xpath('./@href')
            if not hrefs:
                continue
            src = parse.urljoin(BASE_URL, hrefs[0])
            # Use only the URL path so a query string never ends up in the file name.
            jpg_name = os.path.basename(parse.urlsplit(src).path)
            if not jpg_name:
                continue
            try:
                jpg_data = fetch(src, headers).content
            except requests.RequestException as exc:
                print(jpg_name, '下載失敗:', exc)
                continue
            with open(os.path.join(title_path, jpg_name), 'wb') as fp:
                fp.write(jpg_data)
            print(jpg_name, '下載完成')


if __name__ == '__main__':
    # Root folder that receives one sub-folder per gallery.
    m_path = './xxx'
    os.makedirs(m_path, exist_ok=True)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
    }
    kw = '陸萱萱'
    keyword = parse.quote(kw, encoding='utf-8')
    url = f'https://xchina.co/search/keyword-{keyword}.html'

    # A failure of the search request itself is fatal: there is nothing to crawl.
    tree = etree.HTML(fetch(url, headers).text)
    # set() de-duplicates links that appear more than once in the result block.
    href_part_list = set(tree.xpath('/html/body/div[5]/div[1]/div/div[2]/div[3]/div[1]/div[1]//@href'))

    for href_part in href_part_list:
        # urljoin copes with hrefs with or without a leading slash.
        href = parse.urljoin(BASE_URL, href_part)
        try:
            tree1 = etree.HTML(fetch(href, headers).text)
        except requests.RequestException as exc:
            print(href, '頁面獲取失敗:', exc)
            continue

        for div in tree1.xpath('/html/body/div[5]/div[1]/div/div[2]/div[3]/div[2]/div'):
            links = div.xpath('./a[1]/@href')
            if not links:
                continue
            href1 = parse.urljoin(BASE_URL, links[0])
            try:
                download_gallery(href1, m_path, headers)
            except requests.RequestException as exc:
                # One broken gallery must not abort the remaining ones.
                print(href1, '頁面獲取失敗:', exc)
------寫在後面:
大家如果覺得小編的代碼有用,可以多多關注小編,
同時小編的公眾號也開通了,大家可以關注下,後續會進行粉絲回饋,大家一起學習python叭

打賞小編點這裡哦

轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/243584.html
標籤:python
上一篇:Python自動簽退腳本
