宣告只是玩具爬蟲,得到自己的所有博客地址,然后隨機訪問,增加訪問量;
思想很簡單,包含了2個類:IPSpyder類和CSDN類。前者保證一周內get一次ip代理到本地;后者包含3個方法負責隨機讀取博客:getBlogList()方法的輸入是個人博客的主頁地址,輸出是個人博客所有的鏈接;getBlogTitleAndCount()的輸入是單個博客的url地址,拿到當前博客的訪問量和標題并輸出;beginTO()把前兩個方法串起來,每次隨機挑選一篇博客訪問;
后續優化:
- 增加tqdm的進度條顯示;
- 考慮多執行緒方式
IP代理的爬蟲參考:爬取IP代理
import requests
import lxml
from bs4 import BeautifulSoup
import os
import string
import random
import time
import aiohttp
import asyncio
from tqdm import tqdm
import os
import datetime
class IPSpyder(object):
    """Scrape free HTTP proxies from public listing sites and test them.

    Scraped ``ip:port`` strings accumulate in ``ip_list_all``; proxies that
    answer a probe request with HTTP 200 accumulate in ``ip_ok_list_all``.
    """

    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
        # Exclusive upper bounds of the ``range(1, N)`` page loops below.
        self.sixsix_url_range = 35
        self.kaixin_url_range = 2
        self.kuai_url_range = 2
        # Every scraped "ip:port" string, untested.
        self.ip_list_all = []
        # The subset of proxies that passed test_ip().
        self.ip_ok_list_all = []
        # Page requested through each proxy by run_test_ip_write_to_file().
        self.url = 'https://blog.csdn.net/yezonggang/article/details/112991188'
        # Destination file for the comma-separated list of working proxies.
        self.ip_avaliable_file = 'F:/2020-11-04/csdn_sypder_20210124/ip_avaliable.txt'

    def get_html(self, url, flag, timeout=15):
        """Download *url* and return its text.

        Args:
            url: Page to fetch.
            flag: Truthy to decode the body as UTF-8, falsy for GB2312.
            timeout: Seconds to wait before giving up, so that a dead site
                cannot hang the crawler forever.

        Returns:
            The decoded page text, or the placeholder string '請求例外' when
            the request fails. The placeholder contains no table rows, so
            the scrapers simply collect nothing from it.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException:
            return '請求例外'
        response.encoding = 'utf-8' if flag else 'gb2312'
        return response.text

    @staticmethod
    def _extract_rows(html, header_rows):
        """Return the stripped <td> texts of each usable table row in *html*.

        The first *header_rows* <tr> elements are skipped. Rows without a
        non-empty IP cell and port cell are dropped, so that layout rows or
        an error page cannot raise IndexError/TypeError in the callers.
        """
        soup = BeautifulSoup(html, 'lxml')
        rows = []
        for tr_ in soup.find_all(name='tr')[header_rows:]:
            cells = [(td.string or '').strip() for td in tr_.find_all(name='td')]
            if len(cells) >= 2 and cells[0] and cells[1]:
                rows.append(cells)
        return rows

    def get_66ip(self):
        """Scrape www.66ip.cn, one area-index page per iteration."""
        for index in range(1, self.sixsix_url_range):
            url = 'http://www.66ip.cn/areaindex_{}/1.html'.format(index)
            rows = self._extract_rows(self.get_html(url, flag=False), 2)
            province = ''
            for cells in rows:
                self.ip_list_all.append(cells[0] + ':' + cells[1])
                if len(cells) > 2:
                    province = cells[2]
            print('Saved {0} {1} ip.'.format(province, len(rows)))
            # Throttle: requesting too fast makes the site return no content.
            time.sleep(3)
        print('66 daili Finished!!!')

    def get_kaixinip(self):
        """Scrape www.kxdaili.com listing pages."""
        for index in range(1, self.kaixin_url_range):
            url = 'http://www.kxdaili.com/dailiip/1/{}.html'.format(index)
            rows = self._extract_rows(self.get_html(url, False), 2)
            for cells in rows:
                self.ip_list_all.append(cells[0] + ':' + cells[1])
            print('Saved {0} page {1} ip.'.format(index, len(rows)))
            # Throttle: requesting too fast makes the site return no content.
            time.sleep(3)
        print('kaixindaili Finished!!!')

    def get_goubanjiaip(self):
        """Scrape www.goubanjia.com, whose address cells mix real and decoy nodes.

        Only text nodes, attribute-less tags, tags whose first attribute is
        ``class``, and tags styled ``display:inline-block`` contribute to the
        assembled address; every other child is ignored.
        """
        url = 'http://www.goubanjia.com/'
        html = self.get_html(url, False)
        soup = BeautifulSoup(html, 'lxml')
        for td_ in soup.find_all(class_='ip'):
            parts = []
            for child in td_.children:
                attrs = getattr(child, 'attrs', None)
                if attrs is None:
                    # Plain text node (for example the ':' separator); it has
                    # no attributes to inspect.
                    parts.append(str(child))
                elif not attrs:
                    parts.append(child.get_text())
                elif next(iter(attrs)) == 'class':
                    parts.append(child.get_text())
                elif attrs.get('style') in ('display:inline-block;',
                                            'display: inline-block;'):
                    parts.append(child.get_text())
            ip_ = ''.join(parts).strip()
            if ip_:
                self.ip_list_all.append(ip_)
        print('quanwang daili Finished!!!')

    def get_kuaidaili(self):
        """Scrape www.kuaidaili.com free-proxy listing pages."""
        for index in range(1, self.kuai_url_range):
            url = 'https://www.kuaidaili.com/free/inha/{}/'.format(index)
            rows = self._extract_rows(self.get_html(url, False), 1)
            for cells in rows:
                self.ip_list_all.append(cells[0] + ':' + cells[1])
            print('Saved {0} page {1} ip.'.format(index, len(rows)))
            # Throttle: requesting too fast makes the site return no content.
            time.sleep(3)
        print('kuaidaili Finished!!!')

    async def test_ip(self, ip_, url):
        """Request *url* through proxy *ip_*; record the proxy on HTTP 200.

        Args:
            ip_: Proxy address as "ip:port".
            url: Page to request through the proxy.
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                proxy_ip = 'http://' + ip_
                print('正在測驗: ' + proxy_ip)
                async with session.get(url=url, headers=self.headers,
                                       proxy=proxy_ip, timeout=15) as response:
                    if response.status == 200:
                        print('代理可用: ' + ip_)
                        self.ip_ok_list_all.append(ip_)
                    else:
                        print('請求回應碼不合法 ' + ip_)
            except Exception:
                # A dead proxy is expected and must not abort the batch.
                # ``Exception`` (not a bare except) lets task cancellation
                # and KeyboardInterrupt propagate.
                print('代理請求失敗', ip_)

    def run_test_ip_write_to_file(self):
        """Test every scraped proxy and save the working ones to disk.

        NOTE(review): the published source omits this method's body. This
        implementation is reconstructed from the method name, from test_ip(),
        and from the calling script, which reads ``ip_avaliable_file`` and
        splits it on ','. Confirm against the author's original.
        """
        async def _test_all():
            if self.ip_list_all:
                await asyncio.gather(
                    *(self.test_ip(ip_, self.url) for ip_ in self.ip_list_all))

        asyncio.run(_test_all())
        folder = os.path.dirname(self.ip_avaliable_file)
        if folder:
            os.makedirs(folder, exist_ok=True)
        with open(self.ip_avaliable_file, 'w') as file_ip:
            file_ip.write(','.join(self.ip_ok_list_all))
# The article list is paginated: the base URL is followed by a page number
# (list/1, list/2, ...). ``my_list`` holds how many list pages to walk.
class CSDN(object):
    """Visit a randomly chosen article of one CSDN blog through a proxy."""

    # Seconds to wait on any single HTTP request before giving up.
    REQUEST_TIMEOUT = 15

    def __init__(self):
        self.my_csdn = 'https://blog.csdn.net/yezonggang/article/list/'
        # Number of list pages to crawl (pages 1..my_list).
        self.my_list = 5
        # URL of the article selected for the current round.
        self.csdn_url = ''
        self.proxies = [{'http': 'socks5://183.195.106.118:8118'}]
        # Article links collected by getBlogList(); cleared after each round.
        self.blogList = []
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
                        }

    @staticmethod
    def _is_article_link(href):
        """Return True when *href* is an article page, not its comment anchor."""
        return 'details' in href and 'comments' not in href

    @staticmethod
    def _build_proxies(proxy):
        """Return the requests ``proxies`` mapping for a plain HTTP proxy.

        Both entries use the ``http://`` scheme: the mapping key selects which
        traffic is routed, while the URL scheme says how to talk to the proxy
        itself, and these free proxies do not speak TLS.
        """
        proxy_url = 'http://' + proxy
        return {'http': proxy_url, 'https': proxy_url}

    def getBlogList(self):
        """Walk the list pages and append every article link to ``blogList``."""
        print('-------------------------------begin----------------------------')
        for page in range(1, self.my_list + 1):
            response = requests.get(self.my_csdn + str(page),
                                    headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
            content = response.content.decode('utf-8', errors='replace')
            soup = BeautifulSoup(content, 'lxml')
            for a_tag in soup.find_all('a'):
                # str() turns a missing href (None) into 'None', which the
                # filter below rejects.
                href = str(a_tag.get('href'))
                if self._is_article_link(href):
                    self.blogList.append(href)
            print('Success, already append ' +
                  str(len(self.blogList)) + ' to the blogList!')

    def getBlogTitleAndCount(self, proxy):
        """Fetch ``self.csdn_url`` through *proxy* and print title and read count.

        Args:
            proxy: Proxy address as "ip:port".

        Raises:
            requests.RequestException: If the request through the proxy fails.
        """
        response = requests.get(self.csdn_url, headers=self.headers,
                                proxies=self._build_proxies(proxy),
                                timeout=self.REQUEST_TIMEOUT)
        content = response.content.decode('utf-8', errors='replace')
        soup = BeautifulSoup(content, 'lxml')
        # A proxy may return an unrelated page with no <title>.
        blog_title = ''
        if soup.title is not None and soup.title.string:
            blog_title = str(soup.title.string)
        # The read counter is a <span> whose first CSS class is 'read-count';
        # stay with an empty value when the page does not contain one.
        blog_count_now = ''
        for span in soup.find_all('span'):
            span_classes = span.get('class')
            if span_classes and span_classes[0] == 'read-count' and span.string:
                blog_count_now = str(span.string)
        print('當前讀取的博客地址是:【' + self.csdn_url + '】\n' +
              '當前讀取的博客地址是:【' + blog_title + '】\n' +
              '當前使用的代理IP是:【' + proxy + '】\n' +
              '當前博客的閱讀統計是:【_' + blog_count_now + '_次】')

    def beginTO(self, proxy):
        """Run one round: collect the links, then visit one at random via *proxy*."""
        try:
            self.getBlogList()
            if not self.blogList:
                # random.choice() would raise IndexError on an empty list.
                print('No blog links found; skipping this round.')
                return
            self.csdn_url = random.choice(self.blogList)
            self.getBlogTitleAndCount(proxy)
        finally:
            # Always start the next round from an empty list, even when a
            # request failed part-way through this one.
            self.blogList = []
# Entry point. The local file of working proxies is rebuilt when it is
# missing, empty, or more than a week old; otherwise the crawler starts
# visiting blog pages straight away.
ip_avaliable = "F:/2020-11-04/csdn_sypder_20210124/ip_avaliable.txt"


def needs_refresh(path, max_age_days=7):
    """Return True when the proxy file at *path* must be rebuilt.

    The file is stale when it does not exist, is empty, or was last written
    more than *max_age_days* days ago. The modification time is used because
    it changes whenever the file is rewritten, whereas ``st_ctime`` means
    different things on different platforms.
    """
    try:
        info = os.stat(path)
    except OSError:
        # Missing or unreadable file: there is nothing usable on disk.
        return True
    if not info.st_size:
        return True
    age_days = (time.time() - info.st_mtime) / (3600 * 24)
    return age_days > max_age_days


def load_proxies(path):
    """Return the non-empty "ip:port" entries of the comma-separated file."""
    with open(path, 'r') as file_ip:
        entries = file_ip.read().split(",")
    return [entry.strip() for entry in entries if entry.strip()]


def main():
    """Refresh the proxy file if needed, then visit blog pages forever."""
    if needs_refresh(ip_avaliable):
        # Collect proxies first, then start visiting.
        # get_kaixinip(), get_goubanjiaip() and get_kuaidaili() are further
        # sources that are currently not used.
        ipSpyder = IPSpyder()
        ipSpyder.get_66ip()
        ipSpyder.run_test_ip_write_to_file()

    ip_avaliable_list = load_proxies(ip_avaliable)
    if not ip_avaliable_list:
        raise SystemExit('No usable proxy found in ' + ip_avaliable)

    csdn = CSDN()
    while True:
        # Pick a proxy per round so one dead proxy cannot stall every visit.
        proxy_now = random.choice(ip_avaliable_list)
        try:
            csdn.beginTO(proxy_now)
        except Exception as err:
            # Top-level boundary of an endless loop: report and carry on
            # rather than letting one failed round stop the crawler.
            print('Round failed with proxy ' + proxy_now + ': ' + repr(err))
        time.sleep(10)


if __name__ == '__main__':
    main()
#csdn 點贊關注私聊發^-^
#ipSpyder =IPSpyder()
# ipSpyder.get_66ip()
# ipSpyder.get_kaixinip()
# ipSpyder.get_goubanjiaip()
# ipSpyder.get_kuaidaili()
# ipSpyder.run_test_ip()
#
# time.localtime(statinfo)
#print ('得到了一系列的IP代理,總共有 '+str(len(ipSpyder.ip_list_all))+' 個;')
#print ('經過測驗總共有 '+str(len(ipSpyder.ip_ok_list_all))+' 個IP代理可用;')
#file = open("ip_avaliable.txt", 'w')
# file.write(ip_ok_list_all)
# file.close()
輸出的范例如下:
Success, already append 48 to the blogList!
Success, already append 96 to the blogList!
Success, already append 144 to the blogList!
Success, already append 192 to the blogList!
Success, already append 211 to the blogList!
當前讀取的博客地址是:【https://blog.csdn.net/yezonggang/article/details/105723456】
當前讀取的博客地址是:【資料挖掘演算法和實踐(一):線性回歸和邏輯回歸(house_price資料集)_葉子葉來-CSDN博客】
當前使用的代理IP是:【211.144.213.145:80】
當前博客的閱讀統計是:【_351_次】
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/251769.html
標籤:python
