python爬蟲詳解（三）——爬取世界常用密碼并保存到字典內-有解無憂

今天的爬蟲有點特別，先爬取文本，然后base64解碼，然后再存盤至文本字典內

點個贊留個關注吧！！

首先我們需要爬取網站鏈接

代碼如下：

爬取后我們只要password=內容，只要內容，不需要鏈接，所以我們這里使用了

res_6 = re.findall('password=(.*)', e) #爬取密碼鏈接password=？

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
import requests

r = requests.get(f'https://www.passwordrandom.com/most-popular-passwords/page/1')   #要爬取的網站鏈接
html = r.content
soup = BeautifulSoup(html,'html.parser')    #html.parser是決議器

div_people_list = soup.find_all('table', attrs={'class': 'table'})

for a in div_people_list:
    for b in a.find_all('td'):
        for c in b.find_all('a', attrs={'rel': 'nofollow'}):
            e = c['href']   #鏈接
            res_6 = re.findall('password=(.*)', e) #爬取密碼鏈接password=？
            ty = res_6[0]  #讀取字典的第一位
            tu = ty.replace("%3d", "").strip()  #去除文本的%3d
            print(tu)

爬取后是這樣的，這是沒有解碼的

現在我們開始解碼，因為解碼是會有問題的，所有我中間解碼的時候又加了一道檢測程式，主要是檢測有沒有被解碼，如果沒有被解碼，則添加《=》并再次解碼，這樣就能全部解碼了

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
import base64
import requests
f = open('爬取檔案.txt','a+',encoding='UTF-8')  #寫入檔案
#base64解碼
def base64decoding(src):
    try:
        while True:
            src = base64.b64decode(src).decode()  #解碼
    except Exception:
        if src.endswith('=') == True: #判斷是否解碼成功
            ty = f'{src}='          #再次添加=并解碼
            base64decoding(ty)  #再次
        else:
            print(src)


r = requests.get(f'https://www.passwordrandom.com/most-popular-passwords/page/1')   #要爬取的網站鏈接
html = r.content
soup = BeautifulSoup(html,'html.parser')    #html.parser是決議器

div_people_list = soup.find_all('table', attrs={'class': 'table'})

for a in div_people_list:
    for b in a.find_all('td'):
        for c in b.find_all('a', attrs={'rel': 'nofollow'}):
            e = c['href']   #鏈接
            res_6 = re.findall('password=(.*)', e) #爬取密碼鏈接password=？
            ty = res_6[0]  #讀取字典的第一位
            tu = ty.replace("%3d", "").strip()  #去除文本的%3d
            base64decoding(tu+'=') #解碼

已解碼

現在我們需要寫入檔案，我把代碼又添加了一點

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
import base64
import requests

inpu = input('關注博主不迷路！！！\n\nhttps://jiangongfang.blog.csdn.net/\nhttps://blog.51cto.com/u_15449377\n\n-------------------------爬取世界常用密碼-------------------------\n使用方法：輸入數字1~100，數值越大，運行越慢\n1=100，2=200，3=300.....99=9900，100=10000\n-------------------------爬取世界常用密碼------------------------- \n請輸入數值：')
f = open('爬取檔案.txt','a+',encoding='UTF-8')  #寫入檔案
#base64解碼
def base64decoding(src):
    try:
        while True:
            src = base64.b64decode(src).decode()  #解碼
    except Exception:
        if src.endswith('=') == True: #判斷是否解碼成功
            ty = f'{src}='          #再次添加=并解碼
            base64decoding(ty)  #再次
        else:
            f.write(src+'\n') #寫入檔案內

for x in range(int(inpu)):
    r = requests.get(f'https://www.passwordrandom.com/most-popular-passwords/page/{x}')   #要爬取的網站鏈接
    html = r.content
    soup = BeautifulSoup(html,'html.parser')    #html.parser是決議器

    div_people_list = soup.find_all('table', attrs={'class': 'table'})

    for a in div_people_list:
        for b in a.find_all('td'):
            for c in b.find_all('a', attrs={'rel': 'nofollow'}):
                e = c['href']   #鏈接
                res_6 = re.findall('password=(.*)', e) #爬取密碼鏈接password=？
                ty = res_6[0]  #讀取字典的第一位
                tu = ty.replace("%3d", "").strip()  #去除文本的%3d
                base64decoding(tu+'=') #解碼

其中里面的數值控制著鏈接，填1就相當于爬取100個密碼，2就是200個，30就是3000個，當然，最高位100，填的越高，電腦運行越慢，配置低的盡量填50以下，太高怕你們電腦受不了

完整代碼：

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
import base64
import requests

inpu = input('關注博主不迷路！！！\n\nhttps://jiangongfang.blog.csdn.net/\nhttps://blog.51cto.com/u_15449377\n\n-------------------------爬取世界常用密碼-------------------------\n使用方法：輸入數字1~100，數值越大，運行越慢\n1=100，2=200，3=300.....99=9900，100=10000\n-------------------------爬取世界常用密碼------------------------- \n請輸入數值：')
f = open('爬取檔案.txt','a+',encoding='UTF-8')  #寫入檔案
#base64解碼
def base64decoding(src):
    try:
        while True:
            src = base64.b64decode(src).decode()  #解碼
    except Exception:
        if src.endswith('=') == True: #判斷是否解碼成功
            ty = f'{src}='          #再次添加=并解碼
            base64decoding(ty)  #再次
        else:
            f.write(src+'\n') #寫入檔案內

for x in range(int(inpu)):
    r = requests.get(f'https://www.passwordrandom.com/most-popular-passwords/page/{x}')   #要爬取的網站鏈接
    html = r.content
    soup = BeautifulSoup(html,'html.parser')    #html.parser是決議器

    div_people_list = soup.find_all('table', attrs={'class': 'table'})

    for a in div_people_list:
        for b in a.find_all('td'):
            for c in b.find_all('a', attrs={'rel': 'nofollow'}):
                e = c['href']   #鏈接
                res_6 = re.findall('password=(.*)', e) #爬取密碼鏈接password=？
                ty = res_6[0]  #讀取字典的第一位
                tu = ty.replace("%3d", "").strip()  #去除文本的%3d
                base64decoding(tu+'=') #解碼

轉載請註明出處，本文鏈接：https://www.uj5u.com/houduan/403973.html

標籤：python

上一篇：90后，要有多少存款才正常？答案太扎心了

下一篇：阿里云大學＞【Python學習路線】Python語言基礎自測考試 - 初級難度 | 包過關系列