使用Python打造自己的資訊收集工具，你也來試試吧（附原始碼）-有解無憂

介紹：

該篇章主要介紹如何撰寫自己的資訊收集工具，主要流程如下：

1、向bing搜索引擎發起request請求，獲取url資料
2、使用正則運算式對獲取的資料進行處理
3、用多執行緒，對處理的資料進行二次請求，回傳標題等資料
4、使用openpyxl模塊，將資料保存為.xlsx格式

請注意：

該篇章的目的是熟悉Python編程，學習Python的一些常見模塊。在撰寫程式的過程中會用到很多操作和方法，望大家共同加油、學到東西。本文僅用於技術討論與研究，這裡使用的技術僅用於學習教育目的；如果列出的技術被用於其他任何目的，本站及作者概不負責。

本文涉及到模塊有：

#coding:utf-8
import requests     #發起request請求
import urllib3      #處理請求https例外報錯問題
import re       #使用正則運算式對請求到的資料進行處理
from optparse import OptionParser   #自定義輸入引數
import threading        #多執行緒模塊
import queue            #多執行緒輔助模塊，使用佇列的方式對多執行緒進行控制
from bs4 import BeautifulSoup   #與re類似 使用正則運算式對請求到的資料進行處理
import time,datetime    #獲取當前的時間
from openpyxl import  * #資料處理，將獲取到的資料保存在excel檔案中

屬性：

heads = {                       #全域變數  請求頭
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64)                          AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',   #模擬瀏覽器請求
'Connection':'close',
'Accept-Encoding':'gzip, deflate'
}
count=1                         #全域變數  用于序號欄位
queueLock = threading.Lock()    #全域變數  使用執行緒鎖處理 執行緒例外問題
class DoRun(threading.Thread):  #自定義 多執行緒運行時使用的類

方法：

def get_Input():    #獲取search陳述句 和 page
def getUrls(search,page):       #構造搜索陳述句，在bing搜索引擎搜索資料并回傳urls
def req(url):   #對url進行驗證，回傳numb,url,title,status
def init_excel(filename):  #創建.xlsx表格，并初始化內容
def Save_Date(date,filename):   #將資料存盤到表格當中
def run():     #核心代碼

完整代碼如下：

#coding:utf-8
import requests     #發起request請求
import urllib3      #處理請求https例外報錯問題
import re       #使用正則運算式對請求到的資料進行處理
from optparse import OptionParser   #自定義輸入引數
import threading        #多執行緒模塊
import queue            #多執行緒輔助模塊，使用佇列的方式對多執行緒進行控制
from bs4 import BeautifulSoup   #與re類似 使用正則運算式對請求到的資料進行處理
import time,datetime    #獲取當前的時間
from openpyxl import  * #資料處理，將獲取到的資料保存在excel檔案中

# Request headers sent with every outgoing HTTP request.
heads = {
    # Imitates a desktop Chrome browser.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'Connection': 'close',
    'Accept-Encoding': 'gzip, deflate',
}

# Next sequence number for the spreadsheet's first column; req() increments it.
count = 1

# Lock the worker threads hold while writing to the spreadsheet.
queueLock = threading.Lock()
class DoRun(threading.Thread):
    """Worker thread that probes queued URLs and records the successful ones.

    Each worker repeatedly takes a URL from the shared queue, checks it with
    ``req`` and, when ``req`` returns a result, stores it with ``Save_Date``.
    """

    def __init__(self, queue, filename):
        """Remember the shared work queue and the spreadsheet base name.

        Args:
            queue: Queue of URLs shared by all workers.
            filename: Spreadsheet name without the ``.xlsx`` suffix.
        """
        threading.Thread.__init__(self)
        self._queue = queue
        self._filename = filename

    def run(self):
        """Process URLs until the shared queue is exhausted."""
        while True:
            try:
                # A non-blocking get avoids the empty()/get() race: another
                # worker may take the last item between the two calls, which
                # would leave this thread blocked in get() forever.
                # (`queue` here is the module; the parameter of the same name
                # only exists inside __init__.)
                url = self._queue.get_nowait()
            except queue.Empty:
                return
            js = req(url)
            if not js:
                continue
            # The context manager releases the lock even if Save_Date raises,
            # so one failing write cannot deadlock the remaining workers.
            with queueLock:
                Save_Date(js, self._filename)
def init_excel(filename):
    """Create ``<filename>.xlsx`` with a header row.

    A sheet titled "域名" is inserted as the first sheet, the four column
    headings are written into its first row, and the workbook is saved.

    Args:
        filename: Output name without the ``.xlsx`` extension.
    """
    workbook = Workbook()
    sheet = workbook.create_sheet(title="域名", index=0)
    headings = ('序號', '域名', '標題', '狀態')
    for column, heading in enumerate(headings, start=1):
        sheet.cell(1, column).value = heading
    workbook.save(filename + ".xlsx")
def Save_Date(date, filename):
    """Write one result record into ``<filename>.xlsx``.

    The workbook is loaded, the record is written into row
    ``date['numb'] + 1`` of the first sheet (row 1 holds the header), and the
    workbook is saved again. Values are written one per column in the dict's
    iteration order, each converted with ``str()``.

    The workers in this file call this while holding ``queueLock``, because
    the load/modify/save cycle must not overlap between threads.

    Args:
        date: Result mapping; must contain the key ``'numb'``.
        filename: Spreadsheet name without the ``.xlsx`` extension.
    """
    path = filename + ".xlsx"
    workbook = load_workbook(path)
    sheet = workbook.worksheets[0]
    row = date['numb'] + 1
    for column, key in enumerate(date, start=1):
        sheet.cell(row, column).value = str(date[key])
    workbook.save(path)
def req(url):
    """Fetch *url* and describe the page if it answers with HTTP 200.

    Args:
        url: Address to request (certificate verification is disabled).

    Returns:
        A dict with the keys ``numb`` (sequence number), ``url``, ``title``
        and ``status`` when the response status is 200. ``None`` when the
        status differs or any exception occurs; a failure line is printed in
        both cases.
    """
    global count

    try:
        urllib3.disable_warnings()
        response = requests.get(url=url, headers=heads, verify=False, timeout=10)
        if response.status_code != 200:
            print('[-]請求失敗：\t{}\t{}'.format(url, response.status_code))
            return None

        # Pages without a <title> tag are still valid results; fall back to
        # the placeholder instead of failing on `None.text`.
        title_tag = BeautifulSoup(response.content, "html.parser").find("title")
        title = title_tag.text if title_tag is not None else "None"

        # req() runs in several threads at once; take the sequence number
        # under the lock so two results can never share one spreadsheet row.
        with queueLock:
            numb = count
            count += 1

        result = {
            'numb': numb,
            'url': url,
            'title': title,
            'status': response.status_code,
        }
        print("[+]" + url + "\ttitle:" + title)
        return result
    except Exception as e:
        # Best effort: one unreachable host must not stop the worker thread.
        print('[-]請求失敗: {}\t{}'.format(url, e))
        return None
def getUrls(search, page):
    """Query Bing for *search* and collect the result links.

    Args:
        search: Bing query, e.g. ``site:qq.com``.
        page: Number of result pages to fetch.

    Returns:
        List of unique ``href`` values taken from the result pages, in the
        order they were first seen. Pages that fail to load are reported on
        stdout and skipped.
    """
    # Local import keeps this function self-contained within the file.
    from urllib.parse import quote_plus

    template = "https://cn.bing.com/search?q={}&first={}"
    # Encode the query so characters such as '&', '#' or spaces cannot
    # corrupt the URL.
    query = quote_plus(search)
    link_pattern = re.compile(r'<a target="_blank" href="(.*?)"')

    urls = []
    seen = set()
    for i in range(1, page + 1):
        # `first` is the result offset of the page. Pages overlap slightly
        # (9, 19, ...); duplicates are filtered out below.
        first = 1 if i == 1 else (i - 2) * 10 + 9
        # Format into a fresh variable: overwriting the template would make
        # every later iteration request the first page again.
        url = template.format(query, first)
        try:
            resp = requests.get(url=url, headers=heads, timeout=10)
        except Exception as e:
            print('[-]請求失敗: {}\t{}'.format(url, e))
            continue
        if resp.status_code != 200:
            print('[-]請求失敗：\t{}\t{}'.format(url, resp.status_code))
            continue
        for u in link_pattern.findall(resp.text):
            if u not in seen:
                seen.add(u)
                urls.append(u)

    return urls
def get_Input():
    """Parse the command-line options.

    Returns:
        Tuple ``(search, page, threads)``. ``search`` is the sentinel string
        ``"search_def"`` when ``-s`` was not given; ``page`` and ``threads``
        both default to 10.
    """
    parser = OptionParser()
    # The help text now describes the search query; the previous text
    # described an unrelated option.
    parser.add_option('-s', '--search', action='store', type="string",
                      dest='search', help='搜索的語法，例如 site:qq.com',
                      default="search_def")
    parser.add_option("-p", "--page", action="store", type="int",
                      dest="page", help='要搜索的頁數', default=10)
    parser.add_option("-t", "--threads", action="store", type="int",
                      dest="threads", help='執行緒數量，默認為10', default=10)
    options, _args = parser.parse_args()
    return options.search, options.page, options.threads
def run():
    """Run the tool: search, probe every result in parallel, save to Excel.

    Prints the start time, reads the command-line options, collects result
    URLs with ``getUrls``, creates the spreadsheet, probes the URLs with
    ``thread_count`` ``DoRun`` workers and prints the end time. Returns early
    with a usage message when no search query was supplied.
    """
    print(datetime.datetime.now())              # start time
    search, page, thread_count = get_Input()
    if search == "search_def":
        print(r"[-]錯誤，未輸入指定引數：python3 temp.py -s site:qq.com [-p 10] [-t 20] ")
        return
    print(search)

    urls = getUrls(search, page)

    # The spreadsheet is named after the lowercase letters and digits of the
    # query. Fall back to a fixed name so a query without any of them does
    # not produce a file called just ".xlsx".
    filename = ''.join(re.findall("[a-z0-9]", search)) or "result"
    init_excel(filename)

    que = queue.Queue()
    for url in urls:
        que.put(url)

    # Every worker drains the same queue; Thread.start() invokes DoRun.run().
    threads = [DoRun(que, filename) for _ in range(thread_count)]
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()

    print(datetime.datetime.now())      # end time
# Script entry point.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this module also starts the tool.
run()

使用說明：

python3 .\bingying.py -s "site:.com" -p 10 -t 30
Options:
-h, --help  show this help message and exit
-s SEARCH, --search=SEARCH    搜索的語法（必填，例如 site:.com；未指定時程式會提示錯誤並退出）
-p PAGE, --page=PAGE          要搜索的頁數（一頁10條資料，默認10頁）
-t THREADS, --threads=THREADS 執行緒數量，(默認為10)

結果呈現

看到這里，覺得有幫助的小伙伴點贊支持一下博主~

轉載請註明出處，本文鏈接：https://www.uj5u.com/qita/356737.html

標籤：其他

上一篇：又一惡意軟體：1000多名受害者均在韓國，不排除其他地區被攻擊的可能

下一篇：【C++學習】——（四）型別