Python 抓取同一檔案夾下的 HTML 檔案時，出現錯誤 urllib3.exceptions.MaxRetryError-有解無憂

from urllib3 import *
from re import *
# Module-level urllib3 PoolManager; download() issues its requests through it.
http = PoolManager()
# Turn off urllib3's warnings (presumably the insecure-request ones — confirm this is intended).
disable_warnings()
def download(url):
    """Fetch *url* with an HTTP GET and return the response body as text.

    The request goes through the module-level ``http`` pool manager and the
    raw response bytes are decoded as UTF-8.
    """
    response = http.request('GET', url)
    return response.data.decode('utf-8')
def analyse(htmlStr):
    """Extract the link targets of all ``<a>`` tags in an HTML string.

    Each ``href`` value is resolved against the crawl root
    ``http://localhost:8888/files/``: relative links such as ``a.html``
    become ``http://localhost:8888/files/a.html``, while links that are
    already absolute (``https://...``) are returned unchanged instead of
    having the root glued in front of them.

    Args:
        htmlStr: HTML source text to scan.

    Returns:
        A list of absolute URLs, in document order. ``<a>`` tags without a
        quoted ``href`` attribute contribute nothing.
    """
    # Local import keeps this function self-contained; the regex helpers
    # (findall/search) come from the file's ``from re import *``.
    from urllib.parse import urljoin

    base = 'http://localhost:8888/files/'
    result = []
    # Require whitespace after the tag name so that <area>, <abbr>,
    # <article>, ... are not mistaken for anchors.
    for a in findall(r'<a\s[^>]*>', htmlStr):
        # The first group captures the quoted href value, e.g. a.html
        g = search(r"""href\s*=\s*['"]([^>'"]*)['"]""", a)
        if g is not None:
            result.append(urljoin(base, g.group(1)))
    return result
def crawler(url):
    """Crawl HTML files starting from the entry point *url*.

    Every page is printed, downloaded with ``download`` and scanned with
    ``analyse``; the links found are then crawled in turn, depth-first and
    in document order.

    Each URL is fetched at most once. Without that bookkeeping, pages that
    link back to one another would be crawled forever, and an explicit stack
    is used instead of recursion so deep link chains cannot exhaust the
    interpreter's recursion limit.

    Args:
        url: Entry-point URL to start crawling from.
    """
    visited = set()
    pending = [url]
    while pending:
        current = pending.pop()
        if current in visited:
            continue
        visited.add(current)
        # Show which URL is being fetched
        print(current)
        html = download(current)
        # Push in reverse so links are popped in the order they appear on
        # the page, matching a recursive depth-first traversal.
        pending.extend(reversed(analyse(html)))
# Crawl all HTML files, starting from the entry-point URL
crawler('http://localhost:8888/files')

HTML檔案的內容很簡單，比如：

<html>
    <head><title>index</title></head>
    <body>
    <a href="https://bbs.csdn.net/topics/a.html">first page</a>
    <p>
    <a href="https://bbs.csdn.net/topics/b.html">second page</a>
    <p>
    <a href="https://bbs.csdn.net/topics/c.html">third page</a>
    <p>
    </body>
</html>

出現錯誤：urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='localhost', port=8888): Max retries exceeded with url: /files (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000002344DEAC978>: Failed to establish a new connection: [WinError 10061] 由于目標計算機積極拒絕，無法連接。'))

轉載請註明出處，本文鏈接：https://www.uj5u.com/qita/93721.html

標籤：腳本語言(Perl/Python)

上一篇：dataframe 插入多行

下一篇：大俠在哪里！Pycharm跳轉關鍵字的時候，跳到了另一個目錄的一個同名的檔案，這個問題怎么處理呢