Python爬圖蟲網站圖片-有解無憂

import re
import requests
import urllib.request
from bs4 import BeautifulSoup
from urllib.request import urlopen
import time
import os

# Browser-like User-Agent sent with every urllib request built in this script.
headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}

# Landing page whose source is searched for '"topicId":<value>,' fragments.
urlindex='https://stock.tuchong.com/?source=tc_pc_home'
reqindex=urllib.request.Request(url=urlindex,headers=headers)

# Read the page inside a context manager so the HTTP response is always
# closed, and bound the wait so a stalled server cannot hang the script.
with urllib.request.urlopen(reqindex, timeout=30) as respIndex:
    htmlCodeIndex=respIndex.read()
dataIndex=htmlCodeIndex.decode('UTF-8')

# Non-greedy match: everything from '"topicId":' up to the first comma.
regIndex=r'"topicId":.*?,'
reg_ImgIndex=re.compile(regIndex)
# Raw matches (prefix and trailing comma included), one string per topic.
imglistIndex=reg_ImgIndex.findall(dataIndex)
print(imglistIndex)

# Topic counter, starts at zero.
xIndex=0

# Pattern for the image ids found in a topic page. It never changes between
# topics, so it is compiled once instead of once per iteration.
reg = r'"imageId":".*?"'
reg_img = re.compile(reg)

# For every topic id scraped from the index page: fetch that topic's page,
# collect its image ids, and download each image as a numbered .webp file.
for igIndex in imglistIndex:
    xIndex = xIndex + 1

    # Each match looks like '"topicId":<value>,'. Drop the 10-character
    # '"topicId":' prefix and the trailing comma, then any whitespace or
    # quotes that surround the value.
    indexId = igIndex[10:-1].strip().strip('"')
    print(indexId)
    if not indexId:
        # Nothing usable was captured; there is no topic to request.
        continue

    # Request the topic that was actually found on the index page (the id was
    # previously hard-coded, so every iteration fetched the same topic).
    topicId = indexId
    url = 'https://stock.tuchong.com/topic?topicId=' + str(topicId)
    req = urllib.request.Request(url=url, headers=headers)
    try:
        # The context manager closes the response; the timeout bounds the wait.
        with urllib.request.urlopen(req, timeout=30) as resp:
            htmlCode = resp.read()
    except OSError as e:
        # URLError, HTTPError and socket timeouts are all OSError subclasses.
        # One unreachable topic should not abort the remaining ones.
        print(1, e)
        continue
    data = htmlCode.decode('UTF-8')

    imglist = reg_img.findall(data)
    print(imglist)
    # Each match looks like '"imageId":"<id>"': strip the 11-character
    # '"imageId":"' prefix and the closing quote to keep only the id.
    imglist2 = [img[11:-1] for img in imglist]
    print(imglist2)

    # file_path is a filename *prefix*, not a directory: images are written as
    # <file_path><n>.webp, so the directory that must exist is its parent.
    file_path = 'E:/新建檔案夾4/爬蟲圖片/圖片'+indexId+'/圖片'
    try:
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
    except OSError as e:
        print(1, e)
        continue

    # Images are numbered from 1 within each topic.
    for x, ig in enumerate(imglist2, start=1):
        print(ig)

        image_url = "https://icweiliimg6.pstatp.com/weili/l/" + ig + ".webp"
        filetype = '.webp'
        filename = '{}{}'.format(file_path + str(x), filetype)
        print(filename)

        try:
            # requests is used instead of urllib.request.urlretrieve, which
            # the original author noted could fail with 403 Forbidden.
            res = requests.get(image_url, headers=headers, timeout=30)
            # Do not save an HTTP error page as if it were an image.
            res.raise_for_status()
            # Save under the .webp name: the payload is WebP, not JPEG.
            with open(filename, 'wb') as f:
                f.write(res.content)
        except IOError as e:
            # Covers file-system errors and requests' own exceptions
            # (requests.RequestException derives from IOError).
            print(1, e)
        except Exception as e:
            print(2, e)

        # Pause between downloads to avoid hammering the image host.
        time.sleep(2)

uj5u.com熱心網友回復：

這是給大家的例子嗎？

uj5u.com熱心網友回復：

參考 1 樓 dabingsou 的回復:

這是給大家的例子嗎？

是啊，寫了一個小例子。

轉載請註明出處，本文鏈接：https://www.uj5u.com/qita/106379.html

標籤：腳本語言(Perl/Python)

上一篇：print輸出見鬼了

下一篇：那個python大神，把這個python寫的代碼轉成java代碼啊，謝謝了