一、從網頁上下載指定檔案
- urlretrieve() 方法可以直接將遠端資料下載到本地。
- urlretrieve(url, filename=None, reporthook=None, data=None)
- url - 指定了從哪下載
- filename - 指定了保存到本地的路徑(如果未指定該引數,urllib 會生成一個臨時檔案來保存資料)
- reporthook - 是一個回呼函式,當連接上服務器、以及相應的資料塊傳輸完畢時會觸發該回呼,我們可以利用這個回呼函式來顯示當前的下載進度。
- data - 指 post 到服務器的資料。該方法回傳一個包含兩個元素的 (filename, headers) 元組:filename 表示保存到本地的路徑,headers 表示服務器的回應頭。
from urllib.parse import urljoin
from urllib.request import urlopen, urlretrieve

from bs4 import BeautifulSoup

# Page whose logo image is downloaded; also the base for resolving a relative src.
pageUrl = "http://www.pythonscraping.com"
html = urlopen(pageUrl)
bsObj = BeautifulSoup(html, "lxml")
# Locate the address of the logo image: the <img> inside <a id="logo">.
imageLocation = bsObj.find("a", {"id": "logo"}).find("img")["src"]
# urlretrieve needs an absolute URL, so resolve the src against the page URL
# (urljoin leaves an already-absolute src untouched), then save it as logo.jpg.
urlretrieve(urljoin(pageUrl, imageLocation), "logo.jpg")
二、下載頁面上所有帶有 src 屬性的標籤所指向的檔案
import os
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Prefix prepended to every local file name produced by getDownloadPath.
downloadDirectory = "downloaded/"
# Site root; getAbsoluteURL rejects any URL that does not contain this string.
baseUrl = "http://pythonscraping.com"
def getAbsoluteURL(baseUrl, source):
    """Normalise a ``src`` value into an absolute URL under ``baseUrl``.

    A leading ``www.`` is dropped from the host, relative paths are joined
    onto ``baseUrl`` with exactly one slash, and protocol-relative sources
    (``//host/path``) borrow the scheme of ``baseUrl``.

    Args:
        baseUrl: Site root, e.g. ``"http://pythonscraping.com"``.
        source: Raw ``src`` attribute value taken from a tag.

    Returns:
        The absolute URL as a string, or ``None`` when the URL does not live
        under ``baseUrl`` (an external link, including a different scheme).
    """
    base = baseUrl.rstrip("/")

    # Protocol-relative URL: it names a host, so it must not be treated as a
    # path relative to baseUrl.
    if source.startswith("//"):
        base_scheme = baseUrl.partition("://")[0] if "://" in baseUrl else "http"
        source = base_scheme + ":" + source

    scheme, sep, rest = source.partition("://")
    if sep and scheme in ("http", "https"):
        # Already absolute: only strip the "www." host prefix.
        if rest.startswith("www."):
            rest = rest[4:]
        url = scheme + "://" + rest
    elif source.startswith("www."):
        url = "http://" + source[4:]
    else:
        # Relative path: avoid a doubled slash when source starts with "/".
        url = base + "/" + source.lstrip("/")

    # Require baseUrl as a real prefix ending at a URL boundary; a plain
    # substring test would accept foreign hosts that merely mention baseUrl.
    if url != base and not url.startswith((base + "/", base + "?", base + "#")):
        return None
    return url
def correct_title(title):
    """Return ``title`` with characters that are illegal in file names removed.

    The removed characters are ``/ \\ : * ? " | < >``. Because ``/`` is among
    them, a URL path passed through this function collapses into a single
    flat file name.

    Args:
        title: The string to sanitise.

    Returns:
        A copy of ``title`` without any of the forbidden characters.
    """
    error_set = ['/', '\\', ':', '*', '?', '"', '|', '<', '>']
    # One translate() pass deletes every forbidden character at once instead
    # of rescanning the whole string with replace() per character.
    return title.translate({ord(c): None for c in error_set})
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map an absolute URL to a local file path, creating its directory.

    The ``www.`` host prefix and the leading ``baseUrl`` are removed, the
    remainder is sanitised with ``correct_title`` and appended to
    ``downloadDirectory``.

    Args:
        baseUrl: Site root that is stripped from the front of the URL.
        absoluteUrl: URL of the file about to be downloaded.
        downloadDirectory: Prefix for the local path (e.g. ``"downloaded/"``).

    Returns:
        The local path the file should be saved to.
    """
    path = absoluteUrl
    # Strip "www." only where it is the host prefix, so that file names which
    # happen to contain "www." are left intact.
    scheme, sep, rest = path.partition("://")
    if sep:
        if rest.startswith("www."):
            path = scheme + sep + rest[4:]
    elif path.startswith("www."):
        path = path[4:]
    # Remove baseUrl only as a prefix, not wherever it appears in the URL.
    if path.startswith(baseUrl):
        path = path[len(baseUrl):]
    path = downloadDirectory + correct_title(path)
    # Directory part of the target path; it must exist before the download.
    directory = os.path.dirname(path)
    # exist_ok avoids the check-then-create race; the guard skips the empty
    # directory name, which os.makedirs rejects.
    if directory:
        os.makedirs(directory, exist_ok=True)
    return path
html = urlopen("http://www.pythonscraping.com")
bsObj = BeautifulSoup(html, "lxml")
# Select every tag on the home page that carries a src attribute.
downloadList = bsObj.find_all(src=True)
for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download["src"])
    if fileUrl is None:
        # getAbsoluteURL returned None: nothing to download for this tag.
        continue
    print(fileUrl)
    try:
        urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
    except Exception as e:
        # Best-effort download: report the failure and move on to the next
        # file. Exception (not BaseException) keeps Ctrl-C and SystemExit
        # able to stop the script.
        print(str(e))
三、保存網頁的資料到CSV
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")
bsObj = BeautifulSoup(html, "lxml")
# The main comparison table is the first "wikitable" on the page.
table = bsObj.find_all("table", {"class": "wikitable"})[0]
rows = table.find_all("tr")
# The with-block closes the file even if writing a row fails.
with open("../files/editors.csv", 'wt', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        # One CSV row per <tr>: the text of each data/header cell, in order.
        csvRow = [cell.get_text() for cell in row.find_all(['td', 'th'])]
        writer.writerow(csvRow)
四、隨機漫步
import matplotlib.pyplot as plt
from random import choice
class RandomWalk:
    """Generate the points of a two-dimensional random walk."""

    def __init__(self, num_points=5000):
        """Initialise the walk.

        Args:
            num_points: Total number of points the walk should contain,
                including the starting point.
        """
        self.num_points = num_points
        # Every walk starts at the origin (0, 0).
        self.x_values = [0]
        self.y_values = [0]

    def _get_step(self):
        """Return one signed step along a single axis, between -4 and 4."""
        direction = choice([1, -1])
        distance = choice([0, 1, 2, 3, 4])
        return direction * distance

    def fill_walk(self):
        """Append random points until the walk holds ``num_points`` points."""
        while len(self.x_values) < self.num_points:
            # A zero x step gives a purely vertical move; a zero y step gives
            # a purely horizontal one.
            x_step = self._get_step()
            y_step = self._get_step()

            # Reject moves that go nowhere.
            if x_step == 0 and y_step == 0:
                continue

            # The next point is the last point offset by the two steps.
            self.x_values.append(self.x_values[-1] + x_step)
            self.y_values.append(self.y_values[-1] + y_step)
# Build a RandomWalk instance and plot all of the points it contains.
rw = RandomWalk()
rw.fill_walk()
# figure() sets the size of the plotting window; figsize is a
# (width, height) tuple in inches.
plt.figure(figsize=(10, 6))
# Colour each point by its position in the walk (light = early, dark = late).
point_numbers = list(range(rw.num_points))
plt.scatter(rw.x_values, rw.y_values, c=point_numbers, cmap=plt.cm.Blues,
            edgecolors='none', s=1)
# Emphasise the start point (green) and the end point (red).
plt.scatter(0, 0, c='green', edgecolors='none', s=100)
plt.scatter(rw.x_values[-1], rw.y_values[-1], c='red', edgecolors='none',
            s=100)
# Hide the axes. gca() returns the Axes the scatter calls drew on, whereas
# plt.axes() would add a brand-new Axes on current matplotlib versions.
ax = plt.gca()
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
plt.show()
五、呼叫 API 查看 GitHub 上星標最多的 Python 專案
import requests
# Make the API call and keep the response.
url = 'https://api.github.com/search/repositories?q=language:python&sort=stars'
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(url, timeout=10)
print("Status code:", r.status_code)
# Stop with a clear HTTP error (e.g. when rate-limited) instead of failing
# later with a KeyError on the missing 'total_count' key.
r.raise_for_status()
# Decode the JSON body of the API response.
response_dict = r.json()
print("Total repositories:", response_dict['total_count'])
# Explore the information about the returned repositories.
repo_dicts = response_dict['items']
print("Repositories returned:", len(repo_dicts))
print("\nSelected information about each repository:")
for repo_dict in repo_dicts:
    print('\nName:', repo_dict['name'])
    print('Owner:', repo_dict['owner']['login'])
    print('Stars:', repo_dict['stargazers_count'])
    print('Repository:', repo_dict['html_url'])
    print('Description:', repo_dict['description'])
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/195289.html
標籤:其他
上一篇:Numpy 數學函式及邏輯函式
下一篇:Python案例(三)
