# Section 1: basic steps of a Python crawler.
#
# Builds a request with custom headers, sends it, and decodes the body to text.

# 1. Create the request object (Request()).
url = "http://..."

# 1.1 To rotate between several User-Agent strings, pick one per run:
#     agents = [agent1, agent2, agent3, agent4, agent5]
#     agent = random.choice(agents)
headers = {
    "User-Agent": "",  # browser disguise, to get past anti-crawler checks
    # 1.1 "User-Agent": agent,
    "Cookie": "",  # cookie used to simulate a logged-in session
}

# 1.2 Create the customised request object.
req = urllib.request.Request(url, headers=headers)

# 2. Get the response object (urlopen()).
# The response is used as a context manager so the underlying connection is
# closed as soon as the body has been read.
with urllib.request.urlopen(req) as res:
    # 3. Get the content: read() yields bytes, decode("utf-8") turns them
    #    into a string.
    #    decode(): bytes -> string
    #    encode(): string -> bytes
    html = res.read().decode("utf-8")

# Steps 2-3 can also be combined into one expression:
#     html = request.urlopen(req).read().decode("utf-8")
print(html)
# 1.構建處理器物件(專門處理請求的物件) http_hander = request.HTTPHandler() # 2.創建自定義opener opener = request.build_opener(http_hander) # 3.創建自定義請求物件 req = request.Request("http://www.baidu.com") # 4.1 發送請求,獲取回應 # reponse = opener.open(req).read() # 4.2 把自定義opener設定為全域,這樣urlopen發送的請求也會使用自定義的opener request.install_opener(opener) reponse = request.urlopen(req).read() print(reponse)二、自定義opener
# Section 3: handling a GET request with urlencode.
#
# Reads a search term, URL-encodes it as the "wd" query parameter, and prints
# the decoded response body.

# 1. Read the search term from the terminal.
key = input("請輸入要搜索的內容:")

# 2. URL-encode the query parameters (a dict) with urlencode().
wd = {"wd": key}
wdd = urllib.parse.urlencode(wd)

# 3. Append the encoded query string to the base address.
url = "http://www.baidu.com/s?" + wdd

# 4. Create the request object.
req = request.Request(url)

# 5. Send it, read the body and decode it to text.
body = request.urlopen(req).read()
reponse = body.decode()
print(reponse)
# Section 4: handling a POST request (Youdao translate).
#
# Sends the text typed by the user as form data and prints the first "tgt"
# value found in the reply.

# 1. Build the request headers.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/79.0.3928.4 Safari/537.36"
}
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"

# 2. Read the text to translate from the terminal.
key = input("請輸入要搜索的內容:")

# NOTE(review): salt / sign / lts / bv are hard-coded; presumably they were
# captured from one browser session and the service may reject them - confirm.
formdata = {
    "i": key,
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    "salt": "16003477829589",
    "sign": "3f351e5f7e0d84706ef063ccabe3e169",
    "lts": "1600347782958",
    "bv": "cb9a601990a9118249221b303a87fd75",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_REALTlME",
}

# 3. Request() needs the POST body as bytes: urlencode the dict, then encode.
data = urllib.parse.urlencode(formdata).encode(encoding='utf-8')

# 4. Send the request, get the response, read and decode the content.
#    Passing data= is what turns the request into a POST.
req = request.Request(url, data=data, headers=header)
resp = request.urlopen(req).read().decode()

# 5. Regular expression: extract whatever sits between "tgt":" and "}]]}.
pat = r'"tgt":"(.*?)"}]]}'
result = re.findall(pat, resp)
if result:
    print(result[0])
else:
    # Nothing matched (e.g. the service answered with an error payload):
    # show the raw reply instead of failing with IndexError on result[0].
    print(resp)
# Exception handling.
#
# Requests every address in turn; a failing request is reported and the loop
# carries on with the next one.
list1 = [
    "http://www.baidu.com",
    "http://www.baidu.com",
    "http://www.baidu25234234235454254243.com",
    "http://www.baidu.com",
    "http://www.baidu.com",
]

# enumerate(..., start=1) supplies the 1-based request counter.
for i, url in enumerate(list1, start=1):
    try:
        request.urlopen(url)
    except Exception as e:
        # Print the error and keep going rather than aborting the whole run.
        print(e)
    print("第",i,"此請求完成")
# Section 5: using an ajax request.
#
# Pages through the chart endpoint 20 entries at a time and prints each page.
base_url = "https://movie.douban.com/j/chart/top_list?" \
           "type=11&interval_id=100%3A90&action=&start={}&limit=20"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko)"
                  " Chrome/79.0.3928.4 Safari/537.36"
}

i = 0
while True:
    # str.format() fills the {} placeholder with the offset of page i, e.g.
    # "{name}: {url}".format(name=..., url=...) for named placeholders.
    url = base_url.format(i * 20)
    req = request.Request(url, headers=header)
    res = request.urlopen(req).read().decode()
    print(res)

    # decode() always returns a str, so a None check can never fire; the page
    # is finished when the body is empty.
    # NOTE(review): the endpoint presumably answers with an empty JSON array
    # ("[]") once start is past the last entry - confirm. Without this check
    # the loop would keep requesting pages forever.
    page = res.strip()
    if not page or page == "[]":
        break
    i += 1
# Section 6: using an https request.
#
# Fetches an https page with certificate verification switched off.
import ssl

url = "https://www.12306.cn/mormhweb/"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko)"
                  " Chrome/79.0.3928.4 Safari/537.36"
}

# Skip certificate verification.
# NOTE(review): this disables TLS certificate checks for the request and
# relies on an underscore-prefixed ssl helper; acceptable for a demo only.
context = ssl._create_unverified_context()

req = request.Request(url, headers=header)
raw = request.urlopen(req, context=context).read()
res = raw.decode()
print(res)
# Section 7: qiushibaike example.
#
# Downloads the text page, extracts the matching <span> contents with a regex
# and appends them to duanzi.txt.
url = "https://www.qiushibaike.com/text/"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64;"
                  " x64; rv:80.0) Gecko/20100101 "
                  "Firefox/80.0"
}

# Build and send the request.
res = requests.get(url, headers=header)
info = res.text

# NOTE(review): the "<div >" in the pattern looks as if an attribute may have
# been lost when this snippet was copied - confirm against the live markup.
infos = re.findall(r'<div >\s*<span>\s*(.+)\s*</span>', info)

# Open the output file once for the whole batch instead of once per entry,
# and only when there is something to write (so no empty file is created).
# A separate loop name keeps `info` (the page text) from being overwritten.
if infos:
    with open("duanzi.txt", 'a', encoding='utf-8') as f:
        f.writelines(joke + "\n\n\n" for joke in infos)
print(infos)
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/166777.html
標籤:Python
下一篇:re.1-常用運算式規則
