作業需要查看串列中企業的企查查資訊,包括全部電話、郵箱、以及主要人員等資訊,資料比較多就想寫個爬蟲~各種借鑒學習最開始寫的時候爬取成功一部分資訊,改了改后突然拒絕訪問了,但瀏覽器登錄查看是沒有問題的。求求各位大神幫忙看一下問題在哪兒。

本來想成為這樣的,

# -*- coding-8 -*-
import requests
import lxml
import xlrd
import xlwt
import sys
from bs4 import BeautifulSoup
import xlwt
import time
import urllib
def craw(url, key_word):
User_Agent = 'Mozilla/5.0 (Windows NT 10.0;Win64; x64) AppleWebKit/537.36(KHTML,like Gecko)Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763'
re = r'https://www.qcc.com/search?key=' + key_word
headers = {
'Accept':'text/html, application/xhtml+xml, application/xml; q=0.9, */*; q=0.8',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.8, en-US; q=0.6, en; q=0.4, ko; q=0.2',
'Cookie': r'_XXXXXXXXXXXXXXXX',
'Host': 'www.qcc.com',
'Upgrade-Insecure-Request':'1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
}
try:
response = requests.get(url, headers=headers)
if response.status_code != 200:
response.encoding = 'utf-8'
print(response.status_code)
print('ERROR')
soup = BeautifulSoup(response.text, 'lxml')
except Exception:
print('請求都不讓,這企查查是想逆天嗎???')
try:
com_all_info = soup.find_all(class_='frtrt')[0].tbody
com_all_info_array = com_all_info.select('td')
print('開始爬取資料,請勿打開excel')
# try:
temp_g_name = com_all_info_array[2].select('a')[0].text # 公司名稱
temp_r_name = com_all_info_array[2].select('p')[0].a.text # 法人名稱
temp_g_money = com_all_info_array[2].select('p')[0].select('span')[0].text.strip('注冊資本:') # 注冊資本金
temp_g_date = com_all_info_array[2].select('p')[0].select('span')[1].text.strip('成立日期:') # 成立日期
e = com_all_info_array[2].select('p')[1].select('a')[1] # 郵箱
f = e.strip('showHisEmail').strip(';').replace('{','').replace('}','').replace('[','').replace(']','').replace(':','').replace('"','').replace('e','').replace('s','').replace('(','').replace(')','')
g = f.split(",")
l = 0
h = []
while l < len(g):
if l % 2 == 0:
h.append(str(g[l]))
l += 1
temp_r_email = '\n'.join(h)
a = com_all_info_array[2].select('p')[1].select('a')[0] # 電話
b = a.strip('showHisTel').strip(';').replace('{','').replace('}','').replace('[','').replace(']','').replace(':','').replace('"','').replace('t','').replace('s','').replace('(','').replace(')','')
c = b.split(",")
j = 0
d = []
while j < len(c):
if j % 2 == 0:
d.append(str(c[j]))
j += 1
temp_r_phone = '\n'.join(d)
temp_g_addr = com_all_info_array[2].select('p')[2].text.strip().strip('地址:') # 地址
temp_r_numm = com_all_info_array[2].select('p')[0].select('span')[1].text.strip('統一社會信用代碼:') # 征信代碼
g_name_list.append(temp_g_name)
r_name_list.append(temp_r_name)
g_money_list.append(temp_g_money)
g_date_list.append(temp_g_date)
r_email_list.append(temp_r_email)
r_phone_list.append(temp_r_phone)
g_addr_list.append(temp_g_addr)
r_numm_list.append(temp_r_numm)
# except Exception:
# print('錯誤!')
except Exception:
print('好像被拒絕訪問了呢...請稍后再試叭...')
if __name__ == '__main__':
global g_name_list
global r_name_list
global g_money_list
global g_date_list
global r_email_list
global r_phone_list
global g_addr_list
global r_numm_list
col = []
data = xlrd.open_workbook(r'C:\企查查資料源.xlsx')
sheet = data.sheet_by_name('Sheet1')
col = sheet.col_values(0)
for key_word in col:
time.sleep(15)
key_word = urllib.parse.quote(key_word)
url = r'https://www.qcc.com/search?key='+key_word
craw(url, key_word)
print('正在搜索,請稍后')
workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('企查查資料', cell_overwrite_ok=True)
style = xlwt.XFStyle()
font = xlwt.Font()
font.name = '仿宋'
font.bold = True
style.font = font
print('正在存盤資料,請勿打開excel')
g_name_list = []
r_name_list = []
g_money_list = []
g_date_list = []
r_email_list = []
r_phone_list = []
g_addr_list = []
r_numm_list = []
name_list = ['公司名字','法定法人','注冊資本','成立日期','法人郵箱','法人電話','公司地址','統一社會信用代碼']
for cc in range(0,len(name_list)):
sheet1.write(0,cc,name_list[cc],style)
for i in range(0,len(g_name_list)):
print(g_name_list[i])
sheet1.write(i+1,0,g_name_list[i],style)
sheet1.write(i+1,1,r_name_list[i],style)
sheet1.write(i+1,2,g_money_list[i],style)
sheet1.write(i+1,3,g_date_list[i],style)
sheet1.write(i+1,4,r_email_list[i],style)
sheet1.write(i+1,5,r_phone_list[i],style)
sheet1.write(i+1,6,g_addr_list[i],style)
sheet1.write(i+1,7,r_numm_list[i],style)
workbook.save(r"C:\Users\Desktop\企查查"+time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) +".xls")
print('保存完畢~')
人生太艱難了。。。
uj5u.com熱心網友回復:
救救孩子吧
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/167091.html
下一篇:pip更新失敗
