# - 增加中圖分類號轉類名
# - 手動在網頁輸入檢索詞
#!/usr/bin/env python
# coding: utf-8
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
import time
import requests
import zipfile
import os
import pandas as pd
import re
import traceback
from bs4 import BeautifulSoup
# # 安裝 chrome 驅動
def un_zip(file_name, to_dir='./'):
    """Extract every member of a zip archive into a directory.

    Args:
        file_name: Path of the zip archive to read.
        to_dir: Destination directory. It is created, together with any
            missing parent directories, when it does not exist yet.
    """
    # makedirs(exist_ok=True) covers both "already there" and "nested path
    # whose parents are missing", which a plain os.mkdir cannot do.
    os.makedirs(to_dir, exist_ok=True)
    # The context manager closes the archive even when extraction raises.
    with zipfile.ZipFile(file_name) as zip_file:
        zip_file.extractall(to_dir)
def download_driver(to_dir='./', version=''):
    """Download the Windows chromedriver zip from the taobao mirror and unpack it.

    Args:
        to_dir: Directory the archive is extracted into (handed to ``un_zip``).
        version: Optional Chrome major version such as ``'76'``. When empty,
            the mirror's overall ``LATEST_RELEASE`` is used.

    Raises:
        requests.HTTPError: If the mirror answers either request with an
            error status.
    """
    print('install chrome-driver first')
    base_url = 'http://npm.taobao.org/mirrors/chromedriver/'
    release_url = base_url + 'LATEST_RELEASE'
    if version:
        release_url += '_' + version

    # Resolve the full driver version string first.
    response = requests.get(release_url, timeout=30)
    response.raise_for_status()
    full_version = response.content.decode('utf8').strip()

    # Fail loudly on an error page instead of saving it as a "zip".
    response = requests.get(
        base_url + full_version + '/chromedriver_win32.zip', timeout=60
    )
    response.raise_for_status()

    download_zip = "chromedriver_win32.zip"
    with open(download_zip, "wb") as code:
        code.write(response.content)
    try:
        un_zip(download_zip, to_dir)
    finally:
        # Remove the temporary archive even when unpacking fails.
        os.remove(download_zip)
# Start a browser (Chrome requires a chromedriver executable).
try:
    driver = webdriver.Chrome()
except Exception:
    # Presumably the driver is missing: fetch the chromedriver for Chrome 76
    # into the working directory and retry once.
    download_driver(to_dir='./', version='76')
    driver = webdriver.Chrome()
# Open the CNKI advanced-search page.
driver.get("https://kns.cnki.net/kns8/AdvSearch")
# Chinese Library Classification lookup table. Column names are supplied
# here ('k' = class code, 'v' = class name) and 'k' becomes the index so
# codes can be looked up directly.
kv = pd.read_csv(
    '中圖分類號.csv',
    names=['k', 'v'],
    delimiter=',',
).set_index('k')
def search_class_name(i, table=None):
    """Translate a classification code into its class name.

    Only the first code of a ``;``-separated list is used. The code is
    shortened one character at a time from the right until it matches an
    entry of the table, so the longest known prefix wins.

    Args:
        i: Classification code string, e.g. ``'TP391.1;G25'``. Any non-string
            value (such as ``None`` or NaN) yields ``''``.
        table: DataFrame indexed by code with a ``'v'`` column holding the
            names. Defaults to the module-level ``kv`` table.

    Returns:
        The class name of the longest matching prefix, or ``''`` when
        nothing matches.
    """
    if table is None:
        table = kv
    if not isinstance(i, str):
        return ''

    code = i.split(';')[0].strip()
    while code and code not in table.index:
        code = code[:-1]
    if not code:
        return ''

    name = table.loc[code, 'v']
    # A code listed more than once makes .loc return a Series; keep the
    # first entry so the caller always receives a single value.
    if isinstance(name, pd.Series):
        name = name.iloc[0]
    return name
# Manual step: the user types the search terms into the browser window.
input('請在彈出的瀏覽器中輸入檢索詞,待結果出現后,輸入任意字符繼續:')

# Crawl settings.
result_file = 'result88.csv'  # where the collected rows are saved
failed_list = []  # titles of the rows that could not be scraped
encoding = 'utf_8_sig'
REQUEST_TIMEOUT = 30  # seconds allowed for each detail-page request
RESULT_COLUMNS = [
    '論文ID',
    '題名',
    '作者',
    '來源',
    '發表時間',
    '資料庫',
    '被引',
    '專輯',
    '專題',
    '分類號',
    '中圖分類',
    '摘要',
    '關鍵詞',
]


def _join_lines(text):
    """Collapse a CRLF-separated text into one space-separated line."""
    return ' '.join([part.strip() for part in text.split('\r\n')])


def _labelled_text(container, css_class, label, default=''):
    """Return the <p> text of the first tag in *container* that carries
    *css_class* and mentions *label*; *default* when there is no such tag."""
    hits = container.findAll(
        lambda tag: tag.get('class') is not None
        and css_class in tag.get('class')
        and label in tag.text
    )
    return hits[0].p.text if hits else default


def _span_value(container, div_class):
    """Return the text of the 'value' span inside the *div_class* div, or ''."""
    holder = container.find('div', {'class': div_class})
    if holder is None:
        return ''
    span = holder.find('span', {'class': 'value'})
    return '' if span is None else span.text


def _fetch_cn_detail(url):
    """Download a kns.cnki.net detail page and extract its metadata.

    Returns a dict with the keys 'abstract', 'keywords', 'collection',
    'topic' and 'class_id'; fields not found on the page stay ''.
    """
    info = dict.fromkeys(
        ('abstract', 'keywords', 'collection', 'topic', 'class_id'), ''
    )
    page = BeautifulSoup(requests.get(url, timeout=REQUEST_TIMEOUT).text, 'lxml')

    abstract_tag = page.find('span', {'class': 'abstract-text'})
    if abstract_tag is not None:
        info['abstract'] = abstract_tag.text
    keywords_tag = page.find('p', {'class': 'keywords'})
    if keywords_tag is not None:
        info['keywords'] = _join_lines(keywords_tag.text)

    div_doc = page.find('div', {'class': 'doc'})
    if div_doc is None:
        return info
    info['collection'] = _labelled_text(div_doc, 'top-space', '專輯')
    info['topic'] = _labelled_text(div_doc, 'top-space', '專題')
    info['class_id'] = _labelled_text(div_doc, 'top-space', '分類號')

    # Neither keywords nor abstract: the page may be an "achievement"
    # record, which keeps its data in 'row' blocks under a 'brief' div.
    if not info['keywords'] and not info['abstract']:
        if div_doc.find('div', {'class': 'brief'}) is not None:
            info['keywords'] = _join_lines(
                _labelled_text(div_doc, 'row', '關鍵詞')
            )
            info['abstract'] = _labelled_text(div_doc, 'row', '成果簡介')
            info['class_id'] = _labelled_text(
                div_doc, 'row', '中圖分類號', info['class_id']
            )
    return info


def _fetch_en_detail(url):
    """Download a schlr.cnki.net detail page; return its keywords and abstract."""
    info = {'keywords': '', 'abstract': ''}
    page = BeautifulSoup(requests.get(url, timeout=REQUEST_TIMEOUT).text, 'lxml')
    div_doc = page.find('div', {'class': 'right-top'})
    if div_doc is not None:
        info['keywords'] = _span_value(div_doc, 'doc-keyword doc-item')
        info['abstract'] = _span_value(div_doc, 'doc-summary doc-item show')
    return info


def _row_record(tr, td_name, title):
    """Build the output record for one result-table row.

    Raises:
        ValueError: If the title link matches neither known link format.
    """
    authors = tr.find('td', {'class': 'author'}).text
    source = tr.find('td', {'class': 'source'}).text.strip()
    date = tr.find('td', {'class': 'date'}).text.strip()
    data = tr.find('td', {'class': 'data'}).text.strip()
    quote = tr.find('td', {'class': 'quote'}).text.strip()

    detail = dict.fromkeys(
        ('abstract', 'keywords', 'collection', 'topic', 'class_id'), ''
    )
    link_html = str(td_name)
    if 'DbCode' in link_html and 'DbName' in link_html:
        # Regular case: a detail link on kns.cnki.net.
        dbcode = re.findall(r'DbCode=(.*?)[&|"]', link_html)[0]
        dbname = re.findall(r'DbName=(.*?)[&|"]', link_html)[0]
        filename = re.findall(r'FileName=(.*?)[&|"]', link_html)[0]
        url = 'https://kns.cnki.net/kcms/detail/detail.aspx?dbcode={}&dbname={}&filename={}'.format(dbcode, dbname, filename)
        detail.update(_fetch_cn_detail(url))
    elif 'RedirectScholar' in link_html:
        # English-language record served by schlr.cnki.net.
        tablename = re.findall(r'tablename=(.*?)[&|"]', link_html)[0]
        filename = re.findall(r'filename=(.*?)[&|"]', link_html)[0]
        url = 'https://schlr.cnki.net/Detail/index/{}/{}'.format(tablename, filename)
        detail.update(_fetch_en_detail(url))
    else:
        # No usable link: report the row as failed rather than store it
        # with an undefined or stale URL.
        raise ValueError('unrecognized detail link')

    return {
        '論文ID': url,
        '題名': title,
        '作者': authors,
        '來源': source,
        '發表時間': date,
        '資料庫': data,
        '被引': quote,
        '專輯': detail['collection'],
        '專題': detail['topic'],
        '分類號': detail['class_id'],
        '中圖分類': search_class_name(detail['class_id']),
        '摘要': detail['abstract'],
        '關鍵詞': detail['keywords'],
    }


def _result_rows():
    """Return the <tr> tags of the result table shown in the browser.

    Returns None when the table is not on the page; any other error while
    reading the page is printed and also reported as None.
    """
    try:
        soup = BeautifulSoup(driver.page_source, 'lxml')
        table = soup.find('table', {'class': 'result-table-list'})
        return None if table is None else table.findAll('tr')
    except Exception:
        traceback.print_exc()
        return None


# Start crawling. Earlier results are loaded once so a rerun appends to them.
if os.path.exists(result_file):
    result = pd.read_csv(result_file, encoding=encoding)
else:
    result = pd.DataFrame(columns=RESULT_COLUMNS)

end = False
while not end:
    time.sleep(1)
    rows = _result_rows()
    # Keep asking until the result table is really there (for example
    # after the user has solved a CAPTCHA).
    while rows is None:
        input('請在網頁中輸入驗證碼,或檢查其他錯誤,待頁面加載出來后,在此輸入任意字符:')
        rows = _result_rows()

    # rows[0] is skipped (presumably the table header).
    for tr in rows[1:]:
        title = ''  # reset so a failure never reports the previous title
        try:
            time.sleep(0.5)
            td_name = tr.find('td', {'class': 'name'})
            title = td_name.a.get_text().strip()
            result.loc[result.shape[0]] = _row_record(tr, td_name, title)
            print(result.iloc[-1], '\n')
        except Exception:
            print('獲取資訊失敗:' + title)
            print('錯誤資訊 :')
            traceback.print_exc()
            print()
            failed_list.append(title)

    # Save after every page so an interruption loses at most one page.
    result.to_csv(result_file, index=False, encoding=encoding)

    end = True
    for a in driver.find_elements(By.XPATH, '//div[@class="pages"]/a'):
        if '下一頁' in a.text:
            end = False
            print('進入下一頁\n')
            a.click()
            # The remaining link elements may be stale after the click.
            break

result.to_csv(result_file, index=False, encoding=encoding)
print('{} 篇文章抓取失敗'.format(len(failed_list)))
for failed_title in failed_list:
    print(failed_title)
# 轉載請註明出處,本文鏈接:https://www.uj5u.com/qianduan/20044.html
# 標籤:其他
# 上一篇:python實作郵件回圈自動發件
