案例功能效果圖
爬取資料的平臺頁面

這個案例能爬取的平臺太多了,我沒有全部截圖出來,想看的你們自己下載原始碼自己跑起來!
爬取的熱榜資料效果圖

環境介紹
前端:vue+h5
后端:springboot+webMagic
jdk:1.8及以上
資料庫:mysql
完整原始碼獲取方式
原始碼獲取方式:點擊這里,暗號博客園!
核心代碼介紹
pom.xml
<!-- WebMagic crawler core: https://mvnrepository.com/artifact/us.codecraft/webmagic-core -->
<!-- The slf4j-log4j12 binding that comes with it is excluded here. -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>

<!-- WebMagic extension module: https://mvnrepository.com/artifact/us.codecraft/webmagic-extension -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>

<!-- Guava: https://mvnrepository.com/artifact/com.google.guava/guava -->
<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>18.0</version>
</dependency>

<!-- Apache Commons Lang: https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.4</version>
</dependency>

<!-- Apache Commons IO: https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>

<!-- Lombok (boilerplate-reduction tool), compile-time only: https://mvnrepository.com/artifact/org.projectlombok/lombok -->
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.18.8</version>
    <scope>provided</scope>
</dependency>

<!-- JUnit 4, test scope only: https://mvnrepository.com/artifact/junit/junit -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
    <scope>test</scope>
</dependency>

<!-- Swagger 2 (springfox): API metadata and the browser UI -->
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger2</artifactId>
    <version>2.9.1</version>
</dependency>
<dependency>
    <groupId>io.springfox</groupId>
    <artifactId>springfox-swagger-ui</artifactId>
    <version>2.9.1</version>
</dependency>
application.yml
# HTTP port of the embedded server
server:
  port: 9004

spring:
  jackson:
    serialization:
      # Dates are serialized as numeric timestamps; the Vue pages divide createDate by 1000
      write-dates-as-timestamps: true
  datasource:
    driverClassName: com.mysql.cj.jdbc.Driver
    # Host, schema and credentials below are placeholders; replace them for your environment
    url: jdbc:mysql://feimeidehuoji:3306/feimeidehuoji?autoReconnect=true&useUnicode=true&characterEncoding=UTF-8&useSSL=false&useLegacyDatetimeCode=false&serverTimezone=UTC
    username: feimeidehuoji
    password: feimeidehuoji
  jpa:
    database: MySQL
    show-sql: true
    hibernate:
      ddl-auto: update
    database-platform: org.hibernate.dialect.MySQL5InnoDBDialect

# Crawler settings; kept at the top level because NodeController reads them
# with @Value("${spiderUrl}"), @Value("${proxyUrl}") and @Value("${proxyPort}")
spiderUrl: https://tophub.today
proxyUrl: 61.160.210.234
proxyPort: 808
NodeController.java
package cn.cesi.webMagic.webMagic;
import cn.cesi.webMagic.pieline.SpringPieline;
import cn.cesi.webMagic.pojo.Node;
import cn.cesi.webMagic.service.NodeService;
import cn.cesi.webMagic.util.Result;
import cn.cesi.webMagic.util.StatusCode;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import io.swagger.annotations.ApiParam;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.data.domain.Page;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.web.bind.annotation.CrossOrigin;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import javax.annotation.Resource;
import java.util.List;
import java.util.Map;
/**
 * REST endpoints under {@code /node} for querying crawled hot-list entries,
 * plus the scheduled job that runs the crawler.
 */
@RestController
@CrossOrigin
@RequestMapping("/node")
@Api(value = "獲取資料介面", tags = {"用戶登錄介面"})
public class NodeController {

    /** Start URL of the crawl, from the {@code spiderUrl} property. */
    @Value("${spiderUrl}")
    private String url;

    /** Proxy host used by the crawler's downloader, from {@code proxyUrl}. */
    @Value("${proxyUrl}")
    private String proxyUrl;

    /** Proxy port used by the crawler's downloader, from {@code proxyPort}. */
    @Value("${proxyPort}")
    private Integer proxyPort;

    @Resource
    NodeService nodeService;

    @Autowired
    SpringPieline springPieline;

    /**
     * Searches stored entries and returns one page of results.
     *
     * @param typeName    optional category name passed to the service
     * @param secondTitle optional entry title filter passed to the service
     * @param page        optional page number
     * @param size        optional page size
     * @return a success {@link Result} whose data is the page returned by the service
     */
    @RequestMapping("")
    @ApiOperation(value = "查詢資料介面")
    public Result getData(
            @ApiParam(value = "分類名稱", required = false) String typeName,
            @ApiParam(value = "分類名稱", required = false) String secondTitle,
            @ApiParam(value = "當前頁", required = false) Integer page,
            @ApiParam(value = "每頁資料條數", required = false) Integer size) {
        Page<Node> nodes = nodeService.searchData(typeName, secondTitle, page, size);
        return success(nodes);
    }

    /**
     * Lists all categories known to the service.
     *
     * @return a success {@link Result} whose data is the list returned by {@code findType()}
     */
    @RequestMapping("/getType")
    @ApiOperation(value = "查詢全部分類串列")
    public Result getData() {
        List<Map<String, String>> list = nodeService.findType();
        return success(list);
    }

    /**
     * Runs one crawl of {@link #url} through the configured proxy and hands the
     * results to {@link SpringPieline}.
     *
     * <p>{@code fixedDelay = 480000} is 1000 * 60 * 8 ms, i.e. the next run starts
     * 8 minutes after the previous one has finished.
     */
    @Scheduled(fixedDelay = 480000)
    public void tasks() {
        System.out.println("定時任務開始——————————————————————————————————");
        // Route all downloads through the configured proxy server
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
        httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(new Proxy(proxyUrl, proxyPort)));
        Spider.create(new WebProcess())
                .addUrl(url)
                .setDownloader(httpClientDownloader)
                .thread(2) // number of crawler threads
                .addPipeline(springPieline) // pipeline that receives the extracted nodes
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(10000 * 10)))
                .run();
        System.out.println("定時任務結束——————————————————————————————————");
    }

    /** Builds the success response shared by both query endpoints. */
    private Result success(Object data) {
        Result result = new Result();
        result.setFlag(true);
        result.setCode(StatusCode.OK);
        result.setMsg("查詢成功!");
        result.setData(data);
        return result;
    }
}
WebProcess.java
package cn.cesi.webMagic.webMagic;
import cn.cesi.webMagic.pieline.SpringPieline;
import cn.cesi.webMagic.util.NodeEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable;
import org.jsoup.select.Elements;
import java.util.*;
@Component
public class WebProcess implements PageProcessor {
@Override
public void process(Page page) {
System.out.println(page.getHtml());
//page頁面物件,getHtml()獲取頁面的html ,css()選擇器 div#Sortable 獲取id為Sortable的div元素 nodes()轉為集合
List<Selectable> list = page.getHtml().css("div.bc div#Sortable div.cc-cd div").nodes();
List<NodeEntity> nodes = new ArrayList<>();
for(Selectable selectable : list){
//regex 正則運算式
// String name = Jsoup.parse(selectable.css("div.cc-cd-ih div a div span").regex(".*微博.*").all().toString()).text(); //標題
//Jsoup.parse決議html為dom元素(物件)語法同js語法 text()為js語法不多解釋
//獲取title大標題
String s = selectable.css("div.cc-cd-ih div a div span").toString();
String title = "";
if(s != null){
title = Jsoup.parse(s).text();
}
//獲取logo
String logo = selectable.css("div.cc-cd-ih div a div img").toString();
String logoSrchttps://www.cnblogs.com/rutaha/archive/2020/12/23/= "";
if(logo != null){
Document document = Jsoup.parse(logo);
Elements imgTags = document.select("img[src]");
logoSrc = https://www.cnblogs.com/rutaha/archive/2020/12/23/imgTags.attr("src");
}
//獲取第二層小標題的集合
List<Selectable> list2 = selectable.css("div.cc-cd-cb div a").nodes();
List<Map<String,String>> maps = new ArrayList<>();
for(Selectable selectable2 :list2){
Map<String,String> map = new HashMap<>();
//獲取二級標題的鏈接
String url = selectable2.links().toString();
//獲取二級標題
String secondTitle = Jsoup.parse(selectable2.css("div span.t").toString()).text();
//獲取文章熱度
String hot = "";
if(selectable2.css("div span.e") != null){
hot = Jsoup.parse(selectable2.css("div span.e").toString()).text();
}
map.put("url",url);
map.put("secondTitle",secondTitle);
map.put("hot",hot);
maps.add(map);
//將連接添加入任務中
//page.addTargetRequest(url);
}
NodeEntity node = new NodeEntity();
node.setTitle(title);
node.setLogo(logoSrc);
node.setMaps(maps);
nodes.add(node);
}
//給page物件系結物件
page.putField("nodes",nodes);
}
private Site site = Site.me()
.setSleepTime(2)//抓取間隔時間,可以解決一些反爬限制
.setRetryTimes(3) //重試次數
.setRetrySleepTime(10000) //重試時間
.setTimeOut(60000) //超時時間 1000*60 1分鐘
.setCharset("utf8");
@Override
public Site getSite() {
return site;
}
}
SpringPieline.java
package cn.cesi.webMagic.pieline;
import cn.cesi.webMagic.pojo.Node;
import cn.cesi.webMagic.service.NodeService;
import cn.cesi.webMagic.util.IdWorker;
import cn.cesi.webMagic.util.NodeEntity;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.*;
//存入資料庫
/**
 * WebMagic pipeline that persists the crawled hot-list entries through
 * {@link NodeService}. A (title, secondTitle) pair already in the database is
 * not saved again.
 */
@Component
public class SpringPieline implements Pipeline {

    @Autowired
    NodeService nodeService;

    @Autowired
    IdWorker idWorker;

    /**
     * Stores every entry found under the {@code "nodes"} field of the result.
     *
     * @param resultItems extraction results produced by the page processor
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<NodeEntity> nodes = resultItems.get("nodes");
        if (nodes == null) {
            return; // nothing was extracted from this page
        }
        for (NodeEntity entity : nodes) {
            // Keep the best-effort behaviour, but a failure in one card no longer
            // aborts the remaining cards.
            try {
                saveEntries(entity);
            } catch (Exception e) {
                System.out.println(e);
            }
        }
    }

    /** Saves the not-yet-stored entries of a single card. */
    private void saveEntries(NodeEntity entity) {
        String title = entity.getTitle();
        String logo = entity.getLogo();
        List<Map<String, String>> rows = entity.getMaps();
        if (rows == null) {
            return;
        }
        boolean titleMissing = title == null || title.isEmpty();
        for (Map<String, String> row : rows) {
            String rawSecondTitle = row.get("secondTitle");
            String secondTitle = rawSecondTitle == null ? "" : rawSecondTitle.trim();
            System.out.println(secondTitle);
            if (titleMissing || secondTitle.isEmpty()) {
                continue;
            }
            List<Node> existing = nodeService.findByTitleAndSecondTitle(title, secondTitle);
            if (!existing.isEmpty()) {
                continue; // already stored
            }
            // A fresh entity per row instead of mutating one shared instance;
            // the id is only generated for rows that are actually saved.
            Node node = new Node();
            node.setId(idWorker.nextId() + "");
            node.setTitle(title);
            node.setLogo(logo);
            node.setSecondTitle(secondTitle);
            node.setUrl(row.get("url"));
            node.setCreateDate(new Date());
            node.setHot(row.get("hot"));
            nodeService.save(node);
        }
    }
}
index.vue
<template>
  <div >
    <h1 >摸魚熱榜</h1>
    <van-search
      v-model="value"
      placeholder="請輸入搜索關鍵詞"
      @search="onSearch"
      @clear="onClear"
    />
    <!-- Category list, shown while there are no search results -->
    <div v-if="!listData.length">
      <div >
        仿今日熱榜!,關注java專案開發,學習更多案例!
      </div>
      <div >
        <div>
          <div >全部熱榜</div>
          <div >
            <div
              v-for="(item, index) in typeList"
              :key="index"
            >
              <div @click="goDateils(item)">
                <div >
                  <!-- Falls back to the default logo through imgError when loading fails -->
                  <img
                    :src="item.logo"
                    :alt="item.title"
                    @error="imgError(item)"
                  />
                </div>
                <div >{{ item.title }}</div>
                <div >
                  <svg-icon
                    icon
                    className="icon_search"
                  ></svg-icon>
                </div>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
    <!-- Search results -->
    <div v-if="listData.length">
      <search-list v-if="listData.length" :list="listData" />
      <!-- NOTE(review): this v-else can never render, because the wrapper above already requires listData.length -->
      <van-empty v-else description="暫無相關內容!" />
    </div>
  </div>
</template>
<script>
import SvgIcon from '@/components/icon/SvgIcon';
import searchList from '@/components/searchList/list';

// Home page: lists every hot-list category and supports keyword search.
export default {
  components: {
    SvgIcon,
    searchList
  },
  data() {
    return {
      value: '', // search keyword
      listData: [], // search results
      typeList: [], // all hot-list categories
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png' // fallback category logo
    };
  },
  computed: {},
  created() {
    this.getAllType();
  },
  mounted() {},
  methods: {
    // Load all hot-list categories
    getAllType() {
      const that = this;
      this.$api.getAllType().then(res => {
        if (res.code === 0) {
          that.typeList = res.data;
        }
      });
    },
    // Navigate to the detail page of a category
    goDateils(item) {
      this.$router.push({
        name: 'details',
        query: {
          item: JSON.stringify(item)
        }
      });
    },
    // Search entries across all categories by keyword
    onSearch(e) {
      const that = this;
      let params = {
        typeName: '全部',
        size: 10000,
        secondTitle: e
      };
      this.$api.getAllInfoGzip(params).then(res => {
        if (res.code == 0) {
          that.listData = res.data.content;
          that.handleData(that.listData);
          console.log(res);
        }
      });
    },
    // Clearing the search box returns to the category list
    onClear(e) {
      this.listData = [];
    },
    // Add the display fields (new flag, relative time) to each entry
    handleData(data) {
      data.forEach(item => {
        // createDate is divided by 1000 before being passed to getDateDiff
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new; // whether the entry counts as new
        item.CreateTime = diff.Time;
      });
    },
    // Image load failure: fall back to the default logo
    imgError(item) {
      item.logo = this.defaultUrl;
    }
  }
};
</script>
details.vue
<template>
  <div >
    <div >
      <img :src="details.logo" @error="imgError" alt="" />
      <div >
        <div >
          <!-- Back to the home page -->
          <p @click="$router.push('/')">摸魚熱榜</p>
        </div>
        <img :src="details.logo" @error="imgError" alt="" />
        <h1 >{{ details.title }}</h1>
      </div>
    </div>
    <div >
      <!-- Pull down to refresh, scroll to the bottom to load the next page -->
      <van-pull-refresh v-model="refreshing" @refresh="onRefresh">
        <van-list
          v-model="loading"
          :finished="finished"
          @load="onLoad"
          :immediate-check="false"
        >
          <div >
            <a
              v-for="(item, index) in listData"
              :key="item.id"
              :href="item.url"
            >
              <div >
                <h4 >
                  {{ index + 1 }}、{{ item.secondTitle }}
                </h4>
                <div >
                  <span v-if="item.hot">
                    <span>{{ item.hot }}</span>
                  </span>
                  <span >
                    <span>{{ item.CreateTime }}</span>
                  </span>
                  <span v-if="item.new">新</span>
                </div>
              </div>
            </a>
          </div>
        </van-list>
      </van-pull-refresh>
    </div>
    <div v-if="finished">
      <p >我是有底線的</p>
    </div>
  </div>
</template>
<script>
// Detail page: paged list of the entries of one hot-list category.
export default {
  data() {
    return {
      page: 1, // current page number
      refreshing: false, // pull-to-refresh in progress
      loading: false, // load-more in progress
      finished: false, // no more data to load
      listData: [], // loaded entries
      details: {}, // the category being shown
      defaultUrl: 'https://file.ipadown.com/tophub/assets/images/logo.png' // fallback category logo
    };
  },
  computed: {},
  created() {},
  mounted() {
    // The category is passed as a JSON string in the route query
    this.details = JSON.parse(this.$route.query.item);
    this.getList(this.details, this.page);
  },
  methods: {
    // Fetch one page of entries and append it to the list
    getList(item, page, loading = true) {
      const that = this;
      let list = that.listData;
      let params = {
        typeName: item.title,
        size: 50,
        page
      };
      this.$api.getAllInfoGzip(params, loading).then(res => {
        console.log(res);
        if (res.code == 0) {
          that.listData = list.concat(res.data.content);
          that.handleData(that.listData);
          // Load-more finished
          if (that.loading) {
            that.loading = false;
          }
          // Pull-to-refresh finished
          if (that.refreshing) {
            that.refreshing = false;
          }
          // Last page reached
          if (that.page >= res.data.totalPages) {
            that.finished = true;
          }
        }
      });
    },
    // Load the next page; bound to the van-list @load event in the template
    onLoad() {
      this.loading = true;
      this.getList(this.details, ++this.page, false);
    },
    // Pull-to-refresh: reset the state and reload the first page
    onRefresh() {
      this.finished = false;
      this.loading = true;
      this.listData = [];
      this.page = 1;
      this.getList(this.details, 1, false);
    },
    // Add the display fields (new flag, relative time) to each entry
    handleData(data) {
      data.forEach(item => {
        // createDate is divided by 1000 before being passed to getDateDiff
        const diff = this.util.getDateDiff(item.createDate / 1000);
        item.new = diff.new; // whether the entry counts as new
        item.CreateTime = diff.Time;
      });
    },
    // Image load failure: fall back to the default logo.
    // The template binds details.logo, so that is the field to replace.
    imgError() {
      this.details.logo = this.defaultUrl;
    }
  }
};
</script>
xxx.sql
-- Creates the `node` table that stores the crawled hot-list entries.
-- WARNING: this script drops any existing `node` table first.
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for node
-- ----------------------------
DROP TABLE IF EXISTS `node`;
CREATE TABLE `node` (
  `id` varchar(255) NOT NULL,
  `create_date` datetime DEFAULT NULL,
  `hot` varchar(1024) DEFAULT NULL,
  `second_title` longtext,
  `title` varchar(1024) DEFAULT NULL,
  `url` longtext,
  `logo` varchar(1024) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

-- Re-enable the foreign key checks disabled at the top of this script
SET FOREIGN_KEY_CHECKS = 1;
作者:Java開發專案
鏈接:https://mp.weixin.qq.com/s/z9J1gL7orSL90ngSQeRRhg
歡迎大家關注:有故事的程式員,每天更新Java技術知識點,還可以領取Java進階學習資料哦~
資料共分為 19 個模塊,分別是:Java 基礎、容器、多執行緒、反射、物件拷貝、Java Web、例外、網路、設計模式、Spring/Spring MVC、Spring Boot/Spring Cloud、Hibernate、MyBatis、RabbitMQ、Kafka、Zookeeper、MySQL、Redis、JVM。

轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/239423.html
標籤:其他
上一篇:kafka rebalance解決方案 -incremental cooperative協議和static membership功能
下一篇:大廠面試助手(十二):場景和設計
