功能:
爬取電影下載地址;
主要技術:
1.Jsoup: 一款Java的HTML解析器
2.正則運算式
結果演示:

這些鏈接都是可以使用迅雷或者網頁直接下載的~~~
流程:
確定目標--->獲取網頁原始碼--->抓取目標資訊
獲取網頁原始碼:
/**
 * Fetches a web page and returns the HTML of the elements selected by class.
 *
 * @param url_str    address of the page to fetch
 * @param matchValue class value handed to {@code getElementsByClass}
 * @return the string produced by {@code Elements.html()} for the selected elements
 * @throws IOException if the URL is malformed or the page cannot be fetched/parsed
 */
public static String parseWeb(String url_str, String matchValue) throws IOException{
    // Jsoup.parse(URL url, int timeoutMillis): 30000 ms timeout for the fetch.
    Document page = Jsoup.parse(new URL(url_str), 30000);
    return page.getElementsByClass(matchValue).html();
}
抓取目標資訊:
/**
 * Collects every substring of {@code result} that matches {@code regex}.
 * Each match is stored with {@code DEFAULT_URL} prepended, in the order found.
 *
 * @param result text to search (typically HTML returned by parseWeb)
 * @param regex  regular expression describing one resource
 * @return a new list holding DEFAULT_URL + match for every match; empty if none
 */
public static ArrayList<String> matchResources(String result, String regex){
    ArrayList<String> found = new ArrayList<String>();
    for (Matcher hits = Pattern.compile(regex).matcher(result); hits.find(); ) {
        found.add(DEFAULT_URL + hits.group());
    }
    return found;
}
主要使用的就是以上兩個功能,所以寫在了一個工具類中
工具類:
public class MyUtil {
private static final String DEFAULT_URL = "*********"; 不能傳播色情資訊hhh~~
//決議網頁 url_str:目標網站 matchValue:匹配值
public static String parseWeb(String url_str, String matchValue) throws IOException{
URL url = new URL(url_str);
Document document = Jsoup.parse(url, 30000); //parse(URL url ,int timeoutMillis)
Elements elements = document.getElementsByClass(matchValue);
String result = elements.html().toString();
return result;
}
//匹配資源
public static ArrayList<String> matchResources(String result, String regex){
ArrayList<String> strs = new ArrayList<String>();
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(result);
while(matcher.find()){
strs.add(DEFAULT_URL+matcher.group());
}
return strs;
}
//去重
public static ArrayList<String> removeDuplicate(ArrayList<String> strs){
Set set = new HashSet();
set.addAll(strs);
strs.clear();
strs.addAll(set);
return strs;
}
}
接下來定義一個介面:
/**
 * Contract for the crawler: list the pages of a category, resolve each page
 * to its download resource, and print the results.
 */
public interface ICL {

    /** Walks the given category and returns the URLs where all of its resources live. */
    ArrayList<String> method1(String kind) throws IOException;

    /** Walks the supplied URL collection, calling method3 on each one, and returns all resources. */
    ArrayList<String> method2(List<String> strs) throws IOException;

    /** Returns the download resource found under the given URL. */
    String method3(String str) throws IOException;

    /** Prints the given links. */
    void print(ArrayList<String> links);
}
它的實作類:
/**
 * ICL implementation built on MyUtil: lists the detail pages of a category,
 * then extracts one .mp4 link from each detail page.
 */
public class ClImpl implements ICL{
    // Prefix prepended to the category path in method1.
    // The real value was deliberately withheld by the author.
    private static final String DEFAULT_URL = "***********";

    // Number of leading characters method3 strips from a matched link.
    // MyUtil.matchResources prepends its own DEFAULT_URL to every match, and the
    // mp4 match is already absolute, so that prefix has to be cut off again.
    // NOTE(review): presumably 23 is the length of MyUtil.DEFAULT_URL - confirm.
    private static final int LINK_PREFIX_LENGTH = 23;

    public ClImpl(){
    }

    /**
     * Walks the given category and returns the URLs of all its detail pages.
     *
     * @param kind category path, appended to DEFAULT_URL
     * @return de-duplicated list of matched page URLs
     * @throws IOException if the category page cannot be fetched
     */
    @Override
    public ArrayList<String> method1(String kind) throws IOException {
        String listingHtml = MyUtil.parseWeb(DEFAULT_URL + kind, "row col5 clearfix");
        // NOTE(review): the '.' before "html" is unescaped and matches any character.
        ArrayList<String> urls = MyUtil.matchResources(listingHtml, "html/\\d{6}/\\d{4,}.html");
        return MyUtil.removeDuplicate(urls);
    }

    /**
     * Resolves every URL in the collection through method3.
     *
     * @param urls detail-page URLs
     * @return one download link per input URL, in input order
     * @throws IOException if any page cannot be fetched or has no download link
     */
    @Override
    public ArrayList<String> method2(List<String> urls) throws IOException {
        ArrayList<String> links = new ArrayList<String>(urls.size());
        for (String url : urls) {
            links.add(method3(url));
        }
        return links;
    }

    /**
     * Returns the first .mp4 link found in the "download" element(s) of the page.
     *
     * @param url detail-page URL
     * @return the matched link with the prepended prefix stripped
     * @throws IOException if the page cannot be fetched, or contains no matching
     *                     link (previously surfaced as a bare IndexOutOfBoundsException)
     */
    @Override
    public String method3(String url) throws IOException{
        String downloadHtml = MyUtil.parseWeb(url, "download");
        ArrayList<String> links = MyUtil.matchResources(downloadHtml, "https://.*?mp4");
        if (links.isEmpty()) {
            throw new IOException("No .mp4 download link found on page: " + url);
        }
        return links.get(0).substring(LINK_PREFIX_LENGTH);
    }

    /**
     * Prints each link on its own line to standard output.
     *
     * @param links links to print
     */
    @Override
    public void print(ArrayList<String> links) {
        for (String link : links) {
            System.out.println(link);
        }
    }
}
所有的功能已經基本完成了,接下來簡單寫一個測試類:
/**
 * Console driver: shows a category menu, prints the first page of links for
 * the chosen category, then lets the user step through the following pages.
 */
public class Test01 {
    static ArrayList<String> links = new ArrayList<String>();
    static ClImpl c = new ClImpl();
    static Scanner scan = new Scanner(System.in);

    /**
     * Runs the menu loop until the user enters 886 or input ends.
     *
     * @param args unused
     * @throws IOException if a page cannot be fetched
     */
    public static void main(String[] args) throws IOException{
        boolean running = true;
        while (running) {
            System.out.println("1-三上悠亞 2-橋本有菜 3-深田詠美 4-波多野結衣 5-吉澤明步 886-退出");
            if (!scan.hasNextInt()) {
                if (!scan.hasNext()) {
                    break; // end of input: leave instead of throwing NoSuchElementException
                }
                scan.next(); // discard the non-numeric token instead of crashing in nextInt()
                System.out.println("輸入錯誤,請重新輸入");
                continue;
            }
            int key = scan.nextInt();
            if (key == 886) {
                System.out.println("已退出~~");
                running = false;
                continue;
            }
            String kind = categoryPath(key);
            if (kind == null) {
                System.out.println("輸入錯誤,請重新輸入");
                continue;
            }
            start(kind);
            // The category root is page 1, so paging continues at index_2.html.
            turnPage(kind, 2);
        }
    }

    /** Maps a menu key to its category path, or returns null for an unknown key. */
    private static String categoryPath(int key) {
        switch (key) {
        case 1:
            return "av/ssyy/";
        case 2:
            return "av/qbyc/";
        case 3:
            return "av/stym/";
        case 4:
            return "av/bdyjy/";
        case 5:
            return "av/jzmb/";
        default:
            return null;
        }
    }

    /**
     * Fetches, resolves and prints all links for one listing page.
     *
     * @param kind category path, optionally followed by a page file name
     * @throws IOException if a page cannot be fetched
     */
    public static void start(String kind) throws IOException{
        links = c.method1(kind);
        links = c.method2(links);
        c.print(links);
    }

    /**
     * Paging loop: on input 1 shows page {@code index} and advances; any other
     * input (or end of input) returns to the caller.
     *
     * @param kind  category path
     * @param index number of the next page to show
     * @throws IOException if a page cannot be fetched
     */
    public static void turnPage(String kind, int index) throws IOException{
        int page = index;
        // Iterative rather than recursive so the stack does not grow with every page.
        while (true) {
            System.out.println("1-下一頁 其他任意數字鍵-回傳上一級");
            if (!scan.hasNextInt()) {
                if (scan.hasNext()) {
                    scan.next(); // non-numeric token: treat like "go back"
                }
                return;
            }
            if (scan.nextInt() != 1) {
                return;
            }
            start(kind + "index_" + page + ".html");
            page++;
        }
    }
}
以上就是這個電影小爬蟲的製作流程啦;大家對爬蟲感興趣的話可以以這個為參照,根據需求修改一下,寫得比較爛.....
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/321490.html
標籤:其他
