安裝

這裡使用 Maven 進行安裝，在 pom.xml 中加入以下依賴，由 Maven 下載相關的包：

 <!-- WebMagic core artifact -->
 <dependency>
     <groupId>us.codecraft</groupId>
     <artifactId>webmagic-core</artifactId>
     <version>0.7.3</version>
 </dependency>
 <!-- WebMagic extension artifact, declared at the same version as webmagic-core -->
 <dependency>
     <groupId>us.codecraft</groupId>
     <artifactId>webmagic-extension</artifactId>
     <version>0.7.3</version>
 </dependency>

Hello World

幾乎所有 API 的學習都是從 Hello World 開始的，WebMagic 也不例外。這裡以爬取新浪（Sina）博客文章為例，複製以下代碼：

 package com.example.demo;
 ?
 import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.Site;
 import us.codecraft.webmagic.Spider;
 import us.codecraft.webmagic.processor.PageProcessor;
 ?
 public class SinaBlogProcessor implements PageProcessor {
 ?
   public static final String URL_LIST = "http://blog\\.sina\\.com\\.cn/s/articlelist_1487828712_0_\\d+\\.html";
 ?
   public static final String URL_POST = "http://blog\\.sina\\.com\\.cn/s/blog_\\w+\\.html";
 ?
   private Site site = Site
           .me()
           .setDomain("blog.sina.com.cn")
           .setSleepTime(3000)
           .setUserAgent(
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
 ?
   @Override
   public void process(Page page) {
     //串列頁
     if (page.getUrl().regex(URL_LIST).match()) {
       page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all());
       page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
       //文章頁
     } else {
       page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
       page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
       page.putField("date",
               page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)"));
     }
   }
 ?
   @Override
   public Site getSite() {
     return site;
   }
 ?
   public static void main(String[] args) {
     Spider.create(new SinaBlogProcessor()).addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html")
             .run();
   }
 }

查看相關運行結果

這裡就等於說已經運行成功，爬取到了一些資料。

PageProcessor

PageProcessor 用於實作爬蟲的相關配置與頁面抽取邏輯。

代碼如下

 public class GithubRepoPageProcessor implements PageProcessor {
 ?
     // 部分一：抓取網站的相關配置，包括編碼、抓取間隔、重試次數等
     private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
 ?
     @Override
     // process是定制爬蟲邏輯的核心介面，在這里撰寫抽取邏輯
     public void process(Page page) {
         // 部分二：定義如何抽取頁面資訊，并保存下來
         page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
         page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
         if (page.getResultItems().get("name") == null) {
             //skip this page
             page.setSkip(true);
         }
         page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
 ?
         // 部分三：從頁面發現后續的url地址來抓取
         page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
     }
 ?
     @Override
     public Site getSite() {
         return site;
     }
 ?
     public static void main(String[] args) {
 ?
         Spider.create(new GithubRepoPageProcessor())
                 //從"https://github.com/code4craft"開始抓
                 .addUrl("https://github.com/code4craft")
                 //開啟5個執行緒抓取
                 .thread(5)
                 //啟動爬蟲
                 .run();
     }
 }

抽取元素

這裡使用相關的方法抽取元素：

 這裡使用相關的方法抽取相關的元素

 List<String> urls = page.getHtml().css("div.pagination").links().regex(".*/search/\?l=java.*").all();

保存結果

這裡使用 Pipeline 保存結果：

 /**
  * Runs the GitHub crawler with a JsonFilePipeline registered.
  *
  * @param args unused
  */
 public static void main(String[] args) {
     Spider spider = Spider.create(new GithubRepoPageProcessor());
     // Start crawling from "https://github.com/code4craft".
     spider.addUrl("https://github.com/code4craft");
     // Register the JSON file pipeline, constructed with the target path.
     spider.addPipeline(new JsonFilePipeline("D:\\webmagic\\"));
     // Crawl with 5 threads.
     spider.thread(5);
     // Start the crawler.
     spider.run();
 }

實際案例

這裡以 http://blog.sina.com.cn/flashsword20 作為例子。在這個例子裡，要從最終的博客文章頁面抓取博客的標題、內容和日期。

列表頁

列表頁的格式為 http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html ，其中結尾的 1 是可變的頁數（對應正則中的 \d+）。

文章頁

文章頁的格式是 http://blog.sina.com.cn/s/blog_58ae76e80100g8au.html ，其中最後一段 58ae76e80100g8au 是可變的字串，為文章的 id。

進行正則匹配

這裡用兩個正則進行匹配，並先用 XPath //div[@class="articleList"] 限定文章連結所在的區域。

所以，可以這樣進行匹配

 page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex("http://blog\\.sina\\.com\\.cn/s/blog_\\w+\\.html").all());
 page.addTargetRequests(page.getHtml().links().regex("http://blog\\.sina\\.com\\.cn/s/articlelist_1487828712_0_\\d+\\.html").all());

內容的添加

這裡再進行一次內容的添加：

 page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
 page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
 page.putField("date",
         page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)"));

區分列表頁和目標頁

這裡區分列表頁和目標頁：

 //串列頁
 if (page.getUrl().regex(URL_LIST).match()) {
     page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all());
     page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
     //文章頁
 } else {
     page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
     page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
     page.putField("date",
             page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)"));
 }

這樣就完成了最基本例子的讀取。

最後我把我收集的各大廠經典高頻面試題和 Java 高級進階、架構師視頻教程送予大家，部分資料如下圖所示：

獲取地址：java進階學習資料，面試題，電子書籍免費獲取

轉載請註明出處，本文鏈接：https://www.uj5u.com/houduan/88053.html

標籤：Java

上一篇：第三章 Java類基礎知識作業之（輸出5*5數字方格，列印星塔）

下一篇：nginx配置websocket

Java使用WebMagic 爬取網站

安裝