From 4e361568920f2069eadba5cd5f1fcff14c12288e Mon Sep 17 00:00:00 2001 From: kongchong <1182701220@qq.com> Date: Wed, 5 Dec 2018 14:59:21 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=90=AC=E8=BF=90?= =?UTF-8?q?=E5=B7=A5=20=E7=88=AC=E5=8F=96csdn=E7=BD=91=E7=AB=99=20url?= =?UTF-8?q?=E8=A7=84=E5=88=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- blog-admin/src/main/resources/templates/remover/list.ftl | 2 +- .../java/com/zyd/blog/spider/processor/HtmlProcessor.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/blog-admin/src/main/resources/templates/remover/list.ftl b/blog-admin/src/main/resources/templates/remover/list.ftl index 138973e..5302e5a 100644 --- a/blog-admin/src/main/resources/templates/remover/list.ftl +++ b/blog-admin/src/main/resources/templates/remover/list.ftl @@ -330,7 +330,7 @@ authorRegex: "//a[@class=follow-nickName]/html()", releaseDateRegex: "//div[@class='article-bar-top']/span[@class='time']/text()", contentRegex: "//div[@class=article_content]/html()", - targetLinksRegex: ".*blog\\.csdn\\.net/{uid}/article/details/[0-9a-zA-Z]{1,15}", + targetLinksRegex: "*[@id=\"mainBox\"]/main/div[2]/div/h4/a/@href", tagRegex: "//span[@class=artic-tag-box]/a[@class=tag-link]/html()", header: [ "Host=blog.csdn.net", diff --git a/blog-spider/src/main/java/com/zyd/blog/spider/processor/HtmlProcessor.java b/blog-spider/src/main/java/com/zyd/blog/spider/processor/HtmlProcessor.java index 321f7f3..04d4c51 100644 --- a/blog-spider/src/main/java/com/zyd/blog/spider/processor/HtmlProcessor.java +++ b/blog-spider/src/main/java/com/zyd/blog/spider/processor/HtmlProcessor.java @@ -22,7 +22,7 @@ public class HtmlProcessor implements Processor { Html pageHtml = page.getHtml(); String title = pageHtml.xpath(model.getTitleRegex()).get(); String source = page.getRequest().getUrl(); - if (!StringUtils.isEmpty(title) && !"null".equals(title) && !Arrays.asList(model.getEntryUrls()).contains(source)) { + if (!StringUtils.isEmpty(title) && !"null".equals(title)) { page.putField("title", title); page.putField("source", source); page.putField("releaseDate", pageHtml.xpath(model.getReleaseDateRegex()).get()); @@ -32,6 +32,6 @@ public class HtmlProcessor implements Processor { page.putField("description", pageHtml.xpath(model.getDescriptionRegex()).get()); page.putField("keywords", pageHtml.xpath(model.getKeywordsRegex()).get()); } - page.addTargetRequests(page.getHtml().links().regex(model.getTargetLinksRegex()).all()); + page.addTargetRequests(page.getHtml().xpath(model.getTargetLinksRegex()).all()); } } -- Gitee From 015ea40e444951b84ba81013e57ba9a5e3dff69f Mon Sep 17 00:00:00 2001 From: kongchong <1182701220@qq.com> Date: Wed, 5 Dec 2018 16:21:10 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E9=83=A8=E5=88=86?= =?UTF-8?q?=E7=88=AC=E5=8F=96=E8=A7=84=E5=88=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/resources/templates/remover/list.ftl | 11 +++++++---- .../com/zyd/blog/spider/processor/HtmlProcessor.java | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/blog-admin/src/main/resources/templates/remover/list.ftl b/blog-admin/src/main/resources/templates/remover/list.ftl index 5302e5a..8720c12 100644 --- a/blog-admin/src/main/resources/templates/remover/list.ftl +++ b/blog-admin/src/main/resources/templates/remover/list.ftl @@ -316,7 +316,7 @@ authorRegex: "//div[@class=name_con]/p[@class=name]/a[@class=nick]/html()", releaseDateRegex: "//div[@class='dc-profile']/div[@class='l']/span[@class='spacer']/text()", contentRegex: "//div[@class=detail-content]/html()", - targetLinksRegex: "/article/[0-9]{1,10}", + targetLinksRegex: "//*[@id=\"articlesList\"]/div/h3/a/@href", tagRegex: "//div[@class=cat-box]/div[@class=cat-wrap]/a[@class=cat]/html()", header: [ "Host=www.imooc.com", @@ -343,14 +343,16 @@ titleRegex: "//div[@class=blog_title]/h3/a/html()", authorRegex: "//div[@id=blog_owner_name]/html()", releaseDateRegex: "//div[@class=blog_bottom]/ul/li/html()", - contentRegex: "//div[@class=blog_content]/html()", - targetLinksRegex: ".*{uid}\\.iteye\\.com/blog/[0-9]+", + contentRegex: "//*[@id=\"blog_content\"]/div[2]/html()", + targetLinksRegex: "//*[@id=\"main\"]/div/div[1]/h3/a/@href", tagRegex: "//div[@class=news_tag]/a/html()", header: [ "Host={uid}.iteye.com", "Referer=http://{uid}.iteye.com/" ], entryUrls: 'http://{uid}.iteye.com/?page={curPage}' + //*[@id="main"]/div[2]/div[1]/h3/a + //*[@id="main"]/div[2]/div[1]/h3/a }, csblogs: { domain: "www.cnblogs.com", @@ -358,13 +360,14 @@ authorRegex: "//div[@class=postDesc]/a[1]/html()", releaseDateRegex: "//span[@id=post-date]/html()", contentRegex: "//div[@id=cnblogs_post_body]/html()", - targetLinksRegex: ".*www\\.cnblogs\\.com/{uid}/p/[\\w\\d]+\\.html", + targetLinksRegex: "//*[@class=\"postTitle2\"]/@href", tagRegex: "//div[@id=EntryTag]/a/html()", header: [ "Host=www.cnblogs.com", "Referer=https://www.cnblogs.com/" ], entryUrls: 'https://www.cnblogs.com/{uid}/default.html?page={curPage}' + //*[@class="postTitle2"]/@href } }; // 博文平台 diff --git a/blog-spider/src/main/java/com/zyd/blog/spider/processor/HtmlProcessor.java b/blog-spider/src/main/java/com/zyd/blog/spider/processor/HtmlProcessor.java index 04d4c51..77a4693 100644 --- a/blog-spider/src/main/java/com/zyd/blog/spider/processor/HtmlProcessor.java +++ b/blog-spider/src/main/java/com/zyd/blog/spider/processor/HtmlProcessor.java @@ -35,3 +35,4 @@ public class HtmlProcessor implements Processor { page.addTargetRequests(page.getHtml().xpath(model.getTargetLinksRegex()).all()); } } +//*[@id="notices"]/div[1]/div/div[1]/div[2]/div[2]/div/div[2]/a \ No newline at end of file -- Gitee