jsoup 爬小说-白红宇

jsoup 爬小说

阅读量：6369 次

发布时间：2019-06-23

本文共 2160 字，大约阅读时间需要 7 分钟。

无聊的星期一，老姐要我找一本网上的小说。我看网站没有提供下载，就写了一个爬虫把小说逐章抓取并写入 txt 文件，很简单。

import java.io.File;import java.io.FileOutputStream;import java.io.IOException;import java.io.PrintStream;import java.util.Calendar;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;/** *  * @author memode * */public class Test_1 {public static void atiricle(){Document newsdoc;long t1 = System.currentTimeMillis(); // 排序前取得当前时间  String link = "http://www.tywx.com/ty109892/";int num = 5846664; //初始化页码int chapter = 0;   //章节计数String tmpLink = "http://www.tywx.com/ty109892/"+num+".html";File file = new File("c:\\test.txt");  //存入路径  try {PrintStream ps = new PrintStream(new FileOutputStream(file));while (true) {//设置15s的超时时间if("404".equals(Jsoup.connect(tmpLink).timeout(15000).execute().statusCode())){break;}newsdoc = Jsoup.connect(tmpLink).userAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1042.0 Safari/535.21").timeout(5000).get();  //设置超时时间5s//判断if(null==newsdoc.getElementsByAttributeValue("class","kfyd").first()){break;}//标题String title = newsdoc.getElementsByAttributeValue("class","kfyd").first().select("h1").text();System.out.println(title);//获取下一章的链接tmpLink = link+ newsdoc.getElementsByAttributeValue("id","thumb").first().select("#pager_next").attr("href");System.out.println(tmpLink);//解析小说的正文String news_tmp = newsdoc.getElementsByAttributeValue("id", "content").select("div").remove().html().replaceAll("<.*?script[^>]*?>[\\s\\S]*?<\\/.*?script.*?>*", " ")     //过滤script脚本.replaceAll("(?i)
    
     ]*>\n
     
", "\n").replaceAll(" ", " ");   //过滤换行  和空格chapter++;//写入章节和正文ps.append(title+"\n\n");  ps.append(news_tmp+"\n");}} catch (IOException e) {System.out.println("网络异常   net error!");}System.out.println("已下载"+chapter+"章");long t2 = System.currentTimeMillis(); // 排序后取得当前时间  Calendar c = Calendar.getInstance();c.setTimeInMillis(t2 - t1);          System.out.println("耗时: " + c.get(Calendar.MINUTE) + "分 "                  + c.get(Calendar.SECOND) + "秒 " + c.get(Calendar.MILLISECOND)                  + " 毫秒");  }public static void main(String[] args) {new Test_1().atiricle();}}