A boring Monday. My older sister asked me to find a novel she reads online; there was no downloadable copy, so I wrote a quick crawler that dumps the chapters into a txt file. It's very simple.
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * @author memode
 */
public class Test_1 {

    public static void article() {
        long t1 = System.currentTimeMillis(); // start time

        String link = "http://www.tywx.com/ty109892/";
        int num = 5846664;  // page id of the first chapter
        int chapter = 0;    // chapter counter
        String tmpLink = link + num + ".html";

        File file = new File("c:\\test.txt"); // output path

        try (PrintStream ps = new PrintStream(new FileOutputStream(file))) {
            while (true) {
                // Fetch the page once. ignoreHttpErrors(true) lets us check the
                // status code ourselves instead of Jsoup throwing on a 404.
                // (Note: statusCode() returns an int, so it must be compared
                // with ==, not String.equals.)
                Connection.Response res = Jsoup.connect(tmpLink)
                        .userAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1042.0 Safari/535.21")
                        .timeout(15000) // 15 s timeout
                        .ignoreHttpErrors(true)
                        .execute();
                if (res.statusCode() == 404) { // past the last chapter
                    break;
                }
                Document newsdoc = res.parse();

                // Bail out if the page doesn't have the chapter wrapper.
                Element kfyd = newsdoc.getElementsByAttributeValue("class", "kfyd").first();
                if (kfyd == null) {
                    break;
                }

                // Chapter title
                String title = kfyd.select("h1").text();
                System.out.println(title);

                // Link to the next chapter
                tmpLink = link + newsdoc.getElementsByAttributeValue("id", "thumb")
                        .first().select("#pager_next").attr("href");
                System.out.println(tmpLink);

                // Chapter body: detach the <div>s inside #content, keep their
                // combined HTML, then clean it up.
                String news_tmp = newsdoc.getElementById("content")
                        .select("div").remove().html()
                        .replaceAll("(?is)<script[^>]*>.*?</script>", " ") // strip <script> blocks
                        .replaceAll("(?i)<br[^>]*>", "\n")                 // <br> -> newline
                        .replaceAll("&nbsp;", " ");                        // &nbsp; -> plain space

                chapter++;
                // Append the chapter title and body to the file.
                ps.append(title + "\n\n");
                ps.append(news_tmp + "\n");
            }
        } catch (IOException e) {
            System.out.println("Network error!");
        }

        System.out.println("Downloaded " + chapter + " chapters");

        long t2 = System.currentTimeMillis(); // end time
        long ms = t2 - t1;
        System.out.println("Elapsed: " + ms / 60000 + " min "
                + (ms / 1000) % 60 + " s " + ms % 1000 + " ms");
    }

    public static void main(String[] args) {
        article();
    }
}
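One caveat worth noting: the single-argument PrintStream constructor writes with the JVM's platform default charset, so the Chinese text can come out garbled if that default isn't what you expect (e.g. GBK on a Chinese Windows box vs UTF-8 elsewhere). Here's a minimal sketch of the same writer with the charset pinned explicitly; UTF-8 is my assumption, swap in whatever encoding you want the txt file in:

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;

public class Utf8WriterSketch {
    public static void main(String[] args)
            throws FileNotFoundException, UnsupportedEncodingException {
        // Pin the output encoding instead of relying on the platform default.
        // "UTF-8" is an assumption here; any charset the JVM supports works.
        PrintStream ps = new PrintStream(new FileOutputStream("c:\\test.txt"), true, "UTF-8");
        ps.append("第一章 标题\n\n");
        ps.close();
    }
}

Other than that, the only external dependency is jsoup (Maven coordinates org.jsoup:jsoup); everything else is plain JDK.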