// Linux环境下Java语言实现简陋Web爬虫 (a simple web crawler in Java on Linux)
import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.Socket;
import java.net.UnknownHostException;
import java.nio.charset.StandardCharsets;
/**
 * A minimal web crawler: opens a raw TCP socket to a web server, issues a
 * hand-written HTTP/1.1 GET request, strips the response headers, and saves
 * the HTML body (everything from the first '&lt;' onward) to a local file.
 */
public class WebCrawler {

    // Destination for the downloaded page. Create the directory
    // (/home/zms/htmldoc) beforehand or the FileWriter constructor throws.
    private static final String TEXT_FILE_PATH = "/home/zms/htmldoc/htmldoc1.html";

    public static void main(String[] args) {
        final String host = "ubuntuone.cn";
        final int port = 80;

        // try-with-resources closes the socket, both stream wrappers, and the
        // output file even when an exception is thrown mid-download (the
        // original leaked them all on any error path).
        try (Socket webclient = new Socket(host, port);
             PrintWriter request = new PrintWriter(
                     new OutputStreamWriter(webclient.getOutputStream(), StandardCharsets.UTF_8), true);
             BufferedReader receiver = new BufferedReader(
                     new InputStreamReader(webclient.getInputStream(), StandardCharsets.UTF_8));
             FileWriter fpWriter = new FileWriter(new File(TEXT_FILE_PATH))) {

            // HTTP/1.1 mandates CRLF line terminators, so use explicit "\r\n"
            // rather than println() (which emits the platform separator).
            // The Host header must name the server actually contacted —
            // the original sent "Host:localhost", which virtual-hosted
            // servers answer with the wrong site or a 400.
            request.print("GET / HTTP/1.1\r\n");
            request.print("Host: " + host + "\r\n");
            request.print("Connection: Close\r\n");
            request.print("\r\n");
            request.flush();

            // Discard the status line and headers by skipping to the first
            // '<' character, then accumulate the remainder as the page body.
            // Blocking read() replaces the original busy-wait on ready().
            StringBuilder sb = new StringBuilder(8096);
            int ch;
            while ((ch = receiver.read()) != -1) {
                if (ch == '<') {
                    break;
                }
            }
            while (ch != -1) {
                sb.append((char) ch);
                ch = receiver.read();
            }

            // Echo the captured body to the console and persist it.
            System.out.println(sb);
            fpWriter.write(sb.toString());
        } catch (UnknownHostException e) {
            System.err.println("无法访问您指定的主机。");
            e.printStackTrace();
            System.exit(1);
        } catch (IOException e) {
            System.err.println("下载失败,请检查输入地址是否正确。");
            e.printStackTrace();
            System.exit(1);
        }
    }
}