admin 管理员组文章数量: 1086019
2024年1月10日发(作者:c语言能编程软件吗)
java抓取网页内容三种方式
2011-12-05 11:23
一、
import java.io.*;
import java.net.*;

/**
 * Command-line tool that downloads the resource behind a URL and copies its
 * raw bytes either to a file or to standard output.
 *
 * <p>Usage: {@code java GetURL <URL> [<filename>]}
 */
public class GetURL {
    /**
     * Copies the content of the URL given in {@code args[0]} to the file named
     * by {@code args[1]}, or to standard output when no file name is given.
     *
     * <p>Any failure (wrong argument count, malformed URL, I/O error) is
     * reported on standard error together with a usage line; nothing is
     * rethrown.
     *
     * @param args one or two arguments: the URL, and optionally the output file
     */
    public static void main(String[] args) {
        InputStream in = null;
        OutputStream out = null;
        try {
            // Check the command-line arguments.
            if ((args.length != 1) && (args.length != 2)) {
                throw new IllegalArgumentException("Wrong number of args");
            }
            URL url = new URL(args[0]); // Create the URL.
            in = url.openStream();      // Open a stream to that URL.
            if (args.length == 2) {     // Pick the appropriate output stream.
                out = new FileOutputStream(args[1]);
            } else {
                out = System.out;
            }
            // Copy the bytes to the output stream.
            byte[] buffer = new byte[4096];
            int bytes_read;
            while ((bytes_read = in.read(buffer)) != -1) {
                out.write(buffer, 0, bytes_read);
            }
            out.flush();
        } catch (Exception e) {
            System.err.println(e);
            System.err.println("Usage: java GetURL <URL> [<filename>]");
        } finally {
            // Always release the streams. Each one is closed independently so
            // that a null or failing input stream cannot prevent the output
            // file from being closed.
            closeQuietly(in);
            // System.out belongs to the JVM, not to this program: leave it open.
            if (out != System.out) {
                closeQuietly(out);
            }
        }
    }

    /** Closes the stream if it is non-null, ignoring any failure while closing. */
    private static void closeQuietly(Closeable stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // Best-effort cleanup; there is nothing useful to do at this point.
        }
    }
}
运行方法:
C:\java>java GetURL http://127.0.0.1:8080/kj/
二、
<%@ page import="java.io.*" contentType="text/html;charset=gb2312" %>
<%@ page language="java" import="java.net.*"%>
<%
// Fetches a page over HTTP and writes its non-empty lines to a file located
// under the web application's root directory.
String htmpath = null;
BufferedReader in = null;
InputStreamReader isr = null;
InputStream is = null;
PrintWriter pw = null;
HttpURLConnection huc = null;
try {
    // NOTE(review): the output file name was lost when this listing was
    // extracted (only an empty string remained); "index.html" is a
    // placeholder - confirm the intended name.
    htmpath = getServletContext().getRealPath("/") + "index.html";
    pw = new PrintWriter(htmpath);
    URL url = new URL("http://127.0.0.1:8080/kj/"); // Create the URL.
    huc = (HttpURLConnection) url.openConnection();
    is = huc.getInputStream();
    isr = new InputStreamReader(is);
    in = new BufferedReader(isr);
    String line = null;
    while ((line = in.readLine()) != null) {
        // Skip empty lines so they are not copied into the saved file.
        if (line.length() == 0) {
            continue;
        }
        pw.println(line);
    }
}
catch (Exception e) {
    // NOTE(review): the print target was lost in extraction; System.out is
    // assumed here - confirm.
    System.out.println(e);
}
finally {
    // Always release the resources. Each one is handled independently so that
    // a null or failing stream cannot prevent the output file from being
    // flushed and closed.
    try { if (in != null) in.close(); } catch (Exception ignored) {}
    try { if (isr != null) isr.close(); } catch (Exception ignored) {}
    try { if (is != null) is.close(); } catch (Exception ignored) {}
    if (huc != null) huc.disconnect();
    if (pw != null) pw.close();
}
%>
OK--,创建文件成功
三、
import java.io.*;
import java.net.*;

/**
 * Minimal command-line HTTP client built directly on a TCP socket. It sends a
 * bare {@code GET} request and copies everything the server sends back to a
 * file or to standard output.
 *
 * <p>Usage: {@code java HttpClient <URL> [<filename>]}
 */
public class HttpClient {
    /**
     * Fetches the {@code http:} URL given in {@code args[0]} and writes the
     * server's response to the file named by {@code args[1]}, or to standard
     * output when no file name is given.
     *
     * <p>Any failure (wrong argument count, non-http URL, I/O error) is
     * reported on standard error together with a usage line; nothing is
     * rethrown.
     *
     * @param args one or two arguments: the URL, and optionally the output file
     */
    public static void main(String[] args) {
        Socket socket = null;
        OutputStream to_file = null;
        try {
            // Check the command-line arguments.
            if ((args.length != 1) && (args.length != 2)) {
                throw new IllegalArgumentException("Wrong number of args");
            }

            // Split the URL into the parts needed to talk to the server.
            URL url = new URL(args[0]);
            String protocol = url.getProtocol();
            if (!protocol.equals("http")) {
                throw new IllegalArgumentException("Must use 'http:' protocol");
            }
            String host = url.getHost();
            int port = url.getPort();
            if (port == -1) {
                port = 80; // No explicit port in the URL: use the HTTP default.
            }
            String filename = url.getFile();
            if (filename.length() == 0) {
                filename = "/"; // A URL without a path must still request something.
            }

            // Open a socket connection to the server.
            socket = new Socket(host, port);
            InputStream from_server = socket.getInputStream();
            PrintWriter to_server = new PrintWriter(socket.getOutputStream());

            // Open the destination only after the connection succeeded, so an
            // existing file is not truncated when the URL or host is bad.
            if (args.length == 2) {
                to_file = new FileOutputStream(args[1]); // Write to a file.
            } else {
                to_file = System.out;                    // Write to the console.
            }

            // Request the file from the server. The request carries no HTTP
            // version or headers and is terminated by an empty line.
            to_server.print("GET " + filename + "\n\n");
            to_server.flush(); // Send it right now!

            // Read the server's response and copy it to the destination.
            byte[] buffer = new byte[4096];
            int bytes_read;
            while ((bytes_read = from_server.read(buffer)) != -1) {
                to_file.write(buffer, 0, bytes_read);
            }
            to_file.flush();
        } catch (Exception e) {
            System.err.println(e);
            System.err.println("Usage: java HttpClient <URL> [<filename>]");
        } finally {
            // Release the socket and the file even when the transfer failed.
            if (socket != null) {
                try {
                    socket.close();
                } catch (IOException ignored) {
                    // Best-effort cleanup.
                }
            }
            // System.out belongs to the JVM, not to this program: leave it open.
            if (to_file != null && to_file != System.out) {
                try {
                    to_file.close();
                } catch (IOException ignored) {
                    // Best-effort cleanup.
                }
            }
        }
    }
}
运行方法:C:\java>java HttpClient http://127.0.0.1:8080/kj/
注意中文可能会显示乱码,在得到源码后,应该做相应的转码工作,例如:
public static String GetURLstr(String strUrl)
{
InputStream in = null;
OutputStream out = null;
String strdata = "";
try
{
URL url = new URL(strUrl); // 创建 URL
in = ream(); // 打开到这个URL的流
out = ;
// 复制字节到输出流
byte[] buffer = new byte[4096];
int bytes_read;
while ((bytes_read = (buffer)) != -1)
{
String reads = new String(buffer, 0, bytes_read, "UTF-8");
//(reads);
strdata = strdata + reads;
// (buffer, 0, bytes_read);
}
();
();
return strdata;
}
catch (Exception e)
{
n(e);
n("Usage: java GetURL
return strdata;
}
版权声明:本文标题:Java抓取网页内容三种方式 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://roclinux.cn/b/1704833855a463605.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论