首页编程正文内容

Java抓取网页内容三种方式

编程

更新时间：2025-07-03 13:02:55 32

admin 管理员组

文章数量: 1087652

2024年1月10日发(作者：c语言能编程软件吗)

java抓取网页内容三种方式

2011-12-05 11:23

一、

import java.io.*;
import java.net.*;

/**
 * Command-line tool that downloads the contents of a URL and copies the raw
 * bytes either to a file or to standard output.
 *
 * <p>Usage: {@code java GetURL <URL> [<filename>]}
 */
public class GetURL {

    /**
     * Copies the resource named by {@code args[0]} to the file named by
     * {@code args[1]}, or to standard output when no file name is given.
     * Any failure is reported on standard error together with a usage line;
     * no exception escapes this method.
     *
     * @param args the URL to fetch, optionally followed by an output file name
     */
    public static void main(String[] args) {
        InputStream in = null;
        OutputStream out = null;
        try {
            // Check the command-line arguments.
            if ((args.length != 1) && (args.length != 2)) {
                throw new IllegalArgumentException("Wrong number of args");
            }

            URL url = new URL(args[0]); // Create the URL.
            in = url.openStream();      // Open a stream to that URL.

            // Pick the destination: a file if one was named, otherwise the console.
            if (args.length == 2) {
                out = new FileOutputStream(args[1]);
            } else {
                out = System.out;
            }

            // Copy bytes from the URL to the destination.
            byte[] buffer = new byte[4096];
            int bytesRead;
            while ((bytesRead = in.read(buffer)) != -1) {
                out.write(buffer, 0, bytesRead);
            }
            out.flush();
        } catch (Exception e) {
            System.err.println(e);
            System.err.println("Usage: java GetURL <URL> [<filename>]");
        } finally {
            // Always release the streams. Each one is closed on its own so a
            // failure (or a null) on one does not prevent closing the other.
            closeQuietly(in);
            if (out != System.out) {
                // Leave System.out open; it belongs to the JVM, not to us.
                closeQuietly(out);
            }
        }
    }

    /** Closes {@code resource} if it is non-null, ignoring any close failure. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Best-effort cleanup: nothing useful can be done if close fails.
        }
    }
}

运行方法：

C:\java>java GetURL http://127.0.0.1:8080/kj/

二、

<%@ page import="java.io.*" contentType="text/html;charset=gb2312" %>
<%@ page language="java" import="java.net.*"%>
<%
// Fetches a page over HTTP and writes its non-empty lines to a file under
// the web application's root directory.
String htmpath = null;
BufferedReader in = null;
InputStreamReader isr = null;
InputStream is = null;
PrintWriter pw = null;
HttpURLConnection huc = null;
try {
    // NOTE(review): the output file name was lost from the original listing
    // (it showed an empty string here); "index.htm" is a placeholder - confirm.
    htmpath = getServletContext().getRealPath("/") + "index.htm";
    pw = new PrintWriter(htmpath);

    // java.net.URL needs an explicit scheme; a bare host:port is rejected.
    URL url = new URL("http://127.0.0.1:8080/kj/"); // Create the URL.
    huc = (HttpURLConnection) url.openConnection();
    is = huc.getInputStream();
    isr = new InputStreamReader(is);
    in = new BufferedReader(isr);

    String line = null;
    while ((line = in.readLine()) != null) {
        if (line.length() == 0) {
            continue; // Skip empty lines.
        }
        pw.println(line);
    }
} catch (Exception e) {
    // NOTE(review): the original print target was lost in extraction;
    // standard error is assumed here - confirm.
    System.err.println(e);
} finally {
    // Always release resources. Each one is handled separately so that a
    // null or failing resource does not prevent the others from closing.
    try { if (in != null) in.close(); } catch (Exception ignored) {}
    try { if (isr != null) isr.close(); } catch (Exception ignored) {}
    try { if (is != null) is.close(); } catch (Exception ignored) {}
    if (huc != null) huc.disconnect();
    if (pw != null) pw.close();
}
%>
OK--,创建文件成功

三、

import java.io.*;
import java.net.*;

/**
 * Minimal HTTP client that opens a raw socket to a web server, sends a bare
 * {@code GET} request and copies everything the server sends back to a file
 * or to standard output.
 *
 * <p>Usage: {@code java HttpClient <URL> [<filename>]}
 */
public class HttpClient {

    /**
     * Fetches the resource named by {@code args[0]} (which must use the
     * {@code http:} protocol) and writes the server's response to the file
     * named by {@code args[1]}, or to standard output when no file is given.
     * Any failure is reported on standard error together with a usage line.
     *
     * @param args the URL to fetch, optionally followed by an output file name
     */
    public static void main(String[] args) {
        OutputStream toFile = null;
        Socket socket = null;
        try {
            // Check the command-line arguments.
            if ((args.length != 1) && (args.length != 2)) {
                throw new IllegalArgumentException("Wrong number of args");
            }

            if (args.length == 2) {
                toFile = new FileOutputStream(args[1]); // Write to a file.
            } else {
                toFile = System.out;                    // Write to the console.
            }

            // Split the URL into the pieces needed for a raw socket request.
            URL url = new URL(args[0]);
            String protocol = url.getProtocol();
            if (!protocol.equals("http")) {
                throw new IllegalArgumentException("Must use 'http:' protocol");
            }
            String host = url.getHost();
            int port = url.getPort();
            if (port == -1) {
                port = 80; // No explicit port in the URL: use the HTTP default.
            }
            String filename = url.getFile();

            socket = new Socket(host, port); // Open the socket connection.
            InputStream fromServer = socket.getInputStream();
            PrintWriter toServer = new PrintWriter(socket.getOutputStream());

            // Request the file; the blank line terminates the request.
            toServer.print("GET " + filename + "\n\n");
            toServer.flush(); // Send it right now!

            // Read the server's response and copy it to the destination.
            byte[] buffer = new byte[4096];
            int bytesRead;
            while ((bytesRead = fromServer.read(buffer)) != -1) {
                toFile.write(buffer, 0, bytesRead);
            }
            toFile.flush();
        } catch (Exception e) {
            System.err.println(e);
            System.err.println("Usage: java HttpClient <URL> [<filename>]");
        } finally {
            // Release the socket and the output file even when the transfer fails.
            if (socket != null) {
                try {
                    socket.close();
                } catch (IOException ignored) {
                    // Best-effort cleanup.
                }
            }
            if (toFile != null && toFile != System.out) {
                // Leave System.out open; it belongs to the JVM, not to us.
                try {
                    toFile.close();
                } catch (IOException ignored) {
                    // Best-effort cleanup.
                }
            }
        }
    }
}

运行方法：C:\java>java HttpClient http://127.0.0.1:8080/kj/

注意中文可能会显示乱码,在得到源码后,应该做相应的转码工作,例如:

public static String GetURLstr(String strUrl)

{

InputStream in = null;

OutputStream out = null;

String strdata = "";

try

{

URL url = new URL(strUrl); // 创建 URL

in = ream(); // 打开到这个URL的流

out = ;

// 复制字节到输出流

byte[] buffer = new byte[4096];

int bytes_read;

while ((bytes_read = (buffer)) != -1)

{

String reads = new String(buffer, 0, bytes_read, "UTF-8");

//(reads);

strdata = strdata + reads;

// (buffer, 0, bytes_read);

}

();

return strdata;

}

catch (Exception e)

{

n(e);

n("Usage: java GetURL []");

return strdata;

}

本文标签：输出文件语言转码显示

版权声明：本文标题：Java抓取网页内容三种方式内容由网友自发贡献，该文观点仅代表作者本人，转载请联系作者并注明出处：http://roclinux.cn/b/1704833855a463605.html，本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容，一经查实，本站将立刻删除。