如果你想抓取某个网页(如新浪、CSDN 等)以获取最新的资讯列表,或者从某个站点下载文件、图片等资源,就需要用到 HTML 解析库,例如 htmlparser、jsoup 之类的 jar 包;下载则可以使用 httpclient。如果想进一步提高效率,可以使用线程池,把抓取解析线程和下载线程分开,类似生产者-消费者模型(本文不涉及这部分,仅演示基本用法)。代码如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
| public static Set<String> digLinks(String address) throws Exception {
Set<String> result = new HashSet<String>();
URL url = new URL(address);
Parser parser = new Parser((HttpURLConnection) url.openConnection());
NodeFilter filter = new HasAttributeFilter("id", "someid");
NodeList nodes = parser.extractAllNodesThatMatch(filter);
Node root = nodes.elementAt(0);
String html = root.getChildren().toHtml();
parser = Parser.createParser(html, "utf-8");
nodes = parser.extractAllNodesThatMatch(new TagNameFilter("li"));
parser = Parser.createParser(html, "utf-8");
nodes = parser.extractAllNodesThatMatch(new TagNameFilter("a"));
add(result, nodes);
return result;
}
/**
 * Adds the link of every {@code LinkTag} in {@code nodes} to {@code result}.
 * Nodes of any other type are skipped.
 *
 * @param result set that receives the extracted links
 * @param nodes  nodes to inspect
 */
private static void add(Set<String> result, NodeList nodes) {
    final int count = nodes.size();
    int index = 0;
    while (index < count) {
        Node candidate = nodes.elementAt(index);
        index++;
        if (!(candidate instanceof LinkTag)) {
            continue;
        }
        result.add(((LinkTag) candidate).getLink());
    }
}
|
文件下载(下面的示例使用 JDK 自带的 HttpURLConnection,也可以换成 httpclient):
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
| /**
* 读出文件中的url,连接下载保存
* @param file
*/
public static void doSave(String file) {
List<String> files = FileUtil.readLines(file);
for (String url : files) {
String fileName = StringUtils.substringAfterLast(url, "/");
download(url, fileName);
}
}
/**
 * Downloads {@code url} into the file {@code dir + fileName}.
 *
 * <p>If the target file already exists nothing is downloaded. Failures are
 * reported on {@code System.err} and do not propagate, so one bad URL does not
 * abort a batch. A file left incomplete by a failed transfer is deleted so a
 * later run does not mistake it for a finished download.
 *
 * @param url      address of the resource to fetch
 * @param fileName name of the target file, appended to {@code dir}
 */
private static void download(String url, String fileName) {
    File file = new File(dir + fileName);
    // Check before connecting: no request is needed for a file we already have.
    if (file.exists()) {
        return;
    }
    HttpURLConnection connection = null;
    InputStream in = null;
    OutputStream out = null;
    boolean completed = false;
    try {
        connection = (HttpURLConnection) new URL(url).openConnection();
        connection.connect();
        in = connection.getInputStream();
        out = new FileOutputStream(file);
        // Copy in chunks rather than one byte per read/write call.
        byte[] buffer = new byte[8192];
        int read;
        while ((read = in.read(buffer)) != -1) {
            out.write(buffer, 0, read);
        }
        // Close here so that a failing close is treated as a failed download.
        out.close();
        completed = true;
    } catch (Exception e) {
        // Best effort: report the failure and let the caller continue.
        System.err.println("Failed to download " + url + ": " + e);
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if closing the input fails.
            }
        }
        if (out != null) {
            try {
                out.close();
            } catch (Exception ignored) {
                // Already closed on success; on failure the file is removed below.
            }
        }
        if (connection != null) {
            connection.disconnect();
        }
        if (!completed && out != null) {
            // Remove the partial file so the exists() check does not skip it next time.
            file.delete();
        }
    }
}
|