遍历网站的所有Url

网站的url分为很多种类：<a href="" />; <form action="" method="Get"/>;<link href=""/>;<img src=""/>;<script src=""/> ;<frame src=""/> 等等

难点：
递归遍历
获得页面每个url
同时请求（每种类型的请求方式都不同）
有些链接是重复的，需要去重

使用 HTMLParser 工具：下载并引入 htmlparser.jar
遍历 + 通过htmlparser 解析页面元素

Java代码

public class Urll {
// 定义的全局变量
public static Vector<String> svecOutUrl = new Vector<String>();
public static Vector<String> svecBadUrl = new Vector<String>();
public static Vector<String> svecAUrl = new Vector<String>();
public static final int DEEP=3; //遍历的深度
public static boolean bl; //判断标志
private static String loc;
private static Parser parser; //对超文本进行分析
private static String hostName = "sina.com";
// 由于网站中URL之间的连接构成了图，所以对图的遍历这里采用深度优先的方法。
public static void extractLinks(String loc) throws Exception {
String str1;
URL wwwurl;
boolean byes;
Vector<String> vecUrl=new Vector<String>();
// 解析 <a>
try {
parser = new Parser(loc); //原理见HTMLParser
bl=true;
}
catch (Exception e) {
bl=false;
e.printStackTrace();
}
filterStr = "a";
filter = new TagNameFilter(filterStr);
links = parser.extractAllNodesThatMatch(filter);
for (int i = 0;i < links.size();i++) {
if(bl)
{
byes=true;
LinkTag LinkTag = (LinkTag)links.elementAt(i);
str1= LinkTag.getLink();
System.out.println(""+i);
str1 = Patter (str1)
if(str1.equals("")) continue;
if(!svecAUrl.contains(str1))
{
try
{
//　判断是否可连接
wwwurl=new URL(str1);
URLConnection con = wwwurl.openConnection();
con.setConnectTimeout(1000);
con.getInputStream();
}
catch(SocketTimeoutException e)
{
byes=false;
svecBadUrl.add(str1);
continue;
}
catch(Exception e)
{
byes=false;
continue;
}
if(GetHostName(str1).equals(hostName))
{
svecAUrl.add(str1);
vecUrl.add(str1);
}
else
{
svecOutUrl.add(str1);
}
}
}
}
// 递归调用
String strNew;
int b = 1;
if(b<=DEEP)
{
for(int i=0;i<vecUrl.size();i++)
{
strNew=(String)vecUrl.get(i);
extractLinks(strNew);
}
}
}
// 通过该函数来判断所得URL是否是本网站的URL
public static String GetHostName(String host)
{
URL aurl;
String ss=" ";
try
{
aurl=new URL(host);
ss=aurl.getHost();
ss = ss.substring(ss.length()-10, ss.length());
}
catch(Exception e)
{
e.printStackTrace();
}
return ss;
}
｝

// NOTE(review): the line below is a copy of the Urll listing above with all line breaks removed. As written, everything after the first "//" is a comment, so only "public class Urll {" is code.
public class Urll {// 定义的全局变量public static Vector<String> svecOutUrl = new Vector<String>();public static Vector<String> svecBadUrl = new Vector<String>();public static Vector<String> svecAUrl = new Vector<String>();public static final int DEEP=3; //遍历的深度public static boolean bl; //判断标志private static String loc;private static Parser parser; //对超文本进行分析private static String hostName = "sina.com";// 由于网站中URL之间的连接构成了图，所以对图的遍历这里采用深度优先的方法。public static void extractLinks(String loc) throws Exception {String str1;URL wwwurl;boolean byes;Vector<String> vecUrl=new Vector<String>();// 解析 <a>try {parser = new Parser(loc); //原理见HTMLParserbl=true;}catch (Exception e) {bl=false;e.printStackTrace();}filterStr = "a";filter = new TagNameFilter(filterStr);links = parser.extractAllNodesThatMatch(filter); for (int i = 0;i < links.size();i++) {if(bl){byes=true;LinkTag LinkTag = (LinkTag)links.elementAt(i);str1= LinkTag.getLink();System.out.println(""+i);str1 = Patter (str1)if(str1.equals("")) continue;if(!svecAUrl.contains(str1)){try{//　判断是否可连接wwwurl=new URL(str1);URLConnection con = wwwurl.openConnection();con.setConnectTimeout(1000);con.getInputStream();}catch(SocketTimeoutException e){byes=false;svecBadUrl.add(str1);continue;}catch(Exception e){byes=false;continue;}if(GetHostName(str1).equals(hostName)){svecAUrl.add(str1);vecUrl.add(str1);}else{svecOutUrl.add(str1);}}}}//	递归调用String strNew;int b = 1;if(b<=DEEP){for(int i=0;i<vecUrl.size();i++){strNew=(String)vecUrl.get(i);extractLinks(strNew); }}}// 通过该函数来判断所得URL是否是本网站的URLpublic static String GetHostName(String host){URL aurl;String ss=" ";try{aurl=new URL(host);ss=aurl.getHost();ss = ss.substring(ss.length()-10, ss.length());}catch(Exception e){e.printStackTrace();}return ss;}｝

去重需要使用正则表达式：把格式相同、只有数字编号不同的页面 URL 归一化为同一个 URL，这样同一类页面只会被访问一次。

Java代码

private String Patter (String str) {
if (str.indexOf("http:") == -1) {
return str = "";
}
Pattern p = Pattern.compile("http://www.sina.com/\\d+/v/\\d+.html");
Matcher m = p.matcher(str);
boolean b = m.matches();
if (b) {
str = "http://www.sina.com/0/v/0.html";
return str;
}

	// NOTE(review): the line below is a copy of the Patter listing above with its line breaks removed; it has no return statement for links that do not match the pattern.
	private String Patter (String str) {if (str.indexOf("http:") == -1) {return str = "";}Pattern	p = Pattern.compile("http://www.sina.com/\\d+/v/\\d+.html");Matcher  m = p.matcher(str);boolean  b = m.matches();if (b) {str = "http://www.sina.com/0/v/0.html";return str;}
}

二、用 HTMLParser 可以获取 a、link、script、img 等元素，但无法解决 form 的递归提交：form 的提交方式分为 get、post 两种，而 post 方式的参数列表无法获取，也无法动态设置 post 提交方式。
使用 HttpUnit 测试工具可以很好地模拟浏览器：既可以任意提交表单，也可以获得页面元素。
下载并引入 httpunit.rar

Java代码

// HttpUnit conversation shared by all requests; extractLinks uses it to fetch the start page.
private static WebConversation wc = new WebConversation();
// Not read by the code visible in this listing: extractLinks declares its own local "w" that shadows this field.
private static WebForm w;
// 由于网站中URL之间的连接构成了图，所以对图的遍历这里采用深度优先的方法。
public static void extractLinks(WebRequestSource webT,String method,boolean start) throws Exception {
Vector<WebForm> vecForm=new Vector<WebForm>();
Vector<WebLink> vecLink=new Vector<WebLink>();
WebResponse resp = null;
WebForm[] webForm = new WebForm[0];
WebLink[] webLink = new WebLink[0];
try {
HttpUnitOptions.setExceptionsThrownOnScriptError(false);
// 按照 Get Post link 类型打开web
if (start) {
// 首页
WebRequest req = new PostMethodWebRequest("http://www.sina.com/");
resp = wc.getResponse(req);
} else if ("post".equals(method) || "get".equals(method)) {
//获得form 并提交
WebForm w = (WebForm) webT;
[color=red]resp = w.submit();[/color]
} else {
WebLink l = (WebLink) webT;
[color=red]resp = l.click();[/color]
}
webForm = resp.getForms();
webLink = resp.getLinks();
bl=true;
} catch (Exception e) {
bl=false;
e.printStackTrace();
}
String ss,str1;
URL wwwurl;
boolean byes;
StringBuffer strUrl;
int a=0,b=0,tID=0;
b++;
// 获取一个页面中所有的FORM中URL
for (int i = 0;i < webForm.length;i++) {
if(bl) {
byes=true;
// 按照 Get Post 类型
strUrl = new StringBuffer(resp.getURL().toString());
if (!"./".equals(webForm[i].getAction()) && "post".equals(webForm[i].getMethod())) {
strUrl.append(webForm[i].getAction().substring(1, webForm[i].getAction().length()));
strUrl.append("?");
String[] para = webForm[i].getParameterNames();
for (int p = 0;p< para.length;p++) {
strUrl.append(para[p]);
strUrl.append("=&");
}
} else if (!"./".equals(webForm[i].getAction())) {
strUrl.append(webForm[i].getAction().substring(1, webForm[i].getAction().length()));
}
if(strUrl.equals("")) continue;
if(!svecLink.contains(strUrl.toString())) {
try {
// 按照 Get Post 类型
if (!"./".equals(webForm[i].getAction())) {
webForm[i].submit();
}
} catch(Exception e) {
byes=false;
}
if(GetHostName(strUrl.toString()).equals(hostName) && byes){
a++;
tID++;
svecLink.add(strUrl.toString());
// 按照 Get Post 类型
vecForm.add(webForm[i]);
} else {
svecOutlink.add(strUrl.toString());
}
if (svecLink.size() >= 1000) {
svecLink.clear();
}
}
}
}
// 获取一个页面中所有的LINK中URL
for (int i = 0;i < webLink.length;i++) {
if(bl) {
byes=true;
// 按照 Link 类型
strUrl = new StringBuffer(webLink[i].getURLString());
if (strUrl.indexOf("http") == -1) {
strUrl = new StringBuffer();
}
if(strUrl == null || "".equals(strUrl.toString())) continue;
if(!svecLink.contains(strUrl.toString())) {
try {
webLink[i].newScriptable();
HttpUnitOptions.clearScriptErrorMessages();
HttpUnitOptions.setExceptionsThrownOnScriptError(false);
HttpUnitOptions.setScriptingEnabled(false);
HttpUnitOptions.setJavaScriptOptimizationLevel(0);
WebRequest re = webLink[i].getRequest();
URL u = re.getURL();
u.getContent();
// 按照 Link 类型
} catch(Exception e) {
byes=false;
System.out.print(e.getMessage());
}
if(GetHostName(strUrl.toString()).equals(hostName) && byes){
a++;
tID++;
svecLink.add(strUrl.toString());
// 按照 Link 类型
vecLink.add(webLink[i]);
} else {
svecOutlink.add(strUrl.toString());
}
if (svecLink.size() >= 1000) {
svecLink.clear();
}
}
}
}
WebForm webFNew;
WebLink webLNew;
if(a>0&&b<=DEEP) {
// 递归调用
for(int i=0,j=0;i<vecForm.size()||j<vecLink.size();i++,j++) {
webFNew = (WebForm)vecForm.get(i);
extractLinks(webFNew,webFNew.getMethod().toString(),false);
webLNew = (WebLink)vecLink.get(j);
extractLinks(webLNew,"link".toString(),false);
}
}
}
/**
 * Reduces a URL to the last two labels of its host name (lower-cased), so that
 * {@code http://www.sina.com/x} and {@code http://news.sina.com/y} both yield
 * {@code sina.com}. The crawler compares the result with its site name to decide whether a
 * URL belongs to this site; only such URLs that were not seen before are added to
 * {@code svecLink}.
 *
 * <p>Hosts with fewer than three labels and numeric (IPv4) hosts are returned unchanged.
 * Multi-part suffixes such as {@code .com.cn} are not recognised.
 *
 * @param host an absolute URL
 * @return the reduced host name, or {@code " "} when {@code host} is not a valid URL
 */
public static String GetHostName(String host) {
    String name;
    try {
        name = new URL(host).getHost().toLowerCase(java.util.Locale.ROOT);
    } catch (Exception e) {
        e.printStackTrace();
        return " ";
    }
    int lastDot = name.lastIndexOf('.');
    if (lastDot <= 0 || Character.isDigit(name.charAt(name.length() - 1))) {
        return name; // single label, or an IPv4 literal
    }
    int prevDot = name.lastIndexOf('.', lastDot - 1);
    return prevDot < 0 ? name : name.substring(prevDot + 1);
}
｝

	// NOTE(review): the two lines below are a copy of the HttpUnit listing above with its line breaks removed. As written, everything after the first "//" on each line is a comment.
	private static WebConversation wc = new WebConversation();	private static WebForm w;// 由于网站中URL之间的连接构成了图，所以对图的遍历这里采用深度优先的方法。public static void extractLinks(WebRequestSource webT,String method,boolean start) throws Exception {Vector<WebForm> vecForm=new Vector<WebForm>();Vector<WebLink> vecLink=new Vector<WebLink>();WebResponse resp = null;WebForm[] webForm = new WebForm[0];WebLink[] webLink = new WebLink[0];try {HttpUnitOptions.setExceptionsThrownOnScriptError(false);// 按照 Get Post link 类型打开webif (start) {// 首页WebRequest req = new PostMethodWebRequest("http://www.sina.com/");resp = wc.getResponse(req);} else if ("post".equals(method) || "get".equals(method)) {//获得form 并提交WebForm w = (WebForm) webT;[color=red]resp = w.submit();[/color]} else {WebLink l = (WebLink) webT;[color=red]resp = l.click();[/color]}webForm = resp.getForms();webLink = resp.getLinks();bl=true;} catch (Exception e) {bl=false;e.printStackTrace();}String ss,str1;URL wwwurl;boolean byes;StringBuffer strUrl;int a=0,b=0,tID=0;b++;// 获取一个页面中所有的FORM中URLfor (int i = 0;i < webForm.length;i++) {if(bl) {byes=true;// 按照 Get Post 类型 strUrl = new StringBuffer(resp.getURL().toString());if (!"./".equals(webForm[i].getAction()) && "post".equals(webForm[i].getMethod())) {strUrl.append(webForm[i].getAction().substring(1, webForm[i].getAction().length()));strUrl.append("?");String[] para = webForm[i].getParameterNames();for (int p = 0;p< para.length;p++) {strUrl.append(para[p]);strUrl.append("=&");}} else if (!"./".equals(webForm[i].getAction())) {strUrl.append(webForm[i].getAction().substring(1, webForm[i].getAction().length()));}if(strUrl.equals("")) continue;if(!svecLink.contains(strUrl.toString())) {try {// 按照 Get Post 类型 if (!"./".equals(webForm[i].getAction())) {webForm[i].submit();}} catch(Exception e) {byes=false;}if(GetHostName(strUrl.toString()).equals(hostName) && byes){a++;tID++;svecLink.add(strUrl.toString());// 按照 Get Post 类型 vecForm.add(webForm[i]);} else {svecOutlink.add(strUrl.toString());}if 
(svecLink.size() >= 1000) {svecLink.clear();}}}}// 获取一个页面中所有的LINK中URLfor (int i = 0;i < webLink.length;i++) {if(bl) {byes=true;// 按照 Link 类型 strUrl = new StringBuffer(webLink[i].getURLString());if (strUrl.indexOf("http") == -1) {strUrl = new StringBuffer();}if(strUrl == null || "".equals(strUrl.toString())) continue;if(!svecLink.contains(strUrl.toString())) {try {webLink[i].newScriptable();HttpUnitOptions.clearScriptErrorMessages();HttpUnitOptions.setExceptionsThrownOnScriptError(false);HttpUnitOptions.setScriptingEnabled(false);HttpUnitOptions.setJavaScriptOptimizationLevel(0);WebRequest re = webLink[i].getRequest();URL u = re.getURL();u.getContent();// 按照 Link 类型 } catch(Exception e) {byes=false;System.out.print(e.getMessage());}if(GetHostName(strUrl.toString()).equals(hostName) && byes){a++;tID++;svecLink.add(strUrl.toString());// 按照 Link 类型 vecLink.add(webLink[i]);} else {svecOutlink.add(strUrl.toString());}if (svecLink.size() >= 1000) {svecLink.clear();}}}}WebForm webFNew;WebLink webLNew;if(a>0&&b<=DEEP) {//	递归调用for(int i=0,j=0;i<vecForm.size()||j<vecLink.size();i++,j++) {webFNew = (WebForm)vecForm.get(i);extractLinks(webFNew,webFNew.getMethod().toString(),false); webLNew = (WebLink)vecLink.get(j);extractLinks(webLNew,"link".toString(),false); }}}// 通过该函数来判断所得URL是否是本网站的URL，如果不是就不需要添加svecLink中如果是并且以前没有提取过就添加到svecLink中。public static String GetHostName(String host) {URL aurl;String ss=" ";try {aurl=new URL(host);ss=aurl.getHost();ss = ss.substring(ss.length()-10, ss.length());} catch(Exception e) {e.printStackTrace();}return ss;}｝

不符合链接格式的 URL 都无法请求，这类 URL 即为坏链接。