一.Java爬取某个网站的信息
1.项目gitee地址:https://gitee.com/Javaxieyue/reptiles.git
2.爬取网站的地址:
http://www.mca.gov.cn//article/sj/xzqh/2020/2020/2020092500801.html
3.要求:需要对HTML页面结构有一定了解,以便对爬取到的数据进行标签解析。
4.爬取的是国家行政区域名称以及编号的信息。
二.以下是源代码:
1.pom文件
<!-- Compile the project with Java 8 source and target levels. -->
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <configuration>
                <source>8</source>
                <target>8</target>
            </configuration>
        </plugin>
    </plugins>
</build>

<!-- NOTE(review): the jsoup and fastjson versions below are old releases;
     check them against current security advisories before reusing this POM. -->
<dependencies>
    <!-- HTML parsing -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.2</version>
    </dependency>
    <!-- String helpers (StringUtils) -->
    <dependency>
        <groupId>commons-lang</groupId>
        <artifactId>commons-lang</artifactId>
        <version>2.6</version>
    </dependency>
    <!-- JSON serialization of the scraped result -->
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.41</version>
    </dependency>
</dependencies>
2.实体类:存储爬取的数据
/**
 * Entity holding one scraped administrative region record.
 *
 * <p>Plain mutable bean: every property is exposed through a getter/setter pair.
 * String properties default to {@code null} and {@code areaType} defaults to {@code 0}
 * until a setter is called.
 */
public class ChinaRegionInfo {

    /** Administrative region name. */
    private String areaName;

    /** Administrative region code. */
    private String areaCode;

    /** Administrative region type: 1 = province level, 2 = city level, 3 = district/county level. */
    private int areaType;

    /** Code of the parent administrative region. */
    private String parentAreaCode;

    /** @return the administrative region name */
    public String getAreaName() {
        return areaName;
    }

    /** @param areaName the administrative region name */
    public void setAreaName(String areaName) {
        this.areaName = areaName;
    }

    /** @return the administrative region code */
    public String getAreaCode() {
        return areaCode;
    }

    /** @param areaCode the administrative region code */
    public void setAreaCode(String areaCode) {
        this.areaCode = areaCode;
    }

    /** @return the region type (1 = province, 2 = city, 3 = district/county) */
    public int getAreaType() {
        return areaType;
    }

    /** @param areaType the region type (1 = province, 2 = city, 3 = district/county) */
    public void setAreaType(int areaType) {
        this.areaType = areaType;
    }

    /** @return the code of the parent administrative region */
    public String getParentAreaCode() {
        return parentAreaCode;
    }

    /** @param parentAreaCode the code of the parent administrative region */
    public void setParentAreaCode(String parentAreaCode) {
        this.parentAreaCode = parentAreaCode;
    }

    /** Renders all four properties; intended for logging and debugging only. */
    @Override
    public String toString() {
        return "ChinaRegionInfo{areaName=" + areaName
                + ", areaCode=" + areaCode
                + ", areaType=" + areaType
                + ", parentAreaCode=" + parentAreaCode + '}';
    }
}
3.业务类:具体爬取的实现
import com.alibaba.fastjson.JSONArray;
import com.pachong.pojo.ChinaRegionInfo;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;/*** 对网页进行爬取:http://www.mca.gov.cn//article/sj/xzqh/2020/2020/2020092500801.html*/
public class DealService {//需要抓取的网页地址private static final String URL = "http://www.mca.gov.cn//article/sj/xzqh/2020/2020/2020092500801.html";public static void main(String[] args) throws IOException {List<ChinaRegionInfo> regionsInfoList = new ArrayList<ChinaRegionInfo>();//抓取网页信息Document document = Jsoup.connect(URL).get();//获取真实的数据体Element element = document.getElementsByTag("tbody").get(0);String provinceCode = "";//省级编码String cityCode = "";//市级编码if(Objects.nonNull(element)){Elements trs = element.getElementsByTag("tr");for (int i = 3; i < trs.size(); i++) {//前3个tr只是页面标题不抓取Elements tds = trs.get(i).getElementsByTag("td");if(tds.size() < 3){continue;}Element td1 = tds.get(1);//行政区域编码Element td2 = tds.get(2);//行政区域名称if(StringUtils.isNotEmpty(td1.text())){if(td1.classNames().contains("xl7024734")){if(td2.toString().contains("span")){//市级ChinaRegionInfo chinaRegions = new ChinaRegionInfo();chinaRegions.setAreaCode(td1.text());chinaRegions.setAreaName(td2.text());chinaRegions.setAreaType(2);chinaRegions.setParentAreaCode(provinceCode);regionsInfoList.add(chinaRegions);cityCode = td1.text();} else {//省级ChinaRegionInfo chinaRegions = new ChinaRegionInfo();chinaRegions.setAreaCode(td1.text());chinaRegions.setAreaName(td2.text());chinaRegions.setAreaType(1);chinaRegions.setParentAreaCode("");regionsInfoList.add(chinaRegions);provinceCode = td1.text();}} else {//区或者县级ChinaRegionInfo chinaRegions = new ChinaRegionInfo();chinaRegions.setAreaCode(td1.text());chinaRegions.setAreaName(td2.text());chinaRegions.setAreaType(3);chinaRegions.setParentAreaCode(StringUtils.isNotEmpty(cityCode) ? cityCode : provinceCode);regionsInfoList.add(chinaRegions);}}}}//打印结果System.out.println(JSONArray.toJSONString(regionsInfoList));}
}
4.结果
注:篇幅有限,只展示部分结果