本来用某鱼抓取整理网站url的,结果超过一万条要付费充会员导出,有点郁闷,怎么办?因为java爬虫还不怎么会,就只有拼接了,想想就是查库然后转换成xml,勤快点自己动手。
获取网站url的后缀地址,一般都是id主键,先获取id,然后进行字符串拼接,最后输出成xml,这里采用springboot+mybatis+xStream。
引入依赖:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"><modelVersion>4.0.0</modelVersion><groupId>com.booy</groupId><artifactId>url</artifactId><version>0.0.1-SNAPSHOT</version><packaging>war</packaging><!--引入springboot父版本--><parent><groupId>org.springframework.boot</groupId><artifactId>spring-boot-starter-parent</artifactId><version>2.1.2.RELEASE</version><relativePath/></parent><properties><project.build.sourceEncoding>UTF-8</project.build.sourceEncoding><project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding><java.version>1.8</java.version></properties><dependencies><dependency><groupId>org.springframework.boot</groupId><artifactId>spring-boot-starter-web</artifactId></dependency><!--mybatis包--><dependency><groupId>org.mybatis.spring.boot</groupId><artifactId>mybatis-spring-boot-starter</artifactId><version>1.3.1</version></dependency><!-- mysql驱动包 --><dependency><groupId>mysql</groupId><artifactId>mysql-connector-java</artifactId><version>5.1.29</version></dependency><dependency><groupId>junit</groupId><artifactId>junit</artifactId><scope>test</scope></dependency><dependency><groupId>com.thoughtworks.xstream</groupId><artifactId>xstream</artifactId><version>1.4.11.1</version></dependency></dependencies><!--配置资源文件扫描,否则Mapper--><build><!--将springboot的应用程序打包成fat jar的插件--><plugins><plugin><groupId>org.springframework.boot</groupId><artifactId>spring-boot-maven-plugin</artifactId></plugin></plugins><resources><resource><directory>src/main/java</directory><includes><include>**/*.xml</include></includes><filtering>true</filtering></resource><resource><directory>src/main/resources</directory><includes><include>**/*.*</include></includes></resource></resources></build>
</project>
要查库,需要先配置下数据源
#数据源的基本信息
spring.datasource.url=jdbc:mysql://localhost:3306/test?characterEncoding=utf8
spring.datasource.username=test
spring.datasource.password=123456
spring.datasource.driverClassName = com.mysql.jdbc.Driver
#mybatis中mapper文件的路径
mybatis.mapper-locations=classpath*:com/booy/url/dao/mapper/*.xml
#起别名,可省略写mybatis的xml中的resultType的全路径
mybatis.type-aliases-package=com.booy.url.pojo
#视图解析器,规定访问资源路径的后缀
spring.mvc.view.suffix=.html
接口
package com.booy.url.dao;import java.util.List;public interface UrlDao {List<Integer> getAllId();
}
mapper查询
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapperPUBLIC "-//mybatis.org//DTD Mapper 3.0//EN""http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.booy.url.dao.UrlDao"><select id="getAllId" resultType="int">select web_idfrom test_website</select>
</mapper>
service接口
package com.booy.url.service;import java.util.List;public interface UrlService {List<StringBuilder> getAllId();
}
业务逻辑实现,如果一个表数据超过5万可以做下判断,写入到第二个xml,需要多个表写入到一个xml中,就别覆盖了,直接数据追加即可
package com.booy.url.service.Impl;import com.booy.url.dao.UrlDao;
import com.booy.url.pojo.Url;
import com.booy.url.service.UrlService;
import com.thoughtworks.xstream.XStream;
import com.thoughtworks.xstream.io.xml.Xpp3Driver;
import org.springframework.stereotype.Service;import javax.annotation.Resource;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;@Service
public class UrlServiceImpl implements UrlService {@Resourceprivate UrlDao urlDao;@Overridepublic List<StringBuilder> getAllId() {//前后字符串String urlPre = "http://www.zhuangyi.net/w/";String urlSuf = ".html";//存放urlList<StringBuilder> urls = new ArrayList<>();//存放xml的url对象List<Url> urlsXml = new ArrayList<>();List<Integer> allId = urlDao.getAllId();for (Integer id : allId) {StringBuilder sb = new StringBuilder(40);sb.append(urlPre).append(id).append(urlSuf);urls.add(sb);//构建单个对象Url url = simpleObject(sb);//将对象添加进集合urlsXml.add(url);}//将对象集合输出为xml文档outXml(urlsXml);return urls;}//构建单个对象public Url simpleObject(StringBuilder sb){//设置当前时间Date nowDate =new Date(System.currentTimeMillis());DateFormat df = new SimpleDateFormat("yyyy-MM-dd");String now = df.format(nowDate);//构建输出对象Url url = new Url();String sb1 = new String(sb);url.setLoc(sb1);url.setPriority("0.6");url.setChangefreq("always");url.setLastmod(now);return url;}//将对象集合输出为xmlpublic void outXml(List<Url> urlsXml){FileOutputStream out=null;try {out = new FileOutputStream("D:/xml/sitemap.xml");//默认覆盖} catch (FileNotFoundException e) {e.printStackTrace();}//通过驱动构建一个xStream对象XStream xStream = new XStream(new Xpp3Driver());//修改别名Url.class为urlxStream.alias("url",Url.class);xStream.alias("urlset",List.class);//生成xml文件
// xStream.toXML(urlsXml,out);不带头输出String top ="<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";String s = top + xStream.toXML(urlsXml);try {out.write(s.getBytes());out.close();} catch (IOException e) {e.printStackTrace();}System.out.println(s);}
}
xml元素实体
package com.booy.url.pojo;public class Url {private String loc;private String priority;private String lastmod;private String changefreq;//getter and setter
}
在页面上看看url
@RestController
public class resultcontroller {@Resource private UrlService urlService;@RequestMappingpublic List<StringBuilder> test(){return urlService.getAllId();}
}
启动类
@SpringBootApplication
@MapperScan(basePackages = "com.booy.url.dao")
public class UrlApplication {public static void main(String[] args) {SpringApplication.run(UrlApplication.class, args);}
}
控制台输出,实际就别控制台和页面上输出了