本次内容:
使用 Java 爬取网页数据,并进行数据清洗,熟悉爬取操作;爬取华软课程信息:http://class.sise.com.cn:7001/sise/index.jsp
//源码
package com.sise.studentInfoSystem.demo;import com.sise.studentInfoSystem.bean.Course;import org.jsoup.Jsoup;import org.jsoup.nodes.Element;public class test { public static void main(String[] args) { //获取登录mysise需要的input参数 String re = new Post().sendPost("http://class.sise.com.cn:7001/sise/login.jsp", "param").replaceAll(" ", " "); //清洗html org.jsoup.nodes.Document doc = (org.jsoup.nodes.Document) Jsoup.parseBodyFragment(re); Element element1 = doc.getElementsByTag("form").get(0).getElementsByTag("input").get(0); Element element2 = doc.getElementsByTag("form").get(0).getElementsByTag("input").get(1); Element element3 = doc.getElementsByTag("form").get(0).getElementsByTag("input").get(2); //获取input name value String name1 = element1.attr("name"); String name2 = element2.attr("name"); String name3 = element3.attr("name"); String value1 = element1.attr("value"); String value2 = element2.attr("value"); String value3 = element3.attr("value"); System.out.println("===========================清洗结果================================="); System.out.println("获取数据[name:" + name1 + " , value:" + value1 + "]"); System.out.println("获取数据[name:" + name2 + " , value:" + value2 + "]"); System.out.println("获取数据[name:" + name3 + " , value:" + value3 + "]"); //mysise账号和密码: String username = "username";// String password = "password";// String param = name1 + "=" + value1 + "&" + name2 + "=" + value2 + "&" + name3 + "=" + value3 + "&" + "username=" + username + "&" + "password=" + password; //模拟登录获取登录成功的cookie String cookie = new Post().GetLoginCookie("http://class.sise.com.cn:7001/sise/login_check_login.jsp", param); //判断是否登录成功 if (cookie.equals("")) { System.out.println("登录失败,密码或用户名错误!"); } else { System.out.println(cookie); //抓取华软所有课程信息 String html = Post.sendPost("http://class.sise.com.cn:7001/sise/module/selectclassview/selectclassallcourse_view.jsp", cookie); org.jsoup.nodes.Document document = (org.jsoup.nodes.Document) Jsoup.parseBodyFragment(html); Element element = 
document.getElementsByTag("table").get(3).getElementsByTag("tbody").get(0); System.out.println("==============读取华软所有课程==============="); System.out.println("总共爬取到:"+element.getElementsByTag("tr").size()+"条课程记录"); for (int i = 0; i < element.getElementsByTag("tr").size(); i++) { Element element12 = element.getElementsByTag("tr").get(i); for (int j = 0; j < element12.getElementsByTag("td").size(); j++) { System.out.print(element12.getElementsByTag("td").get(j).text() + "\t\t\t\t"); } String id = element12.getElementsByTag("td").get(0).text(); String name = element12.getElementsByTag("td").get(1).text(); String dept = element12.getElementsByTag("td").get(2).text(); String credit = element12.getElementsByTag("td").get(3).text(); String type = element12.getElementsByTag("td").get(4).text(); Course course = new Course(); course.setId(id); course.setName(name); course.setDept(dept); course.setCredit(credit); course.setType(type); //保持到数据库 //courseMapper.InsertCourse(course); System.out.println("\n---------------------------------------------------------------------------------------------------------------------------"); //return "Hello world!"+"\n"+userMapper.Sel(1).toString(); } } }}
//运行结果
end