主代码部分:
package selenium.crawler;import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;import org.testng.annotations.Test;import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;import common.ExcelMethod;public class zhilianzhaopin {@Test public void test() throws IOException { WebClient webClient = new WebClient(BrowserVersion.CHROME);webClient.getOptions().setJavaScriptEnabled(false); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setUseInsecureSSL(false); List<String> urlList = getZlzpUrl(webClient);try {ExcelMethod ds = new ExcelMethod();ds.readExcel();int row = 0;for (String url : urlList) {List<String> baseInfoList = new ArrayList<>();getResumeInfo(webClient, url, baseInfoList); for (int i = 0; i < baseInfoList.size(); i++) { ds.setValueIntoCell("Sheet1", i, row, baseInfoList.get(i));} row++;System.out.println(baseInfoList.toString());}ds.closeFile(); System.out.println("--------------END---------------");} catch (Exception e) {e.printStackTrace();}}private void getResumeInfo(WebClient webClient, String url, List<String> baseInfoList)throws IOException, MalformedURLException {HtmlPage page = webClient.getPage(url); List<DomElement> bodyElementList = page.getElementsByTagName("body");List<HtmlElement> divElementList = bodyElementList.get(0).getElementsByTagName("div");for (HtmlElement htmlElement : divElementList) {String classAttr = htmlElement.getAttribute("class");if (classAttr != null || classAttr != "") {if (classAttr.equals("top-fixed-box")) {List<HtmlElement> terminadivElementList = htmlElement.getElementsByTagName("div");List<HtmlElement> h1vElementList =terminadivElementList.get(0).getElementsByTagName("h1");String h1Text = h1vElementList.get(0).getTextContent();List<HtmlElement> h2vElementList =terminadivElementList.get(0).getElementsByTagName("h2");String h2Text = h2vElementList.get(0).getTextContent();baseInfoList.add(h1Text);baseInfoList.add(h2Text);}else if(classAttr.equals("terminalpage clearfix")){List<HtmlElement> terminadivElementList = htmlElement.getElementsByTagName("div");List<HtmlElement> firstUlElementList = 
terminadivElementList.get(0).getElementsByTagName("ul");List<HtmlElement> firstLiElementList = firstUlElementList.get(0).getElementsByTagName("li");for (HtmlElement liElement : firstLiElementList) {String temp = liElement.getTextContent();baseInfoList.add(temp);}List<HtmlElement> firstdivElementList = terminadivElementList.get(0).getElementsByTagName("div");List<HtmlElement> detaildivElementList = firstdivElementList.get(0).getElementsByTagName("div");List<HtmlElement> pElementList = detaildivElementList.get(0).getElementsByTagName("p");int i = 0;while (i < 3) {String temp = pElementList.get(i).getTextContent();i = i+2;baseInfoList.add(temp);} }}}}private List<String> getZlzpUrl(WebClient webClient) throws IOException, MalformedURLException {List<String> urlList = new ArrayList<>();for (int pageNum = 1; pageNum <= 12; pageNum++) {String url ="http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&kw=%E8%87%AA%E5%8A%A8%E5%8C%96%E6%B5%8B%E8%AF%95%E5%B7%A5%E7%A8%8B%E5%B8%88&p="+ pageNum+ "&isadv=0"; HtmlPage page = webClient.getPage(url);
// System.out.println("页面文本:"+page.getTitleText()); DomElement talbleElement = page.getElementById("newlist_list_content_table");List<HtmlElement> talbleElementList = talbleElement.getElementsByTagName("table");System.out.println("size: " + talbleElementList.size());for (int i = 1; i < talbleElementList.size(); i++) {List<HtmlElement> trElementList = talbleElementList.get(i).getElementsByTagName("tr"); List<HtmlElement> divElementList = trElementList.get(0).getElementsByTagName("div");List<HtmlElement> aElementList = divElementList.get(0).getElementsByTagName("a");String htef = aElementList.get(0).getAttribute("href");System.out.println(pageNum + "---" + i + "---" + htef);urlList.add(htef);} }return urlList;}
}
写 Excel 的代码:
package common;import java.io.File;
import jxl.Workbook;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.Label;
import jxl.write.WriteException;public class ExcelMethod {static Workbook wbook;static WritableWorkbook wwbCopy;static String ExecutedTestCasesSheet;static WritableSheet shSheet;public void readExcel(){try{wbook = Workbook.getWorkbook(new File("D:\\testSampleData.xls"));wwbCopy = Workbook.createWorkbook(new File("D:\\testSampleDataCopy.xls"), wbook);shSheet = wwbCopy.getSheet(0);}catch(Exception e){e.printStackTrace();}}public void setValueIntoCell(String strSheetName,int iColumnNumber, int iRowNumber,String strData) throws WriteException{WritableSheet wshTemp = wwbCopy.getSheet(strSheetName);Label labTemp = new Label(iColumnNumber, iRowNumber, strData);try {wshTemp.addCell(labTemp);} catch (Exception e) {e.printStackTrace();}}public void closeFile(){try {// Closing the writable work bookwwbCopy.write();wwbCopy.close();// Closing the original work bookwbook.close();} catch (Exception e){e.printStackTrace();}}public static void main(String[] args) throws WriteException{ExcelMethod ds = new ExcelMethod();ds.readExcel();ds.setValueIntoCell("Sheet1", 5, 1, "PASS");ds.setValueIntoCell("Sheet1", 5, 2, "FAIL");ds.setValueIntoCell("Sheet1", 5, 3, "PASS");ds.closeFile();}
}
结果截图: