在做爬虫项目的时候,有时会遇到验证码的问题。某些网站的验证码是动态生成的,即使是同一个链接,在不同的时间访问也可能产生不同的验证码。
一开始的思路是打开这个验证码的链接,通过Java代码发送GET请求把验证码图片保存到本地,再用打码工具解析验证码,将结果自动输入验证框,这样就可以把验证码的问题解决了。
但是问题来了:每次请求同一个地址,产生的验证码图片都是不一样的,所以这种方法行不通。因此只能先用Selenium WebDriver把页面上显示的验证码图片
截取到本地,然后用打码工具解析,再自动填写验证码,这样就很好地把验证码的问题解决了。
package com.entrym.main;import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Set;import javax.imageio.ImageIO;import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.Point;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.support.ui.ExpectedCondition;
import org.openqa.selenium.support.ui.WebDriverWait;import com.entrym.crawler.util.verifyCode.Captcha;
import com.entrym.crawler.util.verifyCode.DamaUtil;
import com.entrym.domain.SogouInfo;
import com.entrym.domain.Wxinfo;
import com.entrym.util.ConfigUtil;
import com.entrym.util.DateUtil;
import com.entrym.util.HttpUtils;
import com.google.gson.Gson;
import com.vdurmont.emoji.EmojiParser;public class WebTest {private static final String GET_TITLE="/titles/getxiaoshuo";private static final String PATH=new File("config/config.properties").getAbsolutePath();private static final String CHROME_HOME=new File("config/chromedriver.exe").getAbsolutePath();private static final String CHROME_HOME_LINUX=new File("config/chromedriver").getAbsolutePath();private static final String BASEURL=ConfigUtil.reads(PATH, "baseurl");public static void main(String[] args) throws IOException {WebDriver driver=null;
// System.setProperty("webdriver.gecko.driver", FIREFOX_HOME);System.out.println(PATH);String osname=System.getProperty("os.name").toLowerCase();if(osname.indexOf("linux")>=0){System.setProperty("webdriver.chrome.driver", CHROME_HOME_LINUX);
// driver = new MarionetteDriver();}else{System.setProperty("webdriver.chrome.driver", CHROME_HOME);
// driver = new MarionetteDriver();}driver=new ChromeDriver();driver.get("http://weixin.sogou.com/antispider/?from=%2fweixin%3Ftype%3d2%26query%3dz+%26ie%3dutf8%26s_from%3dinput%26_sug_%3dy%26_sug_type_%3d");WebElement ele = driver.findElement(By.id("seccodeImage"));// Get entire page screenshotFile screenshot = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);BufferedImage fullImg = ImageIO.read(screenshot);// Get the location of element on the pagePoint point = ele.getLocation();// Get width and height of the elementint eleWidth = ele.getSize().getWidth();int eleHeight = ele.getSize().getHeight();// Crop the entire page screenshot to get only element screenshotBufferedImage eleScreenshot= fullImg.getSubimage(point.getX(), point.getY(),eleWidth, eleHeight);ImageIO.write(eleScreenshot, "png", screenshot);// Copy the element screenshot to diskFile screenshotLocation = new File("D:/captcha/test.png");FileUtils.copyFile(screenshot, screenshotLocation);WebElement classelement = driver.findElement(By.className("p2"));String errorText=classelement.getText();System.out.println("输出的内容是"+classelement.getText());if(errorText.indexOf("用户您好,您的访问过于频繁,为确认本次访问为正常用户行为")>=0){System.out.println("*********************");DamaUtil util=new DamaUtil();System.out.println("===================");String code=""; //验证码Captcha captcha=new Captcha();captcha.setFilePath("test.png");code = DamaUtil.getCaptchaResult(captcha);System.out.println("打码处理出来的验证码是"+code);WebElement elementsumbit = driver.findElement(By.id("seccodeInput"));// 输入关键字elementsumbit.sendKeys(code);try {Thread.sleep(1000);} catch (InterruptedException e) {// TODO Auto-generated catch blocke.printStackTrace();}// 提交 input 所在的 formelementsumbit.submit();System.out.println("成功");}}
}
以上就是完整的代码,其中关键的截图代码是在Stack Overflow上找到的,不得不说谷歌还是很强大的。
// Stack Overflow example: save a screenshot of a single element (the Google
// logo) by cropping it out of a browser screenshot.
driver.get("http://www.google.com");
WebElement ele = driver.findElement(By.id("hplogo"));

// Get entire page screenshot
File screenshot = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
BufferedImage fullImg = ImageIO.read(screenshot);

// Get the location of element on the page
Point point = ele.getLocation();

// Get width and height of the element
int eleWidth = ele.getSize().getWidth();
int eleHeight = ele.getSize().getHeight();

// Crop the entire page screenshot to get only element screenshot
BufferedImage eleScreenshot = fullImg.getSubimage(point.getX(), point.getY(), eleWidth, eleHeight);
ImageIO.write(eleScreenshot, "png", screenshot);

// Copy the element screenshot to disk
File screenshotLocation = new File("C:\\images\\GoogleLogo_screenshot.png");
FileUtils.copyFile(screenshot, screenshotLocation);
以上就是关键的截取代码,Stack Overflow上的原文链接是 http://stackoverflow.com/questions/13832322/how-to-capture-the-screenshot-of-a-specific-element-rather-than-entire-page-usin
感兴趣的小伙伴可以研究一下。